Diffstat (limited to 'kernel')
100 files changed, 8332 insertions, 2985 deletions
diff --git a/kernel/audit.c b/kernel/audit.c
index 906ae5a0233a..3392d3e0254a 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -41,6 +41,8 @@ | |||
41 | * Example user-space utilities: http://people.redhat.com/sgrubb/audit/ | 41 | * Example user-space utilities: http://people.redhat.com/sgrubb/audit/ |
42 | */ | 42 | */ |
43 | 43 | ||
44 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | ||
45 | |||
44 | #include <linux/init.h> | 46 | #include <linux/init.h> |
45 | #include <asm/types.h> | 47 | #include <asm/types.h> |
46 | #include <linux/atomic.h> | 48 | #include <linux/atomic.h> |
@@ -63,6 +65,7 @@ | |||
63 | #include <linux/freezer.h> | 65 | #include <linux/freezer.h> |
64 | #include <linux/tty.h> | 66 | #include <linux/tty.h> |
65 | #include <linux/pid_namespace.h> | 67 | #include <linux/pid_namespace.h> |
68 | #include <net/netns/generic.h> | ||
66 | 69 | ||
67 | #include "audit.h" | 70 | #include "audit.h" |
68 | 71 | ||
@@ -76,16 +79,16 @@ static int audit_initialized; | |||
76 | #define AUDIT_OFF 0 | 79 | #define AUDIT_OFF 0 |
77 | #define AUDIT_ON 1 | 80 | #define AUDIT_ON 1 |
78 | #define AUDIT_LOCKED 2 | 81 | #define AUDIT_LOCKED 2 |
79 | int audit_enabled; | 82 | u32 audit_enabled; |
80 | int audit_ever_enabled; | 83 | u32 audit_ever_enabled; |
81 | 84 | ||
82 | EXPORT_SYMBOL_GPL(audit_enabled); | 85 | EXPORT_SYMBOL_GPL(audit_enabled); |
83 | 86 | ||
84 | /* Default state when kernel boots without any parameters. */ | 87 | /* Default state when kernel boots without any parameters. */ |
85 | static int audit_default; | 88 | static u32 audit_default; |
86 | 89 | ||
87 | /* If auditing cannot proceed, audit_failure selects what happens. */ | 90 | /* If auditing cannot proceed, audit_failure selects what happens. */ |
88 | static int audit_failure = AUDIT_FAIL_PRINTK; | 91 | static u32 audit_failure = AUDIT_FAIL_PRINTK; |
89 | 92 | ||
90 | /* | 93 | /* |
91 | * If audit records are to be written to the netlink socket, audit_pid | 94 | * If audit records are to be written to the netlink socket, audit_pid |
@@ -93,17 +96,19 @@ static int audit_failure = AUDIT_FAIL_PRINTK; | |||
93 | * the portid to use to send netlink messages to that process. | 96 | * the portid to use to send netlink messages to that process. |
94 | */ | 97 | */ |
95 | int audit_pid; | 98 | int audit_pid; |
96 | static int audit_nlk_portid; | 99 | static __u32 audit_nlk_portid; |
97 | 100 | ||
98 | /* If audit_rate_limit is non-zero, limit the rate of sending audit records | 101 | /* If audit_rate_limit is non-zero, limit the rate of sending audit records |
99 | * to that number per second. This prevents DoS attacks, but results in | 102 | * to that number per second. This prevents DoS attacks, but results in |
100 | * audit records being dropped. */ | 103 | * audit records being dropped. */ |
101 | static int audit_rate_limit; | 104 | static u32 audit_rate_limit; |
102 | 105 | ||
103 | /* Number of outstanding audit_buffers allowed. */ | 106 | /* Number of outstanding audit_buffers allowed. |
104 | static int audit_backlog_limit = 64; | 107 | * When set to zero, this means unlimited. */ |
105 | static int audit_backlog_wait_time = 60 * HZ; | 108 | static u32 audit_backlog_limit = 64; |
106 | static int audit_backlog_wait_overflow = 0; | 109 | #define AUDIT_BACKLOG_WAIT_TIME (60 * HZ) |
110 | static u32 audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME; | ||
111 | static u32 audit_backlog_wait_overflow = 0; | ||
107 | 112 | ||
108 | /* The identity of the user shutting down the audit system. */ | 113 | /* The identity of the user shutting down the audit system. */ |
109 | kuid_t audit_sig_uid = INVALID_UID; | 114 | kuid_t audit_sig_uid = INVALID_UID; |
@@ -121,6 +126,7 @@ static atomic_t audit_lost = ATOMIC_INIT(0); | |||
121 | 126 | ||
122 | /* The netlink socket. */ | 127 | /* The netlink socket. */ |
123 | static struct sock *audit_sock; | 128 | static struct sock *audit_sock; |
129 | int audit_net_id; | ||
124 | 130 | ||
125 | /* Hash for inode-based rules */ | 131 | /* Hash for inode-based rules */ |
126 | struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; | 132 | struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; |
@@ -175,27 +181,27 @@ struct audit_buffer { | |||
175 | }; | 181 | }; |
176 | 182 | ||
177 | struct audit_reply { | 183 | struct audit_reply { |
178 | int pid; | 184 | __u32 portid; |
185 | struct net *net; | ||
179 | struct sk_buff *skb; | 186 | struct sk_buff *skb; |
180 | }; | 187 | }; |
181 | 188 | ||
182 | static void audit_set_pid(struct audit_buffer *ab, pid_t pid) | 189 | static void audit_set_portid(struct audit_buffer *ab, __u32 portid) |
183 | { | 190 | { |
184 | if (ab) { | 191 | if (ab) { |
185 | struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); | 192 | struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); |
186 | nlh->nlmsg_pid = pid; | 193 | nlh->nlmsg_pid = portid; |
187 | } | 194 | } |
188 | } | 195 | } |
189 | 196 | ||
190 | void audit_panic(const char *message) | 197 | void audit_panic(const char *message) |
191 | { | 198 | { |
192 | switch (audit_failure) | 199 | switch (audit_failure) { |
193 | { | ||
194 | case AUDIT_FAIL_SILENT: | 200 | case AUDIT_FAIL_SILENT: |
195 | break; | 201 | break; |
196 | case AUDIT_FAIL_PRINTK: | 202 | case AUDIT_FAIL_PRINTK: |
197 | if (printk_ratelimit()) | 203 | if (printk_ratelimit()) |
198 | printk(KERN_ERR "audit: %s\n", message); | 204 | pr_err("%s\n", message); |
199 | break; | 205 | break; |
200 | case AUDIT_FAIL_PANIC: | 206 | case AUDIT_FAIL_PANIC: |
201 | /* test audit_pid since printk is always losey, why bother? */ | 207 | /* test audit_pid since printk is always losey, why bother? */ |
@@ -266,9 +272,7 @@ void audit_log_lost(const char *message) | |||
266 | 272 | ||
267 | if (print) { | 273 | if (print) { |
268 | if (printk_ratelimit()) | 274 | if (printk_ratelimit()) |
269 | printk(KERN_WARNING | 275 | pr_warn("audit_lost=%u audit_rate_limit=%u audit_backlog_limit=%u\n", |
270 | "audit: audit_lost=%d audit_rate_limit=%d " | ||
271 | "audit_backlog_limit=%d\n", | ||
272 | atomic_read(&audit_lost), | 276 | atomic_read(&audit_lost), |
273 | audit_rate_limit, | 277 | audit_rate_limit, |
274 | audit_backlog_limit); | 278 | audit_backlog_limit); |
@@ -276,7 +280,7 @@ void audit_log_lost(const char *message) | |||
276 | } | 280 | } |
277 | } | 281 | } |
278 | 282 | ||
279 | static int audit_log_config_change(char *function_name, int new, int old, | 283 | static int audit_log_config_change(char *function_name, u32 new, u32 old, |
280 | int allow_changes) | 284 | int allow_changes) |
281 | { | 285 | { |
282 | struct audit_buffer *ab; | 286 | struct audit_buffer *ab; |
@@ -285,7 +289,7 @@ static int audit_log_config_change(char *function_name, int new, int old, | |||
285 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); | 289 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); |
286 | if (unlikely(!ab)) | 290 | if (unlikely(!ab)) |
287 | return rc; | 291 | return rc; |
288 | audit_log_format(ab, "%s=%d old=%d", function_name, new, old); | 292 | audit_log_format(ab, "%s=%u old=%u", function_name, new, old); |
289 | audit_log_session_info(ab); | 293 | audit_log_session_info(ab); |
290 | rc = audit_log_task_context(ab); | 294 | rc = audit_log_task_context(ab); |
291 | if (rc) | 295 | if (rc) |
@@ -295,9 +299,10 @@ static int audit_log_config_change(char *function_name, int new, int old, | |||
295 | return rc; | 299 | return rc; |
296 | } | 300 | } |
297 | 301 | ||
298 | static int audit_do_config_change(char *function_name, int *to_change, int new) | 302 | static int audit_do_config_change(char *function_name, u32 *to_change, u32 new) |
299 | { | 303 | { |
300 | int allow_changes, rc = 0, old = *to_change; | 304 | int allow_changes, rc = 0; |
305 | u32 old = *to_change; | ||
301 | 306 | ||
302 | /* check if we are locked */ | 307 | /* check if we are locked */ |
303 | if (audit_enabled == AUDIT_LOCKED) | 308 | if (audit_enabled == AUDIT_LOCKED) |
@@ -320,17 +325,23 @@ static int audit_do_config_change(char *function_name, int *to_change, int new) | |||
320 | return rc; | 325 | return rc; |
321 | } | 326 | } |
322 | 327 | ||
323 | static int audit_set_rate_limit(int limit) | 328 | static int audit_set_rate_limit(u32 limit) |
324 | { | 329 | { |
325 | return audit_do_config_change("audit_rate_limit", &audit_rate_limit, limit); | 330 | return audit_do_config_change("audit_rate_limit", &audit_rate_limit, limit); |
326 | } | 331 | } |
327 | 332 | ||
328 | static int audit_set_backlog_limit(int limit) | 333 | static int audit_set_backlog_limit(u32 limit) |
329 | { | 334 | { |
330 | return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit, limit); | 335 | return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit, limit); |
331 | } | 336 | } |
332 | 337 | ||
333 | static int audit_set_enabled(int state) | 338 | static int audit_set_backlog_wait_time(u32 timeout) |
339 | { | ||
340 | return audit_do_config_change("audit_backlog_wait_time", | ||
341 | &audit_backlog_wait_time, timeout); | ||
342 | } | ||
343 | |||
344 | static int audit_set_enabled(u32 state) | ||
334 | { | 345 | { |
335 | int rc; | 346 | int rc; |
336 | if (state < AUDIT_OFF || state > AUDIT_LOCKED) | 347 | if (state < AUDIT_OFF || state > AUDIT_LOCKED) |
@@ -343,7 +354,7 @@ static int audit_set_enabled(int state) | |||
343 | return rc; | 354 | return rc; |
344 | } | 355 | } |
345 | 356 | ||
346 | static int audit_set_failure(int state) | 357 | static int audit_set_failure(u32 state) |
347 | { | 358 | { |
348 | if (state != AUDIT_FAIL_SILENT | 359 | if (state != AUDIT_FAIL_SILENT |
349 | && state != AUDIT_FAIL_PRINTK | 360 | && state != AUDIT_FAIL_PRINTK |
@@ -365,7 +376,8 @@ static int audit_set_failure(int state) | |||
365 | static void audit_hold_skb(struct sk_buff *skb) | 376 | static void audit_hold_skb(struct sk_buff *skb) |
366 | { | 377 | { |
367 | if (audit_default && | 378 | if (audit_default && |
368 | skb_queue_len(&audit_skb_hold_queue) < audit_backlog_limit) | 379 | (!audit_backlog_limit || |
380 | skb_queue_len(&audit_skb_hold_queue) < audit_backlog_limit)) | ||
369 | skb_queue_tail(&audit_skb_hold_queue, skb); | 381 | skb_queue_tail(&audit_skb_hold_queue, skb); |
370 | else | 382 | else |
371 | kfree_skb(skb); | 383 | kfree_skb(skb); |
@@ -382,7 +394,7 @@ static void audit_printk_skb(struct sk_buff *skb) | |||
382 | 394 | ||
383 | if (nlh->nlmsg_type != AUDIT_EOE) { | 395 | if (nlh->nlmsg_type != AUDIT_EOE) { |
384 | if (printk_ratelimit()) | 396 | if (printk_ratelimit()) |
385 | printk(KERN_NOTICE "type=%d %s\n", nlh->nlmsg_type, data); | 397 | pr_notice("type=%d %s\n", nlh->nlmsg_type, data); |
386 | else | 398 | else |
387 | audit_log_lost("printk limit exceeded\n"); | 399 | audit_log_lost("printk limit exceeded\n"); |
388 | } | 400 | } |
@@ -398,9 +410,12 @@ static void kauditd_send_skb(struct sk_buff *skb) | |||
398 | err = netlink_unicast(audit_sock, skb, audit_nlk_portid, 0); | 410 | err = netlink_unicast(audit_sock, skb, audit_nlk_portid, 0); |
399 | if (err < 0) { | 411 | if (err < 0) { |
400 | BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */ | 412 | BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */ |
401 | printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); | 413 | if (audit_pid) { |
402 | audit_log_lost("auditd disappeared\n"); | 414 | pr_err("*NO* daemon at audit_pid=%d\n", audit_pid); |
403 | audit_pid = 0; | 415 | audit_log_lost("auditd disappeared\n"); |
416 | audit_pid = 0; | ||
417 | audit_sock = NULL; | ||
418 | } | ||
404 | /* we might get lucky and get this in the next auditd */ | 419 | /* we might get lucky and get this in the next auditd */ |
405 | audit_hold_skb(skb); | 420 | audit_hold_skb(skb); |
406 | } else | 421 | } else |
@@ -457,8 +472,10 @@ static int kauditd_thread(void *dummy) | |||
457 | flush_hold_queue(); | 472 | flush_hold_queue(); |
458 | 473 | ||
459 | skb = skb_dequeue(&audit_skb_queue); | 474 | skb = skb_dequeue(&audit_skb_queue); |
460 | wake_up(&audit_backlog_wait); | 475 | |
461 | if (skb) { | 476 | if (skb) { |
477 | if (skb_queue_len(&audit_skb_queue) <= audit_backlog_limit) | ||
478 | wake_up(&audit_backlog_wait); | ||
462 | if (audit_pid) | 479 | if (audit_pid) |
463 | kauditd_send_skb(skb); | 480 | kauditd_send_skb(skb); |
464 | else | 481 | else |
@@ -482,22 +499,24 @@ static int kauditd_thread(void *dummy) | |||
482 | int audit_send_list(void *_dest) | 499 | int audit_send_list(void *_dest) |
483 | { | 500 | { |
484 | struct audit_netlink_list *dest = _dest; | 501 | struct audit_netlink_list *dest = _dest; |
485 | int pid = dest->pid; | ||
486 | struct sk_buff *skb; | 502 | struct sk_buff *skb; |
503 | struct net *net = dest->net; | ||
504 | struct audit_net *aunet = net_generic(net, audit_net_id); | ||
487 | 505 | ||
488 | /* wait for parent to finish and send an ACK */ | 506 | /* wait for parent to finish and send an ACK */ |
489 | mutex_lock(&audit_cmd_mutex); | 507 | mutex_lock(&audit_cmd_mutex); |
490 | mutex_unlock(&audit_cmd_mutex); | 508 | mutex_unlock(&audit_cmd_mutex); |
491 | 509 | ||
492 | while ((skb = __skb_dequeue(&dest->q)) != NULL) | 510 | while ((skb = __skb_dequeue(&dest->q)) != NULL) |
493 | netlink_unicast(audit_sock, skb, pid, 0); | 511 | netlink_unicast(aunet->nlsk, skb, dest->portid, 0); |
494 | 512 | ||
513 | put_net(net); | ||
495 | kfree(dest); | 514 | kfree(dest); |
496 | 515 | ||
497 | return 0; | 516 | return 0; |
498 | } | 517 | } |
499 | 518 | ||
500 | struct sk_buff *audit_make_reply(int pid, int seq, int type, int done, | 519 | struct sk_buff *audit_make_reply(__u32 portid, int seq, int type, int done, |
501 | int multi, const void *payload, int size) | 520 | int multi, const void *payload, int size) |
502 | { | 521 | { |
503 | struct sk_buff *skb; | 522 | struct sk_buff *skb; |
@@ -510,7 +529,7 @@ struct sk_buff *audit_make_reply(int pid, int seq, int type, int done, | |||
510 | if (!skb) | 529 | if (!skb) |
511 | return NULL; | 530 | return NULL; |
512 | 531 | ||
513 | nlh = nlmsg_put(skb, pid, seq, t, size, flags); | 532 | nlh = nlmsg_put(skb, portid, seq, t, size, flags); |
514 | if (!nlh) | 533 | if (!nlh) |
515 | goto out_kfree_skb; | 534 | goto out_kfree_skb; |
516 | data = nlmsg_data(nlh); | 535 | data = nlmsg_data(nlh); |
@@ -525,19 +544,22 @@ out_kfree_skb: | |||
525 | static int audit_send_reply_thread(void *arg) | 544 | static int audit_send_reply_thread(void *arg) |
526 | { | 545 | { |
527 | struct audit_reply *reply = (struct audit_reply *)arg; | 546 | struct audit_reply *reply = (struct audit_reply *)arg; |
547 | struct net *net = reply->net; | ||
548 | struct audit_net *aunet = net_generic(net, audit_net_id); | ||
528 | 549 | ||
529 | mutex_lock(&audit_cmd_mutex); | 550 | mutex_lock(&audit_cmd_mutex); |
530 | mutex_unlock(&audit_cmd_mutex); | 551 | mutex_unlock(&audit_cmd_mutex); |
531 | 552 | ||
532 | /* Ignore failure. It'll only happen if the sender goes away, | 553 | /* Ignore failure. It'll only happen if the sender goes away, |
533 | because our timeout is set to infinite. */ | 554 | because our timeout is set to infinite. */ |
534 | netlink_unicast(audit_sock, reply->skb, reply->pid, 0); | 555 | netlink_unicast(aunet->nlsk , reply->skb, reply->portid, 0); |
556 | put_net(net); | ||
535 | kfree(reply); | 557 | kfree(reply); |
536 | return 0; | 558 | return 0; |
537 | } | 559 | } |
538 | /** | 560 | /** |
539 | * audit_send_reply - send an audit reply message via netlink | 561 | * audit_send_reply - send an audit reply message via netlink |
540 | * @pid: process id to send reply to | 562 | * @request_skb: skb of request we are replying to (used to target the reply) |
541 | * @seq: sequence number | 563 | * @seq: sequence number |
542 | * @type: audit message type | 564 | * @type: audit message type |
543 | * @done: done (last) flag | 565 | * @done: done (last) flag |
@@ -545,12 +567,14 @@ static int audit_send_reply_thread(void *arg) | |||
545 | * @payload: payload data | 567 | * @payload: payload data |
546 | * @size: payload size | 568 | * @size: payload size |
547 | * | 569 | * |
548 | * Allocates an skb, builds the netlink message, and sends it to the pid. | 570 | * Allocates an skb, builds the netlink message, and sends it to the port id. |
549 | * No failure notifications. | 571 | * No failure notifications. |
550 | */ | 572 | */ |
551 | static void audit_send_reply(int pid, int seq, int type, int done, int multi, | 573 | static void audit_send_reply(struct sk_buff *request_skb, int seq, int type, int done, |
552 | const void *payload, int size) | 574 | int multi, const void *payload, int size) |
553 | { | 575 | { |
576 | u32 portid = NETLINK_CB(request_skb).portid; | ||
577 | struct net *net = sock_net(NETLINK_CB(request_skb).sk); | ||
554 | struct sk_buff *skb; | 578 | struct sk_buff *skb; |
555 | struct task_struct *tsk; | 579 | struct task_struct *tsk; |
556 | struct audit_reply *reply = kmalloc(sizeof(struct audit_reply), | 580 | struct audit_reply *reply = kmalloc(sizeof(struct audit_reply), |
@@ -559,11 +583,12 @@ static void audit_send_reply(int pid, int seq, int type, int done, int multi, | |||
559 | if (!reply) | 583 | if (!reply) |
560 | return; | 584 | return; |
561 | 585 | ||
562 | skb = audit_make_reply(pid, seq, type, done, multi, payload, size); | 586 | skb = audit_make_reply(portid, seq, type, done, multi, payload, size); |
563 | if (!skb) | 587 | if (!skb) |
564 | goto out; | 588 | goto out; |
565 | 589 | ||
566 | reply->pid = pid; | 590 | reply->net = get_net(net); |
591 | reply->portid = portid; | ||
567 | reply->skb = skb; | 592 | reply->skb = skb; |
568 | 593 | ||
569 | tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply"); | 594 | tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply"); |
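Note: the reworked reply path above derives both the netlink port id and the network namespace from the request skb instead of trusting a caller-supplied pid. A minimal sketch of that pattern, using only the fields this hunk already relies on (NETLINK_CB() and sock_net() on the requesting socket); reply_target() is a hypothetical helper name, not part of this patch:

    #include <linux/netlink.h>
    #include <net/sock.h>
    #include <net/net_namespace.h>

    /* Sketch: resolve where a netlink reply should go from the request itself. */
    static void reply_target(struct sk_buff *request_skb,
                             u32 *portid, struct net **net)
    {
            *portid = NETLINK_CB(request_skb).portid;    /* requester's port id   */
            *net = sock_net(NETLINK_CB(request_skb).sk); /* requester's namespace */
    }
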
@@ -652,8 +677,7 @@ static int audit_get_feature(struct sk_buff *skb) | |||
652 | 677 | ||
653 | seq = nlmsg_hdr(skb)->nlmsg_seq; | 678 | seq = nlmsg_hdr(skb)->nlmsg_seq; |
654 | 679 | ||
655 | audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_GET, 0, 0, | 680 | audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &af, sizeof(af)); |
656 | &af, sizeof(af)); | ||
657 | 681 | ||
658 | return 0; | 682 | return 0; |
659 | } | 683 | } |
@@ -663,8 +687,12 @@ static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature | |||
663 | { | 687 | { |
664 | struct audit_buffer *ab; | 688 | struct audit_buffer *ab; |
665 | 689 | ||
690 | if (audit_enabled == AUDIT_OFF) | ||
691 | return; | ||
692 | |||
666 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_FEATURE_CHANGE); | 693 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_FEATURE_CHANGE); |
667 | audit_log_format(ab, "feature=%s new=%d old=%d old_lock=%d new_lock=%d res=%d", | 694 | audit_log_task_info(ab, current); |
695 | audit_log_format(ab, "feature=%s old=%u new=%u old_lock=%u new_lock=%u res=%d", | ||
668 | audit_feature_names[which], !!old_feature, !!new_feature, | 696 | audit_feature_names[which], !!old_feature, !!new_feature, |
669 | !!old_lock, !!new_lock, res); | 697 | !!old_lock, !!new_lock, res); |
670 | audit_log_end(ab); | 698 | audit_log_end(ab); |
@@ -694,7 +722,7 @@ static int audit_set_feature(struct sk_buff *skb) | |||
694 | old_lock = af.lock & feature; | 722 | old_lock = af.lock & feature; |
695 | 723 | ||
696 | /* are we changing a locked feature? */ | 724 | /* are we changing a locked feature? */ |
697 | if ((af.lock & feature) && (new_feature != old_feature)) { | 725 | if (old_lock && (new_feature != old_feature)) { |
698 | audit_log_feature_change(i, old_feature, new_feature, | 726 | audit_log_feature_change(i, old_feature, new_feature, |
699 | old_lock, new_lock, 0); | 727 | old_lock, new_lock, 0); |
700 | return -EPERM; | 728 | return -EPERM; |
@@ -732,7 +760,6 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
732 | { | 760 | { |
733 | u32 seq; | 761 | u32 seq; |
734 | void *data; | 762 | void *data; |
735 | struct audit_status *status_get, status_set; | ||
736 | int err; | 763 | int err; |
737 | struct audit_buffer *ab; | 764 | struct audit_buffer *ab; |
738 | u16 msg_type = nlh->nlmsg_type; | 765 | u16 msg_type = nlh->nlmsg_type; |
@@ -758,48 +785,69 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
758 | data = nlmsg_data(nlh); | 785 | data = nlmsg_data(nlh); |
759 | 786 | ||
760 | switch (msg_type) { | 787 | switch (msg_type) { |
761 | case AUDIT_GET: | 788 | case AUDIT_GET: { |
762 | memset(&status_set, 0, sizeof(status_set)); | 789 | struct audit_status s; |
763 | status_set.enabled = audit_enabled; | 790 | memset(&s, 0, sizeof(s)); |
764 | status_set.failure = audit_failure; | 791 | s.enabled = audit_enabled; |
765 | status_set.pid = audit_pid; | 792 | s.failure = audit_failure; |
766 | status_set.rate_limit = audit_rate_limit; | 793 | s.pid = audit_pid; |
767 | status_set.backlog_limit = audit_backlog_limit; | 794 | s.rate_limit = audit_rate_limit; |
768 | status_set.lost = atomic_read(&audit_lost); | 795 | s.backlog_limit = audit_backlog_limit; |
769 | status_set.backlog = skb_queue_len(&audit_skb_queue); | 796 | s.lost = atomic_read(&audit_lost); |
770 | audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_GET, 0, 0, | 797 | s.backlog = skb_queue_len(&audit_skb_queue); |
771 | &status_set, sizeof(status_set)); | 798 | s.version = AUDIT_VERSION_LATEST; |
799 | s.backlog_wait_time = audit_backlog_wait_time; | ||
800 | audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s)); | ||
772 | break; | 801 | break; |
773 | case AUDIT_SET: | 802 | } |
774 | if (nlmsg_len(nlh) < sizeof(struct audit_status)) | 803 | case AUDIT_SET: { |
775 | return -EINVAL; | 804 | struct audit_status s; |
776 | status_get = (struct audit_status *)data; | 805 | memset(&s, 0, sizeof(s)); |
777 | if (status_get->mask & AUDIT_STATUS_ENABLED) { | 806 | /* guard against past and future API changes */ |
778 | err = audit_set_enabled(status_get->enabled); | 807 | memcpy(&s, data, min_t(size_t, sizeof(s), nlmsg_len(nlh))); |
808 | if (s.mask & AUDIT_STATUS_ENABLED) { | ||
809 | err = audit_set_enabled(s.enabled); | ||
779 | if (err < 0) | 810 | if (err < 0) |
780 | return err; | 811 | return err; |
781 | } | 812 | } |
782 | if (status_get->mask & AUDIT_STATUS_FAILURE) { | 813 | if (s.mask & AUDIT_STATUS_FAILURE) { |
783 | err = audit_set_failure(status_get->failure); | 814 | err = audit_set_failure(s.failure); |
784 | if (err < 0) | 815 | if (err < 0) |
785 | return err; | 816 | return err; |
786 | } | 817 | } |
787 | if (status_get->mask & AUDIT_STATUS_PID) { | 818 | if (s.mask & AUDIT_STATUS_PID) { |
788 | int new_pid = status_get->pid; | 819 | int new_pid = s.pid; |
789 | 820 | ||
821 | if ((!new_pid) && (task_tgid_vnr(current) != audit_pid)) | ||
822 | return -EACCES; | ||
790 | if (audit_enabled != AUDIT_OFF) | 823 | if (audit_enabled != AUDIT_OFF) |
791 | audit_log_config_change("audit_pid", new_pid, audit_pid, 1); | 824 | audit_log_config_change("audit_pid", new_pid, audit_pid, 1); |
792 | audit_pid = new_pid; | 825 | audit_pid = new_pid; |
793 | audit_nlk_portid = NETLINK_CB(skb).portid; | 826 | audit_nlk_portid = NETLINK_CB(skb).portid; |
827 | audit_sock = skb->sk; | ||
794 | } | 828 | } |
795 | if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) { | 829 | if (s.mask & AUDIT_STATUS_RATE_LIMIT) { |
796 | err = audit_set_rate_limit(status_get->rate_limit); | 830 | err = audit_set_rate_limit(s.rate_limit); |
831 | if (err < 0) | ||
832 | return err; | ||
833 | } | ||
834 | if (s.mask & AUDIT_STATUS_BACKLOG_LIMIT) { | ||
835 | err = audit_set_backlog_limit(s.backlog_limit); | ||
836 | if (err < 0) | ||
837 | return err; | ||
838 | } | ||
839 | if (s.mask & AUDIT_STATUS_BACKLOG_WAIT_TIME) { | ||
840 | if (sizeof(s) > (size_t)nlh->nlmsg_len) | ||
841 | return -EINVAL; | ||
842 | if (s.backlog_wait_time < 0 || | ||
843 | s.backlog_wait_time > 10*AUDIT_BACKLOG_WAIT_TIME) | ||
844 | return -EINVAL; | ||
845 | err = audit_set_backlog_wait_time(s.backlog_wait_time); | ||
797 | if (err < 0) | 846 | if (err < 0) |
798 | return err; | 847 | return err; |
799 | } | 848 | } |
800 | if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT) | ||
801 | err = audit_set_backlog_limit(status_get->backlog_limit); | ||
802 | break; | 849 | break; |
850 | } | ||
803 | case AUDIT_GET_FEATURE: | 851 | case AUDIT_GET_FEATURE: |
804 | err = audit_get_feature(skb); | 852 | err = audit_get_feature(skb); |
805 | if (err) | 853 | if (err) |
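Note: the AUDIT_SET handler above copies at most sizeof(struct audit_status) bytes and zero-fills the rest, so user space built against an older or newer audit_status layout is still parsed safely. A stand-alone sketch of the same guard, with handle_audit_set() as a hypothetical function name:

    #include <linux/kernel.h>
    #include <linux/string.h>
    #include <linux/audit.h>
    #include <net/netlink.h>

    /* Sketch of the "guard against past and future API changes" idiom used
     * in the AUDIT_SET case above. */
    static int handle_audit_set(struct nlmsghdr *nlh)
    {
            struct audit_status s;

            memset(&s, 0, sizeof(s));               /* unknown fields read as 0 */
            memcpy(&s, nlmsg_data(nlh),
                   min_t(size_t, sizeof(s), nlmsg_len(nlh)));
            /* ... act on s.mask bits as the hunk above does ... */
            return 0;
    }
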
@@ -817,13 +865,14 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
817 | return 0; | 865 | return 0; |
818 | 866 | ||
819 | err = audit_filter_user(msg_type); | 867 | err = audit_filter_user(msg_type); |
820 | if (err == 1) { | 868 | if (err == 1) { /* match or error */ |
821 | err = 0; | 869 | err = 0; |
822 | if (msg_type == AUDIT_USER_TTY) { | 870 | if (msg_type == AUDIT_USER_TTY) { |
823 | err = tty_audit_push_current(); | 871 | err = tty_audit_push_current(); |
824 | if (err) | 872 | if (err) |
825 | break; | 873 | break; |
826 | } | 874 | } |
875 | mutex_unlock(&audit_cmd_mutex); | ||
827 | audit_log_common_recv_msg(&ab, msg_type); | 876 | audit_log_common_recv_msg(&ab, msg_type); |
828 | if (msg_type != AUDIT_USER_TTY) | 877 | if (msg_type != AUDIT_USER_TTY) |
829 | audit_log_format(ab, " msg='%.*s'", | 878 | audit_log_format(ab, " msg='%.*s'", |
@@ -839,8 +888,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
839 | size--; | 888 | size--; |
840 | audit_log_n_untrustedstring(ab, data, size); | 889 | audit_log_n_untrustedstring(ab, data, size); |
841 | } | 890 | } |
842 | audit_set_pid(ab, NETLINK_CB(skb).portid); | 891 | audit_set_portid(ab, NETLINK_CB(skb).portid); |
843 | audit_log_end(ab); | 892 | audit_log_end(ab); |
893 | mutex_lock(&audit_cmd_mutex); | ||
844 | } | 894 | } |
845 | break; | 895 | break; |
846 | case AUDIT_ADD_RULE: | 896 | case AUDIT_ADD_RULE: |
@@ -853,11 +903,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
853 | audit_log_end(ab); | 903 | audit_log_end(ab); |
854 | return -EPERM; | 904 | return -EPERM; |
855 | } | 905 | } |
856 | /* fallthrough */ | 906 | err = audit_rule_change(msg_type, NETLINK_CB(skb).portid, |
857 | case AUDIT_LIST_RULES: | ||
858 | err = audit_receive_filter(msg_type, NETLINK_CB(skb).portid, | ||
859 | seq, data, nlmsg_len(nlh)); | 907 | seq, data, nlmsg_len(nlh)); |
860 | break; | 908 | break; |
909 | case AUDIT_LIST_RULES: | ||
910 | err = audit_list_rules_send(skb, seq); | ||
911 | break; | ||
861 | case AUDIT_TRIM: | 912 | case AUDIT_TRIM: |
862 | audit_trim_trees(); | 913 | audit_trim_trees(); |
863 | audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE); | 914 | audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE); |
@@ -921,8 +972,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
921 | memcpy(sig_data->ctx, ctx, len); | 972 | memcpy(sig_data->ctx, ctx, len); |
922 | security_release_secctx(ctx, len); | 973 | security_release_secctx(ctx, len); |
923 | } | 974 | } |
924 | audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_SIGNAL_INFO, | 975 | audit_send_reply(skb, seq, AUDIT_SIGNAL_INFO, 0, 0, |
925 | 0, 0, sig_data, sizeof(*sig_data) + len); | 976 | sig_data, sizeof(*sig_data) + len); |
926 | kfree(sig_data); | 977 | kfree(sig_data); |
927 | break; | 978 | break; |
928 | case AUDIT_TTY_GET: { | 979 | case AUDIT_TTY_GET: { |
@@ -934,25 +985,37 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
934 | s.log_passwd = tsk->signal->audit_tty_log_passwd; | 985 | s.log_passwd = tsk->signal->audit_tty_log_passwd; |
935 | spin_unlock(&tsk->sighand->siglock); | 986 | spin_unlock(&tsk->sighand->siglock); |
936 | 987 | ||
937 | audit_send_reply(NETLINK_CB(skb).portid, seq, | 988 | audit_send_reply(skb, seq, AUDIT_TTY_GET, 0, 0, &s, sizeof(s)); |
938 | AUDIT_TTY_GET, 0, 0, &s, sizeof(s)); | ||
939 | break; | 989 | break; |
940 | } | 990 | } |
941 | case AUDIT_TTY_SET: { | 991 | case AUDIT_TTY_SET: { |
942 | struct audit_tty_status s; | 992 | struct audit_tty_status s, old; |
943 | struct task_struct *tsk = current; | 993 | struct task_struct *tsk = current; |
994 | struct audit_buffer *ab; | ||
944 | 995 | ||
945 | memset(&s, 0, sizeof(s)); | 996 | memset(&s, 0, sizeof(s)); |
946 | /* guard against past and future API changes */ | 997 | /* guard against past and future API changes */ |
947 | memcpy(&s, data, min_t(size_t, sizeof(s), nlmsg_len(nlh))); | 998 | memcpy(&s, data, min_t(size_t, sizeof(s), nlmsg_len(nlh))); |
999 | /* check if new data is valid */ | ||
948 | if ((s.enabled != 0 && s.enabled != 1) || | 1000 | if ((s.enabled != 0 && s.enabled != 1) || |
949 | (s.log_passwd != 0 && s.log_passwd != 1)) | 1001 | (s.log_passwd != 0 && s.log_passwd != 1)) |
950 | return -EINVAL; | 1002 | err = -EINVAL; |
951 | 1003 | ||
952 | spin_lock(&tsk->sighand->siglock); | 1004 | spin_lock(&tsk->sighand->siglock); |
953 | tsk->signal->audit_tty = s.enabled; | 1005 | old.enabled = tsk->signal->audit_tty; |
954 | tsk->signal->audit_tty_log_passwd = s.log_passwd; | 1006 | old.log_passwd = tsk->signal->audit_tty_log_passwd; |
1007 | if (!err) { | ||
1008 | tsk->signal->audit_tty = s.enabled; | ||
1009 | tsk->signal->audit_tty_log_passwd = s.log_passwd; | ||
1010 | } | ||
955 | spin_unlock(&tsk->sighand->siglock); | 1011 | spin_unlock(&tsk->sighand->siglock); |
1012 | |||
1013 | audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE); | ||
1014 | audit_log_format(ab, " op=tty_set old-enabled=%d new-enabled=%d" | ||
1015 | " old-log_passwd=%d new-log_passwd=%d res=%d", | ||
1016 | old.enabled, s.enabled, old.log_passwd, | ||
1017 | s.log_passwd, !err); | ||
1018 | audit_log_end(ab); | ||
956 | break; | 1019 | break; |
957 | } | 1020 | } |
958 | default: | 1021 | default: |
@@ -998,24 +1061,55 @@ static void audit_receive(struct sk_buff *skb) | |||
998 | mutex_unlock(&audit_cmd_mutex); | 1061 | mutex_unlock(&audit_cmd_mutex); |
999 | } | 1062 | } |
1000 | 1063 | ||
1001 | /* Initialize audit support at boot time. */ | 1064 | static int __net_init audit_net_init(struct net *net) |
1002 | static int __init audit_init(void) | ||
1003 | { | 1065 | { |
1004 | int i; | ||
1005 | struct netlink_kernel_cfg cfg = { | 1066 | struct netlink_kernel_cfg cfg = { |
1006 | .input = audit_receive, | 1067 | .input = audit_receive, |
1007 | }; | 1068 | }; |
1008 | 1069 | ||
1070 | struct audit_net *aunet = net_generic(net, audit_net_id); | ||
1071 | |||
1072 | aunet->nlsk = netlink_kernel_create(net, NETLINK_AUDIT, &cfg); | ||
1073 | if (aunet->nlsk == NULL) { | ||
1074 | audit_panic("cannot initialize netlink socket in namespace"); | ||
1075 | return -ENOMEM; | ||
1076 | } | ||
1077 | aunet->nlsk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; | ||
1078 | return 0; | ||
1079 | } | ||
1080 | |||
1081 | static void __net_exit audit_net_exit(struct net *net) | ||
1082 | { | ||
1083 | struct audit_net *aunet = net_generic(net, audit_net_id); | ||
1084 | struct sock *sock = aunet->nlsk; | ||
1085 | if (sock == audit_sock) { | ||
1086 | audit_pid = 0; | ||
1087 | audit_sock = NULL; | ||
1088 | } | ||
1089 | |||
1090 | rcu_assign_pointer(aunet->nlsk, NULL); | ||
1091 | synchronize_net(); | ||
1092 | netlink_kernel_release(sock); | ||
1093 | } | ||
1094 | |||
1095 | static struct pernet_operations audit_net_ops __net_initdata = { | ||
1096 | .init = audit_net_init, | ||
1097 | .exit = audit_net_exit, | ||
1098 | .id = &audit_net_id, | ||
1099 | .size = sizeof(struct audit_net), | ||
1100 | }; | ||
1101 | |||
1102 | /* Initialize audit support at boot time. */ | ||
1103 | static int __init audit_init(void) | ||
1104 | { | ||
1105 | int i; | ||
1106 | |||
1009 | if (audit_initialized == AUDIT_DISABLED) | 1107 | if (audit_initialized == AUDIT_DISABLED) |
1010 | return 0; | 1108 | return 0; |
1011 | 1109 | ||
1012 | printk(KERN_INFO "audit: initializing netlink socket (%s)\n", | 1110 | pr_info("initializing netlink subsys (%s)\n", |
1013 | audit_default ? "enabled" : "disabled"); | 1111 | audit_default ? "enabled" : "disabled"); |
1014 | audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, &cfg); | 1112 | register_pernet_subsys(&audit_net_ops); |
1015 | if (!audit_sock) | ||
1016 | audit_panic("cannot initialize netlink socket"); | ||
1017 | else | ||
1018 | audit_sock->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; | ||
1019 | 1113 | ||
1020 | skb_queue_head_init(&audit_skb_queue); | 1114 | skb_queue_head_init(&audit_skb_queue); |
1021 | skb_queue_head_init(&audit_skb_hold_queue); | 1115 | skb_queue_head_init(&audit_skb_hold_queue); |
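Note: the per-namespace conversion above stores the NETLINK_AUDIT socket in a struct audit_net slot registered through register_pernet_subsys(). A brief sketch of how audit code then recovers the right socket for a given namespace; this restates the net_generic() lookup already used in audit_send_list() and audit_send_reply_thread(), and lookup_audit_sock() is a hypothetical helper:

    #include <net/net_namespace.h>
    #include <net/netns/generic.h>
    #include "audit.h"  /* struct audit_net; audit_net_id is declared in audit.c above */

    /* Sketch: look up the per-namespace audit state. */
    static struct sock *lookup_audit_sock(struct net *net)
    {
            struct audit_net *aunet = net_generic(net, audit_net_id);

            return aunet->nlsk;     /* socket created in audit_net_init() */
    }
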
@@ -1039,22 +1133,32 @@ static int __init audit_enable(char *str) | |||
1039 | if (!audit_default) | 1133 | if (!audit_default) |
1040 | audit_initialized = AUDIT_DISABLED; | 1134 | audit_initialized = AUDIT_DISABLED; |
1041 | 1135 | ||
1042 | printk(KERN_INFO "audit: %s", audit_default ? "enabled" : "disabled"); | 1136 | pr_info("%s\n", audit_default ? |
1137 | "enabled (after initialization)" : "disabled (until reboot)"); | ||
1043 | 1138 | ||
1044 | if (audit_initialized == AUDIT_INITIALIZED) { | 1139 | return 1; |
1045 | audit_enabled = audit_default; | 1140 | } |
1046 | audit_ever_enabled |= !!audit_default; | 1141 | __setup("audit=", audit_enable); |
1047 | } else if (audit_initialized == AUDIT_UNINITIALIZED) { | 1142 | |
1048 | printk(" (after initialization)"); | 1143 | /* Process kernel command-line parameter at boot time. |
1049 | } else { | 1144 | * audit_backlog_limit=<n> */ |
1050 | printk(" (until reboot)"); | 1145 | static int __init audit_backlog_limit_set(char *str) |
1146 | { | ||
1147 | u32 audit_backlog_limit_arg; | ||
1148 | |||
1149 | pr_info("audit_backlog_limit: "); | ||
1150 | if (kstrtouint(str, 0, &audit_backlog_limit_arg)) { | ||
1151 | pr_cont("using default of %u, unable to parse %s\n", | ||
1152 | audit_backlog_limit, str); | ||
1153 | return 1; | ||
1051 | } | 1154 | } |
1052 | printk("\n"); | 1155 | |
1156 | audit_backlog_limit = audit_backlog_limit_arg; | ||
1157 | pr_cont("%d\n", audit_backlog_limit); | ||
1053 | 1158 | ||
1054 | return 1; | 1159 | return 1; |
1055 | } | 1160 | } |
1056 | 1161 | __setup("audit_backlog_limit=", audit_backlog_limit_set); | |
1057 | __setup("audit=", audit_enable); | ||
1058 | 1162 | ||
1059 | static void audit_buffer_free(struct audit_buffer *ab) | 1163 | static void audit_buffer_free(struct audit_buffer *ab) |
1060 | { | 1164 | { |
@@ -1165,18 +1269,20 @@ static inline void audit_get_stamp(struct audit_context *ctx, | |||
1165 | /* | 1269 | /* |
1166 | * Wait for auditd to drain the queue a little | 1270 | * Wait for auditd to drain the queue a little |
1167 | */ | 1271 | */ |
1168 | static void wait_for_auditd(unsigned long sleep_time) | 1272 | static long wait_for_auditd(long sleep_time) |
1169 | { | 1273 | { |
1170 | DECLARE_WAITQUEUE(wait, current); | 1274 | DECLARE_WAITQUEUE(wait, current); |
1171 | set_current_state(TASK_UNINTERRUPTIBLE); | 1275 | set_current_state(TASK_UNINTERRUPTIBLE); |
1172 | add_wait_queue(&audit_backlog_wait, &wait); | 1276 | add_wait_queue_exclusive(&audit_backlog_wait, &wait); |
1173 | 1277 | ||
1174 | if (audit_backlog_limit && | 1278 | if (audit_backlog_limit && |
1175 | skb_queue_len(&audit_skb_queue) > audit_backlog_limit) | 1279 | skb_queue_len(&audit_skb_queue) > audit_backlog_limit) |
1176 | schedule_timeout(sleep_time); | 1280 | sleep_time = schedule_timeout(sleep_time); |
1177 | 1281 | ||
1178 | __set_current_state(TASK_RUNNING); | 1282 | __set_current_state(TASK_RUNNING); |
1179 | remove_wait_queue(&audit_backlog_wait, &wait); | 1283 | remove_wait_queue(&audit_backlog_wait, &wait); |
1284 | |||
1285 | return sleep_time; | ||
1180 | } | 1286 | } |
1181 | 1287 | ||
1182 | /** | 1288 | /** |
@@ -1200,7 +1306,8 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, | |||
1200 | struct audit_buffer *ab = NULL; | 1306 | struct audit_buffer *ab = NULL; |
1201 | struct timespec t; | 1307 | struct timespec t; |
1202 | unsigned int uninitialized_var(serial); | 1308 | unsigned int uninitialized_var(serial); |
1203 | int reserve; | 1309 | int reserve = 5; /* Allow atomic callers to go up to five |
1310 | entries over the normal backlog limit */ | ||
1204 | unsigned long timeout_start = jiffies; | 1311 | unsigned long timeout_start = jiffies; |
1205 | 1312 | ||
1206 | if (audit_initialized != AUDIT_INITIALIZED) | 1313 | if (audit_initialized != AUDIT_INITIALIZED) |
@@ -1209,36 +1316,37 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, | |||
1209 | if (unlikely(audit_filter_type(type))) | 1316 | if (unlikely(audit_filter_type(type))) |
1210 | return NULL; | 1317 | return NULL; |
1211 | 1318 | ||
1212 | if (gfp_mask & __GFP_WAIT) | 1319 | if (gfp_mask & __GFP_WAIT) { |
1213 | reserve = 0; | 1320 | if (audit_pid && audit_pid == current->pid) |
1214 | else | 1321 | gfp_mask &= ~__GFP_WAIT; |
1215 | reserve = 5; /* Allow atomic callers to go up to five | 1322 | else |
1216 | entries over the normal backlog limit */ | 1323 | reserve = 0; |
1324 | } | ||
1217 | 1325 | ||
1218 | while (audit_backlog_limit | 1326 | while (audit_backlog_limit |
1219 | && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) { | 1327 | && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) { |
1220 | if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time) { | 1328 | if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time) { |
1221 | unsigned long sleep_time; | 1329 | long sleep_time; |
1222 | 1330 | ||
1223 | sleep_time = timeout_start + audit_backlog_wait_time - | 1331 | sleep_time = timeout_start + audit_backlog_wait_time - jiffies; |
1224 | jiffies; | 1332 | if (sleep_time > 0) { |
1225 | if ((long)sleep_time > 0) { | 1333 | sleep_time = wait_for_auditd(sleep_time); |
1226 | wait_for_auditd(sleep_time); | 1334 | if (sleep_time > 0) |
1227 | continue; | 1335 | continue; |
1228 | } | 1336 | } |
1229 | } | 1337 | } |
1230 | if (audit_rate_check() && printk_ratelimit()) | 1338 | if (audit_rate_check() && printk_ratelimit()) |
1231 | printk(KERN_WARNING | 1339 | pr_warn("audit_backlog=%d > audit_backlog_limit=%d\n", |
1232 | "audit: audit_backlog=%d > " | 1340 | skb_queue_len(&audit_skb_queue), |
1233 | "audit_backlog_limit=%d\n", | 1341 | audit_backlog_limit); |
1234 | skb_queue_len(&audit_skb_queue), | ||
1235 | audit_backlog_limit); | ||
1236 | audit_log_lost("backlog limit exceeded"); | 1342 | audit_log_lost("backlog limit exceeded"); |
1237 | audit_backlog_wait_time = audit_backlog_wait_overflow; | 1343 | audit_backlog_wait_time = audit_backlog_wait_overflow; |
1238 | wake_up(&audit_backlog_wait); | 1344 | wake_up(&audit_backlog_wait); |
1239 | return NULL; | 1345 | return NULL; |
1240 | } | 1346 | } |
1241 | 1347 | ||
1348 | audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME; | ||
1349 | |||
1242 | ab = audit_buffer_alloc(ctx, gfp_mask, type); | 1350 | ab = audit_buffer_alloc(ctx, gfp_mask, type); |
1243 | if (!ab) { | 1351 | if (!ab) { |
1244 | audit_log_lost("out of memory in audit_log_start"); | 1352 | audit_log_lost("out of memory in audit_log_start"); |
@@ -1356,7 +1464,6 @@ void audit_log_n_hex(struct audit_buffer *ab, const unsigned char *buf, | |||
1356 | int i, avail, new_len; | 1464 | int i, avail, new_len; |
1357 | unsigned char *ptr; | 1465 | unsigned char *ptr; |
1358 | struct sk_buff *skb; | 1466 | struct sk_buff *skb; |
1359 | static const unsigned char *hex = "0123456789ABCDEF"; | ||
1360 | 1467 | ||
1361 | if (!ab) | 1468 | if (!ab) |
1362 | return; | 1469 | return; |
@@ -1374,10 +1481,8 @@ void audit_log_n_hex(struct audit_buffer *ab, const unsigned char *buf, | |||
1374 | } | 1481 | } |
1375 | 1482 | ||
1376 | ptr = skb_tail_pointer(skb); | 1483 | ptr = skb_tail_pointer(skb); |
1377 | for (i=0; i<len; i++) { | 1484 | for (i = 0; i < len; i++) |
1378 | *ptr++ = hex[(buf[i] & 0xF0)>>4]; /* Upper nibble */ | 1485 | ptr = hex_byte_pack_upper(ptr, buf[i]); |
1379 | *ptr++ = hex[buf[i] & 0x0F]; /* Lower nibble */ | ||
1380 | } | ||
1381 | *ptr = 0; | 1486 | *ptr = 0; |
1382 | skb_put(skb, len << 1); /* new string is twice the old string */ | 1487 | skb_put(skb, len << 1); /* new string is twice the old string */ |
1383 | } | 1488 | } |
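Note: audit_log_n_hex() now uses the generic hex_byte_pack_upper() helper instead of an open-coded nibble table. A small sketch of what that does to a buffer; hex_byte_pack_upper() is an existing kernel helper, while dump_upper_hex() is a hypothetical wrapper added only for illustration:

    #include <linux/kernel.h>   /* hex_byte_pack_upper() */

    /* Sketch: render len bytes of buf as an uppercase hex string, as the
     * rewritten loop in audit_log_n_hex() does. out needs 2 * len + 1 bytes. */
    static void dump_upper_hex(char *out, const unsigned char *buf, size_t len)
    {
            size_t i;

            for (i = 0; i < len; i++)
                    out = hex_byte_pack_upper(out, buf[i]); /* two chars per byte */
            *out = '\0';
    }
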
@@ -1491,7 +1596,7 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix, | |||
1491 | 1596 | ||
1492 | void audit_log_session_info(struct audit_buffer *ab) | 1597 | void audit_log_session_info(struct audit_buffer *ab) |
1493 | { | 1598 | { |
1494 | u32 sessionid = audit_get_sessionid(current); | 1599 | unsigned int sessionid = audit_get_sessionid(current); |
1495 | uid_t auid = from_kuid(&init_user_ns, audit_get_loginuid(current)); | 1600 | uid_t auid = from_kuid(&init_user_ns, audit_get_loginuid(current)); |
1496 | 1601 | ||
1497 | audit_log_format(ab, " auid=%u ses=%u", auid, sessionid); | 1602 | audit_log_format(ab, " auid=%u ses=%u", auid, sessionid); |
@@ -1716,7 +1821,7 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) | |||
1716 | audit_log_format(ab, | 1821 | audit_log_format(ab, |
1717 | " ppid=%ld pid=%d auid=%u uid=%u gid=%u" | 1822 | " ppid=%ld pid=%d auid=%u uid=%u gid=%u" |
1718 | " euid=%u suid=%u fsuid=%u" | 1823 | " euid=%u suid=%u fsuid=%u" |
1719 | " egid=%u sgid=%u fsgid=%u ses=%u tty=%s", | 1824 | " egid=%u sgid=%u fsgid=%u tty=%s ses=%u", |
1720 | sys_getppid(), | 1825 | sys_getppid(), |
1721 | tsk->pid, | 1826 | tsk->pid, |
1722 | from_kuid(&init_user_ns, audit_get_loginuid(tsk)), | 1827 | from_kuid(&init_user_ns, audit_get_loginuid(tsk)), |
@@ -1728,7 +1833,7 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) | |||
1728 | from_kgid(&init_user_ns, cred->egid), | 1833 | from_kgid(&init_user_ns, cred->egid), |
1729 | from_kgid(&init_user_ns, cred->sgid), | 1834 | from_kgid(&init_user_ns, cred->sgid), |
1730 | from_kgid(&init_user_ns, cred->fsgid), | 1835 | from_kgid(&init_user_ns, cred->fsgid), |
1731 | audit_get_sessionid(tsk), tty); | 1836 | tty, audit_get_sessionid(tsk)); |
1732 | 1837 | ||
1733 | get_task_comm(name, tsk); | 1838 | get_task_comm(name, tsk); |
1734 | audit_log_format(ab, " comm="); | 1839 | audit_log_format(ab, " comm="); |
@@ -1739,7 +1844,8 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) | |||
1739 | if (mm->exe_file) | 1844 | if (mm->exe_file) |
1740 | audit_log_d_path(ab, " exe=", &mm->exe_file->f_path); | 1845 | audit_log_d_path(ab, " exe=", &mm->exe_file->f_path); |
1741 | up_read(&mm->mmap_sem); | 1846 | up_read(&mm->mmap_sem); |
1742 | } | 1847 | } else |
1848 | audit_log_format(ab, " exe=(null)"); | ||
1743 | audit_log_task_context(ab); | 1849 | audit_log_task_context(ab); |
1744 | } | 1850 | } |
1745 | EXPORT_SYMBOL(audit_log_task_info); | 1851 | EXPORT_SYMBOL(audit_log_task_info); |
diff --git a/kernel/audit.h b/kernel/audit.h
index b779642b29af..8df132214606 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -209,7 +209,7 @@ struct audit_context { | |||
209 | #endif | 209 | #endif |
210 | }; | 210 | }; |
211 | 211 | ||
212 | extern int audit_ever_enabled; | 212 | extern u32 audit_ever_enabled; |
213 | 213 | ||
214 | extern void audit_copy_inode(struct audit_names *name, | 214 | extern void audit_copy_inode(struct audit_names *name, |
215 | const struct dentry *dentry, | 215 | const struct dentry *dentry, |
@@ -240,18 +240,23 @@ extern int audit_uid_comparator(kuid_t left, u32 op, kuid_t right); | |||
240 | extern int audit_gid_comparator(kgid_t left, u32 op, kgid_t right); | 240 | extern int audit_gid_comparator(kgid_t left, u32 op, kgid_t right); |
241 | extern int parent_len(const char *path); | 241 | extern int parent_len(const char *path); |
242 | extern int audit_compare_dname_path(const char *dname, const char *path, int plen); | 242 | extern int audit_compare_dname_path(const char *dname, const char *path, int plen); |
243 | extern struct sk_buff * audit_make_reply(int pid, int seq, int type, | 243 | extern struct sk_buff *audit_make_reply(__u32 portid, int seq, int type, |
244 | int done, int multi, | 244 | int done, int multi, |
245 | const void *payload, int size); | 245 | const void *payload, int size); |
246 | extern void audit_panic(const char *message); | 246 | extern void audit_panic(const char *message); |
247 | 247 | ||
248 | struct audit_netlink_list { | 248 | struct audit_netlink_list { |
249 | int pid; | 249 | __u32 portid; |
250 | struct net *net; | ||
250 | struct sk_buff_head q; | 251 | struct sk_buff_head q; |
251 | }; | 252 | }; |
252 | 253 | ||
253 | int audit_send_list(void *); | 254 | int audit_send_list(void *); |
254 | 255 | ||
256 | struct audit_net { | ||
257 | struct sock *nlsk; | ||
258 | }; | ||
259 | |||
255 | extern int selinux_audit_rule_update(void); | 260 | extern int selinux_audit_rule_update(void); |
256 | 261 | ||
257 | extern struct mutex audit_filter_mutex; | 262 | extern struct mutex audit_filter_mutex; |
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 43c307dc9453..135944a7b28a 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -912,12 +912,13 @@ static void evict_chunk(struct audit_chunk *chunk) | |||
912 | } | 912 | } |
913 | 913 | ||
914 | static int audit_tree_handle_event(struct fsnotify_group *group, | 914 | static int audit_tree_handle_event(struct fsnotify_group *group, |
915 | struct inode *to_tell, | ||
915 | struct fsnotify_mark *inode_mark, | 916 | struct fsnotify_mark *inode_mark, |
916 | struct fsnotify_mark *vfsmonut_mark, | 917 | struct fsnotify_mark *vfsmount_mark, |
917 | struct fsnotify_event *event) | 918 | u32 mask, void *data, int data_type, |
919 | const unsigned char *file_name, u32 cookie) | ||
918 | { | 920 | { |
919 | BUG(); | 921 | return 0; |
920 | return -EOPNOTSUPP; | ||
921 | } | 922 | } |
922 | 923 | ||
923 | static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify_group *group) | 924 | static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify_group *group) |
@@ -933,19 +934,8 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify | |||
933 | BUG_ON(atomic_read(&entry->refcnt) < 1); | 934 | BUG_ON(atomic_read(&entry->refcnt) < 1); |
934 | } | 935 | } |
935 | 936 | ||
936 | static bool audit_tree_send_event(struct fsnotify_group *group, struct inode *inode, | ||
937 | struct fsnotify_mark *inode_mark, | ||
938 | struct fsnotify_mark *vfsmount_mark, | ||
939 | __u32 mask, void *data, int data_type) | ||
940 | { | ||
941 | return false; | ||
942 | } | ||
943 | |||
944 | static const struct fsnotify_ops audit_tree_ops = { | 937 | static const struct fsnotify_ops audit_tree_ops = { |
945 | .handle_event = audit_tree_handle_event, | 938 | .handle_event = audit_tree_handle_event, |
946 | .should_send_event = audit_tree_send_event, | ||
947 | .free_group_priv = NULL, | ||
948 | .free_event_priv = NULL, | ||
949 | .freeing_mark = audit_tree_freeing_mark, | 939 | .freeing_mark = audit_tree_freeing_mark, |
950 | }; | 940 | }; |
951 | 941 | ||
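Note: both audit backends are moved here to the newer fsnotify API, in which should_send_event() is gone and handle_event() receives the event fields (mask, data, data_type, file name, cookie) directly instead of a struct fsnotify_event. A minimal sketch of a handler with the new signature, assuming roughly the fsnotify_ops layout shown in these hunks; example_handle_event() and example_fsnotify_ops are hypothetical names, not code from this patch:

    #include <linux/fsnotify_backend.h>

    /* Sketch of the post-conversion handle_event() shape. */
    static int example_handle_event(struct fsnotify_group *group,
                                    struct inode *to_tell,
                                    struct fsnotify_mark *inode_mark,
                                    struct fsnotify_mark *vfsmount_mark,
                                    u32 mask, void *data, int data_type,
                                    const unsigned char *file_name, u32 cookie)
    {
            return 0;       /* nothing to do; 0 reports success */
    }

    static const struct fsnotify_ops example_fsnotify_ops = {
            .handle_event = example_handle_event,
    };
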
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 22831c4d369c..70b4554d2fbe 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -465,35 +465,27 @@ void audit_remove_watch_rule(struct audit_krule *krule) | |||
465 | } | 465 | } |
466 | } | 466 | } |
467 | 467 | ||
468 | static bool audit_watch_should_send_event(struct fsnotify_group *group, struct inode *inode, | ||
469 | struct fsnotify_mark *inode_mark, | ||
470 | struct fsnotify_mark *vfsmount_mark, | ||
471 | __u32 mask, void *data, int data_type) | ||
472 | { | ||
473 | return true; | ||
474 | } | ||
475 | |||
476 | /* Update watch data in audit rules based on fsnotify events. */ | 468 | /* Update watch data in audit rules based on fsnotify events. */ |
477 | static int audit_watch_handle_event(struct fsnotify_group *group, | 469 | static int audit_watch_handle_event(struct fsnotify_group *group, |
470 | struct inode *to_tell, | ||
478 | struct fsnotify_mark *inode_mark, | 471 | struct fsnotify_mark *inode_mark, |
479 | struct fsnotify_mark *vfsmount_mark, | 472 | struct fsnotify_mark *vfsmount_mark, |
480 | struct fsnotify_event *event) | 473 | u32 mask, void *data, int data_type, |
474 | const unsigned char *dname, u32 cookie) | ||
481 | { | 475 | { |
482 | struct inode *inode; | 476 | struct inode *inode; |
483 | __u32 mask = event->mask; | ||
484 | const char *dname = event->file_name; | ||
485 | struct audit_parent *parent; | 477 | struct audit_parent *parent; |
486 | 478 | ||
487 | parent = container_of(inode_mark, struct audit_parent, mark); | 479 | parent = container_of(inode_mark, struct audit_parent, mark); |
488 | 480 | ||
489 | BUG_ON(group != audit_watch_group); | 481 | BUG_ON(group != audit_watch_group); |
490 | 482 | ||
491 | switch (event->data_type) { | 483 | switch (data_type) { |
492 | case (FSNOTIFY_EVENT_PATH): | 484 | case (FSNOTIFY_EVENT_PATH): |
493 | inode = event->path.dentry->d_inode; | 485 | inode = ((struct path *)data)->dentry->d_inode; |
494 | break; | 486 | break; |
495 | case (FSNOTIFY_EVENT_INODE): | 487 | case (FSNOTIFY_EVENT_INODE): |
496 | inode = event->inode; | 488 | inode = (struct inode *)data; |
497 | break; | 489 | break; |
498 | default: | 490 | default: |
499 | BUG(); | 491 | BUG(); |
@@ -512,11 +504,7 @@ static int audit_watch_handle_event(struct fsnotify_group *group, | |||
512 | } | 504 | } |
513 | 505 | ||
514 | static const struct fsnotify_ops audit_watch_fsnotify_ops = { | 506 | static const struct fsnotify_ops audit_watch_fsnotify_ops = { |
515 | .should_send_event = audit_watch_should_send_event, | ||
516 | .handle_event = audit_watch_handle_event, | 507 | .handle_event = audit_watch_handle_event, |
517 | .free_group_priv = NULL, | ||
518 | .freeing_mark = NULL, | ||
519 | .free_event_priv = NULL, | ||
520 | }; | 508 | }; |
521 | 509 | ||
522 | static int __init audit_watch_init(void) | 510 | static int __init audit_watch_init(void) |
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 51f3fd4c1ed3..92062fd6cc8c 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -29,6 +29,8 @@ | |||
29 | #include <linux/sched.h> | 29 | #include <linux/sched.h> |
30 | #include <linux/slab.h> | 30 | #include <linux/slab.h> |
31 | #include <linux/security.h> | 31 | #include <linux/security.h> |
32 | #include <net/net_namespace.h> | ||
33 | #include <net/sock.h> | ||
32 | #include "audit.h" | 34 | #include "audit.h" |
33 | 35 | ||
34 | /* | 36 | /* |
@@ -972,7 +974,7 @@ out: | |||
972 | } | 974 | } |
973 | 975 | ||
974 | /* List rules using struct audit_rule_data. */ | 976 | /* List rules using struct audit_rule_data. */ |
975 | static void audit_list_rules(int pid, int seq, struct sk_buff_head *q) | 977 | static void audit_list_rules(__u32 portid, int seq, struct sk_buff_head *q) |
976 | { | 978 | { |
977 | struct sk_buff *skb; | 979 | struct sk_buff *skb; |
978 | struct audit_krule *r; | 980 | struct audit_krule *r; |
@@ -987,14 +989,15 @@ static void audit_list_rules(int pid, int seq, struct sk_buff_head *q) | |||
987 | data = audit_krule_to_data(r); | 989 | data = audit_krule_to_data(r); |
988 | if (unlikely(!data)) | 990 | if (unlikely(!data)) |
989 | break; | 991 | break; |
990 | skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 0, 1, | 992 | skb = audit_make_reply(portid, seq, AUDIT_LIST_RULES, |
991 | data, sizeof(*data) + data->buflen); | 993 | 0, 1, data, |
994 | sizeof(*data) + data->buflen); | ||
992 | if (skb) | 995 | if (skb) |
993 | skb_queue_tail(q, skb); | 996 | skb_queue_tail(q, skb); |
994 | kfree(data); | 997 | kfree(data); |
995 | } | 998 | } |
996 | } | 999 | } |
997 | skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0); | 1000 | skb = audit_make_reply(portid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0); |
998 | if (skb) | 1001 | if (skb) |
999 | skb_queue_tail(q, skb); | 1002 | skb_queue_tail(q, skb); |
1000 | } | 1003 | } |
@@ -1004,7 +1007,7 @@ static void audit_log_rule_change(char *action, struct audit_krule *rule, int re | |||
1004 | { | 1007 | { |
1005 | struct audit_buffer *ab; | 1008 | struct audit_buffer *ab; |
1006 | uid_t loginuid = from_kuid(&init_user_ns, audit_get_loginuid(current)); | 1009 | uid_t loginuid = from_kuid(&init_user_ns, audit_get_loginuid(current)); |
1007 | u32 sessionid = audit_get_sessionid(current); | 1010 | unsigned int sessionid = audit_get_sessionid(current); |
1008 | 1011 | ||
1009 | if (!audit_enabled) | 1012 | if (!audit_enabled) |
1010 | return; | 1013 | return; |
@@ -1022,45 +1025,20 @@ static void audit_log_rule_change(char *action, struct audit_krule *rule, int re | |||
1022 | } | 1025 | } |
1023 | 1026 | ||
1024 | /** | 1027 | /** |
1025 | * audit_receive_filter - apply all rules to the specified message type | 1028 | * audit_rule_change - apply all rules to the specified message type |
1026 | * @type: audit message type | 1029 | * @type: audit message type |
1027 | * @pid: target pid for netlink audit messages | 1030 | * @portid: target port id for netlink audit messages |
1028 | * @seq: netlink audit message sequence (serial) number | 1031 | * @seq: netlink audit message sequence (serial) number |
1029 | * @data: payload data | 1032 | * @data: payload data |
1030 | * @datasz: size of payload data | 1033 | * @datasz: size of payload data |
1031 | */ | 1034 | */ |
1032 | int audit_receive_filter(int type, int pid, int seq, void *data, size_t datasz) | 1035 | int audit_rule_change(int type, __u32 portid, int seq, void *data, |
1036 | size_t datasz) | ||
1033 | { | 1037 | { |
1034 | struct task_struct *tsk; | ||
1035 | struct audit_netlink_list *dest; | ||
1036 | int err = 0; | 1038 | int err = 0; |
1037 | struct audit_entry *entry; | 1039 | struct audit_entry *entry; |
1038 | 1040 | ||
1039 | switch (type) { | 1041 | switch (type) { |
1040 | case AUDIT_LIST_RULES: | ||
1041 | /* We can't just spew out the rules here because we might fill | ||
1042 | * the available socket buffer space and deadlock waiting for | ||
1043 | * auditctl to read from it... which isn't ever going to | ||
1044 | * happen if we're actually running in the context of auditctl | ||
1045 | * trying to _send_ the stuff */ | ||
1046 | |||
1047 | dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL); | ||
1048 | if (!dest) | ||
1049 | return -ENOMEM; | ||
1050 | dest->pid = pid; | ||
1051 | skb_queue_head_init(&dest->q); | ||
1052 | |||
1053 | mutex_lock(&audit_filter_mutex); | ||
1054 | audit_list_rules(pid, seq, &dest->q); | ||
1055 | mutex_unlock(&audit_filter_mutex); | ||
1056 | |||
1057 | tsk = kthread_run(audit_send_list, dest, "audit_send_list"); | ||
1058 | if (IS_ERR(tsk)) { | ||
1059 | skb_queue_purge(&dest->q); | ||
1060 | kfree(dest); | ||
1061 | err = PTR_ERR(tsk); | ||
1062 | } | ||
1063 | break; | ||
1064 | case AUDIT_ADD_RULE: | 1042 | case AUDIT_ADD_RULE: |
1065 | entry = audit_data_to_entry(data, datasz); | 1043 | entry = audit_data_to_entry(data, datasz); |
1066 | if (IS_ERR(entry)) | 1044 | if (IS_ERR(entry)) |
@@ -1087,6 +1065,46 @@ int audit_receive_filter(int type, int pid, int seq, void *data, size_t datasz) | |||
1087 | return err; | 1065 | return err; |
1088 | } | 1066 | } |
1089 | 1067 | ||
1068 | /** | ||
1069 | * audit_list_rules_send - list the audit rules | ||
1070 | * @request_skb: skb of request we are replying to (used to target the reply) | ||
1071 | * @seq: netlink audit message sequence (serial) number | ||
1072 | */ | ||
1073 | int audit_list_rules_send(struct sk_buff *request_skb, int seq) | ||
1074 | { | ||
1075 | u32 portid = NETLINK_CB(request_skb).portid; | ||
1076 | struct net *net = sock_net(NETLINK_CB(request_skb).sk); | ||
1077 | struct task_struct *tsk; | ||
1078 | struct audit_netlink_list *dest; | ||
1079 | int err = 0; | ||
1080 | |||
1081 | /* We can't just spew out the rules here because we might fill | ||
1082 | * the available socket buffer space and deadlock waiting for | ||
1083 | * auditctl to read from it... which isn't ever going to | ||
1084 | * happen if we're actually running in the context of auditctl | ||
1085 | * trying to _send_ the stuff */ | ||
1086 | |||
1087 | dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL); | ||
1088 | if (!dest) | ||
1089 | return -ENOMEM; | ||
1090 | dest->net = get_net(net); | ||
1091 | dest->portid = portid; | ||
1092 | skb_queue_head_init(&dest->q); | ||
1093 | |||
1094 | mutex_lock(&audit_filter_mutex); | ||
1095 | audit_list_rules(portid, seq, &dest->q); | ||
1096 | mutex_unlock(&audit_filter_mutex); | ||
1097 | |||
1098 | tsk = kthread_run(audit_send_list, dest, "audit_send_list"); | ||
1099 | if (IS_ERR(tsk)) { | ||
1100 | skb_queue_purge(&dest->q); | ||
1101 | kfree(dest); | ||
1102 | err = PTR_ERR(tsk); | ||
1103 | } | ||
1104 | |||
1105 | return err; | ||
1106 | } | ||
1107 | |||
1090 | int audit_comparator(u32 left, u32 op, u32 right) | 1108 | int audit_comparator(u32 left, u32 op, u32 right) |
1091 | { | 1109 | { |
1092 | switch (op) { | 1110 | switch (op) { |
@@ -1276,19 +1294,22 @@ int audit_filter_user(int type) | |||
1276 | { | 1294 | { |
1277 | enum audit_state state = AUDIT_DISABLED; | 1295 | enum audit_state state = AUDIT_DISABLED; |
1278 | struct audit_entry *e; | 1296 | struct audit_entry *e; |
1279 | int ret = 1; | 1297 | int rc, ret; |
1298 | |||
1299 | ret = 1; /* Audit by default */ | ||
1280 | 1300 | ||
1281 | rcu_read_lock(); | 1301 | rcu_read_lock(); |
1282 | list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) { | 1302 | list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) { |
1283 | if (audit_filter_user_rules(&e->rule, type, &state)) { | 1303 | rc = audit_filter_user_rules(&e->rule, type, &state); |
1284 | if (state == AUDIT_DISABLED) | 1304 | if (rc) { |
1305 | if (rc > 0 && state == AUDIT_DISABLED) | ||
1285 | ret = 0; | 1306 | ret = 0; |
1286 | break; | 1307 | break; |
1287 | } | 1308 | } |
1288 | } | 1309 | } |
1289 | rcu_read_unlock(); | 1310 | rcu_read_unlock(); |
1290 | 1311 | ||
1291 | return ret; /* Audit by default */ | 1312 | return ret; |
1292 | } | 1313 | } |
1293 | 1314 | ||
1294 | int audit_filter_type(int type) | 1315 | int audit_filter_type(int type) |
diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 90594c9f7552..7aef2f4b6c64 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c | |||
@@ -1719,7 +1719,7 @@ void audit_putname(struct filename *name) | |||
1719 | struct audit_context *context = current->audit_context; | 1719 | struct audit_context *context = current->audit_context; |
1720 | 1720 | ||
1721 | BUG_ON(!context); | 1721 | BUG_ON(!context); |
1722 | if (!context->in_syscall) { | 1722 | if (!name->aname || !context->in_syscall) { |
1723 | #if AUDIT_DEBUG == 2 | 1723 | #if AUDIT_DEBUG == 2 |
1724 | printk(KERN_ERR "%s:%d(:%d): final_putname(%p)\n", | 1724 | printk(KERN_ERR "%s:%d(:%d): final_putname(%p)\n", |
1725 | __FILE__, __LINE__, context->serial, name); | 1725 | __FILE__, __LINE__, context->serial, name); |
@@ -1969,18 +1969,24 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, | |||
1969 | int rc) | 1969 | int rc) |
1970 | { | 1970 | { |
1971 | struct audit_buffer *ab; | 1971 | struct audit_buffer *ab; |
1972 | uid_t uid, ologinuid, nloginuid; | 1972 | uid_t uid, oldloginuid, loginuid; |
1973 | |||
1974 | if (!audit_enabled) | ||
1975 | return; | ||
1973 | 1976 | ||
1974 | uid = from_kuid(&init_user_ns, task_uid(current)); | 1977 | uid = from_kuid(&init_user_ns, task_uid(current)); |
1975 | ologinuid = from_kuid(&init_user_ns, koldloginuid); | 1978 | oldloginuid = from_kuid(&init_user_ns, koldloginuid); |
1976 | nloginuid = from_kuid(&init_user_ns, kloginuid), | 1979 | loginuid = from_kuid(&init_user_ns, kloginuid), |
1977 | 1980 | ||
1978 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); | 1981 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); |
1979 | if (!ab) | 1982 | if (!ab) |
1980 | return; | 1983 | return; |
1981 | audit_log_format(ab, "pid=%d uid=%u old auid=%u new auid=%u old " | 1984 | audit_log_format(ab, "pid=%d uid=%u" |
1982 | "ses=%u new ses=%u res=%d", current->pid, uid, ologinuid, | 1985 | " old-auid=%u new-auid=%u old-ses=%u new-ses=%u" |
1983 | nloginuid, oldsessionid, sessionid, !rc); | 1986 | " res=%d", |
1987 | current->pid, uid, | ||
1988 | oldloginuid, loginuid, oldsessionid, sessionid, | ||
1989 | !rc); | ||
1984 | audit_log_end(ab); | 1990 | audit_log_end(ab); |
1985 | } | 1991 | } |
1986 | 1992 | ||
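With the reworked format string, the LOGIN record now uses uniformly hyphenated field names instead of the ambiguous "old auid=... new auid=...". As a rough illustration only (timestamp, serial number and field values are invented), a record produced by this path would read:

type=LOGIN msg=audit(1389277764.123:42): pid=1234 uid=0 old-auid=4294967295 new-auid=1000 old-ses=4294967295 new-ses=1 res=1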
@@ -2008,7 +2014,7 @@ int audit_set_loginuid(kuid_t loginuid) | |||
2008 | 2014 | ||
2009 | /* are we setting or clearing? */ | 2015 | /* are we setting or clearing? */ |
2010 | if (uid_valid(loginuid)) | 2016 | if (uid_valid(loginuid)) |
2011 | sessionid = atomic_inc_return(&session_id); | 2017 | sessionid = (unsigned int)atomic_inc_return(&session_id); |
2012 | 2018 | ||
2013 | task->sessionid = sessionid; | 2019 | task->sessionid = sessionid; |
2014 | task->loginuid = loginuid; | 2020 | task->loginuid = loginuid; |
@@ -2321,18 +2327,16 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm, | |||
2321 | 2327 | ||
2322 | /** | 2328 | /** |
2323 | * __audit_log_capset - store information about the arguments to the capset syscall | 2329 | * __audit_log_capset - store information about the arguments to the capset syscall |
2324 | * @pid: target pid of the capset call | ||
2325 | * @new: the new credentials | 2330 | * @new: the new credentials |
2326 | * @old: the old (current) credentials | 2331 | * @old: the old (current) credentials |
2327 | * | 2332 | * |
2328 | * Record the arguments userspace sent to sys_capset for later printing by the | 2333 | * Record the arguments userspace sent to sys_capset for later printing by the |
2329 | * audit system if applicable | 2334 | * audit system if applicable |
2330 | */ | 2335 | */ |
2331 | void __audit_log_capset(pid_t pid, | 2336 | void __audit_log_capset(const struct cred *new, const struct cred *old) |
2332 | const struct cred *new, const struct cred *old) | ||
2333 | { | 2337 | { |
2334 | struct audit_context *context = current->audit_context; | 2338 | struct audit_context *context = current->audit_context; |
2335 | context->capset.pid = pid; | 2339 | context->capset.pid = task_pid_nr(current); |
2336 | context->capset.cap.effective = new->cap_effective; | 2340 | context->capset.cap.effective = new->cap_effective; |
2337 | context->capset.cap.inheritable = new->cap_effective; | 2341 | context->capset.cap.inheritable = new->cap_effective; |
2338 | context->capset.cap.permitted = new->cap_permitted; | 2342 | context->capset.cap.permitted = new->cap_permitted; |
@@ -2352,6 +2356,7 @@ static void audit_log_task(struct audit_buffer *ab) | |||
2352 | kuid_t auid, uid; | 2356 | kuid_t auid, uid; |
2353 | kgid_t gid; | 2357 | kgid_t gid; |
2354 | unsigned int sessionid; | 2358 | unsigned int sessionid; |
2359 | struct mm_struct *mm = current->mm; | ||
2355 | 2360 | ||
2356 | auid = audit_get_loginuid(current); | 2361 | auid = audit_get_loginuid(current); |
2357 | sessionid = audit_get_sessionid(current); | 2362 | sessionid = audit_get_sessionid(current); |
@@ -2365,15 +2370,15 @@ static void audit_log_task(struct audit_buffer *ab) | |||
2365 | audit_log_task_context(ab); | 2370 | audit_log_task_context(ab); |
2366 | audit_log_format(ab, " pid=%d comm=", current->pid); | 2371 | audit_log_format(ab, " pid=%d comm=", current->pid); |
2367 | audit_log_untrustedstring(ab, current->comm); | 2372 | audit_log_untrustedstring(ab, current->comm); |
2373 | if (mm) { | ||
2374 | down_read(&mm->mmap_sem); | ||
2375 | if (mm->exe_file) | ||
2376 | audit_log_d_path(ab, " exe=", &mm->exe_file->f_path); | ||
2377 | up_read(&mm->mmap_sem); | ||
2378 | } else | ||
2379 | audit_log_format(ab, " exe=(null)"); | ||
2368 | } | 2380 | } |
2369 | 2381 | ||
2370 | static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr) | ||
2371 | { | ||
2372 | audit_log_task(ab); | ||
2373 | audit_log_format(ab, " reason="); | ||
2374 | audit_log_string(ab, reason); | ||
2375 | audit_log_format(ab, " sig=%ld", signr); | ||
2376 | } | ||
2377 | /** | 2382 | /** |
2378 | * audit_core_dumps - record information about processes that end abnormally | 2383 | * audit_core_dumps - record information about processes that end abnormally |
2379 | * @signr: signal value | 2384 | * @signr: signal value |
@@ -2394,7 +2399,8 @@ void audit_core_dumps(long signr) | |||
2394 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); | 2399 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); |
2395 | if (unlikely(!ab)) | 2400 | if (unlikely(!ab)) |
2396 | return; | 2401 | return; |
2397 | audit_log_abend(ab, "memory violation", signr); | 2402 | audit_log_task(ab); |
2403 | audit_log_format(ab, " sig=%ld", signr); | ||
2398 | audit_log_end(ab); | 2404 | audit_log_end(ab); |
2399 | } | 2405 | } |
2400 | 2406 | ||
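After this change an abnormal-end record is just the common task fields plus the raw signal number, with the redundant reason= string dropped. Illustrative only (the leading fields, their order and all values are approximations of what audit_log_task() emits):

type=ANOM_ABEND msg=audit(1389277764.456:43): auid=1000 uid=1000 gid=1000 ses=3 pid=2303 comm="crasher" exe="/usr/bin/crasher" sig=11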
diff --git a/kernel/capability.c b/kernel/capability.c index 4e66bf9275b0..34019c57888d 100644 --- a/kernel/capability.c +++ b/kernel/capability.c | |||
@@ -277,7 +277,7 @@ SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data) | |||
277 | if (ret < 0) | 277 | if (ret < 0) |
278 | goto error; | 278 | goto error; |
279 | 279 | ||
280 | audit_log_capset(pid, new, current_cred()); | 280 | audit_log_capset(new, current_cred()); |
281 | 281 | ||
282 | return commit_creds(new); | 282 | return commit_creds(new); |
283 | 283 | ||
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index bc1dcabe9217..0c753ddd223b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
@@ -41,7 +41,6 @@ | |||
41 | #include <linux/rcupdate.h> | 41 | #include <linux/rcupdate.h> |
42 | #include <linux/sched.h> | 42 | #include <linux/sched.h> |
43 | #include <linux/backing-dev.h> | 43 | #include <linux/backing-dev.h> |
44 | #include <linux/seq_file.h> | ||
45 | #include <linux/slab.h> | 44 | #include <linux/slab.h> |
46 | #include <linux/magic.h> | 45 | #include <linux/magic.h> |
47 | #include <linux/spinlock.h> | 46 | #include <linux/spinlock.h> |
@@ -56,15 +55,20 @@ | |||
56 | #include <linux/pid_namespace.h> | 55 | #include <linux/pid_namespace.h> |
57 | #include <linux/idr.h> | 56 | #include <linux/idr.h> |
58 | #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ | 57 | #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ |
59 | #include <linux/eventfd.h> | ||
60 | #include <linux/poll.h> | ||
61 | #include <linux/flex_array.h> /* used in cgroup_attach_task */ | 58 | #include <linux/flex_array.h> /* used in cgroup_attach_task */ |
62 | #include <linux/kthread.h> | 59 | #include <linux/kthread.h> |
63 | #include <linux/file.h> | ||
64 | 60 | ||
65 | #include <linux/atomic.h> | 61 | #include <linux/atomic.h> |
66 | 62 | ||
67 | /* | 63 | /* |
64 | * pidlists linger the following amount before being destroyed. The goal | ||
65 | * is avoiding frequent destruction in the middle of consecutive read calls. | ||
66 | * Expiring in the middle is a performance problem, not a correctness one. | ||
67 | * 1 sec should be enough. | ||
68 | */ | ||
69 | #define CGROUP_PIDLIST_DESTROY_DELAY HZ | ||
70 | |||
71 | /* | ||
68 | * cgroup_mutex is the master lock. Any modification to cgroup or its | 72 | * cgroup_mutex is the master lock. Any modification to cgroup or its |
69 | * hierarchy must be performed while holding it. | 73 | * hierarchy must be performed while holding it. |
70 | * | 74 | * |
@@ -89,6 +93,19 @@ static DEFINE_MUTEX(cgroup_mutex); | |||
89 | 93 | ||
90 | static DEFINE_MUTEX(cgroup_root_mutex); | 94 | static DEFINE_MUTEX(cgroup_root_mutex); |
91 | 95 | ||
96 | #define cgroup_assert_mutex_or_rcu_locked() \ | ||
97 | rcu_lockdep_assert(rcu_read_lock_held() || \ | ||
98 | lockdep_is_held(&cgroup_mutex), \ | ||
99 | "cgroup_mutex or RCU read lock required"); | ||
100 | |||
101 | #ifdef CONFIG_LOCKDEP | ||
102 | #define cgroup_assert_mutex_or_root_locked() \ | ||
103 | WARN_ON_ONCE(debug_locks && (!lockdep_is_held(&cgroup_mutex) && \ | ||
104 | !lockdep_is_held(&cgroup_root_mutex))) | ||
105 | #else | ||
106 | #define cgroup_assert_mutex_or_root_locked() do { } while (0) | ||
107 | #endif | ||
108 | |||
92 | /* | 109 | /* |
93 | * cgroup destruction makes heavy use of work items and there can be a lot | 110 | * cgroup destruction makes heavy use of work items and there can be a lot |
94 | * of concurrent destructions. Use a separate workqueue so that cgroup | 111 | * of concurrent destructions. Use a separate workqueue so that cgroup |
@@ -98,6 +115,12 @@ static DEFINE_MUTEX(cgroup_root_mutex); | |||
98 | static struct workqueue_struct *cgroup_destroy_wq; | 115 | static struct workqueue_struct *cgroup_destroy_wq; |
99 | 116 | ||
100 | /* | 117 | /* |
118 | * pidlist destructions need to be flushed on cgroup destruction. Use a | ||
119 | * separate workqueue as flush domain. | ||
120 | */ | ||
121 | static struct workqueue_struct *cgroup_pidlist_destroy_wq; | ||
122 | |||
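cgroup_pidlist_destroy_wq is flushed by cgroup_pidlist_destroy_all() further down, so it must be a dedicated workqueue rather than system_wq. A sketch of how such a queue would be created (the init-call site and the max_active value are assumptions, not taken from this patch):

/* dedicated workqueue so cgroup_pidlist_destroy_all() can flush it safely */
static int __init cgroup_pidlist_wq_init(void)
{
	cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy", 0, 1);
	BUG_ON(!cgroup_pidlist_destroy_wq);
	return 0;
}
core_initcall(cgroup_pidlist_wq_init);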
123 | /* | ||
101 | * Generate an array of cgroup subsystem pointers. At boot time, this is | 124 | * Generate an array of cgroup subsystem pointers. At boot time, this is |
102 | * populated with the built in subsystems, and modular subsystems are | 125 | * populated with the built in subsystems, and modular subsystems are |
103 | * registered after that. The mutable section of this array is protected by | 126 | * registered after that. The mutable section of this array is protected by |
@@ -119,49 +142,6 @@ static struct cgroupfs_root cgroup_dummy_root; | |||
119 | /* dummy_top is a shorthand for the dummy hierarchy's top cgroup */ | 142 | /* dummy_top is a shorthand for the dummy hierarchy's top cgroup */ |
120 | static struct cgroup * const cgroup_dummy_top = &cgroup_dummy_root.top_cgroup; | 143 | static struct cgroup * const cgroup_dummy_top = &cgroup_dummy_root.top_cgroup; |
121 | 144 | ||
122 | /* | ||
123 | * cgroupfs file entry, pointed to from leaf dentry->d_fsdata. | ||
124 | */ | ||
125 | struct cfent { | ||
126 | struct list_head node; | ||
127 | struct dentry *dentry; | ||
128 | struct cftype *type; | ||
129 | struct cgroup_subsys_state *css; | ||
130 | |||
131 | /* file xattrs */ | ||
132 | struct simple_xattrs xattrs; | ||
133 | }; | ||
134 | |||
135 | /* | ||
136 | * cgroup_event represents events which userspace want to receive. | ||
137 | */ | ||
138 | struct cgroup_event { | ||
139 | /* | ||
140 | * css which the event belongs to. | ||
141 | */ | ||
142 | struct cgroup_subsys_state *css; | ||
143 | /* | ||
144 | * Control file which the event associated. | ||
145 | */ | ||
146 | struct cftype *cft; | ||
147 | /* | ||
148 | * eventfd to signal userspace about the event. | ||
149 | */ | ||
150 | struct eventfd_ctx *eventfd; | ||
151 | /* | ||
152 | * Each of these stored in a list by the cgroup. | ||
153 | */ | ||
154 | struct list_head list; | ||
155 | /* | ||
156 | * All fields below needed to unregister event when | ||
157 | * userspace closes eventfd. | ||
158 | */ | ||
159 | poll_table pt; | ||
160 | wait_queue_head_t *wqh; | ||
161 | wait_queue_t wait; | ||
162 | struct work_struct remove; | ||
163 | }; | ||
164 | |||
165 | /* The list of hierarchy roots */ | 145 | /* The list of hierarchy roots */ |
166 | 146 | ||
167 | static LIST_HEAD(cgroup_roots); | 147 | static LIST_HEAD(cgroup_roots); |
@@ -200,6 +180,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp); | |||
200 | static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], | 180 | static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], |
201 | bool is_add); | 181 | bool is_add); |
202 | static int cgroup_file_release(struct inode *inode, struct file *file); | 182 | static int cgroup_file_release(struct inode *inode, struct file *file); |
183 | static void cgroup_pidlist_destroy_all(struct cgroup *cgrp); | ||
203 | 184 | ||
204 | /** | 185 | /** |
205 | * cgroup_css - obtain a cgroup's css for the specified subsystem | 186 | * cgroup_css - obtain a cgroup's css for the specified subsystem |
@@ -262,16 +243,32 @@ static int notify_on_release(const struct cgroup *cgrp) | |||
262 | } | 243 | } |
263 | 244 | ||
264 | /** | 245 | /** |
246 | * for_each_css - iterate all css's of a cgroup | ||
247 | * @css: the iteration cursor | ||
248 | * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end | ||
249 | * @cgrp: the target cgroup to iterate css's of | ||
250 | * | ||
251 | * Should be called under cgroup_mutex. | ||
252 | */ | ||
253 | #define for_each_css(css, ssid, cgrp) \ | ||
254 | for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ | ||
255 | if (!((css) = rcu_dereference_check( \ | ||
256 | (cgrp)->subsys[(ssid)], \ | ||
257 | lockdep_is_held(&cgroup_mutex)))) { } \ | ||
258 | else | ||
259 | |||
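Because the trailing else in for_each_css() skips empty subsystem slots, callers can treat it as a plain loop over the css's that are actually attached. A hypothetical helper (not part of this patch) illustrating the intended usage under cgroup_mutex:

static int nr_attached_css(struct cgroup *cgrp)
{
	struct cgroup_subsys_state *css;
	int ssid, n = 0;

	lockdep_assert_held(&cgroup_mutex);	/* for_each_css() must run under cgroup_mutex */
	for_each_css(css, ssid, cgrp)
		n++;
	return n;
}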
260 | /** | ||
265 | * for_each_subsys - iterate all loaded cgroup subsystems | 261 | * for_each_subsys - iterate all loaded cgroup subsystems |
266 | * @ss: the iteration cursor | 262 | * @ss: the iteration cursor |
267 | * @i: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end | 263 | * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end |
268 | * | 264 | * |
269 | * Should be called under cgroup_mutex. | 265 | * Iterates through all loaded subsystems. Should be called under |
266 | * cgroup_mutex or cgroup_root_mutex. | ||
270 | */ | 267 | */ |
271 | #define for_each_subsys(ss, i) \ | 268 | #define for_each_subsys(ss, ssid) \ |
272 | for ((i) = 0; (i) < CGROUP_SUBSYS_COUNT; (i)++) \ | 269 | for (({ cgroup_assert_mutex_or_root_locked(); (ssid) = 0; }); \ |
273 | if (({ lockdep_assert_held(&cgroup_mutex); \ | 270 | (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ |
274 | !((ss) = cgroup_subsys[i]); })) { } \ | 271 | if (!((ss) = cgroup_subsys[(ssid)])) { } \ |
275 | else | 272 | else |
276 | 273 | ||
277 | /** | 274 | /** |
@@ -286,10 +283,6 @@ static int notify_on_release(const struct cgroup *cgrp) | |||
286 | for ((i) = 0; (i) < CGROUP_BUILTIN_SUBSYS_COUNT && \ | 283 | for ((i) = 0; (i) < CGROUP_BUILTIN_SUBSYS_COUNT && \ |
287 | (((ss) = cgroup_subsys[i]) || true); (i)++) | 284 | (((ss) = cgroup_subsys[i]) || true); (i)++) |
288 | 285 | ||
289 | /* iterate each subsystem attached to a hierarchy */ | ||
290 | #define for_each_root_subsys(root, ss) \ | ||
291 | list_for_each_entry((ss), &(root)->subsys_list, sibling) | ||
292 | |||
293 | /* iterate across the active hierarchies */ | 286 | /* iterate across the active hierarchies */ |
294 | #define for_each_active_root(root) \ | 287 | #define for_each_active_root(root) \ |
295 | list_for_each_entry((root), &cgroup_roots, root_list) | 288 | list_for_each_entry((root), &cgroup_roots, root_list) |
@@ -863,11 +856,7 @@ static void cgroup_free_fn(struct work_struct *work) | |||
863 | */ | 856 | */ |
864 | deactivate_super(cgrp->root->sb); | 857 | deactivate_super(cgrp->root->sb); |
865 | 858 | ||
866 | /* | 859 | cgroup_pidlist_destroy_all(cgrp); |
867 | * if we're getting rid of the cgroup, refcount should ensure | ||
868 | * that there are no pidlists left. | ||
869 | */ | ||
870 | BUG_ON(!list_empty(&cgrp->pidlists)); | ||
871 | 860 | ||
872 | simple_xattrs_free(&cgrp->xattrs); | 861 | simple_xattrs_free(&cgrp->xattrs); |
873 | 862 | ||
@@ -897,7 +886,9 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) | |||
897 | * per-subsystem and moved to css->id so that lookups are | 886 | * per-subsystem and moved to css->id so that lookups are |
898 | * successful until the target css is released. | 887 | * successful until the target css is released. |
899 | */ | 888 | */ |
889 | mutex_lock(&cgroup_mutex); | ||
900 | idr_remove(&cgrp->root->cgroup_idr, cgrp->id); | 890 | idr_remove(&cgrp->root->cgroup_idr, cgrp->id); |
891 | mutex_unlock(&cgroup_mutex); | ||
901 | cgrp->id = -1; | 892 | cgrp->id = -1; |
902 | 893 | ||
903 | call_rcu(&cgrp->rcu_head, cgroup_free_rcu); | 894 | call_rcu(&cgrp->rcu_head, cgroup_free_rcu); |
@@ -1050,7 +1041,6 @@ static int rebind_subsystems(struct cgroupfs_root *root, | |||
1050 | cgroup_css(cgroup_dummy_top, ss)); | 1041 | cgroup_css(cgroup_dummy_top, ss)); |
1051 | cgroup_css(cgrp, ss)->cgroup = cgrp; | 1042 | cgroup_css(cgrp, ss)->cgroup = cgrp; |
1052 | 1043 | ||
1053 | list_move(&ss->sibling, &root->subsys_list); | ||
1054 | ss->root = root; | 1044 | ss->root = root; |
1055 | if (ss->bind) | 1045 | if (ss->bind) |
1056 | ss->bind(cgroup_css(cgrp, ss)); | 1046 | ss->bind(cgroup_css(cgrp, ss)); |
@@ -1069,7 +1059,6 @@ static int rebind_subsystems(struct cgroupfs_root *root, | |||
1069 | RCU_INIT_POINTER(cgrp->subsys[i], NULL); | 1059 | RCU_INIT_POINTER(cgrp->subsys[i], NULL); |
1070 | 1060 | ||
1071 | cgroup_subsys[i]->root = &cgroup_dummy_root; | 1061 | cgroup_subsys[i]->root = &cgroup_dummy_root; |
1072 | list_move(&ss->sibling, &cgroup_dummy_root.subsys_list); | ||
1073 | 1062 | ||
1074 | /* subsystem is now free - drop reference on module */ | 1063 | /* subsystem is now free - drop reference on module */ |
1075 | module_put(ss->module); | 1064 | module_put(ss->module); |
@@ -1096,10 +1085,12 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) | |||
1096 | { | 1085 | { |
1097 | struct cgroupfs_root *root = dentry->d_sb->s_fs_info; | 1086 | struct cgroupfs_root *root = dentry->d_sb->s_fs_info; |
1098 | struct cgroup_subsys *ss; | 1087 | struct cgroup_subsys *ss; |
1088 | int ssid; | ||
1099 | 1089 | ||
1100 | mutex_lock(&cgroup_root_mutex); | 1090 | mutex_lock(&cgroup_root_mutex); |
1101 | for_each_root_subsys(root, ss) | 1091 | for_each_subsys(ss, ssid) |
1102 | seq_printf(seq, ",%s", ss->name); | 1092 | if (root->subsys_mask & (1 << ssid)) |
1093 | seq_printf(seq, ",%s", ss->name); | ||
1103 | if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) | 1094 | if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) |
1104 | seq_puts(seq, ",sane_behavior"); | 1095 | seq_puts(seq, ",sane_behavior"); |
1105 | if (root->flags & CGRP_ROOT_NOPREFIX) | 1096 | if (root->flags & CGRP_ROOT_NOPREFIX) |
@@ -1362,8 +1353,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) | |||
1362 | INIT_LIST_HEAD(&cgrp->pidlists); | 1353 | INIT_LIST_HEAD(&cgrp->pidlists); |
1363 | mutex_init(&cgrp->pidlist_mutex); | 1354 | mutex_init(&cgrp->pidlist_mutex); |
1364 | cgrp->dummy_css.cgroup = cgrp; | 1355 | cgrp->dummy_css.cgroup = cgrp; |
1365 | INIT_LIST_HEAD(&cgrp->event_list); | ||
1366 | spin_lock_init(&cgrp->event_list_lock); | ||
1367 | simple_xattrs_init(&cgrp->xattrs); | 1356 | simple_xattrs_init(&cgrp->xattrs); |
1368 | } | 1357 | } |
1369 | 1358 | ||
@@ -1371,7 +1360,6 @@ static void init_cgroup_root(struct cgroupfs_root *root) | |||
1371 | { | 1360 | { |
1372 | struct cgroup *cgrp = &root->top_cgroup; | 1361 | struct cgroup *cgrp = &root->top_cgroup; |
1373 | 1362 | ||
1374 | INIT_LIST_HEAD(&root->subsys_list); | ||
1375 | INIT_LIST_HEAD(&root->root_list); | 1363 | INIT_LIST_HEAD(&root->root_list); |
1376 | root->number_of_cgroups = 1; | 1364 | root->number_of_cgroups = 1; |
1377 | cgrp->root = root; | 1365 | cgrp->root = root; |
@@ -1580,10 +1568,10 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1580 | mutex_lock(&cgroup_mutex); | 1568 | mutex_lock(&cgroup_mutex); |
1581 | mutex_lock(&cgroup_root_mutex); | 1569 | mutex_lock(&cgroup_root_mutex); |
1582 | 1570 | ||
1583 | root_cgrp->id = idr_alloc(&root->cgroup_idr, root_cgrp, | 1571 | ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL); |
1584 | 0, 1, GFP_KERNEL); | 1572 | if (ret < 0) |
1585 | if (root_cgrp->id < 0) | ||
1586 | goto unlock_drop; | 1573 | goto unlock_drop; |
1574 | root_cgrp->id = ret; | ||
1587 | 1575 | ||
1588 | /* Check for name clashes with existing mounts */ | 1576 | /* Check for name clashes with existing mounts */ |
1589 | ret = -EBUSY; | 1577 | ret = -EBUSY; |
@@ -1693,7 +1681,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1693 | return ERR_PTR(ret); | 1681 | return ERR_PTR(ret); |
1694 | } | 1682 | } |
1695 | 1683 | ||
1696 | static void cgroup_kill_sb(struct super_block *sb) { | 1684 | static void cgroup_kill_sb(struct super_block *sb) |
1685 | { | ||
1697 | struct cgroupfs_root *root = sb->s_fs_info; | 1686 | struct cgroupfs_root *root = sb->s_fs_info; |
1698 | struct cgroup *cgrp = &root->top_cgroup; | 1687 | struct cgroup *cgrp = &root->top_cgroup; |
1699 | struct cgrp_cset_link *link, *tmp_link; | 1688 | struct cgrp_cset_link *link, *tmp_link; |
@@ -1976,8 +1965,8 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, | |||
1976 | bool threadgroup) | 1965 | bool threadgroup) |
1977 | { | 1966 | { |
1978 | int retval, i, group_size; | 1967 | int retval, i, group_size; |
1979 | struct cgroup_subsys *ss, *failed_ss = NULL; | ||
1980 | struct cgroupfs_root *root = cgrp->root; | 1968 | struct cgroupfs_root *root = cgrp->root; |
1969 | struct cgroup_subsys_state *css, *failed_css = NULL; | ||
1981 | /* threadgroup list cursor and array */ | 1970 | /* threadgroup list cursor and array */ |
1982 | struct task_struct *leader = tsk; | 1971 | struct task_struct *leader = tsk; |
1983 | struct task_and_cgroup *tc; | 1972 | struct task_and_cgroup *tc; |
@@ -2050,13 +2039,11 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, | |||
2050 | /* | 2039 | /* |
2051 | * step 1: check that we can legitimately attach to the cgroup. | 2040 | * step 1: check that we can legitimately attach to the cgroup. |
2052 | */ | 2041 | */ |
2053 | for_each_root_subsys(root, ss) { | 2042 | for_each_css(css, i, cgrp) { |
2054 | struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); | 2043 | if (css->ss->can_attach) { |
2055 | 2044 | retval = css->ss->can_attach(css, &tset); | |
2056 | if (ss->can_attach) { | ||
2057 | retval = ss->can_attach(css, &tset); | ||
2058 | if (retval) { | 2045 | if (retval) { |
2059 | failed_ss = ss; | 2046 | failed_css = css; |
2060 | goto out_cancel_attach; | 2047 | goto out_cancel_attach; |
2061 | } | 2048 | } |
2062 | } | 2049 | } |
@@ -2092,12 +2079,9 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, | |||
2092 | /* | 2079 | /* |
2093 | * step 4: do subsystem attach callbacks. | 2080 | * step 4: do subsystem attach callbacks. |
2094 | */ | 2081 | */ |
2095 | for_each_root_subsys(root, ss) { | 2082 | for_each_css(css, i, cgrp) |
2096 | struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); | 2083 | if (css->ss->attach) |
2097 | 2084 | css->ss->attach(css, &tset); | |
2098 | if (ss->attach) | ||
2099 | ss->attach(css, &tset); | ||
2100 | } | ||
2101 | 2085 | ||
2102 | /* | 2086 | /* |
2103 | * step 5: success! and cleanup | 2087 | * step 5: success! and cleanup |
@@ -2114,13 +2098,11 @@ out_put_css_set_refs: | |||
2114 | } | 2098 | } |
2115 | out_cancel_attach: | 2099 | out_cancel_attach: |
2116 | if (retval) { | 2100 | if (retval) { |
2117 | for_each_root_subsys(root, ss) { | 2101 | for_each_css(css, i, cgrp) { |
2118 | struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); | 2102 | if (css == failed_css) |
2119 | |||
2120 | if (ss == failed_ss) | ||
2121 | break; | 2103 | break; |
2122 | if (ss->cancel_attach) | 2104 | if (css->ss->cancel_attach) |
2123 | ss->cancel_attach(css, &tset); | 2105 | css->ss->cancel_attach(css, &tset); |
2124 | } | 2106 | } |
2125 | } | 2107 | } |
2126 | out_free_group_list: | 2108 | out_free_group_list: |
@@ -2148,7 +2130,7 @@ retry_find_task: | |||
2148 | tsk = find_task_by_vpid(pid); | 2130 | tsk = find_task_by_vpid(pid); |
2149 | if (!tsk) { | 2131 | if (!tsk) { |
2150 | rcu_read_unlock(); | 2132 | rcu_read_unlock(); |
2151 | ret= -ESRCH; | 2133 | ret = -ESRCH; |
2152 | goto out_unlock_cgroup; | 2134 | goto out_unlock_cgroup; |
2153 | } | 2135 | } |
2154 | /* | 2136 | /* |
@@ -2260,10 +2242,9 @@ static int cgroup_release_agent_write(struct cgroup_subsys_state *css, | |||
2260 | return 0; | 2242 | return 0; |
2261 | } | 2243 | } |
2262 | 2244 | ||
2263 | static int cgroup_release_agent_show(struct cgroup_subsys_state *css, | 2245 | static int cgroup_release_agent_show(struct seq_file *seq, void *v) |
2264 | struct cftype *cft, struct seq_file *seq) | ||
2265 | { | 2246 | { |
2266 | struct cgroup *cgrp = css->cgroup; | 2247 | struct cgroup *cgrp = seq_css(seq)->cgroup; |
2267 | 2248 | ||
2268 | if (!cgroup_lock_live_group(cgrp)) | 2249 | if (!cgroup_lock_live_group(cgrp)) |
2269 | return -ENODEV; | 2250 | return -ENODEV; |
@@ -2273,174 +2254,129 @@ static int cgroup_release_agent_show(struct cgroup_subsys_state *css, | |||
2273 | return 0; | 2254 | return 0; |
2274 | } | 2255 | } |
2275 | 2256 | ||
2276 | static int cgroup_sane_behavior_show(struct cgroup_subsys_state *css, | 2257 | static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) |
2277 | struct cftype *cft, struct seq_file *seq) | ||
2278 | { | 2258 | { |
2279 | seq_printf(seq, "%d\n", cgroup_sane_behavior(css->cgroup)); | 2259 | struct cgroup *cgrp = seq_css(seq)->cgroup; |
2260 | |||
2261 | seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp)); | ||
2280 | return 0; | 2262 | return 0; |
2281 | } | 2263 | } |
2282 | 2264 | ||
2283 | /* A buffer size big enough for numbers or short strings */ | 2265 | /* A buffer size big enough for numbers or short strings */ |
2284 | #define CGROUP_LOCAL_BUFFER_SIZE 64 | 2266 | #define CGROUP_LOCAL_BUFFER_SIZE 64 |
2285 | 2267 | ||
2286 | static ssize_t cgroup_write_X64(struct cgroup_subsys_state *css, | 2268 | static ssize_t cgroup_file_write(struct file *file, const char __user *userbuf, |
2287 | struct cftype *cft, struct file *file, | 2269 | size_t nbytes, loff_t *ppos) |
2288 | const char __user *userbuf, size_t nbytes, | ||
2289 | loff_t *unused_ppos) | ||
2290 | { | 2270 | { |
2291 | char buffer[CGROUP_LOCAL_BUFFER_SIZE]; | 2271 | struct cfent *cfe = __d_cfe(file->f_dentry); |
2292 | int retval = 0; | 2272 | struct cftype *cft = __d_cft(file->f_dentry); |
2293 | char *end; | 2273 | struct cgroup_subsys_state *css = cfe->css; |
2274 | size_t max_bytes = cft->max_write_len ?: CGROUP_LOCAL_BUFFER_SIZE - 1; | ||
2275 | char *buf; | ||
2276 | int ret; | ||
2294 | 2277 | ||
2295 | if (!nbytes) | 2278 | if (nbytes >= max_bytes) |
2296 | return -EINVAL; | ||
2297 | if (nbytes >= sizeof(buffer)) | ||
2298 | return -E2BIG; | 2279 | return -E2BIG; |
2299 | if (copy_from_user(buffer, userbuf, nbytes)) | ||
2300 | return -EFAULT; | ||
2301 | 2280 | ||
2302 | buffer[nbytes] = 0; /* nul-terminate */ | 2281 | buf = kmalloc(nbytes + 1, GFP_KERNEL); |
2303 | if (cft->write_u64) { | 2282 | if (!buf) |
2304 | u64 val = simple_strtoull(strstrip(buffer), &end, 0); | 2283 | return -ENOMEM; |
2305 | if (*end) | 2284 | |
2306 | return -EINVAL; | 2285 | if (copy_from_user(buf, userbuf, nbytes)) { |
2307 | retval = cft->write_u64(css, cft, val); | 2286 | ret = -EFAULT; |
2287 | goto out_free; | ||
2288 | } | ||
2289 | |||
2290 | buf[nbytes] = '\0'; | ||
2291 | |||
2292 | if (cft->write_string) { | ||
2293 | ret = cft->write_string(css, cft, strstrip(buf)); | ||
2294 | } else if (cft->write_u64) { | ||
2295 | unsigned long long v; | ||
2296 | ret = kstrtoull(buf, 0, &v); | ||
2297 | if (!ret) | ||
2298 | ret = cft->write_u64(css, cft, v); | ||
2299 | } else if (cft->write_s64) { | ||
2300 | long long v; | ||
2301 | ret = kstrtoll(buf, 0, &v); | ||
2302 | if (!ret) | ||
2303 | ret = cft->write_s64(css, cft, v); | ||
2304 | } else if (cft->trigger) { | ||
2305 | ret = cft->trigger(css, (unsigned int)cft->private); | ||
2308 | } else { | 2306 | } else { |
2309 | s64 val = simple_strtoll(strstrip(buffer), &end, 0); | 2307 | ret = -EINVAL; |
2310 | if (*end) | ||
2311 | return -EINVAL; | ||
2312 | retval = cft->write_s64(css, cft, val); | ||
2313 | } | 2308 | } |
2314 | if (!retval) | 2309 | out_free: |
2315 | retval = nbytes; | 2310 | kfree(buf); |
2316 | return retval; | 2311 | return ret ?: nbytes; |
2317 | } | 2312 | } |
2318 | 2313 | ||
2319 | static ssize_t cgroup_write_string(struct cgroup_subsys_state *css, | 2314 | /* |
2320 | struct cftype *cft, struct file *file, | 2315 | * seqfile ops/methods for returning structured data. Currently just |
2321 | const char __user *userbuf, size_t nbytes, | 2316 | * supports string->u64 maps, but can be extended in future. |
2322 | loff_t *unused_ppos) | 2317 | */ |
2318 | |||
2319 | static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos) | ||
2323 | { | 2320 | { |
2324 | char local_buffer[CGROUP_LOCAL_BUFFER_SIZE]; | 2321 | struct cftype *cft = seq_cft(seq); |
2325 | int retval = 0; | ||
2326 | size_t max_bytes = cft->max_write_len; | ||
2327 | char *buffer = local_buffer; | ||
2328 | 2322 | ||
2329 | if (!max_bytes) | 2323 | if (cft->seq_start) { |
2330 | max_bytes = sizeof(local_buffer) - 1; | 2324 | return cft->seq_start(seq, ppos); |
2331 | if (nbytes >= max_bytes) | 2325 | } else { |
2332 | return -E2BIG; | 2326 | /* |
2333 | /* Allocate a dynamic buffer if we need one */ | 2327 | * The same behavior and code as single_open(). Returns |
2334 | if (nbytes >= sizeof(local_buffer)) { | 2328 | * !NULL if pos is at the beginning; otherwise, NULL. |
2335 | buffer = kmalloc(nbytes + 1, GFP_KERNEL); | 2329 | */ |
2336 | if (buffer == NULL) | 2330 | return NULL + !*ppos; |
2337 | return -ENOMEM; | ||
2338 | } | ||
2339 | if (nbytes && copy_from_user(buffer, userbuf, nbytes)) { | ||
2340 | retval = -EFAULT; | ||
2341 | goto out; | ||
2342 | } | 2331 | } |
2343 | |||
2344 | buffer[nbytes] = 0; /* nul-terminate */ | ||
2345 | retval = cft->write_string(css, cft, strstrip(buffer)); | ||
2346 | if (!retval) | ||
2347 | retval = nbytes; | ||
2348 | out: | ||
2349 | if (buffer != local_buffer) | ||
2350 | kfree(buffer); | ||
2351 | return retval; | ||
2352 | } | 2332 | } |
2353 | 2333 | ||
2354 | static ssize_t cgroup_file_write(struct file *file, const char __user *buf, | 2334 | static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos) |
2355 | size_t nbytes, loff_t *ppos) | ||
2356 | { | 2335 | { |
2357 | struct cfent *cfe = __d_cfe(file->f_dentry); | 2336 | struct cftype *cft = seq_cft(seq); |
2358 | struct cftype *cft = __d_cft(file->f_dentry); | ||
2359 | struct cgroup_subsys_state *css = cfe->css; | ||
2360 | 2337 | ||
2361 | if (cft->write) | 2338 | if (cft->seq_next) { |
2362 | return cft->write(css, cft, file, buf, nbytes, ppos); | 2339 | return cft->seq_next(seq, v, ppos); |
2363 | if (cft->write_u64 || cft->write_s64) | 2340 | } else { |
2364 | return cgroup_write_X64(css, cft, file, buf, nbytes, ppos); | 2341 | /* |
2365 | if (cft->write_string) | 2342 | * The same behavior and code as single_open(), always |
2366 | return cgroup_write_string(css, cft, file, buf, nbytes, ppos); | 2343 | * terminate after the initial read. |
2367 | if (cft->trigger) { | 2344 | */ |
2368 | int ret = cft->trigger(css, (unsigned int)cft->private); | 2345 | ++*ppos; |
2369 | return ret ? ret : nbytes; | 2346 | return NULL; |
2370 | } | 2347 | } |
2371 | return -EINVAL; | ||
2372 | } | 2348 | } |
2373 | 2349 | ||
2374 | static ssize_t cgroup_read_u64(struct cgroup_subsys_state *css, | 2350 | static void cgroup_seqfile_stop(struct seq_file *seq, void *v) |
2375 | struct cftype *cft, struct file *file, | ||
2376 | char __user *buf, size_t nbytes, loff_t *ppos) | ||
2377 | { | 2351 | { |
2378 | char tmp[CGROUP_LOCAL_BUFFER_SIZE]; | 2352 | struct cftype *cft = seq_cft(seq); |
2379 | u64 val = cft->read_u64(css, cft); | ||
2380 | int len = sprintf(tmp, "%llu\n", (unsigned long long) val); | ||
2381 | 2353 | ||
2382 | return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); | 2354 | if (cft->seq_stop) |
2355 | cft->seq_stop(seq, v); | ||
2383 | } | 2356 | } |
2384 | 2357 | ||
2385 | static ssize_t cgroup_read_s64(struct cgroup_subsys_state *css, | 2358 | static int cgroup_seqfile_show(struct seq_file *m, void *arg) |
2386 | struct cftype *cft, struct file *file, | ||
2387 | char __user *buf, size_t nbytes, loff_t *ppos) | ||
2388 | { | 2359 | { |
2389 | char tmp[CGROUP_LOCAL_BUFFER_SIZE]; | 2360 | struct cftype *cft = seq_cft(m); |
2390 | s64 val = cft->read_s64(css, cft); | 2361 | struct cgroup_subsys_state *css = seq_css(m); |
2391 | int len = sprintf(tmp, "%lld\n", (long long) val); | ||
2392 | |||
2393 | return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); | ||
2394 | } | ||
2395 | 2362 | ||
2396 | static ssize_t cgroup_file_read(struct file *file, char __user *buf, | 2363 | if (cft->seq_show) |
2397 | size_t nbytes, loff_t *ppos) | 2364 | return cft->seq_show(m, arg); |
2398 | { | ||
2399 | struct cfent *cfe = __d_cfe(file->f_dentry); | ||
2400 | struct cftype *cft = __d_cft(file->f_dentry); | ||
2401 | struct cgroup_subsys_state *css = cfe->css; | ||
2402 | 2365 | ||
2403 | if (cft->read) | ||
2404 | return cft->read(css, cft, file, buf, nbytes, ppos); | ||
2405 | if (cft->read_u64) | 2366 | if (cft->read_u64) |
2406 | return cgroup_read_u64(css, cft, file, buf, nbytes, ppos); | 2367 | seq_printf(m, "%llu\n", cft->read_u64(css, cft)); |
2407 | if (cft->read_s64) | 2368 | else if (cft->read_s64) |
2408 | return cgroup_read_s64(css, cft, file, buf, nbytes, ppos); | 2369 | seq_printf(m, "%lld\n", cft->read_s64(css, cft)); |
2409 | return -EINVAL; | 2370 | else |
2410 | } | 2371 | return -EINVAL; |
2411 | 2372 | return 0; | |
2412 | /* | ||
2413 | * seqfile ops/methods for returning structured data. Currently just | ||
2414 | * supports string->u64 maps, but can be extended in future. | ||
2415 | */ | ||
2416 | |||
2417 | static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value) | ||
2418 | { | ||
2419 | struct seq_file *sf = cb->state; | ||
2420 | return seq_printf(sf, "%s %llu\n", key, (unsigned long long)value); | ||
2421 | } | ||
2422 | |||
2423 | static int cgroup_seqfile_show(struct seq_file *m, void *arg) | ||
2424 | { | ||
2425 | struct cfent *cfe = m->private; | ||
2426 | struct cftype *cft = cfe->type; | ||
2427 | struct cgroup_subsys_state *css = cfe->css; | ||
2428 | |||
2429 | if (cft->read_map) { | ||
2430 | struct cgroup_map_cb cb = { | ||
2431 | .fill = cgroup_map_add, | ||
2432 | .state = m, | ||
2433 | }; | ||
2434 | return cft->read_map(css, cft, &cb); | ||
2435 | } | ||
2436 | return cft->read_seq_string(css, cft, m); | ||
2437 | } | 2373 | } |
2438 | 2374 | ||
2439 | static const struct file_operations cgroup_seqfile_operations = { | 2375 | static struct seq_operations cgroup_seq_operations = { |
2440 | .read = seq_read, | 2376 | .start = cgroup_seqfile_start, |
2441 | .write = cgroup_file_write, | 2377 | .next = cgroup_seqfile_next, |
2442 | .llseek = seq_lseek, | 2378 | .stop = cgroup_seqfile_stop, |
2443 | .release = cgroup_file_release, | 2379 | .show = cgroup_seqfile_show, |
2444 | }; | 2380 | }; |
2445 | 2381 | ||
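Taken together, the rewritten cgroup_file_write() and the seqfile plumbing above mean a controller only supplies typed callbacks; the core handles all parsing and formatting. A hypothetical cftype table (names and the limit value are invented for illustration) exercising the write_u64 and seq_show paths:

static u64 demo_limit;

static u64 demo_limit_read_u64(struct cgroup_subsys_state *css,
			       struct cftype *cft)
{
	return demo_limit;
}

static int demo_limit_write_u64(struct cgroup_subsys_state *css,
				struct cftype *cft, u64 val)
{
	if (val > 100)
		return -EINVAL;	/* error is propagated back to the writer */
	demo_limit = val;
	return 0;		/* cgroup_file_write() then returns nbytes */
}

static int demo_stat_seq_show(struct seq_file *sf, void *v)
{
	/* seq_css()/seq_cft() resolve the css and cftype for seq_show callbacks */
	seq_printf(sf, "limit %llu\n", (unsigned long long)demo_limit);
	return 0;
}

static struct cftype demo_files[] = {
	{
		.name = "demo.limit",
		.read_u64 = demo_limit_read_u64,
		.write_u64 = demo_limit_write_u64,
	},
	{
		.name = "demo.stat",
		.seq_show = demo_stat_seq_show,
	},
	{ }	/* terminate */
};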
2446 | static int cgroup_file_open(struct inode *inode, struct file *file) | 2382 | static int cgroup_file_open(struct inode *inode, struct file *file) |
@@ -2449,6 +2385,7 @@ static int cgroup_file_open(struct inode *inode, struct file *file) | |||
2449 | struct cftype *cft = __d_cft(file->f_dentry); | 2385 | struct cftype *cft = __d_cft(file->f_dentry); |
2450 | struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent); | 2386 | struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent); |
2451 | struct cgroup_subsys_state *css; | 2387 | struct cgroup_subsys_state *css; |
2388 | struct cgroup_open_file *of; | ||
2452 | int err; | 2389 | int err; |
2453 | 2390 | ||
2454 | err = generic_file_open(inode, file); | 2391 | err = generic_file_open(inode, file); |
@@ -2478,32 +2415,26 @@ static int cgroup_file_open(struct inode *inode, struct file *file) | |||
2478 | WARN_ON_ONCE(cfe->css && cfe->css != css); | 2415 | WARN_ON_ONCE(cfe->css && cfe->css != css); |
2479 | cfe->css = css; | 2416 | cfe->css = css; |
2480 | 2417 | ||
2481 | if (cft->read_map || cft->read_seq_string) { | 2418 | of = __seq_open_private(file, &cgroup_seq_operations, |
2482 | file->f_op = &cgroup_seqfile_operations; | 2419 | sizeof(struct cgroup_open_file)); |
2483 | err = single_open(file, cgroup_seqfile_show, cfe); | 2420 | if (of) { |
2484 | } else if (cft->open) { | 2421 | of->cfe = cfe; |
2485 | err = cft->open(inode, file); | 2422 | return 0; |
2486 | } | 2423 | } |
2487 | 2424 | ||
2488 | if (css->ss && err) | 2425 | if (css->ss) |
2489 | css_put(css); | 2426 | css_put(css); |
2490 | return err; | 2427 | return -ENOMEM; |
2491 | } | 2428 | } |
2492 | 2429 | ||
2493 | static int cgroup_file_release(struct inode *inode, struct file *file) | 2430 | static int cgroup_file_release(struct inode *inode, struct file *file) |
2494 | { | 2431 | { |
2495 | struct cfent *cfe = __d_cfe(file->f_dentry); | 2432 | struct cfent *cfe = __d_cfe(file->f_dentry); |
2496 | struct cftype *cft = __d_cft(file->f_dentry); | ||
2497 | struct cgroup_subsys_state *css = cfe->css; | 2433 | struct cgroup_subsys_state *css = cfe->css; |
2498 | int ret = 0; | ||
2499 | 2434 | ||
2500 | if (cft->release) | ||
2501 | ret = cft->release(inode, file); | ||
2502 | if (css->ss) | 2435 | if (css->ss) |
2503 | css_put(css); | 2436 | css_put(css); |
2504 | if (file->f_op == &cgroup_seqfile_operations) | 2437 | return seq_release_private(inode, file); |
2505 | single_release(inode, file); | ||
2506 | return ret; | ||
2507 | } | 2438 | } |
2508 | 2439 | ||
2509 | /* | 2440 | /* |
@@ -2614,7 +2545,7 @@ static ssize_t cgroup_listxattr(struct dentry *dentry, char *buf, size_t size) | |||
2614 | } | 2545 | } |
2615 | 2546 | ||
2616 | static const struct file_operations cgroup_file_operations = { | 2547 | static const struct file_operations cgroup_file_operations = { |
2617 | .read = cgroup_file_read, | 2548 | .read = seq_read, |
2618 | .write = cgroup_file_write, | 2549 | .write = cgroup_file_write, |
2619 | .llseek = generic_file_llseek, | 2550 | .llseek = generic_file_llseek, |
2620 | .open = cgroup_file_open, | 2551 | .open = cgroup_file_open, |
@@ -2639,16 +2570,6 @@ static const struct inode_operations cgroup_dir_inode_operations = { | |||
2639 | .removexattr = cgroup_removexattr, | 2570 | .removexattr = cgroup_removexattr, |
2640 | }; | 2571 | }; |
2641 | 2572 | ||
2642 | /* | ||
2643 | * Check if a file is a control file | ||
2644 | */ | ||
2645 | static inline struct cftype *__file_cft(struct file *file) | ||
2646 | { | ||
2647 | if (file_inode(file)->i_fop != &cgroup_file_operations) | ||
2648 | return ERR_PTR(-EINVAL); | ||
2649 | return __d_cft(file->f_dentry); | ||
2650 | } | ||
2651 | |||
2652 | static int cgroup_create_file(struct dentry *dentry, umode_t mode, | 2573 | static int cgroup_create_file(struct dentry *dentry, umode_t mode, |
2653 | struct super_block *sb) | 2574 | struct super_block *sb) |
2654 | { | 2575 | { |
@@ -2706,12 +2627,11 @@ static umode_t cgroup_file_mode(const struct cftype *cft) | |||
2706 | if (cft->mode) | 2627 | if (cft->mode) |
2707 | return cft->mode; | 2628 | return cft->mode; |
2708 | 2629 | ||
2709 | if (cft->read || cft->read_u64 || cft->read_s64 || | 2630 | if (cft->read_u64 || cft->read_s64 || cft->seq_show) |
2710 | cft->read_map || cft->read_seq_string) | ||
2711 | mode |= S_IRUGO; | 2631 | mode |= S_IRUGO; |
2712 | 2632 | ||
2713 | if (cft->write || cft->write_u64 || cft->write_s64 || | 2633 | if (cft->write_u64 || cft->write_s64 || cft->write_string || |
2714 | cft->write_string || cft->trigger) | 2634 | cft->trigger) |
2715 | mode |= S_IWUSR; | 2635 | mode |= S_IWUSR; |
2716 | 2636 | ||
2717 | return mode; | 2637 | return mode; |
@@ -2845,10 +2765,7 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) | |||
2845 | */ | 2765 | */ |
2846 | update_before = cgroup_serial_nr_next; | 2766 | update_before = cgroup_serial_nr_next; |
2847 | 2767 | ||
2848 | mutex_unlock(&cgroup_mutex); | ||
2849 | |||
2850 | /* add/rm files for all cgroups created before */ | 2768 | /* add/rm files for all cgroups created before */ |
2851 | rcu_read_lock(); | ||
2852 | css_for_each_descendant_pre(css, cgroup_css(root, ss)) { | 2769 | css_for_each_descendant_pre(css, cgroup_css(root, ss)) { |
2853 | struct cgroup *cgrp = css->cgroup; | 2770 | struct cgroup *cgrp = css->cgroup; |
2854 | 2771 | ||
@@ -2857,23 +2774,19 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) | |||
2857 | 2774 | ||
2858 | inode = cgrp->dentry->d_inode; | 2775 | inode = cgrp->dentry->d_inode; |
2859 | dget(cgrp->dentry); | 2776 | dget(cgrp->dentry); |
2860 | rcu_read_unlock(); | ||
2861 | |||
2862 | dput(prev); | 2777 | dput(prev); |
2863 | prev = cgrp->dentry; | 2778 | prev = cgrp->dentry; |
2864 | 2779 | ||
2780 | mutex_unlock(&cgroup_mutex); | ||
2865 | mutex_lock(&inode->i_mutex); | 2781 | mutex_lock(&inode->i_mutex); |
2866 | mutex_lock(&cgroup_mutex); | 2782 | mutex_lock(&cgroup_mutex); |
2867 | if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp)) | 2783 | if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp)) |
2868 | ret = cgroup_addrm_files(cgrp, cfts, is_add); | 2784 | ret = cgroup_addrm_files(cgrp, cfts, is_add); |
2869 | mutex_unlock(&cgroup_mutex); | ||
2870 | mutex_unlock(&inode->i_mutex); | 2785 | mutex_unlock(&inode->i_mutex); |
2871 | |||
2872 | rcu_read_lock(); | ||
2873 | if (ret) | 2786 | if (ret) |
2874 | break; | 2787 | break; |
2875 | } | 2788 | } |
2876 | rcu_read_unlock(); | 2789 | mutex_unlock(&cgroup_mutex); |
2877 | dput(prev); | 2790 | dput(prev); |
2878 | deactivate_super(sb); | 2791 | deactivate_super(sb); |
2879 | return ret; | 2792 | return ret; |
@@ -2992,9 +2905,14 @@ static void cgroup_enable_task_cg_lists(void) | |||
2992 | * We should check if the process is exiting, otherwise | 2905 | * We should check if the process is exiting, otherwise |
2993 | * it will race with cgroup_exit() in that the list | 2906 | * it will race with cgroup_exit() in that the list |
2994 | * entry won't be deleted though the process has exited. | 2907 | * entry won't be deleted though the process has exited. |
2908 | * Do it while holding siglock so that we don't end up | ||
2909 | * racing against cgroup_exit(). | ||
2995 | */ | 2910 | */ |
2911 | spin_lock_irq(&p->sighand->siglock); | ||
2996 | if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list)) | 2912 | if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list)) |
2997 | list_add(&p->cg_list, &task_css_set(p)->tasks); | 2913 | list_add(&p->cg_list, &task_css_set(p)->tasks); |
2914 | spin_unlock_irq(&p->sighand->siglock); | ||
2915 | |||
2998 | task_unlock(p); | 2916 | task_unlock(p); |
2999 | } while_each_thread(g, p); | 2917 | } while_each_thread(g, p); |
3000 | read_unlock(&tasklist_lock); | 2918 | read_unlock(&tasklist_lock); |
@@ -3007,9 +2925,9 @@ static void cgroup_enable_task_cg_lists(void) | |||
3007 | * @parent_css: css whose children to walk | 2925 | * @parent_css: css whose children to walk |
3008 | * | 2926 | * |
3009 | * This function returns the next child of @parent_css and should be called | 2927 | * This function returns the next child of @parent_css and should be called |
3010 | * under RCU read lock. The only requirement is that @parent_css and | 2928 | * under either cgroup_mutex or RCU read lock. The only requirement is |
3011 | * @pos_css are accessible. The next sibling is guaranteed to be returned | 2929 | * that @parent_css and @pos_css are accessible. The next sibling is |
3012 | * regardless of their states. | 2930 | * guaranteed to be returned regardless of their states. |
3013 | */ | 2931 | */ |
3014 | struct cgroup_subsys_state * | 2932 | struct cgroup_subsys_state * |
3015 | css_next_child(struct cgroup_subsys_state *pos_css, | 2933 | css_next_child(struct cgroup_subsys_state *pos_css, |
@@ -3019,7 +2937,7 @@ css_next_child(struct cgroup_subsys_state *pos_css, | |||
3019 | struct cgroup *cgrp = parent_css->cgroup; | 2937 | struct cgroup *cgrp = parent_css->cgroup; |
3020 | struct cgroup *next; | 2938 | struct cgroup *next; |
3021 | 2939 | ||
3022 | WARN_ON_ONCE(!rcu_read_lock_held()); | 2940 | cgroup_assert_mutex_or_rcu_locked(); |
3023 | 2941 | ||
3024 | /* | 2942 | /* |
3025 | * @pos could already have been removed. Once a cgroup is removed, | 2943 | * @pos could already have been removed. Once a cgroup is removed, |
@@ -3066,10 +2984,10 @@ EXPORT_SYMBOL_GPL(css_next_child); | |||
3066 | * to visit for pre-order traversal of @root's descendants. @root is | 2984 | * to visit for pre-order traversal of @root's descendants. @root is |
3067 | * included in the iteration and the first node to be visited. | 2985 | * included in the iteration and the first node to be visited. |
3068 | * | 2986 | * |
3069 | * While this function requires RCU read locking, it doesn't require the | 2987 | * While this function requires cgroup_mutex or RCU read locking, it |
3070 | * whole traversal to be contained in a single RCU critical section. This | 2988 | * doesn't require the whole traversal to be contained in a single critical |
3071 | * function will return the correct next descendant as long as both @pos | 2989 | * section. This function will return the correct next descendant as long |
3072 | * and @root are accessible and @pos is a descendant of @root. | 2990 | * as both @pos and @root are accessible and @pos is a descendant of @root. |
3073 | */ | 2991 | */ |
3074 | struct cgroup_subsys_state * | 2992 | struct cgroup_subsys_state * |
3075 | css_next_descendant_pre(struct cgroup_subsys_state *pos, | 2993 | css_next_descendant_pre(struct cgroup_subsys_state *pos, |
@@ -3077,7 +2995,7 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos, | |||
3077 | { | 2995 | { |
3078 | struct cgroup_subsys_state *next; | 2996 | struct cgroup_subsys_state *next; |
3079 | 2997 | ||
3080 | WARN_ON_ONCE(!rcu_read_lock_held()); | 2998 | cgroup_assert_mutex_or_rcu_locked(); |
3081 | 2999 | ||
3082 | /* if first iteration, visit @root */ | 3000 | /* if first iteration, visit @root */ |
3083 | if (!pos) | 3001 | if (!pos) |
@@ -3108,17 +3026,17 @@ EXPORT_SYMBOL_GPL(css_next_descendant_pre); | |||
3108 | * is returned. This can be used during pre-order traversal to skip | 3026 | * is returned. This can be used during pre-order traversal to skip |
3109 | * subtree of @pos. | 3027 | * subtree of @pos. |
3110 | * | 3028 | * |
3111 | * While this function requires RCU read locking, it doesn't require the | 3029 | * While this function requires cgroup_mutex or RCU read locking, it |
3112 | * whole traversal to be contained in a single RCU critical section. This | 3030 | * doesn't require the whole traversal to be contained in a single critical |
3113 | * function will return the correct rightmost descendant as long as @pos is | 3031 | * section. This function will return the correct rightmost descendant as |
3114 | * accessible. | 3032 | * long as @pos is accessible. |
3115 | */ | 3033 | */ |
3116 | struct cgroup_subsys_state * | 3034 | struct cgroup_subsys_state * |
3117 | css_rightmost_descendant(struct cgroup_subsys_state *pos) | 3035 | css_rightmost_descendant(struct cgroup_subsys_state *pos) |
3118 | { | 3036 | { |
3119 | struct cgroup_subsys_state *last, *tmp; | 3037 | struct cgroup_subsys_state *last, *tmp; |
3120 | 3038 | ||
3121 | WARN_ON_ONCE(!rcu_read_lock_held()); | 3039 | cgroup_assert_mutex_or_rcu_locked(); |
3122 | 3040 | ||
3123 | do { | 3041 | do { |
3124 | last = pos; | 3042 | last = pos; |
@@ -3154,10 +3072,11 @@ css_leftmost_descendant(struct cgroup_subsys_state *pos) | |||
3154 | * to visit for post-order traversal of @root's descendants. @root is | 3072 | * to visit for post-order traversal of @root's descendants. @root is |
3155 | * included in the iteration and the last node to be visited. | 3073 | * included in the iteration and the last node to be visited. |
3156 | * | 3074 | * |
3157 | * While this function requires RCU read locking, it doesn't require the | 3075 | * While this function requires cgroup_mutex or RCU read locking, it |
3158 | * whole traversal to be contained in a single RCU critical section. This | 3076 | * doesn't require the whole traversal to be contained in a single critical |
3159 | * function will return the correct next descendant as long as both @pos | 3077 | * section. This function will return the correct next descendant as long |
3160 | * and @cgroup are accessible and @pos is a descendant of @cgroup. | 3078 | * as both @pos and @cgroup are accessible and @pos is a descendant of |
3079 | * @cgroup. | ||
3161 | */ | 3080 | */ |
3162 | struct cgroup_subsys_state * | 3081 | struct cgroup_subsys_state * |
3163 | css_next_descendant_post(struct cgroup_subsys_state *pos, | 3082 | css_next_descendant_post(struct cgroup_subsys_state *pos, |
@@ -3165,7 +3084,7 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, | |||
3165 | { | 3084 | { |
3166 | struct cgroup_subsys_state *next; | 3085 | struct cgroup_subsys_state *next; |
3167 | 3086 | ||
3168 | WARN_ON_ONCE(!rcu_read_lock_held()); | 3087 | cgroup_assert_mutex_or_rcu_locked(); |
3169 | 3088 | ||
3170 | /* if first iteration, visit leftmost descendant which may be @root */ | 3089 | /* if first iteration, visit leftmost descendant which may be @root */ |
3171 | if (!pos) | 3090 | if (!pos) |
@@ -3504,14 +3423,12 @@ struct cgroup_pidlist { | |||
3504 | pid_t *list; | 3423 | pid_t *list; |
3505 | /* how many elements the above list has */ | 3424 | /* how many elements the above list has */ |
3506 | int length; | 3425 | int length; |
3507 | /* how many files are using the current array */ | ||
3508 | int use_count; | ||
3509 | /* each of these stored in a list by its cgroup */ | 3426 | /* each of these stored in a list by its cgroup */ |
3510 | struct list_head links; | 3427 | struct list_head links; |
3511 | /* pointer to the cgroup we belong to, for list removal purposes */ | 3428 | /* pointer to the cgroup we belong to, for list removal purposes */ |
3512 | struct cgroup *owner; | 3429 | struct cgroup *owner; |
3513 | /* protects the other fields */ | 3430 | /* for delayed destruction */ |
3514 | struct rw_semaphore rwsem; | 3431 | struct delayed_work destroy_dwork; |
3515 | }; | 3432 | }; |
3516 | 3433 | ||
3517 | /* | 3434 | /* |
@@ -3527,6 +3444,7 @@ static void *pidlist_allocate(int count) | |||
3527 | else | 3444 | else |
3528 | return kmalloc(count * sizeof(pid_t), GFP_KERNEL); | 3445 | return kmalloc(count * sizeof(pid_t), GFP_KERNEL); |
3529 | } | 3446 | } |
3447 | |||
3530 | static void pidlist_free(void *p) | 3448 | static void pidlist_free(void *p) |
3531 | { | 3449 | { |
3532 | if (is_vmalloc_addr(p)) | 3450 | if (is_vmalloc_addr(p)) |
@@ -3536,6 +3454,47 @@ static void pidlist_free(void *p) | |||
3536 | } | 3454 | } |
3537 | 3455 | ||
3538 | /* | 3456 | /* |
3457 | * Used to destroy all pidlists lingering waiting for destroy timer. None | ||
3458 | * should be left afterwards. | ||
3459 | */ | ||
3460 | static void cgroup_pidlist_destroy_all(struct cgroup *cgrp) | ||
3461 | { | ||
3462 | struct cgroup_pidlist *l, *tmp_l; | ||
3463 | |||
3464 | mutex_lock(&cgrp->pidlist_mutex); | ||
3465 | list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links) | ||
3466 | mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0); | ||
3467 | mutex_unlock(&cgrp->pidlist_mutex); | ||
3468 | |||
3469 | flush_workqueue(cgroup_pidlist_destroy_wq); | ||
3470 | BUG_ON(!list_empty(&cgrp->pidlists)); | ||
3471 | } | ||
3472 | |||
3473 | static void cgroup_pidlist_destroy_work_fn(struct work_struct *work) | ||
3474 | { | ||
3475 | struct delayed_work *dwork = to_delayed_work(work); | ||
3476 | struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist, | ||
3477 | destroy_dwork); | ||
3478 | struct cgroup_pidlist *tofree = NULL; | ||
3479 | |||
3480 | mutex_lock(&l->owner->pidlist_mutex); | ||
3481 | |||
3482 | /* | ||
3483 | * Destroy iff we didn't get queued again. The state won't change | ||
3484 | * as destroy_dwork can only be queued while locked. | ||
3485 | */ | ||
3486 | if (!delayed_work_pending(dwork)) { | ||
3487 | list_del(&l->links); | ||
3488 | pidlist_free(l->list); | ||
3489 | put_pid_ns(l->key.ns); | ||
3490 | tofree = l; | ||
3491 | } | ||
3492 | |||
3493 | mutex_unlock(&l->owner->pidlist_mutex); | ||
3494 | kfree(tofree); | ||
3495 | } | ||
3496 | |||
3497 | /* | ||
3539 | * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries | 3498 | * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries |
3540 | * Returns the number of unique elements. | 3499 | * Returns the number of unique elements. |
3541 | */ | 3500 | */ |
@@ -3565,52 +3524,92 @@ after: | |||
3565 | return dest; | 3524 | return dest; |
3566 | } | 3525 | } |
3567 | 3526 | ||
3527 | /* | ||
3528 | * The two pid files - task and cgroup.procs - guaranteed that the result | ||
3529 | * is sorted, which forced this whole pidlist fiasco. As pid order is | ||
3530 | * different per namespace, each namespace needs differently sorted list, | ||
3531 | * making it impossible to use, for example, single rbtree of member tasks | ||
3532 | * sorted by task pointer. As pidlists can be fairly large, allocating one | ||
3533 | * per open file is dangerous, so cgroup had to implement a shared pool of | ||
3534 | * pidlists keyed by cgroup and namespace. | ||
3535 | * | ||
3536 | * All this extra complexity was caused by the original implementation | ||
3537 | * committing to an entirely unnecessary property. In the long term, we | ||
3538 | * want to do away with it. Explicitly scramble sort order if | ||
3539 | * sane_behavior so that no such expectation exists in the new interface. | ||
3540 | * | ||
3541 | * Scrambling is done by swapping every two consecutive bits, which is | ||
3542 | * a non-identity one-to-one mapping that disturbs sort order sufficiently. | ||
3543 | */ | ||
3544 | static pid_t pid_fry(pid_t pid) | ||
3545 | { | ||
3546 | unsigned a = pid & 0x55555555; | ||
3547 | unsigned b = pid & 0xAAAAAAAA; | ||
3548 | |||
3549 | return (a << 1) | (b >> 1); | ||
3550 | } | ||
3551 | |||
3552 | static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid) | ||
3553 | { | ||
3554 | if (cgroup_sane_behavior(cgrp)) | ||
3555 | return pid_fry(pid); | ||
3556 | else | ||
3557 | return pid; | ||
3558 | } | ||
3559 | |||
3568 | static int cmppid(const void *a, const void *b) | 3560 | static int cmppid(const void *a, const void *b) |
3569 | { | 3561 | { |
3570 | return *(pid_t *)a - *(pid_t *)b; | 3562 | return *(pid_t *)a - *(pid_t *)b; |
3571 | } | 3563 | } |
3572 | 3564 | ||
3565 | static int fried_cmppid(const void *a, const void *b) | ||
3566 | { | ||
3567 | return pid_fry(*(pid_t *)a) - pid_fry(*(pid_t *)b); | ||
3568 | } | ||
3569 | |||
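Because pid_fry() just swaps adjacent bit pairs, applying it twice restores the original value, and sorting by the fried value (as fried_cmppid() does) yields a stable but deliberately non-numeric ordering. A small userspace demonstration, reimplementing pid_fry() purely for illustration:

    #include <stdio.h>
    #include <stdlib.h>

    /* userspace copy of pid_fry() for demonstration purposes */
    static int pid_fry(int pid)
    {
            unsigned a = pid & 0x55555555;      /* even-numbered bits */
            unsigned b = pid & 0xAAAAAAAA;      /* odd-numbered bits  */

            return (a << 1) | (b >> 1);         /* swap each adjacent pair */
    }

    /* same ordering fried_cmppid() imposes in the kernel */
    static int fried_cmp(const void *a, const void *b)
    {
            return pid_fry(*(const int *)a) - pid_fry(*(const int *)b);
    }

    int main(void)
    {
            int pids[] = { 1, 2, 3, 100, 101, 4096 };
            int i;

            /* pid_fry() is its own inverse: frying twice restores the pid */
            for (i = 0; i < 6; i++)
                    printf("%4d -> %4d -> %4d\n", pids[i],
                           pid_fry(pids[i]), pid_fry(pid_fry(pids[i])));

            /* pids in the scrambled order the sane_behavior interface sorts by */
            qsort(pids, 6, sizeof(int), fried_cmp);
            for (i = 0; i < 6; i++)
                    printf("%d ", pids[i]);
            printf("\n");
            return 0;
    }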
3570 | static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, | ||
3571 | enum cgroup_filetype type) | ||
3572 | { | ||
3573 | struct cgroup_pidlist *l; | ||
3574 | /* don't need task_nsproxy() if we're looking at ourself */ | ||
3575 | struct pid_namespace *ns = task_active_pid_ns(current); | ||
3576 | |||
3577 | lockdep_assert_held(&cgrp->pidlist_mutex); | ||
3578 | |||
3579 | list_for_each_entry(l, &cgrp->pidlists, links) | ||
3580 | if (l->key.type == type && l->key.ns == ns) | ||
3581 | return l; | ||
3582 | return NULL; | ||
3583 | } | ||
3584 | |||
3573 | /* | 3585 | /* |
3574 | * find the appropriate pidlist for our purpose (given procs vs tasks) | 3586 | * find the appropriate pidlist for our purpose (given procs vs tasks) |
3575 | * returns with the lock on that pidlist already held, and takes care | 3587 | * returns with the lock on that pidlist already held, and takes care |
3576 | * of the use count, or returns NULL with no locks held if we're out of | 3588 | * of the use count, or returns NULL with no locks held if we're out of |
3577 | * memory. | 3589 | * memory. |
3578 | */ | 3590 | */ |
3579 | static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, | 3591 | static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp, |
3580 | enum cgroup_filetype type) | 3592 | enum cgroup_filetype type) |
3581 | { | 3593 | { |
3582 | struct cgroup_pidlist *l; | 3594 | struct cgroup_pidlist *l; |
3583 | /* don't need task_nsproxy() if we're looking at ourself */ | ||
3584 | struct pid_namespace *ns = task_active_pid_ns(current); | ||
3585 | 3595 | ||
3586 | /* | 3596 | lockdep_assert_held(&cgrp->pidlist_mutex); |
3587 | * We can't drop the pidlist_mutex before taking the l->rwsem in case | 3597 | |
3588 | * the last ref-holder is trying to remove l from the list at the same | 3598 | l = cgroup_pidlist_find(cgrp, type); |
3589 | * time. Holding the pidlist_mutex precludes somebody taking whichever | 3599 | if (l) |
3590 | * list we find out from under us - compare release_pid_array(). | 3600 | return l; |
3591 | */ | 3601 | |
3592 | mutex_lock(&cgrp->pidlist_mutex); | ||
3593 | list_for_each_entry(l, &cgrp->pidlists, links) { | ||
3594 | if (l->key.type == type && l->key.ns == ns) { | ||
3595 | /* make sure l doesn't vanish out from under us */ | ||
3596 | down_write(&l->rwsem); | ||
3597 | mutex_unlock(&cgrp->pidlist_mutex); | ||
3598 | return l; | ||
3599 | } | ||
3600 | } | ||
3601 | /* entry not found; create a new one */ | 3602 | /* entry not found; create a new one */ |
3602 | l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); | 3603 | l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); |
3603 | if (!l) { | 3604 | if (!l) |
3604 | mutex_unlock(&cgrp->pidlist_mutex); | ||
3605 | return l; | 3605 | return l; |
3606 | } | 3606 | |
3607 | init_rwsem(&l->rwsem); | 3607 | INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn); |
3608 | down_write(&l->rwsem); | ||
3609 | l->key.type = type; | 3608 | l->key.type = type; |
3610 | l->key.ns = get_pid_ns(ns); | 3609 | /* don't need task_nsproxy() if we're looking at ourself */ |
3610 | l->key.ns = get_pid_ns(task_active_pid_ns(current)); | ||
3611 | l->owner = cgrp; | 3611 | l->owner = cgrp; |
3612 | list_add(&l->links, &cgrp->pidlists); | 3612 | list_add(&l->links, &cgrp->pidlists); |
3613 | mutex_unlock(&cgrp->pidlist_mutex); | ||
3614 | return l; | 3613 | return l; |
3615 | } | 3614 | } |
3616 | 3615 | ||
@@ -3627,6 +3626,8 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, | |||
3627 | struct task_struct *tsk; | 3626 | struct task_struct *tsk; |
3628 | struct cgroup_pidlist *l; | 3627 | struct cgroup_pidlist *l; |
3629 | 3628 | ||
3629 | lockdep_assert_held(&cgrp->pidlist_mutex); | ||
3630 | |||
3630 | /* | 3631 | /* |
3631 | * If cgroup gets more users after we read count, we won't have | 3632 | * If cgroup gets more users after we read count, we won't have |
3632 | * enough space - tough. This race is indistinguishable to the | 3633 | * enough space - tough. This race is indistinguishable to the |
@@ -3653,20 +3654,24 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, | |||
3653 | css_task_iter_end(&it); | 3654 | css_task_iter_end(&it); |
3654 | length = n; | 3655 | length = n; |
3655 | /* now sort & (if procs) strip out duplicates */ | 3656 | /* now sort & (if procs) strip out duplicates */ |
3656 | sort(array, length, sizeof(pid_t), cmppid, NULL); | 3657 | if (cgroup_sane_behavior(cgrp)) |
3658 | sort(array, length, sizeof(pid_t), fried_cmppid, NULL); | ||
3659 | else | ||
3660 | sort(array, length, sizeof(pid_t), cmppid, NULL); | ||
3657 | if (type == CGROUP_FILE_PROCS) | 3661 | if (type == CGROUP_FILE_PROCS) |
3658 | length = pidlist_uniq(array, length); | 3662 | length = pidlist_uniq(array, length); |
3659 | l = cgroup_pidlist_find(cgrp, type); | 3663 | |
3664 | l = cgroup_pidlist_find_create(cgrp, type); | ||
3660 | if (!l) { | 3665 | if (!l) { |
3666 | mutex_unlock(&cgrp->pidlist_mutex); | ||
3661 | pidlist_free(array); | 3667 | pidlist_free(array); |
3662 | return -ENOMEM; | 3668 | return -ENOMEM; |
3663 | } | 3669 | } |
3664 | /* store array, freeing old if necessary - lock already held */ | 3670 | |
3671 | /* store array, freeing old if necessary */ | ||
3665 | pidlist_free(l->list); | 3672 | pidlist_free(l->list); |
3666 | l->list = array; | 3673 | l->list = array; |
3667 | l->length = length; | 3674 | l->length = length; |
3668 | l->use_count++; | ||
3669 | up_write(&l->rwsem); | ||
3670 | *lp = l; | 3675 | *lp = l; |
3671 | return 0; | 3676 | return 0; |
3672 | } | 3677 | } |
@@ -3740,20 +3745,45 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) | |||
3740 | * after a seek to the start). Use a binary-search to find the | 3745 | * after a seek to the start). Use a binary-search to find the |
3741 | * next pid to display, if any | 3746 | * next pid to display, if any |
3742 | */ | 3747 | */ |
3743 | struct cgroup_pidlist *l = s->private; | 3748 | struct cgroup_open_file *of = s->private; |
3749 | struct cgroup *cgrp = seq_css(s)->cgroup; | ||
3750 | struct cgroup_pidlist *l; | ||
3751 | enum cgroup_filetype type = seq_cft(s)->private; | ||
3744 | int index = 0, pid = *pos; | 3752 | int index = 0, pid = *pos; |
3745 | int *iter; | 3753 | int *iter, ret; |
3754 | |||
3755 | mutex_lock(&cgrp->pidlist_mutex); | ||
3756 | |||
3757 | /* | ||
3758 | * !NULL @of->priv indicates that this isn't the first start() | ||
3759 | * after open. If the matching pidlist is around, we can use that. | ||
3760 | * Look for it. Note that @of->priv can't be used directly. It | ||
3761 | * could already have been destroyed. | ||
3762 | */ | ||
3763 | if (of->priv) | ||
3764 | of->priv = cgroup_pidlist_find(cgrp, type); | ||
3765 | |||
3766 | /* | ||
3767 | * Either this is the first start() after open or the matching | ||
3768 | * pidlist has been destroyed in between. Create a new one. | ||
3769 | */ | ||
3770 | if (!of->priv) { | ||
3771 | ret = pidlist_array_load(cgrp, type, | ||
3772 | (struct cgroup_pidlist **)&of->priv); | ||
3773 | if (ret) | ||
3774 | return ERR_PTR(ret); | ||
3775 | } | ||
3776 | l = of->priv; | ||
3746 | 3777 | ||
3747 | down_read(&l->rwsem); | ||
3748 | if (pid) { | 3778 | if (pid) { |
3749 | int end = l->length; | 3779 | int end = l->length; |
3750 | 3780 | ||
3751 | while (index < end) { | 3781 | while (index < end) { |
3752 | int mid = (index + end) / 2; | 3782 | int mid = (index + end) / 2; |
3753 | if (l->list[mid] == pid) { | 3783 | if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) { |
3754 | index = mid; | 3784 | index = mid; |
3755 | break; | 3785 | break; |
3756 | } else if (l->list[mid] <= pid) | 3786 | } else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid) |
3757 | index = mid + 1; | 3787 | index = mid + 1; |
3758 | else | 3788 | else |
3759 | end = mid; | 3789 | end = mid; |
@@ -3764,19 +3794,25 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) | |||
3764 | return NULL; | 3794 | return NULL; |
3765 | /* Update the abstract position to be the actual pid that we found */ | 3795 | /* Update the abstract position to be the actual pid that we found */ |
3766 | iter = l->list + index; | 3796 | iter = l->list + index; |
3767 | *pos = *iter; | 3797 | *pos = cgroup_pid_fry(cgrp, *iter); |
3768 | return iter; | 3798 | return iter; |
3769 | } | 3799 | } |
3770 | 3800 | ||
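The binary search in cgroup_pidlist_start() resumes iteration at the smallest index whose (possibly fried) value is not below *pos; in the kernel version the comparisons go through cgroup_pid_fry() so the search works on the scrambled values when sane_behavior is set. The same search, pulled out into a runnable userspace form with the frying step dropped for clarity:

    #include <stdio.h>

    /* locate the resume index for @pid in a sorted @list of @len entries */
    static int pidlist_resume_index(const int *list, int len, int pid)
    {
            int index = 0, end = len;

            while (index < end) {
                    int mid = (index + end) / 2;

                    if (list[mid] == pid) {
                            index = mid;
                            break;
                    } else if (list[mid] <= pid)
                            index = mid + 1;
                    else
                            end = mid;
            }
            return index;   /* == len when every entry is below @pid */
    }

    int main(void)
    {
            const int pids[] = { 3, 7, 12, 40, 41, 97 };

            /* resuming at an exact match, between entries, and past the end */
            printf("%d %d %d\n",
                   pidlist_resume_index(pids, 6, 12),   /* -> 2 */
                   pidlist_resume_index(pids, 6, 13),   /* -> 3 */
                   pidlist_resume_index(pids, 6, 99));  /* -> 6 */
            return 0;
    }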
3771 | static void cgroup_pidlist_stop(struct seq_file *s, void *v) | 3801 | static void cgroup_pidlist_stop(struct seq_file *s, void *v) |
3772 | { | 3802 | { |
3773 | struct cgroup_pidlist *l = s->private; | 3803 | struct cgroup_open_file *of = s->private; |
3774 | up_read(&l->rwsem); | 3804 | struct cgroup_pidlist *l = of->priv; |
3805 | |||
3806 | if (l) | ||
3807 | mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, | ||
3808 | CGROUP_PIDLIST_DESTROY_DELAY); | ||
3809 | mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex); | ||
3775 | } | 3810 | } |
3776 | 3811 | ||
3777 | static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) | 3812 | static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) |
3778 | { | 3813 | { |
3779 | struct cgroup_pidlist *l = s->private; | 3814 | struct cgroup_open_file *of = s->private; |
3815 | struct cgroup_pidlist *l = of->priv; | ||
3780 | pid_t *p = v; | 3816 | pid_t *p = v; |
3781 | pid_t *end = l->list + l->length; | 3817 | pid_t *end = l->list + l->length; |
3782 | /* | 3818 | /* |
@@ -3787,7 +3823,7 @@ static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) | |||
3787 | if (p >= end) { | 3823 | if (p >= end) { |
3788 | return NULL; | 3824 | return NULL; |
3789 | } else { | 3825 | } else { |
3790 | *pos = *p; | 3826 | *pos = cgroup_pid_fry(seq_css(s)->cgroup, *p); |
3791 | return p; | 3827 | return p; |
3792 | } | 3828 | } |
3793 | } | 3829 | } |
@@ -3808,92 +3844,6 @@ static const struct seq_operations cgroup_pidlist_seq_operations = { | |||
3808 | .show = cgroup_pidlist_show, | 3844 | .show = cgroup_pidlist_show, |
3809 | }; | 3845 | }; |
3810 | 3846 | ||
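The four callbacks gathered in cgroup_pidlist_seq_operations follow the usual seq_file contract: start() positions the iterator and takes any locks, show() emits one record, next() advances, and stop() drops the locks. A stripped-down userspace model of the loop that drives them, iterating over a static pid array purely for illustration (this is not the real fs/seq_file.c logic):

    #include <stdio.h>

    static int pids[] = { 3, 7, 12, 40 };
    #define NPIDS ((long)(sizeof(pids) / sizeof(pids[0])))

    static void *demo_start(long *pos)
    {
            /* a real start() would take locks and look the pidlist up */
            return *pos < NPIDS ? &pids[*pos] : NULL;
    }

    static void *demo_next(void *v, long *pos)
    {
            ++*pos;
            return *pos < NPIDS ? &pids[*pos] : NULL;
    }

    static void demo_stop(void *v)
    {
            /* a real stop() drops whatever start() took */
    }

    static int demo_show(void *v)
    {
            printf("%d\n", *(int *)v);
            return 0;
    }

    int main(void)
    {
            long pos = 0;
            void *v = demo_start(&pos);

            /* roughly how seq_file walks the callbacks during one read() */
            while (v) {
                    demo_show(v);
                    v = demo_next(v, &pos);
            }
            demo_stop(v);
            return 0;
    }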
3811 | static void cgroup_release_pid_array(struct cgroup_pidlist *l) | ||
3812 | { | ||
3813 | /* | ||
3814 | * the case where we're the last user of this particular pidlist will | ||
3815 | * have us remove it from the cgroup's list, which entails taking the | ||
3816 | * mutex. since in pidlist_find the pidlist->lock depends on cgroup-> | ||
3817 | * pidlist_mutex, we have to take pidlist_mutex first. | ||
3818 | */ | ||
3819 | mutex_lock(&l->owner->pidlist_mutex); | ||
3820 | down_write(&l->rwsem); | ||
3821 | BUG_ON(!l->use_count); | ||
3822 | if (!--l->use_count) { | ||
3823 | /* we're the last user if refcount is 0; remove and free */ | ||
3824 | list_del(&l->links); | ||
3825 | mutex_unlock(&l->owner->pidlist_mutex); | ||
3826 | pidlist_free(l->list); | ||
3827 | put_pid_ns(l->key.ns); | ||
3828 | up_write(&l->rwsem); | ||
3829 | kfree(l); | ||
3830 | return; | ||
3831 | } | ||
3832 | mutex_unlock(&l->owner->pidlist_mutex); | ||
3833 | up_write(&l->rwsem); | ||
3834 | } | ||
3835 | |||
3836 | static int cgroup_pidlist_release(struct inode *inode, struct file *file) | ||
3837 | { | ||
3838 | struct cgroup_pidlist *l; | ||
3839 | if (!(file->f_mode & FMODE_READ)) | ||
3840 | return 0; | ||
3841 | /* | ||
3842 | * the seq_file will only be initialized if the file was opened for | ||
3843 | * reading; hence we check if it's not null only in that case. | ||
3844 | */ | ||
3845 | l = ((struct seq_file *)file->private_data)->private; | ||
3846 | cgroup_release_pid_array(l); | ||
3847 | return seq_release(inode, file); | ||
3848 | } | ||
3849 | |||
3850 | static const struct file_operations cgroup_pidlist_operations = { | ||
3851 | .read = seq_read, | ||
3852 | .llseek = seq_lseek, | ||
3853 | .write = cgroup_file_write, | ||
3854 | .release = cgroup_pidlist_release, | ||
3855 | }; | ||
3856 | |||
3857 | /* | ||
3858 | * The following functions handle opens on a file that displays a pidlist | ||
3859 | * (tasks or procs). Prepare an array of the process/thread IDs of whoever's | ||
3860 | * in the cgroup. | ||
3861 | */ | ||
3862 | /* helper function for the two below it */ | ||
3863 | static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type) | ||
3864 | { | ||
3865 | struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); | ||
3866 | struct cgroup_pidlist *l; | ||
3867 | int retval; | ||
3868 | |||
3869 | /* Nothing to do for write-only files */ | ||
3870 | if (!(file->f_mode & FMODE_READ)) | ||
3871 | return 0; | ||
3872 | |||
3873 | /* have the array populated */ | ||
3874 | retval = pidlist_array_load(cgrp, type, &l); | ||
3875 | if (retval) | ||
3876 | return retval; | ||
3877 | /* configure file information */ | ||
3878 | file->f_op = &cgroup_pidlist_operations; | ||
3879 | |||
3880 | retval = seq_open(file, &cgroup_pidlist_seq_operations); | ||
3881 | if (retval) { | ||
3882 | cgroup_release_pid_array(l); | ||
3883 | return retval; | ||
3884 | } | ||
3885 | ((struct seq_file *)file->private_data)->private = l; | ||
3886 | return 0; | ||
3887 | } | ||
3888 | static int cgroup_tasks_open(struct inode *unused, struct file *file) | ||
3889 | { | ||
3890 | return cgroup_pidlist_open(file, CGROUP_FILE_TASKS); | ||
3891 | } | ||
3892 | static int cgroup_procs_open(struct inode *unused, struct file *file) | ||
3893 | { | ||
3894 | return cgroup_pidlist_open(file, CGROUP_FILE_PROCS); | ||
3895 | } | ||
3896 | |||
3897 | static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, | 3847 | static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, |
3898 | struct cftype *cft) | 3848 | struct cftype *cft) |
3899 | { | 3849 | { |
@@ -3928,202 +3878,6 @@ static void cgroup_dput(struct cgroup *cgrp) | |||
3928 | deactivate_super(sb); | 3878 | deactivate_super(sb); |
3929 | } | 3879 | } |
3930 | 3880 | ||
3931 | /* | ||
3932 | * Unregister event and free resources. | ||
3933 | * | ||
3934 | * Gets called from workqueue. | ||
3935 | */ | ||
3936 | static void cgroup_event_remove(struct work_struct *work) | ||
3937 | { | ||
3938 | struct cgroup_event *event = container_of(work, struct cgroup_event, | ||
3939 | remove); | ||
3940 | struct cgroup_subsys_state *css = event->css; | ||
3941 | |||
3942 | remove_wait_queue(event->wqh, &event->wait); | ||
3943 | |||
3944 | event->cft->unregister_event(css, event->cft, event->eventfd); | ||
3945 | |||
3946 | /* Notify userspace the event is going away. */ | ||
3947 | eventfd_signal(event->eventfd, 1); | ||
3948 | |||
3949 | eventfd_ctx_put(event->eventfd); | ||
3950 | kfree(event); | ||
3951 | css_put(css); | ||
3952 | } | ||
3953 | |||
3954 | /* | ||
3955 | * Gets called on POLLHUP on eventfd when user closes it. | ||
3956 | * | ||
3957 | * Called with wqh->lock held and interrupts disabled. | ||
3958 | */ | ||
3959 | static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, | ||
3960 | int sync, void *key) | ||
3961 | { | ||
3962 | struct cgroup_event *event = container_of(wait, | ||
3963 | struct cgroup_event, wait); | ||
3964 | struct cgroup *cgrp = event->css->cgroup; | ||
3965 | unsigned long flags = (unsigned long)key; | ||
3966 | |||
3967 | if (flags & POLLHUP) { | ||
3968 | /* | ||
3969 | * If the event has been detached at cgroup removal, we | ||
3970 | * can simply return knowing the other side will cleanup | ||
3971 | * for us. | ||
3972 | * | ||
3973 | * We can't race against event freeing since the other | ||
3974 | * side will require wqh->lock via remove_wait_queue(), | ||
3975 | * which we hold. | ||
3976 | */ | ||
3977 | spin_lock(&cgrp->event_list_lock); | ||
3978 | if (!list_empty(&event->list)) { | ||
3979 | list_del_init(&event->list); | ||
3980 | /* | ||
3981 | * We are in atomic context, but cgroup_event_remove() | ||
3982 | * may sleep, so we have to call it in workqueue. | ||
3983 | */ | ||
3984 | schedule_work(&event->remove); | ||
3985 | } | ||
3986 | spin_unlock(&cgrp->event_list_lock); | ||
3987 | } | ||
3988 | |||
3989 | return 0; | ||
3990 | } | ||
3991 | |||
3992 | static void cgroup_event_ptable_queue_proc(struct file *file, | ||
3993 | wait_queue_head_t *wqh, poll_table *pt) | ||
3994 | { | ||
3995 | struct cgroup_event *event = container_of(pt, | ||
3996 | struct cgroup_event, pt); | ||
3997 | |||
3998 | event->wqh = wqh; | ||
3999 | add_wait_queue(wqh, &event->wait); | ||
4000 | } | ||
4001 | |||
4002 | /* | ||
4003 | * Parse input and register new cgroup event handler. | ||
4004 | * | ||
4005 | * Input must be in format '<event_fd> <control_fd> <args>'. | ||
4006 | * Interpretation of args is defined by control file implementation. | ||
4007 | */ | ||
4008 | static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, | ||
4009 | struct cftype *cft, const char *buffer) | ||
4010 | { | ||
4011 | struct cgroup *cgrp = dummy_css->cgroup; | ||
4012 | struct cgroup_event *event; | ||
4013 | struct cgroup_subsys_state *cfile_css; | ||
4014 | unsigned int efd, cfd; | ||
4015 | struct fd efile; | ||
4016 | struct fd cfile; | ||
4017 | char *endp; | ||
4018 | int ret; | ||
4019 | |||
4020 | efd = simple_strtoul(buffer, &endp, 10); | ||
4021 | if (*endp != ' ') | ||
4022 | return -EINVAL; | ||
4023 | buffer = endp + 1; | ||
4024 | |||
4025 | cfd = simple_strtoul(buffer, &endp, 10); | ||
4026 | if ((*endp != ' ') && (*endp != '\0')) | ||
4027 | return -EINVAL; | ||
4028 | buffer = endp + 1; | ||
4029 | |||
4030 | event = kzalloc(sizeof(*event), GFP_KERNEL); | ||
4031 | if (!event) | ||
4032 | return -ENOMEM; | ||
4033 | |||
4034 | INIT_LIST_HEAD(&event->list); | ||
4035 | init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); | ||
4036 | init_waitqueue_func_entry(&event->wait, cgroup_event_wake); | ||
4037 | INIT_WORK(&event->remove, cgroup_event_remove); | ||
4038 | |||
4039 | efile = fdget(efd); | ||
4040 | if (!efile.file) { | ||
4041 | ret = -EBADF; | ||
4042 | goto out_kfree; | ||
4043 | } | ||
4044 | |||
4045 | event->eventfd = eventfd_ctx_fileget(efile.file); | ||
4046 | if (IS_ERR(event->eventfd)) { | ||
4047 | ret = PTR_ERR(event->eventfd); | ||
4048 | goto out_put_efile; | ||
4049 | } | ||
4050 | |||
4051 | cfile = fdget(cfd); | ||
4052 | if (!cfile.file) { | ||
4053 | ret = -EBADF; | ||
4054 | goto out_put_eventfd; | ||
4055 | } | ||
4056 | |||
4057 | /* the process need read permission on control file */ | ||
4058 | /* AV: shouldn't we check that it's been opened for read instead? */ | ||
4059 | ret = inode_permission(file_inode(cfile.file), MAY_READ); | ||
4060 | if (ret < 0) | ||
4061 | goto out_put_cfile; | ||
4062 | |||
4063 | event->cft = __file_cft(cfile.file); | ||
4064 | if (IS_ERR(event->cft)) { | ||
4065 | ret = PTR_ERR(event->cft); | ||
4066 | goto out_put_cfile; | ||
4067 | } | ||
4068 | |||
4069 | if (!event->cft->ss) { | ||
4070 | ret = -EBADF; | ||
4071 | goto out_put_cfile; | ||
4072 | } | ||
4073 | |||
4074 | /* | ||
4075 | * Determine the css of @cfile, verify it belongs to the same | ||
4076 | * cgroup as cgroup.event_control, and associate @event with it. | ||
4077 | * Remaining events are automatically removed on cgroup destruction | ||
4078 | * but the removal is asynchronous, so take an extra ref. | ||
4079 | */ | ||
4080 | rcu_read_lock(); | ||
4081 | |||
4082 | ret = -EINVAL; | ||
4083 | event->css = cgroup_css(cgrp, event->cft->ss); | ||
4084 | cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, event->cft->ss); | ||
4085 | if (event->css && event->css == cfile_css && css_tryget(event->css)) | ||
4086 | ret = 0; | ||
4087 | |||
4088 | rcu_read_unlock(); | ||
4089 | if (ret) | ||
4090 | goto out_put_cfile; | ||
4091 | |||
4092 | if (!event->cft->register_event || !event->cft->unregister_event) { | ||
4093 | ret = -EINVAL; | ||
4094 | goto out_put_css; | ||
4095 | } | ||
4096 | |||
4097 | ret = event->cft->register_event(event->css, event->cft, | ||
4098 | event->eventfd, buffer); | ||
4099 | if (ret) | ||
4100 | goto out_put_css; | ||
4101 | |||
4102 | efile.file->f_op->poll(efile.file, &event->pt); | ||
4103 | |||
4104 | spin_lock(&cgrp->event_list_lock); | ||
4105 | list_add(&event->list, &cgrp->event_list); | ||
4106 | spin_unlock(&cgrp->event_list_lock); | ||
4107 | |||
4108 | fdput(cfile); | ||
4109 | fdput(efile); | ||
4110 | |||
4111 | return 0; | ||
4112 | |||
4113 | out_put_css: | ||
4114 | css_put(event->css); | ||
4115 | out_put_cfile: | ||
4116 | fdput(cfile); | ||
4117 | out_put_eventfd: | ||
4118 | eventfd_ctx_put(event->eventfd); | ||
4119 | out_put_efile: | ||
4120 | fdput(efile); | ||
4121 | out_kfree: | ||
4122 | kfree(event); | ||
4123 | |||
4124 | return ret; | ||
4125 | } | ||
4126 | |||
4127 | static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, | 3881 | static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, |
4128 | struct cftype *cft) | 3882 | struct cftype *cft) |
4129 | { | 3883 | { |
@@ -4143,17 +3897,15 @@ static int cgroup_clone_children_write(struct cgroup_subsys_state *css, | |||
4143 | static struct cftype cgroup_base_files[] = { | 3897 | static struct cftype cgroup_base_files[] = { |
4144 | { | 3898 | { |
4145 | .name = "cgroup.procs", | 3899 | .name = "cgroup.procs", |
4146 | .open = cgroup_procs_open, | 3900 | .seq_start = cgroup_pidlist_start, |
3901 | .seq_next = cgroup_pidlist_next, | ||
3902 | .seq_stop = cgroup_pidlist_stop, | ||
3903 | .seq_show = cgroup_pidlist_show, | ||
3904 | .private = CGROUP_FILE_PROCS, | ||
4147 | .write_u64 = cgroup_procs_write, | 3905 | .write_u64 = cgroup_procs_write, |
4148 | .release = cgroup_pidlist_release, | ||
4149 | .mode = S_IRUGO | S_IWUSR, | 3906 | .mode = S_IRUGO | S_IWUSR, |
4150 | }, | 3907 | }, |
4151 | { | 3908 | { |
4152 | .name = "cgroup.event_control", | ||
4153 | .write_string = cgroup_write_event_control, | ||
4154 | .mode = S_IWUGO, | ||
4155 | }, | ||
4156 | { | ||
4157 | .name = "cgroup.clone_children", | 3909 | .name = "cgroup.clone_children", |
4158 | .flags = CFTYPE_INSANE, | 3910 | .flags = CFTYPE_INSANE, |
4159 | .read_u64 = cgroup_clone_children_read, | 3911 | .read_u64 = cgroup_clone_children_read, |
@@ -4162,7 +3914,7 @@ static struct cftype cgroup_base_files[] = { | |||
4162 | { | 3914 | { |
4163 | .name = "cgroup.sane_behavior", | 3915 | .name = "cgroup.sane_behavior", |
4164 | .flags = CFTYPE_ONLY_ON_ROOT, | 3916 | .flags = CFTYPE_ONLY_ON_ROOT, |
4165 | .read_seq_string = cgroup_sane_behavior_show, | 3917 | .seq_show = cgroup_sane_behavior_show, |
4166 | }, | 3918 | }, |
4167 | 3919 | ||
4168 | /* | 3920 | /* |
@@ -4173,9 +3925,12 @@ static struct cftype cgroup_base_files[] = { | |||
4173 | { | 3925 | { |
4174 | .name = "tasks", | 3926 | .name = "tasks", |
4175 | .flags = CFTYPE_INSANE, /* use "procs" instead */ | 3927 | .flags = CFTYPE_INSANE, /* use "procs" instead */ |
4176 | .open = cgroup_tasks_open, | 3928 | .seq_start = cgroup_pidlist_start, |
3929 | .seq_next = cgroup_pidlist_next, | ||
3930 | .seq_stop = cgroup_pidlist_stop, | ||
3931 | .seq_show = cgroup_pidlist_show, | ||
3932 | .private = CGROUP_FILE_TASKS, | ||
4177 | .write_u64 = cgroup_tasks_write, | 3933 | .write_u64 = cgroup_tasks_write, |
4178 | .release = cgroup_pidlist_release, | ||
4179 | .mode = S_IRUGO | S_IWUSR, | 3934 | .mode = S_IRUGO | S_IWUSR, |
4180 | }, | 3935 | }, |
4181 | { | 3936 | { |
@@ -4187,7 +3942,7 @@ static struct cftype cgroup_base_files[] = { | |||
4187 | { | 3942 | { |
4188 | .name = "release_agent", | 3943 | .name = "release_agent", |
4189 | .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, | 3944 | .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, |
4190 | .read_seq_string = cgroup_release_agent_show, | 3945 | .seq_show = cgroup_release_agent_show, |
4191 | .write_string = cgroup_release_agent_write, | 3946 | .write_string = cgroup_release_agent_write, |
4192 | .max_write_len = PATH_MAX, | 3947 | .max_write_len = PATH_MAX, |
4193 | }, | 3948 | }, |
@@ -4333,6 +4088,65 @@ static void offline_css(struct cgroup_subsys_state *css) | |||
4333 | RCU_INIT_POINTER(css->cgroup->subsys[ss->subsys_id], css); | 4088 | RCU_INIT_POINTER(css->cgroup->subsys[ss->subsys_id], css); |
4334 | } | 4089 | } |
4335 | 4090 | ||
4091 | /** | ||
4092 | * create_css - create a cgroup_subsys_state | ||
4093 | * @cgrp: the cgroup new css will be associated with | ||
4094 | * @ss: the subsys of new css | ||
4095 | * | ||
4096 | * Create a new css associated with the @cgrp - @ss pair. On success, the new | ||
4097 | * css is online and installed in @cgrp with all interface files created. | ||
4098 | * Returns 0 on success, -errno on failure. | ||
4099 | */ | ||
4100 | static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) | ||
4101 | { | ||
4102 | struct cgroup *parent = cgrp->parent; | ||
4103 | struct cgroup_subsys_state *css; | ||
4104 | int err; | ||
4105 | |||
4106 | lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); | ||
4107 | lockdep_assert_held(&cgroup_mutex); | ||
4108 | |||
4109 | css = ss->css_alloc(cgroup_css(parent, ss)); | ||
4110 | if (IS_ERR(css)) | ||
4111 | return PTR_ERR(css); | ||
4112 | |||
4113 | err = percpu_ref_init(&css->refcnt, css_release); | ||
4114 | if (err) | ||
4115 | goto err_free_css; | ||
4116 | |||
4117 | init_css(css, ss, cgrp); | ||
4118 | |||
4119 | err = cgroup_populate_dir(cgrp, 1 << ss->subsys_id); | ||
4120 | if (err) | ||
4121 | goto err_free_percpu_ref; | ||
4122 | |||
4123 | err = online_css(css); | ||
4124 | if (err) | ||
4125 | goto err_clear_dir; | ||
4126 | |||
4127 | dget(cgrp->dentry); | ||
4128 | css_get(css->parent); | ||
4129 | |||
4130 | if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && | ||
4131 | parent->parent) { | ||
4132 | pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", | ||
4133 | current->comm, current->pid, ss->name); | ||
4134 | if (!strcmp(ss->name, "memory")) | ||
4135 | pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n"); | ||
4136 | ss->warned_broken_hierarchy = true; | ||
4137 | } | ||
4138 | |||
4139 | return 0; | ||
4140 | |||
4141 | err_clear_dir: | ||
4142 | cgroup_clear_dir(css->cgroup, 1 << css->ss->subsys_id); | ||
4143 | err_free_percpu_ref: | ||
4144 | percpu_ref_cancel_init(&css->refcnt); | ||
4145 | err_free_css: | ||
4146 | ss->css_free(css); | ||
4147 | return err; | ||
4148 | } | ||
4149 | |||
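create_css() drives the subsystem callbacks: css_alloc() builds the state, online_css() brings it live, and css_free() undoes the allocation on the error path. For orientation, a hypothetical controller's allocation pair, with invented names but following the pattern used by in-tree controllers, typically looks like this kernel-side fragment:

    /* hypothetical controller state wrapping the generic css */
    struct demo_state {
            struct cgroup_subsys_state css;
            u64 counter;
    };

    static struct cgroup_subsys_state *
    demo_css_alloc(struct cgroup_subsys_state *parent_css)
    {
            struct demo_state *ds = kzalloc(sizeof(*ds), GFP_KERNEL);

            if (!ds)
                    return ERR_PTR(-ENOMEM);
            return &ds->css;        /* create_css() takes over from here */
    }

    static void demo_css_free(struct cgroup_subsys_state *css)
    {
            kfree(container_of(css, struct demo_state, css));
    }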
4336 | /* | 4150 | /* |
4337 | * cgroup_create - create a cgroup | 4151 | * cgroup_create - create a cgroup |
4338 | * @parent: cgroup that will be parent of the new cgroup | 4152 | * @parent: cgroup that will be parent of the new cgroup |
@@ -4344,11 +4158,10 @@ static void offline_css(struct cgroup_subsys_state *css) | |||
4344 | static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | 4158 | static long cgroup_create(struct cgroup *parent, struct dentry *dentry, |
4345 | umode_t mode) | 4159 | umode_t mode) |
4346 | { | 4160 | { |
4347 | struct cgroup_subsys_state *css_ar[CGROUP_SUBSYS_COUNT] = { }; | ||
4348 | struct cgroup *cgrp; | 4161 | struct cgroup *cgrp; |
4349 | struct cgroup_name *name; | 4162 | struct cgroup_name *name; |
4350 | struct cgroupfs_root *root = parent->root; | 4163 | struct cgroupfs_root *root = parent->root; |
4351 | int err = 0; | 4164 | int ssid, err; |
4352 | struct cgroup_subsys *ss; | 4165 | struct cgroup_subsys *ss; |
4353 | struct super_block *sb = root->sb; | 4166 | struct super_block *sb = root->sb; |
4354 | 4167 | ||
@@ -4358,19 +4171,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4358 | return -ENOMEM; | 4171 | return -ENOMEM; |
4359 | 4172 | ||
4360 | name = cgroup_alloc_name(dentry); | 4173 | name = cgroup_alloc_name(dentry); |
4361 | if (!name) | 4174 | if (!name) { |
4175 | err = -ENOMEM; | ||
4362 | goto err_free_cgrp; | 4176 | goto err_free_cgrp; |
4177 | } | ||
4363 | rcu_assign_pointer(cgrp->name, name); | 4178 | rcu_assign_pointer(cgrp->name, name); |
4364 | 4179 | ||
4365 | /* | 4180 | /* |
4366 | * Temporarily set the pointer to NULL, so idr_find() won't return | ||
4367 | * a half-baked cgroup. | ||
4368 | */ | ||
4369 | cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL); | ||
4370 | if (cgrp->id < 0) | ||
4371 | goto err_free_name; | ||
4372 | |||
4373 | /* | ||
4374 | * Only live parents can have children. Note that the liveliness | 4181 | * Only live parents can have children. Note that the liveliness |
4375 | * check isn't strictly necessary because cgroup_mkdir() and | 4182 | * check isn't strictly necessary because cgroup_mkdir() and |
4376 | * cgroup_rmdir() are fully synchronized by i_mutex; however, do it | 4183 | * cgroup_rmdir() are fully synchronized by i_mutex; however, do it |
@@ -4379,7 +4186,17 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4379 | */ | 4186 | */ |
4380 | if (!cgroup_lock_live_group(parent)) { | 4187 | if (!cgroup_lock_live_group(parent)) { |
4381 | err = -ENODEV; | 4188 | err = -ENODEV; |
4382 | goto err_free_id; | 4189 | goto err_free_name; |
4190 | } | ||
4191 | |||
4192 | /* | ||
4193 | * Temporarily set the pointer to NULL, so idr_find() won't return | ||
4194 | * a half-baked cgroup. | ||
4195 | */ | ||
4196 | cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL); | ||
4197 | if (cgrp->id < 0) { | ||
4198 | err = -ENOMEM; | ||
4199 | goto err_unlock; | ||
4383 | } | 4200 | } |
4384 | 4201 | ||
4385 | /* Grab a reference on the superblock so the hierarchy doesn't | 4202 | /* Grab a reference on the superblock so the hierarchy doesn't |
@@ -4404,23 +4221,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4404 | if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) | 4221 | if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) |
4405 | set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); | 4222 | set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); |
4406 | 4223 | ||
4407 | for_each_root_subsys(root, ss) { | ||
4408 | struct cgroup_subsys_state *css; | ||
4409 | |||
4410 | css = ss->css_alloc(cgroup_css(parent, ss)); | ||
4411 | if (IS_ERR(css)) { | ||
4412 | err = PTR_ERR(css); | ||
4413 | goto err_free_all; | ||
4414 | } | ||
4415 | css_ar[ss->subsys_id] = css; | ||
4416 | |||
4417 | err = percpu_ref_init(&css->refcnt, css_release); | ||
4418 | if (err) | ||
4419 | goto err_free_all; | ||
4420 | |||
4421 | init_css(css, ss, cgrp); | ||
4422 | } | ||
4423 | |||
4424 | /* | 4224 | /* |
4425 | * Create directory. cgroup_create_file() returns with the new | 4225 | * Create directory. cgroup_create_file() returns with the new |
4426 | * directory locked on success so that it can be populated without | 4226 | * directory locked on success so that it can be populated without |
@@ -4428,7 +4228,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4428 | */ | 4228 | */ |
4429 | err = cgroup_create_file(dentry, S_IFDIR | mode, sb); | 4229 | err = cgroup_create_file(dentry, S_IFDIR | mode, sb); |
4430 | if (err < 0) | 4230 | if (err < 0) |
4431 | goto err_free_all; | 4231 | goto err_free_id; |
4432 | lockdep_assert_held(&dentry->d_inode->i_mutex); | 4232 | lockdep_assert_held(&dentry->d_inode->i_mutex); |
4433 | 4233 | ||
4434 | cgrp->serial_nr = cgroup_serial_nr_next++; | 4234 | cgrp->serial_nr = cgroup_serial_nr_next++; |
@@ -4440,60 +4240,36 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4440 | /* hold a ref to the parent's dentry */ | 4240 | /* hold a ref to the parent's dentry */ |
4441 | dget(parent->dentry); | 4241 | dget(parent->dentry); |
4442 | 4242 | ||
4443 | /* creation succeeded, notify subsystems */ | 4243 | /* |
4444 | for_each_root_subsys(root, ss) { | 4244 | * @cgrp is now fully operational. If something fails after this |
4445 | struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; | 4245 | * point, it'll be released via the normal destruction path. |
4446 | 4246 | */ | |
4447 | err = online_css(css); | ||
4448 | if (err) | ||
4449 | goto err_destroy; | ||
4450 | |||
4451 | /* each css holds a ref to the cgroup's dentry and parent css */ | ||
4452 | dget(dentry); | ||
4453 | css_get(css->parent); | ||
4454 | |||
4455 | /* mark it consumed for error path */ | ||
4456 | css_ar[ss->subsys_id] = NULL; | ||
4457 | |||
4458 | if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && | ||
4459 | parent->parent) { | ||
4460 | pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", | ||
4461 | current->comm, current->pid, ss->name); | ||
4462 | if (!strcmp(ss->name, "memory")) | ||
4463 | pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n"); | ||
4464 | ss->warned_broken_hierarchy = true; | ||
4465 | } | ||
4466 | } | ||
4467 | |||
4468 | idr_replace(&root->cgroup_idr, cgrp, cgrp->id); | 4247 | idr_replace(&root->cgroup_idr, cgrp, cgrp->id); |
4469 | 4248 | ||
4470 | err = cgroup_addrm_files(cgrp, cgroup_base_files, true); | 4249 | err = cgroup_addrm_files(cgrp, cgroup_base_files, true); |
4471 | if (err) | 4250 | if (err) |
4472 | goto err_destroy; | 4251 | goto err_destroy; |
4473 | 4252 | ||
4474 | err = cgroup_populate_dir(cgrp, root->subsys_mask); | 4253 | /* let's create and online css's */ |
4475 | if (err) | 4254 | for_each_subsys(ss, ssid) { |
4476 | goto err_destroy; | 4255 | if (root->subsys_mask & (1 << ssid)) { |
4256 | err = create_css(cgrp, ss); | ||
4257 | if (err) | ||
4258 | goto err_destroy; | ||
4259 | } | ||
4260 | } | ||
4477 | 4261 | ||
4478 | mutex_unlock(&cgroup_mutex); | 4262 | mutex_unlock(&cgroup_mutex); |
4479 | mutex_unlock(&cgrp->dentry->d_inode->i_mutex); | 4263 | mutex_unlock(&cgrp->dentry->d_inode->i_mutex); |
4480 | 4264 | ||
4481 | return 0; | 4265 | return 0; |
4482 | 4266 | ||
4483 | err_free_all: | ||
4484 | for_each_root_subsys(root, ss) { | ||
4485 | struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; | ||
4486 | |||
4487 | if (css) { | ||
4488 | percpu_ref_cancel_init(&css->refcnt); | ||
4489 | ss->css_free(css); | ||
4490 | } | ||
4491 | } | ||
4492 | mutex_unlock(&cgroup_mutex); | ||
4493 | /* Release the reference count that we took on the superblock */ | ||
4494 | deactivate_super(sb); | ||
4495 | err_free_id: | 4267 | err_free_id: |
4496 | idr_remove(&root->cgroup_idr, cgrp->id); | 4268 | idr_remove(&root->cgroup_idr, cgrp->id); |
4269 | /* Release the reference count that we took on the superblock */ | ||
4270 | deactivate_super(sb); | ||
4271 | err_unlock: | ||
4272 | mutex_unlock(&cgroup_mutex); | ||
4497 | err_free_name: | 4273 | err_free_name: |
4498 | kfree(rcu_dereference_raw(cgrp->name)); | 4274 | kfree(rcu_dereference_raw(cgrp->name)); |
4499 | err_free_cgrp: | 4275 | err_free_cgrp: |
@@ -4501,14 +4277,6 @@ err_free_cgrp: | |||
4501 | return err; | 4277 | return err; |
4502 | 4278 | ||
4503 | err_destroy: | 4279 | err_destroy: |
4504 | for_each_root_subsys(root, ss) { | ||
4505 | struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; | ||
4506 | |||
4507 | if (css) { | ||
4508 | percpu_ref_cancel_init(&css->refcnt); | ||
4509 | ss->css_free(css); | ||
4510 | } | ||
4511 | } | ||
4512 | cgroup_destroy_locked(cgrp); | 4280 | cgroup_destroy_locked(cgrp); |
4513 | mutex_unlock(&cgroup_mutex); | 4281 | mutex_unlock(&cgroup_mutex); |
4514 | mutex_unlock(&dentry->d_inode->i_mutex); | 4282 | mutex_unlock(&dentry->d_inode->i_mutex); |
@@ -4631,10 +4399,10 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) | |||
4631 | __releases(&cgroup_mutex) __acquires(&cgroup_mutex) | 4399 | __releases(&cgroup_mutex) __acquires(&cgroup_mutex) |
4632 | { | 4400 | { |
4633 | struct dentry *d = cgrp->dentry; | 4401 | struct dentry *d = cgrp->dentry; |
4634 | struct cgroup_event *event, *tmp; | 4402 | struct cgroup_subsys_state *css; |
4635 | struct cgroup_subsys *ss; | ||
4636 | struct cgroup *child; | 4403 | struct cgroup *child; |
4637 | bool empty; | 4404 | bool empty; |
4405 | int ssid; | ||
4638 | 4406 | ||
4639 | lockdep_assert_held(&d->d_inode->i_mutex); | 4407 | lockdep_assert_held(&d->d_inode->i_mutex); |
4640 | lockdep_assert_held(&cgroup_mutex); | 4408 | lockdep_assert_held(&cgroup_mutex); |
@@ -4670,12 +4438,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) | |||
4670 | * will be invoked to perform the rest of destruction once the | 4438 | * will be invoked to perform the rest of destruction once the |
4671 | * percpu refs of all css's are confirmed to be killed. | 4439 | * percpu refs of all css's are confirmed to be killed. |
4672 | */ | 4440 | */ |
4673 | for_each_root_subsys(cgrp->root, ss) { | 4441 | for_each_css(css, ssid, cgrp) |
4674 | struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); | 4442 | kill_css(css); |
4675 | |||
4676 | if (css) | ||
4677 | kill_css(css); | ||
4678 | } | ||
4679 | 4443 | ||
4680 | /* | 4444 | /* |
4681 | * Mark @cgrp dead. This prevents further task migration and child | 4445 | * Mark @cgrp dead. This prevents further task migration and child |
@@ -4710,18 +4474,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) | |||
4710 | dget(d); | 4474 | dget(d); |
4711 | cgroup_d_remove_dir(d); | 4475 | cgroup_d_remove_dir(d); |
4712 | 4476 | ||
4713 | /* | ||
4714 | * Unregister events and notify userspace. | ||
4715 | * Notify userspace about cgroup removing only after rmdir of cgroup | ||
4716 | * directory to avoid race between userspace and kernelspace. | ||
4717 | */ | ||
4718 | spin_lock(&cgrp->event_list_lock); | ||
4719 | list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { | ||
4720 | list_del_init(&event->list); | ||
4721 | schedule_work(&event->remove); | ||
4722 | } | ||
4723 | spin_unlock(&cgrp->event_list_lock); | ||
4724 | |||
4725 | return 0; | 4477 | return 0; |
4726 | }; | 4478 | }; |
4727 | 4479 | ||
@@ -4792,7 +4544,6 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) | |||
4792 | cgroup_init_cftsets(ss); | 4544 | cgroup_init_cftsets(ss); |
4793 | 4545 | ||
4794 | /* Create the top cgroup state for this subsystem */ | 4546 | /* Create the top cgroup state for this subsystem */ |
4795 | list_add(&ss->sibling, &cgroup_dummy_root.subsys_list); | ||
4796 | ss->root = &cgroup_dummy_root; | 4547 | ss->root = &cgroup_dummy_root; |
4797 | css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss)); | 4548 | css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss)); |
4798 | /* We don't handle early failures gracefully */ | 4549 | /* We don't handle early failures gracefully */ |
@@ -4866,6 +4617,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | |||
4866 | cgroup_init_cftsets(ss); | 4617 | cgroup_init_cftsets(ss); |
4867 | 4618 | ||
4868 | mutex_lock(&cgroup_mutex); | 4619 | mutex_lock(&cgroup_mutex); |
4620 | mutex_lock(&cgroup_root_mutex); | ||
4869 | cgroup_subsys[ss->subsys_id] = ss; | 4621 | cgroup_subsys[ss->subsys_id] = ss; |
4870 | 4622 | ||
4871 | /* | 4623 | /* |
@@ -4877,11 +4629,11 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | |||
4877 | if (IS_ERR(css)) { | 4629 | if (IS_ERR(css)) { |
4878 | /* failure case - need to deassign the cgroup_subsys[] slot. */ | 4630 | /* failure case - need to deassign the cgroup_subsys[] slot. */ |
4879 | cgroup_subsys[ss->subsys_id] = NULL; | 4631 | cgroup_subsys[ss->subsys_id] = NULL; |
4632 | mutex_unlock(&cgroup_root_mutex); | ||
4880 | mutex_unlock(&cgroup_mutex); | 4633 | mutex_unlock(&cgroup_mutex); |
4881 | return PTR_ERR(css); | 4634 | return PTR_ERR(css); |
4882 | } | 4635 | } |
4883 | 4636 | ||
4884 | list_add(&ss->sibling, &cgroup_dummy_root.subsys_list); | ||
4885 | ss->root = &cgroup_dummy_root; | 4637 | ss->root = &cgroup_dummy_root; |
4886 | 4638 | ||
4887 | /* our new subsystem will be attached to the dummy hierarchy. */ | 4639 | /* our new subsystem will be attached to the dummy hierarchy. */ |
@@ -4911,14 +4663,18 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | |||
4911 | write_unlock(&css_set_lock); | 4663 | write_unlock(&css_set_lock); |
4912 | 4664 | ||
4913 | ret = online_css(css); | 4665 | ret = online_css(css); |
4914 | if (ret) | 4666 | if (ret) { |
4667 | ss->css_free(css); | ||
4915 | goto err_unload; | 4668 | goto err_unload; |
4669 | } | ||
4916 | 4670 | ||
4917 | /* success! */ | 4671 | /* success! */ |
4672 | mutex_unlock(&cgroup_root_mutex); | ||
4918 | mutex_unlock(&cgroup_mutex); | 4673 | mutex_unlock(&cgroup_mutex); |
4919 | return 0; | 4674 | return 0; |
4920 | 4675 | ||
4921 | err_unload: | 4676 | err_unload: |
4677 | mutex_unlock(&cgroup_root_mutex); | ||
4922 | mutex_unlock(&cgroup_mutex); | 4678 | mutex_unlock(&cgroup_mutex); |
4923 | /* @ss can't be mounted here as try_module_get() would fail */ | 4679 | /* @ss can't be mounted here as try_module_get() would fail */ |
4924 | cgroup_unload_subsys(ss); | 4680 | cgroup_unload_subsys(ss); |
@@ -4937,6 +4693,7 @@ EXPORT_SYMBOL_GPL(cgroup_load_subsys); | |||
4937 | void cgroup_unload_subsys(struct cgroup_subsys *ss) | 4693 | void cgroup_unload_subsys(struct cgroup_subsys *ss) |
4938 | { | 4694 | { |
4939 | struct cgrp_cset_link *link; | 4695 | struct cgrp_cset_link *link; |
4696 | struct cgroup_subsys_state *css; | ||
4940 | 4697 | ||
4941 | BUG_ON(ss->module == NULL); | 4698 | BUG_ON(ss->module == NULL); |
4942 | 4699 | ||
@@ -4948,15 +4705,15 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) | |||
4948 | BUG_ON(ss->root != &cgroup_dummy_root); | 4705 | BUG_ON(ss->root != &cgroup_dummy_root); |
4949 | 4706 | ||
4950 | mutex_lock(&cgroup_mutex); | 4707 | mutex_lock(&cgroup_mutex); |
4708 | mutex_lock(&cgroup_root_mutex); | ||
4951 | 4709 | ||
4952 | offline_css(cgroup_css(cgroup_dummy_top, ss)); | 4710 | css = cgroup_css(cgroup_dummy_top, ss); |
4711 | if (css) | ||
4712 | offline_css(css); | ||
4953 | 4713 | ||
4954 | /* deassign the subsys_id */ | 4714 | /* deassign the subsys_id */ |
4955 | cgroup_subsys[ss->subsys_id] = NULL; | 4715 | cgroup_subsys[ss->subsys_id] = NULL; |
4956 | 4716 | ||
4957 | /* remove subsystem from the dummy root's list of subsystems */ | ||
4958 | list_del_init(&ss->sibling); | ||
4959 | |||
4960 | /* | 4717 | /* |
4961 | * disentangle the css from all css_sets attached to the dummy | 4718 | * disentangle the css from all css_sets attached to the dummy |
4962 | * top. as in loading, we need to pay our respects to the hashtable | 4719 | * top. as in loading, we need to pay our respects to the hashtable |
@@ -4979,9 +4736,11 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) | |||
4979 | * need to free before marking as null because ss->css_free needs | 4736 | * need to free before marking as null because ss->css_free needs |
4980 | * the cgrp->subsys pointer to find their state. | 4737 | * the cgrp->subsys pointer to find their state. |
4981 | */ | 4738 | */ |
4982 | ss->css_free(cgroup_css(cgroup_dummy_top, ss)); | 4739 | if (css) |
4740 | ss->css_free(css); | ||
4983 | RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL); | 4741 | RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL); |
4984 | 4742 | ||
4743 | mutex_unlock(&cgroup_root_mutex); | ||
4985 | mutex_unlock(&cgroup_mutex); | 4744 | mutex_unlock(&cgroup_mutex); |
4986 | } | 4745 | } |
4987 | EXPORT_SYMBOL_GPL(cgroup_unload_subsys); | 4746 | EXPORT_SYMBOL_GPL(cgroup_unload_subsys); |
@@ -5100,6 +4859,15 @@ static int __init cgroup_wq_init(void) | |||
5100 | */ | 4859 | */ |
5101 | cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); | 4860 | cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); |
5102 | BUG_ON(!cgroup_destroy_wq); | 4861 | BUG_ON(!cgroup_destroy_wq); |
4862 | |||
4863 | /* | ||
4864 | * Used to destroy pidlists; kept separate so it can serve as a flush | ||
4865 | * domain. Cap @max_active to 1 too. | ||
4866 | */ | ||
4867 | cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy", | ||
4868 | 0, 1); | ||
4869 | BUG_ON(!cgroup_pidlist_destroy_wq); | ||
4870 | |||
5103 | return 0; | 4871 | return 0; |
5104 | } | 4872 | } |
5105 | core_initcall(cgroup_wq_init); | 4873 | core_initcall(cgroup_wq_init); |
@@ -5143,11 +4911,12 @@ int proc_cgroup_show(struct seq_file *m, void *v) | |||
5143 | for_each_active_root(root) { | 4911 | for_each_active_root(root) { |
5144 | struct cgroup_subsys *ss; | 4912 | struct cgroup_subsys *ss; |
5145 | struct cgroup *cgrp; | 4913 | struct cgroup *cgrp; |
5146 | int count = 0; | 4914 | int ssid, count = 0; |
5147 | 4915 | ||
5148 | seq_printf(m, "%d:", root->hierarchy_id); | 4916 | seq_printf(m, "%d:", root->hierarchy_id); |
5149 | for_each_root_subsys(root, ss) | 4917 | for_each_subsys(ss, ssid) |
5150 | seq_printf(m, "%s%s", count++ ? "," : "", ss->name); | 4918 | if (root->subsys_mask & (1 << ssid)) |
4919 | seq_printf(m, "%s%s", count++ ? "," : "", ss->name); | ||
5151 | if (strlen(root->name)) | 4920 | if (strlen(root->name)) |
5152 | seq_printf(m, "%sname=%s", count ? "," : "", | 4921 | seq_printf(m, "%sname=%s", count ? "," : "", |
5153 | root->name); | 4922 | root->name); |
@@ -5488,16 +5257,16 @@ __setup("cgroup_disable=", cgroup_disable); | |||
5488 | * @dentry: directory dentry of interest | 5257 | * @dentry: directory dentry of interest |
5489 | * @ss: subsystem of interest | 5258 | * @ss: subsystem of interest |
5490 | * | 5259 | * |
5491 | * Must be called under RCU read lock. The caller is responsible for | 5260 | * Must be called under cgroup_mutex or RCU read lock. The caller is |
5492 | * pinning the returned css if it needs to be accessed outside the RCU | 5261 | * responsible for pinning the returned css if it needs to be accessed |
5493 | * critical section. | 5262 | * outside the critical section. |
5494 | */ | 5263 | */ |
5495 | struct cgroup_subsys_state *css_from_dir(struct dentry *dentry, | 5264 | struct cgroup_subsys_state *css_from_dir(struct dentry *dentry, |
5496 | struct cgroup_subsys *ss) | 5265 | struct cgroup_subsys *ss) |
5497 | { | 5266 | { |
5498 | struct cgroup *cgrp; | 5267 | struct cgroup *cgrp; |
5499 | 5268 | ||
5500 | WARN_ON_ONCE(!rcu_read_lock_held()); | 5269 | cgroup_assert_mutex_or_rcu_locked(); |
5501 | 5270 | ||
5502 | /* is @dentry a cgroup dir? */ | 5271 | /* is @dentry a cgroup dir? */ |
5503 | if (!dentry->d_inode || | 5272 | if (!dentry->d_inode || |
@@ -5520,9 +5289,7 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) | |||
5520 | { | 5289 | { |
5521 | struct cgroup *cgrp; | 5290 | struct cgroup *cgrp; |
5522 | 5291 | ||
5523 | rcu_lockdep_assert(rcu_read_lock_held() || | 5292 | cgroup_assert_mutex_or_rcu_locked(); |
5524 | lockdep_is_held(&cgroup_mutex), | ||
5525 | "css_from_id() needs proper protection"); | ||
5526 | 5293 | ||
5527 | cgrp = idr_find(&ss->root->cgroup_idr, id); | 5294 | cgrp = idr_find(&ss->root->cgroup_idr, id); |
5528 | if (cgrp) | 5295 | if (cgrp) |
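Both lookup helpers now rely on cgroup_assert_mutex_or_rcu_locked(). Judging from the rcu_lockdep_assert() it replaces in css_from_id(), the helper is presumably a thin wrapper of roughly this shape (assumed; the actual definition lives elsewhere in cgroup.c):

    #define cgroup_assert_mutex_or_rcu_locked()                             \
            rcu_lockdep_assert(rcu_read_lock_held() ||                      \
                               lockdep_is_held(&cgroup_mutex),              \
                               "cgroup_mutex or RCU read lock required");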
@@ -5570,9 +5337,7 @@ static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css, | |||
5570 | return count; | 5337 | return count; |
5571 | } | 5338 | } |
5572 | 5339 | ||
5573 | static int current_css_set_cg_links_read(struct cgroup_subsys_state *css, | 5340 | static int current_css_set_cg_links_read(struct seq_file *seq, void *v) |
5574 | struct cftype *cft, | ||
5575 | struct seq_file *seq) | ||
5576 | { | 5341 | { |
5577 | struct cgrp_cset_link *link; | 5342 | struct cgrp_cset_link *link; |
5578 | struct css_set *cset; | 5343 | struct css_set *cset; |
@@ -5597,9 +5362,9 @@ static int current_css_set_cg_links_read(struct cgroup_subsys_state *css, | |||
5597 | } | 5362 | } |
5598 | 5363 | ||
5599 | #define MAX_TASKS_SHOWN_PER_CSS 25 | 5364 | #define MAX_TASKS_SHOWN_PER_CSS 25 |
5600 | static int cgroup_css_links_read(struct cgroup_subsys_state *css, | 5365 | static int cgroup_css_links_read(struct seq_file *seq, void *v) |
5601 | struct cftype *cft, struct seq_file *seq) | ||
5602 | { | 5366 | { |
5367 | struct cgroup_subsys_state *css = seq_css(seq); | ||
5603 | struct cgrp_cset_link *link; | 5368 | struct cgrp_cset_link *link; |
5604 | 5369 | ||
5605 | read_lock(&css_set_lock); | 5370 | read_lock(&css_set_lock); |
@@ -5645,12 +5410,12 @@ static struct cftype debug_files[] = { | |||
5645 | 5410 | ||
5646 | { | 5411 | { |
5647 | .name = "current_css_set_cg_links", | 5412 | .name = "current_css_set_cg_links", |
5648 | .read_seq_string = current_css_set_cg_links_read, | 5413 | .seq_show = current_css_set_cg_links_read, |
5649 | }, | 5414 | }, |
5650 | 5415 | ||
5651 | { | 5416 | { |
5652 | .name = "cgroup_css_links", | 5417 | .name = "cgroup_css_links", |
5653 | .read_seq_string = cgroup_css_links_read, | 5418 | .seq_show = cgroup_css_links_read, |
5654 | }, | 5419 | }, |
5655 | 5420 | ||
5656 | { | 5421 | { |
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index f0ff64d0ebaa..6c3154e477f6 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c | |||
@@ -301,10 +301,9 @@ out_unlock: | |||
301 | spin_unlock_irq(&freezer->lock); | 301 | spin_unlock_irq(&freezer->lock); |
302 | } | 302 | } |
303 | 303 | ||
304 | static int freezer_read(struct cgroup_subsys_state *css, struct cftype *cft, | 304 | static int freezer_read(struct seq_file *m, void *v) |
305 | struct seq_file *m) | ||
306 | { | 305 | { |
307 | struct cgroup_subsys_state *pos; | 306 | struct cgroup_subsys_state *css = seq_css(m), *pos; |
308 | 307 | ||
309 | rcu_read_lock(); | 308 | rcu_read_lock(); |
310 | 309 | ||
@@ -458,7 +457,7 @@ static struct cftype files[] = { | |||
458 | { | 457 | { |
459 | .name = "state", | 458 | .name = "state", |
460 | .flags = CFTYPE_NOT_ON_ROOT, | 459 | .flags = CFTYPE_NOT_ON_ROOT, |
461 | .read_seq_string = freezer_read, | 460 | .seq_show = freezer_read, |
462 | .write_string = freezer_write, | 461 | .write_string = freezer_write, |
463 | }, | 462 | }, |
464 | { | 463 | { |
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index e5f3917aa05b..6cb20d2e7ee0 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c | |||
@@ -53,10 +53,10 @@ void context_tracking_user_enter(void) | |||
53 | /* | 53 | /* |
54 | * Repeat the user_enter() check here because some archs may be calling | 54 | * Repeat the user_enter() check here because some archs may be calling |
55 | * this from asm and if no CPU needs context tracking, they shouldn't | 55 | * this from asm and if no CPU needs context tracking, they shouldn't |
56 | * go further. Repeat the check here until they support the static key | 56 | * go further. Repeat the check here until they support the inline static |
57 | * check. | 57 | * key check. |
58 | */ | 58 | */ |
59 | if (!static_key_false(&context_tracking_enabled)) | 59 | if (!context_tracking_is_enabled()) |
60 | return; | 60 | return; |
61 | 61 | ||
62 | /* | 62 | /* |
@@ -160,7 +160,7 @@ void context_tracking_user_exit(void) | |||
160 | { | 160 | { |
161 | unsigned long flags; | 161 | unsigned long flags; |
162 | 162 | ||
163 | if (!static_key_false(&context_tracking_enabled)) | 163 | if (!context_tracking_is_enabled()) |
164 | return; | 164 | return; |
165 | 165 | ||
166 | if (in_interrupt()) | 166 | if (in_interrupt()) |
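Both call sites trade the raw static_key_false() test for a context_tracking_is_enabled() helper. Going by the code being replaced, that helper is presumably a trivial inline wrapper along these lines (assumed; the real definition sits in the context tracking headers):

    static inline bool context_tracking_is_enabled(void)
    {
            return static_key_false(&context_tracking_enabled);
    }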
diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c index 988573a9a387..277f494c2a9a 100644 --- a/kernel/cpu/idle.c +++ b/kernel/cpu/idle.c | |||
@@ -105,14 +105,17 @@ static void cpu_idle_loop(void) | |||
105 | __current_set_polling(); | 105 | __current_set_polling(); |
106 | } | 106 | } |
107 | arch_cpu_idle_exit(); | 107 | arch_cpu_idle_exit(); |
108 | /* | ||
109 | * We need to test and propagate the TIF_NEED_RESCHED | ||
110 | * bit here because we might not have send the | ||
111 | * reschedule IPI to idle tasks. | ||
112 | */ | ||
113 | if (tif_need_resched()) | ||
114 | set_preempt_need_resched(); | ||
115 | } | 108 | } |
109 | |||
110 | /* | ||
111 | * Since we fell out of the loop above, we know | ||
112 | * TIF_NEED_RESCHED must be set; propagate it into | ||
113 | * PREEMPT_NEED_RESCHED. | ||
114 | * | ||
115 | * This is required because for polling idle loops we will | ||
116 | * not have had an IPI to fold the state for us. | ||
117 | */ | ||
118 | preempt_set_need_resched(); | ||
116 | tick_nohz_idle_exit(); | 119 | tick_nohz_idle_exit(); |
117 | schedule_preempt_disabled(); | 120 | schedule_preempt_disabled(); |
118 | } | 121 | } |
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 4772034b4b17..e6b1b66afe52 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -974,12 +974,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, | |||
974 | * Temporarilly set tasks mems_allowed to target nodes of migration, | 974 | * Temporarilly set tasks mems_allowed to target nodes of migration, |
975 | * so that the migration code can allocate pages on these nodes. | 975 | * so that the migration code can allocate pages on these nodes. |
976 | * | 976 | * |
977 | * Call holding cpuset_mutex, so current's cpuset won't change | ||
978 | * during this call, as manage_mutex holds off any cpuset_attach() | ||
979 | * calls. Therefore we don't need to take task_lock around the | ||
980 | * call to guarantee_online_mems(), as we know no one is changing | ||
981 | * our task's cpuset. | ||
982 | * | ||
983 | * While the mm_struct we are migrating is typically from some | 977 | * While the mm_struct we are migrating is typically from some |
984 | * other task, the task_struct mems_allowed that we are hacking | 978 | * other task, the task_struct mems_allowed that we are hacking |
985 | * is for our current task, which must allocate new pages for that | 979 | * is for our current task, which must allocate new pages for that |
@@ -996,8 +990,10 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, | |||
996 | 990 | ||
997 | do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); | 991 | do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); |
998 | 992 | ||
993 | rcu_read_lock(); | ||
999 | mems_cs = effective_nodemask_cpuset(task_cs(tsk)); | 994 | mems_cs = effective_nodemask_cpuset(task_cs(tsk)); |
1000 | guarantee_online_mems(mems_cs, &tsk->mems_allowed); | 995 | guarantee_online_mems(mems_cs, &tsk->mems_allowed); |
996 | rcu_read_unlock(); | ||
1001 | } | 997 | } |
1002 | 998 | ||
1003 | /* | 999 | /* |
@@ -1731,66 +1727,41 @@ out_unlock: | |||
1731 | * used, list of ranges of sequential numbers, is variable length, | 1727 | * used, list of ranges of sequential numbers, is variable length, |
1732 | * and since these maps can change value dynamically, one could read | 1728 | * and since these maps can change value dynamically, one could read |
1733 | * gibberish by doing partial reads while a list was changing. | 1729 | * gibberish by doing partial reads while a list was changing. |
1734 | * A single large read to a buffer that crosses a page boundary is | ||
1735 | * ok, because the result being copied to user land is not recomputed | ||
1736 | * across a page fault. | ||
1737 | */ | 1730 | */ |
1738 | 1731 | static int cpuset_common_seq_show(struct seq_file *sf, void *v) | |
1739 | static size_t cpuset_sprintf_cpulist(char *page, struct cpuset *cs) | ||
1740 | { | 1732 | { |
1741 | size_t count; | 1733 | struct cpuset *cs = css_cs(seq_css(sf)); |
1734 | cpuset_filetype_t type = seq_cft(sf)->private; | ||
1735 | ssize_t count; | ||
1736 | char *buf, *s; | ||
1737 | int ret = 0; | ||
1742 | 1738 | ||
1743 | mutex_lock(&callback_mutex); | 1739 | count = seq_get_buf(sf, &buf); |
1744 | count = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed); | 1740 | s = buf; |
1745 | mutex_unlock(&callback_mutex); | ||
1746 | |||
1747 | return count; | ||
1748 | } | ||
1749 | |||
1750 | static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs) | ||
1751 | { | ||
1752 | size_t count; | ||
1753 | 1741 | ||
1754 | mutex_lock(&callback_mutex); | 1742 | mutex_lock(&callback_mutex); |
1755 | count = nodelist_scnprintf(page, PAGE_SIZE, cs->mems_allowed); | ||
1756 | mutex_unlock(&callback_mutex); | ||
1757 | |||
1758 | return count; | ||
1759 | } | ||
1760 | |||
1761 | static ssize_t cpuset_common_file_read(struct cgroup_subsys_state *css, | ||
1762 | struct cftype *cft, struct file *file, | ||
1763 | char __user *buf, size_t nbytes, | ||
1764 | loff_t *ppos) | ||
1765 | { | ||
1766 | struct cpuset *cs = css_cs(css); | ||
1767 | cpuset_filetype_t type = cft->private; | ||
1768 | char *page; | ||
1769 | ssize_t retval = 0; | ||
1770 | char *s; | ||
1771 | |||
1772 | if (!(page = (char *)__get_free_page(GFP_TEMPORARY))) | ||
1773 | return -ENOMEM; | ||
1774 | |||
1775 | s = page; | ||
1776 | 1743 | ||
1777 | switch (type) { | 1744 | switch (type) { |
1778 | case FILE_CPULIST: | 1745 | case FILE_CPULIST: |
1779 | s += cpuset_sprintf_cpulist(s, cs); | 1746 | s += cpulist_scnprintf(s, count, cs->cpus_allowed); |
1780 | break; | 1747 | break; |
1781 | case FILE_MEMLIST: | 1748 | case FILE_MEMLIST: |
1782 | s += cpuset_sprintf_memlist(s, cs); | 1749 | s += nodelist_scnprintf(s, count, cs->mems_allowed); |
1783 | break; | 1750 | break; |
1784 | default: | 1751 | default: |
1785 | retval = -EINVAL; | 1752 | ret = -EINVAL; |
1786 | goto out; | 1753 | goto out_unlock; |
1787 | } | 1754 | } |
1788 | *s++ = '\n'; | ||
1789 | 1755 | ||
1790 | retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page); | 1756 | if (s < buf + count - 1) { |
1791 | out: | 1757 | *s++ = '\n'; |
1792 | free_page((unsigned long)page); | 1758 | seq_commit(sf, s - buf); |
1793 | return retval; | 1759 | } else { |
1760 | seq_commit(sf, -1); | ||
1761 | } | ||
1762 | out_unlock: | ||
1763 | mutex_unlock(&callback_mutex); | ||
1764 | return ret; | ||
1794 | } | 1765 | } |
1795 | 1766 | ||
1796 | static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) | 1767 | static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) |
@@ -1847,7 +1818,7 @@ static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) | |||
1847 | static struct cftype files[] = { | 1818 | static struct cftype files[] = { |
1848 | { | 1819 | { |
1849 | .name = "cpus", | 1820 | .name = "cpus", |
1850 | .read = cpuset_common_file_read, | 1821 | .seq_show = cpuset_common_seq_show, |
1851 | .write_string = cpuset_write_resmask, | 1822 | .write_string = cpuset_write_resmask, |
1852 | .max_write_len = (100U + 6 * NR_CPUS), | 1823 | .max_write_len = (100U + 6 * NR_CPUS), |
1853 | .private = FILE_CPULIST, | 1824 | .private = FILE_CPULIST, |
@@ -1855,7 +1826,7 @@ static struct cftype files[] = { | |||
1855 | 1826 | ||
1856 | { | 1827 | { |
1857 | .name = "mems", | 1828 | .name = "mems", |
1858 | .read = cpuset_common_file_read, | 1829 | .seq_show = cpuset_common_seq_show, |
1859 | .write_string = cpuset_write_resmask, | 1830 | .write_string = cpuset_write_resmask, |
1860 | .max_write_len = (100U + 6 * MAX_NUMNODES), | 1831 | .max_write_len = (100U + 6 * MAX_NUMNODES), |
1861 | .private = FILE_MEMLIST, | 1832 | .private = FILE_MEMLIST, |
@@ -2511,9 +2482,9 @@ int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) | |||
2511 | 2482 | ||
2512 | task_lock(current); | 2483 | task_lock(current); |
2513 | cs = nearest_hardwall_ancestor(task_cs(current)); | 2484 | cs = nearest_hardwall_ancestor(task_cs(current)); |
2485 | allowed = node_isset(node, cs->mems_allowed); | ||
2514 | task_unlock(current); | 2486 | task_unlock(current); |
2515 | 2487 | ||
2516 | allowed = node_isset(node, cs->mems_allowed); | ||
2517 | mutex_unlock(&callback_mutex); | 2488 | mutex_unlock(&callback_mutex); |
2518 | return allowed; | 2489 | return allowed; |
2519 | } | 2490 | } |
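For reference, the seq_get_buf()/seq_commit() pattern that the new cpuset_common_seq_show() relies on looks roughly like this in isolation. This is a hedged sketch rather than part of the patch: demo_cpulist_show() is a made-up name, while seq_get_buf(), seq_commit() and cpulist_scnprintf() are the helpers the hunk itself uses.

#include <linux/seq_file.h>
#include <linux/cpumask.h>

static int demo_cpulist_show(struct seq_file *sf, void *v)
{
	char *buf;
	ssize_t count = seq_get_buf(sf, &buf);	/* borrow seq_file's own buffer */
	int len;

	/* format directly into the borrowed buffer, as the cpuset code does */
	len = cpulist_scnprintf(buf, count, cpu_online_mask);

	if (len < count - 1) {
		buf[len++] = '\n';
		seq_commit(sf, len);	/* publish exactly the bytes written */
	} else {
		seq_commit(sf, -1);	/* overflow: seq_file retries with a larger buffer */
	}
	return 0;
}

Compared with the removed cpuset_common_file_read(), the seq_file core now owns the buffer allocation and the partial-read bookkeeping, which is why the GFP_TEMPORARY page and simple_read_from_buffer() call disappear from the cpuset code.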
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 7d2f35e5df2f..334b3980ffc1 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c | |||
@@ -736,7 +736,8 @@ int kgdb_nmicallback(int cpu, void *regs) | |||
736 | return 1; | 736 | return 1; |
737 | } | 737 | } |
738 | 738 | ||
739 | int kgdb_nmicallin(int cpu, int trapnr, void *regs, atomic_t *send_ready) | 739 | int kgdb_nmicallin(int cpu, int trapnr, void *regs, int err_code, |
740 | atomic_t *send_ready) | ||
740 | { | 741 | { |
741 | #ifdef CONFIG_SMP | 742 | #ifdef CONFIG_SMP |
742 | if (!kgdb_io_ready(0) || !send_ready) | 743 | if (!kgdb_io_ready(0) || !send_ready) |
@@ -750,7 +751,7 @@ int kgdb_nmicallin(int cpu, int trapnr, void *regs, atomic_t *send_ready) | |||
750 | ks->cpu = cpu; | 751 | ks->cpu = cpu; |
751 | ks->ex_vector = trapnr; | 752 | ks->ex_vector = trapnr; |
752 | ks->signo = SIGTRAP; | 753 | ks->signo = SIGTRAP; |
753 | ks->err_code = KGDB_KDB_REASON_SYSTEM_NMI; | 754 | ks->err_code = err_code; |
754 | ks->linux_regs = regs; | 755 | ks->linux_regs = regs; |
755 | ks->send_ready = send_ready; | 756 | ks->send_ready = send_ready; |
756 | kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER); | 757 | kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER); |
diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h index 572aa4f5677c..127d9bc49fb4 100644 --- a/kernel/debug/debug_core.h +++ b/kernel/debug/debug_core.h | |||
@@ -75,13 +75,11 @@ extern int kdb_stub(struct kgdb_state *ks); | |||
75 | extern int kdb_parse(const char *cmdstr); | 75 | extern int kdb_parse(const char *cmdstr); |
76 | extern int kdb_common_init_state(struct kgdb_state *ks); | 76 | extern int kdb_common_init_state(struct kgdb_state *ks); |
77 | extern int kdb_common_deinit_state(void); | 77 | extern int kdb_common_deinit_state(void); |
78 | #define KGDB_KDB_REASON_SYSTEM_NMI KDB_REASON_SYSTEM_NMI | ||
79 | #else /* ! CONFIG_KGDB_KDB */ | 78 | #else /* ! CONFIG_KGDB_KDB */ |
80 | static inline int kdb_stub(struct kgdb_state *ks) | 79 | static inline int kdb_stub(struct kgdb_state *ks) |
81 | { | 80 | { |
82 | return DBG_PASS_EVENT; | 81 | return DBG_PASS_EVENT; |
83 | } | 82 | } |
84 | #define KGDB_KDB_REASON_SYSTEM_NMI 0 | ||
85 | #endif /* CONFIG_KGDB_KDB */ | 83 | #endif /* CONFIG_KGDB_KDB */ |
86 | 84 | ||
87 | #endif /* _DEBUG_CORE_H_ */ | 85 | #endif /* _DEBUG_CORE_H_ */ |
diff --git a/kernel/events/core.c b/kernel/events/core.c index f5744010a8d2..fa0b2d4ad83c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c | |||
@@ -119,7 +119,8 @@ static int cpu_function_call(int cpu, int (*func) (void *info), void *info) | |||
119 | 119 | ||
120 | #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ | 120 | #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ |
121 | PERF_FLAG_FD_OUTPUT |\ | 121 | PERF_FLAG_FD_OUTPUT |\ |
122 | PERF_FLAG_PID_CGROUP) | 122 | PERF_FLAG_PID_CGROUP |\ |
123 | PERF_FLAG_FD_CLOEXEC) | ||
123 | 124 | ||
124 | /* | 125 | /* |
125 | * branch priv levels that need permission checks | 126 | * branch priv levels that need permission checks |
@@ -3542,7 +3543,7 @@ static void perf_event_for_each(struct perf_event *event, | |||
3542 | static int perf_event_period(struct perf_event *event, u64 __user *arg) | 3543 | static int perf_event_period(struct perf_event *event, u64 __user *arg) |
3543 | { | 3544 | { |
3544 | struct perf_event_context *ctx = event->ctx; | 3545 | struct perf_event_context *ctx = event->ctx; |
3545 | int ret = 0; | 3546 | int ret = 0, active; |
3546 | u64 value; | 3547 | u64 value; |
3547 | 3548 | ||
3548 | if (!is_sampling_event(event)) | 3549 | if (!is_sampling_event(event)) |
@@ -3566,6 +3567,20 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg) | |||
3566 | event->attr.sample_period = value; | 3567 | event->attr.sample_period = value; |
3567 | event->hw.sample_period = value; | 3568 | event->hw.sample_period = value; |
3568 | } | 3569 | } |
3570 | |||
3571 | active = (event->state == PERF_EVENT_STATE_ACTIVE); | ||
3572 | if (active) { | ||
3573 | perf_pmu_disable(ctx->pmu); | ||
3574 | event->pmu->stop(event, PERF_EF_UPDATE); | ||
3575 | } | ||
3576 | |||
3577 | local64_set(&event->hw.period_left, 0); | ||
3578 | |||
3579 | if (active) { | ||
3580 | event->pmu->start(event, PERF_EF_RELOAD); | ||
3581 | perf_pmu_enable(ctx->pmu); | ||
3582 | } | ||
3583 | |||
3569 | unlock: | 3584 | unlock: |
3570 | raw_spin_unlock_irq(&ctx->lock); | 3585 | raw_spin_unlock_irq(&ctx->lock); |
3571 | 3586 | ||
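The hunk above makes a period change take effect immediately: if the event is active it is stopped, hw.period_left is cleared, and the event is restarted with PERF_EF_RELOAD. From user space the knob involved is the PERF_EVENT_IOC_PERIOD ioctl; a minimal, hedged example of exercising it (error handling trimmed):

#include <linux/perf_event.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>
#include <stdint.h>
#include <stdio.h>

static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
			    int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_event_attr attr = {
		.type		= PERF_TYPE_HARDWARE,
		.size		= sizeof(attr),
		.config		= PERF_COUNT_HW_CPU_CYCLES,
		.sample_period	= 1000000,
		.disabled	= 1,
	};
	uint64_t new_period = 4000000;
	int fd = perf_event_open(&attr, 0, -1, -1, 0);

	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}
	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
	/* the event is now counting; change its sampling period on the fly */
	if (ioctl(fd, PERF_EVENT_IOC_PERIOD, &new_period))
		perror("PERF_EVENT_IOC_PERIOD");
	close(fd);
	return 0;
}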
@@ -6670,6 +6685,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, | |||
6670 | INIT_LIST_HEAD(&event->event_entry); | 6685 | INIT_LIST_HEAD(&event->event_entry); |
6671 | INIT_LIST_HEAD(&event->sibling_list); | 6686 | INIT_LIST_HEAD(&event->sibling_list); |
6672 | INIT_LIST_HEAD(&event->rb_entry); | 6687 | INIT_LIST_HEAD(&event->rb_entry); |
6688 | INIT_LIST_HEAD(&event->active_entry); | ||
6689 | INIT_HLIST_NODE(&event->hlist_entry); | ||
6690 | |||
6673 | 6691 | ||
6674 | init_waitqueue_head(&event->waitq); | 6692 | init_waitqueue_head(&event->waitq); |
6675 | init_irq_work(&event->pending, perf_pending_event); | 6693 | init_irq_work(&event->pending, perf_pending_event); |
@@ -6980,6 +6998,7 @@ SYSCALL_DEFINE5(perf_event_open, | |||
6980 | int event_fd; | 6998 | int event_fd; |
6981 | int move_group = 0; | 6999 | int move_group = 0; |
6982 | int err; | 7000 | int err; |
7001 | int f_flags = O_RDWR; | ||
6983 | 7002 | ||
6984 | /* for future expandability... */ | 7003 | /* for future expandability... */ |
6985 | if (flags & ~PERF_FLAG_ALL) | 7004 | if (flags & ~PERF_FLAG_ALL) |
@@ -7008,7 +7027,10 @@ SYSCALL_DEFINE5(perf_event_open, | |||
7008 | if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1)) | 7027 | if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1)) |
7009 | return -EINVAL; | 7028 | return -EINVAL; |
7010 | 7029 | ||
7011 | event_fd = get_unused_fd(); | 7030 | if (flags & PERF_FLAG_FD_CLOEXEC) |
7031 | f_flags |= O_CLOEXEC; | ||
7032 | |||
7033 | event_fd = get_unused_fd_flags(f_flags); | ||
7012 | if (event_fd < 0) | 7034 | if (event_fd < 0) |
7013 | return event_fd; | 7035 | return event_fd; |
7014 | 7036 | ||
@@ -7130,7 +7152,8 @@ SYSCALL_DEFINE5(perf_event_open, | |||
7130 | goto err_context; | 7152 | goto err_context; |
7131 | } | 7153 | } |
7132 | 7154 | ||
7133 | event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR); | 7155 | event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, |
7156 | f_flags); | ||
7134 | if (IS_ERR(event_file)) { | 7157 | if (IS_ERR(event_file)) { |
7135 | err = PTR_ERR(event_file); | 7158 | err = PTR_ERR(event_file); |
7136 | goto err_context; | 7159 | goto err_context; |
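With PERF_FLAG_FD_CLOEXEC accepted by the syscall and propagated into get_unused_fd_flags() and anon_inode_getfile(), user space can obtain a close-on-exec perf fd atomically instead of racing a later fcntl(). A hedged sketch; the fallback #define assumes the uapi value of the new flag for older headers:

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <fcntl.h>
#include <stdio.h>

#ifndef PERF_FLAG_FD_CLOEXEC
#define PERF_FLAG_FD_CLOEXEC	(1UL << 3)	/* assumed uapi value of the new flag */
#endif

int main(void)
{
	struct perf_event_attr attr = {
		.type		= PERF_TYPE_SOFTWARE,
		.size		= sizeof(attr),
		.config		= PERF_COUNT_SW_TASK_CLOCK,
		.disabled	= 1,
	};
	int fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1,
			 PERF_FLAG_FD_CLOEXEC);

	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}
	/* O_CLOEXEC was applied at fd creation, so there is no fcntl() race window */
	printf("FD_CLOEXEC set: %d\n", !!(fcntl(fd, F_GETFD) & FD_CLOEXEC));
	close(fd);
	return 0;
}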
@@ -7833,14 +7856,14 @@ static void perf_pmu_rotate_stop(struct pmu *pmu) | |||
7833 | static void __perf_event_exit_context(void *__info) | 7856 | static void __perf_event_exit_context(void *__info) |
7834 | { | 7857 | { |
7835 | struct perf_event_context *ctx = __info; | 7858 | struct perf_event_context *ctx = __info; |
7836 | struct perf_event *event, *tmp; | 7859 | struct perf_event *event; |
7837 | 7860 | ||
7838 | perf_pmu_rotate_stop(ctx->pmu); | 7861 | perf_pmu_rotate_stop(ctx->pmu); |
7839 | 7862 | ||
7840 | list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) | 7863 | rcu_read_lock(); |
7841 | __perf_remove_from_context(event); | 7864 | list_for_each_entry_rcu(event, &ctx->event_list, event_entry) |
7842 | list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) | ||
7843 | __perf_remove_from_context(event); | 7865 | __perf_remove_from_context(event); |
7866 | rcu_read_unlock(); | ||
7844 | } | 7867 | } |
7845 | 7868 | ||
7846 | static void perf_event_exit_cpu_context(int cpu) | 7869 | static void perf_event_exit_cpu_context(int cpu) |
@@ -7864,11 +7887,11 @@ static void perf_event_exit_cpu(int cpu) | |||
7864 | { | 7887 | { |
7865 | struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); | 7888 | struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); |
7866 | 7889 | ||
7890 | perf_event_exit_cpu_context(cpu); | ||
7891 | |||
7867 | mutex_lock(&swhash->hlist_mutex); | 7892 | mutex_lock(&swhash->hlist_mutex); |
7868 | swevent_hlist_release(swhash); | 7893 | swevent_hlist_release(swhash); |
7869 | mutex_unlock(&swhash->hlist_mutex); | 7894 | mutex_unlock(&swhash->hlist_mutex); |
7870 | |||
7871 | perf_event_exit_cpu_context(cpu); | ||
7872 | } | 7895 | } |
7873 | #else | 7896 | #else |
7874 | static inline void perf_event_exit_cpu(int cpu) { } | 7897 | static inline void perf_event_exit_cpu(int cpu) { } |
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index e8b168af135b..146a5792b1d2 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c | |||
@@ -61,19 +61,20 @@ again: | |||
61 | * | 61 | * |
62 | * kernel user | 62 | * kernel user |
63 | * | 63 | * |
64 | * READ ->data_tail READ ->data_head | 64 | * if (LOAD ->data_tail) { LOAD ->data_head |
65 | * smp_mb() (A) smp_rmb() (C) | 65 | * (A) smp_rmb() (C) |
66 | * WRITE $data READ $data | 66 | * STORE $data LOAD $data |
67 | * smp_wmb() (B) smp_mb() (D) | 67 | * smp_wmb() (B) smp_mb() (D) |
68 | * STORE ->data_head WRITE ->data_tail | 68 | * STORE ->data_head STORE ->data_tail |
69 | * } | ||
69 | * | 70 | * |
70 | * Where A pairs with D, and B pairs with C. | 71 | * Where A pairs with D, and B pairs with C. |
71 | * | 72 | * |
72 | * I don't think A needs to be a full barrier because we won't in fact | 73 | * In our case (A) is a control dependency that separates the load of |
73 | * write data until we see the store from userspace. So we simply don't | 74 | * the ->data_tail and the stores of $data. In case ->data_tail |
74 | * issue the data WRITE until we observe it. Be conservative for now. | 75 | * indicates there is no room in the buffer to store $data we do not. |
75 | * | 76 | * |
76 | * OTOH, D needs to be a full barrier since it separates the data READ | 77 | * D needs to be a full barrier since it separates the data READ |
77 | * from the tail WRITE. | 78 | * from the tail WRITE. |
78 | * | 79 | * |
79 | * For B a WMB is sufficient since it separates two WRITEs, and for C | 80 | * For B a WMB is sufficient since it separates two WRITEs, and for C |
@@ -81,7 +82,7 @@ again: | |||
81 | * | 82 | * |
82 | * See perf_output_begin(). | 83 | * See perf_output_begin(). |
83 | */ | 84 | */ |
84 | smp_wmb(); | 85 | smp_wmb(); /* B, matches C */ |
85 | rb->user_page->data_head = head; | 86 | rb->user_page->data_head = head; |
86 | 87 | ||
87 | /* | 88 | /* |
@@ -144,17 +145,26 @@ int perf_output_begin(struct perf_output_handle *handle, | |||
144 | if (!rb->overwrite && | 145 | if (!rb->overwrite && |
145 | unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size)) | 146 | unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size)) |
146 | goto fail; | 147 | goto fail; |
148 | |||
149 | /* | ||
150 | * The above forms a control dependency barrier separating the | ||
151 | * @tail load above from the data stores below, since the @tail | ||
152 | * load is required to compute the branch to fail below. | ||
153 | * | ||
154 | * A, matches D; the full memory barrier userspace SHOULD issue | ||
155 | * after reading the data and before storing the new tail | ||
156 | * position. | ||
157 | * | ||
158 | * See perf_output_put_handle(). | ||
159 | */ | ||
160 | |||
147 | head += size; | 161 | head += size; |
148 | } while (local_cmpxchg(&rb->head, offset, head) != offset); | 162 | } while (local_cmpxchg(&rb->head, offset, head) != offset); |
149 | 163 | ||
150 | /* | 164 | /* |
151 | * Separate the userpage->tail read from the data stores below. | 165 | * We rely on the implied barrier() by local_cmpxchg() to ensure |
152 | * Matches the MB userspace SHOULD issue after reading the data | 166 | * none of the data stores below can be lifted up by the compiler. |
153 | * and before storing the new tail position. | ||
154 | * | ||
155 | * See perf_output_put_handle(). | ||
156 | */ | 167 | */ |
157 | smp_mb(); | ||
158 | 168 | ||
159 | if (unlikely(head - local_read(&rb->wakeup) > rb->watermark)) | 169 | if (unlikely(head - local_read(&rb->wakeup) > rb->watermark)) |
160 | local_add(rb->watermark, &rb->wakeup); | 170 | local_add(rb->watermark, &rb->wakeup); |
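The rewritten comment describes the kernel-side half of the pairing; the user-space reader supplies barriers (C) and (D). Below is a hedged sketch of such a reader, with record parsing and wrap-around handling deliberately simplified, using compiler atomics in place of explicit barrier macros:

#include <linux/perf_event.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>

/* Consume one record, if any, from a mapped perf ring buffer whose metadata
 * page is at @meta and whose data area (starting one page later) is
 * @data_size bytes long, @data_size being a power of two. */
static int read_one_event(struct perf_event_mmap_page *meta, size_t data_size,
			  void *out, size_t out_len)
{
	char *data = (char *)meta + sysconf(_SC_PAGESIZE);
	/* (C): acquire-load of data_head so the data reads cannot pass it */
	uint64_t head = __atomic_load_n(&meta->data_head, __ATOMIC_ACQUIRE);
	uint64_t tail = meta->data_tail;	/* only this reader writes data_tail */

	if (tail == head)
		return 0;			/* nothing new */

	struct perf_event_header *hdr =
		(struct perf_event_header *)(data + (tail & (data_size - 1)));
	size_t len = hdr->size < out_len ? hdr->size : out_len;

	memcpy(out, hdr, len);			/* wrap-around ignored for brevity */

	/* (D): order the data reads above before publishing the new tail */
	__atomic_store_n(&meta->data_tail, tail + hdr->size, __ATOMIC_SEQ_CST);
	return 1;
}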
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 24b7d6ca871b..307d87c0991a 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c | |||
@@ -73,6 +73,17 @@ struct uprobe { | |||
73 | struct inode *inode; /* Also hold a ref to inode */ | 73 | struct inode *inode; /* Also hold a ref to inode */ |
74 | loff_t offset; | 74 | loff_t offset; |
75 | unsigned long flags; | 75 | unsigned long flags; |
76 | |||
77 | /* | ||
78 | * The generic code assumes that it has two members of unknown type | ||
79 | * owned by the arch-specific code: | ||
80 | * | ||
81 | * insn - copy_insn() saves the original instruction here for | ||
82 | * arch_uprobe_analyze_insn(). | ||
83 | * | ||
84 | * ixol - potentially modified instruction to execute out of | ||
85 | * line, copied to xol_area by xol_get_insn_slot(). | ||
86 | */ | ||
76 | struct arch_uprobe arch; | 87 | struct arch_uprobe arch; |
77 | }; | 88 | }; |
78 | 89 | ||
@@ -86,6 +97,29 @@ struct return_instance { | |||
86 | }; | 97 | }; |
87 | 98 | ||
88 | /* | 99 | /* |
100 | * Execute out of line area: anonymous executable mapping installed | ||
101 | * by the probed task to execute the copy of the original instruction | ||
102 | * mangled by set_swbp(). | ||
103 | * | ||
104 | * On a breakpoint hit, a thread contends for a slot. It frees the | ||
105 | * slot after singlestep. Currently a fixed number of slots are | ||
106 | * allocated. | ||
107 | */ | ||
108 | struct xol_area { | ||
109 | wait_queue_head_t wq; /* if all slots are busy */ | ||
110 | atomic_t slot_count; /* number of in-use slots */ | ||
111 | unsigned long *bitmap; /* 0 = free slot */ | ||
112 | struct page *page; | ||
113 | |||
114 | /* | ||
115 | * We keep the vma's vm_start rather than a pointer to the vma | ||
116 | * itself. The probed process or a naughty kernel module could make | ||
117 | * the vma go away, and we must handle that reasonably gracefully. | ||
118 | */ | ||
119 | unsigned long vaddr; /* Page(s) of instruction slots */ | ||
120 | }; | ||
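A hedged user-space analogue of the slot bookkeeping in struct xol_area: a bitmap in which a clear bit means a free slot, claimed with an atomic test-and-set and released by clearing the bit. The names here (take_slot(), put_slot(), NSLOTS) are illustrative only and do not appear in the kernel code.

#include <limits.h>
#include <stdio.h>

#define NSLOTS		128
#define BITS_PER_LONG	(sizeof(unsigned long) * CHAR_BIT)

static unsigned long slot_bitmap[NSLOTS / BITS_PER_LONG];

static int take_slot(void)
{
	int slot;

	for (slot = 0; slot < NSLOTS; slot++) {
		unsigned long *word = &slot_bitmap[slot / BITS_PER_LONG];
		unsigned long mask = 1UL << (slot % BITS_PER_LONG);

		/* atomically set the bit; the old value tells us if it was free */
		if (!(__atomic_fetch_or(word, mask, __ATOMIC_ACQUIRE) & mask))
			return slot;
	}
	return -1;				/* all slots busy: the kernel code would wait */
}

static void put_slot(int slot)
{
	unsigned long mask = 1UL << (slot % BITS_PER_LONG);

	__atomic_fetch_and(&slot_bitmap[slot / BITS_PER_LONG], ~mask,
			   __ATOMIC_RELEASE);
}

int main(void)
{
	int slot = take_slot();

	printf("claimed slot %d\n", slot);
	if (slot >= 0)
		put_slot(slot);
	return 0;
}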
121 | |||
122 | /* | ||
89 | * valid_vma: Verify if the specified vma is an executable vma | 123 | * valid_vma: Verify if the specified vma is an executable vma |
90 | * Relax restrictions while unregistering: vm_flags might have | 124 | * Relax restrictions while unregistering: vm_flags might have |
91 | * changed after breakpoint was inserted. | 125 | * changed after breakpoint was inserted. |
@@ -330,7 +364,7 @@ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned | |||
330 | int __weak | 364 | int __weak |
331 | set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) | 365 | set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) |
332 | { | 366 | { |
333 | return uprobe_write_opcode(mm, vaddr, *(uprobe_opcode_t *)auprobe->insn); | 367 | return uprobe_write_opcode(mm, vaddr, *(uprobe_opcode_t *)&auprobe->insn); |
334 | } | 368 | } |
335 | 369 | ||
336 | static int match_uprobe(struct uprobe *l, struct uprobe *r) | 370 | static int match_uprobe(struct uprobe *l, struct uprobe *r) |
@@ -529,8 +563,8 @@ static int copy_insn(struct uprobe *uprobe, struct file *filp) | |||
529 | { | 563 | { |
530 | struct address_space *mapping = uprobe->inode->i_mapping; | 564 | struct address_space *mapping = uprobe->inode->i_mapping; |
531 | loff_t offs = uprobe->offset; | 565 | loff_t offs = uprobe->offset; |
532 | void *insn = uprobe->arch.insn; | 566 | void *insn = &uprobe->arch.insn; |
533 | int size = MAX_UINSN_BYTES; | 567 | int size = sizeof(uprobe->arch.insn); |
534 | int len, err = -EIO; | 568 | int len, err = -EIO; |
535 | 569 | ||
536 | /* Copy only available bytes, -EIO if nothing was read */ | 570 | /* Copy only available bytes, -EIO if nothing was read */ |
@@ -569,7 +603,7 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file, | |||
569 | goto out; | 603 | goto out; |
570 | 604 | ||
571 | ret = -ENOTSUPP; | 605 | ret = -ENOTSUPP; |
572 | if (is_trap_insn((uprobe_opcode_t *)uprobe->arch.insn)) | 606 | if (is_trap_insn((uprobe_opcode_t *)&uprobe->arch.insn)) |
573 | goto out; | 607 | goto out; |
574 | 608 | ||
575 | ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr); | 609 | ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr); |
@@ -1264,7 +1298,7 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe) | |||
1264 | 1298 | ||
1265 | /* Initialize the slot */ | 1299 | /* Initialize the slot */ |
1266 | copy_to_page(area->page, xol_vaddr, | 1300 | copy_to_page(area->page, xol_vaddr, |
1267 | uprobe->arch.ixol, sizeof(uprobe->arch.ixol)); | 1301 | &uprobe->arch.ixol, sizeof(uprobe->arch.ixol)); |
1268 | /* | 1302 | /* |
1269 | * We probably need flush_icache_user_range() but it needs vma. | 1303 | * We probably need flush_icache_user_range() but it needs vma. |
1270 | * This should work on supported architectures too. | 1304 | * This should work on supported architectures too. |
@@ -1403,12 +1437,10 @@ static void uprobe_warn(struct task_struct *t, const char *msg) | |||
1403 | 1437 | ||
1404 | static void dup_xol_work(struct callback_head *work) | 1438 | static void dup_xol_work(struct callback_head *work) |
1405 | { | 1439 | { |
1406 | kfree(work); | ||
1407 | |||
1408 | if (current->flags & PF_EXITING) | 1440 | if (current->flags & PF_EXITING) |
1409 | return; | 1441 | return; |
1410 | 1442 | ||
1411 | if (!__create_xol_area(current->utask->vaddr)) | 1443 | if (!__create_xol_area(current->utask->dup_xol_addr)) |
1412 | uprobe_warn(current, "dup xol area"); | 1444 | uprobe_warn(current, "dup xol area"); |
1413 | } | 1445 | } |
1414 | 1446 | ||
@@ -1419,7 +1451,6 @@ void uprobe_copy_process(struct task_struct *t, unsigned long flags) | |||
1419 | { | 1451 | { |
1420 | struct uprobe_task *utask = current->utask; | 1452 | struct uprobe_task *utask = current->utask; |
1421 | struct mm_struct *mm = current->mm; | 1453 | struct mm_struct *mm = current->mm; |
1422 | struct callback_head *work; | ||
1423 | struct xol_area *area; | 1454 | struct xol_area *area; |
1424 | 1455 | ||
1425 | t->utask = NULL; | 1456 | t->utask = NULL; |
@@ -1441,14 +1472,9 @@ void uprobe_copy_process(struct task_struct *t, unsigned long flags) | |||
1441 | if (mm == t->mm) | 1472 | if (mm == t->mm) |
1442 | return; | 1473 | return; |
1443 | 1474 | ||
1444 | /* TODO: move it into the union in uprobe_task */ | 1475 | t->utask->dup_xol_addr = area->vaddr; |
1445 | work = kmalloc(sizeof(*work), GFP_KERNEL); | 1476 | init_task_work(&t->utask->dup_xol_work, dup_xol_work); |
1446 | if (!work) | 1477 | task_work_add(t, &t->utask->dup_xol_work, true); |
1447 | return uprobe_warn(t, "dup xol area"); | ||
1448 | |||
1449 | t->utask->vaddr = area->vaddr; | ||
1450 | init_task_work(work, dup_xol_work); | ||
1451 | task_work_add(t, work, true); | ||
1452 | } | 1478 | } |
1453 | 1479 | ||
1454 | /* | 1480 | /* |
@@ -1828,6 +1854,10 @@ static void handle_swbp(struct pt_regs *regs) | |||
1828 | if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags))) | 1854 | if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags))) |
1829 | goto out; | 1855 | goto out; |
1830 | 1856 | ||
1857 | /* Tracing handlers use ->utask to communicate with fetch methods */ | ||
1858 | if (!get_utask()) | ||
1859 | goto out; | ||
1860 | |||
1831 | handler_chain(uprobe, regs); | 1861 | handler_chain(uprobe, regs); |
1832 | if (can_skip_sstep(uprobe, regs)) | 1862 | if (can_skip_sstep(uprobe, regs)) |
1833 | goto out; | 1863 | goto out; |
diff --git a/kernel/exit.c b/kernel/exit.c index a949819055d5..1e77fc645317 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -74,6 +74,7 @@ static void __unhash_process(struct task_struct *p, bool group_dead) | |||
74 | __this_cpu_dec(process_counts); | 74 | __this_cpu_dec(process_counts); |
75 | } | 75 | } |
76 | list_del_rcu(&p->thread_group); | 76 | list_del_rcu(&p->thread_group); |
77 | list_del_rcu(&p->thread_node); | ||
77 | } | 78 | } |
78 | 79 | ||
79 | /* | 80 | /* |
diff --git a/kernel/fork.c b/kernel/fork.c index 5721f0e3f2da..a17621c6cd42 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -800,14 +800,11 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) | |||
800 | * Allocate a new mm structure and copy contents from the | 800 | * Allocate a new mm structure and copy contents from the |
801 | * mm structure of the passed in task structure. | 801 | * mm structure of the passed in task structure. |
802 | */ | 802 | */ |
803 | struct mm_struct *dup_mm(struct task_struct *tsk) | 803 | static struct mm_struct *dup_mm(struct task_struct *tsk) |
804 | { | 804 | { |
805 | struct mm_struct *mm, *oldmm = current->mm; | 805 | struct mm_struct *mm, *oldmm = current->mm; |
806 | int err; | 806 | int err; |
807 | 807 | ||
808 | if (!oldmm) | ||
809 | return NULL; | ||
810 | |||
811 | mm = allocate_mm(); | 808 | mm = allocate_mm(); |
812 | if (!mm) | 809 | if (!mm) |
813 | goto fail_nomem; | 810 | goto fail_nomem; |
@@ -1035,6 +1032,11 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) | |||
1035 | sig->nr_threads = 1; | 1032 | sig->nr_threads = 1; |
1036 | atomic_set(&sig->live, 1); | 1033 | atomic_set(&sig->live, 1); |
1037 | atomic_set(&sig->sigcnt, 1); | 1034 | atomic_set(&sig->sigcnt, 1); |
1035 | |||
1036 | /* list_add(thread_node, thread_head) without INIT_LIST_HEAD() */ | ||
1037 | sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node); | ||
1038 | tsk->thread_node = (struct list_head)LIST_HEAD_INIT(sig->thread_head); | ||
1039 | |||
1038 | init_waitqueue_head(&sig->wait_chldexit); | 1040 | init_waitqueue_head(&sig->wait_chldexit); |
1039 | sig->curr_target = tsk; | 1041 | sig->curr_target = tsk; |
1040 | init_sigpending(&sig->shared_pending); | 1042 | init_sigpending(&sig->shared_pending); |
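The "list_add() without INIT_LIST_HEAD()" comment above simply builds the two-element circular list by hand. A small stand-alone illustration of why the two assignments are equivalent to INIT_LIST_HEAD() followed by list_add(), using a local stand-in for struct list_head:

#include <stdio.h>

struct list_head {
	struct list_head *next, *prev;
};

int main(void)
{
	struct list_head head, node;

	/* mirrors: sig->thread_head = LIST_HEAD_INIT(tsk->thread_node) */
	head = (struct list_head){ &node, &node };
	/* mirrors: tsk->thread_node = LIST_HEAD_INIT(sig->thread_head) */
	node = (struct list_head){ &head, &head };

	printf("head<->node linked: %d\n",
	       head.next == &node && node.next == &head &&
	       head.prev == &node && node.prev == &head);
	return 0;
}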
@@ -1087,8 +1089,10 @@ static void rt_mutex_init_task(struct task_struct *p) | |||
1087 | { | 1089 | { |
1088 | raw_spin_lock_init(&p->pi_lock); | 1090 | raw_spin_lock_init(&p->pi_lock); |
1089 | #ifdef CONFIG_RT_MUTEXES | 1091 | #ifdef CONFIG_RT_MUTEXES |
1090 | plist_head_init(&p->pi_waiters); | 1092 | p->pi_waiters = RB_ROOT; |
1093 | p->pi_waiters_leftmost = NULL; | ||
1091 | p->pi_blocked_on = NULL; | 1094 | p->pi_blocked_on = NULL; |
1095 | p->pi_top_task = NULL; | ||
1092 | #endif | 1096 | #endif |
1093 | } | 1097 | } |
1094 | 1098 | ||
@@ -1172,7 +1176,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1172 | * do not allow it to share a thread group or signal handlers or | 1176 | * do not allow it to share a thread group or signal handlers or |
1173 | * parent with the forking task. | 1177 | * parent with the forking task. |
1174 | */ | 1178 | */ |
1175 | if (clone_flags & (CLONE_SIGHAND | CLONE_PARENT)) { | 1179 | if (clone_flags & CLONE_SIGHAND) { |
1176 | if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) || | 1180 | if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) || |
1177 | (task_active_pid_ns(current) != | 1181 | (task_active_pid_ns(current) != |
1178 | current->nsproxy->pid_ns_for_children)) | 1182 | current->nsproxy->pid_ns_for_children)) |
@@ -1222,7 +1226,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1222 | if (!try_module_get(task_thread_info(p)->exec_domain->module)) | 1226 | if (!try_module_get(task_thread_info(p)->exec_domain->module)) |
1223 | goto bad_fork_cleanup_count; | 1227 | goto bad_fork_cleanup_count; |
1224 | 1228 | ||
1225 | p->did_exec = 0; | ||
1226 | delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ | 1229 | delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ |
1227 | copy_flags(clone_flags, p); | 1230 | copy_flags(clone_flags, p); |
1228 | INIT_LIST_HEAD(&p->children); | 1231 | INIT_LIST_HEAD(&p->children); |
@@ -1311,7 +1314,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1311 | #endif | 1314 | #endif |
1312 | 1315 | ||
1313 | /* Perform scheduler related setup. Assign this task to a CPU. */ | 1316 | /* Perform scheduler related setup. Assign this task to a CPU. */ |
1314 | sched_fork(clone_flags, p); | 1317 | retval = sched_fork(clone_flags, p); |
1318 | if (retval) | ||
1319 | goto bad_fork_cleanup_policy; | ||
1315 | 1320 | ||
1316 | retval = perf_event_init_task(p); | 1321 | retval = perf_event_init_task(p); |
1317 | if (retval) | 1322 | if (retval) |
@@ -1403,13 +1408,11 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1403 | p->tgid = p->pid; | 1408 | p->tgid = p->pid; |
1404 | } | 1409 | } |
1405 | 1410 | ||
1406 | p->pdeath_signal = 0; | ||
1407 | p->exit_state = 0; | ||
1408 | |||
1409 | p->nr_dirtied = 0; | 1411 | p->nr_dirtied = 0; |
1410 | p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10); | 1412 | p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10); |
1411 | p->dirty_paused_when = 0; | 1413 | p->dirty_paused_when = 0; |
1412 | 1414 | ||
1415 | p->pdeath_signal = 0; | ||
1413 | INIT_LIST_HEAD(&p->thread_group); | 1416 | INIT_LIST_HEAD(&p->thread_group); |
1414 | p->task_works = NULL; | 1417 | p->task_works = NULL; |
1415 | 1418 | ||
@@ -1472,6 +1475,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1472 | atomic_inc(¤t->signal->sigcnt); | 1475 | atomic_inc(¤t->signal->sigcnt); |
1473 | list_add_tail_rcu(&p->thread_group, | 1476 | list_add_tail_rcu(&p->thread_group, |
1474 | &p->group_leader->thread_group); | 1477 | &p->group_leader->thread_group); |
1478 | list_add_tail_rcu(&p->thread_node, | ||
1479 | &p->signal->thread_head); | ||
1475 | } | 1480 | } |
1476 | attach_pid(p, PIDTYPE_PID); | 1481 | attach_pid(p, PIDTYPE_PID); |
1477 | nr_threads++; | 1482 | nr_threads++; |
@@ -1645,7 +1650,7 @@ SYSCALL_DEFINE0(fork) | |||
1645 | return do_fork(SIGCHLD, 0, 0, NULL, NULL); | 1650 | return do_fork(SIGCHLD, 0, 0, NULL, NULL); |
1646 | #else | 1651 | #else |
1647 | /* can not support in nommu mode */ | 1652 | /* can not support in nommu mode */ |
1648 | return(-EINVAL); | 1653 | return -EINVAL; |
1649 | #endif | 1654 | #endif |
1650 | } | 1655 | } |
1651 | #endif | 1656 | #endif |
@@ -1653,7 +1658,7 @@ SYSCALL_DEFINE0(fork) | |||
1653 | #ifdef __ARCH_WANT_SYS_VFORK | 1658 | #ifdef __ARCH_WANT_SYS_VFORK |
1654 | SYSCALL_DEFINE0(vfork) | 1659 | SYSCALL_DEFINE0(vfork) |
1655 | { | 1660 | { |
1656 | return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0, | 1661 | return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0, |
1657 | 0, NULL, NULL); | 1662 | 0, NULL, NULL); |
1658 | } | 1663 | } |
1659 | #endif | 1664 | #endif |
diff --git a/kernel/futex.c b/kernel/futex.c index f6ff0191ecf7..08ec814ad9d2 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
@@ -63,14 +63,101 @@ | |||
63 | #include <linux/sched/rt.h> | 63 | #include <linux/sched/rt.h> |
64 | #include <linux/hugetlb.h> | 64 | #include <linux/hugetlb.h> |
65 | #include <linux/freezer.h> | 65 | #include <linux/freezer.h> |
66 | #include <linux/bootmem.h> | ||
66 | 67 | ||
67 | #include <asm/futex.h> | 68 | #include <asm/futex.h> |
68 | 69 | ||
69 | #include "locking/rtmutex_common.h" | 70 | #include "locking/rtmutex_common.h" |
70 | 71 | ||
71 | int __read_mostly futex_cmpxchg_enabled; | 72 | /* |
73 | * Basic futex operation and ordering guarantees: | ||
74 | * | ||
75 | * The waiter reads the futex value in user space and calls | ||
76 | * futex_wait(). This function computes the hash bucket and acquires | ||
77 | * the hash bucket lock. After that it reads the futex user space value | ||
78 | * again and verifies that the data has not changed. If it has not changed | ||
79 | * it enqueues itself into the hash bucket, releases the hash bucket lock | ||
80 | * and schedules. | ||
81 | * | ||
82 | * The waker side modifies the user space value of the futex and calls | ||
83 | * futex_wake(). This function computes the hash bucket and acquires the | ||
84 | * hash bucket lock. Then it looks for waiters on that futex in the hash | ||
85 | * bucket and wakes them. | ||
86 | * | ||
87 | * In futex wake-up scenarios where no tasks are blocked on a futex, taking | ||
88 | * the hb spinlock can be avoided and futex_wake() can simply return. In order for this | ||
89 | * optimization to work, ordering guarantees must exist so that the waiter | ||
90 | * being added to the list is acknowledged when the list is concurrently being | ||
91 | * checked by the waker, avoiding scenarios like the following: | ||
92 | * | ||
93 | * CPU 0 CPU 1 | ||
94 | * val = *futex; | ||
95 | * sys_futex(WAIT, futex, val); | ||
96 | * futex_wait(futex, val); | ||
97 | * uval = *futex; | ||
98 | * *futex = newval; | ||
99 | * sys_futex(WAKE, futex); | ||
100 | * futex_wake(futex); | ||
101 | * if (queue_empty()) | ||
102 | * return; | ||
103 | * if (uval == val) | ||
104 | * lock(hash_bucket(futex)); | ||
105 | * queue(); | ||
106 | * unlock(hash_bucket(futex)); | ||
107 | * schedule(); | ||
108 | * | ||
109 | * This would cause the waiter on CPU 0 to wait forever because it | ||
110 | * missed the transition of the user space value from val to newval | ||
111 | * and the waker did not find the waiter in the hash bucket queue. | ||
112 | * | ||
113 | * The correct serialization ensures that a waiter either observes | ||
114 | * the changed user space value before blocking or is woken by a | ||
115 | * concurrent waker: | ||
116 | * | ||
117 | * CPU 0 CPU 1 | ||
118 | * val = *futex; | ||
119 | * sys_futex(WAIT, futex, val); | ||
120 | * futex_wait(futex, val); | ||
121 | * | ||
122 | * waiters++; | ||
123 | * mb(); (A) <-- paired with -. | ||
124 | * | | ||
125 | * lock(hash_bucket(futex)); | | ||
126 | * | | ||
127 | * uval = *futex; | | ||
128 | * | *futex = newval; | ||
129 | * | sys_futex(WAKE, futex); | ||
130 | * | futex_wake(futex); | ||
131 | * | | ||
132 | * `-------> mb(); (B) | ||
133 | * if (uval == val) | ||
134 | * queue(); | ||
135 | * unlock(hash_bucket(futex)); | ||
136 | * schedule(); if (waiters) | ||
137 | * lock(hash_bucket(futex)); | ||
138 | * wake_waiters(futex); | ||
139 | * unlock(hash_bucket(futex)); | ||
140 | * | ||
141 | * Where (A) orders the waiters increment and the futex value read -- this | ||
142 | * is guaranteed by the head counter in the hb spinlock; and where (B) | ||
143 | * orders the write to futex and the waiters read -- this is done by the | ||
144 | * barriers in get_futex_key_refs(), through either ihold or atomic_inc, | ||
145 | * depending on the futex type. | ||
146 | * | ||
147 | * This yields the following case (where X:=waiters, Y:=futex): | ||
148 | * | ||
149 | * X = Y = 0 | ||
150 | * | ||
151 | * w[X]=1 w[Y]=1 | ||
152 | * MB MB | ||
153 | * r[Y]=y r[X]=x | ||
154 | * | ||
155 | * Which guarantees that x==0 && y==0 is impossible; which translates back into | ||
156 | * the guarantee that we cannot both miss the futex variable change and the | ||
157 | * enqueue. | ||
158 | */ | ||
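For the user-space side of the protocol this comment reasons about, a minimal hedged example: the waiter passes the value it last saw to FUTEX_WAIT so the kernel can re-check it under the hash-bucket lock, and the waker updates the futex word before calling FUTEX_WAKE (compile with -pthread; error handling omitted):

#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <pthread.h>
#include <stdio.h>

static int futex_word;				/* the 32-bit futex variable */

static long sys_futex(int *uaddr, int op, int val)
{
	return syscall(__NR_futex, uaddr, op, val, NULL, NULL, 0);
}

static void *waiter(void *arg)
{
	int val = __atomic_load_n(&futex_word, __ATOMIC_SEQ_CST);

	/* blocks only if futex_word still equals val when the kernel
	 * re-reads it under the hash-bucket lock */
	sys_futex(&futex_word, FUTEX_WAIT_PRIVATE, val);
	printf("woken, futex_word=%d\n", futex_word);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, waiter, NULL);
	sleep(1);				/* crude: give the waiter time to block */

	__atomic_store_n(&futex_word, 1, __ATOMIC_SEQ_CST);	/* *futex = newval */
	sys_futex(&futex_word, FUTEX_WAKE_PRIVATE, 1);		/* wake one waiter */

	pthread_join(t, NULL);
	return 0;
}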
72 | 159 | ||
73 | #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) | 160 | int __read_mostly futex_cmpxchg_enabled; |
74 | 161 | ||
75 | /* | 162 | /* |
76 | * Futex flags used to encode options to functions and preserve them across | 163 | * Futex flags used to encode options to functions and preserve them across |
@@ -147,11 +234,59 @@ static const struct futex_q futex_q_init = { | |||
147 | * waiting on a futex. | 234 | * waiting on a futex. |
148 | */ | 235 | */ |
149 | struct futex_hash_bucket { | 236 | struct futex_hash_bucket { |
237 | atomic_t waiters; | ||
150 | spinlock_t lock; | 238 | spinlock_t lock; |
151 | struct plist_head chain; | 239 | struct plist_head chain; |
152 | }; | 240 | } ____cacheline_aligned_in_smp; |
241 | |||
242 | static unsigned long __read_mostly futex_hashsize; | ||
243 | |||
244 | static struct futex_hash_bucket *futex_queues; | ||
245 | |||
246 | static inline void futex_get_mm(union futex_key *key) | ||
247 | { | ||
248 | atomic_inc(&key->private.mm->mm_count); | ||
249 | /* | ||
250 | * Ensure futex_get_mm() implies a full barrier such that | ||
251 | * get_futex_key() implies a full barrier. This is relied upon | ||
252 | * as full barrier (B), see the ordering comment above. | ||
253 | */ | ||
254 | smp_mb__after_atomic_inc(); | ||
255 | } | ||
256 | |||
257 | /* | ||
258 | * Reflects a new waiter being added to the waitqueue. | ||
259 | */ | ||
260 | static inline void hb_waiters_inc(struct futex_hash_bucket *hb) | ||
261 | { | ||
262 | #ifdef CONFIG_SMP | ||
263 | atomic_inc(&hb->waiters); | ||
264 | /* | ||
265 | * Full barrier (A), see the ordering comment above. | ||
266 | */ | ||
267 | smp_mb__after_atomic_inc(); | ||
268 | #endif | ||
269 | } | ||
153 | 270 | ||
154 | static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS]; | 271 | /* |
272 | * Reflects a waiter being removed from the waitqueue by wakeup | ||
273 | * paths. | ||
274 | */ | ||
275 | static inline void hb_waiters_dec(struct futex_hash_bucket *hb) | ||
276 | { | ||
277 | #ifdef CONFIG_SMP | ||
278 | atomic_dec(&hb->waiters); | ||
279 | #endif | ||
280 | } | ||
281 | |||
282 | static inline int hb_waiters_pending(struct futex_hash_bucket *hb) | ||
283 | { | ||
284 | #ifdef CONFIG_SMP | ||
285 | return atomic_read(&hb->waiters); | ||
286 | #else | ||
287 | return 1; | ||
288 | #endif | ||
289 | } | ||
155 | 290 | ||
156 | /* | 291 | /* |
157 | * We hash on the keys returned from get_futex_key (see below). | 292 | * We hash on the keys returned from get_futex_key (see below). |
@@ -161,7 +296,7 @@ static struct futex_hash_bucket *hash_futex(union futex_key *key) | |||
161 | u32 hash = jhash2((u32*)&key->both.word, | 296 | u32 hash = jhash2((u32*)&key->both.word, |
162 | (sizeof(key->both.word)+sizeof(key->both.ptr))/4, | 297 | (sizeof(key->both.word)+sizeof(key->both.ptr))/4, |
163 | key->both.offset); | 298 | key->both.offset); |
164 | return &futex_queues[hash & ((1 << FUTEX_HASHBITS)-1)]; | 299 | return &futex_queues[hash & (futex_hashsize - 1)]; |
165 | } | 300 | } |
166 | 301 | ||
167 | /* | 302 | /* |
@@ -187,10 +322,10 @@ static void get_futex_key_refs(union futex_key *key) | |||
187 | 322 | ||
188 | switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { | 323 | switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { |
189 | case FUT_OFF_INODE: | 324 | case FUT_OFF_INODE: |
190 | ihold(key->shared.inode); | 325 | ihold(key->shared.inode); /* implies MB (B) */ |
191 | break; | 326 | break; |
192 | case FUT_OFF_MMSHARED: | 327 | case FUT_OFF_MMSHARED: |
193 | atomic_inc(&key->private.mm->mm_count); | 328 | futex_get_mm(key); /* implies MB (B) */ |
194 | break; | 329 | break; |
195 | } | 330 | } |
196 | } | 331 | } |
@@ -264,7 +399,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) | |||
264 | if (!fshared) { | 399 | if (!fshared) { |
265 | key->private.mm = mm; | 400 | key->private.mm = mm; |
266 | key->private.address = address; | 401 | key->private.address = address; |
267 | get_futex_key_refs(key); | 402 | get_futex_key_refs(key); /* implies MB (B) */ |
268 | return 0; | 403 | return 0; |
269 | } | 404 | } |
270 | 405 | ||
@@ -371,7 +506,7 @@ again: | |||
371 | key->shared.pgoff = basepage_index(page); | 506 | key->shared.pgoff = basepage_index(page); |
372 | } | 507 | } |
373 | 508 | ||
374 | get_futex_key_refs(key); | 509 | get_futex_key_refs(key); /* implies MB (B) */ |
375 | 510 | ||
376 | out: | 511 | out: |
377 | unlock_page(page_head); | 512 | unlock_page(page_head); |
@@ -598,13 +733,10 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, | |||
598 | { | 733 | { |
599 | struct futex_pi_state *pi_state = NULL; | 734 | struct futex_pi_state *pi_state = NULL; |
600 | struct futex_q *this, *next; | 735 | struct futex_q *this, *next; |
601 | struct plist_head *head; | ||
602 | struct task_struct *p; | 736 | struct task_struct *p; |
603 | pid_t pid = uval & FUTEX_TID_MASK; | 737 | pid_t pid = uval & FUTEX_TID_MASK; |
604 | 738 | ||
605 | head = &hb->chain; | 739 | plist_for_each_entry_safe(this, next, &hb->chain, list) { |
606 | |||
607 | plist_for_each_entry_safe(this, next, head, list) { | ||
608 | if (match_futex(&this->key, key)) { | 740 | if (match_futex(&this->key, key)) { |
609 | /* | 741 | /* |
610 | * Another waiter already exists - bump up | 742 | * Another waiter already exists - bump up |
@@ -838,6 +970,7 @@ static void __unqueue_futex(struct futex_q *q) | |||
838 | 970 | ||
839 | hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock); | 971 | hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock); |
840 | plist_del(&q->list, &hb->chain); | 972 | plist_del(&q->list, &hb->chain); |
973 | hb_waiters_dec(hb); | ||
841 | } | 974 | } |
842 | 975 | ||
843 | /* | 976 | /* |
@@ -986,7 +1119,6 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) | |||
986 | { | 1119 | { |
987 | struct futex_hash_bucket *hb; | 1120 | struct futex_hash_bucket *hb; |
988 | struct futex_q *this, *next; | 1121 | struct futex_q *this, *next; |
989 | struct plist_head *head; | ||
990 | union futex_key key = FUTEX_KEY_INIT; | 1122 | union futex_key key = FUTEX_KEY_INIT; |
991 | int ret; | 1123 | int ret; |
992 | 1124 | ||
@@ -998,10 +1130,14 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) | |||
998 | goto out; | 1130 | goto out; |
999 | 1131 | ||
1000 | hb = hash_futex(&key); | 1132 | hb = hash_futex(&key); |
1133 | |||
1134 | /* Make sure we really have tasks to wakeup */ | ||
1135 | if (!hb_waiters_pending(hb)) | ||
1136 | goto out_put_key; | ||
1137 | |||
1001 | spin_lock(&hb->lock); | 1138 | spin_lock(&hb->lock); |
1002 | head = &hb->chain; | ||
1003 | 1139 | ||
1004 | plist_for_each_entry_safe(this, next, head, list) { | 1140 | plist_for_each_entry_safe(this, next, &hb->chain, list) { |
1005 | if (match_futex (&this->key, &key)) { | 1141 | if (match_futex (&this->key, &key)) { |
1006 | if (this->pi_state || this->rt_waiter) { | 1142 | if (this->pi_state || this->rt_waiter) { |
1007 | ret = -EINVAL; | 1143 | ret = -EINVAL; |
@@ -1019,6 +1155,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) | |||
1019 | } | 1155 | } |
1020 | 1156 | ||
1021 | spin_unlock(&hb->lock); | 1157 | spin_unlock(&hb->lock); |
1158 | out_put_key: | ||
1022 | put_futex_key(&key); | 1159 | put_futex_key(&key); |
1023 | out: | 1160 | out: |
1024 | return ret; | 1161 | return ret; |
@@ -1034,7 +1171,6 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, | |||
1034 | { | 1171 | { |
1035 | union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; | 1172 | union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; |
1036 | struct futex_hash_bucket *hb1, *hb2; | 1173 | struct futex_hash_bucket *hb1, *hb2; |
1037 | struct plist_head *head; | ||
1038 | struct futex_q *this, *next; | 1174 | struct futex_q *this, *next; |
1039 | int ret, op_ret; | 1175 | int ret, op_ret; |
1040 | 1176 | ||
@@ -1082,9 +1218,7 @@ retry_private: | |||
1082 | goto retry; | 1218 | goto retry; |
1083 | } | 1219 | } |
1084 | 1220 | ||
1085 | head = &hb1->chain; | 1221 | plist_for_each_entry_safe(this, next, &hb1->chain, list) { |
1086 | |||
1087 | plist_for_each_entry_safe(this, next, head, list) { | ||
1088 | if (match_futex (&this->key, &key1)) { | 1222 | if (match_futex (&this->key, &key1)) { |
1089 | if (this->pi_state || this->rt_waiter) { | 1223 | if (this->pi_state || this->rt_waiter) { |
1090 | ret = -EINVAL; | 1224 | ret = -EINVAL; |
@@ -1097,10 +1231,8 @@ retry_private: | |||
1097 | } | 1231 | } |
1098 | 1232 | ||
1099 | if (op_ret > 0) { | 1233 | if (op_ret > 0) { |
1100 | head = &hb2->chain; | ||
1101 | |||
1102 | op_ret = 0; | 1234 | op_ret = 0; |
1103 | plist_for_each_entry_safe(this, next, head, list) { | 1235 | plist_for_each_entry_safe(this, next, &hb2->chain, list) { |
1104 | if (match_futex (&this->key, &key2)) { | 1236 | if (match_futex (&this->key, &key2)) { |
1105 | if (this->pi_state || this->rt_waiter) { | 1237 | if (this->pi_state || this->rt_waiter) { |
1106 | ret = -EINVAL; | 1238 | ret = -EINVAL; |
@@ -1142,7 +1274,9 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, | |||
1142 | */ | 1274 | */ |
1143 | if (likely(&hb1->chain != &hb2->chain)) { | 1275 | if (likely(&hb1->chain != &hb2->chain)) { |
1144 | plist_del(&q->list, &hb1->chain); | 1276 | plist_del(&q->list, &hb1->chain); |
1277 | hb_waiters_dec(hb1); | ||
1145 | plist_add(&q->list, &hb2->chain); | 1278 | plist_add(&q->list, &hb2->chain); |
1279 | hb_waiters_inc(hb2); | ||
1146 | q->lock_ptr = &hb2->lock; | 1280 | q->lock_ptr = &hb2->lock; |
1147 | } | 1281 | } |
1148 | get_futex_key_refs(key2); | 1282 | get_futex_key_refs(key2); |
@@ -1270,7 +1404,6 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, | |||
1270 | int drop_count = 0, task_count = 0, ret; | 1404 | int drop_count = 0, task_count = 0, ret; |
1271 | struct futex_pi_state *pi_state = NULL; | 1405 | struct futex_pi_state *pi_state = NULL; |
1272 | struct futex_hash_bucket *hb1, *hb2; | 1406 | struct futex_hash_bucket *hb1, *hb2; |
1273 | struct plist_head *head1; | ||
1274 | struct futex_q *this, *next; | 1407 | struct futex_q *this, *next; |
1275 | u32 curval2; | 1408 | u32 curval2; |
1276 | 1409 | ||
@@ -1393,8 +1526,7 @@ retry_private: | |||
1393 | } | 1526 | } |
1394 | } | 1527 | } |
1395 | 1528 | ||
1396 | head1 = &hb1->chain; | 1529 | plist_for_each_entry_safe(this, next, &hb1->chain, list) { |
1397 | plist_for_each_entry_safe(this, next, head1, list) { | ||
1398 | if (task_count - nr_wake >= nr_requeue) | 1530 | if (task_count - nr_wake >= nr_requeue) |
1399 | break; | 1531 | break; |
1400 | 1532 | ||
@@ -1487,17 +1619,29 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) | |||
1487 | struct futex_hash_bucket *hb; | 1619 | struct futex_hash_bucket *hb; |
1488 | 1620 | ||
1489 | hb = hash_futex(&q->key); | 1621 | hb = hash_futex(&q->key); |
1622 | |||
1623 | /* | ||
1624 | * Increment the counter before taking the lock so that | ||
1625 | * a potential waker won't miss a to-be-slept task that is | ||
1626 | * waiting for the spinlock. This is safe as all queue_lock() | ||
1627 | * users end up calling queue_me(). Similarly, for housekeeping, | ||
1628 | * decrement the counter at queue_unlock() when some error has | ||
1629 | * occurred and we don't end up adding the task to the list. | ||
1630 | */ | ||
1631 | hb_waiters_inc(hb); | ||
1632 | |||
1490 | q->lock_ptr = &hb->lock; | 1633 | q->lock_ptr = &hb->lock; |
1491 | 1634 | ||
1492 | spin_lock(&hb->lock); | 1635 | spin_lock(&hb->lock); /* implies MB (A) */ |
1493 | return hb; | 1636 | return hb; |
1494 | } | 1637 | } |
1495 | 1638 | ||
1496 | static inline void | 1639 | static inline void |
1497 | queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) | 1640 | queue_unlock(struct futex_hash_bucket *hb) |
1498 | __releases(&hb->lock) | 1641 | __releases(&hb->lock) |
1499 | { | 1642 | { |
1500 | spin_unlock(&hb->lock); | 1643 | spin_unlock(&hb->lock); |
1644 | hb_waiters_dec(hb); | ||
1501 | } | 1645 | } |
1502 | 1646 | ||
1503 | /** | 1647 | /** |
@@ -1867,7 +2011,7 @@ retry_private: | |||
1867 | ret = get_futex_value_locked(&uval, uaddr); | 2011 | ret = get_futex_value_locked(&uval, uaddr); |
1868 | 2012 | ||
1869 | if (ret) { | 2013 | if (ret) { |
1870 | queue_unlock(q, *hb); | 2014 | queue_unlock(*hb); |
1871 | 2015 | ||
1872 | ret = get_user(uval, uaddr); | 2016 | ret = get_user(uval, uaddr); |
1873 | if (ret) | 2017 | if (ret) |
@@ -1881,7 +2025,7 @@ retry_private: | |||
1881 | } | 2025 | } |
1882 | 2026 | ||
1883 | if (uval != val) { | 2027 | if (uval != val) { |
1884 | queue_unlock(q, *hb); | 2028 | queue_unlock(*hb); |
1885 | ret = -EWOULDBLOCK; | 2029 | ret = -EWOULDBLOCK; |
1886 | } | 2030 | } |
1887 | 2031 | ||
@@ -2029,7 +2173,7 @@ retry_private: | |||
2029 | * Task is exiting and we just wait for the | 2173 | * Task is exiting and we just wait for the |
2030 | * exit to complete. | 2174 | * exit to complete. |
2031 | */ | 2175 | */ |
2032 | queue_unlock(&q, hb); | 2176 | queue_unlock(hb); |
2033 | put_futex_key(&q.key); | 2177 | put_futex_key(&q.key); |
2034 | cond_resched(); | 2178 | cond_resched(); |
2035 | goto retry; | 2179 | goto retry; |
@@ -2081,7 +2225,7 @@ retry_private: | |||
2081 | goto out_put_key; | 2225 | goto out_put_key; |
2082 | 2226 | ||
2083 | out_unlock_put_key: | 2227 | out_unlock_put_key: |
2084 | queue_unlock(&q, hb); | 2228 | queue_unlock(hb); |
2085 | 2229 | ||
2086 | out_put_key: | 2230 | out_put_key: |
2087 | put_futex_key(&q.key); | 2231 | put_futex_key(&q.key); |
@@ -2091,7 +2235,7 @@ out: | |||
2091 | return ret != -EINTR ? ret : -ERESTARTNOINTR; | 2235 | return ret != -EINTR ? ret : -ERESTARTNOINTR; |
2092 | 2236 | ||
2093 | uaddr_faulted: | 2237 | uaddr_faulted: |
2094 | queue_unlock(&q, hb); | 2238 | queue_unlock(hb); |
2095 | 2239 | ||
2096 | ret = fault_in_user_writeable(uaddr); | 2240 | ret = fault_in_user_writeable(uaddr); |
2097 | if (ret) | 2241 | if (ret) |
@@ -2113,7 +2257,6 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) | |||
2113 | { | 2257 | { |
2114 | struct futex_hash_bucket *hb; | 2258 | struct futex_hash_bucket *hb; |
2115 | struct futex_q *this, *next; | 2259 | struct futex_q *this, *next; |
2116 | struct plist_head *head; | ||
2117 | union futex_key key = FUTEX_KEY_INIT; | 2260 | union futex_key key = FUTEX_KEY_INIT; |
2118 | u32 uval, vpid = task_pid_vnr(current); | 2261 | u32 uval, vpid = task_pid_vnr(current); |
2119 | int ret; | 2262 | int ret; |
@@ -2153,9 +2296,7 @@ retry: | |||
2153 | * Ok, other tasks may need to be woken up - check waiters | 2296 | * Ok, other tasks may need to be woken up - check waiters |
2154 | * and do the wakeup if necessary: | 2297 | * and do the wakeup if necessary: |
2155 | */ | 2298 | */ |
2156 | head = &hb->chain; | 2299 | plist_for_each_entry_safe(this, next, &hb->chain, list) { |
2157 | |||
2158 | plist_for_each_entry_safe(this, next, head, list) { | ||
2159 | if (!match_futex (&this->key, &key)) | 2300 | if (!match_futex (&this->key, &key)) |
2160 | continue; | 2301 | continue; |
2161 | ret = wake_futex_pi(uaddr, uval, this); | 2302 | ret = wake_futex_pi(uaddr, uval, this); |
@@ -2232,6 +2373,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, | |||
2232 | * Unqueue the futex_q and determine which it was. | 2373 | * Unqueue the futex_q and determine which it was. |
2233 | */ | 2374 | */ |
2234 | plist_del(&q->list, &hb->chain); | 2375 | plist_del(&q->list, &hb->chain); |
2376 | hb_waiters_dec(hb); | ||
2235 | 2377 | ||
2236 | /* Handle spurious wakeups gracefully */ | 2378 | /* Handle spurious wakeups gracefully */ |
2237 | ret = -EWOULDBLOCK; | 2379 | ret = -EWOULDBLOCK; |
@@ -2316,6 +2458,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, | |||
2316 | * code while we sleep on uaddr. | 2458 | * code while we sleep on uaddr. |
2317 | */ | 2459 | */ |
2318 | debug_rt_mutex_init_waiter(&rt_waiter); | 2460 | debug_rt_mutex_init_waiter(&rt_waiter); |
2461 | RB_CLEAR_NODE(&rt_waiter.pi_tree_entry); | ||
2462 | RB_CLEAR_NODE(&rt_waiter.tree_entry); | ||
2319 | rt_waiter.task = NULL; | 2463 | rt_waiter.task = NULL; |
2320 | 2464 | ||
2321 | ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE); | 2465 | ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE); |
@@ -2734,8 +2878,21 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, | |||
2734 | static int __init futex_init(void) | 2878 | static int __init futex_init(void) |
2735 | { | 2879 | { |
2736 | u32 curval; | 2880 | u32 curval; |
2737 | int i; | 2881 | unsigned int futex_shift; |
2882 | unsigned long i; | ||
2883 | |||
2884 | #if CONFIG_BASE_SMALL | ||
2885 | futex_hashsize = 16; | ||
2886 | #else | ||
2887 | futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus()); | ||
2888 | #endif | ||
2738 | 2889 | ||
2890 | futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues), | ||
2891 | futex_hashsize, 0, | ||
2892 | futex_hashsize < 256 ? HASH_SMALL : 0, | ||
2893 | &futex_shift, NULL, | ||
2894 | futex_hashsize, futex_hashsize); | ||
2895 | futex_hashsize = 1UL << futex_shift; | ||
2739 | /* | 2896 | /* |
2740 | * This will fail and we want it. Some arch implementations do | 2897 | * This will fail and we want it. Some arch implementations do |
2741 | * runtime detection of the futex_atomic_cmpxchg_inatomic() | 2898 | * runtime detection of the futex_atomic_cmpxchg_inatomic() |
@@ -2749,7 +2906,8 @@ static int __init futex_init(void) | |||
2749 | if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT) | 2906 | if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT) |
2750 | futex_cmpxchg_enabled = 1; | 2907 | futex_cmpxchg_enabled = 1; |
2751 | 2908 | ||
2752 | for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { | 2909 | for (i = 0; i < futex_hashsize; i++) { |
2910 | atomic_set(&futex_queues[i].waiters, 0); | ||
2753 | plist_head_init(&futex_queues[i].chain); | 2911 | plist_head_init(&futex_queues[i].chain); |
2754 | spin_lock_init(&futex_queues[i].lock); | 2912 | spin_lock_init(&futex_queues[i].lock); |
2755 | } | 2913 | } |
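The sizing logic above scales the hash table with the number of possible CPUs and rounds it to a power of two so hash_futex() can mask instead of dividing; alloc_large_system_hash() may round further and reports the final size through futex_shift. A small stand-alone illustration of the arithmetic, where the 16-CPU figure is just an example:

#include <stdio.h>

static unsigned long roundup_pow_of_two(unsigned long n)
{
	unsigned long p = 1;

	while (p < n)
		p <<= 1;
	return p;
}

int main(void)
{
	unsigned long cpus = 16;				/* example machine */
	unsigned long buckets = roundup_pow_of_two(256 * cpus);	/* 4096 */
	unsigned long hash = 0x12345678;			/* made-up jhash2() output */

	printf("%lu buckets, hash 0x%lx -> bucket %lu\n",
	       buckets, hash, hash & (buckets - 1));
	return 0;
}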
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 383319bae3f7..09094361dce5 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c | |||
@@ -46,6 +46,7 @@ | |||
46 | #include <linux/sched.h> | 46 | #include <linux/sched.h> |
47 | #include <linux/sched/sysctl.h> | 47 | #include <linux/sched/sysctl.h> |
48 | #include <linux/sched/rt.h> | 48 | #include <linux/sched/rt.h> |
49 | #include <linux/sched/deadline.h> | ||
49 | #include <linux/timer.h> | 50 | #include <linux/timer.h> |
50 | #include <linux/freezer.h> | 51 | #include <linux/freezer.h> |
51 | 52 | ||
@@ -1610,7 +1611,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, | |||
1610 | unsigned long slack; | 1611 | unsigned long slack; |
1611 | 1612 | ||
1612 | slack = current->timer_slack_ns; | 1613 | slack = current->timer_slack_ns; |
1613 | if (rt_task(current)) | 1614 | if (dl_task(current) || rt_task(current)) |
1614 | slack = 0; | 1615 | slack = 0; |
1615 | 1616 | ||
1616 | hrtimer_init_on_stack(&t.timer, clockid, mode); | 1617 | hrtimer_init_on_stack(&t.timer, clockid, mode); |
diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 9328b80eaf14..0b9c169d577f 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c | |||
@@ -37,7 +37,7 @@ int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT; | |||
37 | */ | 37 | */ |
38 | unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_TASK_TIMEOUT; | 38 | unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_TASK_TIMEOUT; |
39 | 39 | ||
40 | unsigned long __read_mostly sysctl_hung_task_warnings = 10; | 40 | int __read_mostly sysctl_hung_task_warnings = 10; |
41 | 41 | ||
42 | static int __read_mostly did_panic; | 42 | static int __read_mostly did_panic; |
43 | 43 | ||
@@ -98,7 +98,9 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) | |||
98 | 98 | ||
99 | if (!sysctl_hung_task_warnings) | 99 | if (!sysctl_hung_task_warnings) |
100 | return; | 100 | return; |
101 | sysctl_hung_task_warnings--; | 101 | |
102 | if (sysctl_hung_task_warnings > 0) | ||
103 | sysctl_hung_task_warnings--; | ||
102 | 104 | ||
103 | /* | 105 | /* |
104 | * Ok, the task did not get scheduled for more than 2 minutes, | 106 | * Ok, the task did not get scheduled for more than 2 minutes, |
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 4a1fef09f658..07cbdfea9ae2 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig | |||
@@ -40,6 +40,7 @@ config IRQ_EDGE_EOI_HANDLER | |||
40 | # Generic configurable interrupt chip implementation | 40 | # Generic configurable interrupt chip implementation |
41 | config GENERIC_IRQ_CHIP | 41 | config GENERIC_IRQ_CHIP |
42 | bool | 42 | bool |
43 | select IRQ_DOMAIN | ||
43 | 44 | ||
44 | # Generic irq_domain hw <--> linux irq number translation | 45 | # Generic irq_domain hw <--> linux irq number translation |
45 | config IRQ_DOMAIN | 46 | config IRQ_DOMAIN |
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c index bd8e788d71e0..1ef0606797c9 100644 --- a/kernel/irq/devres.c +++ b/kernel/irq/devres.c | |||
@@ -73,6 +73,51 @@ int devm_request_threaded_irq(struct device *dev, unsigned int irq, | |||
73 | EXPORT_SYMBOL(devm_request_threaded_irq); | 73 | EXPORT_SYMBOL(devm_request_threaded_irq); |
74 | 74 | ||
75 | /** | 75 | /** |
76 | * devm_request_any_context_irq - allocate an interrupt line for a managed device | ||
77 | * @dev: device to request interrupt for | ||
78 | * @irq: Interrupt line to allocate | ||
79 | * @handler: Function to be called when the IRQ occurs | ||
80 | * @thread_fn: function to be called in a threaded interrupt context. NULL | ||
81 | * for devices which handle everything in @handler | ||
82 | * @irqflags: Interrupt type flags | ||
83 | * @devname: An ascii name for the claiming device | ||
84 | * @dev_id: A cookie passed back to the handler function | ||
85 | * | ||
86 | * Except for the extra @dev argument, this function takes the | ||
87 | * same arguments and performs the same function as | ||
88 | * request_any_context_irq(). IRQs requested with this function will be | ||
89 | * automatically freed on driver detach. | ||
90 | * | ||
91 | * If an IRQ allocated with this function needs to be freed | ||
92 | * separately, devm_free_irq() must be used. | ||
93 | */ | ||
94 | int devm_request_any_context_irq(struct device *dev, unsigned int irq, | ||
95 | irq_handler_t handler, unsigned long irqflags, | ||
96 | const char *devname, void *dev_id) | ||
97 | { | ||
98 | struct irq_devres *dr; | ||
99 | int rc; | ||
100 | |||
101 | dr = devres_alloc(devm_irq_release, sizeof(struct irq_devres), | ||
102 | GFP_KERNEL); | ||
103 | if (!dr) | ||
104 | return -ENOMEM; | ||
105 | |||
106 | rc = request_any_context_irq(irq, handler, irqflags, devname, dev_id); | ||
107 | if (rc) { | ||
108 | devres_free(dr); | ||
109 | return rc; | ||
110 | } | ||
111 | |||
112 | dr->irq = irq; | ||
113 | dr->dev_id = dev_id; | ||
114 | devres_add(dev, dr); | ||
115 | |||
116 | return 0; | ||
117 | } | ||
118 | EXPORT_SYMBOL(devm_request_any_context_irq); | ||
119 | |||
120 | /** | ||
76 | * devm_free_irq - free an interrupt | 121 | * devm_free_irq - free an interrupt |
77 | * @dev: device to free interrupt for | 122 | * @dev: device to free interrupt for |
78 | * @irq: Interrupt line to free | 123 | * @irq: Interrupt line to free |
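The new helper is used like devm_request_threaded_irq(): call it from probe() and let devres release the line on driver detach. A hedged sketch of a caller (the foo_* names are hypothetical); note that the underlying request_any_context_irq() reports success as a non-negative IRQC_IS_HARDIRQ or IRQC_IS_NESTED value, so the sketch only treats negative returns as errors:

#include <linux/interrupt.h>
#include <linux/platform_device.h>

static irqreturn_t foo_irq(int irq, void *dev_id)
{
        /* May run in hard-irq or nested-thread context, depending on the
         * parent irqchip; keep it short and non-blocking either way. */
        return IRQ_HANDLED;
}

static int foo_probe(struct platform_device *pdev)
{
        int irq, ret;

        irq = platform_get_irq(pdev, 0);
        if (irq < 0)
                return irq;

        ret = devm_request_any_context_irq(&pdev->dev, irq, foo_irq, 0,
                                           dev_name(&pdev->dev), pdev);
        if (ret < 0)            /* success is IRQC_IS_HARDIRQ or IRQC_IS_NESTED */
                return ret;

        return 0;               /* devres frees the IRQ on unbind; no explicit free needed */
}

Because devres releases resources in reverse order of allocation, the interrupt is torn down before any earlier devm_* allocations the handler might still reference.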
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 192a302d6cfd..8ab8e9390297 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c | |||
@@ -274,6 +274,7 @@ struct irq_desc *irq_to_desc(unsigned int irq) | |||
274 | { | 274 | { |
275 | return (irq < NR_IRQS) ? irq_desc + irq : NULL; | 275 | return (irq < NR_IRQS) ? irq_desc + irq : NULL; |
276 | } | 276 | } |
277 | EXPORT_SYMBOL(irq_to_desc); | ||
277 | 278 | ||
278 | static void free_desc(unsigned int irq) | 279 | static void free_desc(unsigned int irq) |
279 | { | 280 | { |
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index cf68bb36fe58..f14033700c25 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c | |||
@@ -10,6 +10,7 @@ | |||
10 | #include <linux/mutex.h> | 10 | #include <linux/mutex.h> |
11 | #include <linux/of.h> | 11 | #include <linux/of.h> |
12 | #include <linux/of_address.h> | 12 | #include <linux/of_address.h> |
13 | #include <linux/of_irq.h> | ||
13 | #include <linux/topology.h> | 14 | #include <linux/topology.h> |
14 | #include <linux/seq_file.h> | 15 | #include <linux/seq_file.h> |
15 | #include <linux/slab.h> | 16 | #include <linux/slab.h> |
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 481a13c43b17..d3bf660cb57f 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
@@ -802,8 +802,7 @@ static irqreturn_t irq_thread_fn(struct irq_desc *desc, | |||
802 | 802 | ||
803 | static void wake_threads_waitq(struct irq_desc *desc) | 803 | static void wake_threads_waitq(struct irq_desc *desc) |
804 | { | 804 | { |
805 | if (atomic_dec_and_test(&desc->threads_active) && | 805 | if (atomic_dec_and_test(&desc->threads_active)) |
806 | waitqueue_active(&desc->wait_for_threads)) | ||
807 | wake_up(&desc->wait_for_threads); | 806 | wake_up(&desc->wait_for_threads); |
808 | } | 807 | } |
809 | 808 | ||
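Dropping the waitqueue_active() shortcut closes a lost-wakeup window: without an intervening barrier, the waker can observe an empty wait queue before the sleeper's enqueue inside wait_event() becomes visible, skip the wake_up(), and leave the sleeper (here, synchronize_irq() waiting on desc->wait_for_threads) stuck. A simplified model of the two sides, with the names standing in for threads_active and wait_for_threads:

#include <linux/atomic.h>
#include <linux/wait.h>

static atomic_t threads_active = ATOMIC_INIT(1);
static DECLARE_WAIT_QUEUE_HEAD(wait_for_threads);

static void waiter(void)                        /* e.g. synchronize_irq() */
{
        /* Enqueues on the waitqueue, then re-checks the condition. */
        wait_event(wait_for_threads, !atomic_read(&threads_active));
}

static void waker(void)                         /* e.g. wake_threads_waitq() */
{
        if (atomic_dec_and_test(&threads_active)) {
                /*
                 * A waitqueue_active() test here can still see the queue as
                 * empty while the waiter's enqueue is in flight, losing the
                 * wakeup.  The unconditional wake_up(), as in the hunk above,
                 * takes the waitqueue lock and is a cheap no-op when nobody
                 * is waiting.
                 */
                wake_up(&wait_for_threads);
        }
}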
diff --git a/kernel/kexec.c b/kernel/kexec.c index 9c970167e402..60bafbed06ab 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c | |||
@@ -932,6 +932,7 @@ static int kimage_load_segment(struct kimage *image, | |||
932 | */ | 932 | */ |
933 | struct kimage *kexec_image; | 933 | struct kimage *kexec_image; |
934 | struct kimage *kexec_crash_image; | 934 | struct kimage *kexec_crash_image; |
935 | int kexec_load_disabled; | ||
935 | 936 | ||
936 | static DEFINE_MUTEX(kexec_mutex); | 937 | static DEFINE_MUTEX(kexec_mutex); |
937 | 938 | ||
@@ -942,7 +943,7 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, | |||
942 | int result; | 943 | int result; |
943 | 944 | ||
944 | /* We only trust the superuser with rebooting the system. */ | 945 | /* We only trust the superuser with rebooting the system. */ |
945 | if (!capable(CAP_SYS_BOOT)) | 946 | if (!capable(CAP_SYS_BOOT) || kexec_load_disabled) |
946 | return -EPERM; | 947 | return -EPERM; |
947 | 948 | ||
948 | /* | 949 | /* |
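kexec_load_disabled acts as a one-way lockdown: once it is non-zero, even CAP_SYS_BOOT callers get -EPERM from sys_kexec_load(). The flag is presumably exposed as a sysctl elsewhere in this series (kernel.kexec_load_disabled); assuming that path, a boot script could load a vetted crash kernel first and then lock the syscall down:

#include <stdio.h>

int main(void)
{
        /* Path assumed from the sysctl wiring added elsewhere in this series. */
        FILE *f = fopen("/proc/sys/kernel/kexec_load_disabled", "w");

        if (!f) {
                perror("kexec_load_disabled");
                return 1;
        }
        fputs("1\n", f);        /* intended as one-way; not clearable at runtime */
        return fclose(f) ? 1 : 0;
}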
@@ -1536,7 +1537,7 @@ void vmcoreinfo_append_str(const char *fmt, ...) | |||
1536 | size_t r; | 1537 | size_t r; |
1537 | 1538 | ||
1538 | va_start(args, fmt); | 1539 | va_start(args, fmt); |
1539 | r = vsnprintf(buf, sizeof(buf), fmt, args); | 1540 | r = vscnprintf(buf, sizeof(buf), fmt, args); |
1540 | va_end(args); | 1541 | va_end(args); |
1541 | 1542 | ||
1542 | r = min(r, vmcoreinfo_max_size - vmcoreinfo_size); | 1543 | r = min(r, vmcoreinfo_max_size - vmcoreinfo_size); |
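The vsnprintf() to vscnprintf() switch matters because r is subsequently used as a copy length: vsnprintf() returns the length the output would have had without truncation (possibly larger than buf), while vscnprintf() returns the number of characters actually stored, excluding the trailing NUL. A small kernel-style sketch of the difference, using the non-varargs counterparts:

#include <linux/kernel.h>       /* snprintf()/scnprintf(), same contract as the v* forms */

static void __maybe_unused length_semantics(void)
{
        char buf[8];
        int would_need, stored;

        would_need = snprintf(buf, sizeof(buf), "%s", "0123456789");
        /* would_need == 10: the untruncated length, larger than the buffer */

        stored = scnprintf(buf, sizeof(buf), "%s", "0123456789");
        /* stored == 7: bytes actually placed in buf ("0123456"); only this
         * value is safe to reuse as a copy length, which is what the
         * vmcoreinfo code above now does with r */
}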
diff --git a/kernel/kmod.c b/kernel/kmod.c index b086006c59e7..6b375af4958d 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c | |||
@@ -239,7 +239,7 @@ static int ____call_usermodehelper(void *data) | |||
239 | 239 | ||
240 | commit_creds(new); | 240 | commit_creds(new); |
241 | 241 | ||
242 | retval = do_execve(sub_info->path, | 242 | retval = do_execve(getname_kernel(sub_info->path), |
243 | (const char __user *const __user *)sub_info->argv, | 243 | (const char __user *const __user *)sub_info->argv, |
244 | (const char __user *const __user *)sub_info->envp); | 244 | (const char __user *const __user *)sub_info->envp); |
245 | if (!retval) | 245 | if (!retval) |
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 9659d38e008f..d945a949760f 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c | |||
@@ -126,7 +126,7 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj, | |||
126 | { | 126 | { |
127 | return sprintf(buf, "%lx %x\n", | 127 | return sprintf(buf, "%lx %x\n", |
128 | paddr_vmcoreinfo_note(), | 128 | paddr_vmcoreinfo_note(), |
129 | (unsigned int)vmcoreinfo_max_size); | 129 | (unsigned int)sizeof(vmcoreinfo_note)); |
130 | } | 130 | } |
131 | KERNEL_ATTR_RO(vmcoreinfo); | 131 | KERNEL_ATTR_RO(vmcoreinfo); |
132 | 132 | ||
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 576ba756a32d..eb8a54783fa0 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c | |||
@@ -590,6 +590,7 @@ static int very_verbose(struct lock_class *class) | |||
590 | /* | 590 | /* |
591 | * Is this the address of a static object: | 591 | * Is this the address of a static object: |
592 | */ | 592 | */ |
593 | #ifdef __KERNEL__ | ||
593 | static int static_obj(void *obj) | 594 | static int static_obj(void *obj) |
594 | { | 595 | { |
595 | unsigned long start = (unsigned long) &_stext, | 596 | unsigned long start = (unsigned long) &_stext, |
@@ -616,6 +617,7 @@ static int static_obj(void *obj) | |||
616 | */ | 617 | */ |
617 | return is_module_address(addr) || is_module_percpu_address(addr); | 618 | return is_module_address(addr) || is_module_percpu_address(addr); |
618 | } | 619 | } |
620 | #endif | ||
619 | 621 | ||
620 | /* | 622 | /* |
621 | * To make lock name printouts unique, we calculate a unique | 623 | * To make lock name printouts unique, we calculate a unique |
@@ -4115,6 +4117,7 @@ void debug_check_no_locks_held(void) | |||
4115 | } | 4117 | } |
4116 | EXPORT_SYMBOL_GPL(debug_check_no_locks_held); | 4118 | EXPORT_SYMBOL_GPL(debug_check_no_locks_held); |
4117 | 4119 | ||
4120 | #ifdef __KERNEL__ | ||
4118 | void debug_show_all_locks(void) | 4121 | void debug_show_all_locks(void) |
4119 | { | 4122 | { |
4120 | struct task_struct *g, *p; | 4123 | struct task_struct *g, *p; |
@@ -4172,6 +4175,7 @@ retry: | |||
4172 | read_unlock(&tasklist_lock); | 4175 | read_unlock(&tasklist_lock); |
4173 | } | 4176 | } |
4174 | EXPORT_SYMBOL_GPL(debug_show_all_locks); | 4177 | EXPORT_SYMBOL_GPL(debug_show_all_locks); |
4178 | #endif | ||
4175 | 4179 | ||
4176 | /* | 4180 | /* |
4177 | * Careful: only use this function if you are sure that | 4181 | * Careful: only use this function if you are sure that |
diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index 7e3443fe1f48..faf6f5b53e77 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c | |||
@@ -75,7 +75,12 @@ void debug_mutex_unlock(struct mutex *lock) | |||
75 | return; | 75 | return; |
76 | 76 | ||
77 | DEBUG_LOCKS_WARN_ON(lock->magic != lock); | 77 | DEBUG_LOCKS_WARN_ON(lock->magic != lock); |
78 | DEBUG_LOCKS_WARN_ON(lock->owner != current); | 78 | |
79 | if (!lock->owner) | ||
80 | DEBUG_LOCKS_WARN_ON(!lock->owner); | ||
81 | else | ||
82 | DEBUG_LOCKS_WARN_ON(lock->owner != current); | ||
83 | |||
79 | DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); | 84 | DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); |
80 | mutex_clear_owner(lock); | 85 | mutex_clear_owner(lock); |
81 | } | 86 | } |
diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c index 13b243a323fa..49b2ed3dced8 100644 --- a/kernel/locking/rtmutex-debug.c +++ b/kernel/locking/rtmutex-debug.c | |||
@@ -24,7 +24,7 @@ | |||
24 | #include <linux/kallsyms.h> | 24 | #include <linux/kallsyms.h> |
25 | #include <linux/syscalls.h> | 25 | #include <linux/syscalls.h> |
26 | #include <linux/interrupt.h> | 26 | #include <linux/interrupt.h> |
27 | #include <linux/plist.h> | 27 | #include <linux/rbtree.h> |
28 | #include <linux/fs.h> | 28 | #include <linux/fs.h> |
29 | #include <linux/debug_locks.h> | 29 | #include <linux/debug_locks.h> |
30 | 30 | ||
@@ -57,7 +57,7 @@ static void printk_lock(struct rt_mutex *lock, int print_owner) | |||
57 | 57 | ||
58 | void rt_mutex_debug_task_free(struct task_struct *task) | 58 | void rt_mutex_debug_task_free(struct task_struct *task) |
59 | { | 59 | { |
60 | DEBUG_LOCKS_WARN_ON(!plist_head_empty(&task->pi_waiters)); | 60 | DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters)); |
61 | DEBUG_LOCKS_WARN_ON(task->pi_blocked_on); | 61 | DEBUG_LOCKS_WARN_ON(task->pi_blocked_on); |
62 | } | 62 | } |
63 | 63 | ||
@@ -154,16 +154,12 @@ void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock) | |||
154 | void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) | 154 | void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) |
155 | { | 155 | { |
156 | memset(waiter, 0x11, sizeof(*waiter)); | 156 | memset(waiter, 0x11, sizeof(*waiter)); |
157 | plist_node_init(&waiter->list_entry, MAX_PRIO); | ||
158 | plist_node_init(&waiter->pi_list_entry, MAX_PRIO); | ||
159 | waiter->deadlock_task_pid = NULL; | 157 | waiter->deadlock_task_pid = NULL; |
160 | } | 158 | } |
161 | 159 | ||
162 | void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) | 160 | void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) |
163 | { | 161 | { |
164 | put_pid(waiter->deadlock_task_pid); | 162 | put_pid(waiter->deadlock_task_pid); |
165 | DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->list_entry)); | ||
166 | DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); | ||
167 | memset(waiter, 0x22, sizeof(*waiter)); | 163 | memset(waiter, 0x22, sizeof(*waiter)); |
168 | } | 164 | } |
169 | 165 | ||
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 0dd6aec1cb6a..2e960a2bab81 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c | |||
@@ -14,6 +14,7 @@ | |||
14 | #include <linux/export.h> | 14 | #include <linux/export.h> |
15 | #include <linux/sched.h> | 15 | #include <linux/sched.h> |
16 | #include <linux/sched/rt.h> | 16 | #include <linux/sched/rt.h> |
17 | #include <linux/sched/deadline.h> | ||
17 | #include <linux/timer.h> | 18 | #include <linux/timer.h> |
18 | 19 | ||
19 | #include "rtmutex_common.h" | 20 | #include "rtmutex_common.h" |
@@ -91,10 +92,107 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) | |||
91 | } | 92 | } |
92 | #endif | 93 | #endif |
93 | 94 | ||
95 | static inline int | ||
96 | rt_mutex_waiter_less(struct rt_mutex_waiter *left, | ||
97 | struct rt_mutex_waiter *right) | ||
98 | { | ||
99 | if (left->prio < right->prio) | ||
100 | return 1; | ||
101 | |||
102 | /* | ||
103 | * If both waiters have dl_prio(), we check the deadlines of the | ||
104 | * associated tasks. | ||
105 | * If left waiter has a dl_prio(), and we didn't return 1 above, | ||
106 | * then right waiter has a dl_prio() too. | ||
107 | */ | ||
108 | if (dl_prio(left->prio)) | ||
109 | return (left->task->dl.deadline < right->task->dl.deadline); | ||
110 | |||
111 | return 0; | ||
112 | } | ||
113 | |||
114 | static void | ||
115 | rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) | ||
116 | { | ||
117 | struct rb_node **link = &lock->waiters.rb_node; | ||
118 | struct rb_node *parent = NULL; | ||
119 | struct rt_mutex_waiter *entry; | ||
120 | int leftmost = 1; | ||
121 | |||
122 | while (*link) { | ||
123 | parent = *link; | ||
124 | entry = rb_entry(parent, struct rt_mutex_waiter, tree_entry); | ||
125 | if (rt_mutex_waiter_less(waiter, entry)) { | ||
126 | link = &parent->rb_left; | ||
127 | } else { | ||
128 | link = &parent->rb_right; | ||
129 | leftmost = 0; | ||
130 | } | ||
131 | } | ||
132 | |||
133 | if (leftmost) | ||
134 | lock->waiters_leftmost = &waiter->tree_entry; | ||
135 | |||
136 | rb_link_node(&waiter->tree_entry, parent, link); | ||
137 | rb_insert_color(&waiter->tree_entry, &lock->waiters); | ||
138 | } | ||
139 | |||
140 | static void | ||
141 | rt_mutex_dequeue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) | ||
142 | { | ||
143 | if (RB_EMPTY_NODE(&waiter->tree_entry)) | ||
144 | return; | ||
145 | |||
146 | if (lock->waiters_leftmost == &waiter->tree_entry) | ||
147 | lock->waiters_leftmost = rb_next(&waiter->tree_entry); | ||
148 | |||
149 | rb_erase(&waiter->tree_entry, &lock->waiters); | ||
150 | RB_CLEAR_NODE(&waiter->tree_entry); | ||
151 | } | ||
152 | |||
153 | static void | ||
154 | rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter) | ||
155 | { | ||
156 | struct rb_node **link = &task->pi_waiters.rb_node; | ||
157 | struct rb_node *parent = NULL; | ||
158 | struct rt_mutex_waiter *entry; | ||
159 | int leftmost = 1; | ||
160 | |||
161 | while (*link) { | ||
162 | parent = *link; | ||
163 | entry = rb_entry(parent, struct rt_mutex_waiter, pi_tree_entry); | ||
164 | if (rt_mutex_waiter_less(waiter, entry)) { | ||
165 | link = &parent->rb_left; | ||
166 | } else { | ||
167 | link = &parent->rb_right; | ||
168 | leftmost = 0; | ||
169 | } | ||
170 | } | ||
171 | |||
172 | if (leftmost) | ||
173 | task->pi_waiters_leftmost = &waiter->pi_tree_entry; | ||
174 | |||
175 | rb_link_node(&waiter->pi_tree_entry, parent, link); | ||
176 | rb_insert_color(&waiter->pi_tree_entry, &task->pi_waiters); | ||
177 | } | ||
178 | |||
179 | static void | ||
180 | rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter) | ||
181 | { | ||
182 | if (RB_EMPTY_NODE(&waiter->pi_tree_entry)) | ||
183 | return; | ||
184 | |||
185 | if (task->pi_waiters_leftmost == &waiter->pi_tree_entry) | ||
186 | task->pi_waiters_leftmost = rb_next(&waiter->pi_tree_entry); | ||
187 | |||
188 | rb_erase(&waiter->pi_tree_entry, &task->pi_waiters); | ||
189 | RB_CLEAR_NODE(&waiter->pi_tree_entry); | ||
190 | } | ||
191 | |||
94 | /* | 192 | /* |
95 | * Calculate task priority from the waiter list priority | 193 | * Calculate task priority from the waiter tree priority |
96 | * | 194 | * |
97 | * Return task->normal_prio when the waiter list is empty or when | 195 | * Return task->normal_prio when the waiter tree is empty or when |
98 | * the waiter is not allowed to do priority boosting | 196 | * the waiter is not allowed to do priority boosting |
99 | */ | 197 | */ |
100 | int rt_mutex_getprio(struct task_struct *task) | 198 | int rt_mutex_getprio(struct task_struct *task) |
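These helpers replace the priority-sorted plist with an rbtree whose leftmost node is cached in lock->waiters_leftmost (and task->pi_waiters_leftmost), so the top waiter stays an O(1) lookup while insertion becomes O(log n). The real point is the comparator: lower ->prio still wins, and because every SCHED_DEADLINE task shares the same prio value below the RT range, rt_mutex_waiter_less() breaks the tie on the earlier absolute deadline. A standalone user-space model of that ordering (simplified fields, hypothetical values, not kernel code):

#include <stdbool.h>
#include <stdio.h>

struct waiter { int prio; unsigned long long deadline; };

#define DL_PRIO (-1)    /* deadline tasks share one prio below the RT range */

static bool waiter_less(const struct waiter *l, const struct waiter *r)
{
        if (l->prio < r->prio)
                return true;
        if (l->prio == DL_PRIO && r->prio == DL_PRIO)
                return l->deadline < r->deadline;       /* earlier deadline wins */
        return false;
}

int main(void)
{
        struct waiter rt50 = { .prio = 49 };                    /* RT priority 50 maps to kernel prio 49 */
        struct waiter dl_a = { .prio = DL_PRIO, .deadline = 100 };
        struct waiter dl_b = { .prio = DL_PRIO, .deadline = 200 };

        printf("dl_a before rt50: %d\n", waiter_less(&dl_a, &rt50));    /* 1 */
        printf("dl_a before dl_b: %d\n", waiter_less(&dl_a, &dl_b));    /* 1 */
        printf("rt50 before dl_b: %d\n", waiter_less(&rt50, &dl_b));    /* 0 */
        return 0;
}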
@@ -102,10 +200,18 @@ int rt_mutex_getprio(struct task_struct *task) | |||
102 | if (likely(!task_has_pi_waiters(task))) | 200 | if (likely(!task_has_pi_waiters(task))) |
103 | return task->normal_prio; | 201 | return task->normal_prio; |
104 | 202 | ||
105 | return min(task_top_pi_waiter(task)->pi_list_entry.prio, | 203 | return min(task_top_pi_waiter(task)->prio, |
106 | task->normal_prio); | 204 | task->normal_prio); |
107 | } | 205 | } |
108 | 206 | ||
207 | struct task_struct *rt_mutex_get_top_task(struct task_struct *task) | ||
208 | { | ||
209 | if (likely(!task_has_pi_waiters(task))) | ||
210 | return NULL; | ||
211 | |||
212 | return task_top_pi_waiter(task)->task; | ||
213 | } | ||
214 | |||
109 | /* | 215 | /* |
110 | * Adjust the priority of a task, after its pi_waiters got modified. | 216 | * Adjust the priority of a task, after its pi_waiters got modified. |
111 | * | 217 | * |
@@ -115,7 +221,7 @@ static void __rt_mutex_adjust_prio(struct task_struct *task) | |||
115 | { | 221 | { |
116 | int prio = rt_mutex_getprio(task); | 222 | int prio = rt_mutex_getprio(task); |
117 | 223 | ||
118 | if (task->prio != prio) | 224 | if (task->prio != prio || dl_prio(prio)) |
119 | rt_mutex_setprio(task, prio); | 225 | rt_mutex_setprio(task, prio); |
120 | } | 226 | } |
121 | 227 | ||
@@ -233,7 +339,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, | |||
233 | * When deadlock detection is off then we check, if further | 339 | * When deadlock detection is off then we check, if further |
234 | * priority adjustment is necessary. | 340 | * priority adjustment is necessary. |
235 | */ | 341 | */ |
236 | if (!detect_deadlock && waiter->list_entry.prio == task->prio) | 342 | if (!detect_deadlock && waiter->prio == task->prio) |
237 | goto out_unlock_pi; | 343 | goto out_unlock_pi; |
238 | 344 | ||
239 | lock = waiter->lock; | 345 | lock = waiter->lock; |
@@ -254,9 +360,9 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, | |||
254 | top_waiter = rt_mutex_top_waiter(lock); | 360 | top_waiter = rt_mutex_top_waiter(lock); |
255 | 361 | ||
256 | /* Requeue the waiter */ | 362 | /* Requeue the waiter */ |
257 | plist_del(&waiter->list_entry, &lock->wait_list); | 363 | rt_mutex_dequeue(lock, waiter); |
258 | waiter->list_entry.prio = task->prio; | 364 | waiter->prio = task->prio; |
259 | plist_add(&waiter->list_entry, &lock->wait_list); | 365 | rt_mutex_enqueue(lock, waiter); |
260 | 366 | ||
261 | /* Release the task */ | 367 | /* Release the task */ |
262 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); | 368 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); |
@@ -280,17 +386,15 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, | |||
280 | 386 | ||
281 | if (waiter == rt_mutex_top_waiter(lock)) { | 387 | if (waiter == rt_mutex_top_waiter(lock)) { |
282 | /* Boost the owner */ | 388 | /* Boost the owner */ |
283 | plist_del(&top_waiter->pi_list_entry, &task->pi_waiters); | 389 | rt_mutex_dequeue_pi(task, top_waiter); |
284 | waiter->pi_list_entry.prio = waiter->list_entry.prio; | 390 | rt_mutex_enqueue_pi(task, waiter); |
285 | plist_add(&waiter->pi_list_entry, &task->pi_waiters); | ||
286 | __rt_mutex_adjust_prio(task); | 391 | __rt_mutex_adjust_prio(task); |
287 | 392 | ||
288 | } else if (top_waiter == waiter) { | 393 | } else if (top_waiter == waiter) { |
289 | /* Deboost the owner */ | 394 | /* Deboost the owner */ |
290 | plist_del(&waiter->pi_list_entry, &task->pi_waiters); | 395 | rt_mutex_dequeue_pi(task, waiter); |
291 | waiter = rt_mutex_top_waiter(lock); | 396 | waiter = rt_mutex_top_waiter(lock); |
292 | waiter->pi_list_entry.prio = waiter->list_entry.prio; | 397 | rt_mutex_enqueue_pi(task, waiter); |
293 | plist_add(&waiter->pi_list_entry, &task->pi_waiters); | ||
294 | __rt_mutex_adjust_prio(task); | 398 | __rt_mutex_adjust_prio(task); |
295 | } | 399 | } |
296 | 400 | ||
@@ -355,7 +459,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, | |||
355 | * 3) it is top waiter | 459 | * 3) it is top waiter |
356 | */ | 460 | */ |
357 | if (rt_mutex_has_waiters(lock)) { | 461 | if (rt_mutex_has_waiters(lock)) { |
358 | if (task->prio >= rt_mutex_top_waiter(lock)->list_entry.prio) { | 462 | if (task->prio >= rt_mutex_top_waiter(lock)->prio) { |
359 | if (!waiter || waiter != rt_mutex_top_waiter(lock)) | 463 | if (!waiter || waiter != rt_mutex_top_waiter(lock)) |
360 | return 0; | 464 | return 0; |
361 | } | 465 | } |
@@ -369,7 +473,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, | |||
369 | 473 | ||
370 | /* remove the queued waiter. */ | 474 | /* remove the queued waiter. */ |
371 | if (waiter) { | 475 | if (waiter) { |
372 | plist_del(&waiter->list_entry, &lock->wait_list); | 476 | rt_mutex_dequeue(lock, waiter); |
373 | task->pi_blocked_on = NULL; | 477 | task->pi_blocked_on = NULL; |
374 | } | 478 | } |
375 | 479 | ||
@@ -379,8 +483,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, | |||
379 | */ | 483 | */ |
380 | if (rt_mutex_has_waiters(lock)) { | 484 | if (rt_mutex_has_waiters(lock)) { |
381 | top = rt_mutex_top_waiter(lock); | 485 | top = rt_mutex_top_waiter(lock); |
382 | top->pi_list_entry.prio = top->list_entry.prio; | 486 | rt_mutex_enqueue_pi(task, top); |
383 | plist_add(&top->pi_list_entry, &task->pi_waiters); | ||
384 | } | 487 | } |
385 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); | 488 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); |
386 | } | 489 | } |
@@ -416,13 +519,12 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, | |||
416 | __rt_mutex_adjust_prio(task); | 519 | __rt_mutex_adjust_prio(task); |
417 | waiter->task = task; | 520 | waiter->task = task; |
418 | waiter->lock = lock; | 521 | waiter->lock = lock; |
419 | plist_node_init(&waiter->list_entry, task->prio); | 522 | waiter->prio = task->prio; |
420 | plist_node_init(&waiter->pi_list_entry, task->prio); | ||
421 | 523 | ||
422 | /* Get the top priority waiter on the lock */ | 524 | /* Get the top priority waiter on the lock */ |
423 | if (rt_mutex_has_waiters(lock)) | 525 | if (rt_mutex_has_waiters(lock)) |
424 | top_waiter = rt_mutex_top_waiter(lock); | 526 | top_waiter = rt_mutex_top_waiter(lock); |
425 | plist_add(&waiter->list_entry, &lock->wait_list); | 527 | rt_mutex_enqueue(lock, waiter); |
426 | 528 | ||
427 | task->pi_blocked_on = waiter; | 529 | task->pi_blocked_on = waiter; |
428 | 530 | ||
@@ -433,8 +535,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, | |||
433 | 535 | ||
434 | if (waiter == rt_mutex_top_waiter(lock)) { | 536 | if (waiter == rt_mutex_top_waiter(lock)) { |
435 | raw_spin_lock_irqsave(&owner->pi_lock, flags); | 537 | raw_spin_lock_irqsave(&owner->pi_lock, flags); |
436 | plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); | 538 | rt_mutex_dequeue_pi(owner, top_waiter); |
437 | plist_add(&waiter->pi_list_entry, &owner->pi_waiters); | 539 | rt_mutex_enqueue_pi(owner, waiter); |
438 | 540 | ||
439 | __rt_mutex_adjust_prio(owner); | 541 | __rt_mutex_adjust_prio(owner); |
440 | if (owner->pi_blocked_on) | 542 | if (owner->pi_blocked_on) |
@@ -486,7 +588,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock) | |||
486 | * boosted mode and go back to normal after releasing | 588 | * boosted mode and go back to normal after releasing |
487 | * lock->wait_lock. | 589 | * lock->wait_lock. |
488 | */ | 590 | */ |
489 | plist_del(&waiter->pi_list_entry, ¤t->pi_waiters); | 591 | rt_mutex_dequeue_pi(current, waiter); |
490 | 592 | ||
491 | rt_mutex_set_owner(lock, NULL); | 593 | rt_mutex_set_owner(lock, NULL); |
492 | 594 | ||
@@ -510,7 +612,7 @@ static void remove_waiter(struct rt_mutex *lock, | |||
510 | int chain_walk = 0; | 612 | int chain_walk = 0; |
511 | 613 | ||
512 | raw_spin_lock_irqsave(¤t->pi_lock, flags); | 614 | raw_spin_lock_irqsave(¤t->pi_lock, flags); |
513 | plist_del(&waiter->list_entry, &lock->wait_list); | 615 | rt_mutex_dequeue(lock, waiter); |
514 | current->pi_blocked_on = NULL; | 616 | current->pi_blocked_on = NULL; |
515 | raw_spin_unlock_irqrestore(¤t->pi_lock, flags); | 617 | raw_spin_unlock_irqrestore(¤t->pi_lock, flags); |
516 | 618 | ||
@@ -521,13 +623,13 @@ static void remove_waiter(struct rt_mutex *lock, | |||
521 | 623 | ||
522 | raw_spin_lock_irqsave(&owner->pi_lock, flags); | 624 | raw_spin_lock_irqsave(&owner->pi_lock, flags); |
523 | 625 | ||
524 | plist_del(&waiter->pi_list_entry, &owner->pi_waiters); | 626 | rt_mutex_dequeue_pi(owner, waiter); |
525 | 627 | ||
526 | if (rt_mutex_has_waiters(lock)) { | 628 | if (rt_mutex_has_waiters(lock)) { |
527 | struct rt_mutex_waiter *next; | 629 | struct rt_mutex_waiter *next; |
528 | 630 | ||
529 | next = rt_mutex_top_waiter(lock); | 631 | next = rt_mutex_top_waiter(lock); |
530 | plist_add(&next->pi_list_entry, &owner->pi_waiters); | 632 | rt_mutex_enqueue_pi(owner, next); |
531 | } | 633 | } |
532 | __rt_mutex_adjust_prio(owner); | 634 | __rt_mutex_adjust_prio(owner); |
533 | 635 | ||
@@ -537,8 +639,6 @@ static void remove_waiter(struct rt_mutex *lock, | |||
537 | raw_spin_unlock_irqrestore(&owner->pi_lock, flags); | 639 | raw_spin_unlock_irqrestore(&owner->pi_lock, flags); |
538 | } | 640 | } |
539 | 641 | ||
540 | WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); | ||
541 | |||
542 | if (!chain_walk) | 642 | if (!chain_walk) |
543 | return; | 643 | return; |
544 | 644 | ||
@@ -565,7 +665,8 @@ void rt_mutex_adjust_pi(struct task_struct *task) | |||
565 | raw_spin_lock_irqsave(&task->pi_lock, flags); | 665 | raw_spin_lock_irqsave(&task->pi_lock, flags); |
566 | 666 | ||
567 | waiter = task->pi_blocked_on; | 667 | waiter = task->pi_blocked_on; |
568 | if (!waiter || waiter->list_entry.prio == task->prio) { | 668 | if (!waiter || (waiter->prio == task->prio && |
669 | !dl_prio(task->prio))) { | ||
569 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); | 670 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); |
570 | return; | 671 | return; |
571 | } | 672 | } |
@@ -638,6 +739,8 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, | |||
638 | int ret = 0; | 739 | int ret = 0; |
639 | 740 | ||
640 | debug_rt_mutex_init_waiter(&waiter); | 741 | debug_rt_mutex_init_waiter(&waiter); |
742 | RB_CLEAR_NODE(&waiter.pi_tree_entry); | ||
743 | RB_CLEAR_NODE(&waiter.tree_entry); | ||
641 | 744 | ||
642 | raw_spin_lock(&lock->wait_lock); | 745 | raw_spin_lock(&lock->wait_lock); |
643 | 746 | ||
@@ -904,7 +1007,8 @@ void __rt_mutex_init(struct rt_mutex *lock, const char *name) | |||
904 | { | 1007 | { |
905 | lock->owner = NULL; | 1008 | lock->owner = NULL; |
906 | raw_spin_lock_init(&lock->wait_lock); | 1009 | raw_spin_lock_init(&lock->wait_lock); |
907 | plist_head_init(&lock->wait_list); | 1010 | lock->waiters = RB_ROOT; |
1011 | lock->waiters_leftmost = NULL; | ||
908 | 1012 | ||
909 | debug_rt_mutex_init(lock, name); | 1013 | debug_rt_mutex_init(lock, name); |
910 | } | 1014 | } |
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 53a66c85261b..7431a9c86f35 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h | |||
@@ -40,13 +40,13 @@ extern void schedule_rt_mutex_test(struct rt_mutex *lock); | |||
40 | * This is the control structure for tasks blocked on a rt_mutex, | 40 | * This is the control structure for tasks blocked on a rt_mutex, |
41 | * which is allocated on the kernel stack of the blocked task. | 41 | * which is allocated on the kernel stack of the blocked task. |

42 | * | 42 | * |
43 | * @list_entry: pi node to enqueue into the mutex waiters list | 43 | * @tree_entry: pi node to enqueue into the mutex waiters tree |
44 | * @pi_list_entry: pi node to enqueue into the mutex owner waiters list | 44 | * @pi_tree_entry: pi node to enqueue into the mutex owner waiters tree |
45 | * @task: task reference to the blocked task | 45 | * @task: task reference to the blocked task |
46 | */ | 46 | */ |
47 | struct rt_mutex_waiter { | 47 | struct rt_mutex_waiter { |
48 | struct plist_node list_entry; | 48 | struct rb_node tree_entry; |
49 | struct plist_node pi_list_entry; | 49 | struct rb_node pi_tree_entry; |
50 | struct task_struct *task; | 50 | struct task_struct *task; |
51 | struct rt_mutex *lock; | 51 | struct rt_mutex *lock; |
52 | #ifdef CONFIG_DEBUG_RT_MUTEXES | 52 | #ifdef CONFIG_DEBUG_RT_MUTEXES |
@@ -54,14 +54,15 @@ struct rt_mutex_waiter { | |||
54 | struct pid *deadlock_task_pid; | 54 | struct pid *deadlock_task_pid; |
55 | struct rt_mutex *deadlock_lock; | 55 | struct rt_mutex *deadlock_lock; |
56 | #endif | 56 | #endif |
57 | int prio; | ||
57 | }; | 58 | }; |
58 | 59 | ||
59 | /* | 60 | /* |
60 | * Various helpers to access the waiters-plist: | 61 | * Various helpers to access the waiters-tree: |
61 | */ | 62 | */ |
62 | static inline int rt_mutex_has_waiters(struct rt_mutex *lock) | 63 | static inline int rt_mutex_has_waiters(struct rt_mutex *lock) |
63 | { | 64 | { |
64 | return !plist_head_empty(&lock->wait_list); | 65 | return !RB_EMPTY_ROOT(&lock->waiters); |
65 | } | 66 | } |
66 | 67 | ||
67 | static inline struct rt_mutex_waiter * | 68 | static inline struct rt_mutex_waiter * |
@@ -69,8 +70,8 @@ rt_mutex_top_waiter(struct rt_mutex *lock) | |||
69 | { | 70 | { |
70 | struct rt_mutex_waiter *w; | 71 | struct rt_mutex_waiter *w; |
71 | 72 | ||
72 | w = plist_first_entry(&lock->wait_list, struct rt_mutex_waiter, | 73 | w = rb_entry(lock->waiters_leftmost, struct rt_mutex_waiter, |
73 | list_entry); | 74 | tree_entry); |
74 | BUG_ON(w->lock != lock); | 75 | BUG_ON(w->lock != lock); |
75 | 76 | ||
76 | return w; | 77 | return w; |
@@ -78,14 +79,14 @@ rt_mutex_top_waiter(struct rt_mutex *lock) | |||
78 | 79 | ||
79 | static inline int task_has_pi_waiters(struct task_struct *p) | 80 | static inline int task_has_pi_waiters(struct task_struct *p) |
80 | { | 81 | { |
81 | return !plist_head_empty(&p->pi_waiters); | 82 | return !RB_EMPTY_ROOT(&p->pi_waiters); |
82 | } | 83 | } |
83 | 84 | ||
84 | static inline struct rt_mutex_waiter * | 85 | static inline struct rt_mutex_waiter * |
85 | task_top_pi_waiter(struct task_struct *p) | 86 | task_top_pi_waiter(struct task_struct *p) |
86 | { | 87 | { |
87 | return plist_first_entry(&p->pi_waiters, struct rt_mutex_waiter, | 88 | return rb_entry(p->pi_waiters_leftmost, struct rt_mutex_waiter, |
88 | pi_list_entry); | 89 | pi_tree_entry); |
89 | } | 90 | } |
90 | 91 | ||
91 | /* | 92 | /* |
diff --git a/kernel/module.c b/kernel/module.c index f5a3b1e8ec51..d24fcf29cb64 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -815,10 +815,8 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, | |||
815 | return -EFAULT; | 815 | return -EFAULT; |
816 | name[MODULE_NAME_LEN-1] = '\0'; | 816 | name[MODULE_NAME_LEN-1] = '\0'; |
817 | 817 | ||
818 | if (!(flags & O_NONBLOCK)) { | 818 | if (!(flags & O_NONBLOCK)) |
819 | printk(KERN_WARNING | 819 | pr_warn("waiting module removal not supported: please upgrade\n"); |
820 | "waiting module removal not supported: please upgrade"); | ||
821 | } | ||
822 | 820 | ||
823 | if (mutex_lock_interruptible(&module_mutex) != 0) | 821 | if (mutex_lock_interruptible(&module_mutex) != 0) |
824 | return -EINTR; | 822 | return -EINTR; |
diff --git a/kernel/padata.c b/kernel/padata.c index 2abd25d79cc8..161402f0b517 100644 --- a/kernel/padata.c +++ b/kernel/padata.c | |||
@@ -112,7 +112,7 @@ int padata_do_parallel(struct padata_instance *pinst, | |||
112 | 112 | ||
113 | rcu_read_lock_bh(); | 113 | rcu_read_lock_bh(); |
114 | 114 | ||
115 | pd = rcu_dereference(pinst->pd); | 115 | pd = rcu_dereference_bh(pinst->pd); |
116 | 116 | ||
117 | err = -EINVAL; | 117 | err = -EINVAL; |
118 | if (!(pinst->flags & PADATA_INIT) || pinst->flags & PADATA_INVALID) | 118 | if (!(pinst->flags & PADATA_INIT) || pinst->flags & PADATA_INVALID) |
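padata_do_parallel() holds rcu_read_lock_bh() rather than rcu_read_lock(), so the plain rcu_dereference() can trigger a lockdep-RCU "suspicious rcu_dereference_check() usage" complaint; rcu_dereference_bh() asserts the matching BH-disabled read side instead. The general pairing rule, sketched with the pinst/pd names from the hunk above:

/* Each RCU flavor's read lock must be paired with the same flavor of accessor. */
rcu_read_lock_bh();
pd = rcu_dereference_bh(pinst->pd);     /* BH read side -> _bh accessor */
/* ... use pd, no sleeping ... */
rcu_read_unlock_bh();

rcu_read_lock();
pd = rcu_dereference(pinst->pd);        /* classic read side -> plain accessor */
/* ... */
rcu_read_unlock();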
diff --git a/kernel/panic.c b/kernel/panic.c index c00b4ceb39e8..6d6300375090 100644 --- a/kernel/panic.c +++ b/kernel/panic.c | |||
@@ -33,7 +33,7 @@ static int pause_on_oops; | |||
33 | static int pause_on_oops_flag; | 33 | static int pause_on_oops_flag; |
34 | static DEFINE_SPINLOCK(pause_on_oops_lock); | 34 | static DEFINE_SPINLOCK(pause_on_oops_lock); |
35 | 35 | ||
36 | int panic_timeout; | 36 | int panic_timeout = CONFIG_PANIC_TIMEOUT; |
37 | EXPORT_SYMBOL_GPL(panic_timeout); | 37 | EXPORT_SYMBOL_GPL(panic_timeout); |
38 | 38 | ||
39 | ATOMIC_NOTIFIER_HEAD(panic_notifier_list); | 39 | ATOMIC_NOTIFIER_HEAD(panic_notifier_list); |
diff --git a/kernel/params.c b/kernel/params.c index c00d5b502aa4..b00142e7f3ba 100644 --- a/kernel/params.c +++ b/kernel/params.c | |||
@@ -227,17 +227,10 @@ int parse_args(const char *doing, | |||
227 | } | 227 | } |
228 | 228 | ||
229 | /* Lazy bastard, eh? */ | 229 | /* Lazy bastard, eh? */ |
230 | #define STANDARD_PARAM_DEF(name, type, format, tmptype, strtolfn) \ | 230 | #define STANDARD_PARAM_DEF(name, type, format, strtolfn) \ |
231 | int param_set_##name(const char *val, const struct kernel_param *kp) \ | 231 | int param_set_##name(const char *val, const struct kernel_param *kp) \ |
232 | { \ | 232 | { \ |
233 | tmptype l; \ | 233 | return strtolfn(val, 0, (type *)kp->arg); \ |
234 | int ret; \ | ||
235 | \ | ||
236 | ret = strtolfn(val, 0, &l); \ | ||
237 | if (ret < 0 || ((type)l != l)) \ | ||
238 | return ret < 0 ? ret : -EINVAL; \ | ||
239 | *((type *)kp->arg) = l; \ | ||
240 | return 0; \ | ||
241 | } \ | 234 | } \ |
242 | int param_get_##name(char *buffer, const struct kernel_param *kp) \ | 235 | int param_get_##name(char *buffer, const struct kernel_param *kp) \ |
243 | { \ | 236 | { \ |
@@ -253,13 +246,13 @@ int parse_args(const char *doing, | |||
253 | EXPORT_SYMBOL(param_ops_##name) | 246 | EXPORT_SYMBOL(param_ops_##name) |
254 | 247 | ||
255 | 248 | ||
256 | STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", unsigned long, kstrtoul); | 249 | STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", kstrtou8); |
257 | STANDARD_PARAM_DEF(short, short, "%hi", long, kstrtol); | 250 | STANDARD_PARAM_DEF(short, short, "%hi", kstrtos16); |
258 | STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, kstrtoul); | 251 | STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", kstrtou16); |
259 | STANDARD_PARAM_DEF(int, int, "%i", long, kstrtol); | 252 | STANDARD_PARAM_DEF(int, int, "%i", kstrtoint); |
260 | STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, kstrtoul); | 253 | STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint); |
261 | STANDARD_PARAM_DEF(long, long, "%li", long, kstrtol); | 254 | STANDARD_PARAM_DEF(long, long, "%li", kstrtol); |
262 | STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, kstrtoul); | 255 | STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul); |
263 | 256 | ||
264 | int param_set_charp(const char *val, const struct kernel_param *kp) | 257 | int param_set_charp(const char *val, const struct kernel_param *kp) |
265 | { | 258 | { |
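The temporary variable and the "(type)l != l" cast check can go because the type-specific kstrto*() helpers (kstrtou8, kstrtos16, kstrtoint, and so on) already reject malformed input and out-of-range values with -EINVAL/-ERANGE for exactly the target type. For reference, the macro now expands mechanically to setters like this one for int:

/* Expansion of STANDARD_PARAM_DEF(int, int, "%i", kstrtoint) after this change. */
int param_set_int(const char *val, const struct kernel_param *kp)
{
        return kstrtoint(val, 0, (int *)kp->arg);       /* base 0: accepts 0x.., 0.., decimal */
}

The param_get_##name and param_ops_##name halves of the macro are unchanged by this hunk.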
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index c7f31aa272f7..3b8946416a5f 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c | |||
@@ -233,7 +233,8 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) | |||
233 | 233 | ||
234 | /* | 234 | /* |
235 | * Sample a process (thread group) clock for the given group_leader task. | 235 | * Sample a process (thread group) clock for the given group_leader task. |
236 | * Must be called with tasklist_lock held for reading. | 236 | * Must be called with task sighand lock held for safe while_each_thread() |
237 | * traversal. | ||
237 | */ | 238 | */ |
238 | static int cpu_clock_sample_group(const clockid_t which_clock, | 239 | static int cpu_clock_sample_group(const clockid_t which_clock, |
239 | struct task_struct *p, | 240 | struct task_struct *p, |
@@ -260,30 +261,53 @@ static int cpu_clock_sample_group(const clockid_t which_clock, | |||
260 | return 0; | 261 | return 0; |
261 | } | 262 | } |
262 | 263 | ||
264 | static int posix_cpu_clock_get_task(struct task_struct *tsk, | ||
265 | const clockid_t which_clock, | ||
266 | struct timespec *tp) | ||
267 | { | ||
268 | int err = -EINVAL; | ||
269 | unsigned long long rtn; | ||
270 | |||
271 | if (CPUCLOCK_PERTHREAD(which_clock)) { | ||
272 | if (same_thread_group(tsk, current)) | ||
273 | err = cpu_clock_sample(which_clock, tsk, &rtn); | ||
274 | } else { | ||
275 | unsigned long flags; | ||
276 | struct sighand_struct *sighand; | ||
277 | |||
278 | /* | ||
279 | * while_each_thread() is not yet entirely RCU safe, | ||
280 | * keep locking the group while sampling process | ||
281 | * clock for now. | ||
282 | */ | ||
283 | sighand = lock_task_sighand(tsk, &flags); | ||
284 | if (!sighand) | ||
285 | return err; | ||
286 | |||
287 | if (tsk == current || thread_group_leader(tsk)) | ||
288 | err = cpu_clock_sample_group(which_clock, tsk, &rtn); | ||
289 | |||
290 | unlock_task_sighand(tsk, &flags); | ||
291 | } | ||
292 | |||
293 | if (!err) | ||
294 | sample_to_timespec(which_clock, rtn, tp); | ||
295 | |||
296 | return err; | ||
297 | } | ||
298 | |||
263 | 299 | ||
264 | static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) | 300 | static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) |
265 | { | 301 | { |
266 | const pid_t pid = CPUCLOCK_PID(which_clock); | 302 | const pid_t pid = CPUCLOCK_PID(which_clock); |
267 | int error = -EINVAL; | 303 | int err = -EINVAL; |
268 | unsigned long long rtn; | ||
269 | 304 | ||
270 | if (pid == 0) { | 305 | if (pid == 0) { |
271 | /* | 306 | /* |
272 | * Special case constant value for our own clocks. | 307 | * Special case constant value for our own clocks. |
273 | * We don't have to do any lookup to find ourselves. | 308 | * We don't have to do any lookup to find ourselves. |
274 | */ | 309 | */ |
275 | if (CPUCLOCK_PERTHREAD(which_clock)) { | 310 | err = posix_cpu_clock_get_task(current, which_clock, tp); |
276 | /* | ||
277 | * Sampling just ourselves we can do with no locking. | ||
278 | */ | ||
279 | error = cpu_clock_sample(which_clock, | ||
280 | current, &rtn); | ||
281 | } else { | ||
282 | read_lock(&tasklist_lock); | ||
283 | error = cpu_clock_sample_group(which_clock, | ||
284 | current, &rtn); | ||
285 | read_unlock(&tasklist_lock); | ||
286 | } | ||
287 | } else { | 311 | } else { |
288 | /* | 312 | /* |
289 | * Find the given PID, and validate that the caller | 313 | * Find the given PID, and validate that the caller |
@@ -292,29 +316,12 @@ static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) | |||
292 | struct task_struct *p; | 316 | struct task_struct *p; |
293 | rcu_read_lock(); | 317 | rcu_read_lock(); |
294 | p = find_task_by_vpid(pid); | 318 | p = find_task_by_vpid(pid); |
295 | if (p) { | 319 | if (p) |
296 | if (CPUCLOCK_PERTHREAD(which_clock)) { | 320 | err = posix_cpu_clock_get_task(p, which_clock, tp); |
297 | if (same_thread_group(p, current)) { | ||
298 | error = cpu_clock_sample(which_clock, | ||
299 | p, &rtn); | ||
300 | } | ||
301 | } else { | ||
302 | read_lock(&tasklist_lock); | ||
303 | if (thread_group_leader(p) && p->sighand) { | ||
304 | error = | ||
305 | cpu_clock_sample_group(which_clock, | ||
306 | p, &rtn); | ||
307 | } | ||
308 | read_unlock(&tasklist_lock); | ||
309 | } | ||
310 | } | ||
311 | rcu_read_unlock(); | 321 | rcu_read_unlock(); |
312 | } | 322 | } |
313 | 323 | ||
314 | if (error) | 324 | return err; |
315 | return error; | ||
316 | sample_to_timespec(which_clock, rtn, tp); | ||
317 | return 0; | ||
318 | } | 325 | } |
319 | 326 | ||
320 | 327 | ||
@@ -371,36 +378,40 @@ static int posix_cpu_timer_create(struct k_itimer *new_timer) | |||
371 | */ | 378 | */ |
372 | static int posix_cpu_timer_del(struct k_itimer *timer) | 379 | static int posix_cpu_timer_del(struct k_itimer *timer) |
373 | { | 380 | { |
374 | struct task_struct *p = timer->it.cpu.task; | ||
375 | int ret = 0; | 381 | int ret = 0; |
382 | unsigned long flags; | ||
383 | struct sighand_struct *sighand; | ||
384 | struct task_struct *p = timer->it.cpu.task; | ||
376 | 385 | ||
377 | if (likely(p != NULL)) { | 386 | WARN_ON_ONCE(p == NULL); |
378 | read_lock(&tasklist_lock); | ||
379 | if (unlikely(p->sighand == NULL)) { | ||
380 | /* | ||
381 | * We raced with the reaping of the task. | ||
382 | * The deletion should have cleared us off the list. | ||
383 | */ | ||
384 | BUG_ON(!list_empty(&timer->it.cpu.entry)); | ||
385 | } else { | ||
386 | spin_lock(&p->sighand->siglock); | ||
387 | if (timer->it.cpu.firing) | ||
388 | ret = TIMER_RETRY; | ||
389 | else | ||
390 | list_del(&timer->it.cpu.entry); | ||
391 | spin_unlock(&p->sighand->siglock); | ||
392 | } | ||
393 | read_unlock(&tasklist_lock); | ||
394 | 387 | ||
395 | if (!ret) | 388 | /* |
396 | put_task_struct(p); | 389 | * Protect against sighand release/switch in exit/exec and process/ |
390 | * thread timer list entry concurrent read/writes. | ||
391 | */ | ||
392 | sighand = lock_task_sighand(p, &flags); | ||
393 | if (unlikely(sighand == NULL)) { | ||
394 | /* | ||
395 | * We raced with the reaping of the task. | ||
396 | * The deletion should have cleared us off the list. | ||
397 | */ | ||
398 | WARN_ON_ONCE(!list_empty(&timer->it.cpu.entry)); | ||
399 | } else { | ||
400 | if (timer->it.cpu.firing) | ||
401 | ret = TIMER_RETRY; | ||
402 | else | ||
403 | list_del(&timer->it.cpu.entry); | ||
404 | |||
405 | unlock_task_sighand(p, &flags); | ||
397 | } | 406 | } |
398 | 407 | ||
408 | if (!ret) | ||
409 | put_task_struct(p); | ||
410 | |||
399 | return ret; | 411 | return ret; |
400 | } | 412 | } |
401 | 413 | ||
402 | static void cleanup_timers_list(struct list_head *head, | 414 | static void cleanup_timers_list(struct list_head *head) |
403 | unsigned long long curr) | ||
404 | { | 415 | { |
405 | struct cpu_timer_list *timer, *next; | 416 | struct cpu_timer_list *timer, *next; |
406 | 417 | ||
@@ -414,16 +425,11 @@ static void cleanup_timers_list(struct list_head *head, | |||
414 | * time for later timer_gettime calls to return. | 425 | * time for later timer_gettime calls to return. |
415 | * This must be called with the siglock held. | 426 | * This must be called with the siglock held. |
416 | */ | 427 | */ |
417 | static void cleanup_timers(struct list_head *head, | 428 | static void cleanup_timers(struct list_head *head) |
418 | cputime_t utime, cputime_t stime, | ||
419 | unsigned long long sum_exec_runtime) | ||
420 | { | 429 | { |
421 | 430 | cleanup_timers_list(head); | |
422 | cputime_t ptime = utime + stime; | 431 | cleanup_timers_list(++head); |
423 | 432 | cleanup_timers_list(++head); | |
424 | cleanup_timers_list(head, cputime_to_expires(ptime)); | ||
425 | cleanup_timers_list(++head, cputime_to_expires(utime)); | ||
426 | cleanup_timers_list(++head, sum_exec_runtime); | ||
427 | } | 433 | } |
428 | 434 | ||
429 | /* | 435 | /* |
@@ -433,41 +439,14 @@ static void cleanup_timers(struct list_head *head, | |||
433 | */ | 439 | */ |
434 | void posix_cpu_timers_exit(struct task_struct *tsk) | 440 | void posix_cpu_timers_exit(struct task_struct *tsk) |
435 | { | 441 | { |
436 | cputime_t utime, stime; | ||
437 | |||
438 | add_device_randomness((const void*) &tsk->se.sum_exec_runtime, | 442 | add_device_randomness((const void*) &tsk->se.sum_exec_runtime, |
439 | sizeof(unsigned long long)); | 443 | sizeof(unsigned long long)); |
440 | task_cputime(tsk, &utime, &stime); | 444 | cleanup_timers(tsk->cpu_timers); |
441 | cleanup_timers(tsk->cpu_timers, | ||
442 | utime, stime, tsk->se.sum_exec_runtime); | ||
443 | 445 | ||
444 | } | 446 | } |
445 | void posix_cpu_timers_exit_group(struct task_struct *tsk) | 447 | void posix_cpu_timers_exit_group(struct task_struct *tsk) |
446 | { | 448 | { |
447 | struct signal_struct *const sig = tsk->signal; | 449 | cleanup_timers(tsk->signal->cpu_timers); |
448 | cputime_t utime, stime; | ||
449 | |||
450 | task_cputime(tsk, &utime, &stime); | ||
451 | cleanup_timers(tsk->signal->cpu_timers, | ||
452 | utime + sig->utime, stime + sig->stime, | ||
453 | tsk->se.sum_exec_runtime + sig->sum_sched_runtime); | ||
454 | } | ||
455 | |||
456 | static void clear_dead_task(struct k_itimer *itimer, unsigned long long now) | ||
457 | { | ||
458 | struct cpu_timer_list *timer = &itimer->it.cpu; | ||
459 | |||
460 | /* | ||
461 | * That's all for this thread or process. | ||
462 | * We leave our residual in expires to be reported. | ||
463 | */ | ||
464 | put_task_struct(timer->task); | ||
465 | timer->task = NULL; | ||
466 | if (timer->expires < now) { | ||
467 | timer->expires = 0; | ||
468 | } else { | ||
469 | timer->expires -= now; | ||
470 | } | ||
471 | } | 450 | } |
472 | 451 | ||
473 | static inline int expires_gt(cputime_t expires, cputime_t new_exp) | 452 | static inline int expires_gt(cputime_t expires, cputime_t new_exp) |
@@ -477,8 +456,7 @@ static inline int expires_gt(cputime_t expires, cputime_t new_exp) | |||
477 | 456 | ||
478 | /* | 457 | /* |
479 | * Insert the timer on the appropriate list before any timers that | 458 | * Insert the timer on the appropriate list before any timers that |
480 | * expire later. This must be called with the tasklist_lock held | 459 | * expire later. This must be called with the sighand lock held. |
481 | * for reading, interrupts disabled and p->sighand->siglock taken. | ||
482 | */ | 460 | */ |
483 | static void arm_timer(struct k_itimer *timer) | 461 | static void arm_timer(struct k_itimer *timer) |
484 | { | 462 | { |
@@ -569,7 +547,8 @@ static void cpu_timer_fire(struct k_itimer *timer) | |||
569 | 547 | ||
570 | /* | 548 | /* |
571 | * Sample a process (thread group) timer for the given group_leader task. | 549 | * Sample a process (thread group) timer for the given group_leader task. |
572 | * Must be called with tasklist_lock held for reading. | 550 | * Must be called with task sighand lock held for safe while_each_thread() |
551 | * traversal. | ||
573 | */ | 552 | */ |
574 | static int cpu_timer_sample_group(const clockid_t which_clock, | 553 | static int cpu_timer_sample_group(const clockid_t which_clock, |
575 | struct task_struct *p, | 554 | struct task_struct *p, |
@@ -608,7 +587,8 @@ static DECLARE_WORK(nohz_kick_work, nohz_kick_work_fn); | |||
608 | */ | 587 | */ |
609 | static void posix_cpu_timer_kick_nohz(void) | 588 | static void posix_cpu_timer_kick_nohz(void) |
610 | { | 589 | { |
611 | schedule_work(&nohz_kick_work); | 590 | if (context_tracking_is_enabled()) |
591 | schedule_work(&nohz_kick_work); | ||
612 | } | 592 | } |
613 | 593 | ||
614 | bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk) | 594 | bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk) |
@@ -631,43 +611,39 @@ static inline void posix_cpu_timer_kick_nohz(void) { } | |||
631 | * If we return TIMER_RETRY, it's necessary to release the timer's lock | 611 | * If we return TIMER_RETRY, it's necessary to release the timer's lock |
632 | * and try again. (This happens when the timer is in the middle of firing.) | 612 | * and try again. (This happens when the timer is in the middle of firing.) |
633 | */ | 613 | */ |
634 | static int posix_cpu_timer_set(struct k_itimer *timer, int flags, | 614 | static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, |
635 | struct itimerspec *new, struct itimerspec *old) | 615 | struct itimerspec *new, struct itimerspec *old) |
636 | { | 616 | { |
617 | unsigned long flags; | ||
618 | struct sighand_struct *sighand; | ||
637 | struct task_struct *p = timer->it.cpu.task; | 619 | struct task_struct *p = timer->it.cpu.task; |
638 | unsigned long long old_expires, new_expires, old_incr, val; | 620 | unsigned long long old_expires, new_expires, old_incr, val; |
639 | int ret; | 621 | int ret; |
640 | 622 | ||
641 | if (unlikely(p == NULL)) { | 623 | WARN_ON_ONCE(p == NULL); |
642 | /* | ||
643 | * Timer refers to a dead task's clock. | ||
644 | */ | ||
645 | return -ESRCH; | ||
646 | } | ||
647 | 624 | ||
648 | new_expires = timespec_to_sample(timer->it_clock, &new->it_value); | 625 | new_expires = timespec_to_sample(timer->it_clock, &new->it_value); |
649 | 626 | ||
650 | read_lock(&tasklist_lock); | ||
651 | /* | 627 | /* |
652 | * We need the tasklist_lock to protect against reaping that | 628 | * Protect against sighand release/switch in exit/exec and p->cpu_timers |
653 | * clears p->sighand. If p has just been reaped, we can no | 629 | * and p->signal->cpu_timers read/write in arm_timer() |
630 | */ | ||
631 | sighand = lock_task_sighand(p, &flags); | ||
632 | /* | ||
633 | * If p has just been reaped, we can no | ||
654 | * longer get any information about it at all. | 634 | * longer get any information about it at all. |
655 | */ | 635 | */ |
656 | if (unlikely(p->sighand == NULL)) { | 636 | if (unlikely(sighand == NULL)) { |
657 | read_unlock(&tasklist_lock); | ||
658 | put_task_struct(p); | ||
659 | timer->it.cpu.task = NULL; | ||
660 | return -ESRCH; | 637 | return -ESRCH; |
661 | } | 638 | } |
662 | 639 | ||
663 | /* | 640 | /* |
664 | * Disarm any old timer after extracting its expiry time. | 641 | * Disarm any old timer after extracting its expiry time. |
665 | */ | 642 | */ |
666 | BUG_ON(!irqs_disabled()); | 643 | WARN_ON_ONCE(!irqs_disabled()); |
667 | 644 | ||
668 | ret = 0; | 645 | ret = 0; |
669 | old_incr = timer->it.cpu.incr; | 646 | old_incr = timer->it.cpu.incr; |
670 | spin_lock(&p->sighand->siglock); | ||
671 | old_expires = timer->it.cpu.expires; | 647 | old_expires = timer->it.cpu.expires; |
672 | if (unlikely(timer->it.cpu.firing)) { | 648 | if (unlikely(timer->it.cpu.firing)) { |
673 | timer->it.cpu.firing = -1; | 649 | timer->it.cpu.firing = -1; |
@@ -724,12 +700,11 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, | |||
724 | * disable this firing since we are already reporting | 700 | * disable this firing since we are already reporting |
725 | * it as an overrun (thanks to bump_cpu_timer above). | 701 | * it as an overrun (thanks to bump_cpu_timer above). |
726 | */ | 702 | */ |
727 | spin_unlock(&p->sighand->siglock); | 703 | unlock_task_sighand(p, &flags); |
728 | read_unlock(&tasklist_lock); | ||
729 | goto out; | 704 | goto out; |
730 | } | 705 | } |
731 | 706 | ||
732 | if (new_expires != 0 && !(flags & TIMER_ABSTIME)) { | 707 | if (new_expires != 0 && !(timer_flags & TIMER_ABSTIME)) { |
733 | new_expires += val; | 708 | new_expires += val; |
734 | } | 709 | } |
735 | 710 | ||
@@ -743,9 +718,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, | |||
743 | arm_timer(timer); | 718 | arm_timer(timer); |
744 | } | 719 | } |
745 | 720 | ||
746 | spin_unlock(&p->sighand->siglock); | 721 | unlock_task_sighand(p, &flags); |
747 | read_unlock(&tasklist_lock); | ||
748 | |||
749 | /* | 722 | /* |
750 | * Install the new reload setting, and | 723 | * Install the new reload setting, and |
751 | * set up the signal and overrun bookkeeping. | 724 | * set up the signal and overrun bookkeeping. |
@@ -787,7 +760,8 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) | |||
787 | { | 760 | { |
788 | unsigned long long now; | 761 | unsigned long long now; |
789 | struct task_struct *p = timer->it.cpu.task; | 762 | struct task_struct *p = timer->it.cpu.task; |
790 | int clear_dead; | 763 | |
764 | WARN_ON_ONCE(p == NULL); | ||
791 | 765 | ||
792 | /* | 766 | /* |
793 | * Easy part: convert the reload time. | 767 | * Easy part: convert the reload time. |
@@ -800,52 +774,34 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) | |||
800 | return; | 774 | return; |
801 | } | 775 | } |
802 | 776 | ||
803 | if (unlikely(p == NULL)) { | ||
804 | /* | ||
805 | * This task already died and the timer will never fire. | ||
806 | * In this case, expires is actually the dead value. | ||
807 | */ | ||
808 | dead: | ||
809 | sample_to_timespec(timer->it_clock, timer->it.cpu.expires, | ||
810 | &itp->it_value); | ||
811 | return; | ||
812 | } | ||
813 | |||
814 | /* | 777 | /* |
815 | * Sample the clock to take the difference with the expiry time. | 778 | * Sample the clock to take the difference with the expiry time. |
816 | */ | 779 | */ |
817 | if (CPUCLOCK_PERTHREAD(timer->it_clock)) { | 780 | if (CPUCLOCK_PERTHREAD(timer->it_clock)) { |
818 | cpu_clock_sample(timer->it_clock, p, &now); | 781 | cpu_clock_sample(timer->it_clock, p, &now); |
819 | clear_dead = p->exit_state; | ||
820 | } else { | 782 | } else { |
821 | read_lock(&tasklist_lock); | 783 | struct sighand_struct *sighand; |
822 | if (unlikely(p->sighand == NULL)) { | 784 | unsigned long flags; |
785 | |||
786 | /* | ||
787 | * Protect against sighand release/switch in exit/exec and | ||
788 | * also make timer sampling safe if it ends up calling | ||
789 | * thread_group_cputime(). | ||
790 | */ | ||
791 | sighand = lock_task_sighand(p, &flags); | ||
792 | if (unlikely(sighand == NULL)) { | ||
823 | /* | 793 | /* |
824 | * The process has been reaped. | 794 | * The process has been reaped. |
825 | * We can't even collect a sample any more. | 795 | * We can't even collect a sample any more. |
826 | * Call the timer disarmed, nothing else to do. | 796 | * Call the timer disarmed, nothing else to do. |
827 | */ | 797 | */ |
828 | put_task_struct(p); | ||
829 | timer->it.cpu.task = NULL; | ||
830 | timer->it.cpu.expires = 0; | 798 | timer->it.cpu.expires = 0; |
831 | read_unlock(&tasklist_lock); | 799 | sample_to_timespec(timer->it_clock, timer->it.cpu.expires, |
832 | goto dead; | 800 | &itp->it_value); |
833 | } else { | 801 | } else { |
834 | cpu_timer_sample_group(timer->it_clock, p, &now); | 802 | cpu_timer_sample_group(timer->it_clock, p, &now); |
835 | clear_dead = (unlikely(p->exit_state) && | 803 | unlock_task_sighand(p, &flags); |
836 | thread_group_empty(p)); | ||
837 | } | 804 | } |
838 | read_unlock(&tasklist_lock); | ||
839 | } | ||
840 | |||
841 | if (unlikely(clear_dead)) { | ||
842 | /* | ||
843 | * We've noticed that the thread is dead, but | ||
844 | * not yet reaped. Take this opportunity to | ||
845 | * drop our task ref. | ||
846 | */ | ||
847 | clear_dead_task(timer, now); | ||
848 | goto dead; | ||
849 | } | 805 | } |
850 | 806 | ||
851 | if (now < timer->it.cpu.expires) { | 807 | if (now < timer->it.cpu.expires) { |
@@ -1059,14 +1015,12 @@ static void check_process_timers(struct task_struct *tsk, | |||
1059 | */ | 1015 | */ |
1060 | void posix_cpu_timer_schedule(struct k_itimer *timer) | 1016 | void posix_cpu_timer_schedule(struct k_itimer *timer) |
1061 | { | 1017 | { |
1018 | struct sighand_struct *sighand; | ||
1019 | unsigned long flags; | ||
1062 | struct task_struct *p = timer->it.cpu.task; | 1020 | struct task_struct *p = timer->it.cpu.task; |
1063 | unsigned long long now; | 1021 | unsigned long long now; |
1064 | 1022 | ||
1065 | if (unlikely(p == NULL)) | 1023 | WARN_ON_ONCE(p == NULL); |
1066 | /* | ||
1067 | * The task was cleaned up already, no future firings. | ||
1068 | */ | ||
1069 | goto out; | ||
1070 | 1024 | ||
1071 | /* | 1025 | /* |
1072 | * Fetch the current sample and update the timer's expiry time. | 1026 | * Fetch the current sample and update the timer's expiry time. |
@@ -1074,49 +1028,45 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) | |||
1074 | if (CPUCLOCK_PERTHREAD(timer->it_clock)) { | 1028 | if (CPUCLOCK_PERTHREAD(timer->it_clock)) { |
1075 | cpu_clock_sample(timer->it_clock, p, &now); | 1029 | cpu_clock_sample(timer->it_clock, p, &now); |
1076 | bump_cpu_timer(timer, now); | 1030 | bump_cpu_timer(timer, now); |
1077 | if (unlikely(p->exit_state)) { | 1031 | if (unlikely(p->exit_state)) |
1078 | clear_dead_task(timer, now); | 1032 | goto out; |
1033 | |||
1034 | /* Protect timer list r/w in arm_timer() */ | ||
1035 | sighand = lock_task_sighand(p, &flags); | ||
1036 | if (!sighand) | ||
1079 | goto out; | 1037 | goto out; |
1080 | } | ||
1081 | read_lock(&tasklist_lock); /* arm_timer needs it. */ | ||
1082 | spin_lock(&p->sighand->siglock); | ||
1083 | } else { | 1038 | } else { |
1084 | read_lock(&tasklist_lock); | 1039 | /* |
1085 | if (unlikely(p->sighand == NULL)) { | 1040 | * Protect arm_timer() and timer sampling in case of call to |
1041 | * thread_group_cputime(). | ||
1042 | */ | ||
1043 | sighand = lock_task_sighand(p, &flags); | ||
1044 | if (unlikely(sighand == NULL)) { | ||
1086 | /* | 1045 | /* |
1087 | * The process has been reaped. | 1046 | * The process has been reaped. |
1088 | * We can't even collect a sample any more. | 1047 | * We can't even collect a sample any more. |
1089 | */ | 1048 | */ |
1090 | put_task_struct(p); | ||
1091 | timer->it.cpu.task = p = NULL; | ||
1092 | timer->it.cpu.expires = 0; | 1049 | timer->it.cpu.expires = 0; |
1093 | goto out_unlock; | 1050 | goto out; |
1094 | } else if (unlikely(p->exit_state) && thread_group_empty(p)) { | 1051 | } else if (unlikely(p->exit_state) && thread_group_empty(p)) { |
1095 | /* | 1052 | unlock_task_sighand(p, &flags); |
1096 | * We've noticed that the thread is dead, but | 1053 | /* Optimizations: if the process is dying, no need to rearm */ |
1097 | * not yet reaped. Take this opportunity to | 1054 | goto out; |
1098 | * drop our task ref. | ||
1099 | */ | ||
1100 | cpu_timer_sample_group(timer->it_clock, p, &now); | ||
1101 | clear_dead_task(timer, now); | ||
1102 | goto out_unlock; | ||
1103 | } | 1055 | } |
1104 | spin_lock(&p->sighand->siglock); | ||
1105 | cpu_timer_sample_group(timer->it_clock, p, &now); | 1056 | cpu_timer_sample_group(timer->it_clock, p, &now); |
1106 | bump_cpu_timer(timer, now); | 1057 | bump_cpu_timer(timer, now); |
1107 | /* Leave the tasklist_lock locked for the call below. */ | 1058 | /* Leave the sighand locked for the call below. */ |
1108 | } | 1059 | } |
1109 | 1060 | ||
1110 | /* | 1061 | /* |
1111 | * Now re-arm for the new expiry time. | 1062 | * Now re-arm for the new expiry time. |
1112 | */ | 1063 | */ |
1113 | BUG_ON(!irqs_disabled()); | 1064 | WARN_ON_ONCE(!irqs_disabled()); |
1114 | arm_timer(timer); | 1065 | arm_timer(timer); |
1115 | spin_unlock(&p->sighand->siglock); | 1066 | unlock_task_sighand(p, &flags); |
1116 | |||
1117 | out_unlock: | ||
1118 | read_unlock(&tasklist_lock); | ||
1119 | 1067 | ||
1068 | /* Kick full dynticks CPUs in case they need to tick on the new timer */ | ||
1069 | posix_cpu_timer_kick_nohz(); | ||
1120 | out: | 1070 | out: |
1121 | timer->it_overrun_last = timer->it_overrun; | 1071 | timer->it_overrun_last = timer->it_overrun; |
1122 | timer->it_overrun = -1; | 1072 | timer->it_overrun = -1; |
@@ -1200,7 +1150,7 @@ void run_posix_cpu_timers(struct task_struct *tsk) | |||
1200 | struct k_itimer *timer, *next; | 1150 | struct k_itimer *timer, *next; |
1201 | unsigned long flags; | 1151 | unsigned long flags; |
1202 | 1152 | ||
1203 | BUG_ON(!irqs_disabled()); | 1153 | WARN_ON_ONCE(!irqs_disabled()); |
1204 | 1154 | ||
1205 | /* | 1155 | /* |
1206 | * The fast path checks that there are no expired thread or thread | 1156 | * The fast path checks that there are no expired thread or thread |
@@ -1256,13 +1206,6 @@ void run_posix_cpu_timers(struct task_struct *tsk) | |||
1256 | cpu_timer_fire(timer); | 1206 | cpu_timer_fire(timer); |
1257 | spin_unlock(&timer->it_lock); | 1207 | spin_unlock(&timer->it_lock); |
1258 | } | 1208 | } |
1259 | |||
1260 | /* | ||
1261 | * In case some timers were rescheduled after the queue got emptied, | ||
1262 | * wake up full dynticks CPUs. | ||
1263 | */ | ||
1264 | if (tsk->signal->cputimer.running) | ||
1265 | posix_cpu_timer_kick_nohz(); | ||
1266 | } | 1209 | } |
1267 | 1210 | ||
1268 | /* | 1211 | /* |
@@ -1274,7 +1217,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, | |||
1274 | { | 1217 | { |
1275 | unsigned long long now; | 1218 | unsigned long long now; |
1276 | 1219 | ||
1277 | BUG_ON(clock_idx == CPUCLOCK_SCHED); | 1220 | WARN_ON_ONCE(clock_idx == CPUCLOCK_SCHED); |
1278 | cpu_timer_sample_group(clock_idx, tsk, &now); | 1221 | cpu_timer_sample_group(clock_idx, tsk, &now); |
1279 | 1222 | ||
1280 | if (oldval) { | 1223 | if (oldval) { |
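Editor's note: the posix-cpu-timers conversion above drops tasklist_lock in favor of lock_task_sighand(), which pins ->sighand under RCU and returns NULL once the task has been reaped. A minimal sketch of that pattern (illustrative only; rearm_example() is a made-up name, not part of the patch):

    #include <linux/sched.h>

    static bool rearm_example(struct task_struct *p)
    {
            struct sighand_struct *sighand;
            unsigned long flags;

            sighand = lock_task_sighand(p, &flags);  /* NULL if already reaped */
            if (!sighand)
                    return false;   /* nothing left to (re)arm against */
            /* timer-list manipulation, e.g. arm_timer(), runs under siglock here */
            unlock_task_sighand(p, &flags);
            return true;
    }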
diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c index d09dd10c5a5e..9a58bc258810 100644 --- a/kernel/power/block_io.c +++ b/kernel/power/block_io.c | |||
@@ -32,7 +32,7 @@ static int submit(int rw, struct block_device *bdev, sector_t sector, | |||
32 | struct bio *bio; | 32 | struct bio *bio; |
33 | 33 | ||
34 | bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); | 34 | bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); |
35 | bio->bi_sector = sector; | 35 | bio->bi_iter.bi_sector = sector; |
36 | bio->bi_bdev = bdev; | 36 | bio->bi_bdev = bdev; |
37 | bio->bi_end_io = end_swap_bio_read; | 37 | bio->bi_end_io = end_swap_bio_read; |
38 | 38 | ||
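Editor's note: the one-liner above tracks the immutable-biovec rework, where the starting sector moved from bio->bi_sector into the embedded iterator. A sketch of the updated initialization, assuming the 3.14-era struct bio layout:

    #include <linux/bio.h>

    static void fill_bio_example(struct bio *bio, struct block_device *bdev,
                                 sector_t sector)
    {
            bio->bi_iter.bi_sector = sector;   /* formerly bio->bi_sector */
            bio->bi_bdev = bdev;               /* unchanged */
    }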
diff --git a/kernel/power/console.c b/kernel/power/console.c index eacb8bd8cab4..aba9c545a0e3 100644 --- a/kernel/power/console.c +++ b/kernel/power/console.c | |||
@@ -9,6 +9,7 @@ | |||
9 | #include <linux/kbd_kern.h> | 9 | #include <linux/kbd_kern.h> |
10 | #include <linux/vt.h> | 10 | #include <linux/vt.h> |
11 | #include <linux/module.h> | 11 | #include <linux/module.h> |
12 | #include <linux/slab.h> | ||
12 | #include "power.h" | 13 | #include "power.h" |
13 | 14 | ||
14 | #define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) | 15 | #define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) |
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 0121dab83f43..37170d4dd9a6 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c | |||
@@ -82,6 +82,7 @@ void hibernation_set_ops(const struct platform_hibernation_ops *ops) | |||
82 | 82 | ||
83 | unlock_system_sleep(); | 83 | unlock_system_sleep(); |
84 | } | 84 | } |
85 | EXPORT_SYMBOL_GPL(hibernation_set_ops); | ||
85 | 86 | ||
86 | static bool entering_platform_hibernation; | 87 | static bool entering_platform_hibernation; |
87 | 88 | ||
@@ -293,10 +294,10 @@ static int create_image(int platform_mode) | |||
293 | error); | 294 | error); |
294 | /* Restore control flow magically appears here */ | 295 | /* Restore control flow magically appears here */ |
295 | restore_processor_state(); | 296 | restore_processor_state(); |
296 | if (!in_suspend) { | 297 | if (!in_suspend) |
297 | events_check_enabled = false; | 298 | events_check_enabled = false; |
298 | platform_leave(platform_mode); | 299 | |
299 | } | 300 | platform_leave(platform_mode); |
300 | 301 | ||
301 | Power_up: | 302 | Power_up: |
302 | syscore_resume(); | 303 | syscore_resume(); |
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index b38109e204af..d9f61a145802 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
@@ -637,7 +637,7 @@ __register_nosave_region(unsigned long start_pfn, unsigned long end_pfn, | |||
637 | BUG_ON(!region); | 637 | BUG_ON(!region); |
638 | } else | 638 | } else |
639 | /* This allocation cannot fail */ | 639 | /* This allocation cannot fail */ |
640 | region = alloc_bootmem(sizeof(struct nosave_region)); | 640 | region = memblock_virt_alloc(sizeof(struct nosave_region), 0); |
641 | region->start_pfn = start_pfn; | 641 | region->start_pfn = start_pfn; |
642 | region->end_pfn = end_pfn; | 642 | region->end_pfn = end_pfn; |
643 | list_add_tail(®ion->list, &nosave_regions); | 643 | list_add_tail(®ion->list, &nosave_regions); |
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index be7c86bae576..4dae9cbe9259 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c | |||
@@ -757,14 +757,10 @@ void __init setup_log_buf(int early) | |||
757 | return; | 757 | return; |
758 | 758 | ||
759 | if (early) { | 759 | if (early) { |
760 | unsigned long mem; | 760 | new_log_buf = |
761 | 761 | memblock_virt_alloc(new_log_buf_len, PAGE_SIZE); | |
762 | mem = memblock_alloc(new_log_buf_len, PAGE_SIZE); | ||
763 | if (!mem) | ||
764 | return; | ||
765 | new_log_buf = __va(mem); | ||
766 | } else { | 762 | } else { |
767 | new_log_buf = alloc_bootmem_nopanic(new_log_buf_len); | 763 | new_log_buf = memblock_virt_alloc_nopanic(new_log_buf_len, 0); |
768 | } | 764 | } |
769 | 765 | ||
770 | if (unlikely(!new_log_buf)) { | 766 | if (unlikely(!new_log_buf)) { |
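Editor's note: the early-boot allocation rewrite above is part of the bootmem-to-memblock conversion. The old memblock_alloc() returned a physical address that had to be passed through __va() and checked by hand; memblock_virt_alloc() hands back a usable virtual address and panics on failure, while the _nopanic variant returns NULL instead. A hedged sketch of the two variants (the surrounding function is illustrative, not from the patch):

    #include <linux/bootmem.h>

    static void __init log_buf_alloc_example(unsigned long len)
    {
            char *buf;

            /* panics on failure, like the old alloc_bootmem() */
            buf = memblock_virt_alloc(len, PAGE_SIZE);

            /* returns NULL on failure, like alloc_bootmem_nopanic() */
            buf = memblock_virt_alloc_nopanic(len, 0);
            if (!buf)
                    return;
    }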
@@ -1080,7 +1076,6 @@ static int syslog_print_all(char __user *buf, int size, bool clear) | |||
1080 | next_seq = log_next_seq; | 1076 | next_seq = log_next_seq; |
1081 | 1077 | ||
1082 | len = 0; | 1078 | len = 0; |
1083 | prev = 0; | ||
1084 | while (len >= 0 && seq < next_seq) { | 1079 | while (len >= 0 && seq < next_seq) { |
1085 | struct printk_log *msg = log_from_idx(idx); | 1080 | struct printk_log *msg = log_from_idx(idx); |
1086 | int textlen; | 1081 | int textlen; |
@@ -1599,10 +1594,13 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
1599 | * either merge it with the current buffer and flush, or if | 1594 | * either merge it with the current buffer and flush, or if |
1600 | * there was a race with interrupts (prefix == true) then just | 1595 | * there was a race with interrupts (prefix == true) then just |
1601 | * flush it out and store this line separately. | 1596 | * flush it out and store this line separately. |
1597 | * If the preceding printk was from a different task and missed | ||
1598 | * a newline, flush and append the newline. | ||
1602 | */ | 1599 | */ |
1603 | if (cont.len && cont.owner == current) { | 1600 | if (cont.len) { |
1604 | if (!(lflags & LOG_PREFIX)) | 1601 | if (cont.owner == current && !(lflags & LOG_PREFIX)) |
1605 | stored = cont_add(facility, level, text, text_len); | 1602 | stored = cont_add(facility, level, text, |
1603 | text_len); | ||
1606 | cont_flush(LOG_NEWLINE); | 1604 | cont_flush(LOG_NEWLINE); |
1607 | } | 1605 | } |
1608 | 1606 | ||
@@ -2789,7 +2787,6 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, | |||
2789 | next_idx = idx; | 2787 | next_idx = idx; |
2790 | 2788 | ||
2791 | l = 0; | 2789 | l = 0; |
2792 | prev = 0; | ||
2793 | while (seq < dumper->next_seq) { | 2790 | while (seq < dumper->next_seq) { |
2794 | struct printk_log *msg = log_from_idx(idx); | 2791 | struct printk_log *msg = log_from_idx(idx); |
2795 | 2792 | ||
diff --git a/kernel/profile.c b/kernel/profile.c index 6631e1ef55ab..ebdd9c1a86b4 100644 --- a/kernel/profile.c +++ b/kernel/profile.c | |||
@@ -549,14 +549,14 @@ static int create_hash_tables(void) | |||
549 | struct page *page; | 549 | struct page *page; |
550 | 550 | ||
551 | page = alloc_pages_exact_node(node, | 551 | page = alloc_pages_exact_node(node, |
552 | GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, | 552 | GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE, |
553 | 0); | 553 | 0); |
554 | if (!page) | 554 | if (!page) |
555 | goto out_cleanup; | 555 | goto out_cleanup; |
556 | per_cpu(cpu_profile_hits, cpu)[1] | 556 | per_cpu(cpu_profile_hits, cpu)[1] |
557 | = (struct profile_hit *)page_address(page); | 557 | = (struct profile_hit *)page_address(page); |
558 | page = alloc_pages_exact_node(node, | 558 | page = alloc_pages_exact_node(node, |
559 | GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, | 559 | GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE, |
560 | 0); | 560 | 0); |
561 | if (!page) | 561 | if (!page) |
562 | goto out_cleanup; | 562 | goto out_cleanup; |
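Editor's note: the flag change above is subtler than it looks. In this era's gfp.h (quoted from memory, so treat the exact definition as an assumption) GFP_THISNODE also implies no-retry and no-warn semantics meant for slab internals, whereas plain __GFP_THISNODE only constrains the allocation to the requested node, which is all this caller wants:

    /* roughly, pre-4.x include/linux/gfp.h (assumption, from memory) */
    #ifdef CONFIG_NUMA
    #define GFP_THISNODE    (__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY)
    #else
    #define GFP_THISNODE    ((__force gfp_t)0)
    #endif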
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 7859a0a3951e..79c3877e9c5b 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h | |||
@@ -96,19 +96,22 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head) | |||
96 | } | 96 | } |
97 | #endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ | 97 | #endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ |
98 | 98 | ||
99 | extern void kfree(const void *); | 99 | void kfree(const void *); |
100 | 100 | ||
101 | static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) | 101 | static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) |
102 | { | 102 | { |
103 | unsigned long offset = (unsigned long)head->func; | 103 | unsigned long offset = (unsigned long)head->func; |
104 | 104 | ||
105 | rcu_lock_acquire(&rcu_callback_map); | ||
105 | if (__is_kfree_rcu_offset(offset)) { | 106 | if (__is_kfree_rcu_offset(offset)) { |
106 | RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset)); | 107 | RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset)); |
107 | kfree((void *)head - offset); | 108 | kfree((void *)head - offset); |
109 | rcu_lock_release(&rcu_callback_map); | ||
108 | return 1; | 110 | return 1; |
109 | } else { | 111 | } else { |
110 | RCU_TRACE(trace_rcu_invoke_callback(rn, head)); | 112 | RCU_TRACE(trace_rcu_invoke_callback(rn, head)); |
111 | head->func(head); | 113 | head->func(head); |
114 | rcu_lock_release(&rcu_callback_map); | ||
112 | return 0; | 115 | return 0; |
113 | } | 116 | } |
114 | } | 117 | } |
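Editor's note: for readers unfamiliar with the offset trick that __rcu_reclaim() depends on: kfree_rcu(ptr, field) stores offsetof(typeof(*ptr), field) in head->func instead of a real function pointer, so the reclaim path can recover the original allocation with "(void *)head - offset" and kfree() it directly, without invoking a callback. A small illustrative sketch (struct foo and free_foo_example() are made-up names):

    #include <linux/rcupdate.h>
    #include <linux/slab.h>

    struct foo {
            int data;
            struct rcu_head rcu;    /* offsetof(struct foo, rcu) is a "kfree offset" */
    };

    static void free_foo_example(struct foo *fp)
    {
            kfree_rcu(fp, rcu);     /* no callback function required */
    }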
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index 01d5ccb8bfe3..3318d8284384 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c | |||
@@ -363,6 +363,29 @@ static void srcu_flip(struct srcu_struct *sp) | |||
363 | /* | 363 | /* |
364 | * Enqueue an SRCU callback on the specified srcu_struct structure, | 364 | * Enqueue an SRCU callback on the specified srcu_struct structure, |
365 | * initiating grace-period processing if it is not already running. | 365 | * initiating grace-period processing if it is not already running. |
366 | * | ||
367 | * Note that all CPUs must agree that the grace period extended beyond | ||
368 | * all pre-existing SRCU read-side critical sections. On systems with | ||
369 | * more than one CPU, this means that when "func()" is invoked, each CPU | ||
370 | * is guaranteed to have executed a full memory barrier since the end of | ||
371 | * its last corresponding SRCU read-side critical section whose beginning | ||
372 | * preceded the call to call_rcu(). It also means that each CPU executing | ||
373 | * an SRCU read-side critical section that continues beyond the start of | ||
374 | * "func()" must have executed a memory barrier after the call_rcu() | ||
375 | * but before the beginning of that SRCU read-side critical section. | ||
376 | * Note that these guarantees include CPUs that are offline, idle, or | ||
377 | * executing in user mode, as well as CPUs that are executing in the kernel. | ||
378 | * | ||
379 | * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the | ||
380 | * resulting SRCU callback function "func()", then both CPU A and CPU | ||
381 | * B are guaranteed to execute a full memory barrier during the time | ||
382 | * interval between the call to call_rcu() and the invocation of "func()". | ||
383 | * This guarantee applies even if CPU A and CPU B are the same CPU (but | ||
384 | * again only if the system has more than one CPU). | ||
385 | * | ||
386 | * Of course, these guarantees apply only for invocations of call_srcu(), | ||
387 | * srcu_read_lock(), and srcu_read_unlock() that are all passed the same | ||
388 | * srcu_struct structure. | ||
366 | */ | 389 | */ |
367 | void call_srcu(struct srcu_struct *sp, struct rcu_head *head, | 390 | void call_srcu(struct srcu_struct *sp, struct rcu_head *head, |
368 | void (*func)(struct rcu_head *head)) | 391 | void (*func)(struct rcu_head *head)) |
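Editor's note: the new comment block spells out the memory-ordering guarantees; for reference, a minimal hypothetical use of call_srcu() that those guarantees apply to. The names obj, obj_srcu and obj_free_cb() are illustrative, and init_srcu_struct() is assumed to run elsewhere:

    #include <linux/srcu.h>
    #include <linux/slab.h>

    struct obj {
            struct rcu_head rh;
            int payload;
    };

    static struct srcu_struct obj_srcu;

    static void obj_free_cb(struct rcu_head *rh)
    {
            kfree(container_of(rh, struct obj, rh));
    }

    static void obj_retire(struct obj *o)
    {
            /* obj_free_cb() runs only after all pre-existing SRCU readers finish */
            call_srcu(&obj_srcu, &o->rh, obj_free_cb);
    }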
@@ -459,7 +482,30 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount) | |||
459 | * Note that it is illegal to call synchronize_srcu() from the corresponding | 482 | * Note that it is illegal to call synchronize_srcu() from the corresponding |
460 | * SRCU read-side critical section; doing so will result in deadlock. | 483 | * SRCU read-side critical section; doing so will result in deadlock. |
461 | * However, it is perfectly legal to call synchronize_srcu() on one | 484 | * However, it is perfectly legal to call synchronize_srcu() on one |
462 | * srcu_struct from some other srcu_struct's read-side critical section. | 485 | * srcu_struct from some other srcu_struct's read-side critical section, |
486 | * as long as the resulting graph of srcu_structs is acyclic. | ||
487 | * | ||
488 | * There are memory-ordering constraints implied by synchronize_srcu(). | ||
489 | * On systems with more than one CPU, when synchronize_srcu() returns, | ||
490 | * each CPU is guaranteed to have executed a full memory barrier since | ||
491 | * the end of its last corresponding SRCU-sched read-side critical section | ||
492 | * whose beginning preceded the call to synchronize_srcu(). In addition, | ||
493 | * each CPU having an SRCU read-side critical section that extends beyond | ||
494 | * the return from synchronize_srcu() is guaranteed to have executed a | ||
495 | * full memory barrier after the beginning of synchronize_srcu() and before | ||
496 | * the beginning of that SRCU read-side critical section. Note that these | ||
497 | * guarantees include CPUs that are offline, idle, or executing in user mode, | ||
498 | * as well as CPUs that are executing in the kernel. | ||
499 | * | ||
500 | * Furthermore, if CPU A invoked synchronize_srcu(), which returned | ||
501 | * to its caller on CPU B, then both CPU A and CPU B are guaranteed | ||
502 | * to have executed a full memory barrier during the execution of | ||
503 | * synchronize_srcu(). This guarantee applies even if CPU A and CPU B | ||
504 | * are the same CPU, but again only if the system has more than one CPU. | ||
505 | * | ||
506 | * Of course, these memory-ordering guarantees apply only when | ||
507 | * synchronize_srcu(), srcu_read_lock(), and srcu_read_unlock() are | ||
508 | * passed the same srcu_struct structure. | ||
463 | */ | 509 | */ |
464 | void synchronize_srcu(struct srcu_struct *sp) | 510 | void synchronize_srcu(struct srcu_struct *sp) |
465 | { | 511 | { |
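Editor's note: to make the updated comment concrete, the reader/updater pairing it describes looks roughly like this (a sketch reusing the hypothetical obj_srcu from the note above):

    static int read_obj_example(void)
    {
            int idx, val;

            idx = srcu_read_lock(&obj_srcu);   /* SRCU readers may block */
            val = 0;                           /* ... dereference protected data ... */
            srcu_read_unlock(&obj_srcu, idx);
            return val;
    }

    static void update_obj_example(void)
    {
            /* ... unpublish the old data ... */
            synchronize_srcu(&obj_srcu);   /* waits for all pre-existing readers above */
            /* ... now safe to free it ... */
    }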
@@ -476,12 +522,8 @@ EXPORT_SYMBOL_GPL(synchronize_srcu); | |||
476 | * Wait for an SRCU grace period to elapse, but be more aggressive about | 522 | * Wait for an SRCU grace period to elapse, but be more aggressive about |
477 | * spinning rather than blocking when waiting. | 523 | * spinning rather than blocking when waiting. |
478 | * | 524 | * |
479 | * Note that it is also illegal to call synchronize_srcu_expedited() | 525 | * Note that synchronize_srcu_expedited() has the same deadlock and |
480 | * from the corresponding SRCU read-side critical section; | 526 | * memory-ordering properties as does synchronize_srcu(). |
481 | * doing so will result in deadlock. However, it is perfectly legal | ||
482 | * to call synchronize_srcu_expedited() on one srcu_struct from some | ||
483 | * other srcu_struct's read-side critical section, as long as | ||
484 | * the resulting graph of srcu_structs is acyclic. | ||
485 | */ | 527 | */ |
486 | void synchronize_srcu_expedited(struct srcu_struct *sp) | 528 | void synchronize_srcu_expedited(struct srcu_struct *sp) |
487 | { | 529 | { |
@@ -491,6 +533,7 @@ EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); | |||
491 | 533 | ||
492 | /** | 534 | /** |
493 | * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete. | 535 | * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete. |
536 | * @sp: srcu_struct on which to wait for in-flight callbacks. | ||
494 | */ | 537 | */ |
495 | void srcu_barrier(struct srcu_struct *sp) | 538 | void srcu_barrier(struct srcu_struct *sp) |
496 | { | 539 | { |
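Editor's note: the kernel-doc fix above only adds the missing @sp line; as a usage aside, srcu_barrier() is what makes teardown safe once no new call_srcu() invocations can occur (sketch, continuing the hypothetical obj_srcu example):

    static void obj_srcu_teardown_example(void)
    {
            srcu_barrier(&obj_srcu);        /* wait for queued obj_free_cb() callbacks */
            cleanup_srcu_struct(&obj_srcu); /* then release the srcu_struct itself */
    }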
diff --git a/kernel/rcu/torture.c b/kernel/rcu/torture.c index 3929cd451511..732f8ae3086a 100644 --- a/kernel/rcu/torture.c +++ b/kernel/rcu/torture.c | |||
@@ -139,8 +139,6 @@ MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); | |||
139 | #define VERBOSE_PRINTK_ERRSTRING(s) \ | 139 | #define VERBOSE_PRINTK_ERRSTRING(s) \ |
140 | do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0) | 140 | do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0) |
141 | 141 | ||
142 | static char printk_buf[4096]; | ||
143 | |||
144 | static int nrealreaders; | 142 | static int nrealreaders; |
145 | static struct task_struct *writer_task; | 143 | static struct task_struct *writer_task; |
146 | static struct task_struct **fakewriter_tasks; | 144 | static struct task_struct **fakewriter_tasks; |
@@ -376,7 +374,7 @@ struct rcu_torture_ops { | |||
376 | void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); | 374 | void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); |
377 | void (*cb_barrier)(void); | 375 | void (*cb_barrier)(void); |
378 | void (*fqs)(void); | 376 | void (*fqs)(void); |
379 | int (*stats)(char *page); | 377 | void (*stats)(char *page); |
380 | int irq_capable; | 378 | int irq_capable; |
381 | int can_boost; | 379 | int can_boost; |
382 | const char *name; | 380 | const char *name; |
@@ -578,21 +576,19 @@ static void srcu_torture_barrier(void) | |||
578 | srcu_barrier(&srcu_ctl); | 576 | srcu_barrier(&srcu_ctl); |
579 | } | 577 | } |
580 | 578 | ||
581 | static int srcu_torture_stats(char *page) | 579 | static void srcu_torture_stats(char *page) |
582 | { | 580 | { |
583 | int cnt = 0; | ||
584 | int cpu; | 581 | int cpu; |
585 | int idx = srcu_ctl.completed & 0x1; | 582 | int idx = srcu_ctl.completed & 0x1; |
586 | 583 | ||
587 | cnt += sprintf(&page[cnt], "%s%s per-CPU(idx=%d):", | 584 | page += sprintf(page, "%s%s per-CPU(idx=%d):", |
588 | torture_type, TORTURE_FLAG, idx); | 585 | torture_type, TORTURE_FLAG, idx); |
589 | for_each_possible_cpu(cpu) { | 586 | for_each_possible_cpu(cpu) { |
590 | cnt += sprintf(&page[cnt], " %d(%lu,%lu)", cpu, | 587 | page += sprintf(page, " %d(%lu,%lu)", cpu, |
591 | per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx], | 588 | per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx], |
592 | per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]); | 589 | per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]); |
593 | } | 590 | } |
594 | cnt += sprintf(&page[cnt], "\n"); | 591 | sprintf(page, "\n"); |
595 | return cnt; | ||
596 | } | 592 | } |
597 | 593 | ||
598 | static void srcu_torture_synchronize_expedited(void) | 594 | static void srcu_torture_synchronize_expedited(void) |
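Editor's note: the stats-callback conversion above replaces the running "cnt" index with pointer bumping. It works because sprintf() returns the number of characters written (excluding the trailing NUL), so advancing the destination pointer by that amount appends each piece after the previous one. In miniature (buf and n are placeholder names):

    char *p = buf;                       /* buf: caller-supplied, large-enough buffer */

    p += sprintf(p, "header:");          /* p now points just past "header:" */
    p += sprintf(p, " %d items", n);     /* appended after it, not overwriting it */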
@@ -1052,10 +1048,9 @@ rcu_torture_reader(void *arg) | |||
1052 | /* | 1048 | /* |
1053 | * Create an RCU-torture statistics message in the specified buffer. | 1049 | * Create an RCU-torture statistics message in the specified buffer. |
1054 | */ | 1050 | */ |
1055 | static int | 1051 | static void |
1056 | rcu_torture_printk(char *page) | 1052 | rcu_torture_printk(char *page) |
1057 | { | 1053 | { |
1058 | int cnt = 0; | ||
1059 | int cpu; | 1054 | int cpu; |
1060 | int i; | 1055 | int i; |
1061 | long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; | 1056 | long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; |
@@ -1071,8 +1066,8 @@ rcu_torture_printk(char *page) | |||
1071 | if (pipesummary[i] != 0) | 1066 | if (pipesummary[i] != 0) |
1072 | break; | 1067 | break; |
1073 | } | 1068 | } |
1074 | cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); | 1069 | page += sprintf(page, "%s%s ", torture_type, TORTURE_FLAG); |
1075 | cnt += sprintf(&page[cnt], | 1070 | page += sprintf(page, |
1076 | "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ", | 1071 | "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ", |
1077 | rcu_torture_current, | 1072 | rcu_torture_current, |
1078 | rcu_torture_current_version, | 1073 | rcu_torture_current_version, |
@@ -1080,53 +1075,52 @@ rcu_torture_printk(char *page) | |||
1080 | atomic_read(&n_rcu_torture_alloc), | 1075 | atomic_read(&n_rcu_torture_alloc), |
1081 | atomic_read(&n_rcu_torture_alloc_fail), | 1076 | atomic_read(&n_rcu_torture_alloc_fail), |
1082 | atomic_read(&n_rcu_torture_free)); | 1077 | atomic_read(&n_rcu_torture_free)); |
1083 | cnt += sprintf(&page[cnt], "rtmbe: %d rtbke: %ld rtbre: %ld ", | 1078 | page += sprintf(page, "rtmbe: %d rtbke: %ld rtbre: %ld ", |
1084 | atomic_read(&n_rcu_torture_mberror), | 1079 | atomic_read(&n_rcu_torture_mberror), |
1085 | n_rcu_torture_boost_ktrerror, | 1080 | n_rcu_torture_boost_ktrerror, |
1086 | n_rcu_torture_boost_rterror); | 1081 | n_rcu_torture_boost_rterror); |
1087 | cnt += sprintf(&page[cnt], "rtbf: %ld rtb: %ld nt: %ld ", | 1082 | page += sprintf(page, "rtbf: %ld rtb: %ld nt: %ld ", |
1088 | n_rcu_torture_boost_failure, | 1083 | n_rcu_torture_boost_failure, |
1089 | n_rcu_torture_boosts, | 1084 | n_rcu_torture_boosts, |
1090 | n_rcu_torture_timers); | 1085 | n_rcu_torture_timers); |
1091 | cnt += sprintf(&page[cnt], | 1086 | page += sprintf(page, |
1092 | "onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ", | 1087 | "onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ", |
1093 | n_online_successes, n_online_attempts, | 1088 | n_online_successes, n_online_attempts, |
1094 | n_offline_successes, n_offline_attempts, | 1089 | n_offline_successes, n_offline_attempts, |
1095 | min_online, max_online, | 1090 | min_online, max_online, |
1096 | min_offline, max_offline, | 1091 | min_offline, max_offline, |
1097 | sum_online, sum_offline, HZ); | 1092 | sum_online, sum_offline, HZ); |
1098 | cnt += sprintf(&page[cnt], "barrier: %ld/%ld:%ld", | 1093 | page += sprintf(page, "barrier: %ld/%ld:%ld", |
1099 | n_barrier_successes, | 1094 | n_barrier_successes, |
1100 | n_barrier_attempts, | 1095 | n_barrier_attempts, |
1101 | n_rcu_torture_barrier_error); | 1096 | n_rcu_torture_barrier_error); |
1102 | cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); | 1097 | page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG); |
1103 | if (atomic_read(&n_rcu_torture_mberror) != 0 || | 1098 | if (atomic_read(&n_rcu_torture_mberror) != 0 || |
1104 | n_rcu_torture_barrier_error != 0 || | 1099 | n_rcu_torture_barrier_error != 0 || |
1105 | n_rcu_torture_boost_ktrerror != 0 || | 1100 | n_rcu_torture_boost_ktrerror != 0 || |
1106 | n_rcu_torture_boost_rterror != 0 || | 1101 | n_rcu_torture_boost_rterror != 0 || |
1107 | n_rcu_torture_boost_failure != 0 || | 1102 | n_rcu_torture_boost_failure != 0 || |
1108 | i > 1) { | 1103 | i > 1) { |
1109 | cnt += sprintf(&page[cnt], "!!! "); | 1104 | page += sprintf(page, "!!! "); |
1110 | atomic_inc(&n_rcu_torture_error); | 1105 | atomic_inc(&n_rcu_torture_error); |
1111 | WARN_ON_ONCE(1); | 1106 | WARN_ON_ONCE(1); |
1112 | } | 1107 | } |
1113 | cnt += sprintf(&page[cnt], "Reader Pipe: "); | 1108 | page += sprintf(page, "Reader Pipe: "); |
1114 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) | 1109 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) |
1115 | cnt += sprintf(&page[cnt], " %ld", pipesummary[i]); | 1110 | page += sprintf(page, " %ld", pipesummary[i]); |
1116 | cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); | 1111 | page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG); |
1117 | cnt += sprintf(&page[cnt], "Reader Batch: "); | 1112 | page += sprintf(page, "Reader Batch: "); |
1118 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) | 1113 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) |
1119 | cnt += sprintf(&page[cnt], " %ld", batchsummary[i]); | 1114 | page += sprintf(page, " %ld", batchsummary[i]); |
1120 | cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); | 1115 | page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG); |
1121 | cnt += sprintf(&page[cnt], "Free-Block Circulation: "); | 1116 | page += sprintf(page, "Free-Block Circulation: "); |
1122 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { | 1117 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { |
1123 | cnt += sprintf(&page[cnt], " %d", | 1118 | page += sprintf(page, " %d", |
1124 | atomic_read(&rcu_torture_wcount[i])); | 1119 | atomic_read(&rcu_torture_wcount[i])); |
1125 | } | 1120 | } |
1126 | cnt += sprintf(&page[cnt], "\n"); | 1121 | page += sprintf(page, "\n"); |
1127 | if (cur_ops->stats) | 1122 | if (cur_ops->stats) |
1128 | cnt += cur_ops->stats(&page[cnt]); | 1123 | cur_ops->stats(page); |
1129 | return cnt; | ||
1130 | } | 1124 | } |
1131 | 1125 | ||
1132 | /* | 1126 | /* |
@@ -1140,10 +1134,17 @@ rcu_torture_printk(char *page) | |||
1140 | static void | 1134 | static void |
1141 | rcu_torture_stats_print(void) | 1135 | rcu_torture_stats_print(void) |
1142 | { | 1136 | { |
1143 | int cnt; | 1137 | int size = nr_cpu_ids * 200 + 8192; |
1138 | char *buf; | ||
1144 | 1139 | ||
1145 | cnt = rcu_torture_printk(printk_buf); | 1140 | buf = kmalloc(size, GFP_KERNEL); |
1146 | pr_alert("%s", printk_buf); | 1141 | if (!buf) { |
1142 | pr_err("rcu-torture: Out of memory, need: %d", size); | ||
1143 | return; | ||
1144 | } | ||
1145 | rcu_torture_printk(buf); | ||
1146 | pr_alert("%s", buf); | ||
1147 | kfree(buf); | ||
1147 | } | 1148 | } |
1148 | 1149 | ||
1149 | /* | 1150 | /* |
@@ -1578,6 +1579,7 @@ static int rcu_torture_barrier_cbs(void *arg) | |||
1578 | { | 1579 | { |
1579 | long myid = (long)arg; | 1580 | long myid = (long)arg; |
1580 | bool lastphase = 0; | 1581 | bool lastphase = 0; |
1582 | bool newphase; | ||
1581 | struct rcu_head rcu; | 1583 | struct rcu_head rcu; |
1582 | 1584 | ||
1583 | init_rcu_head_on_stack(&rcu); | 1585 | init_rcu_head_on_stack(&rcu); |
@@ -1585,10 +1587,11 @@ static int rcu_torture_barrier_cbs(void *arg) | |||
1585 | set_user_nice(current, 19); | 1587 | set_user_nice(current, 19); |
1586 | do { | 1588 | do { |
1587 | wait_event(barrier_cbs_wq[myid], | 1589 | wait_event(barrier_cbs_wq[myid], |
1588 | barrier_phase != lastphase || | 1590 | (newphase = |
1591 | ACCESS_ONCE(barrier_phase)) != lastphase || | ||
1589 | kthread_should_stop() || | 1592 | kthread_should_stop() || |
1590 | fullstop != FULLSTOP_DONTSTOP); | 1593 | fullstop != FULLSTOP_DONTSTOP); |
1591 | lastphase = barrier_phase; | 1594 | lastphase = newphase; |
1592 | smp_mb(); /* ensure barrier_phase load before ->call(). */ | 1595 | smp_mb(); /* ensure barrier_phase load before ->call(). */ |
1593 | if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) | 1596 | if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) |
1594 | break; | 1597 | break; |
@@ -1625,7 +1628,7 @@ static int rcu_torture_barrier(void *arg) | |||
1625 | if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) | 1628 | if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) |
1626 | break; | 1629 | break; |
1627 | n_barrier_attempts++; | 1630 | n_barrier_attempts++; |
1628 | cur_ops->cb_barrier(); | 1631 | cur_ops->cb_barrier(); /* Implies smp_mb() for wait_event(). */ |
1629 | if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) { | 1632 | if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) { |
1630 | n_rcu_torture_barrier_error++; | 1633 | n_rcu_torture_barrier_error++; |
1631 | WARN_ON_ONCE(1); | 1634 | WARN_ON_ONCE(1); |
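Editor's note: the wait_event() tweak above is about reading barrier_phase exactly once. Previously the wake-up condition and the later "lastphase = barrier_phase" could observe different values if the flag flipped in between, so a phase transition could be missed. Capturing the ACCESS_ONCE() result closes that window; a stripped-down sketch of the idiom:

    bool newphase;

    wait_event(barrier_cbs_wq[myid],
               (newphase = ACCESS_ONCE(barrier_phase)) != lastphase ||
               kthread_should_stop());
    lastphase = newphase;   /* guaranteed to be the value that satisfied the condition */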
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index dd081987a8ec..b3d116cd072d 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c | |||
@@ -369,6 +369,9 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp) | |||
369 | static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, | 369 | static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, |
370 | bool user) | 370 | bool user) |
371 | { | 371 | { |
372 | struct rcu_state *rsp; | ||
373 | struct rcu_data *rdp; | ||
374 | |||
372 | trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting); | 375 | trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting); |
373 | if (!user && !is_idle_task(current)) { | 376 | if (!user && !is_idle_task(current)) { |
374 | struct task_struct *idle __maybe_unused = | 377 | struct task_struct *idle __maybe_unused = |
@@ -380,6 +383,10 @@ static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, | |||
380 | current->pid, current->comm, | 383 | current->pid, current->comm, |
381 | idle->pid, idle->comm); /* must be idle task! */ | 384 | idle->pid, idle->comm); /* must be idle task! */ |
382 | } | 385 | } |
386 | for_each_rcu_flavor(rsp) { | ||
387 | rdp = this_cpu_ptr(rsp->rda); | ||
388 | do_nocb_deferred_wakeup(rdp); | ||
389 | } | ||
383 | rcu_prepare_for_idle(smp_processor_id()); | 390 | rcu_prepare_for_idle(smp_processor_id()); |
384 | /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ | 391 | /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ |
385 | smp_mb__before_atomic_inc(); /* See above. */ | 392 | smp_mb__before_atomic_inc(); /* See above. */ |
@@ -411,11 +418,12 @@ static void rcu_eqs_enter(bool user) | |||
411 | rdtp = this_cpu_ptr(&rcu_dynticks); | 418 | rdtp = this_cpu_ptr(&rcu_dynticks); |
412 | oldval = rdtp->dynticks_nesting; | 419 | oldval = rdtp->dynticks_nesting; |
413 | WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0); | 420 | WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0); |
414 | if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) | 421 | if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) { |
415 | rdtp->dynticks_nesting = 0; | 422 | rdtp->dynticks_nesting = 0; |
416 | else | 423 | rcu_eqs_enter_common(rdtp, oldval, user); |
424 | } else { | ||
417 | rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE; | 425 | rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE; |
418 | rcu_eqs_enter_common(rdtp, oldval, user); | 426 | } |
419 | } | 427 | } |
420 | 428 | ||
421 | /** | 429 | /** |
@@ -533,11 +541,12 @@ static void rcu_eqs_exit(bool user) | |||
533 | rdtp = this_cpu_ptr(&rcu_dynticks); | 541 | rdtp = this_cpu_ptr(&rcu_dynticks); |
534 | oldval = rdtp->dynticks_nesting; | 542 | oldval = rdtp->dynticks_nesting; |
535 | WARN_ON_ONCE(oldval < 0); | 543 | WARN_ON_ONCE(oldval < 0); |
536 | if (oldval & DYNTICK_TASK_NEST_MASK) | 544 | if (oldval & DYNTICK_TASK_NEST_MASK) { |
537 | rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE; | 545 | rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE; |
538 | else | 546 | } else { |
539 | rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; | 547 | rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; |
540 | rcu_eqs_exit_common(rdtp, oldval, user); | 548 | rcu_eqs_exit_common(rdtp, oldval, user); |
549 | } | ||
541 | } | 550 | } |
542 | 551 | ||
543 | /** | 552 | /** |
@@ -716,7 +725,7 @@ bool rcu_lockdep_current_cpu_online(void) | |||
716 | bool ret; | 725 | bool ret; |
717 | 726 | ||
718 | if (in_nmi()) | 727 | if (in_nmi()) |
719 | return 1; | 728 | return true; |
720 | preempt_disable(); | 729 | preempt_disable(); |
721 | rdp = this_cpu_ptr(&rcu_sched_data); | 730 | rdp = this_cpu_ptr(&rcu_sched_data); |
722 | rnp = rdp->mynode; | 731 | rnp = rdp->mynode; |
@@ -755,6 +764,12 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp, | |||
755 | } | 764 | } |
756 | 765 | ||
757 | /* | 766 | /* |
767 | * This function really isn't for public consumption, but RCU is special in | ||
768 | * that context switches can allow the state machine to make progress. | ||
769 | */ | ||
770 | extern void resched_cpu(int cpu); | ||
771 | |||
772 | /* | ||
758 | * Return true if the specified CPU has passed through a quiescent | 773 | * Return true if the specified CPU has passed through a quiescent |
759 | * state by virtue of being in or having passed through a dynticks | 774 | * state by virtue of being in or having passed through a dynticks |
760 | * idle state since the last call to dyntick_save_progress_counter() | 775 | * idle state since the last call to dyntick_save_progress_counter() |
@@ -812,16 +827,34 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, | |||
812 | */ | 827 | */ |
813 | rcu_kick_nohz_cpu(rdp->cpu); | 828 | rcu_kick_nohz_cpu(rdp->cpu); |
814 | 829 | ||
830 | /* | ||
831 | * Alternatively, the CPU might be running in the kernel | ||
832 | * for an extended period of time without a quiescent state. | ||
833 | * Attempt to force the CPU through the scheduler to gain the | ||
834 | * needed quiescent state, but only if the grace period has gone | ||
835 | * on for an uncommonly long time. If there are many stuck CPUs, | ||
836 | * we will beat on the first one until it gets unstuck, then move | ||
837 | * to the next. Only do this for the primary flavor of RCU. | ||
838 | */ | ||
839 | if (rdp->rsp == rcu_state && | ||
840 | ULONG_CMP_GE(ACCESS_ONCE(jiffies), rdp->rsp->jiffies_resched)) { | ||
841 | rdp->rsp->jiffies_resched += 5; | ||
842 | resched_cpu(rdp->cpu); | ||
843 | } | ||
844 | |||
815 | return 0; | 845 | return 0; |
816 | } | 846 | } |
817 | 847 | ||
818 | static void record_gp_stall_check_time(struct rcu_state *rsp) | 848 | static void record_gp_stall_check_time(struct rcu_state *rsp) |
819 | { | 849 | { |
820 | unsigned long j = ACCESS_ONCE(jiffies); | 850 | unsigned long j = ACCESS_ONCE(jiffies); |
851 | unsigned long j1; | ||
821 | 852 | ||
822 | rsp->gp_start = j; | 853 | rsp->gp_start = j; |
823 | smp_wmb(); /* Record start time before stall time. */ | 854 | smp_wmb(); /* Record start time before stall time. */ |
824 | rsp->jiffies_stall = j + rcu_jiffies_till_stall_check(); | 855 | j1 = rcu_jiffies_till_stall_check(); |
856 | rsp->jiffies_stall = j + j1; | ||
857 | rsp->jiffies_resched = j + j1 / 2; | ||
825 | } | 858 | } |
826 | 859 | ||
827 | /* | 860 | /* |
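Editor's note: to put numbers on the new resched path, assume the default 21-second RCU CPU stall timeout (the figures below are an assumption about the configuration, not part of the patch):

    /* with rcu_jiffies_till_stall_check() ~= 21 * HZ under the default config: */
    rsp->jiffies_stall   = j + 21 * HZ;        /* stall warning fires here */
    rsp->jiffies_resched = j + (21 * HZ) / 2;  /* ~10.5 s in: start calling resched_cpu() */
    /* each subsequent attempt adds 5 jiffies, so a stuck CPU keeps getting kicked */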
@@ -1133,8 +1166,10 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) | |||
1133 | * hold it, acquire the root rcu_node structure's lock in order to | 1166 | * hold it, acquire the root rcu_node structure's lock in order to |
1134 | * start one (if needed). | 1167 | * start one (if needed). |
1135 | */ | 1168 | */ |
1136 | if (rnp != rnp_root) | 1169 | if (rnp != rnp_root) { |
1137 | raw_spin_lock(&rnp_root->lock); | 1170 | raw_spin_lock(&rnp_root->lock); |
1171 | smp_mb__after_unlock_lock(); | ||
1172 | } | ||
1138 | 1173 | ||
1139 | /* | 1174 | /* |
1140 | * Get a new grace-period number. If there really is no grace | 1175 | * Get a new grace-period number. If there really is no grace |
@@ -1354,6 +1389,7 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1354 | local_irq_restore(flags); | 1389 | local_irq_restore(flags); |
1355 | return; | 1390 | return; |
1356 | } | 1391 | } |
1392 | smp_mb__after_unlock_lock(); | ||
1357 | __note_gp_changes(rsp, rnp, rdp); | 1393 | __note_gp_changes(rsp, rnp, rdp); |
1358 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 1394 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
1359 | } | 1395 | } |
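Editor's note: the smp_mb__after_unlock_lock() calls being added after nearly every rcu_node lock acquisition in this file all implement one idiom: on architectures where an UNLOCK followed by a LOCK of a different lock is not a full memory barrier (notably powerpc), the primitive upgrades the pair so that accesses before the unlock are ordered against accesses after the lock. Schematically (rnp_a and rnp_b are illustrative names):

    raw_spin_unlock(&rnp_a->lock);
    /* ... */
    raw_spin_lock(&rnp_b->lock);
    smp_mb__after_unlock_lock();   /* UNLOCK+LOCK now acts as a full smp_mb() */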
@@ -1368,6 +1404,7 @@ static int rcu_gp_init(struct rcu_state *rsp) | |||
1368 | 1404 | ||
1369 | rcu_bind_gp_kthread(); | 1405 | rcu_bind_gp_kthread(); |
1370 | raw_spin_lock_irq(&rnp->lock); | 1406 | raw_spin_lock_irq(&rnp->lock); |
1407 | smp_mb__after_unlock_lock(); | ||
1371 | if (rsp->gp_flags == 0) { | 1408 | if (rsp->gp_flags == 0) { |
1372 | /* Spurious wakeup, tell caller to go back to sleep. */ | 1409 | /* Spurious wakeup, tell caller to go back to sleep. */ |
1373 | raw_spin_unlock_irq(&rnp->lock); | 1410 | raw_spin_unlock_irq(&rnp->lock); |
@@ -1409,6 +1446,7 @@ static int rcu_gp_init(struct rcu_state *rsp) | |||
1409 | */ | 1446 | */ |
1410 | rcu_for_each_node_breadth_first(rsp, rnp) { | 1447 | rcu_for_each_node_breadth_first(rsp, rnp) { |
1411 | raw_spin_lock_irq(&rnp->lock); | 1448 | raw_spin_lock_irq(&rnp->lock); |
1449 | smp_mb__after_unlock_lock(); | ||
1412 | rdp = this_cpu_ptr(rsp->rda); | 1450 | rdp = this_cpu_ptr(rsp->rda); |
1413 | rcu_preempt_check_blocked_tasks(rnp); | 1451 | rcu_preempt_check_blocked_tasks(rnp); |
1414 | rnp->qsmask = rnp->qsmaskinit; | 1452 | rnp->qsmask = rnp->qsmaskinit; |
@@ -1463,6 +1501,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) | |||
1463 | /* Clear flag to prevent immediate re-entry. */ | 1501 | /* Clear flag to prevent immediate re-entry. */ |
1464 | if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { | 1502 | if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { |
1465 | raw_spin_lock_irq(&rnp->lock); | 1503 | raw_spin_lock_irq(&rnp->lock); |
1504 | smp_mb__after_unlock_lock(); | ||
1466 | rsp->gp_flags &= ~RCU_GP_FLAG_FQS; | 1505 | rsp->gp_flags &= ~RCU_GP_FLAG_FQS; |
1467 | raw_spin_unlock_irq(&rnp->lock); | 1506 | raw_spin_unlock_irq(&rnp->lock); |
1468 | } | 1507 | } |
@@ -1480,6 +1519,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) | |||
1480 | struct rcu_node *rnp = rcu_get_root(rsp); | 1519 | struct rcu_node *rnp = rcu_get_root(rsp); |
1481 | 1520 | ||
1482 | raw_spin_lock_irq(&rnp->lock); | 1521 | raw_spin_lock_irq(&rnp->lock); |
1522 | smp_mb__after_unlock_lock(); | ||
1483 | gp_duration = jiffies - rsp->gp_start; | 1523 | gp_duration = jiffies - rsp->gp_start; |
1484 | if (gp_duration > rsp->gp_max) | 1524 | if (gp_duration > rsp->gp_max) |
1485 | rsp->gp_max = gp_duration; | 1525 | rsp->gp_max = gp_duration; |
@@ -1505,16 +1545,19 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) | |||
1505 | */ | 1545 | */ |
1506 | rcu_for_each_node_breadth_first(rsp, rnp) { | 1546 | rcu_for_each_node_breadth_first(rsp, rnp) { |
1507 | raw_spin_lock_irq(&rnp->lock); | 1547 | raw_spin_lock_irq(&rnp->lock); |
1548 | smp_mb__after_unlock_lock(); | ||
1508 | ACCESS_ONCE(rnp->completed) = rsp->gpnum; | 1549 | ACCESS_ONCE(rnp->completed) = rsp->gpnum; |
1509 | rdp = this_cpu_ptr(rsp->rda); | 1550 | rdp = this_cpu_ptr(rsp->rda); |
1510 | if (rnp == rdp->mynode) | 1551 | if (rnp == rdp->mynode) |
1511 | __note_gp_changes(rsp, rnp, rdp); | 1552 | __note_gp_changes(rsp, rnp, rdp); |
1553 | /* smp_mb() provided by prior unlock-lock pair. */ | ||
1512 | nocb += rcu_future_gp_cleanup(rsp, rnp); | 1554 | nocb += rcu_future_gp_cleanup(rsp, rnp); |
1513 | raw_spin_unlock_irq(&rnp->lock); | 1555 | raw_spin_unlock_irq(&rnp->lock); |
1514 | cond_resched(); | 1556 | cond_resched(); |
1515 | } | 1557 | } |
1516 | rnp = rcu_get_root(rsp); | 1558 | rnp = rcu_get_root(rsp); |
1517 | raw_spin_lock_irq(&rnp->lock); | 1559 | raw_spin_lock_irq(&rnp->lock); |
1560 | smp_mb__after_unlock_lock(); | ||
1518 | rcu_nocb_gp_set(rnp, nocb); | 1561 | rcu_nocb_gp_set(rnp, nocb); |
1519 | 1562 | ||
1520 | rsp->completed = rsp->gpnum; /* Declare grace period done. */ | 1563 | rsp->completed = rsp->gpnum; /* Declare grace period done. */ |
@@ -1553,6 +1596,7 @@ static int __noreturn rcu_gp_kthread(void *arg) | |||
1553 | wait_event_interruptible(rsp->gp_wq, | 1596 | wait_event_interruptible(rsp->gp_wq, |
1554 | ACCESS_ONCE(rsp->gp_flags) & | 1597 | ACCESS_ONCE(rsp->gp_flags) & |
1555 | RCU_GP_FLAG_INIT); | 1598 | RCU_GP_FLAG_INIT); |
1599 | /* Locking provides needed memory barrier. */ | ||
1556 | if (rcu_gp_init(rsp)) | 1600 | if (rcu_gp_init(rsp)) |
1557 | break; | 1601 | break; |
1558 | cond_resched(); | 1602 | cond_resched(); |
@@ -1582,6 +1626,7 @@ static int __noreturn rcu_gp_kthread(void *arg) | |||
1582 | (!ACCESS_ONCE(rnp->qsmask) && | 1626 | (!ACCESS_ONCE(rnp->qsmask) && |
1583 | !rcu_preempt_blocked_readers_cgp(rnp)), | 1627 | !rcu_preempt_blocked_readers_cgp(rnp)), |
1584 | j); | 1628 | j); |
1629 | /* Locking provides needed memory barriers. */ | ||
1585 | /* If grace period done, leave loop. */ | 1630 | /* If grace period done, leave loop. */ |
1586 | if (!ACCESS_ONCE(rnp->qsmask) && | 1631 | if (!ACCESS_ONCE(rnp->qsmask) && |
1587 | !rcu_preempt_blocked_readers_cgp(rnp)) | 1632 | !rcu_preempt_blocked_readers_cgp(rnp)) |
@@ -1749,6 +1794,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, | |||
1749 | rnp_c = rnp; | 1794 | rnp_c = rnp; |
1750 | rnp = rnp->parent; | 1795 | rnp = rnp->parent; |
1751 | raw_spin_lock_irqsave(&rnp->lock, flags); | 1796 | raw_spin_lock_irqsave(&rnp->lock, flags); |
1797 | smp_mb__after_unlock_lock(); | ||
1752 | WARN_ON_ONCE(rnp_c->qsmask); | 1798 | WARN_ON_ONCE(rnp_c->qsmask); |
1753 | } | 1799 | } |
1754 | 1800 | ||
@@ -1778,6 +1824,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) | |||
1778 | 1824 | ||
1779 | rnp = rdp->mynode; | 1825 | rnp = rdp->mynode; |
1780 | raw_spin_lock_irqsave(&rnp->lock, flags); | 1826 | raw_spin_lock_irqsave(&rnp->lock, flags); |
1827 | smp_mb__after_unlock_lock(); | ||
1781 | if (rdp->passed_quiesce == 0 || rdp->gpnum != rnp->gpnum || | 1828 | if (rdp->passed_quiesce == 0 || rdp->gpnum != rnp->gpnum || |
1782 | rnp->completed == rnp->gpnum) { | 1829 | rnp->completed == rnp->gpnum) { |
1783 | 1830 | ||
@@ -1901,13 +1948,13 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, | |||
1901 | * Adopt the RCU callbacks from the specified rcu_state structure's | 1948 | * Adopt the RCU callbacks from the specified rcu_state structure's |
1902 | * orphanage. The caller must hold the ->orphan_lock. | 1949 | * orphanage. The caller must hold the ->orphan_lock. |
1903 | */ | 1950 | */ |
1904 | static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) | 1951 | static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) |
1905 | { | 1952 | { |
1906 | int i; | 1953 | int i; |
1907 | struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); | 1954 | struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); |
1908 | 1955 | ||
1909 | /* No-CBs CPUs are handled specially. */ | 1956 | /* No-CBs CPUs are handled specially. */ |
1910 | if (rcu_nocb_adopt_orphan_cbs(rsp, rdp)) | 1957 | if (rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags)) |
1911 | return; | 1958 | return; |
1912 | 1959 | ||
1913 | /* Do the accounting first. */ | 1960 | /* Do the accounting first. */ |
@@ -1986,12 +2033,13 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) | |||
1986 | 2033 | ||
1987 | /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ | 2034 | /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ |
1988 | rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); | 2035 | rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); |
1989 | rcu_adopt_orphan_cbs(rsp); | 2036 | rcu_adopt_orphan_cbs(rsp, flags); |
1990 | 2037 | ||
1991 | /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ | 2038 | /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ |
1992 | mask = rdp->grpmask; /* rnp->grplo is constant. */ | 2039 | mask = rdp->grpmask; /* rnp->grplo is constant. */ |
1993 | do { | 2040 | do { |
1994 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ | 2041 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ |
2042 | smp_mb__after_unlock_lock(); | ||
1995 | rnp->qsmaskinit &= ~mask; | 2043 | rnp->qsmaskinit &= ~mask; |
1996 | if (rnp->qsmaskinit != 0) { | 2044 | if (rnp->qsmaskinit != 0) { |
1997 | if (rnp != rdp->mynode) | 2045 | if (rnp != rdp->mynode) |
@@ -2202,6 +2250,7 @@ static void force_qs_rnp(struct rcu_state *rsp, | |||
2202 | cond_resched(); | 2250 | cond_resched(); |
2203 | mask = 0; | 2251 | mask = 0; |
2204 | raw_spin_lock_irqsave(&rnp->lock, flags); | 2252 | raw_spin_lock_irqsave(&rnp->lock, flags); |
2253 | smp_mb__after_unlock_lock(); | ||
2205 | if (!rcu_gp_in_progress(rsp)) { | 2254 | if (!rcu_gp_in_progress(rsp)) { |
2206 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 2255 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
2207 | return; | 2256 | return; |
@@ -2231,6 +2280,7 @@ static void force_qs_rnp(struct rcu_state *rsp, | |||
2231 | rnp = rcu_get_root(rsp); | 2280 | rnp = rcu_get_root(rsp); |
2232 | if (rnp->qsmask == 0) { | 2281 | if (rnp->qsmask == 0) { |
2233 | raw_spin_lock_irqsave(&rnp->lock, flags); | 2282 | raw_spin_lock_irqsave(&rnp->lock, flags); |
2283 | smp_mb__after_unlock_lock(); | ||
2234 | rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */ | 2284 | rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */ |
2235 | } | 2285 | } |
2236 | } | 2286 | } |
@@ -2263,6 +2313,7 @@ static void force_quiescent_state(struct rcu_state *rsp) | |||
2263 | 2313 | ||
2264 | /* Reached the root of the rcu_node tree, acquire lock. */ | 2314 | /* Reached the root of the rcu_node tree, acquire lock. */ |
2265 | raw_spin_lock_irqsave(&rnp_old->lock, flags); | 2315 | raw_spin_lock_irqsave(&rnp_old->lock, flags); |
2316 | smp_mb__after_unlock_lock(); | ||
2266 | raw_spin_unlock(&rnp_old->fqslock); | 2317 | raw_spin_unlock(&rnp_old->fqslock); |
2267 | if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { | 2318 | if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { |
2268 | rsp->n_force_qs_lh++; | 2319 | rsp->n_force_qs_lh++; |
@@ -2303,6 +2354,9 @@ __rcu_process_callbacks(struct rcu_state *rsp) | |||
2303 | /* If there are callbacks ready, invoke them. */ | 2354 | /* If there are callbacks ready, invoke them. */ |
2304 | if (cpu_has_callbacks_ready_to_invoke(rdp)) | 2355 | if (cpu_has_callbacks_ready_to_invoke(rdp)) |
2305 | invoke_rcu_callbacks(rsp, rdp); | 2356 | invoke_rcu_callbacks(rsp, rdp); |
2357 | |||
2358 | /* Do any needed deferred wakeups of rcuo kthreads. */ | ||
2359 | do_nocb_deferred_wakeup(rdp); | ||
2306 | } | 2360 | } |
2307 | 2361 | ||
2308 | /* | 2362 | /* |
@@ -2378,6 +2432,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, | |||
2378 | struct rcu_node *rnp_root = rcu_get_root(rsp); | 2432 | struct rcu_node *rnp_root = rcu_get_root(rsp); |
2379 | 2433 | ||
2380 | raw_spin_lock(&rnp_root->lock); | 2434 | raw_spin_lock(&rnp_root->lock); |
2435 | smp_mb__after_unlock_lock(); | ||
2381 | rcu_start_gp(rsp); | 2436 | rcu_start_gp(rsp); |
2382 | raw_spin_unlock(&rnp_root->lock); | 2437 | raw_spin_unlock(&rnp_root->lock); |
2383 | } else { | 2438 | } else { |
@@ -2437,7 +2492,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | |||
2437 | 2492 | ||
2438 | if (cpu != -1) | 2493 | if (cpu != -1) |
2439 | rdp = per_cpu_ptr(rsp->rda, cpu); | 2494 | rdp = per_cpu_ptr(rsp->rda, cpu); |
2440 | offline = !__call_rcu_nocb(rdp, head, lazy); | 2495 | offline = !__call_rcu_nocb(rdp, head, lazy, flags); |
2441 | WARN_ON_ONCE(offline); | 2496 | WARN_ON_ONCE(offline); |
2442 | /* _call_rcu() is illegal on offline CPU; leak the callback. */ | 2497 | /* _call_rcu() is illegal on offline CPU; leak the callback. */ |
2443 | local_irq_restore(flags); | 2498 | local_irq_restore(flags); |
@@ -2757,6 +2812,10 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) | |||
2757 | /* Check for CPU stalls, if enabled. */ | 2812 | /* Check for CPU stalls, if enabled. */ |
2758 | check_cpu_stall(rsp, rdp); | 2813 | check_cpu_stall(rsp, rdp); |
2759 | 2814 | ||
2815 | /* Is this CPU a NO_HZ_FULL CPU that should ignore RCU? */ | ||
2816 | if (rcu_nohz_full_cpu(rsp)) | ||
2817 | return 0; | ||
2818 | |||
2760 | /* Is the RCU core waiting for a quiescent state from this CPU? */ | 2819 | /* Is the RCU core waiting for a quiescent state from this CPU? */ |
2761 | if (rcu_scheduler_fully_active && | 2820 | if (rcu_scheduler_fully_active && |
2762 | rdp->qs_pending && !rdp->passed_quiesce) { | 2821 | rdp->qs_pending && !rdp->passed_quiesce) { |
@@ -2790,6 +2849,12 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) | |||
2790 | return 1; | 2849 | return 1; |
2791 | } | 2850 | } |
2792 | 2851 | ||
2852 | /* Does this CPU need a deferred NOCB wakeup? */ | ||
2853 | if (rcu_nocb_need_deferred_wakeup(rdp)) { | ||
2854 | rdp->n_rp_nocb_defer_wakeup++; | ||
2855 | return 1; | ||
2856 | } | ||
2857 | |||
2793 | /* nothing to do */ | 2858 | /* nothing to do */ |
2794 | rdp->n_rp_need_nothing++; | 2859 | rdp->n_rp_need_nothing++; |
2795 | return 0; | 2860 | return 0; |
@@ -3214,9 +3279,9 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) | |||
3214 | { | 3279 | { |
3215 | int i; | 3280 | int i; |
3216 | 3281 | ||
3217 | for (i = rcu_num_lvls - 1; i > 0; i--) | 3282 | rsp->levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf; |
3283 | for (i = rcu_num_lvls - 2; i >= 0; i--) | ||
3218 | rsp->levelspread[i] = CONFIG_RCU_FANOUT; | 3284 | rsp->levelspread[i] = CONFIG_RCU_FANOUT; |
3219 | rsp->levelspread[0] = rcu_fanout_leaf; | ||
3220 | } | 3285 | } |
3221 | #else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ | 3286 | #else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ |
3222 | static void __init rcu_init_levelspread(struct rcu_state *rsp) | 3287 | static void __init rcu_init_levelspread(struct rcu_state *rsp) |
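Editor's note: a worked example of the rcu_init_levelspread() fix, for a hypothetical three-level tree (the config values are assumptions chosen for illustration):

    /* hypothetical: rcu_num_lvls = 3, CONFIG_RCU_FANOUT = 64, rcu_fanout_leaf = 16 */
    /* old loop:  levelspread[] = { 16, 64, 64 }   (leaf fanout applied at the root)   */
    /* new loop:  levelspread[] = { 64, 64, 16 }   (leaf fanout applied at the leaves, */
    /*            matching how the rcu_node tree is actually laid out)                 */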
@@ -3346,6 +3411,8 @@ static void __init rcu_init_geometry(void) | |||
3346 | if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF && | 3411 | if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF && |
3347 | nr_cpu_ids == NR_CPUS) | 3412 | nr_cpu_ids == NR_CPUS) |
3348 | return; | 3413 | return; |
3414 | pr_info("RCU: Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%d\n", | ||
3415 | rcu_fanout_leaf, nr_cpu_ids); | ||
3349 | 3416 | ||
3350 | /* | 3417 | /* |
3351 | * Compute number of nodes that can be handled an rcu_node tree | 3418 | * Compute number of nodes that can be handled an rcu_node tree |
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 52be957c9fe2..8c19873f1ac9 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h | |||
@@ -317,6 +317,7 @@ struct rcu_data { | |||
317 | unsigned long n_rp_cpu_needs_gp; | 317 | unsigned long n_rp_cpu_needs_gp; |
318 | unsigned long n_rp_gp_completed; | 318 | unsigned long n_rp_gp_completed; |
319 | unsigned long n_rp_gp_started; | 319 | unsigned long n_rp_gp_started; |
320 | unsigned long n_rp_nocb_defer_wakeup; | ||
320 | unsigned long n_rp_need_nothing; | 321 | unsigned long n_rp_need_nothing; |
321 | 322 | ||
322 | /* 6) _rcu_barrier() and OOM callbacks. */ | 323 | /* 6) _rcu_barrier() and OOM callbacks. */ |
@@ -335,6 +336,7 @@ struct rcu_data { | |||
335 | int nocb_p_count_lazy; /* (approximate). */ | 336 | int nocb_p_count_lazy; /* (approximate). */ |
336 | wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */ | 337 | wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */ |
337 | struct task_struct *nocb_kthread; | 338 | struct task_struct *nocb_kthread; |
339 | bool nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ | ||
338 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ | 340 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ |
339 | 341 | ||
340 | /* 8) RCU CPU stall data. */ | 342 | /* 8) RCU CPU stall data. */ |
@@ -453,6 +455,8 @@ struct rcu_state { | |||
453 | /* but in jiffies. */ | 455 | /* but in jiffies. */ |
454 | unsigned long jiffies_stall; /* Time at which to check */ | 456 | unsigned long jiffies_stall; /* Time at which to check */ |
455 | /* for CPU stalls. */ | 457 | /* for CPU stalls. */ |
458 | unsigned long jiffies_resched; /* Time at which to resched */ | ||
459 | /* a reluctant CPU. */ | ||
456 | unsigned long gp_max; /* Maximum GP duration in */ | 460 | unsigned long gp_max; /* Maximum GP duration in */ |
457 | /* jiffies. */ | 461 | /* jiffies. */ |
458 | const char *name; /* Name of structure. */ | 462 | const char *name; /* Name of structure. */ |
@@ -548,9 +552,12 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); | |||
548 | static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp); | 552 | static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp); |
549 | static void rcu_init_one_nocb(struct rcu_node *rnp); | 553 | static void rcu_init_one_nocb(struct rcu_node *rnp); |
550 | static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, | 554 | static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, |
551 | bool lazy); | 555 | bool lazy, unsigned long flags); |
552 | static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, | 556 | static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, |
553 | struct rcu_data *rdp); | 557 | struct rcu_data *rdp, |
558 | unsigned long flags); | ||
559 | static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp); | ||
560 | static void do_nocb_deferred_wakeup(struct rcu_data *rdp); | ||
554 | static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); | 561 | static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); |
555 | static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); | 562 | static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); |
556 | static void rcu_kick_nohz_cpu(int cpu); | 563 | static void rcu_kick_nohz_cpu(int cpu); |
@@ -564,6 +571,7 @@ static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, | |||
564 | unsigned long maxj); | 571 | unsigned long maxj); |
565 | static void rcu_bind_gp_kthread(void); | 572 | static void rcu_bind_gp_kthread(void); |
566 | static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp); | 573 | static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp); |
574 | static bool rcu_nohz_full_cpu(struct rcu_state *rsp); | ||
567 | 575 | ||
568 | #endif /* #ifndef RCU_TREE_NONCORE */ | 576 | #endif /* #ifndef RCU_TREE_NONCORE */ |
569 | 577 | ||
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 08a765232432..6e2ef4b2b920 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h | |||
@@ -204,6 +204,7 @@ static void rcu_preempt_note_context_switch(int cpu) | |||
204 | rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); | 204 | rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); |
205 | rnp = rdp->mynode; | 205 | rnp = rdp->mynode; |
206 | raw_spin_lock_irqsave(&rnp->lock, flags); | 206 | raw_spin_lock_irqsave(&rnp->lock, flags); |
207 | smp_mb__after_unlock_lock(); | ||
207 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; | 208 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; |
208 | t->rcu_blocked_node = rnp; | 209 | t->rcu_blocked_node = rnp; |
209 | 210 | ||
@@ -312,6 +313,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) | |||
312 | mask = rnp->grpmask; | 313 | mask = rnp->grpmask; |
313 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 314 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
314 | raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */ | 315 | raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */ |
316 | smp_mb__after_unlock_lock(); | ||
315 | rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags); | 317 | rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags); |
316 | } | 318 | } |
317 | 319 | ||
@@ -361,10 +363,14 @@ void rcu_read_unlock_special(struct task_struct *t) | |||
361 | special = t->rcu_read_unlock_special; | 363 | special = t->rcu_read_unlock_special; |
362 | if (special & RCU_READ_UNLOCK_NEED_QS) { | 364 | if (special & RCU_READ_UNLOCK_NEED_QS) { |
363 | rcu_preempt_qs(smp_processor_id()); | 365 | rcu_preempt_qs(smp_processor_id()); |
366 | if (!t->rcu_read_unlock_special) { | ||
367 | local_irq_restore(flags); | ||
368 | return; | ||
369 | } | ||
364 | } | 370 | } |
365 | 371 | ||
366 | /* Hardware IRQ handlers cannot block. */ | 372 | /* Hardware IRQ handlers cannot block, complain if they get here. */ |
367 | if (in_irq() || in_serving_softirq()) { | 373 | if (WARN_ON_ONCE(in_irq() || in_serving_softirq())) { |
368 | local_irq_restore(flags); | 374 | local_irq_restore(flags); |
369 | return; | 375 | return; |
370 | } | 376 | } |
@@ -381,6 +387,7 @@ void rcu_read_unlock_special(struct task_struct *t) | |||
381 | for (;;) { | 387 | for (;;) { |
382 | rnp = t->rcu_blocked_node; | 388 | rnp = t->rcu_blocked_node; |
383 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ | 389 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ |
390 | smp_mb__after_unlock_lock(); | ||
384 | if (rnp == t->rcu_blocked_node) | 391 | if (rnp == t->rcu_blocked_node) |
385 | break; | 392 | break; |
386 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 393 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
@@ -605,6 +612,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, | |||
605 | while (!list_empty(lp)) { | 612 | while (!list_empty(lp)) { |
606 | t = list_entry(lp->next, typeof(*t), rcu_node_entry); | 613 | t = list_entry(lp->next, typeof(*t), rcu_node_entry); |
607 | raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ | 614 | raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ |
615 | smp_mb__after_unlock_lock(); | ||
608 | list_del(&t->rcu_node_entry); | 616 | list_del(&t->rcu_node_entry); |
609 | t->rcu_blocked_node = rnp_root; | 617 | t->rcu_blocked_node = rnp_root; |
610 | list_add(&t->rcu_node_entry, lp_root); | 618 | list_add(&t->rcu_node_entry, lp_root); |
@@ -629,6 +637,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, | |||
629 | * in this case. | 637 | * in this case. |
630 | */ | 638 | */ |
631 | raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ | 639 | raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ |
640 | smp_mb__after_unlock_lock(); | ||
632 | if (rnp_root->boost_tasks != NULL && | 641 | if (rnp_root->boost_tasks != NULL && |
633 | rnp_root->boost_tasks != rnp_root->gp_tasks && | 642 | rnp_root->boost_tasks != rnp_root->gp_tasks && |
634 | rnp_root->boost_tasks != rnp_root->exp_tasks) | 643 | rnp_root->boost_tasks != rnp_root->exp_tasks) |
@@ -772,6 +781,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, | |||
772 | unsigned long mask; | 781 | unsigned long mask; |
773 | 782 | ||
774 | raw_spin_lock_irqsave(&rnp->lock, flags); | 783 | raw_spin_lock_irqsave(&rnp->lock, flags); |
784 | smp_mb__after_unlock_lock(); | ||
775 | for (;;) { | 785 | for (;;) { |
776 | if (!sync_rcu_preempt_exp_done(rnp)) { | 786 | if (!sync_rcu_preempt_exp_done(rnp)) { |
777 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 787 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
@@ -779,14 +789,17 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, | |||
779 | } | 789 | } |
780 | if (rnp->parent == NULL) { | 790 | if (rnp->parent == NULL) { |
781 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 791 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
782 | if (wake) | 792 | if (wake) { |
793 | smp_mb(); /* EGP done before wake_up(). */ | ||
783 | wake_up(&sync_rcu_preempt_exp_wq); | 794 | wake_up(&sync_rcu_preempt_exp_wq); |
795 | } | ||
784 | break; | 796 | break; |
785 | } | 797 | } |
786 | mask = rnp->grpmask; | 798 | mask = rnp->grpmask; |
787 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ | 799 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ |
788 | rnp = rnp->parent; | 800 | rnp = rnp->parent; |
789 | raw_spin_lock(&rnp->lock); /* irqs already disabled */ | 801 | raw_spin_lock(&rnp->lock); /* irqs already disabled */ |
802 | smp_mb__after_unlock_lock(); | ||
790 | rnp->expmask &= ~mask; | 803 | rnp->expmask &= ~mask; |
791 | } | 804 | } |
792 | } | 805 | } |
@@ -806,6 +819,7 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) | |||
806 | int must_wait = 0; | 819 | int must_wait = 0; |
807 | 820 | ||
808 | raw_spin_lock_irqsave(&rnp->lock, flags); | 821 | raw_spin_lock_irqsave(&rnp->lock, flags); |
822 | smp_mb__after_unlock_lock(); | ||
809 | if (list_empty(&rnp->blkd_tasks)) { | 823 | if (list_empty(&rnp->blkd_tasks)) { |
810 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 824 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
811 | } else { | 825 | } else { |
@@ -886,6 +900,7 @@ void synchronize_rcu_expedited(void) | |||
886 | /* Initialize ->expmask for all non-leaf rcu_node structures. */ | 900 | /* Initialize ->expmask for all non-leaf rcu_node structures. */ |
887 | rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) { | 901 | rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) { |
888 | raw_spin_lock_irqsave(&rnp->lock, flags); | 902 | raw_spin_lock_irqsave(&rnp->lock, flags); |
903 | smp_mb__after_unlock_lock(); | ||
889 | rnp->expmask = rnp->qsmaskinit; | 904 | rnp->expmask = rnp->qsmaskinit; |
890 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 905 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
891 | } | 906 | } |
@@ -1191,6 +1206,7 @@ static int rcu_boost(struct rcu_node *rnp) | |||
1191 | return 0; /* Nothing left to boost. */ | 1206 | return 0; /* Nothing left to boost. */ |
1192 | 1207 | ||
1193 | raw_spin_lock_irqsave(&rnp->lock, flags); | 1208 | raw_spin_lock_irqsave(&rnp->lock, flags); |
1209 | smp_mb__after_unlock_lock(); | ||
1194 | 1210 | ||
1195 | /* | 1211 | /* |
1196 | * Recheck under the lock: all tasks in need of boosting | 1212 | * Recheck under the lock: all tasks in need of boosting |
@@ -1377,6 +1393,7 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, | |||
1377 | if (IS_ERR(t)) | 1393 | if (IS_ERR(t)) |
1378 | return PTR_ERR(t); | 1394 | return PTR_ERR(t); |
1379 | raw_spin_lock_irqsave(&rnp->lock, flags); | 1395 | raw_spin_lock_irqsave(&rnp->lock, flags); |
1396 | smp_mb__after_unlock_lock(); | ||
1380 | rnp->boost_kthread_task = t; | 1397 | rnp->boost_kthread_task = t; |
1381 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 1398 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
1382 | sp.sched_priority = RCU_BOOST_PRIO; | 1399 | sp.sched_priority = RCU_BOOST_PRIO; |
@@ -1769,6 +1786,7 @@ static void rcu_prepare_for_idle(int cpu) | |||
1769 | continue; | 1786 | continue; |
1770 | rnp = rdp->mynode; | 1787 | rnp = rdp->mynode; |
1771 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ | 1788 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ |
1789 | smp_mb__after_unlock_lock(); | ||
1772 | rcu_accelerate_cbs(rsp, rnp, rdp); | 1790 | rcu_accelerate_cbs(rsp, rnp, rdp); |
1773 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 1791 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
1774 | } | 1792 | } |
@@ -1852,6 +1870,7 @@ static int rcu_oom_notify(struct notifier_block *self, | |||
1852 | 1870 | ||
1853 | /* Wait for callbacks from earlier instance to complete. */ | 1871 | /* Wait for callbacks from earlier instance to complete. */ |
1854 | wait_event(oom_callback_wq, atomic_read(&oom_callback_count) == 0); | 1872 | wait_event(oom_callback_wq, atomic_read(&oom_callback_count) == 0); |
1873 | smp_mb(); /* Ensure callback reuse happens after callback invocation. */ | ||
1855 | 1874 | ||
1856 | /* | 1875 | /* |
1857 | * Prevent premature wakeup: ensure that all increments happen | 1876 | * Prevent premature wakeup: ensure that all increments happen |
@@ -2101,7 +2120,8 @@ bool rcu_is_nocb_cpu(int cpu) | |||
2101 | static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, | 2120 | static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, |
2102 | struct rcu_head *rhp, | 2121 | struct rcu_head *rhp, |
2103 | struct rcu_head **rhtp, | 2122 | struct rcu_head **rhtp, |
2104 | int rhcount, int rhcount_lazy) | 2123 | int rhcount, int rhcount_lazy, |
2124 | unsigned long flags) | ||
2105 | { | 2125 | { |
2106 | int len; | 2126 | int len; |
2107 | struct rcu_head **old_rhpp; | 2127 | struct rcu_head **old_rhpp; |
@@ -2122,9 +2142,16 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, | |||
2122 | } | 2142 | } |
2123 | len = atomic_long_read(&rdp->nocb_q_count); | 2143 | len = atomic_long_read(&rdp->nocb_q_count); |
2124 | if (old_rhpp == &rdp->nocb_head) { | 2144 | if (old_rhpp == &rdp->nocb_head) { |
2125 | wake_up(&rdp->nocb_wq); /* ... only if queue was empty ... */ | 2145 | if (!irqs_disabled_flags(flags)) { |
2146 | wake_up(&rdp->nocb_wq); /* ... if queue was empty ... */ | ||
2147 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | ||
2148 | TPS("WakeEmpty")); | ||
2149 | } else { | ||
2150 | rdp->nocb_defer_wakeup = true; | ||
2151 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | ||
2152 | TPS("WakeEmptyIsDeferred")); | ||
2153 | } | ||
2126 | rdp->qlen_last_fqs_check = 0; | 2154 | rdp->qlen_last_fqs_check = 0; |
2127 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeEmpty")); | ||
2128 | } else if (len > rdp->qlen_last_fqs_check + qhimark) { | 2155 | } else if (len > rdp->qlen_last_fqs_check + qhimark) { |
2129 | wake_up_process(t); /* ... or if many callbacks queued. */ | 2156 | wake_up_process(t); /* ... or if many callbacks queued. */ |
2130 | rdp->qlen_last_fqs_check = LONG_MAX / 2; | 2157 | rdp->qlen_last_fqs_check = LONG_MAX / 2; |
@@ -2145,12 +2172,12 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, | |||
2145 | * "rcuo" kthread can find it. | 2172 | * "rcuo" kthread can find it. |
2146 | */ | 2173 | */ |
2147 | static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, | 2174 | static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, |
2148 | bool lazy) | 2175 | bool lazy, unsigned long flags) |
2149 | { | 2176 | { |
2150 | 2177 | ||
2151 | if (!rcu_is_nocb_cpu(rdp->cpu)) | 2178 | if (!rcu_is_nocb_cpu(rdp->cpu)) |
2152 | return 0; | 2179 | return 0; |
2153 | __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy); | 2180 | __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy, flags); |
2154 | if (__is_kfree_rcu_offset((unsigned long)rhp->func)) | 2181 | if (__is_kfree_rcu_offset((unsigned long)rhp->func)) |
2155 | trace_rcu_kfree_callback(rdp->rsp->name, rhp, | 2182 | trace_rcu_kfree_callback(rdp->rsp->name, rhp, |
2156 | (unsigned long)rhp->func, | 2183 | (unsigned long)rhp->func, |
@@ -2168,7 +2195,8 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, | |||
2168 | * not a no-CBs CPU. | 2195 | * not a no-CBs CPU. |
2169 | */ | 2196 | */ |
2170 | static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, | 2197 | static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, |
2171 | struct rcu_data *rdp) | 2198 | struct rcu_data *rdp, |
2199 | unsigned long flags) | ||
2172 | { | 2200 | { |
2173 | long ql = rsp->qlen; | 2201 | long ql = rsp->qlen; |
2174 | long qll = rsp->qlen_lazy; | 2202 | long qll = rsp->qlen_lazy; |
@@ -2182,14 +2210,14 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, | |||
2182 | /* First, enqueue the donelist, if any. This preserves CB ordering. */ | 2210 | /* First, enqueue the donelist, if any. This preserves CB ordering. */ |
2183 | if (rsp->orphan_donelist != NULL) { | 2211 | if (rsp->orphan_donelist != NULL) { |
2184 | __call_rcu_nocb_enqueue(rdp, rsp->orphan_donelist, | 2212 | __call_rcu_nocb_enqueue(rdp, rsp->orphan_donelist, |
2185 | rsp->orphan_donetail, ql, qll); | 2213 | rsp->orphan_donetail, ql, qll, flags); |
2186 | ql = qll = 0; | 2214 | ql = qll = 0; |
2187 | rsp->orphan_donelist = NULL; | 2215 | rsp->orphan_donelist = NULL; |
2188 | rsp->orphan_donetail = &rsp->orphan_donelist; | 2216 | rsp->orphan_donetail = &rsp->orphan_donelist; |
2189 | } | 2217 | } |
2190 | if (rsp->orphan_nxtlist != NULL) { | 2218 | if (rsp->orphan_nxtlist != NULL) { |
2191 | __call_rcu_nocb_enqueue(rdp, rsp->orphan_nxtlist, | 2219 | __call_rcu_nocb_enqueue(rdp, rsp->orphan_nxtlist, |
2192 | rsp->orphan_nxttail, ql, qll); | 2220 | rsp->orphan_nxttail, ql, qll, flags); |
2193 | ql = qll = 0; | 2221 | ql = qll = 0; |
2194 | rsp->orphan_nxtlist = NULL; | 2222 | rsp->orphan_nxtlist = NULL; |
2195 | rsp->orphan_nxttail = &rsp->orphan_nxtlist; | 2223 | rsp->orphan_nxttail = &rsp->orphan_nxtlist; |
@@ -2209,6 +2237,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) | |||
2209 | struct rcu_node *rnp = rdp->mynode; | 2237 | struct rcu_node *rnp = rdp->mynode; |
2210 | 2238 | ||
2211 | raw_spin_lock_irqsave(&rnp->lock, flags); | 2239 | raw_spin_lock_irqsave(&rnp->lock, flags); |
2240 | smp_mb__after_unlock_lock(); | ||
2212 | c = rcu_start_future_gp(rnp, rdp); | 2241 | c = rcu_start_future_gp(rnp, rdp); |
2213 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 2242 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
2214 | 2243 | ||
@@ -2250,6 +2279,7 @@ static int rcu_nocb_kthread(void *arg) | |||
2250 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | 2279 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, |
2251 | TPS("Sleep")); | 2280 | TPS("Sleep")); |
2252 | wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head); | 2281 | wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head); |
2282 | /* Memory barrier provided by xchg() below. */ | ||
2253 | } else if (firsttime) { | 2283 | } else if (firsttime) { |
2254 | firsttime = 0; | 2284 | firsttime = 0; |
2255 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | 2285 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, |
@@ -2310,6 +2340,22 @@ static int rcu_nocb_kthread(void *arg) | |||
2310 | return 0; | 2340 | return 0; |
2311 | } | 2341 | } |
2312 | 2342 | ||
2343 | /* Is a deferred wakeup of rcu_nocb_kthread() required? */ | ||
2344 | static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) | ||
2345 | { | ||
2346 | return ACCESS_ONCE(rdp->nocb_defer_wakeup); | ||
2347 | } | ||
2348 | |||
2349 | /* Do a deferred wakeup of rcu_nocb_kthread(). */ | ||
2350 | static void do_nocb_deferred_wakeup(struct rcu_data *rdp) | ||
2351 | { | ||
2352 | if (!rcu_nocb_need_deferred_wakeup(rdp)) | ||
2353 | return; | ||
2354 | ACCESS_ONCE(rdp->nocb_defer_wakeup) = false; | ||
2355 | wake_up(&rdp->nocb_wq); | ||
2356 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWakeEmpty")); | ||
2357 | } | ||
2358 | |||
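The hunk above introduces the deferred-wakeup pair rcu_nocb_need_deferred_wakeup()/do_nocb_deferred_wakeup(): when __call_rcu_nocb_enqueue() is called with interrupts disabled it cannot safely issue wake_up(), so it only records the pending wakeup (the "WakeEmptyIsDeferred" trace path) and a later, safe context turns the flag into a real wakeup. The following is a minimal userspace analogue of that pattern, not kernel code; the names are hypothetical stand-ins (defer_wakeup models rdp->nocb_defer_wakeup).

#include <stdbool.h>
#include <stdio.h>

static bool defer_wakeup;               /* models rdp->nocb_defer_wakeup */

static void wake_worker(void)           /* models wake_up(&rdp->nocb_wq) */
{
	printf("worker woken\n");
}

static void enqueue(bool unsafe_context)
{
	if (!unsafe_context)
		wake_worker();          /* fast path: wake immediately */
	else
		defer_wakeup = true;    /* e.g. irqs disabled: remember it */
}

static void later_safe_context(void)    /* models do_nocb_deferred_wakeup() */
{
	if (!defer_wakeup)              /* rcu_nocb_need_deferred_wakeup() */
		return;
	defer_wakeup = false;
	wake_worker();
}

int main(void)
{
	enqueue(true);                  /* caller could not wake directly */
	later_safe_context();           /* the deferred wakeup happens here */
	return 0;
}

The tree_trace.c hunk further down adds the matching n_rp_nocb_defer_wakeup counter so the deferred path shows up in the debugfs statistics.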
2313 | /* Initialize per-rcu_data variables for no-CBs CPUs. */ | 2359 | /* Initialize per-rcu_data variables for no-CBs CPUs. */ |
2314 | static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) | 2360 | static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) |
2315 | { | 2361 | { |
@@ -2365,13 +2411,14 @@ static void rcu_init_one_nocb(struct rcu_node *rnp) | |||
2365 | } | 2411 | } |
2366 | 2412 | ||
2367 | static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, | 2413 | static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, |
2368 | bool lazy) | 2414 | bool lazy, unsigned long flags) |
2369 | { | 2415 | { |
2370 | return 0; | 2416 | return 0; |
2371 | } | 2417 | } |
2372 | 2418 | ||
2373 | static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, | 2419 | static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, |
2374 | struct rcu_data *rdp) | 2420 | struct rcu_data *rdp, |
2421 | unsigned long flags) | ||
2375 | { | 2422 | { |
2376 | return 0; | 2423 | return 0; |
2377 | } | 2424 | } |
@@ -2380,6 +2427,15 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) | |||
2380 | { | 2427 | { |
2381 | } | 2428 | } |
2382 | 2429 | ||
2430 | static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) | ||
2431 | { | ||
2432 | return false; | ||
2433 | } | ||
2434 | |||
2435 | static void do_nocb_deferred_wakeup(struct rcu_data *rdp) | ||
2436 | { | ||
2437 | } | ||
2438 | |||
2383 | static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) | 2439 | static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) |
2384 | { | 2440 | { |
2385 | } | 2441 | } |
@@ -2829,3 +2885,23 @@ static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp) | |||
2829 | } | 2885 | } |
2830 | 2886 | ||
2831 | #endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ | 2887 | #endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ |
2888 | |||
2889 | /* | ||
2890 | * Is this CPU a NO_HZ_FULL CPU that should ignore RCU so that the | ||
2891 | * grace-period kthread will do force_quiescent_state() processing? | ||
2892 | * The idea is to avoid waking up RCU core processing on such a | ||
2893 | * CPU unless the grace period has extended for too long. | ||
2894 | * | ||
2895 | * This code relies on the fact that all NO_HZ_FULL CPUs are also | ||
2896 | * CONFIG_RCU_NOCB_CPUs. | ||
2897 | */ | ||
2898 | static bool rcu_nohz_full_cpu(struct rcu_state *rsp) | ||
2899 | { | ||
2900 | #ifdef CONFIG_NO_HZ_FULL | ||
2901 | if (tick_nohz_full_cpu(smp_processor_id()) && | ||
2902 | (!rcu_gp_in_progress(rsp) || | ||
2903 | ULONG_CMP_LT(jiffies, ACCESS_ONCE(rsp->gp_start) + HZ))) | ||
2904 | return 1; | ||
2905 | #endif /* #ifdef CONFIG_NO_HZ_FULL */ | ||
2906 | return 0; | ||
2907 | } | ||
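rcu_nohz_full_cpu() above leaves a NO_HZ_FULL CPU alone only while the grace period is still young, using ULONG_CMP_LT() so that the jiffies comparison keeps working across counter wraparound. The standalone sketch below illustrates that wrap-safe comparison; the ULONG_CMP_LT() definition shown is an assumption based on its usual unsigned-subtraction form, not something taken from this diff.

#include <limits.h>
#include <stdio.h>

/* Assumed shape of the helper: "a is before b" even if the counter
 * has overflowed, decided by whether (a - b) wrapped past half range. */
#define ULONG_CMP_LT(a, b)  (ULONG_MAX / 2 < (a) - (b))

int main(void)
{
	unsigned long gp_start = ULONG_MAX - 5;   /* GP began just before wrap */
	unsigned long jiffies  = 10;              /* counter has since wrapped */
	unsigned long hz       = 1000;            /* stand-in for HZ */

	/* Still within hz ticks of gp_start despite the wrap, so the CPU may
	 * keep ignoring RCU core processing, as in rcu_nohz_full_cpu(). */
	printf("grace period still young: %d\n",
	       ULONG_CMP_LT(jiffies, gp_start + hz));
	return 0;
}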
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index 3596797b7e46..4def475336d4 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c | |||
@@ -364,9 +364,10 @@ static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) | |||
364 | rdp->n_rp_report_qs, | 364 | rdp->n_rp_report_qs, |
365 | rdp->n_rp_cb_ready, | 365 | rdp->n_rp_cb_ready, |
366 | rdp->n_rp_cpu_needs_gp); | 366 | rdp->n_rp_cpu_needs_gp); |
367 | seq_printf(m, "gpc=%ld gps=%ld nn=%ld\n", | 367 | seq_printf(m, "gpc=%ld gps=%ld nn=%ld ndw=%ld\n", |
368 | rdp->n_rp_gp_completed, | 368 | rdp->n_rp_gp_completed, |
369 | rdp->n_rp_gp_started, | 369 | rdp->n_rp_gp_started, |
370 | rdp->n_rp_nocb_defer_wakeup, | ||
370 | rdp->n_rp_need_nothing); | 371 | rdp->n_rp_need_nothing); |
371 | } | 372 | } |
372 | 373 | ||
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 6cb3dff89e2b..c54609faf233 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c | |||
@@ -128,6 +128,11 @@ struct lockdep_map rcu_sched_lock_map = | |||
128 | STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key); | 128 | STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key); |
129 | EXPORT_SYMBOL_GPL(rcu_sched_lock_map); | 129 | EXPORT_SYMBOL_GPL(rcu_sched_lock_map); |
130 | 130 | ||
131 | static struct lock_class_key rcu_callback_key; | ||
132 | struct lockdep_map rcu_callback_map = | ||
133 | STATIC_LOCKDEP_MAP_INIT("rcu_callback", &rcu_callback_key); | ||
134 | EXPORT_SYMBOL_GPL(rcu_callback_map); | ||
135 | |||
131 | int notrace debug_lockdep_rcu_enabled(void) | 136 | int notrace debug_lockdep_rcu_enabled(void) |
132 | { | 137 | { |
133 | return rcu_scheduler_active && debug_locks && | 138 | return rcu_scheduler_active && debug_locks && |
@@ -195,17 +200,6 @@ void wait_rcu_gp(call_rcu_func_t crf) | |||
195 | } | 200 | } |
196 | EXPORT_SYMBOL_GPL(wait_rcu_gp); | 201 | EXPORT_SYMBOL_GPL(wait_rcu_gp); |
197 | 202 | ||
198 | #ifdef CONFIG_PROVE_RCU | ||
199 | /* | ||
200 | * wrapper function to avoid #include problems. | ||
201 | */ | ||
202 | int rcu_my_thread_group_empty(void) | ||
203 | { | ||
204 | return thread_group_empty(current); | ||
205 | } | ||
206 | EXPORT_SYMBOL_GPL(rcu_my_thread_group_empty); | ||
207 | #endif /* #ifdef CONFIG_PROVE_RCU */ | ||
208 | |||
209 | #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD | 203 | #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD |
210 | static inline void debug_init_rcu_head(struct rcu_head *head) | 204 | static inline void debug_init_rcu_head(struct rcu_head *head) |
211 | { | 205 | { |
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 7b621409cf15..9a95c8c2af2a 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile | |||
@@ -11,9 +11,10 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) | |||
11 | CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer | 11 | CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer |
12 | endif | 12 | endif |
13 | 13 | ||
14 | obj-y += core.o proc.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o | 14 | obj-y += core.o proc.o clock.o cputime.o |
15 | obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o | ||
15 | obj-y += wait.o completion.o | 16 | obj-y += wait.o completion.o |
16 | obj-$(CONFIG_SMP) += cpupri.o | 17 | obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o |
17 | obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o | 18 | obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o |
18 | obj-$(CONFIG_SCHEDSTATS) += stats.o | 19 | obj-$(CONFIG_SCHEDSTATS) += stats.o |
19 | obj-$(CONFIG_SCHED_DEBUG) += debug.o | 20 | obj-$(CONFIG_SCHED_DEBUG) += debug.o |
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index c3ae1446461c..b30a2924ef14 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c | |||
@@ -26,9 +26,10 @@ | |||
26 | * at 0 on boot (but people really shouldn't rely on that). | 26 | * at 0 on boot (but people really shouldn't rely on that). |
27 | * | 27 | * |
28 | * cpu_clock(i) -- can be used from any context, including NMI. | 28 | * cpu_clock(i) -- can be used from any context, including NMI. |
29 | * sched_clock_cpu(i) -- must be used with local IRQs disabled (implied by NMI) | ||
30 | * local_clock() -- is cpu_clock() on the current cpu. | 29 | * local_clock() -- is cpu_clock() on the current cpu. |
31 | * | 30 | * |
31 | * sched_clock_cpu(i) | ||
32 | * | ||
32 | * How: | 33 | * How: |
33 | * | 34 | * |
34 | * The implementation either uses sched_clock() when | 35 | * The implementation either uses sched_clock() when |
@@ -50,15 +51,6 @@ | |||
50 | * Furthermore, explicit sleep and wakeup hooks allow us to account for time | 51 | * Furthermore, explicit sleep and wakeup hooks allow us to account for time |
51 | * that is otherwise invisible (TSC gets stopped). | 52 | * that is otherwise invisible (TSC gets stopped). |
52 | * | 53 | * |
53 | * | ||
54 | * Notes: | ||
55 | * | ||
56 | * The !IRQ-safetly of sched_clock() and sched_clock_cpu() comes from things | ||
57 | * like cpufreq interrupts that can change the base clock (TSC) multiplier | ||
58 | * and cause funny jumps in time -- although the filtering provided by | ||
59 | * sched_clock_cpu() should mitigate serious artifacts we cannot rely on it | ||
60 | * in general since for !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK we fully rely on | ||
61 | * sched_clock(). | ||
62 | */ | 54 | */ |
63 | #include <linux/spinlock.h> | 55 | #include <linux/spinlock.h> |
64 | #include <linux/hardirq.h> | 56 | #include <linux/hardirq.h> |
@@ -66,6 +58,8 @@ | |||
66 | #include <linux/percpu.h> | 58 | #include <linux/percpu.h> |
67 | #include <linux/ktime.h> | 59 | #include <linux/ktime.h> |
68 | #include <linux/sched.h> | 60 | #include <linux/sched.h> |
61 | #include <linux/static_key.h> | ||
62 | #include <linux/workqueue.h> | ||
69 | 63 | ||
70 | /* | 64 | /* |
71 | * Scheduler clock - returns current time in nanosec units. | 65 | * Scheduler clock - returns current time in nanosec units. |
@@ -82,7 +76,52 @@ EXPORT_SYMBOL_GPL(sched_clock); | |||
82 | __read_mostly int sched_clock_running; | 76 | __read_mostly int sched_clock_running; |
83 | 77 | ||
84 | #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK | 78 | #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK |
85 | __read_mostly int sched_clock_stable; | 79 | static struct static_key __sched_clock_stable = STATIC_KEY_INIT; |
80 | static int __sched_clock_stable_early; | ||
81 | |||
82 | int sched_clock_stable(void) | ||
83 | { | ||
84 | return static_key_false(&__sched_clock_stable); | ||
85 | } | ||
86 | |||
87 | static void __set_sched_clock_stable(void) | ||
88 | { | ||
89 | if (!sched_clock_stable()) | ||
90 | static_key_slow_inc(&__sched_clock_stable); | ||
91 | } | ||
92 | |||
93 | void set_sched_clock_stable(void) | ||
94 | { | ||
95 | __sched_clock_stable_early = 1; | ||
96 | |||
97 | smp_mb(); /* matches sched_clock_init() */ | ||
98 | |||
99 | if (!sched_clock_running) | ||
100 | return; | ||
101 | |||
102 | __set_sched_clock_stable(); | ||
103 | } | ||
104 | |||
105 | static void __clear_sched_clock_stable(struct work_struct *work) | ||
106 | { | ||
107 | /* XXX worry about clock continuity */ | ||
108 | if (sched_clock_stable()) | ||
109 | static_key_slow_dec(&__sched_clock_stable); | ||
110 | } | ||
111 | |||
112 | static DECLARE_WORK(sched_clock_work, __clear_sched_clock_stable); | ||
113 | |||
114 | void clear_sched_clock_stable(void) | ||
115 | { | ||
116 | __sched_clock_stable_early = 0; | ||
117 | |||
118 | smp_mb(); /* matches sched_clock_init() */ | ||
119 | |||
120 | if (!sched_clock_running) | ||
121 | return; | ||
122 | |||
123 | schedule_work(&sched_clock_work); | ||
124 | } | ||
86 | 125 | ||
87 | struct sched_clock_data { | 126 | struct sched_clock_data { |
88 | u64 tick_raw; | 127 | u64 tick_raw; |
@@ -116,6 +155,20 @@ void sched_clock_init(void) | |||
116 | } | 155 | } |
117 | 156 | ||
118 | sched_clock_running = 1; | 157 | sched_clock_running = 1; |
158 | |||
159 | /* | ||
160 | * Ensure that it is impossible to not do a static_key update. | ||
161 | * | ||
162 | * Either {set,clear}_sched_clock_stable() must see sched_clock_running | ||
163 | * and do the update, or we must see their __sched_clock_stable_early | ||
164 | * and do the update, or both. | ||
165 | */ | ||
166 | smp_mb(); /* matches {set,clear}_sched_clock_stable() */ | ||
167 | |||
168 | if (__sched_clock_stable_early) | ||
169 | __set_sched_clock_stable(); | ||
170 | else | ||
171 | __clear_sched_clock_stable(NULL); | ||
119 | } | 172 | } |
120 | 173 | ||
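The comment added to sched_clock_init() describes a classic store-then-load pairing: each side publishes its own flag, executes a full barrier, and only then reads the other side's flag, so at least one side is guaranteed to observe the other and perform the static-key update. The userspace sketch below uses C11 atomics in place of smp_mb() to illustrate the same ordering argument; it is an analogy under those assumptions, not the kernel code.

#include <stdatomic.h>
#include <stdio.h>

static atomic_int stable_early;    /* models __sched_clock_stable_early */
static atomic_int clock_running;   /* models sched_clock_running */

static void set_stable(void)       /* models set_sched_clock_stable() */
{
	atomic_store_explicit(&stable_early, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst); /* matches sched_clock_init() */
	if (atomic_load_explicit(&clock_running, memory_order_relaxed))
		printf("setter saw init: setter performs the update\n");
}

static void clock_init(void)       /* models sched_clock_init() */
{
	atomic_store_explicit(&clock_running, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst); /* matches the setter's fence */
	if (atomic_load_explicit(&stable_early, memory_order_relaxed))
		printf("init saw the early request: init performs the update\n");
}

int main(void)
{
	/* Whatever order the two sides run in, both fence between their
	 * store and their load, so at least one branch always fires and
	 * the static-key update can never be lost. */
	set_stable();
	clock_init();
	return 0;
}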
121 | /* | 174 | /* |
@@ -242,20 +295,20 @@ u64 sched_clock_cpu(int cpu) | |||
242 | struct sched_clock_data *scd; | 295 | struct sched_clock_data *scd; |
243 | u64 clock; | 296 | u64 clock; |
244 | 297 | ||
245 | WARN_ON_ONCE(!irqs_disabled()); | 298 | if (sched_clock_stable()) |
246 | |||
247 | if (sched_clock_stable) | ||
248 | return sched_clock(); | 299 | return sched_clock(); |
249 | 300 | ||
250 | if (unlikely(!sched_clock_running)) | 301 | if (unlikely(!sched_clock_running)) |
251 | return 0ull; | 302 | return 0ull; |
252 | 303 | ||
304 | preempt_disable_notrace(); | ||
253 | scd = cpu_sdc(cpu); | 305 | scd = cpu_sdc(cpu); |
254 | 306 | ||
255 | if (cpu != smp_processor_id()) | 307 | if (cpu != smp_processor_id()) |
256 | clock = sched_clock_remote(scd); | 308 | clock = sched_clock_remote(scd); |
257 | else | 309 | else |
258 | clock = sched_clock_local(scd); | 310 | clock = sched_clock_local(scd); |
311 | preempt_enable_notrace(); | ||
259 | 312 | ||
260 | return clock; | 313 | return clock; |
261 | } | 314 | } |
@@ -265,7 +318,7 @@ void sched_clock_tick(void) | |||
265 | struct sched_clock_data *scd; | 318 | struct sched_clock_data *scd; |
266 | u64 now, now_gtod; | 319 | u64 now, now_gtod; |
267 | 320 | ||
268 | if (sched_clock_stable) | 321 | if (sched_clock_stable()) |
269 | return; | 322 | return; |
270 | 323 | ||
271 | if (unlikely(!sched_clock_running)) | 324 | if (unlikely(!sched_clock_running)) |
@@ -316,14 +369,10 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); | |||
316 | */ | 369 | */ |
317 | u64 cpu_clock(int cpu) | 370 | u64 cpu_clock(int cpu) |
318 | { | 371 | { |
319 | u64 clock; | 372 | if (!sched_clock_stable()) |
320 | unsigned long flags; | 373 | return sched_clock_cpu(cpu); |
321 | |||
322 | local_irq_save(flags); | ||
323 | clock = sched_clock_cpu(cpu); | ||
324 | local_irq_restore(flags); | ||
325 | 374 | ||
326 | return clock; | 375 | return sched_clock(); |
327 | } | 376 | } |
328 | 377 | ||
329 | /* | 378 | /* |
@@ -335,14 +384,10 @@ u64 cpu_clock(int cpu) | |||
335 | */ | 384 | */ |
336 | u64 local_clock(void) | 385 | u64 local_clock(void) |
337 | { | 386 | { |
338 | u64 clock; | 387 | if (!sched_clock_stable()) |
339 | unsigned long flags; | 388 | return sched_clock_cpu(raw_smp_processor_id()); |
340 | 389 | ||
341 | local_irq_save(flags); | 390 | return sched_clock(); |
342 | clock = sched_clock_cpu(smp_processor_id()); | ||
343 | local_irq_restore(flags); | ||
344 | |||
345 | return clock; | ||
346 | } | 391 | } |
347 | 392 | ||
348 | #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ | 393 | #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ |
@@ -362,12 +407,12 @@ u64 sched_clock_cpu(int cpu) | |||
362 | 407 | ||
363 | u64 cpu_clock(int cpu) | 408 | u64 cpu_clock(int cpu) |
364 | { | 409 | { |
365 | return sched_clock_cpu(cpu); | 410 | return sched_clock(); |
366 | } | 411 | } |
367 | 412 | ||
368 | u64 local_clock(void) | 413 | u64 local_clock(void) |
369 | { | 414 | { |
370 | return sched_clock_cpu(0); | 415 | return sched_clock(); |
371 | } | 416 | } |
372 | 417 | ||
373 | #endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ | 418 | #endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ |
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a88f4a485c5e..f5c6635b806c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -296,8 +296,6 @@ __read_mostly int scheduler_running; | |||
296 | */ | 296 | */ |
297 | int sysctl_sched_rt_runtime = 950000; | 297 | int sysctl_sched_rt_runtime = 950000; |
298 | 298 | ||
299 | |||
300 | |||
301 | /* | 299 | /* |
302 | * __task_rq_lock - lock the rq @p resides on. | 300 | * __task_rq_lock - lock the rq @p resides on. |
303 | */ | 301 | */ |
@@ -899,7 +897,9 @@ static inline int normal_prio(struct task_struct *p) | |||
899 | { | 897 | { |
900 | int prio; | 898 | int prio; |
901 | 899 | ||
902 | if (task_has_rt_policy(p)) | 900 | if (task_has_dl_policy(p)) |
901 | prio = MAX_DL_PRIO-1; | ||
902 | else if (task_has_rt_policy(p)) | ||
903 | prio = MAX_RT_PRIO-1 - p->rt_priority; | 903 | prio = MAX_RT_PRIO-1 - p->rt_priority; |
904 | else | 904 | else |
905 | prio = __normal_prio(p); | 905 | prio = __normal_prio(p); |
@@ -945,7 +945,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, | |||
945 | if (prev_class->switched_from) | 945 | if (prev_class->switched_from) |
946 | prev_class->switched_from(rq, p); | 946 | prev_class->switched_from(rq, p); |
947 | p->sched_class->switched_to(rq, p); | 947 | p->sched_class->switched_to(rq, p); |
948 | } else if (oldprio != p->prio) | 948 | } else if (oldprio != p->prio || dl_task(p)) |
949 | p->sched_class->prio_changed(rq, p, oldprio); | 949 | p->sched_class->prio_changed(rq, p, oldprio); |
950 | } | 950 | } |
951 | 951 | ||
@@ -1108,6 +1108,7 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p) | |||
1108 | if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task))) | 1108 | if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task))) |
1109 | goto out; | 1109 | goto out; |
1110 | 1110 | ||
1111 | trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu); | ||
1111 | ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg); | 1112 | ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg); |
1112 | 1113 | ||
1113 | out: | 1114 | out: |
@@ -1499,8 +1500,7 @@ void scheduler_ipi(void) | |||
1499 | * TIF_NEED_RESCHED remotely (for the first time) will also send | 1500 | * TIF_NEED_RESCHED remotely (for the first time) will also send |
1500 | * this IPI. | 1501 | * this IPI. |
1501 | */ | 1502 | */ |
1502 | if (tif_need_resched()) | 1503 | preempt_fold_need_resched(); |
1503 | set_preempt_need_resched(); | ||
1504 | 1504 | ||
1505 | if (llist_empty(&this_rq()->wake_list) | 1505 | if (llist_empty(&this_rq()->wake_list) |
1506 | && !tick_nohz_full_cpu(smp_processor_id()) | 1506 | && !tick_nohz_full_cpu(smp_processor_id()) |
@@ -1717,6 +1717,13 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) | |||
1717 | memset(&p->se.statistics, 0, sizeof(p->se.statistics)); | 1717 | memset(&p->se.statistics, 0, sizeof(p->se.statistics)); |
1718 | #endif | 1718 | #endif |
1719 | 1719 | ||
1720 | RB_CLEAR_NODE(&p->dl.rb_node); | ||
1721 | hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
1722 | p->dl.dl_runtime = p->dl.runtime = 0; | ||
1723 | p->dl.dl_deadline = p->dl.deadline = 0; | ||
1724 | p->dl.dl_period = 0; | ||
1725 | p->dl.flags = 0; | ||
1726 | |||
1720 | INIT_LIST_HEAD(&p->rt.run_list); | 1727 | INIT_LIST_HEAD(&p->rt.run_list); |
1721 | 1728 | ||
1722 | #ifdef CONFIG_PREEMPT_NOTIFIERS | 1729 | #ifdef CONFIG_PREEMPT_NOTIFIERS |
@@ -1763,12 +1770,34 @@ void set_numabalancing_state(bool enabled) | |||
1763 | numabalancing_enabled = enabled; | 1770 | numabalancing_enabled = enabled; |
1764 | } | 1771 | } |
1765 | #endif /* CONFIG_SCHED_DEBUG */ | 1772 | #endif /* CONFIG_SCHED_DEBUG */ |
1766 | #endif /* CONFIG_NUMA_BALANCING */ | 1773 | |
1774 | #ifdef CONFIG_PROC_SYSCTL | ||
1775 | int sysctl_numa_balancing(struct ctl_table *table, int write, | ||
1776 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
1777 | { | ||
1778 | struct ctl_table t; | ||
1779 | int err; | ||
1780 | int state = numabalancing_enabled; | ||
1781 | |||
1782 | if (write && !capable(CAP_SYS_ADMIN)) | ||
1783 | return -EPERM; | ||
1784 | |||
1785 | t = *table; | ||
1786 | t.data = &state; | ||
1787 | err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); | ||
1788 | if (err < 0) | ||
1789 | return err; | ||
1790 | if (write) | ||
1791 | set_numabalancing_state(state); | ||
1792 | return err; | ||
1793 | } | ||
1794 | #endif | ||
1795 | #endif | ||
1767 | 1796 | ||
1768 | /* | 1797 | /* |
1769 | * fork()/clone()-time setup: | 1798 | * fork()/clone()-time setup: |
1770 | */ | 1799 | */ |
1771 | void sched_fork(unsigned long clone_flags, struct task_struct *p) | 1800 | int sched_fork(unsigned long clone_flags, struct task_struct *p) |
1772 | { | 1801 | { |
1773 | unsigned long flags; | 1802 | unsigned long flags; |
1774 | int cpu = get_cpu(); | 1803 | int cpu = get_cpu(); |
@@ -1790,7 +1819,7 @@ void sched_fork(unsigned long clone_flags, struct task_struct *p) | |||
1790 | * Revert to default priority/policy on fork if requested. | 1819 | * Revert to default priority/policy on fork if requested. |
1791 | */ | 1820 | */ |
1792 | if (unlikely(p->sched_reset_on_fork)) { | 1821 | if (unlikely(p->sched_reset_on_fork)) { |
1793 | if (task_has_rt_policy(p)) { | 1822 | if (task_has_dl_policy(p) || task_has_rt_policy(p)) { |
1794 | p->policy = SCHED_NORMAL; | 1823 | p->policy = SCHED_NORMAL; |
1795 | p->static_prio = NICE_TO_PRIO(0); | 1824 | p->static_prio = NICE_TO_PRIO(0); |
1796 | p->rt_priority = 0; | 1825 | p->rt_priority = 0; |
@@ -1807,8 +1836,14 @@ void sched_fork(unsigned long clone_flags, struct task_struct *p) | |||
1807 | p->sched_reset_on_fork = 0; | 1836 | p->sched_reset_on_fork = 0; |
1808 | } | 1837 | } |
1809 | 1838 | ||
1810 | if (!rt_prio(p->prio)) | 1839 | if (dl_prio(p->prio)) { |
1840 | put_cpu(); | ||
1841 | return -EAGAIN; | ||
1842 | } else if (rt_prio(p->prio)) { | ||
1843 | p->sched_class = &rt_sched_class; | ||
1844 | } else { | ||
1811 | p->sched_class = &fair_sched_class; | 1845 | p->sched_class = &fair_sched_class; |
1846 | } | ||
1812 | 1847 | ||
1813 | if (p->sched_class->task_fork) | 1848 | if (p->sched_class->task_fork) |
1814 | p->sched_class->task_fork(p); | 1849 | p->sched_class->task_fork(p); |
@@ -1834,11 +1869,124 @@ void sched_fork(unsigned long clone_flags, struct task_struct *p) | |||
1834 | init_task_preempt_count(p); | 1869 | init_task_preempt_count(p); |
1835 | #ifdef CONFIG_SMP | 1870 | #ifdef CONFIG_SMP |
1836 | plist_node_init(&p->pushable_tasks, MAX_PRIO); | 1871 | plist_node_init(&p->pushable_tasks, MAX_PRIO); |
1872 | RB_CLEAR_NODE(&p->pushable_dl_tasks); | ||
1837 | #endif | 1873 | #endif |
1838 | 1874 | ||
1839 | put_cpu(); | 1875 | put_cpu(); |
1876 | return 0; | ||
1877 | } | ||
1878 | |||
1879 | unsigned long to_ratio(u64 period, u64 runtime) | ||
1880 | { | ||
1881 | if (runtime == RUNTIME_INF) | ||
1882 | return 1ULL << 20; | ||
1883 | |||
1884 | /* | ||
1885 | * Doing this here saves a lot of checks in all | ||
1886 | * the calling paths, and returning zero seems | ||
1887 | * safe for them anyway. | ||
1888 | */ | ||
1889 | if (period == 0) | ||
1890 | return 0; | ||
1891 | |||
1892 | return div64_u64(runtime << 20, period); | ||
1840 | } | 1893 | } |
1841 | 1894 | ||
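to_ratio() above expresses runtime/period as a fixed-point utilization with 20 fractional bits, so 1ULL << 20 stands for 100% of one CPU. Below is a worked example as standalone C; div64_u64() is replaced by a plain 64-bit division, RUNTIME_INF is assumed to be the all-ones sentinel, and the 1000000us period used for the 95% figure is the usual companion of the sysctl_sched_rt_runtime = 950000 default seen earlier in this diff, not a value defined in this hunk.

#include <stdint.h>
#include <stdio.h>

#define RUNTIME_INF ((uint64_t)~0ULL)           /* assumed sentinel value */

static uint64_t to_ratio(uint64_t period, uint64_t runtime)
{
	if (runtime == RUNTIME_INF)
		return 1ULL << 20;              /* unlimited == 100% */
	if (period == 0)
		return 0;                       /* see the comment above */
	return (runtime << 20) / period;        /* div64_u64() in the kernel */
}

int main(void)
{
	/* 10ms of runtime every 100ms: 0.10 * 2^20 = 104857 (truncated). */
	printf("10%% task  -> %llu\n",
	       (unsigned long long)to_ratio(100000000ULL, 10000000ULL));
	/* 950000us runtime in a 1000000us period: ~0.95 * 2^20 = 996147. */
	printf("95%% limit -> %llu\n",
	       (unsigned long long)to_ratio(1000000ULL, 950000ULL));
	return 0;
}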
1895 | #ifdef CONFIG_SMP | ||
1896 | inline struct dl_bw *dl_bw_of(int i) | ||
1897 | { | ||
1898 | return &cpu_rq(i)->rd->dl_bw; | ||
1899 | } | ||
1900 | |||
1901 | static inline int dl_bw_cpus(int i) | ||
1902 | { | ||
1903 | struct root_domain *rd = cpu_rq(i)->rd; | ||
1904 | int cpus = 0; | ||
1905 | |||
1906 | for_each_cpu_and(i, rd->span, cpu_active_mask) | ||
1907 | cpus++; | ||
1908 | |||
1909 | return cpus; | ||
1910 | } | ||
1911 | #else | ||
1912 | inline struct dl_bw *dl_bw_of(int i) | ||
1913 | { | ||
1914 | return &cpu_rq(i)->dl.dl_bw; | ||
1915 | } | ||
1916 | |||
1917 | static inline int dl_bw_cpus(int i) | ||
1918 | { | ||
1919 | return 1; | ||
1920 | } | ||
1921 | #endif | ||
1922 | |||
1923 | static inline | ||
1924 | void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw) | ||
1925 | { | ||
1926 | dl_b->total_bw -= tsk_bw; | ||
1927 | } | ||
1928 | |||
1929 | static inline | ||
1930 | void __dl_add(struct dl_bw *dl_b, u64 tsk_bw) | ||
1931 | { | ||
1932 | dl_b->total_bw += tsk_bw; | ||
1933 | } | ||
1934 | |||
1935 | static inline | ||
1936 | bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) | ||
1937 | { | ||
1938 | return dl_b->bw != -1 && | ||
1939 | dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; | ||
1940 | } | ||
1941 | |||
1942 | /* | ||
1943 | * We must be sure that accepting a new task (or allowing changing the | ||
1944 | * parameters of an existing one) is consistent with the bandwidth | ||
1945 | * constraints. If yes, this function also accordingly updates the currently | ||
1946 | * allocated bandwidth to reflect the new situation. | ||
1947 | * | ||
1948 | * This function is called while holding p's rq->lock. | ||
1949 | */ | ||
1950 | static int dl_overflow(struct task_struct *p, int policy, | ||
1951 | const struct sched_attr *attr) | ||
1952 | { | ||
1953 | |||
1954 | struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); | ||
1955 | u64 period = attr->sched_period ?: attr->sched_deadline; | ||
1956 | u64 runtime = attr->sched_runtime; | ||
1957 | u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0; | ||
1958 | int cpus, err = -1; | ||
1959 | |||
1960 | if (new_bw == p->dl.dl_bw) | ||
1961 | return 0; | ||
1962 | |||
1963 | /* | ||
1964 | * Whether a task enters, leaves, or stays -deadline but changes | ||
1965 | * its parameters, we may need to update the total allocated | ||
1966 | * bandwidth of the container accordingly. | ||
1967 | */ | ||
1968 | raw_spin_lock(&dl_b->lock); | ||
1969 | cpus = dl_bw_cpus(task_cpu(p)); | ||
1970 | if (dl_policy(policy) && !task_has_dl_policy(p) && | ||
1971 | !__dl_overflow(dl_b, cpus, 0, new_bw)) { | ||
1972 | __dl_add(dl_b, new_bw); | ||
1973 | err = 0; | ||
1974 | } else if (dl_policy(policy) && task_has_dl_policy(p) && | ||
1975 | !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) { | ||
1976 | __dl_clear(dl_b, p->dl.dl_bw); | ||
1977 | __dl_add(dl_b, new_bw); | ||
1978 | err = 0; | ||
1979 | } else if (!dl_policy(policy) && task_has_dl_policy(p)) { | ||
1980 | __dl_clear(dl_b, p->dl.dl_bw); | ||
1981 | err = 0; | ||
1982 | } | ||
1983 | raw_spin_unlock(&dl_b->lock); | ||
1984 | |||
1985 | return err; | ||
1986 | } | ||
1987 | |||
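dl_overflow() above is the admission test: the task's requested bandwidth, to_ratio(period, runtime), is accepted only if the root domain's running total stays within dl_b->bw per CPU times the number of CPUs; otherwise the setscheduler path below returns -EBUSY. The standalone sketch replays that arithmetic for a hypothetical 4-CPU domain; the 0.95-per-CPU cap is assumed to come from the RT throttling defaults and is not defined in this hunk.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t to_ratio(uint64_t period, uint64_t runtime)
{
	return period ? (runtime << 20) / period : 0;
}

/* Mirrors __dl_overflow(): reject when cap * cpus would be exceeded. */
static bool dl_overflow(uint64_t cap, int cpus, uint64_t total,
			uint64_t old_bw, uint64_t new_bw)
{
	return cap != (uint64_t)-1 && cap * cpus < total - old_bw + new_bw;
}

int main(void)
{
	uint64_t cap = to_ratio(1000000, 950000);               /* ~0.95/CPU, assumed default */
	uint64_t task_bw = to_ratio(100000000ULL, 30000000ULL); /* each task asks for 30% */
	uint64_t total = 0;
	int cpus = 4, admitted = 0;

	for (int i = 0; i < 20; i++) {
		if (dl_overflow(cap, cpus, total, 0, task_bw))
			break;                  /* the kernel returns -EBUSY here */
		total += task_bw;               /* __dl_add() */
		admitted++;
	}
	/* 12 tasks fit: 12 * 0.3 = 3.6 CPUs of bandwidth under the ~3.8 cap. */
	printf("admitted %d tasks, %llu of %llu\n", admitted,
	       (unsigned long long)total, (unsigned long long)(cap * cpus));
	return 0;
}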
1988 | extern void init_dl_bw(struct dl_bw *dl_b); | ||
1989 | |||
1842 | /* | 1990 | /* |
1843 | * wake_up_new_task - wake up a newly created task for the first time. | 1991 | * wake_up_new_task - wake up a newly created task for the first time. |
1844 | * | 1992 | * |
@@ -2003,6 +2151,9 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) | |||
2003 | if (unlikely(prev_state == TASK_DEAD)) { | 2151 | if (unlikely(prev_state == TASK_DEAD)) { |
2004 | task_numa_free(prev); | 2152 | task_numa_free(prev); |
2005 | 2153 | ||
2154 | if (prev->sched_class->task_dead) | ||
2155 | prev->sched_class->task_dead(prev); | ||
2156 | |||
2006 | /* | 2157 | /* |
2007 | * Remove function-return probe instances associated with this | 2158 | * Remove function-return probe instances associated with this |
2008 | * task and put them back on the free list. | 2159 | * task and put them back on the free list. |
@@ -2296,7 +2447,7 @@ void scheduler_tick(void) | |||
2296 | 2447 | ||
2297 | #ifdef CONFIG_SMP | 2448 | #ifdef CONFIG_SMP |
2298 | rq->idle_balance = idle_cpu(cpu); | 2449 | rq->idle_balance = idle_cpu(cpu); |
2299 | trigger_load_balance(rq, cpu); | 2450 | trigger_load_balance(rq); |
2300 | #endif | 2451 | #endif |
2301 | rq_last_tick_reset(rq); | 2452 | rq_last_tick_reset(rq); |
2302 | } | 2453 | } |
@@ -2325,7 +2476,7 @@ u64 scheduler_tick_max_deferment(void) | |||
2325 | if (time_before_eq(next, now)) | 2476 | if (time_before_eq(next, now)) |
2326 | return 0; | 2477 | return 0; |
2327 | 2478 | ||
2328 | return jiffies_to_usecs(next - now) * NSEC_PER_USEC; | 2479 | return jiffies_to_nsecs(next - now); |
2329 | } | 2480 | } |
2330 | #endif | 2481 | #endif |
2331 | 2482 | ||
@@ -2414,10 +2565,10 @@ static inline void schedule_debug(struct task_struct *prev) | |||
2414 | { | 2565 | { |
2415 | /* | 2566 | /* |
2416 | * Test if we are atomic. Since do_exit() needs to call into | 2567 | * Test if we are atomic. Since do_exit() needs to call into |
2417 | * schedule() atomically, we ignore that path for now. | 2568 | * schedule() atomically, we ignore that path. Otherwise whine |
2418 | * Otherwise, whine if we are scheduling when we should not be. | 2569 | * if we are scheduling when we should not. |
2419 | */ | 2570 | */ |
2420 | if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) | 2571 | if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD)) |
2421 | __schedule_bug(prev); | 2572 | __schedule_bug(prev); |
2422 | rcu_sleep_check(); | 2573 | rcu_sleep_check(); |
2423 | 2574 | ||
@@ -2761,11 +2912,11 @@ EXPORT_SYMBOL(sleep_on_timeout); | |||
2761 | */ | 2912 | */ |
2762 | void rt_mutex_setprio(struct task_struct *p, int prio) | 2913 | void rt_mutex_setprio(struct task_struct *p, int prio) |
2763 | { | 2914 | { |
2764 | int oldprio, on_rq, running; | 2915 | int oldprio, on_rq, running, enqueue_flag = 0; |
2765 | struct rq *rq; | 2916 | struct rq *rq; |
2766 | const struct sched_class *prev_class; | 2917 | const struct sched_class *prev_class; |
2767 | 2918 | ||
2768 | BUG_ON(prio < 0 || prio > MAX_PRIO); | 2919 | BUG_ON(prio > MAX_PRIO); |
2769 | 2920 | ||
2770 | rq = __task_rq_lock(p); | 2921 | rq = __task_rq_lock(p); |
2771 | 2922 | ||
@@ -2788,6 +2939,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
2788 | } | 2939 | } |
2789 | 2940 | ||
2790 | trace_sched_pi_setprio(p, prio); | 2941 | trace_sched_pi_setprio(p, prio); |
2942 | p->pi_top_task = rt_mutex_get_top_task(p); | ||
2791 | oldprio = p->prio; | 2943 | oldprio = p->prio; |
2792 | prev_class = p->sched_class; | 2944 | prev_class = p->sched_class; |
2793 | on_rq = p->on_rq; | 2945 | on_rq = p->on_rq; |
@@ -2797,23 +2949,49 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
2797 | if (running) | 2949 | if (running) |
2798 | p->sched_class->put_prev_task(rq, p); | 2950 | p->sched_class->put_prev_task(rq, p); |
2799 | 2951 | ||
2800 | if (rt_prio(prio)) | 2952 | /* |
2953 | * Boosting conditions are: | ||
2954 | * 1. -rt task is running and holds mutex A | ||
2955 | * --> -dl task blocks on mutex A | ||
2956 | * | ||
2957 | * 2. -dl task is running and holds mutex A | ||
2958 | * --> -dl task blocks on mutex A and could preempt the | ||
2959 | * running task | ||
2960 | */ | ||
2961 | if (dl_prio(prio)) { | ||
2962 | if (!dl_prio(p->normal_prio) || (p->pi_top_task && | ||
2963 | dl_entity_preempt(&p->pi_top_task->dl, &p->dl))) { | ||
2964 | p->dl.dl_boosted = 1; | ||
2965 | p->dl.dl_throttled = 0; | ||
2966 | enqueue_flag = ENQUEUE_REPLENISH; | ||
2967 | } else | ||
2968 | p->dl.dl_boosted = 0; | ||
2969 | p->sched_class = &dl_sched_class; | ||
2970 | } else if (rt_prio(prio)) { | ||
2971 | if (dl_prio(oldprio)) | ||
2972 | p->dl.dl_boosted = 0; | ||
2973 | if (oldprio < prio) | ||
2974 | enqueue_flag = ENQUEUE_HEAD; | ||
2801 | p->sched_class = &rt_sched_class; | 2975 | p->sched_class = &rt_sched_class; |
2802 | else | 2976 | } else { |
2977 | if (dl_prio(oldprio)) | ||
2978 | p->dl.dl_boosted = 0; | ||
2803 | p->sched_class = &fair_sched_class; | 2979 | p->sched_class = &fair_sched_class; |
2980 | } | ||
2804 | 2981 | ||
2805 | p->prio = prio; | 2982 | p->prio = prio; |
2806 | 2983 | ||
2807 | if (running) | 2984 | if (running) |
2808 | p->sched_class->set_curr_task(rq); | 2985 | p->sched_class->set_curr_task(rq); |
2809 | if (on_rq) | 2986 | if (on_rq) |
2810 | enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); | 2987 | enqueue_task(rq, p, enqueue_flag); |
2811 | 2988 | ||
2812 | check_class_changed(rq, p, prev_class, oldprio); | 2989 | check_class_changed(rq, p, prev_class, oldprio); |
2813 | out_unlock: | 2990 | out_unlock: |
2814 | __task_rq_unlock(rq); | 2991 | __task_rq_unlock(rq); |
2815 | } | 2992 | } |
2816 | #endif | 2993 | #endif |
2994 | |||
2817 | void set_user_nice(struct task_struct *p, long nice) | 2995 | void set_user_nice(struct task_struct *p, long nice) |
2818 | { | 2996 | { |
2819 | int old_prio, delta, on_rq; | 2997 | int old_prio, delta, on_rq; |
@@ -2831,9 +3009,9 @@ void set_user_nice(struct task_struct *p, long nice) | |||
2831 | * The RT priorities are set via sched_setscheduler(), but we still | 3009 | * The RT priorities are set via sched_setscheduler(), but we still |
2832 | * allow the 'normal' nice value to be set - but as expected | 3010 | * allow the 'normal' nice value to be set - but as expected |
2833 | * it wont have any effect on scheduling until the task is | 3011 | * it wont have any effect on scheduling until the task is |
2834 | * SCHED_FIFO/SCHED_RR: | 3012 | * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR: |
2835 | */ | 3013 | */ |
2836 | if (task_has_rt_policy(p)) { | 3014 | if (task_has_dl_policy(p) || task_has_rt_policy(p)) { |
2837 | p->static_prio = NICE_TO_PRIO(nice); | 3015 | p->static_prio = NICE_TO_PRIO(nice); |
2838 | goto out_unlock; | 3016 | goto out_unlock; |
2839 | } | 3017 | } |
@@ -2988,22 +3166,95 @@ static struct task_struct *find_process_by_pid(pid_t pid) | |||
2988 | return pid ? find_task_by_vpid(pid) : current; | 3166 | return pid ? find_task_by_vpid(pid) : current; |
2989 | } | 3167 | } |
2990 | 3168 | ||
2991 | /* Actually do priority change: must hold rq lock. */ | 3169 | /* |
3170 | * This function initializes the sched_dl_entity of a newly becoming | ||
3171 | * SCHED_DEADLINE task. | ||
3172 | * | ||
3173 | * Only the static values are considered here, the actual runtime and the | ||
3174 | * absolute deadline will be properly calculated when the task is enqueued | ||
3175 | * for the first time with its new policy. | ||
3176 | */ | ||
2992 | static void | 3177 | static void |
2993 | __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) | 3178 | __setparam_dl(struct task_struct *p, const struct sched_attr *attr) |
2994 | { | 3179 | { |
3180 | struct sched_dl_entity *dl_se = &p->dl; | ||
3181 | |||
3182 | init_dl_task_timer(dl_se); | ||
3183 | dl_se->dl_runtime = attr->sched_runtime; | ||
3184 | dl_se->dl_deadline = attr->sched_deadline; | ||
3185 | dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; | ||
3186 | dl_se->flags = attr->sched_flags; | ||
3187 | dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); | ||
3188 | dl_se->dl_throttled = 0; | ||
3189 | dl_se->dl_new = 1; | ||
3190 | } | ||
3191 | |||
3192 | /* Actually do priority change: must hold pi & rq lock. */ | ||
3193 | static void __setscheduler(struct rq *rq, struct task_struct *p, | ||
3194 | const struct sched_attr *attr) | ||
3195 | { | ||
3196 | int policy = attr->sched_policy; | ||
3197 | |||
3198 | if (policy == -1) /* setparam */ | ||
3199 | policy = p->policy; | ||
3200 | |||
2995 | p->policy = policy; | 3201 | p->policy = policy; |
2996 | p->rt_priority = prio; | 3202 | |
3203 | if (dl_policy(policy)) | ||
3204 | __setparam_dl(p, attr); | ||
3205 | else if (fair_policy(policy)) | ||
3206 | p->static_prio = NICE_TO_PRIO(attr->sched_nice); | ||
3207 | |||
3208 | /* | ||
3209 | * __sched_setscheduler() ensures attr->sched_priority == 0 when | ||
3210 | * !rt_policy. Always setting this ensures that things like | ||
3211 | * getparam()/getattr() don't report silly values for !rt tasks. | ||
3212 | */ | ||
3213 | p->rt_priority = attr->sched_priority; | ||
3214 | |||
2997 | p->normal_prio = normal_prio(p); | 3215 | p->normal_prio = normal_prio(p); |
2998 | /* we are holding p->pi_lock already */ | ||
2999 | p->prio = rt_mutex_getprio(p); | 3216 | p->prio = rt_mutex_getprio(p); |
3000 | if (rt_prio(p->prio)) | 3217 | |
3218 | if (dl_prio(p->prio)) | ||
3219 | p->sched_class = &dl_sched_class; | ||
3220 | else if (rt_prio(p->prio)) | ||
3001 | p->sched_class = &rt_sched_class; | 3221 | p->sched_class = &rt_sched_class; |
3002 | else | 3222 | else |
3003 | p->sched_class = &fair_sched_class; | 3223 | p->sched_class = &fair_sched_class; |
3224 | |||
3004 | set_load_weight(p); | 3225 | set_load_weight(p); |
3005 | } | 3226 | } |
3006 | 3227 | ||
3228 | static void | ||
3229 | __getparam_dl(struct task_struct *p, struct sched_attr *attr) | ||
3230 | { | ||
3231 | struct sched_dl_entity *dl_se = &p->dl; | ||
3232 | |||
3233 | attr->sched_priority = p->rt_priority; | ||
3234 | attr->sched_runtime = dl_se->dl_runtime; | ||
3235 | attr->sched_deadline = dl_se->dl_deadline; | ||
3236 | attr->sched_period = dl_se->dl_period; | ||
3237 | attr->sched_flags = dl_se->flags; | ||
3238 | } | ||
3239 | |||
3240 | /* | ||
3241 | * This function validates the new parameters of a -deadline task. | ||
3242 | * We ask for the deadline not being zero, and no smaller than | ||
3243 | * the runtime, as well as the period being either zero or no | ||
3244 | * smaller than the deadline. Furthermore, we have to be sure that | ||
3245 | * user parameters are above the internal resolution (1us); we | ||
3246 | * check sched_runtime only since it is always the smaller one. | ||
3247 | */ | ||
3248 | static bool | ||
3249 | __checkparam_dl(const struct sched_attr *attr) | ||
3250 | { | ||
3251 | return attr && attr->sched_deadline != 0 && | ||
3252 | (attr->sched_period == 0 || | ||
3253 | (s64)(attr->sched_period - attr->sched_deadline) >= 0) && | ||
3254 | (s64)(attr->sched_deadline - attr->sched_runtime ) >= 0 && | ||
3255 | attr->sched_runtime >= (2 << (DL_SCALE - 1)); | ||
3256 | } | ||
3257 | |||
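A quick way to read __checkparam_dl() is runtime <= deadline <= period (with period == 0 meaning "defaults to the deadline", as __setparam_dl() does above), plus a floor of roughly 1us on the runtime. The sketch below re-implements the check against a simplified stand-in structure; DL_SCALE = 10 is an assumed value, not defined in this hunk, which gives a 2 << 9 = 1024ns floor consistent with the "1us" comment.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define DL_SCALE 10                      /* assumed; not part of this hunk */

struct dl_params {                       /* simplified stand-in for sched_attr */
	uint64_t sched_runtime;
	uint64_t sched_deadline;
	uint64_t sched_period;
};

static bool checkparam_dl(const struct dl_params *attr)
{
	return attr && attr->sched_deadline != 0 &&
	       (attr->sched_period == 0 ||
		(int64_t)(attr->sched_period - attr->sched_deadline) >= 0) &&
	       (int64_t)(attr->sched_deadline - attr->sched_runtime) >= 0 &&
	       attr->sched_runtime >= (2 << (DL_SCALE - 1));
}

int main(void)
{
	struct dl_params ok  = { 10000000, 100000000, 100000000 }; /* 10ms/100ms/100ms */
	struct dl_params bad = { 10000000,   5000000, 100000000 }; /* runtime > deadline */

	printf("ok:  %d\n", checkparam_dl(&ok));   /* prints 1 */
	printf("bad: %d\n", checkparam_dl(&bad));  /* prints 0 */
	return 0;
}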
3007 | /* | 3258 | /* |
3008 | * check the target process has a UID that matches the current process's | 3259 | * check the target process has a UID that matches the current process's |
3009 | */ | 3260 | */ |
@@ -3020,10 +3271,12 @@ static bool check_same_owner(struct task_struct *p) | |||
3020 | return match; | 3271 | return match; |
3021 | } | 3272 | } |
3022 | 3273 | ||
3023 | static int __sched_setscheduler(struct task_struct *p, int policy, | 3274 | static int __sched_setscheduler(struct task_struct *p, |
3024 | const struct sched_param *param, bool user) | 3275 | const struct sched_attr *attr, |
3276 | bool user) | ||
3025 | { | 3277 | { |
3026 | int retval, oldprio, oldpolicy = -1, on_rq, running; | 3278 | int retval, oldprio, oldpolicy = -1, on_rq, running; |
3279 | int policy = attr->sched_policy; | ||
3027 | unsigned long flags; | 3280 | unsigned long flags; |
3028 | const struct sched_class *prev_class; | 3281 | const struct sched_class *prev_class; |
3029 | struct rq *rq; | 3282 | struct rq *rq; |
@@ -3037,31 +3290,40 @@ recheck: | |||
3037 | reset_on_fork = p->sched_reset_on_fork; | 3290 | reset_on_fork = p->sched_reset_on_fork; |
3038 | policy = oldpolicy = p->policy; | 3291 | policy = oldpolicy = p->policy; |
3039 | } else { | 3292 | } else { |
3040 | reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); | 3293 | reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK); |
3041 | policy &= ~SCHED_RESET_ON_FORK; | ||
3042 | 3294 | ||
3043 | if (policy != SCHED_FIFO && policy != SCHED_RR && | 3295 | if (policy != SCHED_DEADLINE && |
3296 | policy != SCHED_FIFO && policy != SCHED_RR && | ||
3044 | policy != SCHED_NORMAL && policy != SCHED_BATCH && | 3297 | policy != SCHED_NORMAL && policy != SCHED_BATCH && |
3045 | policy != SCHED_IDLE) | 3298 | policy != SCHED_IDLE) |
3046 | return -EINVAL; | 3299 | return -EINVAL; |
3047 | } | 3300 | } |
3048 | 3301 | ||
3302 | if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK)) | ||
3303 | return -EINVAL; | ||
3304 | |||
3049 | /* | 3305 | /* |
3050 | * Valid priorities for SCHED_FIFO and SCHED_RR are | 3306 | * Valid priorities for SCHED_FIFO and SCHED_RR are |
3051 | * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, | 3307 | * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, |
3052 | * SCHED_BATCH and SCHED_IDLE is 0. | 3308 | * SCHED_BATCH and SCHED_IDLE is 0. |
3053 | */ | 3309 | */ |
3054 | if (param->sched_priority < 0 || | 3310 | if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) || |
3055 | (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || | 3311 | (!p->mm && attr->sched_priority > MAX_RT_PRIO-1)) |
3056 | (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) | ||
3057 | return -EINVAL; | 3312 | return -EINVAL; |
3058 | if (rt_policy(policy) != (param->sched_priority != 0)) | 3313 | if ((dl_policy(policy) && !__checkparam_dl(attr)) || |
3314 | (rt_policy(policy) != (attr->sched_priority != 0))) | ||
3059 | return -EINVAL; | 3315 | return -EINVAL; |
3060 | 3316 | ||
3061 | /* | 3317 | /* |
3062 | * Allow unprivileged RT tasks to decrease priority: | 3318 | * Allow unprivileged RT tasks to decrease priority: |
3063 | */ | 3319 | */ |
3064 | if (user && !capable(CAP_SYS_NICE)) { | 3320 | if (user && !capable(CAP_SYS_NICE)) { |
3321 | if (fair_policy(policy)) { | ||
3322 | if (attr->sched_nice < TASK_NICE(p) && | ||
3323 | !can_nice(p, attr->sched_nice)) | ||
3324 | return -EPERM; | ||
3325 | } | ||
3326 | |||
3065 | if (rt_policy(policy)) { | 3327 | if (rt_policy(policy)) { |
3066 | unsigned long rlim_rtprio = | 3328 | unsigned long rlim_rtprio = |
3067 | task_rlimit(p, RLIMIT_RTPRIO); | 3329 | task_rlimit(p, RLIMIT_RTPRIO); |
@@ -3071,11 +3333,20 @@ recheck: | |||
3071 | return -EPERM; | 3333 | return -EPERM; |
3072 | 3334 | ||
3073 | /* can't increase priority */ | 3335 | /* can't increase priority */ |
3074 | if (param->sched_priority > p->rt_priority && | 3336 | if (attr->sched_priority > p->rt_priority && |
3075 | param->sched_priority > rlim_rtprio) | 3337 | attr->sched_priority > rlim_rtprio) |
3076 | return -EPERM; | 3338 | return -EPERM; |
3077 | } | 3339 | } |
3078 | 3340 | ||
3341 | /* | ||
3342 | * Can't set/change SCHED_DEADLINE policy at all for now | ||
3343 | * (safest behavior); in the future we would like to allow | ||
3344 | * unprivileged DL tasks to increase their relative deadline | ||
3345 | * or reduce their runtime (both ways reducing utilization) | ||
3346 | */ | ||
3347 | if (dl_policy(policy)) | ||
3348 | return -EPERM; | ||
3349 | |||
3079 | /* | 3350 | /* |
3080 | * Treat SCHED_IDLE as nice 20. Only allow a switch to | 3351 | * Treat SCHED_IDLE as nice 20. Only allow a switch to |
3081 | * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. | 3352 | * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. |
@@ -3120,14 +3391,21 @@ recheck: | |||
3120 | /* | 3391 | /* |
3121 | * If not changing anything there's no need to proceed further: | 3392 | * If not changing anything there's no need to proceed further: |
3122 | */ | 3393 | */ |
3123 | if (unlikely(policy == p->policy && (!rt_policy(policy) || | 3394 | if (unlikely(policy == p->policy)) { |
3124 | param->sched_priority == p->rt_priority))) { | 3395 | if (fair_policy(policy) && attr->sched_nice != TASK_NICE(p)) |
3396 | goto change; | ||
3397 | if (rt_policy(policy) && attr->sched_priority != p->rt_priority) | ||
3398 | goto change; | ||
3399 | if (dl_policy(policy)) | ||
3400 | goto change; | ||
3401 | |||
3125 | task_rq_unlock(rq, p, &flags); | 3402 | task_rq_unlock(rq, p, &flags); |
3126 | return 0; | 3403 | return 0; |
3127 | } | 3404 | } |
3405 | change: | ||
3128 | 3406 | ||
3129 | #ifdef CONFIG_RT_GROUP_SCHED | ||
3130 | if (user) { | 3407 | if (user) { |
3408 | #ifdef CONFIG_RT_GROUP_SCHED | ||
3131 | /* | 3409 | /* |
3132 | * Do not allow realtime tasks into groups that have no runtime | 3410 | * Do not allow realtime tasks into groups that have no runtime |
3133 | * assigned. | 3411 | * assigned. |
@@ -3138,8 +3416,24 @@ recheck: | |||
3138 | task_rq_unlock(rq, p, &flags); | 3416 | task_rq_unlock(rq, p, &flags); |
3139 | return -EPERM; | 3417 | return -EPERM; |
3140 | } | 3418 | } |
3141 | } | ||
3142 | #endif | 3419 | #endif |
3420 | #ifdef CONFIG_SMP | ||
3421 | if (dl_bandwidth_enabled() && dl_policy(policy)) { | ||
3422 | cpumask_t *span = rq->rd->span; | ||
3423 | |||
3424 | /* | ||
3425 | * Don't allow tasks with an affinity mask smaller than | ||
3426 | * the entire root_domain to become SCHED_DEADLINE. We | ||
3427 | * will also fail if there's no bandwidth available. | ||
3428 | */ | ||
3429 | if (!cpumask_subset(span, &p->cpus_allowed) || | ||
3430 | rq->rd->dl_bw.bw == 0) { | ||
3431 | task_rq_unlock(rq, p, &flags); | ||
3432 | return -EPERM; | ||
3433 | } | ||
3434 | } | ||
3435 | #endif | ||
3436 | } | ||
3143 | 3437 | ||
3144 | /* recheck policy now with rq lock held */ | 3438 | /* recheck policy now with rq lock held */ |
3145 | if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { | 3439 | if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { |
@@ -3147,6 +3441,17 @@ recheck: | |||
3147 | task_rq_unlock(rq, p, &flags); | 3441 | task_rq_unlock(rq, p, &flags); |
3148 | goto recheck; | 3442 | goto recheck; |
3149 | } | 3443 | } |
3444 | |||
3445 | /* | ||
3446 | * If setscheduling to SCHED_DEADLINE (or changing the parameters | ||
3447 | * of a SCHED_DEADLINE task) we need to check if enough bandwidth | ||
3448 | * is available. | ||
3449 | */ | ||
3450 | if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) { | ||
3451 | task_rq_unlock(rq, p, &flags); | ||
3452 | return -EBUSY; | ||
3453 | } | ||
3454 | |||
3150 | on_rq = p->on_rq; | 3455 | on_rq = p->on_rq; |
3151 | running = task_current(rq, p); | 3456 | running = task_current(rq, p); |
3152 | if (on_rq) | 3457 | if (on_rq) |
@@ -3158,7 +3463,7 @@ recheck: | |||
3158 | 3463 | ||
3159 | oldprio = p->prio; | 3464 | oldprio = p->prio; |
3160 | prev_class = p->sched_class; | 3465 | prev_class = p->sched_class; |
3161 | __setscheduler(rq, p, policy, param->sched_priority); | 3466 | __setscheduler(rq, p, attr); |
3162 | 3467 | ||
3163 | if (running) | 3468 | if (running) |
3164 | p->sched_class->set_curr_task(rq); | 3469 | p->sched_class->set_curr_task(rq); |
@@ -3173,6 +3478,26 @@ recheck: | |||
3173 | return 0; | 3478 | return 0; |
3174 | } | 3479 | } |
3175 | 3480 | ||
3481 | static int _sched_setscheduler(struct task_struct *p, int policy, | ||
3482 | const struct sched_param *param, bool check) | ||
3483 | { | ||
3484 | struct sched_attr attr = { | ||
3485 | .sched_policy = policy, | ||
3486 | .sched_priority = param->sched_priority, | ||
3487 | .sched_nice = PRIO_TO_NICE(p->static_prio), | ||
3488 | }; | ||
3489 | |||
3490 | /* | ||
3491 | * Fixup the legacy SCHED_RESET_ON_FORK hack | ||
3492 | */ | ||
3493 | if (policy & SCHED_RESET_ON_FORK) { | ||
3494 | attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; | ||
3495 | policy &= ~SCHED_RESET_ON_FORK; | ||
3496 | attr.sched_policy = policy; | ||
3497 | } | ||
3498 | |||
3499 | return __sched_setscheduler(p, &attr, check); | ||
3500 | } | ||
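For illustration only (not part of the patch): a legacy in-kernel caller keeps working unchanged, with _sched_setscheduler() translating its request into a struct sched_attr before __sched_setscheduler() ever sees it. The helper below is hypothetical and exists purely to trace that mapping.

	/* hypothetical caller, shown only to trace the legacy mapping */
	static int example_make_fifo(struct task_struct *p)
	{
		struct sched_param fifo = { .sched_priority = 10 };

		/*
		 * _sched_setscheduler() builds:
		 *   attr.sched_policy   = SCHED_FIFO
		 *   attr.sched_flags    = SCHED_FLAG_RESET_ON_FORK
		 *   attr.sched_priority = 10
		 */
		return sched_setscheduler(p, SCHED_FIFO | SCHED_RESET_ON_FORK, &fifo);
	}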
3176 | /** | 3501 | /** |
3177 | * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. | 3502 | * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. |
3178 | * @p: the task in question. | 3503 | * @p: the task in question. |
@@ -3186,10 +3511,16 @@ recheck: | |||
3186 | int sched_setscheduler(struct task_struct *p, int policy, | 3511 | int sched_setscheduler(struct task_struct *p, int policy, |
3187 | const struct sched_param *param) | 3512 | const struct sched_param *param) |
3188 | { | 3513 | { |
3189 | return __sched_setscheduler(p, policy, param, true); | 3514 | return _sched_setscheduler(p, policy, param, true); |
3190 | } | 3515 | } |
3191 | EXPORT_SYMBOL_GPL(sched_setscheduler); | 3516 | EXPORT_SYMBOL_GPL(sched_setscheduler); |
3192 | 3517 | ||
3518 | int sched_setattr(struct task_struct *p, const struct sched_attr *attr) | ||
3519 | { | ||
3520 | return __sched_setscheduler(p, attr, true); | ||
3521 | } | ||
3522 | EXPORT_SYMBOL_GPL(sched_setattr); | ||
3523 | |||
3193 | /** | 3524 | /** |
3194 | * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. | 3525 | * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. |
3195 | * @p: the task in question. | 3526 | * @p: the task in question. |
@@ -3206,7 +3537,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler); | |||
3206 | int sched_setscheduler_nocheck(struct task_struct *p, int policy, | 3537 | int sched_setscheduler_nocheck(struct task_struct *p, int policy, |
3207 | const struct sched_param *param) | 3538 | const struct sched_param *param) |
3208 | { | 3539 | { |
3209 | return __sched_setscheduler(p, policy, param, false); | 3540 | return _sched_setscheduler(p, policy, param, false); |
3210 | } | 3541 | } |
3211 | 3542 | ||
3212 | static int | 3543 | static int |
@@ -3231,6 +3562,79 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) | |||
3231 | return retval; | 3562 | return retval; |
3232 | } | 3563 | } |
3233 | 3564 | ||
3565 | /* | ||
3566 | * Mimics kernel/events/core.c perf_copy_attr(). | ||
3567 | */ | ||
3568 | static int sched_copy_attr(struct sched_attr __user *uattr, | ||
3569 | struct sched_attr *attr) | ||
3570 | { | ||
3571 | u32 size; | ||
3572 | int ret; | ||
3573 | |||
3574 | if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0)) | ||
3575 | return -EFAULT; | ||
3576 | |||
3577 | /* | ||
3578 | * zero the full structure, so that a short copy will be nice. | ||
3579 | */ | ||
3580 | memset(attr, 0, sizeof(*attr)); | ||
3581 | |||
3582 | ret = get_user(size, &uattr->size); | ||
3583 | if (ret) | ||
3584 | return ret; | ||
3585 | |||
3586 | if (size > PAGE_SIZE) /* silly large */ | ||
3587 | goto err_size; | ||
3588 | |||
3589 | if (!size) /* abi compat */ | ||
3590 | size = SCHED_ATTR_SIZE_VER0; | ||
3591 | |||
3592 | if (size < SCHED_ATTR_SIZE_VER0) | ||
3593 | goto err_size; | ||
3594 | |||
3595 | /* | ||
3596 | * If we're handed a bigger struct than we know of, | ||
3597 | * ensure all the unknown bits are 0 - i.e. new | ||
3598 | * user-space does not rely on any kernel feature | ||
3599 | * extensions we don't know about yet. | ||
3600 | */ | ||
3601 | if (size > sizeof(*attr)) { | ||
3602 | unsigned char __user *addr; | ||
3603 | unsigned char __user *end; | ||
3604 | unsigned char val; | ||
3605 | |||
3606 | addr = (void __user *)uattr + sizeof(*attr); | ||
3607 | end = (void __user *)uattr + size; | ||
3608 | |||
3609 | for (; addr < end; addr++) { | ||
3610 | ret = get_user(val, addr); | ||
3611 | if (ret) | ||
3612 | return ret; | ||
3613 | if (val) | ||
3614 | goto err_size; | ||
3615 | } | ||
3616 | size = sizeof(*attr); | ||
3617 | } | ||
3618 | |||
3619 | ret = copy_from_user(attr, uattr, size); | ||
3620 | if (ret) | ||
3621 | return -EFAULT; | ||
3622 | |||
3623 | /* | ||
3624 | * XXX: do we want to be lenient like existing syscalls; or do we want | ||
3625 | * to be strict and return an error on out-of-bounds values? | ||
3626 | */ | ||
3627 | attr->sched_nice = clamp(attr->sched_nice, -20, 19); | ||
3628 | |||
3629 | out: | ||
3630 | return ret; | ||
3631 | |||
3632 | err_size: | ||
3633 | put_user(sizeof(*attr), &uattr->size); | ||
3634 | ret = -E2BIG; | ||
3635 | goto out; | ||
3636 | } | ||
3637 | |||
3234 | /** | 3638 | /** |
3235 | * sys_sched_setscheduler - set/change the scheduler policy and RT priority | 3639 | * sys_sched_setscheduler - set/change the scheduler policy and RT priority |
3236 | * @pid: the pid in question. | 3640 | * @pid: the pid in question. |
@@ -3262,6 +3666,34 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) | |||
3262 | } | 3666 | } |
3263 | 3667 | ||
3264 | /** | 3668 | /** |
3669 | * sys_sched_setattr - same as above, but with extended sched_attr | ||
3670 | * @pid: the pid in question. | ||
3671 | * @uattr: structure containing the extended parameters. | ||
3672 | */ | ||
3673 | SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, | ||
3674 | unsigned int, flags) | ||
3675 | { | ||
3676 | struct sched_attr attr; | ||
3677 | struct task_struct *p; | ||
3678 | int retval; | ||
3679 | |||
3680 | if (!uattr || pid < 0 || flags) | ||
3681 | return -EINVAL; | ||
3682 | |||
3683 | if (sched_copy_attr(uattr, &attr)) | ||
3684 | return -EFAULT; | ||
3685 | |||
3686 | rcu_read_lock(); | ||
3687 | retval = -ESRCH; | ||
3688 | p = find_process_by_pid(pid); | ||
3689 | if (p != NULL) | ||
3690 | retval = sched_setattr(p, &attr); | ||
3691 | rcu_read_unlock(); | ||
3692 | |||
3693 | return retval; | ||
3694 | } | ||
3695 | |||
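A minimal user-space sketch of driving the new syscall (illustration only, not part of the patch): the struct layout mirrors the kernel's struct sched_attr as added by this series, and the syscall number is the x86-64 one; both are assumptions to double-check against your own headers. With this series SCHED_DEADLINE also requires CAP_SYS_NICE, so run it as root.

	#define _GNU_SOURCE
	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/syscall.h>

	#ifndef __NR_sched_setattr
	#define __NR_sched_setattr	314	/* x86-64; assumption, check your arch */
	#endif
	#ifndef SCHED_DEADLINE
	#define SCHED_DEADLINE		6
	#endif

	/* mirrors the kernel's struct sched_attr introduced by this series */
	struct sched_attr_user {
		uint32_t size;
		uint32_t sched_policy;
		uint64_t sched_flags;
		int32_t  sched_nice;		/* SCHED_NORMAL, SCHED_BATCH */
		uint32_t sched_priority;	/* SCHED_FIFO, SCHED_RR */
		uint64_t sched_runtime;		/* SCHED_DEADLINE, nanoseconds */
		uint64_t sched_deadline;
		uint64_t sched_period;
	};

	int main(void)
	{
		struct sched_attr_user attr;

		memset(&attr, 0, sizeof(attr));
		attr.size	    = sizeof(attr);	/* lets sched_copy_attr() handle ABI growth */
		attr.sched_policy   = SCHED_DEADLINE;
		attr.sched_runtime  = 10ULL * 1000 * 1000;	/* 10 ms of budget ... */
		attr.sched_deadline = 30ULL * 1000 * 1000;	/* ... every 30 ms, deadline == period */
		attr.sched_period   = 30ULL * 1000 * 1000;

		/* pid 0 addresses the calling thread; flags must be 0 */
		if (syscall(__NR_sched_setattr, 0, &attr, 0)) {
			perror("sched_setattr");
			return 1;
		}
		printf("running as SCHED_DEADLINE\n");
		return 0;
	}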
3696 | /** | ||
3265 | * sys_sched_getscheduler - get the policy (scheduling class) of a thread | 3697 | * sys_sched_getscheduler - get the policy (scheduling class) of a thread |
3266 | * @pid: the pid in question. | 3698 | * @pid: the pid in question. |
3267 | * | 3699 | * |
@@ -3316,6 +3748,10 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) | |||
3316 | if (retval) | 3748 | if (retval) |
3317 | goto out_unlock; | 3749 | goto out_unlock; |
3318 | 3750 | ||
3751 | if (task_has_dl_policy(p)) { | ||
3752 | retval = -EINVAL; | ||
3753 | goto out_unlock; | ||
3754 | } | ||
3319 | lp.sched_priority = p->rt_priority; | 3755 | lp.sched_priority = p->rt_priority; |
3320 | rcu_read_unlock(); | 3756 | rcu_read_unlock(); |
3321 | 3757 | ||
@@ -3331,6 +3767,96 @@ out_unlock: | |||
3331 | return retval; | 3767 | return retval; |
3332 | } | 3768 | } |
3333 | 3769 | ||
3770 | static int sched_read_attr(struct sched_attr __user *uattr, | ||
3771 | struct sched_attr *attr, | ||
3772 | unsigned int usize) | ||
3773 | { | ||
3774 | int ret; | ||
3775 | |||
3776 | if (!access_ok(VERIFY_WRITE, uattr, usize)) | ||
3777 | return -EFAULT; | ||
3778 | |||
3779 | /* | ||
3780 | * If we're handed a smaller struct than we know of, | ||
3781 | * ensure all the unknown bits are 0 - i.e. old | ||
3782 | * user-space does not get incomplete information. | ||
3783 | */ | ||
3784 | if (usize < sizeof(*attr)) { | ||
3785 | unsigned char *addr; | ||
3786 | unsigned char *end; | ||
3787 | |||
3788 | addr = (void *)attr + usize; | ||
3789 | end = (void *)attr + sizeof(*attr); | ||
3790 | |||
3791 | for (; addr < end; addr++) { | ||
3792 | if (*addr) | ||
3793 | goto err_size; | ||
3794 | } | ||
3795 | |||
3796 | attr->size = usize; | ||
3797 | } | ||
3798 | |||
3799 | ret = copy_to_user(uattr, attr, attr->size); | ||
3800 | if (ret) | ||
3801 | return -EFAULT; | ||
3802 | |||
3803 | out: | ||
3804 | return ret; | ||
3805 | |||
3806 | err_size: | ||
3807 | ret = -E2BIG; | ||
3808 | goto out; | ||
3809 | } | ||
3810 | |||
3811 | /** | ||
3812 | * sys_sched_getattr - similar to sched_getparam, but with sched_attr | ||
3813 | * @pid: the pid in question. | ||
3814 | * @uattr: structure containing the extended parameters. | ||
3815 | * @size: sizeof(attr) for fwd/bwd comp. | ||
3816 | */ | ||
3817 | SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, | ||
3818 | unsigned int, size, unsigned int, flags) | ||
3819 | { | ||
3820 | struct sched_attr attr = { | ||
3821 | .size = sizeof(struct sched_attr), | ||
3822 | }; | ||
3823 | struct task_struct *p; | ||
3824 | int retval; | ||
3825 | |||
3826 | if (!uattr || pid < 0 || size > PAGE_SIZE || | ||
3827 | size < SCHED_ATTR_SIZE_VER0 || flags) | ||
3828 | return -EINVAL; | ||
3829 | |||
3830 | rcu_read_lock(); | ||
3831 | p = find_process_by_pid(pid); | ||
3832 | retval = -ESRCH; | ||
3833 | if (!p) | ||
3834 | goto out_unlock; | ||
3835 | |||
3836 | retval = security_task_getscheduler(p); | ||
3837 | if (retval) | ||
3838 | goto out_unlock; | ||
3839 | |||
3840 | attr.sched_policy = p->policy; | ||
3841 | if (p->sched_reset_on_fork) | ||
3842 | attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; | ||
3843 | if (task_has_dl_policy(p)) | ||
3844 | __getparam_dl(p, &attr); | ||
3845 | else if (task_has_rt_policy(p)) | ||
3846 | attr.sched_priority = p->rt_priority; | ||
3847 | else | ||
3848 | attr.sched_nice = TASK_NICE(p); | ||
3849 | |||
3850 | rcu_read_unlock(); | ||
3851 | |||
3852 | retval = sched_read_attr(uattr, &attr, size); | ||
3853 | return retval; | ||
3854 | |||
3855 | out_unlock: | ||
3856 | rcu_read_unlock(); | ||
3857 | return retval; | ||
3858 | } | ||
3859 | |||
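Continuing the user-space sketch above (same struct sched_attr_user and the same caveats; __NR_sched_getattr is assumed to be 315 on x86-64), the parameters can be read back through the companion syscall:

	#ifndef __NR_sched_getattr
	#define __NR_sched_getattr	315	/* x86-64; assumption, check your arch */
	#endif

	static void show_my_attr(void)
	{
		struct sched_attr_user attr;

		/* pass our struct size so the kernel knows how much it may copy back */
		if (syscall(__NR_sched_getattr, 0, &attr, sizeof(attr), 0)) {
			perror("sched_getattr");
			return;
		}
		printf("policy=%u runtime=%llu ns deadline=%llu ns period=%llu ns\n",
		       attr.sched_policy,
		       (unsigned long long)attr.sched_runtime,
		       (unsigned long long)attr.sched_deadline,
		       (unsigned long long)attr.sched_period);
	}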
3334 | long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) | 3860 | long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) |
3335 | { | 3861 | { |
3336 | cpumask_var_t cpus_allowed, new_mask; | 3862 | cpumask_var_t cpus_allowed, new_mask; |
@@ -3375,8 +3901,26 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) | |||
3375 | if (retval) | 3901 | if (retval) |
3376 | goto out_unlock; | 3902 | goto out_unlock; |
3377 | 3903 | ||
3904 | |||
3378 | cpuset_cpus_allowed(p, cpus_allowed); | 3905 | cpuset_cpus_allowed(p, cpus_allowed); |
3379 | cpumask_and(new_mask, in_mask, cpus_allowed); | 3906 | cpumask_and(new_mask, in_mask, cpus_allowed); |
3907 | |||
3908 | /* | ||
3909 | * Since bandwidth control happens on a root_domain basis, | ||
3910 | * if the admission test is enabled, we only admit -deadline | ||
3911 | * tasks allowed to run on all the CPUs in the task's | ||
3912 | * root_domain. | ||
3913 | */ | ||
3914 | #ifdef CONFIG_SMP | ||
3915 | if (task_has_dl_policy(p)) { | ||
3916 | const struct cpumask *span = task_rq(p)->rd->span; | ||
3917 | |||
3918 | if (dl_bandwidth_enabled() && !cpumask_subset(span, new_mask)) { | ||
3919 | retval = -EBUSY; | ||
3920 | goto out_unlock; | ||
3921 | } | ||
3922 | } | ||
3923 | #endif | ||
3380 | again: | 3924 | again: |
3381 | retval = set_cpus_allowed_ptr(p, new_mask); | 3925 | retval = set_cpus_allowed_ptr(p, new_mask); |
3382 | 3926 | ||
@@ -3653,7 +4197,7 @@ again: | |||
3653 | } | 4197 | } |
3654 | 4198 | ||
3655 | double_rq_lock(rq, p_rq); | 4199 | double_rq_lock(rq, p_rq); |
3656 | while (task_rq(p) != p_rq) { | 4200 | if (task_rq(p) != p_rq) { |
3657 | double_rq_unlock(rq, p_rq); | 4201 | double_rq_unlock(rq, p_rq); |
3658 | goto again; | 4202 | goto again; |
3659 | } | 4203 | } |
@@ -3742,6 +4286,7 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy) | |||
3742 | case SCHED_RR: | 4286 | case SCHED_RR: |
3743 | ret = MAX_USER_RT_PRIO-1; | 4287 | ret = MAX_USER_RT_PRIO-1; |
3744 | break; | 4288 | break; |
4289 | case SCHED_DEADLINE: | ||
3745 | case SCHED_NORMAL: | 4290 | case SCHED_NORMAL: |
3746 | case SCHED_BATCH: | 4291 | case SCHED_BATCH: |
3747 | case SCHED_IDLE: | 4292 | case SCHED_IDLE: |
@@ -3768,6 +4313,7 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy) | |||
3768 | case SCHED_RR: | 4313 | case SCHED_RR: |
3769 | ret = 1; | 4314 | ret = 1; |
3770 | break; | 4315 | break; |
4316 | case SCHED_DEADLINE: | ||
3771 | case SCHED_NORMAL: | 4317 | case SCHED_NORMAL: |
3772 | case SCHED_BATCH: | 4318 | case SCHED_BATCH: |
3773 | case SCHED_IDLE: | 4319 | case SCHED_IDLE: |
@@ -3811,7 +4357,9 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, | |||
3811 | goto out_unlock; | 4357 | goto out_unlock; |
3812 | 4358 | ||
3813 | rq = task_rq_lock(p, &flags); | 4359 | rq = task_rq_lock(p, &flags); |
3814 | time_slice = p->sched_class->get_rr_interval(rq, p); | 4360 | time_slice = 0; |
4361 | if (p->sched_class->get_rr_interval) | ||
4362 | time_slice = p->sched_class->get_rr_interval(rq, p); | ||
3815 | task_rq_unlock(rq, p, &flags); | 4363 | task_rq_unlock(rq, p, &flags); |
3816 | 4364 | ||
3817 | rcu_read_unlock(); | 4365 | rcu_read_unlock(); |
@@ -4090,6 +4638,7 @@ int migrate_task_to(struct task_struct *p, int target_cpu) | |||
4090 | 4638 | ||
4091 | /* TODO: This is not properly updating schedstats */ | 4639 | /* TODO: This is not properly updating schedstats */ |
4092 | 4640 | ||
4641 | trace_sched_move_numa(p, curr_cpu, target_cpu); | ||
4093 | return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg); | 4642 | return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg); |
4094 | } | 4643 | } |
4095 | 4644 | ||
@@ -4514,13 +5063,31 @@ static int sched_cpu_active(struct notifier_block *nfb, | |||
4514 | static int sched_cpu_inactive(struct notifier_block *nfb, | 5063 | static int sched_cpu_inactive(struct notifier_block *nfb, |
4515 | unsigned long action, void *hcpu) | 5064 | unsigned long action, void *hcpu) |
4516 | { | 5065 | { |
5066 | unsigned long flags; | ||
5067 | long cpu = (long)hcpu; | ||
5068 | |||
4517 | switch (action & ~CPU_TASKS_FROZEN) { | 5069 | switch (action & ~CPU_TASKS_FROZEN) { |
4518 | case CPU_DOWN_PREPARE: | 5070 | case CPU_DOWN_PREPARE: |
4519 | set_cpu_active((long)hcpu, false); | 5071 | set_cpu_active(cpu, false); |
5072 | |||
5073 | /* explicitly allow suspend */ | ||
5074 | if (!(action & CPU_TASKS_FROZEN)) { | ||
5075 | struct dl_bw *dl_b = dl_bw_of(cpu); | ||
5076 | bool overflow; | ||
5077 | int cpus; | ||
5078 | |||
5079 | raw_spin_lock_irqsave(&dl_b->lock, flags); | ||
5080 | cpus = dl_bw_cpus(cpu); | ||
5081 | overflow = __dl_overflow(dl_b, cpus, 0, 0); | ||
5082 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); | ||
5083 | |||
5084 | if (overflow) | ||
5085 | return notifier_from_errno(-EBUSY); | ||
5086 | } | ||
4520 | return NOTIFY_OK; | 5087 | return NOTIFY_OK; |
4521 | default: | ||
4522 | return NOTIFY_DONE; | ||
4523 | } | 5088 | } |
5089 | |||
5090 | return NOTIFY_DONE; | ||
4524 | } | 5091 | } |
4525 | 5092 | ||
4526 | static int __init migration_init(void) | 5093 | static int __init migration_init(void) |
@@ -4739,6 +5306,8 @@ static void free_rootdomain(struct rcu_head *rcu) | |||
4739 | struct root_domain *rd = container_of(rcu, struct root_domain, rcu); | 5306 | struct root_domain *rd = container_of(rcu, struct root_domain, rcu); |
4740 | 5307 | ||
4741 | cpupri_cleanup(&rd->cpupri); | 5308 | cpupri_cleanup(&rd->cpupri); |
5309 | cpudl_cleanup(&rd->cpudl); | ||
5310 | free_cpumask_var(rd->dlo_mask); | ||
4742 | free_cpumask_var(rd->rto_mask); | 5311 | free_cpumask_var(rd->rto_mask); |
4743 | free_cpumask_var(rd->online); | 5312 | free_cpumask_var(rd->online); |
4744 | free_cpumask_var(rd->span); | 5313 | free_cpumask_var(rd->span); |
@@ -4790,8 +5359,14 @@ static int init_rootdomain(struct root_domain *rd) | |||
4790 | goto out; | 5359 | goto out; |
4791 | if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) | 5360 | if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) |
4792 | goto free_span; | 5361 | goto free_span; |
4793 | if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) | 5362 | if (!alloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL)) |
4794 | goto free_online; | 5363 | goto free_online; |
5364 | if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) | ||
5365 | goto free_dlo_mask; | ||
5366 | |||
5367 | init_dl_bw(&rd->dl_bw); | ||
5368 | if (cpudl_init(&rd->cpudl) != 0) | ||
5369 | goto free_dlo_mask; | ||
4795 | 5370 | ||
4796 | if (cpupri_init(&rd->cpupri) != 0) | 5371 | if (cpupri_init(&rd->cpupri) != 0) |
4797 | goto free_rto_mask; | 5372 | goto free_rto_mask; |
@@ -4799,6 +5374,8 @@ static int init_rootdomain(struct root_domain *rd) | |||
4799 | 5374 | ||
4800 | free_rto_mask: | 5375 | free_rto_mask: |
4801 | free_cpumask_var(rd->rto_mask); | 5376 | free_cpumask_var(rd->rto_mask); |
5377 | free_dlo_mask: | ||
5378 | free_cpumask_var(rd->dlo_mask); | ||
4802 | free_online: | 5379 | free_online: |
4803 | free_cpumask_var(rd->online); | 5380 | free_cpumask_var(rd->online); |
4804 | free_span: | 5381 | free_span: |
@@ -6150,6 +6727,7 @@ void __init sched_init_smp(void) | |||
6150 | free_cpumask_var(non_isolated_cpus); | 6727 | free_cpumask_var(non_isolated_cpus); |
6151 | 6728 | ||
6152 | init_sched_rt_class(); | 6729 | init_sched_rt_class(); |
6730 | init_sched_dl_class(); | ||
6153 | } | 6731 | } |
6154 | #else | 6732 | #else |
6155 | void __init sched_init_smp(void) | 6733 | void __init sched_init_smp(void) |
@@ -6219,13 +6797,15 @@ void __init sched_init(void) | |||
6219 | #endif /* CONFIG_CPUMASK_OFFSTACK */ | 6797 | #endif /* CONFIG_CPUMASK_OFFSTACK */ |
6220 | } | 6798 | } |
6221 | 6799 | ||
6800 | init_rt_bandwidth(&def_rt_bandwidth, | ||
6801 | global_rt_period(), global_rt_runtime()); | ||
6802 | init_dl_bandwidth(&def_dl_bandwidth, | ||
6803 | global_rt_period(), global_rt_runtime()); | ||
6804 | |||
6222 | #ifdef CONFIG_SMP | 6805 | #ifdef CONFIG_SMP |
6223 | init_defrootdomain(); | 6806 | init_defrootdomain(); |
6224 | #endif | 6807 | #endif |
6225 | 6808 | ||
6226 | init_rt_bandwidth(&def_rt_bandwidth, | ||
6227 | global_rt_period(), global_rt_runtime()); | ||
6228 | |||
6229 | #ifdef CONFIG_RT_GROUP_SCHED | 6809 | #ifdef CONFIG_RT_GROUP_SCHED |
6230 | init_rt_bandwidth(&root_task_group.rt_bandwidth, | 6810 | init_rt_bandwidth(&root_task_group.rt_bandwidth, |
6231 | global_rt_period(), global_rt_runtime()); | 6811 | global_rt_period(), global_rt_runtime()); |
@@ -6249,6 +6829,7 @@ void __init sched_init(void) | |||
6249 | rq->calc_load_update = jiffies + LOAD_FREQ; | 6829 | rq->calc_load_update = jiffies + LOAD_FREQ; |
6250 | init_cfs_rq(&rq->cfs); | 6830 | init_cfs_rq(&rq->cfs); |
6251 | init_rt_rq(&rq->rt, rq); | 6831 | init_rt_rq(&rq->rt, rq); |
6832 | init_dl_rq(&rq->dl, rq); | ||
6252 | #ifdef CONFIG_FAIR_GROUP_SCHED | 6833 | #ifdef CONFIG_FAIR_GROUP_SCHED |
6253 | root_task_group.shares = ROOT_TASK_GROUP_LOAD; | 6834 | root_task_group.shares = ROOT_TASK_GROUP_LOAD; |
6254 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); | 6835 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); |
@@ -6320,10 +6901,6 @@ void __init sched_init(void) | |||
6320 | INIT_HLIST_HEAD(&init_task.preempt_notifiers); | 6901 | INIT_HLIST_HEAD(&init_task.preempt_notifiers); |
6321 | #endif | 6902 | #endif |
6322 | 6903 | ||
6323 | #ifdef CONFIG_RT_MUTEXES | ||
6324 | plist_head_init(&init_task.pi_waiters); | ||
6325 | #endif | ||
6326 | |||
6327 | /* | 6904 | /* |
6328 | * The boot idle thread does lazy MMU switching as well: | 6905 | * The boot idle thread does lazy MMU switching as well: |
6329 | */ | 6906 | */ |
@@ -6397,13 +6974,16 @@ EXPORT_SYMBOL(__might_sleep); | |||
6397 | static void normalize_task(struct rq *rq, struct task_struct *p) | 6974 | static void normalize_task(struct rq *rq, struct task_struct *p) |
6398 | { | 6975 | { |
6399 | const struct sched_class *prev_class = p->sched_class; | 6976 | const struct sched_class *prev_class = p->sched_class; |
6977 | struct sched_attr attr = { | ||
6978 | .sched_policy = SCHED_NORMAL, | ||
6979 | }; | ||
6400 | int old_prio = p->prio; | 6980 | int old_prio = p->prio; |
6401 | int on_rq; | 6981 | int on_rq; |
6402 | 6982 | ||
6403 | on_rq = p->on_rq; | 6983 | on_rq = p->on_rq; |
6404 | if (on_rq) | 6984 | if (on_rq) |
6405 | dequeue_task(rq, p, 0); | 6985 | dequeue_task(rq, p, 0); |
6406 | __setscheduler(rq, p, SCHED_NORMAL, 0); | 6986 | __setscheduler(rq, p, &attr); |
6407 | if (on_rq) { | 6987 | if (on_rq) { |
6408 | enqueue_task(rq, p, 0); | 6988 | enqueue_task(rq, p, 0); |
6409 | resched_task(rq->curr); | 6989 | resched_task(rq->curr); |
@@ -6433,7 +7013,7 @@ void normalize_rt_tasks(void) | |||
6433 | p->se.statistics.block_start = 0; | 7013 | p->se.statistics.block_start = 0; |
6434 | #endif | 7014 | #endif |
6435 | 7015 | ||
6436 | if (!rt_task(p)) { | 7016 | if (!dl_task(p) && !rt_task(p)) { |
6437 | /* | 7017 | /* |
6438 | * Renice negative nice level userspace | 7018 | * Renice negative nice level userspace |
6439 | * tasks back to 0: | 7019 | * tasks back to 0: |
@@ -6628,16 +7208,6 @@ void sched_move_task(struct task_struct *tsk) | |||
6628 | } | 7208 | } |
6629 | #endif /* CONFIG_CGROUP_SCHED */ | 7209 | #endif /* CONFIG_CGROUP_SCHED */ |
6630 | 7210 | ||
6631 | #if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH) | ||
6632 | static unsigned long to_ratio(u64 period, u64 runtime) | ||
6633 | { | ||
6634 | if (runtime == RUNTIME_INF) | ||
6635 | return 1ULL << 20; | ||
6636 | |||
6637 | return div64_u64(runtime << 20, period); | ||
6638 | } | ||
6639 | #endif | ||
6640 | |||
6641 | #ifdef CONFIG_RT_GROUP_SCHED | 7211 | #ifdef CONFIG_RT_GROUP_SCHED |
6642 | /* | 7212 | /* |
6643 | * Ensure that the real time constraints are schedulable. | 7213 | * Ensure that the real time constraints are schedulable. |
@@ -6811,24 +7381,13 @@ static long sched_group_rt_period(struct task_group *tg) | |||
6811 | do_div(rt_period_us, NSEC_PER_USEC); | 7381 | do_div(rt_period_us, NSEC_PER_USEC); |
6812 | return rt_period_us; | 7382 | return rt_period_us; |
6813 | } | 7383 | } |
7384 | #endif /* CONFIG_RT_GROUP_SCHED */ | ||
6814 | 7385 | ||
7386 | #ifdef CONFIG_RT_GROUP_SCHED | ||
6815 | static int sched_rt_global_constraints(void) | 7387 | static int sched_rt_global_constraints(void) |
6816 | { | 7388 | { |
6817 | u64 runtime, period; | ||
6818 | int ret = 0; | 7389 | int ret = 0; |
6819 | 7390 | ||
6820 | if (sysctl_sched_rt_period <= 0) | ||
6821 | return -EINVAL; | ||
6822 | |||
6823 | runtime = global_rt_runtime(); | ||
6824 | period = global_rt_period(); | ||
6825 | |||
6826 | /* | ||
6827 | * Sanity check on the sysctl variables. | ||
6828 | */ | ||
6829 | if (runtime > period && runtime != RUNTIME_INF) | ||
6830 | return -EINVAL; | ||
6831 | |||
6832 | mutex_lock(&rt_constraints_mutex); | 7391 | mutex_lock(&rt_constraints_mutex); |
6833 | read_lock(&tasklist_lock); | 7392 | read_lock(&tasklist_lock); |
6834 | ret = __rt_schedulable(NULL, 0, 0); | 7393 | ret = __rt_schedulable(NULL, 0, 0); |
@@ -6851,17 +7410,7 @@ static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) | |||
6851 | static int sched_rt_global_constraints(void) | 7410 | static int sched_rt_global_constraints(void) |
6852 | { | 7411 | { |
6853 | unsigned long flags; | 7412 | unsigned long flags; |
6854 | int i; | 7413 | int i, ret = 0; |
6855 | |||
6856 | if (sysctl_sched_rt_period <= 0) | ||
6857 | return -EINVAL; | ||
6858 | |||
6859 | /* | ||
6860 | * There's always some RT tasks in the root group | ||
6861 | * -- migration, kstopmachine etc.. | ||
6862 | */ | ||
6863 | if (sysctl_sched_rt_runtime == 0) | ||
6864 | return -EBUSY; | ||
6865 | 7414 | ||
6866 | raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); | 7415 | raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); |
6867 | for_each_possible_cpu(i) { | 7416 | for_each_possible_cpu(i) { |
@@ -6873,36 +7422,91 @@ static int sched_rt_global_constraints(void) | |||
6873 | } | 7422 | } |
6874 | raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); | 7423 | raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); |
6875 | 7424 | ||
6876 | return 0; | 7425 | return ret; |
6877 | } | 7426 | } |
6878 | #endif /* CONFIG_RT_GROUP_SCHED */ | 7427 | #endif /* CONFIG_RT_GROUP_SCHED */ |
6879 | 7428 | ||
6880 | int sched_rr_handler(struct ctl_table *table, int write, | 7429 | static int sched_dl_global_constraints(void) |
6881 | void __user *buffer, size_t *lenp, | ||
6882 | loff_t *ppos) | ||
6883 | { | 7430 | { |
6884 | int ret; | 7431 | u64 runtime = global_rt_runtime(); |
6885 | static DEFINE_MUTEX(mutex); | 7432 | u64 period = global_rt_period(); |
7433 | u64 new_bw = to_ratio(period, runtime); | ||
7434 | int cpu, ret = 0; | ||
7435 | unsigned long flags; | ||
6886 | 7436 | ||
6887 | mutex_lock(&mutex); | 7437 | /* |
6888 | ret = proc_dointvec(table, write, buffer, lenp, ppos); | 7438 | * Here we want to check the bandwidth not being set to some |
6889 | /* make sure that internally we keep jiffies */ | 7439 | * value smaller than the currently allocated bandwidth in |
6890 | /* also, writing zero resets timeslice to default */ | 7440 | * any of the root_domains. |
6891 | if (!ret && write) { | 7441 | * |
6892 | sched_rr_timeslice = sched_rr_timeslice <= 0 ? | 7442 | * FIXME: Cycling on all the CPUs is overdoing, but simpler than |
6893 | RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice); | 7443 | * cycling on root_domains... Discussion on different/better |
7444 | * solutions is welcome! | ||
7445 | */ | ||
7446 | for_each_possible_cpu(cpu) { | ||
7447 | struct dl_bw *dl_b = dl_bw_of(cpu); | ||
7448 | |||
7449 | raw_spin_lock_irqsave(&dl_b->lock, flags); | ||
7450 | if (new_bw < dl_b->total_bw) | ||
7451 | ret = -EBUSY; | ||
7452 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); | ||
7453 | |||
7454 | if (ret) | ||
7455 | break; | ||
6894 | } | 7456 | } |
6895 | mutex_unlock(&mutex); | 7457 | |
6896 | return ret; | 7458 | return ret; |
6897 | } | 7459 | } |
6898 | 7460 | ||
7461 | static void sched_dl_do_global(void) | ||
7462 | { | ||
7463 | u64 new_bw = -1; | ||
7464 | int cpu; | ||
7465 | unsigned long flags; | ||
7466 | |||
7467 | def_dl_bandwidth.dl_period = global_rt_period(); | ||
7468 | def_dl_bandwidth.dl_runtime = global_rt_runtime(); | ||
7469 | |||
7470 | if (global_rt_runtime() != RUNTIME_INF) | ||
7471 | new_bw = to_ratio(global_rt_period(), global_rt_runtime()); | ||
7472 | |||
7473 | /* | ||
7474 | * FIXME: As above... | ||
7475 | */ | ||
7476 | for_each_possible_cpu(cpu) { | ||
7477 | struct dl_bw *dl_b = dl_bw_of(cpu); | ||
7478 | |||
7479 | raw_spin_lock_irqsave(&dl_b->lock, flags); | ||
7480 | dl_b->bw = new_bw; | ||
7481 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); | ||
7482 | } | ||
7483 | } | ||
7484 | |||
7485 | static int sched_rt_global_validate(void) | ||
7486 | { | ||
7487 | if (sysctl_sched_rt_period <= 0) | ||
7488 | return -EINVAL; | ||
7489 | |||
7490 | if ((sysctl_sched_rt_runtime != RUNTIME_INF) && | ||
7491 | (sysctl_sched_rt_runtime > sysctl_sched_rt_period)) | ||
7492 | return -EINVAL; | ||
7493 | |||
7494 | return 0; | ||
7495 | } | ||
7496 | |||
7497 | static void sched_rt_do_global(void) | ||
7498 | { | ||
7499 | def_rt_bandwidth.rt_runtime = global_rt_runtime(); | ||
7500 | def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period()); | ||
7501 | } | ||
7502 | |||
6899 | int sched_rt_handler(struct ctl_table *table, int write, | 7503 | int sched_rt_handler(struct ctl_table *table, int write, |
6900 | void __user *buffer, size_t *lenp, | 7504 | void __user *buffer, size_t *lenp, |
6901 | loff_t *ppos) | 7505 | loff_t *ppos) |
6902 | { | 7506 | { |
6903 | int ret; | ||
6904 | int old_period, old_runtime; | 7507 | int old_period, old_runtime; |
6905 | static DEFINE_MUTEX(mutex); | 7508 | static DEFINE_MUTEX(mutex); |
7509 | int ret; | ||
6906 | 7510 | ||
6907 | mutex_lock(&mutex); | 7511 | mutex_lock(&mutex); |
6908 | old_period = sysctl_sched_rt_period; | 7512 | old_period = sysctl_sched_rt_period; |
@@ -6911,21 +7515,50 @@ int sched_rt_handler(struct ctl_table *table, int write, | |||
6911 | ret = proc_dointvec(table, write, buffer, lenp, ppos); | 7515 | ret = proc_dointvec(table, write, buffer, lenp, ppos); |
6912 | 7516 | ||
6913 | if (!ret && write) { | 7517 | if (!ret && write) { |
7518 | ret = sched_rt_global_validate(); | ||
7519 | if (ret) | ||
7520 | goto undo; | ||
7521 | |||
6914 | ret = sched_rt_global_constraints(); | 7522 | ret = sched_rt_global_constraints(); |
6915 | if (ret) { | 7523 | if (ret) |
6916 | sysctl_sched_rt_period = old_period; | 7524 | goto undo; |
6917 | sysctl_sched_rt_runtime = old_runtime; | 7525 | |
6918 | } else { | 7526 | ret = sched_dl_global_constraints(); |
6919 | def_rt_bandwidth.rt_runtime = global_rt_runtime(); | 7527 | if (ret) |
6920 | def_rt_bandwidth.rt_period = | 7528 | goto undo; |
6921 | ns_to_ktime(global_rt_period()); | 7529 | |
6922 | } | 7530 | sched_rt_do_global(); |
7531 | sched_dl_do_global(); | ||
7532 | } | ||
7533 | if (0) { | ||
7534 | undo: | ||
7535 | sysctl_sched_rt_period = old_period; | ||
7536 | sysctl_sched_rt_runtime = old_runtime; | ||
6923 | } | 7537 | } |
6924 | mutex_unlock(&mutex); | 7538 | mutex_unlock(&mutex); |
6925 | 7539 | ||
6926 | return ret; | 7540 | return ret; |
6927 | } | 7541 | } |
6928 | 7542 | ||
7543 | int sched_rr_handler(struct ctl_table *table, int write, | ||
7544 | void __user *buffer, size_t *lenp, | ||
7545 | loff_t *ppos) | ||
7546 | { | ||
7547 | int ret; | ||
7548 | static DEFINE_MUTEX(mutex); | ||
7549 | |||
7550 | mutex_lock(&mutex); | ||
7551 | ret = proc_dointvec(table, write, buffer, lenp, ppos); | ||
7552 | /* make sure that internally we keep jiffies */ | ||
7553 | /* also, writing zero resets timeslice to default */ | ||
7554 | if (!ret && write) { | ||
7555 | sched_rr_timeslice = sched_rr_timeslice <= 0 ? | ||
7556 | RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice); | ||
7557 | } | ||
7558 | mutex_unlock(&mutex); | ||
7559 | return ret; | ||
7560 | } | ||
7561 | |||
6929 | #ifdef CONFIG_CGROUP_SCHED | 7562 | #ifdef CONFIG_CGROUP_SCHED |
6930 | 7563 | ||
6931 | static inline struct task_group *css_tg(struct cgroup_subsys_state *css) | 7564 | static inline struct task_group *css_tg(struct cgroup_subsys_state *css) |
@@ -7258,15 +7891,14 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) | |||
7258 | return ret; | 7891 | return ret; |
7259 | } | 7892 | } |
7260 | 7893 | ||
7261 | static int cpu_stats_show(struct cgroup_subsys_state *css, struct cftype *cft, | 7894 | static int cpu_stats_show(struct seq_file *sf, void *v) |
7262 | struct cgroup_map_cb *cb) | ||
7263 | { | 7895 | { |
7264 | struct task_group *tg = css_tg(css); | 7896 | struct task_group *tg = css_tg(seq_css(sf)); |
7265 | struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; | 7897 | struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; |
7266 | 7898 | ||
7267 | cb->fill(cb, "nr_periods", cfs_b->nr_periods); | 7899 | seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods); |
7268 | cb->fill(cb, "nr_throttled", cfs_b->nr_throttled); | 7900 | seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled); |
7269 | cb->fill(cb, "throttled_time", cfs_b->throttled_time); | 7901 | seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time); |
7270 | 7902 | ||
7271 | return 0; | 7903 | return 0; |
7272 | } | 7904 | } |
@@ -7320,7 +7952,7 @@ static struct cftype cpu_files[] = { | |||
7320 | }, | 7952 | }, |
7321 | { | 7953 | { |
7322 | .name = "stat", | 7954 | .name = "stat", |
7323 | .read_map = cpu_stats_show, | 7955 | .seq_show = cpu_stats_show, |
7324 | }, | 7956 | }, |
7325 | #endif | 7957 | #endif |
7326 | #ifdef CONFIG_RT_GROUP_SCHED | 7958 | #ifdef CONFIG_RT_GROUP_SCHED |
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index f64722ff0299..622e0818f905 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c | |||
@@ -163,10 +163,9 @@ out: | |||
163 | return err; | 163 | return err; |
164 | } | 164 | } |
165 | 165 | ||
166 | static int cpuacct_percpu_seq_read(struct cgroup_subsys_state *css, | 166 | static int cpuacct_percpu_seq_show(struct seq_file *m, void *V) |
167 | struct cftype *cft, struct seq_file *m) | ||
168 | { | 167 | { |
169 | struct cpuacct *ca = css_ca(css); | 168 | struct cpuacct *ca = css_ca(seq_css(m)); |
170 | u64 percpu; | 169 | u64 percpu; |
171 | int i; | 170 | int i; |
172 | 171 | ||
@@ -183,10 +182,9 @@ static const char * const cpuacct_stat_desc[] = { | |||
183 | [CPUACCT_STAT_SYSTEM] = "system", | 182 | [CPUACCT_STAT_SYSTEM] = "system", |
184 | }; | 183 | }; |
185 | 184 | ||
186 | static int cpuacct_stats_show(struct cgroup_subsys_state *css, | 185 | static int cpuacct_stats_show(struct seq_file *sf, void *v) |
187 | struct cftype *cft, struct cgroup_map_cb *cb) | ||
188 | { | 186 | { |
189 | struct cpuacct *ca = css_ca(css); | 187 | struct cpuacct *ca = css_ca(seq_css(sf)); |
190 | int cpu; | 188 | int cpu; |
191 | s64 val = 0; | 189 | s64 val = 0; |
192 | 190 | ||
@@ -196,7 +194,7 @@ static int cpuacct_stats_show(struct cgroup_subsys_state *css, | |||
196 | val += kcpustat->cpustat[CPUTIME_NICE]; | 194 | val += kcpustat->cpustat[CPUTIME_NICE]; |
197 | } | 195 | } |
198 | val = cputime64_to_clock_t(val); | 196 | val = cputime64_to_clock_t(val); |
199 | cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val); | 197 | seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_USER], val); |
200 | 198 | ||
201 | val = 0; | 199 | val = 0; |
202 | for_each_online_cpu(cpu) { | 200 | for_each_online_cpu(cpu) { |
@@ -207,7 +205,7 @@ static int cpuacct_stats_show(struct cgroup_subsys_state *css, | |||
207 | } | 205 | } |
208 | 206 | ||
209 | val = cputime64_to_clock_t(val); | 207 | val = cputime64_to_clock_t(val); |
210 | cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); | 208 | seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); |
211 | 209 | ||
212 | return 0; | 210 | return 0; |
213 | } | 211 | } |
@@ -220,11 +218,11 @@ static struct cftype files[] = { | |||
220 | }, | 218 | }, |
221 | { | 219 | { |
222 | .name = "usage_percpu", | 220 | .name = "usage_percpu", |
223 | .read_seq_string = cpuacct_percpu_seq_read, | 221 | .seq_show = cpuacct_percpu_seq_show, |
224 | }, | 222 | }, |
225 | { | 223 | { |
226 | .name = "stat", | 224 | .name = "stat", |
227 | .read_map = cpuacct_stats_show, | 225 | .seq_show = cpuacct_stats_show, |
228 | }, | 226 | }, |
229 | { } /* terminate */ | 227 | { } /* terminate */ |
230 | }; | 228 | }; |
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c new file mode 100644 index 000000000000..5b9bb42b2d47 --- /dev/null +++ b/kernel/sched/cpudeadline.c | |||
@@ -0,0 +1,216 @@ | |||
1 | /* | ||
2 | * kernel/sched/cpudl.c | ||
3 | * | ||
4 | * Global CPU deadline management | ||
5 | * | ||
6 | * Author: Juri Lelli <j.lelli@sssup.it> | ||
7 | * | ||
8 | * This program is free software; you can redistribute it and/or | ||
9 | * modify it under the terms of the GNU General Public License | ||
10 | * as published by the Free Software Foundation; version 2 | ||
11 | * of the License. | ||
12 | */ | ||
13 | |||
14 | #include <linux/gfp.h> | ||
15 | #include <linux/kernel.h> | ||
16 | #include "cpudeadline.h" | ||
17 | |||
18 | static inline int parent(int i) | ||
19 | { | ||
20 | return (i - 1) >> 1; | ||
21 | } | ||
22 | |||
23 | static inline int left_child(int i) | ||
24 | { | ||
25 | return (i << 1) + 1; | ||
26 | } | ||
27 | |||
28 | static inline int right_child(int i) | ||
29 | { | ||
30 | return (i << 1) + 2; | ||
31 | } | ||
32 | |||
33 | static inline int dl_time_before(u64 a, u64 b) | ||
34 | { | ||
35 | return (s64)(a - b) < 0; | ||
36 | } | ||
37 | |||
38 | static void cpudl_exchange(struct cpudl *cp, int a, int b) | ||
39 | { | ||
40 | int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu; | ||
41 | |||
42 | swap(cp->elements[a], cp->elements[b]); | ||
43 | swap(cp->cpu_to_idx[cpu_a], cp->cpu_to_idx[cpu_b]); | ||
44 | } | ||
45 | |||
46 | static void cpudl_heapify(struct cpudl *cp, int idx) | ||
47 | { | ||
48 | int l, r, largest; | ||
49 | |||
50 | /* adapted from lib/prio_heap.c */ | ||
51 | while (1) { | ||
52 | l = left_child(idx); | ||
53 | r = right_child(idx); | ||
54 | largest = idx; | ||
55 | |||
56 | if ((l < cp->size) && dl_time_before(cp->elements[idx].dl, | ||
57 | cp->elements[l].dl)) | ||
58 | largest = l; | ||
59 | if ((r < cp->size) && dl_time_before(cp->elements[largest].dl, | ||
60 | cp->elements[r].dl)) | ||
61 | largest = r; | ||
62 | if (largest == idx) | ||
63 | break; | ||
64 | |||
65 | /* Push idx down the heap one level and bump one up */ | ||
66 | cpudl_exchange(cp, largest, idx); | ||
67 | idx = largest; | ||
68 | } | ||
69 | } | ||
70 | |||
71 | static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl) | ||
72 | { | ||
73 | WARN_ON(idx == IDX_INVALID || !cpu_present(idx)); | ||
74 | |||
75 | if (dl_time_before(new_dl, cp->elements[idx].dl)) { | ||
76 | cp->elements[idx].dl = new_dl; | ||
77 | cpudl_heapify(cp, idx); | ||
78 | } else { | ||
79 | cp->elements[idx].dl = new_dl; | ||
80 | while (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl, | ||
81 | cp->elements[idx].dl)) { | ||
82 | cpudl_exchange(cp, idx, parent(idx)); | ||
83 | idx = parent(idx); | ||
84 | } | ||
85 | } | ||
86 | } | ||
87 | |||
88 | static inline int cpudl_maximum(struct cpudl *cp) | ||
89 | { | ||
90 | return cp->elements[0].cpu; | ||
91 | } | ||
92 | |||
93 | /* | ||
94 | * cpudl_find - find the best (later-dl) CPU in the system | ||
95 | * @cp: the cpudl max-heap context | ||
96 | * @p: the task | ||
97 | * @later_mask: a mask to fill in with the selected CPUs (or NULL) | ||
98 | * | ||
99 | * Returns: int - best CPU (heap maximum if suitable) | ||
100 | */ | ||
101 | int cpudl_find(struct cpudl *cp, struct task_struct *p, | ||
102 | struct cpumask *later_mask) | ||
103 | { | ||
104 | int best_cpu = -1; | ||
105 | const struct sched_dl_entity *dl_se = &p->dl; | ||
106 | |||
107 | if (later_mask && cpumask_and(later_mask, cp->free_cpus, | ||
108 | &p->cpus_allowed) && cpumask_and(later_mask, | ||
109 | later_mask, cpu_active_mask)) { | ||
110 | best_cpu = cpumask_any(later_mask); | ||
111 | goto out; | ||
112 | } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) && | ||
113 | dl_time_before(dl_se->deadline, cp->elements[0].dl)) { | ||
114 | best_cpu = cpudl_maximum(cp); | ||
115 | if (later_mask) | ||
116 | cpumask_set_cpu(best_cpu, later_mask); | ||
117 | } | ||
118 | |||
119 | out: | ||
120 | WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); | ||
121 | |||
122 | return best_cpu; | ||
123 | } | ||
124 | |||
125 | /* | ||
126 | * cpudl_set - update the cpudl max-heap | ||
127 | * @cp: the cpudl max-heap context | ||
128 | * @cpu: the target cpu | ||
129 | * @dl: the new earliest deadline for this cpu | ||
130 | * | ||
131 | * Notes: assumes cpu_rq(cpu)->lock is locked | ||
132 | * | ||
133 | * Returns: (void) | ||
134 | */ | ||
135 | void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) | ||
136 | { | ||
137 | int old_idx, new_cpu; | ||
138 | unsigned long flags; | ||
139 | |||
140 | WARN_ON(!cpu_present(cpu)); | ||
141 | |||
142 | raw_spin_lock_irqsave(&cp->lock, flags); | ||
143 | old_idx = cp->cpu_to_idx[cpu]; | ||
144 | if (!is_valid) { | ||
145 | /* remove item */ | ||
146 | if (old_idx == IDX_INVALID) { | ||
147 | /* | ||
148 | * Nothing to remove if old_idx was invalid. | ||
149 | * This could happen if rq_offline_dl() is | ||
150 | * called for a CPU without -dl tasks running. | ||
151 | */ | ||
152 | goto out; | ||
153 | } | ||
154 | new_cpu = cp->elements[cp->size - 1].cpu; | ||
155 | cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl; | ||
156 | cp->elements[old_idx].cpu = new_cpu; | ||
157 | cp->size--; | ||
158 | cp->cpu_to_idx[new_cpu] = old_idx; | ||
159 | cp->cpu_to_idx[cpu] = IDX_INVALID; | ||
160 | while (old_idx > 0 && dl_time_before( | ||
161 | cp->elements[parent(old_idx)].dl, | ||
162 | cp->elements[old_idx].dl)) { | ||
163 | cpudl_exchange(cp, old_idx, parent(old_idx)); | ||
164 | old_idx = parent(old_idx); | ||
165 | } | ||
166 | cpumask_set_cpu(cpu, cp->free_cpus); | ||
167 | cpudl_heapify(cp, old_idx); | ||
168 | |||
169 | goto out; | ||
170 | } | ||
171 | |||
172 | if (old_idx == IDX_INVALID) { | ||
173 | cp->size++; | ||
174 | cp->elements[cp->size - 1].dl = 0; | ||
175 | cp->elements[cp->size - 1].cpu = cpu; | ||
176 | cp->cpu_to_idx[cpu] = cp->size - 1; | ||
177 | cpudl_change_key(cp, cp->size - 1, dl); | ||
178 | cpumask_clear_cpu(cpu, cp->free_cpus); | ||
179 | } else { | ||
180 | cpudl_change_key(cp, old_idx, dl); | ||
181 | } | ||
182 | |||
183 | out: | ||
184 | raw_spin_unlock_irqrestore(&cp->lock, flags); | ||
185 | } | ||
186 | |||
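A minimal sketch (not in this patch) of how the -deadline class is expected to drive the heap; the helper name is hypothetical, the real updates are issued from kernel/sched/deadline.c when a runqueue's earliest deadline changes or its last -dl task goes away.

	/* hypothetical wrapper, for illustrating the is_valid convention only */
	static void example_publish_earliest_dl(struct rq *rq, u64 earliest_dl,
						int have_dl_tasks)
	{
		/*
		 * is_valid != 0 inserts or updates rq->cpu with its new
		 * earliest deadline; is_valid == 0 removes the CPU from the
		 * heap and marks it free again.
		 */
		cpudl_set(&rq->rd->cpudl, rq->cpu, earliest_dl, have_dl_tasks);
	}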
187 | /* | ||
188 | * cpudl_init - initialize the cpudl structure | ||
189 | * @cp: the cpudl max-heap context | ||
190 | */ | ||
191 | int cpudl_init(struct cpudl *cp) | ||
192 | { | ||
193 | int i; | ||
194 | |||
195 | memset(cp, 0, sizeof(*cp)); | ||
196 | raw_spin_lock_init(&cp->lock); | ||
197 | cp->size = 0; | ||
198 | for (i = 0; i < NR_CPUS; i++) | ||
199 | cp->cpu_to_idx[i] = IDX_INVALID; | ||
200 | if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) | ||
201 | return -ENOMEM; | ||
202 | cpumask_setall(cp->free_cpus); | ||
203 | |||
204 | return 0; | ||
205 | } | ||
206 | |||
207 | /* | ||
208 | * cpudl_cleanup - clean up the cpudl structure | ||
209 | * @cp: the cpudl max-heap context | ||
210 | */ | ||
211 | void cpudl_cleanup(struct cpudl *cp) | ||
212 | { | ||
213 | /* | ||
214 | * nothing to do for the moment | ||
215 | */ | ||
216 | } | ||
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h new file mode 100644 index 000000000000..a202789a412c --- /dev/null +++ b/kernel/sched/cpudeadline.h | |||
@@ -0,0 +1,33 @@ | |||
1 | #ifndef _LINUX_CPUDL_H | ||
2 | #define _LINUX_CPUDL_H | ||
3 | |||
4 | #include <linux/sched.h> | ||
5 | |||
6 | #define IDX_INVALID -1 | ||
7 | |||
8 | struct array_item { | ||
9 | u64 dl; | ||
10 | int cpu; | ||
11 | }; | ||
12 | |||
13 | struct cpudl { | ||
14 | raw_spinlock_t lock; | ||
15 | int size; | ||
16 | int cpu_to_idx[NR_CPUS]; | ||
17 | struct array_item elements[NR_CPUS]; | ||
18 | cpumask_var_t free_cpus; | ||
19 | }; | ||
20 | |||
21 | |||
22 | #ifdef CONFIG_SMP | ||
23 | int cpudl_find(struct cpudl *cp, struct task_struct *p, | ||
24 | struct cpumask *later_mask); | ||
25 | void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid); | ||
26 | int cpudl_init(struct cpudl *cp); | ||
27 | void cpudl_cleanup(struct cpudl *cp); | ||
28 | #else | ||
29 | #define cpudl_set(cp, cpu, dl) do { } while (0) | ||
30 | #define cpudl_init() do { } while (0) | ||
31 | #endif /* CONFIG_SMP */ | ||
32 | |||
33 | #endif /* _LINUX_CPUDL_H */ | ||
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c new file mode 100644 index 000000000000..6e79b3faa4cd --- /dev/null +++ b/kernel/sched/deadline.c | |||
@@ -0,0 +1,1639 @@ | |||
1 | /* | ||
2 | * Deadline Scheduling Class (SCHED_DEADLINE) | ||
3 | * | ||
4 | * Earliest Deadline First (EDF) + Constant Bandwidth Server (CBS). | ||
5 | * | ||
6 | * Tasks that periodically execute their instances for less than their | ||
7 | * runtime won't miss any of their deadlines. | ||
8 | * Tasks that are not periodic or sporadic or that try to execute more | ||
9 | * than their reserved bandwidth will be slowed down (and may potentially | ||
10 | * miss some of their deadlines), and won't affect any other task. | ||
11 | * | ||
12 | * Copyright (C) 2012 Dario Faggioli <raistlin@linux.it>, | ||
13 | * Juri Lelli <juri.lelli@gmail.com>, | ||
14 | * Michael Trimarchi <michael@amarulasolutions.com>, | ||
15 | * Fabio Checconi <fchecconi@gmail.com> | ||
16 | */ | ||
17 | #include "sched.h" | ||
18 | |||
19 | #include <linux/slab.h> | ||
20 | |||
21 | struct dl_bandwidth def_dl_bandwidth; | ||
22 | |||
23 | static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se) | ||
24 | { | ||
25 | return container_of(dl_se, struct task_struct, dl); | ||
26 | } | ||
27 | |||
28 | static inline struct rq *rq_of_dl_rq(struct dl_rq *dl_rq) | ||
29 | { | ||
30 | return container_of(dl_rq, struct rq, dl); | ||
31 | } | ||
32 | |||
33 | static inline struct dl_rq *dl_rq_of_se(struct sched_dl_entity *dl_se) | ||
34 | { | ||
35 | struct task_struct *p = dl_task_of(dl_se); | ||
36 | struct rq *rq = task_rq(p); | ||
37 | |||
38 | return &rq->dl; | ||
39 | } | ||
40 | |||
41 | static inline int on_dl_rq(struct sched_dl_entity *dl_se) | ||
42 | { | ||
43 | return !RB_EMPTY_NODE(&dl_se->rb_node); | ||
44 | } | ||
45 | |||
46 | static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq) | ||
47 | { | ||
48 | struct sched_dl_entity *dl_se = &p->dl; | ||
49 | |||
50 | return dl_rq->rb_leftmost == &dl_se->rb_node; | ||
51 | } | ||
52 | |||
53 | void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime) | ||
54 | { | ||
55 | raw_spin_lock_init(&dl_b->dl_runtime_lock); | ||
56 | dl_b->dl_period = period; | ||
57 | dl_b->dl_runtime = runtime; | ||
58 | } | ||
59 | |||
60 | extern unsigned long to_ratio(u64 period, u64 runtime); | ||
61 | |||
62 | void init_dl_bw(struct dl_bw *dl_b) | ||
63 | { | ||
64 | raw_spin_lock_init(&dl_b->lock); | ||
65 | raw_spin_lock(&def_dl_bandwidth.dl_runtime_lock); | ||
66 | if (global_rt_runtime() == RUNTIME_INF) | ||
67 | dl_b->bw = -1; | ||
68 | else | ||
69 | dl_b->bw = to_ratio(global_rt_period(), global_rt_runtime()); | ||
70 | raw_spin_unlock(&def_dl_bandwidth.dl_runtime_lock); | ||
71 | dl_b->total_bw = 0; | ||
72 | } | ||
73 | |||
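To make the scale concrete (worked out here, not stated in the patch): with the default sysctls, global_rt_period() is 1,000,000,000 ns and global_rt_runtime() is 950,000,000 ns, so to_ratio() yields (950000000 << 20) / 1000000000 = 996147, i.e. 95% of the 1 << 20 fixed-point unit. Each root_domain therefore starts out with dl_b->bw admitting at most 95% utilization worth of -deadline reservations, while RUNTIME_INF maps to bw == -1, which effectively disables the admission test.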
74 | void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq) | ||
75 | { | ||
76 | dl_rq->rb_root = RB_ROOT; | ||
77 | |||
78 | #ifdef CONFIG_SMP | ||
79 | /* zero means no -deadline tasks */ | ||
80 | dl_rq->earliest_dl.curr = dl_rq->earliest_dl.next = 0; | ||
81 | |||
82 | dl_rq->dl_nr_migratory = 0; | ||
83 | dl_rq->overloaded = 0; | ||
84 | dl_rq->pushable_dl_tasks_root = RB_ROOT; | ||
85 | #else | ||
86 | init_dl_bw(&dl_rq->dl_bw); | ||
87 | #endif | ||
88 | } | ||
89 | |||
90 | #ifdef CONFIG_SMP | ||
91 | |||
92 | static inline int dl_overloaded(struct rq *rq) | ||
93 | { | ||
94 | return atomic_read(&rq->rd->dlo_count); | ||
95 | } | ||
96 | |||
97 | static inline void dl_set_overload(struct rq *rq) | ||
98 | { | ||
99 | if (!rq->online) | ||
100 | return; | ||
101 | |||
102 | cpumask_set_cpu(rq->cpu, rq->rd->dlo_mask); | ||
103 | /* | ||
104 | * Must be visible before the overload count is | ||
105 | * set (as in sched_rt.c). | ||
106 | * | ||
107 | * Matched by the barrier in pull_dl_task(). | ||
108 | */ | ||
109 | smp_wmb(); | ||
110 | atomic_inc(&rq->rd->dlo_count); | ||
111 | } | ||
112 | |||
113 | static inline void dl_clear_overload(struct rq *rq) | ||
114 | { | ||
115 | if (!rq->online) | ||
116 | return; | ||
117 | |||
118 | atomic_dec(&rq->rd->dlo_count); | ||
119 | cpumask_clear_cpu(rq->cpu, rq->rd->dlo_mask); | ||
120 | } | ||
121 | |||
122 | static void update_dl_migration(struct dl_rq *dl_rq) | ||
123 | { | ||
124 | if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_running > 1) { | ||
125 | if (!dl_rq->overloaded) { | ||
126 | dl_set_overload(rq_of_dl_rq(dl_rq)); | ||
127 | dl_rq->overloaded = 1; | ||
128 | } | ||
129 | } else if (dl_rq->overloaded) { | ||
130 | dl_clear_overload(rq_of_dl_rq(dl_rq)); | ||
131 | dl_rq->overloaded = 0; | ||
132 | } | ||
133 | } | ||
134 | |||
135 | static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) | ||
136 | { | ||
137 | struct task_struct *p = dl_task_of(dl_se); | ||
138 | |||
139 | if (p->nr_cpus_allowed > 1) | ||
140 | dl_rq->dl_nr_migratory++; | ||
141 | |||
142 | update_dl_migration(dl_rq); | ||
143 | } | ||
144 | |||
145 | static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) | ||
146 | { | ||
147 | struct task_struct *p = dl_task_of(dl_se); | ||
148 | |||
149 | if (p->nr_cpus_allowed > 1) | ||
150 | dl_rq->dl_nr_migratory--; | ||
151 | |||
152 | update_dl_migration(dl_rq); | ||
153 | } | ||
154 | |||
155 | /* | ||
156 | * The list of pushable -deadline tasks is not a plist, like in | ||
157 | * sched_rt.c; it is an rb-tree with tasks ordered by deadline. | ||
158 | */ | ||
159 | static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p) | ||
160 | { | ||
161 | struct dl_rq *dl_rq = &rq->dl; | ||
162 | struct rb_node **link = &dl_rq->pushable_dl_tasks_root.rb_node; | ||
163 | struct rb_node *parent = NULL; | ||
164 | struct task_struct *entry; | ||
165 | int leftmost = 1; | ||
166 | |||
167 | BUG_ON(!RB_EMPTY_NODE(&p->pushable_dl_tasks)); | ||
168 | |||
169 | while (*link) { | ||
170 | parent = *link; | ||
171 | entry = rb_entry(parent, struct task_struct, | ||
172 | pushable_dl_tasks); | ||
173 | if (dl_entity_preempt(&p->dl, &entry->dl)) | ||
174 | link = &parent->rb_left; | ||
175 | else { | ||
176 | link = &parent->rb_right; | ||
177 | leftmost = 0; | ||
178 | } | ||
179 | } | ||
180 | |||
181 | if (leftmost) | ||
182 | dl_rq->pushable_dl_tasks_leftmost = &p->pushable_dl_tasks; | ||
183 | |||
184 | rb_link_node(&p->pushable_dl_tasks, parent, link); | ||
185 | rb_insert_color(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root); | ||
186 | } | ||
187 | |||
188 | static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p) | ||
189 | { | ||
190 | struct dl_rq *dl_rq = &rq->dl; | ||
191 | |||
192 | if (RB_EMPTY_NODE(&p->pushable_dl_tasks)) | ||
193 | return; | ||
194 | |||
195 | if (dl_rq->pushable_dl_tasks_leftmost == &p->pushable_dl_tasks) { | ||
196 | struct rb_node *next_node; | ||
197 | |||
198 | next_node = rb_next(&p->pushable_dl_tasks); | ||
199 | dl_rq->pushable_dl_tasks_leftmost = next_node; | ||
200 | } | ||
201 | |||
202 | rb_erase(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root); | ||
203 | RB_CLEAR_NODE(&p->pushable_dl_tasks); | ||
204 | } | ||
205 | |||
206 | static inline int has_pushable_dl_tasks(struct rq *rq) | ||
207 | { | ||
208 | return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root); | ||
209 | } | ||
210 | |||
211 | static int push_dl_task(struct rq *rq); | ||
212 | |||
213 | #else | ||
214 | |||
215 | static inline | ||
216 | void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p) | ||
217 | { | ||
218 | } | ||
219 | |||
220 | static inline | ||
221 | void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p) | ||
222 | { | ||
223 | } | ||
224 | |||
225 | static inline | ||
226 | void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) | ||
227 | { | ||
228 | } | ||
229 | |||
230 | static inline | ||
231 | void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) | ||
232 | { | ||
233 | } | ||
234 | |||
235 | #endif /* CONFIG_SMP */ | ||
236 | |||
237 | static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags); | ||
238 | static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags); | ||
239 | static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, | ||
240 | int flags); | ||
241 | |||
242 | /* | ||
243 | * We are being explicitly informed that a new instance is starting, | ||
244 | * and this means that: | ||
245 | * - the absolute deadline of the entity has to be placed at | ||
246 | * current time + relative deadline; | ||
247 | * - the runtime of the entity has to be set to the maximum value. | ||
248 | * | ||
249 | * The capability of specifying such an event is useful whenever a -deadline | ||
250 | * entity wants to (try to!) synchronize its behaviour with the scheduler's | ||
251 | * one, and to (try to!) reconcile itself with its own scheduling | ||
252 | * parameters. | ||
253 | */ | ||
254 | static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se, | ||
255 | struct sched_dl_entity *pi_se) | ||
256 | { | ||
257 | struct dl_rq *dl_rq = dl_rq_of_se(dl_se); | ||
258 | struct rq *rq = rq_of_dl_rq(dl_rq); | ||
259 | |||
260 | WARN_ON(!dl_se->dl_new || dl_se->dl_throttled); | ||
261 | |||
262 | /* | ||
263 | * We use the regular wall clock time to set deadlines in the | ||
264 | * future; in fact, we must consider execution overheads (time | ||
265 | * spent on hardirq context, etc.). | ||
266 | */ | ||
267 | dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; | ||
268 | dl_se->runtime = pi_se->dl_runtime; | ||
269 | dl_se->dl_new = 0; | ||
270 | } | ||
271 | |||
272 | /* | ||
273 | * Pure Earliest Deadline First (EDF) scheduling does not deal with the | ||
274 | * possibility of an entity lasting more than what it declared, and thus | ||
275 | * exhausting its runtime. | ||
276 | * | ||
277 | * Here we are interested in making runtime overrun possible, but we do | ||
278 | * not want an entity which is misbehaving to affect the scheduling of all | ||
279 | * other entities. | ||
280 | * Therefore, a budgeting strategy called Constant Bandwidth Server (CBS) | ||
281 | * is used, in order to confine each entity within its own bandwidth. | ||
282 | * | ||
283 | * This function deals exactly with that, and ensures that when the runtime | ||
284 | * of an entity is replenished, its deadline is also postponed. That ensures | ||
285 | * the overrunning entity can't interfere with other entities in the system and | ||
286 | * can't make them miss their deadlines. Reasons why this kind of overrun | ||
287 | * could happen are, typically, an entity voluntarily trying to exceed its | ||
288 | * runtime, or one that simply underestimated it during sched_setattr(). | ||
289 | */ | ||
290 | static void replenish_dl_entity(struct sched_dl_entity *dl_se, | ||
291 | struct sched_dl_entity *pi_se) | ||
292 | { | ||
293 | struct dl_rq *dl_rq = dl_rq_of_se(dl_se); | ||
294 | struct rq *rq = rq_of_dl_rq(dl_rq); | ||
295 | |||
296 | BUG_ON(pi_se->dl_runtime <= 0); | ||
297 | |||
298 | /* | ||
299 | * This could be the case for a !-dl task that is boosted. | ||
300 | * Just go with full inherited parameters. | ||
301 | */ | ||
302 | if (dl_se->dl_deadline == 0) { | ||
303 | dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; | ||
304 | dl_se->runtime = pi_se->dl_runtime; | ||
305 | } | ||
306 | |||
307 | /* | ||
308 | * We keep moving the deadline away until we get some | ||
309 | * available runtime for the entity. This ensures correct | ||
310 | * handling of situations where the runtime overrun is | ||
311 | * arbitrarily large. | ||
312 | */ | ||
313 | while (dl_se->runtime <= 0) { | ||
314 | dl_se->deadline += pi_se->dl_period; | ||
315 | dl_se->runtime += pi_se->dl_runtime; | ||
316 | } | ||
317 | |||
318 | /* | ||
319 | * At this point, the deadline really should be "in | ||
320 | * the future" with respect to rq->clock. If it's | ||
321 | * not, we are, for some reason, lagging too much! | ||
322 | * Anyway, after having warned userspace about that, | ||
323 | * we still try to keep things running by | ||
324 | * resetting the deadline and the budget of the | ||
325 | * entity. | ||
326 | */ | ||
327 | if (dl_time_before(dl_se->deadline, rq_clock(rq))) { | ||
328 | static bool lag_once = false; | ||
329 | |||
330 | if (!lag_once) { | ||
331 | lag_once = true; | ||
332 | printk_sched("sched: DL replenish lagged too much\n"); | ||
333 | } | ||
334 | dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; | ||
335 | dl_se->runtime = pi_se->dl_runtime; | ||
336 | } | ||
337 | } | ||
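/*
 * Illustrative user-space sketch of the replenishment rule above; the
 * names below are made up for the example, they are not the kernel's.
 * The loop postpones the absolute deadline one period at a time while
 * adding one budget per period, so an arbitrarily large overrun is
 * eventually absorbed. E.g. an entity that overran by 2.5ms with
 * dl_runtime = 1ms and dl_period = 10ms takes three iterations and ends
 * up with 0.5ms of budget and a deadline 30ms further in the future.
 */
#include <stdint.h>

struct dl_sketch {
	int64_t  runtime;	/* remaining budget in ns, may be negative */
	uint64_t deadline;	/* absolute deadline in ns */
};

static void sketch_replenish(struct dl_sketch *se,
			     uint64_t dl_runtime, uint64_t dl_period)
{
	while (se->runtime <= 0) {
		se->deadline += dl_period;
		se->runtime += (int64_t)dl_runtime;
	}
}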
338 | |||
339 | /* | ||
340 | * Here we check if --at time t-- an entity (which is probably being | ||
341 | * [re]activated or, in general, enqueued) can use its remaining runtime | ||
342 | * and its current deadline _without_ exceeding the bandwidth it is | ||
343 | * assigned (function returns true if it can't). We are in fact applying | ||
344 | * one of the CBS rules: when a task wakes up, if the residual runtime | ||
345 | * over residual deadline fits within the allocated bandwidth, then we | ||
346 | * can keep the current (absolute) deadline and residual budget without | ||
347 | * disrupting the schedulability of the system. Otherwise, we should | ||
348 | * refill the runtime and set the deadline a period in the future, | ||
349 | * because keeping the current (absolute) deadline of the task would | ||
350 | * result in breaking guarantees promised to other tasks (refer to | ||
351 | * Documentation/scheduler/sched-deadline.txt for more information). | ||
352 | * | ||
353 | * This function returns true if: | ||
354 | * | ||
355 | * runtime / (deadline - t) > dl_runtime / dl_period , | ||
356 | * | ||
357 | * IOW we can't recycle current parameters. | ||
358 | * | ||
359 | * Notice that the bandwidth check is done against the period. For | ||
360 | * tasks with deadline equal to the period this is the same as using | ||
361 | * dl_deadline instead of dl_period in the equation above. | ||
362 | */ | ||
363 | static bool dl_entity_overflow(struct sched_dl_entity *dl_se, | ||
364 | struct sched_dl_entity *pi_se, u64 t) | ||
365 | { | ||
366 | u64 left, right; | ||
367 | |||
368 | /* | ||
369 | * left and right are the two sides of the equation above, | ||
370 | * after a bit of shuffling to use multiplications instead | ||
371 | * of divisions. | ||
372 | * | ||
373 | * Note that none of the time values involved in the two | ||
374 | * multiplications are absolute: dl_deadline and dl_runtime | ||
375 | * are the relative deadline and the maximum runtime of each | ||
376 | * instance, runtime is the runtime left for the last instance | ||
377 | * and (deadline - t), since t is rq->clock, is the time left | ||
378 | * to the (absolute) deadline. Even if overflowing the u64 type | ||
379 | * is very unlikely to occur in both cases, here we scale down | ||
380 | * as we want to avoid that risk altogether. Scaling down by 10 | ||
381 | * bits means that we reduce granularity to 1us. We are fine with it, | ||
382 | * since this is only a true/false check and, anyway, thinking | ||
383 | * of anything below microsecond resolution is actually fiction | ||
384 | * (but still we want to give the user that illusion >;). | ||
385 | */ | ||
386 | left = (pi_se->dl_period >> DL_SCALE) * (dl_se->runtime >> DL_SCALE); | ||
387 | right = ((dl_se->deadline - t) >> DL_SCALE) * | ||
388 | (pi_se->dl_runtime >> DL_SCALE); | ||
389 | |||
390 | return dl_time_before(right, left); | ||
391 | } | ||
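/*
 * Stand-alone illustration of the check above (made-up names, runtime
 * assumed non-negative here): the inequality
 *
 *	runtime / (deadline - t) > dl_runtime / dl_period
 *
 * is evaluated as a cross-multiplication, with every factor first
 * shifted right by DL_SCALE (10 bits, ~1us granularity) so that the
 * 64-bit products cannot overflow.
 */
#include <stdbool.h>
#include <stdint.h>

#define SKETCH_DL_SCALE	10

static bool sketch_dl_entity_overflow(uint64_t runtime, uint64_t deadline,
				      uint64_t dl_runtime, uint64_t dl_period,
				      uint64_t t)
{
	uint64_t left, right;

	left = (dl_period >> SKETCH_DL_SCALE) * (runtime >> SKETCH_DL_SCALE);
	right = ((deadline - t) >> SKETCH_DL_SCALE) *
		(dl_runtime >> SKETCH_DL_SCALE);

	/* true: the current (deadline, runtime) pair cannot be recycled */
	return (int64_t)(right - left) < 0;
}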
392 | |||
393 | /* | ||
394 | * When a -deadline entity is queued back on the runqueue, its runtime and | ||
395 | * deadline might need updating. | ||
396 | * | ||
397 | * The policy here is that we update the deadline of the entity only if: | ||
398 | * - the current deadline is in the past, | ||
399 | * - using the remaining runtime with the current deadline would make | ||
400 | * the entity exceed its bandwidth. | ||
401 | */ | ||
402 | static void update_dl_entity(struct sched_dl_entity *dl_se, | ||
403 | struct sched_dl_entity *pi_se) | ||
404 | { | ||
405 | struct dl_rq *dl_rq = dl_rq_of_se(dl_se); | ||
406 | struct rq *rq = rq_of_dl_rq(dl_rq); | ||
407 | |||
408 | /* | ||
409 | * The arrival of a new instance needs special treatment, i.e., | ||
410 | * the actual scheduling parameters have to be "renewed". | ||
411 | */ | ||
412 | if (dl_se->dl_new) { | ||
413 | setup_new_dl_entity(dl_se, pi_se); | ||
414 | return; | ||
415 | } | ||
416 | |||
417 | if (dl_time_before(dl_se->deadline, rq_clock(rq)) || | ||
418 | dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) { | ||
419 | dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; | ||
420 | dl_se->runtime = pi_se->dl_runtime; | ||
421 | } | ||
422 | } | ||
423 | |||
424 | /* | ||
425 | * If the entity depleted all its runtime, and if we want it to sleep | ||
426 | * while waiting for some new execution time to become available, we | ||
427 | * set the bandwidth enforcement timer to the replenishment instant | ||
428 | * and try to activate it. | ||
429 | * | ||
430 | * Notice that it is important for the caller to know if the timer | ||
431 | * actually started or not (i.e., the replenishment instant is in | ||
432 | * the future or in the past). | ||
433 | */ | ||
434 | static int start_dl_timer(struct sched_dl_entity *dl_se, bool boosted) | ||
435 | { | ||
436 | struct dl_rq *dl_rq = dl_rq_of_se(dl_se); | ||
437 | struct rq *rq = rq_of_dl_rq(dl_rq); | ||
438 | ktime_t now, act; | ||
439 | ktime_t soft, hard; | ||
440 | unsigned long range; | ||
441 | s64 delta; | ||
442 | |||
443 | if (boosted) | ||
444 | return 0; | ||
445 | /* | ||
446 | * We want the timer to fire at the deadline, but considering | ||
447 | * that the deadline is expressed in rq->clock time and not in the | ||
448 | * hrtimer's time base. | ||
449 | */ | ||
450 | act = ns_to_ktime(dl_se->deadline); | ||
451 | now = hrtimer_cb_get_time(&dl_se->dl_timer); | ||
452 | delta = ktime_to_ns(now) - rq_clock(rq); | ||
453 | act = ktime_add_ns(act, delta); | ||
454 | |||
455 | /* | ||
456 | * If the expiry time already passed, e.g., because the value | ||
457 | * chosen as the deadline is too small, don't even try to | ||
458 | * start the timer in the past! | ||
459 | */ | ||
460 | if (ktime_us_delta(act, now) < 0) | ||
461 | return 0; | ||
462 | |||
463 | hrtimer_set_expires(&dl_se->dl_timer, act); | ||
464 | |||
465 | soft = hrtimer_get_softexpires(&dl_se->dl_timer); | ||
466 | hard = hrtimer_get_expires(&dl_se->dl_timer); | ||
467 | range = ktime_to_ns(ktime_sub(hard, soft)); | ||
468 | __hrtimer_start_range_ns(&dl_se->dl_timer, soft, | ||
469 | range, HRTIMER_MODE_ABS, 0); | ||
470 | |||
471 | return hrtimer_active(&dl_se->dl_timer); | ||
472 | } | ||
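/*
 * Sketch (made-up names) of the clock-domain fixup performed above: the
 * deadline is expressed in rq-clock nanoseconds while the hrtimer runs
 * on its own time base, so the expiry is shifted by the current offset
 * between the two clocks before the timer is armed.
 */
#include <stdint.h>

static uint64_t sketch_dl_timer_expiry(uint64_t deadline,     /* rq-clock ns */
				       uint64_t hrtimer_now,  /* timer-base ns */
				       uint64_t rq_clock_now) /* rq-clock ns */
{
	int64_t delta = (int64_t)(hrtimer_now - rq_clock_now);

	return deadline + delta;	/* act = deadline + (now - rq_clock) */
}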
473 | |||
474 | /* | ||
475 | * This is the bandwidth enforcement timer callback. If here, we know | ||
476 | * a task is not on its dl_rq, since the fact that the timer was running | ||
477 | * means the task is throttled and needs a runtime replenishment. | ||
478 | * | ||
479 | * However, what we actually do depends on whether the task is still active | ||
480 | * (i.e., it is on its rq) or has been removed from there by a call to | ||
481 | * dequeue_task_dl(). In the former case we must issue the runtime | ||
482 | * replenishment and add the task back to the dl_rq; in the latter, we just | ||
483 | * do nothing but clear dl_throttled, so that runtime and deadline | ||
484 | * updating (and the queueing back to dl_rq) will be done by the | ||
485 | * next call to enqueue_task_dl(). | ||
486 | */ | ||
487 | static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) | ||
488 | { | ||
489 | struct sched_dl_entity *dl_se = container_of(timer, | ||
490 | struct sched_dl_entity, | ||
491 | dl_timer); | ||
492 | struct task_struct *p = dl_task_of(dl_se); | ||
493 | struct rq *rq = task_rq(p); | ||
494 | raw_spin_lock(&rq->lock); | ||
495 | |||
496 | /* | ||
497 | * We need to take care of possible races here. In fact, the | ||
498 | * task might have changed its scheduling policy to something | ||
499 | * different from SCHED_DEADLINE or changed its reservation | ||
500 | * parameters (through sched_setscheduler()). | ||
501 | */ | ||
502 | if (!dl_task(p) || dl_se->dl_new) | ||
503 | goto unlock; | ||
504 | |||
505 | sched_clock_tick(); | ||
506 | update_rq_clock(rq); | ||
507 | dl_se->dl_throttled = 0; | ||
508 | if (p->on_rq) { | ||
509 | enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); | ||
510 | if (task_has_dl_policy(rq->curr)) | ||
511 | check_preempt_curr_dl(rq, p, 0); | ||
512 | else | ||
513 | resched_task(rq->curr); | ||
514 | #ifdef CONFIG_SMP | ||
515 | /* | ||
516 | * Queueing this task back might have overloaded rq, | ||
517 | * check if we need to kick someone away. | ||
518 | */ | ||
519 | if (has_pushable_dl_tasks(rq)) | ||
520 | push_dl_task(rq); | ||
521 | #endif | ||
522 | } | ||
523 | unlock: | ||
524 | raw_spin_unlock(&rq->lock); | ||
525 | |||
526 | return HRTIMER_NORESTART; | ||
527 | } | ||
528 | |||
529 | void init_dl_task_timer(struct sched_dl_entity *dl_se) | ||
530 | { | ||
531 | struct hrtimer *timer = &dl_se->dl_timer; | ||
532 | |||
533 | if (hrtimer_active(timer)) { | ||
534 | hrtimer_try_to_cancel(timer); | ||
535 | return; | ||
536 | } | ||
537 | |||
538 | hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
539 | timer->function = dl_task_timer; | ||
540 | } | ||
541 | |||
542 | static | ||
543 | int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se) | ||
544 | { | ||
545 | int dmiss = dl_time_before(dl_se->deadline, rq_clock(rq)); | ||
546 | int rorun = dl_se->runtime <= 0; | ||
547 | |||
548 | if (!rorun && !dmiss) | ||
549 | return 0; | ||
550 | |||
551 | /* | ||
552 | * If we are beyond our current deadline and we are still | ||
553 | * executing, then we have already used some of the runtime of | ||
554 | * the next instance. Thus, if we do not account that, we are | ||
555 | * stealing bandwidth from the system at each deadline miss! | ||
556 | */ | ||
557 | if (dmiss) { | ||
558 | dl_se->runtime = rorun ? dl_se->runtime : 0; | ||
559 | dl_se->runtime -= rq_clock(rq) - dl_se->deadline; | ||
560 | } | ||
561 | |||
562 | return 1; | ||
563 | } | ||
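/*
 * Self-contained sketch of the accounting above (names invented for the
 * example): on a deadline miss, the time executed past the deadline is
 * charged against the next instance's budget, otherwise every miss
 * would silently steal bandwidth from the rest of the system. E.g. a
 * task caught 300us past its deadline while it still had budget left
 * restarts, after replenishment, with runtime = dl_runtime - 300us.
 */
#include <stdint.h>

struct overrun_sketch {
	int64_t  runtime;	/* remaining budget (ns) */
	uint64_t deadline;	/* absolute deadline (ns) */
};

static int sketch_runtime_exceeded(struct overrun_sketch *se, uint64_t now)
{
	int dmiss = (int64_t)(se->deadline - now) < 0;	/* deadline missed? */
	int rorun = se->runtime <= 0;			/* budget exhausted? */

	if (!rorun && !dmiss)
		return 0;

	if (dmiss) {
		se->runtime = rorun ? se->runtime : 0;
		se->runtime -= (int64_t)(now - se->deadline); /* charge overrun */
	}
	return 1;
}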
564 | |||
565 | extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); | ||
566 | |||
567 | /* | ||
568 | * Update the current task's runtime statistics (provided it is still | ||
569 | * a -deadline task and has not been removed from the dl_rq). | ||
570 | */ | ||
571 | static void update_curr_dl(struct rq *rq) | ||
572 | { | ||
573 | struct task_struct *curr = rq->curr; | ||
574 | struct sched_dl_entity *dl_se = &curr->dl; | ||
575 | u64 delta_exec; | ||
576 | |||
577 | if (!dl_task(curr) || !on_dl_rq(dl_se)) | ||
578 | return; | ||
579 | |||
580 | /* | ||
581 | * Consumed budget is computed considering the time as | ||
582 | * observed by schedulable tasks (excluding time spent | ||
583 | * in hardirq context, etc.). Deadlines are instead | ||
584 | * computed using hard walltime. This seems to be the more | ||
585 | * natural solution, but the full ramifications of this | ||
586 | * approach need further study. | ||
587 | */ | ||
588 | delta_exec = rq_clock_task(rq) - curr->se.exec_start; | ||
589 | if (unlikely((s64)delta_exec < 0)) | ||
590 | delta_exec = 0; | ||
591 | |||
592 | schedstat_set(curr->se.statistics.exec_max, | ||
593 | max(curr->se.statistics.exec_max, delta_exec)); | ||
594 | |||
595 | curr->se.sum_exec_runtime += delta_exec; | ||
596 | account_group_exec_runtime(curr, delta_exec); | ||
597 | |||
598 | curr->se.exec_start = rq_clock_task(rq); | ||
599 | cpuacct_charge(curr, delta_exec); | ||
600 | |||
601 | sched_rt_avg_update(rq, delta_exec); | ||
602 | |||
603 | dl_se->runtime -= delta_exec; | ||
604 | if (dl_runtime_exceeded(rq, dl_se)) { | ||
605 | __dequeue_task_dl(rq, curr, 0); | ||
606 | if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted))) | ||
607 | dl_se->dl_throttled = 1; | ||
608 | else | ||
609 | enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); | ||
610 | |||
611 | if (!is_leftmost(curr, &rq->dl)) | ||
612 | resched_task(curr); | ||
613 | } | ||
614 | |||
615 | /* | ||
616 | * Because -- for now -- we share the rt bandwidth, we need to | ||
617 | * account our runtime there too, otherwise actual rt tasks | ||
618 | * would be able to exceed the shared quota. | ||
619 | * | ||
620 | * Account to the root rt group for now. | ||
621 | * | ||
622 | * The solution we're working towards is having the RT groups scheduled | ||
623 | * using deadline servers -- however there are a few nasties to figure | ||
624 | * out before that can happen. | ||
625 | */ | ||
626 | if (rt_bandwidth_enabled()) { | ||
627 | struct rt_rq *rt_rq = &rq->rt; | ||
628 | |||
629 | raw_spin_lock(&rt_rq->rt_runtime_lock); | ||
630 | /* | ||
631 | * We'll let actual RT tasks worry about the overflow here, we | ||
632 | * have our own CBS to keep us in line; only account when RT | ||
633 | * bandwidth is relevant. | ||
634 | */ | ||
635 | if (sched_rt_bandwidth_account(rt_rq)) | ||
636 | rt_rq->rt_time += delta_exec; | ||
637 | raw_spin_unlock(&rt_rq->rt_runtime_lock); | ||
638 | } | ||
639 | } | ||
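/*
 * Condensed sketch (invented names) of the enforcement step above:
 * charge the execution delta to the budget; once the budget is gone the
 * task gives up the CPU and is parked until the replenishment timer
 * fires, unless the timer could not be armed, in which case the caller
 * replenishes and re-queues it immediately.
 */
#include <stdbool.h>
#include <stdint.h>

struct cbs_sketch {
	int64_t runtime;	/* remaining budget (ns) */
	bool throttled;
};

/* returns true if the entity must be taken off the runqueue */
static bool sketch_charge(struct cbs_sketch *se, uint64_t delta_exec,
			  bool timer_armed)
{
	se->runtime -= (int64_t)delta_exec;
	if (se->runtime > 0)
		return false;

	if (timer_armed)
		se->throttled = true;	/* sleep until replenishment */
	/* else: caller replenishes and re-queues right away */
	return true;
}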
640 | |||
641 | #ifdef CONFIG_SMP | ||
642 | |||
643 | static struct task_struct *pick_next_earliest_dl_task(struct rq *rq, int cpu); | ||
644 | |||
645 | static inline u64 next_deadline(struct rq *rq) | ||
646 | { | ||
647 | struct task_struct *next = pick_next_earliest_dl_task(rq, rq->cpu); | ||
648 | |||
649 | if (next && dl_prio(next->prio)) | ||
650 | return next->dl.deadline; | ||
651 | else | ||
652 | return 0; | ||
653 | } | ||
654 | |||
655 | static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) | ||
656 | { | ||
657 | struct rq *rq = rq_of_dl_rq(dl_rq); | ||
658 | |||
659 | if (dl_rq->earliest_dl.curr == 0 || | ||
660 | dl_time_before(deadline, dl_rq->earliest_dl.curr)) { | ||
661 | /* | ||
662 | * If the dl_rq had no -deadline tasks, or if the new task | ||
663 | * has shorter deadline than the current one on dl_rq, we | ||
664 | * know that the previous earliest becomes our next earliest, | ||
665 | * as the new task becomes the earliest itself. | ||
666 | */ | ||
667 | dl_rq->earliest_dl.next = dl_rq->earliest_dl.curr; | ||
668 | dl_rq->earliest_dl.curr = deadline; | ||
669 | cpudl_set(&rq->rd->cpudl, rq->cpu, deadline, 1); | ||
670 | } else if (dl_rq->earliest_dl.next == 0 || | ||
671 | dl_time_before(deadline, dl_rq->earliest_dl.next)) { | ||
672 | /* | ||
673 | * On the other hand, if the new -deadline task has a | ||
674 | * later deadline than the earliest one on dl_rq, but | ||
675 | * it is earlier than the next (if any), we must | ||
676 | * recompute the next-earliest. | ||
677 | */ | ||
678 | dl_rq->earliest_dl.next = next_deadline(rq); | ||
679 | } | ||
680 | } | ||
681 | |||
682 | static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) | ||
683 | { | ||
684 | struct rq *rq = rq_of_dl_rq(dl_rq); | ||
685 | |||
686 | /* | ||
687 | * Since we may have removed our earliest (and/or next earliest) | ||
688 | * task we must recompute them. | ||
689 | */ | ||
690 | if (!dl_rq->dl_nr_running) { | ||
691 | dl_rq->earliest_dl.curr = 0; | ||
692 | dl_rq->earliest_dl.next = 0; | ||
693 | cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0); | ||
694 | } else { | ||
695 | struct rb_node *leftmost = dl_rq->rb_leftmost; | ||
696 | struct sched_dl_entity *entry; | ||
697 | |||
698 | entry = rb_entry(leftmost, struct sched_dl_entity, rb_node); | ||
699 | dl_rq->earliest_dl.curr = entry->deadline; | ||
700 | dl_rq->earliest_dl.next = next_deadline(rq); | ||
701 | cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline, 1); | ||
702 | } | ||
703 | } | ||
704 | |||
705 | #else | ||
706 | |||
707 | static inline void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {} | ||
708 | static inline void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {} | ||
709 | |||
710 | #endif /* CONFIG_SMP */ | ||
711 | |||
712 | static inline | ||
713 | void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) | ||
714 | { | ||
715 | int prio = dl_task_of(dl_se)->prio; | ||
716 | u64 deadline = dl_se->deadline; | ||
717 | |||
718 | WARN_ON(!dl_prio(prio)); | ||
719 | dl_rq->dl_nr_running++; | ||
720 | inc_nr_running(rq_of_dl_rq(dl_rq)); | ||
721 | |||
722 | inc_dl_deadline(dl_rq, deadline); | ||
723 | inc_dl_migration(dl_se, dl_rq); | ||
724 | } | ||
725 | |||
726 | static inline | ||
727 | void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) | ||
728 | { | ||
729 | int prio = dl_task_of(dl_se)->prio; | ||
730 | |||
731 | WARN_ON(!dl_prio(prio)); | ||
732 | WARN_ON(!dl_rq->dl_nr_running); | ||
733 | dl_rq->dl_nr_running--; | ||
734 | dec_nr_running(rq_of_dl_rq(dl_rq)); | ||
735 | |||
736 | dec_dl_deadline(dl_rq, dl_se->deadline); | ||
737 | dec_dl_migration(dl_se, dl_rq); | ||
738 | } | ||
739 | |||
740 | static void __enqueue_dl_entity(struct sched_dl_entity *dl_se) | ||
741 | { | ||
742 | struct dl_rq *dl_rq = dl_rq_of_se(dl_se); | ||
743 | struct rb_node **link = &dl_rq->rb_root.rb_node; | ||
744 | struct rb_node *parent = NULL; | ||
745 | struct sched_dl_entity *entry; | ||
746 | int leftmost = 1; | ||
747 | |||
748 | BUG_ON(!RB_EMPTY_NODE(&dl_se->rb_node)); | ||
749 | |||
750 | while (*link) { | ||
751 | parent = *link; | ||
752 | entry = rb_entry(parent, struct sched_dl_entity, rb_node); | ||
753 | if (dl_time_before(dl_se->deadline, entry->deadline)) | ||
754 | link = &parent->rb_left; | ||
755 | else { | ||
756 | link = &parent->rb_right; | ||
757 | leftmost = 0; | ||
758 | } | ||
759 | } | ||
760 | |||
761 | if (leftmost) | ||
762 | dl_rq->rb_leftmost = &dl_se->rb_node; | ||
763 | |||
764 | rb_link_node(&dl_se->rb_node, parent, link); | ||
765 | rb_insert_color(&dl_se->rb_node, &dl_rq->rb_root); | ||
766 | |||
767 | inc_dl_tasks(dl_se, dl_rq); | ||
768 | } | ||
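/*
 * The tree above is ordered by absolute deadline. The comparison is
 * presumably the usual wrap-safe "time_before" idiom, along the lines
 * of this sketch (the real dl_time_before() is defined outside this
 * hunk):
 */
#include <stdbool.h>
#include <stdint.h>

static inline bool sketch_dl_time_before(uint64_t a, uint64_t b)
{
	/* signed difference keeps the ordering correct across wraparound */
	return (int64_t)(a - b) < 0;
}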
769 | |||
770 | static void __dequeue_dl_entity(struct sched_dl_entity *dl_se) | ||
771 | { | ||
772 | struct dl_rq *dl_rq = dl_rq_of_se(dl_se); | ||
773 | |||
774 | if (RB_EMPTY_NODE(&dl_se->rb_node)) | ||
775 | return; | ||
776 | |||
777 | if (dl_rq->rb_leftmost == &dl_se->rb_node) { | ||
778 | struct rb_node *next_node; | ||
779 | |||
780 | next_node = rb_next(&dl_se->rb_node); | ||
781 | dl_rq->rb_leftmost = next_node; | ||
782 | } | ||
783 | |||
784 | rb_erase(&dl_se->rb_node, &dl_rq->rb_root); | ||
785 | RB_CLEAR_NODE(&dl_se->rb_node); | ||
786 | |||
787 | dec_dl_tasks(dl_se, dl_rq); | ||
788 | } | ||
789 | |||
790 | static void | ||
791 | enqueue_dl_entity(struct sched_dl_entity *dl_se, | ||
792 | struct sched_dl_entity *pi_se, int flags) | ||
793 | { | ||
794 | BUG_ON(on_dl_rq(dl_se)); | ||
795 | |||
796 | /* | ||
797 | * If this is a wakeup or a new instance, the scheduling | ||
798 | * parameters of the task might need updating. Otherwise, | ||
799 | * we want a replenishment of its runtime. | ||
800 | */ | ||
801 | if (!dl_se->dl_new && flags & ENQUEUE_REPLENISH) | ||
802 | replenish_dl_entity(dl_se, pi_se); | ||
803 | else | ||
804 | update_dl_entity(dl_se, pi_se); | ||
805 | |||
806 | __enqueue_dl_entity(dl_se); | ||
807 | } | ||
808 | |||
809 | static void dequeue_dl_entity(struct sched_dl_entity *dl_se) | ||
810 | { | ||
811 | __dequeue_dl_entity(dl_se); | ||
812 | } | ||
813 | |||
814 | static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) | ||
815 | { | ||
816 | struct task_struct *pi_task = rt_mutex_get_top_task(p); | ||
817 | struct sched_dl_entity *pi_se = &p->dl; | ||
818 | |||
819 | /* | ||
820 | * Use the scheduling parameters of the top pi-waiter | ||
821 | * task if we have one and its (relative) deadline is | ||
822 | * smaller than ours... otherwise we keep our runtime and | ||
823 | * deadline. | ||
824 | */ | ||
825 | if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) | ||
826 | pi_se = &pi_task->dl; | ||
827 | |||
828 | /* | ||
829 | * If p is throttled, we do nothing. In fact, if it exhausted | ||
830 | * its budget it needs a replenishment and, since it now is on | ||
831 | * its rq, the bandwidth timer callback (which clearly has not | ||
832 | * run yet) will take care of this. | ||
833 | */ | ||
834 | if (p->dl.dl_throttled) | ||
835 | return; | ||
836 | |||
837 | enqueue_dl_entity(&p->dl, pi_se, flags); | ||
838 | |||
839 | if (!task_current(rq, p) && p->nr_cpus_allowed > 1) | ||
840 | enqueue_pushable_dl_task(rq, p); | ||
841 | } | ||
842 | |||
843 | static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) | ||
844 | { | ||
845 | dequeue_dl_entity(&p->dl); | ||
846 | dequeue_pushable_dl_task(rq, p); | ||
847 | } | ||
848 | |||
849 | static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) | ||
850 | { | ||
851 | update_curr_dl(rq); | ||
852 | __dequeue_task_dl(rq, p, flags); | ||
853 | } | ||
854 | |||
855 | /* | ||
856 | * Yield task semantic for -deadline tasks is: | ||
857 | * | ||
858 | * get off the CPU until our next instance, with | ||
859 | * a new runtime. This is of little use now, since we | ||
860 | * don't have a bandwidth reclaiming mechanism. Anyway, | ||
861 | * bandwidth reclaiming is planned for the future, and | ||
862 | * yield_task_dl will indicate that some spare budget | ||
863 | * is available for other task instances to use. | ||
864 | */ | ||
865 | static void yield_task_dl(struct rq *rq) | ||
866 | { | ||
867 | struct task_struct *p = rq->curr; | ||
868 | |||
869 | /* | ||
870 | * We make the task go to sleep until its current deadline by | ||
871 | * forcing its runtime to zero. This way, update_curr_dl() stops | ||
872 | * it and the bandwidth timer will wake it up and will give it | ||
873 | * new scheduling parameters (thanks to dl_new=1). | ||
874 | */ | ||
875 | if (p->dl.runtime > 0) { | ||
876 | rq->curr->dl.dl_new = 1; | ||
877 | p->dl.runtime = 0; | ||
878 | } | ||
879 | update_curr_dl(rq); | ||
880 | } | ||
881 | |||
882 | #ifdef CONFIG_SMP | ||
883 | |||
884 | static int find_later_rq(struct task_struct *task); | ||
885 | |||
886 | static int | ||
887 | select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) | ||
888 | { | ||
889 | struct task_struct *curr; | ||
890 | struct rq *rq; | ||
891 | |||
892 | if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) | ||
893 | goto out; | ||
894 | |||
895 | rq = cpu_rq(cpu); | ||
896 | |||
897 | rcu_read_lock(); | ||
898 | curr = ACCESS_ONCE(rq->curr); /* unlocked access */ | ||
899 | |||
900 | /* | ||
901 | * If we are dealing with a -deadline task, we must | ||
902 | * decide where to wake it up. | ||
903 | * If it has a later deadline and the current task | ||
904 | * on this rq can't move (provided the waking task | ||
905 | * can!) we prefer to send it somewhere else. On the | ||
906 | * other hand, if it has a shorter deadline, we | ||
907 | * try to make it stay here, as it might be important. | ||
908 | */ | ||
909 | if (unlikely(dl_task(curr)) && | ||
910 | (curr->nr_cpus_allowed < 2 || | ||
911 | !dl_entity_preempt(&p->dl, &curr->dl)) && | ||
912 | (p->nr_cpus_allowed > 1)) { | ||
913 | int target = find_later_rq(p); | ||
914 | |||
915 | if (target != -1) | ||
916 | cpu = target; | ||
917 | } | ||
918 | rcu_read_unlock(); | ||
919 | |||
920 | out: | ||
921 | return cpu; | ||
922 | } | ||
923 | |||
924 | static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) | ||
925 | { | ||
926 | /* | ||
927 | * Current can't be migrated, useless to reschedule, | ||
928 | * let's hope p can move out. | ||
929 | */ | ||
930 | if (rq->curr->nr_cpus_allowed == 1 || | ||
931 | cpudl_find(&rq->rd->cpudl, rq->curr, NULL) == -1) | ||
932 | return; | ||
933 | |||
934 | /* | ||
935 | * p is migratable, so let's not schedule it and | ||
936 | * see if it is pushed or pulled somewhere else. | ||
937 | */ | ||
938 | if (p->nr_cpus_allowed != 1 && | ||
939 | cpudl_find(&rq->rd->cpudl, p, NULL) != -1) | ||
940 | return; | ||
941 | |||
942 | resched_task(rq->curr); | ||
943 | } | ||
944 | |||
945 | #endif /* CONFIG_SMP */ | ||
946 | |||
947 | /* | ||
948 | * Only called when both the current and waking task are -deadline | ||
949 | * tasks. | ||
950 | */ | ||
951 | static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, | ||
952 | int flags) | ||
953 | { | ||
954 | if (dl_entity_preempt(&p->dl, &rq->curr->dl)) { | ||
955 | resched_task(rq->curr); | ||
956 | return; | ||
957 | } | ||
958 | |||
959 | #ifdef CONFIG_SMP | ||
960 | /* | ||
961 | * In the unlikely case current and p have the same deadline | ||
962 | * let us try to decide what's the best thing to do... | ||
963 | */ | ||
964 | if ((p->dl.deadline == rq->curr->dl.deadline) && | ||
965 | !test_tsk_need_resched(rq->curr)) | ||
966 | check_preempt_equal_dl(rq, p); | ||
967 | #endif /* CONFIG_SMP */ | ||
968 | } | ||
969 | |||
970 | #ifdef CONFIG_SCHED_HRTICK | ||
971 | static void start_hrtick_dl(struct rq *rq, struct task_struct *p) | ||
972 | { | ||
973 | s64 delta = p->dl.dl_runtime - p->dl.runtime; | ||
974 | |||
975 | if (delta > 10000) | ||
976 | hrtick_start(rq, p->dl.runtime); | ||
977 | } | ||
978 | #endif | ||
979 | |||
980 | static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, | ||
981 | struct dl_rq *dl_rq) | ||
982 | { | ||
983 | struct rb_node *left = dl_rq->rb_leftmost; | ||
984 | |||
985 | if (!left) | ||
986 | return NULL; | ||
987 | |||
988 | return rb_entry(left, struct sched_dl_entity, rb_node); | ||
989 | } | ||
990 | |||
991 | struct task_struct *pick_next_task_dl(struct rq *rq) | ||
992 | { | ||
993 | struct sched_dl_entity *dl_se; | ||
994 | struct task_struct *p; | ||
995 | struct dl_rq *dl_rq; | ||
996 | |||
997 | dl_rq = &rq->dl; | ||
998 | |||
999 | if (unlikely(!dl_rq->dl_nr_running)) | ||
1000 | return NULL; | ||
1001 | |||
1002 | dl_se = pick_next_dl_entity(rq, dl_rq); | ||
1003 | BUG_ON(!dl_se); | ||
1004 | |||
1005 | p = dl_task_of(dl_se); | ||
1006 | p->se.exec_start = rq_clock_task(rq); | ||
1007 | |||
1008 | /* Running task will never be pushed. */ | ||
1009 | dequeue_pushable_dl_task(rq, p); | ||
1010 | |||
1011 | #ifdef CONFIG_SCHED_HRTICK | ||
1012 | if (hrtick_enabled(rq)) | ||
1013 | start_hrtick_dl(rq, p); | ||
1014 | #endif | ||
1015 | |||
1016 | #ifdef CONFIG_SMP | ||
1017 | rq->post_schedule = has_pushable_dl_tasks(rq); | ||
1018 | #endif /* CONFIG_SMP */ | ||
1019 | |||
1020 | return p; | ||
1021 | } | ||
1022 | |||
1023 | static void put_prev_task_dl(struct rq *rq, struct task_struct *p) | ||
1024 | { | ||
1025 | update_curr_dl(rq); | ||
1026 | |||
1027 | if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1) | ||
1028 | enqueue_pushable_dl_task(rq, p); | ||
1029 | } | ||
1030 | |||
1031 | static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued) | ||
1032 | { | ||
1033 | update_curr_dl(rq); | ||
1034 | |||
1035 | #ifdef CONFIG_SCHED_HRTICK | ||
1036 | if (hrtick_enabled(rq) && queued && p->dl.runtime > 0) | ||
1037 | start_hrtick_dl(rq, p); | ||
1038 | #endif | ||
1039 | } | ||
1040 | |||
1041 | static void task_fork_dl(struct task_struct *p) | ||
1042 | { | ||
1043 | /* | ||
1044 | * SCHED_DEADLINE tasks cannot fork and this is achieved through | ||
1045 | * sched_fork() | ||
1046 | */ | ||
1047 | } | ||
1048 | |||
1049 | static void task_dead_dl(struct task_struct *p) | ||
1050 | { | ||
1051 | struct hrtimer *timer = &p->dl.dl_timer; | ||
1052 | struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); | ||
1053 | |||
1054 | /* | ||
1055 | * Since we are TASK_DEAD we won't slip out of the domain! | ||
1056 | */ | ||
1057 | raw_spin_lock_irq(&dl_b->lock); | ||
1058 | dl_b->total_bw -= p->dl.dl_bw; | ||
1059 | raw_spin_unlock_irq(&dl_b->lock); | ||
1060 | |||
1061 | hrtimer_cancel(timer); | ||
1062 | } | ||
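/*
 * Minimal sketch of the bandwidth bookkeeping referenced above; the
 * struct, the fixed-point format and the 95% limit are assumptions of
 * this example, not taken from the diff. Every -deadline task occupies
 * runtime/period of a CPU, new tasks are admitted only while the sum
 * stays under the limit, and task_dead_dl() gives its share back.
 */
#include <stdbool.h>
#include <stdint.h>

#define BW_UNIT		(1ULL << 20)		/* fixed point: 1.0 CPU */
#define BW_LIMIT	(BW_UNIT * 95 / 100)	/* assumed 95% cap */

struct dl_bw_sketch {
	uint64_t total_bw;			/* sum of admitted shares */
};

static uint64_t sketch_to_ratio(uint64_t period, uint64_t runtime)
{
	return runtime * BW_UNIT / period;	/* runtime/period, scaled */
}

static bool sketch_dl_admit(struct dl_bw_sketch *b,
			    uint64_t runtime, uint64_t period)
{
	uint64_t bw = sketch_to_ratio(period, runtime);

	if (b->total_bw + bw > BW_LIMIT)
		return false;			/* would break guarantees */
	b->total_bw += bw;
	return true;
}

static void sketch_dl_release(struct dl_bw_sketch *b,
			      uint64_t runtime, uint64_t period)
{
	b->total_bw -= sketch_to_ratio(period, runtime);
}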
1063 | |||
1064 | static void set_curr_task_dl(struct rq *rq) | ||
1065 | { | ||
1066 | struct task_struct *p = rq->curr; | ||
1067 | |||
1068 | p->se.exec_start = rq_clock_task(rq); | ||
1069 | |||
1070 | /* You can't push away the running task */ | ||
1071 | dequeue_pushable_dl_task(rq, p); | ||
1072 | } | ||
1073 | |||
1074 | #ifdef CONFIG_SMP | ||
1075 | |||
1076 | /* Only try algorithms three times */ | ||
1077 | #define DL_MAX_TRIES 3 | ||
1078 | |||
1079 | static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) | ||
1080 | { | ||
1081 | if (!task_running(rq, p) && | ||
1082 | (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) && | ||
1083 | (p->nr_cpus_allowed > 1)) | ||
1084 | return 1; | ||
1085 | |||
1086 | return 0; | ||
1087 | } | ||
1088 | |||
1089 | /* Returns the second earliest -deadline task, NULL otherwise */ | ||
1090 | static struct task_struct *pick_next_earliest_dl_task(struct rq *rq, int cpu) | ||
1091 | { | ||
1092 | struct rb_node *next_node = rq->dl.rb_leftmost; | ||
1093 | struct sched_dl_entity *dl_se; | ||
1094 | struct task_struct *p = NULL; | ||
1095 | |||
1096 | next_node: | ||
1097 | next_node = rb_next(next_node); | ||
1098 | if (next_node) { | ||
1099 | dl_se = rb_entry(next_node, struct sched_dl_entity, rb_node); | ||
1100 | p = dl_task_of(dl_se); | ||
1101 | |||
1102 | if (pick_dl_task(rq, p, cpu)) | ||
1103 | return p; | ||
1104 | |||
1105 | goto next_node; | ||
1106 | } | ||
1107 | |||
1108 | return NULL; | ||
1109 | } | ||
1110 | |||
1111 | static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl); | ||
1112 | |||
1113 | static int find_later_rq(struct task_struct *task) | ||
1114 | { | ||
1115 | struct sched_domain *sd; | ||
1116 | struct cpumask *later_mask = __get_cpu_var(local_cpu_mask_dl); | ||
1117 | int this_cpu = smp_processor_id(); | ||
1118 | int best_cpu, cpu = task_cpu(task); | ||
1119 | |||
1120 | /* Make sure the mask is initialized first */ | ||
1121 | if (unlikely(!later_mask)) | ||
1122 | return -1; | ||
1123 | |||
1124 | if (task->nr_cpus_allowed == 1) | ||
1125 | return -1; | ||
1126 | |||
1127 | best_cpu = cpudl_find(&task_rq(task)->rd->cpudl, | ||
1128 | task, later_mask); | ||
1129 | if (best_cpu == -1) | ||
1130 | return -1; | ||
1131 | |||
1132 | /* | ||
1133 | * If we are here, some target has been found, | ||
1134 | * the most suitable of which is cached in best_cpu. | ||
1135 | * This is, among the runqueues where the current tasks | ||
1136 | * have later deadlines than the task's one, the rq | ||
1137 | * with the latest possible one. | ||
1138 | * | ||
1139 | * Now we check how well this matches with task's | ||
1140 | * affinity and system topology. | ||
1141 | * | ||
1142 | * The last cpu where the task ran is our first | ||
1143 | * guess, since it is most likely cache-hot there. | ||
1144 | */ | ||
1145 | if (cpumask_test_cpu(cpu, later_mask)) | ||
1146 | return cpu; | ||
1147 | /* | ||
1148 | * Check if this_cpu is to be skipped (i.e., it is | ||
1149 | * not in the mask) or not. | ||
1150 | */ | ||
1151 | if (!cpumask_test_cpu(this_cpu, later_mask)) | ||
1152 | this_cpu = -1; | ||
1153 | |||
1154 | rcu_read_lock(); | ||
1155 | for_each_domain(cpu, sd) { | ||
1156 | if (sd->flags & SD_WAKE_AFFINE) { | ||
1157 | |||
1158 | /* | ||
1159 | * If possible, preempting this_cpu is | ||
1160 | * cheaper than migrating. | ||
1161 | */ | ||
1162 | if (this_cpu != -1 && | ||
1163 | cpumask_test_cpu(this_cpu, sched_domain_span(sd))) { | ||
1164 | rcu_read_unlock(); | ||
1165 | return this_cpu; | ||
1166 | } | ||
1167 | |||
1168 | /* | ||
1169 | * Last chance: if best_cpu is valid and is | ||
1170 | * in the mask, that becomes our choice. | ||
1171 | */ | ||
1172 | if (best_cpu < nr_cpu_ids && | ||
1173 | cpumask_test_cpu(best_cpu, sched_domain_span(sd))) { | ||
1174 | rcu_read_unlock(); | ||
1175 | return best_cpu; | ||
1176 | } | ||
1177 | } | ||
1178 | } | ||
1179 | rcu_read_unlock(); | ||
1180 | |||
1181 | /* | ||
1182 | * At this point, all our guesses failed, we just return | ||
1183 | * 'something', and let the caller sort the things out. | ||
1184 | */ | ||
1185 | if (this_cpu != -1) | ||
1186 | return this_cpu; | ||
1187 | |||
1188 | cpu = cpumask_any(later_mask); | ||
1189 | if (cpu < nr_cpu_ids) | ||
1190 | return cpu; | ||
1191 | |||
1192 | return -1; | ||
1193 | } | ||
1194 | |||
1195 | /* Locks the rq it finds */ | ||
1196 | static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) | ||
1197 | { | ||
1198 | struct rq *later_rq = NULL; | ||
1199 | int tries; | ||
1200 | int cpu; | ||
1201 | |||
1202 | for (tries = 0; tries < DL_MAX_TRIES; tries++) { | ||
1203 | cpu = find_later_rq(task); | ||
1204 | |||
1205 | if ((cpu == -1) || (cpu == rq->cpu)) | ||
1206 | break; | ||
1207 | |||
1208 | later_rq = cpu_rq(cpu); | ||
1209 | |||
1210 | /* Retry if something changed. */ | ||
1211 | if (double_lock_balance(rq, later_rq)) { | ||
1212 | if (unlikely(task_rq(task) != rq || | ||
1213 | !cpumask_test_cpu(later_rq->cpu, | ||
1214 | &task->cpus_allowed) || | ||
1215 | task_running(rq, task) || !task->on_rq)) { | ||
1216 | double_unlock_balance(rq, later_rq); | ||
1217 | later_rq = NULL; | ||
1218 | break; | ||
1219 | } | ||
1220 | } | ||
1221 | |||
1222 | /* | ||
1223 | * If the rq we found has no -deadline task, or | ||
1224 | * its earliest one has a later deadline than our | ||
1225 | * task, the rq is a good one. | ||
1226 | */ | ||
1227 | if (!later_rq->dl.dl_nr_running || | ||
1228 | dl_time_before(task->dl.deadline, | ||
1229 | later_rq->dl.earliest_dl.curr)) | ||
1230 | break; | ||
1231 | |||
1232 | /* Otherwise we try again. */ | ||
1233 | double_unlock_balance(rq, later_rq); | ||
1234 | later_rq = NULL; | ||
1235 | } | ||
1236 | |||
1237 | return later_rq; | ||
1238 | } | ||
1239 | |||
1240 | static struct task_struct *pick_next_pushable_dl_task(struct rq *rq) | ||
1241 | { | ||
1242 | struct task_struct *p; | ||
1243 | |||
1244 | if (!has_pushable_dl_tasks(rq)) | ||
1245 | return NULL; | ||
1246 | |||
1247 | p = rb_entry(rq->dl.pushable_dl_tasks_leftmost, | ||
1248 | struct task_struct, pushable_dl_tasks); | ||
1249 | |||
1250 | BUG_ON(rq->cpu != task_cpu(p)); | ||
1251 | BUG_ON(task_current(rq, p)); | ||
1252 | BUG_ON(p->nr_cpus_allowed <= 1); | ||
1253 | |||
1254 | BUG_ON(!p->on_rq); | ||
1255 | BUG_ON(!dl_task(p)); | ||
1256 | |||
1257 | return p; | ||
1258 | } | ||
1259 | |||
1260 | /* | ||
1261 | * See if the non-running -deadline tasks on this rq | ||
1262 | * can be sent to some other CPU where they can preempt | ||
1263 | * and start executing. | ||
1264 | */ | ||
1265 | static int push_dl_task(struct rq *rq) | ||
1266 | { | ||
1267 | struct task_struct *next_task; | ||
1268 | struct rq *later_rq; | ||
1269 | |||
1270 | if (!rq->dl.overloaded) | ||
1271 | return 0; | ||
1272 | |||
1273 | next_task = pick_next_pushable_dl_task(rq); | ||
1274 | if (!next_task) | ||
1275 | return 0; | ||
1276 | |||
1277 | retry: | ||
1278 | if (unlikely(next_task == rq->curr)) { | ||
1279 | WARN_ON(1); | ||
1280 | return 0; | ||
1281 | } | ||
1282 | |||
1283 | /* | ||
1284 | * If next_task preempts rq->curr, and rq->curr | ||
1285 | * can move away, it makes sense to just reschedule | ||
1286 | * without going further in pushing next_task. | ||
1287 | */ | ||
1288 | if (dl_task(rq->curr) && | ||
1289 | dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) && | ||
1290 | rq->curr->nr_cpus_allowed > 1) { | ||
1291 | resched_task(rq->curr); | ||
1292 | return 0; | ||
1293 | } | ||
1294 | |||
1295 | /* We might release rq lock */ | ||
1296 | get_task_struct(next_task); | ||
1297 | |||
1298 | /* Will lock the rq it'll find */ | ||
1299 | later_rq = find_lock_later_rq(next_task, rq); | ||
1300 | if (!later_rq) { | ||
1301 | struct task_struct *task; | ||
1302 | |||
1303 | /* | ||
1304 | * We must check all this again, since | ||
1305 | * find_lock_later_rq releases rq->lock and it is | ||
1306 | * then possible that next_task has migrated. | ||
1307 | */ | ||
1308 | task = pick_next_pushable_dl_task(rq); | ||
1309 | if (task_cpu(next_task) == rq->cpu && task == next_task) { | ||
1310 | /* | ||
1311 | * The task is still there. We don't try | ||
1312 | * again; some other cpu will pull it when ready. | ||
1313 | */ | ||
1314 | dequeue_pushable_dl_task(rq, next_task); | ||
1315 | goto out; | ||
1316 | } | ||
1317 | |||
1318 | if (!task) | ||
1319 | /* No more tasks */ | ||
1320 | goto out; | ||
1321 | |||
1322 | put_task_struct(next_task); | ||
1323 | next_task = task; | ||
1324 | goto retry; | ||
1325 | } | ||
1326 | |||
1327 | deactivate_task(rq, next_task, 0); | ||
1328 | set_task_cpu(next_task, later_rq->cpu); | ||
1329 | activate_task(later_rq, next_task, 0); | ||
1330 | |||
1331 | resched_task(later_rq->curr); | ||
1332 | |||
1333 | double_unlock_balance(rq, later_rq); | ||
1334 | |||
1335 | out: | ||
1336 | put_task_struct(next_task); | ||
1337 | |||
1338 | return 1; | ||
1339 | } | ||
1340 | |||
1341 | static void push_dl_tasks(struct rq *rq) | ||
1342 | { | ||
1343 | /* Terminates as it moves a -deadline task */ | ||
1344 | while (push_dl_task(rq)) | ||
1345 | ; | ||
1346 | } | ||
1347 | |||
1348 | static int pull_dl_task(struct rq *this_rq) | ||
1349 | { | ||
1350 | int this_cpu = this_rq->cpu, ret = 0, cpu; | ||
1351 | struct task_struct *p; | ||
1352 | struct rq *src_rq; | ||
1353 | u64 dmin = LONG_MAX; | ||
1354 | |||
1355 | if (likely(!dl_overloaded(this_rq))) | ||
1356 | return 0; | ||
1357 | |||
1358 | /* | ||
1359 | * Match the barrier from dl_set_overloaded; this guarantees that if we | ||
1360 | * see overloaded we must also see the dlo_mask bit. | ||
1361 | */ | ||
1362 | smp_rmb(); | ||
1363 | |||
1364 | for_each_cpu(cpu, this_rq->rd->dlo_mask) { | ||
1365 | if (this_cpu == cpu) | ||
1366 | continue; | ||
1367 | |||
1368 | src_rq = cpu_rq(cpu); | ||
1369 | |||
1370 | /* | ||
1371 | * It looks racy, and it is! However, as in sched_rt.c, | ||
1372 | * we are fine with this. | ||
1373 | */ | ||
1374 | if (this_rq->dl.dl_nr_running && | ||
1375 | dl_time_before(this_rq->dl.earliest_dl.curr, | ||
1376 | src_rq->dl.earliest_dl.next)) | ||
1377 | continue; | ||
1378 | |||
1379 | /* Might drop this_rq->lock */ | ||
1380 | double_lock_balance(this_rq, src_rq); | ||
1381 | |||
1382 | /* | ||
1383 | * If there are no more pullable tasks on the | ||
1384 | * rq, we're done with it. | ||
1385 | */ | ||
1386 | if (src_rq->dl.dl_nr_running <= 1) | ||
1387 | goto skip; | ||
1388 | |||
1389 | p = pick_next_earliest_dl_task(src_rq, this_cpu); | ||
1390 | |||
1391 | /* | ||
1392 | * We found a task to be pulled if: | ||
1393 | * - it preempts our current (if there's one), | ||
1394 | * - it will preempt the last one we pulled (if any). | ||
1395 | */ | ||
1396 | if (p && dl_time_before(p->dl.deadline, dmin) && | ||
1397 | (!this_rq->dl.dl_nr_running || | ||
1398 | dl_time_before(p->dl.deadline, | ||
1399 | this_rq->dl.earliest_dl.curr))) { | ||
1400 | WARN_ON(p == src_rq->curr); | ||
1401 | WARN_ON(!p->on_rq); | ||
1402 | |||
1403 | /* | ||
1404 | * Then we pull iff p has actually an earlier | ||
1405 | * deadline than the current task of its runqueue. | ||
1406 | */ | ||
1407 | if (dl_time_before(p->dl.deadline, | ||
1408 | src_rq->curr->dl.deadline)) | ||
1409 | goto skip; | ||
1410 | |||
1411 | ret = 1; | ||
1412 | |||
1413 | deactivate_task(src_rq, p, 0); | ||
1414 | set_task_cpu(p, this_cpu); | ||
1415 | activate_task(this_rq, p, 0); | ||
1416 | dmin = p->dl.deadline; | ||
1417 | |||
1418 | /* Is there any other task even earlier? */ | ||
1419 | } | ||
1420 | skip: | ||
1421 | double_unlock_balance(this_rq, src_rq); | ||
1422 | } | ||
1423 | |||
1424 | return ret; | ||
1425 | } | ||
1426 | |||
1427 | static void pre_schedule_dl(struct rq *rq, struct task_struct *prev) | ||
1428 | { | ||
1429 | /* Try to pull other tasks here */ | ||
1430 | if (dl_task(prev)) | ||
1431 | pull_dl_task(rq); | ||
1432 | } | ||
1433 | |||
1434 | static void post_schedule_dl(struct rq *rq) | ||
1435 | { | ||
1436 | push_dl_tasks(rq); | ||
1437 | } | ||
1438 | |||
1439 | /* | ||
1440 | * Since the task is not running and a reschedule is not going to happen | ||
1441 | * anytime soon on its runqueue, we try pushing it away now. | ||
1442 | */ | ||
1443 | static void task_woken_dl(struct rq *rq, struct task_struct *p) | ||
1444 | { | ||
1445 | if (!task_running(rq, p) && | ||
1446 | !test_tsk_need_resched(rq->curr) && | ||
1447 | has_pushable_dl_tasks(rq) && | ||
1448 | p->nr_cpus_allowed > 1 && | ||
1449 | dl_task(rq->curr) && | ||
1450 | (rq->curr->nr_cpus_allowed < 2 || | ||
1451 | dl_entity_preempt(&rq->curr->dl, &p->dl))) { | ||
1452 | push_dl_tasks(rq); | ||
1453 | } | ||
1454 | } | ||
1455 | |||
1456 | static void set_cpus_allowed_dl(struct task_struct *p, | ||
1457 | const struct cpumask *new_mask) | ||
1458 | { | ||
1459 | struct rq *rq; | ||
1460 | int weight; | ||
1461 | |||
1462 | BUG_ON(!dl_task(p)); | ||
1463 | |||
1464 | /* | ||
1465 | * Update only if the task is actually running (i.e., | ||
1466 | * it is on the rq AND it is not throttled). | ||
1467 | */ | ||
1468 | if (!on_dl_rq(&p->dl)) | ||
1469 | return; | ||
1470 | |||
1471 | weight = cpumask_weight(new_mask); | ||
1472 | |||
1473 | /* | ||
1474 | * Only update if the process changes state with respect to whether | ||
1475 | * it can migrate or not. | ||
1476 | */ | ||
1477 | if ((p->nr_cpus_allowed > 1) == (weight > 1)) | ||
1478 | return; | ||
1479 | |||
1480 | rq = task_rq(p); | ||
1481 | |||
1482 | /* | ||
1483 | * The process used to be able to migrate OR it can now migrate | ||
1484 | */ | ||
1485 | if (weight <= 1) { | ||
1486 | if (!task_current(rq, p)) | ||
1487 | dequeue_pushable_dl_task(rq, p); | ||
1488 | BUG_ON(!rq->dl.dl_nr_migratory); | ||
1489 | rq->dl.dl_nr_migratory--; | ||
1490 | } else { | ||
1491 | if (!task_current(rq, p)) | ||
1492 | enqueue_pushable_dl_task(rq, p); | ||
1493 | rq->dl.dl_nr_migratory++; | ||
1494 | } | ||
1495 | |||
1496 | update_dl_migration(&rq->dl); | ||
1497 | } | ||
1498 | |||
1499 | /* Assumes rq->lock is held */ | ||
1500 | static void rq_online_dl(struct rq *rq) | ||
1501 | { | ||
1502 | if (rq->dl.overloaded) | ||
1503 | dl_set_overload(rq); | ||
1504 | |||
1505 | if (rq->dl.dl_nr_running > 0) | ||
1506 | cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1); | ||
1507 | } | ||
1508 | |||
1509 | /* Assumes rq->lock is held */ | ||
1510 | static void rq_offline_dl(struct rq *rq) | ||
1511 | { | ||
1512 | if (rq->dl.overloaded) | ||
1513 | dl_clear_overload(rq); | ||
1514 | |||
1515 | cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0); | ||
1516 | } | ||
1517 | |||
1518 | void init_sched_dl_class(void) | ||
1519 | { | ||
1520 | unsigned int i; | ||
1521 | |||
1522 | for_each_possible_cpu(i) | ||
1523 | zalloc_cpumask_var_node(&per_cpu(local_cpu_mask_dl, i), | ||
1524 | GFP_KERNEL, cpu_to_node(i)); | ||
1525 | } | ||
1526 | |||
1527 | #endif /* CONFIG_SMP */ | ||
1528 | |||
1529 | static void switched_from_dl(struct rq *rq, struct task_struct *p) | ||
1530 | { | ||
1531 | if (hrtimer_active(&p->dl.dl_timer) && !dl_policy(p->policy)) | ||
1532 | hrtimer_try_to_cancel(&p->dl.dl_timer); | ||
1533 | |||
1534 | #ifdef CONFIG_SMP | ||
1535 | /* | ||
1536 | * Since this might be the only -deadline task on the rq, | ||
1537 | * this is the right place to try to pull some other one | ||
1538 | * from an overloaded cpu, if any. | ||
1539 | */ | ||
1540 | if (!rq->dl.dl_nr_running) | ||
1541 | pull_dl_task(rq); | ||
1542 | #endif | ||
1543 | } | ||
1544 | |||
1545 | /* | ||
1546 | * When switching to -deadline, we may overload the rq, then | ||
1547 | * we try to push someone off, if possible. | ||
1548 | */ | ||
1549 | static void switched_to_dl(struct rq *rq, struct task_struct *p) | ||
1550 | { | ||
1551 | int check_resched = 1; | ||
1552 | |||
1553 | /* | ||
1554 | * If p is throttled, don't consider the possibility | ||
1555 | * of preempting rq->curr; the check will be done right | ||
1556 | * after its runtime gets replenished. | ||
1557 | */ | ||
1558 | if (unlikely(p->dl.dl_throttled)) | ||
1559 | return; | ||
1560 | |||
1561 | if (p->on_rq || rq->curr != p) { | ||
1562 | #ifdef CONFIG_SMP | ||
1563 | if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p)) | ||
1564 | /* Only reschedule if pushing failed */ | ||
1565 | check_resched = 0; | ||
1566 | #endif /* CONFIG_SMP */ | ||
1567 | if (check_resched && task_has_dl_policy(rq->curr)) | ||
1568 | check_preempt_curr_dl(rq, p, 0); | ||
1569 | } | ||
1570 | } | ||
1571 | |||
1572 | /* | ||
1573 | * If the scheduling parameters of a -deadline task changed, | ||
1574 | * a push or pull operation might be needed. | ||
1575 | */ | ||
1576 | static void prio_changed_dl(struct rq *rq, struct task_struct *p, | ||
1577 | int oldprio) | ||
1578 | { | ||
1579 | if (p->on_rq || rq->curr == p) { | ||
1580 | #ifdef CONFIG_SMP | ||
1581 | /* | ||
1582 | * This might be too much, but unfortunately | ||
1583 | * we don't have the old deadline value, and | ||
1584 | * we can't tell whether the task is raising | ||
1585 | * or lowering its prio, so... | ||
1586 | */ | ||
1587 | if (!rq->dl.overloaded) | ||
1588 | pull_dl_task(rq); | ||
1589 | |||
1590 | /* | ||
1591 | * If we now have an earlier deadline task than p, | ||
1592 | * then reschedule, provided p is still on this | ||
1593 | * runqueue. | ||
1594 | */ | ||
1595 | if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline) && | ||
1596 | rq->curr == p) | ||
1597 | resched_task(p); | ||
1598 | #else | ||
1599 | /* | ||
1600 | * Again, we don't know if p has an earlier | ||
1601 | * or later deadline, so let's blindly set a | ||
1602 | * (maybe not needed) rescheduling point. | ||
1603 | */ | ||
1604 | resched_task(p); | ||
1605 | #endif /* CONFIG_SMP */ | ||
1606 | } else | ||
1607 | switched_to_dl(rq, p); | ||
1608 | } | ||
1609 | |||
1610 | const struct sched_class dl_sched_class = { | ||
1611 | .next = &rt_sched_class, | ||
1612 | .enqueue_task = enqueue_task_dl, | ||
1613 | .dequeue_task = dequeue_task_dl, | ||
1614 | .yield_task = yield_task_dl, | ||
1615 | |||
1616 | .check_preempt_curr = check_preempt_curr_dl, | ||
1617 | |||
1618 | .pick_next_task = pick_next_task_dl, | ||
1619 | .put_prev_task = put_prev_task_dl, | ||
1620 | |||
1621 | #ifdef CONFIG_SMP | ||
1622 | .select_task_rq = select_task_rq_dl, | ||
1623 | .set_cpus_allowed = set_cpus_allowed_dl, | ||
1624 | .rq_online = rq_online_dl, | ||
1625 | .rq_offline = rq_offline_dl, | ||
1626 | .pre_schedule = pre_schedule_dl, | ||
1627 | .post_schedule = post_schedule_dl, | ||
1628 | .task_woken = task_woken_dl, | ||
1629 | #endif | ||
1630 | |||
1631 | .set_curr_task = set_curr_task_dl, | ||
1632 | .task_tick = task_tick_dl, | ||
1633 | .task_fork = task_fork_dl, | ||
1634 | .task_dead = task_dead_dl, | ||
1635 | |||
1636 | .prio_changed = prio_changed_dl, | ||
1637 | .switched_from = switched_from_dl, | ||
1638 | .switched_to = switched_to_dl, | ||
1639 | }; | ||
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 5c34d1817e8f..dd52e7ffb10e 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c | |||
@@ -139,7 +139,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) | |||
139 | 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); | 139 | 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); |
140 | #endif | 140 | #endif |
141 | #ifdef CONFIG_NUMA_BALANCING | 141 | #ifdef CONFIG_NUMA_BALANCING |
142 | SEQ_printf(m, " %d", cpu_to_node(task_cpu(p))); | 142 | SEQ_printf(m, " %d", task_node(p)); |
143 | #endif | 143 | #endif |
144 | #ifdef CONFIG_CGROUP_SCHED | 144 | #ifdef CONFIG_CGROUP_SCHED |
145 | SEQ_printf(m, " %s", task_group_path(task_group(p))); | 145 | SEQ_printf(m, " %s", task_group_path(task_group(p))); |
@@ -371,7 +371,7 @@ static void sched_debug_header(struct seq_file *m) | |||
371 | PN(cpu_clk); | 371 | PN(cpu_clk); |
372 | P(jiffies); | 372 | P(jiffies); |
373 | #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK | 373 | #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK |
374 | P(sched_clock_stable); | 374 | P(sched_clock_stable()); |
375 | #endif | 375 | #endif |
376 | #undef PN | 376 | #undef PN |
377 | #undef P | 377 | #undef P |
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c7395d97e4cb..9b4c4f320130 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
@@ -872,15 +872,6 @@ static unsigned int task_scan_max(struct task_struct *p) | |||
872 | return max(smin, smax); | 872 | return max(smin, smax); |
873 | } | 873 | } |
874 | 874 | ||
875 | /* | ||
876 | * Once a preferred node is selected the scheduler balancer will prefer moving | ||
877 | * a task to that node for sysctl_numa_balancing_settle_count number of PTE | ||
878 | * scans. This will give the process the chance to accumulate more faults on | ||
879 | * the preferred node but still allow the scheduler to move the task again if | ||
880 | * the nodes CPUs are overloaded. | ||
881 | */ | ||
882 | unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4; | ||
883 | |||
884 | static void account_numa_enqueue(struct rq *rq, struct task_struct *p) | 875 | static void account_numa_enqueue(struct rq *rq, struct task_struct *p) |
885 | { | 876 | { |
886 | rq->nr_numa_running += (p->numa_preferred_nid != -1); | 877 | rq->nr_numa_running += (p->numa_preferred_nid != -1); |
@@ -930,7 +921,8 @@ static inline unsigned long group_faults(struct task_struct *p, int nid) | |||
930 | if (!p->numa_group) | 921 | if (!p->numa_group) |
931 | return 0; | 922 | return 0; |
932 | 923 | ||
933 | return p->numa_group->faults[2*nid] + p->numa_group->faults[2*nid+1]; | 924 | return p->numa_group->faults[task_faults_idx(nid, 0)] + |
925 | p->numa_group->faults[task_faults_idx(nid, 1)]; | ||
934 | } | 926 | } |
935 | 927 | ||
936 | /* | 928 | /* |
@@ -1023,7 +1015,7 @@ struct task_numa_env { | |||
1023 | 1015 | ||
1024 | struct numa_stats src_stats, dst_stats; | 1016 | struct numa_stats src_stats, dst_stats; |
1025 | 1017 | ||
1026 | int imbalance_pct, idx; | 1018 | int imbalance_pct; |
1027 | 1019 | ||
1028 | struct task_struct *best_task; | 1020 | struct task_struct *best_task; |
1029 | long best_imp; | 1021 | long best_imp; |
@@ -1211,7 +1203,7 @@ static int task_numa_migrate(struct task_struct *p) | |||
1211 | * elsewhere, so there is no point in (re)trying. | 1203 | * elsewhere, so there is no point in (re)trying. |
1212 | */ | 1204 | */ |
1213 | if (unlikely(!sd)) { | 1205 | if (unlikely(!sd)) { |
1214 | p->numa_preferred_nid = cpu_to_node(task_cpu(p)); | 1206 | p->numa_preferred_nid = task_node(p); |
1215 | return -EINVAL; | 1207 | return -EINVAL; |
1216 | } | 1208 | } |
1217 | 1209 | ||
@@ -1258,11 +1250,15 @@ static int task_numa_migrate(struct task_struct *p) | |||
1258 | p->numa_scan_period = task_scan_min(p); | 1250 | p->numa_scan_period = task_scan_min(p); |
1259 | 1251 | ||
1260 | if (env.best_task == NULL) { | 1252 | if (env.best_task == NULL) { |
1261 | int ret = migrate_task_to(p, env.best_cpu); | 1253 | ret = migrate_task_to(p, env.best_cpu); |
1254 | if (ret != 0) | ||
1255 | trace_sched_stick_numa(p, env.src_cpu, env.best_cpu); | ||
1262 | return ret; | 1256 | return ret; |
1263 | } | 1257 | } |
1264 | 1258 | ||
1265 | ret = migrate_swap(p, env.best_task); | 1259 | ret = migrate_swap(p, env.best_task); |
1260 | if (ret != 0) | ||
1261 | trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task)); | ||
1266 | put_task_struct(env.best_task); | 1262 | put_task_struct(env.best_task); |
1267 | return ret; | 1263 | return ret; |
1268 | } | 1264 | } |
@@ -1278,7 +1274,7 @@ static void numa_migrate_preferred(struct task_struct *p) | |||
1278 | p->numa_migrate_retry = jiffies + HZ; | 1274 | p->numa_migrate_retry = jiffies + HZ; |
1279 | 1275 | ||
1280 | /* Success if task is already running on preferred CPU */ | 1276 | /* Success if task is already running on preferred CPU */ |
1281 | if (cpu_to_node(task_cpu(p)) == p->numa_preferred_nid) | 1277 | if (task_node(p) == p->numa_preferred_nid) |
1282 | return; | 1278 | return; |
1283 | 1279 | ||
1284 | /* Otherwise, try migrate to a CPU on the preferred node */ | 1280 | /* Otherwise, try migrate to a CPU on the preferred node */ |
@@ -1350,7 +1346,6 @@ static void update_task_scan_period(struct task_struct *p, | |||
1350 | * scanning faster if shared accesses dominate as it may | 1346 | * scanning faster if shared accesses dominate as it may |
1351 | * simply bounce migrations uselessly | 1347 | * simply bounce migrations uselessly |
1352 | */ | 1348 | */ |
1353 | period_slot = DIV_ROUND_UP(diff, NUMA_PERIOD_SLOTS); | ||
1354 | ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared)); | 1349 | ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared)); |
1355 | diff = (diff * ratio) / NUMA_PERIOD_SLOTS; | 1350 | diff = (diff * ratio) / NUMA_PERIOD_SLOTS; |
1356 | } | 1351 | } |
@@ -1762,6 +1757,8 @@ void task_numa_work(struct callback_head *work) | |||
1762 | start = end; | 1757 | start = end; |
1763 | if (pages <= 0) | 1758 | if (pages <= 0) |
1764 | goto out; | 1759 | goto out; |
1760 | |||
1761 | cond_resched(); | ||
1765 | } while (end != vma->vm_end); | 1762 | } while (end != vma->vm_end); |
1766 | } | 1763 | } |
1767 | 1764 | ||
@@ -2365,13 +2362,7 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, | |||
2365 | } | 2362 | } |
2366 | wakeup = 0; | 2363 | wakeup = 0; |
2367 | } else { | 2364 | } else { |
2368 | /* | 2365 | __synchronize_entity_decay(se); |
2369 | * Task re-woke on same cpu (or else migrate_task_rq_fair() | ||
2370 | * would have made count negative); we must be careful to avoid | ||
2371 | * double-accounting blocked time after synchronizing decays. | ||
2372 | */ | ||
2373 | se->avg.last_runnable_update += __synchronize_entity_decay(se) | ||
2374 | << 20; | ||
2375 | } | 2366 | } |
2376 | 2367 | ||
2377 | /* migrated tasks did not contribute to our blocked load */ | 2368 | /* migrated tasks did not contribute to our blocked load */ |
@@ -3923,7 +3914,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) | |||
3923 | { | 3914 | { |
3924 | struct sched_entity *se = tg->se[cpu]; | 3915 | struct sched_entity *se = tg->se[cpu]; |
3925 | 3916 | ||
3926 | if (!tg->parent || !wl) /* the trivial, non-cgroup case */ | 3917 | if (!tg->parent) /* the trivial, non-cgroup case */ |
3927 | return wl; | 3918 | return wl; |
3928 | 3919 | ||
3929 | for_each_sched_entity(se) { | 3920 | for_each_sched_entity(se) { |
@@ -4101,12 +4092,16 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) | |||
4101 | */ | 4092 | */ |
4102 | static struct sched_group * | 4093 | static struct sched_group * |
4103 | find_idlest_group(struct sched_domain *sd, struct task_struct *p, | 4094 | find_idlest_group(struct sched_domain *sd, struct task_struct *p, |
4104 | int this_cpu, int load_idx) | 4095 | int this_cpu, int sd_flag) |
4105 | { | 4096 | { |
4106 | struct sched_group *idlest = NULL, *group = sd->groups; | 4097 | struct sched_group *idlest = NULL, *group = sd->groups; |
4107 | unsigned long min_load = ULONG_MAX, this_load = 0; | 4098 | unsigned long min_load = ULONG_MAX, this_load = 0; |
4099 | int load_idx = sd->forkexec_idx; | ||
4108 | int imbalance = 100 + (sd->imbalance_pct-100)/2; | 4100 | int imbalance = 100 + (sd->imbalance_pct-100)/2; |
4109 | 4101 | ||
4102 | if (sd_flag & SD_BALANCE_WAKE) | ||
4103 | load_idx = sd->wake_idx; | ||
4104 | |||
4110 | do { | 4105 | do { |
4111 | unsigned long load, avg_load; | 4106 | unsigned long load, avg_load; |
4112 | int local_group; | 4107 | int local_group; |
@@ -4274,7 +4269,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f | |||
4274 | } | 4269 | } |
4275 | 4270 | ||
4276 | while (sd) { | 4271 | while (sd) { |
4277 | int load_idx = sd->forkexec_idx; | ||
4278 | struct sched_group *group; | 4272 | struct sched_group *group; |
4279 | int weight; | 4273 | int weight; |
4280 | 4274 | ||
@@ -4283,10 +4277,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f | |||
4283 | continue; | 4277 | continue; |
4284 | } | 4278 | } |
4285 | 4279 | ||
4286 | if (sd_flag & SD_BALANCE_WAKE) | 4280 | group = find_idlest_group(sd, p, cpu, sd_flag); |
4287 | load_idx = sd->wake_idx; | ||
4288 | |||
4289 | group = find_idlest_group(sd, p, cpu, load_idx); | ||
4290 | if (!group) { | 4281 | if (!group) { |
4291 | sd = sd->child; | 4282 | sd = sd->child; |
4292 | continue; | 4283 | continue; |
@@ -5512,7 +5503,6 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
5512 | struct sched_group *group, int load_idx, | 5503 | struct sched_group *group, int load_idx, |
5513 | int local_group, struct sg_lb_stats *sgs) | 5504 | int local_group, struct sg_lb_stats *sgs) |
5514 | { | 5505 | { |
5515 | unsigned long nr_running; | ||
5516 | unsigned long load; | 5506 | unsigned long load; |
5517 | int i; | 5507 | int i; |
5518 | 5508 | ||
@@ -5521,8 +5511,6 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
5521 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { | 5511 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { |
5522 | struct rq *rq = cpu_rq(i); | 5512 | struct rq *rq = cpu_rq(i); |
5523 | 5513 | ||
5524 | nr_running = rq->nr_running; | ||
5525 | |||
5526 | /* Bias balancing toward cpus of our domain */ | 5514 | /* Bias balancing toward cpus of our domain */ |
5527 | if (local_group) | 5515 | if (local_group) |
5528 | load = target_load(i, load_idx); | 5516 | load = target_load(i, load_idx); |
@@ -5530,7 +5518,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
5530 | load = source_load(i, load_idx); | 5518 | load = source_load(i, load_idx); |
5531 | 5519 | ||
5532 | sgs->group_load += load; | 5520 | sgs->group_load += load; |
5533 | sgs->sum_nr_running += nr_running; | 5521 | sgs->sum_nr_running += rq->nr_running; |
5534 | #ifdef CONFIG_NUMA_BALANCING | 5522 | #ifdef CONFIG_NUMA_BALANCING |
5535 | sgs->nr_numa_running += rq->nr_numa_running; | 5523 | sgs->nr_numa_running += rq->nr_numa_running; |
5536 | sgs->nr_preferred_running += rq->nr_preferred_running; | 5524 | sgs->nr_preferred_running += rq->nr_preferred_running; |
@@ -6521,7 +6509,7 @@ static struct { | |||
6521 | unsigned long next_balance; /* in jiffy units */ | 6509 | unsigned long next_balance; /* in jiffy units */ |
6522 | } nohz ____cacheline_aligned; | 6510 | } nohz ____cacheline_aligned; |
6523 | 6511 | ||
6524 | static inline int find_new_ilb(int call_cpu) | 6512 | static inline int find_new_ilb(void) |
6525 | { | 6513 | { |
6526 | int ilb = cpumask_first(nohz.idle_cpus_mask); | 6514 | int ilb = cpumask_first(nohz.idle_cpus_mask); |
6527 | 6515 | ||
@@ -6536,13 +6524,13 @@ static inline int find_new_ilb(int call_cpu) | |||
6536 | * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle | 6524 | * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle |
6537 | * CPU (if there is one). | 6525 | * CPU (if there is one). |
6538 | */ | 6526 | */ |
6539 | static void nohz_balancer_kick(int cpu) | 6527 | static void nohz_balancer_kick(void) |
6540 | { | 6528 | { |
6541 | int ilb_cpu; | 6529 | int ilb_cpu; |
6542 | 6530 | ||
6543 | nohz.next_balance++; | 6531 | nohz.next_balance++; |
6544 | 6532 | ||
6545 | ilb_cpu = find_new_ilb(cpu); | 6533 | ilb_cpu = find_new_ilb(); |
6546 | 6534 | ||
6547 | if (ilb_cpu >= nr_cpu_ids) | 6535 | if (ilb_cpu >= nr_cpu_ids) |
6548 | return; | 6536 | return; |
@@ -6652,10 +6640,10 @@ void update_max_interval(void) | |||
6652 | * | 6640 | * |
6653 | * Balancing parameters are set up in init_sched_domains. | 6641 | * Balancing parameters are set up in init_sched_domains. |
6654 | */ | 6642 | */ |
6655 | static void rebalance_domains(int cpu, enum cpu_idle_type idle) | 6643 | static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) |
6656 | { | 6644 | { |
6657 | int continue_balancing = 1; | 6645 | int continue_balancing = 1; |
6658 | struct rq *rq = cpu_rq(cpu); | 6646 | int cpu = rq->cpu; |
6659 | unsigned long interval; | 6647 | unsigned long interval; |
6660 | struct sched_domain *sd; | 6648 | struct sched_domain *sd; |
6661 | /* Earliest time when we have to do rebalance again */ | 6649 | /* Earliest time when we have to do rebalance again */ |
@@ -6752,9 +6740,9 @@ out: | |||
6752 | * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the | 6740 | * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the |
6753 | * rebalancing for all the cpus for whom scheduler ticks are stopped. | 6741 | * rebalancing for all the cpus for whom scheduler ticks are stopped. |
6754 | */ | 6742 | */ |
6755 | static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) | 6743 | static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) |
6756 | { | 6744 | { |
6757 | struct rq *this_rq = cpu_rq(this_cpu); | 6745 | int this_cpu = this_rq->cpu; |
6758 | struct rq *rq; | 6746 | struct rq *rq; |
6759 | int balance_cpu; | 6747 | int balance_cpu; |
6760 | 6748 | ||
@@ -6781,7 +6769,7 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) | |||
6781 | update_idle_cpu_load(rq); | 6769 | update_idle_cpu_load(rq); |
6782 | raw_spin_unlock_irq(&rq->lock); | 6770 | raw_spin_unlock_irq(&rq->lock); |
6783 | 6771 | ||
6784 | rebalance_domains(balance_cpu, CPU_IDLE); | 6772 | rebalance_domains(rq, CPU_IDLE); |
6785 | 6773 | ||
6786 | if (time_after(this_rq->next_balance, rq->next_balance)) | 6774 | if (time_after(this_rq->next_balance, rq->next_balance)) |
6787 | this_rq->next_balance = rq->next_balance; | 6775 | this_rq->next_balance = rq->next_balance; |
@@ -6800,14 +6788,14 @@ end: | |||
6800 | * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler | 6788 | * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler |
6801 | * domain span are idle. | 6789 | * domain span are idle. |
6802 | */ | 6790 | */ |
6803 | static inline int nohz_kick_needed(struct rq *rq, int cpu) | 6791 | static inline int nohz_kick_needed(struct rq *rq) |
6804 | { | 6792 | { |
6805 | unsigned long now = jiffies; | 6793 | unsigned long now = jiffies; |
6806 | struct sched_domain *sd; | 6794 | struct sched_domain *sd; |
6807 | struct sched_group_power *sgp; | 6795 | struct sched_group_power *sgp; |
6808 | int nr_busy; | 6796 | int nr_busy, cpu = rq->cpu; |
6809 | 6797 | ||
6810 | if (unlikely(idle_cpu(cpu))) | 6798 | if (unlikely(rq->idle_balance)) |
6811 | return 0; | 6799 | return 0; |
6812 | 6800 | ||
6813 | /* | 6801 | /* |
@@ -6856,7 +6844,7 @@ need_kick: | |||
6856 | return 1; | 6844 | return 1; |
6857 | } | 6845 | } |
6858 | #else | 6846 | #else |
6859 | static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } | 6847 | static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { } |
6860 | #endif | 6848 | #endif |
6861 | 6849 | ||
6862 | /* | 6850 | /* |
@@ -6865,38 +6853,39 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } | |||
6865 | */ | 6853 | */ |
6866 | static void run_rebalance_domains(struct softirq_action *h) | 6854 | static void run_rebalance_domains(struct softirq_action *h) |
6867 | { | 6855 | { |
6868 | int this_cpu = smp_processor_id(); | 6856 | struct rq *this_rq = this_rq(); |
6869 | struct rq *this_rq = cpu_rq(this_cpu); | ||
6870 | enum cpu_idle_type idle = this_rq->idle_balance ? | 6857 | enum cpu_idle_type idle = this_rq->idle_balance ? |
6871 | CPU_IDLE : CPU_NOT_IDLE; | 6858 | CPU_IDLE : CPU_NOT_IDLE; |
6872 | 6859 | ||
6873 | rebalance_domains(this_cpu, idle); | 6860 | rebalance_domains(this_rq, idle); |
6874 | 6861 | ||
6875 | /* | 6862 | /* |
6876 | * If this cpu has a pending nohz_balance_kick, then do the | 6863 | * If this cpu has a pending nohz_balance_kick, then do the |
6877 | * balancing on behalf of the other idle cpus whose ticks are | 6864 | * balancing on behalf of the other idle cpus whose ticks are |
6878 | * stopped. | 6865 | * stopped. |
6879 | */ | 6866 | */ |
6880 | nohz_idle_balance(this_cpu, idle); | 6867 | nohz_idle_balance(this_rq, idle); |
6881 | } | 6868 | } |
6882 | 6869 | ||
6883 | static inline int on_null_domain(int cpu) | 6870 | static inline int on_null_domain(struct rq *rq) |
6884 | { | 6871 | { |
6885 | return !rcu_dereference_sched(cpu_rq(cpu)->sd); | 6872 | return !rcu_dereference_sched(rq->sd); |
6886 | } | 6873 | } |
6887 | 6874 | ||
6888 | /* | 6875 | /* |
6889 | * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. | 6876 | * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. |
6890 | */ | 6877 | */ |
6891 | void trigger_load_balance(struct rq *rq, int cpu) | 6878 | void trigger_load_balance(struct rq *rq) |
6892 | { | 6879 | { |
6893 | /* Don't need to rebalance while attached to NULL domain */ | 6880 | /* Don't need to rebalance while attached to NULL domain */ |
6894 | if (time_after_eq(jiffies, rq->next_balance) && | 6881 | if (unlikely(on_null_domain(rq))) |
6895 | likely(!on_null_domain(cpu))) | 6882 | return; |
6883 | |||
6884 | if (time_after_eq(jiffies, rq->next_balance)) | ||
6896 | raise_softirq(SCHED_SOFTIRQ); | 6885 | raise_softirq(SCHED_SOFTIRQ); |
6897 | #ifdef CONFIG_NO_HZ_COMMON | 6886 | #ifdef CONFIG_NO_HZ_COMMON |
6898 | if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) | 6887 | if (nohz_kick_needed(rq)) |
6899 | nohz_balancer_kick(cpu); | 6888 | nohz_balancer_kick(); |
6900 | #endif | 6889 | #endif |
6901 | } | 6890 | } |
6902 | 6891 | ||
@@ -7012,15 +7001,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) | |||
7012 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 7001 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
7013 | 7002 | ||
7014 | /* | 7003 | /* |
7015 | * Ensure the task's vruntime is normalized, so that when its | 7004 | * Ensure the task's vruntime is normalized, so that when it's |
7016 | * switched back to the fair class the enqueue_entity(.flags=0) will | 7005 | * switched back to the fair class the enqueue_entity(.flags=0) will |
7017 | * do the right thing. | 7006 | * do the right thing. |
7018 | * | 7007 | * |
7019 | * If it was on_rq, then the dequeue_entity(.flags=0) will already | 7008 | * If it's on_rq, then the dequeue_entity(.flags=0) will already |
7020 | * have normalized the vruntime, if it was !on_rq, then only when | 7009 | * have normalized the vruntime, if it's !on_rq, then only when |
7021 | * the task is sleeping will it still have non-normalized vruntime. | 7010 | * the task is sleeping will it still have non-normalized vruntime. |
7022 | */ | 7011 | */ |
7023 | if (!se->on_rq && p->state != TASK_RUNNING) { | 7012 | if (!p->on_rq && p->state != TASK_RUNNING) { |
7024 | /* | 7013 | /* |
7025 | * Fix up our vruntime so that the current sleep doesn't | 7014 | * Fix up our vruntime so that the current sleep doesn't |
7026 | * cause 'unlimited' sleep bonus. | 7015 | * cause 'unlimited' sleep bonus. |
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 1c4065575fa2..1999021042c7 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
@@ -538,6 +538,14 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) | |||
538 | 538 | ||
539 | #endif /* CONFIG_RT_GROUP_SCHED */ | 539 | #endif /* CONFIG_RT_GROUP_SCHED */ |
540 | 540 | ||
541 | bool sched_rt_bandwidth_account(struct rt_rq *rt_rq) | ||
542 | { | ||
543 | struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); | ||
544 | |||
545 | return (hrtimer_active(&rt_b->rt_period_timer) || | ||
546 | rt_rq->rt_time < rt_b->rt_runtime); | ||
547 | } | ||
548 | |||
541 | #ifdef CONFIG_SMP | 549 | #ifdef CONFIG_SMP |
542 | /* | 550 | /* |
543 | * We ran out of runtime, see if we can borrow some from our neighbours. | 551 | * We ran out of runtime, see if we can borrow some from our neighbours. |
@@ -1738,7 +1746,7 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) | |||
1738 | !test_tsk_need_resched(rq->curr) && | 1746 | !test_tsk_need_resched(rq->curr) && |
1739 | has_pushable_tasks(rq) && | 1747 | has_pushable_tasks(rq) && |
1740 | p->nr_cpus_allowed > 1 && | 1748 | p->nr_cpus_allowed > 1 && |
1741 | rt_task(rq->curr) && | 1749 | (dl_task(rq->curr) || rt_task(rq->curr)) && |
1742 | (rq->curr->nr_cpus_allowed < 2 || | 1750 | (rq->curr->nr_cpus_allowed < 2 || |
1743 | rq->curr->prio <= p->prio)) | 1751 | rq->curr->prio <= p->prio)) |
1744 | push_rt_tasks(rq); | 1752 | push_rt_tasks(rq); |
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 88c85b21d633..f964add50f38 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -2,6 +2,7 @@ | |||
2 | #include <linux/sched.h> | 2 | #include <linux/sched.h> |
3 | #include <linux/sched/sysctl.h> | 3 | #include <linux/sched/sysctl.h> |
4 | #include <linux/sched/rt.h> | 4 | #include <linux/sched/rt.h> |
5 | #include <linux/sched/deadline.h> | ||
5 | #include <linux/mutex.h> | 6 | #include <linux/mutex.h> |
6 | #include <linux/spinlock.h> | 7 | #include <linux/spinlock.h> |
7 | #include <linux/stop_machine.h> | 8 | #include <linux/stop_machine.h> |
@@ -9,6 +10,7 @@ | |||
9 | #include <linux/slab.h> | 10 | #include <linux/slab.h> |
10 | 11 | ||
11 | #include "cpupri.h" | 12 | #include "cpupri.h" |
13 | #include "cpudeadline.h" | ||
12 | #include "cpuacct.h" | 14 | #include "cpuacct.h" |
13 | 15 | ||
14 | struct rq; | 16 | struct rq; |
@@ -73,6 +75,13 @@ extern void update_cpu_load_active(struct rq *this_rq); | |||
73 | #define NICE_0_SHIFT SCHED_LOAD_SHIFT | 75 | #define NICE_0_SHIFT SCHED_LOAD_SHIFT |
74 | 76 | ||
75 | /* | 77 | /* |
78 | * Single value that decides SCHED_DEADLINE internal math precision. | ||
79 | * 10 -> just above 1us | ||
80 | * 9 -> just above 0.5us | ||
81 | */ | ||
82 | #define DL_SCALE (10) | ||
83 | |||
84 | /* | ||
76 | * These are the 'tuning knobs' of the scheduler: | 85 | * These are the 'tuning knobs' of the scheduler: |
77 | */ | 86 | */ |
78 | 87 | ||
@@ -81,11 +90,19 @@ extern void update_cpu_load_active(struct rq *this_rq); | |||
81 | */ | 90 | */ |
82 | #define RUNTIME_INF ((u64)~0ULL) | 91 | #define RUNTIME_INF ((u64)~0ULL) |
83 | 92 | ||
93 | static inline int fair_policy(int policy) | ||
94 | { | ||
95 | return policy == SCHED_NORMAL || policy == SCHED_BATCH; | ||
96 | } | ||
97 | |||
84 | static inline int rt_policy(int policy) | 98 | static inline int rt_policy(int policy) |
85 | { | 99 | { |
86 | if (policy == SCHED_FIFO || policy == SCHED_RR) | 100 | return policy == SCHED_FIFO || policy == SCHED_RR; |
87 | return 1; | 101 | } |
88 | return 0; | 102 | |
103 | static inline int dl_policy(int policy) | ||
104 | { | ||
105 | return policy == SCHED_DEADLINE; | ||
89 | } | 106 | } |
90 | 107 | ||
91 | static inline int task_has_rt_policy(struct task_struct *p) | 108 | static inline int task_has_rt_policy(struct task_struct *p) |
@@ -93,6 +110,25 @@ static inline int task_has_rt_policy(struct task_struct *p) | |||
93 | return rt_policy(p->policy); | 110 | return rt_policy(p->policy); |
94 | } | 111 | } |
95 | 112 | ||
113 | static inline int task_has_dl_policy(struct task_struct *p) | ||
114 | { | ||
115 | return dl_policy(p->policy); | ||
116 | } | ||
117 | |||
118 | static inline bool dl_time_before(u64 a, u64 b) | ||
119 | { | ||
120 | return (s64)(a - b) < 0; | ||
121 | } | ||
122 | |||
123 | /* | ||
124 | * Tells if entity @a should preempt entity @b. | ||
125 | */ | ||
126 | static inline bool | ||
127 | dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b) | ||
128 | { | ||
129 | return dl_time_before(a->deadline, b->deadline); | ||
130 | } | ||
131 | |||
96 | /* | 132 | /* |
97 | * This is the priority-queue data structure of the RT scheduling class: | 133 | * This is the priority-queue data structure of the RT scheduling class: |
98 | */ | 134 | */ |
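dl_time_before() above compares u64 deadlines through a signed difference so the ordering stays correct even if the counter wraps. A small standalone check (editorial sketch in plain userspace C, not kernel code) makes the arithmetic concrete:

#include <stdint.h>
#include <stdio.h>

static int before(uint64_t a, uint64_t b)
{
        return (int64_t)(a - b) < 0;            /* same trick as dl_time_before() */
}

int main(void)
{
        uint64_t near_wrap = UINT64_MAX - 5;    /* deadline just before wraparound */
        uint64_t wrapped   = 10;                /* deadline just after wraparound */

        /* a plain "near_wrap < wrapped" is false, but the modular difference
         * 10 - (UINT64_MAX - 5) is 16, so the signed test still orders the
         * two deadlines the way the scheduler needs: prints "1 0" */
        printf("%d %d\n", before(near_wrap, wrapped), before(wrapped, near_wrap));
        return 0;
}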
@@ -108,6 +144,47 @@ struct rt_bandwidth { | |||
108 | u64 rt_runtime; | 144 | u64 rt_runtime; |
109 | struct hrtimer rt_period_timer; | 145 | struct hrtimer rt_period_timer; |
110 | }; | 146 | }; |
147 | /* | ||
148 | * To keep the bandwidth of -deadline tasks and groups under control | ||
149 | * we need some place where: | ||
150 | * - store the maximum -deadline bandwidth of the system (the group); | ||
151 | * - cache the fraction of that bandwidth that is currently allocated. | ||
152 | * | ||
153 | * This is all done in the data structure below. It is similar to the | ||
154 | * one used for RT-throttling (rt_bandwidth), with the main difference | ||
155 | * that, since here we are only interested in admission control, we | ||
156 | * do not decrease any runtime while the group "executes", nor do we | ||
157 | * need a timer to replenish it. | ||
158 | * | ||
159 | * With respect to SMP, the bandwidth is given on a per-CPU basis, | ||
160 | * meaning that: | ||
161 | * - dl_bw (< 100%) is the bandwidth of the system (group) on each CPU; | ||
162 | * - dl_total_bw array contains, in the i-th element, the currently | ||
163 | * allocated bandwidth on the i-th CPU. | ||
164 | * Moreover, groups consume bandwidth on each CPU, while tasks only | ||
165 | * consume bandwidth on the CPU they're running on. | ||
166 | * Finally, dl_total_bw_cpu is used to cache the index of dl_total_bw | ||
167 | * that will be shown the next time the proc or cgroup controls are | ||
168 | * read. It can in turn be changed by writing to its own | ||
169 | * control. | ||
170 | */ | ||
171 | struct dl_bandwidth { | ||
172 | raw_spinlock_t dl_runtime_lock; | ||
173 | u64 dl_runtime; | ||
174 | u64 dl_period; | ||
175 | }; | ||
176 | |||
177 | static inline int dl_bandwidth_enabled(void) | ||
178 | { | ||
179 | return sysctl_sched_rt_runtime >= 0; | ||
180 | } | ||
181 | |||
182 | extern struct dl_bw *dl_bw_of(int i); | ||
183 | |||
184 | struct dl_bw { | ||
185 | raw_spinlock_t lock; | ||
186 | u64 bw, total_bw; | ||
187 | }; | ||
111 | 188 | ||
112 | extern struct mutex sched_domains_mutex; | 189 | extern struct mutex sched_domains_mutex; |
113 | 190 | ||
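The comment block above describes admission control as cached bandwidth fractions, and this header later declares to_ratio(period, runtime) for the fixed-point math. The program below only illustrates that bookkeeping; the 20-bit scale, the 95% cap and the helper names are assumptions for the example, not the kernel's definitions (and the scale is unrelated to DL_SCALE above).

#include <stdint.h>
#include <stdio.h>

#define SKETCH_BW_SHIFT 20      /* assumed fixed-point scale */

static uint64_t sketch_to_ratio(uint64_t period, uint64_t runtime)
{
        return (runtime << SKETCH_BW_SHIFT) / period;
}

int main(void)
{
        uint64_t cap      = sketch_to_ratio(1000000ULL, 950000ULL);     /* 95% of a CPU */
        uint64_t task_bw  = sketch_to_ratio(100000000ULL, 10000000ULL); /* 10ms every 100ms */
        uint64_t total_bw = 0;
        int n = 0;

        /* admit 10% tasks until the cached total would exceed the cap */
        while (total_bw + task_bw <= cap) {
                total_bw += task_bw;
                n++;
        }
        printf("admitted %d tasks, total_bw=%llu of cap=%llu\n",
               n, (unsigned long long)total_bw, (unsigned long long)cap);
        return 0;
}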
@@ -364,6 +441,41 @@ struct rt_rq { | |||
364 | #endif | 441 | #endif |
365 | }; | 442 | }; |
366 | 443 | ||
444 | /* Deadline class' related fields in a runqueue */ | ||
445 | struct dl_rq { | ||
446 | /* runqueue is an rbtree, ordered by deadline */ | ||
447 | struct rb_root rb_root; | ||
448 | struct rb_node *rb_leftmost; | ||
449 | |||
450 | unsigned long dl_nr_running; | ||
451 | |||
452 | #ifdef CONFIG_SMP | ||
453 | /* | ||
454 | * Deadline values of the currently executing and the | ||
455 | * earliest ready task on this rq. Caching these facilitates | ||
456 | * the decision whether or not a ready but not running task | ||
457 | * should migrate somewhere else. | ||
458 | */ | ||
459 | struct { | ||
460 | u64 curr; | ||
461 | u64 next; | ||
462 | } earliest_dl; | ||
463 | |||
464 | unsigned long dl_nr_migratory; | ||
465 | int overloaded; | ||
466 | |||
467 | /* | ||
468 | * Tasks on this rq that can be pushed away. They are kept in | ||
469 | * an rb-tree, ordered by tasks' deadlines, with caching | ||
470 | * of the leftmost (earliest deadline) element. | ||
471 | */ | ||
472 | struct rb_root pushable_dl_tasks_root; | ||
473 | struct rb_node *pushable_dl_tasks_leftmost; | ||
474 | #else | ||
475 | struct dl_bw dl_bw; | ||
476 | #endif | ||
477 | }; | ||
478 | |||
367 | #ifdef CONFIG_SMP | 479 | #ifdef CONFIG_SMP |
368 | 480 | ||
369 | /* | 481 | /* |
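The dl_rq layout above relies on an rb-tree keyed by deadline with the leftmost (earliest) node cached, so the next task to run is found in O(1). Below is a minimal sketch of that pattern using the stock <linux/rbtree.h> helpers and the dl_time_before() helper from earlier in this header; the struct and function names are illustrative, not the kernel's enqueue path.

#include <linux/rbtree.h>

struct sketch_dl_node {                 /* illustrative, not the kernel's type */
        struct rb_node node;
        u64 deadline;
};

static void sketch_dl_insert(struct rb_root *root, struct rb_node **leftmost,
                             struct sketch_dl_node *entry)
{
        struct rb_node **link = &root->rb_node, *parent = NULL;
        int is_leftmost = 1;

        while (*link) {
                struct sketch_dl_node *cur;

                parent = *link;
                cur = rb_entry(parent, struct sketch_dl_node, node);
                if (dl_time_before(entry->deadline, cur->deadline)) {
                        link = &parent->rb_left;
                } else {
                        link = &parent->rb_right;
                        is_leftmost = 0;        /* went right at least once */
                }
        }

        /* the earliest deadline stays reachable in O(1) through *leftmost */
        if (is_leftmost)
                *leftmost = &entry->node;

        rb_link_node(&entry->node, parent, link);
        rb_insert_color(&entry->node, root);
}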
@@ -382,6 +494,15 @@ struct root_domain { | |||
382 | cpumask_var_t online; | 494 | cpumask_var_t online; |
383 | 495 | ||
384 | /* | 496 | /* |
497 | * The bit corresponding to a CPU gets set here if such CPU has more | ||
498 | * than one runnable -deadline task (as it is below for RT tasks). | ||
499 | */ | ||
500 | cpumask_var_t dlo_mask; | ||
501 | atomic_t dlo_count; | ||
502 | struct dl_bw dl_bw; | ||
503 | struct cpudl cpudl; | ||
504 | |||
505 | /* | ||
385 | * The "RT overload" flag: it gets set if a CPU has more than | 506 | * The "RT overload" flag: it gets set if a CPU has more than |
386 | * one runnable RT task. | 507 | * one runnable RT task. |
387 | */ | 508 | */ |
@@ -432,6 +553,7 @@ struct rq { | |||
432 | 553 | ||
433 | struct cfs_rq cfs; | 554 | struct cfs_rq cfs; |
434 | struct rt_rq rt; | 555 | struct rt_rq rt; |
556 | struct dl_rq dl; | ||
435 | 557 | ||
436 | #ifdef CONFIG_FAIR_GROUP_SCHED | 558 | #ifdef CONFIG_FAIR_GROUP_SCHED |
437 | /* list of leaf cfs_rq on this cpu: */ | 559 | /* list of leaf cfs_rq on this cpu: */ |
@@ -827,8 +949,6 @@ static inline u64 global_rt_runtime(void) | |||
827 | return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; | 949 | return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; |
828 | } | 950 | } |
829 | 951 | ||
830 | |||
831 | |||
832 | static inline int task_current(struct rq *rq, struct task_struct *p) | 952 | static inline int task_current(struct rq *rq, struct task_struct *p) |
833 | { | 953 | { |
834 | return rq->curr == p; | 954 | return rq->curr == p; |
@@ -988,6 +1108,7 @@ static const u32 prio_to_wmult[40] = { | |||
988 | #else | 1108 | #else |
989 | #define ENQUEUE_WAKING 0 | 1109 | #define ENQUEUE_WAKING 0 |
990 | #endif | 1110 | #endif |
1111 | #define ENQUEUE_REPLENISH 8 | ||
991 | 1112 | ||
992 | #define DEQUEUE_SLEEP 1 | 1113 | #define DEQUEUE_SLEEP 1 |
993 | 1114 | ||
@@ -1023,6 +1144,7 @@ struct sched_class { | |||
1023 | void (*set_curr_task) (struct rq *rq); | 1144 | void (*set_curr_task) (struct rq *rq); |
1024 | void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); | 1145 | void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); |
1025 | void (*task_fork) (struct task_struct *p); | 1146 | void (*task_fork) (struct task_struct *p); |
1147 | void (*task_dead) (struct task_struct *p); | ||
1026 | 1148 | ||
1027 | void (*switched_from) (struct rq *this_rq, struct task_struct *task); | 1149 | void (*switched_from) (struct rq *this_rq, struct task_struct *task); |
1028 | void (*switched_to) (struct rq *this_rq, struct task_struct *task); | 1150 | void (*switched_to) (struct rq *this_rq, struct task_struct *task); |
@@ -1042,6 +1164,7 @@ struct sched_class { | |||
1042 | for (class = sched_class_highest; class; class = class->next) | 1164 | for (class = sched_class_highest; class; class = class->next) |
1043 | 1165 | ||
1044 | extern const struct sched_class stop_sched_class; | 1166 | extern const struct sched_class stop_sched_class; |
1167 | extern const struct sched_class dl_sched_class; | ||
1045 | extern const struct sched_class rt_sched_class; | 1168 | extern const struct sched_class rt_sched_class; |
1046 | extern const struct sched_class fair_sched_class; | 1169 | extern const struct sched_class fair_sched_class; |
1047 | extern const struct sched_class idle_sched_class; | 1170 | extern const struct sched_class idle_sched_class; |
@@ -1051,7 +1174,7 @@ extern const struct sched_class idle_sched_class; | |||
1051 | 1174 | ||
1052 | extern void update_group_power(struct sched_domain *sd, int cpu); | 1175 | extern void update_group_power(struct sched_domain *sd, int cpu); |
1053 | 1176 | ||
1054 | extern void trigger_load_balance(struct rq *rq, int cpu); | 1177 | extern void trigger_load_balance(struct rq *rq); |
1055 | extern void idle_balance(int this_cpu, struct rq *this_rq); | 1178 | extern void idle_balance(int this_cpu, struct rq *this_rq); |
1056 | 1179 | ||
1057 | extern void idle_enter_fair(struct rq *this_rq); | 1180 | extern void idle_enter_fair(struct rq *this_rq); |
@@ -1068,8 +1191,11 @@ static inline void idle_balance(int cpu, struct rq *rq) | |||
1068 | extern void sysrq_sched_debug_show(void); | 1191 | extern void sysrq_sched_debug_show(void); |
1069 | extern void sched_init_granularity(void); | 1192 | extern void sched_init_granularity(void); |
1070 | extern void update_max_interval(void); | 1193 | extern void update_max_interval(void); |
1194 | |||
1195 | extern void init_sched_dl_class(void); | ||
1071 | extern void init_sched_rt_class(void); | 1196 | extern void init_sched_rt_class(void); |
1072 | extern void init_sched_fair_class(void); | 1197 | extern void init_sched_fair_class(void); |
1198 | extern void init_sched_dl_class(void); | ||
1073 | 1199 | ||
1074 | extern void resched_task(struct task_struct *p); | 1200 | extern void resched_task(struct task_struct *p); |
1075 | extern void resched_cpu(int cpu); | 1201 | extern void resched_cpu(int cpu); |
@@ -1077,6 +1203,12 @@ extern void resched_cpu(int cpu); | |||
1077 | extern struct rt_bandwidth def_rt_bandwidth; | 1203 | extern struct rt_bandwidth def_rt_bandwidth; |
1078 | extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); | 1204 | extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); |
1079 | 1205 | ||
1206 | extern struct dl_bandwidth def_dl_bandwidth; | ||
1207 | extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime); | ||
1208 | extern void init_dl_task_timer(struct sched_dl_entity *dl_se); | ||
1209 | |||
1210 | unsigned long to_ratio(u64 period, u64 runtime); | ||
1211 | |||
1080 | extern void update_idle_cpu_load(struct rq *this_rq); | 1212 | extern void update_idle_cpu_load(struct rq *this_rq); |
1081 | 1213 | ||
1082 | extern void init_task_runnable_average(struct task_struct *p); | 1214 | extern void init_task_runnable_average(struct task_struct *p); |
@@ -1353,6 +1485,7 @@ extern void print_rt_stats(struct seq_file *m, int cpu); | |||
1353 | 1485 | ||
1354 | extern void init_cfs_rq(struct cfs_rq *cfs_rq); | 1486 | extern void init_cfs_rq(struct cfs_rq *cfs_rq); |
1355 | extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); | 1487 | extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); |
1488 | extern void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq); | ||
1356 | 1489 | ||
1357 | extern void cfs_bandwidth_usage_inc(void); | 1490 | extern void cfs_bandwidth_usage_inc(void); |
1358 | extern void cfs_bandwidth_usage_dec(void); | 1491 | extern void cfs_bandwidth_usage_dec(void); |
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 47197de8abd9..fdb6bb0b3356 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c | |||
@@ -103,7 +103,7 @@ get_rr_interval_stop(struct rq *rq, struct task_struct *task) | |||
103 | * Simple, special scheduling class for the per-CPU stop tasks: | 103 | * Simple, special scheduling class for the per-CPU stop tasks: |
104 | */ | 104 | */ |
105 | const struct sched_class stop_sched_class = { | 105 | const struct sched_class stop_sched_class = { |
106 | .next = &rt_sched_class, | 106 | .next = &dl_sched_class, |
107 | 107 | ||
108 | .enqueue_task = enqueue_task_stop, | 108 | .enqueue_task = enqueue_task_stop, |
109 | .dequeue_task = dequeue_task_stop, | 109 | .dequeue_task = dequeue_task_stop, |
diff --git a/kernel/signal.c b/kernel/signal.c index 940b30ee9a30..52f881db1ca0 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -2047,8 +2047,8 @@ static bool do_signal_stop(int signr) | |||
2047 | if (task_set_jobctl_pending(current, signr | gstop)) | 2047 | if (task_set_jobctl_pending(current, signr | gstop)) |
2048 | sig->group_stop_count++; | 2048 | sig->group_stop_count++; |
2049 | 2049 | ||
2050 | for (t = next_thread(current); t != current; | 2050 | t = current; |
2051 | t = next_thread(t)) { | 2051 | while_each_thread(current, t) { |
2052 | /* | 2052 | /* |
2053 | * Setting state to TASK_STOPPED for a group | 2053 | * Setting state to TASK_STOPPED for a group |
2054 | * stop is always done with the siglock held, | 2054 | * stop is always done with the siglock held, |
@@ -3125,8 +3125,7 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) | |||
3125 | rm_from_queue_full(&mask, &t->signal->shared_pending); | 3125 | rm_from_queue_full(&mask, &t->signal->shared_pending); |
3126 | do { | 3126 | do { |
3127 | rm_from_queue_full(&mask, &t->pending); | 3127 | rm_from_queue_full(&mask, &t->pending); |
3128 | t = next_thread(t); | 3128 | } while_each_thread(current, t); |
3129 | } while (t != current); | ||
3130 | } | 3129 | } |
3131 | } | 3130 | } |
3132 | 3131 | ||
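Both conversions above lean on while_each_thread(), which as far as I recall roughly expands to while ((t = next_thread(t)) != g): seeding t with the starting thread walks every other member of the group and stops back where it began, and the do { } while_each_thread() form additionally runs the body for the starting thread itself. The userspace mock below, with a circular-array stand-in for next_thread(), shows the old and new idioms visiting the same members; everything in it is illustrative.

#include <stdio.h>

#define NTHREADS 4
static int next_thread(int t) { return (t + 1) % NTHREADS; }    /* mock */

/* stand-in for the kernel macro; the real one walks task_structs */
#define while_each_thread(g, t) while ((t = next_thread(t)) != (g))

int main(void)
{
        int g = 0, t;

        /* old open-coded form: every thread in the group except 'g' */
        for (t = next_thread(g); t != g; t = next_thread(t))
                printf("old visits %d\n", t);

        /* converted form from the hunks above: same traversal */
        t = g;
        while_each_thread(g, t)
                printf("new visits %d\n", t);
        return 0;
}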
diff --git a/kernel/smp.c b/kernel/smp.c index bd9f94028838..ffee35bef179 100644 --- a/kernel/smp.c +++ b/kernel/smp.c | |||
@@ -23,17 +23,11 @@ enum { | |||
23 | struct call_function_data { | 23 | struct call_function_data { |
24 | struct call_single_data __percpu *csd; | 24 | struct call_single_data __percpu *csd; |
25 | cpumask_var_t cpumask; | 25 | cpumask_var_t cpumask; |
26 | cpumask_var_t cpumask_ipi; | ||
27 | }; | 26 | }; |
28 | 27 | ||
29 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data); | 28 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data); |
30 | 29 | ||
31 | struct call_single_queue { | 30 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue); |
32 | struct list_head list; | ||
33 | raw_spinlock_t lock; | ||
34 | }; | ||
35 | |||
36 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_queue, call_single_queue); | ||
37 | 31 | ||
38 | static int | 32 | static int |
39 | hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) | 33 | hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) |
@@ -47,14 +41,8 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
47 | if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, | 41 | if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, |
48 | cpu_to_node(cpu))) | 42 | cpu_to_node(cpu))) |
49 | return notifier_from_errno(-ENOMEM); | 43 | return notifier_from_errno(-ENOMEM); |
50 | if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL, | ||
51 | cpu_to_node(cpu))) { | ||
52 | free_cpumask_var(cfd->cpumask); | ||
53 | return notifier_from_errno(-ENOMEM); | ||
54 | } | ||
55 | cfd->csd = alloc_percpu(struct call_single_data); | 44 | cfd->csd = alloc_percpu(struct call_single_data); |
56 | if (!cfd->csd) { | 45 | if (!cfd->csd) { |
57 | free_cpumask_var(cfd->cpumask_ipi); | ||
58 | free_cpumask_var(cfd->cpumask); | 46 | free_cpumask_var(cfd->cpumask); |
59 | return notifier_from_errno(-ENOMEM); | 47 | return notifier_from_errno(-ENOMEM); |
60 | } | 48 | } |
@@ -67,7 +55,6 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
67 | case CPU_DEAD: | 55 | case CPU_DEAD: |
68 | case CPU_DEAD_FROZEN: | 56 | case CPU_DEAD_FROZEN: |
69 | free_cpumask_var(cfd->cpumask); | 57 | free_cpumask_var(cfd->cpumask); |
70 | free_cpumask_var(cfd->cpumask_ipi); | ||
71 | free_percpu(cfd->csd); | 58 | free_percpu(cfd->csd); |
72 | break; | 59 | break; |
73 | #endif | 60 | #endif |
@@ -85,12 +72,8 @@ void __init call_function_init(void) | |||
85 | void *cpu = (void *)(long)smp_processor_id(); | 72 | void *cpu = (void *)(long)smp_processor_id(); |
86 | int i; | 73 | int i; |
87 | 74 | ||
88 | for_each_possible_cpu(i) { | 75 | for_each_possible_cpu(i) |
89 | struct call_single_queue *q = &per_cpu(call_single_queue, i); | 76 | init_llist_head(&per_cpu(call_single_queue, i)); |
90 | |||
91 | raw_spin_lock_init(&q->lock); | ||
92 | INIT_LIST_HEAD(&q->list); | ||
93 | } | ||
94 | 77 | ||
95 | hotplug_cfd(&hotplug_cfd_notifier, CPU_UP_PREPARE, cpu); | 78 | hotplug_cfd(&hotplug_cfd_notifier, CPU_UP_PREPARE, cpu); |
96 | register_cpu_notifier(&hotplug_cfd_notifier); | 79 | register_cpu_notifier(&hotplug_cfd_notifier); |
@@ -141,18 +124,9 @@ static void csd_unlock(struct call_single_data *csd) | |||
141 | */ | 124 | */ |
142 | static void generic_exec_single(int cpu, struct call_single_data *csd, int wait) | 125 | static void generic_exec_single(int cpu, struct call_single_data *csd, int wait) |
143 | { | 126 | { |
144 | struct call_single_queue *dst = &per_cpu(call_single_queue, cpu); | ||
145 | unsigned long flags; | ||
146 | int ipi; | ||
147 | |||
148 | if (wait) | 127 | if (wait) |
149 | csd->flags |= CSD_FLAG_WAIT; | 128 | csd->flags |= CSD_FLAG_WAIT; |
150 | 129 | ||
151 | raw_spin_lock_irqsave(&dst->lock, flags); | ||
152 | ipi = list_empty(&dst->list); | ||
153 | list_add_tail(&csd->list, &dst->list); | ||
154 | raw_spin_unlock_irqrestore(&dst->lock, flags); | ||
155 | |||
156 | /* | 130 | /* |
157 | * The list addition should be visible before sending the IPI | 131 | * The list addition should be visible before sending the IPI |
158 | * handler locks the list to pull the entry off it because of | 132 | * handler locks the list to pull the entry off it because of |
@@ -164,7 +138,7 @@ static void generic_exec_single(int cpu, struct call_single_data *csd, int wait) | |||
164 | * locking and barrier primitives. Generic code isn't really | 138 | * locking and barrier primitives. Generic code isn't really |
165 | * equipped to do the right thing... | 139 | * equipped to do the right thing... |
166 | */ | 140 | */ |
167 | if (ipi) | 141 | if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu))) |
168 | arch_send_call_function_single_ipi(cpu); | 142 | arch_send_call_function_single_ipi(cpu); |
169 | 143 | ||
170 | if (wait) | 144 | if (wait) |
@@ -177,27 +151,26 @@ static void generic_exec_single(int cpu, struct call_single_data *csd, int wait) | |||
177 | */ | 151 | */ |
178 | void generic_smp_call_function_single_interrupt(void) | 152 | void generic_smp_call_function_single_interrupt(void) |
179 | { | 153 | { |
180 | struct call_single_queue *q = &__get_cpu_var(call_single_queue); | 154 | struct llist_node *entry, *next; |
181 | LIST_HEAD(list); | ||
182 | 155 | ||
183 | /* | 156 | /* |
184 | * Shouldn't receive this interrupt on a cpu that is not yet online. | 157 | * Shouldn't receive this interrupt on a cpu that is not yet online. |
185 | */ | 158 | */ |
186 | WARN_ON_ONCE(!cpu_online(smp_processor_id())); | 159 | WARN_ON_ONCE(!cpu_online(smp_processor_id())); |
187 | 160 | ||
188 | raw_spin_lock(&q->lock); | 161 | entry = llist_del_all(&__get_cpu_var(call_single_queue)); |
189 | list_replace_init(&q->list, &list); | 162 | entry = llist_reverse_order(entry); |
190 | raw_spin_unlock(&q->lock); | ||
191 | 163 | ||
192 | while (!list_empty(&list)) { | 164 | while (entry) { |
193 | struct call_single_data *csd; | 165 | struct call_single_data *csd; |
194 | 166 | ||
195 | csd = list_entry(list.next, struct call_single_data, list); | 167 | next = entry->next; |
196 | list_del(&csd->list); | ||
197 | 168 | ||
169 | csd = llist_entry(entry, struct call_single_data, llist); | ||
198 | csd->func(csd->info); | 170 | csd->func(csd->info); |
199 | |||
200 | csd_unlock(csd); | 171 | csd_unlock(csd); |
172 | |||
173 | entry = next; | ||
201 | } | 174 | } |
202 | } | 175 | } |
203 | 176 | ||
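Two details carry the llist conversion above. First, llist_add() returns true only when the list was previously empty, which replaces the locked list_empty() test for deciding whether to send the IPI, so each burst of requests still triggers exactly one interrupt. Second, the consumer must read the next pointer before invoking the callback, because csd_unlock() lets the caller reuse the csd. The condensed sketch below only restates the hunks with those points called out; the wrapper names are invented.

static void sketch_queue_call(struct call_single_data *csd, int cpu)
{
        /* true only for the first entry on an empty queue */
        if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)))
                arch_send_call_function_single_ipi(cpu);
}

static void sketch_drain_calls(void)
{
        struct llist_node *entry, *next;

        entry = llist_del_all(&__get_cpu_var(call_single_queue));
        entry = llist_reverse_order(entry);     /* restore FIFO order */

        while (entry) {
                struct call_single_data *csd =
                        llist_entry(entry, struct call_single_data, llist);

                next = entry->next;             /* grab before csd_unlock() */
                csd->func(csd->info);
                csd_unlock(csd);
                entry = next;
        }
}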
@@ -402,30 +375,17 @@ void smp_call_function_many(const struct cpumask *mask, | |||
402 | if (unlikely(!cpumask_weight(cfd->cpumask))) | 375 | if (unlikely(!cpumask_weight(cfd->cpumask))) |
403 | return; | 376 | return; |
404 | 377 | ||
405 | /* | ||
406 | * After we put an entry into the list, cfd->cpumask may be cleared | ||
407 | * again when another CPU sends another IPI for a SMP function call, so | ||
408 | * cfd->cpumask will be zero. | ||
409 | */ | ||
410 | cpumask_copy(cfd->cpumask_ipi, cfd->cpumask); | ||
411 | |||
412 | for_each_cpu(cpu, cfd->cpumask) { | 378 | for_each_cpu(cpu, cfd->cpumask) { |
413 | struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu); | 379 | struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu); |
414 | struct call_single_queue *dst = | ||
415 | &per_cpu(call_single_queue, cpu); | ||
416 | unsigned long flags; | ||
417 | 380 | ||
418 | csd_lock(csd); | 381 | csd_lock(csd); |
419 | csd->func = func; | 382 | csd->func = func; |
420 | csd->info = info; | 383 | csd->info = info; |
421 | 384 | llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)); | |
422 | raw_spin_lock_irqsave(&dst->lock, flags); | ||
423 | list_add_tail(&csd->list, &dst->list); | ||
424 | raw_spin_unlock_irqrestore(&dst->lock, flags); | ||
425 | } | 385 | } |
426 | 386 | ||
427 | /* Send a message to all CPUs in the map */ | 387 | /* Send a message to all CPUs in the map */ |
428 | arch_send_call_function_ipi_mask(cfd->cpumask_ipi); | 388 | arch_send_call_function_ipi_mask(cfd->cpumask); |
429 | 389 | ||
430 | if (wait) { | 390 | if (wait) { |
431 | for_each_cpu(cpu, cfd->cpumask) { | 391 | for_each_cpu(cpu, cfd->cpumask) { |
diff --git a/kernel/softirq.c b/kernel/softirq.c index 11025ccc06dd..490fcbb1dc5b 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
@@ -8,6 +8,8 @@ | |||
8 | * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) | 8 | * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) |
9 | */ | 9 | */ |
10 | 10 | ||
11 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | ||
12 | |||
11 | #include <linux/export.h> | 13 | #include <linux/export.h> |
12 | #include <linux/kernel_stat.h> | 14 | #include <linux/kernel_stat.h> |
13 | #include <linux/interrupt.h> | 15 | #include <linux/interrupt.h> |
@@ -54,7 +56,7 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp | |||
54 | 56 | ||
55 | DEFINE_PER_CPU(struct task_struct *, ksoftirqd); | 57 | DEFINE_PER_CPU(struct task_struct *, ksoftirqd); |
56 | 58 | ||
57 | char *softirq_to_name[NR_SOFTIRQS] = { | 59 | const char * const softirq_to_name[NR_SOFTIRQS] = { |
58 | "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", | 60 | "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", |
59 | "TASKLET", "SCHED", "HRTIMER", "RCU" | 61 | "TASKLET", "SCHED", "HRTIMER", "RCU" |
60 | }; | 62 | }; |
@@ -89,7 +91,7 @@ static void wakeup_softirqd(void) | |||
89 | * where hardirqs are disabled legitimately: | 91 | * where hardirqs are disabled legitimately: |
90 | */ | 92 | */ |
91 | #ifdef CONFIG_TRACE_IRQFLAGS | 93 | #ifdef CONFIG_TRACE_IRQFLAGS |
92 | static void __local_bh_disable(unsigned long ip, unsigned int cnt) | 94 | void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) |
93 | { | 95 | { |
94 | unsigned long flags; | 96 | unsigned long flags; |
95 | 97 | ||
@@ -107,33 +109,21 @@ static void __local_bh_disable(unsigned long ip, unsigned int cnt) | |||
107 | /* | 109 | /* |
108 | * Were softirqs turned off above: | 110 | * Were softirqs turned off above: |
109 | */ | 111 | */ |
110 | if (softirq_count() == cnt) | 112 | if (softirq_count() == (cnt & SOFTIRQ_MASK)) |
111 | trace_softirqs_off(ip); | 113 | trace_softirqs_off(ip); |
112 | raw_local_irq_restore(flags); | 114 | raw_local_irq_restore(flags); |
113 | 115 | ||
114 | if (preempt_count() == cnt) | 116 | if (preempt_count() == cnt) |
115 | trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); | 117 | trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); |
116 | } | 118 | } |
117 | #else /* !CONFIG_TRACE_IRQFLAGS */ | 119 | EXPORT_SYMBOL(__local_bh_disable_ip); |
118 | static inline void __local_bh_disable(unsigned long ip, unsigned int cnt) | ||
119 | { | ||
120 | preempt_count_add(cnt); | ||
121 | barrier(); | ||
122 | } | ||
123 | #endif /* CONFIG_TRACE_IRQFLAGS */ | 120 | #endif /* CONFIG_TRACE_IRQFLAGS */ |
124 | 121 | ||
125 | void local_bh_disable(void) | ||
126 | { | ||
127 | __local_bh_disable(_RET_IP_, SOFTIRQ_DISABLE_OFFSET); | ||
128 | } | ||
129 | |||
130 | EXPORT_SYMBOL(local_bh_disable); | ||
131 | |||
132 | static void __local_bh_enable(unsigned int cnt) | 122 | static void __local_bh_enable(unsigned int cnt) |
133 | { | 123 | { |
134 | WARN_ON_ONCE(!irqs_disabled()); | 124 | WARN_ON_ONCE(!irqs_disabled()); |
135 | 125 | ||
136 | if (softirq_count() == cnt) | 126 | if (softirq_count() == (cnt & SOFTIRQ_MASK)) |
137 | trace_softirqs_on(_RET_IP_); | 127 | trace_softirqs_on(_RET_IP_); |
138 | preempt_count_sub(cnt); | 128 | preempt_count_sub(cnt); |
139 | } | 129 | } |
@@ -148,10 +138,9 @@ void _local_bh_enable(void) | |||
148 | WARN_ON_ONCE(in_irq()); | 138 | WARN_ON_ONCE(in_irq()); |
149 | __local_bh_enable(SOFTIRQ_DISABLE_OFFSET); | 139 | __local_bh_enable(SOFTIRQ_DISABLE_OFFSET); |
150 | } | 140 | } |
151 | |||
152 | EXPORT_SYMBOL(_local_bh_enable); | 141 | EXPORT_SYMBOL(_local_bh_enable); |
153 | 142 | ||
154 | static inline void _local_bh_enable_ip(unsigned long ip) | 143 | void __local_bh_enable_ip(unsigned long ip, unsigned int cnt) |
155 | { | 144 | { |
156 | WARN_ON_ONCE(in_irq() || irqs_disabled()); | 145 | WARN_ON_ONCE(in_irq() || irqs_disabled()); |
157 | #ifdef CONFIG_TRACE_IRQFLAGS | 146 | #ifdef CONFIG_TRACE_IRQFLAGS |
@@ -165,8 +154,8 @@ static inline void _local_bh_enable_ip(unsigned long ip) | |||
165 | /* | 154 | /* |
166 | * Keep preemption disabled until we are done with | 155 | * Keep preemption disabled until we are done with |
167 | * softirq processing: | 156 | * softirq processing: |
168 | */ | 157 | */ |
169 | preempt_count_sub(SOFTIRQ_DISABLE_OFFSET - 1); | 158 | preempt_count_sub(cnt - 1); |
170 | 159 | ||
171 | if (unlikely(!in_interrupt() && local_softirq_pending())) { | 160 | if (unlikely(!in_interrupt() && local_softirq_pending())) { |
172 | /* | 161 | /* |
@@ -182,18 +171,7 @@ static inline void _local_bh_enable_ip(unsigned long ip) | |||
182 | #endif | 171 | #endif |
183 | preempt_check_resched(); | 172 | preempt_check_resched(); |
184 | } | 173 | } |
185 | 174 | EXPORT_SYMBOL(__local_bh_enable_ip); | |
186 | void local_bh_enable(void) | ||
187 | { | ||
188 | _local_bh_enable_ip(_RET_IP_); | ||
189 | } | ||
190 | EXPORT_SYMBOL(local_bh_enable); | ||
191 | |||
192 | void local_bh_enable_ip(unsigned long ip) | ||
193 | { | ||
194 | _local_bh_enable_ip(ip); | ||
195 | } | ||
196 | EXPORT_SYMBOL(local_bh_enable_ip); | ||
197 | 175 | ||
198 | /* | 176 | /* |
199 | * We restart softirq processing for at most MAX_SOFTIRQ_RESTART times, | 177 | * We restart softirq processing for at most MAX_SOFTIRQ_RESTART times, |
@@ -211,14 +189,49 @@ EXPORT_SYMBOL(local_bh_enable_ip); | |||
211 | #define MAX_SOFTIRQ_TIME msecs_to_jiffies(2) | 189 | #define MAX_SOFTIRQ_TIME msecs_to_jiffies(2) |
212 | #define MAX_SOFTIRQ_RESTART 10 | 190 | #define MAX_SOFTIRQ_RESTART 10 |
213 | 191 | ||
192 | #ifdef CONFIG_TRACE_IRQFLAGS | ||
193 | /* | ||
194 | * When we run softirqs from irq_exit() and thus on the hardirq stack we need | ||
195 | * to keep the lockdep irq context tracking as tight as possible in order to | ||
196 | * not mis-qualify lock contexts and miss possible deadlocks. | ||
197 | */ | ||
198 | |||
199 | static inline bool lockdep_softirq_start(void) | ||
200 | { | ||
201 | bool in_hardirq = false; | ||
202 | |||
203 | if (trace_hardirq_context(current)) { | ||
204 | in_hardirq = true; | ||
205 | trace_hardirq_exit(); | ||
206 | } | ||
207 | |||
208 | lockdep_softirq_enter(); | ||
209 | |||
210 | return in_hardirq; | ||
211 | } | ||
212 | |||
213 | static inline void lockdep_softirq_end(bool in_hardirq) | ||
214 | { | ||
215 | lockdep_softirq_exit(); | ||
216 | |||
217 | if (in_hardirq) | ||
218 | trace_hardirq_enter(); | ||
219 | } | ||
220 | #else | ||
221 | static inline bool lockdep_softirq_start(void) { return false; } | ||
222 | static inline void lockdep_softirq_end(bool in_hardirq) { } | ||
223 | #endif | ||
224 | |||
214 | asmlinkage void __do_softirq(void) | 225 | asmlinkage void __do_softirq(void) |
215 | { | 226 | { |
216 | struct softirq_action *h; | ||
217 | __u32 pending; | ||
218 | unsigned long end = jiffies + MAX_SOFTIRQ_TIME; | 227 | unsigned long end = jiffies + MAX_SOFTIRQ_TIME; |
219 | int cpu; | ||
220 | unsigned long old_flags = current->flags; | 228 | unsigned long old_flags = current->flags; |
221 | int max_restart = MAX_SOFTIRQ_RESTART; | 229 | int max_restart = MAX_SOFTIRQ_RESTART; |
230 | struct softirq_action *h; | ||
231 | bool in_hardirq; | ||
232 | __u32 pending; | ||
233 | int softirq_bit; | ||
234 | int cpu; | ||
222 | 235 | ||
223 | /* | 236 | /* |
224 | * Mask out PF_MEMALLOC as current task context is borrowed for the | 237 |
@@ -230,8 +243,8 @@ asmlinkage void __do_softirq(void) | |||
230 | pending = local_softirq_pending(); | 243 | pending = local_softirq_pending(); |
231 | account_irq_enter_time(current); | 244 | account_irq_enter_time(current); |
232 | 245 | ||
233 | __local_bh_disable(_RET_IP_, SOFTIRQ_OFFSET); | 246 | __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET); |
234 | lockdep_softirq_enter(); | 247 | in_hardirq = lockdep_softirq_start(); |
235 | 248 | ||
236 | cpu = smp_processor_id(); | 249 | cpu = smp_processor_id(); |
237 | restart: | 250 | restart: |
@@ -242,30 +255,30 @@ restart: | |||
242 | 255 | ||
243 | h = softirq_vec; | 256 | h = softirq_vec; |
244 | 257 | ||
245 | do { | 258 | while ((softirq_bit = ffs(pending))) { |
246 | if (pending & 1) { | 259 | unsigned int vec_nr; |
247 | unsigned int vec_nr = h - softirq_vec; | 260 | int prev_count; |
248 | int prev_count = preempt_count(); | 261 | |
249 | 262 | h += softirq_bit - 1; | |
250 | kstat_incr_softirqs_this_cpu(vec_nr); | 263 | |
251 | 264 | vec_nr = h - softirq_vec; | |
252 | trace_softirq_entry(vec_nr); | 265 | prev_count = preempt_count(); |
253 | h->action(h); | 266 | |
254 | trace_softirq_exit(vec_nr); | 267 | kstat_incr_softirqs_this_cpu(vec_nr); |
255 | if (unlikely(prev_count != preempt_count())) { | ||
256 | printk(KERN_ERR "huh, entered softirq %u %s %p" | ||
257 | "with preempt_count %08x," | ||
258 | " exited with %08x?\n", vec_nr, | ||
259 | softirq_to_name[vec_nr], h->action, | ||
260 | prev_count, preempt_count()); | ||
261 | preempt_count_set(prev_count); | ||
262 | } | ||
263 | 268 | ||
264 | rcu_bh_qs(cpu); | 269 | trace_softirq_entry(vec_nr); |
270 | h->action(h); | ||
271 | trace_softirq_exit(vec_nr); | ||
272 | if (unlikely(prev_count != preempt_count())) { | ||
273 | pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n", | ||
274 | vec_nr, softirq_to_name[vec_nr], h->action, | ||
275 | prev_count, preempt_count()); | ||
276 | preempt_count_set(prev_count); | ||
265 | } | 277 | } |
278 | rcu_bh_qs(cpu); | ||
266 | h++; | 279 | h++; |
267 | pending >>= 1; | 280 | pending >>= softirq_bit; |
268 | } while (pending); | 281 | } |
269 | 282 | ||
270 | local_irq_disable(); | 283 | local_irq_disable(); |
271 | 284 | ||
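The rewritten loop above jumps straight to the next pending bit with ffs() instead of shifting the mask one position at a time. The userspace mock below reproduces the same pointer arithmetic (h += bit - 1, then h++ and pending >>= bit) against a copy of the softirq name table, so it prints TIMER, NET_RX and RCU for a mask with bits 1, 3 and 9 set; it is an editorial illustration, not kernel code.

#include <stdio.h>
#include <strings.h>                    /* ffs() */

static const char * const names[] = {
        "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL",
        "TASKLET", "SCHED", "HRTIMER", "RCU"
};

int main(void)
{
        unsigned int pending = (1u << 1) | (1u << 3) | (1u << 9);
        const char * const *h = names;
        int bit;

        while ((bit = ffs(pending))) {
                h += bit - 1;                   /* jump to the set bit */
                printf("run softirq %ld (%s)\n", (long)(h - names), *h);
                h++;                            /* mirror the kernel loop */
                pending >>= bit;                /* consume bits up to here */
        }
        return 0;
}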
@@ -278,16 +291,13 @@ restart: | |||
278 | wakeup_softirqd(); | 291 | wakeup_softirqd(); |
279 | } | 292 | } |
280 | 293 | ||
281 | lockdep_softirq_exit(); | 294 | lockdep_softirq_end(in_hardirq); |
282 | |||
283 | account_irq_exit_time(current); | 295 | account_irq_exit_time(current); |
284 | __local_bh_enable(SOFTIRQ_OFFSET); | 296 | __local_bh_enable(SOFTIRQ_OFFSET); |
285 | WARN_ON_ONCE(in_interrupt()); | 297 | WARN_ON_ONCE(in_interrupt()); |
286 | tsk_restore_flags(current, old_flags, PF_MEMALLOC); | 298 | tsk_restore_flags(current, old_flags, PF_MEMALLOC); |
287 | } | 299 | } |
288 | 300 | ||
289 | |||
290 | |||
291 | asmlinkage void do_softirq(void) | 301 | asmlinkage void do_softirq(void) |
292 | { | 302 | { |
293 | __u32 pending; | 303 | __u32 pending; |
@@ -311,8 +321,6 @@ asmlinkage void do_softirq(void) | |||
311 | */ | 321 | */ |
312 | void irq_enter(void) | 322 | void irq_enter(void) |
313 | { | 323 | { |
314 | int cpu = smp_processor_id(); | ||
315 | |||
316 | rcu_irq_enter(); | 324 | rcu_irq_enter(); |
317 | if (is_idle_task(current) && !in_interrupt()) { | 325 | if (is_idle_task(current) && !in_interrupt()) { |
318 | /* | 326 | /* |
@@ -320,7 +328,7 @@ void irq_enter(void) | |||
320 | * here, as softirq will be serviced on return from interrupt. | 328 | * here, as softirq will be serviced on return from interrupt. |
321 | */ | 329 | */ |
322 | local_bh_disable(); | 330 | local_bh_disable(); |
323 | tick_check_idle(cpu); | 331 | tick_irq_enter(); |
324 | _local_bh_enable(); | 332 | _local_bh_enable(); |
325 | } | 333 | } |
326 | 334 | ||
@@ -375,13 +383,13 @@ void irq_exit(void) | |||
375 | #endif | 383 | #endif |
376 | 384 | ||
377 | account_irq_exit_time(current); | 385 | account_irq_exit_time(current); |
378 | trace_hardirq_exit(); | ||
379 | preempt_count_sub(HARDIRQ_OFFSET); | 386 | preempt_count_sub(HARDIRQ_OFFSET); |
380 | if (!in_interrupt() && local_softirq_pending()) | 387 | if (!in_interrupt() && local_softirq_pending()) |
381 | invoke_softirq(); | 388 | invoke_softirq(); |
382 | 389 | ||
383 | tick_irq_exit(); | 390 | tick_irq_exit(); |
384 | rcu_irq_exit(); | 391 | rcu_irq_exit(); |
392 | trace_hardirq_exit(); /* must be last! */ | ||
385 | } | 393 | } |
386 | 394 | ||
387 | /* | 395 | /* |
@@ -427,8 +435,7 @@ void open_softirq(int nr, void (*action)(struct softirq_action *)) | |||
427 | /* | 435 | /* |
428 | * Tasklets | 436 | * Tasklets |
429 | */ | 437 | */ |
430 | struct tasklet_head | 438 | struct tasklet_head { |
431 | { | ||
432 | struct tasklet_struct *head; | 439 | struct tasklet_struct *head; |
433 | struct tasklet_struct **tail; | 440 | struct tasklet_struct **tail; |
434 | }; | 441 | }; |
@@ -447,7 +454,6 @@ void __tasklet_schedule(struct tasklet_struct *t) | |||
447 | raise_softirq_irqoff(TASKLET_SOFTIRQ); | 454 | raise_softirq_irqoff(TASKLET_SOFTIRQ); |
448 | local_irq_restore(flags); | 455 | local_irq_restore(flags); |
449 | } | 456 | } |
450 | |||
451 | EXPORT_SYMBOL(__tasklet_schedule); | 457 | EXPORT_SYMBOL(__tasklet_schedule); |
452 | 458 | ||
453 | void __tasklet_hi_schedule(struct tasklet_struct *t) | 459 | void __tasklet_hi_schedule(struct tasklet_struct *t) |
@@ -461,7 +467,6 @@ void __tasklet_hi_schedule(struct tasklet_struct *t) | |||
461 | raise_softirq_irqoff(HI_SOFTIRQ); | 467 | raise_softirq_irqoff(HI_SOFTIRQ); |
462 | local_irq_restore(flags); | 468 | local_irq_restore(flags); |
463 | } | 469 | } |
464 | |||
465 | EXPORT_SYMBOL(__tasklet_hi_schedule); | 470 | EXPORT_SYMBOL(__tasklet_hi_schedule); |
466 | 471 | ||
467 | void __tasklet_hi_schedule_first(struct tasklet_struct *t) | 472 | void __tasklet_hi_schedule_first(struct tasklet_struct *t) |
@@ -472,7 +477,6 @@ void __tasklet_hi_schedule_first(struct tasklet_struct *t) | |||
472 | __this_cpu_write(tasklet_hi_vec.head, t); | 477 | __this_cpu_write(tasklet_hi_vec.head, t); |
473 | __raise_softirq_irqoff(HI_SOFTIRQ); | 478 | __raise_softirq_irqoff(HI_SOFTIRQ); |
474 | } | 479 | } |
475 | |||
476 | EXPORT_SYMBOL(__tasklet_hi_schedule_first); | 480 | EXPORT_SYMBOL(__tasklet_hi_schedule_first); |
477 | 481 | ||
478 | static void tasklet_action(struct softirq_action *a) | 482 | static void tasklet_action(struct softirq_action *a) |
@@ -492,7 +496,8 @@ static void tasklet_action(struct softirq_action *a) | |||
492 | 496 | ||
493 | if (tasklet_trylock(t)) { | 497 | if (tasklet_trylock(t)) { |
494 | if (!atomic_read(&t->count)) { | 498 | if (!atomic_read(&t->count)) { |
495 | if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) | 499 | if (!test_and_clear_bit(TASKLET_STATE_SCHED, |
500 | &t->state)) | ||
496 | BUG(); | 501 | BUG(); |
497 | t->func(t->data); | 502 | t->func(t->data); |
498 | tasklet_unlock(t); | 503 | tasklet_unlock(t); |
@@ -527,7 +532,8 @@ static void tasklet_hi_action(struct softirq_action *a) | |||
527 | 532 | ||
528 | if (tasklet_trylock(t)) { | 533 | if (tasklet_trylock(t)) { |
529 | if (!atomic_read(&t->count)) { | 534 | if (!atomic_read(&t->count)) { |
530 | if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) | 535 | if (!test_and_clear_bit(TASKLET_STATE_SCHED, |
536 | &t->state)) | ||
531 | BUG(); | 537 | BUG(); |
532 | t->func(t->data); | 538 | t->func(t->data); |
533 | tasklet_unlock(t); | 539 | tasklet_unlock(t); |
@@ -545,7 +551,6 @@ static void tasklet_hi_action(struct softirq_action *a) | |||
545 | } | 551 | } |
546 | } | 552 | } |
547 | 553 | ||
548 | |||
549 | void tasklet_init(struct tasklet_struct *t, | 554 | void tasklet_init(struct tasklet_struct *t, |
550 | void (*func)(unsigned long), unsigned long data) | 555 | void (*func)(unsigned long), unsigned long data) |
551 | { | 556 | { |
@@ -555,13 +560,12 @@ void tasklet_init(struct tasklet_struct *t, | |||
555 | t->func = func; | 560 | t->func = func; |
556 | t->data = data; | 561 | t->data = data; |
557 | } | 562 | } |
558 | |||
559 | EXPORT_SYMBOL(tasklet_init); | 563 | EXPORT_SYMBOL(tasklet_init); |
560 | 564 | ||
561 | void tasklet_kill(struct tasklet_struct *t) | 565 | void tasklet_kill(struct tasklet_struct *t) |
562 | { | 566 | { |
563 | if (in_interrupt()) | 567 | if (in_interrupt()) |
564 | printk("Attempt to kill tasklet from interrupt\n"); | 568 | pr_notice("Attempt to kill tasklet from interrupt\n"); |
565 | 569 | ||
566 | while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) { | 570 | while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) { |
567 | do { | 571 | do { |
@@ -571,7 +575,6 @@ void tasklet_kill(struct tasklet_struct *t) | |||
571 | tasklet_unlock_wait(t); | 575 | tasklet_unlock_wait(t); |
572 | clear_bit(TASKLET_STATE_SCHED, &t->state); | 576 | clear_bit(TASKLET_STATE_SCHED, &t->state); |
573 | } | 577 | } |
574 | |||
575 | EXPORT_SYMBOL(tasklet_kill); | 578 | EXPORT_SYMBOL(tasklet_kill); |
576 | 579 | ||
577 | /* | 580 | /* |
@@ -721,9 +724,8 @@ static void takeover_tasklets(unsigned int cpu) | |||
721 | } | 724 | } |
722 | #endif /* CONFIG_HOTPLUG_CPU */ | 725 | #endif /* CONFIG_HOTPLUG_CPU */ |
723 | 726 | ||
724 | static int cpu_callback(struct notifier_block *nfb, | 727 | static int cpu_callback(struct notifier_block *nfb, unsigned long action, |
725 | unsigned long action, | 728 | void *hcpu) |
726 | void *hcpu) | ||
727 | { | 729 | { |
728 | switch (action) { | 730 | switch (action) { |
729 | #ifdef CONFIG_HOTPLUG_CPU | 731 | #ifdef CONFIG_HOTPLUG_CPU |
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 84571e09c907..01fbae5b97b7 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c | |||
@@ -293,7 +293,7 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * | |||
293 | */ | 293 | */ |
294 | smp_call_function_single(min(cpu1, cpu2), | 294 | smp_call_function_single(min(cpu1, cpu2), |
295 | &irq_cpu_stop_queue_work, | 295 | &irq_cpu_stop_queue_work, |
296 | &call_args, 0); | 296 | &call_args, 1); |
297 | lg_local_unlock(&stop_cpus_lock); | 297 | lg_local_unlock(&stop_cpus_lock); |
298 | preempt_enable(); | 298 | preempt_enable(); |
299 | 299 | ||
diff --git a/kernel/sys.c b/kernel/sys.c index c72311324ea7..c0a58be780a4 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -895,8 +895,7 @@ SYSCALL_DEFINE1(times, struct tms __user *, tbuf) | |||
895 | * only important on a multi-user system anyway, to make sure one user | 895 | * only important on a multi-user system anyway, to make sure one user |
896 | * can't send a signal to a process owned by another. -TYT, 12/12/91 | 896 | * can't send a signal to a process owned by another. -TYT, 12/12/91 |
897 | * | 897 | * |
898 | * Auch. Had to add the 'did_exec' flag to conform completely to POSIX. | 898 | * !PF_FORKNOEXEC check to conform completely to POSIX. |
899 | * LBT 04.03.94 | ||
900 | */ | 899 | */ |
901 | SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid) | 900 | SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid) |
902 | { | 901 | { |
@@ -932,7 +931,7 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid) | |||
932 | if (task_session(p) != task_session(group_leader)) | 931 | if (task_session(p) != task_session(group_leader)) |
933 | goto out; | 932 | goto out; |
934 | err = -EACCES; | 933 | err = -EACCES; |
935 | if (p->did_exec) | 934 | if (!(p->flags & PF_FORKNOEXEC)) |
936 | goto out; | 935 | goto out; |
937 | } else { | 936 | } else { |
938 | err = -ESRCH; | 937 | err = -ESRCH; |
@@ -1572,8 +1571,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) | |||
1572 | t = p; | 1571 | t = p; |
1573 | do { | 1572 | do { |
1574 | accumulate_thread_rusage(t, r); | 1573 | accumulate_thread_rusage(t, r); |
1575 | t = next_thread(t); | 1574 | } while_each_thread(p, t); |
1576 | } while (t != p); | ||
1577 | break; | 1575 | break; |
1578 | 1576 | ||
1579 | default: | 1577 | default: |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 34a604726d0b..49e13e1f8fe6 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -62,6 +62,7 @@ | |||
62 | #include <linux/capability.h> | 62 | #include <linux/capability.h> |
63 | #include <linux/binfmts.h> | 63 | #include <linux/binfmts.h> |
64 | #include <linux/sched/sysctl.h> | 64 | #include <linux/sched/sysctl.h> |
65 | #include <linux/kexec.h> | ||
65 | 66 | ||
66 | #include <asm/uaccess.h> | 67 | #include <asm/uaccess.h> |
67 | #include <asm/processor.h> | 68 | #include <asm/processor.h> |
@@ -95,8 +96,6 @@ | |||
95 | #if defined(CONFIG_SYSCTL) | 96 | #if defined(CONFIG_SYSCTL) |
96 | 97 | ||
97 | /* External variables not in a header file. */ | 98 | /* External variables not in a header file. */ |
98 | extern int sysctl_overcommit_memory; | ||
99 | extern int sysctl_overcommit_ratio; | ||
100 | extern int max_threads; | 99 | extern int max_threads; |
101 | extern int suid_dumpable; | 100 | extern int suid_dumpable; |
102 | #ifdef CONFIG_COREDUMP | 101 | #ifdef CONFIG_COREDUMP |
@@ -122,6 +121,8 @@ extern int blk_iopoll_enabled; | |||
122 | static int sixty = 60; | 121 | static int sixty = 60; |
123 | #endif | 122 | #endif |
124 | 123 | ||
124 | static int __maybe_unused neg_one = -1; | ||
125 | |||
125 | static int zero; | 126 | static int zero; |
126 | static int __maybe_unused one = 1; | 127 | static int __maybe_unused one = 1; |
127 | static int __maybe_unused two = 2; | 128 | static int __maybe_unused two = 2; |
@@ -385,19 +386,21 @@ static struct ctl_table kern_table[] = { | |||
385 | .proc_handler = proc_dointvec, | 386 | .proc_handler = proc_dointvec, |
386 | }, | 387 | }, |
387 | { | 388 | { |
388 | .procname = "numa_balancing_settle_count", | ||
389 | .data = &sysctl_numa_balancing_settle_count, | ||
390 | .maxlen = sizeof(unsigned int), | ||
391 | .mode = 0644, | ||
392 | .proc_handler = proc_dointvec, | ||
393 | }, | ||
394 | { | ||
395 | .procname = "numa_balancing_migrate_deferred", | 389 | .procname = "numa_balancing_migrate_deferred", |
396 | .data = &sysctl_numa_balancing_migrate_deferred, | 390 | .data = &sysctl_numa_balancing_migrate_deferred, |
397 | .maxlen = sizeof(unsigned int), | 391 | .maxlen = sizeof(unsigned int), |
398 | .mode = 0644, | 392 | .mode = 0644, |
399 | .proc_handler = proc_dointvec, | 393 | .proc_handler = proc_dointvec, |
400 | }, | 394 | }, |
395 | { | ||
396 | .procname = "numa_balancing", | ||
397 | .data = NULL, /* filled in by handler */ | ||
398 | .maxlen = sizeof(unsigned int), | ||
399 | .mode = 0644, | ||
400 | .proc_handler = sysctl_numa_balancing, | ||
401 | .extra1 = &zero, | ||
402 | .extra2 = &one, | ||
403 | }, | ||
401 | #endif /* CONFIG_NUMA_BALANCING */ | 404 | #endif /* CONFIG_NUMA_BALANCING */ |
402 | #endif /* CONFIG_SCHED_DEBUG */ | 405 | #endif /* CONFIG_SCHED_DEBUG */ |
403 | { | 406 | { |
@@ -614,6 +617,18 @@ static struct ctl_table kern_table[] = { | |||
614 | .proc_handler = proc_dointvec, | 617 | .proc_handler = proc_dointvec, |
615 | }, | 618 | }, |
616 | #endif | 619 | #endif |
620 | #ifdef CONFIG_KEXEC | ||
621 | { | ||
622 | .procname = "kexec_load_disabled", | ||
623 | .data = &kexec_load_disabled, | ||
624 | .maxlen = sizeof(int), | ||
625 | .mode = 0644, | ||
626 | /* only handle a transition from default "0" to "1" */ | ||
627 | .proc_handler = proc_dointvec_minmax, | ||
628 | .extra1 = &one, | ||
629 | .extra2 = &one, | ||
630 | }, | ||
631 | #endif | ||
617 | #ifdef CONFIG_MODULES | 632 | #ifdef CONFIG_MODULES |
618 | { | 633 | { |
619 | .procname = "modprobe", | 634 | .procname = "modprobe", |
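The kexec_load_disabled entry added in the hunk above is a one-way switch: both .extra1 and .extra2 point at `one`, so proc_dointvec_minmax() only accepts written values inside [1, 1]. A minimal userspace sketch of that clamping logic (my paraphrase, not the kernel handler itself):

#include <stdio.h>
#include <errno.h>

static int kexec_load_disabled;			/* boots as 0 */

/* rough model of proc_dointvec_minmax() with *extra1 == *extra2 == 1 */
static int write_sysctl(int *val, int new, int min, int max)
{
	if (new < min || new > max)
		return -EINVAL;			/* rejected, value unchanged */
	*val = new;
	return 0;
}

int main(void)
{
	printf("write 1: %d\n", write_sysctl(&kexec_load_disabled, 1, 1, 1));	/* 0: accepted */
	printf("write 0: %d\n", write_sysctl(&kexec_load_disabled, 0, 1, 1));	/* -EINVAL */
	printf("value stays %d\n", kexec_load_disabled);			/* 1 */
	return 0;
}

Once user space has flipped it to 1 there is no accepted value that turns it back off, matching the "only handle a transition from default 0 to 1" comment.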
@@ -984,9 +999,10 @@ static struct ctl_table kern_table[] = { | |||
984 | { | 999 | { |
985 | .procname = "hung_task_warnings", | 1000 | .procname = "hung_task_warnings", |
986 | .data = &sysctl_hung_task_warnings, | 1001 | .data = &sysctl_hung_task_warnings, |
987 | .maxlen = sizeof(unsigned long), | 1002 | .maxlen = sizeof(int), |
988 | .mode = 0644, | 1003 | .mode = 0644, |
989 | .proc_handler = proc_doulongvec_minmax, | 1004 | .proc_handler = proc_dointvec_minmax, |
1005 | .extra1 = &neg_one, | ||
990 | }, | 1006 | }, |
991 | #endif | 1007 | #endif |
992 | #ifdef CONFIG_COMPAT | 1008 | #ifdef CONFIG_COMPAT |
@@ -1128,7 +1144,14 @@ static struct ctl_table vm_table[] = { | |||
1128 | .data = &sysctl_overcommit_ratio, | 1144 | .data = &sysctl_overcommit_ratio, |
1129 | .maxlen = sizeof(sysctl_overcommit_ratio), | 1145 | .maxlen = sizeof(sysctl_overcommit_ratio), |
1130 | .mode = 0644, | 1146 | .mode = 0644, |
1131 | .proc_handler = proc_dointvec, | 1147 | .proc_handler = overcommit_ratio_handler, |
1148 | }, | ||
1149 | { | ||
1150 | .procname = "overcommit_kbytes", | ||
1151 | .data = &sysctl_overcommit_kbytes, | ||
1152 | .maxlen = sizeof(sysctl_overcommit_kbytes), | ||
1153 | .mode = 0644, | ||
1154 | .proc_handler = overcommit_kbytes_handler, | ||
1132 | }, | 1155 | }, |
1133 | { | 1156 | { |
1134 | .procname = "page-cluster", | 1157 | .procname = "page-cluster", |
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 7a925ba456fb..a6a5bf53e86d 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c | |||
@@ -51,7 +51,13 @@ | |||
51 | * HZ shrinks, so values greater than 8 overflow 32bits when | 51 | * HZ shrinks, so values greater than 8 overflow 32bits when |
52 | * HZ=100. | 52 | * HZ=100. |
53 | */ | 53 | */ |
54 | #if HZ < 34 | ||
55 | #define JIFFIES_SHIFT 6 | ||
56 | #elif HZ < 67 | ||
57 | #define JIFFIES_SHIFT 7 | ||
58 | #else | ||
54 | #define JIFFIES_SHIFT 8 | 59 | #define JIFFIES_SHIFT 8 |
60 | #endif | ||
55 | 61 | ||
56 | static cycle_t jiffies_read(struct clocksource *cs) | 62 | static cycle_t jiffies_read(struct clocksource *cs) |
57 | { | 63 | { |
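The new #if ladder keeps the jiffies clocksource mult, which is roughly (NSEC_PER_SEC / HZ) << JIFFIES_SHIFT, inside 32 bits for low HZ values. A quick standalone check of the ladder (my arithmetic, not part of the patch):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const int hz[] = { 24, 32, 48, 64, 100, 250, 1000 };

	for (unsigned int i = 0; i < sizeof(hz) / sizeof(hz[0]); i++) {
		uint64_t nsec_per_jiffy = 1000000000ull / hz[i];
		int shift = (hz[i] < 34) ? 6 : (hz[i] < 67) ? 7 : 8;	/* the ladder above */
		uint64_t mult = nsec_per_jiffy << shift;

		printf("HZ=%4d shift=%d mult=%11llu %s\n", hz[i], shift,
		       (unsigned long long)mult,
		       mult <= 0xffffffffull ? "fits in 32 bits" : "overflows");
	}
	return 0;
}

At HZ=24, for example, keeping the old shift of 8 would give a mult of about 1.07e10 and overflow, while shift 6 stays near 2.7e9.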
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 68b799375981..4d23dc4d8139 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c | |||
@@ -74,7 +74,7 @@ unsigned long long notrace sched_clock(void) | |||
74 | return cd.epoch_ns; | 74 | return cd.epoch_ns; |
75 | 75 | ||
76 | do { | 76 | do { |
77 | seq = read_seqcount_begin(&cd.seq); | 77 | seq = raw_read_seqcount_begin(&cd.seq); |
78 | epoch_cyc = cd.epoch_cyc; | 78 | epoch_cyc = cd.epoch_cyc; |
79 | epoch_ns = cd.epoch_ns; | 79 | epoch_ns = cd.epoch_ns; |
80 | } while (read_seqcount_retry(&cd.seq, seq)); | 80 | } while (read_seqcount_retry(&cd.seq, seq)); |
@@ -99,10 +99,10 @@ static void notrace update_sched_clock(void) | |||
99 | cd.mult, cd.shift); | 99 | cd.mult, cd.shift); |
100 | 100 | ||
101 | raw_local_irq_save(flags); | 101 | raw_local_irq_save(flags); |
102 | write_seqcount_begin(&cd.seq); | 102 | raw_write_seqcount_begin(&cd.seq); |
103 | cd.epoch_ns = ns; | 103 | cd.epoch_ns = ns; |
104 | cd.epoch_cyc = cyc; | 104 | cd.epoch_cyc = cyc; |
105 | write_seqcount_end(&cd.seq); | 105 | raw_write_seqcount_end(&cd.seq); |
106 | raw_local_irq_restore(flags); | 106 | raw_local_irq_restore(flags); |
107 | } | 107 | } |
108 | 108 | ||
@@ -116,20 +116,42 @@ static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt) | |||
116 | void __init sched_clock_register(u64 (*read)(void), int bits, | 116 | void __init sched_clock_register(u64 (*read)(void), int bits, |
117 | unsigned long rate) | 117 | unsigned long rate) |
118 | { | 118 | { |
119 | u64 res, wrap, new_mask, new_epoch, cyc, ns; | ||
120 | u32 new_mult, new_shift; | ||
121 | ktime_t new_wrap_kt; | ||
119 | unsigned long r; | 122 | unsigned long r; |
120 | u64 res, wrap; | ||
121 | char r_unit; | 123 | char r_unit; |
122 | 124 | ||
123 | if (cd.rate > rate) | 125 | if (cd.rate > rate) |
124 | return; | 126 | return; |
125 | 127 | ||
126 | WARN_ON(!irqs_disabled()); | 128 | WARN_ON(!irqs_disabled()); |
127 | read_sched_clock = read; | ||
128 | sched_clock_mask = CLOCKSOURCE_MASK(bits); | ||
129 | cd.rate = rate; | ||
130 | 129 | ||
131 | /* calculate the mult/shift to convert counter ticks to ns. */ | 130 | /* calculate the mult/shift to convert counter ticks to ns. */ |
132 | clocks_calc_mult_shift(&cd.mult, &cd.shift, rate, NSEC_PER_SEC, 3600); | 131 | clocks_calc_mult_shift(&new_mult, &new_shift, rate, NSEC_PER_SEC, 3600); |
132 | |||
133 | new_mask = CLOCKSOURCE_MASK(bits); | ||
134 | |||
135 | /* calculate how many ns until we wrap */ | ||
136 | wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask); | ||
137 | new_wrap_kt = ns_to_ktime(wrap - (wrap >> 3)); | ||
138 | |||
139 | /* update epoch for new counter and update epoch_ns from old counter */ | ||
140 | new_epoch = read(); | ||
141 | cyc = read_sched_clock(); | ||
142 | ns = cd.epoch_ns + cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask, | ||
143 | cd.mult, cd.shift); | ||
144 | |||
145 | raw_write_seqcount_begin(&cd.seq); | ||
146 | read_sched_clock = read; | ||
147 | sched_clock_mask = new_mask; | ||
148 | cd.rate = rate; | ||
149 | cd.wrap_kt = new_wrap_kt; | ||
150 | cd.mult = new_mult; | ||
151 | cd.shift = new_shift; | ||
152 | cd.epoch_cyc = new_epoch; | ||
153 | cd.epoch_ns = ns; | ||
154 | raw_write_seqcount_end(&cd.seq); | ||
133 | 155 | ||
134 | r = rate; | 156 | r = rate; |
135 | if (r >= 4000000) { | 157 | if (r >= 4000000) { |
@@ -141,22 +163,12 @@ void __init sched_clock_register(u64 (*read)(void), int bits, | |||
141 | } else | 163 | } else |
142 | r_unit = ' '; | 164 | r_unit = ' '; |
143 | 165 | ||
144 | /* calculate how many ns until we wrap */ | ||
145 | wrap = clocks_calc_max_nsecs(cd.mult, cd.shift, 0, sched_clock_mask); | ||
146 | cd.wrap_kt = ns_to_ktime(wrap - (wrap >> 3)); | ||
147 | |||
148 | /* calculate the ns resolution of this counter */ | 166 | /* calculate the ns resolution of this counter */ |
149 | res = cyc_to_ns(1ULL, cd.mult, cd.shift); | 167 | res = cyc_to_ns(1ULL, new_mult, new_shift); |
168 | |||
150 | pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n", | 169 | pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n", |
151 | bits, r, r_unit, res, wrap); | 170 | bits, r, r_unit, res, wrap); |
152 | 171 | ||
153 | update_sched_clock(); | ||
154 | |||
155 | /* | ||
156 | * Ensure that sched_clock() starts off at 0ns | ||
157 | */ | ||
158 | cd.epoch_ns = 0; | ||
159 | |||
160 | /* Enable IRQ time accounting if we have a fast enough sched_clock */ | 172 | /* Enable IRQ time accounting if we have a fast enough sched_clock */ |
161 | if (irqtime > 0 || (irqtime == -1 && rate >= 1000000)) | 173 | if (irqtime > 0 || (irqtime == -1 && rate >= 1000000)) |
162 | enable_sched_clock_irqtime(); | 174 | enable_sched_clock_irqtime(); |
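For reference, the conversion the rework keeps using is ns = (cyc * mult) >> shift, and the poll timer is armed at wrap - wrap/8, i.e. at 87.5% of the wrap period, so the epoch is refreshed before the counter can wrap. A standalone sketch with made-up example values (a 32-bit counter at 24 MHz; the real mult/shift come from clocks_calc_mult_shift()):

#include <stdio.h>
#include <stdint.h>

static uint64_t cyc_to_ns(uint64_t cyc, uint32_t mult, uint32_t shift)
{
	return (cyc * mult) >> shift;		/* same formula as the kernel helper */
}

int main(void)
{
	uint32_t mult = 10667, shift = 8;	/* ~41.67 ns per tick at 24 MHz */
	uint64_t mask = 0xffffffffull;		/* CLOCKSOURCE_MASK(32) */
	uint64_t max_cycles = UINT64_MAX / mult;/* keep cyc * mult inside 64 bits */
	uint64_t wrap;

	if (max_cycles > mask)
		max_cycles = mask;
	wrap = cyc_to_ns(max_cycles, mult, shift);

	printf("resolution  %llu ns\n", (unsigned long long)cyc_to_ns(1, mult, shift));
	printf("wraps every %llu ns, poll after %llu ns\n",
	       (unsigned long long)wrap, (unsigned long long)(wrap - (wrap >> 3)));
	return 0;
}

With these numbers the counter wraps after roughly 179 seconds and the update timer fires after roughly 156 seconds, comfortably before the wrap.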
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 9532690daaa9..98977a57ac72 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c | |||
@@ -538,10 +538,10 @@ int tick_resume_broadcast_oneshot(struct clock_event_device *bc) | |||
538 | * Called from irq_enter() when idle was interrupted to reenable the | 538 | * Called from irq_enter() when idle was interrupted to reenable the |
539 | * per cpu device. | 539 | * per cpu device. |
540 | */ | 540 | */ |
541 | void tick_check_oneshot_broadcast(int cpu) | 541 | void tick_check_oneshot_broadcast_this_cpu(void) |
542 | { | 542 | { |
543 | if (cpumask_test_cpu(cpu, tick_broadcast_oneshot_mask)) { | 543 | if (cpumask_test_cpu(smp_processor_id(), tick_broadcast_oneshot_mask)) { |
544 | struct tick_device *td = &per_cpu(tick_cpu_device, cpu); | 544 | struct tick_device *td = &__get_cpu_var(tick_cpu_device); |
545 | 545 | ||
546 | /* | 546 | /* |
547 | * We might be in the middle of switching over from | 547 | * We might be in the middle of switching over from |
@@ -756,6 +756,7 @@ out: | |||
756 | static void tick_broadcast_clear_oneshot(int cpu) | 756 | static void tick_broadcast_clear_oneshot(int cpu) |
757 | { | 757 | { |
758 | cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); | 758 | cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); |
759 | cpumask_clear_cpu(cpu, tick_broadcast_pending_mask); | ||
759 | } | 760 | } |
760 | 761 | ||
761 | static void tick_broadcast_init_next_event(struct cpumask *mask, | 762 | static void tick_broadcast_init_next_event(struct cpumask *mask, |
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 162b03ab0ad2..20b2fe37d105 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c | |||
@@ -85,6 +85,7 @@ static void tick_periodic(int cpu) | |||
85 | 85 | ||
86 | do_timer(1); | 86 | do_timer(1); |
87 | write_sequnlock(&jiffies_lock); | 87 | write_sequnlock(&jiffies_lock); |
88 | update_wall_time(); | ||
88 | } | 89 | } |
89 | 90 | ||
90 | update_process_times(user_mode(get_irq_regs())); | 91 | update_process_times(user_mode(get_irq_regs())); |
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 18e71f7fbc2a..8329669b51ec 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h | |||
@@ -51,7 +51,7 @@ extern void tick_broadcast_switch_to_oneshot(void); | |||
51 | extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup); | 51 | extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup); |
52 | extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc); | 52 | extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc); |
53 | extern int tick_broadcast_oneshot_active(void); | 53 | extern int tick_broadcast_oneshot_active(void); |
54 | extern void tick_check_oneshot_broadcast(int cpu); | 54 | extern void tick_check_oneshot_broadcast_this_cpu(void); |
55 | bool tick_broadcast_oneshot_available(void); | 55 | bool tick_broadcast_oneshot_available(void); |
56 | # else /* BROADCAST */ | 56 | # else /* BROADCAST */ |
57 | static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) | 57 | static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) |
@@ -62,7 +62,7 @@ static inline void tick_broadcast_oneshot_control(unsigned long reason) { } | |||
62 | static inline void tick_broadcast_switch_to_oneshot(void) { } | 62 | static inline void tick_broadcast_switch_to_oneshot(void) { } |
63 | static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } | 63 | static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } |
64 | static inline int tick_broadcast_oneshot_active(void) { return 0; } | 64 | static inline int tick_broadcast_oneshot_active(void) { return 0; } |
65 | static inline void tick_check_oneshot_broadcast(int cpu) { } | 65 | static inline void tick_check_oneshot_broadcast_this_cpu(void) { } |
66 | static inline bool tick_broadcast_oneshot_available(void) { return true; } | 66 | static inline bool tick_broadcast_oneshot_available(void) { return true; } |
67 | # endif /* !BROADCAST */ | 67 | # endif /* !BROADCAST */ |
68 | 68 | ||
@@ -155,3 +155,4 @@ static inline int tick_device_is_functional(struct clock_event_device *dev) | |||
155 | #endif | 155 | #endif |
156 | 156 | ||
157 | extern void do_timer(unsigned long ticks); | 157 | extern void do_timer(unsigned long ticks); |
158 | extern void update_wall_time(void); | ||
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index ea20f7d1ac2c..9f8af69c67ec 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -86,6 +86,7 @@ static void tick_do_update_jiffies64(ktime_t now) | |||
86 | tick_next_period = ktime_add(last_jiffies_update, tick_period); | 86 | tick_next_period = ktime_add(last_jiffies_update, tick_period); |
87 | } | 87 | } |
88 | write_sequnlock(&jiffies_lock); | 88 | write_sequnlock(&jiffies_lock); |
89 | update_wall_time(); | ||
89 | } | 90 | } |
90 | 91 | ||
91 | /* | 92 | /* |
@@ -177,7 +178,7 @@ static bool can_stop_full_tick(void) | |||
177 | * TODO: kick full dynticks CPUs when | 178 | * TODO: kick full dynticks CPUs when |
178 | * sched_clock_stable is set. | 179 | * sched_clock_stable is set. |
179 | */ | 180 | */ |
180 | if (!sched_clock_stable) { | 181 | if (!sched_clock_stable()) { |
181 | trace_tick_stop(0, "unstable sched clock\n"); | 182 | trace_tick_stop(0, "unstable sched clock\n"); |
182 | /* | 183 | /* |
183 | * Don't allow the user to think they can get | 184 | * Don't allow the user to think they can get |
@@ -391,11 +392,9 @@ __setup("nohz=", setup_tick_nohz); | |||
391 | */ | 392 | */ |
392 | static void tick_nohz_update_jiffies(ktime_t now) | 393 | static void tick_nohz_update_jiffies(ktime_t now) |
393 | { | 394 | { |
394 | int cpu = smp_processor_id(); | ||
395 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); | ||
396 | unsigned long flags; | 395 | unsigned long flags; |
397 | 396 | ||
398 | ts->idle_waketime = now; | 397 | __this_cpu_write(tick_cpu_sched.idle_waketime, now); |
399 | 398 | ||
400 | local_irq_save(flags); | 399 | local_irq_save(flags); |
401 | tick_do_update_jiffies64(now); | 400 | tick_do_update_jiffies64(now); |
@@ -426,17 +425,15 @@ update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_upda | |||
426 | 425 | ||
427 | } | 426 | } |
428 | 427 | ||
429 | static void tick_nohz_stop_idle(int cpu, ktime_t now) | 428 | static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now) |
430 | { | 429 | { |
431 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); | 430 | update_ts_time_stats(smp_processor_id(), ts, now, NULL); |
432 | |||
433 | update_ts_time_stats(cpu, ts, now, NULL); | ||
434 | ts->idle_active = 0; | 431 | ts->idle_active = 0; |
435 | 432 | ||
436 | sched_clock_idle_wakeup_event(0); | 433 | sched_clock_idle_wakeup_event(0); |
437 | } | 434 | } |
438 | 435 | ||
439 | static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts) | 436 | static ktime_t tick_nohz_start_idle(struct tick_sched *ts) |
440 | { | 437 | { |
441 | ktime_t now = ktime_get(); | 438 | ktime_t now = ktime_get(); |
442 | 439 | ||
@@ -536,12 +533,13 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, | |||
536 | struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; | 533 | struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; |
537 | u64 time_delta; | 534 | u64 time_delta; |
538 | 535 | ||
536 | time_delta = timekeeping_max_deferment(); | ||
537 | |||
539 | /* Read jiffies and the time when jiffies were updated last */ | 538 | /* Read jiffies and the time when jiffies were updated last */ |
540 | do { | 539 | do { |
541 | seq = read_seqbegin(&jiffies_lock); | 540 | seq = read_seqbegin(&jiffies_lock); |
542 | last_update = last_jiffies_update; | 541 | last_update = last_jiffies_update; |
543 | last_jiffies = jiffies; | 542 | last_jiffies = jiffies; |
544 | time_delta = timekeeping_max_deferment(); | ||
545 | } while (read_seqretry(&jiffies_lock, seq)); | 543 | } while (read_seqretry(&jiffies_lock, seq)); |
546 | 544 | ||
547 | if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || | 545 | if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || |
@@ -681,18 +679,18 @@ out: | |||
681 | static void tick_nohz_full_stop_tick(struct tick_sched *ts) | 679 | static void tick_nohz_full_stop_tick(struct tick_sched *ts) |
682 | { | 680 | { |
683 | #ifdef CONFIG_NO_HZ_FULL | 681 | #ifdef CONFIG_NO_HZ_FULL |
684 | int cpu = smp_processor_id(); | 682 | int cpu = smp_processor_id(); |
685 | 683 | ||
686 | if (!tick_nohz_full_cpu(cpu) || is_idle_task(current)) | 684 | if (!tick_nohz_full_cpu(cpu) || is_idle_task(current)) |
687 | return; | 685 | return; |
688 | 686 | ||
689 | if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE) | 687 | if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE) |
690 | return; | 688 | return; |
691 | 689 | ||
692 | if (!can_stop_full_tick()) | 690 | if (!can_stop_full_tick()) |
693 | return; | 691 | return; |
694 | 692 | ||
695 | tick_nohz_stop_sched_tick(ts, ktime_get(), cpu); | 693 | tick_nohz_stop_sched_tick(ts, ktime_get(), cpu); |
696 | #endif | 694 | #endif |
697 | } | 695 | } |
698 | 696 | ||
@@ -754,7 +752,7 @@ static void __tick_nohz_idle_enter(struct tick_sched *ts) | |||
754 | ktime_t now, expires; | 752 | ktime_t now, expires; |
755 | int cpu = smp_processor_id(); | 753 | int cpu = smp_processor_id(); |
756 | 754 | ||
757 | now = tick_nohz_start_idle(cpu, ts); | 755 | now = tick_nohz_start_idle(ts); |
758 | 756 | ||
759 | if (can_stop_idle_tick(cpu, ts)) { | 757 | if (can_stop_idle_tick(cpu, ts)) { |
760 | int was_stopped = ts->tick_stopped; | 758 | int was_stopped = ts->tick_stopped; |
@@ -911,8 +909,7 @@ static void tick_nohz_account_idle_ticks(struct tick_sched *ts) | |||
911 | */ | 909 | */ |
912 | void tick_nohz_idle_exit(void) | 910 | void tick_nohz_idle_exit(void) |
913 | { | 911 | { |
914 | int cpu = smp_processor_id(); | 912 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); |
915 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); | ||
916 | ktime_t now; | 913 | ktime_t now; |
917 | 914 | ||
918 | local_irq_disable(); | 915 | local_irq_disable(); |
@@ -925,7 +922,7 @@ void tick_nohz_idle_exit(void) | |||
925 | now = ktime_get(); | 922 | now = ktime_get(); |
926 | 923 | ||
927 | if (ts->idle_active) | 924 | if (ts->idle_active) |
928 | tick_nohz_stop_idle(cpu, now); | 925 | tick_nohz_stop_idle(ts, now); |
929 | 926 | ||
930 | if (ts->tick_stopped) { | 927 | if (ts->tick_stopped) { |
931 | tick_nohz_restart_sched_tick(ts, now); | 928 | tick_nohz_restart_sched_tick(ts, now); |
@@ -1009,12 +1006,10 @@ static void tick_nohz_switch_to_nohz(void) | |||
1009 | * timer and do not touch the other magic bits which need to be done | 1006 | * timer and do not touch the other magic bits which need to be done |
1010 | * when idle is left. | 1007 | * when idle is left. |
1011 | */ | 1008 | */ |
1012 | static void tick_nohz_kick_tick(int cpu, ktime_t now) | 1009 | static void tick_nohz_kick_tick(struct tick_sched *ts, ktime_t now) |
1013 | { | 1010 | { |
1014 | #if 0 | 1011 | #if 0 |
1015 | /* Switch back to 2.6.27 behaviour */ | 1012 | /* Switch back to 2.6.27 behaviour */ |
1016 | |||
1017 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); | ||
1018 | ktime_t delta; | 1013 | ktime_t delta; |
1019 | 1014 | ||
1020 | /* | 1015 | /* |
@@ -1029,36 +1024,36 @@ static void tick_nohz_kick_tick(int cpu, ktime_t now) | |||
1029 | #endif | 1024 | #endif |
1030 | } | 1025 | } |
1031 | 1026 | ||
1032 | static inline void tick_check_nohz(int cpu) | 1027 | static inline void tick_nohz_irq_enter(void) |
1033 | { | 1028 | { |
1034 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); | 1029 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); |
1035 | ktime_t now; | 1030 | ktime_t now; |
1036 | 1031 | ||
1037 | if (!ts->idle_active && !ts->tick_stopped) | 1032 | if (!ts->idle_active && !ts->tick_stopped) |
1038 | return; | 1033 | return; |
1039 | now = ktime_get(); | 1034 | now = ktime_get(); |
1040 | if (ts->idle_active) | 1035 | if (ts->idle_active) |
1041 | tick_nohz_stop_idle(cpu, now); | 1036 | tick_nohz_stop_idle(ts, now); |
1042 | if (ts->tick_stopped) { | 1037 | if (ts->tick_stopped) { |
1043 | tick_nohz_update_jiffies(now); | 1038 | tick_nohz_update_jiffies(now); |
1044 | tick_nohz_kick_tick(cpu, now); | 1039 | tick_nohz_kick_tick(ts, now); |
1045 | } | 1040 | } |
1046 | } | 1041 | } |
1047 | 1042 | ||
1048 | #else | 1043 | #else |
1049 | 1044 | ||
1050 | static inline void tick_nohz_switch_to_nohz(void) { } | 1045 | static inline void tick_nohz_switch_to_nohz(void) { } |
1051 | static inline void tick_check_nohz(int cpu) { } | 1046 | static inline void tick_nohz_irq_enter(void) { } |
1052 | 1047 | ||
1053 | #endif /* CONFIG_NO_HZ_COMMON */ | 1048 | #endif /* CONFIG_NO_HZ_COMMON */ |
1054 | 1049 | ||
1055 | /* | 1050 | /* |
1056 | * Called from irq_enter to notify about the possible interruption of idle() | 1051 | * Called from irq_enter to notify about the possible interruption of idle() |
1057 | */ | 1052 | */ |
1058 | void tick_check_idle(int cpu) | 1053 | void tick_irq_enter(void) |
1059 | { | 1054 | { |
1060 | tick_check_oneshot_broadcast(cpu); | 1055 | tick_check_oneshot_broadcast_this_cpu(); |
1061 | tick_check_nohz(cpu); | 1056 | tick_nohz_irq_enter(); |
1062 | } | 1057 | } |
1063 | 1058 | ||
1064 | /* | 1059 | /* |
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 87b4f00284c9..0aa4ce81bc16 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
@@ -77,7 +77,7 @@ static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec wtm) | |||
77 | tk->wall_to_monotonic = wtm; | 77 | tk->wall_to_monotonic = wtm; |
78 | set_normalized_timespec(&tmp, -wtm.tv_sec, -wtm.tv_nsec); | 78 | set_normalized_timespec(&tmp, -wtm.tv_sec, -wtm.tv_nsec); |
79 | tk->offs_real = timespec_to_ktime(tmp); | 79 | tk->offs_real = timespec_to_ktime(tmp); |
80 | tk->offs_tai = ktime_sub(tk->offs_real, ktime_set(tk->tai_offset, 0)); | 80 | tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tk->tai_offset, 0)); |
81 | } | 81 | } |
82 | 82 | ||
83 | static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t) | 83 | static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t) |
@@ -90,8 +90,9 @@ static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t) | |||
90 | } | 90 | } |
91 | 91 | ||
92 | /** | 92 | /** |
93 | * timekeeper_setup_internals - Set up internals to use clocksource clock. | 93 | * tk_setup_internals - Set up internals to use clocksource clock. |
94 | * | 94 | * |
95 | * @tk: The target timekeeper to setup. | ||
95 | * @clock: Pointer to clocksource. | 96 | * @clock: Pointer to clocksource. |
96 | * | 97 | * |
97 | * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment | 98 | * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment |
@@ -595,7 +596,7 @@ s32 timekeeping_get_tai_offset(void) | |||
595 | static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset) | 596 | static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset) |
596 | { | 597 | { |
597 | tk->tai_offset = tai_offset; | 598 | tk->tai_offset = tai_offset; |
598 | tk->offs_tai = ktime_sub(tk->offs_real, ktime_set(tai_offset, 0)); | 599 | tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tai_offset, 0)); |
599 | } | 600 | } |
600 | 601 | ||
601 | /** | 602 | /** |
@@ -610,6 +611,7 @@ void timekeeping_set_tai_offset(s32 tai_offset) | |||
610 | raw_spin_lock_irqsave(&timekeeper_lock, flags); | 611 | raw_spin_lock_irqsave(&timekeeper_lock, flags); |
611 | write_seqcount_begin(&timekeeper_seq); | 612 | write_seqcount_begin(&timekeeper_seq); |
612 | __timekeeping_set_tai_offset(tk, tai_offset); | 613 | __timekeeping_set_tai_offset(tk, tai_offset); |
614 | timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); | ||
613 | write_seqcount_end(&timekeeper_seq); | 615 | write_seqcount_end(&timekeeper_seq); |
614 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); | 616 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); |
615 | clock_was_set(); | 617 | clock_was_set(); |
@@ -1023,6 +1025,8 @@ static int timekeeping_suspend(void) | |||
1023 | timekeeping_suspend_time = | 1025 | timekeeping_suspend_time = |
1024 | timespec_add(timekeeping_suspend_time, delta_delta); | 1026 | timespec_add(timekeeping_suspend_time, delta_delta); |
1025 | } | 1027 | } |
1028 | |||
1029 | timekeeping_update(tk, TK_MIRROR); | ||
1026 | write_seqcount_end(&timekeeper_seq); | 1030 | write_seqcount_end(&timekeeper_seq); |
1027 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); | 1031 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); |
1028 | 1032 | ||
@@ -1130,16 +1134,6 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) | |||
1130 | * we can adjust by 1. | 1134 | * we can adjust by 1. |
1131 | */ | 1135 | */ |
1132 | error >>= 2; | 1136 | error >>= 2; |
1133 | /* | ||
1134 | * XXX - In update_wall_time, we round up to the next | ||
1135 | * nanosecond, and store the amount rounded up into | ||
1136 | * the error. This causes the likely below to be unlikely. | ||
1137 | * | ||
1138 | * The proper fix is to avoid rounding up by using | ||
1139 | * the high precision tk->xtime_nsec instead of | ||
1140 | * xtime.tv_nsec everywhere. Fixing this will take some | ||
1141 | * time. | ||
1142 | */ | ||
1143 | if (likely(error <= interval)) | 1137 | if (likely(error <= interval)) |
1144 | adj = 1; | 1138 | adj = 1; |
1145 | else | 1139 | else |
@@ -1255,7 +1249,7 @@ out_adjust: | |||
1255 | static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) | 1249 | static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) |
1256 | { | 1250 | { |
1257 | u64 nsecps = (u64)NSEC_PER_SEC << tk->shift; | 1251 | u64 nsecps = (u64)NSEC_PER_SEC << tk->shift; |
1258 | unsigned int action = 0; | 1252 | unsigned int clock_set = 0; |
1259 | 1253 | ||
1260 | while (tk->xtime_nsec >= nsecps) { | 1254 | while (tk->xtime_nsec >= nsecps) { |
1261 | int leap; | 1255 | int leap; |
@@ -1277,11 +1271,10 @@ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) | |||
1277 | 1271 | ||
1278 | __timekeeping_set_tai_offset(tk, tk->tai_offset - leap); | 1272 | __timekeeping_set_tai_offset(tk, tk->tai_offset - leap); |
1279 | 1273 | ||
1280 | clock_was_set_delayed(); | 1274 | clock_set = TK_CLOCK_WAS_SET; |
1281 | action = TK_CLOCK_WAS_SET; | ||
1282 | } | 1275 | } |
1283 | } | 1276 | } |
1284 | return action; | 1277 | return clock_set; |
1285 | } | 1278 | } |
1286 | 1279 | ||
1287 | /** | 1280 | /** |
@@ -1294,7 +1287,8 @@ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) | |||
1294 | * Returns the unconsumed cycles. | 1287 | * Returns the unconsumed cycles. |
1295 | */ | 1288 | */ |
1296 | static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, | 1289 | static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, |
1297 | u32 shift) | 1290 | u32 shift, |
1291 | unsigned int *clock_set) | ||
1298 | { | 1292 | { |
1299 | cycle_t interval = tk->cycle_interval << shift; | 1293 | cycle_t interval = tk->cycle_interval << shift; |
1300 | u64 raw_nsecs; | 1294 | u64 raw_nsecs; |
@@ -1308,7 +1302,7 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, | |||
1308 | tk->cycle_last += interval; | 1302 | tk->cycle_last += interval; |
1309 | 1303 | ||
1310 | tk->xtime_nsec += tk->xtime_interval << shift; | 1304 | tk->xtime_nsec += tk->xtime_interval << shift; |
1311 | accumulate_nsecs_to_secs(tk); | 1305 | *clock_set |= accumulate_nsecs_to_secs(tk); |
1312 | 1306 | ||
1313 | /* Accumulate raw time */ | 1307 | /* Accumulate raw time */ |
1314 | raw_nsecs = (u64)tk->raw_interval << shift; | 1308 | raw_nsecs = (u64)tk->raw_interval << shift; |
@@ -1359,14 +1353,14 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk) | |||
1359 | * update_wall_time - Uses the current clocksource to increment the wall time | 1353 | * update_wall_time - Uses the current clocksource to increment the wall time |
1360 | * | 1354 | * |
1361 | */ | 1355 | */ |
1362 | static void update_wall_time(void) | 1356 | void update_wall_time(void) |
1363 | { | 1357 | { |
1364 | struct clocksource *clock; | 1358 | struct clocksource *clock; |
1365 | struct timekeeper *real_tk = &timekeeper; | 1359 | struct timekeeper *real_tk = &timekeeper; |
1366 | struct timekeeper *tk = &shadow_timekeeper; | 1360 | struct timekeeper *tk = &shadow_timekeeper; |
1367 | cycle_t offset; | 1361 | cycle_t offset; |
1368 | int shift = 0, maxshift; | 1362 | int shift = 0, maxshift; |
1369 | unsigned int action; | 1363 | unsigned int clock_set = 0; |
1370 | unsigned long flags; | 1364 | unsigned long flags; |
1371 | 1365 | ||
1372 | raw_spin_lock_irqsave(&timekeeper_lock, flags); | 1366 | raw_spin_lock_irqsave(&timekeeper_lock, flags); |
@@ -1401,7 +1395,8 @@ static void update_wall_time(void) | |||
1401 | maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1; | 1395 | maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1; |
1402 | shift = min(shift, maxshift); | 1396 | shift = min(shift, maxshift); |
1403 | while (offset >= tk->cycle_interval) { | 1397 | while (offset >= tk->cycle_interval) { |
1404 | offset = logarithmic_accumulation(tk, offset, shift); | 1398 | offset = logarithmic_accumulation(tk, offset, shift, |
1399 | &clock_set); | ||
1405 | if (offset < tk->cycle_interval<<shift) | 1400 | if (offset < tk->cycle_interval<<shift) |
1406 | shift--; | 1401 | shift--; |
1407 | } | 1402 | } |
@@ -1419,7 +1414,7 @@ static void update_wall_time(void) | |||
1419 | * Finally, make sure that after the rounding | 1414 | * Finally, make sure that after the rounding |
1420 | * xtime_nsec isn't larger than NSEC_PER_SEC | 1415 | * xtime_nsec isn't larger than NSEC_PER_SEC |
1421 | */ | 1416 | */ |
1422 | action = accumulate_nsecs_to_secs(tk); | 1417 | clock_set |= accumulate_nsecs_to_secs(tk); |
1423 | 1418 | ||
1424 | write_seqcount_begin(&timekeeper_seq); | 1419 | write_seqcount_begin(&timekeeper_seq); |
1425 | /* Update clock->cycle_last with the new value */ | 1420 | /* Update clock->cycle_last with the new value */ |
@@ -1435,10 +1430,12 @@ static void update_wall_time(void) | |||
1435 | * updating. | 1430 | * updating. |
1436 | */ | 1431 | */ |
1437 | memcpy(real_tk, tk, sizeof(*tk)); | 1432 | memcpy(real_tk, tk, sizeof(*tk)); |
1438 | timekeeping_update(real_tk, action); | 1433 | timekeeping_update(real_tk, clock_set); |
1439 | write_seqcount_end(&timekeeper_seq); | 1434 | write_seqcount_end(&timekeeper_seq); |
1440 | out: | 1435 | out: |
1441 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); | 1436 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); |
1437 | if (clock_set) | ||
1438 | clock_was_set(); | ||
1442 | } | 1439 | } |
1443 | 1440 | ||
1444 | /** | 1441 | /** |
@@ -1583,7 +1580,6 @@ struct timespec get_monotonic_coarse(void) | |||
1583 | void do_timer(unsigned long ticks) | 1580 | void do_timer(unsigned long ticks) |
1584 | { | 1581 | { |
1585 | jiffies_64 += ticks; | 1582 | jiffies_64 += ticks; |
1586 | update_wall_time(); | ||
1587 | calc_global_load(ticks); | 1583 | calc_global_load(ticks); |
1588 | } | 1584 | } |
1589 | 1585 | ||
@@ -1698,12 +1694,14 @@ int do_adjtimex(struct timex *txc) | |||
1698 | 1694 | ||
1699 | if (tai != orig_tai) { | 1695 | if (tai != orig_tai) { |
1700 | __timekeeping_set_tai_offset(tk, tai); | 1696 | __timekeeping_set_tai_offset(tk, tai); |
1701 | update_pvclock_gtod(tk, true); | 1697 | timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); |
1702 | clock_was_set_delayed(); | ||
1703 | } | 1698 | } |
1704 | write_seqcount_end(&timekeeper_seq); | 1699 | write_seqcount_end(&timekeeper_seq); |
1705 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); | 1700 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); |
1706 | 1701 | ||
1702 | if (tai != orig_tai) | ||
1703 | clock_was_set(); | ||
1704 | |||
1707 | ntp_notify_cmos_timer(); | 1705 | ntp_notify_cmos_timer(); |
1708 | 1706 | ||
1709 | return ret; | 1707 | return ret; |
@@ -1739,4 +1737,5 @@ void xtime_update(unsigned long ticks) | |||
1739 | write_seqlock(&jiffies_lock); | 1737 | write_seqlock(&jiffies_lock); |
1740 | do_timer(ticks); | 1738 | do_timer(ticks); |
1741 | write_sequnlock(&jiffies_lock); | 1739 | write_sequnlock(&jiffies_lock); |
1740 | update_wall_time(); | ||
1742 | } | 1741 | } |
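The ktime_sub -> ktime_add changes fix the sign of offs_tai: CLOCK_TAI runs ahead of CLOCK_REALTIME by tai_offset seconds, so the monotonic-to-TAI offset has to be offs_real plus the TAI offset, not minus it. A small self-check with illustrative numbers (the 35 s value is simply the TAI-UTC offset in effect in early 2014):

#include <stdio.h>

int main(void)
{
	const long long NSEC = 1000000000LL;
	long long mono       = 123 * NSEC;		/* some CLOCK_MONOTONIC reading */
	long long offs_real  = 1390000000LL * NSEC;	/* realtime = mono + offs_real */
	long long tai_offset = 35;			/* seconds, TAI - UTC */

	long long realtime = mono + offs_real;
	long long tai      = realtime + tai_offset * NSEC;
	long long offs_tai = offs_real + tai_offset * NSEC;	/* the fixed expression */

	printf("tai == mono + offs_tai? %s\n",
	       tai == mono + offs_tai ? "yes" : "no");
	return 0;
}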
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index d7e2068e4b71..1378e84fbe39 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile | |||
@@ -50,6 +50,7 @@ ifeq ($(CONFIG_PERF_EVENTS),y) | |||
50 | obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o | 50 | obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o |
51 | endif | 51 | endif |
52 | obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o | 52 | obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o |
53 | obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o | ||
53 | obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o | 54 | obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o |
54 | obj-$(CONFIG_TRACEPOINTS) += power-traces.o | 55 | obj-$(CONFIG_TRACEPOINTS) += power-traces.o |
55 | ifeq ($(CONFIG_PM_RUNTIME),y) | 56 | ifeq ($(CONFIG_PM_RUNTIME),y) |
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index f785aef65799..b418cb0d7242 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c | |||
@@ -781,8 +781,8 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, | |||
781 | if (!error && !bio_flagged(bio, BIO_UPTODATE)) | 781 | if (!error && !bio_flagged(bio, BIO_UPTODATE)) |
782 | error = EIO; | 782 | error = EIO; |
783 | 783 | ||
784 | __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, | 784 | __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, |
785 | error, 0, NULL); | 785 | bio->bi_rw, what, error, 0, NULL); |
786 | } | 786 | } |
787 | 787 | ||
788 | static void blk_add_trace_bio_bounce(void *ignore, | 788 | static void blk_add_trace_bio_bounce(void *ignore, |
@@ -885,8 +885,9 @@ static void blk_add_trace_split(void *ignore, | |||
885 | if (bt) { | 885 | if (bt) { |
886 | __be64 rpdu = cpu_to_be64(pdu); | 886 | __be64 rpdu = cpu_to_be64(pdu); |
887 | 887 | ||
888 | __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, | 888 | __blk_add_trace(bt, bio->bi_iter.bi_sector, |
889 | BLK_TA_SPLIT, !bio_flagged(bio, BIO_UPTODATE), | 889 | bio->bi_iter.bi_size, bio->bi_rw, BLK_TA_SPLIT, |
890 | !bio_flagged(bio, BIO_UPTODATE), | ||
890 | sizeof(rpdu), &rpdu); | 891 | sizeof(rpdu), &rpdu); |
891 | } | 892 | } |
892 | } | 893 | } |
@@ -918,9 +919,9 @@ static void blk_add_trace_bio_remap(void *ignore, | |||
918 | r.device_to = cpu_to_be32(bio->bi_bdev->bd_dev); | 919 | r.device_to = cpu_to_be32(bio->bi_bdev->bd_dev); |
919 | r.sector_from = cpu_to_be64(from); | 920 | r.sector_from = cpu_to_be64(from); |
920 | 921 | ||
921 | __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, | 922 | __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, |
922 | BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE), | 923 | bio->bi_rw, BLK_TA_REMAP, |
923 | sizeof(r), &r); | 924 | !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r); |
924 | } | 925 | } |
925 | 926 | ||
926 | /** | 927 | /** |
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 72a0f81dc5a8..cd7f76d1eb86 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
@@ -85,6 +85,8 @@ int function_trace_stop __read_mostly; | |||
85 | 85 | ||
86 | /* Current function tracing op */ | 86 | /* Current function tracing op */ |
87 | struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end; | 87 | struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end; |
88 | /* What to set function_trace_op to */ | ||
89 | static struct ftrace_ops *set_function_trace_op; | ||
88 | 90 | ||
89 | /* List for set_ftrace_pid's pids. */ | 91 | /* List for set_ftrace_pid's pids. */ |
90 | LIST_HEAD(ftrace_pids); | 92 | LIST_HEAD(ftrace_pids); |
@@ -278,6 +280,29 @@ static void update_global_ops(void) | |||
278 | global_ops.func = func; | 280 | global_ops.func = func; |
279 | } | 281 | } |
280 | 282 | ||
283 | static void ftrace_sync(struct work_struct *work) | ||
284 | { | ||
285 | /* | ||
286 | * This function is just a stub to implement a hard force | ||
287 | * of synchronize_sched(). This requires synchronizing | ||
288 | * tasks even in userspace and idle. | ||
289 | * | ||
290 | * Yes, function tracing is rude. | ||
291 | */ | ||
292 | } | ||
293 | |||
294 | static void ftrace_sync_ipi(void *data) | ||
295 | { | ||
296 | /* Probably not needed, but do it anyway */ | ||
297 | smp_rmb(); | ||
298 | } | ||
299 | |||
300 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER | ||
301 | static void update_function_graph_func(void); | ||
302 | #else | ||
303 | static inline void update_function_graph_func(void) { } | ||
304 | #endif | ||
305 | |||
281 | static void update_ftrace_function(void) | 306 | static void update_ftrace_function(void) |
282 | { | 307 | { |
283 | ftrace_func_t func; | 308 | ftrace_func_t func; |
@@ -296,16 +321,61 @@ static void update_ftrace_function(void) | |||
296 | !FTRACE_FORCE_LIST_FUNC)) { | 321 | !FTRACE_FORCE_LIST_FUNC)) { |
297 | /* Set the ftrace_ops that the arch callback uses */ | 322 | /* Set the ftrace_ops that the arch callback uses */ |
298 | if (ftrace_ops_list == &global_ops) | 323 | if (ftrace_ops_list == &global_ops) |
299 | function_trace_op = ftrace_global_list; | 324 | set_function_trace_op = ftrace_global_list; |
300 | else | 325 | else |
301 | function_trace_op = ftrace_ops_list; | 326 | set_function_trace_op = ftrace_ops_list; |
302 | func = ftrace_ops_list->func; | 327 | func = ftrace_ops_list->func; |
303 | } else { | 328 | } else { |
304 | /* Just use the default ftrace_ops */ | 329 | /* Just use the default ftrace_ops */ |
305 | function_trace_op = &ftrace_list_end; | 330 | set_function_trace_op = &ftrace_list_end; |
306 | func = ftrace_ops_list_func; | 331 | func = ftrace_ops_list_func; |
307 | } | 332 | } |
308 | 333 | ||
334 | /* If there's no change, then do nothing more here */ | ||
335 | if (ftrace_trace_function == func) | ||
336 | return; | ||
337 | |||
338 | update_function_graph_func(); | ||
339 | |||
340 | /* | ||
341 | * If we are using the list function, it doesn't care | ||
342 | * about the function_trace_ops. | ||
343 | */ | ||
344 | if (func == ftrace_ops_list_func) { | ||
345 | ftrace_trace_function = func; | ||
346 | /* | ||
347 | * Don't even bother setting function_trace_ops, | ||
348 | * it would be racy to do so anyway. | ||
349 | */ | ||
350 | return; | ||
351 | } | ||
352 | |||
353 | #ifndef CONFIG_DYNAMIC_FTRACE | ||
354 | /* | ||
355 | * For static tracing, we need to be a bit more careful. | ||
356 | * The function change takes effect immediately. Thus, | ||
357 | * we need to coordinate the setting of the function_trace_ops | ||
358 | * with the setting of the ftrace_trace_function. | ||
359 | * | ||
360 | * Set the function to the list ops, which will call the | ||
361 | * function we want, albeit indirectly, but it handles the | ||
362 | * ftrace_ops and doesn't depend on function_trace_op. | ||
363 | */ | ||
364 | ftrace_trace_function = ftrace_ops_list_func; | ||
365 | /* | ||
366 | * Make sure all CPUs see this. Yes this is slow, but static | ||
367 | * tracing is slow and nasty to have enabled. | ||
368 | */ | ||
369 | schedule_on_each_cpu(ftrace_sync); | ||
370 | /* Now all cpus are using the list ops. */ | ||
371 | function_trace_op = set_function_trace_op; | ||
372 | /* Make sure the function_trace_op is visible on all CPUs */ | ||
373 | smp_wmb(); | ||
374 | /* Nasty way to force a rmb on all cpus */ | ||
375 | smp_call_function(ftrace_sync_ipi, NULL, 1); | ||
376 | /* OK, we are all set to update the ftrace_trace_function now! */ | ||
377 | #endif /* !CONFIG_DYNAMIC_FTRACE */ | ||
378 | |||
309 | ftrace_trace_function = func; | 379 | ftrace_trace_function = func; |
310 | } | 380 | } |
311 | 381 | ||
@@ -410,17 +480,6 @@ static int __register_ftrace_function(struct ftrace_ops *ops) | |||
410 | return 0; | 480 | return 0; |
411 | } | 481 | } |
412 | 482 | ||
413 | static void ftrace_sync(struct work_struct *work) | ||
414 | { | ||
415 | /* | ||
416 | * This function is just a stub to implement a hard force | ||
417 | * of synchronize_sched(). This requires synchronizing | ||
418 | * tasks even in userspace and idle. | ||
419 | * | ||
420 | * Yes, function tracing is rude. | ||
421 | */ | ||
422 | } | ||
423 | |||
424 | static int __unregister_ftrace_function(struct ftrace_ops *ops) | 483 | static int __unregister_ftrace_function(struct ftrace_ops *ops) |
425 | { | 484 | { |
426 | int ret; | 485 | int ret; |
@@ -439,20 +498,6 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) | |||
439 | } else if (ops->flags & FTRACE_OPS_FL_CONTROL) { | 498 | } else if (ops->flags & FTRACE_OPS_FL_CONTROL) { |
440 | ret = remove_ftrace_list_ops(&ftrace_control_list, | 499 | ret = remove_ftrace_list_ops(&ftrace_control_list, |
441 | &control_ops, ops); | 500 | &control_ops, ops); |
442 | if (!ret) { | ||
443 | /* | ||
444 | * The ftrace_ops is now removed from the list, | ||
445 | * so there'll be no new users. We must ensure | ||
446 | * all current users are done before we free | ||
447 | * the control data. | ||
448 | * Note synchronize_sched() is not enough, as we | ||
449 | * use preempt_disable() to do RCU, but the function | ||
450 | * tracer can be called where RCU is not active | ||
451 | * (before user_exit()). | ||
452 | */ | ||
453 | schedule_on_each_cpu(ftrace_sync); | ||
454 | control_ops_free(ops); | ||
455 | } | ||
456 | } else | 501 | } else |
457 | ret = remove_ftrace_ops(&ftrace_ops_list, ops); | 502 | ret = remove_ftrace_ops(&ftrace_ops_list, ops); |
458 | 503 | ||
@@ -462,17 +507,6 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) | |||
462 | if (ftrace_enabled) | 507 | if (ftrace_enabled) |
463 | update_ftrace_function(); | 508 | update_ftrace_function(); |
464 | 509 | ||
465 | /* | ||
466 | * Dynamic ops may be freed, we must make sure that all | ||
467 | * callers are done before leaving this function. | ||
468 | * | ||
469 | * Again, normal synchronize_sched() is not good enough. | ||
470 | * We need to do a hard force of sched synchronization. | ||
471 | */ | ||
472 | if (ops->flags & FTRACE_OPS_FL_DYNAMIC) | ||
473 | schedule_on_each_cpu(ftrace_sync); | ||
474 | |||
475 | |||
476 | return 0; | 510 | return 0; |
477 | } | 511 | } |
478 | 512 | ||
@@ -1082,19 +1116,6 @@ static __init void ftrace_profile_debugfs(struct dentry *d_tracer) | |||
1082 | 1116 | ||
1083 | static struct pid * const ftrace_swapper_pid = &init_struct_pid; | 1117 | static struct pid * const ftrace_swapper_pid = &init_struct_pid; |
1084 | 1118 | ||
1085 | loff_t | ||
1086 | ftrace_filter_lseek(struct file *file, loff_t offset, int whence) | ||
1087 | { | ||
1088 | loff_t ret; | ||
1089 | |||
1090 | if (file->f_mode & FMODE_READ) | ||
1091 | ret = seq_lseek(file, offset, whence); | ||
1092 | else | ||
1093 | file->f_pos = ret = 1; | ||
1094 | |||
1095 | return ret; | ||
1096 | } | ||
1097 | |||
1098 | #ifdef CONFIG_DYNAMIC_FTRACE | 1119 | #ifdef CONFIG_DYNAMIC_FTRACE |
1099 | 1120 | ||
1100 | #ifndef CONFIG_FTRACE_MCOUNT_RECORD | 1121 | #ifndef CONFIG_FTRACE_MCOUNT_RECORD |
@@ -1992,8 +2013,14 @@ void ftrace_modify_all_code(int command) | |||
1992 | else if (command & FTRACE_DISABLE_CALLS) | 2013 | else if (command & FTRACE_DISABLE_CALLS) |
1993 | ftrace_replace_code(0); | 2014 | ftrace_replace_code(0); |
1994 | 2015 | ||
1995 | if (update && ftrace_trace_function != ftrace_ops_list_func) | 2016 | if (update && ftrace_trace_function != ftrace_ops_list_func) { |
2017 | function_trace_op = set_function_trace_op; | ||
2018 | smp_wmb(); | ||
2019 | /* If irqs are disabled, we are in stop machine */ | ||
2020 | if (!irqs_disabled()) | ||
2021 | smp_call_function(ftrace_sync_ipi, NULL, 1); | ||
1996 | ftrace_update_ftrace_func(ftrace_trace_function); | 2022 | ftrace_update_ftrace_func(ftrace_trace_function); |
2023 | } | ||
1997 | 2024 | ||
1998 | if (command & FTRACE_START_FUNC_RET) | 2025 | if (command & FTRACE_START_FUNC_RET) |
1999 | ftrace_enable_ftrace_graph_caller(); | 2026 | ftrace_enable_ftrace_graph_caller(); |
@@ -2156,10 +2183,41 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) | |||
2156 | command |= FTRACE_UPDATE_TRACE_FUNC; | 2183 | command |= FTRACE_UPDATE_TRACE_FUNC; |
2157 | } | 2184 | } |
2158 | 2185 | ||
2159 | if (!command || !ftrace_enabled) | 2186 | if (!command || !ftrace_enabled) { |
2187 | /* | ||
2188 | * If these are control ops, they still need their | ||
2189 | * per_cpu field freed. Since function tracing is | ||
2190 | * not currently active, we can just free them | ||
2191 | * without synchronizing all CPUs. | ||
2192 | */ | ||
2193 | if (ops->flags & FTRACE_OPS_FL_CONTROL) | ||
2194 | control_ops_free(ops); | ||
2160 | return 0; | 2195 | return 0; |
2196 | } | ||
2161 | 2197 | ||
2162 | ftrace_run_update_code(command); | 2198 | ftrace_run_update_code(command); |
2199 | |||
2200 | /* | ||
2201 | * Dynamic ops may be freed, we must make sure that all | ||
2202 | * callers are done before leaving this function. | ||
2203 | * The same goes for freeing the per_cpu data of the control | ||
2204 | * ops. | ||
2205 | * | ||
2206 | * Again, normal synchronize_sched() is not good enough. | ||
2207 | * We need to do a hard force of sched synchronization. | ||
2208 | * This is because we use preempt_disable() to do RCU, but | ||
2209 | * the function tracers can be called where RCU is not watching | ||
2210 | * (like before user_exit()). We can not rely on the RCU | ||
2211 | * infrastructure to do the synchronization, thus we must do it | ||
2212 | * ourselves. | ||
2213 | */ | ||
2214 | if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_CONTROL)) { | ||
2215 | schedule_on_each_cpu(ftrace_sync); | ||
2216 | |||
2217 | if (ops->flags & FTRACE_OPS_FL_CONTROL) | ||
2218 | control_ops_free(ops); | ||
2219 | } | ||
2220 | |||
2163 | return 0; | 2221 | return 0; |
2164 | } | 2222 | } |
2165 | 2223 | ||
@@ -2739,7 +2797,7 @@ static void ftrace_filter_reset(struct ftrace_hash *hash) | |||
2739 | * routine, you can use ftrace_filter_write() for the write | 2797 | * routine, you can use ftrace_filter_write() for the write |
2740 | * routine if @flag has FTRACE_ITER_FILTER set, or | 2798 | * routine if @flag has FTRACE_ITER_FILTER set, or |
2741 | * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set. | 2799 | * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set. |
2742 | * ftrace_filter_lseek() should be used as the lseek routine, and | 2800 | * tracing_lseek() should be used as the lseek routine, and |
2743 | * release must call ftrace_regex_release(). | 2801 | * release must call ftrace_regex_release(). |
2744 | */ | 2802 | */ |
2745 | int | 2803 | int |
@@ -3767,7 +3825,7 @@ static const struct file_operations ftrace_filter_fops = { | |||
3767 | .open = ftrace_filter_open, | 3825 | .open = ftrace_filter_open, |
3768 | .read = seq_read, | 3826 | .read = seq_read, |
3769 | .write = ftrace_filter_write, | 3827 | .write = ftrace_filter_write, |
3770 | .llseek = ftrace_filter_lseek, | 3828 | .llseek = tracing_lseek, |
3771 | .release = ftrace_regex_release, | 3829 | .release = ftrace_regex_release, |
3772 | }; | 3830 | }; |
3773 | 3831 | ||
@@ -3775,7 +3833,7 @@ static const struct file_operations ftrace_notrace_fops = { | |||
3775 | .open = ftrace_notrace_open, | 3833 | .open = ftrace_notrace_open, |
3776 | .read = seq_read, | 3834 | .read = seq_read, |
3777 | .write = ftrace_notrace_write, | 3835 | .write = ftrace_notrace_write, |
3778 | .llseek = ftrace_filter_lseek, | 3836 | .llseek = tracing_lseek, |
3779 | .release = ftrace_regex_release, | 3837 | .release = ftrace_regex_release, |
3780 | }; | 3838 | }; |
3781 | 3839 | ||
@@ -4038,7 +4096,7 @@ static const struct file_operations ftrace_graph_fops = { | |||
4038 | .open = ftrace_graph_open, | 4096 | .open = ftrace_graph_open, |
4039 | .read = seq_read, | 4097 | .read = seq_read, |
4040 | .write = ftrace_graph_write, | 4098 | .write = ftrace_graph_write, |
4041 | .llseek = ftrace_filter_lseek, | 4099 | .llseek = tracing_lseek, |
4042 | .release = ftrace_graph_release, | 4100 | .release = ftrace_graph_release, |
4043 | }; | 4101 | }; |
4044 | 4102 | ||
@@ -4046,7 +4104,7 @@ static const struct file_operations ftrace_graph_notrace_fops = { | |||
4046 | .open = ftrace_graph_notrace_open, | 4104 | .open = ftrace_graph_notrace_open, |
4047 | .read = seq_read, | 4105 | .read = seq_read, |
4048 | .write = ftrace_graph_write, | 4106 | .write = ftrace_graph_write, |
4049 | .llseek = ftrace_filter_lseek, | 4107 | .llseek = tracing_lseek, |
4050 | .release = ftrace_graph_release, | 4108 | .release = ftrace_graph_release, |
4051 | }; | 4109 | }; |
4052 | #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ | 4110 | #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ |
@@ -4719,7 +4777,7 @@ static const struct file_operations ftrace_pid_fops = { | |||
4719 | .open = ftrace_pid_open, | 4777 | .open = ftrace_pid_open, |
4720 | .write = ftrace_pid_write, | 4778 | .write = ftrace_pid_write, |
4721 | .read = seq_read, | 4779 | .read = seq_read, |
4722 | .llseek = ftrace_filter_lseek, | 4780 | .llseek = tracing_lseek, |
4723 | .release = ftrace_pid_release, | 4781 | .release = ftrace_pid_release, |
4724 | }; | 4782 | }; |
4725 | 4783 | ||
@@ -4862,6 +4920,7 @@ int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) | |||
4862 | trace_func_graph_ret_t ftrace_graph_return = | 4920 | trace_func_graph_ret_t ftrace_graph_return = |
4863 | (trace_func_graph_ret_t)ftrace_stub; | 4921 | (trace_func_graph_ret_t)ftrace_stub; |
4864 | trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub; | 4922 | trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub; |
4923 | static trace_func_graph_ent_t __ftrace_graph_entry = ftrace_graph_entry_stub; | ||
4865 | 4924 | ||
4866 | /* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */ | 4925 | /* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */ |
4867 | static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) | 4926 | static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) |
@@ -5003,6 +5062,30 @@ static struct ftrace_ops fgraph_ops __read_mostly = { | |||
5003 | FTRACE_OPS_FL_RECURSION_SAFE, | 5062 | FTRACE_OPS_FL_RECURSION_SAFE, |
5004 | }; | 5063 | }; |
5005 | 5064 | ||
5065 | static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace) | ||
5066 | { | ||
5067 | if (!ftrace_ops_test(&global_ops, trace->func, NULL)) | ||
5068 | return 0; | ||
5069 | return __ftrace_graph_entry(trace); | ||
5070 | } | ||
5071 | |||
5072 | /* | ||
5073 | * The function graph tracer should only trace the functions defined | ||
5074 | * by set_ftrace_filter and set_ftrace_notrace. If another function | ||
5075 | * tracer ops is registered, the graph tracer requires testing the | ||
5076 | * function against the global ops, and not just trace any function | ||
5077 | * that any ftrace_ops registered. | ||
5078 | */ | ||
5079 | static void update_function_graph_func(void) | ||
5080 | { | ||
5081 | if (ftrace_ops_list == &ftrace_list_end || | ||
5082 | (ftrace_ops_list == &global_ops && | ||
5083 | global_ops.next == &ftrace_list_end)) | ||
5084 | ftrace_graph_entry = __ftrace_graph_entry; | ||
5085 | else | ||
5086 | ftrace_graph_entry = ftrace_graph_entry_test; | ||
5087 | } | ||
5088 | |||
5006 | int register_ftrace_graph(trace_func_graph_ret_t retfunc, | 5089 | int register_ftrace_graph(trace_func_graph_ret_t retfunc, |
5007 | trace_func_graph_ent_t entryfunc) | 5090 | trace_func_graph_ent_t entryfunc) |
5008 | { | 5091 | { |
@@ -5027,7 +5110,16 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, | |||
5027 | } | 5110 | } |
5028 | 5111 | ||
5029 | ftrace_graph_return = retfunc; | 5112 | ftrace_graph_return = retfunc; |
5030 | ftrace_graph_entry = entryfunc; | 5113 | |
5114 | /* | ||
5115 | * Update the indirect function to the entryfunc, and the | ||
5116 | * function that gets called to the entry_test first. Then | ||
5117 | * call the update fgraph entry function to determine if | ||
5118 | * the entryfunc should be called directly or not. | ||
5119 | */ | ||
5120 | __ftrace_graph_entry = entryfunc; | ||
5121 | ftrace_graph_entry = ftrace_graph_entry_test; | ||
5122 | update_function_graph_func(); | ||
5031 | 5123 | ||
5032 | ret = ftrace_startup(&fgraph_ops, FTRACE_START_FUNC_RET); | 5124 | ret = ftrace_startup(&fgraph_ops, FTRACE_START_FUNC_RET); |
5033 | 5125 | ||
@@ -5046,6 +5138,7 @@ void unregister_ftrace_graph(void) | |||
5046 | ftrace_graph_active--; | 5138 | ftrace_graph_active--; |
5047 | ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; | 5139 | ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; |
5048 | ftrace_graph_entry = ftrace_graph_entry_stub; | 5140 | ftrace_graph_entry = ftrace_graph_entry_stub; |
5141 | __ftrace_graph_entry = ftrace_graph_entry_stub; | ||
5049 | ftrace_shutdown(&fgraph_ops, FTRACE_STOP_FUNC_RET); | 5142 | ftrace_shutdown(&fgraph_ops, FTRACE_STOP_FUNC_RET); |
5050 | unregister_pm_notifier(&ftrace_suspend_notifier); | 5143 | unregister_pm_notifier(&ftrace_suspend_notifier); |
5051 | unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); | 5144 | unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); |
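The __ftrace_graph_entry indirection above follows a common pattern: keep a private pointer to the real callback and expose either it or a filtering wrapper, switching between the two as other ftrace_ops come and go. A stripped-down userspace sketch of that pattern (illustration only; the modulo predicate stands in for ftrace_ops_test() against global_ops):

#include <stdio.h>
#include <stdbool.h>

static bool other_tracers_registered;		/* stand-in for "more than global_ops on the list" */

static int graph_entry_real(int func)
{
	printf("graph-trace function %d\n", func);
	return 1;
}

/* private pointer to the real callback, like __ftrace_graph_entry */
static int (*__graph_entry)(int) = graph_entry_real;

static int graph_entry_test(int func)
{
	if (func % 2 == 0)			/* stand-in for ftrace_ops_test(&global_ops, ...) */
		return 0;			/* not in our filter: skip it */
	return __graph_entry(func);
}

/* public hook, like ftrace_graph_entry */
static int (*graph_entry)(int) = graph_entry_real;

static void update_graph_entry(void)
{
	graph_entry = other_tracers_registered ? graph_entry_test : __graph_entry;
}

int main(void)
{
	other_tracers_registered = true;
	update_graph_entry();
	graph_entry(1);				/* passes the filter, traced */
	graph_entry(2);				/* filtered out */
	return 0;
}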
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index cc2f66f68dc5..fc4da2d97f9b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
@@ -2397,6 +2397,13 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, | |||
2397 | write &= RB_WRITE_MASK; | 2397 | write &= RB_WRITE_MASK; |
2398 | tail = write - length; | 2398 | tail = write - length; |
2399 | 2399 | ||
2400 | /* | ||
2401 | * If this is the first commit on the page, then it has the same | ||
2402 | * timestamp as the page itself. | ||
2403 | */ | ||
2404 | if (!tail) | ||
2405 | delta = 0; | ||
2406 | |||
2400 | /* See if we shot past the end of this buffer page */ | 2407 | /* See if we shot past the end of this buffer page */ |
2401 | if (unlikely(write > BUF_PAGE_SIZE)) | 2408 | if (unlikely(write > BUF_PAGE_SIZE)) |
2402 | return rb_move_tail(cpu_buffer, length, tail, | 2409 | return rb_move_tail(cpu_buffer, length, tail, |
@@ -2558,7 +2565,7 @@ rb_reserve_next_event(struct ring_buffer *buffer, | |||
2558 | if (unlikely(test_time_stamp(delta))) { | 2565 | if (unlikely(test_time_stamp(delta))) { |
2559 | int local_clock_stable = 1; | 2566 | int local_clock_stable = 1; |
2560 | #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK | 2567 | #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK |
2561 | local_clock_stable = sched_clock_stable; | 2568 | local_clock_stable = sched_clock_stable(); |
2562 | #endif | 2569 | #endif |
2563 | WARN_ONCE(delta > (1ULL << 59), | 2570 | WARN_ONCE(delta > (1ULL << 59), |
2564 | KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", | 2571 | KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", |
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9d20cd9743ef..815c878f409b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
@@ -455,6 +455,9 @@ int __trace_puts(unsigned long ip, const char *str, int size) | |||
455 | unsigned long irq_flags; | 455 | unsigned long irq_flags; |
456 | int alloc; | 456 | int alloc; |
457 | 457 | ||
458 | if (unlikely(tracing_selftest_running || tracing_disabled)) | ||
459 | return 0; | ||
460 | |||
458 | alloc = sizeof(*entry) + size + 2; /* possible \n added */ | 461 | alloc = sizeof(*entry) + size + 2; /* possible \n added */ |
459 | 462 | ||
460 | local_save_flags(irq_flags); | 463 | local_save_flags(irq_flags); |
@@ -495,6 +498,9 @@ int __trace_bputs(unsigned long ip, const char *str) | |||
495 | unsigned long irq_flags; | 498 | unsigned long irq_flags; |
496 | int size = sizeof(struct bputs_entry); | 499 | int size = sizeof(struct bputs_entry); |
497 | 500 | ||
501 | if (unlikely(tracing_selftest_running || tracing_disabled)) | ||
502 | return 0; | ||
503 | |||
498 | local_save_flags(irq_flags); | 504 | local_save_flags(irq_flags); |
499 | buffer = global_trace.trace_buffer.buffer; | 505 | buffer = global_trace.trace_buffer.buffer; |
500 | event = trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size, | 506 | event = trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size, |
@@ -595,6 +601,28 @@ void free_snapshot(struct trace_array *tr) | |||
595 | } | 601 | } |
596 | 602 | ||
597 | /** | 603 | /** |
604 | * tracing_alloc_snapshot - allocate snapshot buffer. | ||
605 | * | ||
606 | * This only allocates the snapshot buffer if it isn't already | ||
607 | * allocated - it doesn't also take a snapshot. | ||
608 | * | ||
609 | * This is meant to be used in cases where the snapshot buffer needs | ||
610 | * to be set up for events that can't sleep but need to be able to | ||
611 | * trigger a snapshot. | ||
612 | */ | ||
613 | int tracing_alloc_snapshot(void) | ||
614 | { | ||
615 | struct trace_array *tr = &global_trace; | ||
616 | int ret; | ||
617 | |||
618 | ret = alloc_snapshot(tr); | ||
619 | WARN_ON(ret < 0); | ||
620 | |||
621 | return ret; | ||
622 | } | ||
623 | EXPORT_SYMBOL_GPL(tracing_alloc_snapshot); | ||
624 | |||
625 | /** | ||
598 | * trace_snapshot_alloc - allocate and take a snapshot of the current buffer. | 626 | * trace_snapshot_alloc - allocate and take a snapshot of the current buffer. |
599 | * | 627 | * |
600 | * This is similar to trace_snapshot(), but it will allocate the | 628 | * This is similar to trace_snapshot(), but it will allocate the |
@@ -607,11 +635,10 @@ void free_snapshot(struct trace_array *tr) | |||
607 | */ | 635 | */ |
608 | void tracing_snapshot_alloc(void) | 636 | void tracing_snapshot_alloc(void) |
609 | { | 637 | { |
610 | struct trace_array *tr = &global_trace; | ||
611 | int ret; | 638 | int ret; |
612 | 639 | ||
613 | ret = alloc_snapshot(tr); | 640 | ret = tracing_alloc_snapshot(); |
614 | if (WARN_ON(ret < 0)) | 641 | if (ret < 0) |
615 | return; | 642 | return; |
616 | 643 | ||
617 | tracing_snapshot(); | 644 | tracing_snapshot(); |
@@ -623,6 +650,12 @@ void tracing_snapshot(void) | |||
623 | WARN_ONCE(1, "Snapshot feature not enabled, but internal snapshot used"); | 650 | WARN_ONCE(1, "Snapshot feature not enabled, but internal snapshot used"); |
624 | } | 651 | } |
625 | EXPORT_SYMBOL_GPL(tracing_snapshot); | 652 | EXPORT_SYMBOL_GPL(tracing_snapshot); |
653 | int tracing_alloc_snapshot(void) | ||
654 | { | ||
655 | WARN_ONCE(1, "Snapshot feature not enabled, but snapshot allocation used"); | ||
656 | return -ENODEV; | ||
657 | } | ||
658 | EXPORT_SYMBOL_GPL(tracing_alloc_snapshot); | ||
626 | void tracing_snapshot_alloc(void) | 659 | void tracing_snapshot_alloc(void) |
627 | { | 660 | { |
628 | /* Give warning */ | 661 | /* Give warning */ |
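Splitting allocation out of tracing_snapshot_alloc() lets code allocate the snapshot buffer once from process context and then call tracing_snapshot() later from contexts that cannot sleep; when CONFIG_TRACER_SNAPSHOT is off, the new stub returns -ENODEV. A hedged sketch of that pattern (the demo_* names are invented; the header providing the declarations is assumed to be linux/kernel.h):

#include <linux/module.h>
#include <linux/kernel.h>       /* tracing_alloc_snapshot(), tracing_snapshot() */

static int __init demo_init(void)
{
        /* Process context: safe to allocate the snapshot buffer now. */
        int ret = tracing_alloc_snapshot();

        if (ret < 0)            /* e.g. -ENODEV without CONFIG_TRACER_SNAPSHOT */
                return ret;
        return 0;
}
module_init(demo_init);

/* Later, possibly from atomic context: only swaps in the buffer that was
 * already allocated above. */
static void __maybe_unused demo_rare_condition_hit(void)
{
        tracing_snapshot();
}

MODULE_LICENSE("GPL");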
@@ -3156,19 +3189,23 @@ tracing_write_stub(struct file *filp, const char __user *ubuf, | |||
3156 | return count; | 3189 | return count; |
3157 | } | 3190 | } |
3158 | 3191 | ||
3159 | static loff_t tracing_seek(struct file *file, loff_t offset, int origin) | 3192 | loff_t tracing_lseek(struct file *file, loff_t offset, int whence) |
3160 | { | 3193 | { |
3194 | int ret; | ||
3195 | |||
3161 | if (file->f_mode & FMODE_READ) | 3196 | if (file->f_mode & FMODE_READ) |
3162 | return seq_lseek(file, offset, origin); | 3197 | ret = seq_lseek(file, offset, whence); |
3163 | else | 3198 | else |
3164 | return 0; | 3199 | file->f_pos = ret = 0; |
3200 | |||
3201 | return ret; | ||
3165 | } | 3202 | } |
3166 | 3203 | ||
3167 | static const struct file_operations tracing_fops = { | 3204 | static const struct file_operations tracing_fops = { |
3168 | .open = tracing_open, | 3205 | .open = tracing_open, |
3169 | .read = seq_read, | 3206 | .read = seq_read, |
3170 | .write = tracing_write_stub, | 3207 | .write = tracing_write_stub, |
3171 | .llseek = tracing_seek, | 3208 | .llseek = tracing_lseek, |
3172 | .release = tracing_release, | 3209 | .release = tracing_release, |
3173 | }; | 3210 | }; |
3174 | 3211 | ||
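tracing_lseek() (the renamed, now-shared tracing_seek()) still delegates to seq_lseek() for readable opens, but for write-only opens it now resets f_pos to 0 and returns 0 instead of leaving the position untouched. From userspace the observable effect is simply that seeking a write-only trace file lands at offset 0; a small illustrative sketch (the tracing mount point is an assumption):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        /* "snapshot" uses tracing_lseek via snapshot_fops (see below);
         * the debugfs path is assumed. */
        int fd = open("/sys/kernel/debug/tracing/snapshot", O_WRONLY);

        if (fd < 0) {
                perror("open");
                return 1;
        }

        /* With the change above, a write-only open is rewound to 0. */
        off_t pos = lseek(fd, 0, SEEK_SET);

        printf("lseek returned %lld\n", (long long)pos);
        close(fd);
        return 0;
}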
@@ -3488,60 +3525,103 @@ static const char readme_msg[] = | |||
3488 | " instances\t\t- Make sub-buffers with: mkdir instances/foo\n" | 3525 | " instances\t\t- Make sub-buffers with: mkdir instances/foo\n" |
3489 | "\t\t\t Remove sub-buffer with rmdir\n" | 3526 | "\t\t\t Remove sub-buffer with rmdir\n" |
3490 | " trace_options\t\t- Set format or modify how tracing happens\n" | 3527 | " trace_options\t\t- Set format or modify how tracing happens\n" |
3491 | "\t\t\t Disable an option by adding a suffix 'no' to the option name\n" | 3528 | "\t\t\t Disable an option by adding a suffix 'no' to the\n" |
3529 | "\t\t\t option name\n" | ||
3492 | #ifdef CONFIG_DYNAMIC_FTRACE | 3530 | #ifdef CONFIG_DYNAMIC_FTRACE |
3493 | "\n available_filter_functions - list of functions that can be filtered on\n" | 3531 | "\n available_filter_functions - list of functions that can be filtered on\n" |
3494 | " set_ftrace_filter\t- echo function name in here to only trace these functions\n" | 3532 | " set_ftrace_filter\t- echo function name in here to only trace these\n" |
3495 | " accepts: func_full_name, *func_end, func_begin*, *func_middle*\n" | 3533 | "\t\t\t functions\n" |
3496 | " modules: Can select a group via module\n" | 3534 | "\t accepts: func_full_name, *func_end, func_begin*, *func_middle*\n" |
3497 | " Format: :mod:<module-name>\n" | 3535 | "\t modules: Can select a group via module\n" |
3498 | " example: echo :mod:ext3 > set_ftrace_filter\n" | 3536 | "\t Format: :mod:<module-name>\n" |
3499 | " triggers: a command to perform when function is hit\n" | 3537 | "\t example: echo :mod:ext3 > set_ftrace_filter\n" |
3500 | " Format: <function>:<trigger>[:count]\n" | 3538 | "\t triggers: a command to perform when function is hit\n" |
3501 | " trigger: traceon, traceoff\n" | 3539 | "\t Format: <function>:<trigger>[:count]\n" |
3502 | " enable_event:<system>:<event>\n" | 3540 | "\t trigger: traceon, traceoff\n" |
3503 | " disable_event:<system>:<event>\n" | 3541 | "\t\t enable_event:<system>:<event>\n" |
3542 | "\t\t disable_event:<system>:<event>\n" | ||
3504 | #ifdef CONFIG_STACKTRACE | 3543 | #ifdef CONFIG_STACKTRACE |
3505 | " stacktrace\n" | 3544 | "\t\t stacktrace\n" |
3506 | #endif | 3545 | #endif |
3507 | #ifdef CONFIG_TRACER_SNAPSHOT | 3546 | #ifdef CONFIG_TRACER_SNAPSHOT |
3508 | " snapshot\n" | 3547 | "\t\t snapshot\n" |
3509 | #endif | 3548 | #endif |
3510 | " example: echo do_fault:traceoff > set_ftrace_filter\n" | 3549 | "\t example: echo do_fault:traceoff > set_ftrace_filter\n" |
3511 | " echo do_trap:traceoff:3 > set_ftrace_filter\n" | 3550 | "\t echo do_trap:traceoff:3 > set_ftrace_filter\n" |
3512 | " The first one will disable tracing every time do_fault is hit\n" | 3551 | "\t The first one will disable tracing every time do_fault is hit\n" |
3513 | " The second will disable tracing at most 3 times when do_trap is hit\n" | 3552 | "\t The second will disable tracing at most 3 times when do_trap is hit\n" |
3514 | " The first time do trap is hit and it disables tracing, the counter\n" | 3553 | "\t The first time do trap is hit and it disables tracing, the\n" |
3515 | " will decrement to 2. If tracing is already disabled, the counter\n" | 3554 | "\t counter will decrement to 2. If tracing is already disabled,\n" |
3516 | " will not decrement. It only decrements when the trigger did work\n" | 3555 | "\t the counter will not decrement. It only decrements when the\n" |
3517 | " To remove trigger without count:\n" | 3556 | "\t trigger did work\n" |
3518 | " echo '!<function>:<trigger> > set_ftrace_filter\n" | 3557 | "\t To remove trigger without count:\n" |
3519 | " To remove trigger with a count:\n" | 3558 | "\t echo '!<function>:<trigger> > set_ftrace_filter\n" |
3520 | " echo '!<function>:<trigger>:0 > set_ftrace_filter\n" | 3559 | "\t To remove trigger with a count:\n" |
3560 | "\t echo '!<function>:<trigger>:0 > set_ftrace_filter\n" | ||
3521 | " set_ftrace_notrace\t- echo function name in here to never trace.\n" | 3561 | " set_ftrace_notrace\t- echo function name in here to never trace.\n" |
3522 | " accepts: func_full_name, *func_end, func_begin*, *func_middle*\n" | 3562 | "\t accepts: func_full_name, *func_end, func_begin*, *func_middle*\n" |
3523 | " modules: Can select a group via module command :mod:\n" | 3563 | "\t modules: Can select a group via module command :mod:\n" |
3524 | " Does not accept triggers\n" | 3564 | "\t Does not accept triggers\n" |
3525 | #endif /* CONFIG_DYNAMIC_FTRACE */ | 3565 | #endif /* CONFIG_DYNAMIC_FTRACE */ |
3526 | #ifdef CONFIG_FUNCTION_TRACER | 3566 | #ifdef CONFIG_FUNCTION_TRACER |
3527 | " set_ftrace_pid\t- Write pid(s) to only function trace those pids (function)\n" | 3567 | " set_ftrace_pid\t- Write pid(s) to only function trace those pids\n" |
3568 | "\t\t (function)\n" | ||
3528 | #endif | 3569 | #endif |
3529 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER | 3570 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER |
3530 | " set_graph_function\t- Trace the nested calls of a function (function_graph)\n" | 3571 | " set_graph_function\t- Trace the nested calls of a function (function_graph)\n" |
3531 | " max_graph_depth\t- Trace a limited depth of nested calls (0 is unlimited)\n" | 3572 | " max_graph_depth\t- Trace a limited depth of nested calls (0 is unlimited)\n" |
3532 | #endif | 3573 | #endif |
3533 | #ifdef CONFIG_TRACER_SNAPSHOT | 3574 | #ifdef CONFIG_TRACER_SNAPSHOT |
3534 | "\n snapshot\t\t- Like 'trace' but shows the content of the static snapshot buffer\n" | 3575 | "\n snapshot\t\t- Like 'trace' but shows the content of the static\n" |
3535 | "\t\t\t Read the contents for more information\n" | 3576 | "\t\t\t snapshot buffer. Read the contents for more\n" |
3577 | "\t\t\t information\n" | ||
3536 | #endif | 3578 | #endif |
3537 | #ifdef CONFIG_STACK_TRACER | 3579 | #ifdef CONFIG_STACK_TRACER |
3538 | " stack_trace\t\t- Shows the max stack trace when active\n" | 3580 | " stack_trace\t\t- Shows the max stack trace when active\n" |
3539 | " stack_max_size\t- Shows current max stack size that was traced\n" | 3581 | " stack_max_size\t- Shows current max stack size that was traced\n" |
3540 | "\t\t\t Write into this file to reset the max size (trigger a new trace)\n" | 3582 | "\t\t\t Write into this file to reset the max size (trigger a\n" |
3583 | "\t\t\t new trace)\n" | ||
3541 | #ifdef CONFIG_DYNAMIC_FTRACE | 3584 | #ifdef CONFIG_DYNAMIC_FTRACE |
3542 | " stack_trace_filter\t- Like set_ftrace_filter but limits what stack_trace traces\n" | 3585 | " stack_trace_filter\t- Like set_ftrace_filter but limits what stack_trace\n" |
3586 | "\t\t\t traces\n" | ||
3543 | #endif | 3587 | #endif |
3544 | #endif /* CONFIG_STACK_TRACER */ | 3588 | #endif /* CONFIG_STACK_TRACER */ |
3589 | " events/\t\t- Directory containing all trace event subsystems:\n" | ||
3590 | " enable\t\t- Write 0/1 to enable/disable tracing of all events\n" | ||
3591 | " events/<system>/\t- Directory containing all trace events for <system>:\n" | ||
3592 | " enable\t\t- Write 0/1 to enable/disable tracing of all <system>\n" | ||
3593 | "\t\t\t events\n" | ||
3594 | " filter\t\t- If set, only events passing filter are traced\n" | ||
3595 | " events/<system>/<event>/\t- Directory containing control files for\n" | ||
3596 | "\t\t\t <event>:\n" | ||
3597 | " enable\t\t- Write 0/1 to enable/disable tracing of <event>\n" | ||
3598 | " filter\t\t- If set, only events passing filter are traced\n" | ||
3599 | " trigger\t\t- If set, a command to perform when event is hit\n" | ||
3600 | "\t Format: <trigger>[:count][if <filter>]\n" | ||
3601 | "\t trigger: traceon, traceoff\n" | ||
3602 | "\t enable_event:<system>:<event>\n" | ||
3603 | "\t disable_event:<system>:<event>\n" | ||
3604 | #ifdef CONFIG_STACKTRACE | ||
3605 | "\t\t stacktrace\n" | ||
3606 | #endif | ||
3607 | #ifdef CONFIG_TRACER_SNAPSHOT | ||
3608 | "\t\t snapshot\n" | ||
3609 | #endif | ||
3610 | "\t example: echo traceoff > events/block/block_unplug/trigger\n" | ||
3611 | "\t echo traceoff:3 > events/block/block_unplug/trigger\n" | ||
3612 | "\t echo 'enable_event:kmem:kmalloc:3 if nr_rq > 1' > \\\n" | ||
3613 | "\t events/block/block_unplug/trigger\n" | ||
3614 | "\t The first disables tracing every time block_unplug is hit.\n" | ||
3615 | "\t The second disables tracing the first 3 times block_unplug is hit.\n" | ||
3616 | "\t The third enables the kmalloc event the first 3 times block_unplug\n" | ||
3617 | "\t is hit and has value of greater than 1 for the 'nr_rq' event field.\n" | ||
3618 | "\t Like function triggers, the counter is only decremented if it\n" | ||
3619 | "\t enabled or disabled tracing.\n" | ||
3620 | "\t To remove a trigger without a count:\n" | ||
3621 | "\t echo '!<trigger> > <system>/<event>/trigger\n" | ||
3622 | "\t To remove a trigger with a count:\n" | ||
3623 | "\t echo '!<trigger>:0 > <system>/<event>/trigger\n" | ||
3624 | "\t Filters can be ignored when removing a trigger.\n" | ||
3545 | ; | 3625 | ; |
3546 | 3626 | ||
3547 | static ssize_t | 3627 | static ssize_t |
@@ -4212,12 +4292,6 @@ out: | |||
4212 | return sret; | 4292 | return sret; |
4213 | } | 4293 | } |
4214 | 4294 | ||
4215 | static void tracing_pipe_buf_release(struct pipe_inode_info *pipe, | ||
4216 | struct pipe_buffer *buf) | ||
4217 | { | ||
4218 | __free_page(buf->page); | ||
4219 | } | ||
4220 | |||
4221 | static void tracing_spd_release_pipe(struct splice_pipe_desc *spd, | 4295 | static void tracing_spd_release_pipe(struct splice_pipe_desc *spd, |
4222 | unsigned int idx) | 4296 | unsigned int idx) |
4223 | { | 4297 | { |
@@ -4229,7 +4303,7 @@ static const struct pipe_buf_operations tracing_pipe_buf_ops = { | |||
4229 | .map = generic_pipe_buf_map, | 4303 | .map = generic_pipe_buf_map, |
4230 | .unmap = generic_pipe_buf_unmap, | 4304 | .unmap = generic_pipe_buf_unmap, |
4231 | .confirm = generic_pipe_buf_confirm, | 4305 | .confirm = generic_pipe_buf_confirm, |
4232 | .release = tracing_pipe_buf_release, | 4306 | .release = generic_pipe_buf_release, |
4233 | .steal = generic_pipe_buf_steal, | 4307 | .steal = generic_pipe_buf_steal, |
4234 | .get = generic_pipe_buf_get, | 4308 | .get = generic_pipe_buf_get, |
4235 | }; | 4309 | }; |
@@ -4913,7 +4987,7 @@ static const struct file_operations snapshot_fops = { | |||
4913 | .open = tracing_snapshot_open, | 4987 | .open = tracing_snapshot_open, |
4914 | .read = seq_read, | 4988 | .read = seq_read, |
4915 | .write = tracing_snapshot_write, | 4989 | .write = tracing_snapshot_write, |
4916 | .llseek = tracing_seek, | 4990 | .llseek = tracing_lseek, |
4917 | .release = tracing_snapshot_release, | 4991 | .release = tracing_snapshot_release, |
4918 | }; | 4992 | }; |
4919 | 4993 | ||
@@ -5883,6 +5957,8 @@ allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size | |||
5883 | 5957 | ||
5884 | rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0; | 5958 | rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0; |
5885 | 5959 | ||
5960 | buf->tr = tr; | ||
5961 | |||
5886 | buf->buffer = ring_buffer_alloc(size, rb_flags); | 5962 | buf->buffer = ring_buffer_alloc(size, rb_flags); |
5887 | if (!buf->buffer) | 5963 | if (!buf->buffer) |
5888 | return -ENOMEM; | 5964 | return -ENOMEM; |
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index ea189e027b80..02b592f2d4b7 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
@@ -1,3 +1,4 @@ | |||
1 | |||
1 | #ifndef _LINUX_KERNEL_TRACE_H | 2 | #ifndef _LINUX_KERNEL_TRACE_H |
2 | #define _LINUX_KERNEL_TRACE_H | 3 | #define _LINUX_KERNEL_TRACE_H |
3 | 4 | ||
@@ -587,6 +588,8 @@ void tracing_start_sched_switch_record(void); | |||
587 | int register_tracer(struct tracer *type); | 588 | int register_tracer(struct tracer *type); |
588 | int is_tracing_stopped(void); | 589 | int is_tracing_stopped(void); |
589 | 590 | ||
591 | loff_t tracing_lseek(struct file *file, loff_t offset, int whence); | ||
592 | |||
590 | extern cpumask_var_t __read_mostly tracing_buffer_mask; | 593 | extern cpumask_var_t __read_mostly tracing_buffer_mask; |
591 | 594 | ||
592 | #define for_each_tracing_cpu(cpu) \ | 595 | #define for_each_tracing_cpu(cpu) \ |
@@ -1020,6 +1023,10 @@ extern int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir, | |||
1020 | extern void print_subsystem_event_filter(struct event_subsystem *system, | 1023 | extern void print_subsystem_event_filter(struct event_subsystem *system, |
1021 | struct trace_seq *s); | 1024 | struct trace_seq *s); |
1022 | extern int filter_assign_type(const char *type); | 1025 | extern int filter_assign_type(const char *type); |
1026 | extern int create_event_filter(struct ftrace_event_call *call, | ||
1027 | char *filter_str, bool set_str, | ||
1028 | struct event_filter **filterp); | ||
1029 | extern void free_event_filter(struct event_filter *filter); | ||
1023 | 1030 | ||
1024 | struct ftrace_event_field * | 1031 | struct ftrace_event_field * |
1025 | trace_find_event_field(struct ftrace_event_call *call, char *name); | 1032 | trace_find_event_field(struct ftrace_event_call *call, char *name); |
@@ -1028,9 +1035,195 @@ extern void trace_event_enable_cmd_record(bool enable); | |||
1028 | extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr); | 1035 | extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr); |
1029 | extern int event_trace_del_tracer(struct trace_array *tr); | 1036 | extern int event_trace_del_tracer(struct trace_array *tr); |
1030 | 1037 | ||
1038 | extern struct ftrace_event_file *find_event_file(struct trace_array *tr, | ||
1039 | const char *system, | ||
1040 | const char *event); | ||
1041 | |||
1042 | static inline void *event_file_data(struct file *filp) | ||
1043 | { | ||
1044 | return ACCESS_ONCE(file_inode(filp)->i_private); | ||
1045 | } | ||
1046 | |||
1031 | extern struct mutex event_mutex; | 1047 | extern struct mutex event_mutex; |
1032 | extern struct list_head ftrace_events; | 1048 | extern struct list_head ftrace_events; |
1033 | 1049 | ||
1050 | extern const struct file_operations event_trigger_fops; | ||
1051 | |||
1052 | extern int register_trigger_cmds(void); | ||
1053 | extern void clear_event_triggers(struct trace_array *tr); | ||
1054 | |||
1055 | struct event_trigger_data { | ||
1056 | unsigned long count; | ||
1057 | int ref; | ||
1058 | struct event_trigger_ops *ops; | ||
1059 | struct event_command *cmd_ops; | ||
1060 | struct event_filter __rcu *filter; | ||
1061 | char *filter_str; | ||
1062 | void *private_data; | ||
1063 | struct list_head list; | ||
1064 | }; | ||
1065 | |||
1066 | /** | ||
1067 | * struct event_trigger_ops - callbacks for trace event triggers | ||
1068 | * | ||
1069 | * The methods in this structure provide per-event trigger hooks for | ||
1070 | * various trigger operations. | ||
1071 | * | ||
1072 | * All the methods below, except for @init() and @free(), must be | ||
1073 | * implemented. | ||
1074 | * | ||
1075 | * @func: The trigger 'probe' function called when the triggering | ||
1076 | * event occurs. The data passed into this callback is the data | ||
1077 | * that was supplied to the event_command @reg() function that | ||
1078 | * registered the trigger (see struct event_command). | ||
1079 | * | ||
1080 | * @init: An optional initialization function called for the trigger | ||
1081 | * when the trigger is registered (via the event_command reg() | ||
1082 | * function). This can be used to perform per-trigger | ||
1083 | * initialization such as incrementing a per-trigger reference | ||
1084 | * count, for instance. This is usually implemented by the | ||
1085 | * generic utility function @event_trigger_init() (see | ||
1086 | * trace_event_triggers.c). | ||
1087 | * | ||
1088 | * @free: An optional de-initialization function called for the | ||
1089 | * trigger when the trigger is unregistered (via the | ||
1090 | * event_command @reg() function). This can be used to perform | ||
1091 | * per-trigger de-initialization such as decrementing a | ||
1092 | * per-trigger reference count and freeing corresponding trigger | ||
1093 | * data, for instance. This is usually implemented by the | ||
1094 | * generic utility function @event_trigger_free() (see | ||
1095 | * trace_event_triggers.c). | ||
1096 | * | ||
1097 | * @print: The callback function invoked to have the trigger print | ||
1098 | * itself. This is usually implemented by a wrapper function | ||
1099 | * that calls the generic utility function @event_trigger_print() | ||
1100 | * (see trace_event_triggers.c). | ||
1101 | */ | ||
1102 | struct event_trigger_ops { | ||
1103 | void (*func)(struct event_trigger_data *data); | ||
1104 | int (*init)(struct event_trigger_ops *ops, | ||
1105 | struct event_trigger_data *data); | ||
1106 | void (*free)(struct event_trigger_ops *ops, | ||
1107 | struct event_trigger_data *data); | ||
1108 | int (*print)(struct seq_file *m, | ||
1109 | struct event_trigger_ops *ops, | ||
1110 | struct event_trigger_data *data); | ||
1111 | }; | ||
1112 | |||
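Per the kernel-doc above, only @func and @print must be supplied; the optional @init and @free hooks are usually the generic event_trigger_init() and event_trigger_free() helpers defined later in trace_events_trigger.c, and @print is normally a thin wrapper around event_trigger_print(). A hedged sketch of how a trigger fills in this structure (the demo_* names are invented; the sketch is assumed to live next to those helpers in trace_events_trigger.c):

/* Sketch only: a hypothetical trigger's ops table wired to the generic
 * helpers documented in this file. */
static void demo_trigger(struct event_trigger_data *data)
{
        /* the trigger's actual action would go here */
}

static int demo_trigger_print(struct seq_file *m,
                              struct event_trigger_ops *ops,
                              struct event_trigger_data *data)
{
        return event_trigger_print("demo", m, (void *)data->count,
                                   data->filter_str);
}

static struct event_trigger_ops demo_trigger_ops = {
        .func  = demo_trigger,
        .print = demo_trigger_print,
        .init  = event_trigger_init,    /* generic: bump data->ref */
        .free  = event_trigger_free,    /* generic: drop ref, free data */
};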
1113 | /** | ||
1114 | * struct event_command - callbacks and data members for event commands | ||
1115 | * | ||
1116 | * Event commands are invoked by users by writing the command name | ||
1117 | * into the 'trigger' file associated with a trace event. The | ||
1118 | * parameters associated with a specific invocation of an event | ||
1119 | * command are used to create an event trigger instance, which is | ||
1120 | * added to the list of trigger instances associated with that trace | ||
1121 | * event. When the event is hit, the set of triggers associated with | ||
1122 | * that event is invoked. | ||
1123 | * | ||
1124 | * The data members in this structure provide per-event command data | ||
1125 | * for various event commands. | ||
1126 | * | ||
1127 | * All the data members below, except for @post_trigger, must be set | ||
1128 | * for each event command. | ||
1129 | * | ||
1130 | * @name: The unique name that identifies the event command. This is | ||
1131 | * the name used when setting triggers via trigger files. | ||
1132 | * | ||
1133 | * @trigger_type: A unique id that identifies the event command | ||
1134 | * 'type'. This value has two purposes, the first to ensure that | ||
1135 | * only one trigger of the same type can be set at a given time | ||
1136 | * for a particular event e.g. it doesn't make sense to have both | ||
1137 | * a traceon and traceoff trigger attached to a single event at | ||
1138 | * the same time, so traceon and traceoff have the same type | ||
1139 | * though they have different names. The @trigger_type value is | ||
1140 | * also used as a bit value for deferring the actual trigger | ||
1141 | * action until after the current event is finished. Some | ||
1142 | * commands need to do this if they themselves log to the trace | ||
1143 | * buffer (see the @post_trigger() member below). @trigger_type | ||
1144 | * values are defined by adding new values to the trigger_type | ||
1145 | * enum in include/linux/ftrace_event.h. | ||
1146 | * | ||
1147 | * @post_trigger: A flag that says whether or not this command needs | ||
1148 | * to have its action delayed until after the current event has | ||
1149 | * been closed. Some triggers need to avoid being invoked while | ||
1150 | * an event is currently in the process of being logged, since | ||
1151 | * the trigger may itself log data into the trace buffer. Thus | ||
1152 | * we make sure the current event is committed before invoking | ||
1153 | * those triggers. To do that, the trigger invocation is split | ||
1154 | * in two - the first part checks the filter using the current | ||
1155 | * trace record; if a command has the @post_trigger flag set, it | ||
1156 | * sets a bit for itself in the return value, otherwise it | ||
1157 | * directly invokes the trigger. Once all commands have been | ||
1158 | * either invoked or set their return flag, the current record is | ||
1159 | * either committed or discarded. At that point, if any commands | ||
1160 | * have deferred their triggers, those commands are finally | ||
1161 | * invoked following the close of the current event. In other | ||
1162 | * words, if the event_trigger_ops @func() probe implementation | ||
1163 | * itself logs to the trace buffer, this flag should be set, | ||
1164 | * otherwise it can be left unspecified. | ||
1165 | * | ||
1166 | * All the methods below, except for @set_filter(), must be | ||
1167 | * implemented. | ||
1168 | * | ||
1169 | * @func: The callback function responsible for parsing and | ||
1170 | * registering the trigger written to the 'trigger' file by the | ||
1171 | * user. It allocates the trigger instance and registers it with | ||
1172 | * the appropriate trace event. It makes use of the other | ||
1173 | * event_command callback functions to orchestrate this, and is | ||
1174 | * usually implemented by the generic utility function | ||
1175 | * @event_trigger_callback() (see trace_event_triggers.c). | ||
1176 | * | ||
1177 | * @reg: Adds the trigger to the list of triggers associated with the | ||
1178 | * event, and enables the event trigger itself, after | ||
1179 | * initializing it (via the event_trigger_ops @init() function). | ||
1180 | * This is also where commands can use the @trigger_type value to | ||
1181 | * make the decision as to whether or not multiple instances of | ||
1182 | * the trigger should be allowed. This is usually implemented by | ||
1183 | * the generic utility function @register_trigger() (see | ||
1184 | * trace_event_triggers.c). | ||
1185 | * | ||
1186 | * @unreg: Removes the trigger from the list of triggers associated | ||
1187 | * with the event, and disables the event trigger itself, after | ||
1188 | * initializing it (via the event_trigger_ops @free() function). | ||
1189 | * This is usually implemented by the generic utility function | ||
1190 | * @unregister_trigger() (see trace_event_triggers.c). | ||
1191 | * | ||
1192 | * @set_filter: An optional function called to parse and set a filter | ||
1193 | * for the trigger. If no @set_filter() method is set for the | ||
1194 | * event command, filters set by the user for the command will be | ||
1195 | * ignored. This is usually implemented by the generic utility | ||
1196 | * function @set_trigger_filter() (see trace_event_triggers.c). | ||
1197 | * | ||
1198 | * @get_trigger_ops: The callback function invoked to retrieve the | ||
1199 | * event_trigger_ops implementation associated with the command. | ||
1200 | */ | ||
1201 | struct event_command { | ||
1202 | struct list_head list; | ||
1203 | char *name; | ||
1204 | enum event_trigger_type trigger_type; | ||
1205 | bool post_trigger; | ||
1206 | int (*func)(struct event_command *cmd_ops, | ||
1207 | struct ftrace_event_file *file, | ||
1208 | char *glob, char *cmd, char *params); | ||
1209 | int (*reg)(char *glob, | ||
1210 | struct event_trigger_ops *ops, | ||
1211 | struct event_trigger_data *data, | ||
1212 | struct ftrace_event_file *file); | ||
1213 | void (*unreg)(char *glob, | ||
1214 | struct event_trigger_ops *ops, | ||
1215 | struct event_trigger_data *data, | ||
1216 | struct ftrace_event_file *file); | ||
1217 | int (*set_filter)(char *filter_str, | ||
1218 | struct event_trigger_data *data, | ||
1219 | struct ftrace_event_file *file); | ||
1220 | struct event_trigger_ops *(*get_trigger_ops)(char *cmd, char *param); | ||
1221 | }; | ||
1222 | |||
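Tying the two structures together: a command normally reuses the generic plumbing named in the kernel-doc, with @func parsing the write via event_trigger_callback(), @reg/@unreg going through register_trigger()/unregister_trigger(), and @set_filter through set_trigger_filter(), all defined in trace_events_trigger.c. A hedged sketch of such a command (the "demo" name and demo_get_trigger_ops() are invented, it reuses demo_trigger_ops from the previous sketch, and the trigger_type shown is just an example bit from the enum in include/linux/ftrace_event.h):

/* Sketch only: assembling a hypothetical command from the generic helpers. */
static struct event_trigger_ops *
demo_get_trigger_ops(char *cmd, char *param)
{
        /* a real command would pick counted vs. uncounted ops here based
         * on whether a ":N" count was supplied */
        return &demo_trigger_ops;
}

static struct event_command trigger_demo_cmd = {
        .name            = "demo",
        .trigger_type    = ETT_TRACE_ONOFF,     /* example type bit */
        .func            = event_trigger_callback,
        .reg             = register_trigger,
        .unreg           = unregister_trigger,
        .set_filter      = set_trigger_filter,
        .get_trigger_ops = demo_get_trigger_ops,
};

/* registered once at init time, alongside the real commands:
 *      register_event_command(&trigger_demo_cmd);
 */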
1223 | extern int trace_event_enable_disable(struct ftrace_event_file *file, | ||
1224 | int enable, int soft_disable); | ||
1225 | extern int tracing_alloc_snapshot(void); | ||
1226 | |||
1034 | extern const char *__start___trace_bprintk_fmt[]; | 1227 | extern const char *__start___trace_bprintk_fmt[]; |
1035 | extern const char *__stop___trace_bprintk_fmt[]; | 1228 | extern const char *__stop___trace_bprintk_fmt[]; |
1036 | 1229 | ||
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index a11800ae96de..7b16d40bd64d 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
@@ -27,12 +27,6 @@ | |||
27 | 27 | ||
28 | DEFINE_MUTEX(event_mutex); | 28 | DEFINE_MUTEX(event_mutex); |
29 | 29 | ||
30 | DEFINE_MUTEX(event_storage_mutex); | ||
31 | EXPORT_SYMBOL_GPL(event_storage_mutex); | ||
32 | |||
33 | char event_storage[EVENT_STORAGE_SIZE]; | ||
34 | EXPORT_SYMBOL_GPL(event_storage); | ||
35 | |||
36 | LIST_HEAD(ftrace_events); | 30 | LIST_HEAD(ftrace_events); |
37 | static LIST_HEAD(ftrace_common_fields); | 31 | static LIST_HEAD(ftrace_common_fields); |
38 | 32 | ||
@@ -342,6 +336,12 @@ static int __ftrace_event_enable_disable(struct ftrace_event_file *file, | |||
342 | return ret; | 336 | return ret; |
343 | } | 337 | } |
344 | 338 | ||
339 | int trace_event_enable_disable(struct ftrace_event_file *file, | ||
340 | int enable, int soft_disable) | ||
341 | { | ||
342 | return __ftrace_event_enable_disable(file, enable, soft_disable); | ||
343 | } | ||
344 | |||
345 | static int ftrace_event_enable_disable(struct ftrace_event_file *file, | 345 | static int ftrace_event_enable_disable(struct ftrace_event_file *file, |
346 | int enable) | 346 | int enable) |
347 | { | 347 | { |
@@ -421,11 +421,6 @@ static void remove_subsystem(struct ftrace_subsystem_dir *dir) | |||
421 | } | 421 | } |
422 | } | 422 | } |
423 | 423 | ||
424 | static void *event_file_data(struct file *filp) | ||
425 | { | ||
426 | return ACCESS_ONCE(file_inode(filp)->i_private); | ||
427 | } | ||
428 | |||
429 | static void remove_event_file_dir(struct ftrace_event_file *file) | 424 | static void remove_event_file_dir(struct ftrace_event_file *file) |
430 | { | 425 | { |
431 | struct dentry *dir = file->dir; | 426 | struct dentry *dir = file->dir; |
@@ -1549,6 +1544,9 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file) | |||
1549 | trace_create_file("filter", 0644, file->dir, file, | 1544 | trace_create_file("filter", 0644, file->dir, file, |
1550 | &ftrace_event_filter_fops); | 1545 | &ftrace_event_filter_fops); |
1551 | 1546 | ||
1547 | trace_create_file("trigger", 0644, file->dir, file, | ||
1548 | &event_trigger_fops); | ||
1549 | |||
1552 | trace_create_file("format", 0444, file->dir, call, | 1550 | trace_create_file("format", 0444, file->dir, call, |
1553 | &ftrace_event_format_fops); | 1551 | &ftrace_event_format_fops); |
1554 | 1552 | ||
@@ -1645,6 +1643,8 @@ trace_create_new_event(struct ftrace_event_call *call, | |||
1645 | file->event_call = call; | 1643 | file->event_call = call; |
1646 | file->tr = tr; | 1644 | file->tr = tr; |
1647 | atomic_set(&file->sm_ref, 0); | 1645 | atomic_set(&file->sm_ref, 0); |
1646 | atomic_set(&file->tm_ref, 0); | ||
1647 | INIT_LIST_HEAD(&file->triggers); | ||
1648 | list_add(&file->list, &tr->events); | 1648 | list_add(&file->list, &tr->events); |
1649 | 1649 | ||
1650 | return file; | 1650 | return file; |
@@ -1771,6 +1771,16 @@ static void trace_module_add_events(struct module *mod) | |||
1771 | { | 1771 | { |
1772 | struct ftrace_event_call **call, **start, **end; | 1772 | struct ftrace_event_call **call, **start, **end; |
1773 | 1773 | ||
1774 | if (!mod->num_trace_events) | ||
1775 | return; | ||
1776 | |||
1777 | /* Don't add infrastructure for mods without tracepoints */ | ||
1778 | if (trace_module_has_bad_taint(mod)) { | ||
1779 | pr_err("%s: module has bad taint, not creating trace events\n", | ||
1780 | mod->name); | ||
1781 | return; | ||
1782 | } | ||
1783 | |||
1774 | start = mod->trace_events; | 1784 | start = mod->trace_events; |
1775 | end = mod->trace_events + mod->num_trace_events; | 1785 | end = mod->trace_events + mod->num_trace_events; |
1776 | 1786 | ||
@@ -1849,20 +1859,7 @@ __trace_add_event_dirs(struct trace_array *tr) | |||
1849 | } | 1859 | } |
1850 | } | 1860 | } |
1851 | 1861 | ||
1852 | #ifdef CONFIG_DYNAMIC_FTRACE | 1862 | struct ftrace_event_file * |
1853 | |||
1854 | /* Avoid typos */ | ||
1855 | #define ENABLE_EVENT_STR "enable_event" | ||
1856 | #define DISABLE_EVENT_STR "disable_event" | ||
1857 | |||
1858 | struct event_probe_data { | ||
1859 | struct ftrace_event_file *file; | ||
1860 | unsigned long count; | ||
1861 | int ref; | ||
1862 | bool enable; | ||
1863 | }; | ||
1864 | |||
1865 | static struct ftrace_event_file * | ||
1866 | find_event_file(struct trace_array *tr, const char *system, const char *event) | 1863 | find_event_file(struct trace_array *tr, const char *system, const char *event) |
1867 | { | 1864 | { |
1868 | struct ftrace_event_file *file; | 1865 | struct ftrace_event_file *file; |
@@ -1885,6 +1882,19 @@ find_event_file(struct trace_array *tr, const char *system, const char *event) | |||
1885 | return NULL; | 1882 | return NULL; |
1886 | } | 1883 | } |
1887 | 1884 | ||
1885 | #ifdef CONFIG_DYNAMIC_FTRACE | ||
1886 | |||
1887 | /* Avoid typos */ | ||
1888 | #define ENABLE_EVENT_STR "enable_event" | ||
1889 | #define DISABLE_EVENT_STR "disable_event" | ||
1890 | |||
1891 | struct event_probe_data { | ||
1892 | struct ftrace_event_file *file; | ||
1893 | unsigned long count; | ||
1894 | int ref; | ||
1895 | bool enable; | ||
1896 | }; | ||
1897 | |||
1888 | static void | 1898 | static void |
1889 | event_enable_probe(unsigned long ip, unsigned long parent_ip, void **_data) | 1899 | event_enable_probe(unsigned long ip, unsigned long parent_ip, void **_data) |
1890 | { | 1900 | { |
@@ -2311,6 +2321,9 @@ int event_trace_del_tracer(struct trace_array *tr) | |||
2311 | { | 2321 | { |
2312 | mutex_lock(&event_mutex); | 2322 | mutex_lock(&event_mutex); |
2313 | 2323 | ||
2324 | /* Disable any event triggers and associated soft-disabled events */ | ||
2325 | clear_event_triggers(tr); | ||
2326 | |||
2314 | /* Disable any running events */ | 2327 | /* Disable any running events */ |
2315 | __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0); | 2328 | __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0); |
2316 | 2329 | ||
@@ -2377,6 +2390,8 @@ static __init int event_trace_enable(void) | |||
2377 | 2390 | ||
2378 | register_event_cmds(); | 2391 | register_event_cmds(); |
2379 | 2392 | ||
2393 | register_trigger_cmds(); | ||
2394 | |||
2380 | return 0; | 2395 | return 0; |
2381 | } | 2396 | } |
2382 | 2397 | ||
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 2468f56dc5db..8a8631926a07 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c | |||
@@ -799,6 +799,11 @@ static void __free_filter(struct event_filter *filter) | |||
799 | kfree(filter); | 799 | kfree(filter); |
800 | } | 800 | } |
801 | 801 | ||
802 | void free_event_filter(struct event_filter *filter) | ||
803 | { | ||
804 | __free_filter(filter); | ||
805 | } | ||
806 | |||
802 | void destroy_call_preds(struct ftrace_event_call *call) | 807 | void destroy_call_preds(struct ftrace_event_call *call) |
803 | { | 808 | { |
804 | __free_filter(call->filter); | 809 | __free_filter(call->filter); |
@@ -1938,6 +1943,13 @@ static int create_filter(struct ftrace_event_call *call, | |||
1938 | return err; | 1943 | return err; |
1939 | } | 1944 | } |
1940 | 1945 | ||
1946 | int create_event_filter(struct ftrace_event_call *call, | ||
1947 | char *filter_str, bool set_str, | ||
1948 | struct event_filter **filterp) | ||
1949 | { | ||
1950 | return create_filter(call, filter_str, set_str, filterp); | ||
1951 | } | ||
1952 | |||
1941 | /** | 1953 | /** |
1942 | * create_system_filter - create a filter for an event_subsystem | 1954 | * create_system_filter - create a filter for an event_subsystem |
1943 | * @system: event_subsystem to create a filter for | 1955 | * @system: event_subsystem to create a filter for |
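These two wrappers expose the filter core to code outside trace_events_filter.c; the declarations added to trace.h above exist so the new trigger code can parse and release per-trigger filters. A hedged sketch of the intended consumer pattern (simplified from what set_trigger_filter() does; the demo_* helpers are invented and the locking/RCU handling of the real code is omitted):

/* Sketch only: building and tearing down an event filter for some
 * trigger-like consumer inside the tracing code. */
static int demo_attach_filter(struct ftrace_event_call *call,
                              char *filter_str,
                              struct event_filter **slot)
{
        struct event_filter *filter = NULL;
        int ret;

        ret = create_event_filter(call, filter_str, false, &filter);
        if (ret < 0)
                return ret;

        *slot = filter;         /* real code publishes with rcu_assign_pointer() */
        return 0;
}

static void demo_detach_filter(struct event_filter **slot)
{
        struct event_filter *filter = *slot;

        *slot = NULL;           /* real code waits for RCU readers first */
        free_event_filter(filter);
}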
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c new file mode 100644 index 000000000000..8efbb69b04f0 --- /dev/null +++ b/kernel/trace/trace_events_trigger.c | |||
@@ -0,0 +1,1437 @@ | |||
1 | /* | ||
2 | * trace_events_trigger - trace event triggers | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | ||
17 | * | ||
18 | * Copyright (C) 2013 Tom Zanussi <tom.zanussi@linux.intel.com> | ||
19 | */ | ||
20 | |||
21 | #include <linux/module.h> | ||
22 | #include <linux/ctype.h> | ||
23 | #include <linux/mutex.h> | ||
24 | #include <linux/slab.h> | ||
25 | |||
26 | #include "trace.h" | ||
27 | |||
28 | static LIST_HEAD(trigger_commands); | ||
29 | static DEFINE_MUTEX(trigger_cmd_mutex); | ||
30 | |||
31 | static void | ||
32 | trigger_data_free(struct event_trigger_data *data) | ||
33 | { | ||
34 | if (data->cmd_ops->set_filter) | ||
35 | data->cmd_ops->set_filter(NULL, data, NULL); | ||
36 | |||
37 | synchronize_sched(); /* make sure current triggers exit before free */ | ||
38 | kfree(data); | ||
39 | } | ||
40 | |||
41 | /** | ||
42 | * event_triggers_call - Call triggers associated with a trace event | ||
43 | * @file: The ftrace_event_file associated with the event | ||
44 | * @rec: The trace entry for the event, NULL for unconditional invocation | ||
45 | * | ||
46 | * For each trigger associated with an event, invoke the trigger | ||
47 | * function registered with the associated trigger command. If rec is | ||
48 | * non-NULL, it means that the trigger requires further processing and | ||
49 | * shouldn't be unconditionally invoked. If rec is non-NULL and the | ||
50 | * trigger has a filter associated with it, rec will checked against | ||
51 | * the filter and if the record matches the trigger will be invoked. | ||
52 | * If the trigger is a 'post_trigger', meaning it shouldn't be invoked | ||
53 | * in any case until the current event is written, the trigger | ||
54 | * function isn't invoked but the bit associated with the deferred | ||
55 | * trigger is set in the return value. | ||
56 | * | ||
57 | * Returns an enum event_trigger_type value containing a set bit for | ||
58 | * any trigger that should be deferred, ETT_NONE if nothing to defer. | ||
59 | * | ||
60 | * Called from tracepoint handlers (with rcu_read_lock_sched() held). | ||
61 | * | ||
62 | * Return: an enum event_trigger_type value containing a set bit for | ||
63 | * any trigger that should be deferred, ETT_NONE if nothing to defer. | ||
64 | */ | ||
65 | enum event_trigger_type | ||
66 | event_triggers_call(struct ftrace_event_file *file, void *rec) | ||
67 | { | ||
68 | struct event_trigger_data *data; | ||
69 | enum event_trigger_type tt = ETT_NONE; | ||
70 | struct event_filter *filter; | ||
71 | |||
72 | if (list_empty(&file->triggers)) | ||
73 | return tt; | ||
74 | |||
75 | list_for_each_entry_rcu(data, &file->triggers, list) { | ||
76 | if (!rec) { | ||
77 | data->ops->func(data); | ||
78 | continue; | ||
79 | } | ||
80 | filter = rcu_dereference(data->filter); | ||
81 | if (filter && !filter_match_preds(filter, rec)) | ||
82 | continue; | ||
83 | if (data->cmd_ops->post_trigger) { | ||
84 | tt |= data->cmd_ops->trigger_type; | ||
85 | continue; | ||
86 | } | ||
87 | data->ops->func(data); | ||
88 | } | ||
89 | return tt; | ||
90 | } | ||
91 | EXPORT_SYMBOL_GPL(event_triggers_call); | ||
92 | |||
93 | /** | ||
94 | * event_triggers_post_call - Call 'post_triggers' for a trace event | ||
95 | * @file: The ftrace_event_file associated with the event | ||
96 | * @tt: enum event_trigger_type containing a set bit for each trigger to invoke | ||
97 | * | ||
98 | * For each trigger associated with an event, invoke the trigger | ||
99 | * function registered with the associated trigger command, if the | ||
100 | * corresponding bit is set in the tt enum passed into this function. | ||
101 | * See @event_triggers_call for details on how those bits are set. | ||
102 | * | ||
103 | * Called from tracepoint handlers (with rcu_read_lock_sched() held). | ||
104 | */ | ||
105 | void | ||
106 | event_triggers_post_call(struct ftrace_event_file *file, | ||
107 | enum event_trigger_type tt) | ||
108 | { | ||
109 | struct event_trigger_data *data; | ||
110 | |||
111 | list_for_each_entry_rcu(data, &file->triggers, list) { | ||
112 | if (data->cmd_ops->trigger_type & tt) | ||
113 | data->ops->func(data); | ||
114 | } | ||
115 | } | ||
116 | EXPORT_SYMBOL_GPL(event_triggers_post_call); | ||
117 | |||
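As the two kernel-doc blocks above describe, these calls bracket the actual event write: event_triggers_call() runs unconditional and filter-matching triggers first and hands back a bitmask of deferred (post_trigger) types, which the handler passes to event_triggers_post_call() only after the record has been committed, so that triggers which themselves log to the buffer cannot recurse into an open event. A hedged sketch of that call sequence (write_the_record() is a placeholder for the real reserve/commit path, and the handler itself is invented):

/* Sketch only: the order of trigger calls around an event write. */
static void write_the_record(struct ftrace_event_file *file, void *rec)
{
        /* placeholder for the real ring-buffer reserve/commit path */
}

static void demo_event_handler(struct ftrace_event_file *file, void *rec)
{
        enum event_trigger_type tt;

        /* run unconditional / filter-matching triggers; post_trigger
         * commands are only reported back as set bits */
        tt = event_triggers_call(file, rec);

        write_the_record(file, rec);

        /* now it is safe to run triggers that may log to the buffer */
        if (tt != ETT_NONE)
                event_triggers_post_call(file, tt);
}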
118 | #define SHOW_AVAILABLE_TRIGGERS (void *)(1UL) | ||
119 | |||
120 | static void *trigger_next(struct seq_file *m, void *t, loff_t *pos) | ||
121 | { | ||
122 | struct ftrace_event_file *event_file = event_file_data(m->private); | ||
123 | |||
124 | if (t == SHOW_AVAILABLE_TRIGGERS) | ||
125 | return NULL; | ||
126 | |||
127 | return seq_list_next(t, &event_file->triggers, pos); | ||
128 | } | ||
129 | |||
130 | static void *trigger_start(struct seq_file *m, loff_t *pos) | ||
131 | { | ||
132 | struct ftrace_event_file *event_file; | ||
133 | |||
134 | /* ->stop() is called even if ->start() fails */ | ||
135 | mutex_lock(&event_mutex); | ||
136 | event_file = event_file_data(m->private); | ||
137 | if (unlikely(!event_file)) | ||
138 | return ERR_PTR(-ENODEV); | ||
139 | |||
140 | if (list_empty(&event_file->triggers)) | ||
141 | return *pos == 0 ? SHOW_AVAILABLE_TRIGGERS : NULL; | ||
142 | |||
143 | return seq_list_start(&event_file->triggers, *pos); | ||
144 | } | ||
145 | |||
146 | static void trigger_stop(struct seq_file *m, void *t) | ||
147 | { | ||
148 | mutex_unlock(&event_mutex); | ||
149 | } | ||
150 | |||
151 | static int trigger_show(struct seq_file *m, void *v) | ||
152 | { | ||
153 | struct event_trigger_data *data; | ||
154 | struct event_command *p; | ||
155 | |||
156 | if (v == SHOW_AVAILABLE_TRIGGERS) { | ||
157 | seq_puts(m, "# Available triggers:\n"); | ||
158 | seq_putc(m, '#'); | ||
159 | mutex_lock(&trigger_cmd_mutex); | ||
160 | list_for_each_entry_reverse(p, &trigger_commands, list) | ||
161 | seq_printf(m, " %s", p->name); | ||
162 | seq_putc(m, '\n'); | ||
163 | mutex_unlock(&trigger_cmd_mutex); | ||
164 | return 0; | ||
165 | } | ||
166 | |||
167 | data = list_entry(v, struct event_trigger_data, list); | ||
168 | data->ops->print(m, data->ops, data); | ||
169 | |||
170 | return 0; | ||
171 | } | ||
172 | |||
173 | static const struct seq_operations event_triggers_seq_ops = { | ||
174 | .start = trigger_start, | ||
175 | .next = trigger_next, | ||
176 | .stop = trigger_stop, | ||
177 | .show = trigger_show, | ||
178 | }; | ||
179 | |||
180 | static int event_trigger_regex_open(struct inode *inode, struct file *file) | ||
181 | { | ||
182 | int ret = 0; | ||
183 | |||
184 | mutex_lock(&event_mutex); | ||
185 | |||
186 | if (unlikely(!event_file_data(file))) { | ||
187 | mutex_unlock(&event_mutex); | ||
188 | return -ENODEV; | ||
189 | } | ||
190 | |||
191 | if (file->f_mode & FMODE_READ) { | ||
192 | ret = seq_open(file, &event_triggers_seq_ops); | ||
193 | if (!ret) { | ||
194 | struct seq_file *m = file->private_data; | ||
195 | m->private = file; | ||
196 | } | ||
197 | } | ||
198 | |||
199 | mutex_unlock(&event_mutex); | ||
200 | |||
201 | return ret; | ||
202 | } | ||
203 | |||
204 | static int trigger_process_regex(struct ftrace_event_file *file, char *buff) | ||
205 | { | ||
206 | char *command, *next = buff; | ||
207 | struct event_command *p; | ||
208 | int ret = -EINVAL; | ||
209 | |||
210 | command = strsep(&next, ": \t"); | ||
211 | command = (command[0] != '!') ? command : command + 1; | ||
212 | |||
213 | mutex_lock(&trigger_cmd_mutex); | ||
214 | list_for_each_entry(p, &trigger_commands, list) { | ||
215 | if (strcmp(p->name, command) == 0) { | ||
216 | ret = p->func(p, file, buff, command, next); | ||
217 | goto out_unlock; | ||
218 | } | ||
219 | } | ||
220 | out_unlock: | ||
221 | mutex_unlock(&trigger_cmd_mutex); | ||
222 | |||
223 | return ret; | ||
224 | } | ||
225 | |||
226 | static ssize_t event_trigger_regex_write(struct file *file, | ||
227 | const char __user *ubuf, | ||
228 | size_t cnt, loff_t *ppos) | ||
229 | { | ||
230 | struct ftrace_event_file *event_file; | ||
231 | ssize_t ret; | ||
232 | char *buf; | ||
233 | |||
234 | if (!cnt) | ||
235 | return 0; | ||
236 | |||
237 | if (cnt >= PAGE_SIZE) | ||
238 | return -EINVAL; | ||
239 | |||
240 | buf = (char *)__get_free_page(GFP_TEMPORARY); | ||
241 | if (!buf) | ||
242 | return -ENOMEM; | ||
243 | |||
244 | if (copy_from_user(buf, ubuf, cnt)) { | ||
245 | free_page((unsigned long)buf); | ||
246 | return -EFAULT; | ||
247 | } | ||
248 | buf[cnt] = '\0'; | ||
249 | strim(buf); | ||
250 | |||
251 | mutex_lock(&event_mutex); | ||
252 | event_file = event_file_data(file); | ||
253 | if (unlikely(!event_file)) { | ||
254 | mutex_unlock(&event_mutex); | ||
255 | free_page((unsigned long)buf); | ||
256 | return -ENODEV; | ||
257 | } | ||
258 | ret = trigger_process_regex(event_file, buf); | ||
259 | mutex_unlock(&event_mutex); | ||
260 | |||
261 | free_page((unsigned long)buf); | ||
262 | if (ret < 0) | ||
263 | goto out; | ||
264 | |||
265 | *ppos += cnt; | ||
266 | ret = cnt; | ||
267 | out: | ||
268 | return ret; | ||
269 | } | ||
270 | |||
271 | static int event_trigger_regex_release(struct inode *inode, struct file *file) | ||
272 | { | ||
273 | mutex_lock(&event_mutex); | ||
274 | |||
275 | if (file->f_mode & FMODE_READ) | ||
276 | seq_release(inode, file); | ||
277 | |||
278 | mutex_unlock(&event_mutex); | ||
279 | |||
280 | return 0; | ||
281 | } | ||
282 | |||
283 | static ssize_t | ||
284 | event_trigger_write(struct file *filp, const char __user *ubuf, | ||
285 | size_t cnt, loff_t *ppos) | ||
286 | { | ||
287 | return event_trigger_regex_write(filp, ubuf, cnt, ppos); | ||
288 | } | ||
289 | |||
290 | static int | ||
291 | event_trigger_open(struct inode *inode, struct file *filp) | ||
292 | { | ||
293 | return event_trigger_regex_open(inode, filp); | ||
294 | } | ||
295 | |||
296 | static int | ||
297 | event_trigger_release(struct inode *inode, struct file *file) | ||
298 | { | ||
299 | return event_trigger_regex_release(inode, file); | ||
300 | } | ||
301 | |||
302 | const struct file_operations event_trigger_fops = { | ||
303 | .open = event_trigger_open, | ||
304 | .read = seq_read, | ||
305 | .write = event_trigger_write, | ||
306 | .llseek = tracing_lseek, | ||
307 | .release = event_trigger_release, | ||
308 | }; | ||
309 | |||
310 | /* | ||
311 | * Currently we only register event commands from __init, so mark this | ||
312 | * __init too. | ||
313 | */ | ||
314 | static __init int register_event_command(struct event_command *cmd) | ||
315 | { | ||
316 | struct event_command *p; | ||
317 | int ret = 0; | ||
318 | |||
319 | mutex_lock(&trigger_cmd_mutex); | ||
320 | list_for_each_entry(p, &trigger_commands, list) { | ||
321 | if (strcmp(cmd->name, p->name) == 0) { | ||
322 | ret = -EBUSY; | ||
323 | goto out_unlock; | ||
324 | } | ||
325 | } | ||
326 | list_add(&cmd->list, &trigger_commands); | ||
327 | out_unlock: | ||
328 | mutex_unlock(&trigger_cmd_mutex); | ||
329 | |||
330 | return ret; | ||
331 | } | ||
332 | |||
333 | /* | ||
334 | * Currently we only unregister event commands from __init, so mark | ||
335 | * this __init too. | ||
336 | */ | ||
337 | static __init int unregister_event_command(struct event_command *cmd) | ||
338 | { | ||
339 | struct event_command *p, *n; | ||
340 | int ret = -ENODEV; | ||
341 | |||
342 | mutex_lock(&trigger_cmd_mutex); | ||
343 | list_for_each_entry_safe(p, n, &trigger_commands, list) { | ||
344 | if (strcmp(cmd->name, p->name) == 0) { | ||
345 | ret = 0; | ||
346 | list_del_init(&p->list); | ||
347 | goto out_unlock; | ||
348 | } | ||
349 | } | ||
350 | out_unlock: | ||
351 | mutex_unlock(&trigger_cmd_mutex); | ||
352 | |||
353 | return ret; | ||
354 | } | ||
355 | |||
356 | /** | ||
357 | * event_trigger_print - Generic event_trigger_ops @print implementation | ||
358 | * @name: The name of the event trigger | ||
359 | * @m: The seq_file being printed to | ||
360 | * @data: Trigger-specific data | ||
361 | * @filter_str: filter_str to print, if present | ||
362 | * | ||
363 | * Common implementation for event triggers to print themselves. | ||
364 | * | ||
365 | * Usually wrapped by a function that simply sets the @name of the | ||
366 | * trigger command and then invokes this. | ||
367 | * | ||
368 | * Return: 0 on success, errno otherwise | ||
369 | */ | ||
370 | static int | ||
371 | event_trigger_print(const char *name, struct seq_file *m, | ||
372 | void *data, char *filter_str) | ||
373 | { | ||
374 | long count = (long)data; | ||
375 | |||
376 | seq_printf(m, "%s", name); | ||
377 | |||
378 | if (count == -1) | ||
379 | seq_puts(m, ":unlimited"); | ||
380 | else | ||
381 | seq_printf(m, ":count=%ld", count); | ||
382 | |||
383 | if (filter_str) | ||
384 | seq_printf(m, " if %s\n", filter_str); | ||
385 | else | ||
386 | seq_puts(m, "\n"); | ||
387 | |||
388 | return 0; | ||
389 | } | ||
390 | |||
391 | /** | ||
392 | * event_trigger_init - Generic event_trigger_ops @init implementation | ||
393 | * @ops: The trigger ops associated with the trigger | ||
394 | * @data: Trigger-specific data | ||
395 | * | ||
396 | * Common implementation of event trigger initialization. | ||
397 | * | ||
398 | * Usually used directly as the @init method in event trigger | ||
399 | * implementations. | ||
400 | * | ||
401 | * Return: 0 on success, errno otherwise | ||
402 | */ | ||
403 | static int | ||
404 | event_trigger_init(struct event_trigger_ops *ops, | ||
405 | struct event_trigger_data *data) | ||
406 | { | ||
407 | data->ref++; | ||
408 | return 0; | ||
409 | } | ||
410 | |||
411 | /** | ||
412 | * event_trigger_free - Generic event_trigger_ops @free implementation | ||
413 | * @ops: The trigger ops associated with the trigger | ||
414 | * @data: Trigger-specific data | ||
415 | * | ||
416 | * Common implementation of event trigger de-initialization. | ||
417 | * | ||
418 | * Usually used directly as the @free method in event trigger | ||
419 | * implementations. | ||
420 | */ | ||
421 | static void | ||
422 | event_trigger_free(struct event_trigger_ops *ops, | ||
423 | struct event_trigger_data *data) | ||
424 | { | ||
425 | if (WARN_ON_ONCE(data->ref <= 0)) | ||
426 | return; | ||
427 | |||
428 | data->ref--; | ||
429 | if (!data->ref) | ||
430 | trigger_data_free(data); | ||
431 | } | ||
432 | |||
433 | static int trace_event_trigger_enable_disable(struct ftrace_event_file *file, | ||
434 | int trigger_enable) | ||
435 | { | ||
436 | int ret = 0; | ||
437 | |||
438 | if (trigger_enable) { | ||
439 | if (atomic_inc_return(&file->tm_ref) > 1) | ||
440 | return ret; | ||
441 | set_bit(FTRACE_EVENT_FL_TRIGGER_MODE_BIT, &file->flags); | ||
442 | ret = trace_event_enable_disable(file, 1, 1); | ||
443 | } else { | ||
444 | if (atomic_dec_return(&file->tm_ref) > 0) | ||
445 | return ret; | ||
446 | clear_bit(FTRACE_EVENT_FL_TRIGGER_MODE_BIT, &file->flags); | ||
447 | ret = trace_event_enable_disable(file, 0, 1); | ||
448 | } | ||
449 | |||
450 | return ret; | ||
451 | } | ||
452 | |||
453 | /** | ||
454 | * clear_event_triggers - Clear all triggers associated with a trace array | ||
455 | * @tr: The trace array to clear | ||
456 | * | ||
457 | * For each trigger, the triggering event has its tm_ref decremented | ||
458 | * via trace_event_trigger_enable_disable(), and any associated event | ||
459 | * (in the case of enable/disable_event triggers) will have its sm_ref | ||
460 | * decremented via free()->trace_event_enable_disable(). That | ||
461 | * combination effectively reverses the soft-mode/trigger state added | ||
462 | * by trigger registration. | ||
463 | * | ||
464 | * Must be called with event_mutex held. | ||
465 | */ | ||
466 | void | ||
467 | clear_event_triggers(struct trace_array *tr) | ||
468 | { | ||
469 | struct ftrace_event_file *file; | ||
470 | |||
471 | list_for_each_entry(file, &tr->events, list) { | ||
472 | struct event_trigger_data *data; | ||
473 | list_for_each_entry_rcu(data, &file->triggers, list) { | ||
474 | trace_event_trigger_enable_disable(file, 0); | ||
475 | if (data->ops->free) | ||
476 | data->ops->free(data->ops, data); | ||
477 | } | ||
478 | } | ||
479 | } | ||
480 | |||
481 | /** | ||
482 | * update_cond_flag - Set or reset the TRIGGER_COND bit | ||
483 | * @file: The ftrace_event_file associated with the event | ||
484 | * | ||
485 | * If an event has triggers and any of those triggers has a filter or | ||
486 | * a post_trigger, trigger invocation needs to be deferred until after | ||
487 | * the current event has logged its data, and the event should have | ||
488 | * its TRIGGER_COND bit set, otherwise the TRIGGER_COND bit should be | ||
489 | * cleared. | ||
490 | */ | ||
491 | static void update_cond_flag(struct ftrace_event_file *file) | ||
492 | { | ||
493 | struct event_trigger_data *data; | ||
494 | bool set_cond = false; | ||
495 | |||
496 | list_for_each_entry_rcu(data, &file->triggers, list) { | ||
497 | if (data->filter || data->cmd_ops->post_trigger) { | ||
498 | set_cond = true; | ||
499 | break; | ||
500 | } | ||
501 | } | ||
502 | |||
503 | if (set_cond) | ||
504 | set_bit(FTRACE_EVENT_FL_TRIGGER_COND_BIT, &file->flags); | ||
505 | else | ||
506 | clear_bit(FTRACE_EVENT_FL_TRIGGER_COND_BIT, &file->flags); | ||
507 | } | ||
508 | |||
509 | /** | ||
510 | * register_trigger - Generic event_command @reg implementation | ||
511 | * @glob: The raw string used to register the trigger | ||
512 | * @ops: The trigger ops associated with the trigger | ||
513 | * @data: Trigger-specific data to associate with the trigger | ||
514 | * @file: The ftrace_event_file associated with the event | ||
515 | * | ||
516 | * Common implementation for event trigger registration. | ||
517 | * | ||
518 | * Usually used directly as the @reg method in event command | ||
519 | * implementations. | ||
520 | * | ||
521 | * Return: 0 on success, errno otherwise | ||
522 | */ | ||
523 | static int register_trigger(char *glob, struct event_trigger_ops *ops, | ||
524 | struct event_trigger_data *data, | ||
525 | struct ftrace_event_file *file) | ||
526 | { | ||
527 | struct event_trigger_data *test; | ||
528 | int ret = 0; | ||
529 | |||
530 | list_for_each_entry_rcu(test, &file->triggers, list) { | ||
531 | if (test->cmd_ops->trigger_type == data->cmd_ops->trigger_type) { | ||
532 | ret = -EEXIST; | ||
533 | goto out; | ||
534 | } | ||
535 | } | ||
536 | |||
537 | if (data->ops->init) { | ||
538 | ret = data->ops->init(data->ops, data); | ||
539 | if (ret < 0) | ||
540 | goto out; | ||
541 | } | ||
542 | |||
543 | list_add_rcu(&data->list, &file->triggers); | ||
544 | ret++; | ||
545 | |||
546 | if (trace_event_trigger_enable_disable(file, 1) < 0) { | ||
547 | list_del_rcu(&data->list); | ||
548 | ret--; | ||
549 | } | ||
550 | update_cond_flag(file); | ||
551 | out: | ||
552 | return ret; | ||
553 | } | ||
554 | |||
555 | /** | ||
556 | * unregister_trigger - Generic event_command @unreg implementation | ||
557 | * @glob: The raw string used to register the trigger | ||
558 | * @ops: The trigger ops associated with the trigger | ||
559 | * @test: Trigger-specific data used to find the trigger to remove | ||
560 | * @file: The ftrace_event_file associated with the event | ||
561 | * | ||
562 | * Common implementation for event trigger unregistration. | ||
563 | * | ||
564 | * Usually used directly as the @unreg method in event command | ||
565 | * implementations. | ||
566 | */ | ||
567 | static void unregister_trigger(char *glob, struct event_trigger_ops *ops, | ||
568 | struct event_trigger_data *test, | ||
569 | struct ftrace_event_file *file) | ||
570 | { | ||
571 | struct event_trigger_data *data; | ||
572 | bool unregistered = false; | ||
573 | |||
574 | list_for_each_entry_rcu(data, &file->triggers, list) { | ||
575 | if (data->cmd_ops->trigger_type == test->cmd_ops->trigger_type) { | ||
576 | unregistered = true; | ||
577 | list_del_rcu(&data->list); | ||
578 | update_cond_flag(file); | ||
579 | trace_event_trigger_enable_disable(file, 0); | ||
580 | break; | ||
581 | } | ||
582 | } | ||
583 | |||
584 | if (unregistered && data->ops->free) | ||
585 | data->ops->free(data->ops, data); | ||
586 | } | ||
587 | |||
588 | /** | ||
589 | * event_trigger_callback - Generic event_command @func implementation | ||
590 | * @cmd_ops: The command ops, used for trigger registration | ||
591 | * @file: The ftrace_event_file associated with the event | ||
592 | * @glob: The raw string used to register the trigger | ||
593 | * @cmd: The cmd portion of the string used to register the trigger | ||
594 | * @param: The params portion of the string used to register the trigger | ||
595 | * | ||
596 | * Common implementation for event command parsing and trigger | ||
597 | * instantiation. | ||
598 | * | ||
599 | * Usually used directly as the @func method in event command | ||
600 | * implementations. | ||
601 | * | ||
602 | * Return: 0 on success, errno otherwise | ||
603 | */ | ||
604 | static int | ||
605 | event_trigger_callback(struct event_command *cmd_ops, | ||
606 | struct ftrace_event_file *file, | ||
607 | char *glob, char *cmd, char *param) | ||
608 | { | ||
609 | struct event_trigger_data *trigger_data; | ||
610 | struct event_trigger_ops *trigger_ops; | ||
611 | char *trigger = NULL; | ||
612 | char *number; | ||
613 | int ret; | ||
614 | |||
615 | /* separate the trigger from the filter (t:n [if filter]) */ | ||
616 | if (param && isdigit(param[0])) | ||
617 | trigger = strsep(¶m, " \t"); | ||
618 | |||
619 | trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger); | ||
620 | |||
621 | ret = -ENOMEM; | ||
622 | trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL); | ||
623 | if (!trigger_data) | ||
624 | goto out; | ||
625 | |||
626 | trigger_data->count = -1; | ||
627 | trigger_data->ops = trigger_ops; | ||
628 | trigger_data->cmd_ops = cmd_ops; | ||
629 | INIT_LIST_HEAD(&trigger_data->list); | ||
630 | |||
631 | if (glob[0] == '!') { | ||
632 | cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file); | ||
633 | kfree(trigger_data); | ||
634 | ret = 0; | ||
635 | goto out; | ||
636 | } | ||
637 | |||
638 | if (trigger) { | ||
639 | number = strsep(&trigger, ":"); | ||
640 | |||
641 | ret = -EINVAL; | ||
642 | if (!strlen(number)) | ||
643 | goto out_free; | ||
644 | |||
645 | /* | ||
646 | * We use the callback data field (which is a pointer) | ||
647 | * as our counter. | ||
648 | */ | ||
649 | ret = kstrtoul(number, 0, &trigger_data->count); | ||
650 | if (ret) | ||
651 | goto out_free; | ||
652 | } | ||
653 | |||
654 | if (!param) /* if param is non-empty, it's supposed to be a filter */ | ||
655 | goto out_reg; | ||
656 | |||
657 | if (!cmd_ops->set_filter) | ||
658 | goto out_reg; | ||
659 | |||
660 | ret = cmd_ops->set_filter(param, trigger_data, file); | ||
661 | if (ret < 0) | ||
662 | goto out_free; | ||
663 | |||
664 | out_reg: | ||
665 | ret = cmd_ops->reg(glob, trigger_ops, trigger_data, file); | ||
666 | /* | ||
667 | * On success, the above returns the number of triggers | ||
668 | * registered, but it returns zero if it registered none. | ||
669 | * Consider registering nothing a failure too. | ||
670 | */ | ||
671 | if (!ret) { | ||
672 | ret = -ENOENT; | ||
673 | goto out_free; | ||
674 | } else if (ret < 0) | ||
675 | goto out_free; | ||
676 | ret = 0; | ||
677 | out: | ||
678 | return ret; | ||
679 | |||
680 | out_free: | ||
681 | if (cmd_ops->set_filter) | ||
682 | cmd_ops->set_filter(NULL, trigger_data, NULL); | ||
683 | kfree(trigger_data); | ||
684 | goto out; | ||
685 | } | ||
686 | |||
687 | /** | ||
688 | * set_trigger_filter - Generic event_command @set_filter implementation | ||
689 | * @filter_str: The filter string for the trigger, NULL to remove filter | ||
690 | * @trigger_data: Trigger-specific data | ||
691 | * @file: The ftrace_event_file associated with the event | ||
692 | * | ||
693 | * Common implementation for event command filter parsing and filter | ||
694 | * instantiation. | ||
695 | * | ||
696 | * Usually used directly as the @set_filter method in event command | ||
697 | * implementations. | ||
698 | * | ||
699 | * Also used to remove a filter (if filter_str is NULL). | ||
700 | * | ||
701 | * Return: 0 on success, errno otherwise | ||
702 | */ | ||
703 | static int set_trigger_filter(char *filter_str, | ||
704 | struct event_trigger_data *trigger_data, | ||
705 | struct ftrace_event_file *file) | ||
706 | { | ||
707 | struct event_trigger_data *data = trigger_data; | ||
708 | struct event_filter *filter = NULL, *tmp; | ||
709 | int ret = -EINVAL; | ||
710 | char *s; | ||
711 | |||
712 | if (!filter_str) /* clear the current filter */ | ||
713 | goto assign; | ||
714 | |||
715 | s = strsep(&filter_str, " \t"); | ||
716 | |||
717 | if (!strlen(s) || strcmp(s, "if") != 0) | ||
718 | goto out; | ||
719 | |||
720 | if (!filter_str) | ||
721 | goto out; | ||
722 | |||
723 | /* The filter is for the 'trigger' event, not the triggered event */ | ||
724 | ret = create_event_filter(file->event_call, filter_str, false, &filter); | ||
725 | if (ret) | ||
726 | goto out; | ||
727 | assign: | ||
728 | tmp = rcu_access_pointer(data->filter); | ||
729 | |||
730 | rcu_assign_pointer(data->filter, filter); | ||
731 | |||
732 | if (tmp) { | ||
733 | /* Make sure the call is done with the filter */ | ||
734 | synchronize_sched(); | ||
735 | free_event_filter(tmp); | ||
736 | } | ||
737 | |||
738 | kfree(data->filter_str); | ||
739 | data->filter_str = NULL; | ||
740 | |||
741 | if (filter_str) { | ||
742 | data->filter_str = kstrdup(filter_str, GFP_KERNEL); | ||
743 | if (!data->filter_str) { | ||
744 | free_event_filter(rcu_access_pointer(data->filter)); | ||
745 | data->filter = NULL; | ||
746 | ret = -ENOMEM; | ||
747 | } | ||
748 | } | ||
749 | out: | ||
750 | return ret; | ||
751 | } | ||
752 | |||
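With register_trigger(), unregister_trigger(), event_trigger_callback() and set_trigger_filter() in place, adding a new trigger command is mostly a matter of wiring them into an event_command. The sketch below is illustrative only: the "noop" command, noop_trigger() and the ETT_NOOP type bit are stand-ins invented for this example, not symbols defined by the patch; the real commands defined next in this file (traceon/traceoff, snapshot, stacktrace, enable_event) follow exactly this shape.

/* Hypothetical command built from the generic helpers above; "noop",
 * noop_trigger() and ETT_NOOP are stand-ins, not kernel symbols. */
static void noop_trigger(struct event_trigger_data *data)
{
	/* the command's per-event action would go here */
}

static int
noop_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
		   struct event_trigger_data *data)
{
	return event_trigger_print("noop", m, (void *)data->count,
				   data->filter_str);
}

static struct event_trigger_ops noop_trigger_ops = {
	.func			= noop_trigger,
	.print			= noop_trigger_print,
	.init			= event_trigger_init,
	.free			= event_trigger_free,
};

static struct event_trigger_ops *
noop_get_trigger_ops(char *cmd, char *param)
{
	return &noop_trigger_ops;
}

static struct event_command trigger_noop_cmd = {
	.name			= "noop",
	.trigger_type		= ETT_NOOP,	/* assumed new ETT_* flag */
	.func			= event_trigger_callback,
	.reg			= register_trigger,
	.unreg			= unregister_trigger,
	.get_trigger_ops	= noop_get_trigger_ops,
	.set_filter		= set_trigger_filter,
};

static __init int register_trigger_noop_cmd(void)
{
	return register_event_command(&trigger_noop_cmd);
}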
753 | static void | ||
754 | traceon_trigger(struct event_trigger_data *data) | ||
755 | { | ||
756 | if (tracing_is_on()) | ||
757 | return; | ||
758 | |||
759 | tracing_on(); | ||
760 | } | ||
761 | |||
762 | static void | ||
763 | traceon_count_trigger(struct event_trigger_data *data) | ||
764 | { | ||
765 | if (tracing_is_on()) | ||
766 | return; | ||
767 | |||
768 | if (!data->count) | ||
769 | return; | ||
770 | |||
771 | if (data->count != -1) | ||
772 | (data->count)--; | ||
773 | |||
774 | tracing_on(); | ||
775 | } | ||
776 | |||
777 | static void | ||
778 | traceoff_trigger(struct event_trigger_data *data) | ||
779 | { | ||
780 | if (!tracing_is_on()) | ||
781 | return; | ||
782 | |||
783 | tracing_off(); | ||
784 | } | ||
785 | |||
786 | static void | ||
787 | traceoff_count_trigger(struct event_trigger_data *data) | ||
788 | { | ||
789 | if (!tracing_is_on()) | ||
790 | return; | ||
791 | |||
792 | if (!data->count) | ||
793 | return; | ||
794 | |||
795 | if (data->count != -1) | ||
796 | (data->count)--; | ||
797 | |||
798 | tracing_off(); | ||
799 | } | ||
800 | |||
801 | static int | ||
802 | traceon_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, | ||
803 | struct event_trigger_data *data) | ||
804 | { | ||
805 | return event_trigger_print("traceon", m, (void *)data->count, | ||
806 | data->filter_str); | ||
807 | } | ||
808 | |||
809 | static int | ||
810 | traceoff_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, | ||
811 | struct event_trigger_data *data) | ||
812 | { | ||
813 | return event_trigger_print("traceoff", m, (void *)data->count, | ||
814 | data->filter_str); | ||
815 | } | ||
816 | |||
817 | static struct event_trigger_ops traceon_trigger_ops = { | ||
818 | .func = traceon_trigger, | ||
819 | .print = traceon_trigger_print, | ||
820 | .init = event_trigger_init, | ||
821 | .free = event_trigger_free, | ||
822 | }; | ||
823 | |||
824 | static struct event_trigger_ops traceon_count_trigger_ops = { | ||
825 | .func = traceon_count_trigger, | ||
826 | .print = traceon_trigger_print, | ||
827 | .init = event_trigger_init, | ||
828 | .free = event_trigger_free, | ||
829 | }; | ||
830 | |||
831 | static struct event_trigger_ops traceoff_trigger_ops = { | ||
832 | .func = traceoff_trigger, | ||
833 | .print = traceoff_trigger_print, | ||
834 | .init = event_trigger_init, | ||
835 | .free = event_trigger_free, | ||
836 | }; | ||
837 | |||
838 | static struct event_trigger_ops traceoff_count_trigger_ops = { | ||
839 | .func = traceoff_count_trigger, | ||
840 | .print = traceoff_trigger_print, | ||
841 | .init = event_trigger_init, | ||
842 | .free = event_trigger_free, | ||
843 | }; | ||
844 | |||
845 | static struct event_trigger_ops * | ||
846 | onoff_get_trigger_ops(char *cmd, char *param) | ||
847 | { | ||
848 | struct event_trigger_ops *ops; | ||
849 | |||
850 | /* we register both traceon and traceoff to this callback */ | ||
851 | if (strcmp(cmd, "traceon") == 0) | ||
852 | ops = param ? &traceon_count_trigger_ops : | ||
853 | &traceon_trigger_ops; | ||
854 | else | ||
855 | ops = param ? &traceoff_count_trigger_ops : | ||
856 | &traceoff_trigger_ops; | ||
857 | |||
858 | return ops; | ||
859 | } | ||
860 | |||
861 | static struct event_command trigger_traceon_cmd = { | ||
862 | .name = "traceon", | ||
863 | .trigger_type = ETT_TRACE_ONOFF, | ||
864 | .func = event_trigger_callback, | ||
865 | .reg = register_trigger, | ||
866 | .unreg = unregister_trigger, | ||
867 | .get_trigger_ops = onoff_get_trigger_ops, | ||
868 | .set_filter = set_trigger_filter, | ||
869 | }; | ||
870 | |||
871 | static struct event_command trigger_traceoff_cmd = { | ||
872 | .name = "traceoff", | ||
873 | .trigger_type = ETT_TRACE_ONOFF, | ||
874 | .func = event_trigger_callback, | ||
875 | .reg = register_trigger, | ||
876 | .unreg = unregister_trigger, | ||
877 | .get_trigger_ops = onoff_get_trigger_ops, | ||
878 | .set_filter = set_trigger_filter, | ||
879 | }; | ||
880 | |||
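Once these two commands are registered, they are driven entirely through an event's "trigger" file, using the "command[:count] [if filter]" grammar parsed by event_trigger_callback() and a leading '!' for removal. A minimal user-space sketch follows; the tracefs mount point, the sched_switch event and its next_pid field are assumptions about the running system, not something this patch provides.

/* Install a counted, filtered traceoff trigger, then remove it again.
 * The path, event and filter field are assumptions. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static void write_trigger(const char *path, const char *cmd)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return;
	}
	if (write(fd, cmd, strlen(cmd)) < 0)
		perror("write");
	close(fd);
}

int main(void)
{
	const char *trig =
		"/sys/kernel/debug/tracing/events/sched/sched_switch/trigger";

	/* turn tracing off after 5 hits, but only when next_pid is 0 */
	write_trigger(trig, "traceoff:5 if next_pid == 0");
	/* a leading '!' removes the trigger again */
	write_trigger(trig, "!traceoff");
	return 0;
}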
881 | #ifdef CONFIG_TRACER_SNAPSHOT | ||
882 | static void | ||
883 | snapshot_trigger(struct event_trigger_data *data) | ||
884 | { | ||
885 | tracing_snapshot(); | ||
886 | } | ||
887 | |||
888 | static void | ||
889 | snapshot_count_trigger(struct event_trigger_data *data) | ||
890 | { | ||
891 | if (!data->count) | ||
892 | return; | ||
893 | |||
894 | if (data->count != -1) | ||
895 | (data->count)--; | ||
896 | |||
897 | snapshot_trigger(data); | ||
898 | } | ||
899 | |||
900 | static int | ||
901 | register_snapshot_trigger(char *glob, struct event_trigger_ops *ops, | ||
902 | struct event_trigger_data *data, | ||
903 | struct ftrace_event_file *file) | ||
904 | { | ||
905 | int ret = register_trigger(glob, ops, data, file); | ||
906 | |||
907 | if (ret > 0 && tracing_alloc_snapshot() != 0) { | ||
908 | unregister_trigger(glob, ops, data, file); | ||
909 | ret = 0; | ||
910 | } | ||
911 | |||
912 | return ret; | ||
913 | } | ||
914 | |||
915 | static int | ||
916 | snapshot_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, | ||
917 | struct event_trigger_data *data) | ||
918 | { | ||
919 | return event_trigger_print("snapshot", m, (void *)data->count, | ||
920 | data->filter_str); | ||
921 | } | ||
922 | |||
923 | static struct event_trigger_ops snapshot_trigger_ops = { | ||
924 | .func = snapshot_trigger, | ||
925 | .print = snapshot_trigger_print, | ||
926 | .init = event_trigger_init, | ||
927 | .free = event_trigger_free, | ||
928 | }; | ||
929 | |||
930 | static struct event_trigger_ops snapshot_count_trigger_ops = { | ||
931 | .func = snapshot_count_trigger, | ||
932 | .print = snapshot_trigger_print, | ||
933 | .init = event_trigger_init, | ||
934 | .free = event_trigger_free, | ||
935 | }; | ||
936 | |||
937 | static struct event_trigger_ops * | ||
938 | snapshot_get_trigger_ops(char *cmd, char *param) | ||
939 | { | ||
940 | return param ? &snapshot_count_trigger_ops : &snapshot_trigger_ops; | ||
941 | } | ||
942 | |||
943 | static struct event_command trigger_snapshot_cmd = { | ||
944 | .name = "snapshot", | ||
945 | .trigger_type = ETT_SNAPSHOT, | ||
946 | .func = event_trigger_callback, | ||
947 | .reg = register_snapshot_trigger, | ||
948 | .unreg = unregister_trigger, | ||
949 | .get_trigger_ops = snapshot_get_trigger_ops, | ||
950 | .set_filter = set_trigger_filter, | ||
951 | }; | ||
952 | |||
953 | static __init int register_trigger_snapshot_cmd(void) | ||
954 | { | ||
955 | int ret; | ||
956 | |||
957 | ret = register_event_command(&trigger_snapshot_cmd); | ||
958 | WARN_ON(ret < 0); | ||
959 | |||
960 | return ret; | ||
961 | } | ||
962 | #else | ||
963 | static __init int register_trigger_snapshot_cmd(void) { return 0; } | ||
964 | #endif /* CONFIG_TRACER_SNAPSHOT */ | ||
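register_snapshot_trigger() is where the snapshot command departs from the pattern above: the snapshot buffer is only allocated (via tracing_alloc_snapshot()) when the first such trigger is installed, and the trigger is backed out if that allocation fails. From user space the flow is the same trigger write followed by a read of the snapshot buffer; the paths and event below are the conventional tracefs locations and are assumptions rather than part of this patch.

/* Arm a one-shot snapshot trigger on an event, then dump the snapshot
 * buffer that register_snapshot_trigger() made sure exists. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	ssize_t n;
	int fd;

	fd = open("/sys/kernel/debug/tracing/events/sched/sched_wakeup/trigger",
		  O_WRONLY);
	if (fd < 0)
		return 1;
	/* take exactly one snapshot when the event next fires */
	if (write(fd, "snapshot:1", strlen("snapshot:1")) < 0)
		perror("write");
	close(fd);

	fd = open("/sys/kernel/debug/tracing/snapshot", O_RDONLY);
	if (fd < 0)
		return 1;
	while ((n = read(fd, buf, sizeof(buf))) > 0)
		fwrite(buf, 1, (size_t)n, stdout);
	close(fd);
	return 0;
}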
965 | |||
966 | #ifdef CONFIG_STACKTRACE | ||
967 | /* | ||
968 | * Skip 3: | ||
969 | * stacktrace_trigger() | ||
970 | * event_triggers_post_call() | ||
971 | * ftrace_raw_event_xxx() | ||
972 | */ | ||
973 | #define STACK_SKIP 3 | ||
974 | |||
975 | static void | ||
976 | stacktrace_trigger(struct event_trigger_data *data) | ||
977 | { | ||
978 | trace_dump_stack(STACK_SKIP); | ||
979 | } | ||
980 | |||
981 | static void | ||
982 | stacktrace_count_trigger(struct event_trigger_data *data) | ||
983 | { | ||
984 | if (!data->count) | ||
985 | return; | ||
986 | |||
987 | if (data->count != -1) | ||
988 | (data->count)--; | ||
989 | |||
990 | stacktrace_trigger(data); | ||
991 | } | ||
992 | |||
993 | static int | ||
994 | stacktrace_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, | ||
995 | struct event_trigger_data *data) | ||
996 | { | ||
997 | return event_trigger_print("stacktrace", m, (void *)data->count, | ||
998 | data->filter_str); | ||
999 | } | ||
1000 | |||
1001 | static struct event_trigger_ops stacktrace_trigger_ops = { | ||
1002 | .func = stacktrace_trigger, | ||
1003 | .print = stacktrace_trigger_print, | ||
1004 | .init = event_trigger_init, | ||
1005 | .free = event_trigger_free, | ||
1006 | }; | ||
1007 | |||
1008 | static struct event_trigger_ops stacktrace_count_trigger_ops = { | ||
1009 | .func = stacktrace_count_trigger, | ||
1010 | .print = stacktrace_trigger_print, | ||
1011 | .init = event_trigger_init, | ||
1012 | .free = event_trigger_free, | ||
1013 | }; | ||
1014 | |||
1015 | static struct event_trigger_ops * | ||
1016 | stacktrace_get_trigger_ops(char *cmd, char *param) | ||
1017 | { | ||
1018 | return param ? &stacktrace_count_trigger_ops : &stacktrace_trigger_ops; | ||
1019 | } | ||
1020 | |||
1021 | static struct event_command trigger_stacktrace_cmd = { | ||
1022 | .name = "stacktrace", | ||
1023 | .trigger_type = ETT_STACKTRACE, | ||
1024 | .post_trigger = true, | ||
1025 | .func = event_trigger_callback, | ||
1026 | .reg = register_trigger, | ||
1027 | .unreg = unregister_trigger, | ||
1028 | .get_trigger_ops = stacktrace_get_trigger_ops, | ||
1029 | .set_filter = set_trigger_filter, | ||
1030 | }; | ||
1031 | |||
1032 | static __init int register_trigger_stacktrace_cmd(void) | ||
1033 | { | ||
1034 | int ret; | ||
1035 | |||
1036 | ret = register_event_command(&trigger_stacktrace_cmd); | ||
1037 | WARN_ON(ret < 0); | ||
1038 | |||
1039 | return ret; | ||
1040 | } | ||
1041 | #else | ||
1042 | static __init int register_trigger_stacktrace_cmd(void) { return 0; } | ||
1043 | #endif /* CONFIG_STACKTRACE */ | ||
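STACK_SKIP only hides the trigger plumbing from the dump: the three frames listed in the comment are skipped so that the first frame shown is the code that actually hit the event. Any wrapper around trace_dump_stack() has to account for its own frame the same way; a minimal, hypothetical sketch:

/* Hypothetical helper: dump the caller's stack without listing the
 * helper itself. trace_dump_stack() is the same API used by
 * stacktrace_trigger() above; skipping one frame hides this wrapper,
 * just as STACK_SKIP hides the trigger internals. */
static void dump_my_callers_stack(void)
{
	trace_dump_stack(1);
}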
1044 | |||
1045 | static __init void unregister_trigger_traceon_traceoff_cmds(void) | ||
1046 | { | ||
1047 | unregister_event_command(&trigger_traceon_cmd); | ||
1048 | unregister_event_command(&trigger_traceoff_cmd); | ||
1049 | } | ||
1050 | |||
1051 | /* Avoid typos */ | ||
1052 | #define ENABLE_EVENT_STR "enable_event" | ||
1053 | #define DISABLE_EVENT_STR "disable_event" | ||
1054 | |||
1055 | struct enable_trigger_data { | ||
1056 | struct ftrace_event_file *file; | ||
1057 | bool enable; | ||
1058 | }; | ||
1059 | |||
1060 | static void | ||
1061 | event_enable_trigger(struct event_trigger_data *data) | ||
1062 | { | ||
1063 | struct enable_trigger_data *enable_data = data->private_data; | ||
1064 | |||
1065 | if (enable_data->enable) | ||
1066 | clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &enable_data->file->flags); | ||
1067 | else | ||
1068 | set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &enable_data->file->flags); | ||
1069 | } | ||
1070 | |||
1071 | static void | ||
1072 | event_enable_count_trigger(struct event_trigger_data *data) | ||
1073 | { | ||
1074 | struct enable_trigger_data *enable_data = data->private_data; | ||
1075 | |||
1076 | if (!data->count) | ||
1077 | return; | ||
1078 | |||
1079 | /* Skip if the event is already in the state we want to switch to */ | ||
1080 | if (enable_data->enable == !(enable_data->file->flags & FTRACE_EVENT_FL_SOFT_DISABLED)) | ||
1081 | return; | ||
1082 | |||
1083 | if (data->count != -1) | ||
1084 | (data->count)--; | ||
1085 | |||
1086 | event_enable_trigger(data); | ||
1087 | } | ||
1088 | |||
1089 | static int | ||
1090 | event_enable_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, | ||
1091 | struct event_trigger_data *data) | ||
1092 | { | ||
1093 | struct enable_trigger_data *enable_data = data->private_data; | ||
1094 | |||
1095 | seq_printf(m, "%s:%s:%s", | ||
1096 | enable_data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR, | ||
1097 | enable_data->file->event_call->class->system, | ||
1098 | enable_data->file->event_call->name); | ||
1099 | |||
1100 | if (data->count == -1) | ||
1101 | seq_puts(m, ":unlimited"); | ||
1102 | else | ||
1103 | seq_printf(m, ":count=%ld", data->count); | ||
1104 | |||
1105 | if (data->filter_str) | ||
1106 | seq_printf(m, " if %s\n", data->filter_str); | ||
1107 | else | ||
1108 | seq_puts(m, "\n"); | ||
1109 | |||
1110 | return 0; | ||
1111 | } | ||
1112 | |||
1113 | static void | ||
1114 | event_enable_trigger_free(struct event_trigger_ops *ops, | ||
1115 | struct event_trigger_data *data) | ||
1116 | { | ||
1117 | struct enable_trigger_data *enable_data = data->private_data; | ||
1118 | |||
1119 | if (WARN_ON_ONCE(data->ref <= 0)) | ||
1120 | return; | ||
1121 | |||
1122 | data->ref--; | ||
1123 | if (!data->ref) { | ||
1124 | /* Remove the SOFT_MODE flag */ | ||
1125 | trace_event_enable_disable(enable_data->file, 0, 1); | ||
1126 | module_put(enable_data->file->event_call->mod); | ||
1127 | trigger_data_free(data); | ||
1128 | kfree(enable_data); | ||
1129 | } | ||
1130 | } | ||
1131 | |||
1132 | static struct event_trigger_ops event_enable_trigger_ops = { | ||
1133 | .func = event_enable_trigger, | ||
1134 | .print = event_enable_trigger_print, | ||
1135 | .init = event_trigger_init, | ||
1136 | .free = event_enable_trigger_free, | ||
1137 | }; | ||
1138 | |||
1139 | static struct event_trigger_ops event_enable_count_trigger_ops = { | ||
1140 | .func = event_enable_count_trigger, | ||
1141 | .print = event_enable_trigger_print, | ||
1142 | .init = event_trigger_init, | ||
1143 | .free = event_enable_trigger_free, | ||
1144 | }; | ||
1145 | |||
1146 | static struct event_trigger_ops event_disable_trigger_ops = { | ||
1147 | .func = event_enable_trigger, | ||
1148 | .print = event_enable_trigger_print, | ||
1149 | .init = event_trigger_init, | ||
1150 | .free = event_enable_trigger_free, | ||
1151 | }; | ||
1152 | |||
1153 | static struct event_trigger_ops event_disable_count_trigger_ops = { | ||
1154 | .func = event_enable_count_trigger, | ||
1155 | .print = event_enable_trigger_print, | ||
1156 | .init = event_trigger_init, | ||
1157 | .free = event_enable_trigger_free, | ||
1158 | }; | ||
1159 | |||
1160 | static int | ||
1161 | event_enable_trigger_func(struct event_command *cmd_ops, | ||
1162 | struct ftrace_event_file *file, | ||
1163 | char *glob, char *cmd, char *param) | ||
1164 | { | ||
1165 | struct ftrace_event_file *event_enable_file; | ||
1166 | struct enable_trigger_data *enable_data; | ||
1167 | struct event_trigger_data *trigger_data; | ||
1168 | struct event_trigger_ops *trigger_ops; | ||
1169 | struct trace_array *tr = file->tr; | ||
1170 | const char *system; | ||
1171 | const char *event; | ||
1172 | char *trigger; | ||
1173 | char *number; | ||
1174 | bool enable; | ||
1175 | int ret; | ||
1176 | |||
1177 | if (!param) | ||
1178 | return -EINVAL; | ||
1179 | |||
1180 | /* separate the trigger from the filter (s:e:n [if filter]) */ | ||
1181 | trigger = strsep(¶m, " \t"); | ||
1182 | if (!trigger) | ||
1183 | return -EINVAL; | ||
1184 | |||
1185 | system = strsep(&trigger, ":"); | ||
1186 | if (!trigger) | ||
1187 | return -EINVAL; | ||
1188 | |||
1189 | event = strsep(&trigger, ":"); | ||
1190 | |||
1191 | ret = -EINVAL; | ||
1192 | event_enable_file = find_event_file(tr, system, event); | ||
1193 | if (!event_enable_file) | ||
1194 | goto out; | ||
1195 | |||
1196 | enable = strcmp(cmd, ENABLE_EVENT_STR) == 0; | ||
1197 | |||
1198 | trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger); | ||
1199 | |||
1200 | ret = -ENOMEM; | ||
1201 | trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL); | ||
1202 | if (!trigger_data) | ||
1203 | goto out; | ||
1204 | |||
1205 | enable_data = kzalloc(sizeof(*enable_data), GFP_KERNEL); | ||
1206 | if (!enable_data) { | ||
1207 | kfree(trigger_data); | ||
1208 | goto out; | ||
1209 | } | ||
1210 | |||
1211 | trigger_data->count = -1; | ||
1212 | trigger_data->ops = trigger_ops; | ||
1213 | trigger_data->cmd_ops = cmd_ops; | ||
1214 | INIT_LIST_HEAD(&trigger_data->list); | ||
1215 | RCU_INIT_POINTER(trigger_data->filter, NULL); | ||
1216 | |||
1217 | enable_data->enable = enable; | ||
1218 | enable_data->file = event_enable_file; | ||
1219 | trigger_data->private_data = enable_data; | ||
1220 | |||
1221 | if (glob[0] == '!') { | ||
1222 | cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file); | ||
1223 | kfree(trigger_data); | ||
1224 | kfree(enable_data); | ||
1225 | ret = 0; | ||
1226 | goto out; | ||
1227 | } | ||
1228 | |||
1229 | if (trigger) { | ||
1230 | number = strsep(&trigger, ":"); | ||
1231 | |||
1232 | ret = -EINVAL; | ||
1233 | if (!strlen(number)) | ||
1234 | goto out_free; | ||
1235 | |||
1236 | /* | ||
1237 | * We use the callback data field (which is a pointer) | ||
1238 | * as our counter. | ||
1239 | */ | ||
1240 | ret = kstrtoul(number, 0, &trigger_data->count); | ||
1241 | if (ret) | ||
1242 | goto out_free; | ||
1243 | } | ||
1244 | |||
1245 | if (!param) /* if param is non-empty, it's supposed to be a filter */ | ||
1246 | goto out_reg; | ||
1247 | |||
1248 | if (!cmd_ops->set_filter) | ||
1249 | goto out_reg; | ||
1250 | |||
1251 | ret = cmd_ops->set_filter(param, trigger_data, file); | ||
1252 | if (ret < 0) | ||
1253 | goto out_free; | ||
1254 | |||
1255 | out_reg: | ||
1256 | /* Don't let event modules unload while probe registered */ | ||
1257 | ret = try_module_get(event_enable_file->event_call->mod); | ||
1258 | if (!ret) { | ||
1259 | ret = -EBUSY; | ||
1260 | goto out_free; | ||
1261 | } | ||
1262 | |||
1263 | ret = trace_event_enable_disable(event_enable_file, 1, 1); | ||
1264 | if (ret < 0) | ||
1265 | goto out_put; | ||
1266 | ret = cmd_ops->reg(glob, trigger_ops, trigger_data, file); | ||
1267 | /* | ||
1268 | * On success, the above returns the number of triggers | ||
1269 | * registered, but it returns zero if it registered none. | ||
1270 | * Consider registering nothing a failure too. | ||
1271 | */ | ||
1272 | if (!ret) { | ||
1273 | ret = -ENOENT; | ||
1274 | goto out_disable; | ||
1275 | } else if (ret < 0) | ||
1276 | goto out_disable; | ||
1277 | /* Just return zero, not the number of enabled functions */ | ||
1278 | ret = 0; | ||
1279 | out: | ||
1280 | return ret; | ||
1281 | |||
1282 | out_disable: | ||
1283 | trace_event_enable_disable(event_enable_file, 0, 1); | ||
1284 | out_put: | ||
1285 | module_put(event_enable_file->event_call->mod); | ||
1286 | out_free: | ||
1287 | if (cmd_ops->set_filter) | ||
1288 | cmd_ops->set_filter(NULL, trigger_data, NULL); | ||
1289 | kfree(trigger_data); | ||
1290 | kfree(enable_data); | ||
1291 | goto out; | ||
1292 | } | ||
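event_enable_trigger_func() parses "system:event[:count] [if filter]", so a single write names both the triggering event (via the trigger file the string is written to) and the event to be soft-enabled or soft-disabled. A user-space sketch, assuming the usual tracefs mount point and that the sched:sched_switch and kmem:kmalloc events exist on the running kernel; reading the trigger file back shows the format produced by event_enable_trigger_print().

/* When sched_switch fires, soft-enable kmem:kmalloc for its next 10
 * hits, then read the trigger file back to see the printed state
 * (e.g. "enable_event:kmem:kmalloc:count=10"). Paths are assumptions. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *trig =
		"/sys/kernel/debug/tracing/events/sched/sched_switch/trigger";
	const char *cmd = "enable_event:kmem:kmalloc:10";
	char buf[256];
	ssize_t n;
	int fd;

	fd = open(trig, O_WRONLY);
	if (fd < 0)
		return 1;
	if (write(fd, cmd, strlen(cmd)) < 0)
		perror("write");
	close(fd);

	fd = open(trig, O_RDONLY);
	if (fd < 0)
		return 1;
	n = read(fd, buf, sizeof(buf) - 1);
	if (n > 0) {
		buf[n] = '\0';
		fputs(buf, stdout);
	}
	close(fd);
	return 0;
}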
1293 | |||
1294 | static int event_enable_register_trigger(char *glob, | ||
1295 | struct event_trigger_ops *ops, | ||
1296 | struct event_trigger_data *data, | ||
1297 | struct ftrace_event_file *file) | ||
1298 | { | ||
1299 | struct enable_trigger_data *enable_data = data->private_data; | ||
1300 | struct enable_trigger_data *test_enable_data; | ||
1301 | struct event_trigger_data *test; | ||
1302 | int ret = 0; | ||
1303 | |||
1304 | list_for_each_entry_rcu(test, &file->triggers, list) { | ||
1305 | test_enable_data = test->private_data; | ||
1306 | if (test_enable_data && | ||
1307 | (test_enable_data->file == enable_data->file)) { | ||
1308 | ret = -EEXIST; | ||
1309 | goto out; | ||
1310 | } | ||
1311 | } | ||
1312 | |||
1313 | if (data->ops->init) { | ||
1314 | ret = data->ops->init(data->ops, data); | ||
1315 | if (ret < 0) | ||
1316 | goto out; | ||
1317 | } | ||
1318 | |||
1319 | list_add_rcu(&data->list, &file->triggers); | ||
1320 | ret++; | ||
1321 | |||
1322 | if (trace_event_trigger_enable_disable(file, 1) < 0) { | ||
1323 | list_del_rcu(&data->list); | ||
1324 | ret--; | ||
1325 | } | ||
1326 | update_cond_flag(file); | ||
1327 | out: | ||
1328 | return ret; | ||
1329 | } | ||
1330 | |||
1331 | static void event_enable_unregister_trigger(char *glob, | ||
1332 | struct event_trigger_ops *ops, | ||
1333 | struct event_trigger_data *test, | ||
1334 | struct ftrace_event_file *file) | ||
1335 | { | ||
1336 | struct enable_trigger_data *test_enable_data = test->private_data; | ||
1337 | struct enable_trigger_data *enable_data; | ||
1338 | struct event_trigger_data *data; | ||
1339 | bool unregistered = false; | ||
1340 | |||
1341 | list_for_each_entry_rcu(data, &file->triggers, list) { | ||
1342 | enable_data = data->private_data; | ||
1343 | if (enable_data && | ||
1344 | (enable_data->file == test_enable_data->file)) { | ||
1345 | unregistered = true; | ||
1346 | list_del_rcu(&data->list); | ||
1347 | update_cond_flag(file); | ||
1348 | trace_event_trigger_enable_disable(file, 0); | ||
1349 | break; | ||
1350 | } | ||
1351 | } | ||
1352 | |||
1353 | if (unregistered && data->ops->free) | ||
1354 | data->ops->free(data->ops, data); | ||
1355 | } | ||
1356 | |||
1357 | static struct event_trigger_ops * | ||
1358 | event_enable_get_trigger_ops(char *cmd, char *param) | ||
1359 | { | ||
1360 | struct event_trigger_ops *ops; | ||
1361 | bool enable; | ||
1362 | |||
1363 | enable = strcmp(cmd, ENABLE_EVENT_STR) == 0; | ||
1364 | |||
1365 | if (enable) | ||
1366 | ops = param ? &event_enable_count_trigger_ops : | ||
1367 | &event_enable_trigger_ops; | ||
1368 | else | ||
1369 | ops = param ? &event_disable_count_trigger_ops : | ||
1370 | &event_disable_trigger_ops; | ||
1371 | |||
1372 | return ops; | ||
1373 | } | ||
1374 | |||
1375 | static struct event_command trigger_enable_cmd = { | ||
1376 | .name = ENABLE_EVENT_STR, | ||
1377 | .trigger_type = ETT_EVENT_ENABLE, | ||
1378 | .func = event_enable_trigger_func, | ||
1379 | .reg = event_enable_register_trigger, | ||
1380 | .unreg = event_enable_unregister_trigger, | ||
1381 | .get_trigger_ops = event_enable_get_trigger_ops, | ||
1382 | .set_filter = set_trigger_filter, | ||
1383 | }; | ||
1384 | |||
1385 | static struct event_command trigger_disable_cmd = { | ||
1386 | .name = DISABLE_EVENT_STR, | ||
1387 | .trigger_type = ETT_EVENT_ENABLE, | ||
1388 | .func = event_enable_trigger_func, | ||
1389 | .reg = event_enable_register_trigger, | ||
1390 | .unreg = event_enable_unregister_trigger, | ||
1391 | .get_trigger_ops = event_enable_get_trigger_ops, | ||
1392 | .set_filter = set_trigger_filter, | ||
1393 | }; | ||
1394 | |||
1395 | static __init void unregister_trigger_enable_disable_cmds(void) | ||
1396 | { | ||
1397 | unregister_event_command(&trigger_enable_cmd); | ||
1398 | unregister_event_command(&trigger_disable_cmd); | ||
1399 | } | ||
1400 | |||
1401 | static __init int register_trigger_enable_disable_cmds(void) | ||
1402 | { | ||
1403 | int ret; | ||
1404 | |||
1405 | ret = register_event_command(&trigger_enable_cmd); | ||
1406 | if (WARN_ON(ret < 0)) | ||
1407 | return ret; | ||
1408 | ret = register_event_command(&trigger_disable_cmd); | ||
1409 | if (WARN_ON(ret < 0)) | ||
1410 | unregister_trigger_enable_disable_cmds(); | ||
1411 | |||
1412 | return ret; | ||
1413 | } | ||
1414 | |||
1415 | static __init int register_trigger_traceon_traceoff_cmds(void) | ||
1416 | { | ||
1417 | int ret; | ||
1418 | |||
1419 | ret = register_event_command(&trigger_traceon_cmd); | ||
1420 | if (WARN_ON(ret < 0)) | ||
1421 | return ret; | ||
1422 | ret = register_event_command(&trigger_traceoff_cmd); | ||
1423 | if (WARN_ON(ret < 0)) | ||
1424 | unregister_trigger_traceon_traceoff_cmds(); | ||
1425 | |||
1426 | return ret; | ||
1427 | } | ||
1428 | |||
1429 | __init int register_trigger_cmds(void) | ||
1430 | { | ||
1431 | register_trigger_traceon_traceoff_cmds(); | ||
1432 | register_trigger_snapshot_cmd(); | ||
1433 | register_trigger_stacktrace_cmd(); | ||
1434 | register_trigger_enable_disable_cmds(); | ||
1435 | |||
1436 | return 0; | ||
1437 | } | ||
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 7c3e3e72e2b6..ee0a5098ac43 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c | |||
@@ -95,15 +95,12 @@ static void __always_unused ____ftrace_check_##name(void) \ | |||
95 | #undef __array | 95 | #undef __array |
96 | #define __array(type, item, len) \ | 96 | #define __array(type, item, len) \ |
97 | do { \ | 97 | do { \ |
98 | char *type_str = #type"["__stringify(len)"]"; \ | ||
98 | BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ | 99 | BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ |
99 | mutex_lock(&event_storage_mutex); \ | 100 | ret = trace_define_field(event_call, type_str, #item, \ |
100 | snprintf(event_storage, sizeof(event_storage), \ | ||
101 | "%s[%d]", #type, len); \ | ||
102 | ret = trace_define_field(event_call, event_storage, #item, \ | ||
103 | offsetof(typeof(field), item), \ | 101 | offsetof(typeof(field), item), \ |
104 | sizeof(field.item), \ | 102 | sizeof(field.item), \ |
105 | is_signed_type(type), filter_type); \ | 103 | is_signed_type(type), filter_type); \ |
106 | mutex_unlock(&event_storage_mutex); \ | ||
107 | if (ret) \ | 104 | if (ret) \ |
108 | return ret; \ | 105 | return ret; \ |
109 | } while (0); | 106 | } while (0); |
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index dae9541ada9e..bdbae450c13e 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c | |||
@@ -27,18 +27,12 @@ | |||
27 | /** | 27 | /** |
28 | * Kprobe event core functions | 28 | * Kprobe event core functions |
29 | */ | 29 | */ |
30 | struct trace_probe { | 30 | struct trace_kprobe { |
31 | struct list_head list; | 31 | struct list_head list; |
32 | struct kretprobe rp; /* Use rp.kp for kprobe use */ | 32 | struct kretprobe rp; /* Use rp.kp for kprobe use */ |
33 | unsigned long nhit; | 33 | unsigned long nhit; |
34 | unsigned int flags; /* For TP_FLAG_* */ | ||
35 | const char *symbol; /* symbol name */ | 34 | const char *symbol; /* symbol name */ |
36 | struct ftrace_event_class class; | 35 | struct trace_probe tp; |
37 | struct ftrace_event_call call; | ||
38 | struct list_head files; | ||
39 | ssize_t size; /* trace entry size */ | ||
40 | unsigned int nr_args; | ||
41 | struct probe_arg args[]; | ||
42 | }; | 36 | }; |
43 | 37 | ||
44 | struct event_file_link { | 38 | struct event_file_link { |
@@ -46,56 +40,46 @@ struct event_file_link { | |||
46 | struct list_head list; | 40 | struct list_head list; |
47 | }; | 41 | }; |
48 | 42 | ||
49 | #define SIZEOF_TRACE_PROBE(n) \ | 43 | #define SIZEOF_TRACE_KPROBE(n) \ |
50 | (offsetof(struct trace_probe, args) + \ | 44 | (offsetof(struct trace_kprobe, tp.args) + \ |
51 | (sizeof(struct probe_arg) * (n))) | 45 | (sizeof(struct probe_arg) * (n))) |
52 | 46 | ||
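The rework splits the old all-in-one structure: the generic probe state now lives in an embedded struct trace_probe (tp), and struct trace_kprobe keeps only the kprobe-specific fields around it. Because the probe_arg flexible array sits at the end of the embedded structure, tp has to remain the last member and the allocation size is an offsetof() of tp.args plus the argument slots, exactly as SIZEOF_TRACE_KPROBE() spells out. Below is a stand-alone sketch of the same layout idiom with stand-in types; note that embedding a structure that ends in a flexible array member is a GNU C extension the kernel relies on.

/* Stand-in types demonstrating the embedded-struct + flexible-array
 * sizing used by struct trace_kprobe; these are not the kernel
 * definitions. Compile with gcc (FAM embedding is a GNU extension). */
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

struct probe_arg { const char *name; };

struct trace_probe {			/* generic probe state */
	unsigned int nr_args;
	struct probe_arg args[];	/* flexible array, must be last */
};

struct trace_kprobe {			/* kprobe-specific wrapper */
	unsigned long nhit;
	struct trace_probe tp;		/* must remain the last member */
};

#define SIZEOF_TRACE_KPROBE(n) \
	(offsetof(struct trace_kprobe, tp.args) + \
	 (sizeof(struct probe_arg) * (n)))

int main(void)
{
	struct trace_kprobe *tk = calloc(1, SIZEOF_TRACE_KPROBE(3));
	struct trace_probe *tp;

	if (!tk)
		return 1;
	tp = &tk->tp;
	tp->nr_args = 3;

	/* generic code only sees tp; kprobe code recovers the wrapper
	 * the way container_of() does in the kernel */
	struct trace_kprobe *back = (struct trace_kprobe *)
		((char *)tp - offsetof(struct trace_kprobe, tp));

	printf("recovered wrapper: %s\n", back == tk ? "yes" : "no");
	free(tk);
	return 0;
}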
53 | 47 | ||
54 | static __kprobes bool trace_probe_is_return(struct trace_probe *tp) | 48 | static __kprobes bool trace_kprobe_is_return(struct trace_kprobe *tk) |
55 | { | 49 | { |
56 | return tp->rp.handler != NULL; | 50 | return tk->rp.handler != NULL; |
57 | } | 51 | } |
58 | 52 | ||
59 | static __kprobes const char *trace_probe_symbol(struct trace_probe *tp) | 53 | static __kprobes const char *trace_kprobe_symbol(struct trace_kprobe *tk) |
60 | { | 54 | { |
61 | return tp->symbol ? tp->symbol : "unknown"; | 55 | return tk->symbol ? tk->symbol : "unknown"; |
62 | } | 56 | } |
63 | 57 | ||
64 | static __kprobes unsigned long trace_probe_offset(struct trace_probe *tp) | 58 | static __kprobes unsigned long trace_kprobe_offset(struct trace_kprobe *tk) |
65 | { | 59 | { |
66 | return tp->rp.kp.offset; | 60 | return tk->rp.kp.offset; |
67 | } | 61 | } |
68 | 62 | ||
69 | static __kprobes bool trace_probe_is_enabled(struct trace_probe *tp) | 63 | static __kprobes bool trace_kprobe_has_gone(struct trace_kprobe *tk) |
70 | { | 64 | { |
71 | return !!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE)); | 65 | return !!(kprobe_gone(&tk->rp.kp)); |
72 | } | 66 | } |
73 | 67 | ||
74 | static __kprobes bool trace_probe_is_registered(struct trace_probe *tp) | 68 | static __kprobes bool trace_kprobe_within_module(struct trace_kprobe *tk, |
75 | { | 69 | struct module *mod) |
76 | return !!(tp->flags & TP_FLAG_REGISTERED); | ||
77 | } | ||
78 | |||
79 | static __kprobes bool trace_probe_has_gone(struct trace_probe *tp) | ||
80 | { | ||
81 | return !!(kprobe_gone(&tp->rp.kp)); | ||
82 | } | ||
83 | |||
84 | static __kprobes bool trace_probe_within_module(struct trace_probe *tp, | ||
85 | struct module *mod) | ||
86 | { | 70 | { |
87 | int len = strlen(mod->name); | 71 | int len = strlen(mod->name); |
88 | const char *name = trace_probe_symbol(tp); | 72 | const char *name = trace_kprobe_symbol(tk); |
89 | return strncmp(mod->name, name, len) == 0 && name[len] == ':'; | 73 | return strncmp(mod->name, name, len) == 0 && name[len] == ':'; |
90 | } | 74 | } |
91 | 75 | ||
92 | static __kprobes bool trace_probe_is_on_module(struct trace_probe *tp) | 76 | static __kprobes bool trace_kprobe_is_on_module(struct trace_kprobe *tk) |
93 | { | 77 | { |
94 | return !!strchr(trace_probe_symbol(tp), ':'); | 78 | return !!strchr(trace_kprobe_symbol(tk), ':'); |
95 | } | 79 | } |
96 | 80 | ||
97 | static int register_probe_event(struct trace_probe *tp); | 81 | static int register_kprobe_event(struct trace_kprobe *tk); |
98 | static int unregister_probe_event(struct trace_probe *tp); | 82 | static int unregister_kprobe_event(struct trace_kprobe *tk); |
99 | 83 | ||
100 | static DEFINE_MUTEX(probe_lock); | 84 | static DEFINE_MUTEX(probe_lock); |
101 | static LIST_HEAD(probe_list); | 85 | static LIST_HEAD(probe_list); |
@@ -104,45 +88,224 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs); | |||
104 | static int kretprobe_dispatcher(struct kretprobe_instance *ri, | 88 | static int kretprobe_dispatcher(struct kretprobe_instance *ri, |
105 | struct pt_regs *regs); | 89 | struct pt_regs *regs); |
106 | 90 | ||
91 | /* Memory fetching by symbol */ | ||
92 | struct symbol_cache { | ||
93 | char *symbol; | ||
94 | long offset; | ||
95 | unsigned long addr; | ||
96 | }; | ||
97 | |||
98 | unsigned long update_symbol_cache(struct symbol_cache *sc) | ||
99 | { | ||
100 | sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol); | ||
101 | |||
102 | if (sc->addr) | ||
103 | sc->addr += sc->offset; | ||
104 | |||
105 | return sc->addr; | ||
106 | } | ||
107 | |||
108 | void free_symbol_cache(struct symbol_cache *sc) | ||
109 | { | ||
110 | kfree(sc->symbol); | ||
111 | kfree(sc); | ||
112 | } | ||
113 | |||
114 | struct symbol_cache *alloc_symbol_cache(const char *sym, long offset) | ||
115 | { | ||
116 | struct symbol_cache *sc; | ||
117 | |||
118 | if (!sym || strlen(sym) == 0) | ||
119 | return NULL; | ||
120 | |||
121 | sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL); | ||
122 | if (!sc) | ||
123 | return NULL; | ||
124 | |||
125 | sc->symbol = kstrdup(sym, GFP_KERNEL); | ||
126 | if (!sc->symbol) { | ||
127 | kfree(sc); | ||
128 | return NULL; | ||
129 | } | ||
130 | sc->offset = offset; | ||
131 | update_symbol_cache(sc); | ||
132 | |||
133 | return sc; | ||
134 | } | ||
135 | |||
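A symbol_cache pins the address behind an @symbol+offset fetch argument so kallsyms_lookup_name() is not re-run in the probe hot path; update_symbol_cache() re-resolves it when the symbol may have moved, for example around a module event. These helpers are internal to the tracing code, so the fragment below only illustrates the intended calling order, not something an out-of-tree module could link against; "jiffies" is just an example symbol.

/* Illustrative calling order for the symbol_cache helpers above. */
static struct symbol_cache *sc_example;

static int sc_example_setup(void)
{
	sc_example = alloc_symbol_cache("jiffies", 0);	/* resolves once */
	if (!sc_example)
		return -ENOMEM;
	return 0;
}

static void sc_example_module_event(void)
{
	/* re-resolve, e.g. after a module provided or removed the symbol */
	update_symbol_cache(sc_example);
}

static void sc_example_teardown(void)
{
	free_symbol_cache(sc_example);
}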
136 | /* | ||
137 | * Kprobes-specific fetch functions | ||
138 | */ | ||
139 | #define DEFINE_FETCH_stack(type) \ | ||
140 | static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ | ||
141 | void *offset, void *dest) \ | ||
142 | { \ | ||
143 | *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \ | ||
144 | (unsigned int)((unsigned long)offset)); \ | ||
145 | } | ||
146 | DEFINE_BASIC_FETCH_FUNCS(stack) | ||
147 | /* No string on the stack entry */ | ||
148 | #define fetch_stack_string NULL | ||
149 | #define fetch_stack_string_size NULL | ||
150 | |||
151 | #define DEFINE_FETCH_memory(type) \ | ||
152 | static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ | ||
153 | void *addr, void *dest) \ | ||
154 | { \ | ||
155 | type retval; \ | ||
156 | if (probe_kernel_address(addr, retval)) \ | ||
157 | *(type *)dest = 0; \ | ||
158 | else \ | ||
159 | *(type *)dest = retval; \ | ||
160 | } | ||
161 | DEFINE_BASIC_FETCH_FUNCS(memory) | ||
162 | /* | ||
163 | * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max | ||
164 | * length and relative data location. | ||
165 | */ | ||
166 | static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, | ||
167 | void *addr, void *dest) | ||
168 | { | ||
169 | long ret; | ||
170 | int maxlen = get_rloc_len(*(u32 *)dest); | ||
171 | u8 *dst = get_rloc_data(dest); | ||
172 | u8 *src = addr; | ||
173 | mm_segment_t old_fs = get_fs(); | ||
174 | |||
175 | if (!maxlen) | ||
176 | return; | ||
177 | |||
178 | /* | ||
179 | * Try to get string again, since the string can be changed while | ||
180 | * probing. | ||
181 | */ | ||
182 | set_fs(KERNEL_DS); | ||
183 | pagefault_disable(); | ||
184 | |||
185 | do | ||
186 | ret = __copy_from_user_inatomic(dst++, src++, 1); | ||
187 | while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen); | ||
188 | |||
189 | dst[-1] = '\0'; | ||
190 | pagefault_enable(); | ||
191 | set_fs(old_fs); | ||
192 | |||
193 | if (ret < 0) { /* Failed to fetch string */ | ||
194 | ((u8 *)get_rloc_data(dest))[0] = '\0'; | ||
195 | *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest)); | ||
196 | } else { | ||
197 | *(u32 *)dest = make_data_rloc(src - (u8 *)addr, | ||
198 | get_rloc_offs(*(u32 *)dest)); | ||
199 | } | ||
200 | } | ||
201 | |||
202 | /* Return the length of string -- including null terminal byte */ | ||
203 | static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, | ||
204 | void *addr, void *dest) | ||
205 | { | ||
206 | mm_segment_t old_fs; | ||
207 | int ret, len = 0; | ||
208 | u8 c; | ||
209 | |||
210 | old_fs = get_fs(); | ||
211 | set_fs(KERNEL_DS); | ||
212 | pagefault_disable(); | ||
213 | |||
214 | do { | ||
215 | ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1); | ||
216 | len++; | ||
217 | } while (c && ret == 0 && len < MAX_STRING_SIZE); | ||
218 | |||
219 | pagefault_enable(); | ||
220 | set_fs(old_fs); | ||
221 | |||
222 | if (ret < 0) /* Failed to check the length */ | ||
223 | *(u32 *)dest = 0; | ||
224 | else | ||
225 | *(u32 *)dest = len; | ||
226 | } | ||
227 | |||
228 | #define DEFINE_FETCH_symbol(type) \ | ||
229 | __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs, \ | ||
230 | void *data, void *dest) \ | ||
231 | { \ | ||
232 | struct symbol_cache *sc = data; \ | ||
233 | if (sc->addr) \ | ||
234 | fetch_memory_##type(regs, (void *)sc->addr, dest); \ | ||
235 | else \ | ||
236 | *(type *)dest = 0; \ | ||
237 | } | ||
238 | DEFINE_BASIC_FETCH_FUNCS(symbol) | ||
239 | DEFINE_FETCH_symbol(string) | ||
240 | DEFINE_FETCH_symbol(string_size) | ||
241 | |||
242 | /* kprobes don't support file_offset fetch methods */ | ||
243 | #define fetch_file_offset_u8 NULL | ||
244 | #define fetch_file_offset_u16 NULL | ||
245 | #define fetch_file_offset_u32 NULL | ||
246 | #define fetch_file_offset_u64 NULL | ||
247 | #define fetch_file_offset_string NULL | ||
248 | #define fetch_file_offset_string_size NULL | ||
249 | |||
250 | /* Fetch type information table */ | ||
251 | const struct fetch_type kprobes_fetch_type_table[] = { | ||
252 | /* Special types */ | ||
253 | [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, | ||
254 | sizeof(u32), 1, "__data_loc char[]"), | ||
255 | [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32, | ||
256 | string_size, sizeof(u32), 0, "u32"), | ||
257 | /* Basic types */ | ||
258 | ASSIGN_FETCH_TYPE(u8, u8, 0), | ||
259 | ASSIGN_FETCH_TYPE(u16, u16, 0), | ||
260 | ASSIGN_FETCH_TYPE(u32, u32, 0), | ||
261 | ASSIGN_FETCH_TYPE(u64, u64, 0), | ||
262 | ASSIGN_FETCH_TYPE(s8, u8, 1), | ||
263 | ASSIGN_FETCH_TYPE(s16, u16, 1), | ||
264 | ASSIGN_FETCH_TYPE(s32, u32, 1), | ||
265 | ASSIGN_FETCH_TYPE(s64, u64, 1), | ||
266 | |||
267 | ASSIGN_FETCH_TYPE_END | ||
268 | }; | ||
269 | |||
107 | /* | 270 | /* |
108 | * Allocate new trace_probe and initialize it (including kprobes). | 271 | * Allocate new trace_probe and initialize it (including kprobes). |
109 | */ | 272 | */ |
110 | static struct trace_probe *alloc_trace_probe(const char *group, | 273 | static struct trace_kprobe *alloc_trace_kprobe(const char *group, |
111 | const char *event, | 274 | const char *event, |
112 | void *addr, | 275 | void *addr, |
113 | const char *symbol, | 276 | const char *symbol, |
114 | unsigned long offs, | 277 | unsigned long offs, |
115 | int nargs, bool is_return) | 278 | int nargs, bool is_return) |
116 | { | 279 | { |
117 | struct trace_probe *tp; | 280 | struct trace_kprobe *tk; |
118 | int ret = -ENOMEM; | 281 | int ret = -ENOMEM; |
119 | 282 | ||
120 | tp = kzalloc(SIZEOF_TRACE_PROBE(nargs), GFP_KERNEL); | 283 | tk = kzalloc(SIZEOF_TRACE_KPROBE(nargs), GFP_KERNEL); |
121 | if (!tp) | 284 | if (!tk) |
122 | return ERR_PTR(ret); | 285 | return ERR_PTR(ret); |
123 | 286 | ||
124 | if (symbol) { | 287 | if (symbol) { |
125 | tp->symbol = kstrdup(symbol, GFP_KERNEL); | 288 | tk->symbol = kstrdup(symbol, GFP_KERNEL); |
126 | if (!tp->symbol) | 289 | if (!tk->symbol) |
127 | goto error; | 290 | goto error; |
128 | tp->rp.kp.symbol_name = tp->symbol; | 291 | tk->rp.kp.symbol_name = tk->symbol; |
129 | tp->rp.kp.offset = offs; | 292 | tk->rp.kp.offset = offs; |
130 | } else | 293 | } else |
131 | tp->rp.kp.addr = addr; | 294 | tk->rp.kp.addr = addr; |
132 | 295 | ||
133 | if (is_return) | 296 | if (is_return) |
134 | tp->rp.handler = kretprobe_dispatcher; | 297 | tk->rp.handler = kretprobe_dispatcher; |
135 | else | 298 | else |
136 | tp->rp.kp.pre_handler = kprobe_dispatcher; | 299 | tk->rp.kp.pre_handler = kprobe_dispatcher; |
137 | 300 | ||
138 | if (!event || !is_good_name(event)) { | 301 | if (!event || !is_good_name(event)) { |
139 | ret = -EINVAL; | 302 | ret = -EINVAL; |
140 | goto error; | 303 | goto error; |
141 | } | 304 | } |
142 | 305 | ||
143 | tp->call.class = &tp->class; | 306 | tk->tp.call.class = &tk->tp.class; |
144 | tp->call.name = kstrdup(event, GFP_KERNEL); | 307 | tk->tp.call.name = kstrdup(event, GFP_KERNEL); |
145 | if (!tp->call.name) | 308 | if (!tk->tp.call.name) |
146 | goto error; | 309 | goto error; |
147 | 310 | ||
148 | if (!group || !is_good_name(group)) { | 311 | if (!group || !is_good_name(group)) { |
@@ -150,42 +313,42 @@ static struct trace_probe *alloc_trace_probe(const char *group, | |||
150 | goto error; | 313 | goto error; |
151 | } | 314 | } |
152 | 315 | ||
153 | tp->class.system = kstrdup(group, GFP_KERNEL); | 316 | tk->tp.class.system = kstrdup(group, GFP_KERNEL); |
154 | if (!tp->class.system) | 317 | if (!tk->tp.class.system) |
155 | goto error; | 318 | goto error; |
156 | 319 | ||
157 | INIT_LIST_HEAD(&tp->list); | 320 | INIT_LIST_HEAD(&tk->list); |
158 | INIT_LIST_HEAD(&tp->files); | 321 | INIT_LIST_HEAD(&tk->tp.files); |
159 | return tp; | 322 | return tk; |
160 | error: | 323 | error: |
161 | kfree(tp->call.name); | 324 | kfree(tk->tp.call.name); |
162 | kfree(tp->symbol); | 325 | kfree(tk->symbol); |
163 | kfree(tp); | 326 | kfree(tk); |
164 | return ERR_PTR(ret); | 327 | return ERR_PTR(ret); |
165 | } | 328 | } |
166 | 329 | ||
167 | static void free_trace_probe(struct trace_probe *tp) | 330 | static void free_trace_kprobe(struct trace_kprobe *tk) |
168 | { | 331 | { |
169 | int i; | 332 | int i; |
170 | 333 | ||
171 | for (i = 0; i < tp->nr_args; i++) | 334 | for (i = 0; i < tk->tp.nr_args; i++) |
172 | traceprobe_free_probe_arg(&tp->args[i]); | 335 | traceprobe_free_probe_arg(&tk->tp.args[i]); |
173 | 336 | ||
174 | kfree(tp->call.class->system); | 337 | kfree(tk->tp.call.class->system); |
175 | kfree(tp->call.name); | 338 | kfree(tk->tp.call.name); |
176 | kfree(tp->symbol); | 339 | kfree(tk->symbol); |
177 | kfree(tp); | 340 | kfree(tk); |
178 | } | 341 | } |
179 | 342 | ||
180 | static struct trace_probe *find_trace_probe(const char *event, | 343 | static struct trace_kprobe *find_trace_kprobe(const char *event, |
181 | const char *group) | 344 | const char *group) |
182 | { | 345 | { |
183 | struct trace_probe *tp; | 346 | struct trace_kprobe *tk; |
184 | 347 | ||
185 | list_for_each_entry(tp, &probe_list, list) | 348 | list_for_each_entry(tk, &probe_list, list) |
186 | if (strcmp(tp->call.name, event) == 0 && | 349 | if (strcmp(tk->tp.call.name, event) == 0 && |
187 | strcmp(tp->call.class->system, group) == 0) | 350 | strcmp(tk->tp.call.class->system, group) == 0) |
188 | return tp; | 351 | return tk; |
189 | return NULL; | 352 | return NULL; |
190 | } | 353 | } |
191 | 354 | ||
@@ -194,7 +357,7 @@ static struct trace_probe *find_trace_probe(const char *event, | |||
194 | * if the file is NULL, enable "perf" handler, or enable "trace" handler. | 357 | * if the file is NULL, enable "perf" handler, or enable "trace" handler. |
195 | */ | 358 | */ |
196 | static int | 359 | static int |
197 | enable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) | 360 | enable_trace_kprobe(struct trace_kprobe *tk, struct ftrace_event_file *file) |
198 | { | 361 | { |
199 | int ret = 0; | 362 | int ret = 0; |
200 | 363 | ||
@@ -208,17 +371,17 @@ enable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) | |||
208 | } | 371 | } |
209 | 372 | ||
210 | link->file = file; | 373 | link->file = file; |
211 | list_add_tail_rcu(&link->list, &tp->files); | 374 | list_add_tail_rcu(&link->list, &tk->tp.files); |
212 | 375 | ||
213 | tp->flags |= TP_FLAG_TRACE; | 376 | tk->tp.flags |= TP_FLAG_TRACE; |
214 | } else | 377 | } else |
215 | tp->flags |= TP_FLAG_PROFILE; | 378 | tk->tp.flags |= TP_FLAG_PROFILE; |
216 | 379 | ||
217 | if (trace_probe_is_registered(tp) && !trace_probe_has_gone(tp)) { | 380 | if (trace_probe_is_registered(&tk->tp) && !trace_kprobe_has_gone(tk)) { |
218 | if (trace_probe_is_return(tp)) | 381 | if (trace_kprobe_is_return(tk)) |
219 | ret = enable_kretprobe(&tp->rp); | 382 | ret = enable_kretprobe(&tk->rp); |
220 | else | 383 | else |
221 | ret = enable_kprobe(&tp->rp.kp); | 384 | ret = enable_kprobe(&tk->rp.kp); |
222 | } | 385 | } |
223 | out: | 386 | out: |
224 | return ret; | 387 | return ret; |
@@ -241,14 +404,14 @@ find_event_file_link(struct trace_probe *tp, struct ftrace_event_file *file) | |||
241 | * if the file is NULL, disable "perf" handler, or disable "trace" handler. | 404 | * if the file is NULL, disable "perf" handler, or disable "trace" handler. |
242 | */ | 405 | */ |
243 | static int | 406 | static int |
244 | disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) | 407 | disable_trace_kprobe(struct trace_kprobe *tk, struct ftrace_event_file *file) |
245 | { | 408 | { |
246 | struct event_file_link *link = NULL; | 409 | struct event_file_link *link = NULL; |
247 | int wait = 0; | 410 | int wait = 0; |
248 | int ret = 0; | 411 | int ret = 0; |
249 | 412 | ||
250 | if (file) { | 413 | if (file) { |
251 | link = find_event_file_link(tp, file); | 414 | link = find_event_file_link(&tk->tp, file); |
252 | if (!link) { | 415 | if (!link) { |
253 | ret = -EINVAL; | 416 | ret = -EINVAL; |
254 | goto out; | 417 | goto out; |
@@ -256,18 +419,18 @@ disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) | |||
256 | 419 | ||
257 | list_del_rcu(&link->list); | 420 | list_del_rcu(&link->list); |
258 | wait = 1; | 421 | wait = 1; |
259 | if (!list_empty(&tp->files)) | 422 | if (!list_empty(&tk->tp.files)) |
260 | goto out; | 423 | goto out; |
261 | 424 | ||
262 | tp->flags &= ~TP_FLAG_TRACE; | 425 | tk->tp.flags &= ~TP_FLAG_TRACE; |
263 | } else | 426 | } else |
264 | tp->flags &= ~TP_FLAG_PROFILE; | 427 | tk->tp.flags &= ~TP_FLAG_PROFILE; |
265 | 428 | ||
266 | if (!trace_probe_is_enabled(tp) && trace_probe_is_registered(tp)) { | 429 | if (!trace_probe_is_enabled(&tk->tp) && trace_probe_is_registered(&tk->tp)) { |
267 | if (trace_probe_is_return(tp)) | 430 | if (trace_kprobe_is_return(tk)) |
268 | disable_kretprobe(&tp->rp); | 431 | disable_kretprobe(&tk->rp); |
269 | else | 432 | else |
270 | disable_kprobe(&tp->rp.kp); | 433 | disable_kprobe(&tk->rp.kp); |
271 | wait = 1; | 434 | wait = 1; |
272 | } | 435 | } |
273 | out: | 436 | out: |
@@ -288,40 +451,40 @@ disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) | |||
288 | } | 451 | } |
289 | 452 | ||
290 | /* Internal register function - just handle k*probes and flags */ | 453 | /* Internal register function - just handle k*probes and flags */ |
291 | static int __register_trace_probe(struct trace_probe *tp) | 454 | static int __register_trace_kprobe(struct trace_kprobe *tk) |
292 | { | 455 | { |
293 | int i, ret; | 456 | int i, ret; |
294 | 457 | ||
295 | if (trace_probe_is_registered(tp)) | 458 | if (trace_probe_is_registered(&tk->tp)) |
296 | return -EINVAL; | 459 | return -EINVAL; |
297 | 460 | ||
298 | for (i = 0; i < tp->nr_args; i++) | 461 | for (i = 0; i < tk->tp.nr_args; i++) |
299 | traceprobe_update_arg(&tp->args[i]); | 462 | traceprobe_update_arg(&tk->tp.args[i]); |
300 | 463 | ||
301 | /* Set/clear disabled flag according to tp->flag */ | 464 | /* Set/clear disabled flag according to tp->flag */ |
302 | if (trace_probe_is_enabled(tp)) | 465 | if (trace_probe_is_enabled(&tk->tp)) |
303 | tp->rp.kp.flags &= ~KPROBE_FLAG_DISABLED; | 466 | tk->rp.kp.flags &= ~KPROBE_FLAG_DISABLED; |
304 | else | 467 | else |
305 | tp->rp.kp.flags |= KPROBE_FLAG_DISABLED; | 468 | tk->rp.kp.flags |= KPROBE_FLAG_DISABLED; |
306 | 469 | ||
307 | if (trace_probe_is_return(tp)) | 470 | if (trace_kprobe_is_return(tk)) |
308 | ret = register_kretprobe(&tp->rp); | 471 | ret = register_kretprobe(&tk->rp); |
309 | else | 472 | else |
310 | ret = register_kprobe(&tp->rp.kp); | 473 | ret = register_kprobe(&tk->rp.kp); |
311 | 474 | ||
312 | if (ret == 0) | 475 | if (ret == 0) |
313 | tp->flags |= TP_FLAG_REGISTERED; | 476 | tk->tp.flags |= TP_FLAG_REGISTERED; |
314 | else { | 477 | else { |
315 | pr_warning("Could not insert probe at %s+%lu: %d\n", | 478 | pr_warning("Could not insert probe at %s+%lu: %d\n", |
316 | trace_probe_symbol(tp), trace_probe_offset(tp), ret); | 479 | trace_kprobe_symbol(tk), trace_kprobe_offset(tk), ret); |
317 | if (ret == -ENOENT && trace_probe_is_on_module(tp)) { | 480 | if (ret == -ENOENT && trace_kprobe_is_on_module(tk)) { |
318 | pr_warning("This probe might be able to register after" | 481 | pr_warning("This probe might be able to register after" |
319 | "target module is loaded. Continue.\n"); | 482 | "target module is loaded. Continue.\n"); |
320 | ret = 0; | 483 | ret = 0; |
321 | } else if (ret == -EILSEQ) { | 484 | } else if (ret == -EILSEQ) { |
322 | pr_warning("Probing address(0x%p) is not an " | 485 | pr_warning("Probing address(0x%p) is not an " |
323 | "instruction boundary.\n", | 486 | "instruction boundary.\n", |
324 | tp->rp.kp.addr); | 487 | tk->rp.kp.addr); |
325 | ret = -EINVAL; | 488 | ret = -EINVAL; |
326 | } | 489 | } |
327 | } | 490 | } |
@@ -330,67 +493,67 @@ static int __register_trace_probe(struct trace_probe *tp) | |||
330 | } | 493 | } |
331 | 494 | ||
332 | /* Internal unregister function - just handle k*probes and flags */ | 495 | /* Internal unregister function - just handle k*probes and flags */ |
333 | static void __unregister_trace_probe(struct trace_probe *tp) | 496 | static void __unregister_trace_kprobe(struct trace_kprobe *tk) |
334 | { | 497 | { |
335 | if (trace_probe_is_registered(tp)) { | 498 | if (trace_probe_is_registered(&tk->tp)) { |
336 | if (trace_probe_is_return(tp)) | 499 | if (trace_kprobe_is_return(tk)) |
337 | unregister_kretprobe(&tp->rp); | 500 | unregister_kretprobe(&tk->rp); |
338 | else | 501 | else |
339 | unregister_kprobe(&tp->rp.kp); | 502 | unregister_kprobe(&tk->rp.kp); |
340 | tp->flags &= ~TP_FLAG_REGISTERED; | 503 | tk->tp.flags &= ~TP_FLAG_REGISTERED; |
341 | /* Cleanup kprobe for reuse */ | 504 | /* Cleanup kprobe for reuse */ |
342 | if (tp->rp.kp.symbol_name) | 505 | if (tk->rp.kp.symbol_name) |
343 | tp->rp.kp.addr = NULL; | 506 | tk->rp.kp.addr = NULL; |
344 | } | 507 | } |
345 | } | 508 | } |
346 | 509 | ||
347 | /* Unregister a trace_probe and probe_event: call with locking probe_lock */ | 510 | /* Unregister a trace_probe and probe_event: call with locking probe_lock */ |
348 | static int unregister_trace_probe(struct trace_probe *tp) | 511 | static int unregister_trace_kprobe(struct trace_kprobe *tk) |
349 | { | 512 | { |
350 | /* Enabled event can not be unregistered */ | 513 | /* Enabled event can not be unregistered */ |
351 | if (trace_probe_is_enabled(tp)) | 514 | if (trace_probe_is_enabled(&tk->tp)) |
352 | return -EBUSY; | 515 | return -EBUSY; |
353 | 516 | ||
354 | /* Will fail if probe is being used by ftrace or perf */ | 517 | /* Will fail if probe is being used by ftrace or perf */ |
355 | if (unregister_probe_event(tp)) | 518 | if (unregister_kprobe_event(tk)) |
356 | return -EBUSY; | 519 | return -EBUSY; |
357 | 520 | ||
358 | __unregister_trace_probe(tp); | 521 | __unregister_trace_kprobe(tk); |
359 | list_del(&tp->list); | 522 | list_del(&tk->list); |
360 | 523 | ||
361 | return 0; | 524 | return 0; |
362 | } | 525 | } |
363 | 526 | ||
364 | /* Register a trace_probe and probe_event */ | 527 | /* Register a trace_probe and probe_event */ |
365 | static int register_trace_probe(struct trace_probe *tp) | 528 | static int register_trace_kprobe(struct trace_kprobe *tk) |
366 | { | 529 | { |
367 | struct trace_probe *old_tp; | 530 | struct trace_kprobe *old_tk; |
368 | int ret; | 531 | int ret; |
369 | 532 | ||
370 | mutex_lock(&probe_lock); | 533 | mutex_lock(&probe_lock); |
371 | 534 | ||
372 | /* Delete old (same name) event if exist */ | 535 | /* Delete old (same name) event if exist */ |
373 | old_tp = find_trace_probe(tp->call.name, tp->call.class->system); | 536 | old_tk = find_trace_kprobe(tk->tp.call.name, tk->tp.call.class->system); |
374 | if (old_tp) { | 537 | if (old_tk) { |
375 | ret = unregister_trace_probe(old_tp); | 538 | ret = unregister_trace_kprobe(old_tk); |
376 | if (ret < 0) | 539 | if (ret < 0) |
377 | goto end; | 540 | goto end; |
378 | free_trace_probe(old_tp); | 541 | free_trace_kprobe(old_tk); |
379 | } | 542 | } |
380 | 543 | ||
381 | /* Register new event */ | 544 | /* Register new event */ |
382 | ret = register_probe_event(tp); | 545 | ret = register_kprobe_event(tk); |
383 | if (ret) { | 546 | if (ret) { |
384 | pr_warning("Failed to register probe event(%d)\n", ret); | 547 | pr_warning("Failed to register probe event(%d)\n", ret); |
385 | goto end; | 548 | goto end; |
386 | } | 549 | } |
387 | 550 | ||
388 | /* Register k*probe */ | 551 | /* Register k*probe */ |
389 | ret = __register_trace_probe(tp); | 552 | ret = __register_trace_kprobe(tk); |
390 | if (ret < 0) | 553 | if (ret < 0) |
391 | unregister_probe_event(tp); | 554 | unregister_kprobe_event(tk); |
392 | else | 555 | else |
393 | list_add_tail(&tp->list, &probe_list); | 556 | list_add_tail(&tk->list, &probe_list); |
394 | 557 | ||
395 | end: | 558 | end: |
396 | mutex_unlock(&probe_lock); | 559 | mutex_unlock(&probe_lock); |
@@ -398,11 +561,11 @@ end: | |||
398 | } | 561 | } |
399 | 562 | ||
400 | /* Module notifier call back, checking event on the module */ | 563 | /* Module notifier call back, checking event on the module */ |
401 | static int trace_probe_module_callback(struct notifier_block *nb, | 564 | static int trace_kprobe_module_callback(struct notifier_block *nb, |
402 | unsigned long val, void *data) | 565 | unsigned long val, void *data) |
403 | { | 566 | { |
404 | struct module *mod = data; | 567 | struct module *mod = data; |
405 | struct trace_probe *tp; | 568 | struct trace_kprobe *tk; |
406 | int ret; | 569 | int ret; |
407 | 570 | ||
408 | if (val != MODULE_STATE_COMING) | 571 | if (val != MODULE_STATE_COMING) |
@@ -410,15 +573,15 @@ static int trace_probe_module_callback(struct notifier_block *nb, | |||
410 | 573 | ||
411 | /* Update probes on coming module */ | 574 | /* Update probes on coming module */ |
412 | mutex_lock(&probe_lock); | 575 | mutex_lock(&probe_lock); |
413 | list_for_each_entry(tp, &probe_list, list) { | 576 | list_for_each_entry(tk, &probe_list, list) { |
414 | if (trace_probe_within_module(tp, mod)) { | 577 | if (trace_kprobe_within_module(tk, mod)) { |
415 | /* Don't need to check busy - this should have gone. */ | 578 | /* Don't need to check busy - this should have gone. */ |
416 | __unregister_trace_probe(tp); | 579 | __unregister_trace_kprobe(tk); |
417 | ret = __register_trace_probe(tp); | 580 | ret = __register_trace_kprobe(tk); |
418 | if (ret) | 581 | if (ret) |
419 | pr_warning("Failed to re-register probe %s on" | 582 | pr_warning("Failed to re-register probe %s on" |
420 | "%s: %d\n", | 583 | "%s: %d\n", |
421 | tp->call.name, mod->name, ret); | 584 | tk->tp.call.name, mod->name, ret); |
422 | } | 585 | } |
423 | } | 586 | } |
424 | mutex_unlock(&probe_lock); | 587 | mutex_unlock(&probe_lock); |
@@ -426,12 +589,12 @@ static int trace_probe_module_callback(struct notifier_block *nb, | |||
426 | return NOTIFY_DONE; | 589 | return NOTIFY_DONE; |
427 | } | 590 | } |
428 | 591 | ||
429 | static struct notifier_block trace_probe_module_nb = { | 592 | static struct notifier_block trace_kprobe_module_nb = { |
430 | .notifier_call = trace_probe_module_callback, | 593 | .notifier_call = trace_kprobe_module_callback, |
431 | .priority = 1 /* Invoked after kprobe module callback */ | 594 | .priority = 1 /* Invoked after kprobe module callback */ |
432 | }; | 595 | }; |
433 | 596 | ||
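
The module notifier above only changes names in this patch; its job is unchanged: when a module reaches MODULE_STATE_COMING, re-register any probe whose target symbol lives inside that module. A rough userspace sketch of the callback shape, with the kernel types mocked purely to show the control flow:

#include <stdio.h>

/* Mocked kernel types and constants, for illustration only. */
#define MODULE_STATE_COMING	1
#define NOTIFY_DONE		0

struct notifier_block {
	int (*notifier_call)(struct notifier_block *nb,
			     unsigned long val, void *data);
	int priority;
};

struct module { const char *name; };

static int example_module_callback(struct notifier_block *nb,
				   unsigned long val, void *data)
{
	struct module *mod = data;

	if (val != MODULE_STATE_COMING)
		return NOTIFY_DONE;

	/* The real callback walks probe_list here and re-registers every
	 * probe that falls inside the arriving module. */
	printf("module %s coming: re-register matching probes\n", mod->name);
	return NOTIFY_DONE;
}

static struct notifier_block example_nb = {
	.notifier_call	= example_module_callback,
	.priority	= 1,	/* run after the kprobe core's own callback */
};

int main(void)
{
	struct module m = { .name = "ext4" };

	return example_nb.notifier_call(&example_nb, MODULE_STATE_COMING, &m);
}
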
434 | static int create_trace_probe(int argc, char **argv) | 597 | static int create_trace_kprobe(int argc, char **argv) |
435 | { | 598 | { |
436 | /* | 599 | /* |
437 | * Argument syntax: | 600 | * Argument syntax: |
@@ -451,7 +614,7 @@ static int create_trace_probe(int argc, char **argv) | |||
451 | * Type of args: | 614 | * Type of args: |
452 | * FETCHARG:TYPE : use TYPE instead of unsigned long. | 615 | * FETCHARG:TYPE : use TYPE instead of unsigned long. |
453 | */ | 616 | */ |
454 | struct trace_probe *tp; | 617 | struct trace_kprobe *tk; |
455 | int i, ret = 0; | 618 | int i, ret = 0; |
456 | bool is_return = false, is_delete = false; | 619 | bool is_return = false, is_delete = false; |
457 | char *symbol = NULL, *event = NULL, *group = NULL; | 620 | char *symbol = NULL, *event = NULL, *group = NULL; |
@@ -498,16 +661,16 @@ static int create_trace_probe(int argc, char **argv) | |||
498 | return -EINVAL; | 661 | return -EINVAL; |
499 | } | 662 | } |
500 | mutex_lock(&probe_lock); | 663 | mutex_lock(&probe_lock); |
501 | tp = find_trace_probe(event, group); | 664 | tk = find_trace_kprobe(event, group); |
502 | if (!tp) { | 665 | if (!tk) { |
503 | mutex_unlock(&probe_lock); | 666 | mutex_unlock(&probe_lock); |
504 | pr_info("Event %s/%s doesn't exist.\n", group, event); | 667 | pr_info("Event %s/%s doesn't exist.\n", group, event); |
505 | return -ENOENT; | 668 | return -ENOENT; |
506 | } | 669 | } |
507 | /* delete an event */ | 670 | /* delete an event */ |
508 | ret = unregister_trace_probe(tp); | 671 | ret = unregister_trace_kprobe(tk); |
509 | if (ret == 0) | 672 | if (ret == 0) |
510 | free_trace_probe(tp); | 673 | free_trace_kprobe(tk); |
511 | mutex_unlock(&probe_lock); | 674 | mutex_unlock(&probe_lock); |
512 | return ret; | 675 | return ret; |
513 | } | 676 | } |
@@ -554,47 +717,49 @@ static int create_trace_probe(int argc, char **argv) | |||
554 | is_return ? 'r' : 'p', addr); | 717 | is_return ? 'r' : 'p', addr); |
555 | event = buf; | 718 | event = buf; |
556 | } | 719 | } |
557 | tp = alloc_trace_probe(group, event, addr, symbol, offset, argc, | 720 | tk = alloc_trace_kprobe(group, event, addr, symbol, offset, argc, |
558 | is_return); | 721 | is_return); |
559 | if (IS_ERR(tp)) { | 722 | if (IS_ERR(tk)) { |
560 | pr_info("Failed to allocate trace_probe.(%d)\n", | 723 | pr_info("Failed to allocate trace_probe.(%d)\n", |
561 | (int)PTR_ERR(tp)); | 724 | (int)PTR_ERR(tk)); |
562 | return PTR_ERR(tp); | 725 | return PTR_ERR(tk); |
563 | } | 726 | } |
564 | 727 | ||
565 | /* parse arguments */ | 728 | /* parse arguments */ |
566 | ret = 0; | 729 | ret = 0; |
567 | for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { | 730 | for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { |
731 | struct probe_arg *parg = &tk->tp.args[i]; | ||
732 | |||
568 | /* Increment count for freeing args in error case */ | 733 | /* Increment count for freeing args in error case */ |
569 | tp->nr_args++; | 734 | tk->tp.nr_args++; |
570 | 735 | ||
571 | /* Parse argument name */ | 736 | /* Parse argument name */ |
572 | arg = strchr(argv[i], '='); | 737 | arg = strchr(argv[i], '='); |
573 | if (arg) { | 738 | if (arg) { |
574 | *arg++ = '\0'; | 739 | *arg++ = '\0'; |
575 | tp->args[i].name = kstrdup(argv[i], GFP_KERNEL); | 740 | parg->name = kstrdup(argv[i], GFP_KERNEL); |
576 | } else { | 741 | } else { |
577 | arg = argv[i]; | 742 | arg = argv[i]; |
578 | /* If argument name is omitted, set "argN" */ | 743 | /* If argument name is omitted, set "argN" */ |
579 | snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1); | 744 | snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1); |
580 | tp->args[i].name = kstrdup(buf, GFP_KERNEL); | 745 | parg->name = kstrdup(buf, GFP_KERNEL); |
581 | } | 746 | } |
582 | 747 | ||
583 | if (!tp->args[i].name) { | 748 | if (!parg->name) { |
584 | pr_info("Failed to allocate argument[%d] name.\n", i); | 749 | pr_info("Failed to allocate argument[%d] name.\n", i); |
585 | ret = -ENOMEM; | 750 | ret = -ENOMEM; |
586 | goto error; | 751 | goto error; |
587 | } | 752 | } |
588 | 753 | ||
589 | if (!is_good_name(tp->args[i].name)) { | 754 | if (!is_good_name(parg->name)) { |
590 | pr_info("Invalid argument[%d] name: %s\n", | 755 | pr_info("Invalid argument[%d] name: %s\n", |
591 | i, tp->args[i].name); | 756 | i, parg->name); |
592 | ret = -EINVAL; | 757 | ret = -EINVAL; |
593 | goto error; | 758 | goto error; |
594 | } | 759 | } |
595 | 760 | ||
596 | if (traceprobe_conflict_field_name(tp->args[i].name, | 761 | if (traceprobe_conflict_field_name(parg->name, |
597 | tp->args, i)) { | 762 | tk->tp.args, i)) { |
598 | pr_info("Argument[%d] name '%s' conflicts with " | 763 | pr_info("Argument[%d] name '%s' conflicts with " |
599 | "another field.\n", i, argv[i]); | 764 | "another field.\n", i, argv[i]); |
600 | ret = -EINVAL; | 765 | ret = -EINVAL; |
@@ -602,7 +767,7 @@ static int create_trace_probe(int argc, char **argv) | |||
602 | } | 767 | } |
603 | 768 | ||
604 | /* Parse fetch argument */ | 769 | /* Parse fetch argument */ |
605 | ret = traceprobe_parse_probe_arg(arg, &tp->size, &tp->args[i], | 770 | ret = traceprobe_parse_probe_arg(arg, &tk->tp.size, parg, |
606 | is_return, true); | 771 | is_return, true); |
607 | if (ret) { | 772 | if (ret) { |
608 | pr_info("Parse error at argument[%d]. (%d)\n", i, ret); | 773 | pr_info("Parse error at argument[%d]. (%d)\n", i, ret); |
@@ -610,35 +775,35 @@ static int create_trace_probe(int argc, char **argv) | |||
610 | } | 775 | } |
611 | } | 776 | } |
612 | 777 | ||
613 | ret = register_trace_probe(tp); | 778 | ret = register_trace_kprobe(tk); |
614 | if (ret) | 779 | if (ret) |
615 | goto error; | 780 | goto error; |
616 | return 0; | 781 | return 0; |
617 | 782 | ||
618 | error: | 783 | error: |
619 | free_trace_probe(tp); | 784 | free_trace_kprobe(tk); |
620 | return ret; | 785 | return ret; |
621 | } | 786 | } |
622 | 787 | ||
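
Inside the argument-parsing loop, the new code takes a local struct probe_arg *parg alias instead of spelling out tk->tp.args[i] each time, and it keeps the old trick of incrementing nr_args before filling the slot so the error path frees exactly the slots that were counted (freeing a still-NULL name is harmless). A generic sketch of that count-then-clean-up pattern, not the kernel structures themselves:

#include <stdlib.h>
#include <string.h>

struct arg { char *name; };
struct probe { unsigned int nr_args; struct arg args[8]; };

/* Free only the slots that were counted; free(NULL) is a no-op. */
static void free_probe_args(struct probe *p)
{
	unsigned int i;

	for (i = 0; i < p->nr_args; i++)
		free(p->args[i].name);
	p->nr_args = 0;
}

static int parse_args(struct probe *p, int argc, char **argv)
{
	int i;

	for (i = 0; i < argc && i < 8; i++) {
		struct arg *parg = &p->args[i];	/* alias, like parg above */

		p->nr_args++;			/* count first, clean up later */
		parg->name = strdup(argv[i]);
		if (!parg->name) {
			free_probe_args(p);
			return -1;
		}
	}
	return 0;
}

int main(void)
{
	char *argv[] = { "dfd", "filename", "flags" };
	struct probe p = { 0 };
	int ret = parse_args(&p, 3, argv);

	free_probe_args(&p);
	return ret;
}
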
623 | static int release_all_trace_probes(void) | 788 | static int release_all_trace_kprobes(void) |
624 | { | 789 | { |
625 | struct trace_probe *tp; | 790 | struct trace_kprobe *tk; |
626 | int ret = 0; | 791 | int ret = 0; |
627 | 792 | ||
628 | mutex_lock(&probe_lock); | 793 | mutex_lock(&probe_lock); |
629 | /* Ensure no probe is in use. */ | 794 | /* Ensure no probe is in use. */ |
630 | list_for_each_entry(tp, &probe_list, list) | 795 | list_for_each_entry(tk, &probe_list, list) |
631 | if (trace_probe_is_enabled(tp)) { | 796 | if (trace_probe_is_enabled(&tk->tp)) { |
632 | ret = -EBUSY; | 797 | ret = -EBUSY; |
633 | goto end; | 798 | goto end; |
634 | } | 799 | } |
635 | /* TODO: Use batch unregistration */ | 800 | /* TODO: Use batch unregistration */ |
636 | while (!list_empty(&probe_list)) { | 801 | while (!list_empty(&probe_list)) { |
637 | tp = list_entry(probe_list.next, struct trace_probe, list); | 802 | tk = list_entry(probe_list.next, struct trace_kprobe, list); |
638 | ret = unregister_trace_probe(tp); | 803 | ret = unregister_trace_kprobe(tk); |
639 | if (ret) | 804 | if (ret) |
640 | goto end; | 805 | goto end; |
641 | free_trace_probe(tp); | 806 | free_trace_kprobe(tk); |
642 | } | 807 | } |
643 | 808 | ||
644 | end: | 809 | end: |
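
release_all_trace_kprobes() keeps the same two-phase shape as before: first scan the list and refuse with -EBUSY if any probe is still enabled, then pop entries one at a time, unregistering before freeing, and stop early if an unregistration fails. A simplified, lock-free userspace analogue of that drain pattern (the kernel version holds probe_lock around both phases):

#include <stdlib.h>

struct node { struct node *next; int busy; };

static int release_all(struct node **head)
{
	struct node *n;

	/* Phase 1: make sure nothing is still in use. */
	for (n = *head; n; n = n->next)
		if (n->busy)
			return -1;	/* stands in for -EBUSY */

	/* Phase 2: unlink ("unregister") and free one entry at a time. */
	while ((n = *head) != NULL) {
		*head = n->next;
		free(n);
	}
	return 0;
}

int main(void)
{
	struct node *head = calloc(1, sizeof(*head));

	return head ? release_all(&head) : 1;
}
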
@@ -666,22 +831,22 @@ static void probes_seq_stop(struct seq_file *m, void *v) | |||
666 | 831 | ||
667 | static int probes_seq_show(struct seq_file *m, void *v) | 832 | static int probes_seq_show(struct seq_file *m, void *v) |
668 | { | 833 | { |
669 | struct trace_probe *tp = v; | 834 | struct trace_kprobe *tk = v; |
670 | int i; | 835 | int i; |
671 | 836 | ||
672 | seq_printf(m, "%c", trace_probe_is_return(tp) ? 'r' : 'p'); | 837 | seq_printf(m, "%c", trace_kprobe_is_return(tk) ? 'r' : 'p'); |
673 | seq_printf(m, ":%s/%s", tp->call.class->system, tp->call.name); | 838 | seq_printf(m, ":%s/%s", tk->tp.call.class->system, tk->tp.call.name); |
674 | 839 | ||
675 | if (!tp->symbol) | 840 | if (!tk->symbol) |
676 | seq_printf(m, " 0x%p", tp->rp.kp.addr); | 841 | seq_printf(m, " 0x%p", tk->rp.kp.addr); |
677 | else if (tp->rp.kp.offset) | 842 | else if (tk->rp.kp.offset) |
678 | seq_printf(m, " %s+%u", trace_probe_symbol(tp), | 843 | seq_printf(m, " %s+%u", trace_kprobe_symbol(tk), |
679 | tp->rp.kp.offset); | 844 | tk->rp.kp.offset); |
680 | else | 845 | else |
681 | seq_printf(m, " %s", trace_probe_symbol(tp)); | 846 | seq_printf(m, " %s", trace_kprobe_symbol(tk)); |
682 | 847 | ||
683 | for (i = 0; i < tp->nr_args; i++) | 848 | for (i = 0; i < tk->tp.nr_args; i++) |
684 | seq_printf(m, " %s=%s", tp->args[i].name, tp->args[i].comm); | 849 | seq_printf(m, " %s=%s", tk->tp.args[i].name, tk->tp.args[i].comm); |
685 | seq_printf(m, "\n"); | 850 | seq_printf(m, "\n"); |
686 | 851 | ||
687 | return 0; | 852 | return 0; |
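
probes_seq_show() is what formats each line of the kprobe_events file: a 'p' or 'r' marker, the group/event pair, the probe location (raw address, or symbol with optional +offset), and then one name=fetcharg pair per argument. A small printf sketch of the same layout, with made-up values:

#include <stdio.h>

int main(void)
{
	const char *group = "kprobes", *event = "myopen";
	const char *symbol = "do_sys_open";
	unsigned int offset = 0;
	int is_return = 0;

	printf("%c:%s/%s", is_return ? 'r' : 'p', group, event);
	if (offset)
		printf(" %s+%u", symbol, offset);
	else
		printf(" %s", symbol);
	/* one " name=fetcharg" pair per argument */
	printf(" %s=%s %s=%s\n", "dfd", "%ax", "filename", "+0(%si)");
	return 0;
}
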
@@ -699,7 +864,7 @@ static int probes_open(struct inode *inode, struct file *file) | |||
699 | int ret; | 864 | int ret; |
700 | 865 | ||
701 | if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { | 866 | if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { |
702 | ret = release_all_trace_probes(); | 867 | ret = release_all_trace_kprobes(); |
703 | if (ret < 0) | 868 | if (ret < 0) |
704 | return ret; | 869 | return ret; |
705 | } | 870 | } |
@@ -711,7 +876,7 @@ static ssize_t probes_write(struct file *file, const char __user *buffer, | |||
711 | size_t count, loff_t *ppos) | 876 | size_t count, loff_t *ppos) |
712 | { | 877 | { |
713 | return traceprobe_probes_write(file, buffer, count, ppos, | 878 | return traceprobe_probes_write(file, buffer, count, ppos, |
714 | create_trace_probe); | 879 | create_trace_kprobe); |
715 | } | 880 | } |
716 | 881 | ||
717 | static const struct file_operations kprobe_events_ops = { | 882 | static const struct file_operations kprobe_events_ops = { |
@@ -726,10 +891,10 @@ static const struct file_operations kprobe_events_ops = { | |||
726 | /* Probes profiling interfaces */ | 891 | /* Probes profiling interfaces */ |
727 | static int probes_profile_seq_show(struct seq_file *m, void *v) | 892 | static int probes_profile_seq_show(struct seq_file *m, void *v) |
728 | { | 893 | { |
729 | struct trace_probe *tp = v; | 894 | struct trace_kprobe *tk = v; |
730 | 895 | ||
731 | seq_printf(m, " %-44s %15lu %15lu\n", tp->call.name, tp->nhit, | 896 | seq_printf(m, " %-44s %15lu %15lu\n", tk->tp.call.name, tk->nhit, |
732 | tp->rp.kp.nmissed); | 897 | tk->rp.kp.nmissed); |
733 | 898 | ||
734 | return 0; | 899 | return 0; |
735 | } | 900 | } |
@@ -754,57 +919,9 @@ static const struct file_operations kprobe_profile_ops = { | |||
754 | .release = seq_release, | 919 | .release = seq_release, |
755 | }; | 920 | }; |
756 | 921 | ||
757 | /* Sum up total data length for dynamic arrays (strings) */ | ||
758 | static __kprobes int __get_data_size(struct trace_probe *tp, | ||
759 | struct pt_regs *regs) | ||
760 | { | ||
761 | int i, ret = 0; | ||
762 | u32 len; | ||
763 | |||
764 | for (i = 0; i < tp->nr_args; i++) | ||
765 | if (unlikely(tp->args[i].fetch_size.fn)) { | ||
766 | call_fetch(&tp->args[i].fetch_size, regs, &len); | ||
767 | ret += len; | ||
768 | } | ||
769 | |||
770 | return ret; | ||
771 | } | ||
772 | |||
773 | /* Store the value of each argument */ | ||
774 | static __kprobes void store_trace_args(int ent_size, struct trace_probe *tp, | ||
775 | struct pt_regs *regs, | ||
776 | u8 *data, int maxlen) | ||
777 | { | ||
778 | int i; | ||
779 | u32 end = tp->size; | ||
780 | u32 *dl; /* Data (relative) location */ | ||
781 | |||
782 | for (i = 0; i < tp->nr_args; i++) { | ||
783 | if (unlikely(tp->args[i].fetch_size.fn)) { | ||
784 | /* | ||
785 | * First, we set the relative location and | ||
786 | * maximum data length to *dl | ||
787 | */ | ||
788 | dl = (u32 *)(data + tp->args[i].offset); | ||
789 | *dl = make_data_rloc(maxlen, end - tp->args[i].offset); | ||
790 | /* Then try to fetch string or dynamic array data */ | ||
791 | call_fetch(&tp->args[i].fetch, regs, dl); | ||
792 | /* Reduce maximum length */ | ||
793 | end += get_rloc_len(*dl); | ||
794 | maxlen -= get_rloc_len(*dl); | ||
795 | /* Trick here, convert data_rloc to data_loc */ | ||
796 | *dl = convert_rloc_to_loc(*dl, | ||
797 | ent_size + tp->args[i].offset); | ||
798 | } else | ||
799 | /* Just fetching data normally */ | ||
800 | call_fetch(&tp->args[i].fetch, regs, | ||
801 | data + tp->args[i].offset); | ||
802 | } | ||
803 | } | ||
804 | |||
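
__get_data_size() and store_trace_args() are removed here rather than renamed; they appear to move into the shared trace_probe header so the uprobe side can use the same logic. The mechanism they implement is the dynamic-array ("data location") encoding: variable-length data such as strings is appended after the fixed-size fields, and the fixed slot holds a packed length+offset word. A hedged sketch of that packing, assuming the usual 16-bit length / 16-bit offset split (the authoritative make_data_rloc()/get_rloc_len() helpers live in trace_probe.h, not in this hunk):

#include <stdint.h>
#include <stdio.h>

/* Pack a length and an offset (relative to the start of the entry)
 * into one 32-bit "data location" word. */
static uint32_t make_data_loc(uint16_t len, uint16_t offs)
{
	return ((uint32_t)len << 16) | offs;
}

static uint16_t data_loc_len(uint32_t dl)  { return dl >> 16; }
static uint16_t data_loc_offs(uint32_t dl) { return dl & 0xffff; }

int main(void)
{
	/* e.g. a 5-byte string stored 24 bytes into the event record */
	uint32_t dl = make_data_loc(5, 24);

	printf("len=%u offs=%u\n",
	       (unsigned)data_loc_len(dl), (unsigned)data_loc_offs(dl));
	return 0;
}
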
805 | /* Kprobe handler */ | 922 | /* Kprobe handler */ |
806 | static __kprobes void | 923 | static __kprobes void |
807 | __kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs, | 924 | __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs, |
808 | struct ftrace_event_file *ftrace_file) | 925 | struct ftrace_event_file *ftrace_file) |
809 | { | 926 | { |
810 | struct kprobe_trace_entry_head *entry; | 927 | struct kprobe_trace_entry_head *entry; |
@@ -812,18 +929,18 @@ __kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs, | |||
812 | struct ring_buffer *buffer; | 929 | struct ring_buffer *buffer; |
813 | int size, dsize, pc; | 930 | int size, dsize, pc; |
814 | unsigned long irq_flags; | 931 | unsigned long irq_flags; |
815 | struct ftrace_event_call *call = &tp->call; | 932 | struct ftrace_event_call *call = &tk->tp.call; |
816 | 933 | ||
817 | WARN_ON(call != ftrace_file->event_call); | 934 | WARN_ON(call != ftrace_file->event_call); |
818 | 935 | ||
819 | if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &ftrace_file->flags)) | 936 | if (ftrace_trigger_soft_disabled(ftrace_file)) |
820 | return; | 937 | return; |
821 | 938 | ||
822 | local_save_flags(irq_flags); | 939 | local_save_flags(irq_flags); |
823 | pc = preempt_count(); | 940 | pc = preempt_count(); |
824 | 941 | ||
825 | dsize = __get_data_size(tp, regs); | 942 | dsize = __get_data_size(&tk->tp, regs); |
826 | size = sizeof(*entry) + tp->size + dsize; | 943 | size = sizeof(*entry) + tk->tp.size + dsize; |
827 | 944 | ||
828 | event = trace_event_buffer_lock_reserve(&buffer, ftrace_file, | 945 | event = trace_event_buffer_lock_reserve(&buffer, ftrace_file, |
829 | call->event.type, | 946 | call->event.type, |
@@ -832,26 +949,25 @@ __kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs, | |||
832 | return; | 949 | return; |
833 | 950 | ||
834 | entry = ring_buffer_event_data(event); | 951 | entry = ring_buffer_event_data(event); |
835 | entry->ip = (unsigned long)tp->rp.kp.addr; | 952 | entry->ip = (unsigned long)tk->rp.kp.addr; |
836 | store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); | 953 | store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); |
837 | 954 | ||
838 | if (!filter_check_discard(ftrace_file, entry, buffer, event)) | 955 | event_trigger_unlock_commit_regs(ftrace_file, buffer, event, |
839 | trace_buffer_unlock_commit_regs(buffer, event, | 956 | entry, irq_flags, pc, regs); |
840 | irq_flags, pc, regs); | ||
841 | } | 957 | } |
842 | 958 | ||
843 | static __kprobes void | 959 | static __kprobes void |
844 | kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs) | 960 | kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs) |
845 | { | 961 | { |
846 | struct event_file_link *link; | 962 | struct event_file_link *link; |
847 | 963 | ||
848 | list_for_each_entry_rcu(link, &tp->files, list) | 964 | list_for_each_entry_rcu(link, &tk->tp.files, list) |
849 | __kprobe_trace_func(tp, regs, link->file); | 965 | __kprobe_trace_func(tk, regs, link->file); |
850 | } | 966 | } |
851 | 967 | ||
852 | /* Kretprobe handler */ | 968 | /* Kretprobe handler */ |
853 | static __kprobes void | 969 | static __kprobes void |
854 | __kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri, | 970 | __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, |
855 | struct pt_regs *regs, | 971 | struct pt_regs *regs, |
856 | struct ftrace_event_file *ftrace_file) | 972 | struct ftrace_event_file *ftrace_file) |
857 | { | 973 | { |
@@ -860,18 +976,18 @@ __kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri, | |||
860 | struct ring_buffer *buffer; | 976 | struct ring_buffer *buffer; |
861 | int size, pc, dsize; | 977 | int size, pc, dsize; |
862 | unsigned long irq_flags; | 978 | unsigned long irq_flags; |
863 | struct ftrace_event_call *call = &tp->call; | 979 | struct ftrace_event_call *call = &tk->tp.call; |
864 | 980 | ||
865 | WARN_ON(call != ftrace_file->event_call); | 981 | WARN_ON(call != ftrace_file->event_call); |
866 | 982 | ||
867 | if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &ftrace_file->flags)) | 983 | if (ftrace_trigger_soft_disabled(ftrace_file)) |
868 | return; | 984 | return; |
869 | 985 | ||
870 | local_save_flags(irq_flags); | 986 | local_save_flags(irq_flags); |
871 | pc = preempt_count(); | 987 | pc = preempt_count(); |
872 | 988 | ||
873 | dsize = __get_data_size(tp, regs); | 989 | dsize = __get_data_size(&tk->tp, regs); |
874 | size = sizeof(*entry) + tp->size + dsize; | 990 | size = sizeof(*entry) + tk->tp.size + dsize; |
875 | 991 | ||
876 | event = trace_event_buffer_lock_reserve(&buffer, ftrace_file, | 992 | event = trace_event_buffer_lock_reserve(&buffer, ftrace_file, |
877 | call->event.type, | 993 | call->event.type, |
@@ -880,23 +996,22 @@ __kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri, | |||
880 | return; | 996 | return; |
881 | 997 | ||
882 | entry = ring_buffer_event_data(event); | 998 | entry = ring_buffer_event_data(event); |
883 | entry->func = (unsigned long)tp->rp.kp.addr; | 999 | entry->func = (unsigned long)tk->rp.kp.addr; |
884 | entry->ret_ip = (unsigned long)ri->ret_addr; | 1000 | entry->ret_ip = (unsigned long)ri->ret_addr; |
885 | store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); | 1001 | store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); |
886 | 1002 | ||
887 | if (!filter_check_discard(ftrace_file, entry, buffer, event)) | 1003 | event_trigger_unlock_commit_regs(ftrace_file, buffer, event, |
888 | trace_buffer_unlock_commit_regs(buffer, event, | 1004 | entry, irq_flags, pc, regs); |
889 | irq_flags, pc, regs); | ||
890 | } | 1005 | } |
891 | 1006 | ||
892 | static __kprobes void | 1007 | static __kprobes void |
893 | kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri, | 1008 | kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, |
894 | struct pt_regs *regs) | 1009 | struct pt_regs *regs) |
895 | { | 1010 | { |
896 | struct event_file_link *link; | 1011 | struct event_file_link *link; |
897 | 1012 | ||
898 | list_for_each_entry_rcu(link, &tp->files, list) | 1013 | list_for_each_entry_rcu(link, &tk->tp.files, list) |
899 | __kretprobe_trace_func(tp, ri, regs, link->file); | 1014 | __kretprobe_trace_func(tk, ri, regs, link->file); |
900 | } | 1015 | } |
901 | 1016 | ||
902 | /* Event entry printers */ | 1017 | /* Event entry printers */ |
@@ -983,16 +1098,18 @@ static int kprobe_event_define_fields(struct ftrace_event_call *event_call) | |||
983 | { | 1098 | { |
984 | int ret, i; | 1099 | int ret, i; |
985 | struct kprobe_trace_entry_head field; | 1100 | struct kprobe_trace_entry_head field; |
986 | struct trace_probe *tp = (struct trace_probe *)event_call->data; | 1101 | struct trace_kprobe *tk = (struct trace_kprobe *)event_call->data; |
987 | 1102 | ||
988 | DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); | 1103 | DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); |
989 | /* Set argument names as fields */ | 1104 | /* Set argument names as fields */ |
990 | for (i = 0; i < tp->nr_args; i++) { | 1105 | for (i = 0; i < tk->tp.nr_args; i++) { |
991 | ret = trace_define_field(event_call, tp->args[i].type->fmttype, | 1106 | struct probe_arg *parg = &tk->tp.args[i]; |
992 | tp->args[i].name, | 1107 | |
993 | sizeof(field) + tp->args[i].offset, | 1108 | ret = trace_define_field(event_call, parg->type->fmttype, |
994 | tp->args[i].type->size, | 1109 | parg->name, |
995 | tp->args[i].type->is_signed, | 1110 | sizeof(field) + parg->offset, |
1111 | parg->type->size, | ||
1112 | parg->type->is_signed, | ||
996 | FILTER_OTHER); | 1113 | FILTER_OTHER); |
997 | if (ret) | 1114 | if (ret) |
998 | return ret; | 1115 | return ret; |
@@ -1004,17 +1121,19 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) | |||
1004 | { | 1121 | { |
1005 | int ret, i; | 1122 | int ret, i; |
1006 | struct kretprobe_trace_entry_head field; | 1123 | struct kretprobe_trace_entry_head field; |
1007 | struct trace_probe *tp = (struct trace_probe *)event_call->data; | 1124 | struct trace_kprobe *tk = (struct trace_kprobe *)event_call->data; |
1008 | 1125 | ||
1009 | DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0); | 1126 | DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0); |
1010 | DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); | 1127 | DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); |
1011 | /* Set argument names as fields */ | 1128 | /* Set argument names as fields */ |
1012 | for (i = 0; i < tp->nr_args; i++) { | 1129 | for (i = 0; i < tk->tp.nr_args; i++) { |
1013 | ret = trace_define_field(event_call, tp->args[i].type->fmttype, | 1130 | struct probe_arg *parg = &tk->tp.args[i]; |
1014 | tp->args[i].name, | 1131 | |
1015 | sizeof(field) + tp->args[i].offset, | 1132 | ret = trace_define_field(event_call, parg->type->fmttype, |
1016 | tp->args[i].type->size, | 1133 | parg->name, |
1017 | tp->args[i].type->is_signed, | 1134 | sizeof(field) + parg->offset, |
1135 | parg->type->size, | ||
1136 | parg->type->is_signed, | ||
1018 | FILTER_OTHER); | 1137 | FILTER_OTHER); |
1019 | if (ret) | 1138 | if (ret) |
1020 | return ret; | 1139 | return ret; |
@@ -1022,74 +1141,13 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) | |||
1022 | return 0; | 1141 | return 0; |
1023 | } | 1142 | } |
1024 | 1143 | ||
1025 | static int __set_print_fmt(struct trace_probe *tp, char *buf, int len) | ||
1026 | { | ||
1027 | int i; | ||
1028 | int pos = 0; | ||
1029 | |||
1030 | const char *fmt, *arg; | ||
1031 | |||
1032 | if (!trace_probe_is_return(tp)) { | ||
1033 | fmt = "(%lx)"; | ||
1034 | arg = "REC->" FIELD_STRING_IP; | ||
1035 | } else { | ||
1036 | fmt = "(%lx <- %lx)"; | ||
1037 | arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP; | ||
1038 | } | ||
1039 | |||
1040 | /* When len=0, we just calculate the needed length */ | ||
1041 | #define LEN_OR_ZERO (len ? len - pos : 0) | ||
1042 | |||
1043 | pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt); | ||
1044 | |||
1045 | for (i = 0; i < tp->nr_args; i++) { | ||
1046 | pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s", | ||
1047 | tp->args[i].name, tp->args[i].type->fmt); | ||
1048 | } | ||
1049 | |||
1050 | pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); | ||
1051 | |||
1052 | for (i = 0; i < tp->nr_args; i++) { | ||
1053 | if (strcmp(tp->args[i].type->name, "string") == 0) | ||
1054 | pos += snprintf(buf + pos, LEN_OR_ZERO, | ||
1055 | ", __get_str(%s)", | ||
1056 | tp->args[i].name); | ||
1057 | else | ||
1058 | pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s", | ||
1059 | tp->args[i].name); | ||
1060 | } | ||
1061 | |||
1062 | #undef LEN_OR_ZERO | ||
1063 | |||
1064 | /* return the length of print_fmt */ | ||
1065 | return pos; | ||
1066 | } | ||
1067 | |||
1068 | static int set_print_fmt(struct trace_probe *tp) | ||
1069 | { | ||
1070 | int len; | ||
1071 | char *print_fmt; | ||
1072 | |||
1073 | /* First: called with 0 length to calculate the needed length */ | ||
1074 | len = __set_print_fmt(tp, NULL, 0); | ||
1075 | print_fmt = kmalloc(len + 1, GFP_KERNEL); | ||
1076 | if (!print_fmt) | ||
1077 | return -ENOMEM; | ||
1078 | |||
1079 | /* Second: actually write the @print_fmt */ | ||
1080 | __set_print_fmt(tp, print_fmt, len + 1); | ||
1081 | tp->call.print_fmt = print_fmt; | ||
1082 | |||
1083 | return 0; | ||
1084 | } | ||
1085 | |||
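
The open-coded __set_print_fmt()/set_print_fmt() pair is deleted because a shared set_print_fmt(&tk->tp, is_return) now lives in the common probe code (see the new call in register_kprobe_event() below). The technique itself is the standard two-pass snprintf formatter: call once with a zero length to measure, allocate exactly that much, then call again to write. A standalone sketch of the pattern:

#include <stdio.h>
#include <stdlib.h>

/* Pass 1 (buf == NULL, size == 0) only measures; pass 2 really writes. */
static int build_fmt(char *buf, size_t size, const char *name, const char *fmt)
{
	int pos = 0;

#define APPEND(...) \
	pos += snprintf(size ? buf + pos : NULL, size ? size - pos : 0, __VA_ARGS__)

	APPEND("\"(%%lx)");
	APPEND(" %s=%s", name, fmt);
	APPEND("\", REC->ip");
#undef APPEND
	return pos;
}

int main(void)
{
	int len = build_fmt(NULL, 0, "dfd", "0x%x");
	char *print_fmt = malloc(len + 1);

	if (!print_fmt)
		return 1;
	build_fmt(print_fmt, (size_t)len + 1, "dfd", "0x%x");
	printf("%s\n", print_fmt);	/* "(%lx) dfd=0x%x", REC->ip */
	free(print_fmt);
	return 0;
}
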
1086 | #ifdef CONFIG_PERF_EVENTS | 1144 | #ifdef CONFIG_PERF_EVENTS |
1087 | 1145 | ||
1088 | /* Kprobe profile handler */ | 1146 | /* Kprobe profile handler */ |
1089 | static __kprobes void | 1147 | static __kprobes void |
1090 | kprobe_perf_func(struct trace_probe *tp, struct pt_regs *regs) | 1148 | kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) |
1091 | { | 1149 | { |
1092 | struct ftrace_event_call *call = &tp->call; | 1150 | struct ftrace_event_call *call = &tk->tp.call; |
1093 | struct kprobe_trace_entry_head *entry; | 1151 | struct kprobe_trace_entry_head *entry; |
1094 | struct hlist_head *head; | 1152 | struct hlist_head *head; |
1095 | int size, __size, dsize; | 1153 | int size, __size, dsize; |
@@ -1099,8 +1157,8 @@ kprobe_perf_func(struct trace_probe *tp, struct pt_regs *regs) | |||
1099 | if (hlist_empty(head)) | 1157 | if (hlist_empty(head)) |
1100 | return; | 1158 | return; |
1101 | 1159 | ||
1102 | dsize = __get_data_size(tp, regs); | 1160 | dsize = __get_data_size(&tk->tp, regs); |
1103 | __size = sizeof(*entry) + tp->size + dsize; | 1161 | __size = sizeof(*entry) + tk->tp.size + dsize; |
1104 | size = ALIGN(__size + sizeof(u32), sizeof(u64)); | 1162 | size = ALIGN(__size + sizeof(u32), sizeof(u64)); |
1105 | size -= sizeof(u32); | 1163 | size -= sizeof(u32); |
1106 | 1164 | ||
@@ -1108,18 +1166,18 @@ kprobe_perf_func(struct trace_probe *tp, struct pt_regs *regs) | |||
1108 | if (!entry) | 1166 | if (!entry) |
1109 | return; | 1167 | return; |
1110 | 1168 | ||
1111 | entry->ip = (unsigned long)tp->rp.kp.addr; | 1169 | entry->ip = (unsigned long)tk->rp.kp.addr; |
1112 | memset(&entry[1], 0, dsize); | 1170 | memset(&entry[1], 0, dsize); |
1113 | store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); | 1171 | store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); |
1114 | perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); | 1172 | perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); |
1115 | } | 1173 | } |
1116 | 1174 | ||
1117 | /* Kretprobe profile handler */ | 1175 | /* Kretprobe profile handler */ |
1118 | static __kprobes void | 1176 | static __kprobes void |
1119 | kretprobe_perf_func(struct trace_probe *tp, struct kretprobe_instance *ri, | 1177 | kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, |
1120 | struct pt_regs *regs) | 1178 | struct pt_regs *regs) |
1121 | { | 1179 | { |
1122 | struct ftrace_event_call *call = &tp->call; | 1180 | struct ftrace_event_call *call = &tk->tp.call; |
1123 | struct kretprobe_trace_entry_head *entry; | 1181 | struct kretprobe_trace_entry_head *entry; |
1124 | struct hlist_head *head; | 1182 | struct hlist_head *head; |
1125 | int size, __size, dsize; | 1183 | int size, __size, dsize; |
@@ -1129,8 +1187,8 @@ kretprobe_perf_func(struct trace_probe *tp, struct kretprobe_instance *ri, | |||
1129 | if (hlist_empty(head)) | 1187 | if (hlist_empty(head)) |
1130 | return; | 1188 | return; |
1131 | 1189 | ||
1132 | dsize = __get_data_size(tp, regs); | 1190 | dsize = __get_data_size(&tk->tp, regs); |
1133 | __size = sizeof(*entry) + tp->size + dsize; | 1191 | __size = sizeof(*entry) + tk->tp.size + dsize; |
1134 | size = ALIGN(__size + sizeof(u32), sizeof(u64)); | 1192 | size = ALIGN(__size + sizeof(u32), sizeof(u64)); |
1135 | size -= sizeof(u32); | 1193 | size -= sizeof(u32); |
1136 | 1194 | ||
@@ -1138,9 +1196,9 @@ kretprobe_perf_func(struct trace_probe *tp, struct kretprobe_instance *ri, | |||
1138 | if (!entry) | 1196 | if (!entry) |
1139 | return; | 1197 | return; |
1140 | 1198 | ||
1141 | entry->func = (unsigned long)tp->rp.kp.addr; | 1199 | entry->func = (unsigned long)tk->rp.kp.addr; |
1142 | entry->ret_ip = (unsigned long)ri->ret_addr; | 1200 | entry->ret_ip = (unsigned long)ri->ret_addr; |
1143 | store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); | 1201 | store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); |
1144 | perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); | 1202 | perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); |
1145 | } | 1203 | } |
1146 | #endif /* CONFIG_PERF_EVENTS */ | 1204 | #endif /* CONFIG_PERF_EVENTS */ |
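
Both perf handlers keep the same sizing dance: the raw record is sizeof(*entry) plus the fixed argument block plus the dynamic data, and it is then rounded so that the u32 size word perf places in front of the raw sample leaves the record 8-byte aligned (that is my reading of the ALIGN(__size + sizeof(u32), sizeof(u64)) - sizeof(u32) idiom; this patch does not change it). A tiny numeric illustration:

#include <stdio.h>

#define ALIGN(x, a)	(((x) + (a) - 1) & ~((unsigned long)(a) - 1))

int main(void)
{
	unsigned long entry = 24, args = 13;	/* example sizes in bytes */
	unsigned long __size = entry + args;

	/* Round so that (u32 header + record) ends on an 8-byte boundary,
	 * then subtract the header again. */
	unsigned long size = ALIGN(__size + sizeof(unsigned int), 8)
			     - sizeof(unsigned int);

	printf("__size=%lu size=%lu size+4=%lu\n", __size, size, size + 4);
	return 0;
}
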
@@ -1155,20 +1213,20 @@ static __kprobes | |||
1155 | int kprobe_register(struct ftrace_event_call *event, | 1213 | int kprobe_register(struct ftrace_event_call *event, |
1156 | enum trace_reg type, void *data) | 1214 | enum trace_reg type, void *data) |
1157 | { | 1215 | { |
1158 | struct trace_probe *tp = (struct trace_probe *)event->data; | 1216 | struct trace_kprobe *tk = (struct trace_kprobe *)event->data; |
1159 | struct ftrace_event_file *file = data; | 1217 | struct ftrace_event_file *file = data; |
1160 | 1218 | ||
1161 | switch (type) { | 1219 | switch (type) { |
1162 | case TRACE_REG_REGISTER: | 1220 | case TRACE_REG_REGISTER: |
1163 | return enable_trace_probe(tp, file); | 1221 | return enable_trace_kprobe(tk, file); |
1164 | case TRACE_REG_UNREGISTER: | 1222 | case TRACE_REG_UNREGISTER: |
1165 | return disable_trace_probe(tp, file); | 1223 | return disable_trace_kprobe(tk, file); |
1166 | 1224 | ||
1167 | #ifdef CONFIG_PERF_EVENTS | 1225 | #ifdef CONFIG_PERF_EVENTS |
1168 | case TRACE_REG_PERF_REGISTER: | 1226 | case TRACE_REG_PERF_REGISTER: |
1169 | return enable_trace_probe(tp, NULL); | 1227 | return enable_trace_kprobe(tk, NULL); |
1170 | case TRACE_REG_PERF_UNREGISTER: | 1228 | case TRACE_REG_PERF_UNREGISTER: |
1171 | return disable_trace_probe(tp, NULL); | 1229 | return disable_trace_kprobe(tk, NULL); |
1172 | case TRACE_REG_PERF_OPEN: | 1230 | case TRACE_REG_PERF_OPEN: |
1173 | case TRACE_REG_PERF_CLOSE: | 1231 | case TRACE_REG_PERF_CLOSE: |
1174 | case TRACE_REG_PERF_ADD: | 1232 | case TRACE_REG_PERF_ADD: |
@@ -1182,15 +1240,15 @@ int kprobe_register(struct ftrace_event_call *event, | |||
1182 | static __kprobes | 1240 | static __kprobes |
1183 | int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) | 1241 | int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) |
1184 | { | 1242 | { |
1185 | struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); | 1243 | struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp); |
1186 | 1244 | ||
1187 | tp->nhit++; | 1245 | tk->nhit++; |
1188 | 1246 | ||
1189 | if (tp->flags & TP_FLAG_TRACE) | 1247 | if (tk->tp.flags & TP_FLAG_TRACE) |
1190 | kprobe_trace_func(tp, regs); | 1248 | kprobe_trace_func(tk, regs); |
1191 | #ifdef CONFIG_PERF_EVENTS | 1249 | #ifdef CONFIG_PERF_EVENTS |
1192 | if (tp->flags & TP_FLAG_PROFILE) | 1250 | if (tk->tp.flags & TP_FLAG_PROFILE) |
1193 | kprobe_perf_func(tp, regs); | 1251 | kprobe_perf_func(tk, regs); |
1194 | #endif | 1252 | #endif |
1195 | return 0; /* We don't tweak the kernel, so just return 0 */ | 1253 | return 0; /* We don't tweak the kernel, so just return 0 */ |

1196 | } | 1254 | } |
@@ -1198,15 +1256,15 @@ int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) | |||
1198 | static __kprobes | 1256 | static __kprobes |
1199 | int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) | 1257 | int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) |
1200 | { | 1258 | { |
1201 | struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); | 1259 | struct trace_kprobe *tk = container_of(ri->rp, struct trace_kprobe, rp); |
1202 | 1260 | ||
1203 | tp->nhit++; | 1261 | tk->nhit++; |
1204 | 1262 | ||
1205 | if (tp->flags & TP_FLAG_TRACE) | 1263 | if (tk->tp.flags & TP_FLAG_TRACE) |
1206 | kretprobe_trace_func(tp, ri, regs); | 1264 | kretprobe_trace_func(tk, ri, regs); |
1207 | #ifdef CONFIG_PERF_EVENTS | 1265 | #ifdef CONFIG_PERF_EVENTS |
1208 | if (tp->flags & TP_FLAG_PROFILE) | 1266 | if (tk->tp.flags & TP_FLAG_PROFILE) |
1209 | kretprobe_perf_func(tp, ri, regs); | 1267 | kretprobe_perf_func(tk, ri, regs); |
1210 | #endif | 1268 | #endif |
1211 | return 0; /* We don't tweak the kernel, so just return 0 */ | 1269 | return 0; /* We don't tweak the kernel, so just return 0 */ |
1212 | } | 1270 | } |
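
The two dispatchers recover the containing object from the embedded kprobe/kretprobe with container_of(), which is why only the type and member names change here (the real code uses container_of(kp, struct trace_kprobe, rp.kp), i.e. a member nested one level deeper than in this simplified sketch):

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct kprobe { void *addr; };		/* stand-in for the kernel type */

struct trace_kprobe_like {
	unsigned long nhit;
	struct kprobe kp;		/* embedded member the callback gets */
};

static int dispatcher(struct kprobe *kp)
{
	/* Walk back from the embedded member to its wrapper object. */
	struct trace_kprobe_like *tk =
		container_of(kp, struct trace_kprobe_like, kp);

	tk->nhit++;
	return 0;
}

int main(void)
{
	struct trace_kprobe_like tk = { 0 };

	dispatcher(&tk.kp);
	printf("nhit=%lu\n", tk.nhit);
	return 0;
}
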
@@ -1219,21 +1277,21 @@ static struct trace_event_functions kprobe_funcs = { | |||
1219 | .trace = print_kprobe_event | 1277 | .trace = print_kprobe_event |
1220 | }; | 1278 | }; |
1221 | 1279 | ||
1222 | static int register_probe_event(struct trace_probe *tp) | 1280 | static int register_kprobe_event(struct trace_kprobe *tk) |
1223 | { | 1281 | { |
1224 | struct ftrace_event_call *call = &tp->call; | 1282 | struct ftrace_event_call *call = &tk->tp.call; |
1225 | int ret; | 1283 | int ret; |
1226 | 1284 | ||
1227 | /* Initialize ftrace_event_call */ | 1285 | /* Initialize ftrace_event_call */ |
1228 | INIT_LIST_HEAD(&call->class->fields); | 1286 | INIT_LIST_HEAD(&call->class->fields); |
1229 | if (trace_probe_is_return(tp)) { | 1287 | if (trace_kprobe_is_return(tk)) { |
1230 | call->event.funcs = &kretprobe_funcs; | 1288 | call->event.funcs = &kretprobe_funcs; |
1231 | call->class->define_fields = kretprobe_event_define_fields; | 1289 | call->class->define_fields = kretprobe_event_define_fields; |
1232 | } else { | 1290 | } else { |
1233 | call->event.funcs = &kprobe_funcs; | 1291 | call->event.funcs = &kprobe_funcs; |
1234 | call->class->define_fields = kprobe_event_define_fields; | 1292 | call->class->define_fields = kprobe_event_define_fields; |
1235 | } | 1293 | } |
1236 | if (set_print_fmt(tp) < 0) | 1294 | if (set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0) |
1237 | return -ENOMEM; | 1295 | return -ENOMEM; |
1238 | ret = register_ftrace_event(&call->event); | 1296 | ret = register_ftrace_event(&call->event); |
1239 | if (!ret) { | 1297 | if (!ret) { |
@@ -1242,7 +1300,7 @@ static int register_probe_event(struct trace_probe *tp) | |||
1242 | } | 1300 | } |
1243 | call->flags = 0; | 1301 | call->flags = 0; |
1244 | call->class->reg = kprobe_register; | 1302 | call->class->reg = kprobe_register; |
1245 | call->data = tp; | 1303 | call->data = tk; |
1246 | ret = trace_add_event_call(call); | 1304 | ret = trace_add_event_call(call); |
1247 | if (ret) { | 1305 | if (ret) { |
1248 | pr_info("Failed to register kprobe event: %s\n", call->name); | 1306 | pr_info("Failed to register kprobe event: %s\n", call->name); |
@@ -1252,14 +1310,14 @@ static int register_probe_event(struct trace_probe *tp) | |||
1252 | return ret; | 1310 | return ret; |
1253 | } | 1311 | } |
1254 | 1312 | ||
1255 | static int unregister_probe_event(struct trace_probe *tp) | 1313 | static int unregister_kprobe_event(struct trace_kprobe *tk) |
1256 | { | 1314 | { |
1257 | int ret; | 1315 | int ret; |
1258 | 1316 | ||
1259 | /* tp->event is unregistered in trace_remove_event_call() */ | 1317 | /* tp->event is unregistered in trace_remove_event_call() */ |
1260 | ret = trace_remove_event_call(&tp->call); | 1318 | ret = trace_remove_event_call(&tk->tp.call); |
1261 | if (!ret) | 1319 | if (!ret) |
1262 | kfree(tp->call.print_fmt); | 1320 | kfree(tk->tp.call.print_fmt); |
1263 | return ret; | 1321 | return ret; |
1264 | } | 1322 | } |
1265 | 1323 | ||
@@ -1269,7 +1327,7 @@ static __init int init_kprobe_trace(void) | |||
1269 | struct dentry *d_tracer; | 1327 | struct dentry *d_tracer; |
1270 | struct dentry *entry; | 1328 | struct dentry *entry; |
1271 | 1329 | ||
1272 | if (register_module_notifier(&trace_probe_module_nb)) | 1330 | if (register_module_notifier(&trace_kprobe_module_nb)) |
1273 | return -EINVAL; | 1331 | return -EINVAL; |
1274 | 1332 | ||
1275 | d_tracer = tracing_init_dentry(); | 1333 | d_tracer = tracing_init_dentry(); |
@@ -1309,26 +1367,26 @@ static __used int kprobe_trace_selftest_target(int a1, int a2, int a3, | |||
1309 | } | 1367 | } |
1310 | 1368 | ||
1311 | static struct ftrace_event_file * | 1369 | static struct ftrace_event_file * |
1312 | find_trace_probe_file(struct trace_probe *tp, struct trace_array *tr) | 1370 | find_trace_probe_file(struct trace_kprobe *tk, struct trace_array *tr) |
1313 | { | 1371 | { |
1314 | struct ftrace_event_file *file; | 1372 | struct ftrace_event_file *file; |
1315 | 1373 | ||
1316 | list_for_each_entry(file, &tr->events, list) | 1374 | list_for_each_entry(file, &tr->events, list) |
1317 | if (file->event_call == &tp->call) | 1375 | if (file->event_call == &tk->tp.call) |
1318 | return file; | 1376 | return file; |
1319 | 1377 | ||
1320 | return NULL; | 1378 | return NULL; |
1321 | } | 1379 | } |
1322 | 1380 | ||
1323 | /* | 1381 | /* |
1324 | * Nobody but us can call enable_trace_probe/disable_trace_probe at this | 1382 | * Nobody but us can call enable_trace_kprobe/disable_trace_kprobe at this |
1325 | * stage, we can do this lockless. | 1383 | * stage, we can do this lockless. |
1326 | */ | 1384 | */ |
1327 | static __init int kprobe_trace_self_tests_init(void) | 1385 | static __init int kprobe_trace_self_tests_init(void) |
1328 | { | 1386 | { |
1329 | int ret, warn = 0; | 1387 | int ret, warn = 0; |
1330 | int (*target)(int, int, int, int, int, int); | 1388 | int (*target)(int, int, int, int, int, int); |
1331 | struct trace_probe *tp; | 1389 | struct trace_kprobe *tk; |
1332 | struct ftrace_event_file *file; | 1390 | struct ftrace_event_file *file; |
1333 | 1391 | ||
1334 | target = kprobe_trace_selftest_target; | 1392 | target = kprobe_trace_selftest_target; |
@@ -1337,44 +1395,44 @@ static __init int kprobe_trace_self_tests_init(void) | |||
1337 | 1395 | ||
1338 | ret = traceprobe_command("p:testprobe kprobe_trace_selftest_target " | 1396 | ret = traceprobe_command("p:testprobe kprobe_trace_selftest_target " |
1339 | "$stack $stack0 +0($stack)", | 1397 | "$stack $stack0 +0($stack)", |
1340 | create_trace_probe); | 1398 | create_trace_kprobe); |
1341 | if (WARN_ON_ONCE(ret)) { | 1399 | if (WARN_ON_ONCE(ret)) { |
1342 | pr_warn("error on probing function entry.\n"); | 1400 | pr_warn("error on probing function entry.\n"); |
1343 | warn++; | 1401 | warn++; |
1344 | } else { | 1402 | } else { |
1345 | /* Enable trace point */ | 1403 | /* Enable trace point */ |
1346 | tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM); | 1404 | tk = find_trace_kprobe("testprobe", KPROBE_EVENT_SYSTEM); |
1347 | if (WARN_ON_ONCE(tp == NULL)) { | 1405 | if (WARN_ON_ONCE(tk == NULL)) { |
1348 | pr_warn("error on getting new probe.\n"); | 1406 | pr_warn("error on getting new probe.\n"); |
1349 | warn++; | 1407 | warn++; |
1350 | } else { | 1408 | } else { |
1351 | file = find_trace_probe_file(tp, top_trace_array()); | 1409 | file = find_trace_probe_file(tk, top_trace_array()); |
1352 | if (WARN_ON_ONCE(file == NULL)) { | 1410 | if (WARN_ON_ONCE(file == NULL)) { |
1353 | pr_warn("error on getting probe file.\n"); | 1411 | pr_warn("error on getting probe file.\n"); |
1354 | warn++; | 1412 | warn++; |
1355 | } else | 1413 | } else |
1356 | enable_trace_probe(tp, file); | 1414 | enable_trace_kprobe(tk, file); |
1357 | } | 1415 | } |
1358 | } | 1416 | } |
1359 | 1417 | ||
1360 | ret = traceprobe_command("r:testprobe2 kprobe_trace_selftest_target " | 1418 | ret = traceprobe_command("r:testprobe2 kprobe_trace_selftest_target " |
1361 | "$retval", create_trace_probe); | 1419 | "$retval", create_trace_kprobe); |
1362 | if (WARN_ON_ONCE(ret)) { | 1420 | if (WARN_ON_ONCE(ret)) { |
1363 | pr_warn("error on probing function return.\n"); | 1421 | pr_warn("error on probing function return.\n"); |
1364 | warn++; | 1422 | warn++; |
1365 | } else { | 1423 | } else { |
1366 | /* Enable trace point */ | 1424 | /* Enable trace point */ |
1367 | tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM); | 1425 | tk = find_trace_kprobe("testprobe2", KPROBE_EVENT_SYSTEM); |
1368 | if (WARN_ON_ONCE(tp == NULL)) { | 1426 | if (WARN_ON_ONCE(tk == NULL)) { |
1369 | pr_warn("error on getting 2nd new probe.\n"); | 1427 | pr_warn("error on getting 2nd new probe.\n"); |
1370 | warn++; | 1428 | warn++; |
1371 | } else { | 1429 | } else { |
1372 | file = find_trace_probe_file(tp, top_trace_array()); | 1430 | file = find_trace_probe_file(tk, top_trace_array()); |
1373 | if (WARN_ON_ONCE(file == NULL)) { | 1431 | if (WARN_ON_ONCE(file == NULL)) { |
1374 | pr_warn("error on getting probe file.\n"); | 1432 | pr_warn("error on getting probe file.\n"); |
1375 | warn++; | 1433 | warn++; |
1376 | } else | 1434 | } else |
1377 | enable_trace_probe(tp, file); | 1435 | enable_trace_kprobe(tk, file); |
1378 | } | 1436 | } |
1379 | } | 1437 | } |
1380 | 1438 | ||
@@ -1384,46 +1442,46 @@ static __init int kprobe_trace_self_tests_init(void) | |||
1384 | ret = target(1, 2, 3, 4, 5, 6); | 1442 | ret = target(1, 2, 3, 4, 5, 6); |
1385 | 1443 | ||
1386 | /* Disable trace points before removing it */ | 1444 | /* Disable trace points before removing it */ |
1387 | tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM); | 1445 | tk = find_trace_kprobe("testprobe", KPROBE_EVENT_SYSTEM); |
1388 | if (WARN_ON_ONCE(tp == NULL)) { | 1446 | if (WARN_ON_ONCE(tk == NULL)) { |
1389 | pr_warn("error on getting test probe.\n"); | 1447 | pr_warn("error on getting test probe.\n"); |
1390 | warn++; | 1448 | warn++; |
1391 | } else { | 1449 | } else { |
1392 | file = find_trace_probe_file(tp, top_trace_array()); | 1450 | file = find_trace_probe_file(tk, top_trace_array()); |
1393 | if (WARN_ON_ONCE(file == NULL)) { | 1451 | if (WARN_ON_ONCE(file == NULL)) { |
1394 | pr_warn("error on getting probe file.\n"); | 1452 | pr_warn("error on getting probe file.\n"); |
1395 | warn++; | 1453 | warn++; |
1396 | } else | 1454 | } else |
1397 | disable_trace_probe(tp, file); | 1455 | disable_trace_kprobe(tk, file); |
1398 | } | 1456 | } |
1399 | 1457 | ||
1400 | tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM); | 1458 | tk = find_trace_kprobe("testprobe2", KPROBE_EVENT_SYSTEM); |
1401 | if (WARN_ON_ONCE(tp == NULL)) { | 1459 | if (WARN_ON_ONCE(tk == NULL)) { |
1402 | pr_warn("error on getting 2nd test probe.\n"); | 1460 | pr_warn("error on getting 2nd test probe.\n"); |
1403 | warn++; | 1461 | warn++; |
1404 | } else { | 1462 | } else { |
1405 | file = find_trace_probe_file(tp, top_trace_array()); | 1463 | file = find_trace_probe_file(tk, top_trace_array()); |
1406 | if (WARN_ON_ONCE(file == NULL)) { | 1464 | if (WARN_ON_ONCE(file == NULL)) { |
1407 | pr_warn("error on getting probe file.\n"); | 1465 | pr_warn("error on getting probe file.\n"); |
1408 | warn++; | 1466 | warn++; |
1409 | } else | 1467 | } else |
1410 | disable_trace_probe(tp, file); | 1468 | disable_trace_kprobe(tk, file); |
1411 | } | 1469 | } |
1412 | 1470 | ||
1413 | ret = traceprobe_command("-:testprobe", create_trace_probe); | 1471 | ret = traceprobe_command("-:testprobe", create_trace_kprobe); |
1414 | if (WARN_ON_ONCE(ret)) { | 1472 | if (WARN_ON_ONCE(ret)) { |
1415 | pr_warn("error on deleting a probe.\n"); | 1473 | pr_warn("error on deleting a probe.\n"); |
1416 | warn++; | 1474 | warn++; |
1417 | } | 1475 | } |
1418 | 1476 | ||
1419 | ret = traceprobe_command("-:testprobe2", create_trace_probe); | 1477 | ret = traceprobe_command("-:testprobe2", create_trace_kprobe); |
1420 | if (WARN_ON_ONCE(ret)) { | 1478 | if (WARN_ON_ONCE(ret)) { |
1421 | pr_warn("error on deleting a probe.\n"); | 1479 | pr_warn("error on deleting a probe.\n"); |
1422 | warn++; | 1480 | warn++; |
1423 | } | 1481 | } |
1424 | 1482 | ||
1425 | end: | 1483 | end: |
1426 | release_all_trace_probes(); | 1484 | release_all_trace_kprobes(); |
1427 | if (warn) | 1485 | if (warn) |
1428 | pr_cont("NG: Some tests are failed. Please check them.\n"); | 1486 | pr_cont("NG: Some tests are failed. Please check them.\n"); |
1429 | else | 1487 | else |
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 412e959709b4..8364a421b4df 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c | |||
@@ -35,46 +35,27 @@ const char *reserved_field_names[] = { | |||
35 | FIELD_STRING_FUNC, | 35 | FIELD_STRING_FUNC, |
36 | }; | 36 | }; |
37 | 37 | ||
38 | /* Printing function type */ | ||
39 | #define PRINT_TYPE_FUNC_NAME(type) print_type_##type | ||
40 | #define PRINT_TYPE_FMT_NAME(type) print_type_format_##type | ||
41 | |||
42 | /* Printing in basic type function template */ | 38 | /* Printing in basic type function template */ |
43 | #define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \ | 39 | #define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt) \ |
44 | static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \ | 40 | __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \ |
45 | const char *name, \ | 41 | const char *name, \ |
46 | void *data, void *ent)\ | 42 | void *data, void *ent) \ |
47 | { \ | 43 | { \ |
48 | return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\ | 44 | return trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \ |
49 | } \ | 45 | } \ |
50 | static const char PRINT_TYPE_FMT_NAME(type)[] = fmt; | 46 | const char PRINT_TYPE_FMT_NAME(type)[] = fmt; |
51 | |||
52 | DEFINE_BASIC_PRINT_TYPE_FUNC(u8, "%x", unsigned int) | ||
53 | DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "%x", unsigned int) | ||
54 | DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "%lx", unsigned long) | ||
55 | DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "%llx", unsigned long long) | ||
56 | DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d", int) | ||
57 | DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int) | ||
58 | DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long) | ||
59 | DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long) | ||
60 | |||
61 | static inline void *get_rloc_data(u32 *dl) | ||
62 | { | ||
63 | return (u8 *)dl + get_rloc_offs(*dl); | ||
64 | } | ||
65 | 47 | ||
66 | /* For data_loc conversion */ | 48 | DEFINE_BASIC_PRINT_TYPE_FUNC(u8 , "0x%x") |
67 | static inline void *get_loc_data(u32 *dl, void *ent) | 49 | DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "0x%x") |
68 | { | 50 | DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "0x%x") |
69 | return (u8 *)ent + get_rloc_offs(*dl); | 51 | DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "0x%Lx") |
70 | } | 52 | DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d") |
71 | 53 | DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d") | |
72 | /* For defining macros, define string/string_size types */ | 54 | DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%d") |
73 | typedef u32 string; | 55 | DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%Ld") |
74 | typedef u32 string_size; | ||
75 | 56 | ||
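
The print-type helpers lose their static qualifier and their explicit casts, and the hexadecimal formats gain a 0x prefix; the PRINT_TYPE_FUNC_NAME()/PRINT_TYPE_FMT_NAME() naming macros move out of this file, presumably into the shared header so trace_uprobe.c can reference the same symbols. Roughly what one expansion of the new macro produces, with trace_seq_printf() swapped for printf() so the sketch stands alone:

#include <stdio.h>

#define PRINT_TYPE_FUNC_NAME(type)	print_type_##type
#define PRINT_TYPE_FMT_NAME(type)	print_type_format_##type

#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt)				\
int PRINT_TYPE_FUNC_NAME(type)(const char *name, void *data)		\
{									\
	return printf(" %s=" fmt, name, *(type *)data);			\
}									\
const char PRINT_TYPE_FMT_NAME(type)[] = fmt;

typedef unsigned short u16;

DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "0x%x")

int main(void)
{
	u16 port = 0x1f90;

	print_type_u16("port", &port);	/* prints " port=0x1f90" */
	printf("\n");
	return 0;
}
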
76 | /* Print type function for string type */ | 57 | /* Print type function for string type */ |
77 | static __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, | 58 | __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, |
78 | const char *name, | 59 | const char *name, |
79 | void *data, void *ent) | 60 | void *data, void *ent) |
80 | { | 61 | { |
@@ -87,18 +68,7 @@ static __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, | |||
87 | (const char *)get_loc_data(data, ent)); | 68 | (const char *)get_loc_data(data, ent)); |
88 | } | 69 | } |
89 | 70 | ||
90 | static const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\""; | 71 | const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\""; |
91 | |||
92 | #define FETCH_FUNC_NAME(method, type) fetch_##method##_##type | ||
93 | /* | ||
94 | * Define macro for basic types - we don't need to define s* types, because | ||
95 | * we have to care only about bitwidth at recording time. | ||
96 | */ | ||
97 | #define DEFINE_BASIC_FETCH_FUNCS(method) \ | ||
98 | DEFINE_FETCH_##method(u8) \ | ||
99 | DEFINE_FETCH_##method(u16) \ | ||
100 | DEFINE_FETCH_##method(u32) \ | ||
101 | DEFINE_FETCH_##method(u64) | ||
102 | 72 | ||
103 | #define CHECK_FETCH_FUNCS(method, fn) \ | 73 | #define CHECK_FETCH_FUNCS(method, fn) \ |
104 | (((FETCH_FUNC_NAME(method, u8) == fn) || \ | 74 | (((FETCH_FUNC_NAME(method, u8) == fn) || \ |
@@ -111,7 +81,7 @@ DEFINE_FETCH_##method(u64) | |||
111 | 81 | ||
112 | /* Data fetch function templates */ | 82 | /* Data fetch function templates */ |
113 | #define DEFINE_FETCH_reg(type) \ | 83 | #define DEFINE_FETCH_reg(type) \ |
114 | static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \ | 84 | __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \ |
115 | void *offset, void *dest) \ | 85 | void *offset, void *dest) \ |
116 | { \ | 86 | { \ |
117 | *(type *)dest = (type)regs_get_register(regs, \ | 87 | *(type *)dest = (type)regs_get_register(regs, \ |
@@ -122,20 +92,8 @@ DEFINE_BASIC_FETCH_FUNCS(reg) | |||
122 | #define fetch_reg_string NULL | 92 | #define fetch_reg_string NULL |
123 | #define fetch_reg_string_size NULL | 93 | #define fetch_reg_string_size NULL |
124 | 94 | ||
125 | #define DEFINE_FETCH_stack(type) \ | ||
126 | static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ | ||
127 | void *offset, void *dest) \ | ||
128 | { \ | ||
129 | *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \ | ||
130 | (unsigned int)((unsigned long)offset)); \ | ||
131 | } | ||
132 | DEFINE_BASIC_FETCH_FUNCS(stack) | ||
133 | /* No string on the stack entry */ | ||
134 | #define fetch_stack_string NULL | ||
135 | #define fetch_stack_string_size NULL | ||
136 | |||
137 | #define DEFINE_FETCH_retval(type) \ | 95 | #define DEFINE_FETCH_retval(type) \ |
138 | static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\ | 96 | __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs, \ |
139 | void *dummy, void *dest) \ | 97 | void *dummy, void *dest) \ |
140 | { \ | 98 | { \ |
141 | *(type *)dest = (type)regs_return_value(regs); \ | 99 | *(type *)dest = (type)regs_return_value(regs); \ |
@@ -145,150 +103,16 @@ DEFINE_BASIC_FETCH_FUNCS(retval) | |||
145 | #define fetch_retval_string NULL | 103 | #define fetch_retval_string NULL |
146 | #define fetch_retval_string_size NULL | 104 | #define fetch_retval_string_size NULL |
147 | 105 | ||
148 | #define DEFINE_FETCH_memory(type) \ | ||
149 | static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ | ||
150 | void *addr, void *dest) \ | ||
151 | { \ | ||
152 | type retval; \ | ||
153 | if (probe_kernel_address(addr, retval)) \ | ||
154 | *(type *)dest = 0; \ | ||
155 | else \ | ||
156 | *(type *)dest = retval; \ | ||
157 | } | ||
158 | DEFINE_BASIC_FETCH_FUNCS(memory) | ||
159 | /* | ||
160 | * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max | ||
161 | * length and relative data location. | ||
162 | */ | ||
163 | static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, | ||
164 | void *addr, void *dest) | ||
165 | { | ||
166 | long ret; | ||
167 | int maxlen = get_rloc_len(*(u32 *)dest); | ||
168 | u8 *dst = get_rloc_data(dest); | ||
169 | u8 *src = addr; | ||
170 | mm_segment_t old_fs = get_fs(); | ||
171 | |||
172 | if (!maxlen) | ||
173 | return; | ||
174 | |||
175 | /* | ||
176 | * Try to get string again, since the string can be changed while | ||
177 | * probing. | ||
178 | */ | ||
179 | set_fs(KERNEL_DS); | ||
180 | pagefault_disable(); | ||
181 | |||
182 | do | ||
183 | ret = __copy_from_user_inatomic(dst++, src++, 1); | ||
184 | while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen); | ||
185 | |||
186 | dst[-1] = '\0'; | ||
187 | pagefault_enable(); | ||
188 | set_fs(old_fs); | ||
189 | |||
190 | if (ret < 0) { /* Failed to fetch string */ | ||
191 | ((u8 *)get_rloc_data(dest))[0] = '\0'; | ||
192 | *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest)); | ||
193 | } else { | ||
194 | *(u32 *)dest = make_data_rloc(src - (u8 *)addr, | ||
195 | get_rloc_offs(*(u32 *)dest)); | ||
196 | } | ||
197 | } | ||
198 | |||
199 | /* Return the length of string -- including null terminal byte */ | ||
200 | static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, | ||
201 | void *addr, void *dest) | ||
202 | { | ||
203 | mm_segment_t old_fs; | ||
204 | int ret, len = 0; | ||
205 | u8 c; | ||
206 | |||
207 | old_fs = get_fs(); | ||
208 | set_fs(KERNEL_DS); | ||
209 | pagefault_disable(); | ||
210 | |||
211 | do { | ||
212 | ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1); | ||
213 | len++; | ||
214 | } while (c && ret == 0 && len < MAX_STRING_SIZE); | ||
215 | |||
216 | pagefault_enable(); | ||
217 | set_fs(old_fs); | ||
218 | |||
219 | if (ret < 0) /* Failed to check the length */ | ||
220 | *(u32 *)dest = 0; | ||
221 | else | ||
222 | *(u32 *)dest = len; | ||
223 | } | ||
224 | |||
225 | /* Memory fetching by symbol */ | ||
226 | struct symbol_cache { | ||
227 | char *symbol; | ||
228 | long offset; | ||
229 | unsigned long addr; | ||
230 | }; | ||
231 | |||
232 | static unsigned long update_symbol_cache(struct symbol_cache *sc) | ||
233 | { | ||
234 | sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol); | ||
235 | |||
236 | if (sc->addr) | ||
237 | sc->addr += sc->offset; | ||
238 | |||
239 | return sc->addr; | ||
240 | } | ||
241 | |||
242 | static void free_symbol_cache(struct symbol_cache *sc) | ||
243 | { | ||
244 | kfree(sc->symbol); | ||
245 | kfree(sc); | ||
246 | } | ||
247 | |||
248 | static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset) | ||
249 | { | ||
250 | struct symbol_cache *sc; | ||
251 | |||
252 | if (!sym || strlen(sym) == 0) | ||
253 | return NULL; | ||
254 | |||
255 | sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL); | ||
256 | if (!sc) | ||
257 | return NULL; | ||
258 | |||
259 | sc->symbol = kstrdup(sym, GFP_KERNEL); | ||
260 | if (!sc->symbol) { | ||
261 | kfree(sc); | ||
262 | return NULL; | ||
263 | } | ||
264 | sc->offset = offset; | ||
265 | update_symbol_cache(sc); | ||
266 | |||
267 | return sc; | ||
268 | } | ||
269 | |||
270 | #define DEFINE_FETCH_symbol(type) \ | ||
271 | static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\ | ||
272 | void *data, void *dest) \ | ||
273 | { \ | ||
274 | struct symbol_cache *sc = data; \ | ||
275 | if (sc->addr) \ | ||
276 | fetch_memory_##type(regs, (void *)sc->addr, dest); \ | ||
277 | else \ | ||
278 | *(type *)dest = 0; \ | ||
279 | } | ||
280 | DEFINE_BASIC_FETCH_FUNCS(symbol) | ||
281 | DEFINE_FETCH_symbol(string) | ||
282 | DEFINE_FETCH_symbol(string_size) | ||
283 | |||
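
The memory, stack and symbol fetch implementations (including the careful byte-at-a-time string copy under pagefault_disable()) are removed from this shared file; they presumably move into trace_kprobe.c, since reading arbitrary kernel addresses is specific to kprobes while uprobes must read the traced process instead. The copy loop itself is the interesting part: it copies one byte at a time so it can stop at the terminating NUL without knowing the string length up front, and it clamps with a trailing NUL if it hits the limit. A plain userspace sketch of that bounded copy:

#include <stdio.h>

/* Copy src into dst, at most maxlen bytes, always NUL-terminating.
 * Returns the number of bytes consumed including the terminator. */
static int bounded_strcpy(char *dst, const char *src, int maxlen)
{
	int n = 0;

	if (!maxlen)
		return 0;

	do {
		dst[n] = src[n];	/* the kernel version copies via
					 * __copy_from_user_inatomic() */
		n++;
	} while (dst[n - 1] && n < maxlen);

	dst[n - 1] = '\0';		/* clamp if we ran out of room */
	return n;
}

int main(void)
{
	char buf[8];
	int n = bounded_strcpy(buf, "filename", sizeof(buf));

	printf("copied %d bytes: \"%s\"\n", n, buf);
	return 0;
}
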
284 | /* Dereference memory access function */ | 106 | /* Dereference memory access function */ |
285 | struct deref_fetch_param { | 107 | struct deref_fetch_param { |
286 | struct fetch_param orig; | 108 | struct fetch_param orig; |
287 | long offset; | 109 | long offset; |
110 | fetch_func_t fetch; | ||
111 | fetch_func_t fetch_size; | ||
288 | }; | 112 | }; |
289 | 113 | ||
290 | #define DEFINE_FETCH_deref(type) \ | 114 | #define DEFINE_FETCH_deref(type) \ |
291 | static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\ | 115 | __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs, \ |
292 | void *data, void *dest) \ | 116 | void *data, void *dest) \ |
293 | { \ | 117 | { \ |
294 | struct deref_fetch_param *dprm = data; \ | 118 | struct deref_fetch_param *dprm = data; \ |
@@ -296,13 +120,26 @@ static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\ | |||
296 | call_fetch(&dprm->orig, regs, &addr); \ | 120 | call_fetch(&dprm->orig, regs, &addr); \ |
297 | if (addr) { \ | 121 | if (addr) { \ |
298 | addr += dprm->offset; \ | 122 | addr += dprm->offset; \ |
299 | fetch_memory_##type(regs, (void *)addr, dest); \ | 123 | dprm->fetch(regs, (void *)addr, dest); \ |
300 | } else \ | 124 | } else \ |
301 | *(type *)dest = 0; \ | 125 | *(type *)dest = 0; \ |
302 | } | 126 | } |
303 | DEFINE_BASIC_FETCH_FUNCS(deref) | 127 | DEFINE_BASIC_FETCH_FUNCS(deref) |
304 | DEFINE_FETCH_deref(string) | 128 | DEFINE_FETCH_deref(string) |
305 | DEFINE_FETCH_deref(string_size) | 129 | |
130 | __kprobes void FETCH_FUNC_NAME(deref, string_size)(struct pt_regs *regs, | ||
131 | void *data, void *dest) | ||
132 | { | ||
133 | struct deref_fetch_param *dprm = data; | ||
134 | unsigned long addr; | ||
135 | |||
136 | call_fetch(&dprm->orig, regs, &addr); | ||
137 | if (addr && dprm->fetch_size) { | ||
138 | addr += dprm->offset; | ||
139 | dprm->fetch_size(regs, (void *)addr, dest); | ||
140 | } else | ||
141 | *(string_size *)dest = 0; | ||
142 | } | ||
306 | 143 | ||
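
struct deref_fetch_param now records which fetch and fetch_size functions to call instead of hard-wiring fetch_memory_##type, which again looks like preparation for sharing this file with uprobes, where "read target memory" means something different. The control flow is unchanged: fetch the base address via the original fetch_param, add the constant offset, then indirect through the stored function pointer. A condensed userspace sketch of that indirection:

#include <stddef.h>
#include <stdio.h>

typedef void (*fetch_func_t)(void *src, void *dest);

struct deref_fetch_param {
	void *base;		/* stands in for call_fetch(&dprm->orig, ...) */
	long offset;
	fetch_func_t fetch;	/* chosen when the argument was parsed */
};

/* A "memory" fetch for 32-bit values. */
static void fetch_memory_u32(void *src, void *dest)
{
	*(unsigned int *)dest = *(unsigned int *)src;
}

static void fetch_deref_u32(struct deref_fetch_param *dprm, void *dest)
{
	char *addr = dprm->base;		/* 1: resolve base address */

	if (addr) {
		addr += dprm->offset;		/* 2: apply the +N offset */
		dprm->fetch(addr, dest);	/* 3: go through the stored fn */
	} else {
		*(unsigned int *)dest = 0;	/* unreadable: report zero */
	}
}

struct sample { int pad; unsigned int field; };

int main(void)
{
	struct sample obj = { .pad = 0, .field = 42 };
	struct deref_fetch_param dprm = {
		.base	= &obj,
		.offset	= (long)offsetof(struct sample, field),
		.fetch	= fetch_memory_u32,
	};
	unsigned int val;

	fetch_deref_u32(&dprm, &val);
	printf("val=%u\n", val);	/* 42 */
	return 0;
}
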
307 | static __kprobes void update_deref_fetch_param(struct deref_fetch_param *data) | 144 | static __kprobes void update_deref_fetch_param(struct deref_fetch_param *data) |
308 | { | 145 | { |
@@ -329,7 +166,7 @@ struct bitfield_fetch_param { | |||
329 | }; | 166 | }; |
330 | 167 | ||
331 | #define DEFINE_FETCH_bitfield(type) \ | 168 | #define DEFINE_FETCH_bitfield(type) \ |
332 | static __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs,\ | 169 | __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs, \ |
333 | void *data, void *dest) \ | 170 | void *data, void *dest) \ |
334 | { \ | 171 | { \ |
335 | struct bitfield_fetch_param *bprm = data; \ | 172 | struct bitfield_fetch_param *bprm = data; \ |
@@ -374,58 +211,8 @@ free_bitfield_fetch_param(struct bitfield_fetch_param *data) | |||
374 | kfree(data); | 211 | kfree(data); |
375 | } | 212 | } |
376 | 213 | ||
377 | /* Default (unsigned long) fetch type */ | 214 | static const struct fetch_type *find_fetch_type(const char *type, |
378 | #define __DEFAULT_FETCH_TYPE(t) u##t | 215 | const struct fetch_type *ftbl) |
379 | #define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) | ||
380 | #define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG) | ||
381 | #define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE) | ||
382 | |||
383 | #define ASSIGN_FETCH_FUNC(method, type) \ | ||
384 | [FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type) | ||
385 | |||
386 | #define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \ | ||
387 | {.name = _name, \ | ||
388 | .size = _size, \ | ||
389 | .is_signed = sign, \ | ||
390 | .print = PRINT_TYPE_FUNC_NAME(ptype), \ | ||
391 | .fmt = PRINT_TYPE_FMT_NAME(ptype), \ | ||
392 | .fmttype = _fmttype, \ | ||
393 | .fetch = { \ | ||
394 | ASSIGN_FETCH_FUNC(reg, ftype), \ | ||
395 | ASSIGN_FETCH_FUNC(stack, ftype), \ | ||
396 | ASSIGN_FETCH_FUNC(retval, ftype), \ | ||
397 | ASSIGN_FETCH_FUNC(memory, ftype), \ | ||
398 | ASSIGN_FETCH_FUNC(symbol, ftype), \ | ||
399 | ASSIGN_FETCH_FUNC(deref, ftype), \ | ||
400 | ASSIGN_FETCH_FUNC(bitfield, ftype), \ | ||
401 | } \ | ||
402 | } | ||
403 | |||
404 | #define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ | ||
405 | __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype) | ||
406 | |||
407 | #define FETCH_TYPE_STRING 0 | ||
408 | #define FETCH_TYPE_STRSIZE 1 | ||
409 | |||
410 | /* Fetch type information table */ | ||
411 | static const struct fetch_type fetch_type_table[] = { | ||
412 | /* Special types */ | ||
413 | [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, | ||
414 | sizeof(u32), 1, "__data_loc char[]"), | ||
415 | [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32, | ||
416 | string_size, sizeof(u32), 0, "u32"), | ||
417 | /* Basic types */ | ||
418 | ASSIGN_FETCH_TYPE(u8, u8, 0), | ||
419 | ASSIGN_FETCH_TYPE(u16, u16, 0), | ||
420 | ASSIGN_FETCH_TYPE(u32, u32, 0), | ||
421 | ASSIGN_FETCH_TYPE(u64, u64, 0), | ||
422 | ASSIGN_FETCH_TYPE(s8, u8, 1), | ||
423 | ASSIGN_FETCH_TYPE(s16, u16, 1), | ||
424 | ASSIGN_FETCH_TYPE(s32, u32, 1), | ||
425 | ASSIGN_FETCH_TYPE(s64, u64, 1), | ||
426 | }; | ||
427 | |||
428 | static const struct fetch_type *find_fetch_type(const char *type) | ||
429 | { | 216 | { |
430 | int i; | 217 | int i; |
431 | 218 | ||
@@ -446,44 +233,52 @@ static const struct fetch_type *find_fetch_type(const char *type) | |||
446 | 233 | ||
447 | switch (bs) { | 234 | switch (bs) { |
448 | case 8: | 235 | case 8: |
449 | return find_fetch_type("u8"); | 236 | return find_fetch_type("u8", ftbl); |
450 | case 16: | 237 | case 16: |
451 | return find_fetch_type("u16"); | 238 | return find_fetch_type("u16", ftbl); |
452 | case 32: | 239 | case 32: |
453 | return find_fetch_type("u32"); | 240 | return find_fetch_type("u32", ftbl); |
454 | case 64: | 241 | case 64: |
455 | return find_fetch_type("u64"); | 242 | return find_fetch_type("u64", ftbl); |
456 | default: | 243 | default: |
457 | goto fail; | 244 | goto fail; |
458 | } | 245 | } |
459 | } | 246 | } |
460 | 247 | ||
461 | for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++) | 248 | for (i = 0; ftbl[i].name; i++) { |
462 | if (strcmp(type, fetch_type_table[i].name) == 0) | 249 | if (strcmp(type, ftbl[i].name) == 0) |
463 | return &fetch_type_table[i]; | 250 | return &ftbl[i]; |
251 | } | ||
464 | 252 | ||
465 | fail: | 253 | fail: |
466 | return NULL; | 254 | return NULL; |
467 | } | 255 | } |
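Since the bit-size switch above only maps a bitfield to its unsigned container type, a worked example may help. It assumes the usual b&lt;bit-width&gt;@&lt;bit-offset&gt;/&lt;container-size&gt; argument syntax (for example b3@4/32), which is documented elsewhere and not quoted from this hunk, and the shift pair mirrors what the bitfield fetch functions do with hi_shift/low_shift.

    /* Hedged worked example: extract a 3-bit field at bit offset 4 from a
     * 32-bit container, i.e. what a "b3@4/32" type ends up doing after the
     * lookup above returns the u32 entry. */
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint32_t word = 0x000000f0;             /* container value, illustrative */
        unsigned int hi_shift  = 32 - (4 + 3);  /* drop bits above the field */
        unsigned int low_shift = 32 - 3;        /* then right-align the field */
        uint32_t field = (word << hi_shift) >> low_shift;

        printf("field = %u\n", field);          /* bits 4..6 of 0xf0 -> 7 */
        return 0;
    }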
468 | 256 | ||
469 | /* Special function : only accept unsigned long */ | 257 | /* Special function : only accept unsigned long */ |
470 | static __kprobes void fetch_stack_address(struct pt_regs *regs, | 258 | static __kprobes void fetch_kernel_stack_address(struct pt_regs *regs, |
471 | void *dummy, void *dest) | 259 | void *dummy, void *dest) |
472 | { | 260 | { |
473 | *(unsigned long *)dest = kernel_stack_pointer(regs); | 261 | *(unsigned long *)dest = kernel_stack_pointer(regs); |
474 | } | 262 | } |
475 | 263 | ||
264 | static __kprobes void fetch_user_stack_address(struct pt_regs *regs, | ||
265 | void *dummy, void *dest) | ||
266 | { | ||
267 | *(unsigned long *)dest = user_stack_pointer(regs); | ||
268 | } | ||
269 | |||
476 | static fetch_func_t get_fetch_size_function(const struct fetch_type *type, | 270 | static fetch_func_t get_fetch_size_function(const struct fetch_type *type, |
477 | fetch_func_t orig_fn) | 271 | fetch_func_t orig_fn, |
272 | const struct fetch_type *ftbl) | ||
478 | { | 273 | { |
479 | int i; | 274 | int i; |
480 | 275 | ||
481 | if (type != &fetch_type_table[FETCH_TYPE_STRING]) | 276 | if (type != &ftbl[FETCH_TYPE_STRING]) |
482 | return NULL; /* Only string type needs size function */ | 277 | return NULL; /* Only string type needs size function */ |
483 | 278 | ||
484 | for (i = 0; i < FETCH_MTD_END; i++) | 279 | for (i = 0; i < FETCH_MTD_END; i++) |
485 | if (type->fetch[i] == orig_fn) | 280 | if (type->fetch[i] == orig_fn) |
486 | return fetch_type_table[FETCH_TYPE_STRSIZE].fetch[i]; | 281 | return ftbl[FETCH_TYPE_STRSIZE].fetch[i]; |
487 | 282 | ||
488 | WARN_ON(1); /* This should not happen */ | 283 | WARN_ON(1); /* This should not happen */ |
489 | 284 | ||
@@ -516,7 +311,8 @@ int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset) | |||
516 | #define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) | 311 | #define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) |
517 | 312 | ||
518 | static int parse_probe_vars(char *arg, const struct fetch_type *t, | 313 | static int parse_probe_vars(char *arg, const struct fetch_type *t, |
519 | struct fetch_param *f, bool is_return) | 314 | struct fetch_param *f, bool is_return, |
315 | bool is_kprobe) | ||
520 | { | 316 | { |
521 | int ret = 0; | 317 | int ret = 0; |
522 | unsigned long param; | 318 | unsigned long param; |
@@ -528,13 +324,16 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, | |||
528 | ret = -EINVAL; | 324 | ret = -EINVAL; |
529 | } else if (strncmp(arg, "stack", 5) == 0) { | 325 | } else if (strncmp(arg, "stack", 5) == 0) { |
530 | if (arg[5] == '\0') { | 326 | if (arg[5] == '\0') { |
531 | if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR) == 0) | 327 | if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR)) |
532 | f->fn = fetch_stack_address; | 328 | return -EINVAL; |
329 | |||
330 | if (is_kprobe) | ||
331 | f->fn = fetch_kernel_stack_address; | ||
533 | else | 332 | else |
534 | ret = -EINVAL; | 333 | f->fn = fetch_user_stack_address; |
535 | } else if (isdigit(arg[5])) { | 334 | } else if (isdigit(arg[5])) { |
536 | ret = kstrtoul(arg + 5, 10, ¶m); | 335 | ret = kstrtoul(arg + 5, 10, ¶m); |
537 | if (ret || param > PARAM_MAX_STACK) | 336 | if (ret || (is_kprobe && param > PARAM_MAX_STACK)) |
538 | ret = -EINVAL; | 337 | ret = -EINVAL; |
539 | else { | 338 | else { |
540 | f->fn = t->fetch[FETCH_MTD_stack]; | 339 | f->fn = t->fetch[FETCH_MTD_stack]; |
@@ -552,20 +351,18 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, | |||
552 | static int parse_probe_arg(char *arg, const struct fetch_type *t, | 351 | static int parse_probe_arg(char *arg, const struct fetch_type *t, |
553 | struct fetch_param *f, bool is_return, bool is_kprobe) | 352 | struct fetch_param *f, bool is_return, bool is_kprobe) |
554 | { | 353 | { |
354 | const struct fetch_type *ftbl; | ||
555 | unsigned long param; | 355 | unsigned long param; |
556 | long offset; | 356 | long offset; |
557 | char *tmp; | 357 | char *tmp; |
558 | int ret; | 358 | int ret = 0; |
559 | |||
560 | ret = 0; | ||
561 | 359 | ||
562 | /* Until uprobe_events supports only reg arguments */ | 360 | ftbl = is_kprobe ? kprobes_fetch_type_table : uprobes_fetch_type_table; |
563 | if (!is_kprobe && arg[0] != '%') | 361 | BUG_ON(ftbl == NULL); |
564 | return -EINVAL; | ||
565 | 362 | ||
566 | switch (arg[0]) { | 363 | switch (arg[0]) { |
567 | case '$': | 364 | case '$': |
568 | ret = parse_probe_vars(arg + 1, t, f, is_return); | 365 | ret = parse_probe_vars(arg + 1, t, f, is_return, is_kprobe); |
569 | break; | 366 | break; |
570 | 367 | ||
571 | case '%': /* named register */ | 368 | case '%': /* named register */ |
@@ -577,7 +374,7 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t, | |||
577 | } | 374 | } |
578 | break; | 375 | break; |
579 | 376 | ||
580 | case '@': /* memory or symbol */ | 377 | case '@': /* memory, file-offset or symbol */ |
581 | if (isdigit(arg[1])) { | 378 | if (isdigit(arg[1])) { |
582 | ret = kstrtoul(arg + 1, 0, ¶m); | 379 | ret = kstrtoul(arg + 1, 0, ¶m); |
583 | if (ret) | 380 | if (ret) |
@@ -585,7 +382,22 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t, | |||
585 | 382 | ||
586 | f->fn = t->fetch[FETCH_MTD_memory]; | 383 | f->fn = t->fetch[FETCH_MTD_memory]; |
587 | f->data = (void *)param; | 384 | f->data = (void *)param; |
385 | } else if (arg[1] == '+') { | ||
386 | /* kprobes don't support file offsets */ | ||
387 | if (is_kprobe) | ||
388 | return -EINVAL; | ||
389 | |||
390 | ret = kstrtol(arg + 2, 0, &offset); | ||
391 | if (ret) | ||
392 | break; | ||
393 | |||
394 | f->fn = t->fetch[FETCH_MTD_file_offset]; | ||
395 | f->data = (void *)offset; | ||
588 | } else { | 396 | } else { |
397 | /* uprobes don't support symbols */ | ||
398 | if (!is_kprobe) | ||
399 | return -EINVAL; | ||
400 | |||
589 | ret = traceprobe_split_symbol_offset(arg + 1, &offset); | 401 | ret = traceprobe_split_symbol_offset(arg + 1, &offset); |
590 | if (ret) | 402 | if (ret) |
591 | break; | 403 | break; |
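The '@' branch above now accepts three spellings, so a small user-space illustration of the dispatch may help. The probe argument strings are made up, and strtoul()/strtol() stand in for the kernel's kstrtoul()/kstrtol().

    /* Hedged sketch of the '@' argument dispatch above, in user space. */
    #include <ctype.h>
    #include <stdio.h>
    #include <stdlib.h>

    static void classify(const char *arg)    /* arg points just past the '@' */
    {
        if (isdigit((unsigned char)arg[0]))
            printf("@%-20s -> absolute address %#lx\n",
                   arg, strtoul(arg, NULL, 0));
        else if (arg[0] == '+')
            printf("@%-20s -> file offset %#lx (uprobes only)\n",
                   arg, (unsigned long)strtol(arg + 1, NULL, 0));
        else
            printf("@%-20s -> symbol[+offset] (kprobes only)\n", arg);
    }

    int main(void)
    {
        classify("0xffffffff81000000");    /* @ADDR    */
        classify("+0x201000");             /* @+OFFSET */
        classify("jiffies+4");             /* @SYMBOL  */
        return 0;
    }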
@@ -616,7 +428,7 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t, | |||
616 | struct deref_fetch_param *dprm; | 428 | struct deref_fetch_param *dprm; |
617 | const struct fetch_type *t2; | 429 | const struct fetch_type *t2; |
618 | 430 | ||
619 | t2 = find_fetch_type(NULL); | 431 | t2 = find_fetch_type(NULL, ftbl); |
620 | *tmp = '\0'; | 432 | *tmp = '\0'; |
621 | dprm = kzalloc(sizeof(struct deref_fetch_param), GFP_KERNEL); | 433 | dprm = kzalloc(sizeof(struct deref_fetch_param), GFP_KERNEL); |
622 | 434 | ||
@@ -624,6 +436,9 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t, | |||
624 | return -ENOMEM; | 436 | return -ENOMEM; |
625 | 437 | ||
626 | dprm->offset = offset; | 438 | dprm->offset = offset; |
439 | dprm->fetch = t->fetch[FETCH_MTD_memory]; | ||
440 | dprm->fetch_size = get_fetch_size_function(t, | ||
441 | dprm->fetch, ftbl); | ||
627 | ret = parse_probe_arg(arg, t2, &dprm->orig, is_return, | 442 | ret = parse_probe_arg(arg, t2, &dprm->orig, is_return, |
628 | is_kprobe); | 443 | is_kprobe); |
629 | if (ret) | 444 | if (ret) |
@@ -685,9 +500,13 @@ static int __parse_bitfield_probe_arg(const char *bf, | |||
685 | int traceprobe_parse_probe_arg(char *arg, ssize_t *size, | 500 | int traceprobe_parse_probe_arg(char *arg, ssize_t *size, |
686 | struct probe_arg *parg, bool is_return, bool is_kprobe) | 501 | struct probe_arg *parg, bool is_return, bool is_kprobe) |
687 | { | 502 | { |
503 | const struct fetch_type *ftbl; | ||
688 | const char *t; | 504 | const char *t; |
689 | int ret; | 505 | int ret; |
690 | 506 | ||
507 | ftbl = is_kprobe ? kprobes_fetch_type_table : uprobes_fetch_type_table; | ||
508 | BUG_ON(ftbl == NULL); | ||
509 | |||
691 | if (strlen(arg) > MAX_ARGSTR_LEN) { | 510 | if (strlen(arg) > MAX_ARGSTR_LEN) { |
692 | pr_info("Argument is too long.: %s\n", arg); | 511 | pr_info("Argument is too long.: %s\n", arg); |
693 | return -ENOSPC; | 512 | return -ENOSPC; |
@@ -702,7 +521,7 @@ int traceprobe_parse_probe_arg(char *arg, ssize_t *size, | |||
702 | arg[t - parg->comm] = '\0'; | 521 | arg[t - parg->comm] = '\0'; |
703 | t++; | 522 | t++; |
704 | } | 523 | } |
705 | parg->type = find_fetch_type(t); | 524 | parg->type = find_fetch_type(t, ftbl); |
706 | if (!parg->type) { | 525 | if (!parg->type) { |
707 | pr_info("Unsupported type: %s\n", t); | 526 | pr_info("Unsupported type: %s\n", t); |
708 | return -EINVAL; | 527 | return -EINVAL; |
@@ -716,7 +535,8 @@ int traceprobe_parse_probe_arg(char *arg, ssize_t *size, | |||
716 | 535 | ||
717 | if (ret >= 0) { | 536 | if (ret >= 0) { |
718 | parg->fetch_size.fn = get_fetch_size_function(parg->type, | 537 | parg->fetch_size.fn = get_fetch_size_function(parg->type, |
719 | parg->fetch.fn); | 538 | parg->fetch.fn, |
539 | ftbl); | ||
720 | parg->fetch_size.data = parg->fetch.data; | 540 | parg->fetch_size.data = parg->fetch.data; |
721 | } | 541 | } |
722 | 542 | ||
@@ -837,3 +657,65 @@ out: | |||
837 | 657 | ||
838 | return ret; | 658 | return ret; |
839 | } | 659 | } |
660 | |||
661 | static int __set_print_fmt(struct trace_probe *tp, char *buf, int len, | ||
662 | bool is_return) | ||
663 | { | ||
664 | int i; | ||
665 | int pos = 0; | ||
666 | |||
667 | const char *fmt, *arg; | ||
668 | |||
669 | if (!is_return) { | ||
670 | fmt = "(%lx)"; | ||
671 | arg = "REC->" FIELD_STRING_IP; | ||
672 | } else { | ||
673 | fmt = "(%lx <- %lx)"; | ||
674 | arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP; | ||
675 | } | ||
676 | |||
677 | /* When len=0, we just calculate the needed length */ | ||
678 | #define LEN_OR_ZERO (len ? len - pos : 0) | ||
679 | |||
680 | pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt); | ||
681 | |||
682 | for (i = 0; i < tp->nr_args; i++) { | ||
683 | pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s", | ||
684 | tp->args[i].name, tp->args[i].type->fmt); | ||
685 | } | ||
686 | |||
687 | pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); | ||
688 | |||
689 | for (i = 0; i < tp->nr_args; i++) { | ||
690 | if (strcmp(tp->args[i].type->name, "string") == 0) | ||
691 | pos += snprintf(buf + pos, LEN_OR_ZERO, | ||
692 | ", __get_str(%s)", | ||
693 | tp->args[i].name); | ||
694 | else | ||
695 | pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s", | ||
696 | tp->args[i].name); | ||
697 | } | ||
698 | |||
699 | #undef LEN_OR_ZERO | ||
700 | |||
701 | /* return the length of print_fmt */ | ||
702 | return pos; | ||
703 | } | ||
704 | |||
705 | int set_print_fmt(struct trace_probe *tp, bool is_return) | ||
706 | { | ||
707 | int len; | ||
708 | char *print_fmt; | ||
709 | |||
710 | /* First: called with 0 length to calculate the needed length */ | ||
711 | len = __set_print_fmt(tp, NULL, 0, is_return); | ||
712 | print_fmt = kmalloc(len + 1, GFP_KERNEL); | ||
713 | if (!print_fmt) | ||
714 | return -ENOMEM; | ||
715 | |||
716 | /* Second: actually write the @print_fmt */ | ||
717 | __set_print_fmt(tp, print_fmt, len + 1, is_return); | ||
718 | tp->call.print_fmt = print_fmt; | ||
719 | |||
720 | return 0; | ||
721 | } | ||
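The two calls in set_print_fmt() rely on the classic measure-then-format snprintf pattern. Here is a minimal, self-contained user-space sketch of the same idea; the field name and format pieces are made up and only stand in for the per-argument loop above.

    /* Minimal sketch of the two-pass pattern used by set_print_fmt():
     * first call with len == 0 only measures, second call actually formats. */
    #include <stdio.h>
    #include <stdlib.h>

    static int build_fmt(char *buf, size_t len, const char *name)
    {
        int pos = 0;

        /* snprintf(NULL, 0, ...) just reports the needed length */
        pos += snprintf(len ? buf + pos : NULL, len ? len - pos : 0,
                        "\"(%%lx)");
        pos += snprintf(len ? buf + pos : NULL, len ? len - pos : 0,
                        " %s=%%u\"", name);
        return pos;                 /* length excluding the trailing NUL */
    }

    int main(void)
    {
        int len = build_fmt(NULL, 0, "arg1");
        char *fmt = malloc(len + 1);

        if (!fmt)
            return 1;
        build_fmt(fmt, len + 1, "arg1");
        printf("%s (needed %d bytes + NUL)\n", fmt, len);
        free(fmt);
        return 0;
    }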
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 5c7e09d10d74..b73574a5f429 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h | |||
@@ -81,6 +81,17 @@ | |||
81 | */ | 81 | */ |
82 | #define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs)) | 82 | #define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs)) |
83 | 83 | ||
84 | static inline void *get_rloc_data(u32 *dl) | ||
85 | { | ||
86 | return (u8 *)dl + get_rloc_offs(*dl); | ||
87 | } | ||
88 | |||
89 | /* For data_loc conversion */ | ||
90 | static inline void *get_loc_data(u32 *dl, void *ent) | ||
91 | { | ||
92 | return (u8 *)ent + get_rloc_offs(*dl); | ||
93 | } | ||
94 | |||
84 | /* Data fetch function type */ | 95 | /* Data fetch function type */ |
85 | typedef void (*fetch_func_t)(struct pt_regs *, void *, void *); | 96 | typedef void (*fetch_func_t)(struct pt_regs *, void *, void *); |
86 | /* Printing function type */ | 97 | /* Printing function type */ |
@@ -95,6 +106,7 @@ enum { | |||
95 | FETCH_MTD_symbol, | 106 | FETCH_MTD_symbol, |
96 | FETCH_MTD_deref, | 107 | FETCH_MTD_deref, |
97 | FETCH_MTD_bitfield, | 108 | FETCH_MTD_bitfield, |
109 | FETCH_MTD_file_offset, | ||
98 | FETCH_MTD_END, | 110 | FETCH_MTD_END, |
99 | }; | 111 | }; |
100 | 112 | ||
@@ -115,6 +127,148 @@ struct fetch_param { | |||
115 | void *data; | 127 | void *data; |
116 | }; | 128 | }; |
117 | 129 | ||
130 | /* For defining macros, define string/string_size types */ | ||
131 | typedef u32 string; | ||
132 | typedef u32 string_size; | ||
133 | |||
134 | #define PRINT_TYPE_FUNC_NAME(type) print_type_##type | ||
135 | #define PRINT_TYPE_FMT_NAME(type) print_type_format_##type | ||
136 | |||
137 | /* Printing in basic type function template */ | ||
138 | #define DECLARE_BASIC_PRINT_TYPE_FUNC(type) \ | ||
139 | __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \ | ||
140 | const char *name, \ | ||
141 | void *data, void *ent); \ | ||
142 | extern const char PRINT_TYPE_FMT_NAME(type)[] | ||
143 | |||
144 | DECLARE_BASIC_PRINT_TYPE_FUNC(u8); | ||
145 | DECLARE_BASIC_PRINT_TYPE_FUNC(u16); | ||
146 | DECLARE_BASIC_PRINT_TYPE_FUNC(u32); | ||
147 | DECLARE_BASIC_PRINT_TYPE_FUNC(u64); | ||
148 | DECLARE_BASIC_PRINT_TYPE_FUNC(s8); | ||
149 | DECLARE_BASIC_PRINT_TYPE_FUNC(s16); | ||
150 | DECLARE_BASIC_PRINT_TYPE_FUNC(s32); | ||
151 | DECLARE_BASIC_PRINT_TYPE_FUNC(s64); | ||
152 | DECLARE_BASIC_PRINT_TYPE_FUNC(string); | ||
153 | |||
154 | #define FETCH_FUNC_NAME(method, type) fetch_##method##_##type | ||
155 | |||
156 | /* Declare macro for basic types */ | ||
157 | #define DECLARE_FETCH_FUNC(method, type) \ | ||
158 | extern void FETCH_FUNC_NAME(method, type)(struct pt_regs *regs, \ | ||
159 | void *data, void *dest) | ||
160 | |||
161 | #define DECLARE_BASIC_FETCH_FUNCS(method) \ | ||
162 | DECLARE_FETCH_FUNC(method, u8); \ | ||
163 | DECLARE_FETCH_FUNC(method, u16); \ | ||
164 | DECLARE_FETCH_FUNC(method, u32); \ | ||
165 | DECLARE_FETCH_FUNC(method, u64) | ||
166 | |||
167 | DECLARE_BASIC_FETCH_FUNCS(reg); | ||
168 | #define fetch_reg_string NULL | ||
169 | #define fetch_reg_string_size NULL | ||
170 | |||
171 | DECLARE_BASIC_FETCH_FUNCS(retval); | ||
172 | #define fetch_retval_string NULL | ||
173 | #define fetch_retval_string_size NULL | ||
174 | |||
175 | DECLARE_BASIC_FETCH_FUNCS(symbol); | ||
176 | DECLARE_FETCH_FUNC(symbol, string); | ||
177 | DECLARE_FETCH_FUNC(symbol, string_size); | ||
178 | |||
179 | DECLARE_BASIC_FETCH_FUNCS(deref); | ||
180 | DECLARE_FETCH_FUNC(deref, string); | ||
181 | DECLARE_FETCH_FUNC(deref, string_size); | ||
182 | |||
183 | DECLARE_BASIC_FETCH_FUNCS(bitfield); | ||
184 | #define fetch_bitfield_string NULL | ||
185 | #define fetch_bitfield_string_size NULL | ||
186 | |||
187 | /* | ||
188 | * Define a macro for basic types - we don't need to define s* types, because | ||
189 | * we only need to care about the bit width at recording time. | ||
190 | */ | ||
191 | #define DEFINE_BASIC_FETCH_FUNCS(method) \ | ||
192 | DEFINE_FETCH_##method(u8) \ | ||
193 | DEFINE_FETCH_##method(u16) \ | ||
194 | DEFINE_FETCH_##method(u32) \ | ||
195 | DEFINE_FETCH_##method(u64) | ||
196 | |||
197 | /* Default (unsigned long) fetch type */ | ||
198 | #define __DEFAULT_FETCH_TYPE(t) u##t | ||
199 | #define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) | ||
200 | #define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG) | ||
201 | #define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE) | ||
202 | |||
203 | #define ASSIGN_FETCH_FUNC(method, type) \ | ||
204 | [FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type) | ||
205 | |||
206 | #define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \ | ||
207 | {.name = _name, \ | ||
208 | .size = _size, \ | ||
209 | .is_signed = sign, \ | ||
210 | .print = PRINT_TYPE_FUNC_NAME(ptype), \ | ||
211 | .fmt = PRINT_TYPE_FMT_NAME(ptype), \ | ||
212 | .fmttype = _fmttype, \ | ||
213 | .fetch = { \ | ||
214 | ASSIGN_FETCH_FUNC(reg, ftype), \ | ||
215 | ASSIGN_FETCH_FUNC(stack, ftype), \ | ||
216 | ASSIGN_FETCH_FUNC(retval, ftype), \ | ||
217 | ASSIGN_FETCH_FUNC(memory, ftype), \ | ||
218 | ASSIGN_FETCH_FUNC(symbol, ftype), \ | ||
219 | ASSIGN_FETCH_FUNC(deref, ftype), \ | ||
220 | ASSIGN_FETCH_FUNC(bitfield, ftype), \ | ||
221 | ASSIGN_FETCH_FUNC(file_offset, ftype), \ | ||
222 | } \ | ||
223 | } | ||
224 | |||
225 | #define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ | ||
226 | __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype) | ||
227 | |||
228 | #define ASSIGN_FETCH_TYPE_END {} | ||
229 | |||
230 | #define FETCH_TYPE_STRING 0 | ||
231 | #define FETCH_TYPE_STRSIZE 1 | ||
232 | |||
233 | /* | ||
234 | * Fetch type information tables. | ||
235 | * They are declared as weak symbols due to conditional compilation. | ||
236 | */ | ||
237 | extern __weak const struct fetch_type kprobes_fetch_type_table[]; | ||
238 | extern __weak const struct fetch_type uprobes_fetch_type_table[]; | ||
239 | |||
240 | #ifdef CONFIG_KPROBE_EVENT | ||
241 | struct symbol_cache; | ||
242 | unsigned long update_symbol_cache(struct symbol_cache *sc); | ||
243 | void free_symbol_cache(struct symbol_cache *sc); | ||
244 | struct symbol_cache *alloc_symbol_cache(const char *sym, long offset); | ||
245 | #else | ||
246 | /* uprobes do not support symbol fetch methods */ | ||
247 | #define fetch_symbol_u8 NULL | ||
248 | #define fetch_symbol_u16 NULL | ||
249 | #define fetch_symbol_u32 NULL | ||
250 | #define fetch_symbol_u64 NULL | ||
251 | #define fetch_symbol_string NULL | ||
252 | #define fetch_symbol_string_size NULL | ||
253 | |||
254 | struct symbol_cache { | ||
255 | }; | ||
256 | static inline unsigned long __used update_symbol_cache(struct symbol_cache *sc) | ||
257 | { | ||
258 | return 0; | ||
259 | } | ||
260 | |||
261 | static inline void __used free_symbol_cache(struct symbol_cache *sc) | ||
262 | { | ||
263 | } | ||
264 | |||
265 | static inline struct symbol_cache * __used | ||
266 | alloc_symbol_cache(const char *sym, long offset) | ||
267 | { | ||
268 | return NULL; | ||
269 | } | ||
270 | #endif /* CONFIG_KPROBE_EVENT */ | ||
271 | |||
118 | struct probe_arg { | 272 | struct probe_arg { |
119 | struct fetch_param fetch; | 273 | struct fetch_param fetch; |
120 | struct fetch_param fetch_size; | 274 | struct fetch_param fetch_size; |
@@ -124,6 +278,26 @@ struct probe_arg { | |||
124 | const struct fetch_type *type; /* Type of this argument */ | 278 | const struct fetch_type *type; /* Type of this argument */ |
125 | }; | 279 | }; |
126 | 280 | ||
281 | struct trace_probe { | ||
282 | unsigned int flags; /* For TP_FLAG_* */ | ||
283 | struct ftrace_event_class class; | ||
284 | struct ftrace_event_call call; | ||
285 | struct list_head files; | ||
286 | ssize_t size; /* trace entry size */ | ||
287 | unsigned int nr_args; | ||
288 | struct probe_arg args[]; | ||
289 | }; | ||
290 | |||
291 | static inline bool trace_probe_is_enabled(struct trace_probe *tp) | ||
292 | { | ||
293 | return !!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE)); | ||
294 | } | ||
295 | |||
296 | static inline bool trace_probe_is_registered(struct trace_probe *tp) | ||
297 | { | ||
298 | return !!(tp->flags & TP_FLAG_REGISTERED); | ||
299 | } | ||
300 | |||
127 | static inline __kprobes void call_fetch(struct fetch_param *fprm, | 301 | static inline __kprobes void call_fetch(struct fetch_param *fprm, |
128 | struct pt_regs *regs, void *dest) | 302 | struct pt_regs *regs, void *dest) |
129 | { | 303 | { |
@@ -158,3 +332,53 @@ extern ssize_t traceprobe_probes_write(struct file *file, | |||
158 | int (*createfn)(int, char**)); | 332 | int (*createfn)(int, char**)); |
159 | 333 | ||
160 | extern int traceprobe_command(const char *buf, int (*createfn)(int, char**)); | 334 | extern int traceprobe_command(const char *buf, int (*createfn)(int, char**)); |
335 | |||
336 | /* Sum up total data length for dynamic arrays (strings) */ | ||
337 | static inline __kprobes int | ||
338 | __get_data_size(struct trace_probe *tp, struct pt_regs *regs) | ||
339 | { | ||
340 | int i, ret = 0; | ||
341 | u32 len; | ||
342 | |||
343 | for (i = 0; i < tp->nr_args; i++) | ||
344 | if (unlikely(tp->args[i].fetch_size.fn)) { | ||
345 | call_fetch(&tp->args[i].fetch_size, regs, &len); | ||
346 | ret += len; | ||
347 | } | ||
348 | |||
349 | return ret; | ||
350 | } | ||
351 | |||
352 | /* Store the value of each argument */ | ||
353 | static inline __kprobes void | ||
354 | store_trace_args(int ent_size, struct trace_probe *tp, struct pt_regs *regs, | ||
355 | u8 *data, int maxlen) | ||
356 | { | ||
357 | int i; | ||
358 | u32 end = tp->size; | ||
359 | u32 *dl; /* Data (relative) location */ | ||
360 | |||
361 | for (i = 0; i < tp->nr_args; i++) { | ||
362 | if (unlikely(tp->args[i].fetch_size.fn)) { | ||
363 | /* | ||
364 | * First, we set the relative location and | ||
365 | * maximum data length to *dl | ||
366 | */ | ||
367 | dl = (u32 *)(data + tp->args[i].offset); | ||
368 | *dl = make_data_rloc(maxlen, end - tp->args[i].offset); | ||
369 | /* Then try to fetch string or dynamic array data */ | ||
370 | call_fetch(&tp->args[i].fetch, regs, dl); | ||
371 | /* Reduce maximum length */ | ||
372 | end += get_rloc_len(*dl); | ||
373 | maxlen -= get_rloc_len(*dl); | ||
374 | /* Trick here, convert data_rloc to data_loc */ | ||
375 | *dl = convert_rloc_to_loc(*dl, | ||
376 | ent_size + tp->args[i].offset); | ||
377 | } else | ||
378 | /* Just fetching data normally */ | ||
379 | call_fetch(&tp->args[i].fetch, regs, | ||
380 | data + tp->args[i].offset); | ||
381 | } | ||
382 | } | ||
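store_trace_args() leans on the data_rloc helpers defined earlier in this header. As a hedged aid, the macro bodies below restate the usual 16-bit length / 16-bit offset packing of make_data_rloc() as an assumption; they are not quoted from this hunk.

    /* Hedged sketch of the relative data location bookkeeping used above. */
    #include <stdint.h>
    #include <stdio.h>

    #define make_data_rloc(len, roffs) \
        (((uint32_t)(len) << 16) | ((uint32_t)(roffs) & 0xffff))
    #define get_rloc_len(dl)   ((uint32_t)(dl) >> 16)
    #define get_rloc_offs(dl)  ((uint32_t)(dl) & 0xffff)

    int main(void)
    {
        /* A string argument slot with 64 bytes of dynamic area left, whose
         * data will land 16 bytes after the slot (values illustrative). */
        uint32_t dl = make_data_rloc(64, 16);

        printf("maxlen=%u rel_offs=%u\n", get_rloc_len(dl), get_rloc_offs(dl));

        /* After fetching a 5-byte string, the fetch function rewrites it: */
        dl = make_data_rloc(5, get_rloc_offs(dl));
        printf("len=%u rel_offs=%u\n", get_rloc_len(dl), get_rloc_offs(dl));
        return 0;
    }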
383 | |||
384 | extern int set_print_fmt(struct trace_probe *tp, bool is_return); | ||
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index fee77e15d815..6e32635e5e57 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c | |||
@@ -16,6 +16,7 @@ | |||
16 | #include <linux/uaccess.h> | 16 | #include <linux/uaccess.h> |
17 | #include <linux/ftrace.h> | 17 | #include <linux/ftrace.h> |
18 | #include <linux/sched/rt.h> | 18 | #include <linux/sched/rt.h> |
19 | #include <linux/sched/deadline.h> | ||
19 | #include <trace/events/sched.h> | 20 | #include <trace/events/sched.h> |
20 | #include "trace.h" | 21 | #include "trace.h" |
21 | 22 | ||
@@ -27,6 +28,8 @@ static int wakeup_cpu; | |||
27 | static int wakeup_current_cpu; | 28 | static int wakeup_current_cpu; |
28 | static unsigned wakeup_prio = -1; | 29 | static unsigned wakeup_prio = -1; |
29 | static int wakeup_rt; | 30 | static int wakeup_rt; |
31 | static int wakeup_dl; | ||
32 | static int tracing_dl = 0; | ||
30 | 33 | ||
31 | static arch_spinlock_t wakeup_lock = | 34 | static arch_spinlock_t wakeup_lock = |
32 | (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; | 35 | (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; |
@@ -437,6 +440,7 @@ static void __wakeup_reset(struct trace_array *tr) | |||
437 | { | 440 | { |
438 | wakeup_cpu = -1; | 441 | wakeup_cpu = -1; |
439 | wakeup_prio = -1; | 442 | wakeup_prio = -1; |
443 | tracing_dl = 0; | ||
440 | 444 | ||
441 | if (wakeup_task) | 445 | if (wakeup_task) |
442 | put_task_struct(wakeup_task); | 446 | put_task_struct(wakeup_task); |
@@ -472,9 +476,17 @@ probe_wakeup(void *ignore, struct task_struct *p, int success) | |||
472 | tracing_record_cmdline(p); | 476 | tracing_record_cmdline(p); |
473 | tracing_record_cmdline(current); | 477 | tracing_record_cmdline(current); |
474 | 478 | ||
475 | if ((wakeup_rt && !rt_task(p)) || | 479 | /* |
476 | p->prio >= wakeup_prio || | 480 | * The semantics are as follows: |
477 | p->prio >= current->prio) | 481 | * - wakeup tracer handles all tasks in the system, independently |
482 | * of their scheduling class; | ||
483 | * - wakeup_rt tracer handles tasks belonging to sched_dl and | ||
484 | * sched_rt class; | ||
485 | * - wakeup_dl handles tasks belonging to sched_dl class only. | ||
486 | */ | ||
487 | if (tracing_dl || (wakeup_dl && !dl_task(p)) || | ||
488 | (wakeup_rt && !dl_task(p) && !rt_task(p)) || | ||
489 | (!dl_task(p) && (p->prio >= wakeup_prio || p->prio >= current->prio))) | ||
478 | return; | 490 | return; |
479 | 491 | ||
480 | pc = preempt_count(); | 492 | pc = preempt_count(); |
@@ -486,7 +498,8 @@ probe_wakeup(void *ignore, struct task_struct *p, int success) | |||
486 | arch_spin_lock(&wakeup_lock); | 498 | arch_spin_lock(&wakeup_lock); |
487 | 499 | ||
488 | /* check for races. */ | 500 | /* check for races. */ |
489 | if (!tracer_enabled || p->prio >= wakeup_prio) | 501 | if (!tracer_enabled || tracing_dl || |
502 | (!dl_task(p) && p->prio >= wakeup_prio)) | ||
490 | goto out_locked; | 503 | goto out_locked; |
491 | 504 | ||
492 | /* reset the trace */ | 505 | /* reset the trace */ |
@@ -496,6 +509,15 @@ probe_wakeup(void *ignore, struct task_struct *p, int success) | |||
496 | wakeup_current_cpu = wakeup_cpu; | 509 | wakeup_current_cpu = wakeup_cpu; |
497 | wakeup_prio = p->prio; | 510 | wakeup_prio = p->prio; |
498 | 511 | ||
512 | /* | ||
513 | * Once you start tracing a -deadline task, don't bother tracing | ||
514 | * another task until the first one wakes up. | ||
515 | */ | ||
516 | if (dl_task(p)) | ||
517 | tracing_dl = 1; | ||
518 | else | ||
519 | tracing_dl = 0; | ||
520 | |||
499 | wakeup_task = p; | 521 | wakeup_task = p; |
500 | get_task_struct(wakeup_task); | 522 | get_task_struct(wakeup_task); |
501 | 523 | ||
@@ -597,16 +619,25 @@ static int __wakeup_tracer_init(struct trace_array *tr) | |||
597 | 619 | ||
598 | static int wakeup_tracer_init(struct trace_array *tr) | 620 | static int wakeup_tracer_init(struct trace_array *tr) |
599 | { | 621 | { |
622 | wakeup_dl = 0; | ||
600 | wakeup_rt = 0; | 623 | wakeup_rt = 0; |
601 | return __wakeup_tracer_init(tr); | 624 | return __wakeup_tracer_init(tr); |
602 | } | 625 | } |
603 | 626 | ||
604 | static int wakeup_rt_tracer_init(struct trace_array *tr) | 627 | static int wakeup_rt_tracer_init(struct trace_array *tr) |
605 | { | 628 | { |
629 | wakeup_dl = 0; | ||
606 | wakeup_rt = 1; | 630 | wakeup_rt = 1; |
607 | return __wakeup_tracer_init(tr); | 631 | return __wakeup_tracer_init(tr); |
608 | } | 632 | } |
609 | 633 | ||
634 | static int wakeup_dl_tracer_init(struct trace_array *tr) | ||
635 | { | ||
636 | wakeup_dl = 1; | ||
637 | wakeup_rt = 0; | ||
638 | return __wakeup_tracer_init(tr); | ||
639 | } | ||
640 | |||
610 | static void wakeup_tracer_reset(struct trace_array *tr) | 641 | static void wakeup_tracer_reset(struct trace_array *tr) |
611 | { | 642 | { |
612 | int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT; | 643 | int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT; |
@@ -674,6 +705,28 @@ static struct tracer wakeup_rt_tracer __read_mostly = | |||
674 | .use_max_tr = true, | 705 | .use_max_tr = true, |
675 | }; | 706 | }; |
676 | 707 | ||
708 | static struct tracer wakeup_dl_tracer __read_mostly = | ||
709 | { | ||
710 | .name = "wakeup_dl", | ||
711 | .init = wakeup_dl_tracer_init, | ||
712 | .reset = wakeup_tracer_reset, | ||
713 | .start = wakeup_tracer_start, | ||
714 | .stop = wakeup_tracer_stop, | ||
715 | .wait_pipe = poll_wait_pipe, | ||
716 | .print_max = true, | ||
717 | .print_header = wakeup_print_header, | ||
718 | .print_line = wakeup_print_line, | ||
719 | .flags = &tracer_flags, | ||
720 | .set_flag = wakeup_set_flag, | ||
721 | .flag_changed = wakeup_flag_changed, | ||
722 | #ifdef CONFIG_FTRACE_SELFTEST | ||
723 | .selftest = trace_selftest_startup_wakeup, | ||
724 | #endif | ||
725 | .open = wakeup_trace_open, | ||
726 | .close = wakeup_trace_close, | ||
727 | .use_max_tr = true, | ||
728 | }; | ||
729 | |||
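A hedged usage sketch for the new tracer registered above: it selects wakeup_dl through tracefs, which is equivalent to an echo from the shell. The mount point is an assumption here (tracefs may also live at /sys/kernel/tracing).

    /* Hedged usage sketch: select the wakeup_dl tracer via tracefs. */
    #include <stdio.h>

    int main(void)
    {
        FILE *f = fopen("/sys/kernel/debug/tracing/current_tracer", "w");

        if (!f) {
            perror("current_tracer");
            return 1;
        }
        fputs("wakeup_dl\n", f);
        fclose(f);
        return 0;
    }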
677 | __init static int init_wakeup_tracer(void) | 730 | __init static int init_wakeup_tracer(void) |
678 | { | 731 | { |
679 | int ret; | 732 | int ret; |
@@ -686,6 +739,10 @@ __init static int init_wakeup_tracer(void) | |||
686 | if (ret) | 739 | if (ret) |
687 | return ret; | 740 | return ret; |
688 | 741 | ||
742 | ret = register_tracer(&wakeup_dl_tracer); | ||
743 | if (ret) | ||
744 | return ret; | ||
745 | |||
689 | return 0; | 746 | return 0; |
690 | } | 747 | } |
691 | core_initcall(init_wakeup_tracer); | 748 | core_initcall(init_wakeup_tracer); |
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index a7329b7902f8..e98fca60974f 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c | |||
@@ -1022,11 +1022,16 @@ trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr) | |||
1022 | #ifdef CONFIG_SCHED_TRACER | 1022 | #ifdef CONFIG_SCHED_TRACER |
1023 | static int trace_wakeup_test_thread(void *data) | 1023 | static int trace_wakeup_test_thread(void *data) |
1024 | { | 1024 | { |
1025 | /* Make this a RT thread, doesn't need to be too high */ | 1025 | /* Make this a -deadline thread */ |
1026 | static const struct sched_param param = { .sched_priority = 5 }; | 1026 | static const struct sched_attr attr = { |
1027 | .sched_policy = SCHED_DEADLINE, | ||
1028 | .sched_runtime = 100000ULL, | ||
1029 | .sched_deadline = 10000000ULL, | ||
1030 | .sched_period = 10000000ULL | ||
1031 | }; | ||
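For readers unfamiliar with sched_attr, the time fields above are in nanoseconds, so this is a request for roughly 1% of a CPU. A quick arithmetic check with the same values:

    /* Quick check: 100 us of runtime every 10 ms period is a ~1% reservation. */
    #include <stdio.h>

    int main(void)
    {
        unsigned long long runtime_ns = 100000ULL;     /* .sched_runtime */
        unsigned long long period_ns  = 10000000ULL;   /* .sched_period  */

        printf("utilization = %.2f%%\n", 100.0 * runtime_ns / period_ns);
        return 0;
    }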
1027 | struct completion *x = data; | 1032 | struct completion *x = data; |
1028 | 1033 | ||
1029 | sched_setscheduler(current, SCHED_FIFO, ¶m); | 1034 | sched_setattr(current, &attr); |
1030 | 1035 | ||
1031 | /* Make it know we have a new prio */ | 1036 | /* Make it know we have a new prio */ |
1032 | complete(x); | 1037 | complete(x); |
@@ -1040,8 +1045,8 @@ static int trace_wakeup_test_thread(void *data) | |||
1040 | /* we are awake, now wait to disappear */ | 1045 | /* we are awake, now wait to disappear */ |
1041 | while (!kthread_should_stop()) { | 1046 | while (!kthread_should_stop()) { |
1042 | /* | 1047 | /* |
1043 | * This is an RT task, do short sleeps to let | 1048 | * This will likely be the system's top-priority |
1044 | * others run. | 1049 | * task, so do short sleeps to let others run. |
1045 | */ | 1050 | */ |
1046 | msleep(100); | 1051 | msleep(100); |
1047 | } | 1052 | } |
@@ -1054,21 +1059,21 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) | |||
1054 | { | 1059 | { |
1055 | unsigned long save_max = tracing_max_latency; | 1060 | unsigned long save_max = tracing_max_latency; |
1056 | struct task_struct *p; | 1061 | struct task_struct *p; |
1057 | struct completion isrt; | 1062 | struct completion is_ready; |
1058 | unsigned long count; | 1063 | unsigned long count; |
1059 | int ret; | 1064 | int ret; |
1060 | 1065 | ||
1061 | init_completion(&isrt); | 1066 | init_completion(&is_ready); |
1062 | 1067 | ||
1063 | /* create a high prio thread */ | 1068 | /* create a -deadline thread */ |
1064 | p = kthread_run(trace_wakeup_test_thread, &isrt, "ftrace-test"); | 1069 | p = kthread_run(trace_wakeup_test_thread, &is_ready, "ftrace-test"); |
1065 | if (IS_ERR(p)) { | 1070 | if (IS_ERR(p)) { |
1066 | printk(KERN_CONT "Failed to create ftrace wakeup test thread "); | 1071 | printk(KERN_CONT "Failed to create ftrace wakeup test thread "); |
1067 | return -1; | 1072 | return -1; |
1068 | } | 1073 | } |
1069 | 1074 | ||
1070 | /* make sure the thread is running at an RT prio */ | 1075 | /* make sure the thread is running at -deadline policy */ |
1071 | wait_for_completion(&isrt); | 1076 | wait_for_completion(&is_ready); |
1072 | 1077 | ||
1073 | /* start the tracing */ | 1078 | /* start the tracing */ |
1074 | ret = tracer_init(trace, tr); | 1079 | ret = tracer_init(trace, tr); |
@@ -1082,19 +1087,19 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) | |||
1082 | 1087 | ||
1083 | while (p->on_rq) { | 1088 | while (p->on_rq) { |
1084 | /* | 1089 | /* |
1085 | * Sleep to make sure the RT thread is asleep too. | 1090 | * Sleep to make sure the -deadline thread is asleep too. |
1086 | * On virtual machines we can't rely on timings, | 1091 | * On virtual machines we can't rely on timings, |
1087 | * but we want to make sure this test still works. | 1092 | * but we want to make sure this test still works. |
1088 | */ | 1093 | */ |
1089 | msleep(100); | 1094 | msleep(100); |
1090 | } | 1095 | } |
1091 | 1096 | ||
1092 | init_completion(&isrt); | 1097 | init_completion(&is_ready); |
1093 | 1098 | ||
1094 | wake_up_process(p); | 1099 | wake_up_process(p); |
1095 | 1100 | ||
1096 | /* Wait for the task to wake up */ | 1101 | /* Wait for the task to wake up */ |
1097 | wait_for_completion(&isrt); | 1102 | wait_for_completion(&is_ready); |
1098 | 1103 | ||
1099 | /* stop the tracing. */ | 1104 | /* stop the tracing. */ |
1100 | tracing_stop(); | 1105 | tracing_stop(); |
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index b20428c5efe2..e6be585cf06a 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c | |||
@@ -382,7 +382,7 @@ static const struct file_operations stack_trace_filter_fops = { | |||
382 | .open = stack_trace_filter_open, | 382 | .open = stack_trace_filter_open, |
383 | .read = seq_read, | 383 | .read = seq_read, |
384 | .write = ftrace_filter_write, | 384 | .write = ftrace_filter_write, |
385 | .llseek = ftrace_filter_lseek, | 385 | .llseek = tracing_lseek, |
386 | .release = ftrace_regex_release, | 386 | .release = ftrace_regex_release, |
387 | }; | 387 | }; |
388 | 388 | ||
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index ea90eb5f6f17..759d5e004517 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c | |||
@@ -321,7 +321,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) | |||
321 | if (!ftrace_file) | 321 | if (!ftrace_file) |
322 | return; | 322 | return; |
323 | 323 | ||
324 | if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &ftrace_file->flags)) | 324 | if (ftrace_trigger_soft_disabled(ftrace_file)) |
325 | return; | 325 | return; |
326 | 326 | ||
327 | sys_data = syscall_nr_to_meta(syscall_nr); | 327 | sys_data = syscall_nr_to_meta(syscall_nr); |
@@ -343,9 +343,8 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) | |||
343 | entry->nr = syscall_nr; | 343 | entry->nr = syscall_nr; |
344 | syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args); | 344 | syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args); |
345 | 345 | ||
346 | if (!filter_check_discard(ftrace_file, entry, buffer, event)) | 346 | event_trigger_unlock_commit(ftrace_file, buffer, event, entry, |
347 | trace_current_buffer_unlock_commit(buffer, event, | 347 | irq_flags, pc); |
348 | irq_flags, pc); | ||
349 | } | 348 | } |
350 | 349 | ||
351 | static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) | 350 | static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) |
@@ -369,7 +368,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) | |||
369 | if (!ftrace_file) | 368 | if (!ftrace_file) |
370 | return; | 369 | return; |
371 | 370 | ||
372 | if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &ftrace_file->flags)) | 371 | if (ftrace_trigger_soft_disabled(ftrace_file)) |
373 | return; | 372 | return; |
374 | 373 | ||
375 | sys_data = syscall_nr_to_meta(syscall_nr); | 374 | sys_data = syscall_nr_to_meta(syscall_nr); |
@@ -390,9 +389,8 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) | |||
390 | entry->nr = syscall_nr; | 389 | entry->nr = syscall_nr; |
391 | entry->ret = syscall_get_return_value(current, regs); | 390 | entry->ret = syscall_get_return_value(current, regs); |
392 | 391 | ||
393 | if (!filter_check_discard(ftrace_file, entry, buffer, event)) | 392 | event_trigger_unlock_commit(ftrace_file, buffer, event, entry, |
394 | trace_current_buffer_unlock_commit(buffer, event, | 393 | irq_flags, pc); |
395 | irq_flags, pc); | ||
396 | } | 394 | } |
397 | 395 | ||
398 | static int reg_event_syscall_enter(struct ftrace_event_file *file, | 396 | static int reg_event_syscall_enter(struct ftrace_event_file *file, |
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index b6dcc42ef7f5..79e52d93860b 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c | |||
@@ -51,22 +51,17 @@ struct trace_uprobe_filter { | |||
51 | */ | 51 | */ |
52 | struct trace_uprobe { | 52 | struct trace_uprobe { |
53 | struct list_head list; | 53 | struct list_head list; |
54 | struct ftrace_event_class class; | ||
55 | struct ftrace_event_call call; | ||
56 | struct trace_uprobe_filter filter; | 54 | struct trace_uprobe_filter filter; |
57 | struct uprobe_consumer consumer; | 55 | struct uprobe_consumer consumer; |
58 | struct inode *inode; | 56 | struct inode *inode; |
59 | char *filename; | 57 | char *filename; |
60 | unsigned long offset; | 58 | unsigned long offset; |
61 | unsigned long nhit; | 59 | unsigned long nhit; |
62 | unsigned int flags; /* For TP_FLAG_* */ | 60 | struct trace_probe tp; |
63 | ssize_t size; /* trace entry size */ | ||
64 | unsigned int nr_args; | ||
65 | struct probe_arg args[]; | ||
66 | }; | 61 | }; |
67 | 62 | ||
68 | #define SIZEOF_TRACE_UPROBE(n) \ | 63 | #define SIZEOF_TRACE_UPROBE(n) \ |
69 | (offsetof(struct trace_uprobe, args) + \ | 64 | (offsetof(struct trace_uprobe, tp.args) + \ |
70 | (sizeof(struct probe_arg) * (n))) | 65 | (sizeof(struct probe_arg) * (n))) |
71 | 66 | ||
72 | static int register_uprobe_event(struct trace_uprobe *tu); | 67 | static int register_uprobe_event(struct trace_uprobe *tu); |
@@ -75,10 +70,151 @@ static int unregister_uprobe_event(struct trace_uprobe *tu); | |||
75 | static DEFINE_MUTEX(uprobe_lock); | 70 | static DEFINE_MUTEX(uprobe_lock); |
76 | static LIST_HEAD(uprobe_list); | 71 | static LIST_HEAD(uprobe_list); |
77 | 72 | ||
73 | struct uprobe_dispatch_data { | ||
74 | struct trace_uprobe *tu; | ||
75 | unsigned long bp_addr; | ||
76 | }; | ||
77 | |||
78 | static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs); | 78 | static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs); |
79 | static int uretprobe_dispatcher(struct uprobe_consumer *con, | 79 | static int uretprobe_dispatcher(struct uprobe_consumer *con, |
80 | unsigned long func, struct pt_regs *regs); | 80 | unsigned long func, struct pt_regs *regs); |
81 | 81 | ||
82 | #ifdef CONFIG_STACK_GROWSUP | ||
83 | static unsigned long adjust_stack_addr(unsigned long addr, unsigned int n) | ||
84 | { | ||
85 | return addr - (n * sizeof(long)); | ||
86 | } | ||
87 | #else | ||
88 | static unsigned long adjust_stack_addr(unsigned long addr, unsigned int n) | ||
89 | { | ||
90 | return addr + (n * sizeof(long)); | ||
91 | } | ||
92 | #endif | ||
93 | |||
94 | static unsigned long get_user_stack_nth(struct pt_regs *regs, unsigned int n) | ||
95 | { | ||
96 | unsigned long ret; | ||
97 | unsigned long addr = user_stack_pointer(regs); | ||
98 | |||
99 | addr = adjust_stack_addr(addr, n); | ||
100 | |||
101 | if (copy_from_user(&ret, (void __force __user *) addr, sizeof(ret))) | ||
102 | return 0; | ||
103 | |||
104 | return ret; | ||
105 | } | ||
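get_user_stack_nth() simply indexes the user stack in word-sized steps, with the direction chosen by CONFIG_STACK_GROWSUP. A hedged user-space illustration of the common downward-growing case; the fake stack array is of course illustrative.

    /* Hedged illustration: in the common !CONFIG_STACK_GROWSUP case, entry N
     * lives at sp + N * sizeof(long), which is what a "$stackN" uprobe
     * argument reads. */
    #include <stdio.h>

    int main(void)
    {
        unsigned long fake_stack[4] = { 0x11, 0x22, 0x33, 0x44 };
        unsigned long sp = (unsigned long)&fake_stack[0];  /* pretend %sp */
        unsigned int n = 2;
        unsigned long addr = sp + n * sizeof(long);

        printf("$stack%u -> %#lx\n", n, *(unsigned long *)addr);  /* 0x33 */
        return 0;
    }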
106 | |||
107 | /* | ||
108 | * Uprobes-specific fetch functions | ||
109 | */ | ||
110 | #define DEFINE_FETCH_stack(type) \ | ||
111 | static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ | ||
112 | void *offset, void *dest) \ | ||
113 | { \ | ||
114 | *(type *)dest = (type)get_user_stack_nth(regs, \ | ||
115 | ((unsigned long)offset)); \ | ||
116 | } | ||
117 | DEFINE_BASIC_FETCH_FUNCS(stack) | ||
118 | /* No string on the stack entry */ | ||
119 | #define fetch_stack_string NULL | ||
120 | #define fetch_stack_string_size NULL | ||
121 | |||
122 | #define DEFINE_FETCH_memory(type) \ | ||
123 | static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ | ||
124 | void *addr, void *dest) \ | ||
125 | { \ | ||
126 | type retval; \ | ||
127 | void __user *vaddr = (void __force __user *) addr; \ | ||
128 | \ | ||
129 | if (copy_from_user(&retval, vaddr, sizeof(type))) \ | ||
130 | *(type *)dest = 0; \ | ||
131 | else \ | ||
132 | *(type *) dest = retval; \ | ||
133 | } | ||
134 | DEFINE_BASIC_FETCH_FUNCS(memory) | ||
135 | /* | ||
136 | * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max | ||
137 | * length and relative data location. | ||
138 | */ | ||
139 | static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, | ||
140 | void *addr, void *dest) | ||
141 | { | ||
142 | long ret; | ||
143 | u32 rloc = *(u32 *)dest; | ||
144 | int maxlen = get_rloc_len(rloc); | ||
145 | u8 *dst = get_rloc_data(dest); | ||
146 | void __user *src = (void __force __user *) addr; | ||
147 | |||
148 | if (!maxlen) | ||
149 | return; | ||
150 | |||
151 | ret = strncpy_from_user(dst, src, maxlen); | ||
152 | |||
153 | if (ret < 0) { /* Failed to fetch string */ | ||
154 | ((u8 *)get_rloc_data(dest))[0] = '\0'; | ||
155 | *(u32 *)dest = make_data_rloc(0, get_rloc_offs(rloc)); | ||
156 | } else { | ||
157 | *(u32 *)dest = make_data_rloc(ret, get_rloc_offs(rloc)); | ||
158 | } | ||
159 | } | ||
160 | |||
161 | static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, | ||
162 | void *addr, void *dest) | ||
163 | { | ||
164 | int len; | ||
165 | void __user *vaddr = (void __force __user *) addr; | ||
166 | |||
167 | len = strnlen_user(vaddr, MAX_STRING_SIZE); | ||
168 | |||
169 | if (len == 0 || len > MAX_STRING_SIZE) /* Failed to check length */ | ||
170 | *(u32 *)dest = 0; | ||
171 | else | ||
172 | *(u32 *)dest = len; | ||
173 | } | ||
174 | |||
175 | static unsigned long translate_user_vaddr(void *file_offset) | ||
176 | { | ||
177 | unsigned long base_addr; | ||
178 | struct uprobe_dispatch_data *udd; | ||
179 | |||
180 | udd = (void *) current->utask->vaddr; | ||
181 | |||
182 | base_addr = udd->bp_addr - udd->tu->offset; | ||
183 | return base_addr + (unsigned long)file_offset; | ||
184 | } | ||
185 | |||
186 | #define DEFINE_FETCH_file_offset(type) \ | ||
187 | static __kprobes void FETCH_FUNC_NAME(file_offset, type)(struct pt_regs *regs,\ | ||
188 | void *offset, void *dest) \ | ||
189 | { \ | ||
190 | void *vaddr = (void *)translate_user_vaddr(offset); \ | ||
191 | \ | ||
192 | FETCH_FUNC_NAME(memory, type)(regs, vaddr, dest); \ | ||
193 | } | ||
194 | DEFINE_BASIC_FETCH_FUNCS(file_offset) | ||
195 | DEFINE_FETCH_file_offset(string) | ||
196 | DEFINE_FETCH_file_offset(string_size) | ||
197 | |||
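translate_user_vaddr() above is just base-plus-offset arithmetic around the breakpoint address. A hedged worked example with made-up addresses:

    /* Hedged worked example: the probe sits at file offset 0x4710 and fired
     * at 0x7f3c1a404710, so the mapping base is their difference and a
     * "@+0x201000" argument is read relative to that base. */
    #include <stdio.h>

    int main(void)
    {
        unsigned long bp_addr     = 0x7f3c1a404710UL;  /* udd->bp_addr    */
        unsigned long tu_offset   = 0x4710UL;          /* udd->tu->offset */
        unsigned long file_offset = 0x201000UL;        /* "@+0x201000"    */
        unsigned long base        = bp_addr - tu_offset;

        printf("base=%#lx target=%#lx\n", base, base + file_offset);
        return 0;
    }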
198 | /* Fetch type information table */ | ||
199 | const struct fetch_type uprobes_fetch_type_table[] = { | ||
200 | /* Special types */ | ||
201 | [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, | ||
202 | sizeof(u32), 1, "__data_loc char[]"), | ||
203 | [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32, | ||
204 | string_size, sizeof(u32), 0, "u32"), | ||
205 | /* Basic types */ | ||
206 | ASSIGN_FETCH_TYPE(u8, u8, 0), | ||
207 | ASSIGN_FETCH_TYPE(u16, u16, 0), | ||
208 | ASSIGN_FETCH_TYPE(u32, u32, 0), | ||
209 | ASSIGN_FETCH_TYPE(u64, u64, 0), | ||
210 | ASSIGN_FETCH_TYPE(s8, u8, 1), | ||
211 | ASSIGN_FETCH_TYPE(s16, u16, 1), | ||
212 | ASSIGN_FETCH_TYPE(s32, u32, 1), | ||
213 | ASSIGN_FETCH_TYPE(s64, u64, 1), | ||
214 | |||
215 | ASSIGN_FETCH_TYPE_END | ||
216 | }; | ||
217 | |||
82 | static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter) | 218 | static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter) |
83 | { | 219 | { |
84 | rwlock_init(&filter->rwlock); | 220 | rwlock_init(&filter->rwlock); |
@@ -114,13 +250,13 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret) | |||
114 | if (!tu) | 250 | if (!tu) |
115 | return ERR_PTR(-ENOMEM); | 251 | return ERR_PTR(-ENOMEM); |
116 | 252 | ||
117 | tu->call.class = &tu->class; | 253 | tu->tp.call.class = &tu->tp.class; |
118 | tu->call.name = kstrdup(event, GFP_KERNEL); | 254 | tu->tp.call.name = kstrdup(event, GFP_KERNEL); |
119 | if (!tu->call.name) | 255 | if (!tu->tp.call.name) |
120 | goto error; | 256 | goto error; |
121 | 257 | ||
122 | tu->class.system = kstrdup(group, GFP_KERNEL); | 258 | tu->tp.class.system = kstrdup(group, GFP_KERNEL); |
123 | if (!tu->class.system) | 259 | if (!tu->tp.class.system) |
124 | goto error; | 260 | goto error; |
125 | 261 | ||
126 | INIT_LIST_HEAD(&tu->list); | 262 | INIT_LIST_HEAD(&tu->list); |
@@ -128,11 +264,11 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret) | |||
128 | if (is_ret) | 264 | if (is_ret) |
129 | tu->consumer.ret_handler = uretprobe_dispatcher; | 265 | tu->consumer.ret_handler = uretprobe_dispatcher; |
130 | init_trace_uprobe_filter(&tu->filter); | 266 | init_trace_uprobe_filter(&tu->filter); |
131 | tu->call.flags |= TRACE_EVENT_FL_USE_CALL_FILTER; | 267 | tu->tp.call.flags |= TRACE_EVENT_FL_USE_CALL_FILTER; |
132 | return tu; | 268 | return tu; |
133 | 269 | ||
134 | error: | 270 | error: |
135 | kfree(tu->call.name); | 271 | kfree(tu->tp.call.name); |
136 | kfree(tu); | 272 | kfree(tu); |
137 | 273 | ||
138 | return ERR_PTR(-ENOMEM); | 274 | return ERR_PTR(-ENOMEM); |
@@ -142,12 +278,12 @@ static void free_trace_uprobe(struct trace_uprobe *tu) | |||
142 | { | 278 | { |
143 | int i; | 279 | int i; |
144 | 280 | ||
145 | for (i = 0; i < tu->nr_args; i++) | 281 | for (i = 0; i < tu->tp.nr_args; i++) |
146 | traceprobe_free_probe_arg(&tu->args[i]); | 282 | traceprobe_free_probe_arg(&tu->tp.args[i]); |
147 | 283 | ||
148 | iput(tu->inode); | 284 | iput(tu->inode); |
149 | kfree(tu->call.class->system); | 285 | kfree(tu->tp.call.class->system); |
150 | kfree(tu->call.name); | 286 | kfree(tu->tp.call.name); |
151 | kfree(tu->filename); | 287 | kfree(tu->filename); |
152 | kfree(tu); | 288 | kfree(tu); |
153 | } | 289 | } |
@@ -157,8 +293,8 @@ static struct trace_uprobe *find_probe_event(const char *event, const char *grou | |||
157 | struct trace_uprobe *tu; | 293 | struct trace_uprobe *tu; |
158 | 294 | ||
159 | list_for_each_entry(tu, &uprobe_list, list) | 295 | list_for_each_entry(tu, &uprobe_list, list) |
160 | if (strcmp(tu->call.name, event) == 0 && | 296 | if (strcmp(tu->tp.call.name, event) == 0 && |
161 | strcmp(tu->call.class->system, group) == 0) | 297 | strcmp(tu->tp.call.class->system, group) == 0) |
162 | return tu; | 298 | return tu; |
163 | 299 | ||
164 | return NULL; | 300 | return NULL; |
@@ -181,16 +317,16 @@ static int unregister_trace_uprobe(struct trace_uprobe *tu) | |||
181 | /* Register a trace_uprobe and probe_event */ | 317 | /* Register a trace_uprobe and probe_event */ |
182 | static int register_trace_uprobe(struct trace_uprobe *tu) | 318 | static int register_trace_uprobe(struct trace_uprobe *tu) |
183 | { | 319 | { |
184 | struct trace_uprobe *old_tp; | 320 | struct trace_uprobe *old_tu; |
185 | int ret; | 321 | int ret; |
186 | 322 | ||
187 | mutex_lock(&uprobe_lock); | 323 | mutex_lock(&uprobe_lock); |
188 | 324 | ||
189 | /* register as an event */ | 325 | /* register as an event */ |
190 | old_tp = find_probe_event(tu->call.name, tu->call.class->system); | 326 | old_tu = find_probe_event(tu->tp.call.name, tu->tp.call.class->system); |
191 | if (old_tp) { | 327 | if (old_tu) { |
192 | /* delete old event */ | 328 | /* delete old event */ |
193 | ret = unregister_trace_uprobe(old_tp); | 329 | ret = unregister_trace_uprobe(old_tu); |
194 | if (ret) | 330 | if (ret) |
195 | goto end; | 331 | goto end; |
196 | } | 332 | } |
@@ -211,7 +347,7 @@ end: | |||
211 | 347 | ||
212 | /* | 348 | /* |
213 | * Argument syntax: | 349 | * Argument syntax: |
214 | * - Add uprobe: p|r[:[GRP/]EVENT] PATH:SYMBOL [FETCHARGS] | 350 | * - Add uprobe: p|r[:[GRP/]EVENT] PATH:OFFSET [FETCHARGS] |
215 | * | 351 | * |
216 | * - Remove uprobe: -:[GRP/]EVENT | 352 | * - Remove uprobe: -:[GRP/]EVENT |
217 | */ | 353 | */ |
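A hedged usage sketch to go with the syntax above: the path, offset, group name and fetch argument are all illustrative, and the tracefs mount point may differ. It simply appends one probe definition to uprobe_events, equivalent to an echo from the shell.

    /* Hedged usage sketch: register a uprobe event through tracefs,
     * equivalent to
     *   echo 'p:mygrp/readline /bin/bash:0x8db60 arg1=%ax' > uprobe_events
     * (group, offset and argument are made up). */
    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
        const char *spec = "p:mygrp/readline /bin/bash:0x8db60 arg1=%ax\n";
        int fd = open("/sys/kernel/debug/tracing/uprobe_events",
                      O_WRONLY | O_APPEND);

        if (fd < 0) {
            perror("open uprobe_events");
            return 1;
        }
        if (write(fd, spec, strlen(spec)) != (ssize_t)strlen(spec))
            perror("write");
        close(fd);
        return 0;
    }

A probe added this way is removed again with the '-:mygrp/readline' form from the comment above.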
@@ -360,34 +496,36 @@ static int create_trace_uprobe(int argc, char **argv) | |||
360 | /* parse arguments */ | 496 | /* parse arguments */ |
361 | ret = 0; | 497 | ret = 0; |
362 | for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { | 498 | for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { |
499 | struct probe_arg *parg = &tu->tp.args[i]; | ||
500 | |||
363 | /* Increment count for freeing args in error case */ | 501 | /* Increment count for freeing args in error case */ |
364 | tu->nr_args++; | 502 | tu->tp.nr_args++; |
365 | 503 | ||
366 | /* Parse argument name */ | 504 | /* Parse argument name */ |
367 | arg = strchr(argv[i], '='); | 505 | arg = strchr(argv[i], '='); |
368 | if (arg) { | 506 | if (arg) { |
369 | *arg++ = '\0'; | 507 | *arg++ = '\0'; |
370 | tu->args[i].name = kstrdup(argv[i], GFP_KERNEL); | 508 | parg->name = kstrdup(argv[i], GFP_KERNEL); |
371 | } else { | 509 | } else { |
372 | arg = argv[i]; | 510 | arg = argv[i]; |
373 | /* If argument name is omitted, set "argN" */ | 511 | /* If argument name is omitted, set "argN" */ |
374 | snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1); | 512 | snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1); |
375 | tu->args[i].name = kstrdup(buf, GFP_KERNEL); | 513 | parg->name = kstrdup(buf, GFP_KERNEL); |
376 | } | 514 | } |
377 | 515 | ||
378 | if (!tu->args[i].name) { | 516 | if (!parg->name) { |
379 | pr_info("Failed to allocate argument[%d] name.\n", i); | 517 | pr_info("Failed to allocate argument[%d] name.\n", i); |
380 | ret = -ENOMEM; | 518 | ret = -ENOMEM; |
381 | goto error; | 519 | goto error; |
382 | } | 520 | } |
383 | 521 | ||
384 | if (!is_good_name(tu->args[i].name)) { | 522 | if (!is_good_name(parg->name)) { |
385 | pr_info("Invalid argument[%d] name: %s\n", i, tu->args[i].name); | 523 | pr_info("Invalid argument[%d] name: %s\n", i, parg->name); |
386 | ret = -EINVAL; | 524 | ret = -EINVAL; |
387 | goto error; | 525 | goto error; |
388 | } | 526 | } |
389 | 527 | ||
390 | if (traceprobe_conflict_field_name(tu->args[i].name, tu->args, i)) { | 528 | if (traceprobe_conflict_field_name(parg->name, tu->tp.args, i)) { |
391 | pr_info("Argument[%d] name '%s' conflicts with " | 529 | pr_info("Argument[%d] name '%s' conflicts with " |
392 | "another field.\n", i, argv[i]); | 530 | "another field.\n", i, argv[i]); |
393 | ret = -EINVAL; | 531 | ret = -EINVAL; |
@@ -395,7 +533,8 @@ static int create_trace_uprobe(int argc, char **argv) | |||
395 | } | 533 | } |
396 | 534 | ||
397 | /* Parse fetch argument */ | 535 | /* Parse fetch argument */ |
398 | ret = traceprobe_parse_probe_arg(arg, &tu->size, &tu->args[i], false, false); | 536 | ret = traceprobe_parse_probe_arg(arg, &tu->tp.size, parg, |
537 | is_return, false); | ||
399 | if (ret) { | 538 | if (ret) { |
400 | pr_info("Parse error at argument[%d]. (%d)\n", i, ret); | 539 | pr_info("Parse error at argument[%d]. (%d)\n", i, ret); |
401 | goto error; | 540 | goto error; |
@@ -459,11 +598,11 @@ static int probes_seq_show(struct seq_file *m, void *v) | |||
459 | char c = is_ret_probe(tu) ? 'r' : 'p'; | 598 | char c = is_ret_probe(tu) ? 'r' : 'p'; |
460 | int i; | 599 | int i; |
461 | 600 | ||
462 | seq_printf(m, "%c:%s/%s", c, tu->call.class->system, tu->call.name); | 601 | seq_printf(m, "%c:%s/%s", c, tu->tp.call.class->system, tu->tp.call.name); |
463 | seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset); | 602 | seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset); |
464 | 603 | ||
465 | for (i = 0; i < tu->nr_args; i++) | 604 | for (i = 0; i < tu->tp.nr_args; i++) |
466 | seq_printf(m, " %s=%s", tu->args[i].name, tu->args[i].comm); | 605 | seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm); |
467 | 606 | ||
468 | seq_printf(m, "\n"); | 607 | seq_printf(m, "\n"); |
469 | return 0; | 608 | return 0; |
@@ -509,7 +648,7 @@ static int probes_profile_seq_show(struct seq_file *m, void *v) | |||
509 | { | 648 | { |
510 | struct trace_uprobe *tu = v; | 649 | struct trace_uprobe *tu = v; |
511 | 650 | ||
512 | seq_printf(m, " %s %-44s %15lu\n", tu->filename, tu->call.name, tu->nhit); | 651 | seq_printf(m, " %s %-44s %15lu\n", tu->filename, tu->tp.call.name, tu->nhit); |
513 | return 0; | 652 | return 0; |
514 | } | 653 | } |
515 | 654 | ||
@@ -533,21 +672,117 @@ static const struct file_operations uprobe_profile_ops = { | |||
533 | .release = seq_release, | 672 | .release = seq_release, |
534 | }; | 673 | }; |
535 | 674 | ||
675 | struct uprobe_cpu_buffer { | ||
676 | struct mutex mutex; | ||
677 | void *buf; | ||
678 | }; | ||
679 | static struct uprobe_cpu_buffer __percpu *uprobe_cpu_buffer; | ||
680 | static int uprobe_buffer_refcnt; | ||
681 | |||
682 | static int uprobe_buffer_init(void) | ||
683 | { | ||
684 | int cpu, err_cpu; | ||
685 | |||
686 | uprobe_cpu_buffer = alloc_percpu(struct uprobe_cpu_buffer); | ||
687 | if (uprobe_cpu_buffer == NULL) | ||
688 | return -ENOMEM; | ||
689 | |||
690 | for_each_possible_cpu(cpu) { | ||
691 | struct page *p = alloc_pages_node(cpu_to_node(cpu), | ||
692 | GFP_KERNEL, 0); | ||
693 | if (p == NULL) { | ||
694 | err_cpu = cpu; | ||
695 | goto err; | ||
696 | } | ||
697 | per_cpu_ptr(uprobe_cpu_buffer, cpu)->buf = page_address(p); | ||
698 | mutex_init(&per_cpu_ptr(uprobe_cpu_buffer, cpu)->mutex); | ||
699 | } | ||
700 | |||
701 | return 0; | ||
702 | |||
703 | err: | ||
704 | for_each_possible_cpu(cpu) { | ||
705 | if (cpu == err_cpu) | ||
706 | break; | ||
707 | free_page((unsigned long)per_cpu_ptr(uprobe_cpu_buffer, cpu)->buf); | ||
708 | } | ||
709 | |||
710 | free_percpu(uprobe_cpu_buffer); | ||
711 | return -ENOMEM; | ||
712 | } | ||
713 | |||
714 | static int uprobe_buffer_enable(void) | ||
715 | { | ||
716 | int ret = 0; | ||
717 | |||
718 | BUG_ON(!mutex_is_locked(&event_mutex)); | ||
719 | |||
720 | if (uprobe_buffer_refcnt++ == 0) { | ||
721 | ret = uprobe_buffer_init(); | ||
722 | if (ret < 0) | ||
723 | uprobe_buffer_refcnt--; | ||
724 | } | ||
725 | |||
726 | return ret; | ||
727 | } | ||
728 | |||
729 | static void uprobe_buffer_disable(void) | ||
730 | { | ||
731 | BUG_ON(!mutex_is_locked(&event_mutex)); | ||
732 | |||
733 | if (--uprobe_buffer_refcnt == 0) { | ||
734 | free_percpu(uprobe_cpu_buffer); | ||
735 | uprobe_cpu_buffer = NULL; | ||
736 | } | ||
737 | } | ||
738 | |||
739 | static struct uprobe_cpu_buffer *uprobe_buffer_get(void) | ||
740 | { | ||
741 | struct uprobe_cpu_buffer *ucb; | ||
742 | int cpu; | ||
743 | |||
744 | cpu = raw_smp_processor_id(); | ||
745 | ucb = per_cpu_ptr(uprobe_cpu_buffer, cpu); | ||
746 | |||
747 | /* | ||
748 | * Use per-cpu buffers for fastest access, but we might migrate | ||
749 | * so the mutex makes sure we have sole access to it. | ||
750 | */ | ||
751 | mutex_lock(&ucb->mutex); | ||
752 | |||
753 | return ucb; | ||
754 | } | ||
755 | |||
756 | static void uprobe_buffer_put(struct uprobe_cpu_buffer *ucb) | ||
757 | { | ||
758 | mutex_unlock(&ucb->mutex); | ||
759 | } | ||
760 | |||
536 | static void uprobe_trace_print(struct trace_uprobe *tu, | 761 | static void uprobe_trace_print(struct trace_uprobe *tu, |
537 | unsigned long func, struct pt_regs *regs) | 762 | unsigned long func, struct pt_regs *regs) |
538 | { | 763 | { |
539 | struct uprobe_trace_entry_head *entry; | 764 | struct uprobe_trace_entry_head *entry; |
540 | struct ring_buffer_event *event; | 765 | struct ring_buffer_event *event; |
541 | struct ring_buffer *buffer; | 766 | struct ring_buffer *buffer; |
767 | struct uprobe_cpu_buffer *ucb; | ||
542 | void *data; | 768 | void *data; |
543 | int size, i; | 769 | int size, dsize, esize; |
544 | struct ftrace_event_call *call = &tu->call; | 770 | struct ftrace_event_call *call = &tu->tp.call; |
771 | |||
772 | dsize = __get_data_size(&tu->tp, regs); | ||
773 | esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); | ||
545 | 774 | ||
546 | size = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); | 775 | if (WARN_ON_ONCE(!uprobe_cpu_buffer || tu->tp.size + dsize > PAGE_SIZE)) |
776 | return; | ||
777 | |||
778 | ucb = uprobe_buffer_get(); | ||
779 | store_trace_args(esize, &tu->tp, regs, ucb->buf, dsize); | ||
780 | |||
781 | size = esize + tu->tp.size + dsize; | ||
547 | event = trace_current_buffer_lock_reserve(&buffer, call->event.type, | 782 | event = trace_current_buffer_lock_reserve(&buffer, call->event.type, |
548 | size + tu->size, 0, 0); | 783 | size, 0, 0); |
549 | if (!event) | 784 | if (!event) |
550 | return; | 785 | goto out; |
551 | 786 | ||
552 | entry = ring_buffer_event_data(event); | 787 | entry = ring_buffer_event_data(event); |
553 | if (is_ret_probe(tu)) { | 788 | if (is_ret_probe(tu)) { |
@@ -559,11 +794,13 @@ static void uprobe_trace_print(struct trace_uprobe *tu, | |||
559 | data = DATAOF_TRACE_ENTRY(entry, false); | 794 | data = DATAOF_TRACE_ENTRY(entry, false); |
560 | } | 795 | } |
561 | 796 | ||
562 | for (i = 0; i < tu->nr_args; i++) | 797 | memcpy(data, ucb->buf, tu->tp.size + dsize); |
563 | call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); | ||
564 | 798 | ||
565 | if (!call_filter_check_discard(call, entry, buffer, event)) | 799 | if (!call_filter_check_discard(call, entry, buffer, event)) |
566 | trace_buffer_unlock_commit(buffer, event, 0, 0); | 800 | trace_buffer_unlock_commit(buffer, event, 0, 0); |
801 | |||
802 | out: | ||
803 | uprobe_buffer_put(ucb); | ||
567 | } | 804 | } |
568 | 805 | ||
569 | /* uprobe handler */ | 806 | /* uprobe handler */ |
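Taken together, the hunks above change how a hit is recorded. Instead of calling call_fetch() once per argument directly into the reserved ring-buffer event, the handler now fetches everything in one pass into a per-CPU scratch page (uprobe_buffer_get() plus store_trace_args()), reserves a single event of esize + tu->tp.size + dsize bytes, and memcpy()s the blob in after the entry header. The per-CPU split is purely a locality optimisation: as the comment in uprobe_buffer_get() notes, the handler may migrate, so the mutex, not the CPU index, is what guarantees exclusive use, and uprobe_buffer_enable()/uprobe_buffer_disable() refcount the whole allocation under event_mutex so the pages only exist while some uprobe event is enabled. A rough userspace analogue of the "per-CPU buffer, but locked" part, with all names below invented:

    #define _GNU_SOURCE
    #include <pthread.h>
    #include <sched.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>

    struct cpu_buffer {
            pthread_mutex_t lock;
            char buf[4096];
    };

    static struct cpu_buffer *buffers;
    static long nbuffers;

    static int buffers_init(void)
    {
            nbuffers = sysconf(_SC_NPROCESSORS_CONF);
            if (nbuffers < 1)
                    nbuffers = 1;
            buffers = calloc(nbuffers, sizeof(*buffers));
            if (!buffers)
                    return -1;
            for (long i = 0; i < nbuffers; i++)
                    pthread_mutex_init(&buffers[i].lock, NULL);
            return 0;
    }

    /* Prefer the current CPU's buffer for cache locality, but take its lock:
     * the thread may migrate before buffer_put(), and another thread may be
     * scheduled onto this CPU in the meantime. */
    static struct cpu_buffer *buffer_get(void)
    {
            int cpu = sched_getcpu();
            struct cpu_buffer *b = &buffers[cpu < 0 ? 0 : cpu % nbuffers];

            pthread_mutex_lock(&b->lock);
            return b;
    }

    static void buffer_put(struct cpu_buffer *b)
    {
            pthread_mutex_unlock(&b->lock);
    }

    int main(void)
    {
            if (buffers_init())
                    return 1;

            struct cpu_buffer *b = buffer_get();
            snprintf(b->buf, sizeof(b->buf), "scratch filled on cpu %d\n",
                     sched_getcpu());
            fputs(b->buf, stdout);
            buffer_put(b);
            free(buffers);
            return 0;
    }

(Compile with -pthread.)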
@@ -591,23 +828,24 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e | |||
591 | int i; | 828 | int i; |
592 | 829 | ||
593 | entry = (struct uprobe_trace_entry_head *)iter->ent; | 830 | entry = (struct uprobe_trace_entry_head *)iter->ent; |
594 | tu = container_of(event, struct trace_uprobe, call.event); | 831 | tu = container_of(event, struct trace_uprobe, tp.call.event); |
595 | 832 | ||
596 | if (is_ret_probe(tu)) { | 833 | if (is_ret_probe(tu)) { |
597 | if (!trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)", tu->call.name, | 834 | if (!trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)", tu->tp.call.name, |
598 | entry->vaddr[1], entry->vaddr[0])) | 835 | entry->vaddr[1], entry->vaddr[0])) |
599 | goto partial; | 836 | goto partial; |
600 | data = DATAOF_TRACE_ENTRY(entry, true); | 837 | data = DATAOF_TRACE_ENTRY(entry, true); |
601 | } else { | 838 | } else { |
602 | if (!trace_seq_printf(s, "%s: (0x%lx)", tu->call.name, | 839 | if (!trace_seq_printf(s, "%s: (0x%lx)", tu->tp.call.name, |
603 | entry->vaddr[0])) | 840 | entry->vaddr[0])) |
604 | goto partial; | 841 | goto partial; |
605 | data = DATAOF_TRACE_ENTRY(entry, false); | 842 | data = DATAOF_TRACE_ENTRY(entry, false); |
606 | } | 843 | } |
607 | 844 | ||
608 | for (i = 0; i < tu->nr_args; i++) { | 845 | for (i = 0; i < tu->tp.nr_args; i++) { |
609 | if (!tu->args[i].type->print(s, tu->args[i].name, | 846 | struct probe_arg *parg = &tu->tp.args[i]; |
610 | data + tu->args[i].offset, entry)) | 847 | |
848 | if (!parg->type->print(s, parg->name, data + parg->offset, entry)) | ||
611 | goto partial; | 849 | goto partial; |
612 | } | 850 | } |
613 | 851 | ||
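For orientation, those trace_seq_printf() formats are what a hit ends up looking like in the trace file: the event name, then the probed address (for a return probe, "return address <- function address"), followed by one name=value pair per argument printed via parg->type->print(). A printf mock-up with made-up values; the exact per-value formatting comes from the fetch type and may differ:

    #include <stdio.h>

    int main(void)
    {
            /* entry probe: "name: (address)" plus the fetched arguments */
            printf("%s: (0x%lx)", "p_bash_0x21000", 0x7f0123450000UL);
            printf(" %s=0x%x\n", "arg1", 3);

            /* return probe: "name: (return address <- function address)" */
            printf("%s: (0x%lx <- 0x%lx)\n", "r_bash_0x21000",
                   0x7f0123456789UL, 0x7f0123450000UL);
            return 0;
    }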
@@ -618,11 +856,6 @@ partial: | |||
618 | return TRACE_TYPE_PARTIAL_LINE; | 856 | return TRACE_TYPE_PARTIAL_LINE; |
619 | } | 857 | } |
620 | 858 | ||
621 | static inline bool is_trace_uprobe_enabled(struct trace_uprobe *tu) | ||
622 | { | ||
623 | return tu->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE); | ||
624 | } | ||
625 | |||
626 | typedef bool (*filter_func_t)(struct uprobe_consumer *self, | 859 | typedef bool (*filter_func_t)(struct uprobe_consumer *self, |
627 | enum uprobe_filter_ctx ctx, | 860 | enum uprobe_filter_ctx ctx, |
628 | struct mm_struct *mm); | 861 | struct mm_struct *mm); |
@@ -632,29 +865,35 @@ probe_event_enable(struct trace_uprobe *tu, int flag, filter_func_t filter) | |||
632 | { | 865 | { |
633 | int ret = 0; | 866 | int ret = 0; |
634 | 867 | ||
635 | if (is_trace_uprobe_enabled(tu)) | 868 | if (trace_probe_is_enabled(&tu->tp)) |
636 | return -EINTR; | 869 | return -EINTR; |
637 | 870 | ||
871 | ret = uprobe_buffer_enable(); | ||
872 | if (ret < 0) | ||
873 | return ret; | ||
874 | |||
638 | WARN_ON(!uprobe_filter_is_empty(&tu->filter)); | 875 | WARN_ON(!uprobe_filter_is_empty(&tu->filter)); |
639 | 876 | ||
640 | tu->flags |= flag; | 877 | tu->tp.flags |= flag; |
641 | tu->consumer.filter = filter; | 878 | tu->consumer.filter = filter; |
642 | ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); | 879 | ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); |
643 | if (ret) | 880 | if (ret) |
644 | tu->flags &= ~flag; | 881 | tu->tp.flags &= ~flag; |
645 | 882 | ||
646 | return ret; | 883 | return ret; |
647 | } | 884 | } |
648 | 885 | ||
649 | static void probe_event_disable(struct trace_uprobe *tu, int flag) | 886 | static void probe_event_disable(struct trace_uprobe *tu, int flag) |
650 | { | 887 | { |
651 | if (!is_trace_uprobe_enabled(tu)) | 888 | if (!trace_probe_is_enabled(&tu->tp)) |
652 | return; | 889 | return; |
653 | 890 | ||
654 | WARN_ON(!uprobe_filter_is_empty(&tu->filter)); | 891 | WARN_ON(!uprobe_filter_is_empty(&tu->filter)); |
655 | 892 | ||
656 | uprobe_unregister(tu->inode, tu->offset, &tu->consumer); | 893 | uprobe_unregister(tu->inode, tu->offset, &tu->consumer); |
657 | tu->flags &= ~flag; | 894 | tu->tp.flags &= ~flag; |
895 | |||
896 | uprobe_buffer_disable(); | ||
658 | } | 897 | } |
659 | 898 | ||
660 | static int uprobe_event_define_fields(struct ftrace_event_call *event_call) | 899 | static int uprobe_event_define_fields(struct ftrace_event_call *event_call) |
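probe_event_enable() now pairs the flag/registration work with uprobe_buffer_enable(), and probe_event_disable() drops that reference again after unregistering, so the per-CPU pages are allocated by the first event that gets enabled and freed by the last one to go away. Both paths run under event_mutex (hence the BUG_ON checks in the buffer helpers), which is what makes the plain counter safe, and a failed init is rolled back so a later enable can retry. The first-user-initialises / last-user-frees idiom, reduced to a runnable toy with invented names:

    #include <stdio.h>

    static int refcnt;

    static int resource_init(void)  { puts("allocate shared buffers"); return 0; }
    static void resource_free(void) { puts("free shared buffers"); }

    /* Callers are assumed to hold one common mutex (event_mutex in the kernel
     * code), so the bare counter needs no further protection. */
    static int resource_enable(void)
    {
            int ret = 0;

            if (refcnt++ == 0) {
                    ret = resource_init();  /* only the first user allocates */
                    if (ret < 0)
                            refcnt--;       /* roll back so a retry can work */
            }
            return ret;
    }

    static void resource_disable(void)
    {
            if (--refcnt == 0)
                    resource_free();        /* only the last user frees */
    }

    int main(void)
    {
            resource_enable();              /* allocates */
            resource_enable();              /* only bumps the count */
            resource_disable();
            resource_disable();             /* frees */
            return 0;
    }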
@@ -672,12 +911,12 @@ static int uprobe_event_define_fields(struct ftrace_event_call *event_call) | |||
672 | size = SIZEOF_TRACE_ENTRY(false); | 911 | size = SIZEOF_TRACE_ENTRY(false); |
673 | } | 912 | } |
674 | /* Set argument names as fields */ | 913 | /* Set argument names as fields */ |
675 | for (i = 0; i < tu->nr_args; i++) { | 914 | for (i = 0; i < tu->tp.nr_args; i++) { |
676 | ret = trace_define_field(event_call, tu->args[i].type->fmttype, | 915 | struct probe_arg *parg = &tu->tp.args[i]; |
677 | tu->args[i].name, | 916 | |
678 | size + tu->args[i].offset, | 917 | ret = trace_define_field(event_call, parg->type->fmttype, |
679 | tu->args[i].type->size, | 918 | parg->name, size + parg->offset, |
680 | tu->args[i].type->is_signed, | 919 | parg->type->size, parg->type->is_signed, |
681 | FILTER_OTHER); | 920 | FILTER_OTHER); |
682 | 921 | ||
683 | if (ret) | 922 | if (ret) |
@@ -686,59 +925,6 @@ static int uprobe_event_define_fields(struct ftrace_event_call *event_call) | |||
686 | return 0; | 925 | return 0; |
687 | } | 926 | } |
688 | 927 | ||
689 | #define LEN_OR_ZERO (len ? len - pos : 0) | ||
690 | static int __set_print_fmt(struct trace_uprobe *tu, char *buf, int len) | ||
691 | { | ||
692 | const char *fmt, *arg; | ||
693 | int i; | ||
694 | int pos = 0; | ||
695 | |||
696 | if (is_ret_probe(tu)) { | ||
697 | fmt = "(%lx <- %lx)"; | ||
698 | arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP; | ||
699 | } else { | ||
700 | fmt = "(%lx)"; | ||
701 | arg = "REC->" FIELD_STRING_IP; | ||
702 | } | ||
703 | |||
704 | /* When len=0, we just calculate the needed length */ | ||
705 | |||
706 | pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt); | ||
707 | |||
708 | for (i = 0; i < tu->nr_args; i++) { | ||
709 | pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s", | ||
710 | tu->args[i].name, tu->args[i].type->fmt); | ||
711 | } | ||
712 | |||
713 | pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); | ||
714 | |||
715 | for (i = 0; i < tu->nr_args; i++) { | ||
716 | pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s", | ||
717 | tu->args[i].name); | ||
718 | } | ||
719 | |||
720 | return pos; /* return the length of print_fmt */ | ||
721 | } | ||
722 | #undef LEN_OR_ZERO | ||
723 | |||
724 | static int set_print_fmt(struct trace_uprobe *tu) | ||
725 | { | ||
726 | char *print_fmt; | ||
727 | int len; | ||
728 | |||
729 | /* First: called with 0 length to calculate the needed length */ | ||
730 | len = __set_print_fmt(tu, NULL, 0); | ||
731 | print_fmt = kmalloc(len + 1, GFP_KERNEL); | ||
732 | if (!print_fmt) | ||
733 | return -ENOMEM; | ||
734 | |||
735 | /* Second: actually write the @print_fmt */ | ||
736 | __set_print_fmt(tu, print_fmt, len + 1); | ||
737 | tu->call.print_fmt = print_fmt; | ||
738 | |||
739 | return 0; | ||
740 | } | ||
741 | |||
742 | #ifdef CONFIG_PERF_EVENTS | 928 | #ifdef CONFIG_PERF_EVENTS |
743 | static bool | 929 | static bool |
744 | __uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm) | 930 | __uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm) |
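The deleted __set_print_fmt()/set_print_fmt() pair is not lost: as the register_uprobe_event() hunk further down shows, the uprobe code now calls set_print_fmt(&tu->tp, is_ret_probe(tu)) from the shared probe code, so kprobe and uprobe events build their print_fmt the same way. The technique itself is the common two-pass snprintf() idiom: run the formatting once with a zero length purely to measure, allocate len + 1 bytes, then run it again to fill. A self-contained illustration (helper names here are made up; unlike the kernel version it avoids offsetting a NULL buffer in the measuring pass):

    #include <stdio.h>
    #include <stdlib.h>

    #define LEN_OR_ZERO (len ? len - pos : 0)

    /* With len == 0 nothing is written and pos only accumulates the length
     * the final string will need; the second pass then really formats. */
    static int build_print_fmt(char *buf, int len, const char *names[], int n)
    {
            int pos = 0;

            pos += snprintf(buf ? buf + pos : NULL, LEN_OR_ZERO, "\"(%%lx)");
            for (int i = 0; i < n; i++)
                    pos += snprintf(buf ? buf + pos : NULL, LEN_OR_ZERO,
                                    " %s=%%lx", names[i]);
            pos += snprintf(buf ? buf + pos : NULL, LEN_OR_ZERO, "\", REC->ip");
            for (int i = 0; i < n; i++)
                    pos += snprintf(buf ? buf + pos : NULL, LEN_OR_ZERO,
                                    ", REC->%s", names[i]);
            return pos;             /* length excluding the trailing NUL */
    }

    int main(void)
    {
            const char *names[] = { "fd", "count" };
            int len = build_print_fmt(NULL, 0, names, 2);   /* pass 1: measure */
            char *fmt = malloc(len + 1);

            if (!fmt)
                    return 1;
            build_print_fmt(fmt, len + 1, names, 2);        /* pass 2: fill */
            puts(fmt);  /* "(%lx) fd=%lx count=%lx", REC->ip, REC->fd, REC->count */
            free(fmt);
            return 0;
    }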
@@ -831,14 +1017,27 @@ static bool uprobe_perf_filter(struct uprobe_consumer *uc, | |||
831 | static void uprobe_perf_print(struct trace_uprobe *tu, | 1017 | static void uprobe_perf_print(struct trace_uprobe *tu, |
832 | unsigned long func, struct pt_regs *regs) | 1018 | unsigned long func, struct pt_regs *regs) |
833 | { | 1019 | { |
834 | struct ftrace_event_call *call = &tu->call; | 1020 | struct ftrace_event_call *call = &tu->tp.call; |
835 | struct uprobe_trace_entry_head *entry; | 1021 | struct uprobe_trace_entry_head *entry; |
836 | struct hlist_head *head; | 1022 | struct hlist_head *head; |
1023 | struct uprobe_cpu_buffer *ucb; | ||
837 | void *data; | 1024 | void *data; |
838 | int size, rctx, i; | 1025 | int size, dsize, esize; |
1026 | int rctx; | ||
839 | 1027 | ||
840 | size = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); | 1028 | dsize = __get_data_size(&tu->tp, regs); |
841 | size = ALIGN(size + tu->size + sizeof(u32), sizeof(u64)) - sizeof(u32); | 1029 | esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); |
1030 | |||
1031 | if (WARN_ON_ONCE(!uprobe_cpu_buffer)) | ||
1032 | return; | ||
1033 | |||
1034 | size = esize + tu->tp.size + dsize; | ||
1035 | size = ALIGN(size + sizeof(u32), sizeof(u64)) - sizeof(u32); | ||
1036 | if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) | ||
1037 | return; | ||
1038 | |||
1039 | ucb = uprobe_buffer_get(); | ||
1040 | store_trace_args(esize, &tu->tp, regs, ucb->buf, dsize); | ||
842 | 1041 | ||
843 | preempt_disable(); | 1042 | preempt_disable(); |
844 | head = this_cpu_ptr(call->perf_events); | 1043 | head = this_cpu_ptr(call->perf_events); |
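The size arithmetic here follows the usual perf-trace convention: a raw sample sits behind a u32 size field and the whole record has to stay u64-aligned in the perf buffer, so the payload (entry header + fixed args + dynamic data) is padded until payload plus sizeof(u32) is a multiple of 8, and anything beyond PERF_MAX_TRACE_SIZE is refused with a one-time warning. The hunk that follows then zeroes the gap between the bytes actually produced (tu->tp.size + dsize) and this padded size, so the alignment padding never carries stale data out to userspace. Worked through with made-up sizes:

    #include <stdint.h>
    #include <stdio.h>

    /* Kernel-style ALIGN(): round x up to the power-of-two boundary a. */
    #define ALIGN(x, a)     (((x) + (a) - 1) & ~((a) - 1))

    int main(void)
    {
            /* e.g. 16-byte entry header + 8 bytes of fixed args + 3 bytes of
             * dynamic string data = 27 bytes of payload (illustrative numbers) */
            unsigned long payload = 16 + 8 + 3;
            unsigned long size = ALIGN(payload + sizeof(uint32_t), sizeof(uint64_t))
                                 - sizeof(uint32_t);

            printf("payload %lu -> record %lu; record + 4-byte header = %lu "
                   "(a multiple of 8)\n", payload, size, size + 4);
            return 0;
    }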
@@ -858,12 +1057,18 @@ static void uprobe_perf_print(struct trace_uprobe *tu, | |||
858 | data = DATAOF_TRACE_ENTRY(entry, false); | 1057 | data = DATAOF_TRACE_ENTRY(entry, false); |
859 | } | 1058 | } |
860 | 1059 | ||
861 | for (i = 0; i < tu->nr_args; i++) | 1060 | memcpy(data, ucb->buf, tu->tp.size + dsize); |
862 | call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); | 1061 | |
1062 | if (size - esize > tu->tp.size + dsize) { | ||
1063 | int len = tu->tp.size + dsize; | ||
1064 | |||
1065 | memset(data + len, 0, size - esize - len); | ||
1066 | } | ||
863 | 1067 | ||
864 | perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); | 1068 | perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); |
865 | out: | 1069 | out: |
866 | preempt_enable(); | 1070 | preempt_enable(); |
1071 | uprobe_buffer_put(ucb); | ||
867 | } | 1072 | } |
868 | 1073 | ||
869 | /* uprobe profile handler */ | 1074 | /* uprobe profile handler */ |
@@ -921,16 +1126,22 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, | |||
921 | static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) | 1126 | static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) |
922 | { | 1127 | { |
923 | struct trace_uprobe *tu; | 1128 | struct trace_uprobe *tu; |
1129 | struct uprobe_dispatch_data udd; | ||
924 | int ret = 0; | 1130 | int ret = 0; |
925 | 1131 | ||
926 | tu = container_of(con, struct trace_uprobe, consumer); | 1132 | tu = container_of(con, struct trace_uprobe, consumer); |
927 | tu->nhit++; | 1133 | tu->nhit++; |
928 | 1134 | ||
929 | if (tu->flags & TP_FLAG_TRACE) | 1135 | udd.tu = tu; |
1136 | udd.bp_addr = instruction_pointer(regs); | ||
1137 | |||
1138 | current->utask->vaddr = (unsigned long) &udd; | ||
1139 | |||
1140 | if (tu->tp.flags & TP_FLAG_TRACE) | ||
930 | ret |= uprobe_trace_func(tu, regs); | 1141 | ret |= uprobe_trace_func(tu, regs); |
931 | 1142 | ||
932 | #ifdef CONFIG_PERF_EVENTS | 1143 | #ifdef CONFIG_PERF_EVENTS |
933 | if (tu->flags & TP_FLAG_PROFILE) | 1144 | if (tu->tp.flags & TP_FLAG_PROFILE) |
934 | ret |= uprobe_perf_func(tu, regs); | 1145 | ret |= uprobe_perf_func(tu, regs); |
935 | #endif | 1146 | #endif |
936 | return ret; | 1147 | return ret; |
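Both dispatchers now describe the hit in a small on-stack struct uprobe_dispatch_data (the trace_uprobe plus the breakpoint or return address) and publish its address through current->utask->vaddr for the duration of the call, so that code invoked further down the handler/fetch path in the same task can recover that per-hit context without every function signature having to carry it; the data is only valid while the dispatcher's stack frame is live. The same trick expressed in userspace terms with a thread-local slot (all names invented):

    #include <stdio.h>

    struct dispatch_data {
            const char *event;
            unsigned long bp_addr;
    };

    /* One slot per thread, analogous to a field in the per-task utask. */
    static _Thread_local const struct dispatch_data *cur_hit;

    static void deep_fetch_helper(void)
    {
            /* Called several layers down, with no context argument. */
            printf("fetching for %s at 0x%lx\n", cur_hit->event, cur_hit->bp_addr);
    }

    static void handler(void)
    {
            deep_fetch_helper();
    }

    static void dispatcher(const char *event, unsigned long bp_addr)
    {
            struct dispatch_data udd = { .event = event, .bp_addr = bp_addr };

            cur_hit = &udd;         /* valid only for the duration of this call */
            handler();
            cur_hit = NULL;
    }

    int main(void)
    {
            dispatcher("p_bash_0x21000", 0x400500);
            return 0;
    }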
@@ -940,14 +1151,20 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con, | |||
940 | unsigned long func, struct pt_regs *regs) | 1151 | unsigned long func, struct pt_regs *regs) |
941 | { | 1152 | { |
942 | struct trace_uprobe *tu; | 1153 | struct trace_uprobe *tu; |
1154 | struct uprobe_dispatch_data udd; | ||
943 | 1155 | ||
944 | tu = container_of(con, struct trace_uprobe, consumer); | 1156 | tu = container_of(con, struct trace_uprobe, consumer); |
945 | 1157 | ||
946 | if (tu->flags & TP_FLAG_TRACE) | 1158 | udd.tu = tu; |
1159 | udd.bp_addr = func; | ||
1160 | |||
1161 | current->utask->vaddr = (unsigned long) &udd; | ||
1162 | |||
1163 | if (tu->tp.flags & TP_FLAG_TRACE) | ||
947 | uretprobe_trace_func(tu, func, regs); | 1164 | uretprobe_trace_func(tu, func, regs); |
948 | 1165 | ||
949 | #ifdef CONFIG_PERF_EVENTS | 1166 | #ifdef CONFIG_PERF_EVENTS |
950 | if (tu->flags & TP_FLAG_PROFILE) | 1167 | if (tu->tp.flags & TP_FLAG_PROFILE) |
951 | uretprobe_perf_func(tu, func, regs); | 1168 | uretprobe_perf_func(tu, func, regs); |
952 | #endif | 1169 | #endif |
953 | return 0; | 1170 | return 0; |
@@ -959,7 +1176,7 @@ static struct trace_event_functions uprobe_funcs = { | |||
959 | 1176 | ||
960 | static int register_uprobe_event(struct trace_uprobe *tu) | 1177 | static int register_uprobe_event(struct trace_uprobe *tu) |
961 | { | 1178 | { |
962 | struct ftrace_event_call *call = &tu->call; | 1179 | struct ftrace_event_call *call = &tu->tp.call; |
963 | int ret; | 1180 | int ret; |
964 | 1181 | ||
965 | /* Initialize ftrace_event_call */ | 1182 | /* Initialize ftrace_event_call */ |
@@ -967,7 +1184,7 @@ static int register_uprobe_event(struct trace_uprobe *tu) | |||
967 | call->event.funcs = &uprobe_funcs; | 1184 | call->event.funcs = &uprobe_funcs; |
968 | call->class->define_fields = uprobe_event_define_fields; | 1185 | call->class->define_fields = uprobe_event_define_fields; |
969 | 1186 | ||
970 | if (set_print_fmt(tu) < 0) | 1187 | if (set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0) |
971 | return -ENOMEM; | 1188 | return -ENOMEM; |
972 | 1189 | ||
973 | ret = register_ftrace_event(&call->event); | 1190 | ret = register_ftrace_event(&call->event); |
@@ -994,11 +1211,11 @@ static int unregister_uprobe_event(struct trace_uprobe *tu) | |||
994 | int ret; | 1211 | int ret; |
995 | 1212 | ||
996 | /* tu->event is unregistered in trace_remove_event_call() */ | 1213 | /* tu->event is unregistered in trace_remove_event_call() */ |
997 | ret = trace_remove_event_call(&tu->call); | 1214 | ret = trace_remove_event_call(&tu->tp.call); |
998 | if (ret) | 1215 | if (ret) |
999 | return ret; | 1216 | return ret; |
1000 | kfree(tu->call.print_fmt); | 1217 | kfree(tu->tp.call.print_fmt); |
1001 | tu->call.print_fmt = NULL; | 1218 | tu->tp.call.print_fmt = NULL; |
1002 | return 0; | 1219 | return 0; |
1003 | } | 1220 | } |
1004 | 1221 | ||
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 29f26540e9c9..031cc5655a51 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c | |||
@@ -631,6 +631,11 @@ void tracepoint_iter_reset(struct tracepoint_iter *iter) | |||
631 | EXPORT_SYMBOL_GPL(tracepoint_iter_reset); | 631 | EXPORT_SYMBOL_GPL(tracepoint_iter_reset); |
632 | 632 | ||
633 | #ifdef CONFIG_MODULES | 633 | #ifdef CONFIG_MODULES |
634 | bool trace_module_has_bad_taint(struct module *mod) | ||
635 | { | ||
636 | return mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP)); | ||
637 | } | ||
638 | |||
634 | static int tracepoint_module_coming(struct module *mod) | 639 | static int tracepoint_module_coming(struct module *mod) |
635 | { | 640 | { |
636 | struct tp_module *tp_mod, *iter; | 641 | struct tp_module *tp_mod, *iter; |
@@ -641,7 +646,7 @@ static int tracepoint_module_coming(struct module *mod) | |||
641 | * module headers (for forced load), to make sure we don't cause a crash. | 646 | * module headers (for forced load), to make sure we don't cause a crash. |
642 | * Staging and out-of-tree GPL modules are fine. | 647 | * Staging and out-of-tree GPL modules are fine. |
643 | */ | 648 | */ |
644 | if (mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP))) | 649 | if (trace_module_has_bad_taint(mod)) |
645 | return 0; | 650 | return 0; |
646 | mutex_lock(&tracepoints_mutex); | 651 | mutex_lock(&tracepoints_mutex); |
647 | tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL); | 652 | tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL); |
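trace_module_has_bad_taint() just gives the existing test a name so other tracing code can share it: mask the two tolerated taint bits (out-of-tree and staging) out of mod->taints, and anything left over, a forced load for example, disqualifies the module. The bit arithmetic on its own, with taint bit numbers written out for illustration:

    #include <stdio.h>

    /* Taint bit numbers as in the kernel's TAINT_* list of the time; shown
     * here only for illustration - the logic cares just about which two bits
     * get masked out. */
    #define TAINT_FORCED_MODULE     1
    #define TAINT_CRAP              10
    #define TAINT_OOT_MODULE        12

    static int has_bad_taint(unsigned long taints)
    {
            /* non-zero iff any bit other than OOT_MODULE or CRAP is set */
            return (taints & ~((1UL << TAINT_OOT_MODULE) |
                               (1UL << TAINT_CRAP))) != 0;
    }

    int main(void)
    {
            printf("%d\n", has_bad_taint(1UL << TAINT_OOT_MODULE));      /* 0 */
            printf("%d\n", has_bad_taint((1UL << TAINT_CRAP) |
                                         (1UL << TAINT_FORCED_MODULE))); /* 1 */
            return 0;
    }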
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 240fb62cf394..dd06439b9c84 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c | |||
@@ -225,7 +225,7 @@ static u32 map_id_up(struct uid_gid_map *map, u32 id) | |||
225 | * | 225 | * |
226 | * When there is no mapping defined for the user-namespace uid | 226 | * When there is no mapping defined for the user-namespace uid |
227 | * pair INVALID_UID is returned. Callers are expected to test | 227 | * pair INVALID_UID is returned. Callers are expected to test |
228 | * for and handle handle INVALID_UID being returned. INVALID_UID | 228 | * for and handle INVALID_UID being returned. INVALID_UID |
229 | * may be tested for using uid_valid(). | 229 | * may be tested for using uid_valid(). |
230 | */ | 230 | */ |
231 | kuid_t make_kuid(struct user_namespace *ns, uid_t uid) | 231 | kuid_t make_kuid(struct user_namespace *ns, uid_t uid) |
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index b010eac595d2..193e977a10ea 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -1851,6 +1851,12 @@ static void destroy_worker(struct worker *worker) | |||
1851 | if (worker->flags & WORKER_IDLE) | 1851 | if (worker->flags & WORKER_IDLE) |
1852 | pool->nr_idle--; | 1852 | pool->nr_idle--; |
1853 | 1853 | ||
1854 | /* | ||
1855 | * Once WORKER_DIE is set, the kworker may destroy itself at any | ||
1856 | * point. Pin to ensure the task stays until we're done with it. | ||
1857 | */ | ||
1858 | get_task_struct(worker->task); | ||
1859 | |||
1854 | list_del_init(&worker->entry); | 1860 | list_del_init(&worker->entry); |
1855 | worker->flags |= WORKER_DIE; | 1861 | worker->flags |= WORKER_DIE; |
1856 | 1862 | ||
@@ -1859,6 +1865,7 @@ static void destroy_worker(struct worker *worker) | |||
1859 | spin_unlock_irq(&pool->lock); | 1865 | spin_unlock_irq(&pool->lock); |
1860 | 1866 | ||
1861 | kthread_stop(worker->task); | 1867 | kthread_stop(worker->task); |
1868 | put_task_struct(worker->task); | ||
1862 | kfree(worker); | 1869 | kfree(worker); |
1863 | 1870 | ||
1864 | spin_lock_irq(&pool->lock); | 1871 | spin_lock_irq(&pool->lock); |
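The workqueue change is a lifetime fix: as the new comment says, once WORKER_DIE is set the kworker may destroy itself at any point, so destroy_worker() takes a reference on worker->task with get_task_struct() before dropping the pool lock, and only puts it after kthread_stop() has returned, keeping the task_struct valid for exactly the window in which it is still used. The ordering, boiled down to a toy reference count (single-threaded and with invented types; the kernel uses the task_struct usage counter):

    #include <stdio.h>
    #include <stdlib.h>

    struct obj {
            int ref;
            int id;
    };

    static void obj_get(struct obj *o)
    {
            o->ref++;
    }

    static void obj_put(struct obj *o)
    {
            if (--o->ref == 0) {
                    printf("freeing obj %d\n", o->id);
                    free(o);
            }
    }

    int main(void)
    {
            struct obj *o = malloc(sizeof(*o));

            if (!o)
                    return 1;
            o->ref = 1;     /* the "worker's" own reference */
            o->id = 7;

            obj_get(o);     /* pin before letting the other side vanish      */
            obj_put(o);     /* the worker exits and drops its own reference  */
            printf("obj %d still valid here\n", o->id);     /* safe: pinned  */
            obj_put(o);     /* drop the pin once done; this one frees        */
            return 0;
    }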
@@ -4789,6 +4796,7 @@ static int workqueue_cpu_down_callback(struct notifier_block *nfb, | |||
4789 | 4796 | ||
4790 | /* wait for per-cpu unbinding to finish */ | 4797 | /* wait for per-cpu unbinding to finish */ |
4791 | flush_work(&unbind_work); | 4798 | flush_work(&unbind_work); |
4799 | destroy_work_on_stack(&unbind_work); | ||
4792 | break; | 4800 | break; |
4793 | } | 4801 | } |
4794 | return NOTIFY_OK; | 4802 | return NOTIFY_OK; |
@@ -4828,6 +4836,7 @@ long work_on_cpu(int cpu, long (*fn)(void *), void *arg) | |||
4828 | INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn); | 4836 | INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn); |
4829 | schedule_work_on(cpu, &wfc.work); | 4837 | schedule_work_on(cpu, &wfc.work); |
4830 | flush_work(&wfc.work); | 4838 | flush_work(&wfc.work); |
4839 | destroy_work_on_stack(&wfc.work); | ||
4831 | return wfc.ret; | 4840 | return wfc.ret; |
4832 | } | 4841 | } |
4833 | EXPORT_SYMBOL_GPL(work_on_cpu); | 4842 | EXPORT_SYMBOL_GPL(work_on_cpu); |
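These two hunks close the same gap: work items set up with INIT_WORK_ONSTACK() live in the caller's stack frame and, with CONFIG_DEBUG_OBJECTS_WORK enabled, are tracked by debugobjects, so destroy_work_on_stack() has to be called after the final flush_work() and before the frame goes away (without that option the call compiles to nothing). The intended lifecycle, as a kernel-style sketch rather than standalone code:

    /* Inside the tree this only needs <linux/workqueue.h>. */
    static void my_work_fn(struct work_struct *work)
    {
            /* runs in workqueue context */
    }

    static void run_onstack_work(void)
    {
            struct work_struct work;

            INIT_WORK_ONSTACK(&work, my_work_fn);
            schedule_work(&work);
            flush_work(&work);              /* wait for my_work_fn() to finish */
            destroy_work_on_stack(&work);   /* retire the debugobjects tracking
                                             * while the object still exists  */
    }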