Diffstat (limited to 'kernel')
-rw-r--r--  kernel/audit.c | 380
-rw-r--r--  kernel/audit.h | 15
-rw-r--r--  kernel/audit_tree.c | 20
-rw-r--r--  kernel/audit_watch.c | 24
-rw-r--r--  kernel/auditfilter.c | 97
-rw-r--r--  kernel/auditsc.c | 46
-rw-r--r--  kernel/capability.c | 2
-rw-r--r--  kernel/cgroup.c | 1259
-rw-r--r--  kernel/cgroup_freezer.c | 7
-rw-r--r--  kernel/context_tracking.c | 8
-rw-r--r--  kernel/cpu/idle.c | 17
-rw-r--r--  kernel/cpuset.c | 81
-rw-r--r--  kernel/debug/debug_core.c | 5
-rw-r--r--  kernel/debug/debug_core.h | 2
-rw-r--r--  kernel/events/core.c | 43
-rw-r--r--  kernel/events/ring_buffer.c | 42
-rw-r--r--  kernel/events/uprobes.c | 64
-rw-r--r--  kernel/exit.c | 1
-rw-r--r--  kernel/fork.c | 31
-rw-r--r--  kernel/futex.c | 236
-rw-r--r--  kernel/hrtimer.c | 3
-rw-r--r--  kernel/hung_task.c | 6
-rw-r--r--  kernel/irq/Kconfig | 1
-rw-r--r--  kernel/irq/devres.c | 45
-rw-r--r--  kernel/irq/irqdesc.c | 1
-rw-r--r--  kernel/irq/irqdomain.c | 1
-rw-r--r--  kernel/irq/manage.c | 3
-rw-r--r--  kernel/kexec.c | 5
-rw-r--r--  kernel/kmod.c | 2
-rw-r--r--  kernel/ksysfs.c | 2
-rw-r--r--  kernel/locking/lockdep.c | 4
-rw-r--r--  kernel/locking/mutex-debug.c | 7
-rw-r--r--  kernel/locking/rtmutex-debug.c | 8
-rw-r--r--  kernel/locking/rtmutex.c | 166
-rw-r--r--  kernel/locking/rtmutex_common.h | 23
-rw-r--r--  kernel/module.c | 6
-rw-r--r--  kernel/padata.c | 2
-rw-r--r--  kernel/panic.c | 2
-rw-r--r--  kernel/params.c | 25
-rw-r--r--  kernel/posix-cpu-timers.c | 327
-rw-r--r--  kernel/power/block_io.c | 2
-rw-r--r--  kernel/power/console.c | 1
-rw-r--r--  kernel/power/hibernate.c | 7
-rw-r--r--  kernel/power/snapshot.c | 2
-rw-r--r--  kernel/printk/printk.c | 21
-rw-r--r--  kernel/profile.c | 4
-rw-r--r--  kernel/rcu/rcu.h | 5
-rw-r--r--  kernel/rcu/srcu.c | 57
-rw-r--r--  kernel/rcu/torture.c | 75
-rw-r--r--  kernel/rcu/tree.c | 97
-rw-r--r--  kernel/rcu/tree.h | 12
-rw-r--r--  kernel/rcu/tree_plugin.h | 102
-rw-r--r--  kernel/rcu/tree_trace.c | 3
-rw-r--r--  kernel/rcu/update.c | 16
-rw-r--r--  kernel/sched/Makefile | 5
-rw-r--r--  kernel/sched/clock.c | 107
-rw-r--r--  kernel/sched/core.c | 880
-rw-r--r--  kernel/sched/cpuacct.c | 18
-rw-r--r--  kernel/sched/cpudeadline.c | 216
-rw-r--r--  kernel/sched/cpudeadline.h | 33
-rw-r--r--  kernel/sched/deadline.c | 1639
-rw-r--r--  kernel/sched/debug.c | 4
-rw-r--r--  kernel/sched/fair.c | 109
-rw-r--r--  kernel/sched/rt.c | 10
-rw-r--r--  kernel/sched/sched.h | 145
-rw-r--r--  kernel/sched/stop_task.c | 2
-rw-r--r--  kernel/signal.c | 7
-rw-r--r--  kernel/smp.c | 68
-rw-r--r--  kernel/softirq.c | 164
-rw-r--r--  kernel/stop_machine.c | 2
-rw-r--r--  kernel/sys.c | 8
-rw-r--r--  kernel/sysctl.c | 47
-rw-r--r--  kernel/time/jiffies.c | 6
-rw-r--r--  kernel/time/sched_clock.c | 52
-rw-r--r--  kernel/time/tick-broadcast.c | 7
-rw-r--r--  kernel/time/tick-common.c | 1
-rw-r--r--  kernel/time/tick-internal.h | 5
-rw-r--r--  kernel/time/tick-sched.c | 61
-rw-r--r--  kernel/time/timekeeping.c | 53
-rw-r--r--  kernel/trace/Makefile | 1
-rw-r--r--  kernel/trace/blktrace.c | 15
-rw-r--r--  kernel/trace/ftrace.c | 215
-rw-r--r--  kernel/trace/ring_buffer.c | 9
-rw-r--r--  kernel/trace/trace.c | 170
-rw-r--r--  kernel/trace/trace.h | 193
-rw-r--r--  kernel/trace/trace_events.c | 65
-rw-r--r--  kernel/trace/trace_events_filter.c | 12
-rw-r--r--  kernel/trace/trace_events_trigger.c | 1437
-rw-r--r--  kernel/trace/trace_export.c | 7
-rw-r--r--  kernel/trace/trace_kprobe.c | 838
-rw-r--r--  kernel/trace/trace_probe.c | 440
-rw-r--r--  kernel/trace/trace_probe.h | 224
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 65
-rw-r--r--  kernel/trace/trace_selftest.c | 33
-rw-r--r--  kernel/trace/trace_stack.c | 2
-rw-r--r--  kernel/trace/trace_syscalls.c | 14
-rw-r--r--  kernel/trace/trace_uprobe.c | 487
-rw-r--r--  kernel/tracepoint.c | 7
-rw-r--r--  kernel/user_namespace.c | 2
-rw-r--r--  kernel/workqueue.c | 9
100 files changed, 8332 insertions, 2985 deletions
diff --git a/kernel/audit.c b/kernel/audit.c
index 906ae5a0233a..3392d3e0254a 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -41,6 +41,8 @@
41 * Example user-space utilities: http://people.redhat.com/sgrubb/audit/ 41 * Example user-space utilities: http://people.redhat.com/sgrubb/audit/
42 */ 42 */
43 43
44#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
45
44#include <linux/init.h> 46#include <linux/init.h>
45#include <asm/types.h> 47#include <asm/types.h>
46#include <linux/atomic.h> 48#include <linux/atomic.h>
@@ -63,6 +65,7 @@
63#include <linux/freezer.h> 65#include <linux/freezer.h>
64#include <linux/tty.h> 66#include <linux/tty.h>
65#include <linux/pid_namespace.h> 67#include <linux/pid_namespace.h>
68#include <net/netns/generic.h>
66 69
67#include "audit.h" 70#include "audit.h"
68 71
@@ -76,16 +79,16 @@ static int audit_initialized;
76#define AUDIT_OFF 0 79#define AUDIT_OFF 0
77#define AUDIT_ON 1 80#define AUDIT_ON 1
78#define AUDIT_LOCKED 2 81#define AUDIT_LOCKED 2
79int audit_enabled; 82u32 audit_enabled;
80int audit_ever_enabled; 83u32 audit_ever_enabled;
81 84
82EXPORT_SYMBOL_GPL(audit_enabled); 85EXPORT_SYMBOL_GPL(audit_enabled);
83 86
84/* Default state when kernel boots without any parameters. */ 87/* Default state when kernel boots without any parameters. */
85static int audit_default; 88static u32 audit_default;
86 89
87/* If auditing cannot proceed, audit_failure selects what happens. */ 90/* If auditing cannot proceed, audit_failure selects what happens. */
88static int audit_failure = AUDIT_FAIL_PRINTK; 91static u32 audit_failure = AUDIT_FAIL_PRINTK;
89 92
90/* 93/*
91 * If audit records are to be written to the netlink socket, audit_pid 94 * If audit records are to be written to the netlink socket, audit_pid
@@ -93,17 +96,19 @@ static int audit_failure = AUDIT_FAIL_PRINTK;
93 * the portid to use to send netlink messages to that process. 96 * the portid to use to send netlink messages to that process.
94 */ 97 */
95int audit_pid; 98int audit_pid;
96static int audit_nlk_portid; 99static __u32 audit_nlk_portid;
97 100
98/* If audit_rate_limit is non-zero, limit the rate of sending audit records 101/* If audit_rate_limit is non-zero, limit the rate of sending audit records
99 * to that number per second. This prevents DoS attacks, but results in 102 * to that number per second. This prevents DoS attacks, but results in
100 * audit records being dropped. */ 103 * audit records being dropped. */
101static int audit_rate_limit; 104static u32 audit_rate_limit;
102 105
103/* Number of outstanding audit_buffers allowed. */ 106/* Number of outstanding audit_buffers allowed.
104static int audit_backlog_limit = 64; 107 * When set to zero, this means unlimited. */
105static int audit_backlog_wait_time = 60 * HZ; 108static u32 audit_backlog_limit = 64;
106static int audit_backlog_wait_overflow = 0; 109#define AUDIT_BACKLOG_WAIT_TIME (60 * HZ)
110static u32 audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME;
111static u32 audit_backlog_wait_overflow = 0;
107 112
108/* The identity of the user shutting down the audit system. */ 113/* The identity of the user shutting down the audit system. */
109kuid_t audit_sig_uid = INVALID_UID; 114kuid_t audit_sig_uid = INVALID_UID;
@@ -121,6 +126,7 @@ static atomic_t audit_lost = ATOMIC_INIT(0);
121 126
122/* The netlink socket. */ 127/* The netlink socket. */
123static struct sock *audit_sock; 128static struct sock *audit_sock;
129int audit_net_id;
124 130
125/* Hash for inode-based rules */ 131/* Hash for inode-based rules */
126struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; 132struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
@@ -175,27 +181,27 @@ struct audit_buffer {
175}; 181};
176 182
177struct audit_reply { 183struct audit_reply {
178 int pid; 184 __u32 portid;
185 struct net *net;
179 struct sk_buff *skb; 186 struct sk_buff *skb;
180}; 187};
181 188
182static void audit_set_pid(struct audit_buffer *ab, pid_t pid) 189static void audit_set_portid(struct audit_buffer *ab, __u32 portid)
183{ 190{
184 if (ab) { 191 if (ab) {
185 struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); 192 struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
186 nlh->nlmsg_pid = pid; 193 nlh->nlmsg_pid = portid;
187 } 194 }
188} 195}
189 196
190void audit_panic(const char *message) 197void audit_panic(const char *message)
191{ 198{
192 switch (audit_failure) 199 switch (audit_failure) {
193 {
194 case AUDIT_FAIL_SILENT: 200 case AUDIT_FAIL_SILENT:
195 break; 201 break;
196 case AUDIT_FAIL_PRINTK: 202 case AUDIT_FAIL_PRINTK:
197 if (printk_ratelimit()) 203 if (printk_ratelimit())
198 printk(KERN_ERR "audit: %s\n", message); 204 pr_err("%s\n", message);
199 break; 205 break;
200 case AUDIT_FAIL_PANIC: 206 case AUDIT_FAIL_PANIC:
201 /* test audit_pid since printk is always losey, why bother? */ 207 /* test audit_pid since printk is always losey, why bother? */
@@ -266,9 +272,7 @@ void audit_log_lost(const char *message)
266 272
267 if (print) { 273 if (print) {
268 if (printk_ratelimit()) 274 if (printk_ratelimit())
269 printk(KERN_WARNING 275 pr_warn("audit_lost=%u audit_rate_limit=%u audit_backlog_limit=%u\n",
270 "audit: audit_lost=%d audit_rate_limit=%d "
271 "audit_backlog_limit=%d\n",
272 atomic_read(&audit_lost), 276 atomic_read(&audit_lost),
273 audit_rate_limit, 277 audit_rate_limit,
274 audit_backlog_limit); 278 audit_backlog_limit);
@@ -276,7 +280,7 @@ void audit_log_lost(const char *message)
276 } 280 }
277} 281}
278 282
279static int audit_log_config_change(char *function_name, int new, int old, 283static int audit_log_config_change(char *function_name, u32 new, u32 old,
280 int allow_changes) 284 int allow_changes)
281{ 285{
282 struct audit_buffer *ab; 286 struct audit_buffer *ab;
@@ -285,7 +289,7 @@ static int audit_log_config_change(char *function_name, int new, int old,
285 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); 289 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
286 if (unlikely(!ab)) 290 if (unlikely(!ab))
287 return rc; 291 return rc;
288 audit_log_format(ab, "%s=%d old=%d", function_name, new, old); 292 audit_log_format(ab, "%s=%u old=%u", function_name, new, old);
289 audit_log_session_info(ab); 293 audit_log_session_info(ab);
290 rc = audit_log_task_context(ab); 294 rc = audit_log_task_context(ab);
291 if (rc) 295 if (rc)
@@ -295,9 +299,10 @@ static int audit_log_config_change(char *function_name, int new, int old,
295 return rc; 299 return rc;
296} 300}
297 301
298static int audit_do_config_change(char *function_name, int *to_change, int new) 302static int audit_do_config_change(char *function_name, u32 *to_change, u32 new)
299{ 303{
300 int allow_changes, rc = 0, old = *to_change; 304 int allow_changes, rc = 0;
305 u32 old = *to_change;
301 306
302 /* check if we are locked */ 307 /* check if we are locked */
303 if (audit_enabled == AUDIT_LOCKED) 308 if (audit_enabled == AUDIT_LOCKED)
@@ -320,17 +325,23 @@ static int audit_do_config_change(char *function_name, int *to_change, int new)
320 return rc; 325 return rc;
321} 326}
322 327
323static int audit_set_rate_limit(int limit) 328static int audit_set_rate_limit(u32 limit)
324{ 329{
325 return audit_do_config_change("audit_rate_limit", &audit_rate_limit, limit); 330 return audit_do_config_change("audit_rate_limit", &audit_rate_limit, limit);
326} 331}
327 332
328static int audit_set_backlog_limit(int limit) 333static int audit_set_backlog_limit(u32 limit)
329{ 334{
330 return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit, limit); 335 return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit, limit);
331} 336}
332 337
333static int audit_set_enabled(int state) 338static int audit_set_backlog_wait_time(u32 timeout)
339{
340 return audit_do_config_change("audit_backlog_wait_time",
341 &audit_backlog_wait_time, timeout);
342}
343
344static int audit_set_enabled(u32 state)
334{ 345{
335 int rc; 346 int rc;
336 if (state < AUDIT_OFF || state > AUDIT_LOCKED) 347 if (state < AUDIT_OFF || state > AUDIT_LOCKED)
@@ -343,7 +354,7 @@ static int audit_set_enabled(int state)
343 return rc; 354 return rc;
344} 355}
345 356
346static int audit_set_failure(int state) 357static int audit_set_failure(u32 state)
347{ 358{
348 if (state != AUDIT_FAIL_SILENT 359 if (state != AUDIT_FAIL_SILENT
349 && state != AUDIT_FAIL_PRINTK 360 && state != AUDIT_FAIL_PRINTK
@@ -365,7 +376,8 @@ static int audit_set_failure(int state)
365static void audit_hold_skb(struct sk_buff *skb) 376static void audit_hold_skb(struct sk_buff *skb)
366{ 377{
367 if (audit_default && 378 if (audit_default &&
368 skb_queue_len(&audit_skb_hold_queue) < audit_backlog_limit) 379 (!audit_backlog_limit ||
380 skb_queue_len(&audit_skb_hold_queue) < audit_backlog_limit))
369 skb_queue_tail(&audit_skb_hold_queue, skb); 381 skb_queue_tail(&audit_skb_hold_queue, skb);
370 else 382 else
371 kfree_skb(skb); 383 kfree_skb(skb);
@@ -382,7 +394,7 @@ static void audit_printk_skb(struct sk_buff *skb)
382 394
383 if (nlh->nlmsg_type != AUDIT_EOE) { 395 if (nlh->nlmsg_type != AUDIT_EOE) {
384 if (printk_ratelimit()) 396 if (printk_ratelimit())
385 printk(KERN_NOTICE "type=%d %s\n", nlh->nlmsg_type, data); 397 pr_notice("type=%d %s\n", nlh->nlmsg_type, data);
386 else 398 else
387 audit_log_lost("printk limit exceeded\n"); 399 audit_log_lost("printk limit exceeded\n");
388 } 400 }
@@ -398,9 +410,12 @@ static void kauditd_send_skb(struct sk_buff *skb)
398 err = netlink_unicast(audit_sock, skb, audit_nlk_portid, 0); 410 err = netlink_unicast(audit_sock, skb, audit_nlk_portid, 0);
399 if (err < 0) { 411 if (err < 0) {
400 BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */ 412 BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */
401 printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); 413 if (audit_pid) {
402 audit_log_lost("auditd disappeared\n"); 414 pr_err("*NO* daemon at audit_pid=%d\n", audit_pid);
403 audit_pid = 0; 415 audit_log_lost("auditd disappeared\n");
416 audit_pid = 0;
417 audit_sock = NULL;
418 }
404 /* we might get lucky and get this in the next auditd */ 419 /* we might get lucky and get this in the next auditd */
405 audit_hold_skb(skb); 420 audit_hold_skb(skb);
406 } else 421 } else
@@ -457,8 +472,10 @@ static int kauditd_thread(void *dummy)
457 flush_hold_queue(); 472 flush_hold_queue();
458 473
459 skb = skb_dequeue(&audit_skb_queue); 474 skb = skb_dequeue(&audit_skb_queue);
460 wake_up(&audit_backlog_wait); 475
461 if (skb) { 476 if (skb) {
477 if (skb_queue_len(&audit_skb_queue) <= audit_backlog_limit)
478 wake_up(&audit_backlog_wait);
462 if (audit_pid) 479 if (audit_pid)
463 kauditd_send_skb(skb); 480 kauditd_send_skb(skb);
464 else 481 else
@@ -482,22 +499,24 @@ static int kauditd_thread(void *dummy)
482int audit_send_list(void *_dest) 499int audit_send_list(void *_dest)
483{ 500{
484 struct audit_netlink_list *dest = _dest; 501 struct audit_netlink_list *dest = _dest;
485 int pid = dest->pid;
486 struct sk_buff *skb; 502 struct sk_buff *skb;
503 struct net *net = dest->net;
504 struct audit_net *aunet = net_generic(net, audit_net_id);
487 505
488 /* wait for parent to finish and send an ACK */ 506 /* wait for parent to finish and send an ACK */
489 mutex_lock(&audit_cmd_mutex); 507 mutex_lock(&audit_cmd_mutex);
490 mutex_unlock(&audit_cmd_mutex); 508 mutex_unlock(&audit_cmd_mutex);
491 509
492 while ((skb = __skb_dequeue(&dest->q)) != NULL) 510 while ((skb = __skb_dequeue(&dest->q)) != NULL)
493 netlink_unicast(audit_sock, skb, pid, 0); 511 netlink_unicast(aunet->nlsk, skb, dest->portid, 0);
494 512
513 put_net(net);
495 kfree(dest); 514 kfree(dest);
496 515
497 return 0; 516 return 0;
498} 517}
499 518
500struct sk_buff *audit_make_reply(int pid, int seq, int type, int done, 519struct sk_buff *audit_make_reply(__u32 portid, int seq, int type, int done,
501 int multi, const void *payload, int size) 520 int multi, const void *payload, int size)
502{ 521{
503 struct sk_buff *skb; 522 struct sk_buff *skb;
@@ -510,7 +529,7 @@ struct sk_buff *audit_make_reply(int pid, int seq, int type, int done,
510 if (!skb) 529 if (!skb)
511 return NULL; 530 return NULL;
512 531
513 nlh = nlmsg_put(skb, pid, seq, t, size, flags); 532 nlh = nlmsg_put(skb, portid, seq, t, size, flags);
514 if (!nlh) 533 if (!nlh)
515 goto out_kfree_skb; 534 goto out_kfree_skb;
516 data = nlmsg_data(nlh); 535 data = nlmsg_data(nlh);
@@ -525,19 +544,22 @@ out_kfree_skb:
525static int audit_send_reply_thread(void *arg) 544static int audit_send_reply_thread(void *arg)
526{ 545{
527 struct audit_reply *reply = (struct audit_reply *)arg; 546 struct audit_reply *reply = (struct audit_reply *)arg;
547 struct net *net = reply->net;
548 struct audit_net *aunet = net_generic(net, audit_net_id);
528 549
529 mutex_lock(&audit_cmd_mutex); 550 mutex_lock(&audit_cmd_mutex);
530 mutex_unlock(&audit_cmd_mutex); 551 mutex_unlock(&audit_cmd_mutex);
531 552
532 /* Ignore failure. It'll only happen if the sender goes away, 553 /* Ignore failure. It'll only happen if the sender goes away,
533 because our timeout is set to infinite. */ 554 because our timeout is set to infinite. */
534 netlink_unicast(audit_sock, reply->skb, reply->pid, 0); 555 netlink_unicast(aunet->nlsk , reply->skb, reply->portid, 0);
556 put_net(net);
535 kfree(reply); 557 kfree(reply);
536 return 0; 558 return 0;
537} 559}
538/** 560/**
539 * audit_send_reply - send an audit reply message via netlink 561 * audit_send_reply - send an audit reply message via netlink
540 * @pid: process id to send reply to 562 * @request_skb: skb of request we are replying to (used to target the reply)
541 * @seq: sequence number 563 * @seq: sequence number
542 * @type: audit message type 564 * @type: audit message type
543 * @done: done (last) flag 565 * @done: done (last) flag
@@ -545,12 +567,14 @@ static int audit_send_reply_thread(void *arg)
545 * @payload: payload data 567 * @payload: payload data
546 * @size: payload size 568 * @size: payload size
547 * 569 *
548 * Allocates an skb, builds the netlink message, and sends it to the pid. 570 * Allocates an skb, builds the netlink message, and sends it to the port id.
549 * No failure notifications. 571 * No failure notifications.
550 */ 572 */
551static void audit_send_reply(int pid, int seq, int type, int done, int multi, 573static void audit_send_reply(struct sk_buff *request_skb, int seq, int type, int done,
552 const void *payload, int size) 574 int multi, const void *payload, int size)
553{ 575{
576 u32 portid = NETLINK_CB(request_skb).portid;
577 struct net *net = sock_net(NETLINK_CB(request_skb).sk);
554 struct sk_buff *skb; 578 struct sk_buff *skb;
555 struct task_struct *tsk; 579 struct task_struct *tsk;
556 struct audit_reply *reply = kmalloc(sizeof(struct audit_reply), 580 struct audit_reply *reply = kmalloc(sizeof(struct audit_reply),
@@ -559,11 +583,12 @@ static void audit_send_reply(int pid, int seq, int type, int done, int multi,
559 if (!reply) 583 if (!reply)
560 return; 584 return;
561 585
562 skb = audit_make_reply(pid, seq, type, done, multi, payload, size); 586 skb = audit_make_reply(portid, seq, type, done, multi, payload, size);
563 if (!skb) 587 if (!skb)
564 goto out; 588 goto out;
565 589
566 reply->pid = pid; 590 reply->net = get_net(net);
591 reply->portid = portid;
567 reply->skb = skb; 592 reply->skb = skb;
568 593
569 tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply"); 594 tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply");
@@ -652,8 +677,7 @@ static int audit_get_feature(struct sk_buff *skb)
652 677
653 seq = nlmsg_hdr(skb)->nlmsg_seq; 678 seq = nlmsg_hdr(skb)->nlmsg_seq;
654 679
655 audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_GET, 0, 0, 680 audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &af, sizeof(af));
656 &af, sizeof(af));
657 681
658 return 0; 682 return 0;
659} 683}
@@ -663,8 +687,12 @@ static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature
663{ 687{
664 struct audit_buffer *ab; 688 struct audit_buffer *ab;
665 689
690 if (audit_enabled == AUDIT_OFF)
691 return;
692
666 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_FEATURE_CHANGE); 693 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_FEATURE_CHANGE);
667 audit_log_format(ab, "feature=%s new=%d old=%d old_lock=%d new_lock=%d res=%d", 694 audit_log_task_info(ab, current);
695 audit_log_format(ab, "feature=%s old=%u new=%u old_lock=%u new_lock=%u res=%d",
668 audit_feature_names[which], !!old_feature, !!new_feature, 696 audit_feature_names[which], !!old_feature, !!new_feature,
669 !!old_lock, !!new_lock, res); 697 !!old_lock, !!new_lock, res);
670 audit_log_end(ab); 698 audit_log_end(ab);
@@ -694,7 +722,7 @@ static int audit_set_feature(struct sk_buff *skb)
694 old_lock = af.lock & feature; 722 old_lock = af.lock & feature;
695 723
696 /* are we changing a locked feature? */ 724 /* are we changing a locked feature? */
697 if ((af.lock & feature) && (new_feature != old_feature)) { 725 if (old_lock && (new_feature != old_feature)) {
698 audit_log_feature_change(i, old_feature, new_feature, 726 audit_log_feature_change(i, old_feature, new_feature,
699 old_lock, new_lock, 0); 727 old_lock, new_lock, 0);
700 return -EPERM; 728 return -EPERM;
@@ -732,7 +760,6 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
732{ 760{
733 u32 seq; 761 u32 seq;
734 void *data; 762 void *data;
735 struct audit_status *status_get, status_set;
736 int err; 763 int err;
737 struct audit_buffer *ab; 764 struct audit_buffer *ab;
738 u16 msg_type = nlh->nlmsg_type; 765 u16 msg_type = nlh->nlmsg_type;
@@ -758,48 +785,69 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
758 data = nlmsg_data(nlh); 785 data = nlmsg_data(nlh);
759 786
760 switch (msg_type) { 787 switch (msg_type) {
761 case AUDIT_GET: 788 case AUDIT_GET: {
762 memset(&status_set, 0, sizeof(status_set)); 789 struct audit_status s;
763 status_set.enabled = audit_enabled; 790 memset(&s, 0, sizeof(s));
764 status_set.failure = audit_failure; 791 s.enabled = audit_enabled;
765 status_set.pid = audit_pid; 792 s.failure = audit_failure;
766 status_set.rate_limit = audit_rate_limit; 793 s.pid = audit_pid;
767 status_set.backlog_limit = audit_backlog_limit; 794 s.rate_limit = audit_rate_limit;
768 status_set.lost = atomic_read(&audit_lost); 795 s.backlog_limit = audit_backlog_limit;
769 status_set.backlog = skb_queue_len(&audit_skb_queue); 796 s.lost = atomic_read(&audit_lost);
770 audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_GET, 0, 0, 797 s.backlog = skb_queue_len(&audit_skb_queue);
771 &status_set, sizeof(status_set)); 798 s.version = AUDIT_VERSION_LATEST;
799 s.backlog_wait_time = audit_backlog_wait_time;
800 audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s));
772 break; 801 break;
773 case AUDIT_SET: 802 }
774 if (nlmsg_len(nlh) < sizeof(struct audit_status)) 803 case AUDIT_SET: {
775 return -EINVAL; 804 struct audit_status s;
776 status_get = (struct audit_status *)data; 805 memset(&s, 0, sizeof(s));
777 if (status_get->mask & AUDIT_STATUS_ENABLED) { 806 /* guard against past and future API changes */
778 err = audit_set_enabled(status_get->enabled); 807 memcpy(&s, data, min_t(size_t, sizeof(s), nlmsg_len(nlh)));
808 if (s.mask & AUDIT_STATUS_ENABLED) {
809 err = audit_set_enabled(s.enabled);
779 if (err < 0) 810 if (err < 0)
780 return err; 811 return err;
781 } 812 }
782 if (status_get->mask & AUDIT_STATUS_FAILURE) { 813 if (s.mask & AUDIT_STATUS_FAILURE) {
783 err = audit_set_failure(status_get->failure); 814 err = audit_set_failure(s.failure);
784 if (err < 0) 815 if (err < 0)
785 return err; 816 return err;
786 } 817 }
787 if (status_get->mask & AUDIT_STATUS_PID) { 818 if (s.mask & AUDIT_STATUS_PID) {
788 int new_pid = status_get->pid; 819 int new_pid = s.pid;
789 820
821 if ((!new_pid) && (task_tgid_vnr(current) != audit_pid))
822 return -EACCES;
790 if (audit_enabled != AUDIT_OFF) 823 if (audit_enabled != AUDIT_OFF)
791 audit_log_config_change("audit_pid", new_pid, audit_pid, 1); 824 audit_log_config_change("audit_pid", new_pid, audit_pid, 1);
792 audit_pid = new_pid; 825 audit_pid = new_pid;
793 audit_nlk_portid = NETLINK_CB(skb).portid; 826 audit_nlk_portid = NETLINK_CB(skb).portid;
827 audit_sock = skb->sk;
794 } 828 }
795 if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) { 829 if (s.mask & AUDIT_STATUS_RATE_LIMIT) {
796 err = audit_set_rate_limit(status_get->rate_limit); 830 err = audit_set_rate_limit(s.rate_limit);
831 if (err < 0)
832 return err;
833 }
834 if (s.mask & AUDIT_STATUS_BACKLOG_LIMIT) {
835 err = audit_set_backlog_limit(s.backlog_limit);
836 if (err < 0)
837 return err;
838 }
839 if (s.mask & AUDIT_STATUS_BACKLOG_WAIT_TIME) {
840 if (sizeof(s) > (size_t)nlh->nlmsg_len)
841 return -EINVAL;
842 if (s.backlog_wait_time < 0 ||
843 s.backlog_wait_time > 10*AUDIT_BACKLOG_WAIT_TIME)
844 return -EINVAL;
845 err = audit_set_backlog_wait_time(s.backlog_wait_time);
797 if (err < 0) 846 if (err < 0)
798 return err; 847 return err;
799 } 848 }
800 if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT)
801 err = audit_set_backlog_limit(status_get->backlog_limit);
802 break; 849 break;
850 }
803 case AUDIT_GET_FEATURE: 851 case AUDIT_GET_FEATURE:
804 err = audit_get_feature(skb); 852 err = audit_get_feature(skb);
805 if (err) 853 if (err)
@@ -817,13 +865,14 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
817 return 0; 865 return 0;
818 866
819 err = audit_filter_user(msg_type); 867 err = audit_filter_user(msg_type);
820 if (err == 1) { 868 if (err == 1) { /* match or error */
821 err = 0; 869 err = 0;
822 if (msg_type == AUDIT_USER_TTY) { 870 if (msg_type == AUDIT_USER_TTY) {
823 err = tty_audit_push_current(); 871 err = tty_audit_push_current();
824 if (err) 872 if (err)
825 break; 873 break;
826 } 874 }
875 mutex_unlock(&audit_cmd_mutex);
827 audit_log_common_recv_msg(&ab, msg_type); 876 audit_log_common_recv_msg(&ab, msg_type);
828 if (msg_type != AUDIT_USER_TTY) 877 if (msg_type != AUDIT_USER_TTY)
829 audit_log_format(ab, " msg='%.*s'", 878 audit_log_format(ab, " msg='%.*s'",
@@ -839,8 +888,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
839 size--; 888 size--;
840 audit_log_n_untrustedstring(ab, data, size); 889 audit_log_n_untrustedstring(ab, data, size);
841 } 890 }
842 audit_set_pid(ab, NETLINK_CB(skb).portid); 891 audit_set_portid(ab, NETLINK_CB(skb).portid);
843 audit_log_end(ab); 892 audit_log_end(ab);
893 mutex_lock(&audit_cmd_mutex);
844 } 894 }
845 break; 895 break;
846 case AUDIT_ADD_RULE: 896 case AUDIT_ADD_RULE:
@@ -853,11 +903,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
853 audit_log_end(ab); 903 audit_log_end(ab);
854 return -EPERM; 904 return -EPERM;
855 } 905 }
856 /* fallthrough */ 906 err = audit_rule_change(msg_type, NETLINK_CB(skb).portid,
857 case AUDIT_LIST_RULES:
858 err = audit_receive_filter(msg_type, NETLINK_CB(skb).portid,
859 seq, data, nlmsg_len(nlh)); 907 seq, data, nlmsg_len(nlh));
860 break; 908 break;
909 case AUDIT_LIST_RULES:
910 err = audit_list_rules_send(skb, seq);
911 break;
861 case AUDIT_TRIM: 912 case AUDIT_TRIM:
862 audit_trim_trees(); 913 audit_trim_trees();
863 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE); 914 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE);
@@ -921,8 +972,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
921 memcpy(sig_data->ctx, ctx, len); 972 memcpy(sig_data->ctx, ctx, len);
922 security_release_secctx(ctx, len); 973 security_release_secctx(ctx, len);
923 } 974 }
924 audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_SIGNAL_INFO, 975 audit_send_reply(skb, seq, AUDIT_SIGNAL_INFO, 0, 0,
925 0, 0, sig_data, sizeof(*sig_data) + len); 976 sig_data, sizeof(*sig_data) + len);
926 kfree(sig_data); 977 kfree(sig_data);
927 break; 978 break;
928 case AUDIT_TTY_GET: { 979 case AUDIT_TTY_GET: {
@@ -934,25 +985,37 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
934 s.log_passwd = tsk->signal->audit_tty_log_passwd; 985 s.log_passwd = tsk->signal->audit_tty_log_passwd;
935 spin_unlock(&tsk->sighand->siglock); 986 spin_unlock(&tsk->sighand->siglock);
936 987
937 audit_send_reply(NETLINK_CB(skb).portid, seq, 988 audit_send_reply(skb, seq, AUDIT_TTY_GET, 0, 0, &s, sizeof(s));
938 AUDIT_TTY_GET, 0, 0, &s, sizeof(s));
939 break; 989 break;
940 } 990 }
941 case AUDIT_TTY_SET: { 991 case AUDIT_TTY_SET: {
942 struct audit_tty_status s; 992 struct audit_tty_status s, old;
943 struct task_struct *tsk = current; 993 struct task_struct *tsk = current;
994 struct audit_buffer *ab;
944 995
945 memset(&s, 0, sizeof(s)); 996 memset(&s, 0, sizeof(s));
946 /* guard against past and future API changes */ 997 /* guard against past and future API changes */
947 memcpy(&s, data, min_t(size_t, sizeof(s), nlmsg_len(nlh))); 998 memcpy(&s, data, min_t(size_t, sizeof(s), nlmsg_len(nlh)));
999 /* check if new data is valid */
948 if ((s.enabled != 0 && s.enabled != 1) || 1000 if ((s.enabled != 0 && s.enabled != 1) ||
949 (s.log_passwd != 0 && s.log_passwd != 1)) 1001 (s.log_passwd != 0 && s.log_passwd != 1))
950 return -EINVAL; 1002 err = -EINVAL;
951 1003
952 spin_lock(&tsk->sighand->siglock); 1004 spin_lock(&tsk->sighand->siglock);
953 tsk->signal->audit_tty = s.enabled; 1005 old.enabled = tsk->signal->audit_tty;
954 tsk->signal->audit_tty_log_passwd = s.log_passwd; 1006 old.log_passwd = tsk->signal->audit_tty_log_passwd;
1007 if (!err) {
1008 tsk->signal->audit_tty = s.enabled;
1009 tsk->signal->audit_tty_log_passwd = s.log_passwd;
1010 }
955 spin_unlock(&tsk->sighand->siglock); 1011 spin_unlock(&tsk->sighand->siglock);
1012
1013 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE);
1014 audit_log_format(ab, " op=tty_set old-enabled=%d new-enabled=%d"
1015 " old-log_passwd=%d new-log_passwd=%d res=%d",
1016 old.enabled, s.enabled, old.log_passwd,
1017 s.log_passwd, !err);
1018 audit_log_end(ab);
956 break; 1019 break;
957 } 1020 }
958 default: 1021 default:
@@ -998,24 +1061,55 @@ static void audit_receive(struct sk_buff *skb)
998 mutex_unlock(&audit_cmd_mutex); 1061 mutex_unlock(&audit_cmd_mutex);
999} 1062}
1000 1063
1001/* Initialize audit support at boot time. */ 1064static int __net_init audit_net_init(struct net *net)
1002static int __init audit_init(void)
1003{ 1065{
1004 int i;
1005 struct netlink_kernel_cfg cfg = { 1066 struct netlink_kernel_cfg cfg = {
1006 .input = audit_receive, 1067 .input = audit_receive,
1007 }; 1068 };
1008 1069
1070 struct audit_net *aunet = net_generic(net, audit_net_id);
1071
1072 aunet->nlsk = netlink_kernel_create(net, NETLINK_AUDIT, &cfg);
1073 if (aunet->nlsk == NULL) {
1074 audit_panic("cannot initialize netlink socket in namespace");
1075 return -ENOMEM;
1076 }
1077 aunet->nlsk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
1078 return 0;
1079}
1080
1081static void __net_exit audit_net_exit(struct net *net)
1082{
1083 struct audit_net *aunet = net_generic(net, audit_net_id);
1084 struct sock *sock = aunet->nlsk;
1085 if (sock == audit_sock) {
1086 audit_pid = 0;
1087 audit_sock = NULL;
1088 }
1089
1090 rcu_assign_pointer(aunet->nlsk, NULL);
1091 synchronize_net();
1092 netlink_kernel_release(sock);
1093}
1094
1095static struct pernet_operations audit_net_ops __net_initdata = {
1096 .init = audit_net_init,
1097 .exit = audit_net_exit,
1098 .id = &audit_net_id,
1099 .size = sizeof(struct audit_net),
1100};
1101
1102/* Initialize audit support at boot time. */
1103static int __init audit_init(void)
1104{
1105 int i;
1106
1009 if (audit_initialized == AUDIT_DISABLED) 1107 if (audit_initialized == AUDIT_DISABLED)
1010 return 0; 1108 return 0;
1011 1109
1012 printk(KERN_INFO "audit: initializing netlink socket (%s)\n", 1110 pr_info("initializing netlink subsys (%s)\n",
1013 audit_default ? "enabled" : "disabled"); 1111 audit_default ? "enabled" : "disabled");
1014 audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, &cfg); 1112 register_pernet_subsys(&audit_net_ops);
1015 if (!audit_sock)
1016 audit_panic("cannot initialize netlink socket");
1017 else
1018 audit_sock->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
1019 1113
1020 skb_queue_head_init(&audit_skb_queue); 1114 skb_queue_head_init(&audit_skb_queue);
1021 skb_queue_head_init(&audit_skb_hold_queue); 1115 skb_queue_head_init(&audit_skb_hold_queue);
@@ -1039,22 +1133,32 @@ static int __init audit_enable(char *str)
1039 if (!audit_default) 1133 if (!audit_default)
1040 audit_initialized = AUDIT_DISABLED; 1134 audit_initialized = AUDIT_DISABLED;
1041 1135
1042 printk(KERN_INFO "audit: %s", audit_default ? "enabled" : "disabled"); 1136 pr_info("%s\n", audit_default ?
1137 "enabled (after initialization)" : "disabled (until reboot)");
1043 1138
1044 if (audit_initialized == AUDIT_INITIALIZED) { 1139 return 1;
1045 audit_enabled = audit_default; 1140}
1046 audit_ever_enabled |= !!audit_default; 1141__setup("audit=", audit_enable);
1047 } else if (audit_initialized == AUDIT_UNINITIALIZED) { 1142
1048 printk(" (after initialization)"); 1143/* Process kernel command-line parameter at boot time.
1049 } else { 1144 * audit_backlog_limit=<n> */
1050 printk(" (until reboot)"); 1145static int __init audit_backlog_limit_set(char *str)
1146{
1147 u32 audit_backlog_limit_arg;
1148
1149 pr_info("audit_backlog_limit: ");
1150 if (kstrtouint(str, 0, &audit_backlog_limit_arg)) {
1151 pr_cont("using default of %u, unable to parse %s\n",
1152 audit_backlog_limit, str);
1153 return 1;
1051 } 1154 }
1052 printk("\n"); 1155
1156 audit_backlog_limit = audit_backlog_limit_arg;
1157 pr_cont("%d\n", audit_backlog_limit);
1053 1158
1054 return 1; 1159 return 1;
1055} 1160}
1056 1161__setup("audit_backlog_limit=", audit_backlog_limit_set);
1057__setup("audit=", audit_enable);
1058 1162
1059static void audit_buffer_free(struct audit_buffer *ab) 1163static void audit_buffer_free(struct audit_buffer *ab)
1060{ 1164{
@@ -1165,18 +1269,20 @@ static inline void audit_get_stamp(struct audit_context *ctx,
1165/* 1269/*
1166 * Wait for auditd to drain the queue a little 1270 * Wait for auditd to drain the queue a little
1167 */ 1271 */
1168static void wait_for_auditd(unsigned long sleep_time) 1272static long wait_for_auditd(long sleep_time)
1169{ 1273{
1170 DECLARE_WAITQUEUE(wait, current); 1274 DECLARE_WAITQUEUE(wait, current);
1171 set_current_state(TASK_UNINTERRUPTIBLE); 1275 set_current_state(TASK_UNINTERRUPTIBLE);
1172 add_wait_queue(&audit_backlog_wait, &wait); 1276 add_wait_queue_exclusive(&audit_backlog_wait, &wait);
1173 1277
1174 if (audit_backlog_limit && 1278 if (audit_backlog_limit &&
1175 skb_queue_len(&audit_skb_queue) > audit_backlog_limit) 1279 skb_queue_len(&audit_skb_queue) > audit_backlog_limit)
1176 schedule_timeout(sleep_time); 1280 sleep_time = schedule_timeout(sleep_time);
1177 1281
1178 __set_current_state(TASK_RUNNING); 1282 __set_current_state(TASK_RUNNING);
1179 remove_wait_queue(&audit_backlog_wait, &wait); 1283 remove_wait_queue(&audit_backlog_wait, &wait);
1284
1285 return sleep_time;
1180} 1286}
1181 1287
1182/** 1288/**
@@ -1200,7 +1306,8 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
1200 struct audit_buffer *ab = NULL; 1306 struct audit_buffer *ab = NULL;
1201 struct timespec t; 1307 struct timespec t;
1202 unsigned int uninitialized_var(serial); 1308 unsigned int uninitialized_var(serial);
1203 int reserve; 1309 int reserve = 5; /* Allow atomic callers to go up to five
1310 entries over the normal backlog limit */
1204 unsigned long timeout_start = jiffies; 1311 unsigned long timeout_start = jiffies;
1205 1312
1206 if (audit_initialized != AUDIT_INITIALIZED) 1313 if (audit_initialized != AUDIT_INITIALIZED)
@@ -1209,36 +1316,37 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
1209 if (unlikely(audit_filter_type(type))) 1316 if (unlikely(audit_filter_type(type)))
1210 return NULL; 1317 return NULL;
1211 1318
1212 if (gfp_mask & __GFP_WAIT) 1319 if (gfp_mask & __GFP_WAIT) {
1213 reserve = 0; 1320 if (audit_pid && audit_pid == current->pid)
1214 else 1321 gfp_mask &= ~__GFP_WAIT;
1215 reserve = 5; /* Allow atomic callers to go up to five 1322 else
1216 entries over the normal backlog limit */ 1323 reserve = 0;
1324 }
1217 1325
1218 while (audit_backlog_limit 1326 while (audit_backlog_limit
1219 && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) { 1327 && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) {
1220 if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time) { 1328 if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time) {
1221 unsigned long sleep_time; 1329 long sleep_time;
1222 1330
1223 sleep_time = timeout_start + audit_backlog_wait_time - 1331 sleep_time = timeout_start + audit_backlog_wait_time - jiffies;
1224 jiffies; 1332 if (sleep_time > 0) {
1225 if ((long)sleep_time > 0) { 1333 sleep_time = wait_for_auditd(sleep_time);
1226 wait_for_auditd(sleep_time); 1334 if (sleep_time > 0)
1227 continue; 1335 continue;
1228 } 1336 }
1229 } 1337 }
1230 if (audit_rate_check() && printk_ratelimit()) 1338 if (audit_rate_check() && printk_ratelimit())
1231 printk(KERN_WARNING 1339 pr_warn("audit_backlog=%d > audit_backlog_limit=%d\n",
1232 "audit: audit_backlog=%d > " 1340 skb_queue_len(&audit_skb_queue),
1233 "audit_backlog_limit=%d\n", 1341 audit_backlog_limit);
1234 skb_queue_len(&audit_skb_queue),
1235 audit_backlog_limit);
1236 audit_log_lost("backlog limit exceeded"); 1342 audit_log_lost("backlog limit exceeded");
1237 audit_backlog_wait_time = audit_backlog_wait_overflow; 1343 audit_backlog_wait_time = audit_backlog_wait_overflow;
1238 wake_up(&audit_backlog_wait); 1344 wake_up(&audit_backlog_wait);
1239 return NULL; 1345 return NULL;
1240 } 1346 }
1241 1347
1348 audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME;
1349
1242 ab = audit_buffer_alloc(ctx, gfp_mask, type); 1350 ab = audit_buffer_alloc(ctx, gfp_mask, type);
1243 if (!ab) { 1351 if (!ab) {
1244 audit_log_lost("out of memory in audit_log_start"); 1352 audit_log_lost("out of memory in audit_log_start");
@@ -1356,7 +1464,6 @@ void audit_log_n_hex(struct audit_buffer *ab, const unsigned char *buf,
1356 int i, avail, new_len; 1464 int i, avail, new_len;
1357 unsigned char *ptr; 1465 unsigned char *ptr;
1358 struct sk_buff *skb; 1466 struct sk_buff *skb;
1359 static const unsigned char *hex = "0123456789ABCDEF";
1360 1467
1361 if (!ab) 1468 if (!ab)
1362 return; 1469 return;
@@ -1374,10 +1481,8 @@ void audit_log_n_hex(struct audit_buffer *ab, const unsigned char *buf,
1374 } 1481 }
1375 1482
1376 ptr = skb_tail_pointer(skb); 1483 ptr = skb_tail_pointer(skb);
1377 for (i=0; i<len; i++) { 1484 for (i = 0; i < len; i++)
1378 *ptr++ = hex[(buf[i] & 0xF0)>>4]; /* Upper nibble */ 1485 ptr = hex_byte_pack_upper(ptr, buf[i]);
1379 *ptr++ = hex[buf[i] & 0x0F]; /* Lower nibble */
1380 }
1381 *ptr = 0; 1486 *ptr = 0;
1382 skb_put(skb, len << 1); /* new string is twice the old string */ 1487 skb_put(skb, len << 1); /* new string is twice the old string */
1383} 1488}
@@ -1491,7 +1596,7 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
1491 1596
1492void audit_log_session_info(struct audit_buffer *ab) 1597void audit_log_session_info(struct audit_buffer *ab)
1493{ 1598{
1494 u32 sessionid = audit_get_sessionid(current); 1599 unsigned int sessionid = audit_get_sessionid(current);
1495 uid_t auid = from_kuid(&init_user_ns, audit_get_loginuid(current)); 1600 uid_t auid = from_kuid(&init_user_ns, audit_get_loginuid(current));
1496 1601
1497 audit_log_format(ab, " auid=%u ses=%u", auid, sessionid); 1602 audit_log_format(ab, " auid=%u ses=%u", auid, sessionid);
@@ -1716,7 +1821,7 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
1716 audit_log_format(ab, 1821 audit_log_format(ab,
1717 " ppid=%ld pid=%d auid=%u uid=%u gid=%u" 1822 " ppid=%ld pid=%d auid=%u uid=%u gid=%u"
1718 " euid=%u suid=%u fsuid=%u" 1823 " euid=%u suid=%u fsuid=%u"
1719 " egid=%u sgid=%u fsgid=%u ses=%u tty=%s", 1824 " egid=%u sgid=%u fsgid=%u tty=%s ses=%u",
1720 sys_getppid(), 1825 sys_getppid(),
1721 tsk->pid, 1826 tsk->pid,
1722 from_kuid(&init_user_ns, audit_get_loginuid(tsk)), 1827 from_kuid(&init_user_ns, audit_get_loginuid(tsk)),
@@ -1728,7 +1833,7 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
1728 from_kgid(&init_user_ns, cred->egid), 1833 from_kgid(&init_user_ns, cred->egid),
1729 from_kgid(&init_user_ns, cred->sgid), 1834 from_kgid(&init_user_ns, cred->sgid),
1730 from_kgid(&init_user_ns, cred->fsgid), 1835 from_kgid(&init_user_ns, cred->fsgid),
1731 audit_get_sessionid(tsk), tty); 1836 tty, audit_get_sessionid(tsk));
1732 1837
1733 get_task_comm(name, tsk); 1838 get_task_comm(name, tsk);
1734 audit_log_format(ab, " comm="); 1839 audit_log_format(ab, " comm=");
@@ -1739,7 +1844,8 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
1739 if (mm->exe_file) 1844 if (mm->exe_file)
1740 audit_log_d_path(ab, " exe=", &mm->exe_file->f_path); 1845 audit_log_d_path(ab, " exe=", &mm->exe_file->f_path);
1741 up_read(&mm->mmap_sem); 1846 up_read(&mm->mmap_sem);
1742 } 1847 } else
1848 audit_log_format(ab, " exe=(null)");
1743 audit_log_task_context(ab); 1849 audit_log_task_context(ab);
1744} 1850}
1745EXPORT_SYMBOL(audit_log_task_info); 1851EXPORT_SYMBOL(audit_log_task_info);
diff --git a/kernel/audit.h b/kernel/audit.h
index b779642b29af..8df132214606 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -209,7 +209,7 @@ struct audit_context {
209#endif 209#endif
210}; 210};
211 211
212extern int audit_ever_enabled; 212extern u32 audit_ever_enabled;
213 213
214extern void audit_copy_inode(struct audit_names *name, 214extern void audit_copy_inode(struct audit_names *name,
215 const struct dentry *dentry, 215 const struct dentry *dentry,
@@ -240,18 +240,23 @@ extern int audit_uid_comparator(kuid_t left, u32 op, kuid_t right);
240extern int audit_gid_comparator(kgid_t left, u32 op, kgid_t right); 240extern int audit_gid_comparator(kgid_t left, u32 op, kgid_t right);
241extern int parent_len(const char *path); 241extern int parent_len(const char *path);
242extern int audit_compare_dname_path(const char *dname, const char *path, int plen); 242extern int audit_compare_dname_path(const char *dname, const char *path, int plen);
243extern struct sk_buff * audit_make_reply(int pid, int seq, int type, 243extern struct sk_buff *audit_make_reply(__u32 portid, int seq, int type,
244 int done, int multi, 244 int done, int multi,
245 const void *payload, int size); 245 const void *payload, int size);
246extern void audit_panic(const char *message); 246extern void audit_panic(const char *message);
247 247
248struct audit_netlink_list { 248struct audit_netlink_list {
249 int pid; 249 __u32 portid;
250 struct net *net;
250 struct sk_buff_head q; 251 struct sk_buff_head q;
251}; 252};
252 253
253int audit_send_list(void *); 254int audit_send_list(void *);
254 255
256struct audit_net {
257 struct sock *nlsk;
258};
259
255extern int selinux_audit_rule_update(void); 260extern int selinux_audit_rule_update(void);
256 261
257extern struct mutex audit_filter_mutex; 262extern struct mutex audit_filter_mutex;
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 43c307dc9453..135944a7b28a 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -912,12 +912,13 @@ static void evict_chunk(struct audit_chunk *chunk)
912} 912}
913 913
914static int audit_tree_handle_event(struct fsnotify_group *group, 914static int audit_tree_handle_event(struct fsnotify_group *group,
915 struct inode *to_tell,
915 struct fsnotify_mark *inode_mark, 916 struct fsnotify_mark *inode_mark,
916 struct fsnotify_mark *vfsmonut_mark, 917 struct fsnotify_mark *vfsmount_mark,
917 struct fsnotify_event *event) 918 u32 mask, void *data, int data_type,
919 const unsigned char *file_name, u32 cookie)
918{ 920{
919 BUG(); 921 return 0;
920 return -EOPNOTSUPP;
921} 922}
922 923
923static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify_group *group) 924static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify_group *group)
@@ -933,19 +934,8 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify
933 BUG_ON(atomic_read(&entry->refcnt) < 1); 934 BUG_ON(atomic_read(&entry->refcnt) < 1);
934} 935}
935 936
936static bool audit_tree_send_event(struct fsnotify_group *group, struct inode *inode,
937 struct fsnotify_mark *inode_mark,
938 struct fsnotify_mark *vfsmount_mark,
939 __u32 mask, void *data, int data_type)
940{
941 return false;
942}
943
944static const struct fsnotify_ops audit_tree_ops = { 937static const struct fsnotify_ops audit_tree_ops = {
945 .handle_event = audit_tree_handle_event, 938 .handle_event = audit_tree_handle_event,
946 .should_send_event = audit_tree_send_event,
947 .free_group_priv = NULL,
948 .free_event_priv = NULL,
949 .freeing_mark = audit_tree_freeing_mark, 939 .freeing_mark = audit_tree_freeing_mark,
950}; 940};
951 941
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 22831c4d369c..70b4554d2fbe 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -465,35 +465,27 @@ void audit_remove_watch_rule(struct audit_krule *krule)
465 } 465 }
466} 466}
467 467
468static bool audit_watch_should_send_event(struct fsnotify_group *group, struct inode *inode,
469 struct fsnotify_mark *inode_mark,
470 struct fsnotify_mark *vfsmount_mark,
471 __u32 mask, void *data, int data_type)
472{
473 return true;
474}
475
476/* Update watch data in audit rules based on fsnotify events. */ 468/* Update watch data in audit rules based on fsnotify events. */
477static int audit_watch_handle_event(struct fsnotify_group *group, 469static int audit_watch_handle_event(struct fsnotify_group *group,
470 struct inode *to_tell,
478 struct fsnotify_mark *inode_mark, 471 struct fsnotify_mark *inode_mark,
479 struct fsnotify_mark *vfsmount_mark, 472 struct fsnotify_mark *vfsmount_mark,
480 struct fsnotify_event *event) 473 u32 mask, void *data, int data_type,
474 const unsigned char *dname, u32 cookie)
481{ 475{
482 struct inode *inode; 476 struct inode *inode;
483 __u32 mask = event->mask;
484 const char *dname = event->file_name;
485 struct audit_parent *parent; 477 struct audit_parent *parent;
486 478
487 parent = container_of(inode_mark, struct audit_parent, mark); 479 parent = container_of(inode_mark, struct audit_parent, mark);
488 480
489 BUG_ON(group != audit_watch_group); 481 BUG_ON(group != audit_watch_group);
490 482
491 switch (event->data_type) { 483 switch (data_type) {
492 case (FSNOTIFY_EVENT_PATH): 484 case (FSNOTIFY_EVENT_PATH):
493 inode = event->path.dentry->d_inode; 485 inode = ((struct path *)data)->dentry->d_inode;
494 break; 486 break;
495 case (FSNOTIFY_EVENT_INODE): 487 case (FSNOTIFY_EVENT_INODE):
496 inode = event->inode; 488 inode = (struct inode *)data;
497 break; 489 break;
498 default: 490 default:
499 BUG(); 491 BUG();
@@ -512,11 +504,7 @@ static int audit_watch_handle_event(struct fsnotify_group *group,
512} 504}
513 505
514static const struct fsnotify_ops audit_watch_fsnotify_ops = { 506static const struct fsnotify_ops audit_watch_fsnotify_ops = {
515 .should_send_event = audit_watch_should_send_event,
516 .handle_event = audit_watch_handle_event, 507 .handle_event = audit_watch_handle_event,
517 .free_group_priv = NULL,
518 .freeing_mark = NULL,
519 .free_event_priv = NULL,
520}; 508};
521 509
522static int __init audit_watch_init(void) 510static int __init audit_watch_init(void)
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 51f3fd4c1ed3..92062fd6cc8c 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -29,6 +29,8 @@
29#include <linux/sched.h> 29#include <linux/sched.h>
30#include <linux/slab.h> 30#include <linux/slab.h>
31#include <linux/security.h> 31#include <linux/security.h>
32#include <net/net_namespace.h>
33#include <net/sock.h>
32#include "audit.h" 34#include "audit.h"
33 35
34/* 36/*
@@ -972,7 +974,7 @@ out:
972} 974}
973 975
974/* List rules using struct audit_rule_data. */ 976/* List rules using struct audit_rule_data. */
975static void audit_list_rules(int pid, int seq, struct sk_buff_head *q) 977static void audit_list_rules(__u32 portid, int seq, struct sk_buff_head *q)
976{ 978{
977 struct sk_buff *skb; 979 struct sk_buff *skb;
978 struct audit_krule *r; 980 struct audit_krule *r;
@@ -987,14 +989,15 @@ static void audit_list_rules(int pid, int seq, struct sk_buff_head *q)
987 data = audit_krule_to_data(r); 989 data = audit_krule_to_data(r);
988 if (unlikely(!data)) 990 if (unlikely(!data))
989 break; 991 break;
990 skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 0, 1, 992 skb = audit_make_reply(portid, seq, AUDIT_LIST_RULES,
991 data, sizeof(*data) + data->buflen); 993 0, 1, data,
994 sizeof(*data) + data->buflen);
992 if (skb) 995 if (skb)
993 skb_queue_tail(q, skb); 996 skb_queue_tail(q, skb);
994 kfree(data); 997 kfree(data);
995 } 998 }
996 } 999 }
997 skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0); 1000 skb = audit_make_reply(portid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0);
998 if (skb) 1001 if (skb)
999 skb_queue_tail(q, skb); 1002 skb_queue_tail(q, skb);
1000} 1003}
@@ -1004,7 +1007,7 @@ static void audit_log_rule_change(char *action, struct audit_krule *rule, int re
1004{ 1007{
1005 struct audit_buffer *ab; 1008 struct audit_buffer *ab;
1006 uid_t loginuid = from_kuid(&init_user_ns, audit_get_loginuid(current)); 1009 uid_t loginuid = from_kuid(&init_user_ns, audit_get_loginuid(current));
1007 u32 sessionid = audit_get_sessionid(current); 1010 unsigned int sessionid = audit_get_sessionid(current);
1008 1011
1009 if (!audit_enabled) 1012 if (!audit_enabled)
1010 return; 1013 return;
@@ -1022,45 +1025,20 @@ static void audit_log_rule_change(char *action, struct audit_krule *rule, int re
1022} 1025}
1023 1026
1024/** 1027/**
1025 * audit_receive_filter - apply all rules to the specified message type 1028 * audit_rule_change - apply all rules to the specified message type
1026 * @type: audit message type 1029 * @type: audit message type
1027 * @pid: target pid for netlink audit messages 1030 * @portid: target port id for netlink audit messages
1028 * @seq: netlink audit message sequence (serial) number 1031 * @seq: netlink audit message sequence (serial) number
1029 * @data: payload data 1032 * @data: payload data
1030 * @datasz: size of payload data 1033 * @datasz: size of payload data
1031 */ 1034 */
1032int audit_receive_filter(int type, int pid, int seq, void *data, size_t datasz) 1035int audit_rule_change(int type, __u32 portid, int seq, void *data,
1036 size_t datasz)
1033{ 1037{
1034 struct task_struct *tsk;
1035 struct audit_netlink_list *dest;
1036 int err = 0; 1038 int err = 0;
1037 struct audit_entry *entry; 1039 struct audit_entry *entry;
1038 1040
1039 switch (type) { 1041 switch (type) {
1040 case AUDIT_LIST_RULES:
1041 /* We can't just spew out the rules here because we might fill
1042 * the available socket buffer space and deadlock waiting for
1043 * auditctl to read from it... which isn't ever going to
1044 * happen if we're actually running in the context of auditctl
1045 * trying to _send_ the stuff */
1046
1047 dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL);
1048 if (!dest)
1049 return -ENOMEM;
1050 dest->pid = pid;
1051 skb_queue_head_init(&dest->q);
1052
1053 mutex_lock(&audit_filter_mutex);
1054 audit_list_rules(pid, seq, &dest->q);
1055 mutex_unlock(&audit_filter_mutex);
1056
1057 tsk = kthread_run(audit_send_list, dest, "audit_send_list");
1058 if (IS_ERR(tsk)) {
1059 skb_queue_purge(&dest->q);
1060 kfree(dest);
1061 err = PTR_ERR(tsk);
1062 }
1063 break;
1064 case AUDIT_ADD_RULE: 1042 case AUDIT_ADD_RULE:
1065 entry = audit_data_to_entry(data, datasz); 1043 entry = audit_data_to_entry(data, datasz);
1066 if (IS_ERR(entry)) 1044 if (IS_ERR(entry))
@@ -1087,6 +1065,46 @@ int audit_receive_filter(int type, int pid, int seq, void *data, size_t datasz)
1087 return err; 1065 return err;
1088} 1066}
1089 1067
1068/**
1069 * audit_list_rules_send - list the audit rules
1070 * @request_skb: skb of request we are replying to (used to target the reply)
1071 * @seq: netlink audit message sequence (serial) number
1072 */
1073int audit_list_rules_send(struct sk_buff *request_skb, int seq)
1074{
1075 u32 portid = NETLINK_CB(request_skb).portid;
1076 struct net *net = sock_net(NETLINK_CB(request_skb).sk);
1077 struct task_struct *tsk;
1078 struct audit_netlink_list *dest;
1079 int err = 0;
1080
1081 /* We can't just spew out the rules here because we might fill
1082 * the available socket buffer space and deadlock waiting for
1083 * auditctl to read from it... which isn't ever going to
1084 * happen if we're actually running in the context of auditctl
1085 * trying to _send_ the stuff */
1086
1087 dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL);
1088 if (!dest)
1089 return -ENOMEM;
1090 dest->net = get_net(net);
1091 dest->portid = portid;
1092 skb_queue_head_init(&dest->q);
1093
1094 mutex_lock(&audit_filter_mutex);
1095 audit_list_rules(portid, seq, &dest->q);
1096 mutex_unlock(&audit_filter_mutex);
1097
1098 tsk = kthread_run(audit_send_list, dest, "audit_send_list");
1099 if (IS_ERR(tsk)) {
1100 skb_queue_purge(&dest->q);
1101 kfree(dest);
1102 err = PTR_ERR(tsk);
1103 }
1104
1105 return err;
1106}
1107
1090int audit_comparator(u32 left, u32 op, u32 right) 1108int audit_comparator(u32 left, u32 op, u32 right)
1091{ 1109{
1092 switch (op) { 1110 switch (op) {
@@ -1276,19 +1294,22 @@ int audit_filter_user(int type)
1276{ 1294{
1277 enum audit_state state = AUDIT_DISABLED; 1295 enum audit_state state = AUDIT_DISABLED;
1278 struct audit_entry *e; 1296 struct audit_entry *e;
1279 int ret = 1; 1297 int rc, ret;
1298
1299 ret = 1; /* Audit by default */
1280 1300
1281 rcu_read_lock(); 1301 rcu_read_lock();
1282 list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) { 1302 list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) {
1283 if (audit_filter_user_rules(&e->rule, type, &state)) { 1303 rc = audit_filter_user_rules(&e->rule, type, &state);
1284 if (state == AUDIT_DISABLED) 1304 if (rc) {
1305 if (rc > 0 && state == AUDIT_DISABLED)
1285 ret = 0; 1306 ret = 0;
1286 break; 1307 break;
1287 } 1308 }
1288 } 1309 }
1289 rcu_read_unlock(); 1310 rcu_read_unlock();
1290 1311
1291 return ret; /* Audit by default */ 1312 return ret;
1292} 1313}
1293 1314
1294int audit_filter_type(int type) 1315int audit_filter_type(int type)
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 90594c9f7552..7aef2f4b6c64 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1719,7 +1719,7 @@ void audit_putname(struct filename *name)
1719 struct audit_context *context = current->audit_context; 1719 struct audit_context *context = current->audit_context;
1720 1720
1721 BUG_ON(!context); 1721 BUG_ON(!context);
1722 if (!context->in_syscall) { 1722 if (!name->aname || !context->in_syscall) {
1723#if AUDIT_DEBUG == 2 1723#if AUDIT_DEBUG == 2
1724 printk(KERN_ERR "%s:%d(:%d): final_putname(%p)\n", 1724 printk(KERN_ERR "%s:%d(:%d): final_putname(%p)\n",
1725 __FILE__, __LINE__, context->serial, name); 1725 __FILE__, __LINE__, context->serial, name);
@@ -1969,18 +1969,24 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid,
1969 int rc) 1969 int rc)
1970{ 1970{
1971 struct audit_buffer *ab; 1971 struct audit_buffer *ab;
1972 uid_t uid, ologinuid, nloginuid; 1972 uid_t uid, oldloginuid, loginuid;
1973
1974 if (!audit_enabled)
1975 return;
1973 1976
1974 uid = from_kuid(&init_user_ns, task_uid(current)); 1977 uid = from_kuid(&init_user_ns, task_uid(current));
1975 ologinuid = from_kuid(&init_user_ns, koldloginuid); 1978 oldloginuid = from_kuid(&init_user_ns, koldloginuid);
1976 nloginuid = from_kuid(&init_user_ns, kloginuid), 1979 loginuid = from_kuid(&init_user_ns, kloginuid),
1977 1980
1978 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); 1981 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN);
1979 if (!ab) 1982 if (!ab)
1980 return; 1983 return;
1981 audit_log_format(ab, "pid=%d uid=%u old auid=%u new auid=%u old " 1984 audit_log_format(ab, "pid=%d uid=%u"
1982 "ses=%u new ses=%u res=%d", current->pid, uid, ologinuid, 1985 " old-auid=%u new-auid=%u old-ses=%u new-ses=%u"
1983 nloginuid, oldsessionid, sessionid, !rc); 1986 " res=%d",
1987 current->pid, uid,
1988 oldloginuid, loginuid, oldsessionid, sessionid,
1989 !rc);
1984 audit_log_end(ab); 1990 audit_log_end(ab);
1985} 1991}
1986 1992
@@ -2008,7 +2014,7 @@ int audit_set_loginuid(kuid_t loginuid)
2008 2014
2009 /* are we setting or clearing? */ 2015 /* are we setting or clearing? */
2010 if (uid_valid(loginuid)) 2016 if (uid_valid(loginuid))
2011 sessionid = atomic_inc_return(&session_id); 2017 sessionid = (unsigned int)atomic_inc_return(&session_id);
2012 2018
2013 task->sessionid = sessionid; 2019 task->sessionid = sessionid;
2014 task->loginuid = loginuid; 2020 task->loginuid = loginuid;
@@ -2321,18 +2327,16 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
2321 2327
2322/** 2328/**
2323 * __audit_log_capset - store information about the arguments to the capset syscall 2329 * __audit_log_capset - store information about the arguments to the capset syscall
2324 * @pid: target pid of the capset call
2325 * @new: the new credentials 2330 * @new: the new credentials
2326 * @old: the old (current) credentials 2331 * @old: the old (current) credentials
2327 * 2332 *
2328 * Record the arguments userspace sent to sys_capset for later printing by the 2333 * Record the arguments userspace sent to sys_capset for later printing by the
2329 * audit system if applicable 2334 * audit system if applicable
2330 */ 2335 */
2331void __audit_log_capset(pid_t pid, 2336void __audit_log_capset(const struct cred *new, const struct cred *old)
2332 const struct cred *new, const struct cred *old)
2333{ 2337{
2334 struct audit_context *context = current->audit_context; 2338 struct audit_context *context = current->audit_context;
2335 context->capset.pid = pid; 2339 context->capset.pid = task_pid_nr(current);
2336 context->capset.cap.effective = new->cap_effective; 2340 context->capset.cap.effective = new->cap_effective;
2337 context->capset.cap.inheritable = new->cap_effective; 2341 context->capset.cap.inheritable = new->cap_effective;
2338 context->capset.cap.permitted = new->cap_permitted; 2342 context->capset.cap.permitted = new->cap_permitted;
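__audit_log_capset() above loses its pid parameter because the recorded pid is always that of the calling task, so the function now derives it itself with task_pid_nr(current). The same simplification in standalone userspace form, with a hypothetical log_capset():

#include <stdio.h>
#include <unistd.h>

/* before: log_capset(pid_t pid, ...) and every caller passed its own pid */
static void log_capset(void)
{
        /* after: the callee asks the OS for the caller's pid itself */
        printf("capset pid=%d\n", (int)getpid());
}

int main(void)
{
        log_capset();   /* callers (cf. kernel/capability.c below) drop the argument */
        return 0;
}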
@@ -2352,6 +2356,7 @@ static void audit_log_task(struct audit_buffer *ab)
2352 kuid_t auid, uid; 2356 kuid_t auid, uid;
2353 kgid_t gid; 2357 kgid_t gid;
2354 unsigned int sessionid; 2358 unsigned int sessionid;
2359 struct mm_struct *mm = current->mm;
2355 2360
2356 auid = audit_get_loginuid(current); 2361 auid = audit_get_loginuid(current);
2357 sessionid = audit_get_sessionid(current); 2362 sessionid = audit_get_sessionid(current);
@@ -2365,15 +2370,15 @@ static void audit_log_task(struct audit_buffer *ab)
2365 audit_log_task_context(ab); 2370 audit_log_task_context(ab);
2366 audit_log_format(ab, " pid=%d comm=", current->pid); 2371 audit_log_format(ab, " pid=%d comm=", current->pid);
2367 audit_log_untrustedstring(ab, current->comm); 2372 audit_log_untrustedstring(ab, current->comm);
2373 if (mm) {
2374 down_read(&mm->mmap_sem);
2375 if (mm->exe_file)
2376 audit_log_d_path(ab, " exe=", &mm->exe_file->f_path);
2377 up_read(&mm->mmap_sem);
2378 } else
2379 audit_log_format(ab, " exe=(null)");
2368} 2380}
2369 2381
2370static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr)
2371{
2372 audit_log_task(ab);
2373 audit_log_format(ab, " reason=");
2374 audit_log_string(ab, reason);
2375 audit_log_format(ab, " sig=%ld", signr);
2376}
2377/** 2382/**
2378 * audit_core_dumps - record information about processes that end abnormally 2383 * audit_core_dumps - record information about processes that end abnormally
2379 * @signr: signal value 2384 * @signr: signal value
@@ -2394,7 +2399,8 @@ void audit_core_dumps(long signr)
2394 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); 2399 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND);
2395 if (unlikely(!ab)) 2400 if (unlikely(!ab))
2396 return; 2401 return;
2397 audit_log_abend(ab, "memory violation", signr); 2402 audit_log_task(ab);
2403 audit_log_format(ab, " sig=%ld", signr);
2398 audit_log_end(ab); 2404 audit_log_end(ab);
2399} 2405}
2400 2406
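audit_log_task() above now appends an exe= field taken from mm->exe_file, dereferencing it only while mmap_sem is held for read, and audit_log_abend() is folded into audit_core_dumps() so the sig= field is emitted directly. A hedged userspace analogue of the locking pattern, with a pthread rwlock standing in for mmap_sem and a plain string standing in for the exe_file path:

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t map_lock = PTHREAD_RWLOCK_INITIALIZER;
static const char *exe_path = "/usr/bin/example";       /* may be replaced elsewhere */

static void log_task(long signr)
{
        /* take the reader lock only around the dereference, as the hunk does */
        pthread_rwlock_rdlock(&map_lock);
        if (exe_path)
                printf(" exe=%s", exe_path);
        else
                printf(" exe=(null)");
        pthread_rwlock_unlock(&map_lock);

        printf(" sig=%ld\n", signr);
}

int main(void)
{
        log_task(11);
        return 0;
}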
diff --git a/kernel/capability.c b/kernel/capability.c
index 4e66bf9275b0..34019c57888d 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -277,7 +277,7 @@ SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
277 if (ret < 0) 277 if (ret < 0)
278 goto error; 278 goto error;
279 279
280 audit_log_capset(pid, new, current_cred()); 280 audit_log_capset(new, current_cred());
281 281
282 return commit_creds(new); 282 return commit_creds(new);
283 283
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index bc1dcabe9217..0c753ddd223b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -41,7 +41,6 @@
41#include <linux/rcupdate.h> 41#include <linux/rcupdate.h>
42#include <linux/sched.h> 42#include <linux/sched.h>
43#include <linux/backing-dev.h> 43#include <linux/backing-dev.h>
44#include <linux/seq_file.h>
45#include <linux/slab.h> 44#include <linux/slab.h>
46#include <linux/magic.h> 45#include <linux/magic.h>
47#include <linux/spinlock.h> 46#include <linux/spinlock.h>
@@ -56,15 +55,20 @@
56#include <linux/pid_namespace.h> 55#include <linux/pid_namespace.h>
57#include <linux/idr.h> 56#include <linux/idr.h>
58#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ 57#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
59#include <linux/eventfd.h>
60#include <linux/poll.h>
61#include <linux/flex_array.h> /* used in cgroup_attach_task */ 58#include <linux/flex_array.h> /* used in cgroup_attach_task */
62#include <linux/kthread.h> 59#include <linux/kthread.h>
63#include <linux/file.h>
64 60
65#include <linux/atomic.h> 61#include <linux/atomic.h>
66 62
67/* 63/*
64 * pidlists linger the following amount before being destroyed. The goal
65 * is avoiding frequent destruction in the middle of consecutive read calls
66 * Expiring in the middle is a performance problem not a correctness one.
67 * 1 sec should be enough.
68 */
69#define CGROUP_PIDLIST_DESTROY_DELAY HZ
70
71/*
68 * cgroup_mutex is the master lock. Any modification to cgroup or its 72 * cgroup_mutex is the master lock. Any modification to cgroup or its
69 * hierarchy must be performed while holding it. 73 * hierarchy must be performed while holding it.
70 * 74 *
@@ -89,6 +93,19 @@ static DEFINE_MUTEX(cgroup_mutex);
89 93
90static DEFINE_MUTEX(cgroup_root_mutex); 94static DEFINE_MUTEX(cgroup_root_mutex);
91 95
96#define cgroup_assert_mutex_or_rcu_locked() \
97 rcu_lockdep_assert(rcu_read_lock_held() || \
98 lockdep_is_held(&cgroup_mutex), \
99 "cgroup_mutex or RCU read lock required");
100
101#ifdef CONFIG_LOCKDEP
102#define cgroup_assert_mutex_or_root_locked() \
103 WARN_ON_ONCE(debug_locks && (!lockdep_is_held(&cgroup_mutex) && \
104 !lockdep_is_held(&cgroup_root_mutex)))
105#else
106#define cgroup_assert_mutex_or_root_locked() do { } while (0)
107#endif
108
92/* 109/*
93 * cgroup destruction makes heavy use of work items and there can be a lot 110 * cgroup destruction makes heavy use of work items and there can be a lot
94 * of concurrent destructions. Use a separate workqueue so that cgroup 111 * of concurrent destructions. Use a separate workqueue so that cgroup
@@ -98,6 +115,12 @@ static DEFINE_MUTEX(cgroup_root_mutex);
98static struct workqueue_struct *cgroup_destroy_wq; 115static struct workqueue_struct *cgroup_destroy_wq;
99 116
100/* 117/*
118 * pidlist destructions need to be flushed on cgroup destruction. Use a
119 * separate workqueue as flush domain.
120 */
121static struct workqueue_struct *cgroup_pidlist_destroy_wq;
122
123/*
101 * Generate an array of cgroup subsystem pointers. At boot time, this is 124 * Generate an array of cgroup subsystem pointers. At boot time, this is
102 * populated with the built in subsystems, and modular subsystems are 125 * populated with the built in subsystems, and modular subsystems are
103 * registered after that. The mutable section of this array is protected by 126 * registered after that. The mutable section of this array is protected by
@@ -119,49 +142,6 @@ static struct cgroupfs_root cgroup_dummy_root;
119/* dummy_top is a shorthand for the dummy hierarchy's top cgroup */ 142/* dummy_top is a shorthand for the dummy hierarchy's top cgroup */
120static struct cgroup * const cgroup_dummy_top = &cgroup_dummy_root.top_cgroup; 143static struct cgroup * const cgroup_dummy_top = &cgroup_dummy_root.top_cgroup;
121 144
122/*
123 * cgroupfs file entry, pointed to from leaf dentry->d_fsdata.
124 */
125struct cfent {
126 struct list_head node;
127 struct dentry *dentry;
128 struct cftype *type;
129 struct cgroup_subsys_state *css;
130
131 /* file xattrs */
132 struct simple_xattrs xattrs;
133};
134
135/*
136 * cgroup_event represents events which userspace want to receive.
137 */
138struct cgroup_event {
139 /*
140 * css which the event belongs to.
141 */
142 struct cgroup_subsys_state *css;
143 /*
144 * Control file which the event associated.
145 */
146 struct cftype *cft;
147 /*
148 * eventfd to signal userspace about the event.
149 */
150 struct eventfd_ctx *eventfd;
151 /*
152 * Each of these stored in a list by the cgroup.
153 */
154 struct list_head list;
155 /*
156 * All fields below needed to unregister event when
157 * userspace closes eventfd.
158 */
159 poll_table pt;
160 wait_queue_head_t *wqh;
161 wait_queue_t wait;
162 struct work_struct remove;
163};
164
165/* The list of hierarchy roots */ 145/* The list of hierarchy roots */
166 146
167static LIST_HEAD(cgroup_roots); 147static LIST_HEAD(cgroup_roots);
@@ -200,6 +180,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp);
200static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], 180static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
201 bool is_add); 181 bool is_add);
202static int cgroup_file_release(struct inode *inode, struct file *file); 182static int cgroup_file_release(struct inode *inode, struct file *file);
183static void cgroup_pidlist_destroy_all(struct cgroup *cgrp);
203 184
204/** 185/**
205 * cgroup_css - obtain a cgroup's css for the specified subsystem 186 * cgroup_css - obtain a cgroup's css for the specified subsystem
@@ -262,16 +243,32 @@ static int notify_on_release(const struct cgroup *cgrp)
262} 243}
263 244
264/** 245/**
246 * for_each_css - iterate all css's of a cgroup
247 * @css: the iteration cursor
248 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
249 * @cgrp: the target cgroup to iterate css's of
250 *
251 * Should be called under cgroup_mutex.
252 */
253#define for_each_css(css, ssid, cgrp) \
254 for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
255 if (!((css) = rcu_dereference_check( \
256 (cgrp)->subsys[(ssid)], \
257 lockdep_is_held(&cgroup_mutex)))) { } \
258 else
259
260/**
265 * for_each_subsys - iterate all loaded cgroup subsystems 261 * for_each_subsys - iterate all loaded cgroup subsystems
266 * @ss: the iteration cursor 262 * @ss: the iteration cursor
267 * @i: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end 263 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
268 * 264 *
269 * Should be called under cgroup_mutex. 265 * Iterates through all loaded subsystems. Should be called under
266 * cgroup_mutex or cgroup_root_mutex.
270 */ 267 */
271#define for_each_subsys(ss, i) \ 268#define for_each_subsys(ss, ssid) \
272 for ((i) = 0; (i) < CGROUP_SUBSYS_COUNT; (i)++) \ 269 for (({ cgroup_assert_mutex_or_root_locked(); (ssid) = 0; }); \
273 if (({ lockdep_assert_held(&cgroup_mutex); \ 270 (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
274 !((ss) = cgroup_subsys[i]); })) { } \ 271 if (!((ss) = cgroup_subsys[(ssid)])) { } \
275 else 272 else
276 273
277/** 274/**
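The new for_each_css() iterator above walks every populated css of a single cgroup, while for_each_subsys() now accepts either cgroup_mutex or cgroup_root_mutex. A hedged sketch of a caller; it would have to live in kernel/cgroup.c next to the macro definition, and demo_show_attached() is an invented name:

/* assumes placement in kernel/cgroup.c, where for_each_css() is defined */
static void demo_show_attached(struct cgroup *cgrp)
{
        struct cgroup_subsys_state *css;
        int ssid;

        lockdep_assert_held(&cgroup_mutex);     /* required by for_each_css() */

        for_each_css(css, ssid, cgrp)
                pr_info("cgroup %d: subsys %d (%s) attached\n",
                        cgrp->id, ssid, css->ss->name);
}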
@@ -286,10 +283,6 @@ static int notify_on_release(const struct cgroup *cgrp)
286 for ((i) = 0; (i) < CGROUP_BUILTIN_SUBSYS_COUNT && \ 283 for ((i) = 0; (i) < CGROUP_BUILTIN_SUBSYS_COUNT && \
287 (((ss) = cgroup_subsys[i]) || true); (i)++) 284 (((ss) = cgroup_subsys[i]) || true); (i)++)
288 285
289/* iterate each subsystem attached to a hierarchy */
290#define for_each_root_subsys(root, ss) \
291 list_for_each_entry((ss), &(root)->subsys_list, sibling)
292
293/* iterate across the active hierarchies */ 286/* iterate across the active hierarchies */
294#define for_each_active_root(root) \ 287#define for_each_active_root(root) \
295 list_for_each_entry((root), &cgroup_roots, root_list) 288 list_for_each_entry((root), &cgroup_roots, root_list)
@@ -863,11 +856,7 @@ static void cgroup_free_fn(struct work_struct *work)
863 */ 856 */
864 deactivate_super(cgrp->root->sb); 857 deactivate_super(cgrp->root->sb);
865 858
866 /* 859 cgroup_pidlist_destroy_all(cgrp);
867 * if we're getting rid of the cgroup, refcount should ensure
868 * that there are no pidlists left.
869 */
870 BUG_ON(!list_empty(&cgrp->pidlists));
871 860
872 simple_xattrs_free(&cgrp->xattrs); 861 simple_xattrs_free(&cgrp->xattrs);
873 862
@@ -897,7 +886,9 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
897 * per-subsystem and moved to css->id so that lookups are 886 * per-subsystem and moved to css->id so that lookups are
898 * successful until the target css is released. 887 * successful until the target css is released.
899 */ 888 */
889 mutex_lock(&cgroup_mutex);
900 idr_remove(&cgrp->root->cgroup_idr, cgrp->id); 890 idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
891 mutex_unlock(&cgroup_mutex);
901 cgrp->id = -1; 892 cgrp->id = -1;
902 893
903 call_rcu(&cgrp->rcu_head, cgroup_free_rcu); 894 call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
@@ -1050,7 +1041,6 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1050 cgroup_css(cgroup_dummy_top, ss)); 1041 cgroup_css(cgroup_dummy_top, ss));
1051 cgroup_css(cgrp, ss)->cgroup = cgrp; 1042 cgroup_css(cgrp, ss)->cgroup = cgrp;
1052 1043
1053 list_move(&ss->sibling, &root->subsys_list);
1054 ss->root = root; 1044 ss->root = root;
1055 if (ss->bind) 1045 if (ss->bind)
1056 ss->bind(cgroup_css(cgrp, ss)); 1046 ss->bind(cgroup_css(cgrp, ss));
@@ -1069,7 +1059,6 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1069 RCU_INIT_POINTER(cgrp->subsys[i], NULL); 1059 RCU_INIT_POINTER(cgrp->subsys[i], NULL);
1070 1060
1071 cgroup_subsys[i]->root = &cgroup_dummy_root; 1061 cgroup_subsys[i]->root = &cgroup_dummy_root;
1072 list_move(&ss->sibling, &cgroup_dummy_root.subsys_list);
1073 1062
1074 /* subsystem is now free - drop reference on module */ 1063 /* subsystem is now free - drop reference on module */
1075 module_put(ss->module); 1064 module_put(ss->module);
@@ -1096,10 +1085,12 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
1096{ 1085{
1097 struct cgroupfs_root *root = dentry->d_sb->s_fs_info; 1086 struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
1098 struct cgroup_subsys *ss; 1087 struct cgroup_subsys *ss;
1088 int ssid;
1099 1089
1100 mutex_lock(&cgroup_root_mutex); 1090 mutex_lock(&cgroup_root_mutex);
1101 for_each_root_subsys(root, ss) 1091 for_each_subsys(ss, ssid)
1102 seq_printf(seq, ",%s", ss->name); 1092 if (root->subsys_mask & (1 << ssid))
1093 seq_printf(seq, ",%s", ss->name);
1103 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) 1094 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
1104 seq_puts(seq, ",sane_behavior"); 1095 seq_puts(seq, ",sane_behavior");
1105 if (root->flags & CGRP_ROOT_NOPREFIX) 1096 if (root->flags & CGRP_ROOT_NOPREFIX)
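cgroup_show_options() above stops walking the per-root subsys_list and instead tests bits of root->subsys_mask while iterating all subsystems. The bit test itself, modeled in plain userspace C with made-up subsystem names:

#include <stdio.h>

int main(void)
{
        static const char * const names[] = { "cpu", "cpuacct", "memory", "blkio" };
        unsigned long subsys_mask = (1UL << 0) | (1UL << 2);    /* cpu,memory */

        for (unsigned int ssid = 0; ssid < sizeof(names) / sizeof(names[0]); ssid++)
                if (subsys_mask & (1UL << ssid))
                        printf(",%s", names[ssid]);
        putchar('\n');
        return 0;
}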
@@ -1362,8 +1353,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1362 INIT_LIST_HEAD(&cgrp->pidlists); 1353 INIT_LIST_HEAD(&cgrp->pidlists);
1363 mutex_init(&cgrp->pidlist_mutex); 1354 mutex_init(&cgrp->pidlist_mutex);
1364 cgrp->dummy_css.cgroup = cgrp; 1355 cgrp->dummy_css.cgroup = cgrp;
1365 INIT_LIST_HEAD(&cgrp->event_list);
1366 spin_lock_init(&cgrp->event_list_lock);
1367 simple_xattrs_init(&cgrp->xattrs); 1356 simple_xattrs_init(&cgrp->xattrs);
1368} 1357}
1369 1358
@@ -1371,7 +1360,6 @@ static void init_cgroup_root(struct cgroupfs_root *root)
1371{ 1360{
1372 struct cgroup *cgrp = &root->top_cgroup; 1361 struct cgroup *cgrp = &root->top_cgroup;
1373 1362
1374 INIT_LIST_HEAD(&root->subsys_list);
1375 INIT_LIST_HEAD(&root->root_list); 1363 INIT_LIST_HEAD(&root->root_list);
1376 root->number_of_cgroups = 1; 1364 root->number_of_cgroups = 1;
1377 cgrp->root = root; 1365 cgrp->root = root;
@@ -1580,10 +1568,10 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1580 mutex_lock(&cgroup_mutex); 1568 mutex_lock(&cgroup_mutex);
1581 mutex_lock(&cgroup_root_mutex); 1569 mutex_lock(&cgroup_root_mutex);
1582 1570
1583 root_cgrp->id = idr_alloc(&root->cgroup_idr, root_cgrp, 1571 ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL);
1584 0, 1, GFP_KERNEL); 1572 if (ret < 0)
1585 if (root_cgrp->id < 0)
1586 goto unlock_drop; 1573 goto unlock_drop;
1574 root_cgrp->id = ret;
1587 1575
1588 /* Check for name clashes with existing mounts */ 1576 /* Check for name clashes with existing mounts */
1589 ret = -EBUSY; 1577 ret = -EBUSY;
@@ -1693,7 +1681,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1693 return ERR_PTR(ret); 1681 return ERR_PTR(ret);
1694} 1682}
1695 1683
1696static void cgroup_kill_sb(struct super_block *sb) { 1684static void cgroup_kill_sb(struct super_block *sb)
1685{
1697 struct cgroupfs_root *root = sb->s_fs_info; 1686 struct cgroupfs_root *root = sb->s_fs_info;
1698 struct cgroup *cgrp = &root->top_cgroup; 1687 struct cgroup *cgrp = &root->top_cgroup;
1699 struct cgrp_cset_link *link, *tmp_link; 1688 struct cgrp_cset_link *link, *tmp_link;
@@ -1976,8 +1965,8 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
1976 bool threadgroup) 1965 bool threadgroup)
1977{ 1966{
1978 int retval, i, group_size; 1967 int retval, i, group_size;
1979 struct cgroup_subsys *ss, *failed_ss = NULL;
1980 struct cgroupfs_root *root = cgrp->root; 1968 struct cgroupfs_root *root = cgrp->root;
1969 struct cgroup_subsys_state *css, *failed_css = NULL;
1981 /* threadgroup list cursor and array */ 1970 /* threadgroup list cursor and array */
1982 struct task_struct *leader = tsk; 1971 struct task_struct *leader = tsk;
1983 struct task_and_cgroup *tc; 1972 struct task_and_cgroup *tc;
@@ -2050,13 +2039,11 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2050 /* 2039 /*
2051 * step 1: check that we can legitimately attach to the cgroup. 2040 * step 1: check that we can legitimately attach to the cgroup.
2052 */ 2041 */
2053 for_each_root_subsys(root, ss) { 2042 for_each_css(css, i, cgrp) {
2054 struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); 2043 if (css->ss->can_attach) {
2055 2044 retval = css->ss->can_attach(css, &tset);
2056 if (ss->can_attach) {
2057 retval = ss->can_attach(css, &tset);
2058 if (retval) { 2045 if (retval) {
2059 failed_ss = ss; 2046 failed_css = css;
2060 goto out_cancel_attach; 2047 goto out_cancel_attach;
2061 } 2048 }
2062 } 2049 }
@@ -2092,12 +2079,9 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2092 /* 2079 /*
2093 * step 4: do subsystem attach callbacks. 2080 * step 4: do subsystem attach callbacks.
2094 */ 2081 */
2095 for_each_root_subsys(root, ss) { 2082 for_each_css(css, i, cgrp)
2096 struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); 2083 if (css->ss->attach)
2097 2084 css->ss->attach(css, &tset);
2098 if (ss->attach)
2099 ss->attach(css, &tset);
2100 }
2101 2085
2102 /* 2086 /*
2103 * step 5: success! and cleanup 2087 * step 5: success! and cleanup
@@ -2114,13 +2098,11 @@ out_put_css_set_refs:
2114 } 2098 }
2115out_cancel_attach: 2099out_cancel_attach:
2116 if (retval) { 2100 if (retval) {
2117 for_each_root_subsys(root, ss) { 2101 for_each_css(css, i, cgrp) {
2118 struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); 2102 if (css == failed_css)
2119
2120 if (ss == failed_ss)
2121 break; 2103 break;
2122 if (ss->cancel_attach) 2104 if (css->ss->cancel_attach)
2123 ss->cancel_attach(css, &tset); 2105 css->ss->cancel_attach(css, &tset);
2124 } 2106 }
2125 } 2107 }
2126out_free_group_list: 2108out_free_group_list:
@@ -2148,7 +2130,7 @@ retry_find_task:
2148 tsk = find_task_by_vpid(pid); 2130 tsk = find_task_by_vpid(pid);
2149 if (!tsk) { 2131 if (!tsk) {
2150 rcu_read_unlock(); 2132 rcu_read_unlock();
2151 ret= -ESRCH; 2133 ret = -ESRCH;
2152 goto out_unlock_cgroup; 2134 goto out_unlock_cgroup;
2153 } 2135 }
2154 /* 2136 /*
@@ -2260,10 +2242,9 @@ static int cgroup_release_agent_write(struct cgroup_subsys_state *css,
2260 return 0; 2242 return 0;
2261} 2243}
2262 2244
2263static int cgroup_release_agent_show(struct cgroup_subsys_state *css, 2245static int cgroup_release_agent_show(struct seq_file *seq, void *v)
2264 struct cftype *cft, struct seq_file *seq)
2265{ 2246{
2266 struct cgroup *cgrp = css->cgroup; 2247 struct cgroup *cgrp = seq_css(seq)->cgroup;
2267 2248
2268 if (!cgroup_lock_live_group(cgrp)) 2249 if (!cgroup_lock_live_group(cgrp))
2269 return -ENODEV; 2250 return -ENODEV;
@@ -2273,174 +2254,129 @@ static int cgroup_release_agent_show(struct cgroup_subsys_state *css,
2273 return 0; 2254 return 0;
2274} 2255}
2275 2256
2276static int cgroup_sane_behavior_show(struct cgroup_subsys_state *css, 2257static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
2277 struct cftype *cft, struct seq_file *seq)
2278{ 2258{
2279 seq_printf(seq, "%d\n", cgroup_sane_behavior(css->cgroup)); 2259 struct cgroup *cgrp = seq_css(seq)->cgroup;
2260
2261 seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp));
2280 return 0; 2262 return 0;
2281} 2263}
2282 2264
2283/* A buffer size big enough for numbers or short strings */ 2265/* A buffer size big enough for numbers or short strings */
2284#define CGROUP_LOCAL_BUFFER_SIZE 64 2266#define CGROUP_LOCAL_BUFFER_SIZE 64
2285 2267
2286static ssize_t cgroup_write_X64(struct cgroup_subsys_state *css, 2268static ssize_t cgroup_file_write(struct file *file, const char __user *userbuf,
2287 struct cftype *cft, struct file *file, 2269 size_t nbytes, loff_t *ppos)
2288 const char __user *userbuf, size_t nbytes,
2289 loff_t *unused_ppos)
2290{ 2270{
2291 char buffer[CGROUP_LOCAL_BUFFER_SIZE]; 2271 struct cfent *cfe = __d_cfe(file->f_dentry);
2292 int retval = 0; 2272 struct cftype *cft = __d_cft(file->f_dentry);
2293 char *end; 2273 struct cgroup_subsys_state *css = cfe->css;
2274 size_t max_bytes = cft->max_write_len ?: CGROUP_LOCAL_BUFFER_SIZE - 1;
2275 char *buf;
2276 int ret;
2294 2277
2295 if (!nbytes) 2278 if (nbytes >= max_bytes)
2296 return -EINVAL;
2297 if (nbytes >= sizeof(buffer))
2298 return -E2BIG; 2279 return -E2BIG;
2299 if (copy_from_user(buffer, userbuf, nbytes))
2300 return -EFAULT;
2301 2280
2302 buffer[nbytes] = 0; /* nul-terminate */ 2281 buf = kmalloc(nbytes + 1, GFP_KERNEL);
2303 if (cft->write_u64) { 2282 if (!buf)
2304 u64 val = simple_strtoull(strstrip(buffer), &end, 0); 2283 return -ENOMEM;
2305 if (*end) 2284
2306 return -EINVAL; 2285 if (copy_from_user(buf, userbuf, nbytes)) {
2307 retval = cft->write_u64(css, cft, val); 2286 ret = -EFAULT;
2287 goto out_free;
2288 }
2289
2290 buf[nbytes] = '\0';
2291
2292 if (cft->write_string) {
2293 ret = cft->write_string(css, cft, strstrip(buf));
2294 } else if (cft->write_u64) {
2295 unsigned long long v;
2296 ret = kstrtoull(buf, 0, &v);
2297 if (!ret)
2298 ret = cft->write_u64(css, cft, v);
2299 } else if (cft->write_s64) {
2300 long long v;
2301 ret = kstrtoll(buf, 0, &v);
2302 if (!ret)
2303 ret = cft->write_s64(css, cft, v);
2304 } else if (cft->trigger) {
2305 ret = cft->trigger(css, (unsigned int)cft->private);
2308 } else { 2306 } else {
2309 s64 val = simple_strtoll(strstrip(buffer), &end, 0); 2307 ret = -EINVAL;
2310 if (*end)
2311 return -EINVAL;
2312 retval = cft->write_s64(css, cft, val);
2313 } 2308 }
2314 if (!retval) 2309out_free:
2315 retval = nbytes; 2310 kfree(buf);
2316 return retval; 2311 return ret ?: nbytes;
2317} 2312}
2318 2313
2319static ssize_t cgroup_write_string(struct cgroup_subsys_state *css, 2314/*
2320 struct cftype *cft, struct file *file, 2315 * seqfile ops/methods for returning structured data. Currently just
2321 const char __user *userbuf, size_t nbytes, 2316 * supports string->u64 maps, but can be extended in future.
2322 loff_t *unused_ppos) 2317 */
2318
2319static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
2323{ 2320{
2324 char local_buffer[CGROUP_LOCAL_BUFFER_SIZE]; 2321 struct cftype *cft = seq_cft(seq);
2325 int retval = 0;
2326 size_t max_bytes = cft->max_write_len;
2327 char *buffer = local_buffer;
2328 2322
2329 if (!max_bytes) 2323 if (cft->seq_start) {
2330 max_bytes = sizeof(local_buffer) - 1; 2324 return cft->seq_start(seq, ppos);
2331 if (nbytes >= max_bytes) 2325 } else {
2332 return -E2BIG; 2326 /*
2333 /* Allocate a dynamic buffer if we need one */ 2327 * The same behavior and code as single_open(). Returns
2334 if (nbytes >= sizeof(local_buffer)) { 2328 * !NULL if pos is at the beginning; otherwise, NULL.
2335 buffer = kmalloc(nbytes + 1, GFP_KERNEL); 2329 */
2336 if (buffer == NULL) 2330 return NULL + !*ppos;
2337 return -ENOMEM;
2338 }
2339 if (nbytes && copy_from_user(buffer, userbuf, nbytes)) {
2340 retval = -EFAULT;
2341 goto out;
2342 } 2331 }
2343
2344 buffer[nbytes] = 0; /* nul-terminate */
2345 retval = cft->write_string(css, cft, strstrip(buffer));
2346 if (!retval)
2347 retval = nbytes;
2348out:
2349 if (buffer != local_buffer)
2350 kfree(buffer);
2351 return retval;
2352} 2332}
2353 2333
2354static ssize_t cgroup_file_write(struct file *file, const char __user *buf, 2334static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
2355 size_t nbytes, loff_t *ppos)
2356{ 2335{
2357 struct cfent *cfe = __d_cfe(file->f_dentry); 2336 struct cftype *cft = seq_cft(seq);
2358 struct cftype *cft = __d_cft(file->f_dentry);
2359 struct cgroup_subsys_state *css = cfe->css;
2360 2337
2361 if (cft->write) 2338 if (cft->seq_next) {
2362 return cft->write(css, cft, file, buf, nbytes, ppos); 2339 return cft->seq_next(seq, v, ppos);
2363 if (cft->write_u64 || cft->write_s64) 2340 } else {
2364 return cgroup_write_X64(css, cft, file, buf, nbytes, ppos); 2341 /*
2365 if (cft->write_string) 2342 * The same behavior and code as single_open(), always
2366 return cgroup_write_string(css, cft, file, buf, nbytes, ppos); 2343 * terminate after the initial read.
2367 if (cft->trigger) { 2344 */
2368 int ret = cft->trigger(css, (unsigned int)cft->private); 2345 ++*ppos;
2369 return ret ? ret : nbytes; 2346 return NULL;
2370 } 2347 }
2371 return -EINVAL;
2372} 2348}
2373 2349
2374static ssize_t cgroup_read_u64(struct cgroup_subsys_state *css, 2350static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
2375 struct cftype *cft, struct file *file,
2376 char __user *buf, size_t nbytes, loff_t *ppos)
2377{ 2351{
2378 char tmp[CGROUP_LOCAL_BUFFER_SIZE]; 2352 struct cftype *cft = seq_cft(seq);
2379 u64 val = cft->read_u64(css, cft);
2380 int len = sprintf(tmp, "%llu\n", (unsigned long long) val);
2381 2353
2382 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); 2354 if (cft->seq_stop)
2355 cft->seq_stop(seq, v);
2383} 2356}
2384 2357
2385static ssize_t cgroup_read_s64(struct cgroup_subsys_state *css, 2358static int cgroup_seqfile_show(struct seq_file *m, void *arg)
2386 struct cftype *cft, struct file *file,
2387 char __user *buf, size_t nbytes, loff_t *ppos)
2388{ 2359{
2389 char tmp[CGROUP_LOCAL_BUFFER_SIZE]; 2360 struct cftype *cft = seq_cft(m);
2390 s64 val = cft->read_s64(css, cft); 2361 struct cgroup_subsys_state *css = seq_css(m);
2391 int len = sprintf(tmp, "%lld\n", (long long) val);
2392
2393 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
2394}
2395 2362
2396static ssize_t cgroup_file_read(struct file *file, char __user *buf, 2363 if (cft->seq_show)
2397 size_t nbytes, loff_t *ppos) 2364 return cft->seq_show(m, arg);
2398{
2399 struct cfent *cfe = __d_cfe(file->f_dentry);
2400 struct cftype *cft = __d_cft(file->f_dentry);
2401 struct cgroup_subsys_state *css = cfe->css;
2402 2365
2403 if (cft->read)
2404 return cft->read(css, cft, file, buf, nbytes, ppos);
2405 if (cft->read_u64) 2366 if (cft->read_u64)
2406 return cgroup_read_u64(css, cft, file, buf, nbytes, ppos); 2367 seq_printf(m, "%llu\n", cft->read_u64(css, cft));
2407 if (cft->read_s64) 2368 else if (cft->read_s64)
2408 return cgroup_read_s64(css, cft, file, buf, nbytes, ppos); 2369 seq_printf(m, "%lld\n", cft->read_s64(css, cft));
2409 return -EINVAL; 2370 else
2410} 2371 return -EINVAL;
2411 2372 return 0;
2412/*
2413 * seqfile ops/methods for returning structured data. Currently just
2414 * supports string->u64 maps, but can be extended in future.
2415 */
2416
2417static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
2418{
2419 struct seq_file *sf = cb->state;
2420 return seq_printf(sf, "%s %llu\n", key, (unsigned long long)value);
2421}
2422
2423static int cgroup_seqfile_show(struct seq_file *m, void *arg)
2424{
2425 struct cfent *cfe = m->private;
2426 struct cftype *cft = cfe->type;
2427 struct cgroup_subsys_state *css = cfe->css;
2428
2429 if (cft->read_map) {
2430 struct cgroup_map_cb cb = {
2431 .fill = cgroup_map_add,
2432 .state = m,
2433 };
2434 return cft->read_map(css, cft, &cb);
2435 }
2436 return cft->read_seq_string(css, cft, m);
2437} 2373}
2438 2374
2439static const struct file_operations cgroup_seqfile_operations = { 2375static struct seq_operations cgroup_seq_operations = {
2440 .read = seq_read, 2376 .start = cgroup_seqfile_start,
2441 .write = cgroup_file_write, 2377 .next = cgroup_seqfile_next,
2442 .llseek = seq_lseek, 2378 .stop = cgroup_seqfile_stop,
2443 .release = cgroup_file_release, 2379 .show = cgroup_seqfile_show,
2444}; 2380};
2445 2381
2446static int cgroup_file_open(struct inode *inode, struct file *file) 2382static int cgroup_file_open(struct inode *inode, struct file *file)
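The large hunk above moves cgroup control files onto regular seq_file start/next/stop/show operations; when a cftype supplies no iterator of its own, cgroup_seqfile_start()/next() fall back to the single_open() idiom of yielding exactly one non-NULL token. A hedged kernel-style sketch of that idiom in isolation (the demo_* names are invented, not part of the patch):

#include <linux/seq_file.h>

static void *demo_seq_start(struct seq_file *seq, loff_t *ppos)
{
        return NULL + !*ppos;   /* non-NULL only for the very first pass */
}

static void *demo_seq_next(struct seq_file *seq, void *v, loff_t *ppos)
{
        ++*ppos;
        return NULL;            /* always terminate after one record */
}

static void demo_seq_stop(struct seq_file *seq, void *v)
{
}

static int demo_seq_show(struct seq_file *seq, void *v)
{
        seq_printf(seq, "%d\n", 42);
        return 0;
}

static const struct seq_operations demo_seq_ops = {
        .start  = demo_seq_start,
        .next   = demo_seq_next,
        .stop   = demo_seq_stop,
        .show   = demo_seq_show,
};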
@@ -2449,6 +2385,7 @@ static int cgroup_file_open(struct inode *inode, struct file *file)
2449 struct cftype *cft = __d_cft(file->f_dentry); 2385 struct cftype *cft = __d_cft(file->f_dentry);
2450 struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent); 2386 struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent);
2451 struct cgroup_subsys_state *css; 2387 struct cgroup_subsys_state *css;
2388 struct cgroup_open_file *of;
2452 int err; 2389 int err;
2453 2390
2454 err = generic_file_open(inode, file); 2391 err = generic_file_open(inode, file);
@@ -2478,32 +2415,26 @@ static int cgroup_file_open(struct inode *inode, struct file *file)
2478 WARN_ON_ONCE(cfe->css && cfe->css != css); 2415 WARN_ON_ONCE(cfe->css && cfe->css != css);
2479 cfe->css = css; 2416 cfe->css = css;
2480 2417
2481 if (cft->read_map || cft->read_seq_string) { 2418 of = __seq_open_private(file, &cgroup_seq_operations,
2482 file->f_op = &cgroup_seqfile_operations; 2419 sizeof(struct cgroup_open_file));
2483 err = single_open(file, cgroup_seqfile_show, cfe); 2420 if (of) {
2484 } else if (cft->open) { 2421 of->cfe = cfe;
2485 err = cft->open(inode, file); 2422 return 0;
2486 } 2423 }
2487 2424
2488 if (css->ss && err) 2425 if (css->ss)
2489 css_put(css); 2426 css_put(css);
2490 return err; 2427 return -ENOMEM;
2491} 2428}
2492 2429
2493static int cgroup_file_release(struct inode *inode, struct file *file) 2430static int cgroup_file_release(struct inode *inode, struct file *file)
2494{ 2431{
2495 struct cfent *cfe = __d_cfe(file->f_dentry); 2432 struct cfent *cfe = __d_cfe(file->f_dentry);
2496 struct cftype *cft = __d_cft(file->f_dentry);
2497 struct cgroup_subsys_state *css = cfe->css; 2433 struct cgroup_subsys_state *css = cfe->css;
2498 int ret = 0;
2499 2434
2500 if (cft->release)
2501 ret = cft->release(inode, file);
2502 if (css->ss) 2435 if (css->ss)
2503 css_put(css); 2436 css_put(css);
2504 if (file->f_op == &cgroup_seqfile_operations) 2437 return seq_release_private(inode, file);
2505 single_release(inode, file);
2506 return ret;
2507} 2438}
2508 2439
2509/* 2440/*
@@ -2614,7 +2545,7 @@ static ssize_t cgroup_listxattr(struct dentry *dentry, char *buf, size_t size)
2614} 2545}
2615 2546
2616static const struct file_operations cgroup_file_operations = { 2547static const struct file_operations cgroup_file_operations = {
2617 .read = cgroup_file_read, 2548 .read = seq_read,
2618 .write = cgroup_file_write, 2549 .write = cgroup_file_write,
2619 .llseek = generic_file_llseek, 2550 .llseek = generic_file_llseek,
2620 .open = cgroup_file_open, 2551 .open = cgroup_file_open,
@@ -2639,16 +2570,6 @@ static const struct inode_operations cgroup_dir_inode_operations = {
2639 .removexattr = cgroup_removexattr, 2570 .removexattr = cgroup_removexattr,
2640}; 2571};
2641 2572
2642/*
2643 * Check if a file is a control file
2644 */
2645static inline struct cftype *__file_cft(struct file *file)
2646{
2647 if (file_inode(file)->i_fop != &cgroup_file_operations)
2648 return ERR_PTR(-EINVAL);
2649 return __d_cft(file->f_dentry);
2650}
2651
2652static int cgroup_create_file(struct dentry *dentry, umode_t mode, 2573static int cgroup_create_file(struct dentry *dentry, umode_t mode,
2653 struct super_block *sb) 2574 struct super_block *sb)
2654{ 2575{
@@ -2706,12 +2627,11 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
2706 if (cft->mode) 2627 if (cft->mode)
2707 return cft->mode; 2628 return cft->mode;
2708 2629
2709 if (cft->read || cft->read_u64 || cft->read_s64 || 2630 if (cft->read_u64 || cft->read_s64 || cft->seq_show)
2710 cft->read_map || cft->read_seq_string)
2711 mode |= S_IRUGO; 2631 mode |= S_IRUGO;
2712 2632
2713 if (cft->write || cft->write_u64 || cft->write_s64 || 2633 if (cft->write_u64 || cft->write_s64 || cft->write_string ||
2714 cft->write_string || cft->trigger) 2634 cft->trigger)
2715 mode |= S_IWUSR; 2635 mode |= S_IWUSR;
2716 2636
2717 return mode; 2637 return mode;
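cgroup_file_mode() above now keys the read/write permission bits off the remaining cftype methods; together with the earlier rewrite of cgroup_file_write(), numeric writes go through kstrtoull()/kstrtoll(), which reject trailing garbage so the old manual *end check disappears. A userspace model of that stricter parse, with a hypothetical parse_u64() behaving roughly like kstrtoull():

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* rejects empty input and trailing characters, roughly like kstrtoull() */
static int parse_u64(const char *s, unsigned long long *out)
{
        char *end;

        errno = 0;
        *out = strtoull(s, &end, 0);
        if (errno || end == s || *end != '\0')
                return -EINVAL;
        return 0;
}

int main(void)
{
        unsigned long long v;

        printf("\"123\" -> %d\n", parse_u64("123", &v));        /* 0 */
        printf("\"12x\" -> %d\n", parse_u64("12x", &v));        /* -EINVAL */
        return 0;
}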
@@ -2845,10 +2765,7 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add)
2845 */ 2765 */
2846 update_before = cgroup_serial_nr_next; 2766 update_before = cgroup_serial_nr_next;
2847 2767
2848 mutex_unlock(&cgroup_mutex);
2849
2850 /* add/rm files for all cgroups created before */ 2768 /* add/rm files for all cgroups created before */
2851 rcu_read_lock();
2852 css_for_each_descendant_pre(css, cgroup_css(root, ss)) { 2769 css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
2853 struct cgroup *cgrp = css->cgroup; 2770 struct cgroup *cgrp = css->cgroup;
2854 2771
@@ -2857,23 +2774,19 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add)
2857 2774
2858 inode = cgrp->dentry->d_inode; 2775 inode = cgrp->dentry->d_inode;
2859 dget(cgrp->dentry); 2776 dget(cgrp->dentry);
2860 rcu_read_unlock();
2861
2862 dput(prev); 2777 dput(prev);
2863 prev = cgrp->dentry; 2778 prev = cgrp->dentry;
2864 2779
2780 mutex_unlock(&cgroup_mutex);
2865 mutex_lock(&inode->i_mutex); 2781 mutex_lock(&inode->i_mutex);
2866 mutex_lock(&cgroup_mutex); 2782 mutex_lock(&cgroup_mutex);
2867 if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp)) 2783 if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp))
2868 ret = cgroup_addrm_files(cgrp, cfts, is_add); 2784 ret = cgroup_addrm_files(cgrp, cfts, is_add);
2869 mutex_unlock(&cgroup_mutex);
2870 mutex_unlock(&inode->i_mutex); 2785 mutex_unlock(&inode->i_mutex);
2871
2872 rcu_read_lock();
2873 if (ret) 2786 if (ret)
2874 break; 2787 break;
2875 } 2788 }
2876 rcu_read_unlock(); 2789 mutex_unlock(&cgroup_mutex);
2877 dput(prev); 2790 dput(prev);
2878 deactivate_super(sb); 2791 deactivate_super(sb);
2879 return ret; 2792 return ret;
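The cgroup_cfts_commit() hunk above drops the RCU read lock around the descendant walk (possible now that the traversal helpers accept cgroup_mutex, see below) and reorders the locking so cgroup_mutex is released before each inode's i_mutex is taken and then re-acquired. A userspace pthread sketch of that release/re-acquire shape; the two mutexes stand in for cgroup_mutex and i_mutex:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t outer = PTHREAD_MUTEX_INITIALIZER;       /* cgroup_mutex */
static pthread_mutex_t inode = PTHREAD_MUTEX_INITIALIZER;       /* i_mutex */

static void update_one(int id)
{
        pthread_mutex_unlock(&outer);   /* give up the outer lock first */
        pthread_mutex_lock(&inode);
        pthread_mutex_lock(&outer);     /* re-acquire in the fixed order */
        printf("updating files of cgroup %d\n", id);
        pthread_mutex_unlock(&inode);   /* keep the outer lock for the next step */
}

int main(void)
{
        pthread_mutex_lock(&outer);
        for (int id = 0; id < 3; id++)
                update_one(id);
        pthread_mutex_unlock(&outer);
        return 0;
}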
@@ -2992,9 +2905,14 @@ static void cgroup_enable_task_cg_lists(void)
2992 * We should check if the process is exiting, otherwise 2905 * We should check if the process is exiting, otherwise
2993 * it will race with cgroup_exit() in that the list 2906 * it will race with cgroup_exit() in that the list
2994 * entry won't be deleted though the process has exited. 2907 * entry won't be deleted though the process has exited.
2908 * Do it while holding siglock so that we don't end up
2909 * racing against cgroup_exit().
2995 */ 2910 */
2911 spin_lock_irq(&p->sighand->siglock);
2996 if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list)) 2912 if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list))
2997 list_add(&p->cg_list, &task_css_set(p)->tasks); 2913 list_add(&p->cg_list, &task_css_set(p)->tasks);
2914 spin_unlock_irq(&p->sighand->siglock);
2915
2998 task_unlock(p); 2916 task_unlock(p);
2999 } while_each_thread(g, p); 2917 } while_each_thread(g, p);
3000 read_unlock(&tasklist_lock); 2918 read_unlock(&tasklist_lock);
@@ -3007,9 +2925,9 @@ static void cgroup_enable_task_cg_lists(void)
3007 * @parent_css: css whose children to walk 2925 * @parent_css: css whose children to walk
3008 * 2926 *
3009 * This function returns the next child of @parent_css and should be called 2927 * This function returns the next child of @parent_css and should be called
3010 * under RCU read lock. The only requirement is that @parent_css and 2928 * under either cgroup_mutex or RCU read lock. The only requirement is
3011 * @pos_css are accessible. The next sibling is guaranteed to be returned 2929 * that @parent_css and @pos_css are accessible. The next sibling is
3012 * regardless of their states. 2930 * guaranteed to be returned regardless of their states.
3013 */ 2931 */
3014struct cgroup_subsys_state * 2932struct cgroup_subsys_state *
3015css_next_child(struct cgroup_subsys_state *pos_css, 2933css_next_child(struct cgroup_subsys_state *pos_css,
@@ -3019,7 +2937,7 @@ css_next_child(struct cgroup_subsys_state *pos_css,
3019 struct cgroup *cgrp = parent_css->cgroup; 2937 struct cgroup *cgrp = parent_css->cgroup;
3020 struct cgroup *next; 2938 struct cgroup *next;
3021 2939
3022 WARN_ON_ONCE(!rcu_read_lock_held()); 2940 cgroup_assert_mutex_or_rcu_locked();
3023 2941
3024 /* 2942 /*
3025 * @pos could already have been removed. Once a cgroup is removed, 2943 * @pos could already have been removed. Once a cgroup is removed,
@@ -3066,10 +2984,10 @@ EXPORT_SYMBOL_GPL(css_next_child);
3066 * to visit for pre-order traversal of @root's descendants. @root is 2984 * to visit for pre-order traversal of @root's descendants. @root is
3067 * included in the iteration and the first node to be visited. 2985 * included in the iteration and the first node to be visited.
3068 * 2986 *
3069 * While this function requires RCU read locking, it doesn't require the 2987 * While this function requires cgroup_mutex or RCU read locking, it
3070 * whole traversal to be contained in a single RCU critical section. This 2988 * doesn't require the whole traversal to be contained in a single critical
3071 * function will return the correct next descendant as long as both @pos 2989 * section. This function will return the correct next descendant as long
3072 * and @root are accessible and @pos is a descendant of @root. 2990 * as both @pos and @root are accessible and @pos is a descendant of @root.
3073 */ 2991 */
3074struct cgroup_subsys_state * 2992struct cgroup_subsys_state *
3075css_next_descendant_pre(struct cgroup_subsys_state *pos, 2993css_next_descendant_pre(struct cgroup_subsys_state *pos,
@@ -3077,7 +2995,7 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos,
3077{ 2995{
3078 struct cgroup_subsys_state *next; 2996 struct cgroup_subsys_state *next;
3079 2997
3080 WARN_ON_ONCE(!rcu_read_lock_held()); 2998 cgroup_assert_mutex_or_rcu_locked();
3081 2999
3082 /* if first iteration, visit @root */ 3000 /* if first iteration, visit @root */
3083 if (!pos) 3001 if (!pos)
@@ -3108,17 +3026,17 @@ EXPORT_SYMBOL_GPL(css_next_descendant_pre);
3108 * is returned. This can be used during pre-order traversal to skip 3026 * is returned. This can be used during pre-order traversal to skip
3109 * subtree of @pos. 3027 * subtree of @pos.
3110 * 3028 *
3111 * While this function requires RCU read locking, it doesn't require the 3029 * While this function requires cgroup_mutex or RCU read locking, it
3112 * whole traversal to be contained in a single RCU critical section. This 3030 * doesn't require the whole traversal to be contained in a single critical
3113 * function will return the correct rightmost descendant as long as @pos is 3031 * section. This function will return the correct rightmost descendant as
3114 * accessible. 3032 * long as @pos is accessible.
3115 */ 3033 */
3116struct cgroup_subsys_state * 3034struct cgroup_subsys_state *
3117css_rightmost_descendant(struct cgroup_subsys_state *pos) 3035css_rightmost_descendant(struct cgroup_subsys_state *pos)
3118{ 3036{
3119 struct cgroup_subsys_state *last, *tmp; 3037 struct cgroup_subsys_state *last, *tmp;
3120 3038
3121 WARN_ON_ONCE(!rcu_read_lock_held()); 3039 cgroup_assert_mutex_or_rcu_locked();
3122 3040
3123 do { 3041 do {
3124 last = pos; 3042 last = pos;
@@ -3154,10 +3072,11 @@ css_leftmost_descendant(struct cgroup_subsys_state *pos)
3154 * to visit for post-order traversal of @root's descendants. @root is 3072 * to visit for post-order traversal of @root's descendants. @root is
3155 * included in the iteration and the last node to be visited. 3073 * included in the iteration and the last node to be visited.
3156 * 3074 *
3157 * While this function requires RCU read locking, it doesn't require the 3075 * While this function requires cgroup_mutex or RCU read locking, it
3158 * whole traversal to be contained in a single RCU critical section. This 3076 * doesn't require the whole traversal to be contained in a single critical
3159 * function will return the correct next descendant as long as both @pos 3077 * section. This function will return the correct next descendant as long
3160 * and @cgroup are accessible and @pos is a descendant of @cgroup. 3078 * as both @pos and @cgroup are accessible and @pos is a descendant of
3079 * @cgroup.
3161 */ 3080 */
3162struct cgroup_subsys_state * 3081struct cgroup_subsys_state *
3163css_next_descendant_post(struct cgroup_subsys_state *pos, 3082css_next_descendant_post(struct cgroup_subsys_state *pos,
@@ -3165,7 +3084,7 @@ css_next_descendant_post(struct cgroup_subsys_state *pos,
3165{ 3084{
3166 struct cgroup_subsys_state *next; 3085 struct cgroup_subsys_state *next;
3167 3086
3168 WARN_ON_ONCE(!rcu_read_lock_held()); 3087 cgroup_assert_mutex_or_rcu_locked();
3169 3088
3170 /* if first iteration, visit leftmost descendant which may be @root */ 3089 /* if first iteration, visit leftmost descendant which may be @root */
3171 if (!pos) 3090 if (!pos)
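The traversal helpers above (css_next_child() and the descendant iterators) replace WARN_ON_ONCE(!rcu_read_lock_held()) with the cgroup_assert_mutex_or_rcu_locked() assertion added earlier, so they may now be called under cgroup_mutex as well as under RCU. A hedged kernel-style sketch of a reader that satisfies the assertion via an RCU read-side section; count_children() is an invented helper:

#include <linux/cgroup.h>
#include <linux/rcupdate.h>

static int count_children(struct cgroup_subsys_state *parent)
{
        struct cgroup_subsys_state *pos = NULL;
        int n = 0;

        rcu_read_lock();        /* cgroup_assert_mutex_or_rcu_locked() is satisfied */
        while ((pos = css_next_child(pos, parent)))
                n++;
        rcu_read_unlock();

        return n;       /* a snapshot; children may come and go afterwards */
}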
@@ -3504,14 +3423,12 @@ struct cgroup_pidlist {
3504 pid_t *list; 3423 pid_t *list;
3505 /* how many elements the above list has */ 3424 /* how many elements the above list has */
3506 int length; 3425 int length;
3507 /* how many files are using the current array */
3508 int use_count;
3509 /* each of these stored in a list by its cgroup */ 3426 /* each of these stored in a list by its cgroup */
3510 struct list_head links; 3427 struct list_head links;
3511 /* pointer to the cgroup we belong to, for list removal purposes */ 3428 /* pointer to the cgroup we belong to, for list removal purposes */
3512 struct cgroup *owner; 3429 struct cgroup *owner;
3513 /* protects the other fields */ 3430 /* for delayed destruction */
3514 struct rw_semaphore rwsem; 3431 struct delayed_work destroy_dwork;
3515}; 3432};
3516 3433
3517/* 3434/*
@@ -3527,6 +3444,7 @@ static void *pidlist_allocate(int count)
3527 else 3444 else
3528 return kmalloc(count * sizeof(pid_t), GFP_KERNEL); 3445 return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
3529} 3446}
3447
3530static void pidlist_free(void *p) 3448static void pidlist_free(void *p)
3531{ 3449{
3532 if (is_vmalloc_addr(p)) 3450 if (is_vmalloc_addr(p))
@@ -3536,6 +3454,47 @@ static void pidlist_free(void *p)
3536} 3454}
3537 3455
3538/* 3456/*
3457 * Used to destroy all pidlists lingering waiting for destroy timer. None
3458 * should be left afterwards.
3459 */
3460static void cgroup_pidlist_destroy_all(struct cgroup *cgrp)
3461{
3462 struct cgroup_pidlist *l, *tmp_l;
3463
3464 mutex_lock(&cgrp->pidlist_mutex);
3465 list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
3466 mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
3467 mutex_unlock(&cgrp->pidlist_mutex);
3468
3469 flush_workqueue(cgroup_pidlist_destroy_wq);
3470 BUG_ON(!list_empty(&cgrp->pidlists));
3471}
3472
3473static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
3474{
3475 struct delayed_work *dwork = to_delayed_work(work);
3476 struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
3477 destroy_dwork);
3478 struct cgroup_pidlist *tofree = NULL;
3479
3480 mutex_lock(&l->owner->pidlist_mutex);
3481
3482 /*
3483 * Destroy iff we didn't get queued again. The state won't change
3484 * as destroy_dwork can only be queued while locked.
3485 */
3486 if (!delayed_work_pending(dwork)) {
3487 list_del(&l->links);
3488 pidlist_free(l->list);
3489 put_pid_ns(l->key.ns);
3490 tofree = l;
3491 }
3492
3493 mutex_unlock(&l->owner->pidlist_mutex);
3494 kfree(tofree);
3495}
3496
3497/*
3539 * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries 3498 * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
3540 * Returns the number of unique elements. 3499 * Returns the number of unique elements.
3541 */ 3500 */
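cgroup_pidlist_destroy_all() and cgroup_pidlist_destroy_work_fn() above implement the new lifetime rule: a pidlist is freed by a delayed work item unless another reader re-arms the work first. A hedged, self-contained kernel-module sketch of the same re-arm-or-free idiom (all demo_* names are invented and it is not part of the patch):

#include <linux/module.h>
#include <linux/workqueue.h>
#include <linux/jiffies.h>

static struct workqueue_struct *demo_wq;
static struct delayed_work demo_dwork;

static void demo_destroy_fn(struct work_struct *work)
{
        struct delayed_work *dwork = to_delayed_work(work);

        /* free only if nobody re-armed us while we were waiting; the patch
         * performs this check under the owning cgroup's pidlist_mutex */
        if (!delayed_work_pending(dwork))
                pr_info("demo: idle for a second, releasing the cache\n");
}

static int __init demo_init(void)
{
        demo_wq = alloc_workqueue("demo_destroy", 0, 1);
        if (!demo_wq)
                return -ENOMEM;

        INIT_DELAYED_WORK(&demo_dwork, demo_destroy_fn);
        /* every "use" pushes destruction another HZ (one second) away */
        mod_delayed_work(demo_wq, &demo_dwork, HZ);
        return 0;
}

static void __exit demo_exit(void)
{
        cancel_delayed_work_sync(&demo_dwork);
        destroy_workqueue(demo_wq);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");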
@@ -3565,52 +3524,92 @@ after:
3565 return dest; 3524 return dest;
3566} 3525}
3567 3526
3527/*
3528 * The two pid files - task and cgroup.procs - guaranteed that the result
3529 * is sorted, which forced this whole pidlist fiasco. As pid order is
3530 * different per namespace, each namespace needs differently sorted list,
3531 * making it impossible to use, for example, single rbtree of member tasks
3532 * sorted by task pointer. As pidlists can be fairly large, allocating one
3533 * per open file is dangerous, so cgroup had to implement shared pool of
3534 * pidlists keyed by cgroup and namespace.
3535 *
3536 * All this extra complexity was caused by the original implementation
3537 * committing to an entirely unnecessary property. In the long term, we
3538 * want to do away with it. Explicitly scramble sort order if
3539 * sane_behavior so that no such expectation exists in the new interface.
3540 *
3541 * Scrambling is done by swapping every two consecutive bits, which is
3542 * non-identity one-to-one mapping which disturbs sort order sufficiently.
3543 */
3544static pid_t pid_fry(pid_t pid)
3545{
3546 unsigned a = pid & 0x55555555;
3547 unsigned b = pid & 0xAAAAAAAA;
3548
3549 return (a << 1) | (b >> 1);
3550}
3551
3552static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid)
3553{
3554 if (cgroup_sane_behavior(cgrp))
3555 return pid_fry(pid);
3556 else
3557 return pid;
3558}
3559
3568static int cmppid(const void *a, const void *b) 3560static int cmppid(const void *a, const void *b)
3569{ 3561{
3570 return *(pid_t *)a - *(pid_t *)b; 3562 return *(pid_t *)a - *(pid_t *)b;
3571} 3563}
3572 3564
3565static int fried_cmppid(const void *a, const void *b)
3566{
3567 return pid_fry(*(pid_t *)a) - pid_fry(*(pid_t *)b);
3568}
3569
3570static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3571 enum cgroup_filetype type)
3572{
3573 struct cgroup_pidlist *l;
3574 /* don't need task_nsproxy() if we're looking at ourself */
3575 struct pid_namespace *ns = task_active_pid_ns(current);
3576
3577 lockdep_assert_held(&cgrp->pidlist_mutex);
3578
3579 list_for_each_entry(l, &cgrp->pidlists, links)
3580 if (l->key.type == type && l->key.ns == ns)
3581 return l;
3582 return NULL;
3583}
3584
3573/* 3585/*
3574 * find the appropriate pidlist for our purpose (given procs vs tasks) 3586 * find the appropriate pidlist for our purpose (given procs vs tasks)
3575 * returns with the lock on that pidlist already held, and takes care 3587 * returns with the lock on that pidlist already held, and takes care
3576 * of the use count, or returns NULL with no locks held if we're out of 3588 * of the use count, or returns NULL with no locks held if we're out of
3577 * memory. 3589 * memory.
3578 */ 3590 */
3579static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, 3591static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
3580 enum cgroup_filetype type) 3592 enum cgroup_filetype type)
3581{ 3593{
3582 struct cgroup_pidlist *l; 3594 struct cgroup_pidlist *l;
3583 /* don't need task_nsproxy() if we're looking at ourself */
3584 struct pid_namespace *ns = task_active_pid_ns(current);
3585 3595
3586 /* 3596 lockdep_assert_held(&cgrp->pidlist_mutex);
3587 * We can't drop the pidlist_mutex before taking the l->rwsem in case 3597
3588 * the last ref-holder is trying to remove l from the list at the same 3598 l = cgroup_pidlist_find(cgrp, type);
3589 * time. Holding the pidlist_mutex precludes somebody taking whichever 3599 if (l)
3590 * list we find out from under us - compare release_pid_array(). 3600 return l;
3591 */ 3601
3592 mutex_lock(&cgrp->pidlist_mutex);
3593 list_for_each_entry(l, &cgrp->pidlists, links) {
3594 if (l->key.type == type && l->key.ns == ns) {
3595 /* make sure l doesn't vanish out from under us */
3596 down_write(&l->rwsem);
3597 mutex_unlock(&cgrp->pidlist_mutex);
3598 return l;
3599 }
3600 }
3601 /* entry not found; create a new one */ 3602 /* entry not found; create a new one */
3602 l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); 3603 l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
3603 if (!l) { 3604 if (!l)
3604 mutex_unlock(&cgrp->pidlist_mutex);
3605 return l; 3605 return l;
3606 } 3606
3607 init_rwsem(&l->rwsem); 3607 INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
3608 down_write(&l->rwsem);
3609 l->key.type = type; 3608 l->key.type = type;
3610 l->key.ns = get_pid_ns(ns); 3609 /* don't need task_nsproxy() if we're looking at ourself */
3610 l->key.ns = get_pid_ns(task_active_pid_ns(current));
3611 l->owner = cgrp; 3611 l->owner = cgrp;
3612 list_add(&l->links, &cgrp->pidlists); 3612 list_add(&l->links, &cgrp->pidlists);
3613 mutex_unlock(&cgrp->pidlist_mutex);
3614 return l; 3613 return l;
3615} 3614}
3616 3615
@@ -3627,6 +3626,8 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3627 struct task_struct *tsk; 3626 struct task_struct *tsk;
3628 struct cgroup_pidlist *l; 3627 struct cgroup_pidlist *l;
3629 3628
3629 lockdep_assert_held(&cgrp->pidlist_mutex);
3630
3630 /* 3631 /*
3631 * If cgroup gets more users after we read count, we won't have 3632 * If cgroup gets more users after we read count, we won't have
3632 * enough space - tough. This race is indistinguishable to the 3633 * enough space - tough. This race is indistinguishable to the
@@ -3653,20 +3654,24 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3653 css_task_iter_end(&it); 3654 css_task_iter_end(&it);
3654 length = n; 3655 length = n;
3655 /* now sort & (if procs) strip out duplicates */ 3656 /* now sort & (if procs) strip out duplicates */
3656 sort(array, length, sizeof(pid_t), cmppid, NULL); 3657 if (cgroup_sane_behavior(cgrp))
3658 sort(array, length, sizeof(pid_t), fried_cmppid, NULL);
3659 else
3660 sort(array, length, sizeof(pid_t), cmppid, NULL);
3657 if (type == CGROUP_FILE_PROCS) 3661 if (type == CGROUP_FILE_PROCS)
3658 length = pidlist_uniq(array, length); 3662 length = pidlist_uniq(array, length);
3659 l = cgroup_pidlist_find(cgrp, type); 3663
3664 l = cgroup_pidlist_find_create(cgrp, type);
3660 if (!l) { 3665 if (!l) {
3666 mutex_unlock(&cgrp->pidlist_mutex);
3661 pidlist_free(array); 3667 pidlist_free(array);
3662 return -ENOMEM; 3668 return -ENOMEM;
3663 } 3669 }
3664 /* store array, freeing old if necessary - lock already held */ 3670
3671 /* store array, freeing old if necessary */
3665 pidlist_free(l->list); 3672 pidlist_free(l->list);
3666 l->list = array; 3673 l->list = array;
3667 l->length = length; 3674 l->length = length;
3668 l->use_count++;
3669 up_write(&l->rwsem);
3670 *lp = l; 3675 *lp = l;
3671 return 0; 3676 return 0;
3672} 3677}
@@ -3740,20 +3745,45 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
3740 * after a seek to the start). Use a binary-search to find the 3745 * after a seek to the start). Use a binary-search to find the
3741 * next pid to display, if any 3746 * next pid to display, if any
3742 */ 3747 */
3743 struct cgroup_pidlist *l = s->private; 3748 struct cgroup_open_file *of = s->private;
3749 struct cgroup *cgrp = seq_css(s)->cgroup;
3750 struct cgroup_pidlist *l;
3751 enum cgroup_filetype type = seq_cft(s)->private;
3744 int index = 0, pid = *pos; 3752 int index = 0, pid = *pos;
3745 int *iter; 3753 int *iter, ret;
3754
3755 mutex_lock(&cgrp->pidlist_mutex);
3756
3757 /*
3758 * !NULL @of->priv indicates that this isn't the first start()
3759 * after open. If the matching pidlist is around, we can use that.
3760 * Look for it. Note that @of->priv can't be used directly. It
3761 * could already have been destroyed.
3762 */
3763 if (of->priv)
3764 of->priv = cgroup_pidlist_find(cgrp, type);
3765
3766 /*
3767 * Either this is the first start() after open or the matching
3768 * pidlist has been destroyed in between. Create a new one.
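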
3769 */
3770 if (!of->priv) {
3771 ret = pidlist_array_load(cgrp, type,
3772 (struct cgroup_pidlist **)&of->priv);
3773 if (ret)
3774 return ERR_PTR(ret);
3775 }
3776 l = of->priv;
3746 3777
3747 down_read(&l->rwsem);
3748 if (pid) { 3778 if (pid) {
3749 int end = l->length; 3779 int end = l->length;
3750 3780
3751 while (index < end) { 3781 while (index < end) {
3752 int mid = (index + end) / 2; 3782 int mid = (index + end) / 2;
3753 if (l->list[mid] == pid) { 3783 if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) {
3754 index = mid; 3784 index = mid;
3755 break; 3785 break;
3756 } else if (l->list[mid] <= pid) 3786 } else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid)
3757 index = mid + 1; 3787 index = mid + 1;
3758 else 3788 else
3759 end = mid; 3789 end = mid;
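cgroup_pidlist_start() above now (re)creates the pidlist under pidlist_mutex and then binary-searches for the saved position, comparing fried pids when sane_behavior is enabled. The search itself, lifted into a runnable userspace form with an illustrative pid array and no pid frying:

#include <stdio.h>

int main(void)
{
        int list[] = { 3, 8, 15, 42, 99 };      /* sorted pidlist snapshot */
        int length = 5;
        int pid = 16;                           /* *pos saved by the previous read */
        int index = 0, end = length;

        while (index < end) {
                int mid = (index + end) / 2;

                if (list[mid] == pid) {
                        index = mid;
                        break;
                } else if (list[mid] <= pid)
                        index = mid + 1;
                else
                        end = mid;
        }

        if (index >= length)
                puts("past the end of the pidlist");
        else
                printf("resume at list[%d] = %d\n", index, list[index]);        /* 42 */
        return 0;
}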
@@ -3764,19 +3794,25 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
3764 return NULL; 3794 return NULL;
3765 /* Update the abstract position to be the actual pid that we found */ 3795 /* Update the abstract position to be the actual pid that we found */
3766 iter = l->list + index; 3796 iter = l->list + index;
3767 *pos = *iter; 3797 *pos = cgroup_pid_fry(cgrp, *iter);
3768 return iter; 3798 return iter;
3769} 3799}
3770 3800
3771static void cgroup_pidlist_stop(struct seq_file *s, void *v) 3801static void cgroup_pidlist_stop(struct seq_file *s, void *v)
3772{ 3802{
3773 struct cgroup_pidlist *l = s->private; 3803 struct cgroup_open_file *of = s->private;
3774 up_read(&l->rwsem); 3804 struct cgroup_pidlist *l = of->priv;
3805
3806 if (l)
3807 mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
3808 CGROUP_PIDLIST_DESTROY_DELAY);
3809 mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
3775} 3810}
3776 3811
3777static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) 3812static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
3778{ 3813{
3779 struct cgroup_pidlist *l = s->private; 3814 struct cgroup_open_file *of = s->private;
3815 struct cgroup_pidlist *l = of->priv;
3780 pid_t *p = v; 3816 pid_t *p = v;
3781 pid_t *end = l->list + l->length; 3817 pid_t *end = l->list + l->length;
3782 /* 3818 /*
@@ -3787,7 +3823,7 @@ static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
3787 if (p >= end) { 3823 if (p >= end) {
3788 return NULL; 3824 return NULL;
3789 } else { 3825 } else {
3790 *pos = *p; 3826 *pos = cgroup_pid_fry(seq_css(s)->cgroup, *p);
3791 return p; 3827 return p;
3792 } 3828 }
3793} 3829}
@@ -3808,92 +3844,6 @@ static const struct seq_operations cgroup_pidlist_seq_operations = {
3808 .show = cgroup_pidlist_show, 3844 .show = cgroup_pidlist_show,
3809}; 3845};
3810 3846
3811static void cgroup_release_pid_array(struct cgroup_pidlist *l)
3812{
3813 /*
3814 * the case where we're the last user of this particular pidlist will
3815 * have us remove it from the cgroup's list, which entails taking the
3816 * mutex. since in pidlist_find the pidlist->lock depends on cgroup->
3817 * pidlist_mutex, we have to take pidlist_mutex first.
3818 */
3819 mutex_lock(&l->owner->pidlist_mutex);
3820 down_write(&l->rwsem);
3821 BUG_ON(!l->use_count);
3822 if (!--l->use_count) {
3823 /* we're the last user if refcount is 0; remove and free */
3824 list_del(&l->links);
3825 mutex_unlock(&l->owner->pidlist_mutex);
3826 pidlist_free(l->list);
3827 put_pid_ns(l->key.ns);
3828 up_write(&l->rwsem);
3829 kfree(l);
3830 return;
3831 }
3832 mutex_unlock(&l->owner->pidlist_mutex);
3833 up_write(&l->rwsem);
3834}
3835
3836static int cgroup_pidlist_release(struct inode *inode, struct file *file)
3837{
3838 struct cgroup_pidlist *l;
3839 if (!(file->f_mode & FMODE_READ))
3840 return 0;
3841 /*
3842 * the seq_file will only be initialized if the file was opened for
3843 * reading; hence we check if it's not null only in that case.
3844 */
3845 l = ((struct seq_file *)file->private_data)->private;
3846 cgroup_release_pid_array(l);
3847 return seq_release(inode, file);
3848}
3849
3850static const struct file_operations cgroup_pidlist_operations = {
3851 .read = seq_read,
3852 .llseek = seq_lseek,
3853 .write = cgroup_file_write,
3854 .release = cgroup_pidlist_release,
3855};
3856
3857/*
3858 * The following functions handle opens on a file that displays a pidlist
3859 * (tasks or procs). Prepare an array of the process/thread IDs of whoever's
3860 * in the cgroup.
3861 */
3862/* helper function for the two below it */
3863static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type)
3864{
3865 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
3866 struct cgroup_pidlist *l;
3867 int retval;
3868
3869 /* Nothing to do for write-only files */
3870 if (!(file->f_mode & FMODE_READ))
3871 return 0;
3872
3873 /* have the array populated */
3874 retval = pidlist_array_load(cgrp, type, &l);
3875 if (retval)
3876 return retval;
3877 /* configure file information */
3878 file->f_op = &cgroup_pidlist_operations;
3879
3880 retval = seq_open(file, &cgroup_pidlist_seq_operations);
3881 if (retval) {
3882 cgroup_release_pid_array(l);
3883 return retval;
3884 }
3885 ((struct seq_file *)file->private_data)->private = l;
3886 return 0;
3887}
3888static int cgroup_tasks_open(struct inode *unused, struct file *file)
3889{
3890 return cgroup_pidlist_open(file, CGROUP_FILE_TASKS);
3891}
3892static int cgroup_procs_open(struct inode *unused, struct file *file)
3893{
3894 return cgroup_pidlist_open(file, CGROUP_FILE_PROCS);
3895}
3896
3897static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, 3847static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
3898 struct cftype *cft) 3848 struct cftype *cft)
3899{ 3849{
@@ -3928,202 +3878,6 @@ static void cgroup_dput(struct cgroup *cgrp)
3928 deactivate_super(sb); 3878 deactivate_super(sb);
3929} 3879}
3930 3880
3931/*
3932 * Unregister event and free resources.
3933 *
3934 * Gets called from workqueue.
3935 */
3936static void cgroup_event_remove(struct work_struct *work)
3937{
3938 struct cgroup_event *event = container_of(work, struct cgroup_event,
3939 remove);
3940 struct cgroup_subsys_state *css = event->css;
3941
3942 remove_wait_queue(event->wqh, &event->wait);
3943
3944 event->cft->unregister_event(css, event->cft, event->eventfd);
3945
3946 /* Notify userspace the event is going away. */
3947 eventfd_signal(event->eventfd, 1);
3948
3949 eventfd_ctx_put(event->eventfd);
3950 kfree(event);
3951 css_put(css);
3952}
3953
3954/*
3955 * Gets called on POLLHUP on eventfd when user closes it.
3956 *
3957 * Called with wqh->lock held and interrupts disabled.
3958 */
3959static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
3960 int sync, void *key)
3961{
3962 struct cgroup_event *event = container_of(wait,
3963 struct cgroup_event, wait);
3964 struct cgroup *cgrp = event->css->cgroup;
3965 unsigned long flags = (unsigned long)key;
3966
3967 if (flags & POLLHUP) {
3968 /*
3969 * If the event has been detached at cgroup removal, we
3970 * can simply return knowing the other side will cleanup
3971 * for us.
3972 *
3973 * We can't race against event freeing since the other
3974 * side will require wqh->lock via remove_wait_queue(),
3975 * which we hold.
3976 */
3977 spin_lock(&cgrp->event_list_lock);
3978 if (!list_empty(&event->list)) {
3979 list_del_init(&event->list);
3980 /*
3981 * We are in atomic context, but cgroup_event_remove()
3982 * may sleep, so we have to call it in workqueue.
3983 */
3984 schedule_work(&event->remove);
3985 }
3986 spin_unlock(&cgrp->event_list_lock);
3987 }
3988
3989 return 0;
3990}
3991
3992static void cgroup_event_ptable_queue_proc(struct file *file,
3993 wait_queue_head_t *wqh, poll_table *pt)
3994{
3995 struct cgroup_event *event = container_of(pt,
3996 struct cgroup_event, pt);
3997
3998 event->wqh = wqh;
3999 add_wait_queue(wqh, &event->wait);
4000}
4001
4002/*
4003 * Parse input and register new cgroup event handler.
4004 *
4005 * Input must be in format '<event_fd> <control_fd> <args>'.
4006 * Interpretation of args is defined by control file implementation.
4007 */
4008static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css,
4009 struct cftype *cft, const char *buffer)
4010{
4011 struct cgroup *cgrp = dummy_css->cgroup;
4012 struct cgroup_event *event;
4013 struct cgroup_subsys_state *cfile_css;
4014 unsigned int efd, cfd;
4015 struct fd efile;
4016 struct fd cfile;
4017 char *endp;
4018 int ret;
4019
4020 efd = simple_strtoul(buffer, &endp, 10);
4021 if (*endp != ' ')
4022 return -EINVAL;
4023 buffer = endp + 1;
4024
4025 cfd = simple_strtoul(buffer, &endp, 10);
4026 if ((*endp != ' ') && (*endp != '\0'))
4027 return -EINVAL;
4028 buffer = endp + 1;
4029
4030 event = kzalloc(sizeof(*event), GFP_KERNEL);
4031 if (!event)
4032 return -ENOMEM;
4033
4034 INIT_LIST_HEAD(&event->list);
4035 init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
4036 init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
4037 INIT_WORK(&event->remove, cgroup_event_remove);
4038
4039 efile = fdget(efd);
4040 if (!efile.file) {
4041 ret = -EBADF;
4042 goto out_kfree;
4043 }
4044
4045 event->eventfd = eventfd_ctx_fileget(efile.file);
4046 if (IS_ERR(event->eventfd)) {
4047 ret = PTR_ERR(event->eventfd);
4048 goto out_put_efile;
4049 }
4050
4051 cfile = fdget(cfd);
4052 if (!cfile.file) {
4053 ret = -EBADF;
4054 goto out_put_eventfd;
4055 }
4056
4057 /* the process need read permission on control file */
4058 /* AV: shouldn't we check that it's been opened for read instead? */
4059 ret = inode_permission(file_inode(cfile.file), MAY_READ);
4060 if (ret < 0)
4061 goto out_put_cfile;
4062
4063 event->cft = __file_cft(cfile.file);
4064 if (IS_ERR(event->cft)) {
4065 ret = PTR_ERR(event->cft);
4066 goto out_put_cfile;
4067 }
4068
4069 if (!event->cft->ss) {
4070 ret = -EBADF;
4071 goto out_put_cfile;
4072 }
4073
4074 /*
4075 * Determine the css of @cfile, verify it belongs to the same
4076 * cgroup as cgroup.event_control, and associate @event with it.
4077 * Remaining events are automatically removed on cgroup destruction
4078 * but the removal is asynchronous, so take an extra ref.
4079 */
4080 rcu_read_lock();
4081
4082 ret = -EINVAL;
4083 event->css = cgroup_css(cgrp, event->cft->ss);
4084 cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, event->cft->ss);
4085 if (event->css && event->css == cfile_css && css_tryget(event->css))
4086 ret = 0;
4087
4088 rcu_read_unlock();
4089 if (ret)
4090 goto out_put_cfile;
4091
4092 if (!event->cft->register_event || !event->cft->unregister_event) {
4093 ret = -EINVAL;
4094 goto out_put_css;
4095 }
4096
4097 ret = event->cft->register_event(event->css, event->cft,
4098 event->eventfd, buffer);
4099 if (ret)
4100 goto out_put_css;
4101
4102 efile.file->f_op->poll(efile.file, &event->pt);
4103
4104 spin_lock(&cgrp->event_list_lock);
4105 list_add(&event->list, &cgrp->event_list);
4106 spin_unlock(&cgrp->event_list_lock);
4107
4108 fdput(cfile);
4109 fdput(efile);
4110
4111 return 0;
4112
4113out_put_css:
4114 css_put(event->css);
4115out_put_cfile:
4116 fdput(cfile);
4117out_put_eventfd:
4118 eventfd_ctx_put(event->eventfd);
4119out_put_efile:
4120 fdput(efile);
4121out_kfree:
4122 kfree(event);
4123
4124 return ret;
4125}
4126
4127static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, 3881static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
4128 struct cftype *cft) 3882 struct cftype *cft)
4129{ 3883{
@@ -4143,17 +3897,15 @@ static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
4143static struct cftype cgroup_base_files[] = { 3897static struct cftype cgroup_base_files[] = {
4144 { 3898 {
4145 .name = "cgroup.procs", 3899 .name = "cgroup.procs",
4146 .open = cgroup_procs_open, 3900 .seq_start = cgroup_pidlist_start,
3901 .seq_next = cgroup_pidlist_next,
3902 .seq_stop = cgroup_pidlist_stop,
3903 .seq_show = cgroup_pidlist_show,
3904 .private = CGROUP_FILE_PROCS,
4147 .write_u64 = cgroup_procs_write, 3905 .write_u64 = cgroup_procs_write,
4148 .release = cgroup_pidlist_release,
4149 .mode = S_IRUGO | S_IWUSR, 3906 .mode = S_IRUGO | S_IWUSR,
4150 }, 3907 },
4151 { 3908 {
4152 .name = "cgroup.event_control",
4153 .write_string = cgroup_write_event_control,
4154 .mode = S_IWUGO,
4155 },
4156 {
4157 .name = "cgroup.clone_children", 3909 .name = "cgroup.clone_children",
4158 .flags = CFTYPE_INSANE, 3910 .flags = CFTYPE_INSANE,
4159 .read_u64 = cgroup_clone_children_read, 3911 .read_u64 = cgroup_clone_children_read,
@@ -4162,7 +3914,7 @@ static struct cftype cgroup_base_files[] = {
4162 { 3914 {
4163 .name = "cgroup.sane_behavior", 3915 .name = "cgroup.sane_behavior",
4164 .flags = CFTYPE_ONLY_ON_ROOT, 3916 .flags = CFTYPE_ONLY_ON_ROOT,
4165 .read_seq_string = cgroup_sane_behavior_show, 3917 .seq_show = cgroup_sane_behavior_show,
4166 }, 3918 },
4167 3919
4168 /* 3920 /*
@@ -4173,9 +3925,12 @@ static struct cftype cgroup_base_files[] = {
4173 { 3925 {
4174 .name = "tasks", 3926 .name = "tasks",
4175 .flags = CFTYPE_INSANE, /* use "procs" instead */ 3927 .flags = CFTYPE_INSANE, /* use "procs" instead */
4176 .open = cgroup_tasks_open, 3928 .seq_start = cgroup_pidlist_start,
3929 .seq_next = cgroup_pidlist_next,
3930 .seq_stop = cgroup_pidlist_stop,
3931 .seq_show = cgroup_pidlist_show,
3932 .private = CGROUP_FILE_TASKS,
4177 .write_u64 = cgroup_tasks_write, 3933 .write_u64 = cgroup_tasks_write,
4178 .release = cgroup_pidlist_release,
4179 .mode = S_IRUGO | S_IWUSR, 3934 .mode = S_IRUGO | S_IWUSR,
4180 }, 3935 },
4181 { 3936 {
@@ -4187,7 +3942,7 @@ static struct cftype cgroup_base_files[] = {
4187 { 3942 {
4188 .name = "release_agent", 3943 .name = "release_agent",
4189 .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, 3944 .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT,
4190 .read_seq_string = cgroup_release_agent_show, 3945 .seq_show = cgroup_release_agent_show,
4191 .write_string = cgroup_release_agent_write, 3946 .write_string = cgroup_release_agent_write,
4192 .max_write_len = PATH_MAX, 3947 .max_write_len = PATH_MAX,
4193 }, 3948 },
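
The cgroup_base_files hunks above replace the old .open/.read_seq_string handlers with the new .seq_show (and, for pidlists, .seq_start/.seq_next/.seq_stop) callbacks, carrying the file type in .private. A hedged sketch of what a simple single-shot .seq_show file looks like under this interface (the "demo" handler and file name are hypothetical; seq_css()/seq_cft() are the accessors already used elsewhere in this diff, and css->cgroup->id is the idr id allocated in cgroup_create()):

/* Sketch only, assumes <linux/cgroup.h> and <linux/seq_file.h> context. */
static int demo_info_show(struct seq_file *sf, void *v)
{
	struct cgroup_subsys_state *css = seq_css(sf);

	seq_printf(sf, "%s of cgroup %d\n",
		   seq_cft(sf)->name, css->cgroup->id);
	return 0;
}

static struct cftype demo_files[] = {
	{
		.name = "demo.info",
		.seq_show = demo_info_show,
	},
	{ }	/* terminator */
};
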
@@ -4333,6 +4088,65 @@ static void offline_css(struct cgroup_subsys_state *css)
4333 RCU_INIT_POINTER(css->cgroup->subsys[ss->subsys_id], css); 4088 RCU_INIT_POINTER(css->cgroup->subsys[ss->subsys_id], css);
4334} 4089}
4335 4090
4091/**
4092 * create_css - create a cgroup_subsys_state
4093 * @cgrp: the cgroup new css will be associated with
4094 * @ss: the subsys of new css
4095 *
4096 * Create a new css associated with @cgrp - @ss pair. On success, the new
4097 * css is online and installed in @cgrp with all interface files created.
4098 * Returns 0 on success, -errno on failure.
4099 */
4100static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
4101{
4102 struct cgroup *parent = cgrp->parent;
4103 struct cgroup_subsys_state *css;
4104 int err;
4105
4106 lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
4107 lockdep_assert_held(&cgroup_mutex);
4108
4109 css = ss->css_alloc(cgroup_css(parent, ss));
4110 if (IS_ERR(css))
4111 return PTR_ERR(css);
4112
4113 err = percpu_ref_init(&css->refcnt, css_release);
4114 if (err)
4115 goto err_free_css;
4116
4117 init_css(css, ss, cgrp);
4118
4119 err = cgroup_populate_dir(cgrp, 1 << ss->subsys_id);
4120 if (err)
4121 goto err_free_percpu_ref;
4122
4123 err = online_css(css);
4124 if (err)
4125 goto err_clear_dir;
4126
4127 dget(cgrp->dentry);
4128 css_get(css->parent);
4129
4130 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
4131 parent->parent) {
4132 pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
4133 current->comm, current->pid, ss->name);
4134 if (!strcmp(ss->name, "memory"))
4135 pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n");
4136 ss->warned_broken_hierarchy = true;
4137 }
4138
4139 return 0;
4140
4141err_clear_dir:
4142 cgroup_clear_dir(css->cgroup, 1 << css->ss->subsys_id);
4143err_free_percpu_ref:
4144 percpu_ref_cancel_init(&css->refcnt);
4145err_free_css:
4146 ss->css_free(css);
4147 return err;
4148}
4149
4336/* 4150/*
4337 * cgroup_create - create a cgroup 4151 * cgroup_create - create a cgroup
4338 * @parent: cgroup that will be parent of the new cgroup 4152 * @parent: cgroup that will be parent of the new cgroup
@@ -4344,11 +4158,10 @@ static void offline_css(struct cgroup_subsys_state *css)
4344static long cgroup_create(struct cgroup *parent, struct dentry *dentry, 4158static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4345 umode_t mode) 4159 umode_t mode)
4346{ 4160{
4347 struct cgroup_subsys_state *css_ar[CGROUP_SUBSYS_COUNT] = { };
4348 struct cgroup *cgrp; 4161 struct cgroup *cgrp;
4349 struct cgroup_name *name; 4162 struct cgroup_name *name;
4350 struct cgroupfs_root *root = parent->root; 4163 struct cgroupfs_root *root = parent->root;
4351 int err = 0; 4164 int ssid, err;
4352 struct cgroup_subsys *ss; 4165 struct cgroup_subsys *ss;
4353 struct super_block *sb = root->sb; 4166 struct super_block *sb = root->sb;
4354 4167
@@ -4358,19 +4171,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4358 return -ENOMEM; 4171 return -ENOMEM;
4359 4172
4360 name = cgroup_alloc_name(dentry); 4173 name = cgroup_alloc_name(dentry);
4361 if (!name) 4174 if (!name) {
4175 err = -ENOMEM;
4362 goto err_free_cgrp; 4176 goto err_free_cgrp;
4177 }
4363 rcu_assign_pointer(cgrp->name, name); 4178 rcu_assign_pointer(cgrp->name, name);
4364 4179
4365 /* 4180 /*
4366 * Temporarily set the pointer to NULL, so idr_find() won't return
4367 * a half-baked cgroup.
4368 */
4369 cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL);
4370 if (cgrp->id < 0)
4371 goto err_free_name;
4372
4373 /*
4374 * Only live parents can have children. Note that the liveliness 4181 * Only live parents can have children. Note that the liveliness
4375 * check isn't strictly necessary because cgroup_mkdir() and 4182 * check isn't strictly necessary because cgroup_mkdir() and
4376 * cgroup_rmdir() are fully synchronized by i_mutex; however, do it 4183 * cgroup_rmdir() are fully synchronized by i_mutex; however, do it
@@ -4379,7 +4186,17 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4379 */ 4186 */
4380 if (!cgroup_lock_live_group(parent)) { 4187 if (!cgroup_lock_live_group(parent)) {
4381 err = -ENODEV; 4188 err = -ENODEV;
4382 goto err_free_id; 4189 goto err_free_name;
4190 }
4191
4192 /*
4193 * Temporarily set the pointer to NULL, so idr_find() won't return
4194 * a half-baked cgroup.
4195 */
4196 cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL);
4197 if (cgrp->id < 0) {
4198 err = -ENOMEM;
4199 goto err_unlock;
4383 } 4200 }
4384 4201
4385 /* Grab a reference on the superblock so the hierarchy doesn't 4202 /* Grab a reference on the superblock so the hierarchy doesn't
@@ -4404,23 +4221,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4404 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) 4221 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
4405 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); 4222 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
4406 4223
4407 for_each_root_subsys(root, ss) {
4408 struct cgroup_subsys_state *css;
4409
4410 css = ss->css_alloc(cgroup_css(parent, ss));
4411 if (IS_ERR(css)) {
4412 err = PTR_ERR(css);
4413 goto err_free_all;
4414 }
4415 css_ar[ss->subsys_id] = css;
4416
4417 err = percpu_ref_init(&css->refcnt, css_release);
4418 if (err)
4419 goto err_free_all;
4420
4421 init_css(css, ss, cgrp);
4422 }
4423
4424 /* 4224 /*
4425 * Create directory. cgroup_create_file() returns with the new 4225 * Create directory. cgroup_create_file() returns with the new
4426 * directory locked on success so that it can be populated without 4226 * directory locked on success so that it can be populated without
@@ -4428,7 +4228,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4428 */ 4228 */
4429 err = cgroup_create_file(dentry, S_IFDIR | mode, sb); 4229 err = cgroup_create_file(dentry, S_IFDIR | mode, sb);
4430 if (err < 0) 4230 if (err < 0)
4431 goto err_free_all; 4231 goto err_free_id;
4432 lockdep_assert_held(&dentry->d_inode->i_mutex); 4232 lockdep_assert_held(&dentry->d_inode->i_mutex);
4433 4233
4434 cgrp->serial_nr = cgroup_serial_nr_next++; 4234 cgrp->serial_nr = cgroup_serial_nr_next++;
@@ -4440,60 +4240,36 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4440 /* hold a ref to the parent's dentry */ 4240 /* hold a ref to the parent's dentry */
4441 dget(parent->dentry); 4241 dget(parent->dentry);
4442 4242
4443 /* creation succeeded, notify subsystems */ 4243 /*
4444 for_each_root_subsys(root, ss) { 4244 * @cgrp is now fully operational. If something fails after this
4445 struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; 4245 * point, it'll be released via the normal destruction path.
4446 4246 */
4447 err = online_css(css);
4448 if (err)
4449 goto err_destroy;
4450
4451 /* each css holds a ref to the cgroup's dentry and parent css */
4452 dget(dentry);
4453 css_get(css->parent);
4454
4455 /* mark it consumed for error path */
4456 css_ar[ss->subsys_id] = NULL;
4457
4458 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
4459 parent->parent) {
4460 pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
4461 current->comm, current->pid, ss->name);
4462 if (!strcmp(ss->name, "memory"))
4463 pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n");
4464 ss->warned_broken_hierarchy = true;
4465 }
4466 }
4467
4468 idr_replace(&root->cgroup_idr, cgrp, cgrp->id); 4247 idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
4469 4248
4470 err = cgroup_addrm_files(cgrp, cgroup_base_files, true); 4249 err = cgroup_addrm_files(cgrp, cgroup_base_files, true);
4471 if (err) 4250 if (err)
4472 goto err_destroy; 4251 goto err_destroy;
4473 4252
4474 err = cgroup_populate_dir(cgrp, root->subsys_mask); 4253 /* let's create and online css's */
4475 if (err) 4254 for_each_subsys(ss, ssid) {
4476 goto err_destroy; 4255 if (root->subsys_mask & (1 << ssid)) {
4256 err = create_css(cgrp, ss);
4257 if (err)
4258 goto err_destroy;
4259 }
4260 }
4477 4261
4478 mutex_unlock(&cgroup_mutex); 4262 mutex_unlock(&cgroup_mutex);
4479 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 4263 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
4480 4264
4481 return 0; 4265 return 0;
4482 4266
4483err_free_all:
4484 for_each_root_subsys(root, ss) {
4485 struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
4486
4487 if (css) {
4488 percpu_ref_cancel_init(&css->refcnt);
4489 ss->css_free(css);
4490 }
4491 }
4492 mutex_unlock(&cgroup_mutex);
4493 /* Release the reference count that we took on the superblock */
4494 deactivate_super(sb);
4495err_free_id: 4267err_free_id:
4496 idr_remove(&root->cgroup_idr, cgrp->id); 4268 idr_remove(&root->cgroup_idr, cgrp->id);
4269 /* Release the reference count that we took on the superblock */
4270 deactivate_super(sb);
4271err_unlock:
4272 mutex_unlock(&cgroup_mutex);
4497err_free_name: 4273err_free_name:
4498 kfree(rcu_dereference_raw(cgrp->name)); 4274 kfree(rcu_dereference_raw(cgrp->name));
4499err_free_cgrp: 4275err_free_cgrp:
@@ -4501,14 +4277,6 @@ err_free_cgrp:
4501 return err; 4277 return err;
4502 4278
4503err_destroy: 4279err_destroy:
4504 for_each_root_subsys(root, ss) {
4505 struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
4506
4507 if (css) {
4508 percpu_ref_cancel_init(&css->refcnt);
4509 ss->css_free(css);
4510 }
4511 }
4512 cgroup_destroy_locked(cgrp); 4280 cgroup_destroy_locked(cgrp);
4513 mutex_unlock(&cgroup_mutex); 4281 mutex_unlock(&cgroup_mutex);
4514 mutex_unlock(&dentry->d_inode->i_mutex); 4282 mutex_unlock(&dentry->d_inode->i_mutex);
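
create_css() and the reworked cgroup_create() above both follow the usual goto-unwind error-handling idiom: acquire resources in order and, on failure, jump to the label that releases only what has been acquired so far, in reverse order; once the cgroup is published ("fully operational"), failures instead fall through to err_destroy and the normal destruction path. A compact, compilable sketch of that ladder with stand-in steps (nothing here is a real kernel helper; demo_step()/demo_undo() merely model calls such as percpu_ref_init()/percpu_ref_cancel_init()):

#include <linux/errno.h>
#include <linux/module.h>
#include <linux/printk.h>

static int demo_step(const char *what, int fail)
{
	pr_info("setup %s\n", what);
	return fail ? -ENOMEM : 0;
}

static void demo_undo(const char *what)
{
	pr_info("undo %s\n", what);
}

static int demo_setup(void)
{
	int err;

	err = demo_step("refcnt", 0);
	if (err)
		return err;			/* nothing to unwind yet */

	err = demo_step("interface files", 0);
	if (err)
		goto err_undo_refcnt;

	err = demo_step("online", 1);		/* simulate a late failure */
	if (err)
		goto err_undo_files;

	return 0;				/* object is now live */

err_undo_files:
	demo_undo("interface files");
err_undo_refcnt:
	demo_undo("refcnt");
	return err;
}

static int __init demo_init(void)
{
	return demo_setup();
}
module_init(demo_init);
MODULE_LICENSE("GPL");
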
@@ -4631,10 +4399,10 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4631 __releases(&cgroup_mutex) __acquires(&cgroup_mutex) 4399 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
4632{ 4400{
4633 struct dentry *d = cgrp->dentry; 4401 struct dentry *d = cgrp->dentry;
4634 struct cgroup_event *event, *tmp; 4402 struct cgroup_subsys_state *css;
4635 struct cgroup_subsys *ss;
4636 struct cgroup *child; 4403 struct cgroup *child;
4637 bool empty; 4404 bool empty;
4405 int ssid;
4638 4406
4639 lockdep_assert_held(&d->d_inode->i_mutex); 4407 lockdep_assert_held(&d->d_inode->i_mutex);
4640 lockdep_assert_held(&cgroup_mutex); 4408 lockdep_assert_held(&cgroup_mutex);
@@ -4670,12 +4438,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4670 * will be invoked to perform the rest of destruction once the 4438 * will be invoked to perform the rest of destruction once the
4671 * percpu refs of all css's are confirmed to be killed. 4439 * percpu refs of all css's are confirmed to be killed.
4672 */ 4440 */
4673 for_each_root_subsys(cgrp->root, ss) { 4441 for_each_css(css, ssid, cgrp)
4674 struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); 4442 kill_css(css);
4675
4676 if (css)
4677 kill_css(css);
4678 }
4679 4443
4680 /* 4444 /*
4681 * Mark @cgrp dead. This prevents further task migration and child 4445 * Mark @cgrp dead. This prevents further task migration and child
@@ -4710,18 +4474,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4710 dget(d); 4474 dget(d);
4711 cgroup_d_remove_dir(d); 4475 cgroup_d_remove_dir(d);
4712 4476
4713 /*
4714 * Unregister events and notify userspace.
4715 * Notify userspace about cgroup removing only after rmdir of cgroup
4716 * directory to avoid race between userspace and kernelspace.
4717 */
4718 spin_lock(&cgrp->event_list_lock);
4719 list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
4720 list_del_init(&event->list);
4721 schedule_work(&event->remove);
4722 }
4723 spin_unlock(&cgrp->event_list_lock);
4724
4725 return 0; 4477 return 0;
4726}; 4478};
4727 4479
@@ -4792,7 +4544,6 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4792 cgroup_init_cftsets(ss); 4544 cgroup_init_cftsets(ss);
4793 4545
4794 /* Create the top cgroup state for this subsystem */ 4546 /* Create the top cgroup state for this subsystem */
4795 list_add(&ss->sibling, &cgroup_dummy_root.subsys_list);
4796 ss->root = &cgroup_dummy_root; 4547 ss->root = &cgroup_dummy_root;
4797 css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss)); 4548 css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss));
4798 /* We don't handle early failures gracefully */ 4549 /* We don't handle early failures gracefully */
@@ -4866,6 +4617,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4866 cgroup_init_cftsets(ss); 4617 cgroup_init_cftsets(ss);
4867 4618
4868 mutex_lock(&cgroup_mutex); 4619 mutex_lock(&cgroup_mutex);
4620 mutex_lock(&cgroup_root_mutex);
4869 cgroup_subsys[ss->subsys_id] = ss; 4621 cgroup_subsys[ss->subsys_id] = ss;
4870 4622
4871 /* 4623 /*
@@ -4877,11 +4629,11 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4877 if (IS_ERR(css)) { 4629 if (IS_ERR(css)) {
4878 /* failure case - need to deassign the cgroup_subsys[] slot. */ 4630 /* failure case - need to deassign the cgroup_subsys[] slot. */
4879 cgroup_subsys[ss->subsys_id] = NULL; 4631 cgroup_subsys[ss->subsys_id] = NULL;
4632 mutex_unlock(&cgroup_root_mutex);
4880 mutex_unlock(&cgroup_mutex); 4633 mutex_unlock(&cgroup_mutex);
4881 return PTR_ERR(css); 4634 return PTR_ERR(css);
4882 } 4635 }
4883 4636
4884 list_add(&ss->sibling, &cgroup_dummy_root.subsys_list);
4885 ss->root = &cgroup_dummy_root; 4637 ss->root = &cgroup_dummy_root;
4886 4638
4887 /* our new subsystem will be attached to the dummy hierarchy. */ 4639 /* our new subsystem will be attached to the dummy hierarchy. */
@@ -4911,14 +4663,18 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4911 write_unlock(&css_set_lock); 4663 write_unlock(&css_set_lock);
4912 4664
4913 ret = online_css(css); 4665 ret = online_css(css);
4914 if (ret) 4666 if (ret) {
4667 ss->css_free(css);
4915 goto err_unload; 4668 goto err_unload;
4669 }
4916 4670
4917 /* success! */ 4671 /* success! */
4672 mutex_unlock(&cgroup_root_mutex);
4918 mutex_unlock(&cgroup_mutex); 4673 mutex_unlock(&cgroup_mutex);
4919 return 0; 4674 return 0;
4920 4675
4921err_unload: 4676err_unload:
4677 mutex_unlock(&cgroup_root_mutex);
4922 mutex_unlock(&cgroup_mutex); 4678 mutex_unlock(&cgroup_mutex);
4923 /* @ss can't be mounted here as try_module_get() would fail */ 4679 /* @ss can't be mounted here as try_module_get() would fail */
4924 cgroup_unload_subsys(ss); 4680 cgroup_unload_subsys(ss);
@@ -4937,6 +4693,7 @@ EXPORT_SYMBOL_GPL(cgroup_load_subsys);
4937void cgroup_unload_subsys(struct cgroup_subsys *ss) 4693void cgroup_unload_subsys(struct cgroup_subsys *ss)
4938{ 4694{
4939 struct cgrp_cset_link *link; 4695 struct cgrp_cset_link *link;
4696 struct cgroup_subsys_state *css;
4940 4697
4941 BUG_ON(ss->module == NULL); 4698 BUG_ON(ss->module == NULL);
4942 4699
@@ -4948,15 +4705,15 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4948 BUG_ON(ss->root != &cgroup_dummy_root); 4705 BUG_ON(ss->root != &cgroup_dummy_root);
4949 4706
4950 mutex_lock(&cgroup_mutex); 4707 mutex_lock(&cgroup_mutex);
4708 mutex_lock(&cgroup_root_mutex);
4951 4709
4952 offline_css(cgroup_css(cgroup_dummy_top, ss)); 4710 css = cgroup_css(cgroup_dummy_top, ss);
4711 if (css)
4712 offline_css(css);
4953 4713
4954 /* deassign the subsys_id */ 4714 /* deassign the subsys_id */
4955 cgroup_subsys[ss->subsys_id] = NULL; 4715 cgroup_subsys[ss->subsys_id] = NULL;
4956 4716
4957 /* remove subsystem from the dummy root's list of subsystems */
4958 list_del_init(&ss->sibling);
4959
4960 /* 4717 /*
4961 * disentangle the css from all css_sets attached to the dummy 4718 * disentangle the css from all css_sets attached to the dummy
4962 * top. as in loading, we need to pay our respects to the hashtable 4719 * top. as in loading, we need to pay our respects to the hashtable
@@ -4979,9 +4736,11 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4979 * need to free before marking as null because ss->css_free needs 4736 * need to free before marking as null because ss->css_free needs
4980 * the cgrp->subsys pointer to find their state. 4737 * the cgrp->subsys pointer to find their state.
4981 */ 4738 */
4982 ss->css_free(cgroup_css(cgroup_dummy_top, ss)); 4739 if (css)
4740 ss->css_free(css);
4983 RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL); 4741 RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL);
4984 4742
4743 mutex_unlock(&cgroup_root_mutex);
4985 mutex_unlock(&cgroup_mutex); 4744 mutex_unlock(&cgroup_mutex);
4986} 4745}
4987EXPORT_SYMBOL_GPL(cgroup_unload_subsys); 4746EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
@@ -5100,6 +4859,15 @@ static int __init cgroup_wq_init(void)
5100 */ 4859 */
5101 cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); 4860 cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
5102 BUG_ON(!cgroup_destroy_wq); 4861 BUG_ON(!cgroup_destroy_wq);
4862
4863 /*
4864 * Used to destroy pidlists and separate to serve as flush domain.
4865 * Cap @max_active to 1 too.
4866 */
4867 cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
4868 0, 1);
4869 BUG_ON(!cgroup_pidlist_destroy_wq);
4870
5103 return 0; 4871 return 0;
5104} 4872}
5105core_initcall(cgroup_wq_init); 4873core_initcall(cgroup_wq_init);
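
cgroup_pidlist_stop() earlier in this diff no longer frees the pidlist when the last reader finishes; it re-arms a delayed work on the new cgroup_pidlist_destroy_wq, so a reader that comes back quickly can reuse the cached list instead of rebuilding it. A self-contained sketch of that lazy-destroy pattern with mod_delayed_work() (all names are made up; allocation failure handling is kept minimal):

#include <linux/jiffies.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct demo_cache {
	struct delayed_work destroy_dwork;
	void *payload;
};

static struct workqueue_struct *demo_destroy_wq;

static void demo_cache_destroy_fn(struct work_struct *work)
{
	struct demo_cache *c = container_of(work, struct demo_cache,
					    destroy_dwork.work);

	kfree(c->payload);
	kfree(c);
}

/* Last user is done with @c: don't free immediately, (re)arm a delayed
 * work so a new user arriving soon can cancel it and reuse the cache --
 * the same idea as CGROUP_PIDLIST_DESTROY_DELAY above. */
static void demo_cache_put(struct demo_cache *c)
{
	mod_delayed_work(demo_destroy_wq, &c->destroy_dwork,
			 msecs_to_jiffies(1000));
}

static int __init demo_init(void)
{
	struct demo_cache *c;

	/* max_active == 1 so flush_workqueue() serializes all destruction */
	demo_destroy_wq = alloc_workqueue("demo_destroy", 0, 1);
	if (!demo_destroy_wq)
		return -ENOMEM;

	c = kzalloc(sizeof(*c), GFP_KERNEL);
	if (!c) {
		destroy_workqueue(demo_destroy_wq);
		return -ENOMEM;
	}
	INIT_DELAYED_WORK(&c->destroy_dwork, demo_cache_destroy_fn);
	c->payload = kzalloc(16, GFP_KERNEL);	/* check elided for brevity */

	demo_cache_put(c);		/* schedules destruction in ~1s */
	return 0;
}

static void __exit demo_exit(void)
{
	flush_workqueue(demo_destroy_wq);
	destroy_workqueue(demo_destroy_wq);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
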
@@ -5143,11 +4911,12 @@ int proc_cgroup_show(struct seq_file *m, void *v)
5143 for_each_active_root(root) { 4911 for_each_active_root(root) {
5144 struct cgroup_subsys *ss; 4912 struct cgroup_subsys *ss;
5145 struct cgroup *cgrp; 4913 struct cgroup *cgrp;
5146 int count = 0; 4914 int ssid, count = 0;
5147 4915
5148 seq_printf(m, "%d:", root->hierarchy_id); 4916 seq_printf(m, "%d:", root->hierarchy_id);
5149 for_each_root_subsys(root, ss) 4917 for_each_subsys(ss, ssid)
5150 seq_printf(m, "%s%s", count++ ? "," : "", ss->name); 4918 if (root->subsys_mask & (1 << ssid))
4919 seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
5151 if (strlen(root->name)) 4920 if (strlen(root->name))
5152 seq_printf(m, "%sname=%s", count ? "," : "", 4921 seq_printf(m, "%sname=%s", count ? "," : "",
5153 root->name); 4922 root->name);
@@ -5488,16 +5257,16 @@ __setup("cgroup_disable=", cgroup_disable);
5488 * @dentry: directory dentry of interest 5257 * @dentry: directory dentry of interest
5489 * @ss: subsystem of interest 5258 * @ss: subsystem of interest
5490 * 5259 *
5491 * Must be called under RCU read lock. The caller is responsible for 5260 * Must be called under cgroup_mutex or RCU read lock. The caller is
5492 * pinning the returned css if it needs to be accessed outside the RCU 5261 * responsible for pinning the returned css if it needs to be accessed
5493 * critical section. 5262 * outside the critical section.
5494 */ 5263 */
5495struct cgroup_subsys_state *css_from_dir(struct dentry *dentry, 5264struct cgroup_subsys_state *css_from_dir(struct dentry *dentry,
5496 struct cgroup_subsys *ss) 5265 struct cgroup_subsys *ss)
5497{ 5266{
5498 struct cgroup *cgrp; 5267 struct cgroup *cgrp;
5499 5268
5500 WARN_ON_ONCE(!rcu_read_lock_held()); 5269 cgroup_assert_mutex_or_rcu_locked();
5501 5270
5502 /* is @dentry a cgroup dir? */ 5271 /* is @dentry a cgroup dir? */
5503 if (!dentry->d_inode || 5272 if (!dentry->d_inode ||
@@ -5520,9 +5289,7 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
5520{ 5289{
5521 struct cgroup *cgrp; 5290 struct cgroup *cgrp;
5522 5291
5523 rcu_lockdep_assert(rcu_read_lock_held() || 5292 cgroup_assert_mutex_or_rcu_locked();
5524 lockdep_is_held(&cgroup_mutex),
5525 "css_from_id() needs proper protection");
5526 5293
5527 cgrp = idr_find(&ss->root->cgroup_idr, id); 5294 cgrp = idr_find(&ss->root->cgroup_idr, id);
5528 if (cgrp) 5295 if (cgrp)
@@ -5570,9 +5337,7 @@ static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
5570 return count; 5337 return count;
5571} 5338}
5572 5339
5573static int current_css_set_cg_links_read(struct cgroup_subsys_state *css, 5340static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
5574 struct cftype *cft,
5575 struct seq_file *seq)
5576{ 5341{
5577 struct cgrp_cset_link *link; 5342 struct cgrp_cset_link *link;
5578 struct css_set *cset; 5343 struct css_set *cset;
@@ -5597,9 +5362,9 @@ static int current_css_set_cg_links_read(struct cgroup_subsys_state *css,
5597} 5362}
5598 5363
5599#define MAX_TASKS_SHOWN_PER_CSS 25 5364#define MAX_TASKS_SHOWN_PER_CSS 25
5600static int cgroup_css_links_read(struct cgroup_subsys_state *css, 5365static int cgroup_css_links_read(struct seq_file *seq, void *v)
5601 struct cftype *cft, struct seq_file *seq)
5602{ 5366{
5367 struct cgroup_subsys_state *css = seq_css(seq);
5603 struct cgrp_cset_link *link; 5368 struct cgrp_cset_link *link;
5604 5369
5605 read_lock(&css_set_lock); 5370 read_lock(&css_set_lock);
@@ -5645,12 +5410,12 @@ static struct cftype debug_files[] = {
5645 5410
5646 { 5411 {
5647 .name = "current_css_set_cg_links", 5412 .name = "current_css_set_cg_links",
5648 .read_seq_string = current_css_set_cg_links_read, 5413 .seq_show = current_css_set_cg_links_read,
5649 }, 5414 },
5650 5415
5651 { 5416 {
5652 .name = "cgroup_css_links", 5417 .name = "cgroup_css_links",
5653 .read_seq_string = cgroup_css_links_read, 5418 .seq_show = cgroup_css_links_read,
5654 }, 5419 },
5655 5420
5656 { 5421 {
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index f0ff64d0ebaa..6c3154e477f6 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -301,10 +301,9 @@ out_unlock:
301 spin_unlock_irq(&freezer->lock); 301 spin_unlock_irq(&freezer->lock);
302} 302}
303 303
304static int freezer_read(struct cgroup_subsys_state *css, struct cftype *cft, 304static int freezer_read(struct seq_file *m, void *v)
305 struct seq_file *m)
306{ 305{
307 struct cgroup_subsys_state *pos; 306 struct cgroup_subsys_state *css = seq_css(m), *pos;
308 307
309 rcu_read_lock(); 308 rcu_read_lock();
310 309
@@ -458,7 +457,7 @@ static struct cftype files[] = {
458 { 457 {
459 .name = "state", 458 .name = "state",
460 .flags = CFTYPE_NOT_ON_ROOT, 459 .flags = CFTYPE_NOT_ON_ROOT,
461 .read_seq_string = freezer_read, 460 .seq_show = freezer_read,
462 .write_string = freezer_write, 461 .write_string = freezer_write,
463 }, 462 },
464 { 463 {
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index e5f3917aa05b..6cb20d2e7ee0 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -53,10 +53,10 @@ void context_tracking_user_enter(void)
53 /* 53 /*
54 * Repeat the user_enter() check here because some archs may be calling 54 * Repeat the user_enter() check here because some archs may be calling
55 * this from asm and if no CPU needs context tracking, they shouldn't 55 * this from asm and if no CPU needs context tracking, they shouldn't
56 * go further. Repeat the check here until they support the static key 56 * go further. Repeat the check here until they support the inline static
57 * check. 57 * key check.
58 */ 58 */
59 if (!static_key_false(&context_tracking_enabled)) 59 if (!context_tracking_is_enabled())
60 return; 60 return;
61 61
62 /* 62 /*
@@ -160,7 +160,7 @@ void context_tracking_user_exit(void)
160{ 160{
161 unsigned long flags; 161 unsigned long flags;
162 162
163 if (!static_key_false(&context_tracking_enabled)) 163 if (!context_tracking_is_enabled())
164 return; 164 return;
165 165
166 if (in_interrupt()) 166 if (in_interrupt())
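
The context-tracking hunks replace the open-coded static_key_false() tests with a context_tracking_is_enabled() helper whose definition is not part of this diff. Presumably it is a thin inline wrapper over the same static key, along the lines of the sketch below (an assumption, not copied from the patch), which keeps the static-key fast path inlined at every call site:

/* Assumed shape of the helper, not taken from this diff. */
static inline bool context_tracking_is_enabled(void)
{
	return static_key_false(&context_tracking_enabled);
}
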
diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c
index 988573a9a387..277f494c2a9a 100644
--- a/kernel/cpu/idle.c
+++ b/kernel/cpu/idle.c
@@ -105,14 +105,17 @@ static void cpu_idle_loop(void)
105 __current_set_polling(); 105 __current_set_polling();
106 } 106 }
107 arch_cpu_idle_exit(); 107 arch_cpu_idle_exit();
108 /*
109 * We need to test and propagate the TIF_NEED_RESCHED
110 * bit here because we might not have send the
111 * reschedule IPI to idle tasks.
112 */
113 if (tif_need_resched())
114 set_preempt_need_resched();
115 } 108 }
109
110 /*
111 * Since we fell out of the loop above, we know
112 * TIF_NEED_RESCHED must be set, propagate it into
113 * PREEMPT_NEED_RESCHED.
114 *
115 * This is required because for polling idle loops we will
116 * not have had an IPI to fold the state for us.
117 */
118 preempt_set_need_resched();
116 tick_nohz_idle_exit(); 119 tick_nohz_idle_exit();
117 schedule_preempt_disabled(); 120 schedule_preempt_disabled();
118 } 121 }
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 4772034b4b17..e6b1b66afe52 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -974,12 +974,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
974 * Temporarilly set tasks mems_allowed to target nodes of migration, 974 * Temporarilly set tasks mems_allowed to target nodes of migration,
975 * so that the migration code can allocate pages on these nodes. 975 * so that the migration code can allocate pages on these nodes.
976 * 976 *
977 * Call holding cpuset_mutex, so current's cpuset won't change
978 * during this call, as manage_mutex holds off any cpuset_attach()
979 * calls. Therefore we don't need to take task_lock around the
980 * call to guarantee_online_mems(), as we know no one is changing
981 * our task's cpuset.
982 *
983 * While the mm_struct we are migrating is typically from some 977 * While the mm_struct we are migrating is typically from some
984 * other task, the task_struct mems_allowed that we are hacking 978 * other task, the task_struct mems_allowed that we are hacking
985 * is for our current task, which must allocate new pages for that 979 * is for our current task, which must allocate new pages for that
@@ -996,8 +990,10 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
996 990
997 do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); 991 do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
998 992
993 rcu_read_lock();
999 mems_cs = effective_nodemask_cpuset(task_cs(tsk)); 994 mems_cs = effective_nodemask_cpuset(task_cs(tsk));
1000 guarantee_online_mems(mems_cs, &tsk->mems_allowed); 995 guarantee_online_mems(mems_cs, &tsk->mems_allowed);
996 rcu_read_unlock();
1001} 997}
1002 998
1003/* 999/*
@@ -1731,66 +1727,41 @@ out_unlock:
1731 * used, list of ranges of sequential numbers, is variable length, 1727 * used, list of ranges of sequential numbers, is variable length,
1732 * and since these maps can change value dynamically, one could read 1728 * and since these maps can change value dynamically, one could read
1733 * gibberish by doing partial reads while a list was changing. 1729 * gibberish by doing partial reads while a list was changing.
1734 * A single large read to a buffer that crosses a page boundary is
1735 * ok, because the result being copied to user land is not recomputed
1736 * across a page fault.
1737 */ 1730 */
1738 1731static int cpuset_common_seq_show(struct seq_file *sf, void *v)
1739static size_t cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
1740{ 1732{
1741 size_t count; 1733 struct cpuset *cs = css_cs(seq_css(sf));
1734 cpuset_filetype_t type = seq_cft(sf)->private;
1735 ssize_t count;
1736 char *buf, *s;
1737 int ret = 0;
1742 1738
1743 mutex_lock(&callback_mutex); 1739 count = seq_get_buf(sf, &buf);
1744 count = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed); 1740 s = buf;
1745 mutex_unlock(&callback_mutex);
1746
1747 return count;
1748}
1749
1750static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs)
1751{
1752 size_t count;
1753 1741
1754 mutex_lock(&callback_mutex); 1742 mutex_lock(&callback_mutex);
1755 count = nodelist_scnprintf(page, PAGE_SIZE, cs->mems_allowed);
1756 mutex_unlock(&callback_mutex);
1757
1758 return count;
1759}
1760
1761static ssize_t cpuset_common_file_read(struct cgroup_subsys_state *css,
1762 struct cftype *cft, struct file *file,
1763 char __user *buf, size_t nbytes,
1764 loff_t *ppos)
1765{
1766 struct cpuset *cs = css_cs(css);
1767 cpuset_filetype_t type = cft->private;
1768 char *page;
1769 ssize_t retval = 0;
1770 char *s;
1771
1772 if (!(page = (char *)__get_free_page(GFP_TEMPORARY)))
1773 return -ENOMEM;
1774
1775 s = page;
1776 1743
1777 switch (type) { 1744 switch (type) {
1778 case FILE_CPULIST: 1745 case FILE_CPULIST:
1779 s += cpuset_sprintf_cpulist(s, cs); 1746 s += cpulist_scnprintf(s, count, cs->cpus_allowed);
1780 break; 1747 break;
1781 case FILE_MEMLIST: 1748 case FILE_MEMLIST:
1782 s += cpuset_sprintf_memlist(s, cs); 1749 s += nodelist_scnprintf(s, count, cs->mems_allowed);
1783 break; 1750 break;
1784 default: 1751 default:
1785 retval = -EINVAL; 1752 ret = -EINVAL;
1786 goto out; 1753 goto out_unlock;
1787 } 1754 }
1788 *s++ = '\n';
1789 1755
1790 retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page); 1756 if (s < buf + count - 1) {
1791out: 1757 *s++ = '\n';
1792 free_page((unsigned long)page); 1758 seq_commit(sf, s - buf);
1793 return retval; 1759 } else {
1760 seq_commit(sf, -1);
1761 }
1762out_unlock:
1763 mutex_unlock(&callback_mutex);
1764 return ret;
1794} 1765}
1795 1766
1796static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) 1767static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
@@ -1847,7 +1818,7 @@ static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
1847static struct cftype files[] = { 1818static struct cftype files[] = {
1848 { 1819 {
1849 .name = "cpus", 1820 .name = "cpus",
1850 .read = cpuset_common_file_read, 1821 .seq_show = cpuset_common_seq_show,
1851 .write_string = cpuset_write_resmask, 1822 .write_string = cpuset_write_resmask,
1852 .max_write_len = (100U + 6 * NR_CPUS), 1823 .max_write_len = (100U + 6 * NR_CPUS),
1853 .private = FILE_CPULIST, 1824 .private = FILE_CPULIST,
@@ -1855,7 +1826,7 @@ static struct cftype files[] = {
1855 1826
1856 { 1827 {
1857 .name = "mems", 1828 .name = "mems",
1858 .read = cpuset_common_file_read, 1829 .seq_show = cpuset_common_seq_show,
1859 .write_string = cpuset_write_resmask, 1830 .write_string = cpuset_write_resmask,
1860 .max_write_len = (100U + 6 * MAX_NUMNODES), 1831 .max_write_len = (100U + 6 * MAX_NUMNODES),
1861 .private = FILE_MEMLIST, 1832 .private = FILE_MEMLIST,
@@ -2511,9 +2482,9 @@ int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
2511 2482
2512 task_lock(current); 2483 task_lock(current);
2513 cs = nearest_hardwall_ancestor(task_cs(current)); 2484 cs = nearest_hardwall_ancestor(task_cs(current));
2485 allowed = node_isset(node, cs->mems_allowed);
2514 task_unlock(current); 2486 task_unlock(current);
2515 2487
2516 allowed = node_isset(node, cs->mems_allowed);
2517 mutex_unlock(&callback_mutex); 2488 mutex_unlock(&callback_mutex);
2518 return allowed; 2489 return allowed;
2519} 2490}
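
cpuset_common_seq_show() above writes straight into the seq_file buffer obtained with seq_get_buf() and then either commits the bytes it produced or reports overflow with seq_commit(sf, -1), which makes seq_file retry the show with a larger buffer. The pattern in isolation (demo_render() is a hypothetical stand-in for cpulist_scnprintf()/nodelist_scnprintf()):

#include <linux/kernel.h>
#include <linux/seq_file.h>

static size_t demo_render(char *buf, size_t size)
{
	return scnprintf(buf, size, "0-3,8");
}

static int demo_seq_show(struct seq_file *sf, void *v)
{
	char *buf, *s;
	size_t count = seq_get_buf(sf, &buf);

	s = buf + demo_render(buf, count);

	if (s < buf + count - 1) {
		*s++ = '\n';
		seq_commit(sf, s - buf);	/* output fit: accept it */
	} else {
		seq_commit(sf, -1);		/* overflow: seq_file retries
						 * with a bigger buffer */
	}
	return 0;
}
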
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 7d2f35e5df2f..334b3980ffc1 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -736,7 +736,8 @@ int kgdb_nmicallback(int cpu, void *regs)
736 return 1; 736 return 1;
737} 737}
738 738
739int kgdb_nmicallin(int cpu, int trapnr, void *regs, atomic_t *send_ready) 739int kgdb_nmicallin(int cpu, int trapnr, void *regs, int err_code,
740 atomic_t *send_ready)
740{ 741{
741#ifdef CONFIG_SMP 742#ifdef CONFIG_SMP
742 if (!kgdb_io_ready(0) || !send_ready) 743 if (!kgdb_io_ready(0) || !send_ready)
@@ -750,7 +751,7 @@ int kgdb_nmicallin(int cpu, int trapnr, void *regs, atomic_t *send_ready)
750 ks->cpu = cpu; 751 ks->cpu = cpu;
751 ks->ex_vector = trapnr; 752 ks->ex_vector = trapnr;
752 ks->signo = SIGTRAP; 753 ks->signo = SIGTRAP;
753 ks->err_code = KGDB_KDB_REASON_SYSTEM_NMI; 754 ks->err_code = err_code;
754 ks->linux_regs = regs; 755 ks->linux_regs = regs;
755 ks->send_ready = send_ready; 756 ks->send_ready = send_ready;
756 kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER); 757 kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER);
diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h
index 572aa4f5677c..127d9bc49fb4 100644
--- a/kernel/debug/debug_core.h
+++ b/kernel/debug/debug_core.h
@@ -75,13 +75,11 @@ extern int kdb_stub(struct kgdb_state *ks);
75extern int kdb_parse(const char *cmdstr); 75extern int kdb_parse(const char *cmdstr);
76extern int kdb_common_init_state(struct kgdb_state *ks); 76extern int kdb_common_init_state(struct kgdb_state *ks);
77extern int kdb_common_deinit_state(void); 77extern int kdb_common_deinit_state(void);
78#define KGDB_KDB_REASON_SYSTEM_NMI KDB_REASON_SYSTEM_NMI
79#else /* ! CONFIG_KGDB_KDB */ 78#else /* ! CONFIG_KGDB_KDB */
80static inline int kdb_stub(struct kgdb_state *ks) 79static inline int kdb_stub(struct kgdb_state *ks)
81{ 80{
82 return DBG_PASS_EVENT; 81 return DBG_PASS_EVENT;
83} 82}
84#define KGDB_KDB_REASON_SYSTEM_NMI 0
85#endif /* CONFIG_KGDB_KDB */ 83#endif /* CONFIG_KGDB_KDB */
86 84
87#endif /* _DEBUG_CORE_H_ */ 85#endif /* _DEBUG_CORE_H_ */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f5744010a8d2..fa0b2d4ad83c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -119,7 +119,8 @@ static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
119 119
120#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ 120#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
121 PERF_FLAG_FD_OUTPUT |\ 121 PERF_FLAG_FD_OUTPUT |\
122 PERF_FLAG_PID_CGROUP) 122 PERF_FLAG_PID_CGROUP |\
123 PERF_FLAG_FD_CLOEXEC)
123 124
124/* 125/*
125 * branch priv levels that need permission checks 126 * branch priv levels that need permission checks
@@ -3542,7 +3543,7 @@ static void perf_event_for_each(struct perf_event *event,
3542static int perf_event_period(struct perf_event *event, u64 __user *arg) 3543static int perf_event_period(struct perf_event *event, u64 __user *arg)
3543{ 3544{
3544 struct perf_event_context *ctx = event->ctx; 3545 struct perf_event_context *ctx = event->ctx;
3545 int ret = 0; 3546 int ret = 0, active;
3546 u64 value; 3547 u64 value;
3547 3548
3548 if (!is_sampling_event(event)) 3549 if (!is_sampling_event(event))
@@ -3566,6 +3567,20 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
3566 event->attr.sample_period = value; 3567 event->attr.sample_period = value;
3567 event->hw.sample_period = value; 3568 event->hw.sample_period = value;
3568 } 3569 }
3570
3571 active = (event->state == PERF_EVENT_STATE_ACTIVE);
3572 if (active) {
3573 perf_pmu_disable(ctx->pmu);
3574 event->pmu->stop(event, PERF_EF_UPDATE);
3575 }
3576
3577 local64_set(&event->hw.period_left, 0);
3578
3579 if (active) {
3580 event->pmu->start(event, PERF_EF_RELOAD);
3581 perf_pmu_enable(ctx->pmu);
3582 }
3583
3569unlock: 3584unlock:
3570 raw_spin_unlock_irq(&ctx->lock); 3585 raw_spin_unlock_irq(&ctx->lock);
3571 3586
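
The perf_event_period() change above stops and restarts an active event and clears hw.period_left, so a newly requested sample period takes effect immediately instead of after the old period runs out. perf_event_period() is reached from userspace through the PERF_EVENT_IOC_PERIOD ioctl (not shown in this diff); a minimal userspace sketch of issuing it, assuming the event was opened in period mode (attr.freq == 0) and with error handling kept short:

#include <linux/perf_event.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	struct perf_event_attr attr;
	uint64_t new_period = 200000;	/* arbitrary example value */
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.sample_period = 100000;
	attr.disabled = 1;

	/* measure this process on any CPU */
	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}

	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);

	/* With the change above, the new period applies right away. */
	if (ioctl(fd, PERF_EVENT_IOC_PERIOD, &new_period))
		perror("PERF_EVENT_IOC_PERIOD");

	close(fd);
	return 0;
}
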
@@ -6670,6 +6685,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
6670 INIT_LIST_HEAD(&event->event_entry); 6685 INIT_LIST_HEAD(&event->event_entry);
6671 INIT_LIST_HEAD(&event->sibling_list); 6686 INIT_LIST_HEAD(&event->sibling_list);
6672 INIT_LIST_HEAD(&event->rb_entry); 6687 INIT_LIST_HEAD(&event->rb_entry);
6688 INIT_LIST_HEAD(&event->active_entry);
6689 INIT_HLIST_NODE(&event->hlist_entry);
6690
6673 6691
6674 init_waitqueue_head(&event->waitq); 6692 init_waitqueue_head(&event->waitq);
6675 init_irq_work(&event->pending, perf_pending_event); 6693 init_irq_work(&event->pending, perf_pending_event);
@@ -6980,6 +6998,7 @@ SYSCALL_DEFINE5(perf_event_open,
6980 int event_fd; 6998 int event_fd;
6981 int move_group = 0; 6999 int move_group = 0;
6982 int err; 7000 int err;
7001 int f_flags = O_RDWR;
6983 7002
6984 /* for future expandability... */ 7003 /* for future expandability... */
6985 if (flags & ~PERF_FLAG_ALL) 7004 if (flags & ~PERF_FLAG_ALL)
@@ -7008,7 +7027,10 @@ SYSCALL_DEFINE5(perf_event_open,
7008 if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1)) 7027 if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
7009 return -EINVAL; 7028 return -EINVAL;
7010 7029
7011 event_fd = get_unused_fd(); 7030 if (flags & PERF_FLAG_FD_CLOEXEC)
7031 f_flags |= O_CLOEXEC;
7032
7033 event_fd = get_unused_fd_flags(f_flags);
7012 if (event_fd < 0) 7034 if (event_fd < 0)
7013 return event_fd; 7035 return event_fd;
7014 7036
@@ -7130,7 +7152,8 @@ SYSCALL_DEFINE5(perf_event_open,
7130 goto err_context; 7152 goto err_context;
7131 } 7153 }
7132 7154
7133 event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR); 7155 event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
7156 f_flags);
7134 if (IS_ERR(event_file)) { 7157 if (IS_ERR(event_file)) {
7135 err = PTR_ERR(event_file); 7158 err = PTR_ERR(event_file);
7136 goto err_context; 7159 goto err_context;
@@ -7833,14 +7856,14 @@ static void perf_pmu_rotate_stop(struct pmu *pmu)
7833static void __perf_event_exit_context(void *__info) 7856static void __perf_event_exit_context(void *__info)
7834{ 7857{
7835 struct perf_event_context *ctx = __info; 7858 struct perf_event_context *ctx = __info;
7836 struct perf_event *event, *tmp; 7859 struct perf_event *event;
7837 7860
7838 perf_pmu_rotate_stop(ctx->pmu); 7861 perf_pmu_rotate_stop(ctx->pmu);
7839 7862
7840 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) 7863 rcu_read_lock();
7841 __perf_remove_from_context(event); 7864 list_for_each_entry_rcu(event, &ctx->event_list, event_entry)
7842 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
7843 __perf_remove_from_context(event); 7865 __perf_remove_from_context(event);
7866 rcu_read_unlock();
7844} 7867}
7845 7868
7846static void perf_event_exit_cpu_context(int cpu) 7869static void perf_event_exit_cpu_context(int cpu)
@@ -7864,11 +7887,11 @@ static void perf_event_exit_cpu(int cpu)
7864{ 7887{
7865 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); 7888 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
7866 7889
7890 perf_event_exit_cpu_context(cpu);
7891
7867 mutex_lock(&swhash->hlist_mutex); 7892 mutex_lock(&swhash->hlist_mutex);
7868 swevent_hlist_release(swhash); 7893 swevent_hlist_release(swhash);
7869 mutex_unlock(&swhash->hlist_mutex); 7894 mutex_unlock(&swhash->hlist_mutex);
7870
7871 perf_event_exit_cpu_context(cpu);
7872} 7895}
7873#else 7896#else
7874static inline void perf_event_exit_cpu(int cpu) { } 7897static inline void perf_event_exit_cpu(int cpu) { }
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index e8b168af135b..146a5792b1d2 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -61,19 +61,20 @@ again:
61 * 61 *
62 * kernel user 62 * kernel user
63 * 63 *
64 * READ ->data_tail READ ->data_head 64 * if (LOAD ->data_tail) { LOAD ->data_head
65 * smp_mb() (A) smp_rmb() (C) 65 * (A) smp_rmb() (C)
66 * WRITE $data READ $data 66 * STORE $data LOAD $data
67 * smp_wmb() (B) smp_mb() (D) 67 * smp_wmb() (B) smp_mb() (D)
68 * STORE ->data_head WRITE ->data_tail 68 * STORE ->data_head STORE ->data_tail
69 * }
69 * 70 *
70 * Where A pairs with D, and B pairs with C. 71 * Where A pairs with D, and B pairs with C.
71 * 72 *
72 * I don't think A needs to be a full barrier because we won't in fact 73 * In our case (A) is a control dependency that separates the load of
73 * write data until we see the store from userspace. So we simply don't 74 * the ->data_tail and the stores of $data. In case ->data_tail
74 * issue the data WRITE until we observe it. Be conservative for now. 75 * indicates there is no room in the buffer to store $data we do not.
75 * 76 *
76 * OTOH, D needs to be a full barrier since it separates the data READ 77 * D needs to be a full barrier since it separates the data READ
77 * from the tail WRITE. 78 * from the tail WRITE.
78 * 79 *
79 * For B a WMB is sufficient since it separates two WRITEs, and for C 80 * For B a WMB is sufficient since it separates two WRITEs, and for C
@@ -81,7 +82,7 @@ again:
81 * 82 *
82 * See perf_output_begin(). 83 * See perf_output_begin().
83 */ 84 */
84 smp_wmb(); 85 smp_wmb(); /* B, matches C */
85 rb->user_page->data_head = head; 86 rb->user_page->data_head = head;
86 87
87 /* 88 /*
@@ -144,17 +145,26 @@ int perf_output_begin(struct perf_output_handle *handle,
144 if (!rb->overwrite && 145 if (!rb->overwrite &&
145 unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size)) 146 unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size))
146 goto fail; 147 goto fail;
148
149 /*
150 * The above forms a control dependency barrier separating the
151 * @tail load above from the data stores below. Since the @tail
152 * load is required to compute the branch to fail below.
153 *
154 * A, matches D; the full memory barrier userspace SHOULD issue
155 * after reading the data and before storing the new tail
156 * position.
157 *
158 * See perf_output_put_handle().
159 */
160
147 head += size; 161 head += size;
148 } while (local_cmpxchg(&rb->head, offset, head) != offset); 162 } while (local_cmpxchg(&rb->head, offset, head) != offset);
149 163
150 /* 164 /*
151 * Separate the userpage->tail read from the data stores below. 165 * We rely on the implied barrier() by local_cmpxchg() to ensure
152 * Matches the MB userspace SHOULD issue after reading the data 166 * none of the data stores below can be lifted up by the compiler.
153 * and before storing the new tail position.
154 *
155 * See perf_output_put_handle().
156 */ 167 */
157 smp_mb();
158 168
159 if (unlikely(head - local_read(&rb->wakeup) > rb->watermark)) 169 if (unlikely(head - local_read(&rb->wakeup) > rb->watermark))
160 local_add(rb->watermark, &rb->wakeup); 170 local_add(rb->watermark, &rb->wakeup);
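
The rewritten comment above spells out the kernel/user ordering contract for the mmap'ed ring buffer: the kernel's data stores and ->data_head store (B) pair with the reader's read barrier (C), and the reader's full barrier before storing ->data_tail (D) pairs with the kernel's control dependency on the tail load (A). A hedged sketch of the reader side of that contract, using GCC/C11 atomics as stand-ins for smp_rmb()/smp_mb() on a 64-bit build; the field layout comes from struct perf_event_mmap_page, everything else is illustrative and record-wrapping is glossed over:

#include <linux/perf_event.h>
#include <stdint.h>

/* One drain pass over a perf mmap ring buffer.  @meta is the first
 * mmap'ed page, @data the data pages that follow it, @data_size their
 * total size (a power of two). */
static void demo_drain(struct perf_event_mmap_page *meta,
		       const unsigned char *data, uint64_t data_size)
{
	uint64_t head, tail = meta->data_tail;	/* only we write the tail */

	/* LOAD ->data_head with acquire semantics: approximates the
	 * LOAD + smp_rmb() (C) in the comment above. */
	head = __atomic_load_n(&meta->data_head, __ATOMIC_ACQUIRE);

	while (tail != head) {
		const struct perf_event_header *eh =
			(const void *)(data + (tail & (data_size - 1)));

		/* ... consume the record at eh here; real code must copy
		 * out records that wrap around the buffer end ... */
		tail += eh->size;
	}

	/* (D): make sure all reads of $data complete before the tail
	 * store that tells the kernel the space may be reused. */
	__atomic_store_n(&meta->data_tail, tail, __ATOMIC_SEQ_CST);
}
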
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 24b7d6ca871b..307d87c0991a 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -73,6 +73,17 @@ struct uprobe {
73 struct inode *inode; /* Also hold a ref to inode */ 73 struct inode *inode; /* Also hold a ref to inode */
74 loff_t offset; 74 loff_t offset;
75 unsigned long flags; 75 unsigned long flags;
76
77 /*
78 * The generic code assumes that it has two members of unknown type
79 * owned by the arch-specific code:
80 *
81 * insn - copy_insn() saves the original instruction here for
82 * arch_uprobe_analyze_insn().
83 *
84 * ixol - potentially modified instruction to execute out of
85 * line, copied to xol_area by xol_get_insn_slot().
86 */
76 struct arch_uprobe arch; 87 struct arch_uprobe arch;
77}; 88};
78 89
@@ -86,6 +97,29 @@ struct return_instance {
86}; 97};
87 98
88/* 99/*
100 * Execute out of line area: anonymous executable mapping installed
101 * by the probed task to execute the copy of the original instruction
102 * mangled by set_swbp().
103 *
104 * On a breakpoint hit, thread contests for a slot. It frees the
105 * slot after singlestep. Currently a fixed number of slots are
106 * allocated.
107 */
108struct xol_area {
109 wait_queue_head_t wq; /* if all slots are busy */
110 atomic_t slot_count; /* number of in-use slots */
111 unsigned long *bitmap; /* 0 = free slot */
112 struct page *page;
113
114 /*
115 * We keep the vma's vm_start rather than a pointer to the vma
116 * itself. The probed process or a naughty kernel module could make
117 * the vma go away, and we must handle that reasonably gracefully.
118 */
119 unsigned long vaddr; /* Page(s) of instruction slots */
120};
121
122/*
89 * valid_vma: Verify if the specified vma is an executable vma 123 * valid_vma: Verify if the specified vma is an executable vma
90 * Relax restrictions while unregistering: vm_flags might have 124 * Relax restrictions while unregistering: vm_flags might have
91 * changed after breakpoint was inserted. 125 * changed after breakpoint was inserted.
@@ -330,7 +364,7 @@ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned
330int __weak 364int __weak
331set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) 365set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
332{ 366{
333 return uprobe_write_opcode(mm, vaddr, *(uprobe_opcode_t *)auprobe->insn); 367 return uprobe_write_opcode(mm, vaddr, *(uprobe_opcode_t *)&auprobe->insn);
334} 368}
335 369
336static int match_uprobe(struct uprobe *l, struct uprobe *r) 370static int match_uprobe(struct uprobe *l, struct uprobe *r)
@@ -529,8 +563,8 @@ static int copy_insn(struct uprobe *uprobe, struct file *filp)
529{ 563{
530 struct address_space *mapping = uprobe->inode->i_mapping; 564 struct address_space *mapping = uprobe->inode->i_mapping;
531 loff_t offs = uprobe->offset; 565 loff_t offs = uprobe->offset;
532 void *insn = uprobe->arch.insn; 566 void *insn = &uprobe->arch.insn;
533 int size = MAX_UINSN_BYTES; 567 int size = sizeof(uprobe->arch.insn);
534 int len, err = -EIO; 568 int len, err = -EIO;
535 569
536 /* Copy only available bytes, -EIO if nothing was read */ 570 /* Copy only available bytes, -EIO if nothing was read */
@@ -569,7 +603,7 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
569 goto out; 603 goto out;
570 604
571 ret = -ENOTSUPP; 605 ret = -ENOTSUPP;
572 if (is_trap_insn((uprobe_opcode_t *)uprobe->arch.insn)) 606 if (is_trap_insn((uprobe_opcode_t *)&uprobe->arch.insn))
573 goto out; 607 goto out;
574 608
575 ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr); 609 ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr);
@@ -1264,7 +1298,7 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
1264 1298
1265 /* Initialize the slot */ 1299 /* Initialize the slot */
1266 copy_to_page(area->page, xol_vaddr, 1300 copy_to_page(area->page, xol_vaddr,
1267 uprobe->arch.ixol, sizeof(uprobe->arch.ixol)); 1301 &uprobe->arch.ixol, sizeof(uprobe->arch.ixol));
1268 /* 1302 /*
1269 * We probably need flush_icache_user_range() but it needs vma. 1303 * We probably need flush_icache_user_range() but it needs vma.
1270 * This should work on supported architectures too. 1304 * This should work on supported architectures too.
@@ -1403,12 +1437,10 @@ static void uprobe_warn(struct task_struct *t, const char *msg)
1403 1437
1404static void dup_xol_work(struct callback_head *work) 1438static void dup_xol_work(struct callback_head *work)
1405{ 1439{
1406 kfree(work);
1407
1408 if (current->flags & PF_EXITING) 1440 if (current->flags & PF_EXITING)
1409 return; 1441 return;
1410 1442
1411 if (!__create_xol_area(current->utask->vaddr)) 1443 if (!__create_xol_area(current->utask->dup_xol_addr))
1412 uprobe_warn(current, "dup xol area"); 1444 uprobe_warn(current, "dup xol area");
1413} 1445}
1414 1446
@@ -1419,7 +1451,6 @@ void uprobe_copy_process(struct task_struct *t, unsigned long flags)
1419{ 1451{
1420 struct uprobe_task *utask = current->utask; 1452 struct uprobe_task *utask = current->utask;
1421 struct mm_struct *mm = current->mm; 1453 struct mm_struct *mm = current->mm;
1422 struct callback_head *work;
1423 struct xol_area *area; 1454 struct xol_area *area;
1424 1455
1425 t->utask = NULL; 1456 t->utask = NULL;
@@ -1441,14 +1472,9 @@ void uprobe_copy_process(struct task_struct *t, unsigned long flags)
1441 if (mm == t->mm) 1472 if (mm == t->mm)
1442 return; 1473 return;
1443 1474
1444 /* TODO: move it into the union in uprobe_task */ 1475 t->utask->dup_xol_addr = area->vaddr;
1445 work = kmalloc(sizeof(*work), GFP_KERNEL); 1476 init_task_work(&t->utask->dup_xol_work, dup_xol_work);
1446 if (!work) 1477 task_work_add(t, &t->utask->dup_xol_work, true);
1447 return uprobe_warn(t, "dup xol area");
1448
1449 t->utask->vaddr = area->vaddr;
1450 init_task_work(work, dup_xol_work);
1451 task_work_add(t, work, true);
1452} 1478}
1453 1479
1454/* 1480/*
@@ -1828,6 +1854,10 @@ static void handle_swbp(struct pt_regs *regs)
1828 if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags))) 1854 if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags)))
1829 goto out; 1855 goto out;
1830 1856
1857 /* Tracing handlers use ->utask to communicate with fetch methods */
1858 if (!get_utask())
1859 goto out;
1860
1831 handler_chain(uprobe, regs); 1861 handler_chain(uprobe, regs);
1832 if (can_skip_sstep(uprobe, regs)) 1862 if (can_skip_sstep(uprobe, regs))
1833 goto out; 1863 goto out;
diff --git a/kernel/exit.c b/kernel/exit.c
index a949819055d5..1e77fc645317 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -74,6 +74,7 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
74 __this_cpu_dec(process_counts); 74 __this_cpu_dec(process_counts);
75 } 75 }
76 list_del_rcu(&p->thread_group); 76 list_del_rcu(&p->thread_group);
77 list_del_rcu(&p->thread_node);
77} 78}
78 79
79/* 80/*
diff --git a/kernel/fork.c b/kernel/fork.c
index 5721f0e3f2da..a17621c6cd42 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -800,14 +800,11 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
800 * Allocate a new mm structure and copy contents from the 800 * Allocate a new mm structure and copy contents from the
801 * mm structure of the passed in task structure. 801 * mm structure of the passed in task structure.
802 */ 802 */
803struct mm_struct *dup_mm(struct task_struct *tsk) 803static struct mm_struct *dup_mm(struct task_struct *tsk)
804{ 804{
805 struct mm_struct *mm, *oldmm = current->mm; 805 struct mm_struct *mm, *oldmm = current->mm;
806 int err; 806 int err;
807 807
808 if (!oldmm)
809 return NULL;
810
811 mm = allocate_mm(); 808 mm = allocate_mm();
812 if (!mm) 809 if (!mm)
813 goto fail_nomem; 810 goto fail_nomem;
@@ -1035,6 +1032,11 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
1035 sig->nr_threads = 1; 1032 sig->nr_threads = 1;
1036 atomic_set(&sig->live, 1); 1033 atomic_set(&sig->live, 1);
1037 atomic_set(&sig->sigcnt, 1); 1034 atomic_set(&sig->sigcnt, 1);
1035
1036 /* list_add(thread_node, thread_head) without INIT_LIST_HEAD() */
1037 sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node);
1038 tsk->thread_node = (struct list_head)LIST_HEAD_INIT(sig->thread_head);
1039
1038 init_waitqueue_head(&sig->wait_chldexit); 1040 init_waitqueue_head(&sig->wait_chldexit);
1039 sig->curr_target = tsk; 1041 sig->curr_target = tsk;
1040 init_sigpending(&sig->shared_pending); 1042 init_sigpending(&sig->shared_pending);
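The two assignments added to copy_signal() cross-initialize sig->thread_head and the group leader's thread_node so that the pair already forms a one-element circular list, exactly what INIT_LIST_HEAD() followed by list_add() would produce. A minimal userspace sketch of why this works (struct and macro redefined locally for illustration, not taken from <linux/list.h>):

/*
 * Standalone illustration of the "list_add() without INIT_LIST_HEAD()"
 * trick used in copy_signal() above.  LIST_HEAD_INIT(x) expands to
 * { &(x), &(x) }, so cross-initializing the two nodes links them into
 * a one-element list.
 */
#include <assert.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };
#define LIST_HEAD_INIT(name) { &(name), &(name) }

struct signal_demo { struct list_head thread_head; };
struct task_demo   { struct list_head thread_node; };

int main(void)
{
	struct signal_demo sig;
	struct task_demo tsk;

	/* Same two assignments as in the hunk above. */
	sig.thread_head = (struct list_head)LIST_HEAD_INIT(tsk.thread_node);
	tsk.thread_node = (struct list_head)LIST_HEAD_INIT(sig.thread_head);

	/* Resulting links: head <-> node, i.e. a one-element list. */
	assert(sig.thread_head.next == &tsk.thread_node);
	assert(sig.thread_head.prev == &tsk.thread_node);
	assert(tsk.thread_node.next == &sig.thread_head);
	assert(tsk.thread_node.prev == &sig.thread_head);

	printf("equivalent to INIT_LIST_HEAD(&head); list_add(&node, &head)\n");
	return 0;
}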
@@ -1087,8 +1089,10 @@ static void rt_mutex_init_task(struct task_struct *p)
1087{ 1089{
1088 raw_spin_lock_init(&p->pi_lock); 1090 raw_spin_lock_init(&p->pi_lock);
1089#ifdef CONFIG_RT_MUTEXES 1091#ifdef CONFIG_RT_MUTEXES
1090 plist_head_init(&p->pi_waiters); 1092 p->pi_waiters = RB_ROOT;
1093 p->pi_waiters_leftmost = NULL;
1091 p->pi_blocked_on = NULL; 1094 p->pi_blocked_on = NULL;
1095 p->pi_top_task = NULL;
1092#endif 1096#endif
1093} 1097}
1094 1098
@@ -1172,7 +1176,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1172 * do not allow it to share a thread group or signal handlers or 1176 * do not allow it to share a thread group or signal handlers or
1173 * parent with the forking task. 1177 * parent with the forking task.
1174 */ 1178 */
1175 if (clone_flags & (CLONE_SIGHAND | CLONE_PARENT)) { 1179 if (clone_flags & CLONE_SIGHAND) {
1176 if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) || 1180 if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
1177 (task_active_pid_ns(current) != 1181 (task_active_pid_ns(current) !=
1178 current->nsproxy->pid_ns_for_children)) 1182 current->nsproxy->pid_ns_for_children))
@@ -1222,7 +1226,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1222 if (!try_module_get(task_thread_info(p)->exec_domain->module)) 1226 if (!try_module_get(task_thread_info(p)->exec_domain->module))
1223 goto bad_fork_cleanup_count; 1227 goto bad_fork_cleanup_count;
1224 1228
1225 p->did_exec = 0;
1226 delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ 1229 delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
1227 copy_flags(clone_flags, p); 1230 copy_flags(clone_flags, p);
1228 INIT_LIST_HEAD(&p->children); 1231 INIT_LIST_HEAD(&p->children);
@@ -1311,7 +1314,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1311#endif 1314#endif
1312 1315
1313 /* Perform scheduler related setup. Assign this task to a CPU. */ 1316 /* Perform scheduler related setup. Assign this task to a CPU. */
1314 sched_fork(clone_flags, p); 1317 retval = sched_fork(clone_flags, p);
1318 if (retval)
1319 goto bad_fork_cleanup_policy;
1315 1320
1316 retval = perf_event_init_task(p); 1321 retval = perf_event_init_task(p);
1317 if (retval) 1322 if (retval)
@@ -1403,13 +1408,11 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1403 p->tgid = p->pid; 1408 p->tgid = p->pid;
1404 } 1409 }
1405 1410
1406 p->pdeath_signal = 0;
1407 p->exit_state = 0;
1408
1409 p->nr_dirtied = 0; 1411 p->nr_dirtied = 0;
1410 p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10); 1412 p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
1411 p->dirty_paused_when = 0; 1413 p->dirty_paused_when = 0;
1412 1414
1415 p->pdeath_signal = 0;
1413 INIT_LIST_HEAD(&p->thread_group); 1416 INIT_LIST_HEAD(&p->thread_group);
1414 p->task_works = NULL; 1417 p->task_works = NULL;
1415 1418
@@ -1472,6 +1475,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1472 atomic_inc(&current->signal->sigcnt); 1475 atomic_inc(&current->signal->sigcnt);
1473 list_add_tail_rcu(&p->thread_group, 1476 list_add_tail_rcu(&p->thread_group,
1474 &p->group_leader->thread_group); 1477 &p->group_leader->thread_group);
1478 list_add_tail_rcu(&p->thread_node,
1479 &p->signal->thread_head);
1475 } 1480 }
1476 attach_pid(p, PIDTYPE_PID); 1481 attach_pid(p, PIDTYPE_PID);
1477 nr_threads++; 1482 nr_threads++;
@@ -1645,7 +1650,7 @@ SYSCALL_DEFINE0(fork)
1645 return do_fork(SIGCHLD, 0, 0, NULL, NULL); 1650 return do_fork(SIGCHLD, 0, 0, NULL, NULL);
1646#else 1651#else
1647 /* can not support in nommu mode */ 1652 /* can not support in nommu mode */
1648 return(-EINVAL); 1653 return -EINVAL;
1649#endif 1654#endif
1650} 1655}
1651#endif 1656#endif
@@ -1653,7 +1658,7 @@ SYSCALL_DEFINE0(fork)
1653#ifdef __ARCH_WANT_SYS_VFORK 1658#ifdef __ARCH_WANT_SYS_VFORK
1654SYSCALL_DEFINE0(vfork) 1659SYSCALL_DEFINE0(vfork)
1655{ 1660{
1656 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0, 1661 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
1657 0, NULL, NULL); 1662 0, NULL, NULL);
1658} 1663}
1659#endif 1664#endif
diff --git a/kernel/futex.c b/kernel/futex.c
index f6ff0191ecf7..08ec814ad9d2 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -63,14 +63,101 @@
63#include <linux/sched/rt.h> 63#include <linux/sched/rt.h>
64#include <linux/hugetlb.h> 64#include <linux/hugetlb.h>
65#include <linux/freezer.h> 65#include <linux/freezer.h>
66#include <linux/bootmem.h>
66 67
67#include <asm/futex.h> 68#include <asm/futex.h>
68 69
69#include "locking/rtmutex_common.h" 70#include "locking/rtmutex_common.h"
70 71
71int __read_mostly futex_cmpxchg_enabled; 72/*
73 * Basic futex operation and ordering guarantees:
74 *
75 * The waiter reads the futex value in user space and calls
76 * futex_wait(). This function computes the hash bucket and acquires
77 * the hash bucket lock. After that it reads the futex user space value
78 * again and verifies that the data has not changed. If it has not changed
79 * it enqueues itself into the hash bucket, releases the hash bucket lock
80 * and schedules.
81 *
82 * The waker side modifies the user space value of the futex and calls
83 * futex_wake(). This function computes the hash bucket and acquires the
84 * hash bucket lock. Then it looks for waiters on that futex in the hash
85 * bucket and wakes them.
86 *
87 * In futex wake up scenarios where no tasks are blocked on a futex, taking
 88 * the hb spinlock can be avoided and we can simply return. In order for this
89 * optimization to work, ordering guarantees must exist so that the waiter
90 * being added to the list is acknowledged when the list is concurrently being
91 * checked by the waker, avoiding scenarios like the following:
92 *
93 * CPU 0 CPU 1
94 * val = *futex;
95 * sys_futex(WAIT, futex, val);
96 * futex_wait(futex, val);
97 * uval = *futex;
98 * *futex = newval;
99 * sys_futex(WAKE, futex);
100 * futex_wake(futex);
101 * if (queue_empty())
102 * return;
103 * if (uval == val)
104 * lock(hash_bucket(futex));
105 * queue();
106 * unlock(hash_bucket(futex));
107 * schedule();
108 *
109 * This would cause the waiter on CPU 0 to wait forever because it
110 * missed the transition of the user space value from val to newval
111 * and the waker did not find the waiter in the hash bucket queue.
112 *
113 * The correct serialization ensures that a waiter either observes
114 * the changed user space value before blocking or is woken by a
115 * concurrent waker:
116 *
117 * CPU 0 CPU 1
118 * val = *futex;
119 * sys_futex(WAIT, futex, val);
120 * futex_wait(futex, val);
121 *
122 * waiters++;
123 * mb(); (A) <-- paired with -.
124 * |
125 * lock(hash_bucket(futex)); |
126 * |
127 * uval = *futex; |
128 * | *futex = newval;
129 * | sys_futex(WAKE, futex);
130 * | futex_wake(futex);
131 * |
132 * `-------> mb(); (B)
133 * if (uval == val)
134 * queue();
135 * unlock(hash_bucket(futex));
136 * schedule(); if (waiters)
137 * lock(hash_bucket(futex));
138 * wake_waiters(futex);
139 * unlock(hash_bucket(futex));
140 *
141 * Where (A) orders the waiters increment and the futex value read -- this
142 * is guaranteed by the head counter in the hb spinlock; and where (B)
143 * orders the write to futex and the waiters read -- this is done by the
144 * barriers in get_futex_key_refs(), through either ihold or atomic_inc,
145 * depending on the futex type.
146 *
147 * This yields the following case (where X:=waiters, Y:=futex):
148 *
149 * X = Y = 0
150 *
151 * w[X]=1 w[Y]=1
152 * MB MB
153 * r[Y]=y r[X]=x
154 *
155 * Which guarantees that x==0 && y==0 is impossible; which translates back into
156 * the guarantee that we cannot both miss the futex variable change and the
157 * enqueue.
158 */
72 159
73#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) 160int __read_mostly futex_cmpxchg_enabled;
74 161
75/* 162/*
76 * Futex flags used to encode options to functions and preserve them across 163 * Futex flags used to encode options to functions and preserve them across
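The ordering comment above maps directly onto the userspace half of the futex(2) protocol. As a hedged illustration only (a plain userspace demo, not kernel code; build with -pthread on Linux), the value check performed by FUTEX_WAIT is what closes the lost-wakeup race shown in the first diagram:

/*
 * Userspace sketch of the waiter/waker protocol described above.
 * The kernel re-reads the futex word under the hash bucket lock and
 * only sleeps if it still equals val, so a store by the waker cannot
 * be missed.
 */
#include <linux/futex.h>
#include <sys/syscall.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

static _Atomic int futex_word;

static long sys_futex(_Atomic int *uaddr, int op, int val)
{
	return syscall(SYS_futex, uaddr, op, val, NULL, NULL, 0);
}

static void *waiter(void *arg)
{
	int val;

	(void)arg;
	/* val = *futex; sys_futex(WAIT, futex, val);  -- as in the diagram */
	while ((val = atomic_load(&futex_word)) == 0)
		sys_futex(&futex_word, FUTEX_WAIT, val);

	printf("waiter: observed futex_word == %d\n", val);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, waiter, NULL);
	sleep(1);				/* let the waiter block      */

	atomic_store(&futex_word, 1);		/* *futex = newval;          */
	sys_futex(&futex_word, FUTEX_WAKE, 1);	/* sys_futex(WAKE, futex);   */

	pthread_join(t, NULL);
	return 0;
}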
@@ -147,11 +234,59 @@ static const struct futex_q futex_q_init = {
147 * waiting on a futex. 234 * waiting on a futex.
148 */ 235 */
149struct futex_hash_bucket { 236struct futex_hash_bucket {
237 atomic_t waiters;
150 spinlock_t lock; 238 spinlock_t lock;
151 struct plist_head chain; 239 struct plist_head chain;
152}; 240} ____cacheline_aligned_in_smp;
241
242static unsigned long __read_mostly futex_hashsize;
243
244static struct futex_hash_bucket *futex_queues;
245
246static inline void futex_get_mm(union futex_key *key)
247{
248 atomic_inc(&key->private.mm->mm_count);
249 /*
250 * Ensure futex_get_mm() implies a full barrier such that
251 * get_futex_key() implies a full barrier. This is relied upon
252 * as full barrier (B), see the ordering comment above.
253 */
254 smp_mb__after_atomic_inc();
255}
256
257/*
258 * Reflects a new waiter being added to the waitqueue.
259 */
260static inline void hb_waiters_inc(struct futex_hash_bucket *hb)
261{
262#ifdef CONFIG_SMP
263 atomic_inc(&hb->waiters);
264 /*
265 * Full barrier (A), see the ordering comment above.
266 */
267 smp_mb__after_atomic_inc();
268#endif
269}
153 270
154static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS]; 271/*
272 * Reflects a waiter being removed from the waitqueue by wakeup
273 * paths.
274 */
275static inline void hb_waiters_dec(struct futex_hash_bucket *hb)
276{
277#ifdef CONFIG_SMP
278 atomic_dec(&hb->waiters);
279#endif
280}
281
282static inline int hb_waiters_pending(struct futex_hash_bucket *hb)
283{
284#ifdef CONFIG_SMP
285 return atomic_read(&hb->waiters);
286#else
287 return 1;
288#endif
289}
155 290
156/* 291/*
157 * We hash on the keys returned from get_futex_key (see below). 292 * We hash on the keys returned from get_futex_key (see below).
@@ -161,7 +296,7 @@ static struct futex_hash_bucket *hash_futex(union futex_key *key)
161 u32 hash = jhash2((u32*)&key->both.word, 296 u32 hash = jhash2((u32*)&key->both.word,
162 (sizeof(key->both.word)+sizeof(key->both.ptr))/4, 297 (sizeof(key->both.word)+sizeof(key->both.ptr))/4,
163 key->both.offset); 298 key->both.offset);
164 return &futex_queues[hash & ((1 << FUTEX_HASHBITS)-1)]; 299 return &futex_queues[hash & (futex_hashsize - 1)];
165} 300}
166 301
167/* 302/*
@@ -187,10 +322,10 @@ static void get_futex_key_refs(union futex_key *key)
187 322
188 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { 323 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
189 case FUT_OFF_INODE: 324 case FUT_OFF_INODE:
190 ihold(key->shared.inode); 325 ihold(key->shared.inode); /* implies MB (B) */
191 break; 326 break;
192 case FUT_OFF_MMSHARED: 327 case FUT_OFF_MMSHARED:
193 atomic_inc(&key->private.mm->mm_count); 328 futex_get_mm(key); /* implies MB (B) */
194 break; 329 break;
195 } 330 }
196} 331}
@@ -264,7 +399,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
264 if (!fshared) { 399 if (!fshared) {
265 key->private.mm = mm; 400 key->private.mm = mm;
266 key->private.address = address; 401 key->private.address = address;
267 get_futex_key_refs(key); 402 get_futex_key_refs(key); /* implies MB (B) */
268 return 0; 403 return 0;
269 } 404 }
270 405
@@ -371,7 +506,7 @@ again:
371 key->shared.pgoff = basepage_index(page); 506 key->shared.pgoff = basepage_index(page);
372 } 507 }
373 508
374 get_futex_key_refs(key); 509 get_futex_key_refs(key); /* implies MB (B) */
375 510
376out: 511out:
377 unlock_page(page_head); 512 unlock_page(page_head);
@@ -598,13 +733,10 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
598{ 733{
599 struct futex_pi_state *pi_state = NULL; 734 struct futex_pi_state *pi_state = NULL;
600 struct futex_q *this, *next; 735 struct futex_q *this, *next;
601 struct plist_head *head;
602 struct task_struct *p; 736 struct task_struct *p;
603 pid_t pid = uval & FUTEX_TID_MASK; 737 pid_t pid = uval & FUTEX_TID_MASK;
604 738
605 head = &hb->chain; 739 plist_for_each_entry_safe(this, next, &hb->chain, list) {
606
607 plist_for_each_entry_safe(this, next, head, list) {
608 if (match_futex(&this->key, key)) { 740 if (match_futex(&this->key, key)) {
609 /* 741 /*
610 * Another waiter already exists - bump up 742 * Another waiter already exists - bump up
@@ -838,6 +970,7 @@ static void __unqueue_futex(struct futex_q *q)
838 970
839 hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock); 971 hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
840 plist_del(&q->list, &hb->chain); 972 plist_del(&q->list, &hb->chain);
973 hb_waiters_dec(hb);
841} 974}
842 975
843/* 976/*
@@ -986,7 +1119,6 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
986{ 1119{
987 struct futex_hash_bucket *hb; 1120 struct futex_hash_bucket *hb;
988 struct futex_q *this, *next; 1121 struct futex_q *this, *next;
989 struct plist_head *head;
990 union futex_key key = FUTEX_KEY_INIT; 1122 union futex_key key = FUTEX_KEY_INIT;
991 int ret; 1123 int ret;
992 1124
@@ -998,10 +1130,14 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
998 goto out; 1130 goto out;
999 1131
1000 hb = hash_futex(&key); 1132 hb = hash_futex(&key);
1133
1134 /* Make sure we really have tasks to wakeup */
1135 if (!hb_waiters_pending(hb))
1136 goto out_put_key;
1137
1001 spin_lock(&hb->lock); 1138 spin_lock(&hb->lock);
1002 head = &hb->chain;
1003 1139
1004 plist_for_each_entry_safe(this, next, head, list) { 1140 plist_for_each_entry_safe(this, next, &hb->chain, list) {
1005 if (match_futex (&this->key, &key)) { 1141 if (match_futex (&this->key, &key)) {
1006 if (this->pi_state || this->rt_waiter) { 1142 if (this->pi_state || this->rt_waiter) {
1007 ret = -EINVAL; 1143 ret = -EINVAL;
@@ -1019,6 +1155,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
1019 } 1155 }
1020 1156
1021 spin_unlock(&hb->lock); 1157 spin_unlock(&hb->lock);
1158out_put_key:
1022 put_futex_key(&key); 1159 put_futex_key(&key);
1023out: 1160out:
1024 return ret; 1161 return ret;
@@ -1034,7 +1171,6 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
1034{ 1171{
1035 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; 1172 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
1036 struct futex_hash_bucket *hb1, *hb2; 1173 struct futex_hash_bucket *hb1, *hb2;
1037 struct plist_head *head;
1038 struct futex_q *this, *next; 1174 struct futex_q *this, *next;
1039 int ret, op_ret; 1175 int ret, op_ret;
1040 1176
@@ -1082,9 +1218,7 @@ retry_private:
1082 goto retry; 1218 goto retry;
1083 } 1219 }
1084 1220
1085 head = &hb1->chain; 1221 plist_for_each_entry_safe(this, next, &hb1->chain, list) {
1086
1087 plist_for_each_entry_safe(this, next, head, list) {
1088 if (match_futex (&this->key, &key1)) { 1222 if (match_futex (&this->key, &key1)) {
1089 if (this->pi_state || this->rt_waiter) { 1223 if (this->pi_state || this->rt_waiter) {
1090 ret = -EINVAL; 1224 ret = -EINVAL;
@@ -1097,10 +1231,8 @@ retry_private:
1097 } 1231 }
1098 1232
1099 if (op_ret > 0) { 1233 if (op_ret > 0) {
1100 head = &hb2->chain;
1101
1102 op_ret = 0; 1234 op_ret = 0;
1103 plist_for_each_entry_safe(this, next, head, list) { 1235 plist_for_each_entry_safe(this, next, &hb2->chain, list) {
1104 if (match_futex (&this->key, &key2)) { 1236 if (match_futex (&this->key, &key2)) {
1105 if (this->pi_state || this->rt_waiter) { 1237 if (this->pi_state || this->rt_waiter) {
1106 ret = -EINVAL; 1238 ret = -EINVAL;
@@ -1142,7 +1274,9 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
1142 */ 1274 */
1143 if (likely(&hb1->chain != &hb2->chain)) { 1275 if (likely(&hb1->chain != &hb2->chain)) {
1144 plist_del(&q->list, &hb1->chain); 1276 plist_del(&q->list, &hb1->chain);
1277 hb_waiters_dec(hb1);
1145 plist_add(&q->list, &hb2->chain); 1278 plist_add(&q->list, &hb2->chain);
1279 hb_waiters_inc(hb2);
1146 q->lock_ptr = &hb2->lock; 1280 q->lock_ptr = &hb2->lock;
1147 } 1281 }
1148 get_futex_key_refs(key2); 1282 get_futex_key_refs(key2);
@@ -1270,7 +1404,6 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
1270 int drop_count = 0, task_count = 0, ret; 1404 int drop_count = 0, task_count = 0, ret;
1271 struct futex_pi_state *pi_state = NULL; 1405 struct futex_pi_state *pi_state = NULL;
1272 struct futex_hash_bucket *hb1, *hb2; 1406 struct futex_hash_bucket *hb1, *hb2;
1273 struct plist_head *head1;
1274 struct futex_q *this, *next; 1407 struct futex_q *this, *next;
1275 u32 curval2; 1408 u32 curval2;
1276 1409
@@ -1393,8 +1526,7 @@ retry_private:
1393 } 1526 }
1394 } 1527 }
1395 1528
1396 head1 = &hb1->chain; 1529 plist_for_each_entry_safe(this, next, &hb1->chain, list) {
1397 plist_for_each_entry_safe(this, next, head1, list) {
1398 if (task_count - nr_wake >= nr_requeue) 1530 if (task_count - nr_wake >= nr_requeue)
1399 break; 1531 break;
1400 1532
@@ -1487,17 +1619,29 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
1487 struct futex_hash_bucket *hb; 1619 struct futex_hash_bucket *hb;
1488 1620
1489 hb = hash_futex(&q->key); 1621 hb = hash_futex(&q->key);
1622
1623 /*
1624 * Increment the counter before taking the lock so that
 1625 * a potential waker won't miss a soon-to-sleep task that is
1626 * waiting for the spinlock. This is safe as all queue_lock()
1627 * users end up calling queue_me(). Similarly, for housekeeping,
1628 * decrement the counter at queue_unlock() when some error has
1629 * occurred and we don't end up adding the task to the list.
1630 */
1631 hb_waiters_inc(hb);
1632
1490 q->lock_ptr = &hb->lock; 1633 q->lock_ptr = &hb->lock;
1491 1634
1492 spin_lock(&hb->lock); 1635 spin_lock(&hb->lock); /* implies MB (A) */
1493 return hb; 1636 return hb;
1494} 1637}
1495 1638
1496static inline void 1639static inline void
1497queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) 1640queue_unlock(struct futex_hash_bucket *hb)
1498 __releases(&hb->lock) 1641 __releases(&hb->lock)
1499{ 1642{
1500 spin_unlock(&hb->lock); 1643 spin_unlock(&hb->lock);
1644 hb_waiters_dec(hb);
1501} 1645}
1502 1646
1503/** 1647/**
@@ -1867,7 +2011,7 @@ retry_private:
1867 ret = get_futex_value_locked(&uval, uaddr); 2011 ret = get_futex_value_locked(&uval, uaddr);
1868 2012
1869 if (ret) { 2013 if (ret) {
1870 queue_unlock(q, *hb); 2014 queue_unlock(*hb);
1871 2015
1872 ret = get_user(uval, uaddr); 2016 ret = get_user(uval, uaddr);
1873 if (ret) 2017 if (ret)
@@ -1881,7 +2025,7 @@ retry_private:
1881 } 2025 }
1882 2026
1883 if (uval != val) { 2027 if (uval != val) {
1884 queue_unlock(q, *hb); 2028 queue_unlock(*hb);
1885 ret = -EWOULDBLOCK; 2029 ret = -EWOULDBLOCK;
1886 } 2030 }
1887 2031
@@ -2029,7 +2173,7 @@ retry_private:
2029 * Task is exiting and we just wait for the 2173 * Task is exiting and we just wait for the
2030 * exit to complete. 2174 * exit to complete.
2031 */ 2175 */
2032 queue_unlock(&q, hb); 2176 queue_unlock(hb);
2033 put_futex_key(&q.key); 2177 put_futex_key(&q.key);
2034 cond_resched(); 2178 cond_resched();
2035 goto retry; 2179 goto retry;
@@ -2081,7 +2225,7 @@ retry_private:
2081 goto out_put_key; 2225 goto out_put_key;
2082 2226
2083out_unlock_put_key: 2227out_unlock_put_key:
2084 queue_unlock(&q, hb); 2228 queue_unlock(hb);
2085 2229
2086out_put_key: 2230out_put_key:
2087 put_futex_key(&q.key); 2231 put_futex_key(&q.key);
@@ -2091,7 +2235,7 @@ out:
2091 return ret != -EINTR ? ret : -ERESTARTNOINTR; 2235 return ret != -EINTR ? ret : -ERESTARTNOINTR;
2092 2236
2093uaddr_faulted: 2237uaddr_faulted:
2094 queue_unlock(&q, hb); 2238 queue_unlock(hb);
2095 2239
2096 ret = fault_in_user_writeable(uaddr); 2240 ret = fault_in_user_writeable(uaddr);
2097 if (ret) 2241 if (ret)
@@ -2113,7 +2257,6 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
2113{ 2257{
2114 struct futex_hash_bucket *hb; 2258 struct futex_hash_bucket *hb;
2115 struct futex_q *this, *next; 2259 struct futex_q *this, *next;
2116 struct plist_head *head;
2117 union futex_key key = FUTEX_KEY_INIT; 2260 union futex_key key = FUTEX_KEY_INIT;
2118 u32 uval, vpid = task_pid_vnr(current); 2261 u32 uval, vpid = task_pid_vnr(current);
2119 int ret; 2262 int ret;
@@ -2153,9 +2296,7 @@ retry:
2153 * Ok, other tasks may need to be woken up - check waiters 2296 * Ok, other tasks may need to be woken up - check waiters
2154 * and do the wakeup if necessary: 2297 * and do the wakeup if necessary:
2155 */ 2298 */
2156 head = &hb->chain; 2299 plist_for_each_entry_safe(this, next, &hb->chain, list) {
2157
2158 plist_for_each_entry_safe(this, next, head, list) {
2159 if (!match_futex (&this->key, &key)) 2300 if (!match_futex (&this->key, &key))
2160 continue; 2301 continue;
2161 ret = wake_futex_pi(uaddr, uval, this); 2302 ret = wake_futex_pi(uaddr, uval, this);
@@ -2232,6 +2373,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2232 * Unqueue the futex_q and determine which it was. 2373 * Unqueue the futex_q and determine which it was.
2233 */ 2374 */
2234 plist_del(&q->list, &hb->chain); 2375 plist_del(&q->list, &hb->chain);
2376 hb_waiters_dec(hb);
2235 2377
2236 /* Handle spurious wakeups gracefully */ 2378 /* Handle spurious wakeups gracefully */
2237 ret = -EWOULDBLOCK; 2379 ret = -EWOULDBLOCK;
@@ -2316,6 +2458,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2316 * code while we sleep on uaddr. 2458 * code while we sleep on uaddr.
2317 */ 2459 */
2318 debug_rt_mutex_init_waiter(&rt_waiter); 2460 debug_rt_mutex_init_waiter(&rt_waiter);
2461 RB_CLEAR_NODE(&rt_waiter.pi_tree_entry);
2462 RB_CLEAR_NODE(&rt_waiter.tree_entry);
2319 rt_waiter.task = NULL; 2463 rt_waiter.task = NULL;
2320 2464
2321 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE); 2465 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
@@ -2734,8 +2878,21 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
2734static int __init futex_init(void) 2878static int __init futex_init(void)
2735{ 2879{
2736 u32 curval; 2880 u32 curval;
2737 int i; 2881 unsigned int futex_shift;
2882 unsigned long i;
2883
2884#if CONFIG_BASE_SMALL
2885 futex_hashsize = 16;
2886#else
2887 futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
2888#endif
2738 2889
2890 futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
2891 futex_hashsize, 0,
2892 futex_hashsize < 256 ? HASH_SMALL : 0,
2893 &futex_shift, NULL,
2894 futex_hashsize, futex_hashsize);
2895 futex_hashsize = 1UL << futex_shift;
2739 /* 2896 /*
2740 * This will fail and we want it. Some arch implementations do 2897 * This will fail and we want it. Some arch implementations do
2741 * runtime detection of the futex_atomic_cmpxchg_inatomic() 2898 * runtime detection of the futex_atomic_cmpxchg_inatomic()
@@ -2749,7 +2906,8 @@ static int __init futex_init(void)
2749 if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT) 2906 if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
2750 futex_cmpxchg_enabled = 1; 2907 futex_cmpxchg_enabled = 1;
2751 2908
2752 for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { 2909 for (i = 0; i < futex_hashsize; i++) {
2910 atomic_set(&futex_queues[i].waiters, 0);
2753 plist_head_init(&futex_queues[i].chain); 2911 plist_head_init(&futex_queues[i].chain);
2754 spin_lock_init(&futex_queues[i].lock); 2912 spin_lock_init(&futex_queues[i].lock);
2755 } 2913 }
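As a rough sketch of the new sizing rule in futex_init(): 256 buckets per possible CPU, rounded up to a power of two so hash_futex() can mask with (futex_hashsize - 1). The real table size is whatever alloc_large_system_hash() reports back through futex_shift; the userspace program below only mirrors the requested size:

/*
 * Userspace mirror of the futex hash sizing above (not kernel code).
 */
#include <stdio.h>

static unsigned long roundup_pow_of_two(unsigned long n)
{
	unsigned long p = 1;

	while (p < n)
		p <<= 1;
	return p;
}

int main(void)
{
	unsigned int cpus[] = { 1, 4, 16, 64 };

	for (unsigned int i = 0; i < sizeof(cpus) / sizeof(cpus[0]); i++) {
		unsigned long size = roundup_pow_of_two(256UL * cpus[i]);

		printf("%3u possible CPUs -> %6lu buckets, index mask 0x%lx\n",
		       cpus[i], size, size - 1);
	}
	return 0;
}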
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 383319bae3f7..09094361dce5 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -46,6 +46,7 @@
46#include <linux/sched.h> 46#include <linux/sched.h>
47#include <linux/sched/sysctl.h> 47#include <linux/sched/sysctl.h>
48#include <linux/sched/rt.h> 48#include <linux/sched/rt.h>
49#include <linux/sched/deadline.h>
49#include <linux/timer.h> 50#include <linux/timer.h>
50#include <linux/freezer.h> 51#include <linux/freezer.h>
51 52
@@ -1610,7 +1611,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
1610 unsigned long slack; 1611 unsigned long slack;
1611 1612
1612 slack = current->timer_slack_ns; 1613 slack = current->timer_slack_ns;
1613 if (rt_task(current)) 1614 if (dl_task(current) || rt_task(current))
1614 slack = 0; 1615 slack = 0;
1615 1616
1616 hrtimer_init_on_stack(&t.timer, clockid, mode); 1617 hrtimer_init_on_stack(&t.timer, clockid, mode);
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 9328b80eaf14..0b9c169d577f 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -37,7 +37,7 @@ int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
37 */ 37 */
38unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_TASK_TIMEOUT; 38unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_TASK_TIMEOUT;
39 39
40unsigned long __read_mostly sysctl_hung_task_warnings = 10; 40int __read_mostly sysctl_hung_task_warnings = 10;
41 41
42static int __read_mostly did_panic; 42static int __read_mostly did_panic;
43 43
@@ -98,7 +98,9 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
98 98
99 if (!sysctl_hung_task_warnings) 99 if (!sysctl_hung_task_warnings)
100 return; 100 return;
101 sysctl_hung_task_warnings--; 101
102 if (sysctl_hung_task_warnings > 0)
103 sysctl_hung_task_warnings--;
102 104
103 /* 105 /*
104 * Ok, the task did not get scheduled for more than 2 minutes, 106 * Ok, the task did not get scheduled for more than 2 minutes,
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 4a1fef09f658..07cbdfea9ae2 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -40,6 +40,7 @@ config IRQ_EDGE_EOI_HANDLER
40# Generic configurable interrupt chip implementation 40# Generic configurable interrupt chip implementation
41config GENERIC_IRQ_CHIP 41config GENERIC_IRQ_CHIP
42 bool 42 bool
43 select IRQ_DOMAIN
43 44
44# Generic irq_domain hw <--> linux irq number translation 45# Generic irq_domain hw <--> linux irq number translation
45config IRQ_DOMAIN 46config IRQ_DOMAIN
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index bd8e788d71e0..1ef0606797c9 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -73,6 +73,51 @@ int devm_request_threaded_irq(struct device *dev, unsigned int irq,
73EXPORT_SYMBOL(devm_request_threaded_irq); 73EXPORT_SYMBOL(devm_request_threaded_irq);
74 74
75/** 75/**
76 * devm_request_any_context_irq - allocate an interrupt line for a managed device
77 * @dev: device to request interrupt for
78 * @irq: Interrupt line to allocate
79 * @handler: Function to be called when the IRQ occurs
80 * @thread_fn: function to be called in a threaded interrupt context. NULL
81 * for devices which handle everything in @handler
82 * @irqflags: Interrupt type flags
83 * @devname: An ascii name for the claiming device
84 * @dev_id: A cookie passed back to the handler function
85 *
86 * Except for the extra @dev argument, this function takes the
87 * same arguments and performs the same function as
88 * request_any_context_irq(). IRQs requested with this function will be
89 * automatically freed on driver detach.
90 *
91 * If an IRQ allocated with this function needs to be freed
92 * separately, devm_free_irq() must be used.
93 */
94int devm_request_any_context_irq(struct device *dev, unsigned int irq,
95 irq_handler_t handler, unsigned long irqflags,
96 const char *devname, void *dev_id)
97{
98 struct irq_devres *dr;
99 int rc;
100
101 dr = devres_alloc(devm_irq_release, sizeof(struct irq_devres),
102 GFP_KERNEL);
103 if (!dr)
104 return -ENOMEM;
105
106 rc = request_any_context_irq(irq, handler, irqflags, devname, dev_id);
107 if (rc) {
108 devres_free(dr);
109 return rc;
110 }
111
112 dr->irq = irq;
113 dr->dev_id = dev_id;
114 devres_add(dev, dr);
115
116 return 0;
117}
118EXPORT_SYMBOL(devm_request_any_context_irq);
119
120/**
76 * devm_free_irq - free an interrupt 121 * devm_free_irq - free an interrupt
77 * @dev: device to free interrupt for 122 * @dev: device to free interrupt for
78 * @irq: Interrupt line to free 123 * @irq: Interrupt line to free
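A hedged usage sketch for the new helper follows; the driver, device and register names are invented for illustration and the snippet only builds against kernel headers. The point is that the IRQ requested here is released automatically on driver detach, so no free_irq() is needed in the remove path:

/*
 * Hypothetical driver probe (names invented): request an IRQ that may
 * need either a hardirq or a nested/threaded handler, managed by devres.
 */
#include <linux/device.h>
#include <linux/interrupt.h>
#include <linux/platform_device.h>
#include <linux/slab.h>

struct foo_priv {
	void __iomem *regs;
};

static irqreturn_t foo_irq_handler(int irq, void *dev_id)
{
	struct foo_priv *priv = dev_id;

	/* acknowledge and handle the interrupt via priv->regs ... */
	(void)priv;
	return IRQ_HANDLED;
}

static int foo_probe(struct platform_device *pdev)
{
	struct foo_priv *priv;
	int irq, ret;

	priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL);
	if (!priv)
		return -ENOMEM;

	irq = platform_get_irq(pdev, 0);
	if (irq < 0)
		return irq;

	/* Works whether the parent irqchip forces a nested handler or not. */
	ret = devm_request_any_context_irq(&pdev->dev, irq, foo_irq_handler,
					   0, dev_name(&pdev->dev), priv);
	if (ret < 0)
		return ret;

	return 0;
}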
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 192a302d6cfd..8ab8e9390297 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -274,6 +274,7 @@ struct irq_desc *irq_to_desc(unsigned int irq)
274{ 274{
275 return (irq < NR_IRQS) ? irq_desc + irq : NULL; 275 return (irq < NR_IRQS) ? irq_desc + irq : NULL;
276} 276}
277EXPORT_SYMBOL(irq_to_desc);
277 278
278static void free_desc(unsigned int irq) 279static void free_desc(unsigned int irq)
279{ 280{
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index cf68bb36fe58..f14033700c25 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -10,6 +10,7 @@
10#include <linux/mutex.h> 10#include <linux/mutex.h>
11#include <linux/of.h> 11#include <linux/of.h>
12#include <linux/of_address.h> 12#include <linux/of_address.h>
13#include <linux/of_irq.h>
13#include <linux/topology.h> 14#include <linux/topology.h>
14#include <linux/seq_file.h> 15#include <linux/seq_file.h>
15#include <linux/slab.h> 16#include <linux/slab.h>
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 481a13c43b17..d3bf660cb57f 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -802,8 +802,7 @@ static irqreturn_t irq_thread_fn(struct irq_desc *desc,
802 802
803static void wake_threads_waitq(struct irq_desc *desc) 803static void wake_threads_waitq(struct irq_desc *desc)
804{ 804{
805 if (atomic_dec_and_test(&desc->threads_active) && 805 if (atomic_dec_and_test(&desc->threads_active))
806 waitqueue_active(&desc->wait_for_threads))
807 wake_up(&desc->wait_for_threads); 806 wake_up(&desc->wait_for_threads);
808} 807}
809 808
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 9c970167e402..60bafbed06ab 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -932,6 +932,7 @@ static int kimage_load_segment(struct kimage *image,
932 */ 932 */
933struct kimage *kexec_image; 933struct kimage *kexec_image;
934struct kimage *kexec_crash_image; 934struct kimage *kexec_crash_image;
935int kexec_load_disabled;
935 936
936static DEFINE_MUTEX(kexec_mutex); 937static DEFINE_MUTEX(kexec_mutex);
937 938
@@ -942,7 +943,7 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
942 int result; 943 int result;
943 944
944 /* We only trust the superuser with rebooting the system. */ 945 /* We only trust the superuser with rebooting the system. */
945 if (!capable(CAP_SYS_BOOT)) 946 if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
946 return -EPERM; 947 return -EPERM;
947 948
948 /* 949 /*
@@ -1536,7 +1537,7 @@ void vmcoreinfo_append_str(const char *fmt, ...)
1536 size_t r; 1537 size_t r;
1537 1538
1538 va_start(args, fmt); 1539 va_start(args, fmt);
1539 r = vsnprintf(buf, sizeof(buf), fmt, args); 1540 r = vscnprintf(buf, sizeof(buf), fmt, args);
1540 va_end(args); 1541 va_end(args);
1541 1542
1542 r = min(r, vmcoreinfo_max_size - vmcoreinfo_size); 1543 r = min(r, vmcoreinfo_max_size - vmcoreinfo_size);
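The vsnprintf() to vscnprintf() switch matters because the subsequent min() and memcpy treat r as a byte count. A small userspace demo of the difference (the C library has no scnprintf(); the kernel's vscnprintf() returns the bytes actually stored rather than the would-be length):

/*
 * snprintf()/vsnprintf() return the length the output *would* have had,
 * which can exceed the buffer; vscnprintf() in the kernel returns the
 * characters actually written (excluding the NUL).
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char buf[8];
	int would_be = snprintf(buf, sizeof(buf), "%s", "0123456789abcdef");

	printf("buffer holds %zu bytes: \"%s\"\n", strlen(buf), buf);
	printf("snprintf() returned %d (would-be length, > buffer size)\n",
	       would_be);
	/* vscnprintf() in the kernel would have returned 7 here. */
	return 0;
}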
diff --git a/kernel/kmod.c b/kernel/kmod.c
index b086006c59e7..6b375af4958d 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -239,7 +239,7 @@ static int ____call_usermodehelper(void *data)
239 239
240 commit_creds(new); 240 commit_creds(new);
241 241
242 retval = do_execve(sub_info->path, 242 retval = do_execve(getname_kernel(sub_info->path),
243 (const char __user *const __user *)sub_info->argv, 243 (const char __user *const __user *)sub_info->argv,
244 (const char __user *const __user *)sub_info->envp); 244 (const char __user *const __user *)sub_info->envp);
245 if (!retval) 245 if (!retval)
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 9659d38e008f..d945a949760f 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -126,7 +126,7 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj,
126{ 126{
127 return sprintf(buf, "%lx %x\n", 127 return sprintf(buf, "%lx %x\n",
128 paddr_vmcoreinfo_note(), 128 paddr_vmcoreinfo_note(),
129 (unsigned int)vmcoreinfo_max_size); 129 (unsigned int)sizeof(vmcoreinfo_note));
130} 130}
131KERNEL_ATTR_RO(vmcoreinfo); 131KERNEL_ATTR_RO(vmcoreinfo);
132 132
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 576ba756a32d..eb8a54783fa0 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -590,6 +590,7 @@ static int very_verbose(struct lock_class *class)
590/* 590/*
591 * Is this the address of a static object: 591 * Is this the address of a static object:
592 */ 592 */
593#ifdef __KERNEL__
593static int static_obj(void *obj) 594static int static_obj(void *obj)
594{ 595{
595 unsigned long start = (unsigned long) &_stext, 596 unsigned long start = (unsigned long) &_stext,
@@ -616,6 +617,7 @@ static int static_obj(void *obj)
616 */ 617 */
617 return is_module_address(addr) || is_module_percpu_address(addr); 618 return is_module_address(addr) || is_module_percpu_address(addr);
618} 619}
620#endif
619 621
620/* 622/*
621 * To make lock name printouts unique, we calculate a unique 623 * To make lock name printouts unique, we calculate a unique
@@ -4115,6 +4117,7 @@ void debug_check_no_locks_held(void)
4115} 4117}
4116EXPORT_SYMBOL_GPL(debug_check_no_locks_held); 4118EXPORT_SYMBOL_GPL(debug_check_no_locks_held);
4117 4119
4120#ifdef __KERNEL__
4118void debug_show_all_locks(void) 4121void debug_show_all_locks(void)
4119{ 4122{
4120 struct task_struct *g, *p; 4123 struct task_struct *g, *p;
@@ -4172,6 +4175,7 @@ retry:
4172 read_unlock(&tasklist_lock); 4175 read_unlock(&tasklist_lock);
4173} 4176}
4174EXPORT_SYMBOL_GPL(debug_show_all_locks); 4177EXPORT_SYMBOL_GPL(debug_show_all_locks);
4178#endif
4175 4179
4176/* 4180/*
4177 * Careful: only use this function if you are sure that 4181 * Careful: only use this function if you are sure that
diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c
index 7e3443fe1f48..faf6f5b53e77 100644
--- a/kernel/locking/mutex-debug.c
+++ b/kernel/locking/mutex-debug.c
@@ -75,7 +75,12 @@ void debug_mutex_unlock(struct mutex *lock)
75 return; 75 return;
76 76
77 DEBUG_LOCKS_WARN_ON(lock->magic != lock); 77 DEBUG_LOCKS_WARN_ON(lock->magic != lock);
78 DEBUG_LOCKS_WARN_ON(lock->owner != current); 78
79 if (!lock->owner)
80 DEBUG_LOCKS_WARN_ON(!lock->owner);
81 else
82 DEBUG_LOCKS_WARN_ON(lock->owner != current);
83
79 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); 84 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
80 mutex_clear_owner(lock); 85 mutex_clear_owner(lock);
81} 86}
diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c
index 13b243a323fa..49b2ed3dced8 100644
--- a/kernel/locking/rtmutex-debug.c
+++ b/kernel/locking/rtmutex-debug.c
@@ -24,7 +24,7 @@
24#include <linux/kallsyms.h> 24#include <linux/kallsyms.h>
25#include <linux/syscalls.h> 25#include <linux/syscalls.h>
26#include <linux/interrupt.h> 26#include <linux/interrupt.h>
27#include <linux/plist.h> 27#include <linux/rbtree.h>
28#include <linux/fs.h> 28#include <linux/fs.h>
29#include <linux/debug_locks.h> 29#include <linux/debug_locks.h>
30 30
@@ -57,7 +57,7 @@ static void printk_lock(struct rt_mutex *lock, int print_owner)
57 57
58void rt_mutex_debug_task_free(struct task_struct *task) 58void rt_mutex_debug_task_free(struct task_struct *task)
59{ 59{
60 DEBUG_LOCKS_WARN_ON(!plist_head_empty(&task->pi_waiters)); 60 DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters));
61 DEBUG_LOCKS_WARN_ON(task->pi_blocked_on); 61 DEBUG_LOCKS_WARN_ON(task->pi_blocked_on);
62} 62}
63 63
@@ -154,16 +154,12 @@ void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock)
154void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) 154void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
155{ 155{
156 memset(waiter, 0x11, sizeof(*waiter)); 156 memset(waiter, 0x11, sizeof(*waiter));
157 plist_node_init(&waiter->list_entry, MAX_PRIO);
158 plist_node_init(&waiter->pi_list_entry, MAX_PRIO);
159 waiter->deadlock_task_pid = NULL; 157 waiter->deadlock_task_pid = NULL;
160} 158}
161 159
162void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) 160void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
163{ 161{
164 put_pid(waiter->deadlock_task_pid); 162 put_pid(waiter->deadlock_task_pid);
165 DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->list_entry));
166 DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
167 memset(waiter, 0x22, sizeof(*waiter)); 163 memset(waiter, 0x22, sizeof(*waiter));
168} 164}
169 165
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 0dd6aec1cb6a..2e960a2bab81 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -14,6 +14,7 @@
14#include <linux/export.h> 14#include <linux/export.h>
15#include <linux/sched.h> 15#include <linux/sched.h>
16#include <linux/sched/rt.h> 16#include <linux/sched/rt.h>
17#include <linux/sched/deadline.h>
17#include <linux/timer.h> 18#include <linux/timer.h>
18 19
19#include "rtmutex_common.h" 20#include "rtmutex_common.h"
@@ -91,10 +92,107 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
91} 92}
92#endif 93#endif
93 94
95static inline int
96rt_mutex_waiter_less(struct rt_mutex_waiter *left,
97 struct rt_mutex_waiter *right)
98{
99 if (left->prio < right->prio)
100 return 1;
101
102 /*
103 * If both waiters have dl_prio(), we check the deadlines of the
104 * associated tasks.
105 * If left waiter has a dl_prio(), and we didn't return 1 above,
106 * then right waiter has a dl_prio() too.
107 */
108 if (dl_prio(left->prio))
109 return (left->task->dl.deadline < right->task->dl.deadline);
110
111 return 0;
112}
113
114static void
115rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
116{
117 struct rb_node **link = &lock->waiters.rb_node;
118 struct rb_node *parent = NULL;
119 struct rt_mutex_waiter *entry;
120 int leftmost = 1;
121
122 while (*link) {
123 parent = *link;
124 entry = rb_entry(parent, struct rt_mutex_waiter, tree_entry);
125 if (rt_mutex_waiter_less(waiter, entry)) {
126 link = &parent->rb_left;
127 } else {
128 link = &parent->rb_right;
129 leftmost = 0;
130 }
131 }
132
133 if (leftmost)
134 lock->waiters_leftmost = &waiter->tree_entry;
135
136 rb_link_node(&waiter->tree_entry, parent, link);
137 rb_insert_color(&waiter->tree_entry, &lock->waiters);
138}
139
140static void
141rt_mutex_dequeue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
142{
143 if (RB_EMPTY_NODE(&waiter->tree_entry))
144 return;
145
146 if (lock->waiters_leftmost == &waiter->tree_entry)
147 lock->waiters_leftmost = rb_next(&waiter->tree_entry);
148
149 rb_erase(&waiter->tree_entry, &lock->waiters);
150 RB_CLEAR_NODE(&waiter->tree_entry);
151}
152
153static void
154rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
155{
156 struct rb_node **link = &task->pi_waiters.rb_node;
157 struct rb_node *parent = NULL;
158 struct rt_mutex_waiter *entry;
159 int leftmost = 1;
160
161 while (*link) {
162 parent = *link;
163 entry = rb_entry(parent, struct rt_mutex_waiter, pi_tree_entry);
164 if (rt_mutex_waiter_less(waiter, entry)) {
165 link = &parent->rb_left;
166 } else {
167 link = &parent->rb_right;
168 leftmost = 0;
169 }
170 }
171
172 if (leftmost)
173 task->pi_waiters_leftmost = &waiter->pi_tree_entry;
174
175 rb_link_node(&waiter->pi_tree_entry, parent, link);
176 rb_insert_color(&waiter->pi_tree_entry, &task->pi_waiters);
177}
178
179static void
180rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
181{
182 if (RB_EMPTY_NODE(&waiter->pi_tree_entry))
183 return;
184
185 if (task->pi_waiters_leftmost == &waiter->pi_tree_entry)
186 task->pi_waiters_leftmost = rb_next(&waiter->pi_tree_entry);
187
188 rb_erase(&waiter->pi_tree_entry, &task->pi_waiters);
189 RB_CLEAR_NODE(&waiter->pi_tree_entry);
190}
191
94/* 192/*
95 * Calculate task priority from the waiter list priority 193 * Calculate task priority from the waiter tree priority
96 * 194 *
97 * Return task->normal_prio when the waiter list is empty or when 195 * Return task->normal_prio when the waiter tree is empty or when
98 * the waiter is not allowed to do priority boosting 196 * the waiter is not allowed to do priority boosting
99 */ 197 */
100int rt_mutex_getprio(struct task_struct *task) 198int rt_mutex_getprio(struct task_struct *task)
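A standalone sketch of the ordering rule that rt_mutex_waiter_less() implements and that the rb-tree enqueue above relies on, with the leftmost node cached in waiters_leftmost so rt_mutex_top_waiter() stays O(1). The dl_prio() test below assumes the kernel convention that SCHED_DEADLINE tasks report a prio below 0; everything else follows the hunk:

/*
 * Userspace model of rt_mutex_waiter_less(): lower ->prio wins, and when
 * both waiters are deadline tasks the earlier absolute deadline wins.
 */
#include <stdbool.h>
#include <stdio.h>

struct waiter {
	int prio;			/* task->prio at enqueue time        */
	unsigned long long deadline;	/* task->dl.deadline, dl tasks only  */
};

static bool dl_prio(int prio)
{
	return prio < 0;		/* assumed: MAX_DL_PRIO == 0 */
}

static bool waiter_less(const struct waiter *left, const struct waiter *right)
{
	if (left->prio < right->prio)
		return true;

	/* If left is dl and did not win on prio, right is dl too,
	 * so fall back to comparing deadlines. */
	if (dl_prio(left->prio))
		return left->deadline < right->deadline;

	return false;
}

int main(void)
{
	struct waiter rt_hi = { .prio = 10 };
	struct waiter rt_lo = { .prio = 40 };
	struct waiter dl_a  = { .prio = -1, .deadline = 1000 };
	struct waiter dl_b  = { .prio = -1, .deadline = 2000 };

	printf("rt 10 before rt 40: %d\n", waiter_less(&rt_hi, &rt_lo));
	printf("dl before any rt:   %d\n", waiter_less(&dl_a, &rt_hi));
	printf("earlier deadline:   %d\n", waiter_less(&dl_a, &dl_b));
	return 0;
}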
@@ -102,10 +200,18 @@ int rt_mutex_getprio(struct task_struct *task)
102 if (likely(!task_has_pi_waiters(task))) 200 if (likely(!task_has_pi_waiters(task)))
103 return task->normal_prio; 201 return task->normal_prio;
104 202
105 return min(task_top_pi_waiter(task)->pi_list_entry.prio, 203 return min(task_top_pi_waiter(task)->prio,
106 task->normal_prio); 204 task->normal_prio);
107} 205}
108 206
207struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
208{
209 if (likely(!task_has_pi_waiters(task)))
210 return NULL;
211
212 return task_top_pi_waiter(task)->task;
213}
214
109/* 215/*
110 * Adjust the priority of a task, after its pi_waiters got modified. 216 * Adjust the priority of a task, after its pi_waiters got modified.
111 * 217 *
@@ -115,7 +221,7 @@ static void __rt_mutex_adjust_prio(struct task_struct *task)
115{ 221{
116 int prio = rt_mutex_getprio(task); 222 int prio = rt_mutex_getprio(task);
117 223
118 if (task->prio != prio) 224 if (task->prio != prio || dl_prio(prio))
119 rt_mutex_setprio(task, prio); 225 rt_mutex_setprio(task, prio);
120} 226}
121 227
@@ -233,7 +339,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
233 * When deadlock detection is off then we check, if further 339 * When deadlock detection is off then we check, if further
234 * priority adjustment is necessary. 340 * priority adjustment is necessary.
235 */ 341 */
236 if (!detect_deadlock && waiter->list_entry.prio == task->prio) 342 if (!detect_deadlock && waiter->prio == task->prio)
237 goto out_unlock_pi; 343 goto out_unlock_pi;
238 344
239 lock = waiter->lock; 345 lock = waiter->lock;
@@ -254,9 +360,9 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
254 top_waiter = rt_mutex_top_waiter(lock); 360 top_waiter = rt_mutex_top_waiter(lock);
255 361
256 /* Requeue the waiter */ 362 /* Requeue the waiter */
257 plist_del(&waiter->list_entry, &lock->wait_list); 363 rt_mutex_dequeue(lock, waiter);
258 waiter->list_entry.prio = task->prio; 364 waiter->prio = task->prio;
259 plist_add(&waiter->list_entry, &lock->wait_list); 365 rt_mutex_enqueue(lock, waiter);
260 366
261 /* Release the task */ 367 /* Release the task */
262 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 368 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
@@ -280,17 +386,15 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
280 386
281 if (waiter == rt_mutex_top_waiter(lock)) { 387 if (waiter == rt_mutex_top_waiter(lock)) {
282 /* Boost the owner */ 388 /* Boost the owner */
283 plist_del(&top_waiter->pi_list_entry, &task->pi_waiters); 389 rt_mutex_dequeue_pi(task, top_waiter);
284 waiter->pi_list_entry.prio = waiter->list_entry.prio; 390 rt_mutex_enqueue_pi(task, waiter);
285 plist_add(&waiter->pi_list_entry, &task->pi_waiters);
286 __rt_mutex_adjust_prio(task); 391 __rt_mutex_adjust_prio(task);
287 392
288 } else if (top_waiter == waiter) { 393 } else if (top_waiter == waiter) {
289 /* Deboost the owner */ 394 /* Deboost the owner */
290 plist_del(&waiter->pi_list_entry, &task->pi_waiters); 395 rt_mutex_dequeue_pi(task, waiter);
291 waiter = rt_mutex_top_waiter(lock); 396 waiter = rt_mutex_top_waiter(lock);
292 waiter->pi_list_entry.prio = waiter->list_entry.prio; 397 rt_mutex_enqueue_pi(task, waiter);
293 plist_add(&waiter->pi_list_entry, &task->pi_waiters);
294 __rt_mutex_adjust_prio(task); 398 __rt_mutex_adjust_prio(task);
295 } 399 }
296 400
@@ -355,7 +459,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
355 * 3) it is top waiter 459 * 3) it is top waiter
356 */ 460 */
357 if (rt_mutex_has_waiters(lock)) { 461 if (rt_mutex_has_waiters(lock)) {
358 if (task->prio >= rt_mutex_top_waiter(lock)->list_entry.prio) { 462 if (task->prio >= rt_mutex_top_waiter(lock)->prio) {
359 if (!waiter || waiter != rt_mutex_top_waiter(lock)) 463 if (!waiter || waiter != rt_mutex_top_waiter(lock))
360 return 0; 464 return 0;
361 } 465 }
@@ -369,7 +473,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
369 473
370 /* remove the queued waiter. */ 474 /* remove the queued waiter. */
371 if (waiter) { 475 if (waiter) {
372 plist_del(&waiter->list_entry, &lock->wait_list); 476 rt_mutex_dequeue(lock, waiter);
373 task->pi_blocked_on = NULL; 477 task->pi_blocked_on = NULL;
374 } 478 }
375 479
@@ -379,8 +483,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
379 */ 483 */
380 if (rt_mutex_has_waiters(lock)) { 484 if (rt_mutex_has_waiters(lock)) {
381 top = rt_mutex_top_waiter(lock); 485 top = rt_mutex_top_waiter(lock);
382 top->pi_list_entry.prio = top->list_entry.prio; 486 rt_mutex_enqueue_pi(task, top);
383 plist_add(&top->pi_list_entry, &task->pi_waiters);
384 } 487 }
385 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 488 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
386 } 489 }
@@ -416,13 +519,12 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
416 __rt_mutex_adjust_prio(task); 519 __rt_mutex_adjust_prio(task);
417 waiter->task = task; 520 waiter->task = task;
418 waiter->lock = lock; 521 waiter->lock = lock;
419 plist_node_init(&waiter->list_entry, task->prio); 522 waiter->prio = task->prio;
420 plist_node_init(&waiter->pi_list_entry, task->prio);
421 523
422 /* Get the top priority waiter on the lock */ 524 /* Get the top priority waiter on the lock */
423 if (rt_mutex_has_waiters(lock)) 525 if (rt_mutex_has_waiters(lock))
424 top_waiter = rt_mutex_top_waiter(lock); 526 top_waiter = rt_mutex_top_waiter(lock);
425 plist_add(&waiter->list_entry, &lock->wait_list); 527 rt_mutex_enqueue(lock, waiter);
426 528
427 task->pi_blocked_on = waiter; 529 task->pi_blocked_on = waiter;
428 530
@@ -433,8 +535,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
433 535
434 if (waiter == rt_mutex_top_waiter(lock)) { 536 if (waiter == rt_mutex_top_waiter(lock)) {
435 raw_spin_lock_irqsave(&owner->pi_lock, flags); 537 raw_spin_lock_irqsave(&owner->pi_lock, flags);
436 plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); 538 rt_mutex_dequeue_pi(owner, top_waiter);
437 plist_add(&waiter->pi_list_entry, &owner->pi_waiters); 539 rt_mutex_enqueue_pi(owner, waiter);
438 540
439 __rt_mutex_adjust_prio(owner); 541 __rt_mutex_adjust_prio(owner);
440 if (owner->pi_blocked_on) 542 if (owner->pi_blocked_on)
@@ -486,7 +588,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
486 * boosted mode and go back to normal after releasing 588 * boosted mode and go back to normal after releasing
487 * lock->wait_lock. 589 * lock->wait_lock.
488 */ 590 */
489 plist_del(&waiter->pi_list_entry, &current->pi_waiters); 591 rt_mutex_dequeue_pi(current, waiter);
490 592
491 rt_mutex_set_owner(lock, NULL); 593 rt_mutex_set_owner(lock, NULL);
492 594
@@ -510,7 +612,7 @@ static void remove_waiter(struct rt_mutex *lock,
510 int chain_walk = 0; 612 int chain_walk = 0;
511 613
512 raw_spin_lock_irqsave(&current->pi_lock, flags); 614 raw_spin_lock_irqsave(&current->pi_lock, flags);
513 plist_del(&waiter->list_entry, &lock->wait_list); 615 rt_mutex_dequeue(lock, waiter);
514 current->pi_blocked_on = NULL; 616 current->pi_blocked_on = NULL;
515 raw_spin_unlock_irqrestore(&current->pi_lock, flags); 617 raw_spin_unlock_irqrestore(&current->pi_lock, flags);
516 618
@@ -521,13 +623,13 @@ static void remove_waiter(struct rt_mutex *lock,
521 623
522 raw_spin_lock_irqsave(&owner->pi_lock, flags); 624 raw_spin_lock_irqsave(&owner->pi_lock, flags);
523 625
524 plist_del(&waiter->pi_list_entry, &owner->pi_waiters); 626 rt_mutex_dequeue_pi(owner, waiter);
525 627
526 if (rt_mutex_has_waiters(lock)) { 628 if (rt_mutex_has_waiters(lock)) {
527 struct rt_mutex_waiter *next; 629 struct rt_mutex_waiter *next;
528 630
529 next = rt_mutex_top_waiter(lock); 631 next = rt_mutex_top_waiter(lock);
530 plist_add(&next->pi_list_entry, &owner->pi_waiters); 632 rt_mutex_enqueue_pi(owner, next);
531 } 633 }
532 __rt_mutex_adjust_prio(owner); 634 __rt_mutex_adjust_prio(owner);
533 635
@@ -537,8 +639,6 @@ static void remove_waiter(struct rt_mutex *lock,
537 raw_spin_unlock_irqrestore(&owner->pi_lock, flags); 639 raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
538 } 640 }
539 641
540 WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
541
542 if (!chain_walk) 642 if (!chain_walk)
543 return; 643 return;
544 644
@@ -565,7 +665,8 @@ void rt_mutex_adjust_pi(struct task_struct *task)
565 raw_spin_lock_irqsave(&task->pi_lock, flags); 665 raw_spin_lock_irqsave(&task->pi_lock, flags);
566 666
567 waiter = task->pi_blocked_on; 667 waiter = task->pi_blocked_on;
568 if (!waiter || waiter->list_entry.prio == task->prio) { 668 if (!waiter || (waiter->prio == task->prio &&
669 !dl_prio(task->prio))) {
569 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 670 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
570 return; 671 return;
571 } 672 }
@@ -638,6 +739,8 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
638 int ret = 0; 739 int ret = 0;
639 740
640 debug_rt_mutex_init_waiter(&waiter); 741 debug_rt_mutex_init_waiter(&waiter);
742 RB_CLEAR_NODE(&waiter.pi_tree_entry);
743 RB_CLEAR_NODE(&waiter.tree_entry);
641 744
642 raw_spin_lock(&lock->wait_lock); 745 raw_spin_lock(&lock->wait_lock);
643 746
@@ -904,7 +1007,8 @@ void __rt_mutex_init(struct rt_mutex *lock, const char *name)
904{ 1007{
905 lock->owner = NULL; 1008 lock->owner = NULL;
906 raw_spin_lock_init(&lock->wait_lock); 1009 raw_spin_lock_init(&lock->wait_lock);
907 plist_head_init(&lock->wait_list); 1010 lock->waiters = RB_ROOT;
1011 lock->waiters_leftmost = NULL;
908 1012
909 debug_rt_mutex_init(lock, name); 1013 debug_rt_mutex_init(lock, name);
910} 1014}
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 53a66c85261b..7431a9c86f35 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -40,13 +40,13 @@ extern void schedule_rt_mutex_test(struct rt_mutex *lock);
40 * This is the control structure for tasks blocked on a rt_mutex, 40 * This is the control structure for tasks blocked on a rt_mutex,
 41 * which is allocated on the kernel stack of the blocked task. 41 * which is allocated on the kernel stack of the blocked task.
42 * 42 *
43 * @list_entry: pi node to enqueue into the mutex waiters list 43 * @tree_entry: pi node to enqueue into the mutex waiters tree
44 * @pi_list_entry: pi node to enqueue into the mutex owner waiters list 44 * @pi_tree_entry: pi node to enqueue into the mutex owner waiters tree
45 * @task: task reference to the blocked task 45 * @task: task reference to the blocked task
46 */ 46 */
47struct rt_mutex_waiter { 47struct rt_mutex_waiter {
48 struct plist_node list_entry; 48 struct rb_node tree_entry;
49 struct plist_node pi_list_entry; 49 struct rb_node pi_tree_entry;
50 struct task_struct *task; 50 struct task_struct *task;
51 struct rt_mutex *lock; 51 struct rt_mutex *lock;
52#ifdef CONFIG_DEBUG_RT_MUTEXES 52#ifdef CONFIG_DEBUG_RT_MUTEXES
@@ -54,14 +54,15 @@ struct rt_mutex_waiter {
54 struct pid *deadlock_task_pid; 54 struct pid *deadlock_task_pid;
55 struct rt_mutex *deadlock_lock; 55 struct rt_mutex *deadlock_lock;
56#endif 56#endif
57 int prio;
57}; 58};
58 59
59/* 60/*
60 * Various helpers to access the waiters-plist: 61 * Various helpers to access the waiters-tree:
61 */ 62 */
62static inline int rt_mutex_has_waiters(struct rt_mutex *lock) 63static inline int rt_mutex_has_waiters(struct rt_mutex *lock)
63{ 64{
64 return !plist_head_empty(&lock->wait_list); 65 return !RB_EMPTY_ROOT(&lock->waiters);
65} 66}
66 67
67static inline struct rt_mutex_waiter * 68static inline struct rt_mutex_waiter *
@@ -69,8 +70,8 @@ rt_mutex_top_waiter(struct rt_mutex *lock)
69{ 70{
70 struct rt_mutex_waiter *w; 71 struct rt_mutex_waiter *w;
71 72
72 w = plist_first_entry(&lock->wait_list, struct rt_mutex_waiter, 73 w = rb_entry(lock->waiters_leftmost, struct rt_mutex_waiter,
73 list_entry); 74 tree_entry);
74 BUG_ON(w->lock != lock); 75 BUG_ON(w->lock != lock);
75 76
76 return w; 77 return w;
@@ -78,14 +79,14 @@ rt_mutex_top_waiter(struct rt_mutex *lock)
78 79
79static inline int task_has_pi_waiters(struct task_struct *p) 80static inline int task_has_pi_waiters(struct task_struct *p)
80{ 81{
81 return !plist_head_empty(&p->pi_waiters); 82 return !RB_EMPTY_ROOT(&p->pi_waiters);
82} 83}
83 84
84static inline struct rt_mutex_waiter * 85static inline struct rt_mutex_waiter *
85task_top_pi_waiter(struct task_struct *p) 86task_top_pi_waiter(struct task_struct *p)
86{ 87{
87 return plist_first_entry(&p->pi_waiters, struct rt_mutex_waiter, 88 return rb_entry(p->pi_waiters_leftmost, struct rt_mutex_waiter,
88 pi_list_entry); 89 pi_tree_entry);
89} 90}
90 91
91/* 92/*
diff --git a/kernel/module.c b/kernel/module.c
index f5a3b1e8ec51..d24fcf29cb64 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -815,10 +815,8 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
815 return -EFAULT; 815 return -EFAULT;
816 name[MODULE_NAME_LEN-1] = '\0'; 816 name[MODULE_NAME_LEN-1] = '\0';
817 817
818 if (!(flags & O_NONBLOCK)) { 818 if (!(flags & O_NONBLOCK))
819 printk(KERN_WARNING 819 pr_warn("waiting module removal not supported: please upgrade\n");
820 "waiting module removal not supported: please upgrade");
821 }
822 820
823 if (mutex_lock_interruptible(&module_mutex) != 0) 821 if (mutex_lock_interruptible(&module_mutex) != 0)
824 return -EINTR; 822 return -EINTR;
diff --git a/kernel/padata.c b/kernel/padata.c
index 2abd25d79cc8..161402f0b517 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -112,7 +112,7 @@ int padata_do_parallel(struct padata_instance *pinst,
112 112
113 rcu_read_lock_bh(); 113 rcu_read_lock_bh();
114 114
115 pd = rcu_dereference(pinst->pd); 115 pd = rcu_dereference_bh(pinst->pd);
116 116
117 err = -EINVAL; 117 err = -EINVAL;
118 if (!(pinst->flags & PADATA_INIT) || pinst->flags & PADATA_INVALID) 118 if (!(pinst->flags & PADATA_INIT) || pinst->flags & PADATA_INVALID)
diff --git a/kernel/panic.c b/kernel/panic.c
index c00b4ceb39e8..6d6300375090 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -33,7 +33,7 @@ static int pause_on_oops;
33static int pause_on_oops_flag; 33static int pause_on_oops_flag;
34static DEFINE_SPINLOCK(pause_on_oops_lock); 34static DEFINE_SPINLOCK(pause_on_oops_lock);
35 35
36int panic_timeout; 36int panic_timeout = CONFIG_PANIC_TIMEOUT;
37EXPORT_SYMBOL_GPL(panic_timeout); 37EXPORT_SYMBOL_GPL(panic_timeout);
38 38
39ATOMIC_NOTIFIER_HEAD(panic_notifier_list); 39ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
diff --git a/kernel/params.c b/kernel/params.c
index c00d5b502aa4..b00142e7f3ba 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -227,17 +227,10 @@ int parse_args(const char *doing,
227} 227}
228 228
229/* Lazy bastard, eh? */ 229/* Lazy bastard, eh? */
230#define STANDARD_PARAM_DEF(name, type, format, tmptype, strtolfn) \ 230#define STANDARD_PARAM_DEF(name, type, format, strtolfn) \
231 int param_set_##name(const char *val, const struct kernel_param *kp) \ 231 int param_set_##name(const char *val, const struct kernel_param *kp) \
232 { \ 232 { \
233 tmptype l; \ 233 return strtolfn(val, 0, (type *)kp->arg); \
234 int ret; \
235 \
236 ret = strtolfn(val, 0, &l); \
237 if (ret < 0 || ((type)l != l)) \
238 return ret < 0 ? ret : -EINVAL; \
239 *((type *)kp->arg) = l; \
240 return 0; \
241 } \ 234 } \
242 int param_get_##name(char *buffer, const struct kernel_param *kp) \ 235 int param_get_##name(char *buffer, const struct kernel_param *kp) \
243 { \ 236 { \
@@ -253,13 +246,13 @@ int parse_args(const char *doing,
253 EXPORT_SYMBOL(param_ops_##name) 246 EXPORT_SYMBOL(param_ops_##name)
254 247
255 248
256STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", unsigned long, kstrtoul); 249STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", kstrtou8);
257STANDARD_PARAM_DEF(short, short, "%hi", long, kstrtol); 250STANDARD_PARAM_DEF(short, short, "%hi", kstrtos16);
258STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, kstrtoul); 251STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", kstrtou16);
259STANDARD_PARAM_DEF(int, int, "%i", long, kstrtol); 252STANDARD_PARAM_DEF(int, int, "%i", kstrtoint);
260STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, kstrtoul); 253STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint);
261STANDARD_PARAM_DEF(long, long, "%li", long, kstrtol); 254STANDARD_PARAM_DEF(long, long, "%li", kstrtol);
262STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, kstrtoul); 255STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul);
263 256
264int param_set_charp(const char *val, const struct kernel_param *kp) 257int param_set_charp(const char *val, const struct kernel_param *kp)
265{ 258{
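With the tmptype plumbing and hand-rolled range check gone, STANDARD_PARAM_DEF() relies entirely on the width-specific kstrto*() helpers, which already reject malformed and out-of-range input. For reference, the int variant above now expands to roughly the following (expansion reconstructed by hand, so treat it as a sketch):

int param_set_int(const char *val, const struct kernel_param *kp)
{
	/* kstrtoint() returns -EINVAL or -ERANGE on bad or overflowing input. */
	return kstrtoint(val, 0, (int *)kp->arg);
}

int param_get_int(char *buffer, const struct kernel_param *kp)
{
	return sprintf(buffer, "%i", *((int *)kp->arg));
}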
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index c7f31aa272f7..3b8946416a5f 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -233,7 +233,8 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
233 233
234/* 234/*
235 * Sample a process (thread group) clock for the given group_leader task. 235 * Sample a process (thread group) clock for the given group_leader task.
236 * Must be called with tasklist_lock held for reading. 236 * Must be called with task sighand lock held for safe while_each_thread()
237 * traversal.
237 */ 238 */
238static int cpu_clock_sample_group(const clockid_t which_clock, 239static int cpu_clock_sample_group(const clockid_t which_clock,
239 struct task_struct *p, 240 struct task_struct *p,
@@ -260,30 +261,53 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
260 return 0; 261 return 0;
261} 262}
262 263
264static int posix_cpu_clock_get_task(struct task_struct *tsk,
265 const clockid_t which_clock,
266 struct timespec *tp)
267{
268 int err = -EINVAL;
269 unsigned long long rtn;
270
271 if (CPUCLOCK_PERTHREAD(which_clock)) {
272 if (same_thread_group(tsk, current))
273 err = cpu_clock_sample(which_clock, tsk, &rtn);
274 } else {
275 unsigned long flags;
276 struct sighand_struct *sighand;
277
278 /*
279 * while_each_thread() is not yet entirely RCU safe,
280 * keep locking the group while sampling process
281 * clock for now.
282 */
283 sighand = lock_task_sighand(tsk, &flags);
284 if (!sighand)
285 return err;
286
287 if (tsk == current || thread_group_leader(tsk))
288 err = cpu_clock_sample_group(which_clock, tsk, &rtn);
289
290 unlock_task_sighand(tsk, &flags);
291 }
292
293 if (!err)
294 sample_to_timespec(which_clock, rtn, tp);
295
296 return err;
297}
298
263 299
264static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) 300static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
265{ 301{
266 const pid_t pid = CPUCLOCK_PID(which_clock); 302 const pid_t pid = CPUCLOCK_PID(which_clock);
267 int error = -EINVAL; 303 int err = -EINVAL;
268 unsigned long long rtn;
269 304
270 if (pid == 0) { 305 if (pid == 0) {
271 /* 306 /*
272 * Special case constant value for our own clocks. 307 * Special case constant value for our own clocks.
273 * We don't have to do any lookup to find ourselves. 308 * We don't have to do any lookup to find ourselves.
274 */ 309 */
275 if (CPUCLOCK_PERTHREAD(which_clock)) { 310 err = posix_cpu_clock_get_task(current, which_clock, tp);
276 /*
277 * Sampling just ourselves we can do with no locking.
278 */
279 error = cpu_clock_sample(which_clock,
280 current, &rtn);
281 } else {
282 read_lock(&tasklist_lock);
283 error = cpu_clock_sample_group(which_clock,
284 current, &rtn);
285 read_unlock(&tasklist_lock);
286 }
287 } else { 311 } else {
288 /* 312 /*
289 * Find the given PID, and validate that the caller 313 * Find the given PID, and validate that the caller
@@ -292,29 +316,12 @@ static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
292 struct task_struct *p; 316 struct task_struct *p;
293 rcu_read_lock(); 317 rcu_read_lock();
294 p = find_task_by_vpid(pid); 318 p = find_task_by_vpid(pid);
295 if (p) { 319 if (p)
296 if (CPUCLOCK_PERTHREAD(which_clock)) { 320 err = posix_cpu_clock_get_task(p, which_clock, tp);
297 if (same_thread_group(p, current)) {
298 error = cpu_clock_sample(which_clock,
299 p, &rtn);
300 }
301 } else {
302 read_lock(&tasklist_lock);
303 if (thread_group_leader(p) && p->sighand) {
304 error =
305 cpu_clock_sample_group(which_clock,
306 p, &rtn);
307 }
308 read_unlock(&tasklist_lock);
309 }
310 }
311 rcu_read_unlock(); 321 rcu_read_unlock();
312 } 322 }
313 323
314 if (error) 324 return err;
315 return error;
316 sample_to_timespec(which_clock, rtn, tp);
317 return 0;
318} 325}
319 326
320 327
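The new posix_cpu_clock_get_task() helper above trades tasklist_lock for the per-process sighand lock, which both pins the sighand against release in exit/exec and keeps the while_each_thread() walk in the group sample safe. The same locking shape repeats through the rest of this file's hunks; stripped to its essentials it looks like this (sketch, hypothetical function name):

static int with_sighand_locked(struct task_struct *p)
{
	struct sighand_struct *sighand;
	unsigned long flags;
	int err = -ESRCH;

	sighand = lock_task_sighand(p, &flags);
	if (!sighand)
		return err;	/* task already reaped, nothing left to lock */

	/* ... safe to touch p->cpu_timers / p->signal->cpu_timers here ... */
	err = 0;

	unlock_task_sighand(p, &flags);
	return err;
}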
@@ -371,36 +378,40 @@ static int posix_cpu_timer_create(struct k_itimer *new_timer)
371 */ 378 */
372static int posix_cpu_timer_del(struct k_itimer *timer) 379static int posix_cpu_timer_del(struct k_itimer *timer)
373{ 380{
374 struct task_struct *p = timer->it.cpu.task;
375 int ret = 0; 381 int ret = 0;
382 unsigned long flags;
383 struct sighand_struct *sighand;
384 struct task_struct *p = timer->it.cpu.task;
376 385
377 if (likely(p != NULL)) { 386 WARN_ON_ONCE(p == NULL);
378 read_lock(&tasklist_lock);
379 if (unlikely(p->sighand == NULL)) {
380 /*
381 * We raced with the reaping of the task.
382 * The deletion should have cleared us off the list.
383 */
384 BUG_ON(!list_empty(&timer->it.cpu.entry));
385 } else {
386 spin_lock(&p->sighand->siglock);
387 if (timer->it.cpu.firing)
388 ret = TIMER_RETRY;
389 else
390 list_del(&timer->it.cpu.entry);
391 spin_unlock(&p->sighand->siglock);
392 }
393 read_unlock(&tasklist_lock);
394 387
395 if (!ret) 388 /*
396 put_task_struct(p); 389 * Protect against sighand release/switch in exit/exec and process/
390 * thread timer list entry concurrent read/writes.
391 */
392 sighand = lock_task_sighand(p, &flags);
393 if (unlikely(sighand == NULL)) {
394 /*
395 * We raced with the reaping of the task.
396 * The deletion should have cleared us off the list.
397 */
398 WARN_ON_ONCE(!list_empty(&timer->it.cpu.entry));
399 } else {
400 if (timer->it.cpu.firing)
401 ret = TIMER_RETRY;
402 else
403 list_del(&timer->it.cpu.entry);
404
405 unlock_task_sighand(p, &flags);
397 } 406 }
398 407
408 if (!ret)
409 put_task_struct(p);
410
399 return ret; 411 return ret;
400} 412}
401 413
402static void cleanup_timers_list(struct list_head *head, 414static void cleanup_timers_list(struct list_head *head)
403 unsigned long long curr)
404{ 415{
405 struct cpu_timer_list *timer, *next; 416 struct cpu_timer_list *timer, *next;
406 417
@@ -414,16 +425,11 @@ static void cleanup_timers_list(struct list_head *head,
414 * time for later timer_gettime calls to return. 425 * time for later timer_gettime calls to return.
415 * This must be called with the siglock held. 426 * This must be called with the siglock held.
416 */ 427 */
417static void cleanup_timers(struct list_head *head, 428static void cleanup_timers(struct list_head *head)
418 cputime_t utime, cputime_t stime,
419 unsigned long long sum_exec_runtime)
420{ 429{
421 430 cleanup_timers_list(head);
422 cputime_t ptime = utime + stime; 431 cleanup_timers_list(++head);
423 432 cleanup_timers_list(++head);
424 cleanup_timers_list(head, cputime_to_expires(ptime));
425 cleanup_timers_list(++head, cputime_to_expires(utime));
426 cleanup_timers_list(++head, sum_exec_runtime);
427} 433}
428 434
429/* 435/*
@@ -433,41 +439,14 @@ static void cleanup_timers(struct list_head *head,
433 */ 439 */
434void posix_cpu_timers_exit(struct task_struct *tsk) 440void posix_cpu_timers_exit(struct task_struct *tsk)
435{ 441{
436 cputime_t utime, stime;
437
438 add_device_randomness((const void*) &tsk->se.sum_exec_runtime, 442 add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
439 sizeof(unsigned long long)); 443 sizeof(unsigned long long));
440 task_cputime(tsk, &utime, &stime); 444 cleanup_timers(tsk->cpu_timers);
441 cleanup_timers(tsk->cpu_timers,
442 utime, stime, tsk->se.sum_exec_runtime);
443 445
444} 446}
445void posix_cpu_timers_exit_group(struct task_struct *tsk) 447void posix_cpu_timers_exit_group(struct task_struct *tsk)
446{ 448{
447 struct signal_struct *const sig = tsk->signal; 449 cleanup_timers(tsk->signal->cpu_timers);
448 cputime_t utime, stime;
449
450 task_cputime(tsk, &utime, &stime);
451 cleanup_timers(tsk->signal->cpu_timers,
452 utime + sig->utime, stime + sig->stime,
453 tsk->se.sum_exec_runtime + sig->sum_sched_runtime);
454}
455
456static void clear_dead_task(struct k_itimer *itimer, unsigned long long now)
457{
458 struct cpu_timer_list *timer = &itimer->it.cpu;
459
460 /*
461 * That's all for this thread or process.
462 * We leave our residual in expires to be reported.
463 */
464 put_task_struct(timer->task);
465 timer->task = NULL;
466 if (timer->expires < now) {
467 timer->expires = 0;
468 } else {
469 timer->expires -= now;
470 }
471} 450}
472 451
473static inline int expires_gt(cputime_t expires, cputime_t new_exp) 452static inline int expires_gt(cputime_t expires, cputime_t new_exp)
@@ -477,8 +456,7 @@ static inline int expires_gt(cputime_t expires, cputime_t new_exp)
477 456
478/* 457/*
479 * Insert the timer on the appropriate list before any timers that 458 * Insert the timer on the appropriate list before any timers that
480 * expire later. This must be called with the tasklist_lock held 459 * expire later. This must be called with the sighand lock held.
481 * for reading, interrupts disabled and p->sighand->siglock taken.
482 */ 460 */
483static void arm_timer(struct k_itimer *timer) 461static void arm_timer(struct k_itimer *timer)
484{ 462{
@@ -569,7 +547,8 @@ static void cpu_timer_fire(struct k_itimer *timer)
569 547
570/* 548/*
571 * Sample a process (thread group) timer for the given group_leader task. 549 * Sample a process (thread group) timer for the given group_leader task.
572 * Must be called with tasklist_lock held for reading. 550 * Must be called with task sighand lock held for safe while_each_thread()
551 * traversal.
573 */ 552 */
574static int cpu_timer_sample_group(const clockid_t which_clock, 553static int cpu_timer_sample_group(const clockid_t which_clock,
575 struct task_struct *p, 554 struct task_struct *p,
@@ -608,7 +587,8 @@ static DECLARE_WORK(nohz_kick_work, nohz_kick_work_fn);
608 */ 587 */
609static void posix_cpu_timer_kick_nohz(void) 588static void posix_cpu_timer_kick_nohz(void)
610{ 589{
611 schedule_work(&nohz_kick_work); 590 if (context_tracking_is_enabled())
591 schedule_work(&nohz_kick_work);
612} 592}
613 593
614bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk) 594bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk)
@@ -631,43 +611,39 @@ static inline void posix_cpu_timer_kick_nohz(void) { }
631 * If we return TIMER_RETRY, it's necessary to release the timer's lock 611 * If we return TIMER_RETRY, it's necessary to release the timer's lock
632 * and try again. (This happens when the timer is in the middle of firing.) 612 * and try again. (This happens when the timer is in the middle of firing.)
633 */ 613 */
634static int posix_cpu_timer_set(struct k_itimer *timer, int flags, 614static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
635 struct itimerspec *new, struct itimerspec *old) 615 struct itimerspec *new, struct itimerspec *old)
636{ 616{
617 unsigned long flags;
618 struct sighand_struct *sighand;
637 struct task_struct *p = timer->it.cpu.task; 619 struct task_struct *p = timer->it.cpu.task;
638 unsigned long long old_expires, new_expires, old_incr, val; 620 unsigned long long old_expires, new_expires, old_incr, val;
639 int ret; 621 int ret;
640 622
641 if (unlikely(p == NULL)) { 623 WARN_ON_ONCE(p == NULL);
642 /*
643 * Timer refers to a dead task's clock.
644 */
645 return -ESRCH;
646 }
647 624
648 new_expires = timespec_to_sample(timer->it_clock, &new->it_value); 625 new_expires = timespec_to_sample(timer->it_clock, &new->it_value);
649 626
650 read_lock(&tasklist_lock);
651 /* 627 /*
652 * We need the tasklist_lock to protect against reaping that 628 * Protect against sighand release/switch in exit/exec and p->cpu_timers
653 * clears p->sighand. If p has just been reaped, we can no 629 * and p->signal->cpu_timers read/write in arm_timer()
630 */
631 sighand = lock_task_sighand(p, &flags);
632 /*
633 * If p has just been reaped, we can no
654 * longer get any information about it at all. 634 * longer get any information about it at all.
655 */ 635 */
656 if (unlikely(p->sighand == NULL)) { 636 if (unlikely(sighand == NULL)) {
657 read_unlock(&tasklist_lock);
658 put_task_struct(p);
659 timer->it.cpu.task = NULL;
660 return -ESRCH; 637 return -ESRCH;
661 } 638 }
662 639
663 /* 640 /*
664 * Disarm any old timer after extracting its expiry time. 641 * Disarm any old timer after extracting its expiry time.
665 */ 642 */
666 BUG_ON(!irqs_disabled()); 643 WARN_ON_ONCE(!irqs_disabled());
667 644
668 ret = 0; 645 ret = 0;
669 old_incr = timer->it.cpu.incr; 646 old_incr = timer->it.cpu.incr;
670 spin_lock(&p->sighand->siglock);
671 old_expires = timer->it.cpu.expires; 647 old_expires = timer->it.cpu.expires;
672 if (unlikely(timer->it.cpu.firing)) { 648 if (unlikely(timer->it.cpu.firing)) {
673 timer->it.cpu.firing = -1; 649 timer->it.cpu.firing = -1;
@@ -724,12 +700,11 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
724 * disable this firing since we are already reporting 700 * disable this firing since we are already reporting
725 * it as an overrun (thanks to bump_cpu_timer above). 701 * it as an overrun (thanks to bump_cpu_timer above).
726 */ 702 */
727 spin_unlock(&p->sighand->siglock); 703 unlock_task_sighand(p, &flags);
728 read_unlock(&tasklist_lock);
729 goto out; 704 goto out;
730 } 705 }
731 706
732 if (new_expires != 0 && !(flags & TIMER_ABSTIME)) { 707 if (new_expires != 0 && !(timer_flags & TIMER_ABSTIME)) {
733 new_expires += val; 708 new_expires += val;
734 } 709 }
735 710
@@ -743,9 +718,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
743 arm_timer(timer); 718 arm_timer(timer);
744 } 719 }
745 720
746 spin_unlock(&p->sighand->siglock); 721 unlock_task_sighand(p, &flags);
747 read_unlock(&tasklist_lock);
748
749 /* 722 /*
750 * Install the new reload setting, and 723 * Install the new reload setting, and
751 * set up the signal and overrun bookkeeping. 724 * set up the signal and overrun bookkeeping.
@@ -787,7 +760,8 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
787{ 760{
788 unsigned long long now; 761 unsigned long long now;
789 struct task_struct *p = timer->it.cpu.task; 762 struct task_struct *p = timer->it.cpu.task;
790 int clear_dead; 763
764 WARN_ON_ONCE(p == NULL);
791 765
792 /* 766 /*
793 * Easy part: convert the reload time. 767 * Easy part: convert the reload time.
@@ -800,52 +774,34 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
800 return; 774 return;
801 } 775 }
802 776
803 if (unlikely(p == NULL)) {
804 /*
805 * This task already died and the timer will never fire.
806 * In this case, expires is actually the dead value.
807 */
808 dead:
809 sample_to_timespec(timer->it_clock, timer->it.cpu.expires,
810 &itp->it_value);
811 return;
812 }
813
814 /* 777 /*
815 * Sample the clock to take the difference with the expiry time. 778 * Sample the clock to take the difference with the expiry time.
816 */ 779 */
817 if (CPUCLOCK_PERTHREAD(timer->it_clock)) { 780 if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
818 cpu_clock_sample(timer->it_clock, p, &now); 781 cpu_clock_sample(timer->it_clock, p, &now);
819 clear_dead = p->exit_state;
820 } else { 782 } else {
821 read_lock(&tasklist_lock); 783 struct sighand_struct *sighand;
822 if (unlikely(p->sighand == NULL)) { 784 unsigned long flags;
785
786 /*
787 * Protect against sighand release/switch in exit/exec and
788 * also make timer sampling safe if it ends up calling
789 * thread_group_cputime().
790 */
791 sighand = lock_task_sighand(p, &flags);
792 if (unlikely(sighand == NULL)) {
823 /* 793 /*
824 * The process has been reaped. 794 * The process has been reaped.
825 * We can't even collect a sample any more. 795 * We can't even collect a sample any more.
826 * Call the timer disarmed, nothing else to do. 796 * Call the timer disarmed, nothing else to do.
827 */ 797 */
828 put_task_struct(p);
829 timer->it.cpu.task = NULL;
830 timer->it.cpu.expires = 0; 798 timer->it.cpu.expires = 0;
831 read_unlock(&tasklist_lock); 799 sample_to_timespec(timer->it_clock, timer->it.cpu.expires,
832 goto dead; 800 &itp->it_value);
833 } else { 801 } else {
834 cpu_timer_sample_group(timer->it_clock, p, &now); 802 cpu_timer_sample_group(timer->it_clock, p, &now);
835 clear_dead = (unlikely(p->exit_state) && 803 unlock_task_sighand(p, &flags);
836 thread_group_empty(p));
837 } 804 }
838 read_unlock(&tasklist_lock);
839 }
840
841 if (unlikely(clear_dead)) {
842 /*
843 * We've noticed that the thread is dead, but
844 * not yet reaped. Take this opportunity to
845 * drop our task ref.
846 */
847 clear_dead_task(timer, now);
848 goto dead;
849 } 805 }
850 806
851 if (now < timer->it.cpu.expires) { 807 if (now < timer->it.cpu.expires) {
@@ -1059,14 +1015,12 @@ static void check_process_timers(struct task_struct *tsk,
1059 */ 1015 */
1060void posix_cpu_timer_schedule(struct k_itimer *timer) 1016void posix_cpu_timer_schedule(struct k_itimer *timer)
1061{ 1017{
1018 struct sighand_struct *sighand;
1019 unsigned long flags;
1062 struct task_struct *p = timer->it.cpu.task; 1020 struct task_struct *p = timer->it.cpu.task;
1063 unsigned long long now; 1021 unsigned long long now;
1064 1022
1065 if (unlikely(p == NULL)) 1023 WARN_ON_ONCE(p == NULL);
1066 /*
1067 * The task was cleaned up already, no future firings.
1068 */
1069 goto out;
1070 1024
1071 /* 1025 /*
1072 * Fetch the current sample and update the timer's expiry time. 1026 * Fetch the current sample and update the timer's expiry time.
@@ -1074,49 +1028,45 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
1074 if (CPUCLOCK_PERTHREAD(timer->it_clock)) { 1028 if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
1075 cpu_clock_sample(timer->it_clock, p, &now); 1029 cpu_clock_sample(timer->it_clock, p, &now);
1076 bump_cpu_timer(timer, now); 1030 bump_cpu_timer(timer, now);
1077 if (unlikely(p->exit_state)) { 1031 if (unlikely(p->exit_state))
1078 clear_dead_task(timer, now); 1032 goto out;
1033
1034 /* Protect timer list r/w in arm_timer() */
1035 sighand = lock_task_sighand(p, &flags);
1036 if (!sighand)
1079 goto out; 1037 goto out;
1080 }
1081 read_lock(&tasklist_lock); /* arm_timer needs it. */
1082 spin_lock(&p->sighand->siglock);
1083 } else { 1038 } else {
1084 read_lock(&tasklist_lock); 1039 /*
1085 if (unlikely(p->sighand == NULL)) { 1040 * Protect arm_timer() and timer sampling in case of call to
1041 * thread_group_cputime().
1042 */
1043 sighand = lock_task_sighand(p, &flags);
1044 if (unlikely(sighand == NULL)) {
1086 /* 1045 /*
1087 * The process has been reaped. 1046 * The process has been reaped.
1088 * We can't even collect a sample any more. 1047 * We can't even collect a sample any more.
1089 */ 1048 */
1090 put_task_struct(p);
1091 timer->it.cpu.task = p = NULL;
1092 timer->it.cpu.expires = 0; 1049 timer->it.cpu.expires = 0;
1093 goto out_unlock; 1050 goto out;
1094 } else if (unlikely(p->exit_state) && thread_group_empty(p)) { 1051 } else if (unlikely(p->exit_state) && thread_group_empty(p)) {
1095 /* 1052 unlock_task_sighand(p, &flags);
1096 * We've noticed that the thread is dead, but 1053 /* Optimizations: if the process is dying, no need to rearm */
1097 * not yet reaped. Take this opportunity to 1054 goto out;
1098 * drop our task ref.
1099 */
1100 cpu_timer_sample_group(timer->it_clock, p, &now);
1101 clear_dead_task(timer, now);
1102 goto out_unlock;
1103 } 1055 }
1104 spin_lock(&p->sighand->siglock);
1105 cpu_timer_sample_group(timer->it_clock, p, &now); 1056 cpu_timer_sample_group(timer->it_clock, p, &now);
1106 bump_cpu_timer(timer, now); 1057 bump_cpu_timer(timer, now);
1107 /* Leave the tasklist_lock locked for the call below. */ 1058 /* Leave the sighand locked for the call below. */
1108 } 1059 }
1109 1060
1110 /* 1061 /*
1111 * Now re-arm for the new expiry time. 1062 * Now re-arm for the new expiry time.
1112 */ 1063 */
1113 BUG_ON(!irqs_disabled()); 1064 WARN_ON_ONCE(!irqs_disabled());
1114 arm_timer(timer); 1065 arm_timer(timer);
1115 spin_unlock(&p->sighand->siglock); 1066 unlock_task_sighand(p, &flags);
1116
1117out_unlock:
1118 read_unlock(&tasklist_lock);
1119 1067
1068 /* Kick full dynticks CPUs in case they need to tick on the new timer */
1069 posix_cpu_timer_kick_nohz();
1120out: 1070out:
1121 timer->it_overrun_last = timer->it_overrun; 1071 timer->it_overrun_last = timer->it_overrun;
1122 timer->it_overrun = -1; 1072 timer->it_overrun = -1;
@@ -1200,7 +1150,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1200 struct k_itimer *timer, *next; 1150 struct k_itimer *timer, *next;
1201 unsigned long flags; 1151 unsigned long flags;
1202 1152
1203 BUG_ON(!irqs_disabled()); 1153 WARN_ON_ONCE(!irqs_disabled());
1204 1154
1205 /* 1155 /*
1206 * The fast path checks that there are no expired thread or thread 1156 * The fast path checks that there are no expired thread or thread
@@ -1256,13 +1206,6 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1256 cpu_timer_fire(timer); 1206 cpu_timer_fire(timer);
1257 spin_unlock(&timer->it_lock); 1207 spin_unlock(&timer->it_lock);
1258 } 1208 }
1259
1260 /*
1261 * In case some timers were rescheduled after the queue got emptied,
1262 * wake up full dynticks CPUs.
1263 */
1264 if (tsk->signal->cputimer.running)
1265 posix_cpu_timer_kick_nohz();
1266} 1209}
1267 1210
1268/* 1211/*
@@ -1274,7 +1217,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1274{ 1217{
1275 unsigned long long now; 1218 unsigned long long now;
1276 1219
1277 BUG_ON(clock_idx == CPUCLOCK_SCHED); 1220 WARN_ON_ONCE(clock_idx == CPUCLOCK_SCHED);
1278 cpu_timer_sample_group(clock_idx, tsk, &now); 1221 cpu_timer_sample_group(clock_idx, tsk, &now);
1279 1222
1280 if (oldval) { 1223 if (oldval) {
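A second theme in this file is downgrading BUG_ON() checks to WARN_ON_ONCE(): now that every armed timer pins its target task, a NULL timer->it.cpu.task indicates a programming error worth a one-time warning rather than a guaranteed crash. Where a caller still needs to bail out, the usual shape (not taken verbatim from this patch) is:

static int timer_op_sketch(struct k_itimer *timer)
{
	struct task_struct *p = timer->it.cpu.task;

	/* WARN_ON_ONCE() returns the tested condition, so it can gate a bailout. */
	if (WARN_ON_ONCE(p == NULL))
		return -EINVAL;

	/* ... normal processing ... */
	return 0;
}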
diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c
index d09dd10c5a5e..9a58bc258810 100644
--- a/kernel/power/block_io.c
+++ b/kernel/power/block_io.c
@@ -32,7 +32,7 @@ static int submit(int rw, struct block_device *bdev, sector_t sector,
32 struct bio *bio; 32 struct bio *bio;
33 33
34 bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); 34 bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
35 bio->bi_sector = sector; 35 bio->bi_iter.bi_sector = sector;
36 bio->bi_bdev = bdev; 36 bio->bi_bdev = bdev;
37 bio->bi_end_io = end_swap_bio_read; 37 bio->bi_end_io = end_swap_bio_read;
38 38
diff --git a/kernel/power/console.c b/kernel/power/console.c
index eacb8bd8cab4..aba9c545a0e3 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -9,6 +9,7 @@
9#include <linux/kbd_kern.h> 9#include <linux/kbd_kern.h>
10#include <linux/vt.h> 10#include <linux/vt.h>
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/slab.h>
12#include "power.h" 13#include "power.h"
13 14
14#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) 15#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1)
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 0121dab83f43..37170d4dd9a6 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -82,6 +82,7 @@ void hibernation_set_ops(const struct platform_hibernation_ops *ops)
82 82
83 unlock_system_sleep(); 83 unlock_system_sleep();
84} 84}
85EXPORT_SYMBOL_GPL(hibernation_set_ops);
85 86
86static bool entering_platform_hibernation; 87static bool entering_platform_hibernation;
87 88
@@ -293,10 +294,10 @@ static int create_image(int platform_mode)
293 error); 294 error);
294 /* Restore control flow magically appears here */ 295 /* Restore control flow magically appears here */
295 restore_processor_state(); 296 restore_processor_state();
296 if (!in_suspend) { 297 if (!in_suspend)
297 events_check_enabled = false; 298 events_check_enabled = false;
298 platform_leave(platform_mode); 299
299 } 300 platform_leave(platform_mode);
300 301
301 Power_up: 302 Power_up:
302 syscore_resume(); 303 syscore_resume();
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index b38109e204af..d9f61a145802 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -637,7 +637,7 @@ __register_nosave_region(unsigned long start_pfn, unsigned long end_pfn,
637 BUG_ON(!region); 637 BUG_ON(!region);
638 } else 638 } else
639 /* This allocation cannot fail */ 639 /* This allocation cannot fail */
640 region = alloc_bootmem(sizeof(struct nosave_region)); 640 region = memblock_virt_alloc(sizeof(struct nosave_region), 0);
641 region->start_pfn = start_pfn; 641 region->start_pfn = start_pfn;
642 region->end_pfn = end_pfn; 642 region->end_pfn = end_pfn;
643 list_add_tail(&region->list, &nosave_regions); 643 list_add_tail(&region->list, &nosave_regions);
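This hunk, like the printk change below, moves an early allocation from the bootmem API to memblock_virt_alloc*(): the plain variant panics on failure and therefore never returns NULL, while the _nopanic variant can return NULL and leaves error handling to the caller. A hedged sketch of the two call styles, with illustrative sizes:

/* Sketch: early boot allocations via memblock (sizes are illustrative). */
static void __init early_alloc_examples(void)
{
	void *region;
	char *buf;

	/* memblock_virt_alloc() panics on failure, so no NULL check is needed. */
	region = memblock_virt_alloc(sizeof(struct nosave_region), 0);
	pr_info("nosave region bookkeeping at %p\n", region);

	/* The _nopanic variant returns NULL on failure; the caller must cope. */
	buf = memblock_virt_alloc_nopanic(PAGE_SIZE, PAGE_SIZE);
	if (!buf)
		pr_warn("early buffer allocation failed\n");
}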
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index be7c86bae576..4dae9cbe9259 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -757,14 +757,10 @@ void __init setup_log_buf(int early)
757 return; 757 return;
758 758
759 if (early) { 759 if (early) {
760 unsigned long mem; 760 new_log_buf =
761 761 memblock_virt_alloc(new_log_buf_len, PAGE_SIZE);
762 mem = memblock_alloc(new_log_buf_len, PAGE_SIZE);
763 if (!mem)
764 return;
765 new_log_buf = __va(mem);
766 } else { 762 } else {
767 new_log_buf = alloc_bootmem_nopanic(new_log_buf_len); 763 new_log_buf = memblock_virt_alloc_nopanic(new_log_buf_len, 0);
768 } 764 }
769 765
770 if (unlikely(!new_log_buf)) { 766 if (unlikely(!new_log_buf)) {
@@ -1080,7 +1076,6 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
1080 next_seq = log_next_seq; 1076 next_seq = log_next_seq;
1081 1077
1082 len = 0; 1078 len = 0;
1083 prev = 0;
1084 while (len >= 0 && seq < next_seq) { 1079 while (len >= 0 && seq < next_seq) {
1085 struct printk_log *msg = log_from_idx(idx); 1080 struct printk_log *msg = log_from_idx(idx);
1086 int textlen; 1081 int textlen;
@@ -1599,10 +1594,13 @@ asmlinkage int vprintk_emit(int facility, int level,
1599 * either merge it with the current buffer and flush, or if 1594 * either merge it with the current buffer and flush, or if
1600 * there was a race with interrupts (prefix == true) then just 1595 * there was a race with interrupts (prefix == true) then just
1601 * flush it out and store this line separately. 1596 * flush it out and store this line separately.
1597 * If the preceding printk was from a different task and missed
1598 * a newline, flush and append the newline.
1602 */ 1599 */
1603 if (cont.len && cont.owner == current) { 1600 if (cont.len) {
1604 if (!(lflags & LOG_PREFIX)) 1601 if (cont.owner == current && !(lflags & LOG_PREFIX))
1605 stored = cont_add(facility, level, text, text_len); 1602 stored = cont_add(facility, level, text,
1603 text_len);
1606 cont_flush(LOG_NEWLINE); 1604 cont_flush(LOG_NEWLINE);
1607 } 1605 }
1608 1606
@@ -2789,7 +2787,6 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
2789 next_idx = idx; 2787 next_idx = idx;
2790 2788
2791 l = 0; 2789 l = 0;
2792 prev = 0;
2793 while (seq < dumper->next_seq) { 2790 while (seq < dumper->next_seq) {
2794 struct printk_log *msg = log_from_idx(idx); 2791 struct printk_log *msg = log_from_idx(idx);
2795 2792
diff --git a/kernel/profile.c b/kernel/profile.c
index 6631e1ef55ab..ebdd9c1a86b4 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -549,14 +549,14 @@ static int create_hash_tables(void)
549 struct page *page; 549 struct page *page;
550 550
551 page = alloc_pages_exact_node(node, 551 page = alloc_pages_exact_node(node,
552 GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, 552 GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
553 0); 553 0);
554 if (!page) 554 if (!page)
555 goto out_cleanup; 555 goto out_cleanup;
556 per_cpu(cpu_profile_hits, cpu)[1] 556 per_cpu(cpu_profile_hits, cpu)[1]
557 = (struct profile_hit *)page_address(page); 557 = (struct profile_hit *)page_address(page);
558 page = alloc_pages_exact_node(node, 558 page = alloc_pages_exact_node(node,
559 GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, 559 GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
560 0); 560 0);
561 if (!page) 561 if (!page)
562 goto out_cleanup; 562 goto out_cleanup;
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 7859a0a3951e..79c3877e9c5b 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -96,19 +96,22 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head)
96} 96}
97#endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ 97#endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
98 98
99extern void kfree(const void *); 99void kfree(const void *);
100 100
101static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) 101static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
102{ 102{
103 unsigned long offset = (unsigned long)head->func; 103 unsigned long offset = (unsigned long)head->func;
104 104
105 rcu_lock_acquire(&rcu_callback_map);
105 if (__is_kfree_rcu_offset(offset)) { 106 if (__is_kfree_rcu_offset(offset)) {
106 RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset)); 107 RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset));
107 kfree((void *)head - offset); 108 kfree((void *)head - offset);
109 rcu_lock_release(&rcu_callback_map);
108 return 1; 110 return 1;
109 } else { 111 } else {
110 RCU_TRACE(trace_rcu_invoke_callback(rn, head)); 112 RCU_TRACE(trace_rcu_invoke_callback(rn, head));
111 head->func(head); 113 head->func(head);
114 rcu_lock_release(&rcu_callback_map);
112 return 0; 115 return 0;
113 } 116 }
114} 117}
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index 01d5ccb8bfe3..3318d8284384 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -363,6 +363,29 @@ static void srcu_flip(struct srcu_struct *sp)
363/* 363/*
364 * Enqueue an SRCU callback on the specified srcu_struct structure, 364 * Enqueue an SRCU callback on the specified srcu_struct structure,
365 * initiating grace-period processing if it is not already running. 365 * initiating grace-period processing if it is not already running.
366 *
367 * Note that all CPUs must agree that the grace period extended beyond
368 * all pre-existing SRCU read-side critical section. On systems with
369 * more than one CPU, this means that when "func()" is invoked, each CPU
370 * is guaranteed to have executed a full memory barrier since the end of
371 * its last corresponding SRCU read-side critical section whose beginning
372 * preceded the call to call_rcu(). It also means that each CPU executing
373 * an SRCU read-side critical section that continues beyond the start of
374 * "func()" must have executed a memory barrier after the call_rcu()
375 * but before the beginning of that SRCU read-side critical section.
376 * Note that these guarantees include CPUs that are offline, idle, or
377 * executing in user mode, as well as CPUs that are executing in the kernel.
378 *
379 * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
380 * resulting SRCU callback function "func()", then both CPU A and CPU
381 * B are guaranteed to execute a full memory barrier during the time
382 * interval between the call to call_rcu() and the invocation of "func()".
383 * This guarantee applies even if CPU A and CPU B are the same CPU (but
384 * again only if the system has more than one CPU).
385 *
386 * Of course, these guarantees apply only for invocations of call_srcu(),
387 * srcu_read_lock(), and srcu_read_unlock() that are all passed the same
388 * srcu_struct structure.
366 */ 389 */
367void call_srcu(struct srcu_struct *sp, struct rcu_head *head, 390void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
368 void (*func)(struct rcu_head *head)) 391 void (*func)(struct rcu_head *head))
@@ -459,7 +482,30 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
459 * Note that it is illegal to call synchronize_srcu() from the corresponding 482 * Note that it is illegal to call synchronize_srcu() from the corresponding
460 * SRCU read-side critical section; doing so will result in deadlock. 483 * SRCU read-side critical section; doing so will result in deadlock.
461 * However, it is perfectly legal to call synchronize_srcu() on one 484 * However, it is perfectly legal to call synchronize_srcu() on one
462 * srcu_struct from some other srcu_struct's read-side critical section. 485 * srcu_struct from some other srcu_struct's read-side critical section,
486 * as long as the resulting graph of srcu_structs is acyclic.
487 *
488 * There are memory-ordering constraints implied by synchronize_srcu().
489 * On systems with more than one CPU, when synchronize_srcu() returns,
490 * each CPU is guaranteed to have executed a full memory barrier since
491 * the end of its last corresponding SRCU-sched read-side critical section
492 * whose beginning preceded the call to synchronize_srcu(). In addition,
493 * each CPU having an SRCU read-side critical section that extends beyond
494 * the return from synchronize_srcu() is guaranteed to have executed a
495 * full memory barrier after the beginning of synchronize_srcu() and before
496 * the beginning of that SRCU read-side critical section. Note that these
497 * guarantees include CPUs that are offline, idle, or executing in user mode,
498 * as well as CPUs that are executing in the kernel.
499 *
500 * Furthermore, if CPU A invoked synchronize_srcu(), which returned
501 * to its caller on CPU B, then both CPU A and CPU B are guaranteed
502 * to have executed a full memory barrier during the execution of
503 * synchronize_srcu(). This guarantee applies even if CPU A and CPU B
504 * are the same CPU, but again only if the system has more than one CPU.
505 *
506 * Of course, these memory-ordering guarantees apply only when
507 * synchronize_srcu(), srcu_read_lock(), and srcu_read_unlock() are
508 * passed the same srcu_struct structure.
463 */ 509 */
464void synchronize_srcu(struct srcu_struct *sp) 510void synchronize_srcu(struct srcu_struct *sp)
465{ 511{
@@ -476,12 +522,8 @@ EXPORT_SYMBOL_GPL(synchronize_srcu);
476 * Wait for an SRCU grace period to elapse, but be more aggressive about 522 * Wait for an SRCU grace period to elapse, but be more aggressive about
477 * spinning rather than blocking when waiting. 523 * spinning rather than blocking when waiting.
478 * 524 *
479 * Note that it is also illegal to call synchronize_srcu_expedited() 525 * Note that synchronize_srcu_expedited() has the same deadlock and
480 * from the corresponding SRCU read-side critical section; 526 * memory-ordering properties as does synchronize_srcu().
481 * doing so will result in deadlock. However, it is perfectly legal
482 * to call synchronize_srcu_expedited() on one srcu_struct from some
483 * other srcu_struct's read-side critical section, as long as
484 * the resulting graph of srcu_structs is acyclic.
485 */ 527 */
486void synchronize_srcu_expedited(struct srcu_struct *sp) 528void synchronize_srcu_expedited(struct srcu_struct *sp)
487{ 529{
@@ -491,6 +533,7 @@ EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);
491 533
492/** 534/**
493 * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete. 535 * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete.
536 * @sp: srcu_struct on which to wait for in-flight callbacks.
494 */ 537 */
495void srcu_barrier(struct srcu_struct *sp) 538void srcu_barrier(struct srcu_struct *sp)
496{ 539{
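The expanded comments above spell out SRCU's ordering guarantees, which hold only when readers, updaters, and callbacks all use the same srcu_struct. For orientation, a minimal reader/updater pairing against one srcu_struct might look like the following (all names here are hypothetical):

struct foo;					/* hypothetical protected type */
DEFINE_STATIC_SRCU(example_srcu);		/* hypothetical SRCU domain */
static struct foo __rcu *example_ptr;		/* hypothetical protected pointer */

static void example_reader(void)
{
	struct foo *p;
	int idx;

	idx = srcu_read_lock(&example_srcu);
	p = srcu_dereference(example_ptr, &example_srcu);
	/* ... read-side use of p; SRCU readers may sleep, unlike plain RCU ... */
	srcu_read_unlock(&example_srcu, idx);
}

static void example_updater(struct foo *newp, struct foo *oldp)
{
	rcu_assign_pointer(example_ptr, newp);
	synchronize_srcu(&example_srcu);	/* waits out pre-existing readers */
	kfree(oldp);
}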
diff --git a/kernel/rcu/torture.c b/kernel/rcu/torture.c
index 3929cd451511..732f8ae3086a 100644
--- a/kernel/rcu/torture.c
+++ b/kernel/rcu/torture.c
@@ -139,8 +139,6 @@ MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s");
139#define VERBOSE_PRINTK_ERRSTRING(s) \ 139#define VERBOSE_PRINTK_ERRSTRING(s) \
140 do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0) 140 do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0)
141 141
142static char printk_buf[4096];
143
144static int nrealreaders; 142static int nrealreaders;
145static struct task_struct *writer_task; 143static struct task_struct *writer_task;
146static struct task_struct **fakewriter_tasks; 144static struct task_struct **fakewriter_tasks;
@@ -376,7 +374,7 @@ struct rcu_torture_ops {
376 void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); 374 void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
377 void (*cb_barrier)(void); 375 void (*cb_barrier)(void);
378 void (*fqs)(void); 376 void (*fqs)(void);
379 int (*stats)(char *page); 377 void (*stats)(char *page);
380 int irq_capable; 378 int irq_capable;
381 int can_boost; 379 int can_boost;
382 const char *name; 380 const char *name;
@@ -578,21 +576,19 @@ static void srcu_torture_barrier(void)
578 srcu_barrier(&srcu_ctl); 576 srcu_barrier(&srcu_ctl);
579} 577}
580 578
581static int srcu_torture_stats(char *page) 579static void srcu_torture_stats(char *page)
582{ 580{
583 int cnt = 0;
584 int cpu; 581 int cpu;
585 int idx = srcu_ctl.completed & 0x1; 582 int idx = srcu_ctl.completed & 0x1;
586 583
587 cnt += sprintf(&page[cnt], "%s%s per-CPU(idx=%d):", 584 page += sprintf(page, "%s%s per-CPU(idx=%d):",
588 torture_type, TORTURE_FLAG, idx); 585 torture_type, TORTURE_FLAG, idx);
589 for_each_possible_cpu(cpu) { 586 for_each_possible_cpu(cpu) {
590 cnt += sprintf(&page[cnt], " %d(%lu,%lu)", cpu, 587 page += sprintf(page, " %d(%lu,%lu)", cpu,
591 per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx], 588 per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx],
592 per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]); 589 per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]);
593 } 590 }
594 cnt += sprintf(&page[cnt], "\n"); 591 sprintf(page, "\n");
595 return cnt;
596} 592}
597 593
598static void srcu_torture_synchronize_expedited(void) 594static void srcu_torture_synchronize_expedited(void)
@@ -1052,10 +1048,9 @@ rcu_torture_reader(void *arg)
1052/* 1048/*
1053 * Create an RCU-torture statistics message in the specified buffer. 1049 * Create an RCU-torture statistics message in the specified buffer.
1054 */ 1050 */
1055static int 1051static void
1056rcu_torture_printk(char *page) 1052rcu_torture_printk(char *page)
1057{ 1053{
1058 int cnt = 0;
1059 int cpu; 1054 int cpu;
1060 int i; 1055 int i;
1061 long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; 1056 long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
@@ -1071,8 +1066,8 @@ rcu_torture_printk(char *page)
1071 if (pipesummary[i] != 0) 1066 if (pipesummary[i] != 0)
1072 break; 1067 break;
1073 } 1068 }
1074 cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); 1069 page += sprintf(page, "%s%s ", torture_type, TORTURE_FLAG);
1075 cnt += sprintf(&page[cnt], 1070 page += sprintf(page,
1076 "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ", 1071 "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ",
1077 rcu_torture_current, 1072 rcu_torture_current,
1078 rcu_torture_current_version, 1073 rcu_torture_current_version,
@@ -1080,53 +1075,52 @@ rcu_torture_printk(char *page)
1080 atomic_read(&n_rcu_torture_alloc), 1075 atomic_read(&n_rcu_torture_alloc),
1081 atomic_read(&n_rcu_torture_alloc_fail), 1076 atomic_read(&n_rcu_torture_alloc_fail),
1082 atomic_read(&n_rcu_torture_free)); 1077 atomic_read(&n_rcu_torture_free));
1083 cnt += sprintf(&page[cnt], "rtmbe: %d rtbke: %ld rtbre: %ld ", 1078 page += sprintf(page, "rtmbe: %d rtbke: %ld rtbre: %ld ",
1084 atomic_read(&n_rcu_torture_mberror), 1079 atomic_read(&n_rcu_torture_mberror),
1085 n_rcu_torture_boost_ktrerror, 1080 n_rcu_torture_boost_ktrerror,
1086 n_rcu_torture_boost_rterror); 1081 n_rcu_torture_boost_rterror);
1087 cnt += sprintf(&page[cnt], "rtbf: %ld rtb: %ld nt: %ld ", 1082 page += sprintf(page, "rtbf: %ld rtb: %ld nt: %ld ",
1088 n_rcu_torture_boost_failure, 1083 n_rcu_torture_boost_failure,
1089 n_rcu_torture_boosts, 1084 n_rcu_torture_boosts,
1090 n_rcu_torture_timers); 1085 n_rcu_torture_timers);
1091 cnt += sprintf(&page[cnt], 1086 page += sprintf(page,
1092 "onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ", 1087 "onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ",
1093 n_online_successes, n_online_attempts, 1088 n_online_successes, n_online_attempts,
1094 n_offline_successes, n_offline_attempts, 1089 n_offline_successes, n_offline_attempts,
1095 min_online, max_online, 1090 min_online, max_online,
1096 min_offline, max_offline, 1091 min_offline, max_offline,
1097 sum_online, sum_offline, HZ); 1092 sum_online, sum_offline, HZ);
1098 cnt += sprintf(&page[cnt], "barrier: %ld/%ld:%ld", 1093 page += sprintf(page, "barrier: %ld/%ld:%ld",
1099 n_barrier_successes, 1094 n_barrier_successes,
1100 n_barrier_attempts, 1095 n_barrier_attempts,
1101 n_rcu_torture_barrier_error); 1096 n_rcu_torture_barrier_error);
1102 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); 1097 page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG);
1103 if (atomic_read(&n_rcu_torture_mberror) != 0 || 1098 if (atomic_read(&n_rcu_torture_mberror) != 0 ||
1104 n_rcu_torture_barrier_error != 0 || 1099 n_rcu_torture_barrier_error != 0 ||
1105 n_rcu_torture_boost_ktrerror != 0 || 1100 n_rcu_torture_boost_ktrerror != 0 ||
1106 n_rcu_torture_boost_rterror != 0 || 1101 n_rcu_torture_boost_rterror != 0 ||
1107 n_rcu_torture_boost_failure != 0 || 1102 n_rcu_torture_boost_failure != 0 ||
1108 i > 1) { 1103 i > 1) {
1109 cnt += sprintf(&page[cnt], "!!! "); 1104 page += sprintf(page, "!!! ");
1110 atomic_inc(&n_rcu_torture_error); 1105 atomic_inc(&n_rcu_torture_error);
1111 WARN_ON_ONCE(1); 1106 WARN_ON_ONCE(1);
1112 } 1107 }
1113 cnt += sprintf(&page[cnt], "Reader Pipe: "); 1108 page += sprintf(page, "Reader Pipe: ");
1114 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) 1109 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
1115 cnt += sprintf(&page[cnt], " %ld", pipesummary[i]); 1110 page += sprintf(page, " %ld", pipesummary[i]);
1116 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); 1111 page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG);
1117 cnt += sprintf(&page[cnt], "Reader Batch: "); 1112 page += sprintf(page, "Reader Batch: ");
1118 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) 1113 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
1119 cnt += sprintf(&page[cnt], " %ld", batchsummary[i]); 1114 page += sprintf(page, " %ld", batchsummary[i]);
1120 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); 1115 page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG);
1121 cnt += sprintf(&page[cnt], "Free-Block Circulation: "); 1116 page += sprintf(page, "Free-Block Circulation: ");
1122 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { 1117 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
1123 cnt += sprintf(&page[cnt], " %d", 1118 page += sprintf(page, " %d",
1124 atomic_read(&rcu_torture_wcount[i])); 1119 atomic_read(&rcu_torture_wcount[i]));
1125 } 1120 }
1126 cnt += sprintf(&page[cnt], "\n"); 1121 page += sprintf(page, "\n");
1127 if (cur_ops->stats) 1122 if (cur_ops->stats)
1128 cnt += cur_ops->stats(&page[cnt]); 1123 cur_ops->stats(page);
1129 return cnt;
1130} 1124}
1131 1125
1132/* 1126/*
@@ -1140,10 +1134,17 @@ rcu_torture_printk(char *page)
1140static void 1134static void
1141rcu_torture_stats_print(void) 1135rcu_torture_stats_print(void)
1142{ 1136{
1143 int cnt; 1137 int size = nr_cpu_ids * 200 + 8192;
1138 char *buf;
1144 1139
1145 cnt = rcu_torture_printk(printk_buf); 1140 buf = kmalloc(size, GFP_KERNEL);
1146 pr_alert("%s", printk_buf); 1141 if (!buf) {
1142 pr_err("rcu-torture: Out of memory, need: %d", size);
1143 return;
1144 }
1145 rcu_torture_printk(buf);
1146 pr_alert("%s", buf);
1147 kfree(buf);
1147} 1148}
1148 1149
1149/* 1150/*
@@ -1578,6 +1579,7 @@ static int rcu_torture_barrier_cbs(void *arg)
1578{ 1579{
1579 long myid = (long)arg; 1580 long myid = (long)arg;
1580 bool lastphase = 0; 1581 bool lastphase = 0;
1582 bool newphase;
1581 struct rcu_head rcu; 1583 struct rcu_head rcu;
1582 1584
1583 init_rcu_head_on_stack(&rcu); 1585 init_rcu_head_on_stack(&rcu);
@@ -1585,10 +1587,11 @@ static int rcu_torture_barrier_cbs(void *arg)
1585 set_user_nice(current, 19); 1587 set_user_nice(current, 19);
1586 do { 1588 do {
1587 wait_event(barrier_cbs_wq[myid], 1589 wait_event(barrier_cbs_wq[myid],
1588 barrier_phase != lastphase || 1590 (newphase =
1591 ACCESS_ONCE(barrier_phase)) != lastphase ||
1589 kthread_should_stop() || 1592 kthread_should_stop() ||
1590 fullstop != FULLSTOP_DONTSTOP); 1593 fullstop != FULLSTOP_DONTSTOP);
1591 lastphase = barrier_phase; 1594 lastphase = newphase;
1592 smp_mb(); /* ensure barrier_phase load before ->call(). */ 1595 smp_mb(); /* ensure barrier_phase load before ->call(). */
1593 if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) 1596 if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP)
1594 break; 1597 break;
@@ -1625,7 +1628,7 @@ static int rcu_torture_barrier(void *arg)
1625 if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) 1628 if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP)
1626 break; 1629 break;
1627 n_barrier_attempts++; 1630 n_barrier_attempts++;
1628 cur_ops->cb_barrier(); 1631 cur_ops->cb_barrier(); /* Implies smp_mb() for wait_event(). */
1629 if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) { 1632 if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) {
1630 n_rcu_torture_barrier_error++; 1633 n_rcu_torture_barrier_error++;
1631 WARN_ON_ONCE(1); 1634 WARN_ON_ONCE(1);
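The stats rework above replaces the fixed 4096-byte printk_buf with a buffer sized from nr_cpu_ids, and converts the formatters from index bookkeeping (cnt += sprintf(&page[cnt], ...)) to advancing the output pointer by sprintf()'s return value, which is what the new void-returning ->stats() hook expects. A condensed sketch of the pattern:

static void emit_stats_sketch(char *page)
{
	int cpu;

	/* sprintf() returns the number of characters written, excluding the NUL. */
	page += sprintf(page, "per-CPU counters:");
	for_each_possible_cpu(cpu)
		page += sprintf(page, " %d", cpu);
	sprintf(page, "\n");
}

static void print_stats_sketch(void)
{
	int size = nr_cpu_ids * 200 + 8192;	/* mirrors the sizing used above */
	char *buf = kmalloc(size, GFP_KERNEL);

	if (!buf)
		return;
	emit_stats_sketch(buf);
	pr_alert("%s", buf);
	kfree(buf);
}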
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index dd081987a8ec..b3d116cd072d 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -369,6 +369,9 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
369static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, 369static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
370 bool user) 370 bool user)
371{ 371{
372 struct rcu_state *rsp;
373 struct rcu_data *rdp;
374
372 trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting); 375 trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting);
373 if (!user && !is_idle_task(current)) { 376 if (!user && !is_idle_task(current)) {
374 struct task_struct *idle __maybe_unused = 377 struct task_struct *idle __maybe_unused =
@@ -380,6 +383,10 @@ static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
380 current->pid, current->comm, 383 current->pid, current->comm,
381 idle->pid, idle->comm); /* must be idle task! */ 384 idle->pid, idle->comm); /* must be idle task! */
382 } 385 }
386 for_each_rcu_flavor(rsp) {
387 rdp = this_cpu_ptr(rsp->rda);
388 do_nocb_deferred_wakeup(rdp);
389 }
383 rcu_prepare_for_idle(smp_processor_id()); 390 rcu_prepare_for_idle(smp_processor_id());
384 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ 391 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
385 smp_mb__before_atomic_inc(); /* See above. */ 392 smp_mb__before_atomic_inc(); /* See above. */
@@ -411,11 +418,12 @@ static void rcu_eqs_enter(bool user)
411 rdtp = this_cpu_ptr(&rcu_dynticks); 418 rdtp = this_cpu_ptr(&rcu_dynticks);
412 oldval = rdtp->dynticks_nesting; 419 oldval = rdtp->dynticks_nesting;
413 WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0); 420 WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0);
414 if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) 421 if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) {
415 rdtp->dynticks_nesting = 0; 422 rdtp->dynticks_nesting = 0;
416 else 423 rcu_eqs_enter_common(rdtp, oldval, user);
424 } else {
417 rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE; 425 rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE;
418 rcu_eqs_enter_common(rdtp, oldval, user); 426 }
419} 427}
420 428
421/** 429/**
@@ -533,11 +541,12 @@ static void rcu_eqs_exit(bool user)
533 rdtp = this_cpu_ptr(&rcu_dynticks); 541 rdtp = this_cpu_ptr(&rcu_dynticks);
534 oldval = rdtp->dynticks_nesting; 542 oldval = rdtp->dynticks_nesting;
535 WARN_ON_ONCE(oldval < 0); 543 WARN_ON_ONCE(oldval < 0);
536 if (oldval & DYNTICK_TASK_NEST_MASK) 544 if (oldval & DYNTICK_TASK_NEST_MASK) {
537 rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE; 545 rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE;
538 else 546 } else {
539 rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; 547 rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
540 rcu_eqs_exit_common(rdtp, oldval, user); 548 rcu_eqs_exit_common(rdtp, oldval, user);
549 }
541} 550}
542 551
543/** 552/**
@@ -716,7 +725,7 @@ bool rcu_lockdep_current_cpu_online(void)
716 bool ret; 725 bool ret;
717 726
718 if (in_nmi()) 727 if (in_nmi())
719 return 1; 728 return true;
720 preempt_disable(); 729 preempt_disable();
721 rdp = this_cpu_ptr(&rcu_sched_data); 730 rdp = this_cpu_ptr(&rcu_sched_data);
722 rnp = rdp->mynode; 731 rnp = rdp->mynode;
@@ -755,6 +764,12 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp,
755} 764}
756 765
757/* 766/*
767 * This function really isn't for public consumption, but RCU is special in
768 * that context switches can allow the state machine to make progress.
769 */
770extern void resched_cpu(int cpu);
771
772/*
758 * Return true if the specified CPU has passed through a quiescent 773 * Return true if the specified CPU has passed through a quiescent
759 * state by virtue of being in or having passed through an dynticks 774 * state by virtue of being in or having passed through an dynticks
760 * idle state since the last call to dyntick_save_progress_counter() 775 * idle state since the last call to dyntick_save_progress_counter()
@@ -812,16 +827,34 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
812 */ 827 */
813 rcu_kick_nohz_cpu(rdp->cpu); 828 rcu_kick_nohz_cpu(rdp->cpu);
814 829
830 /*
831 * Alternatively, the CPU might be running in the kernel
832 * for an extended period of time without a quiescent state.
833 * Attempt to force the CPU through the scheduler to gain the
834 * needed quiescent state, but only if the grace period has gone
835 * on for an uncommonly long time. If there are many stuck CPUs,
836 * we will beat on the first one until it gets unstuck, then move
837 * to the next. Only do this for the primary flavor of RCU.
838 */
839 if (rdp->rsp == rcu_state &&
840 ULONG_CMP_GE(ACCESS_ONCE(jiffies), rdp->rsp->jiffies_resched)) {
841 rdp->rsp->jiffies_resched += 5;
842 resched_cpu(rdp->cpu);
843 }
844
815 return 0; 845 return 0;
816} 846}
817 847
818static void record_gp_stall_check_time(struct rcu_state *rsp) 848static void record_gp_stall_check_time(struct rcu_state *rsp)
819{ 849{
820 unsigned long j = ACCESS_ONCE(jiffies); 850 unsigned long j = ACCESS_ONCE(jiffies);
851 unsigned long j1;
821 852
822 rsp->gp_start = j; 853 rsp->gp_start = j;
823 smp_wmb(); /* Record start time before stall time. */ 854 smp_wmb(); /* Record start time before stall time. */
824 rsp->jiffies_stall = j + rcu_jiffies_till_stall_check(); 855 j1 = rcu_jiffies_till_stall_check();
856 rsp->jiffies_stall = j + j1;
857 rsp->jiffies_resched = j + j1 / 2;
825} 858}
826 859
827/* 860/*
@@ -1133,8 +1166,10 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
1133 * hold it, acquire the root rcu_node structure's lock in order to 1166 * hold it, acquire the root rcu_node structure's lock in order to
1134 * start one (if needed). 1167 * start one (if needed).
1135 */ 1168 */
1136 if (rnp != rnp_root) 1169 if (rnp != rnp_root) {
1137 raw_spin_lock(&rnp_root->lock); 1170 raw_spin_lock(&rnp_root->lock);
1171 smp_mb__after_unlock_lock();
1172 }
1138 1173
1139 /* 1174 /*
1140 * Get a new grace-period number. If there really is no grace 1175 * Get a new grace-period number. If there really is no grace
@@ -1354,6 +1389,7 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
1354 local_irq_restore(flags); 1389 local_irq_restore(flags);
1355 return; 1390 return;
1356 } 1391 }
1392 smp_mb__after_unlock_lock();
1357 __note_gp_changes(rsp, rnp, rdp); 1393 __note_gp_changes(rsp, rnp, rdp);
1358 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1394 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1359} 1395}
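The smp_mb__after_unlock_lock() calls being added after rcu_node lock acquisitions throughout this file upgrade the UNLOCK+LOCK sequence to a full memory barrier on architectures where it is not one already (it compiles to nothing elsewhere). The generic shape of the pattern, as a sketch:

static void ordered_handoff_sketch(raw_spinlock_t *a, raw_spinlock_t *b)
{
	raw_spin_unlock(a);

	raw_spin_lock(b);
	smp_mb__after_unlock_lock();	/* make UNLOCK(a)+LOCK(b) a full barrier */

	/* Accesses here are now ordered after everything done before unlock(a). */
}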
@@ -1368,6 +1404,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
1368 1404
1369 rcu_bind_gp_kthread(); 1405 rcu_bind_gp_kthread();
1370 raw_spin_lock_irq(&rnp->lock); 1406 raw_spin_lock_irq(&rnp->lock);
1407 smp_mb__after_unlock_lock();
1371 if (rsp->gp_flags == 0) { 1408 if (rsp->gp_flags == 0) {
1372 /* Spurious wakeup, tell caller to go back to sleep. */ 1409 /* Spurious wakeup, tell caller to go back to sleep. */
1373 raw_spin_unlock_irq(&rnp->lock); 1410 raw_spin_unlock_irq(&rnp->lock);
@@ -1409,6 +1446,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
1409 */ 1446 */
1410 rcu_for_each_node_breadth_first(rsp, rnp) { 1447 rcu_for_each_node_breadth_first(rsp, rnp) {
1411 raw_spin_lock_irq(&rnp->lock); 1448 raw_spin_lock_irq(&rnp->lock);
1449 smp_mb__after_unlock_lock();
1412 rdp = this_cpu_ptr(rsp->rda); 1450 rdp = this_cpu_ptr(rsp->rda);
1413 rcu_preempt_check_blocked_tasks(rnp); 1451 rcu_preempt_check_blocked_tasks(rnp);
1414 rnp->qsmask = rnp->qsmaskinit; 1452 rnp->qsmask = rnp->qsmaskinit;
@@ -1463,6 +1501,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
1463 /* Clear flag to prevent immediate re-entry. */ 1501 /* Clear flag to prevent immediate re-entry. */
1464 if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { 1502 if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
1465 raw_spin_lock_irq(&rnp->lock); 1503 raw_spin_lock_irq(&rnp->lock);
1504 smp_mb__after_unlock_lock();
1466 rsp->gp_flags &= ~RCU_GP_FLAG_FQS; 1505 rsp->gp_flags &= ~RCU_GP_FLAG_FQS;
1467 raw_spin_unlock_irq(&rnp->lock); 1506 raw_spin_unlock_irq(&rnp->lock);
1468 } 1507 }
@@ -1480,6 +1519,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
1480 struct rcu_node *rnp = rcu_get_root(rsp); 1519 struct rcu_node *rnp = rcu_get_root(rsp);
1481 1520
1482 raw_spin_lock_irq(&rnp->lock); 1521 raw_spin_lock_irq(&rnp->lock);
1522 smp_mb__after_unlock_lock();
1483 gp_duration = jiffies - rsp->gp_start; 1523 gp_duration = jiffies - rsp->gp_start;
1484 if (gp_duration > rsp->gp_max) 1524 if (gp_duration > rsp->gp_max)
1485 rsp->gp_max = gp_duration; 1525 rsp->gp_max = gp_duration;
@@ -1505,16 +1545,19 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
1505 */ 1545 */
1506 rcu_for_each_node_breadth_first(rsp, rnp) { 1546 rcu_for_each_node_breadth_first(rsp, rnp) {
1507 raw_spin_lock_irq(&rnp->lock); 1547 raw_spin_lock_irq(&rnp->lock);
1548 smp_mb__after_unlock_lock();
1508 ACCESS_ONCE(rnp->completed) = rsp->gpnum; 1549 ACCESS_ONCE(rnp->completed) = rsp->gpnum;
1509 rdp = this_cpu_ptr(rsp->rda); 1550 rdp = this_cpu_ptr(rsp->rda);
1510 if (rnp == rdp->mynode) 1551 if (rnp == rdp->mynode)
1511 __note_gp_changes(rsp, rnp, rdp); 1552 __note_gp_changes(rsp, rnp, rdp);
1553 /* smp_mb() provided by prior unlock-lock pair. */
1512 nocb += rcu_future_gp_cleanup(rsp, rnp); 1554 nocb += rcu_future_gp_cleanup(rsp, rnp);
1513 raw_spin_unlock_irq(&rnp->lock); 1555 raw_spin_unlock_irq(&rnp->lock);
1514 cond_resched(); 1556 cond_resched();
1515 } 1557 }
1516 rnp = rcu_get_root(rsp); 1558 rnp = rcu_get_root(rsp);
1517 raw_spin_lock_irq(&rnp->lock); 1559 raw_spin_lock_irq(&rnp->lock);
1560 smp_mb__after_unlock_lock();
1518 rcu_nocb_gp_set(rnp, nocb); 1561 rcu_nocb_gp_set(rnp, nocb);
1519 1562
1520 rsp->completed = rsp->gpnum; /* Declare grace period done. */ 1563 rsp->completed = rsp->gpnum; /* Declare grace period done. */
@@ -1553,6 +1596,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
1553 wait_event_interruptible(rsp->gp_wq, 1596 wait_event_interruptible(rsp->gp_wq,
1554 ACCESS_ONCE(rsp->gp_flags) & 1597 ACCESS_ONCE(rsp->gp_flags) &
1555 RCU_GP_FLAG_INIT); 1598 RCU_GP_FLAG_INIT);
1599 /* Locking provides needed memory barrier. */
1556 if (rcu_gp_init(rsp)) 1600 if (rcu_gp_init(rsp))
1557 break; 1601 break;
1558 cond_resched(); 1602 cond_resched();
@@ -1582,6 +1626,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
1582 (!ACCESS_ONCE(rnp->qsmask) && 1626 (!ACCESS_ONCE(rnp->qsmask) &&
1583 !rcu_preempt_blocked_readers_cgp(rnp)), 1627 !rcu_preempt_blocked_readers_cgp(rnp)),
1584 j); 1628 j);
1629 /* Locking provides needed memory barriers. */
1585 /* If grace period done, leave loop. */ 1630 /* If grace period done, leave loop. */
1586 if (!ACCESS_ONCE(rnp->qsmask) && 1631 if (!ACCESS_ONCE(rnp->qsmask) &&
1587 !rcu_preempt_blocked_readers_cgp(rnp)) 1632 !rcu_preempt_blocked_readers_cgp(rnp))
@@ -1749,6 +1794,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
1749 rnp_c = rnp; 1794 rnp_c = rnp;
1750 rnp = rnp->parent; 1795 rnp = rnp->parent;
1751 raw_spin_lock_irqsave(&rnp->lock, flags); 1796 raw_spin_lock_irqsave(&rnp->lock, flags);
1797 smp_mb__after_unlock_lock();
1752 WARN_ON_ONCE(rnp_c->qsmask); 1798 WARN_ON_ONCE(rnp_c->qsmask);
1753 } 1799 }
1754 1800
@@ -1778,6 +1824,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
1778 1824
1779 rnp = rdp->mynode; 1825 rnp = rdp->mynode;
1780 raw_spin_lock_irqsave(&rnp->lock, flags); 1826 raw_spin_lock_irqsave(&rnp->lock, flags);
1827 smp_mb__after_unlock_lock();
1781 if (rdp->passed_quiesce == 0 || rdp->gpnum != rnp->gpnum || 1828 if (rdp->passed_quiesce == 0 || rdp->gpnum != rnp->gpnum ||
1782 rnp->completed == rnp->gpnum) { 1829 rnp->completed == rnp->gpnum) {
1783 1830
@@ -1901,13 +1948,13 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
1901 * Adopt the RCU callbacks from the specified rcu_state structure's 1948 * Adopt the RCU callbacks from the specified rcu_state structure's
1902 * orphanage. The caller must hold the ->orphan_lock. 1949 * orphanage. The caller must hold the ->orphan_lock.
1903 */ 1950 */
1904static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) 1951static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags)
1905{ 1952{
1906 int i; 1953 int i;
1907 struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); 1954 struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
1908 1955
1909 /* No-CBs CPUs are handled specially. */ 1956 /* No-CBs CPUs are handled specially. */
1910 if (rcu_nocb_adopt_orphan_cbs(rsp, rdp)) 1957 if (rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags))
1911 return; 1958 return;
1912 1959
1913 /* Do the accounting first. */ 1960 /* Do the accounting first. */
@@ -1986,12 +2033,13 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
1986 2033
1987 /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ 2034 /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */
1988 rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); 2035 rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp);
1989 rcu_adopt_orphan_cbs(rsp); 2036 rcu_adopt_orphan_cbs(rsp, flags);
1990 2037
1991 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ 2038 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
1992 mask = rdp->grpmask; /* rnp->grplo is constant. */ 2039 mask = rdp->grpmask; /* rnp->grplo is constant. */
1993 do { 2040 do {
1994 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 2041 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
2042 smp_mb__after_unlock_lock();
1995 rnp->qsmaskinit &= ~mask; 2043 rnp->qsmaskinit &= ~mask;
1996 if (rnp->qsmaskinit != 0) { 2044 if (rnp->qsmaskinit != 0) {
1997 if (rnp != rdp->mynode) 2045 if (rnp != rdp->mynode)
@@ -2202,6 +2250,7 @@ static void force_qs_rnp(struct rcu_state *rsp,
2202 cond_resched(); 2250 cond_resched();
2203 mask = 0; 2251 mask = 0;
2204 raw_spin_lock_irqsave(&rnp->lock, flags); 2252 raw_spin_lock_irqsave(&rnp->lock, flags);
2253 smp_mb__after_unlock_lock();
2205 if (!rcu_gp_in_progress(rsp)) { 2254 if (!rcu_gp_in_progress(rsp)) {
2206 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2255 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2207 return; 2256 return;
@@ -2231,6 +2280,7 @@ static void force_qs_rnp(struct rcu_state *rsp,
2231 rnp = rcu_get_root(rsp); 2280 rnp = rcu_get_root(rsp);
2232 if (rnp->qsmask == 0) { 2281 if (rnp->qsmask == 0) {
2233 raw_spin_lock_irqsave(&rnp->lock, flags); 2282 raw_spin_lock_irqsave(&rnp->lock, flags);
2283 smp_mb__after_unlock_lock();
2234 rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */ 2284 rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
2235 } 2285 }
2236} 2286}
@@ -2263,6 +2313,7 @@ static void force_quiescent_state(struct rcu_state *rsp)
2263 2313
2264 /* Reached the root of the rcu_node tree, acquire lock. */ 2314 /* Reached the root of the rcu_node tree, acquire lock. */
2265 raw_spin_lock_irqsave(&rnp_old->lock, flags); 2315 raw_spin_lock_irqsave(&rnp_old->lock, flags);
2316 smp_mb__after_unlock_lock();
2266 raw_spin_unlock(&rnp_old->fqslock); 2317 raw_spin_unlock(&rnp_old->fqslock);
2267 if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { 2318 if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
2268 rsp->n_force_qs_lh++; 2319 rsp->n_force_qs_lh++;
@@ -2303,6 +2354,9 @@ __rcu_process_callbacks(struct rcu_state *rsp)
2303 /* If there are callbacks ready, invoke them. */ 2354 /* If there are callbacks ready, invoke them. */
2304 if (cpu_has_callbacks_ready_to_invoke(rdp)) 2355 if (cpu_has_callbacks_ready_to_invoke(rdp))
2305 invoke_rcu_callbacks(rsp, rdp); 2356 invoke_rcu_callbacks(rsp, rdp);
2357
2358 /* Do any needed deferred wakeups of rcuo kthreads. */
2359 do_nocb_deferred_wakeup(rdp);
2306} 2360}
2307 2361
2308/* 2362/*
@@ -2378,6 +2432,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
2378 struct rcu_node *rnp_root = rcu_get_root(rsp); 2432 struct rcu_node *rnp_root = rcu_get_root(rsp);
2379 2433
2380 raw_spin_lock(&rnp_root->lock); 2434 raw_spin_lock(&rnp_root->lock);
2435 smp_mb__after_unlock_lock();
2381 rcu_start_gp(rsp); 2436 rcu_start_gp(rsp);
2382 raw_spin_unlock(&rnp_root->lock); 2437 raw_spin_unlock(&rnp_root->lock);
2383 } else { 2438 } else {
@@ -2437,7 +2492,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
2437 2492
2438 if (cpu != -1) 2493 if (cpu != -1)
2439 rdp = per_cpu_ptr(rsp->rda, cpu); 2494 rdp = per_cpu_ptr(rsp->rda, cpu);
2440 offline = !__call_rcu_nocb(rdp, head, lazy); 2495 offline = !__call_rcu_nocb(rdp, head, lazy, flags);
2441 WARN_ON_ONCE(offline); 2496 WARN_ON_ONCE(offline);
2442 /* _call_rcu() is illegal on offline CPU; leak the callback. */ 2497 /* _call_rcu() is illegal on offline CPU; leak the callback. */
2443 local_irq_restore(flags); 2498 local_irq_restore(flags);
@@ -2757,6 +2812,10 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
2757 /* Check for CPU stalls, if enabled. */ 2812 /* Check for CPU stalls, if enabled. */
2758 check_cpu_stall(rsp, rdp); 2813 check_cpu_stall(rsp, rdp);
2759 2814
2815 /* Is this CPU a NO_HZ_FULL CPU that should ignore RCU? */
2816 if (rcu_nohz_full_cpu(rsp))
2817 return 0;
2818
2760 /* Is the RCU core waiting for a quiescent state from this CPU? */ 2819 /* Is the RCU core waiting for a quiescent state from this CPU? */
2761 if (rcu_scheduler_fully_active && 2820 if (rcu_scheduler_fully_active &&
2762 rdp->qs_pending && !rdp->passed_quiesce) { 2821 rdp->qs_pending && !rdp->passed_quiesce) {
@@ -2790,6 +2849,12 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
2790 return 1; 2849 return 1;
2791 } 2850 }
2792 2851
2852 /* Does this CPU need a deferred NOCB wakeup? */
2853 if (rcu_nocb_need_deferred_wakeup(rdp)) {
2854 rdp->n_rp_nocb_defer_wakeup++;
2855 return 1;
2856 }
2857
2793 /* nothing to do */ 2858 /* nothing to do */
2794 rdp->n_rp_need_nothing++; 2859 rdp->n_rp_need_nothing++;
2795 return 0; 2860 return 0;
@@ -3214,9 +3279,9 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
3214{ 3279{
3215 int i; 3280 int i;
3216 3281
3217 for (i = rcu_num_lvls - 1; i > 0; i--) 3282 rsp->levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf;
3283 for (i = rcu_num_lvls - 2; i >= 0; i--)
3218 rsp->levelspread[i] = CONFIG_RCU_FANOUT; 3284 rsp->levelspread[i] = CONFIG_RCU_FANOUT;
3219 rsp->levelspread[0] = rcu_fanout_leaf;
3220} 3285}
3221#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ 3286#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
3222static void __init rcu_init_levelspread(struct rcu_state *rsp) 3287static void __init rcu_init_levelspread(struct rcu_state *rsp)
@@ -3346,6 +3411,8 @@ static void __init rcu_init_geometry(void)
3346 if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF && 3411 if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF &&
3347 nr_cpu_ids == NR_CPUS) 3412 nr_cpu_ids == NR_CPUS)
3348 return; 3413 return;
3414 pr_info("RCU: Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%d\n",
3415 rcu_fanout_leaf, nr_cpu_ids);
3349 3416
3350 /* 3417 /*
3351 * Compute number of nodes that can be handled an rcu_node tree 3418 * Compute number of nodes that can be handled an rcu_node tree
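The recurring change above inserts smp_mb__after_unlock_lock() directly after each acquisition of rnp->lock. A minimal sketch of the idiom, with a hypothetical function name (only raw_spin_lock_irqsave(), smp_mb__after_unlock_lock() and the rcu_node lock come from the patch):

/*
 * Sketch only: on architectures where an UNLOCK on one CPU followed by a
 * LOCK on another is not a full memory barrier, smp_mb__after_unlock_lock()
 * upgrades the unlock+lock pair so that accesses made under the previous
 * lock holder are ordered before accesses made under this one.
 */
static void example_update_under_rnp_lock(struct rcu_node *rnp)
{
        unsigned long flags;

        raw_spin_lock_irqsave(&rnp->lock, flags);
        smp_mb__after_unlock_lock(); /* Full barrier vs. prior critical sections. */
        /* ... updates that must observe earlier critical sections ... */
        raw_spin_unlock_irqrestore(&rnp->lock, flags);
}

The macro is expected to be a no-op on most architectures and expand to a real barrier only where unlock+lock is weaker than smp_mb().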
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 52be957c9fe2..8c19873f1ac9 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -317,6 +317,7 @@ struct rcu_data {
317 unsigned long n_rp_cpu_needs_gp; 317 unsigned long n_rp_cpu_needs_gp;
318 unsigned long n_rp_gp_completed; 318 unsigned long n_rp_gp_completed;
319 unsigned long n_rp_gp_started; 319 unsigned long n_rp_gp_started;
320 unsigned long n_rp_nocb_defer_wakeup;
320 unsigned long n_rp_need_nothing; 321 unsigned long n_rp_need_nothing;
321 322
322 /* 6) _rcu_barrier() and OOM callbacks. */ 323 /* 6) _rcu_barrier() and OOM callbacks. */
@@ -335,6 +336,7 @@ struct rcu_data {
335 int nocb_p_count_lazy; /* (approximate). */ 336 int nocb_p_count_lazy; /* (approximate). */
336 wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */ 337 wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */
337 struct task_struct *nocb_kthread; 338 struct task_struct *nocb_kthread;
339 bool nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */
338#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ 340#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
339 341
340 /* 8) RCU CPU stall data. */ 342 /* 8) RCU CPU stall data. */
@@ -453,6 +455,8 @@ struct rcu_state {
453 /* but in jiffies. */ 455 /* but in jiffies. */
454 unsigned long jiffies_stall; /* Time at which to check */ 456 unsigned long jiffies_stall; /* Time at which to check */
455 /* for CPU stalls. */ 457 /* for CPU stalls. */
458 unsigned long jiffies_resched; /* Time at which to resched */
459 /* a reluctant CPU. */
456 unsigned long gp_max; /* Maximum GP duration in */ 460 unsigned long gp_max; /* Maximum GP duration in */
457 /* jiffies. */ 461 /* jiffies. */
458 const char *name; /* Name of structure. */ 462 const char *name; /* Name of structure. */
@@ -548,9 +552,12 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq);
548static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp); 552static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp);
549static void rcu_init_one_nocb(struct rcu_node *rnp); 553static void rcu_init_one_nocb(struct rcu_node *rnp);
550static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, 554static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
551 bool lazy); 555 bool lazy, unsigned long flags);
552static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, 556static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
553 struct rcu_data *rdp); 557 struct rcu_data *rdp,
558 unsigned long flags);
559static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp);
560static void do_nocb_deferred_wakeup(struct rcu_data *rdp);
554static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); 561static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
555static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); 562static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp);
556static void rcu_kick_nohz_cpu(int cpu); 563static void rcu_kick_nohz_cpu(int cpu);
@@ -564,6 +571,7 @@ static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
564 unsigned long maxj); 571 unsigned long maxj);
565static void rcu_bind_gp_kthread(void); 572static void rcu_bind_gp_kthread(void);
566static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp); 573static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp);
574static bool rcu_nohz_full_cpu(struct rcu_state *rsp);
567 575
568#endif /* #ifndef RCU_TREE_NONCORE */ 576#endif /* #ifndef RCU_TREE_NONCORE */
569 577
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 08a765232432..6e2ef4b2b920 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -204,6 +204,7 @@ static void rcu_preempt_note_context_switch(int cpu)
204 rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); 204 rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu);
205 rnp = rdp->mynode; 205 rnp = rdp->mynode;
206 raw_spin_lock_irqsave(&rnp->lock, flags); 206 raw_spin_lock_irqsave(&rnp->lock, flags);
207 smp_mb__after_unlock_lock();
207 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; 208 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
208 t->rcu_blocked_node = rnp; 209 t->rcu_blocked_node = rnp;
209 210
@@ -312,6 +313,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
312 mask = rnp->grpmask; 313 mask = rnp->grpmask;
313 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 314 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
314 raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */ 315 raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */
316 smp_mb__after_unlock_lock();
315 rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags); 317 rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags);
316} 318}
317 319
@@ -361,10 +363,14 @@ void rcu_read_unlock_special(struct task_struct *t)
361 special = t->rcu_read_unlock_special; 363 special = t->rcu_read_unlock_special;
362 if (special & RCU_READ_UNLOCK_NEED_QS) { 364 if (special & RCU_READ_UNLOCK_NEED_QS) {
363 rcu_preempt_qs(smp_processor_id()); 365 rcu_preempt_qs(smp_processor_id());
366 if (!t->rcu_read_unlock_special) {
367 local_irq_restore(flags);
368 return;
369 }
364 } 370 }
365 371
366 /* Hardware IRQ handlers cannot block. */ 372 /* Hardware IRQ handlers cannot block; complain if they get here. */
367 if (in_irq() || in_serving_softirq()) { 373 if (WARN_ON_ONCE(in_irq() || in_serving_softirq())) {
368 local_irq_restore(flags); 374 local_irq_restore(flags);
369 return; 375 return;
370 } 376 }
@@ -381,6 +387,7 @@ void rcu_read_unlock_special(struct task_struct *t)
381 for (;;) { 387 for (;;) {
382 rnp = t->rcu_blocked_node; 388 rnp = t->rcu_blocked_node;
383 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 389 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
390 smp_mb__after_unlock_lock();
384 if (rnp == t->rcu_blocked_node) 391 if (rnp == t->rcu_blocked_node)
385 break; 392 break;
386 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 393 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
@@ -605,6 +612,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
605 while (!list_empty(lp)) { 612 while (!list_empty(lp)) {
606 t = list_entry(lp->next, typeof(*t), rcu_node_entry); 613 t = list_entry(lp->next, typeof(*t), rcu_node_entry);
607 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ 614 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
615 smp_mb__after_unlock_lock();
608 list_del(&t->rcu_node_entry); 616 list_del(&t->rcu_node_entry);
609 t->rcu_blocked_node = rnp_root; 617 t->rcu_blocked_node = rnp_root;
610 list_add(&t->rcu_node_entry, lp_root); 618 list_add(&t->rcu_node_entry, lp_root);
@@ -629,6 +637,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
629 * in this case. 637 * in this case.
630 */ 638 */
631 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ 639 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
640 smp_mb__after_unlock_lock();
632 if (rnp_root->boost_tasks != NULL && 641 if (rnp_root->boost_tasks != NULL &&
633 rnp_root->boost_tasks != rnp_root->gp_tasks && 642 rnp_root->boost_tasks != rnp_root->gp_tasks &&
634 rnp_root->boost_tasks != rnp_root->exp_tasks) 643 rnp_root->boost_tasks != rnp_root->exp_tasks)
@@ -772,6 +781,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
772 unsigned long mask; 781 unsigned long mask;
773 782
774 raw_spin_lock_irqsave(&rnp->lock, flags); 783 raw_spin_lock_irqsave(&rnp->lock, flags);
784 smp_mb__after_unlock_lock();
775 for (;;) { 785 for (;;) {
776 if (!sync_rcu_preempt_exp_done(rnp)) { 786 if (!sync_rcu_preempt_exp_done(rnp)) {
777 raw_spin_unlock_irqrestore(&rnp->lock, flags); 787 raw_spin_unlock_irqrestore(&rnp->lock, flags);
@@ -779,14 +789,17 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
779 } 789 }
780 if (rnp->parent == NULL) { 790 if (rnp->parent == NULL) {
781 raw_spin_unlock_irqrestore(&rnp->lock, flags); 791 raw_spin_unlock_irqrestore(&rnp->lock, flags);
782 if (wake) 792 if (wake) {
793 smp_mb(); /* EGP done before wake_up(). */
783 wake_up(&sync_rcu_preempt_exp_wq); 794 wake_up(&sync_rcu_preempt_exp_wq);
795 }
784 break; 796 break;
785 } 797 }
786 mask = rnp->grpmask; 798 mask = rnp->grpmask;
787 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ 799 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
788 rnp = rnp->parent; 800 rnp = rnp->parent;
789 raw_spin_lock(&rnp->lock); /* irqs already disabled */ 801 raw_spin_lock(&rnp->lock); /* irqs already disabled */
802 smp_mb__after_unlock_lock();
790 rnp->expmask &= ~mask; 803 rnp->expmask &= ~mask;
791 } 804 }
792} 805}
@@ -806,6 +819,7 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
806 int must_wait = 0; 819 int must_wait = 0;
807 820
808 raw_spin_lock_irqsave(&rnp->lock, flags); 821 raw_spin_lock_irqsave(&rnp->lock, flags);
822 smp_mb__after_unlock_lock();
809 if (list_empty(&rnp->blkd_tasks)) { 823 if (list_empty(&rnp->blkd_tasks)) {
810 raw_spin_unlock_irqrestore(&rnp->lock, flags); 824 raw_spin_unlock_irqrestore(&rnp->lock, flags);
811 } else { 825 } else {
@@ -886,6 +900,7 @@ void synchronize_rcu_expedited(void)
886 /* Initialize ->expmask for all non-leaf rcu_node structures. */ 900 /* Initialize ->expmask for all non-leaf rcu_node structures. */
887 rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) { 901 rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) {
888 raw_spin_lock_irqsave(&rnp->lock, flags); 902 raw_spin_lock_irqsave(&rnp->lock, flags);
903 smp_mb__after_unlock_lock();
889 rnp->expmask = rnp->qsmaskinit; 904 rnp->expmask = rnp->qsmaskinit;
890 raw_spin_unlock_irqrestore(&rnp->lock, flags); 905 raw_spin_unlock_irqrestore(&rnp->lock, flags);
891 } 906 }
@@ -1191,6 +1206,7 @@ static int rcu_boost(struct rcu_node *rnp)
1191 return 0; /* Nothing left to boost. */ 1206 return 0; /* Nothing left to boost. */
1192 1207
1193 raw_spin_lock_irqsave(&rnp->lock, flags); 1208 raw_spin_lock_irqsave(&rnp->lock, flags);
1209 smp_mb__after_unlock_lock();
1194 1210
1195 /* 1211 /*
1196 * Recheck under the lock: all tasks in need of boosting 1212 * Recheck under the lock: all tasks in need of boosting
@@ -1377,6 +1393,7 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
1377 if (IS_ERR(t)) 1393 if (IS_ERR(t))
1378 return PTR_ERR(t); 1394 return PTR_ERR(t);
1379 raw_spin_lock_irqsave(&rnp->lock, flags); 1395 raw_spin_lock_irqsave(&rnp->lock, flags);
1396 smp_mb__after_unlock_lock();
1380 rnp->boost_kthread_task = t; 1397 rnp->boost_kthread_task = t;
1381 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1398 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1382 sp.sched_priority = RCU_BOOST_PRIO; 1399 sp.sched_priority = RCU_BOOST_PRIO;
@@ -1769,6 +1786,7 @@ static void rcu_prepare_for_idle(int cpu)
1769 continue; 1786 continue;
1770 rnp = rdp->mynode; 1787 rnp = rdp->mynode;
1771 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 1788 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
1789 smp_mb__after_unlock_lock();
1772 rcu_accelerate_cbs(rsp, rnp, rdp); 1790 rcu_accelerate_cbs(rsp, rnp, rdp);
1773 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1791 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1774 } 1792 }
@@ -1852,6 +1870,7 @@ static int rcu_oom_notify(struct notifier_block *self,
1852 1870
1853 /* Wait for callbacks from earlier instance to complete. */ 1871 /* Wait for callbacks from earlier instance to complete. */
1854 wait_event(oom_callback_wq, atomic_read(&oom_callback_count) == 0); 1872 wait_event(oom_callback_wq, atomic_read(&oom_callback_count) == 0);
1873 smp_mb(); /* Ensure callback reuse happens after callback invocation. */
1855 1874
1856 /* 1875 /*
1857 * Prevent premature wakeup: ensure that all increments happen 1876 * Prevent premature wakeup: ensure that all increments happen
@@ -2101,7 +2120,8 @@ bool rcu_is_nocb_cpu(int cpu)
2101static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, 2120static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
2102 struct rcu_head *rhp, 2121 struct rcu_head *rhp,
2103 struct rcu_head **rhtp, 2122 struct rcu_head **rhtp,
2104 int rhcount, int rhcount_lazy) 2123 int rhcount, int rhcount_lazy,
2124 unsigned long flags)
2105{ 2125{
2106 int len; 2126 int len;
2107 struct rcu_head **old_rhpp; 2127 struct rcu_head **old_rhpp;
@@ -2122,9 +2142,16 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
2122 } 2142 }
2123 len = atomic_long_read(&rdp->nocb_q_count); 2143 len = atomic_long_read(&rdp->nocb_q_count);
2124 if (old_rhpp == &rdp->nocb_head) { 2144 if (old_rhpp == &rdp->nocb_head) {
2125 wake_up(&rdp->nocb_wq); /* ... only if queue was empty ... */ 2145 if (!irqs_disabled_flags(flags)) {
2146 wake_up(&rdp->nocb_wq); /* ... if queue was empty ... */
2147 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2148 TPS("WakeEmpty"));
2149 } else {
2150 rdp->nocb_defer_wakeup = true;
2151 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2152 TPS("WakeEmptyIsDeferred"));
2153 }
2126 rdp->qlen_last_fqs_check = 0; 2154 rdp->qlen_last_fqs_check = 0;
2127 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeEmpty"));
2128 } else if (len > rdp->qlen_last_fqs_check + qhimark) { 2155 } else if (len > rdp->qlen_last_fqs_check + qhimark) {
2129 wake_up_process(t); /* ... or if many callbacks queued. */ 2156 wake_up_process(t); /* ... or if many callbacks queued. */
2130 rdp->qlen_last_fqs_check = LONG_MAX / 2; 2157 rdp->qlen_last_fqs_check = LONG_MAX / 2;
@@ -2145,12 +2172,12 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
2145 * "rcuo" kthread can find it. 2172 * "rcuo" kthread can find it.
2146 */ 2173 */
2147static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, 2174static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
2148 bool lazy) 2175 bool lazy, unsigned long flags)
2149{ 2176{
2150 2177
2151 if (!rcu_is_nocb_cpu(rdp->cpu)) 2178 if (!rcu_is_nocb_cpu(rdp->cpu))
2152 return 0; 2179 return 0;
2153 __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy); 2180 __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy, flags);
2154 if (__is_kfree_rcu_offset((unsigned long)rhp->func)) 2181 if (__is_kfree_rcu_offset((unsigned long)rhp->func))
2155 trace_rcu_kfree_callback(rdp->rsp->name, rhp, 2182 trace_rcu_kfree_callback(rdp->rsp->name, rhp,
2156 (unsigned long)rhp->func, 2183 (unsigned long)rhp->func,
@@ -2168,7 +2195,8 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
2168 * not a no-CBs CPU. 2195 * not a no-CBs CPU.
2169 */ 2196 */
2170static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, 2197static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
2171 struct rcu_data *rdp) 2198 struct rcu_data *rdp,
2199 unsigned long flags)
2172{ 2200{
2173 long ql = rsp->qlen; 2201 long ql = rsp->qlen;
2174 long qll = rsp->qlen_lazy; 2202 long qll = rsp->qlen_lazy;
@@ -2182,14 +2210,14 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
2182 /* First, enqueue the donelist, if any. This preserves CB ordering. */ 2210 /* First, enqueue the donelist, if any. This preserves CB ordering. */
2183 if (rsp->orphan_donelist != NULL) { 2211 if (rsp->orphan_donelist != NULL) {
2184 __call_rcu_nocb_enqueue(rdp, rsp->orphan_donelist, 2212 __call_rcu_nocb_enqueue(rdp, rsp->orphan_donelist,
2185 rsp->orphan_donetail, ql, qll); 2213 rsp->orphan_donetail, ql, qll, flags);
2186 ql = qll = 0; 2214 ql = qll = 0;
2187 rsp->orphan_donelist = NULL; 2215 rsp->orphan_donelist = NULL;
2188 rsp->orphan_donetail = &rsp->orphan_donelist; 2216 rsp->orphan_donetail = &rsp->orphan_donelist;
2189 } 2217 }
2190 if (rsp->orphan_nxtlist != NULL) { 2218 if (rsp->orphan_nxtlist != NULL) {
2191 __call_rcu_nocb_enqueue(rdp, rsp->orphan_nxtlist, 2219 __call_rcu_nocb_enqueue(rdp, rsp->orphan_nxtlist,
2192 rsp->orphan_nxttail, ql, qll); 2220 rsp->orphan_nxttail, ql, qll, flags);
2193 ql = qll = 0; 2221 ql = qll = 0;
2194 rsp->orphan_nxtlist = NULL; 2222 rsp->orphan_nxtlist = NULL;
2195 rsp->orphan_nxttail = &rsp->orphan_nxtlist; 2223 rsp->orphan_nxttail = &rsp->orphan_nxtlist;
@@ -2209,6 +2237,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
2209 struct rcu_node *rnp = rdp->mynode; 2237 struct rcu_node *rnp = rdp->mynode;
2210 2238
2211 raw_spin_lock_irqsave(&rnp->lock, flags); 2239 raw_spin_lock_irqsave(&rnp->lock, flags);
2240 smp_mb__after_unlock_lock();
2212 c = rcu_start_future_gp(rnp, rdp); 2241 c = rcu_start_future_gp(rnp, rdp);
2213 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2242 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2214 2243
@@ -2250,6 +2279,7 @@ static int rcu_nocb_kthread(void *arg)
2250 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 2279 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2251 TPS("Sleep")); 2280 TPS("Sleep"));
2252 wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head); 2281 wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head);
2282 /* Memory barrier provided by xchg() below. */
2253 } else if (firsttime) { 2283 } else if (firsttime) {
2254 firsttime = 0; 2284 firsttime = 0;
2255 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 2285 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
@@ -2310,6 +2340,22 @@ static int rcu_nocb_kthread(void *arg)
2310 return 0; 2340 return 0;
2311} 2341}
2312 2342
2343/* Is a deferred wakeup of rcu_nocb_kthread() required? */
2344static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
2345{
2346 return ACCESS_ONCE(rdp->nocb_defer_wakeup);
2347}
2348
2349/* Do a deferred wakeup of rcu_nocb_kthread(). */
2350static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
2351{
2352 if (!rcu_nocb_need_deferred_wakeup(rdp))
2353 return;
2354 ACCESS_ONCE(rdp->nocb_defer_wakeup) = false;
2355 wake_up(&rdp->nocb_wq);
2356 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWakeEmpty"));
2357}
2358
2313/* Initialize per-rcu_data variables for no-CBs CPUs. */ 2359/* Initialize per-rcu_data variables for no-CBs CPUs. */
2314static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) 2360static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
2315{ 2361{
@@ -2365,13 +2411,14 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)
2365} 2411}
2366 2412
2367static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, 2413static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
2368 bool lazy) 2414 bool lazy, unsigned long flags)
2369{ 2415{
2370 return 0; 2416 return 0;
2371} 2417}
2372 2418
2373static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, 2419static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
2374 struct rcu_data *rdp) 2420 struct rcu_data *rdp,
2421 unsigned long flags)
2375{ 2422{
2376 return 0; 2423 return 0;
2377} 2424}
@@ -2380,6 +2427,15 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
2380{ 2427{
2381} 2428}
2382 2429
2430static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
2431{
2432 return false;
2433}
2434
2435static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
2436{
2437}
2438
2383static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) 2439static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
2384{ 2440{
2385} 2441}
@@ -2829,3 +2885,23 @@ static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp)
2829} 2885}
2830 2886
2831#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ 2887#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
2888
2889/*
2890 * Is this CPU a NO_HZ_FULL CPU that should ignore RCU so that the
2891 * grace-period kthread will do force_quiescent_state() processing?
2892 * The idea is to avoid waking up RCU core processing on such a
2893 * CPU unless the grace period has extended for too long.
2894 *
2895 * This code relies on the fact that all NO_HZ_FULL CPUs are also
2896 * CONFIG_RCU_NOCB_CPU CPUs.
2897 */
2898static bool rcu_nohz_full_cpu(struct rcu_state *rsp)
2899{
2900#ifdef CONFIG_NO_HZ_FULL
2901 if (tick_nohz_full_cpu(smp_processor_id()) &&
2902 (!rcu_gp_in_progress(rsp) ||
2903 ULONG_CMP_LT(jiffies, ACCESS_ONCE(rsp->gp_start) + HZ)))
2904 return 1;
2905#endif /* #ifdef CONFIG_NO_HZ_FULL */
2906 return 0;
2907}
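rcu_nohz_full_cpu() above keys its "grace period is still young" test on ULONG_CMP_LT(), RCU's wraparound-safe comparison for unsigned counters such as jiffies. A simplified sketch of what that comparison amounts to (EXAMPLE_ULONG_CMP_LT is a made-up name; the kernel's own macro lives in the RCU headers):

/*
 * Sketch only: true when "a" is before "b" even if the counter has
 * wrapped, which a plain "a < b" would get wrong near the wrap point.
 */
#define EXAMPLE_ULONG_CMP_LT(a, b)      (ULONG_MAX / 2 < (a) - (b))

With this reading, ULONG_CMP_LT(jiffies, rsp->gp_start + HZ) stays true while the current grace period is less than one second old, so a NO_HZ_FULL CPU keeps deferring RCU-core work to the grace-period kthread until the grace period has dragged on longer than that.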
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
index 3596797b7e46..4def475336d4 100644
--- a/kernel/rcu/tree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -364,9 +364,10 @@ static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
364 rdp->n_rp_report_qs, 364 rdp->n_rp_report_qs,
365 rdp->n_rp_cb_ready, 365 rdp->n_rp_cb_ready,
366 rdp->n_rp_cpu_needs_gp); 366 rdp->n_rp_cpu_needs_gp);
367 seq_printf(m, "gpc=%ld gps=%ld nn=%ld\n", 367 seq_printf(m, "gpc=%ld gps=%ld nn=%ld ndw=%ld\n",
368 rdp->n_rp_gp_completed, 368 rdp->n_rp_gp_completed,
369 rdp->n_rp_gp_started, 369 rdp->n_rp_gp_started,
370 rdp->n_rp_nocb_defer_wakeup,
370 rdp->n_rp_need_nothing); 371 rdp->n_rp_need_nothing);
371} 372}
372 373
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 6cb3dff89e2b..c54609faf233 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -128,6 +128,11 @@ struct lockdep_map rcu_sched_lock_map =
128 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key); 128 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key);
129EXPORT_SYMBOL_GPL(rcu_sched_lock_map); 129EXPORT_SYMBOL_GPL(rcu_sched_lock_map);
130 130
131static struct lock_class_key rcu_callback_key;
132struct lockdep_map rcu_callback_map =
133 STATIC_LOCKDEP_MAP_INIT("rcu_callback", &rcu_callback_key);
134EXPORT_SYMBOL_GPL(rcu_callback_map);
135
131int notrace debug_lockdep_rcu_enabled(void) 136int notrace debug_lockdep_rcu_enabled(void)
132{ 137{
133 return rcu_scheduler_active && debug_locks && 138 return rcu_scheduler_active && debug_locks &&
@@ -195,17 +200,6 @@ void wait_rcu_gp(call_rcu_func_t crf)
195} 200}
196EXPORT_SYMBOL_GPL(wait_rcu_gp); 201EXPORT_SYMBOL_GPL(wait_rcu_gp);
197 202
198#ifdef CONFIG_PROVE_RCU
199/*
200 * wrapper function to avoid #include problems.
201 */
202int rcu_my_thread_group_empty(void)
203{
204 return thread_group_empty(current);
205}
206EXPORT_SYMBOL_GPL(rcu_my_thread_group_empty);
207#endif /* #ifdef CONFIG_PROVE_RCU */
208
209#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD 203#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
210static inline void debug_init_rcu_head(struct rcu_head *head) 204static inline void debug_init_rcu_head(struct rcu_head *head)
211{ 205{
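rcu_callback_map added above is a lockdep-only "virtual lock" naming the RCU-callback execution context. A sketch of how such a map is meant to be used (the wrapper function is hypothetical; only rcu_callback_map and the rcu_lock_acquire()/rcu_lock_release() helpers are assumed):

/*
 * Sketch only: holding the map while a callback runs lets lockdep complain
 * if code reached from an RCU callback tries to wait for RCU callbacks.
 */
static void example_invoke_one_callback(struct rcu_head *rhp)
{
        rcu_lock_acquire(&rcu_callback_map);    /* enter "in RCU callback" */
        rhp->func(rhp);                         /* run the callback */
        rcu_lock_release(&rcu_callback_map);    /* leave "in RCU callback" */
}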
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 7b621409cf15..9a95c8c2af2a 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -11,9 +11,10 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
11CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer 11CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
12endif 12endif
13 13
14obj-y += core.o proc.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o 14obj-y += core.o proc.o clock.o cputime.o
15obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
15obj-y += wait.o completion.o 16obj-y += wait.o completion.o
16obj-$(CONFIG_SMP) += cpupri.o 17obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
17obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o 18obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
18obj-$(CONFIG_SCHEDSTATS) += stats.o 19obj-$(CONFIG_SCHEDSTATS) += stats.o
19obj-$(CONFIG_SCHED_DEBUG) += debug.o 20obj-$(CONFIG_SCHED_DEBUG) += debug.o
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index c3ae1446461c..b30a2924ef14 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -26,9 +26,10 @@
26 * at 0 on boot (but people really shouldn't rely on that). 26 * at 0 on boot (but people really shouldn't rely on that).
27 * 27 *
28 * cpu_clock(i) -- can be used from any context, including NMI. 28 * cpu_clock(i) -- can be used from any context, including NMI.
29 * sched_clock_cpu(i) -- must be used with local IRQs disabled (implied by NMI)
30 * local_clock() -- is cpu_clock() on the current cpu. 29 * local_clock() -- is cpu_clock() on the current cpu.
31 * 30 *
31 * sched_clock_cpu(i)
32 *
32 * How: 33 * How:
33 * 34 *
34 * The implementation either uses sched_clock() when 35 * The implementation either uses sched_clock() when
@@ -50,15 +51,6 @@
50 * Furthermore, explicit sleep and wakeup hooks allow us to account for time 51 * Furthermore, explicit sleep and wakeup hooks allow us to account for time
51 * that is otherwise invisible (TSC gets stopped). 52 * that is otherwise invisible (TSC gets stopped).
52 * 53 *
53 *
54 * Notes:
55 *
56 * The !IRQ-safetly of sched_clock() and sched_clock_cpu() comes from things
57 * like cpufreq interrupts that can change the base clock (TSC) multiplier
58 * and cause funny jumps in time -- although the filtering provided by
59 * sched_clock_cpu() should mitigate serious artifacts we cannot rely on it
60 * in general since for !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK we fully rely on
61 * sched_clock().
62 */ 54 */
63#include <linux/spinlock.h> 55#include <linux/spinlock.h>
64#include <linux/hardirq.h> 56#include <linux/hardirq.h>
@@ -66,6 +58,8 @@
66#include <linux/percpu.h> 58#include <linux/percpu.h>
67#include <linux/ktime.h> 59#include <linux/ktime.h>
68#include <linux/sched.h> 60#include <linux/sched.h>
61#include <linux/static_key.h>
62#include <linux/workqueue.h>
69 63
70/* 64/*
71 * Scheduler clock - returns current time in nanosec units. 65 * Scheduler clock - returns current time in nanosec units.
@@ -82,7 +76,52 @@ EXPORT_SYMBOL_GPL(sched_clock);
82__read_mostly int sched_clock_running; 76__read_mostly int sched_clock_running;
83 77
84#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK 78#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
85__read_mostly int sched_clock_stable; 79static struct static_key __sched_clock_stable = STATIC_KEY_INIT;
80static int __sched_clock_stable_early;
81
82int sched_clock_stable(void)
83{
84 return static_key_false(&__sched_clock_stable);
85}
86
87static void __set_sched_clock_stable(void)
88{
89 if (!sched_clock_stable())
90 static_key_slow_inc(&__sched_clock_stable);
91}
92
93void set_sched_clock_stable(void)
94{
95 __sched_clock_stable_early = 1;
96
97 smp_mb(); /* matches sched_clock_init() */
98
99 if (!sched_clock_running)
100 return;
101
102 __set_sched_clock_stable();
103}
104
105static void __clear_sched_clock_stable(struct work_struct *work)
106{
107 /* XXX worry about clock continuity */
108 if (sched_clock_stable())
109 static_key_slow_dec(&__sched_clock_stable);
110}
111
112static DECLARE_WORK(sched_clock_work, __clear_sched_clock_stable);
113
114void clear_sched_clock_stable(void)
115{
116 __sched_clock_stable_early = 0;
117
118 smp_mb(); /* matches sched_clock_init() */
119
120 if (!sched_clock_running)
121 return;
122
123 schedule_work(&sched_clock_work);
124}
86 125
87struct sched_clock_data { 126struct sched_clock_data {
88 u64 tick_raw; 127 u64 tick_raw;
@@ -116,6 +155,20 @@ void sched_clock_init(void)
116 } 155 }
117 156
118 sched_clock_running = 1; 157 sched_clock_running = 1;
158
159 /*
160 * Ensure that it is impossible to not do a static_key update.
161 *
162 * Either {set,clear}_sched_clock_stable() must see sched_clock_running
163 * and do the update, or we must see their __sched_clock_stable_early
164 * and do the update, or both.
165 */
166 smp_mb(); /* matches {set,clear}_sched_clock_stable() */
167
168 if (__sched_clock_stable_early)
169 __set_sched_clock_stable();
170 else
171 __clear_sched_clock_stable(NULL);
119} 172}
120 173
121/* 174/*
@@ -242,20 +295,20 @@ u64 sched_clock_cpu(int cpu)
242 struct sched_clock_data *scd; 295 struct sched_clock_data *scd;
243 u64 clock; 296 u64 clock;
244 297
245 WARN_ON_ONCE(!irqs_disabled()); 298 if (sched_clock_stable())
246
247 if (sched_clock_stable)
248 return sched_clock(); 299 return sched_clock();
249 300
250 if (unlikely(!sched_clock_running)) 301 if (unlikely(!sched_clock_running))
251 return 0ull; 302 return 0ull;
252 303
304 preempt_disable_notrace();
253 scd = cpu_sdc(cpu); 305 scd = cpu_sdc(cpu);
254 306
255 if (cpu != smp_processor_id()) 307 if (cpu != smp_processor_id())
256 clock = sched_clock_remote(scd); 308 clock = sched_clock_remote(scd);
257 else 309 else
258 clock = sched_clock_local(scd); 310 clock = sched_clock_local(scd);
311 preempt_enable_notrace();
259 312
260 return clock; 313 return clock;
261} 314}
@@ -265,7 +318,7 @@ void sched_clock_tick(void)
265 struct sched_clock_data *scd; 318 struct sched_clock_data *scd;
266 u64 now, now_gtod; 319 u64 now, now_gtod;
267 320
268 if (sched_clock_stable) 321 if (sched_clock_stable())
269 return; 322 return;
270 323
271 if (unlikely(!sched_clock_running)) 324 if (unlikely(!sched_clock_running))
@@ -316,14 +369,10 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
316 */ 369 */
317u64 cpu_clock(int cpu) 370u64 cpu_clock(int cpu)
318{ 371{
319 u64 clock; 372 if (!sched_clock_stable())
320 unsigned long flags; 373 return sched_clock_cpu(cpu);
321
322 local_irq_save(flags);
323 clock = sched_clock_cpu(cpu);
324 local_irq_restore(flags);
325 374
326 return clock; 375 return sched_clock();
327} 376}
328 377
329/* 378/*
@@ -335,14 +384,10 @@ u64 cpu_clock(int cpu)
335 */ 384 */
336u64 local_clock(void) 385u64 local_clock(void)
337{ 386{
338 u64 clock; 387 if (!sched_clock_stable())
339 unsigned long flags; 388 return sched_clock_cpu(raw_smp_processor_id());
340 389
341 local_irq_save(flags); 390 return sched_clock();
342 clock = sched_clock_cpu(smp_processor_id());
343 local_irq_restore(flags);
344
345 return clock;
346} 391}
347 392
348#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ 393#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
@@ -362,12 +407,12 @@ u64 sched_clock_cpu(int cpu)
362 407
363u64 cpu_clock(int cpu) 408u64 cpu_clock(int cpu)
364{ 409{
365 return sched_clock_cpu(cpu); 410 return sched_clock();
366} 411}
367 412
368u64 local_clock(void) 413u64 local_clock(void)
369{ 414{
370 return sched_clock_cpu(0); 415 return sched_clock();
371} 416}
372 417
373#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ 418#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
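The clock.c hunks replace the old sched_clock_stable integer with a static key, so the stable-clock decision becomes a runtime-patched branch rather than a load and test. A minimal sketch of the same idiom with hypothetical names (static_key, STATIC_KEY_INIT, static_key_false(), sched_clock() and sched_clock_cpu() are the real interfaces):

/*
 * Sketch only: with a default-false key, static_key_false() compiles to a
 * fall-through no-op; enabling the key patches it into a jump, so choosing
 * between the two clock paths costs no conditional load on the hot path.
 */
static struct static_key example_clock_stable = STATIC_KEY_INIT;

static u64 example_read_clock(void)
{
        if (static_key_false(&example_clock_stable))    /* patched jump when enabled */
                return sched_clock();                   /* stable fast path */
        return sched_clock_cpu(raw_smp_processor_id()); /* filtered slow path */
}

Because static_key_slow_dec() may sleep, the patch defers the disable to a workqueue (sched_clock_work), as seen in __clear_sched_clock_stable() above.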
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a88f4a485c5e..f5c6635b806c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -296,8 +296,6 @@ __read_mostly int scheduler_running;
296 */ 296 */
297int sysctl_sched_rt_runtime = 950000; 297int sysctl_sched_rt_runtime = 950000;
298 298
299
300
301/* 299/*
302 * __task_rq_lock - lock the rq @p resides on. 300 * __task_rq_lock - lock the rq @p resides on.
303 */ 301 */
@@ -899,7 +897,9 @@ static inline int normal_prio(struct task_struct *p)
899{ 897{
900 int prio; 898 int prio;
901 899
902 if (task_has_rt_policy(p)) 900 if (task_has_dl_policy(p))
901 prio = MAX_DL_PRIO-1;
902 else if (task_has_rt_policy(p))
903 prio = MAX_RT_PRIO-1 - p->rt_priority; 903 prio = MAX_RT_PRIO-1 - p->rt_priority;
904 else 904 else
905 prio = __normal_prio(p); 905 prio = __normal_prio(p);
@@ -945,7 +945,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
945 if (prev_class->switched_from) 945 if (prev_class->switched_from)
946 prev_class->switched_from(rq, p); 946 prev_class->switched_from(rq, p);
947 p->sched_class->switched_to(rq, p); 947 p->sched_class->switched_to(rq, p);
948 } else if (oldprio != p->prio) 948 } else if (oldprio != p->prio || dl_task(p))
949 p->sched_class->prio_changed(rq, p, oldprio); 949 p->sched_class->prio_changed(rq, p, oldprio);
950} 950}
951 951
@@ -1108,6 +1108,7 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p)
1108 if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task))) 1108 if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task)))
1109 goto out; 1109 goto out;
1110 1110
1111 trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
1111 ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg); 1112 ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
1112 1113
1113out: 1114out:
@@ -1499,8 +1500,7 @@ void scheduler_ipi(void)
1499 * TIF_NEED_RESCHED remotely (for the first time) will also send 1500 * TIF_NEED_RESCHED remotely (for the first time) will also send
1500 * this IPI. 1501 * this IPI.
1501 */ 1502 */
1502 if (tif_need_resched()) 1503 preempt_fold_need_resched();
1503 set_preempt_need_resched();
1504 1504
1505 if (llist_empty(&this_rq()->wake_list) 1505 if (llist_empty(&this_rq()->wake_list)
1506 && !tick_nohz_full_cpu(smp_processor_id()) 1506 && !tick_nohz_full_cpu(smp_processor_id())
@@ -1717,6 +1717,13 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
1717 memset(&p->se.statistics, 0, sizeof(p->se.statistics)); 1717 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
1718#endif 1718#endif
1719 1719
1720 RB_CLEAR_NODE(&p->dl.rb_node);
1721 hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1722 p->dl.dl_runtime = p->dl.runtime = 0;
1723 p->dl.dl_deadline = p->dl.deadline = 0;
1724 p->dl.dl_period = 0;
1725 p->dl.flags = 0;
1726
1720 INIT_LIST_HEAD(&p->rt.run_list); 1727 INIT_LIST_HEAD(&p->rt.run_list);
1721 1728
1722#ifdef CONFIG_PREEMPT_NOTIFIERS 1729#ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -1763,12 +1770,34 @@ void set_numabalancing_state(bool enabled)
1763 numabalancing_enabled = enabled; 1770 numabalancing_enabled = enabled;
1764} 1771}
1765#endif /* CONFIG_SCHED_DEBUG */ 1772#endif /* CONFIG_SCHED_DEBUG */
1766#endif /* CONFIG_NUMA_BALANCING */ 1773
1774#ifdef CONFIG_PROC_SYSCTL
1775int sysctl_numa_balancing(struct ctl_table *table, int write,
1776 void __user *buffer, size_t *lenp, loff_t *ppos)
1777{
1778 struct ctl_table t;
1779 int err;
1780 int state = numabalancing_enabled;
1781
1782 if (write && !capable(CAP_SYS_ADMIN))
1783 return -EPERM;
1784
1785 t = *table;
1786 t.data = &state;
1787 err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
1788 if (err < 0)
1789 return err;
1790 if (write)
1791 set_numabalancing_state(state);
1792 return err;
1793}
1794#endif
1795#endif
1767 1796
1768/* 1797/*
1769 * fork()/clone()-time setup: 1798 * fork()/clone()-time setup:
1770 */ 1799 */
1771void sched_fork(unsigned long clone_flags, struct task_struct *p) 1800int sched_fork(unsigned long clone_flags, struct task_struct *p)
1772{ 1801{
1773 unsigned long flags; 1802 unsigned long flags;
1774 int cpu = get_cpu(); 1803 int cpu = get_cpu();
@@ -1790,7 +1819,7 @@ void sched_fork(unsigned long clone_flags, struct task_struct *p)
1790 * Revert to default priority/policy on fork if requested. 1819 * Revert to default priority/policy on fork if requested.
1791 */ 1820 */
1792 if (unlikely(p->sched_reset_on_fork)) { 1821 if (unlikely(p->sched_reset_on_fork)) {
1793 if (task_has_rt_policy(p)) { 1822 if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
1794 p->policy = SCHED_NORMAL; 1823 p->policy = SCHED_NORMAL;
1795 p->static_prio = NICE_TO_PRIO(0); 1824 p->static_prio = NICE_TO_PRIO(0);
1796 p->rt_priority = 0; 1825 p->rt_priority = 0;
@@ -1807,8 +1836,14 @@ void sched_fork(unsigned long clone_flags, struct task_struct *p)
1807 p->sched_reset_on_fork = 0; 1836 p->sched_reset_on_fork = 0;
1808 } 1837 }
1809 1838
1810 if (!rt_prio(p->prio)) 1839 if (dl_prio(p->prio)) {
1840 put_cpu();
1841 return -EAGAIN;
1842 } else if (rt_prio(p->prio)) {
1843 p->sched_class = &rt_sched_class;
1844 } else {
1811 p->sched_class = &fair_sched_class; 1845 p->sched_class = &fair_sched_class;
1846 }
1812 1847
1813 if (p->sched_class->task_fork) 1848 if (p->sched_class->task_fork)
1814 p->sched_class->task_fork(p); 1849 p->sched_class->task_fork(p);
@@ -1834,11 +1869,124 @@ void sched_fork(unsigned long clone_flags, struct task_struct *p)
1834 init_task_preempt_count(p); 1869 init_task_preempt_count(p);
1835#ifdef CONFIG_SMP 1870#ifdef CONFIG_SMP
1836 plist_node_init(&p->pushable_tasks, MAX_PRIO); 1871 plist_node_init(&p->pushable_tasks, MAX_PRIO);
1872 RB_CLEAR_NODE(&p->pushable_dl_tasks);
1837#endif 1873#endif
1838 1874
1839 put_cpu(); 1875 put_cpu();
1876 return 0;
1877}
1878
1879unsigned long to_ratio(u64 period, u64 runtime)
1880{
1881 if (runtime == RUNTIME_INF)
1882 return 1ULL << 20;
1883
1884 /*
1885 * Doing this here saves a lot of checks in all
1886 * the calling paths, and returning zero seems
1887 * safe for them anyway.
1888 */
1889 if (period == 0)
1890 return 0;
1891
1892 return div64_u64(runtime << 20, period);
1840} 1893}
1841 1894
1895#ifdef CONFIG_SMP
1896inline struct dl_bw *dl_bw_of(int i)
1897{
1898 return &cpu_rq(i)->rd->dl_bw;
1899}
1900
1901static inline int dl_bw_cpus(int i)
1902{
1903 struct root_domain *rd = cpu_rq(i)->rd;
1904 int cpus = 0;
1905
1906 for_each_cpu_and(i, rd->span, cpu_active_mask)
1907 cpus++;
1908
1909 return cpus;
1910}
1911#else
1912inline struct dl_bw *dl_bw_of(int i)
1913{
1914 return &cpu_rq(i)->dl.dl_bw;
1915}
1916
1917static inline int dl_bw_cpus(int i)
1918{
1919 return 1;
1920}
1921#endif
1922
1923static inline
1924void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
1925{
1926 dl_b->total_bw -= tsk_bw;
1927}
1928
1929static inline
1930void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
1931{
1932 dl_b->total_bw += tsk_bw;
1933}
1934
1935static inline
1936bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
1937{
1938 return dl_b->bw != -1 &&
1939 dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
1940}
1941
1942/*
1943 * We must be sure that accepting a new task (or allowing changing the
1944 * parameters of an existing one) is consistent with the bandwidth
1945 * constraints. If yes, this function also accordingly updates the currently
1946 * allocated bandwidth to reflect the new situation.
1947 *
1948 * This function is called while holding p's rq->lock.
1949 */
1950static int dl_overflow(struct task_struct *p, int policy,
1951 const struct sched_attr *attr)
1952{
1953
1954 struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
1955 u64 period = attr->sched_period ?: attr->sched_deadline;
1956 u64 runtime = attr->sched_runtime;
1957 u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
1958 int cpus, err = -1;
1959
1960 if (new_bw == p->dl.dl_bw)
1961 return 0;
1962
1963 /*
1964 * Whether a task enters, leaves, or stays -deadline but changes
1965 * its parameters, we may need to update the total allocated
1966 * bandwidth of the container accordingly.
1967 */
1968 raw_spin_lock(&dl_b->lock);
1969 cpus = dl_bw_cpus(task_cpu(p));
1970 if (dl_policy(policy) && !task_has_dl_policy(p) &&
1971 !__dl_overflow(dl_b, cpus, 0, new_bw)) {
1972 __dl_add(dl_b, new_bw);
1973 err = 0;
1974 } else if (dl_policy(policy) && task_has_dl_policy(p) &&
1975 !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) {
1976 __dl_clear(dl_b, p->dl.dl_bw);
1977 __dl_add(dl_b, new_bw);
1978 err = 0;
1979 } else if (!dl_policy(policy) && task_has_dl_policy(p)) {
1980 __dl_clear(dl_b, p->dl.dl_bw);
1981 err = 0;
1982 }
1983 raw_spin_unlock(&dl_b->lock);
1984
1985 return err;
1986}
1987
1988extern void init_dl_bw(struct dl_bw *dl_b);
1989
1842/* 1990/*
1843 * wake_up_new_task - wake up a newly created task for the first time. 1991 * wake_up_new_task - wake up a newly created task for the first time.
1844 * 1992 *
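to_ratio() above encodes runtime/period as a fixed-point fraction on a 1 << 20 scale, and dl_overflow() admits a -deadline task only while the summed fractions still fit within dl_b->bw * cpus. A worked example with made-up numbers, mirroring (not replacing) __dl_overflow(); struct dl_bw itself is introduced elsewhere in this series:

/*
 * Sketch only: a task asking for 10 ms of runtime every 100 ms contributes
 * div64_u64((10 * NSEC_PER_MSEC) << 20, 100 * NSEC_PER_MSEC) == 104857,
 * roughly 10% of the 1 << 20 scale.
 */
static bool example_dl_admit(struct dl_bw *dl_b, int cpus,
                             u64 old_bw, u64 new_bw)
{
        /* bw == -1 means "no limit"; otherwise capacity is bw per CPU. */
        return dl_b->bw == -1 ||
               dl_b->bw * cpus >= dl_b->total_bw - old_bw + new_bw;
}

With the default bandwidth limit inherited from sysctl_sched_rt_runtime/sysctl_sched_rt_period (95%), roughly nine such 10%-of-a-CPU tasks fit per CPU before dl_overflow() starts failing (returning -1).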
@@ -2003,6 +2151,9 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2003 if (unlikely(prev_state == TASK_DEAD)) { 2151 if (unlikely(prev_state == TASK_DEAD)) {
2004 task_numa_free(prev); 2152 task_numa_free(prev);
2005 2153
2154 if (prev->sched_class->task_dead)
2155 prev->sched_class->task_dead(prev);
2156
2006 /* 2157 /*
2007 * Remove function-return probe instances associated with this 2158 * Remove function-return probe instances associated with this
2008 * task and put them back on the free list. 2159 * task and put them back on the free list.
@@ -2296,7 +2447,7 @@ void scheduler_tick(void)
2296 2447
2297#ifdef CONFIG_SMP 2448#ifdef CONFIG_SMP
2298 rq->idle_balance = idle_cpu(cpu); 2449 rq->idle_balance = idle_cpu(cpu);
2299 trigger_load_balance(rq, cpu); 2450 trigger_load_balance(rq);
2300#endif 2451#endif
2301 rq_last_tick_reset(rq); 2452 rq_last_tick_reset(rq);
2302} 2453}
@@ -2325,7 +2476,7 @@ u64 scheduler_tick_max_deferment(void)
2325 if (time_before_eq(next, now)) 2476 if (time_before_eq(next, now))
2326 return 0; 2477 return 0;
2327 2478
2328 return jiffies_to_usecs(next - now) * NSEC_PER_USEC; 2479 return jiffies_to_nsecs(next - now);
2329} 2480}
2330#endif 2481#endif
2331 2482
@@ -2414,10 +2565,10 @@ static inline void schedule_debug(struct task_struct *prev)
2414{ 2565{
2415 /* 2566 /*
2416 * Test if we are atomic. Since do_exit() needs to call into 2567 * Test if we are atomic. Since do_exit() needs to call into
2417 * schedule() atomically, we ignore that path for now. 2568 * schedule() atomically, we ignore that path. Otherwise whine
2418 * Otherwise, whine if we are scheduling when we should not be. 2569 * if we are scheduling when we should not.
2419 */ 2570 */
2420 if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) 2571 if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD))
2421 __schedule_bug(prev); 2572 __schedule_bug(prev);
2422 rcu_sleep_check(); 2573 rcu_sleep_check();
2423 2574
@@ -2761,11 +2912,11 @@ EXPORT_SYMBOL(sleep_on_timeout);
2761 */ 2912 */
2762void rt_mutex_setprio(struct task_struct *p, int prio) 2913void rt_mutex_setprio(struct task_struct *p, int prio)
2763{ 2914{
2764 int oldprio, on_rq, running; 2915 int oldprio, on_rq, running, enqueue_flag = 0;
2765 struct rq *rq; 2916 struct rq *rq;
2766 const struct sched_class *prev_class; 2917 const struct sched_class *prev_class;
2767 2918
2768 BUG_ON(prio < 0 || prio > MAX_PRIO); 2919 BUG_ON(prio > MAX_PRIO);
2769 2920
2770 rq = __task_rq_lock(p); 2921 rq = __task_rq_lock(p);
2771 2922
@@ -2788,6 +2939,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
2788 } 2939 }
2789 2940
2790 trace_sched_pi_setprio(p, prio); 2941 trace_sched_pi_setprio(p, prio);
2942 p->pi_top_task = rt_mutex_get_top_task(p);
2791 oldprio = p->prio; 2943 oldprio = p->prio;
2792 prev_class = p->sched_class; 2944 prev_class = p->sched_class;
2793 on_rq = p->on_rq; 2945 on_rq = p->on_rq;
@@ -2797,23 +2949,49 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
2797 if (running) 2949 if (running)
2798 p->sched_class->put_prev_task(rq, p); 2950 p->sched_class->put_prev_task(rq, p);
2799 2951
2800 if (rt_prio(prio)) 2952 /*
2953 * Boosting conditions are:
2954 * 1. -rt task is running and holds mutex A
2955 * --> -dl task blocks on mutex A
2956 *
2957 * 2. -dl task is running and holds mutex A
2958 * --> -dl task blocks on mutex A and could preempt the
2959 * running task
2960 */
2961 if (dl_prio(prio)) {
2962 if (!dl_prio(p->normal_prio) || (p->pi_top_task &&
2963 dl_entity_preempt(&p->pi_top_task->dl, &p->dl))) {
2964 p->dl.dl_boosted = 1;
2965 p->dl.dl_throttled = 0;
2966 enqueue_flag = ENQUEUE_REPLENISH;
2967 } else
2968 p->dl.dl_boosted = 0;
2969 p->sched_class = &dl_sched_class;
2970 } else if (rt_prio(prio)) {
2971 if (dl_prio(oldprio))
2972 p->dl.dl_boosted = 0;
2973 if (oldprio < prio)
2974 enqueue_flag = ENQUEUE_HEAD;
2801 p->sched_class = &rt_sched_class; 2975 p->sched_class = &rt_sched_class;
2802 else 2976 } else {
2977 if (dl_prio(oldprio))
2978 p->dl.dl_boosted = 0;
2803 p->sched_class = &fair_sched_class; 2979 p->sched_class = &fair_sched_class;
2980 }
2804 2981
2805 p->prio = prio; 2982 p->prio = prio;
2806 2983
2807 if (running) 2984 if (running)
2808 p->sched_class->set_curr_task(rq); 2985 p->sched_class->set_curr_task(rq);
2809 if (on_rq) 2986 if (on_rq)
2810 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); 2987 enqueue_task(rq, p, enqueue_flag);
2811 2988
2812 check_class_changed(rq, p, prev_class, oldprio); 2989 check_class_changed(rq, p, prev_class, oldprio);
2813out_unlock: 2990out_unlock:
2814 __task_rq_unlock(rq); 2991 __task_rq_unlock(rq);
2815} 2992}
2816#endif 2993#endif
2994
2817void set_user_nice(struct task_struct *p, long nice) 2995void set_user_nice(struct task_struct *p, long nice)
2818{ 2996{
2819 int old_prio, delta, on_rq; 2997 int old_prio, delta, on_rq;
@@ -2831,9 +3009,9 @@ void set_user_nice(struct task_struct *p, long nice)
2831 * The RT priorities are set via sched_setscheduler(), but we still 3009 * The RT priorities are set via sched_setscheduler(), but we still
2832 * allow the 'normal' nice value to be set - but as expected 3010 * allow the 'normal' nice value to be set - but as expected
2833 * it won't have any effect on scheduling until the task is 3011 * it won't have any effect on scheduling until the task is
2834 * SCHED_FIFO/SCHED_RR: 3012 * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR:
2835 */ 3013 */
2836 if (task_has_rt_policy(p)) { 3014 if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
2837 p->static_prio = NICE_TO_PRIO(nice); 3015 p->static_prio = NICE_TO_PRIO(nice);
2838 goto out_unlock; 3016 goto out_unlock;
2839 } 3017 }
@@ -2988,22 +3166,95 @@ static struct task_struct *find_process_by_pid(pid_t pid)
2988 return pid ? find_task_by_vpid(pid) : current; 3166 return pid ? find_task_by_vpid(pid) : current;
2989} 3167}
2990 3168
2991/* Actually do priority change: must hold rq lock. */ 3169/*
3170 * This function initializes the sched_dl_entity of a task that is
3171 * becoming a SCHED_DEADLINE task.
3172 *
3173 * Only the static values are considered here, the actual runtime and the
3174 * absolute deadline will be properly calculated when the task is enqueued
3175 * for the first time with its new policy.
3176 */
2992static void 3177static void
2993__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) 3178__setparam_dl(struct task_struct *p, const struct sched_attr *attr)
2994{ 3179{
3180 struct sched_dl_entity *dl_se = &p->dl;
3181
3182 init_dl_task_timer(dl_se);
3183 dl_se->dl_runtime = attr->sched_runtime;
3184 dl_se->dl_deadline = attr->sched_deadline;
3185 dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
3186 dl_se->flags = attr->sched_flags;
3187 dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
3188 dl_se->dl_throttled = 0;
3189 dl_se->dl_new = 1;
3190}
3191
3192/* Actually do priority change: must hold pi & rq lock. */
3193static void __setscheduler(struct rq *rq, struct task_struct *p,
3194 const struct sched_attr *attr)
3195{
3196 int policy = attr->sched_policy;
3197
3198 if (policy == -1) /* setparam */
3199 policy = p->policy;
3200
2995 p->policy = policy; 3201 p->policy = policy;
2996 p->rt_priority = prio; 3202
3203 if (dl_policy(policy))
3204 __setparam_dl(p, attr);
3205 else if (fair_policy(policy))
3206 p->static_prio = NICE_TO_PRIO(attr->sched_nice);
3207
3208 /*
3209 * __sched_setscheduler() ensures attr->sched_priority == 0 when
3210 * !rt_policy. Always setting this ensures that things like
3211 * getparam()/getattr() don't report silly values for !rt tasks.
3212 */
3213 p->rt_priority = attr->sched_priority;
3214
2997 p->normal_prio = normal_prio(p); 3215 p->normal_prio = normal_prio(p);
2998 /* we are holding p->pi_lock already */
2999 p->prio = rt_mutex_getprio(p); 3216 p->prio = rt_mutex_getprio(p);
3000 if (rt_prio(p->prio)) 3217
3218 if (dl_prio(p->prio))
3219 p->sched_class = &dl_sched_class;
3220 else if (rt_prio(p->prio))
3001 p->sched_class = &rt_sched_class; 3221 p->sched_class = &rt_sched_class;
3002 else 3222 else
3003 p->sched_class = &fair_sched_class; 3223 p->sched_class = &fair_sched_class;
3224
3004 set_load_weight(p); 3225 set_load_weight(p);
3005} 3226}
3006 3227
3228static void
3229__getparam_dl(struct task_struct *p, struct sched_attr *attr)
3230{
3231 struct sched_dl_entity *dl_se = &p->dl;
3232
3233 attr->sched_priority = p->rt_priority;
3234 attr->sched_runtime = dl_se->dl_runtime;
3235 attr->sched_deadline = dl_se->dl_deadline;
3236 attr->sched_period = dl_se->dl_period;
3237 attr->sched_flags = dl_se->flags;
3238}
3239
3240/*
3241 * This function validates the new parameters of a -deadline task.
 3242 * We ask for the deadline to be non-zero and greater than or equal
 3243 * to the runtime, and for the period to be either zero or greater
 3244 * than or equal to the deadline. Furthermore, we have to be sure that
3245 * user parameters are above the internal resolution (1us); we
3246 * check sched_runtime only since it is always the smaller one.
3247 */
3248static bool
3249__checkparam_dl(const struct sched_attr *attr)
3250{
3251 return attr && attr->sched_deadline != 0 &&
3252 (attr->sched_period == 0 ||
3253 (s64)(attr->sched_period - attr->sched_deadline) >= 0) &&
3254 (s64)(attr->sched_deadline - attr->sched_runtime ) >= 0 &&
3255 attr->sched_runtime >= (2 << (DL_SCALE - 1));
3256}
3257
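
For reference, the same admission constraints can be checked in user space before issuing the syscall. A minimal sketch, assuming DL_SCALE is 10 so that the minimum runtime (2 << (DL_SCALE - 1)) comes out to 1024ns, matching the 1us resolution mentioned in the comment:

/* Illustrative sketch: user-space mirror of __checkparam_dl()
 * (DL_SCALE == 10 is an assumption here). */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define DL_SCALE 10	/* assumed kernel-internal resolution shift */

static bool dl_params_valid(uint64_t runtime, uint64_t deadline, uint64_t period)
{
	return deadline != 0 &&
	       (period == 0 || (int64_t)(period - deadline) >= 0) &&
	       (int64_t)(deadline - runtime) >= 0 &&
	       runtime >= (2ULL << (DL_SCALE - 1));	/* at least ~1us */
}

int main(void)
{
	/* 10ms runtime, 30ms deadline, 100ms period: admissible. */
	printf("%d\n", dl_params_valid(10000000ULL, 30000000ULL, 100000000ULL));
	/* runtime larger than deadline: rejected, as in __checkparam_dl(). */
	printf("%d\n", dl_params_valid(50000000ULL, 30000000ULL, 100000000ULL));
	return 0;
}
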
3007/* 3258/*
3008 * check the target process has a UID that matches the current process's 3259 * check the target process has a UID that matches the current process's
3009 */ 3260 */
@@ -3020,10 +3271,12 @@ static bool check_same_owner(struct task_struct *p)
3020 return match; 3271 return match;
3021} 3272}
3022 3273
3023static int __sched_setscheduler(struct task_struct *p, int policy, 3274static int __sched_setscheduler(struct task_struct *p,
3024 const struct sched_param *param, bool user) 3275 const struct sched_attr *attr,
3276 bool user)
3025{ 3277{
3026 int retval, oldprio, oldpolicy = -1, on_rq, running; 3278 int retval, oldprio, oldpolicy = -1, on_rq, running;
3279 int policy = attr->sched_policy;
3027 unsigned long flags; 3280 unsigned long flags;
3028 const struct sched_class *prev_class; 3281 const struct sched_class *prev_class;
3029 struct rq *rq; 3282 struct rq *rq;
@@ -3037,31 +3290,40 @@ recheck:
3037 reset_on_fork = p->sched_reset_on_fork; 3290 reset_on_fork = p->sched_reset_on_fork;
3038 policy = oldpolicy = p->policy; 3291 policy = oldpolicy = p->policy;
3039 } else { 3292 } else {
3040 reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); 3293 reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
3041 policy &= ~SCHED_RESET_ON_FORK;
3042 3294
3043 if (policy != SCHED_FIFO && policy != SCHED_RR && 3295 if (policy != SCHED_DEADLINE &&
3296 policy != SCHED_FIFO && policy != SCHED_RR &&
3044 policy != SCHED_NORMAL && policy != SCHED_BATCH && 3297 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
3045 policy != SCHED_IDLE) 3298 policy != SCHED_IDLE)
3046 return -EINVAL; 3299 return -EINVAL;
3047 } 3300 }
3048 3301
3302 if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK))
3303 return -EINVAL;
3304
3049 /* 3305 /*
3050 * Valid priorities for SCHED_FIFO and SCHED_RR are 3306 * Valid priorities for SCHED_FIFO and SCHED_RR are
3051 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, 3307 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
3052 * SCHED_BATCH and SCHED_IDLE is 0. 3308 * SCHED_BATCH and SCHED_IDLE is 0.
3053 */ 3309 */
3054 if (param->sched_priority < 0 || 3310 if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) ||
3055 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || 3311 (!p->mm && attr->sched_priority > MAX_RT_PRIO-1))
3056 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
3057 return -EINVAL; 3312 return -EINVAL;
3058 if (rt_policy(policy) != (param->sched_priority != 0)) 3313 if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
3314 (rt_policy(policy) != (attr->sched_priority != 0)))
3059 return -EINVAL; 3315 return -EINVAL;
3060 3316
3061 /* 3317 /*
3062 * Allow unprivileged RT tasks to decrease priority: 3318 * Allow unprivileged RT tasks to decrease priority:
3063 */ 3319 */
3064 if (user && !capable(CAP_SYS_NICE)) { 3320 if (user && !capable(CAP_SYS_NICE)) {
3321 if (fair_policy(policy)) {
3322 if (attr->sched_nice < TASK_NICE(p) &&
3323 !can_nice(p, attr->sched_nice))
3324 return -EPERM;
3325 }
3326
3065 if (rt_policy(policy)) { 3327 if (rt_policy(policy)) {
3066 unsigned long rlim_rtprio = 3328 unsigned long rlim_rtprio =
3067 task_rlimit(p, RLIMIT_RTPRIO); 3329 task_rlimit(p, RLIMIT_RTPRIO);
@@ -3071,11 +3333,20 @@ recheck:
3071 return -EPERM; 3333 return -EPERM;
3072 3334
3073 /* can't increase priority */ 3335 /* can't increase priority */
3074 if (param->sched_priority > p->rt_priority && 3336 if (attr->sched_priority > p->rt_priority &&
3075 param->sched_priority > rlim_rtprio) 3337 attr->sched_priority > rlim_rtprio)
3076 return -EPERM; 3338 return -EPERM;
3077 } 3339 }
3078 3340
3341 /*
3342 * Can't set/change SCHED_DEADLINE policy at all for now
3343 * (safest behavior); in the future we would like to allow
3344 * unprivileged DL tasks to increase their relative deadline
3345 * or reduce their runtime (both ways reducing utilization)
3346 */
3347 if (dl_policy(policy))
3348 return -EPERM;
3349
3079 /* 3350 /*
3080 * Treat SCHED_IDLE as nice 20. Only allow a switch to 3351 * Treat SCHED_IDLE as nice 20. Only allow a switch to
3081 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. 3352 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
@@ -3120,14 +3391,21 @@ recheck:
3120 /* 3391 /*
3121 * If not changing anything there's no need to proceed further: 3392 * If not changing anything there's no need to proceed further:
3122 */ 3393 */
3123 if (unlikely(policy == p->policy && (!rt_policy(policy) || 3394 if (unlikely(policy == p->policy)) {
3124 param->sched_priority == p->rt_priority))) { 3395 if (fair_policy(policy) && attr->sched_nice != TASK_NICE(p))
3396 goto change;
3397 if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
3398 goto change;
3399 if (dl_policy(policy))
3400 goto change;
3401
3125 task_rq_unlock(rq, p, &flags); 3402 task_rq_unlock(rq, p, &flags);
3126 return 0; 3403 return 0;
3127 } 3404 }
3405change:
3128 3406
3129#ifdef CONFIG_RT_GROUP_SCHED
3130 if (user) { 3407 if (user) {
3408#ifdef CONFIG_RT_GROUP_SCHED
3131 /* 3409 /*
3132 * Do not allow realtime tasks into groups that have no runtime 3410 * Do not allow realtime tasks into groups that have no runtime
3133 * assigned. 3411 * assigned.
@@ -3138,8 +3416,24 @@ recheck:
3138 task_rq_unlock(rq, p, &flags); 3416 task_rq_unlock(rq, p, &flags);
3139 return -EPERM; 3417 return -EPERM;
3140 } 3418 }
3141 }
3142#endif 3419#endif
3420#ifdef CONFIG_SMP
3421 if (dl_bandwidth_enabled() && dl_policy(policy)) {
3422 cpumask_t *span = rq->rd->span;
3423
3424 /*
3425 * Don't allow tasks with an affinity mask smaller than
3426 * the entire root_domain to become SCHED_DEADLINE. We
3427 * will also fail if there's no bandwidth available.
3428 */
3429 if (!cpumask_subset(span, &p->cpus_allowed) ||
3430 rq->rd->dl_bw.bw == 0) {
3431 task_rq_unlock(rq, p, &flags);
3432 return -EPERM;
3433 }
3434 }
3435#endif
3436 }
3143 3437
3144 /* recheck policy now with rq lock held */ 3438 /* recheck policy now with rq lock held */
3145 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 3439 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
@@ -3147,6 +3441,17 @@ recheck:
3147 task_rq_unlock(rq, p, &flags); 3441 task_rq_unlock(rq, p, &flags);
3148 goto recheck; 3442 goto recheck;
3149 } 3443 }
3444
3445 /*
3446 * If setscheduling to SCHED_DEADLINE (or changing the parameters
3447 * of a SCHED_DEADLINE task) we need to check if enough bandwidth
3448 * is available.
3449 */
3450 if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) {
3451 task_rq_unlock(rq, p, &flags);
3452 return -EBUSY;
3453 }
3454
3150 on_rq = p->on_rq; 3455 on_rq = p->on_rq;
3151 running = task_current(rq, p); 3456 running = task_current(rq, p);
3152 if (on_rq) 3457 if (on_rq)
@@ -3158,7 +3463,7 @@ recheck:
3158 3463
3159 oldprio = p->prio; 3464 oldprio = p->prio;
3160 prev_class = p->sched_class; 3465 prev_class = p->sched_class;
3161 __setscheduler(rq, p, policy, param->sched_priority); 3466 __setscheduler(rq, p, attr);
3162 3467
3163 if (running) 3468 if (running)
3164 p->sched_class->set_curr_task(rq); 3469 p->sched_class->set_curr_task(rq);
@@ -3173,6 +3478,26 @@ recheck:
3173 return 0; 3478 return 0;
3174} 3479}
3175 3480
3481static int _sched_setscheduler(struct task_struct *p, int policy,
3482 const struct sched_param *param, bool check)
3483{
3484 struct sched_attr attr = {
3485 .sched_policy = policy,
3486 .sched_priority = param->sched_priority,
3487 .sched_nice = PRIO_TO_NICE(p->static_prio),
3488 };
3489
3490 /*
3491 * Fixup the legacy SCHED_RESET_ON_FORK hack
3492 */
3493 if (policy & SCHED_RESET_ON_FORK) {
3494 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
3495 policy &= ~SCHED_RESET_ON_FORK;
3496 attr.sched_policy = policy;
3497 }
3498
3499 return __sched_setscheduler(p, &attr, check);
3500}
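
The wrapper above preserves the historical calling convention, where SCHED_RESET_ON_FORK is OR-ed into the policy word. A user-space caller on the legacy interface still looks roughly like this (illustrative sketch, not part of the patch; the flag value matches the uapi definition):

#include <sched.h>
#include <stdio.h>

#ifndef SCHED_RESET_ON_FORK
#define SCHED_RESET_ON_FORK 0x40000000	/* from the uapi sched.h */
#endif

int main(void)
{
	struct sched_param sp = { .sched_priority = 50 };

	/* Legacy path: the flag rides along in the policy argument. */
	if (sched_setscheduler(0, SCHED_FIFO | SCHED_RESET_ON_FORK, &sp))
		perror("sched_setscheduler");
	return 0;
}
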
3176/** 3501/**
3177 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. 3502 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
3178 * @p: the task in question. 3503 * @p: the task in question.
@@ -3186,10 +3511,16 @@ recheck:
3186int sched_setscheduler(struct task_struct *p, int policy, 3511int sched_setscheduler(struct task_struct *p, int policy,
3187 const struct sched_param *param) 3512 const struct sched_param *param)
3188{ 3513{
3189 return __sched_setscheduler(p, policy, param, true); 3514 return _sched_setscheduler(p, policy, param, true);
3190} 3515}
3191EXPORT_SYMBOL_GPL(sched_setscheduler); 3516EXPORT_SYMBOL_GPL(sched_setscheduler);
3192 3517
3518int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
3519{
3520 return __sched_setscheduler(p, attr, true);
3521}
3522EXPORT_SYMBOL_GPL(sched_setattr);
3523
3193/** 3524/**
3194 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. 3525 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
3195 * @p: the task in question. 3526 * @p: the task in question.
@@ -3206,7 +3537,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
3206int sched_setscheduler_nocheck(struct task_struct *p, int policy, 3537int sched_setscheduler_nocheck(struct task_struct *p, int policy,
3207 const struct sched_param *param) 3538 const struct sched_param *param)
3208{ 3539{
3209 return __sched_setscheduler(p, policy, param, false); 3540 return _sched_setscheduler(p, policy, param, false);
3210} 3541}
3211 3542
3212static int 3543static int
@@ -3231,6 +3562,79 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
3231 return retval; 3562 return retval;
3232} 3563}
3233 3564
3565/*
3566 * Mimics kernel/events/core.c perf_copy_attr().
3567 */
3568static int sched_copy_attr(struct sched_attr __user *uattr,
3569 struct sched_attr *attr)
3570{
3571 u32 size;
3572 int ret;
3573
3574 if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0))
3575 return -EFAULT;
3576
3577 /*
3578 * zero the full structure, so that a short copy will be nice.
3579 */
3580 memset(attr, 0, sizeof(*attr));
3581
3582 ret = get_user(size, &uattr->size);
3583 if (ret)
3584 return ret;
3585
3586 if (size > PAGE_SIZE) /* silly large */
3587 goto err_size;
3588
3589 if (!size) /* abi compat */
3590 size = SCHED_ATTR_SIZE_VER0;
3591
3592 if (size < SCHED_ATTR_SIZE_VER0)
3593 goto err_size;
3594
3595 /*
3596 * If we're handed a bigger struct than we know of,
3597 * ensure all the unknown bits are 0 - i.e. new
3598 * user-space does not rely on any kernel feature
 3599 * extensions we don't know about yet.
3600 */
3601 if (size > sizeof(*attr)) {
3602 unsigned char __user *addr;
3603 unsigned char __user *end;
3604 unsigned char val;
3605
3606 addr = (void __user *)uattr + sizeof(*attr);
3607 end = (void __user *)uattr + size;
3608
3609 for (; addr < end; addr++) {
3610 ret = get_user(val, addr);
3611 if (ret)
3612 return ret;
3613 if (val)
3614 goto err_size;
3615 }
3616 size = sizeof(*attr);
3617 }
3618
3619 ret = copy_from_user(attr, uattr, size);
3620 if (ret)
3621 return -EFAULT;
3622
3623 /*
3624 * XXX: do we want to be lenient like existing syscalls; or do we want
3625 * to be strict and return an error on out-of-bounds values?
3626 */
3627 attr->sched_nice = clamp(attr->sched_nice, -20, 19);
3628
3629out:
3630 return ret;
3631
3632err_size:
3633 put_user(sizeof(*attr), &uattr->size);
3634 ret = -E2BIG;
3635 goto out;
3636}
3637
3234/** 3638/**
3235 * sys_sched_setscheduler - set/change the scheduler policy and RT priority 3639 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
3236 * @pid: the pid in question. 3640 * @pid: the pid in question.
@@ -3262,6 +3666,34 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
3262} 3666}
3263 3667
3264/** 3668/**
3669 * sys_sched_setattr - same as above, but with extended sched_attr
3670 * @pid: the pid in question.
3671 * @uattr: structure containing the extended parameters.
3672 */
3673SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
3674 unsigned int, flags)
3675{
3676 struct sched_attr attr;
3677 struct task_struct *p;
3678 int retval;
3679
3680 if (!uattr || pid < 0 || flags)
3681 return -EINVAL;
3682
3683 if (sched_copy_attr(uattr, &attr))
3684 return -EFAULT;
3685
3686 rcu_read_lock();
3687 retval = -ESRCH;
3688 p = find_process_by_pid(pid);
3689 if (p != NULL)
3690 retval = sched_setattr(p, &attr);
3691 rcu_read_unlock();
3692
3693 return retval;
3694}
3695
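
A user-space caller of the new syscall might look as follows. This is a sketch only: the struct layout mirrors the fields consumed by sched_copy_attr() above, SCHED_DEADLINE is assumed to be policy 6, and the syscall-number fallback (314) is the x86_64 value; the authoritative definitions live in the kernel uapi headers.

/* Illustrative sketch: putting the calling thread into SCHED_DEADLINE. */
#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef __NR_sched_setattr
#define __NR_sched_setattr 314		/* x86_64; assumption, check your arch */
#endif
#ifndef SCHED_DEADLINE
#define SCHED_DEADLINE 6		/* assumption */
#endif

struct sched_attr {			/* mirrors the fields used by the patch */
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	uint64_t sched_runtime;		/* nanoseconds */
	uint64_t sched_deadline;
	uint64_t sched_period;
};

int main(void)
{
	struct sched_attr attr;

	memset(&attr, 0, sizeof(attr));	/* unknown/future fields must be zero */
	attr.size = sizeof(attr);
	attr.sched_policy   = SCHED_DEADLINE;
	attr.sched_runtime  =  10 * 1000 * 1000;	/* 10ms  */
	attr.sched_deadline =  30 * 1000 * 1000;	/* 30ms  */
	attr.sched_period   = 100 * 1000 * 1000;	/* 100ms */

	/* pid 0 selects the calling thread (see find_process_by_pid() above);
	 * needs CAP_SYS_NICE because of the unprivileged checks above. */
	if (syscall(__NR_sched_setattr, 0, &attr, 0))
		perror("sched_setattr");
	return 0;
}
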
3696/**
3265 * sys_sched_getscheduler - get the policy (scheduling class) of a thread 3697 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
3266 * @pid: the pid in question. 3698 * @pid: the pid in question.
3267 * 3699 *
@@ -3316,6 +3748,10 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
3316 if (retval) 3748 if (retval)
3317 goto out_unlock; 3749 goto out_unlock;
3318 3750
3751 if (task_has_dl_policy(p)) {
3752 retval = -EINVAL;
3753 goto out_unlock;
3754 }
3319 lp.sched_priority = p->rt_priority; 3755 lp.sched_priority = p->rt_priority;
3320 rcu_read_unlock(); 3756 rcu_read_unlock();
3321 3757
@@ -3331,6 +3767,96 @@ out_unlock:
3331 return retval; 3767 return retval;
3332} 3768}
3333 3769
3770static int sched_read_attr(struct sched_attr __user *uattr,
3771 struct sched_attr *attr,
3772 unsigned int usize)
3773{
3774 int ret;
3775
3776 if (!access_ok(VERIFY_WRITE, uattr, usize))
3777 return -EFAULT;
3778
3779 /*
3780 * If we're handed a smaller struct than we know of,
3781 * ensure all the unknown bits are 0 - i.e. old
 3782 * user-space does not get incomplete information.
3783 */
3784 if (usize < sizeof(*attr)) {
3785 unsigned char *addr;
3786 unsigned char *end;
3787
3788 addr = (void *)attr + usize;
3789 end = (void *)attr + sizeof(*attr);
3790
3791 for (; addr < end; addr++) {
3792 if (*addr)
3793 goto err_size;
3794 }
3795
3796 attr->size = usize;
3797 }
3798
3799 ret = copy_to_user(uattr, attr, attr->size);
3800 if (ret)
3801 return -EFAULT;
3802
3803out:
3804 return ret;
3805
3806err_size:
3807 ret = -E2BIG;
3808 goto out;
3809}
3810
3811/**
3812 * sys_sched_getattr - similar to sched_getparam, but with sched_attr
3813 * @pid: the pid in question.
3814 * @uattr: structure containing the extended parameters.
3815 * @size: sizeof(attr) for fwd/bwd comp.
3816 */
3817SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
3818 unsigned int, size, unsigned int, flags)
3819{
3820 struct sched_attr attr = {
3821 .size = sizeof(struct sched_attr),
3822 };
3823 struct task_struct *p;
3824 int retval;
3825
3826 if (!uattr || pid < 0 || size > PAGE_SIZE ||
3827 size < SCHED_ATTR_SIZE_VER0 || flags)
3828 return -EINVAL;
3829
3830 rcu_read_lock();
3831 p = find_process_by_pid(pid);
3832 retval = -ESRCH;
3833 if (!p)
3834 goto out_unlock;
3835
3836 retval = security_task_getscheduler(p);
3837 if (retval)
3838 goto out_unlock;
3839
3840 attr.sched_policy = p->policy;
3841 if (p->sched_reset_on_fork)
3842 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
3843 if (task_has_dl_policy(p))
3844 __getparam_dl(p, &attr);
3845 else if (task_has_rt_policy(p))
3846 attr.sched_priority = p->rt_priority;
3847 else
3848 attr.sched_nice = TASK_NICE(p);
3849
3850 rcu_read_unlock();
3851
3852 retval = sched_read_attr(uattr, &attr, size);
3853 return retval;
3854
3855out_unlock:
3856 rcu_read_unlock();
3857 return retval;
3858}
3859
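
The read side is symmetric. Continuing the user-space sketch from sys_sched_setattr above (same struct sched_attr layout; __NR_sched_getattr assumed available, 315 on x86_64), the size argument is the caller's sizeof(attr), which is what lets old binaries and new kernels, or vice versa, interoperate:

/* Illustrative fragment, reusing the struct sched_attr sketched earlier:
 * query the calling thread's parameters. */
struct sched_attr attr;

memset(&attr, 0, sizeof(attr));
if (syscall(__NR_sched_getattr, 0, &attr, sizeof(attr), 0))
	perror("sched_getattr");
else
	printf("policy=%u runtime=%llu deadline=%llu period=%llu\n",
	       attr.sched_policy,
	       (unsigned long long)attr.sched_runtime,
	       (unsigned long long)attr.sched_deadline,
	       (unsigned long long)attr.sched_period);
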
3334long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) 3860long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
3335{ 3861{
3336 cpumask_var_t cpus_allowed, new_mask; 3862 cpumask_var_t cpus_allowed, new_mask;
@@ -3375,8 +3901,26 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
3375 if (retval) 3901 if (retval)
3376 goto out_unlock; 3902 goto out_unlock;
3377 3903
3904
3378 cpuset_cpus_allowed(p, cpus_allowed); 3905 cpuset_cpus_allowed(p, cpus_allowed);
3379 cpumask_and(new_mask, in_mask, cpus_allowed); 3906 cpumask_and(new_mask, in_mask, cpus_allowed);
3907
3908 /*
3909 * Since bandwidth control happens on root_domain basis,
3910 * if admission test is enabled, we only admit -deadline
3911 * tasks allowed to run on all the CPUs in the task's
3912 * root_domain.
3913 */
3914#ifdef CONFIG_SMP
3915 if (task_has_dl_policy(p)) {
3916 const struct cpumask *span = task_rq(p)->rd->span;
3917
3918 if (dl_bandwidth_enabled() && !cpumask_subset(span, new_mask)) {
3919 retval = -EBUSY;
3920 goto out_unlock;
3921 }
3922 }
3923#endif
3380again: 3924again:
3381 retval = set_cpus_allowed_ptr(p, new_mask); 3925 retval = set_cpus_allowed_ptr(p, new_mask);
3382 3926
@@ -3653,7 +4197,7 @@ again:
3653 } 4197 }
3654 4198
3655 double_rq_lock(rq, p_rq); 4199 double_rq_lock(rq, p_rq);
3656 while (task_rq(p) != p_rq) { 4200 if (task_rq(p) != p_rq) {
3657 double_rq_unlock(rq, p_rq); 4201 double_rq_unlock(rq, p_rq);
3658 goto again; 4202 goto again;
3659 } 4203 }
@@ -3742,6 +4286,7 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
3742 case SCHED_RR: 4286 case SCHED_RR:
3743 ret = MAX_USER_RT_PRIO-1; 4287 ret = MAX_USER_RT_PRIO-1;
3744 break; 4288 break;
4289 case SCHED_DEADLINE:
3745 case SCHED_NORMAL: 4290 case SCHED_NORMAL:
3746 case SCHED_BATCH: 4291 case SCHED_BATCH:
3747 case SCHED_IDLE: 4292 case SCHED_IDLE:
@@ -3768,6 +4313,7 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
3768 case SCHED_RR: 4313 case SCHED_RR:
3769 ret = 1; 4314 ret = 1;
3770 break; 4315 break;
4316 case SCHED_DEADLINE:
3771 case SCHED_NORMAL: 4317 case SCHED_NORMAL:
3772 case SCHED_BATCH: 4318 case SCHED_BATCH:
3773 case SCHED_IDLE: 4319 case SCHED_IDLE:
@@ -3811,7 +4357,9 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
3811 goto out_unlock; 4357 goto out_unlock;
3812 4358
3813 rq = task_rq_lock(p, &flags); 4359 rq = task_rq_lock(p, &flags);
3814 time_slice = p->sched_class->get_rr_interval(rq, p); 4360 time_slice = 0;
4361 if (p->sched_class->get_rr_interval)
4362 time_slice = p->sched_class->get_rr_interval(rq, p);
3815 task_rq_unlock(rq, p, &flags); 4363 task_rq_unlock(rq, p, &flags);
3816 4364
3817 rcu_read_unlock(); 4365 rcu_read_unlock();
@@ -4090,6 +4638,7 @@ int migrate_task_to(struct task_struct *p, int target_cpu)
4090 4638
4091 /* TODO: This is not properly updating schedstats */ 4639 /* TODO: This is not properly updating schedstats */
4092 4640
4641 trace_sched_move_numa(p, curr_cpu, target_cpu);
4093 return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg); 4642 return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
4094} 4643}
4095 4644
@@ -4514,13 +5063,31 @@ static int sched_cpu_active(struct notifier_block *nfb,
4514static int sched_cpu_inactive(struct notifier_block *nfb, 5063static int sched_cpu_inactive(struct notifier_block *nfb,
4515 unsigned long action, void *hcpu) 5064 unsigned long action, void *hcpu)
4516{ 5065{
5066 unsigned long flags;
5067 long cpu = (long)hcpu;
5068
4517 switch (action & ~CPU_TASKS_FROZEN) { 5069 switch (action & ~CPU_TASKS_FROZEN) {
4518 case CPU_DOWN_PREPARE: 5070 case CPU_DOWN_PREPARE:
4519 set_cpu_active((long)hcpu, false); 5071 set_cpu_active(cpu, false);
5072
5073 /* explicitly allow suspend */
5074 if (!(action & CPU_TASKS_FROZEN)) {
5075 struct dl_bw *dl_b = dl_bw_of(cpu);
5076 bool overflow;
5077 int cpus;
5078
5079 raw_spin_lock_irqsave(&dl_b->lock, flags);
5080 cpus = dl_bw_cpus(cpu);
5081 overflow = __dl_overflow(dl_b, cpus, 0, 0);
5082 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
5083
5084 if (overflow)
5085 return notifier_from_errno(-EBUSY);
5086 }
4520 return NOTIFY_OK; 5087 return NOTIFY_OK;
4521 default:
4522 return NOTIFY_DONE;
4523 } 5088 }
5089
5090 return NOTIFY_DONE;
4524} 5091}
4525 5092
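
__dl_overflow() is defined elsewhere in the series, so its exact semantics are not visible in this hunk. Assuming dl_b->bw is the per-CPU bandwidth cap in the usual <<20 fixed point and dl_b->total_bw the bandwidth already admitted, the hotplug veto above amounts to a capacity check along these lines (sketch under that assumption, not the actual helper):

/* Illustrative sketch (assumed reading of __dl_overflow()): refuse the
 * hot-unplug if the remaining CPUs cannot carry the admitted -dl bandwidth. */
#include <stdbool.h>
#include <stdint.h>

static bool dl_would_overflow(int64_t per_cpu_bw, uint64_t total_bw, int cpus)
{
	if (per_cpu_bw == -1)		/* RUNTIME_INF: admission control off */
		return false;
	return (uint64_t)per_cpu_bw * cpus < total_bw;
}
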
4526static int __init migration_init(void) 5093static int __init migration_init(void)
@@ -4739,6 +5306,8 @@ static void free_rootdomain(struct rcu_head *rcu)
4739 struct root_domain *rd = container_of(rcu, struct root_domain, rcu); 5306 struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
4740 5307
4741 cpupri_cleanup(&rd->cpupri); 5308 cpupri_cleanup(&rd->cpupri);
5309 cpudl_cleanup(&rd->cpudl);
5310 free_cpumask_var(rd->dlo_mask);
4742 free_cpumask_var(rd->rto_mask); 5311 free_cpumask_var(rd->rto_mask);
4743 free_cpumask_var(rd->online); 5312 free_cpumask_var(rd->online);
4744 free_cpumask_var(rd->span); 5313 free_cpumask_var(rd->span);
@@ -4790,8 +5359,14 @@ static int init_rootdomain(struct root_domain *rd)
4790 goto out; 5359 goto out;
4791 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) 5360 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
4792 goto free_span; 5361 goto free_span;
4793 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) 5362 if (!alloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))
4794 goto free_online; 5363 goto free_online;
5364 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
5365 goto free_dlo_mask;
5366
5367 init_dl_bw(&rd->dl_bw);
5368 if (cpudl_init(&rd->cpudl) != 0)
5369 goto free_dlo_mask;
4795 5370
4796 if (cpupri_init(&rd->cpupri) != 0) 5371 if (cpupri_init(&rd->cpupri) != 0)
4797 goto free_rto_mask; 5372 goto free_rto_mask;
@@ -4799,6 +5374,8 @@ static int init_rootdomain(struct root_domain *rd)
4799 5374
4800free_rto_mask: 5375free_rto_mask:
4801 free_cpumask_var(rd->rto_mask); 5376 free_cpumask_var(rd->rto_mask);
5377free_dlo_mask:
5378 free_cpumask_var(rd->dlo_mask);
4802free_online: 5379free_online:
4803 free_cpumask_var(rd->online); 5380 free_cpumask_var(rd->online);
4804free_span: 5381free_span:
@@ -6150,6 +6727,7 @@ void __init sched_init_smp(void)
6150 free_cpumask_var(non_isolated_cpus); 6727 free_cpumask_var(non_isolated_cpus);
6151 6728
6152 init_sched_rt_class(); 6729 init_sched_rt_class();
6730 init_sched_dl_class();
6153} 6731}
6154#else 6732#else
6155void __init sched_init_smp(void) 6733void __init sched_init_smp(void)
@@ -6219,13 +6797,15 @@ void __init sched_init(void)
6219#endif /* CONFIG_CPUMASK_OFFSTACK */ 6797#endif /* CONFIG_CPUMASK_OFFSTACK */
6220 } 6798 }
6221 6799
6800 init_rt_bandwidth(&def_rt_bandwidth,
6801 global_rt_period(), global_rt_runtime());
6802 init_dl_bandwidth(&def_dl_bandwidth,
6803 global_rt_period(), global_rt_runtime());
6804
6222#ifdef CONFIG_SMP 6805#ifdef CONFIG_SMP
6223 init_defrootdomain(); 6806 init_defrootdomain();
6224#endif 6807#endif
6225 6808
6226 init_rt_bandwidth(&def_rt_bandwidth,
6227 global_rt_period(), global_rt_runtime());
6228
6229#ifdef CONFIG_RT_GROUP_SCHED 6809#ifdef CONFIG_RT_GROUP_SCHED
6230 init_rt_bandwidth(&root_task_group.rt_bandwidth, 6810 init_rt_bandwidth(&root_task_group.rt_bandwidth,
6231 global_rt_period(), global_rt_runtime()); 6811 global_rt_period(), global_rt_runtime());
@@ -6249,6 +6829,7 @@ void __init sched_init(void)
6249 rq->calc_load_update = jiffies + LOAD_FREQ; 6829 rq->calc_load_update = jiffies + LOAD_FREQ;
6250 init_cfs_rq(&rq->cfs); 6830 init_cfs_rq(&rq->cfs);
6251 init_rt_rq(&rq->rt, rq); 6831 init_rt_rq(&rq->rt, rq);
6832 init_dl_rq(&rq->dl, rq);
6252#ifdef CONFIG_FAIR_GROUP_SCHED 6833#ifdef CONFIG_FAIR_GROUP_SCHED
6253 root_task_group.shares = ROOT_TASK_GROUP_LOAD; 6834 root_task_group.shares = ROOT_TASK_GROUP_LOAD;
6254 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 6835 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
@@ -6320,10 +6901,6 @@ void __init sched_init(void)
6320 INIT_HLIST_HEAD(&init_task.preempt_notifiers); 6901 INIT_HLIST_HEAD(&init_task.preempt_notifiers);
6321#endif 6902#endif
6322 6903
6323#ifdef CONFIG_RT_MUTEXES
6324 plist_head_init(&init_task.pi_waiters);
6325#endif
6326
6327 /* 6904 /*
6328 * The boot idle thread does lazy MMU switching as well: 6905 * The boot idle thread does lazy MMU switching as well:
6329 */ 6906 */
@@ -6397,13 +6974,16 @@ EXPORT_SYMBOL(__might_sleep);
6397static void normalize_task(struct rq *rq, struct task_struct *p) 6974static void normalize_task(struct rq *rq, struct task_struct *p)
6398{ 6975{
6399 const struct sched_class *prev_class = p->sched_class; 6976 const struct sched_class *prev_class = p->sched_class;
6977 struct sched_attr attr = {
6978 .sched_policy = SCHED_NORMAL,
6979 };
6400 int old_prio = p->prio; 6980 int old_prio = p->prio;
6401 int on_rq; 6981 int on_rq;
6402 6982
6403 on_rq = p->on_rq; 6983 on_rq = p->on_rq;
6404 if (on_rq) 6984 if (on_rq)
6405 dequeue_task(rq, p, 0); 6985 dequeue_task(rq, p, 0);
6406 __setscheduler(rq, p, SCHED_NORMAL, 0); 6986 __setscheduler(rq, p, &attr);
6407 if (on_rq) { 6987 if (on_rq) {
6408 enqueue_task(rq, p, 0); 6988 enqueue_task(rq, p, 0);
6409 resched_task(rq->curr); 6989 resched_task(rq->curr);
@@ -6433,7 +7013,7 @@ void normalize_rt_tasks(void)
6433 p->se.statistics.block_start = 0; 7013 p->se.statistics.block_start = 0;
6434#endif 7014#endif
6435 7015
6436 if (!rt_task(p)) { 7016 if (!dl_task(p) && !rt_task(p)) {
6437 /* 7017 /*
6438 * Renice negative nice level userspace 7018 * Renice negative nice level userspace
6439 * tasks back to 0: 7019 * tasks back to 0:
@@ -6628,16 +7208,6 @@ void sched_move_task(struct task_struct *tsk)
6628} 7208}
6629#endif /* CONFIG_CGROUP_SCHED */ 7209#endif /* CONFIG_CGROUP_SCHED */
6630 7210
6631#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
6632static unsigned long to_ratio(u64 period, u64 runtime)
6633{
6634 if (runtime == RUNTIME_INF)
6635 return 1ULL << 20;
6636
6637 return div64_u64(runtime << 20, period);
6638}
6639#endif
6640
6641#ifdef CONFIG_RT_GROUP_SCHED 7211#ifdef CONFIG_RT_GROUP_SCHED
6642/* 7212/*
6643 * Ensure that the real time constraints are schedulable. 7213 * Ensure that the real time constraints are schedulable.
@@ -6811,24 +7381,13 @@ static long sched_group_rt_period(struct task_group *tg)
6811 do_div(rt_period_us, NSEC_PER_USEC); 7381 do_div(rt_period_us, NSEC_PER_USEC);
6812 return rt_period_us; 7382 return rt_period_us;
6813} 7383}
7384#endif /* CONFIG_RT_GROUP_SCHED */
6814 7385
7386#ifdef CONFIG_RT_GROUP_SCHED
6815static int sched_rt_global_constraints(void) 7387static int sched_rt_global_constraints(void)
6816{ 7388{
6817 u64 runtime, period;
6818 int ret = 0; 7389 int ret = 0;
6819 7390
6820 if (sysctl_sched_rt_period <= 0)
6821 return -EINVAL;
6822
6823 runtime = global_rt_runtime();
6824 period = global_rt_period();
6825
6826 /*
6827 * Sanity check on the sysctl variables.
6828 */
6829 if (runtime > period && runtime != RUNTIME_INF)
6830 return -EINVAL;
6831
6832 mutex_lock(&rt_constraints_mutex); 7391 mutex_lock(&rt_constraints_mutex);
6833 read_lock(&tasklist_lock); 7392 read_lock(&tasklist_lock);
6834 ret = __rt_schedulable(NULL, 0, 0); 7393 ret = __rt_schedulable(NULL, 0, 0);
@@ -6851,17 +7410,7 @@ static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
6851static int sched_rt_global_constraints(void) 7410static int sched_rt_global_constraints(void)
6852{ 7411{
6853 unsigned long flags; 7412 unsigned long flags;
6854 int i; 7413 int i, ret = 0;
6855
6856 if (sysctl_sched_rt_period <= 0)
6857 return -EINVAL;
6858
6859 /*
6860 * There's always some RT tasks in the root group
6861 * -- migration, kstopmachine etc..
6862 */
6863 if (sysctl_sched_rt_runtime == 0)
6864 return -EBUSY;
6865 7414
6866 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); 7415 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
6867 for_each_possible_cpu(i) { 7416 for_each_possible_cpu(i) {
@@ -6873,36 +7422,91 @@ static int sched_rt_global_constraints(void)
6873 } 7422 }
6874 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); 7423 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
6875 7424
6876 return 0; 7425 return ret;
6877} 7426}
6878#endif /* CONFIG_RT_GROUP_SCHED */ 7427#endif /* CONFIG_RT_GROUP_SCHED */
6879 7428
6880int sched_rr_handler(struct ctl_table *table, int write, 7429static int sched_dl_global_constraints(void)
6881 void __user *buffer, size_t *lenp,
6882 loff_t *ppos)
6883{ 7430{
6884 int ret; 7431 u64 runtime = global_rt_runtime();
6885 static DEFINE_MUTEX(mutex); 7432 u64 period = global_rt_period();
7433 u64 new_bw = to_ratio(period, runtime);
7434 int cpu, ret = 0;
7435 unsigned long flags;
6886 7436
6887 mutex_lock(&mutex); 7437 /*
6888 ret = proc_dointvec(table, write, buffer, lenp, ppos); 7438 * Here we want to check the bandwidth not being set to some
6889 /* make sure that internally we keep jiffies */ 7439 * value smaller than the currently allocated bandwidth in
6890 /* also, writing zero resets timeslice to default */ 7440 * any of the root_domains.
6891 if (!ret && write) { 7441 *
6892 sched_rr_timeslice = sched_rr_timeslice <= 0 ? 7442 * FIXME: Cycling on all the CPUs is overkill, but simpler than
6893 RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice); 7443 * cycling on root_domains... Discussion on different/better
7444 * solutions is welcome!
7445 */
7446 for_each_possible_cpu(cpu) {
7447 struct dl_bw *dl_b = dl_bw_of(cpu);
7448
7449 raw_spin_lock_irqsave(&dl_b->lock, flags);
7450 if (new_bw < dl_b->total_bw)
7451 ret = -EBUSY;
7452 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
7453
7454 if (ret)
7455 break;
6894 } 7456 }
6895 mutex_unlock(&mutex); 7457
6896 return ret; 7458 return ret;
6897} 7459}
6898 7460
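
The bandwidth figures compared here come from to_ratio(), whose old core.c definition is visible in the removal hunk further up (1 << 20 for RUNTIME_INF, otherwise runtime << 20 / period). A stand-alone sketch of that fixed-point accounting, with illustrative numbers:

/* Illustrative sketch of the <<20 fixed-point bandwidth accounting
 * (same formula as the to_ratio() helper moved out of this file). */
#include <stdint.h>
#include <stdio.h>

static uint64_t to_ratio(uint64_t period, uint64_t runtime)
{
	if (runtime == (uint64_t)-1)		/* RUNTIME_INF */
		return 1ULL << 20;
	return (runtime << 20) / period;
}

int main(void)
{
	/* Default sysctls: 950000us of runtime per 1000000us period (in ns). */
	uint64_t cap = to_ratio(1000000000ULL, 950000000ULL);

	/* One task at 10ms/100ms consumes ~0.1 of a CPU in the same units. */
	uint64_t task_bw = to_ratio(100000000ULL, 10000000ULL);

	printf("cap=%llu task=%llu -> %s\n",
	       (unsigned long long)cap, (unsigned long long)task_bw,
	       task_bw <= cap ? "admitted" : "rejected (-EBUSY)");
	return 0;
}
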
7461static void sched_dl_do_global(void)
7462{
7463 u64 new_bw = -1;
7464 int cpu;
7465 unsigned long flags;
7466
7467 def_dl_bandwidth.dl_period = global_rt_period();
7468 def_dl_bandwidth.dl_runtime = global_rt_runtime();
7469
7470 if (global_rt_runtime() != RUNTIME_INF)
7471 new_bw = to_ratio(global_rt_period(), global_rt_runtime());
7472
7473 /*
7474 * FIXME: As above...
7475 */
7476 for_each_possible_cpu(cpu) {
7477 struct dl_bw *dl_b = dl_bw_of(cpu);
7478
7479 raw_spin_lock_irqsave(&dl_b->lock, flags);
7480 dl_b->bw = new_bw;
7481 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
7482 }
7483}
7484
7485static int sched_rt_global_validate(void)
7486{
7487 if (sysctl_sched_rt_period <= 0)
7488 return -EINVAL;
7489
7490 if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
7491 (sysctl_sched_rt_runtime > sysctl_sched_rt_period))
7492 return -EINVAL;
7493
7494 return 0;
7495}
7496
7497static void sched_rt_do_global(void)
7498{
7499 def_rt_bandwidth.rt_runtime = global_rt_runtime();
7500 def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
7501}
7502
6899int sched_rt_handler(struct ctl_table *table, int write, 7503int sched_rt_handler(struct ctl_table *table, int write,
6900 void __user *buffer, size_t *lenp, 7504 void __user *buffer, size_t *lenp,
6901 loff_t *ppos) 7505 loff_t *ppos)
6902{ 7506{
6903 int ret;
6904 int old_period, old_runtime; 7507 int old_period, old_runtime;
6905 static DEFINE_MUTEX(mutex); 7508 static DEFINE_MUTEX(mutex);
7509 int ret;
6906 7510
6907 mutex_lock(&mutex); 7511 mutex_lock(&mutex);
6908 old_period = sysctl_sched_rt_period; 7512 old_period = sysctl_sched_rt_period;
@@ -6911,21 +7515,50 @@ int sched_rt_handler(struct ctl_table *table, int write,
6911 ret = proc_dointvec(table, write, buffer, lenp, ppos); 7515 ret = proc_dointvec(table, write, buffer, lenp, ppos);
6912 7516
6913 if (!ret && write) { 7517 if (!ret && write) {
7518 ret = sched_rt_global_validate();
7519 if (ret)
7520 goto undo;
7521
6914 ret = sched_rt_global_constraints(); 7522 ret = sched_rt_global_constraints();
6915 if (ret) { 7523 if (ret)
6916 sysctl_sched_rt_period = old_period; 7524 goto undo;
6917 sysctl_sched_rt_runtime = old_runtime; 7525
6918 } else { 7526 ret = sched_dl_global_constraints();
6919 def_rt_bandwidth.rt_runtime = global_rt_runtime(); 7527 if (ret)
6920 def_rt_bandwidth.rt_period = 7528 goto undo;
6921 ns_to_ktime(global_rt_period()); 7529
6922 } 7530 sched_rt_do_global();
7531 sched_dl_do_global();
7532 }
7533 if (0) {
7534undo:
7535 sysctl_sched_rt_period = old_period;
7536 sysctl_sched_rt_runtime = old_runtime;
6923 } 7537 }
6924 mutex_unlock(&mutex); 7538 mutex_unlock(&mutex);
6925 7539
6926 return ret; 7540 return ret;
6927} 7541}
6928 7542
7543int sched_rr_handler(struct ctl_table *table, int write,
7544 void __user *buffer, size_t *lenp,
7545 loff_t *ppos)
7546{
7547 int ret;
7548 static DEFINE_MUTEX(mutex);
7549
7550 mutex_lock(&mutex);
7551 ret = proc_dointvec(table, write, buffer, lenp, ppos);
7552 /* make sure that internally we keep jiffies */
7553 /* also, writing zero resets timeslice to default */
7554 if (!ret && write) {
7555 sched_rr_timeslice = sched_rr_timeslice <= 0 ?
7556 RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice);
7557 }
7558 mutex_unlock(&mutex);
7559 return ret;
7560}
7561
6929#ifdef CONFIG_CGROUP_SCHED 7562#ifdef CONFIG_CGROUP_SCHED
6930 7563
6931static inline struct task_group *css_tg(struct cgroup_subsys_state *css) 7564static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
@@ -7258,15 +7891,14 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
7258 return ret; 7891 return ret;
7259} 7892}
7260 7893
7261static int cpu_stats_show(struct cgroup_subsys_state *css, struct cftype *cft, 7894static int cpu_stats_show(struct seq_file *sf, void *v)
7262 struct cgroup_map_cb *cb)
7263{ 7895{
7264 struct task_group *tg = css_tg(css); 7896 struct task_group *tg = css_tg(seq_css(sf));
7265 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 7897 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7266 7898
7267 cb->fill(cb, "nr_periods", cfs_b->nr_periods); 7899 seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods);
7268 cb->fill(cb, "nr_throttled", cfs_b->nr_throttled); 7900 seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled);
7269 cb->fill(cb, "throttled_time", cfs_b->throttled_time); 7901 seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);
7270 7902
7271 return 0; 7903 return 0;
7272} 7904}
@@ -7320,7 +7952,7 @@ static struct cftype cpu_files[] = {
7320 }, 7952 },
7321 { 7953 {
7322 .name = "stat", 7954 .name = "stat",
7323 .read_map = cpu_stats_show, 7955 .seq_show = cpu_stats_show,
7324 }, 7956 },
7325#endif 7957#endif
7326#ifdef CONFIG_RT_GROUP_SCHED 7958#ifdef CONFIG_RT_GROUP_SCHED
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index f64722ff0299..622e0818f905 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -163,10 +163,9 @@ out:
163 return err; 163 return err;
164} 164}
165 165
166static int cpuacct_percpu_seq_read(struct cgroup_subsys_state *css, 166static int cpuacct_percpu_seq_show(struct seq_file *m, void *V)
167 struct cftype *cft, struct seq_file *m)
168{ 167{
169 struct cpuacct *ca = css_ca(css); 168 struct cpuacct *ca = css_ca(seq_css(m));
170 u64 percpu; 169 u64 percpu;
171 int i; 170 int i;
172 171
@@ -183,10 +182,9 @@ static const char * const cpuacct_stat_desc[] = {
183 [CPUACCT_STAT_SYSTEM] = "system", 182 [CPUACCT_STAT_SYSTEM] = "system",
184}; 183};
185 184
186static int cpuacct_stats_show(struct cgroup_subsys_state *css, 185static int cpuacct_stats_show(struct seq_file *sf, void *v)
187 struct cftype *cft, struct cgroup_map_cb *cb)
188{ 186{
189 struct cpuacct *ca = css_ca(css); 187 struct cpuacct *ca = css_ca(seq_css(sf));
190 int cpu; 188 int cpu;
191 s64 val = 0; 189 s64 val = 0;
192 190
@@ -196,7 +194,7 @@ static int cpuacct_stats_show(struct cgroup_subsys_state *css,
196 val += kcpustat->cpustat[CPUTIME_NICE]; 194 val += kcpustat->cpustat[CPUTIME_NICE];
197 } 195 }
198 val = cputime64_to_clock_t(val); 196 val = cputime64_to_clock_t(val);
199 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val); 197 seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_USER], val);
200 198
201 val = 0; 199 val = 0;
202 for_each_online_cpu(cpu) { 200 for_each_online_cpu(cpu) {
@@ -207,7 +205,7 @@ static int cpuacct_stats_show(struct cgroup_subsys_state *css,
207 } 205 }
208 206
209 val = cputime64_to_clock_t(val); 207 val = cputime64_to_clock_t(val);
210 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); 208 seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
211 209
212 return 0; 210 return 0;
213} 211}
@@ -220,11 +218,11 @@ static struct cftype files[] = {
220 }, 218 },
221 { 219 {
222 .name = "usage_percpu", 220 .name = "usage_percpu",
223 .read_seq_string = cpuacct_percpu_seq_read, 221 .seq_show = cpuacct_percpu_seq_show,
224 }, 222 },
225 { 223 {
226 .name = "stat", 224 .name = "stat",
227 .read_map = cpuacct_stats_show, 225 .seq_show = cpuacct_stats_show,
228 }, 226 },
229 { } /* terminate */ 227 { } /* terminate */
230}; 228};
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
new file mode 100644
index 000000000000..5b9bb42b2d47
--- /dev/null
+++ b/kernel/sched/cpudeadline.c
@@ -0,0 +1,216 @@
1/*
2 * kernel/sched/cpudl.c
3 *
4 * Global CPU deadline management
5 *
6 * Author: Juri Lelli <j.lelli@sssup.it>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; version 2
11 * of the License.
12 */
13
14#include <linux/gfp.h>
15#include <linux/kernel.h>
16#include "cpudeadline.h"
17
18static inline int parent(int i)
19{
20 return (i - 1) >> 1;
21}
22
23static inline int left_child(int i)
24{
25 return (i << 1) + 1;
26}
27
28static inline int right_child(int i)
29{
30 return (i << 1) + 2;
31}
32
33static inline int dl_time_before(u64 a, u64 b)
34{
35 return (s64)(a - b) < 0;
36}
37
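
The helpers above are a classic array-backed binary max-heap plus a wrap-safe 64-bit time comparison. A stand-alone sketch of just those two pieces (illustration, not part of the patch):

/* Illustrative sketch: array-heap indexing and wrap-safe u64 comparison. */
#include <assert.h>
#include <stdint.h>

static int parent(int i)      { return (i - 1) >> 1; }
static int left_child(int i)  { return (i << 1) + 1; }
static int right_child(int i) { return (i << 1) + 2; }

/* "a before b" even if the clock has wrapped around UINT64_MAX. */
static int dl_time_before(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) < 0;
}

int main(void)
{
	assert(parent(left_child(5)) == 5 && parent(right_child(5)) == 5);
	assert(dl_time_before(10, 20));
	/* Close to the wrap point, a plain '<' would give the wrong answer. */
	assert(dl_time_before(UINT64_MAX - 5, 4));
	return 0;
}
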
38static void cpudl_exchange(struct cpudl *cp, int a, int b)
39{
40 int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu;
41
42 swap(cp->elements[a], cp->elements[b]);
43 swap(cp->cpu_to_idx[cpu_a], cp->cpu_to_idx[cpu_b]);
44}
45
46static void cpudl_heapify(struct cpudl *cp, int idx)
47{
48 int l, r, largest;
49
50 /* adapted from lib/prio_heap.c */
 51	while (1) {
52 l = left_child(idx);
53 r = right_child(idx);
54 largest = idx;
55
56 if ((l < cp->size) && dl_time_before(cp->elements[idx].dl,
57 cp->elements[l].dl))
58 largest = l;
59 if ((r < cp->size) && dl_time_before(cp->elements[largest].dl,
60 cp->elements[r].dl))
61 largest = r;
62 if (largest == idx)
63 break;
64
65 /* Push idx down the heap one level and bump one up */
66 cpudl_exchange(cp, largest, idx);
67 idx = largest;
68 }
69}
70
71static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl)
72{
73 WARN_ON(idx == IDX_INVALID || !cpu_present(idx));
74
75 if (dl_time_before(new_dl, cp->elements[idx].dl)) {
76 cp->elements[idx].dl = new_dl;
77 cpudl_heapify(cp, idx);
78 } else {
79 cp->elements[idx].dl = new_dl;
80 while (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl,
81 cp->elements[idx].dl)) {
82 cpudl_exchange(cp, idx, parent(idx));
83 idx = parent(idx);
84 }
85 }
86}
87
88static inline int cpudl_maximum(struct cpudl *cp)
89{
90 return cp->elements[0].cpu;
91}
92
93/*
94 * cpudl_find - find the best (later-dl) CPU in the system
95 * @cp: the cpudl max-heap context
96 * @p: the task
97 * @later_mask: a mask to fill in with the selected CPUs (or NULL)
98 *
99 * Returns: int - best CPU (heap maximum if suitable)
100 */
101int cpudl_find(struct cpudl *cp, struct task_struct *p,
102 struct cpumask *later_mask)
103{
104 int best_cpu = -1;
105 const struct sched_dl_entity *dl_se = &p->dl;
106
107 if (later_mask && cpumask_and(later_mask, cp->free_cpus,
108 &p->cpus_allowed) && cpumask_and(later_mask,
109 later_mask, cpu_active_mask)) {
110 best_cpu = cpumask_any(later_mask);
111 goto out;
112 } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) &&
113 dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
114 best_cpu = cpudl_maximum(cp);
115 if (later_mask)
116 cpumask_set_cpu(best_cpu, later_mask);
117 }
118
119out:
120 WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
121
122 return best_cpu;
123}
124
125/*
126 * cpudl_set - update the cpudl max-heap
127 * @cp: the cpudl max-heap context
128 * @cpu: the target cpu
129 * @dl: the new earliest deadline for this cpu
130 *
131 * Notes: assumes cpu_rq(cpu)->lock is locked
132 *
133 * Returns: (void)
134 */
135void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
136{
137 int old_idx, new_cpu;
138 unsigned long flags;
139
140 WARN_ON(!cpu_present(cpu));
141
142 raw_spin_lock_irqsave(&cp->lock, flags);
143 old_idx = cp->cpu_to_idx[cpu];
144 if (!is_valid) {
145 /* remove item */
146 if (old_idx == IDX_INVALID) {
147 /*
148 * Nothing to remove if old_idx was invalid.
149 * This could happen if a rq_offline_dl is
150 * called for a CPU without -dl tasks running.
151 */
152 goto out;
153 }
154 new_cpu = cp->elements[cp->size - 1].cpu;
155 cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl;
156 cp->elements[old_idx].cpu = new_cpu;
157 cp->size--;
158 cp->cpu_to_idx[new_cpu] = old_idx;
159 cp->cpu_to_idx[cpu] = IDX_INVALID;
160 while (old_idx > 0 && dl_time_before(
161 cp->elements[parent(old_idx)].dl,
162 cp->elements[old_idx].dl)) {
163 cpudl_exchange(cp, old_idx, parent(old_idx));
164 old_idx = parent(old_idx);
165 }
166 cpumask_set_cpu(cpu, cp->free_cpus);
167 cpudl_heapify(cp, old_idx);
168
169 goto out;
170 }
171
172 if (old_idx == IDX_INVALID) {
173 cp->size++;
174 cp->elements[cp->size - 1].dl = 0;
175 cp->elements[cp->size - 1].cpu = cpu;
176 cp->cpu_to_idx[cpu] = cp->size - 1;
177 cpudl_change_key(cp, cp->size - 1, dl);
178 cpumask_clear_cpu(cpu, cp->free_cpus);
179 } else {
180 cpudl_change_key(cp, old_idx, dl);
181 }
182
183out:
184 raw_spin_unlock_irqrestore(&cp->lock, flags);
185}
186
187/*
188 * cpudl_init - initialize the cpudl structure
189 * @cp: the cpudl max-heap context
190 */
191int cpudl_init(struct cpudl *cp)
192{
193 int i;
194
195 memset(cp, 0, sizeof(*cp));
196 raw_spin_lock_init(&cp->lock);
197 cp->size = 0;
198 for (i = 0; i < NR_CPUS; i++)
199 cp->cpu_to_idx[i] = IDX_INVALID;
200 if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL))
201 return -ENOMEM;
202 cpumask_setall(cp->free_cpus);
203
204 return 0;
205}
206
207/*
208 * cpudl_cleanup - clean up the cpudl structure
209 * @cp: the cpudl max-heap context
210 */
211void cpudl_cleanup(struct cpudl *cp)
212{
213 /*
214 * nothing to do for the moment
215 */
216}
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
new file mode 100644
index 000000000000..a202789a412c
--- /dev/null
+++ b/kernel/sched/cpudeadline.h
@@ -0,0 +1,33 @@
1#ifndef _LINUX_CPUDL_H
2#define _LINUX_CPUDL_H
3
4#include <linux/sched.h>
5
6#define IDX_INVALID -1
7
8struct array_item {
9 u64 dl;
10 int cpu;
11};
12
13struct cpudl {
14 raw_spinlock_t lock;
15 int size;
16 int cpu_to_idx[NR_CPUS];
17 struct array_item elements[NR_CPUS];
18 cpumask_var_t free_cpus;
19};
20
21
22#ifdef CONFIG_SMP
23int cpudl_find(struct cpudl *cp, struct task_struct *p,
24 struct cpumask *later_mask);
25void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid);
26int cpudl_init(struct cpudl *cp);
27void cpudl_cleanup(struct cpudl *cp);
28#else
29#define cpudl_set(cp, cpu, dl, is_valid) do { } while (0)
30#define cpudl_init(cp) (0)
31#endif /* CONFIG_SMP */
32
33#endif /* _LINUX_CPUDL_H */
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
new file mode 100644
index 000000000000..6e79b3faa4cd
--- /dev/null
+++ b/kernel/sched/deadline.c
@@ -0,0 +1,1639 @@
1/*
2 * Deadline Scheduling Class (SCHED_DEADLINE)
3 *
4 * Earliest Deadline First (EDF) + Constant Bandwidth Server (CBS).
5 *
6 * Tasks that periodically execute their instances for less than their
7 * runtime won't miss any of their deadlines.
8 * Tasks that are not periodic or sporadic or that try to execute more
9 * than their reserved bandwidth will be slowed down (and may potentially
10 * miss some of their deadlines), and won't affect any other task.
11 *
12 * Copyright (C) 2012 Dario Faggioli <raistlin@linux.it>,
13 * Juri Lelli <juri.lelli@gmail.com>,
14 * Michael Trimarchi <michael@amarulasolutions.com>,
15 * Fabio Checconi <fchecconi@gmail.com>
16 */
17#include "sched.h"
18
19#include <linux/slab.h>
20
21struct dl_bandwidth def_dl_bandwidth;
22
23static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se)
24{
25 return container_of(dl_se, struct task_struct, dl);
26}
27
28static inline struct rq *rq_of_dl_rq(struct dl_rq *dl_rq)
29{
30 return container_of(dl_rq, struct rq, dl);
31}
32
33static inline struct dl_rq *dl_rq_of_se(struct sched_dl_entity *dl_se)
34{
35 struct task_struct *p = dl_task_of(dl_se);
36 struct rq *rq = task_rq(p);
37
38 return &rq->dl;
39}
40
41static inline int on_dl_rq(struct sched_dl_entity *dl_se)
42{
43 return !RB_EMPTY_NODE(&dl_se->rb_node);
44}
45
46static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq)
47{
48 struct sched_dl_entity *dl_se = &p->dl;
49
50 return dl_rq->rb_leftmost == &dl_se->rb_node;
51}
52
53void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime)
54{
55 raw_spin_lock_init(&dl_b->dl_runtime_lock);
56 dl_b->dl_period = period;
57 dl_b->dl_runtime = runtime;
58}
59
60extern unsigned long to_ratio(u64 period, u64 runtime);
61
62void init_dl_bw(struct dl_bw *dl_b)
63{
64 raw_spin_lock_init(&dl_b->lock);
65 raw_spin_lock(&def_dl_bandwidth.dl_runtime_lock);
66 if (global_rt_runtime() == RUNTIME_INF)
67 dl_b->bw = -1;
68 else
69 dl_b->bw = to_ratio(global_rt_period(), global_rt_runtime());
70 raw_spin_unlock(&def_dl_bandwidth.dl_runtime_lock);
71 dl_b->total_bw = 0;
72}
73
74void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq)
75{
76 dl_rq->rb_root = RB_ROOT;
77
78#ifdef CONFIG_SMP
79 /* zero means no -deadline tasks */
80 dl_rq->earliest_dl.curr = dl_rq->earliest_dl.next = 0;
81
82 dl_rq->dl_nr_migratory = 0;
83 dl_rq->overloaded = 0;
84 dl_rq->pushable_dl_tasks_root = RB_ROOT;
85#else
86 init_dl_bw(&dl_rq->dl_bw);
87#endif
88}
89
90#ifdef CONFIG_SMP
91
92static inline int dl_overloaded(struct rq *rq)
93{
94 return atomic_read(&rq->rd->dlo_count);
95}
96
97static inline void dl_set_overload(struct rq *rq)
98{
99 if (!rq->online)
100 return;
101
102 cpumask_set_cpu(rq->cpu, rq->rd->dlo_mask);
103 /*
104 * Must be visible before the overload count is
105 * set (as in sched_rt.c).
106 *
107 * Matched by the barrier in pull_dl_task().
108 */
109 smp_wmb();
110 atomic_inc(&rq->rd->dlo_count);
111}
112
113static inline void dl_clear_overload(struct rq *rq)
114{
115 if (!rq->online)
116 return;
117
118 atomic_dec(&rq->rd->dlo_count);
119 cpumask_clear_cpu(rq->cpu, rq->rd->dlo_mask);
120}
121
122static void update_dl_migration(struct dl_rq *dl_rq)
123{
124 if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_running > 1) {
125 if (!dl_rq->overloaded) {
126 dl_set_overload(rq_of_dl_rq(dl_rq));
127 dl_rq->overloaded = 1;
128 }
129 } else if (dl_rq->overloaded) {
130 dl_clear_overload(rq_of_dl_rq(dl_rq));
131 dl_rq->overloaded = 0;
132 }
133}
134
135static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
136{
137 struct task_struct *p = dl_task_of(dl_se);
138
139 if (p->nr_cpus_allowed > 1)
140 dl_rq->dl_nr_migratory++;
141
142 update_dl_migration(dl_rq);
143}
144
145static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
146{
147 struct task_struct *p = dl_task_of(dl_se);
148
149 if (p->nr_cpus_allowed > 1)
150 dl_rq->dl_nr_migratory--;
151
152 update_dl_migration(dl_rq);
153}
154
155/*
156 * The list of pushable -deadline task is not a plist, like in
157 * sched_rt.c, it is an rb-tree with tasks ordered by deadline.
158 */
159static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p)
160{
161 struct dl_rq *dl_rq = &rq->dl;
162 struct rb_node **link = &dl_rq->pushable_dl_tasks_root.rb_node;
163 struct rb_node *parent = NULL;
164 struct task_struct *entry;
165 int leftmost = 1;
166
167 BUG_ON(!RB_EMPTY_NODE(&p->pushable_dl_tasks));
168
169 while (*link) {
170 parent = *link;
171 entry = rb_entry(parent, struct task_struct,
172 pushable_dl_tasks);
173 if (dl_entity_preempt(&p->dl, &entry->dl))
174 link = &parent->rb_left;
175 else {
176 link = &parent->rb_right;
177 leftmost = 0;
178 }
179 }
180
181 if (leftmost)
182 dl_rq->pushable_dl_tasks_leftmost = &p->pushable_dl_tasks;
183
184 rb_link_node(&p->pushable_dl_tasks, parent, link);
185 rb_insert_color(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root);
186}
187
188static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p)
189{
190 struct dl_rq *dl_rq = &rq->dl;
191
192 if (RB_EMPTY_NODE(&p->pushable_dl_tasks))
193 return;
194
195 if (dl_rq->pushable_dl_tasks_leftmost == &p->pushable_dl_tasks) {
196 struct rb_node *next_node;
197
198 next_node = rb_next(&p->pushable_dl_tasks);
199 dl_rq->pushable_dl_tasks_leftmost = next_node;
200 }
201
202 rb_erase(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root);
203 RB_CLEAR_NODE(&p->pushable_dl_tasks);
204}
205
206static inline int has_pushable_dl_tasks(struct rq *rq)
207{
208 return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root);
209}
210
211static int push_dl_task(struct rq *rq);
212
213#else
214
215static inline
216void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p)
217{
218}
219
220static inline
221void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p)
222{
223}
224
225static inline
226void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
227{
228}
229
230static inline
231void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
232{
233}
234
235#endif /* CONFIG_SMP */
236
237static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
238static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags);
239static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
240 int flags);
241
242/*
243 * We are being explicitly informed that a new instance is starting,
244 * and this means that:
245 * - the absolute deadline of the entity has to be placed at
246 * current time + relative deadline;
247 * - the runtime of the entity has to be set to the maximum value.
248 *
249 * The capability of specifying such an event is useful whenever a -deadline
250 * entity wants to (try to!) synchronize its behaviour with the scheduler's,
251 * and to (try to!) reconcile itself with its own scheduling
252 * parameters.
253 */
254static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,
255 struct sched_dl_entity *pi_se)
256{
257 struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
258 struct rq *rq = rq_of_dl_rq(dl_rq);
259
260 WARN_ON(!dl_se->dl_new || dl_se->dl_throttled);
261
262 /*
263 * We use the regular wall clock time to set deadlines in the
264 * future; in fact, we must consider execution overheads (time
265 * spent on hardirq context, etc.).
266 */
267 dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
268 dl_se->runtime = pi_se->dl_runtime;
269 dl_se->dl_new = 0;
270}
271
272/*
273 * Pure Earliest Deadline First (EDF) scheduling does not deal with the
274 * possibility of an entity lasting more than what it declared, and thus
275 * exhausting its runtime.
276 *
277 * Here we are interested in making runtime overrun possible, but we do
278 * not want an entity which is misbehaving to affect the scheduling of all
279 * other entities.
280 * Therefore, a budgeting strategy called Constant Bandwidth Server (CBS)
281 * is used, in order to confine each entity within its own bandwidth.
282 *
283 * This function deals exactly with that, and ensures that when the runtime
284 * of an entity is replenished, its deadline is also postponed. That ensures
285 * the overrunning entity can't interfere with other entities in the system and
286 * can't make them miss their deadlines. Reasons why this kind of overrun
287 * could happen are, typically, an entity voluntarily trying to exceed its
288 * runtime, or it just underestimated it during sched_setscheduler_ex().
289 */
290static void replenish_dl_entity(struct sched_dl_entity *dl_se,
291 struct sched_dl_entity *pi_se)
292{
293 struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
294 struct rq *rq = rq_of_dl_rq(dl_rq);
295
296 BUG_ON(pi_se->dl_runtime <= 0);
297
298 /*
299 * This could be the case for a !-dl task that is boosted.
300 * Just go with full inherited parameters.
301 */
302 if (dl_se->dl_deadline == 0) {
303 dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
304 dl_se->runtime = pi_se->dl_runtime;
305 }
306
307 /*
308 * We keep moving the deadline away until we get some
309 * available runtime for the entity. This ensures correct
310 * handling of situations where the runtime overrun is
311 * arbitrarily large.
312 */
313 while (dl_se->runtime <= 0) {
314 dl_se->deadline += pi_se->dl_period;
315 dl_se->runtime += pi_se->dl_runtime;
316 }
317
318 /*
319 * At this point, the deadline really should be "in
320 * the future" with respect to rq->clock. If it's
321 * not, we are, for some reason, lagging too much!
322 * Anyway, after having warn userspace abut that,
323 * we still try to keep the things running by
324 * resetting the deadline and the budget of the
325 * entity.
326 */
327 if (dl_time_before(dl_se->deadline, rq_clock(rq))) {
328 static bool lag_once = false;
329
330 if (!lag_once) {
331 lag_once = true;
332			printk_sched("sched: DL replenish lagged too much\n");
333 }
334 dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
335 dl_se->runtime = pi_se->dl_runtime;
336 }
337}
338
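
The loop above is the core of the CBS budgeting: every refill adds one full budget and pushes the absolute deadline one period further out, so an overrun of N budgets costs the entity N periods of deadline instead of hurting anyone else. The arithmetic in isolation (illustrative sketch):

/* Illustrative sketch: CBS-style replenishment of an exhausted entity. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int64_t  runtime  = -25000000;		/* 25ms over budget           */
	uint64_t deadline = 1000000000;		/* current absolute deadline  */
	const uint64_t dl_period  = 100000000;	/* 100ms                      */
	const int64_t  dl_runtime = 10000000;	/* 10ms budget per period     */

	while (runtime <= 0) {			/* same loop as above         */
		deadline += dl_period;
		runtime  += dl_runtime;
	}
	printf("new runtime=%lld ns, deadline pushed to %llu ns\n",
	       (long long)runtime, (unsigned long long)deadline);
	return 0;
}
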
339/*
340 * Here we check if --at time t-- an entity (which is probably being
341 * [re]activated or, in general, enqueued) can use its remaining runtime
342 * and its current deadline _without_ exceeding the bandwidth it is
343 * assigned (function returns true if it can't). We are in fact applying
344 * one of the CBS rules: when a task wakes up, if the residual runtime
345 * over residual deadline fits within the allocated bandwidth, then we
346 * can keep the current (absolute) deadline and residual budget without
347 * disrupting the schedulability of the system. Otherwise, we should
348 * refill the runtime and set the deadline a period in the future,
349 * because keeping the current (absolute) deadline of the task would
350 * result in breaking guarantees promised to other tasks (refer to
351 * Documentation/scheduler/sched-deadline.txt for more information).
352 *
353 * This function returns true if:
354 *
355 * runtime / (deadline - t) > dl_runtime / dl_period ,
356 *
357 * IOW we can't recycle current parameters.
358 *
359 * Notice that the bandwidth check is done against the period. For
360 * tasks with deadline equal to period this is the same as using
361 * dl_deadline instead of dl_period in the equation above.
362 */
363static bool dl_entity_overflow(struct sched_dl_entity *dl_se,
364 struct sched_dl_entity *pi_se, u64 t)
365{
366 u64 left, right;
367
368 /*
369 * left and right are the two sides of the equation above,
370 * after a bit of shuffling to use multiplications instead
371 * of divisions.
372 *
373 * Note that none of the time values involved in the two
374 * multiplications are absolute: dl_deadline and dl_runtime
375 * are the relative deadline and the maximum runtime of each
376 * instance, runtime is the runtime left for the last instance
377 * and (deadline - t), since t is rq->clock, is the time left
378 * to the (absolute) deadline. Even if overflowing the u64 type
379 * is very unlikely to occur in both cases, here we scale down
380 * as we want to avoid that risk at all. Scaling down by 10
381 * means that we reduce granularity to 1us. We are fine with it,
382 * since this is only a true/false check and, anyway, thinking
383 * of anything below microsecond resolution is actually fiction
384 * (but still we want to give the user that illusion >;).
385 */
386 left = (pi_se->dl_period >> DL_SCALE) * (dl_se->runtime >> DL_SCALE);
387 right = ((dl_se->deadline - t) >> DL_SCALE) *
388 (pi_se->dl_runtime >> DL_SCALE);
389
390 return dl_time_before(right, left);
391}
392
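
The test is the inequality runtime / (deadline - t) > dl_runtime / dl_period, cross-multiplied and pre-shifted by DL_SCALE so that neither product can overflow 64 bits. In isolation (sketch, assuming DL_SCALE == 10):

/* Illustrative sketch: the scaled cross-multiplication from
 * dl_entity_overflow(), with DL_SCALE == 10 as an assumption. */
#include <stdbool.h>
#include <stdint.h>

#define DL_SCALE 10

static bool dl_time_before(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) < 0;
}

/* true => the remaining (runtime, deadline) pair exceeds the assigned
 * bandwidth and must be refreshed instead of being reused. */
static bool dl_entity_overflow(uint64_t runtime, uint64_t deadline,
			       uint64_t dl_runtime, uint64_t dl_period,
			       uint64_t now)
{
	uint64_t left  = (dl_period >> DL_SCALE) * (runtime >> DL_SCALE);
	uint64_t right = ((deadline - now) >> DL_SCALE) *
			 (dl_runtime >> DL_SCALE);

	return dl_time_before(right, left);
}
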
393/*
394 * When a -deadline entity is queued back on the runqueue, its runtime and
395 * deadline might need updating.
396 *
397 * The policy here is that we update the deadline of the entity only if:
398 * - the current deadline is in the past,
399 * - using the remaining runtime with the current deadline would make
400 * the entity exceed its bandwidth.
401 */
402static void update_dl_entity(struct sched_dl_entity *dl_se,
403 struct sched_dl_entity *pi_se)
404{
405 struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
406 struct rq *rq = rq_of_dl_rq(dl_rq);
407
408 /*
409 * The arrival of a new instance needs special treatment, i.e.,
410 * the actual scheduling parameters have to be "renewed".
411 */
412 if (dl_se->dl_new) {
413 setup_new_dl_entity(dl_se, pi_se);
414 return;
415 }
416
417 if (dl_time_before(dl_se->deadline, rq_clock(rq)) ||
418 dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) {
419 dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
420 dl_se->runtime = pi_se->dl_runtime;
421 }
422}
423
424/*
425 * If the entity depleted all its runtime, and if we want it to sleep
426 * while waiting for some new execution time to become available, we
427 * set the bandwidth enforcement timer to the replenishment instant
428 * and try to activate it.
429 *
430 * Notice that it is important for the caller to know if the timer
431 * actually started or not (i.e., whether the replenishment instant
432 * is in the future or in the past).
433 */
434static int start_dl_timer(struct sched_dl_entity *dl_se, bool boosted)
435{
436 struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
437 struct rq *rq = rq_of_dl_rq(dl_rq);
438 ktime_t now, act;
439 ktime_t soft, hard;
440 unsigned long range;
441 s64 delta;
442
443 if (boosted)
444 return 0;
445 /*
446	 * We want the timer to fire at the deadline, keeping in mind
447	 * that the deadline is expressed in rq->clock time and not in
448	 * the hrtimer's time base.
449 */
450 act = ns_to_ktime(dl_se->deadline);
451 now = hrtimer_cb_get_time(&dl_se->dl_timer);
452 delta = ktime_to_ns(now) - rq_clock(rq);
453 act = ktime_add_ns(act, delta);
454
455 /*
456 * If the expiry time already passed, e.g., because the value
457 * chosen as the deadline is too small, don't even try to
458 * start the timer in the past!
459 */
460 if (ktime_us_delta(act, now) < 0)
461 return 0;
462
463 hrtimer_set_expires(&dl_se->dl_timer, act);
464
465 soft = hrtimer_get_softexpires(&dl_se->dl_timer);
466 hard = hrtimer_get_expires(&dl_se->dl_timer);
467 range = ktime_to_ns(ktime_sub(hard, soft));
468 __hrtimer_start_range_ns(&dl_se->dl_timer, soft,
469 range, HRTIMER_MODE_ABS, 0);
470
471 return hrtimer_active(&dl_se->dl_timer);
472}
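
The clock-base adjustment above is just an offset calculation. The sketch below replays it with invented timestamps (no kernel interfaces involved), to show how a deadline expressed in rq->clock time is translated into an hrtimer expiry and rejected if it already lies in the past.

#include <stdio.h>
#include <stdint.h>

/* Toy model of the expiry computation in start_dl_timer(). */
int main(void)
{
	uint64_t deadline  = 5000000000ULL;	/* absolute deadline, rq->clock ns */
	uint64_t rq_clock  = 4990000000ULL;	/* current rq->clock               */
	uint64_t timer_now = 4990002500ULL;	/* hrtimer base reading            */

	int64_t  delta = (int64_t)(timer_now - rq_clock);	/* base offset */
	uint64_t act   = deadline + delta;	/* expiry in the hrtimer base      */

	if ((int64_t)(act - timer_now) < 0)
		printf("expiry already passed, don't arm the timer\n");
	else
		printf("arm timer to fire in %lld ns\n",
		       (long long)(act - timer_now));
	return 0;
}

With these values the timer would be armed 10 ms in the future, i.e. exactly at the replenishment instant as seen by the hrtimer base.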
473
474/*
475 * This is the bandwidth enforcement timer callback. If here, we know
476 * a task is not on its dl_rq, since the fact that the timer was running
477 * means the task is throttled and needs a runtime replenishment.
478 *
479 * However, what we actually do depends on whether the task is active
480 * (it is on its rq) or has been removed from there by a call to
481 * dequeue_task_dl(). In the former case we must issue the runtime
482 * replenishment and add the task back to the dl_rq; in the latter, we just
483 * do nothing but clear dl_throttled, so that runtime and deadline
484 * updating (and the queueing back to dl_rq) will be done by the
485 * next call to enqueue_task_dl().
486 */
487static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
488{
489 struct sched_dl_entity *dl_se = container_of(timer,
490 struct sched_dl_entity,
491 dl_timer);
492 struct task_struct *p = dl_task_of(dl_se);
493 struct rq *rq = task_rq(p);
494 raw_spin_lock(&rq->lock);
495
496 /*
497	 * We need to take care of possible races here. In fact, the
498 * task might have changed its scheduling policy to something
499 * different from SCHED_DEADLINE or changed its reservation
500 * parameters (through sched_setscheduler()).
501 */
502 if (!dl_task(p) || dl_se->dl_new)
503 goto unlock;
504
505 sched_clock_tick();
506 update_rq_clock(rq);
507 dl_se->dl_throttled = 0;
508 if (p->on_rq) {
509 enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
510 if (task_has_dl_policy(rq->curr))
511 check_preempt_curr_dl(rq, p, 0);
512 else
513 resched_task(rq->curr);
514#ifdef CONFIG_SMP
515 /*
516 * Queueing this task back might have overloaded rq,
517 * check if we need to kick someone away.
518 */
519 if (has_pushable_dl_tasks(rq))
520 push_dl_task(rq);
521#endif
522 }
523unlock:
524 raw_spin_unlock(&rq->lock);
525
526 return HRTIMER_NORESTART;
527}
528
529void init_dl_task_timer(struct sched_dl_entity *dl_se)
530{
531 struct hrtimer *timer = &dl_se->dl_timer;
532
533 if (hrtimer_active(timer)) {
534 hrtimer_try_to_cancel(timer);
535 return;
536 }
537
538 hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
539 timer->function = dl_task_timer;
540}
541
542static
543int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se)
544{
545 int dmiss = dl_time_before(dl_se->deadline, rq_clock(rq));
546 int rorun = dl_se->runtime <= 0;
547
548 if (!rorun && !dmiss)
549 return 0;
550
551 /*
552 * If we are beyond our current deadline and we are still
553 * executing, then we have already used some of the runtime of
554 * the next instance. Thus, if we do not account that, we are
555 * stealing bandwidth from the system at each deadline miss!
556 */
557 if (dmiss) {
558 dl_se->runtime = rorun ? dl_se->runtime : 0;
559 dl_se->runtime -= rq_clock(rq) - dl_se->deadline;
560 }
561
562 return 1;
563}
564
565extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
566
567/*
568 * Update the current task's runtime statistics (provided it is still
569 * a -deadline task and has not been removed from the dl_rq).
570 */
571static void update_curr_dl(struct rq *rq)
572{
573 struct task_struct *curr = rq->curr;
574 struct sched_dl_entity *dl_se = &curr->dl;
575 u64 delta_exec;
576
577 if (!dl_task(curr) || !on_dl_rq(dl_se))
578 return;
579
580 /*
581 * Consumed budget is computed considering the time as
582 * observed by schedulable tasks (excluding time spent
583 * in hardirq context, etc.). Deadlines are instead
584 * computed using hard walltime. This seems to be the more
585 * natural solution, but the full ramifications of this
586 * approach need further study.
587 */
588 delta_exec = rq_clock_task(rq) - curr->se.exec_start;
589 if (unlikely((s64)delta_exec < 0))
590 delta_exec = 0;
591
592 schedstat_set(curr->se.statistics.exec_max,
593 max(curr->se.statistics.exec_max, delta_exec));
594
595 curr->se.sum_exec_runtime += delta_exec;
596 account_group_exec_runtime(curr, delta_exec);
597
598 curr->se.exec_start = rq_clock_task(rq);
599 cpuacct_charge(curr, delta_exec);
600
601 sched_rt_avg_update(rq, delta_exec);
602
603 dl_se->runtime -= delta_exec;
604 if (dl_runtime_exceeded(rq, dl_se)) {
605 __dequeue_task_dl(rq, curr, 0);
606 if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted)))
607 dl_se->dl_throttled = 1;
608 else
609 enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
610
611 if (!is_leftmost(curr, &rq->dl))
612 resched_task(curr);
613 }
614
615 /*
616 * Because -- for now -- we share the rt bandwidth, we need to
617 * account our runtime there too, otherwise actual rt tasks
618 * would be able to exceed the shared quota.
619 *
620 * Account to the root rt group for now.
621 *
622 * The solution we're working towards is having the RT groups scheduled
623 * using deadline servers -- however there's a few nasties to figure
624 * out before that can happen.
625 */
626 if (rt_bandwidth_enabled()) {
627 struct rt_rq *rt_rq = &rq->rt;
628
629 raw_spin_lock(&rt_rq->rt_runtime_lock);
630 /*
631 * We'll let actual RT tasks worry about the overflow here, we
632		 * have our own CBS to keep us in line; only account when RT
633 * bandwidth is relevant.
634 */
635 if (sched_rt_bandwidth_account(rt_rq))
636 rt_rq->rt_time += delta_exec;
637 raw_spin_unlock(&rt_rq->rt_runtime_lock);
638 }
639}
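
A compressed, userspace-only model of the accounting done above (the per-tick execution deltas are invented): the budget is charged for the time actually executed and, once it is exhausted or the deadline is missed, the task would be throttled until the bandwidth timer replenishes it.

#include <stdio.h>
#include <stdint.h>

/* Toy CBS enforcement loop in the spirit of update_curr_dl(). */
int main(void)
{
	int64_t  runtime  = 10000000;		/* 10 ms budget      */
	uint64_t deadline = 100000000;		/* absolute deadline */
	uint64_t clock    = 0;
	uint64_t deltas[] = { 3000000, 4000000, 5000000 };	/* exec per tick */

	for (unsigned i = 0; i < sizeof(deltas) / sizeof(deltas[0]); i++) {
		clock   += deltas[i];
		runtime -= deltas[i];	/* dl_se->runtime -= delta_exec */
		if (runtime <= 0 || (int64_t)(deadline - clock) < 0) {
			printf("throttled at t=%llu ns, runtime=%lld ns\n",
			       (unsigned long long)clock, (long long)runtime);
			break;
		}
	}
	return 0;
}

Here the task is throttled after 12 ms of execution with a 2 ms overrun, which is exactly the point where the real code dequeues the task and arms the bandwidth enforcement timer.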
640
641#ifdef CONFIG_SMP
642
643static struct task_struct *pick_next_earliest_dl_task(struct rq *rq, int cpu);
644
645static inline u64 next_deadline(struct rq *rq)
646{
647 struct task_struct *next = pick_next_earliest_dl_task(rq, rq->cpu);
648
649 if (next && dl_prio(next->prio))
650 return next->dl.deadline;
651 else
652 return 0;
653}
654
655static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
656{
657 struct rq *rq = rq_of_dl_rq(dl_rq);
658
659 if (dl_rq->earliest_dl.curr == 0 ||
660 dl_time_before(deadline, dl_rq->earliest_dl.curr)) {
661 /*
662 * If the dl_rq had no -deadline tasks, or if the new task
663		 * If the dl_rq had no -deadline tasks, or if the new task
664		 * has a shorter deadline than the current one on dl_rq, we
664 * know that the previous earliest becomes our next earliest,
665 * as the new task becomes the earliest itself.
666 */
667 dl_rq->earliest_dl.next = dl_rq->earliest_dl.curr;
668 dl_rq->earliest_dl.curr = deadline;
669 cpudl_set(&rq->rd->cpudl, rq->cpu, deadline, 1);
670 } else if (dl_rq->earliest_dl.next == 0 ||
671 dl_time_before(deadline, dl_rq->earliest_dl.next)) {
672 /*
673		 * On the other hand, if the new -deadline task has
674		 * a later deadline than the earliest one on dl_rq, but
675 * it is earlier than the next (if any), we must
676 * recompute the next-earliest.
677 */
678 dl_rq->earliest_dl.next = next_deadline(rq);
679 }
680}
681
682static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
683{
684 struct rq *rq = rq_of_dl_rq(dl_rq);
685
686 /*
687 * Since we may have removed our earliest (and/or next earliest)
688 * task we must recompute them.
689 */
690 if (!dl_rq->dl_nr_running) {
691 dl_rq->earliest_dl.curr = 0;
692 dl_rq->earliest_dl.next = 0;
693 cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0);
694 } else {
695 struct rb_node *leftmost = dl_rq->rb_leftmost;
696 struct sched_dl_entity *entry;
697
698 entry = rb_entry(leftmost, struct sched_dl_entity, rb_node);
699 dl_rq->earliest_dl.curr = entry->deadline;
700 dl_rq->earliest_dl.next = next_deadline(rq);
701 cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline, 1);
702 }
703}
704
705#else
706
707static inline void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {}
708static inline void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {}
709
710#endif /* CONFIG_SMP */
711
712static inline
713void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
714{
715 int prio = dl_task_of(dl_se)->prio;
716 u64 deadline = dl_se->deadline;
717
718 WARN_ON(!dl_prio(prio));
719 dl_rq->dl_nr_running++;
720 inc_nr_running(rq_of_dl_rq(dl_rq));
721
722 inc_dl_deadline(dl_rq, deadline);
723 inc_dl_migration(dl_se, dl_rq);
724}
725
726static inline
727void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
728{
729 int prio = dl_task_of(dl_se)->prio;
730
731 WARN_ON(!dl_prio(prio));
732 WARN_ON(!dl_rq->dl_nr_running);
733 dl_rq->dl_nr_running--;
734 dec_nr_running(rq_of_dl_rq(dl_rq));
735
736 dec_dl_deadline(dl_rq, dl_se->deadline);
737 dec_dl_migration(dl_se, dl_rq);
738}
739
740static void __enqueue_dl_entity(struct sched_dl_entity *dl_se)
741{
742 struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
743 struct rb_node **link = &dl_rq->rb_root.rb_node;
744 struct rb_node *parent = NULL;
745 struct sched_dl_entity *entry;
746 int leftmost = 1;
747
748 BUG_ON(!RB_EMPTY_NODE(&dl_se->rb_node));
749
750 while (*link) {
751 parent = *link;
752 entry = rb_entry(parent, struct sched_dl_entity, rb_node);
753 if (dl_time_before(dl_se->deadline, entry->deadline))
754 link = &parent->rb_left;
755 else {
756 link = &parent->rb_right;
757 leftmost = 0;
758 }
759 }
760
761 if (leftmost)
762 dl_rq->rb_leftmost = &dl_se->rb_node;
763
764 rb_link_node(&dl_se->rb_node, parent, link);
765 rb_insert_color(&dl_se->rb_node, &dl_rq->rb_root);
766
767 inc_dl_tasks(dl_se, dl_rq);
768}
769
770static void __dequeue_dl_entity(struct sched_dl_entity *dl_se)
771{
772 struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
773
774 if (RB_EMPTY_NODE(&dl_se->rb_node))
775 return;
776
777 if (dl_rq->rb_leftmost == &dl_se->rb_node) {
778 struct rb_node *next_node;
779
780 next_node = rb_next(&dl_se->rb_node);
781 dl_rq->rb_leftmost = next_node;
782 }
783
784 rb_erase(&dl_se->rb_node, &dl_rq->rb_root);
785 RB_CLEAR_NODE(&dl_se->rb_node);
786
787 dec_dl_tasks(dl_se, dl_rq);
788}
789
790static void
791enqueue_dl_entity(struct sched_dl_entity *dl_se,
792 struct sched_dl_entity *pi_se, int flags)
793{
794 BUG_ON(on_dl_rq(dl_se));
795
796 /*
797 * If this is a wakeup or a new instance, the scheduling
798 * parameters of the task might need updating. Otherwise,
799 * we want a replenishment of its runtime.
800 */
801 if (!dl_se->dl_new && flags & ENQUEUE_REPLENISH)
802 replenish_dl_entity(dl_se, pi_se);
803 else
804 update_dl_entity(dl_se, pi_se);
805
806 __enqueue_dl_entity(dl_se);
807}
808
809static void dequeue_dl_entity(struct sched_dl_entity *dl_se)
810{
811 __dequeue_dl_entity(dl_se);
812}
813
814static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
815{
816 struct task_struct *pi_task = rt_mutex_get_top_task(p);
817 struct sched_dl_entity *pi_se = &p->dl;
818
819 /*
820 * Use the scheduling parameters of the top pi-waiter
821 * task if we have one and its (relative) deadline is
822	 * smaller than ours; otherwise we keep our own runtime
823	 * and deadline.
824 */
825 if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio))
826 pi_se = &pi_task->dl;
827
828 /*
829 * If p is throttled, we do nothing. In fact, if it exhausted
830 * its budget it needs a replenishment and, since it now is on
831 * its rq, the bandwidth timer callback (which clearly has not
832 * run yet) will take care of this.
833 */
834 if (p->dl.dl_throttled)
835 return;
836
837 enqueue_dl_entity(&p->dl, pi_se, flags);
838
839 if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
840 enqueue_pushable_dl_task(rq, p);
841}
842
843static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
844{
845 dequeue_dl_entity(&p->dl);
846 dequeue_pushable_dl_task(rq, p);
847}
848
849static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
850{
851 update_curr_dl(rq);
852 __dequeue_task_dl(rq, p, flags);
853}
854
855/*
856 * Yield task semantic for -deadline tasks is:
857 *
858 * get off the CPU until our next instance, with
859 * a new runtime. This is of little use now, since we
860 * don't have a bandwidth reclaiming mechanism. Anyway,
861 * bandwidth reclaiming is planned for the future, and
862 * yield_task_dl will indicate that some spare budget
863 * is available for other task instances to use.
864 */
865static void yield_task_dl(struct rq *rq)
866{
867 struct task_struct *p = rq->curr;
868
869 /*
870 * We make the task go to sleep until its current deadline by
871 * forcing its runtime to zero. This way, update_curr_dl() stops
872 * it and the bandwidth timer will wake it up and will give it
873 * new scheduling parameters (thanks to dl_new=1).
874 */
875 if (p->dl.runtime > 0) {
876 rq->curr->dl.dl_new = 1;
877 p->dl.runtime = 0;
878 }
879 update_curr_dl(rq);
880}
881
882#ifdef CONFIG_SMP
883
884static int find_later_rq(struct task_struct *task);
885
886static int
887select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
888{
889 struct task_struct *curr;
890 struct rq *rq;
891
892 if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
893 goto out;
894
895 rq = cpu_rq(cpu);
896
897 rcu_read_lock();
898 curr = ACCESS_ONCE(rq->curr); /* unlocked access */
899
900 /*
901 * If we are dealing with a -deadline task, we must
902 * decide where to wake it up.
903 * If it has a later deadline and the current task
904 * on this rq can't move (provided the waking task
905 * can!) we prefer to send it somewhere else. On the
906 * other hand, if it has a shorter deadline, we
907 * try to make it stay here, it might be important.
908 */
909 if (unlikely(dl_task(curr)) &&
910 (curr->nr_cpus_allowed < 2 ||
911 !dl_entity_preempt(&p->dl, &curr->dl)) &&
912 (p->nr_cpus_allowed > 1)) {
913 int target = find_later_rq(p);
914
915 if (target != -1)
916 cpu = target;
917 }
918 rcu_read_unlock();
919
920out:
921 return cpu;
922}
923
924static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
925{
926 /*
927 * Current can't be migrated, useless to reschedule,
928 * let's hope p can move out.
929 */
930 if (rq->curr->nr_cpus_allowed == 1 ||
931 cpudl_find(&rq->rd->cpudl, rq->curr, NULL) == -1)
932 return;
933
934 /*
935 * p is migratable, so let's not schedule it and
936 * see if it is pushed or pulled somewhere else.
937 */
938 if (p->nr_cpus_allowed != 1 &&
939 cpudl_find(&rq->rd->cpudl, p, NULL) != -1)
940 return;
941
942 resched_task(rq->curr);
943}
944
945#endif /* CONFIG_SMP */
946
947/*
948 * Only called when both the current and waking task are -deadline
949 * tasks.
950 */
951static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
952 int flags)
953{
954 if (dl_entity_preempt(&p->dl, &rq->curr->dl)) {
955 resched_task(rq->curr);
956 return;
957 }
958
959#ifdef CONFIG_SMP
960 /*
961 * In the unlikely case current and p have the same deadline
962 * let us try to decide what's the best thing to do...
963 */
964 if ((p->dl.deadline == rq->curr->dl.deadline) &&
965 !test_tsk_need_resched(rq->curr))
966 check_preempt_equal_dl(rq, p);
967#endif /* CONFIG_SMP */
968}
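
Every deadline comparison in this class, including the preemption check above, goes through dl_time_before(), defined in kernel/sched/sched.h as a signed difference so that it stays correct even if the u64 clock wraps. The standalone sketch below (toy_dl_time_before is just a local copy for illustration) shows that property.

#include <stdio.h>
#include <stdint.h>

/* Same trick as dl_time_before(): signed difference handles wraparound. */
static int toy_dl_time_before(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) < 0;
}

int main(void)
{
	/* Ordinary case: 100 comes before 200. */
	printf("%d\n", toy_dl_time_before(100, 200));			/* 1 */
	/* Wraparound: a deadline just before u64 overflow is still
	 * "before" one that has already wrapped past zero. */
	printf("%d\n", toy_dl_time_before(UINT64_MAX - 5, 5));		/* 1 */
	printf("%d\n", toy_dl_time_before(5, UINT64_MAX - 5));		/* 0 */
	return 0;
}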
969
970#ifdef CONFIG_SCHED_HRTICK
971static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
972{
973 s64 delta = p->dl.dl_runtime - p->dl.runtime;
974
975 if (delta > 10000)
976 hrtick_start(rq, p->dl.runtime);
977}
978#endif
979
980static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
981 struct dl_rq *dl_rq)
982{
983 struct rb_node *left = dl_rq->rb_leftmost;
984
985 if (!left)
986 return NULL;
987
988 return rb_entry(left, struct sched_dl_entity, rb_node);
989}
990
991struct task_struct *pick_next_task_dl(struct rq *rq)
992{
993 struct sched_dl_entity *dl_se;
994 struct task_struct *p;
995 struct dl_rq *dl_rq;
996
997 dl_rq = &rq->dl;
998
999 if (unlikely(!dl_rq->dl_nr_running))
1000 return NULL;
1001
1002 dl_se = pick_next_dl_entity(rq, dl_rq);
1003 BUG_ON(!dl_se);
1004
1005 p = dl_task_of(dl_se);
1006 p->se.exec_start = rq_clock_task(rq);
1007
1008 /* Running task will never be pushed. */
1009 dequeue_pushable_dl_task(rq, p);
1010
1011#ifdef CONFIG_SCHED_HRTICK
1012 if (hrtick_enabled(rq))
1013 start_hrtick_dl(rq, p);
1014#endif
1015
1016#ifdef CONFIG_SMP
1017 rq->post_schedule = has_pushable_dl_tasks(rq);
1018#endif /* CONFIG_SMP */
1019
1020 return p;
1021}
1022
1023static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
1024{
1025 update_curr_dl(rq);
1026
1027 if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1)
1028 enqueue_pushable_dl_task(rq, p);
1029}
1030
1031static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
1032{
1033 update_curr_dl(rq);
1034
1035#ifdef CONFIG_SCHED_HRTICK
1036 if (hrtick_enabled(rq) && queued && p->dl.runtime > 0)
1037 start_hrtick_dl(rq, p);
1038#endif
1039}
1040
1041static void task_fork_dl(struct task_struct *p)
1042{
1043 /*
1044	 * SCHED_DEADLINE tasks cannot fork; this is enforced in
1045	 * sched_fork().
1046 */
1047}
1048
1049static void task_dead_dl(struct task_struct *p)
1050{
1051 struct hrtimer *timer = &p->dl.dl_timer;
1052 struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
1053
1054 /*
1055 * Since we are TASK_DEAD we won't slip out of the domain!
1056 */
1057 raw_spin_lock_irq(&dl_b->lock);
1058 dl_b->total_bw -= p->dl.dl_bw;
1059 raw_spin_unlock_irq(&dl_b->lock);
1060
1061 hrtimer_cancel(timer);
1062}
1063
1064static void set_curr_task_dl(struct rq *rq)
1065{
1066 struct task_struct *p = rq->curr;
1067
1068 p->se.exec_start = rq_clock_task(rq);
1069
1070 /* You can't push away the running task */
1071 dequeue_pushable_dl_task(rq, p);
1072}
1073
1074#ifdef CONFIG_SMP
1075
1076/* Only try algorithms three times */
1077#define DL_MAX_TRIES 3
1078
1079static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
1080{
1081 if (!task_running(rq, p) &&
1082 (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) &&
1083 (p->nr_cpus_allowed > 1))
1084 return 1;
1085
1086 return 0;
1087}
1088
1089/* Returns the second earliest -deadline task, NULL otherwise */
1090static struct task_struct *pick_next_earliest_dl_task(struct rq *rq, int cpu)
1091{
1092 struct rb_node *next_node = rq->dl.rb_leftmost;
1093 struct sched_dl_entity *dl_se;
1094 struct task_struct *p = NULL;
1095
1096next_node:
1097 next_node = rb_next(next_node);
1098 if (next_node) {
1099 dl_se = rb_entry(next_node, struct sched_dl_entity, rb_node);
1100 p = dl_task_of(dl_se);
1101
1102 if (pick_dl_task(rq, p, cpu))
1103 return p;
1104
1105 goto next_node;
1106 }
1107
1108 return NULL;
1109}
1110
1111static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl);
1112
1113static int find_later_rq(struct task_struct *task)
1114{
1115 struct sched_domain *sd;
1116 struct cpumask *later_mask = __get_cpu_var(local_cpu_mask_dl);
1117 int this_cpu = smp_processor_id();
1118 int best_cpu, cpu = task_cpu(task);
1119
1120 /* Make sure the mask is initialized first */
1121 if (unlikely(!later_mask))
1122 return -1;
1123
1124 if (task->nr_cpus_allowed == 1)
1125 return -1;
1126
1127 best_cpu = cpudl_find(&task_rq(task)->rd->cpudl,
1128 task, later_mask);
1129 if (best_cpu == -1)
1130 return -1;
1131
1132 /*
1133 * If we are here, some target has been found,
1134 * the most suitable of which is cached in best_cpu.
1135	 * That is, among the runqueues whose current tasks have
1136	 * later deadlines than this task's, best_cpu is the one
1137	 * whose current task has the latest deadline.
1138 *
1139 * Now we check how well this matches with task's
1140 * affinity and system topology.
1141 *
1142	 * The last CPU where the task ran is our first
1143	 * guess, since it is most likely cache-hot there.
1144 */
1145 if (cpumask_test_cpu(cpu, later_mask))
1146 return cpu;
1147 /*
1148 * Check if this_cpu is to be skipped (i.e., it is
1149 * not in the mask) or not.
1150 */
1151 if (!cpumask_test_cpu(this_cpu, later_mask))
1152 this_cpu = -1;
1153
1154 rcu_read_lock();
1155 for_each_domain(cpu, sd) {
1156 if (sd->flags & SD_WAKE_AFFINE) {
1157
1158 /*
1159 * If possible, preempting this_cpu is
1160 * cheaper than migrating.
1161 */
1162 if (this_cpu != -1 &&
1163 cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
1164 rcu_read_unlock();
1165 return this_cpu;
1166 }
1167
1168 /*
1169 * Last chance: if best_cpu is valid and is
1170 * in the mask, that becomes our choice.
1171 */
1172 if (best_cpu < nr_cpu_ids &&
1173 cpumask_test_cpu(best_cpu, sched_domain_span(sd))) {
1174 rcu_read_unlock();
1175 return best_cpu;
1176 }
1177 }
1178 }
1179 rcu_read_unlock();
1180
1181 /*
1182 * At this point, all our guesses failed, we just return
1183	 * 'something', and let the caller sort things out.
1184 */
1185 if (this_cpu != -1)
1186 return this_cpu;
1187
1188 cpu = cpumask_any(later_mask);
1189 if (cpu < nr_cpu_ids)
1190 return cpu;
1191
1192 return -1;
1193}
1194
1195/* Locks the rq it finds */
1196static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
1197{
1198 struct rq *later_rq = NULL;
1199 int tries;
1200 int cpu;
1201
1202 for (tries = 0; tries < DL_MAX_TRIES; tries++) {
1203 cpu = find_later_rq(task);
1204
1205 if ((cpu == -1) || (cpu == rq->cpu))
1206 break;
1207
1208 later_rq = cpu_rq(cpu);
1209
1210 /* Retry if something changed. */
1211 if (double_lock_balance(rq, later_rq)) {
1212 if (unlikely(task_rq(task) != rq ||
1213 !cpumask_test_cpu(later_rq->cpu,
1214 &task->cpus_allowed) ||
1215 task_running(rq, task) || !task->on_rq)) {
1216 double_unlock_balance(rq, later_rq);
1217 later_rq = NULL;
1218 break;
1219 }
1220 }
1221
1222 /*
1223 * If the rq we found has no -deadline task, or
1224 * its earliest one has a later deadline than our
1225 * task, the rq is a good one.
1226 */
1227 if (!later_rq->dl.dl_nr_running ||
1228 dl_time_before(task->dl.deadline,
1229 later_rq->dl.earliest_dl.curr))
1230 break;
1231
1232 /* Otherwise we try again. */
1233 double_unlock_balance(rq, later_rq);
1234 later_rq = NULL;
1235 }
1236
1237 return later_rq;
1238}
1239
1240static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
1241{
1242 struct task_struct *p;
1243
1244 if (!has_pushable_dl_tasks(rq))
1245 return NULL;
1246
1247 p = rb_entry(rq->dl.pushable_dl_tasks_leftmost,
1248 struct task_struct, pushable_dl_tasks);
1249
1250 BUG_ON(rq->cpu != task_cpu(p));
1251 BUG_ON(task_current(rq, p));
1252 BUG_ON(p->nr_cpus_allowed <= 1);
1253
1254 BUG_ON(!p->on_rq);
1255 BUG_ON(!dl_task(p));
1256
1257 return p;
1258}
1259
1260/*
1261 * See if the non-running -deadline tasks on this rq
1262 * can be sent to some other CPU where they can preempt
1263 * and start executing.
1264 */
1265static int push_dl_task(struct rq *rq)
1266{
1267 struct task_struct *next_task;
1268 struct rq *later_rq;
1269
1270 if (!rq->dl.overloaded)
1271 return 0;
1272
1273 next_task = pick_next_pushable_dl_task(rq);
1274 if (!next_task)
1275 return 0;
1276
1277retry:
1278 if (unlikely(next_task == rq->curr)) {
1279 WARN_ON(1);
1280 return 0;
1281 }
1282
1283 /*
1284 * If next_task preempts rq->curr, and rq->curr
1285 * can move away, it makes sense to just reschedule
1286 * without going further in pushing next_task.
1287 */
1288 if (dl_task(rq->curr) &&
1289 dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) &&
1290 rq->curr->nr_cpus_allowed > 1) {
1291 resched_task(rq->curr);
1292 return 0;
1293 }
1294
1295 /* We might release rq lock */
1296 get_task_struct(next_task);
1297
1298 /* Will lock the rq it'll find */
1299 later_rq = find_lock_later_rq(next_task, rq);
1300 if (!later_rq) {
1301 struct task_struct *task;
1302
1303 /*
1304 * We must check all this again, since
1305 * find_lock_later_rq releases rq->lock and it is
1306 * then possible that next_task has migrated.
1307 */
1308 task = pick_next_pushable_dl_task(rq);
1309 if (task_cpu(next_task) == rq->cpu && task == next_task) {
1310 /*
1311 * The task is still there. We don't try
1312 * again, some other cpu will pull it when ready.
1313 */
1314 dequeue_pushable_dl_task(rq, next_task);
1315 goto out;
1316 }
1317
1318 if (!task)
1319 /* No more tasks */
1320 goto out;
1321
1322 put_task_struct(next_task);
1323 next_task = task;
1324 goto retry;
1325 }
1326
1327 deactivate_task(rq, next_task, 0);
1328 set_task_cpu(next_task, later_rq->cpu);
1329 activate_task(later_rq, next_task, 0);
1330
1331 resched_task(later_rq->curr);
1332
1333 double_unlock_balance(rq, later_rq);
1334
1335out:
1336 put_task_struct(next_task);
1337
1338 return 1;
1339}
1340
1341static void push_dl_tasks(struct rq *rq)
1342{
1343 /* Terminates as it moves a -deadline task */
1344 while (push_dl_task(rq))
1345 ;
1346}
1347
1348static int pull_dl_task(struct rq *this_rq)
1349{
1350 int this_cpu = this_rq->cpu, ret = 0, cpu;
1351 struct task_struct *p;
1352 struct rq *src_rq;
1353 u64 dmin = LONG_MAX;
1354
1355 if (likely(!dl_overloaded(this_rq)))
1356 return 0;
1357
1358 /*
1359 * Match the barrier from dl_set_overloaded; this guarantees that if we
1360 * see overloaded we must also see the dlo_mask bit.
1361 */
1362 smp_rmb();
1363
1364 for_each_cpu(cpu, this_rq->rd->dlo_mask) {
1365 if (this_cpu == cpu)
1366 continue;
1367
1368 src_rq = cpu_rq(cpu);
1369
1370 /*
1371		 * It looks racy, and it is! However, as in sched_rt.c,
1372 * we are fine with this.
1373 */
1374 if (this_rq->dl.dl_nr_running &&
1375 dl_time_before(this_rq->dl.earliest_dl.curr,
1376 src_rq->dl.earliest_dl.next))
1377 continue;
1378
1379 /* Might drop this_rq->lock */
1380 double_lock_balance(this_rq, src_rq);
1381
1382 /*
1383 * If there are no more pullable tasks on the
1384 * rq, we're done with it.
1385 */
1386 if (src_rq->dl.dl_nr_running <= 1)
1387 goto skip;
1388
1389 p = pick_next_earliest_dl_task(src_rq, this_cpu);
1390
1391 /*
1392 * We found a task to be pulled if:
1393 * - it preempts our current (if there's one),
1394 * - it will preempt the last one we pulled (if any).
1395 */
1396 if (p && dl_time_before(p->dl.deadline, dmin) &&
1397 (!this_rq->dl.dl_nr_running ||
1398 dl_time_before(p->dl.deadline,
1399 this_rq->dl.earliest_dl.curr))) {
1400 WARN_ON(p == src_rq->curr);
1401 WARN_ON(!p->on_rq);
1402
1403			 * Skip p if it has an earlier deadline than the current
1404			 * task of its runqueue: it is about to run there anyway.
1405 * deadline than the current task of its runqueue.
1406 */
1407 if (dl_time_before(p->dl.deadline,
1408 src_rq->curr->dl.deadline))
1409 goto skip;
1410
1411 ret = 1;
1412
1413 deactivate_task(src_rq, p, 0);
1414 set_task_cpu(p, this_cpu);
1415 activate_task(this_rq, p, 0);
1416 dmin = p->dl.deadline;
1417
1418 /* Is there any other task even earlier? */
1419 }
1420skip:
1421 double_unlock_balance(this_rq, src_rq);
1422 }
1423
1424 return ret;
1425}
1426
1427static void pre_schedule_dl(struct rq *rq, struct task_struct *prev)
1428{
1429 /* Try to pull other tasks here */
1430 if (dl_task(prev))
1431 pull_dl_task(rq);
1432}
1433
1434static void post_schedule_dl(struct rq *rq)
1435{
1436 push_dl_tasks(rq);
1437}
1438
1439/*
1440 * Since the task is not running and a reschedule is not going to happen
1441 * anytime soon on its runqueue, we try pushing it away now.
1442 */
1443static void task_woken_dl(struct rq *rq, struct task_struct *p)
1444{
1445 if (!task_running(rq, p) &&
1446 !test_tsk_need_resched(rq->curr) &&
1447 has_pushable_dl_tasks(rq) &&
1448 p->nr_cpus_allowed > 1 &&
1449 dl_task(rq->curr) &&
1450 (rq->curr->nr_cpus_allowed < 2 ||
1451 dl_entity_preempt(&rq->curr->dl, &p->dl))) {
1452 push_dl_tasks(rq);
1453 }
1454}
1455
1456static void set_cpus_allowed_dl(struct task_struct *p,
1457 const struct cpumask *new_mask)
1458{
1459 struct rq *rq;
1460 int weight;
1461
1462 BUG_ON(!dl_task(p));
1463
1464 /*
1465 * Update only if the task is actually running (i.e.,
1466 * it is on the rq AND it is not throttled).
1467 */
1468 if (!on_dl_rq(&p->dl))
1469 return;
1470
1471 weight = cpumask_weight(new_mask);
1472
1473 /*
1474	 * Only update if the task changes whether it can migrate
1475	 * or not.
1476 */
1477 if ((p->nr_cpus_allowed > 1) == (weight > 1))
1478 return;
1479
1480 rq = task_rq(p);
1481
1482 /*
1483 * The process used to be able to migrate OR it can now migrate
1484 */
1485 if (weight <= 1) {
1486 if (!task_current(rq, p))
1487 dequeue_pushable_dl_task(rq, p);
1488 BUG_ON(!rq->dl.dl_nr_migratory);
1489 rq->dl.dl_nr_migratory--;
1490 } else {
1491 if (!task_current(rq, p))
1492 enqueue_pushable_dl_task(rq, p);
1493 rq->dl.dl_nr_migratory++;
1494 }
1495
1496 update_dl_migration(&rq->dl);
1497}
1498
1499/* Assumes rq->lock is held */
1500static void rq_online_dl(struct rq *rq)
1501{
1502 if (rq->dl.overloaded)
1503 dl_set_overload(rq);
1504
1505 if (rq->dl.dl_nr_running > 0)
1506 cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1);
1507}
1508
1509/* Assumes rq->lock is held */
1510static void rq_offline_dl(struct rq *rq)
1511{
1512 if (rq->dl.overloaded)
1513 dl_clear_overload(rq);
1514
1515 cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0);
1516}
1517
1518void init_sched_dl_class(void)
1519{
1520 unsigned int i;
1521
1522 for_each_possible_cpu(i)
1523 zalloc_cpumask_var_node(&per_cpu(local_cpu_mask_dl, i),
1524 GFP_KERNEL, cpu_to_node(i));
1525}
1526
1527#endif /* CONFIG_SMP */
1528
1529static void switched_from_dl(struct rq *rq, struct task_struct *p)
1530{
1531 if (hrtimer_active(&p->dl.dl_timer) && !dl_policy(p->policy))
1532 hrtimer_try_to_cancel(&p->dl.dl_timer);
1533
1534#ifdef CONFIG_SMP
1535 /*
1536 * Since this might be the only -deadline task on the rq,
1537 * this is the right place to try to pull some other one
1538 * from an overloaded cpu, if any.
1539 */
1540 if (!rq->dl.dl_nr_running)
1541 pull_dl_task(rq);
1542#endif
1543}
1544
1545/*
1546 * When switching to -deadline, we may overload the rq, then
1547 * we try to push someone off, if possible.
1548 */
1549static void switched_to_dl(struct rq *rq, struct task_struct *p)
1550{
1551 int check_resched = 1;
1552
1553 /*
1554 * If p is throttled, don't consider the possibility
1555	 * of preempting rq->curr; the check will be done right
1556	 * after its runtime gets replenished.
1557 */
1558 if (unlikely(p->dl.dl_throttled))
1559 return;
1560
1561 if (p->on_rq || rq->curr != p) {
1562#ifdef CONFIG_SMP
1563 if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p))
1564 /* Only reschedule if pushing failed */
1565 check_resched = 0;
1566#endif /* CONFIG_SMP */
1567 if (check_resched && task_has_dl_policy(rq->curr))
1568 check_preempt_curr_dl(rq, p, 0);
1569 }
1570}
1571
1572/*
1573 * If the scheduling parameters of a -deadline task changed,
1574 * a push or pull operation might be needed.
1575 */
1576static void prio_changed_dl(struct rq *rq, struct task_struct *p,
1577 int oldprio)
1578{
1579 if (p->on_rq || rq->curr == p) {
1580#ifdef CONFIG_SMP
1581 /*
1582 * This might be too much, but unfortunately
1583 * we don't have the old deadline value, and
1584		 * we can't tell whether the task is raising
1585		 * or lowering its prio, so...
1586 */
1587 if (!rq->dl.overloaded)
1588 pull_dl_task(rq);
1589
1590 /*
1591		 * If we now have an earlier deadline task than p,
1592 * then reschedule, provided p is still on this
1593 * runqueue.
1594 */
1595 if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline) &&
1596 rq->curr == p)
1597 resched_task(p);
1598#else
1599 /*
1600		 * Again, we don't know if p has an earlier
1601 * or later deadline, so let's blindly set a
1602 * (maybe not needed) rescheduling point.
1603 */
1604 resched_task(p);
1605#endif /* CONFIG_SMP */
1606 } else
1607 switched_to_dl(rq, p);
1608}
1609
1610const struct sched_class dl_sched_class = {
1611 .next = &rt_sched_class,
1612 .enqueue_task = enqueue_task_dl,
1613 .dequeue_task = dequeue_task_dl,
1614 .yield_task = yield_task_dl,
1615
1616 .check_preempt_curr = check_preempt_curr_dl,
1617
1618 .pick_next_task = pick_next_task_dl,
1619 .put_prev_task = put_prev_task_dl,
1620
1621#ifdef CONFIG_SMP
1622 .select_task_rq = select_task_rq_dl,
1623 .set_cpus_allowed = set_cpus_allowed_dl,
1624 .rq_online = rq_online_dl,
1625 .rq_offline = rq_offline_dl,
1626 .pre_schedule = pre_schedule_dl,
1627 .post_schedule = post_schedule_dl,
1628 .task_woken = task_woken_dl,
1629#endif
1630
1631 .set_curr_task = set_curr_task_dl,
1632 .task_tick = task_tick_dl,
1633 .task_fork = task_fork_dl,
1634 .task_dead = task_dead_dl,
1635
1636 .prio_changed = prio_changed_dl,
1637 .switched_from = switched_from_dl,
1638 .switched_to = switched_to_dl,
1639};
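
For completeness, the class above is driven from userspace through the sched_setattr() system call described in Documentation/scheduler/sched-deadline.txt. The fragment below is a hedged sketch of that usage: it assumes a libc recent enough to define SYS_sched_setattr, declares struct sched_attr locally in case the headers do not provide it, and uses purely illustrative parameters (10 ms of runtime every 100 ms). Switching to SCHED_DEADLINE typically requires appropriate privileges.

#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/types.h>

/* Local declaration, in case the libc headers do not expose it. */
struct sched_attr {
	__u32 size;
	__u32 sched_policy;
	__u64 sched_flags;
	__s32 sched_nice;
	__u32 sched_priority;
	__u64 sched_runtime;
	__u64 sched_deadline;
	__u64 sched_period;
};

#ifndef SCHED_DEADLINE
#define SCHED_DEADLINE 6
#endif

int main(void)
{
	struct sched_attr attr = {
		.size           = sizeof(attr),
		.sched_policy   = SCHED_DEADLINE,
		.sched_runtime  =  10 * 1000 * 1000,	/* 10 ms  */
		.sched_deadline = 100 * 1000 * 1000,	/* 100 ms */
		.sched_period   = 100 * 1000 * 1000,	/* 100 ms */
	};

	/* pid 0 means the calling task; requires CAP_SYS_NICE or root. */
	if (syscall(SYS_sched_setattr, 0, &attr, 0) < 0) {
		perror("sched_setattr");
		return 1;
	}
	puts("now running as SCHED_DEADLINE");
	return 0;
}

The three parameters map directly onto dl_runtime, dl_deadline and dl_period used throughout deadline.c above.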
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 5c34d1817e8f..dd52e7ffb10e 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -139,7 +139,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
139 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); 139 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
140#endif 140#endif
141#ifdef CONFIG_NUMA_BALANCING 141#ifdef CONFIG_NUMA_BALANCING
142 SEQ_printf(m, " %d", cpu_to_node(task_cpu(p))); 142 SEQ_printf(m, " %d", task_node(p));
143#endif 143#endif
144#ifdef CONFIG_CGROUP_SCHED 144#ifdef CONFIG_CGROUP_SCHED
145 SEQ_printf(m, " %s", task_group_path(task_group(p))); 145 SEQ_printf(m, " %s", task_group_path(task_group(p)));
@@ -371,7 +371,7 @@ static void sched_debug_header(struct seq_file *m)
371 PN(cpu_clk); 371 PN(cpu_clk);
372 P(jiffies); 372 P(jiffies);
373#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK 373#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
374 P(sched_clock_stable); 374 P(sched_clock_stable());
375#endif 375#endif
376#undef PN 376#undef PN
377#undef P 377#undef P
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c7395d97e4cb..9b4c4f320130 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -872,15 +872,6 @@ static unsigned int task_scan_max(struct task_struct *p)
872 return max(smin, smax); 872 return max(smin, smax);
873} 873}
874 874
875/*
876 * Once a preferred node is selected the scheduler balancer will prefer moving
877 * a task to that node for sysctl_numa_balancing_settle_count number of PTE
878 * scans. This will give the process the chance to accumulate more faults on
879 * the preferred node but still allow the scheduler to move the task again if
880 * the nodes CPUs are overloaded.
881 */
882unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4;
883
884static void account_numa_enqueue(struct rq *rq, struct task_struct *p) 875static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
885{ 876{
886 rq->nr_numa_running += (p->numa_preferred_nid != -1); 877 rq->nr_numa_running += (p->numa_preferred_nid != -1);
@@ -930,7 +921,8 @@ static inline unsigned long group_faults(struct task_struct *p, int nid)
930 if (!p->numa_group) 921 if (!p->numa_group)
931 return 0; 922 return 0;
932 923
933 return p->numa_group->faults[2*nid] + p->numa_group->faults[2*nid+1]; 924 return p->numa_group->faults[task_faults_idx(nid, 0)] +
925 p->numa_group->faults[task_faults_idx(nid, 1)];
934} 926}
935 927
936/* 928/*
@@ -1023,7 +1015,7 @@ struct task_numa_env {
1023 1015
1024 struct numa_stats src_stats, dst_stats; 1016 struct numa_stats src_stats, dst_stats;
1025 1017
1026 int imbalance_pct, idx; 1018 int imbalance_pct;
1027 1019
1028 struct task_struct *best_task; 1020 struct task_struct *best_task;
1029 long best_imp; 1021 long best_imp;
@@ -1211,7 +1203,7 @@ static int task_numa_migrate(struct task_struct *p)
1211 * elsewhere, so there is no point in (re)trying. 1203 * elsewhere, so there is no point in (re)trying.
1212 */ 1204 */
1213 if (unlikely(!sd)) { 1205 if (unlikely(!sd)) {
1214 p->numa_preferred_nid = cpu_to_node(task_cpu(p)); 1206 p->numa_preferred_nid = task_node(p);
1215 return -EINVAL; 1207 return -EINVAL;
1216 } 1208 }
1217 1209
@@ -1258,11 +1250,15 @@ static int task_numa_migrate(struct task_struct *p)
1258 p->numa_scan_period = task_scan_min(p); 1250 p->numa_scan_period = task_scan_min(p);
1259 1251
1260 if (env.best_task == NULL) { 1252 if (env.best_task == NULL) {
1261 int ret = migrate_task_to(p, env.best_cpu); 1253 ret = migrate_task_to(p, env.best_cpu);
1254 if (ret != 0)
1255 trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
1262 return ret; 1256 return ret;
1263 } 1257 }
1264 1258
1265 ret = migrate_swap(p, env.best_task); 1259 ret = migrate_swap(p, env.best_task);
1260 if (ret != 0)
1261 trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
1266 put_task_struct(env.best_task); 1262 put_task_struct(env.best_task);
1267 return ret; 1263 return ret;
1268} 1264}
@@ -1278,7 +1274,7 @@ static void numa_migrate_preferred(struct task_struct *p)
1278 p->numa_migrate_retry = jiffies + HZ; 1274 p->numa_migrate_retry = jiffies + HZ;
1279 1275
1280 /* Success if task is already running on preferred CPU */ 1276 /* Success if task is already running on preferred CPU */
1281 if (cpu_to_node(task_cpu(p)) == p->numa_preferred_nid) 1277 if (task_node(p) == p->numa_preferred_nid)
1282 return; 1278 return;
1283 1279
1284 /* Otherwise, try migrate to a CPU on the preferred node */ 1280 /* Otherwise, try migrate to a CPU on the preferred node */
@@ -1350,7 +1346,6 @@ static void update_task_scan_period(struct task_struct *p,
1350 * scanning faster if shared accesses dominate as it may 1346 * scanning faster if shared accesses dominate as it may
1351 * simply bounce migrations uselessly 1347 * simply bounce migrations uselessly
1352 */ 1348 */
1353 period_slot = DIV_ROUND_UP(diff, NUMA_PERIOD_SLOTS);
1354 ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared)); 1349 ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared));
1355 diff = (diff * ratio) / NUMA_PERIOD_SLOTS; 1350 diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
1356 } 1351 }
@@ -1762,6 +1757,8 @@ void task_numa_work(struct callback_head *work)
1762 start = end; 1757 start = end;
1763 if (pages <= 0) 1758 if (pages <= 0)
1764 goto out; 1759 goto out;
1760
1761 cond_resched();
1765 } while (end != vma->vm_end); 1762 } while (end != vma->vm_end);
1766 } 1763 }
1767 1764
@@ -2365,13 +2362,7 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
2365 } 2362 }
2366 wakeup = 0; 2363 wakeup = 0;
2367 } else { 2364 } else {
2368 /* 2365 __synchronize_entity_decay(se);
2369 * Task re-woke on same cpu (or else migrate_task_rq_fair()
2370 * would have made count negative); we must be careful to avoid
2371 * double-accounting blocked time after synchronizing decays.
2372 */
2373 se->avg.last_runnable_update += __synchronize_entity_decay(se)
2374 << 20;
2375 } 2366 }
2376 2367
2377 /* migrated tasks did not contribute to our blocked load */ 2368 /* migrated tasks did not contribute to our blocked load */
@@ -3923,7 +3914,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
3923{ 3914{
3924 struct sched_entity *se = tg->se[cpu]; 3915 struct sched_entity *se = tg->se[cpu];
3925 3916
3926 if (!tg->parent || !wl) /* the trivial, non-cgroup case */ 3917 if (!tg->parent) /* the trivial, non-cgroup case */
3927 return wl; 3918 return wl;
3928 3919
3929 for_each_sched_entity(se) { 3920 for_each_sched_entity(se) {
@@ -4101,12 +4092,16 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
4101 */ 4092 */
4102static struct sched_group * 4093static struct sched_group *
4103find_idlest_group(struct sched_domain *sd, struct task_struct *p, 4094find_idlest_group(struct sched_domain *sd, struct task_struct *p,
4104 int this_cpu, int load_idx) 4095 int this_cpu, int sd_flag)
4105{ 4096{
4106 struct sched_group *idlest = NULL, *group = sd->groups; 4097 struct sched_group *idlest = NULL, *group = sd->groups;
4107 unsigned long min_load = ULONG_MAX, this_load = 0; 4098 unsigned long min_load = ULONG_MAX, this_load = 0;
4099 int load_idx = sd->forkexec_idx;
4108 int imbalance = 100 + (sd->imbalance_pct-100)/2; 4100 int imbalance = 100 + (sd->imbalance_pct-100)/2;
4109 4101
4102 if (sd_flag & SD_BALANCE_WAKE)
4103 load_idx = sd->wake_idx;
4104
4110 do { 4105 do {
4111 unsigned long load, avg_load; 4106 unsigned long load, avg_load;
4112 int local_group; 4107 int local_group;
@@ -4274,7 +4269,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
4274 } 4269 }
4275 4270
4276 while (sd) { 4271 while (sd) {
4277 int load_idx = sd->forkexec_idx;
4278 struct sched_group *group; 4272 struct sched_group *group;
4279 int weight; 4273 int weight;
4280 4274
@@ -4283,10 +4277,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
4283 continue; 4277 continue;
4284 } 4278 }
4285 4279
4286 if (sd_flag & SD_BALANCE_WAKE) 4280 group = find_idlest_group(sd, p, cpu, sd_flag);
4287 load_idx = sd->wake_idx;
4288
4289 group = find_idlest_group(sd, p, cpu, load_idx);
4290 if (!group) { 4281 if (!group) {
4291 sd = sd->child; 4282 sd = sd->child;
4292 continue; 4283 continue;
@@ -5512,7 +5503,6 @@ static inline void update_sg_lb_stats(struct lb_env *env,
5512 struct sched_group *group, int load_idx, 5503 struct sched_group *group, int load_idx,
5513 int local_group, struct sg_lb_stats *sgs) 5504 int local_group, struct sg_lb_stats *sgs)
5514{ 5505{
5515 unsigned long nr_running;
5516 unsigned long load; 5506 unsigned long load;
5517 int i; 5507 int i;
5518 5508
@@ -5521,8 +5511,6 @@ static inline void update_sg_lb_stats(struct lb_env *env,
5521 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { 5511 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
5522 struct rq *rq = cpu_rq(i); 5512 struct rq *rq = cpu_rq(i);
5523 5513
5524 nr_running = rq->nr_running;
5525
5526 /* Bias balancing toward cpus of our domain */ 5514 /* Bias balancing toward cpus of our domain */
5527 if (local_group) 5515 if (local_group)
5528 load = target_load(i, load_idx); 5516 load = target_load(i, load_idx);
@@ -5530,7 +5518,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
5530 load = source_load(i, load_idx); 5518 load = source_load(i, load_idx);
5531 5519
5532 sgs->group_load += load; 5520 sgs->group_load += load;
5533 sgs->sum_nr_running += nr_running; 5521 sgs->sum_nr_running += rq->nr_running;
5534#ifdef CONFIG_NUMA_BALANCING 5522#ifdef CONFIG_NUMA_BALANCING
5535 sgs->nr_numa_running += rq->nr_numa_running; 5523 sgs->nr_numa_running += rq->nr_numa_running;
5536 sgs->nr_preferred_running += rq->nr_preferred_running; 5524 sgs->nr_preferred_running += rq->nr_preferred_running;
@@ -6521,7 +6509,7 @@ static struct {
6521 unsigned long next_balance; /* in jiffy units */ 6509 unsigned long next_balance; /* in jiffy units */
6522} nohz ____cacheline_aligned; 6510} nohz ____cacheline_aligned;
6523 6511
6524static inline int find_new_ilb(int call_cpu) 6512static inline int find_new_ilb(void)
6525{ 6513{
6526 int ilb = cpumask_first(nohz.idle_cpus_mask); 6514 int ilb = cpumask_first(nohz.idle_cpus_mask);
6527 6515
@@ -6536,13 +6524,13 @@ static inline int find_new_ilb(int call_cpu)
6536 * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle 6524 * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
6537 * CPU (if there is one). 6525 * CPU (if there is one).
6538 */ 6526 */
6539static void nohz_balancer_kick(int cpu) 6527static void nohz_balancer_kick(void)
6540{ 6528{
6541 int ilb_cpu; 6529 int ilb_cpu;
6542 6530
6543 nohz.next_balance++; 6531 nohz.next_balance++;
6544 6532
6545 ilb_cpu = find_new_ilb(cpu); 6533 ilb_cpu = find_new_ilb();
6546 6534
6547 if (ilb_cpu >= nr_cpu_ids) 6535 if (ilb_cpu >= nr_cpu_ids)
6548 return; 6536 return;
@@ -6652,10 +6640,10 @@ void update_max_interval(void)
6652 * 6640 *
6653 * Balancing parameters are set up in init_sched_domains. 6641 * Balancing parameters are set up in init_sched_domains.
6654 */ 6642 */
6655static void rebalance_domains(int cpu, enum cpu_idle_type idle) 6643static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
6656{ 6644{
6657 int continue_balancing = 1; 6645 int continue_balancing = 1;
6658 struct rq *rq = cpu_rq(cpu); 6646 int cpu = rq->cpu;
6659 unsigned long interval; 6647 unsigned long interval;
6660 struct sched_domain *sd; 6648 struct sched_domain *sd;
6661 /* Earliest time when we have to do rebalance again */ 6649 /* Earliest time when we have to do rebalance again */
@@ -6752,9 +6740,9 @@ out:
6752 * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the 6740 * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
6753 * rebalancing for all the cpus for whom scheduler ticks are stopped. 6741 * rebalancing for all the cpus for whom scheduler ticks are stopped.
6754 */ 6742 */
6755static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) 6743static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
6756{ 6744{
6757 struct rq *this_rq = cpu_rq(this_cpu); 6745 int this_cpu = this_rq->cpu;
6758 struct rq *rq; 6746 struct rq *rq;
6759 int balance_cpu; 6747 int balance_cpu;
6760 6748
@@ -6781,7 +6769,7 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
6781 update_idle_cpu_load(rq); 6769 update_idle_cpu_load(rq);
6782 raw_spin_unlock_irq(&rq->lock); 6770 raw_spin_unlock_irq(&rq->lock);
6783 6771
6784 rebalance_domains(balance_cpu, CPU_IDLE); 6772 rebalance_domains(rq, CPU_IDLE);
6785 6773
6786 if (time_after(this_rq->next_balance, rq->next_balance)) 6774 if (time_after(this_rq->next_balance, rq->next_balance))
6787 this_rq->next_balance = rq->next_balance; 6775 this_rq->next_balance = rq->next_balance;
@@ -6800,14 +6788,14 @@ end:
6800 * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler 6788 * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
6801 * domain span are idle. 6789 * domain span are idle.
6802 */ 6790 */
6803static inline int nohz_kick_needed(struct rq *rq, int cpu) 6791static inline int nohz_kick_needed(struct rq *rq)
6804{ 6792{
6805 unsigned long now = jiffies; 6793 unsigned long now = jiffies;
6806 struct sched_domain *sd; 6794 struct sched_domain *sd;
6807 struct sched_group_power *sgp; 6795 struct sched_group_power *sgp;
6808 int nr_busy; 6796 int nr_busy, cpu = rq->cpu;
6809 6797
6810 if (unlikely(idle_cpu(cpu))) 6798 if (unlikely(rq->idle_balance))
6811 return 0; 6799 return 0;
6812 6800
6813 /* 6801 /*
@@ -6856,7 +6844,7 @@ need_kick:
6856 return 1; 6844 return 1;
6857} 6845}
6858#else 6846#else
6859static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } 6847static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
6860#endif 6848#endif
6861 6849
6862/* 6850/*
@@ -6865,38 +6853,39 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
6865 */ 6853 */
6866static void run_rebalance_domains(struct softirq_action *h) 6854static void run_rebalance_domains(struct softirq_action *h)
6867{ 6855{
6868 int this_cpu = smp_processor_id(); 6856 struct rq *this_rq = this_rq();
6869 struct rq *this_rq = cpu_rq(this_cpu);
6870 enum cpu_idle_type idle = this_rq->idle_balance ? 6857 enum cpu_idle_type idle = this_rq->idle_balance ?
6871 CPU_IDLE : CPU_NOT_IDLE; 6858 CPU_IDLE : CPU_NOT_IDLE;
6872 6859
6873 rebalance_domains(this_cpu, idle); 6860 rebalance_domains(this_rq, idle);
6874 6861
6875 /* 6862 /*
6876 * If this cpu has a pending nohz_balance_kick, then do the 6863 * If this cpu has a pending nohz_balance_kick, then do the
6877 * balancing on behalf of the other idle cpus whose ticks are 6864 * balancing on behalf of the other idle cpus whose ticks are
6878 * stopped. 6865 * stopped.
6879 */ 6866 */
6880 nohz_idle_balance(this_cpu, idle); 6867 nohz_idle_balance(this_rq, idle);
6881} 6868}
6882 6869
6883static inline int on_null_domain(int cpu) 6870static inline int on_null_domain(struct rq *rq)
6884{ 6871{
6885 return !rcu_dereference_sched(cpu_rq(cpu)->sd); 6872 return !rcu_dereference_sched(rq->sd);
6886} 6873}
6887 6874
6888/* 6875/*
6889 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. 6876 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
6890 */ 6877 */
6891void trigger_load_balance(struct rq *rq, int cpu) 6878void trigger_load_balance(struct rq *rq)
6892{ 6879{
6893 /* Don't need to rebalance while attached to NULL domain */ 6880 /* Don't need to rebalance while attached to NULL domain */
6894 if (time_after_eq(jiffies, rq->next_balance) && 6881 if (unlikely(on_null_domain(rq)))
6895 likely(!on_null_domain(cpu))) 6882 return;
6883
6884 if (time_after_eq(jiffies, rq->next_balance))
6896 raise_softirq(SCHED_SOFTIRQ); 6885 raise_softirq(SCHED_SOFTIRQ);
6897#ifdef CONFIG_NO_HZ_COMMON 6886#ifdef CONFIG_NO_HZ_COMMON
6898 if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) 6887 if (nohz_kick_needed(rq))
6899 nohz_balancer_kick(cpu); 6888 nohz_balancer_kick();
6900#endif 6889#endif
6901} 6890}
6902 6891
@@ -7012,15 +7001,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
7012 struct cfs_rq *cfs_rq = cfs_rq_of(se); 7001 struct cfs_rq *cfs_rq = cfs_rq_of(se);
7013 7002
7014 /* 7003 /*
7015 * Ensure the task's vruntime is normalized, so that when its 7004 * Ensure the task's vruntime is normalized, so that when it's
7016 * switched back to the fair class the enqueue_entity(.flags=0) will 7005 * switched back to the fair class the enqueue_entity(.flags=0) will
7017 * do the right thing. 7006 * do the right thing.
7018 * 7007 *
7019 * If it was on_rq, then the dequeue_entity(.flags=0) will already 7008 * If it's on_rq, then the dequeue_entity(.flags=0) will already
7020 * have normalized the vruntime, if it was !on_rq, then only when 7009 * have normalized the vruntime, if it's !on_rq, then only when
7021 * the task is sleeping will it still have non-normalized vruntime. 7010 * the task is sleeping will it still have non-normalized vruntime.
7022 */ 7011 */
7023 if (!se->on_rq && p->state != TASK_RUNNING) { 7012 if (!p->on_rq && p->state != TASK_RUNNING) {
7024 /* 7013 /*
7025 * Fix up our vruntime so that the current sleep doesn't 7014 * Fix up our vruntime so that the current sleep doesn't
7026 * cause 'unlimited' sleep bonus. 7015 * cause 'unlimited' sleep bonus.
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 1c4065575fa2..1999021042c7 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -538,6 +538,14 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
538 538
539#endif /* CONFIG_RT_GROUP_SCHED */ 539#endif /* CONFIG_RT_GROUP_SCHED */
540 540
541bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
542{
543 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
544
545 return (hrtimer_active(&rt_b->rt_period_timer) ||
546 rt_rq->rt_time < rt_b->rt_runtime);
547}
548
541#ifdef CONFIG_SMP 549#ifdef CONFIG_SMP
542/* 550/*
543 * We ran out of runtime, see if we can borrow some from our neighbours. 551 * We ran out of runtime, see if we can borrow some from our neighbours.
@@ -1738,7 +1746,7 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
1738 !test_tsk_need_resched(rq->curr) && 1746 !test_tsk_need_resched(rq->curr) &&
1739 has_pushable_tasks(rq) && 1747 has_pushable_tasks(rq) &&
1740 p->nr_cpus_allowed > 1 && 1748 p->nr_cpus_allowed > 1 &&
1741 rt_task(rq->curr) && 1749 (dl_task(rq->curr) || rt_task(rq->curr)) &&
1742 (rq->curr->nr_cpus_allowed < 2 || 1750 (rq->curr->nr_cpus_allowed < 2 ||
1743 rq->curr->prio <= p->prio)) 1751 rq->curr->prio <= p->prio))
1744 push_rt_tasks(rq); 1752 push_rt_tasks(rq);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 88c85b21d633..f964add50f38 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2,6 +2,7 @@
2#include <linux/sched.h> 2#include <linux/sched.h>
3#include <linux/sched/sysctl.h> 3#include <linux/sched/sysctl.h>
4#include <linux/sched/rt.h> 4#include <linux/sched/rt.h>
5#include <linux/sched/deadline.h>
5#include <linux/mutex.h> 6#include <linux/mutex.h>
6#include <linux/spinlock.h> 7#include <linux/spinlock.h>
7#include <linux/stop_machine.h> 8#include <linux/stop_machine.h>
@@ -9,6 +10,7 @@
9#include <linux/slab.h> 10#include <linux/slab.h>
10 11
11#include "cpupri.h" 12#include "cpupri.h"
13#include "cpudeadline.h"
12#include "cpuacct.h" 14#include "cpuacct.h"
13 15
14struct rq; 16struct rq;
@@ -73,6 +75,13 @@ extern void update_cpu_load_active(struct rq *this_rq);
73#define NICE_0_SHIFT SCHED_LOAD_SHIFT 75#define NICE_0_SHIFT SCHED_LOAD_SHIFT
74 76
75/* 77/*
78 * Single value that decides SCHED_DEADLINE internal math precision.
79 * 10 -> just above 1us
80 * 9 -> just above 0.5us
81 */
82#define DL_SCALE (10)
83
84/*
76 * These are the 'tuning knobs' of the scheduler: 85 * These are the 'tuning knobs' of the scheduler:
77 */ 86 */
78 87
@@ -81,11 +90,19 @@ extern void update_cpu_load_active(struct rq *this_rq);
81 */ 90 */
82#define RUNTIME_INF ((u64)~0ULL) 91#define RUNTIME_INF ((u64)~0ULL)
83 92
93static inline int fair_policy(int policy)
94{
95 return policy == SCHED_NORMAL || policy == SCHED_BATCH;
96}
97
84static inline int rt_policy(int policy) 98static inline int rt_policy(int policy)
85{ 99{
86 if (policy == SCHED_FIFO || policy == SCHED_RR) 100 return policy == SCHED_FIFO || policy == SCHED_RR;
87 return 1; 101}
88 return 0; 102
103static inline int dl_policy(int policy)
104{
105 return policy == SCHED_DEADLINE;
89} 106}
90 107
91static inline int task_has_rt_policy(struct task_struct *p) 108static inline int task_has_rt_policy(struct task_struct *p)
@@ -93,6 +110,25 @@ static inline int task_has_rt_policy(struct task_struct *p)
93 return rt_policy(p->policy); 110 return rt_policy(p->policy);
94} 111}
95 112
113static inline int task_has_dl_policy(struct task_struct *p)
114{
115 return dl_policy(p->policy);
116}
117
118static inline bool dl_time_before(u64 a, u64 b)
119{
120 return (s64)(a - b) < 0;
121}
122
123/*
124 * Tells if entity @a should preempt entity @b.
125 */
126static inline bool
127dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b)
128{
129 return dl_time_before(a->deadline, b->deadline);
130}
131
96/* 132/*
97 * This is the priority-queue data structure of the RT scheduling class: 133 * This is the priority-queue data structure of the RT scheduling class:
98 */ 134 */
@@ -108,6 +144,47 @@ struct rt_bandwidth {
108 u64 rt_runtime; 144 u64 rt_runtime;
109 struct hrtimer rt_period_timer; 145 struct hrtimer rt_period_timer;
110}; 146};
147/*
148 * To keep the bandwidth of -deadline tasks and groups under control
149 * we need some place where:
150 * - store the maximum -deadline bandwidth of the system (the group);
151 * - cache the fraction of that bandwidth that is currently allocated.
152 *
153 * This is all done in the data structure below. It is similar to the
154 * one used for RT-throttling (rt_bandwidth), with the main difference
155 * that, since here we are only interested in admission control, we
 156 * do not decrease any runtime while the group "executes", nor do we
 157 * need a timer to replenish it.
158 *
159 * With respect to SMP, the bandwidth is given on a per-CPU basis,
160 * meaning that:
161 * - dl_bw (< 100%) is the bandwidth of the system (group) on each CPU;
 162 * - dl_total_bw array contains, in the i-th element, the currently
 163 * allocated bandwidth on the i-th CPU.
164 * Moreover, groups consume bandwidth on each CPU, while tasks only
165 * consume bandwidth on the CPU they're running on.
166 * Finally, dl_total_bw_cpu is used to cache the index of dl_total_bw
 167 * that will be shown the next time the proc or cgroup controls are
 168 * read. It, in turn, can be changed by writing to its own
 169 * control.
170 */
171struct dl_bandwidth {
172 raw_spinlock_t dl_runtime_lock;
173 u64 dl_runtime;
174 u64 dl_period;
175};
176
177static inline int dl_bandwidth_enabled(void)
178{
179 return sysctl_sched_rt_runtime >= 0;
180}
181
182extern struct dl_bw *dl_bw_of(int i);
183
184struct dl_bw {
185 raw_spinlock_t lock;
186 u64 bw, total_bw;
187};
111 188
112extern struct mutex sched_domains_mutex; 189extern struct mutex sched_domains_mutex;
113 190
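
struct dl_bw above is the whole admission-control state: bw is the per-CPU cap and total_bw is the bandwidth already handed out, with the ratio coming from the to_ratio() helper declared later in this header. A rough user-space sketch of that check; the fixed-point scale and the exact acceptance rule here are assumptions for illustration, not the kernel's definitive logic:

#include <stdio.h>
#include <stdint.h>

#define BW_SHIFT 20	/* assumed fixed-point scale for the utilization ratio */

/* utilization = runtime / period, scaled to an integer */
static uint64_t to_ratio(uint64_t period, uint64_t runtime)
{
	return (runtime << BW_SHIFT) / period;
}

struct dl_bw {
	uint64_t bw;		/* per-CPU cap (e.g. 95% of a CPU)  */
	uint64_t total_bw;	/* bandwidth already admitted       */
};

/* Admit the task only if the new total stays under the cap. */
static int dl_admit(struct dl_bw *b, uint64_t period_ns, uint64_t runtime_ns)
{
	uint64_t new_bw = to_ratio(period_ns, runtime_ns);

	if (b->total_bw + new_bw > b->bw)
		return -1;			/* would overcommit: reject */
	b->total_bw += new_bw;
	return 0;
}

int main(void)
{
	struct dl_bw b = { .bw = to_ratio(100, 95), .total_bw = 0 };

	printf("%d\n", dl_admit(&b, 100000000, 30000000));	/* 30% total: accepted  */
	printf("%d\n", dl_admit(&b, 100000000, 50000000));	/* 80% total: accepted  */
	printf("%d\n", dl_admit(&b, 100000000, 30000000));	/* 110% total: rejected */
	return 0;
}
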
@@ -364,6 +441,41 @@ struct rt_rq {
364#endif 441#endif
365}; 442};
366 443
444/* Deadline class' related fields in a runqueue */
445struct dl_rq {
446 /* runqueue is an rbtree, ordered by deadline */
447 struct rb_root rb_root;
448 struct rb_node *rb_leftmost;
449
450 unsigned long dl_nr_running;
451
452#ifdef CONFIG_SMP
453 /*
454 * Deadline values of the currently executing and the
455 * earliest ready task on this rq. Caching these facilitates
 456 * the decision whether or not a ready but not running task
457 * should migrate somewhere else.
458 */
459 struct {
460 u64 curr;
461 u64 next;
462 } earliest_dl;
463
464 unsigned long dl_nr_migratory;
465 int overloaded;
466
467 /*
468 * Tasks on this rq that can be pushed away. They are kept in
469 * an rb-tree, ordered by tasks' deadlines, with caching
470 * of the leftmost (earliest deadline) element.
471 */
472 struct rb_root pushable_dl_tasks_root;
473 struct rb_node *pushable_dl_tasks_leftmost;
474#else
475 struct dl_bw dl_bw;
476#endif
477};
478
367#ifdef CONFIG_SMP 479#ifdef CONFIG_SMP
368 480
369/* 481/*
@@ -382,6 +494,15 @@ struct root_domain {
382 cpumask_var_t online; 494 cpumask_var_t online;
383 495
384 /* 496 /*
 497 * The bit corresponding to a CPU gets set here if that CPU has more
 498 * than one runnable -deadline task (as is done below for RT tasks).
499 */
500 cpumask_var_t dlo_mask;
501 atomic_t dlo_count;
502 struct dl_bw dl_bw;
503 struct cpudl cpudl;
504
505 /*
385 * The "RT overload" flag: it gets set if a CPU has more than 506 * The "RT overload" flag: it gets set if a CPU has more than
386 * one runnable RT task. 507 * one runnable RT task.
387 */ 508 */
@@ -432,6 +553,7 @@ struct rq {
432 553
433 struct cfs_rq cfs; 554 struct cfs_rq cfs;
434 struct rt_rq rt; 555 struct rt_rq rt;
556 struct dl_rq dl;
435 557
436#ifdef CONFIG_FAIR_GROUP_SCHED 558#ifdef CONFIG_FAIR_GROUP_SCHED
437 /* list of leaf cfs_rq on this cpu: */ 559 /* list of leaf cfs_rq on this cpu: */
@@ -827,8 +949,6 @@ static inline u64 global_rt_runtime(void)
827 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; 949 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
828} 950}
829 951
830
831
832static inline int task_current(struct rq *rq, struct task_struct *p) 952static inline int task_current(struct rq *rq, struct task_struct *p)
833{ 953{
834 return rq->curr == p; 954 return rq->curr == p;
@@ -988,6 +1108,7 @@ static const u32 prio_to_wmult[40] = {
988#else 1108#else
989#define ENQUEUE_WAKING 0 1109#define ENQUEUE_WAKING 0
990#endif 1110#endif
1111#define ENQUEUE_REPLENISH 8
991 1112
992#define DEQUEUE_SLEEP 1 1113#define DEQUEUE_SLEEP 1
993 1114
@@ -1023,6 +1144,7 @@ struct sched_class {
1023 void (*set_curr_task) (struct rq *rq); 1144 void (*set_curr_task) (struct rq *rq);
1024 void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); 1145 void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
1025 void (*task_fork) (struct task_struct *p); 1146 void (*task_fork) (struct task_struct *p);
1147 void (*task_dead) (struct task_struct *p);
1026 1148
1027 void (*switched_from) (struct rq *this_rq, struct task_struct *task); 1149 void (*switched_from) (struct rq *this_rq, struct task_struct *task);
1028 void (*switched_to) (struct rq *this_rq, struct task_struct *task); 1150 void (*switched_to) (struct rq *this_rq, struct task_struct *task);
@@ -1042,6 +1164,7 @@ struct sched_class {
1042 for (class = sched_class_highest; class; class = class->next) 1164 for (class = sched_class_highest; class; class = class->next)
1043 1165
1044extern const struct sched_class stop_sched_class; 1166extern const struct sched_class stop_sched_class;
1167extern const struct sched_class dl_sched_class;
1045extern const struct sched_class rt_sched_class; 1168extern const struct sched_class rt_sched_class;
1046extern const struct sched_class fair_sched_class; 1169extern const struct sched_class fair_sched_class;
1047extern const struct sched_class idle_sched_class; 1170extern const struct sched_class idle_sched_class;
@@ -1051,7 +1174,7 @@ extern const struct sched_class idle_sched_class;
1051 1174
1052extern void update_group_power(struct sched_domain *sd, int cpu); 1175extern void update_group_power(struct sched_domain *sd, int cpu);
1053 1176
1054extern void trigger_load_balance(struct rq *rq, int cpu); 1177extern void trigger_load_balance(struct rq *rq);
1055extern void idle_balance(int this_cpu, struct rq *this_rq); 1178extern void idle_balance(int this_cpu, struct rq *this_rq);
1056 1179
1057extern void idle_enter_fair(struct rq *this_rq); 1180extern void idle_enter_fair(struct rq *this_rq);
@@ -1068,8 +1191,11 @@ static inline void idle_balance(int cpu, struct rq *rq)
1068extern void sysrq_sched_debug_show(void); 1191extern void sysrq_sched_debug_show(void);
1069extern void sched_init_granularity(void); 1192extern void sched_init_granularity(void);
1070extern void update_max_interval(void); 1193extern void update_max_interval(void);
1194
1195extern void init_sched_dl_class(void);
1071extern void init_sched_rt_class(void); 1196extern void init_sched_rt_class(void);
1072extern void init_sched_fair_class(void); 1197extern void init_sched_fair_class(void);
1198extern void init_sched_dl_class(void);
1073 1199
1074extern void resched_task(struct task_struct *p); 1200extern void resched_task(struct task_struct *p);
1075extern void resched_cpu(int cpu); 1201extern void resched_cpu(int cpu);
@@ -1077,6 +1203,12 @@ extern void resched_cpu(int cpu);
1077extern struct rt_bandwidth def_rt_bandwidth; 1203extern struct rt_bandwidth def_rt_bandwidth;
1078extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); 1204extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
1079 1205
1206extern struct dl_bandwidth def_dl_bandwidth;
1207extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);
1208extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
1209
1210unsigned long to_ratio(u64 period, u64 runtime);
1211
1080extern void update_idle_cpu_load(struct rq *this_rq); 1212extern void update_idle_cpu_load(struct rq *this_rq);
1081 1213
1082extern void init_task_runnable_average(struct task_struct *p); 1214extern void init_task_runnable_average(struct task_struct *p);
@@ -1353,6 +1485,7 @@ extern void print_rt_stats(struct seq_file *m, int cpu);
1353 1485
1354extern void init_cfs_rq(struct cfs_rq *cfs_rq); 1486extern void init_cfs_rq(struct cfs_rq *cfs_rq);
1355extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); 1487extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
1488extern void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq);
1356 1489
1357extern void cfs_bandwidth_usage_inc(void); 1490extern void cfs_bandwidth_usage_inc(void);
1358extern void cfs_bandwidth_usage_dec(void); 1491extern void cfs_bandwidth_usage_dec(void);
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 47197de8abd9..fdb6bb0b3356 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -103,7 +103,7 @@ get_rr_interval_stop(struct rq *rq, struct task_struct *task)
103 * Simple, special scheduling class for the per-CPU stop tasks: 103 * Simple, special scheduling class for the per-CPU stop tasks:
104 */ 104 */
105const struct sched_class stop_sched_class = { 105const struct sched_class stop_sched_class = {
106 .next = &rt_sched_class, 106 .next = &dl_sched_class,
107 107
108 .enqueue_task = enqueue_task_stop, 108 .enqueue_task = enqueue_task_stop,
109 .dequeue_task = dequeue_task_stop, 109 .dequeue_task = dequeue_task_stop,
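
Pointing stop_sched_class.next at dl_sched_class slots the deadline class between the stop and RT classes, so the core's class walk picks in the order stop, deadline, RT, fair, idle (the tail of that chain is not part of this hunk). A toy model of that "first class with runnable work wins" walk; the has_runnable() hook and the class instances are invented for illustration:

#include <stdio.h>
#include <stddef.h>

struct sched_class {
	const char *name;
	const struct sched_class *next;
	int (*has_runnable)(void);	/* stand-in for pick_next_task() */
};

static int no(void)  { return 0; }
static int yes(void) { return 1; }

/* Chain mirrors the diff: stop -> dl -> rt -> fair -> idle. */
static const struct sched_class idle_class = { "idle", NULL,        yes };
static const struct sched_class fair_class = { "fair", &idle_class, no  };
static const struct sched_class rt_class   = { "rt",   &fair_class, no  };
static const struct sched_class dl_class   = { "dl",   &rt_class,   yes };
static const struct sched_class stop_class = { "stop", &dl_class,   no  };

int main(void)
{
	const struct sched_class *class;

	for (class = &stop_class; class; class = class->next) {
		if (class->has_runnable()) {
			printf("picked from %s\n", class->name);	/* "dl" */
			break;
		}
	}
	return 0;
}
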
diff --git a/kernel/signal.c b/kernel/signal.c
index 940b30ee9a30..52f881db1ca0 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2047,8 +2047,8 @@ static bool do_signal_stop(int signr)
2047 if (task_set_jobctl_pending(current, signr | gstop)) 2047 if (task_set_jobctl_pending(current, signr | gstop))
2048 sig->group_stop_count++; 2048 sig->group_stop_count++;
2049 2049
2050 for (t = next_thread(current); t != current; 2050 t = current;
2051 t = next_thread(t)) { 2051 while_each_thread(current, t) {
2052 /* 2052 /*
2053 * Setting state to TASK_STOPPED for a group 2053 * Setting state to TASK_STOPPED for a group
2054 * stop is always done with the siglock held, 2054 * stop is always done with the siglock held,
@@ -3125,8 +3125,7 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
3125 rm_from_queue_full(&mask, &t->signal->shared_pending); 3125 rm_from_queue_full(&mask, &t->signal->shared_pending);
3126 do { 3126 do {
3127 rm_from_queue_full(&mask, &t->pending); 3127 rm_from_queue_full(&mask, &t->pending);
3128 t = next_thread(t); 3128 } while_each_thread(current, t);
3129 } while (t != current);
3130 } 3129 }
3131 } 3130 }
3132 3131
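
Both signal.c hunks replace an open-coded next_thread() loop with while_each_thread(), keeping the same do/while shape but hiding the "stop when we are back at the starting task" test. A stand-alone sketch of that iteration pattern over a circular thread list; the list plumbing here is invented, only the loop shape matches:

#include <stdio.h>

struct task {
	int pid;
	struct task *next;	/* circular list of threads in the group */
};

/* roughly the shape while_each_thread(g, t) expands to */
#define while_each_thread(g, t) while ((t = (t)->next) != (g))

int main(void)
{
	struct task t3 = { 103, NULL }, t2 = { 102, &t3 }, t1 = { 101, &t2 };
	struct task *p = &t1, *t;

	t3.next = &t1;		/* close the circle */

	t = p;
	do {
		printf("visiting %d\n", t->pid);	/* 101, 102, 103 */
	} while_each_thread(p, t);

	return 0;
}
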
diff --git a/kernel/smp.c b/kernel/smp.c
index bd9f94028838..ffee35bef179 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -23,17 +23,11 @@ enum {
23struct call_function_data { 23struct call_function_data {
24 struct call_single_data __percpu *csd; 24 struct call_single_data __percpu *csd;
25 cpumask_var_t cpumask; 25 cpumask_var_t cpumask;
26 cpumask_var_t cpumask_ipi;
27}; 26};
28 27
29static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data); 28static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data);
30 29
31struct call_single_queue { 30static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue);
32 struct list_head list;
33 raw_spinlock_t lock;
34};
35
36static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_queue, call_single_queue);
37 31
38static int 32static int
39hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) 33hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
@@ -47,14 +41,8 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
47 if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, 41 if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
48 cpu_to_node(cpu))) 42 cpu_to_node(cpu)))
49 return notifier_from_errno(-ENOMEM); 43 return notifier_from_errno(-ENOMEM);
50 if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL,
51 cpu_to_node(cpu))) {
52 free_cpumask_var(cfd->cpumask);
53 return notifier_from_errno(-ENOMEM);
54 }
55 cfd->csd = alloc_percpu(struct call_single_data); 44 cfd->csd = alloc_percpu(struct call_single_data);
56 if (!cfd->csd) { 45 if (!cfd->csd) {
57 free_cpumask_var(cfd->cpumask_ipi);
58 free_cpumask_var(cfd->cpumask); 46 free_cpumask_var(cfd->cpumask);
59 return notifier_from_errno(-ENOMEM); 47 return notifier_from_errno(-ENOMEM);
60 } 48 }
@@ -67,7 +55,6 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
67 case CPU_DEAD: 55 case CPU_DEAD:
68 case CPU_DEAD_FROZEN: 56 case CPU_DEAD_FROZEN:
69 free_cpumask_var(cfd->cpumask); 57 free_cpumask_var(cfd->cpumask);
70 free_cpumask_var(cfd->cpumask_ipi);
71 free_percpu(cfd->csd); 58 free_percpu(cfd->csd);
72 break; 59 break;
73#endif 60#endif
@@ -85,12 +72,8 @@ void __init call_function_init(void)
85 void *cpu = (void *)(long)smp_processor_id(); 72 void *cpu = (void *)(long)smp_processor_id();
86 int i; 73 int i;
87 74
88 for_each_possible_cpu(i) { 75 for_each_possible_cpu(i)
89 struct call_single_queue *q = &per_cpu(call_single_queue, i); 76 init_llist_head(&per_cpu(call_single_queue, i));
90
91 raw_spin_lock_init(&q->lock);
92 INIT_LIST_HEAD(&q->list);
93 }
94 77
95 hotplug_cfd(&hotplug_cfd_notifier, CPU_UP_PREPARE, cpu); 78 hotplug_cfd(&hotplug_cfd_notifier, CPU_UP_PREPARE, cpu);
96 register_cpu_notifier(&hotplug_cfd_notifier); 79 register_cpu_notifier(&hotplug_cfd_notifier);
@@ -141,18 +124,9 @@ static void csd_unlock(struct call_single_data *csd)
141 */ 124 */
142static void generic_exec_single(int cpu, struct call_single_data *csd, int wait) 125static void generic_exec_single(int cpu, struct call_single_data *csd, int wait)
143{ 126{
144 struct call_single_queue *dst = &per_cpu(call_single_queue, cpu);
145 unsigned long flags;
146 int ipi;
147
148 if (wait) 127 if (wait)
149 csd->flags |= CSD_FLAG_WAIT; 128 csd->flags |= CSD_FLAG_WAIT;
150 129
151 raw_spin_lock_irqsave(&dst->lock, flags);
152 ipi = list_empty(&dst->list);
153 list_add_tail(&csd->list, &dst->list);
154 raw_spin_unlock_irqrestore(&dst->lock, flags);
155
156 /* 130 /*
157 * The list addition should be visible before sending the IPI 131 * The list addition should be visible before sending the IPI
158 * handler locks the list to pull the entry off it because of 132 * handler locks the list to pull the entry off it because of
@@ -164,7 +138,7 @@ static void generic_exec_single(int cpu, struct call_single_data *csd, int wait)
164 * locking and barrier primitives. Generic code isn't really 138 * locking and barrier primitives. Generic code isn't really
165 * equipped to do the right thing... 139 * equipped to do the right thing...
166 */ 140 */
167 if (ipi) 141 if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)))
168 arch_send_call_function_single_ipi(cpu); 142 arch_send_call_function_single_ipi(cpu);
169 143
170 if (wait) 144 if (wait)
@@ -177,27 +151,26 @@ static void generic_exec_single(int cpu, struct call_single_data *csd, int wait)
177 */ 151 */
178void generic_smp_call_function_single_interrupt(void) 152void generic_smp_call_function_single_interrupt(void)
179{ 153{
180 struct call_single_queue *q = &__get_cpu_var(call_single_queue); 154 struct llist_node *entry, *next;
181 LIST_HEAD(list);
182 155
183 /* 156 /*
184 * Shouldn't receive this interrupt on a cpu that is not yet online. 157 * Shouldn't receive this interrupt on a cpu that is not yet online.
185 */ 158 */
186 WARN_ON_ONCE(!cpu_online(smp_processor_id())); 159 WARN_ON_ONCE(!cpu_online(smp_processor_id()));
187 160
188 raw_spin_lock(&q->lock); 161 entry = llist_del_all(&__get_cpu_var(call_single_queue));
189 list_replace_init(&q->list, &list); 162 entry = llist_reverse_order(entry);
190 raw_spin_unlock(&q->lock);
191 163
192 while (!list_empty(&list)) { 164 while (entry) {
193 struct call_single_data *csd; 165 struct call_single_data *csd;
194 166
195 csd = list_entry(list.next, struct call_single_data, list); 167 next = entry->next;
196 list_del(&csd->list);
197 168
169 csd = llist_entry(entry, struct call_single_data, llist);
198 csd->func(csd->info); 170 csd->func(csd->info);
199
200 csd_unlock(csd); 171 csd_unlock(csd);
172
173 entry = next;
201 } 174 }
202} 175}
203 176
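
The single-call path now queues work on a per-CPU llist: llist_add() reports whether the list was empty beforehand, and only that first producer sends the IPI, while the interrupt handler drains everything at once with llist_del_all() and reverses it to restore FIFO order (the many-CPU path in the next hunk queues entries the same way). A user-space sketch of that "only the first producer kicks the consumer" pattern, using a plain linked stack as a stand-in for the llist API:

#include <stdio.h>
#include <stddef.h>

struct node { struct node *next; int val; };

struct queue { struct node *head; };

/* llist_add() analogue: push and report whether the list was empty before. */
static int push(struct queue *q, struct node *n)
{
	n->next = q->head;
	q->head = n;
	return n->next == NULL;
}

/* llist_del_all() analogue: detach the whole list in one go. */
static struct node *pop_all(struct queue *q)
{
	struct node *all = q->head;

	q->head = NULL;
	return all;
}

int main(void)
{
	struct queue q = { NULL };
	struct node a = { NULL, 1 }, b = { NULL, 2 }, c = { NULL, 3 };

	/* Only the push that finds the list empty would send the IPI. */
	printf("send ipi? %d\n", push(&q, &a));	/* 1 */
	printf("send ipi? %d\n", push(&q, &b));	/* 0 */
	printf("send ipi? %d\n", push(&q, &c));	/* 0 */

	/* "Interrupt handler": grab everything, then walk it (LIFO here; the kernel reverses first). */
	for (struct node *n = pop_all(&q); n; n = n->next)
		printf("handling %d\n", n->val);	/* 3, 2, 1 */

	return 0;
}
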
@@ -402,30 +375,17 @@ void smp_call_function_many(const struct cpumask *mask,
402 if (unlikely(!cpumask_weight(cfd->cpumask))) 375 if (unlikely(!cpumask_weight(cfd->cpumask)))
403 return; 376 return;
404 377
405 /*
406 * After we put an entry into the list, cfd->cpumask may be cleared
407 * again when another CPU sends another IPI for a SMP function call, so
408 * cfd->cpumask will be zero.
409 */
410 cpumask_copy(cfd->cpumask_ipi, cfd->cpumask);
411
412 for_each_cpu(cpu, cfd->cpumask) { 378 for_each_cpu(cpu, cfd->cpumask) {
413 struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu); 379 struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu);
414 struct call_single_queue *dst =
415 &per_cpu(call_single_queue, cpu);
416 unsigned long flags;
417 380
418 csd_lock(csd); 381 csd_lock(csd);
419 csd->func = func; 382 csd->func = func;
420 csd->info = info; 383 csd->info = info;
421 384 llist_add(&csd->llist, &per_cpu(call_single_queue, cpu));
422 raw_spin_lock_irqsave(&dst->lock, flags);
423 list_add_tail(&csd->list, &dst->list);
424 raw_spin_unlock_irqrestore(&dst->lock, flags);
425 } 385 }
426 386
427 /* Send a message to all CPUs in the map */ 387 /* Send a message to all CPUs in the map */
428 arch_send_call_function_ipi_mask(cfd->cpumask_ipi); 388 arch_send_call_function_ipi_mask(cfd->cpumask);
429 389
430 if (wait) { 390 if (wait) {
431 for_each_cpu(cpu, cfd->cpumask) { 391 for_each_cpu(cpu, cfd->cpumask) {
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 11025ccc06dd..490fcbb1dc5b 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -8,6 +8,8 @@
8 * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) 8 * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903)
9 */ 9 */
10 10
11#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
12
11#include <linux/export.h> 13#include <linux/export.h>
12#include <linux/kernel_stat.h> 14#include <linux/kernel_stat.h>
13#include <linux/interrupt.h> 15#include <linux/interrupt.h>
@@ -54,7 +56,7 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp
54 56
55DEFINE_PER_CPU(struct task_struct *, ksoftirqd); 57DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
56 58
57char *softirq_to_name[NR_SOFTIRQS] = { 59const char * const softirq_to_name[NR_SOFTIRQS] = {
58 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", 60 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL",
59 "TASKLET", "SCHED", "HRTIMER", "RCU" 61 "TASKLET", "SCHED", "HRTIMER", "RCU"
60}; 62};
@@ -89,7 +91,7 @@ static void wakeup_softirqd(void)
89 * where hardirqs are disabled legitimately: 91 * where hardirqs are disabled legitimately:
90 */ 92 */
91#ifdef CONFIG_TRACE_IRQFLAGS 93#ifdef CONFIG_TRACE_IRQFLAGS
92static void __local_bh_disable(unsigned long ip, unsigned int cnt) 94void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
93{ 95{
94 unsigned long flags; 96 unsigned long flags;
95 97
@@ -107,33 +109,21 @@ static void __local_bh_disable(unsigned long ip, unsigned int cnt)
107 /* 109 /*
108 * Were softirqs turned off above: 110 * Were softirqs turned off above:
109 */ 111 */
110 if (softirq_count() == cnt) 112 if (softirq_count() == (cnt & SOFTIRQ_MASK))
111 trace_softirqs_off(ip); 113 trace_softirqs_off(ip);
112 raw_local_irq_restore(flags); 114 raw_local_irq_restore(flags);
113 115
114 if (preempt_count() == cnt) 116 if (preempt_count() == cnt)
115 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 117 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
116} 118}
117#else /* !CONFIG_TRACE_IRQFLAGS */ 119EXPORT_SYMBOL(__local_bh_disable_ip);
118static inline void __local_bh_disable(unsigned long ip, unsigned int cnt)
119{
120 preempt_count_add(cnt);
121 barrier();
122}
123#endif /* CONFIG_TRACE_IRQFLAGS */ 120#endif /* CONFIG_TRACE_IRQFLAGS */
124 121
125void local_bh_disable(void)
126{
127 __local_bh_disable(_RET_IP_, SOFTIRQ_DISABLE_OFFSET);
128}
129
130EXPORT_SYMBOL(local_bh_disable);
131
132static void __local_bh_enable(unsigned int cnt) 122static void __local_bh_enable(unsigned int cnt)
133{ 123{
134 WARN_ON_ONCE(!irqs_disabled()); 124 WARN_ON_ONCE(!irqs_disabled());
135 125
136 if (softirq_count() == cnt) 126 if (softirq_count() == (cnt & SOFTIRQ_MASK))
137 trace_softirqs_on(_RET_IP_); 127 trace_softirqs_on(_RET_IP_);
138 preempt_count_sub(cnt); 128 preempt_count_sub(cnt);
139} 129}
@@ -148,10 +138,9 @@ void _local_bh_enable(void)
148 WARN_ON_ONCE(in_irq()); 138 WARN_ON_ONCE(in_irq());
149 __local_bh_enable(SOFTIRQ_DISABLE_OFFSET); 139 __local_bh_enable(SOFTIRQ_DISABLE_OFFSET);
150} 140}
151
152EXPORT_SYMBOL(_local_bh_enable); 141EXPORT_SYMBOL(_local_bh_enable);
153 142
154static inline void _local_bh_enable_ip(unsigned long ip) 143void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
155{ 144{
156 WARN_ON_ONCE(in_irq() || irqs_disabled()); 145 WARN_ON_ONCE(in_irq() || irqs_disabled());
157#ifdef CONFIG_TRACE_IRQFLAGS 146#ifdef CONFIG_TRACE_IRQFLAGS
@@ -165,8 +154,8 @@ static inline void _local_bh_enable_ip(unsigned long ip)
165 /* 154 /*
166 * Keep preemption disabled until we are done with 155 * Keep preemption disabled until we are done with
167 * softirq processing: 156 * softirq processing:
168 */ 157 */
169 preempt_count_sub(SOFTIRQ_DISABLE_OFFSET - 1); 158 preempt_count_sub(cnt - 1);
170 159
171 if (unlikely(!in_interrupt() && local_softirq_pending())) { 160 if (unlikely(!in_interrupt() && local_softirq_pending())) {
172 /* 161 /*
@@ -182,18 +171,7 @@ static inline void _local_bh_enable_ip(unsigned long ip)
182#endif 171#endif
183 preempt_check_resched(); 172 preempt_check_resched();
184} 173}
185 174EXPORT_SYMBOL(__local_bh_enable_ip);
186void local_bh_enable(void)
187{
188 _local_bh_enable_ip(_RET_IP_);
189}
190EXPORT_SYMBOL(local_bh_enable);
191
192void local_bh_enable_ip(unsigned long ip)
193{
194 _local_bh_enable_ip(ip);
195}
196EXPORT_SYMBOL(local_bh_enable_ip);
197 175
198/* 176/*
199 * We restart softirq processing for at most MAX_SOFTIRQ_RESTART times, 177 * We restart softirq processing for at most MAX_SOFTIRQ_RESTART times,
@@ -211,14 +189,49 @@ EXPORT_SYMBOL(local_bh_enable_ip);
211#define MAX_SOFTIRQ_TIME msecs_to_jiffies(2) 189#define MAX_SOFTIRQ_TIME msecs_to_jiffies(2)
212#define MAX_SOFTIRQ_RESTART 10 190#define MAX_SOFTIRQ_RESTART 10
213 191
192#ifdef CONFIG_TRACE_IRQFLAGS
193/*
194 * When we run softirqs from irq_exit() and thus on the hardirq stack we need
195 * to keep the lockdep irq context tracking as tight as possible in order to
 196 * not mis-qualify lock contexts and miss possible deadlocks.
197 */
198
199static inline bool lockdep_softirq_start(void)
200{
201 bool in_hardirq = false;
202
203 if (trace_hardirq_context(current)) {
204 in_hardirq = true;
205 trace_hardirq_exit();
206 }
207
208 lockdep_softirq_enter();
209
210 return in_hardirq;
211}
212
213static inline void lockdep_softirq_end(bool in_hardirq)
214{
215 lockdep_softirq_exit();
216
217 if (in_hardirq)
218 trace_hardirq_enter();
219}
220#else
221static inline bool lockdep_softirq_start(void) { return false; }
222static inline void lockdep_softirq_end(bool in_hardirq) { }
223#endif
224
214asmlinkage void __do_softirq(void) 225asmlinkage void __do_softirq(void)
215{ 226{
216 struct softirq_action *h;
217 __u32 pending;
218 unsigned long end = jiffies + MAX_SOFTIRQ_TIME; 227 unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
219 int cpu;
220 unsigned long old_flags = current->flags; 228 unsigned long old_flags = current->flags;
221 int max_restart = MAX_SOFTIRQ_RESTART; 229 int max_restart = MAX_SOFTIRQ_RESTART;
230 struct softirq_action *h;
231 bool in_hardirq;
232 __u32 pending;
233 int softirq_bit;
234 int cpu;
222 235
223 /* 236 /*
 224 * Mask out PF_MEMALLOC as current task context is borrowed for the 237
@@ -230,8 +243,8 @@ asmlinkage void __do_softirq(void)
230 pending = local_softirq_pending(); 243 pending = local_softirq_pending();
231 account_irq_enter_time(current); 244 account_irq_enter_time(current);
232 245
233 __local_bh_disable(_RET_IP_, SOFTIRQ_OFFSET); 246 __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET);
234 lockdep_softirq_enter(); 247 in_hardirq = lockdep_softirq_start();
235 248
236 cpu = smp_processor_id(); 249 cpu = smp_processor_id();
237restart: 250restart:
@@ -242,30 +255,30 @@ restart:
242 255
243 h = softirq_vec; 256 h = softirq_vec;
244 257
245 do { 258 while ((softirq_bit = ffs(pending))) {
246 if (pending & 1) { 259 unsigned int vec_nr;
247 unsigned int vec_nr = h - softirq_vec; 260 int prev_count;
248 int prev_count = preempt_count(); 261
249 262 h += softirq_bit - 1;
250 kstat_incr_softirqs_this_cpu(vec_nr); 263
251 264 vec_nr = h - softirq_vec;
252 trace_softirq_entry(vec_nr); 265 prev_count = preempt_count();
253 h->action(h); 266
254 trace_softirq_exit(vec_nr); 267 kstat_incr_softirqs_this_cpu(vec_nr);
255 if (unlikely(prev_count != preempt_count())) {
256 printk(KERN_ERR "huh, entered softirq %u %s %p"
257 "with preempt_count %08x,"
258 " exited with %08x?\n", vec_nr,
259 softirq_to_name[vec_nr], h->action,
260 prev_count, preempt_count());
261 preempt_count_set(prev_count);
262 }
263 268
264 rcu_bh_qs(cpu); 269 trace_softirq_entry(vec_nr);
270 h->action(h);
271 trace_softirq_exit(vec_nr);
272 if (unlikely(prev_count != preempt_count())) {
273 pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
274 vec_nr, softirq_to_name[vec_nr], h->action,
275 prev_count, preempt_count());
276 preempt_count_set(prev_count);
265 } 277 }
278 rcu_bh_qs(cpu);
266 h++; 279 h++;
267 pending >>= 1; 280 pending >>= softirq_bit;
268 } while (pending); 281 }
269 282
270 local_irq_disable(); 283 local_irq_disable();
271 284
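
The rewritten loop no longer tests the pending mask bit by bit; ffs() jumps straight to the next set bit, and both the vector pointer and the mask advance by that amount. The same skip-ahead arithmetic as a stand-alone program (vector numbers only, no softirq machinery):

#include <stdio.h>
#include <strings.h>	/* ffs() */

int main(void)
{
	unsigned int pending = 0x29;	/* vectors 0, 3 and 5 pending */
	unsigned int vec = 0;		/* mirrors the h pointer walking softirq_vec */
	int bit;

	while ((bit = ffs(pending))) {
		vec += bit - 1;			/* index of the next pending vector */
		printf("servicing vector %u\n", vec);
		vec += 1;			/* h++ in the kernel loop */
		pending >>= bit;		/* drop the bits we just consumed */
	}
	return 0;
}
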
@@ -278,16 +291,13 @@ restart:
278 wakeup_softirqd(); 291 wakeup_softirqd();
279 } 292 }
280 293
281 lockdep_softirq_exit(); 294 lockdep_softirq_end(in_hardirq);
282
283 account_irq_exit_time(current); 295 account_irq_exit_time(current);
284 __local_bh_enable(SOFTIRQ_OFFSET); 296 __local_bh_enable(SOFTIRQ_OFFSET);
285 WARN_ON_ONCE(in_interrupt()); 297 WARN_ON_ONCE(in_interrupt());
286 tsk_restore_flags(current, old_flags, PF_MEMALLOC); 298 tsk_restore_flags(current, old_flags, PF_MEMALLOC);
287} 299}
288 300
289
290
291asmlinkage void do_softirq(void) 301asmlinkage void do_softirq(void)
292{ 302{
293 __u32 pending; 303 __u32 pending;
@@ -311,8 +321,6 @@ asmlinkage void do_softirq(void)
311 */ 321 */
312void irq_enter(void) 322void irq_enter(void)
313{ 323{
314 int cpu = smp_processor_id();
315
316 rcu_irq_enter(); 324 rcu_irq_enter();
317 if (is_idle_task(current) && !in_interrupt()) { 325 if (is_idle_task(current) && !in_interrupt()) {
318 /* 326 /*
@@ -320,7 +328,7 @@ void irq_enter(void)
320 * here, as softirq will be serviced on return from interrupt. 328 * here, as softirq will be serviced on return from interrupt.
321 */ 329 */
322 local_bh_disable(); 330 local_bh_disable();
323 tick_check_idle(cpu); 331 tick_irq_enter();
324 _local_bh_enable(); 332 _local_bh_enable();
325 } 333 }
326 334
@@ -375,13 +383,13 @@ void irq_exit(void)
375#endif 383#endif
376 384
377 account_irq_exit_time(current); 385 account_irq_exit_time(current);
378 trace_hardirq_exit();
379 preempt_count_sub(HARDIRQ_OFFSET); 386 preempt_count_sub(HARDIRQ_OFFSET);
380 if (!in_interrupt() && local_softirq_pending()) 387 if (!in_interrupt() && local_softirq_pending())
381 invoke_softirq(); 388 invoke_softirq();
382 389
383 tick_irq_exit(); 390 tick_irq_exit();
384 rcu_irq_exit(); 391 rcu_irq_exit();
392 trace_hardirq_exit(); /* must be last! */
385} 393}
386 394
387/* 395/*
@@ -427,8 +435,7 @@ void open_softirq(int nr, void (*action)(struct softirq_action *))
427/* 435/*
428 * Tasklets 436 * Tasklets
429 */ 437 */
430struct tasklet_head 438struct tasklet_head {
431{
432 struct tasklet_struct *head; 439 struct tasklet_struct *head;
433 struct tasklet_struct **tail; 440 struct tasklet_struct **tail;
434}; 441};
@@ -447,7 +454,6 @@ void __tasklet_schedule(struct tasklet_struct *t)
447 raise_softirq_irqoff(TASKLET_SOFTIRQ); 454 raise_softirq_irqoff(TASKLET_SOFTIRQ);
448 local_irq_restore(flags); 455 local_irq_restore(flags);
449} 456}
450
451EXPORT_SYMBOL(__tasklet_schedule); 457EXPORT_SYMBOL(__tasklet_schedule);
452 458
453void __tasklet_hi_schedule(struct tasklet_struct *t) 459void __tasklet_hi_schedule(struct tasklet_struct *t)
@@ -461,7 +467,6 @@ void __tasklet_hi_schedule(struct tasklet_struct *t)
461 raise_softirq_irqoff(HI_SOFTIRQ); 467 raise_softirq_irqoff(HI_SOFTIRQ);
462 local_irq_restore(flags); 468 local_irq_restore(flags);
463} 469}
464
465EXPORT_SYMBOL(__tasklet_hi_schedule); 470EXPORT_SYMBOL(__tasklet_hi_schedule);
466 471
467void __tasklet_hi_schedule_first(struct tasklet_struct *t) 472void __tasklet_hi_schedule_first(struct tasklet_struct *t)
@@ -472,7 +477,6 @@ void __tasklet_hi_schedule_first(struct tasklet_struct *t)
472 __this_cpu_write(tasklet_hi_vec.head, t); 477 __this_cpu_write(tasklet_hi_vec.head, t);
473 __raise_softirq_irqoff(HI_SOFTIRQ); 478 __raise_softirq_irqoff(HI_SOFTIRQ);
474} 479}
475
476EXPORT_SYMBOL(__tasklet_hi_schedule_first); 480EXPORT_SYMBOL(__tasklet_hi_schedule_first);
477 481
478static void tasklet_action(struct softirq_action *a) 482static void tasklet_action(struct softirq_action *a)
@@ -492,7 +496,8 @@ static void tasklet_action(struct softirq_action *a)
492 496
493 if (tasklet_trylock(t)) { 497 if (tasklet_trylock(t)) {
494 if (!atomic_read(&t->count)) { 498 if (!atomic_read(&t->count)) {
495 if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) 499 if (!test_and_clear_bit(TASKLET_STATE_SCHED,
500 &t->state))
496 BUG(); 501 BUG();
497 t->func(t->data); 502 t->func(t->data);
498 tasklet_unlock(t); 503 tasklet_unlock(t);
@@ -527,7 +532,8 @@ static void tasklet_hi_action(struct softirq_action *a)
527 532
528 if (tasklet_trylock(t)) { 533 if (tasklet_trylock(t)) {
529 if (!atomic_read(&t->count)) { 534 if (!atomic_read(&t->count)) {
530 if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) 535 if (!test_and_clear_bit(TASKLET_STATE_SCHED,
536 &t->state))
531 BUG(); 537 BUG();
532 t->func(t->data); 538 t->func(t->data);
533 tasklet_unlock(t); 539 tasklet_unlock(t);
@@ -545,7 +551,6 @@ static void tasklet_hi_action(struct softirq_action *a)
545 } 551 }
546} 552}
547 553
548
549void tasklet_init(struct tasklet_struct *t, 554void tasklet_init(struct tasklet_struct *t,
550 void (*func)(unsigned long), unsigned long data) 555 void (*func)(unsigned long), unsigned long data)
551{ 556{
@@ -555,13 +560,12 @@ void tasklet_init(struct tasklet_struct *t,
555 t->func = func; 560 t->func = func;
556 t->data = data; 561 t->data = data;
557} 562}
558
559EXPORT_SYMBOL(tasklet_init); 563EXPORT_SYMBOL(tasklet_init);
560 564
561void tasklet_kill(struct tasklet_struct *t) 565void tasklet_kill(struct tasklet_struct *t)
562{ 566{
563 if (in_interrupt()) 567 if (in_interrupt())
564 printk("Attempt to kill tasklet from interrupt\n"); 568 pr_notice("Attempt to kill tasklet from interrupt\n");
565 569
566 while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) { 570 while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
567 do { 571 do {
@@ -571,7 +575,6 @@ void tasklet_kill(struct tasklet_struct *t)
571 tasklet_unlock_wait(t); 575 tasklet_unlock_wait(t);
572 clear_bit(TASKLET_STATE_SCHED, &t->state); 576 clear_bit(TASKLET_STATE_SCHED, &t->state);
573} 577}
574
575EXPORT_SYMBOL(tasklet_kill); 578EXPORT_SYMBOL(tasklet_kill);
576 579
577/* 580/*
@@ -721,9 +724,8 @@ static void takeover_tasklets(unsigned int cpu)
721} 724}
722#endif /* CONFIG_HOTPLUG_CPU */ 725#endif /* CONFIG_HOTPLUG_CPU */
723 726
724static int cpu_callback(struct notifier_block *nfb, 727static int cpu_callback(struct notifier_block *nfb, unsigned long action,
725 unsigned long action, 728 void *hcpu)
726 void *hcpu)
727{ 729{
728 switch (action) { 730 switch (action) {
729#ifdef CONFIG_HOTPLUG_CPU 731#ifdef CONFIG_HOTPLUG_CPU
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 84571e09c907..01fbae5b97b7 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -293,7 +293,7 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
293 */ 293 */
294 smp_call_function_single(min(cpu1, cpu2), 294 smp_call_function_single(min(cpu1, cpu2),
295 &irq_cpu_stop_queue_work, 295 &irq_cpu_stop_queue_work,
296 &call_args, 0); 296 &call_args, 1);
297 lg_local_unlock(&stop_cpus_lock); 297 lg_local_unlock(&stop_cpus_lock);
298 preempt_enable(); 298 preempt_enable();
299 299
diff --git a/kernel/sys.c b/kernel/sys.c
index c72311324ea7..c0a58be780a4 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -895,8 +895,7 @@ SYSCALL_DEFINE1(times, struct tms __user *, tbuf)
895 * only important on a multi-user system anyway, to make sure one user 895 * only important on a multi-user system anyway, to make sure one user
896 * can't send a signal to a process owned by another. -TYT, 12/12/91 896 * can't send a signal to a process owned by another. -TYT, 12/12/91
897 * 897 *
898 * Auch. Had to add the 'did_exec' flag to conform completely to POSIX. 898 * !PF_FORKNOEXEC check to conform completely to POSIX.
899 * LBT 04.03.94
900 */ 899 */
901SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid) 900SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
902{ 901{
@@ -932,7 +931,7 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
932 if (task_session(p) != task_session(group_leader)) 931 if (task_session(p) != task_session(group_leader))
933 goto out; 932 goto out;
934 err = -EACCES; 933 err = -EACCES;
935 if (p->did_exec) 934 if (!(p->flags & PF_FORKNOEXEC))
936 goto out; 935 goto out;
937 } else { 936 } else {
938 err = -ESRCH; 937 err = -ESRCH;
@@ -1572,8 +1571,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1572 t = p; 1571 t = p;
1573 do { 1572 do {
1574 accumulate_thread_rusage(t, r); 1573 accumulate_thread_rusage(t, r);
1575 t = next_thread(t); 1574 } while_each_thread(p, t);
1576 } while (t != p);
1577 break; 1575 break;
1578 1576
1579 default: 1577 default:
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 34a604726d0b..49e13e1f8fe6 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -62,6 +62,7 @@
62#include <linux/capability.h> 62#include <linux/capability.h>
63#include <linux/binfmts.h> 63#include <linux/binfmts.h>
64#include <linux/sched/sysctl.h> 64#include <linux/sched/sysctl.h>
65#include <linux/kexec.h>
65 66
66#include <asm/uaccess.h> 67#include <asm/uaccess.h>
67#include <asm/processor.h> 68#include <asm/processor.h>
@@ -95,8 +96,6 @@
95#if defined(CONFIG_SYSCTL) 96#if defined(CONFIG_SYSCTL)
96 97
97/* External variables not in a header file. */ 98/* External variables not in a header file. */
98extern int sysctl_overcommit_memory;
99extern int sysctl_overcommit_ratio;
100extern int max_threads; 99extern int max_threads;
101extern int suid_dumpable; 100extern int suid_dumpable;
102#ifdef CONFIG_COREDUMP 101#ifdef CONFIG_COREDUMP
@@ -122,6 +121,8 @@ extern int blk_iopoll_enabled;
122static int sixty = 60; 121static int sixty = 60;
123#endif 122#endif
124 123
124static int __maybe_unused neg_one = -1;
125
125static int zero; 126static int zero;
126static int __maybe_unused one = 1; 127static int __maybe_unused one = 1;
127static int __maybe_unused two = 2; 128static int __maybe_unused two = 2;
@@ -385,19 +386,21 @@ static struct ctl_table kern_table[] = {
385 .proc_handler = proc_dointvec, 386 .proc_handler = proc_dointvec,
386 }, 387 },
387 { 388 {
388 .procname = "numa_balancing_settle_count",
389 .data = &sysctl_numa_balancing_settle_count,
390 .maxlen = sizeof(unsigned int),
391 .mode = 0644,
392 .proc_handler = proc_dointvec,
393 },
394 {
395 .procname = "numa_balancing_migrate_deferred", 389 .procname = "numa_balancing_migrate_deferred",
396 .data = &sysctl_numa_balancing_migrate_deferred, 390 .data = &sysctl_numa_balancing_migrate_deferred,
397 .maxlen = sizeof(unsigned int), 391 .maxlen = sizeof(unsigned int),
398 .mode = 0644, 392 .mode = 0644,
399 .proc_handler = proc_dointvec, 393 .proc_handler = proc_dointvec,
400 }, 394 },
395 {
396 .procname = "numa_balancing",
397 .data = NULL, /* filled in by handler */
398 .maxlen = sizeof(unsigned int),
399 .mode = 0644,
400 .proc_handler = sysctl_numa_balancing,
401 .extra1 = &zero,
402 .extra2 = &one,
403 },
401#endif /* CONFIG_NUMA_BALANCING */ 404#endif /* CONFIG_NUMA_BALANCING */
402#endif /* CONFIG_SCHED_DEBUG */ 405#endif /* CONFIG_SCHED_DEBUG */
403 { 406 {
@@ -614,6 +617,18 @@ static struct ctl_table kern_table[] = {
614 .proc_handler = proc_dointvec, 617 .proc_handler = proc_dointvec,
615 }, 618 },
616#endif 619#endif
620#ifdef CONFIG_KEXEC
621 {
622 .procname = "kexec_load_disabled",
623 .data = &kexec_load_disabled,
624 .maxlen = sizeof(int),
625 .mode = 0644,
626 /* only handle a transition from default "0" to "1" */
627 .proc_handler = proc_dointvec_minmax,
628 .extra1 = &one,
629 .extra2 = &one,
630 },
631#endif
617#ifdef CONFIG_MODULES 632#ifdef CONFIG_MODULES
618 { 633 {
619 .procname = "modprobe", 634 .procname = "modprobe",
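
The kexec_load_disabled entry points both extra1 and extra2 at the constant one, so proc_dointvec_minmax() only ever accepts a write of 1: the flag can go from its default 0 to 1 but never back, as the in-line comment says. A toy model of that min == max clamp; the handler name and return convention are simplified, not the kernel's:

#include <stdio.h>

/* minimal sketch (assumed semantics): a min == max clamp turns a flag into a one-way switch */
static int write_minmax(int *val, int new, int min, int max)
{
	if (new < min || new > max)
		return -1;		/* rejected, mirrors -EINVAL */
	*val = new;
	return 0;
}

int main(void)
{
	int kexec_load_disabled = 0;

	printf("%d\n", write_minmax(&kexec_load_disabled, 1, 1, 1));	/*  0: allowed */
	printf("%d\n", write_minmax(&kexec_load_disabled, 0, 1, 1));	/* -1: refused */
	printf("%d\n", kexec_load_disabled);				/* stays 1     */
	return 0;
}
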
@@ -984,9 +999,10 @@ static struct ctl_table kern_table[] = {
984 { 999 {
985 .procname = "hung_task_warnings", 1000 .procname = "hung_task_warnings",
986 .data = &sysctl_hung_task_warnings, 1001 .data = &sysctl_hung_task_warnings,
987 .maxlen = sizeof(unsigned long), 1002 .maxlen = sizeof(int),
988 .mode = 0644, 1003 .mode = 0644,
989 .proc_handler = proc_doulongvec_minmax, 1004 .proc_handler = proc_dointvec_minmax,
1005 .extra1 = &neg_one,
990 }, 1006 },
991#endif 1007#endif
992#ifdef CONFIG_COMPAT 1008#ifdef CONFIG_COMPAT
@@ -1128,7 +1144,14 @@ static struct ctl_table vm_table[] = {
1128 .data = &sysctl_overcommit_ratio, 1144 .data = &sysctl_overcommit_ratio,
1129 .maxlen = sizeof(sysctl_overcommit_ratio), 1145 .maxlen = sizeof(sysctl_overcommit_ratio),
1130 .mode = 0644, 1146 .mode = 0644,
1131 .proc_handler = proc_dointvec, 1147 .proc_handler = overcommit_ratio_handler,
1148 },
1149 {
1150 .procname = "overcommit_kbytes",
1151 .data = &sysctl_overcommit_kbytes,
1152 .maxlen = sizeof(sysctl_overcommit_kbytes),
1153 .mode = 0644,
1154 .proc_handler = overcommit_kbytes_handler,
1132 }, 1155 },
1133 { 1156 {
1134 .procname = "page-cluster", 1157 .procname = "page-cluster",
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 7a925ba456fb..a6a5bf53e86d 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -51,7 +51,13 @@
51 * HZ shrinks, so values greater than 8 overflow 32bits when 51 * HZ shrinks, so values greater than 8 overflow 32bits when
52 * HZ=100. 52 * HZ=100.
53 */ 53 */
54#if HZ < 34
55#define JIFFIES_SHIFT 6
56#elif HZ < 67
57#define JIFFIES_SHIFT 7
58#else
54#define JIFFIES_SHIFT 8 59#define JIFFIES_SHIFT 8
60#endif
55 61
56static cycle_t jiffies_read(struct clocksource *cs) 62static cycle_t jiffies_read(struct clocksource *cs)
57{ 63{
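
The HZ ladder keeps the shifted nanoseconds-per-jiffy value (roughly ((NSEC_PER_SEC + HZ/2) / HZ) << JIFFIES_SHIFT) within 32 bits: a shift of 8 is fine at HZ=100 but overflows once the per-tick count grows at very low HZ, hence the smaller shifts. A quick check of that bound; the HZ values and the exact expression are assumptions mirroring the arithmetic, not kernel code:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const unsigned int hz[]    = { 24, 48, 100 };
	const unsigned int shift[] = {  6,  7,   8 };

	for (int i = 0; i < 3; i++) {
		uint64_t nsec_per_tick = (1000000000ULL + hz[i] / 2) / hz[i];
		uint64_t scaled_ok  = nsec_per_tick << shift[i];	/* shift from the ladder */
		uint64_t scaled_bad = nsec_per_tick << 8;		/* unconditional shift 8 */

		printf("HZ=%3u: shift %u -> %llu (fits: %d), shift 8 -> %llu (fits: %d)\n",
		       hz[i], shift[i],
		       (unsigned long long)scaled_ok,  scaled_ok  <= UINT32_MAX,
		       (unsigned long long)scaled_bad, scaled_bad <= UINT32_MAX);
	}
	return 0;
}
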
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 68b799375981..4d23dc4d8139 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -74,7 +74,7 @@ unsigned long long notrace sched_clock(void)
74 return cd.epoch_ns; 74 return cd.epoch_ns;
75 75
76 do { 76 do {
77 seq = read_seqcount_begin(&cd.seq); 77 seq = raw_read_seqcount_begin(&cd.seq);
78 epoch_cyc = cd.epoch_cyc; 78 epoch_cyc = cd.epoch_cyc;
79 epoch_ns = cd.epoch_ns; 79 epoch_ns = cd.epoch_ns;
80 } while (read_seqcount_retry(&cd.seq, seq)); 80 } while (read_seqcount_retry(&cd.seq, seq));
@@ -99,10 +99,10 @@ static void notrace update_sched_clock(void)
99 cd.mult, cd.shift); 99 cd.mult, cd.shift);
100 100
101 raw_local_irq_save(flags); 101 raw_local_irq_save(flags);
102 write_seqcount_begin(&cd.seq); 102 raw_write_seqcount_begin(&cd.seq);
103 cd.epoch_ns = ns; 103 cd.epoch_ns = ns;
104 cd.epoch_cyc = cyc; 104 cd.epoch_cyc = cyc;
105 write_seqcount_end(&cd.seq); 105 raw_write_seqcount_end(&cd.seq);
106 raw_local_irq_restore(flags); 106 raw_local_irq_restore(flags);
107} 107}
108 108
@@ -116,20 +116,42 @@ static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt)
116void __init sched_clock_register(u64 (*read)(void), int bits, 116void __init sched_clock_register(u64 (*read)(void), int bits,
117 unsigned long rate) 117 unsigned long rate)
118{ 118{
119 u64 res, wrap, new_mask, new_epoch, cyc, ns;
120 u32 new_mult, new_shift;
121 ktime_t new_wrap_kt;
119 unsigned long r; 122 unsigned long r;
120 u64 res, wrap;
121 char r_unit; 123 char r_unit;
122 124
123 if (cd.rate > rate) 125 if (cd.rate > rate)
124 return; 126 return;
125 127
126 WARN_ON(!irqs_disabled()); 128 WARN_ON(!irqs_disabled());
127 read_sched_clock = read;
128 sched_clock_mask = CLOCKSOURCE_MASK(bits);
129 cd.rate = rate;
130 129
131 /* calculate the mult/shift to convert counter ticks to ns. */ 130 /* calculate the mult/shift to convert counter ticks to ns. */
132 clocks_calc_mult_shift(&cd.mult, &cd.shift, rate, NSEC_PER_SEC, 3600); 131 clocks_calc_mult_shift(&new_mult, &new_shift, rate, NSEC_PER_SEC, 3600);
132
133 new_mask = CLOCKSOURCE_MASK(bits);
134
135 /* calculate how many ns until we wrap */
136 wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask);
137 new_wrap_kt = ns_to_ktime(wrap - (wrap >> 3));
138
139 /* update epoch for new counter and update epoch_ns from old counter*/
140 new_epoch = read();
141 cyc = read_sched_clock();
142 ns = cd.epoch_ns + cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask,
143 cd.mult, cd.shift);
144
145 raw_write_seqcount_begin(&cd.seq);
146 read_sched_clock = read;
147 sched_clock_mask = new_mask;
148 cd.rate = rate;
149 cd.wrap_kt = new_wrap_kt;
150 cd.mult = new_mult;
151 cd.shift = new_shift;
152 cd.epoch_cyc = new_epoch;
153 cd.epoch_ns = ns;
154 raw_write_seqcount_end(&cd.seq);
133 155
134 r = rate; 156 r = rate;
135 if (r >= 4000000) { 157 if (r >= 4000000) {
@@ -141,22 +163,12 @@ void __init sched_clock_register(u64 (*read)(void), int bits,
141 } else 163 } else
142 r_unit = ' '; 164 r_unit = ' ';
143 165
144 /* calculate how many ns until we wrap */
145 wrap = clocks_calc_max_nsecs(cd.mult, cd.shift, 0, sched_clock_mask);
146 cd.wrap_kt = ns_to_ktime(wrap - (wrap >> 3));
147
148 /* calculate the ns resolution of this counter */ 166 /* calculate the ns resolution of this counter */
149 res = cyc_to_ns(1ULL, cd.mult, cd.shift); 167 res = cyc_to_ns(1ULL, new_mult, new_shift);
168
150 pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n", 169 pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n",
151 bits, r, r_unit, res, wrap); 170 bits, r, r_unit, res, wrap);
152 171
153 update_sched_clock();
154
155 /*
156 * Ensure that sched_clock() starts off at 0ns
157 */
158 cd.epoch_ns = 0;
159
160 /* Enable IRQ time accounting if we have a fast enough sched_clock */ 172 /* Enable IRQ time accounting if we have a fast enough sched_clock */
161 if (irqtime > 0 || (irqtime == -1 && rate >= 1000000)) 173 if (irqtime > 0 || (irqtime == -1 && rate >= 1000000))
162 enable_sched_clock_irqtime(); 174 enable_sched_clock_irqtime();
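
sched_clock_register() now computes the new mult/shift pair, the wrap horizon and the new epoch up front and publishes them inside one raw seqcount write section, so a concurrent sched_clock() reader never sees a half-updated set. The conversion underneath is the usual fixed-point multiply-shift, as in the cyc_to_ns() helper this file already uses; a small illustration with made-up mult/shift values:

#include <stdio.h>
#include <stdint.h>

/* Same shape as the kernel's cyc_to_ns() helper: fixed-point multiply-shift. */
static uint64_t cyc_to_ns(uint64_t cyc, uint32_t mult, uint32_t shift)
{
	return (cyc * mult) >> shift;
}

int main(void)
{
	/* Made-up pair for a 2^20 Hz counter: mult / 2^shift ~= 953.67 ns per tick. */
	uint32_t mult = 976562, shift = 10;

	printf("1 tick    -> %llu ns\n",
	       (unsigned long long)cyc_to_ns(1, mult, shift));		/* 953 */
	printf("2^20 ticks -> %llu ns\n",
	       (unsigned long long)cyc_to_ns(1 << 20, mult, shift));	/* ~999999488, about 1 s */
	return 0;
}
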
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 9532690daaa9..98977a57ac72 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -538,10 +538,10 @@ int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
538 * Called from irq_enter() when idle was interrupted to reenable the 538 * Called from irq_enter() when idle was interrupted to reenable the
539 * per cpu device. 539 * per cpu device.
540 */ 540 */
541void tick_check_oneshot_broadcast(int cpu) 541void tick_check_oneshot_broadcast_this_cpu(void)
542{ 542{
543 if (cpumask_test_cpu(cpu, tick_broadcast_oneshot_mask)) { 543 if (cpumask_test_cpu(smp_processor_id(), tick_broadcast_oneshot_mask)) {
544 struct tick_device *td = &per_cpu(tick_cpu_device, cpu); 544 struct tick_device *td = &__get_cpu_var(tick_cpu_device);
545 545
546 /* 546 /*
547 * We might be in the middle of switching over from 547 * We might be in the middle of switching over from
@@ -756,6 +756,7 @@ out:
756static void tick_broadcast_clear_oneshot(int cpu) 756static void tick_broadcast_clear_oneshot(int cpu)
757{ 757{
758 cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); 758 cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask);
759 cpumask_clear_cpu(cpu, tick_broadcast_pending_mask);
759} 760}
760 761
761static void tick_broadcast_init_next_event(struct cpumask *mask, 762static void tick_broadcast_init_next_event(struct cpumask *mask,
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 162b03ab0ad2..20b2fe37d105 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -85,6 +85,7 @@ static void tick_periodic(int cpu)
85 85
86 do_timer(1); 86 do_timer(1);
87 write_sequnlock(&jiffies_lock); 87 write_sequnlock(&jiffies_lock);
88 update_wall_time();
88 } 89 }
89 90
90 update_process_times(user_mode(get_irq_regs())); 91 update_process_times(user_mode(get_irq_regs()));
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 18e71f7fbc2a..8329669b51ec 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -51,7 +51,7 @@ extern void tick_broadcast_switch_to_oneshot(void);
51extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup); 51extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup);
52extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc); 52extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc);
53extern int tick_broadcast_oneshot_active(void); 53extern int tick_broadcast_oneshot_active(void);
54extern void tick_check_oneshot_broadcast(int cpu); 54extern void tick_check_oneshot_broadcast_this_cpu(void);
55bool tick_broadcast_oneshot_available(void); 55bool tick_broadcast_oneshot_available(void);
56# else /* BROADCAST */ 56# else /* BROADCAST */
57static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) 57static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
@@ -62,7 +62,7 @@ static inline void tick_broadcast_oneshot_control(unsigned long reason) { }
62static inline void tick_broadcast_switch_to_oneshot(void) { } 62static inline void tick_broadcast_switch_to_oneshot(void) { }
63static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } 63static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
64static inline int tick_broadcast_oneshot_active(void) { return 0; } 64static inline int tick_broadcast_oneshot_active(void) { return 0; }
65static inline void tick_check_oneshot_broadcast(int cpu) { } 65static inline void tick_check_oneshot_broadcast_this_cpu(void) { }
66static inline bool tick_broadcast_oneshot_available(void) { return true; } 66static inline bool tick_broadcast_oneshot_available(void) { return true; }
67# endif /* !BROADCAST */ 67# endif /* !BROADCAST */
68 68
@@ -155,3 +155,4 @@ static inline int tick_device_is_functional(struct clock_event_device *dev)
155#endif 155#endif
156 156
157extern void do_timer(unsigned long ticks); 157extern void do_timer(unsigned long ticks);
158extern void update_wall_time(void);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index ea20f7d1ac2c..9f8af69c67ec 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -86,6 +86,7 @@ static void tick_do_update_jiffies64(ktime_t now)
86 tick_next_period = ktime_add(last_jiffies_update, tick_period); 86 tick_next_period = ktime_add(last_jiffies_update, tick_period);
87 } 87 }
88 write_sequnlock(&jiffies_lock); 88 write_sequnlock(&jiffies_lock);
89 update_wall_time();
89} 90}
90 91
91/* 92/*
@@ -177,7 +178,7 @@ static bool can_stop_full_tick(void)
177 * TODO: kick full dynticks CPUs when 178 * TODO: kick full dynticks CPUs when
178 * sched_clock_stable is set. 179 * sched_clock_stable is set.
179 */ 180 */
180 if (!sched_clock_stable) { 181 if (!sched_clock_stable()) {
181 trace_tick_stop(0, "unstable sched clock\n"); 182 trace_tick_stop(0, "unstable sched clock\n");
182 /* 183 /*
183 * Don't allow the user to think they can get 184 * Don't allow the user to think they can get
@@ -391,11 +392,9 @@ __setup("nohz=", setup_tick_nohz);
391 */ 392 */
392static void tick_nohz_update_jiffies(ktime_t now) 393static void tick_nohz_update_jiffies(ktime_t now)
393{ 394{
394 int cpu = smp_processor_id();
395 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
396 unsigned long flags; 395 unsigned long flags;
397 396
398 ts->idle_waketime = now; 397 __this_cpu_write(tick_cpu_sched.idle_waketime, now);
399 398
400 local_irq_save(flags); 399 local_irq_save(flags);
401 tick_do_update_jiffies64(now); 400 tick_do_update_jiffies64(now);
@@ -426,17 +425,15 @@ update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_upda
426 425
427} 426}
428 427
429static void tick_nohz_stop_idle(int cpu, ktime_t now) 428static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
430{ 429{
431 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 430 update_ts_time_stats(smp_processor_id(), ts, now, NULL);
432
433 update_ts_time_stats(cpu, ts, now, NULL);
434 ts->idle_active = 0; 431 ts->idle_active = 0;
435 432
436 sched_clock_idle_wakeup_event(0); 433 sched_clock_idle_wakeup_event(0);
437} 434}
438 435
439static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts) 436static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
440{ 437{
441 ktime_t now = ktime_get(); 438 ktime_t now = ktime_get();
442 439
@@ -536,12 +533,13 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
536 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; 533 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
537 u64 time_delta; 534 u64 time_delta;
538 535
536 time_delta = timekeeping_max_deferment();
537
539 /* Read jiffies and the time when jiffies were updated last */ 538 /* Read jiffies and the time when jiffies were updated last */
540 do { 539 do {
541 seq = read_seqbegin(&jiffies_lock); 540 seq = read_seqbegin(&jiffies_lock);
542 last_update = last_jiffies_update; 541 last_update = last_jiffies_update;
543 last_jiffies = jiffies; 542 last_jiffies = jiffies;
544 time_delta = timekeeping_max_deferment();
545 } while (read_seqretry(&jiffies_lock, seq)); 543 } while (read_seqretry(&jiffies_lock, seq));
546 544
547 if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || 545 if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) ||
@@ -681,18 +679,18 @@ out:
681static void tick_nohz_full_stop_tick(struct tick_sched *ts) 679static void tick_nohz_full_stop_tick(struct tick_sched *ts)
682{ 680{
683#ifdef CONFIG_NO_HZ_FULL 681#ifdef CONFIG_NO_HZ_FULL
684 int cpu = smp_processor_id(); 682 int cpu = smp_processor_id();
685 683
686 if (!tick_nohz_full_cpu(cpu) || is_idle_task(current)) 684 if (!tick_nohz_full_cpu(cpu) || is_idle_task(current))
687 return; 685 return;
688 686
689 if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE) 687 if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE)
690 return; 688 return;
691 689
692 if (!can_stop_full_tick()) 690 if (!can_stop_full_tick())
693 return; 691 return;
694 692
695 tick_nohz_stop_sched_tick(ts, ktime_get(), cpu); 693 tick_nohz_stop_sched_tick(ts, ktime_get(), cpu);
696#endif 694#endif
697} 695}
698 696
@@ -754,7 +752,7 @@ static void __tick_nohz_idle_enter(struct tick_sched *ts)
754 ktime_t now, expires; 752 ktime_t now, expires;
755 int cpu = smp_processor_id(); 753 int cpu = smp_processor_id();
756 754
757 now = tick_nohz_start_idle(cpu, ts); 755 now = tick_nohz_start_idle(ts);
758 756
759 if (can_stop_idle_tick(cpu, ts)) { 757 if (can_stop_idle_tick(cpu, ts)) {
760 int was_stopped = ts->tick_stopped; 758 int was_stopped = ts->tick_stopped;
@@ -911,8 +909,7 @@ static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
911 */ 909 */
912void tick_nohz_idle_exit(void) 910void tick_nohz_idle_exit(void)
913{ 911{
914 int cpu = smp_processor_id(); 912 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
915 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
916 ktime_t now; 913 ktime_t now;
917 914
918 local_irq_disable(); 915 local_irq_disable();
@@ -925,7 +922,7 @@ void tick_nohz_idle_exit(void)
925 now = ktime_get(); 922 now = ktime_get();
926 923
927 if (ts->idle_active) 924 if (ts->idle_active)
928 tick_nohz_stop_idle(cpu, now); 925 tick_nohz_stop_idle(ts, now);
929 926
930 if (ts->tick_stopped) { 927 if (ts->tick_stopped) {
931 tick_nohz_restart_sched_tick(ts, now); 928 tick_nohz_restart_sched_tick(ts, now);
@@ -1009,12 +1006,10 @@ static void tick_nohz_switch_to_nohz(void)
1009 * timer and do not touch the other magic bits which need to be done 1006 * timer and do not touch the other magic bits which need to be done
1010 * when idle is left. 1007 * when idle is left.
1011 */ 1008 */
1012static void tick_nohz_kick_tick(int cpu, ktime_t now) 1009static void tick_nohz_kick_tick(struct tick_sched *ts, ktime_t now)
1013{ 1010{
1014#if 0 1011#if 0
1015 /* Switch back to 2.6.27 behaviour */ 1012 /* Switch back to 2.6.27 behaviour */
1016
1017 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
1018 ktime_t delta; 1013 ktime_t delta;
1019 1014
1020 /* 1015 /*
@@ -1029,36 +1024,36 @@ static void tick_nohz_kick_tick(int cpu, ktime_t now)
1029#endif 1024#endif
1030} 1025}
1031 1026
1032static inline void tick_check_nohz(int cpu) 1027static inline void tick_nohz_irq_enter(void)
1033{ 1028{
1034 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 1029 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
1035 ktime_t now; 1030 ktime_t now;
1036 1031
1037 if (!ts->idle_active && !ts->tick_stopped) 1032 if (!ts->idle_active && !ts->tick_stopped)
1038 return; 1033 return;
1039 now = ktime_get(); 1034 now = ktime_get();
1040 if (ts->idle_active) 1035 if (ts->idle_active)
1041 tick_nohz_stop_idle(cpu, now); 1036 tick_nohz_stop_idle(ts, now);
1042 if (ts->tick_stopped) { 1037 if (ts->tick_stopped) {
1043 tick_nohz_update_jiffies(now); 1038 tick_nohz_update_jiffies(now);
1044 tick_nohz_kick_tick(cpu, now); 1039 tick_nohz_kick_tick(ts, now);
1045 } 1040 }
1046} 1041}
1047 1042
1048#else 1043#else
1049 1044
1050static inline void tick_nohz_switch_to_nohz(void) { } 1045static inline void tick_nohz_switch_to_nohz(void) { }
1051static inline void tick_check_nohz(int cpu) { } 1046static inline void tick_nohz_irq_enter(void) { }
1052 1047
1053#endif /* CONFIG_NO_HZ_COMMON */ 1048#endif /* CONFIG_NO_HZ_COMMON */
1054 1049
1055/* 1050/*
1056 * Called from irq_enter to notify about the possible interruption of idle() 1051 * Called from irq_enter to notify about the possible interruption of idle()
1057 */ 1052 */
1058void tick_check_idle(int cpu) 1053void tick_irq_enter(void)
1059{ 1054{
1060 tick_check_oneshot_broadcast(cpu); 1055 tick_check_oneshot_broadcast_this_cpu();
1061 tick_check_nohz(cpu); 1056 tick_nohz_irq_enter();
1062} 1057}
1063 1058
1064/* 1059/*
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 87b4f00284c9..0aa4ce81bc16 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -77,7 +77,7 @@ static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec wtm)
77 tk->wall_to_monotonic = wtm; 77 tk->wall_to_monotonic = wtm;
78 set_normalized_timespec(&tmp, -wtm.tv_sec, -wtm.tv_nsec); 78 set_normalized_timespec(&tmp, -wtm.tv_sec, -wtm.tv_nsec);
79 tk->offs_real = timespec_to_ktime(tmp); 79 tk->offs_real = timespec_to_ktime(tmp);
80 tk->offs_tai = ktime_sub(tk->offs_real, ktime_set(tk->tai_offset, 0)); 80 tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tk->tai_offset, 0));
81} 81}
82 82
83static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t) 83static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t)
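
Both offs_tai assignments in this file (here and again in the __timekeeping_set_tai_offset() hunk further down) flip from ktime_sub() to ktime_add(): TAI runs ahead of UTC by tai_offset, so the monotonic-to-TAI offset has to be offs_real plus tai_offset. The arithmetic, checked in a few lines of plain C with made-up numbers:

#include <stdio.h>

int main(void)
{
	long long mono_ns      = 1000;		/* arbitrary monotonic timestamp       */
	long long offs_real_ns = 500;		/* monotonic -> realtime (UTC) offset  */
	long long tai_offset_s = 35;		/* UTC -> TAI offset, in whole seconds */

	long long utc_ns = mono_ns + offs_real_ns;
	long long tai_ns = utc_ns + tai_offset_s * 1000000000LL;

	/* The monotonic->TAI offset must therefore be offs_real PLUS tai_offset ... */
	long long offs_tai_ns = offs_real_ns + tai_offset_s * 1000000000LL;

	/* ... while the old ktime_sub() form lands 2 * tai_offset short of TAI. */
	long long old_offs_tai_ns = offs_real_ns - tai_offset_s * 1000000000LL;

	printf("add: %d  sub: %d\n",
	       mono_ns + offs_tai_ns == tai_ns,
	       mono_ns + old_offs_tai_ns == tai_ns);	/* add: 1  sub: 0 */
	return 0;
}
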
@@ -90,8 +90,9 @@ static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t)
90} 90}
91 91
92/** 92/**
93 * timekeeper_setup_internals - Set up internals to use clocksource clock. 93 * tk_setup_internals - Set up internals to use clocksource clock.
94 * 94 *
95 * @tk: The target timekeeper to setup.
95 * @clock: Pointer to clocksource. 96 * @clock: Pointer to clocksource.
96 * 97 *
97 * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment 98 * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment
@@ -595,7 +596,7 @@ s32 timekeeping_get_tai_offset(void)
595static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset) 596static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset)
596{ 597{
597 tk->tai_offset = tai_offset; 598 tk->tai_offset = tai_offset;
598 tk->offs_tai = ktime_sub(tk->offs_real, ktime_set(tai_offset, 0)); 599 tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tai_offset, 0));
599} 600}
600 601
601/** 602/**
@@ -610,6 +611,7 @@ void timekeeping_set_tai_offset(s32 tai_offset)
610 raw_spin_lock_irqsave(&timekeeper_lock, flags); 611 raw_spin_lock_irqsave(&timekeeper_lock, flags);
611 write_seqcount_begin(&timekeeper_seq); 612 write_seqcount_begin(&timekeeper_seq);
612 __timekeeping_set_tai_offset(tk, tai_offset); 613 __timekeeping_set_tai_offset(tk, tai_offset);
614 timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
613 write_seqcount_end(&timekeeper_seq); 615 write_seqcount_end(&timekeeper_seq);
614 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 616 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
615 clock_was_set(); 617 clock_was_set();
@@ -1023,6 +1025,8 @@ static int timekeeping_suspend(void)
1023 timekeeping_suspend_time = 1025 timekeeping_suspend_time =
1024 timespec_add(timekeeping_suspend_time, delta_delta); 1026 timespec_add(timekeeping_suspend_time, delta_delta);
1025 } 1027 }
1028
1029 timekeeping_update(tk, TK_MIRROR);
1026 write_seqcount_end(&timekeeper_seq); 1030 write_seqcount_end(&timekeeper_seq);
1027 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 1031 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
1028 1032
@@ -1130,16 +1134,6 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
1130 * we can adjust by 1. 1134 * we can adjust by 1.
1131 */ 1135 */
1132 error >>= 2; 1136 error >>= 2;
1133 /*
1134 * XXX - In update_wall_time, we round up to the next
1135 * nanosecond, and store the amount rounded up into
1136 * the error. This causes the likely below to be unlikely.
1137 *
1138 * The proper fix is to avoid rounding up by using
1139 * the high precision tk->xtime_nsec instead of
1140 * xtime.tv_nsec everywhere. Fixing this will take some
1141 * time.
1142 */
1143 if (likely(error <= interval)) 1137 if (likely(error <= interval))
1144 adj = 1; 1138 adj = 1;
1145 else 1139 else
@@ -1255,7 +1249,7 @@ out_adjust:
1255static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) 1249static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk)
1256{ 1250{
1257 u64 nsecps = (u64)NSEC_PER_SEC << tk->shift; 1251 u64 nsecps = (u64)NSEC_PER_SEC << tk->shift;
1258 unsigned int action = 0; 1252 unsigned int clock_set = 0;
1259 1253
1260 while (tk->xtime_nsec >= nsecps) { 1254 while (tk->xtime_nsec >= nsecps) {
1261 int leap; 1255 int leap;
@@ -1277,11 +1271,10 @@ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk)
1277 1271
1278 __timekeeping_set_tai_offset(tk, tk->tai_offset - leap); 1272 __timekeeping_set_tai_offset(tk, tk->tai_offset - leap);
1279 1273
1280 clock_was_set_delayed(); 1274 clock_set = TK_CLOCK_WAS_SET;
1281 action = TK_CLOCK_WAS_SET;
1282 } 1275 }
1283 } 1276 }
1284 return action; 1277 return clock_set;
1285} 1278}
1286 1279
1287/** 1280/**
@@ -1294,7 +1287,8 @@ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk)
1294 * Returns the unconsumed cycles. 1287 * Returns the unconsumed cycles.
1295 */ 1288 */
1296static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, 1289static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
1297 u32 shift) 1290 u32 shift,
1291 unsigned int *clock_set)
1298{ 1292{
1299 cycle_t interval = tk->cycle_interval << shift; 1293 cycle_t interval = tk->cycle_interval << shift;
1300 u64 raw_nsecs; 1294 u64 raw_nsecs;
@@ -1308,7 +1302,7 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
1308 tk->cycle_last += interval; 1302 tk->cycle_last += interval;
1309 1303
1310 tk->xtime_nsec += tk->xtime_interval << shift; 1304 tk->xtime_nsec += tk->xtime_interval << shift;
1311 accumulate_nsecs_to_secs(tk); 1305 *clock_set |= accumulate_nsecs_to_secs(tk);
1312 1306
1313 /* Accumulate raw time */ 1307 /* Accumulate raw time */
1314 raw_nsecs = (u64)tk->raw_interval << shift; 1308 raw_nsecs = (u64)tk->raw_interval << shift;
@@ -1359,14 +1353,14 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk)
1359 * update_wall_time - Uses the current clocksource to increment the wall time 1353 * update_wall_time - Uses the current clocksource to increment the wall time
1360 * 1354 *
1361 */ 1355 */
1362static void update_wall_time(void) 1356void update_wall_time(void)
1363{ 1357{
1364 struct clocksource *clock; 1358 struct clocksource *clock;
1365 struct timekeeper *real_tk = &timekeeper; 1359 struct timekeeper *real_tk = &timekeeper;
1366 struct timekeeper *tk = &shadow_timekeeper; 1360 struct timekeeper *tk = &shadow_timekeeper;
1367 cycle_t offset; 1361 cycle_t offset;
1368 int shift = 0, maxshift; 1362 int shift = 0, maxshift;
1369 unsigned int action; 1363 unsigned int clock_set = 0;
1370 unsigned long flags; 1364 unsigned long flags;
1371 1365
1372 raw_spin_lock_irqsave(&timekeeper_lock, flags); 1366 raw_spin_lock_irqsave(&timekeeper_lock, flags);
@@ -1401,7 +1395,8 @@ static void update_wall_time(void)
1401 maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1; 1395 maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1;
1402 shift = min(shift, maxshift); 1396 shift = min(shift, maxshift);
1403 while (offset >= tk->cycle_interval) { 1397 while (offset >= tk->cycle_interval) {
1404 offset = logarithmic_accumulation(tk, offset, shift); 1398 offset = logarithmic_accumulation(tk, offset, shift,
1399 &clock_set);
1405 if (offset < tk->cycle_interval<<shift) 1400 if (offset < tk->cycle_interval<<shift)
1406 shift--; 1401 shift--;
1407 } 1402 }
@@ -1419,7 +1414,7 @@ static void update_wall_time(void)
1419 * Finally, make sure that after the rounding 1414 * Finally, make sure that after the rounding
1420 * xtime_nsec isn't larger than NSEC_PER_SEC 1415 * xtime_nsec isn't larger than NSEC_PER_SEC
1421 */ 1416 */
1422 action = accumulate_nsecs_to_secs(tk); 1417 clock_set |= accumulate_nsecs_to_secs(tk);
1423 1418
1424 write_seqcount_begin(&timekeeper_seq); 1419 write_seqcount_begin(&timekeeper_seq);
1425 /* Update clock->cycle_last with the new value */ 1420 /* Update clock->cycle_last with the new value */
@@ -1435,10 +1430,12 @@ static void update_wall_time(void)
1435 * updating. 1430 * updating.
1436 */ 1431 */
1437 memcpy(real_tk, tk, sizeof(*tk)); 1432 memcpy(real_tk, tk, sizeof(*tk));
1438 timekeeping_update(real_tk, action); 1433 timekeeping_update(real_tk, clock_set);
1439 write_seqcount_end(&timekeeper_seq); 1434 write_seqcount_end(&timekeeper_seq);
1440out: 1435out:
1441 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 1436 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
1437 if (clock_set)
1438 clock_was_set();
1442} 1439}
1443 1440
1444/** 1441/**
@@ -1583,7 +1580,6 @@ struct timespec get_monotonic_coarse(void)
1583void do_timer(unsigned long ticks) 1580void do_timer(unsigned long ticks)
1584{ 1581{
1585 jiffies_64 += ticks; 1582 jiffies_64 += ticks;
1586 update_wall_time();
1587 calc_global_load(ticks); 1583 calc_global_load(ticks);
1588} 1584}
1589 1585
@@ -1698,12 +1694,14 @@ int do_adjtimex(struct timex *txc)
1698 1694
1699 if (tai != orig_tai) { 1695 if (tai != orig_tai) {
1700 __timekeeping_set_tai_offset(tk, tai); 1696 __timekeeping_set_tai_offset(tk, tai);
1701 update_pvclock_gtod(tk, true); 1697 timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
1702 clock_was_set_delayed();
1703 } 1698 }
1704 write_seqcount_end(&timekeeper_seq); 1699 write_seqcount_end(&timekeeper_seq);
1705 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 1700 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
1706 1701
1702 if (tai != orig_tai)
1703 clock_was_set();
1704
1707 ntp_notify_cmos_timer(); 1705 ntp_notify_cmos_timer();
1708 1706
1709 return ret; 1707 return ret;
@@ -1739,4 +1737,5 @@ void xtime_update(unsigned long ticks)
1739 write_seqlock(&jiffies_lock); 1737 write_seqlock(&jiffies_lock);
1740 do_timer(ticks); 1738 do_timer(ticks);
1741 write_sequnlock(&jiffies_lock); 1739 write_sequnlock(&jiffies_lock);
1740 update_wall_time();
1742} 1741}
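The clock_set rework in this file is spread over several hunks; condensed, the post-patch update_wall_time() behaves roughly as in the sketch below (the clocksource read, the accumulation loop and the vsyscall fixup are elided), with do_timer() no longer calling it and xtime_update() calling it only after jiffies_lock is released. This is an illustrative condensation, not the verbatim kernel code.

/*
 * Sketch of the patched flow: accumulation helpers report
 * TK_CLOCK_WAS_SET through a local mask, and the notification
 * runs only after timekeeper_lock is dropped.
 */
void update_wall_time(void)
{
	struct timekeeper *real_tk = &timekeeper;
	struct timekeeper *tk = &shadow_timekeeper;
	unsigned int clock_set = 0;
	unsigned long flags;

	raw_spin_lock_irqsave(&timekeeper_lock, flags);

	/*
	 * Cycle accumulation elided: logarithmic_accumulation() now ORs
	 * TK_CLOCK_WAS_SET into &clock_set instead of calling
	 * clock_was_set_delayed() itself.
	 */
	clock_set |= accumulate_nsecs_to_secs(tk);

	write_seqcount_begin(&timekeeper_seq);
	memcpy(real_tk, tk, sizeof(*tk));
	timekeeping_update(real_tk, clock_set);
	write_seqcount_end(&timekeeper_seq);

	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);

	/* Leap-second and TAI changes are announced outside the lock. */
	if (clock_set)
		clock_was_set();
}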
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index d7e2068e4b71..1378e84fbe39 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -50,6 +50,7 @@ ifeq ($(CONFIG_PERF_EVENTS),y)
50obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o 50obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
51endif 51endif
52obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o 52obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
53obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o
53obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o 54obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
54obj-$(CONFIG_TRACEPOINTS) += power-traces.o 55obj-$(CONFIG_TRACEPOINTS) += power-traces.o
55ifeq ($(CONFIG_PM_RUNTIME),y) 56ifeq ($(CONFIG_PM_RUNTIME),y)
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index f785aef65799..b418cb0d7242 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -781,8 +781,8 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
781 if (!error && !bio_flagged(bio, BIO_UPTODATE)) 781 if (!error && !bio_flagged(bio, BIO_UPTODATE))
782 error = EIO; 782 error = EIO;
783 783
784 __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, 784 __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
785 error, 0, NULL); 785 bio->bi_rw, what, error, 0, NULL);
786} 786}
787 787
788static void blk_add_trace_bio_bounce(void *ignore, 788static void blk_add_trace_bio_bounce(void *ignore,
@@ -885,8 +885,9 @@ static void blk_add_trace_split(void *ignore,
885 if (bt) { 885 if (bt) {
886 __be64 rpdu = cpu_to_be64(pdu); 886 __be64 rpdu = cpu_to_be64(pdu);
887 887
888 __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, 888 __blk_add_trace(bt, bio->bi_iter.bi_sector,
889 BLK_TA_SPLIT, !bio_flagged(bio, BIO_UPTODATE), 889 bio->bi_iter.bi_size, bio->bi_rw, BLK_TA_SPLIT,
890 !bio_flagged(bio, BIO_UPTODATE),
890 sizeof(rpdu), &rpdu); 891 sizeof(rpdu), &rpdu);
891 } 892 }
892} 893}
@@ -918,9 +919,9 @@ static void blk_add_trace_bio_remap(void *ignore,
918 r.device_to = cpu_to_be32(bio->bi_bdev->bd_dev); 919 r.device_to = cpu_to_be32(bio->bi_bdev->bd_dev);
919 r.sector_from = cpu_to_be64(from); 920 r.sector_from = cpu_to_be64(from);
920 921
921 __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, 922 __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
922 BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE), 923 bio->bi_rw, BLK_TA_REMAP,
923 sizeof(r), &r); 924 !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
924} 925}
925 926
926/** 927/**
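The three blktrace hunks above all make the same mechanical change: the bio's starting sector and size are now read through the embedded bi_iter rather than directly from struct bio. A minimal sketch of the access pattern (the helper name is invented; kernel context and the 3.14-era struct bio layout are assumed):

/* Hypothetical helper showing the bi_iter access pattern used above. */
static void blk_trace_one_bio(struct blk_trace *bt, struct bio *bio, u32 what)
{
	sector_t sector = bio->bi_iter.bi_sector;	/* was bio->bi_sector */
	unsigned int bytes = bio->bi_iter.bi_size;	/* was bio->bi_size   */

	__blk_add_trace(bt, sector, bytes, bio->bi_rw, what,
			!bio_flagged(bio, BIO_UPTODATE), 0, NULL);
}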
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 72a0f81dc5a8..cd7f76d1eb86 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -85,6 +85,8 @@ int function_trace_stop __read_mostly;
85 85
86/* Current function tracing op */ 86/* Current function tracing op */
87struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end; 87struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end;
88/* What to set function_trace_op to */
89static struct ftrace_ops *set_function_trace_op;
88 90
89/* List for set_ftrace_pid's pids. */ 91/* List for set_ftrace_pid's pids. */
90LIST_HEAD(ftrace_pids); 92LIST_HEAD(ftrace_pids);
@@ -278,6 +280,29 @@ static void update_global_ops(void)
278 global_ops.func = func; 280 global_ops.func = func;
279} 281}
280 282
283static void ftrace_sync(struct work_struct *work)
284{
285 /*
286 * This function is just a stub to implement a hard force
287 * of synchronize_sched(). This requires synchronizing
288 * tasks even in userspace and idle.
289 *
290 * Yes, function tracing is rude.
291 */
292}
293
294static void ftrace_sync_ipi(void *data)
295{
296 /* Probably not needed, but do it anyway */
297 smp_rmb();
298}
299
300#ifdef CONFIG_FUNCTION_GRAPH_TRACER
301static void update_function_graph_func(void);
302#else
303static inline void update_function_graph_func(void) { }
304#endif
305
281static void update_ftrace_function(void) 306static void update_ftrace_function(void)
282{ 307{
283 ftrace_func_t func; 308 ftrace_func_t func;
@@ -296,16 +321,61 @@ static void update_ftrace_function(void)
296 !FTRACE_FORCE_LIST_FUNC)) { 321 !FTRACE_FORCE_LIST_FUNC)) {
297 /* Set the ftrace_ops that the arch callback uses */ 322 /* Set the ftrace_ops that the arch callback uses */
298 if (ftrace_ops_list == &global_ops) 323 if (ftrace_ops_list == &global_ops)
299 function_trace_op = ftrace_global_list; 324 set_function_trace_op = ftrace_global_list;
300 else 325 else
301 function_trace_op = ftrace_ops_list; 326 set_function_trace_op = ftrace_ops_list;
302 func = ftrace_ops_list->func; 327 func = ftrace_ops_list->func;
303 } else { 328 } else {
304 /* Just use the default ftrace_ops */ 329 /* Just use the default ftrace_ops */
305 function_trace_op = &ftrace_list_end; 330 set_function_trace_op = &ftrace_list_end;
306 func = ftrace_ops_list_func; 331 func = ftrace_ops_list_func;
307 } 332 }
308 333
334 /* If there's no change, then do nothing more here */
335 if (ftrace_trace_function == func)
336 return;
337
338 update_function_graph_func();
339
340 /*
341 * If we are using the list function, it doesn't care
342 * about the function_trace_ops.
343 */
344 if (func == ftrace_ops_list_func) {
345 ftrace_trace_function = func;
346 /*
347 * Don't even bother setting function_trace_ops,
348 * it would be racy to do so anyway.
349 */
350 return;
351 }
352
353#ifndef CONFIG_DYNAMIC_FTRACE
354 /*
355 * For static tracing, we need to be a bit more careful.
356	 * The function change takes effect immediately. Thus,
357	 * we need to coordinate the setting of the function_trace_ops
358 * with the setting of the ftrace_trace_function.
359 *
360 * Set the function to the list ops, which will call the
361 * function we want, albeit indirectly, but it handles the
362 * ftrace_ops and doesn't depend on function_trace_op.
363 */
364 ftrace_trace_function = ftrace_ops_list_func;
365 /*
366 * Make sure all CPUs see this. Yes this is slow, but static
367 * tracing is slow and nasty to have enabled.
368 */
369 schedule_on_each_cpu(ftrace_sync);
370 /* Now all cpus are using the list ops. */
371 function_trace_op = set_function_trace_op;
372 /* Make sure the function_trace_op is visible on all CPUs */
373 smp_wmb();
374 /* Nasty way to force a rmb on all cpus */
375 smp_call_function(ftrace_sync_ipi, NULL, 1);
376 /* OK, we are all set to update the ftrace_trace_function now! */
377#endif /* !CONFIG_DYNAMIC_FTRACE */
378
309 ftrace_trace_function = func; 379 ftrace_trace_function = func;
310} 380}
311 381
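The !CONFIG_DYNAMIC_FTRACE comment above describes a two-pointer publication problem. The sketch below restates only the final publication steps (the earlier switch to ftrace_ops_list_func and the schedule_on_each_cpu() pass are elided); the reader function is hypothetical and merely stands in for the arch mcount callback, which performs these loads without barriers of its own.

/* Writer side: publish the ops, force ordering everywhere, then switch. */
static void publish_static_trace_function(ftrace_func_t func)
{
	function_trace_op = set_function_trace_op;	/* 1. new ops visible  */
	smp_wmb();					/* 2. order the stores */
	smp_call_function(ftrace_sync_ipi, NULL, 1);	/* 3. rmb on all CPUs  */
	ftrace_trace_function = func;			/* 4. switch function  */
}

/*
 * Hypothetical reader (stand-in for the arch callback).  It has no
 * barrier of its own; the waited-for IPI in step 3 plays that role, so
 * once step 4 becomes observable the ops from step 1 already is.
 */
static void static_ftrace_reader(unsigned long ip, unsigned long parent_ip)
{
	ftrace_trace_function(ip, parent_ip, function_trace_op, NULL);
}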
@@ -410,17 +480,6 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
410 return 0; 480 return 0;
411} 481}
412 482
413static void ftrace_sync(struct work_struct *work)
414{
415 /*
416 * This function is just a stub to implement a hard force
417 * of synchronize_sched(). This requires synchronizing
418 * tasks even in userspace and idle.
419 *
420 * Yes, function tracing is rude.
421 */
422}
423
424static int __unregister_ftrace_function(struct ftrace_ops *ops) 483static int __unregister_ftrace_function(struct ftrace_ops *ops)
425{ 484{
426 int ret; 485 int ret;
@@ -439,20 +498,6 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
439 } else if (ops->flags & FTRACE_OPS_FL_CONTROL) { 498 } else if (ops->flags & FTRACE_OPS_FL_CONTROL) {
440 ret = remove_ftrace_list_ops(&ftrace_control_list, 499 ret = remove_ftrace_list_ops(&ftrace_control_list,
441 &control_ops, ops); 500 &control_ops, ops);
442 if (!ret) {
443 /*
444 * The ftrace_ops is now removed from the list,
445 * so there'll be no new users. We must ensure
446 * all current users are done before we free
447 * the control data.
448 * Note synchronize_sched() is not enough, as we
449 * use preempt_disable() to do RCU, but the function
450 * tracer can be called where RCU is not active
451 * (before user_exit()).
452 */
453 schedule_on_each_cpu(ftrace_sync);
454 control_ops_free(ops);
455 }
456 } else 501 } else
457 ret = remove_ftrace_ops(&ftrace_ops_list, ops); 502 ret = remove_ftrace_ops(&ftrace_ops_list, ops);
458 503
@@ -462,17 +507,6 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
462 if (ftrace_enabled) 507 if (ftrace_enabled)
463 update_ftrace_function(); 508 update_ftrace_function();
464 509
465 /*
466 * Dynamic ops may be freed, we must make sure that all
467 * callers are done before leaving this function.
468 *
469 * Again, normal synchronize_sched() is not good enough.
470 * We need to do a hard force of sched synchronization.
471 */
472 if (ops->flags & FTRACE_OPS_FL_DYNAMIC)
473 schedule_on_each_cpu(ftrace_sync);
474
475
476 return 0; 510 return 0;
477} 511}
478 512
@@ -1082,19 +1116,6 @@ static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
1082 1116
1083static struct pid * const ftrace_swapper_pid = &init_struct_pid; 1117static struct pid * const ftrace_swapper_pid = &init_struct_pid;
1084 1118
1085loff_t
1086ftrace_filter_lseek(struct file *file, loff_t offset, int whence)
1087{
1088 loff_t ret;
1089
1090 if (file->f_mode & FMODE_READ)
1091 ret = seq_lseek(file, offset, whence);
1092 else
1093 file->f_pos = ret = 1;
1094
1095 return ret;
1096}
1097
1098#ifdef CONFIG_DYNAMIC_FTRACE 1119#ifdef CONFIG_DYNAMIC_FTRACE
1099 1120
1100#ifndef CONFIG_FTRACE_MCOUNT_RECORD 1121#ifndef CONFIG_FTRACE_MCOUNT_RECORD
@@ -1992,8 +2013,14 @@ void ftrace_modify_all_code(int command)
1992 else if (command & FTRACE_DISABLE_CALLS) 2013 else if (command & FTRACE_DISABLE_CALLS)
1993 ftrace_replace_code(0); 2014 ftrace_replace_code(0);
1994 2015
1995 if (update && ftrace_trace_function != ftrace_ops_list_func) 2016 if (update && ftrace_trace_function != ftrace_ops_list_func) {
2017 function_trace_op = set_function_trace_op;
2018 smp_wmb();
2019 /* If irqs are disabled, we are in stop machine */
2020 if (!irqs_disabled())
2021 smp_call_function(ftrace_sync_ipi, NULL, 1);
1996 ftrace_update_ftrace_func(ftrace_trace_function); 2022 ftrace_update_ftrace_func(ftrace_trace_function);
2023 }
1997 2024
1998 if (command & FTRACE_START_FUNC_RET) 2025 if (command & FTRACE_START_FUNC_RET)
1999 ftrace_enable_ftrace_graph_caller(); 2026 ftrace_enable_ftrace_graph_caller();
@@ -2156,10 +2183,41 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
2156 command |= FTRACE_UPDATE_TRACE_FUNC; 2183 command |= FTRACE_UPDATE_TRACE_FUNC;
2157 } 2184 }
2158 2185
2159 if (!command || !ftrace_enabled) 2186 if (!command || !ftrace_enabled) {
2187 /*
2188 * If these are control ops, they still need their
2189 * per_cpu field freed. Since, function tracing is
2190 * not currently active, we can just free them
2191 * without synchronizing all CPUs.
2192 */
2193 if (ops->flags & FTRACE_OPS_FL_CONTROL)
2194 control_ops_free(ops);
2160 return 0; 2195 return 0;
2196 }
2161 2197
2162 ftrace_run_update_code(command); 2198 ftrace_run_update_code(command);
2199
2200 /*
2201 * Dynamic ops may be freed, we must make sure that all
2202 * callers are done before leaving this function.
2203 * The same goes for freeing the per_cpu data of the control
2204 * ops.
2205 *
2206 * Again, normal synchronize_sched() is not good enough.
2207 * We need to do a hard force of sched synchronization.
2208 * This is because we use preempt_disable() to do RCU, but
2209 * the function tracers can be called where RCU is not watching
2210 * (like before user_exit()). We can not rely on the RCU
2211 * infrastructure to do the synchronization, thus we must do it
2212 * ourselves.
2213 */
2214 if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_CONTROL)) {
2215 schedule_on_each_cpu(ftrace_sync);
2216
2217 if (ops->flags & FTRACE_OPS_FL_CONTROL)
2218 control_ops_free(ops);
2219 }
2220
2163 return 0; 2221 return 0;
2164} 2222}
2165 2223
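From a caller's point of view, the comment above means the hard synchronization now happens inside ftrace_shutdown() rather than in __unregister_ftrace_function(). A hedged usage sketch follows (the ops and callback are hypothetical; it assumes the ops was heap- or module-allocated, which is what marks it FTRACE_OPS_FL_DYNAMIC at registration time and so enables the sync path above):

static void my_trace_func(unsigned long ip, unsigned long parent_ip,
			  struct ftrace_ops *op, struct pt_regs *regs)
{
	/* may run with preemption disabled and with RCU not watching */
}

static void my_tracer_teardown(struct ftrace_ops *ops)
{
	/*
	 * By the time this returns, ftrace_shutdown() has already run
	 * schedule_on_each_cpu(ftrace_sync) for DYNAMIC/CONTROL ops,
	 * so no CPU can still be inside my_trace_func().
	 */
	unregister_ftrace_function(ops);
	kfree(ops);
}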
@@ -2739,7 +2797,7 @@ static void ftrace_filter_reset(struct ftrace_hash *hash)
2739 * routine, you can use ftrace_filter_write() for the write 2797 * routine, you can use ftrace_filter_write() for the write
2740 * routine if @flag has FTRACE_ITER_FILTER set, or 2798 * routine if @flag has FTRACE_ITER_FILTER set, or
2741 * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set. 2799 * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set.
2742 * ftrace_filter_lseek() should be used as the lseek routine, and 2800 * tracing_lseek() should be used as the lseek routine, and
2743 * release must call ftrace_regex_release(). 2801 * release must call ftrace_regex_release().
2744 */ 2802 */
2745int 2803int
@@ -3767,7 +3825,7 @@ static const struct file_operations ftrace_filter_fops = {
3767 .open = ftrace_filter_open, 3825 .open = ftrace_filter_open,
3768 .read = seq_read, 3826 .read = seq_read,
3769 .write = ftrace_filter_write, 3827 .write = ftrace_filter_write,
3770 .llseek = ftrace_filter_lseek, 3828 .llseek = tracing_lseek,
3771 .release = ftrace_regex_release, 3829 .release = ftrace_regex_release,
3772}; 3830};
3773 3831
@@ -3775,7 +3833,7 @@ static const struct file_operations ftrace_notrace_fops = {
3775 .open = ftrace_notrace_open, 3833 .open = ftrace_notrace_open,
3776 .read = seq_read, 3834 .read = seq_read,
3777 .write = ftrace_notrace_write, 3835 .write = ftrace_notrace_write,
3778 .llseek = ftrace_filter_lseek, 3836 .llseek = tracing_lseek,
3779 .release = ftrace_regex_release, 3837 .release = ftrace_regex_release,
3780}; 3838};
3781 3839
@@ -4038,7 +4096,7 @@ static const struct file_operations ftrace_graph_fops = {
4038 .open = ftrace_graph_open, 4096 .open = ftrace_graph_open,
4039 .read = seq_read, 4097 .read = seq_read,
4040 .write = ftrace_graph_write, 4098 .write = ftrace_graph_write,
4041 .llseek = ftrace_filter_lseek, 4099 .llseek = tracing_lseek,
4042 .release = ftrace_graph_release, 4100 .release = ftrace_graph_release,
4043}; 4101};
4044 4102
@@ -4046,7 +4104,7 @@ static const struct file_operations ftrace_graph_notrace_fops = {
4046 .open = ftrace_graph_notrace_open, 4104 .open = ftrace_graph_notrace_open,
4047 .read = seq_read, 4105 .read = seq_read,
4048 .write = ftrace_graph_write, 4106 .write = ftrace_graph_write,
4049 .llseek = ftrace_filter_lseek, 4107 .llseek = tracing_lseek,
4050 .release = ftrace_graph_release, 4108 .release = ftrace_graph_release,
4051}; 4109};
4052#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 4110#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
@@ -4719,7 +4777,7 @@ static const struct file_operations ftrace_pid_fops = {
4719 .open = ftrace_pid_open, 4777 .open = ftrace_pid_open,
4720 .write = ftrace_pid_write, 4778 .write = ftrace_pid_write,
4721 .read = seq_read, 4779 .read = seq_read,
4722 .llseek = ftrace_filter_lseek, 4780 .llseek = tracing_lseek,
4723 .release = ftrace_pid_release, 4781 .release = ftrace_pid_release,
4724}; 4782};
4725 4783
@@ -4862,6 +4920,7 @@ int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
4862trace_func_graph_ret_t ftrace_graph_return = 4920trace_func_graph_ret_t ftrace_graph_return =
4863 (trace_func_graph_ret_t)ftrace_stub; 4921 (trace_func_graph_ret_t)ftrace_stub;
4864trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub; 4922trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub;
4923static trace_func_graph_ent_t __ftrace_graph_entry = ftrace_graph_entry_stub;
4865 4924
4866/* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */ 4925/* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */
4867static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) 4926static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
@@ -5003,6 +5062,30 @@ static struct ftrace_ops fgraph_ops __read_mostly = {
5003 FTRACE_OPS_FL_RECURSION_SAFE, 5062 FTRACE_OPS_FL_RECURSION_SAFE,
5004}; 5063};
5005 5064
5065static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace)
5066{
5067 if (!ftrace_ops_test(&global_ops, trace->func, NULL))
5068 return 0;
5069 return __ftrace_graph_entry(trace);
5070}
5071
5072/*
5073 * The function graph tracer should only trace the functions defined
5074 * by set_ftrace_filter and set_ftrace_notrace. If another function
5075 * tracer ops is registered, the graph tracer requires testing the
5076 * tracer ops is registered, the graph tracer requires testing the
5077 * function against the global ops, rather than tracing any function
5078 * that any ftrace_ops has registered.
5078 */
5079static void update_function_graph_func(void)
5080{
5081 if (ftrace_ops_list == &ftrace_list_end ||
5082 (ftrace_ops_list == &global_ops &&
5083 global_ops.next == &ftrace_list_end))
5084 ftrace_graph_entry = __ftrace_graph_entry;
5085 else
5086 ftrace_graph_entry = ftrace_graph_entry_test;
5087}
5088
5006int register_ftrace_graph(trace_func_graph_ret_t retfunc, 5089int register_ftrace_graph(trace_func_graph_ret_t retfunc,
5007 trace_func_graph_ent_t entryfunc) 5090 trace_func_graph_ent_t entryfunc)
5008{ 5091{
@@ -5027,7 +5110,16 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
5027 } 5110 }
5028 5111
5029 ftrace_graph_return = retfunc; 5112 ftrace_graph_return = retfunc;
5030 ftrace_graph_entry = entryfunc; 5113
5114 /*
5115 * Update the indirect function to the entryfunc, and the
5116 * function that gets called to the entry_test first. Then
5117 * call the update fgraph entry function to determine if
5118 * the entryfunc should be called directly or not.
5119 */
5120 __ftrace_graph_entry = entryfunc;
5121 ftrace_graph_entry = ftrace_graph_entry_test;
5122 update_function_graph_func();
5031 5123
5032 ret = ftrace_startup(&fgraph_ops, FTRACE_START_FUNC_RET); 5124 ret = ftrace_startup(&fgraph_ops, FTRACE_START_FUNC_RET);
5033 5125
@@ -5046,6 +5138,7 @@ void unregister_ftrace_graph(void)
5046 ftrace_graph_active--; 5138 ftrace_graph_active--;
5047 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; 5139 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
5048 ftrace_graph_entry = ftrace_graph_entry_stub; 5140 ftrace_graph_entry = ftrace_graph_entry_stub;
5141 __ftrace_graph_entry = ftrace_graph_entry_stub;
5049 ftrace_shutdown(&fgraph_ops, FTRACE_STOP_FUNC_RET); 5142 ftrace_shutdown(&fgraph_ops, FTRACE_STOP_FUNC_RET);
5050 unregister_pm_notifier(&ftrace_suspend_notifier); 5143 unregister_pm_notifier(&ftrace_suspend_notifier);
5051 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); 5144 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
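The update_function_graph_func() machinery above is invisible to users of the graph API; a caller still registers plain entry/return callbacks. A minimal usage sketch (callback names are invented) whose entryfunc, after this change, is saved in __ftrace_graph_entry and reached through ftrace_graph_entry_test(), i.e. filtered against global_ops, whenever another ftrace_ops is also registered:

static int my_graph_entry(struct ftrace_graph_ent *trace)
{
	return 1;	/* trace this function's children */
}

static void my_graph_return(struct ftrace_graph_ret *trace)
{
	/* called when the traced function returns */
}

static int __init my_graph_tracer_init(void)
{
	/* entryfunc is now indirected through the test wrapper above */
	return register_ftrace_graph(my_graph_return, my_graph_entry);
}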
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index cc2f66f68dc5..fc4da2d97f9b 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2397,6 +2397,13 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
2397 write &= RB_WRITE_MASK; 2397 write &= RB_WRITE_MASK;
2398 tail = write - length; 2398 tail = write - length;
2399 2399
2400 /*
2401 * If this is the first commit on the page, then it has the same
2402 * timestamp as the page itself.
2403 */
2404 if (!tail)
2405 delta = 0;
2406
2400	/* See if we shot past the end of this buffer page */ 2407	/* See if we shot past the end of this buffer page */
2401 if (unlikely(write > BUF_PAGE_SIZE)) 2408 if (unlikely(write > BUF_PAGE_SIZE))
2402 return rb_move_tail(cpu_buffer, length, tail, 2409 return rb_move_tail(cpu_buffer, length, tail,
@@ -2558,7 +2565,7 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2558 if (unlikely(test_time_stamp(delta))) { 2565 if (unlikely(test_time_stamp(delta))) {
2559 int local_clock_stable = 1; 2566 int local_clock_stable = 1;
2560#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK 2567#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
2561 local_clock_stable = sched_clock_stable; 2568 local_clock_stable = sched_clock_stable();
2562#endif 2569#endif
2563 WARN_ONCE(delta > (1ULL << 59), 2570 WARN_ONCE(delta > (1ULL << 59),
2564 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", 2571 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s",
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 9d20cd9743ef..815c878f409b 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -455,6 +455,9 @@ int __trace_puts(unsigned long ip, const char *str, int size)
455 unsigned long irq_flags; 455 unsigned long irq_flags;
456 int alloc; 456 int alloc;
457 457
458 if (unlikely(tracing_selftest_running || tracing_disabled))
459 return 0;
460
458 alloc = sizeof(*entry) + size + 2; /* possible \n added */ 461 alloc = sizeof(*entry) + size + 2; /* possible \n added */
459 462
460 local_save_flags(irq_flags); 463 local_save_flags(irq_flags);
@@ -495,6 +498,9 @@ int __trace_bputs(unsigned long ip, const char *str)
495 unsigned long irq_flags; 498 unsigned long irq_flags;
496 int size = sizeof(struct bputs_entry); 499 int size = sizeof(struct bputs_entry);
497 500
501 if (unlikely(tracing_selftest_running || tracing_disabled))
502 return 0;
503
498 local_save_flags(irq_flags); 504 local_save_flags(irq_flags);
499 buffer = global_trace.trace_buffer.buffer; 505 buffer = global_trace.trace_buffer.buffer;
500 event = trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size, 506 event = trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size,
@@ -595,6 +601,28 @@ void free_snapshot(struct trace_array *tr)
595} 601}
596 602
597/** 603/**
604 * tracing_alloc_snapshot - allocate snapshot buffer.
605 *
606 * This only allocates the snapshot buffer if it isn't already
607 * allocated - it doesn't also take a snapshot.
608 *
609 * This is meant to be used in cases where the snapshot buffer needs
610 * to be set up for events that can't sleep but need to be able to
611 * trigger a snapshot.
612 */
613int tracing_alloc_snapshot(void)
614{
615 struct trace_array *tr = &global_trace;
616 int ret;
617
618 ret = alloc_snapshot(tr);
619 WARN_ON(ret < 0);
620
621 return ret;
622}
623EXPORT_SYMBOL_GPL(tracing_alloc_snapshot);
624
625/**
598 * trace_snapshot_alloc - allocate and take a snapshot of the current buffer. 626 * trace_snapshot_alloc - allocate and take a snapshot of the current buffer.
599 * 627 *
600 * This is similar to trace_snapshot(), but it will allocate the 628 * This is similar to trace_snapshot(), but it will allocate the
@@ -607,11 +635,10 @@ void free_snapshot(struct trace_array *tr)
607 */ 635 */
608void tracing_snapshot_alloc(void) 636void tracing_snapshot_alloc(void)
609{ 637{
610 struct trace_array *tr = &global_trace;
611 int ret; 638 int ret;
612 639
613 ret = alloc_snapshot(tr); 640 ret = tracing_alloc_snapshot();
614 if (WARN_ON(ret < 0)) 641 if (ret < 0)
615 return; 642 return;
616 643
617 tracing_snapshot(); 644 tracing_snapshot();
@@ -623,6 +650,12 @@ void tracing_snapshot(void)
623 WARN_ONCE(1, "Snapshot feature not enabled, but internal snapshot used"); 650 WARN_ONCE(1, "Snapshot feature not enabled, but internal snapshot used");
624} 651}
625EXPORT_SYMBOL_GPL(tracing_snapshot); 652EXPORT_SYMBOL_GPL(tracing_snapshot);
653int tracing_alloc_snapshot(void)
654{
655 WARN_ONCE(1, "Snapshot feature not enabled, but snapshot allocation used");
656 return -ENODEV;
657}
658EXPORT_SYMBOL_GPL(tracing_alloc_snapshot);
626void tracing_snapshot_alloc(void) 659void tracing_snapshot_alloc(void)
627{ 660{
628 /* Give warning */ 661 /* Give warning */
@@ -3156,19 +3189,23 @@ tracing_write_stub(struct file *filp, const char __user *ubuf,
3156 return count; 3189 return count;
3157} 3190}
3158 3191
3159static loff_t tracing_seek(struct file *file, loff_t offset, int origin) 3192loff_t tracing_lseek(struct file *file, loff_t offset, int whence)
3160{ 3193{
3194 int ret;
3195
3161 if (file->f_mode & FMODE_READ) 3196 if (file->f_mode & FMODE_READ)
3162 return seq_lseek(file, offset, origin); 3197 ret = seq_lseek(file, offset, whence);
3163 else 3198 else
3164 return 0; 3199 file->f_pos = ret = 0;
3200
3201 return ret;
3165} 3202}
3166 3203
3167static const struct file_operations tracing_fops = { 3204static const struct file_operations tracing_fops = {
3168 .open = tracing_open, 3205 .open = tracing_open,
3169 .read = seq_read, 3206 .read = seq_read,
3170 .write = tracing_write_stub, 3207 .write = tracing_write_stub,
3171 .llseek = tracing_seek, 3208 .llseek = tracing_lseek,
3172 .release = tracing_release, 3209 .release = tracing_release,
3173}; 3210};
3174 3211
@@ -3488,60 +3525,103 @@ static const char readme_msg[] =
3488 " instances\t\t- Make sub-buffers with: mkdir instances/foo\n" 3525 " instances\t\t- Make sub-buffers with: mkdir instances/foo\n"
3489 "\t\t\t Remove sub-buffer with rmdir\n" 3526 "\t\t\t Remove sub-buffer with rmdir\n"
3490 " trace_options\t\t- Set format or modify how tracing happens\n" 3527 " trace_options\t\t- Set format or modify how tracing happens\n"
3491 "\t\t\t Disable an option by adding a suffix 'no' to the option name\n" 3528 "\t\t\t Disable an option by adding a suffix 'no' to the\n"
3529 "\t\t\t option name\n"
3492#ifdef CONFIG_DYNAMIC_FTRACE 3530#ifdef CONFIG_DYNAMIC_FTRACE
3493 "\n available_filter_functions - list of functions that can be filtered on\n" 3531 "\n available_filter_functions - list of functions that can be filtered on\n"
3494 " set_ftrace_filter\t- echo function name in here to only trace these functions\n" 3532 " set_ftrace_filter\t- echo function name in here to only trace these\n"
3495 " accepts: func_full_name, *func_end, func_begin*, *func_middle*\n" 3533 "\t\t\t functions\n"
3496 " modules: Can select a group via module\n" 3534 "\t accepts: func_full_name, *func_end, func_begin*, *func_middle*\n"
3497 " Format: :mod:<module-name>\n" 3535 "\t modules: Can select a group via module\n"
3498 " example: echo :mod:ext3 > set_ftrace_filter\n" 3536 "\t Format: :mod:<module-name>\n"
3499 " triggers: a command to perform when function is hit\n" 3537 "\t example: echo :mod:ext3 > set_ftrace_filter\n"
3500 " Format: <function>:<trigger>[:count]\n" 3538 "\t triggers: a command to perform when function is hit\n"
3501 " trigger: traceon, traceoff\n" 3539 "\t Format: <function>:<trigger>[:count]\n"
3502 " enable_event:<system>:<event>\n" 3540 "\t trigger: traceon, traceoff\n"
3503 " disable_event:<system>:<event>\n" 3541 "\t\t enable_event:<system>:<event>\n"
3542 "\t\t disable_event:<system>:<event>\n"
3504#ifdef CONFIG_STACKTRACE 3543#ifdef CONFIG_STACKTRACE
3505 " stacktrace\n" 3544 "\t\t stacktrace\n"
3506#endif 3545#endif
3507#ifdef CONFIG_TRACER_SNAPSHOT 3546#ifdef CONFIG_TRACER_SNAPSHOT
3508 " snapshot\n" 3547 "\t\t snapshot\n"
3509#endif 3548#endif
3510 " example: echo do_fault:traceoff > set_ftrace_filter\n" 3549 "\t example: echo do_fault:traceoff > set_ftrace_filter\n"
3511 " echo do_trap:traceoff:3 > set_ftrace_filter\n" 3550 "\t echo do_trap:traceoff:3 > set_ftrace_filter\n"
3512 " The first one will disable tracing every time do_fault is hit\n" 3551 "\t The first one will disable tracing every time do_fault is hit\n"
3513 " The second will disable tracing at most 3 times when do_trap is hit\n" 3552 "\t The second will disable tracing at most 3 times when do_trap is hit\n"
3514 " The first time do trap is hit and it disables tracing, the counter\n" 3553 "\t The first time do trap is hit and it disables tracing, the\n"
3515 " will decrement to 2. If tracing is already disabled, the counter\n" 3554 "\t counter will decrement to 2. If tracing is already disabled,\n"
3516 " will not decrement. It only decrements when the trigger did work\n" 3555 "\t the counter will not decrement. It only decrements when the\n"
3517 " To remove trigger without count:\n" 3556 "\t trigger did work\n"
3518 " echo '!<function>:<trigger> > set_ftrace_filter\n" 3557 "\t To remove trigger without count:\n"
3519 " To remove trigger with a count:\n" 3558 "\t echo '!<function>:<trigger> > set_ftrace_filter\n"
3520 " echo '!<function>:<trigger>:0 > set_ftrace_filter\n" 3559 "\t To remove trigger with a count:\n"
3560 "\t echo '!<function>:<trigger>:0 > set_ftrace_filter\n"
3521 " set_ftrace_notrace\t- echo function name in here to never trace.\n" 3561 " set_ftrace_notrace\t- echo function name in here to never trace.\n"
3522 " accepts: func_full_name, *func_end, func_begin*, *func_middle*\n" 3562 "\t accepts: func_full_name, *func_end, func_begin*, *func_middle*\n"
3523 " modules: Can select a group via module command :mod:\n" 3563 "\t modules: Can select a group via module command :mod:\n"
3524 " Does not accept triggers\n" 3564 "\t Does not accept triggers\n"
3525#endif /* CONFIG_DYNAMIC_FTRACE */ 3565#endif /* CONFIG_DYNAMIC_FTRACE */
3526#ifdef CONFIG_FUNCTION_TRACER 3566#ifdef CONFIG_FUNCTION_TRACER
3527 " set_ftrace_pid\t- Write pid(s) to only function trace those pids (function)\n" 3567 " set_ftrace_pid\t- Write pid(s) to only function trace those pids\n"
3568 "\t\t (function)\n"
3528#endif 3569#endif
3529#ifdef CONFIG_FUNCTION_GRAPH_TRACER 3570#ifdef CONFIG_FUNCTION_GRAPH_TRACER
3530 " set_graph_function\t- Trace the nested calls of a function (function_graph)\n" 3571 " set_graph_function\t- Trace the nested calls of a function (function_graph)\n"
3531 " max_graph_depth\t- Trace a limited depth of nested calls (0 is unlimited)\n" 3572 " max_graph_depth\t- Trace a limited depth of nested calls (0 is unlimited)\n"
3532#endif 3573#endif
3533#ifdef CONFIG_TRACER_SNAPSHOT 3574#ifdef CONFIG_TRACER_SNAPSHOT
3534 "\n snapshot\t\t- Like 'trace' but shows the content of the static snapshot buffer\n" 3575 "\n snapshot\t\t- Like 'trace' but shows the content of the static\n"
3535 "\t\t\t Read the contents for more information\n" 3576 "\t\t\t snapshot buffer. Read the contents for more\n"
3577 "\t\t\t information\n"
3536#endif 3578#endif
3537#ifdef CONFIG_STACK_TRACER 3579#ifdef CONFIG_STACK_TRACER
3538 " stack_trace\t\t- Shows the max stack trace when active\n" 3580 " stack_trace\t\t- Shows the max stack trace when active\n"
3539 " stack_max_size\t- Shows current max stack size that was traced\n" 3581 " stack_max_size\t- Shows current max stack size that was traced\n"
3540 "\t\t\t Write into this file to reset the max size (trigger a new trace)\n" 3582 "\t\t\t Write into this file to reset the max size (trigger a\n"
3583 "\t\t\t new trace)\n"
3541#ifdef CONFIG_DYNAMIC_FTRACE 3584#ifdef CONFIG_DYNAMIC_FTRACE
3542 " stack_trace_filter\t- Like set_ftrace_filter but limits what stack_trace traces\n" 3585 " stack_trace_filter\t- Like set_ftrace_filter but limits what stack_trace\n"
3586 "\t\t\t traces\n"
3543#endif 3587#endif
3544#endif /* CONFIG_STACK_TRACER */ 3588#endif /* CONFIG_STACK_TRACER */
3589 " events/\t\t- Directory containing all trace event subsystems:\n"
3590 " enable\t\t- Write 0/1 to enable/disable tracing of all events\n"
3591 " events/<system>/\t- Directory containing all trace events for <system>:\n"
3592 " enable\t\t- Write 0/1 to enable/disable tracing of all <system>\n"
3593 "\t\t\t events\n"
3594 " filter\t\t- If set, only events passing filter are traced\n"
3595 " events/<system>/<event>/\t- Directory containing control files for\n"
3596 "\t\t\t <event>:\n"
3597 " enable\t\t- Write 0/1 to enable/disable tracing of <event>\n"
3598 " filter\t\t- If set, only events passing filter are traced\n"
3599 " trigger\t\t- If set, a command to perform when event is hit\n"
3600 "\t Format: <trigger>[:count][if <filter>]\n"
3601 "\t trigger: traceon, traceoff\n"
3602 "\t enable_event:<system>:<event>\n"
3603 "\t disable_event:<system>:<event>\n"
3604#ifdef CONFIG_STACKTRACE
3605 "\t\t stacktrace\n"
3606#endif
3607#ifdef CONFIG_TRACER_SNAPSHOT
3608 "\t\t snapshot\n"
3609#endif
3610 "\t example: echo traceoff > events/block/block_unplug/trigger\n"
3611 "\t echo traceoff:3 > events/block/block_unplug/trigger\n"
3612 "\t echo 'enable_event:kmem:kmalloc:3 if nr_rq > 1' > \\\n"
3613 "\t events/block/block_unplug/trigger\n"
3614 "\t The first disables tracing every time block_unplug is hit.\n"
3615 "\t The second disables tracing the first 3 times block_unplug is hit.\n"
3616 "\t The third enables the kmalloc event the first 3 times block_unplug\n"
3617 "\t is hit and has value of greater than 1 for the 'nr_rq' event field.\n"
3618 "\t Like function triggers, the counter is only decremented if it\n"
3619 "\t enabled or disabled tracing.\n"
3620 "\t To remove a trigger without a count:\n"
3621 "\t echo '!<trigger> > <system>/<event>/trigger\n"
3622 "\t To remove a trigger with a count:\n"
3623 "\t echo '!<trigger>:0 > <system>/<event>/trigger\n"
3624 "\t Filters can be ignored when removing a trigger.\n"
3545; 3625;
3546 3626
3547static ssize_t 3627static ssize_t
@@ -4212,12 +4292,6 @@ out:
4212 return sret; 4292 return sret;
4213} 4293}
4214 4294
4215static void tracing_pipe_buf_release(struct pipe_inode_info *pipe,
4216 struct pipe_buffer *buf)
4217{
4218 __free_page(buf->page);
4219}
4220
4221static void tracing_spd_release_pipe(struct splice_pipe_desc *spd, 4295static void tracing_spd_release_pipe(struct splice_pipe_desc *spd,
4222 unsigned int idx) 4296 unsigned int idx)
4223{ 4297{
@@ -4229,7 +4303,7 @@ static const struct pipe_buf_operations tracing_pipe_buf_ops = {
4229 .map = generic_pipe_buf_map, 4303 .map = generic_pipe_buf_map,
4230 .unmap = generic_pipe_buf_unmap, 4304 .unmap = generic_pipe_buf_unmap,
4231 .confirm = generic_pipe_buf_confirm, 4305 .confirm = generic_pipe_buf_confirm,
4232 .release = tracing_pipe_buf_release, 4306 .release = generic_pipe_buf_release,
4233 .steal = generic_pipe_buf_steal, 4307 .steal = generic_pipe_buf_steal,
4234 .get = generic_pipe_buf_get, 4308 .get = generic_pipe_buf_get,
4235}; 4309};
@@ -4913,7 +4987,7 @@ static const struct file_operations snapshot_fops = {
4913 .open = tracing_snapshot_open, 4987 .open = tracing_snapshot_open,
4914 .read = seq_read, 4988 .read = seq_read,
4915 .write = tracing_snapshot_write, 4989 .write = tracing_snapshot_write,
4916 .llseek = tracing_seek, 4990 .llseek = tracing_lseek,
4917 .release = tracing_snapshot_release, 4991 .release = tracing_snapshot_release,
4918}; 4992};
4919 4993
@@ -5883,6 +5957,8 @@ allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size
5883 5957
5884 rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0; 5958 rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0;
5885 5959
5960 buf->tr = tr;
5961
5886 buf->buffer = ring_buffer_alloc(size, rb_flags); 5962 buf->buffer = ring_buffer_alloc(size, rb_flags);
5887 if (!buf->buffer) 5963 if (!buf->buffer)
5888 return -ENOMEM; 5964 return -ENOMEM;
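The kerneldoc for tracing_alloc_snapshot() above implies a two-phase usage: allocate the buffer from a context that may sleep, then take snapshots later from one that may not. A hedged sketch (the function names are hypothetical):

static int __init my_watchpoint_setup(void)
{
	int ret;

	/* May sleep while the snapshot buffer is allocated. */
	ret = tracing_alloc_snapshot();
	if (ret < 0)
		return ret;
	return 0;
}

/* Called later from atomic context, e.g. an event trigger or an IRQ. */
static void my_watchpoint_hit(void)
{
	tracing_snapshot();	/* buffer already exists, safe here */
}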
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index ea189e027b80..02b592f2d4b7 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1,3 +1,4 @@
1
1#ifndef _LINUX_KERNEL_TRACE_H 2#ifndef _LINUX_KERNEL_TRACE_H
2#define _LINUX_KERNEL_TRACE_H 3#define _LINUX_KERNEL_TRACE_H
3 4
@@ -587,6 +588,8 @@ void tracing_start_sched_switch_record(void);
587int register_tracer(struct tracer *type); 588int register_tracer(struct tracer *type);
588int is_tracing_stopped(void); 589int is_tracing_stopped(void);
589 590
591loff_t tracing_lseek(struct file *file, loff_t offset, int whence);
592
590extern cpumask_var_t __read_mostly tracing_buffer_mask; 593extern cpumask_var_t __read_mostly tracing_buffer_mask;
591 594
592#define for_each_tracing_cpu(cpu) \ 595#define for_each_tracing_cpu(cpu) \
@@ -1020,6 +1023,10 @@ extern int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,
1020extern void print_subsystem_event_filter(struct event_subsystem *system, 1023extern void print_subsystem_event_filter(struct event_subsystem *system,
1021 struct trace_seq *s); 1024 struct trace_seq *s);
1022extern int filter_assign_type(const char *type); 1025extern int filter_assign_type(const char *type);
1026extern int create_event_filter(struct ftrace_event_call *call,
1027 char *filter_str, bool set_str,
1028 struct event_filter **filterp);
1029extern void free_event_filter(struct event_filter *filter);
1023 1030
1024struct ftrace_event_field * 1031struct ftrace_event_field *
1025trace_find_event_field(struct ftrace_event_call *call, char *name); 1032trace_find_event_field(struct ftrace_event_call *call, char *name);
@@ -1028,9 +1035,195 @@ extern void trace_event_enable_cmd_record(bool enable);
1028extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr); 1035extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr);
1029extern int event_trace_del_tracer(struct trace_array *tr); 1036extern int event_trace_del_tracer(struct trace_array *tr);
1030 1037
1038extern struct ftrace_event_file *find_event_file(struct trace_array *tr,
1039 const char *system,
1040 const char *event);
1041
1042static inline void *event_file_data(struct file *filp)
1043{
1044 return ACCESS_ONCE(file_inode(filp)->i_private);
1045}
1046
1031extern struct mutex event_mutex; 1047extern struct mutex event_mutex;
1032extern struct list_head ftrace_events; 1048extern struct list_head ftrace_events;
1033 1049
1050extern const struct file_operations event_trigger_fops;
1051
1052extern int register_trigger_cmds(void);
1053extern void clear_event_triggers(struct trace_array *tr);
1054
1055struct event_trigger_data {
1056 unsigned long count;
1057 int ref;
1058 struct event_trigger_ops *ops;
1059 struct event_command *cmd_ops;
1060 struct event_filter __rcu *filter;
1061 char *filter_str;
1062 void *private_data;
1063 struct list_head list;
1064};
1065
1066/**
1067 * struct event_trigger_ops - callbacks for trace event triggers
1068 *
1069 * The methods in this structure provide per-event trigger hooks for
1070 * various trigger operations.
1071 *
1072 * All the methods below, except for @init() and @free(), must be
1073 * implemented.
1074 *
1075 * @func: The trigger 'probe' function called when the triggering
1076 * event occurs. The data passed into this callback is the data
1077 * that was supplied to the event_command @reg() function that
1078 * registered the trigger (see struct event_command).
1079 *
1080 * @init: An optional initialization function called for the trigger
1081 * when the trigger is registered (via the event_command reg()
1082 * function). This can be used to perform per-trigger
1083 * initialization such as incrementing a per-trigger reference
1084 * count, for instance. This is usually implemented by the
1085 * generic utility function @event_trigger_init() (see
1086 * trace_event_triggers.c).
1087 *
1088 * @free: An optional de-initialization function called for the
1089 * trigger when the trigger is unregistered (via the
1090 * event_command @unreg() function). This can be used to perform
1091 * per-trigger de-initialization such as decrementing a
1092 * per-trigger reference count and freeing corresponding trigger
1093 * data, for instance. This is usually implemented by the
1094 * generic utility function @event_trigger_free() (see
1095 * trace_event_triggers.c).
1096 *
1097 * @print: The callback function invoked to have the trigger print
1098 * itself. This is usually implemented by a wrapper function
1099 * that calls the generic utility function @event_trigger_print()
1100 * (see trace_event_triggers.c).
1101 */
1102struct event_trigger_ops {
1103 void (*func)(struct event_trigger_data *data);
1104 int (*init)(struct event_trigger_ops *ops,
1105 struct event_trigger_data *data);
1106 void (*free)(struct event_trigger_ops *ops,
1107 struct event_trigger_data *data);
1108 int (*print)(struct seq_file *m,
1109 struct event_trigger_ops *ops,
1110 struct event_trigger_data *data);
1111};
1112
1113/**
1114 * struct event_command - callbacks and data members for event commands
1115 *
1116 * Event commands are invoked by users by writing the command name
1117 * into the 'trigger' file associated with a trace event. The
1118 * parameters associated with a specific invocation of an event
1119 * command are used to create an event trigger instance, which is
1120 * added to the list of trigger instances associated with that trace
1121 * event. When the event is hit, the set of triggers associated with
1122 * that event is invoked.
1123 *
1124 * The data members in this structure provide per-event command data
1125 * for various event commands.
1126 *
1127 * All the data members below, except for @post_trigger, must be set
1128 * for each event command.
1129 *
1130 * @name: The unique name that identifies the event command. This is
1131 * the name used when setting triggers via trigger files.
1132 *
1133 * @trigger_type: A unique id that identifies the event command
1134 * 'type'. This value has two purposes, the first to ensure that
1135 * only one trigger of the same type can be set at a given time
1136 * for a particular event e.g. it doesn't make sense to have both
1137 * a traceon and traceoff trigger attached to a single event at
1138 * the same time, so traceon and traceoff have the same type
1139 * though they have different names. The @trigger_type value is
1140 * also used as a bit value for deferring the actual trigger
1141 * action until after the current event is finished. Some
1142 * commands need to do this if they themselves log to the trace
1143 * buffer (see the @post_trigger() member below). @trigger_type
1144 * values are defined by adding new values to the trigger_type
1145 * enum in include/linux/ftrace_event.h.
1146 *
1147 * @post_trigger: A flag that says whether or not this command needs
1148 * to have its action delayed until after the current event has
1149 * been closed. Some triggers need to avoid being invoked while
1150 * an event is currently in the process of being logged, since
1151 * the trigger may itself log data into the trace buffer. Thus
1152 * we make sure the current event is committed before invoking
1153 * those triggers. To do that, the trigger invocation is split
1154 * in two - the first part checks the filter using the current
1155 * trace record; if a command has the @post_trigger flag set, it
1156 * sets a bit for itself in the return value, otherwise it
1157 * directly invokes the trigger. Once all commands have been
1158 * either invoked or set their return flag, the current record is
1159 * either committed or discarded. At that point, if any commands
1160 * have deferred their triggers, those commands are finally
1161 * invoked following the close of the current event. In other
1162 * words, if the event_trigger_ops @func() probe implementation
1163 * itself logs to the trace buffer, this flag should be set,
1164 * otherwise it can be left unspecified.
1165 *
1166 * All the methods below, except for @set_filter(), must be
1167 * implemented.
1168 *
1169 * @func: The callback function responsible for parsing and
1170 * registering the trigger written to the 'trigger' file by the
1171 * user. It allocates the trigger instance and registers it with
1172 * the appropriate trace event. It makes use of the other
1173 * event_command callback functions to orchestrate this, and is
1174 * usually implemented by the generic utility function
1175 * @event_trigger_callback() (see trace_event_triggers.c).
1176 *
1177 * @reg: Adds the trigger to the list of triggers associated with the
1178 * event, and enables the event trigger itself, after
1179 * initializing it (via the event_trigger_ops @init() function).
1180 * This is also where commands can use the @trigger_type value to
1181 * make the decision as to whether or not multiple instances of
1182 * the trigger should be allowed. This is usually implemented by
1183 * the generic utility function @register_trigger() (see
1184 * trace_event_triggers.c).
1185 *
1186 * @unreg: Removes the trigger from the list of triggers associated
1187 * with the event, and disables the event trigger itself, after
1188 * de-initializing it (via the event_trigger_ops @free() function).
1189 * This is usually implemented by the generic utility function
1190 * @unregister_trigger() (see trace_event_triggers.c).
1191 *
1192 * @set_filter: An optional function called to parse and set a filter
1193 * for the trigger. If no @set_filter() method is set for the
1194 * event command, filters set by the user for the command will be
1195 * ignored. This is usually implemented by the generic utility
1196 * function @set_trigger_filter() (see trace_event_triggers.c).
1197 *
1198 * @get_trigger_ops: The callback function invoked to retrieve the
1199 * event_trigger_ops implementation associated with the command.
1200 */
1201struct event_command {
1202 struct list_head list;
1203 char *name;
1204 enum event_trigger_type trigger_type;
1205 bool post_trigger;
1206 int (*func)(struct event_command *cmd_ops,
1207 struct ftrace_event_file *file,
1208 char *glob, char *cmd, char *params);
1209 int (*reg)(char *glob,
1210 struct event_trigger_ops *ops,
1211 struct event_trigger_data *data,
1212 struct ftrace_event_file *file);
1213 void (*unreg)(char *glob,
1214 struct event_trigger_ops *ops,
1215 struct event_trigger_data *data,
1216 struct ftrace_event_file *file);
1217 int (*set_filter)(char *filter_str,
1218 struct event_trigger_data *data,
1219 struct ftrace_event_file *file);
1220 struct event_trigger_ops *(*get_trigger_ops)(char *cmd, char *param);
1221};
1222
1223extern int trace_event_enable_disable(struct ftrace_event_file *file,
1224 int enable, int soft_disable);
1225extern int tracing_alloc_snapshot(void);
1226
1034extern const char *__start___trace_bprintk_fmt[]; 1227extern const char *__start___trace_bprintk_fmt[];
1035extern const char *__stop___trace_bprintk_fmt[]; 1228extern const char *__stop___trace_bprintk_fmt[];
1036 1229
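To make the long kerneldoc above concrete, here is a hedged sketch of how a hypothetical command could be wired together inside trace_events_trigger.c. Everything named mytrig_* is invented; event_trigger_init(), event_trigger_free(), event_trigger_print(), event_trigger_callback(), register_trigger(), unregister_trigger() and set_trigger_filter() are the generic utilities the comments refer to, and their exact signatures should be checked against that file.

/* The per-event action run when the trace event fires. */
static void mytrig_trigger(struct event_trigger_data *data)
{
	/* e.g. bump a counter, take a snapshot, ... */
}

static int mytrig_print(struct seq_file *m, struct event_trigger_ops *ops,
			struct event_trigger_data *data)
{
	return event_trigger_print("mytrig", m, NULL, data->filter_str);
}

static struct event_trigger_ops mytrig_ops = {
	.func	= mytrig_trigger,
	.print	= mytrig_print,
	.init	= event_trigger_init,	/* generic per-trigger refcounting */
	.free	= event_trigger_free,
};

static struct event_trigger_ops *
mytrig_get_trigger_ops(char *cmd, char *param)
{
	return &mytrig_ops;
}

static struct event_command trigger_mytrig_cmd = {
	.name			= "mytrig",
	.trigger_type		= ETT_NONE,	/* a real command adds its own bit
						 * to enum event_trigger_type */
	.func			= event_trigger_callback, /* parses "mytrig[:count][if ...]" */
	.reg			= register_trigger,
	.unreg			= unregister_trigger,
	.set_filter		= set_trigger_filter,
	.get_trigger_ops	= mytrig_get_trigger_ops,
};

A real command of this shape would then be hooked up at boot through register_trigger_cmds(), which the trace_events.c hunk below calls from event_trace_enable().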
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index a11800ae96de..7b16d40bd64d 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -27,12 +27,6 @@
27 27
28DEFINE_MUTEX(event_mutex); 28DEFINE_MUTEX(event_mutex);
29 29
30DEFINE_MUTEX(event_storage_mutex);
31EXPORT_SYMBOL_GPL(event_storage_mutex);
32
33char event_storage[EVENT_STORAGE_SIZE];
34EXPORT_SYMBOL_GPL(event_storage);
35
36LIST_HEAD(ftrace_events); 30LIST_HEAD(ftrace_events);
37static LIST_HEAD(ftrace_common_fields); 31static LIST_HEAD(ftrace_common_fields);
38 32
@@ -342,6 +336,12 @@ static int __ftrace_event_enable_disable(struct ftrace_event_file *file,
342 return ret; 336 return ret;
343} 337}
344 338
339int trace_event_enable_disable(struct ftrace_event_file *file,
340 int enable, int soft_disable)
341{
342 return __ftrace_event_enable_disable(file, enable, soft_disable);
343}
344
345static int ftrace_event_enable_disable(struct ftrace_event_file *file, 345static int ftrace_event_enable_disable(struct ftrace_event_file *file,
346 int enable) 346 int enable)
347{ 347{
@@ -421,11 +421,6 @@ static void remove_subsystem(struct ftrace_subsystem_dir *dir)
421 } 421 }
422} 422}
423 423
424static void *event_file_data(struct file *filp)
425{
426 return ACCESS_ONCE(file_inode(filp)->i_private);
427}
428
429static void remove_event_file_dir(struct ftrace_event_file *file) 424static void remove_event_file_dir(struct ftrace_event_file *file)
430{ 425{
431 struct dentry *dir = file->dir; 426 struct dentry *dir = file->dir;
@@ -1549,6 +1544,9 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file)
1549 trace_create_file("filter", 0644, file->dir, file, 1544 trace_create_file("filter", 0644, file->dir, file,
1550 &ftrace_event_filter_fops); 1545 &ftrace_event_filter_fops);
1551 1546
1547 trace_create_file("trigger", 0644, file->dir, file,
1548 &event_trigger_fops);
1549
1552 trace_create_file("format", 0444, file->dir, call, 1550 trace_create_file("format", 0444, file->dir, call,
1553 &ftrace_event_format_fops); 1551 &ftrace_event_format_fops);
1554 1552
@@ -1645,6 +1643,8 @@ trace_create_new_event(struct ftrace_event_call *call,
1645 file->event_call = call; 1643 file->event_call = call;
1646 file->tr = tr; 1644 file->tr = tr;
1647 atomic_set(&file->sm_ref, 0); 1645 atomic_set(&file->sm_ref, 0);
1646 atomic_set(&file->tm_ref, 0);
1647 INIT_LIST_HEAD(&file->triggers);
1648 list_add(&file->list, &tr->events); 1648 list_add(&file->list, &tr->events);
1649 1649
1650 return file; 1650 return file;
@@ -1771,6 +1771,16 @@ static void trace_module_add_events(struct module *mod)
1771{ 1771{
1772 struct ftrace_event_call **call, **start, **end; 1772 struct ftrace_event_call **call, **start, **end;
1773 1773
1774 if (!mod->num_trace_events)
1775 return;
1776
1777 /* Don't add infrastructure for mods without tracepoints */
1778 if (trace_module_has_bad_taint(mod)) {
1779 pr_err("%s: module has bad taint, not creating trace events\n",
1780 mod->name);
1781 return;
1782 }
1783
1774 start = mod->trace_events; 1784 start = mod->trace_events;
1775 end = mod->trace_events + mod->num_trace_events; 1785 end = mod->trace_events + mod->num_trace_events;
1776 1786
@@ -1849,20 +1859,7 @@ __trace_add_event_dirs(struct trace_array *tr)
1849 } 1859 }
1850} 1860}
1851 1861
1852#ifdef CONFIG_DYNAMIC_FTRACE 1862struct ftrace_event_file *
1853
1854/* Avoid typos */
1855#define ENABLE_EVENT_STR "enable_event"
1856#define DISABLE_EVENT_STR "disable_event"
1857
1858struct event_probe_data {
1859 struct ftrace_event_file *file;
1860 unsigned long count;
1861 int ref;
1862 bool enable;
1863};
1864
1865static struct ftrace_event_file *
1866find_event_file(struct trace_array *tr, const char *system, const char *event) 1863find_event_file(struct trace_array *tr, const char *system, const char *event)
1867{ 1864{
1868 struct ftrace_event_file *file; 1865 struct ftrace_event_file *file;
@@ -1885,6 +1882,19 @@ find_event_file(struct trace_array *tr, const char *system, const char *event)
1885 return NULL; 1882 return NULL;
1886} 1883}
1887 1884
1885#ifdef CONFIG_DYNAMIC_FTRACE
1886
1887/* Avoid typos */
1888#define ENABLE_EVENT_STR "enable_event"
1889#define DISABLE_EVENT_STR "disable_event"
1890
1891struct event_probe_data {
1892 struct ftrace_event_file *file;
1893 unsigned long count;
1894 int ref;
1895 bool enable;
1896};
1897
1888static void 1898static void
1889event_enable_probe(unsigned long ip, unsigned long parent_ip, void **_data) 1899event_enable_probe(unsigned long ip, unsigned long parent_ip, void **_data)
1890{ 1900{
@@ -2311,6 +2321,9 @@ int event_trace_del_tracer(struct trace_array *tr)
2311{ 2321{
2312 mutex_lock(&event_mutex); 2322 mutex_lock(&event_mutex);
2313 2323
2324 /* Disable any event triggers and associated soft-disabled events */
2325 clear_event_triggers(tr);
2326
2314 /* Disable any running events */ 2327 /* Disable any running events */
2315 __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0); 2328 __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0);
2316 2329
@@ -2377,6 +2390,8 @@ static __init int event_trace_enable(void)
2377 2390
2378 register_event_cmds(); 2391 register_event_cmds();
2379 2392
2393 register_trigger_cmds();
2394
2380 return 0; 2395 return 0;
2381} 2396}
2382 2397
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 2468f56dc5db..8a8631926a07 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -799,6 +799,11 @@ static void __free_filter(struct event_filter *filter)
799 kfree(filter); 799 kfree(filter);
800} 800}
801 801
802void free_event_filter(struct event_filter *filter)
803{
804 __free_filter(filter);
805}
806
802void destroy_call_preds(struct ftrace_event_call *call) 807void destroy_call_preds(struct ftrace_event_call *call)
803{ 808{
804 __free_filter(call->filter); 809 __free_filter(call->filter);
@@ -1938,6 +1943,13 @@ static int create_filter(struct ftrace_event_call *call,
1938 return err; 1943 return err;
1939} 1944}
1940 1945
1946int create_event_filter(struct ftrace_event_call *call,
1947 char *filter_str, bool set_str,
1948 struct event_filter **filterp)
1949{
1950 return create_filter(call, filter_str, set_str, filterp);
1951}
1952
1941/** 1953/**
1942 * create_system_filter - create a filter for an event_subsystem 1954 * create_system_filter - create a filter for an event_subsystem
1943 * @system: event_subsystem to create a filter for 1955 * @system: event_subsystem to create a filter for
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
new file mode 100644
index 000000000000..8efbb69b04f0
--- /dev/null
+++ b/kernel/trace/trace_events_trigger.c
@@ -0,0 +1,1437 @@
1/*
2 * trace_events_trigger - trace event triggers
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright (C) 2013 Tom Zanussi <tom.zanussi@linux.intel.com>
19 */
20
21#include <linux/module.h>
22#include <linux/ctype.h>
23#include <linux/mutex.h>
24#include <linux/slab.h>
25
26#include "trace.h"
27
28static LIST_HEAD(trigger_commands);
29static DEFINE_MUTEX(trigger_cmd_mutex);
30
31static void
32trigger_data_free(struct event_trigger_data *data)
33{
34 if (data->cmd_ops->set_filter)
35 data->cmd_ops->set_filter(NULL, data, NULL);
36
37 synchronize_sched(); /* make sure current triggers exit before free */
38 kfree(data);
39}
40
41/**
42 * event_triggers_call - Call triggers associated with a trace event
43 * @file: The ftrace_event_file associated with the event
44 * @rec: The trace entry for the event, NULL for unconditional invocation
45 *
46 * For each trigger associated with an event, invoke the trigger
47 * function registered with the associated trigger command. If rec is
48 * non-NULL, it means that the trigger requires further processing and
49 * shouldn't be unconditionally invoked. If rec is non-NULL and the
 50 * trigger has a filter associated with it, rec will be checked against
 51 * the filter and, if the record matches, the trigger will be invoked.
52 * If the trigger is a 'post_trigger', meaning it shouldn't be invoked
53 * in any case until the current event is written, the trigger
54 * function isn't invoked but the bit associated with the deferred
55 * trigger is set in the return value.
56 *
57 * Returns an enum event_trigger_type value containing a set bit for
58 * any trigger that should be deferred, ETT_NONE if nothing to defer.
59 *
60 * Called from tracepoint handlers (with rcu_read_lock_sched() held).
61 *
62 * Return: an enum event_trigger_type value containing a set bit for
63 * any trigger that should be deferred, ETT_NONE if nothing to defer.
64 */
65enum event_trigger_type
66event_triggers_call(struct ftrace_event_file *file, void *rec)
67{
68 struct event_trigger_data *data;
69 enum event_trigger_type tt = ETT_NONE;
70 struct event_filter *filter;
71
72 if (list_empty(&file->triggers))
73 return tt;
74
75 list_for_each_entry_rcu(data, &file->triggers, list) {
76 if (!rec) {
77 data->ops->func(data);
78 continue;
79 }
80 filter = rcu_dereference(data->filter);
81 if (filter && !filter_match_preds(filter, rec))
82 continue;
83 if (data->cmd_ops->post_trigger) {
84 tt |= data->cmd_ops->trigger_type;
85 continue;
86 }
87 data->ops->func(data);
88 }
89 return tt;
90}
91EXPORT_SYMBOL_GPL(event_triggers_call);
92
93/**
94 * event_triggers_post_call - Call 'post_triggers' for a trace event
95 * @file: The ftrace_event_file associated with the event
96 * @tt: enum event_trigger_type containing a set bit for each trigger to invoke
97 *
98 * For each trigger associated with an event, invoke the trigger
99 * function registered with the associated trigger command, if the
100 * corresponding bit is set in the tt enum passed into this function.
101 * See @event_triggers_call for details on how those bits are set.
102 *
103 * Called from tracepoint handlers (with rcu_read_lock_sched() held).
104 */
105void
106event_triggers_post_call(struct ftrace_event_file *file,
107 enum event_trigger_type tt)
108{
109 struct event_trigger_data *data;
110
111 list_for_each_entry_rcu(data, &file->triggers, list) {
112 if (data->cmd_ops->trigger_type & tt)
113 data->ops->func(data);
114 }
115}
116EXPORT_SYMBOL_GPL(event_triggers_post_call);
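
A minimal sketch, not part of this patch, of how a raw event handler is expected to drive the two hooks above: non-deferred triggers run before the record is written, and any bits returned are replayed through event_triggers_post_call() only after the record has been committed.

static void example_raw_event_handler(struct ftrace_event_file *ftrace_file,
                                      void *entry)
{
        enum event_trigger_type tt;

        /* non-deferred triggers (subject to their filters) fire here */
        tt = event_triggers_call(ftrace_file, entry);

        /* ... reserve, fill in and commit 'entry' to the ring buffer ... */

        /* deferred (post_trigger) commands, e.g. stacktrace, fire afterwards */
        if (tt != ETT_NONE)
                event_triggers_post_call(ftrace_file, tt);
}
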
117
118#define SHOW_AVAILABLE_TRIGGERS (void *)(1UL)
119
120static void *trigger_next(struct seq_file *m, void *t, loff_t *pos)
121{
122 struct ftrace_event_file *event_file = event_file_data(m->private);
123
124 if (t == SHOW_AVAILABLE_TRIGGERS)
125 return NULL;
126
127 return seq_list_next(t, &event_file->triggers, pos);
128}
129
130static void *trigger_start(struct seq_file *m, loff_t *pos)
131{
132 struct ftrace_event_file *event_file;
133
134 /* ->stop() is called even if ->start() fails */
135 mutex_lock(&event_mutex);
136 event_file = event_file_data(m->private);
137 if (unlikely(!event_file))
138 return ERR_PTR(-ENODEV);
139
140 if (list_empty(&event_file->triggers))
141 return *pos == 0 ? SHOW_AVAILABLE_TRIGGERS : NULL;
142
143 return seq_list_start(&event_file->triggers, *pos);
144}
145
146static void trigger_stop(struct seq_file *m, void *t)
147{
148 mutex_unlock(&event_mutex);
149}
150
151static int trigger_show(struct seq_file *m, void *v)
152{
153 struct event_trigger_data *data;
154 struct event_command *p;
155
156 if (v == SHOW_AVAILABLE_TRIGGERS) {
157 seq_puts(m, "# Available triggers:\n");
158 seq_putc(m, '#');
159 mutex_lock(&trigger_cmd_mutex);
160 list_for_each_entry_reverse(p, &trigger_commands, list)
161 seq_printf(m, " %s", p->name);
162 seq_putc(m, '\n');
163 mutex_unlock(&trigger_cmd_mutex);
164 return 0;
165 }
166
167 data = list_entry(v, struct event_trigger_data, list);
168 data->ops->print(m, data->ops, data);
169
170 return 0;
171}
172
173static const struct seq_operations event_triggers_seq_ops = {
174 .start = trigger_start,
175 .next = trigger_next,
176 .stop = trigger_stop,
177 .show = trigger_show,
178};
179
180static int event_trigger_regex_open(struct inode *inode, struct file *file)
181{
182 int ret = 0;
183
184 mutex_lock(&event_mutex);
185
186 if (unlikely(!event_file_data(file))) {
187 mutex_unlock(&event_mutex);
188 return -ENODEV;
189 }
190
191 if (file->f_mode & FMODE_READ) {
192 ret = seq_open(file, &event_triggers_seq_ops);
193 if (!ret) {
194 struct seq_file *m = file->private_data;
195 m->private = file;
196 }
197 }
198
199 mutex_unlock(&event_mutex);
200
201 return ret;
202}
203
204static int trigger_process_regex(struct ftrace_event_file *file, char *buff)
205{
206 char *command, *next = buff;
207 struct event_command *p;
208 int ret = -EINVAL;
209
210 command = strsep(&next, ": \t");
211 command = (command[0] != '!') ? command : command + 1;
212
213 mutex_lock(&trigger_cmd_mutex);
214 list_for_each_entry(p, &trigger_commands, list) {
215 if (strcmp(p->name, command) == 0) {
216 ret = p->func(p, file, buff, command, next);
217 goto out_unlock;
218 }
219 }
220 out_unlock:
221 mutex_unlock(&trigger_cmd_mutex);
222
223 return ret;
224}
225
226static ssize_t event_trigger_regex_write(struct file *file,
227 const char __user *ubuf,
228 size_t cnt, loff_t *ppos)
229{
230 struct ftrace_event_file *event_file;
231 ssize_t ret;
232 char *buf;
233
234 if (!cnt)
235 return 0;
236
237 if (cnt >= PAGE_SIZE)
238 return -EINVAL;
239
240 buf = (char *)__get_free_page(GFP_TEMPORARY);
241 if (!buf)
242 return -ENOMEM;
243
244 if (copy_from_user(buf, ubuf, cnt)) {
245 free_page((unsigned long)buf);
246 return -EFAULT;
247 }
248 buf[cnt] = '\0';
249 strim(buf);
250
251 mutex_lock(&event_mutex);
252 event_file = event_file_data(file);
253 if (unlikely(!event_file)) {
254 mutex_unlock(&event_mutex);
255 free_page((unsigned long)buf);
256 return -ENODEV;
257 }
258 ret = trigger_process_regex(event_file, buf);
259 mutex_unlock(&event_mutex);
260
261 free_page((unsigned long)buf);
262 if (ret < 0)
263 goto out;
264
265 *ppos += cnt;
266 ret = cnt;
267 out:
268 return ret;
269}
270
271static int event_trigger_regex_release(struct inode *inode, struct file *file)
272{
273 mutex_lock(&event_mutex);
274
275 if (file->f_mode & FMODE_READ)
276 seq_release(inode, file);
277
278 mutex_unlock(&event_mutex);
279
280 return 0;
281}
282
283static ssize_t
284event_trigger_write(struct file *filp, const char __user *ubuf,
285 size_t cnt, loff_t *ppos)
286{
287 return event_trigger_regex_write(filp, ubuf, cnt, ppos);
288}
289
290static int
291event_trigger_open(struct inode *inode, struct file *filp)
292{
293 return event_trigger_regex_open(inode, filp);
294}
295
296static int
297event_trigger_release(struct inode *inode, struct file *file)
298{
299 return event_trigger_regex_release(inode, file);
300}
301
302const struct file_operations event_trigger_fops = {
303 .open = event_trigger_open,
304 .read = seq_read,
305 .write = event_trigger_write,
306 .llseek = tracing_lseek,
307 .release = event_trigger_release,
308};
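
These file_operations back the per-event 'trigger' file created in event_create_dir() above. Purely as an illustration (the path and command are examples, not taken from this patch), userspace arms a trigger by writing a command string to that file; prefixing the command with '!' removes it, and reading the file lists the active triggers or, if none are set, the available commands.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        const char *path =
                "/sys/kernel/debug/tracing/events/sched/sched_switch/trigger";
        const char cmd[] = "traceoff:5 if prev_comm == \"bash\"";
        ssize_t len = (ssize_t)strlen(cmd);
        int fd = open(path, O_WRONLY);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        /* an unrecognized command makes trigger_process_regex() return -EINVAL */
        if (write(fd, cmd, len) != len)
                perror("write");
        close(fd);
        return 0;
}
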
309
310/*
311 * Currently we only register event commands from __init, so mark this
312 * __init too.
313 */
314static __init int register_event_command(struct event_command *cmd)
315{
316 struct event_command *p;
317 int ret = 0;
318
319 mutex_lock(&trigger_cmd_mutex);
320 list_for_each_entry(p, &trigger_commands, list) {
321 if (strcmp(cmd->name, p->name) == 0) {
322 ret = -EBUSY;
323 goto out_unlock;
324 }
325 }
326 list_add(&cmd->list, &trigger_commands);
327 out_unlock:
328 mutex_unlock(&trigger_cmd_mutex);
329
330 return ret;
331}
332
333/*
334 * Currently we only unregister event commands from __init, so mark
335 * this __init too.
336 */
337static __init int unregister_event_command(struct event_command *cmd)
338{
339 struct event_command *p, *n;
340 int ret = -ENODEV;
341
342 mutex_lock(&trigger_cmd_mutex);
343 list_for_each_entry_safe(p, n, &trigger_commands, list) {
344 if (strcmp(cmd->name, p->name) == 0) {
345 ret = 0;
346 list_del_init(&p->list);
347 goto out_unlock;
348 }
349 }
350 out_unlock:
351 mutex_unlock(&trigger_cmd_mutex);
352
353 return ret;
354}
355
356/**
357 * event_trigger_print - Generic event_trigger_ops @print implementation
358 * @name: The name of the event trigger
359 * @m: The seq_file being printed to
360 * @data: Trigger-specific data
361 * @filter_str: filter_str to print, if present
362 *
363 * Common implementation for event triggers to print themselves.
364 *
365 * Usually wrapped by a function that simply sets the @name of the
366 * trigger command and then invokes this.
367 *
368 * Return: 0 on success, errno otherwise
369 */
370static int
371event_trigger_print(const char *name, struct seq_file *m,
372 void *data, char *filter_str)
373{
374 long count = (long)data;
375
376 seq_printf(m, "%s", name);
377
378 if (count == -1)
379 seq_puts(m, ":unlimited");
380 else
381 seq_printf(m, ":count=%ld", count);
382
383 if (filter_str)
384 seq_printf(m, " if %s\n", filter_str);
385 else
386 seq_puts(m, "\n");
387
388 return 0;
389}
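
For instance (values illustrative), a trigger registered as 'traceoff:5 if prev_pid == 0' is rendered by this helper as:

        traceoff:count=5 if prev_pid == 0

and one registered without a count as:

        traceon:unlimited
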
390
391/**
392 * event_trigger_init - Generic event_trigger_ops @init implementation
393 * @ops: The trigger ops associated with the trigger
394 * @data: Trigger-specific data
395 *
396 * Common implementation of event trigger initialization.
397 *
398 * Usually used directly as the @init method in event trigger
399 * implementations.
400 *
401 * Return: 0 on success, errno otherwise
402 */
403static int
404event_trigger_init(struct event_trigger_ops *ops,
405 struct event_trigger_data *data)
406{
407 data->ref++;
408 return 0;
409}
410
411/**
412 * event_trigger_free - Generic event_trigger_ops @free implementation
413 * @ops: The trigger ops associated with the trigger
414 * @data: Trigger-specific data
415 *
416 * Common implementation of event trigger de-initialization.
417 *
418 * Usually used directly as the @free method in event trigger
419 * implementations.
420 */
421static void
422event_trigger_free(struct event_trigger_ops *ops,
423 struct event_trigger_data *data)
424{
425 if (WARN_ON_ONCE(data->ref <= 0))
426 return;
427
428 data->ref--;
429 if (!data->ref)
430 trigger_data_free(data);
431}
432
433static int trace_event_trigger_enable_disable(struct ftrace_event_file *file,
434 int trigger_enable)
435{
436 int ret = 0;
437
438 if (trigger_enable) {
439 if (atomic_inc_return(&file->tm_ref) > 1)
440 return ret;
441 set_bit(FTRACE_EVENT_FL_TRIGGER_MODE_BIT, &file->flags);
442 ret = trace_event_enable_disable(file, 1, 1);
443 } else {
444 if (atomic_dec_return(&file->tm_ref) > 0)
445 return ret;
446 clear_bit(FTRACE_EVENT_FL_TRIGGER_MODE_BIT, &file->flags);
447 ret = trace_event_enable_disable(file, 0, 1);
448 }
449
450 return ret;
451}
452
453/**
454 * clear_event_triggers - Clear all triggers associated with a trace array
455 * @tr: The trace array to clear
456 *
457 * For each trigger, the triggering event has its tm_ref decremented
458 * via trace_event_trigger_enable_disable(), and any associated event
459 * (in the case of enable/disable_event triggers) will have its sm_ref
460 * decremented via free()->trace_event_enable_disable(). That
461 * combination effectively reverses the soft-mode/trigger state added
462 * by trigger registration.
463 *
464 * Must be called with event_mutex held.
465 */
466void
467clear_event_triggers(struct trace_array *tr)
468{
469 struct ftrace_event_file *file;
470
471 list_for_each_entry(file, &tr->events, list) {
472 struct event_trigger_data *data;
473 list_for_each_entry_rcu(data, &file->triggers, list) {
474 trace_event_trigger_enable_disable(file, 0);
475 if (data->ops->free)
476 data->ops->free(data->ops, data);
477 }
478 }
479}
480
481/**
482 * update_cond_flag - Set or reset the TRIGGER_COND bit
483 * @file: The ftrace_event_file associated with the event
484 *
485 * If an event has triggers and any of those triggers has a filter or
486 * a post_trigger, trigger invocation needs to be deferred until after
487 * the current event has logged its data, and the event should have
488 * its TRIGGER_COND bit set, otherwise the TRIGGER_COND bit should be
489 * cleared.
490 */
491static void update_cond_flag(struct ftrace_event_file *file)
492{
493 struct event_trigger_data *data;
494 bool set_cond = false;
495
496 list_for_each_entry_rcu(data, &file->triggers, list) {
497 if (data->filter || data->cmd_ops->post_trigger) {
498 set_cond = true;
499 break;
500 }
501 }
502
503 if (set_cond)
504 set_bit(FTRACE_EVENT_FL_TRIGGER_COND_BIT, &file->flags);
505 else
506 clear_bit(FTRACE_EVENT_FL_TRIGGER_COND_BIT, &file->flags);
507}
508
509/**
510 * register_trigger - Generic event_command @reg implementation
511 * @glob: The raw string used to register the trigger
512 * @ops: The trigger ops associated with the trigger
513 * @data: Trigger-specific data to associate with the trigger
514 * @file: The ftrace_event_file associated with the event
515 *
516 * Common implementation for event trigger registration.
517 *
518 * Usually used directly as the @reg method in event command
519 * implementations.
520 *
521 * Return: 0 on success, errno otherwise
522 */
523static int register_trigger(char *glob, struct event_trigger_ops *ops,
524 struct event_trigger_data *data,
525 struct ftrace_event_file *file)
526{
527 struct event_trigger_data *test;
528 int ret = 0;
529
530 list_for_each_entry_rcu(test, &file->triggers, list) {
531 if (test->cmd_ops->trigger_type == data->cmd_ops->trigger_type) {
532 ret = -EEXIST;
533 goto out;
534 }
535 }
536
537 if (data->ops->init) {
538 ret = data->ops->init(data->ops, data);
539 if (ret < 0)
540 goto out;
541 }
542
543 list_add_rcu(&data->list, &file->triggers);
544 ret++;
545
546 if (trace_event_trigger_enable_disable(file, 1) < 0) {
547 list_del_rcu(&data->list);
548 ret--;
549 }
550 update_cond_flag(file);
551out:
552 return ret;
553}
554
555/**
556 * unregister_trigger - Generic event_command @unreg implementation
557 * @glob: The raw string used to register the trigger
558 * @ops: The trigger ops associated with the trigger
559 * @test: Trigger-specific data used to find the trigger to remove
560 * @file: The ftrace_event_file associated with the event
561 *
562 * Common implementation for event trigger unregistration.
563 *
564 * Usually used directly as the @unreg method in event command
565 * implementations.
566 */
567static void unregister_trigger(char *glob, struct event_trigger_ops *ops,
568 struct event_trigger_data *test,
569 struct ftrace_event_file *file)
570{
571 struct event_trigger_data *data;
572 bool unregistered = false;
573
574 list_for_each_entry_rcu(data, &file->triggers, list) {
575 if (data->cmd_ops->trigger_type == test->cmd_ops->trigger_type) {
576 unregistered = true;
577 list_del_rcu(&data->list);
578 update_cond_flag(file);
579 trace_event_trigger_enable_disable(file, 0);
580 break;
581 }
582 }
583
584 if (unregistered && data->ops->free)
585 data->ops->free(data->ops, data);
586}
587
588/**
589 * event_trigger_callback - Generic event_command @func implementation
590 * @cmd_ops: The command ops, used for trigger registration
591 * @file: The ftrace_event_file associated with the event
592 * @glob: The raw string used to register the trigger
593 * @cmd: The cmd portion of the string used to register the trigger
594 * @param: The params portion of the string used to register the trigger
595 *
596 * Common implementation for event command parsing and trigger
597 * instantiation.
598 *
599 * Usually used directly as the @func method in event command
600 * implementations.
601 *
602 * Return: 0 on success, errno otherwise
603 */
604static int
605event_trigger_callback(struct event_command *cmd_ops,
606 struct ftrace_event_file *file,
607 char *glob, char *cmd, char *param)
608{
609 struct event_trigger_data *trigger_data;
610 struct event_trigger_ops *trigger_ops;
611 char *trigger = NULL;
612 char *number;
613 int ret;
614
615 /* separate the trigger from the filter (t:n [if filter]) */
616 if (param && isdigit(param[0]))
617 trigger = strsep(&param, " \t");
618
619 trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger);
620
621 ret = -ENOMEM;
622 trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL);
623 if (!trigger_data)
624 goto out;
625
626 trigger_data->count = -1;
627 trigger_data->ops = trigger_ops;
628 trigger_data->cmd_ops = cmd_ops;
629 INIT_LIST_HEAD(&trigger_data->list);
630
631 if (glob[0] == '!') {
632 cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file);
633 kfree(trigger_data);
634 ret = 0;
635 goto out;
636 }
637
638 if (trigger) {
639 number = strsep(&trigger, ":");
640
641 ret = -EINVAL;
642 if (!strlen(number))
643 goto out_free;
644
645 /*
646 * We use the callback data field (which is a pointer)
647 * as our counter.
648 */
649 ret = kstrtoul(number, 0, &trigger_data->count);
650 if (ret)
651 goto out_free;
652 }
653
654 if (!param) /* if param is non-empty, it's supposed to be a filter */
655 goto out_reg;
656
657 if (!cmd_ops->set_filter)
658 goto out_reg;
659
660 ret = cmd_ops->set_filter(param, trigger_data, file);
661 if (ret < 0)
662 goto out_free;
663
664 out_reg:
665 ret = cmd_ops->reg(glob, trigger_ops, trigger_data, file);
666 /*
667 * The above returns on success the # of functions enabled,
668 * but if it didn't find any functions it returns zero.
669 * Consider no functions a failure too.
670 */
671 if (!ret) {
672 ret = -ENOENT;
673 goto out_free;
674 } else if (ret < 0)
675 goto out_free;
676 ret = 0;
677 out:
678 return ret;
679
680 out_free:
681 if (cmd_ops->set_filter)
682 cmd_ops->set_filter(NULL, trigger_data, NULL);
683 kfree(trigger_data);
684 goto out;
685}
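
To make the flow above concrete, a worked example (input values are illustrative, not from the patch) of what this callback sees for one written line:

/*
 * Line written to the 'trigger' file:  traceoff:5 if prev_pid == 0
 *
 * trigger_process_regex() has already split off the command name, so:
 *
 *      glob  == "traceoff"             ("!traceoff" when removing)
 *      cmd   == "traceoff"
 *      param == "5 if prev_pid == 0"
 *
 * then, in this function:
 *
 *      trigger = strsep(&param, " \t") ->  "5", param == "if prev_pid == 0"
 *      number  = strsep(&trigger, ":") ->  "5"  ->  trigger_data->count = 5
 *      cmd_ops->set_filter("if prev_pid == 0", ...) builds the event filter
 *      cmd_ops->reg(...) links the trigger into file->triggers
 */
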
686
687/**
688 * set_trigger_filter - Generic event_command @set_filter implementation
689 * @filter_str: The filter string for the trigger, NULL to remove filter
690 * @trigger_data: Trigger-specific data
691 * @file: The ftrace_event_file associated with the event
692 *
693 * Common implementation for event command filter parsing and filter
694 * instantiation.
695 *
696 * Usually used directly as the @set_filter method in event command
697 * implementations.
698 *
699 * Also used to remove a filter (if filter_str = NULL).
700 *
701 * Return: 0 on success, errno otherwise
702 */
703static int set_trigger_filter(char *filter_str,
704 struct event_trigger_data *trigger_data,
705 struct ftrace_event_file *file)
706{
707 struct event_trigger_data *data = trigger_data;
708 struct event_filter *filter = NULL, *tmp;
709 int ret = -EINVAL;
710 char *s;
711
712 if (!filter_str) /* clear the current filter */
713 goto assign;
714
715 s = strsep(&filter_str, " \t");
716
717 if (!strlen(s) || strcmp(s, "if") != 0)
718 goto out;
719
720 if (!filter_str)
721 goto out;
722
723 /* The filter is for the 'trigger' event, not the triggered event */
724 ret = create_event_filter(file->event_call, filter_str, false, &filter);
725 if (ret)
726 goto out;
727 assign:
728 tmp = rcu_access_pointer(data->filter);
729
730 rcu_assign_pointer(data->filter, filter);
731
732 if (tmp) {
733 /* Make sure the call is done with the filter */
734 synchronize_sched();
735 free_event_filter(tmp);
736 }
737
738 kfree(data->filter_str);
739 data->filter_str = NULL;
740
741 if (filter_str) {
742 data->filter_str = kstrdup(filter_str, GFP_KERNEL);
743 if (!data->filter_str) {
744 free_event_filter(rcu_access_pointer(data->filter));
745 data->filter = NULL;
746 ret = -ENOMEM;
747 }
748 }
749 out:
750 return ret;
751}
752
753static void
754traceon_trigger(struct event_trigger_data *data)
755{
756 if (tracing_is_on())
757 return;
758
759 tracing_on();
760}
761
762static void
763traceon_count_trigger(struct event_trigger_data *data)
764{
765 if (tracing_is_on())
766 return;
767
768 if (!data->count)
769 return;
770
771 if (data->count != -1)
772 (data->count)--;
773
774 tracing_on();
775}
776
777static void
778traceoff_trigger(struct event_trigger_data *data)
779{
780 if (!tracing_is_on())
781 return;
782
783 tracing_off();
784}
785
786static void
787traceoff_count_trigger(struct event_trigger_data *data)
788{
789 if (!tracing_is_on())
790 return;
791
792 if (!data->count)
793 return;
794
795 if (data->count != -1)
796 (data->count)--;
797
798 tracing_off();
799}
800
801static int
802traceon_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
803 struct event_trigger_data *data)
804{
805 return event_trigger_print("traceon", m, (void *)data->count,
806 data->filter_str);
807}
808
809static int
810traceoff_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
811 struct event_trigger_data *data)
812{
813 return event_trigger_print("traceoff", m, (void *)data->count,
814 data->filter_str);
815}
816
817static struct event_trigger_ops traceon_trigger_ops = {
818 .func = traceon_trigger,
819 .print = traceon_trigger_print,
820 .init = event_trigger_init,
821 .free = event_trigger_free,
822};
823
824static struct event_trigger_ops traceon_count_trigger_ops = {
825 .func = traceon_count_trigger,
826 .print = traceon_trigger_print,
827 .init = event_trigger_init,
828 .free = event_trigger_free,
829};
830
831static struct event_trigger_ops traceoff_trigger_ops = {
832 .func = traceoff_trigger,
833 .print = traceoff_trigger_print,
834 .init = event_trigger_init,
835 .free = event_trigger_free,
836};
837
838static struct event_trigger_ops traceoff_count_trigger_ops = {
839 .func = traceoff_count_trigger,
840 .print = traceoff_trigger_print,
841 .init = event_trigger_init,
842 .free = event_trigger_free,
843};
844
845static struct event_trigger_ops *
846onoff_get_trigger_ops(char *cmd, char *param)
847{
848 struct event_trigger_ops *ops;
849
850 /* we register both traceon and traceoff to this callback */
851 if (strcmp(cmd, "traceon") == 0)
852 ops = param ? &traceon_count_trigger_ops :
853 &traceon_trigger_ops;
854 else
855 ops = param ? &traceoff_count_trigger_ops :
856 &traceoff_trigger_ops;
857
858 return ops;
859}
860
861static struct event_command trigger_traceon_cmd = {
862 .name = "traceon",
863 .trigger_type = ETT_TRACE_ONOFF,
864 .func = event_trigger_callback,
865 .reg = register_trigger,
866 .unreg = unregister_trigger,
867 .get_trigger_ops = onoff_get_trigger_ops,
868 .set_filter = set_trigger_filter,
869};
870
871static struct event_command trigger_traceoff_cmd = {
872 .name = "traceoff",
873 .trigger_type = ETT_TRACE_ONOFF,
874 .func = event_trigger_callback,
875 .reg = register_trigger,
876 .unreg = unregister_trigger,
877 .get_trigger_ops = onoff_get_trigger_ops,
878 .set_filter = set_trigger_filter,
879};
880
881#ifdef CONFIG_TRACER_SNAPSHOT
882static void
883snapshot_trigger(struct event_trigger_data *data)
884{
885 tracing_snapshot();
886}
887
888static void
889snapshot_count_trigger(struct event_trigger_data *data)
890{
891 if (!data->count)
892 return;
893
894 if (data->count != -1)
895 (data->count)--;
896
897 snapshot_trigger(data);
898}
899
900static int
901register_snapshot_trigger(char *glob, struct event_trigger_ops *ops,
902 struct event_trigger_data *data,
903 struct ftrace_event_file *file)
904{
905 int ret = register_trigger(glob, ops, data, file);
906
907 if (ret > 0 && tracing_alloc_snapshot() != 0) {
908 unregister_trigger(glob, ops, data, file);
909 ret = 0;
910 }
911
912 return ret;
913}
914
915static int
916snapshot_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
917 struct event_trigger_data *data)
918{
919 return event_trigger_print("snapshot", m, (void *)data->count,
920 data->filter_str);
921}
922
923static struct event_trigger_ops snapshot_trigger_ops = {
924 .func = snapshot_trigger,
925 .print = snapshot_trigger_print,
926 .init = event_trigger_init,
927 .free = event_trigger_free,
928};
929
930static struct event_trigger_ops snapshot_count_trigger_ops = {
931 .func = snapshot_count_trigger,
932 .print = snapshot_trigger_print,
933 .init = event_trigger_init,
934 .free = event_trigger_free,
935};
936
937static struct event_trigger_ops *
938snapshot_get_trigger_ops(char *cmd, char *param)
939{
940 return param ? &snapshot_count_trigger_ops : &snapshot_trigger_ops;
941}
942
943static struct event_command trigger_snapshot_cmd = {
944 .name = "snapshot",
945 .trigger_type = ETT_SNAPSHOT,
946 .func = event_trigger_callback,
947 .reg = register_snapshot_trigger,
948 .unreg = unregister_trigger,
949 .get_trigger_ops = snapshot_get_trigger_ops,
950 .set_filter = set_trigger_filter,
951};
952
953static __init int register_trigger_snapshot_cmd(void)
954{
955 int ret;
956
957 ret = register_event_command(&trigger_snapshot_cmd);
958 WARN_ON(ret < 0);
959
960 return ret;
961}
962#else
963static __init int register_trigger_snapshot_cmd(void) { return 0; }
964#endif /* CONFIG_TRACER_SNAPSHOT */
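
An illustrative walk-through (the command shown is an example, not from this patch) of why register_snapshot_trigger() wraps the generic registration: the snapshot buffer must exist before the first snapshot trigger is accepted.

/*
 * echo 'snapshot:1 if prev_pid == 0' > events/sched/sched_switch/trigger
 *
 *      -> register_trigger() adds the trigger and returns > 0
 *      -> tracing_alloc_snapshot() allocates the snapshot buffer
 *      -> if that allocation fails, the trigger is unregistered again
 *         and 0 is returned, which event_trigger_callback() turns
 *         into -ENOENT
 */
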
965
966#ifdef CONFIG_STACKTRACE
967/*
968 * Skip 3:
969 * stacktrace_trigger()
970 * event_triggers_post_call()
971 * ftrace_raw_event_xxx()
972 */
973#define STACK_SKIP 3
974
975static void
976stacktrace_trigger(struct event_trigger_data *data)
977{
978 trace_dump_stack(STACK_SKIP);
979}
980
981static void
982stacktrace_count_trigger(struct event_trigger_data *data)
983{
984 if (!data->count)
985 return;
986
987 if (data->count != -1)
988 (data->count)--;
989
990 stacktrace_trigger(data);
991}
992
993static int
994stacktrace_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
995 struct event_trigger_data *data)
996{
997 return event_trigger_print("stacktrace", m, (void *)data->count,
998 data->filter_str);
999}
1000
1001static struct event_trigger_ops stacktrace_trigger_ops = {
1002 .func = stacktrace_trigger,
1003 .print = stacktrace_trigger_print,
1004 .init = event_trigger_init,
1005 .free = event_trigger_free,
1006};
1007
1008static struct event_trigger_ops stacktrace_count_trigger_ops = {
1009 .func = stacktrace_count_trigger,
1010 .print = stacktrace_trigger_print,
1011 .init = event_trigger_init,
1012 .free = event_trigger_free,
1013};
1014
1015static struct event_trigger_ops *
1016stacktrace_get_trigger_ops(char *cmd, char *param)
1017{
1018 return param ? &stacktrace_count_trigger_ops : &stacktrace_trigger_ops;
1019}
1020
1021static struct event_command trigger_stacktrace_cmd = {
1022 .name = "stacktrace",
1023 .trigger_type = ETT_STACKTRACE,
1024 .post_trigger = true,
1025 .func = event_trigger_callback,
1026 .reg = register_trigger,
1027 .unreg = unregister_trigger,
1028 .get_trigger_ops = stacktrace_get_trigger_ops,
1029 .set_filter = set_trigger_filter,
1030};
1031
1032static __init int register_trigger_stacktrace_cmd(void)
1033{
1034 int ret;
1035
1036 ret = register_event_command(&trigger_stacktrace_cmd);
1037 WARN_ON(ret < 0);
1038
1039 return ret;
1040}
1041#else
1042static __init int register_trigger_stacktrace_cmd(void) { return 0; }
1043#endif /* CONFIG_STACKTRACE */
1044
1045static __init void unregister_trigger_traceon_traceoff_cmds(void)
1046{
1047 unregister_event_command(&trigger_traceon_cmd);
1048 unregister_event_command(&trigger_traceoff_cmd);
1049}
1050
1051/* Avoid typos */
1052#define ENABLE_EVENT_STR "enable_event"
1053#define DISABLE_EVENT_STR "disable_event"
1054
1055struct enable_trigger_data {
1056 struct ftrace_event_file *file;
1057 bool enable;
1058};
1059
1060static void
1061event_enable_trigger(struct event_trigger_data *data)
1062{
1063 struct enable_trigger_data *enable_data = data->private_data;
1064
1065 if (enable_data->enable)
1066 clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &enable_data->file->flags);
1067 else
1068 set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &enable_data->file->flags);
1069}
1070
1071static void
1072event_enable_count_trigger(struct event_trigger_data *data)
1073{
1074 struct enable_trigger_data *enable_data = data->private_data;
1075
1076 if (!data->count)
1077 return;
1078
1079 /* Skip if the event is in a state we want to switch to */
1080 if (enable_data->enable == !(enable_data->file->flags & FTRACE_EVENT_FL_SOFT_DISABLED))
1081 return;
1082
1083 if (data->count != -1)
1084 (data->count)--;
1085
1086 event_enable_trigger(data);
1087}
1088
1089static int
1090event_enable_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
1091 struct event_trigger_data *data)
1092{
1093 struct enable_trigger_data *enable_data = data->private_data;
1094
1095 seq_printf(m, "%s:%s:%s",
1096 enable_data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR,
1097 enable_data->file->event_call->class->system,
1098 enable_data->file->event_call->name);
1099
1100 if (data->count == -1)
1101 seq_puts(m, ":unlimited");
1102 else
1103 seq_printf(m, ":count=%ld", data->count);
1104
1105 if (data->filter_str)
1106 seq_printf(m, " if %s\n", data->filter_str);
1107 else
1108 seq_puts(m, "\n");
1109
1110 return 0;
1111}
1112
1113static void
1114event_enable_trigger_free(struct event_trigger_ops *ops,
1115 struct event_trigger_data *data)
1116{
1117 struct enable_trigger_data *enable_data = data->private_data;
1118
1119 if (WARN_ON_ONCE(data->ref <= 0))
1120 return;
1121
1122 data->ref--;
1123 if (!data->ref) {
1124 /* Remove the SOFT_MODE flag */
1125 trace_event_enable_disable(enable_data->file, 0, 1);
1126 module_put(enable_data->file->event_call->mod);
1127 trigger_data_free(data);
1128 kfree(enable_data);
1129 }
1130}
1131
1132static struct event_trigger_ops event_enable_trigger_ops = {
1133 .func = event_enable_trigger,
1134 .print = event_enable_trigger_print,
1135 .init = event_trigger_init,
1136 .free = event_enable_trigger_free,
1137};
1138
1139static struct event_trigger_ops event_enable_count_trigger_ops = {
1140 .func = event_enable_count_trigger,
1141 .print = event_enable_trigger_print,
1142 .init = event_trigger_init,
1143 .free = event_enable_trigger_free,
1144};
1145
1146static struct event_trigger_ops event_disable_trigger_ops = {
1147 .func = event_enable_trigger,
1148 .print = event_enable_trigger_print,
1149 .init = event_trigger_init,
1150 .free = event_enable_trigger_free,
1151};
1152
1153static struct event_trigger_ops event_disable_count_trigger_ops = {
1154 .func = event_enable_count_trigger,
1155 .print = event_enable_trigger_print,
1156 .init = event_trigger_init,
1157 .free = event_enable_trigger_free,
1158};
1159
1160static int
1161event_enable_trigger_func(struct event_command *cmd_ops,
1162 struct ftrace_event_file *file,
1163 char *glob, char *cmd, char *param)
1164{
1165 struct ftrace_event_file *event_enable_file;
1166 struct enable_trigger_data *enable_data;
1167 struct event_trigger_data *trigger_data;
1168 struct event_trigger_ops *trigger_ops;
1169 struct trace_array *tr = file->tr;
1170 const char *system;
1171 const char *event;
1172 char *trigger;
1173 char *number;
1174 bool enable;
1175 int ret;
1176
1177 if (!param)
1178 return -EINVAL;
1179
1180 /* separate the trigger from the filter (s:e:n [if filter]) */
1181 trigger = strsep(&param, " \t");
1182 if (!trigger)
1183 return -EINVAL;
1184
1185 system = strsep(&trigger, ":");
1186 if (!trigger)
1187 return -EINVAL;
1188
1189 event = strsep(&trigger, ":");
1190
1191 ret = -EINVAL;
1192 event_enable_file = find_event_file(tr, system, event);
1193 if (!event_enable_file)
1194 goto out;
1195
1196 enable = strcmp(cmd, ENABLE_EVENT_STR) == 0;
1197
1198 trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger);
1199
1200 ret = -ENOMEM;
1201 trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL);
1202 if (!trigger_data)
1203 goto out;
1204
1205 enable_data = kzalloc(sizeof(*enable_data), GFP_KERNEL);
1206 if (!enable_data) {
1207 kfree(trigger_data);
1208 goto out;
1209 }
1210
1211 trigger_data->count = -1;
1212 trigger_data->ops = trigger_ops;
1213 trigger_data->cmd_ops = cmd_ops;
1214 INIT_LIST_HEAD(&trigger_data->list);
1215 RCU_INIT_POINTER(trigger_data->filter, NULL);
1216
1217 enable_data->enable = enable;
1218 enable_data->file = event_enable_file;
1219 trigger_data->private_data = enable_data;
1220
1221 if (glob[0] == '!') {
1222 cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file);
1223 kfree(trigger_data);
1224 kfree(enable_data);
1225 ret = 0;
1226 goto out;
1227 }
1228
1229 if (trigger) {
1230 number = strsep(&trigger, ":");
1231
1232 ret = -EINVAL;
1233 if (!strlen(number))
1234 goto out_free;
1235
1236 /*
1237 * We use the callback data field (which is a pointer)
1238 * as our counter.
1239 */
1240 ret = kstrtoul(number, 0, &trigger_data->count);
1241 if (ret)
1242 goto out_free;
1243 }
1244
1245 if (!param) /* if param is non-empty, it's supposed to be a filter */
1246 goto out_reg;
1247
1248 if (!cmd_ops->set_filter)
1249 goto out_reg;
1250
1251 ret = cmd_ops->set_filter(param, trigger_data, file);
1252 if (ret < 0)
1253 goto out_free;
1254
1255 out_reg:
1256 /* Don't let event modules unload while probe registered */
1257 ret = try_module_get(event_enable_file->event_call->mod);
1258 if (!ret) {
1259 ret = -EBUSY;
1260 goto out_free;
1261 }
1262
1263 ret = trace_event_enable_disable(event_enable_file, 1, 1);
1264 if (ret < 0)
1265 goto out_put;
1266 ret = cmd_ops->reg(glob, trigger_ops, trigger_data, file);
1267 /*
1268 * The above returns on success the # of functions enabled,
1269 * but if it didn't find any functions it returns zero.
1270 * Consider no functions a failure too.
1271 */
1272 if (!ret) {
1273 ret = -ENOENT;
1274 goto out_disable;
1275 } else if (ret < 0)
1276 goto out_disable;
1277 /* Just return zero, not the number of enabled functions */
1278 ret = 0;
1279 out:
1280 return ret;
1281
1282 out_disable:
1283 trace_event_enable_disable(event_enable_file, 0, 1);
1284 out_put:
1285 module_put(event_enable_file->event_call->mod);
1286 out_free:
1287 if (cmd_ops->set_filter)
1288 cmd_ops->set_filter(NULL, trigger_data, NULL);
1289 kfree(trigger_data);
1290 kfree(enable_data);
1291 goto out;
1292}
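
A worked example (illustrative) of the parameter parsing for the enable_event/disable_event form handled here:

/*
 * Line written to the 'trigger' file:  enable_event:sched:sched_wakeup:2
 *
 * trigger_process_regex() strips the command, leaving:
 *
 *      cmd     == "enable_event"       ->  enable = true
 *      param   == "sched:sched_wakeup:2"
 *
 *      trigger = strsep(&param, " \t") ->  "sched:sched_wakeup:2", param == NULL
 *      system  = strsep(&trigger, ":") ->  "sched"
 *      event   = strsep(&trigger, ":") ->  "sched_wakeup"
 *      number  = strsep(&trigger, ":") ->  "2"  ->  trigger_data->count = 2
 *
 * find_event_file() resolves sched:sched_wakeup, whose SOFT_DISABLED
 * bit is then cleared (enable) or set (disable) each time the
 * triggering event fires, up to 'count' times.
 */
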
1293
1294static int event_enable_register_trigger(char *glob,
1295 struct event_trigger_ops *ops,
1296 struct event_trigger_data *data,
1297 struct ftrace_event_file *file)
1298{
1299 struct enable_trigger_data *enable_data = data->private_data;
1300 struct enable_trigger_data *test_enable_data;
1301 struct event_trigger_data *test;
1302 int ret = 0;
1303
1304 list_for_each_entry_rcu(test, &file->triggers, list) {
1305 test_enable_data = test->private_data;
1306 if (test_enable_data &&
1307 (test_enable_data->file == enable_data->file)) {
1308 ret = -EEXIST;
1309 goto out;
1310 }
1311 }
1312
1313 if (data->ops->init) {
1314 ret = data->ops->init(data->ops, data);
1315 if (ret < 0)
1316 goto out;
1317 }
1318
1319 list_add_rcu(&data->list, &file->triggers);
1320 ret++;
1321
1322 if (trace_event_trigger_enable_disable(file, 1) < 0) {
1323 list_del_rcu(&data->list);
1324 ret--;
1325 }
1326 update_cond_flag(file);
1327out:
1328 return ret;
1329}
1330
1331static void event_enable_unregister_trigger(char *glob,
1332 struct event_trigger_ops *ops,
1333 struct event_trigger_data *test,
1334 struct ftrace_event_file *file)
1335{
1336 struct enable_trigger_data *test_enable_data = test->private_data;
1337 struct enable_trigger_data *enable_data;
1338 struct event_trigger_data *data;
1339 bool unregistered = false;
1340
1341 list_for_each_entry_rcu(data, &file->triggers, list) {
1342 enable_data = data->private_data;
1343 if (enable_data &&
1344 (enable_data->file == test_enable_data->file)) {
1345 unregistered = true;
1346 list_del_rcu(&data->list);
1347 update_cond_flag(file);
1348 trace_event_trigger_enable_disable(file, 0);
1349 break;
1350 }
1351 }
1352
1353 if (unregistered && data->ops->free)
1354 data->ops->free(data->ops, data);
1355}
1356
1357static struct event_trigger_ops *
1358event_enable_get_trigger_ops(char *cmd, char *param)
1359{
1360 struct event_trigger_ops *ops;
1361 bool enable;
1362
1363 enable = strcmp(cmd, ENABLE_EVENT_STR) == 0;
1364
1365 if (enable)
1366 ops = param ? &event_enable_count_trigger_ops :
1367 &event_enable_trigger_ops;
1368 else
1369 ops = param ? &event_disable_count_trigger_ops :
1370 &event_disable_trigger_ops;
1371
1372 return ops;
1373}
1374
1375static struct event_command trigger_enable_cmd = {
1376 .name = ENABLE_EVENT_STR,
1377 .trigger_type = ETT_EVENT_ENABLE,
1378 .func = event_enable_trigger_func,
1379 .reg = event_enable_register_trigger,
1380 .unreg = event_enable_unregister_trigger,
1381 .get_trigger_ops = event_enable_get_trigger_ops,
1382 .set_filter = set_trigger_filter,
1383};
1384
1385static struct event_command trigger_disable_cmd = {
1386 .name = DISABLE_EVENT_STR,
1387 .trigger_type = ETT_EVENT_ENABLE,
1388 .func = event_enable_trigger_func,
1389 .reg = event_enable_register_trigger,
1390 .unreg = event_enable_unregister_trigger,
1391 .get_trigger_ops = event_enable_get_trigger_ops,
1392 .set_filter = set_trigger_filter,
1393};
1394
1395static __init void unregister_trigger_enable_disable_cmds(void)
1396{
1397 unregister_event_command(&trigger_enable_cmd);
1398 unregister_event_command(&trigger_disable_cmd);
1399}
1400
1401static __init int register_trigger_enable_disable_cmds(void)
1402{
1403 int ret;
1404
1405 ret = register_event_command(&trigger_enable_cmd);
1406 if (WARN_ON(ret < 0))
1407 return ret;
1408 ret = register_event_command(&trigger_disable_cmd);
1409 if (WARN_ON(ret < 0))
1410 unregister_trigger_enable_disable_cmds();
1411
1412 return ret;
1413}
1414
1415static __init int register_trigger_traceon_traceoff_cmds(void)
1416{
1417 int ret;
1418
1419 ret = register_event_command(&trigger_traceon_cmd);
1420 if (WARN_ON(ret < 0))
1421 return ret;
1422 ret = register_event_command(&trigger_traceoff_cmd);
1423 if (WARN_ON(ret < 0))
1424 unregister_trigger_traceon_traceoff_cmds();
1425
1426 return ret;
1427}
1428
1429__init int register_trigger_cmds(void)
1430{
1431 register_trigger_traceon_traceoff_cmds();
1432 register_trigger_snapshot_cmd();
1433 register_trigger_stacktrace_cmd();
1434 register_trigger_enable_disable_cmds();
1435
1436 return 0;
1437}
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 7c3e3e72e2b6..ee0a5098ac43 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -95,15 +95,12 @@ static void __always_unused ____ftrace_check_##name(void) \
95#undef __array 95#undef __array
96#define __array(type, item, len) \ 96#define __array(type, item, len) \
97 do { \ 97 do { \
98 char *type_str = #type"["__stringify(len)"]"; \
98 BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ 99 BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \
99 mutex_lock(&event_storage_mutex); \ 100 ret = trace_define_field(event_call, type_str, #item, \
100 snprintf(event_storage, sizeof(event_storage), \
101 "%s[%d]", #type, len); \
102 ret = trace_define_field(event_call, event_storage, #item, \
103 offsetof(typeof(field), item), \ 101 offsetof(typeof(field), item), \
104 sizeof(field.item), \ 102 sizeof(field.item), \
105 is_signed_type(type), filter_type); \ 103 is_signed_type(type), filter_type); \
106 mutex_unlock(&event_storage_mutex); \
107 if (ret) \ 104 if (ret) \
108 return ret; \ 105 return ret; \
109 } while (0); 106 } while (0);
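
A small illustration, using a made-up field, of the string the new type_str line produces; the old code built the same string into the shared event_storage buffer under event_storage_mutex at runtime, which this change makes unnecessary:

/*
 * For a hypothetical field declared as
 *
 *      __array(char, buf, 16)
 *
 * the macro now expands to
 *
 *      char *type_str = "char" "[" "16" "]";   ->  "char[16]"
 *
 * i.e. a string literal local to the generated function, passed
 * straight to trace_define_field().
 */
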
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index dae9541ada9e..bdbae450c13e 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -27,18 +27,12 @@
27/** 27/**
28 * Kprobe event core functions 28 * Kprobe event core functions
29 */ 29 */
30struct trace_probe { 30struct trace_kprobe {
31 struct list_head list; 31 struct list_head list;
32 struct kretprobe rp; /* Use rp.kp for kprobe use */ 32 struct kretprobe rp; /* Use rp.kp for kprobe use */
33 unsigned long nhit; 33 unsigned long nhit;
34 unsigned int flags; /* For TP_FLAG_* */
35 const char *symbol; /* symbol name */ 34 const char *symbol; /* symbol name */
36 struct ftrace_event_class class; 35 struct trace_probe tp;
37 struct ftrace_event_call call;
38 struct list_head files;
39 ssize_t size; /* trace entry size */
40 unsigned int nr_args;
41 struct probe_arg args[];
42}; 36};
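
The members removed here are not lost: they move into a shared struct trace_probe that this file now embeds as 'tp' (and that the uprobe code reuses). A sketch of that shared container, with member names inferred from the tk->tp.* accesses below; the authoritative definition lives in trace_probe.h:

struct trace_probe {
        unsigned int                    flags;          /* TP_FLAG_* */
        struct ftrace_event_class       class;
        struct ftrace_event_call        call;
        struct list_head                files;          /* event_file_link list */
        ssize_t                         size;           /* trace entry size */
        unsigned int                    nr_args;
        struct probe_arg                args[];
};
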
43 37
44struct event_file_link { 38struct event_file_link {
@@ -46,56 +40,46 @@ struct event_file_link {
46 struct list_head list; 40 struct list_head list;
47}; 41};
48 42
49#define SIZEOF_TRACE_PROBE(n) \ 43#define SIZEOF_TRACE_KPROBE(n) \
50 (offsetof(struct trace_probe, args) + \ 44 (offsetof(struct trace_kprobe, tp.args) + \
51 (sizeof(struct probe_arg) * (n))) 45 (sizeof(struct probe_arg) * (n)))
52 46
53 47
54static __kprobes bool trace_probe_is_return(struct trace_probe *tp) 48static __kprobes bool trace_kprobe_is_return(struct trace_kprobe *tk)
55{ 49{
56 return tp->rp.handler != NULL; 50 return tk->rp.handler != NULL;
57} 51}
58 52
59static __kprobes const char *trace_probe_symbol(struct trace_probe *tp) 53static __kprobes const char *trace_kprobe_symbol(struct trace_kprobe *tk)
60{ 54{
61 return tp->symbol ? tp->symbol : "unknown"; 55 return tk->symbol ? tk->symbol : "unknown";
62} 56}
63 57
64static __kprobes unsigned long trace_probe_offset(struct trace_probe *tp) 58static __kprobes unsigned long trace_kprobe_offset(struct trace_kprobe *tk)
65{ 59{
66 return tp->rp.kp.offset; 60 return tk->rp.kp.offset;
67} 61}
68 62
69static __kprobes bool trace_probe_is_enabled(struct trace_probe *tp) 63static __kprobes bool trace_kprobe_has_gone(struct trace_kprobe *tk)
70{ 64{
71 return !!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE)); 65 return !!(kprobe_gone(&tk->rp.kp));
72} 66}
73 67
74static __kprobes bool trace_probe_is_registered(struct trace_probe *tp) 68static __kprobes bool trace_kprobe_within_module(struct trace_kprobe *tk,
75{ 69 struct module *mod)
76 return !!(tp->flags & TP_FLAG_REGISTERED);
77}
78
79static __kprobes bool trace_probe_has_gone(struct trace_probe *tp)
80{
81 return !!(kprobe_gone(&tp->rp.kp));
82}
83
84static __kprobes bool trace_probe_within_module(struct trace_probe *tp,
85 struct module *mod)
86{ 70{
87 int len = strlen(mod->name); 71 int len = strlen(mod->name);
88 const char *name = trace_probe_symbol(tp); 72 const char *name = trace_kprobe_symbol(tk);
89 return strncmp(mod->name, name, len) == 0 && name[len] == ':'; 73 return strncmp(mod->name, name, len) == 0 && name[len] == ':';
90} 74}
91 75
92static __kprobes bool trace_probe_is_on_module(struct trace_probe *tp) 76static __kprobes bool trace_kprobe_is_on_module(struct trace_kprobe *tk)
93{ 77{
94 return !!strchr(trace_probe_symbol(tp), ':'); 78 return !!strchr(trace_kprobe_symbol(tk), ':');
95} 79}
96 80
97static int register_probe_event(struct trace_probe *tp); 81static int register_kprobe_event(struct trace_kprobe *tk);
98static int unregister_probe_event(struct trace_probe *tp); 82static int unregister_kprobe_event(struct trace_kprobe *tk);
99 83
100static DEFINE_MUTEX(probe_lock); 84static DEFINE_MUTEX(probe_lock);
101static LIST_HEAD(probe_list); 85static LIST_HEAD(probe_list);
@@ -104,45 +88,224 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs);
104static int kretprobe_dispatcher(struct kretprobe_instance *ri, 88static int kretprobe_dispatcher(struct kretprobe_instance *ri,
105 struct pt_regs *regs); 89 struct pt_regs *regs);
106 90
91/* Memory fetching by symbol */
92struct symbol_cache {
93 char *symbol;
94 long offset;
95 unsigned long addr;
96};
97
98unsigned long update_symbol_cache(struct symbol_cache *sc)
99{
100 sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol);
101
102 if (sc->addr)
103 sc->addr += sc->offset;
104
105 return sc->addr;
106}
107
108void free_symbol_cache(struct symbol_cache *sc)
109{
110 kfree(sc->symbol);
111 kfree(sc);
112}
113
114struct symbol_cache *alloc_symbol_cache(const char *sym, long offset)
115{
116 struct symbol_cache *sc;
117
118 if (!sym || strlen(sym) == 0)
119 return NULL;
120
121 sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL);
122 if (!sc)
123 return NULL;
124
125 sc->symbol = kstrdup(sym, GFP_KERNEL);
126 if (!sc->symbol) {
127 kfree(sc);
128 return NULL;
129 }
130 sc->offset = offset;
131 update_symbol_cache(sc);
132
133 return sc;
134}
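
A minimal usage sketch (illustrative only) of the symbol-cache helpers above, roughly as an @SYMBOL[+offset] probe argument would use them:

static int example_symbol_cache_usage(void)
{
        struct symbol_cache *sc;

        sc = alloc_symbol_cache("jiffies", 0); /* looked up via kallsyms now */
        if (!sc)
                return -ENOMEM;

        /* re-resolve later, e.g. once a module providing the symbol loads */
        if (!update_symbol_cache(sc))
                pr_debug("symbol not currently resolvable\n");

        free_symbol_cache(sc);
        return 0;
}
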
135
136/*
137 * Kprobes-specific fetch functions
138 */
139#define DEFINE_FETCH_stack(type) \
140static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\
141 void *offset, void *dest) \
142{ \
143 *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \
144 (unsigned int)((unsigned long)offset)); \
145}
146DEFINE_BASIC_FETCH_FUNCS(stack)
147/* No string on the stack entry */
148#define fetch_stack_string NULL
149#define fetch_stack_string_size NULL
150
151#define DEFINE_FETCH_memory(type) \
152static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\
153 void *addr, void *dest) \
154{ \
155 type retval; \
156 if (probe_kernel_address(addr, retval)) \
157 *(type *)dest = 0; \
158 else \
159 *(type *)dest = retval; \
160}
161DEFINE_BASIC_FETCH_FUNCS(memory)
162/*
163 * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max
164 * length and relative data location.
165 */
166static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
167 void *addr, void *dest)
168{
169 long ret;
170 int maxlen = get_rloc_len(*(u32 *)dest);
171 u8 *dst = get_rloc_data(dest);
172 u8 *src = addr;
173 mm_segment_t old_fs = get_fs();
174
175 if (!maxlen)
176 return;
177
178 /*
179 * Try to get string again, since the string can be changed while
180 * probing.
181 */
182 set_fs(KERNEL_DS);
183 pagefault_disable();
184
185 do
186 ret = __copy_from_user_inatomic(dst++, src++, 1);
187 while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen);
188
189 dst[-1] = '\0';
190 pagefault_enable();
191 set_fs(old_fs);
192
193 if (ret < 0) { /* Failed to fetch string */
194 ((u8 *)get_rloc_data(dest))[0] = '\0';
195 *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest));
196 } else {
197 *(u32 *)dest = make_data_rloc(src - (u8 *)addr,
198 get_rloc_offs(*(u32 *)dest));
199 }
200}
201
 202/* Return the length of the string -- including the terminating null byte */
203static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,
204 void *addr, void *dest)
205{
206 mm_segment_t old_fs;
207 int ret, len = 0;
208 u8 c;
209
210 old_fs = get_fs();
211 set_fs(KERNEL_DS);
212 pagefault_disable();
213
214 do {
215 ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1);
216 len++;
217 } while (c && ret == 0 && len < MAX_STRING_SIZE);
218
219 pagefault_enable();
220 set_fs(old_fs);
221
222 if (ret < 0) /* Failed to check the length */
223 *(u32 *)dest = 0;
224 else
225 *(u32 *)dest = len;
226}
227
228#define DEFINE_FETCH_symbol(type) \
229__kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs, \
230 void *data, void *dest) \
231{ \
232 struct symbol_cache *sc = data; \
233 if (sc->addr) \
234 fetch_memory_##type(regs, (void *)sc->addr, dest); \
235 else \
236 *(type *)dest = 0; \
237}
238DEFINE_BASIC_FETCH_FUNCS(symbol)
239DEFINE_FETCH_symbol(string)
240DEFINE_FETCH_symbol(string_size)
241
242/* kprobes don't support file_offset fetch methods */
243#define fetch_file_offset_u8 NULL
244#define fetch_file_offset_u16 NULL
245#define fetch_file_offset_u32 NULL
246#define fetch_file_offset_u64 NULL
247#define fetch_file_offset_string NULL
248#define fetch_file_offset_string_size NULL
249
250/* Fetch type information table */
251const struct fetch_type kprobes_fetch_type_table[] = {
252 /* Special types */
253 [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string,
254 sizeof(u32), 1, "__data_loc char[]"),
255 [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32,
256 string_size, sizeof(u32), 0, "u32"),
257 /* Basic types */
258 ASSIGN_FETCH_TYPE(u8, u8, 0),
259 ASSIGN_FETCH_TYPE(u16, u16, 0),
260 ASSIGN_FETCH_TYPE(u32, u32, 0),
261 ASSIGN_FETCH_TYPE(u64, u64, 0),
262 ASSIGN_FETCH_TYPE(s8, u8, 1),
263 ASSIGN_FETCH_TYPE(s16, u16, 1),
264 ASSIGN_FETCH_TYPE(s32, u32, 1),
265 ASSIGN_FETCH_TYPE(s64, u64, 1),
266
267 ASSIGN_FETCH_TYPE_END
268};
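
For orientation only (the argument syntax below is the existing kprobe_events format with placeholder names, not something introduced here), the fetch methods above map onto probe arguments roughly as follows:

/*
 * p:myprobe SYMBOL arg1=%ax str=+0(%si):string st0=$stack0 j=@jiffies
 *
 *      %ax             register fetch (generic code shared via trace_probe.h)
 *      $stack0         FETCH_FUNC_NAME(stack, ...) above
 *      +0(%si):string  dereference plus fetch_memory_string() above
 *      @jiffies        FETCH_FUNC_NAME(symbol, ...) backed by struct symbol_cache
 *
 * kprobes_fetch_type_table then supplies the per-type variants
 * (u8..u64, s8..s64, string, string_size) for each method, while the
 * file_offset method is explicitly stubbed out as unsupported.
 */
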
269
107/* 270/*
108 * Allocate new trace_probe and initialize it (including kprobes). 271 * Allocate new trace_probe and initialize it (including kprobes).
109 */ 272 */
110static struct trace_probe *alloc_trace_probe(const char *group, 273static struct trace_kprobe *alloc_trace_kprobe(const char *group,
111 const char *event, 274 const char *event,
112 void *addr, 275 void *addr,
113 const char *symbol, 276 const char *symbol,
114 unsigned long offs, 277 unsigned long offs,
115 int nargs, bool is_return) 278 int nargs, bool is_return)
116{ 279{
117 struct trace_probe *tp; 280 struct trace_kprobe *tk;
118 int ret = -ENOMEM; 281 int ret = -ENOMEM;
119 282
120 tp = kzalloc(SIZEOF_TRACE_PROBE(nargs), GFP_KERNEL); 283 tk = kzalloc(SIZEOF_TRACE_KPROBE(nargs), GFP_KERNEL);
121 if (!tp) 284 if (!tk)
122 return ERR_PTR(ret); 285 return ERR_PTR(ret);
123 286
124 if (symbol) { 287 if (symbol) {
125 tp->symbol = kstrdup(symbol, GFP_KERNEL); 288 tk->symbol = kstrdup(symbol, GFP_KERNEL);
126 if (!tp->symbol) 289 if (!tk->symbol)
127 goto error; 290 goto error;
128 tp->rp.kp.symbol_name = tp->symbol; 291 tk->rp.kp.symbol_name = tk->symbol;
129 tp->rp.kp.offset = offs; 292 tk->rp.kp.offset = offs;
130 } else 293 } else
131 tp->rp.kp.addr = addr; 294 tk->rp.kp.addr = addr;
132 295
133 if (is_return) 296 if (is_return)
134 tp->rp.handler = kretprobe_dispatcher; 297 tk->rp.handler = kretprobe_dispatcher;
135 else 298 else
136 tp->rp.kp.pre_handler = kprobe_dispatcher; 299 tk->rp.kp.pre_handler = kprobe_dispatcher;
137 300
138 if (!event || !is_good_name(event)) { 301 if (!event || !is_good_name(event)) {
139 ret = -EINVAL; 302 ret = -EINVAL;
140 goto error; 303 goto error;
141 } 304 }
142 305
143 tp->call.class = &tp->class; 306 tk->tp.call.class = &tk->tp.class;
144 tp->call.name = kstrdup(event, GFP_KERNEL); 307 tk->tp.call.name = kstrdup(event, GFP_KERNEL);
145 if (!tp->call.name) 308 if (!tk->tp.call.name)
146 goto error; 309 goto error;
147 310
148 if (!group || !is_good_name(group)) { 311 if (!group || !is_good_name(group)) {
@@ -150,42 +313,42 @@ static struct trace_probe *alloc_trace_probe(const char *group,
150 goto error; 313 goto error;
151 } 314 }
152 315
153 tp->class.system = kstrdup(group, GFP_KERNEL); 316 tk->tp.class.system = kstrdup(group, GFP_KERNEL);
154 if (!tp->class.system) 317 if (!tk->tp.class.system)
155 goto error; 318 goto error;
156 319
157 INIT_LIST_HEAD(&tp->list); 320 INIT_LIST_HEAD(&tk->list);
158 INIT_LIST_HEAD(&tp->files); 321 INIT_LIST_HEAD(&tk->tp.files);
159 return tp; 322 return tk;
160error: 323error:
161 kfree(tp->call.name); 324 kfree(tk->tp.call.name);
162 kfree(tp->symbol); 325 kfree(tk->symbol);
163 kfree(tp); 326 kfree(tk);
164 return ERR_PTR(ret); 327 return ERR_PTR(ret);
165} 328}
166 329
167static void free_trace_probe(struct trace_probe *tp) 330static void free_trace_kprobe(struct trace_kprobe *tk)
168{ 331{
169 int i; 332 int i;
170 333
171 for (i = 0; i < tp->nr_args; i++) 334 for (i = 0; i < tk->tp.nr_args; i++)
172 traceprobe_free_probe_arg(&tp->args[i]); 335 traceprobe_free_probe_arg(&tk->tp.args[i]);
173 336
174 kfree(tp->call.class->system); 337 kfree(tk->tp.call.class->system);
175 kfree(tp->call.name); 338 kfree(tk->tp.call.name);
176 kfree(tp->symbol); 339 kfree(tk->symbol);
177 kfree(tp); 340 kfree(tk);
178} 341}
179 342
180static struct trace_probe *find_trace_probe(const char *event, 343static struct trace_kprobe *find_trace_kprobe(const char *event,
181 const char *group) 344 const char *group)
182{ 345{
183 struct trace_probe *tp; 346 struct trace_kprobe *tk;
184 347
185 list_for_each_entry(tp, &probe_list, list) 348 list_for_each_entry(tk, &probe_list, list)
186 if (strcmp(tp->call.name, event) == 0 && 349 if (strcmp(tk->tp.call.name, event) == 0 &&
187 strcmp(tp->call.class->system, group) == 0) 350 strcmp(tk->tp.call.class->system, group) == 0)
188 return tp; 351 return tk;
189 return NULL; 352 return NULL;
190} 353}
191 354
@@ -194,7 +357,7 @@ static struct trace_probe *find_trace_probe(const char *event,
194 * if the file is NULL, enable "perf" handler, or enable "trace" handler. 357 * if the file is NULL, enable "perf" handler, or enable "trace" handler.
195 */ 358 */
196static int 359static int
197enable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) 360enable_trace_kprobe(struct trace_kprobe *tk, struct ftrace_event_file *file)
198{ 361{
199 int ret = 0; 362 int ret = 0;
200 363
@@ -208,17 +371,17 @@ enable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file)
208 } 371 }
209 372
210 link->file = file; 373 link->file = file;
211 list_add_tail_rcu(&link->list, &tp->files); 374 list_add_tail_rcu(&link->list, &tk->tp.files);
212 375
213 tp->flags |= TP_FLAG_TRACE; 376 tk->tp.flags |= TP_FLAG_TRACE;
214 } else 377 } else
215 tp->flags |= TP_FLAG_PROFILE; 378 tk->tp.flags |= TP_FLAG_PROFILE;
216 379
217 if (trace_probe_is_registered(tp) && !trace_probe_has_gone(tp)) { 380 if (trace_probe_is_registered(&tk->tp) && !trace_kprobe_has_gone(tk)) {
218 if (trace_probe_is_return(tp)) 381 if (trace_kprobe_is_return(tk))
219 ret = enable_kretprobe(&tp->rp); 382 ret = enable_kretprobe(&tk->rp);
220 else 383 else
221 ret = enable_kprobe(&tp->rp.kp); 384 ret = enable_kprobe(&tk->rp.kp);
222 } 385 }
223 out: 386 out:
224 return ret; 387 return ret;
@@ -241,14 +404,14 @@ find_event_file_link(struct trace_probe *tp, struct ftrace_event_file *file)
241 * if the file is NULL, disable "perf" handler, or disable "trace" handler. 404 * if the file is NULL, disable "perf" handler, or disable "trace" handler.
242 */ 405 */
243static int 406static int
244disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) 407disable_trace_kprobe(struct trace_kprobe *tk, struct ftrace_event_file *file)
245{ 408{
246 struct event_file_link *link = NULL; 409 struct event_file_link *link = NULL;
247 int wait = 0; 410 int wait = 0;
248 int ret = 0; 411 int ret = 0;
249 412
250 if (file) { 413 if (file) {
251 link = find_event_file_link(tp, file); 414 link = find_event_file_link(&tk->tp, file);
252 if (!link) { 415 if (!link) {
253 ret = -EINVAL; 416 ret = -EINVAL;
254 goto out; 417 goto out;
@@ -256,18 +419,18 @@ disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file)
256 419
257 list_del_rcu(&link->list); 420 list_del_rcu(&link->list);
258 wait = 1; 421 wait = 1;
259 if (!list_empty(&tp->files)) 422 if (!list_empty(&tk->tp.files))
260 goto out; 423 goto out;
261 424
262 tp->flags &= ~TP_FLAG_TRACE; 425 tk->tp.flags &= ~TP_FLAG_TRACE;
263 } else 426 } else
264 tp->flags &= ~TP_FLAG_PROFILE; 427 tk->tp.flags &= ~TP_FLAG_PROFILE;
265 428
266 if (!trace_probe_is_enabled(tp) && trace_probe_is_registered(tp)) { 429 if (!trace_probe_is_enabled(&tk->tp) && trace_probe_is_registered(&tk->tp)) {
267 if (trace_probe_is_return(tp)) 430 if (trace_kprobe_is_return(tk))
268 disable_kretprobe(&tp->rp); 431 disable_kretprobe(&tk->rp);
269 else 432 else
270 disable_kprobe(&tp->rp.kp); 433 disable_kprobe(&tk->rp.kp);
271 wait = 1; 434 wait = 1;
272 } 435 }
273 out: 436 out:
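The wait flag set beside list_del_rcu() matters because the trace handlers further down walk tp.files under RCU protection: an unlinked event_file_link may only be freed after a grace period. The tail of this function is elided in the hunk; a generic sketch of the unlink-then-free pattern it prepares for (demo_unlink_and_free() is an invented name, not the kernel code):

static void demo_unlink_and_free(struct event_file_link *link)
{
	list_del_rcu(&link->list);	/* RCU readers may still see the link */
	synchronize_sched();		/* wait out every read-side critical section */
	kfree(link);			/* no handler can be referencing it now */
}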
@@ -288,40 +451,40 @@ disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file)
288} 451}
289 452
290/* Internal register function - just handle k*probes and flags */ 453/* Internal register function - just handle k*probes and flags */
291static int __register_trace_probe(struct trace_probe *tp) 454static int __register_trace_kprobe(struct trace_kprobe *tk)
292{ 455{
293 int i, ret; 456 int i, ret;
294 457
295 if (trace_probe_is_registered(tp)) 458 if (trace_probe_is_registered(&tk->tp))
296 return -EINVAL; 459 return -EINVAL;
297 460
298 for (i = 0; i < tp->nr_args; i++) 461 for (i = 0; i < tk->tp.nr_args; i++)
299 traceprobe_update_arg(&tp->args[i]); 462 traceprobe_update_arg(&tk->tp.args[i]);
300 463
301 /* Set/clear disabled flag according to tp->flag */ 464 /* Set/clear disabled flag according to tp->flag */
302 if (trace_probe_is_enabled(tp)) 465 if (trace_probe_is_enabled(&tk->tp))
303 tp->rp.kp.flags &= ~KPROBE_FLAG_DISABLED; 466 tk->rp.kp.flags &= ~KPROBE_FLAG_DISABLED;
304 else 467 else
305 tp->rp.kp.flags |= KPROBE_FLAG_DISABLED; 468 tk->rp.kp.flags |= KPROBE_FLAG_DISABLED;
306 469
307 if (trace_probe_is_return(tp)) 470 if (trace_kprobe_is_return(tk))
308 ret = register_kretprobe(&tp->rp); 471 ret = register_kretprobe(&tk->rp);
309 else 472 else
310 ret = register_kprobe(&tp->rp.kp); 473 ret = register_kprobe(&tk->rp.kp);
311 474
312 if (ret == 0) 475 if (ret == 0)
313 tp->flags |= TP_FLAG_REGISTERED; 476 tk->tp.flags |= TP_FLAG_REGISTERED;
314 else { 477 else {
315 pr_warning("Could not insert probe at %s+%lu: %d\n", 478 pr_warning("Could not insert probe at %s+%lu: %d\n",
316 trace_probe_symbol(tp), trace_probe_offset(tp), ret); 479 trace_kprobe_symbol(tk), trace_kprobe_offset(tk), ret);
317 if (ret == -ENOENT && trace_probe_is_on_module(tp)) { 480 if (ret == -ENOENT && trace_kprobe_is_on_module(tk)) {
318 pr_warning("This probe might be able to register after" 481 pr_warning("This probe might be able to register after"
319 " target module is loaded. Continue.\n"); 482
320 ret = 0; 483 ret = 0;
321 } else if (ret == -EILSEQ) { 484 } else if (ret == -EILSEQ) {
322 pr_warning("Probing address(0x%p) is not an " 485 pr_warning("Probing address(0x%p) is not an "
323 "instruction boundary.\n", 486 "instruction boundary.\n",
324 tp->rp.kp.addr); 487 tk->rp.kp.addr);
325 ret = -EINVAL; 488 ret = -EINVAL;
326 } 489 }
327 } 490 }
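__register_trace_kprobe() is in the end a thin wrapper around the raw kprobes API (register_kprobe()/register_kretprobe()), with the TP_FLAG_* bookkeeping and the module-not-yet-loaded special case layered on top. For readers unfamiliar with that API, a self-contained module-style sketch; every name and the probed symbol are made-up examples, not part of this patch:

#include <linux/module.h>
#include <linux/kprobes.h>

static int demo_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
	pr_info("hit %s\n", p->symbol_name);
	return 0;				/* do not alter execution */
}

static struct kprobe demo_kp = {
	.symbol_name	= "do_sys_open",	/* assumed example target */
	.pre_handler	= demo_pre_handler,
};

static int __init demo_init(void)
{
	return register_kprobe(&demo_kp);	/* negative errno on failure */
}

static void __exit demo_exit(void)
{
	unregister_kprobe(&demo_kp);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");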
@@ -330,67 +493,67 @@ static int __register_trace_probe(struct trace_probe *tp)
330} 493}
331 494
332/* Internal unregister function - just handle k*probes and flags */ 495/* Internal unregister function - just handle k*probes and flags */
333static void __unregister_trace_probe(struct trace_probe *tp) 496static void __unregister_trace_kprobe(struct trace_kprobe *tk)
334{ 497{
335 if (trace_probe_is_registered(tp)) { 498 if (trace_probe_is_registered(&tk->tp)) {
336 if (trace_probe_is_return(tp)) 499 if (trace_kprobe_is_return(tk))
337 unregister_kretprobe(&tp->rp); 500 unregister_kretprobe(&tk->rp);
338 else 501 else
339 unregister_kprobe(&tp->rp.kp); 502 unregister_kprobe(&tk->rp.kp);
340 tp->flags &= ~TP_FLAG_REGISTERED; 503 tk->tp.flags &= ~TP_FLAG_REGISTERED;
341 /* Cleanup kprobe for reuse */ 504 /* Cleanup kprobe for reuse */
342 if (tp->rp.kp.symbol_name) 505 if (tk->rp.kp.symbol_name)
343 tp->rp.kp.addr = NULL; 506 tk->rp.kp.addr = NULL;
344 } 507 }
345} 508}
346 509
347/* Unregister a trace_probe and probe_event: call with locking probe_lock */ 510/* Unregister a trace_probe and probe_event: call with locking probe_lock */
348static int unregister_trace_probe(struct trace_probe *tp) 511static int unregister_trace_kprobe(struct trace_kprobe *tk)
349{ 512{
350 /* Enabled event can not be unregistered */ 513 /* Enabled event can not be unregistered */
351 if (trace_probe_is_enabled(tp)) 514 if (trace_probe_is_enabled(&tk->tp))
352 return -EBUSY; 515 return -EBUSY;
353 516
354 /* Will fail if probe is being used by ftrace or perf */ 517 /* Will fail if probe is being used by ftrace or perf */
355 if (unregister_probe_event(tp)) 518 if (unregister_kprobe_event(tk))
356 return -EBUSY; 519 return -EBUSY;
357 520
358 __unregister_trace_probe(tp); 521 __unregister_trace_kprobe(tk);
359 list_del(&tp->list); 522 list_del(&tk->list);
360 523
361 return 0; 524 return 0;
362} 525}
363 526
364/* Register a trace_probe and probe_event */ 527/* Register a trace_probe and probe_event */
365static int register_trace_probe(struct trace_probe *tp) 528static int register_trace_kprobe(struct trace_kprobe *tk)
366{ 529{
367 struct trace_probe *old_tp; 530 struct trace_kprobe *old_tk;
368 int ret; 531 int ret;
369 532
370 mutex_lock(&probe_lock); 533 mutex_lock(&probe_lock);
371 534
372 /* Delete old (same name) event if exist */ 535 /* Delete old (same name) event if exist */
373 old_tp = find_trace_probe(tp->call.name, tp->call.class->system); 536 old_tk = find_trace_kprobe(tk->tp.call.name, tk->tp.call.class->system);
374 if (old_tp) { 537 if (old_tk) {
375 ret = unregister_trace_probe(old_tp); 538 ret = unregister_trace_kprobe(old_tk);
376 if (ret < 0) 539 if (ret < 0)
377 goto end; 540 goto end;
378 free_trace_probe(old_tp); 541 free_trace_kprobe(old_tk);
379 } 542 }
380 543
381 /* Register new event */ 544 /* Register new event */
382 ret = register_probe_event(tp); 545 ret = register_kprobe_event(tk);
383 if (ret) { 546 if (ret) {
384 pr_warning("Failed to register probe event(%d)\n", ret); 547 pr_warning("Failed to register probe event(%d)\n", ret);
385 goto end; 548 goto end;
386 } 549 }
387 550
388 /* Register k*probe */ 551 /* Register k*probe */
389 ret = __register_trace_probe(tp); 552 ret = __register_trace_kprobe(tk);
390 if (ret < 0) 553 if (ret < 0)
391 unregister_probe_event(tp); 554 unregister_kprobe_event(tk);
392 else 555 else
393 list_add_tail(&tp->list, &probe_list); 556 list_add_tail(&tk->list, &probe_list);
394 557
395end: 558end:
396 mutex_unlock(&probe_lock); 559 mutex_unlock(&probe_lock);
@@ -398,11 +561,11 @@ end:
398} 561}
399 562
400/* Module notifier call back, checking event on the module */ 563/* Module notifier call back, checking event on the module */
401static int trace_probe_module_callback(struct notifier_block *nb, 564static int trace_kprobe_module_callback(struct notifier_block *nb,
402 unsigned long val, void *data) 565 unsigned long val, void *data)
403{ 566{
404 struct module *mod = data; 567 struct module *mod = data;
405 struct trace_probe *tp; 568 struct trace_kprobe *tk;
406 int ret; 569 int ret;
407 570
408 if (val != MODULE_STATE_COMING) 571 if (val != MODULE_STATE_COMING)
@@ -410,15 +573,15 @@ static int trace_probe_module_callback(struct notifier_block *nb,
410 573
411 /* Update probes on coming module */ 574 /* Update probes on coming module */
412 mutex_lock(&probe_lock); 575 mutex_lock(&probe_lock);
413 list_for_each_entry(tp, &probe_list, list) { 576 list_for_each_entry(tk, &probe_list, list) {
414 if (trace_probe_within_module(tp, mod)) { 577 if (trace_kprobe_within_module(tk, mod)) {
415 /* Don't need to check busy - this should have gone. */ 578 /* Don't need to check busy - this should have gone. */
416 __unregister_trace_probe(tp); 579 __unregister_trace_kprobe(tk);
417 ret = __register_trace_probe(tp); 580 ret = __register_trace_kprobe(tk);
418 if (ret) 581 if (ret)
419 pr_warning("Failed to re-register probe %s on" 582 pr_warning("Failed to re-register probe %s on"
420 " %s: %d\n", 583
421 tp->call.name, mod->name, ret); 584 tk->tp.call.name, mod->name, ret);
422 } 585 }
423 } 586 }
424 mutex_unlock(&probe_lock); 587 mutex_unlock(&probe_lock);
@@ -426,12 +589,12 @@ static int trace_probe_module_callback(struct notifier_block *nb,
426 return NOTIFY_DONE; 589 return NOTIFY_DONE;
427} 590}
428 591
429static struct notifier_block trace_probe_module_nb = { 592static struct notifier_block trace_kprobe_module_nb = {
430 .notifier_call = trace_probe_module_callback, 593 .notifier_call = trace_kprobe_module_callback,
431 .priority = 1 /* Invoked after kprobe module callback */ 594 .priority = 1 /* Invoked after kprobe module callback */
432}; 595};
433 596
434static int create_trace_probe(int argc, char **argv) 597static int create_trace_kprobe(int argc, char **argv)
435{ 598{
436 /* 599 /*
437 * Argument syntax: 600 * Argument syntax:
@@ -451,7 +614,7 @@ static int create_trace_probe(int argc, char **argv)
451 * Type of args: 614 * Type of args:
452 * FETCHARG:TYPE : use TYPE instead of unsigned long. 615 * FETCHARG:TYPE : use TYPE instead of unsigned long.
453 */ 616 */
454 struct trace_probe *tp; 617 struct trace_kprobe *tk;
455 int i, ret = 0; 618 int i, ret = 0;
456 bool is_return = false, is_delete = false; 619 bool is_return = false, is_delete = false;
457 char *symbol = NULL, *event = NULL, *group = NULL; 620 char *symbol = NULL, *event = NULL, *group = NULL;
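The command strings parsed here normally arrive from user space through the kprobe_events file that init_kprobe_trace() creates further down. A hedged user-space example of driving this parser; the probed symbol and fetch arguments are purely illustrative and the debugfs mount point may differ on a given system:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* "p:" adds an entry probe; "$stack"/"$stack0" are generic fetch args */
	const char *cmd = "p:myprobe do_sys_open $stack $stack0\n";
	int fd = open("/sys/kernel/debug/tracing/kprobe_events",
		      O_WRONLY | O_APPEND);	/* '>>' semantics; O_TRUNC clears all probes */

	if (fd < 0) {
		perror("kprobe_events");
		return 1;
	}
	if (write(fd, cmd, strlen(cmd)) < 0)	/* handed to create_trace_kprobe() */
		perror("write");
	close(fd);
	return 0;
}

Removing the probe later is the same kind of write with "-:myprobe", which takes the is_delete branch handled in the next hunk.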
@@ -498,16 +661,16 @@ static int create_trace_probe(int argc, char **argv)
498 return -EINVAL; 661 return -EINVAL;
499 } 662 }
500 mutex_lock(&probe_lock); 663 mutex_lock(&probe_lock);
501 tp = find_trace_probe(event, group); 664 tk = find_trace_kprobe(event, group);
502 if (!tp) { 665 if (!tk) {
503 mutex_unlock(&probe_lock); 666 mutex_unlock(&probe_lock);
504 pr_info("Event %s/%s doesn't exist.\n", group, event); 667 pr_info("Event %s/%s doesn't exist.\n", group, event);
505 return -ENOENT; 668 return -ENOENT;
506 } 669 }
507 /* delete an event */ 670 /* delete an event */
508 ret = unregister_trace_probe(tp); 671 ret = unregister_trace_kprobe(tk);
509 if (ret == 0) 672 if (ret == 0)
510 free_trace_probe(tp); 673 free_trace_kprobe(tk);
511 mutex_unlock(&probe_lock); 674 mutex_unlock(&probe_lock);
512 return ret; 675 return ret;
513 } 676 }
@@ -554,47 +717,49 @@ static int create_trace_probe(int argc, char **argv)
554 is_return ? 'r' : 'p', addr); 717 is_return ? 'r' : 'p', addr);
555 event = buf; 718 event = buf;
556 } 719 }
557 tp = alloc_trace_probe(group, event, addr, symbol, offset, argc, 720 tk = alloc_trace_kprobe(group, event, addr, symbol, offset, argc,
558 is_return); 721 is_return);
559 if (IS_ERR(tp)) { 722 if (IS_ERR(tk)) {
560 pr_info("Failed to allocate trace_probe.(%d)\n", 723 pr_info("Failed to allocate trace_probe.(%d)\n",
561 (int)PTR_ERR(tp)); 724 (int)PTR_ERR(tk));
562 return PTR_ERR(tp); 725 return PTR_ERR(tk);
563 } 726 }
564 727
565 /* parse arguments */ 728 /* parse arguments */
566 ret = 0; 729 ret = 0;
567 for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { 730 for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
731 struct probe_arg *parg = &tk->tp.args[i];
732
568 /* Increment count for freeing args in error case */ 733 /* Increment count for freeing args in error case */
569 tp->nr_args++; 734 tk->tp.nr_args++;
570 735
571 /* Parse argument name */ 736 /* Parse argument name */
572 arg = strchr(argv[i], '='); 737 arg = strchr(argv[i], '=');
573 if (arg) { 738 if (arg) {
574 *arg++ = '\0'; 739 *arg++ = '\0';
575 tp->args[i].name = kstrdup(argv[i], GFP_KERNEL); 740 parg->name = kstrdup(argv[i], GFP_KERNEL);
576 } else { 741 } else {
577 arg = argv[i]; 742 arg = argv[i];
578 /* If argument name is omitted, set "argN" */ 743 /* If argument name is omitted, set "argN" */
579 snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1); 744 snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1);
580 tp->args[i].name = kstrdup(buf, GFP_KERNEL); 745 parg->name = kstrdup(buf, GFP_KERNEL);
581 } 746 }
582 747
583 if (!tp->args[i].name) { 748 if (!parg->name) {
584 pr_info("Failed to allocate argument[%d] name.\n", i); 749 pr_info("Failed to allocate argument[%d] name.\n", i);
585 ret = -ENOMEM; 750 ret = -ENOMEM;
586 goto error; 751 goto error;
587 } 752 }
588 753
589 if (!is_good_name(tp->args[i].name)) { 754 if (!is_good_name(parg->name)) {
590 pr_info("Invalid argument[%d] name: %s\n", 755 pr_info("Invalid argument[%d] name: %s\n",
591 i, tp->args[i].name); 756 i, parg->name);
592 ret = -EINVAL; 757 ret = -EINVAL;
593 goto error; 758 goto error;
594 } 759 }
595 760
596 if (traceprobe_conflict_field_name(tp->args[i].name, 761 if (traceprobe_conflict_field_name(parg->name,
597 tp->args, i)) { 762 tk->tp.args, i)) {
598 pr_info("Argument[%d] name '%s' conflicts with " 763 pr_info("Argument[%d] name '%s' conflicts with "
599 "another field.\n", i, argv[i]); 764 "another field.\n", i, argv[i]);
600 ret = -EINVAL; 765 ret = -EINVAL;
@@ -602,7 +767,7 @@ static int create_trace_probe(int argc, char **argv)
602 } 767 }
603 768
604 /* Parse fetch argument */ 769 /* Parse fetch argument */
605 ret = traceprobe_parse_probe_arg(arg, &tp->size, &tp->args[i], 770 ret = traceprobe_parse_probe_arg(arg, &tk->tp.size, parg,
606 is_return, true); 771 is_return, true);
607 if (ret) { 772 if (ret) {
608 pr_info("Parse error at argument[%d]. (%d)\n", i, ret); 773 pr_info("Parse error at argument[%d]. (%d)\n", i, ret);
@@ -610,35 +775,35 @@ static int create_trace_probe(int argc, char **argv)
610 } 775 }
611 } 776 }
612 777
613 ret = register_trace_probe(tp); 778 ret = register_trace_kprobe(tk);
614 if (ret) 779 if (ret)
615 goto error; 780 goto error;
616 return 0; 781 return 0;
617 782
618error: 783error:
619 free_trace_probe(tp); 784 free_trace_kprobe(tk);
620 return ret; 785 return ret;
621} 786}
622 787
623static int release_all_trace_probes(void) 788static int release_all_trace_kprobes(void)
624{ 789{
625 struct trace_probe *tp; 790 struct trace_kprobe *tk;
626 int ret = 0; 791 int ret = 0;
627 792
628 mutex_lock(&probe_lock); 793 mutex_lock(&probe_lock);
629 /* Ensure no probe is in use. */ 794 /* Ensure no probe is in use. */
630 list_for_each_entry(tp, &probe_list, list) 795 list_for_each_entry(tk, &probe_list, list)
631 if (trace_probe_is_enabled(tp)) { 796 if (trace_probe_is_enabled(&tk->tp)) {
632 ret = -EBUSY; 797 ret = -EBUSY;
633 goto end; 798 goto end;
634 } 799 }
635 /* TODO: Use batch unregistration */ 800 /* TODO: Use batch unregistration */
636 while (!list_empty(&probe_list)) { 801 while (!list_empty(&probe_list)) {
637 tp = list_entry(probe_list.next, struct trace_probe, list); 802 tk = list_entry(probe_list.next, struct trace_kprobe, list);
638 ret = unregister_trace_probe(tp); 803 ret = unregister_trace_kprobe(tk);
639 if (ret) 804 if (ret)
640 goto end; 805 goto end;
641 free_trace_probe(tp); 806 free_trace_kprobe(tk);
642 } 807 }
643 808
644end: 809end:
@@ -666,22 +831,22 @@ static void probes_seq_stop(struct seq_file *m, void *v)
666 831
667static int probes_seq_show(struct seq_file *m, void *v) 832static int probes_seq_show(struct seq_file *m, void *v)
668{ 833{
669 struct trace_probe *tp = v; 834 struct trace_kprobe *tk = v;
670 int i; 835 int i;
671 836
672 seq_printf(m, "%c", trace_probe_is_return(tp) ? 'r' : 'p'); 837 seq_printf(m, "%c", trace_kprobe_is_return(tk) ? 'r' : 'p');
673 seq_printf(m, ":%s/%s", tp->call.class->system, tp->call.name); 838 seq_printf(m, ":%s/%s", tk->tp.call.class->system, tk->tp.call.name);
674 839
675 if (!tp->symbol) 840 if (!tk->symbol)
676 seq_printf(m, " 0x%p", tp->rp.kp.addr); 841 seq_printf(m, " 0x%p", tk->rp.kp.addr);
677 else if (tp->rp.kp.offset) 842 else if (tk->rp.kp.offset)
678 seq_printf(m, " %s+%u", trace_probe_symbol(tp), 843 seq_printf(m, " %s+%u", trace_kprobe_symbol(tk),
679 tp->rp.kp.offset); 844 tk->rp.kp.offset);
680 else 845 else
681 seq_printf(m, " %s", trace_probe_symbol(tp)); 846 seq_printf(m, " %s", trace_kprobe_symbol(tk));
682 847
683 for (i = 0; i < tp->nr_args; i++) 848 for (i = 0; i < tk->tp.nr_args; i++)
684 seq_printf(m, " %s=%s", tp->args[i].name, tp->args[i].comm); 849 seq_printf(m, " %s=%s", tk->tp.args[i].name, tk->tp.args[i].comm);
685 seq_printf(m, "\n"); 850 seq_printf(m, "\n");
686 851
687 return 0; 852 return 0;
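For reference, this seq_show prints each probe back in the same syntax it was created with, so the example definition written earlier ("p:myprobe do_sys_open $stack $stack0") would list back roughly as:

	p:kprobes/myprobe do_sys_open arg1=$stack arg2=$stack0

with "kprobes" being the default group and "argN" the auto-generated argument names.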
@@ -699,7 +864,7 @@ static int probes_open(struct inode *inode, struct file *file)
699 int ret; 864 int ret;
700 865
701 if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { 866 if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
702 ret = release_all_trace_probes(); 867 ret = release_all_trace_kprobes();
703 if (ret < 0) 868 if (ret < 0)
704 return ret; 869 return ret;
705 } 870 }
@@ -711,7 +876,7 @@ static ssize_t probes_write(struct file *file, const char __user *buffer,
711 size_t count, loff_t *ppos) 876 size_t count, loff_t *ppos)
712{ 877{
713 return traceprobe_probes_write(file, buffer, count, ppos, 878 return traceprobe_probes_write(file, buffer, count, ppos,
714 create_trace_probe); 879 create_trace_kprobe);
715} 880}
716 881
717static const struct file_operations kprobe_events_ops = { 882static const struct file_operations kprobe_events_ops = {
@@ -726,10 +891,10 @@ static const struct file_operations kprobe_events_ops = {
726/* Probes profiling interfaces */ 891/* Probes profiling interfaces */
727static int probes_profile_seq_show(struct seq_file *m, void *v) 892static int probes_profile_seq_show(struct seq_file *m, void *v)
728{ 893{
729 struct trace_probe *tp = v; 894 struct trace_kprobe *tk = v;
730 895
731 seq_printf(m, " %-44s %15lu %15lu\n", tp->call.name, tp->nhit, 896 seq_printf(m, " %-44s %15lu %15lu\n", tk->tp.call.name, tk->nhit,
732 tp->rp.kp.nmissed); 897 tk->rp.kp.nmissed);
733 898
734 return 0; 899 return 0;
735} 900}
@@ -754,57 +919,9 @@ static const struct file_operations kprobe_profile_ops = {
754 .release = seq_release, 919 .release = seq_release,
755}; 920};
756 921
757/* Sum up total data length for dynamic arrays (strings) */
758static __kprobes int __get_data_size(struct trace_probe *tp,
759 struct pt_regs *regs)
760{
761 int i, ret = 0;
762 u32 len;
763
764 for (i = 0; i < tp->nr_args; i++)
765 if (unlikely(tp->args[i].fetch_size.fn)) {
766 call_fetch(&tp->args[i].fetch_size, regs, &len);
767 ret += len;
768 }
769
770 return ret;
771}
772
773/* Store the value of each argument */
774static __kprobes void store_trace_args(int ent_size, struct trace_probe *tp,
775 struct pt_regs *regs,
776 u8 *data, int maxlen)
777{
778 int i;
779 u32 end = tp->size;
780 u32 *dl; /* Data (relative) location */
781
782 for (i = 0; i < tp->nr_args; i++) {
783 if (unlikely(tp->args[i].fetch_size.fn)) {
784 /*
785 * First, we set the relative location and
786 * maximum data length to *dl
787 */
788 dl = (u32 *)(data + tp->args[i].offset);
789 *dl = make_data_rloc(maxlen, end - tp->args[i].offset);
790 /* Then try to fetch string or dynamic array data */
791 call_fetch(&tp->args[i].fetch, regs, dl);
792 /* Reduce maximum length */
793 end += get_rloc_len(*dl);
794 maxlen -= get_rloc_len(*dl);
795 /* Trick here, convert data_rloc to data_loc */
796 *dl = convert_rloc_to_loc(*dl,
797 ent_size + tp->args[i].offset);
798 } else
799 /* Just fetching data normally */
800 call_fetch(&tp->args[i].fetch, regs,
801 data + tp->args[i].offset);
802 }
803}
804
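__get_data_size() and store_trace_args(), deleted above, are not dropped; they appear to move into the shared trace_probe code so the uprobe side can reuse them unchanged. The u32 "data (relative) location" they manipulate packs a length and an offset into one word; a hedged sketch of the assumed encoding behind make_data_rloc()/get_rloc_len()/get_rloc_offs() (the demo_* names are invented):

/* Illustrative only: 16-bit length in the high half, 16-bit offset
 * (relative to the trace entry) in the low half. */
static inline u32 demo_make_data_loc(int len, int offs)
{
	return ((u32)len << 16) | (offs & 0xffff);
}

static inline int demo_get_len(u32 dl)  { return dl >> 16; }
static inline int demo_get_offs(u32 dl) { return dl & 0xffff; }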
805/* Kprobe handler */ 922/* Kprobe handler */
806static __kprobes void 923static __kprobes void
807__kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs, 924__kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs,
808 struct ftrace_event_file *ftrace_file) 925 struct ftrace_event_file *ftrace_file)
809{ 926{
810 struct kprobe_trace_entry_head *entry; 927 struct kprobe_trace_entry_head *entry;
@@ -812,18 +929,18 @@ __kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs,
812 struct ring_buffer *buffer; 929 struct ring_buffer *buffer;
813 int size, dsize, pc; 930 int size, dsize, pc;
814 unsigned long irq_flags; 931 unsigned long irq_flags;
815 struct ftrace_event_call *call = &tp->call; 932 struct ftrace_event_call *call = &tk->tp.call;
816 933
817 WARN_ON(call != ftrace_file->event_call); 934 WARN_ON(call != ftrace_file->event_call);
818 935
819 if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &ftrace_file->flags)) 936 if (ftrace_trigger_soft_disabled(ftrace_file))
820 return; 937 return;
821 938
822 local_save_flags(irq_flags); 939 local_save_flags(irq_flags);
823 pc = preempt_count(); 940 pc = preempt_count();
824 941
825 dsize = __get_data_size(tp, regs); 942 dsize = __get_data_size(&tk->tp, regs);
826 size = sizeof(*entry) + tp->size + dsize; 943 size = sizeof(*entry) + tk->tp.size + dsize;
827 944
828 event = trace_event_buffer_lock_reserve(&buffer, ftrace_file, 945 event = trace_event_buffer_lock_reserve(&buffer, ftrace_file,
829 call->event.type, 946 call->event.type,
@@ -832,26 +949,25 @@ __kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs,
832 return; 949 return;
833 950
834 entry = ring_buffer_event_data(event); 951 entry = ring_buffer_event_data(event);
835 entry->ip = (unsigned long)tp->rp.kp.addr; 952 entry->ip = (unsigned long)tk->rp.kp.addr;
836 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 953 store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
837 954
838 if (!filter_check_discard(ftrace_file, entry, buffer, event)) 955 event_trigger_unlock_commit_regs(ftrace_file, buffer, event,
839 trace_buffer_unlock_commit_regs(buffer, event, 956 entry, irq_flags, pc, regs);
840 irq_flags, pc, regs);
841} 957}
842 958
843static __kprobes void 959static __kprobes void
844kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs) 960kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs)
845{ 961{
846 struct event_file_link *link; 962 struct event_file_link *link;
847 963
848 list_for_each_entry_rcu(link, &tp->files, list) 964 list_for_each_entry_rcu(link, &tk->tp.files, list)
849 __kprobe_trace_func(tp, regs, link->file); 965 __kprobe_trace_func(tk, regs, link->file);
850} 966}
851 967
852/* Kretprobe handler */ 968/* Kretprobe handler */
853static __kprobes void 969static __kprobes void
854__kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri, 970__kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
855 struct pt_regs *regs, 971 struct pt_regs *regs,
856 struct ftrace_event_file *ftrace_file) 972 struct ftrace_event_file *ftrace_file)
857{ 973{
@@ -860,18 +976,18 @@ __kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri,
860 struct ring_buffer *buffer; 976 struct ring_buffer *buffer;
861 int size, pc, dsize; 977 int size, pc, dsize;
862 unsigned long irq_flags; 978 unsigned long irq_flags;
863 struct ftrace_event_call *call = &tp->call; 979 struct ftrace_event_call *call = &tk->tp.call;
864 980
865 WARN_ON(call != ftrace_file->event_call); 981 WARN_ON(call != ftrace_file->event_call);
866 982
867 if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &ftrace_file->flags)) 983 if (ftrace_trigger_soft_disabled(ftrace_file))
868 return; 984 return;
869 985
870 local_save_flags(irq_flags); 986 local_save_flags(irq_flags);
871 pc = preempt_count(); 987 pc = preempt_count();
872 988
873 dsize = __get_data_size(tp, regs); 989 dsize = __get_data_size(&tk->tp, regs);
874 size = sizeof(*entry) + tp->size + dsize; 990 size = sizeof(*entry) + tk->tp.size + dsize;
875 991
876 event = trace_event_buffer_lock_reserve(&buffer, ftrace_file, 992 event = trace_event_buffer_lock_reserve(&buffer, ftrace_file,
877 call->event.type, 993 call->event.type,
@@ -880,23 +996,22 @@ __kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri,
880 return; 996 return;
881 997
882 entry = ring_buffer_event_data(event); 998 entry = ring_buffer_event_data(event);
883 entry->func = (unsigned long)tp->rp.kp.addr; 999 entry->func = (unsigned long)tk->rp.kp.addr;
884 entry->ret_ip = (unsigned long)ri->ret_addr; 1000 entry->ret_ip = (unsigned long)ri->ret_addr;
885 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 1001 store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
886 1002
887 if (!filter_check_discard(ftrace_file, entry, buffer, event)) 1003 event_trigger_unlock_commit_regs(ftrace_file, buffer, event,
888 trace_buffer_unlock_commit_regs(buffer, event, 1004 entry, irq_flags, pc, regs);
889 irq_flags, pc, regs);
890} 1005}
891 1006
892static __kprobes void 1007static __kprobes void
893kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri, 1008kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
894 struct pt_regs *regs) 1009 struct pt_regs *regs)
895{ 1010{
896 struct event_file_link *link; 1011 struct event_file_link *link;
897 1012
898 list_for_each_entry_rcu(link, &tp->files, list) 1013 list_for_each_entry_rcu(link, &tk->tp.files, list)
899 __kretprobe_trace_func(tp, ri, regs, link->file); 1014 __kretprobe_trace_func(tk, ri, regs, link->file);
900} 1015}
901 1016
902/* Event entry printers */ 1017/* Event entry printers */
@@ -983,16 +1098,18 @@ static int kprobe_event_define_fields(struct ftrace_event_call *event_call)
983{ 1098{
984 int ret, i; 1099 int ret, i;
985 struct kprobe_trace_entry_head field; 1100 struct kprobe_trace_entry_head field;
986 struct trace_probe *tp = (struct trace_probe *)event_call->data; 1101 struct trace_kprobe *tk = (struct trace_kprobe *)event_call->data;
987 1102
988 DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); 1103 DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0);
989 /* Set argument names as fields */ 1104 /* Set argument names as fields */
990 for (i = 0; i < tp->nr_args; i++) { 1105 for (i = 0; i < tk->tp.nr_args; i++) {
991 ret = trace_define_field(event_call, tp->args[i].type->fmttype, 1106 struct probe_arg *parg = &tk->tp.args[i];
992 tp->args[i].name, 1107
993 sizeof(field) + tp->args[i].offset, 1108 ret = trace_define_field(event_call, parg->type->fmttype,
994 tp->args[i].type->size, 1109 parg->name,
995 tp->args[i].type->is_signed, 1110 sizeof(field) + parg->offset,
1111 parg->type->size,
1112 parg->type->is_signed,
996 FILTER_OTHER); 1113 FILTER_OTHER);
997 if (ret) 1114 if (ret)
998 return ret; 1115 return ret;
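DEFINE_FIELD() used just above is a small convenience wrapper around the same trace_define_field() call made for each probe argument. A sketch of its assumed shape, reconstructed from how it is invoked here rather than quoted from trace_probe.h:

#define DEFINE_FIELD(type, item, name, is_signed)			\
	do {								\
		ret = trace_define_field(event_call, #type, name,	\
					 offsetof(typeof(field), item),	\
					 sizeof(field.item), is_signed,	\
					 FILTER_OTHER);			\
		if (ret)						\
			return ret;					\
	} while (0)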
@@ -1004,17 +1121,19 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
1004{ 1121{
1005 int ret, i; 1122 int ret, i;
1006 struct kretprobe_trace_entry_head field; 1123 struct kretprobe_trace_entry_head field;
1007 struct trace_probe *tp = (struct trace_probe *)event_call->data; 1124 struct trace_kprobe *tk = (struct trace_kprobe *)event_call->data;
1008 1125
1009 DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0); 1126 DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0);
1010 DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); 1127 DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0);
1011 /* Set argument names as fields */ 1128 /* Set argument names as fields */
1012 for (i = 0; i < tp->nr_args; i++) { 1129 for (i = 0; i < tk->tp.nr_args; i++) {
1013 ret = trace_define_field(event_call, tp->args[i].type->fmttype, 1130 struct probe_arg *parg = &tk->tp.args[i];
1014 tp->args[i].name, 1131
1015 sizeof(field) + tp->args[i].offset, 1132 ret = trace_define_field(event_call, parg->type->fmttype,
1016 tp->args[i].type->size, 1133 parg->name,
1017 tp->args[i].type->is_signed, 1134 sizeof(field) + parg->offset,
1135 parg->type->size,
1136 parg->type->is_signed,
1018 FILTER_OTHER); 1137 FILTER_OTHER);
1019 if (ret) 1138 if (ret)
1020 return ret; 1139 return ret;
@@ -1022,74 +1141,13 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
1022 return 0; 1141 return 0;
1023} 1142}
1024 1143
1025static int __set_print_fmt(struct trace_probe *tp, char *buf, int len)
1026{
1027 int i;
1028 int pos = 0;
1029
1030 const char *fmt, *arg;
1031
1032 if (!trace_probe_is_return(tp)) {
1033 fmt = "(%lx)";
1034 arg = "REC->" FIELD_STRING_IP;
1035 } else {
1036 fmt = "(%lx <- %lx)";
1037 arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP;
1038 }
1039
1040 /* When len=0, we just calculate the needed length */
1041#define LEN_OR_ZERO (len ? len - pos : 0)
1042
1043 pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt);
1044
1045 for (i = 0; i < tp->nr_args; i++) {
1046 pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s",
1047 tp->args[i].name, tp->args[i].type->fmt);
1048 }
1049
1050 pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg);
1051
1052 for (i = 0; i < tp->nr_args; i++) {
1053 if (strcmp(tp->args[i].type->name, "string") == 0)
1054 pos += snprintf(buf + pos, LEN_OR_ZERO,
1055 ", __get_str(%s)",
1056 tp->args[i].name);
1057 else
1058 pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s",
1059 tp->args[i].name);
1060 }
1061
1062#undef LEN_OR_ZERO
1063
1064 /* return the length of print_fmt */
1065 return pos;
1066}
1067
1068static int set_print_fmt(struct trace_probe *tp)
1069{
1070 int len;
1071 char *print_fmt;
1072
1073 /* First: called with 0 length to calculate the needed length */
1074 len = __set_print_fmt(tp, NULL, 0);
1075 print_fmt = kmalloc(len + 1, GFP_KERNEL);
1076 if (!print_fmt)
1077 return -ENOMEM;
1078
1079 /* Second: actually write the @print_fmt */
1080 __set_print_fmt(tp, print_fmt, len + 1);
1081 tp->call.print_fmt = print_fmt;
1082
1083 return 0;
1084}
1085
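set_print_fmt() and __set_print_fmt() are removed here only to be rebuilt in the shared trace_probe code with an explicit is_return parameter (see the later call set_print_fmt(&tk->tp, trace_kprobe_is_return(tk))). They rely on the usual measure-then-allocate idiom, where a first pass with length 0 only counts characters; a stand-alone user-space sketch of that idiom, not the kernel code itself:

#include <stdio.h>
#include <stdlib.h>

static int build_fmt(char *buf, int len)
{
	int pos = 0;

	/* With len == 0, snprintf() writes nothing but still reports the
	 * length each piece would need. */
	pos += snprintf(buf, len ? len - pos : 0, "\"(%%lx)");
	pos += snprintf(buf ? buf + pos : NULL, len ? len - pos : 0,
			"\", REC->ip");
	return pos;
}

int main(void)
{
	int len = build_fmt(NULL, 0);		/* first pass: measure */
	char *fmt = malloc(len + 1);

	if (!fmt)
		return 1;
	build_fmt(fmt, len + 1);		/* second pass: fill */
	puts(fmt);				/* prints: "(%lx)", REC->ip */
	free(fmt);
	return 0;
}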
1086#ifdef CONFIG_PERF_EVENTS 1144#ifdef CONFIG_PERF_EVENTS
1087 1145
1088/* Kprobe profile handler */ 1146/* Kprobe profile handler */
1089static __kprobes void 1147static __kprobes void
1090kprobe_perf_func(struct trace_probe *tp, struct pt_regs *regs) 1148kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
1091{ 1149{
1092 struct ftrace_event_call *call = &tp->call; 1150 struct ftrace_event_call *call = &tk->tp.call;
1093 struct kprobe_trace_entry_head *entry; 1151 struct kprobe_trace_entry_head *entry;
1094 struct hlist_head *head; 1152 struct hlist_head *head;
1095 int size, __size, dsize; 1153 int size, __size, dsize;
@@ -1099,8 +1157,8 @@ kprobe_perf_func(struct trace_probe *tp, struct pt_regs *regs)
1099 if (hlist_empty(head)) 1157 if (hlist_empty(head))
1100 return; 1158 return;
1101 1159
1102 dsize = __get_data_size(tp, regs); 1160 dsize = __get_data_size(&tk->tp, regs);
1103 __size = sizeof(*entry) + tp->size + dsize; 1161 __size = sizeof(*entry) + tk->tp.size + dsize;
1104 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1162 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1105 size -= sizeof(u32); 1163 size -= sizeof(u32);
1106 1164
@@ -1108,18 +1166,18 @@ kprobe_perf_func(struct trace_probe *tp, struct pt_regs *regs)
1108 if (!entry) 1166 if (!entry)
1109 return; 1167 return;
1110 1168
1111 entry->ip = (unsigned long)tp->rp.kp.addr; 1169 entry->ip = (unsigned long)tk->rp.kp.addr;
1112 memset(&entry[1], 0, dsize); 1170 memset(&entry[1], 0, dsize);
1113 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 1171 store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
1114 perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); 1172 perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
1115} 1173}
1116 1174
1117/* Kretprobe profile handler */ 1175/* Kretprobe profile handler */
1118static __kprobes void 1176static __kprobes void
1119kretprobe_perf_func(struct trace_probe *tp, struct kretprobe_instance *ri, 1177kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
1120 struct pt_regs *regs) 1178 struct pt_regs *regs)
1121{ 1179{
1122 struct ftrace_event_call *call = &tp->call; 1180 struct ftrace_event_call *call = &tk->tp.call;
1123 struct kretprobe_trace_entry_head *entry; 1181 struct kretprobe_trace_entry_head *entry;
1124 struct hlist_head *head; 1182 struct hlist_head *head;
1125 int size, __size, dsize; 1183 int size, __size, dsize;
@@ -1129,8 +1187,8 @@ kretprobe_perf_func(struct trace_probe *tp, struct kretprobe_instance *ri,
1129 if (hlist_empty(head)) 1187 if (hlist_empty(head))
1130 return; 1188 return;
1131 1189
1132 dsize = __get_data_size(tp, regs); 1190 dsize = __get_data_size(&tk->tp, regs);
1133 __size = sizeof(*entry) + tp->size + dsize; 1191 __size = sizeof(*entry) + tk->tp.size + dsize;
1134 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1192 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1135 size -= sizeof(u32); 1193 size -= sizeof(u32);
1136 1194
@@ -1138,9 +1196,9 @@ kretprobe_perf_func(struct trace_probe *tp, struct kretprobe_instance *ri,
1138 if (!entry) 1196 if (!entry)
1139 return; 1197 return;
1140 1198
1141 entry->func = (unsigned long)tp->rp.kp.addr; 1199 entry->func = (unsigned long)tk->rp.kp.addr;
1142 entry->ret_ip = (unsigned long)ri->ret_addr; 1200 entry->ret_ip = (unsigned long)ri->ret_addr;
1143 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 1201 store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
1144 perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); 1202 perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
1145} 1203}
1146#endif /* CONFIG_PERF_EVENTS */ 1204#endif /* CONFIG_PERF_EVENTS */
@@ -1155,20 +1213,20 @@ static __kprobes
1155int kprobe_register(struct ftrace_event_call *event, 1213int kprobe_register(struct ftrace_event_call *event,
1156 enum trace_reg type, void *data) 1214 enum trace_reg type, void *data)
1157{ 1215{
1158 struct trace_probe *tp = (struct trace_probe *)event->data; 1216 struct trace_kprobe *tk = (struct trace_kprobe *)event->data;
1159 struct ftrace_event_file *file = data; 1217 struct ftrace_event_file *file = data;
1160 1218
1161 switch (type) { 1219 switch (type) {
1162 case TRACE_REG_REGISTER: 1220 case TRACE_REG_REGISTER:
1163 return enable_trace_probe(tp, file); 1221 return enable_trace_kprobe(tk, file);
1164 case TRACE_REG_UNREGISTER: 1222 case TRACE_REG_UNREGISTER:
1165 return disable_trace_probe(tp, file); 1223 return disable_trace_kprobe(tk, file);
1166 1224
1167#ifdef CONFIG_PERF_EVENTS 1225#ifdef CONFIG_PERF_EVENTS
1168 case TRACE_REG_PERF_REGISTER: 1226 case TRACE_REG_PERF_REGISTER:
1169 return enable_trace_probe(tp, NULL); 1227 return enable_trace_kprobe(tk, NULL);
1170 case TRACE_REG_PERF_UNREGISTER: 1228 case TRACE_REG_PERF_UNREGISTER:
1171 return disable_trace_probe(tp, NULL); 1229 return disable_trace_kprobe(tk, NULL);
1172 case TRACE_REG_PERF_OPEN: 1230 case TRACE_REG_PERF_OPEN:
1173 case TRACE_REG_PERF_CLOSE: 1231 case TRACE_REG_PERF_CLOSE:
1174 case TRACE_REG_PERF_ADD: 1232 case TRACE_REG_PERF_ADD:
@@ -1182,15 +1240,15 @@ int kprobe_register(struct ftrace_event_call *event,
1182static __kprobes 1240static __kprobes
1183int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) 1241int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
1184{ 1242{
1185 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); 1243 struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp);
1186 1244
1187 tp->nhit++; 1245 tk->nhit++;
1188 1246
1189 if (tp->flags & TP_FLAG_TRACE) 1247 if (tk->tp.flags & TP_FLAG_TRACE)
1190 kprobe_trace_func(tp, regs); 1248 kprobe_trace_func(tk, regs);
1191#ifdef CONFIG_PERF_EVENTS 1249#ifdef CONFIG_PERF_EVENTS
1192 if (tp->flags & TP_FLAG_PROFILE) 1250 if (tk->tp.flags & TP_FLAG_PROFILE)
1193 kprobe_perf_func(tp, regs); 1251 kprobe_perf_func(tk, regs);
1194#endif 1252#endif
1195 return 0; /* We don't tweak the kernel, so just return 0 */ 1253
1196} 1254}
@@ -1198,15 +1256,15 @@ int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
1198static __kprobes 1256static __kprobes
1199int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) 1257int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
1200{ 1258{
1201 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); 1259 struct trace_kprobe *tk = container_of(ri->rp, struct trace_kprobe, rp);
1202 1260
1203 tp->nhit++; 1261 tk->nhit++;
1204 1262
1205 if (tp->flags & TP_FLAG_TRACE) 1263 if (tk->tp.flags & TP_FLAG_TRACE)
1206 kretprobe_trace_func(tp, ri, regs); 1264 kretprobe_trace_func(tk, ri, regs);
1207#ifdef CONFIG_PERF_EVENTS 1265#ifdef CONFIG_PERF_EVENTS
1208 if (tp->flags & TP_FLAG_PROFILE) 1266 if (tk->tp.flags & TP_FLAG_PROFILE)
1209 kretprobe_perf_func(tp, ri, regs); 1267 kretprobe_perf_func(tk, ri, regs);
1210#endif 1268#endif
1211 return 0; /* We don't tweek kernel, so just return 0 */ 1269 return 0; /* We don't tweek kernel, so just return 0 */
1212} 1270}
@@ -1219,21 +1277,21 @@ static struct trace_event_functions kprobe_funcs = {
1219 .trace = print_kprobe_event 1277 .trace = print_kprobe_event
1220}; 1278};
1221 1279
1222static int register_probe_event(struct trace_probe *tp) 1280static int register_kprobe_event(struct trace_kprobe *tk)
1223{ 1281{
1224 struct ftrace_event_call *call = &tp->call; 1282 struct ftrace_event_call *call = &tk->tp.call;
1225 int ret; 1283 int ret;
1226 1284
1227 /* Initialize ftrace_event_call */ 1285 /* Initialize ftrace_event_call */
1228 INIT_LIST_HEAD(&call->class->fields); 1286 INIT_LIST_HEAD(&call->class->fields);
1229 if (trace_probe_is_return(tp)) { 1287 if (trace_kprobe_is_return(tk)) {
1230 call->event.funcs = &kretprobe_funcs; 1288 call->event.funcs = &kretprobe_funcs;
1231 call->class->define_fields = kretprobe_event_define_fields; 1289 call->class->define_fields = kretprobe_event_define_fields;
1232 } else { 1290 } else {
1233 call->event.funcs = &kprobe_funcs; 1291 call->event.funcs = &kprobe_funcs;
1234 call->class->define_fields = kprobe_event_define_fields; 1292 call->class->define_fields = kprobe_event_define_fields;
1235 } 1293 }
1236 if (set_print_fmt(tp) < 0) 1294 if (set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0)
1237 return -ENOMEM; 1295 return -ENOMEM;
1238 ret = register_ftrace_event(&call->event); 1296 ret = register_ftrace_event(&call->event);
1239 if (!ret) { 1297 if (!ret) {
@@ -1242,7 +1300,7 @@ static int register_probe_event(struct trace_probe *tp)
1242 } 1300 }
1243 call->flags = 0; 1301 call->flags = 0;
1244 call->class->reg = kprobe_register; 1302 call->class->reg = kprobe_register;
1245 call->data = tp; 1303 call->data = tk;
1246 ret = trace_add_event_call(call); 1304 ret = trace_add_event_call(call);
1247 if (ret) { 1305 if (ret) {
1248 pr_info("Failed to register kprobe event: %s\n", call->name); 1306 pr_info("Failed to register kprobe event: %s\n", call->name);
@@ -1252,14 +1310,14 @@ static int register_probe_event(struct trace_probe *tp)
1252 return ret; 1310 return ret;
1253} 1311}
1254 1312
1255static int unregister_probe_event(struct trace_probe *tp) 1313static int unregister_kprobe_event(struct trace_kprobe *tk)
1256{ 1314{
1257 int ret; 1315 int ret;
1258 1316
1259 /* tp->event is unregistered in trace_remove_event_call() */ 1317 /* tp->event is unregistered in trace_remove_event_call() */
1260 ret = trace_remove_event_call(&tp->call); 1318 ret = trace_remove_event_call(&tk->tp.call);
1261 if (!ret) 1319 if (!ret)
1262 kfree(tp->call.print_fmt); 1320 kfree(tk->tp.call.print_fmt);
1263 return ret; 1321 return ret;
1264} 1322}
1265 1323
@@ -1269,7 +1327,7 @@ static __init int init_kprobe_trace(void)
1269 struct dentry *d_tracer; 1327 struct dentry *d_tracer;
1270 struct dentry *entry; 1328 struct dentry *entry;
1271 1329
1272 if (register_module_notifier(&trace_probe_module_nb)) 1330 if (register_module_notifier(&trace_kprobe_module_nb))
1273 return -EINVAL; 1331 return -EINVAL;
1274 1332
1275 d_tracer = tracing_init_dentry(); 1333 d_tracer = tracing_init_dentry();
@@ -1309,26 +1367,26 @@ static __used int kprobe_trace_selftest_target(int a1, int a2, int a3,
1309} 1367}
1310 1368
1311static struct ftrace_event_file * 1369static struct ftrace_event_file *
1312find_trace_probe_file(struct trace_probe *tp, struct trace_array *tr) 1370find_trace_probe_file(struct trace_kprobe *tk, struct trace_array *tr)
1313{ 1371{
1314 struct ftrace_event_file *file; 1372 struct ftrace_event_file *file;
1315 1373
1316 list_for_each_entry(file, &tr->events, list) 1374 list_for_each_entry(file, &tr->events, list)
1317 if (file->event_call == &tp->call) 1375 if (file->event_call == &tk->tp.call)
1318 return file; 1376 return file;
1319 1377
1320 return NULL; 1378 return NULL;
1321} 1379}
1322 1380
1323/* 1381/*
1324 * Nobody but us can call enable_trace_probe/disable_trace_probe at this 1382 * Nobody but us can call enable_trace_kprobe/disable_trace_kprobe at this
1325 * stage, we can do this lockless. 1383 * stage, we can do this lockless.
1326 */ 1384 */
1327static __init int kprobe_trace_self_tests_init(void) 1385static __init int kprobe_trace_self_tests_init(void)
1328{ 1386{
1329 int ret, warn = 0; 1387 int ret, warn = 0;
1330 int (*target)(int, int, int, int, int, int); 1388 int (*target)(int, int, int, int, int, int);
1331 struct trace_probe *tp; 1389 struct trace_kprobe *tk;
1332 struct ftrace_event_file *file; 1390 struct ftrace_event_file *file;
1333 1391
1334 target = kprobe_trace_selftest_target; 1392 target = kprobe_trace_selftest_target;
@@ -1337,44 +1395,44 @@ static __init int kprobe_trace_self_tests_init(void)
1337 1395
1338 ret = traceprobe_command("p:testprobe kprobe_trace_selftest_target " 1396 ret = traceprobe_command("p:testprobe kprobe_trace_selftest_target "
1339 "$stack $stack0 +0($stack)", 1397 "$stack $stack0 +0($stack)",
1340 create_trace_probe); 1398 create_trace_kprobe);
1341 if (WARN_ON_ONCE(ret)) { 1399 if (WARN_ON_ONCE(ret)) {
1342 pr_warn("error on probing function entry.\n"); 1400 pr_warn("error on probing function entry.\n");
1343 warn++; 1401 warn++;
1344 } else { 1402 } else {
1345 /* Enable trace point */ 1403 /* Enable trace point */
1346 tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM); 1404 tk = find_trace_kprobe("testprobe", KPROBE_EVENT_SYSTEM);
1347 if (WARN_ON_ONCE(tp == NULL)) { 1405 if (WARN_ON_ONCE(tk == NULL)) {
1348 pr_warn("error on getting new probe.\n"); 1406 pr_warn("error on getting new probe.\n");
1349 warn++; 1407 warn++;
1350 } else { 1408 } else {
1351 file = find_trace_probe_file(tp, top_trace_array()); 1409 file = find_trace_probe_file(tk, top_trace_array());
1352 if (WARN_ON_ONCE(file == NULL)) { 1410 if (WARN_ON_ONCE(file == NULL)) {
1353 pr_warn("error on getting probe file.\n"); 1411 pr_warn("error on getting probe file.\n");
1354 warn++; 1412 warn++;
1355 } else 1413 } else
1356 enable_trace_probe(tp, file); 1414 enable_trace_kprobe(tk, file);
1357 } 1415 }
1358 } 1416 }
1359 1417
1360 ret = traceprobe_command("r:testprobe2 kprobe_trace_selftest_target " 1418 ret = traceprobe_command("r:testprobe2 kprobe_trace_selftest_target "
1361 "$retval", create_trace_probe); 1419 "$retval", create_trace_kprobe);
1362 if (WARN_ON_ONCE(ret)) { 1420 if (WARN_ON_ONCE(ret)) {
1363 pr_warn("error on probing function return.\n"); 1421 pr_warn("error on probing function return.\n");
1364 warn++; 1422 warn++;
1365 } else { 1423 } else {
1366 /* Enable trace point */ 1424 /* Enable trace point */
1367 tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM); 1425 tk = find_trace_kprobe("testprobe2", KPROBE_EVENT_SYSTEM);
1368 if (WARN_ON_ONCE(tp == NULL)) { 1426 if (WARN_ON_ONCE(tk == NULL)) {
1369 pr_warn("error on getting 2nd new probe.\n"); 1427 pr_warn("error on getting 2nd new probe.\n");
1370 warn++; 1428 warn++;
1371 } else { 1429 } else {
1372 file = find_trace_probe_file(tp, top_trace_array()); 1430 file = find_trace_probe_file(tk, top_trace_array());
1373 if (WARN_ON_ONCE(file == NULL)) { 1431 if (WARN_ON_ONCE(file == NULL)) {
1374 pr_warn("error on getting probe file.\n"); 1432 pr_warn("error on getting probe file.\n");
1375 warn++; 1433 warn++;
1376 } else 1434 } else
1377 enable_trace_probe(tp, file); 1435 enable_trace_kprobe(tk, file);
1378 } 1436 }
1379 } 1437 }
1380 1438
@@ -1384,46 +1442,46 @@ static __init int kprobe_trace_self_tests_init(void)
1384 ret = target(1, 2, 3, 4, 5, 6); 1442 ret = target(1, 2, 3, 4, 5, 6);
1385 1443
1386 /* Disable trace points before removing it */ 1444 /* Disable trace points before removing it */
1387 tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM); 1445 tk = find_trace_kprobe("testprobe", KPROBE_EVENT_SYSTEM);
1388 if (WARN_ON_ONCE(tp == NULL)) { 1446 if (WARN_ON_ONCE(tk == NULL)) {
1389 pr_warn("error on getting test probe.\n"); 1447 pr_warn("error on getting test probe.\n");
1390 warn++; 1448 warn++;
1391 } else { 1449 } else {
1392 file = find_trace_probe_file(tp, top_trace_array()); 1450 file = find_trace_probe_file(tk, top_trace_array());
1393 if (WARN_ON_ONCE(file == NULL)) { 1451 if (WARN_ON_ONCE(file == NULL)) {
1394 pr_warn("error on getting probe file.\n"); 1452 pr_warn("error on getting probe file.\n");
1395 warn++; 1453 warn++;
1396 } else 1454 } else
1397 disable_trace_probe(tp, file); 1455 disable_trace_kprobe(tk, file);
1398 } 1456 }
1399 1457
1400 tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM); 1458 tk = find_trace_kprobe("testprobe2", KPROBE_EVENT_SYSTEM);
1401 if (WARN_ON_ONCE(tp == NULL)) { 1459 if (WARN_ON_ONCE(tk == NULL)) {
1402 pr_warn("error on getting 2nd test probe.\n"); 1460 pr_warn("error on getting 2nd test probe.\n");
1403 warn++; 1461 warn++;
1404 } else { 1462 } else {
1405 file = find_trace_probe_file(tp, top_trace_array()); 1463 file = find_trace_probe_file(tk, top_trace_array());
1406 if (WARN_ON_ONCE(file == NULL)) { 1464 if (WARN_ON_ONCE(file == NULL)) {
1407 pr_warn("error on getting probe file.\n"); 1465 pr_warn("error on getting probe file.\n");
1408 warn++; 1466 warn++;
1409 } else 1467 } else
1410 disable_trace_probe(tp, file); 1468 disable_trace_kprobe(tk, file);
1411 } 1469 }
1412 1470
1413 ret = traceprobe_command("-:testprobe", create_trace_probe); 1471 ret = traceprobe_command("-:testprobe", create_trace_kprobe);
1414 if (WARN_ON_ONCE(ret)) { 1472 if (WARN_ON_ONCE(ret)) {
1415 pr_warn("error on deleting a probe.\n"); 1473 pr_warn("error on deleting a probe.\n");
1416 warn++; 1474 warn++;
1417 } 1475 }
1418 1476
1419 ret = traceprobe_command("-:testprobe2", create_trace_probe); 1477 ret = traceprobe_command("-:testprobe2", create_trace_kprobe);
1420 if (WARN_ON_ONCE(ret)) { 1478 if (WARN_ON_ONCE(ret)) {
1421 pr_warn("error on deleting a probe.\n"); 1479 pr_warn("error on deleting a probe.\n");
1422 warn++; 1480 warn++;
1423 } 1481 }
1424 1482
1425end: 1483end:
1426 release_all_trace_probes(); 1484 release_all_trace_kprobes();
1427 if (warn) 1485 if (warn)
1428 pr_cont("NG: Some tests are failed. Please check them.\n"); 1486 pr_cont("NG: Some tests are failed. Please check them.\n");
1429 else 1487 else
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 412e959709b4..8364a421b4df 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -35,46 +35,27 @@ const char *reserved_field_names[] = {
35 FIELD_STRING_FUNC, 35 FIELD_STRING_FUNC,
36}; 36};
37 37
38/* Printing function type */
39#define PRINT_TYPE_FUNC_NAME(type) print_type_##type
40#define PRINT_TYPE_FMT_NAME(type) print_type_format_##type
41
42/* Printing in basic type function template */ 38/* Printing in basic type function template */
43#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \ 39#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt) \
44static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \ 40__kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \
45 const char *name, \ 41 const char *name, \
46 void *data, void *ent)\ 42 void *data, void *ent) \
47{ \ 43{ \
48 return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\ 44 return trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \
49} \ 45} \
50static const char PRINT_TYPE_FMT_NAME(type)[] = fmt; 46const char PRINT_TYPE_FMT_NAME(type)[] = fmt;
51
52DEFINE_BASIC_PRINT_TYPE_FUNC(u8, "%x", unsigned int)
53DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "%x", unsigned int)
54DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "%lx", unsigned long)
55DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "%llx", unsigned long long)
56DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d", int)
57DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int)
58DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long)
59DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long)
60
61static inline void *get_rloc_data(u32 *dl)
62{
63 return (u8 *)dl + get_rloc_offs(*dl);
64}
65 47
66/* For data_loc conversion */ 48DEFINE_BASIC_PRINT_TYPE_FUNC(u8 , "0x%x")
67static inline void *get_loc_data(u32 *dl, void *ent) 49DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "0x%x")
68{ 50DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "0x%x")
69 return (u8 *)ent + get_rloc_offs(*dl); 51DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "0x%Lx")
70} 52DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d")
71 53DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d")
72/* For defining macros, define string/string_size types */ 54DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%d")
73typedef u32 string; 55DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%Ld")
74typedef u32 string_size;
75 56
76/* Print type function for string type */ 57/* Print type function for string type */
77static __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, 58__kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s,
78 const char *name, 59 const char *name,
79 void *data, void *ent) 60 void *data, void *ent)
80{ 61{
@@ -87,18 +68,7 @@ static __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s,
87 (const char *)get_loc_data(data, ent)); 68 (const char *)get_loc_data(data, ent));
88} 69}
89 70
90static const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\""; 71const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\"";
91
92#define FETCH_FUNC_NAME(method, type) fetch_##method##_##type
93/*
94 * Define macro for basic types - we don't need to define s* types, because
95 * we have to care only about bitwidth at recording time.
96 */
97#define DEFINE_BASIC_FETCH_FUNCS(method) \
98DEFINE_FETCH_##method(u8) \
99DEFINE_FETCH_##method(u16) \
100DEFINE_FETCH_##method(u32) \
101DEFINE_FETCH_##method(u64)
102 72
103#define CHECK_FETCH_FUNCS(method, fn) \ 73#define CHECK_FETCH_FUNCS(method, fn) \
104 (((FETCH_FUNC_NAME(method, u8) == fn) || \ 74 (((FETCH_FUNC_NAME(method, u8) == fn) || \
@@ -111,7 +81,7 @@ DEFINE_FETCH_##method(u64)
111 81
112/* Data fetch function templates */ 82/* Data fetch function templates */
113#define DEFINE_FETCH_reg(type) \ 83#define DEFINE_FETCH_reg(type) \
114static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \ 84__kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \
115 void *offset, void *dest) \ 85 void *offset, void *dest) \
116{ \ 86{ \
117 *(type *)dest = (type)regs_get_register(regs, \ 87 *(type *)dest = (type)regs_get_register(regs, \
@@ -122,20 +92,8 @@ DEFINE_BASIC_FETCH_FUNCS(reg)
122#define fetch_reg_string NULL 92#define fetch_reg_string NULL
123#define fetch_reg_string_size NULL 93#define fetch_reg_string_size NULL
124 94
125#define DEFINE_FETCH_stack(type) \
126static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\
127 void *offset, void *dest) \
128{ \
129 *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \
130 (unsigned int)((unsigned long)offset)); \
131}
132DEFINE_BASIC_FETCH_FUNCS(stack)
133/* No string on the stack entry */
134#define fetch_stack_string NULL
135#define fetch_stack_string_size NULL
136
137#define DEFINE_FETCH_retval(type) \ 95#define DEFINE_FETCH_retval(type) \
138static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\ 96__kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs, \
139 void *dummy, void *dest) \ 97 void *dummy, void *dest) \
140{ \ 98{ \
141 *(type *)dest = (type)regs_return_value(regs); \ 99 *(type *)dest = (type)regs_return_value(regs); \
@@ -145,150 +103,16 @@ DEFINE_BASIC_FETCH_FUNCS(retval)
145#define fetch_retval_string NULL 103#define fetch_retval_string NULL
146#define fetch_retval_string_size NULL 104#define fetch_retval_string_size NULL
147 105
148#define DEFINE_FETCH_memory(type) \
149static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\
150 void *addr, void *dest) \
151{ \
152 type retval; \
153 if (probe_kernel_address(addr, retval)) \
154 *(type *)dest = 0; \
155 else \
156 *(type *)dest = retval; \
157}
158DEFINE_BASIC_FETCH_FUNCS(memory)
159/*
160 * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max
161 * length and relative data location.
162 */
163static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
164 void *addr, void *dest)
165{
166 long ret;
167 int maxlen = get_rloc_len(*(u32 *)dest);
168 u8 *dst = get_rloc_data(dest);
169 u8 *src = addr;
170 mm_segment_t old_fs = get_fs();
171
172 if (!maxlen)
173 return;
174
175 /*
176 * Try to get string again, since the string can be changed while
177 * probing.
178 */
179 set_fs(KERNEL_DS);
180 pagefault_disable();
181
182 do
183 ret = __copy_from_user_inatomic(dst++, src++, 1);
184 while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen);
185
186 dst[-1] = '\0';
187 pagefault_enable();
188 set_fs(old_fs);
189
190 if (ret < 0) { /* Failed to fetch string */
191 ((u8 *)get_rloc_data(dest))[0] = '\0';
192 *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest));
193 } else {
194 *(u32 *)dest = make_data_rloc(src - (u8 *)addr,
195 get_rloc_offs(*(u32 *)dest));
196 }
197}
198
199/* Return the length of string -- including null terminal byte */
200static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,
201 void *addr, void *dest)
202{
203 mm_segment_t old_fs;
204 int ret, len = 0;
205 u8 c;
206
207 old_fs = get_fs();
208 set_fs(KERNEL_DS);
209 pagefault_disable();
210
211 do {
212 ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1);
213 len++;
214 } while (c && ret == 0 && len < MAX_STRING_SIZE);
215
216 pagefault_enable();
217 set_fs(old_fs);
218
219 if (ret < 0) /* Failed to check the length */
220 *(u32 *)dest = 0;
221 else
222 *(u32 *)dest = len;
223}
224
225/* Memory fetching by symbol */
226struct symbol_cache {
227 char *symbol;
228 long offset;
229 unsigned long addr;
230};
231
232static unsigned long update_symbol_cache(struct symbol_cache *sc)
233{
234 sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol);
235
236 if (sc->addr)
237 sc->addr += sc->offset;
238
239 return sc->addr;
240}
241
242static void free_symbol_cache(struct symbol_cache *sc)
243{
244 kfree(sc->symbol);
245 kfree(sc);
246}
247
248static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset)
249{
250 struct symbol_cache *sc;
251
252 if (!sym || strlen(sym) == 0)
253 return NULL;
254
255 sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL);
256 if (!sc)
257 return NULL;
258
259 sc->symbol = kstrdup(sym, GFP_KERNEL);
260 if (!sc->symbol) {
261 kfree(sc);
262 return NULL;
263 }
264 sc->offset = offset;
265 update_symbol_cache(sc);
266
267 return sc;
268}
269
270#define DEFINE_FETCH_symbol(type) \
271static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\
272 void *data, void *dest) \
273{ \
274 struct symbol_cache *sc = data; \
275 if (sc->addr) \
276 fetch_memory_##type(regs, (void *)sc->addr, dest); \
277 else \
278 *(type *)dest = 0; \
279}
280DEFINE_BASIC_FETCH_FUNCS(symbol)
281DEFINE_FETCH_symbol(string)
282DEFINE_FETCH_symbol(string_size)
283
284/* Dereference memory access function */ 106/* Dereference memory access function */
285struct deref_fetch_param { 107struct deref_fetch_param {
286 struct fetch_param orig; 108 struct fetch_param orig;
287 long offset; 109 long offset;
110 fetch_func_t fetch;
111 fetch_func_t fetch_size;
288}; 112};
289 113
290#define DEFINE_FETCH_deref(type) \ 114#define DEFINE_FETCH_deref(type) \
291static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\ 115__kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs, \
292 void *data, void *dest) \ 116 void *data, void *dest) \
293{ \ 117{ \
294 struct deref_fetch_param *dprm = data; \ 118 struct deref_fetch_param *dprm = data; \
@@ -296,13 +120,26 @@ static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\
296 call_fetch(&dprm->orig, regs, &addr); \ 120 call_fetch(&dprm->orig, regs, &addr); \
297 if (addr) { \ 121 if (addr) { \
298 addr += dprm->offset; \ 122 addr += dprm->offset; \
299 fetch_memory_##type(regs, (void *)addr, dest); \ 123 dprm->fetch(regs, (void *)addr, dest); \
300 } else \ 124 } else \
301 *(type *)dest = 0; \ 125 *(type *)dest = 0; \
302} 126}
303DEFINE_BASIC_FETCH_FUNCS(deref) 127DEFINE_BASIC_FETCH_FUNCS(deref)
304DEFINE_FETCH_deref(string) 128DEFINE_FETCH_deref(string)
305DEFINE_FETCH_deref(string_size) 129
130__kprobes void FETCH_FUNC_NAME(deref, string_size)(struct pt_regs *regs,
131 void *data, void *dest)
132{
133 struct deref_fetch_param *dprm = data;
134 unsigned long addr;
135
136 call_fetch(&dprm->orig, regs, &addr);
137 if (addr && dprm->fetch_size) {
138 addr += dprm->offset;
139 dprm->fetch_size(regs, (void *)addr, dest);
140 } else
141 *(string_size *)dest = 0;
142}
306 143
307static __kprobes void update_deref_fetch_param(struct deref_fetch_param *data) 144static __kprobes void update_deref_fetch_param(struct deref_fetch_param *data)
308{ 145{
@@ -329,7 +166,7 @@ struct bitfield_fetch_param {
329}; 166};
330 167
331#define DEFINE_FETCH_bitfield(type) \ 168#define DEFINE_FETCH_bitfield(type) \
332static __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs,\ 169__kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs, \
333 void *data, void *dest) \ 170 void *data, void *dest) \
334{ \ 171{ \
335 struct bitfield_fetch_param *bprm = data; \ 172 struct bitfield_fetch_param *bprm = data; \
@@ -374,58 +211,8 @@ free_bitfield_fetch_param(struct bitfield_fetch_param *data)
374 kfree(data); 211 kfree(data);
375} 212}
376 213
377/* Default (unsigned long) fetch type */ 214static const struct fetch_type *find_fetch_type(const char *type,
378#define __DEFAULT_FETCH_TYPE(t) u##t 215 const struct fetch_type *ftbl)
379#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t)
380#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG)
381#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE)
382
383#define ASSIGN_FETCH_FUNC(method, type) \
384 [FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type)
385
386#define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \
387 {.name = _name, \
388 .size = _size, \
389 .is_signed = sign, \
390 .print = PRINT_TYPE_FUNC_NAME(ptype), \
391 .fmt = PRINT_TYPE_FMT_NAME(ptype), \
392 .fmttype = _fmttype, \
393 .fetch = { \
394ASSIGN_FETCH_FUNC(reg, ftype), \
395ASSIGN_FETCH_FUNC(stack, ftype), \
396ASSIGN_FETCH_FUNC(retval, ftype), \
397ASSIGN_FETCH_FUNC(memory, ftype), \
398ASSIGN_FETCH_FUNC(symbol, ftype), \
399ASSIGN_FETCH_FUNC(deref, ftype), \
400ASSIGN_FETCH_FUNC(bitfield, ftype), \
401 } \
402 }
403
404#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \
405 __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype)
406
407#define FETCH_TYPE_STRING 0
408#define FETCH_TYPE_STRSIZE 1
409
410/* Fetch type information table */
411static const struct fetch_type fetch_type_table[] = {
412 /* Special types */
413 [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string,
414 sizeof(u32), 1, "__data_loc char[]"),
415 [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32,
416 string_size, sizeof(u32), 0, "u32"),
417 /* Basic types */
418 ASSIGN_FETCH_TYPE(u8, u8, 0),
419 ASSIGN_FETCH_TYPE(u16, u16, 0),
420 ASSIGN_FETCH_TYPE(u32, u32, 0),
421 ASSIGN_FETCH_TYPE(u64, u64, 0),
422 ASSIGN_FETCH_TYPE(s8, u8, 1),
423 ASSIGN_FETCH_TYPE(s16, u16, 1),
424 ASSIGN_FETCH_TYPE(s32, u32, 1),
425 ASSIGN_FETCH_TYPE(s64, u64, 1),
426};
427
428static const struct fetch_type *find_fetch_type(const char *type)
429{ 216{
430 int i; 217 int i;
431 218
@@ -446,44 +233,52 @@ static const struct fetch_type *find_fetch_type(const char *type)
446 233
447 switch (bs) { 234 switch (bs) {
448 case 8: 235 case 8:
449 return find_fetch_type("u8"); 236 return find_fetch_type("u8", ftbl);
450 case 16: 237 case 16:
451 return find_fetch_type("u16"); 238 return find_fetch_type("u16", ftbl);
452 case 32: 239 case 32:
453 return find_fetch_type("u32"); 240 return find_fetch_type("u32", ftbl);
454 case 64: 241 case 64:
455 return find_fetch_type("u64"); 242 return find_fetch_type("u64", ftbl);
456 default: 243 default:
457 goto fail; 244 goto fail;
458 } 245 }
459 } 246 }
460 247
461 for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++) 248 for (i = 0; ftbl[i].name; i++) {
462 if (strcmp(type, fetch_type_table[i].name) == 0) 249 if (strcmp(type, ftbl[i].name) == 0)
463 return &fetch_type_table[i]; 250 return &ftbl[i];
251 }
464 252
465fail: 253fail:
466 return NULL; 254 return NULL;
467} 255}
468 256
469/* Special function: only accepts unsigned long */ 257/* Special function: only accepts unsigned long */
470static __kprobes void fetch_stack_address(struct pt_regs *regs, 258static __kprobes void fetch_kernel_stack_address(struct pt_regs *regs,
471 void *dummy, void *dest) 259 void *dummy, void *dest)
472{ 260{
473 *(unsigned long *)dest = kernel_stack_pointer(regs); 261 *(unsigned long *)dest = kernel_stack_pointer(regs);
474} 262}
475 263
264static __kprobes void fetch_user_stack_address(struct pt_regs *regs,
265 void *dummy, void *dest)
266{
267 *(unsigned long *)dest = user_stack_pointer(regs);
268}
269
476static fetch_func_t get_fetch_size_function(const struct fetch_type *type, 270static fetch_func_t get_fetch_size_function(const struct fetch_type *type,
477 fetch_func_t orig_fn) 271 fetch_func_t orig_fn,
272 const struct fetch_type *ftbl)
478{ 273{
479 int i; 274 int i;
480 275
481 if (type != &fetch_type_table[FETCH_TYPE_STRING]) 276 if (type != &ftbl[FETCH_TYPE_STRING])
482 return NULL; /* Only string type needs size function */ 277 return NULL; /* Only string type needs size function */
483 278
484 for (i = 0; i < FETCH_MTD_END; i++) 279 for (i = 0; i < FETCH_MTD_END; i++)
485 if (type->fetch[i] == orig_fn) 280 if (type->fetch[i] == orig_fn)
486 return fetch_type_table[FETCH_TYPE_STRSIZE].fetch[i]; 281 return ftbl[FETCH_TYPE_STRSIZE].fetch[i];
487 282
488 WARN_ON(1); /* This should not happen */ 283 WARN_ON(1); /* This should not happen */
489 284
@@ -516,7 +311,8 @@ int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset)
516#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) 311#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long))
517 312
518static int parse_probe_vars(char *arg, const struct fetch_type *t, 313static int parse_probe_vars(char *arg, const struct fetch_type *t,
519 struct fetch_param *f, bool is_return) 314 struct fetch_param *f, bool is_return,
315 bool is_kprobe)
520{ 316{
521 int ret = 0; 317 int ret = 0;
522 unsigned long param; 318 unsigned long param;
@@ -528,13 +324,16 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t,
528 ret = -EINVAL; 324 ret = -EINVAL;
529 } else if (strncmp(arg, "stack", 5) == 0) { 325 } else if (strncmp(arg, "stack", 5) == 0) {
530 if (arg[5] == '\0') { 326 if (arg[5] == '\0') {
531 if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR) == 0) 327 if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR))
532 f->fn = fetch_stack_address; 328 return -EINVAL;
329
330 if (is_kprobe)
331 f->fn = fetch_kernel_stack_address;
533 else 332 else
534 ret = -EINVAL; 333 f->fn = fetch_user_stack_address;
535 } else if (isdigit(arg[5])) { 334 } else if (isdigit(arg[5])) {
536 ret = kstrtoul(arg + 5, 10, &param); 335 ret = kstrtoul(arg + 5, 10, &param);
537 if (ret || param > PARAM_MAX_STACK) 336 if (ret || (is_kprobe && param > PARAM_MAX_STACK))
538 ret = -EINVAL; 337 ret = -EINVAL;
539 else { 338 else {
540 f->fn = t->fetch[FETCH_MTD_stack]; 339 f->fn = t->fetch[FETCH_MTD_stack];
@@ -552,20 +351,18 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t,
552static int parse_probe_arg(char *arg, const struct fetch_type *t, 351static int parse_probe_arg(char *arg, const struct fetch_type *t,
553 struct fetch_param *f, bool is_return, bool is_kprobe) 352 struct fetch_param *f, bool is_return, bool is_kprobe)
554{ 353{
354 const struct fetch_type *ftbl;
555 unsigned long param; 355 unsigned long param;
556 long offset; 356 long offset;
557 char *tmp; 357 char *tmp;
558 int ret; 358 int ret = 0;
559
560 ret = 0;
561 359
562 /* Until uprobe_events supports only reg arguments */ 360 ftbl = is_kprobe ? kprobes_fetch_type_table : uprobes_fetch_type_table;
563 if (!is_kprobe && arg[0] != '%') 361 BUG_ON(ftbl == NULL);
564 return -EINVAL;
565 362
566 switch (arg[0]) { 363 switch (arg[0]) {
567 case '$': 364 case '$':
568 ret = parse_probe_vars(arg + 1, t, f, is_return); 365 ret = parse_probe_vars(arg + 1, t, f, is_return, is_kprobe);
569 break; 366 break;
570 367
571 case '%': /* named register */ 368 case '%': /* named register */
@@ -577,7 +374,7 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t,
577 } 374 }
578 break; 375 break;
579 376
580 case '@': /* memory or symbol */ 377 case '@': /* memory, file-offset or symbol */
581 if (isdigit(arg[1])) { 378 if (isdigit(arg[1])) {
582 ret = kstrtoul(arg + 1, 0, &param); 379 ret = kstrtoul(arg + 1, 0, &param);
583 if (ret) 380 if (ret)
@@ -585,7 +382,22 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t,
585 382
586 f->fn = t->fetch[FETCH_MTD_memory]; 383 f->fn = t->fetch[FETCH_MTD_memory];
587 f->data = (void *)param; 384 f->data = (void *)param;
385 } else if (arg[1] == '+') {
386 /* kprobes don't support file offsets */
387 if (is_kprobe)
388 return -EINVAL;
389
390 ret = kstrtol(arg + 2, 0, &offset);
391 if (ret)
392 break;
393
394 f->fn = t->fetch[FETCH_MTD_file_offset];
395 f->data = (void *)offset;
588 } else { 396 } else {
397 /* uprobes don't support symbols */
398 if (!is_kprobe)
399 return -EINVAL;
400
589 ret = traceprobe_split_symbol_offset(arg + 1, &offset); 401 ret = traceprobe_split_symbol_offset(arg + 1, &offset);
590 if (ret) 402 if (ret)
591 break; 403 break;
@@ -616,7 +428,7 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t,
616 struct deref_fetch_param *dprm; 428 struct deref_fetch_param *dprm;
617 const struct fetch_type *t2; 429 const struct fetch_type *t2;
618 430
619 t2 = find_fetch_type(NULL); 431 t2 = find_fetch_type(NULL, ftbl);
620 *tmp = '\0'; 432 *tmp = '\0';
621 dprm = kzalloc(sizeof(struct deref_fetch_param), GFP_KERNEL); 433 dprm = kzalloc(sizeof(struct deref_fetch_param), GFP_KERNEL);
622 434
@@ -624,6 +436,9 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t,
624 return -ENOMEM; 436 return -ENOMEM;
625 437
626 dprm->offset = offset; 438 dprm->offset = offset;
439 dprm->fetch = t->fetch[FETCH_MTD_memory];
440 dprm->fetch_size = get_fetch_size_function(t,
441 dprm->fetch, ftbl);
627 ret = parse_probe_arg(arg, t2, &dprm->orig, is_return, 442 ret = parse_probe_arg(arg, t2, &dprm->orig, is_return,
628 is_kprobe); 443 is_kprobe);
629 if (ret) 444 if (ret)
@@ -685,9 +500,13 @@ static int __parse_bitfield_probe_arg(const char *bf,
685int traceprobe_parse_probe_arg(char *arg, ssize_t *size, 500int traceprobe_parse_probe_arg(char *arg, ssize_t *size,
686 struct probe_arg *parg, bool is_return, bool is_kprobe) 501 struct probe_arg *parg, bool is_return, bool is_kprobe)
687{ 502{
503 const struct fetch_type *ftbl;
688 const char *t; 504 const char *t;
689 int ret; 505 int ret;
690 506
507 ftbl = is_kprobe ? kprobes_fetch_type_table : uprobes_fetch_type_table;
508 BUG_ON(ftbl == NULL);
509
691 if (strlen(arg) > MAX_ARGSTR_LEN) { 510 if (strlen(arg) > MAX_ARGSTR_LEN) {
692 pr_info("Argument is too long.: %s\n", arg); 511 pr_info("Argument is too long.: %s\n", arg);
693 return -ENOSPC; 512 return -ENOSPC;
@@ -702,7 +521,7 @@ int traceprobe_parse_probe_arg(char *arg, ssize_t *size,
702 arg[t - parg->comm] = '\0'; 521 arg[t - parg->comm] = '\0';
703 t++; 522 t++;
704 } 523 }
705 parg->type = find_fetch_type(t); 524 parg->type = find_fetch_type(t, ftbl);
706 if (!parg->type) { 525 if (!parg->type) {
707 pr_info("Unsupported type: %s\n", t); 526 pr_info("Unsupported type: %s\n", t);
708 return -EINVAL; 527 return -EINVAL;
@@ -716,7 +535,8 @@ int traceprobe_parse_probe_arg(char *arg, ssize_t *size,
716 535
717 if (ret >= 0) { 536 if (ret >= 0) {
718 parg->fetch_size.fn = get_fetch_size_function(parg->type, 537 parg->fetch_size.fn = get_fetch_size_function(parg->type,
719 parg->fetch.fn); 538 parg->fetch.fn,
539 ftbl);
720 parg->fetch_size.data = parg->fetch.data; 540 parg->fetch_size.data = parg->fetch.data;
721 } 541 }
722 542
@@ -837,3 +657,65 @@ out:
837 657
838 return ret; 658 return ret;
839} 659}
660
661static int __set_print_fmt(struct trace_probe *tp, char *buf, int len,
662 bool is_return)
663{
664 int i;
665 int pos = 0;
666
667 const char *fmt, *arg;
668
669 if (!is_return) {
670 fmt = "(%lx)";
671 arg = "REC->" FIELD_STRING_IP;
672 } else {
673 fmt = "(%lx <- %lx)";
674 arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP;
675 }
676
677 /* When len=0, we just calculate the needed length */
678#define LEN_OR_ZERO (len ? len - pos : 0)
679
680 pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt);
681
682 for (i = 0; i < tp->nr_args; i++) {
683 pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s",
684 tp->args[i].name, tp->args[i].type->fmt);
685 }
686
687 pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg);
688
689 for (i = 0; i < tp->nr_args; i++) {
690 if (strcmp(tp->args[i].type->name, "string") == 0)
691 pos += snprintf(buf + pos, LEN_OR_ZERO,
692 ", __get_str(%s)",
693 tp->args[i].name);
694 else
695 pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s",
696 tp->args[i].name);
697 }
698
699#undef LEN_OR_ZERO
700
701 /* return the length of print_fmt */
702 return pos;
703}
704
705int set_print_fmt(struct trace_probe *tp, bool is_return)
706{
707 int len;
708 char *print_fmt;
709
710 /* First: called with 0 length to calculate the needed length */
711 len = __set_print_fmt(tp, NULL, 0, is_return);
712 print_fmt = kmalloc(len + 1, GFP_KERNEL);
713 if (!print_fmt)
714 return -ENOMEM;
715
716 /* Second: actually write the @print_fmt */
717 __set_print_fmt(tp, print_fmt, len + 1, is_return);
718 tp->call.print_fmt = print_fmt;
719
720 return 0;
721}
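set_print_fmt() relies on a common two-pass pattern: run the formatter once with a zero-length buffer purely to measure (snprintf() returns the size it would have written), allocate exactly that much, then run it again to fill the buffer. A small standalone sketch of the same pattern, with a hypothetical build_fmt() standing in for __set_print_fmt():

#include <stdio.h>
#include <stdlib.h>

/*
 * Hypothetical stand-in for __set_print_fmt(): when len == 0 it only
 * measures, because snprintf() never writes past len but still returns
 * the full length it would have produced.
 */
static int build_fmt(char *buf, int len, const char *name, int nr_args)
{
	int i, pos = 0;

#define LEN_OR_ZERO (len ? len - pos : 0)
	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s:", name);
	for (i = 0; i < nr_args; i++)
		pos += snprintf(buf + pos, LEN_OR_ZERO, " arg%d=%%lx", i + 1);
	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
#undef LEN_OR_ZERO

	return pos;	/* length excluding the trailing NUL */
}

int main(void)
{
	int len = build_fmt(NULL, 0, "myprobe", 3);	/* first pass: measure */
	char *fmt = malloc(len + 1);

	if (!fmt)
		return 1;

	build_fmt(fmt, len + 1, "myprobe", 3);		/* second pass: write */
	printf("%s (%d bytes)\n", fmt, len);
	free(fmt);

	return 0;
}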
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 5c7e09d10d74..b73574a5f429 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -81,6 +81,17 @@
81 */ 81 */
82#define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs)) 82#define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs))
83 83
84static inline void *get_rloc_data(u32 *dl)
85{
86 return (u8 *)dl + get_rloc_offs(*dl);
87}
88
89/* For data_loc conversion */
90static inline void *get_loc_data(u32 *dl, void *ent)
91{
92 return (u8 *)ent + get_rloc_offs(*dl);
93}
94
84/* Data fetch function type */ 95/* Data fetch function type */
85typedef void (*fetch_func_t)(struct pt_regs *, void *, void *); 96typedef void (*fetch_func_t)(struct pt_regs *, void *, void *);
86/* Printing function type */ 97/* Printing function type */
@@ -95,6 +106,7 @@ enum {
95 FETCH_MTD_symbol, 106 FETCH_MTD_symbol,
96 FETCH_MTD_deref, 107 FETCH_MTD_deref,
97 FETCH_MTD_bitfield, 108 FETCH_MTD_bitfield,
109 FETCH_MTD_file_offset,
98 FETCH_MTD_END, 110 FETCH_MTD_END,
99}; 111};
100 112
@@ -115,6 +127,148 @@ struct fetch_param {
115 void *data; 127 void *data;
116}; 128};
117 129
130/* For defining macros, define string/string_size types */
131typedef u32 string;
132typedef u32 string_size;
133
134#define PRINT_TYPE_FUNC_NAME(type) print_type_##type
135#define PRINT_TYPE_FMT_NAME(type) print_type_format_##type
136
137/* Printing in basic type function template */
138#define DECLARE_BASIC_PRINT_TYPE_FUNC(type) \
139__kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \
140 const char *name, \
141 void *data, void *ent); \
142extern const char PRINT_TYPE_FMT_NAME(type)[]
143
144DECLARE_BASIC_PRINT_TYPE_FUNC(u8);
145DECLARE_BASIC_PRINT_TYPE_FUNC(u16);
146DECLARE_BASIC_PRINT_TYPE_FUNC(u32);
147DECLARE_BASIC_PRINT_TYPE_FUNC(u64);
148DECLARE_BASIC_PRINT_TYPE_FUNC(s8);
149DECLARE_BASIC_PRINT_TYPE_FUNC(s16);
150DECLARE_BASIC_PRINT_TYPE_FUNC(s32);
151DECLARE_BASIC_PRINT_TYPE_FUNC(s64);
152DECLARE_BASIC_PRINT_TYPE_FUNC(string);
153
154#define FETCH_FUNC_NAME(method, type) fetch_##method##_##type
155
156/* Declare macro for basic types */
157#define DECLARE_FETCH_FUNC(method, type) \
158extern void FETCH_FUNC_NAME(method, type)(struct pt_regs *regs, \
159 void *data, void *dest)
160
161#define DECLARE_BASIC_FETCH_FUNCS(method) \
162DECLARE_FETCH_FUNC(method, u8); \
163DECLARE_FETCH_FUNC(method, u16); \
164DECLARE_FETCH_FUNC(method, u32); \
165DECLARE_FETCH_FUNC(method, u64)
166
167DECLARE_BASIC_FETCH_FUNCS(reg);
168#define fetch_reg_string NULL
169#define fetch_reg_string_size NULL
170
171DECLARE_BASIC_FETCH_FUNCS(retval);
172#define fetch_retval_string NULL
173#define fetch_retval_string_size NULL
174
175DECLARE_BASIC_FETCH_FUNCS(symbol);
176DECLARE_FETCH_FUNC(symbol, string);
177DECLARE_FETCH_FUNC(symbol, string_size);
178
179DECLARE_BASIC_FETCH_FUNCS(deref);
180DECLARE_FETCH_FUNC(deref, string);
181DECLARE_FETCH_FUNC(deref, string_size);
182
183DECLARE_BASIC_FETCH_FUNCS(bitfield);
184#define fetch_bitfield_string NULL
185#define fetch_bitfield_string_size NULL
186
187/*
188 * Define macro for basic types - we don't need to define s* types, because
189 * we only need to care about the bit width at recording time.
190 */
191#define DEFINE_BASIC_FETCH_FUNCS(method) \
192DEFINE_FETCH_##method(u8) \
193DEFINE_FETCH_##method(u16) \
194DEFINE_FETCH_##method(u32) \
195DEFINE_FETCH_##method(u64)
196
197/* Default (unsigned long) fetch type */
198#define __DEFAULT_FETCH_TYPE(t) u##t
199#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t)
200#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG)
201#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE)
202
203#define ASSIGN_FETCH_FUNC(method, type) \
204 [FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type)
205
206#define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \
207 {.name = _name, \
208 .size = _size, \
209 .is_signed = sign, \
210 .print = PRINT_TYPE_FUNC_NAME(ptype), \
211 .fmt = PRINT_TYPE_FMT_NAME(ptype), \
212 .fmttype = _fmttype, \
213 .fetch = { \
214ASSIGN_FETCH_FUNC(reg, ftype), \
215ASSIGN_FETCH_FUNC(stack, ftype), \
216ASSIGN_FETCH_FUNC(retval, ftype), \
217ASSIGN_FETCH_FUNC(memory, ftype), \
218ASSIGN_FETCH_FUNC(symbol, ftype), \
219ASSIGN_FETCH_FUNC(deref, ftype), \
220ASSIGN_FETCH_FUNC(bitfield, ftype), \
221ASSIGN_FETCH_FUNC(file_offset, ftype), \
222 } \
223 }
224
225#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \
226 __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype)
227
228#define ASSIGN_FETCH_TYPE_END {}
229
230#define FETCH_TYPE_STRING 0
231#define FETCH_TYPE_STRSIZE 1
232
233/*
234 * Fetch type information table.
235 * It's declared as a weak symbol due to conditional compilation.
236 */
237extern __weak const struct fetch_type kprobes_fetch_type_table[];
238extern __weak const struct fetch_type uprobes_fetch_type_table[];
239
240#ifdef CONFIG_KPROBE_EVENT
241struct symbol_cache;
242unsigned long update_symbol_cache(struct symbol_cache *sc);
243void free_symbol_cache(struct symbol_cache *sc);
244struct symbol_cache *alloc_symbol_cache(const char *sym, long offset);
245#else
246/* uprobes do not support symbol fetch methods */
247#define fetch_symbol_u8 NULL
248#define fetch_symbol_u16 NULL
249#define fetch_symbol_u32 NULL
250#define fetch_symbol_u64 NULL
251#define fetch_symbol_string NULL
252#define fetch_symbol_string_size NULL
253
254struct symbol_cache {
255};
256static inline unsigned long __used update_symbol_cache(struct symbol_cache *sc)
257{
258 return 0;
259}
260
261static inline void __used free_symbol_cache(struct symbol_cache *sc)
262{
263}
264
265static inline struct symbol_cache * __used
266alloc_symbol_cache(const char *sym, long offset)
267{
268 return NULL;
269}
270#endif /* CONFIG_KPROBE_EVENT */
271
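The fetch type tables above are weak externs so this header links cleanly whichever of the kprobe/uprobe event options is enabled: a weak symbol that is never defined resolves to NULL instead of causing a link error, which is why the common parser can check the table pointer (and BUG_ON() it) at run time. A tiny userspace illustration of that linker behaviour, assuming GCC/Clang on an ELF target:

#include <stdio.h>

/*
 * Declared but never defined in this program: with a plain extern this
 * would be a link error, with a weak extern the linker resolves it to
 * address 0 (NULL), just like an unbuilt fetch type table.
 */
extern const int missing_table[] __attribute__((weak));

int main(void)
{
	const int *tbl = missing_table;

	if (!tbl)
		printf("weak symbol left undefined, resolved to NULL\n");
	else
		printf("strong definition found at %p\n", (void *)tbl);

	return 0;
}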
118struct probe_arg { 272struct probe_arg {
119 struct fetch_param fetch; 273 struct fetch_param fetch;
120 struct fetch_param fetch_size; 274 struct fetch_param fetch_size;
@@ -124,6 +278,26 @@ struct probe_arg {
124 const struct fetch_type *type; /* Type of this argument */ 278 const struct fetch_type *type; /* Type of this argument */
125}; 279};
126 280
281struct trace_probe {
282 unsigned int flags; /* For TP_FLAG_* */
283 struct ftrace_event_class class;
284 struct ftrace_event_call call;
285 struct list_head files;
286 ssize_t size; /* trace entry size */
287 unsigned int nr_args;
288 struct probe_arg args[];
289};
290
291static inline bool trace_probe_is_enabled(struct trace_probe *tp)
292{
293 return !!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE));
294}
295
296static inline bool trace_probe_is_registered(struct trace_probe *tp)
297{
298 return !!(tp->flags & TP_FLAG_REGISTERED);
299}
300
127static inline __kprobes void call_fetch(struct fetch_param *fprm, 301static inline __kprobes void call_fetch(struct fetch_param *fprm,
128 struct pt_regs *regs, void *dest) 302 struct pt_regs *regs, void *dest)
129{ 303{
@@ -158,3 +332,53 @@ extern ssize_t traceprobe_probes_write(struct file *file,
158 int (*createfn)(int, char**)); 332 int (*createfn)(int, char**));
159 333
160extern int traceprobe_command(const char *buf, int (*createfn)(int, char**)); 334extern int traceprobe_command(const char *buf, int (*createfn)(int, char**));
335
336/* Sum up the total data length for dynamic arrays (strings) */
337static inline __kprobes int
338__get_data_size(struct trace_probe *tp, struct pt_regs *regs)
339{
340 int i, ret = 0;
341 u32 len;
342
343 for (i = 0; i < tp->nr_args; i++)
344 if (unlikely(tp->args[i].fetch_size.fn)) {
345 call_fetch(&tp->args[i].fetch_size, regs, &len);
346 ret += len;
347 }
348
349 return ret;
350}
351
352/* Store the value of each argument */
353static inline __kprobes void
354store_trace_args(int ent_size, struct trace_probe *tp, struct pt_regs *regs,
355 u8 *data, int maxlen)
356{
357 int i;
358 u32 end = tp->size;
359 u32 *dl; /* Data (relative) location */
360
361 for (i = 0; i < tp->nr_args; i++) {
362 if (unlikely(tp->args[i].fetch_size.fn)) {
363 /*
364 * First, we set the maximum data length and the
365 * relative data location in *dl
366 */
367 dl = (u32 *)(data + tp->args[i].offset);
368 *dl = make_data_rloc(maxlen, end - tp->args[i].offset);
369 /* Then try to fetch string or dynamic array data */
370 call_fetch(&tp->args[i].fetch, regs, dl);
371 /* Reduce maximum length */
372 end += get_rloc_len(*dl);
373 maxlen -= get_rloc_len(*dl);
374 /* The trick here: convert data_rloc to data_loc */
375 *dl = convert_rloc_to_loc(*dl,
376 ent_size + tp->args[i].offset);
377 } else
378 /* Just fetching data normally */
379 call_fetch(&tp->args[i].fetch, regs,
380 data + tp->args[i].offset);
381 }
382}
383
384extern int set_print_fmt(struct trace_probe *tp, bool is_return);
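store_trace_args() lays each record out as the fixed argument block (tp->size bytes) followed by the dynamic string data, with every string argument holding only a u32 data_loc carrying the length and the offset from the start of the entry. A hedged sketch of how a consumer resolves such a record, reusing the get_loc_data()/get_rloc_len() conventions above; the sample entry layout and all values are made up for illustration:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define get_rloc_len(dl)	((uint32_t)(dl) >> 16)
#define get_rloc_offs(dl)	((uint32_t)(dl) & 0xffff)
/* data_loc offsets are relative to the start of the trace entry */
#define get_loc_data(dl, ent)	((char *)(ent) + get_rloc_offs(*(dl)))

/* Made-up entry: one fixed unsigned long plus one dynamic string */
struct sample_entry {
	unsigned long	ip;		/* fixed field */
	uint32_t	filename;	/* data_loc of the string below */
	char		dyn[32];	/* dynamic area after the fixed part */
};

int main(void)
{
	struct sample_entry e;
	const char *s = "/etc/passwd";
	uint32_t offs = offsetof(struct sample_entry, dyn);
	uint32_t len = strlen(s) + 1;

	/* Producer side: copy the string, record where it landed. */
	e.ip = 0x400570UL;			/* made-up address */
	memcpy(e.dyn, s, len);
	e.filename = (len << 16) | offs;

	/* Consumer side: resolve the data_loc back into a pointer. */
	printf("ip=%#lx filename=%s (len %u)\n", e.ip,
	       get_loc_data(&e.filename, &e), get_rloc_len(e.filename));

	return 0;
}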
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index fee77e15d815..6e32635e5e57 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -16,6 +16,7 @@
16#include <linux/uaccess.h> 16#include <linux/uaccess.h>
17#include <linux/ftrace.h> 17#include <linux/ftrace.h>
18#include <linux/sched/rt.h> 18#include <linux/sched/rt.h>
19#include <linux/sched/deadline.h>
19#include <trace/events/sched.h> 20#include <trace/events/sched.h>
20#include "trace.h" 21#include "trace.h"
21 22
@@ -27,6 +28,8 @@ static int wakeup_cpu;
27static int wakeup_current_cpu; 28static int wakeup_current_cpu;
28static unsigned wakeup_prio = -1; 29static unsigned wakeup_prio = -1;
29static int wakeup_rt; 30static int wakeup_rt;
31static int wakeup_dl;
32static int tracing_dl = 0;
30 33
31static arch_spinlock_t wakeup_lock = 34static arch_spinlock_t wakeup_lock =
32 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; 35 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
@@ -437,6 +440,7 @@ static void __wakeup_reset(struct trace_array *tr)
437{ 440{
438 wakeup_cpu = -1; 441 wakeup_cpu = -1;
439 wakeup_prio = -1; 442 wakeup_prio = -1;
443 tracing_dl = 0;
440 444
441 if (wakeup_task) 445 if (wakeup_task)
442 put_task_struct(wakeup_task); 446 put_task_struct(wakeup_task);
@@ -472,9 +476,17 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)
472 tracing_record_cmdline(p); 476 tracing_record_cmdline(p);
473 tracing_record_cmdline(current); 477 tracing_record_cmdline(current);
474 478
475 if ((wakeup_rt && !rt_task(p)) || 479 /*
476 p->prio >= wakeup_prio || 480 * The semantics are as follows:
477 p->prio >= current->prio) 481 * - wakeup tracer handles all tasks in the system, independently
482 * of their scheduling class;
483 * - wakeup_rt tracer handles tasks belonging to sched_dl and
484 * sched_rt class;
485 * - wakeup_dl handles tasks belonging to sched_dl class only.
486 */
487 if (tracing_dl || (wakeup_dl && !dl_task(p)) ||
488 (wakeup_rt && !dl_task(p) && !rt_task(p)) ||
489 (!dl_task(p) && (p->prio >= wakeup_prio || p->prio >= current->prio)))
478 return; 490 return;
479 491
480 pc = preempt_count(); 492 pc = preempt_count();
@@ -486,7 +498,8 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)
486 arch_spin_lock(&wakeup_lock); 498 arch_spin_lock(&wakeup_lock);
487 499
488 /* check for races. */ 500 /* check for races. */
489 if (!tracer_enabled || p->prio >= wakeup_prio) 501 if (!tracer_enabled || tracing_dl ||
502 (!dl_task(p) && p->prio >= wakeup_prio))
490 goto out_locked; 503 goto out_locked;
491 504
492 /* reset the trace */ 505 /* reset the trace */
@@ -496,6 +509,15 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)
496 wakeup_current_cpu = wakeup_cpu; 509 wakeup_current_cpu = wakeup_cpu;
497 wakeup_prio = p->prio; 510 wakeup_prio = p->prio;
498 511
512 /*
513 * Once you start tracing a -deadline task, don't bother tracing
514 * another task until the first one wakes up.
515 */
516 if (dl_task(p))
517 tracing_dl = 1;
518 else
519 tracing_dl = 0;
520
499 wakeup_task = p; 521 wakeup_task = p;
500 get_task_struct(wakeup_task); 522 get_task_struct(wakeup_task);
501 523
@@ -597,16 +619,25 @@ static int __wakeup_tracer_init(struct trace_array *tr)
597 619
598static int wakeup_tracer_init(struct trace_array *tr) 620static int wakeup_tracer_init(struct trace_array *tr)
599{ 621{
622 wakeup_dl = 0;
600 wakeup_rt = 0; 623 wakeup_rt = 0;
601 return __wakeup_tracer_init(tr); 624 return __wakeup_tracer_init(tr);
602} 625}
603 626
604static int wakeup_rt_tracer_init(struct trace_array *tr) 627static int wakeup_rt_tracer_init(struct trace_array *tr)
605{ 628{
629 wakeup_dl = 0;
606 wakeup_rt = 1; 630 wakeup_rt = 1;
607 return __wakeup_tracer_init(tr); 631 return __wakeup_tracer_init(tr);
608} 632}
609 633
634static int wakeup_dl_tracer_init(struct trace_array *tr)
635{
636 wakeup_dl = 1;
637 wakeup_rt = 0;
638 return __wakeup_tracer_init(tr);
639}
640
610static void wakeup_tracer_reset(struct trace_array *tr) 641static void wakeup_tracer_reset(struct trace_array *tr)
611{ 642{
612 int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT; 643 int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT;
@@ -674,6 +705,28 @@ static struct tracer wakeup_rt_tracer __read_mostly =
674 .use_max_tr = true, 705 .use_max_tr = true,
675}; 706};
676 707
708static struct tracer wakeup_dl_tracer __read_mostly =
709{
710 .name = "wakeup_dl",
711 .init = wakeup_dl_tracer_init,
712 .reset = wakeup_tracer_reset,
713 .start = wakeup_tracer_start,
714 .stop = wakeup_tracer_stop,
715 .wait_pipe = poll_wait_pipe,
716 .print_max = true,
717 .print_header = wakeup_print_header,
718 .print_line = wakeup_print_line,
719 .flags = &tracer_flags,
720 .set_flag = wakeup_set_flag,
721 .flag_changed = wakeup_flag_changed,
722#ifdef CONFIG_FTRACE_SELFTEST
723 .selftest = trace_selftest_startup_wakeup,
724#endif
725 .open = wakeup_trace_open,
726 .close = wakeup_trace_close,
727 .use_max_tr = true,
728};
729
677__init static int init_wakeup_tracer(void) 730__init static int init_wakeup_tracer(void)
678{ 731{
679 int ret; 732 int ret;
@@ -686,6 +739,10 @@ __init static int init_wakeup_tracer(void)
686 if (ret) 739 if (ret)
687 return ret; 740 return ret;
688 741
742 ret = register_tracer(&wakeup_dl_tracer);
743 if (ret)
744 return ret;
745
689 return 0; 746 return 0;
690} 747}
691core_initcall(init_wakeup_tracer); 748core_initcall(init_wakeup_tracer);
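The comment in probe_wakeup() describes three nested tracer modes: wakeup traces every task, wakeup_rt only -rt and -deadline tasks, and the new wakeup_dl only -deadline tasks, latching onto the first one it sees. A simplified restatement of that skip-this-wakeup predicate as a standalone helper; the trimmed struct and the dl_task()/rt_task() stand-ins are assumptions of the sketch, not the kernel's real types:

#include <stdbool.h>
#include <stdio.h>

struct task {			/* trimmed stand-in for task_struct */
	int	prio;
	bool	is_dl;
	bool	is_rt;
};

static bool dl_task(const struct task *p) { return p->is_dl; }
static bool rt_task(const struct task *p) { return p->is_rt; }

/*
 * True when this wakeup should be ignored, mirroring the check in
 * probe_wakeup(): wakeup_dl only accepts -deadline tasks, wakeup_rt
 * accepts -deadline and -rt tasks, and plain wakeup falls back to the
 * priority comparison for everything else.
 */
static bool skip_wakeup(const struct task *p, const struct task *curr,
			bool wakeup_dl, bool wakeup_rt, bool tracing_dl,
			int wakeup_prio)
{
	return tracing_dl ||
	       (wakeup_dl && !dl_task(p)) ||
	       (wakeup_rt && !dl_task(p) && !rt_task(p)) ||
	       (!dl_task(p) &&
		(p->prio >= wakeup_prio || p->prio >= curr->prio));
}

int main(void)
{
	struct task dl = { .prio = -1, .is_dl = true };
	struct task cur = { .prio = 120 };

	printf("trace -deadline wakeup: %s\n",
	       skip_wakeup(&dl, &cur, true, false, false, 120) ? "no" : "yes");

	return 0;
}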
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index a7329b7902f8..e98fca60974f 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -1022,11 +1022,16 @@ trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr)
1022#ifdef CONFIG_SCHED_TRACER 1022#ifdef CONFIG_SCHED_TRACER
1023static int trace_wakeup_test_thread(void *data) 1023static int trace_wakeup_test_thread(void *data)
1024{ 1024{
1025 /* Make this a RT thread, doesn't need to be too high */ 1025 /* Make this a -deadline thread */
1026 static const struct sched_param param = { .sched_priority = 5 }; 1026 static const struct sched_attr attr = {
1027 .sched_policy = SCHED_DEADLINE,
1028 .sched_runtime = 100000ULL,
1029 .sched_deadline = 10000000ULL,
1030 .sched_period = 10000000ULL
1031 };
1027 struct completion *x = data; 1032 struct completion *x = data;
1028 1033
1029 sched_setscheduler(current, SCHED_FIFO, &param); 1034 sched_setattr(current, &attr);
1030 1035
1031 /* Make it know we have a new prio */ 1036 /* Make it know we have a new prio */
1032 complete(x); 1037 complete(x);
@@ -1040,8 +1045,8 @@ static int trace_wakeup_test_thread(void *data)
1040 /* we are awake, now wait to disappear */ 1045 /* we are awake, now wait to disappear */
1041 while (!kthread_should_stop()) { 1046 while (!kthread_should_stop()) {
1042 /* 1047 /*
1043 * This is an RT task, do short sleeps to let 1048 * This will likely be the system's top-priority
1044 * others run. 1049 * task, do short sleeps to let others run.
1045 */ 1050 */
1046 msleep(100); 1051 msleep(100);
1047 } 1052 }
@@ -1054,21 +1059,21 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
1054{ 1059{
1055 unsigned long save_max = tracing_max_latency; 1060 unsigned long save_max = tracing_max_latency;
1056 struct task_struct *p; 1061 struct task_struct *p;
1057 struct completion isrt; 1062 struct completion is_ready;
1058 unsigned long count; 1063 unsigned long count;
1059 int ret; 1064 int ret;
1060 1065
1061 init_completion(&isrt); 1066 init_completion(&is_ready);
1062 1067
1063 /* create a high prio thread */ 1068 /* create a -deadline thread */
1064 p = kthread_run(trace_wakeup_test_thread, &isrt, "ftrace-test"); 1069 p = kthread_run(trace_wakeup_test_thread, &is_ready, "ftrace-test");
1065 if (IS_ERR(p)) { 1070 if (IS_ERR(p)) {
1066 printk(KERN_CONT "Failed to create ftrace wakeup test thread "); 1071 printk(KERN_CONT "Failed to create ftrace wakeup test thread ");
1067 return -1; 1072 return -1;
1068 } 1073 }
1069 1074
1070 /* make sure the thread is running at an RT prio */ 1075 /* make sure the thread is running at -deadline policy */
1071 wait_for_completion(&isrt); 1076 wait_for_completion(&is_ready);
1072 1077
1073 /* start the tracing */ 1078 /* start the tracing */
1074 ret = tracer_init(trace, tr); 1079 ret = tracer_init(trace, tr);
@@ -1082,19 +1087,19 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
1082 1087
1083 while (p->on_rq) { 1088 while (p->on_rq) {
1084 /* 1089 /*
1085 * Sleep to make sure the RT thread is asleep too. 1090 * Sleep to make sure the -deadline thread is asleep too.
1086 * On virtual machines we can't rely on timings, 1091 * On virtual machines we can't rely on timings,
1087 * but we want to make sure this test still works. 1092 * but we want to make sure this test still works.
1088 */ 1093 */
1089 msleep(100); 1094 msleep(100);
1090 } 1095 }
1091 1096
1092 init_completion(&isrt); 1097 init_completion(&is_ready);
1093 1098
1094 wake_up_process(p); 1099 wake_up_process(p);
1095 1100
1096 /* Wait for the task to wake up */ 1101 /* Wait for the task to wake up */
1097 wait_for_completion(&isrt); 1102 wait_for_completion(&is_ready);
1098 1103
1099 /* stop the tracing. */ 1104 /* stop the tracing. */
1100 tracing_stop(); 1105 tracing_stop();
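The selftest switches its kthread to SCHED_DEADLINE with the in-kernel sched_setattr(); the same policy can be requested from userspace, which is the easiest way to exercise the new wakeup_dl tracer by hand. A hedged sketch follows: there is no glibc wrapper for sched_setattr() at this point, so the struct sched_attr layout and the x86_64 syscall number used here are assumptions to be checked against your kernel headers.

#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef SCHED_DEADLINE
#define SCHED_DEADLINE	6
#endif
#ifndef __NR_sched_setattr
#define __NR_sched_setattr 314	/* x86_64; an assumption, check unistd.h */
#endif

struct sched_attr {		/* assumed layout of the new syscall ABI */
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	uint64_t sched_runtime;	/* runtime/deadline/period in nanoseconds */
	uint64_t sched_deadline;
	uint64_t sched_period;
};

int main(void)
{
	struct sched_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.sched_policy = SCHED_DEADLINE;
	attr.sched_runtime = 100000ULL;		/* same numbers as the selftest */
	attr.sched_deadline = 10000000ULL;
	attr.sched_period = 10000000ULL;

	if (syscall(__NR_sched_setattr, 0, &attr, 0)) {
		perror("sched_setattr");
		return 1;
	}
	puts("running as a -deadline task");
	return 0;
}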
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index b20428c5efe2..e6be585cf06a 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -382,7 +382,7 @@ static const struct file_operations stack_trace_filter_fops = {
382 .open = stack_trace_filter_open, 382 .open = stack_trace_filter_open,
383 .read = seq_read, 383 .read = seq_read,
384 .write = ftrace_filter_write, 384 .write = ftrace_filter_write,
385 .llseek = ftrace_filter_lseek, 385 .llseek = tracing_lseek,
386 .release = ftrace_regex_release, 386 .release = ftrace_regex_release,
387}; 387};
388 388
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index ea90eb5f6f17..759d5e004517 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -321,7 +321,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
321 if (!ftrace_file) 321 if (!ftrace_file)
322 return; 322 return;
323 323
324 if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &ftrace_file->flags)) 324 if (ftrace_trigger_soft_disabled(ftrace_file))
325 return; 325 return;
326 326
327 sys_data = syscall_nr_to_meta(syscall_nr); 327 sys_data = syscall_nr_to_meta(syscall_nr);
@@ -343,9 +343,8 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
343 entry->nr = syscall_nr; 343 entry->nr = syscall_nr;
344 syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args); 344 syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args);
345 345
346 if (!filter_check_discard(ftrace_file, entry, buffer, event)) 346 event_trigger_unlock_commit(ftrace_file, buffer, event, entry,
347 trace_current_buffer_unlock_commit(buffer, event, 347 irq_flags, pc);
348 irq_flags, pc);
349} 348}
350 349
351static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) 350static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
@@ -369,7 +368,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
369 if (!ftrace_file) 368 if (!ftrace_file)
370 return; 369 return;
371 370
372 if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &ftrace_file->flags)) 371 if (ftrace_trigger_soft_disabled(ftrace_file))
373 return; 372 return;
374 373
375 sys_data = syscall_nr_to_meta(syscall_nr); 374 sys_data = syscall_nr_to_meta(syscall_nr);
@@ -390,9 +389,8 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
390 entry->nr = syscall_nr; 389 entry->nr = syscall_nr;
391 entry->ret = syscall_get_return_value(current, regs); 390 entry->ret = syscall_get_return_value(current, regs);
392 391
393 if (!filter_check_discard(ftrace_file, entry, buffer, event)) 392 event_trigger_unlock_commit(ftrace_file, buffer, event, entry,
394 trace_current_buffer_unlock_commit(buffer, event, 393 irq_flags, pc);
395 irq_flags, pc);
396} 394}
397 395
398static int reg_event_syscall_enter(struct ftrace_event_file *file, 396static int reg_event_syscall_enter(struct ftrace_event_file *file,
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index b6dcc42ef7f5..79e52d93860b 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -51,22 +51,17 @@ struct trace_uprobe_filter {
51 */ 51 */
52struct trace_uprobe { 52struct trace_uprobe {
53 struct list_head list; 53 struct list_head list;
54 struct ftrace_event_class class;
55 struct ftrace_event_call call;
56 struct trace_uprobe_filter filter; 54 struct trace_uprobe_filter filter;
57 struct uprobe_consumer consumer; 55 struct uprobe_consumer consumer;
58 struct inode *inode; 56 struct inode *inode;
59 char *filename; 57 char *filename;
60 unsigned long offset; 58 unsigned long offset;
61 unsigned long nhit; 59 unsigned long nhit;
62 unsigned int flags; /* For TP_FLAG_* */ 60 struct trace_probe tp;
63 ssize_t size; /* trace entry size */
64 unsigned int nr_args;
65 struct probe_arg args[];
66}; 61};
67 62
68#define SIZEOF_TRACE_UPROBE(n) \ 63#define SIZEOF_TRACE_UPROBE(n) \
69 (offsetof(struct trace_uprobe, args) + \ 64 (offsetof(struct trace_uprobe, tp.args) + \
70 (sizeof(struct probe_arg) * (n))) 65 (sizeof(struct probe_arg) * (n)))
71 66
72static int register_uprobe_event(struct trace_uprobe *tu); 67static int register_uprobe_event(struct trace_uprobe *tu);
@@ -75,10 +70,151 @@ static int unregister_uprobe_event(struct trace_uprobe *tu);
75static DEFINE_MUTEX(uprobe_lock); 70static DEFINE_MUTEX(uprobe_lock);
76static LIST_HEAD(uprobe_list); 71static LIST_HEAD(uprobe_list);
77 72
73struct uprobe_dispatch_data {
74 struct trace_uprobe *tu;
75 unsigned long bp_addr;
76};
77
78static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs); 78static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs);
79static int uretprobe_dispatcher(struct uprobe_consumer *con, 79static int uretprobe_dispatcher(struct uprobe_consumer *con,
80 unsigned long func, struct pt_regs *regs); 80 unsigned long func, struct pt_regs *regs);
81 81
82#ifdef CONFIG_STACK_GROWSUP
83static unsigned long adjust_stack_addr(unsigned long addr, unsigned int n)
84{
85 return addr - (n * sizeof(long));
86}
87#else
88static unsigned long adjust_stack_addr(unsigned long addr, unsigned int n)
89{
90 return addr + (n * sizeof(long));
91}
92#endif
93
94static unsigned long get_user_stack_nth(struct pt_regs *regs, unsigned int n)
95{
96 unsigned long ret;
97 unsigned long addr = user_stack_pointer(regs);
98
99 addr = adjust_stack_addr(addr, n);
100
101 if (copy_from_user(&ret, (void __force __user *) addr, sizeof(ret)))
102 return 0;
103
104 return ret;
105}
106
107/*
108 * Uprobes-specific fetch functions
109 */
110#define DEFINE_FETCH_stack(type) \
111static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\
112 void *offset, void *dest) \
113{ \
114 *(type *)dest = (type)get_user_stack_nth(regs, \
115 ((unsigned long)offset)); \
116}
117DEFINE_BASIC_FETCH_FUNCS(stack)
118/* No string on the stack entry */
119#define fetch_stack_string NULL
120#define fetch_stack_string_size NULL
121
122#define DEFINE_FETCH_memory(type) \
123static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\
124 void *addr, void *dest) \
125{ \
126 type retval; \
127 void __user *vaddr = (void __force __user *) addr; \
128 \
129 if (copy_from_user(&retval, vaddr, sizeof(type))) \
130 *(type *)dest = 0; \
131 else \
132 *(type *) dest = retval; \
133}
134DEFINE_BASIC_FETCH_FUNCS(memory)
135/*
136 * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max
137 * length and relative data location.
138 */
139static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
140 void *addr, void *dest)
141{
142 long ret;
143 u32 rloc = *(u32 *)dest;
144 int maxlen = get_rloc_len(rloc);
145 u8 *dst = get_rloc_data(dest);
146 void __user *src = (void __force __user *) addr;
147
148 if (!maxlen)
149 return;
150
151 ret = strncpy_from_user(dst, src, maxlen);
152
153 if (ret < 0) { /* Failed to fetch string */
154 ((u8 *)get_rloc_data(dest))[0] = '\0';
155 *(u32 *)dest = make_data_rloc(0, get_rloc_offs(rloc));
156 } else {
157 *(u32 *)dest = make_data_rloc(ret, get_rloc_offs(rloc));
158 }
159}
160
161static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,
162 void *addr, void *dest)
163{
164 int len;
165 void __user *vaddr = (void __force __user *) addr;
166
167 len = strnlen_user(vaddr, MAX_STRING_SIZE);
168
169 if (len == 0 || len > MAX_STRING_SIZE) /* Failed to check length */
170 *(u32 *)dest = 0;
171 else
172 *(u32 *)dest = len;
173}
174
175static unsigned long translate_user_vaddr(void *file_offset)
176{
177 unsigned long base_addr;
178 struct uprobe_dispatch_data *udd;
179
180 udd = (void *) current->utask->vaddr;
181
182 base_addr = udd->bp_addr - udd->tu->offset;
183 return base_addr + (unsigned long)file_offset;
184}
185
186#define DEFINE_FETCH_file_offset(type) \
187static __kprobes void FETCH_FUNC_NAME(file_offset, type)(struct pt_regs *regs,\
188 void *offset, void *dest) \
189{ \
190 void *vaddr = (void *)translate_user_vaddr(offset); \
191 \
192 FETCH_FUNC_NAME(memory, type)(regs, vaddr, dest); \
193}
194DEFINE_BASIC_FETCH_FUNCS(file_offset)
195DEFINE_FETCH_file_offset(string)
196DEFINE_FETCH_file_offset(string_size)
197
198/* Fetch type information table */
199const struct fetch_type uprobes_fetch_type_table[] = {
200 /* Special types */
201 [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string,
202 sizeof(u32), 1, "__data_loc char[]"),
203 [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32,
204 string_size, sizeof(u32), 0, "u32"),
205 /* Basic types */
206 ASSIGN_FETCH_TYPE(u8, u8, 0),
207 ASSIGN_FETCH_TYPE(u16, u16, 0),
208 ASSIGN_FETCH_TYPE(u32, u32, 0),
209 ASSIGN_FETCH_TYPE(u64, u64, 0),
210 ASSIGN_FETCH_TYPE(s8, u8, 1),
211 ASSIGN_FETCH_TYPE(s16, u16, 1),
212 ASSIGN_FETCH_TYPE(s32, u32, 1),
213 ASSIGN_FETCH_TYPE(s64, u64, 1),
214
215 ASSIGN_FETCH_TYPE_END
216};
217
82static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter) 218static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter)
83{ 219{
84 rwlock_init(&filter->rwlock); 220 rwlock_init(&filter->rwlock);
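The new @+OFFSET fetch argument stores a file offset, and translate_user_vaddr() resolves it at probe-hit time by recovering the mapping base from the breakpoint address: base = bp_addr - probe_offset, so the requested data lives at base + file_offset. A trivial standalone restatement of that arithmetic, with made-up addresses:

#include <stdio.h>

/*
 * Same arithmetic as translate_user_vaddr(): the probe fired at bp_addr,
 * which by construction is load_base + probe_offset, so any other file
 * offset in the same mapping sits at load_base + file_offset.
 */
static unsigned long translate(unsigned long bp_addr,
			       unsigned long probe_offset,
			       unsigned long file_offset)
{
	unsigned long base = bp_addr - probe_offset;

	return base + file_offset;
}

int main(void)
{
	/* All three numbers below are made up for illustration. */
	unsigned long vaddr = translate(0x400570UL,	/* bp_addr */
					0x570UL,	/* probe offset */
					0x1010UL);	/* @+0x1010 */

	printf("@+0x1010 resolves to %#lx\n", vaddr);
	return 0;
}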
@@ -114,13 +250,13 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret)
114 if (!tu) 250 if (!tu)
115 return ERR_PTR(-ENOMEM); 251 return ERR_PTR(-ENOMEM);
116 252
117 tu->call.class = &tu->class; 253 tu->tp.call.class = &tu->tp.class;
118 tu->call.name = kstrdup(event, GFP_KERNEL); 254 tu->tp.call.name = kstrdup(event, GFP_KERNEL);
119 if (!tu->call.name) 255 if (!tu->tp.call.name)
120 goto error; 256 goto error;
121 257
122 tu->class.system = kstrdup(group, GFP_KERNEL); 258 tu->tp.class.system = kstrdup(group, GFP_KERNEL);
123 if (!tu->class.system) 259 if (!tu->tp.class.system)
124 goto error; 260 goto error;
125 261
126 INIT_LIST_HEAD(&tu->list); 262 INIT_LIST_HEAD(&tu->list);
@@ -128,11 +264,11 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret)
128 if (is_ret) 264 if (is_ret)
129 tu->consumer.ret_handler = uretprobe_dispatcher; 265 tu->consumer.ret_handler = uretprobe_dispatcher;
130 init_trace_uprobe_filter(&tu->filter); 266 init_trace_uprobe_filter(&tu->filter);
131 tu->call.flags |= TRACE_EVENT_FL_USE_CALL_FILTER; 267 tu->tp.call.flags |= TRACE_EVENT_FL_USE_CALL_FILTER;
132 return tu; 268 return tu;
133 269
134error: 270error:
135 kfree(tu->call.name); 271 kfree(tu->tp.call.name);
136 kfree(tu); 272 kfree(tu);
137 273
138 return ERR_PTR(-ENOMEM); 274 return ERR_PTR(-ENOMEM);
@@ -142,12 +278,12 @@ static void free_trace_uprobe(struct trace_uprobe *tu)
142{ 278{
143 int i; 279 int i;
144 280
145 for (i = 0; i < tu->nr_args; i++) 281 for (i = 0; i < tu->tp.nr_args; i++)
146 traceprobe_free_probe_arg(&tu->args[i]); 282 traceprobe_free_probe_arg(&tu->tp.args[i]);
147 283
148 iput(tu->inode); 284 iput(tu->inode);
149 kfree(tu->call.class->system); 285 kfree(tu->tp.call.class->system);
150 kfree(tu->call.name); 286 kfree(tu->tp.call.name);
151 kfree(tu->filename); 287 kfree(tu->filename);
152 kfree(tu); 288 kfree(tu);
153} 289}
@@ -157,8 +293,8 @@ static struct trace_uprobe *find_probe_event(const char *event, const char *grou
157 struct trace_uprobe *tu; 293 struct trace_uprobe *tu;
158 294
159 list_for_each_entry(tu, &uprobe_list, list) 295 list_for_each_entry(tu, &uprobe_list, list)
160 if (strcmp(tu->call.name, event) == 0 && 296 if (strcmp(tu->tp.call.name, event) == 0 &&
161 strcmp(tu->call.class->system, group) == 0) 297 strcmp(tu->tp.call.class->system, group) == 0)
162 return tu; 298 return tu;
163 299
164 return NULL; 300 return NULL;
@@ -181,16 +317,16 @@ static int unregister_trace_uprobe(struct trace_uprobe *tu)
181/* Register a trace_uprobe and probe_event */ 317/* Register a trace_uprobe and probe_event */
182static int register_trace_uprobe(struct trace_uprobe *tu) 318static int register_trace_uprobe(struct trace_uprobe *tu)
183{ 319{
184 struct trace_uprobe *old_tp; 320 struct trace_uprobe *old_tu;
185 int ret; 321 int ret;
186 322
187 mutex_lock(&uprobe_lock); 323 mutex_lock(&uprobe_lock);
188 324
189 /* register as an event */ 325 /* register as an event */
190 old_tp = find_probe_event(tu->call.name, tu->call.class->system); 326 old_tu = find_probe_event(tu->tp.call.name, tu->tp.call.class->system);
191 if (old_tp) { 327 if (old_tu) {
192 /* delete old event */ 328 /* delete old event */
193 ret = unregister_trace_uprobe(old_tp); 329 ret = unregister_trace_uprobe(old_tu);
194 if (ret) 330 if (ret)
195 goto end; 331 goto end;
196 } 332 }
@@ -211,7 +347,7 @@ end:
211 347
212/* 348/*
213 * Argument syntax: 349 * Argument syntax:
214 * - Add uprobe: p|r[:[GRP/]EVENT] PATH:SYMBOL [FETCHARGS] 350 * - Add uprobe: p|r[:[GRP/]EVENT] PATH:OFFSET [FETCHARGS]
215 * 351 *
216 * - Remove uprobe: -:[GRP/]EVENT 352 * - Remove uprobe: -:[GRP/]EVENT
217 */ 353 */
@@ -360,34 +496,36 @@ static int create_trace_uprobe(int argc, char **argv)
360 /* parse arguments */ 496 /* parse arguments */
361 ret = 0; 497 ret = 0;
362 for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { 498 for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
499 struct probe_arg *parg = &tu->tp.args[i];
500
363 /* Increment count for freeing args in error case */ 501 /* Increment count for freeing args in error case */
364 tu->nr_args++; 502 tu->tp.nr_args++;
365 503
366 /* Parse argument name */ 504 /* Parse argument name */
367 arg = strchr(argv[i], '='); 505 arg = strchr(argv[i], '=');
368 if (arg) { 506 if (arg) {
369 *arg++ = '\0'; 507 *arg++ = '\0';
370 tu->args[i].name = kstrdup(argv[i], GFP_KERNEL); 508 parg->name = kstrdup(argv[i], GFP_KERNEL);
371 } else { 509 } else {
372 arg = argv[i]; 510 arg = argv[i];
373 /* If argument name is omitted, set "argN" */ 511 /* If argument name is omitted, set "argN" */
374 snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1); 512 snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1);
375 tu->args[i].name = kstrdup(buf, GFP_KERNEL); 513 parg->name = kstrdup(buf, GFP_KERNEL);
376 } 514 }
377 515
378 if (!tu->args[i].name) { 516 if (!parg->name) {
379 pr_info("Failed to allocate argument[%d] name.\n", i); 517 pr_info("Failed to allocate argument[%d] name.\n", i);
380 ret = -ENOMEM; 518 ret = -ENOMEM;
381 goto error; 519 goto error;
382 } 520 }
383 521
384 if (!is_good_name(tu->args[i].name)) { 522 if (!is_good_name(parg->name)) {
385 pr_info("Invalid argument[%d] name: %s\n", i, tu->args[i].name); 523 pr_info("Invalid argument[%d] name: %s\n", i, parg->name);
386 ret = -EINVAL; 524 ret = -EINVAL;
387 goto error; 525 goto error;
388 } 526 }
389 527
390 if (traceprobe_conflict_field_name(tu->args[i].name, tu->args, i)) { 528 if (traceprobe_conflict_field_name(parg->name, tu->tp.args, i)) {
391 pr_info("Argument[%d] name '%s' conflicts with " 529 pr_info("Argument[%d] name '%s' conflicts with "
392 "another field.\n", i, argv[i]); 530 "another field.\n", i, argv[i]);
393 ret = -EINVAL; 531 ret = -EINVAL;
@@ -395,7 +533,8 @@ static int create_trace_uprobe(int argc, char **argv)
395 } 533 }
396 534
397 /* Parse fetch argument */ 535 /* Parse fetch argument */
398 ret = traceprobe_parse_probe_arg(arg, &tu->size, &tu->args[i], false, false); 536 ret = traceprobe_parse_probe_arg(arg, &tu->tp.size, parg,
537 is_return, false);
399 if (ret) { 538 if (ret) {
400 pr_info("Parse error at argument[%d]. (%d)\n", i, ret); 539 pr_info("Parse error at argument[%d]. (%d)\n", i, ret);
401 goto error; 540 goto error;
@@ -459,11 +598,11 @@ static int probes_seq_show(struct seq_file *m, void *v)
459 char c = is_ret_probe(tu) ? 'r' : 'p'; 598 char c = is_ret_probe(tu) ? 'r' : 'p';
460 int i; 599 int i;
461 600
462 seq_printf(m, "%c:%s/%s", c, tu->call.class->system, tu->call.name); 601 seq_printf(m, "%c:%s/%s", c, tu->tp.call.class->system, tu->tp.call.name);
463 seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset); 602 seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset);
464 603
465 for (i = 0; i < tu->nr_args; i++) 604 for (i = 0; i < tu->tp.nr_args; i++)
466 seq_printf(m, " %s=%s", tu->args[i].name, tu->args[i].comm); 605 seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm);
467 606
468 seq_printf(m, "\n"); 607 seq_printf(m, "\n");
469 return 0; 608 return 0;
@@ -509,7 +648,7 @@ static int probes_profile_seq_show(struct seq_file *m, void *v)
509{ 648{
510 struct trace_uprobe *tu = v; 649 struct trace_uprobe *tu = v;
511 650
512 seq_printf(m, " %s %-44s %15lu\n", tu->filename, tu->call.name, tu->nhit); 651 seq_printf(m, " %s %-44s %15lu\n", tu->filename, tu->tp.call.name, tu->nhit);
513 return 0; 652 return 0;
514} 653}
515 654
@@ -533,21 +672,117 @@ static const struct file_operations uprobe_profile_ops = {
533 .release = seq_release, 672 .release = seq_release,
534}; 673};
535 674
675struct uprobe_cpu_buffer {
676 struct mutex mutex;
677 void *buf;
678};
679static struct uprobe_cpu_buffer __percpu *uprobe_cpu_buffer;
680static int uprobe_buffer_refcnt;
681
682static int uprobe_buffer_init(void)
683{
684 int cpu, err_cpu;
685
686 uprobe_cpu_buffer = alloc_percpu(struct uprobe_cpu_buffer);
687 if (uprobe_cpu_buffer == NULL)
688 return -ENOMEM;
689
690 for_each_possible_cpu(cpu) {
691 struct page *p = alloc_pages_node(cpu_to_node(cpu),
692 GFP_KERNEL, 0);
693 if (p == NULL) {
694 err_cpu = cpu;
695 goto err;
696 }
697 per_cpu_ptr(uprobe_cpu_buffer, cpu)->buf = page_address(p);
698 mutex_init(&per_cpu_ptr(uprobe_cpu_buffer, cpu)->mutex);
699 }
700
701 return 0;
702
703err:
704 for_each_possible_cpu(cpu) {
705 if (cpu == err_cpu)
706 break;
707 free_page((unsigned long)per_cpu_ptr(uprobe_cpu_buffer, cpu)->buf);
708 }
709
710 free_percpu(uprobe_cpu_buffer);
711 return -ENOMEM;
712}
713
714static int uprobe_buffer_enable(void)
715{
716 int ret = 0;
717
718 BUG_ON(!mutex_is_locked(&event_mutex));
719
720 if (uprobe_buffer_refcnt++ == 0) {
721 ret = uprobe_buffer_init();
722 if (ret < 0)
723 uprobe_buffer_refcnt--;
724 }
725
726 return ret;
727}
728
729static void uprobe_buffer_disable(void)
730{
731 BUG_ON(!mutex_is_locked(&event_mutex));
732
733 if (--uprobe_buffer_refcnt == 0) {
734 free_percpu(uprobe_cpu_buffer);
735 uprobe_cpu_buffer = NULL;
736 }
737}
738
739static struct uprobe_cpu_buffer *uprobe_buffer_get(void)
740{
741 struct uprobe_cpu_buffer *ucb;
742 int cpu;
743
744 cpu = raw_smp_processor_id();
745 ucb = per_cpu_ptr(uprobe_cpu_buffer, cpu);
746
747 /*
748 * Use per-cpu buffers for the fastest access, but the task may migrate,
749 * so the mutex ensures we have sole access to the buffer.
750 */
751 mutex_lock(&ucb->mutex);
752
753 return ucb;
754}
755
756static void uprobe_buffer_put(struct uprobe_cpu_buffer *ucb)
757{
758 mutex_unlock(&ucb->mutex);
759}
760
536static void uprobe_trace_print(struct trace_uprobe *tu, 761static void uprobe_trace_print(struct trace_uprobe *tu,
537 unsigned long func, struct pt_regs *regs) 762 unsigned long func, struct pt_regs *regs)
538{ 763{
539 struct uprobe_trace_entry_head *entry; 764 struct uprobe_trace_entry_head *entry;
540 struct ring_buffer_event *event; 765 struct ring_buffer_event *event;
541 struct ring_buffer *buffer; 766 struct ring_buffer *buffer;
767 struct uprobe_cpu_buffer *ucb;
542 void *data; 768 void *data;
543 int size, i; 769 int size, dsize, esize;
544 struct ftrace_event_call *call = &tu->call; 770 struct ftrace_event_call *call = &tu->tp.call;
771
772 dsize = __get_data_size(&tu->tp, regs);
773 esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
545 774
546 size = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); 775 if (WARN_ON_ONCE(!uprobe_cpu_buffer || tu->tp.size + dsize > PAGE_SIZE))
776 return;
777
778 ucb = uprobe_buffer_get();
779 store_trace_args(esize, &tu->tp, regs, ucb->buf, dsize);
780
781 size = esize + tu->tp.size + dsize;
547 event = trace_current_buffer_lock_reserve(&buffer, call->event.type, 782 event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
548 size + tu->size, 0, 0); 783 size, 0, 0);
549 if (!event) 784 if (!event)
550 return; 785 goto out;
551 786
552 entry = ring_buffer_event_data(event); 787 entry = ring_buffer_event_data(event);
553 if (is_ret_probe(tu)) { 788 if (is_ret_probe(tu)) {
@@ -559,11 +794,13 @@ static void uprobe_trace_print(struct trace_uprobe *tu,
559 data = DATAOF_TRACE_ENTRY(entry, false); 794 data = DATAOF_TRACE_ENTRY(entry, false);
560 } 795 }
561 796
562 for (i = 0; i < tu->nr_args; i++) 797 memcpy(data, ucb->buf, tu->tp.size + dsize);
563 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
564 798
565 if (!call_filter_check_discard(call, entry, buffer, event)) 799 if (!call_filter_check_discard(call, entry, buffer, event))
566 trace_buffer_unlock_commit(buffer, event, 0, 0); 800 trace_buffer_unlock_commit(buffer, event, 0, 0);
801
802out:
803 uprobe_buffer_put(ucb);
567} 804}
568 805
569/* uprobe handler */ 806/* uprobe handler */
@@ -591,23 +828,24 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e
591 int i; 828 int i;
592 829
593 entry = (struct uprobe_trace_entry_head *)iter->ent; 830 entry = (struct uprobe_trace_entry_head *)iter->ent;
594 tu = container_of(event, struct trace_uprobe, call.event); 831 tu = container_of(event, struct trace_uprobe, tp.call.event);
595 832
596 if (is_ret_probe(tu)) { 833 if (is_ret_probe(tu)) {
597 if (!trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)", tu->call.name, 834 if (!trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)", tu->tp.call.name,
598 entry->vaddr[1], entry->vaddr[0])) 835 entry->vaddr[1], entry->vaddr[0]))
599 goto partial; 836 goto partial;
600 data = DATAOF_TRACE_ENTRY(entry, true); 837 data = DATAOF_TRACE_ENTRY(entry, true);
601 } else { 838 } else {
602 if (!trace_seq_printf(s, "%s: (0x%lx)", tu->call.name, 839 if (!trace_seq_printf(s, "%s: (0x%lx)", tu->tp.call.name,
603 entry->vaddr[0])) 840 entry->vaddr[0]))
604 goto partial; 841 goto partial;
605 data = DATAOF_TRACE_ENTRY(entry, false); 842 data = DATAOF_TRACE_ENTRY(entry, false);
606 } 843 }
607 844
608 for (i = 0; i < tu->nr_args; i++) { 845 for (i = 0; i < tu->tp.nr_args; i++) {
609 if (!tu->args[i].type->print(s, tu->args[i].name, 846 struct probe_arg *parg = &tu->tp.args[i];
610 data + tu->args[i].offset, entry)) 847
848 if (!parg->type->print(s, parg->name, data + parg->offset, entry))
611 goto partial; 849 goto partial;
612 } 850 }
613 851
@@ -618,11 +856,6 @@ partial:
618 return TRACE_TYPE_PARTIAL_LINE; 856 return TRACE_TYPE_PARTIAL_LINE;
619} 857}
620 858
621static inline bool is_trace_uprobe_enabled(struct trace_uprobe *tu)
622{
623 return tu->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE);
624}
625
626typedef bool (*filter_func_t)(struct uprobe_consumer *self, 859typedef bool (*filter_func_t)(struct uprobe_consumer *self,
627 enum uprobe_filter_ctx ctx, 860 enum uprobe_filter_ctx ctx,
628 struct mm_struct *mm); 861 struct mm_struct *mm);
@@ -632,29 +865,35 @@ probe_event_enable(struct trace_uprobe *tu, int flag, filter_func_t filter)
632{ 865{
633 int ret = 0; 866 int ret = 0;
634 867
635 if (is_trace_uprobe_enabled(tu)) 868 if (trace_probe_is_enabled(&tu->tp))
636 return -EINTR; 869 return -EINTR;
637 870
871 ret = uprobe_buffer_enable();
872 if (ret < 0)
873 return ret;
874
638 WARN_ON(!uprobe_filter_is_empty(&tu->filter)); 875 WARN_ON(!uprobe_filter_is_empty(&tu->filter));
639 876
640 tu->flags |= flag; 877 tu->tp.flags |= flag;
641 tu->consumer.filter = filter; 878 tu->consumer.filter = filter;
642 ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); 879 ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);
643 if (ret) 880 if (ret)
644 tu->flags &= ~flag; 881 tu->tp.flags &= ~flag;
645 882
646 return ret; 883 return ret;
647} 884}
648 885
649static void probe_event_disable(struct trace_uprobe *tu, int flag) 886static void probe_event_disable(struct trace_uprobe *tu, int flag)
650{ 887{
651 if (!is_trace_uprobe_enabled(tu)) 888 if (!trace_probe_is_enabled(&tu->tp))
652 return; 889 return;
653 890
654 WARN_ON(!uprobe_filter_is_empty(&tu->filter)); 891 WARN_ON(!uprobe_filter_is_empty(&tu->filter));
655 892
656 uprobe_unregister(tu->inode, tu->offset, &tu->consumer); 893 uprobe_unregister(tu->inode, tu->offset, &tu->consumer);
657 tu->flags &= ~flag; 894 tu->tp.flags &= ~flag;
895
896 uprobe_buffer_disable();
658} 897}
659 898
660static int uprobe_event_define_fields(struct ftrace_event_call *event_call) 899static int uprobe_event_define_fields(struct ftrace_event_call *event_call)
@@ -672,12 +911,12 @@ static int uprobe_event_define_fields(struct ftrace_event_call *event_call)
672 size = SIZEOF_TRACE_ENTRY(false); 911 size = SIZEOF_TRACE_ENTRY(false);
673 } 912 }
674 /* Set argument names as fields */ 913 /* Set argument names as fields */
675 for (i = 0; i < tu->nr_args; i++) { 914 for (i = 0; i < tu->tp.nr_args; i++) {
676 ret = trace_define_field(event_call, tu->args[i].type->fmttype, 915 struct probe_arg *parg = &tu->tp.args[i];
677 tu->args[i].name, 916
678 size + tu->args[i].offset, 917 ret = trace_define_field(event_call, parg->type->fmttype,
679 tu->args[i].type->size, 918 parg->name, size + parg->offset,
680 tu->args[i].type->is_signed, 919 parg->type->size, parg->type->is_signed,
681 FILTER_OTHER); 920 FILTER_OTHER);
682 921
683 if (ret) 922 if (ret)
@@ -686,59 +925,6 @@ static int uprobe_event_define_fields(struct ftrace_event_call *event_call)
686 return 0; 925 return 0;
687} 926}
688 927
689#define LEN_OR_ZERO (len ? len - pos : 0)
690static int __set_print_fmt(struct trace_uprobe *tu, char *buf, int len)
691{
692 const char *fmt, *arg;
693 int i;
694 int pos = 0;
695
696 if (is_ret_probe(tu)) {
697 fmt = "(%lx <- %lx)";
698 arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP;
699 } else {
700 fmt = "(%lx)";
701 arg = "REC->" FIELD_STRING_IP;
702 }
703
704 /* When len=0, we just calculate the needed length */
705
706 pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt);
707
708 for (i = 0; i < tu->nr_args; i++) {
709 pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s",
710 tu->args[i].name, tu->args[i].type->fmt);
711 }
712
713 pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg);
714
715 for (i = 0; i < tu->nr_args; i++) {
716 pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s",
717 tu->args[i].name);
718 }
719
720 return pos; /* return the length of print_fmt */
721}
722#undef LEN_OR_ZERO
723
724static int set_print_fmt(struct trace_uprobe *tu)
725{
726 char *print_fmt;
727 int len;
728
729 /* First: called with 0 length to calculate the needed length */
730 len = __set_print_fmt(tu, NULL, 0);
731 print_fmt = kmalloc(len + 1, GFP_KERNEL);
732 if (!print_fmt)
733 return -ENOMEM;
734
735 /* Second: actually write the @print_fmt */
736 __set_print_fmt(tu, print_fmt, len + 1);
737 tu->call.print_fmt = print_fmt;
738
739 return 0;
740}
741
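The __set_print_fmt()/set_print_fmt() pair removed above is the usual two-pass snprintf sizing idiom, and the generic set_print_fmt(&tu->tp, is_ret_probe(tu)) that replaces it keeps the same shape: a first pass with length zero only measures, a second pass writes into a buffer of exactly that size. A standalone sketch of the idiom; the format string here is illustrative, not the trace_probe one:

/* two_pass_fmt.c: the measure-then-format idiom in plain C */
#include <stdio.h>
#include <stdlib.h>

static char *build_fmt(const char *name)
{
	/* First pass: snprintf(NULL, 0, ...) only reports the needed length. */
	int len = snprintf(NULL, 0, "\"%s: (0x%%lx)\", REC->%s", name, name);
	char *buf;

	if (len < 0)
		return NULL;

	buf = malloc(len + 1);			/* +1 for the trailing NUL */
	if (!buf)
		return NULL;

	/* Second pass: actually write the format string. */
	snprintf(buf, len + 1, "\"%s: (0x%%lx)\", REC->%s", name, name);
	return buf;
}

int main(void)
{
	char *fmt = build_fmt("my_event");

	if (fmt) {
		puts(fmt);
		free(fmt);
	}
	return 0;
}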
742#ifdef CONFIG_PERF_EVENTS 928#ifdef CONFIG_PERF_EVENTS
743static bool 929static bool
744__uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm) 930__uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm)
@@ -831,14 +1017,27 @@ static bool uprobe_perf_filter(struct uprobe_consumer *uc,
831static void uprobe_perf_print(struct trace_uprobe *tu, 1017static void uprobe_perf_print(struct trace_uprobe *tu,
832 unsigned long func, struct pt_regs *regs) 1018 unsigned long func, struct pt_regs *regs)
833{ 1019{
834 struct ftrace_event_call *call = &tu->call; 1020 struct ftrace_event_call *call = &tu->tp.call;
835 struct uprobe_trace_entry_head *entry; 1021 struct uprobe_trace_entry_head *entry;
836 struct hlist_head *head; 1022 struct hlist_head *head;
1023 struct uprobe_cpu_buffer *ucb;
837 void *data; 1024 void *data;
838 int size, rctx, i; 1025 int size, dsize, esize;
1026 int rctx;
839 1027
840 size = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); 1028 dsize = __get_data_size(&tu->tp, regs);
841 size = ALIGN(size + tu->size + sizeof(u32), sizeof(u64)) - sizeof(u32); 1029 esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
1030
1031 if (WARN_ON_ONCE(!uprobe_cpu_buffer))
1032 return;
1033
1034 size = esize + tu->tp.size + dsize;
1035 size = ALIGN(size + sizeof(u32), sizeof(u64)) - sizeof(u32);
1036 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough"))
1037 return;
1038
1039 ucb = uprobe_buffer_get();
1040 store_trace_args(esize, &tu->tp, regs, ucb->buf, dsize);
842 1041
843 preempt_disable(); 1042 preempt_disable();
844 head = this_cpu_ptr(call->perf_events); 1043 head = this_cpu_ptr(call->perf_events);
@@ -858,12 +1057,18 @@ static void uprobe_perf_print(struct trace_uprobe *tu,
858 data = DATAOF_TRACE_ENTRY(entry, false); 1057 data = DATAOF_TRACE_ENTRY(entry, false);
859 } 1058 }
860 1059
861 for (i = 0; i < tu->nr_args; i++) 1060 memcpy(data, ucb->buf, tu->tp.size + dsize);
862 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); 1061
1062 if (size - esize > tu->tp.size + dsize) {
1063 int len = tu->tp.size + dsize;
1064
1065 memset(data + len, 0, size - esize - len);
1066 }
863 1067
864 perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); 1068 perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
865 out: 1069 out:
866 preempt_enable(); 1070 preempt_enable();
1071 uprobe_buffer_put(ucb);
867} 1072}
868 1073
869/* uprobe profile handler */ 1074/* uprobe profile handler */
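In uprobe_perf_print() the record is now sized as esize (the fixed entry header) plus tu->tp.size (the fixed-width arguments) plus dsize (dynamic data such as strings), rounded so that the payload together with perf's u32 size word ends on a u64 boundary, and the rounding slack is zeroed before submission. A standalone check of that arithmetic with made-up sizes:

/* perf_size.c: the ALIGN arithmetic used when sizing the perf record */
#include <stdio.h>
#include <stdint.h>

#define ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	size_t esize  = 16;	/* made-up entry header size */
	size_t tpsize = 12;	/* made-up fixed argument size */
	size_t dsize  = 5;	/* made-up dynamic (string) data size */

	/* Reserve room for perf's u32 size field, round to u64, drop it again. */
	size_t size = ALIGN(esize + tpsize + dsize + sizeof(uint32_t),
			    sizeof(uint64_t)) - sizeof(uint32_t);

	/*
	 * Anything between the copied payload and 'size' must be zeroed so
	 * uninitialised padding never reaches user space.
	 */
	size_t pad = size - esize - (tpsize + dsize);

	printf("payload=%zu record=%zu pad=%zu\n",
	       esize + tpsize + dsize, size, pad);
	return 0;
}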
@@ -921,16 +1126,22 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type,
921static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) 1126static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
922{ 1127{
923 struct trace_uprobe *tu; 1128 struct trace_uprobe *tu;
1129 struct uprobe_dispatch_data udd;
924 int ret = 0; 1130 int ret = 0;
925 1131
926 tu = container_of(con, struct trace_uprobe, consumer); 1132 tu = container_of(con, struct trace_uprobe, consumer);
927 tu->nhit++; 1133 tu->nhit++;
928 1134
929 if (tu->flags & TP_FLAG_TRACE) 1135 udd.tu = tu;
1136 udd.bp_addr = instruction_pointer(regs);
1137
1138 current->utask->vaddr = (unsigned long) &udd;
1139
1140 if (tu->tp.flags & TP_FLAG_TRACE)
930 ret |= uprobe_trace_func(tu, regs); 1141 ret |= uprobe_trace_func(tu, regs);
931 1142
932#ifdef CONFIG_PERF_EVENTS 1143#ifdef CONFIG_PERF_EVENTS
933 if (tu->flags & TP_FLAG_PROFILE) 1144 if (tu->tp.flags & TP_FLAG_PROFILE)
934 ret |= uprobe_perf_func(tu, regs); 1145 ret |= uprobe_perf_func(tu, regs);
935#endif 1146#endif
936 return ret; 1147 return ret;
@@ -940,14 +1151,20 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con,
940 unsigned long func, struct pt_regs *regs) 1151 unsigned long func, struct pt_regs *regs)
941{ 1152{
942 struct trace_uprobe *tu; 1153 struct trace_uprobe *tu;
1154 struct uprobe_dispatch_data udd;
943 1155
944 tu = container_of(con, struct trace_uprobe, consumer); 1156 tu = container_of(con, struct trace_uprobe, consumer);
945 1157
946 if (tu->flags & TP_FLAG_TRACE) 1158 udd.tu = tu;
1159 udd.bp_addr = func;
1160
1161 current->utask->vaddr = (unsigned long) &udd;
1162
1163 if (tu->tp.flags & TP_FLAG_TRACE)
947 uretprobe_trace_func(tu, func, regs); 1164 uretprobe_trace_func(tu, func, regs);
948 1165
949#ifdef CONFIG_PERF_EVENTS 1166#ifdef CONFIG_PERF_EVENTS
950 if (tu->flags & TP_FLAG_PROFILE) 1167 if (tu->tp.flags & TP_FLAG_PROFILE)
951 uretprobe_perf_func(tu, func, regs); 1168 uretprobe_perf_func(tu, func, regs);
952#endif 1169#endif
953 return 0; 1170 return 0;
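Both dispatchers now place a struct uprobe_dispatch_data holding the trace_uprobe and the breakpoint or return address on their own stack and publish its address through current->utask->vaddr, so the argument-fetch code that runs later in the same hit can retrieve it. A userspace model of that hand-off, using a thread-local slot in place of current->utask; all names below are illustrative:

/* dispatch_ctx.c: passing per-hit context through a thread-local slot */
#include <stdio.h>

struct dispatch_data {
	const char *event;
	unsigned long bp_addr;
};

/* Stand-in for current->utask->vaddr: one slot per thread. */
static _Thread_local struct dispatch_data *current_ctx;

static void fetch_args(void)
{
	/* The fetch code only sees the thread-local slot, not the caller. */
	struct dispatch_data *udd = current_ctx;

	printf("fetching args for %s at %#lx\n", udd->event, udd->bp_addr);
}

static void dispatcher(const char *event, unsigned long bp_addr)
{
	struct dispatch_data udd = { .event = event, .bp_addr = bp_addr };

	current_ctx = &udd;	/* publish on-stack context for this hit */
	fetch_args();		/* consumers read it back via the slot */
}

int main(void)
{
	dispatcher("my_uprobe", 0x400512);
	return 0;
}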
@@ -959,7 +1176,7 @@ static struct trace_event_functions uprobe_funcs = {
959 1176
960static int register_uprobe_event(struct trace_uprobe *tu) 1177static int register_uprobe_event(struct trace_uprobe *tu)
961{ 1178{
962 struct ftrace_event_call *call = &tu->call; 1179 struct ftrace_event_call *call = &tu->tp.call;
963 int ret; 1180 int ret;
964 1181
965 /* Initialize ftrace_event_call */ 1182 /* Initialize ftrace_event_call */
@@ -967,7 +1184,7 @@ static int register_uprobe_event(struct trace_uprobe *tu)
967 call->event.funcs = &uprobe_funcs; 1184 call->event.funcs = &uprobe_funcs;
968 call->class->define_fields = uprobe_event_define_fields; 1185 call->class->define_fields = uprobe_event_define_fields;
969 1186
970 if (set_print_fmt(tu) < 0) 1187 if (set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0)
971 return -ENOMEM; 1188 return -ENOMEM;
972 1189
973 ret = register_ftrace_event(&call->event); 1190 ret = register_ftrace_event(&call->event);
@@ -994,11 +1211,11 @@ static int unregister_uprobe_event(struct trace_uprobe *tu)
994 int ret; 1211 int ret;
995 1212
996 /* tu->event is unregistered in trace_remove_event_call() */ 1213 /* tu->event is unregistered in trace_remove_event_call() */
997 ret = trace_remove_event_call(&tu->call); 1214 ret = trace_remove_event_call(&tu->tp.call);
998 if (ret) 1215 if (ret)
999 return ret; 1216 return ret;
1000 kfree(tu->call.print_fmt); 1217 kfree(tu->tp.call.print_fmt);
1001 tu->call.print_fmt = NULL; 1218 tu->tp.call.print_fmt = NULL;
1002 return 0; 1219 return 0;
1003} 1220}
1004 1221
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 29f26540e9c9..031cc5655a51 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -631,6 +631,11 @@ void tracepoint_iter_reset(struct tracepoint_iter *iter)
631EXPORT_SYMBOL_GPL(tracepoint_iter_reset); 631EXPORT_SYMBOL_GPL(tracepoint_iter_reset);
632 632
633#ifdef CONFIG_MODULES 633#ifdef CONFIG_MODULES
634bool trace_module_has_bad_taint(struct module *mod)
635{
636 return mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP));
637}
638
634static int tracepoint_module_coming(struct module *mod) 639static int tracepoint_module_coming(struct module *mod)
635{ 640{
636 struct tp_module *tp_mod, *iter; 641 struct tp_module *tp_mod, *iter;
@@ -641,7 +646,7 @@ static int tracepoint_module_coming(struct module *mod)
641 * module headers (for forced load), to make sure we don't cause a crash. 646 * module headers (for forced load), to make sure we don't cause a crash.
642 * Staging and out-of-tree GPL modules are fine. 647 * Staging and out-of-tree GPL modules are fine.
643 */ 648 */
644 if (mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP))) 649 if (trace_module_has_bad_taint(mod))
645 return 0; 650 return 0;
646 mutex_lock(&tracepoints_mutex); 651 mutex_lock(&tracepoints_mutex);
647 tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL); 652 tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL);
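trace_module_has_bad_taint() masks out the two taints treated as harmless here, out-of-tree and staging ("crap") modules, and reports whether any other taint bit is set. A standalone illustration of the mask test; the bit positions below are illustrative only, the real TAINT_* constants are defined in the kernel headers:

/* bad_taint.c: masking out the acceptable taint bits */
#include <stdio.h>
#include <stdbool.h>

/* Illustrative bit positions, not the kernel's TAINT_* values. */
#define MY_TAINT_FORCED_MODULE	(1u << 1)
#define MY_TAINT_CRAP		(1u << 10)
#define MY_TAINT_OOT_MODULE	(1u << 12)

static bool has_bad_taint(unsigned int taints)
{
	/* Anything other than OOT/CRAP makes the module "bad". */
	return taints & ~(MY_TAINT_OOT_MODULE | MY_TAINT_CRAP);
}

int main(void)
{
	printf("oot only: %d\n", has_bad_taint(MY_TAINT_OOT_MODULE));		/* 0 */
	printf("oot+crap: %d\n",
	       has_bad_taint(MY_TAINT_OOT_MODULE | MY_TAINT_CRAP));		/* 0 */
	printf("forced:   %d\n", has_bad_taint(MY_TAINT_FORCED_MODULE));	/* 1 */
	return 0;
}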
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 240fb62cf394..dd06439b9c84 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -225,7 +225,7 @@ static u32 map_id_up(struct uid_gid_map *map, u32 id)
225 * 225 *
226 * When there is no mapping defined for the user-namespace uid 226 * When there is no mapping defined for the user-namespace uid
227 * pair INVALID_UID is returned. Callers are expected to test 227 * pair INVALID_UID is returned. Callers are expected to test
228 * for and handle handle INVALID_UID being returned. INVALID_UID 228 * for and handle INVALID_UID being returned. INVALID_UID
229 * may be tested for using uid_valid(). 229 * may be tested for using uid_valid().
230 */ 230 */
231kuid_t make_kuid(struct user_namespace *ns, uid_t uid) 231kuid_t make_kuid(struct user_namespace *ns, uid_t uid)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index b010eac595d2..193e977a10ea 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1851,6 +1851,12 @@ static void destroy_worker(struct worker *worker)
1851 if (worker->flags & WORKER_IDLE) 1851 if (worker->flags & WORKER_IDLE)
1852 pool->nr_idle--; 1852 pool->nr_idle--;
1853 1853
1854 /*
1855 * Once WORKER_DIE is set, the kworker may destroy itself at any
1856 * point. Pin to ensure the task stays until we're done with it.
1857 */
1858 get_task_struct(worker->task);
1859
1854 list_del_init(&worker->entry); 1860 list_del_init(&worker->entry);
1855 worker->flags |= WORKER_DIE; 1861 worker->flags |= WORKER_DIE;
1856 1862
@@ -1859,6 +1865,7 @@ static void destroy_worker(struct worker *worker)
1859 spin_unlock_irq(&pool->lock); 1865 spin_unlock_irq(&pool->lock);
1860 1866
1861 kthread_stop(worker->task); 1867 kthread_stop(worker->task);
1868 put_task_struct(worker->task);
1862 kfree(worker); 1869 kfree(worker);
1863 1870
1864 spin_lock_irq(&pool->lock); 1871 spin_lock_irq(&pool->lock);
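The comment added to destroy_worker() spells out the race: once WORKER_DIE is set the kworker may exit and drop its own task_struct reference, so the function pins the task with get_task_struct() before waking it and releases the pin only after kthread_stop() returns. The same pin, use, unpin shape in a tiny userspace model with an explicit reference count; all names here are hypothetical:

/* pin_task.c: model of the get/put reference pattern around teardown */
#include <stdio.h>
#include <stdlib.h>

struct task {
	int refcount;
	const char *name;
};

static struct task *task_get(struct task *t) { t->refcount++; return t; }

static void task_put(struct task *t)
{
	if (--t->refcount == 0) {
		printf("freeing %s\n", t->name);
		free(t);
	}
}

static void stop_task(struct task *t)
{
	/* The task may drop its own reference while we wait for it. */
	task_put(t);
}

int main(void)
{
	struct task *t = malloc(sizeof(*t));

	if (!t)
		return 1;
	t->refcount = 1;		/* the task's own reference */
	t->name = "kworker-model";

	task_get(t);			/* pin before signalling it to die */
	stop_task(t);			/* may release the task's own ref */
	printf("still valid: %s\n", t->name);
	task_put(t);			/* our pin, dropped last */
	return 0;
}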
@@ -4789,6 +4796,7 @@ static int workqueue_cpu_down_callback(struct notifier_block *nfb,
4789 4796
4790 /* wait for per-cpu unbinding to finish */ 4797 /* wait for per-cpu unbinding to finish */
4791 flush_work(&unbind_work); 4798 flush_work(&unbind_work);
4799 destroy_work_on_stack(&unbind_work);
4792 break; 4800 break;
4793 } 4801 }
4794 return NOTIFY_OK; 4802 return NOTIFY_OK;
@@ -4828,6 +4836,7 @@ long work_on_cpu(int cpu, long (*fn)(void *), void *arg)
4828 INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn); 4836 INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn);
4829 schedule_work_on(cpu, &wfc.work); 4837 schedule_work_on(cpu, &wfc.work);
4830 flush_work(&wfc.work); 4838 flush_work(&wfc.work);
4839 destroy_work_on_stack(&wfc.work);
4831 return wfc.ret; 4840 return wfc.ret;
4832} 4841}
4833EXPORT_SYMBOL_GPL(work_on_cpu); 4842EXPORT_SYMBOL_GPL(work_on_cpu);
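The two destroy_work_on_stack() calls pair with the existing INIT_WORK_ONSTACK() users: when CONFIG_DEBUG_OBJECTS_WORK is enabled, an on-stack work item has to be removed from the debug-object tracker before its stack frame is reused. A kernel-module-style sketch of the full lifecycle; the work function and message are made up, and schedule_work() stands in for the schedule_work_on() used by work_on_cpu() above:

/* On-stack work item lifecycle, as used by work_on_cpu() above. */
#include <linux/module.h>
#include <linux/workqueue.h>

static void my_onstack_work_fn(struct work_struct *work)
{
	pr_info("on-stack work ran\n");
}

static int __init onstack_work_demo_init(void)
{
	struct work_struct work;

	INIT_WORK_ONSTACK(&work, my_onstack_work_fn);	/* debugobjects: marked "on stack" */
	schedule_work(&work);
	flush_work(&work);		/* must finish before the frame is reused */
	destroy_work_on_stack(&work);	/* and be dropped from the tracker */
	return 0;
}

static void __exit onstack_work_demo_exit(void) { }

module_init(onstack_work_demo_init);
module_exit(onstack_work_demo_exit);
MODULE_LICENSE("GPL");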