author     Dmitry Torokhov <dmitry.torokhov@gmail.com>  2015-02-10 14:35:36 -0500
committer  Dmitry Torokhov <dmitry.torokhov@gmail.com>  2015-02-10 14:35:36 -0500
commit     4ba24fef3eb3b142197135223b90ced2f319cd53 (patch)
tree       a20c125b27740ec7b4c761b11d801108e1b316b2 /kernel
parent     47c1ffb2b6b630894e9a16442611c056ab21c057 (diff)
parent     98a4a59ee31a12105a2b84f5b8b515ac2cb208ef (diff)
Merge branch 'next' into for-linus
Prepare first round of input updates for 3.20.
Diffstat (limited to 'kernel')
-rw-r--r-- kernel/Makefile | 3
-rw-r--r-- kernel/acct.c | 14
-rw-r--r-- kernel/async.c | 8
-rw-r--r-- kernel/audit.c | 55
-rw-r--r-- kernel/audit.h | 1
-rw-r--r-- kernel/audit_tree.c | 23
-rw-r--r-- kernel/audit_watch.c | 4
-rw-r--r-- kernel/auditfilter.c | 79
-rw-r--r-- kernel/auditsc.c | 86
-rw-r--r-- kernel/bpf/Makefile | 4
-rw-r--r-- kernel/bpf/arraymap.c | 156
-rw-r--r-- kernel/bpf/core.c | 136
-rw-r--r-- kernel/bpf/hashtab.c | 367
-rw-r--r-- kernel/bpf/helpers.c | 89
-rw-r--r-- kernel/bpf/syscall.c | 606
-rw-r--r-- kernel/bpf/test_stub.c | 78
-rw-r--r-- kernel/bpf/verifier.c | 2003
-rw-r--r-- kernel/cgroup.c | 366
-rw-r--r-- kernel/configs/tiny.config | 4
-rw-r--r-- kernel/context_tracking.c | 40
-rw-r--r-- kernel/cpu.c | 37
-rw-r--r-- kernel/cpuset.c | 200
-rw-r--r-- kernel/crash_dump.c | 1
-rw-r--r-- kernel/debug/debug_core.c | 52
-rw-r--r-- kernel/debug/kdb/kdb_bp.c | 43
-rw-r--r-- kernel/debug/kdb/kdb_debugger.c | 4
-rw-r--r-- kernel/debug/kdb/kdb_main.c | 267
-rw-r--r-- kernel/debug/kdb/kdb_private.h | 3
-rw-r--r-- kernel/events/callchain.c | 6
-rw-r--r-- kernel/events/core.c | 347
-rw-r--r-- kernel/events/hw_breakpoint.c | 7
-rw-r--r-- kernel/events/uprobes.c | 9
-rw-r--r-- kernel/exit.c | 317
-rw-r--r-- kernel/extable.c | 7
-rw-r--r-- kernel/fork.c | 25
-rw-r--r-- kernel/freezer.c | 9
-rw-r--r-- kernel/futex.c | 38
-rw-r--r-- kernel/gcov/Kconfig | 5
-rw-r--r-- kernel/groups.c | 11
-rw-r--r-- kernel/irq/Kconfig | 18
-rw-r--r-- kernel/irq/Makefile | 1
-rw-r--r-- kernel/irq/chip.c | 217
-rw-r--r-- kernel/irq/devres.c | 2
-rw-r--r-- kernel/irq/generic-chip.c | 36
-rw-r--r-- kernel/irq/internals.h | 20
-rw-r--r-- kernel/irq/irqdesc.c | 94
-rw-r--r-- kernel/irq/irqdomain.c | 567
-rw-r--r-- kernel/irq/manage.c | 34
-rw-r--r-- kernel/irq/msi.c | 330
-rw-r--r-- kernel/irq/pm.c | 159
-rw-r--r-- kernel/irq/proc.c | 22
-rw-r--r-- kernel/irq_work.c | 27
-rw-r--r-- kernel/kallsyms.c | 11
-rw-r--r-- kernel/kexec.c | 34
-rw-r--r-- kernel/kmod.c | 111
-rw-r--r-- kernel/kprobes.c | 20
-rw-r--r-- kernel/kthread.c | 2
-rw-r--r-- kernel/locking/locktorture.c | 529
-rw-r--r-- kernel/locking/mcs_spinlock.h | 3
-rw-r--r-- kernel/locking/mutex-debug.c | 2
-rw-r--r-- kernel/locking/mutex.c | 422
-rw-r--r-- kernel/locking/mutex.h | 2
-rw-r--r-- kernel/locking/rtmutex.c | 2
-rw-r--r-- kernel/locking/rwsem-xadd.c | 27
-rw-r--r-- kernel/locking/semaphore.c | 12
-rw-r--r-- kernel/module.c | 206
-rw-r--r-- kernel/nsproxy.c | 10
-rw-r--r-- kernel/panic.c | 14
-rw-r--r-- kernel/params.c | 121
-rw-r--r-- kernel/pid.c | 7
-rw-r--r-- kernel/pid_namespace.c | 57
-rw-r--r-- kernel/power/Kconfig | 21
-rw-r--r-- kernel/power/hibernate.c | 22
-rw-r--r-- kernel/power/power.h | 3
-rw-r--r-- kernel/power/process.c | 58
-rw-r--r-- kernel/power/qos.c | 27
-rw-r--r-- kernel/power/snapshot.c | 63
-rw-r--r-- kernel/power/suspend.c | 51
-rw-r--r-- kernel/power/suspend_test.c | 32
-rw-r--r-- kernel/power/swap.c | 43
-rw-r--r-- kernel/printk/printk.c | 134
-rw-r--r-- kernel/ptrace.c | 23
-rw-r--r-- kernel/rcu/Makefile | 2
-rw-r--r-- kernel/rcu/rcu.h | 2
-rw-r--r-- kernel/rcu/rcutorture.c | 279
-rw-r--r-- kernel/rcu/tiny.c | 26
-rw-r--r-- kernel/rcu/tree.c | 227
-rw-r--r-- kernel/rcu/tree.h | 41
-rw-r--r-- kernel/rcu/tree_plugin.h | 538
-rw-r--r-- kernel/rcu/update.c | 432
-rw-r--r-- kernel/reboot.c | 81
-rw-r--r-- kernel/res_counter.c | 211
-rw-r--r-- kernel/resource.c | 106
-rw-r--r-- kernel/sched/auto_group.c | 5
-rw-r--r-- kernel/sched/clock.c | 2
-rw-r--r-- kernel/sched/completion.c | 5
-rw-r--r-- kernel/sched/core.c | 695
-rw-r--r-- kernel/sched/cpudeadline.c | 4
-rw-r--r-- kernel/sched/cpudeadline.h | 3
-rw-r--r-- kernel/sched/cpupri.h | 3
-rw-r--r-- kernel/sched/cputime.c | 64
-rw-r--r-- kernel/sched/deadline.c | 202
-rw-r--r-- kernel/sched/debug.c | 24
-rw-r--r-- kernel/sched/fair.c | 876
-rw-r--r-- kernel/sched/idle.c | 6
-rw-r--r-- kernel/sched/idle_task.c | 5
-rw-r--r-- kernel/sched/rt.c | 42
-rw-r--r-- kernel/sched/sched.h | 129
-rw-r--r-- kernel/sched/stop_task.c | 7
-rw-r--r-- kernel/sched/wait.c | 102
-rw-r--r-- kernel/seccomp.c | 259
-rw-r--r-- kernel/signal.c | 46
-rw-r--r-- kernel/smp.c | 28
-rw-r--r-- kernel/smpboot.c | 15
-rw-r--r-- kernel/softirq.c | 8
-rw-r--r-- kernel/stacktrace.c | 32
-rw-r--r-- kernel/sys.c | 503
-rw-r--r-- kernel/sys_ni.c | 11
-rw-r--r-- kernel/sysctl.c | 35
-rw-r--r-- kernel/sysctl_binary.c | 2
-rw-r--r-- kernel/taskstats.c | 4
-rw-r--r-- kernel/time/Makefile | 2
-rw-r--r-- kernel/time/clockevents.c | 2
-rw-r--r-- kernel/time/clocksource.c | 2
-rw-r--r-- kernel/time/hrtimer.c | 23
-rw-r--r-- kernel/time/posix-cpu-timers.c | 16
-rw-r--r-- kernel/time/posix-timers.c | 1
-rw-r--r-- kernel/time/test_udelay.c (renamed from kernel/time/udelay_test.c) | 0
-rw-r--r-- kernel/time/tick-broadcast.c | 2
-rw-r--r-- kernel/time/tick-common.c | 7
-rw-r--r-- kernel/time/tick-internal.h | 7
-rw-r--r-- kernel/time/tick-oneshot.c | 2
-rw-r--r-- kernel/time/tick-sched.c | 92
-rw-r--r-- kernel/time/time.c | 21
-rw-r--r-- kernel/time/timekeeping.c | 127
-rw-r--r-- kernel/time/timer.c | 7
-rw-r--r-- kernel/torture.c | 32
-rw-r--r-- kernel/trace/Makefile | 2
-rw-r--r-- kernel/trace/blktrace.c | 151
-rw-r--r-- kernel/trace/ftrace.c | 778
-rw-r--r-- kernel/trace/ring_buffer.c | 154
-rw-r--r-- kernel/trace/ring_buffer_benchmark.c | 3
-rw-r--r-- kernel/trace/trace.c | 288
-rw-r--r-- kernel/trace/trace.h | 31
-rw-r--r-- kernel/trace/trace_branch.c | 47
-rw-r--r-- kernel/trace/trace_events.c | 62
-rw-r--r-- kernel/trace/trace_events_filter.c | 29
-rw-r--r-- kernel/trace/trace_events_trigger.c | 6
-rw-r--r-- kernel/trace/trace_functions.c | 119
-rw-r--r-- kernel/trace/trace_functions_graph.c | 423
-rw-r--r-- kernel/trace/trace_kdb.c | 25
-rw-r--r-- kernel/trace/trace_kprobe.c | 46
-rw-r--r-- kernel/trace/trace_mmiotrace.c | 52
-rw-r--r-- kernel/trace/trace_output.c | 446
-rw-r--r-- kernel/trace/trace_output.h | 16
-rw-r--r-- kernel/trace/trace_printk.c | 2
-rw-r--r-- kernel/trace/trace_probe.c | 10
-rw-r--r-- kernel/trace/trace_sched_switch.c | 144
-rw-r--r-- kernel/trace/trace_sched_wakeup.c | 56
-rw-r--r-- kernel/trace/trace_selftest.c | 51
-rw-r--r-- kernel/trace/trace_seq.c | 253
-rw-r--r-- kernel/trace/trace_stack.c | 4
-rw-r--r-- kernel/trace/trace_syscalls.c | 66
-rw-r--r-- kernel/trace/trace_uprobe.c | 28
-rw-r--r-- kernel/uid16.c | 2
-rw-r--r-- kernel/user-return-notifier.c | 4
-rw-r--r-- kernel/user.c | 6
-rw-r--r-- kernel/user_namespace.c | 153
-rw-r--r-- kernel/utsname.c | 31
-rw-r--r-- kernel/watchdog.c | 90
-rw-r--r-- kernel/workqueue.c | 35
171 files changed, 12976 insertions, 5148 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index dc5c77544fd6..a59481a3fa6c 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -57,7 +57,6 @@ obj-$(CONFIG_UTS_NS) += utsname.o
57obj-$(CONFIG_USER_NS) += user_namespace.o 57obj-$(CONFIG_USER_NS) += user_namespace.o
58obj-$(CONFIG_PID_NS) += pid_namespace.o 58obj-$(CONFIG_PID_NS) += pid_namespace.o
59obj-$(CONFIG_IKCONFIG) += configs.o 59obj-$(CONFIG_IKCONFIG) += configs.o
60obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
61obj-$(CONFIG_SMP) += stop_machine.o 60obj-$(CONFIG_SMP) += stop_machine.o
62obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o 61obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
63obj-$(CONFIG_AUDIT) += audit.o auditfilter.o 62obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
@@ -86,7 +85,7 @@ obj-$(CONFIG_RING_BUFFER) += trace/
86obj-$(CONFIG_TRACEPOINTS) += trace/ 85obj-$(CONFIG_TRACEPOINTS) += trace/
87obj-$(CONFIG_IRQ_WORK) += irq_work.o 86obj-$(CONFIG_IRQ_WORK) += irq_work.o
88obj-$(CONFIG_CPU_PM) += cpu_pm.o 87obj-$(CONFIG_CPU_PM) += cpu_pm.o
89obj-$(CONFIG_NET) += bpf/ 88obj-$(CONFIG_BPF) += bpf/
90 89
91obj-$(CONFIG_PERF_EVENTS) += events/ 90obj-$(CONFIG_PERF_EVENTS) += events/
92 91
diff --git a/kernel/acct.c b/kernel/acct.c
index b4c667d22e79..33738ef972f3 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -472,7 +472,6 @@ static void do_acct_process(struct bsd_acct_struct *acct)
472 acct_t ac; 472 acct_t ac;
473 unsigned long flim; 473 unsigned long flim;
474 const struct cred *orig_cred; 474 const struct cred *orig_cred;
475 struct pid_namespace *ns = acct->ns;
476 struct file *file = acct->file; 475 struct file *file = acct->file;
477 476
478 /* 477 /*
@@ -500,10 +499,15 @@ static void do_acct_process(struct bsd_acct_struct *acct)
500 ac.ac_gid16 = ac.ac_gid; 499 ac.ac_gid16 = ac.ac_gid;
501#endif 500#endif
502#if ACCT_VERSION == 3 501#if ACCT_VERSION == 3
503 ac.ac_pid = task_tgid_nr_ns(current, ns); 502 {
504 rcu_read_lock(); 503 struct pid_namespace *ns = acct->ns;
505 ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns); 504
506 rcu_read_unlock(); 505 ac.ac_pid = task_tgid_nr_ns(current, ns);
506 rcu_read_lock();
507 ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent),
508 ns);
509 rcu_read_unlock();
510 }
507#endif 511#endif
508 /* 512 /*
509 * Get freeze protection. If the fs is frozen, just skip the write 513 * Get freeze protection. If the fs is frozen, just skip the write
diff --git a/kernel/async.c b/kernel/async.c
index 61f023ce0228..4c3773c0bf63 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -115,7 +115,7 @@ static void async_run_entry_fn(struct work_struct *work)
115 115
116 /* 1) run (and print duration) */ 116 /* 1) run (and print duration) */
117 if (initcall_debug && system_state == SYSTEM_BOOTING) { 117 if (initcall_debug && system_state == SYSTEM_BOOTING) {
118 printk(KERN_DEBUG "calling %lli_%pF @ %i\n", 118 pr_debug("calling %lli_%pF @ %i\n",
119 (long long)entry->cookie, 119 (long long)entry->cookie,
120 entry->func, task_pid_nr(current)); 120 entry->func, task_pid_nr(current));
121 calltime = ktime_get(); 121 calltime = ktime_get();
@@ -124,7 +124,7 @@ static void async_run_entry_fn(struct work_struct *work)
124 if (initcall_debug && system_state == SYSTEM_BOOTING) { 124 if (initcall_debug && system_state == SYSTEM_BOOTING) {
125 rettime = ktime_get(); 125 rettime = ktime_get();
126 delta = ktime_sub(rettime, calltime); 126 delta = ktime_sub(rettime, calltime);
127 printk(KERN_DEBUG "initcall %lli_%pF returned 0 after %lld usecs\n", 127 pr_debug("initcall %lli_%pF returned 0 after %lld usecs\n",
128 (long long)entry->cookie, 128 (long long)entry->cookie,
129 entry->func, 129 entry->func,
130 (long long)ktime_to_ns(delta) >> 10); 130 (long long)ktime_to_ns(delta) >> 10);
@@ -285,7 +285,7 @@ void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain
285 ktime_t uninitialized_var(starttime), delta, endtime; 285 ktime_t uninitialized_var(starttime), delta, endtime;
286 286
287 if (initcall_debug && system_state == SYSTEM_BOOTING) { 287 if (initcall_debug && system_state == SYSTEM_BOOTING) {
288 printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current)); 288 pr_debug("async_waiting @ %i\n", task_pid_nr(current));
289 starttime = ktime_get(); 289 starttime = ktime_get();
290 } 290 }
291 291
@@ -295,7 +295,7 @@ void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain
295 endtime = ktime_get(); 295 endtime = ktime_get();
296 delta = ktime_sub(endtime, starttime); 296 delta = ktime_sub(endtime, starttime);
297 297
298 printk(KERN_DEBUG "async_continuing @ %i after %lli usec\n", 298 pr_debug("async_continuing @ %i after %lli usec\n",
299 task_pid_nr(current), 299 task_pid_nr(current),
300 (long long)ktime_to_ns(delta) >> 10); 300 (long long)ktime_to_ns(delta) >> 10);
301 } 301 }
diff --git a/kernel/audit.c b/kernel/audit.c
index ba2ff5a5c600..72ab759a0b43 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -126,7 +126,7 @@ static atomic_t audit_lost = ATOMIC_INIT(0);
126 126
127/* The netlink socket. */ 127/* The netlink socket. */
128static struct sock *audit_sock; 128static struct sock *audit_sock;
129int audit_net_id; 129static int audit_net_id;
130 130
131/* Hash for inode-based rules */ 131/* Hash for inode-based rules */
132struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; 132struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
@@ -429,7 +429,7 @@ static void kauditd_send_skb(struct sk_buff *skb)
429 * This function doesn't consume an skb as might be expected since it has to 429 * This function doesn't consume an skb as might be expected since it has to
430 * copy it anyways. 430 * copy it anyways.
431 */ 431 */
432static void kauditd_send_multicast_skb(struct sk_buff *skb) 432static void kauditd_send_multicast_skb(struct sk_buff *skb, gfp_t gfp_mask)
433{ 433{
434 struct sk_buff *copy; 434 struct sk_buff *copy;
435 struct audit_net *aunet = net_generic(&init_net, audit_net_id); 435 struct audit_net *aunet = net_generic(&init_net, audit_net_id);
@@ -448,11 +448,11 @@ static void kauditd_send_multicast_skb(struct sk_buff *skb)
448 * no reason for new multicast clients to continue with this 448 * no reason for new multicast clients to continue with this
449 * non-compliance. 449 * non-compliance.
450 */ 450 */
451 copy = skb_copy(skb, GFP_KERNEL); 451 copy = skb_copy(skb, gfp_mask);
452 if (!copy) 452 if (!copy)
453 return; 453 return;
454 454
455 nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, GFP_KERNEL); 455 nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, gfp_mask);
456} 456}
457 457
458/* 458/*
@@ -499,7 +499,6 @@ static int kauditd_thread(void *dummy)
499 set_freezable(); 499 set_freezable();
500 while (!kthread_should_stop()) { 500 while (!kthread_should_stop()) {
501 struct sk_buff *skb; 501 struct sk_buff *skb;
502 DECLARE_WAITQUEUE(wait, current);
503 502
504 flush_hold_queue(); 503 flush_hold_queue();
505 504
@@ -514,16 +513,8 @@ static int kauditd_thread(void *dummy)
514 audit_printk_skb(skb); 513 audit_printk_skb(skb);
515 continue; 514 continue;
516 } 515 }
517 set_current_state(TASK_INTERRUPTIBLE);
518 add_wait_queue(&kauditd_wait, &wait);
519 516
520 if (!skb_queue_len(&audit_skb_queue)) { 517 wait_event_freezable(kauditd_wait, skb_queue_len(&audit_skb_queue));
521 try_to_freeze();
522 schedule();
523 }
524
525 __set_current_state(TASK_RUNNING);
526 remove_wait_queue(&kauditd_wait, &wait);
527 } 518 }
528 return 0; 519 return 0;
529} 520}
@@ -724,7 +715,7 @@ static int audit_get_feature(struct sk_buff *skb)
724 715
725 seq = nlmsg_hdr(skb)->nlmsg_seq; 716 seq = nlmsg_hdr(skb)->nlmsg_seq;
726 717
727 audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &af, sizeof(af)); 718 audit_send_reply(skb, seq, AUDIT_GET_FEATURE, 0, 0, &af, sizeof(af));
728 719
729 return 0; 720 return 0;
730} 721}
@@ -739,7 +730,7 @@ static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature
739 730
740 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_FEATURE_CHANGE); 731 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_FEATURE_CHANGE);
741 audit_log_task_info(ab, current); 732 audit_log_task_info(ab, current);
742 audit_log_format(ab, "feature=%s old=%u new=%u old_lock=%u new_lock=%u res=%d", 733 audit_log_format(ab, " feature=%s old=%u new=%u old_lock=%u new_lock=%u res=%d",
743 audit_feature_names[which], !!old_feature, !!new_feature, 734 audit_feature_names[which], !!old_feature, !!new_feature,
744 !!old_lock, !!new_lock, res); 735 !!old_lock, !!new_lock, res);
745 audit_log_end(ab); 736 audit_log_end(ab);
@@ -750,7 +741,7 @@ static int audit_set_feature(struct sk_buff *skb)
750 struct audit_features *uaf; 741 struct audit_features *uaf;
751 int i; 742 int i;
752 743
753 BUILD_BUG_ON(AUDIT_LAST_FEATURE + 1 > sizeof(audit_feature_names)/sizeof(audit_feature_names[0])); 744 BUILD_BUG_ON(AUDIT_LAST_FEATURE + 1 > ARRAY_SIZE(audit_feature_names));
754 uaf = nlmsg_data(nlmsg_hdr(skb)); 745 uaf = nlmsg_data(nlmsg_hdr(skb));
755 746
756 /* if there is ever a version 2 we should handle that here */ 747 /* if there is ever a version 2 we should handle that here */
@@ -842,7 +833,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
842 s.backlog_limit = audit_backlog_limit; 833 s.backlog_limit = audit_backlog_limit;
843 s.lost = atomic_read(&audit_lost); 834 s.lost = atomic_read(&audit_lost);
844 s.backlog = skb_queue_len(&audit_skb_queue); 835 s.backlog = skb_queue_len(&audit_skb_queue);
845 s.version = AUDIT_VERSION_LATEST; 836 s.feature_bitmap = AUDIT_FEATURE_BITMAP_ALL;
846 s.backlog_wait_time = audit_backlog_wait_time; 837 s.backlog_wait_time = audit_backlog_wait_time;
847 audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s)); 838 audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s));
848 break; 839 break;
@@ -1109,7 +1100,7 @@ static void audit_receive(struct sk_buff *skb)
1109} 1100}
1110 1101
1111/* Run custom bind function on netlink socket group connect or bind requests. */ 1102/* Run custom bind function on netlink socket group connect or bind requests. */
1112static int audit_bind(int group) 1103static int audit_bind(struct net *net, int group)
1113{ 1104{
1114 if (!capable(CAP_AUDIT_READ)) 1105 if (!capable(CAP_AUDIT_READ))
1115 return -EPERM; 1106 return -EPERM;
@@ -1301,19 +1292,9 @@ err:
1301 */ 1292 */
1302unsigned int audit_serial(void) 1293unsigned int audit_serial(void)
1303{ 1294{
1304 static DEFINE_SPINLOCK(serial_lock); 1295 static atomic_t serial = ATOMIC_INIT(0);
1305 static unsigned int serial = 0;
1306
1307 unsigned long flags;
1308 unsigned int ret;
1309
1310 spin_lock_irqsave(&serial_lock, flags);
1311 do {
1312 ret = ++serial;
1313 } while (unlikely(!ret));
1314 spin_unlock_irqrestore(&serial_lock, flags);
1315 1296
1316 return ret; 1297 return atomic_add_return(1, &serial);
1317} 1298}
1318 1299
1319static inline void audit_get_stamp(struct audit_context *ctx, 1300static inline void audit_get_stamp(struct audit_context *ctx,
@@ -1681,7 +1662,7 @@ void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap)
1681 } 1662 }
1682} 1663}
1683 1664
1684void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) 1665static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
1685{ 1666{
1686 kernel_cap_t *perm = &name->fcap.permitted; 1667 kernel_cap_t *perm = &name->fcap.permitted;
1687 kernel_cap_t *inh = &name->fcap.inheritable; 1668 kernel_cap_t *inh = &name->fcap.inheritable;
@@ -1860,7 +1841,7 @@ EXPORT_SYMBOL(audit_log_task_context);
1860void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) 1841void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
1861{ 1842{
1862 const struct cred *cred; 1843 const struct cred *cred;
1863 char name[sizeof(tsk->comm)]; 1844 char comm[sizeof(tsk->comm)];
1864 struct mm_struct *mm = tsk->mm; 1845 struct mm_struct *mm = tsk->mm;
1865 char *tty; 1846 char *tty;
1866 1847
@@ -1894,9 +1875,8 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
1894 from_kgid(&init_user_ns, cred->fsgid), 1875 from_kgid(&init_user_ns, cred->fsgid),
1895 tty, audit_get_sessionid(tsk)); 1876 tty, audit_get_sessionid(tsk));
1896 1877
1897 get_task_comm(name, tsk);
1898 audit_log_format(ab, " comm="); 1878 audit_log_format(ab, " comm=");
1899 audit_log_untrustedstring(ab, name); 1879 audit_log_untrustedstring(ab, get_task_comm(comm, tsk));
1900 1880
1901 if (mm) { 1881 if (mm) {
1902 down_read(&mm->mmap_sem); 1882 down_read(&mm->mmap_sem);
@@ -1959,7 +1939,8 @@ void audit_log_end(struct audit_buffer *ab)
1959 } else { 1939 } else {
1960 struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); 1940 struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
1961 1941
1962 kauditd_send_multicast_skb(ab->skb); 1942 nlh->nlmsg_len = ab->skb->len;
1943 kauditd_send_multicast_skb(ab->skb, ab->gfp_mask);
1963 1944
1964 /* 1945 /*
1965 * The original kaudit unicast socket sends up messages with 1946 * The original kaudit unicast socket sends up messages with
@@ -1970,7 +1951,7 @@ void audit_log_end(struct audit_buffer *ab)
1970 * protocol between the kaudit kernel subsystem and the auditd 1951 * protocol between the kaudit kernel subsystem and the auditd
1971 * userspace code. 1952 * userspace code.
1972 */ 1953 */
1973 nlh->nlmsg_len = ab->skb->len - NLMSG_HDRLEN; 1954 nlh->nlmsg_len -= NLMSG_HDRLEN;
1974 1955
1975 if (audit_pid) { 1956 if (audit_pid) {
1976 skb_queue_tail(&audit_skb_queue, ab->skb); 1957 skb_queue_tail(&audit_skb_queue, ab->skb);
diff --git a/kernel/audit.h b/kernel/audit.h
index 7bb65730c890..3cdffad5a1d9 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -222,7 +222,6 @@ extern void audit_copy_inode(struct audit_names *name,
222 const struct inode *inode); 222 const struct inode *inode);
223extern void audit_log_cap(struct audit_buffer *ab, char *prefix, 223extern void audit_log_cap(struct audit_buffer *ab, char *prefix,
224 kernel_cap_t *cap); 224 kernel_cap_t *cap);
225extern void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name);
226extern void audit_log_name(struct audit_context *context, 225extern void audit_log_name(struct audit_context *context,
227 struct audit_names *n, struct path *path, 226 struct audit_names *n, struct path *path,
228 int record_num, int *call_panic); 227 int record_num, int *call_panic);
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 135944a7b28a..2e0c97427b33 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -154,6 +154,7 @@ static struct audit_chunk *alloc_chunk(int count)
154 chunk->owners[i].index = i; 154 chunk->owners[i].index = i;
155 } 155 }
156 fsnotify_init_mark(&chunk->mark, audit_tree_destroy_watch); 156 fsnotify_init_mark(&chunk->mark, audit_tree_destroy_watch);
157 chunk->mark.mask = FS_IN_IGNORED;
157 return chunk; 158 return chunk;
158} 159}
159 160
@@ -173,9 +174,9 @@ static void insert_hash(struct audit_chunk *chunk)
173 struct fsnotify_mark *entry = &chunk->mark; 174 struct fsnotify_mark *entry = &chunk->mark;
174 struct list_head *list; 175 struct list_head *list;
175 176
176 if (!entry->i.inode) 177 if (!entry->inode)
177 return; 178 return;
178 list = chunk_hash(entry->i.inode); 179 list = chunk_hash(entry->inode);
179 list_add_rcu(&chunk->hash, list); 180 list_add_rcu(&chunk->hash, list);
180} 181}
181 182
@@ -187,7 +188,7 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode)
187 188
188 list_for_each_entry_rcu(p, list, hash) { 189 list_for_each_entry_rcu(p, list, hash) {
189 /* mark.inode may have gone NULL, but who cares? */ 190 /* mark.inode may have gone NULL, but who cares? */
190 if (p->mark.i.inode == inode) { 191 if (p->mark.inode == inode) {
191 atomic_long_inc(&p->refs); 192 atomic_long_inc(&p->refs);
192 return p; 193 return p;
193 } 194 }
@@ -230,7 +231,7 @@ static void untag_chunk(struct node *p)
230 new = alloc_chunk(size); 231 new = alloc_chunk(size);
231 232
232 spin_lock(&entry->lock); 233 spin_lock(&entry->lock);
233 if (chunk->dead || !entry->i.inode) { 234 if (chunk->dead || !entry->inode) {
234 spin_unlock(&entry->lock); 235 spin_unlock(&entry->lock);
235 if (new) 236 if (new)
236 free_chunk(new); 237 free_chunk(new);
@@ -257,7 +258,7 @@ static void untag_chunk(struct node *p)
257 goto Fallback; 258 goto Fallback;
258 259
259 fsnotify_duplicate_mark(&new->mark, entry); 260 fsnotify_duplicate_mark(&new->mark, entry);
260 if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) { 261 if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.inode, NULL, 1)) {
261 fsnotify_put_mark(&new->mark); 262 fsnotify_put_mark(&new->mark);
262 goto Fallback; 263 goto Fallback;
263 } 264 }
@@ -385,7 +386,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
385 chunk_entry = &chunk->mark; 386 chunk_entry = &chunk->mark;
386 387
387 spin_lock(&old_entry->lock); 388 spin_lock(&old_entry->lock);
388 if (!old_entry->i.inode) { 389 if (!old_entry->inode) {
389 /* old_entry is being shot, lets just lie */ 390 /* old_entry is being shot, lets just lie */
390 spin_unlock(&old_entry->lock); 391 spin_unlock(&old_entry->lock);
391 fsnotify_put_mark(old_entry); 392 fsnotify_put_mark(old_entry);
@@ -394,7 +395,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
394 } 395 }
395 396
396 fsnotify_duplicate_mark(chunk_entry, old_entry); 397 fsnotify_duplicate_mark(chunk_entry, old_entry);
397 if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->i.inode, NULL, 1)) { 398 if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->inode, NULL, 1)) {
398 spin_unlock(&old_entry->lock); 399 spin_unlock(&old_entry->lock);
399 fsnotify_put_mark(chunk_entry); 400 fsnotify_put_mark(chunk_entry);
400 fsnotify_put_mark(old_entry); 401 fsnotify_put_mark(old_entry);
@@ -449,7 +450,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
449 return 0; 450 return 0;
450} 451}
451 452
452static void audit_log_remove_rule(struct audit_krule *rule) 453static void audit_tree_log_remove_rule(struct audit_krule *rule)
453{ 454{
454 struct audit_buffer *ab; 455 struct audit_buffer *ab;
455 456
@@ -457,7 +458,7 @@ static void audit_log_remove_rule(struct audit_krule *rule)
457 if (unlikely(!ab)) 458 if (unlikely(!ab))
458 return; 459 return;
459 audit_log_format(ab, "op="); 460 audit_log_format(ab, "op=");
460 audit_log_string(ab, "remove rule"); 461 audit_log_string(ab, "remove_rule");
461 audit_log_format(ab, " dir="); 462 audit_log_format(ab, " dir=");
462 audit_log_untrustedstring(ab, rule->tree->pathname); 463 audit_log_untrustedstring(ab, rule->tree->pathname);
463 audit_log_key(ab, rule->filterkey); 464 audit_log_key(ab, rule->filterkey);
@@ -476,7 +477,7 @@ static void kill_rules(struct audit_tree *tree)
476 list_del_init(&rule->rlist); 477 list_del_init(&rule->rlist);
477 if (rule->tree) { 478 if (rule->tree) {
478 /* not a half-baked one */ 479 /* not a half-baked one */
479 audit_log_remove_rule(rule); 480 audit_tree_log_remove_rule(rule);
480 rule->tree = NULL; 481 rule->tree = NULL;
481 list_del_rcu(&entry->list); 482 list_del_rcu(&entry->list);
482 list_del(&entry->rule.list); 483 list_del(&entry->rule.list);
@@ -610,7 +611,7 @@ void audit_trim_trees(void)
610 list_for_each_entry(node, &tree->chunks, list) { 611 list_for_each_entry(node, &tree->chunks, list) {
611 struct audit_chunk *chunk = find_chunk(node); 612 struct audit_chunk *chunk = find_chunk(node);
612 /* this could be NULL if the watch is dying else where... */ 613 /* this could be NULL if the watch is dying else where... */
613 struct inode *inode = chunk->mark.i.inode; 614 struct inode *inode = chunk->mark.inode;
614 node->index |= 1U<<31; 615 node->index |= 1U<<31;
615 if (iterate_mounts(compare_root, inode, root_mnt)) 616 if (iterate_mounts(compare_root, inode, root_mnt))
616 node->index &= ~(1U<<31); 617 node->index &= ~(1U<<31);
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 70b4554d2fbe..ad9c1682f616 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -314,7 +314,7 @@ static void audit_update_watch(struct audit_parent *parent,
314 &nentry->rule.list); 314 &nentry->rule.list);
315 } 315 }
316 316
317 audit_watch_log_rule_change(r, owatch, "updated rules"); 317 audit_watch_log_rule_change(r, owatch, "updated_rules");
318 318
319 call_rcu(&oentry->rcu, audit_free_rule_rcu); 319 call_rcu(&oentry->rcu, audit_free_rule_rcu);
320 } 320 }
@@ -342,7 +342,7 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
342 list_for_each_entry_safe(w, nextw, &parent->watches, wlist) { 342 list_for_each_entry_safe(w, nextw, &parent->watches, wlist) {
343 list_for_each_entry_safe(r, nextr, &w->rules, rlist) { 343 list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
344 e = container_of(r, struct audit_entry, rule); 344 e = container_of(r, struct audit_entry, rule);
345 audit_watch_log_rule_change(r, w, "remove rule"); 345 audit_watch_log_rule_change(r, w, "remove_rule");
346 list_del(&r->rlist); 346 list_del(&r->rlist);
347 list_del(&r->list); 347 list_del(&r->list);
348 list_del_rcu(&e->list); 348 list_del_rcu(&e->list);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index c447cd9848d1..4f68a326d92e 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -71,6 +71,24 @@ static struct list_head audit_rules_list[AUDIT_NR_FILTERS] = {
71 71
72DEFINE_MUTEX(audit_filter_mutex); 72DEFINE_MUTEX(audit_filter_mutex);
73 73
74static void audit_free_lsm_field(struct audit_field *f)
75{
76 switch (f->type) {
77 case AUDIT_SUBJ_USER:
78 case AUDIT_SUBJ_ROLE:
79 case AUDIT_SUBJ_TYPE:
80 case AUDIT_SUBJ_SEN:
81 case AUDIT_SUBJ_CLR:
82 case AUDIT_OBJ_USER:
83 case AUDIT_OBJ_ROLE:
84 case AUDIT_OBJ_TYPE:
85 case AUDIT_OBJ_LEV_LOW:
86 case AUDIT_OBJ_LEV_HIGH:
87 kfree(f->lsm_str);
88 security_audit_rule_free(f->lsm_rule);
89 }
90}
91
74static inline void audit_free_rule(struct audit_entry *e) 92static inline void audit_free_rule(struct audit_entry *e)
75{ 93{
76 int i; 94 int i;
@@ -80,11 +98,8 @@ static inline void audit_free_rule(struct audit_entry *e)
80 if (erule->watch) 98 if (erule->watch)
81 audit_put_watch(erule->watch); 99 audit_put_watch(erule->watch);
82 if (erule->fields) 100 if (erule->fields)
83 for (i = 0; i < erule->field_count; i++) { 101 for (i = 0; i < erule->field_count; i++)
84 struct audit_field *f = &erule->fields[i]; 102 audit_free_lsm_field(&erule->fields[i]);
85 kfree(f->lsm_str);
86 security_audit_rule_free(f->lsm_rule);
87 }
88 kfree(erule->fields); 103 kfree(erule->fields);
89 kfree(erule->filterkey); 104 kfree(erule->filterkey);
90 kfree(e); 105 kfree(e);
@@ -148,7 +163,7 @@ static inline int audit_to_inode(struct audit_krule *krule,
148 struct audit_field *f) 163 struct audit_field *f)
149{ 164{
150 if (krule->listnr != AUDIT_FILTER_EXIT || 165 if (krule->listnr != AUDIT_FILTER_EXIT ||
151 krule->watch || krule->inode_f || krule->tree || 166 krule->inode_f || krule->watch || krule->tree ||
152 (f->op != Audit_equal && f->op != Audit_not_equal)) 167 (f->op != Audit_equal && f->op != Audit_not_equal))
153 return -EINVAL; 168 return -EINVAL;
154 169
@@ -422,28 +437,12 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
422 437
423 f->type = data->fields[i]; 438 f->type = data->fields[i];
424 f->val = data->values[i]; 439 f->val = data->values[i];
425 f->uid = INVALID_UID;
426 f->gid = INVALID_GID;
427 f->lsm_str = NULL;
428 f->lsm_rule = NULL;
429 440
430 /* Support legacy tests for a valid loginuid */ 441 /* Support legacy tests for a valid loginuid */
431 if ((f->type == AUDIT_LOGINUID) && (f->val == AUDIT_UID_UNSET)) { 442 if ((f->type == AUDIT_LOGINUID) && (f->val == AUDIT_UID_UNSET)) {
432 f->type = AUDIT_LOGINUID_SET; 443 f->type = AUDIT_LOGINUID_SET;
433 f->val = 0; 444 f->val = 0;
434 } 445 entry->rule.pflags |= AUDIT_LOGINUID_LEGACY;
435
436 if ((f->type == AUDIT_PID) || (f->type == AUDIT_PPID)) {
437 struct pid *pid;
438 rcu_read_lock();
439 pid = find_vpid(f->val);
440 if (!pid) {
441 rcu_read_unlock();
442 err = -ESRCH;
443 goto exit_free;
444 }
445 f->val = pid_nr(pid);
446 rcu_read_unlock();
447 } 446 }
448 447
449 err = audit_field_valid(entry, f); 448 err = audit_field_valid(entry, f);
@@ -619,6 +618,13 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
619 data->buflen += data->values[i] = 618 data->buflen += data->values[i] =
620 audit_pack_string(&bufp, krule->filterkey); 619 audit_pack_string(&bufp, krule->filterkey);
621 break; 620 break;
621 case AUDIT_LOGINUID_SET:
622 if (krule->pflags & AUDIT_LOGINUID_LEGACY && !f->val) {
623 data->fields[i] = AUDIT_LOGINUID;
624 data->values[i] = AUDIT_UID_UNSET;
625 break;
626 }
627 /* fallthrough if set */
622 default: 628 default:
623 data->values[i] = f->val; 629 data->values[i] = f->val;
624 } 630 }
@@ -635,6 +641,7 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
635 int i; 641 int i;
636 642
637 if (a->flags != b->flags || 643 if (a->flags != b->flags ||
644 a->pflags != b->pflags ||
638 a->listnr != b->listnr || 645 a->listnr != b->listnr ||
639 a->action != b->action || 646 a->action != b->action ||
640 a->field_count != b->field_count) 647 a->field_count != b->field_count)
@@ -753,6 +760,7 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old)
753 new = &entry->rule; 760 new = &entry->rule;
754 new->vers_ops = old->vers_ops; 761 new->vers_ops = old->vers_ops;
755 new->flags = old->flags; 762 new->flags = old->flags;
763 new->pflags = old->pflags;
756 new->listnr = old->listnr; 764 new->listnr = old->listnr;
757 new->action = old->action; 765 new->action = old->action;
758 for (i = 0; i < AUDIT_BITMASK_SIZE; i++) 766 for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
@@ -1053,30 +1061,27 @@ int audit_rule_change(int type, __u32 portid, int seq, void *data,
1053 int err = 0; 1061 int err = 0;
1054 struct audit_entry *entry; 1062 struct audit_entry *entry;
1055 1063
1064 entry = audit_data_to_entry(data, datasz);
1065 if (IS_ERR(entry))
1066 return PTR_ERR(entry);
1067
1056 switch (type) { 1068 switch (type) {
1057 case AUDIT_ADD_RULE: 1069 case AUDIT_ADD_RULE:
1058 entry = audit_data_to_entry(data, datasz);
1059 if (IS_ERR(entry))
1060 return PTR_ERR(entry);
1061
1062 err = audit_add_rule(entry); 1070 err = audit_add_rule(entry);
1063 audit_log_rule_change("add rule", &entry->rule, !err); 1071 audit_log_rule_change("add_rule", &entry->rule, !err);
1064 if (err)
1065 audit_free_rule(entry);
1066 break; 1072 break;
1067 case AUDIT_DEL_RULE: 1073 case AUDIT_DEL_RULE:
1068 entry = audit_data_to_entry(data, datasz);
1069 if (IS_ERR(entry))
1070 return PTR_ERR(entry);
1071
1072 err = audit_del_rule(entry); 1074 err = audit_del_rule(entry);
1073 audit_log_rule_change("remove rule", &entry->rule, !err); 1075 audit_log_rule_change("remove_rule", &entry->rule, !err);
1074 audit_free_rule(entry);
1075 break; 1076 break;
1076 default: 1077 default:
1077 return -EINVAL; 1078 err = -EINVAL;
1079 WARN_ON(1);
1078 } 1080 }
1079 1081
1082 if (err || type == AUDIT_DEL_RULE)
1083 audit_free_rule(entry);
1084
1080 return err; 1085 return err;
1081} 1086}
1082 1087
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 21eae3c05ec0..072566dd0caf 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -67,10 +67,13 @@
67#include <linux/binfmts.h> 67#include <linux/binfmts.h>
68#include <linux/highmem.h> 68#include <linux/highmem.h>
69#include <linux/syscalls.h> 69#include <linux/syscalls.h>
70#include <asm/syscall.h>
70#include <linux/capability.h> 71#include <linux/capability.h>
71#include <linux/fs_struct.h> 72#include <linux/fs_struct.h>
72#include <linux/compat.h> 73#include <linux/compat.h>
73#include <linux/ctype.h> 74#include <linux/ctype.h>
75#include <linux/string.h>
76#include <uapi/linux/limits.h>
74 77
75#include "audit.h" 78#include "audit.h"
76 79
@@ -125,14 +128,6 @@ struct audit_tree_refs {
125 struct audit_chunk *c[31]; 128 struct audit_chunk *c[31];
126}; 129};
127 130
128static inline int open_arg(int flags, int mask)
129{
130 int n = ACC_MODE(flags);
131 if (flags & (O_TRUNC | O_CREAT))
132 n |= AUDIT_PERM_WRITE;
133 return n & mask;
134}
135
136static int audit_match_perm(struct audit_context *ctx, int mask) 131static int audit_match_perm(struct audit_context *ctx, int mask)
137{ 132{
138 unsigned n; 133 unsigned n;
@@ -1505,7 +1500,6 @@ void __audit_free(struct task_struct *tsk)
1505 1500
1506/** 1501/**
1507 * audit_syscall_entry - fill in an audit record at syscall entry 1502 * audit_syscall_entry - fill in an audit record at syscall entry
1508 * @arch: architecture type
1509 * @major: major syscall type (function) 1503 * @major: major syscall type (function)
1510 * @a1: additional syscall register 1 1504 * @a1: additional syscall register 1
1511 * @a2: additional syscall register 2 1505 * @a2: additional syscall register 2
@@ -1520,9 +1514,8 @@ void __audit_free(struct task_struct *tsk)
1520 * will only be written if another part of the kernel requests that it 1514 * will only be written if another part of the kernel requests that it
1521 * be written). 1515 * be written).
1522 */ 1516 */
1523void __audit_syscall_entry(int arch, int major, 1517void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2,
1524 unsigned long a1, unsigned long a2, 1518 unsigned long a3, unsigned long a4)
1525 unsigned long a3, unsigned long a4)
1526{ 1519{
1527 struct task_struct *tsk = current; 1520 struct task_struct *tsk = current;
1528 struct audit_context *context = tsk->audit_context; 1521 struct audit_context *context = tsk->audit_context;
@@ -1536,7 +1529,7 @@ void __audit_syscall_entry(int arch, int major,
1536 if (!audit_enabled) 1529 if (!audit_enabled)
1537 return; 1530 return;
1538 1531
1539 context->arch = arch; 1532 context->arch = syscall_get_arch();
1540 context->major = major; 1533 context->major = major;
1541 context->argv[0] = a1; 1534 context->argv[0] = a1;
1542 context->argv[1] = a2; 1535 context->argv[1] = a2;
@@ -1870,8 +1863,7 @@ void __audit_inode(struct filename *name, const struct dentry *dentry,
1870 } 1863 }
1871 1864
1872 list_for_each_entry_reverse(n, &context->names_list, list) { 1865 list_for_each_entry_reverse(n, &context->names_list, list) {
1873 /* does the name pointer match? */ 1866 if (!n->name || strcmp(n->name->name, name->name))
1874 if (!n->name || n->name->name != name->name)
1875 continue; 1867 continue;
1876 1868
1877 /* match the correct record type */ 1869 /* match the correct record type */
@@ -1886,12 +1878,48 @@ void __audit_inode(struct filename *name, const struct dentry *dentry,
1886 } 1878 }
1887 1879
1888out_alloc: 1880out_alloc:
1889 /* unable to find the name from a previous getname(). Allocate a new 1881 /* unable to find an entry with both a matching name and type */
1890 * anonymous entry. 1882 n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN);
1891 */
1892 n = audit_alloc_name(context, AUDIT_TYPE_NORMAL);
1893 if (!n) 1883 if (!n)
1894 return; 1884 return;
1885 /* unfortunately, while we may have a path name to record with the
1886 * inode, we can't always rely on the string lasting until the end of
1887 * the syscall so we need to create our own copy, it may fail due to
1888 * memory allocation issues, but we do our best */
1889 if (name) {
1890 /* we can't use getname_kernel() due to size limits */
1891 size_t len = strlen(name->name) + 1;
1892 struct filename *new = __getname();
1893
1894 if (unlikely(!new))
1895 goto out;
1896
1897 if (len <= (PATH_MAX - sizeof(*new))) {
1898 new->name = (char *)(new) + sizeof(*new);
1899 new->separate = false;
1900 } else if (len <= PATH_MAX) {
1901 /* this looks odd, but is due to final_putname() */
1902 struct filename *new2;
1903
1904 new2 = kmalloc(sizeof(*new2), GFP_KERNEL);
1905 if (unlikely(!new2)) {
1906 __putname(new);
1907 goto out;
1908 }
1909 new2->name = (char *)new;
1910 new2->separate = true;
1911 new = new2;
1912 } else {
1913 /* we should never get here, but let's be safe */
1914 __putname(new);
1915 goto out;
1916 }
1917 strlcpy((char *)new->name, name->name, len);
1918 new->uptr = NULL;
1919 new->aname = n;
1920 n->name = new;
1921 n->name_put = true;
1922 }
1895out: 1923out:
1896 if (parent) { 1924 if (parent) {
1897 n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL; 1925 n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL;
@@ -1906,6 +1934,11 @@ out:
1906 audit_copy_inode(n, dentry, inode); 1934 audit_copy_inode(n, dentry, inode);
1907} 1935}
1908 1936
1937void __audit_file(const struct file *file)
1938{
1939 __audit_inode(NULL, file->f_path.dentry, 0);
1940}
1941
1909/** 1942/**
1910 * __audit_inode_child - collect inode info for created/removed objects 1943 * __audit_inode_child - collect inode info for created/removed objects
1911 * @parent: inode of dentry parent 1944 * @parent: inode of dentry parent
@@ -2382,7 +2415,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
2382 ax->d.next = context->aux; 2415 ax->d.next = context->aux;
2383 context->aux = (void *)ax; 2416 context->aux = (void *)ax;
2384 2417
2385 dentry = dget(bprm->file->f_dentry); 2418 dentry = dget(bprm->file->f_path.dentry);
2386 get_vfs_caps_from_disk(dentry, &vcaps); 2419 get_vfs_caps_from_disk(dentry, &vcaps);
2387 dput(dentry); 2420 dput(dentry);
2388 2421
@@ -2406,7 +2439,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
2406 * @new: the new credentials 2439 * @new: the new credentials
2407 * @old: the old (current) credentials 2440 * @old: the old (current) credentials
2408 * 2441 *
2409 * Record the aguments userspace sent to sys_capset for later printing by the 2442 * Record the arguments userspace sent to sys_capset for later printing by the
2410 * audit system if applicable 2443 * audit system if applicable
2411 */ 2444 */
2412void __audit_log_capset(const struct cred *new, const struct cred *old) 2445void __audit_log_capset(const struct cred *new, const struct cred *old)
@@ -2433,6 +2466,7 @@ static void audit_log_task(struct audit_buffer *ab)
2433 kgid_t gid; 2466 kgid_t gid;
2434 unsigned int sessionid; 2467 unsigned int sessionid;
2435 struct mm_struct *mm = current->mm; 2468 struct mm_struct *mm = current->mm;
2469 char comm[sizeof(current->comm)];
2436 2470
2437 auid = audit_get_loginuid(current); 2471 auid = audit_get_loginuid(current);
2438 sessionid = audit_get_sessionid(current); 2472 sessionid = audit_get_sessionid(current);
@@ -2445,7 +2479,7 @@ static void audit_log_task(struct audit_buffer *ab)
2445 sessionid); 2479 sessionid);
2446 audit_log_task_context(ab); 2480 audit_log_task_context(ab);
2447 audit_log_format(ab, " pid=%d comm=", task_pid_nr(current)); 2481 audit_log_format(ab, " pid=%d comm=", task_pid_nr(current));
2448 audit_log_untrustedstring(ab, current->comm); 2482 audit_log_untrustedstring(ab, get_task_comm(comm, current));
2449 if (mm) { 2483 if (mm) {
2450 down_read(&mm->mmap_sem); 2484 down_read(&mm->mmap_sem);
2451 if (mm->exe_file) 2485 if (mm->exe_file)
@@ -2488,11 +2522,9 @@ void __audit_seccomp(unsigned long syscall, long signr, int code)
2488 if (unlikely(!ab)) 2522 if (unlikely(!ab))
2489 return; 2523 return;
2490 audit_log_task(ab); 2524 audit_log_task(ab);
2491 audit_log_format(ab, " sig=%ld", signr); 2525 audit_log_format(ab, " sig=%ld arch=%x syscall=%ld compat=%d ip=0x%lx code=0x%x",
2492 audit_log_format(ab, " syscall=%ld", syscall); 2526 signr, syscall_get_arch(), syscall, is_compat_task(),
2493 audit_log_format(ab, " compat=%d", is_compat_task()); 2527 KSTK_EIP(current), code);
2494 audit_log_format(ab, " ip=0x%lx", KSTK_EIP(current));
2495 audit_log_format(ab, " code=0x%x", code);
2496 audit_log_end(ab); 2528 audit_log_end(ab);
2497} 2529}
2498 2530
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 6a71145e2769..a5ae60f0b0a2 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1 +1,5 @@
1obj-y := core.o 1obj-y := core.o
2obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o arraymap.o helpers.o
3ifdef CONFIG_TEST_BPF
4obj-$(CONFIG_BPF_SYSCALL) += test_stub.o
5endif
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
new file mode 100644
index 000000000000..9eb4d8a7cd87
--- /dev/null
+++ b/kernel/bpf/arraymap.c
@@ -0,0 +1,156 @@
1/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 *
7 * This program is distributed in the hope that it will be useful, but
8 * WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
10 * General Public License for more details.
11 */
12#include <linux/bpf.h>
13#include <linux/err.h>
14#include <linux/vmalloc.h>
15#include <linux/slab.h>
16#include <linux/mm.h>
17
18struct bpf_array {
19 struct bpf_map map;
20 u32 elem_size;
21 char value[0] __aligned(8);
22};
23
24/* Called from syscall */
25static struct bpf_map *array_map_alloc(union bpf_attr *attr)
26{
27 struct bpf_array *array;
28 u32 elem_size, array_size;
29
30 /* check sanity of attributes */
31 if (attr->max_entries == 0 || attr->key_size != 4 ||
32 attr->value_size == 0)
33 return ERR_PTR(-EINVAL);
34
35 elem_size = round_up(attr->value_size, 8);
36
37 /* check round_up into zero and u32 overflow */
38 if (elem_size == 0 ||
39 attr->max_entries > (U32_MAX - sizeof(*array)) / elem_size)
40 return ERR_PTR(-ENOMEM);
41
42 array_size = sizeof(*array) + attr->max_entries * elem_size;
43
44 /* allocate all map elements and zero-initialize them */
45 array = kzalloc(array_size, GFP_USER | __GFP_NOWARN);
46 if (!array) {
47 array = vzalloc(array_size);
48 if (!array)
49 return ERR_PTR(-ENOMEM);
50 }
51
52 /* copy mandatory map attributes */
53 array->map.key_size = attr->key_size;
54 array->map.value_size = attr->value_size;
55 array->map.max_entries = attr->max_entries;
56
57 array->elem_size = elem_size;
58
59 return &array->map;
60}
61
62/* Called from syscall or from eBPF program */
63static void *array_map_lookup_elem(struct bpf_map *map, void *key)
64{
65 struct bpf_array *array = container_of(map, struct bpf_array, map);
66 u32 index = *(u32 *)key;
67
68 if (index >= array->map.max_entries)
69 return NULL;
70
71 return array->value + array->elem_size * index;
72}
73
74/* Called from syscall */
75static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
76{
77 struct bpf_array *array = container_of(map, struct bpf_array, map);
78 u32 index = *(u32 *)key;
79 u32 *next = (u32 *)next_key;
80
81 if (index >= array->map.max_entries) {
82 *next = 0;
83 return 0;
84 }
85
86 if (index == array->map.max_entries - 1)
87 return -ENOENT;
88
89 *next = index + 1;
90 return 0;
91}
92
93/* Called from syscall or from eBPF program */
94static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
95 u64 map_flags)
96{
97 struct bpf_array *array = container_of(map, struct bpf_array, map);
98 u32 index = *(u32 *)key;
99
100 if (map_flags > BPF_EXIST)
101 /* unknown flags */
102 return -EINVAL;
103
104 if (index >= array->map.max_entries)
105 /* all elements were pre-allocated, cannot insert a new one */
106 return -E2BIG;
107
108 if (map_flags == BPF_NOEXIST)
109 /* all elements already exist */
110 return -EEXIST;
111
112 memcpy(array->value + array->elem_size * index, value, array->elem_size);
113 return 0;
114}
115
116/* Called from syscall or from eBPF program */
117static int array_map_delete_elem(struct bpf_map *map, void *key)
118{
119 return -EINVAL;
120}
121
122/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
123static void array_map_free(struct bpf_map *map)
124{
125 struct bpf_array *array = container_of(map, struct bpf_array, map);
126
127 /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
128 * so the programs (can be more than one that used this map) were
129 * disconnected from events. Wait for outstanding programs to complete
130 * and free the array
131 */
132 synchronize_rcu();
133
134 kvfree(array);
135}
136
137static struct bpf_map_ops array_ops = {
138 .map_alloc = array_map_alloc,
139 .map_free = array_map_free,
140 .map_get_next_key = array_map_get_next_key,
141 .map_lookup_elem = array_map_lookup_elem,
142 .map_update_elem = array_map_update_elem,
143 .map_delete_elem = array_map_delete_elem,
144};
145
146static struct bpf_map_type_list tl = {
147 .ops = &array_ops,
148 .type = BPF_MAP_TYPE_ARRAY,
149};
150
151static int __init register_array_map(void)
152{
153 bpf_register_map_type(&tl);
154 return 0;
155}
156late_initcall(register_array_map);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 7f0dbcbb34af..d6594e457a25 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -20,9 +20,14 @@
20 * Andi Kleen - Fix a few bad bugs and races. 20 * Andi Kleen - Fix a few bad bugs and races.
21 * Kris Katterjohn - Added many additional checks in bpf_check_classic() 21 * Kris Katterjohn - Added many additional checks in bpf_check_classic()
22 */ 22 */
23
23#include <linux/filter.h> 24#include <linux/filter.h>
24#include <linux/skbuff.h> 25#include <linux/skbuff.h>
26#include <linux/vmalloc.h>
27#include <linux/random.h>
28#include <linux/moduleloader.h>
25#include <asm/unaligned.h> 29#include <asm/unaligned.h>
30#include <linux/bpf.h>
26 31
27/* Registers */ 32/* Registers */
28#define BPF_R0 regs[BPF_REG_0] 33#define BPF_R0 regs[BPF_REG_0]
@@ -63,6 +68,105 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns
63 return NULL; 68 return NULL;
64} 69}
65 70
71struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
72{
73 gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO |
74 gfp_extra_flags;
75 struct bpf_prog_aux *aux;
76 struct bpf_prog *fp;
77
78 size = round_up(size, PAGE_SIZE);
79 fp = __vmalloc(size, gfp_flags, PAGE_KERNEL);
80 if (fp == NULL)
81 return NULL;
82
83 aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags);
84 if (aux == NULL) {
85 vfree(fp);
86 return NULL;
87 }
88
89 fp->pages = size / PAGE_SIZE;
90 fp->aux = aux;
91
92 return fp;
93}
94EXPORT_SYMBOL_GPL(bpf_prog_alloc);
95
96struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
97 gfp_t gfp_extra_flags)
98{
99 gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO |
100 gfp_extra_flags;
101 struct bpf_prog *fp;
102
103 BUG_ON(fp_old == NULL);
104
105 size = round_up(size, PAGE_SIZE);
106 if (size <= fp_old->pages * PAGE_SIZE)
107 return fp_old;
108
109 fp = __vmalloc(size, gfp_flags, PAGE_KERNEL);
110 if (fp != NULL) {
111 memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE);
112 fp->pages = size / PAGE_SIZE;
113
114 /* We keep fp->aux from fp_old around in the new
115 * reallocated structure.
116 */
117 fp_old->aux = NULL;
118 __bpf_prog_free(fp_old);
119 }
120
121 return fp;
122}
123EXPORT_SYMBOL_GPL(bpf_prog_realloc);
124
125void __bpf_prog_free(struct bpf_prog *fp)
126{
127 kfree(fp->aux);
128 vfree(fp);
129}
130EXPORT_SYMBOL_GPL(__bpf_prog_free);
131
132#ifdef CONFIG_BPF_JIT
133struct bpf_binary_header *
134bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
135 unsigned int alignment,
136 bpf_jit_fill_hole_t bpf_fill_ill_insns)
137{
138 struct bpf_binary_header *hdr;
139 unsigned int size, hole, start;
140
141 /* Most of BPF filters are really small, but if some of them
142 * fill a page, allow at least 128 extra bytes to insert a
143 * random section of illegal instructions.
144 */
145 size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE);
146 hdr = module_alloc(size);
147 if (hdr == NULL)
148 return NULL;
149
150 /* Fill space with illegal/arch-dep instructions. */
151 bpf_fill_ill_insns(hdr, size);
152
153 hdr->pages = size / PAGE_SIZE;
154 hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
155 PAGE_SIZE - sizeof(*hdr));
156 start = (prandom_u32() % hole) & ~(alignment - 1);
157
158 /* Leave a random number of instructions before BPF code. */
159 *image_ptr = &hdr->image[start];
160
161 return hdr;
162}
163
164void bpf_jit_binary_free(struct bpf_binary_header *hdr)
165{
166 module_free(NULL, hdr);
167}
168#endif /* CONFIG_BPF_JIT */
169
66/* Base function for offset calculation. Needs to go into .text section, 170/* Base function for offset calculation. Needs to go into .text section,
67 * therefore keeping it non-static as well; will also be used by JITs 171 * therefore keeping it non-static as well; will also be used by JITs
68 * anyway later on, so do not let the compiler omit it. 172 * anyway later on, so do not let the compiler omit it.
@@ -180,6 +284,7 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
180 [BPF_LD | BPF_IND | BPF_W] = &&LD_IND_W, 284 [BPF_LD | BPF_IND | BPF_W] = &&LD_IND_W,
181 [BPF_LD | BPF_IND | BPF_H] = &&LD_IND_H, 285 [BPF_LD | BPF_IND | BPF_H] = &&LD_IND_H,
182 [BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B, 286 [BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B,
287 [BPF_LD | BPF_IMM | BPF_DW] = &&LD_IMM_DW,
183 }; 288 };
184 void *ptr; 289 void *ptr;
185 int off; 290 int off;
@@ -239,6 +344,10 @@ select_insn:
239 ALU64_MOV_K: 344 ALU64_MOV_K:
240 DST = IMM; 345 DST = IMM;
241 CONT; 346 CONT;
347 LD_IMM_DW:
348 DST = (u64) (u32) insn[0].imm | ((u64) (u32) insn[1].imm) << 32;
349 insn++;
350 CONT;
242 ALU64_ARSH_X: 351 ALU64_ARSH_X:
243 (*(s64 *) &DST) >>= SRC; 352 (*(s64 *) &DST) >>= SRC;
244 CONT; 353 CONT;
@@ -523,12 +632,35 @@ void bpf_prog_select_runtime(struct bpf_prog *fp)
523 632
524 /* Probe if internal BPF can be JITed */ 633 /* Probe if internal BPF can be JITed */
525 bpf_int_jit_compile(fp); 634 bpf_int_jit_compile(fp);
635 /* Lock whole bpf_prog as read-only */
636 bpf_prog_lock_ro(fp);
526} 637}
527EXPORT_SYMBOL_GPL(bpf_prog_select_runtime); 638EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
528 639
529/* free internal BPF program */ 640static void bpf_prog_free_deferred(struct work_struct *work)
641{
642 struct bpf_prog_aux *aux;
643
644 aux = container_of(work, struct bpf_prog_aux, work);
645 bpf_jit_free(aux->prog);
646}
647
648/* Free internal BPF program */
530void bpf_prog_free(struct bpf_prog *fp) 649void bpf_prog_free(struct bpf_prog *fp)
531{ 650{
532 bpf_jit_free(fp); 651 struct bpf_prog_aux *aux = fp->aux;
652
653 INIT_WORK(&aux->work, bpf_prog_free_deferred);
654 aux->prog = fp;
655 schedule_work(&aux->work);
533} 656}
534EXPORT_SYMBOL_GPL(bpf_prog_free); 657EXPORT_SYMBOL_GPL(bpf_prog_free);
658
659/* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call
660 * skb_copy_bits(), so provide a weak definition of it for NET-less config.
661 */
662int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
663 int len)
664{
665 return -EFAULT;
666}
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
new file mode 100644
index 000000000000..b3ba43674310
--- /dev/null
+++ b/kernel/bpf/hashtab.c
@@ -0,0 +1,367 @@
1/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 *
7 * This program is distributed in the hope that it will be useful, but
8 * WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
10 * General Public License for more details.
11 */
12#include <linux/bpf.h>
13#include <linux/jhash.h>
14#include <linux/filter.h>
15#include <linux/vmalloc.h>
16
17struct bpf_htab {
18 struct bpf_map map;
19 struct hlist_head *buckets;
20 spinlock_t lock;
21 u32 count; /* number of elements in this hashtable */
22 u32 n_buckets; /* number of hash buckets */
23 u32 elem_size; /* size of each element in bytes */
24};
25
26/* each htab element is struct htab_elem + key + value */
27struct htab_elem {
28 struct hlist_node hash_node;
29 struct rcu_head rcu;
30 u32 hash;
31 char key[0] __aligned(8);
32};
33
34/* Called from syscall */
35static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
36{
37 struct bpf_htab *htab;
38 int err, i;
39
40 htab = kzalloc(sizeof(*htab), GFP_USER);
41 if (!htab)
42 return ERR_PTR(-ENOMEM);
43
44 /* mandatory map attributes */
45 htab->map.key_size = attr->key_size;
46 htab->map.value_size = attr->value_size;
47 htab->map.max_entries = attr->max_entries;
48
49 /* check sanity of attributes.
50 * value_size == 0 may be allowed in the future to use map as a set
51 */
52 err = -EINVAL;
53 if (htab->map.max_entries == 0 || htab->map.key_size == 0 ||
54 htab->map.value_size == 0)
55 goto free_htab;
56
57 /* hash table size must be power of 2 */
58 htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
59
60 err = -E2BIG;
61 if (htab->map.key_size > MAX_BPF_STACK)
62 /* eBPF programs initialize keys on stack, so they cannot be
63 * larger than max stack size
64 */
65 goto free_htab;
66
67 err = -ENOMEM;
68 /* prevent zero size kmalloc and check for u32 overflow */
69 if (htab->n_buckets == 0 ||
70 htab->n_buckets > U32_MAX / sizeof(struct hlist_head))
71 goto free_htab;
72
73 htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct hlist_head),
74 GFP_USER | __GFP_NOWARN);
75
76 if (!htab->buckets) {
77 htab->buckets = vmalloc(htab->n_buckets * sizeof(struct hlist_head));
78 if (!htab->buckets)
79 goto free_htab;
80 }
81
82 for (i = 0; i < htab->n_buckets; i++)
83 INIT_HLIST_HEAD(&htab->buckets[i]);
84
85 spin_lock_init(&htab->lock);
86 htab->count = 0;
87
88 htab->elem_size = sizeof(struct htab_elem) +
89 round_up(htab->map.key_size, 8) +
90 htab->map.value_size;
91 return &htab->map;
92
93free_htab:
94 kfree(htab);
95 return ERR_PTR(err);
96}
97
98static inline u32 htab_map_hash(const void *key, u32 key_len)
99{
100 return jhash(key, key_len, 0);
101}
102
103static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
104{
105 return &htab->buckets[hash & (htab->n_buckets - 1)];
106}
107
108static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash,
109 void *key, u32 key_size)
110{
111 struct htab_elem *l;
112
113 hlist_for_each_entry_rcu(l, head, hash_node)
114 if (l->hash == hash && !memcmp(&l->key, key, key_size))
115 return l;
116
117 return NULL;
118}
119
120/* Called from syscall or from eBPF program */
121static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
122{
123 struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
124 struct hlist_head *head;
125 struct htab_elem *l;
126 u32 hash, key_size;
127
128 /* Must be called with rcu_read_lock. */
129 WARN_ON_ONCE(!rcu_read_lock_held());
130
131 key_size = map->key_size;
132
133 hash = htab_map_hash(key, key_size);
134
135 head = select_bucket(htab, hash);
136
137 l = lookup_elem_raw(head, hash, key, key_size);
138
139 if (l)
140 return l->key + round_up(map->key_size, 8);
141
142 return NULL;
143}
144
145/* Called from syscall */
146static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
147{
148 struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
149 struct hlist_head *head;
150 struct htab_elem *l, *next_l;
151 u32 hash, key_size;
152 int i;
153
154 WARN_ON_ONCE(!rcu_read_lock_held());
155
156 key_size = map->key_size;
157
158 hash = htab_map_hash(key, key_size);
159
160 head = select_bucket(htab, hash);
161
162 /* lookup the key */
163 l = lookup_elem_raw(head, hash, key, key_size);
164
165 if (!l) {
166 i = 0;
167 goto find_first_elem;
168 }
169
170 /* key was found, get next key in the same bucket */
171 next_l = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&l->hash_node)),
172 struct htab_elem, hash_node);
173
174 if (next_l) {
175 /* if there is a next elem in this hash list, just return it */
176 memcpy(next_key, next_l->key, key_size);
177 return 0;
178 }
179
180 /* no more elements in this hash list, go to the next bucket */
181 i = hash & (htab->n_buckets - 1);
182 i++;
183
184find_first_elem:
185 /* iterate over buckets */
186 for (; i < htab->n_buckets; i++) {
187 head = select_bucket(htab, i);
188
189 /* pick first element in the bucket */
190 next_l = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
191 struct htab_elem, hash_node);
192 if (next_l) {
193 /* if it's not empty, just return it */
194 memcpy(next_key, next_l->key, key_size);
195 return 0;
196 }
197 }
198
199 /* iterated over all buckets and all elements */
200 return -ENOENT;
201}
202
203/* Called from syscall or from eBPF program */
204static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
205 u64 map_flags)
206{
207 struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
208 struct htab_elem *l_new, *l_old;
209 struct hlist_head *head;
210 unsigned long flags;
211 u32 key_size;
212 int ret;
213
214 if (map_flags > BPF_EXIST)
215 /* unknown flags */
216 return -EINVAL;
217
218 WARN_ON_ONCE(!rcu_read_lock_held());
219
220 /* allocate new element outside of lock */
221 l_new = kmalloc(htab->elem_size, GFP_ATOMIC);
222 if (!l_new)
223 return -ENOMEM;
224
225 key_size = map->key_size;
226
227 memcpy(l_new->key, key, key_size);
228 memcpy(l_new->key + round_up(key_size, 8), value, map->value_size);
229
230 l_new->hash = htab_map_hash(l_new->key, key_size);
231
232 /* bpf_map_update_elem() can be called in_irq() */
233 spin_lock_irqsave(&htab->lock, flags);
234
235 head = select_bucket(htab, l_new->hash);
236
237 l_old = lookup_elem_raw(head, l_new->hash, key, key_size);
238
239 if (!l_old && unlikely(htab->count >= map->max_entries)) {
240 /* if elem with this 'key' doesn't exist and we've reached
241 * max_entries limit, fail insertion of new elem
242 */
243 ret = -E2BIG;
244 goto err;
245 }
246
247 if (l_old && map_flags == BPF_NOEXIST) {
248 /* elem already exists */
249 ret = -EEXIST;
250 goto err;
251 }
252
253 if (!l_old && map_flags == BPF_EXIST) {
254 /* elem doesn't exist, cannot update it */
255 ret = -ENOENT;
256 goto err;
257 }
258
259 /* add new element to the head of the list, so that concurrent
260 * search will find it before old elem
261 */
262 hlist_add_head_rcu(&l_new->hash_node, head);
263 if (l_old) {
264 hlist_del_rcu(&l_old->hash_node);
265 kfree_rcu(l_old, rcu);
266 } else {
267 htab->count++;
268 }
269 spin_unlock_irqrestore(&htab->lock, flags);
270
271 return 0;
272err:
273 spin_unlock_irqrestore(&htab->lock, flags);
274 kfree(l_new);
275 return ret;
276}
277
278/* Called from syscall or from eBPF program */
279static int htab_map_delete_elem(struct bpf_map *map, void *key)
280{
281 struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
282 struct hlist_head *head;
283 struct htab_elem *l;
284 unsigned long flags;
285 u32 hash, key_size;
286 int ret = -ENOENT;
287
288 WARN_ON_ONCE(!rcu_read_lock_held());
289
290 key_size = map->key_size;
291
292 hash = htab_map_hash(key, key_size);
293
294 spin_lock_irqsave(&htab->lock, flags);
295
296 head = select_bucket(htab, hash);
297
298 l = lookup_elem_raw(head, hash, key, key_size);
299
300 if (l) {
301 hlist_del_rcu(&l->hash_node);
302 htab->count--;
303 kfree_rcu(l, rcu);
304 ret = 0;
305 }
306
307 spin_unlock_irqrestore(&htab->lock, flags);
308 return ret;
309}
310
311static void delete_all_elements(struct bpf_htab *htab)
312{
313 int i;
314
315 for (i = 0; i < htab->n_buckets; i++) {
316 struct hlist_head *head = select_bucket(htab, i);
317 struct hlist_node *n;
318 struct htab_elem *l;
319
320 hlist_for_each_entry_safe(l, n, head, hash_node) {
321 hlist_del_rcu(&l->hash_node);
322 htab->count--;
323 kfree(l);
324 }
325 }
326}
327
328/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
329static void htab_map_free(struct bpf_map *map)
330{
331 struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
332
333 /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
334 * so the programs (there can be more than one that used this map) were
335 * disconnected from events. Wait for outstanding critical sections in
336 * these programs to complete
337 */
338 synchronize_rcu();
339
340 /* some of the kfree_rcu() callbacks for elements of this map may not have
341 * executed yet. It's ok. Proceed to free residual elements and the map itself
342 */
343 delete_all_elements(htab);
344 kvfree(htab->buckets);
345 kfree(htab);
346}
347
348static struct bpf_map_ops htab_ops = {
349 .map_alloc = htab_map_alloc,
350 .map_free = htab_map_free,
351 .map_get_next_key = htab_map_get_next_key,
352 .map_lookup_elem = htab_map_lookup_elem,
353 .map_update_elem = htab_map_update_elem,
354 .map_delete_elem = htab_map_delete_elem,
355};
356
357static struct bpf_map_type_list tl = {
358 .ops = &htab_ops,
359 .type = BPF_MAP_TYPE_HASH,
360};
361
362static int __init register_htab_map(void)
363{
364 bpf_register_map_type(&tl);
365 return 0;
366}
367late_initcall(register_htab_map);
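The hash map above is exercised from user space through the bpf(2) syscall added in kernel/bpf/syscall.c below. As a minimal user-space sketch (not part of the patch; it assumes the uapi <linux/bpf.h> from this series, a __NR_bpf syscall number for the build target, a 64-bit build for the pointer casts, and CAP_SYS_ADMIN), creating a BPF_MAP_TYPE_HASH map and doing one update and one lookup could look like this; the sys_bpf() wrapper is an illustrative helper, not a library function:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

/* thin wrapper around the new syscall; unused attr fields must stay zero */
static long sys_bpf(int cmd, union bpf_attr *attr, unsigned int size)
{
	return syscall(__NR_bpf, cmd, attr, size);
}

int main(void)
{
	union bpf_attr attr;
	unsigned long long key = 1, value = 42, out;
	long map_fd;

	/* BPF_MAP_CREATE uses map_type/key_size/value_size/max_entries */
	memset(&attr, 0, sizeof(attr));
	attr.map_type = BPF_MAP_TYPE_HASH;
	attr.key_size = sizeof(key);
	attr.value_size = sizeof(value);
	attr.max_entries = 16;
	map_fd = sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
	if (map_fd < 0)
		return 1;

	/* insert key 1 -> 42; BPF_ANY means "create or update" */
	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key = (unsigned long) &key;
	attr.value = (unsigned long) &value;
	attr.flags = BPF_ANY;
	if (sys_bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)))
		return 1;

	/* read it back through htab_map_lookup_elem() */
	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key = (unsigned long) &key;
	attr.value = (unsigned long) &out;
	if (sys_bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr)))
		return 1;

	printf("value for key 1: %llu\n", out);
	return 0;
}

Keys can then be enumerated by repeatedly issuing BPF_MAP_GET_NEXT_KEY until it returns an error, which is what htab_map_get_next_key() above implements.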
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
new file mode 100644
index 000000000000..9e3414d85459
--- /dev/null
+++ b/kernel/bpf/helpers.c
@@ -0,0 +1,89 @@
1/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 *
7 * This program is distributed in the hope that it will be useful, but
8 * WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
10 * General Public License for more details.
11 */
12#include <linux/bpf.h>
13#include <linux/rcupdate.h>
14
15/* If a kernel subsystem allows eBPF programs to call this function, it
16 * should return bpf_map_lookup_elem_proto from its own verifier_ops->get_func_proto()
17 * callback, so that the verifier can properly check the arguments
18 *
19 * Different map implementations rely on rcu in the map methods
20 * lookup/update/delete, therefore eBPF programs must run under the rcu lock
21 * if a program is allowed to access maps, so check rcu_read_lock_held() in
22 * all three functions.
23 */
24static u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
25{
26 /* verifier checked that R1 contains a valid pointer to bpf_map
27 * and R2 points to a program stack and map->key_size bytes were
28 * initialized
29 */
30 struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
31 void *key = (void *) (unsigned long) r2;
32 void *value;
33
34 WARN_ON_ONCE(!rcu_read_lock_held());
35
36 value = map->ops->map_lookup_elem(map, key);
37
38 /* lookup() returns either pointer to element value or NULL
39 * which is the meaning of PTR_TO_MAP_VALUE_OR_NULL type
40 */
41 return (unsigned long) value;
42}
43
44struct bpf_func_proto bpf_map_lookup_elem_proto = {
45 .func = bpf_map_lookup_elem,
46 .gpl_only = false,
47 .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
48 .arg1_type = ARG_CONST_MAP_PTR,
49 .arg2_type = ARG_PTR_TO_MAP_KEY,
50};
51
52static u64 bpf_map_update_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
53{
54 struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
55 void *key = (void *) (unsigned long) r2;
56 void *value = (void *) (unsigned long) r3;
57
58 WARN_ON_ONCE(!rcu_read_lock_held());
59
60 return map->ops->map_update_elem(map, key, value, r4);
61}
62
63struct bpf_func_proto bpf_map_update_elem_proto = {
64 .func = bpf_map_update_elem,
65 .gpl_only = false,
66 .ret_type = RET_INTEGER,
67 .arg1_type = ARG_CONST_MAP_PTR,
68 .arg2_type = ARG_PTR_TO_MAP_KEY,
69 .arg3_type = ARG_PTR_TO_MAP_VALUE,
70 .arg4_type = ARG_ANYTHING,
71};
72
73static u64 bpf_map_delete_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
74{
75 struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
76 void *key = (void *) (unsigned long) r2;
77
78 WARN_ON_ONCE(!rcu_read_lock_held());
79
80 return map->ops->map_delete_elem(map, key);
81}
82
83struct bpf_func_proto bpf_map_delete_elem_proto = {
84 .func = bpf_map_delete_elem,
85 .gpl_only = false,
86 .ret_type = RET_INTEGER,
87 .arg1_type = ARG_CONST_MAP_PTR,
88 .arg2_type = ARG_PTR_TO_MAP_KEY,
89};
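The three helpers above all follow the same shape: arguments arrive in r1-r5 as u64 values, are cast back to their real types, and a struct bpf_func_proto describes the argument and return types to the verifier. As a hedged sketch of how a further helper could be added under the same conventions (the name bpf_map_peek_value and its semantics are invented for illustration; it would additionally need a BPF_FUNC_* id and an entry in some subsystem's get_func_proto() callback):

/* returns 1 if an element with this key exists, 0 otherwise */
static u64 bpf_map_peek_value(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
{
	struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
	void *key = (void *) (unsigned long) r2;

	WARN_ON_ONCE(!rcu_read_lock_held());

	return map->ops->map_lookup_elem(map, key) != NULL;
}

struct bpf_func_proto bpf_map_peek_value_proto = {
	.func = bpf_map_peek_value,
	.gpl_only = false,
	.ret_type = RET_INTEGER,
	.arg1_type = ARG_CONST_MAP_PTR,
	.arg2_type = ARG_PTR_TO_MAP_KEY,
};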
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
new file mode 100644
index 000000000000..088ac0b1b106
--- /dev/null
+++ b/kernel/bpf/syscall.c
@@ -0,0 +1,606 @@
1/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 *
7 * This program is distributed in the hope that it will be useful, but
8 * WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
10 * General Public License for more details.
11 */
12#include <linux/bpf.h>
13#include <linux/syscalls.h>
14#include <linux/slab.h>
15#include <linux/anon_inodes.h>
16#include <linux/file.h>
17#include <linux/license.h>
18#include <linux/filter.h>
19
20static LIST_HEAD(bpf_map_types);
21
22static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
23{
24 struct bpf_map_type_list *tl;
25 struct bpf_map *map;
26
27 list_for_each_entry(tl, &bpf_map_types, list_node) {
28 if (tl->type == attr->map_type) {
29 map = tl->ops->map_alloc(attr);
30 if (IS_ERR(map))
31 return map;
32 map->ops = tl->ops;
33 map->map_type = attr->map_type;
34 return map;
35 }
36 }
37 return ERR_PTR(-EINVAL);
38}
39
40/* boot time registration of different map implementations */
41void bpf_register_map_type(struct bpf_map_type_list *tl)
42{
43 list_add(&tl->list_node, &bpf_map_types);
44}
45
46/* called from workqueue */
47static void bpf_map_free_deferred(struct work_struct *work)
48{
49 struct bpf_map *map = container_of(work, struct bpf_map, work);
50
51 /* implementation dependent freeing */
52 map->ops->map_free(map);
53}
54
55/* decrement map refcnt and schedule it for freeing via workqueue
56 * (underlying map implementation ops->map_free() might sleep)
57 */
58void bpf_map_put(struct bpf_map *map)
59{
60 if (atomic_dec_and_test(&map->refcnt)) {
61 INIT_WORK(&map->work, bpf_map_free_deferred);
62 schedule_work(&map->work);
63 }
64}
65
66static int bpf_map_release(struct inode *inode, struct file *filp)
67{
68 struct bpf_map *map = filp->private_data;
69
70 bpf_map_put(map);
71 return 0;
72}
73
74static const struct file_operations bpf_map_fops = {
75 .release = bpf_map_release,
76};
77
78/* helper macro to check that unused fields of 'union bpf_attr' are zero */
79#define CHECK_ATTR(CMD) \
80 memchr_inv((void *) &attr->CMD##_LAST_FIELD + \
81 sizeof(attr->CMD##_LAST_FIELD), 0, \
82 sizeof(*attr) - \
83 offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
84 sizeof(attr->CMD##_LAST_FIELD)) != NULL
85
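/* Worked expansion (illustrative): for the BPF_MAP_CREATE command below,
 * whose last used field is max_entries, CHECK_ATTR(BPF_MAP_CREATE) becomes
 *
 *   memchr_inv((void *) &attr->max_entries + sizeof(attr->max_entries), 0,
 *              sizeof(*attr) - offsetof(union bpf_attr, max_entries) -
 *              sizeof(attr->max_entries)) != NULL
 *
 * i.e. it evaluates to true (and the command returns -EINVAL) if any byte
 * of 'union bpf_attr' past the command's last field is non-zero, so user
 * space must zero the attribute fields a given command does not use.
 */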
86#define BPF_MAP_CREATE_LAST_FIELD max_entries
87/* called via syscall */
88static int map_create(union bpf_attr *attr)
89{
90 struct bpf_map *map;
91 int err;
92
93 err = CHECK_ATTR(BPF_MAP_CREATE);
94 if (err)
95 return -EINVAL;
96
97 /* find map type and init map: hashtable vs rbtree vs bloom vs ... */
98 map = find_and_alloc_map(attr);
99 if (IS_ERR(map))
100 return PTR_ERR(map);
101
102 atomic_set(&map->refcnt, 1);
103
104 err = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | O_CLOEXEC);
105
106 if (err < 0)
107 /* failed to allocate fd */
108 goto free_map;
109
110 return err;
111
112free_map:
113 map->ops->map_free(map);
114 return err;
115}
116
117/* if an error is returned, the fd is released.
118 * On success the caller should complete fd access with a matching fdput()
119 */
120struct bpf_map *bpf_map_get(struct fd f)
121{
122 struct bpf_map *map;
123
124 if (!f.file)
125 return ERR_PTR(-EBADF);
126
127 if (f.file->f_op != &bpf_map_fops) {
128 fdput(f);
129 return ERR_PTR(-EINVAL);
130 }
131
132 map = f.file->private_data;
133
134 return map;
135}
136
137/* helper to convert user pointers passed inside __aligned_u64 fields */
138static void __user *u64_to_ptr(__u64 val)
139{
140 return (void __user *) (unsigned long) val;
141}
142
143/* last field in 'union bpf_attr' used by this command */
144#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value
145
146static int map_lookup_elem(union bpf_attr *attr)
147{
148 void __user *ukey = u64_to_ptr(attr->key);
149 void __user *uvalue = u64_to_ptr(attr->value);
150 int ufd = attr->map_fd;
151 struct fd f = fdget(ufd);
152 struct bpf_map *map;
153 void *key, *value;
154 int err;
155
156 if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
157 return -EINVAL;
158
159 map = bpf_map_get(f);
160 if (IS_ERR(map))
161 return PTR_ERR(map);
162
163 err = -ENOMEM;
164 key = kmalloc(map->key_size, GFP_USER);
165 if (!key)
166 goto err_put;
167
168 err = -EFAULT;
169 if (copy_from_user(key, ukey, map->key_size) != 0)
170 goto free_key;
171
172 err = -ENOENT;
173 rcu_read_lock();
174 value = map->ops->map_lookup_elem(map, key);
175 if (!value)
176 goto err_unlock;
177
178 err = -EFAULT;
179 if (copy_to_user(uvalue, value, map->value_size) != 0)
180 goto err_unlock;
181
182 err = 0;
183
184err_unlock:
185 rcu_read_unlock();
186free_key:
187 kfree(key);
188err_put:
189 fdput(f);
190 return err;
191}
192
193#define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags
194
195static int map_update_elem(union bpf_attr *attr)
196{
197 void __user *ukey = u64_to_ptr(attr->key);
198 void __user *uvalue = u64_to_ptr(attr->value);
199 int ufd = attr->map_fd;
200 struct fd f = fdget(ufd);
201 struct bpf_map *map;
202 void *key, *value;
203 int err;
204
205 if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM))
206 return -EINVAL;
207
208 map = bpf_map_get(f);
209 if (IS_ERR(map))
210 return PTR_ERR(map);
211
212 err = -ENOMEM;
213 key = kmalloc(map->key_size, GFP_USER);
214 if (!key)
215 goto err_put;
216
217 err = -EFAULT;
218 if (copy_from_user(key, ukey, map->key_size) != 0)
219 goto free_key;
220
221 err = -ENOMEM;
222 value = kmalloc(map->value_size, GFP_USER);
223 if (!value)
224 goto free_key;
225
226 err = -EFAULT;
227 if (copy_from_user(value, uvalue, map->value_size) != 0)
228 goto free_value;
229
230 /* eBPF programs that use maps run under rcu_read_lock(), therefore
231 * all map accessors rely on this fact, so do the same here
232 */
233 rcu_read_lock();
234 err = map->ops->map_update_elem(map, key, value, attr->flags);
235 rcu_read_unlock();
236
237free_value:
238 kfree(value);
239free_key:
240 kfree(key);
241err_put:
242 fdput(f);
243 return err;
244}
245
246#define BPF_MAP_DELETE_ELEM_LAST_FIELD key
247
248static int map_delete_elem(union bpf_attr *attr)
249{
250 void __user *ukey = u64_to_ptr(attr->key);
251 int ufd = attr->map_fd;
252 struct fd f = fdget(ufd);
253 struct bpf_map *map;
254 void *key;
255 int err;
256
257 if (CHECK_ATTR(BPF_MAP_DELETE_ELEM))
258 return -EINVAL;
259
260 map = bpf_map_get(f);
261 if (IS_ERR(map))
262 return PTR_ERR(map);
263
264 err = -ENOMEM;
265 key = kmalloc(map->key_size, GFP_USER);
266 if (!key)
267 goto err_put;
268
269 err = -EFAULT;
270 if (copy_from_user(key, ukey, map->key_size) != 0)
271 goto free_key;
272
273 rcu_read_lock();
274 err = map->ops->map_delete_elem(map, key);
275 rcu_read_unlock();
276
277free_key:
278 kfree(key);
279err_put:
280 fdput(f);
281 return err;
282}
283
284/* last field in 'union bpf_attr' used by this command */
285#define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key
286
287static int map_get_next_key(union bpf_attr *attr)
288{
289 void __user *ukey = u64_to_ptr(attr->key);
290 void __user *unext_key = u64_to_ptr(attr->next_key);
291 int ufd = attr->map_fd;
292 struct fd f = fdget(ufd);
293 struct bpf_map *map;
294 void *key, *next_key;
295 int err;
296
297 if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY))
298 return -EINVAL;
299
300 map = bpf_map_get(f);
301 if (IS_ERR(map))
302 return PTR_ERR(map);
303
304 err = -ENOMEM;
305 key = kmalloc(map->key_size, GFP_USER);
306 if (!key)
307 goto err_put;
308
309 err = -EFAULT;
310 if (copy_from_user(key, ukey, map->key_size) != 0)
311 goto free_key;
312
313 err = -ENOMEM;
314 next_key = kmalloc(map->key_size, GFP_USER);
315 if (!next_key)
316 goto free_key;
317
318 rcu_read_lock();
319 err = map->ops->map_get_next_key(map, key, next_key);
320 rcu_read_unlock();
321 if (err)
322 goto free_next_key;
323
324 err = -EFAULT;
325 if (copy_to_user(unext_key, next_key, map->key_size) != 0)
326 goto free_next_key;
327
328 err = 0;
329
330free_next_key:
331 kfree(next_key);
332free_key:
333 kfree(key);
334err_put:
335 fdput(f);
336 return err;
337}
338
339static LIST_HEAD(bpf_prog_types);
340
341static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
342{
343 struct bpf_prog_type_list *tl;
344
345 list_for_each_entry(tl, &bpf_prog_types, list_node) {
346 if (tl->type == type) {
347 prog->aux->ops = tl->ops;
348 prog->aux->prog_type = type;
349 return 0;
350 }
351 }
352 return -EINVAL;
353}
354
355void bpf_register_prog_type(struct bpf_prog_type_list *tl)
356{
357 list_add(&tl->list_node, &bpf_prog_types);
358}
359
360/* fixup insn->imm field of bpf_call instructions:
361 * if (insn->imm == BPF_FUNC_map_lookup_elem)
362 * insn->imm = bpf_map_lookup_elem - __bpf_call_base;
363 * else if (insn->imm == BPF_FUNC_map_update_elem)
364 * insn->imm = bpf_map_update_elem - __bpf_call_base;
365 * else ...
366 *
367 * this function is called after eBPF program passed verification
368 */
369static void fixup_bpf_calls(struct bpf_prog *prog)
370{
371 const struct bpf_func_proto *fn;
372 int i;
373
374 for (i = 0; i < prog->len; i++) {
375 struct bpf_insn *insn = &prog->insnsi[i];
376
377 if (insn->code == (BPF_JMP | BPF_CALL)) {
378 /* we reach here when the program has bpf_call instructions
379 * and it passed bpf_check(), which means that
380 * ops->get_func_proto must have been supplied, so check it
381 */
382 BUG_ON(!prog->aux->ops->get_func_proto);
383
384 fn = prog->aux->ops->get_func_proto(insn->imm);
385 /* all functions that have a prototype and that the verifier allowed
386 * programs to call must be real in-kernel functions
387 */
388 BUG_ON(!fn->func);
389 insn->imm = fn->func - __bpf_call_base;
390 }
391 }
392}
393
394/* drop refcnt on maps used by eBPF program and free auxiliary data */
395static void free_used_maps(struct bpf_prog_aux *aux)
396{
397 int i;
398
399 for (i = 0; i < aux->used_map_cnt; i++)
400 bpf_map_put(aux->used_maps[i]);
401
402 kfree(aux->used_maps);
403}
404
405void bpf_prog_put(struct bpf_prog *prog)
406{
407 if (atomic_dec_and_test(&prog->aux->refcnt)) {
408 free_used_maps(prog->aux);
409 bpf_prog_free(prog);
410 }
411}
412
413static int bpf_prog_release(struct inode *inode, struct file *filp)
414{
415 struct bpf_prog *prog = filp->private_data;
416
417 bpf_prog_put(prog);
418 return 0;
419}
420
421static const struct file_operations bpf_prog_fops = {
422 .release = bpf_prog_release,
423};
424
425static struct bpf_prog *get_prog(struct fd f)
426{
427 struct bpf_prog *prog;
428
429 if (!f.file)
430 return ERR_PTR(-EBADF);
431
432 if (f.file->f_op != &bpf_prog_fops) {
433 fdput(f);
434 return ERR_PTR(-EINVAL);
435 }
436
437 prog = f.file->private_data;
438
439 return prog;
440}
441
442/* called by sockets/tracing/seccomp before attaching a program to an event;
443 * pairs with bpf_prog_put()
444 */
445struct bpf_prog *bpf_prog_get(u32 ufd)
446{
447 struct fd f = fdget(ufd);
448 struct bpf_prog *prog;
449
450 prog = get_prog(f);
451
452 if (IS_ERR(prog))
453 return prog;
454
455 atomic_inc(&prog->aux->refcnt);
456 fdput(f);
457 return prog;
458}
459
460/* last field in 'union bpf_attr' used by this command */
461#define BPF_PROG_LOAD_LAST_FIELD log_buf
462
463static int bpf_prog_load(union bpf_attr *attr)
464{
465 enum bpf_prog_type type = attr->prog_type;
466 struct bpf_prog *prog;
467 int err;
468 char license[128];
469 bool is_gpl;
470
471 if (CHECK_ATTR(BPF_PROG_LOAD))
472 return -EINVAL;
473
474 /* copy eBPF program license from user space */
475 if (strncpy_from_user(license, u64_to_ptr(attr->license),
476 sizeof(license) - 1) < 0)
477 return -EFAULT;
478 license[sizeof(license) - 1] = 0;
479
480 /* eBPF programs must be GPL compatible to use GPL-ed functions */
481 is_gpl = license_is_gpl_compatible(license);
482
483 if (attr->insn_cnt >= BPF_MAXINSNS)
484 return -EINVAL;
485
486 /* plain bpf_prog allocation */
487 prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
488 if (!prog)
489 return -ENOMEM;
490
491 prog->len = attr->insn_cnt;
492
493 err = -EFAULT;
494 if (copy_from_user(prog->insns, u64_to_ptr(attr->insns),
495 prog->len * sizeof(struct bpf_insn)) != 0)
496 goto free_prog;
497
498 prog->orig_prog = NULL;
499 prog->jited = false;
500
501 atomic_set(&prog->aux->refcnt, 1);
502 prog->aux->is_gpl_compatible = is_gpl;
503
504 /* find program type: socket_filter vs tracing_filter */
505 err = find_prog_type(type, prog);
506 if (err < 0)
507 goto free_prog;
508
509 /* run eBPF verifier */
510 err = bpf_check(prog, attr);
511
512 if (err < 0)
513 goto free_used_maps;
514
515 /* fixup BPF_CALL->imm field */
516 fixup_bpf_calls(prog);
517
518 /* eBPF program is ready to be JITed */
519 bpf_prog_select_runtime(prog);
520
521 err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC);
522
523 if (err < 0)
524 /* failed to allocate fd */
525 goto free_used_maps;
526
527 return err;
528
529free_used_maps:
530 free_used_maps(prog->aux);
531free_prog:
532 bpf_prog_free(prog);
533 return err;
534}
535
536SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
537{
538 union bpf_attr attr = {};
539 int err;
540
541 /* the syscall is limited to root temporarily. This restriction will be
542 * lifted once the security audit is clean. Note that eBPF+tracing must have
543 * this restriction, since it may pass kernel data to user space
544 */
545 if (!capable(CAP_SYS_ADMIN))
546 return -EPERM;
547
548 if (!access_ok(VERIFY_READ, uattr, 1))
549 return -EFAULT;
550
551 if (size > PAGE_SIZE) /* silly large */
552 return -E2BIG;
553
554 /* If we're handed a bigger struct than we know of,
555 * ensure all the unknown bits are 0 - i.e. new
556 * user-space does not rely on any kernel feature
557 * extensions we don't know about yet.
558 */
559 if (size > sizeof(attr)) {
560 unsigned char __user *addr;
561 unsigned char __user *end;
562 unsigned char val;
563
564 addr = (void __user *)uattr + sizeof(attr);
565 end = (void __user *)uattr + size;
566
567 for (; addr < end; addr++) {
568 err = get_user(val, addr);
569 if (err)
570 return err;
571 if (val)
572 return -E2BIG;
573 }
574 size = sizeof(attr);
575 }
576
577 /* copy attributes from user space, may be less than sizeof(bpf_attr) */
578 if (copy_from_user(&attr, uattr, size) != 0)
579 return -EFAULT;
580
581 switch (cmd) {
582 case BPF_MAP_CREATE:
583 err = map_create(&attr);
584 break;
585 case BPF_MAP_LOOKUP_ELEM:
586 err = map_lookup_elem(&attr);
587 break;
588 case BPF_MAP_UPDATE_ELEM:
589 err = map_update_elem(&attr);
590 break;
591 case BPF_MAP_DELETE_ELEM:
592 err = map_delete_elem(&attr);
593 break;
594 case BPF_MAP_GET_NEXT_KEY:
595 err = map_get_next_key(&attr);
596 break;
597 case BPF_PROG_LOAD:
598 err = bpf_prog_load(&attr);
599 break;
600 default:
601 err = -EINVAL;
602 break;
603 }
604
605 return err;
606}
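For completeness, here is a user-space sketch of BPF_PROG_LOAD under the same assumptions as the map example earlier (uapi <linux/bpf.h>, a __NR_bpf number, 64-bit pointer casts, CAP_SYS_ADMIN). It loads the trivial two-instruction program "r0 = 0; exit" as BPF_PROG_TYPE_UNSPEC, which this series only accepts when the test stub in kernel/bpf/test_stub.c below is built in; other program types register their own bpf_prog_type_list. The function name load_trivial_prog() is illustrative:

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

int load_trivial_prog(void)
{
	/* r0 = 0; exit -- hand-encoded, since the BPF_*_INSN macros are kernel-internal */
	struct bpf_insn insns[] = {
		{ .code = BPF_ALU64 | BPF_MOV | BPF_K, .dst_reg = BPF_REG_0, .imm = 0 },
		{ .code = BPF_JMP | BPF_EXIT },
	};
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.prog_type = BPF_PROG_TYPE_UNSPEC;
	attr.insns = (unsigned long) insns;
	attr.insn_cnt = sizeof(insns) / sizeof(insns[0]);
	attr.license = (unsigned long) "GPL";

	/* returns a program fd on success, to be attached to an event later */
	return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
}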
diff --git a/kernel/bpf/test_stub.c b/kernel/bpf/test_stub.c
new file mode 100644
index 000000000000..0ceae1e6e8b5
--- /dev/null
+++ b/kernel/bpf/test_stub.c
@@ -0,0 +1,78 @@
1/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 */
7#include <linux/kernel.h>
8#include <linux/types.h>
9#include <linux/slab.h>
10#include <linux/err.h>
11#include <linux/bpf.h>
12
13/* test stubs for BPF_MAP_TYPE_UNSPEC and for BPF_PROG_TYPE_UNSPEC
14 * to be used by user space verifier testsuite
15 */
16struct bpf_context {
17 u64 arg1;
18 u64 arg2;
19};
20
21static const struct bpf_func_proto *test_func_proto(enum bpf_func_id func_id)
22{
23 switch (func_id) {
24 case BPF_FUNC_map_lookup_elem:
25 return &bpf_map_lookup_elem_proto;
26 case BPF_FUNC_map_update_elem:
27 return &bpf_map_update_elem_proto;
28 case BPF_FUNC_map_delete_elem:
29 return &bpf_map_delete_elem_proto;
30 default:
31 return NULL;
32 }
33}
34
35static const struct bpf_context_access {
36 int size;
37 enum bpf_access_type type;
38} test_ctx_access[] = {
39 [offsetof(struct bpf_context, arg1)] = {
40 FIELD_SIZEOF(struct bpf_context, arg1),
41 BPF_READ
42 },
43 [offsetof(struct bpf_context, arg2)] = {
44 FIELD_SIZEOF(struct bpf_context, arg2),
45 BPF_READ
46 },
47};
48
49static bool test_is_valid_access(int off, int size, enum bpf_access_type type)
50{
51 const struct bpf_context_access *access;
52
53 if (off < 0 || off >= ARRAY_SIZE(test_ctx_access))
54 return false;
55
56 access = &test_ctx_access[off];
57 if (access->size == size && (access->type & type))
58 return true;
59
60 return false;
61}
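/* For example (illustrative): with the table above, an 8-byte read of
 * ctx->arg1 (off 0, size 8, BPF_READ) is accepted, while a 4-byte read of
 * the same field or any write into the context fails this check and is
 * rejected by the verifier's check_ctx_access().
 */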
62
63static struct bpf_verifier_ops test_ops = {
64 .get_func_proto = test_func_proto,
65 .is_valid_access = test_is_valid_access,
66};
67
68static struct bpf_prog_type_list tl_prog = {
69 .ops = &test_ops,
70 .type = BPF_PROG_TYPE_UNSPEC,
71};
72
73static int __init register_test_ops(void)
74{
75 bpf_register_prog_type(&tl_prog);
76 return 0;
77}
78late_initcall(register_test_ops);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
new file mode 100644
index 000000000000..a28e09c7825d
--- /dev/null
+++ b/kernel/bpf/verifier.c
@@ -0,0 +1,2003 @@
1/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 *
7 * This program is distributed in the hope that it will be useful, but
8 * WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
10 * General Public License for more details.
11 */
12#include <linux/kernel.h>
13#include <linux/types.h>
14#include <linux/slab.h>
15#include <linux/bpf.h>
16#include <linux/filter.h>
17#include <net/netlink.h>
18#include <linux/file.h>
19#include <linux/vmalloc.h>
20
21/* bpf_check() is a static code analyzer that walks eBPF program
22 * instruction by instruction and updates register/stack state.
23 * All paths of conditional branches are analyzed until 'bpf_exit' insn.
24 *
25 * The first pass is depth-first-search to check that the program is a DAG.
26 * It rejects the following programs:
27 * - larger than BPF_MAXINSNS insns
28 * - if loop is present (detected via back-edge)
29 * - unreachable insns exist (shouldn't be a forest. program = one function)
30 * - out of bounds or malformed jumps
31 * The second pass is all possible path descent from the 1st insn.
32 * Since it's analyzing all paths through the program, the length of the
33 * analysis is limited to 32k insn, which may be hit even if the total number
34 * of insns is less than 4K, but there are too many branches that change stack/regs.
35 * Number of 'branches to be analyzed' is limited to 1k
36 *
37 * On entry to each instruction, each register has a type, and the instruction
38 * changes the types of the registers depending on instruction semantics.
39 * If instruction is BPF_MOV64_REG(BPF_REG_1, BPF_REG_5), then type of R5 is
40 * copied to R1.
41 *
42 * All registers are 64-bit.
43 * R0 - return register
44 * R1-R5 argument passing registers
45 * R6-R9 callee saved registers
46 * R10 - frame pointer read-only
47 *
48 * At the start of BPF program the register R1 contains a pointer to bpf_context
49 * and has type PTR_TO_CTX.
50 *
51 * Verifier tracks arithmetic operations on pointers in case:
52 * BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
53 * BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -20),
54 * 1st insn copies R10 (which has FRAME_PTR) type into R1
55 * and 2nd arithmetic instruction is pattern matched to recognize
56 * that it wants to construct a pointer to some element within stack.
57 * So after 2nd insn, the register R1 has type PTR_TO_STACK
58 * (and -20 constant is saved for further stack bounds checking).
59 * Meaning that this reg is a pointer to stack plus known immediate constant.
60 *
61 * Most of the time the registers have UNKNOWN_VALUE type, which
62 * means the register has some value, but it's not a valid pointer.
63 * (like pointer plus pointer becomes UNKNOWN_VALUE type)
64 *
65 * When verifier sees load or store instructions the type of base register
66 * can be: PTR_TO_MAP_VALUE, PTR_TO_CTX, FRAME_PTR. These are three pointer
67 * types recognized by check_mem_access() function.
68 *
69 * PTR_TO_MAP_VALUE means that this register is pointing to 'map element value'
70 * and the range of [ptr, ptr + map's value_size) is accessible.
71 *
72 * registers used to pass values to function calls are checked against
73 * function argument constraints.
74 *
75 * ARG_PTR_TO_MAP_KEY is one of such argument constraints.
76 * It means that the register type passed to this function must be
77 * PTR_TO_STACK and it will be used inside the function as
78 * 'pointer to map element key'
79 *
80 * For example the argument constraints for bpf_map_lookup_elem():
81 * .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
82 * .arg1_type = ARG_CONST_MAP_PTR,
83 * .arg2_type = ARG_PTR_TO_MAP_KEY,
84 *
85 * ret_type says that this function returns 'pointer to map elem value or null'
86 * function expects 1st argument to be a const pointer to 'struct bpf_map' and
87 * 2nd argument should be a pointer to stack, which will be used inside
88 * the helper function as a pointer to map element key.
89 *
90 * On the kernel side the helper function looks like:
91 * u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
92 * {
93 * struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
94 * void *key = (void *) (unsigned long) r2;
95 * void *value;
96 *
97 * here kernel can access 'key' and 'map' pointers safely, knowing that
98 * [key, key + map->key_size) bytes are valid and were initialized on
99 * the stack of eBPF program.
100 * }
101 *
102 * Corresponding eBPF program may look like:
103 * BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), // after this insn R2 type is FRAME_PTR
104 * BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), // after this insn R2 type is PTR_TO_STACK
105 * BPF_LD_MAP_FD(BPF_REG_1, map_fd), // after this insn R1 type is CONST_PTR_TO_MAP
106 * BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
107 * here verifier looks at prototype of map_lookup_elem() and sees:
108 * .arg1_type == ARG_CONST_MAP_PTR and R1->type == CONST_PTR_TO_MAP, which is ok,
109 * Now verifier knows that this map has key of R1->map_ptr->key_size bytes
110 *
111 * Then .arg2_type == ARG_PTR_TO_MAP_KEY and R2->type == PTR_TO_STACK, ok so far,
112 * Now verifier checks that [R2, R2 + map's key_size) are within stack limits
113 * and were initialized prior to this call.
114 * If it's ok, then verifier allows this BPF_CALL insn and looks at
115 * .ret_type which is RET_PTR_TO_MAP_VALUE_OR_NULL, so it sets
116 * R0->type = PTR_TO_MAP_VALUE_OR_NULL which means bpf_map_lookup_elem() function
117 * returns either a pointer to the map value or NULL.
118 *
119 * When type PTR_TO_MAP_VALUE_OR_NULL passes through 'if (reg != 0) goto +off'
120 * insn, the register holding that pointer in the true branch changes state to
121 * PTR_TO_MAP_VALUE and the same register changes state to CONST_IMM in the false
122 * branch. See check_cond_jmp_op().
123 *
124 * After the call R0 is set to return type of the function and registers R1-R5
125 * are set to NOT_INIT to indicate that they are no longer readable.
126 */
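/* A small illustration (not part of the original patch) of the
 * PTR_TO_MAP_VALUE_OR_NULL rule above: after the lookup call (and with the
 * key bytes at fp-4 initialized beforehand), the program must test R0
 * against 0 before dereferencing it, otherwise check_mem_access() rejects
 * the access:
 *
 *   BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
 *   BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),        // R0 is map_value_or_null
 *   BPF_MOV64_IMM(BPF_REG_1, 1),                  // here R0 is map_value
 *   BPF_STX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, 0), // store 1 into the value
 *   BPF_EXIT_INSN(),
 *
 * (assuming the map's value_size is at least 8 bytes)
 */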
127
128/* types of values stored in eBPF registers */
129enum bpf_reg_type {
130 NOT_INIT = 0, /* nothing was written into register */
131 UNKNOWN_VALUE, /* reg doesn't contain a valid pointer */
132 PTR_TO_CTX, /* reg points to bpf_context */
133 CONST_PTR_TO_MAP, /* reg points to struct bpf_map */
134 PTR_TO_MAP_VALUE, /* reg points to map element value */
135 PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */
136 FRAME_PTR, /* reg == frame_pointer */
137 PTR_TO_STACK, /* reg == frame_pointer + imm */
138 CONST_IMM, /* constant integer value */
139};
140
141struct reg_state {
142 enum bpf_reg_type type;
143 union {
144 /* valid when type == CONST_IMM | PTR_TO_STACK */
145 int imm;
146
147 /* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE |
148 * PTR_TO_MAP_VALUE_OR_NULL
149 */
150 struct bpf_map *map_ptr;
151 };
152};
153
154enum bpf_stack_slot_type {
155 STACK_INVALID, /* nothing was stored in this stack slot */
156 STACK_SPILL, /* register spilled into stack */
157 STACK_MISC /* BPF program wrote some data into this slot */
158};
159
160#define BPF_REG_SIZE 8 /* size of eBPF register in bytes */
161
162/* state of the program:
163 * type of all registers and stack info
164 */
165struct verifier_state {
166 struct reg_state regs[MAX_BPF_REG];
167 u8 stack_slot_type[MAX_BPF_STACK];
168 struct reg_state spilled_regs[MAX_BPF_STACK / BPF_REG_SIZE];
169};
170
171/* linked list of verifier states used to prune search */
172struct verifier_state_list {
173 struct verifier_state state;
174 struct verifier_state_list *next;
175};
176
177/* verifier_state + insn_idx are pushed to stack when branch is encountered */
178struct verifier_stack_elem {
179 /* verifier state is 'st'
180 * before processing instruction 'insn_idx'
181 * and after processing instruction 'prev_insn_idx'
182 */
183 struct verifier_state st;
184 int insn_idx;
185 int prev_insn_idx;
186 struct verifier_stack_elem *next;
187};
188
189#define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */
190
191/* single container for all structs
192 * one verifier_env per bpf_check() call
193 */
194struct verifier_env {
195 struct bpf_prog *prog; /* eBPF program being verified */
196 struct verifier_stack_elem *head; /* stack of verifier states to be processed */
197 int stack_size; /* number of states to be processed */
198 struct verifier_state cur_state; /* current verifier state */
199 struct verifier_state_list **explored_states; /* search pruning optimization */
200 struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of maps used by eBPF program */
201 u32 used_map_cnt; /* number of used maps */
202};
203
204/* verbose verifier prints what it's seeing
205 * bpf_check() is called under lock, so no race to access these global vars
206 */
207static u32 log_level, log_size, log_len;
208static char *log_buf;
209
210static DEFINE_MUTEX(bpf_verifier_lock);
211
212/* log_level controls verbosity level of eBPF verifier.
213 * verbose() is used to dump the verification trace to the log, so the user
214 * can figure out what's wrong with the program
215 */
216static void verbose(const char *fmt, ...)
217{
218 va_list args;
219
220 if (log_level == 0 || log_len >= log_size - 1)
221 return;
222
223 va_start(args, fmt);
224 log_len += vscnprintf(log_buf + log_len, log_size - log_len, fmt, args);
225 va_end(args);
226}
227
228/* string representation of 'enum bpf_reg_type' */
229static const char * const reg_type_str[] = {
230 [NOT_INIT] = "?",
231 [UNKNOWN_VALUE] = "inv",
232 [PTR_TO_CTX] = "ctx",
233 [CONST_PTR_TO_MAP] = "map_ptr",
234 [PTR_TO_MAP_VALUE] = "map_value",
235 [PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null",
236 [FRAME_PTR] = "fp",
237 [PTR_TO_STACK] = "fp",
238 [CONST_IMM] = "imm",
239};
240
241static void print_verifier_state(struct verifier_env *env)
242{
243 enum bpf_reg_type t;
244 int i;
245
246 for (i = 0; i < MAX_BPF_REG; i++) {
247 t = env->cur_state.regs[i].type;
248 if (t == NOT_INIT)
249 continue;
250 verbose(" R%d=%s", i, reg_type_str[t]);
251 if (t == CONST_IMM || t == PTR_TO_STACK)
252 verbose("%d", env->cur_state.regs[i].imm);
253 else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE ||
254 t == PTR_TO_MAP_VALUE_OR_NULL)
255 verbose("(ks=%d,vs=%d)",
256 env->cur_state.regs[i].map_ptr->key_size,
257 env->cur_state.regs[i].map_ptr->value_size);
258 }
259 for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
260 if (env->cur_state.stack_slot_type[i] == STACK_SPILL)
261 verbose(" fp%d=%s", -MAX_BPF_STACK + i,
262 reg_type_str[env->cur_state.spilled_regs[i / BPF_REG_SIZE].type]);
263 }
264 verbose("\n");
265}
266
267static const char *const bpf_class_string[] = {
268 [BPF_LD] = "ld",
269 [BPF_LDX] = "ldx",
270 [BPF_ST] = "st",
271 [BPF_STX] = "stx",
272 [BPF_ALU] = "alu",
273 [BPF_JMP] = "jmp",
274 [BPF_RET] = "BUG",
275 [BPF_ALU64] = "alu64",
276};
277
278static const char *const bpf_alu_string[] = {
279 [BPF_ADD >> 4] = "+=",
280 [BPF_SUB >> 4] = "-=",
281 [BPF_MUL >> 4] = "*=",
282 [BPF_DIV >> 4] = "/=",
283 [BPF_OR >> 4] = "|=",
284 [BPF_AND >> 4] = "&=",
285 [BPF_LSH >> 4] = "<<=",
286 [BPF_RSH >> 4] = ">>=",
287 [BPF_NEG >> 4] = "neg",
288 [BPF_MOD >> 4] = "%=",
289 [BPF_XOR >> 4] = "^=",
290 [BPF_MOV >> 4] = "=",
291 [BPF_ARSH >> 4] = "s>>=",
292 [BPF_END >> 4] = "endian",
293};
294
295static const char *const bpf_ldst_string[] = {
296 [BPF_W >> 3] = "u32",
297 [BPF_H >> 3] = "u16",
298 [BPF_B >> 3] = "u8",
299 [BPF_DW >> 3] = "u64",
300};
301
302static const char *const bpf_jmp_string[] = {
303 [BPF_JA >> 4] = "jmp",
304 [BPF_JEQ >> 4] = "==",
305 [BPF_JGT >> 4] = ">",
306 [BPF_JGE >> 4] = ">=",
307 [BPF_JSET >> 4] = "&",
308 [BPF_JNE >> 4] = "!=",
309 [BPF_JSGT >> 4] = "s>",
310 [BPF_JSGE >> 4] = "s>=",
311 [BPF_CALL >> 4] = "call",
312 [BPF_EXIT >> 4] = "exit",
313};
314
315static void print_bpf_insn(struct bpf_insn *insn)
316{
317 u8 class = BPF_CLASS(insn->code);
318
319 if (class == BPF_ALU || class == BPF_ALU64) {
320 if (BPF_SRC(insn->code) == BPF_X)
321 verbose("(%02x) %sr%d %s %sr%d\n",
322 insn->code, class == BPF_ALU ? "(u32) " : "",
323 insn->dst_reg,
324 bpf_alu_string[BPF_OP(insn->code) >> 4],
325 class == BPF_ALU ? "(u32) " : "",
326 insn->src_reg);
327 else
328 verbose("(%02x) %sr%d %s %s%d\n",
329 insn->code, class == BPF_ALU ? "(u32) " : "",
330 insn->dst_reg,
331 bpf_alu_string[BPF_OP(insn->code) >> 4],
332 class == BPF_ALU ? "(u32) " : "",
333 insn->imm);
334 } else if (class == BPF_STX) {
335 if (BPF_MODE(insn->code) == BPF_MEM)
336 verbose("(%02x) *(%s *)(r%d %+d) = r%d\n",
337 insn->code,
338 bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
339 insn->dst_reg,
340 insn->off, insn->src_reg);
341 else if (BPF_MODE(insn->code) == BPF_XADD)
342 verbose("(%02x) lock *(%s *)(r%d %+d) += r%d\n",
343 insn->code,
344 bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
345 insn->dst_reg, insn->off,
346 insn->src_reg);
347 else
348 verbose("BUG_%02x\n", insn->code);
349 } else if (class == BPF_ST) {
350 if (BPF_MODE(insn->code) != BPF_MEM) {
351 verbose("BUG_st_%02x\n", insn->code);
352 return;
353 }
354 verbose("(%02x) *(%s *)(r%d %+d) = %d\n",
355 insn->code,
356 bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
357 insn->dst_reg,
358 insn->off, insn->imm);
359 } else if (class == BPF_LDX) {
360 if (BPF_MODE(insn->code) != BPF_MEM) {
361 verbose("BUG_ldx_%02x\n", insn->code);
362 return;
363 }
364 verbose("(%02x) r%d = *(%s *)(r%d %+d)\n",
365 insn->code, insn->dst_reg,
366 bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
367 insn->src_reg, insn->off);
368 } else if (class == BPF_LD) {
369 if (BPF_MODE(insn->code) == BPF_ABS) {
370 verbose("(%02x) r0 = *(%s *)skb[%d]\n",
371 insn->code,
372 bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
373 insn->imm);
374 } else if (BPF_MODE(insn->code) == BPF_IND) {
375 verbose("(%02x) r0 = *(%s *)skb[r%d + %d]\n",
376 insn->code,
377 bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
378 insn->src_reg, insn->imm);
379 } else if (BPF_MODE(insn->code) == BPF_IMM) {
380 verbose("(%02x) r%d = 0x%x\n",
381 insn->code, insn->dst_reg, insn->imm);
382 } else {
383 verbose("BUG_ld_%02x\n", insn->code);
384 return;
385 }
386 } else if (class == BPF_JMP) {
387 u8 opcode = BPF_OP(insn->code);
388
389 if (opcode == BPF_CALL) {
390 verbose("(%02x) call %d\n", insn->code, insn->imm);
391 } else if (insn->code == (BPF_JMP | BPF_JA)) {
392 verbose("(%02x) goto pc%+d\n",
393 insn->code, insn->off);
394 } else if (insn->code == (BPF_JMP | BPF_EXIT)) {
395 verbose("(%02x) exit\n", insn->code);
396 } else if (BPF_SRC(insn->code) == BPF_X) {
397 verbose("(%02x) if r%d %s r%d goto pc%+d\n",
398 insn->code, insn->dst_reg,
399 bpf_jmp_string[BPF_OP(insn->code) >> 4],
400 insn->src_reg, insn->off);
401 } else {
402 verbose("(%02x) if r%d %s 0x%x goto pc%+d\n",
403 insn->code, insn->dst_reg,
404 bpf_jmp_string[BPF_OP(insn->code) >> 4],
405 insn->imm, insn->off);
406 }
407 } else {
408 verbose("(%02x) %s\n", insn->code, bpf_class_string[class]);
409 }
410}
411
412static int pop_stack(struct verifier_env *env, int *prev_insn_idx)
413{
414 struct verifier_stack_elem *elem;
415 int insn_idx;
416
417 if (env->head == NULL)
418 return -1;
419
420 memcpy(&env->cur_state, &env->head->st, sizeof(env->cur_state));
421 insn_idx = env->head->insn_idx;
422 if (prev_insn_idx)
423 *prev_insn_idx = env->head->prev_insn_idx;
424 elem = env->head->next;
425 kfree(env->head);
426 env->head = elem;
427 env->stack_size--;
428 return insn_idx;
429}
430
431static struct verifier_state *push_stack(struct verifier_env *env, int insn_idx,
432 int prev_insn_idx)
433{
434 struct verifier_stack_elem *elem;
435
436 elem = kmalloc(sizeof(struct verifier_stack_elem), GFP_KERNEL);
437 if (!elem)
438 goto err;
439
440 memcpy(&elem->st, &env->cur_state, sizeof(env->cur_state));
441 elem->insn_idx = insn_idx;
442 elem->prev_insn_idx = prev_insn_idx;
443 elem->next = env->head;
444 env->head = elem;
445 env->stack_size++;
446 if (env->stack_size > 1024) {
447 verbose("BPF program is too complex\n");
448 goto err;
449 }
450 return &elem->st;
451err:
452 /* pop all elements and return */
453 while (pop_stack(env, NULL) >= 0);
454 return NULL;
455}
456
457#define CALLER_SAVED_REGS 6
458static const int caller_saved[CALLER_SAVED_REGS] = {
459 BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5
460};
461
462static void init_reg_state(struct reg_state *regs)
463{
464 int i;
465
466 for (i = 0; i < MAX_BPF_REG; i++) {
467 regs[i].type = NOT_INIT;
468 regs[i].imm = 0;
469 regs[i].map_ptr = NULL;
470 }
471
472 /* frame pointer */
473 regs[BPF_REG_FP].type = FRAME_PTR;
474
475 /* 1st arg to a function */
476 regs[BPF_REG_1].type = PTR_TO_CTX;
477}
478
479static void mark_reg_unknown_value(struct reg_state *regs, u32 regno)
480{
481 BUG_ON(regno >= MAX_BPF_REG);
482 regs[regno].type = UNKNOWN_VALUE;
483 regs[regno].imm = 0;
484 regs[regno].map_ptr = NULL;
485}
486
487enum reg_arg_type {
488 SRC_OP, /* register is used as source operand */
489 DST_OP, /* register is used as destination operand */
490 DST_OP_NO_MARK /* same as above, check only, don't mark */
491};
492
493static int check_reg_arg(struct reg_state *regs, u32 regno,
494 enum reg_arg_type t)
495{
496 if (regno >= MAX_BPF_REG) {
497 verbose("R%d is invalid\n", regno);
498 return -EINVAL;
499 }
500
501 if (t == SRC_OP) {
502 /* check whether register used as source operand can be read */
503 if (regs[regno].type == NOT_INIT) {
504 verbose("R%d !read_ok\n", regno);
505 return -EACCES;
506 }
507 } else {
508 /* check whether register used as dest operand can be written to */
509 if (regno == BPF_REG_FP) {
510 verbose("frame pointer is read only\n");
511 return -EACCES;
512 }
513 if (t == DST_OP)
514 mark_reg_unknown_value(regs, regno);
515 }
516 return 0;
517}
518
519static int bpf_size_to_bytes(int bpf_size)
520{
521 if (bpf_size == BPF_W)
522 return 4;
523 else if (bpf_size == BPF_H)
524 return 2;
525 else if (bpf_size == BPF_B)
526 return 1;
527 else if (bpf_size == BPF_DW)
528 return 8;
529 else
530 return -EINVAL;
531}
532
533/* check_stack_read/write functions track spill/fill of registers,
534 * stack boundary and alignment are checked in check_mem_access()
535 */
536static int check_stack_write(struct verifier_state *state, int off, int size,
537 int value_regno)
538{
539 int i;
540 /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0,
541 * so it's aligned access and [off, off + size) are within stack limits
542 */
543
544 if (value_regno >= 0 &&
545 (state->regs[value_regno].type == PTR_TO_MAP_VALUE ||
546 state->regs[value_regno].type == PTR_TO_STACK ||
547 state->regs[value_regno].type == PTR_TO_CTX)) {
548
549 /* register containing pointer is being spilled into stack */
550 if (size != BPF_REG_SIZE) {
551 verbose("invalid size of register spill\n");
552 return -EACCES;
553 }
554
555 /* save register state */
556 state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] =
557 state->regs[value_regno];
558
559 for (i = 0; i < BPF_REG_SIZE; i++)
560 state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_SPILL;
561 } else {
562 /* regular write of data into stack */
563 state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] =
564 (struct reg_state) {};
565
566 for (i = 0; i < size; i++)
567 state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_MISC;
568 }
569 return 0;
570}
571
572static int check_stack_read(struct verifier_state *state, int off, int size,
573 int value_regno)
574{
575 u8 *slot_type;
576 int i;
577
578 slot_type = &state->stack_slot_type[MAX_BPF_STACK + off];
579
580 if (slot_type[0] == STACK_SPILL) {
581 if (size != BPF_REG_SIZE) {
582 verbose("invalid size of register spill\n");
583 return -EACCES;
584 }
585 for (i = 1; i < BPF_REG_SIZE; i++) {
586 if (slot_type[i] != STACK_SPILL) {
587 verbose("corrupted spill memory\n");
588 return -EACCES;
589 }
590 }
591
592 if (value_regno >= 0)
593 /* restore register state from stack */
594 state->regs[value_regno] =
595 state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE];
596 return 0;
597 } else {
598 for (i = 0; i < size; i++) {
599 if (slot_type[i] != STACK_MISC) {
600 verbose("invalid read from stack off %d+%d size %d\n",
601 off, i, size);
602 return -EACCES;
603 }
604 }
605 if (value_regno >= 0)
606 /* have read misc data from the stack */
607 mark_reg_unknown_value(state->regs, value_regno);
608 return 0;
609 }
610}
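/* Example of the spill/fill tracking above (illustrative): a program that
 * saves a pointer register and restores it later, e.g.
 *
 *   BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8),  // spill R1 (PTR_TO_CTX) to fp-8
 *   ...
 *   BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -8),  // fill it back into R1
 *
 * marks the eight stack_slot_type[] bytes at fp-8 as STACK_SPILL and saves
 * the register state in spilled_regs[], so the later 8-byte read restores
 * R1 as PTR_TO_CTX instead of an unknown value. Any narrower access to a
 * spill slot is rejected with "invalid size of register spill".
 */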
611
612/* check read/write into map element returned by bpf_map_lookup_elem() */
613static int check_map_access(struct verifier_env *env, u32 regno, int off,
614 int size)
615{
616 struct bpf_map *map = env->cur_state.regs[regno].map_ptr;
617
618 if (off < 0 || off + size > map->value_size) {
619 verbose("invalid access to map value, value_size=%d off=%d size=%d\n",
620 map->value_size, off, size);
621 return -EACCES;
622 }
623 return 0;
624}
625
626/* check access to 'struct bpf_context' fields */
627static int check_ctx_access(struct verifier_env *env, int off, int size,
628 enum bpf_access_type t)
629{
630 if (env->prog->aux->ops->is_valid_access &&
631 env->prog->aux->ops->is_valid_access(off, size, t))
632 return 0;
633
634 verbose("invalid bpf_context access off=%d size=%d\n", off, size);
635 return -EACCES;
636}
637
638/* check whether memory at (regno + off) is accessible for t = (read | write)
639 * if t==write, value_regno is a register whose value is stored into memory
640 * if t==read, value_regno is a register which will receive the value from memory
641 * if t==write && value_regno==-1, some unknown value is stored into memory
642 * if t==read && value_regno==-1, don't care what we read from memory
643 */
644static int check_mem_access(struct verifier_env *env, u32 regno, int off,
645 int bpf_size, enum bpf_access_type t,
646 int value_regno)
647{
648 struct verifier_state *state = &env->cur_state;
649 int size, err = 0;
650
651 size = bpf_size_to_bytes(bpf_size);
652 if (size < 0)
653 return size;
654
655 if (off % size != 0) {
656 verbose("misaligned access off %d size %d\n", off, size);
657 return -EACCES;
658 }
659
660 if (state->regs[regno].type == PTR_TO_MAP_VALUE) {
661 err = check_map_access(env, regno, off, size);
662 if (!err && t == BPF_READ && value_regno >= 0)
663 mark_reg_unknown_value(state->regs, value_regno);
664
665 } else if (state->regs[regno].type == PTR_TO_CTX) {
666 err = check_ctx_access(env, off, size, t);
667 if (!err && t == BPF_READ && value_regno >= 0)
668 mark_reg_unknown_value(state->regs, value_regno);
669
670 } else if (state->regs[regno].type == FRAME_PTR) {
671 if (off >= 0 || off < -MAX_BPF_STACK) {
672 verbose("invalid stack off=%d size=%d\n", off, size);
673 return -EACCES;
674 }
675 if (t == BPF_WRITE)
676 err = check_stack_write(state, off, size, value_regno);
677 else
678 err = check_stack_read(state, off, size, value_regno);
679 } else {
680 verbose("R%d invalid mem access '%s'\n",
681 regno, reg_type_str[state->regs[regno].type]);
682 return -EACCES;
683 }
684 return err;
685}
686
687static int check_xadd(struct verifier_env *env, struct bpf_insn *insn)
688{
689 struct reg_state *regs = env->cur_state.regs;
690 int err;
691
692 if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) ||
693 insn->imm != 0) {
694 verbose("BPF_XADD uses reserved fields\n");
695 return -EINVAL;
696 }
697
698 /* check src1 operand */
699 err = check_reg_arg(regs, insn->src_reg, SRC_OP);
700 if (err)
701 return err;
702
703 /* check src2 operand */
704 err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
705 if (err)
706 return err;
707
708 /* check whether atomic_add can read the memory */
709 err = check_mem_access(env, insn->dst_reg, insn->off,
710 BPF_SIZE(insn->code), BPF_READ, -1);
711 if (err)
712 return err;
713
714 /* check whether atomic_add can write into the same memory */
715 return check_mem_access(env, insn->dst_reg, insn->off,
716 BPF_SIZE(insn->code), BPF_WRITE, -1);
717}
718
719/* when register 'regno' is passed into function that will read 'access_size'
720 * bytes from that pointer, make sure that it's within stack boundary
721 * and all elements of stack are initialized
722 */
723static int check_stack_boundary(struct verifier_env *env,
724 int regno, int access_size)
725{
726 struct verifier_state *state = &env->cur_state;
727 struct reg_state *regs = state->regs;
728 int off, i;
729
730 if (regs[regno].type != PTR_TO_STACK)
731 return -EACCES;
732
733 off = regs[regno].imm;
734 if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 ||
735 access_size <= 0) {
736 verbose("invalid stack type R%d off=%d access_size=%d\n",
737 regno, off, access_size);
738 return -EACCES;
739 }
740
741 for (i = 0; i < access_size; i++) {
742 if (state->stack_slot_type[MAX_BPF_STACK + off + i] != STACK_MISC) {
743 verbose("invalid indirect read from stack off %d+%d size %d\n",
744 off, i, access_size);
745 return -EACCES;
746 }
747 }
748 return 0;
749}
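/* Example (illustrative): for the bpf_map_lookup_elem() call shown in the
 * header comment, with a 4-byte key prepared at fp-4, check_func_arg() below
 * calls check_stack_boundary(env, BPF_REG_2, 4); it verifies that R2 is
 * PTR_TO_STACK with off == -4, that the access stays within
 * [-MAX_BPF_STACK, 0), and that all four stack_slot_type[] bytes are
 * STACK_MISC, i.e. the program actually wrote the key before the call.
 */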
750
751static int check_func_arg(struct verifier_env *env, u32 regno,
752 enum bpf_arg_type arg_type, struct bpf_map **mapp)
753{
754 struct reg_state *reg = env->cur_state.regs + regno;
755 enum bpf_reg_type expected_type;
756 int err = 0;
757
758 if (arg_type == ARG_ANYTHING)
759 return 0;
760
761 if (reg->type == NOT_INIT) {
762 verbose("R%d !read_ok\n", regno);
763 return -EACCES;
764 }
765
766 if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY ||
767 arg_type == ARG_PTR_TO_MAP_VALUE) {
768 expected_type = PTR_TO_STACK;
769 } else if (arg_type == ARG_CONST_STACK_SIZE) {
770 expected_type = CONST_IMM;
771 } else if (arg_type == ARG_CONST_MAP_PTR) {
772 expected_type = CONST_PTR_TO_MAP;
773 } else {
774 verbose("unsupported arg_type %d\n", arg_type);
775 return -EFAULT;
776 }
777
778 if (reg->type != expected_type) {
779 verbose("R%d type=%s expected=%s\n", regno,
780 reg_type_str[reg->type], reg_type_str[expected_type]);
781 return -EACCES;
782 }
783
784 if (arg_type == ARG_CONST_MAP_PTR) {
785 /* bpf_map_xxx(map_ptr) call: remember that map_ptr */
786 *mapp = reg->map_ptr;
787
788 } else if (arg_type == ARG_PTR_TO_MAP_KEY) {
789 /* bpf_map_xxx(..., map_ptr, ..., key) call:
790 * check that [key, key + map->key_size) are within
791 * stack limits and initialized
792 */
793 if (!*mapp) {
794 /* in the function declaration map_ptr must come before
795 * map_key, so that it's verified and known before
796 * we have to check map_key here. Otherwise it means
797 * that the kernel subsystem misconfigured the verifier
798 */
799 verbose("invalid map_ptr to access map->key\n");
800 return -EACCES;
801 }
802 err = check_stack_boundary(env, regno, (*mapp)->key_size);
803
804 } else if (arg_type == ARG_PTR_TO_MAP_VALUE) {
805 /* bpf_map_xxx(..., map_ptr, ..., value) call:
806 * check [value, value + map->value_size) validity
807 */
808 if (!*mapp) {
809 /* kernel subsystem misconfigured verifier */
810 verbose("invalid map_ptr to access map->value\n");
811 return -EACCES;
812 }
813 err = check_stack_boundary(env, regno, (*mapp)->value_size);
814
815 } else if (arg_type == ARG_CONST_STACK_SIZE) {
816 /* bpf_xxx(..., buf, len) call will access 'len' bytes
817 * from stack pointer 'buf'. Check it
818 * note: regno == len, regno - 1 == buf
819 */
820 if (regno == 0) {
821 /* kernel subsystem misconfigured verifier */
822 verbose("ARG_CONST_STACK_SIZE cannot be first argument\n");
823 return -EACCES;
824 }
825 err = check_stack_boundary(env, regno - 1, reg->imm);
826 }
827
828 return err;
829}
830
831static int check_call(struct verifier_env *env, int func_id)
832{
833 struct verifier_state *state = &env->cur_state;
834 const struct bpf_func_proto *fn = NULL;
835 struct reg_state *regs = state->regs;
836 struct bpf_map *map = NULL;
837 struct reg_state *reg;
838 int i, err;
839
840 /* find function prototype */
841 if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) {
842 verbose("invalid func %d\n", func_id);
843 return -EINVAL;
844 }
845
846 if (env->prog->aux->ops->get_func_proto)
847 fn = env->prog->aux->ops->get_func_proto(func_id);
848
849 if (!fn) {
850 verbose("unknown func %d\n", func_id);
851 return -EINVAL;
852 }
853
854 /* eBPF programs must be GPL compatible to use GPL-ed functions */
855 if (!env->prog->aux->is_gpl_compatible && fn->gpl_only) {
856 verbose("cannot call GPL only function from proprietary program\n");
857 return -EINVAL;
858 }
859
860 /* check args */
861 err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &map);
862 if (err)
863 return err;
864 err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &map);
865 if (err)
866 return err;
867 err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &map);
868 if (err)
869 return err;
870 err = check_func_arg(env, BPF_REG_4, fn->arg4_type, &map);
871 if (err)
872 return err;
873 err = check_func_arg(env, BPF_REG_5, fn->arg5_type, &map);
874 if (err)
875 return err;
876
877 /* reset caller saved regs */
878 for (i = 0; i < CALLER_SAVED_REGS; i++) {
879 reg = regs + caller_saved[i];
880 reg->type = NOT_INIT;
881 reg->imm = 0;
882 }
883
884 /* update return register */
885 if (fn->ret_type == RET_INTEGER) {
886 regs[BPF_REG_0].type = UNKNOWN_VALUE;
887 } else if (fn->ret_type == RET_VOID) {
888 regs[BPF_REG_0].type = NOT_INIT;
889 } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) {
890 regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
891 /* remember map_ptr, so that check_map_access()
892 * can check 'value_size' boundary of memory access
893 * to map element returned from bpf_map_lookup_elem()
894 */
895 if (map == NULL) {
896 verbose("kernel subsystem misconfigured verifier\n");
897 return -EINVAL;
898 }
899 regs[BPF_REG_0].map_ptr = map;
900 } else {
901 verbose("unknown return type %d of func %d\n",
902 fn->ret_type, func_id);
903 return -EINVAL;
904 }
905 return 0;
906}
907
908/* check validity of 32-bit and 64-bit arithmetic operations */
909static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn)
910{
911 u8 opcode = BPF_OP(insn->code);
912 int err;
913
914 if (opcode == BPF_END || opcode == BPF_NEG) {
915 if (opcode == BPF_NEG) {
916 if (BPF_SRC(insn->code) != 0 ||
917 insn->src_reg != BPF_REG_0 ||
918 insn->off != 0 || insn->imm != 0) {
919 verbose("BPF_NEG uses reserved fields\n");
920 return -EINVAL;
921 }
922 } else {
923 if (insn->src_reg != BPF_REG_0 || insn->off != 0 ||
924 (insn->imm != 16 && insn->imm != 32 && insn->imm != 64)) {
925 verbose("BPF_END uses reserved fields\n");
926 return -EINVAL;
927 }
928 }
929
930 /* check src operand */
931 err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
932 if (err)
933 return err;
934
935 /* check dest operand */
936 err = check_reg_arg(regs, insn->dst_reg, DST_OP);
937 if (err)
938 return err;
939
940 } else if (opcode == BPF_MOV) {
941
942 if (BPF_SRC(insn->code) == BPF_X) {
943 if (insn->imm != 0 || insn->off != 0) {
944 verbose("BPF_MOV uses reserved fields\n");
945 return -EINVAL;
946 }
947
948 /* check src operand */
949 err = check_reg_arg(regs, insn->src_reg, SRC_OP);
950 if (err)
951 return err;
952 } else {
953 if (insn->src_reg != BPF_REG_0 || insn->off != 0) {
954 verbose("BPF_MOV uses reserved fields\n");
955 return -EINVAL;
956 }
957 }
958
959 /* check dest operand */
960 err = check_reg_arg(regs, insn->dst_reg, DST_OP);
961 if (err)
962 return err;
963
964 if (BPF_SRC(insn->code) == BPF_X) {
965 if (BPF_CLASS(insn->code) == BPF_ALU64) {
966 /* case: R1 = R2
967 * copy register state to dest reg
968 */
969 regs[insn->dst_reg] = regs[insn->src_reg];
970 } else {
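				/* 32-bit mov truncates the value, so pointer
				 * state from the source cannot be carried
				 * over; mark the destination as unknown
				 */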
971 regs[insn->dst_reg].type = UNKNOWN_VALUE;
972 regs[insn->dst_reg].map_ptr = NULL;
973 }
974 } else {
975 /* case: R = imm
976 * remember the value we stored into this reg
977 */
978 regs[insn->dst_reg].type = CONST_IMM;
979 regs[insn->dst_reg].imm = insn->imm;
980 }
981
982 } else if (opcode > BPF_END) {
983 verbose("invalid BPF_ALU opcode %x\n", opcode);
984 return -EINVAL;
985
986 } else { /* all other ALU ops: and, sub, xor, add, ... */
987
988 bool stack_relative = false;
989
990 if (BPF_SRC(insn->code) == BPF_X) {
991 if (insn->imm != 0 || insn->off != 0) {
992 verbose("BPF_ALU uses reserved fields\n");
993 return -EINVAL;
994 }
995 /* check src1 operand */
996 err = check_reg_arg(regs, insn->src_reg, SRC_OP);
997 if (err)
998 return err;
999 } else {
1000 if (insn->src_reg != BPF_REG_0 || insn->off != 0) {
1001 verbose("BPF_ALU uses reserved fields\n");
1002 return -EINVAL;
1003 }
1004 }
1005
1006 /* check src2 operand */
1007 err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
1008 if (err)
1009 return err;
1010
1011 if ((opcode == BPF_MOD || opcode == BPF_DIV) &&
1012 BPF_SRC(insn->code) == BPF_K && insn->imm == 0) {
1013 verbose("div by zero\n");
1014 return -EINVAL;
1015 }
1016
1017 /* pattern match 'bpf_add Rx, imm' instruction */
1018 if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 &&
1019 regs[insn->dst_reg].type == FRAME_PTR &&
1020 BPF_SRC(insn->code) == BPF_K)
1021 stack_relative = true;
1022
1023 /* check dest operand */
1024 err = check_reg_arg(regs, insn->dst_reg, DST_OP);
1025 if (err)
1026 return err;
1027
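		/* FRAME_PTR + constant yields a pointer into the stack;
		 * record the constant in 'imm' so later memory accesses
		 * through this register can be bounds checked
		 */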
1028 if (stack_relative) {
1029 regs[insn->dst_reg].type = PTR_TO_STACK;
1030 regs[insn->dst_reg].imm = insn->imm;
1031 }
1032 }
1033
1034 return 0;
1035}
1036
1037static int check_cond_jmp_op(struct verifier_env *env,
1038 struct bpf_insn *insn, int *insn_idx)
1039{
1040 struct reg_state *regs = env->cur_state.regs;
1041 struct verifier_state *other_branch;
1042 u8 opcode = BPF_OP(insn->code);
1043 int err;
1044
1045 if (opcode > BPF_EXIT) {
1046 verbose("invalid BPF_JMP opcode %x\n", opcode);
1047 return -EINVAL;
1048 }
1049
1050 if (BPF_SRC(insn->code) == BPF_X) {
1051 if (insn->imm != 0) {
1052 verbose("BPF_JMP uses reserved fields\n");
1053 return -EINVAL;
1054 }
1055
1056 /* check src1 operand */
1057 err = check_reg_arg(regs, insn->src_reg, SRC_OP);
1058 if (err)
1059 return err;
1060 } else {
1061 if (insn->src_reg != BPF_REG_0) {
1062 verbose("BPF_JMP uses reserved fields\n");
1063 return -EINVAL;
1064 }
1065 }
1066
1067 /* check src2 operand */
1068 err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
1069 if (err)
1070 return err;
1071
1072 /* detect if R == 0 where R was initialized to zero earlier */
1073 if (BPF_SRC(insn->code) == BPF_K &&
1074 (opcode == BPF_JEQ || opcode == BPF_JNE) &&
1075 regs[insn->dst_reg].type == CONST_IMM &&
1076 regs[insn->dst_reg].imm == insn->imm) {
1077 if (opcode == BPF_JEQ) {
1078 /* if (imm == imm) goto pc+off;
1079 * only follow the goto, ignore fall-through
1080 */
1081 *insn_idx += insn->off;
1082 return 0;
1083 } else {
1084 /* if (imm != imm) goto pc+off;
1085 * only follow fall-through branch, since
1086 * that's where the program will go
1087 */
1088 return 0;
1089 }
1090 }
1091
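	/* queue the branch-taken path as a separate state to explore later;
	 * verification continues along the fall-through path first
	 */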
1092 other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx);
1093 if (!other_branch)
1094 return -EFAULT;
1095
1096 /* detect if R == 0 where R is returned value from bpf_map_lookup_elem() */
1097 if (BPF_SRC(insn->code) == BPF_K &&
1098 insn->imm == 0 && (opcode == BPF_JEQ ||
1099 opcode == BPF_JNE) &&
1100 regs[insn->dst_reg].type == PTR_TO_MAP_VALUE_OR_NULL) {
1101 if (opcode == BPF_JEQ) {
1102 /* next fallthrough insn can access memory via
1103 * this register
1104 */
1105 regs[insn->dst_reg].type = PTR_TO_MAP_VALUE;
1106			/* branch target cannot access it, since reg == 0 */
1107 other_branch->regs[insn->dst_reg].type = CONST_IMM;
1108 other_branch->regs[insn->dst_reg].imm = 0;
1109 } else {
1110 other_branch->regs[insn->dst_reg].type = PTR_TO_MAP_VALUE;
1111 regs[insn->dst_reg].type = CONST_IMM;
1112 regs[insn->dst_reg].imm = 0;
1113 }
1114 } else if (BPF_SRC(insn->code) == BPF_K &&
1115 (opcode == BPF_JEQ || opcode == BPF_JNE)) {
1116
1117 if (opcode == BPF_JEQ) {
1118 /* detect if (R == imm) goto
1119 * and in the target state recognize that R = imm
1120 */
1121 other_branch->regs[insn->dst_reg].type = CONST_IMM;
1122 other_branch->regs[insn->dst_reg].imm = insn->imm;
1123 } else {
1124 /* detect if (R != imm) goto
1125 * and in the fall-through state recognize that R = imm
1126 */
1127 regs[insn->dst_reg].type = CONST_IMM;
1128 regs[insn->dst_reg].imm = insn->imm;
1129 }
1130 }
1131 if (log_level)
1132 print_verifier_state(env);
1133 return 0;
1134}
1135
1136/* return the map pointer stored inside BPF_LD_IMM64 instruction */
1137static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn)
1138{
1139 u64 imm64 = ((u64) (u32) insn[0].imm) | ((u64) (u32) insn[1].imm) << 32;
1140
1141 return (struct bpf_map *) (unsigned long) imm64;
1142}
1143
1144/* verify BPF_LD_IMM64 instruction */
1145static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn)
1146{
1147 struct reg_state *regs = env->cur_state.regs;
1148 int err;
1149
1150 if (BPF_SIZE(insn->code) != BPF_DW) {
1151 verbose("invalid BPF_LD_IMM insn\n");
1152 return -EINVAL;
1153 }
1154 if (insn->off != 0) {
1155 verbose("BPF_LD_IMM64 uses reserved fields\n");
1156 return -EINVAL;
1157 }
1158
1159 err = check_reg_arg(regs, insn->dst_reg, DST_OP);
1160 if (err)
1161 return err;
1162
1163 if (insn->src_reg == 0)
1164 /* generic move 64-bit immediate into a register */
1165 return 0;
1166
1167 /* replace_map_fd_with_map_ptr() should have caught bad ld_imm64 */
1168 BUG_ON(insn->src_reg != BPF_PSEUDO_MAP_FD);
1169
1170 regs[insn->dst_reg].type = CONST_PTR_TO_MAP;
1171 regs[insn->dst_reg].map_ptr = ld_imm64_to_map_ptr(insn);
1172 return 0;
1173}
1174
1175/* verify safety of LD_ABS|LD_IND instructions:
1176 * - they can only appear in the programs where ctx == skb
1177 * - since they are wrappers of function calls, they scratch R1-R5 registers,
1178 * preserve R6-R9, and store return value into R0
1179 *
1180 * Implicit input:
1181 * ctx == skb == R6 == CTX
1182 *
1183 * Explicit input:
1184 * SRC == any register
1185 * IMM == 32-bit immediate
1186 *
1187 * Output:
1188 * R0 - 8/16/32-bit skb data converted to cpu endianness
1189 */
1190static int check_ld_abs(struct verifier_env *env, struct bpf_insn *insn)
1191{
1192 struct reg_state *regs = env->cur_state.regs;
1193 u8 mode = BPF_MODE(insn->code);
1194 struct reg_state *reg;
1195 int i, err;
1196
1197 if (env->prog->aux->prog_type != BPF_PROG_TYPE_SOCKET_FILTER) {
1198 verbose("BPF_LD_ABS|IND instructions are only allowed in socket filters\n");
1199 return -EINVAL;
1200 }
1201
1202 if (insn->dst_reg != BPF_REG_0 || insn->off != 0 ||
1203 (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) {
1204 verbose("BPF_LD_ABS uses reserved fields\n");
1205 return -EINVAL;
1206 }
1207
1208 /* check whether implicit source operand (register R6) is readable */
1209 err = check_reg_arg(regs, BPF_REG_6, SRC_OP);
1210 if (err)
1211 return err;
1212
1213 if (regs[BPF_REG_6].type != PTR_TO_CTX) {
1214 verbose("at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
1215 return -EINVAL;
1216 }
1217
1218 if (mode == BPF_IND) {
1219 /* check explicit source operand */
1220 err = check_reg_arg(regs, insn->src_reg, SRC_OP);
1221 if (err)
1222 return err;
1223 }
1224
1225 /* reset caller saved regs to unreadable */
1226 for (i = 0; i < CALLER_SAVED_REGS; i++) {
1227 reg = regs + caller_saved[i];
1228 reg->type = NOT_INIT;
1229 reg->imm = 0;
1230 }
1231
1232 /* mark destination R0 register as readable, since it contains
1233 * the value fetched from the packet
1234 */
1235 regs[BPF_REG_0].type = UNKNOWN_VALUE;
1236 return 0;
1237}
1238
1239/* non-recursive DFS pseudo code
1240 * 1 procedure DFS-iterative(G,v):
1241 * 2 label v as discovered
1242 * 3 let S be a stack
1243 * 4 S.push(v)
1244 * 5 while S is not empty
1245 * 6 t <- S.pop()
1246 * 7 if t is what we're looking for:
1247 * 8 return t
1248 * 9 for all edges e in G.adjacentEdges(t) do
1249 * 10 if edge e is already labelled
1250 * 11 continue with the next edge
1251 * 12 w <- G.adjacentVertex(t,e)
1252 * 13 if vertex w is not discovered and not explored
1253 * 14 label e as tree-edge
1254 * 15 label w as discovered
1255 * 16 S.push(w)
1256 * 17 continue at 5
1257 * 18 else if vertex w is discovered
1258 * 19 label e as back-edge
1259 * 20 else
1260 * 21 // vertex w is explored
1261 * 22 label e as forward- or cross-edge
1262 * 23 label t as explored
1263 * 24 S.pop()
1264 *
1265 * convention:
1266 * 0x10 - discovered
1267 * 0x11 - discovered and fall-through edge labelled
1268 * 0x12 - discovered and fall-through and branch edges labelled
1269 * 0x20 - explored
1270 */
1271
1272enum {
1273 DISCOVERED = 0x10,
1274 EXPLORED = 0x20,
1275 FALLTHROUGH = 1,
1276 BRANCH = 2,
1277};
1278
1279#define STATE_LIST_MARK ((struct verifier_state_list *) -1L)
1280
1281static int *insn_stack; /* stack of insns to process */
1282static int cur_stack; /* current stack index */
1283static int *insn_state;
1284
1285/* t, w, e - match pseudo-code above:
1286 * t - index of current instruction
1287 * w - next instruction
1288 * e - edge
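 *
 * returns 1 if a new vertex was pushed onto the stack, 0 if the edge was
 * already labelled or leads to an explored vertex, negative error otherwise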
1289 */
1290static int push_insn(int t, int w, int e, struct verifier_env *env)
1291{
1292 if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH))
1293 return 0;
1294
1295 if (e == BRANCH && insn_state[t] >= (DISCOVERED | BRANCH))
1296 return 0;
1297
1298 if (w < 0 || w >= env->prog->len) {
1299 verbose("jump out of range from insn %d to %d\n", t, w);
1300 return -EINVAL;
1301 }
1302
1303 if (e == BRANCH)
1304 /* mark branch target for state pruning */
1305 env->explored_states[w] = STATE_LIST_MARK;
1306
1307 if (insn_state[w] == 0) {
1308 /* tree-edge */
1309 insn_state[t] = DISCOVERED | e;
1310 insn_state[w] = DISCOVERED;
1311 if (cur_stack >= env->prog->len)
1312 return -E2BIG;
1313 insn_stack[cur_stack++] = w;
1314 return 1;
1315 } else if ((insn_state[w] & 0xF0) == DISCOVERED) {
1316 verbose("back-edge from insn %d to %d\n", t, w);
1317 return -EINVAL;
1318 } else if (insn_state[w] == EXPLORED) {
1319 /* forward- or cross-edge */
1320 insn_state[t] = DISCOVERED | e;
1321 } else {
1322 verbose("insn state internal bug\n");
1323 return -EFAULT;
1324 }
1325 return 0;
1326}
1327
1328/* non-recursive depth-first-search to detect loops in BPF program
1329 * loop == back-edge in directed graph
1330 */
1331static int check_cfg(struct verifier_env *env)
1332{
1333 struct bpf_insn *insns = env->prog->insnsi;
1334 int insn_cnt = env->prog->len;
1335 int ret = 0;
1336 int i, t;
1337
1338 insn_state = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
1339 if (!insn_state)
1340 return -ENOMEM;
1341
1342 insn_stack = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
1343 if (!insn_stack) {
1344 kfree(insn_state);
1345 return -ENOMEM;
1346 }
1347
1348 insn_state[0] = DISCOVERED; /* mark 1st insn as discovered */
1349 insn_stack[0] = 0; /* 0 is the first instruction */
1350 cur_stack = 1;
1351
1352peek_stack:
1353 if (cur_stack == 0)
1354 goto check_state;
1355 t = insn_stack[cur_stack - 1];
1356
1357 if (BPF_CLASS(insns[t].code) == BPF_JMP) {
1358 u8 opcode = BPF_OP(insns[t].code);
1359
1360 if (opcode == BPF_EXIT) {
1361 goto mark_explored;
1362 } else if (opcode == BPF_CALL) {
1363 ret = push_insn(t, t + 1, FALLTHROUGH, env);
1364 if (ret == 1)
1365 goto peek_stack;
1366 else if (ret < 0)
1367 goto err_free;
1368 } else if (opcode == BPF_JA) {
1369 if (BPF_SRC(insns[t].code) != BPF_K) {
1370 ret = -EINVAL;
1371 goto err_free;
1372 }
1373 /* unconditional jump with single edge */
1374 ret = push_insn(t, t + insns[t].off + 1,
1375 FALLTHROUGH, env);
1376 if (ret == 1)
1377 goto peek_stack;
1378 else if (ret < 0)
1379 goto err_free;
1380 /* tell verifier to check for equivalent states
1381 * after every call and jump
1382 */
1383 env->explored_states[t + 1] = STATE_LIST_MARK;
1384 } else {
1385 /* conditional jump with two edges */
1386 ret = push_insn(t, t + 1, FALLTHROUGH, env);
1387 if (ret == 1)
1388 goto peek_stack;
1389 else if (ret < 0)
1390 goto err_free;
1391
1392 ret = push_insn(t, t + insns[t].off + 1, BRANCH, env);
1393 if (ret == 1)
1394 goto peek_stack;
1395 else if (ret < 0)
1396 goto err_free;
1397 }
1398 } else {
1399 /* all other non-branch instructions with single
1400 * fall-through edge
1401 */
1402 ret = push_insn(t, t + 1, FALLTHROUGH, env);
1403 if (ret == 1)
1404 goto peek_stack;
1405 else if (ret < 0)
1406 goto err_free;
1407 }
1408
1409mark_explored:
1410 insn_state[t] = EXPLORED;
1411 if (cur_stack-- <= 0) {
1412 verbose("pop stack internal bug\n");
1413 ret = -EFAULT;
1414 goto err_free;
1415 }
1416 goto peek_stack;
1417
1418check_state:
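	/* every instruction must have been explored by the DFS above,
	 * otherwise it is unreachable and the program is rejected
	 */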
1419 for (i = 0; i < insn_cnt; i++) {
1420 if (insn_state[i] != EXPLORED) {
1421 verbose("unreachable insn %d\n", i);
1422 ret = -EINVAL;
1423 goto err_free;
1424 }
1425 }
1426 ret = 0; /* cfg looks good */
1427
1428err_free:
1429 kfree(insn_state);
1430 kfree(insn_stack);
1431 return ret;
1432}
1433
1434/* compare two verifier states
1435 *
1436 * all states stored in state_list are known to be valid, since
1437 * verifier reached 'bpf_exit' instruction through them
1438 *
1439 * this function is called when the verifier explores different branches of
1440 * execution popped from the state stack. If it sees an old state that has
1441 * a more strict register state and a more strict stack state, then this
1442 * execution branch doesn't need to be explored further, since the verifier
1443 * already concluded that the more strict state leads to a valid finish.
1444 *
1445 * Therefore two states are equivalent if register state is more conservative
1446 * and explored stack state is more conservative than the current one.
1447 * Example:
1448 * explored current
1449 * (slot1=INV slot2=MISC) == (slot1=MISC slot2=MISC)
1450 * (slot1=MISC slot2=MISC) != (slot1=INV slot2=MISC)
1451 *
1452 * In other words if current stack state (one being explored) has more
1453 * valid slots than old one that already passed validation, it means
1454 * the verifier can stop exploring and conclude that current state is valid too
1455 *
1456 * Similarly with registers. If explored state has register type as invalid
1457 * whereas register type in current state is meaningful, it means that
1458 * the current state will reach 'bpf_exit' instruction safely
1459 */
1460static bool states_equal(struct verifier_state *old, struct verifier_state *cur)
1461{
1462 int i;
1463
1464 for (i = 0; i < MAX_BPF_REG; i++) {
1465 if (memcmp(&old->regs[i], &cur->regs[i],
1466 sizeof(old->regs[0])) != 0) {
1467 if (old->regs[i].type == NOT_INIT ||
1468 (old->regs[i].type == UNKNOWN_VALUE &&
1469 cur->regs[i].type != NOT_INIT))
1470 continue;
1471 return false;
1472 }
1473 }
1474
1475 for (i = 0; i < MAX_BPF_STACK; i++) {
1476 if (old->stack_slot_type[i] == STACK_INVALID)
1477 continue;
1478 if (old->stack_slot_type[i] != cur->stack_slot_type[i])
1479 /* Ex: old explored (safe) state has STACK_SPILL in
1480			 * this stack slot, but current has STACK_MISC ->
1481			 * these verifier states are not equivalent;
1482 * return false to continue verification of this path
1483 */
1484 return false;
1485 if (i % BPF_REG_SIZE)
1486 continue;
1487 if (memcmp(&old->spilled_regs[i / BPF_REG_SIZE],
1488 &cur->spilled_regs[i / BPF_REG_SIZE],
1489 sizeof(old->spilled_regs[0])))
1490 /* when explored and current stack slot types are
1491 * the same, check that stored pointers types
1492 * are the same as well.
1493 * Ex: explored safe path could have stored
1494 * (struct reg_state) {.type = PTR_TO_STACK, .imm = -8}
1495 * but current path has stored:
1496 * (struct reg_state) {.type = PTR_TO_STACK, .imm = -16}
1497 * such verifier states are not equivalent.
1498 * return false to continue verification of this path
1499 */
1500 return false;
1501 else
1502 continue;
1503 }
1504 return true;
1505}
1506
1507static int is_state_visited(struct verifier_env *env, int insn_idx)
1508{
1509 struct verifier_state_list *new_sl;
1510 struct verifier_state_list *sl;
1511
1512 sl = env->explored_states[insn_idx];
1513 if (!sl)
1514 /* this 'insn_idx' instruction wasn't marked, so we will not
1515 * be doing state search here
1516 */
1517 return 0;
1518
1519 while (sl != STATE_LIST_MARK) {
1520 if (states_equal(&sl->state, &env->cur_state))
1521 /* reached equivalent register/stack state,
1522 * prune the search
1523 */
1524 return 1;
1525 sl = sl->next;
1526 }
1527
1528 /* there were no equivalent states, remember current one.
1529 * technically the current state is not proven to be safe yet,
1530 * but it will either reach bpf_exit (which means it's safe) or
1531 * it will be rejected. Since there are no loops, we won't be
1532 * seeing this 'insn_idx' instruction again on the way to bpf_exit
1533 */
1534 new_sl = kmalloc(sizeof(struct verifier_state_list), GFP_USER);
1535 if (!new_sl)
1536 return -ENOMEM;
1537
1538 /* add new state to the head of linked list */
1539 memcpy(&new_sl->state, &env->cur_state, sizeof(env->cur_state));
1540 new_sl->next = env->explored_states[insn_idx];
1541 env->explored_states[insn_idx] = new_sl;
1542 return 0;
1543}
1544
1545static int do_check(struct verifier_env *env)
1546{
1547 struct verifier_state *state = &env->cur_state;
1548 struct bpf_insn *insns = env->prog->insnsi;
1549 struct reg_state *regs = state->regs;
1550 int insn_cnt = env->prog->len;
1551 int insn_idx, prev_insn_idx = 0;
1552 int insn_processed = 0;
1553 bool do_print_state = false;
1554
1555 init_reg_state(regs);
1556 insn_idx = 0;
1557 for (;;) {
1558 struct bpf_insn *insn;
1559 u8 class;
1560 int err;
1561
1562 if (insn_idx >= insn_cnt) {
1563 verbose("invalid insn idx %d insn_cnt %d\n",
1564 insn_idx, insn_cnt);
1565 return -EFAULT;
1566 }
1567
1568 insn = &insns[insn_idx];
1569 class = BPF_CLASS(insn->code);
1570
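		/* bound total verification work: this counts every instruction
		 * visited across all explored branches, not just program length
		 */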
1571 if (++insn_processed > 32768) {
1572			verbose("BPF program is too large. Processed %d insn\n",
1573 insn_processed);
1574 return -E2BIG;
1575 }
1576
1577 err = is_state_visited(env, insn_idx);
1578 if (err < 0)
1579 return err;
1580 if (err == 1) {
1581 /* found equivalent state, can prune the search */
1582 if (log_level) {
1583 if (do_print_state)
1584 verbose("\nfrom %d to %d: safe\n",
1585 prev_insn_idx, insn_idx);
1586 else
1587 verbose("%d: safe\n", insn_idx);
1588 }
1589 goto process_bpf_exit;
1590 }
1591
1592 if (log_level && do_print_state) {
1593 verbose("\nfrom %d to %d:", prev_insn_idx, insn_idx);
1594 print_verifier_state(env);
1595 do_print_state = false;
1596 }
1597
1598 if (log_level) {
1599 verbose("%d: ", insn_idx);
1600 print_bpf_insn(insn);
1601 }
1602
1603 if (class == BPF_ALU || class == BPF_ALU64) {
1604 err = check_alu_op(regs, insn);
1605 if (err)
1606 return err;
1607
1608 } else if (class == BPF_LDX) {
1609 if (BPF_MODE(insn->code) != BPF_MEM ||
1610 insn->imm != 0) {
1611 verbose("BPF_LDX uses reserved fields\n");
1612 return -EINVAL;
1613 }
1614 /* check src operand */
1615 err = check_reg_arg(regs, insn->src_reg, SRC_OP);
1616 if (err)
1617 return err;
1618
1619 err = check_reg_arg(regs, insn->dst_reg, DST_OP_NO_MARK);
1620 if (err)
1621 return err;
1622
1623 /* check that memory (src_reg + off) is readable,
1624 * the state of dst_reg will be updated by this func
1625 */
1626 err = check_mem_access(env, insn->src_reg, insn->off,
1627 BPF_SIZE(insn->code), BPF_READ,
1628 insn->dst_reg);
1629 if (err)
1630 return err;
1631
1632 } else if (class == BPF_STX) {
1633 if (BPF_MODE(insn->code) == BPF_XADD) {
1634 err = check_xadd(env, insn);
1635 if (err)
1636 return err;
1637 insn_idx++;
1638 continue;
1639 }
1640
1641 if (BPF_MODE(insn->code) != BPF_MEM ||
1642 insn->imm != 0) {
1643 verbose("BPF_STX uses reserved fields\n");
1644 return -EINVAL;
1645 }
1646 /* check src1 operand */
1647 err = check_reg_arg(regs, insn->src_reg, SRC_OP);
1648 if (err)
1649 return err;
1650 /* check src2 operand */
1651 err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
1652 if (err)
1653 return err;
1654
1655 /* check that memory (dst_reg + off) is writeable */
1656 err = check_mem_access(env, insn->dst_reg, insn->off,
1657 BPF_SIZE(insn->code), BPF_WRITE,
1658 insn->src_reg);
1659 if (err)
1660 return err;
1661
1662 } else if (class == BPF_ST) {
1663 if (BPF_MODE(insn->code) != BPF_MEM ||
1664 insn->src_reg != BPF_REG_0) {
1665 verbose("BPF_ST uses reserved fields\n");
1666 return -EINVAL;
1667 }
1668 /* check src operand */
1669 err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
1670 if (err)
1671 return err;
1672
1673 /* check that memory (dst_reg + off) is writeable */
1674 err = check_mem_access(env, insn->dst_reg, insn->off,
1675 BPF_SIZE(insn->code), BPF_WRITE,
1676 -1);
1677 if (err)
1678 return err;
1679
1680 } else if (class == BPF_JMP) {
1681 u8 opcode = BPF_OP(insn->code);
1682
1683 if (opcode == BPF_CALL) {
1684 if (BPF_SRC(insn->code) != BPF_K ||
1685 insn->off != 0 ||
1686 insn->src_reg != BPF_REG_0 ||
1687 insn->dst_reg != BPF_REG_0) {
1688 verbose("BPF_CALL uses reserved fields\n");
1689 return -EINVAL;
1690 }
1691
1692 err = check_call(env, insn->imm);
1693 if (err)
1694 return err;
1695
1696 } else if (opcode == BPF_JA) {
1697 if (BPF_SRC(insn->code) != BPF_K ||
1698 insn->imm != 0 ||
1699 insn->src_reg != BPF_REG_0 ||
1700 insn->dst_reg != BPF_REG_0) {
1701 verbose("BPF_JA uses reserved fields\n");
1702 return -EINVAL;
1703 }
1704
1705 insn_idx += insn->off + 1;
1706 continue;
1707
1708 } else if (opcode == BPF_EXIT) {
1709 if (BPF_SRC(insn->code) != BPF_K ||
1710 insn->imm != 0 ||
1711 insn->src_reg != BPF_REG_0 ||
1712 insn->dst_reg != BPF_REG_0) {
1713 verbose("BPF_EXIT uses reserved fields\n");
1714 return -EINVAL;
1715 }
1716
1717				/* eBPF calling convention is such that R0 is used
1718				 * to return the value from the eBPF program.
1719 * Make sure that it's readable at this time
1720 * of bpf_exit, which means that program wrote
1721 * something into it earlier
1722 */
1723 err = check_reg_arg(regs, BPF_REG_0, SRC_OP);
1724 if (err)
1725 return err;
1726
1727process_bpf_exit:
1728 insn_idx = pop_stack(env, &prev_insn_idx);
1729 if (insn_idx < 0) {
1730 break;
1731 } else {
1732 do_print_state = true;
1733 continue;
1734 }
1735 } else {
1736 err = check_cond_jmp_op(env, insn, &insn_idx);
1737 if (err)
1738 return err;
1739 }
1740 } else if (class == BPF_LD) {
1741 u8 mode = BPF_MODE(insn->code);
1742
1743 if (mode == BPF_ABS || mode == BPF_IND) {
1744 err = check_ld_abs(env, insn);
1745 if (err)
1746 return err;
1747
1748 } else if (mode == BPF_IMM) {
1749 err = check_ld_imm(env, insn);
1750 if (err)
1751 return err;
1752
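				/* BPF_LD_IMM64 occupies two instruction
				 * slots; skip over its second half
				 */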
1753 insn_idx++;
1754 } else {
1755 verbose("invalid BPF_LD mode\n");
1756 return -EINVAL;
1757 }
1758 } else {
1759 verbose("unknown insn class %d\n", class);
1760 return -EINVAL;
1761 }
1762
1763 insn_idx++;
1764 }
1765
1766 return 0;
1767}
1768
1769/* look for pseudo eBPF instructions that access map FDs and
1770 * replace them with actual map pointers
1771 */
1772static int replace_map_fd_with_map_ptr(struct verifier_env *env)
1773{
1774 struct bpf_insn *insn = env->prog->insnsi;
1775 int insn_cnt = env->prog->len;
1776 int i, j;
1777
1778 for (i = 0; i < insn_cnt; i++, insn++) {
1779 if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) {
1780 struct bpf_map *map;
1781 struct fd f;
1782
1783 if (i == insn_cnt - 1 || insn[1].code != 0 ||
1784 insn[1].dst_reg != 0 || insn[1].src_reg != 0 ||
1785 insn[1].off != 0) {
1786 verbose("invalid bpf_ld_imm64 insn\n");
1787 return -EINVAL;
1788 }
1789
1790 if (insn->src_reg == 0)
1791 /* valid generic load 64-bit imm */
1792 goto next_insn;
1793
1794 if (insn->src_reg != BPF_PSEUDO_MAP_FD) {
1795 verbose("unrecognized bpf_ld_imm64 insn\n");
1796 return -EINVAL;
1797 }
1798
1799 f = fdget(insn->imm);
1800
1801 map = bpf_map_get(f);
1802 if (IS_ERR(map)) {
1803 verbose("fd %d is not pointing to valid bpf_map\n",
1804 insn->imm);
1805 fdput(f);
1806 return PTR_ERR(map);
1807 }
1808
1809 /* store map pointer inside BPF_LD_IMM64 instruction */
1810 insn[0].imm = (u32) (unsigned long) map;
1811 insn[1].imm = ((u64) (unsigned long) map) >> 32;
1812
1813 /* check whether we recorded this map already */
1814 for (j = 0; j < env->used_map_cnt; j++)
1815 if (env->used_maps[j] == map) {
1816 fdput(f);
1817 goto next_insn;
1818 }
1819
1820 if (env->used_map_cnt >= MAX_USED_MAPS) {
1821 fdput(f);
1822 return -E2BIG;
1823 }
1824
1825 /* remember this map */
1826 env->used_maps[env->used_map_cnt++] = map;
1827
1828			/* hold the map. If the program is rejected by the verifier,
1829 * the map will be released by release_maps() or it
1830 * will be used by the valid program until it's unloaded
1831 * and all maps are released in free_bpf_prog_info()
1832 */
1833 atomic_inc(&map->refcnt);
1834
1835 fdput(f);
1836next_insn:
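			/* the ld_imm64 spans two struct bpf_insn entries;
			 * advance past the second one
			 */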
1837 insn++;
1838 i++;
1839 }
1840 }
1841
1842 /* now all pseudo BPF_LD_IMM64 instructions load valid
1843 * 'struct bpf_map *' into a register instead of user map_fd.
1844	 * These pointers will be used later by the verifier to validate map accesses.
1845 */
1846 return 0;
1847}
1848
1849/* drop refcnt of maps used by the rejected program */
1850static void release_maps(struct verifier_env *env)
1851{
1852 int i;
1853
1854 for (i = 0; i < env->used_map_cnt; i++)
1855 bpf_map_put(env->used_maps[i]);
1856}
1857
1858/* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */
1859static void convert_pseudo_ld_imm64(struct verifier_env *env)
1860{
1861 struct bpf_insn *insn = env->prog->insnsi;
1862 int insn_cnt = env->prog->len;
1863 int i;
1864
1865 for (i = 0; i < insn_cnt; i++, insn++)
1866 if (insn->code == (BPF_LD | BPF_IMM | BPF_DW))
1867 insn->src_reg = 0;
1868}
1869
1870static void free_states(struct verifier_env *env)
1871{
1872 struct verifier_state_list *sl, *sln;
1873 int i;
1874
1875 if (!env->explored_states)
1876 return;
1877
1878 for (i = 0; i < env->prog->len; i++) {
1879 sl = env->explored_states[i];
1880
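		/* STATE_LIST_MARK is a sentinel value, not an allocated node,
		 * so the walk below must stop when it reaches the mark
		 */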
1881 if (sl)
1882 while (sl != STATE_LIST_MARK) {
1883 sln = sl->next;
1884 kfree(sl);
1885 sl = sln;
1886 }
1887 }
1888
1889 kfree(env->explored_states);
1890}
1891
1892int bpf_check(struct bpf_prog *prog, union bpf_attr *attr)
1893{
1894 char __user *log_ubuf = NULL;
1895 struct verifier_env *env;
1896 int ret = -EINVAL;
1897
1898 if (prog->len <= 0 || prog->len > BPF_MAXINSNS)
1899 return -E2BIG;
1900
1901 /* 'struct verifier_env' can be global, but since it's not small,
1902 * allocate/free it every time bpf_check() is called
1903 */
1904 env = kzalloc(sizeof(struct verifier_env), GFP_KERNEL);
1905 if (!env)
1906 return -ENOMEM;
1907
1908 env->prog = prog;
1909
1910	/* grab the mutex to protect the few globals used by the verifier */
1911 mutex_lock(&bpf_verifier_lock);
1912
1913 if (attr->log_level || attr->log_buf || attr->log_size) {
1914 /* user requested verbose verifier output
1915 * and supplied buffer to store the verification trace
1916 */
1917 log_level = attr->log_level;
1918 log_ubuf = (char __user *) (unsigned long) attr->log_buf;
1919 log_size = attr->log_size;
1920 log_len = 0;
1921
1922 ret = -EINVAL;
1923 /* log_* values have to be sane */
1924 if (log_size < 128 || log_size > UINT_MAX >> 8 ||
1925 log_level == 0 || log_ubuf == NULL)
1926 goto free_env;
1927
1928 ret = -ENOMEM;
1929 log_buf = vmalloc(log_size);
1930 if (!log_buf)
1931 goto free_env;
1932 } else {
1933 log_level = 0;
1934 }
1935
1936 ret = replace_map_fd_with_map_ptr(env);
1937 if (ret < 0)
1938 goto skip_full_check;
1939
1940 env->explored_states = kcalloc(prog->len,
1941 sizeof(struct verifier_state_list *),
1942 GFP_USER);
1943 ret = -ENOMEM;
1944 if (!env->explored_states)
1945 goto skip_full_check;
1946
1947 ret = check_cfg(env);
1948 if (ret < 0)
1949 goto skip_full_check;
1950
1951 ret = do_check(env);
1952
1953skip_full_check:
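	/* discard any branch states still queued when verification stops
	 * early
	 */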
1954 while (pop_stack(env, NULL) >= 0);
1955 free_states(env);
1956
1957 if (log_level && log_len >= log_size - 1) {
1958 BUG_ON(log_len >= log_size);
1959 /* verifier log exceeded user supplied buffer */
1960 ret = -ENOSPC;
1961 /* fall through to return what was recorded */
1962 }
1963
1964 /* copy verifier log back to user space including trailing zero */
1965 if (log_level && copy_to_user(log_ubuf, log_buf, log_len + 1) != 0) {
1966 ret = -EFAULT;
1967 goto free_log_buf;
1968 }
1969
1970 if (ret == 0 && env->used_map_cnt) {
1971 /* if program passed verifier, update used_maps in bpf_prog_info */
1972 prog->aux->used_maps = kmalloc_array(env->used_map_cnt,
1973 sizeof(env->used_maps[0]),
1974 GFP_KERNEL);
1975
1976 if (!prog->aux->used_maps) {
1977 ret = -ENOMEM;
1978 goto free_log_buf;
1979 }
1980
1981 memcpy(prog->aux->used_maps, env->used_maps,
1982 sizeof(env->used_maps[0]) * env->used_map_cnt);
1983 prog->aux->used_map_cnt = env->used_map_cnt;
1984
1985 /* program is valid. Convert pseudo bpf_ld_imm64 into generic
1986 * bpf_ld_imm64 instructions
1987 */
1988 convert_pseudo_ld_imm64(env);
1989 }
1990
1991free_log_buf:
1992 if (log_level)
1993 vfree(log_buf);
1994free_env:
1995 if (!prog->aux->used_maps)
1996 /* if we didn't copy map pointers into bpf_prog_info, release
1997 * them now. Otherwise free_bpf_prog_info() will release them.
1998 */
1999 release_maps(env);
2000 kfree(env);
2001 mutex_unlock(&bpf_verifier_lock);
2002 return ret;
2003}
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 3a73f995a81e..bb263d0caab3 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -185,7 +185,6 @@ static int need_forkexit_callback __read_mostly;
185static struct cftype cgroup_dfl_base_files[]; 185static struct cftype cgroup_dfl_base_files[];
186static struct cftype cgroup_legacy_base_files[]; 186static struct cftype cgroup_legacy_base_files[];
187 187
188static void cgroup_put(struct cgroup *cgrp);
189static int rebind_subsystems(struct cgroup_root *dst_root, 188static int rebind_subsystems(struct cgroup_root *dst_root,
190 unsigned int ss_mask); 189 unsigned int ss_mask);
191static int cgroup_destroy_locked(struct cgroup *cgrp); 190static int cgroup_destroy_locked(struct cgroup *cgrp);
@@ -195,7 +194,6 @@ static void css_release(struct percpu_ref *ref);
195static void kill_css(struct cgroup_subsys_state *css); 194static void kill_css(struct cgroup_subsys_state *css);
196static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], 195static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
197 bool is_add); 196 bool is_add);
198static void cgroup_pidlist_destroy_all(struct cgroup *cgrp);
199 197
200/* IDR wrappers which synchronize using cgroup_idr_lock */ 198/* IDR wrappers which synchronize using cgroup_idr_lock */
201static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end, 199static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
@@ -279,6 +277,10 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
279 if (!(cgrp->root->subsys_mask & (1 << ss->id))) 277 if (!(cgrp->root->subsys_mask & (1 << ss->id)))
280 return NULL; 278 return NULL;
281 279
280 /*
281 * This function is used while updating css associations and thus
282 * can't test the csses directly. Use ->child_subsys_mask.
283 */
282 while (cgroup_parent(cgrp) && 284 while (cgroup_parent(cgrp) &&
283 !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ss->id))) 285 !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ss->id)))
284 cgrp = cgroup_parent(cgrp); 286 cgrp = cgroup_parent(cgrp);
@@ -286,6 +288,39 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
286 return cgroup_css(cgrp, ss); 288 return cgroup_css(cgrp, ss);
287} 289}
288 290
291/**
292 * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem
293 * @cgrp: the cgroup of interest
294 * @ss: the subsystem of interest
295 *
296 * Find and get the effective css of @cgrp for @ss. The effective css is
297 * defined as the matching css of the nearest ancestor including self which
298 * has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on,
299 * the root css is returned, so this function always returns a valid css.
300 * The returned css must be put using css_put().
301 */
302struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp,
303 struct cgroup_subsys *ss)
304{
305 struct cgroup_subsys_state *css;
306
307 rcu_read_lock();
308
309 do {
310 css = cgroup_css(cgrp, ss);
311
312 if (css && css_tryget_online(css))
313 goto out_unlock;
314 cgrp = cgroup_parent(cgrp);
315 } while (cgrp);
316
317 css = init_css_set.subsys[ss->id];
318 css_get(css);
319out_unlock:
320 rcu_read_unlock();
321 return css;
322}
323
289/* convenient tests for these bits */ 324/* convenient tests for these bits */
290static inline bool cgroup_is_dead(const struct cgroup *cgrp) 325static inline bool cgroup_is_dead(const struct cgroup *cgrp)
291{ 326{
@@ -331,14 +366,6 @@ bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
331 return false; 366 return false;
332} 367}
333 368
334static int cgroup_is_releasable(const struct cgroup *cgrp)
335{
336 const int bits =
337 (1 << CGRP_RELEASABLE) |
338 (1 << CGRP_NOTIFY_ON_RELEASE);
339 return (cgrp->flags & bits) == bits;
340}
341
342static int notify_on_release(const struct cgroup *cgrp) 369static int notify_on_release(const struct cgroup *cgrp)
343{ 370{
344 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 371 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
@@ -394,12 +421,7 @@ static int notify_on_release(const struct cgroup *cgrp)
394 ; \ 421 ; \
395 else 422 else
396 423
397/* the list of cgroups eligible for automatic release. Protected by
398 * release_list_lock */
399static LIST_HEAD(release_list);
400static DEFINE_RAW_SPINLOCK(release_list_lock);
401static void cgroup_release_agent(struct work_struct *work); 424static void cgroup_release_agent(struct work_struct *work);
402static DECLARE_WORK(release_agent_work, cgroup_release_agent);
403static void check_for_release(struct cgroup *cgrp); 425static void check_for_release(struct cgroup *cgrp);
404 426
405/* 427/*
@@ -498,7 +520,7 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
498 return key; 520 return key;
499} 521}
500 522
501static void put_css_set_locked(struct css_set *cset, bool taskexit) 523static void put_css_set_locked(struct css_set *cset)
502{ 524{
503 struct cgrp_cset_link *link, *tmp_link; 525 struct cgrp_cset_link *link, *tmp_link;
504 struct cgroup_subsys *ss; 526 struct cgroup_subsys *ss;
@@ -524,11 +546,7 @@ static void put_css_set_locked(struct css_set *cset, bool taskexit)
524 /* @cgrp can't go away while we're holding css_set_rwsem */ 546 /* @cgrp can't go away while we're holding css_set_rwsem */
525 if (list_empty(&cgrp->cset_links)) { 547 if (list_empty(&cgrp->cset_links)) {
526 cgroup_update_populated(cgrp, false); 548 cgroup_update_populated(cgrp, false);
527 if (notify_on_release(cgrp)) { 549 check_for_release(cgrp);
528 if (taskexit)
529 set_bit(CGRP_RELEASABLE, &cgrp->flags);
530 check_for_release(cgrp);
531 }
532 } 550 }
533 551
534 kfree(link); 552 kfree(link);
@@ -537,7 +555,7 @@ static void put_css_set_locked(struct css_set *cset, bool taskexit)
537 kfree_rcu(cset, rcu_head); 555 kfree_rcu(cset, rcu_head);
538} 556}
539 557
540static void put_css_set(struct css_set *cset, bool taskexit) 558static void put_css_set(struct css_set *cset)
541{ 559{
542 /* 560 /*
543 * Ensure that the refcount doesn't hit zero while any readers 561 * Ensure that the refcount doesn't hit zero while any readers
@@ -548,7 +566,7 @@ static void put_css_set(struct css_set *cset, bool taskexit)
548 return; 566 return;
549 567
550 down_write(&css_set_rwsem); 568 down_write(&css_set_rwsem);
551 put_css_set_locked(cset, taskexit); 569 put_css_set_locked(cset);
552 up_write(&css_set_rwsem); 570 up_write(&css_set_rwsem);
553} 571}
554 572
@@ -969,14 +987,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
969 * knows that the cgroup won't be removed, as cgroup_rmdir() 987 * knows that the cgroup won't be removed, as cgroup_rmdir()
970 * needs that mutex. 988 * needs that mutex.
971 * 989 *
972 * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't
973 * (usually) take cgroup_mutex. These are the two most performance
974 * critical pieces of code here. The exception occurs on cgroup_exit(),
975 * when a task in a notify_on_release cgroup exits. Then cgroup_mutex
976 * is taken, and if the cgroup count is zero, a usermode call made
977 * to the release agent with the name of the cgroup (path relative to
978 * the root of cgroup file system) as the argument.
979 *
980 * A cgroup can only be deleted if both its 'count' of using tasks 990 * A cgroup can only be deleted if both its 'count' of using tasks
981 * is zero, and its list of 'children' cgroups is empty. Since all 991 * is zero, and its list of 'children' cgroups is empty. Since all
982 * tasks in the system use _some_ cgroup, and since there is always at 992 * tasks in the system use _some_ cgroup, and since there is always at
@@ -1046,31 +1056,30 @@ static void cgroup_put(struct cgroup *cgrp)
1046} 1056}
1047 1057
1048/** 1058/**
1049 * cgroup_refresh_child_subsys_mask - update child_subsys_mask 1059 * cgroup_calc_child_subsys_mask - calculate child_subsys_mask
1050 * @cgrp: the target cgroup 1060 * @cgrp: the target cgroup
1061 * @subtree_control: the new subtree_control mask to consider
1051 * 1062 *
1052 * On the default hierarchy, a subsystem may request other subsystems to be 1063 * On the default hierarchy, a subsystem may request other subsystems to be
1053 * enabled together through its ->depends_on mask. In such cases, more 1064 * enabled together through its ->depends_on mask. In such cases, more
1054 * subsystems than specified in "cgroup.subtree_control" may be enabled. 1065 * subsystems than specified in "cgroup.subtree_control" may be enabled.
1055 * 1066 *
1056 * This function determines which subsystems need to be enabled given the 1067 * This function calculates which subsystems need to be enabled if
1057 * current @cgrp->subtree_control and records it in 1068 * @subtree_control is to be applied to @cgrp. The returned mask is always
1058 * @cgrp->child_subsys_mask. The resulting mask is always a superset of 1069 * a superset of @subtree_control and follows the usual hierarchy rules.
1059 * @cgrp->subtree_control and follows the usual hierarchy rules.
1060 */ 1070 */
1061static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp) 1071static unsigned int cgroup_calc_child_subsys_mask(struct cgroup *cgrp,
1072 unsigned int subtree_control)
1062{ 1073{
1063 struct cgroup *parent = cgroup_parent(cgrp); 1074 struct cgroup *parent = cgroup_parent(cgrp);
1064 unsigned int cur_ss_mask = cgrp->subtree_control; 1075 unsigned int cur_ss_mask = subtree_control;
1065 struct cgroup_subsys *ss; 1076 struct cgroup_subsys *ss;
1066 int ssid; 1077 int ssid;
1067 1078
1068 lockdep_assert_held(&cgroup_mutex); 1079 lockdep_assert_held(&cgroup_mutex);
1069 1080
1070 if (!cgroup_on_dfl(cgrp)) { 1081 if (!cgroup_on_dfl(cgrp))
1071 cgrp->child_subsys_mask = cur_ss_mask; 1082 return cur_ss_mask;
1072 return;
1073 }
1074 1083
1075 while (true) { 1084 while (true) {
1076 unsigned int new_ss_mask = cur_ss_mask; 1085 unsigned int new_ss_mask = cur_ss_mask;
@@ -1094,7 +1103,20 @@ static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp)
1094 cur_ss_mask = new_ss_mask; 1103 cur_ss_mask = new_ss_mask;
1095 } 1104 }
1096 1105
1097 cgrp->child_subsys_mask = cur_ss_mask; 1106 return cur_ss_mask;
1107}
1108
1109/**
1110 * cgroup_refresh_child_subsys_mask - update child_subsys_mask
1111 * @cgrp: the target cgroup
1112 *
1113 * Update @cgrp->child_subsys_mask according to the current
1114 * @cgrp->subtree_control using cgroup_calc_child_subsys_mask().
1115 */
1116static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp)
1117{
1118 cgrp->child_subsys_mask =
1119 cgroup_calc_child_subsys_mask(cgrp, cgrp->subtree_control);
1098} 1120}
1099 1121
1100/** 1122/**
@@ -1587,7 +1609,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1587 INIT_LIST_HEAD(&cgrp->self.sibling); 1609 INIT_LIST_HEAD(&cgrp->self.sibling);
1588 INIT_LIST_HEAD(&cgrp->self.children); 1610 INIT_LIST_HEAD(&cgrp->self.children);
1589 INIT_LIST_HEAD(&cgrp->cset_links); 1611 INIT_LIST_HEAD(&cgrp->cset_links);
1590 INIT_LIST_HEAD(&cgrp->release_list);
1591 INIT_LIST_HEAD(&cgrp->pidlists); 1612 INIT_LIST_HEAD(&cgrp->pidlists);
1592 mutex_init(&cgrp->pidlist_mutex); 1613 mutex_init(&cgrp->pidlist_mutex);
1593 cgrp->self.cgroup = cgrp; 1614 cgrp->self.cgroup = cgrp;
@@ -1597,6 +1618,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1597 INIT_LIST_HEAD(&cgrp->e_csets[ssid]); 1618 INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
1598 1619
1599 init_waitqueue_head(&cgrp->offline_waitq); 1620 init_waitqueue_head(&cgrp->offline_waitq);
1621 INIT_WORK(&cgrp->release_agent_work, cgroup_release_agent);
1600} 1622}
1601 1623
1602static void init_cgroup_root(struct cgroup_root *root, 1624static void init_cgroup_root(struct cgroup_root *root,
@@ -1634,7 +1656,8 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)
1634 goto out; 1656 goto out;
1635 root_cgrp->id = ret; 1657 root_cgrp->id = ret;
1636 1658
1637 ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release); 1659 ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0,
1660 GFP_KERNEL);
1638 if (ret) 1661 if (ret)
1639 goto out; 1662 goto out;
1640 1663
@@ -2052,8 +2075,7 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp,
2052 * task. As trading it for new_cset is protected by cgroup_mutex, 2075 * task. As trading it for new_cset is protected by cgroup_mutex,
2053 * we're safe to drop it here; it will be freed under RCU. 2076 * we're safe to drop it here; it will be freed under RCU.
2054 */ 2077 */
2055 set_bit(CGRP_RELEASABLE, &old_cgrp->flags); 2078 put_css_set_locked(old_cset);
2056 put_css_set_locked(old_cset, false);
2057} 2079}
2058 2080
2059/** 2081/**
@@ -2074,7 +2096,7 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
2074 cset->mg_src_cgrp = NULL; 2096 cset->mg_src_cgrp = NULL;
2075 cset->mg_dst_cset = NULL; 2097 cset->mg_dst_cset = NULL;
2076 list_del_init(&cset->mg_preload_node); 2098 list_del_init(&cset->mg_preload_node);
2077 put_css_set_locked(cset, false); 2099 put_css_set_locked(cset);
2078 } 2100 }
2079 up_write(&css_set_rwsem); 2101 up_write(&css_set_rwsem);
2080} 2102}
@@ -2168,8 +2190,8 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
2168 if (src_cset == dst_cset) { 2190 if (src_cset == dst_cset) {
2169 src_cset->mg_src_cgrp = NULL; 2191 src_cset->mg_src_cgrp = NULL;
2170 list_del_init(&src_cset->mg_preload_node); 2192 list_del_init(&src_cset->mg_preload_node);
2171 put_css_set(src_cset, false); 2193 put_css_set(src_cset);
2172 put_css_set(dst_cset, false); 2194 put_css_set(dst_cset);
2173 continue; 2195 continue;
2174 } 2196 }
2175 2197
@@ -2178,7 +2200,7 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
2178 if (list_empty(&dst_cset->mg_preload_node)) 2200 if (list_empty(&dst_cset->mg_preload_node))
2179 list_add(&dst_cset->mg_preload_node, &csets); 2201 list_add(&dst_cset->mg_preload_node, &csets);
2180 else 2202 else
2181 put_css_set(dst_cset, false); 2203 put_css_set(dst_cset);
2182 } 2204 }
2183 2205
2184 list_splice_tail(&csets, preloaded_csets); 2206 list_splice_tail(&csets, preloaded_csets);
@@ -2668,7 +2690,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
2668 loff_t off) 2690 loff_t off)
2669{ 2691{
2670 unsigned int enable = 0, disable = 0; 2692 unsigned int enable = 0, disable = 0;
2671 unsigned int css_enable, css_disable, old_ctrl, new_ctrl; 2693 unsigned int css_enable, css_disable, old_sc, new_sc, old_ss, new_ss;
2672 struct cgroup *cgrp, *child; 2694 struct cgroup *cgrp, *child;
2673 struct cgroup_subsys *ss; 2695 struct cgroup_subsys *ss;
2674 char *tok; 2696 char *tok;
@@ -2720,36 +2742,6 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
2720 ret = -ENOENT; 2742 ret = -ENOENT;
2721 goto out_unlock; 2743 goto out_unlock;
2722 } 2744 }
2723
2724 /*
2725 * @ss is already enabled through dependency and
2726 * we'll just make it visible. Skip draining.
2727 */
2728 if (cgrp->child_subsys_mask & (1 << ssid))
2729 continue;
2730
2731 /*
2732 * Because css offlining is asynchronous, userland
2733 * might try to re-enable the same controller while
2734 * the previous instance is still around. In such
2735 * cases, wait till it's gone using offline_waitq.
2736 */
2737 cgroup_for_each_live_child(child, cgrp) {
2738 DEFINE_WAIT(wait);
2739
2740 if (!cgroup_css(child, ss))
2741 continue;
2742
2743 cgroup_get(child);
2744 prepare_to_wait(&child->offline_waitq, &wait,
2745 TASK_UNINTERRUPTIBLE);
2746 cgroup_kn_unlock(of->kn);
2747 schedule();
2748 finish_wait(&child->offline_waitq, &wait);
2749 cgroup_put(child);
2750
2751 return restart_syscall();
2752 }
2753 } else if (disable & (1 << ssid)) { 2745 } else if (disable & (1 << ssid)) {
2754 if (!(cgrp->subtree_control & (1 << ssid))) { 2746 if (!(cgrp->subtree_control & (1 << ssid))) {
2755 disable &= ~(1 << ssid); 2747 disable &= ~(1 << ssid);
@@ -2785,19 +2777,48 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
2785 * subsystems than specified may need to be enabled or disabled 2777 * subsystems than specified may need to be enabled or disabled
2786 * depending on subsystem dependencies. 2778 * depending on subsystem dependencies.
2787 */ 2779 */
2788 cgrp->subtree_control |= enable; 2780 old_sc = cgrp->subtree_control;
2789 cgrp->subtree_control &= ~disable; 2781 old_ss = cgrp->child_subsys_mask;
2790 2782 new_sc = (old_sc | enable) & ~disable;
2791 old_ctrl = cgrp->child_subsys_mask; 2783 new_ss = cgroup_calc_child_subsys_mask(cgrp, new_sc);
2792 cgroup_refresh_child_subsys_mask(cgrp);
2793 new_ctrl = cgrp->child_subsys_mask;
2794 2784
2795 css_enable = ~old_ctrl & new_ctrl; 2785 css_enable = ~old_ss & new_ss;
2796 css_disable = old_ctrl & ~new_ctrl; 2786 css_disable = old_ss & ~new_ss;
2797 enable |= css_enable; 2787 enable |= css_enable;
2798 disable |= css_disable; 2788 disable |= css_disable;
2799 2789
2800 /* 2790 /*
2791 * Because css offlining is asynchronous, userland might try to
2792 * re-enable the same controller while the previous instance is
2793 * still around. In such cases, wait till it's gone using
2794 * offline_waitq.
2795 */
2796 for_each_subsys(ss, ssid) {
2797 if (!(css_enable & (1 << ssid)))
2798 continue;
2799
2800 cgroup_for_each_live_child(child, cgrp) {
2801 DEFINE_WAIT(wait);
2802
2803 if (!cgroup_css(child, ss))
2804 continue;
2805
2806 cgroup_get(child);
2807 prepare_to_wait(&child->offline_waitq, &wait,
2808 TASK_UNINTERRUPTIBLE);
2809 cgroup_kn_unlock(of->kn);
2810 schedule();
2811 finish_wait(&child->offline_waitq, &wait);
2812 cgroup_put(child);
2813
2814 return restart_syscall();
2815 }
2816 }
2817
2818 cgrp->subtree_control = new_sc;
2819 cgrp->child_subsys_mask = new_ss;
2820
2821 /*
2801 * Create new csses or make the existing ones visible. A css is 2822 * Create new csses or make the existing ones visible. A css is
2802 * created invisible if it's being implicitly enabled through 2823 * created invisible if it's being implicitly enabled through
2803 * dependency. An invisible css is made visible when the userland 2824 * dependency. An invisible css is made visible when the userland
@@ -2852,6 +2873,24 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
2852 } 2873 }
2853 } 2874 }
2854 2875
2876 /*
2877 * The effective csses of all the descendants (excluding @cgrp) may
2878 * have changed. Subsystems can optionally subscribe to this event
2879 * by implementing ->css_e_css_changed() which is invoked if any of
2880 * the effective csses seen from the css's cgroup may have changed.
2881 */
2882 for_each_subsys(ss, ssid) {
2883 struct cgroup_subsys_state *this_css = cgroup_css(cgrp, ss);
2884 struct cgroup_subsys_state *css;
2885
2886 if (!ss->css_e_css_changed || !this_css)
2887 continue;
2888
2889 css_for_each_descendant_pre(css, this_css)
2890 if (css != this_css)
2891 ss->css_e_css_changed(css);
2892 }
2893
2855 kernfs_activate(cgrp->kn); 2894 kernfs_activate(cgrp->kn);
2856 ret = 0; 2895 ret = 0;
2857out_unlock: 2896out_unlock:
@@ -2859,9 +2898,8 @@ out_unlock:
2859 return ret ?: nbytes; 2898 return ret ?: nbytes;
2860 2899
2861err_undo_css: 2900err_undo_css:
2862 cgrp->subtree_control &= ~enable; 2901 cgrp->subtree_control = old_sc;
2863 cgrp->subtree_control |= disable; 2902 cgrp->child_subsys_mask = old_ss;
2864 cgroup_refresh_child_subsys_mask(cgrp);
2865 2903
2866 for_each_subsys(ss, ssid) { 2904 for_each_subsys(ss, ssid) {
2867 if (!(enable & (1 << ssid))) 2905 if (!(enable & (1 << ssid)))
@@ -4173,7 +4211,6 @@ static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
4173static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css, 4211static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
4174 struct cftype *cft, u64 val) 4212 struct cftype *cft, u64 val)
4175{ 4213{
4176 clear_bit(CGRP_RELEASABLE, &css->cgroup->flags);
4177 if (val) 4214 if (val)
4178 set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); 4215 set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
4179 else 4216 else
@@ -4351,6 +4388,7 @@ static void css_free_work_fn(struct work_struct *work)
4351 /* cgroup free path */ 4388 /* cgroup free path */
4352 atomic_dec(&cgrp->root->nr_cgrps); 4389 atomic_dec(&cgrp->root->nr_cgrps);
4353 cgroup_pidlist_destroy_all(cgrp); 4390 cgroup_pidlist_destroy_all(cgrp);
4391 cancel_work_sync(&cgrp->release_agent_work);
4354 4392
4355 if (cgroup_parent(cgrp)) { 4393 if (cgroup_parent(cgrp)) {
4356 /* 4394 /*
@@ -4397,6 +4435,8 @@ static void css_release_work_fn(struct work_struct *work)
4397 if (ss) { 4435 if (ss) {
4398 /* css release path */ 4436 /* css release path */
4399 cgroup_idr_remove(&ss->css_idr, css->id); 4437 cgroup_idr_remove(&ss->css_idr, css->id);
4438 if (ss->css_released)
4439 ss->css_released(css);
4400 } else { 4440 } else {
4401 /* cgroup release path */ 4441 /* cgroup release path */
4402 cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); 4442 cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
@@ -4510,7 +4550,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
4510 4550
4511 init_and_link_css(css, ss, cgrp); 4551 init_and_link_css(css, ss, cgrp);
4512 4552
4513 err = percpu_ref_init(&css->refcnt, css_release); 4553 err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL);
4514 if (err) 4554 if (err)
4515 goto err_free_css; 4555 goto err_free_css;
4516 4556
@@ -4583,7 +4623,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
4583 goto out_unlock; 4623 goto out_unlock;
4584 } 4624 }
4585 4625
4586 ret = percpu_ref_init(&cgrp->self.refcnt, css_release); 4626 ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL);
4587 if (ret) 4627 if (ret)
4588 goto out_free_cgrp; 4628 goto out_free_cgrp;
4589 4629
@@ -4813,19 +4853,12 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4813 for_each_css(css, ssid, cgrp) 4853 for_each_css(css, ssid, cgrp)
4814 kill_css(css); 4854 kill_css(css);
4815 4855
4816 /* CSS_ONLINE is clear, remove from ->release_list for the last time */
4817 raw_spin_lock(&release_list_lock);
4818 if (!list_empty(&cgrp->release_list))
4819 list_del_init(&cgrp->release_list);
4820 raw_spin_unlock(&release_list_lock);
4821
4822 /* 4856 /*
4823 * Remove @cgrp directory along with the base files. @cgrp has an 4857 * Remove @cgrp directory along with the base files. @cgrp has an
4824 * extra ref on its kn. 4858 * extra ref on its kn.
4825 */ 4859 */
4826 kernfs_remove(cgrp->kn); 4860 kernfs_remove(cgrp->kn);
4827 4861
4828 set_bit(CGRP_RELEASABLE, &cgroup_parent(cgrp)->flags);
4829 check_for_release(cgroup_parent(cgrp)); 4862 check_for_release(cgroup_parent(cgrp));
4830 4863
4831 /* put the base reference */ 4864 /* put the base reference */
@@ -4842,13 +4875,10 @@ static int cgroup_rmdir(struct kernfs_node *kn)
4842 cgrp = cgroup_kn_lock_live(kn); 4875 cgrp = cgroup_kn_lock_live(kn);
4843 if (!cgrp) 4876 if (!cgrp)
4844 return 0; 4877 return 0;
4845 cgroup_get(cgrp); /* for @kn->priv clearing */
4846 4878
4847 ret = cgroup_destroy_locked(cgrp); 4879 ret = cgroup_destroy_locked(cgrp);
4848 4880
4849 cgroup_kn_unlock(kn); 4881 cgroup_kn_unlock(kn);
4850
4851 cgroup_put(cgrp);
4852 return ret; 4882 return ret;
4853} 4883}
4854 4884
@@ -5052,12 +5082,9 @@ core_initcall(cgroup_wq_init);
5052 * - Print task's cgroup paths into seq_file, one line for each hierarchy 5082 * - Print task's cgroup paths into seq_file, one line for each hierarchy
5053 * - Used for /proc/<pid>/cgroup. 5083 * - Used for /proc/<pid>/cgroup.
5054 */ 5084 */
5055 5085int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
5056/* TODO: Use a proper seq_file iterator */ 5086 struct pid *pid, struct task_struct *tsk)
5057int proc_cgroup_show(struct seq_file *m, void *v)
5058{ 5087{
5059 struct pid *pid;
5060 struct task_struct *tsk;
5061 char *buf, *path; 5088 char *buf, *path;
5062 int retval; 5089 int retval;
5063 struct cgroup_root *root; 5090 struct cgroup_root *root;
@@ -5067,14 +5094,6 @@ int proc_cgroup_show(struct seq_file *m, void *v)
5067 if (!buf) 5094 if (!buf)
5068 goto out; 5095 goto out;
5069 5096
5070 retval = -ESRCH;
5071 pid = m->private;
5072 tsk = get_pid_task(pid, PIDTYPE_PID);
5073 if (!tsk)
5074 goto out_free;
5075
5076 retval = 0;
5077
5078 mutex_lock(&cgroup_mutex); 5097 mutex_lock(&cgroup_mutex);
5079 down_read(&css_set_rwsem); 5098 down_read(&css_set_rwsem);
5080 5099
@@ -5104,11 +5123,10 @@ int proc_cgroup_show(struct seq_file *m, void *v)
5104 seq_putc(m, '\n'); 5123 seq_putc(m, '\n');
5105 } 5124 }
5106 5125
5126 retval = 0;
5107out_unlock: 5127out_unlock:
5108 up_read(&css_set_rwsem); 5128 up_read(&css_set_rwsem);
5109 mutex_unlock(&cgroup_mutex); 5129 mutex_unlock(&cgroup_mutex);
5110 put_task_struct(tsk);
5111out_free:
5112 kfree(buf); 5130 kfree(buf);
5113out: 5131out:
5114 return retval; 5132 return retval;
@@ -5179,7 +5197,7 @@ void cgroup_post_fork(struct task_struct *child)
5179 int i; 5197 int i;
5180 5198
5181 /* 5199 /*
5182 * This may race against cgroup_enable_task_cg_links(). As that 5200 * This may race against cgroup_enable_task_cg_lists(). As that
5183 * function sets use_task_css_set_links before grabbing 5201 * function sets use_task_css_set_links before grabbing
5184 * tasklist_lock and we just went through tasklist_lock to add 5202 * tasklist_lock and we just went through tasklist_lock to add
5185 * @child, it's guaranteed that either we see the set 5203 * @child, it's guaranteed that either we see the set
@@ -5194,7 +5212,7 @@ void cgroup_post_fork(struct task_struct *child)
5194 * when implementing operations which need to migrate all tasks of 5212 * when implementing operations which need to migrate all tasks of
5195 * a cgroup to another. 5213 * a cgroup to another.
5196 * 5214 *
5197 * Note that if we lose to cgroup_enable_task_cg_links(), @child 5215 * Note that if we lose to cgroup_enable_task_cg_lists(), @child
5198 * will remain in init_css_set. This is safe because all tasks are 5216 * will remain in init_css_set. This is safe because all tasks are
5199 * in the init_css_set before cg_links is enabled and there's no 5217 * in the init_css_set before cg_links is enabled and there's no
5200 * operation which transfers all tasks out of init_css_set. 5218 * operation which transfers all tasks out of init_css_set.
@@ -5278,30 +5296,14 @@ void cgroup_exit(struct task_struct *tsk)
5278 } 5296 }
5279 5297
5280 if (put_cset) 5298 if (put_cset)
5281 put_css_set(cset, true); 5299 put_css_set(cset);
5282} 5300}
5283 5301
5284static void check_for_release(struct cgroup *cgrp) 5302static void check_for_release(struct cgroup *cgrp)
5285{ 5303{
5286 if (cgroup_is_releasable(cgrp) && list_empty(&cgrp->cset_links) && 5304 if (notify_on_release(cgrp) && !cgroup_has_tasks(cgrp) &&
5287 !css_has_online_children(&cgrp->self)) { 5305 !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
5288 /* 5306 schedule_work(&cgrp->release_agent_work);
5289 * Control Group is currently removeable. If it's not
5290 * already queued for a userspace notification, queue
5291 * it now
5292 */
5293 int need_schedule_work = 0;
5294
5295 raw_spin_lock(&release_list_lock);
5296 if (!cgroup_is_dead(cgrp) &&
5297 list_empty(&cgrp->release_list)) {
5298 list_add(&cgrp->release_list, &release_list);
5299 need_schedule_work = 1;
5300 }
5301 raw_spin_unlock(&release_list_lock);
5302 if (need_schedule_work)
5303 schedule_work(&release_agent_work);
5304 }
5305} 5307}
5306 5308
5307/* 5309/*
@@ -5329,52 +5331,39 @@ static void check_for_release(struct cgroup *cgrp)
5329 */ 5331 */
5330static void cgroup_release_agent(struct work_struct *work) 5332static void cgroup_release_agent(struct work_struct *work)
5331{ 5333{
5332 BUG_ON(work != &release_agent_work); 5334 struct cgroup *cgrp =
5335 container_of(work, struct cgroup, release_agent_work);
5336 char *pathbuf = NULL, *agentbuf = NULL, *path;
5337 char *argv[3], *envp[3];
5338
5333 mutex_lock(&cgroup_mutex); 5339 mutex_lock(&cgroup_mutex);
5334 raw_spin_lock(&release_list_lock); 5340
5335 while (!list_empty(&release_list)) { 5341 pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
5336 char *argv[3], *envp[3]; 5342 agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
5337 int i; 5343 if (!pathbuf || !agentbuf)
5338 char *pathbuf = NULL, *agentbuf = NULL, *path; 5344 goto out;
5339 struct cgroup *cgrp = list_entry(release_list.next, 5345
5340 struct cgroup, 5346 path = cgroup_path(cgrp, pathbuf, PATH_MAX);
5341 release_list); 5347 if (!path)
5342 list_del_init(&cgrp->release_list); 5348 goto out;
5343 raw_spin_unlock(&release_list_lock); 5349
5344 pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); 5350 argv[0] = agentbuf;
5345 if (!pathbuf) 5351 argv[1] = path;
5346 goto continue_free; 5352 argv[2] = NULL;
5347 path = cgroup_path(cgrp, pathbuf, PATH_MAX); 5353
5348 if (!path) 5354 /* minimal command environment */
5349 goto continue_free; 5355 envp[0] = "HOME=/";
5350 agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); 5356 envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
5351 if (!agentbuf) 5357 envp[2] = NULL;
5352 goto continue_free; 5358
5353
5354 i = 0;
5355 argv[i++] = agentbuf;
5356 argv[i++] = path;
5357 argv[i] = NULL;
5358
5359 i = 0;
5360 /* minimal command environment */
5361 envp[i++] = "HOME=/";
5362 envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
5363 envp[i] = NULL;
5364
5365 /* Drop the lock while we invoke the usermode helper,
5366 * since the exec could involve hitting disk and hence
5367 * be a slow process */
5368 mutex_unlock(&cgroup_mutex);
5369 call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
5370 mutex_lock(&cgroup_mutex);
5371 continue_free:
5372 kfree(pathbuf);
5373 kfree(agentbuf);
5374 raw_spin_lock(&release_list_lock);
5375 }
5376 raw_spin_unlock(&release_list_lock);
5377 mutex_unlock(&cgroup_mutex); 5359 mutex_unlock(&cgroup_mutex);
5360 call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
5361 goto out_free;
5362out:
5363 mutex_unlock(&cgroup_mutex);
5364out_free:
5365 kfree(agentbuf);
5366 kfree(pathbuf);
5378} 5367}
5379 5368
5380static int __init cgroup_disable(char *str) 5369static int __init cgroup_disable(char *str)
@@ -5562,7 +5551,8 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
5562 5551
5563static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft) 5552static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
5564{ 5553{
5565 return test_bit(CGRP_RELEASABLE, &css->cgroup->flags); 5554 return (!cgroup_has_tasks(css->cgroup) &&
5555 !css_has_online_children(&css->cgroup->self));
5566} 5556}
5567 5557
5568static struct cftype debug_files[] = { 5558static struct cftype debug_files[] = {
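
The release-agent rework in the cgroup.c hunks above drops the global release_list/release_list_lock pair: each cgroup now carries its own release_agent_work, check_for_release() simply schedules it, and the handler recovers its cgroup with container_of() before invoking the usermode helper outside cgroup_mutex. A minimal kernel-style sketch of that embed-a-work-item pattern (struct notifier, notify_fn and the id field are invented for illustration; this is not the cgroup code):

    #include <linux/kernel.h>
    #include <linux/workqueue.h>
    #include <linux/printk.h>

    /* Illustrative container; in the patch the work item lives in struct cgroup. */
    struct notifier {
            int id;
            struct work_struct notify_work;   /* INIT_WORK(..., notify_fn) at setup */
    };

    static void notify_fn(struct work_struct *work)
    {
            /* Recover the enclosing object from the embedded work item. */
            struct notifier *n = container_of(work, struct notifier, notify_work);

            pr_info("event on notifier %d\n", n->id);
    }

    /*
     * An event source only needs schedule_work(&n->notify_work); there is no
     * global pending list and no lock shared between producers and the worker.
     */
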
diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config
new file mode 100644
index 000000000000..c2de56ab0fce
--- /dev/null
+++ b/kernel/configs/tiny.config
@@ -0,0 +1,4 @@
1CONFIG_CC_OPTIMIZE_FOR_SIZE=y
2CONFIG_KERNEL_XZ=y
3CONFIG_OPTIMIZE_INLINING=y
4CONFIG_SLOB=y
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 5664985c46a0..937ecdfdf258 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -107,46 +107,6 @@ void context_tracking_user_enter(void)
107} 107}
108NOKPROBE_SYMBOL(context_tracking_user_enter); 108NOKPROBE_SYMBOL(context_tracking_user_enter);
109 109
110#ifdef CONFIG_PREEMPT
111/**
112 * preempt_schedule_context - preempt_schedule called by tracing
113 *
114 * The tracing infrastructure uses preempt_enable_notrace to prevent
115 * recursion and tracing preempt enabling caused by the tracing
116 * infrastructure itself. But as tracing can happen in areas coming
117 * from userspace or just about to enter userspace, a preempt enable
118 * can occur before user_exit() is called. This will cause the scheduler
119 * to be called when the system is still in usermode.
120 *
121 * To prevent this, the preempt_enable_notrace will use this function
122 * instead of preempt_schedule() to exit user context if needed before
123 * calling the scheduler.
124 */
125asmlinkage __visible void __sched notrace preempt_schedule_context(void)
126{
127 enum ctx_state prev_ctx;
128
129 if (likely(!preemptible()))
130 return;
131
132 /*
133 * Need to disable preemption in case user_exit() is traced
134 * and the tracer calls preempt_enable_notrace() causing
135 * an infinite recursion.
136 */
137 preempt_disable_notrace();
138 prev_ctx = exception_enter();
139 preempt_enable_no_resched_notrace();
140
141 preempt_schedule();
142
143 preempt_disable_notrace();
144 exception_exit(prev_ctx);
145 preempt_enable_notrace();
146}
147EXPORT_SYMBOL_GPL(preempt_schedule_context);
148#endif /* CONFIG_PREEMPT */
149
150/** 110/**
151 * context_tracking_user_exit - Inform the context tracking that the CPU is 111 * context_tracking_user_exit - Inform the context tracking that the CPU is
152 * exiting userspace mode and entering the kernel. 112 * exiting userspace mode and entering the kernel.
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 81e2a388a0f6..5d220234b3ca 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -64,6 +64,8 @@ static struct {
64 * an ongoing cpu hotplug operation. 64 * an ongoing cpu hotplug operation.
65 */ 65 */
66 int refcount; 66 int refcount;
67 /* And allows lockless put_online_cpus(). */
68 atomic_t puts_pending;
67 69
68#ifdef CONFIG_DEBUG_LOCK_ALLOC 70#ifdef CONFIG_DEBUG_LOCK_ALLOC
69 struct lockdep_map dep_map; 71 struct lockdep_map dep_map;
@@ -79,9 +81,21 @@ static struct {
79 81
80/* Lockdep annotations for get/put_online_cpus() and cpu_hotplug_begin/end() */ 82/* Lockdep annotations for get/put_online_cpus() and cpu_hotplug_begin/end() */
81#define cpuhp_lock_acquire_read() lock_map_acquire_read(&cpu_hotplug.dep_map) 83#define cpuhp_lock_acquire_read() lock_map_acquire_read(&cpu_hotplug.dep_map)
84#define cpuhp_lock_acquire_tryread() \
85 lock_map_acquire_tryread(&cpu_hotplug.dep_map)
82#define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map) 86#define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map)
83#define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map) 87#define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map)
84 88
89static void apply_puts_pending(int max)
90{
91 int delta;
92
93 if (atomic_read(&cpu_hotplug.puts_pending) >= max) {
94 delta = atomic_xchg(&cpu_hotplug.puts_pending, 0);
95 cpu_hotplug.refcount -= delta;
96 }
97}
98
85void get_online_cpus(void) 99void get_online_cpus(void)
86{ 100{
87 might_sleep(); 101 might_sleep();
@@ -89,17 +103,35 @@ void get_online_cpus(void)
89 return; 103 return;
90 cpuhp_lock_acquire_read(); 104 cpuhp_lock_acquire_read();
91 mutex_lock(&cpu_hotplug.lock); 105 mutex_lock(&cpu_hotplug.lock);
106 apply_puts_pending(65536);
92 cpu_hotplug.refcount++; 107 cpu_hotplug.refcount++;
93 mutex_unlock(&cpu_hotplug.lock); 108 mutex_unlock(&cpu_hotplug.lock);
94
95} 109}
96EXPORT_SYMBOL_GPL(get_online_cpus); 110EXPORT_SYMBOL_GPL(get_online_cpus);
97 111
112bool try_get_online_cpus(void)
113{
114 if (cpu_hotplug.active_writer == current)
115 return true;
116 if (!mutex_trylock(&cpu_hotplug.lock))
117 return false;
118 cpuhp_lock_acquire_tryread();
119 apply_puts_pending(65536);
120 cpu_hotplug.refcount++;
121 mutex_unlock(&cpu_hotplug.lock);
122 return true;
123}
124EXPORT_SYMBOL_GPL(try_get_online_cpus);
125
98void put_online_cpus(void) 126void put_online_cpus(void)
99{ 127{
100 if (cpu_hotplug.active_writer == current) 128 if (cpu_hotplug.active_writer == current)
101 return; 129 return;
102 mutex_lock(&cpu_hotplug.lock); 130 if (!mutex_trylock(&cpu_hotplug.lock)) {
131 atomic_inc(&cpu_hotplug.puts_pending);
132 cpuhp_lock_release();
133 return;
134 }
103 135
104 if (WARN_ON(!cpu_hotplug.refcount)) 136 if (WARN_ON(!cpu_hotplug.refcount))
105 cpu_hotplug.refcount++; /* try to fix things up */ 137 cpu_hotplug.refcount++; /* try to fix things up */
@@ -141,6 +173,7 @@ void cpu_hotplug_begin(void)
141 cpuhp_lock_acquire(); 173 cpuhp_lock_acquire();
142 for (;;) { 174 for (;;) {
143 mutex_lock(&cpu_hotplug.lock); 175 mutex_lock(&cpu_hotplug.lock);
176 apply_puts_pending(1);
144 if (likely(!cpu_hotplug.refcount)) 177 if (likely(!cpu_hotplug.refcount))
145 break; 178 break;
146 __set_current_state(TASK_UNINTERRUPTIBLE); 179 __set_current_state(TASK_UNINTERRUPTIBLE);
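
The cpu.c hunk above lets put_online_cpus() avoid blocking on cpu_hotplug.lock: when the trylock fails, the put is parked in the atomic puts_pending counter, and apply_puts_pending() later folds the deferred puts into refcount while the lock is held. A small userspace analogue of the same scheme using C11 atomics and a pthread mutex (all names and the demo in main() are invented; this is not the kernel implementation):

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static int refcount;                 /* protected by lock */
    static atomic_int puts_pending;      /* lockless side channel */

    static void apply_puts_pending(void)
    {
            /* Caller holds lock: fold parked puts into the refcount. */
            int delta = atomic_exchange(&puts_pending, 0);
            refcount -= delta;
    }

    static void get_ref(void)
    {
            pthread_mutex_lock(&lock);
            apply_puts_pending();
            refcount++;
            pthread_mutex_unlock(&lock);
    }

    static void put_ref(void)
    {
            if (pthread_mutex_trylock(&lock) != 0) {
                    /* Can't take the lock here: defer the decrement. */
                    atomic_fetch_add(&puts_pending, 1);
                    return;
            }
            refcount--;
            pthread_mutex_unlock(&lock);
    }

    int main(void)
    {
            get_ref();
            put_ref();
            pthread_mutex_lock(&lock);
            apply_puts_pending();
            printf("refcount=%d\n", refcount);   /* prints 0 */
            pthread_mutex_unlock(&lock);
            return 0;
    }
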
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 52cb04c993b7..64b257f6bca2 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -248,34 +248,34 @@ static struct cpuset top_cpuset = {
248 if (is_cpuset_online(((des_cs) = css_cs((pos_css))))) 248 if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
249 249
250/* 250/*
251 * There are two global mutexes guarding cpuset structures - cpuset_mutex 251 * There are two global locks guarding cpuset structures - cpuset_mutex and
252 * and callback_mutex. The latter may nest inside the former. We also 252 * callback_lock. We also require taking task_lock() when dereferencing a
253 * require taking task_lock() when dereferencing a task's cpuset pointer. 253 * task's cpuset pointer. See "The task_lock() exception", at the end of this
254 * See "The task_lock() exception", at the end of this comment. 254 * comment.
255 * 255 *
256 * A task must hold both mutexes to modify cpusets. If a task holds 256 * A task must hold both locks to modify cpusets. If a task holds
257 * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it 257 * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it
258 * is the only task able to also acquire callback_mutex and be able to 258 * is the only task able to also acquire callback_lock and be able to
259 * modify cpusets. It can perform various checks on the cpuset structure 259 * modify cpusets. It can perform various checks on the cpuset structure
260 * first, knowing nothing will change. It can also allocate memory while 260 * first, knowing nothing will change. It can also allocate memory while
261 * just holding cpuset_mutex. While it is performing these checks, various 261 * just holding cpuset_mutex. While it is performing these checks, various
262 * callback routines can briefly acquire callback_mutex to query cpusets. 262 * callback routines can briefly acquire callback_lock to query cpusets.
263 * Once it is ready to make the changes, it takes callback_mutex, blocking 263 * Once it is ready to make the changes, it takes callback_lock, blocking
264 * everyone else. 264 * everyone else.
265 * 265 *
266 * Calls to the kernel memory allocator can not be made while holding 266 * Calls to the kernel memory allocator can not be made while holding
267 * callback_mutex, as that would risk double tripping on callback_mutex 267 * callback_lock, as that would risk double tripping on callback_lock
268 * from one of the callbacks into the cpuset code from within 268 * from one of the callbacks into the cpuset code from within
269 * __alloc_pages(). 269 * __alloc_pages().
270 * 270 *
271 * If a task is only holding callback_mutex, then it has read-only 271 * If a task is only holding callback_lock, then it has read-only
272 * access to cpusets. 272 * access to cpusets.
273 * 273 *
274 * Now, the task_struct fields mems_allowed and mempolicy may be changed 274 * Now, the task_struct fields mems_allowed and mempolicy may be changed
275 * by other task, we use alloc_lock in the task_struct fields to protect 275 * by other task, we use alloc_lock in the task_struct fields to protect
276 * them. 276 * them.
277 * 277 *
278 * The cpuset_common_file_read() handlers only hold callback_mutex across 278 * The cpuset_common_file_read() handlers only hold callback_lock across
279 * small pieces of code, such as when reading out possibly multi-word 279 * small pieces of code, such as when reading out possibly multi-word
280 * cpumasks and nodemasks. 280 * cpumasks and nodemasks.
281 * 281 *
@@ -284,7 +284,7 @@ static struct cpuset top_cpuset = {
284 */ 284 */
285 285
286static DEFINE_MUTEX(cpuset_mutex); 286static DEFINE_MUTEX(cpuset_mutex);
287static DEFINE_MUTEX(callback_mutex); 287static DEFINE_SPINLOCK(callback_lock);
288 288
289/* 289/*
290 * CPU / memory hotplug is handled asynchronously. 290 * CPU / memory hotplug is handled asynchronously.
@@ -329,7 +329,7 @@ static struct file_system_type cpuset_fs_type = {
329 * One way or another, we guarantee to return some non-empty subset 329 * One way or another, we guarantee to return some non-empty subset
330 * of cpu_online_mask. 330 * of cpu_online_mask.
331 * 331 *
332 * Call with callback_mutex held. 332 * Call with callback_lock or cpuset_mutex held.
333 */ 333 */
334static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) 334static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
335{ 335{
@@ -347,7 +347,7 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
347 * One way or another, we guarantee to return some non-empty subset 347 * One way or another, we guarantee to return some non-empty subset
348 * of node_states[N_MEMORY]. 348 * of node_states[N_MEMORY].
349 * 349 *
350 * Call with callback_mutex held. 350 * Call with callback_lock or cpuset_mutex held.
351 */ 351 */
352static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) 352static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
353{ 353{
@@ -359,7 +359,7 @@ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
359/* 359/*
360 * update task's spread flag if cpuset's page/slab spread flag is set 360 * update task's spread flag if cpuset's page/slab spread flag is set
361 * 361 *
362 * Called with callback_mutex/cpuset_mutex held 362 * Call with callback_lock or cpuset_mutex held.
363 */ 363 */
364static void cpuset_update_task_spread_flag(struct cpuset *cs, 364static void cpuset_update_task_spread_flag(struct cpuset *cs,
365 struct task_struct *tsk) 365 struct task_struct *tsk)
@@ -506,6 +506,16 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
506 goto out; 506 goto out;
507 } 507 }
508 508
509 /*
510 * We can't shrink if we won't have enough room for SCHED_DEADLINE
511 * tasks.
512 */
513 ret = -EBUSY;
514 if (is_cpu_exclusive(cur) &&
515 !cpuset_cpumask_can_shrink(cur->cpus_allowed,
516 trial->cpus_allowed))
517 goto out;
518
509 ret = 0; 519 ret = 0;
510out: 520out:
511 rcu_read_unlock(); 521 rcu_read_unlock();
@@ -876,9 +886,9 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
876 continue; 886 continue;
877 rcu_read_unlock(); 887 rcu_read_unlock();
878 888
879 mutex_lock(&callback_mutex); 889 spin_lock_irq(&callback_lock);
880 cpumask_copy(cp->effective_cpus, new_cpus); 890 cpumask_copy(cp->effective_cpus, new_cpus);
881 mutex_unlock(&callback_mutex); 891 spin_unlock_irq(&callback_lock);
882 892
883 WARN_ON(!cgroup_on_dfl(cp->css.cgroup) && 893 WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
884 !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); 894 !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
@@ -943,9 +953,9 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
943 if (retval < 0) 953 if (retval < 0)
944 return retval; 954 return retval;
945 955
946 mutex_lock(&callback_mutex); 956 spin_lock_irq(&callback_lock);
947 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); 957 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
948 mutex_unlock(&callback_mutex); 958 spin_unlock_irq(&callback_lock);
949 959
950 /* use trialcs->cpus_allowed as a temp variable */ 960 /* use trialcs->cpus_allowed as a temp variable */
951 update_cpumasks_hier(cs, trialcs->cpus_allowed); 961 update_cpumasks_hier(cs, trialcs->cpus_allowed);
@@ -1132,9 +1142,9 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
1132 continue; 1142 continue;
1133 rcu_read_unlock(); 1143 rcu_read_unlock();
1134 1144
1135 mutex_lock(&callback_mutex); 1145 spin_lock_irq(&callback_lock);
1136 cp->effective_mems = *new_mems; 1146 cp->effective_mems = *new_mems;
1137 mutex_unlock(&callback_mutex); 1147 spin_unlock_irq(&callback_lock);
1138 1148
1139 WARN_ON(!cgroup_on_dfl(cp->css.cgroup) && 1149 WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
1140 !nodes_equal(cp->mems_allowed, cp->effective_mems)); 1150 !nodes_equal(cp->mems_allowed, cp->effective_mems));
@@ -1155,7 +1165,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
1155 * mempolicies and if the cpuset is marked 'memory_migrate', 1165 * mempolicies and if the cpuset is marked 'memory_migrate',
1156 * migrate the tasks pages to the new memory. 1166 * migrate the tasks pages to the new memory.
1157 * 1167 *
1158 * Call with cpuset_mutex held. May take callback_mutex during call. 1168 * Call with cpuset_mutex held. May take callback_lock during call.
1159 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, 1169 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
1160 * lock each such tasks mm->mmap_sem, scan its vma's and rebind 1170 * lock each such tasks mm->mmap_sem, scan its vma's and rebind
1161 * their mempolicies to the cpusets new mems_allowed. 1171 * their mempolicies to the cpusets new mems_allowed.
@@ -1202,9 +1212,9 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1202 if (retval < 0) 1212 if (retval < 0)
1203 goto done; 1213 goto done;
1204 1214
1205 mutex_lock(&callback_mutex); 1215 spin_lock_irq(&callback_lock);
1206 cs->mems_allowed = trialcs->mems_allowed; 1216 cs->mems_allowed = trialcs->mems_allowed;
1207 mutex_unlock(&callback_mutex); 1217 spin_unlock_irq(&callback_lock);
1208 1218
1209 /* use trialcs->mems_allowed as a temp variable */ 1219 /* use trialcs->mems_allowed as a temp variable */
1210 update_nodemasks_hier(cs, &cs->mems_allowed); 1220 update_nodemasks_hier(cs, &cs->mems_allowed);
@@ -1295,9 +1305,9 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1295 spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs)) 1305 spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
1296 || (is_spread_page(cs) != is_spread_page(trialcs))); 1306 || (is_spread_page(cs) != is_spread_page(trialcs)));
1297 1307
1298 mutex_lock(&callback_mutex); 1308 spin_lock_irq(&callback_lock);
1299 cs->flags = trialcs->flags; 1309 cs->flags = trialcs->flags;
1300 mutex_unlock(&callback_mutex); 1310 spin_unlock_irq(&callback_lock);
1301 1311
1302 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) 1312 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
1303 rebuild_sched_domains_locked(); 1313 rebuild_sched_domains_locked();
@@ -1429,17 +1439,8 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css,
1429 goto out_unlock; 1439 goto out_unlock;
1430 1440
1431 cgroup_taskset_for_each(task, tset) { 1441 cgroup_taskset_for_each(task, tset) {
1432 /* 1442 ret = task_can_attach(task, cs->cpus_allowed);
1433 * Kthreads which disallow setaffinity shouldn't be moved 1443 if (ret)
1434 * to a new cpuset; we don't want to change their cpu
1435 * affinity and isolating such threads by their set of
1436 * allowed nodes is unnecessary. Thus, cpusets are not
1437 * applicable for such threads. This prevents checking for
1438 * success of set_cpus_allowed_ptr() on all attached tasks
1439 * before cpus_allowed may be changed.
1440 */
1441 ret = -EINVAL;
1442 if (task->flags & PF_NO_SETAFFINITY)
1443 goto out_unlock; 1444 goto out_unlock;
1444 ret = security_task_setscheduler(task); 1445 ret = security_task_setscheduler(task);
1445 if (ret) 1446 if (ret)
@@ -1713,7 +1714,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
1713 count = seq_get_buf(sf, &buf); 1714 count = seq_get_buf(sf, &buf);
1714 s = buf; 1715 s = buf;
1715 1716
1716 mutex_lock(&callback_mutex); 1717 spin_lock_irq(&callback_lock);
1717 1718
1718 switch (type) { 1719 switch (type) {
1719 case FILE_CPULIST: 1720 case FILE_CPULIST:
@@ -1740,7 +1741,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
1740 seq_commit(sf, -1); 1741 seq_commit(sf, -1);
1741 } 1742 }
1742out_unlock: 1743out_unlock:
1743 mutex_unlock(&callback_mutex); 1744 spin_unlock_irq(&callback_lock);
1744 return ret; 1745 return ret;
1745} 1746}
1746 1747
@@ -1957,12 +1958,12 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
1957 1958
1958 cpuset_inc(); 1959 cpuset_inc();
1959 1960
1960 mutex_lock(&callback_mutex); 1961 spin_lock_irq(&callback_lock);
1961 if (cgroup_on_dfl(cs->css.cgroup)) { 1962 if (cgroup_on_dfl(cs->css.cgroup)) {
1962 cpumask_copy(cs->effective_cpus, parent->effective_cpus); 1963 cpumask_copy(cs->effective_cpus, parent->effective_cpus);
1963 cs->effective_mems = parent->effective_mems; 1964 cs->effective_mems = parent->effective_mems;
1964 } 1965 }
1965 mutex_unlock(&callback_mutex); 1966 spin_unlock_irq(&callback_lock);
1966 1967
1967 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) 1968 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
1968 goto out_unlock; 1969 goto out_unlock;
@@ -1989,10 +1990,10 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
1989 } 1990 }
1990 rcu_read_unlock(); 1991 rcu_read_unlock();
1991 1992
1992 mutex_lock(&callback_mutex); 1993 spin_lock_irq(&callback_lock);
1993 cs->mems_allowed = parent->mems_allowed; 1994 cs->mems_allowed = parent->mems_allowed;
1994 cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); 1995 cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
1995 mutex_unlock(&callback_mutex); 1996 spin_unlock_irq(&callback_lock);
1996out_unlock: 1997out_unlock:
1997 mutex_unlock(&cpuset_mutex); 1998 mutex_unlock(&cpuset_mutex);
1998 return 0; 1999 return 0;
@@ -2031,7 +2032,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
2031static void cpuset_bind(struct cgroup_subsys_state *root_css) 2032static void cpuset_bind(struct cgroup_subsys_state *root_css)
2032{ 2033{
2033 mutex_lock(&cpuset_mutex); 2034 mutex_lock(&cpuset_mutex);
2034 mutex_lock(&callback_mutex); 2035 spin_lock_irq(&callback_lock);
2035 2036
2036 if (cgroup_on_dfl(root_css->cgroup)) { 2037 if (cgroup_on_dfl(root_css->cgroup)) {
2037 cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask); 2038 cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
@@ -2042,7 +2043,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
2042 top_cpuset.mems_allowed = top_cpuset.effective_mems; 2043 top_cpuset.mems_allowed = top_cpuset.effective_mems;
2043 } 2044 }
2044 2045
2045 mutex_unlock(&callback_mutex); 2046 spin_unlock_irq(&callback_lock);
2046 mutex_unlock(&cpuset_mutex); 2047 mutex_unlock(&cpuset_mutex);
2047} 2048}
2048 2049
@@ -2127,12 +2128,12 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
2127{ 2128{
2128 bool is_empty; 2129 bool is_empty;
2129 2130
2130 mutex_lock(&callback_mutex); 2131 spin_lock_irq(&callback_lock);
2131 cpumask_copy(cs->cpus_allowed, new_cpus); 2132 cpumask_copy(cs->cpus_allowed, new_cpus);
2132 cpumask_copy(cs->effective_cpus, new_cpus); 2133 cpumask_copy(cs->effective_cpus, new_cpus);
2133 cs->mems_allowed = *new_mems; 2134 cs->mems_allowed = *new_mems;
2134 cs->effective_mems = *new_mems; 2135 cs->effective_mems = *new_mems;
2135 mutex_unlock(&callback_mutex); 2136 spin_unlock_irq(&callback_lock);
2136 2137
2137 /* 2138 /*
2138 * Don't call update_tasks_cpumask() if the cpuset becomes empty, 2139 * Don't call update_tasks_cpumask() if the cpuset becomes empty,
@@ -2169,10 +2170,10 @@ hotplug_update_tasks(struct cpuset *cs,
2169 if (nodes_empty(*new_mems)) 2170 if (nodes_empty(*new_mems))
2170 *new_mems = parent_cs(cs)->effective_mems; 2171 *new_mems = parent_cs(cs)->effective_mems;
2171 2172
2172 mutex_lock(&callback_mutex); 2173 spin_lock_irq(&callback_lock);
2173 cpumask_copy(cs->effective_cpus, new_cpus); 2174 cpumask_copy(cs->effective_cpus, new_cpus);
2174 cs->effective_mems = *new_mems; 2175 cs->effective_mems = *new_mems;
2175 mutex_unlock(&callback_mutex); 2176 spin_unlock_irq(&callback_lock);
2176 2177
2177 if (cpus_updated) 2178 if (cpus_updated)
2178 update_tasks_cpumask(cs); 2179 update_tasks_cpumask(cs);
@@ -2258,21 +2259,21 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
2258 2259
2259 /* synchronize cpus_allowed to cpu_active_mask */ 2260 /* synchronize cpus_allowed to cpu_active_mask */
2260 if (cpus_updated) { 2261 if (cpus_updated) {
2261 mutex_lock(&callback_mutex); 2262 spin_lock_irq(&callback_lock);
2262 if (!on_dfl) 2263 if (!on_dfl)
2263 cpumask_copy(top_cpuset.cpus_allowed, &new_cpus); 2264 cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
2264 cpumask_copy(top_cpuset.effective_cpus, &new_cpus); 2265 cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
2265 mutex_unlock(&callback_mutex); 2266 spin_unlock_irq(&callback_lock);
2266 /* we don't mess with cpumasks of tasks in top_cpuset */ 2267 /* we don't mess with cpumasks of tasks in top_cpuset */
2267 } 2268 }
2268 2269
2269 /* synchronize mems_allowed to N_MEMORY */ 2270 /* synchronize mems_allowed to N_MEMORY */
2270 if (mems_updated) { 2271 if (mems_updated) {
2271 mutex_lock(&callback_mutex); 2272 spin_lock_irq(&callback_lock);
2272 if (!on_dfl) 2273 if (!on_dfl)
2273 top_cpuset.mems_allowed = new_mems; 2274 top_cpuset.mems_allowed = new_mems;
2274 top_cpuset.effective_mems = new_mems; 2275 top_cpuset.effective_mems = new_mems;
2275 mutex_unlock(&callback_mutex); 2276 spin_unlock_irq(&callback_lock);
2276 update_tasks_nodemask(&top_cpuset); 2277 update_tasks_nodemask(&top_cpuset);
2277 } 2278 }
2278 2279
@@ -2365,11 +2366,13 @@ void __init cpuset_init_smp(void)
2365 2366
2366void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) 2367void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
2367{ 2368{
2368 mutex_lock(&callback_mutex); 2369 unsigned long flags;
2370
2371 spin_lock_irqsave(&callback_lock, flags);
2369 rcu_read_lock(); 2372 rcu_read_lock();
2370 guarantee_online_cpus(task_cs(tsk), pmask); 2373 guarantee_online_cpus(task_cs(tsk), pmask);
2371 rcu_read_unlock(); 2374 rcu_read_unlock();
2372 mutex_unlock(&callback_mutex); 2375 spin_unlock_irqrestore(&callback_lock, flags);
2373} 2376}
2374 2377
2375void cpuset_cpus_allowed_fallback(struct task_struct *tsk) 2378void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
@@ -2415,12 +2418,13 @@ void cpuset_init_current_mems_allowed(void)
2415nodemask_t cpuset_mems_allowed(struct task_struct *tsk) 2418nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
2416{ 2419{
2417 nodemask_t mask; 2420 nodemask_t mask;
2421 unsigned long flags;
2418 2422
2419 mutex_lock(&callback_mutex); 2423 spin_lock_irqsave(&callback_lock, flags);
2420 rcu_read_lock(); 2424 rcu_read_lock();
2421 guarantee_online_mems(task_cs(tsk), &mask); 2425 guarantee_online_mems(task_cs(tsk), &mask);
2422 rcu_read_unlock(); 2426 rcu_read_unlock();
2423 mutex_unlock(&callback_mutex); 2427 spin_unlock_irqrestore(&callback_lock, flags);
2424 2428
2425 return mask; 2429 return mask;
2426} 2430}
@@ -2439,7 +2443,7 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
2439/* 2443/*
2440 * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or 2444 * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
2441 * mem_hardwall ancestor to the specified cpuset. Call holding 2445 * mem_hardwall ancestor to the specified cpuset. Call holding
2442 * callback_mutex. If no ancestor is mem_exclusive or mem_hardwall 2446 * callback_lock. If no ancestor is mem_exclusive or mem_hardwall
2443 * (an unusual configuration), then returns the root cpuset. 2447 * (an unusual configuration), then returns the root cpuset.
2444 */ 2448 */
2445static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) 2449static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
@@ -2450,7 +2454,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
2450} 2454}
2451 2455
2452/** 2456/**
2453 * cpuset_node_allowed_softwall - Can we allocate on a memory node? 2457 * cpuset_node_allowed - Can we allocate on a memory node?
2454 * @node: is this an allowed node? 2458 * @node: is this an allowed node?
2455 * @gfp_mask: memory allocation flags 2459 * @gfp_mask: memory allocation flags
2456 * 2460 *
@@ -2462,13 +2466,6 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
2462 * flag, yes. 2466 * flag, yes.
2463 * Otherwise, no. 2467 * Otherwise, no.
2464 * 2468 *
2465 * If __GFP_HARDWALL is set, cpuset_node_allowed_softwall() reduces to
2466 * cpuset_node_allowed_hardwall(). Otherwise, cpuset_node_allowed_softwall()
2467 * might sleep, and might allow a node from an enclosing cpuset.
2468 *
2469 * cpuset_node_allowed_hardwall() only handles the simpler case of hardwall
2470 * cpusets, and never sleeps.
2471 *
2472 * The __GFP_THISNODE placement logic is really handled elsewhere, 2469 * The __GFP_THISNODE placement logic is really handled elsewhere,
2473 * by forcibly using a zonelist starting at a specified node, and by 2470 * by forcibly using a zonelist starting at a specified node, and by
2474 * (in get_page_from_freelist()) refusing to consider the zones for 2471 * (in get_page_from_freelist()) refusing to consider the zones for
@@ -2481,13 +2478,12 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
2481 * GFP_KERNEL allocations are not so marked, so can escape to the 2478 * GFP_KERNEL allocations are not so marked, so can escape to the
2482 * nearest enclosing hardwalled ancestor cpuset. 2479 * nearest enclosing hardwalled ancestor cpuset.
2483 * 2480 *
2484 * Scanning up parent cpusets requires callback_mutex. The 2481 * Scanning up parent cpusets requires callback_lock. The
2485 * __alloc_pages() routine only calls here with __GFP_HARDWALL bit 2482 * __alloc_pages() routine only calls here with __GFP_HARDWALL bit
2486 * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the 2483 * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
2487 * current tasks mems_allowed came up empty on the first pass over 2484 * current tasks mems_allowed came up empty on the first pass over
2488 * the zonelist. So only GFP_KERNEL allocations, if all nodes in the 2485 * the zonelist. So only GFP_KERNEL allocations, if all nodes in the
2489 * cpuset are short of memory, might require taking the callback_mutex 2486 * cpuset are short of memory, might require taking the callback_lock.
2490 * mutex.
2491 * 2487 *
2492 * The first call here from mm/page_alloc:get_page_from_freelist() 2488 * The first call here from mm/page_alloc:get_page_from_freelist()
2493 * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets, 2489 * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
@@ -2504,20 +2500,15 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
2504 * TIF_MEMDIE - any node ok 2500 * TIF_MEMDIE - any node ok
2505 * GFP_KERNEL - any node in enclosing hardwalled cpuset ok 2501 * GFP_KERNEL - any node in enclosing hardwalled cpuset ok
2506 * GFP_USER - only nodes in current tasks mems allowed ok. 2502 * GFP_USER - only nodes in current tasks mems allowed ok.
2507 *
2508 * Rule:
2509 * Don't call cpuset_node_allowed_softwall if you can't sleep, unless you
2510 * pass in the __GFP_HARDWALL flag set in gfp_flag, which disables
2511 * the code that might scan up ancestor cpusets and sleep.
2512 */ 2503 */
2513int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) 2504int __cpuset_node_allowed(int node, gfp_t gfp_mask)
2514{ 2505{
2515 struct cpuset *cs; /* current cpuset ancestors */ 2506 struct cpuset *cs; /* current cpuset ancestors */
2516 int allowed; /* is allocation in zone z allowed? */ 2507 int allowed; /* is allocation in zone z allowed? */
2508 unsigned long flags;
2517 2509
2518 if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) 2510 if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
2519 return 1; 2511 return 1;
2520 might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
2521 if (node_isset(node, current->mems_allowed)) 2512 if (node_isset(node, current->mems_allowed))
2522 return 1; 2513 return 1;
2523 /* 2514 /*
@@ -2533,55 +2524,17 @@ int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
2533 return 1; 2524 return 1;
2534 2525
2535 /* Not hardwall and node outside mems_allowed: scan up cpusets */ 2526 /* Not hardwall and node outside mems_allowed: scan up cpusets */
2536 mutex_lock(&callback_mutex); 2527 spin_lock_irqsave(&callback_lock, flags);
2537 2528
2538 rcu_read_lock(); 2529 rcu_read_lock();
2539 cs = nearest_hardwall_ancestor(task_cs(current)); 2530 cs = nearest_hardwall_ancestor(task_cs(current));
2540 allowed = node_isset(node, cs->mems_allowed); 2531 allowed = node_isset(node, cs->mems_allowed);
2541 rcu_read_unlock(); 2532 rcu_read_unlock();
2542 2533
2543 mutex_unlock(&callback_mutex); 2534 spin_unlock_irqrestore(&callback_lock, flags);
2544 return allowed; 2535 return allowed;
2545} 2536}
2546 2537
2547/*
2548 * cpuset_node_allowed_hardwall - Can we allocate on a memory node?
2549 * @node: is this an allowed node?
2550 * @gfp_mask: memory allocation flags
2551 *
2552 * If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is
2553 * set, yes, we can always allocate. If node is in our task's mems_allowed,
2554 * yes. If the task has been OOM killed and has access to memory reserves as
2555 * specified by the TIF_MEMDIE flag, yes.
2556 * Otherwise, no.
2557 *
2558 * The __GFP_THISNODE placement logic is really handled elsewhere,
2559 * by forcibly using a zonelist starting at a specified node, and by
2560 * (in get_page_from_freelist()) refusing to consider the zones for
2561 * any node on the zonelist except the first. By the time any such
2562 * calls get to this routine, we should just shut up and say 'yes'.
2563 *
2564 * Unlike the cpuset_node_allowed_softwall() variant, above,
2565 * this variant requires that the node be in the current task's
2566 * mems_allowed or that we're in interrupt. It does not scan up the
2567 * cpuset hierarchy for the nearest enclosing mem_exclusive cpuset.
2568 * It never sleeps.
2569 */
2570int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
2571{
2572 if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
2573 return 1;
2574 if (node_isset(node, current->mems_allowed))
2575 return 1;
2576 /*
2577 * Allow tasks that have access to memory reserves because they have
2578 * been OOM killed to get memory anywhere.
2579 */
2580 if (unlikely(test_thread_flag(TIF_MEMDIE)))
2581 return 1;
2582 return 0;
2583}
2584
2585/** 2538/**
2586 * cpuset_mem_spread_node() - On which node to begin search for a file page 2539 * cpuset_mem_spread_node() - On which node to begin search for a file page
2587 * cpuset_slab_spread_node() - On which node to begin search for a slab page 2540 * cpuset_slab_spread_node() - On which node to begin search for a slab page
@@ -2730,10 +2683,9 @@ void __cpuset_memory_pressure_bump(void)
2730 * and we take cpuset_mutex, keeping cpuset_attach() from changing it 2683 * and we take cpuset_mutex, keeping cpuset_attach() from changing it
2731 * anyway. 2684 * anyway.
2732 */ 2685 */
2733int proc_cpuset_show(struct seq_file *m, void *unused_v) 2686int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
2687 struct pid *pid, struct task_struct *tsk)
2734{ 2688{
2735 struct pid *pid;
2736 struct task_struct *tsk;
2737 char *buf, *p; 2689 char *buf, *p;
2738 struct cgroup_subsys_state *css; 2690 struct cgroup_subsys_state *css;
2739 int retval; 2691 int retval;
@@ -2743,24 +2695,16 @@ int proc_cpuset_show(struct seq_file *m, void *unused_v)
2743 if (!buf) 2695 if (!buf)
2744 goto out; 2696 goto out;
2745 2697
2746 retval = -ESRCH;
2747 pid = m->private;
2748 tsk = get_pid_task(pid, PIDTYPE_PID);
2749 if (!tsk)
2750 goto out_free;
2751
2752 retval = -ENAMETOOLONG; 2698 retval = -ENAMETOOLONG;
2753 rcu_read_lock(); 2699 rcu_read_lock();
2754 css = task_css(tsk, cpuset_cgrp_id); 2700 css = task_css(tsk, cpuset_cgrp_id);
2755 p = cgroup_path(css->cgroup, buf, PATH_MAX); 2701 p = cgroup_path(css->cgroup, buf, PATH_MAX);
2756 rcu_read_unlock(); 2702 rcu_read_unlock();
2757 if (!p) 2703 if (!p)
2758 goto out_put_task; 2704 goto out_free;
2759 seq_puts(m, p); 2705 seq_puts(m, p);
2760 seq_putc(m, '\n'); 2706 seq_putc(m, '\n');
2761 retval = 0; 2707 retval = 0;
2762out_put_task:
2763 put_task_struct(tsk);
2764out_free: 2708out_free:
2765 kfree(buf); 2709 kfree(buf);
2766out: 2710out:
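
The cpuset.c conversion above swaps callback_mutex for callback_lock, a spinlock, so that short read-side sections such as cpuset_cpus_allowed() and __cpuset_node_allowed() can run where sleeping is not allowed; paths that may be reached with interrupts disabled use the irqsave variant. A kernel-style sketch of that pattern under invented names (example_lock, example_allowed), not the cpuset code itself:

    #include <linux/spinlock.h>
    #include <linux/cpumask.h>

    static DEFINE_SPINLOCK(example_lock);
    static struct cpumask example_allowed;   /* protected by example_lock */

    /* Writer: process context, interrupts known to be enabled. */
    static void example_update(const struct cpumask *new_mask)
    {
            spin_lock_irq(&example_lock);
            cpumask_copy(&example_allowed, new_mask);
            spin_unlock_irq(&example_lock);
    }

    /* Reader: may be called where sleeping is forbidden, hence irqsave. */
    static void example_query(struct cpumask *out)
    {
            unsigned long flags;

            spin_lock_irqsave(&example_lock, flags);
            cpumask_copy(out, &example_allowed);
            spin_unlock_irqrestore(&example_lock, flags);
    }
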
diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c
index c766ee54c0b1..b64e238b553b 100644
--- a/kernel/crash_dump.c
+++ b/kernel/crash_dump.c
@@ -18,6 +18,7 @@ unsigned long saved_max_pfn;
18 * it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE. 18 * it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE.
19 */ 19 */
20unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; 20unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
21EXPORT_SYMBOL_GPL(elfcorehdr_addr);
21 22
22/* 23/*
23 * stores the size of elf header of crash image 24 * stores the size of elf header of crash image
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 1adf62b39b96..07ce18ca71e0 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -27,6 +27,9 @@
27 * version 2. This program is licensed "as is" without any warranty of any 27 * version 2. This program is licensed "as is" without any warranty of any
28 * kind, whether express or implied. 28 * kind, whether express or implied.
29 */ 29 */
30
31#define pr_fmt(fmt) "KGDB: " fmt
32
30#include <linux/pid_namespace.h> 33#include <linux/pid_namespace.h>
31#include <linux/clocksource.h> 34#include <linux/clocksource.h>
32#include <linux/serial_core.h> 35#include <linux/serial_core.h>
@@ -196,8 +199,8 @@ int __weak kgdb_validate_break_address(unsigned long addr)
196 return err; 199 return err;
197 err = kgdb_arch_remove_breakpoint(&tmp); 200 err = kgdb_arch_remove_breakpoint(&tmp);
198 if (err) 201 if (err)
199 printk(KERN_ERR "KGDB: Critical breakpoint error, kernel " 202 pr_err("Critical breakpoint error, kernel memory destroyed at: %lx\n",
200 "memory destroyed at: %lx", addr); 203 addr);
201 return err; 204 return err;
202} 205}
203 206
@@ -256,8 +259,8 @@ int dbg_activate_sw_breakpoints(void)
256 error = kgdb_arch_set_breakpoint(&kgdb_break[i]); 259 error = kgdb_arch_set_breakpoint(&kgdb_break[i]);
257 if (error) { 260 if (error) {
258 ret = error; 261 ret = error;
259 printk(KERN_INFO "KGDB: BP install failed: %lx", 262 pr_info("BP install failed: %lx\n",
260 kgdb_break[i].bpt_addr); 263 kgdb_break[i].bpt_addr);
261 continue; 264 continue;
262 } 265 }
263 266
@@ -319,8 +322,8 @@ int dbg_deactivate_sw_breakpoints(void)
319 continue; 322 continue;
320 error = kgdb_arch_remove_breakpoint(&kgdb_break[i]); 323 error = kgdb_arch_remove_breakpoint(&kgdb_break[i]);
321 if (error) { 324 if (error) {
322 printk(KERN_INFO "KGDB: BP remove failed: %lx\n", 325 pr_info("BP remove failed: %lx\n",
323 kgdb_break[i].bpt_addr); 326 kgdb_break[i].bpt_addr);
324 ret = error; 327 ret = error;
325 } 328 }
326 329
@@ -367,7 +370,7 @@ int dbg_remove_all_break(void)
367 goto setundefined; 370 goto setundefined;
368 error = kgdb_arch_remove_breakpoint(&kgdb_break[i]); 371 error = kgdb_arch_remove_breakpoint(&kgdb_break[i]);
369 if (error) 372 if (error)
370 printk(KERN_ERR "KGDB: breakpoint remove failed: %lx\n", 373 pr_err("breakpoint remove failed: %lx\n",
371 kgdb_break[i].bpt_addr); 374 kgdb_break[i].bpt_addr);
372setundefined: 375setundefined:
373 kgdb_break[i].state = BP_UNDEFINED; 376 kgdb_break[i].state = BP_UNDEFINED;
@@ -400,9 +403,9 @@ static int kgdb_io_ready(int print_wait)
400 if (print_wait) { 403 if (print_wait) {
401#ifdef CONFIG_KGDB_KDB 404#ifdef CONFIG_KGDB_KDB
402 if (!dbg_kdb_mode) 405 if (!dbg_kdb_mode)
403 printk(KERN_CRIT "KGDB: waiting... or $3#33 for KDB\n"); 406 pr_crit("waiting... or $3#33 for KDB\n");
404#else 407#else
405 printk(KERN_CRIT "KGDB: Waiting for remote debugger\n"); 408 pr_crit("Waiting for remote debugger\n");
406#endif 409#endif
407 } 410 }
408 return 1; 411 return 1;
@@ -430,8 +433,7 @@ static int kgdb_reenter_check(struct kgdb_state *ks)
430 exception_level = 0; 433 exception_level = 0;
431 kgdb_skipexception(ks->ex_vector, ks->linux_regs); 434 kgdb_skipexception(ks->ex_vector, ks->linux_regs);
432 dbg_activate_sw_breakpoints(); 435 dbg_activate_sw_breakpoints();
433 printk(KERN_CRIT "KGDB: re-enter error: breakpoint removed %lx\n", 436 pr_crit("re-enter error: breakpoint removed %lx\n", addr);
434 addr);
435 WARN_ON_ONCE(1); 437 WARN_ON_ONCE(1);
436 438
437 return 1; 439 return 1;
@@ -444,7 +446,7 @@ static int kgdb_reenter_check(struct kgdb_state *ks)
444 panic("Recursive entry to debugger"); 446 panic("Recursive entry to debugger");
445 } 447 }
446 448
447 printk(KERN_CRIT "KGDB: re-enter exception: ALL breakpoints killed\n"); 449 pr_crit("re-enter exception: ALL breakpoints killed\n");
448#ifdef CONFIG_KGDB_KDB 450#ifdef CONFIG_KGDB_KDB
449 /* Allow kdb to debug itself one level */ 451 /* Allow kdb to debug itself one level */
450 return 0; 452 return 0;
@@ -471,6 +473,7 @@ static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs,
471 int cpu; 473 int cpu;
472 int trace_on = 0; 474 int trace_on = 0;
473 int online_cpus = num_online_cpus(); 475 int online_cpus = num_online_cpus();
476 u64 time_left;
474 477
475 kgdb_info[ks->cpu].enter_kgdb++; 478 kgdb_info[ks->cpu].enter_kgdb++;
476 kgdb_info[ks->cpu].exception_state |= exception_state; 479 kgdb_info[ks->cpu].exception_state |= exception_state;
@@ -595,9 +598,13 @@ return_normal:
595 /* 598 /*
596 * Wait for the other CPUs to be notified and be waiting for us: 599 * Wait for the other CPUs to be notified and be waiting for us:
597 */ 600 */
598 while (kgdb_do_roundup && (atomic_read(&masters_in_kgdb) + 601 time_left = loops_per_jiffy * HZ;
599 atomic_read(&slaves_in_kgdb)) != online_cpus) 602 while (kgdb_do_roundup && --time_left &&
603 (atomic_read(&masters_in_kgdb) + atomic_read(&slaves_in_kgdb)) !=
604 online_cpus)
600 cpu_relax(); 605 cpu_relax();
606 if (!time_left)
607 pr_crit("KGDB: Timed out waiting for secondary CPUs.\n");
601 608
602 /* 609 /*
603 * At this point the primary processor is completely 610 * At this point the primary processor is completely
@@ -795,15 +802,15 @@ static struct console kgdbcons = {
795static void sysrq_handle_dbg(int key) 802static void sysrq_handle_dbg(int key)
796{ 803{
797 if (!dbg_io_ops) { 804 if (!dbg_io_ops) {
798 printk(KERN_CRIT "ERROR: No KGDB I/O module available\n"); 805 pr_crit("ERROR: No KGDB I/O module available\n");
799 return; 806 return;
800 } 807 }
801 if (!kgdb_connected) { 808 if (!kgdb_connected) {
802#ifdef CONFIG_KGDB_KDB 809#ifdef CONFIG_KGDB_KDB
803 if (!dbg_kdb_mode) 810 if (!dbg_kdb_mode)
804 printk(KERN_CRIT "KGDB or $3#33 for KDB\n"); 811 pr_crit("KGDB or $3#33 for KDB\n");
805#else 812#else
806 printk(KERN_CRIT "Entering KGDB\n"); 813 pr_crit("Entering KGDB\n");
807#endif 814#endif
808 } 815 }
809 816
@@ -945,7 +952,7 @@ static void kgdb_initial_breakpoint(void)
945{ 952{
946 kgdb_break_asap = 0; 953 kgdb_break_asap = 0;
947 954
948 printk(KERN_CRIT "kgdb: Waiting for connection from remote gdb...\n"); 955 pr_crit("Waiting for connection from remote gdb...\n");
949 kgdb_breakpoint(); 956 kgdb_breakpoint();
950} 957}
951 958
@@ -964,8 +971,7 @@ int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops)
964 if (dbg_io_ops) { 971 if (dbg_io_ops) {
965 spin_unlock(&kgdb_registration_lock); 972 spin_unlock(&kgdb_registration_lock);
966 973
967 printk(KERN_ERR "kgdb: Another I/O driver is already " 974 pr_err("Another I/O driver is already registered with KGDB\n");
968 "registered with KGDB.\n");
969 return -EBUSY; 975 return -EBUSY;
970 } 976 }
971 977
@@ -981,8 +987,7 @@ int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops)
981 987
982 spin_unlock(&kgdb_registration_lock); 988 spin_unlock(&kgdb_registration_lock);
983 989
984 printk(KERN_INFO "kgdb: Registered I/O driver %s.\n", 990 pr_info("Registered I/O driver %s\n", new_dbg_io_ops->name);
985 new_dbg_io_ops->name);
986 991
987 /* Arm KGDB now. */ 992 /* Arm KGDB now. */
988 kgdb_register_callbacks(); 993 kgdb_register_callbacks();
@@ -1017,8 +1022,7 @@ void kgdb_unregister_io_module(struct kgdb_io *old_dbg_io_ops)
1017 1022
1018 spin_unlock(&kgdb_registration_lock); 1023 spin_unlock(&kgdb_registration_lock);
1019 1024
1020 printk(KERN_INFO 1025 pr_info("Unregistered I/O driver %s, debugger disabled\n",
1021 "kgdb: Unregistered I/O driver %s, debugger disabled.\n",
1022 old_dbg_io_ops->name); 1026 old_dbg_io_ops->name);
1023} 1027}
1024EXPORT_SYMBOL_GPL(kgdb_unregister_io_module); 1028EXPORT_SYMBOL_GPL(kgdb_unregister_io_module);
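
The debug_core.c cleanups above lean on the kernel's pr_fmt() convention: defining pr_fmt before the first include makes every pr_err()/pr_crit()/pr_info() in the file prepend "KGDB: " automatically, so the literal prefixes can be dropped from the messages themselves. A minimal sketch of the idiom with a made-up subsystem name:

    /* Must appear before the first include that pulls in printk.h. */
    #define pr_fmt(fmt) "mydrv: " fmt

    #include <linux/printk.h>

    static void report_failure(unsigned long addr)
    {
            /* Emits "mydrv: breakpoint remove failed: <addr>" */
            pr_err("breakpoint remove failed: %lx\n", addr);
    }
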
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c
index 70a504601dc3..e1dbf4a2c69e 100644
--- a/kernel/debug/kdb/kdb_bp.c
+++ b/kernel/debug/kdb/kdb_bp.c
@@ -52,11 +52,11 @@ static int kdb_parsebp(int argc, const char **argv, int *nextargp, kdb_bp_t *bp)
52 52
53 bp->bph_length = 1; 53 bp->bph_length = 1;
54 if ((argc + 1) != nextarg) { 54 if ((argc + 1) != nextarg) {
55 if (strnicmp(argv[nextarg], "datar", sizeof("datar")) == 0) 55 if (strncasecmp(argv[nextarg], "datar", sizeof("datar")) == 0)
56 bp->bp_type = BP_ACCESS_WATCHPOINT; 56 bp->bp_type = BP_ACCESS_WATCHPOINT;
57 else if (strnicmp(argv[nextarg], "dataw", sizeof("dataw")) == 0) 57 else if (strncasecmp(argv[nextarg], "dataw", sizeof("dataw")) == 0)
58 bp->bp_type = BP_WRITE_WATCHPOINT; 58 bp->bp_type = BP_WRITE_WATCHPOINT;
59 else if (strnicmp(argv[nextarg], "inst", sizeof("inst")) == 0) 59 else if (strncasecmp(argv[nextarg], "inst", sizeof("inst")) == 0)
60 bp->bp_type = BP_HARDWARE_BREAKPOINT; 60 bp->bp_type = BP_HARDWARE_BREAKPOINT;
61 else 61 else
62 return KDB_ARGCOUNT; 62 return KDB_ARGCOUNT;
@@ -531,22 +531,29 @@ void __init kdb_initbptab(void)
531 for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) 531 for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++)
532 bp->bp_free = 1; 532 bp->bp_free = 1;
533 533
534 kdb_register_repeat("bp", kdb_bp, "[<vaddr>]", 534 kdb_register_flags("bp", kdb_bp, "[<vaddr>]",
535 "Set/Display breakpoints", 0, KDB_REPEAT_NO_ARGS); 535 "Set/Display breakpoints", 0,
536 kdb_register_repeat("bl", kdb_bp, "[<vaddr>]", 536 KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS);
537 "Display breakpoints", 0, KDB_REPEAT_NO_ARGS); 537 kdb_register_flags("bl", kdb_bp, "[<vaddr>]",
538 "Display breakpoints", 0,
539 KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS);
538 if (arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT) 540 if (arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT)
539 kdb_register_repeat("bph", kdb_bp, "[<vaddr>]", 541 kdb_register_flags("bph", kdb_bp, "[<vaddr>]",
540 "[datar [length]|dataw [length]] Set hw brk", 0, KDB_REPEAT_NO_ARGS); 542 "[datar [length]|dataw [length]] Set hw brk", 0,
541 kdb_register_repeat("bc", kdb_bc, "<bpnum>", 543 KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS);
542 "Clear Breakpoint", 0, KDB_REPEAT_NONE); 544 kdb_register_flags("bc", kdb_bc, "<bpnum>",
543 kdb_register_repeat("be", kdb_bc, "<bpnum>", 545 "Clear Breakpoint", 0,
544 "Enable Breakpoint", 0, KDB_REPEAT_NONE); 546 KDB_ENABLE_FLOW_CTRL);
545 kdb_register_repeat("bd", kdb_bc, "<bpnum>", 547 kdb_register_flags("be", kdb_bc, "<bpnum>",
546 "Disable Breakpoint", 0, KDB_REPEAT_NONE); 548 "Enable Breakpoint", 0,
547 549 KDB_ENABLE_FLOW_CTRL);
548 kdb_register_repeat("ss", kdb_ss, "", 550 kdb_register_flags("bd", kdb_bc, "<bpnum>",
549 "Single Step", 1, KDB_REPEAT_NO_ARGS); 551 "Disable Breakpoint", 0,
552 KDB_ENABLE_FLOW_CTRL);
553
554 kdb_register_flags("ss", kdb_ss, "",
555 "Single Step", 1,
556 KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS);
550 /* 557 /*
551 * Architecture dependent initialization. 558 * Architecture dependent initialization.
552 */ 559 */
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
index 8859ca34dcfe..15e1a7af5dd0 100644
--- a/kernel/debug/kdb/kdb_debugger.c
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -129,6 +129,10 @@ int kdb_stub(struct kgdb_state *ks)
129 ks->pass_exception = 1; 129 ks->pass_exception = 1;
130 KDB_FLAG_SET(CATASTROPHIC); 130 KDB_FLAG_SET(CATASTROPHIC);
131 } 131 }
132 /* set CATASTROPHIC if the system contains unresponsive processors */
133 for_each_online_cpu(i)
134 if (!kgdb_info[i].enter_kgdb)
135 KDB_FLAG_SET(CATASTROPHIC);
132 if (KDB_STATE(SSBPT) && reason == KDB_REASON_SSTEP) { 136 if (KDB_STATE(SSBPT) && reason == KDB_REASON_SSTEP) {
133 KDB_STATE_CLEAR(SSBPT); 137 KDB_STATE_CLEAR(SSBPT);
134 KDB_STATE_CLEAR(DOING_SS); 138 KDB_STATE_CLEAR(DOING_SS);
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 379650b984f8..f191bddf64b8 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -12,6 +12,7 @@
12 */ 12 */
13 13
14#include <linux/ctype.h> 14#include <linux/ctype.h>
15#include <linux/types.h>
15#include <linux/string.h> 16#include <linux/string.h>
16#include <linux/kernel.h> 17#include <linux/kernel.h>
17#include <linux/kmsg_dump.h> 18#include <linux/kmsg_dump.h>
@@ -23,6 +24,7 @@
23#include <linux/vmalloc.h> 24#include <linux/vmalloc.h>
24#include <linux/atomic.h> 25#include <linux/atomic.h>
25#include <linux/module.h> 26#include <linux/module.h>
27#include <linux/moduleparam.h>
26#include <linux/mm.h> 28#include <linux/mm.h>
27#include <linux/init.h> 29#include <linux/init.h>
28#include <linux/kallsyms.h> 30#include <linux/kallsyms.h>
@@ -42,6 +44,12 @@
42#include <linux/slab.h> 44#include <linux/slab.h>
43#include "kdb_private.h" 45#include "kdb_private.h"
44 46
47#undef MODULE_PARAM_PREFIX
48#define MODULE_PARAM_PREFIX "kdb."
49
50static int kdb_cmd_enabled = CONFIG_KDB_DEFAULT_ENABLE;
51module_param_named(cmd_enable, kdb_cmd_enabled, int, 0600);
52
45#define GREP_LEN 256 53#define GREP_LEN 256
46char kdb_grep_string[GREP_LEN]; 54char kdb_grep_string[GREP_LEN];
47int kdb_grepping_flag; 55int kdb_grepping_flag;
@@ -121,6 +129,7 @@ static kdbmsg_t kdbmsgs[] = {
121 KDBMSG(BADLENGTH, "Invalid length field"), 129 KDBMSG(BADLENGTH, "Invalid length field"),
122 KDBMSG(NOBP, "No Breakpoint exists"), 130 KDBMSG(NOBP, "No Breakpoint exists"),
123 KDBMSG(BADADDR, "Invalid address"), 131 KDBMSG(BADADDR, "Invalid address"),
132 KDBMSG(NOPERM, "Permission denied"),
124}; 133};
125#undef KDBMSG 134#undef KDBMSG
126 135
@@ -188,6 +197,26 @@ struct task_struct *kdb_curr_task(int cpu)
188} 197}
189 198
190/* 199/*
200 * Check whether the flags of the current command and the permissions
 201 * of the kdb console allow a command to be run.
202 */
203static inline bool kdb_check_flags(kdb_cmdflags_t flags, int permissions,
204 bool no_args)
205{
206 /* permissions comes from userspace so needs massaging slightly */
207 permissions &= KDB_ENABLE_MASK;
208 permissions |= KDB_ENABLE_ALWAYS_SAFE;
209
210 /* some commands change group when launched with no arguments */
211 if (no_args)
212 permissions |= permissions << KDB_ENABLE_NO_ARGS_SHIFT;
213
214 flags |= KDB_ENABLE_ALL;
215
216 return permissions & flags;
217}
218
219/*
191 * kdbgetenv - This function will return the character string value of 220 * kdbgetenv - This function will return the character string value of
192 * an environment variable. 221 * an environment variable.
193 * Parameters: 222 * Parameters:
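
kdb_check_flags() above is pure mask arithmetic: the console's permission word is widened with the always-safe class (and, for argument-less invocations, with the shifted no-args classes), the command's flags are widened with the catch-all enable-all class, and the two are intersected. A standalone sketch of that composition using made-up flag values (the real KDB_ENABLE_* constants are defined elsewhere in this series and differ from these):

    #include <stdbool.h>
    #include <stdio.h>

    /* Hypothetical stand-ins for the KDB_ENABLE_* constants. */
    enum {
            EN_ALWAYS_SAFE   = 0x01,
            EN_MEM_READ      = 0x02,
            EN_FLOW_CTRL     = 0x04,
            EN_ALL           = 0x08,   /* "everything enabled" class */
            EN_MASK          = 0x0f,
            EN_NO_ARGS_SHIFT = 4,      /* no-args classes occupy the next nibble */
    };

    static bool check_flags(unsigned int cmd_flags, unsigned int permissions,
                            bool no_args)
    {
            permissions &= EN_MASK;
            permissions |= EN_ALWAYS_SAFE;
            if (no_args)
                    permissions |= permissions << EN_NO_ARGS_SHIFT;
            cmd_flags |= EN_ALL;
            return permissions & cmd_flags;
    }

    int main(void)
    {
            /* A flow-control command is refused on a read-only console... */
            printf("%d\n", check_flags(EN_FLOW_CTRL, EN_MEM_READ, false));  /* 0 */
            /* ...but allowed once the console enables flow control. */
            printf("%d\n", check_flags(EN_FLOW_CTRL, EN_FLOW_CTRL, false)); /* 1 */
            return 0;
    }
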
@@ -476,6 +505,15 @@ int kdbgetaddrarg(int argc, const char **argv, int *nextarg,
476 kdb_symtab_t symtab; 505 kdb_symtab_t symtab;
477 506
478 /* 507 /*
508 * If the enable flags prohibit both arbitrary memory access
509 * and flow control then there are no reasonable grounds to
510 * provide symbol lookup.
511 */
512 if (!kdb_check_flags(KDB_ENABLE_MEM_READ | KDB_ENABLE_FLOW_CTRL,
513 kdb_cmd_enabled, false))
514 return KDB_NOPERM;
515
516 /*
479 * Process arguments which follow the following syntax: 517 * Process arguments which follow the following syntax:
480 * 518 *
481 * symbol | numeric-address [+/- numeric-offset] 519 * symbol | numeric-address [+/- numeric-offset]
@@ -641,8 +679,13 @@ static int kdb_defcmd2(const char *cmdstr, const char *argv0)
641 if (!s->count) 679 if (!s->count)
642 s->usable = 0; 680 s->usable = 0;
643 if (s->usable) 681 if (s->usable)
644 kdb_register(s->name, kdb_exec_defcmd, 682 /* macros are always safe because when executed each
645 s->usage, s->help, 0); 683 * internal command re-enters kdb_parse() and is
684 * safety checked individually.
685 */
686 kdb_register_flags(s->name, kdb_exec_defcmd, s->usage,
687 s->help, 0,
688 KDB_ENABLE_ALWAYS_SAFE);
646 return 0; 689 return 0;
647 } 690 }
648 if (!s->usable) 691 if (!s->usable)
@@ -1003,25 +1046,22 @@ int kdb_parse(const char *cmdstr)
1003 1046
1004 if (i < kdb_max_commands) { 1047 if (i < kdb_max_commands) {
1005 int result; 1048 int result;
1049
1050 if (!kdb_check_flags(tp->cmd_flags, kdb_cmd_enabled, argc <= 1))
1051 return KDB_NOPERM;
1052
1006 KDB_STATE_SET(CMD); 1053 KDB_STATE_SET(CMD);
1007 result = (*tp->cmd_func)(argc-1, (const char **)argv); 1054 result = (*tp->cmd_func)(argc-1, (const char **)argv);
1008 if (result && ignore_errors && result > KDB_CMD_GO) 1055 if (result && ignore_errors && result > KDB_CMD_GO)
1009 result = 0; 1056 result = 0;
1010 KDB_STATE_CLEAR(CMD); 1057 KDB_STATE_CLEAR(CMD);
1011 switch (tp->cmd_repeat) { 1058
1012 case KDB_REPEAT_NONE: 1059 if (tp->cmd_flags & KDB_REPEAT_WITH_ARGS)
1013 argc = 0; 1060 return result;
1014 if (argv[0]) 1061
1015 *(argv[0]) = '\0'; 1062 argc = tp->cmd_flags & KDB_REPEAT_NO_ARGS ? 1 : 0;
1016 break; 1063 if (argv[argc])
1017 case KDB_REPEAT_NO_ARGS: 1064 *(argv[argc]) = '\0';
1018 argc = 1;
1019 if (argv[1])
1020 *(argv[1]) = '\0';
1021 break;
1022 case KDB_REPEAT_WITH_ARGS:
1023 break;
1024 }
1025 return result; 1065 return result;
1026 } 1066 }
1027 1067
@@ -1921,10 +1961,14 @@ static int kdb_rm(int argc, const char **argv)
1921 */ 1961 */
1922static int kdb_sr(int argc, const char **argv) 1962static int kdb_sr(int argc, const char **argv)
1923{ 1963{
1964 bool check_mask =
1965 !kdb_check_flags(KDB_ENABLE_ALL, kdb_cmd_enabled, false);
1966
1924 if (argc != 1) 1967 if (argc != 1)
1925 return KDB_ARGCOUNT; 1968 return KDB_ARGCOUNT;
1969
1926 kdb_trap_printk++; 1970 kdb_trap_printk++;
1927 __handle_sysrq(*argv[1], false); 1971 __handle_sysrq(*argv[1], check_mask);
1928 kdb_trap_printk--; 1972 kdb_trap_printk--;
1929 1973
1930 return 0; 1974 return 0;
@@ -2157,6 +2201,8 @@ static void kdb_cpu_status(void)
2157 for (start_cpu = -1, i = 0; i < NR_CPUS; i++) { 2201 for (start_cpu = -1, i = 0; i < NR_CPUS; i++) {
2158 if (!cpu_online(i)) { 2202 if (!cpu_online(i)) {
2159 state = 'F'; /* cpu is offline */ 2203 state = 'F'; /* cpu is offline */
2204 } else if (!kgdb_info[i].enter_kgdb) {
2205 state = 'D'; /* cpu is online but unresponsive */
2160 } else { 2206 } else {
2161 state = ' '; /* cpu is responding to kdb */ 2207 state = ' '; /* cpu is responding to kdb */
2162 if (kdb_task_state_char(KDB_TSK(i)) == 'I') 2208 if (kdb_task_state_char(KDB_TSK(i)) == 'I')
@@ -2210,7 +2256,7 @@ static int kdb_cpu(int argc, const char **argv)
2210 /* 2256 /*
2211 * Validate cpunum 2257 * Validate cpunum
2212 */ 2258 */
2213 if ((cpunum > NR_CPUS) || !cpu_online(cpunum)) 2259 if ((cpunum > NR_CPUS) || !kgdb_info[cpunum].enter_kgdb)
2214 return KDB_BADCPUNUM; 2260 return KDB_BADCPUNUM;
2215 2261
2216 dbg_switch_cpu = cpunum; 2262 dbg_switch_cpu = cpunum;
@@ -2375,6 +2421,8 @@ static int kdb_help(int argc, const char **argv)
2375 return 0; 2421 return 0;
2376 if (!kt->cmd_name) 2422 if (!kt->cmd_name)
2377 continue; 2423 continue;
2424 if (!kdb_check_flags(kt->cmd_flags, kdb_cmd_enabled, true))
2425 continue;
2378 if (strlen(kt->cmd_usage) > 20) 2426 if (strlen(kt->cmd_usage) > 20)
2379 space = "\n "; 2427 space = "\n ";
2380 kdb_printf("%-15.15s %-20s%s%s\n", kt->cmd_name, 2428 kdb_printf("%-15.15s %-20s%s%s\n", kt->cmd_name,
@@ -2629,7 +2677,7 @@ static int kdb_grep_help(int argc, const char **argv)
2629} 2677}
2630 2678
2631/* 2679/*
2632 * kdb_register_repeat - This function is used to register a kernel 2680 * kdb_register_flags - This function is used to register a kernel
2633 * debugger command. 2681 * debugger command.
2634 * Inputs: 2682 * Inputs:
2635 * cmd Command name 2683 * cmd Command name
@@ -2641,12 +2689,12 @@ static int kdb_grep_help(int argc, const char **argv)
2641 * zero for success, one if a duplicate command. 2689 * zero for success, one if a duplicate command.
2642 */ 2690 */
2643#define kdb_command_extend 50 /* arbitrary */ 2691#define kdb_command_extend 50 /* arbitrary */
2644int kdb_register_repeat(char *cmd, 2692int kdb_register_flags(char *cmd,
2645 kdb_func_t func, 2693 kdb_func_t func,
2646 char *usage, 2694 char *usage,
2647 char *help, 2695 char *help,
2648 short minlen, 2696 short minlen,
2649 kdb_repeat_t repeat) 2697 kdb_cmdflags_t flags)
2650{ 2698{
2651 int i; 2699 int i;
2652 kdbtab_t *kp; 2700 kdbtab_t *kp;
@@ -2694,19 +2742,18 @@ int kdb_register_repeat(char *cmd,
2694 kp->cmd_func = func; 2742 kp->cmd_func = func;
2695 kp->cmd_usage = usage; 2743 kp->cmd_usage = usage;
2696 kp->cmd_help = help; 2744 kp->cmd_help = help;
2697 kp->cmd_flags = 0;
2698 kp->cmd_minlen = minlen; 2745 kp->cmd_minlen = minlen;
2699 kp->cmd_repeat = repeat; 2746 kp->cmd_flags = flags;
2700 2747
2701 return 0; 2748 return 0;
2702} 2749}
2703EXPORT_SYMBOL_GPL(kdb_register_repeat); 2750EXPORT_SYMBOL_GPL(kdb_register_flags);
2704 2751
2705 2752
2706/* 2753/*
2707 * kdb_register - Compatibility register function for commands that do 2754 * kdb_register - Compatibility register function for commands that do
2708 * not need to specify a repeat state. Equivalent to 2755 * not need to specify a repeat state. Equivalent to
2709 * kdb_register_repeat with KDB_REPEAT_NONE. 2756 * kdb_register_flags with flags set to 0.
2710 * Inputs: 2757 * Inputs:
2711 * cmd Command name 2758 * cmd Command name
2712 * func Function to execute the command 2759 * func Function to execute the command
@@ -2721,8 +2768,7 @@ int kdb_register(char *cmd,
2721 char *help, 2768 char *help,
2722 short minlen) 2769 short minlen)
2723{ 2770{
2724 return kdb_register_repeat(cmd, func, usage, help, minlen, 2771 return kdb_register_flags(cmd, func, usage, help, minlen, 0);
2725 KDB_REPEAT_NONE);
2726} 2772}
2727EXPORT_SYMBOL_GPL(kdb_register); 2773EXPORT_SYMBOL_GPL(kdb_register);
2728 2774
@@ -2764,80 +2810,109 @@ static void __init kdb_inittab(void)
2764 for_each_kdbcmd(kp, i) 2810 for_each_kdbcmd(kp, i)
2765 kp->cmd_name = NULL; 2811 kp->cmd_name = NULL;
2766 2812
2767 kdb_register_repeat("md", kdb_md, "<vaddr>", 2813 kdb_register_flags("md", kdb_md, "<vaddr>",
2768 "Display Memory Contents, also mdWcN, e.g. md8c1", 1, 2814 "Display Memory Contents, also mdWcN, e.g. md8c1", 1,
2769 KDB_REPEAT_NO_ARGS); 2815 KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS);
2770 kdb_register_repeat("mdr", kdb_md, "<vaddr> <bytes>", 2816 kdb_register_flags("mdr", kdb_md, "<vaddr> <bytes>",
2771 "Display Raw Memory", 0, KDB_REPEAT_NO_ARGS); 2817 "Display Raw Memory", 0,
2772 kdb_register_repeat("mdp", kdb_md, "<paddr> <bytes>", 2818 KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS);
2773 "Display Physical Memory", 0, KDB_REPEAT_NO_ARGS); 2819 kdb_register_flags("mdp", kdb_md, "<paddr> <bytes>",
2774 kdb_register_repeat("mds", kdb_md, "<vaddr>", 2820 "Display Physical Memory", 0,
2775 "Display Memory Symbolically", 0, KDB_REPEAT_NO_ARGS); 2821 KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS);
2776 kdb_register_repeat("mm", kdb_mm, "<vaddr> <contents>", 2822 kdb_register_flags("mds", kdb_md, "<vaddr>",
2777 "Modify Memory Contents", 0, KDB_REPEAT_NO_ARGS); 2823 "Display Memory Symbolically", 0,
2778 kdb_register_repeat("go", kdb_go, "[<vaddr>]", 2824 KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS);
2779 "Continue Execution", 1, KDB_REPEAT_NONE); 2825 kdb_register_flags("mm", kdb_mm, "<vaddr> <contents>",
2780 kdb_register_repeat("rd", kdb_rd, "", 2826 "Modify Memory Contents", 0,
2781 "Display Registers", 0, KDB_REPEAT_NONE); 2827 KDB_ENABLE_MEM_WRITE | KDB_REPEAT_NO_ARGS);
2782 kdb_register_repeat("rm", kdb_rm, "<reg> <contents>", 2828 kdb_register_flags("go", kdb_go, "[<vaddr>]",
2783 "Modify Registers", 0, KDB_REPEAT_NONE); 2829 "Continue Execution", 1,
2784 kdb_register_repeat("ef", kdb_ef, "<vaddr>", 2830 KDB_ENABLE_REG_WRITE | KDB_ENABLE_ALWAYS_SAFE_NO_ARGS);
2785 "Display exception frame", 0, KDB_REPEAT_NONE); 2831 kdb_register_flags("rd", kdb_rd, "",
2786 kdb_register_repeat("bt", kdb_bt, "[<vaddr>]", 2832 "Display Registers", 0,
2787 "Stack traceback", 1, KDB_REPEAT_NONE); 2833 KDB_ENABLE_REG_READ);
2788 kdb_register_repeat("btp", kdb_bt, "<pid>", 2834 kdb_register_flags("rm", kdb_rm, "<reg> <contents>",
2789 "Display stack for process <pid>", 0, KDB_REPEAT_NONE); 2835 "Modify Registers", 0,
2790 kdb_register_repeat("bta", kdb_bt, "[D|R|S|T|C|Z|E|U|I|M|A]", 2836 KDB_ENABLE_REG_WRITE);
2791 "Backtrace all processes matching state flag", 0, KDB_REPEAT_NONE); 2837 kdb_register_flags("ef", kdb_ef, "<vaddr>",
2792 kdb_register_repeat("btc", kdb_bt, "", 2838 "Display exception frame", 0,
2793 "Backtrace current process on each cpu", 0, KDB_REPEAT_NONE); 2839 KDB_ENABLE_MEM_READ);
2794 kdb_register_repeat("btt", kdb_bt, "<vaddr>", 2840 kdb_register_flags("bt", kdb_bt, "[<vaddr>]",
2841 "Stack traceback", 1,
2842 KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS);
2843 kdb_register_flags("btp", kdb_bt, "<pid>",
2844 "Display stack for process <pid>", 0,
2845 KDB_ENABLE_INSPECT);
2846 kdb_register_flags("bta", kdb_bt, "[D|R|S|T|C|Z|E|U|I|M|A]",
2847 "Backtrace all processes matching state flag", 0,
2848 KDB_ENABLE_INSPECT);
2849 kdb_register_flags("btc", kdb_bt, "",
2850 "Backtrace current process on each cpu", 0,
2851 KDB_ENABLE_INSPECT);
2852 kdb_register_flags("btt", kdb_bt, "<vaddr>",
2795 "Backtrace process given its struct task address", 0, 2853 "Backtrace process given its struct task address", 0,
2796 KDB_REPEAT_NONE); 2854 KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS);
2797 kdb_register_repeat("env", kdb_env, "", 2855 kdb_register_flags("env", kdb_env, "",
2798 "Show environment variables", 0, KDB_REPEAT_NONE); 2856 "Show environment variables", 0,
2799 kdb_register_repeat("set", kdb_set, "", 2857 KDB_ENABLE_ALWAYS_SAFE);
2800 "Set environment variables", 0, KDB_REPEAT_NONE); 2858 kdb_register_flags("set", kdb_set, "",
2801 kdb_register_repeat("help", kdb_help, "", 2859 "Set environment variables", 0,
2802 "Display Help Message", 1, KDB_REPEAT_NONE); 2860 KDB_ENABLE_ALWAYS_SAFE);
2803 kdb_register_repeat("?", kdb_help, "", 2861 kdb_register_flags("help", kdb_help, "",
2804 "Display Help Message", 0, KDB_REPEAT_NONE); 2862 "Display Help Message", 1,
2805 kdb_register_repeat("cpu", kdb_cpu, "<cpunum>", 2863 KDB_ENABLE_ALWAYS_SAFE);
2806 "Switch to new cpu", 0, KDB_REPEAT_NONE); 2864 kdb_register_flags("?", kdb_help, "",
2807 kdb_register_repeat("kgdb", kdb_kgdb, "", 2865 "Display Help Message", 0,
2808 "Enter kgdb mode", 0, KDB_REPEAT_NONE); 2866 KDB_ENABLE_ALWAYS_SAFE);
2809 kdb_register_repeat("ps", kdb_ps, "[<flags>|A]", 2867 kdb_register_flags("cpu", kdb_cpu, "<cpunum>",
2810 "Display active task list", 0, KDB_REPEAT_NONE); 2868 "Switch to new cpu", 0,
2811 kdb_register_repeat("pid", kdb_pid, "<pidnum>", 2869 KDB_ENABLE_ALWAYS_SAFE_NO_ARGS);
2812 "Switch to another task", 0, KDB_REPEAT_NONE); 2870 kdb_register_flags("kgdb", kdb_kgdb, "",
2813 kdb_register_repeat("reboot", kdb_reboot, "", 2871 "Enter kgdb mode", 0, 0);
2814 "Reboot the machine immediately", 0, KDB_REPEAT_NONE); 2872 kdb_register_flags("ps", kdb_ps, "[<flags>|A]",
2873 "Display active task list", 0,
2874 KDB_ENABLE_INSPECT);
2875 kdb_register_flags("pid", kdb_pid, "<pidnum>",
2876 "Switch to another task", 0,
2877 KDB_ENABLE_INSPECT);
2878 kdb_register_flags("reboot", kdb_reboot, "",
2879 "Reboot the machine immediately", 0,
2880 KDB_ENABLE_REBOOT);
2815#if defined(CONFIG_MODULES) 2881#if defined(CONFIG_MODULES)
2816 kdb_register_repeat("lsmod", kdb_lsmod, "", 2882 kdb_register_flags("lsmod", kdb_lsmod, "",
2817 "List loaded kernel modules", 0, KDB_REPEAT_NONE); 2883 "List loaded kernel modules", 0,
2884 KDB_ENABLE_INSPECT);
2818#endif 2885#endif
2819#if defined(CONFIG_MAGIC_SYSRQ) 2886#if defined(CONFIG_MAGIC_SYSRQ)
2820 kdb_register_repeat("sr", kdb_sr, "<key>", 2887 kdb_register_flags("sr", kdb_sr, "<key>",
2821 "Magic SysRq key", 0, KDB_REPEAT_NONE); 2888 "Magic SysRq key", 0,
2889 KDB_ENABLE_ALWAYS_SAFE);
2822#endif 2890#endif
2823#if defined(CONFIG_PRINTK) 2891#if defined(CONFIG_PRINTK)
2824 kdb_register_repeat("dmesg", kdb_dmesg, "[lines]", 2892 kdb_register_flags("dmesg", kdb_dmesg, "[lines]",
2825 "Display syslog buffer", 0, KDB_REPEAT_NONE); 2893 "Display syslog buffer", 0,
2894 KDB_ENABLE_ALWAYS_SAFE);
2826#endif 2895#endif
2827 if (arch_kgdb_ops.enable_nmi) { 2896 if (arch_kgdb_ops.enable_nmi) {
2828 kdb_register_repeat("disable_nmi", kdb_disable_nmi, "", 2897 kdb_register_flags("disable_nmi", kdb_disable_nmi, "",
2829 "Disable NMI entry to KDB", 0, KDB_REPEAT_NONE); 2898 "Disable NMI entry to KDB", 0,
2830 } 2899 KDB_ENABLE_ALWAYS_SAFE);
2831 kdb_register_repeat("defcmd", kdb_defcmd, "name \"usage\" \"help\"", 2900 }
2832 "Define a set of commands, down to endefcmd", 0, KDB_REPEAT_NONE); 2901 kdb_register_flags("defcmd", kdb_defcmd, "name \"usage\" \"help\"",
2833 kdb_register_repeat("kill", kdb_kill, "<-signal> <pid>", 2902 "Define a set of commands, down to endefcmd", 0,
2834 "Send a signal to a process", 0, KDB_REPEAT_NONE); 2903 KDB_ENABLE_ALWAYS_SAFE);
2835 kdb_register_repeat("summary", kdb_summary, "", 2904 kdb_register_flags("kill", kdb_kill, "<-signal> <pid>",
2836 "Summarize the system", 4, KDB_REPEAT_NONE); 2905 "Send a signal to a process", 0,
2837 kdb_register_repeat("per_cpu", kdb_per_cpu, "<sym> [<bytes>] [<cpu>]", 2906 KDB_ENABLE_SIGNAL);
2838 "Display per_cpu variables", 3, KDB_REPEAT_NONE); 2907 kdb_register_flags("summary", kdb_summary, "",
2839 kdb_register_repeat("grephelp", kdb_grep_help, "", 2908 "Summarize the system", 4,
2840 "Display help on | grep", 0, KDB_REPEAT_NONE); 2909 KDB_ENABLE_ALWAYS_SAFE);
2910 kdb_register_flags("per_cpu", kdb_per_cpu, "<sym> [<bytes>] [<cpu>]",
2911 "Display per_cpu variables", 3,
2912 KDB_ENABLE_MEM_READ);
2913 kdb_register_flags("grephelp", kdb_grep_help, "",
2914 "Display help on | grep", 0,
2915 KDB_ENABLE_ALWAYS_SAFE);
2841} 2916}
2842 2917
2843/* Execute any commands defined in kdb_cmds. */ 2918/* Execute any commands defined in kdb_cmds. */
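
The same conversion applies to any code that registers its own kdb commands: kdb_register() still works for commands with no special requirements, while commands that need memory, register, or flow-control access can state that via kdb_register_flags(). A minimal sketch, assuming <linux/kdb.h> for the declarations; the command name and handler are made up for illustration, while the function and flag names are the ones introduced in this diff:

/* Hypothetical read-only inspection command -- illustrative only. */
static int kdb_mycmd(int argc, const char **argv)
{
	kdb_printf("mycmd called with %d argument(s)\n", argc);
	return 0;
}

static void mycmd_init(void)
{
	kdb_register_flags("mycmd", kdb_mycmd, "[<vaddr>]",
			   "Example read-only command", 0,
			   KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS);
}
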
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index 7afd3c8c41d5..eaacd1693954 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -172,10 +172,9 @@ typedef struct _kdbtab {
172 kdb_func_t cmd_func; /* Function to execute command */ 172 kdb_func_t cmd_func; /* Function to execute command */
173 char *cmd_usage; /* Usage String for this command */ 173 char *cmd_usage; /* Usage String for this command */
174 char *cmd_help; /* Help message for this command */ 174 char *cmd_help; /* Help message for this command */
175 short cmd_flags; /* Parsing flags */
176 short cmd_minlen; /* Minimum legal # command 175 short cmd_minlen; /* Minimum legal # command
177 * chars required */ 176 * chars required */
178 kdb_repeat_t cmd_repeat; /* Does command auto repeat on enter? */ 177 kdb_cmdflags_t cmd_flags; /* Command behaviour flags */
179} kdbtab_t; 178} kdbtab_t;
180 179
181extern int kdb_bt(int, const char **); /* KDB display back trace */ 180extern int kdb_bt(int, const char **); /* KDB display back trace */
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 97b67df8fbfe..d659487254d5 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -52,7 +52,7 @@ static void release_callchain_buffers(void)
52 struct callchain_cpus_entries *entries; 52 struct callchain_cpus_entries *entries;
53 53
54 entries = callchain_cpus_entries; 54 entries = callchain_cpus_entries;
55 rcu_assign_pointer(callchain_cpus_entries, NULL); 55 RCU_INIT_POINTER(callchain_cpus_entries, NULL);
56 call_rcu(&entries->rcu_head, release_callchain_buffers_rcu); 56 call_rcu(&entries->rcu_head, release_callchain_buffers_rcu);
57} 57}
58 58
@@ -137,7 +137,7 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx)
137 int cpu; 137 int cpu;
138 struct callchain_cpus_entries *entries; 138 struct callchain_cpus_entries *entries;
139 139
140 *rctx = get_recursion_context(__get_cpu_var(callchain_recursion)); 140 *rctx = get_recursion_context(this_cpu_ptr(callchain_recursion));
141 if (*rctx == -1) 141 if (*rctx == -1)
142 return NULL; 142 return NULL;
143 143
@@ -153,7 +153,7 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx)
153static void 153static void
154put_callchain_entry(int rctx) 154put_callchain_entry(int rctx)
155{ 155{
156 put_recursion_context(__get_cpu_var(callchain_recursion), rctx); 156 put_recursion_context(this_cpu_ptr(callchain_recursion), rctx);
157} 157}
158 158
159struct perf_callchain_entry * 159struct perf_callchain_entry *
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 963bf139e2b2..882f835a0d85 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -47,6 +47,8 @@
47 47
48#include <asm/irq_regs.h> 48#include <asm/irq_regs.h>
49 49
50static struct workqueue_struct *perf_wq;
51
50struct remote_function_call { 52struct remote_function_call {
51 struct task_struct *p; 53 struct task_struct *p;
52 int (*func)(void *info); 54 int (*func)(void *info);
@@ -120,6 +122,13 @@ static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
120 return data.ret; 122 return data.ret;
121} 123}
122 124
125#define EVENT_OWNER_KERNEL ((void *) -1)
126
127static bool is_kernel_event(struct perf_event *event)
128{
129 return event->owner == EVENT_OWNER_KERNEL;
130}
131
123#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ 132#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
124 PERF_FLAG_FD_OUTPUT |\ 133 PERF_FLAG_FD_OUTPUT |\
125 PERF_FLAG_PID_CGROUP |\ 134 PERF_FLAG_PID_CGROUP |\
@@ -240,7 +249,7 @@ static void perf_duration_warn(struct irq_work *w)
240 u64 avg_local_sample_len; 249 u64 avg_local_sample_len;
241 u64 local_samples_len; 250 u64 local_samples_len;
242 251
243 local_samples_len = __get_cpu_var(running_sample_length); 252 local_samples_len = __this_cpu_read(running_sample_length);
244 avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES; 253 avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
245 254
246 printk_ratelimited(KERN_WARNING 255 printk_ratelimited(KERN_WARNING
@@ -262,10 +271,10 @@ void perf_sample_event_took(u64 sample_len_ns)
262 return; 271 return;
263 272
264 /* decay the counter by 1 average sample */ 273 /* decay the counter by 1 average sample */
265 local_samples_len = __get_cpu_var(running_sample_length); 274 local_samples_len = __this_cpu_read(running_sample_length);
266 local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES; 275 local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES;
267 local_samples_len += sample_len_ns; 276 local_samples_len += sample_len_ns;
268 __get_cpu_var(running_sample_length) = local_samples_len; 277 __this_cpu_write(running_sample_length, local_samples_len);
269 278
270 /* 279 /*
 271         * note: this will be biased artificially low until we have     280
@@ -392,14 +401,9 @@ perf_cgroup_match(struct perf_event *event)
392 event->cgrp->css.cgroup); 401 event->cgrp->css.cgroup);
393} 402}
394 403
395static inline void perf_put_cgroup(struct perf_event *event)
396{
397 css_put(&event->cgrp->css);
398}
399
400static inline void perf_detach_cgroup(struct perf_event *event) 404static inline void perf_detach_cgroup(struct perf_event *event)
401{ 405{
402 perf_put_cgroup(event); 406 css_put(&event->cgrp->css);
403 event->cgrp = NULL; 407 event->cgrp = NULL;
404} 408}
405 409
@@ -610,7 +614,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
610 if (!f.file) 614 if (!f.file)
611 return -EBADF; 615 return -EBADF;
612 616
613 css = css_tryget_online_from_dir(f.file->f_dentry, 617 css = css_tryget_online_from_dir(f.file->f_path.dentry,
614 &perf_event_cgrp_subsys); 618 &perf_event_cgrp_subsys);
615 if (IS_ERR(css)) { 619 if (IS_ERR(css)) {
616 ret = PTR_ERR(css); 620 ret = PTR_ERR(css);
@@ -878,7 +882,7 @@ static DEFINE_PER_CPU(struct list_head, rotation_list);
878static void perf_pmu_rotate_start(struct pmu *pmu) 882static void perf_pmu_rotate_start(struct pmu *pmu)
879{ 883{
880 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); 884 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
881 struct list_head *head = &__get_cpu_var(rotation_list); 885 struct list_head *head = this_cpu_ptr(&rotation_list);
882 886
883 WARN_ON(!irqs_disabled()); 887 WARN_ON(!irqs_disabled());
884 888
@@ -902,13 +906,23 @@ static void put_ctx(struct perf_event_context *ctx)
902 } 906 }
903} 907}
904 908
905static void unclone_ctx(struct perf_event_context *ctx) 909/*
910 * This must be done under the ctx->lock, such as to serialize against
911 * context_equiv(), therefore we cannot call put_ctx() since that might end up
912 * calling scheduler related locks and ctx->lock nests inside those.
913 */
914static __must_check struct perf_event_context *
915unclone_ctx(struct perf_event_context *ctx)
906{ 916{
907 if (ctx->parent_ctx) { 917 struct perf_event_context *parent_ctx = ctx->parent_ctx;
908 put_ctx(ctx->parent_ctx); 918
919 lockdep_assert_held(&ctx->lock);
920
921 if (parent_ctx)
909 ctx->parent_ctx = NULL; 922 ctx->parent_ctx = NULL;
910 }
911 ctx->generation++; 923 ctx->generation++;
924
925 return parent_ctx;
912} 926}
913 927
914static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) 928static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
@@ -1375,6 +1389,45 @@ out:
1375 perf_event__header_size(tmp); 1389 perf_event__header_size(tmp);
1376} 1390}
1377 1391
1392/*
1393 * User event without the task.
1394 */
1395static bool is_orphaned_event(struct perf_event *event)
1396{
1397 return event && !is_kernel_event(event) && !event->owner;
1398}
1399
1400/*
1401 * Event has a parent but parent's task finished and it's
1402 * alive only because of children holding refference.
1403 */
1404static bool is_orphaned_child(struct perf_event *event)
1405{
1406 return is_orphaned_event(event->parent);
1407}
1408
1409static void orphans_remove_work(struct work_struct *work);
1410
1411static void schedule_orphans_remove(struct perf_event_context *ctx)
1412{
1413 if (!ctx->task || ctx->orphans_remove_sched || !perf_wq)
1414 return;
1415
1416 if (queue_delayed_work(perf_wq, &ctx->orphans_remove, 1)) {
1417 get_ctx(ctx);
1418 ctx->orphans_remove_sched = true;
1419 }
1420}
1421
1422static int __init perf_workqueue_init(void)
1423{
1424 perf_wq = create_singlethread_workqueue("perf");
1425 WARN(!perf_wq, "failed to create perf workqueue\n");
1426 return perf_wq ? 0 : -1;
1427}
1428
1429core_initcall(perf_workqueue_init);
1430
1378static inline int 1431static inline int
1379event_filter_match(struct perf_event *event) 1432event_filter_match(struct perf_event *event)
1380{ 1433{
@@ -1424,6 +1477,9 @@ event_sched_out(struct perf_event *event,
1424 if (event->attr.exclusive || !cpuctx->active_oncpu) 1477 if (event->attr.exclusive || !cpuctx->active_oncpu)
1425 cpuctx->exclusive = 0; 1478 cpuctx->exclusive = 0;
1426 1479
1480 if (is_orphaned_child(event))
1481 schedule_orphans_remove(ctx);
1482
1427 perf_pmu_enable(event->pmu); 1483 perf_pmu_enable(event->pmu);
1428} 1484}
1429 1485
@@ -1506,8 +1562,10 @@ static void perf_remove_from_context(struct perf_event *event, bool detach_group
1506 1562
1507 if (!task) { 1563 if (!task) {
1508 /* 1564 /*
1509 * Per cpu events are removed via an smp call and 1565 * Per cpu events are removed via an smp call. The removal can
1510 * the removal is always successful. 1566 * fail if the CPU is currently offline, but in that case we
1567 * already called __perf_remove_from_context from
1568 * perf_event_exit_cpu.
1511 */ 1569 */
1512 cpu_function_call(event->cpu, __perf_remove_from_context, &re); 1570 cpu_function_call(event->cpu, __perf_remove_from_context, &re);
1513 return; 1571 return;
@@ -1731,6 +1789,9 @@ event_sched_in(struct perf_event *event,
1731 if (event->attr.exclusive) 1789 if (event->attr.exclusive)
1732 cpuctx->exclusive = 1; 1790 cpuctx->exclusive = 1;
1733 1791
1792 if (is_orphaned_child(event))
1793 schedule_orphans_remove(ctx);
1794
1734out: 1795out:
1735 perf_pmu_enable(event->pmu); 1796 perf_pmu_enable(event->pmu);
1736 1797
@@ -2210,6 +2271,9 @@ static void ctx_sched_out(struct perf_event_context *ctx,
2210static int context_equiv(struct perf_event_context *ctx1, 2271static int context_equiv(struct perf_event_context *ctx1,
2211 struct perf_event_context *ctx2) 2272 struct perf_event_context *ctx2)
2212{ 2273{
2274 lockdep_assert_held(&ctx1->lock);
2275 lockdep_assert_held(&ctx2->lock);
2276
2213 /* Pinning disables the swap optimization */ 2277 /* Pinning disables the swap optimization */
2214 if (ctx1->pin_count || ctx2->pin_count) 2278 if (ctx1->pin_count || ctx2->pin_count)
2215 return 0; 2279 return 0;
@@ -2331,7 +2395,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
2331 next_parent = rcu_dereference(next_ctx->parent_ctx); 2395 next_parent = rcu_dereference(next_ctx->parent_ctx);
2332 2396
2333        /* If neither context has a parent context, they cannot be clones. */    2397
2334 if (!parent || !next_parent) 2398 if (!parent && !next_parent)
2335 goto unlock; 2399 goto unlock;
2336 2400
2337 if (next_parent == ctx || next_ctx == parent || next_parent == parent) { 2401 if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
@@ -2400,7 +2464,7 @@ void __perf_event_task_sched_out(struct task_struct *task,
2400 * to check if we have to switch out PMU state. 2464 * to check if we have to switch out PMU state.
2401 * cgroup event are system-wide mode only 2465 * cgroup event are system-wide mode only
2402 */ 2466 */
2403 if (atomic_read(&__get_cpu_var(perf_cgroup_events))) 2467 if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
2404 perf_cgroup_sched_out(task, next); 2468 perf_cgroup_sched_out(task, next);
2405} 2469}
2406 2470
@@ -2643,11 +2707,11 @@ void __perf_event_task_sched_in(struct task_struct *prev,
2643 * to check if we have to switch in PMU state. 2707 * to check if we have to switch in PMU state.
2644 * cgroup event are system-wide mode only 2708 * cgroup event are system-wide mode only
2645 */ 2709 */
2646 if (atomic_read(&__get_cpu_var(perf_cgroup_events))) 2710 if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
2647 perf_cgroup_sched_in(prev, task); 2711 perf_cgroup_sched_in(prev, task);
2648 2712
2649 /* check for system-wide branch_stack events */ 2713 /* check for system-wide branch_stack events */
2650 if (atomic_read(&__get_cpu_var(perf_branch_stack_events))) 2714 if (atomic_read(this_cpu_ptr(&perf_branch_stack_events)))
2651 perf_branch_stack_sched_in(prev, task); 2715 perf_branch_stack_sched_in(prev, task);
2652} 2716}
2653 2717
@@ -2902,7 +2966,7 @@ bool perf_event_can_stop_tick(void)
2902 2966
2903void perf_event_task_tick(void) 2967void perf_event_task_tick(void)
2904{ 2968{
2905 struct list_head *head = &__get_cpu_var(rotation_list); 2969 struct list_head *head = this_cpu_ptr(&rotation_list);
2906 struct perf_cpu_context *cpuctx, *tmp; 2970 struct perf_cpu_context *cpuctx, *tmp;
2907 struct perf_event_context *ctx; 2971 struct perf_event_context *ctx;
2908 int throttled; 2972 int throttled;
@@ -2943,6 +3007,7 @@ static int event_enable_on_exec(struct perf_event *event,
2943 */ 3007 */
2944static void perf_event_enable_on_exec(struct perf_event_context *ctx) 3008static void perf_event_enable_on_exec(struct perf_event_context *ctx)
2945{ 3009{
3010 struct perf_event_context *clone_ctx = NULL;
2946 struct perf_event *event; 3011 struct perf_event *event;
2947 unsigned long flags; 3012 unsigned long flags;
2948 int enabled = 0; 3013 int enabled = 0;
@@ -2974,7 +3039,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
2974 * Unclone this context if we enabled any event. 3039 * Unclone this context if we enabled any event.
2975 */ 3040 */
2976 if (enabled) 3041 if (enabled)
2977 unclone_ctx(ctx); 3042 clone_ctx = unclone_ctx(ctx);
2978 3043
2979 raw_spin_unlock(&ctx->lock); 3044 raw_spin_unlock(&ctx->lock);
2980 3045
@@ -2984,6 +3049,9 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
2984 perf_event_context_sched_in(ctx, ctx->task); 3049 perf_event_context_sched_in(ctx, ctx->task);
2985out: 3050out:
2986 local_irq_restore(flags); 3051 local_irq_restore(flags);
3052
3053 if (clone_ctx)
3054 put_ctx(clone_ctx);
2987} 3055}
2988 3056
2989void perf_event_exec(void) 3057void perf_event_exec(void)
@@ -3078,6 +3146,7 @@ static void __perf_event_init_context(struct perf_event_context *ctx)
3078 INIT_LIST_HEAD(&ctx->flexible_groups); 3146 INIT_LIST_HEAD(&ctx->flexible_groups);
3079 INIT_LIST_HEAD(&ctx->event_list); 3147 INIT_LIST_HEAD(&ctx->event_list);
3080 atomic_set(&ctx->refcount, 1); 3148 atomic_set(&ctx->refcount, 1);
3149 INIT_DELAYED_WORK(&ctx->orphans_remove, orphans_remove_work);
3081} 3150}
3082 3151
3083static struct perf_event_context * 3152static struct perf_event_context *
@@ -3135,7 +3204,7 @@ errout:
3135static struct perf_event_context * 3204static struct perf_event_context *
3136find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) 3205find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
3137{ 3206{
3138 struct perf_event_context *ctx; 3207 struct perf_event_context *ctx, *clone_ctx = NULL;
3139 struct perf_cpu_context *cpuctx; 3208 struct perf_cpu_context *cpuctx;
3140 unsigned long flags; 3209 unsigned long flags;
3141 int ctxn, err; 3210 int ctxn, err;
@@ -3169,9 +3238,12 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
3169retry: 3238retry:
3170 ctx = perf_lock_task_context(task, ctxn, &flags); 3239 ctx = perf_lock_task_context(task, ctxn, &flags);
3171 if (ctx) { 3240 if (ctx) {
3172 unclone_ctx(ctx); 3241 clone_ctx = unclone_ctx(ctx);
3173 ++ctx->pin_count; 3242 ++ctx->pin_count;
3174 raw_spin_unlock_irqrestore(&ctx->lock, flags); 3243 raw_spin_unlock_irqrestore(&ctx->lock, flags);
3244
3245 if (clone_ctx)
3246 put_ctx(clone_ctx);
3175 } else { 3247 } else {
3176 ctx = alloc_perf_context(pmu, task); 3248 ctx = alloc_perf_context(pmu, task);
3177 err = -ENOMEM; 3249 err = -ENOMEM;
@@ -3323,16 +3395,12 @@ static void free_event(struct perf_event *event)
3323} 3395}
3324 3396
3325/* 3397/*
3326 * Called when the last reference to the file is gone. 3398 * Remove user event from the owner task.
3327 */ 3399 */
3328static void put_event(struct perf_event *event) 3400static void perf_remove_from_owner(struct perf_event *event)
3329{ 3401{
3330 struct perf_event_context *ctx = event->ctx;
3331 struct task_struct *owner; 3402 struct task_struct *owner;
3332 3403
3333 if (!atomic_long_dec_and_test(&event->refcount))
3334 return;
3335
3336 rcu_read_lock(); 3404 rcu_read_lock();
3337 owner = ACCESS_ONCE(event->owner); 3405 owner = ACCESS_ONCE(event->owner);
3338 /* 3406 /*
@@ -3365,6 +3433,20 @@ static void put_event(struct perf_event *event)
3365 mutex_unlock(&owner->perf_event_mutex); 3433 mutex_unlock(&owner->perf_event_mutex);
3366 put_task_struct(owner); 3434 put_task_struct(owner);
3367 } 3435 }
3436}
3437
3438/*
3439 * Called when the last reference to the file is gone.
3440 */
3441static void put_event(struct perf_event *event)
3442{
3443 struct perf_event_context *ctx = event->ctx;
3444
3445 if (!atomic_long_dec_and_test(&event->refcount))
3446 return;
3447
3448 if (!is_kernel_event(event))
3449 perf_remove_from_owner(event);
3368 3450
3369 WARN_ON_ONCE(ctx->parent_ctx); 3451 WARN_ON_ONCE(ctx->parent_ctx);
3370 /* 3452 /*
@@ -3399,6 +3481,42 @@ static int perf_release(struct inode *inode, struct file *file)
3399 return 0; 3481 return 0;
3400} 3482}
3401 3483
3484/*
                                                                            3485 * Remove all orphaned events from the context.
3486 */
3487static void orphans_remove_work(struct work_struct *work)
3488{
3489 struct perf_event_context *ctx;
3490 struct perf_event *event, *tmp;
3491
3492 ctx = container_of(work, struct perf_event_context,
3493 orphans_remove.work);
3494
3495 mutex_lock(&ctx->mutex);
3496 list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) {
3497 struct perf_event *parent_event = event->parent;
3498
3499 if (!is_orphaned_child(event))
3500 continue;
3501
3502 perf_remove_from_context(event, true);
3503
3504 mutex_lock(&parent_event->child_mutex);
3505 list_del_init(&event->child_list);
3506 mutex_unlock(&parent_event->child_mutex);
3507
3508 free_event(event);
3509 put_event(parent_event);
3510 }
3511
3512 raw_spin_lock_irq(&ctx->lock);
3513 ctx->orphans_remove_sched = false;
3514 raw_spin_unlock_irq(&ctx->lock);
3515 mutex_unlock(&ctx->mutex);
3516
3517 put_ctx(ctx);
3518}
3519
3402u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) 3520u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
3403{ 3521{
3404 struct perf_event *child; 3522 struct perf_event *child;
@@ -3496,6 +3614,19 @@ static int perf_event_read_one(struct perf_event *event,
3496 return n * sizeof(u64); 3614 return n * sizeof(u64);
3497} 3615}
3498 3616
3617static bool is_event_hup(struct perf_event *event)
3618{
3619 bool no_children;
3620
3621 if (event->state != PERF_EVENT_STATE_EXIT)
3622 return false;
3623
3624 mutex_lock(&event->child_mutex);
3625 no_children = list_empty(&event->child_list);
3626 mutex_unlock(&event->child_mutex);
3627 return no_children;
3628}
3629
3499/* 3630/*
3500 * Read the performance event - simple non blocking version for now 3631 * Read the performance event - simple non blocking version for now
3501 */ 3632 */
@@ -3537,7 +3668,12 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
3537{ 3668{
3538 struct perf_event *event = file->private_data; 3669 struct perf_event *event = file->private_data;
3539 struct ring_buffer *rb; 3670 struct ring_buffer *rb;
3540 unsigned int events = POLL_HUP; 3671 unsigned int events = POLLHUP;
3672
3673 poll_wait(file, &event->waitq, wait);
3674
3675 if (is_event_hup(event))
3676 return events;
3541 3677
3542 /* 3678 /*
3543 * Pin the event->rb by taking event->mmap_mutex; otherwise 3679 * Pin the event->rb by taking event->mmap_mutex; otherwise
@@ -3548,9 +3684,6 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
3548 if (rb) 3684 if (rb)
3549 events = atomic_xchg(&rb->poll, 0); 3685 events = atomic_xchg(&rb->poll, 0);
3550 mutex_unlock(&event->mmap_mutex); 3686 mutex_unlock(&event->mmap_mutex);
3551
3552 poll_wait(file, &event->waitq, wait);
3553
3554 return events; 3687 return events;
3555} 3688}
3556 3689
@@ -4327,22 +4460,29 @@ perf_output_sample_regs(struct perf_output_handle *handle,
4327 } 4460 }
4328} 4461}
4329 4462
4330static void perf_sample_regs_user(struct perf_regs_user *regs_user, 4463static void perf_sample_regs_user(struct perf_regs *regs_user,
4331 struct pt_regs *regs) 4464 struct pt_regs *regs,
4465 struct pt_regs *regs_user_copy)
4332{ 4466{
4333 if (!user_mode(regs)) { 4467 if (user_mode(regs)) {
4334 if (current->mm) 4468 regs_user->abi = perf_reg_abi(current);
4335 regs = task_pt_regs(current);
4336 else
4337 regs = NULL;
4338 }
4339
4340 if (regs) {
4341 regs_user->regs = regs; 4469 regs_user->regs = regs;
4342 regs_user->abi = perf_reg_abi(current); 4470 } else if (current->mm) {
4471 perf_get_regs_user(regs_user, regs, regs_user_copy);
4472 } else {
4473 regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
4474 regs_user->regs = NULL;
4343 } 4475 }
4344} 4476}
4345 4477
4478static void perf_sample_regs_intr(struct perf_regs *regs_intr,
4479 struct pt_regs *regs)
4480{
4481 regs_intr->regs = regs;
4482 regs_intr->abi = perf_reg_abi(current);
4483}
4484
4485
4346/* 4486/*
4347 * Get remaining task size from user stack pointer. 4487 * Get remaining task size from user stack pointer.
4348 * 4488 *
@@ -4724,6 +4864,23 @@ void perf_output_sample(struct perf_output_handle *handle,
4724 if (sample_type & PERF_SAMPLE_TRANSACTION) 4864 if (sample_type & PERF_SAMPLE_TRANSACTION)
4725 perf_output_put(handle, data->txn); 4865 perf_output_put(handle, data->txn);
4726 4866
4867 if (sample_type & PERF_SAMPLE_REGS_INTR) {
4868 u64 abi = data->regs_intr.abi;
4869 /*
4870 * If there are no regs to dump, notice it through
4871 * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
4872 */
4873 perf_output_put(handle, abi);
4874
4875 if (abi) {
4876 u64 mask = event->attr.sample_regs_intr;
4877
4878 perf_output_sample_regs(handle,
4879 data->regs_intr.regs,
4880 mask);
4881 }
4882 }
4883
4727 if (!event->attr.watermark) { 4884 if (!event->attr.watermark) {
4728 int wakeup_events = event->attr.wakeup_events; 4885 int wakeup_events = event->attr.wakeup_events;
4729 4886
@@ -4789,12 +4946,14 @@ void perf_prepare_sample(struct perf_event_header *header,
4789 header->size += size; 4946 header->size += size;
4790 } 4947 }
4791 4948
4949 if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
4950 perf_sample_regs_user(&data->regs_user, regs,
4951 &data->regs_user_copy);
4952
4792 if (sample_type & PERF_SAMPLE_REGS_USER) { 4953 if (sample_type & PERF_SAMPLE_REGS_USER) {
4793 /* regs dump ABI info */ 4954 /* regs dump ABI info */
4794 int size = sizeof(u64); 4955 int size = sizeof(u64);
4795 4956
4796 perf_sample_regs_user(&data->regs_user, regs);
4797
4798 if (data->regs_user.regs) { 4957 if (data->regs_user.regs) {
4799 u64 mask = event->attr.sample_regs_user; 4958 u64 mask = event->attr.sample_regs_user;
4800 size += hweight64(mask) * sizeof(u64); 4959 size += hweight64(mask) * sizeof(u64);
@@ -4810,15 +4969,11 @@ void perf_prepare_sample(struct perf_event_header *header,
4810 * in case new sample type is added, because we could eat 4969 * in case new sample type is added, because we could eat
4811 * up the rest of the sample size. 4970 * up the rest of the sample size.
4812 */ 4971 */
4813 struct perf_regs_user *uregs = &data->regs_user;
4814 u16 stack_size = event->attr.sample_stack_user; 4972 u16 stack_size = event->attr.sample_stack_user;
4815 u16 size = sizeof(u64); 4973 u16 size = sizeof(u64);
4816 4974
4817 if (!uregs->abi)
4818 perf_sample_regs_user(uregs, regs);
4819
4820 stack_size = perf_sample_ustack_size(stack_size, header->size, 4975 stack_size = perf_sample_ustack_size(stack_size, header->size,
4821 uregs->regs); 4976 data->regs_user.regs);
4822 4977
4823 /* 4978 /*
4824 * If there is something to dump, add space for the dump 4979 * If there is something to dump, add space for the dump
@@ -4831,6 +4986,21 @@ void perf_prepare_sample(struct perf_event_header *header,
4831 data->stack_user_size = stack_size; 4986 data->stack_user_size = stack_size;
4832 header->size += size; 4987 header->size += size;
4833 } 4988 }
4989
4990 if (sample_type & PERF_SAMPLE_REGS_INTR) {
4991 /* regs dump ABI info */
4992 int size = sizeof(u64);
4993
4994 perf_sample_regs_intr(&data->regs_intr, regs);
4995
4996 if (data->regs_intr.regs) {
4997 u64 mask = event->attr.sample_regs_intr;
4998
4999 size += hweight64(mask) * sizeof(u64);
5000 }
5001
5002 header->size += size;
5003 }
4834} 5004}
4835 5005
4836static void perf_event_output(struct perf_event *event, 5006static void perf_event_output(struct perf_event *event,
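
The size bookkeeping for the new PERF_SAMPLE_REGS_INTR record mirrors the existing REGS_USER case: one u64 for the ABI word, plus one u64 per register selected in attr.sample_regs_intr when an ABI was actually reported. A small stand-alone sketch of that arithmetic (not from the patch; hweight64() replaced by the compiler's popcount builtin):

#include <stdint.h>
#include <stdio.h>

/* Bytes a sample grows by for PERF_SAMPLE_REGS_INTR: the ABI word is always
 * written; the register values follow only when an ABI was reported. */
static uint64_t regs_intr_size(uint64_t sample_regs_intr, int abi_reported)
{
	uint64_t size = sizeof(uint64_t);		/* abi word */

	if (abi_reported)
		size += (uint64_t)__builtin_popcountll(sample_regs_intr) *
			sizeof(uint64_t);		/* one u64 per register */
	return size;
}

int main(void)
{
	/* Four registers requested: 8 + 4 * 8 = 40 bytes. */
	printf("%llu\n", (unsigned long long)regs_intr_size(0xf, 1));
	return 0;
}
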
@@ -5702,7 +5872,7 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
5702 struct perf_sample_data *data, 5872 struct perf_sample_data *data,
5703 struct pt_regs *regs) 5873 struct pt_regs *regs)
5704{ 5874{
5705 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); 5875 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
5706 struct perf_event *event; 5876 struct perf_event *event;
5707 struct hlist_head *head; 5877 struct hlist_head *head;
5708 5878
@@ -5721,7 +5891,7 @@ end:
5721 5891
5722int perf_swevent_get_recursion_context(void) 5892int perf_swevent_get_recursion_context(void)
5723{ 5893{
5724 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); 5894 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
5725 5895
5726 return get_recursion_context(swhash->recursion); 5896 return get_recursion_context(swhash->recursion);
5727} 5897}
@@ -5729,7 +5899,7 @@ EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
5729 5899
5730inline void perf_swevent_put_recursion_context(int rctx) 5900inline void perf_swevent_put_recursion_context(int rctx)
5731{ 5901{
5732 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); 5902 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
5733 5903
5734 put_recursion_context(swhash->recursion, rctx); 5904 put_recursion_context(swhash->recursion, rctx);
5735} 5905}
@@ -5758,7 +5928,7 @@ static void perf_swevent_read(struct perf_event *event)
5758 5928
5759static int perf_swevent_add(struct perf_event *event, int flags) 5929static int perf_swevent_add(struct perf_event *event, int flags)
5760{ 5930{
5761 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); 5931 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
5762 struct hw_perf_event *hwc = &event->hw; 5932 struct hw_perf_event *hwc = &event->hw;
5763 struct hlist_head *head; 5933 struct hlist_head *head;
5764 5934
@@ -5814,7 +5984,7 @@ static void swevent_hlist_release(struct swevent_htable *swhash)
5814 if (!hlist) 5984 if (!hlist)
5815 return; 5985 return;
5816 5986
5817 rcu_assign_pointer(swhash->swevent_hlist, NULL); 5987 RCU_INIT_POINTER(swhash->swevent_hlist, NULL);
5818 kfree_rcu(hlist, rcu_head); 5988 kfree_rcu(hlist, rcu_head);
5819} 5989}
5820 5990
@@ -5940,11 +6110,6 @@ static int perf_swevent_init(struct perf_event *event)
5940 return 0; 6110 return 0;
5941} 6111}
5942 6112
5943static int perf_swevent_event_idx(struct perf_event *event)
5944{
5945 return 0;
5946}
5947
5948static struct pmu perf_swevent = { 6113static struct pmu perf_swevent = {
5949 .task_ctx_nr = perf_sw_context, 6114 .task_ctx_nr = perf_sw_context,
5950 6115
@@ -5954,8 +6119,6 @@ static struct pmu perf_swevent = {
5954 .start = perf_swevent_start, 6119 .start = perf_swevent_start,
5955 .stop = perf_swevent_stop, 6120 .stop = perf_swevent_stop,
5956 .read = perf_swevent_read, 6121 .read = perf_swevent_read,
5957
5958 .event_idx = perf_swevent_event_idx,
5959}; 6122};
5960 6123
5961#ifdef CONFIG_EVENT_TRACING 6124#ifdef CONFIG_EVENT_TRACING
@@ -6073,8 +6236,6 @@ static struct pmu perf_tracepoint = {
6073 .start = perf_swevent_start, 6236 .start = perf_swevent_start,
6074 .stop = perf_swevent_stop, 6237 .stop = perf_swevent_stop,
6075 .read = perf_swevent_read, 6238 .read = perf_swevent_read,
6076
6077 .event_idx = perf_swevent_event_idx,
6078}; 6239};
6079 6240
6080static inline void perf_tp_register(void) 6241static inline void perf_tp_register(void)
@@ -6300,8 +6461,6 @@ static struct pmu perf_cpu_clock = {
6300 .start = cpu_clock_event_start, 6461 .start = cpu_clock_event_start,
6301 .stop = cpu_clock_event_stop, 6462 .stop = cpu_clock_event_stop,
6302 .read = cpu_clock_event_read, 6463 .read = cpu_clock_event_read,
6303
6304 .event_idx = perf_swevent_event_idx,
6305}; 6464};
6306 6465
6307/* 6466/*
@@ -6380,8 +6539,6 @@ static struct pmu perf_task_clock = {
6380 .start = task_clock_event_start, 6539 .start = task_clock_event_start,
6381 .stop = task_clock_event_stop, 6540 .stop = task_clock_event_stop,
6382 .read = task_clock_event_read, 6541 .read = task_clock_event_read,
6383
6384 .event_idx = perf_swevent_event_idx,
6385}; 6542};
6386 6543
6387static void perf_pmu_nop_void(struct pmu *pmu) 6544static void perf_pmu_nop_void(struct pmu *pmu)
@@ -6411,7 +6568,7 @@ static void perf_pmu_cancel_txn(struct pmu *pmu)
6411 6568
6412static int perf_event_idx_default(struct perf_event *event) 6569static int perf_event_idx_default(struct perf_event *event)
6413{ 6570{
6414 return event->hw.idx + 1; 6571 return 0;
6415} 6572}
6416 6573
6417/* 6574/*
@@ -7031,6 +7188,8 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
7031 ret = -EINVAL; 7188 ret = -EINVAL;
7032 } 7189 }
7033 7190
7191 if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
7192 ret = perf_reg_validate(attr->sample_regs_intr);
7034out: 7193out:
7035 return ret; 7194 return ret;
7036 7195
@@ -7315,11 +7474,11 @@ SYSCALL_DEFINE5(perf_event_open,
7315 7474
7316 if (move_group) { 7475 if (move_group) {
7317 synchronize_rcu(); 7476 synchronize_rcu();
7318 perf_install_in_context(ctx, group_leader, event->cpu); 7477 perf_install_in_context(ctx, group_leader, group_leader->cpu);
7319 get_ctx(ctx); 7478 get_ctx(ctx);
7320 list_for_each_entry(sibling, &group_leader->sibling_list, 7479 list_for_each_entry(sibling, &group_leader->sibling_list,
7321 group_entry) { 7480 group_entry) {
7322 perf_install_in_context(ctx, sibling, event->cpu); 7481 perf_install_in_context(ctx, sibling, sibling->cpu);
7323 get_ctx(ctx); 7482 get_ctx(ctx);
7324 } 7483 }
7325 } 7484 }
@@ -7397,6 +7556,9 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
7397 goto err; 7556 goto err;
7398 } 7557 }
7399 7558
7559 /* Mark owner so we could distinguish it from user events. */
7560 event->owner = EVENT_OWNER_KERNEL;
7561
7400 account_event(event); 7562 account_event(event);
7401 7563
7402 ctx = find_get_context(event->pmu, task, cpu); 7564 ctx = find_get_context(event->pmu, task, cpu);
@@ -7484,6 +7646,12 @@ static void sync_child_event(struct perf_event *child_event,
7484 mutex_unlock(&parent_event->child_mutex); 7646 mutex_unlock(&parent_event->child_mutex);
7485 7647
7486 /* 7648 /*
                                                                            7649         * Make sure user/parent gets notified that we just
7650 * lost one event.
7651 */
7652 perf_event_wakeup(parent_event);
7653
7654 /*
7487 * Release the parent event, if this was the last 7655 * Release the parent event, if this was the last
7488 * reference to it. 7656 * reference to it.
7489 */ 7657 */
@@ -7517,13 +7685,16 @@ __perf_event_exit_task(struct perf_event *child_event,
7517 if (child_event->parent) { 7685 if (child_event->parent) {
7518 sync_child_event(child_event, child); 7686 sync_child_event(child_event, child);
7519 free_event(child_event); 7687 free_event(child_event);
7688 } else {
7689 child_event->state = PERF_EVENT_STATE_EXIT;
7690 perf_event_wakeup(child_event);
7520 } 7691 }
7521} 7692}
7522 7693
7523static void perf_event_exit_task_context(struct task_struct *child, int ctxn) 7694static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
7524{ 7695{
7525 struct perf_event *child_event, *next; 7696 struct perf_event *child_event, *next;
7526 struct perf_event_context *child_ctx, *parent_ctx; 7697 struct perf_event_context *child_ctx, *clone_ctx = NULL;
7527 unsigned long flags; 7698 unsigned long flags;
7528 7699
7529 if (likely(!child->perf_event_ctxp[ctxn])) { 7700 if (likely(!child->perf_event_ctxp[ctxn])) {
@@ -7550,28 +7721,16 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
7550 child->perf_event_ctxp[ctxn] = NULL; 7721 child->perf_event_ctxp[ctxn] = NULL;
7551 7722
7552 /* 7723 /*
7553 * In order to avoid freeing: child_ctx->parent_ctx->task
7554 * under perf_event_context::lock, grab another reference.
7555 */
7556 parent_ctx = child_ctx->parent_ctx;
7557 if (parent_ctx)
7558 get_ctx(parent_ctx);
7559
7560 /*
7561 * If this context is a clone; unclone it so it can't get 7724 * If this context is a clone; unclone it so it can't get
7562 * swapped to another process while we're removing all 7725 * swapped to another process while we're removing all
7563 * the events from it. 7726 * the events from it.
7564 */ 7727 */
7565 unclone_ctx(child_ctx); 7728 clone_ctx = unclone_ctx(child_ctx);
7566 update_context_time(child_ctx); 7729 update_context_time(child_ctx);
7567 raw_spin_unlock_irqrestore(&child_ctx->lock, flags); 7730 raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
7568 7731
7569 /* 7732 if (clone_ctx)
7570 * Now that we no longer hold perf_event_context::lock, drop 7733 put_ctx(clone_ctx);
7571 * our extra child_ctx->parent_ctx reference.
7572 */
7573 if (parent_ctx)
7574 put_ctx(parent_ctx);
7575 7734
7576 /* 7735 /*
7577 * Report the task dead after unscheduling the events so that we 7736 * Report the task dead after unscheduling the events so that we
@@ -7700,6 +7859,7 @@ inherit_event(struct perf_event *parent_event,
7700 struct perf_event *group_leader, 7859 struct perf_event *group_leader,
7701 struct perf_event_context *child_ctx) 7860 struct perf_event_context *child_ctx)
7702{ 7861{
7862 enum perf_event_active_state parent_state = parent_event->state;
7703 struct perf_event *child_event; 7863 struct perf_event *child_event;
7704 unsigned long flags; 7864 unsigned long flags;
7705 7865
@@ -7720,7 +7880,8 @@ inherit_event(struct perf_event *parent_event,
7720 if (IS_ERR(child_event)) 7880 if (IS_ERR(child_event))
7721 return child_event; 7881 return child_event;
7722 7882
7723 if (!atomic_long_inc_not_zero(&parent_event->refcount)) { 7883 if (is_orphaned_event(parent_event) ||
7884 !atomic_long_inc_not_zero(&parent_event->refcount)) {
7724 free_event(child_event); 7885 free_event(child_event);
7725 return NULL; 7886 return NULL;
7726 } 7887 }
@@ -7732,7 +7893,7 @@ inherit_event(struct perf_event *parent_event,
7732 * not its attr.disabled bit. We hold the parent's mutex, 7893 * not its attr.disabled bit. We hold the parent's mutex,
7733 * so we won't race with perf_event_{en, dis}able_family. 7894 * so we won't race with perf_event_{en, dis}able_family.
7734 */ 7895 */
7735 if (parent_event->state >= PERF_EVENT_STATE_INACTIVE) 7896 if (parent_state >= PERF_EVENT_STATE_INACTIVE)
7736 child_event->state = PERF_EVENT_STATE_INACTIVE; 7897 child_event->state = PERF_EVENT_STATE_INACTIVE;
7737 else 7898 else
7738 child_event->state = PERF_EVENT_STATE_OFF; 7899 child_event->state = PERF_EVENT_STATE_OFF;
@@ -7997,7 +8158,7 @@ static void perf_pmu_rotate_stop(struct pmu *pmu)
7997 8158
7998static void __perf_event_exit_context(void *__info) 8159static void __perf_event_exit_context(void *__info)
7999{ 8160{
8000 struct remove_event re = { .detach_group = false }; 8161 struct remove_event re = { .detach_group = true };
8001 struct perf_event_context *ctx = __info; 8162 struct perf_event_context *ctx = __info;
8002 8163
8003 perf_pmu_rotate_stop(ctx->pmu); 8164 perf_pmu_rotate_stop(ctx->pmu);
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 1559fb0b9296..9803a6600d49 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -605,11 +605,6 @@ static void hw_breakpoint_stop(struct perf_event *bp, int flags)
605 bp->hw.state = PERF_HES_STOPPED; 605 bp->hw.state = PERF_HES_STOPPED;
606} 606}
607 607
608static int hw_breakpoint_event_idx(struct perf_event *bp)
609{
610 return 0;
611}
612
613static struct pmu perf_breakpoint = { 608static struct pmu perf_breakpoint = {
614 .task_ctx_nr = perf_sw_context, /* could eventually get its own */ 609 .task_ctx_nr = perf_sw_context, /* could eventually get its own */
615 610
@@ -619,8 +614,6 @@ static struct pmu perf_breakpoint = {
619 .start = hw_breakpoint_start, 614 .start = hw_breakpoint_start,
620 .stop = hw_breakpoint_stop, 615 .stop = hw_breakpoint_stop,
621 .read = hw_breakpoint_pmu_read, 616 .read = hw_breakpoint_pmu_read,
622
623 .event_idx = hw_breakpoint_event_idx,
624}; 617};
625 618
626int __init init_hw_breakpoint(void) 619int __init init_hw_breakpoint(void)
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 1d0af8a2c646..cb346f26a22d 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -193,7 +193,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
193 } 193 }
194 194
195 flush_cache_page(vma, addr, pte_pfn(*ptep)); 195 flush_cache_page(vma, addr, pte_pfn(*ptep));
196 ptep_clear_flush(vma, addr, ptep); 196 ptep_clear_flush_notify(vma, addr, ptep);
197 set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); 197 set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
198 198
199 page_remove_rmap(page); 199 page_remove_rmap(page);
@@ -724,14 +724,14 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
724 int more = 0; 724 int more = 0;
725 725
726 again: 726 again:
727 mutex_lock(&mapping->i_mmap_mutex); 727 i_mmap_lock_read(mapping);
728 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { 728 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
729 if (!valid_vma(vma, is_register)) 729 if (!valid_vma(vma, is_register))
730 continue; 730 continue;
731 731
732 if (!prev && !more) { 732 if (!prev && !more) {
733 /* 733 /*
734 * Needs GFP_NOWAIT to avoid i_mmap_mutex recursion through 734 * Needs GFP_NOWAIT to avoid i_mmap_rwsem recursion through
735 * reclaim. This is optimistic, no harm done if it fails. 735 * reclaim. This is optimistic, no harm done if it fails.
736 */ 736 */
737 prev = kmalloc(sizeof(struct map_info), 737 prev = kmalloc(sizeof(struct map_info),
@@ -755,7 +755,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
755 info->mm = vma->vm_mm; 755 info->mm = vma->vm_mm;
756 info->vaddr = offset_to_vaddr(vma, offset); 756 info->vaddr = offset_to_vaddr(vma, offset);
757 } 757 }
758 mutex_unlock(&mapping->i_mmap_mutex); 758 i_mmap_unlock_read(mapping);
759 759
760 if (!more) 760 if (!more)
761 goto out; 761 goto out;
@@ -1640,7 +1640,6 @@ bool uprobe_deny_signal(void)
1640 if (__fatal_signal_pending(t) || arch_uprobe_xol_was_trapped(t)) { 1640 if (__fatal_signal_pending(t) || arch_uprobe_xol_was_trapped(t)) {
1641 utask->state = UTASK_SSTEP_TRAPPED; 1641 utask->state = UTASK_SSTEP_TRAPPED;
1642 set_tsk_thread_flag(t, TIF_UPROBE); 1642 set_tsk_thread_flag(t, TIF_UPROBE);
1643 set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
1644 } 1643 }
1645 } 1644 }
1646 1645
diff --git a/kernel/exit.c b/kernel/exit.c
index 32c58f7433a3..6806c55475ee 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -115,32 +115,30 @@ static void __exit_signal(struct task_struct *tsk)
115 115
116 if (tsk == sig->curr_target) 116 if (tsk == sig->curr_target)
117 sig->curr_target = next_thread(tsk); 117 sig->curr_target = next_thread(tsk);
118 /*
119 * Accumulate here the counters for all threads but the
120 * group leader as they die, so they can be added into
121 * the process-wide totals when those are taken.
122 * The group leader stays around as a zombie as long
123 * as there are other threads. When it gets reaped,
124 * the exit.c code will add its counts into these totals.
125 * We won't ever get here for the group leader, since it
126 * will have been the last reference on the signal_struct.
127 */
128 task_cputime(tsk, &utime, &stime);
129 sig->utime += utime;
130 sig->stime += stime;
131 sig->gtime += task_gtime(tsk);
132 sig->min_flt += tsk->min_flt;
133 sig->maj_flt += tsk->maj_flt;
134 sig->nvcsw += tsk->nvcsw;
135 sig->nivcsw += tsk->nivcsw;
136 sig->inblock += task_io_get_inblock(tsk);
137 sig->oublock += task_io_get_oublock(tsk);
138 task_io_accounting_add(&sig->ioac, &tsk->ioac);
139 sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
140 } 118 }
141 119
120 /*
121 * Accumulate here the counters for all threads as they die. We could
122 * skip the group leader because it is the last user of signal_struct,
123 * but we want to avoid the race with thread_group_cputime() which can
124 * see the empty ->thread_head list.
125 */
126 task_cputime(tsk, &utime, &stime);
127 write_seqlock(&sig->stats_lock);
128 sig->utime += utime;
129 sig->stime += stime;
130 sig->gtime += task_gtime(tsk);
131 sig->min_flt += tsk->min_flt;
132 sig->maj_flt += tsk->maj_flt;
133 sig->nvcsw += tsk->nvcsw;
134 sig->nivcsw += tsk->nivcsw;
135 sig->inblock += task_io_get_inblock(tsk);
136 sig->oublock += task_io_get_oublock(tsk);
137 task_io_accounting_add(&sig->ioac, &tsk->ioac);
138 sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
142 sig->nr_threads--; 139 sig->nr_threads--;
143 __unhash_process(tsk, group_dead); 140 __unhash_process(tsk, group_dead);
141 write_sequnlock(&sig->stats_lock);
144 142
145 /* 143 /*
146 * Do this under ->siglock, we can race with another thread 144 * Do this under ->siglock, we can race with another thread
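
The write_seqlock()/write_sequnlock() pair added around the accumulation lets readers such as thread_group_cputime() retry instead of observing half-updated totals. For context only, the matching read side of the standard seqlock pattern looks roughly like the sketch below (not part of this hunk; local types simplified to u64, the real fields use the kernel's cputime accounting types):

	unsigned int seq;
	u64 utime, stime;

	do {
		seq = read_seqbegin(&sig->stats_lock);	/* snapshot the sequence */
		utime = sig->utime;
		stime = sig->stime;
	} while (read_seqretry(&sig->stats_lock, seq));	/* retry if a writer ran */
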
@@ -214,27 +212,6 @@ repeat:
214} 212}
215 213
216/* 214/*
217 * This checks not only the pgrp, but falls back on the pid if no
218 * satisfactory pgrp is found. I dunno - gdb doesn't work correctly
219 * without this...
220 *
221 * The caller must hold rcu lock or the tasklist lock.
222 */
223struct pid *session_of_pgrp(struct pid *pgrp)
224{
225 struct task_struct *p;
226 struct pid *sid = NULL;
227
228 p = pid_task(pgrp, PIDTYPE_PGID);
229 if (p == NULL)
230 p = pid_task(pgrp, PIDTYPE_PID);
231 if (p != NULL)
232 sid = task_session(p);
233
234 return sid;
235}
236
237/*
238 * Determine if a process group is "orphaned", according to the POSIX 215 * Determine if a process group is "orphaned", according to the POSIX
239 * definition in 2.2.2.52. Orphaned process groups are not to be affected 216 * definition in 2.2.2.52. Orphaned process groups are not to be affected
240 * by terminal-generated stop signals. Newly orphaned process groups are 217 * by terminal-generated stop signals. Newly orphaned process groups are
@@ -461,6 +438,44 @@ static void exit_mm(struct task_struct *tsk)
461 clear_thread_flag(TIF_MEMDIE); 438 clear_thread_flag(TIF_MEMDIE);
462} 439}
463 440
441static struct task_struct *find_alive_thread(struct task_struct *p)
442{
443 struct task_struct *t;
444
445 for_each_thread(p, t) {
446 if (!(t->flags & PF_EXITING))
447 return t;
448 }
449 return NULL;
450}
451
452static struct task_struct *find_child_reaper(struct task_struct *father)
453 __releases(&tasklist_lock)
454 __acquires(&tasklist_lock)
455{
456 struct pid_namespace *pid_ns = task_active_pid_ns(father);
457 struct task_struct *reaper = pid_ns->child_reaper;
458
459 if (likely(reaper != father))
460 return reaper;
461
462 reaper = find_alive_thread(father);
463 if (reaper) {
464 pid_ns->child_reaper = reaper;
465 return reaper;
466 }
467
468 write_unlock_irq(&tasklist_lock);
469 if (unlikely(pid_ns == &init_pid_ns)) {
470 panic("Attempted to kill init! exitcode=0x%08x\n",
471 father->signal->group_exit_code ?: father->exit_code);
472 }
473 zap_pid_ns_processes(pid_ns);
474 write_lock_irq(&tasklist_lock);
475
476 return father;
477}
478
464/* 479/*
465 * When we die, we re-parent all our children, and try to: 480 * When we die, we re-parent all our children, and try to:
466 * 1. give them to another thread in our thread group, if such a member exists 481 * 1. give them to another thread in our thread group, if such a member exists
@@ -468,58 +483,36 @@ static void exit_mm(struct task_struct *tsk)
468 * child_subreaper for its children (like a service manager) 483 * child_subreaper for its children (like a service manager)
469 * 3. give it to the init process (PID 1) in our pid namespace 484 * 3. give it to the init process (PID 1) in our pid namespace
470 */ 485 */
471static struct task_struct *find_new_reaper(struct task_struct *father) 486static struct task_struct *find_new_reaper(struct task_struct *father,
472 __releases(&tasklist_lock) 487 struct task_struct *child_reaper)
473 __acquires(&tasklist_lock)
474{ 488{
475 struct pid_namespace *pid_ns = task_active_pid_ns(father); 489 struct task_struct *thread, *reaper;
476 struct task_struct *thread;
477 490
478 thread = father; 491 thread = find_alive_thread(father);
479 while_each_thread(father, thread) { 492 if (thread)
480 if (thread->flags & PF_EXITING)
481 continue;
482 if (unlikely(pid_ns->child_reaper == father))
483 pid_ns->child_reaper = thread;
484 return thread; 493 return thread;
485 }
486
487 if (unlikely(pid_ns->child_reaper == father)) {
488 write_unlock_irq(&tasklist_lock);
489 if (unlikely(pid_ns == &init_pid_ns)) {
490 panic("Attempted to kill init! exitcode=0x%08x\n",
491 father->signal->group_exit_code ?:
492 father->exit_code);
493 }
494
495 zap_pid_ns_processes(pid_ns);
496 write_lock_irq(&tasklist_lock);
497 } else if (father->signal->has_child_subreaper) {
498 struct task_struct *reaper;
499 494
495 if (father->signal->has_child_subreaper) {
500 /* 496 /*
501 * Find the first ancestor marked as child_subreaper. 497 * Find the first ->is_child_subreaper ancestor in our pid_ns.
502 * Note that the code below checks same_thread_group(reaper, 498 * We start from father to ensure we can not look into another
503 * pid_ns->child_reaper). This is what we need to DTRT in a 499 * namespace, this is safe because all its threads are dead.
504 * PID namespace. However we still need the check above, see
505 * http://marc.info/?l=linux-kernel&m=131385460420380
506 */ 500 */
507 for (reaper = father->real_parent; 501 for (reaper = father;
508 reaper != &init_task; 502 !same_thread_group(reaper, child_reaper);
509 reaper = reaper->real_parent) { 503 reaper = reaper->real_parent) {
510 if (same_thread_group(reaper, pid_ns->child_reaper)) 504 /* call_usermodehelper() descendants need this check */
505 if (reaper == &init_task)
511 break; 506 break;
512 if (!reaper->signal->is_child_subreaper) 507 if (!reaper->signal->is_child_subreaper)
513 continue; 508 continue;
514 thread = reaper; 509 thread = find_alive_thread(reaper);
515 do { 510 if (thread)
516 if (!(thread->flags & PF_EXITING)) 511 return thread;
517 return reaper;
518 } while_each_thread(reaper, thread);
519 } 512 }
520 } 513 }
521 514
522 return pid_ns->child_reaper; 515 return child_reaper;
523} 516}
524 517
525/* 518/*
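An editorial aside on the reaper selection above: the ->is_child_subreaper ancestors that find_new_reaper() walks (now via find_alive_thread() instead of the open-coded thread loop) are processes that opted in with prctl(PR_SET_CHILD_SUBREAPER, 1). A minimal user-space sketch of that behaviour, purely illustrative and not part of this patch:

	#include <stdio.h>
	#include <stdlib.h>
	#include <unistd.h>
	#include <sys/prctl.h>
	#include <sys/wait.h>

	int main(void)
	{
		/* Mark this process as a child subreaper, like a service manager. */
		if (prctl(PR_SET_CHILD_SUBREAPER, 1) != 0) {
			perror("prctl");
			return 1;
		}

		pid_t middle = fork();
		if (middle == 0) {
			/* Middle process: create a grandchild, then exit at once. */
			if (fork() == 0) {
				sleep(1);
				/* With the subreaper set, our new parent should be the
				 * top process above, not init (PID 1). */
				printf("grandchild: ppid=%d\n", (int)getppid());
				_exit(0);
			}
			_exit(0);
		}

		waitpid(middle, NULL, 0);	/* reap the middle process */
		while (wait(NULL) > 0)		/* then the reparented grandchild */
			;
		return 0;
	}

If the printed ppid is the demo's own pid rather than 1, the grandchild took the subreaper path that the rewritten loop above resolves in the kernel.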
@@ -528,15 +521,7 @@ static struct task_struct *find_new_reaper(struct task_struct *father)
528static void reparent_leader(struct task_struct *father, struct task_struct *p, 521static void reparent_leader(struct task_struct *father, struct task_struct *p,
529 struct list_head *dead) 522 struct list_head *dead)
530{ 523{
531 list_move_tail(&p->sibling, &p->real_parent->children); 524 if (unlikely(p->exit_state == EXIT_DEAD))
532
533 if (p->exit_state == EXIT_DEAD)
534 return;
535 /*
536 * If this is a threaded reparent there is no need to
537 * notify anyone anything has happened.
538 */
539 if (same_thread_group(p->real_parent, father))
540 return; 525 return;
541 526
542 /* We don't want people slaying init. */ 527 /* We don't want people slaying init. */
@@ -547,49 +532,53 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p,
547 p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { 532 p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
548 if (do_notify_parent(p, p->exit_signal)) { 533 if (do_notify_parent(p, p->exit_signal)) {
549 p->exit_state = EXIT_DEAD; 534 p->exit_state = EXIT_DEAD;
550 list_move_tail(&p->sibling, dead); 535 list_add(&p->ptrace_entry, dead);
551 } 536 }
552 } 537 }
553 538
554 kill_orphaned_pgrp(p, father); 539 kill_orphaned_pgrp(p, father);
555} 540}
556 541
557static void forget_original_parent(struct task_struct *father) 542/*
543 * This does two things:
544 *
545 * A. Make init inherit all the child processes
546 * B. Check to see if any process groups have become orphaned
547 * as a result of our exiting, and if they have any stopped
548 * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
549 */
550static void forget_original_parent(struct task_struct *father,
551 struct list_head *dead)
558{ 552{
559 struct task_struct *p, *n, *reaper; 553 struct task_struct *p, *t, *reaper;
560 LIST_HEAD(dead_children);
561 554
562 write_lock_irq(&tasklist_lock); 555 if (unlikely(!list_empty(&father->ptraced)))
563 /* 556 exit_ptrace(father, dead);
564 * Note that exit_ptrace() and find_new_reaper() might
565 * drop tasklist_lock and reacquire it.
566 */
567 exit_ptrace(father);
568 reaper = find_new_reaper(father);
569 557
570 list_for_each_entry_safe(p, n, &father->children, sibling) { 558 /* Can drop and reacquire tasklist_lock */
571 struct task_struct *t = p; 559 reaper = find_child_reaper(father);
560 if (list_empty(&father->children))
561 return;
572 562
573 do { 563 reaper = find_new_reaper(father, reaper);
564 list_for_each_entry(p, &father->children, sibling) {
565 for_each_thread(p, t) {
574 t->real_parent = reaper; 566 t->real_parent = reaper;
575 if (t->parent == father) { 567 BUG_ON((!t->ptrace) != (t->parent == father));
576 BUG_ON(t->ptrace); 568 if (likely(!t->ptrace))
577 t->parent = t->real_parent; 569 t->parent = t->real_parent;
578 }
579 if (t->pdeath_signal) 570 if (t->pdeath_signal)
580 group_send_sig_info(t->pdeath_signal, 571 group_send_sig_info(t->pdeath_signal,
581 SEND_SIG_NOINFO, t); 572 SEND_SIG_NOINFO, t);
582 } while_each_thread(p, t); 573 }
583 reparent_leader(father, p, &dead_children); 574 /*
584 } 575 * If this is a threaded reparent there is no need to
585 write_unlock_irq(&tasklist_lock); 576 * notify anyone anything has happened.
586 577 */
587 BUG_ON(!list_empty(&father->children)); 578 if (!same_thread_group(reaper, father))
588 579 reparent_leader(father, p, dead);
589 list_for_each_entry_safe(p, n, &dead_children, sibling) {
590 list_del_init(&p->sibling);
591 release_task(p);
592 } 580 }
581 list_splice_tail_init(&father->children, &reaper->children);
593} 582}
594 583
595/* 584/*
@@ -599,18 +588,12 @@ static void forget_original_parent(struct task_struct *father)
599static void exit_notify(struct task_struct *tsk, int group_dead) 588static void exit_notify(struct task_struct *tsk, int group_dead)
600{ 589{
601 bool autoreap; 590 bool autoreap;
602 591 struct task_struct *p, *n;
603 /* 592 LIST_HEAD(dead);
604 * This does two things:
605 *
606 * A. Make init inherit all the child processes
607 * B. Check to see if any process groups have become orphaned
608 * as a result of our exiting, and if they have any stopped
609 * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
610 */
611 forget_original_parent(tsk);
612 593
613 write_lock_irq(&tasklist_lock); 594 write_lock_irq(&tasklist_lock);
595 forget_original_parent(tsk, &dead);
596
614 if (group_dead) 597 if (group_dead)
615 kill_orphaned_pgrp(tsk->group_leader, NULL); 598 kill_orphaned_pgrp(tsk->group_leader, NULL);
616 599
@@ -628,15 +611,18 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
628 } 611 }
629 612
630 tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE; 613 tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE;
614 if (tsk->exit_state == EXIT_DEAD)
615 list_add(&tsk->ptrace_entry, &dead);
631 616
632 /* mt-exec, de_thread() is waiting for group leader */ 617 /* mt-exec, de_thread() is waiting for group leader */
633 if (unlikely(tsk->signal->notify_count < 0)) 618 if (unlikely(tsk->signal->notify_count < 0))
634 wake_up_process(tsk->signal->group_exit_task); 619 wake_up_process(tsk->signal->group_exit_task);
635 write_unlock_irq(&tasklist_lock); 620 write_unlock_irq(&tasklist_lock);
636 621
637 /* If the process is dead, release it - nobody will wait for it */ 622 list_for_each_entry_safe(p, n, &dead, ptrace_entry) {
638 if (autoreap) 623 list_del_init(&p->ptrace_entry);
639 release_task(tsk); 624 release_task(p);
625 }
640} 626}
641 627
642#ifdef CONFIG_DEBUG_STACK_USAGE 628#ifdef CONFIG_DEBUG_STACK_USAGE
@@ -667,6 +653,7 @@ void do_exit(long code)
667{ 653{
668 struct task_struct *tsk = current; 654 struct task_struct *tsk = current;
669 int group_dead; 655 int group_dead;
656 TASKS_RCU(int tasks_rcu_i);
670 657
671 profile_task_exit(tsk); 658 profile_task_exit(tsk);
672 659
@@ -775,6 +762,7 @@ void do_exit(long code)
775 */ 762 */
776 flush_ptrace_hw_breakpoint(tsk); 763 flush_ptrace_hw_breakpoint(tsk);
777 764
765 TASKS_RCU(tasks_rcu_i = __srcu_read_lock(&tasks_rcu_exit_srcu));
778 exit_notify(tsk, group_dead); 766 exit_notify(tsk, group_dead);
779 proc_exit_connector(tsk); 767 proc_exit_connector(tsk);
780#ifdef CONFIG_NUMA 768#ifdef CONFIG_NUMA
@@ -814,6 +802,7 @@ void do_exit(long code)
814 if (tsk->nr_dirtied) 802 if (tsk->nr_dirtied)
815 __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied); 803 __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
816 exit_rcu(); 804 exit_rcu();
805 TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i));
817 806
818 /* 807 /*
819 * The setting of TASK_RUNNING by try_to_wake_up() may be delayed 808 * The setting of TASK_RUNNING by try_to_wake_up() may be delayed
@@ -978,8 +967,7 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,
978 */ 967 */
979static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) 968static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
980{ 969{
981 unsigned long state; 970 int state, retval, status;
982 int retval, status, traced;
983 pid_t pid = task_pid_vnr(p); 971 pid_t pid = task_pid_vnr(p);
984 uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p)); 972 uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p));
985 struct siginfo __user *infop; 973 struct siginfo __user *infop;
@@ -993,6 +981,8 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
993 981
994 get_task_struct(p); 982 get_task_struct(p);
995 read_unlock(&tasklist_lock); 983 read_unlock(&tasklist_lock);
984 sched_annotate_sleep();
985
996 if ((exit_code & 0x7f) == 0) { 986 if ((exit_code & 0x7f) == 0) {
997 why = CLD_EXITED; 987 why = CLD_EXITED;
998 status = exit_code >> 8; 988 status = exit_code >> 8;
@@ -1002,21 +992,25 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1002 } 992 }
1003 return wait_noreap_copyout(wo, p, pid, uid, why, status); 993 return wait_noreap_copyout(wo, p, pid, uid, why, status);
1004 } 994 }
1005
1006 traced = ptrace_reparented(p);
1007 /* 995 /*
1008 * Move the task's state to DEAD/TRACE, only one thread can do this. 996 * Move the task's state to DEAD/TRACE, only one thread can do this.
1009 */ 997 */
1010 state = traced && thread_group_leader(p) ? EXIT_TRACE : EXIT_DEAD; 998 state = (ptrace_reparented(p) && thread_group_leader(p)) ?
999 EXIT_TRACE : EXIT_DEAD;
1011 if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE) 1000 if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE)
1012 return 0; 1001 return 0;
1013 /* 1002 /*
1014 * It can be ptraced but not reparented, check 1003 * We own this thread, nobody else can reap it.
1015 * thread_group_leader() to filter out sub-threads.
1016 */ 1004 */
1017 if (likely(!traced) && thread_group_leader(p)) { 1005 read_unlock(&tasklist_lock);
1018 struct signal_struct *psig; 1006 sched_annotate_sleep();
1019 struct signal_struct *sig; 1007
1008 /*
1009 * Check thread_group_leader() to exclude the traced sub-threads.
1010 */
1011 if (state == EXIT_DEAD && thread_group_leader(p)) {
1012 struct signal_struct *sig = p->signal;
1013 struct signal_struct *psig = current->signal;
1020 unsigned long maxrss; 1014 unsigned long maxrss;
1021 cputime_t tgutime, tgstime; 1015 cputime_t tgutime, tgstime;
1022 1016
@@ -1028,21 +1022,21 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1028 * accumulate in the parent's signal_struct c* fields. 1022 * accumulate in the parent's signal_struct c* fields.
1029 * 1023 *
1030 * We don't bother to take a lock here to protect these 1024 * We don't bother to take a lock here to protect these
1031 * p->signal fields, because they are only touched by 1025 * p->signal fields because the whole thread group is dead
1032 * __exit_signal, which runs with tasklist_lock 1026 * and nobody can change them.
1033 * write-locked anyway, and so is excluded here. We do 1027 *
1034 * need to protect the access to parent->signal fields, 1028 * psig->stats_lock also protects us from our sub-threads
1035 * as other threads in the parent group can be right 1029 * which can reap other children at the same time. Until
1036 * here reaping other children at the same time. 1030 * we change k_getrusage()-like users to rely on this lock
1031 * we have to take ->siglock as well.
1037 * 1032 *
1038 * We use thread_group_cputime_adjusted() to get times for 1033 * We use thread_group_cputime_adjusted() to get times for
1039 * the thread group, which consolidates times for all threads 1034 * the thread group, which consolidates times for all threads
1040 * in the group including the group leader. 1035 * in the group including the group leader.
1041 */ 1036 */
1042 thread_group_cputime_adjusted(p, &tgutime, &tgstime); 1037 thread_group_cputime_adjusted(p, &tgutime, &tgstime);
1043 spin_lock_irq(&p->real_parent->sighand->siglock); 1038 spin_lock_irq(&current->sighand->siglock);
1044 psig = p->real_parent->signal; 1039 write_seqlock(&psig->stats_lock);
1045 sig = p->signal;
1046 psig->cutime += tgutime + sig->cutime; 1040 psig->cutime += tgutime + sig->cutime;
1047 psig->cstime += tgstime + sig->cstime; 1041 psig->cstime += tgstime + sig->cstime;
1048 psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime; 1042 psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
@@ -1065,15 +1059,10 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1065 psig->cmaxrss = maxrss; 1059 psig->cmaxrss = maxrss;
1066 task_io_accounting_add(&psig->ioac, &p->ioac); 1060 task_io_accounting_add(&psig->ioac, &p->ioac);
1067 task_io_accounting_add(&psig->ioac, &sig->ioac); 1061 task_io_accounting_add(&psig->ioac, &sig->ioac);
1068 spin_unlock_irq(&p->real_parent->sighand->siglock); 1062 write_sequnlock(&psig->stats_lock);
1063 spin_unlock_irq(&current->sighand->siglock);
1069 } 1064 }
1070 1065
1071 /*
1072 * Now we are sure this task is interesting, and no other
1073 * thread can reap it because we its state == DEAD/TRACE.
1074 */
1075 read_unlock(&tasklist_lock);
1076
1077 retval = wo->wo_rusage 1066 retval = wo->wo_rusage
1078 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; 1067 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
1079 status = (p->signal->flags & SIGNAL_GROUP_EXIT) 1068 status = (p->signal->flags & SIGNAL_GROUP_EXIT)
@@ -1204,6 +1193,7 @@ unlock_sig:
1204 pid = task_pid_vnr(p); 1193 pid = task_pid_vnr(p);
1205 why = ptrace ? CLD_TRAPPED : CLD_STOPPED; 1194 why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
1206 read_unlock(&tasklist_lock); 1195 read_unlock(&tasklist_lock);
1196 sched_annotate_sleep();
1207 1197
1208 if (unlikely(wo->wo_flags & WNOWAIT)) 1198 if (unlikely(wo->wo_flags & WNOWAIT))
1209 return wait_noreap_copyout(wo, p, pid, uid, why, exit_code); 1199 return wait_noreap_copyout(wo, p, pid, uid, why, exit_code);
@@ -1266,6 +1256,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
1266 pid = task_pid_vnr(p); 1256 pid = task_pid_vnr(p);
1267 get_task_struct(p); 1257 get_task_struct(p);
1268 read_unlock(&tasklist_lock); 1258 read_unlock(&tasklist_lock);
1259 sched_annotate_sleep();
1269 1260
1270 if (!wo->wo_info) { 1261 if (!wo->wo_info) {
1271 retval = wo->wo_rusage 1262 retval = wo->wo_rusage
@@ -1296,9 +1287,15 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
1296static int wait_consider_task(struct wait_opts *wo, int ptrace, 1287static int wait_consider_task(struct wait_opts *wo, int ptrace,
1297 struct task_struct *p) 1288 struct task_struct *p)
1298{ 1289{
1290 /*
1291 * We can race with wait_task_zombie() from another thread.
1292 * Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition
1293 * can't confuse the checks below.
1294 */
1295 int exit_state = ACCESS_ONCE(p->exit_state);
1299 int ret; 1296 int ret;
1300 1297
1301 if (unlikely(p->exit_state == EXIT_DEAD)) 1298 if (unlikely(exit_state == EXIT_DEAD))
1302 return 0; 1299 return 0;
1303 1300
1304 ret = eligible_child(wo, p); 1301 ret = eligible_child(wo, p);
@@ -1319,7 +1316,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
1319 return 0; 1316 return 0;
1320 } 1317 }
1321 1318
1322 if (unlikely(p->exit_state == EXIT_TRACE)) { 1319 if (unlikely(exit_state == EXIT_TRACE)) {
1323 /* 1320 /*
1324 * ptrace == 0 means we are the natural parent. In this case 1321 * ptrace == 0 means we are the natural parent. In this case
1325 * we should clear notask_error, debugger will notify us. 1322 * we should clear notask_error, debugger will notify us.
@@ -1346,7 +1343,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
1346 } 1343 }
1347 1344
1348 /* slay zombie? */ 1345 /* slay zombie? */
1349 if (p->exit_state == EXIT_ZOMBIE) { 1346 if (exit_state == EXIT_ZOMBIE) {
1350 /* we don't reap group leaders with subthreads */ 1347 /* we don't reap group leaders with subthreads */
1351 if (!delay_group_leader(p)) { 1348 if (!delay_group_leader(p)) {
1352 /* 1349 /*
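A note on the new psig->stats_lock usage above: write_seqlock()/write_sequnlock() make the parent's c* accounting fields readable without ->siglock once k_getrusage()-like readers are converted, which is what the in-code comment points at. A rough reader-side sketch under that assumption (the helper name is invented here):

	#include <linux/sched.h>
	#include <linux/seqlock.h>

	/* Illustrative only: retry the snapshot if a concurrent reaper updated it. */
	static void snapshot_child_times(struct signal_struct *psig,
					 cputime_t *cutime, cputime_t *cstime)
	{
		unsigned int seq;

		do {
			seq = read_seqbegin(&psig->stats_lock);
			*cutime = psig->cutime;
			*cstime = psig->cstime;
		} while (read_seqretry(&psig->stats_lock, seq));
	}

Until readers look like this, the hunk above still has to take current->sighand->siglock alongside the seqlock, exactly as the comment says.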
diff --git a/kernel/extable.c b/kernel/extable.c
index d8a6446adbcb..c98f926277a8 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -18,6 +18,7 @@
18#include <linux/ftrace.h> 18#include <linux/ftrace.h>
19#include <linux/memory.h> 19#include <linux/memory.h>
20#include <linux/module.h> 20#include <linux/module.h>
21#include <linux/ftrace.h>
21#include <linux/mutex.h> 22#include <linux/mutex.h>
22#include <linux/init.h> 23#include <linux/init.h>
23 24
@@ -102,6 +103,8 @@ int __kernel_text_address(unsigned long addr)
102 return 1; 103 return 1;
103 if (is_module_text_address(addr)) 104 if (is_module_text_address(addr))
104 return 1; 105 return 1;
106 if (is_ftrace_trampoline(addr))
107 return 1;
105 /* 108 /*
106 * There might be init symbols in saved stacktraces. 109 * There might be init symbols in saved stacktraces.
107 * Give those symbols a chance to be printed in 110 * Give those symbols a chance to be printed in
@@ -119,7 +122,9 @@ int kernel_text_address(unsigned long addr)
119{ 122{
120 if (core_kernel_text(addr)) 123 if (core_kernel_text(addr))
121 return 1; 124 return 1;
122 return is_module_text_address(addr); 125 if (is_module_text_address(addr))
126 return 1;
127 return is_ftrace_trampoline(addr);
123} 128}
124 129
125/* 130/*
diff --git a/kernel/fork.c b/kernel/fork.c
index a91e47d86de2..4dc2ddade9f1 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -294,11 +294,18 @@ int __weak arch_dup_task_struct(struct task_struct *dst,
294 return 0; 294 return 0;
295} 295}
296 296
297void set_task_stack_end_magic(struct task_struct *tsk)
298{
299 unsigned long *stackend;
300
301 stackend = end_of_stack(tsk);
302 *stackend = STACK_END_MAGIC; /* for overflow detection */
303}
304
297static struct task_struct *dup_task_struct(struct task_struct *orig) 305static struct task_struct *dup_task_struct(struct task_struct *orig)
298{ 306{
299 struct task_struct *tsk; 307 struct task_struct *tsk;
300 struct thread_info *ti; 308 struct thread_info *ti;
301 unsigned long *stackend;
302 int node = tsk_fork_get_node(orig); 309 int node = tsk_fork_get_node(orig);
303 int err; 310 int err;
304 311
@@ -328,8 +335,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
328 setup_thread_stack(tsk, orig); 335 setup_thread_stack(tsk, orig);
329 clear_user_return_notifier(tsk); 336 clear_user_return_notifier(tsk);
330 clear_tsk_need_resched(tsk); 337 clear_tsk_need_resched(tsk);
331 stackend = end_of_stack(tsk); 338 set_task_stack_end_magic(tsk);
332 *stackend = STACK_END_MAGIC; /* for overflow detection */
333 339
334#ifdef CONFIG_CC_STACKPROTECTOR 340#ifdef CONFIG_CC_STACKPROTECTOR
335 tsk->stack_canary = get_random_int(); 341 tsk->stack_canary = get_random_int();
@@ -427,7 +433,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
427 get_file(file); 433 get_file(file);
428 if (tmp->vm_flags & VM_DENYWRITE) 434 if (tmp->vm_flags & VM_DENYWRITE)
429 atomic_dec(&inode->i_writecount); 435 atomic_dec(&inode->i_writecount);
430 mutex_lock(&mapping->i_mmap_mutex); 436 i_mmap_lock_write(mapping);
431 if (tmp->vm_flags & VM_SHARED) 437 if (tmp->vm_flags & VM_SHARED)
432 atomic_inc(&mapping->i_mmap_writable); 438 atomic_inc(&mapping->i_mmap_writable);
433 flush_dcache_mmap_lock(mapping); 439 flush_dcache_mmap_lock(mapping);
@@ -439,7 +445,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
439 vma_interval_tree_insert_after(tmp, mpnt, 445 vma_interval_tree_insert_after(tmp, mpnt,
440 &mapping->i_mmap); 446 &mapping->i_mmap);
441 flush_dcache_mmap_unlock(mapping); 447 flush_dcache_mmap_unlock(mapping);
442 mutex_unlock(&mapping->i_mmap_mutex); 448 i_mmap_unlock_write(mapping);
443 } 449 }
444 450
445 /* 451 /*
@@ -601,9 +607,8 @@ static void check_mm(struct mm_struct *mm)
601 printk(KERN_ALERT "BUG: Bad rss-counter state " 607 printk(KERN_ALERT "BUG: Bad rss-counter state "
602 "mm:%p idx:%d val:%ld\n", mm, i, x); 608 "mm:%p idx:%d val:%ld\n", mm, i, x);
603 } 609 }
604
605#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS 610#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
606 VM_BUG_ON(mm->pmd_huge_pte); 611 VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
607#endif 612#endif
608} 613}
609 614
@@ -1017,11 +1022,14 @@ void __cleanup_sighand(struct sighand_struct *sighand)
1017{ 1022{
1018 if (atomic_dec_and_test(&sighand->count)) { 1023 if (atomic_dec_and_test(&sighand->count)) {
1019 signalfd_cleanup(sighand); 1024 signalfd_cleanup(sighand);
1025 /*
1026 * sighand_cachep is SLAB_DESTROY_BY_RCU so we can free it
1027 * without an RCU grace period, see __lock_task_sighand().
1028 */
1020 kmem_cache_free(sighand_cachep, sighand); 1029 kmem_cache_free(sighand_cachep, sighand);
1021 } 1030 }
1022} 1031}
1023 1032
1024
1025/* 1033/*
1026 * Initialize POSIX timer handling for a thread group. 1034 * Initialize POSIX timer handling for a thread group.
1027 */ 1035 */
@@ -1068,6 +1076,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
1068 sig->curr_target = tsk; 1076 sig->curr_target = tsk;
1069 init_sigpending(&sig->shared_pending); 1077 init_sigpending(&sig->shared_pending);
1070 INIT_LIST_HEAD(&sig->posix_timers); 1078 INIT_LIST_HEAD(&sig->posix_timers);
1079 seqlock_init(&sig->stats_lock);
1071 1080
1072 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 1081 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1073 sig->real_timer.function = it_real_fn; 1082 sig->real_timer.function = it_real_fn;
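The set_task_stack_end_magic() helper factored out above only plants a sentinel word at the far end of the task stack so an overflow tramples a known value before anything else. A toy user-space analogue of the same idea (names and layout invented for illustration):

	#include <stdio.h>
	#include <string.h>

	#define END_MAGIC 0x57AC6E9DUL		/* same sentinel value the kernel uses */

	struct toy_stack {
		unsigned long magic;		/* lowest address: first casualty of an overflow */
		unsigned char data[4096];
	};

	static void toy_stack_init(struct toy_stack *s)
	{
		s->magic = END_MAGIC;
	}

	static int toy_stack_overflowed(const struct toy_stack *s)
	{
		return s->magic != END_MAGIC;
	}

	int main(void)
	{
		struct toy_stack s;

		toy_stack_init(&s);
		memset(s.data, 0, sizeof(s.data));	/* well-behaved use keeps the sentinel */
		printf("overflowed: %d\n", toy_stack_overflowed(&s));
		return 0;
	}

The factoring presumably lets other callers (init_task setup, for instance) arm the same sentinel that dup_task_struct() now sets here.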
diff --git a/kernel/freezer.c b/kernel/freezer.c
index aa6a8aadb911..a8900a3bc27a 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -42,6 +42,9 @@ bool freezing_slow_path(struct task_struct *p)
42 if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK)) 42 if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK))
43 return false; 43 return false;
44 44
45 if (test_thread_flag(TIF_MEMDIE))
46 return false;
47
45 if (pm_nosig_freezing || cgroup_freezing(p)) 48 if (pm_nosig_freezing || cgroup_freezing(p))
46 return true; 49 return true;
47 50
@@ -147,12 +150,6 @@ void __thaw_task(struct task_struct *p)
147{ 150{
148 unsigned long flags; 151 unsigned long flags;
149 152
150 /*
151 * Clear freezing and kick @p if FROZEN. Clearing is guaranteed to
152 * be visible to @p as waking up implies wmb. Waking up inside
153 * freezer_lock also prevents wakeups from leaking outside
154 * refrigerator.
155 */
156 spin_lock_irqsave(&freezer_lock, flags); 153 spin_lock_irqsave(&freezer_lock, flags);
157 if (frozen(p)) 154 if (frozen(p))
158 wake_up_process(p); 155 wake_up_process(p);
diff --git a/kernel/futex.c b/kernel/futex.c
index 815d7af2ffe8..63678b573d61 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -143,9 +143,8 @@
143 * 143 *
144 * Where (A) orders the waiters increment and the futex value read through 144 * Where (A) orders the waiters increment and the futex value read through
145 * atomic operations (see hb_waiters_inc) and where (B) orders the write 145 * atomic operations (see hb_waiters_inc) and where (B) orders the write
146 * to futex and the waiters read -- this is done by the barriers in 146 * to futex and the waiters read -- this is done by the barriers for both
147 * get_futex_key_refs(), through either ihold or atomic_inc, depending on the 147 * shared and private futexes in get_futex_key_refs().
148 * futex type.
149 * 148 *
150 * This yields the following case (where X:=waiters, Y:=futex): 149 * This yields the following case (where X:=waiters, Y:=futex):
151 * 150 *
@@ -343,12 +342,21 @@ static void get_futex_key_refs(union futex_key *key)
343 case FUT_OFF_MMSHARED: 342 case FUT_OFF_MMSHARED:
344 futex_get_mm(key); /* implies MB (B) */ 343 futex_get_mm(key); /* implies MB (B) */
345 break; 344 break;
345 default:
346 /*
347 * Private futexes do not hold reference on an inode or
348 * mm, therefore the only purpose of calling get_futex_key_refs
349 * is because we need the barrier for the lockless waiter check.
350 */
351 smp_mb(); /* explicit MB (B) */
346 } 352 }
347} 353}
348 354
349/* 355/*
350 * Drop a reference to the resource addressed by a key. 356 * Drop a reference to the resource addressed by a key.
351 * The hash bucket spinlock must not be held. 357 * The hash bucket spinlock must not be held. This is
358 * a no-op for private futexes, see comment in the get
359 * counterpart.
352 */ 360 */
353static void drop_futex_key_refs(union futex_key *key) 361static void drop_futex_key_refs(union futex_key *key)
354{ 362{
@@ -639,8 +647,14 @@ static struct futex_pi_state * alloc_pi_state(void)
639 return pi_state; 647 return pi_state;
640} 648}
641 649
650/*
651 * Must be called with the hb lock held.
652 */
642static void free_pi_state(struct futex_pi_state *pi_state) 653static void free_pi_state(struct futex_pi_state *pi_state)
643{ 654{
655 if (!pi_state)
656 return;
657
644 if (!atomic_dec_and_test(&pi_state->refcount)) 658 if (!atomic_dec_and_test(&pi_state->refcount))
645 return; 659 return;
646 660
@@ -1519,15 +1533,6 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
1519 } 1533 }
1520 1534
1521retry: 1535retry:
1522 if (pi_state != NULL) {
1523 /*
1524 * We will have to lookup the pi_state again, so free this one
1525 * to keep the accounting correct.
1526 */
1527 free_pi_state(pi_state);
1528 pi_state = NULL;
1529 }
1530
1531 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ); 1536 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
1532 if (unlikely(ret != 0)) 1537 if (unlikely(ret != 0))
1533 goto out; 1538 goto out;
@@ -1617,6 +1622,8 @@ retry_private:
1617 case 0: 1622 case 0:
1618 break; 1623 break;
1619 case -EFAULT: 1624 case -EFAULT:
1625 free_pi_state(pi_state);
1626 pi_state = NULL;
1620 double_unlock_hb(hb1, hb2); 1627 double_unlock_hb(hb1, hb2);
1621 hb_waiters_dec(hb2); 1628 hb_waiters_dec(hb2);
1622 put_futex_key(&key2); 1629 put_futex_key(&key2);
@@ -1632,6 +1639,8 @@ retry_private:
1632 * exit to complete. 1639 * exit to complete.
1633 * - The user space value changed. 1640 * - The user space value changed.
1634 */ 1641 */
1642 free_pi_state(pi_state);
1643 pi_state = NULL;
1635 double_unlock_hb(hb1, hb2); 1644 double_unlock_hb(hb1, hb2);
1636 hb_waiters_dec(hb2); 1645 hb_waiters_dec(hb2);
1637 put_futex_key(&key2); 1646 put_futex_key(&key2);
@@ -1708,6 +1717,7 @@ retry_private:
1708 } 1717 }
1709 1718
1710out_unlock: 1719out_unlock:
1720 free_pi_state(pi_state);
1711 double_unlock_hb(hb1, hb2); 1721 double_unlock_hb(hb1, hb2);
1712 hb_waiters_dec(hb2); 1722 hb_waiters_dec(hb2);
1713 1723
@@ -1725,8 +1735,6 @@ out_put_keys:
1725out_put_key1: 1735out_put_key1:
1726 put_futex_key(&key1); 1736 put_futex_key(&key1);
1727out: 1737out:
1728 if (pi_state != NULL)
1729 free_pi_state(pi_state);
1730 return ret ? ret : task_count; 1738 return ret ? ret : task_count;
1731} 1739}
1732 1740
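The explicit smp_mb() added for private futexes above closes a store-buffering style race between the waiter count and the futex word: each side must publish its write before reading the other's. A compact C11 illustration of the two-sided pattern (variables hypothetical, not the futex code itself):

	#include <stdatomic.h>

	static atomic_int waiters;	/* X: tasks queued on the hash bucket */
	static atomic_int futex_word;	/* Y: the user-visible futex value */

	/* Waiter side: bump the count (MB (A)), then re-check the value. */
	static int waiter_should_sleep(int expected)
	{
		atomic_fetch_add(&waiters, 1);
		return atomic_load(&futex_word) == expected;
	}

	/* Waker side: store the new value, full barrier (MB (B)), then look for waiters. */
	static int waker_sees_waiters(int newval)
	{
		atomic_store(&futex_word, newval);
		atomic_thread_fence(memory_order_seq_cst);
		return atomic_load(&waiters) != 0;
	}

With both barriers in place the waiter cannot sleep on the stale value while the waker simultaneously concludes there is nobody to wake; dropping either one reintroduces the lost-wakeup window the comment block describes.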
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index d04ce8ac4399..c92e44855ddd 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -32,10 +32,13 @@ config GCOV_KERNEL
32 Note that the debugfs filesystem has to be mounted to access 32 Note that the debugfs filesystem has to be mounted to access
33 profiling data. 33 profiling data.
34 34
35config ARCH_HAS_GCOV_PROFILE_ALL
36 def_bool n
37
35config GCOV_PROFILE_ALL 38config GCOV_PROFILE_ALL
36 bool "Profile entire Kernel" 39 bool "Profile entire Kernel"
37 depends on GCOV_KERNEL 40 depends on GCOV_KERNEL
38 depends on SUPERH || S390 || X86 || PPC || MICROBLAZE 41 depends on ARCH_HAS_GCOV_PROFILE_ALL
39 default n 42 default n
40 ---help--- 43 ---help---
41 This option activates profiling for the entire kernel. 44 This option activates profiling for the entire kernel.
diff --git a/kernel/groups.c b/kernel/groups.c
index 451698f86cfa..664411f171b5 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -6,6 +6,7 @@
6#include <linux/slab.h> 6#include <linux/slab.h>
7#include <linux/security.h> 7#include <linux/security.h>
8#include <linux/syscalls.h> 8#include <linux/syscalls.h>
9#include <linux/user_namespace.h>
9#include <asm/uaccess.h> 10#include <asm/uaccess.h>
10 11
11/* init to 2 - one for init_task, one to ensure it is never freed */ 12/* init to 2 - one for init_task, one to ensure it is never freed */
@@ -213,6 +214,14 @@ out:
213 return i; 214 return i;
214} 215}
215 216
217bool may_setgroups(void)
218{
219 struct user_namespace *user_ns = current_user_ns();
220
221 return ns_capable(user_ns, CAP_SETGID) &&
222 userns_may_setgroups(user_ns);
223}
224
216/* 225/*
217 * SMP: Our groups are copy-on-write. We can set them safely 226 * SMP: Our groups are copy-on-write. We can set them safely
218 * without another task interfering. 227 * without another task interfering.
@@ -223,7 +232,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
223 struct group_info *group_info; 232 struct group_info *group_info;
224 int retval; 233 int retval;
225 234
226 if (!ns_capable(current_user_ns(), CAP_SETGID)) 235 if (!may_setgroups())
227 return -EPERM; 236 return -EPERM;
228 if ((unsigned)gidsetsize > NGROUPS_MAX) 237 if ((unsigned)gidsetsize > NGROUPS_MAX)
229 return -EINVAL; 238 return -EINVAL;
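The new may_setgroups() gate means CAP_SETGID alone no longer suffices inside a user namespace; the namespace must also permit setgroups() via the userns_may_setgroups() check added elsewhere in this series. A hedged user-space illustration of the visible effect, assuming an unprivileged caller:

	#define _GNU_SOURCE
	#include <grp.h>
	#include <sched.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		/* Enter a fresh, unprivileged user namespace. */
		if (unshare(CLONE_NEWUSER) != 0) {
			perror("unshare(CLONE_NEWUSER)");
			return 1;
		}

		/*
		 * We now hold CAP_SETGID inside the new namespace, yet setgroups()
		 * is expected to fail with EPERM until the namespace has been
		 * explicitly allowed to use it and a gid mapping is in place.
		 */
		gid_t groups[1] = { 0 };
		if (setgroups(1, groups) != 0)
			perror("setgroups");	/* typically: Operation not permitted */

		return 0;
	}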
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index d269cecdfbf0..9a76e3beda54 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -55,6 +55,24 @@ config GENERIC_IRQ_CHIP
55config IRQ_DOMAIN 55config IRQ_DOMAIN
56 bool 56 bool
57 57
58# Support for hierarchical irq domains
59config IRQ_DOMAIN_HIERARCHY
60 bool
61 select IRQ_DOMAIN
62
63# Generic MSI interrupt support
64config GENERIC_MSI_IRQ
65 bool
66
67# Generic MSI hierarchical interrupt domain support
68config GENERIC_MSI_IRQ_DOMAIN
69 bool
70 select IRQ_DOMAIN_HIERARCHY
71 select GENERIC_MSI_IRQ
72
73config HANDLE_DOMAIN_IRQ
74 bool
75
58config IRQ_DOMAIN_DEBUG 76config IRQ_DOMAIN_DEBUG
59 bool "Expose hardware/virtual IRQ mapping via debugfs" 77 bool "Expose hardware/virtual IRQ mapping via debugfs"
60 depends on IRQ_DOMAIN && DEBUG_FS 78 depends on IRQ_DOMAIN && DEBUG_FS
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index fff17381f0af..d12123526e2b 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -6,3 +6,4 @@ obj-$(CONFIG_IRQ_DOMAIN) += irqdomain.o
6obj-$(CONFIG_PROC_FS) += proc.o 6obj-$(CONFIG_PROC_FS) += proc.o
7obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o 7obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
8obj-$(CONFIG_PM_SLEEP) += pm.o 8obj-$(CONFIG_PM_SLEEP) += pm.o
9obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 6223fab9a9d2..6f1c7a566b95 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -15,6 +15,7 @@
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/kernel_stat.h> 17#include <linux/kernel_stat.h>
18#include <linux/irqdomain.h>
18 19
19#include <trace/events/irq.h> 20#include <trace/events/irq.h>
20 21
@@ -178,6 +179,7 @@ int irq_startup(struct irq_desc *desc, bool resend)
178 irq_state_clr_disabled(desc); 179 irq_state_clr_disabled(desc);
179 desc->depth = 0; 180 desc->depth = 0;
180 181
182 irq_domain_activate_irq(&desc->irq_data);
181 if (desc->irq_data.chip->irq_startup) { 183 if (desc->irq_data.chip->irq_startup) {
182 ret = desc->irq_data.chip->irq_startup(&desc->irq_data); 184 ret = desc->irq_data.chip->irq_startup(&desc->irq_data);
183 irq_state_clr_masked(desc); 185 irq_state_clr_masked(desc);
@@ -199,6 +201,7 @@ void irq_shutdown(struct irq_desc *desc)
199 desc->irq_data.chip->irq_disable(&desc->irq_data); 201 desc->irq_data.chip->irq_disable(&desc->irq_data);
200 else 202 else
201 desc->irq_data.chip->irq_mask(&desc->irq_data); 203 desc->irq_data.chip->irq_mask(&desc->irq_data);
204 irq_domain_deactivate_irq(&desc->irq_data);
202 irq_state_set_masked(desc); 205 irq_state_set_masked(desc);
203} 206}
204 207
@@ -342,6 +345,31 @@ static bool irq_check_poll(struct irq_desc *desc)
342 return irq_wait_for_poll(desc); 345 return irq_wait_for_poll(desc);
343} 346}
344 347
348static bool irq_may_run(struct irq_desc *desc)
349{
350 unsigned int mask = IRQD_IRQ_INPROGRESS | IRQD_WAKEUP_ARMED;
351
352 /*
353 * If the interrupt is not in progress and is not an armed
354 * wakeup interrupt, proceed.
355 */
356 if (!irqd_has_set(&desc->irq_data, mask))
357 return true;
358
359 /*
360 * If the interrupt is an armed wakeup source, mark it pending
361 * and suspended, disable it and notify the pm core about the
362 * event.
363 */
364 if (irq_pm_check_wakeup(desc))
365 return false;
366
367 /*
368 * Handle a potential concurrent poll on a different core.
369 */
370 return irq_check_poll(desc);
371}
372
345/** 373/**
346 * handle_simple_irq - Simple and software-decoded IRQs. 374 * handle_simple_irq - Simple and software-decoded IRQs.
347 * @irq: the interrupt number 375 * @irq: the interrupt number
@@ -359,9 +387,8 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)
359{ 387{
360 raw_spin_lock(&desc->lock); 388 raw_spin_lock(&desc->lock);
361 389
362 if (unlikely(irqd_irq_inprogress(&desc->irq_data))) 390 if (!irq_may_run(desc))
363 if (!irq_check_poll(desc)) 391 goto out_unlock;
364 goto out_unlock;
365 392
366 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); 393 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
367 kstat_incr_irqs_this_cpu(irq, desc); 394 kstat_incr_irqs_this_cpu(irq, desc);
@@ -412,9 +439,8 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
412 raw_spin_lock(&desc->lock); 439 raw_spin_lock(&desc->lock);
413 mask_ack_irq(desc); 440 mask_ack_irq(desc);
414 441
415 if (unlikely(irqd_irq_inprogress(&desc->irq_data))) 442 if (!irq_may_run(desc))
416 if (!irq_check_poll(desc)) 443 goto out_unlock;
417 goto out_unlock;
418 444
419 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); 445 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
420 kstat_incr_irqs_this_cpu(irq, desc); 446 kstat_incr_irqs_this_cpu(irq, desc);
@@ -485,9 +511,8 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
485 511
486 raw_spin_lock(&desc->lock); 512 raw_spin_lock(&desc->lock);
487 513
488 if (unlikely(irqd_irq_inprogress(&desc->irq_data))) 514 if (!irq_may_run(desc))
489 if (!irq_check_poll(desc)) 515 goto out;
490 goto out;
491 516
492 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); 517 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
493 kstat_incr_irqs_this_cpu(irq, desc); 518 kstat_incr_irqs_this_cpu(irq, desc);
@@ -541,19 +566,23 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
541 raw_spin_lock(&desc->lock); 566 raw_spin_lock(&desc->lock);
542 567
543 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); 568 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
569
570 if (!irq_may_run(desc)) {
571 desc->istate |= IRQS_PENDING;
572 mask_ack_irq(desc);
573 goto out_unlock;
574 }
575
544 /* 576 /*
545 * If we're currently running this IRQ, or its disabled, 577 * If it's disabled or no action available then mask it and get
546 * we shouldn't process the IRQ. Mark it pending, handle 578 * out of here.
547 * the necessary masking and go out
548 */ 579 */
549 if (unlikely(irqd_irq_disabled(&desc->irq_data) || 580 if (irqd_irq_disabled(&desc->irq_data) || !desc->action) {
550 irqd_irq_inprogress(&desc->irq_data) || !desc->action)) { 581 desc->istate |= IRQS_PENDING;
551 if (!irq_check_poll(desc)) { 582 mask_ack_irq(desc);
552 desc->istate |= IRQS_PENDING; 583 goto out_unlock;
553 mask_ack_irq(desc);
554 goto out_unlock;
555 }
556 } 584 }
585
557 kstat_incr_irqs_this_cpu(irq, desc); 586 kstat_incr_irqs_this_cpu(irq, desc);
558 587
559 /* Start handling the irq */ 588 /* Start handling the irq */
@@ -602,18 +631,21 @@ void handle_edge_eoi_irq(unsigned int irq, struct irq_desc *desc)
602 raw_spin_lock(&desc->lock); 631 raw_spin_lock(&desc->lock);
603 632
604 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); 633 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
634
635 if (!irq_may_run(desc)) {
636 desc->istate |= IRQS_PENDING;
637 goto out_eoi;
638 }
639
605 /* 640 /*
606 * If we're currently running this IRQ, or its disabled, 641 * If it's disabled or no action available then mask it and get
607 * we shouldn't process the IRQ. Mark it pending, handle 642 * out of here.
608 * the necessary masking and go out
609 */ 643 */
610 if (unlikely(irqd_irq_disabled(&desc->irq_data) || 644 if (irqd_irq_disabled(&desc->irq_data) || !desc->action) {
611 irqd_irq_inprogress(&desc->irq_data) || !desc->action)) { 645 desc->istate |= IRQS_PENDING;
612 if (!irq_check_poll(desc)) { 646 goto out_eoi;
613 desc->istate |= IRQS_PENDING;
614 goto out_eoi;
615 }
616 } 647 }
648
617 kstat_incr_irqs_this_cpu(irq, desc); 649 kstat_incr_irqs_this_cpu(irq, desc);
618 650
619 do { 651 do {
@@ -670,7 +702,7 @@ void handle_percpu_devid_irq(unsigned int irq, struct irq_desc *desc)
670{ 702{
671 struct irq_chip *chip = irq_desc_get_chip(desc); 703 struct irq_chip *chip = irq_desc_get_chip(desc);
672 struct irqaction *action = desc->action; 704 struct irqaction *action = desc->action;
673 void *dev_id = __this_cpu_ptr(action->percpu_dev_id); 705 void *dev_id = raw_cpu_ptr(action->percpu_dev_id);
674 irqreturn_t res; 706 irqreturn_t res;
675 707
676 kstat_incr_irqs_this_cpu(irq, desc); 708 kstat_incr_irqs_this_cpu(irq, desc);
@@ -699,7 +731,30 @@ __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
699 if (!handle) { 731 if (!handle) {
700 handle = handle_bad_irq; 732 handle = handle_bad_irq;
701 } else { 733 } else {
702 if (WARN_ON(desc->irq_data.chip == &no_irq_chip)) 734 struct irq_data *irq_data = &desc->irq_data;
735#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
736 /*
737 * With hierarchical domains we might run into a
738 * situation where the outermost chip is not yet set
739 * up, but the inner chips are there. Instead of
740 * bailing we install the handler, but obviously we
741 * cannot enable/startup the interrupt at this point.
742 */
743 while (irq_data) {
744 if (irq_data->chip != &no_irq_chip)
745 break;
746 /*
747 * Bail out if the outer chip is not set up
748 * and the interrupt is supposed to be started
749 * right away.
750 */
751 if (WARN_ON(is_chained))
752 goto out;
753 /* Try the parent */
754 irq_data = irq_data->parent_data;
755 }
756#endif
757 if (WARN_ON(!irq_data || irq_data->chip == &no_irq_chip))
703 goto out; 758 goto out;
704 } 759 }
705 760
@@ -818,3 +873,105 @@ void irq_cpu_offline(void)
818 raw_spin_unlock_irqrestore(&desc->lock, flags); 873 raw_spin_unlock_irqrestore(&desc->lock, flags);
819 } 874 }
820} 875}
876
877#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
878/**
879 * irq_chip_ack_parent - Acknowledge the parent interrupt
880 * @data: Pointer to interrupt specific data
881 */
882void irq_chip_ack_parent(struct irq_data *data)
883{
884 data = data->parent_data;
885 data->chip->irq_ack(data);
886}
887
888/**
889 * irq_chip_mask_parent - Mask the parent interrupt
890 * @data: Pointer to interrupt specific data
891 */
892void irq_chip_mask_parent(struct irq_data *data)
893{
894 data = data->parent_data;
895 data->chip->irq_mask(data);
896}
897
898/**
899 * irq_chip_unmask_parent - Unmask the parent interrupt
900 * @data: Pointer to interrupt specific data
901 */
902void irq_chip_unmask_parent(struct irq_data *data)
903{
904 data = data->parent_data;
905 data->chip->irq_unmask(data);
906}
907
908/**
909 * irq_chip_eoi_parent - Invoke EOI on the parent interrupt
910 * @data: Pointer to interrupt specific data
911 */
912void irq_chip_eoi_parent(struct irq_data *data)
913{
914 data = data->parent_data;
915 data->chip->irq_eoi(data);
916}
917
918/**
919 * irq_chip_set_affinity_parent - Set affinity on the parent interrupt
920 * @data: Pointer to interrupt specific data
921 * @dest: The affinity mask to set
922 * @force: Flag to enforce setting (disable online checks)
923 *
924 * Conditional, as the underlying parent chip might not implement it.
925 */
926int irq_chip_set_affinity_parent(struct irq_data *data,
927 const struct cpumask *dest, bool force)
928{
929 data = data->parent_data;
930 if (data->chip->irq_set_affinity)
931 return data->chip->irq_set_affinity(data, dest, force);
932
933 return -ENOSYS;
934}
935
936/**
937 * irq_chip_retrigger_hierarchy - Retrigger an interrupt in hardware
938 * @data: Pointer to interrupt specific data
939 *
940 * Iterate through the domain hierarchy of the interrupt and check
941 * whether a hw retrigger function exists. If yes, invoke it.
942 */
943int irq_chip_retrigger_hierarchy(struct irq_data *data)
944{
945 for (data = data->parent_data; data; data = data->parent_data)
946 if (data->chip && data->chip->irq_retrigger)
947 return data->chip->irq_retrigger(data);
948
949 return -ENOSYS;
950}
951#endif
952
953/**
954 * irq_chip_compose_msi_msg - Compose msi message for an irq chip
955 * @data: Pointer to interrupt specific data
956 * @msg: Pointer to the MSI message
957 *
958 * For hierarchical domains we find the first chip in the hierarchy
959 * which implements the irq_compose_msi_msg callback. For
960 * non-hierarchical domains we use the top level chip.
961 */
962int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
963{
964 struct irq_data *pos = NULL;
965
966#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
967 for (; data; data = data->parent_data)
968#endif
969 if (data->chip && data->chip->irq_compose_msi_msg)
970 pos = data;
971 if (!pos)
972 return -ENOSYS;
973
974 pos->chip->irq_compose_msi_msg(pos, msg);
975
976 return 0;
977}
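For a chip that sits at the bottom of a hierarchical domain, the *_parent helpers added above are intended to be dropped straight into its struct irq_chip so acks, masking, EOI and affinity fall through to the parent (a GIC, for example). A minimal sketch under that assumption; the chip name is invented:

	#include <linux/irq.h>

	static struct irq_chip example_msi_bottom_chip = {
		.name			= "example-msi",
		/* Delegate the basic flow operations to the parent domain's chip. */
		.irq_ack		= irq_chip_ack_parent,
		.irq_mask		= irq_chip_mask_parent,
		.irq_unmask		= irq_chip_unmask_parent,
		.irq_eoi		= irq_chip_eoi_parent,
		.irq_set_affinity	= irq_chip_set_affinity_parent,
		.irq_retrigger		= irq_chip_retrigger_hierarchy,
	};

irq_chip_compose_msi_msg(), by contrast, is meant to be called by core MSI code on behalf of whichever chip in the hierarchy actually knows how to build the message, so it is not assigned here.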
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index 1ef0606797c9..d5d0f7345c54 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -38,7 +38,7 @@ static int devm_irq_match(struct device *dev, void *res, void *data)
38 * 38 *
39 * Except for the extra @dev argument, this function takes the 39 * Except for the extra @dev argument, this function takes the
40 * same arguments and performs the same function as 40 * same arguments and performs the same function as
41 * request_irq(). IRQs requested with this function will be 41 * request_threaded_irq(). IRQs requested with this function will be
42 * automatically freed on driver detach. 42 * automatically freed on driver detach.
43 * 43 *
44 * If an IRQ allocated with this function needs to be freed 44 * If an IRQ allocated with this function needs to be freed
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index cf80e7b0ddab..61024e8abdef 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -39,7 +39,7 @@ void irq_gc_mask_disable_reg(struct irq_data *d)
39 u32 mask = d->mask; 39 u32 mask = d->mask;
40 40
41 irq_gc_lock(gc); 41 irq_gc_lock(gc);
42 irq_reg_writel(mask, gc->reg_base + ct->regs.disable); 42 irq_reg_writel(gc, mask, ct->regs.disable);
43 *ct->mask_cache &= ~mask; 43 *ct->mask_cache &= ~mask;
44 irq_gc_unlock(gc); 44 irq_gc_unlock(gc);
45} 45}
@@ -59,7 +59,7 @@ void irq_gc_mask_set_bit(struct irq_data *d)
59 59
60 irq_gc_lock(gc); 60 irq_gc_lock(gc);
61 *ct->mask_cache |= mask; 61 *ct->mask_cache |= mask;
62 irq_reg_writel(*ct->mask_cache, gc->reg_base + ct->regs.mask); 62 irq_reg_writel(gc, *ct->mask_cache, ct->regs.mask);
63 irq_gc_unlock(gc); 63 irq_gc_unlock(gc);
64} 64}
65EXPORT_SYMBOL_GPL(irq_gc_mask_set_bit); 65EXPORT_SYMBOL_GPL(irq_gc_mask_set_bit);
@@ -79,7 +79,7 @@ void irq_gc_mask_clr_bit(struct irq_data *d)
79 79
80 irq_gc_lock(gc); 80 irq_gc_lock(gc);
81 *ct->mask_cache &= ~mask; 81 *ct->mask_cache &= ~mask;
82 irq_reg_writel(*ct->mask_cache, gc->reg_base + ct->regs.mask); 82 irq_reg_writel(gc, *ct->mask_cache, ct->regs.mask);
83 irq_gc_unlock(gc); 83 irq_gc_unlock(gc);
84} 84}
85EXPORT_SYMBOL_GPL(irq_gc_mask_clr_bit); 85EXPORT_SYMBOL_GPL(irq_gc_mask_clr_bit);
@@ -98,7 +98,7 @@ void irq_gc_unmask_enable_reg(struct irq_data *d)
98 u32 mask = d->mask; 98 u32 mask = d->mask;
99 99
100 irq_gc_lock(gc); 100 irq_gc_lock(gc);
101 irq_reg_writel(mask, gc->reg_base + ct->regs.enable); 101 irq_reg_writel(gc, mask, ct->regs.enable);
102 *ct->mask_cache |= mask; 102 *ct->mask_cache |= mask;
103 irq_gc_unlock(gc); 103 irq_gc_unlock(gc);
104} 104}
@@ -114,7 +114,7 @@ void irq_gc_ack_set_bit(struct irq_data *d)
114 u32 mask = d->mask; 114 u32 mask = d->mask;
115 115
116 irq_gc_lock(gc); 116 irq_gc_lock(gc);
117 irq_reg_writel(mask, gc->reg_base + ct->regs.ack); 117 irq_reg_writel(gc, mask, ct->regs.ack);
118 irq_gc_unlock(gc); 118 irq_gc_unlock(gc);
119} 119}
120EXPORT_SYMBOL_GPL(irq_gc_ack_set_bit); 120EXPORT_SYMBOL_GPL(irq_gc_ack_set_bit);
@@ -130,7 +130,7 @@ void irq_gc_ack_clr_bit(struct irq_data *d)
130 u32 mask = ~d->mask; 130 u32 mask = ~d->mask;
131 131
132 irq_gc_lock(gc); 132 irq_gc_lock(gc);
133 irq_reg_writel(mask, gc->reg_base + ct->regs.ack); 133 irq_reg_writel(gc, mask, ct->regs.ack);
134 irq_gc_unlock(gc); 134 irq_gc_unlock(gc);
135} 135}
136 136
@@ -145,8 +145,8 @@ void irq_gc_mask_disable_reg_and_ack(struct irq_data *d)
145 u32 mask = d->mask; 145 u32 mask = d->mask;
146 146
147 irq_gc_lock(gc); 147 irq_gc_lock(gc);
148 irq_reg_writel(mask, gc->reg_base + ct->regs.mask); 148 irq_reg_writel(gc, mask, ct->regs.mask);
149 irq_reg_writel(mask, gc->reg_base + ct->regs.ack); 149 irq_reg_writel(gc, mask, ct->regs.ack);
150 irq_gc_unlock(gc); 150 irq_gc_unlock(gc);
151} 151}
152 152
@@ -161,7 +161,7 @@ void irq_gc_eoi(struct irq_data *d)
161 u32 mask = d->mask; 161 u32 mask = d->mask;
162 162
163 irq_gc_lock(gc); 163 irq_gc_lock(gc);
164 irq_reg_writel(mask, gc->reg_base + ct->regs.eoi); 164 irq_reg_writel(gc, mask, ct->regs.eoi);
165 irq_gc_unlock(gc); 165 irq_gc_unlock(gc);
166} 166}
167 167
@@ -191,6 +191,16 @@ int irq_gc_set_wake(struct irq_data *d, unsigned int on)
191 return 0; 191 return 0;
192} 192}
193 193
194static u32 irq_readl_be(void __iomem *addr)
195{
196 return ioread32be(addr);
197}
198
199static void irq_writel_be(u32 val, void __iomem *addr)
200{
201 iowrite32be(val, addr);
202}
203
194static void 204static void
195irq_init_generic_chip(struct irq_chip_generic *gc, const char *name, 205irq_init_generic_chip(struct irq_chip_generic *gc, const char *name,
196 int num_ct, unsigned int irq_base, 206 int num_ct, unsigned int irq_base,
@@ -245,7 +255,7 @@ irq_gc_init_mask_cache(struct irq_chip_generic *gc, enum irq_gc_flags flags)
245 } 255 }
246 ct[i].mask_cache = mskptr; 256 ct[i].mask_cache = mskptr;
247 if (flags & IRQ_GC_INIT_MASK_CACHE) 257 if (flags & IRQ_GC_INIT_MASK_CACHE)
248 *mskptr = irq_reg_readl(gc->reg_base + mskreg); 258 *mskptr = irq_reg_readl(gc, mskreg);
249 } 259 }
250} 260}
251 261
@@ -300,7 +310,13 @@ int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,
300 dgc->gc[i] = gc = tmp; 310 dgc->gc[i] = gc = tmp;
301 irq_init_generic_chip(gc, name, num_ct, i * irqs_per_chip, 311 irq_init_generic_chip(gc, name, num_ct, i * irqs_per_chip,
302 NULL, handler); 312 NULL, handler);
313
303 gc->domain = d; 314 gc->domain = d;
315 if (gcflags & IRQ_GC_BE_IO) {
316 gc->reg_readl = &irq_readl_be;
317 gc->reg_writel = &irq_writel_be;
318 }
319
304 raw_spin_lock_irqsave(&gc_lock, flags); 320 raw_spin_lock_irqsave(&gc_lock, flags);
305 list_add_tail(&gc->list, &gc_list); 321 list_add_tail(&gc->list, &gc_list);
306 raw_spin_unlock_irqrestore(&gc_lock, flags); 322 raw_spin_unlock_irqrestore(&gc_lock, flags);
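With irq_reg_readl()/irq_reg_writel() now taking the generic chip plus a register offset, a controller with big-endian registers only needs to pass the new IRQ_GC_BE_IO flag and the accessors above are swapped in for it. A rough sketch of a driver using that; the domain, base address and register offsets are placeholders:

	#include <linux/irq.h>
	#include <linux/irqdomain.h>

	static int example_intc_init(struct irq_domain *domain, void __iomem *base)
	{
		struct irq_chip_generic *gc;
		int ret;

		/* One generic chip, one chip type, big-endian MMIO accessors. */
		ret = irq_alloc_domain_generic_chips(domain, 32, 1, "example-intc",
						     handle_level_irq, 0, 0,
						     IRQ_GC_INIT_MASK_CACHE | IRQ_GC_BE_IO);
		if (ret)
			return ret;

		gc = irq_get_domain_generic_chip(domain, 0);
		gc->reg_base = base;
		gc->chip_types[0].regs.enable  = 0x04;	/* placeholder offsets */
		gc->chip_types[0].regs.disable = 0x08;
		gc->chip_types[0].regs.ack     = 0x0c;
		gc->chip_types[0].chip.irq_unmask = irq_gc_unmask_enable_reg;
		gc->chip_types[0].chip.irq_mask   = irq_gc_mask_disable_reg;
		gc->chip_types[0].chip.irq_ack    = irq_gc_ack_set_bit;

		return 0;
	}

All register accesses issued by those callbacks then go through gc->reg_writel/gc->reg_readl, i.e. the big-endian iowrite32be()/ioread32be() helpers registered above when IRQ_GC_BE_IO is set.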
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 099ea2e0eb88..df553b0af936 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -63,8 +63,8 @@ enum {
63 63
64extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, 64extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
65 unsigned long flags); 65 unsigned long flags);
66extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); 66extern void __disable_irq(struct irq_desc *desc, unsigned int irq);
67extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); 67extern void __enable_irq(struct irq_desc *desc, unsigned int irq);
68 68
69extern int irq_startup(struct irq_desc *desc, bool resend); 69extern int irq_startup(struct irq_desc *desc, bool resend);
70extern void irq_shutdown(struct irq_desc *desc); 70extern void irq_shutdown(struct irq_desc *desc);
@@ -78,8 +78,12 @@ extern void unmask_threaded_irq(struct irq_desc *desc);
78 78
79#ifdef CONFIG_SPARSE_IRQ 79#ifdef CONFIG_SPARSE_IRQ
80static inline void irq_mark_irq(unsigned int irq) { } 80static inline void irq_mark_irq(unsigned int irq) { }
81extern void irq_lock_sparse(void);
82extern void irq_unlock_sparse(void);
81#else 83#else
82extern void irq_mark_irq(unsigned int irq); 84extern void irq_mark_irq(unsigned int irq);
85static inline void irq_lock_sparse(void) { }
86static inline void irq_unlock_sparse(void) { }
83#endif 87#endif
84 88
85extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); 89extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
@@ -194,3 +198,15 @@ static inline void kstat_incr_irqs_this_cpu(unsigned int irq, struct irq_desc *d
194 __this_cpu_inc(*desc->kstat_irqs); 198 __this_cpu_inc(*desc->kstat_irqs);
195 __this_cpu_inc(kstat.irqs_sum); 199 __this_cpu_inc(kstat.irqs_sum);
196} 200}
201
202#ifdef CONFIG_PM_SLEEP
203bool irq_pm_check_wakeup(struct irq_desc *desc);
204void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action);
205void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action);
206#else
207static inline bool irq_pm_check_wakeup(struct irq_desc *desc) { return false; }
208static inline void
209irq_pm_install_action(struct irq_desc *desc, struct irqaction *action) { }
210static inline void
211irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action) { }
212#endif
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 1487a123db5c..99793b9b6d23 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -14,6 +14,7 @@
14#include <linux/kernel_stat.h> 14#include <linux/kernel_stat.h>
15#include <linux/radix-tree.h> 15#include <linux/radix-tree.h>
16#include <linux/bitmap.h> 16#include <linux/bitmap.h>
17#include <linux/irqdomain.h>
17 18
18#include "internals.h" 19#include "internals.h"
19 20
@@ -131,6 +132,16 @@ static void free_masks(struct irq_desc *desc)
131static inline void free_masks(struct irq_desc *desc) { } 132static inline void free_masks(struct irq_desc *desc) { }
132#endif 133#endif
133 134
135void irq_lock_sparse(void)
136{
137 mutex_lock(&sparse_irq_lock);
138}
139
140void irq_unlock_sparse(void)
141{
142 mutex_unlock(&sparse_irq_lock);
143}
144
134static struct irq_desc *alloc_desc(int irq, int node, struct module *owner) 145static struct irq_desc *alloc_desc(int irq, int node, struct module *owner)
135{ 146{
136 struct irq_desc *desc; 147 struct irq_desc *desc;
@@ -167,6 +178,12 @@ static void free_desc(unsigned int irq)
167 178
168 unregister_irq_proc(irq, desc); 179 unregister_irq_proc(irq, desc);
169 180
181 /*
182 * sparse_irq_lock protects also show_interrupts() and
183 * kstat_irq_usr(). Once we deleted the descriptor from the
184 * sparse tree we can free it. Access in proc will fail to
185 * lookup the descriptor.
186 */
170 mutex_lock(&sparse_irq_lock); 187 mutex_lock(&sparse_irq_lock);
171 delete_irq_desc(irq); 188 delete_irq_desc(irq);
172 mutex_unlock(&sparse_irq_lock); 189 mutex_unlock(&sparse_irq_lock);
@@ -336,6 +353,47 @@ int generic_handle_irq(unsigned int irq)
336} 353}
337EXPORT_SYMBOL_GPL(generic_handle_irq); 354EXPORT_SYMBOL_GPL(generic_handle_irq);
338 355
356#ifdef CONFIG_HANDLE_DOMAIN_IRQ
357/**
358 * __handle_domain_irq - Invoke the handler for a HW irq belonging to a domain
359 * @domain: The domain where to perform the lookup
360 * @hwirq: The HW irq number to convert to a logical one
361 * @lookup: Whether to perform the domain lookup or not
362 * @regs: Register file coming from the low-level handling code
363 *
364 * Returns: 0 on success, or -EINVAL if conversion has failed
365 */
366int __handle_domain_irq(struct irq_domain *domain, unsigned int hwirq,
367 bool lookup, struct pt_regs *regs)
368{
369 struct pt_regs *old_regs = set_irq_regs(regs);
370 unsigned int irq = hwirq;
371 int ret = 0;
372
373 irq_enter();
374
375#ifdef CONFIG_IRQ_DOMAIN
376 if (lookup)
377 irq = irq_find_mapping(domain, hwirq);
378#endif
379
380 /*
381 * Some hardware gives randomly wrong interrupts. Rather
382 * than crashing, do something sensible.
383 */
384 if (unlikely(!irq || irq >= nr_irqs)) {
385 ack_bad_irq(irq);
386 ret = -EINVAL;
387 } else {
388 generic_handle_irq(irq);
389 }
390
391 irq_exit();
392 set_irq_regs(old_regs);
393 return ret;
394}
395#endif
396
339/* Dynamic interrupt handling */ 397/* Dynamic interrupt handling */
340 398
341/** 399/**
@@ -532,6 +590,15 @@ void kstat_incr_irq_this_cpu(unsigned int irq)
532 kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq)); 590 kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
533} 591}
534 592
593/**
594 * kstat_irqs_cpu - Get the statistics for an interrupt on a cpu
595 * @irq: The interrupt number
596 * @cpu: The cpu number
597 *
598 * Returns the sum of interrupt counts on @cpu since boot for
599 * @irq. The caller must ensure that the interrupt is not removed
600 * concurrently.
601 */
535unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) 602unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
536{ 603{
537 struct irq_desc *desc = irq_to_desc(irq); 604 struct irq_desc *desc = irq_to_desc(irq);
@@ -540,6 +607,14 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
540 *per_cpu_ptr(desc->kstat_irqs, cpu) : 0; 607 *per_cpu_ptr(desc->kstat_irqs, cpu) : 0;
541} 608}
542 609
610/**
611 * kstat_irqs - Get the statistics for an interrupt
612 * @irq: The interrupt number
613 *
614 * Returns the sum of interrupt counts on all cpus since boot for
615 * @irq. The caller must ensure that the interrupt is not removed
616 * concurrently.
617 */
543unsigned int kstat_irqs(unsigned int irq) 618unsigned int kstat_irqs(unsigned int irq)
544{ 619{
545 struct irq_desc *desc = irq_to_desc(irq); 620 struct irq_desc *desc = irq_to_desc(irq);
@@ -552,3 +627,22 @@ unsigned int kstat_irqs(unsigned int irq)
552 sum += *per_cpu_ptr(desc->kstat_irqs, cpu); 627 sum += *per_cpu_ptr(desc->kstat_irqs, cpu);
553 return sum; 628 return sum;
554} 629}
630
631/**
632 * kstat_irqs_usr - Get the statistics for an interrupt
633 * @irq: The interrupt number
634 *
635 * Returns the sum of interrupt counts on all cpus since boot for
636 * @irq. Contrary to kstat_irqs() this can be called from any
637 * preemptible context. It's protected against concurrent removal of
638 * an interrupt descriptor when sparse irqs are enabled.
639 */
640unsigned int kstat_irqs_usr(unsigned int irq)
641{
642 int sum;
643
644 irq_lock_sparse();
645 sum = kstat_irqs(irq);
646 irq_unlock_sparse();
647 return sum;
648}
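__handle_domain_irq() above packages the irq_enter()/domain lookup/generic_handle_irq()/irq_exit() sequence that architecture entry code used to open-code. A sketch of a root controller's top-level handler built on it; the register read and symbol names are placeholders:

	#include <linux/io.h>
	#include <linux/irqdesc.h>
	#include <linux/irqdomain.h>
	#include <linux/ptrace.h>

	static struct irq_domain *example_domain;	/* set up at probe time */
	static void __iomem *example_base;		/* mapped controller registers */

	/* Invoked from the architecture's low-level IRQ entry with the pt_regs. */
	static void example_handle_irq(struct pt_regs *regs)
	{
		/* Placeholder: many controllers expose a claim/pending register. */
		u32 hwirq = readl_relaxed(example_base + 0x10);

		/* Translate hwirq through the domain and run the flow handler. */
		__handle_domain_irq(example_domain, hwirq, true, regs);
	}

On architectures selecting HANDLE_DOMAIN_IRQ such a handler is typically registered with set_handle_irq(), and the handle_domain_irq() convenience wrapper expands to the lookup=true call used here.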
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 6534ff6ce02e..7fac311057b8 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -23,6 +23,10 @@ static DEFINE_MUTEX(irq_domain_mutex);
23static DEFINE_MUTEX(revmap_trees_mutex); 23static DEFINE_MUTEX(revmap_trees_mutex);
24static struct irq_domain *irq_default_domain; 24static struct irq_domain *irq_default_domain;
25 25
26static int irq_domain_alloc_descs(int virq, unsigned int nr_irqs,
27 irq_hw_number_t hwirq, int node);
28static void irq_domain_check_hierarchy(struct irq_domain *domain);
29
26/** 30/**
27 * __irq_domain_add() - Allocate a new irq_domain data structure 31 * __irq_domain_add() - Allocate a new irq_domain data structure
28 * @of_node: optional device-tree node of the interrupt controller 32 * @of_node: optional device-tree node of the interrupt controller
@@ -30,7 +34,7 @@ static struct irq_domain *irq_default_domain;
30 * @hwirq_max: Maximum number of interrupts supported by controller 34 * @hwirq_max: Maximum number of interrupts supported by controller
31 * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no 35 * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no
32 * direct mapping 36 * direct mapping
33 * @ops: map/unmap domain callbacks 37 * @ops: domain callbacks
34 * @host_data: Controller private data pointer 38 * @host_data: Controller private data pointer
35 * 39 *
36 * Allocates and initializes an irq_domain structure. 40 * Allocates and initializes an irq_domain structure.
@@ -56,6 +60,7 @@ struct irq_domain *__irq_domain_add(struct device_node *of_node, int size,
56 domain->hwirq_max = hwirq_max; 60 domain->hwirq_max = hwirq_max;
57 domain->revmap_size = size; 61 domain->revmap_size = size;
58 domain->revmap_direct_max_irq = direct_max; 62 domain->revmap_direct_max_irq = direct_max;
63 irq_domain_check_hierarchy(domain);
59 64
60 mutex_lock(&irq_domain_mutex); 65 mutex_lock(&irq_domain_mutex);
61 list_add(&domain->link, &irq_domain_list); 66 list_add(&domain->link, &irq_domain_list);
@@ -109,7 +114,7 @@ EXPORT_SYMBOL_GPL(irq_domain_remove);
109 * @first_irq: first number of irq block assigned to the domain, 114 * @first_irq: first number of irq block assigned to the domain,
110 * pass zero to assign irqs on-the-fly. If first_irq is non-zero, then 115 * pass zero to assign irqs on-the-fly. If first_irq is non-zero, then
111 * pre-map all of the irqs in the domain to virqs starting at first_irq. 116 * pre-map all of the irqs in the domain to virqs starting at first_irq.
112 * @ops: map/unmap domain callbacks 117 * @ops: domain callbacks
113 * @host_data: Controller private data pointer 118 * @host_data: Controller private data pointer
114 * 119 *
115 * Allocates an irq_domain, and optionally if first_irq is positive then also 120 * Allocates an irq_domain, and optionally if first_irq is positive then also
@@ -174,10 +179,8 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node,
174 179
175 domain = __irq_domain_add(of_node, first_hwirq + size, 180 domain = __irq_domain_add(of_node, first_hwirq + size,
176 first_hwirq + size, 0, ops, host_data); 181 first_hwirq + size, 0, ops, host_data);
177 if (!domain) 182 if (domain)
178 return NULL; 183 irq_domain_associate_many(domain, first_irq, first_hwirq, size);
179
180 irq_domain_associate_many(domain, first_irq, first_hwirq, size);
181 184
182 return domain; 185 return domain;
183} 186}
@@ -388,7 +391,6 @@ EXPORT_SYMBOL_GPL(irq_create_direct_mapping);
388unsigned int irq_create_mapping(struct irq_domain *domain, 391unsigned int irq_create_mapping(struct irq_domain *domain,
389 irq_hw_number_t hwirq) 392 irq_hw_number_t hwirq)
390{ 393{
391 unsigned int hint;
392 int virq; 394 int virq;
393 395
394 pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq); 396 pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq);
@@ -410,12 +412,8 @@ unsigned int irq_create_mapping(struct irq_domain *domain,
410 } 412 }
411 413
412 /* Allocate a virtual interrupt number */ 414 /* Allocate a virtual interrupt number */
413 hint = hwirq % nr_irqs; 415 virq = irq_domain_alloc_descs(-1, 1, hwirq,
414 if (hint == 0) 416 of_node_to_nid(domain->of_node));
415 hint++;
416 virq = irq_alloc_desc_from(hint, of_node_to_nid(domain->of_node));
417 if (virq <= 0)
418 virq = irq_alloc_desc_from(1, of_node_to_nid(domain->of_node));
419 if (virq <= 0) { 417 if (virq <= 0) {
420 pr_debug("-> virq allocation failed\n"); 418 pr_debug("-> virq allocation failed\n");
421 return 0; 419 return 0;
@@ -471,7 +469,7 @@ unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data)
471 struct irq_domain *domain; 469 struct irq_domain *domain;
472 irq_hw_number_t hwirq; 470 irq_hw_number_t hwirq;
473 unsigned int type = IRQ_TYPE_NONE; 471 unsigned int type = IRQ_TYPE_NONE;
474 unsigned int virq; 472 int virq;
475 473
476 domain = irq_data->np ? irq_find_host(irq_data->np) : irq_default_domain; 474 domain = irq_data->np ? irq_find_host(irq_data->np) : irq_default_domain;
477 if (!domain) { 475 if (!domain) {
@@ -489,10 +487,24 @@ unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data)
489 return 0; 487 return 0;
490 } 488 }
491 489
492 /* Create mapping */ 490 if (irq_domain_is_hierarchy(domain)) {
493 virq = irq_create_mapping(domain, hwirq); 491 /*
494 if (!virq) 492 * If we've already configured this interrupt,
495 return virq; 493 * don't do it again, or hell will break loose.
494 */
495 virq = irq_find_mapping(domain, hwirq);
496 if (virq)
497 return virq;
498
499 virq = irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, irq_data);
500 if (virq <= 0)
501 return 0;
502 } else {
503 /* Create mapping */
504 virq = irq_create_mapping(domain, hwirq);
505 if (!virq)
506 return virq;
507 }
496 508
497 /* Set type if specified and different than the current one */ 509 /* Set type if specified and different than the current one */
498 if (type != IRQ_TYPE_NONE && 510 if (type != IRQ_TYPE_NONE &&
@@ -540,8 +552,8 @@ unsigned int irq_find_mapping(struct irq_domain *domain,
540 return 0; 552 return 0;
541 553
542 if (hwirq < domain->revmap_direct_max_irq) { 554 if (hwirq < domain->revmap_direct_max_irq) {
543 data = irq_get_irq_data(hwirq); 555 data = irq_domain_get_irq_data(domain, hwirq);
544 if (data && (data->domain == domain) && (data->hwirq == hwirq)) 556 if (data && data->hwirq == hwirq)
545 return hwirq; 557 return hwirq;
546 } 558 }
547 559
@@ -709,3 +721,518 @@ const struct irq_domain_ops irq_domain_simple_ops = {
709 .xlate = irq_domain_xlate_onetwocell, 721 .xlate = irq_domain_xlate_onetwocell,
710}; 722};
711EXPORT_SYMBOL_GPL(irq_domain_simple_ops); 723EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
724
725static int irq_domain_alloc_descs(int virq, unsigned int cnt,
726 irq_hw_number_t hwirq, int node)
727{
728 unsigned int hint;
729
730 if (virq >= 0) {
731 virq = irq_alloc_descs(virq, virq, cnt, node);
732 } else {
733 hint = hwirq % nr_irqs;
734 if (hint == 0)
735 hint++;
736 virq = irq_alloc_descs_from(hint, cnt, node);
737 if (virq <= 0 && hint > 1)
738 virq = irq_alloc_descs_from(1, cnt, node);
739 }
740
741 return virq;
742}
743
744#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
745/**
746 * irq_domain_add_hierarchy - Add a irqdomain into the hierarchy
747 * @parent: Parent irq domain to associate with the new domain
748 * @flags: Irq domain flags associated to the domain
749 * @size: Size of the domain. See below
750 * @node: Optional device-tree node of the interrupt controller
751 * @ops: Pointer to the interrupt domain callbacks
752 * @host_data: Controller private data pointer
753 *
754 * If @size is 0 a tree domain is created, otherwise a linear domain.
755 *
756 * If successful the parent is associated to the new domain and the
757 * domain flags are set.
758 * Returns pointer to IRQ domain, or NULL on failure.
759 */
760struct irq_domain *irq_domain_add_hierarchy(struct irq_domain *parent,
761 unsigned int flags,
762 unsigned int size,
763 struct device_node *node,
764 const struct irq_domain_ops *ops,
765 void *host_data)
766{
767 struct irq_domain *domain;
768
769 if (size)
770 domain = irq_domain_add_linear(node, size, ops, host_data);
771 else
772 domain = irq_domain_add_tree(node, ops, host_data);
773 if (domain) {
774 domain->parent = parent;
775 domain->flags |= flags;
776 }
777
778 return domain;
779}
780
781static void irq_domain_insert_irq(int virq)
782{
783 struct irq_data *data;
784
785 for (data = irq_get_irq_data(virq); data; data = data->parent_data) {
786 struct irq_domain *domain = data->domain;
787 irq_hw_number_t hwirq = data->hwirq;
788
789 if (hwirq < domain->revmap_size) {
790 domain->linear_revmap[hwirq] = virq;
791 } else {
792 mutex_lock(&revmap_trees_mutex);
793 radix_tree_insert(&domain->revmap_tree, hwirq, data);
794 mutex_unlock(&revmap_trees_mutex);
795 }
796
797 /* If not already assigned, give the domain the chip's name */
798 if (!domain->name && data->chip)
799 domain->name = data->chip->name;
800 }
801
802 irq_clear_status_flags(virq, IRQ_NOREQUEST);
803}
804
805static void irq_domain_remove_irq(int virq)
806{
807 struct irq_data *data;
808
809 irq_set_status_flags(virq, IRQ_NOREQUEST);
810 irq_set_chip_and_handler(virq, NULL, NULL);
811 synchronize_irq(virq);
812 smp_mb();
813
814 for (data = irq_get_irq_data(virq); data; data = data->parent_data) {
815 struct irq_domain *domain = data->domain;
816 irq_hw_number_t hwirq = data->hwirq;
817
818 if (hwirq < domain->revmap_size) {
819 domain->linear_revmap[hwirq] = 0;
820 } else {
821 mutex_lock(&revmap_trees_mutex);
822 radix_tree_delete(&domain->revmap_tree, hwirq);
823 mutex_unlock(&revmap_trees_mutex);
824 }
825 }
826}
827
828static struct irq_data *irq_domain_insert_irq_data(struct irq_domain *domain,
829 struct irq_data *child)
830{
831 struct irq_data *irq_data;
832
833 irq_data = kzalloc_node(sizeof(*irq_data), GFP_KERNEL, child->node);
834 if (irq_data) {
835 child->parent_data = irq_data;
836 irq_data->irq = child->irq;
837 irq_data->node = child->node;
838 irq_data->domain = domain;
839 }
840
841 return irq_data;
842}
843
844static void irq_domain_free_irq_data(unsigned int virq, unsigned int nr_irqs)
845{
846 struct irq_data *irq_data, *tmp;
847 int i;
848
849 for (i = 0; i < nr_irqs; i++) {
850 irq_data = irq_get_irq_data(virq + i);
851 tmp = irq_data->parent_data;
852 irq_data->parent_data = NULL;
853 irq_data->domain = NULL;
854
855 while (tmp) {
856 irq_data = tmp;
857 tmp = tmp->parent_data;
858 kfree(irq_data);
859 }
860 }
861}
862
863static int irq_domain_alloc_irq_data(struct irq_domain *domain,
864 unsigned int virq, unsigned int nr_irqs)
865{
866 struct irq_data *irq_data;
867 struct irq_domain *parent;
868 int i;
869
870 /* The outermost irq_data is embedded in struct irq_desc */
871 for (i = 0; i < nr_irqs; i++) {
872 irq_data = irq_get_irq_data(virq + i);
873 irq_data->domain = domain;
874
875 for (parent = domain->parent; parent; parent = parent->parent) {
876 irq_data = irq_domain_insert_irq_data(parent, irq_data);
877 if (!irq_data) {
878 irq_domain_free_irq_data(virq, i + 1);
879 return -ENOMEM;
880 }
881 }
882 }
883
884 return 0;
885}
886
887/**
888 * irq_domain_get_irq_data - Get irq_data associated with @virq and @domain
889 * @domain: domain to match
890 * @virq: IRQ number to get irq_data
891 */
892struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain,
893 unsigned int virq)
894{
895 struct irq_data *irq_data;
896
897 for (irq_data = irq_get_irq_data(virq); irq_data;
898 irq_data = irq_data->parent_data)
899 if (irq_data->domain == domain)
900 return irq_data;
901
902 return NULL;
903}
904
905/**
906 * irq_domain_set_hwirq_and_chip - Set hwirq and irqchip of @virq at @domain
907 * @domain: Interrupt domain to match
908 * @virq: IRQ number
909 * @hwirq: The hwirq number
910 * @chip: The associated interrupt chip
911 * @chip_data: The associated chip data
912 */
913int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, unsigned int virq,
914 irq_hw_number_t hwirq, struct irq_chip *chip,
915 void *chip_data)
916{
917 struct irq_data *irq_data = irq_domain_get_irq_data(domain, virq);
918
919 if (!irq_data)
920 return -ENOENT;
921
922 irq_data->hwirq = hwirq;
923 irq_data->chip = chip ? chip : &no_irq_chip;
924 irq_data->chip_data = chip_data;
925
926 return 0;
927}
928
929/**
930 * irq_domain_set_info - Set the complete data for a @virq in @domain
931 * @domain: Interrupt domain to match
932 * @virq: IRQ number
933 * @hwirq: The hardware interrupt number
934 * @chip: The associated interrupt chip
935 * @chip_data: The associated interrupt chip data
936 * @handler: The interrupt flow handler
937 * @handler_data: The interrupt flow handler data
938 * @handler_name: The interrupt handler name
939 */
940void irq_domain_set_info(struct irq_domain *domain, unsigned int virq,
941 irq_hw_number_t hwirq, struct irq_chip *chip,
942 void *chip_data, irq_flow_handler_t handler,
943 void *handler_data, const char *handler_name)
944{
945 irq_domain_set_hwirq_and_chip(domain, virq, hwirq, chip, chip_data);
946 __irq_set_handler(virq, handler, 0, handler_name);
947 irq_set_handler_data(virq, handler_data);
948}
949
950/**
951 * irq_domain_reset_irq_data - Clear hwirq, chip and chip_data in @irq_data
952 * @irq_data: The pointer to irq_data
953 */
954void irq_domain_reset_irq_data(struct irq_data *irq_data)
955{
956 irq_data->hwirq = 0;
957 irq_data->chip = &no_irq_chip;
958 irq_data->chip_data = NULL;
959}
960
961/**
962 * irq_domain_free_irqs_common - Clear irq_data and free the parent
963 * @domain: Interrupt domain to match
964 * @virq: IRQ number to start with
965 * @nr_irqs: The number of irqs to free
966 */
967void irq_domain_free_irqs_common(struct irq_domain *domain, unsigned int virq,
968 unsigned int nr_irqs)
969{
970 struct irq_data *irq_data;
971 int i;
972
973 for (i = 0; i < nr_irqs; i++) {
974 irq_data = irq_domain_get_irq_data(domain, virq + i);
975 if (irq_data)
976 irq_domain_reset_irq_data(irq_data);
977 }
978 irq_domain_free_irqs_parent(domain, virq, nr_irqs);
979}
980
981/**
982 * irq_domain_free_irqs_top - Clear handler and handler data, clear irqdata and free parent
983 * @domain: Interrupt domain to match
984 * @virq: IRQ number to start with
985 * @nr_irqs: The number of irqs to free
986 */
987void irq_domain_free_irqs_top(struct irq_domain *domain, unsigned int virq,
988 unsigned int nr_irqs)
989{
990 int i;
991
992 for (i = 0; i < nr_irqs; i++) {
993 irq_set_handler_data(virq + i, NULL);
994 irq_set_handler(virq + i, NULL);
995 }
996 irq_domain_free_irqs_common(domain, virq, nr_irqs);
997}
998
999static bool irq_domain_is_auto_recursive(struct irq_domain *domain)
1000{
1001 return domain->flags & IRQ_DOMAIN_FLAG_AUTO_RECURSIVE;
1002}
1003
1004static void irq_domain_free_irqs_recursive(struct irq_domain *domain,
1005 unsigned int irq_base,
1006 unsigned int nr_irqs)
1007{
1008 domain->ops->free(domain, irq_base, nr_irqs);
1009 if (irq_domain_is_auto_recursive(domain)) {
1010 BUG_ON(!domain->parent);
1011 irq_domain_free_irqs_recursive(domain->parent, irq_base,
1012 nr_irqs);
1013 }
1014}
1015
1016static int irq_domain_alloc_irqs_recursive(struct irq_domain *domain,
1017 unsigned int irq_base,
1018 unsigned int nr_irqs, void *arg)
1019{
1020 int ret = 0;
1021 struct irq_domain *parent = domain->parent;
1022 bool recursive = irq_domain_is_auto_recursive(domain);
1023
1024 BUG_ON(recursive && !parent);
1025 if (recursive)
1026 ret = irq_domain_alloc_irqs_recursive(parent, irq_base,
1027 nr_irqs, arg);
1028 if (ret >= 0)
1029 ret = domain->ops->alloc(domain, irq_base, nr_irqs, arg);
1030 if (ret < 0 && recursive)
1031 irq_domain_free_irqs_recursive(parent, irq_base, nr_irqs);
1032
1033 return ret;
1034}
1035
1036/**
1037 * __irq_domain_alloc_irqs - Allocate IRQs from domain
1038 * @domain: domain to allocate from
1039 * @irq_base: allocate specified IRQ number if irq_base >= 0
1040 * @nr_irqs: number of IRQs to allocate
1041 * @node: NUMA node id for memory allocation
1042 * @arg: domain specific argument
1043 * @realloc: IRQ descriptors have already been allocated if true
1044 *
1045 * Allocate IRQ numbers and initialize all data structures to support
1046 * hierarchy IRQ domains.
1047 * Parameter @realloc is mainly to support legacy IRQs.
1048 * Returns error code or allocated IRQ number
1049 *
1050 * The whole process to setup an IRQ has been split into two steps.
1051 * The first step, __irq_domain_alloc_irqs(), is to allocate IRQ
1052 * descriptor and required hardware resources. The second step,
1053 * irq_domain_activate_irq(), is to program the hardware with preallocated
1054 * resources. In this way, it's easier to rollback when failing to
1055 * allocate resources.
1056 */
1057int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base,
1058 unsigned int nr_irqs, int node, void *arg,
1059 bool realloc)
1060{
1061 int i, ret, virq;
1062
1063 if (domain == NULL) {
1064 domain = irq_default_domain;
1065 if (WARN(!domain, "domain is NULL; cannot allocate IRQ\n"))
1066 return -EINVAL;
1067 }
1068
1069 if (!domain->ops->alloc) {
1070 pr_debug("domain->ops->alloc() is NULL\n");
1071 return -ENOSYS;
1072 }
1073
1074 if (realloc && irq_base >= 0) {
1075 virq = irq_base;
1076 } else {
1077 virq = irq_domain_alloc_descs(irq_base, nr_irqs, 0, node);
1078 if (virq < 0) {
1079 pr_debug("cannot allocate IRQ(base %d, count %d)\n",
1080 irq_base, nr_irqs);
1081 return virq;
1082 }
1083 }
1084
1085 if (irq_domain_alloc_irq_data(domain, virq, nr_irqs)) {
1086 pr_debug("cannot allocate memory for IRQ%d\n", virq);
1087 ret = -ENOMEM;
1088 goto out_free_desc;
1089 }
1090
1091 mutex_lock(&irq_domain_mutex);
1092 ret = irq_domain_alloc_irqs_recursive(domain, virq, nr_irqs, arg);
1093 if (ret < 0) {
1094 mutex_unlock(&irq_domain_mutex);
1095 goto out_free_irq_data;
1096 }
1097 for (i = 0; i < nr_irqs; i++)
1098 irq_domain_insert_irq(virq + i);
1099 mutex_unlock(&irq_domain_mutex);
1100
1101 return virq;
1102
1103out_free_irq_data:
1104 irq_domain_free_irq_data(virq, nr_irqs);
1105out_free_desc:
1106 irq_free_descs(virq, nr_irqs);
1107 return ret;
1108}
1109
1110/**
1111 * irq_domain_free_irqs - Free IRQ number and associated data structures
1112 * @virq: base IRQ number
1113 * @nr_irqs: number of IRQs to free
1114 */
1115void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs)
1116{
1117 struct irq_data *data = irq_get_irq_data(virq);
1118 int i;
1119
1120 if (WARN(!data || !data->domain || !data->domain->ops->free,
1121 "NULL pointer, cannot free irq\n"))
1122 return;
1123
1124 mutex_lock(&irq_domain_mutex);
1125 for (i = 0; i < nr_irqs; i++)
1126 irq_domain_remove_irq(virq + i);
1127 irq_domain_free_irqs_recursive(data->domain, virq, nr_irqs);
1128 mutex_unlock(&irq_domain_mutex);
1129
1130 irq_domain_free_irq_data(virq, nr_irqs);
1131 irq_free_descs(virq, nr_irqs);
1132}
1133
1134/**
1135 * irq_domain_alloc_irqs_parent - Allocate interrupts from parent domain
1136 * @irq_base: Base IRQ number
1137 * @nr_irqs: Number of IRQs to allocate
1138 * @arg: Allocation data (arch/domain specific)
1139 *
1140 * Check whether the domain has been set up recursively. If not, allocate
1141 * through the parent domain.
1142 */
1143int irq_domain_alloc_irqs_parent(struct irq_domain *domain,
1144 unsigned int irq_base, unsigned int nr_irqs,
1145 void *arg)
1146{
1147 /* irq_domain_alloc_irqs_recursive() has called parent's alloc() */
1148 if (irq_domain_is_auto_recursive(domain))
1149 return 0;
1150
1151 domain = domain->parent;
1152 if (domain)
1153 return irq_domain_alloc_irqs_recursive(domain, irq_base,
1154 nr_irqs, arg);
1155 return -ENOSYS;
1156}
1157
1158/**
1159 * irq_domain_free_irqs_parent - Free interrupts from parent domain
1160 * @irq_base: Base IRQ number
1161 * @nr_irqs: Number of IRQs to free
1162 *
1163 * Check whether the domain has been set up recursively. If not, free
1164 * through the parent domain.
1165 */
1166void irq_domain_free_irqs_parent(struct irq_domain *domain,
1167 unsigned int irq_base, unsigned int nr_irqs)
1168{
1169 /* irq_domain_free_irqs_recursive() will call parent's free */
1170 if (!irq_domain_is_auto_recursive(domain) && domain->parent)
1171 irq_domain_free_irqs_recursive(domain->parent, irq_base,
1172 nr_irqs);
1173}
1174
1175/**
1176 * irq_domain_activate_irq - Call domain_ops->activate recursively to activate
1177 * interrupt
1178 * @irq_data: outermost irq_data associated with interrupt
1179 *
1180 * This is the second step to call domain_ops->activate to program interrupt
1181 * controllers, so the interrupt could actually get delivered.
1182 */
1183void irq_domain_activate_irq(struct irq_data *irq_data)
1184{
1185 if (irq_data && irq_data->domain) {
1186 struct irq_domain *domain = irq_data->domain;
1187
1188 if (irq_data->parent_data)
1189 irq_domain_activate_irq(irq_data->parent_data);
1190 if (domain->ops->activate)
1191 domain->ops->activate(domain, irq_data);
1192 }
1193}
1194
1195/**
1196 * irq_domain_deactivate_irq - Call domain_ops->deactivate recursively to
1197 * deactivate interrupt
1198 * @irq_data: outermost irq_data associated with interrupt
1199 *
1200 * It calls domain_ops->deactivate to program interrupt controllers to disable
1201 * interrupt delivery.
1202 */
1203void irq_domain_deactivate_irq(struct irq_data *irq_data)
1204{
1205 if (irq_data && irq_data->domain) {
1206 struct irq_domain *domain = irq_data->domain;
1207
1208 if (domain->ops->deactivate)
1209 domain->ops->deactivate(domain, irq_data);
1210 if (irq_data->parent_data)
1211 irq_domain_deactivate_irq(irq_data->parent_data);
1212 }
1213}
1214
1215static void irq_domain_check_hierarchy(struct irq_domain *domain)
1216{
1217 /* Hierarchy irq_domains must implement callback alloc() */
1218 if (domain->ops->alloc)
1219 domain->flags |= IRQ_DOMAIN_FLAG_HIERARCHY;
1220}
1221#else /* CONFIG_IRQ_DOMAIN_HIERARCHY */
1222/**
1223 * irq_domain_get_irq_data - Get irq_data associated with @virq and @domain
1224 * @domain: domain to match
1225 * @virq: IRQ number to get irq_data
1226 */
1227struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain,
1228 unsigned int virq)
1229{
1230 struct irq_data *irq_data = irq_get_irq_data(virq);
1231
1232 return (irq_data && irq_data->domain == domain) ? irq_data : NULL;
1233}
1234
1235static void irq_domain_check_hierarchy(struct irq_domain *domain)
1236{
1237}
1238#endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */
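A minimal sketch of the hierarchy API added above, not taken from this patch: a stacked irqchip driver allocates through its parent domain and then installs its own chip per interrupt. The parent domain, my_chip, the domain size and the hwirq encoding in arg are assumptions; the .free callback simply reuses irq_domain_free_irqs_common().

#include <linux/irq.h>
#include <linux/irqdomain.h>

static struct irq_chip my_chip = {
	.name = "my-chip",			/* hypothetical irqchip */
};

static int my_domain_alloc(struct irq_domain *domain, unsigned int virq,
			   unsigned int nr_irqs, void *arg)
{
	irq_hw_number_t hwirq = *(irq_hw_number_t *)arg;	/* made-up argument format */
	int i, ret;

	/* Let the parent domain allocate its resources first */
	ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg);
	if (ret < 0)
		return ret;

	for (i = 0; i < nr_irqs; i++)
		irq_domain_set_hwirq_and_chip(domain, virq + i, hwirq + i,
					      &my_chip, NULL);
	return 0;
}

static const struct irq_domain_ops my_domain_ops = {
	.alloc	= my_domain_alloc,
	.free	= irq_domain_free_irqs_common,
};

/* Called from the driver's probe path; the parent domain comes from elsewhere. */
static struct irq_domain *my_create_domain(struct irq_domain *parent,
					   struct device_node *node)
{
	return irq_domain_add_hierarchy(parent, 0, 64, node,
					&my_domain_ops, NULL);
}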
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 3dc6a61bf06a..80692373abd6 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -183,6 +183,7 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
183 ret = chip->irq_set_affinity(data, mask, force); 183 ret = chip->irq_set_affinity(data, mask, force);
184 switch (ret) { 184 switch (ret) {
185 case IRQ_SET_MASK_OK: 185 case IRQ_SET_MASK_OK:
186 case IRQ_SET_MASK_OK_DONE:
186 cpumask_copy(data->affinity, mask); 187 cpumask_copy(data->affinity, mask);
187 case IRQ_SET_MASK_OK_NOCOPY: 188 case IRQ_SET_MASK_OK_NOCOPY:
188 irq_set_thread_affinity(desc); 189 irq_set_thread_affinity(desc);
@@ -382,14 +383,8 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
382} 383}
383#endif 384#endif
384 385
385void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) 386void __disable_irq(struct irq_desc *desc, unsigned int irq)
386{ 387{
387 if (suspend) {
388 if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND))
389 return;
390 desc->istate |= IRQS_SUSPENDED;
391 }
392
393 if (!desc->depth++) 388 if (!desc->depth++)
394 irq_disable(desc); 389 irq_disable(desc);
395} 390}
@@ -401,7 +396,7 @@ static int __disable_irq_nosync(unsigned int irq)
401 396
402 if (!desc) 397 if (!desc)
403 return -EINVAL; 398 return -EINVAL;
404 __disable_irq(desc, irq, false); 399 __disable_irq(desc, irq);
405 irq_put_desc_busunlock(desc, flags); 400 irq_put_desc_busunlock(desc, flags);
406 return 0; 401 return 0;
407} 402}
@@ -442,20 +437,8 @@ void disable_irq(unsigned int irq)
442} 437}
443EXPORT_SYMBOL(disable_irq); 438EXPORT_SYMBOL(disable_irq);
444 439
445void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) 440void __enable_irq(struct irq_desc *desc, unsigned int irq)
446{ 441{
447 if (resume) {
448 if (!(desc->istate & IRQS_SUSPENDED)) {
449 if (!desc->action)
450 return;
451 if (!(desc->action->flags & IRQF_FORCE_RESUME))
452 return;
453 /* Pretend that it got disabled ! */
454 desc->depth++;
455 }
456 desc->istate &= ~IRQS_SUSPENDED;
457 }
458
459 switch (desc->depth) { 442 switch (desc->depth) {
460 case 0: 443 case 0:
461 err_out: 444 err_out:
@@ -497,7 +480,7 @@ void enable_irq(unsigned int irq)
497 KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq)) 480 KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq))
498 goto out; 481 goto out;
499 482
500 __enable_irq(desc, irq, false); 483 __enable_irq(desc, irq);
501out: 484out:
502 irq_put_desc_busunlock(desc, flags); 485 irq_put_desc_busunlock(desc, flags);
503} 486}
@@ -618,6 +601,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
618 601
619 switch (ret) { 602 switch (ret) {
620 case IRQ_SET_MASK_OK: 603 case IRQ_SET_MASK_OK:
604 case IRQ_SET_MASK_OK_DONE:
621 irqd_clear(&desc->irq_data, IRQD_TRIGGER_MASK); 605 irqd_clear(&desc->irq_data, IRQD_TRIGGER_MASK);
622 irqd_set(&desc->irq_data, flags); 606 irqd_set(&desc->irq_data, flags);
623 607
@@ -1218,6 +1202,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1218 new->irq = irq; 1202 new->irq = irq;
1219 *old_ptr = new; 1203 *old_ptr = new;
1220 1204
1205 irq_pm_install_action(desc, new);
1206
1221 /* Reset broken irq detection when installing new handler */ 1207 /* Reset broken irq detection when installing new handler */
1222 desc->irq_count = 0; 1208 desc->irq_count = 0;
1223 desc->irqs_unhandled = 0; 1209 desc->irqs_unhandled = 0;
@@ -1228,7 +1214,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1228 */ 1214 */
1229 if (shared && (desc->istate & IRQS_SPURIOUS_DISABLED)) { 1215 if (shared && (desc->istate & IRQS_SPURIOUS_DISABLED)) {
1230 desc->istate &= ~IRQS_SPURIOUS_DISABLED; 1216 desc->istate &= ~IRQS_SPURIOUS_DISABLED;
1231 __enable_irq(desc, irq, false); 1217 __enable_irq(desc, irq);
1232 } 1218 }
1233 1219
1234 raw_spin_unlock_irqrestore(&desc->lock, flags); 1220 raw_spin_unlock_irqrestore(&desc->lock, flags);
@@ -1336,6 +1322,8 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
1336 /* Found it - now remove it from the list of entries: */ 1322 /* Found it - now remove it from the list of entries: */
1337 *action_ptr = action->next; 1323 *action_ptr = action->next;
1338 1324
1325 irq_pm_remove_action(desc, action);
1326
1339 /* If this was the last handler, shut down the IRQ line: */ 1327 /* If this was the last handler, shut down the IRQ line: */
1340 if (!desc->action) { 1328 if (!desc->action) {
1341 irq_shutdown(desc); 1329 irq_shutdown(desc);
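A hedged sketch, not from this series, of the new IRQ_SET_MASK_OK_DONE return value in use: a parent irqchip that has done all the work, including any message rewrite, returns it so the core still copies the affinity mask while stacked children such as the MSI layer skip their own update. my_hw_route_irq() is a made-up register accessor.

#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/irq.h>

static void my_hw_route_irq(irq_hw_number_t hwirq, unsigned int cpu)
{
	/* Hypothetical: program the routing registers for @hwirq. */
}

static int my_parent_set_affinity(struct irq_data *data,
				  const struct cpumask *mask, bool force)
{
	unsigned int cpu = cpumask_first_and(mask, cpu_online_mask);

	if (cpu >= nr_cpu_ids)
		return -EINVAL;

	my_hw_route_irq(data->hwirq, cpu);
	return IRQ_SET_MASK_OK_DONE;
}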
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
new file mode 100644
index 000000000000..3e18163f336f
--- /dev/null
+++ b/kernel/irq/msi.c
@@ -0,0 +1,330 @@
1/*
2 * linux/kernel/irq/msi.c
3 *
4 * Copyright (C) 2014 Intel Corp.
5 * Author: Jiang Liu <jiang.liu@linux.intel.com>
6 *
7 * This file is licensed under GPLv2.
8 *
9 * This file contains common code to support Message Signalled Interrupt for
10 * PCI compatible and non PCI compatible devices.
11 */
12#include <linux/types.h>
13#include <linux/device.h>
14#include <linux/irq.h>
15#include <linux/irqdomain.h>
16#include <linux/msi.h>
17
18/* Temporary solution for building, will be removed later */
19#include <linux/pci.h>
20
21void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
22{
23 *msg = entry->msg;
24}
25
26void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg)
27{
28 struct msi_desc *entry = irq_get_msi_desc(irq);
29
30 __get_cached_msi_msg(entry, msg);
31}
32EXPORT_SYMBOL_GPL(get_cached_msi_msg);
33
34#ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN
35static inline void irq_chip_write_msi_msg(struct irq_data *data,
36 struct msi_msg *msg)
37{
38 data->chip->irq_write_msi_msg(data, msg);
39}
40
41/**
42 * msi_domain_set_affinity - Generic affinity setter function for MSI domains
43 * @irq_data: The irq data associated to the interrupt
44 * @mask: The affinity mask to set
45 * @force: Flag to enforce setting (disable online checks)
46 *
47 * Intended to be used by MSI interrupt controllers which are
48 * implemented with hierarchical domains.
49 */
50int msi_domain_set_affinity(struct irq_data *irq_data,
51 const struct cpumask *mask, bool force)
52{
53 struct irq_data *parent = irq_data->parent_data;
54 struct msi_msg msg;
55 int ret;
56
57 ret = parent->chip->irq_set_affinity(parent, mask, force);
58 if (ret >= 0 && ret != IRQ_SET_MASK_OK_DONE) {
59 BUG_ON(irq_chip_compose_msi_msg(irq_data, &msg));
60 irq_chip_write_msi_msg(irq_data, &msg);
61 }
62
63 return ret;
64}
65
66static void msi_domain_activate(struct irq_domain *domain,
67 struct irq_data *irq_data)
68{
69 struct msi_msg msg;
70
71 BUG_ON(irq_chip_compose_msi_msg(irq_data, &msg));
72 irq_chip_write_msi_msg(irq_data, &msg);
73}
74
75static void msi_domain_deactivate(struct irq_domain *domain,
76 struct irq_data *irq_data)
77{
78 struct msi_msg msg;
79
80 memset(&msg, 0, sizeof(msg));
81 irq_chip_write_msi_msg(irq_data, &msg);
82}
83
84static int msi_domain_alloc(struct irq_domain *domain, unsigned int virq,
85 unsigned int nr_irqs, void *arg)
86{
87 struct msi_domain_info *info = domain->host_data;
88 struct msi_domain_ops *ops = info->ops;
89 irq_hw_number_t hwirq = ops->get_hwirq(info, arg);
90 int i, ret;
91
92 if (irq_find_mapping(domain, hwirq) > 0)
93 return -EEXIST;
94
95 ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg);
96 if (ret < 0)
97 return ret;
98
99 for (i = 0; i < nr_irqs; i++) {
100 ret = ops->msi_init(domain, info, virq + i, hwirq + i, arg);
101 if (ret < 0) {
102 if (ops->msi_free) {
103 for (i--; i > 0; i--)
104 ops->msi_free(domain, info, virq + i);
105 }
106 irq_domain_free_irqs_top(domain, virq, nr_irqs);
107 return ret;
108 }
109 }
110
111 return 0;
112}
113
114static void msi_domain_free(struct irq_domain *domain, unsigned int virq,
115 unsigned int nr_irqs)
116{
117 struct msi_domain_info *info = domain->host_data;
118 int i;
119
120 if (info->ops->msi_free) {
121 for (i = 0; i < nr_irqs; i++)
122 info->ops->msi_free(domain, info, virq + i);
123 }
124 irq_domain_free_irqs_top(domain, virq, nr_irqs);
125}
126
127static struct irq_domain_ops msi_domain_ops = {
128 .alloc = msi_domain_alloc,
129 .free = msi_domain_free,
130 .activate = msi_domain_activate,
131 .deactivate = msi_domain_deactivate,
132};
133
134#ifdef GENERIC_MSI_DOMAIN_OPS
135static irq_hw_number_t msi_domain_ops_get_hwirq(struct msi_domain_info *info,
136 msi_alloc_info_t *arg)
137{
138 return arg->hwirq;
139}
140
141static int msi_domain_ops_prepare(struct irq_domain *domain, struct device *dev,
142 int nvec, msi_alloc_info_t *arg)
143{
144 memset(arg, 0, sizeof(*arg));
145 return 0;
146}
147
148static void msi_domain_ops_set_desc(msi_alloc_info_t *arg,
149 struct msi_desc *desc)
150{
151 arg->desc = desc;
152}
153#else
154#define msi_domain_ops_get_hwirq NULL
155#define msi_domain_ops_prepare NULL
156#define msi_domain_ops_set_desc NULL
157#endif /* !GENERIC_MSI_DOMAIN_OPS */
158
159static int msi_domain_ops_init(struct irq_domain *domain,
160 struct msi_domain_info *info,
161 unsigned int virq, irq_hw_number_t hwirq,
162 msi_alloc_info_t *arg)
163{
164 irq_domain_set_hwirq_and_chip(domain, virq, hwirq, info->chip,
165 info->chip_data);
166 if (info->handler && info->handler_name) {
167 __irq_set_handler(virq, info->handler, 0, info->handler_name);
168 if (info->handler_data)
169 irq_set_handler_data(virq, info->handler_data);
170 }
171 return 0;
172}
173
174static int msi_domain_ops_check(struct irq_domain *domain,
175 struct msi_domain_info *info,
176 struct device *dev)
177{
178 return 0;
179}
180
181static struct msi_domain_ops msi_domain_ops_default = {
182 .get_hwirq = msi_domain_ops_get_hwirq,
183 .msi_init = msi_domain_ops_init,
184 .msi_check = msi_domain_ops_check,
185 .msi_prepare = msi_domain_ops_prepare,
186 .set_desc = msi_domain_ops_set_desc,
187};
188
189static void msi_domain_update_dom_ops(struct msi_domain_info *info)
190{
191 struct msi_domain_ops *ops = info->ops;
192
193 if (ops == NULL) {
194 info->ops = &msi_domain_ops_default;
195 return;
196 }
197
198 if (ops->get_hwirq == NULL)
199 ops->get_hwirq = msi_domain_ops_default.get_hwirq;
200 if (ops->msi_init == NULL)
201 ops->msi_init = msi_domain_ops_default.msi_init;
202 if (ops->msi_check == NULL)
203 ops->msi_check = msi_domain_ops_default.msi_check;
204 if (ops->msi_prepare == NULL)
205 ops->msi_prepare = msi_domain_ops_default.msi_prepare;
206 if (ops->set_desc == NULL)
207 ops->set_desc = msi_domain_ops_default.set_desc;
208}
209
210static void msi_domain_update_chip_ops(struct msi_domain_info *info)
211{
212 struct irq_chip *chip = info->chip;
213
214 BUG_ON(!chip);
215 if (!chip->irq_mask)
216 chip->irq_mask = pci_msi_mask_irq;
217 if (!chip->irq_unmask)
218 chip->irq_unmask = pci_msi_unmask_irq;
219 if (!chip->irq_set_affinity)
220 chip->irq_set_affinity = msi_domain_set_affinity;
221}
222
223/**
224 * msi_create_irq_domain - Create a MSI interrupt domain
225 * @of_node: Optional device-tree node of the interrupt controller
226 * @info: MSI domain info
227 * @parent: Parent irq domain
228 */
229struct irq_domain *msi_create_irq_domain(struct device_node *node,
230 struct msi_domain_info *info,
231 struct irq_domain *parent)
232{
233 if (info->flags & MSI_FLAG_USE_DEF_DOM_OPS)
234 msi_domain_update_dom_ops(info);
235 if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS)
236 msi_domain_update_chip_ops(info);
237
238 return irq_domain_add_hierarchy(parent, 0, 0, node, &msi_domain_ops,
239 info);
240}
241
242/**
243 * msi_domain_alloc_irqs - Allocate interrupts from a MSI interrupt domain
244 * @domain: The domain to allocate from
245 * @dev: Pointer to device struct of the device for which the interrupts
246 * are allocated
247 * @nvec: The number of interrupts to allocate
248 *
249 * Returns 0 on success or an error code.
250 */
251int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
252 int nvec)
253{
254 struct msi_domain_info *info = domain->host_data;
255 struct msi_domain_ops *ops = info->ops;
256 msi_alloc_info_t arg;
257 struct msi_desc *desc;
258 int i, ret, virq = -1;
259
260 ret = ops->msi_check(domain, info, dev);
261 if (ret == 0)
262 ret = ops->msi_prepare(domain, dev, nvec, &arg);
263 if (ret)
264 return ret;
265
266 for_each_msi_entry(desc, dev) {
267 ops->set_desc(&arg, desc);
268 if (info->flags & MSI_FLAG_IDENTITY_MAP)
269 virq = (int)ops->get_hwirq(info, &arg);
270 else
271 virq = -1;
272
273 virq = __irq_domain_alloc_irqs(domain, virq, desc->nvec_used,
274 dev_to_node(dev), &arg, false);
275 if (virq < 0) {
276 ret = -ENOSPC;
277 if (ops->handle_error)
278 ret = ops->handle_error(domain, desc, ret);
279 if (ops->msi_finish)
280 ops->msi_finish(&arg, ret);
281 return ret;
282 }
283
284 for (i = 0; i < desc->nvec_used; i++)
285 irq_set_msi_desc_off(virq, i, desc);
286 }
287
288 if (ops->msi_finish)
289 ops->msi_finish(&arg, 0);
290
291 for_each_msi_entry(desc, dev) {
292 if (desc->nvec_used == 1)
293 dev_dbg(dev, "irq %d for MSI\n", virq);
294 else
295 dev_dbg(dev, "irq [%d-%d] for MSI\n",
296 virq, virq + desc->nvec_used - 1);
297 }
298
299 return 0;
300}
301
302/**
303 * msi_domain_free_irqs - Free interrupts from an MSI interrupt @domain associated to @dev
304 * @domain: The domain managing the interrupts
305 * @dev: Pointer to device struct of the device for which the interrupts
306 * are freed
307 */
308void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
309{
310 struct msi_desc *desc;
311
312 for_each_msi_entry(desc, dev) {
313 irq_domain_free_irqs(desc->irq, desc->nvec_used);
314 desc->irq = 0;
315 }
316}
317
318/**
319 * msi_get_domain_info - Get the MSI interrupt domain info for @domain
320 * @domain: The interrupt domain to retrieve data from
321 *
322 * Returns the pointer to the msi_domain_info stored in
323 * @domain->host_data.
324 */
325struct msi_domain_info *msi_get_domain_info(struct irq_domain *domain)
326{
327 return (struct msi_domain_info *)domain->host_data;
328}
329
330#endif /* CONFIG_GENERIC_MSI_IRQ_DOMAIN */
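As a rough illustration of the new file's entry points (not part of the patch): an interrupt controller driver can stack a generic MSI domain on top of its own domain along these lines. my_msi_chip, the device-tree node and the parent domain are assumptions; the MSI_FLAG_USE_DEF_* flags let the core fill in the default domain and chip operations shown above.

#include <linux/irq.h>
#include <linux/irqdomain.h>
#include <linux/msi.h>

static struct irq_chip my_msi_chip = {
	.name	= "my-MSI",
	/* mask/unmask/set_affinity left NULL: filled in from the defaults above */
};

static struct msi_domain_info my_msi_info = {
	.flags	= MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS,
	.chip	= &my_msi_chip,
};

/* Hypothetical init path; @parent is the underlying irq domain. */
static struct irq_domain *my_init_msi(struct device_node *node,
				      struct irq_domain *parent)
{
	return msi_create_irq_domain(node, &my_msi_info, parent);
}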
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index abcd6ca86cb7..3ca532592704 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -9,17 +9,105 @@
9#include <linux/irq.h> 9#include <linux/irq.h>
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/interrupt.h> 11#include <linux/interrupt.h>
12#include <linux/suspend.h>
12#include <linux/syscore_ops.h> 13#include <linux/syscore_ops.h>
13 14
14#include "internals.h" 15#include "internals.h"
15 16
17bool irq_pm_check_wakeup(struct irq_desc *desc)
18{
19 if (irqd_is_wakeup_armed(&desc->irq_data)) {
20 irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED);
21 desc->istate |= IRQS_SUSPENDED | IRQS_PENDING;
22 desc->depth++;
23 irq_disable(desc);
24 pm_system_wakeup();
25 return true;
26 }
27 return false;
28}
29
30/*
31 * Called from __setup_irq() with desc->lock held after @action has
32 * been installed in the action chain.
33 */
34void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action)
35{
36 desc->nr_actions++;
37
38 if (action->flags & IRQF_FORCE_RESUME)
39 desc->force_resume_depth++;
40
41 WARN_ON_ONCE(desc->force_resume_depth &&
42 desc->force_resume_depth != desc->nr_actions);
43
44 if (action->flags & IRQF_NO_SUSPEND)
45 desc->no_suspend_depth++;
46
47 WARN_ON_ONCE(desc->no_suspend_depth &&
48 desc->no_suspend_depth != desc->nr_actions);
49}
50
51/*
52 * Called from __free_irq() with desc->lock held after @action has
53 * been removed from the action chain.
54 */
55void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action)
56{
57 desc->nr_actions--;
58
59 if (action->flags & IRQF_FORCE_RESUME)
60 desc->force_resume_depth--;
61
62 if (action->flags & IRQF_NO_SUSPEND)
63 desc->no_suspend_depth--;
64}
65
66static bool suspend_device_irq(struct irq_desc *desc, int irq)
67{
68 if (!desc->action || desc->no_suspend_depth)
69 return false;
70
71 if (irqd_is_wakeup_set(&desc->irq_data)) {
72 irqd_set(&desc->irq_data, IRQD_WAKEUP_ARMED);
73 /*
74 * We return true here to force the caller to issue
75 * synchronize_irq(). We need to make sure that the
76 * IRQD_WAKEUP_ARMED is visible before we return from
77 * suspend_device_irqs().
78 */
79 return true;
80 }
81
82 desc->istate |= IRQS_SUSPENDED;
83 __disable_irq(desc, irq);
84
85 /*
86 * Hardware which has no wakeup source configuration facility
87 * requires that the non wakeup interrupts are masked at the
88 * chip level. The chip implementation indicates that with
89 * IRQCHIP_MASK_ON_SUSPEND.
90 */
91 if (irq_desc_get_chip(desc)->flags & IRQCHIP_MASK_ON_SUSPEND)
92 mask_irq(desc);
93 return true;
94}
95
16/** 96/**
17 * suspend_device_irqs - disable all currently enabled interrupt lines 97 * suspend_device_irqs - disable all currently enabled interrupt lines
18 * 98 *
19 * During system-wide suspend or hibernation device drivers need to be prevented 99 * During system-wide suspend or hibernation device drivers need to be
20 * from receiving interrupts and this function is provided for this purpose. 100 * prevented from receiving interrupts and this function is provided
21 * It marks all interrupt lines in use, except for the timer ones, as disabled 101 * for this purpose.
22 * and sets the IRQS_SUSPENDED flag for each of them. 102 *
103 * So we disable all interrupts and mark them IRQS_SUSPENDED except
104 * for those which are unused, those which are marked as not
105 * suspendable via an interrupt request with the flag IRQF_NO_SUSPEND
106 * set and those which are marked as active wakeup sources.
107 *
108 * The active wakeup sources are handled by the flow handler entry
109 * code which checks for the IRQD_WAKEUP_ARMED flag, suspends the
110 * interrupt and notifies the pm core about the wakeup.
23 */ 111 */
24void suspend_device_irqs(void) 112void suspend_device_irqs(void)
25{ 113{
@@ -28,18 +116,36 @@ void suspend_device_irqs(void)
28 116
29 for_each_irq_desc(irq, desc) { 117 for_each_irq_desc(irq, desc) {
30 unsigned long flags; 118 unsigned long flags;
119 bool sync;
31 120
32 raw_spin_lock_irqsave(&desc->lock, flags); 121 raw_spin_lock_irqsave(&desc->lock, flags);
33 __disable_irq(desc, irq, true); 122 sync = suspend_device_irq(desc, irq);
34 raw_spin_unlock_irqrestore(&desc->lock, flags); 123 raw_spin_unlock_irqrestore(&desc->lock, flags);
35 }
36 124
37 for_each_irq_desc(irq, desc) 125 if (sync)
38 if (desc->istate & IRQS_SUSPENDED)
39 synchronize_irq(irq); 126 synchronize_irq(irq);
127 }
40} 128}
41EXPORT_SYMBOL_GPL(suspend_device_irqs); 129EXPORT_SYMBOL_GPL(suspend_device_irqs);
42 130
131static void resume_irq(struct irq_desc *desc, int irq)
132{
133 irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED);
134
135 if (desc->istate & IRQS_SUSPENDED)
136 goto resume;
137
138 /* Force resume the interrupt? */
139 if (!desc->force_resume_depth)
140 return;
141
142 /* Pretend that it got disabled ! */
143 desc->depth++;
144resume:
145 desc->istate &= ~IRQS_SUSPENDED;
146 __enable_irq(desc, irq);
147}
148
43static void resume_irqs(bool want_early) 149static void resume_irqs(bool want_early)
44{ 150{
45 struct irq_desc *desc; 151 struct irq_desc *desc;
@@ -54,7 +160,7 @@ static void resume_irqs(bool want_early)
54 continue; 160 continue;
55 161
56 raw_spin_lock_irqsave(&desc->lock, flags); 162 raw_spin_lock_irqsave(&desc->lock, flags);
57 __enable_irq(desc, irq, true); 163 resume_irq(desc, irq);
58 raw_spin_unlock_irqrestore(&desc->lock, flags); 164 raw_spin_unlock_irqrestore(&desc->lock, flags);
59 } 165 }
60} 166}
@@ -93,38 +199,3 @@ void resume_device_irqs(void)
93 resume_irqs(false); 199 resume_irqs(false);
94} 200}
95EXPORT_SYMBOL_GPL(resume_device_irqs); 201EXPORT_SYMBOL_GPL(resume_device_irqs);
96
97/**
98 * check_wakeup_irqs - check if any wake-up interrupts are pending
99 */
100int check_wakeup_irqs(void)
101{
102 struct irq_desc *desc;
103 int irq;
104
105 for_each_irq_desc(irq, desc) {
106 /*
107 * Only interrupts which are marked as wakeup source
108 * and have not been disabled before the suspend check
109 * can abort suspend.
110 */
111 if (irqd_is_wakeup_set(&desc->irq_data)) {
112 if (desc->depth == 1 && desc->istate & IRQS_PENDING)
113 return -EBUSY;
114 continue;
115 }
116 /*
117 * Check the non wakeup interrupts whether they need
118 * to be masked before finally going into suspend
119 * state. That's for hardware which has no wakeup
120 * source configuration facility. The chip
121 * implementation indicates that with
122 * IRQCHIP_MASK_ON_SUSPEND.
123 */
124 if (desc->istate & IRQS_SUSPENDED &&
125 irq_desc_get_chip(desc)->flags & IRQCHIP_MASK_ON_SUSPEND)
126 mask_irq(desc);
127 }
128
129 return 0;
130}
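Not from this patch: a sketch of how a driver interacts with the reworked suspend logic. An IRQF_NO_SUSPEND line keeps firing across suspend_device_irqs(), while a line armed with enable_irq_wake() is suspended but can wake the system. The handlers, device pointer and IRQ numbers are hypothetical.

#include <linux/device.h>
#include <linux/interrupt.h>

static irqreturn_t my_timer_handler(int irq, void *dev_id)
{
	return IRQ_HANDLED;
}

static irqreturn_t my_wake_handler(int irq, void *dev_id)
{
	return IRQ_HANDLED;
}

static int my_setup_irqs(struct device *dev, int timer_irq, int wake_irq)
{
	int ret;

	/* Keeps firing while other device interrupts are suspended */
	ret = devm_request_irq(dev, timer_irq, my_timer_handler,
			       IRQF_NO_SUSPEND, "my-timer", dev);
	if (ret)
		return ret;

	/* Suspended like any other line, but armed as a wakeup source */
	ret = devm_request_irq(dev, wake_irq, my_wake_handler, 0,
			       "my-wake", dev);
	if (ret)
		return ret;

	return enable_irq_wake(wake_irq);
}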
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index ac1ba2f11032..9dc9bfd8a678 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -15,6 +15,23 @@
15 15
16#include "internals.h" 16#include "internals.h"
17 17
18/*
19 * Access rules:
20 *
21 * procfs protects read/write of /proc/irq/N/ files against a
22 * concurrent free of the interrupt descriptor. remove_proc_entry()
23 * immediately prevents new read/writes to happen and waits for
24 * already running read/write functions to complete.
25 *
26 * We remove the proc entries first and then delete the interrupt
27 * descriptor from the radix tree and free it. So it is guaranteed
28 * that irq_to_desc(N) is valid as long as the read/writes are
29 * permitted by procfs.
30 *
31 * The read from /proc/interrupts is a different problem because there
32 * is no protection. So the lookup and the access to irqdesc
33 * information must be protected by sparse_irq_lock.
34 */
18static struct proc_dir_entry *root_irq_dir; 35static struct proc_dir_entry *root_irq_dir;
19 36
20#ifdef CONFIG_SMP 37#ifdef CONFIG_SMP
@@ -437,9 +454,10 @@ int show_interrupts(struct seq_file *p, void *v)
437 seq_putc(p, '\n'); 454 seq_putc(p, '\n');
438 } 455 }
439 456
457 irq_lock_sparse();
440 desc = irq_to_desc(i); 458 desc = irq_to_desc(i);
441 if (!desc) 459 if (!desc)
442 return 0; 460 goto outsparse;
443 461
444 raw_spin_lock_irqsave(&desc->lock, flags); 462 raw_spin_lock_irqsave(&desc->lock, flags);
445 for_each_online_cpu(j) 463 for_each_online_cpu(j)
@@ -479,6 +497,8 @@ int show_interrupts(struct seq_file *p, void *v)
479 seq_putc(p, '\n'); 497 seq_putc(p, '\n');
480out: 498out:
481 raw_spin_unlock_irqrestore(&desc->lock, flags); 499 raw_spin_unlock_irqrestore(&desc->lock, flags);
500outsparse:
501 irq_unlock_sparse();
482 return 0; 502 return 0;
483} 503}
484#endif 504#endif
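A small sketch of the access rule documented above for code outside procfs protection, not taken from the patch; it assumes a helper living in kernel/irq/ itself, since irq_lock_sparse() is only visible through the internal header. It follows the same lock, look up, access, unlock sequence as kstat_irqs_usr() and the show_interrupts() change.

#include <linux/cpumask.h>
#include <linux/irqdesc.h>
#include <linux/percpu.h>

#include "internals.h"

/* Hypothetical helper, internal to kernel/irq/. */
static unsigned int my_irq_total(unsigned int irq)
{
	struct irq_desc *desc;
	unsigned int sum = 0;
	int cpu;

	irq_lock_sparse();
	desc = irq_to_desc(irq);
	if (desc && desc->kstat_irqs)
		for_each_possible_cpu(cpu)
			sum += *per_cpu_ptr(desc->kstat_irqs, cpu);
	irq_unlock_sparse();

	return sum;
}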
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index e6bcbe756663..cbf9fb899d92 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -95,11 +95,11 @@ bool irq_work_queue(struct irq_work *work)
95 95
96 /* If the work is "lazy", handle it from next tick if any */ 96 /* If the work is "lazy", handle it from next tick if any */
97 if (work->flags & IRQ_WORK_LAZY) { 97 if (work->flags & IRQ_WORK_LAZY) {
98 if (llist_add(&work->llnode, &__get_cpu_var(lazy_list)) && 98 if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) &&
99 tick_nohz_tick_stopped()) 99 tick_nohz_tick_stopped())
100 arch_irq_work_raise(); 100 arch_irq_work_raise();
101 } else { 101 } else {
102 if (llist_add(&work->llnode, &__get_cpu_var(raised_list))) 102 if (llist_add(&work->llnode, this_cpu_ptr(&raised_list)))
103 arch_irq_work_raise(); 103 arch_irq_work_raise();
104 } 104 }
105 105
@@ -113,10 +113,12 @@ bool irq_work_needs_cpu(void)
113{ 113{
114 struct llist_head *raised, *lazy; 114 struct llist_head *raised, *lazy;
115 115
116 raised = &__get_cpu_var(raised_list); 116 raised = this_cpu_ptr(&raised_list);
117 lazy = &__get_cpu_var(lazy_list); 117 lazy = this_cpu_ptr(&lazy_list);
118 if (llist_empty(raised) && llist_empty(lazy)) 118
119 return false; 119 if (llist_empty(raised) || arch_irq_work_has_interrupt())
120 if (llist_empty(lazy))
121 return false;
120 122
121 /* All work should have been flushed before going offline */ 123 /* All work should have been flushed before going offline */
122 WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); 124 WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
@@ -166,11 +168,20 @@ static void irq_work_run_list(struct llist_head *list)
166 */ 168 */
167void irq_work_run(void) 169void irq_work_run(void)
168{ 170{
169 irq_work_run_list(&__get_cpu_var(raised_list)); 171 irq_work_run_list(this_cpu_ptr(&raised_list));
170 irq_work_run_list(&__get_cpu_var(lazy_list)); 172 irq_work_run_list(this_cpu_ptr(&lazy_list));
171} 173}
172EXPORT_SYMBOL_GPL(irq_work_run); 174EXPORT_SYMBOL_GPL(irq_work_run);
173 175
176void irq_work_tick(void)
177{
178 struct llist_head *raised = this_cpu_ptr(&raised_list);
179
180 if (!llist_empty(raised) && !arch_irq_work_has_interrupt())
181 irq_work_run_list(raised);
182 irq_work_run_list(this_cpu_ptr(&lazy_list));
183}
184
174/* 185/*
175 * Synchronize against the irq_work @entry, ensures the entry is not 186 * Synchronize against the irq_work @entry, ensures the entry is not
176 * currently in use. 187 * currently in use.
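Unrelated to the accessor conversion itself, but as a usage reminder (my_work_func and the trigger site are made up): work queued with this API from NMI or hard interrupt context is later run from irq_work_run() or, on architectures without an irq_work interrupt, from the new irq_work_tick().

#include <linux/irq_work.h>
#include <linux/printk.h>
#include <linux/smp.h>

static void my_work_func(struct irq_work *work)
{
	pr_info("irq_work ran on cpu %d\n", smp_processor_id());
}

static struct irq_work my_work = {
	.func = my_work_func,
};

/* Safe to call from NMI or hard interrupt context. */
static void my_trigger(void)
{
	irq_work_queue(&my_work);
}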
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index ae5167087845..5c5987f10819 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -565,19 +565,12 @@ static int kallsyms_open(struct inode *inode, struct file *file)
565 * using get_symbol_offset for every symbol. 565 * using get_symbol_offset for every symbol.
566 */ 566 */
567 struct kallsym_iter *iter; 567 struct kallsym_iter *iter;
568 int ret; 568 iter = __seq_open_private(file, &kallsyms_op, sizeof(*iter));
569
570 iter = kmalloc(sizeof(*iter), GFP_KERNEL);
571 if (!iter) 569 if (!iter)
572 return -ENOMEM; 570 return -ENOMEM;
573 reset_iter(iter, 0); 571 reset_iter(iter, 0);
574 572
575 ret = seq_open(file, &kallsyms_op); 573 return 0;
576 if (ret == 0)
577 ((struct seq_file *)file->private_data)->private = iter;
578 else
579 kfree(iter);
580 return ret;
581} 574}
582 575
583#ifdef CONFIG_KGDB_KDB 576#ifdef CONFIG_KGDB_KDB
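The helper the patch switches to is generic seq_file API; a hedged sketch of the same pattern in a hypothetical, unrelated file (my_seq_ops and the iterator contents are assumptions):

#include <linux/fs.h>
#include <linux/seq_file.h>

struct my_iter {
	loff_t pos;				/* hypothetical iterator state */
};

static const struct seq_operations my_seq_ops;	/* hypothetical; start/next/stop/show omitted */

static int my_open(struct inode *inode, struct file *file)
{
	struct my_iter *iter;

	/* Allocates the iterator, opens the seq_file and wires up ->private in one call. */
	iter = __seq_open_private(file, &my_seq_ops, sizeof(*iter));
	if (!iter)
		return -ENOMEM;

	iter->pos = 0;
	return 0;
}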
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 2bee072268d9..9a8a01abbaed 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -600,7 +600,7 @@ kimage_file_alloc_init(struct kimage **rimage, int kernel_fd,
600 if (!kexec_on_panic) { 600 if (!kexec_on_panic) {
601 image->swap_page = kimage_alloc_control_pages(image, 0); 601 image->swap_page = kimage_alloc_control_pages(image, 0);
602 if (!image->swap_page) { 602 if (!image->swap_page) {
603 pr_err(KERN_ERR "Could not allocate swap buffer\n"); 603 pr_err("Could not allocate swap buffer\n");
604 goto out_free_control_pages; 604 goto out_free_control_pages;
605 } 605 }
606 } 606 }
@@ -1759,7 +1759,6 @@ static __initdata char *suffix_tbl[] = {
1759 */ 1759 */
1760static int __init parse_crashkernel_suffix(char *cmdline, 1760static int __init parse_crashkernel_suffix(char *cmdline,
1761 unsigned long long *crash_size, 1761 unsigned long long *crash_size,
1762 unsigned long long *crash_base,
1763 const char *suffix) 1762 const char *suffix)
1764{ 1763{
1765 char *cur = cmdline; 1764 char *cur = cmdline;
@@ -1848,7 +1847,7 @@ static int __init __parse_crashkernel(char *cmdline,
1848 1847
1849 if (suffix) 1848 if (suffix)
1850 return parse_crashkernel_suffix(ck_cmdline, crash_size, 1849 return parse_crashkernel_suffix(ck_cmdline, crash_size,
1851 crash_base, suffix); 1850 suffix);
1852 /* 1851 /*
1853 * if the commandline contains a ':', then that's the extended 1852 * if the commandline contains a ':', then that's the extended
1854 * syntax -- if not, it must be the classic syntax 1853 * syntax -- if not, it must be the classic syntax
@@ -2016,22 +2015,6 @@ static int __init crash_save_vmcoreinfo_init(void)
2016subsys_initcall(crash_save_vmcoreinfo_init); 2015subsys_initcall(crash_save_vmcoreinfo_init);
2017 2016
2018#ifdef CONFIG_KEXEC_FILE 2017#ifdef CONFIG_KEXEC_FILE
2019static int __kexec_add_segment(struct kimage *image, char *buf,
2020 unsigned long bufsz, unsigned long mem,
2021 unsigned long memsz)
2022{
2023 struct kexec_segment *ksegment;
2024
2025 ksegment = &image->segment[image->nr_segments];
2026 ksegment->kbuf = buf;
2027 ksegment->bufsz = bufsz;
2028 ksegment->mem = mem;
2029 ksegment->memsz = memsz;
2030 image->nr_segments++;
2031
2032 return 0;
2033}
2034
2035static int locate_mem_hole_top_down(unsigned long start, unsigned long end, 2018static int locate_mem_hole_top_down(unsigned long start, unsigned long end,
2036 struct kexec_buf *kbuf) 2019 struct kexec_buf *kbuf)
2037{ 2020{
@@ -2064,8 +2047,7 @@ static int locate_mem_hole_top_down(unsigned long start, unsigned long end,
2064 } while (1); 2047 } while (1);
2065 2048
2066 /* If we are here, we found a suitable memory range */ 2049 /* If we are here, we found a suitable memory range */
2067 __kexec_add_segment(image, kbuf->buffer, kbuf->bufsz, temp_start, 2050 kbuf->mem = temp_start;
2068 kbuf->memsz);
2069 2051
2070 /* Success, stop navigating through remaining System RAM ranges */ 2052 /* Success, stop navigating through remaining System RAM ranges */
2071 return 1; 2053 return 1;
@@ -2099,8 +2081,7 @@ static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end,
2099 } while (1); 2081 } while (1);
2100 2082
2101 /* If we are here, we found a suitable memory range */ 2083 /* If we are here, we found a suitable memory range */
2102 __kexec_add_segment(image, kbuf->buffer, kbuf->bufsz, temp_start, 2084 kbuf->mem = temp_start;
2103 kbuf->memsz);
2104 2085
2105 /* Success, stop navigating through remaining System RAM ranges */ 2086 /* Success, stop navigating through remaining System RAM ranges */
2106 return 1; 2087 return 1;
@@ -2187,7 +2168,12 @@ int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz,
2187 } 2168 }
2188 2169
2189 /* Found a suitable memory range */ 2170 /* Found a suitable memory range */
2190 ksegment = &image->segment[image->nr_segments - 1]; 2171 ksegment = &image->segment[image->nr_segments];
2172 ksegment->kbuf = kbuf->buffer;
2173 ksegment->bufsz = kbuf->bufsz;
2174 ksegment->mem = kbuf->mem;
2175 ksegment->memsz = kbuf->memsz;
2176 image->nr_segments++;
2191 *load_addr = ksegment->mem; 2177 *load_addr = ksegment->mem;
2192 return 0; 2178 return 0;
2193} 2179}
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 8637e041a247..2777f40a9c7b 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -47,13 +47,6 @@ extern int max_threads;
47 47
48static struct workqueue_struct *khelper_wq; 48static struct workqueue_struct *khelper_wq;
49 49
50/*
51 * kmod_thread_locker is used for deadlock avoidance. There is no explicit
52 * locking to protect this global - it is private to the singleton khelper
53 * thread and should only ever be modified by that thread.
54 */
55static const struct task_struct *kmod_thread_locker;
56
57#define CAP_BSET (void *)1 50#define CAP_BSET (void *)1
58#define CAP_PI (void *)2 51#define CAP_PI (void *)2
59 52
@@ -196,6 +189,27 @@ int __request_module(bool wait, const char *fmt, ...)
196EXPORT_SYMBOL(__request_module); 189EXPORT_SYMBOL(__request_module);
197#endif /* CONFIG_MODULES */ 190#endif /* CONFIG_MODULES */
198 191
192static void call_usermodehelper_freeinfo(struct subprocess_info *info)
193{
194 if (info->cleanup)
195 (*info->cleanup)(info);
196 kfree(info);
197}
198
199static void umh_complete(struct subprocess_info *sub_info)
200{
201 struct completion *comp = xchg(&sub_info->complete, NULL);
202 /*
203 * See call_usermodehelper_exec(). If xchg() returns NULL
204 * we own sub_info, the UMH_KILLABLE caller has gone away
205 * or the caller used UMH_NO_WAIT.
206 */
207 if (comp)
208 complete(comp);
209 else
210 call_usermodehelper_freeinfo(sub_info);
211}
212
199/* 213/*
200 * This is the task which runs the usermode application 214 * This is the task which runs the usermode application
201 */ 215 */
@@ -221,7 +235,7 @@ static int ____call_usermodehelper(void *data)
221 retval = -ENOMEM; 235 retval = -ENOMEM;
222 new = prepare_kernel_cred(current); 236 new = prepare_kernel_cred(current);
223 if (!new) 237 if (!new)
224 goto fail; 238 goto out;
225 239
226 spin_lock(&umh_sysctl_lock); 240 spin_lock(&umh_sysctl_lock);
227 new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset); 241 new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset);
@@ -233,7 +247,7 @@ static int ____call_usermodehelper(void *data)
233 retval = sub_info->init(sub_info, new); 247 retval = sub_info->init(sub_info, new);
234 if (retval) { 248 if (retval) {
235 abort_creds(new); 249 abort_creds(new);
236 goto fail; 250 goto out;
237 } 251 }
238 } 252 }
239 253
@@ -242,42 +256,16 @@ static int ____call_usermodehelper(void *data)
242 retval = do_execve(getname_kernel(sub_info->path), 256 retval = do_execve(getname_kernel(sub_info->path),
243 (const char __user *const __user *)sub_info->argv, 257 (const char __user *const __user *)sub_info->argv,
244 (const char __user *const __user *)sub_info->envp); 258 (const char __user *const __user *)sub_info->envp);
259out:
260 sub_info->retval = retval;
261 /* wait_for_helper() will call umh_complete if UMH_WAIT_PROC. */
262 if (!(sub_info->wait & UMH_WAIT_PROC))
263 umh_complete(sub_info);
245 if (!retval) 264 if (!retval)
246 return 0; 265 return 0;
247
248 /* Exec failed? */
249fail:
250 sub_info->retval = retval;
251 do_exit(0); 266 do_exit(0);
252} 267}
253 268
254static int call_helper(void *data)
255{
256 /* Worker thread started blocking khelper thread. */
257 kmod_thread_locker = current;
258 return ____call_usermodehelper(data);
259}
260
261static void call_usermodehelper_freeinfo(struct subprocess_info *info)
262{
263 if (info->cleanup)
264 (*info->cleanup)(info);
265 kfree(info);
266}
267
268static void umh_complete(struct subprocess_info *sub_info)
269{
270 struct completion *comp = xchg(&sub_info->complete, NULL);
271 /*
272 * See call_usermodehelper_exec(). If xchg() returns NULL
273 * we own sub_info, the UMH_KILLABLE caller has gone away.
274 */
275 if (comp)
276 complete(comp);
277 else
278 call_usermodehelper_freeinfo(sub_info);
279}
280
281/* Keventd can't block, but this (a child) can. */ 269/* Keventd can't block, but this (a child) can. */
282static int wait_for_helper(void *data) 270static int wait_for_helper(void *data)
283{ 271{
@@ -320,34 +308,17 @@ static void __call_usermodehelper(struct work_struct *work)
320{ 308{
321 struct subprocess_info *sub_info = 309 struct subprocess_info *sub_info =
322 container_of(work, struct subprocess_info, work); 310 container_of(work, struct subprocess_info, work);
323 int wait = sub_info->wait & ~UMH_KILLABLE;
324 pid_t pid; 311 pid_t pid;
325 312
326 /* CLONE_VFORK: wait until the usermode helper has execve'd 313 if (sub_info->wait & UMH_WAIT_PROC)
327 * successfully We need the data structures to stay around
328 * until that is done. */
329 if (wait == UMH_WAIT_PROC)
330 pid = kernel_thread(wait_for_helper, sub_info, 314 pid = kernel_thread(wait_for_helper, sub_info,
331 CLONE_FS | CLONE_FILES | SIGCHLD); 315 CLONE_FS | CLONE_FILES | SIGCHLD);
332 else { 316 else
333 pid = kernel_thread(call_helper, sub_info, 317 pid = kernel_thread(____call_usermodehelper, sub_info,
334 CLONE_VFORK | SIGCHLD); 318 SIGCHLD);
335 /* Worker thread stopped blocking khelper thread. */
336 kmod_thread_locker = NULL;
337 }
338
339 switch (wait) {
340 case UMH_NO_WAIT:
341 call_usermodehelper_freeinfo(sub_info);
342 break;
343 319
344 case UMH_WAIT_PROC: 320 if (pid < 0) {
345 if (pid > 0) 321 sub_info->retval = pid;
346 break;
347 /* FALLTHROUGH */
348 case UMH_WAIT_EXEC:
349 if (pid < 0)
350 sub_info->retval = pid;
351 umh_complete(sub_info); 322 umh_complete(sub_info);
352 } 323 }
353} 324}
@@ -578,17 +549,11 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
578 goto out; 549 goto out;
579 } 550 }
580 /* 551 /*
581 * Worker thread must not wait for khelper thread at below 552 * Set the completion pointer only if there is a waiter.
582 * wait_for_completion() if the thread was created with CLONE_VFORK 553 * This makes it possible to use umh_complete to free
583 * flag, for khelper thread is already waiting for the thread at 554 * the data structure in case of UMH_NO_WAIT.
584 * wait_for_completion() in do_fork().
585 */ 555 */
586 if (wait != UMH_NO_WAIT && current == kmod_thread_locker) { 556 sub_info->complete = (wait == UMH_NO_WAIT) ? NULL : &done;
587 retval = -EBUSY;
588 goto out;
589 }
590
591 sub_info->complete = &done;
592 sub_info->wait = wait; 557 sub_info->wait = wait;
593 558
594 queue_work(khelper_wq, &sub_info->work); 559 queue_work(khelper_wq, &sub_info->work);
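
[Editor's note: a user-space sketch, not part of the diff. It illustrates the ownership hand-off the rewritten kmod.c code relies on: call_usermodehelper_exec() leaves sub_info->complete NULL for UMH_NO_WAIT, and umh_complete() xchg()es the pointer, so whichever side finds NULL knows the other party is gone and frees the structure. The struct layout and function below are illustrative only; C11 atomic_exchange() stands in for the kernel's xchg() and completion.]

    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct sub_info {
        _Atomic(int *) complete;        /* stands in for struct completion * */
        int retval;
    };

    /* Helper side of umh_complete(): signal the waiter if one is still
     * registered, otherwise take over ownership and free the state. */
    static void helper_complete(struct sub_info *s)
    {
        int *comp = atomic_exchange(&s->complete, (int *)NULL);

        if (comp)
            *comp = 1;                  /* "complete(comp)" */
        else
            free(s);                    /* UMH_NO_WAIT or killed waiter */
    }

    int main(void)
    {
        struct sub_info *s = malloc(sizeof(*s));
        int done = 0;

        if (!s)
            return 1;
        s->complete = &done;            /* NULL here would model UMH_NO_WAIT */
        helper_complete(s);
        printf("done=%d\n", done);
        free(s);                        /* waiter still owns s in this case */
        return 0;
    }
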
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 3995f546d0f3..06f58309fed2 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -915,7 +915,7 @@ static struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
915#ifdef CONFIG_KPROBES_ON_FTRACE 915#ifdef CONFIG_KPROBES_ON_FTRACE
916static struct ftrace_ops kprobe_ftrace_ops __read_mostly = { 916static struct ftrace_ops kprobe_ftrace_ops __read_mostly = {
917 .func = kprobe_ftrace_handler, 917 .func = kprobe_ftrace_handler,
918 .flags = FTRACE_OPS_FL_SAVE_REGS, 918 .flags = FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_IPMODIFY,
919}; 919};
920static int kprobe_ftrace_enabled; 920static int kprobe_ftrace_enabled;
921 921
@@ -1410,16 +1410,10 @@ static inline int check_kprobe_rereg(struct kprobe *p)
1410 return ret; 1410 return ret;
1411} 1411}
1412 1412
1413static int check_kprobe_address_safe(struct kprobe *p, 1413int __weak arch_check_ftrace_location(struct kprobe *p)
1414 struct module **probed_mod)
1415{ 1414{
1416 int ret = 0;
1417 unsigned long ftrace_addr; 1415 unsigned long ftrace_addr;
1418 1416
1419 /*
1420 * If the address is located on a ftrace nop, set the
1421 * breakpoint to the following instruction.
1422 */
1423 ftrace_addr = ftrace_location((unsigned long)p->addr); 1417 ftrace_addr = ftrace_location((unsigned long)p->addr);
1424 if (ftrace_addr) { 1418 if (ftrace_addr) {
1425#ifdef CONFIG_KPROBES_ON_FTRACE 1419#ifdef CONFIG_KPROBES_ON_FTRACE
@@ -1431,7 +1425,17 @@ static int check_kprobe_address_safe(struct kprobe *p,
1431 return -EINVAL; 1425 return -EINVAL;
1432#endif 1426#endif
1433 } 1427 }
1428 return 0;
1429}
1434 1430
1431static int check_kprobe_address_safe(struct kprobe *p,
1432 struct module **probed_mod)
1433{
1434 int ret;
1435
1436 ret = arch_check_ftrace_location(p);
1437 if (ret)
1438 return ret;
1435 jump_label_lock(); 1439 jump_label_lock();
1436 preempt_disable(); 1440 preempt_disable();
1437 1441
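
[Editor's note: the kprobes hunk above splits the ftrace-location check into arch_check_ftrace_location(), marked __weak so an architecture can supply its own stronger check. The snippet below only illustrates the weak-symbol mechanism behind the kernel's __weak macro (GCC/Clang __attribute__((weak))); the function name is made up.]

    #include <stdio.h>

    /* Generic fallback. A non-weak definition elsewhere in the program
     * (the "arch" version) silently replaces it at link time. */
    __attribute__((weak)) int arch_check_example(unsigned long addr)
    {
        (void)addr;
        return 0;       /* default: nothing special about this address */
    }

    int main(void)
    {
        printf("check -> %d\n", arch_check_example(0x1000UL));
        return 0;
    }
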
diff --git a/kernel/kthread.c b/kernel/kthread.c
index ef483220e855..10e489c448fe 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -369,7 +369,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
369{ 369{
370 struct task_struct *p; 370 struct task_struct *p;
371 371
372 p = kthread_create_on_node(threadfn, data, cpu_to_mem(cpu), namefmt, 372 p = kthread_create_on_node(threadfn, data, cpu_to_node(cpu), namefmt,
373 cpu); 373 cpu);
374 if (IS_ERR(p)) 374 if (IS_ERR(p))
375 return p; 375 return p;
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 0955b885d0dc..ec8cce259779 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -20,30 +20,20 @@
20 * Author: Paul E. McKenney <paulmck@us.ibm.com> 20 * Author: Paul E. McKenney <paulmck@us.ibm.com>
21 * Based on kernel/rcu/torture.c. 21 * Based on kernel/rcu/torture.c.
22 */ 22 */
23#include <linux/types.h>
24#include <linux/kernel.h> 23#include <linux/kernel.h>
25#include <linux/init.h>
26#include <linux/module.h> 24#include <linux/module.h>
27#include <linux/kthread.h> 25#include <linux/kthread.h>
28#include <linux/err.h>
29#include <linux/spinlock.h> 26#include <linux/spinlock.h>
27#include <linux/rwlock.h>
28#include <linux/mutex.h>
29#include <linux/rwsem.h>
30#include <linux/smp.h> 30#include <linux/smp.h>
31#include <linux/interrupt.h> 31#include <linux/interrupt.h>
32#include <linux/sched.h> 32#include <linux/sched.h>
33#include <linux/atomic.h> 33#include <linux/atomic.h>
34#include <linux/bitops.h>
35#include <linux/completion.h>
36#include <linux/moduleparam.h> 34#include <linux/moduleparam.h>
37#include <linux/percpu.h>
38#include <linux/notifier.h>
39#include <linux/reboot.h>
40#include <linux/freezer.h>
41#include <linux/cpu.h>
42#include <linux/delay.h> 35#include <linux/delay.h>
43#include <linux/stat.h>
44#include <linux/slab.h> 36#include <linux/slab.h>
45#include <linux/trace_clock.h>
46#include <asm/byteorder.h>
47#include <linux/torture.h> 37#include <linux/torture.h>
48 38
49MODULE_LICENSE("GPL"); 39MODULE_LICENSE("GPL");
@@ -51,6 +41,8 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com>");
51 41
52torture_param(int, nwriters_stress, -1, 42torture_param(int, nwriters_stress, -1,
53 "Number of write-locking stress-test threads"); 43 "Number of write-locking stress-test threads");
44torture_param(int, nreaders_stress, -1,
45 "Number of read-locking stress-test threads");
54torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)"); 46torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)");
55torture_param(int, onoff_interval, 0, 47torture_param(int, onoff_interval, 0,
56 "Time between CPU hotplugs (s), 0=disable"); 48 "Time between CPU hotplugs (s), 0=disable");
@@ -66,30 +58,28 @@ torture_param(bool, verbose, true,
66static char *torture_type = "spin_lock"; 58static char *torture_type = "spin_lock";
67module_param(torture_type, charp, 0444); 59module_param(torture_type, charp, 0444);
68MODULE_PARM_DESC(torture_type, 60MODULE_PARM_DESC(torture_type,
69 "Type of lock to torture (spin_lock, spin_lock_irq, ...)"); 61 "Type of lock to torture (spin_lock, spin_lock_irq, mutex_lock, ...)");
70
71static atomic_t n_lock_torture_errors;
72 62
73static struct task_struct *stats_task; 63static struct task_struct *stats_task;
74static struct task_struct **writer_tasks; 64static struct task_struct **writer_tasks;
65static struct task_struct **reader_tasks;
75 66
76static int nrealwriters_stress;
77static bool lock_is_write_held; 67static bool lock_is_write_held;
68static bool lock_is_read_held;
78 69
79struct lock_writer_stress_stats { 70struct lock_stress_stats {
80 long n_write_lock_fail; 71 long n_lock_fail;
81 long n_write_lock_acquired; 72 long n_lock_acquired;
82}; 73};
83static struct lock_writer_stress_stats *lwsa;
84 74
85#if defined(MODULE) 75#if defined(MODULE)
86#define LOCKTORTURE_RUNNABLE_INIT 1 76#define LOCKTORTURE_RUNNABLE_INIT 1
87#else 77#else
88#define LOCKTORTURE_RUNNABLE_INIT 0 78#define LOCKTORTURE_RUNNABLE_INIT 0
89#endif 79#endif
90int locktorture_runnable = LOCKTORTURE_RUNNABLE_INIT; 80int torture_runnable = LOCKTORTURE_RUNNABLE_INIT;
91module_param(locktorture_runnable, int, 0444); 81module_param(torture_runnable, int, 0444);
92MODULE_PARM_DESC(locktorture_runnable, "Start locktorture at module init"); 82MODULE_PARM_DESC(torture_runnable, "Start locktorture at module init");
93 83
94/* Forward reference. */ 84/* Forward reference. */
95static void lock_torture_cleanup(void); 85static void lock_torture_cleanup(void);
@@ -102,12 +92,25 @@ struct lock_torture_ops {
102 int (*writelock)(void); 92 int (*writelock)(void);
103 void (*write_delay)(struct torture_random_state *trsp); 93 void (*write_delay)(struct torture_random_state *trsp);
104 void (*writeunlock)(void); 94 void (*writeunlock)(void);
95 int (*readlock)(void);
96 void (*read_delay)(struct torture_random_state *trsp);
97 void (*readunlock)(void);
105 unsigned long flags; 98 unsigned long flags;
106 const char *name; 99 const char *name;
107}; 100};
108 101
109static struct lock_torture_ops *cur_ops; 102struct lock_torture_cxt {
110 103 int nrealwriters_stress;
104 int nrealreaders_stress;
105 bool debug_lock;
106 atomic_t n_lock_torture_errors;
107 struct lock_torture_ops *cur_ops;
108 struct lock_stress_stats *lwsa; /* writer statistics */
109 struct lock_stress_stats *lrsa; /* reader statistics */
110};
111static struct lock_torture_cxt cxt = { 0, 0, false,
112 ATOMIC_INIT(0),
113 NULL, NULL};
111/* 114/*
112 * Definitions for lock torture testing. 115 * Definitions for lock torture testing.
113 */ 116 */
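
[Editor's note: a stand-alone sketch of the pattern introduced in the hunk above -- a per-lock-type ops table in which the read-side callbacks are optional (NULL for exclusive-only primitives), so common code probes for ->readlock before starting readers. Plain user-space C with illustrative names.]

    #include <stdio.h>

    struct demo_ops {
        void (*writelock)(void);
        void (*writeunlock)(void);
        void (*readlock)(void);          /* NULL => no shared mode */
        void (*readunlock)(void);
        const char *name;
    };

    static void wlock(void)   { puts("  write-locked"); }
    static void wunlock(void) { puts("  write-unlocked"); }
    static void rlock(void)   { puts("  read-locked"); }
    static void runlock(void) { puts("  read-unlocked"); }

    static const struct demo_ops exclusive_only = {
        .writelock = wlock, .writeunlock = wunlock, .name = "mutex-like",
    };

    static const struct demo_ops shared_capable = {
        .writelock = wlock, .writeunlock = wunlock,
        .readlock = rlock, .readunlock = runlock, .name = "rwlock-like",
    };

    static void exercise(const struct demo_ops *ops)
    {
        printf("%s:\n", ops->name);
        ops->writelock();
        ops->writeunlock();
        if (ops->readlock) {             /* reader path only when supported */
            ops->readlock();
            ops->readunlock();
        }
    }

    int main(void)
    {
        exercise(&exclusive_only);
        exercise(&shared_capable);
        return 0;
    }
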
@@ -123,10 +126,10 @@ static void torture_lock_busted_write_delay(struct torture_random_state *trsp)
123 126
124 /* We want a long delay occasionally to force massive contention. */ 127 /* We want a long delay occasionally to force massive contention. */
125 if (!(torture_random(trsp) % 128 if (!(torture_random(trsp) %
126 (nrealwriters_stress * 2000 * longdelay_us))) 129 (cxt.nrealwriters_stress * 2000 * longdelay_us)))
127 mdelay(longdelay_us); 130 mdelay(longdelay_us);
128#ifdef CONFIG_PREEMPT 131#ifdef CONFIG_PREEMPT
129 if (!(torture_random(trsp) % (nrealwriters_stress * 20000))) 132 if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
130 preempt_schedule(); /* Allow test to be preempted. */ 133 preempt_schedule(); /* Allow test to be preempted. */
131#endif 134#endif
132} 135}
@@ -140,6 +143,9 @@ static struct lock_torture_ops lock_busted_ops = {
140 .writelock = torture_lock_busted_write_lock, 143 .writelock = torture_lock_busted_write_lock,
141 .write_delay = torture_lock_busted_write_delay, 144 .write_delay = torture_lock_busted_write_delay,
142 .writeunlock = torture_lock_busted_write_unlock, 145 .writeunlock = torture_lock_busted_write_unlock,
146 .readlock = NULL,
147 .read_delay = NULL,
148 .readunlock = NULL,
143 .name = "lock_busted" 149 .name = "lock_busted"
144}; 150};
145 151
@@ -160,13 +166,13 @@ static void torture_spin_lock_write_delay(struct torture_random_state *trsp)
160 * we want a long delay occasionally to force massive contention. 166 * we want a long delay occasionally to force massive contention.
161 */ 167 */
162 if (!(torture_random(trsp) % 168 if (!(torture_random(trsp) %
163 (nrealwriters_stress * 2000 * longdelay_us))) 169 (cxt.nrealwriters_stress * 2000 * longdelay_us)))
164 mdelay(longdelay_us); 170 mdelay(longdelay_us);
165 if (!(torture_random(trsp) % 171 if (!(torture_random(trsp) %
166 (nrealwriters_stress * 2 * shortdelay_us))) 172 (cxt.nrealwriters_stress * 2 * shortdelay_us)))
167 udelay(shortdelay_us); 173 udelay(shortdelay_us);
168#ifdef CONFIG_PREEMPT 174#ifdef CONFIG_PREEMPT
169 if (!(torture_random(trsp) % (nrealwriters_stress * 20000))) 175 if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
170 preempt_schedule(); /* Allow test to be preempted. */ 176 preempt_schedule(); /* Allow test to be preempted. */
171#endif 177#endif
172} 178}
@@ -180,39 +186,253 @@ static struct lock_torture_ops spin_lock_ops = {
180 .writelock = torture_spin_lock_write_lock, 186 .writelock = torture_spin_lock_write_lock,
181 .write_delay = torture_spin_lock_write_delay, 187 .write_delay = torture_spin_lock_write_delay,
182 .writeunlock = torture_spin_lock_write_unlock, 188 .writeunlock = torture_spin_lock_write_unlock,
189 .readlock = NULL,
190 .read_delay = NULL,
191 .readunlock = NULL,
183 .name = "spin_lock" 192 .name = "spin_lock"
184}; 193};
185 194
186static int torture_spin_lock_write_lock_irq(void) 195static int torture_spin_lock_write_lock_irq(void)
187__acquires(torture_spinlock_irq) 196__acquires(torture_spinlock)
188{ 197{
189 unsigned long flags; 198 unsigned long flags;
190 199
191 spin_lock_irqsave(&torture_spinlock, flags); 200 spin_lock_irqsave(&torture_spinlock, flags);
192 cur_ops->flags = flags; 201 cxt.cur_ops->flags = flags;
193 return 0; 202 return 0;
194} 203}
195 204
196static void torture_lock_spin_write_unlock_irq(void) 205static void torture_lock_spin_write_unlock_irq(void)
197__releases(torture_spinlock) 206__releases(torture_spinlock)
198{ 207{
199 spin_unlock_irqrestore(&torture_spinlock, cur_ops->flags); 208 spin_unlock_irqrestore(&torture_spinlock, cxt.cur_ops->flags);
200} 209}
201 210
202static struct lock_torture_ops spin_lock_irq_ops = { 211static struct lock_torture_ops spin_lock_irq_ops = {
203 .writelock = torture_spin_lock_write_lock_irq, 212 .writelock = torture_spin_lock_write_lock_irq,
204 .write_delay = torture_spin_lock_write_delay, 213 .write_delay = torture_spin_lock_write_delay,
205 .writeunlock = torture_lock_spin_write_unlock_irq, 214 .writeunlock = torture_lock_spin_write_unlock_irq,
215 .readlock = NULL,
216 .read_delay = NULL,
217 .readunlock = NULL,
206 .name = "spin_lock_irq" 218 .name = "spin_lock_irq"
207}; 219};
208 220
221static DEFINE_RWLOCK(torture_rwlock);
222
223static int torture_rwlock_write_lock(void) __acquires(torture_rwlock)
224{
225 write_lock(&torture_rwlock);
226 return 0;
227}
228
229static void torture_rwlock_write_delay(struct torture_random_state *trsp)
230{
231 const unsigned long shortdelay_us = 2;
232 const unsigned long longdelay_ms = 100;
233
234 /* We want a short delay mostly to emulate likely code, and
235 * we want a long delay occasionally to force massive contention.
236 */
237 if (!(torture_random(trsp) %
238 (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
239 mdelay(longdelay_ms);
240 else
241 udelay(shortdelay_us);
242}
243
244static void torture_rwlock_write_unlock(void) __releases(torture_rwlock)
245{
246 write_unlock(&torture_rwlock);
247}
248
249static int torture_rwlock_read_lock(void) __acquires(torture_rwlock)
250{
251 read_lock(&torture_rwlock);
252 return 0;
253}
254
255static void torture_rwlock_read_delay(struct torture_random_state *trsp)
256{
257 const unsigned long shortdelay_us = 10;
258 const unsigned long longdelay_ms = 100;
259
260 /* We want a short delay mostly to emulate likely code, and
261 * we want a long delay occasionally to force massive contention.
262 */
263 if (!(torture_random(trsp) %
264 (cxt.nrealreaders_stress * 2000 * longdelay_ms)))
265 mdelay(longdelay_ms);
266 else
267 udelay(shortdelay_us);
268}
269
270static void torture_rwlock_read_unlock(void) __releases(torture_rwlock)
271{
272 read_unlock(&torture_rwlock);
273}
274
275static struct lock_torture_ops rw_lock_ops = {
276 .writelock = torture_rwlock_write_lock,
277 .write_delay = torture_rwlock_write_delay,
278 .writeunlock = torture_rwlock_write_unlock,
279 .readlock = torture_rwlock_read_lock,
280 .read_delay = torture_rwlock_read_delay,
281 .readunlock = torture_rwlock_read_unlock,
282 .name = "rw_lock"
283};
284
285static int torture_rwlock_write_lock_irq(void) __acquires(torture_rwlock)
286{
287 unsigned long flags;
288
289 write_lock_irqsave(&torture_rwlock, flags);
290 cxt.cur_ops->flags = flags;
291 return 0;
292}
293
294static void torture_rwlock_write_unlock_irq(void)
295__releases(torture_rwlock)
296{
297 write_unlock_irqrestore(&torture_rwlock, cxt.cur_ops->flags);
298}
299
300static int torture_rwlock_read_lock_irq(void) __acquires(torture_rwlock)
301{
302 unsigned long flags;
303
304 read_lock_irqsave(&torture_rwlock, flags);
305 cxt.cur_ops->flags = flags;
306 return 0;
307}
308
309static void torture_rwlock_read_unlock_irq(void)
310__releases(torture_rwlock)
311{
312 read_unlock_irqrestore(&torture_rwlock, cxt.cur_ops->flags);
313}
314
315static struct lock_torture_ops rw_lock_irq_ops = {
316 .writelock = torture_rwlock_write_lock_irq,
317 .write_delay = torture_rwlock_write_delay,
318 .writeunlock = torture_rwlock_write_unlock_irq,
319 .readlock = torture_rwlock_read_lock_irq,
320 .read_delay = torture_rwlock_read_delay,
321 .readunlock = torture_rwlock_read_unlock_irq,
322 .name = "rw_lock_irq"
323};
324
325static DEFINE_MUTEX(torture_mutex);
326
327static int torture_mutex_lock(void) __acquires(torture_mutex)
328{
329 mutex_lock(&torture_mutex);
330 return 0;
331}
332
333static void torture_mutex_delay(struct torture_random_state *trsp)
334{
335 const unsigned long longdelay_ms = 100;
336
337 /* We want a long delay occasionally to force massive contention. */
338 if (!(torture_random(trsp) %
339 (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
340 mdelay(longdelay_ms * 5);
341 else
342 mdelay(longdelay_ms / 5);
343#ifdef CONFIG_PREEMPT
344 if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
345 preempt_schedule(); /* Allow test to be preempted. */
346#endif
347}
348
349static void torture_mutex_unlock(void) __releases(torture_mutex)
350{
351 mutex_unlock(&torture_mutex);
352}
353
354static struct lock_torture_ops mutex_lock_ops = {
355 .writelock = torture_mutex_lock,
356 .write_delay = torture_mutex_delay,
357 .writeunlock = torture_mutex_unlock,
358 .readlock = NULL,
359 .read_delay = NULL,
360 .readunlock = NULL,
361 .name = "mutex_lock"
362};
363
364static DECLARE_RWSEM(torture_rwsem);
365static int torture_rwsem_down_write(void) __acquires(torture_rwsem)
366{
367 down_write(&torture_rwsem);
368 return 0;
369}
370
371static void torture_rwsem_write_delay(struct torture_random_state *trsp)
372{
373 const unsigned long longdelay_ms = 100;
374
375 /* We want a long delay occasionally to force massive contention. */
376 if (!(torture_random(trsp) %
377 (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
378 mdelay(longdelay_ms * 10);
379 else
380 mdelay(longdelay_ms / 10);
381#ifdef CONFIG_PREEMPT
382 if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
383 preempt_schedule(); /* Allow test to be preempted. */
384#endif
385}
386
387static void torture_rwsem_up_write(void) __releases(torture_rwsem)
388{
389 up_write(&torture_rwsem);
390}
391
392static int torture_rwsem_down_read(void) __acquires(torture_rwsem)
393{
394 down_read(&torture_rwsem);
395 return 0;
396}
397
398static void torture_rwsem_read_delay(struct torture_random_state *trsp)
399{
400 const unsigned long longdelay_ms = 100;
401
402 /* We want a long delay occasionally to force massive contention. */
403 if (!(torture_random(trsp) %
404 (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
405 mdelay(longdelay_ms * 2);
406 else
407 mdelay(longdelay_ms / 2);
408#ifdef CONFIG_PREEMPT
409 if (!(torture_random(trsp) % (cxt.nrealreaders_stress * 20000)))
410 preempt_schedule(); /* Allow test to be preempted. */
411#endif
412}
413
414static void torture_rwsem_up_read(void) __releases(torture_rwsem)
415{
416 up_read(&torture_rwsem);
417}
418
419static struct lock_torture_ops rwsem_lock_ops = {
420 .writelock = torture_rwsem_down_write,
421 .write_delay = torture_rwsem_write_delay,
422 .writeunlock = torture_rwsem_up_write,
423 .readlock = torture_rwsem_down_read,
424 .read_delay = torture_rwsem_read_delay,
425 .readunlock = torture_rwsem_up_read,
426 .name = "rwsem_lock"
427};
428
209/* 429/*
210 * Lock torture writer kthread. Repeatedly acquires and releases 430 * Lock torture writer kthread. Repeatedly acquires and releases
211 * the lock, checking for duplicate acquisitions. 431 * the lock, checking for duplicate acquisitions.
212 */ 432 */
213static int lock_torture_writer(void *arg) 433static int lock_torture_writer(void *arg)
214{ 434{
215 struct lock_writer_stress_stats *lwsp = arg; 435 struct lock_stress_stats *lwsp = arg;
216 static DEFINE_TORTURE_RANDOM(rand); 436 static DEFINE_TORTURE_RANDOM(rand);
217 437
218 VERBOSE_TOROUT_STRING("lock_torture_writer task started"); 438 VERBOSE_TOROUT_STRING("lock_torture_writer task started");
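
[Editor's note: the delay helpers added above all scale their "massive contention" branch by the number of stress threads -- the long mdelay() runs only when torture_random() % (n_threads * 2000 * longdelay) is zero, so the expected number of long stalls stays roughly constant as more threads are configured. The small program below just prints that expected frequency for a few writer counts; the constants are copied from the rwlock/mutex/rwsem helpers.]

    #include <stdio.h>

    int main(void)
    {
        const unsigned long longdelay_ms = 100;
        const int nwriters[] = { 2, 4, 8 };

        for (unsigned int i = 0; i < sizeof(nwriters) / sizeof(nwriters[0]); i++) {
            unsigned long modulus = nwriters[i] * 2000UL * longdelay_ms;

            printf("%d writers: roughly 1 in %lu write_delay() calls takes the long path\n",
                   nwriters[i], modulus);
        }
        return 0;
    }
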
@@ -221,14 +441,19 @@ static int lock_torture_writer(void *arg)
221 do { 441 do {
222 if ((torture_random(&rand) & 0xfffff) == 0) 442 if ((torture_random(&rand) & 0xfffff) == 0)
223 schedule_timeout_uninterruptible(1); 443 schedule_timeout_uninterruptible(1);
224 cur_ops->writelock(); 444
445 cxt.cur_ops->writelock();
225 if (WARN_ON_ONCE(lock_is_write_held)) 446 if (WARN_ON_ONCE(lock_is_write_held))
226 lwsp->n_write_lock_fail++; 447 lwsp->n_lock_fail++;
227 lock_is_write_held = 1; 448 lock_is_write_held = 1;
228 lwsp->n_write_lock_acquired++; 449 if (WARN_ON_ONCE(lock_is_read_held))
229 cur_ops->write_delay(&rand); 450 lwsp->n_lock_fail++; /* rare, but... */
451
452 lwsp->n_lock_acquired++;
453 cxt.cur_ops->write_delay(&rand);
230 lock_is_write_held = 0; 454 lock_is_write_held = 0;
231 cur_ops->writeunlock(); 455 cxt.cur_ops->writeunlock();
456
232 stutter_wait("lock_torture_writer"); 457 stutter_wait("lock_torture_writer");
233 } while (!torture_must_stop()); 458 } while (!torture_must_stop());
234 torture_kthread_stopping("lock_torture_writer"); 459 torture_kthread_stopping("lock_torture_writer");
@@ -236,32 +461,66 @@ static int lock_torture_writer(void *arg)
236} 461}
237 462
238/* 463/*
464 * Lock torture reader kthread. Repeatedly acquires and releases
465 * the reader lock.
466 */
467static int lock_torture_reader(void *arg)
468{
469 struct lock_stress_stats *lrsp = arg;
470 static DEFINE_TORTURE_RANDOM(rand);
471
472 VERBOSE_TOROUT_STRING("lock_torture_reader task started");
473 set_user_nice(current, MAX_NICE);
474
475 do {
476 if ((torture_random(&rand) & 0xfffff) == 0)
477 schedule_timeout_uninterruptible(1);
478
479 cxt.cur_ops->readlock();
480 lock_is_read_held = 1;
481 if (WARN_ON_ONCE(lock_is_write_held))
482 lrsp->n_lock_fail++; /* rare, but... */
483
484 lrsp->n_lock_acquired++;
485 cxt.cur_ops->read_delay(&rand);
486 lock_is_read_held = 0;
487 cxt.cur_ops->readunlock();
488
489 stutter_wait("lock_torture_reader");
490 } while (!torture_must_stop());
491 torture_kthread_stopping("lock_torture_reader");
492 return 0;
493}
494
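
[Editor's note: the new reader kthread above mirrors the writer loop from the previous hunk: each side raises its own "held" flag inside the critical section and warns if it ever observes the other side's flag, which a correct reader/writer lock should make impossible. A minimal user-space analogue of that invariant check, using a POSIX rwlock instead of the torture ops (compile with -pthread):]

    #include <pthread.h>
    #include <stdio.h>

    static pthread_rwlock_t lk = PTHREAD_RWLOCK_INITIALIZER;
    static volatile int write_held, read_held;

    static void *writer(void *arg)
    {
        for (int i = 0; i < 100000; i++) {
            pthread_rwlock_wrlock(&lk);
            if (write_held || read_held)        /* should never happen */
                fprintf(stderr, "writer saw overlap!\n");
            write_held = 1;
            write_held = 0;
            pthread_rwlock_unlock(&lk);
        }
        return arg;
    }

    static void *reader(void *arg)
    {
        for (int i = 0; i < 100000; i++) {
            pthread_rwlock_rdlock(&lk);
            read_held = 1;
            if (write_held)                     /* should never happen */
                fprintf(stderr, "reader saw a writer!\n");
            read_held = 0;
            pthread_rwlock_unlock(&lk);
        }
        return arg;
    }

    int main(void)
    {
        pthread_t w, r;

        pthread_create(&w, NULL, writer, NULL);
        pthread_create(&r, NULL, reader, NULL);
        pthread_join(w, NULL);
        pthread_join(r, NULL);
        puts("no overlap observed");
        return 0;
    }
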
495/*
239 * Create a lock-torture-statistics message in the specified buffer. 496 * Create a lock-torture-statistics message in the specified buffer.
240 */ 497 */
241static void lock_torture_printk(char *page) 498static void __torture_print_stats(char *page,
499 struct lock_stress_stats *statp, bool write)
242{ 500{
243 bool fail = 0; 501 bool fail = 0;
244 int i; 502 int i, n_stress;
245 long max = 0; 503 long max = 0;
246 long min = lwsa[0].n_write_lock_acquired; 504 long min = statp[0].n_lock_acquired;
247 long long sum = 0; 505 long long sum = 0;
248 506
249 for (i = 0; i < nrealwriters_stress; i++) { 507 n_stress = write ? cxt.nrealwriters_stress : cxt.nrealreaders_stress;
250 if (lwsa[i].n_write_lock_fail) 508 for (i = 0; i < n_stress; i++) {
509 if (statp[i].n_lock_fail)
251 fail = true; 510 fail = true;
252 sum += lwsa[i].n_write_lock_acquired; 511 sum += statp[i].n_lock_acquired;
253 if (max < lwsa[i].n_write_lock_fail) 512 if (max < statp[i].n_lock_fail)
254 max = lwsa[i].n_write_lock_fail; 513 max = statp[i].n_lock_fail;
255 if (min > lwsa[i].n_write_lock_fail) 514 if (min > statp[i].n_lock_fail)
256 min = lwsa[i].n_write_lock_fail; 515 min = statp[i].n_lock_fail;
257 } 516 }
258 page += sprintf(page, "%s%s ", torture_type, TORTURE_FLAG);
259 page += sprintf(page, 517 page += sprintf(page,
260 "Writes: Total: %lld Max/Min: %ld/%ld %s Fail: %d %s\n", 518 "%s: Total: %lld Max/Min: %ld/%ld %s Fail: %d %s\n",
519 write ? "Writes" : "Reads ",
261 sum, max, min, max / 2 > min ? "???" : "", 520 sum, max, min, max / 2 > min ? "???" : "",
262 fail, fail ? "!!!" : ""); 521 fail, fail ? "!!!" : "");
263 if (fail) 522 if (fail)
264 atomic_inc(&n_lock_torture_errors); 523 atomic_inc(&cxt.n_lock_torture_errors);
265} 524}
266 525
267/* 526/*
@@ -274,18 +533,35 @@ static void lock_torture_printk(char *page)
274 */ 533 */
275static void lock_torture_stats_print(void) 534static void lock_torture_stats_print(void)
276{ 535{
277 int size = nrealwriters_stress * 200 + 8192; 536 int size = cxt.nrealwriters_stress * 200 + 8192;
278 char *buf; 537 char *buf;
279 538
539 if (cxt.cur_ops->readlock)
540 size += cxt.nrealreaders_stress * 200 + 8192;
541
280 buf = kmalloc(size, GFP_KERNEL); 542 buf = kmalloc(size, GFP_KERNEL);
281 if (!buf) { 543 if (!buf) {
282 pr_err("lock_torture_stats_print: Out of memory, need: %d", 544 pr_err("lock_torture_stats_print: Out of memory, need: %d",
283 size); 545 size);
284 return; 546 return;
285 } 547 }
286 lock_torture_printk(buf); 548
549 __torture_print_stats(buf, cxt.lwsa, true);
287 pr_alert("%s", buf); 550 pr_alert("%s", buf);
288 kfree(buf); 551 kfree(buf);
552
553 if (cxt.cur_ops->readlock) {
554 buf = kmalloc(size, GFP_KERNEL);
555 if (!buf) {
556 pr_err("lock_torture_stats_print: Out of memory, need: %d",
557 size);
558 return;
559 }
560
561 __torture_print_stats(buf, cxt.lrsa, false);
562 pr_alert("%s", buf);
563 kfree(buf);
564 }
289} 565}
290 566
291/* 567/*
@@ -312,9 +588,10 @@ lock_torture_print_module_parms(struct lock_torture_ops *cur_ops,
312 const char *tag) 588 const char *tag)
313{ 589{
314 pr_alert("%s" TORTURE_FLAG 590 pr_alert("%s" TORTURE_FLAG
315 "--- %s: nwriters_stress=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d\n", 591 "--- %s%s: nwriters_stress=%d nreaders_stress=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d\n",
316 torture_type, tag, nrealwriters_stress, stat_interval, verbose, 592 torture_type, tag, cxt.debug_lock ? " [debug]": "",
317 shuffle_interval, stutter, shutdown_secs, 593 cxt.nrealwriters_stress, cxt.nrealreaders_stress, stat_interval,
594 verbose, shuffle_interval, stutter, shutdown_secs,
318 onoff_interval, onoff_holdoff); 595 onoff_interval, onoff_holdoff);
319} 596}
320 597
@@ -322,46 +599,59 @@ static void lock_torture_cleanup(void)
322{ 599{
323 int i; 600 int i;
324 601
325 if (torture_cleanup()) 602 if (torture_cleanup_begin())
326 return; 603 return;
327 604
328 if (writer_tasks) { 605 if (writer_tasks) {
329 for (i = 0; i < nrealwriters_stress; i++) 606 for (i = 0; i < cxt.nrealwriters_stress; i++)
330 torture_stop_kthread(lock_torture_writer, 607 torture_stop_kthread(lock_torture_writer,
331 writer_tasks[i]); 608 writer_tasks[i]);
332 kfree(writer_tasks); 609 kfree(writer_tasks);
333 writer_tasks = NULL; 610 writer_tasks = NULL;
334 } 611 }
335 612
613 if (reader_tasks) {
614 for (i = 0; i < cxt.nrealreaders_stress; i++)
615 torture_stop_kthread(lock_torture_reader,
616 reader_tasks[i]);
617 kfree(reader_tasks);
618 reader_tasks = NULL;
619 }
620
336 torture_stop_kthread(lock_torture_stats, stats_task); 621 torture_stop_kthread(lock_torture_stats, stats_task);
337 lock_torture_stats_print(); /* -After- the stats thread is stopped! */ 622 lock_torture_stats_print(); /* -After- the stats thread is stopped! */
338 623
339 if (atomic_read(&n_lock_torture_errors)) 624 if (atomic_read(&cxt.n_lock_torture_errors))
340 lock_torture_print_module_parms(cur_ops, 625 lock_torture_print_module_parms(cxt.cur_ops,
341 "End of test: FAILURE"); 626 "End of test: FAILURE");
342 else if (torture_onoff_failures()) 627 else if (torture_onoff_failures())
343 lock_torture_print_module_parms(cur_ops, 628 lock_torture_print_module_parms(cxt.cur_ops,
344 "End of test: LOCK_HOTPLUG"); 629 "End of test: LOCK_HOTPLUG");
345 else 630 else
346 lock_torture_print_module_parms(cur_ops, 631 lock_torture_print_module_parms(cxt.cur_ops,
347 "End of test: SUCCESS"); 632 "End of test: SUCCESS");
633 torture_cleanup_end();
348} 634}
349 635
350static int __init lock_torture_init(void) 636static int __init lock_torture_init(void)
351{ 637{
352 int i; 638 int i, j;
353 int firsterr = 0; 639 int firsterr = 0;
354 static struct lock_torture_ops *torture_ops[] = { 640 static struct lock_torture_ops *torture_ops[] = {
355 &lock_busted_ops, &spin_lock_ops, &spin_lock_irq_ops, 641 &lock_busted_ops,
642 &spin_lock_ops, &spin_lock_irq_ops,
643 &rw_lock_ops, &rw_lock_irq_ops,
644 &mutex_lock_ops,
645 &rwsem_lock_ops,
356 }; 646 };
357 647
358 if (!torture_init_begin(torture_type, verbose, &locktorture_runnable)) 648 if (!torture_init_begin(torture_type, verbose, &torture_runnable))
359 return -EBUSY; 649 return -EBUSY;
360 650
361 /* Process args and tell the world that the torturer is on the job. */ 651 /* Process args and tell the world that the torturer is on the job. */
362 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { 652 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) {
363 cur_ops = torture_ops[i]; 653 cxt.cur_ops = torture_ops[i];
364 if (strcmp(torture_type, cur_ops->name) == 0) 654 if (strcmp(torture_type, cxt.cur_ops->name) == 0)
365 break; 655 break;
366 } 656 }
367 if (i == ARRAY_SIZE(torture_ops)) { 657 if (i == ARRAY_SIZE(torture_ops)) {
@@ -374,31 +664,69 @@ static int __init lock_torture_init(void)
374 torture_init_end(); 664 torture_init_end();
375 return -EINVAL; 665 return -EINVAL;
376 } 666 }
377 if (cur_ops->init) 667 if (cxt.cur_ops->init)
378 cur_ops->init(); /* no "goto unwind" prior to this point!!! */ 668 cxt.cur_ops->init(); /* no "goto unwind" prior to this point!!! */
379 669
380 if (nwriters_stress >= 0) 670 if (nwriters_stress >= 0)
381 nrealwriters_stress = nwriters_stress; 671 cxt.nrealwriters_stress = nwriters_stress;
382 else 672 else
383 nrealwriters_stress = 2 * num_online_cpus(); 673 cxt.nrealwriters_stress = 2 * num_online_cpus();
384 lock_torture_print_module_parms(cur_ops, "Start of test"); 674
675#ifdef CONFIG_DEBUG_MUTEXES
676 if (strncmp(torture_type, "mutex", 5) == 0)
677 cxt.debug_lock = true;
678#endif
679#ifdef CONFIG_DEBUG_SPINLOCK
680 if ((strncmp(torture_type, "spin", 4) == 0) ||
681 (strncmp(torture_type, "rw_lock", 7) == 0))
682 cxt.debug_lock = true;
683#endif
385 684
386 /* Initialize the statistics so that each run gets its own numbers. */ 685 /* Initialize the statistics so that each run gets its own numbers. */
387 686
388 lock_is_write_held = 0; 687 lock_is_write_held = 0;
389 lwsa = kmalloc(sizeof(*lwsa) * nrealwriters_stress, GFP_KERNEL); 688 cxt.lwsa = kmalloc(sizeof(*cxt.lwsa) * cxt.nrealwriters_stress, GFP_KERNEL);
390 if (lwsa == NULL) { 689 if (cxt.lwsa == NULL) {
391 VERBOSE_TOROUT_STRING("lwsa: Out of memory"); 690 VERBOSE_TOROUT_STRING("cxt.lwsa: Out of memory");
392 firsterr = -ENOMEM; 691 firsterr = -ENOMEM;
393 goto unwind; 692 goto unwind;
394 } 693 }
395 for (i = 0; i < nrealwriters_stress; i++) { 694 for (i = 0; i < cxt.nrealwriters_stress; i++) {
396 lwsa[i].n_write_lock_fail = 0; 695 cxt.lwsa[i].n_lock_fail = 0;
397 lwsa[i].n_write_lock_acquired = 0; 696 cxt.lwsa[i].n_lock_acquired = 0;
398 } 697 }
399 698
400 /* Start up the kthreads. */ 699 if (cxt.cur_ops->readlock) {
700 if (nreaders_stress >= 0)
701 cxt.nrealreaders_stress = nreaders_stress;
702 else {
703 /*
704 * By default distribute evenly the number of
705 * readers and writers. We still run the same number
706 * of threads as the writer-only locks default.
707 */
708 if (nwriters_stress < 0) /* user doesn't care */
709 cxt.nrealwriters_stress = num_online_cpus();
710 cxt.nrealreaders_stress = cxt.nrealwriters_stress;
711 }
712
713 lock_is_read_held = 0;
714 cxt.lrsa = kmalloc(sizeof(*cxt.lrsa) * cxt.nrealreaders_stress, GFP_KERNEL);
715 if (cxt.lrsa == NULL) {
716 VERBOSE_TOROUT_STRING("cxt.lrsa: Out of memory");
717 firsterr = -ENOMEM;
718 kfree(cxt.lwsa);
719 goto unwind;
720 }
721
722 for (i = 0; i < cxt.nrealreaders_stress; i++) {
723 cxt.lrsa[i].n_lock_fail = 0;
724 cxt.lrsa[i].n_lock_acquired = 0;
725 }
726 }
727 lock_torture_print_module_parms(cxt.cur_ops, "Start of test");
401 728
729 /* Prepare torture context. */
402 if (onoff_interval > 0) { 730 if (onoff_interval > 0) {
403 firsterr = torture_onoff_init(onoff_holdoff * HZ, 731 firsterr = torture_onoff_init(onoff_holdoff * HZ,
404 onoff_interval * HZ); 732 onoff_interval * HZ);
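
[Editor's note: the defaulting logic earlier in this hunk is worth spelling out: write-only lock types keep the old default of 2*num_online_cpus() writers, while lock types with a read side and no explicit counts split the same total evenly between writers and readers. The helper below reproduces that decision table in plain C; the names and the ncpus parameter are illustrative.]

    #include <stdio.h>

    static void pick_counts(int ncpus, int nwriters_stress, int nreaders_stress,
                            int has_readlock, int *writers, int *readers)
    {
        *writers = (nwriters_stress >= 0) ? nwriters_stress : 2 * ncpus;
        *readers = 0;

        if (has_readlock) {
            if (nreaders_stress >= 0) {
                *readers = nreaders_stress;
            } else {
                if (nwriters_stress < 0)        /* user didn't care: split evenly */
                    *writers = ncpus;
                *readers = *writers;
            }
        }
    }

    int main(void)
    {
        int w, r;

        pick_counts(4, -1, -1, 0, &w, &r);
        printf("write-only lock, all defaults: %d writers, %d readers\n", w, r);
        pick_counts(4, -1, -1, 1, &w, &r);
        printf("rw lock, all defaults:         %d writers, %d readers\n", w, r);
        pick_counts(4, 6, -1, 1, &w, &r);
        printf("rw lock, nwriters_stress=6:    %d writers, %d readers\n", w, r);
        return 0;
    }
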
@@ -422,18 +750,51 @@ static int __init lock_torture_init(void)
422 goto unwind; 750 goto unwind;
423 } 751 }
424 752
425 writer_tasks = kzalloc(nrealwriters_stress * sizeof(writer_tasks[0]), 753 writer_tasks = kzalloc(cxt.nrealwriters_stress * sizeof(writer_tasks[0]),
426 GFP_KERNEL); 754 GFP_KERNEL);
427 if (writer_tasks == NULL) { 755 if (writer_tasks == NULL) {
428 VERBOSE_TOROUT_ERRSTRING("writer_tasks: Out of memory"); 756 VERBOSE_TOROUT_ERRSTRING("writer_tasks: Out of memory");
429 firsterr = -ENOMEM; 757 firsterr = -ENOMEM;
430 goto unwind; 758 goto unwind;
431 } 759 }
432 for (i = 0; i < nrealwriters_stress; i++) { 760
433 firsterr = torture_create_kthread(lock_torture_writer, &lwsa[i], 761 if (cxt.cur_ops->readlock) {
762 reader_tasks = kzalloc(cxt.nrealreaders_stress * sizeof(reader_tasks[0]),
763 GFP_KERNEL);
764 if (reader_tasks == NULL) {
765 VERBOSE_TOROUT_ERRSTRING("reader_tasks: Out of memory");
766 firsterr = -ENOMEM;
767 goto unwind;
768 }
769 }
770
771 /*
772 * Create the kthreads and start torturing (oh, those poor little locks).
773 *
774 * TODO: Note that we interleave writers with readers, giving writers a
775 * slight advantage, by creating their kthreads first. This can be modified
776 * for very specific needs, or even let the user choose the policy, if
777 * ever wanted.
778 */
779 for (i = 0, j = 0; i < cxt.nrealwriters_stress ||
780 j < cxt.nrealreaders_stress; i++, j++) {
781 if (i >= cxt.nrealwriters_stress)
782 goto create_reader;
783
784 /* Create writer. */
785 firsterr = torture_create_kthread(lock_torture_writer, &cxt.lwsa[i],
434 writer_tasks[i]); 786 writer_tasks[i]);
435 if (firsterr) 787 if (firsterr)
436 goto unwind; 788 goto unwind;
789
790 create_reader:
791 if (cxt.cur_ops->readlock == NULL || (j >= cxt.nrealreaders_stress))
792 continue;
793 /* Create reader. */
794 firsterr = torture_create_kthread(lock_torture_reader, &cxt.lrsa[j],
795 reader_tasks[j]);
796 if (firsterr)
797 goto unwind;
437 } 798 }
438 if (stat_interval > 0) { 799 if (stat_interval > 0) {
439 firsterr = torture_create_kthread(lock_torture_stats, NULL, 800 firsterr = torture_create_kthread(lock_torture_stats, NULL,
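
[Editor's note: the creation loop in the last hunk interleaves one writer and one reader per iteration, skipping whichever side is already fully populated, so writers get the slight head start the TODO comment mentions. Stripped of the kthread calls and error handling, the control flow is simply:]

    #include <stdio.h>

    int main(void)
    {
        const int nwriters = 3, nreaders = 5;
        int i, j;

        for (i = 0, j = 0; i < nwriters || j < nreaders; i++, j++) {
            if (i < nwriters)
                printf("create writer %d\n", i);
            if (j < nreaders)           /* skipped entirely for write-only locks */
                printf("create reader %d\n", j);
        }
        return 0;
    }
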
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index 23e89c5930e9..4d60986fcbee 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -56,9 +56,6 @@ do { \
56 * If the lock has already been acquired, then this will proceed to spin 56 * If the lock has already been acquired, then this will proceed to spin
57 * on this node->locked until the previous lock holder sets the node->locked 57 * on this node->locked until the previous lock holder sets the node->locked
58 * in mcs_spin_unlock(). 58 * in mcs_spin_unlock().
59 *
60 * We don't inline mcs_spin_lock() so that perf can correctly account for the
61 * time spent in this lock function.
62 */ 59 */
63static inline 60static inline
64void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node) 61void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c
index 5cf6731b98e9..3ef3736002d8 100644
--- a/kernel/locking/mutex-debug.c
+++ b/kernel/locking/mutex-debug.c
@@ -80,13 +80,13 @@ void debug_mutex_unlock(struct mutex *lock)
80 DEBUG_LOCKS_WARN_ON(lock->owner != current); 80 DEBUG_LOCKS_WARN_ON(lock->owner != current);
81 81
82 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); 82 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
83 mutex_clear_owner(lock);
84 } 83 }
85 84
86 /* 85 /*
87 * __mutex_slowpath_needs_to_unlock() is explicitly 0 for debug 86 * __mutex_slowpath_needs_to_unlock() is explicitly 0 for debug
88 * mutexes so that we can do it here after we've verified state. 87 * mutexes so that we can do it here after we've verified state.
89 */ 88 */
89 mutex_clear_owner(lock);
90 atomic_set(&lock->count, 1); 90 atomic_set(&lock->count, 1);
91} 91}
92 92
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index ae712b25e492..454195194d4a 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -15,7 +15,7 @@
15 * by Steven Rostedt, based on work by Gregory Haskins, Peter Morreale 15 * by Steven Rostedt, based on work by Gregory Haskins, Peter Morreale
16 * and Sven Dietrich. 16 * and Sven Dietrich.
17 * 17 *
18 * Also see Documentation/mutex-design.txt. 18 * Also see Documentation/locking/mutex-design.txt.
19 */ 19 */
20#include <linux/mutex.h> 20#include <linux/mutex.h>
21#include <linux/ww_mutex.h> 21#include <linux/ww_mutex.h>
@@ -106,6 +106,92 @@ void __sched mutex_lock(struct mutex *lock)
106EXPORT_SYMBOL(mutex_lock); 106EXPORT_SYMBOL(mutex_lock);
107#endif 107#endif
108 108
109static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww,
110 struct ww_acquire_ctx *ww_ctx)
111{
112#ifdef CONFIG_DEBUG_MUTEXES
113 /*
114 * If this WARN_ON triggers, you used ww_mutex_lock to acquire,
115 * but released with a normal mutex_unlock in this call.
116 *
117 * This should never happen, always use ww_mutex_unlock.
118 */
119 DEBUG_LOCKS_WARN_ON(ww->ctx);
120
121 /*
122 * Not quite done after calling ww_acquire_done() ?
123 */
124 DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire);
125
126 if (ww_ctx->contending_lock) {
127 /*
128 * After -EDEADLK you tried to
129 * acquire a different ww_mutex? Bad!
130 */
131 DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww);
132
133 /*
134 * You called ww_mutex_lock after receiving -EDEADLK,
135 * but 'forgot' to unlock everything else first?
136 */
137 DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0);
138 ww_ctx->contending_lock = NULL;
139 }
140
141 /*
142 * Naughty, using a different class will lead to undefined behavior!
143 */
144 DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class);
145#endif
146 ww_ctx->acquired++;
147}
148
149/*
150 * after acquiring lock with fastpath or when we lost out in contested
151 * slowpath, set ctx and wake up any waiters so they can recheck.
152 *
153 * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set,
154 * as the fastpath and opportunistic spinning are disabled in that case.
155 */
156static __always_inline void
157ww_mutex_set_context_fastpath(struct ww_mutex *lock,
158 struct ww_acquire_ctx *ctx)
159{
160 unsigned long flags;
161 struct mutex_waiter *cur;
162
163 ww_mutex_lock_acquired(lock, ctx);
164
165 lock->ctx = ctx;
166
167 /*
168 * The lock->ctx update should be visible on all cores before
169 * the atomic read is done, otherwise contended waiters might be
170 * missed. The contended waiters will either see ww_ctx == NULL
171 * and keep spinning, or it will acquire wait_lock, add itself
172 * to waiter list and sleep.
173 */
174 smp_mb(); /* ^^^ */
175
176 /*
177 * Check if lock is contended, if not there is nobody to wake up
178 */
179 if (likely(atomic_read(&lock->base.count) == 0))
180 return;
181
182 /*
183 * Uh oh, we raced in fastpath, wake up everyone in this case,
184 * so they can see the new lock->ctx.
185 */
186 spin_lock_mutex(&lock->base.wait_lock, flags);
187 list_for_each_entry(cur, &lock->base.wait_list, list) {
188 debug_mutex_wake_waiter(&lock->base, cur);
189 wake_up_process(cur->task);
190 }
191 spin_unlock_mutex(&lock->base.wait_lock, flags);
192}
193
194
109#ifdef CONFIG_MUTEX_SPIN_ON_OWNER 195#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
110/* 196/*
111 * In order to avoid a stampede of mutex spinners from acquiring the mutex 197 * In order to avoid a stampede of mutex spinners from acquiring the mutex
@@ -180,6 +266,135 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock)
180 */ 266 */
181 return retval; 267 return retval;
182} 268}
269
270/*
271 * Atomically try to take the lock when it is available
272 */
273static inline bool mutex_try_to_acquire(struct mutex *lock)
274{
275 return !mutex_is_locked(lock) &&
276 (atomic_cmpxchg(&lock->count, 1, 0) == 1);
277}
278
279/*
280 * Optimistic spinning.
281 *
282 * We try to spin for acquisition when we find that the lock owner
283 * is currently running on a (different) CPU and while we don't
284 * need to reschedule. The rationale is that if the lock owner is
285 * running, it is likely to release the lock soon.
286 *
287 * Since this needs the lock owner, and this mutex implementation
288 * doesn't track the owner atomically in the lock field, we need to
289 * track it non-atomically.
290 *
291 * We can't do this for DEBUG_MUTEXES because that relies on wait_lock
292 * to serialize everything.
293 *
294 * The mutex spinners are queued up using MCS lock so that only one
295 * spinner can compete for the mutex. However, if mutex spinning isn't
296 * going to happen, there is no point in going through the lock/unlock
297 * overhead.
298 *
299 * Returns true when the lock was taken, otherwise false, indicating
300 * that we need to jump to the slowpath and sleep.
301 */
302static bool mutex_optimistic_spin(struct mutex *lock,
303 struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx)
304{
305 struct task_struct *task = current;
306
307 if (!mutex_can_spin_on_owner(lock))
308 goto done;
309
310 if (!osq_lock(&lock->osq))
311 goto done;
312
313 while (true) {
314 struct task_struct *owner;
315
316 if (use_ww_ctx && ww_ctx->acquired > 0) {
317 struct ww_mutex *ww;
318
319 ww = container_of(lock, struct ww_mutex, base);
320 /*
321 * If ww->ctx is set the contents are undefined, only
322 * by acquiring wait_lock there is a guarantee that
323 * they are not invalid when reading.
324 *
325 * As such, when deadlock detection needs to be
326 * performed the optimistic spinning cannot be done.
327 */
328 if (ACCESS_ONCE(ww->ctx))
329 break;
330 }
331
332 /*
333 * If there's an owner, wait for it to either
334 * release the lock or go to sleep.
335 */
336 owner = ACCESS_ONCE(lock->owner);
337 if (owner && !mutex_spin_on_owner(lock, owner))
338 break;
339
340 /* Try to acquire the mutex if it is unlocked. */
341 if (mutex_try_to_acquire(lock)) {
342 lock_acquired(&lock->dep_map, ip);
343
344 if (use_ww_ctx) {
345 struct ww_mutex *ww;
346 ww = container_of(lock, struct ww_mutex, base);
347
348 ww_mutex_set_context_fastpath(ww, ww_ctx);
349 }
350
351 mutex_set_owner(lock);
352 osq_unlock(&lock->osq);
353 return true;
354 }
355
356 /*
357 * When there's no owner, we might have preempted between the
358 * owner acquiring the lock and setting the owner field. If
359 * we're an RT task that will live-lock because we won't let
360 * the owner complete.
361 */
362 if (!owner && (need_resched() || rt_task(task)))
363 break;
364
365 /*
366 * The cpu_relax() call is a compiler barrier which forces
367 * everything in this loop to be re-loaded. We don't need
368 * memory barriers as we'll eventually observe the right
369 * values at the cost of a few extra spins.
370 */
371 cpu_relax_lowlatency();
372 }
373
374 osq_unlock(&lock->osq);
375done:
376 /*
377 * If we fell out of the spin path because of need_resched(),
378 * reschedule now, before we try-lock the mutex. This avoids getting
379 * scheduled out right after we obtained the mutex.
380 */
381 if (need_resched()) {
382 /*
383 * We _should_ have TASK_RUNNING here, but just in case
384 * we do not, make it so, otherwise we might get stuck.
385 */
386 __set_current_state(TASK_RUNNING);
387 schedule_preempt_disabled();
388 }
389
390 return false;
391}
392#else
393static bool mutex_optimistic_spin(struct mutex *lock,
394 struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx)
395{
396 return false;
397}
183#endif 398#endif
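
[Editor's note: mutex_optimistic_spin() above packages the existing heuristic -- spin while the owner is running on another CPU, try a cheap test-then-cmpxchg acquisition, and fall back to the sleeping slowpath on need_resched() or when the owner blocks. The user-space sketch below keeps only the spin-then-sleep shape with C11 atomics; it has no owner tracking or OSQ queueing, and the spin bound is arbitrary.]

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <sched.h>
    #include <stdio.h>

    static atomic_int lock_word = 1;    /* 1 == unlocked, 0 == locked */

    /* Mirrors mutex_try_to_acquire(): read first, cmpxchg only if it can win. */
    static bool try_acquire(void)
    {
        int expected = 1;

        return atomic_load(&lock_word) == 1 &&
               atomic_compare_exchange_strong(&lock_word, &expected, 0);
    }

    static void lock_slowpath(void)
    {
        while (!try_acquire())          /* stand-in for wait_list + schedule() */
            sched_yield();
    }

    static void demo_lock(void)
    {
        for (int spins = 0; spins < 1000; spins++)
            if (try_acquire())
                return;                 /* optimistic path: no sleeping */
        lock_slowpath();
    }

    static void demo_unlock(void)
    {
        atomic_store(&lock_word, 1);
    }

    int main(void)
    {
        demo_lock();
        demo_unlock();
        puts("lock/unlock ok");
        return 0;
    }
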
184 399
185__visible __used noinline 400__visible __used noinline
@@ -277,91 +492,6 @@ __mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx)
277 return 0; 492 return 0;
278} 493}
279 494
280static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww,
281 struct ww_acquire_ctx *ww_ctx)
282{
283#ifdef CONFIG_DEBUG_MUTEXES
284 /*
285 * If this WARN_ON triggers, you used ww_mutex_lock to acquire,
286 * but released with a normal mutex_unlock in this call.
287 *
288 * This should never happen, always use ww_mutex_unlock.
289 */
290 DEBUG_LOCKS_WARN_ON(ww->ctx);
291
292 /*
293 * Not quite done after calling ww_acquire_done() ?
294 */
295 DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire);
296
297 if (ww_ctx->contending_lock) {
298 /*
299 * After -EDEADLK you tried to
300 * acquire a different ww_mutex? Bad!
301 */
302 DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww);
303
304 /*
305 * You called ww_mutex_lock after receiving -EDEADLK,
306 * but 'forgot' to unlock everything else first?
307 */
308 DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0);
309 ww_ctx->contending_lock = NULL;
310 }
311
312 /*
313 * Naughty, using a different class will lead to undefined behavior!
314 */
315 DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class);
316#endif
317 ww_ctx->acquired++;
318}
319
320/*
321 * after acquiring lock with fastpath or when we lost out in contested
322 * slowpath, set ctx and wake up any waiters so they can recheck.
323 *
324 * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set,
325 * as the fastpath and opportunistic spinning are disabled in that case.
326 */
327static __always_inline void
328ww_mutex_set_context_fastpath(struct ww_mutex *lock,
329 struct ww_acquire_ctx *ctx)
330{
331 unsigned long flags;
332 struct mutex_waiter *cur;
333
334 ww_mutex_lock_acquired(lock, ctx);
335
336 lock->ctx = ctx;
337
338 /*
339 * The lock->ctx update should be visible on all cores before
340 * the atomic read is done, otherwise contended waiters might be
341 * missed. The contended waiters will either see ww_ctx == NULL
342 * and keep spinning, or it will acquire wait_lock, add itself
343 * to waiter list and sleep.
344 */
345 smp_mb(); /* ^^^ */
346
347 /*
348 * Check if lock is contended, if not there is nobody to wake up
349 */
350 if (likely(atomic_read(&lock->base.count) == 0))
351 return;
352
353 /*
354 * Uh oh, we raced in fastpath, wake up everyone in this case,
355 * so they can see the new lock->ctx.
356 */
357 spin_lock_mutex(&lock->base.wait_lock, flags);
358 list_for_each_entry(cur, &lock->base.wait_list, list) {
359 debug_mutex_wake_waiter(&lock->base, cur);
360 wake_up_process(cur->task);
361 }
362 spin_unlock_mutex(&lock->base.wait_lock, flags);
363}
364
365/* 495/*
366 * Lock a mutex (possibly interruptible), slowpath: 496 * Lock a mutex (possibly interruptible), slowpath:
367 */ 497 */
@@ -378,104 +508,12 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
378 preempt_disable(); 508 preempt_disable();
379 mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); 509 mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
380 510
381#ifdef CONFIG_MUTEX_SPIN_ON_OWNER 511 if (mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx)) {
382 /* 512 /* got the lock, yay! */
383 * Optimistic spinning. 513 preempt_enable();
384 * 514 return 0;
385 * We try to spin for acquisition when we find that the lock owner
386 * is currently running on a (different) CPU and while we don't
387 * need to reschedule. The rationale is that if the lock owner is
388 * running, it is likely to release the lock soon.
389 *
390 * Since this needs the lock owner, and this mutex implementation
391 * doesn't track the owner atomically in the lock field, we need to
392 * track it non-atomically.
393 *
394 * We can't do this for DEBUG_MUTEXES because that relies on wait_lock
395 * to serialize everything.
396 *
397 * The mutex spinners are queued up using MCS lock so that only one
398 * spinner can compete for the mutex. However, if mutex spinning isn't
399 * going to happen, there is no point in going through the lock/unlock
400 * overhead.
401 */
402 if (!mutex_can_spin_on_owner(lock))
403 goto slowpath;
404
405 if (!osq_lock(&lock->osq))
406 goto slowpath;
407
408 for (;;) {
409 struct task_struct *owner;
410
411 if (use_ww_ctx && ww_ctx->acquired > 0) {
412 struct ww_mutex *ww;
413
414 ww = container_of(lock, struct ww_mutex, base);
415 /*
416 * If ww->ctx is set the contents are undefined, only
417 * by acquiring wait_lock there is a guarantee that
418 * they are not invalid when reading.
419 *
420 * As such, when deadlock detection needs to be
421 * performed the optimistic spinning cannot be done.
422 */
423 if (ACCESS_ONCE(ww->ctx))
424 break;
425 }
426
427 /*
428 * If there's an owner, wait for it to either
429 * release the lock or go to sleep.
430 */
431 owner = ACCESS_ONCE(lock->owner);
432 if (owner && !mutex_spin_on_owner(lock, owner))
433 break;
434
435 /* Try to acquire the mutex if it is unlocked. */
436 if (!mutex_is_locked(lock) &&
437 (atomic_cmpxchg(&lock->count, 1, 0) == 1)) {
438 lock_acquired(&lock->dep_map, ip);
439 if (use_ww_ctx) {
440 struct ww_mutex *ww;
441 ww = container_of(lock, struct ww_mutex, base);
442
443 ww_mutex_set_context_fastpath(ww, ww_ctx);
444 }
445
446 mutex_set_owner(lock);
447 osq_unlock(&lock->osq);
448 preempt_enable();
449 return 0;
450 }
451
452 /*
453 * When there's no owner, we might have preempted between the
454 * owner acquiring the lock and setting the owner field. If
455 * we're an RT task that will live-lock because we won't let
456 * the owner complete.
457 */
458 if (!owner && (need_resched() || rt_task(task)))
459 break;
460
461 /*
462 * The cpu_relax() call is a compiler barrier which forces
463 * everything in this loop to be re-loaded. We don't need
464 * memory barriers as we'll eventually observe the right
465 * values at the cost of a few extra spins.
466 */
467 cpu_relax_lowlatency();
468 } 515 }
469 osq_unlock(&lock->osq); 516
470slowpath:
471 /*
472 * If we fell out of the spin path because of need_resched(),
473 * reschedule now, before we try-lock the mutex. This avoids getting
474 * scheduled out right after we obtained the mutex.
475 */
476 if (need_resched())
477 schedule_preempt_disabled();
478#endif
479 spin_lock_mutex(&lock->wait_lock, flags); 517 spin_lock_mutex(&lock->wait_lock, flags);
480 518
481 /* 519 /*
@@ -679,15 +717,21 @@ EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible);
679 * Release the lock, slowpath: 717 * Release the lock, slowpath:
680 */ 718 */
681static inline void 719static inline void
682__mutex_unlock_common_slowpath(atomic_t *lock_count, int nested) 720__mutex_unlock_common_slowpath(struct mutex *lock, int nested)
683{ 721{
684 struct mutex *lock = container_of(lock_count, struct mutex, count);
685 unsigned long flags; 722 unsigned long flags;
686 723
687 /* 724 /*
688 * some architectures leave the lock unlocked in the fastpath failure 725 * As a performance measure, release the lock before doing other
 726 * wakeup-related duties that follow. This allows other tasks to acquire
727 * the lock sooner, while still handling cleanups in past unlock calls.
728 * This can be done as we do not enforce strict equivalence between the
729 * mutex counter and wait_list.
730 *
731 *
732 * Some architectures leave the lock unlocked in the fastpath failure
689 * case, others need to leave it locked. In the later case we have to 733 * case, others need to leave it locked. In the later case we have to
690 * unlock it here 734 * unlock it here - as the lock counter is currently 0 or negative.
691 */ 735 */
692 if (__mutex_slowpath_needs_to_unlock()) 736 if (__mutex_slowpath_needs_to_unlock())
693 atomic_set(&lock->count, 1); 737 atomic_set(&lock->count, 1);
@@ -716,7 +760,9 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested)
716__visible void 760__visible void
717__mutex_unlock_slowpath(atomic_t *lock_count) 761__mutex_unlock_slowpath(atomic_t *lock_count)
718{ 762{
719 __mutex_unlock_common_slowpath(lock_count, 1); 763 struct mutex *lock = container_of(lock_count, struct mutex, count);
764
765 __mutex_unlock_common_slowpath(lock, 1);
720} 766}
721 767
722#ifndef CONFIG_DEBUG_LOCK_ALLOC 768#ifndef CONFIG_DEBUG_LOCK_ALLOC
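
[Editor's note: the unlock refactor above moves the container_of() step into __mutex_unlock_slowpath(), so the common slowpath now works on a struct mutex * directly. For readers unfamiliar with the idiom, container_of() is just offsetof() arithmetic that recovers the enclosing object from a pointer to one of its members; a stand-alone illustration with made-up types:]

    #include <stddef.h>
    #include <stdio.h>

    struct demo_mutex {
        int count;                      /* plays the role of the atomic_t count */
        int owner;
    };

    #define demo_container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    int main(void)
    {
        struct demo_mutex m = { .count = 1, .owner = 42 };
        int *count_ptr = &m.count;
        struct demo_mutex *back =
            demo_container_of(count_ptr, struct demo_mutex, count);

        printf("owner recovered via container_of: %d\n", back->owner);
        return 0;
    }
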
diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h
index 4115fbf83b12..5cda397607f2 100644
--- a/kernel/locking/mutex.h
+++ b/kernel/locking/mutex.h
@@ -16,7 +16,7 @@
16#define mutex_remove_waiter(lock, waiter, ti) \ 16#define mutex_remove_waiter(lock, waiter, ti) \
17 __list_del((waiter)->list.prev, (waiter)->list.next) 17 __list_del((waiter)->list.prev, (waiter)->list.next)
18 18
19#ifdef CONFIG_SMP 19#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
20static inline void mutex_set_owner(struct mutex *lock) 20static inline void mutex_set_owner(struct mutex *lock)
21{ 21{
22 lock->owner = current; 22 lock->owner = current;
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index a0ea2a141b3b..7c98873a3077 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -8,7 +8,7 @@
8 * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt 8 * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
9 * Copyright (C) 2006 Esben Nielsen 9 * Copyright (C) 2006 Esben Nielsen
10 * 10 *
11 * See Documentation/rt-mutex-design.txt for details. 11 * See Documentation/locking/rt-mutex-design.txt for details.
12 */ 12 */
13#include <linux/spinlock.h> 13#include <linux/spinlock.h>
14#include <linux/export.h> 14#include <linux/export.h>
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index d6203faf2eb1..7628c3fc37ca 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -246,19 +246,22 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
246 246
247 return sem; 247 return sem;
248} 248}
249EXPORT_SYMBOL(rwsem_down_read_failed);
249 250
250static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) 251static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
251{ 252{
252 if (!(count & RWSEM_ACTIVE_MASK)) { 253 /*
253 /* try acquiring the write lock */ 254 * Try acquiring the write lock. Check count first in order
254 if (sem->count == RWSEM_WAITING_BIAS && 255 * to reduce unnecessary expensive cmpxchg() operations.
255 cmpxchg(&sem->count, RWSEM_WAITING_BIAS, 256 */
256 RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) { 257 if (count == RWSEM_WAITING_BIAS &&
257 if (!list_is_singular(&sem->wait_list)) 258 cmpxchg(&sem->count, RWSEM_WAITING_BIAS,
258 rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); 259 RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) {
259 return true; 260 if (!list_is_singular(&sem->wait_list))
260 } 261 rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
262 return true;
261 } 263 }
264
262 return false; 265 return false;
263} 266}
264 267
@@ -465,6 +468,7 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
465 468
466 return sem; 469 return sem;
467} 470}
471EXPORT_SYMBOL(rwsem_down_write_failed);
468 472
469/* 473/*
470 * handle waking up a waiter on the semaphore 474 * handle waking up a waiter on the semaphore
@@ -485,6 +489,7 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
485 489
486 return sem; 490 return sem;
487} 491}
492EXPORT_SYMBOL(rwsem_wake);
488 493
489/* 494/*
490 * downgrade a write lock into a read lock 495 * downgrade a write lock into a read lock
@@ -506,8 +511,4 @@ struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
506 511
507 return sem; 512 return sem;
508} 513}
509
510EXPORT_SYMBOL(rwsem_down_read_failed);
511EXPORT_SYMBOL(rwsem_down_write_failed);
512EXPORT_SYMBOL(rwsem_wake);
513EXPORT_SYMBOL(rwsem_downgrade_wake); 514EXPORT_SYMBOL(rwsem_downgrade_wake);
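
[Editor's note: the rwsem_try_write_lock() rewrite above passes in the count the caller already read and only attempts the cmpxchg() when that value shows the lock can actually be taken, avoiding a cache-line-dirtying cmpxchg that is guaranteed to fail. The same test-before-compare-and-swap idiom on a plain flag, in C11 atomics:]

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    static atomic_int state = 0;        /* 0 == free, 1 == write-locked */

    static bool try_write_lock(void)
    {
        int seen = atomic_load(&state);

        if (seen != 0)                  /* cheap read rules out a hopeless CAS */
            return false;
        return atomic_compare_exchange_strong(&state, &seen, 1);
    }

    int main(void)
    {
        printf("first try:  %s\n", try_write_lock() ? "acquired" : "busy");
        printf("second try: %s\n", try_write_lock() ? "acquired" : "busy");
        return 0;
    }
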
diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c
index 6815171a4fff..b8120abe594b 100644
--- a/kernel/locking/semaphore.c
+++ b/kernel/locking/semaphore.c
@@ -36,7 +36,7 @@
36static noinline void __down(struct semaphore *sem); 36static noinline void __down(struct semaphore *sem);
37static noinline int __down_interruptible(struct semaphore *sem); 37static noinline int __down_interruptible(struct semaphore *sem);
38static noinline int __down_killable(struct semaphore *sem); 38static noinline int __down_killable(struct semaphore *sem);
39static noinline int __down_timeout(struct semaphore *sem, long jiffies); 39static noinline int __down_timeout(struct semaphore *sem, long timeout);
40static noinline void __up(struct semaphore *sem); 40static noinline void __up(struct semaphore *sem);
41 41
42/** 42/**
@@ -145,14 +145,14 @@ EXPORT_SYMBOL(down_trylock);
145/** 145/**
146 * down_timeout - acquire the semaphore within a specified time 146 * down_timeout - acquire the semaphore within a specified time
147 * @sem: the semaphore to be acquired 147 * @sem: the semaphore to be acquired
148 * @jiffies: how long to wait before failing 148 * @timeout: how long to wait before failing
149 * 149 *
150 * Attempts to acquire the semaphore. If no more tasks are allowed to 150 * Attempts to acquire the semaphore. If no more tasks are allowed to
151 * acquire the semaphore, calling this function will put the task to sleep. 151 * acquire the semaphore, calling this function will put the task to sleep.
152 * If the semaphore is not released within the specified number of jiffies, 152 * If the semaphore is not released within the specified number of jiffies,
153 * this function returns -ETIME. It returns 0 if the semaphore was acquired. 153 * this function returns -ETIME. It returns 0 if the semaphore was acquired.
154 */ 154 */
155int down_timeout(struct semaphore *sem, long jiffies) 155int down_timeout(struct semaphore *sem, long timeout)
156{ 156{
157 unsigned long flags; 157 unsigned long flags;
158 int result = 0; 158 int result = 0;
@@ -161,7 +161,7 @@ int down_timeout(struct semaphore *sem, long jiffies)
161 if (likely(sem->count > 0)) 161 if (likely(sem->count > 0))
162 sem->count--; 162 sem->count--;
163 else 163 else
164 result = __down_timeout(sem, jiffies); 164 result = __down_timeout(sem, timeout);
165 raw_spin_unlock_irqrestore(&sem->lock, flags); 165 raw_spin_unlock_irqrestore(&sem->lock, flags);
166 166
167 return result; 167 return result;
@@ -248,9 +248,9 @@ static noinline int __sched __down_killable(struct semaphore *sem)
248 return __down_common(sem, TASK_KILLABLE, MAX_SCHEDULE_TIMEOUT); 248 return __down_common(sem, TASK_KILLABLE, MAX_SCHEDULE_TIMEOUT);
249} 249}
250 250
251static noinline int __sched __down_timeout(struct semaphore *sem, long jiffies) 251static noinline int __sched __down_timeout(struct semaphore *sem, long timeout)
252{ 252{
253 return __down_common(sem, TASK_UNINTERRUPTIBLE, jiffies); 253 return __down_common(sem, TASK_UNINTERRUPTIBLE, timeout);
254} 254}
255 255
256static noinline void __sched __up(struct semaphore *sem) 256static noinline void __sched __up(struct semaphore *sem)
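
[Editor's note] The down_timeout() change above only swaps the misleading @jiffies parameter name for @timeout; the contract stays "acquire within a deadline or fail". For readers without a kernel tree handy, here is a minimal userspace sketch of the same acquire-with-timeout pattern using POSIX semaphores. down_timeout_ms() and its millisecond argument are illustrative inventions, not the kernel API, and POSIX reports timeouts as ETIMEDOUT rather than the kernel's -ETIME. Link with -pthread on glibc.

```c
#include <semaphore.h>
#include <time.h>
#include <stdio.h>
#include <errno.h>

/* Acquire 'sem' or give up after 'timeout_ms' milliseconds; returns 0 on
 * success, -ETIMEDOUT on timeout, -errno otherwise. */
static int down_timeout_ms(sem_t *sem, long timeout_ms)
{
	struct timespec deadline;

	/* sem_timedwait() takes an absolute CLOCK_REALTIME deadline. */
	clock_gettime(CLOCK_REALTIME, &deadline);
	deadline.tv_sec += timeout_ms / 1000;
	deadline.tv_nsec += (timeout_ms % 1000) * 1000000L;
	if (deadline.tv_nsec >= 1000000000L) {
		deadline.tv_sec++;
		deadline.tv_nsec -= 1000000000L;
	}

	if (sem_timedwait(sem, &deadline) == 0)
		return 0;
	return errno == ETIMEDOUT ? -ETIMEDOUT : -errno;
}

int main(void)
{
	sem_t sem;

	sem_init(&sem, 0, 0);	/* start unavailable */
	printf("while unavailable: %d\n", down_timeout_ms(&sem, 200));
	sem_post(&sem);		/* make one unit available */
	printf("while available:   %d\n", down_timeout_ms(&sem, 200));
	sem_destroy(&sem);
	return 0;
}
```
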
diff --git a/kernel/module.c b/kernel/module.c
index 03214bd288e9..3965511ae133 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -42,7 +42,6 @@
42#include <linux/vermagic.h> 42#include <linux/vermagic.h>
43#include <linux/notifier.h> 43#include <linux/notifier.h>
44#include <linux/sched.h> 44#include <linux/sched.h>
45#include <linux/stop_machine.h>
46#include <linux/device.h> 45#include <linux/device.h>
47#include <linux/string.h> 46#include <linux/string.h>
48#include <linux/mutex.h> 47#include <linux/mutex.h>
@@ -98,7 +97,7 @@
98 * 1) List of modules (also safely readable with preempt_disable), 97 * 1) List of modules (also safely readable with preempt_disable),
99 * 2) module_use links, 98 * 2) module_use links,
100 * 3) module_addr_min/module_addr_max. 99 * 3) module_addr_min/module_addr_max.
101 * (delete uses stop_machine/add uses RCU list operations). */ 100 * (delete and add uses RCU list operations). */
102DEFINE_MUTEX(module_mutex); 101DEFINE_MUTEX(module_mutex);
103EXPORT_SYMBOL_GPL(module_mutex); 102EXPORT_SYMBOL_GPL(module_mutex);
104static LIST_HEAD(modules); 103static LIST_HEAD(modules);
@@ -135,7 +134,7 @@ static int param_set_bool_enable_only(const char *val,
135} 134}
136 135
137static const struct kernel_param_ops param_ops_bool_enable_only = { 136static const struct kernel_param_ops param_ops_bool_enable_only = {
138 .flags = KERNEL_PARAM_FL_NOARG, 137 .flags = KERNEL_PARAM_OPS_FL_NOARG,
139 .set = param_set_bool_enable_only, 138 .set = param_set_bool_enable_only,
140 .get = param_get_bool, 139 .get = param_get_bool,
141}; 140};
@@ -158,13 +157,13 @@ static BLOCKING_NOTIFIER_HEAD(module_notify_list);
158 * Protected by module_mutex. */ 157 * Protected by module_mutex. */
159static unsigned long module_addr_min = -1UL, module_addr_max = 0; 158static unsigned long module_addr_min = -1UL, module_addr_max = 0;
160 159
161int register_module_notifier(struct notifier_block * nb) 160int register_module_notifier(struct notifier_block *nb)
162{ 161{
163 return blocking_notifier_chain_register(&module_notify_list, nb); 162 return blocking_notifier_chain_register(&module_notify_list, nb);
164} 163}
165EXPORT_SYMBOL(register_module_notifier); 164EXPORT_SYMBOL(register_module_notifier);
166 165
167int unregister_module_notifier(struct notifier_block * nb) 166int unregister_module_notifier(struct notifier_block *nb)
168{ 167{
169 return blocking_notifier_chain_unregister(&module_notify_list, nb); 168 return blocking_notifier_chain_unregister(&module_notify_list, nb);
170} 169}
@@ -628,18 +627,23 @@ static char last_unloaded_module[MODULE_NAME_LEN+1];
628 627
629EXPORT_TRACEPOINT_SYMBOL(module_get); 628EXPORT_TRACEPOINT_SYMBOL(module_get);
630 629
630/* MODULE_REF_BASE is the base reference count by kmodule loader. */
631#define MODULE_REF_BASE 1
632
631/* Init the unload section of the module. */ 633/* Init the unload section of the module. */
632static int module_unload_init(struct module *mod) 634static int module_unload_init(struct module *mod)
633{ 635{
634 mod->refptr = alloc_percpu(struct module_ref); 636 /*
635 if (!mod->refptr) 637 * Initialize reference counter to MODULE_REF_BASE.
636 return -ENOMEM; 638 * refcnt == 0 means module is going.
639 */
640 atomic_set(&mod->refcnt, MODULE_REF_BASE);
637 641
638 INIT_LIST_HEAD(&mod->source_list); 642 INIT_LIST_HEAD(&mod->source_list);
639 INIT_LIST_HEAD(&mod->target_list); 643 INIT_LIST_HEAD(&mod->target_list);
640 644
641 /* Hold reference count during initialization. */ 645 /* Hold reference count during initialization. */
642 raw_cpu_write(mod->refptr->incs, 1); 646 atomic_inc(&mod->refcnt);
643 647
644 return 0; 648 return 0;
645} 649}
@@ -721,8 +725,6 @@ static void module_unload_free(struct module *mod)
721 kfree(use); 725 kfree(use);
722 } 726 }
723 mutex_unlock(&module_mutex); 727 mutex_unlock(&module_mutex);
724
725 free_percpu(mod->refptr);
726} 728}
727 729
728#ifdef CONFIG_MODULE_FORCE_UNLOAD 730#ifdef CONFIG_MODULE_FORCE_UNLOAD
@@ -740,60 +742,39 @@ static inline int try_force_unload(unsigned int flags)
740} 742}
741#endif /* CONFIG_MODULE_FORCE_UNLOAD */ 743#endif /* CONFIG_MODULE_FORCE_UNLOAD */
742 744
743struct stopref 745/* Try to release refcount of module, 0 means success. */
746static int try_release_module_ref(struct module *mod)
744{ 747{
745 struct module *mod; 748 int ret;
746 int flags;
747 int *forced;
748};
749 749
750/* Whole machine is stopped with interrupts off when this runs. */ 750 /* Try to decrement refcnt which we set at loading */
751static int __try_stop_module(void *_sref) 751 ret = atomic_sub_return(MODULE_REF_BASE, &mod->refcnt);
752{ 752 BUG_ON(ret < 0);
753 struct stopref *sref = _sref; 753 if (ret)
754 /* Someone can put this right now, recover with checking */
755 ret = atomic_add_unless(&mod->refcnt, MODULE_REF_BASE, 0);
756
757 return ret;
758}
754 759
760static int try_stop_module(struct module *mod, int flags, int *forced)
761{
755 /* If it's not unused, quit unless we're forcing. */ 762 /* If it's not unused, quit unless we're forcing. */
756 if (module_refcount(sref->mod) != 0) { 763 if (try_release_module_ref(mod) != 0) {
757 if (!(*sref->forced = try_force_unload(sref->flags))) 764 *forced = try_force_unload(flags);
765 if (!(*forced))
758 return -EWOULDBLOCK; 766 return -EWOULDBLOCK;
759 } 767 }
760 768
761 /* Mark it as dying. */ 769 /* Mark it as dying. */
762 sref->mod->state = MODULE_STATE_GOING; 770 mod->state = MODULE_STATE_GOING;
763 return 0;
764}
765 771
766static int try_stop_module(struct module *mod, int flags, int *forced) 772 return 0;
767{
768 struct stopref sref = { mod, flags, forced };
769
770 return stop_machine(__try_stop_module, &sref, NULL);
771} 773}
772 774
773unsigned long module_refcount(struct module *mod) 775unsigned long module_refcount(struct module *mod)
774{ 776{
775 unsigned long incs = 0, decs = 0; 777 return (unsigned long)atomic_read(&mod->refcnt) - MODULE_REF_BASE;
776 int cpu;
777
778 for_each_possible_cpu(cpu)
779 decs += per_cpu_ptr(mod->refptr, cpu)->decs;
780 /*
781 * ensure the incs are added up after the decs.
782 * module_put ensures incs are visible before decs with smp_wmb.
783 *
784 * This 2-count scheme avoids the situation where the refcount
785 * for CPU0 is read, then CPU0 increments the module refcount,
786 * then CPU1 drops that refcount, then the refcount for CPU1 is
787 * read. We would record a decrement but not its corresponding
788 * increment so we would see a low count (disaster).
789 *
790 * Rare situation? But module_refcount can be preempted, and we
791 * might be tallying up 4096+ CPUs. So it is not impossible.
792 */
793 smp_rmb();
794 for_each_possible_cpu(cpu)
795 incs += per_cpu_ptr(mod->refptr, cpu)->incs;
796 return incs - decs;
797} 778}
798EXPORT_SYMBOL(module_refcount); 779EXPORT_SYMBOL(module_refcount);
799 780
@@ -877,8 +858,10 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod)
877 858
878 seq_printf(m, " %lu ", module_refcount(mod)); 859 seq_printf(m, " %lu ", module_refcount(mod));
879 860
880 /* Always include a trailing , so userspace can differentiate 861 /*
881 between this and the old multi-field proc format. */ 862 * Always include a trailing , so userspace can differentiate
863 * between this and the old multi-field proc format.
864 */
882 list_for_each_entry(use, &mod->source_list, source_list) { 865 list_for_each_entry(use, &mod->source_list, source_list) {
883 printed_something = 1; 866 printed_something = 1;
884 seq_printf(m, "%s,", use->source->name); 867 seq_printf(m, "%s,", use->source->name);
@@ -886,11 +869,11 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod)
886 869
887 if (mod->init != NULL && mod->exit == NULL) { 870 if (mod->init != NULL && mod->exit == NULL) {
888 printed_something = 1; 871 printed_something = 1;
889 seq_printf(m, "[permanent],"); 872 seq_puts(m, "[permanent],");
890 } 873 }
891 874
892 if (!printed_something) 875 if (!printed_something)
893 seq_printf(m, "-"); 876 seq_puts(m, "-");
894} 877}
895 878
896void __symbol_put(const char *symbol) 879void __symbol_put(const char *symbol)
@@ -935,7 +918,7 @@ void __module_get(struct module *module)
935{ 918{
936 if (module) { 919 if (module) {
937 preempt_disable(); 920 preempt_disable();
938 __this_cpu_inc(module->refptr->incs); 921 atomic_inc(&module->refcnt);
939 trace_module_get(module, _RET_IP_); 922 trace_module_get(module, _RET_IP_);
940 preempt_enable(); 923 preempt_enable();
941 } 924 }
@@ -948,11 +931,11 @@ bool try_module_get(struct module *module)
948 931
949 if (module) { 932 if (module) {
950 preempt_disable(); 933 preempt_disable();
951 934 /* Note: here, we can fail to get a reference */
952 if (likely(module_is_live(module))) { 935 if (likely(module_is_live(module) &&
953 __this_cpu_inc(module->refptr->incs); 936 atomic_inc_not_zero(&module->refcnt) != 0))
954 trace_module_get(module, _RET_IP_); 937 trace_module_get(module, _RET_IP_);
955 } else 938 else
956 ret = false; 939 ret = false;
957 940
958 preempt_enable(); 941 preempt_enable();
@@ -963,11 +946,12 @@ EXPORT_SYMBOL(try_module_get);
963 946
964void module_put(struct module *module) 947void module_put(struct module *module)
965{ 948{
949 int ret;
950
966 if (module) { 951 if (module) {
967 preempt_disable(); 952 preempt_disable();
968 smp_wmb(); /* see comment in module_refcount */ 953 ret = atomic_dec_if_positive(&module->refcnt);
969 __this_cpu_inc(module->refptr->decs); 954 WARN_ON(ret < 0); /* Failed to put refcount */
970
971 trace_module_put(module, _RET_IP_); 955 trace_module_put(module, _RET_IP_);
972 preempt_enable(); 956 preempt_enable();
973 } 957 }
@@ -978,7 +962,7 @@ EXPORT_SYMBOL(module_put);
978static inline void print_unload_info(struct seq_file *m, struct module *mod) 962static inline void print_unload_info(struct seq_file *m, struct module *mod)
979{ 963{
980 /* We don't know the usage count, or what modules are using. */ 964 /* We don't know the usage count, or what modules are using. */
981 seq_printf(m, " - -"); 965 seq_puts(m, " - -");
982} 966}
983 967
984static inline void module_unload_free(struct module *mod) 968static inline void module_unload_free(struct module *mod)
@@ -1131,7 +1115,7 @@ static unsigned long maybe_relocated(unsigned long crc,
1131static int check_version(Elf_Shdr *sechdrs, 1115static int check_version(Elf_Shdr *sechdrs,
1132 unsigned int versindex, 1116 unsigned int versindex,
1133 const char *symname, 1117 const char *symname,
1134 struct module *mod, 1118 struct module *mod,
1135 const unsigned long *crc, 1119 const unsigned long *crc,
1136 const struct module *crc_owner) 1120 const struct module *crc_owner)
1137{ 1121{
@@ -1165,7 +1149,7 @@ static int check_version(Elf_Shdr *sechdrs,
1165 return 0; 1149 return 0;
1166 1150
1167bad_version: 1151bad_version:
1168 printk("%s: disagrees about version of symbol %s\n", 1152 pr_warn("%s: disagrees about version of symbol %s\n",
1169 mod->name, symname); 1153 mod->name, symname);
1170 return 0; 1154 return 0;
1171} 1155}
@@ -1200,7 +1184,7 @@ static inline int same_magic(const char *amagic, const char *bmagic,
1200static inline int check_version(Elf_Shdr *sechdrs, 1184static inline int check_version(Elf_Shdr *sechdrs,
1201 unsigned int versindex, 1185 unsigned int versindex,
1202 const char *symname, 1186 const char *symname,
1203 struct module *mod, 1187 struct module *mod,
1204 const unsigned long *crc, 1188 const unsigned long *crc,
1205 const struct module *crc_owner) 1189 const struct module *crc_owner)
1206{ 1190{
@@ -1288,15 +1272,13 @@ static inline bool sect_empty(const Elf_Shdr *sect)
1288 return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0; 1272 return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0;
1289} 1273}
1290 1274
1291struct module_sect_attr 1275struct module_sect_attr {
1292{
1293 struct module_attribute mattr; 1276 struct module_attribute mattr;
1294 char *name; 1277 char *name;
1295 unsigned long address; 1278 unsigned long address;
1296}; 1279};
1297 1280
1298struct module_sect_attrs 1281struct module_sect_attrs {
1299{
1300 struct attribute_group grp; 1282 struct attribute_group grp;
1301 unsigned int nsections; 1283 unsigned int nsections;
1302 struct module_sect_attr attrs[0]; 1284 struct module_sect_attr attrs[0];
@@ -1550,7 +1532,8 @@ static int module_add_modinfo_attrs(struct module *mod)
1550 (attr->test && attr->test(mod))) { 1532 (attr->test && attr->test(mod))) {
1551 memcpy(temp_attr, attr, sizeof(*temp_attr)); 1533 memcpy(temp_attr, attr, sizeof(*temp_attr));
1552 sysfs_attr_init(&temp_attr->attr); 1534 sysfs_attr_init(&temp_attr->attr);
1553 error = sysfs_create_file(&mod->mkobj.kobj,&temp_attr->attr); 1535 error = sysfs_create_file(&mod->mkobj.kobj,
1536 &temp_attr->attr);
1554 ++temp_attr; 1537 ++temp_attr;
1555 } 1538 }
1556 } 1539 }
@@ -1566,7 +1549,7 @@ static void module_remove_modinfo_attrs(struct module *mod)
1566 /* pick a field to test for end of list */ 1549 /* pick a field to test for end of list */
1567 if (!attr->attr.name) 1550 if (!attr->attr.name)
1568 break; 1551 break;
1569 sysfs_remove_file(&mod->mkobj.kobj,&attr->attr); 1552 sysfs_remove_file(&mod->mkobj.kobj, &attr->attr);
1570 if (attr->free) 1553 if (attr->free)
1571 attr->free(mod); 1554 attr->free(mod);
1572 } 1555 }
@@ -1697,18 +1680,6 @@ static void mod_sysfs_teardown(struct module *mod)
1697 mod_sysfs_fini(mod); 1680 mod_sysfs_fini(mod);
1698} 1681}
1699 1682
1700/*
1701 * unlink the module with the whole machine is stopped with interrupts off
1702 * - this defends against kallsyms not taking locks
1703 */
1704static int __unlink_module(void *_mod)
1705{
1706 struct module *mod = _mod;
1707 list_del(&mod->list);
1708 module_bug_cleanup(mod);
1709 return 0;
1710}
1711
1712#ifdef CONFIG_DEBUG_SET_MODULE_RONX 1683#ifdef CONFIG_DEBUG_SET_MODULE_RONX
1713/* 1684/*
1714 * LKM RO/NX protection: protect module's text/ro-data 1685 * LKM RO/NX protection: protect module's text/ro-data
@@ -1842,7 +1813,9 @@ static void free_module(struct module *mod)
1842 1813
1843 /* We leave it in list to prevent duplicate loads, but make sure 1814 /* We leave it in list to prevent duplicate loads, but make sure
1844 * that noone uses it while it's being deconstructed. */ 1815 * that noone uses it while it's being deconstructed. */
1816 mutex_lock(&module_mutex);
1845 mod->state = MODULE_STATE_UNFORMED; 1817 mod->state = MODULE_STATE_UNFORMED;
1818 mutex_unlock(&module_mutex);
1846 1819
1847 /* Remove dynamic debug info */ 1820 /* Remove dynamic debug info */
1848 ddebug_remove_module(mod->name); 1821 ddebug_remove_module(mod->name);
@@ -1858,7 +1831,12 @@ static void free_module(struct module *mod)
1858 1831
1859 /* Now we can delete it from the lists */ 1832 /* Now we can delete it from the lists */
1860 mutex_lock(&module_mutex); 1833 mutex_lock(&module_mutex);
1861 stop_machine(__unlink_module, mod, NULL); 1834 /* Unlink carefully: kallsyms could be walking list. */
1835 list_del_rcu(&mod->list);
1836 /* Remove this module from bug list, this uses list_del_rcu */
1837 module_bug_cleanup(mod);
1838 /* Wait for RCU synchronizing before releasing mod->list and buglist. */
1839 synchronize_rcu();
1862 mutex_unlock(&module_mutex); 1840 mutex_unlock(&module_mutex);
1863 1841
1864 /* This may be NULL, but that's OK */ 1842 /* This may be NULL, but that's OK */
@@ -1953,7 +1931,7 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
1953 /* We compiled with -fno-common. These are not 1931 /* We compiled with -fno-common. These are not
1954 supposed to happen. */ 1932 supposed to happen. */
1955 pr_debug("Common symbol: %s\n", name); 1933 pr_debug("Common symbol: %s\n", name);
1956 printk("%s: please compile with -fno-common\n", 1934 pr_warn("%s: please compile with -fno-common\n",
1957 mod->name); 1935 mod->name);
1958 ret = -ENOEXEC; 1936 ret = -ENOEXEC;
1959 break; 1937 break;
@@ -2257,7 +2235,7 @@ static char elf_type(const Elf_Sym *sym, const struct load_info *info)
2257} 2235}
2258 2236
2259static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs, 2237static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
2260 unsigned int shnum) 2238 unsigned int shnum)
2261{ 2239{
2262 const Elf_Shdr *sec; 2240 const Elf_Shdr *sec;
2263 2241
@@ -2733,7 +2711,7 @@ static int find_module_sections(struct module *mod, struct load_info *info)
2733 * This shouldn't happen with same compiler and binutils 2711 * This shouldn't happen with same compiler and binutils
2734 * building all parts of the module. 2712 * building all parts of the module.
2735 */ 2713 */
2736 printk(KERN_WARNING "%s: has both .ctors and .init_array.\n", 2714 pr_warn("%s: has both .ctors and .init_array.\n",
2737 mod->name); 2715 mod->name);
2738 return -EINVAL; 2716 return -EINVAL;
2739 } 2717 }
@@ -3021,8 +2999,10 @@ static int do_init_module(struct module *mod)
3021 if (mod->init != NULL) 2999 if (mod->init != NULL)
3022 ret = do_one_initcall(mod->init); 3000 ret = do_one_initcall(mod->init);
3023 if (ret < 0) { 3001 if (ret < 0) {
3024 /* Init routine failed: abort. Try to protect us from 3002 /*
3025 buggy refcounters. */ 3003 * Init routine failed: abort. Try to protect us from
3004 * buggy refcounters.
3005 */
3026 mod->state = MODULE_STATE_GOING; 3006 mod->state = MODULE_STATE_GOING;
3027 synchronize_sched(); 3007 synchronize_sched();
3028 module_put(mod); 3008 module_put(mod);
@@ -3095,6 +3075,32 @@ static int may_init_module(void)
3095} 3075}
3096 3076
3097/* 3077/*
3078 * Can't use wait_event_interruptible() because our condition
3079 * 'finished_loading()' contains a blocking primitive itself (mutex_lock).
3080 */
3081static int wait_finished_loading(struct module *mod)
3082{
3083 DEFINE_WAIT_FUNC(wait, woken_wake_function);
3084 int ret = 0;
3085
3086 add_wait_queue(&module_wq, &wait);
3087 for (;;) {
3088 if (finished_loading(mod->name))
3089 break;
3090
3091 if (signal_pending(current)) {
3092 ret = -ERESTARTSYS;
3093 break;
3094 }
3095
3096 wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
3097 }
3098 remove_wait_queue(&module_wq, &wait);
3099
3100 return ret;
3101}
3102
3103/*
3098 * We try to place it in the list now to make sure it's unique before 3104 * We try to place it in the list now to make sure it's unique before
3099 * we dedicate too many resources. In particular, temporary percpu 3105 * we dedicate too many resources. In particular, temporary percpu
3100 * memory exhaustion. 3106 * memory exhaustion.
@@ -3114,8 +3120,8 @@ again:
3114 || old->state == MODULE_STATE_UNFORMED) { 3120 || old->state == MODULE_STATE_UNFORMED) {
3115 /* Wait in case it fails to load. */ 3121 /* Wait in case it fails to load. */
3116 mutex_unlock(&module_mutex); 3122 mutex_unlock(&module_mutex);
3117 err = wait_event_interruptible(module_wq, 3123
3118 finished_loading(mod->name)); 3124 err = wait_finished_loading(mod);
3119 if (err) 3125 if (err)
3120 goto out_unlocked; 3126 goto out_unlocked;
3121 goto again; 3127 goto again;
@@ -3174,7 +3180,7 @@ out:
3174 3180
3175static int unknown_module_param_cb(char *param, char *val, const char *modname) 3181static int unknown_module_param_cb(char *param, char *val, const char *modname)
3176{ 3182{
3177 /* Check for magic 'dyndbg' arg */ 3183 /* Check for magic 'dyndbg' arg */
3178 int ret = ddebug_dyndbg_module_param_cb(param, val, modname); 3184 int ret = ddebug_dyndbg_module_param_cb(param, val, modname);
3179 if (ret != 0) 3185 if (ret != 0)
3180 pr_warn("%s: unknown parameter '%s' ignored\n", modname, param); 3186 pr_warn("%s: unknown parameter '%s' ignored\n", modname, param);
@@ -3324,6 +3330,8 @@ static int load_module(struct load_info *info, const char __user *uargs,
3324 /* Unlink carefully: kallsyms could be walking list. */ 3330 /* Unlink carefully: kallsyms could be walking list. */
3325 list_del_rcu(&mod->list); 3331 list_del_rcu(&mod->list);
3326 wake_up_all(&module_wq); 3332 wake_up_all(&module_wq);
3333 /* Wait for RCU synchronizing before releasing mod->list. */
3334 synchronize_rcu();
3327 mutex_unlock(&module_mutex); 3335 mutex_unlock(&module_mutex);
3328 free_module: 3336 free_module:
3329 module_deallocate(mod, info); 3337 module_deallocate(mod, info);
@@ -3388,7 +3396,7 @@ static inline int is_arm_mapping_symbol(const char *str)
3388{ 3396{
3389 if (str[0] == '.' && str[1] == 'L') 3397 if (str[0] == '.' && str[1] == 'L')
3390 return true; 3398 return true;
3391 return str[0] == '$' && strchr("atd", str[1]) 3399 return str[0] == '$' && strchr("axtd", str[1])
3392 && (str[2] == '\0' || str[2] == '.'); 3400 && (str[2] == '\0' || str[2] == '.');
3393} 3401}
3394 3402
@@ -3657,8 +3665,8 @@ static int m_show(struct seq_file *m, void *p)
3657 3665
3658 /* Informative for users. */ 3666 /* Informative for users. */
3659 seq_printf(m, " %s", 3667 seq_printf(m, " %s",
3660 mod->state == MODULE_STATE_GOING ? "Unloading": 3668 mod->state == MODULE_STATE_GOING ? "Unloading" :
3661 mod->state == MODULE_STATE_COMING ? "Loading": 3669 mod->state == MODULE_STATE_COMING ? "Loading" :
3662 "Live"); 3670 "Live");
3663 /* Used by oprofile and other similar tools. */ 3671 /* Used by oprofile and other similar tools. */
3664 seq_printf(m, " 0x%pK", mod->module_core); 3672 seq_printf(m, " 0x%pK", mod->module_core);
@@ -3667,7 +3675,7 @@ static int m_show(struct seq_file *m, void *p)
3667 if (mod->taints) 3675 if (mod->taints)
3668 seq_printf(m, " %s", module_flags(mod, buf)); 3676 seq_printf(m, " %s", module_flags(mod, buf));
3669 3677
3670 seq_printf(m, "\n"); 3678 seq_puts(m, "\n");
3671 return 0; 3679 return 0;
3672} 3680}
3673 3681
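
[Editor's note] The largest change above replaces the per-CPU incs/decs module counters with a single atomic_t seeded at MODULE_REF_BASE, so a count of zero unambiguously means the module is going away. A minimal userspace sketch of that pattern with C11 atomics follows; the obj_* names are hypothetical, and the release path is simplified (it restores the base reference unconditionally instead of the atomic_add_unless() recovery used in the hunks above).

```c
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* The owner (here: whoever calls obj_init) holds a base reference;
 * refcnt reaching 0 means "object going away", as with modules above. */
#define REF_BASE 1

struct object {
	atomic_int refcnt;
};

static void obj_init(struct object *obj)
{
	atomic_init(&obj->refcnt, REF_BASE);
}

/* Mirrors the try_module_get() idea: only succeed while the count is
 * still non-zero, i.e. while the object is not being torn down. */
static bool obj_tryget(struct object *obj)
{
	int old = atomic_load(&obj->refcnt);

	while (old != 0) {
		/* On failure the CAS reloads 'old' and we retry. */
		if (atomic_compare_exchange_weak(&obj->refcnt, &old, old + 1))
			return true;
	}
	return false;	/* object is going away */
}

/* Mirrors module_put(): drop one user reference. */
static void obj_put(struct object *obj)
{
	atomic_fetch_sub(&obj->refcnt, 1);
}

/* Owner-side release: drop the base reference; 0 means no users remain. */
static int obj_try_release(struct object *obj)
{
	if (atomic_fetch_sub(&obj->refcnt, REF_BASE) - REF_BASE != 0) {
		/* Users still hold references: restore the base count. */
		atomic_fetch_add(&obj->refcnt, REF_BASE);
		return -1;
	}
	return 0;
}

int main(void)
{
	struct object o;

	obj_init(&o);
	printf("tryget:               %d\n", obj_tryget(&o));	/* 1 */
	printf("release while in use: %d\n", obj_try_release(&o));	/* -1 */
	obj_put(&o);
	printf("release when idle:    %d\n", obj_try_release(&o));	/* 0 */
	return 0;
}
```
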
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index ef42d0ab3115..49746c81ad8d 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -220,11 +220,10 @@ void exit_task_namespaces(struct task_struct *p)
220 220
221SYSCALL_DEFINE2(setns, int, fd, int, nstype) 221SYSCALL_DEFINE2(setns, int, fd, int, nstype)
222{ 222{
223 const struct proc_ns_operations *ops;
224 struct task_struct *tsk = current; 223 struct task_struct *tsk = current;
225 struct nsproxy *new_nsproxy; 224 struct nsproxy *new_nsproxy;
226 struct proc_ns *ei;
227 struct file *file; 225 struct file *file;
226 struct ns_common *ns;
228 int err; 227 int err;
229 228
230 file = proc_ns_fget(fd); 229 file = proc_ns_fget(fd);
@@ -232,9 +231,8 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
232 return PTR_ERR(file); 231 return PTR_ERR(file);
233 232
234 err = -EINVAL; 233 err = -EINVAL;
235 ei = get_proc_ns(file_inode(file)); 234 ns = get_proc_ns(file_inode(file));
236 ops = ei->ns_ops; 235 if (nstype && (ns->ops->type != nstype))
237 if (nstype && (ops->type != nstype))
238 goto out; 236 goto out;
239 237
240 new_nsproxy = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs); 238 new_nsproxy = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs);
@@ -243,7 +241,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
243 goto out; 241 goto out;
244 } 242 }
245 243
246 err = ops->install(new_nsproxy, ei->ns); 244 err = ns->ops->install(new_nsproxy, ns);
247 if (err) { 245 if (err) {
248 free_nsproxy(new_nsproxy); 246 free_nsproxy(new_nsproxy);
249 goto out; 247 goto out;
diff --git a/kernel/panic.c b/kernel/panic.c
index d09dc5c32c67..4d8d6f906dec 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -33,6 +33,7 @@ static int pause_on_oops;
33static int pause_on_oops_flag; 33static int pause_on_oops_flag;
34static DEFINE_SPINLOCK(pause_on_oops_lock); 34static DEFINE_SPINLOCK(pause_on_oops_lock);
35static bool crash_kexec_post_notifiers; 35static bool crash_kexec_post_notifiers;
36int panic_on_warn __read_mostly;
36 37
37int panic_timeout = CONFIG_PANIC_TIMEOUT; 38int panic_timeout = CONFIG_PANIC_TIMEOUT;
38EXPORT_SYMBOL_GPL(panic_timeout); 39EXPORT_SYMBOL_GPL(panic_timeout);
@@ -244,6 +245,7 @@ static const struct tnt tnts[] = {
244 * 'I' - Working around severe firmware bug. 245 * 'I' - Working around severe firmware bug.
245 * 'O' - Out-of-tree module has been loaded. 246 * 'O' - Out-of-tree module has been loaded.
246 * 'E' - Unsigned module has been loaded. 247 * 'E' - Unsigned module has been loaded.
248 * 'L' - A soft lockup has previously occurred.
247 * 249 *
248 * The string is overwritten by the next call to print_tainted(). 250 * The string is overwritten by the next call to print_tainted().
249 */ 251 */
@@ -427,6 +429,17 @@ static void warn_slowpath_common(const char *file, int line, void *caller,
427 if (args) 429 if (args)
428 vprintk(args->fmt, args->args); 430 vprintk(args->fmt, args->args);
429 431
432 if (panic_on_warn) {
433 /*
434 * This thread may hit another WARN() in the panic path.
435 * Resetting this prevents additional WARN() from panicking the
436 * system on this thread. Other threads are blocked by the
437 * panic_mutex in panic().
438 */
439 panic_on_warn = 0;
440 panic("panic_on_warn set ...\n");
441 }
442
430 print_modules(); 443 print_modules();
431 dump_stack(); 444 dump_stack();
432 print_oops_end_marker(); 445 print_oops_end_marker();
@@ -484,6 +497,7 @@ EXPORT_SYMBOL(__stack_chk_fail);
484 497
485core_param(panic, panic_timeout, int, 0644); 498core_param(panic, panic_timeout, int, 0644);
486core_param(pause_on_oops, pause_on_oops, int, 0644); 499core_param(pause_on_oops, pause_on_oops, int, 0644);
500core_param(panic_on_warn, panic_on_warn, int, 0644);
487 501
488static int __init setup_crash_kexec_post_notifiers(char *s) 502static int __init setup_crash_kexec_post_notifiers(char *s)
489{ 503{
diff --git a/kernel/params.c b/kernel/params.c
index 34f527023794..0af9b2c4e56c 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -19,6 +19,7 @@
19#include <linux/string.h> 19#include <linux/string.h>
20#include <linux/errno.h> 20#include <linux/errno.h>
21#include <linux/module.h> 21#include <linux/module.h>
22#include <linux/moduleparam.h>
22#include <linux/device.h> 23#include <linux/device.h>
23#include <linux/err.h> 24#include <linux/err.h>
24#include <linux/slab.h> 25#include <linux/slab.h>
@@ -83,6 +84,15 @@ bool parameq(const char *a, const char *b)
83 return parameqn(a, b, strlen(a)+1); 84 return parameqn(a, b, strlen(a)+1);
84} 85}
85 86
87static void param_check_unsafe(const struct kernel_param *kp)
88{
89 if (kp->flags & KERNEL_PARAM_FL_UNSAFE) {
90 pr_warn("Setting dangerous option %s - tainting kernel\n",
91 kp->name);
92 add_taint(TAINT_USER, LOCKDEP_STILL_OK);
93 }
94}
95
86static int parse_one(char *param, 96static int parse_one(char *param,
87 char *val, 97 char *val,
88 const char *doing, 98 const char *doing,
@@ -104,11 +114,12 @@ static int parse_one(char *param,
104 return 0; 114 return 0;
105 /* No one handled NULL, so do it here. */ 115 /* No one handled NULL, so do it here. */
106 if (!val && 116 if (!val &&
107 !(params[i].ops->flags & KERNEL_PARAM_FL_NOARG)) 117 !(params[i].ops->flags & KERNEL_PARAM_OPS_FL_NOARG))
108 return -EINVAL; 118 return -EINVAL;
109 pr_debug("handling %s with %p\n", param, 119 pr_debug("handling %s with %p\n", param,
110 params[i].ops->set); 120 params[i].ops->set);
111 mutex_lock(&param_lock); 121 mutex_lock(&param_lock);
122 param_check_unsafe(&params[i]);
112 err = params[i].ops->set(val, &params[i]); 123 err = params[i].ops->set(val, &params[i]);
113 mutex_unlock(&param_lock); 124 mutex_unlock(&param_lock);
114 return err; 125 return err;
@@ -318,7 +329,7 @@ int param_get_bool(char *buffer, const struct kernel_param *kp)
318EXPORT_SYMBOL(param_get_bool); 329EXPORT_SYMBOL(param_get_bool);
319 330
320struct kernel_param_ops param_ops_bool = { 331struct kernel_param_ops param_ops_bool = {
321 .flags = KERNEL_PARAM_FL_NOARG, 332 .flags = KERNEL_PARAM_OPS_FL_NOARG,
322 .set = param_set_bool, 333 .set = param_set_bool,
323 .get = param_get_bool, 334 .get = param_get_bool,
324}; 335};
@@ -369,7 +380,7 @@ int param_set_bint(const char *val, const struct kernel_param *kp)
369EXPORT_SYMBOL(param_set_bint); 380EXPORT_SYMBOL(param_set_bint);
370 381
371struct kernel_param_ops param_ops_bint = { 382struct kernel_param_ops param_ops_bint = {
372 .flags = KERNEL_PARAM_FL_NOARG, 383 .flags = KERNEL_PARAM_OPS_FL_NOARG,
373 .set = param_set_bint, 384 .set = param_set_bint,
374 .get = param_get_int, 385 .get = param_get_int,
375}; 386};
@@ -503,8 +514,6 @@ EXPORT_SYMBOL(param_ops_string);
503#define to_module_attr(n) container_of(n, struct module_attribute, attr) 514#define to_module_attr(n) container_of(n, struct module_attribute, attr)
504#define to_module_kobject(n) container_of(n, struct module_kobject, kobj) 515#define to_module_kobject(n) container_of(n, struct module_kobject, kobj)
505 516
506extern struct kernel_param __start___param[], __stop___param[];
507
508struct param_attribute 517struct param_attribute
509{ 518{
510 struct module_attribute mattr; 519 struct module_attribute mattr;
@@ -552,6 +561,7 @@ static ssize_t param_attr_store(struct module_attribute *mattr,
552 return -EPERM; 561 return -EPERM;
553 562
554 mutex_lock(&param_lock); 563 mutex_lock(&param_lock);
564 param_check_unsafe(attribute->param);
555 err = attribute->param->ops->set(buf, attribute->param); 565 err = attribute->param->ops->set(buf, attribute->param);
556 mutex_unlock(&param_lock); 566 mutex_unlock(&param_lock);
557 if (!err) 567 if (!err)
@@ -593,74 +603,67 @@ static __modinit int add_sysfs_param(struct module_kobject *mk,
593 const struct kernel_param *kp, 603 const struct kernel_param *kp,
594 const char *name) 604 const char *name)
595{ 605{
596 struct module_param_attrs *new; 606 struct module_param_attrs *new_mp;
597 struct attribute **attrs; 607 struct attribute **new_attrs;
598 int err, num; 608 unsigned int i;
599 609
600 /* We don't bother calling this with invisible parameters. */ 610 /* We don't bother calling this with invisible parameters. */
601 BUG_ON(!kp->perm); 611 BUG_ON(!kp->perm);
602 612
603 if (!mk->mp) { 613 if (!mk->mp) {
604 num = 0; 614 /* First allocation. */
605 attrs = NULL; 615 mk->mp = kzalloc(sizeof(*mk->mp), GFP_KERNEL);
606 } else { 616 if (!mk->mp)
607 num = mk->mp->num; 617 return -ENOMEM;
608 attrs = mk->mp->grp.attrs; 618 mk->mp->grp.name = "parameters";
619 /* NULL-terminated attribute array. */
620 mk->mp->grp.attrs = kzalloc(sizeof(mk->mp->grp.attrs[0]),
621 GFP_KERNEL);
622 /* Caller will cleanup via free_module_param_attrs */
623 if (!mk->mp->grp.attrs)
624 return -ENOMEM;
609 } 625 }
610 626
611 /* Enlarge. */ 627 /* Enlarge allocations. */
612 new = krealloc(mk->mp, 628 new_mp = krealloc(mk->mp,
613 sizeof(*mk->mp) + sizeof(mk->mp->attrs[0]) * (num+1), 629 sizeof(*mk->mp) +
614 GFP_KERNEL); 630 sizeof(mk->mp->attrs[0]) * (mk->mp->num + 1),
615 if (!new) { 631 GFP_KERNEL);
616 kfree(attrs); 632 if (!new_mp)
617 err = -ENOMEM; 633 return -ENOMEM;
618 goto fail; 634 mk->mp = new_mp;
619 }
620 /* Despite looking like the typical realloc() bug, this is safe.
621 * We *want* the old 'attrs' to be freed either way, and we'll store
622 * the new one in the success case. */
623 attrs = krealloc(attrs, sizeof(new->grp.attrs[0])*(num+2), GFP_KERNEL);
624 if (!attrs) {
625 err = -ENOMEM;
626 goto fail_free_new;
627 }
628 635
629 /* Sysfs wants everything zeroed. */ 636 /* Extra pointer for NULL terminator */
630 memset(new, 0, sizeof(*new)); 637 new_attrs = krealloc(mk->mp->grp.attrs,
631 memset(&new->attrs[num], 0, sizeof(new->attrs[num])); 638 sizeof(mk->mp->grp.attrs[0]) * (mk->mp->num + 2),
632 memset(&attrs[num], 0, sizeof(attrs[num])); 639 GFP_KERNEL);
633 new->grp.name = "parameters"; 640 if (!new_attrs)
634 new->grp.attrs = attrs; 641 return -ENOMEM;
642 mk->mp->grp.attrs = new_attrs;
635 643
636 /* Tack new one on the end. */ 644 /* Tack new one on the end. */
637 sysfs_attr_init(&new->attrs[num].mattr.attr); 645 sysfs_attr_init(&mk->mp->attrs[mk->mp->num].mattr.attr);
638 new->attrs[num].param = kp; 646 mk->mp->attrs[mk->mp->num].param = kp;
639 new->attrs[num].mattr.show = param_attr_show; 647 mk->mp->attrs[mk->mp->num].mattr.show = param_attr_show;
640 new->attrs[num].mattr.store = param_attr_store; 648 /* Do not allow runtime DAC changes to make param writable. */
641 new->attrs[num].mattr.attr.name = (char *)name; 649 if ((kp->perm & (S_IWUSR | S_IWGRP | S_IWOTH)) != 0)
642 new->attrs[num].mattr.attr.mode = kp->perm; 650 mk->mp->attrs[mk->mp->num].mattr.store = param_attr_store;
643 new->num = num+1; 651 mk->mp->attrs[mk->mp->num].mattr.attr.name = (char *)name;
652 mk->mp->attrs[mk->mp->num].mattr.attr.mode = kp->perm;
653 mk->mp->num++;
644 654
645 /* Fix up all the pointers, since krealloc can move us */ 655 /* Fix up all the pointers, since krealloc can move us */
646 for (num = 0; num < new->num; num++) 656 for (i = 0; i < mk->mp->num; i++)
647 new->grp.attrs[num] = &new->attrs[num].mattr.attr; 657 mk->mp->grp.attrs[i] = &mk->mp->attrs[i].mattr.attr;
648 new->grp.attrs[num] = NULL; 658 mk->mp->grp.attrs[mk->mp->num] = NULL;
649
650 mk->mp = new;
651 return 0; 659 return 0;
652
653fail_free_new:
654 kfree(new);
655fail:
656 mk->mp = NULL;
657 return err;
658} 660}
659 661
660#ifdef CONFIG_MODULES 662#ifdef CONFIG_MODULES
661static void free_module_param_attrs(struct module_kobject *mk) 663static void free_module_param_attrs(struct module_kobject *mk)
662{ 664{
663 kfree(mk->mp->grp.attrs); 665 if (mk->mp)
666 kfree(mk->mp->grp.attrs);
664 kfree(mk->mp); 667 kfree(mk->mp);
665 mk->mp = NULL; 668 mk->mp = NULL;
666} 669}
@@ -685,8 +688,10 @@ int module_param_sysfs_setup(struct module *mod,
685 if (kparam[i].perm == 0) 688 if (kparam[i].perm == 0)
686 continue; 689 continue;
687 err = add_sysfs_param(&mod->mkobj, &kparam[i], kparam[i].name); 690 err = add_sysfs_param(&mod->mkobj, &kparam[i], kparam[i].name);
688 if (err) 691 if (err) {
692 free_module_param_attrs(&mod->mkobj);
689 return err; 693 return err;
694 }
690 params = true; 695 params = true;
691 } 696 }
692 697
@@ -763,7 +768,7 @@ static struct module_kobject * __init locate_module_kobject(const char *name)
763} 768}
764 769
765static void __init kernel_add_sysfs_param(const char *name, 770static void __init kernel_add_sysfs_param(const char *name,
766 struct kernel_param *kparam, 771 const struct kernel_param *kparam,
767 unsigned int name_skip) 772 unsigned int name_skip)
768{ 773{
769 struct module_kobject *mk; 774 struct module_kobject *mk;
@@ -798,7 +803,7 @@ static void __init kernel_add_sysfs_param(const char *name,
798 */ 803 */
799static void __init param_sysfs_builtin(void) 804static void __init param_sysfs_builtin(void)
800{ 805{
801 struct kernel_param *kp; 806 const struct kernel_param *kp;
802 unsigned int name_len; 807 unsigned int name_len;
803 char modname[MODULE_NAME_LEN]; 808 char modname[MODULE_NAME_LEN];
804 809
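
[Editor's note] The add_sysfs_param() rewrite above keeps mk->mp->grp.attrs as a NULL-terminated array and grows it with krealloc() to num + 2 slots, so there is always room for the new attribute plus the terminator. A small userspace sketch of that grow-and-terminate pattern, with hypothetical names and plain realloc()/strdup():

```c
#include <stdlib.h>
#include <string.h>
#include <stdio.h>

/* Append one entry to a NULL-terminated pointer array: grow to count + 2
 * so there is always room for the new entry plus the NULL terminator. */
static int append_attr(char ***attrs, size_t *count, const char *name)
{
	char **grown;
	char *copy;

	grown = realloc(*attrs, (*count + 2) * sizeof(*grown));
	if (!grown)
		return -1;	/* old array is still valid and terminated */
	*attrs = grown;

	copy = strdup(name);
	if (!copy)
		return -1;	/* terminator at the old position is intact */

	grown[*count] = copy;
	grown[*count + 1] = NULL;	/* keep the array NULL-terminated */
	(*count)++;
	return 0;
}

int main(void)
{
	char **attrs = NULL;
	size_t count = 0;

	append_attr(&attrs, &count, "debug");
	append_attr(&attrs, &count, "verbose");

	for (char **p = attrs; *p; p++)
		printf("%s\n", *p);

	for (size_t i = 0; i < count; i++)
		free(attrs[i]);
	free(attrs);
	return 0;
}
```
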
diff --git a/kernel/pid.c b/kernel/pid.c
index 9b9a26698144..cd36a5e0d173 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -79,7 +79,10 @@ struct pid_namespace init_pid_ns = {
79 .level = 0, 79 .level = 0,
80 .child_reaper = &init_task, 80 .child_reaper = &init_task,
81 .user_ns = &init_user_ns, 81 .user_ns = &init_user_ns,
82 .proc_inum = PROC_PID_INIT_INO, 82 .ns.inum = PROC_PID_INIT_INO,
83#ifdef CONFIG_PID_NS
84 .ns.ops = &pidns_operations,
85#endif
83}; 86};
84EXPORT_SYMBOL_GPL(init_pid_ns); 87EXPORT_SYMBOL_GPL(init_pid_ns);
85 88
@@ -341,6 +344,8 @@ out:
341 344
342out_unlock: 345out_unlock:
343 spin_unlock_irq(&pidmap_lock); 346 spin_unlock_irq(&pidmap_lock);
347 put_pid_ns(ns);
348
344out_free: 349out_free:
345 while (++i <= ns->level) 350 while (++i <= ns->level)
346 free_pidmap(pid->numbers + i); 351 free_pidmap(pid->numbers + i);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index db95d8eb761b..a65ba137fd15 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -105,9 +105,10 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
105 if (ns->pid_cachep == NULL) 105 if (ns->pid_cachep == NULL)
106 goto out_free_map; 106 goto out_free_map;
107 107
108 err = proc_alloc_inum(&ns->proc_inum); 108 err = ns_alloc_inum(&ns->ns);
109 if (err) 109 if (err)
110 goto out_free_map; 110 goto out_free_map;
111 ns->ns.ops = &pidns_operations;
111 112
112 kref_init(&ns->kref); 113 kref_init(&ns->kref);
113 ns->level = level; 114 ns->level = level;
@@ -142,7 +143,7 @@ static void destroy_pid_namespace(struct pid_namespace *ns)
142{ 143{
143 int i; 144 int i;
144 145
145 proc_free_inum(ns->proc_inum); 146 ns_free_inum(&ns->ns);
146 for (i = 0; i < PIDMAP_ENTRIES; i++) 147 for (i = 0; i < PIDMAP_ENTRIES; i++)
147 kfree(ns->pidmap[i].page); 148 kfree(ns->pidmap[i].page);
148 put_user_ns(ns->user_ns); 149 put_user_ns(ns->user_ns);
@@ -190,7 +191,11 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
190 /* Don't allow any more processes into the pid namespace */ 191 /* Don't allow any more processes into the pid namespace */
191 disable_pid_allocation(pid_ns); 192 disable_pid_allocation(pid_ns);
192 193
193 /* Ignore SIGCHLD causing any terminated children to autoreap */ 194 /*
195 * Ignore SIGCHLD causing any terminated children to autoreap.
196 * This speeds up the namespace shutdown, plus see the comment
197 * below.
198 */
194 spin_lock_irq(&me->sighand->siglock); 199 spin_lock_irq(&me->sighand->siglock);
195 me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN; 200 me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN;
196 spin_unlock_irq(&me->sighand->siglock); 201 spin_unlock_irq(&me->sighand->siglock);
@@ -223,15 +228,31 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
223 } 228 }
224 read_unlock(&tasklist_lock); 229 read_unlock(&tasklist_lock);
225 230
226 /* Firstly reap the EXIT_ZOMBIE children we may have. */ 231 /*
232 * Reap the EXIT_ZOMBIE children we had before we ignored SIGCHLD.
233 * sys_wait4() will also block until our children traced from the
234 * parent namespace are detached and become EXIT_DEAD.
235 */
227 do { 236 do {
228 clear_thread_flag(TIF_SIGPENDING); 237 clear_thread_flag(TIF_SIGPENDING);
229 rc = sys_wait4(-1, NULL, __WALL, NULL); 238 rc = sys_wait4(-1, NULL, __WALL, NULL);
230 } while (rc != -ECHILD); 239 } while (rc != -ECHILD);
231 240
232 /* 241 /*
233 * sys_wait4() above can't reap the TASK_DEAD children. 242 * sys_wait4() above can't reap the EXIT_DEAD children but we do not
234 * Make sure they all go away, see free_pid(). 243 * really care, we could reparent them to the global init. We could
244 * exit and reap ->child_reaper even if it is not the last thread in
245 * this pid_ns, free_pid(nr_hashed == 0) calls proc_cleanup_work(),
246 * pid_ns can not go away until proc_kill_sb() drops the reference.
247 *
248 * But this ns can also have other tasks injected by setns()+fork().
249 * Again, ignoring the user visible semantics we do not really need
250 * to wait until they are all reaped, but they can be reparented to
251 * us and thus we need to ensure that pid->child_reaper stays valid
252 * until they all go away. See free_pid()->wake_up_process().
253 *
254 * We rely on ignored SIGCHLD, an injected zombie must be autoreaped
255 * if reparented.
235 */ 256 */
236 for (;;) { 257 for (;;) {
237 set_current_state(TASK_UNINTERRUPTIBLE); 258 set_current_state(TASK_UNINTERRUPTIBLE);
@@ -313,7 +334,12 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
313 return 0; 334 return 0;
314} 335}
315 336
316static void *pidns_get(struct task_struct *task) 337static inline struct pid_namespace *to_pid_ns(struct ns_common *ns)
338{
339 return container_of(ns, struct pid_namespace, ns);
340}
341
342static struct ns_common *pidns_get(struct task_struct *task)
317{ 343{
318 struct pid_namespace *ns; 344 struct pid_namespace *ns;
319 345
@@ -323,18 +349,18 @@ static void *pidns_get(struct task_struct *task)
323 get_pid_ns(ns); 349 get_pid_ns(ns);
324 rcu_read_unlock(); 350 rcu_read_unlock();
325 351
326 return ns; 352 return ns ? &ns->ns : NULL;
327} 353}
328 354
329static void pidns_put(void *ns) 355static void pidns_put(struct ns_common *ns)
330{ 356{
331 put_pid_ns(ns); 357 put_pid_ns(to_pid_ns(ns));
332} 358}
333 359
334static int pidns_install(struct nsproxy *nsproxy, void *ns) 360static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns)
335{ 361{
336 struct pid_namespace *active = task_active_pid_ns(current); 362 struct pid_namespace *active = task_active_pid_ns(current);
337 struct pid_namespace *ancestor, *new = ns; 363 struct pid_namespace *ancestor, *new = to_pid_ns(ns);
338 364
339 if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) || 365 if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) ||
340 !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) 366 !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
@@ -362,19 +388,12 @@ static int pidns_install(struct nsproxy *nsproxy, void *ns)
362 return 0; 388 return 0;
363} 389}
364 390
365static unsigned int pidns_inum(void *ns)
366{
367 struct pid_namespace *pid_ns = ns;
368 return pid_ns->proc_inum;
369}
370
371const struct proc_ns_operations pidns_operations = { 391const struct proc_ns_operations pidns_operations = {
372 .name = "pid", 392 .name = "pid",
373 .type = CLONE_NEWPID, 393 .type = CLONE_NEWPID,
374 .get = pidns_get, 394 .get = pidns_get,
375 .put = pidns_put, 395 .put = pidns_put,
376 .install = pidns_install, 396 .install = pidns_install,
377 .inum = pidns_inum,
378}; 397};
379 398
380static __init int pid_namespaces_init(void) 399static __init int pid_namespaces_init(void)
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index e4e4121fa327..48b28d387c7f 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -94,6 +94,7 @@ config PM_STD_PARTITION
94config PM_SLEEP 94config PM_SLEEP
95 def_bool y 95 def_bool y
96 depends on SUSPEND || HIBERNATE_CALLBACKS 96 depends on SUSPEND || HIBERNATE_CALLBACKS
97 select PM
97 98
98config PM_SLEEP_SMP 99config PM_SLEEP_SMP
99 def_bool y 100 def_bool y
@@ -129,24 +130,19 @@ config PM_WAKELOCKS_GC
129 depends on PM_WAKELOCKS 130 depends on PM_WAKELOCKS
130 default y 131 default y
131 132
132config PM_RUNTIME 133config PM
133 bool "Run-time PM core functionality" 134 bool "Device power management core functionality"
134 depends on !IA64_HP_SIM
135 ---help--- 135 ---help---
136 Enable functionality allowing I/O devices to be put into energy-saving 136 Enable functionality allowing I/O devices to be put into energy-saving
137 (low power) states at run time (or autosuspended) after a specified 137 (low power) states, for example after a specified period of inactivity
138 period of inactivity and woken up in response to a hardware-generated 138 (autosuspended), and woken up in response to a hardware-generated
139 wake-up event or a driver's request. 139 wake-up event or a driver's request.
140 140
141 Hardware support is generally required for this functionality to work 141 Hardware support is generally required for this functionality to work
142 and the bus type drivers of the buses the devices are on are 142 and the bus type drivers of the buses the devices are on are
143 responsible for the actual handling of the autosuspend requests and 143 responsible for the actual handling of device suspend requests and
144 wake-up events. 144 wake-up events.
145 145
146config PM
147 def_bool y
148 depends on PM_SLEEP || PM_RUNTIME
149
150config PM_DEBUG 146config PM_DEBUG
151 bool "Power Management Debug Support" 147 bool "Power Management Debug Support"
152 depends on PM 148 depends on PM
@@ -298,10 +294,9 @@ config PM_GENERIC_DOMAINS_SLEEP
298 def_bool y 294 def_bool y
299 depends on PM_SLEEP && PM_GENERIC_DOMAINS 295 depends on PM_SLEEP && PM_GENERIC_DOMAINS
300 296
301config PM_GENERIC_DOMAINS_RUNTIME 297config PM_GENERIC_DOMAINS_OF
302 def_bool y 298 def_bool y
303 depends on PM_RUNTIME && PM_GENERIC_DOMAINS 299 depends on PM_GENERIC_DOMAINS && OF
304 300
305config CPU_PM 301config CPU_PM
306 bool 302 bool
307 depends on SUSPEND || CPU_IDLE
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index a9dfa79b6bab..2329daae5255 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -28,6 +28,7 @@
28#include <linux/syscore_ops.h> 28#include <linux/syscore_ops.h>
29#include <linux/ctype.h> 29#include <linux/ctype.h>
30#include <linux/genhd.h> 30#include <linux/genhd.h>
31#include <linux/ktime.h>
31#include <trace/events/power.h> 32#include <trace/events/power.h>
32 33
33#include "power.h" 34#include "power.h"
@@ -232,20 +233,17 @@ static void platform_recover(int platform_mode)
232 * @nr_pages: Number of memory pages processed between @start and @stop. 233 * @nr_pages: Number of memory pages processed between @start and @stop.
233 * @msg: Additional diagnostic message to print. 234 * @msg: Additional diagnostic message to print.
234 */ 235 */
235void swsusp_show_speed(struct timeval *start, struct timeval *stop, 236void swsusp_show_speed(ktime_t start, ktime_t stop,
236 unsigned nr_pages, char *msg) 237 unsigned nr_pages, char *msg)
237{ 238{
239 ktime_t diff;
238 u64 elapsed_centisecs64; 240 u64 elapsed_centisecs64;
239 unsigned int centisecs; 241 unsigned int centisecs;
240 unsigned int k; 242 unsigned int k;
241 unsigned int kps; 243 unsigned int kps;
242 244
243 elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start); 245 diff = ktime_sub(stop, start);
244 /* 246 elapsed_centisecs64 = ktime_divns(diff, 10*NSEC_PER_MSEC);
245 * If "(s64)elapsed_centisecs64 < 0", it will print long elapsed time,
246 * it is obvious enough for what went wrong.
247 */
248 do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
249 centisecs = elapsed_centisecs64; 247 centisecs = elapsed_centisecs64;
250 if (centisecs == 0) 248 if (centisecs == 0)
251 centisecs = 1; /* avoid div-by-zero */ 249 centisecs = 1; /* avoid div-by-zero */
@@ -502,8 +500,14 @@ int hibernation_restore(int platform_mode)
502 error = dpm_suspend_start(PMSG_QUIESCE); 500 error = dpm_suspend_start(PMSG_QUIESCE);
503 if (!error) { 501 if (!error) {
504 error = resume_target_kernel(platform_mode); 502 error = resume_target_kernel(platform_mode);
505 dpm_resume_end(PMSG_RECOVER); 503 /*
504 * The above should either succeed and jump to the new kernel,
505 * or return with an error. Otherwise things are just
506 * undefined, so let's be paranoid.
507 */
508 BUG_ON(!error);
506 } 509 }
510 dpm_resume_end(PMSG_RECOVER);
507 pm_restore_gfp_mask(); 511 pm_restore_gfp_mask();
508 resume_console(); 512 resume_console();
509 pm_restore_console(); 513 pm_restore_console();
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 2df883a9d3cb..ce9b8328a689 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -174,8 +174,7 @@ extern int hib_wait_on_bio_chain(struct bio **bio_chain);
174 174
175struct timeval; 175struct timeval;
176/* kernel/power/swsusp.c */ 176/* kernel/power/swsusp.c */
177extern void swsusp_show_speed(struct timeval *, struct timeval *, 177extern void swsusp_show_speed(ktime_t, ktime_t, unsigned int, char *);
178 unsigned int, char *);
179 178
180#ifdef CONFIG_SUSPEND 179#ifdef CONFIG_SUSPEND
181/* kernel/power/suspend.c */ 180/* kernel/power/suspend.c */
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 4ee194eb524b..5a6ec8678b9a 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -46,13 +46,13 @@ static int try_to_freeze_tasks(bool user_only)
46 while (true) { 46 while (true) {
47 todo = 0; 47 todo = 0;
48 read_lock(&tasklist_lock); 48 read_lock(&tasklist_lock);
49 do_each_thread(g, p) { 49 for_each_process_thread(g, p) {
50 if (p == current || !freeze_task(p)) 50 if (p == current || !freeze_task(p))
51 continue; 51 continue;
52 52
53 if (!freezer_should_skip(p)) 53 if (!freezer_should_skip(p))
54 todo++; 54 todo++;
55 } while_each_thread(g, p); 55 }
56 read_unlock(&tasklist_lock); 56 read_unlock(&tasklist_lock);
57 57
58 if (!user_only) { 58 if (!user_only) {
@@ -93,11 +93,11 @@ static int try_to_freeze_tasks(bool user_only)
93 93
94 if (!wakeup) { 94 if (!wakeup) {
95 read_lock(&tasklist_lock); 95 read_lock(&tasklist_lock);
96 do_each_thread(g, p) { 96 for_each_process_thread(g, p) {
97 if (p != current && !freezer_should_skip(p) 97 if (p != current && !freezer_should_skip(p)
98 && freezing(p) && !frozen(p)) 98 && freezing(p) && !frozen(p))
99 sched_show_task(p); 99 sched_show_task(p);
100 } while_each_thread(g, p); 100 }
101 read_unlock(&tasklist_lock); 101 read_unlock(&tasklist_lock);
102 } 102 }
103 } else { 103 } else {
@@ -108,6 +108,30 @@ static int try_to_freeze_tasks(bool user_only)
108 return todo ? -EBUSY : 0; 108 return todo ? -EBUSY : 0;
109} 109}
110 110
111static bool __check_frozen_processes(void)
112{
113 struct task_struct *g, *p;
114
115 for_each_process_thread(g, p)
116 if (p != current && !freezer_should_skip(p) && !frozen(p))
117 return false;
118
119 return true;
120}
121
122/*
123 * Returns true if all freezable tasks (except for current) are frozen already
124 */
125static bool check_frozen_processes(void)
126{
127 bool ret;
128
129 read_lock(&tasklist_lock);
130 ret = __check_frozen_processes();
131 read_unlock(&tasklist_lock);
132 return ret;
133}
134
111/** 135/**
112 * freeze_processes - Signal user space processes to enter the refrigerator. 136 * freeze_processes - Signal user space processes to enter the refrigerator.
113 * The current thread will not be frozen. The same process that calls 137 * The current thread will not be frozen. The same process that calls
@@ -118,6 +142,7 @@ static int try_to_freeze_tasks(bool user_only)
118int freeze_processes(void) 142int freeze_processes(void)
119{ 143{
120 int error; 144 int error;
145 int oom_kills_saved;
121 146
122 error = __usermodehelper_disable(UMH_FREEZING); 147 error = __usermodehelper_disable(UMH_FREEZING);
123 if (error) 148 if (error)
@@ -129,13 +154,28 @@ int freeze_processes(void)
129 if (!pm_freezing) 154 if (!pm_freezing)
130 atomic_inc(&system_freezing_cnt); 155 atomic_inc(&system_freezing_cnt);
131 156
157 pm_wakeup_clear();
132 printk("Freezing user space processes ... "); 158 printk("Freezing user space processes ... ");
133 pm_freezing = true; 159 pm_freezing = true;
160 oom_kills_saved = oom_kills_count();
134 error = try_to_freeze_tasks(true); 161 error = try_to_freeze_tasks(true);
135 if (!error) { 162 if (!error) {
136 printk("done.");
137 __usermodehelper_set_disable_depth(UMH_DISABLED); 163 __usermodehelper_set_disable_depth(UMH_DISABLED);
138 oom_killer_disable(); 164 oom_killer_disable();
165
166 /*
167 * There might have been an OOM kill while we were
168 * freezing tasks and the killed task might be still
169 * on the way out so we have to double check for race.
170 */
171 if (oom_kills_count() != oom_kills_saved &&
172 !check_frozen_processes()) {
173 __usermodehelper_set_disable_depth(UMH_ENABLED);
174 printk("OOM in progress.");
175 error = -EBUSY;
176 } else {
177 printk("done.");
178 }
139 } 179 }
140 printk("\n"); 180 printk("\n");
141 BUG_ON(in_atomic()); 181 BUG_ON(in_atomic());
@@ -190,11 +230,11 @@ void thaw_processes(void)
190 thaw_workqueues(); 230 thaw_workqueues();
191 231
192 read_lock(&tasklist_lock); 232 read_lock(&tasklist_lock);
193 do_each_thread(g, p) { 233 for_each_process_thread(g, p) {
194 /* No other threads should have PF_SUSPEND_TASK set */ 234 /* No other threads should have PF_SUSPEND_TASK set */
195 WARN_ON((p != curr) && (p->flags & PF_SUSPEND_TASK)); 235 WARN_ON((p != curr) && (p->flags & PF_SUSPEND_TASK));
196 __thaw_task(p); 236 __thaw_task(p);
197 } while_each_thread(g, p); 237 }
198 read_unlock(&tasklist_lock); 238 read_unlock(&tasklist_lock);
199 239
200 WARN_ON(!(curr->flags & PF_SUSPEND_TASK)); 240 WARN_ON(!(curr->flags & PF_SUSPEND_TASK));
@@ -217,10 +257,10 @@ void thaw_kernel_threads(void)
217 thaw_workqueues(); 257 thaw_workqueues();
218 258
219 read_lock(&tasklist_lock); 259 read_lock(&tasklist_lock);
220 do_each_thread(g, p) { 260 for_each_process_thread(g, p) {
221 if (p->flags & (PF_KTHREAD | PF_WQ_WORKER)) 261 if (p->flags & (PF_KTHREAD | PF_WQ_WORKER))
222 __thaw_task(p); 262 __thaw_task(p);
223 } while_each_thread(g, p); 263 }
224 read_unlock(&tasklist_lock); 264 read_unlock(&tasklist_lock);
225 265
226 schedule(); 266 schedule();
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 884b77058864..5f4c006c4b1e 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -105,11 +105,27 @@ static struct pm_qos_object network_throughput_pm_qos = {
105}; 105};
106 106
107 107
108static BLOCKING_NOTIFIER_HEAD(memory_bandwidth_notifier);
109static struct pm_qos_constraints memory_bw_constraints = {
110 .list = PLIST_HEAD_INIT(memory_bw_constraints.list),
111 .target_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE,
112 .default_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE,
113 .no_constraint_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE,
114 .type = PM_QOS_SUM,
115 .notifiers = &memory_bandwidth_notifier,
116};
117static struct pm_qos_object memory_bandwidth_pm_qos = {
118 .constraints = &memory_bw_constraints,
119 .name = "memory_bandwidth",
120};
121
122
108static struct pm_qos_object *pm_qos_array[] = { 123static struct pm_qos_object *pm_qos_array[] = {
109 &null_pm_qos, 124 &null_pm_qos,
110 &cpu_dma_pm_qos, 125 &cpu_dma_pm_qos,
111 &network_lat_pm_qos, 126 &network_lat_pm_qos,
112 &network_throughput_pm_qos 127 &network_throughput_pm_qos,
128 &memory_bandwidth_pm_qos,
113}; 129};
114 130
115static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, 131static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
@@ -130,6 +146,9 @@ static const struct file_operations pm_qos_power_fops = {
130/* unlocked internal variant */ 146/* unlocked internal variant */
131static inline int pm_qos_get_value(struct pm_qos_constraints *c) 147static inline int pm_qos_get_value(struct pm_qos_constraints *c)
132{ 148{
149 struct plist_node *node;
150 int total_value = 0;
151
133 if (plist_head_empty(&c->list)) 152 if (plist_head_empty(&c->list))
134 return c->no_constraint_value; 153 return c->no_constraint_value;
135 154
@@ -140,6 +159,12 @@ static inline int pm_qos_get_value(struct pm_qos_constraints *c)
140 case PM_QOS_MAX: 159 case PM_QOS_MAX:
141 return plist_last(&c->list)->prio; 160 return plist_last(&c->list)->prio;
142 161
162 case PM_QOS_SUM:
163 plist_for_each(node, &c->list)
164 total_value += node->prio;
165
166 return total_value;
167
143 default: 168 default:
144 /* runtime check for not using enum */ 169 /* runtime check for not using enum */
145 BUG(); 170 BUG();
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index f1604d8cf489..0c40c16174b4 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -28,6 +28,7 @@
28#include <linux/list.h> 28#include <linux/list.h>
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <linux/compiler.h> 30#include <linux/compiler.h>
31#include <linux/ktime.h>
31 32
32#include <asm/uaccess.h> 33#include <asm/uaccess.h>
33#include <asm/mmu_context.h> 34#include <asm/mmu_context.h>
@@ -725,6 +726,14 @@ static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn)
725 clear_bit(bit, addr); 726 clear_bit(bit, addr);
726} 727}
727 728
729static void memory_bm_clear_current(struct memory_bitmap *bm)
730{
731 int bit;
732
733 bit = max(bm->cur.node_bit - 1, 0);
734 clear_bit(bit, bm->cur.node->data);
735}
736
728static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn) 737static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn)
729{ 738{
730 void *addr; 739 void *addr;
@@ -1333,23 +1342,39 @@ static struct memory_bitmap copy_bm;
1333 1342
1334void swsusp_free(void) 1343void swsusp_free(void)
1335{ 1344{
1336 struct zone *zone; 1345 unsigned long fb_pfn, fr_pfn;
1337 unsigned long pfn, max_zone_pfn;
1338 1346
1339 for_each_populated_zone(zone) { 1347 if (!forbidden_pages_map || !free_pages_map)
1340 max_zone_pfn = zone_end_pfn(zone); 1348 goto out;
1341 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 1349
1342 if (pfn_valid(pfn)) { 1350 memory_bm_position_reset(forbidden_pages_map);
1343 struct page *page = pfn_to_page(pfn); 1351 memory_bm_position_reset(free_pages_map);
1344 1352
1345 if (swsusp_page_is_forbidden(page) && 1353loop:
1346 swsusp_page_is_free(page)) { 1354 fr_pfn = memory_bm_next_pfn(free_pages_map);
1347 swsusp_unset_page_forbidden(page); 1355 fb_pfn = memory_bm_next_pfn(forbidden_pages_map);
1348 swsusp_unset_page_free(page); 1356
1349 __free_page(page); 1357 /*
1350 } 1358 * Find the next bit set in both bitmaps. This is guaranteed to
1351 } 1359 * terminate when fb_pfn == fr_pfn == BM_END_OF_MAP.
1360 */
1361 do {
1362 if (fb_pfn < fr_pfn)
1363 fb_pfn = memory_bm_next_pfn(forbidden_pages_map);
1364 if (fr_pfn < fb_pfn)
1365 fr_pfn = memory_bm_next_pfn(free_pages_map);
1366 } while (fb_pfn != fr_pfn);
1367
1368 if (fr_pfn != BM_END_OF_MAP && pfn_valid(fr_pfn)) {
1369 struct page *page = pfn_to_page(fr_pfn);
1370
1371 memory_bm_clear_current(forbidden_pages_map);
1372 memory_bm_clear_current(free_pages_map);
1373 __free_page(page);
1374 goto loop;
1352 } 1375 }
1376
1377out:
1353 nr_copy_pages = 0; 1378 nr_copy_pages = 0;
1354 nr_meta_pages = 0; 1379 nr_meta_pages = 0;
1355 restore_pblist = NULL; 1380 restore_pblist = NULL;
@@ -1552,11 +1577,11 @@ int hibernate_preallocate_memory(void)
1552 struct zone *zone; 1577 struct zone *zone;
1553 unsigned long saveable, size, max_size, count, highmem, pages = 0; 1578 unsigned long saveable, size, max_size, count, highmem, pages = 0;
1554 unsigned long alloc, save_highmem, pages_highmem, avail_normal; 1579 unsigned long alloc, save_highmem, pages_highmem, avail_normal;
1555 struct timeval start, stop; 1580 ktime_t start, stop;
1556 int error; 1581 int error;
1557 1582
1558 printk(KERN_INFO "PM: Preallocating image memory... "); 1583 printk(KERN_INFO "PM: Preallocating image memory... ");
1559 do_gettimeofday(&start); 1584 start = ktime_get();
1560 1585
1561 error = memory_bm_create(&orig_bm, GFP_IMAGE, PG_ANY); 1586 error = memory_bm_create(&orig_bm, GFP_IMAGE, PG_ANY);
1562 if (error) 1587 if (error)
@@ -1685,9 +1710,9 @@ int hibernate_preallocate_memory(void)
1685 free_unnecessary_pages(); 1710 free_unnecessary_pages();
1686 1711
1687 out: 1712 out:
1688 do_gettimeofday(&stop); 1713 stop = ktime_get();
1689 printk(KERN_CONT "done (allocated %lu pages)\n", pages); 1714 printk(KERN_CONT "done (allocated %lu pages)\n", pages);
1690 swsusp_show_speed(&start, &stop, pages, "Allocated"); 1715 swsusp_show_speed(start, stop, pages, "Allocated");
1691 1716
1692 return 0; 1717 return 0;
1693 1718
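
Note: the swsusp_free() rewrite above stops scanning every PFN of every zone and instead walks the forbidden and free bitmaps in lockstep, always advancing whichever cursor is behind until both point at the same PFN; only pages marked in both maps are released. A user-space sketch of that intersection walk over two sorted, sentinel-terminated sequences (END stands in for BM_END_OF_MAP):

/* Illustration: lockstep intersection of two sorted PFN streams,
 * the pattern used by the reworked swsusp_free() (user-space sketch). */
#include <stdio.h>
#include <limits.h>

#define END ULONG_MAX			/* stands in for BM_END_OF_MAP */

static unsigned long next(const unsigned long *set, int *pos)
{
	return set[(*pos)++];		/* sets are sorted and END-terminated */
}

int main(void)
{
	unsigned long forbidden[] = { 3, 7, 9, 12, END };
	unsigned long free_set[]  = { 2, 7, 8, 12, 15, END };
	int fb_pos = 0, fr_pos = 0;
	unsigned long fb = next(forbidden, &fb_pos);
	unsigned long fr = next(free_set, &fr_pos);

	while (fb != END || fr != END) {
		/* advance the cursor that is behind until both match */
		while (fb != fr) {
			if (fb < fr)
				fb = next(forbidden, &fb_pos);
			else
				fr = next(free_set, &fr_pos);
		}
		if (fr == END)
			break;
		printf("pfn %lu is both forbidden and free -> release it\n", fr);
		fb = next(forbidden, &fb_pos);
		fr = next(free_set, &fr_pos);
	}
	return 0;
}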
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 18c62195660f..c347e3ce3a55 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -146,17 +146,29 @@ static int platform_suspend_prepare(suspend_state_t state)
146 146
147static int platform_suspend_prepare_late(suspend_state_t state) 147static int platform_suspend_prepare_late(suspend_state_t state)
148{ 148{
149 return state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->prepare ?
150 freeze_ops->prepare() : 0;
151}
152
153static int platform_suspend_prepare_noirq(suspend_state_t state)
154{
149 return state != PM_SUSPEND_FREEZE && suspend_ops->prepare_late ? 155 return state != PM_SUSPEND_FREEZE && suspend_ops->prepare_late ?
150 suspend_ops->prepare_late() : 0; 156 suspend_ops->prepare_late() : 0;
151} 157}
152 158
153static void platform_suspend_wake(suspend_state_t state) 159static void platform_resume_noirq(suspend_state_t state)
154{ 160{
155 if (state != PM_SUSPEND_FREEZE && suspend_ops->wake) 161 if (state != PM_SUSPEND_FREEZE && suspend_ops->wake)
156 suspend_ops->wake(); 162 suspend_ops->wake();
157} 163}
158 164
159static void platform_suspend_finish(suspend_state_t state) 165static void platform_resume_early(suspend_state_t state)
166{
167 if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->restore)
168 freeze_ops->restore();
169}
170
171static void platform_resume_finish(suspend_state_t state)
160{ 172{
161 if (state != PM_SUSPEND_FREEZE && suspend_ops->finish) 173 if (state != PM_SUSPEND_FREEZE && suspend_ops->finish)
162 suspend_ops->finish(); 174 suspend_ops->finish();
@@ -172,7 +184,7 @@ static int platform_suspend_begin(suspend_state_t state)
172 return 0; 184 return 0;
173} 185}
174 186
175static void platform_suspend_end(suspend_state_t state) 187static void platform_resume_end(suspend_state_t state)
176{ 188{
177 if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end) 189 if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end)
178 freeze_ops->end(); 190 freeze_ops->end();
@@ -180,7 +192,7 @@ static void platform_suspend_end(suspend_state_t state)
180 suspend_ops->end(); 192 suspend_ops->end();
181} 193}
182 194
183static void platform_suspend_recover(suspend_state_t state) 195static void platform_recover(suspend_state_t state)
184{ 196{
185 if (state != PM_SUSPEND_FREEZE && suspend_ops->recover) 197 if (state != PM_SUSPEND_FREEZE && suspend_ops->recover)
186 suspend_ops->recover(); 198 suspend_ops->recover();
@@ -265,13 +277,22 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
265 if (error) 277 if (error)
266 goto Platform_finish; 278 goto Platform_finish;
267 279
268 error = dpm_suspend_end(PMSG_SUSPEND); 280 error = dpm_suspend_late(PMSG_SUSPEND);
269 if (error) { 281 if (error) {
270 printk(KERN_ERR "PM: Some devices failed to power down\n"); 282 printk(KERN_ERR "PM: late suspend of devices failed\n");
271 goto Platform_finish; 283 goto Platform_finish;
272 } 284 }
273 error = platform_suspend_prepare_late(state); 285 error = platform_suspend_prepare_late(state);
274 if (error) 286 if (error)
287 goto Devices_early_resume;
288
289 error = dpm_suspend_noirq(PMSG_SUSPEND);
290 if (error) {
291 printk(KERN_ERR "PM: noirq suspend of devices failed\n");
292 goto Platform_early_resume;
293 }
294 error = platform_suspend_prepare_noirq(state);
295 if (error)
275 goto Platform_wake; 296 goto Platform_wake;
276 297
277 if (suspend_test(TEST_PLATFORM)) 298 if (suspend_test(TEST_PLATFORM))
@@ -318,11 +339,17 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
318 enable_nonboot_cpus(); 339 enable_nonboot_cpus();
319 340
320 Platform_wake: 341 Platform_wake:
321 platform_suspend_wake(state); 342 platform_resume_noirq(state);
322 dpm_resume_start(PMSG_RESUME); 343 dpm_resume_noirq(PMSG_RESUME);
344
345 Platform_early_resume:
346 platform_resume_early(state);
347
348 Devices_early_resume:
349 dpm_resume_early(PMSG_RESUME);
323 350
324 Platform_finish: 351 Platform_finish:
325 platform_suspend_finish(state); 352 platform_resume_finish(state);
326 return error; 353 return error;
327} 354}
328 355
@@ -361,14 +388,16 @@ int suspend_devices_and_enter(suspend_state_t state)
361 suspend_test_start(); 388 suspend_test_start();
362 dpm_resume_end(PMSG_RESUME); 389 dpm_resume_end(PMSG_RESUME);
363 suspend_test_finish("resume devices"); 390 suspend_test_finish("resume devices");
391 trace_suspend_resume(TPS("resume_console"), state, true);
364 resume_console(); 392 resume_console();
393 trace_suspend_resume(TPS("resume_console"), state, false);
365 394
366 Close: 395 Close:
367 platform_suspend_end(state); 396 platform_resume_end(state);
368 return error; 397 return error;
369 398
370 Recover_platform: 399 Recover_platform:
371 platform_suspend_recover(state); 400 platform_recover(state);
372 goto Resume_devices; 401 goto Resume_devices;
373} 402}
374 403
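
Note: the suspend.c changes split device suspend into a "late" phase and a "noirq" phase, each paired with its own platform hook, and mirror that split on the error path with the new Platform_early_resume and Devices_early_resume labels. An abridged user-space sketch of the resulting call order and goto-based unwinding (step() merely prints the phase name; earlier and later phases of suspend_enter() are omitted):

/* Illustration: the late/noirq split and its mirrored unwind order
 * (user-space sketch; only the phases touched by this patch are shown). */
#include <stdio.h>

static int step(const char *name, int fail)
{
	printf("%s\n", name);
	return fail ? -1 : 0;
}

static int enter(int fail_at)		/* 0 = success, 1..4 = failing step */
{
	int error;

	error = step("dpm_suspend_late", fail_at == 1);
	if (error)
		return error;
	error = step("platform_suspend_prepare_late", fail_at == 2);
	if (error)
		goto Devices_early_resume;
	error = step("dpm_suspend_noirq", fail_at == 3);
	if (error)
		goto Platform_early_resume;
	error = step("platform_suspend_prepare_noirq", fail_at == 4);
	if (error)
		goto Platform_wake;

	step("...enter the sleep state...", 0);

 Platform_wake:
	step("platform_resume_noirq", 0);
	step("dpm_resume_noirq", 0);
 Platform_early_resume:
	step("platform_resume_early", 0);
 Devices_early_resume:
	step("dpm_resume_early", 0);
	return error;
}

int main(void)
{
	enter(3);	/* e.g. noirq suspend fails: only the early resume steps run */
	return 0;
}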
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index bd91bc177c93..084452e34a12 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -22,6 +22,8 @@
22#define TEST_SUSPEND_SECONDS 10 22#define TEST_SUSPEND_SECONDS 10
23 23
24static unsigned long suspend_test_start_time; 24static unsigned long suspend_test_start_time;
25static u32 test_repeat_count_max = 1;
26static u32 test_repeat_count_current;
25 27
26void suspend_test_start(void) 28void suspend_test_start(void)
27{ 29{
@@ -74,6 +76,7 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
74 int status; 76 int status;
75 77
76 /* this may fail if the RTC hasn't been initialized */ 78 /* this may fail if the RTC hasn't been initialized */
79repeat:
77 status = rtc_read_time(rtc, &alm.time); 80 status = rtc_read_time(rtc, &alm.time);
78 if (status < 0) { 81 if (status < 0) {
79 printk(err_readtime, dev_name(&rtc->dev), status); 82 printk(err_readtime, dev_name(&rtc->dev), status);
@@ -100,10 +103,21 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
100 if (state == PM_SUSPEND_STANDBY) { 103 if (state == PM_SUSPEND_STANDBY) {
101 printk(info_test, pm_states[state]); 104 printk(info_test, pm_states[state]);
102 status = pm_suspend(state); 105 status = pm_suspend(state);
106 if (status < 0)
107 state = PM_SUSPEND_FREEZE;
103 } 108 }
109 if (state == PM_SUSPEND_FREEZE) {
110 printk(info_test, pm_states[state]);
111 status = pm_suspend(state);
112 }
113
104 if (status < 0) 114 if (status < 0)
105 printk(err_suspend, status); 115 printk(err_suspend, status);
106 116
117 test_repeat_count_current++;
118 if (test_repeat_count_current < test_repeat_count_max)
119 goto repeat;
120
107 /* Some platforms can't detect that the alarm triggered the 121 /* Some platforms can't detect that the alarm triggered the
108 * wakeup, or (accordingly) disable it after it afterwards. 122 * wakeup, or (accordingly) disable it after it afterwards.
109 * It's supposed to give oneshot behavior; cope. 123 * It's supposed to give oneshot behavior; cope.
@@ -137,16 +151,28 @@ static char warn_bad_state[] __initdata =
137static int __init setup_test_suspend(char *value) 151static int __init setup_test_suspend(char *value)
138{ 152{
139 int i; 153 int i;
154 char *repeat;
155 char *suspend_type;
140 156
141 /* "=mem" ==> "mem" */ 157 /* example : "=mem[,N]" ==> "mem[,N]" */
142 value++; 158 value++;
159 suspend_type = strsep(&value, ",");
160 if (!suspend_type)
161 return 0;
162
163 repeat = strsep(&value, ",");
164 if (repeat) {
165 if (kstrtou32(repeat, 0, &test_repeat_count_max))
166 return 0;
167 }
168
143 for (i = 0; pm_labels[i]; i++) 169 for (i = 0; pm_labels[i]; i++)
144 if (!strcmp(pm_labels[i], value)) { 170 if (!strcmp(pm_labels[i], suspend_type)) {
145 test_state_label = pm_labels[i]; 171 test_state_label = pm_labels[i];
146 return 0; 172 return 0;
147 } 173 }
148 174
149 printk(warn_bad_state, value); 175 printk(warn_bad_state, suspend_type);
150 return 0; 176 return 0;
151} 177}
152__setup("test_suspend", setup_test_suspend); 178__setup("test_suspend", setup_test_suspend);
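
Note: suspend_test.c now accepts "test_suspend=mem,N" and repeats the wakealarm test N times. A user-space sketch of the new argument parsing, with strtoul() standing in for kstrtou32():

/* Illustration: parsing "=mem[,N]" into a suspend type and a repeat
 * count, mirroring setup_test_suspend() (user-space sketch). */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	char arg[] = "=mem,3";		/* value as passed by __setup() */
	char *value = arg + 1;		/* "=mem,3" ==> "mem,3" */
	char *suspend_type, *repeat;
	unsigned long repeat_max = 1;

	suspend_type = strsep(&value, ",");
	if (!suspend_type)
		return 0;

	repeat = strsep(&value, ",");
	if (repeat)
		repeat_max = strtoul(repeat, NULL, 0);

	printf("state=%s repeats=%lu\n", suspend_type, repeat_max);
	return 0;
}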
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index aaa3261dea5d..570aff817543 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -30,6 +30,7 @@
30#include <linux/atomic.h> 30#include <linux/atomic.h>
31#include <linux/kthread.h> 31#include <linux/kthread.h>
32#include <linux/crc32.h> 32#include <linux/crc32.h>
33#include <linux/ktime.h>
33 34
34#include "power.h" 35#include "power.h"
35 36
@@ -445,8 +446,8 @@ static int save_image(struct swap_map_handle *handle,
445 int nr_pages; 446 int nr_pages;
446 int err2; 447 int err2;
447 struct bio *bio; 448 struct bio *bio;
448 struct timeval start; 449 ktime_t start;
449 struct timeval stop; 450 ktime_t stop;
450 451
451 printk(KERN_INFO "PM: Saving image data pages (%u pages)...\n", 452 printk(KERN_INFO "PM: Saving image data pages (%u pages)...\n",
452 nr_to_write); 453 nr_to_write);
@@ -455,7 +456,7 @@ static int save_image(struct swap_map_handle *handle,
455 m = 1; 456 m = 1;
456 nr_pages = 0; 457 nr_pages = 0;
457 bio = NULL; 458 bio = NULL;
458 do_gettimeofday(&start); 459 start = ktime_get();
459 while (1) { 460 while (1) {
460 ret = snapshot_read_next(snapshot); 461 ret = snapshot_read_next(snapshot);
461 if (ret <= 0) 462 if (ret <= 0)
@@ -469,12 +470,12 @@ static int save_image(struct swap_map_handle *handle,
469 nr_pages++; 470 nr_pages++;
470 } 471 }
471 err2 = hib_wait_on_bio_chain(&bio); 472 err2 = hib_wait_on_bio_chain(&bio);
472 do_gettimeofday(&stop); 473 stop = ktime_get();
473 if (!ret) 474 if (!ret)
474 ret = err2; 475 ret = err2;
475 if (!ret) 476 if (!ret)
476 printk(KERN_INFO "PM: Image saving done.\n"); 477 printk(KERN_INFO "PM: Image saving done.\n");
477 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); 478 swsusp_show_speed(start, stop, nr_to_write, "Wrote");
478 return ret; 479 return ret;
479} 480}
480 481
@@ -580,8 +581,8 @@ static int save_image_lzo(struct swap_map_handle *handle,
580 int nr_pages; 581 int nr_pages;
581 int err2; 582 int err2;
582 struct bio *bio; 583 struct bio *bio;
583 struct timeval start; 584 ktime_t start;
584 struct timeval stop; 585 ktime_t stop;
585 size_t off; 586 size_t off;
586 unsigned thr, run_threads, nr_threads; 587 unsigned thr, run_threads, nr_threads;
587 unsigned char *page = NULL; 588 unsigned char *page = NULL;
@@ -674,7 +675,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
674 m = 1; 675 m = 1;
675 nr_pages = 0; 676 nr_pages = 0;
676 bio = NULL; 677 bio = NULL;
677 do_gettimeofday(&start); 678 start = ktime_get();
678 for (;;) { 679 for (;;) {
679 for (thr = 0; thr < nr_threads; thr++) { 680 for (thr = 0; thr < nr_threads; thr++) {
680 for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) { 681 for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) {
@@ -759,12 +760,12 @@ static int save_image_lzo(struct swap_map_handle *handle,
759 760
760out_finish: 761out_finish:
761 err2 = hib_wait_on_bio_chain(&bio); 762 err2 = hib_wait_on_bio_chain(&bio);
762 do_gettimeofday(&stop); 763 stop = ktime_get();
763 if (!ret) 764 if (!ret)
764 ret = err2; 765 ret = err2;
765 if (!ret) 766 if (!ret)
766 printk(KERN_INFO "PM: Image saving done.\n"); 767 printk(KERN_INFO "PM: Image saving done.\n");
767 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); 768 swsusp_show_speed(start, stop, nr_to_write, "Wrote");
768out_clean: 769out_clean:
769 if (crc) { 770 if (crc) {
770 if (crc->thr) 771 if (crc->thr)
@@ -965,8 +966,8 @@ static int load_image(struct swap_map_handle *handle,
965{ 966{
966 unsigned int m; 967 unsigned int m;
967 int ret = 0; 968 int ret = 0;
968 struct timeval start; 969 ktime_t start;
969 struct timeval stop; 970 ktime_t stop;
970 struct bio *bio; 971 struct bio *bio;
971 int err2; 972 int err2;
972 unsigned nr_pages; 973 unsigned nr_pages;
@@ -978,7 +979,7 @@ static int load_image(struct swap_map_handle *handle,
978 m = 1; 979 m = 1;
979 nr_pages = 0; 980 nr_pages = 0;
980 bio = NULL; 981 bio = NULL;
981 do_gettimeofday(&start); 982 start = ktime_get();
982 for ( ; ; ) { 983 for ( ; ; ) {
983 ret = snapshot_write_next(snapshot); 984 ret = snapshot_write_next(snapshot);
984 if (ret <= 0) 985 if (ret <= 0)
@@ -996,7 +997,7 @@ static int load_image(struct swap_map_handle *handle,
996 nr_pages++; 997 nr_pages++;
997 } 998 }
998 err2 = hib_wait_on_bio_chain(&bio); 999 err2 = hib_wait_on_bio_chain(&bio);
999 do_gettimeofday(&stop); 1000 stop = ktime_get();
1000 if (!ret) 1001 if (!ret)
1001 ret = err2; 1002 ret = err2;
1002 if (!ret) { 1003 if (!ret) {
@@ -1005,7 +1006,7 @@ static int load_image(struct swap_map_handle *handle,
1005 if (!snapshot_image_loaded(snapshot)) 1006 if (!snapshot_image_loaded(snapshot))
1006 ret = -ENODATA; 1007 ret = -ENODATA;
1007 } 1008 }
1008 swsusp_show_speed(&start, &stop, nr_to_read, "Read"); 1009 swsusp_show_speed(start, stop, nr_to_read, "Read");
1009 return ret; 1010 return ret;
1010} 1011}
1011 1012
@@ -1067,8 +1068,8 @@ static int load_image_lzo(struct swap_map_handle *handle,
1067 int ret = 0; 1068 int ret = 0;
1068 int eof = 0; 1069 int eof = 0;
1069 struct bio *bio; 1070 struct bio *bio;
1070 struct timeval start; 1071 ktime_t start;
1071 struct timeval stop; 1072 ktime_t stop;
1072 unsigned nr_pages; 1073 unsigned nr_pages;
1073 size_t off; 1074 size_t off;
1074 unsigned i, thr, run_threads, nr_threads; 1075 unsigned i, thr, run_threads, nr_threads;
@@ -1190,7 +1191,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
1190 m = 1; 1191 m = 1;
1191 nr_pages = 0; 1192 nr_pages = 0;
1192 bio = NULL; 1193 bio = NULL;
1193 do_gettimeofday(&start); 1194 start = ktime_get();
1194 1195
1195 ret = snapshot_write_next(snapshot); 1196 ret = snapshot_write_next(snapshot);
1196 if (ret <= 0) 1197 if (ret <= 0)
@@ -1343,7 +1344,7 @@ out_finish:
1343 wait_event(crc->done, atomic_read(&crc->stop)); 1344 wait_event(crc->done, atomic_read(&crc->stop));
1344 atomic_set(&crc->stop, 0); 1345 atomic_set(&crc->stop, 0);
1345 } 1346 }
1346 do_gettimeofday(&stop); 1347 stop = ktime_get();
1347 if (!ret) { 1348 if (!ret) {
1348 printk(KERN_INFO "PM: Image loading done.\n"); 1349 printk(KERN_INFO "PM: Image loading done.\n");
1349 snapshot_write_finalize(snapshot); 1350 snapshot_write_finalize(snapshot);
@@ -1359,7 +1360,7 @@ out_finish:
1359 } 1360 }
1360 } 1361 }
1361 } 1362 }
1362 swsusp_show_speed(&start, &stop, nr_to_read, "Read"); 1363 swsusp_show_speed(start, stop, nr_to_read, "Read");
1363out_clean: 1364out_clean:
1364 for (i = 0; i < ring_size; i++) 1365 for (i = 0; i < ring_size; i++)
1365 free_page((unsigned long)page[i]); 1366 free_page((unsigned long)page[i]);
@@ -1374,7 +1375,7 @@ out_clean:
1374 kthread_stop(data[thr].thr); 1375 kthread_stop(data[thr].thr);
1375 vfree(data); 1376 vfree(data);
1376 } 1377 }
1377 if (page) vfree(page); 1378 vfree(page);
1378 1379
1379 return ret; 1380 return ret;
1380} 1381}
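
Note: the swap.c hunks, like the snapshot.c ones above, convert the I/O speed measurements from do_gettimeofday()/struct timeval to ktime_get()/ktime_t, so swsusp_show_speed() now takes the timestamps by value. A user-space sketch of the pattern, with clock_gettime(CLOCK_MONOTONIC) standing in for ktime_get() and a simplified show_speed():

/* Illustration: monotonic start/stop timing as used by the converted
 * save/load paths (user-space sketch of the ktime_get() pattern). */
#include <stdio.h>
#include <time.h>

typedef long long ktime_ns_t;

static ktime_ns_t mono_now(void)	/* stand-in for ktime_get() */
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (ktime_ns_t)ts.tv_sec * 1000000000LL + ts.tv_nsec;
}

static void show_speed(ktime_ns_t start, ktime_ns_t stop,
		       unsigned long pages, const char *msg)
{
	long long ms = (stop - start) / 1000000LL;
	long long rate;

	if (ms == 0)
		ms = 1;				/* avoid division by zero */
	rate = (long long)pages * 1000 / ms;
	printf("%s %lu pages in %lld.%03lld seconds (%lld pages/s)\n",
	       msg, pages, ms / 1000, ms % 1000, rate);
}

int main(void)
{
	ktime_ns_t start = mono_now();
	/* ... do the work being measured ... */
	ktime_ns_t stop = mono_now();

	show_speed(start, stop, 4096, "Wrote");
	return 0;
}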
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 1ce770687ea8..02d6b6d28796 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -62,9 +62,6 @@ int console_printk[4] = {
62 CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */ 62 CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */
63}; 63};
64 64
65/* Deferred messaged from sched code are marked by this special level */
66#define SCHED_MESSAGE_LOGLEVEL -2
67
68/* 65/*
69 * Low level drivers may need that to know if they can schedule in 66 * Low level drivers may need that to know if they can schedule in
70 * their unblank() callback or not. So let's export it. 67 * their unblank() callback or not. So let's export it.
@@ -267,7 +264,6 @@ static u32 clear_idx;
267#define LOG_ALIGN __alignof__(struct printk_log) 264#define LOG_ALIGN __alignof__(struct printk_log)
268#endif 265#endif
269#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) 266#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
270#define __LOG_CPU_MAX_BUF_LEN (1 << CONFIG_LOG_CPU_MAX_BUF_SHIFT)
271static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); 267static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
272static char *log_buf = __log_buf; 268static char *log_buf = __log_buf;
273static u32 log_buf_len = __LOG_BUF_LEN; 269static u32 log_buf_len = __LOG_BUF_LEN;
@@ -481,7 +477,7 @@ static int syslog_action_restricted(int type)
481 type != SYSLOG_ACTION_SIZE_BUFFER; 477 type != SYSLOG_ACTION_SIZE_BUFFER;
482} 478}
483 479
484static int check_syslog_permissions(int type, bool from_file) 480int check_syslog_permissions(int type, bool from_file)
485{ 481{
486 /* 482 /*
487 * If this is from /proc/kmsg and we've already opened it, then we've 483 * If this is from /proc/kmsg and we've already opened it, then we've
@@ -519,14 +515,13 @@ struct devkmsg_user {
519 char buf[8192]; 515 char buf[8192];
520}; 516};
521 517
522static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv, 518static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from)
523 unsigned long count, loff_t pos)
524{ 519{
525 char *buf, *line; 520 char *buf, *line;
526 int i; 521 int i;
527 int level = default_message_loglevel; 522 int level = default_message_loglevel;
528 int facility = 1; /* LOG_USER */ 523 int facility = 1; /* LOG_USER */
529 size_t len = iov_length(iv, count); 524 size_t len = iocb->ki_nbytes;
530 ssize_t ret = len; 525 ssize_t ret = len;
531 526
532 if (len > LOG_LINE_MAX) 527 if (len > LOG_LINE_MAX)
@@ -535,13 +530,10 @@ static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv,
535 if (buf == NULL) 530 if (buf == NULL)
536 return -ENOMEM; 531 return -ENOMEM;
537 532
538 line = buf; 533 buf[len] = '\0';
539 for (i = 0; i < count; i++) { 534 if (copy_from_iter(buf, len, from) != len) {
540 if (copy_from_user(line, iv[i].iov_base, iv[i].iov_len)) { 535 kfree(buf);
541 ret = -EFAULT; 536 return -EFAULT;
542 goto out;
543 }
544 line += iv[i].iov_len;
545 } 537 }
546 538
547 /* 539 /*
@@ -567,10 +559,8 @@ static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv,
567 line = endp; 559 line = endp;
568 } 560 }
569 } 561 }
570 line[len] = '\0';
571 562
572 printk_emit(facility, level, NULL, 0, "%s", line); 563 printk_emit(facility, level, NULL, 0, "%s", line);
573out:
574 kfree(buf); 564 kfree(buf);
575 return ret; 565 return ret;
576} 566}
@@ -802,7 +792,7 @@ static int devkmsg_release(struct inode *inode, struct file *file)
802const struct file_operations kmsg_fops = { 792const struct file_operations kmsg_fops = {
803 .open = devkmsg_open, 793 .open = devkmsg_open,
804 .read = devkmsg_read, 794 .read = devkmsg_read,
805 .aio_write = devkmsg_writev, 795 .write_iter = devkmsg_write,
806 .llseek = devkmsg_llseek, 796 .llseek = devkmsg_llseek,
807 .poll = devkmsg_poll, 797 .poll = devkmsg_poll,
808 .release = devkmsg_release, 798 .release = devkmsg_release,
@@ -858,6 +848,9 @@ static int __init log_buf_len_setup(char *str)
858} 848}
859early_param("log_buf_len", log_buf_len_setup); 849early_param("log_buf_len", log_buf_len_setup);
860 850
851#ifdef CONFIG_SMP
852#define __LOG_CPU_MAX_BUF_LEN (1 << CONFIG_LOG_CPU_MAX_BUF_SHIFT)
853
861static void __init log_buf_add_cpu(void) 854static void __init log_buf_add_cpu(void)
862{ 855{
863 unsigned int cpu_extra; 856 unsigned int cpu_extra;
@@ -884,6 +877,9 @@ static void __init log_buf_add_cpu(void)
884 877
885 log_buf_len_update(cpu_extra + __LOG_BUF_LEN); 878 log_buf_len_update(cpu_extra + __LOG_BUF_LEN);
886} 879}
880#else /* !CONFIG_SMP */
881static inline void log_buf_add_cpu(void) {}
882#endif /* CONFIG_SMP */
887 883
888void __init setup_log_buf(int early) 884void __init setup_log_buf(int early)
889{ 885{
@@ -1260,7 +1256,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
1260int do_syslog(int type, char __user *buf, int len, bool from_file) 1256int do_syslog(int type, char __user *buf, int len, bool from_file)
1261{ 1257{
1262 bool clear = false; 1258 bool clear = false;
1263 static int saved_console_loglevel = -1; 1259 static int saved_console_loglevel = LOGLEVEL_DEFAULT;
1264 int error; 1260 int error;
1265 1261
1266 error = check_syslog_permissions(type, from_file); 1262 error = check_syslog_permissions(type, from_file);
@@ -1317,15 +1313,15 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
1317 break; 1313 break;
1318 /* Disable logging to console */ 1314 /* Disable logging to console */
1319 case SYSLOG_ACTION_CONSOLE_OFF: 1315 case SYSLOG_ACTION_CONSOLE_OFF:
1320 if (saved_console_loglevel == -1) 1316 if (saved_console_loglevel == LOGLEVEL_DEFAULT)
1321 saved_console_loglevel = console_loglevel; 1317 saved_console_loglevel = console_loglevel;
1322 console_loglevel = minimum_console_loglevel; 1318 console_loglevel = minimum_console_loglevel;
1323 break; 1319 break;
1324 /* Enable logging to console */ 1320 /* Enable logging to console */
1325 case SYSLOG_ACTION_CONSOLE_ON: 1321 case SYSLOG_ACTION_CONSOLE_ON:
1326 if (saved_console_loglevel != -1) { 1322 if (saved_console_loglevel != LOGLEVEL_DEFAULT) {
1327 console_loglevel = saved_console_loglevel; 1323 console_loglevel = saved_console_loglevel;
1328 saved_console_loglevel = -1; 1324 saved_console_loglevel = LOGLEVEL_DEFAULT;
1329 } 1325 }
1330 break; 1326 break;
1331 /* Set level of messages printed to console */ 1327 /* Set level of messages printed to console */
@@ -1337,7 +1333,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
1337 len = minimum_console_loglevel; 1333 len = minimum_console_loglevel;
1338 console_loglevel = len; 1334 console_loglevel = len;
1339 /* Implicitly re-enable logging to console */ 1335 /* Implicitly re-enable logging to console */
1340 saved_console_loglevel = -1; 1336 saved_console_loglevel = LOGLEVEL_DEFAULT;
1341 error = 0; 1337 error = 0;
1342 break; 1338 break;
1343 /* Number of chars in the log buffer */ 1339 /* Number of chars in the log buffer */
@@ -1628,10 +1624,10 @@ asmlinkage int vprintk_emit(int facility, int level,
1628 int printed_len = 0; 1624 int printed_len = 0;
1629 bool in_sched = false; 1625 bool in_sched = false;
1630 /* cpu currently holding logbuf_lock in this function */ 1626 /* cpu currently holding logbuf_lock in this function */
1631 static volatile unsigned int logbuf_cpu = UINT_MAX; 1627 static unsigned int logbuf_cpu = UINT_MAX;
1632 1628
1633 if (level == SCHED_MESSAGE_LOGLEVEL) { 1629 if (level == LOGLEVEL_SCHED) {
1634 level = -1; 1630 level = LOGLEVEL_DEFAULT;
1635 in_sched = true; 1631 in_sched = true;
1636 } 1632 }
1637 1633
@@ -1680,12 +1676,7 @@ asmlinkage int vprintk_emit(int facility, int level,
1680 * The printf needs to come first; we need the syslog 1676 * The printf needs to come first; we need the syslog
1681 * prefix which might be passed-in as a parameter. 1677 * prefix which might be passed-in as a parameter.
1682 */ 1678 */
1683 if (in_sched) 1679 text_len = vscnprintf(text, sizeof(textbuf), fmt, args);
1684 text_len = scnprintf(text, sizeof(textbuf),
1685 KERN_WARNING "[sched_delayed] ");
1686
1687 text_len += vscnprintf(text + text_len,
1688 sizeof(textbuf) - text_len, fmt, args);
1689 1680
1690 /* mark and strip a trailing newline */ 1681 /* mark and strip a trailing newline */
1691 if (text_len && text[text_len-1] == '\n') { 1682 if (text_len && text[text_len-1] == '\n') {
@@ -1701,8 +1692,9 @@ asmlinkage int vprintk_emit(int facility, int level,
1701 const char *end_of_header = printk_skip_level(text); 1692 const char *end_of_header = printk_skip_level(text);
1702 switch (kern_level) { 1693 switch (kern_level) {
1703 case '0' ... '7': 1694 case '0' ... '7':
1704 if (level == -1) 1695 if (level == LOGLEVEL_DEFAULT)
1705 level = kern_level - '0'; 1696 level = kern_level - '0';
1697 /* fallthrough */
1706 case 'd': /* KERN_DEFAULT */ 1698 case 'd': /* KERN_DEFAULT */
1707 lflags |= LOG_PREFIX; 1699 lflags |= LOG_PREFIX;
1708 } 1700 }
@@ -1716,7 +1708,7 @@ asmlinkage int vprintk_emit(int facility, int level,
1716 } 1708 }
1717 } 1709 }
1718 1710
1719 if (level == -1) 1711 if (level == LOGLEVEL_DEFAULT)
1720 level = default_message_loglevel; 1712 level = default_message_loglevel;
1721 1713
1722 if (dict) 1714 if (dict)
@@ -1794,7 +1786,7 @@ EXPORT_SYMBOL(vprintk_emit);
1794 1786
1795asmlinkage int vprintk(const char *fmt, va_list args) 1787asmlinkage int vprintk(const char *fmt, va_list args)
1796{ 1788{
1797 return vprintk_emit(0, -1, NULL, 0, fmt, args); 1789 return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args);
1798} 1790}
1799EXPORT_SYMBOL(vprintk); 1791EXPORT_SYMBOL(vprintk);
1800 1792
@@ -1813,6 +1805,30 @@ asmlinkage int printk_emit(int facility, int level,
1813} 1805}
1814EXPORT_SYMBOL(printk_emit); 1806EXPORT_SYMBOL(printk_emit);
1815 1807
1808int vprintk_default(const char *fmt, va_list args)
1809{
1810 int r;
1811
1812#ifdef CONFIG_KGDB_KDB
1813 if (unlikely(kdb_trap_printk)) {
1814 r = vkdb_printf(fmt, args);
1815 return r;
1816 }
1817#endif
1818 r = vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args);
1819
1820 return r;
1821}
1822EXPORT_SYMBOL_GPL(vprintk_default);
1823
1824/*
1825 * This allows printk to be diverted to another function per cpu.
1826 * This is useful for calling printk functions from within NMI
1827 * without worrying about race conditions that can lock up the
1828 * box.
1829 */
1830DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
1831
1816/** 1832/**
1817 * printk - print a kernel message 1833 * printk - print a kernel message
1818 * @fmt: format string 1834 * @fmt: format string
@@ -1836,19 +1852,21 @@ EXPORT_SYMBOL(printk_emit);
1836 */ 1852 */
1837asmlinkage __visible int printk(const char *fmt, ...) 1853asmlinkage __visible int printk(const char *fmt, ...)
1838{ 1854{
1855 printk_func_t vprintk_func;
1839 va_list args; 1856 va_list args;
1840 int r; 1857 int r;
1841 1858
1842#ifdef CONFIG_KGDB_KDB
1843 if (unlikely(kdb_trap_printk)) {
1844 va_start(args, fmt);
1845 r = vkdb_printf(fmt, args);
1846 va_end(args);
1847 return r;
1848 }
1849#endif
1850 va_start(args, fmt); 1859 va_start(args, fmt);
1851 r = vprintk_emit(0, -1, NULL, 0, fmt, args); 1860
1861 /*
1862 * If a caller overrides the per_cpu printk_func, then it needs
1863 * to disable preemption when calling printk(). Otherwise
1864 * the printk_func should be set to the default. No need to
1865 * disable preemption here.
1866 */
1867 vprintk_func = this_cpu_read(printk_func);
1868 r = vprintk_func(fmt, args);
1869
1852 va_end(args); 1870 va_end(args);
1853 1871
1854 return r; 1872 return r;
@@ -1882,28 +1900,28 @@ static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev,
1882 bool syslog, char *buf, size_t size) { return 0; } 1900 bool syslog, char *buf, size_t size) { return 0; }
1883static size_t cont_print_text(char *text, size_t size) { return 0; } 1901static size_t cont_print_text(char *text, size_t size) { return 0; }
1884 1902
1903/* Still needs to be defined for users */
1904DEFINE_PER_CPU(printk_func_t, printk_func);
1905
1885#endif /* CONFIG_PRINTK */ 1906#endif /* CONFIG_PRINTK */
1886 1907
1887#ifdef CONFIG_EARLY_PRINTK 1908#ifdef CONFIG_EARLY_PRINTK
1888struct console *early_console; 1909struct console *early_console;
1889 1910
1890void early_vprintk(const char *fmt, va_list ap)
1891{
1892 if (early_console) {
1893 char buf[512];
1894 int n = vscnprintf(buf, sizeof(buf), fmt, ap);
1895
1896 early_console->write(early_console, buf, n);
1897 }
1898}
1899
1900asmlinkage __visible void early_printk(const char *fmt, ...) 1911asmlinkage __visible void early_printk(const char *fmt, ...)
1901{ 1912{
1902 va_list ap; 1913 va_list ap;
1914 char buf[512];
1915 int n;
1916
1917 if (!early_console)
1918 return;
1903 1919
1904 va_start(ap, fmt); 1920 va_start(ap, fmt);
1905 early_vprintk(fmt, ap); 1921 n = vscnprintf(buf, sizeof(buf), fmt, ap);
1906 va_end(ap); 1922 va_end(ap);
1923
1924 early_console->write(early_console, buf, n);
1907} 1925}
1908#endif 1926#endif
1909 1927
@@ -2628,7 +2646,7 @@ void wake_up_klogd(void)
2628 preempt_disable(); 2646 preempt_disable();
2629 if (waitqueue_active(&log_wait)) { 2647 if (waitqueue_active(&log_wait)) {
2630 this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); 2648 this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
2631 irq_work_queue(&__get_cpu_var(wake_up_klogd_work)); 2649 irq_work_queue(this_cpu_ptr(&wake_up_klogd_work));
2632 } 2650 }
2633 preempt_enable(); 2651 preempt_enable();
2634} 2652}
@@ -2640,11 +2658,11 @@ int printk_deferred(const char *fmt, ...)
2640 2658
2641 preempt_disable(); 2659 preempt_disable();
2642 va_start(args, fmt); 2660 va_start(args, fmt);
2643 r = vprintk_emit(0, SCHED_MESSAGE_LOGLEVEL, NULL, 0, fmt, args); 2661 r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, 0, fmt, args);
2644 va_end(args); 2662 va_end(args);
2645 2663
2646 __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT); 2664 __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT);
2647 irq_work_queue(&__get_cpu_var(wake_up_klogd_work)); 2665 irq_work_queue(this_cpu_ptr(&wake_up_klogd_work));
2648 preempt_enable(); 2666 preempt_enable();
2649 2667
2650 return r; 2668 return r;
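
Note: the printk.c changes route printk() through a per-CPU function pointer, printk_func, which defaults to vprintk_default(); a caller such as an NMI handler can temporarily point it elsewhere to divert output without taking logbuf_lock. A user-space sketch of that indirection, with a __thread variable standing in for the per-CPU one:

/* Illustration: routing printk() through a replaceable per-context
 * function pointer (user-space sketch; __thread stands in for the
 * per-CPU variable). */
#include <stdarg.h>
#include <stdio.h>

typedef int (*printk_func_t)(const char *fmt, va_list args);

static int vprintk_default(const char *fmt, va_list args)
{
	return vprintf(fmt, args);	/* normal path: emit to the log */
}

static int vprintk_diverted(const char *fmt, va_list args)
{
	/* diverted path: e.g. buffer the message for later */
	return vfprintf(stderr, fmt, args);
}

static __thread printk_func_t printk_func = vprintk_default;

static int printk(const char *fmt, ...)
{
	va_list args;
	int r;

	va_start(args, fmt);
	r = printk_func(fmt, args);	/* this_cpu_read(printk_func) */
	va_end(args);
	return r;
}

int main(void)
{
	printk("hello via the default path\n");

	printk_func = vprintk_diverted;	/* what an NMI handler might do */
	printk("hello via the diverted path\n");
	printk_func = vprintk_default;
	return 0;
}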
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 54e75226c2c4..1eb9d90c3af9 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -485,36 +485,19 @@ static int ptrace_detach(struct task_struct *child, unsigned int data)
485 485
486/* 486/*
487 * Detach all tasks we were using ptrace on. Called with tasklist held 487 * Detach all tasks we were using ptrace on. Called with tasklist held
488 * for writing, and returns with it held too. But note it can release 488 * for writing.
489 * and reacquire the lock.
490 */ 489 */
491void exit_ptrace(struct task_struct *tracer) 490void exit_ptrace(struct task_struct *tracer, struct list_head *dead)
492 __releases(&tasklist_lock)
493 __acquires(&tasklist_lock)
494{ 491{
495 struct task_struct *p, *n; 492 struct task_struct *p, *n;
496 LIST_HEAD(ptrace_dead);
497
498 if (likely(list_empty(&tracer->ptraced)))
499 return;
500 493
501 list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) { 494 list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) {
502 if (unlikely(p->ptrace & PT_EXITKILL)) 495 if (unlikely(p->ptrace & PT_EXITKILL))
503 send_sig_info(SIGKILL, SEND_SIG_FORCED, p); 496 send_sig_info(SIGKILL, SEND_SIG_FORCED, p);
504 497
505 if (__ptrace_detach(tracer, p)) 498 if (__ptrace_detach(tracer, p))
506 list_add(&p->ptrace_entry, &ptrace_dead); 499 list_add(&p->ptrace_entry, dead);
507 }
508
509 write_unlock_irq(&tasklist_lock);
510 BUG_ON(!list_empty(&tracer->ptraced));
511
512 list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_entry) {
513 list_del_init(&p->ptrace_entry);
514 release_task(p);
515 } 500 }
516
517 write_lock_irq(&tasklist_lock);
518} 501}
519 502
520int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len) 503int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len)
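
Note: exit_ptrace() no longer drops and retakes tasklist_lock around release_task(); it only moves detached tracees onto a caller-supplied "dead" list, and the caller releases them after unlocking. A toy user-space sketch of that collect-then-release pattern (the list and lock types here are stand-ins, not the kernel's):

/* Illustration: collect under the lock, release after dropping it,
 * as the reworked exit_ptrace() expects (user-space sketch). */
#include <stdio.h>
#include <pthread.h>

struct task {
	const char *name;
	struct task *next;
};

static pthread_mutex_t tasklist_lock = PTHREAD_MUTEX_INITIALIZER;

/* Phase 1: under the lock, only unlink and collect. */
static void exit_ptrace(struct task **traced, struct task **dead)
{
	struct task *p;

	while ((p = *traced) != NULL) {
		*traced = p->next;
		p->next = *dead;	/* list_add(&p->ptrace_entry, dead) */
		*dead = p;
	}
}

/* Phase 2: after dropping the lock, do the expensive release work. */
static void release_dead(struct task *dead)
{
	while (dead) {
		struct task *next = dead->next;

		printf("release_task(%s)\n", dead->name);
		dead = next;
	}
}

int main(void)
{
	struct task c = { "child-2", NULL }, b = { "child-1", &c };
	struct task *traced = &b, *dead = NULL;

	pthread_mutex_lock(&tasklist_lock);
	exit_ptrace(&traced, &dead);
	pthread_mutex_unlock(&tasklist_lock);

	release_dead(dead);		/* runs without tasklist_lock held */
	return 0;
}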
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
index 807ccfbf69b3..e6fae503d1bc 100644
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -1,6 +1,6 @@
1obj-y += update.o srcu.o 1obj-y += update.o srcu.o
2obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o 2obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
3obj-$(CONFIG_TREE_RCU) += tree.o 3obj-$(CONFIG_TREE_RCU) += tree.o
4obj-$(CONFIG_TREE_PREEMPT_RCU) += tree.o 4obj-$(CONFIG_PREEMPT_RCU) += tree.o
5obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o 5obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o
6obj-$(CONFIG_TINY_RCU) += tiny.o 6obj-$(CONFIG_TINY_RCU) += tiny.o
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index ff1a6de62f17..07bb02eda844 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -135,4 +135,6 @@ int rcu_jiffies_till_stall_check(void);
135 */ 135 */
136#define TPS(x) tracepoint_string(x) 136#define TPS(x) tracepoint_string(x)
137 137
138void rcu_early_boot_tests(void);
139
138#endif /* __LINUX_RCU_H */ 140#endif /* __LINUX_RCU_H */
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 948a7693748e..4d559baf06e0 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -49,11 +49,19 @@
49#include <linux/trace_clock.h> 49#include <linux/trace_clock.h>
50#include <asm/byteorder.h> 50#include <asm/byteorder.h>
51#include <linux/torture.h> 51#include <linux/torture.h>
52#include <linux/vmalloc.h>
52 53
53MODULE_LICENSE("GPL"); 54MODULE_LICENSE("GPL");
54MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@joshtriplett.org>"); 55MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@joshtriplett.org>");
55 56
56 57
58torture_param(int, cbflood_inter_holdoff, HZ,
59 "Holdoff between floods (jiffies)");
60torture_param(int, cbflood_intra_holdoff, 1,
61 "Holdoff between bursts (jiffies)");
62torture_param(int, cbflood_n_burst, 3, "# bursts in flood, zero to disable");
63torture_param(int, cbflood_n_per_burst, 20000,
64 "# callbacks per burst in flood");
57torture_param(int, fqs_duration, 0, 65torture_param(int, fqs_duration, 0,
58 "Duration of fqs bursts (us), 0 to disable"); 66 "Duration of fqs bursts (us), 0 to disable");
59torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)"); 67torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)");
@@ -96,10 +104,12 @@ module_param(torture_type, charp, 0444);
96MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)"); 104MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)");
97 105
98static int nrealreaders; 106static int nrealreaders;
107static int ncbflooders;
99static struct task_struct *writer_task; 108static struct task_struct *writer_task;
100static struct task_struct **fakewriter_tasks; 109static struct task_struct **fakewriter_tasks;
101static struct task_struct **reader_tasks; 110static struct task_struct **reader_tasks;
102static struct task_struct *stats_task; 111static struct task_struct *stats_task;
112static struct task_struct **cbflood_task;
103static struct task_struct *fqs_task; 113static struct task_struct *fqs_task;
104static struct task_struct *boost_tasks[NR_CPUS]; 114static struct task_struct *boost_tasks[NR_CPUS];
105static struct task_struct *stall_task; 115static struct task_struct *stall_task;
@@ -138,6 +148,7 @@ static long n_rcu_torture_boosts;
138static long n_rcu_torture_timers; 148static long n_rcu_torture_timers;
139static long n_barrier_attempts; 149static long n_barrier_attempts;
140static long n_barrier_successes; 150static long n_barrier_successes;
151static atomic_long_t n_cbfloods;
141static struct list_head rcu_torture_removed; 152static struct list_head rcu_torture_removed;
142 153
143static int rcu_torture_writer_state; 154static int rcu_torture_writer_state;
@@ -157,9 +168,9 @@ static int rcu_torture_writer_state;
157#else 168#else
158#define RCUTORTURE_RUNNABLE_INIT 0 169#define RCUTORTURE_RUNNABLE_INIT 0
159#endif 170#endif
160int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; 171static int torture_runnable = RCUTORTURE_RUNNABLE_INIT;
161module_param(rcutorture_runnable, int, 0444); 172module_param(torture_runnable, int, 0444);
162MODULE_PARM_DESC(rcutorture_runnable, "Start rcutorture at boot"); 173MODULE_PARM_DESC(torture_runnable, "Start rcutorture at boot");
163 174
164#if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) 175#if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU)
165#define rcu_can_boost() 1 176#define rcu_can_boost() 1
@@ -182,7 +193,7 @@ static u64 notrace rcu_trace_clock_local(void)
182#endif /* #else #ifdef CONFIG_RCU_TRACE */ 193#endif /* #else #ifdef CONFIG_RCU_TRACE */
183 194
184static unsigned long boost_starttime; /* jiffies of next boost test start. */ 195static unsigned long boost_starttime; /* jiffies of next boost test start. */
185DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ 196static DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
186 /* and boost task create/destroy. */ 197 /* and boost task create/destroy. */
187static atomic_t barrier_cbs_count; /* Barrier callbacks registered. */ 198static atomic_t barrier_cbs_count; /* Barrier callbacks registered. */
188static bool barrier_phase; /* Test phase. */ 199static bool barrier_phase; /* Test phase. */
@@ -242,7 +253,7 @@ struct rcu_torture_ops {
242 void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); 253 void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
243 void (*cb_barrier)(void); 254 void (*cb_barrier)(void);
244 void (*fqs)(void); 255 void (*fqs)(void);
245 void (*stats)(char *page); 256 void (*stats)(void);
246 int irq_capable; 257 int irq_capable;
247 int can_boost; 258 int can_boost;
248 const char *name; 259 const char *name;
@@ -525,21 +536,21 @@ static void srcu_torture_barrier(void)
525 srcu_barrier(&srcu_ctl); 536 srcu_barrier(&srcu_ctl);
526} 537}
527 538
528static void srcu_torture_stats(char *page) 539static void srcu_torture_stats(void)
529{ 540{
530 int cpu; 541 int cpu;
531 int idx = srcu_ctl.completed & 0x1; 542 int idx = srcu_ctl.completed & 0x1;
532 543
533 page += sprintf(page, "%s%s per-CPU(idx=%d):", 544 pr_alert("%s%s per-CPU(idx=%d):",
534 torture_type, TORTURE_FLAG, idx); 545 torture_type, TORTURE_FLAG, idx);
535 for_each_possible_cpu(cpu) { 546 for_each_possible_cpu(cpu) {
536 long c0, c1; 547 long c0, c1;
537 548
538 c0 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx]; 549 c0 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx];
539 c1 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]; 550 c1 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx];
540 page += sprintf(page, " %d(%ld,%ld)", cpu, c0, c1); 551 pr_cont(" %d(%ld,%ld)", cpu, c0, c1);
541 } 552 }
542 sprintf(page, "\n"); 553 pr_cont("\n");
543} 554}
544 555
545static void srcu_torture_synchronize_expedited(void) 556static void srcu_torture_synchronize_expedited(void)
@@ -601,6 +612,52 @@ static struct rcu_torture_ops sched_ops = {
601 .name = "sched" 612 .name = "sched"
602}; 613};
603 614
615#ifdef CONFIG_TASKS_RCU
616
617/*
618 * Definitions for RCU-tasks torture testing.
619 */
620
621static int tasks_torture_read_lock(void)
622{
623 return 0;
624}
625
626static void tasks_torture_read_unlock(int idx)
627{
628}
629
630static void rcu_tasks_torture_deferred_free(struct rcu_torture *p)
631{
632 call_rcu_tasks(&p->rtort_rcu, rcu_torture_cb);
633}
634
635static struct rcu_torture_ops tasks_ops = {
636 .ttype = RCU_TASKS_FLAVOR,
637 .init = rcu_sync_torture_init,
638 .readlock = tasks_torture_read_lock,
639 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
640 .readunlock = tasks_torture_read_unlock,
641 .completed = rcu_no_completed,
642 .deferred_free = rcu_tasks_torture_deferred_free,
643 .sync = synchronize_rcu_tasks,
644 .exp_sync = synchronize_rcu_tasks,
645 .call = call_rcu_tasks,
646 .cb_barrier = rcu_barrier_tasks,
647 .fqs = NULL,
648 .stats = NULL,
649 .irq_capable = 1,
650 .name = "tasks"
651};
652
653#define RCUTORTURE_TASKS_OPS &tasks_ops,
654
655#else /* #ifdef CONFIG_TASKS_RCU */
656
657#define RCUTORTURE_TASKS_OPS
658
659#endif /* #else #ifdef CONFIG_TASKS_RCU */
660
604/* 661/*
605 * RCU torture priority-boost testing. Runs one real-time thread per 662 * RCU torture priority-boost testing. Runs one real-time thread per
606 * CPU for moderate bursts, repeatedly registering RCU callbacks and 663 * CPU for moderate bursts, repeatedly registering RCU callbacks and
@@ -667,7 +724,7 @@ static int rcu_torture_boost(void *arg)
667 } 724 }
668 call_rcu_time = jiffies; 725 call_rcu_time = jiffies;
669 } 726 }
670 cond_resched(); 727 cond_resched_rcu_qs();
671 stutter_wait("rcu_torture_boost"); 728 stutter_wait("rcu_torture_boost");
672 if (torture_must_stop()) 729 if (torture_must_stop())
673 goto checkwait; 730 goto checkwait;
@@ -707,6 +764,59 @@ checkwait: stutter_wait("rcu_torture_boost");
707 return 0; 764 return 0;
708} 765}
709 766
767static void rcu_torture_cbflood_cb(struct rcu_head *rhp)
768{
769}
770
771/*
772 * RCU torture callback-flood kthread. Repeatedly induces bursts of calls
773 * to call_rcu() or analogous, increasing the probability of occurrence
774 * of callback-overflow corner cases.
775 */
776static int
777rcu_torture_cbflood(void *arg)
778{
779 int err = 1;
780 int i;
781 int j;
782 struct rcu_head *rhp;
783
784 if (cbflood_n_per_burst > 0 &&
785 cbflood_inter_holdoff > 0 &&
786 cbflood_intra_holdoff > 0 &&
787 cur_ops->call &&
788 cur_ops->cb_barrier) {
789 rhp = vmalloc(sizeof(*rhp) *
790 cbflood_n_burst * cbflood_n_per_burst);
791 err = !rhp;
792 }
793 if (err) {
794 VERBOSE_TOROUT_STRING("rcu_torture_cbflood disabled: Bad args or OOM");
795 while (!torture_must_stop())
796 schedule_timeout_interruptible(HZ);
797 return 0;
798 }
799 VERBOSE_TOROUT_STRING("rcu_torture_cbflood task started");
800 do {
801 schedule_timeout_interruptible(cbflood_inter_holdoff);
802 atomic_long_inc(&n_cbfloods);
803 WARN_ON(signal_pending(current));
804 for (i = 0; i < cbflood_n_burst; i++) {
805 for (j = 0; j < cbflood_n_per_burst; j++) {
806 cur_ops->call(&rhp[i * cbflood_n_per_burst + j],
807 rcu_torture_cbflood_cb);
808 }
809 schedule_timeout_interruptible(cbflood_intra_holdoff);
810 WARN_ON(signal_pending(current));
811 }
812 cur_ops->cb_barrier();
813 stutter_wait("rcu_torture_cbflood");
814 } while (!torture_must_stop());
815 vfree(rhp);
816 torture_kthread_stopping("rcu_torture_cbflood");
817 return 0;
818}
819
710/* 820/*
711 * RCU torture force-quiescent-state kthread. Repeatedly induces 821 * RCU torture force-quiescent-state kthread. Repeatedly induces
712 * bursts of calls to force_quiescent_state(), increasing the probability 822 * bursts of calls to force_quiescent_state(), increasing the probability
@@ -1019,7 +1129,7 @@ rcu_torture_reader(void *arg)
1019 __this_cpu_inc(rcu_torture_batch[completed]); 1129 __this_cpu_inc(rcu_torture_batch[completed]);
1020 preempt_enable(); 1130 preempt_enable();
1021 cur_ops->readunlock(idx); 1131 cur_ops->readunlock(idx);
1022 cond_resched(); 1132 cond_resched_rcu_qs();
1023 stutter_wait("rcu_torture_reader"); 1133 stutter_wait("rcu_torture_reader");
1024 } while (!torture_must_stop()); 1134 } while (!torture_must_stop());
1025 if (irqreader && cur_ops->irq_capable) { 1135 if (irqreader && cur_ops->irq_capable) {
@@ -1031,10 +1141,15 @@ rcu_torture_reader(void *arg)
1031} 1141}
1032 1142
1033/* 1143/*
1034 * Create an RCU-torture statistics message in the specified buffer. 1144 * Print torture statistics. Caller must ensure that there is only
1145 * one call to this function at a given time!!! This is normally
1146 * accomplished by relying on the module system to only have one copy
1147 * of the module loaded, and then by giving the rcu_torture_stats
1148 * kthread full control (or the init/cleanup functions when rcu_torture_stats
1149 * thread is not running).
1035 */ 1150 */
1036static void 1151static void
1037rcu_torture_printk(char *page) 1152rcu_torture_stats_print(void)
1038{ 1153{
1039 int cpu; 1154 int cpu;
1040 int i; 1155 int i;
@@ -1052,55 +1167,61 @@ rcu_torture_printk(char *page)
1052 if (pipesummary[i] != 0) 1167 if (pipesummary[i] != 0)
1053 break; 1168 break;
1054 } 1169 }
1055 page += sprintf(page, "%s%s ", torture_type, TORTURE_FLAG); 1170
1056 page += sprintf(page, 1171 pr_alert("%s%s ", torture_type, TORTURE_FLAG);
1057 "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ", 1172 pr_cont("rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ",
1058 rcu_torture_current, 1173 rcu_torture_current,
1059 rcu_torture_current_version, 1174 rcu_torture_current_version,
1060 list_empty(&rcu_torture_freelist), 1175 list_empty(&rcu_torture_freelist),
1061 atomic_read(&n_rcu_torture_alloc), 1176 atomic_read(&n_rcu_torture_alloc),
1062 atomic_read(&n_rcu_torture_alloc_fail), 1177 atomic_read(&n_rcu_torture_alloc_fail),
1063 atomic_read(&n_rcu_torture_free)); 1178 atomic_read(&n_rcu_torture_free));
1064 page += sprintf(page, "rtmbe: %d rtbke: %ld rtbre: %ld ", 1179 pr_cont("rtmbe: %d rtbke: %ld rtbre: %ld ",
1065 atomic_read(&n_rcu_torture_mberror), 1180 atomic_read(&n_rcu_torture_mberror),
1066 n_rcu_torture_boost_ktrerror, 1181 n_rcu_torture_boost_ktrerror,
1067 n_rcu_torture_boost_rterror); 1182 n_rcu_torture_boost_rterror);
1068 page += sprintf(page, "rtbf: %ld rtb: %ld nt: %ld ", 1183 pr_cont("rtbf: %ld rtb: %ld nt: %ld ",
1069 n_rcu_torture_boost_failure, 1184 n_rcu_torture_boost_failure,
1070 n_rcu_torture_boosts, 1185 n_rcu_torture_boosts,
1071 n_rcu_torture_timers); 1186 n_rcu_torture_timers);
1072 page = torture_onoff_stats(page); 1187 torture_onoff_stats();
1073 page += sprintf(page, "barrier: %ld/%ld:%ld", 1188 pr_cont("barrier: %ld/%ld:%ld ",
1074 n_barrier_successes, 1189 n_barrier_successes,
1075 n_barrier_attempts, 1190 n_barrier_attempts,
1076 n_rcu_torture_barrier_error); 1191 n_rcu_torture_barrier_error);
1077 page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG); 1192 pr_cont("cbflood: %ld\n", atomic_long_read(&n_cbfloods));
1193
1194 pr_alert("%s%s ", torture_type, TORTURE_FLAG);
1078 if (atomic_read(&n_rcu_torture_mberror) != 0 || 1195 if (atomic_read(&n_rcu_torture_mberror) != 0 ||
1079 n_rcu_torture_barrier_error != 0 || 1196 n_rcu_torture_barrier_error != 0 ||
1080 n_rcu_torture_boost_ktrerror != 0 || 1197 n_rcu_torture_boost_ktrerror != 0 ||
1081 n_rcu_torture_boost_rterror != 0 || 1198 n_rcu_torture_boost_rterror != 0 ||
1082 n_rcu_torture_boost_failure != 0 || 1199 n_rcu_torture_boost_failure != 0 ||
1083 i > 1) { 1200 i > 1) {
1084 page += sprintf(page, "!!! "); 1201 pr_cont("%s", "!!! ");
1085 atomic_inc(&n_rcu_torture_error); 1202 atomic_inc(&n_rcu_torture_error);
1086 WARN_ON_ONCE(1); 1203 WARN_ON_ONCE(1);
1087 } 1204 }
1088 page += sprintf(page, "Reader Pipe: "); 1205 pr_cont("Reader Pipe: ");
1089 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) 1206 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
1090 page += sprintf(page, " %ld", pipesummary[i]); 1207 pr_cont(" %ld", pipesummary[i]);
1091 page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG); 1208 pr_cont("\n");
1092 page += sprintf(page, "Reader Batch: "); 1209
1210 pr_alert("%s%s ", torture_type, TORTURE_FLAG);
1211 pr_cont("Reader Batch: ");
1093 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) 1212 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
1094 page += sprintf(page, " %ld", batchsummary[i]); 1213 pr_cont(" %ld", batchsummary[i]);
1095 page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG); 1214 pr_cont("\n");
1096 page += sprintf(page, "Free-Block Circulation: "); 1215
1216 pr_alert("%s%s ", torture_type, TORTURE_FLAG);
1217 pr_cont("Free-Block Circulation: ");
1097 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { 1218 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
1098 page += sprintf(page, " %d", 1219 pr_cont(" %d", atomic_read(&rcu_torture_wcount[i]));
1099 atomic_read(&rcu_torture_wcount[i]));
1100 } 1220 }
1101 page += sprintf(page, "\n"); 1221 pr_cont("\n");
1222
1102 if (cur_ops->stats) 1223 if (cur_ops->stats)
1103 cur_ops->stats(page); 1224 cur_ops->stats();
1104 if (rtcv_snap == rcu_torture_current_version && 1225 if (rtcv_snap == rcu_torture_current_version &&
1105 rcu_torture_current != NULL) { 1226 rcu_torture_current != NULL) {
1106 int __maybe_unused flags; 1227 int __maybe_unused flags;
@@ -1109,10 +1230,9 @@ rcu_torture_printk(char *page)
1109 1230
1110 rcutorture_get_gp_data(cur_ops->ttype, 1231 rcutorture_get_gp_data(cur_ops->ttype,
1111 &flags, &gpnum, &completed); 1232 &flags, &gpnum, &completed);
1112 page += sprintf(page, 1233 pr_alert("??? Writer stall state %d g%lu c%lu f%#x\n",
1113 "??? Writer stall state %d g%lu c%lu f%#x\n", 1234 rcu_torture_writer_state,
1114 rcu_torture_writer_state, 1235 gpnum, completed, flags);
1115 gpnum, completed, flags);
1116 show_rcu_gp_kthreads(); 1236 show_rcu_gp_kthreads();
1117 rcutorture_trace_dump(); 1237 rcutorture_trace_dump();
1118 } 1238 }
@@ -1120,30 +1240,6 @@ rcu_torture_printk(char *page)
1120} 1240}
1121 1241
1122/* 1242/*
1123 * Print torture statistics. Caller must ensure that there is only
1124 * one call to this function at a given time!!! This is normally
1125 * accomplished by relying on the module system to only have one copy
1126 * of the module loaded, and then by giving the rcu_torture_stats
1127 * kthread full control (or the init/cleanup functions when rcu_torture_stats
1128 * thread is not running).
1129 */
1130static void
1131rcu_torture_stats_print(void)
1132{
1133 int size = nr_cpu_ids * 200 + 8192;
1134 char *buf;
1135
1136 buf = kmalloc(size, GFP_KERNEL);
1137 if (!buf) {
1138 pr_err("rcu-torture: Out of memory, need: %d", size);
1139 return;
1140 }
1141 rcu_torture_printk(buf);
1142 pr_alert("%s", buf);
1143 kfree(buf);
1144}
1145
1146/*
1147 * Periodically prints torture statistics, if periodic statistics printing 1243 * Periodically prints torture statistics, if periodic statistics printing
1148 * was specified via the stat_interval module parameter. 1244 * was specified via the stat_interval module parameter.
1149 */ 1245 */
@@ -1295,7 +1391,8 @@ static int rcu_torture_barrier_cbs(void *arg)
1295 if (atomic_dec_and_test(&barrier_cbs_count)) 1391 if (atomic_dec_and_test(&barrier_cbs_count))
1296 wake_up(&barrier_wq); 1392 wake_up(&barrier_wq);
1297 } while (!torture_must_stop()); 1393 } while (!torture_must_stop());
1298 cur_ops->cb_barrier(); 1394 if (cur_ops->cb_barrier != NULL)
1395 cur_ops->cb_barrier();
1299 destroy_rcu_head_on_stack(&rcu); 1396 destroy_rcu_head_on_stack(&rcu);
1300 torture_kthread_stopping("rcu_torture_barrier_cbs"); 1397 torture_kthread_stopping("rcu_torture_barrier_cbs");
1301 return 0; 1398 return 0;
@@ -1418,7 +1515,7 @@ rcu_torture_cleanup(void)
1418 int i; 1515 int i;
1419 1516
1420 rcutorture_record_test_transition(); 1517 rcutorture_record_test_transition();
1421 if (torture_cleanup()) { 1518 if (torture_cleanup_begin()) {
1422 if (cur_ops->cb_barrier != NULL) 1519 if (cur_ops->cb_barrier != NULL)
1423 cur_ops->cb_barrier(); 1520 cur_ops->cb_barrier();
1424 return; 1521 return;
@@ -1447,6 +1544,8 @@ rcu_torture_cleanup(void)
1447 1544
1448 torture_stop_kthread(rcu_torture_stats, stats_task); 1545 torture_stop_kthread(rcu_torture_stats, stats_task);
1449 torture_stop_kthread(rcu_torture_fqs, fqs_task); 1546 torture_stop_kthread(rcu_torture_fqs, fqs_task);
1547 for (i = 0; i < ncbflooders; i++)
1548 torture_stop_kthread(rcu_torture_cbflood, cbflood_task[i]);
1450 if ((test_boost == 1 && cur_ops->can_boost) || 1549 if ((test_boost == 1 && cur_ops->can_boost) ||
1451 test_boost == 2) { 1550 test_boost == 2) {
1452 unregister_cpu_notifier(&rcutorture_cpu_nb); 1551 unregister_cpu_notifier(&rcutorture_cpu_nb);
@@ -1468,6 +1567,7 @@ rcu_torture_cleanup(void)
1468 "End of test: RCU_HOTPLUG"); 1567 "End of test: RCU_HOTPLUG");
1469 else 1568 else
1470 rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); 1569 rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS");
1570 torture_cleanup_end();
1471} 1571}
1472 1572
1473#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD 1573#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
@@ -1534,9 +1634,10 @@ rcu_torture_init(void)
1534 int firsterr = 0; 1634 int firsterr = 0;
1535 static struct rcu_torture_ops *torture_ops[] = { 1635 static struct rcu_torture_ops *torture_ops[] = {
1536 &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &sched_ops, 1636 &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &sched_ops,
1637 RCUTORTURE_TASKS_OPS
1537 }; 1638 };
1538 1639
1539 if (!torture_init_begin(torture_type, verbose, &rcutorture_runnable)) 1640 if (!torture_init_begin(torture_type, verbose, &torture_runnable))
1540 return -EBUSY; 1641 return -EBUSY;
1541 1642
1542 /* Process args and tell the world that the torturer is on the job. */ 1643 /* Process args and tell the world that the torturer is on the job. */
@@ -1693,6 +1794,24 @@ rcu_torture_init(void)
1693 goto unwind; 1794 goto unwind;
1694 if (object_debug) 1795 if (object_debug)
1695 rcu_test_debug_objects(); 1796 rcu_test_debug_objects();
1797 if (cbflood_n_burst > 0) {
1798 /* Create the cbflood threads */
1799 ncbflooders = (num_online_cpus() + 3) / 4;
1800 cbflood_task = kcalloc(ncbflooders, sizeof(*cbflood_task),
1801 GFP_KERNEL);
1802 if (!cbflood_task) {
1803 VERBOSE_TOROUT_ERRSTRING("out of memory");
1804 firsterr = -ENOMEM;
1805 goto unwind;
1806 }
1807 for (i = 0; i < ncbflooders; i++) {
1808 firsterr = torture_create_kthread(rcu_torture_cbflood,
1809 NULL,
1810 cbflood_task[i]);
1811 if (firsterr)
1812 goto unwind;
1813 }
1814 }
1696 rcutorture_record_test_transition(); 1815 rcutorture_record_test_transition();
1697 torture_init_end(); 1816 torture_init_end();
1698 return 0; 1817 return 0;
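
Note: among the rcutorture changes above is a new callback-flood kthread that queues cbflood_n_burst bursts of cbflood_n_per_burst callbacks, pausing cbflood_intra_holdoff jiffies between bursts and cbflood_inter_holdoff jiffies between floods, then waits for a callback barrier. A user-space sketch of that pacing loop (usleep() and printf() stand in for the holdoffs and for call_rcu()):

/* Illustration: the burst pacing used by rcu_torture_cbflood()
 * (user-space sketch with tiny counts). */
#include <stdio.h>
#include <unistd.h>

#define INTER_HOLDOFF_US 100000		/* holdoff between floods */
#define INTRA_HOLDOFF_US 10000		/* holdoff between bursts */
#define N_BURST 3			/* bursts per flood */
#define N_PER_BURST 5			/* callbacks per burst */

static void queue_callback(int i, int j)
{
	printf("call_rcu(&rhp[%d])\n", i * N_PER_BURST + j);
}

int main(void)
{
	int flood, i, j;

	for (flood = 0; flood < 2; flood++) {	/* two floods for the demo */
		usleep(INTER_HOLDOFF_US);
		for (i = 0; i < N_BURST; i++) {
			for (j = 0; j < N_PER_BURST; j++)
				queue_callback(i, j);
			usleep(INTRA_HOLDOFF_US);
		}
		printf("cb_barrier(): wait for all queued callbacks\n");
	}
	return 0;
}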
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index d9efcc13008c..0db5649f8817 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -51,7 +51,7 @@ static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
51 51
52#include "tiny_plugin.h" 52#include "tiny_plugin.h"
53 53
54/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */ 54/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcu/tree.c. */
55static void rcu_idle_enter_common(long long newval) 55static void rcu_idle_enter_common(long long newval)
56{ 56{
57 if (newval) { 57 if (newval) {
@@ -62,7 +62,7 @@ static void rcu_idle_enter_common(long long newval)
62 } 62 }
63 RCU_TRACE(trace_rcu_dyntick(TPS("Start"), 63 RCU_TRACE(trace_rcu_dyntick(TPS("Start"),
64 rcu_dynticks_nesting, newval)); 64 rcu_dynticks_nesting, newval));
65 if (!is_idle_task(current)) { 65 if (IS_ENABLED(CONFIG_RCU_TRACE) && !is_idle_task(current)) {
66 struct task_struct *idle __maybe_unused = idle_task(smp_processor_id()); 66 struct task_struct *idle __maybe_unused = idle_task(smp_processor_id());
67 67
68 RCU_TRACE(trace_rcu_dyntick(TPS("Entry error: not idle task"), 68 RCU_TRACE(trace_rcu_dyntick(TPS("Entry error: not idle task"),
@@ -72,7 +72,7 @@ static void rcu_idle_enter_common(long long newval)
72 current->pid, current->comm, 72 current->pid, current->comm,
73 idle->pid, idle->comm); /* must be idle task! */ 73 idle->pid, idle->comm); /* must be idle task! */
74 } 74 }
75 rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */ 75 rcu_sched_qs(); /* implies rcu_bh_inc() */
76 barrier(); 76 barrier();
77 rcu_dynticks_nesting = newval; 77 rcu_dynticks_nesting = newval;
78} 78}
@@ -114,7 +114,7 @@ void rcu_irq_exit(void)
114} 114}
115EXPORT_SYMBOL_GPL(rcu_irq_exit); 115EXPORT_SYMBOL_GPL(rcu_irq_exit);
116 116
117/* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcutree.c. */ 117/* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcu/tree.c. */
118static void rcu_idle_exit_common(long long oldval) 118static void rcu_idle_exit_common(long long oldval)
119{ 119{
120 if (oldval) { 120 if (oldval) {
@@ -123,7 +123,7 @@ static void rcu_idle_exit_common(long long oldval)
123 return; 123 return;
124 } 124 }
125 RCU_TRACE(trace_rcu_dyntick(TPS("End"), oldval, rcu_dynticks_nesting)); 125 RCU_TRACE(trace_rcu_dyntick(TPS("End"), oldval, rcu_dynticks_nesting));
126 if (!is_idle_task(current)) { 126 if (IS_ENABLED(CONFIG_RCU_TRACE) && !is_idle_task(current)) {
127 struct task_struct *idle __maybe_unused = idle_task(smp_processor_id()); 127 struct task_struct *idle __maybe_unused = idle_task(smp_processor_id());
128 128
129 RCU_TRACE(trace_rcu_dyntick(TPS("Exit error: not idle task"), 129 RCU_TRACE(trace_rcu_dyntick(TPS("Exit error: not idle task"),
@@ -217,7 +217,7 @@ static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
217 * are at it, given that any rcu quiescent state is also an rcu_bh 217 * are at it, given that any rcu quiescent state is also an rcu_bh
218 * quiescent state. Use "+" instead of "||" to defeat short circuiting. 218 * quiescent state. Use "+" instead of "||" to defeat short circuiting.
219 */ 219 */
220void rcu_sched_qs(int cpu) 220void rcu_sched_qs(void)
221{ 221{
222 unsigned long flags; 222 unsigned long flags;
223 223
@@ -231,7 +231,7 @@ void rcu_sched_qs(int cpu)
231/* 231/*
232 * Record an rcu_bh quiescent state. 232 * Record an rcu_bh quiescent state.
233 */ 233 */
234void rcu_bh_qs(int cpu) 234void rcu_bh_qs(void)
235{ 235{
236 unsigned long flags; 236 unsigned long flags;
237 237
@@ -247,13 +247,15 @@ void rcu_bh_qs(int cpu)
247 * be called from hardirq context. It is normally called from the 247 * be called from hardirq context. It is normally called from the
248 * scheduling-clock interrupt. 248 * scheduling-clock interrupt.
249 */ 249 */
250void rcu_check_callbacks(int cpu, int user) 250void rcu_check_callbacks(int user)
251{ 251{
252 RCU_TRACE(check_cpu_stalls()); 252 RCU_TRACE(check_cpu_stalls());
253 if (user || rcu_is_cpu_rrupt_from_idle()) 253 if (user || rcu_is_cpu_rrupt_from_idle())
254 rcu_sched_qs(cpu); 254 rcu_sched_qs();
255 else if (!in_softirq()) 255 else if (!in_softirq())
256 rcu_bh_qs(cpu); 256 rcu_bh_qs();
257 if (user)
258 rcu_note_voluntary_context_switch(current);
257} 259}
258 260
259/* 261/*
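
The rcu_check_callbacks() hunk keeps the old classification, a tick taken from user mode or from idle counts as a full quiescent state and a kernel tick outside softirq counts for rcu_bh only, and additionally reports a voluntary context switch for RCU-tasks on user-mode ticks. A rough sketch of that decision, with the context flags modelled as a plain struct rather than the trap frame and preempt counters the kernel actually inspects:

#include <stdbool.h>
#include <stdio.h>

/* Illustrative flags only; the kernel derives these from the interrupted
 * register state and preempt counters, not from a struct like this. */
struct tick_ctx {
        bool from_user;          /* tick interrupted user mode          */
        bool from_idle;          /* tick interrupted the idle loop      */
        bool in_softirq;         /* tick interrupted softirq processing */
};

static void note_sched_qs(void)  { puts("rcu_sched/rcu_bh quiescent state"); }
static void note_bh_qs(void)     { puts("rcu_bh quiescent state"); }
static void note_tasks_qs(void)  { puts("voluntary context switch for RCU-tasks"); }

static void check_callbacks(const struct tick_ctx *c)
{
        if (c->from_user || c->from_idle)
                note_sched_qs();          /* user/idle tick: full quiescent state */
        else if (!c->in_softirq)
                note_bh_qs();             /* kernel tick outside softirq: bh only */
        if (c->from_user)
                note_tasks_qs();          /* the new RCU-tasks hook in this hunk  */
}

int main(void)
{
        struct tick_ctx user_tick = { .from_user = true };

        check_callbacks(&user_tick);
        return 0;
}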
@@ -378,7 +380,9 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
378} 380}
379EXPORT_SYMBOL_GPL(call_rcu_bh); 381EXPORT_SYMBOL_GPL(call_rcu_bh);
380 382
381void rcu_init(void) 383void __init rcu_init(void)
382{ 384{
383 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 385 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
386
387 rcu_early_boot_tests();
384} 388}
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 1b70cb6fbe3c..7680fc275036 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -79,9 +79,18 @@ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
79 * the tracing userspace tools to be able to decipher the string 79 * the tracing userspace tools to be able to decipher the string
80 * address to the matching string. 80 * address to the matching string.
81 */ 81 */
82#define RCU_STATE_INITIALIZER(sname, sabbr, cr) \ 82#ifdef CONFIG_TRACING
83# define DEFINE_RCU_TPS(sname) \
83static char sname##_varname[] = #sname; \ 84static char sname##_varname[] = #sname; \
84static const char *tp_##sname##_varname __used __tracepoint_string = sname##_varname; \ 85static const char *tp_##sname##_varname __used __tracepoint_string = sname##_varname;
86# define RCU_STATE_NAME(sname) sname##_varname
87#else
88# define DEFINE_RCU_TPS(sname)
89# define RCU_STATE_NAME(sname) __stringify(sname)
90#endif
91
92#define RCU_STATE_INITIALIZER(sname, sabbr, cr) \
93DEFINE_RCU_TPS(sname) \
85struct rcu_state sname##_state = { \ 94struct rcu_state sname##_state = { \
86 .level = { &sname##_state.node[0] }, \ 95 .level = { &sname##_state.node[0] }, \
87 .call = cr, \ 96 .call = cr, \
@@ -93,10 +102,10 @@ struct rcu_state sname##_state = { \
93 .orphan_donetail = &sname##_state.orphan_donelist, \ 102 .orphan_donetail = &sname##_state.orphan_donelist, \
94 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ 103 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
95 .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \ 104 .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \
96 .name = sname##_varname, \ 105 .name = RCU_STATE_NAME(sname), \
97 .abbr = sabbr, \ 106 .abbr = sabbr, \
98}; \ 107}; \
99DEFINE_PER_CPU(struct rcu_data, sname##_data) 108DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, sname##_data)
100 109
101RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); 110RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
102RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); 111RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh);
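
The RCU_STATE_INITIALIZER() hunk moves the tracepoint-string definition behind CONFIG_TRACING: with tracing enabled the state name points at a dedicated _varname symbol, without it the name falls back to plain stringification and no extra symbol is emitted. A compilable sketch of that define-it-two-ways pattern; TRACE_NAMES, DEFINE_TPS() and STATE_NAME() are stand-in names, and MY_STRINGIFY() stands in for the kernel's __stringify():

#include <stdio.h>

#define MY_STRINGIFY_1(x) #x
#define MY_STRINGIFY(x)   MY_STRINGIFY_1(x)

#ifdef TRACE_NAMES                            /* stand-in for CONFIG_TRACING */
# define DEFINE_TPS(sname) static const char sname##_varname[] = #sname;
# define STATE_NAME(sname) sname##_varname
#else
# define DEFINE_TPS(sname)                    /* no extra symbol without tracing */
# define STATE_NAME(sname) MY_STRINGIFY(sname)
#endif

struct state { const char *name; };

#define STATE_INITIALIZER(sname) \
DEFINE_TPS(sname)                \
static struct state sname##_state = { .name = STATE_NAME(sname) };

STATE_INITIALIZER(rcu_sched)                  /* one struct per invocation, plus the
                                               * name string only under TRACE_NAMES */

int main(void)
{
        printf("%s\n", rcu_sched_state.name);
        return 0;
}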
@@ -143,19 +152,6 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active);
143 */ 152 */
144static int rcu_scheduler_fully_active __read_mostly; 153static int rcu_scheduler_fully_active __read_mostly;
145 154
146#ifdef CONFIG_RCU_BOOST
147
148/*
149 * Control variables for per-CPU and per-rcu_node kthreads. These
150 * handle all flavors of RCU.
151 */
152static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
153DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
154DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
155DEFINE_PER_CPU(char, rcu_cpu_has_work);
156
157#endif /* #ifdef CONFIG_RCU_BOOST */
158
159static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); 155static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
160static void invoke_rcu_core(void); 156static void invoke_rcu_core(void);
161static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); 157static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
@@ -188,22 +184,24 @@ static int rcu_gp_in_progress(struct rcu_state *rsp)
188 * one since the start of the grace period, this just sets a flag. 184 * one since the start of the grace period, this just sets a flag.
189 * The caller must have disabled preemption. 185 * The caller must have disabled preemption.
190 */ 186 */
191void rcu_sched_qs(int cpu) 187void rcu_sched_qs(void)
192{ 188{
193 struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu); 189 if (!__this_cpu_read(rcu_sched_data.passed_quiesce)) {
194 190 trace_rcu_grace_period(TPS("rcu_sched"),
195 if (rdp->passed_quiesce == 0) 191 __this_cpu_read(rcu_sched_data.gpnum),
196 trace_rcu_grace_period(TPS("rcu_sched"), rdp->gpnum, TPS("cpuqs")); 192 TPS("cpuqs"));
197 rdp->passed_quiesce = 1; 193 __this_cpu_write(rcu_sched_data.passed_quiesce, 1);
194 }
198} 195}
199 196
200void rcu_bh_qs(int cpu) 197void rcu_bh_qs(void)
201{ 198{
202 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); 199 if (!__this_cpu_read(rcu_bh_data.passed_quiesce)) {
203 200 trace_rcu_grace_period(TPS("rcu_bh"),
204 if (rdp->passed_quiesce == 0) 201 __this_cpu_read(rcu_bh_data.gpnum),
205 trace_rcu_grace_period(TPS("rcu_bh"), rdp->gpnum, TPS("cpuqs")); 202 TPS("cpuqs"));
206 rdp->passed_quiesce = 1; 203 __this_cpu_write(rcu_bh_data.passed_quiesce, 1);
204 }
207} 205}
208 206
209static DEFINE_PER_CPU(int, rcu_sched_qs_mask); 207static DEFINE_PER_CPU(int, rcu_sched_qs_mask);
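
The rewritten rcu_sched_qs()/rcu_bh_qs() drop the cpu argument and touch only the invoking CPU's rcu_data through __this_cpu_read()/__this_cpu_write(), tracing just the first transition of passed_quiesce in each grace period. A small sketch of that record-once pattern, with C11 _Thread_local standing in loosely for per-CPU data and a printf() for the tracepoint:

#include <stdbool.h>
#include <stdio.h>

/* _Thread_local is only an analogy for per-CPU data; the kernel's
 * __this_cpu_read()/__this_cpu_write() address the current CPU's slot
 * under preemption rules this sketch does not model. */
static _Thread_local struct {
        unsigned long gpnum;
        bool passed_quiesce;
} sched_data;

static void trace_grace_period(const char *flavor, unsigned long gpnum,
                               const char *event)
{
        printf("%s gp=%lu %s\n", flavor, gpnum, event);
}

static void sched_qs(void)
{
        if (!sched_data.passed_quiesce) {          /* report only the first time */
                trace_grace_period("rcu_sched", sched_data.gpnum, "cpuqs");
                sched_data.passed_quiesce = true;
        }
}

int main(void)
{
        sched_qs();
        sched_qs();        /* second call is a no-op until the next grace period */
        return 0;
}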
@@ -275,11 +273,11 @@ static void rcu_momentary_dyntick_idle(void)
275 * and requires special handling for preemptible RCU. 273 * and requires special handling for preemptible RCU.
276 * The caller must have disabled preemption. 274 * The caller must have disabled preemption.
277 */ 275 */
278void rcu_note_context_switch(int cpu) 276void rcu_note_context_switch(void)
279{ 277{
280 trace_rcu_utilization(TPS("Start context switch")); 278 trace_rcu_utilization(TPS("Start context switch"));
281 rcu_sched_qs(cpu); 279 rcu_sched_qs();
282 rcu_preempt_note_context_switch(cpu); 280 rcu_preempt_note_context_switch();
283 if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) 281 if (unlikely(raw_cpu_read(rcu_sched_qs_mask)))
284 rcu_momentary_dyntick_idle(); 282 rcu_momentary_dyntick_idle();
285 trace_rcu_utilization(TPS("End context switch")); 283 trace_rcu_utilization(TPS("End context switch"));
@@ -314,7 +312,7 @@ static void force_qs_rnp(struct rcu_state *rsp,
314 unsigned long *maxj), 312 unsigned long *maxj),
315 bool *isidle, unsigned long *maxj); 313 bool *isidle, unsigned long *maxj);
316static void force_quiescent_state(struct rcu_state *rsp); 314static void force_quiescent_state(struct rcu_state *rsp);
317static int rcu_pending(int cpu); 315static int rcu_pending(void);
318 316
319/* 317/*
320 * Return the number of RCU-sched batches processed thus far for debug & stats. 318 * Return the number of RCU-sched batches processed thus far for debug & stats.
@@ -499,11 +497,11 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
499 * we really have entered idle, and must do the appropriate accounting. 497 * we really have entered idle, and must do the appropriate accounting.
500 * The caller must have disabled interrupts. 498 * The caller must have disabled interrupts.
501 */ 499 */
502static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, 500static void rcu_eqs_enter_common(long long oldval, bool user)
503 bool user)
504{ 501{
505 struct rcu_state *rsp; 502 struct rcu_state *rsp;
506 struct rcu_data *rdp; 503 struct rcu_data *rdp;
504 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
507 505
508 trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting); 506 trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting);
509 if (!user && !is_idle_task(current)) { 507 if (!user && !is_idle_task(current)) {
@@ -520,12 +518,13 @@ static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
520 rdp = this_cpu_ptr(rsp->rda); 518 rdp = this_cpu_ptr(rsp->rda);
521 do_nocb_deferred_wakeup(rdp); 519 do_nocb_deferred_wakeup(rdp);
522 } 520 }
523 rcu_prepare_for_idle(smp_processor_id()); 521 rcu_prepare_for_idle();
524 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ 522 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
525 smp_mb__before_atomic(); /* See above. */ 523 smp_mb__before_atomic(); /* See above. */
526 atomic_inc(&rdtp->dynticks); 524 atomic_inc(&rdtp->dynticks);
527 smp_mb__after_atomic(); /* Force ordering with next sojourn. */ 525 smp_mb__after_atomic(); /* Force ordering with next sojourn. */
528 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); 526 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
527 rcu_dynticks_task_enter();
529 528
530 /* 529 /*
531 * It is illegal to enter an extended quiescent state while 530 * It is illegal to enter an extended quiescent state while
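
The rcu_eqs_enter_common() hunk keeps the core dynticks protocol, bumping the counter with full ordering on every idle transition so that a sampled even value means the CPU is in an extended quiescent state; the change itself only moves the rcu_dynticks lookup inside the function and adds the rcu_dynticks_task_enter() hook. A toy version of that even/odd protocol with C11 atomics; the explicit fences mirror the smp_mb__before_atomic()/smp_mb__after_atomic() pairing and are otherwise only illustrative:

#include <stdatomic.h>
#include <stdio.h>

static atomic_int dynticks = 1;                      /* odd: CPU is active */

static void eqs_enter(void)
{
        atomic_thread_fence(memory_order_seq_cst);   /* order prior accesses */
        atomic_fetch_add(&dynticks, 1);              /* now even: idle */
        atomic_thread_fence(memory_order_seq_cst);
}

static void eqs_exit(void)
{
        atomic_thread_fence(memory_order_seq_cst);
        atomic_fetch_add(&dynticks, 1);              /* now odd: active again */
        atomic_thread_fence(memory_order_seq_cst);
}

int main(void)
{
        eqs_enter();
        printf("sampled dynticks %d -> %s\n", atomic_load(&dynticks),
               (atomic_load(&dynticks) & 1) ? "active" : "idle");
        eqs_exit();
        return 0;
}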
@@ -553,7 +552,7 @@ static void rcu_eqs_enter(bool user)
553 WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0); 552 WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0);
554 if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) { 553 if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) {
555 rdtp->dynticks_nesting = 0; 554 rdtp->dynticks_nesting = 0;
556 rcu_eqs_enter_common(rdtp, oldval, user); 555 rcu_eqs_enter_common(oldval, user);
557 } else { 556 } else {
558 rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE; 557 rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE;
559 } 558 }
@@ -577,7 +576,7 @@ void rcu_idle_enter(void)
577 576
578 local_irq_save(flags); 577 local_irq_save(flags);
579 rcu_eqs_enter(false); 578 rcu_eqs_enter(false);
580 rcu_sysidle_enter(this_cpu_ptr(&rcu_dynticks), 0); 579 rcu_sysidle_enter(0);
581 local_irq_restore(flags); 580 local_irq_restore(flags);
582} 581}
583EXPORT_SYMBOL_GPL(rcu_idle_enter); 582EXPORT_SYMBOL_GPL(rcu_idle_enter);
@@ -627,8 +626,8 @@ void rcu_irq_exit(void)
627 if (rdtp->dynticks_nesting) 626 if (rdtp->dynticks_nesting)
628 trace_rcu_dyntick(TPS("--="), oldval, rdtp->dynticks_nesting); 627 trace_rcu_dyntick(TPS("--="), oldval, rdtp->dynticks_nesting);
629 else 628 else
630 rcu_eqs_enter_common(rdtp, oldval, true); 629 rcu_eqs_enter_common(oldval, true);
631 rcu_sysidle_enter(rdtp, 1); 630 rcu_sysidle_enter(1);
632 local_irq_restore(flags); 631 local_irq_restore(flags);
633} 632}
634 633
@@ -639,15 +638,17 @@ void rcu_irq_exit(void)
639 * we really have exited idle, and must do the appropriate accounting. 638 * we really have exited idle, and must do the appropriate accounting.
640 * The caller must have disabled interrupts. 639 * The caller must have disabled interrupts.
641 */ 640 */
642static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval, 641static void rcu_eqs_exit_common(long long oldval, int user)
643 int user)
644{ 642{
643 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
644
645 rcu_dynticks_task_exit();
645 smp_mb__before_atomic(); /* Force ordering w/previous sojourn. */ 646 smp_mb__before_atomic(); /* Force ordering w/previous sojourn. */
646 atomic_inc(&rdtp->dynticks); 647 atomic_inc(&rdtp->dynticks);
647 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ 648 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
648 smp_mb__after_atomic(); /* See above. */ 649 smp_mb__after_atomic(); /* See above. */
649 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); 650 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
650 rcu_cleanup_after_idle(smp_processor_id()); 651 rcu_cleanup_after_idle();
651 trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting); 652 trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting);
652 if (!user && !is_idle_task(current)) { 653 if (!user && !is_idle_task(current)) {
653 struct task_struct *idle __maybe_unused = 654 struct task_struct *idle __maybe_unused =
@@ -678,7 +679,7 @@ static void rcu_eqs_exit(bool user)
678 rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE; 679 rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE;
679 } else { 680 } else {
680 rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; 681 rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
681 rcu_eqs_exit_common(rdtp, oldval, user); 682 rcu_eqs_exit_common(oldval, user);
682 } 683 }
683} 684}
684 685
@@ -699,7 +700,7 @@ void rcu_idle_exit(void)
699 700
700 local_irq_save(flags); 701 local_irq_save(flags);
701 rcu_eqs_exit(false); 702 rcu_eqs_exit(false);
702 rcu_sysidle_exit(this_cpu_ptr(&rcu_dynticks), 0); 703 rcu_sysidle_exit(0);
703 local_irq_restore(flags); 704 local_irq_restore(flags);
704} 705}
705EXPORT_SYMBOL_GPL(rcu_idle_exit); 706EXPORT_SYMBOL_GPL(rcu_idle_exit);
@@ -750,8 +751,8 @@ void rcu_irq_enter(void)
750 if (oldval) 751 if (oldval)
751 trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting); 752 trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting);
752 else 753 else
753 rcu_eqs_exit_common(rdtp, oldval, true); 754 rcu_eqs_exit_common(oldval, true);
754 rcu_sysidle_exit(rdtp, 1); 755 rcu_sysidle_exit(1);
755 local_irq_restore(flags); 756 local_irq_restore(flags);
756} 757}
757 758
@@ -819,7 +820,7 @@ bool notrace __rcu_is_watching(void)
819 */ 820 */
820bool notrace rcu_is_watching(void) 821bool notrace rcu_is_watching(void)
821{ 822{
822 int ret; 823 bool ret;
823 824
824 preempt_disable(); 825 preempt_disable();
825 ret = __rcu_is_watching(); 826 ret = __rcu_is_watching();
@@ -1647,7 +1648,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
1647 rnp->level, rnp->grplo, 1648 rnp->level, rnp->grplo,
1648 rnp->grphi, rnp->qsmask); 1649 rnp->grphi, rnp->qsmask);
1649 raw_spin_unlock_irq(&rnp->lock); 1650 raw_spin_unlock_irq(&rnp->lock);
1650 cond_resched(); 1651 cond_resched_rcu_qs();
1651 } 1652 }
1652 1653
1653 mutex_unlock(&rsp->onoff_mutex); 1654 mutex_unlock(&rsp->onoff_mutex);
@@ -1668,7 +1669,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
1668 if (fqs_state == RCU_SAVE_DYNTICK) { 1669 if (fqs_state == RCU_SAVE_DYNTICK) {
1669 /* Collect dyntick-idle snapshots. */ 1670 /* Collect dyntick-idle snapshots. */
1670 if (is_sysidle_rcu_state(rsp)) { 1671 if (is_sysidle_rcu_state(rsp)) {
1671 isidle = 1; 1672 isidle = true;
1672 maxj = jiffies - ULONG_MAX / 4; 1673 maxj = jiffies - ULONG_MAX / 4;
1673 } 1674 }
1674 force_qs_rnp(rsp, dyntick_save_progress_counter, 1675 force_qs_rnp(rsp, dyntick_save_progress_counter,
@@ -1677,14 +1678,15 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
1677 fqs_state = RCU_FORCE_QS; 1678 fqs_state = RCU_FORCE_QS;
1678 } else { 1679 } else {
1679 /* Handle dyntick-idle and offline CPUs. */ 1680 /* Handle dyntick-idle and offline CPUs. */
1680 isidle = 0; 1681 isidle = false;
1681 force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj); 1682 force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj);
1682 } 1683 }
1683 /* Clear flag to prevent immediate re-entry. */ 1684 /* Clear flag to prevent immediate re-entry. */
1684 if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { 1685 if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
1685 raw_spin_lock_irq(&rnp->lock); 1686 raw_spin_lock_irq(&rnp->lock);
1686 smp_mb__after_unlock_lock(); 1687 smp_mb__after_unlock_lock();
1687 ACCESS_ONCE(rsp->gp_flags) &= ~RCU_GP_FLAG_FQS; 1688 ACCESS_ONCE(rsp->gp_flags) =
1689 ACCESS_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS;
1688 raw_spin_unlock_irq(&rnp->lock); 1690 raw_spin_unlock_irq(&rnp->lock);
1689 } 1691 }
1690 return fqs_state; 1692 return fqs_state;
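
The gp_flags change replaces the compound ACCESS_ONCE(rsp->gp_flags) &= ~RCU_GP_FLAG_FQS with an explicit marked load followed by a marked store, presumably so every ACCESS_ONCE() use is a single plain load or store rather than an implicit read-modify-write. A standalone sketch of the two forms, using the usual volatile-cast definition of ACCESS_ONCE() (GNU C __typeof__) and invented flag values:

#include <stdio.h>

/* Usual volatile-cast definition: a marked access the compiler must
 * perform exactly as written. */
#define ACCESS_ONCE(x) (*(volatile __typeof__(x) *)&(x))

#define GP_FLAG_INIT 0x1UL               /* invented values, for illustration only */
#define GP_FLAG_FQS  0x2UL

static unsigned long gp_flags = GP_FLAG_INIT | GP_FLAG_FQS;

int main(void)
{
        /*
         * Compound form (the old code):
         *      ACCESS_ONCE(gp_flags) &= ~GP_FLAG_FQS;
         * is one source line but an implicit load *and* store through the
         * volatile lvalue.  The patched form keeps each marked access a
         * single load or a single store:
         */
        ACCESS_ONCE(gp_flags) = ACCESS_ONCE(gp_flags) & ~GP_FLAG_FQS;

        printf("gp_flags = %#lx\n", gp_flags);
        return 0;
}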
@@ -1736,7 +1738,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
1736 /* smp_mb() provided by prior unlock-lock pair. */ 1738 /* smp_mb() provided by prior unlock-lock pair. */
1737 nocb += rcu_future_gp_cleanup(rsp, rnp); 1739 nocb += rcu_future_gp_cleanup(rsp, rnp);
1738 raw_spin_unlock_irq(&rnp->lock); 1740 raw_spin_unlock_irq(&rnp->lock);
1739 cond_resched(); 1741 cond_resched_rcu_qs();
1740 } 1742 }
1741 rnp = rcu_get_root(rsp); 1743 rnp = rcu_get_root(rsp);
1742 raw_spin_lock_irq(&rnp->lock); 1744 raw_spin_lock_irq(&rnp->lock);
@@ -1785,8 +1787,8 @@ static int __noreturn rcu_gp_kthread(void *arg)
1785 /* Locking provides needed memory barrier. */ 1787 /* Locking provides needed memory barrier. */
1786 if (rcu_gp_init(rsp)) 1788 if (rcu_gp_init(rsp))
1787 break; 1789 break;
1788 cond_resched(); 1790 cond_resched_rcu_qs();
1789 flush_signals(current); 1791 WARN_ON(signal_pending(current));
1790 trace_rcu_grace_period(rsp->name, 1792 trace_rcu_grace_period(rsp->name,
1791 ACCESS_ONCE(rsp->gpnum), 1793 ACCESS_ONCE(rsp->gpnum),
1792 TPS("reqwaitsig")); 1794 TPS("reqwaitsig"));
@@ -1828,11 +1830,11 @@ static int __noreturn rcu_gp_kthread(void *arg)
1828 trace_rcu_grace_period(rsp->name, 1830 trace_rcu_grace_period(rsp->name,
1829 ACCESS_ONCE(rsp->gpnum), 1831 ACCESS_ONCE(rsp->gpnum),
1830 TPS("fqsend")); 1832 TPS("fqsend"));
1831 cond_resched(); 1833 cond_resched_rcu_qs();
1832 } else { 1834 } else {
1833 /* Deal with stray signal. */ 1835 /* Deal with stray signal. */
1834 cond_resched(); 1836 cond_resched_rcu_qs();
1835 flush_signals(current); 1837 WARN_ON(signal_pending(current));
1836 trace_rcu_grace_period(rsp->name, 1838 trace_rcu_grace_period(rsp->name,
1837 ACCESS_ONCE(rsp->gpnum), 1839 ACCESS_ONCE(rsp->gpnum),
1838 TPS("fqswaitsig")); 1840 TPS("fqswaitsig"));
@@ -1928,7 +1930,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
1928{ 1930{
1929 WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); 1931 WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
1930 raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); 1932 raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
1931 wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */ 1933 rcu_gp_kthread_wake(rsp);
1932} 1934}
1933 1935
1934/* 1936/*
@@ -2210,8 +2212,6 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
2210 /* Adjust any no-longer-needed kthreads. */ 2212 /* Adjust any no-longer-needed kthreads. */
2211 rcu_boost_kthread_setaffinity(rnp, -1); 2213 rcu_boost_kthread_setaffinity(rnp, -1);
2212 2214
2213 /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */
2214
2215 /* Exclude any attempts to start a new grace period. */ 2215 /* Exclude any attempts to start a new grace period. */
2216 mutex_lock(&rsp->onoff_mutex); 2216 mutex_lock(&rsp->onoff_mutex);
2217 raw_spin_lock_irqsave(&rsp->orphan_lock, flags); 2217 raw_spin_lock_irqsave(&rsp->orphan_lock, flags);
@@ -2375,7 +2375,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
2375 * invoked from the scheduling-clock interrupt. If rcu_pending returns 2375 * invoked from the scheduling-clock interrupt. If rcu_pending returns
2376 * false, there is no point in invoking rcu_check_callbacks(). 2376 * false, there is no point in invoking rcu_check_callbacks().
2377 */ 2377 */
2378void rcu_check_callbacks(int cpu, int user) 2378void rcu_check_callbacks(int user)
2379{ 2379{
2380 trace_rcu_utilization(TPS("Start scheduler-tick")); 2380 trace_rcu_utilization(TPS("Start scheduler-tick"));
2381 increment_cpu_stall_ticks(); 2381 increment_cpu_stall_ticks();
@@ -2393,8 +2393,8 @@ void rcu_check_callbacks(int cpu, int user)
2393 * at least not while the corresponding CPU is online. 2393 * at least not while the corresponding CPU is online.
2394 */ 2394 */
2395 2395
2396 rcu_sched_qs(cpu); 2396 rcu_sched_qs();
2397 rcu_bh_qs(cpu); 2397 rcu_bh_qs();
2398 2398
2399 } else if (!in_softirq()) { 2399 } else if (!in_softirq()) {
2400 2400
@@ -2405,11 +2405,13 @@ void rcu_check_callbacks(int cpu, int user)
2405 * critical section, so note it. 2405 * critical section, so note it.
2406 */ 2406 */
2407 2407
2408 rcu_bh_qs(cpu); 2408 rcu_bh_qs();
2409 } 2409 }
2410 rcu_preempt_check_callbacks(cpu); 2410 rcu_preempt_check_callbacks();
2411 if (rcu_pending(cpu)) 2411 if (rcu_pending())
2412 invoke_rcu_core(); 2412 invoke_rcu_core();
2413 if (user)
2414 rcu_note_voluntary_context_switch(current);
2413 trace_rcu_utilization(TPS("End scheduler-tick")); 2415 trace_rcu_utilization(TPS("End scheduler-tick"));
2414} 2416}
2415 2417
@@ -2432,7 +2434,7 @@ static void force_qs_rnp(struct rcu_state *rsp,
2432 struct rcu_node *rnp; 2434 struct rcu_node *rnp;
2433 2435
2434 rcu_for_each_leaf_node(rsp, rnp) { 2436 rcu_for_each_leaf_node(rsp, rnp) {
2435 cond_resched(); 2437 cond_resched_rcu_qs();
2436 mask = 0; 2438 mask = 0;
2437 raw_spin_lock_irqsave(&rnp->lock, flags); 2439 raw_spin_lock_irqsave(&rnp->lock, flags);
2438 smp_mb__after_unlock_lock(); 2440 smp_mb__after_unlock_lock();
@@ -2449,7 +2451,7 @@ static void force_qs_rnp(struct rcu_state *rsp,
2449 for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { 2451 for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
2450 if ((rnp->qsmask & bit) != 0) { 2452 if ((rnp->qsmask & bit) != 0) {
2451 if ((rnp->qsmaskinit & bit) != 0) 2453 if ((rnp->qsmaskinit & bit) != 0)
2452 *isidle = 0; 2454 *isidle = false;
2453 if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) 2455 if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj))
2454 mask |= bit; 2456 mask |= bit;
2455 } 2457 }
@@ -2505,9 +2507,10 @@ static void force_quiescent_state(struct rcu_state *rsp)
2505 raw_spin_unlock_irqrestore(&rnp_old->lock, flags); 2507 raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
2506 return; /* Someone beat us to it. */ 2508 return; /* Someone beat us to it. */
2507 } 2509 }
2508 ACCESS_ONCE(rsp->gp_flags) |= RCU_GP_FLAG_FQS; 2510 ACCESS_ONCE(rsp->gp_flags) =
2511 ACCESS_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS;
2509 raw_spin_unlock_irqrestore(&rnp_old->lock, flags); 2512 raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
2510 wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */ 2513 rcu_gp_kthread_wake(rsp);
2511} 2514}
2512 2515
2513/* 2516/*
@@ -2925,11 +2928,6 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
2925 * restructure your code to batch your updates, and then use a single 2928 * restructure your code to batch your updates, and then use a single
2926 * synchronize_sched() instead. 2929 * synchronize_sched() instead.
2927 * 2930 *
2928 * Note that it is illegal to call this function while holding any lock
2929 * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal
2930 * to call this function from a CPU-hotplug notifier. Failing to observe
2931 * these restriction will result in deadlock.
2932 *
2933 * This implementation can be thought of as an application of ticket 2931 * This implementation can be thought of as an application of ticket
2934 * locking to RCU, with sync_sched_expedited_started and 2932 * locking to RCU, with sync_sched_expedited_started and
2935 * sync_sched_expedited_done taking on the roles of the halves 2933 * sync_sched_expedited_done taking on the roles of the halves
@@ -2953,6 +2951,9 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
2953 */ 2951 */
2954void synchronize_sched_expedited(void) 2952void synchronize_sched_expedited(void)
2955{ 2953{
2954 cpumask_var_t cm;
2955 bool cma = false;
2956 int cpu;
2956 long firstsnap, s, snap; 2957 long firstsnap, s, snap;
2957 int trycount = 0; 2958 int trycount = 0;
2958 struct rcu_state *rsp = &rcu_sched_state; 2959 struct rcu_state *rsp = &rcu_sched_state;
@@ -2979,14 +2980,34 @@ void synchronize_sched_expedited(void)
2979 */ 2980 */
2980 snap = atomic_long_inc_return(&rsp->expedited_start); 2981 snap = atomic_long_inc_return(&rsp->expedited_start);
2981 firstsnap = snap; 2982 firstsnap = snap;
2982 get_online_cpus(); 2983 if (!try_get_online_cpus()) {
2984 /* CPU hotplug operation in flight, fall back to normal GP. */
2985 wait_rcu_gp(call_rcu_sched);
2986 atomic_long_inc(&rsp->expedited_normal);
2987 return;
2988 }
2983 WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); 2989 WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
2984 2990
2991 /* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */
2992 cma = zalloc_cpumask_var(&cm, GFP_KERNEL);
2993 if (cma) {
2994 cpumask_copy(cm, cpu_online_mask);
2995 cpumask_clear_cpu(raw_smp_processor_id(), cm);
2996 for_each_cpu(cpu, cm) {
2997 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
2998
2999 if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
3000 cpumask_clear_cpu(cpu, cm);
3001 }
3002 if (cpumask_weight(cm) == 0)
3003 goto all_cpus_idle;
3004 }
3005
2985 /* 3006 /*
2986 * Each pass through the following loop attempts to force a 3007 * Each pass through the following loop attempts to force a
2987 * context switch on each CPU. 3008 * context switch on each CPU.
2988 */ 3009 */
2989 while (try_stop_cpus(cpu_online_mask, 3010 while (try_stop_cpus(cma ? cm : cpu_online_mask,
2990 synchronize_sched_expedited_cpu_stop, 3011 synchronize_sched_expedited_cpu_stop,
2991 NULL) == -EAGAIN) { 3012 NULL) == -EAGAIN) {
2992 put_online_cpus(); 3013 put_online_cpus();
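
The synchronize_sched_expedited() hunk above does two things: it backs off to a normal grace period when try_get_online_cpus() fails, and, when a cpumask can be allocated, it drops the current CPU and every CPU whose dynticks counter is even (idle) before asking try_stop_cpus() to disturb anyone. A toy sketch of that mask-building step; the CPU count, the sample dynticks values, and the bitmask type are all invented for illustration:

#include <stdio.h>

#define NCPUS 8

/* Toy stand-ins: an even dynticks counter means the CPU is idle (in an
 * extended quiescent state), the same parity test the hunk uses. */
static int dynticks[NCPUS] = { 1, 2, 4, 3, 6, 1, 8, 5 };

int main(void)
{
        unsigned int cm = 0;          /* toy cpumask: one bit per CPU   */
        int self = 0;                 /* pretend we run on CPU 0        */
        int cpu;

        for (cpu = 0; cpu < NCPUS; cpu++)
                cm |= 1u << cpu;      /* start from "all online"        */
        cm &= ~(1u << self);          /* the CPU we run on is quiescent */
        for (cpu = 0; cpu < NCPUS; cpu++)
                if (!(dynticks[cpu] & 0x1))
                        cm &= ~(1u << cpu);   /* idle CPUs need no stop */

        if (cm == 0) {
                puts("all other CPUs idle: expedited grace period is free");
                return 0;
        }
        printf("would force a context switch on mask %#x\n", cm);
        return 0;
}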
@@ -2998,6 +3019,7 @@ void synchronize_sched_expedited(void)
2998 /* ensure test happens before caller kfree */ 3019 /* ensure test happens before caller kfree */
2999 smp_mb__before_atomic(); /* ^^^ */ 3020 smp_mb__before_atomic(); /* ^^^ */
3000 atomic_long_inc(&rsp->expedited_workdone1); 3021 atomic_long_inc(&rsp->expedited_workdone1);
3022 free_cpumask_var(cm);
3001 return; 3023 return;
3002 } 3024 }
3003 3025
@@ -3007,6 +3029,7 @@ void synchronize_sched_expedited(void)
3007 } else { 3029 } else {
3008 wait_rcu_gp(call_rcu_sched); 3030 wait_rcu_gp(call_rcu_sched);
3009 atomic_long_inc(&rsp->expedited_normal); 3031 atomic_long_inc(&rsp->expedited_normal);
3032 free_cpumask_var(cm);
3010 return; 3033 return;
3011 } 3034 }
3012 3035
@@ -3016,6 +3039,7 @@ void synchronize_sched_expedited(void)
3016 /* ensure test happens before caller kfree */ 3039 /* ensure test happens before caller kfree */
3017 smp_mb__before_atomic(); /* ^^^ */ 3040 smp_mb__before_atomic(); /* ^^^ */
3018 atomic_long_inc(&rsp->expedited_workdone2); 3041 atomic_long_inc(&rsp->expedited_workdone2);
3042 free_cpumask_var(cm);
3019 return; 3043 return;
3020 } 3044 }
3021 3045
@@ -3026,12 +3050,21 @@ void synchronize_sched_expedited(void)
3026 * and they started after our first try, so their grace 3050 * and they started after our first try, so their grace
3027 * period works for us. 3051 * period works for us.
3028 */ 3052 */
3029 get_online_cpus(); 3053 if (!try_get_online_cpus()) {
3054 /* CPU hotplug operation in flight, use normal GP. */
3055 wait_rcu_gp(call_rcu_sched);
3056 atomic_long_inc(&rsp->expedited_normal);
3057 free_cpumask_var(cm);
3058 return;
3059 }
3030 snap = atomic_long_read(&rsp->expedited_start); 3060 snap = atomic_long_read(&rsp->expedited_start);
3031 smp_mb(); /* ensure read is before try_stop_cpus(). */ 3061 smp_mb(); /* ensure read is before try_stop_cpus(). */
3032 } 3062 }
3033 atomic_long_inc(&rsp->expedited_stoppedcpus); 3063 atomic_long_inc(&rsp->expedited_stoppedcpus);
3034 3064
3065all_cpus_idle:
3066 free_cpumask_var(cm);
3067
3035 /* 3068 /*
3036 * Everyone up to our most recent fetch is covered by our grace 3069 * Everyone up to our most recent fetch is covered by our grace
3037 * period. Update the counter, but only if our work is still 3070 * period. Update the counter, but only if our work is still
@@ -3123,12 +3156,12 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
3123 * by the current CPU, returning 1 if so. This function is part of the 3156 * by the current CPU, returning 1 if so. This function is part of the
3124 * RCU implementation; it is -not- an exported member of the RCU API. 3157 * RCU implementation; it is -not- an exported member of the RCU API.
3125 */ 3158 */
3126static int rcu_pending(int cpu) 3159static int rcu_pending(void)
3127{ 3160{
3128 struct rcu_state *rsp; 3161 struct rcu_state *rsp;
3129 3162
3130 for_each_rcu_flavor(rsp) 3163 for_each_rcu_flavor(rsp)
3131 if (__rcu_pending(rsp, per_cpu_ptr(rsp->rda, cpu))) 3164 if (__rcu_pending(rsp, this_cpu_ptr(rsp->rda)))
3132 return 1; 3165 return 1;
3133 return 0; 3166 return 0;
3134} 3167}
@@ -3138,7 +3171,7 @@ static int rcu_pending(int cpu)
3138 * non-NULL, store an indication of whether all callbacks are lazy. 3171 * non-NULL, store an indication of whether all callbacks are lazy.
3139 * (If there are no callbacks, all of them are deemed to be lazy.) 3172 * (If there are no callbacks, all of them are deemed to be lazy.)
3140 */ 3173 */
3141static int __maybe_unused rcu_cpu_has_callbacks(int cpu, bool *all_lazy) 3174static int __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy)
3142{ 3175{
3143 bool al = true; 3176 bool al = true;
3144 bool hc = false; 3177 bool hc = false;
@@ -3146,7 +3179,7 @@ static int __maybe_unused rcu_cpu_has_callbacks(int cpu, bool *all_lazy)
3146 struct rcu_state *rsp; 3179 struct rcu_state *rsp;
3147 3180
3148 for_each_rcu_flavor(rsp) { 3181 for_each_rcu_flavor(rsp) {
3149 rdp = per_cpu_ptr(rsp->rda, cpu); 3182 rdp = this_cpu_ptr(rsp->rda);
3150 if (!rdp->nxtlist) 3183 if (!rdp->nxtlist)
3151 continue; 3184 continue;
3152 hc = true; 3185 hc = true;
@@ -3279,11 +3312,16 @@ static void _rcu_barrier(struct rcu_state *rsp)
3279 continue; 3312 continue;
3280 rdp = per_cpu_ptr(rsp->rda, cpu); 3313 rdp = per_cpu_ptr(rsp->rda, cpu);
3281 if (rcu_is_nocb_cpu(cpu)) { 3314 if (rcu_is_nocb_cpu(cpu)) {
3282 _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, 3315 if (!rcu_nocb_cpu_needs_barrier(rsp, cpu)) {
3283 rsp->n_barrier_done); 3316 _rcu_barrier_trace(rsp, "OfflineNoCB", cpu,
3284 atomic_inc(&rsp->barrier_cpu_count); 3317 rsp->n_barrier_done);
3285 __call_rcu(&rdp->barrier_head, rcu_barrier_callback, 3318 } else {
3286 rsp, cpu, 0); 3319 _rcu_barrier_trace(rsp, "OnlineNoCB", cpu,
3320 rsp->n_barrier_done);
3321 atomic_inc(&rsp->barrier_cpu_count);
3322 __call_rcu(&rdp->barrier_head,
3323 rcu_barrier_callback, rsp, cpu, 0);
3324 }
3287 } else if (ACCESS_ONCE(rdp->qlen)) { 3325 } else if (ACCESS_ONCE(rdp->qlen)) {
3288 _rcu_barrier_trace(rsp, "OnlineQ", cpu, 3326 _rcu_barrier_trace(rsp, "OnlineQ", cpu,
3289 rsp->n_barrier_done); 3327 rsp->n_barrier_done);
@@ -3442,6 +3480,7 @@ static int rcu_cpu_notify(struct notifier_block *self,
3442 case CPU_UP_PREPARE_FROZEN: 3480 case CPU_UP_PREPARE_FROZEN:
3443 rcu_prepare_cpu(cpu); 3481 rcu_prepare_cpu(cpu);
3444 rcu_prepare_kthreads(cpu); 3482 rcu_prepare_kthreads(cpu);
3483 rcu_spawn_all_nocb_kthreads(cpu);
3445 break; 3484 break;
3446 case CPU_ONLINE: 3485 case CPU_ONLINE:
3447 case CPU_DOWN_FAILED: 3486 case CPU_DOWN_FAILED:
@@ -3459,8 +3498,10 @@ static int rcu_cpu_notify(struct notifier_block *self,
3459 case CPU_DEAD_FROZEN: 3498 case CPU_DEAD_FROZEN:
3460 case CPU_UP_CANCELED: 3499 case CPU_UP_CANCELED:
3461 case CPU_UP_CANCELED_FROZEN: 3500 case CPU_UP_CANCELED_FROZEN:
3462 for_each_rcu_flavor(rsp) 3501 for_each_rcu_flavor(rsp) {
3463 rcu_cleanup_dead_cpu(cpu, rsp); 3502 rcu_cleanup_dead_cpu(cpu, rsp);
3503 do_nocb_deferred_wakeup(per_cpu_ptr(rsp->rda, cpu));
3504 }
3464 break; 3505 break;
3465 default: 3506 default:
3466 break; 3507 break;
@@ -3489,7 +3530,7 @@ static int rcu_pm_notify(struct notifier_block *self,
3489} 3530}
3490 3531
3491/* 3532/*
3492 * Spawn the kthread that handles this RCU flavor's grace periods. 3533 * Spawn the kthreads that handle each RCU flavor's grace periods.
3493 */ 3534 */
3494static int __init rcu_spawn_gp_kthread(void) 3535static int __init rcu_spawn_gp_kthread(void)
3495{ 3536{
@@ -3498,6 +3539,7 @@ static int __init rcu_spawn_gp_kthread(void)
3498 struct rcu_state *rsp; 3539 struct rcu_state *rsp;
3499 struct task_struct *t; 3540 struct task_struct *t;
3500 3541
3542 rcu_scheduler_fully_active = 1;
3501 for_each_rcu_flavor(rsp) { 3543 for_each_rcu_flavor(rsp) {
3502 t = kthread_run(rcu_gp_kthread, rsp, "%s", rsp->name); 3544 t = kthread_run(rcu_gp_kthread, rsp, "%s", rsp->name);
3503 BUG_ON(IS_ERR(t)); 3545 BUG_ON(IS_ERR(t));
@@ -3505,8 +3547,9 @@ static int __init rcu_spawn_gp_kthread(void)
3505 raw_spin_lock_irqsave(&rnp->lock, flags); 3547 raw_spin_lock_irqsave(&rnp->lock, flags);
3506 rsp->gp_kthread = t; 3548 rsp->gp_kthread = t;
3507 raw_spin_unlock_irqrestore(&rnp->lock, flags); 3549 raw_spin_unlock_irqrestore(&rnp->lock, flags);
3508 rcu_spawn_nocb_kthreads(rsp);
3509 } 3550 }
3551 rcu_spawn_nocb_kthreads();
3552 rcu_spawn_boost_kthreads();
3510 return 0; 3553 return 0;
3511} 3554}
3512early_initcall(rcu_spawn_gp_kthread); 3555early_initcall(rcu_spawn_gp_kthread);
@@ -3738,6 +3781,8 @@ void __init rcu_init(void)
3738 pm_notifier(rcu_pm_notify, 0); 3781 pm_notifier(rcu_pm_notify, 0);
3739 for_each_online_cpu(cpu) 3782 for_each_online_cpu(cpu)
3740 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); 3783 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
3784
3785 rcu_early_boot_tests();
3741} 3786}
3742 3787
3743#include "tree_plugin.h" 3788#include "tree_plugin.h"
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 6a86eb7bac45..8e7b1843896e 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -139,7 +139,7 @@ struct rcu_node {
139 unsigned long expmask; /* Groups that have ->blkd_tasks */ 139 unsigned long expmask; /* Groups that have ->blkd_tasks */
140 /* elements that need to drain to allow the */ 140 /* elements that need to drain to allow the */
141 /* current expedited grace period to */ 141 /* current expedited grace period to */
142 /* complete (only for TREE_PREEMPT_RCU). */ 142 /* complete (only for PREEMPT_RCU). */
143 unsigned long qsmaskinit; 143 unsigned long qsmaskinit;
144 /* Per-GP initial value for qsmask & expmask. */ 144 /* Per-GP initial value for qsmask & expmask. */
145 unsigned long grpmask; /* Mask to apply to parent qsmask. */ 145 unsigned long grpmask; /* Mask to apply to parent qsmask. */
@@ -350,7 +350,7 @@ struct rcu_data {
350 int nocb_p_count_lazy; /* (approximate). */ 350 int nocb_p_count_lazy; /* (approximate). */
351 wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */ 351 wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */
352 struct task_struct *nocb_kthread; 352 struct task_struct *nocb_kthread;
353 bool nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ 353 int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */
354 354
355 /* The following fields are used by the leader, hence own cacheline. */ 355 /* The following fields are used by the leader, hence own cacheline. */
356 struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp; 356 struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp;
@@ -383,6 +383,11 @@ struct rcu_data {
383#define RCU_FORCE_QS 3 /* Need to force quiescent state. */ 383#define RCU_FORCE_QS 3 /* Need to force quiescent state. */
384#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK 384#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK
385 385
386/* Values for nocb_defer_wakeup field in struct rcu_data. */
387#define RCU_NOGP_WAKE_NOT 0
388#define RCU_NOGP_WAKE 1
389#define RCU_NOGP_WAKE_FORCE 2
390
386#define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500)) 391#define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500))
387 /* For jiffies_till_first_fqs and */ 392 /* For jiffies_till_first_fqs and */
388 /* and jiffies_till_next_fqs. */ 393 /* and jiffies_till_next_fqs. */
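
Together with the bool-to-int change to nocb_defer_wakeup earlier in this header, the three RCU_NOGP_WAKE_* values turn the deferred-wakeup flag into a graded request, so a stronger wakeup can overwrite a weaker pending one. A minimal sketch of that idea; the enum names and the single-threaded bookkeeping are illustrative, and the real state machine lives in tree_plugin.h:

#include <stdio.h>

enum defer_wake {
        WAKE_NOT = 0,      /* nothing pending                  */
        WAKE,              /* ordinary deferred wakeup         */
        WAKE_FORCE,        /* stronger wakeup, e.g. queue full */
};

static enum defer_wake deferred = WAKE_NOT;

static void defer_wakeup(enum defer_wake level)
{
        if (level > deferred)        /* keep only the strongest pending request */
                deferred = level;
}

static void do_deferred_wakeup(void)
{
        if (deferred == WAKE_NOT)
                return;
        printf("waking nocb kthread (level %d)\n", deferred);
        deferred = WAKE_NOT;
}

int main(void)
{
        defer_wakeup(WAKE);
        defer_wakeup(WAKE_FORCE);    /* upgrades the pending request */
        do_deferred_wakeup();
        return 0;
}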
@@ -525,10 +530,10 @@ DECLARE_PER_CPU(struct rcu_data, rcu_sched_data);
525extern struct rcu_state rcu_bh_state; 530extern struct rcu_state rcu_bh_state;
526DECLARE_PER_CPU(struct rcu_data, rcu_bh_data); 531DECLARE_PER_CPU(struct rcu_data, rcu_bh_data);
527 532
528#ifdef CONFIG_TREE_PREEMPT_RCU 533#ifdef CONFIG_PREEMPT_RCU
529extern struct rcu_state rcu_preempt_state; 534extern struct rcu_state rcu_preempt_state;
530DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); 535DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
531#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 536#endif /* #ifdef CONFIG_PREEMPT_RCU */
532 537
533#ifdef CONFIG_RCU_BOOST 538#ifdef CONFIG_RCU_BOOST
534DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); 539DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
@@ -542,7 +547,7 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work);
542/* Forward declarations for rcutree_plugin.h */ 547/* Forward declarations for rcutree_plugin.h */
543static void rcu_bootup_announce(void); 548static void rcu_bootup_announce(void);
544long rcu_batches_completed(void); 549long rcu_batches_completed(void);
545static void rcu_preempt_note_context_switch(int cpu); 550static void rcu_preempt_note_context_switch(void);
546static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); 551static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
547#ifdef CONFIG_HOTPLUG_CPU 552#ifdef CONFIG_HOTPLUG_CPU
548static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, 553static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
@@ -556,12 +561,12 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
556 struct rcu_node *rnp, 561 struct rcu_node *rnp,
557 struct rcu_data *rdp); 562 struct rcu_data *rdp);
558#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 563#endif /* #ifdef CONFIG_HOTPLUG_CPU */
559static void rcu_preempt_check_callbacks(int cpu); 564static void rcu_preempt_check_callbacks(void);
560void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); 565void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
561#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) 566#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PREEMPT_RCU)
562static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, 567static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
563 bool wake); 568 bool wake);
564#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */ 569#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PREEMPT_RCU) */
565static void __init __rcu_init_preempt(void); 570static void __init __rcu_init_preempt(void);
566static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); 571static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
567static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); 572static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
@@ -572,15 +577,17 @@ static void rcu_preempt_do_callbacks(void);
572static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, 577static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
573 struct rcu_node *rnp); 578 struct rcu_node *rnp);
574#endif /* #ifdef CONFIG_RCU_BOOST */ 579#endif /* #ifdef CONFIG_RCU_BOOST */
580static void __init rcu_spawn_boost_kthreads(void);
575static void rcu_prepare_kthreads(int cpu); 581static void rcu_prepare_kthreads(int cpu);
576static void rcu_cleanup_after_idle(int cpu); 582static void rcu_cleanup_after_idle(void);
577static void rcu_prepare_for_idle(int cpu); 583static void rcu_prepare_for_idle(void);
578static void rcu_idle_count_callbacks_posted(void); 584static void rcu_idle_count_callbacks_posted(void);
579static void print_cpu_stall_info_begin(void); 585static void print_cpu_stall_info_begin(void);
580static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); 586static void print_cpu_stall_info(struct rcu_state *rsp, int cpu);
581static void print_cpu_stall_info_end(void); 587static void print_cpu_stall_info_end(void);
582static void zero_cpu_stall_ticks(struct rcu_data *rdp); 588static void zero_cpu_stall_ticks(struct rcu_data *rdp);
583static void increment_cpu_stall_ticks(void); 589static void increment_cpu_stall_ticks(void);
590static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu);
584static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); 591static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq);
585static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp); 592static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp);
586static void rcu_init_one_nocb(struct rcu_node *rnp); 593static void rcu_init_one_nocb(struct rcu_node *rnp);
@@ -589,14 +596,18 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
589static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, 596static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
590 struct rcu_data *rdp, 597 struct rcu_data *rdp,
591 unsigned long flags); 598 unsigned long flags);
592static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp); 599static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp);
593static void do_nocb_deferred_wakeup(struct rcu_data *rdp); 600static void do_nocb_deferred_wakeup(struct rcu_data *rdp);
594static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); 601static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
595static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); 602static void rcu_spawn_all_nocb_kthreads(int cpu);
603static void __init rcu_spawn_nocb_kthreads(void);
604#ifdef CONFIG_RCU_NOCB_CPU
605static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp);
606#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
596static void __maybe_unused rcu_kick_nohz_cpu(int cpu); 607static void __maybe_unused rcu_kick_nohz_cpu(int cpu);
597static bool init_nocb_callback_list(struct rcu_data *rdp); 608static bool init_nocb_callback_list(struct rcu_data *rdp);
598static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq); 609static void rcu_sysidle_enter(int irq);
599static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq); 610static void rcu_sysidle_exit(int irq);
600static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, 611static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
601 unsigned long *maxj); 612 unsigned long *maxj);
602static bool is_sysidle_rcu_state(struct rcu_state *rsp); 613static bool is_sysidle_rcu_state(struct rcu_state *rsp);
@@ -605,6 +616,8 @@ static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
605static void rcu_bind_gp_kthread(void); 616static void rcu_bind_gp_kthread(void);
606static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp); 617static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp);
607static bool rcu_nohz_full_cpu(struct rcu_state *rsp); 618static bool rcu_nohz_full_cpu(struct rcu_state *rsp);
619static void rcu_dynticks_task_enter(void);
620static void rcu_dynticks_task_exit(void);
608 621
609#endif /* #ifndef RCU_TREE_NONCORE */ 622#endif /* #ifndef RCU_TREE_NONCORE */
610 623
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index a7997e272564..3ec85cb5d544 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -30,14 +30,24 @@
30#include <linux/smpboot.h> 30#include <linux/smpboot.h>
31#include "../time/tick-internal.h" 31#include "../time/tick-internal.h"
32 32
33#define RCU_KTHREAD_PRIO 1
34
35#ifdef CONFIG_RCU_BOOST 33#ifdef CONFIG_RCU_BOOST
34
36#include "../locking/rtmutex_common.h" 35#include "../locking/rtmutex_common.h"
37#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO 36
38#else 37/* rcuc/rcub kthread realtime priority */
39#define RCU_BOOST_PRIO RCU_KTHREAD_PRIO 38static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO;
40#endif 39module_param(kthread_prio, int, 0644);
40
41/*
42 * Control variables for per-CPU and per-rcu_node kthreads. These
43 * handle all flavors of RCU.
44 */
45static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
46DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
47DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
48DEFINE_PER_CPU(char, rcu_cpu_has_work);
49
50#endif /* #ifdef CONFIG_RCU_BOOST */
41 51
42#ifdef CONFIG_RCU_NOCB_CPU 52#ifdef CONFIG_RCU_NOCB_CPU
43static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */ 53static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
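
This tree_plugin.h hunk retires the fixed RCU_KTHREAD_PRIO/RCU_BOOST_PRIO defines in favour of a single kthread_prio module parameter whose default comes from CONFIG_RCU_KTHREAD_PRIO. There is no user-space equivalent of module_param(), so the sketch below fakes the build-time-default-plus-runtime-override split with a macro and an environment variable, then applies the result with pthread_setschedparam(); KTHREAD_PRIO_DEFAULT and the KTHREAD_PRIO variable are invented names:

#include <pthread.h>
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>

#ifndef KTHREAD_PRIO_DEFAULT
#define KTHREAD_PRIO_DEFAULT 1           /* stand-in for CONFIG_RCU_KTHREAD_PRIO */
#endif

/* An environment variable plays the role of the module parameter here. */
static int kthread_prio(void)
{
        const char *s = getenv("KTHREAD_PRIO");

        return s ? atoi(s) : KTHREAD_PRIO_DEFAULT;
}

int main(void)
{
        struct sched_param sp = { .sched_priority = kthread_prio() };
        int err;

        /* SCHED_FIFO normally needs privileges; failure is fine here. */
        err = pthread_setschedparam(pthread_self(), SCHED_FIFO, &sp);
        if (err)
                fprintf(stderr, "pthread_setschedparam: error %d\n", err);
        else
                printf("running SCHED_FIFO at priority %d\n", sp.sched_priority);
        return 0;
}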
@@ -72,9 +82,6 @@ static void __init rcu_bootup_announce_oddness(void)
72#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE 82#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE
73 pr_info("\tRCU torture testing starts during boot.\n"); 83 pr_info("\tRCU torture testing starts during boot.\n");
74#endif 84#endif
75#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE)
76 pr_info("\tDump stacks of tasks blocking RCU-preempt GP.\n");
77#endif
78#if defined(CONFIG_RCU_CPU_STALL_INFO) 85#if defined(CONFIG_RCU_CPU_STALL_INFO)
79 pr_info("\tAdditional per-CPU info printed with stalls.\n"); 86 pr_info("\tAdditional per-CPU info printed with stalls.\n");
80#endif 87#endif
@@ -85,36 +92,12 @@ static void __init rcu_bootup_announce_oddness(void)
85 pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); 92 pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf);
86 if (nr_cpu_ids != NR_CPUS) 93 if (nr_cpu_ids != NR_CPUS)
87 pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); 94 pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
88#ifdef CONFIG_RCU_NOCB_CPU 95#ifdef CONFIG_RCU_BOOST
89#ifndef CONFIG_RCU_NOCB_CPU_NONE 96 pr_info("\tRCU kthread priority: %d.\n", kthread_prio);
90 if (!have_rcu_nocb_mask) { 97#endif
91 zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL);
92 have_rcu_nocb_mask = true;
93 }
94#ifdef CONFIG_RCU_NOCB_CPU_ZERO
95 pr_info("\tOffload RCU callbacks from CPU 0\n");
96 cpumask_set_cpu(0, rcu_nocb_mask);
97#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */
98#ifdef CONFIG_RCU_NOCB_CPU_ALL
99 pr_info("\tOffload RCU callbacks from all CPUs\n");
100 cpumask_copy(rcu_nocb_mask, cpu_possible_mask);
101#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */
102#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */
103 if (have_rcu_nocb_mask) {
104 if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) {
105 pr_info("\tNote: kernel parameter 'rcu_nocbs=' contains nonexistent CPUs.\n");
106 cpumask_and(rcu_nocb_mask, cpu_possible_mask,
107 rcu_nocb_mask);
108 }
109 cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask);
110 pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf);
111 if (rcu_nocb_poll)
112 pr_info("\tPoll for callbacks from no-CBs CPUs.\n");
113 }
114#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
115} 98}
116 99
117#ifdef CONFIG_TREE_PREEMPT_RCU 100#ifdef CONFIG_PREEMPT_RCU
118 101
119RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu); 102RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
120static struct rcu_state *rcu_state_p = &rcu_preempt_state; 103static struct rcu_state *rcu_state_p = &rcu_preempt_state;
@@ -134,7 +117,7 @@ static void __init rcu_bootup_announce(void)
134 * Return the number of RCU-preempt batches processed thus far 117 * Return the number of RCU-preempt batches processed thus far
135 * for debug and statistics. 118 * for debug and statistics.
136 */ 119 */
137long rcu_batches_completed_preempt(void) 120static long rcu_batches_completed_preempt(void)
138{ 121{
139 return rcu_preempt_state.completed; 122 return rcu_preempt_state.completed;
140} 123}
@@ -155,18 +138,19 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed);
155 * not in a quiescent state. There might be any number of tasks blocked 138 * not in a quiescent state. There might be any number of tasks blocked
156 * while in an RCU read-side critical section. 139 * while in an RCU read-side critical section.
157 * 140 *
158 * Unlike the other rcu_*_qs() functions, callers to this function 141 * As with the other rcu_*_qs() functions, callers to this function
159 * must disable irqs in order to protect the assignment to 142 * must disable preemption.
160 * ->rcu_read_unlock_special. 143 */
161 */ 144static void rcu_preempt_qs(void)
162static void rcu_preempt_qs(int cpu) 145{
163{ 146 if (!__this_cpu_read(rcu_preempt_data.passed_quiesce)) {
164 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); 147 trace_rcu_grace_period(TPS("rcu_preempt"),
165 148 __this_cpu_read(rcu_preempt_data.gpnum),
166 if (rdp->passed_quiesce == 0) 149 TPS("cpuqs"));
167 trace_rcu_grace_period(TPS("rcu_preempt"), rdp->gpnum, TPS("cpuqs")); 150 __this_cpu_write(rcu_preempt_data.passed_quiesce, 1);
168 rdp->passed_quiesce = 1; 151 barrier(); /* Coordinate with rcu_preempt_check_callbacks(). */
169 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; 152 current->rcu_read_unlock_special.b.need_qs = false;
153 }
170} 154}
171 155
172/* 156/*
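
The rewritten rcu_preempt_qs() clears t->rcu_read_unlock_special.b.need_qs through one member of a union, while other paths (rcu_read_unlock_special() below) test the whole thing at once via .s. A small sketch of that union-of-flags-and-word trick; the member types here are guesses, not the kernel's actual union rcu_special layout:

#include <stdbool.h>
#include <stdio.h>

union special {
        struct {
                bool blocked;
                bool need_qs;
        } b;                      /* set or clear one reason at a time */
        unsigned short s;         /* test "any reason pending" at once */
};

int main(void)
{
        union special sp = { .s = 0 };

        sp.b.need_qs = true;              /* core asks for a quiescent state */
        if (sp.s)                         /* cheap "anything special?" check */
                puts("unlock must take the slow path");
        sp.b.need_qs = false;             /* rcu_preempt_qs() clears its bit */
        if (!sp.s)
                puts("unlock can stay on the fast path");
        return 0;
}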
@@ -182,7 +166,7 @@ static void rcu_preempt_qs(int cpu)
182 * 166 *
183 * Caller must disable preemption. 167 * Caller must disable preemption.
184 */ 168 */
185static void rcu_preempt_note_context_switch(int cpu) 169static void rcu_preempt_note_context_switch(void)
186{ 170{
187 struct task_struct *t = current; 171 struct task_struct *t = current;
188 unsigned long flags; 172 unsigned long flags;
@@ -190,14 +174,14 @@ static void rcu_preempt_note_context_switch(int cpu)
190 struct rcu_node *rnp; 174 struct rcu_node *rnp;
191 175
192 if (t->rcu_read_lock_nesting > 0 && 176 if (t->rcu_read_lock_nesting > 0 &&
193 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { 177 !t->rcu_read_unlock_special.b.blocked) {
194 178
195 /* Possibly blocking in an RCU read-side critical section. */ 179 /* Possibly blocking in an RCU read-side critical section. */
196 rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); 180 rdp = this_cpu_ptr(rcu_preempt_state.rda);
197 rnp = rdp->mynode; 181 rnp = rdp->mynode;
198 raw_spin_lock_irqsave(&rnp->lock, flags); 182 raw_spin_lock_irqsave(&rnp->lock, flags);
199 smp_mb__after_unlock_lock(); 183 smp_mb__after_unlock_lock();
200 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; 184 t->rcu_read_unlock_special.b.blocked = true;
201 t->rcu_blocked_node = rnp; 185 t->rcu_blocked_node = rnp;
202 186
203 /* 187 /*
@@ -239,7 +223,7 @@ static void rcu_preempt_note_context_switch(int cpu)
239 : rnp->gpnum + 1); 223 : rnp->gpnum + 1);
240 raw_spin_unlock_irqrestore(&rnp->lock, flags); 224 raw_spin_unlock_irqrestore(&rnp->lock, flags);
241 } else if (t->rcu_read_lock_nesting < 0 && 225 } else if (t->rcu_read_lock_nesting < 0 &&
242 t->rcu_read_unlock_special) { 226 t->rcu_read_unlock_special.s) {
243 227
244 /* 228 /*
245 * Complete exit from RCU read-side critical section on 229 * Complete exit from RCU read-side critical section on
@@ -257,9 +241,7 @@ static void rcu_preempt_note_context_switch(int cpu)
257 * grace period, then the fact that the task has been enqueued 241 * grace period, then the fact that the task has been enqueued
258 * means that we continue to block the current grace period. 242 * means that we continue to block the current grace period.
259 */ 243 */
260 local_irq_save(flags); 244 rcu_preempt_qs();
261 rcu_preempt_qs(cpu);
262 local_irq_restore(flags);
263} 245}
264 246
265/* 247/*
@@ -340,7 +322,7 @@ void rcu_read_unlock_special(struct task_struct *t)
340 bool drop_boost_mutex = false; 322 bool drop_boost_mutex = false;
341#endif /* #ifdef CONFIG_RCU_BOOST */ 323#endif /* #ifdef CONFIG_RCU_BOOST */
342 struct rcu_node *rnp; 324 struct rcu_node *rnp;
343 int special; 325 union rcu_special special;
344 326
345 /* NMI handlers cannot block and cannot safely manipulate state. */ 327 /* NMI handlers cannot block and cannot safely manipulate state. */
346 if (in_nmi()) 328 if (in_nmi())
@@ -350,12 +332,13 @@ void rcu_read_unlock_special(struct task_struct *t)
350 332
351 /* 333 /*
352 * If RCU core is waiting for this CPU to exit critical section, 334 * If RCU core is waiting for this CPU to exit critical section,
353 * let it know that we have done so. 335 * let it know that we have done so. Because irqs are disabled,
336 * t->rcu_read_unlock_special cannot change.
354 */ 337 */
355 special = t->rcu_read_unlock_special; 338 special = t->rcu_read_unlock_special;
356 if (special & RCU_READ_UNLOCK_NEED_QS) { 339 if (special.b.need_qs) {
357 rcu_preempt_qs(smp_processor_id()); 340 rcu_preempt_qs();
358 if (!t->rcu_read_unlock_special) { 341 if (!t->rcu_read_unlock_special.s) {
359 local_irq_restore(flags); 342 local_irq_restore(flags);
360 return; 343 return;
361 } 344 }
@@ -368,8 +351,8 @@ void rcu_read_unlock_special(struct task_struct *t)
368 } 351 }
369 352
370 /* Clean up if blocked during RCU read-side critical section. */ 353 /* Clean up if blocked during RCU read-side critical section. */
371 if (special & RCU_READ_UNLOCK_BLOCKED) { 354 if (special.b.blocked) {
372 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED; 355 t->rcu_read_unlock_special.b.blocked = false;
373 356
374 /* 357 /*
375 * Remove this task from the list it blocked on. The 358 * Remove this task from the list it blocked on. The
@@ -442,8 +425,6 @@ void rcu_read_unlock_special(struct task_struct *t)
442 } 425 }
443} 426}
444 427
445#ifdef CONFIG_RCU_CPU_STALL_VERBOSE
446
447/* 428/*
448 * Dump detailed information for all tasks blocking the current RCU 429 * Dump detailed information for all tasks blocking the current RCU
449 * grace period on the specified rcu_node structure. 430 * grace period on the specified rcu_node structure.
@@ -478,14 +459,6 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp)
478 rcu_print_detail_task_stall_rnp(rnp); 459 rcu_print_detail_task_stall_rnp(rnp);
479} 460}
480 461
481#else /* #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */
482
483static void rcu_print_detail_task_stall(struct rcu_state *rsp)
484{
485}
486
487#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */
488
489#ifdef CONFIG_RCU_CPU_STALL_INFO 462#ifdef CONFIG_RCU_CPU_STALL_INFO
490 463
491static void rcu_print_task_stall_begin(struct rcu_node *rnp) 464static void rcu_print_task_stall_begin(struct rcu_node *rnp)
@@ -648,17 +621,18 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
648 * 621 *
649 * Caller must disable hard irqs. 622 * Caller must disable hard irqs.
650 */ 623 */
651static void rcu_preempt_check_callbacks(int cpu) 624static void rcu_preempt_check_callbacks(void)
652{ 625{
653 struct task_struct *t = current; 626 struct task_struct *t = current;
654 627
655 if (t->rcu_read_lock_nesting == 0) { 628 if (t->rcu_read_lock_nesting == 0) {
656 rcu_preempt_qs(cpu); 629 rcu_preempt_qs();
657 return; 630 return;
658 } 631 }
659 if (t->rcu_read_lock_nesting > 0 && 632 if (t->rcu_read_lock_nesting > 0 &&
660 per_cpu(rcu_preempt_data, cpu).qs_pending) 633 __this_cpu_read(rcu_preempt_data.qs_pending) &&
661 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; 634 !__this_cpu_read(rcu_preempt_data.passed_quiesce))
635 t->rcu_read_unlock_special.b.need_qs = true;
662} 636}
663 637
664#ifdef CONFIG_RCU_BOOST 638#ifdef CONFIG_RCU_BOOST
@@ -819,11 +793,6 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
819 * In fact, if you are using synchronize_rcu_expedited() in a loop, 793 * In fact, if you are using synchronize_rcu_expedited() in a loop,
820 * please restructure your code to batch your updates, and then Use a 794 * please restructure your code to batch your updates, and then Use a
821 * single synchronize_rcu() instead. 795 * single synchronize_rcu() instead.
822 *
823 * Note that it is illegal to call this function while holding any lock
824 * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal
825 * to call this function from a CPU-hotplug notifier. Failing to observe
826 * these restriction will result in deadlock.
827 */ 796 */
828void synchronize_rcu_expedited(void) 797void synchronize_rcu_expedited(void)
829{ 798{
@@ -845,7 +814,11 @@ void synchronize_rcu_expedited(void)
845 * being boosted. This simplifies the process of moving tasks 814 * being boosted. This simplifies the process of moving tasks
846 * from leaf to root rcu_node structures. 815 * from leaf to root rcu_node structures.
847 */ 816 */
848 get_online_cpus(); 817 if (!try_get_online_cpus()) {
818 /* CPU-hotplug operation in flight, fall back to normal GP. */
819 wait_rcu_gp(call_rcu);
820 return;
821 }
849 822
850 /* 823 /*
851 * Acquire lock, falling back to synchronize_rcu() if too many 824 * Acquire lock, falling back to synchronize_rcu() if too many
@@ -897,7 +870,8 @@ void synchronize_rcu_expedited(void)
897 870
898 /* Clean up and exit. */ 871 /* Clean up and exit. */
899 smp_mb(); /* ensure expedited GP seen before counter increment. */ 872 smp_mb(); /* ensure expedited GP seen before counter increment. */
900 ACCESS_ONCE(sync_rcu_preempt_exp_count)++; 873 ACCESS_ONCE(sync_rcu_preempt_exp_count) =
874 sync_rcu_preempt_exp_count + 1;
901unlock_mb_ret: 875unlock_mb_ret:
902 mutex_unlock(&sync_rcu_preempt_exp_mutex); 876 mutex_unlock(&sync_rcu_preempt_exp_mutex);
903mb_ret: 877mb_ret:
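
This hunk splits ACCESS_ONCE(sync_rcu_preempt_exp_count)++ into a plain load plus a single marked store; the same conversion is applied to the ->nocb_p_count updates further down. A rough sketch of what changes, assuming the usual definition of ACCESS_ONCE() from <linux/compiler.h> in this era:

/* Simplified definition (assumption): */
#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))

/* Old form: both the load and the store hide inside one "++" applied
 * through the volatile cast. */
ACCESS_ONCE(sync_rcu_preempt_exp_count)++;

/* New form: an ordinary load, then one explicit volatile store, arguably
 * the only access that needs marking here since the update is made while
 * holding sync_rcu_preempt_exp_mutex. */
ACCESS_ONCE(sync_rcu_preempt_exp_count) =
	sync_rcu_preempt_exp_count + 1;
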
@@ -941,11 +915,11 @@ void exit_rcu(void)
941 return; 915 return;
942 t->rcu_read_lock_nesting = 1; 916 t->rcu_read_lock_nesting = 1;
943 barrier(); 917 barrier();
944 t->rcu_read_unlock_special = RCU_READ_UNLOCK_BLOCKED; 918 t->rcu_read_unlock_special.b.blocked = true;
945 __rcu_read_unlock(); 919 __rcu_read_unlock();
946} 920}
947 921
948#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 922#else /* #ifdef CONFIG_PREEMPT_RCU */
949 923
950static struct rcu_state *rcu_state_p = &rcu_sched_state; 924static struct rcu_state *rcu_state_p = &rcu_sched_state;
951 925
@@ -971,7 +945,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed);
971 * Because preemptible RCU does not exist, we never have to check for 945 * Because preemptible RCU does not exist, we never have to check for
972 * CPUs being in quiescent states. 946 * CPUs being in quiescent states.
973 */ 947 */
974static void rcu_preempt_note_context_switch(int cpu) 948static void rcu_preempt_note_context_switch(void)
975{ 949{
976} 950}
977 951
@@ -1043,7 +1017,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
1043 * Because preemptible RCU does not exist, it never has any callbacks 1017 * Because preemptible RCU does not exist, it never has any callbacks
1044 * to check. 1018 * to check.
1045 */ 1019 */
1046static void rcu_preempt_check_callbacks(int cpu) 1020static void rcu_preempt_check_callbacks(void)
1047{ 1021{
1048} 1022}
1049 1023
@@ -1096,7 +1070,7 @@ void exit_rcu(void)
1096{ 1070{
1097} 1071}
1098 1072
1099#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ 1073#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
1100 1074
1101#ifdef CONFIG_RCU_BOOST 1075#ifdef CONFIG_RCU_BOOST
1102 1076
@@ -1352,7 +1326,7 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
1352 smp_mb__after_unlock_lock(); 1326 smp_mb__after_unlock_lock();
1353 rnp->boost_kthread_task = t; 1327 rnp->boost_kthread_task = t;
1354 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1328 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1355 sp.sched_priority = RCU_BOOST_PRIO; 1329 sp.sched_priority = kthread_prio;
1356 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); 1330 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1357 wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ 1331 wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
1358 return 0; 1332 return 0;
@@ -1369,7 +1343,7 @@ static void rcu_cpu_kthread_setup(unsigned int cpu)
1369{ 1343{
1370 struct sched_param sp; 1344 struct sched_param sp;
1371 1345
1372 sp.sched_priority = RCU_KTHREAD_PRIO; 1346 sp.sched_priority = kthread_prio;
1373 sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); 1347 sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
1374} 1348}
1375 1349
@@ -1462,14 +1436,13 @@ static struct smp_hotplug_thread rcu_cpu_thread_spec = {
1462}; 1436};
1463 1437
1464/* 1438/*
1465 * Spawn all kthreads -- called as soon as the scheduler is running. 1439 * Spawn boost kthreads -- called as soon as the scheduler is running.
1466 */ 1440 */
1467static int __init rcu_spawn_kthreads(void) 1441static void __init rcu_spawn_boost_kthreads(void)
1468{ 1442{
1469 struct rcu_node *rnp; 1443 struct rcu_node *rnp;
1470 int cpu; 1444 int cpu;
1471 1445
1472 rcu_scheduler_fully_active = 1;
1473 for_each_possible_cpu(cpu) 1446 for_each_possible_cpu(cpu)
1474 per_cpu(rcu_cpu_has_work, cpu) = 0; 1447 per_cpu(rcu_cpu_has_work, cpu) = 0;
1475 BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec)); 1448 BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
@@ -1479,9 +1452,7 @@ static int __init rcu_spawn_kthreads(void)
1479 rcu_for_each_leaf_node(rcu_state_p, rnp) 1452 rcu_for_each_leaf_node(rcu_state_p, rnp)
1480 (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); 1453 (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
1481 } 1454 }
1482 return 0;
1483} 1455}
1484early_initcall(rcu_spawn_kthreads);
1485 1456
1486static void rcu_prepare_kthreads(int cpu) 1457static void rcu_prepare_kthreads(int cpu)
1487{ 1458{
@@ -1519,12 +1490,9 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
1519{ 1490{
1520} 1491}
1521 1492
1522static int __init rcu_scheduler_really_started(void) 1493static void __init rcu_spawn_boost_kthreads(void)
1523{ 1494{
1524 rcu_scheduler_fully_active = 1;
1525 return 0;
1526} 1495}
1527early_initcall(rcu_scheduler_really_started);
1528 1496
1529static void rcu_prepare_kthreads(int cpu) 1497static void rcu_prepare_kthreads(int cpu)
1530{ 1498{
@@ -1544,10 +1512,10 @@ static void rcu_prepare_kthreads(int cpu)
1544 * any flavor of RCU. 1512 * any flavor of RCU.
1545 */ 1513 */
1546#ifndef CONFIG_RCU_NOCB_CPU_ALL 1514#ifndef CONFIG_RCU_NOCB_CPU_ALL
1547int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) 1515int rcu_needs_cpu(unsigned long *delta_jiffies)
1548{ 1516{
1549 *delta_jiffies = ULONG_MAX; 1517 *delta_jiffies = ULONG_MAX;
1550 return rcu_cpu_has_callbacks(cpu, NULL); 1518 return rcu_cpu_has_callbacks(NULL);
1551} 1519}
1552#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ 1520#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
1553 1521
@@ -1555,7 +1523,7 @@ int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
1555 * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up 1523 * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
1556 * after it. 1524 * after it.
1557 */ 1525 */
1558static void rcu_cleanup_after_idle(int cpu) 1526static void rcu_cleanup_after_idle(void)
1559{ 1527{
1560} 1528}
1561 1529
@@ -1563,7 +1531,7 @@ static void rcu_cleanup_after_idle(int cpu)
1563 * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n, 1531 * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n,
1564 * is nothing. 1532 * is nothing.
1565 */ 1533 */
1566static void rcu_prepare_for_idle(int cpu) 1534static void rcu_prepare_for_idle(void)
1567{ 1535{
1568} 1536}
1569 1537
@@ -1625,7 +1593,7 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void)
1625 1593
1626 /* Exit early if we advanced recently. */ 1594 /* Exit early if we advanced recently. */
1627 if (jiffies == rdtp->last_advance_all) 1595 if (jiffies == rdtp->last_advance_all)
1628 return 0; 1596 return false;
1629 rdtp->last_advance_all = jiffies; 1597 rdtp->last_advance_all = jiffies;
1630 1598
1631 for_each_rcu_flavor(rsp) { 1599 for_each_rcu_flavor(rsp) {
@@ -1656,15 +1624,15 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void)
1656 * The caller must have disabled interrupts. 1624 * The caller must have disabled interrupts.
1657 */ 1625 */
1658#ifndef CONFIG_RCU_NOCB_CPU_ALL 1626#ifndef CONFIG_RCU_NOCB_CPU_ALL
1659int rcu_needs_cpu(int cpu, unsigned long *dj) 1627int rcu_needs_cpu(unsigned long *dj)
1660{ 1628{
1661 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); 1629 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
1662 1630
1663 /* Snapshot to detect later posting of non-lazy callback. */ 1631 /* Snapshot to detect later posting of non-lazy callback. */
1664 rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; 1632 rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
1665 1633
1666 /* If no callbacks, RCU doesn't need the CPU. */ 1634 /* If no callbacks, RCU doesn't need the CPU. */
1667 if (!rcu_cpu_has_callbacks(cpu, &rdtp->all_lazy)) { 1635 if (!rcu_cpu_has_callbacks(&rdtp->all_lazy)) {
1668 *dj = ULONG_MAX; 1636 *dj = ULONG_MAX;
1669 return 0; 1637 return 0;
1670 } 1638 }
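
Several hunks in this file follow the same conversion: the explicit cpu argument disappears and &per_cpu(var, cpu) becomes this_cpu_ptr(&var) or __this_cpu_read(), which is safe only because these paths run on the CPU whose state they touch, with preemption or interrupts disabled. A stand-alone sketch of the two styles, using a hypothetical per-CPU structure:

#include <linux/percpu.h>

struct demo_state {
	unsigned long hits;
};
static DEFINE_PER_CPU(struct demo_state, demo_state);

/* Old style: the caller names the CPU explicitly. */
static void demo_account_old(int cpu)
{
	struct demo_state *sp = &per_cpu(demo_state, cpu);

	sp->hits++;
}

/* New style: implicitly the current CPU; the caller must already have
 * preemption (or interrupts) disabled so the CPU cannot change underneath. */
static void demo_account_new(void)
{
	struct demo_state *sp = this_cpu_ptr(&demo_state);

	sp->hits++;
}
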
@@ -1698,12 +1666,12 @@ int rcu_needs_cpu(int cpu, unsigned long *dj)
1698 * 1666 *
1699 * The caller must have disabled interrupts. 1667 * The caller must have disabled interrupts.
1700 */ 1668 */
1701static void rcu_prepare_for_idle(int cpu) 1669static void rcu_prepare_for_idle(void)
1702{ 1670{
1703#ifndef CONFIG_RCU_NOCB_CPU_ALL 1671#ifndef CONFIG_RCU_NOCB_CPU_ALL
1704 bool needwake; 1672 bool needwake;
1705 struct rcu_data *rdp; 1673 struct rcu_data *rdp;
1706 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); 1674 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
1707 struct rcu_node *rnp; 1675 struct rcu_node *rnp;
1708 struct rcu_state *rsp; 1676 struct rcu_state *rsp;
1709 int tne; 1677 int tne;
@@ -1711,7 +1679,7 @@ static void rcu_prepare_for_idle(int cpu)
1711 /* Handle nohz enablement switches conservatively. */ 1679 /* Handle nohz enablement switches conservatively. */
1712 tne = ACCESS_ONCE(tick_nohz_active); 1680 tne = ACCESS_ONCE(tick_nohz_active);
1713 if (tne != rdtp->tick_nohz_enabled_snap) { 1681 if (tne != rdtp->tick_nohz_enabled_snap) {
1714 if (rcu_cpu_has_callbacks(cpu, NULL)) 1682 if (rcu_cpu_has_callbacks(NULL))
1715 invoke_rcu_core(); /* force nohz to see update. */ 1683 invoke_rcu_core(); /* force nohz to see update. */
1716 rdtp->tick_nohz_enabled_snap = tne; 1684 rdtp->tick_nohz_enabled_snap = tne;
1717 return; 1685 return;
@@ -1720,7 +1688,7 @@ static void rcu_prepare_for_idle(int cpu)
1720 return; 1688 return;
1721 1689
1722 /* If this is a no-CBs CPU, no callbacks, just return. */ 1690 /* If this is a no-CBs CPU, no callbacks, just return. */
1723 if (rcu_is_nocb_cpu(cpu)) 1691 if (rcu_is_nocb_cpu(smp_processor_id()))
1724 return; 1692 return;
1725 1693
1726 /* 1694 /*
@@ -1744,7 +1712,7 @@ static void rcu_prepare_for_idle(int cpu)
1744 return; 1712 return;
1745 rdtp->last_accelerate = jiffies; 1713 rdtp->last_accelerate = jiffies;
1746 for_each_rcu_flavor(rsp) { 1714 for_each_rcu_flavor(rsp) {
1747 rdp = per_cpu_ptr(rsp->rda, cpu); 1715 rdp = this_cpu_ptr(rsp->rda);
1748 if (!*rdp->nxttail[RCU_DONE_TAIL]) 1716 if (!*rdp->nxttail[RCU_DONE_TAIL])
1749 continue; 1717 continue;
1750 rnp = rdp->mynode; 1718 rnp = rdp->mynode;
@@ -1763,10 +1731,10 @@ static void rcu_prepare_for_idle(int cpu)
1763 * any grace periods that elapsed while the CPU was idle, and if any 1731 * any grace periods that elapsed while the CPU was idle, and if any
1764 * callbacks are now ready to invoke, initiate invocation. 1732 * callbacks are now ready to invoke, initiate invocation.
1765 */ 1733 */
1766static void rcu_cleanup_after_idle(int cpu) 1734static void rcu_cleanup_after_idle(void)
1767{ 1735{
1768#ifndef CONFIG_RCU_NOCB_CPU_ALL 1736#ifndef CONFIG_RCU_NOCB_CPU_ALL
1769 if (rcu_is_nocb_cpu(cpu)) 1737 if (rcu_is_nocb_cpu(smp_processor_id()))
1770 return; 1738 return;
1771 if (rcu_try_advance_all_cbs()) 1739 if (rcu_try_advance_all_cbs())
1772 invoke_rcu_core(); 1740 invoke_rcu_core();
@@ -1848,7 +1816,7 @@ static int rcu_oom_notify(struct notifier_block *self,
1848 get_online_cpus(); 1816 get_online_cpus();
1849 for_each_online_cpu(cpu) { 1817 for_each_online_cpu(cpu) {
1850 smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1); 1818 smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1);
1851 cond_resched(); 1819 cond_resched_rcu_qs();
1852 } 1820 }
1853 put_online_cpus(); 1821 put_online_cpus();
1854 1822
@@ -2075,13 +2043,40 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force)
2075 if (!ACCESS_ONCE(rdp_leader->nocb_kthread)) 2043 if (!ACCESS_ONCE(rdp_leader->nocb_kthread))
2076 return; 2044 return;
2077 if (ACCESS_ONCE(rdp_leader->nocb_leader_sleep) || force) { 2045 if (ACCESS_ONCE(rdp_leader->nocb_leader_sleep) || force) {
2078 /* Prior xchg orders against prior callback enqueue. */ 2046 /* Prior smp_mb__after_atomic() orders against prior enqueue. */
2079 ACCESS_ONCE(rdp_leader->nocb_leader_sleep) = false; 2047 ACCESS_ONCE(rdp_leader->nocb_leader_sleep) = false;
2080 wake_up(&rdp_leader->nocb_wq); 2048 wake_up(&rdp_leader->nocb_wq);
2081 } 2049 }
2082} 2050}
2083 2051
2084/* 2052/*
2053 * Does the specified CPU need an RCU callback for the specified flavor
2054 * of rcu_barrier()?
2055 */
2056static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
2057{
2058 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
2059 struct rcu_head *rhp;
2060
2061 /* No-CBs CPUs might have callbacks on any of three lists. */
2062 rhp = ACCESS_ONCE(rdp->nocb_head);
2063 if (!rhp)
2064 rhp = ACCESS_ONCE(rdp->nocb_gp_head);
2065 if (!rhp)
2066 rhp = ACCESS_ONCE(rdp->nocb_follower_head);
2067
2068 /* Having no rcuo kthread but CBs after scheduler starts is bad! */
2069 if (!ACCESS_ONCE(rdp->nocb_kthread) && rhp) {
2070 /* RCU callback enqueued before CPU first came online??? */
2071 pr_err("RCU: Never-onlined no-CBs CPU %d has CB %p\n",
2072 cpu, rhp->func);
2073 WARN_ON_ONCE(1);
2074 }
2075
2076 return !!rhp;
2077}
2078
2079/*
2085 * Enqueue the specified string of rcu_head structures onto the specified 2080 * Enqueue the specified string of rcu_head structures onto the specified
2086 * CPU's no-CBs lists. The CPU is specified by rdp, the head of the 2081 * CPU's no-CBs lists. The CPU is specified by rdp, the head of the
2087 * string by rhp, and the tail of the string by rhtp. The non-lazy/lazy 2082 * string by rhp, and the tail of the string by rhtp. The non-lazy/lazy
@@ -2104,6 +2099,7 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
2104 ACCESS_ONCE(*old_rhpp) = rhp; 2099 ACCESS_ONCE(*old_rhpp) = rhp;
2105 atomic_long_add(rhcount, &rdp->nocb_q_count); 2100 atomic_long_add(rhcount, &rdp->nocb_q_count);
2106 atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy); 2101 atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy);
2102 smp_mb__after_atomic(); /* Store *old_rhpp before _wake test. */
2107 2103
2108 /* If we are not being polled and there is a kthread, awaken it ... */ 2104 /* If we are not being polled and there is a kthread, awaken it ... */
2109 t = ACCESS_ONCE(rdp->nocb_kthread); 2105 t = ACCESS_ONCE(rdp->nocb_kthread);
@@ -2120,16 +2116,23 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
2120 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 2116 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2121 TPS("WakeEmpty")); 2117 TPS("WakeEmpty"));
2122 } else { 2118 } else {
2123 rdp->nocb_defer_wakeup = true; 2119 rdp->nocb_defer_wakeup = RCU_NOGP_WAKE;
2124 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 2120 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2125 TPS("WakeEmptyIsDeferred")); 2121 TPS("WakeEmptyIsDeferred"));
2126 } 2122 }
2127 rdp->qlen_last_fqs_check = 0; 2123 rdp->qlen_last_fqs_check = 0;
2128 } else if (len > rdp->qlen_last_fqs_check + qhimark) { 2124 } else if (len > rdp->qlen_last_fqs_check + qhimark) {
2129 /* ... or if many callbacks queued. */ 2125 /* ... or if many callbacks queued. */
2130 wake_nocb_leader(rdp, true); 2126 if (!irqs_disabled_flags(flags)) {
2127 wake_nocb_leader(rdp, true);
2128 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2129 TPS("WakeOvf"));
2130 } else {
2131 rdp->nocb_defer_wakeup = RCU_NOGP_WAKE_FORCE;
2132 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2133 TPS("WakeOvfIsDeferred"));
2134 }
2131 rdp->qlen_last_fqs_check = LONG_MAX / 2; 2135 rdp->qlen_last_fqs_check = LONG_MAX / 2;
2132 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf"));
2133 } else { 2136 } else {
2134 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeNot")); 2137 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeNot"));
2135 } 2138 }
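
The no-CBs deferred wakeup is promoted from a bool to a small integer so that an overflow wakeup requested with interrupts disabled can be remembered as forced. The RCU_NOGP_WAKE_* values are not visible in this hunk; a sketch of the levels this code appears to rely on, assuming the definitions added to tree.h by the same series:

#define RCU_NOGP_WAKE_NOT	0	/* No deferred wakeup pending. */
#define RCU_NOGP_WAKE		1	/* Defer an ordinary leader wakeup. */
#define RCU_NOGP_WAKE_FORCE	2	/* Defer a forced leader wakeup. */

do_nocb_deferred_wakeup() below reads the recorded level back and passes ndw == RCU_NOGP_WAKE_FORCE as the force argument to wake_nocb_leader().
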
@@ -2150,7 +2153,7 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
2150{ 2153{
2151 2154
2152 if (!rcu_is_nocb_cpu(rdp->cpu)) 2155 if (!rcu_is_nocb_cpu(rdp->cpu))
2153 return 0; 2156 return false;
2154 __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy, flags); 2157 __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy, flags);
2155 if (__is_kfree_rcu_offset((unsigned long)rhp->func)) 2158 if (__is_kfree_rcu_offset((unsigned long)rhp->func))
2156 trace_rcu_kfree_callback(rdp->rsp->name, rhp, 2159 trace_rcu_kfree_callback(rdp->rsp->name, rhp,
@@ -2161,7 +2164,18 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
2161 trace_rcu_callback(rdp->rsp->name, rhp, 2164 trace_rcu_callback(rdp->rsp->name, rhp,
2162 -atomic_long_read(&rdp->nocb_q_count_lazy), 2165 -atomic_long_read(&rdp->nocb_q_count_lazy),
2163 -atomic_long_read(&rdp->nocb_q_count)); 2166 -atomic_long_read(&rdp->nocb_q_count));
2164 return 1; 2167
2168 /*
2169 * If called from an extended quiescent state with interrupts
2170 * disabled, invoke the RCU core in order to allow the idle-entry
2171 * deferred-wakeup check to function.
2172 */
2173 if (irqs_disabled_flags(flags) &&
2174 !rcu_is_watching() &&
2175 cpu_online(smp_processor_id()))
2176 invoke_rcu_core();
2177
2178 return true;
2165} 2179}
2166 2180
2167/* 2181/*
@@ -2177,7 +2191,7 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
2177 2191
2178 /* If this is not a no-CBs CPU, tell the caller to do it the old way. */ 2192 /* If this is not a no-CBs CPU, tell the caller to do it the old way. */
2179 if (!rcu_is_nocb_cpu(smp_processor_id())) 2193 if (!rcu_is_nocb_cpu(smp_processor_id()))
2180 return 0; 2194 return false;
2181 rsp->qlen = 0; 2195 rsp->qlen = 0;
2182 rsp->qlen_lazy = 0; 2196 rsp->qlen_lazy = 0;
2183 2197
@@ -2196,7 +2210,7 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
2196 rsp->orphan_nxtlist = NULL; 2210 rsp->orphan_nxtlist = NULL;
2197 rsp->orphan_nxttail = &rsp->orphan_nxtlist; 2211 rsp->orphan_nxttail = &rsp->orphan_nxtlist;
2198 } 2212 }
2199 return 1; 2213 return true;
2200} 2214}
2201 2215
2202/* 2216/*
@@ -2229,7 +2243,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
2229 (d = ULONG_CMP_GE(ACCESS_ONCE(rnp->completed), c))); 2243 (d = ULONG_CMP_GE(ACCESS_ONCE(rnp->completed), c)));
2230 if (likely(d)) 2244 if (likely(d))
2231 break; 2245 break;
2232 flush_signals(current); 2246 WARN_ON(signal_pending(current));
2233 trace_rcu_future_gp(rnp, rdp, c, TPS("ResumeWait")); 2247 trace_rcu_future_gp(rnp, rdp, c, TPS("ResumeWait"));
2234 } 2248 }
2235 trace_rcu_future_gp(rnp, rdp, c, TPS("EndWait")); 2249 trace_rcu_future_gp(rnp, rdp, c, TPS("EndWait"));
@@ -2288,7 +2302,7 @@ wait_again:
2288 if (!rcu_nocb_poll) 2302 if (!rcu_nocb_poll)
2289 trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, 2303 trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu,
2290 "WokeEmpty"); 2304 "WokeEmpty");
2291 flush_signals(current); 2305 WARN_ON(signal_pending(current));
2292 schedule_timeout_interruptible(1); 2306 schedule_timeout_interruptible(1);
2293 2307
2294 /* Rescan in case we were a victim of memory ordering. */ 2308 /* Rescan in case we were a victim of memory ordering. */
@@ -2327,6 +2341,7 @@ wait_again:
2327 atomic_long_add(rdp->nocb_gp_count, &rdp->nocb_follower_count); 2341 atomic_long_add(rdp->nocb_gp_count, &rdp->nocb_follower_count);
2328 atomic_long_add(rdp->nocb_gp_count_lazy, 2342 atomic_long_add(rdp->nocb_gp_count_lazy,
2329 &rdp->nocb_follower_count_lazy); 2343 &rdp->nocb_follower_count_lazy);
2344 smp_mb__after_atomic(); /* Store *tail before wakeup. */
2330 if (rdp != my_rdp && tail == &rdp->nocb_follower_head) { 2345 if (rdp != my_rdp && tail == &rdp->nocb_follower_head) {
2331 /* 2346 /*
2332 * List was empty, wake up the follower. 2347 * List was empty, wake up the follower.
@@ -2367,7 +2382,7 @@ static void nocb_follower_wait(struct rcu_data *rdp)
2367 if (!rcu_nocb_poll) 2382 if (!rcu_nocb_poll)
2368 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 2383 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2369 "WokeEmpty"); 2384 "WokeEmpty");
2370 flush_signals(current); 2385 WARN_ON(signal_pending(current));
2371 schedule_timeout_interruptible(1); 2386 schedule_timeout_interruptible(1);
2372 } 2387 }
2373} 2388}
@@ -2428,15 +2443,16 @@ static int rcu_nocb_kthread(void *arg)
2428 list = next; 2443 list = next;
2429 } 2444 }
2430 trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1); 2445 trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1);
2431 ACCESS_ONCE(rdp->nocb_p_count) -= c; 2446 ACCESS_ONCE(rdp->nocb_p_count) = rdp->nocb_p_count - c;
2432 ACCESS_ONCE(rdp->nocb_p_count_lazy) -= cl; 2447 ACCESS_ONCE(rdp->nocb_p_count_lazy) =
2448 rdp->nocb_p_count_lazy - cl;
2433 rdp->n_nocbs_invoked += c; 2449 rdp->n_nocbs_invoked += c;
2434 } 2450 }
2435 return 0; 2451 return 0;
2436} 2452}
2437 2453
2438/* Is a deferred wakeup of rcu_nocb_kthread() required? */ 2454/* Is a deferred wakeup of rcu_nocb_kthread() required? */
2439static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) 2455static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
2440{ 2456{
2441 return ACCESS_ONCE(rdp->nocb_defer_wakeup); 2457 return ACCESS_ONCE(rdp->nocb_defer_wakeup);
2442} 2458}
@@ -2444,11 +2460,79 @@ static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
2444/* Do a deferred wakeup of rcu_nocb_kthread(). */ 2460/* Do a deferred wakeup of rcu_nocb_kthread(). */
2445static void do_nocb_deferred_wakeup(struct rcu_data *rdp) 2461static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
2446{ 2462{
2463 int ndw;
2464
2447 if (!rcu_nocb_need_deferred_wakeup(rdp)) 2465 if (!rcu_nocb_need_deferred_wakeup(rdp))
2448 return; 2466 return;
2449 ACCESS_ONCE(rdp->nocb_defer_wakeup) = false; 2467 ndw = ACCESS_ONCE(rdp->nocb_defer_wakeup);
2450 wake_nocb_leader(rdp, false); 2468 ACCESS_ONCE(rdp->nocb_defer_wakeup) = RCU_NOGP_WAKE_NOT;
2451 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWakeEmpty")); 2469 wake_nocb_leader(rdp, ndw == RCU_NOGP_WAKE_FORCE);
2470 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWake"));
2471}
2472
2473void __init rcu_init_nohz(void)
2474{
2475 int cpu;
2476 bool need_rcu_nocb_mask = true;
2477 struct rcu_state *rsp;
2478
2479#ifdef CONFIG_RCU_NOCB_CPU_NONE
2480 need_rcu_nocb_mask = false;
 2481#endif /* #ifdef CONFIG_RCU_NOCB_CPU_NONE */
2482
2483#if defined(CONFIG_NO_HZ_FULL)
2484 if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask))
2485 need_rcu_nocb_mask = true;
2486#endif /* #if defined(CONFIG_NO_HZ_FULL) */
2487
2488 if (!have_rcu_nocb_mask && need_rcu_nocb_mask) {
2489 if (!zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL)) {
2490 pr_info("rcu_nocb_mask allocation failed, callback offloading disabled.\n");
2491 return;
2492 }
2493 have_rcu_nocb_mask = true;
2494 }
2495 if (!have_rcu_nocb_mask)
2496 return;
2497
2498#ifdef CONFIG_RCU_NOCB_CPU_ZERO
2499 pr_info("\tOffload RCU callbacks from CPU 0\n");
2500 cpumask_set_cpu(0, rcu_nocb_mask);
2501#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */
2502#ifdef CONFIG_RCU_NOCB_CPU_ALL
2503 pr_info("\tOffload RCU callbacks from all CPUs\n");
2504 cpumask_copy(rcu_nocb_mask, cpu_possible_mask);
2505#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */
2506#if defined(CONFIG_NO_HZ_FULL)
2507 if (tick_nohz_full_running)
2508 cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask);
2509#endif /* #if defined(CONFIG_NO_HZ_FULL) */
2510
2511 if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) {
2512 pr_info("\tNote: kernel parameter 'rcu_nocbs=' contains nonexistent CPUs.\n");
2513 cpumask_and(rcu_nocb_mask, cpu_possible_mask,
2514 rcu_nocb_mask);
2515 }
2516 cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask);
2517 pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf);
2518 if (rcu_nocb_poll)
2519 pr_info("\tPoll for callbacks from no-CBs CPUs.\n");
2520
2521 for_each_rcu_flavor(rsp) {
2522 for_each_cpu(cpu, rcu_nocb_mask) {
2523 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
2524
2525 /*
2526 * If there are early callbacks, they will need
2527 * to be moved to the nocb lists.
2528 */
2529 WARN_ON_ONCE(rdp->nxttail[RCU_NEXT_TAIL] !=
2530 &rdp->nxtlist &&
2531 rdp->nxttail[RCU_NEXT_TAIL] != NULL);
2532 init_nocb_callback_list(rdp);
2533 }
2534 rcu_organize_nocb_kthreads(rsp);
2535 }
2452} 2536}
2453 2537
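
The new rcu_init_nohz() builds rcu_nocb_mask in one place: the Kconfig defaults (CPU 0 only, or all possible CPUs), the rcu_nocbs= boot list, and any nohz_full= CPUs are folded into a single mask before the per-CPU callback lists are switched over. An illustrative boot configuration, with example CPU ranges that are not taken from this patch:

    rcu_nocbs=1-7 nohz_full=1-7

With CONFIG_RCU_NOCB_CPU_ALL=y the explicit list is superseded by cpu_possible_mask, as the cpumask_copy() above shows.
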
2454/* Initialize per-rcu_data variables for no-CBs CPUs. */ 2538/* Initialize per-rcu_data variables for no-CBs CPUs. */
@@ -2459,15 +2543,89 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
2459 rdp->nocb_follower_tail = &rdp->nocb_follower_head; 2543 rdp->nocb_follower_tail = &rdp->nocb_follower_head;
2460} 2544}
2461 2545
2546/*
2547 * If the specified CPU is a no-CBs CPU that does not already have its
2548 * rcuo kthread for the specified RCU flavor, spawn it. If the CPUs are
2549 * brought online out of order, this can require re-organizing the
2550 * leader-follower relationships.
2551 */
2552static void rcu_spawn_one_nocb_kthread(struct rcu_state *rsp, int cpu)
2553{
2554 struct rcu_data *rdp;
2555 struct rcu_data *rdp_last;
2556 struct rcu_data *rdp_old_leader;
2557 struct rcu_data *rdp_spawn = per_cpu_ptr(rsp->rda, cpu);
2558 struct task_struct *t;
2559
2560 /*
2561 * If this isn't a no-CBs CPU or if it already has an rcuo kthread,
2562 * then nothing to do.
2563 */
2564 if (!rcu_is_nocb_cpu(cpu) || rdp_spawn->nocb_kthread)
2565 return;
2566
2567 /* If we didn't spawn the leader first, reorganize! */
2568 rdp_old_leader = rdp_spawn->nocb_leader;
2569 if (rdp_old_leader != rdp_spawn && !rdp_old_leader->nocb_kthread) {
2570 rdp_last = NULL;
2571 rdp = rdp_old_leader;
2572 do {
2573 rdp->nocb_leader = rdp_spawn;
2574 if (rdp_last && rdp != rdp_spawn)
2575 rdp_last->nocb_next_follower = rdp;
2576 if (rdp == rdp_spawn) {
2577 rdp = rdp->nocb_next_follower;
2578 } else {
2579 rdp_last = rdp;
2580 rdp = rdp->nocb_next_follower;
2581 rdp_last->nocb_next_follower = NULL;
2582 }
2583 } while (rdp);
2584 rdp_spawn->nocb_next_follower = rdp_old_leader;
2585 }
2586
2587 /* Spawn the kthread for this CPU and RCU flavor. */
2588 t = kthread_run(rcu_nocb_kthread, rdp_spawn,
2589 "rcuo%c/%d", rsp->abbr, cpu);
2590 BUG_ON(IS_ERR(t));
2591 ACCESS_ONCE(rdp_spawn->nocb_kthread) = t;
2592}
2593
2594/*
2595 * If the specified CPU is a no-CBs CPU that does not already have its
2596 * rcuo kthreads, spawn them.
2597 */
2598static void rcu_spawn_all_nocb_kthreads(int cpu)
2599{
2600 struct rcu_state *rsp;
2601
2602 if (rcu_scheduler_fully_active)
2603 for_each_rcu_flavor(rsp)
2604 rcu_spawn_one_nocb_kthread(rsp, cpu);
2605}
2606
2607/*
2608 * Once the scheduler is running, spawn rcuo kthreads for all online
2609 * no-CBs CPUs. This assumes that the early_initcall()s happen before
2610 * non-boot CPUs come online -- if this changes, we will need to add
2611 * some mutual exclusion.
2612 */
2613static void __init rcu_spawn_nocb_kthreads(void)
2614{
2615 int cpu;
2616
2617 for_each_online_cpu(cpu)
2618 rcu_spawn_all_nocb_kthreads(cpu);
2619}
2620
2462/* How many follower CPU IDs per leader? Default of -1 for sqrt(nr_cpu_ids). */ 2621/* How many follower CPU IDs per leader? Default of -1 for sqrt(nr_cpu_ids). */
2463static int rcu_nocb_leader_stride = -1; 2622static int rcu_nocb_leader_stride = -1;
2464module_param(rcu_nocb_leader_stride, int, 0444); 2623module_param(rcu_nocb_leader_stride, int, 0444);
2465 2624
2466/* 2625/*
2467 * Create a kthread for each RCU flavor for each no-CBs CPU. 2626 * Initialize leader-follower relationships for all no-CBs CPUs.
2468 * Also initialize leader-follower relationships.
2469 */ 2627 */
2470static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) 2628static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp)
2471{ 2629{
2472 int cpu; 2630 int cpu;
2473 int ls = rcu_nocb_leader_stride; 2631 int ls = rcu_nocb_leader_stride;
@@ -2475,14 +2633,9 @@ static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
2475 struct rcu_data *rdp; 2633 struct rcu_data *rdp;
2476 struct rcu_data *rdp_leader = NULL; /* Suppress misguided gcc warn. */ 2634 struct rcu_data *rdp_leader = NULL; /* Suppress misguided gcc warn. */
2477 struct rcu_data *rdp_prev = NULL; 2635 struct rcu_data *rdp_prev = NULL;
2478 struct task_struct *t;
2479 2636
2480 if (rcu_nocb_mask == NULL) 2637 if (!have_rcu_nocb_mask)
2481 return; 2638 return;
2482#if defined(CONFIG_NO_HZ_FULL) && !defined(CONFIG_NO_HZ_FULL_ALL)
2483 if (tick_nohz_full_running)
2484 cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask);
2485#endif /* #if defined(CONFIG_NO_HZ_FULL) && !defined(CONFIG_NO_HZ_FULL_ALL) */
2486 if (ls == -1) { 2639 if (ls == -1) {
2487 ls = int_sqrt(nr_cpu_ids); 2640 ls = int_sqrt(nr_cpu_ids);
2488 rcu_nocb_leader_stride = ls; 2641 rcu_nocb_leader_stride = ls;
@@ -2505,27 +2658,27 @@ static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
2505 rdp_prev->nocb_next_follower = rdp; 2658 rdp_prev->nocb_next_follower = rdp;
2506 } 2659 }
2507 rdp_prev = rdp; 2660 rdp_prev = rdp;
2508
2509 /* Spawn the kthread for this CPU. */
2510 t = kthread_run(rcu_nocb_kthread, rdp,
2511 "rcuo%c/%d", rsp->abbr, cpu);
2512 BUG_ON(IS_ERR(t));
2513 ACCESS_ONCE(rdp->nocb_kthread) = t;
2514 } 2661 }
2515} 2662}
2516 2663
2517/* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */ 2664/* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */
2518static bool init_nocb_callback_list(struct rcu_data *rdp) 2665static bool init_nocb_callback_list(struct rcu_data *rdp)
2519{ 2666{
2520 if (rcu_nocb_mask == NULL || 2667 if (!rcu_is_nocb_cpu(rdp->cpu))
2521 !cpumask_test_cpu(rdp->cpu, rcu_nocb_mask))
2522 return false; 2668 return false;
2669
2523 rdp->nxttail[RCU_NEXT_TAIL] = NULL; 2670 rdp->nxttail[RCU_NEXT_TAIL] = NULL;
2524 return true; 2671 return true;
2525} 2672}
2526 2673
2527#else /* #ifdef CONFIG_RCU_NOCB_CPU */ 2674#else /* #ifdef CONFIG_RCU_NOCB_CPU */
2528 2675
2676static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
2677{
2678 WARN_ON_ONCE(1); /* Should be dead code. */
2679 return false;
2680}
2681
2529static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) 2682static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
2530{ 2683{
2531} 2684}
@@ -2541,21 +2694,21 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)
2541static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, 2694static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
2542 bool lazy, unsigned long flags) 2695 bool lazy, unsigned long flags)
2543{ 2696{
2544 return 0; 2697 return false;
2545} 2698}
2546 2699
2547static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, 2700static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
2548 struct rcu_data *rdp, 2701 struct rcu_data *rdp,
2549 unsigned long flags) 2702 unsigned long flags)
2550{ 2703{
2551 return 0; 2704 return false;
2552} 2705}
2553 2706
2554static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) 2707static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
2555{ 2708{
2556} 2709}
2557 2710
2558static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) 2711static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
2559{ 2712{
2560 return false; 2713 return false;
2561} 2714}
@@ -2564,7 +2717,11 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
2564{ 2717{
2565} 2718}
2566 2719
2567static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) 2720static void rcu_spawn_all_nocb_kthreads(int cpu)
2721{
2722}
2723
2724static void __init rcu_spawn_nocb_kthreads(void)
2568{ 2725{
2569} 2726}
2570 2727
@@ -2595,16 +2752,6 @@ static void __maybe_unused rcu_kick_nohz_cpu(int cpu)
2595 2752
2596#ifdef CONFIG_NO_HZ_FULL_SYSIDLE 2753#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
2597 2754
2598/*
2599 * Define RCU flavor that holds sysidle state. This needs to be the
2600 * most active flavor of RCU.
2601 */
2602#ifdef CONFIG_PREEMPT_RCU
2603static struct rcu_state *rcu_sysidle_state = &rcu_preempt_state;
2604#else /* #ifdef CONFIG_PREEMPT_RCU */
2605static struct rcu_state *rcu_sysidle_state = &rcu_sched_state;
2606#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
2607
2608static int full_sysidle_state; /* Current system-idle state. */ 2755static int full_sysidle_state; /* Current system-idle state. */
2609#define RCU_SYSIDLE_NOT 0 /* Some CPU is not idle. */ 2756#define RCU_SYSIDLE_NOT 0 /* Some CPU is not idle. */
2610#define RCU_SYSIDLE_SHORT 1 /* All CPUs idle for brief period. */ 2757#define RCU_SYSIDLE_SHORT 1 /* All CPUs idle for brief period. */
@@ -2618,9 +2765,14 @@ static int full_sysidle_state; /* Current system-idle state. */
2618 * to detect full-system idle states, not RCU quiescent states and grace 2765 * to detect full-system idle states, not RCU quiescent states and grace
2619 * periods. The caller must have disabled interrupts. 2766 * periods. The caller must have disabled interrupts.
2620 */ 2767 */
2621static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq) 2768static void rcu_sysidle_enter(int irq)
2622{ 2769{
2623 unsigned long j; 2770 unsigned long j;
2771 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
2772
2773 /* If there are no nohz_full= CPUs, no need to track this. */
2774 if (!tick_nohz_full_enabled())
2775 return;
2624 2776
2625 /* Adjust nesting, check for fully idle. */ 2777 /* Adjust nesting, check for fully idle. */
2626 if (irq) { 2778 if (irq) {
@@ -2685,8 +2837,14 @@ void rcu_sysidle_force_exit(void)
2685 * usermode execution does -not- count as idle here! The caller must 2837 * usermode execution does -not- count as idle here! The caller must
2686 * have disabled interrupts. 2838 * have disabled interrupts.
2687 */ 2839 */
2688static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) 2840static void rcu_sysidle_exit(int irq)
2689{ 2841{
2842 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
2843
2844 /* If there are no nohz_full= CPUs, no need to track this. */
2845 if (!tick_nohz_full_enabled())
2846 return;
2847
2690 /* Adjust nesting, check for already non-idle. */ 2848 /* Adjust nesting, check for already non-idle. */
2691 if (irq) { 2849 if (irq) {
2692 rdtp->dynticks_idle_nesting++; 2850 rdtp->dynticks_idle_nesting++;
@@ -2741,12 +2899,16 @@ static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
2741 unsigned long j; 2899 unsigned long j;
2742 struct rcu_dynticks *rdtp = rdp->dynticks; 2900 struct rcu_dynticks *rdtp = rdp->dynticks;
2743 2901
2902 /* If there are no nohz_full= CPUs, don't check system-wide idleness. */
2903 if (!tick_nohz_full_enabled())
2904 return;
2905
2744 /* 2906 /*
2745 * If some other CPU has already reported non-idle, if this is 2907 * If some other CPU has already reported non-idle, if this is
2746 * not the flavor of RCU that tracks sysidle state, or if this 2908 * not the flavor of RCU that tracks sysidle state, or if this
2747 * is an offline or the timekeeping CPU, nothing to do. 2909 * is an offline or the timekeeping CPU, nothing to do.
2748 */ 2910 */
2749 if (!*isidle || rdp->rsp != rcu_sysidle_state || 2911 if (!*isidle || rdp->rsp != rcu_state_p ||
2750 cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu) 2912 cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu)
2751 return; 2913 return;
2752 if (rcu_gp_in_progress(rdp->rsp)) 2914 if (rcu_gp_in_progress(rdp->rsp))
@@ -2772,7 +2934,7 @@ static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
2772 */ 2934 */
2773static bool is_sysidle_rcu_state(struct rcu_state *rsp) 2935static bool is_sysidle_rcu_state(struct rcu_state *rsp)
2774{ 2936{
2775 return rsp == rcu_sysidle_state; 2937 return rsp == rcu_state_p;
2776} 2938}
2777 2939
2778/* 2940/*
@@ -2850,7 +3012,7 @@ static void rcu_sysidle_cancel(void)
2850static void rcu_sysidle_report(struct rcu_state *rsp, int isidle, 3012static void rcu_sysidle_report(struct rcu_state *rsp, int isidle,
2851 unsigned long maxj, bool gpkt) 3013 unsigned long maxj, bool gpkt)
2852{ 3014{
2853 if (rsp != rcu_sysidle_state) 3015 if (rsp != rcu_state_p)
2854 return; /* Wrong flavor, ignore. */ 3016 return; /* Wrong flavor, ignore. */
2855 if (gpkt && nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) 3017 if (gpkt && nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL)
2856 return; /* Running state machine from timekeeping CPU. */ 3018 return; /* Running state machine from timekeeping CPU. */
@@ -2867,6 +3029,10 @@ static void rcu_sysidle_report(struct rcu_state *rsp, int isidle,
2867static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, 3029static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
2868 unsigned long maxj) 3030 unsigned long maxj)
2869{ 3031{
3032 /* If there are no nohz_full= CPUs, no need to track this. */
3033 if (!tick_nohz_full_enabled())
3034 return;
3035
2870 rcu_sysidle_report(rsp, isidle, maxj, true); 3036 rcu_sysidle_report(rsp, isidle, maxj, true);
2871} 3037}
2872 3038
@@ -2893,7 +3059,8 @@ static void rcu_sysidle_cb(struct rcu_head *rhp)
2893 3059
2894/* 3060/*
2895 * Check to see if the system is fully idle, other than the timekeeping CPU. 3061 * Check to see if the system is fully idle, other than the timekeeping CPU.
2896 * The caller must have disabled interrupts. 3062 * The caller must have disabled interrupts. This is not intended to be
3063 * called unless tick_nohz_full_enabled().
2897 */ 3064 */
2898bool rcu_sys_is_idle(void) 3065bool rcu_sys_is_idle(void)
2899{ 3066{
@@ -2919,13 +3086,12 @@ bool rcu_sys_is_idle(void)
2919 3086
2920 /* Scan all the CPUs looking for nonidle CPUs. */ 3087 /* Scan all the CPUs looking for nonidle CPUs. */
2921 for_each_possible_cpu(cpu) { 3088 for_each_possible_cpu(cpu) {
2922 rdp = per_cpu_ptr(rcu_sysidle_state->rda, cpu); 3089 rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
2923 rcu_sysidle_check_cpu(rdp, &isidle, &maxj); 3090 rcu_sysidle_check_cpu(rdp, &isidle, &maxj);
2924 if (!isidle) 3091 if (!isidle)
2925 break; 3092 break;
2926 } 3093 }
2927 rcu_sysidle_report(rcu_sysidle_state, 3094 rcu_sysidle_report(rcu_state_p, isidle, maxj, false);
2928 isidle, maxj, false);
2929 oldrss = rss; 3095 oldrss = rss;
2930 rss = ACCESS_ONCE(full_sysidle_state); 3096 rss = ACCESS_ONCE(full_sysidle_state);
2931 } 3097 }
@@ -2952,7 +3118,7 @@ bool rcu_sys_is_idle(void)
2952 * provided by the memory allocator. 3118 * provided by the memory allocator.
2953 */ 3119 */
2954 if (nr_cpu_ids > CONFIG_NO_HZ_FULL_SYSIDLE_SMALL && 3120 if (nr_cpu_ids > CONFIG_NO_HZ_FULL_SYSIDLE_SMALL &&
2955 !rcu_gp_in_progress(rcu_sysidle_state) && 3121 !rcu_gp_in_progress(rcu_state_p) &&
2956 !rsh.inuse && xchg(&rsh.inuse, 1) == 0) 3122 !rsh.inuse && xchg(&rsh.inuse, 1) == 0)
2957 call_rcu(&rsh.rh, rcu_sysidle_cb); 3123 call_rcu(&rsh.rh, rcu_sysidle_cb);
2958 return false; 3124 return false;
@@ -2968,11 +3134,11 @@ static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp)
2968 3134
2969#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ 3135#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
2970 3136
2971static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq) 3137static void rcu_sysidle_enter(int irq)
2972{ 3138{
2973} 3139}
2974 3140
2975static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) 3141static void rcu_sysidle_exit(int irq)
2976{ 3142{
2977} 3143}
2978 3144
@@ -3036,3 +3202,19 @@ static void rcu_bind_gp_kthread(void)
3036 housekeeping_affine(current); 3202 housekeeping_affine(current);
3037#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ 3203#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
3038} 3204}
3205
3206/* Record the current task on dyntick-idle entry. */
3207static void rcu_dynticks_task_enter(void)
3208{
3209#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
3210 ACCESS_ONCE(current->rcu_tasks_idle_cpu) = smp_processor_id();
3211#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
3212}
3213
3214/* Record no current task on dyntick-idle exit. */
3215static void rcu_dynticks_task_exit(void)
3216{
3217#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
3218 ACCESS_ONCE(current->rcu_tasks_idle_cpu) = -1;
3219#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
3220}
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 4056d7992a6c..e0d31a345ee6 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -47,6 +47,8 @@
47#include <linux/hardirq.h> 47#include <linux/hardirq.h>
48#include <linux/delay.h> 48#include <linux/delay.h>
49#include <linux/module.h> 49#include <linux/module.h>
50#include <linux/kthread.h>
51#include <linux/tick.h>
50 52
51#define CREATE_TRACE_POINTS 53#define CREATE_TRACE_POINTS
52 54
@@ -91,7 +93,7 @@ void __rcu_read_unlock(void)
91 barrier(); /* critical section before exit code. */ 93 barrier(); /* critical section before exit code. */
92 t->rcu_read_lock_nesting = INT_MIN; 94 t->rcu_read_lock_nesting = INT_MIN;
93 barrier(); /* assign before ->rcu_read_unlock_special load */ 95 barrier(); /* assign before ->rcu_read_unlock_special load */
94 if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) 96 if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special.s)))
95 rcu_read_unlock_special(t); 97 rcu_read_unlock_special(t);
96 barrier(); /* ->rcu_read_unlock_special load before assign */ 98 barrier(); /* ->rcu_read_unlock_special load before assign */
97 t->rcu_read_lock_nesting = 0; 99 t->rcu_read_lock_nesting = 0;
@@ -137,6 +139,38 @@ int notrace debug_lockdep_rcu_enabled(void)
137EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); 139EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled);
138 140
139/** 141/**
142 * rcu_read_lock_held() - might we be in RCU read-side critical section?
143 *
144 * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an RCU
145 * read-side critical section. In absence of CONFIG_DEBUG_LOCK_ALLOC,
146 * this assumes we are in an RCU read-side critical section unless it can
147 * prove otherwise. This is useful for debug checks in functions that
148 * require that they be called within an RCU read-side critical section.
149 *
150 * Checks debug_lockdep_rcu_enabled() to prevent false positives during boot
151 * and while lockdep is disabled.
152 *
153 * Note that rcu_read_lock() and the matching rcu_read_unlock() must
154 * occur in the same context, for example, it is illegal to invoke
155 * rcu_read_unlock() in process context if the matching rcu_read_lock()
156 * was invoked from within an irq handler.
157 *
158 * Note that rcu_read_lock() is disallowed if the CPU is either idle or
159 * offline from an RCU perspective, so check for those as well.
160 */
161int rcu_read_lock_held(void)
162{
163 if (!debug_lockdep_rcu_enabled())
164 return 1;
165 if (!rcu_is_watching())
166 return 0;
167 if (!rcu_lockdep_current_cpu_online())
168 return 0;
169 return lock_is_held(&rcu_lock_map);
170}
171EXPORT_SYMBOL_GPL(rcu_read_lock_held);
172
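
rcu_read_lock_held() is added here (apparently uninlined from the header by this series) so that it can also consult rcu_is_watching() and CPU onlineness; it is intended for debug assertions in functions that require an RCU reader. A minimal sketch of that kind of caller, where struct foo and foo_ptr are hypothetical:

#include <linux/kernel.h>
#include <linux/rcupdate.h>

struct foo {
	int val;
};
static struct foo __rcu *foo_ptr;

static int foo_read_val(void)
{
	struct foo *p;
	int ret = -1;

	WARN_ON_ONCE(!rcu_read_lock_held());	/* caller must be inside rcu_read_lock() */
	p = rcu_dereference(foo_ptr);
	if (p)
		ret = p->val;
	return ret;
}
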
173/**
140 * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section? 174 * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section?
141 * 175 *
142 * Check for bottom half being disabled, which covers both the 176 * Check for bottom half being disabled, which covers both the
@@ -272,7 +306,7 @@ struct debug_obj_descr rcuhead_debug_descr = {
272EXPORT_SYMBOL_GPL(rcuhead_debug_descr); 306EXPORT_SYMBOL_GPL(rcuhead_debug_descr);
273#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ 307#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
274 308
275#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE) 309#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE)
276void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp, 310void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp,
277 unsigned long secs, 311 unsigned long secs,
278 unsigned long c_old, unsigned long c) 312 unsigned long c_old, unsigned long c)
@@ -347,3 +381,397 @@ static int __init check_cpu_stall_init(void)
347early_initcall(check_cpu_stall_init); 381early_initcall(check_cpu_stall_init);
348 382
349#endif /* #ifdef CONFIG_RCU_STALL_COMMON */ 383#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
384
385#ifdef CONFIG_TASKS_RCU
386
387/*
388 * Simple variant of RCU whose quiescent states are voluntary context switch,
389 * user-space execution, and idle. As such, grace periods can take one good
390 * long time. There are no read-side primitives similar to rcu_read_lock()
391 * and rcu_read_unlock() because this implementation is intended to get
392 * the system into a safe state for some of the manipulations involved in
393 * tracing and the like. Finally, this implementation does not support
394 * high call_rcu_tasks() rates from multiple CPUs. If this is required,
395 * per-CPU callback lists will be needed.
396 */
397
398/* Global list of callbacks and associated lock. */
399static struct rcu_head *rcu_tasks_cbs_head;
400static struct rcu_head **rcu_tasks_cbs_tail = &rcu_tasks_cbs_head;
401static DECLARE_WAIT_QUEUE_HEAD(rcu_tasks_cbs_wq);
402static DEFINE_RAW_SPINLOCK(rcu_tasks_cbs_lock);
403
404/* Track exiting tasks in order to allow them to be waited for. */
405DEFINE_SRCU(tasks_rcu_exit_srcu);
406
407/* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */
408static int rcu_task_stall_timeout __read_mostly = HZ * 60 * 10;
409module_param(rcu_task_stall_timeout, int, 0644);
410
411static void rcu_spawn_tasks_kthread(void);
412
413/*
414 * Post an RCU-tasks callback. First call must be from process context
 415 * after the scheduler is fully operational.
416 */
417void call_rcu_tasks(struct rcu_head *rhp, void (*func)(struct rcu_head *rhp))
418{
419 unsigned long flags;
420 bool needwake;
421
422 rhp->next = NULL;
423 rhp->func = func;
424 raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags);
425 needwake = !rcu_tasks_cbs_head;
426 *rcu_tasks_cbs_tail = rhp;
427 rcu_tasks_cbs_tail = &rhp->next;
428 raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags);
429 if (needwake) {
430 rcu_spawn_tasks_kthread();
431 wake_up(&rcu_tasks_cbs_wq);
432 }
433}
434EXPORT_SYMBOL_GPL(call_rcu_tasks);
435
436/**
437 * synchronize_rcu_tasks - wait until an rcu-tasks grace period has elapsed.
438 *
439 * Control will return to the caller some time after a full rcu-tasks
440 * grace period has elapsed, in other words after all currently
441 * executing rcu-tasks read-side critical sections have elapsed. These
442 * read-side critical sections are delimited by calls to schedule(),
443 * cond_resched_rcu_qs(), idle execution, userspace execution, calls
444 * to synchronize_rcu_tasks(), and (in theory, anyway) cond_resched().
445 *
446 * This is a very specialized primitive, intended only for a few uses in
447 * tracing and other situations requiring manipulation of function
448 * preambles and profiling hooks. The synchronize_rcu_tasks() function
449 * is not (yet) intended for heavy use from multiple CPUs.
450 *
451 * Note that this guarantee implies further memory-ordering guarantees.
452 * On systems with more than one CPU, when synchronize_rcu_tasks() returns,
453 * each CPU is guaranteed to have executed a full memory barrier since the
454 * end of its last RCU-tasks read-side critical section whose beginning
455 * preceded the call to synchronize_rcu_tasks(). In addition, each CPU
456 * having an RCU-tasks read-side critical section that extends beyond
457 * the return from synchronize_rcu_tasks() is guaranteed to have executed
458 * a full memory barrier after the beginning of synchronize_rcu_tasks()
459 * and before the beginning of that RCU-tasks read-side critical section.
460 * Note that these guarantees include CPUs that are offline, idle, or
461 * executing in user mode, as well as CPUs that are executing in the kernel.
462 *
463 * Furthermore, if CPU A invoked synchronize_rcu_tasks(), which returned
464 * to its caller on CPU B, then both CPU A and CPU B are guaranteed
465 * to have executed a full memory barrier during the execution of
466 * synchronize_rcu_tasks() -- even if CPU A and CPU B are the same CPU
467 * (but again only if the system has more than one CPU).
468 */
469void synchronize_rcu_tasks(void)
470{
471 /* Complain if the scheduler has not started. */
472 rcu_lockdep_assert(!rcu_scheduler_active,
473 "synchronize_rcu_tasks called too soon");
474
475 /* Wait for the grace period. */
476 wait_rcu_gp(call_rcu_tasks);
477}
478EXPORT_SYMBOL_GPL(synchronize_rcu_tasks);
479
480/**
481 * rcu_barrier_tasks - Wait for in-flight call_rcu_tasks() callbacks.
482 *
483 * Although the current implementation is guaranteed to wait, it is not
484 * obligated to, for example, if there are no pending callbacks.
485 */
486void rcu_barrier_tasks(void)
487{
488 /* There is only one callback queue, so this is easy. ;-) */
489 synchronize_rcu_tasks();
490}
491EXPORT_SYMBOL_GPL(rcu_barrier_tasks);
492
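
The RCU-tasks flavor exists so that things like tracing trampolines can be torn down only after every task has passed through a voluntary context switch, usermode, or idle. A minimal sketch of the intended usage, assuming the call_rcu_tasks() declaration this series adds to rcupdate.h; struct old_trampoline and its fields are hypothetical:

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct old_trampoline {
	struct rcu_head rh;
	void *text;		/* code some task might still be executing */
};

static void old_trampoline_free(struct rcu_head *rhp)
{
	struct old_trampoline *tp = container_of(rhp, struct old_trampoline, rh);

	/* By now no task can still be running in tp->text. */
	kfree(tp);
}

static void retire_trampoline(struct old_trampoline *tp)
{
	/* ...unhook tp->text from every call site first... */
	call_rcu_tasks(&tp->rh, old_trampoline_free);
}
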
493/* See if tasks are still holding out, complain if so. */
494static void check_holdout_task(struct task_struct *t,
495 bool needreport, bool *firstreport)
496{
497 int cpu;
498
499 if (!ACCESS_ONCE(t->rcu_tasks_holdout) ||
500 t->rcu_tasks_nvcsw != ACCESS_ONCE(t->nvcsw) ||
501 !ACCESS_ONCE(t->on_rq) ||
502 (IS_ENABLED(CONFIG_NO_HZ_FULL) &&
503 !is_idle_task(t) && t->rcu_tasks_idle_cpu >= 0)) {
504 ACCESS_ONCE(t->rcu_tasks_holdout) = false;
505 list_del_init(&t->rcu_tasks_holdout_list);
506 put_task_struct(t);
507 return;
508 }
509 if (!needreport)
510 return;
511 if (*firstreport) {
512 pr_err("INFO: rcu_tasks detected stalls on tasks:\n");
513 *firstreport = false;
514 }
515 cpu = task_cpu(t);
516 pr_alert("%p: %c%c nvcsw: %lu/%lu holdout: %d idle_cpu: %d/%d\n",
517 t, ".I"[is_idle_task(t)],
518 "N."[cpu < 0 || !tick_nohz_full_cpu(cpu)],
519 t->rcu_tasks_nvcsw, t->nvcsw, t->rcu_tasks_holdout,
520 t->rcu_tasks_idle_cpu, cpu);
521 sched_show_task(t);
522}
523
524/* RCU-tasks kthread that detects grace periods and invokes callbacks. */
525static int __noreturn rcu_tasks_kthread(void *arg)
526{
527 unsigned long flags;
528 struct task_struct *g, *t;
529 unsigned long lastreport;
530 struct rcu_head *list;
531 struct rcu_head *next;
532 LIST_HEAD(rcu_tasks_holdouts);
533
534 /* Run on housekeeping CPUs by default. Sysadm can move if desired. */
535 housekeeping_affine(current);
536
537 /*
538 * Each pass through the following loop makes one check for
539 * newly arrived callbacks, and, if there are some, waits for
540 * one RCU-tasks grace period and then invokes the callbacks.
541 * This loop is terminated by the system going down. ;-)
542 */
543 for (;;) {
544
545 /* Pick up any new callbacks. */
546 raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags);
547 list = rcu_tasks_cbs_head;
548 rcu_tasks_cbs_head = NULL;
549 rcu_tasks_cbs_tail = &rcu_tasks_cbs_head;
550 raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags);
551
552 /* If there were none, wait a bit and start over. */
553 if (!list) {
554 wait_event_interruptible(rcu_tasks_cbs_wq,
555 rcu_tasks_cbs_head);
556 if (!rcu_tasks_cbs_head) {
557 WARN_ON(signal_pending(current));
558 schedule_timeout_interruptible(HZ/10);
559 }
560 continue;
561 }
562
563 /*
564 * Wait for all pre-existing t->on_rq and t->nvcsw
565 * transitions to complete. Invoking synchronize_sched()
566 * suffices because all these transitions occur with
567 * interrupts disabled. Without this synchronize_sched(),
568 * a read-side critical section that started before the
569 * grace period might be incorrectly seen as having started
570 * after the grace period.
571 *
572 * This synchronize_sched() also dispenses with the
573 * need for a memory barrier on the first store to
574 * ->rcu_tasks_holdout, as it forces the store to happen
575 * after the beginning of the grace period.
576 */
577 synchronize_sched();
578
579 /*
580 * There were callbacks, so we need to wait for an
581 * RCU-tasks grace period. Start off by scanning
582 * the task list for tasks that are not already
583 * voluntarily blocked. Mark these tasks and make
584 * a list of them in rcu_tasks_holdouts.
585 */
586 rcu_read_lock();
587 for_each_process_thread(g, t) {
588 if (t != current && ACCESS_ONCE(t->on_rq) &&
589 !is_idle_task(t)) {
590 get_task_struct(t);
591 t->rcu_tasks_nvcsw = ACCESS_ONCE(t->nvcsw);
592 ACCESS_ONCE(t->rcu_tasks_holdout) = true;
593 list_add(&t->rcu_tasks_holdout_list,
594 &rcu_tasks_holdouts);
595 }
596 }
597 rcu_read_unlock();
598
599 /*
600 * Wait for tasks that are in the process of exiting.
601 * This does only part of the job, ensuring that all
602 * tasks that were previously exiting reach the point
603 * where they have disabled preemption, allowing the
604 * later synchronize_sched() to finish the job.
605 */
606 synchronize_srcu(&tasks_rcu_exit_srcu);
607
608 /*
609 * Each pass through the following loop scans the list
610 * of holdout tasks, removing any that are no longer
611 * holdouts. When the list is empty, we are done.
612 */
613 lastreport = jiffies;
614 while (!list_empty(&rcu_tasks_holdouts)) {
615 bool firstreport;
616 bool needreport;
617 int rtst;
618 struct task_struct *t1;
619
620 schedule_timeout_interruptible(HZ);
621 rtst = ACCESS_ONCE(rcu_task_stall_timeout);
622 needreport = rtst > 0 &&
623 time_after(jiffies, lastreport + rtst);
624 if (needreport)
625 lastreport = jiffies;
626 firstreport = true;
627 WARN_ON(signal_pending(current));
628 list_for_each_entry_safe(t, t1, &rcu_tasks_holdouts,
629 rcu_tasks_holdout_list) {
630 check_holdout_task(t, needreport, &firstreport);
631 cond_resched();
632 }
633 }
634
635 /*
636 * Because ->on_rq and ->nvcsw are not guaranteed
637 * to have a full memory barriers prior to them in the
638 * schedule() path, memory reordering on other CPUs could
639 * cause their RCU-tasks read-side critical sections to
640 * extend past the end of the grace period. However,
641 * because these ->nvcsw updates are carried out with
642 * interrupts disabled, we can use synchronize_sched()
643 * to force the needed ordering on all such CPUs.
644 *
645 * This synchronize_sched() also confines all
646 * ->rcu_tasks_holdout accesses to be within the grace
647 * period, avoiding the need for memory barriers for
648 * ->rcu_tasks_holdout accesses.
649 *
650 * In addition, this synchronize_sched() waits for exiting
651 * tasks to complete their final preempt_disable() region
652 * of execution, cleaning up after the synchronize_srcu()
653 * above.
654 */
655 synchronize_sched();
656
657 /* Invoke the callbacks. */
658 while (list) {
659 next = list->next;
660 local_bh_disable();
661 list->func(list);
662 local_bh_enable();
663 list = next;
664 cond_resched();
665 }
666 schedule_timeout_uninterruptible(HZ/10);
667 }
668}
669
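
For orientation, one pass of the loop above reduces to the following sequence (a paraphrase of the code, not additional behavior):

/*
 *   list = splice out all pending callbacks;
 *   synchronize_sched();                     // order against prior ->on_rq/->nvcsw changes
 *   mark each runnable, non-idle, non-current task as a holdout;
 *   synchronize_srcu(&tasks_rcu_exit_srcu);  // pick up tasks that were mid-exit
 *   while (holdouts remain)
 *           sleep a second, drop tasks that switched, left the runqueue, or
 *           sat in nohz_full usermode, complaining after rcu_task_stall_timeout;
 *   synchronize_sched();                     // close out the grace period
 *   invoke every callback on "list";
 */
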
670/* Spawn rcu_tasks_kthread() at first call to call_rcu_tasks(). */
671static void rcu_spawn_tasks_kthread(void)
672{
673 static DEFINE_MUTEX(rcu_tasks_kthread_mutex);
674 static struct task_struct *rcu_tasks_kthread_ptr;
675 struct task_struct *t;
676
677 if (ACCESS_ONCE(rcu_tasks_kthread_ptr)) {
678 smp_mb(); /* Ensure caller sees full kthread. */
679 return;
680 }
681 mutex_lock(&rcu_tasks_kthread_mutex);
682 if (rcu_tasks_kthread_ptr) {
683 mutex_unlock(&rcu_tasks_kthread_mutex);
684 return;
685 }
686 t = kthread_run(rcu_tasks_kthread, NULL, "rcu_tasks_kthread");
687 BUG_ON(IS_ERR(t));
688 smp_mb(); /* Ensure others see full kthread. */
689 ACCESS_ONCE(rcu_tasks_kthread_ptr) = t;
690 mutex_unlock(&rcu_tasks_kthread_mutex);
691}
692
693#endif /* #ifdef CONFIG_TASKS_RCU */
694
695#ifdef CONFIG_PROVE_RCU
696
697/*
698 * Early boot self test parameters, one for each flavor
699 */
700static bool rcu_self_test;
701static bool rcu_self_test_bh;
702static bool rcu_self_test_sched;
703
704module_param(rcu_self_test, bool, 0444);
705module_param(rcu_self_test_bh, bool, 0444);
706module_param(rcu_self_test_sched, bool, 0444);
707
708static int rcu_self_test_counter;
709
710static void test_callback(struct rcu_head *r)
711{
712 rcu_self_test_counter++;
713 pr_info("RCU test callback executed %d\n", rcu_self_test_counter);
714}
715
716static void early_boot_test_call_rcu(void)
717{
718 static struct rcu_head head;
719
720 call_rcu(&head, test_callback);
721}
722
723static void early_boot_test_call_rcu_bh(void)
724{
725 static struct rcu_head head;
726
727 call_rcu_bh(&head, test_callback);
728}
729
730static void early_boot_test_call_rcu_sched(void)
731{
732 static struct rcu_head head;
733
734 call_rcu_sched(&head, test_callback);
735}
736
737void rcu_early_boot_tests(void)
738{
739 pr_info("Running RCU self tests\n");
740
741 if (rcu_self_test)
742 early_boot_test_call_rcu();
743 if (rcu_self_test_bh)
744 early_boot_test_call_rcu_bh();
745 if (rcu_self_test_sched)
746 early_boot_test_call_rcu_sched();
747}
748
749static int rcu_verify_early_boot_tests(void)
750{
751 int ret = 0;
752 int early_boot_test_counter = 0;
753
754 if (rcu_self_test) {
755 early_boot_test_counter++;
756 rcu_barrier();
757 }
758 if (rcu_self_test_bh) {
759 early_boot_test_counter++;
760 rcu_barrier_bh();
761 }
762 if (rcu_self_test_sched) {
763 early_boot_test_counter++;
764 rcu_barrier_sched();
765 }
766
767 if (rcu_self_test_counter != early_boot_test_counter) {
768 WARN_ON(1);
769 ret = -1;
770 }
771
772 return ret;
773}
774late_initcall(rcu_verify_early_boot_tests);
775#else
776void rcu_early_boot_tests(void) {}
777#endif /* CONFIG_PROVE_RCU */
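Since these three self-test parameters are declared read-only (0444) and live in built-in code, they would normally be enabled on the kernel command line rather than at runtime. A minimal invocation, assuming the file's usual "rcupdate." module-parameter prefix, might look like:

	rcupdate.rcu_self_test=1 rcupdate.rcu_self_test_bh=1 rcupdate.rcu_self_test_sched=1

With CONFIG_PROVE_RCU=y this queues one test callback per selected flavor early in boot, and rcu_verify_early_boot_tests() warns at late_initcall() time if any of them failed to run before the matching rcu_barrier*() call returned.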
diff --git a/kernel/reboot.c b/kernel/reboot.c
index a3a9e240fcdb..5925f5ae8dff 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -104,6 +104,87 @@ int unregister_reboot_notifier(struct notifier_block *nb)
104} 104}
105EXPORT_SYMBOL(unregister_reboot_notifier); 105EXPORT_SYMBOL(unregister_reboot_notifier);
106 106
107/*
108 * Notifier list for kernel code which wants to be called
109 * to restart the system.
110 */
111static ATOMIC_NOTIFIER_HEAD(restart_handler_list);
112
113/**
114 * register_restart_handler - Register function to be called to reset
115 * the system
116 * @nb: Info about handler function to be called
117 * @nb->priority: Handler priority. Handlers should follow the
118 * following guidelines for setting priorities.
119 * 0: Restart handler of last resort,
120 * with limited restart capabilities
121 * 128: Default restart handler; use if no other
122 * restart handler is expected to be available,
123 * and/or if restart functionality is
124 * sufficient to restart the entire system
125 * 255: Highest priority restart handler, will
126 * preempt all other restart handlers
127 *
128 * Registers a function with code to be called to restart the
129 * system.
130 *
131 * Registered functions will be called from machine_restart as last
132 * step of the restart sequence (if the architecture specific
133 * machine_restart function calls do_kernel_restart - see below
134 * for details).
135 * Registered functions are expected to restart the system immediately.
136 * If more than one function is registered, the restart handler priority
137 * selects which function will be called first.
138 *
139 * Restart handlers are expected to be registered from non-architecture
140 * code, typically from drivers. A typical use case would be a system
141 * where restart functionality is provided through a watchdog. Multiple
142 * restart handlers may exist; for example, one restart handler might
143 * restart the entire system, while another only restarts the CPU.
144 * In such cases, the restart handler which only restarts part of the
145 * hardware is expected to register with low priority to ensure that
146 * it only runs if no other means to restart the system is available.
147 *
148 * Currently always returns zero, as atomic_notifier_chain_register()
149 * always returns zero.
150 */
151int register_restart_handler(struct notifier_block *nb)
152{
153 return atomic_notifier_chain_register(&restart_handler_list, nb);
154}
155EXPORT_SYMBOL(register_restart_handler);
156
157/**
158 * unregister_restart_handler - Unregister previously registered
159 * restart handler
160 * @nb: Hook to be unregistered
161 *
162 * Unregisters a previously registered restart handler function.
163 *
164 * Returns zero on success, or %-ENOENT on failure.
165 */
166int unregister_restart_handler(struct notifier_block *nb)
167{
168 return atomic_notifier_chain_unregister(&restart_handler_list, nb);
169}
170EXPORT_SYMBOL(unregister_restart_handler);
171
172/**
173 * do_kernel_restart - Execute kernel restart handler call chain
174 *
175 * Calls functions registered with register_restart_handler.
176 *
177 * Expected to be called from machine_restart as last step of the restart
178 * sequence.
179 *
180 * Restarts the system immediately if a restart handler function has been
181 * registered. Otherwise does nothing.
182 */
183void do_kernel_restart(char *cmd)
184{
185 atomic_notifier_call_chain(&restart_handler_list, reboot_mode, cmd);
186}
187
107void migrate_to_reboot_cpu(void) 188void migrate_to_reboot_cpu(void)
108{ 189{
109 /* The boot cpu is always logical cpu 0 */ 190 /* The boot cpu is always logical cpu 0 */
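A minimal sketch of how a driver might use the restart-handler API introduced above. The my_wdt_* names and the reset action are hypothetical; only the notifier_block layout, the priority guidelines, and register_restart_handler()/unregister_restart_handler() come from the patch.

/* Hypothetical watchdog driver registering a restart handler. */
#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/reboot.h>

static int my_wdt_restart(struct notifier_block *nb, unsigned long mode,
			  void *cmd)
{
	/* Kick the (hypothetical) watchdog hardware into an immediate reset. */
	pr_emerg("my_wdt: forcing system restart\n");
	return NOTIFY_DONE;
}

static struct notifier_block my_wdt_restart_nb = {
	.notifier_call	= my_wdt_restart,
	.priority	= 128,	/* "default restart handler" per the kerneldoc above */
};

static int __init my_wdt_init(void)
{
	return register_restart_handler(&my_wdt_restart_nb);
}

static void __exit my_wdt_exit(void)
{
	unregister_restart_handler(&my_wdt_restart_nb);
}

module_init(my_wdt_init);
module_exit(my_wdt_exit);
MODULE_LICENSE("GPL");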
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
deleted file mode 100644
index e791130f85a7..000000000000
--- a/kernel/res_counter.c
+++ /dev/null
@@ -1,211 +0,0 @@
1/*
2 * resource cgroups
3 *
4 * Copyright 2007 OpenVZ SWsoft Inc
5 *
6 * Author: Pavel Emelianov <xemul@openvz.org>
7 *
8 */
9
10#include <linux/types.h>
11#include <linux/parser.h>
12#include <linux/fs.h>
13#include <linux/res_counter.h>
14#include <linux/uaccess.h>
15#include <linux/mm.h>
16
17void res_counter_init(struct res_counter *counter, struct res_counter *parent)
18{
19 spin_lock_init(&counter->lock);
20 counter->limit = RES_COUNTER_MAX;
21 counter->soft_limit = RES_COUNTER_MAX;
22 counter->parent = parent;
23}
24
25static u64 res_counter_uncharge_locked(struct res_counter *counter,
26 unsigned long val)
27{
28 if (WARN_ON(counter->usage < val))
29 val = counter->usage;
30
31 counter->usage -= val;
32 return counter->usage;
33}
34
35static int res_counter_charge_locked(struct res_counter *counter,
36 unsigned long val, bool force)
37{
38 int ret = 0;
39
40 if (counter->usage + val > counter->limit) {
41 counter->failcnt++;
42 ret = -ENOMEM;
43 if (!force)
44 return ret;
45 }
46
47 counter->usage += val;
48 if (counter->usage > counter->max_usage)
49 counter->max_usage = counter->usage;
50 return ret;
51}
52
53static int __res_counter_charge(struct res_counter *counter, unsigned long val,
54 struct res_counter **limit_fail_at, bool force)
55{
56 int ret, r;
57 unsigned long flags;
58 struct res_counter *c, *u;
59
60 r = ret = 0;
61 *limit_fail_at = NULL;
62 local_irq_save(flags);
63 for (c = counter; c != NULL; c = c->parent) {
64 spin_lock(&c->lock);
65 r = res_counter_charge_locked(c, val, force);
66 spin_unlock(&c->lock);
67 if (r < 0 && !ret) {
68 ret = r;
69 *limit_fail_at = c;
70 if (!force)
71 break;
72 }
73 }
74
75 if (ret < 0 && !force) {
76 for (u = counter; u != c; u = u->parent) {
77 spin_lock(&u->lock);
78 res_counter_uncharge_locked(u, val);
79 spin_unlock(&u->lock);
80 }
81 }
82 local_irq_restore(flags);
83
84 return ret;
85}
86
87int res_counter_charge(struct res_counter *counter, unsigned long val,
88 struct res_counter **limit_fail_at)
89{
90 return __res_counter_charge(counter, val, limit_fail_at, false);
91}
92
93int res_counter_charge_nofail(struct res_counter *counter, unsigned long val,
94 struct res_counter **limit_fail_at)
95{
96 return __res_counter_charge(counter, val, limit_fail_at, true);
97}
98
99u64 res_counter_uncharge_until(struct res_counter *counter,
100 struct res_counter *top,
101 unsigned long val)
102{
103 unsigned long flags;
104 struct res_counter *c;
105 u64 ret = 0;
106
107 local_irq_save(flags);
108 for (c = counter; c != top; c = c->parent) {
109 u64 r;
110 spin_lock(&c->lock);
111 r = res_counter_uncharge_locked(c, val);
112 if (c == counter)
113 ret = r;
114 spin_unlock(&c->lock);
115 }
116 local_irq_restore(flags);
117 return ret;
118}
119
120u64 res_counter_uncharge(struct res_counter *counter, unsigned long val)
121{
122 return res_counter_uncharge_until(counter, NULL, val);
123}
124
125static inline unsigned long long *
126res_counter_member(struct res_counter *counter, int member)
127{
128 switch (member) {
129 case RES_USAGE:
130 return &counter->usage;
131 case RES_MAX_USAGE:
132 return &counter->max_usage;
133 case RES_LIMIT:
134 return &counter->limit;
135 case RES_FAILCNT:
136 return &counter->failcnt;
137 case RES_SOFT_LIMIT:
138 return &counter->soft_limit;
139 };
140
141 BUG();
142 return NULL;
143}
144
145ssize_t res_counter_read(struct res_counter *counter, int member,
146 const char __user *userbuf, size_t nbytes, loff_t *pos,
147 int (*read_strategy)(unsigned long long val, char *st_buf))
148{
149 unsigned long long *val;
150 char buf[64], *s;
151
152 s = buf;
153 val = res_counter_member(counter, member);
154 if (read_strategy)
155 s += read_strategy(*val, s);
156 else
157 s += sprintf(s, "%llu\n", *val);
158 return simple_read_from_buffer((void __user *)userbuf, nbytes,
159 pos, buf, s - buf);
160}
161
162#if BITS_PER_LONG == 32
163u64 res_counter_read_u64(struct res_counter *counter, int member)
164{
165 unsigned long flags;
166 u64 ret;
167
168 spin_lock_irqsave(&counter->lock, flags);
169 ret = *res_counter_member(counter, member);
170 spin_unlock_irqrestore(&counter->lock, flags);
171
172 return ret;
173}
174#else
175u64 res_counter_read_u64(struct res_counter *counter, int member)
176{
177 return *res_counter_member(counter, member);
178}
179#endif
180
181int res_counter_memparse_write_strategy(const char *buf,
182 unsigned long long *resp)
183{
184 char *end;
185 unsigned long long res;
186
187 /* return RES_COUNTER_MAX(unlimited) if "-1" is specified */
188 if (*buf == '-') {
189 int rc = kstrtoull(buf + 1, 10, &res);
190
191 if (rc)
192 return rc;
193 if (res != 1)
194 return -EINVAL;
195 *resp = RES_COUNTER_MAX;
196 return 0;
197 }
198
199 res = memparse(buf, &end);
200 if (*end != '\0')
201 return -EINVAL;
202
203 if (PAGE_ALIGN(res) >= res)
204 res = PAGE_ALIGN(res);
205 else
206 res = RES_COUNTER_MAX;
207
208 *resp = res;
209
210 return 0;
211}
diff --git a/kernel/resource.c b/kernel/resource.c
index 60c5a3856ab7..0bcebffc4e77 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -491,6 +491,42 @@ int __weak page_is_ram(unsigned long pfn)
491} 491}
492EXPORT_SYMBOL_GPL(page_is_ram); 492EXPORT_SYMBOL_GPL(page_is_ram);
493 493
494/*
 495 * Search for a resource entry that fully contains the specified region.
 496 * If found, return 1 if it is RAM, 0 if not.
 497 * If not found, or if the region is not fully contained, return -1.
 498 *
 499 * Used by the ioremap functions to ensure that the user is not remapping RAM;
 500 * it is a vast speedup over walking through the resource table page by page.
501 */
502int region_is_ram(resource_size_t start, unsigned long size)
503{
504 struct resource *p;
505 resource_size_t end = start + size - 1;
506 int flags = IORESOURCE_MEM | IORESOURCE_BUSY;
507 const char *name = "System RAM";
508 int ret = -1;
509
510 read_lock(&resource_lock);
511 for (p = iomem_resource.child; p ; p = p->sibling) {
512 if (end < p->start)
513 continue;
514
515 if (p->start <= start && end <= p->end) {
516 /* resource fully contains region */
517 if ((p->flags != flags) || strcmp(p->name, name))
518 ret = 0;
519 else
520 ret = 1;
521 break;
522 }
523 if (p->end < start)
524 break; /* not found */
525 }
526 read_unlock(&resource_lock);
527 return ret;
528}
529
494void __weak arch_remove_reservations(struct resource *avail) 530void __weak arch_remove_reservations(struct resource *avail)
495{ 531{
496} 532}
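A sketch of the calling pattern region_is_ram() is intended for, e.g. in an architecture's ioremap path. The wrapper below is hypothetical and assumes region_is_ram() is declared in a header visible to the caller; only the return convention (1 = System RAM, 0 = not RAM, -1 = not fully contained in one resource) comes from the function above.

#include <linux/io.h>
#include <linux/kernel.h>

static void __iomem *my_ioremap_checked(resource_size_t phys_addr,
					unsigned long size)
{
	int ram = region_is_ram(phys_addr, size);

	if (ram == 1) {
		/* Region is known System RAM: refuse to remap it. */
		WARN_ONCE(1, "refusing to ioremap RAM at %pa\n", &phys_addr);
		return NULL;
	}
	/* ram == 0 or -1: fall back to the normal remapping path. */
	return ioremap(phys_addr, size);
}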
@@ -1245,6 +1281,76 @@ int release_mem_region_adjustable(struct resource *parent,
1245/* 1281/*
1246 * Managed region resource 1282 * Managed region resource
1247 */ 1283 */
1284static void devm_resource_release(struct device *dev, void *ptr)
1285{
1286 struct resource **r = ptr;
1287
1288 release_resource(*r);
1289}
1290
1291/**
1292 * devm_request_resource() - request and reserve an I/O or memory resource
1293 * @dev: device for which to request the resource
1294 * @root: root of the resource tree from which to request the resource
1295 * @new: descriptor of the resource to request
1296 *
1297 * This is a device-managed version of request_resource(). There is usually
1298 * no need to release resources requested by this function explicitly since
1299 * that will be taken care of when the device is unbound from its driver.
1300 * If for some reason the resource needs to be released explicitly, because
1301 * of ordering issues for example, drivers must call devm_release_resource()
1302 * rather than the regular release_resource().
1303 *
1304 * When a conflict is detected between any existing resources and the newly
1305 * requested resource, an error message will be printed.
1306 *
1307 * Returns 0 on success or a negative error code on failure.
1308 */
1309int devm_request_resource(struct device *dev, struct resource *root,
1310 struct resource *new)
1311{
1312 struct resource *conflict, **ptr;
1313
1314 ptr = devres_alloc(devm_resource_release, sizeof(*ptr), GFP_KERNEL);
1315 if (!ptr)
1316 return -ENOMEM;
1317
1318 *ptr = new;
1319
1320 conflict = request_resource_conflict(root, new);
1321 if (conflict) {
1322 dev_err(dev, "resource collision: %pR conflicts with %s %pR\n",
1323 new, conflict->name, conflict);
1324 devres_free(ptr);
1325 return -EBUSY;
1326 }
1327
1328 devres_add(dev, ptr);
1329 return 0;
1330}
1331EXPORT_SYMBOL(devm_request_resource);
1332
1333static int devm_resource_match(struct device *dev, void *res, void *data)
1334{
1335 struct resource **ptr = res;
1336
1337 return *ptr == data;
1338}
1339
1340/**
1341 * devm_release_resource() - release a previously requested resource
1342 * @dev: device for which to release the resource
1343 * @new: descriptor of the resource to release
1344 *
1345 * Releases a resource previously requested using devm_request_resource().
1346 */
1347void devm_release_resource(struct device *dev, struct resource *new)
1348{
1349 WARN_ON(devres_release(dev, devm_resource_release, devm_resource_match,
1350 new));
1351}
1352EXPORT_SYMBOL(devm_release_resource);
1353
1248struct region_devres { 1354struct region_devres {
1249 struct resource *parent; 1355 struct resource *parent;
1250 resource_size_t start; 1356 resource_size_t start;
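A minimal sketch of devm_request_resource() in a platform driver probe. The driver and the register window are hypothetical; the devm calls and their error behaviour follow the kerneldoc above.

#include <linux/device.h>
#include <linux/ioport.h>
#include <linux/platform_device.h>

/* Hypothetical fixed register window for the example. */
static struct resource my_reg_res = DEFINE_RES_MEM(0x10000000, 0x1000);

static int my_probe(struct platform_device *pdev)
{
	int ret;

	/* Reserved against iomem_resource; released automatically on unbind. */
	ret = devm_request_resource(&pdev->dev, &iomem_resource, &my_reg_res);
	if (ret)
		return ret;	/* -EBUSY on conflict, -ENOMEM on allocation failure */

	/* ... map and use the region ... */
	return 0;
}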
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index e73efba98301..8a2e230fb86a 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -148,11 +148,8 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
148 if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled)) 148 if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled))
149 goto out; 149 goto out;
150 150
151 t = p; 151 for_each_thread(p, t)
152 do {
153 sched_move_task(t); 152 sched_move_task(t);
154 } while_each_thread(p, t);
155
156out: 153out:
157 unlock_task_sighand(p, &flags); 154 unlock_task_sighand(p, &flags);
158 autogroup_kref_put(prev); 155 autogroup_kref_put(prev);
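The iteration idiom the hunk above switches to, shown in isolation: visit every thread in the group led by p. The per-thread callback is hypothetical; for_each_thread() needs RCU (or tasklist/sighand locking), just as autogroup_move_group() holds the sighand lock around it.

#include <linux/rcupdate.h>
#include <linux/sched.h>

extern void my_per_thread_hook(struct task_struct *t);	/* hypothetical */

static void my_visit_threads(struct task_struct *p)
{
	struct task_struct *t;

	rcu_read_lock();
	for_each_thread(p, t)
		my_per_thread_hook(t);
	rcu_read_unlock();
}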
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index 3ef6451e972e..c27e4f8f4879 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -134,7 +134,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);
134 134
135static inline struct sched_clock_data *this_scd(void) 135static inline struct sched_clock_data *this_scd(void)
136{ 136{
137 return &__get_cpu_var(sched_clock_data); 137 return this_cpu_ptr(&sched_clock_data);
138} 138}
139 139
140static inline struct sched_clock_data *cpu_sdc(int cpu) 140static inline struct sched_clock_data *cpu_sdc(int cpu)
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index a63f4dc27909..607f852b4d04 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -148,7 +148,7 @@ EXPORT_SYMBOL(wait_for_completion_timeout);
148 * 148 *
149 * This waits to be signaled for completion of a specific task. It is NOT 149 * This waits to be signaled for completion of a specific task. It is NOT
150 * interruptible and there is no timeout. The caller is accounted as waiting 150 * interruptible and there is no timeout. The caller is accounted as waiting
151 * for IO. 151 * for IO (which traditionally means blkio only).
152 */ 152 */
153void __sched wait_for_completion_io(struct completion *x) 153void __sched wait_for_completion_io(struct completion *x)
154{ 154{
@@ -163,7 +163,8 @@ EXPORT_SYMBOL(wait_for_completion_io);
163 * 163 *
164 * This waits for either a completion of a specific task to be signaled or for a 164 * This waits for either a completion of a specific task to be signaled or for a
165 * specified timeout to expire. The timeout is in jiffies. It is not 165 * specified timeout to expire. The timeout is in jiffies. It is not
166 * interruptible. The caller is accounted as waiting for IO. 166 * interruptible. The caller is accounted as waiting for IO (which traditionally
167 * means blkio only).
167 * 168 *
168 * Return: 0 if timed out, and positive (at least 1, or number of jiffies left 169 * Return: 0 if timed out, and positive (at least 1, or number of jiffies left
169 * till timeout) if completed. 170 * till timeout) if completed.
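A sketch of the wait-accounted-as-IO pattern the comments above describe. The request/IRQ plumbing is hypothetical; DECLARE_COMPLETION(), complete() and wait_for_completion_io() are the real API.

#include <linux/completion.h>

static DECLARE_COMPLETION(my_request_done);

/* Called when the (hypothetical) hardware finishes, e.g. from an IRQ handler. */
static void my_request_irq_done(void)
{
	complete(&my_request_done);
}

static void my_submit_and_wait(void)
{
	/* ... submit the hypothetical request to the device ... */

	/* Caller sleeps in iowait (traditionally blkio accounting). */
	wait_for_completion_io(&my_request_done);
}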
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ec1a286684a5..c0accc00566e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -90,22 +90,6 @@
90#define CREATE_TRACE_POINTS 90#define CREATE_TRACE_POINTS
91#include <trace/events/sched.h> 91#include <trace/events/sched.h>
92 92
93#ifdef smp_mb__before_atomic
94void __smp_mb__before_atomic(void)
95{
96 smp_mb__before_atomic();
97}
98EXPORT_SYMBOL(__smp_mb__before_atomic);
99#endif
100
101#ifdef smp_mb__after_atomic
102void __smp_mb__after_atomic(void)
103{
104 smp_mb__after_atomic();
105}
106EXPORT_SYMBOL(__smp_mb__after_atomic);
107#endif
108
109void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) 93void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
110{ 94{
111 unsigned long delta; 95 unsigned long delta;
@@ -333,9 +317,12 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
333 for (;;) { 317 for (;;) {
334 rq = task_rq(p); 318 rq = task_rq(p);
335 raw_spin_lock(&rq->lock); 319 raw_spin_lock(&rq->lock);
336 if (likely(rq == task_rq(p))) 320 if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
337 return rq; 321 return rq;
338 raw_spin_unlock(&rq->lock); 322 raw_spin_unlock(&rq->lock);
323
324 while (unlikely(task_on_rq_migrating(p)))
325 cpu_relax();
339 } 326 }
340} 327}
341 328
@@ -352,10 +339,13 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
352 raw_spin_lock_irqsave(&p->pi_lock, *flags); 339 raw_spin_lock_irqsave(&p->pi_lock, *flags);
353 rq = task_rq(p); 340 rq = task_rq(p);
354 raw_spin_lock(&rq->lock); 341 raw_spin_lock(&rq->lock);
355 if (likely(rq == task_rq(p))) 342 if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
356 return rq; 343 return rq;
357 raw_spin_unlock(&rq->lock); 344 raw_spin_unlock(&rq->lock);
358 raw_spin_unlock_irqrestore(&p->pi_lock, *flags); 345 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
346
347 while (unlikely(task_on_rq_migrating(p)))
348 cpu_relax();
359 } 349 }
360} 350}
361 351
@@ -449,7 +439,15 @@ static void __hrtick_start(void *arg)
449void hrtick_start(struct rq *rq, u64 delay) 439void hrtick_start(struct rq *rq, u64 delay)
450{ 440{
451 struct hrtimer *timer = &rq->hrtick_timer; 441 struct hrtimer *timer = &rq->hrtick_timer;
452 ktime_t time = ktime_add_ns(timer->base->get_time(), delay); 442 ktime_t time;
443 s64 delta;
444
445 /*
 446	 * Don't schedule slices shorter than 10000ns; that just
447 * doesn't make sense and can cause timer DoS.
448 */
449 delta = max_t(s64, delay, 10000LL);
450 time = ktime_add_ns(timer->base->get_time(), delta);
453 451
454 hrtimer_set_expires(timer, time); 452 hrtimer_set_expires(timer, time);
455 453
@@ -1010,6 +1008,9 @@ inline int task_curr(const struct task_struct *p)
1010 return cpu_curr(task_cpu(p)) == p; 1008 return cpu_curr(task_cpu(p)) == p;
1011} 1009}
1012 1010
1011/*
 1012 * Can drop rq->lock because sched_class::switched_from() methods may drop it.
1013 */
1013static inline void check_class_changed(struct rq *rq, struct task_struct *p, 1014static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1014 const struct sched_class *prev_class, 1015 const struct sched_class *prev_class,
1015 int oldprio) 1016 int oldprio)
@@ -1017,6 +1018,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1017 if (prev_class != p->sched_class) { 1018 if (prev_class != p->sched_class) {
1018 if (prev_class->switched_from) 1019 if (prev_class->switched_from)
1019 prev_class->switched_from(rq, p); 1020 prev_class->switched_from(rq, p);
 1021		/* Possible rq->lock 'hole'. */
1020 p->sched_class->switched_to(rq, p); 1022 p->sched_class->switched_to(rq, p);
1021 } else if (oldprio != p->prio || dl_task(p)) 1023 } else if (oldprio != p->prio || dl_task(p))
1022 p->sched_class->prio_changed(rq, p, oldprio); 1024 p->sched_class->prio_changed(rq, p, oldprio);
@@ -1043,7 +1045,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
1043 * A queue event has occurred, and we're going to schedule. In 1045 * A queue event has occurred, and we're going to schedule. In
1044 * this case, we can save a useless back to back clock update. 1046 * this case, we can save a useless back to back clock update.
1045 */ 1047 */
1046 if (rq->curr->on_rq && test_tsk_need_resched(rq->curr)) 1048 if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
1047 rq->skip_clock_update = 1; 1049 rq->skip_clock_update = 1;
1048} 1050}
1049 1051
@@ -1056,7 +1058,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1056 * ttwu() will sort out the placement. 1058 * ttwu() will sort out the placement.
1057 */ 1059 */
1058 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && 1060 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
1059 !(task_preempt_count(p) & PREEMPT_ACTIVE)); 1061 !p->on_rq);
1060 1062
1061#ifdef CONFIG_LOCKDEP 1063#ifdef CONFIG_LOCKDEP
1062 /* 1064 /*
@@ -1088,7 +1090,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1088 1090
1089static void __migrate_swap_task(struct task_struct *p, int cpu) 1091static void __migrate_swap_task(struct task_struct *p, int cpu)
1090{ 1092{
1091 if (p->on_rq) { 1093 if (task_on_rq_queued(p)) {
1092 struct rq *src_rq, *dst_rq; 1094 struct rq *src_rq, *dst_rq;
1093 1095
1094 src_rq = task_rq(p); 1096 src_rq = task_rq(p);
@@ -1214,7 +1216,7 @@ static int migration_cpu_stop(void *data);
1214unsigned long wait_task_inactive(struct task_struct *p, long match_state) 1216unsigned long wait_task_inactive(struct task_struct *p, long match_state)
1215{ 1217{
1216 unsigned long flags; 1218 unsigned long flags;
1217 int running, on_rq; 1219 int running, queued;
1218 unsigned long ncsw; 1220 unsigned long ncsw;
1219 struct rq *rq; 1221 struct rq *rq;
1220 1222
@@ -1252,7 +1254,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
1252 rq = task_rq_lock(p, &flags); 1254 rq = task_rq_lock(p, &flags);
1253 trace_sched_wait_task(p); 1255 trace_sched_wait_task(p);
1254 running = task_running(rq, p); 1256 running = task_running(rq, p);
1255 on_rq = p->on_rq; 1257 queued = task_on_rq_queued(p);
1256 ncsw = 0; 1258 ncsw = 0;
1257 if (!match_state || p->state == match_state) 1259 if (!match_state || p->state == match_state)
1258 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ 1260 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
@@ -1284,7 +1286,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
1284 * running right now), it's preempted, and we should 1286 * running right now), it's preempted, and we should
1285 * yield - it could be a while. 1287 * yield - it could be a while.
1286 */ 1288 */
1287 if (unlikely(on_rq)) { 1289 if (unlikely(queued)) {
1288 ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ); 1290 ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
1289 1291
1290 set_current_state(TASK_UNINTERRUPTIBLE); 1292 set_current_state(TASK_UNINTERRUPTIBLE);
@@ -1409,7 +1411,8 @@ out:
1409static inline 1411static inline
1410int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) 1412int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
1411{ 1413{
1412 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); 1414 if (p->nr_cpus_allowed > 1)
1415 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
1413 1416
1414 /* 1417 /*
1415 * In order not to call set_task_cpu() on a blocking task we need 1418 * In order not to call set_task_cpu() on a blocking task we need
@@ -1478,7 +1481,7 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
1478static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) 1481static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
1479{ 1482{
1480 activate_task(rq, p, en_flags); 1483 activate_task(rq, p, en_flags);
1481 p->on_rq = 1; 1484 p->on_rq = TASK_ON_RQ_QUEUED;
1482 1485
1483 /* if a worker is waking up, notify workqueue */ 1486 /* if a worker is waking up, notify workqueue */
1484 if (p->flags & PF_WQ_WORKER) 1487 if (p->flags & PF_WQ_WORKER)
@@ -1537,7 +1540,7 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
1537 int ret = 0; 1540 int ret = 0;
1538 1541
1539 rq = __task_rq_lock(p); 1542 rq = __task_rq_lock(p);
1540 if (p->on_rq) { 1543 if (task_on_rq_queued(p)) {
1541 /* check_preempt_curr() may use rq clock */ 1544 /* check_preempt_curr() may use rq clock */
1542 update_rq_clock(rq); 1545 update_rq_clock(rq);
1543 ttwu_do_wakeup(rq, p, wake_flags); 1546 ttwu_do_wakeup(rq, p, wake_flags);
@@ -1620,6 +1623,30 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu)
1620 } 1623 }
1621} 1624}
1622 1625
1626void wake_up_if_idle(int cpu)
1627{
1628 struct rq *rq = cpu_rq(cpu);
1629 unsigned long flags;
1630
1631 rcu_read_lock();
1632
1633 if (!is_idle_task(rcu_dereference(rq->curr)))
1634 goto out;
1635
1636 if (set_nr_if_polling(rq->idle)) {
1637 trace_sched_wake_idle_without_ipi(cpu);
1638 } else {
1639 raw_spin_lock_irqsave(&rq->lock, flags);
1640 if (is_idle_task(rq->curr))
1641 smp_send_reschedule(cpu);
 1642			/* Else the CPU is not idle, do nothing here */
1643 raw_spin_unlock_irqrestore(&rq->lock, flags);
1644 }
1645
1646out:
1647 rcu_read_unlock();
1648}
1649
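A sketch of the intended use of the new wake_up_if_idle(): after queueing deferred work for a remote CPU, poke that CPU only if it is sitting idle, so a busy CPU is never interrupted just to notice the new work. The enqueue helper is hypothetical, and the sketch assumes the declaration added elsewhere in this series is visible.

#include <linux/sched.h>

extern void my_enqueue_work_on(int cpu);	/* hypothetical lock-free enqueue */

static void my_kick_remote(int cpu)
{
	my_enqueue_work_on(cpu);
	wake_up_if_idle(cpu);	/* no-op unless the remote CPU is idle */
}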
1623bool cpus_share_cache(int this_cpu, int that_cpu) 1650bool cpus_share_cache(int this_cpu, int that_cpu)
1624{ 1651{
1625 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); 1652 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
@@ -1742,7 +1769,7 @@ static void try_to_wake_up_local(struct task_struct *p)
1742 if (!(p->state & TASK_NORMAL)) 1769 if (!(p->state & TASK_NORMAL))
1743 goto out; 1770 goto out;
1744 1771
1745 if (!p->on_rq) 1772 if (!task_on_rq_queued(p))
1746 ttwu_activate(rq, p, ENQUEUE_WAKEUP); 1773 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
1747 1774
1748 ttwu_do_wakeup(rq, p, 0); 1775 ttwu_do_wakeup(rq, p, 0);
@@ -1776,6 +1803,20 @@ int wake_up_state(struct task_struct *p, unsigned int state)
1776} 1803}
1777 1804
1778/* 1805/*
1806 * This function clears the sched_dl_entity static params.
1807 */
1808void __dl_clear_params(struct task_struct *p)
1809{
1810 struct sched_dl_entity *dl_se = &p->dl;
1811
1812 dl_se->dl_runtime = 0;
1813 dl_se->dl_deadline = 0;
1814 dl_se->dl_period = 0;
1815 dl_se->flags = 0;
1816 dl_se->dl_bw = 0;
1817}
1818
1819/*
1779 * Perform scheduler related setup for a newly forked process p. 1820 * Perform scheduler related setup for a newly forked process p.
1780 * p is forked by current. 1821 * p is forked by current.
1781 * 1822 *
@@ -1799,10 +1840,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
1799 1840
1800 RB_CLEAR_NODE(&p->dl.rb_node); 1841 RB_CLEAR_NODE(&p->dl.rb_node);
1801 hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 1842 hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1802 p->dl.dl_runtime = p->dl.runtime = 0; 1843 __dl_clear_params(p);
1803 p->dl.dl_deadline = p->dl.deadline = 0;
1804 p->dl.dl_period = 0;
1805 p->dl.flags = 0;
1806 1844
1807 INIT_LIST_HEAD(&p->rt.run_list); 1845 INIT_LIST_HEAD(&p->rt.run_list);
1808 1846
@@ -1825,12 +1863,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
1825 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; 1863 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
1826 p->numa_scan_period = sysctl_numa_balancing_scan_delay; 1864 p->numa_scan_period = sysctl_numa_balancing_scan_delay;
1827 p->numa_work.next = &p->numa_work; 1865 p->numa_work.next = &p->numa_work;
1828 p->numa_faults_memory = NULL; 1866 p->numa_faults = NULL;
1829 p->numa_faults_buffer_memory = NULL;
1830 p->last_task_numa_placement = 0; 1867 p->last_task_numa_placement = 0;
1831 p->last_sum_exec_runtime = 0; 1868 p->last_sum_exec_runtime = 0;
1832 1869
1833 INIT_LIST_HEAD(&p->numa_entry);
1834 p->numa_group = NULL; 1870 p->numa_group = NULL;
1835#endif /* CONFIG_NUMA_BALANCING */ 1871#endif /* CONFIG_NUMA_BALANCING */
1836} 1872}
@@ -1977,6 +2013,8 @@ unsigned long to_ratio(u64 period, u64 runtime)
1977#ifdef CONFIG_SMP 2013#ifdef CONFIG_SMP
1978inline struct dl_bw *dl_bw_of(int i) 2014inline struct dl_bw *dl_bw_of(int i)
1979{ 2015{
2016 rcu_lockdep_assert(rcu_read_lock_sched_held(),
2017 "sched RCU must be held");
1980 return &cpu_rq(i)->rd->dl_bw; 2018 return &cpu_rq(i)->rd->dl_bw;
1981} 2019}
1982 2020
@@ -1985,6 +2023,8 @@ static inline int dl_bw_cpus(int i)
1985 struct root_domain *rd = cpu_rq(i)->rd; 2023 struct root_domain *rd = cpu_rq(i)->rd;
1986 int cpus = 0; 2024 int cpus = 0;
1987 2025
2026 rcu_lockdep_assert(rcu_read_lock_sched_held(),
2027 "sched RCU must be held");
1988 for_each_cpu_and(i, rd->span, cpu_active_mask) 2028 for_each_cpu_and(i, rd->span, cpu_active_mask)
1989 cpus++; 2029 cpus++;
1990 2030
@@ -2002,25 +2042,6 @@ static inline int dl_bw_cpus(int i)
2002} 2042}
2003#endif 2043#endif
2004 2044
2005static inline
2006void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
2007{
2008 dl_b->total_bw -= tsk_bw;
2009}
2010
2011static inline
2012void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
2013{
2014 dl_b->total_bw += tsk_bw;
2015}
2016
2017static inline
2018bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
2019{
2020 return dl_b->bw != -1 &&
2021 dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
2022}
2023
2024/* 2045/*
2025 * We must be sure that accepting a new task (or allowing changing the 2046 * We must be sure that accepting a new task (or allowing changing the
2026 * parameters of an existing one) is consistent with the bandwidth 2047 * parameters of an existing one) is consistent with the bandwidth
@@ -2095,7 +2116,7 @@ void wake_up_new_task(struct task_struct *p)
2095 init_task_runnable_average(p); 2116 init_task_runnable_average(p);
2096 rq = __task_rq_lock(p); 2117 rq = __task_rq_lock(p);
2097 activate_task(rq, p, 0); 2118 activate_task(rq, p, 0);
2098 p->on_rq = 1; 2119 p->on_rq = TASK_ON_RQ_QUEUED;
2099 trace_sched_wakeup_new(p, true); 2120 trace_sched_wakeup_new(p, true);
2100 check_preempt_curr(rq, p, WF_FORK); 2121 check_preempt_curr(rq, p, WF_FORK);
2101#ifdef CONFIG_SMP 2122#ifdef CONFIG_SMP
@@ -2188,7 +2209,6 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
2188 2209
2189/** 2210/**
2190 * finish_task_switch - clean up after a task-switch 2211 * finish_task_switch - clean up after a task-switch
2191 * @rq: runqueue associated with task-switch
2192 * @prev: the thread we just switched away from. 2212 * @prev: the thread we just switched away from.
2193 * 2213 *
2194 * finish_task_switch must be called after the context switch, paired 2214 * finish_task_switch must be called after the context switch, paired
@@ -2200,10 +2220,16 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
2200 * so, we finish that here outside of the runqueue lock. (Doing it 2220 * so, we finish that here outside of the runqueue lock. (Doing it
2201 * with the lock held can cause deadlocks; see schedule() for 2221 * with the lock held can cause deadlocks; see schedule() for
2202 * details.) 2222 * details.)
2223 *
 2224 * The context switch has flipped the stack from under us and restored the
2225 * local variables which were saved when this task called schedule() in the
2226 * past. prev == current is still correct but we need to recalculate this_rq
2227 * because prev may have moved to another CPU.
2203 */ 2228 */
2204static void finish_task_switch(struct rq *rq, struct task_struct *prev) 2229static struct rq *finish_task_switch(struct task_struct *prev)
2205 __releases(rq->lock) 2230 __releases(rq->lock)
2206{ 2231{
2232 struct rq *rq = this_rq();
2207 struct mm_struct *mm = rq->prev_mm; 2233 struct mm_struct *mm = rq->prev_mm;
2208 long prev_state; 2234 long prev_state;
2209 2235
@@ -2243,6 +2269,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2243 } 2269 }
2244 2270
2245 tick_nohz_task_switch(current); 2271 tick_nohz_task_switch(current);
2272 return rq;
2246} 2273}
2247 2274
2248#ifdef CONFIG_SMP 2275#ifdef CONFIG_SMP
@@ -2277,29 +2304,22 @@ static inline void post_schedule(struct rq *rq)
2277asmlinkage __visible void schedule_tail(struct task_struct *prev) 2304asmlinkage __visible void schedule_tail(struct task_struct *prev)
2278 __releases(rq->lock) 2305 __releases(rq->lock)
2279{ 2306{
2280 struct rq *rq = this_rq(); 2307 struct rq *rq;
2281
2282 finish_task_switch(rq, prev);
2283 2308
2284	/* 2309	/* finish_task_switch() drops rq->lock and enables preemption */
2285 * FIXME: do we need to worry about rq being invalidated by the 2310 preempt_disable();
2286 * task_switch? 2311 rq = finish_task_switch(prev);
2287 */
2288 post_schedule(rq); 2312 post_schedule(rq);
2289
2290#ifdef __ARCH_WANT_UNLOCKED_CTXSW
2291 /* In this case, finish_task_switch does not reenable preemption */
2292 preempt_enable(); 2313 preempt_enable();
2293#endif 2314
2294 if (current->set_child_tid) 2315 if (current->set_child_tid)
2295 put_user(task_pid_vnr(current), current->set_child_tid); 2316 put_user(task_pid_vnr(current), current->set_child_tid);
2296} 2317}
2297 2318
2298/* 2319/*
2299 * context_switch - switch to the new MM and the new 2320 * context_switch - switch to the new MM and the new thread's register state.
2300 * thread's register state.
2301 */ 2321 */
2302static inline void 2322static inline struct rq *
2303context_switch(struct rq *rq, struct task_struct *prev, 2323context_switch(struct rq *rq, struct task_struct *prev,
2304 struct task_struct *next) 2324 struct task_struct *next)
2305{ 2325{
@@ -2333,21 +2353,14 @@ context_switch(struct rq *rq, struct task_struct *prev,
2333 * of the scheduler it's an obvious special-case), so we 2353 * of the scheduler it's an obvious special-case), so we
2334 * do an early lockdep release here: 2354 * do an early lockdep release here:
2335 */ 2355 */
2336#ifndef __ARCH_WANT_UNLOCKED_CTXSW
2337 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 2356 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
2338#endif
2339 2357
2340 context_tracking_task_switch(prev, next); 2358 context_tracking_task_switch(prev, next);
2341 /* Here we just switch the register state and the stack. */ 2359 /* Here we just switch the register state and the stack. */
2342 switch_to(prev, next, prev); 2360 switch_to(prev, next, prev);
2343
2344 barrier(); 2361 barrier();
2345 /* 2362
2346 * this_rq must be evaluated again because prev may have moved 2363 return finish_task_switch(prev);
2347 * CPUs since it called schedule(), thus the 'rq' on its stack
2348 * frame will be invalid.
2349 */
2350 finish_task_switch(this_rq(), prev);
2351} 2364}
2352 2365
2353/* 2366/*
@@ -2366,6 +2379,18 @@ unsigned long nr_running(void)
2366 return sum; 2379 return sum;
2367} 2380}
2368 2381
2382/*
2383 * Check if only the current task is running on the cpu.
2384 */
2385bool single_task_running(void)
2386{
2387 if (cpu_rq(smp_processor_id())->nr_running == 1)
2388 return true;
2389 else
2390 return false;
2391}
2392EXPORT_SYMBOL(single_task_running);
2393
2369unsigned long long nr_context_switches(void) 2394unsigned long long nr_context_switches(void)
2370{ 2395{
2371 int i; 2396 int i;
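A sketch of the kind of check single_task_running() enables: busy-poll for an event only while this CPU has nothing else runnable. The event helper is hypothetical; single_task_running() is the export added above, and the sketch assumes its declaration is visible to the caller.

#include <linux/sched.h>

extern bool my_event_arrived(void);	/* hypothetical */

static void my_poll_for_event(void)
{
	while (!my_event_arrived()) {
		if (!single_task_running() || need_resched()) {
			cond_resched();		/* another task wants the CPU */
			continue;
		}
		cpu_relax();			/* sole runner: keep polling */
	}
}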
@@ -2437,44 +2462,6 @@ EXPORT_PER_CPU_SYMBOL(kstat);
2437EXPORT_PER_CPU_SYMBOL(kernel_cpustat); 2462EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
2438 2463
2439/* 2464/*
2440 * Return any ns on the sched_clock that have not yet been accounted in
2441 * @p in case that task is currently running.
2442 *
2443 * Called with task_rq_lock() held on @rq.
2444 */
2445static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
2446{
2447 u64 ns = 0;
2448
2449 /*
2450 * Must be ->curr _and_ ->on_rq. If dequeued, we would
2451 * project cycles that may never be accounted to this
2452 * thread, breaking clock_gettime().
2453 */
2454 if (task_current(rq, p) && p->on_rq) {
2455 update_rq_clock(rq);
2456 ns = rq_clock_task(rq) - p->se.exec_start;
2457 if ((s64)ns < 0)
2458 ns = 0;
2459 }
2460
2461 return ns;
2462}
2463
2464unsigned long long task_delta_exec(struct task_struct *p)
2465{
2466 unsigned long flags;
2467 struct rq *rq;
2468 u64 ns = 0;
2469
2470 rq = task_rq_lock(p, &flags);
2471 ns = do_task_delta_exec(p, rq);
2472 task_rq_unlock(rq, p, &flags);
2473
2474 return ns;
2475}
2476
2477/*
2478 * Return accounted runtime for the task. 2465 * Return accounted runtime for the task.
2479 * In case the task is currently running, return the runtime plus current's 2466 * In case the task is currently running, return the runtime plus current's
2480 * pending runtime that have not been accounted yet. 2467 * pending runtime that have not been accounted yet.
@@ -2483,7 +2470,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
2483{ 2470{
2484 unsigned long flags; 2471 unsigned long flags;
2485 struct rq *rq; 2472 struct rq *rq;
2486 u64 ns = 0; 2473 u64 ns;
2487 2474
2488#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) 2475#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
2489 /* 2476 /*
@@ -2497,12 +2484,21 @@ unsigned long long task_sched_runtime(struct task_struct *p)
2497 * If we see ->on_cpu without ->on_rq, the task is leaving, and has 2484 * If we see ->on_cpu without ->on_rq, the task is leaving, and has
2498 * been accounted, so we're correct here as well. 2485 * been accounted, so we're correct here as well.
2499 */ 2486 */
2500 if (!p->on_cpu || !p->on_rq) 2487 if (!p->on_cpu || !task_on_rq_queued(p))
2501 return p->se.sum_exec_runtime; 2488 return p->se.sum_exec_runtime;
2502#endif 2489#endif
2503 2490
2504 rq = task_rq_lock(p, &flags); 2491 rq = task_rq_lock(p, &flags);
2505 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); 2492 /*
2493 * Must be ->curr _and_ ->on_rq. If dequeued, we would
2494 * project cycles that may never be accounted to this
2495 * thread, breaking clock_gettime().
2496 */
2497 if (task_current(rq, p) && task_on_rq_queued(p)) {
2498 update_rq_clock(rq);
2499 p->sched_class->update_curr(rq);
2500 }
2501 ns = p->se.sum_exec_runtime;
2506 task_rq_unlock(rq, p, &flags); 2502 task_rq_unlock(rq, p, &flags);
2507 2503
2508 return ns; 2504 return ns;
@@ -2660,6 +2656,9 @@ static noinline void __schedule_bug(struct task_struct *prev)
2660 */ 2656 */
2661static inline void schedule_debug(struct task_struct *prev) 2657static inline void schedule_debug(struct task_struct *prev)
2662{ 2658{
2659#ifdef CONFIG_SCHED_STACK_END_CHECK
2660 BUG_ON(unlikely(task_stack_end_corrupted(prev)));
2661#endif
2663 /* 2662 /*
2664 * Test if we are atomic. Since do_exit() needs to call into 2663 * Test if we are atomic. Since do_exit() needs to call into
2665 * schedule() atomically, we ignore that path. Otherwise whine 2664 * schedule() atomically, we ignore that path. Otherwise whine
@@ -2761,7 +2760,7 @@ need_resched:
2761 preempt_disable(); 2760 preempt_disable();
2762 cpu = smp_processor_id(); 2761 cpu = smp_processor_id();
2763 rq = cpu_rq(cpu); 2762 rq = cpu_rq(cpu);
2764 rcu_note_context_switch(cpu); 2763 rcu_note_context_switch();
2765 prev = rq->curr; 2764 prev = rq->curr;
2766 2765
2767 schedule_debug(prev); 2766 schedule_debug(prev);
@@ -2801,7 +2800,7 @@ need_resched:
2801 switch_count = &prev->nvcsw; 2800 switch_count = &prev->nvcsw;
2802 } 2801 }
2803 2802
2804 if (prev->on_rq || rq->skip_clock_update < 0) 2803 if (task_on_rq_queued(prev) || rq->skip_clock_update < 0)
2805 update_rq_clock(rq); 2804 update_rq_clock(rq);
2806 2805
2807 next = pick_next_task(rq, prev); 2806 next = pick_next_task(rq, prev);
@@ -2814,15 +2813,8 @@ need_resched:
2814 rq->curr = next; 2813 rq->curr = next;
2815 ++*switch_count; 2814 ++*switch_count;
2816 2815
2817 context_switch(rq, prev, next); /* unlocks the rq */ 2816 rq = context_switch(rq, prev, next); /* unlocks the rq */
2818 /* 2817 cpu = cpu_of(rq);
2819 * The context switch have flipped the stack from under us
2820 * and restored the local variables which were saved when
2821 * this task called schedule() in the past. prev == current
2822 * is still correct, but it can be moved to another cpu/rq.
2823 */
2824 cpu = smp_processor_id();
2825 rq = cpu_rq(cpu);
2826 } else 2818 } else
2827 raw_spin_unlock_irq(&rq->lock); 2819 raw_spin_unlock_irq(&rq->lock);
2828 2820
@@ -2862,10 +2854,14 @@ asmlinkage __visible void __sched schedule_user(void)
2862 * or we have been woken up remotely but the IPI has not yet arrived, 2854 * or we have been woken up remotely but the IPI has not yet arrived,
2863 * we haven't yet exited the RCU idle mode. Do it here manually until 2855 * we haven't yet exited the RCU idle mode. Do it here manually until
2864 * we find a better solution. 2856 * we find a better solution.
2857 *
2858 * NB: There are buggy callers of this function. Ideally we
2859 * should warn if prev_state != IN_USER, but that will trigger
2860 * too frequently to make sense yet.
2865 */ 2861 */
2866 user_exit(); 2862 enum ctx_state prev_state = exception_enter();
2867 schedule(); 2863 schedule();
2868 user_enter(); 2864 exception_exit(prev_state);
2869} 2865}
2870#endif 2866#endif
2871 2867
@@ -2910,6 +2906,47 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
2910} 2906}
2911NOKPROBE_SYMBOL(preempt_schedule); 2907NOKPROBE_SYMBOL(preempt_schedule);
2912EXPORT_SYMBOL(preempt_schedule); 2908EXPORT_SYMBOL(preempt_schedule);
2909
2910#ifdef CONFIG_CONTEXT_TRACKING
2911/**
2912 * preempt_schedule_context - preempt_schedule called by tracing
2913 *
2914 * The tracing infrastructure uses preempt_enable_notrace to prevent
2915 * recursion and tracing preempt enabling caused by the tracing
2916 * infrastructure itself. But as tracing can happen in areas coming
2917 * from userspace or just about to enter userspace, a preempt enable
2918 * can occur before user_exit() is called. This will cause the scheduler
2919 * to be called when the system is still in usermode.
2920 *
2921 * To prevent this, the preempt_enable_notrace will use this function
2922 * instead of preempt_schedule() to exit user context if needed before
2923 * calling the scheduler.
2924 */
2925asmlinkage __visible void __sched notrace preempt_schedule_context(void)
2926{
2927 enum ctx_state prev_ctx;
2928
2929 if (likely(!preemptible()))
2930 return;
2931
2932 do {
2933 __preempt_count_add(PREEMPT_ACTIVE);
2934 /*
2935 * Needs preempt disabled in case user_exit() is traced
2936 * and the tracer calls preempt_enable_notrace() causing
2937 * an infinite recursion.
2938 */
2939 prev_ctx = exception_enter();
2940 __schedule();
2941 exception_exit(prev_ctx);
2942
2943 __preempt_count_sub(PREEMPT_ACTIVE);
2944 barrier();
2945 } while (need_resched());
2946}
2947EXPORT_SYMBOL_GPL(preempt_schedule_context);
2948#endif /* CONFIG_CONTEXT_TRACKING */
2949
2913#endif /* CONFIG_PREEMPT */ 2950#endif /* CONFIG_PREEMPT */
2914 2951
2915/* 2952/*
@@ -2966,7 +3003,7 @@ EXPORT_SYMBOL(default_wake_function);
2966 */ 3003 */
2967void rt_mutex_setprio(struct task_struct *p, int prio) 3004void rt_mutex_setprio(struct task_struct *p, int prio)
2968{ 3005{
2969 int oldprio, on_rq, running, enqueue_flag = 0; 3006 int oldprio, queued, running, enqueue_flag = 0;
2970 struct rq *rq; 3007 struct rq *rq;
2971 const struct sched_class *prev_class; 3008 const struct sched_class *prev_class;
2972 3009
@@ -2995,12 +3032,12 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
2995 trace_sched_pi_setprio(p, prio); 3032 trace_sched_pi_setprio(p, prio);
2996 oldprio = p->prio; 3033 oldprio = p->prio;
2997 prev_class = p->sched_class; 3034 prev_class = p->sched_class;
2998 on_rq = p->on_rq; 3035 queued = task_on_rq_queued(p);
2999 running = task_current(rq, p); 3036 running = task_current(rq, p);
3000 if (on_rq) 3037 if (queued)
3001 dequeue_task(rq, p, 0); 3038 dequeue_task(rq, p, 0);
3002 if (running) 3039 if (running)
3003 p->sched_class->put_prev_task(rq, p); 3040 put_prev_task(rq, p);
3004 3041
3005 /* 3042 /*
3006 * Boosting condition are: 3043 * Boosting condition are:
@@ -3037,7 +3074,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
3037 3074
3038 if (running) 3075 if (running)
3039 p->sched_class->set_curr_task(rq); 3076 p->sched_class->set_curr_task(rq);
3040 if (on_rq) 3077 if (queued)
3041 enqueue_task(rq, p, enqueue_flag); 3078 enqueue_task(rq, p, enqueue_flag);
3042 3079
3043 check_class_changed(rq, p, prev_class, oldprio); 3080 check_class_changed(rq, p, prev_class, oldprio);
@@ -3048,7 +3085,7 @@ out_unlock:
3048 3085
3049void set_user_nice(struct task_struct *p, long nice) 3086void set_user_nice(struct task_struct *p, long nice)
3050{ 3087{
3051 int old_prio, delta, on_rq; 3088 int old_prio, delta, queued;
3052 unsigned long flags; 3089 unsigned long flags;
3053 struct rq *rq; 3090 struct rq *rq;
3054 3091
@@ -3069,8 +3106,8 @@ void set_user_nice(struct task_struct *p, long nice)
3069 p->static_prio = NICE_TO_PRIO(nice); 3106 p->static_prio = NICE_TO_PRIO(nice);
3070 goto out_unlock; 3107 goto out_unlock;
3071 } 3108 }
3072 on_rq = p->on_rq; 3109 queued = task_on_rq_queued(p);
3073 if (on_rq) 3110 if (queued)
3074 dequeue_task(rq, p, 0); 3111 dequeue_task(rq, p, 0);
3075 3112
3076 p->static_prio = NICE_TO_PRIO(nice); 3113 p->static_prio = NICE_TO_PRIO(nice);
@@ -3079,7 +3116,7 @@ void set_user_nice(struct task_struct *p, long nice)
3079 p->prio = effective_prio(p); 3116 p->prio = effective_prio(p);
3080 delta = p->prio - old_prio; 3117 delta = p->prio - old_prio;
3081 3118
3082 if (on_rq) { 3119 if (queued) {
3083 enqueue_task(rq, p, 0); 3120 enqueue_task(rq, p, 0);
3084 /* 3121 /*
3085 * If the task increased its priority or is running and 3122 * If the task increased its priority or is running and
@@ -3351,7 +3388,7 @@ static int __sched_setscheduler(struct task_struct *p,
3351{ 3388{
3352 int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 : 3389 int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
3353 MAX_RT_PRIO - 1 - attr->sched_priority; 3390 MAX_RT_PRIO - 1 - attr->sched_priority;
3354 int retval, oldprio, oldpolicy = -1, on_rq, running; 3391 int retval, oldprio, oldpolicy = -1, queued, running;
3355 int policy = attr->sched_policy; 3392 int policy = attr->sched_policy;
3356 unsigned long flags; 3393 unsigned long flags;
3357 const struct sched_class *prev_class; 3394 const struct sched_class *prev_class;
@@ -3548,19 +3585,19 @@ change:
3548 return 0; 3585 return 0;
3549 } 3586 }
3550 3587
3551 on_rq = p->on_rq; 3588 queued = task_on_rq_queued(p);
3552 running = task_current(rq, p); 3589 running = task_current(rq, p);
3553 if (on_rq) 3590 if (queued)
3554 dequeue_task(rq, p, 0); 3591 dequeue_task(rq, p, 0);
3555 if (running) 3592 if (running)
3556 p->sched_class->put_prev_task(rq, p); 3593 put_prev_task(rq, p);
3557 3594
3558 prev_class = p->sched_class; 3595 prev_class = p->sched_class;
3559 __setscheduler(rq, p, attr); 3596 __setscheduler(rq, p, attr);
3560 3597
3561 if (running) 3598 if (running)
3562 p->sched_class->set_curr_task(rq); 3599 p->sched_class->set_curr_task(rq);
3563 if (on_rq) { 3600 if (queued) {
3564 /* 3601 /*
3565 * We enqueue to tail when the priority of a task is 3602 * We enqueue to tail when the priority of a task is
3566 * increased (user space view). 3603 * increased (user space view).
@@ -3984,14 +4021,14 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
3984 rcu_read_lock(); 4021 rcu_read_lock();
3985 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { 4022 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
3986 rcu_read_unlock(); 4023 rcu_read_unlock();
3987 goto out_unlock; 4024 goto out_free_new_mask;
3988 } 4025 }
3989 rcu_read_unlock(); 4026 rcu_read_unlock();
3990 } 4027 }
3991 4028
3992 retval = security_task_setscheduler(p); 4029 retval = security_task_setscheduler(p);
3993 if (retval) 4030 if (retval)
3994 goto out_unlock; 4031 goto out_free_new_mask;
3995 4032
3996 4033
3997 cpuset_cpus_allowed(p, cpus_allowed); 4034 cpuset_cpus_allowed(p, cpus_allowed);
@@ -4004,13 +4041,14 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
4004 * root_domain. 4041 * root_domain.
4005 */ 4042 */
4006#ifdef CONFIG_SMP 4043#ifdef CONFIG_SMP
4007 if (task_has_dl_policy(p)) { 4044 if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
4008 const struct cpumask *span = task_rq(p)->rd->span; 4045 rcu_read_lock();
4009 4046 if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) {
4010 if (dl_bandwidth_enabled() && !cpumask_subset(span, new_mask)) {
4011 retval = -EBUSY; 4047 retval = -EBUSY;
4012 goto out_unlock; 4048 rcu_read_unlock();
4049 goto out_free_new_mask;
4013 } 4050 }
4051 rcu_read_unlock();
4014 } 4052 }
4015#endif 4053#endif
4016again: 4054again:
@@ -4028,7 +4066,7 @@ again:
4028 goto again; 4066 goto again;
4029 } 4067 }
4030 } 4068 }
4031out_unlock: 4069out_free_new_mask:
4032 free_cpumask_var(new_mask); 4070 free_cpumask_var(new_mask);
4033out_free_cpus_allowed: 4071out_free_cpus_allowed:
4034 free_cpumask_var(cpus_allowed); 4072 free_cpumask_var(cpus_allowed);
@@ -4489,8 +4527,10 @@ void sched_show_task(struct task_struct *p)
4489#ifdef CONFIG_DEBUG_STACK_USAGE 4527#ifdef CONFIG_DEBUG_STACK_USAGE
4490 free = stack_not_used(p); 4528 free = stack_not_used(p);
4491#endif 4529#endif
4530 ppid = 0;
4492 rcu_read_lock(); 4531 rcu_read_lock();
4493 ppid = task_pid_nr(rcu_dereference(p->real_parent)); 4532 if (pid_alive(p))
4533 ppid = task_pid_nr(rcu_dereference(p->real_parent));
4494 rcu_read_unlock(); 4534 rcu_read_unlock();
4495 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, 4535 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
4496 task_pid_nr(p), ppid, 4536 task_pid_nr(p), ppid,
@@ -4512,7 +4552,7 @@ void show_state_filter(unsigned long state_filter)
4512 " task PC stack pid father\n"); 4552 " task PC stack pid father\n");
4513#endif 4553#endif
4514 rcu_read_lock(); 4554 rcu_read_lock();
4515 do_each_thread(g, p) { 4555 for_each_process_thread(g, p) {
4516 /* 4556 /*
4517 * reset the NMI-timeout, listing all files on a slow 4557 * reset the NMI-timeout, listing all files on a slow
4518 * console might take a lot of time: 4558 * console might take a lot of time:
@@ -4520,7 +4560,7 @@ void show_state_filter(unsigned long state_filter)
4520 touch_nmi_watchdog(); 4560 touch_nmi_watchdog();
4521 if (!state_filter || (p->state & state_filter)) 4561 if (!state_filter || (p->state & state_filter))
4522 sched_show_task(p); 4562 sched_show_task(p);
4523 } while_each_thread(g, p); 4563 }
4524 4564
4525 touch_all_softlockup_watchdogs(); 4565 touch_all_softlockup_watchdogs();
4526 4566
@@ -4575,7 +4615,7 @@ void init_idle(struct task_struct *idle, int cpu)
4575 rcu_read_unlock(); 4615 rcu_read_unlock();
4576 4616
4577 rq->curr = rq->idle = idle; 4617 rq->curr = rq->idle = idle;
4578 idle->on_rq = 1; 4618 idle->on_rq = TASK_ON_RQ_QUEUED;
4579#if defined(CONFIG_SMP) 4619#if defined(CONFIG_SMP)
4580 idle->on_cpu = 1; 4620 idle->on_cpu = 1;
4581#endif 4621#endif
@@ -4595,7 +4635,109 @@ void init_idle(struct task_struct *idle, int cpu)
4595#endif 4635#endif
4596} 4636}
4597 4637
4638int cpuset_cpumask_can_shrink(const struct cpumask *cur,
4639 const struct cpumask *trial)
4640{
4641 int ret = 1, trial_cpus;
4642 struct dl_bw *cur_dl_b;
4643 unsigned long flags;
4644
4645 rcu_read_lock_sched();
4646 cur_dl_b = dl_bw_of(cpumask_any(cur));
4647 trial_cpus = cpumask_weight(trial);
4648
4649 raw_spin_lock_irqsave(&cur_dl_b->lock, flags);
4650 if (cur_dl_b->bw != -1 &&
4651 cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw)
4652 ret = 0;
4653 raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags);
4654 rcu_read_unlock_sched();
4655
4656 return ret;
4657}
4658
4659int task_can_attach(struct task_struct *p,
4660 const struct cpumask *cs_cpus_allowed)
4661{
4662 int ret = 0;
4663
4664 /*
4665 * Kthreads which disallow setaffinity shouldn't be moved
4666 * to a new cpuset; we don't want to change their cpu
4667 * affinity and isolating such threads by their set of
4668 * allowed nodes is unnecessary. Thus, cpusets are not
4669 * applicable for such threads. This prevents checking for
4670 * success of set_cpus_allowed_ptr() on all attached tasks
4671 * before cpus_allowed may be changed.
4672 */
4673 if (p->flags & PF_NO_SETAFFINITY) {
4674 ret = -EINVAL;
4675 goto out;
4676 }
4677
4678#ifdef CONFIG_SMP
4679 if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
4680 cs_cpus_allowed)) {
4681 unsigned int dest_cpu = cpumask_any_and(cpu_active_mask,
4682 cs_cpus_allowed);
4683 struct dl_bw *dl_b;
4684 bool overflow;
4685 int cpus;
4686 unsigned long flags;
4687
4688 rcu_read_lock_sched();
4689 dl_b = dl_bw_of(dest_cpu);
4690 raw_spin_lock_irqsave(&dl_b->lock, flags);
4691 cpus = dl_bw_cpus(dest_cpu);
4692 overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw);
4693 if (overflow)
4694 ret = -EBUSY;
4695 else {
4696 /*
4697 * We reserve space for this task in the destination
4698 * root_domain, as we can't fail after this point.
4699 * We will free resources in the source root_domain
4700 * later on (see set_cpus_allowed_dl()).
4701 */
4702 __dl_add(dl_b, p->dl.dl_bw);
4703 }
4704 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
4705 rcu_read_unlock_sched();
4706
4707 }
4708#endif
4709out:
4710 return ret;
4711}
4712
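For orientation, a worked instance of the bandwidth test that task_can_attach() relies on (the same __dl_overflow() inequality deleted from this file earlier in the diff); all numbers are illustrative and assume the default 95% per-CPU deadline bandwidth.

/*
 * __dl_overflow(dl_b, cpus, old_bw, new_bw) rejects admission when
 *
 *	dl_b->bw * cpus  <  dl_b->total_bw - old_bw + new_bw
 *
 * Example (utilizations written as fractions of one CPU):
 *	dl_b->bw = 0.95 (default), cpus = 2,
 *	total_bw = 1.60 already admitted, old_bw = 0, new_bw = 0.40
 *	0.95 * 2 = 1.90  <  1.60 + 0.40 = 2.00  ->  overflow,
 * so task_can_attach() returns -EBUSY instead of reserving new_bw
 * in the destination root domain.
 */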
4598#ifdef CONFIG_SMP 4713#ifdef CONFIG_SMP
4714/*
4715 * move_queued_task - move a queued task to new rq.
4716 *
4717 * Returns (locked) new rq. Old rq's lock is released.
4718 */
4719static struct rq *move_queued_task(struct task_struct *p, int new_cpu)
4720{
4721 struct rq *rq = task_rq(p);
4722
4723 lockdep_assert_held(&rq->lock);
4724
4725 dequeue_task(rq, p, 0);
4726 p->on_rq = TASK_ON_RQ_MIGRATING;
4727 set_task_cpu(p, new_cpu);
4728 raw_spin_unlock(&rq->lock);
4729
4730 rq = cpu_rq(new_cpu);
4731
4732 raw_spin_lock(&rq->lock);
4733 BUG_ON(task_cpu(p) != new_cpu);
4734 p->on_rq = TASK_ON_RQ_QUEUED;
4735 enqueue_task(rq, p, 0);
4736 check_preempt_curr(rq, p, 0);
4737
4738 return rq;
4739}
4740
4599void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) 4741void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
4600{ 4742{
4601 if (p->sched_class && p->sched_class->set_cpus_allowed) 4743 if (p->sched_class && p->sched_class->set_cpus_allowed)
@@ -4652,14 +4794,15 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
4652 goto out; 4794 goto out;
4653 4795
4654 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); 4796 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
4655 if (p->on_rq) { 4797 if (task_running(rq, p) || p->state == TASK_WAKING) {
4656 struct migration_arg arg = { p, dest_cpu }; 4798 struct migration_arg arg = { p, dest_cpu };
4657 /* Need help from migration thread: drop lock and wait. */ 4799 /* Need help from migration thread: drop lock and wait. */
4658 task_rq_unlock(rq, p, &flags); 4800 task_rq_unlock(rq, p, &flags);
4659 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); 4801 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
4660 tlb_migrate_finish(p->mm); 4802 tlb_migrate_finish(p->mm);
4661 return 0; 4803 return 0;
4662 } 4804 } else if (task_on_rq_queued(p))
4805 rq = move_queued_task(p, dest_cpu);
4663out: 4806out:
4664 task_rq_unlock(rq, p, &flags); 4807 task_rq_unlock(rq, p, &flags);
4665 4808
@@ -4680,20 +4823,20 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
4680 */ 4823 */
4681static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) 4824static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4682{ 4825{
4683 struct rq *rq_dest, *rq_src; 4826 struct rq *rq;
4684 int ret = 0; 4827 int ret = 0;
4685 4828
4686 if (unlikely(!cpu_active(dest_cpu))) 4829 if (unlikely(!cpu_active(dest_cpu)))
4687 return ret; 4830 return ret;
4688 4831
4689 rq_src = cpu_rq(src_cpu); 4832 rq = cpu_rq(src_cpu);
4690 rq_dest = cpu_rq(dest_cpu);
4691 4833
4692 raw_spin_lock(&p->pi_lock); 4834 raw_spin_lock(&p->pi_lock);
4693 double_rq_lock(rq_src, rq_dest); 4835 raw_spin_lock(&rq->lock);
4694 /* Already moved. */ 4836 /* Already moved. */
4695 if (task_cpu(p) != src_cpu) 4837 if (task_cpu(p) != src_cpu)
4696 goto done; 4838 goto done;
4839
4697 /* Affinity changed (again). */ 4840 /* Affinity changed (again). */
4698 if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) 4841 if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
4699 goto fail; 4842 goto fail;
@@ -4702,16 +4845,12 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4702 * If we're not on a rq, the next wake-up will ensure we're 4845 * If we're not on a rq, the next wake-up will ensure we're
4703 * placed properly. 4846 * placed properly.
4704 */ 4847 */
4705 if (p->on_rq) { 4848 if (task_on_rq_queued(p))
4706 dequeue_task(rq_src, p, 0); 4849 rq = move_queued_task(p, dest_cpu);
4707 set_task_cpu(p, dest_cpu);
4708 enqueue_task(rq_dest, p, 0);
4709 check_preempt_curr(rq_dest, p, 0);
4710 }
4711done: 4850done:
4712 ret = 1; 4851 ret = 1;
4713fail: 4852fail:
4714 double_rq_unlock(rq_src, rq_dest); 4853 raw_spin_unlock(&rq->lock);
4715 raw_spin_unlock(&p->pi_lock); 4854 raw_spin_unlock(&p->pi_lock);
4716 return ret; 4855 return ret;
4717} 4856}
@@ -4743,22 +4882,22 @@ void sched_setnuma(struct task_struct *p, int nid)
4743{ 4882{
4744 struct rq *rq; 4883 struct rq *rq;
4745 unsigned long flags; 4884 unsigned long flags;
4746 bool on_rq, running; 4885 bool queued, running;
4747 4886
4748 rq = task_rq_lock(p, &flags); 4887 rq = task_rq_lock(p, &flags);
4749 on_rq = p->on_rq; 4888 queued = task_on_rq_queued(p);
4750 running = task_current(rq, p); 4889 running = task_current(rq, p);
4751 4890
4752 if (on_rq) 4891 if (queued)
4753 dequeue_task(rq, p, 0); 4892 dequeue_task(rq, p, 0);
4754 if (running) 4893 if (running)
4755 p->sched_class->put_prev_task(rq, p); 4894 put_prev_task(rq, p);
4756 4895
4757 p->numa_preferred_nid = nid; 4896 p->numa_preferred_nid = nid;
4758 4897
4759 if (running) 4898 if (running)
4760 p->sched_class->set_curr_task(rq); 4899 p->sched_class->set_curr_task(rq);
4761 if (on_rq) 4900 if (queued)
4762 enqueue_task(rq, p, 0); 4901 enqueue_task(rq, p, 0);
4763 task_rq_unlock(rq, p, &flags); 4902 task_rq_unlock(rq, p, &flags);
4764} 4903}
@@ -4778,6 +4917,12 @@ static int migration_cpu_stop(void *data)
4778 * be on another cpu but it doesn't matter. 4917 * be on another cpu but it doesn't matter.
4779 */ 4918 */
4780 local_irq_disable(); 4919 local_irq_disable();
4920 /*
4921 * We need to explicitly wake pending tasks before running
4922 * __migrate_task() such that we will not miss enforcing cpus_allowed
4923 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
4924 */
4925 sched_ttwu_pending();
4781 __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu); 4926 __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
4782 local_irq_enable(); 4927 local_irq_enable();
4783 return 0; 4928 return 0;
@@ -5188,6 +5333,7 @@ static int sched_cpu_inactive(struct notifier_block *nfb,
5188{ 5333{
5189 unsigned long flags; 5334 unsigned long flags;
5190 long cpu = (long)hcpu; 5335 long cpu = (long)hcpu;
5336 struct dl_bw *dl_b;
5191 5337
5192 switch (action & ~CPU_TASKS_FROZEN) { 5338 switch (action & ~CPU_TASKS_FROZEN) {
5193 case CPU_DOWN_PREPARE: 5339 case CPU_DOWN_PREPARE:
@@ -5195,15 +5341,19 @@ static int sched_cpu_inactive(struct notifier_block *nfb,
5195 5341
5196 /* explicitly allow suspend */ 5342 /* explicitly allow suspend */
5197 if (!(action & CPU_TASKS_FROZEN)) { 5343 if (!(action & CPU_TASKS_FROZEN)) {
5198 struct dl_bw *dl_b = dl_bw_of(cpu);
5199 bool overflow; 5344 bool overflow;
5200 int cpus; 5345 int cpus;
5201 5346
5347 rcu_read_lock_sched();
5348 dl_b = dl_bw_of(cpu);
5349
5202 raw_spin_lock_irqsave(&dl_b->lock, flags); 5350 raw_spin_lock_irqsave(&dl_b->lock, flags);
5203 cpus = dl_bw_cpus(cpu); 5351 cpus = dl_bw_cpus(cpu);
5204 overflow = __dl_overflow(dl_b, cpus, 0, 0); 5352 overflow = __dl_overflow(dl_b, cpus, 0, 0);
5205 raw_spin_unlock_irqrestore(&dl_b->lock, flags); 5353 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
5206 5354
5355 rcu_read_unlock_sched();
5356
5207 if (overflow) 5357 if (overflow)
5208 return notifier_from_errno(-EBUSY); 5358 return notifier_from_errno(-EBUSY);
5209 } 5359 }
@@ -5746,7 +5896,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5746 const struct cpumask *span = sched_domain_span(sd); 5896 const struct cpumask *span = sched_domain_span(sd);
5747 struct cpumask *covered = sched_domains_tmpmask; 5897 struct cpumask *covered = sched_domains_tmpmask;
5748 struct sd_data *sdd = sd->private; 5898 struct sd_data *sdd = sd->private;
5749 struct sched_domain *child; 5899 struct sched_domain *sibling;
5750 int i; 5900 int i;
5751 5901
5752 cpumask_clear(covered); 5902 cpumask_clear(covered);
@@ -5757,10 +5907,10 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5757 if (cpumask_test_cpu(i, covered)) 5907 if (cpumask_test_cpu(i, covered))
5758 continue; 5908 continue;
5759 5909
5760 child = *per_cpu_ptr(sdd->sd, i); 5910 sibling = *per_cpu_ptr(sdd->sd, i);
5761 5911
5762 /* See the comment near build_group_mask(). */ 5912 /* See the comment near build_group_mask(). */
5763 if (!cpumask_test_cpu(i, sched_domain_span(child))) 5913 if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
5764 continue; 5914 continue;
5765 5915
5766 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), 5916 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
@@ -5770,10 +5920,9 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5770 goto fail; 5920 goto fail;
5771 5921
5772 sg_span = sched_group_cpus(sg); 5922 sg_span = sched_group_cpus(sg);
5773 if (child->child) { 5923 if (sibling->child)
5774 child = child->child; 5924 cpumask_copy(sg_span, sched_domain_span(sibling->child));
5775 cpumask_copy(sg_span, sched_domain_span(child)); 5925 else
5776 } else
5777 cpumask_set_cpu(i, sg_span); 5926 cpumask_set_cpu(i, sg_span);
5778 5927
5779 cpumask_or(covered, covered, sg_span); 5928 cpumask_or(covered, covered, sg_span);
@@ -6011,7 +6160,9 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
6011 6160
6012#ifdef CONFIG_NUMA 6161#ifdef CONFIG_NUMA
6013static int sched_domains_numa_levels; 6162static int sched_domains_numa_levels;
6163enum numa_topology_type sched_numa_topology_type;
6014static int *sched_domains_numa_distance; 6164static int *sched_domains_numa_distance;
6165int sched_max_numa_distance;
6015static struct cpumask ***sched_domains_numa_masks; 6166static struct cpumask ***sched_domains_numa_masks;
6016static int sched_domains_curr_level; 6167static int sched_domains_curr_level;
6017#endif 6168#endif
@@ -6183,7 +6334,7 @@ static void sched_numa_warn(const char *str)
6183 printk(KERN_WARNING "\n"); 6334 printk(KERN_WARNING "\n");
6184} 6335}
6185 6336
6186static bool find_numa_distance(int distance) 6337bool find_numa_distance(int distance)
6187{ 6338{
6188 int i; 6339 int i;
6189 6340
@@ -6198,6 +6349,56 @@ static bool find_numa_distance(int distance)
6198 return false; 6349 return false;
6199} 6350}
6200 6351
6352/*
6353 * A system can have three types of NUMA topology:
6354 * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system
6355 * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes
6356 * NUMA_BACKPLANE: nodes can reach other nodes through a backplane
6357 *
6358 * The difference between a glueless mesh topology and a backplane
6359 * topology lies in whether communication between not directly
6360 * connected nodes goes through intermediary nodes (where programs
6361 * could run), or through backplane controllers. This affects
6362 * placement of programs.
6363 *
6364 * The type of topology can be discerned with the following tests:
6365 * - If the maximum distance between any nodes is 1 hop, the system
6366 * is directly connected.
6367 * - If for two nodes A and B, located N > 1 hops away from each other,
6368 * there is an intermediary node C, which is < N hops away from both
6369 * nodes A and B, the system is a glueless mesh.
6370 */
6371static void init_numa_topology_type(void)
6372{
6373 int a, b, c, n;
6374
6375 n = sched_max_numa_distance;
6376
6377 if (n <= 1)
6378 sched_numa_topology_type = NUMA_DIRECT;
6379
6380 for_each_online_node(a) {
6381 for_each_online_node(b) {
6382 /* Find two nodes furthest removed from each other. */
6383 if (node_distance(a, b) < n)
6384 continue;
6385
6386 /* Is there an intermediary node between a and b? */
6387 for_each_online_node(c) {
6388 if (node_distance(a, c) < n &&
6389 node_distance(b, c) < n) {
6390 sched_numa_topology_type =
6391 NUMA_GLUELESS_MESH;
6392 return;
6393 }
6394 }
6395
6396 sched_numa_topology_type = NUMA_BACKPLANE;
6397 return;
6398 }
6399 }
6400}
6401
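
A rough standalone rendering of the classification rules described in the comment above: the same two tests (find the furthest pair, then look for an intermediary node) applied to a made-up three-node distance table. The table, node count and helper names are inventions for illustration; the kernel walks online nodes via node_distance() instead.

/* hypothetical userspace sketch: gcc -std=c11 -o numa_type numa_type.c */
#include <stdio.h>

enum topology { DIRECT, GLUELESS_MESH, BACKPLANE };

#define NR_NODES 3

/* toy SLIT-style table: node 1 sits between nodes 0 and 2 */
static const int dist[NR_NODES][NR_NODES] = {
	{ 10, 20, 30 },
	{ 20, 10, 20 },
	{ 30, 20, 10 },
};

static enum topology classify(void)
{
	int a, b, c, min_remote = 0, max_dist = 0;

	/* find the nearest-remote and furthest distances in the table */
	for (a = 0; a < NR_NODES; a++)
		for (b = 0; b < NR_NODES; b++) {
			if (a == b)
				continue;
			if (!min_remote || dist[a][b] < min_remote)
				min_remote = dist[a][b];
			if (dist[a][b] > max_dist)
				max_dist = dist[a][b];
		}

	/* every remote node is one hop away: directly connected */
	if (max_dist == min_remote)
		return DIRECT;

	for (a = 0; a < NR_NODES; a++) {
		for (b = 0; b < NR_NODES; b++) {
			/* only consider pairs that are furthest apart */
			if (dist[a][b] < max_dist)
				continue;

			/* is there an intermediary closer to both a and b? */
			for (c = 0; c < NR_NODES; c++)
				if (dist[a][c] < max_dist &&
				    dist[b][c] < max_dist)
					return GLUELESS_MESH;

			return BACKPLANE;
		}
	}
	return DIRECT;
}

int main(void)
{
	static const char * const name[] = {
		"direct", "glueless mesh", "backplane"
	};

	printf("topology: %s\n", name[classify()]);
	return 0;
}

With this table the furthest pair is (0, 2) and node 1 is closer to both of them, so the sketch reports a glueless mesh; without that shortcut node the same pair would fall through to the backplane case.
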
6201static void sched_init_numa(void) 6402static void sched_init_numa(void)
6202{ 6403{
6203 int next_distance, curr_distance = node_distance(0, 0); 6404 int next_distance, curr_distance = node_distance(0, 0);
@@ -6251,6 +6452,10 @@ static void sched_init_numa(void)
6251 if (!sched_debug()) 6452 if (!sched_debug())
6252 break; 6453 break;
6253 } 6454 }
6455
6456 if (!level)
6457 return;
6458
6254 /* 6459 /*
6255 * 'level' contains the number of unique distances, excluding the 6460 * 'level' contains the number of unique distances, excluding the
6256 * identity distance node_distance(i,i). 6461 * identity distance node_distance(i,i).
@@ -6330,6 +6535,9 @@ static void sched_init_numa(void)
6330 sched_domain_topology = tl; 6535 sched_domain_topology = tl;
6331 6536
6332 sched_domains_numa_levels = level; 6537 sched_domains_numa_levels = level;
6538 sched_max_numa_distance = sched_domains_numa_distance[level - 1];
6539
6540 init_numa_topology_type();
6333} 6541}
6334 6542
6335static void sched_domains_numa_masks_set(int cpu) 6543static void sched_domains_numa_masks_set(int cpu)
@@ -6905,9 +7113,6 @@ void __init sched_init(void)
6905#ifdef CONFIG_RT_GROUP_SCHED 7113#ifdef CONFIG_RT_GROUP_SCHED
6906 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 7114 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
6907#endif 7115#endif
6908#ifdef CONFIG_CPUMASK_OFFSTACK
6909 alloc_size += num_possible_cpus() * cpumask_size();
6910#endif
6911 if (alloc_size) { 7116 if (alloc_size) {
6912 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); 7117 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
6913 7118
@@ -6927,13 +7132,13 @@ void __init sched_init(void)
6927 ptr += nr_cpu_ids * sizeof(void **); 7132 ptr += nr_cpu_ids * sizeof(void **);
6928 7133
6929#endif /* CONFIG_RT_GROUP_SCHED */ 7134#endif /* CONFIG_RT_GROUP_SCHED */
7135 }
6930#ifdef CONFIG_CPUMASK_OFFSTACK 7136#ifdef CONFIG_CPUMASK_OFFSTACK
6931 for_each_possible_cpu(i) { 7137 for_each_possible_cpu(i) {
6932 per_cpu(load_balance_mask, i) = (void *)ptr; 7138 per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
6933 ptr += cpumask_size(); 7139 cpumask_size(), GFP_KERNEL, cpu_to_node(i));
6934 }
6935#endif /* CONFIG_CPUMASK_OFFSTACK */
6936 } 7140 }
7141#endif /* CONFIG_CPUMASK_OFFSTACK */
6937 7142
6938 init_rt_bandwidth(&def_rt_bandwidth, 7143 init_rt_bandwidth(&def_rt_bandwidth,
6939 global_rt_period(), global_rt_runtime()); 7144 global_rt_period(), global_rt_runtime());
@@ -7082,6 +7287,25 @@ static inline int preempt_count_equals(int preempt_offset)
7082 7287
7083void __might_sleep(const char *file, int line, int preempt_offset) 7288void __might_sleep(const char *file, int line, int preempt_offset)
7084{ 7289{
7290 /*
7291 * Blocking primitives will set (and therefore destroy) current->state,
7292 * since we will exit with TASK_RUNNING make sure we enter with it,
7293 * otherwise we will destroy state.
7294 */
7295 if (WARN_ONCE(current->state != TASK_RUNNING,
7296 "do not call blocking ops when !TASK_RUNNING; "
7297 "state=%lx set at [<%p>] %pS\n",
7298 current->state,
7299 (void *)current->task_state_change,
7300 (void *)current->task_state_change))
7301 __set_current_state(TASK_RUNNING);
7302
7303 ___might_sleep(file, line, preempt_offset);
7304}
7305EXPORT_SYMBOL(__might_sleep);
7306
7307void ___might_sleep(const char *file, int line, int preempt_offset)
7308{
7085 static unsigned long prev_jiffy; /* ratelimiting */ 7309 static unsigned long prev_jiffy; /* ratelimiting */
7086 7310
7087 rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ 7311 rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
@@ -7113,7 +7337,7 @@ void __might_sleep(const char *file, int line, int preempt_offset)
7113#endif 7337#endif
7114 dump_stack(); 7338 dump_stack();
7115} 7339}
7116EXPORT_SYMBOL(__might_sleep); 7340EXPORT_SYMBOL(___might_sleep);
7117#endif 7341#endif
7118 7342
7119#ifdef CONFIG_MAGIC_SYSRQ 7343#ifdef CONFIG_MAGIC_SYSRQ
@@ -7124,13 +7348,13 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
7124 .sched_policy = SCHED_NORMAL, 7348 .sched_policy = SCHED_NORMAL,
7125 }; 7349 };
7126 int old_prio = p->prio; 7350 int old_prio = p->prio;
7127 int on_rq; 7351 int queued;
7128 7352
7129 on_rq = p->on_rq; 7353 queued = task_on_rq_queued(p);
7130 if (on_rq) 7354 if (queued)
7131 dequeue_task(rq, p, 0); 7355 dequeue_task(rq, p, 0);
7132 __setscheduler(rq, p, &attr); 7356 __setscheduler(rq, p, &attr);
7133 if (on_rq) { 7357 if (queued) {
7134 enqueue_task(rq, p, 0); 7358 enqueue_task(rq, p, 0);
7135 resched_curr(rq); 7359 resched_curr(rq);
7136 } 7360 }
@@ -7144,12 +7368,12 @@ void normalize_rt_tasks(void)
7144 unsigned long flags; 7368 unsigned long flags;
7145 struct rq *rq; 7369 struct rq *rq;
7146 7370
7147 read_lock_irqsave(&tasklist_lock, flags); 7371 read_lock(&tasklist_lock);
7148 do_each_thread(g, p) { 7372 for_each_process_thread(g, p) {
7149 /* 7373 /*
7150 * Only normalize user tasks: 7374 * Only normalize user tasks:
7151 */ 7375 */
7152 if (!p->mm) 7376 if (p->flags & PF_KTHREAD)
7153 continue; 7377 continue;
7154 7378
7155 p->se.exec_start = 0; 7379 p->se.exec_start = 0;
@@ -7164,21 +7388,16 @@ void normalize_rt_tasks(void)
7164 * Renice negative nice level userspace 7388 * Renice negative nice level userspace
7165 * tasks back to 0: 7389 * tasks back to 0:
7166 */ 7390 */
7167 if (task_nice(p) < 0 && p->mm) 7391 if (task_nice(p) < 0)
7168 set_user_nice(p, 0); 7392 set_user_nice(p, 0);
7169 continue; 7393 continue;
7170 } 7394 }
7171 7395
7172 raw_spin_lock(&p->pi_lock); 7396 rq = task_rq_lock(p, &flags);
7173 rq = __task_rq_lock(p);
7174
7175 normalize_task(rq, p); 7397 normalize_task(rq, p);
7176 7398 task_rq_unlock(rq, p, &flags);
7177 __task_rq_unlock(rq); 7399 }
7178 raw_spin_unlock(&p->pi_lock); 7400 read_unlock(&tasklist_lock);
7179 } while_each_thread(g, p);
7180
7181 read_unlock_irqrestore(&tasklist_lock, flags);
7182} 7401}
7183 7402
7184#endif /* CONFIG_MAGIC_SYSRQ */ 7403#endif /* CONFIG_MAGIC_SYSRQ */
@@ -7318,36 +7537,40 @@ void sched_offline_group(struct task_group *tg)
7318void sched_move_task(struct task_struct *tsk) 7537void sched_move_task(struct task_struct *tsk)
7319{ 7538{
7320 struct task_group *tg; 7539 struct task_group *tg;
7321 int on_rq, running; 7540 int queued, running;
7322 unsigned long flags; 7541 unsigned long flags;
7323 struct rq *rq; 7542 struct rq *rq;
7324 7543
7325 rq = task_rq_lock(tsk, &flags); 7544 rq = task_rq_lock(tsk, &flags);
7326 7545
7327 running = task_current(rq, tsk); 7546 running = task_current(rq, tsk);
7328 on_rq = tsk->on_rq; 7547 queued = task_on_rq_queued(tsk);
7329 7548
7330 if (on_rq) 7549 if (queued)
7331 dequeue_task(rq, tsk, 0); 7550 dequeue_task(rq, tsk, 0);
7332 if (unlikely(running)) 7551 if (unlikely(running))
7333 tsk->sched_class->put_prev_task(rq, tsk); 7552 put_prev_task(rq, tsk);
7334 7553
7335 tg = container_of(task_css_check(tsk, cpu_cgrp_id, 7554 /*
7336 lockdep_is_held(&tsk->sighand->siglock)), 7555 * All callers are synchronized by task_rq_lock(); we do not use RCU
7556 * which is pointless here. Thus, we pass "true" to task_css_check()
7557 * to prevent lockdep warnings.
7558 */
7559 tg = container_of(task_css_check(tsk, cpu_cgrp_id, true),
7337 struct task_group, css); 7560 struct task_group, css);
7338 tg = autogroup_task_group(tsk, tg); 7561 tg = autogroup_task_group(tsk, tg);
7339 tsk->sched_task_group = tg; 7562 tsk->sched_task_group = tg;
7340 7563
7341#ifdef CONFIG_FAIR_GROUP_SCHED 7564#ifdef CONFIG_FAIR_GROUP_SCHED
7342 if (tsk->sched_class->task_move_group) 7565 if (tsk->sched_class->task_move_group)
7343 tsk->sched_class->task_move_group(tsk, on_rq); 7566 tsk->sched_class->task_move_group(tsk, queued);
7344 else 7567 else
7345#endif 7568#endif
7346 set_task_rq(tsk, task_cpu(tsk)); 7569 set_task_rq(tsk, task_cpu(tsk));
7347 7570
7348 if (unlikely(running)) 7571 if (unlikely(running))
7349 tsk->sched_class->set_curr_task(rq); 7572 tsk->sched_class->set_curr_task(rq);
7350 if (on_rq) 7573 if (queued)
7351 enqueue_task(rq, tsk, 0); 7574 enqueue_task(rq, tsk, 0);
7352 7575
7353 task_rq_unlock(rq, tsk, &flags); 7576 task_rq_unlock(rq, tsk, &flags);
@@ -7365,10 +7588,10 @@ static inline int tg_has_rt_tasks(struct task_group *tg)
7365{ 7588{
7366 struct task_struct *g, *p; 7589 struct task_struct *g, *p;
7367 7590
7368 do_each_thread(g, p) { 7591 for_each_process_thread(g, p) {
7369 if (rt_task(p) && task_rq(p)->rt.tg == tg) 7592 if (rt_task(p) && task_group(p) == tg)
7370 return 1; 7593 return 1;
7371 } while_each_thread(g, p); 7594 }
7372 7595
7373 return 0; 7596 return 0;
7374} 7597}
@@ -7577,6 +7800,7 @@ static int sched_dl_global_constraints(void)
7577 u64 runtime = global_rt_runtime(); 7800 u64 runtime = global_rt_runtime();
7578 u64 period = global_rt_period(); 7801 u64 period = global_rt_period();
7579 u64 new_bw = to_ratio(period, runtime); 7802 u64 new_bw = to_ratio(period, runtime);
7803 struct dl_bw *dl_b;
7580 int cpu, ret = 0; 7804 int cpu, ret = 0;
7581 unsigned long flags; 7805 unsigned long flags;
7582 7806
@@ -7590,13 +7814,16 @@ static int sched_dl_global_constraints(void)
7590 * solutions is welcome! 7814 * solutions is welcome!
7591 */ 7815 */
7592 for_each_possible_cpu(cpu) { 7816 for_each_possible_cpu(cpu) {
7593 struct dl_bw *dl_b = dl_bw_of(cpu); 7817 rcu_read_lock_sched();
7818 dl_b = dl_bw_of(cpu);
7594 7819
7595 raw_spin_lock_irqsave(&dl_b->lock, flags); 7820 raw_spin_lock_irqsave(&dl_b->lock, flags);
7596 if (new_bw < dl_b->total_bw) 7821 if (new_bw < dl_b->total_bw)
7597 ret = -EBUSY; 7822 ret = -EBUSY;
7598 raw_spin_unlock_irqrestore(&dl_b->lock, flags); 7823 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
7599 7824
7825 rcu_read_unlock_sched();
7826
7600 if (ret) 7827 if (ret)
7601 break; 7828 break;
7602 } 7829 }
@@ -7607,6 +7834,7 @@ static int sched_dl_global_constraints(void)
7607static void sched_dl_do_global(void) 7834static void sched_dl_do_global(void)
7608{ 7835{
7609 u64 new_bw = -1; 7836 u64 new_bw = -1;
7837 struct dl_bw *dl_b;
7610 int cpu; 7838 int cpu;
7611 unsigned long flags; 7839 unsigned long flags;
7612 7840
@@ -7620,11 +7848,14 @@ static void sched_dl_do_global(void)
7620 * FIXME: As above... 7848 * FIXME: As above...
7621 */ 7849 */
7622 for_each_possible_cpu(cpu) { 7850 for_each_possible_cpu(cpu) {
7623 struct dl_bw *dl_b = dl_bw_of(cpu); 7851 rcu_read_lock_sched();
7852 dl_b = dl_bw_of(cpu);
7624 7853
7625 raw_spin_lock_irqsave(&dl_b->lock, flags); 7854 raw_spin_lock_irqsave(&dl_b->lock, flags);
7626 dl_b->bw = new_bw; 7855 dl_b->bw = new_bw;
7627 raw_spin_unlock_irqrestore(&dl_b->lock, flags); 7856 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
7857
7858 rcu_read_unlock_sched();
7628 } 7859 }
7629} 7860}
7630 7861
@@ -7754,6 +7985,11 @@ static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
7754 sched_offline_group(tg); 7985 sched_offline_group(tg);
7755} 7986}
7756 7987
7988static void cpu_cgroup_fork(struct task_struct *task)
7989{
7990 sched_move_task(task);
7991}
7992
7757static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, 7993static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,
7758 struct cgroup_taskset *tset) 7994 struct cgroup_taskset *tset)
7759{ 7995{
@@ -8005,7 +8241,7 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
8005 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth; 8241 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;
8006 8242
8007 quota = normalize_cfs_quota(tg, d); 8243 quota = normalize_cfs_quota(tg, d);
8008 parent_quota = parent_b->hierarchal_quota; 8244 parent_quota = parent_b->hierarchical_quota;
8009 8245
8010 /* 8246 /*
8011 * ensure max(child_quota) <= parent_quota, inherit when no 8247 * ensure max(child_quota) <= parent_quota, inherit when no
@@ -8016,7 +8252,7 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
8016 else if (parent_quota != RUNTIME_INF && quota > parent_quota) 8252 else if (parent_quota != RUNTIME_INF && quota > parent_quota)
8017 return -EINVAL; 8253 return -EINVAL;
8018 } 8254 }
8019 cfs_b->hierarchal_quota = quota; 8255 cfs_b->hierarchical_quota = quota;
8020 8256
8021 return 0; 8257 return 0;
8022} 8258}
@@ -8126,6 +8362,7 @@ struct cgroup_subsys cpu_cgrp_subsys = {
8126 .css_free = cpu_cgroup_css_free, 8362 .css_free = cpu_cgroup_css_free,
8127 .css_online = cpu_cgroup_css_online, 8363 .css_online = cpu_cgroup_css_online,
8128 .css_offline = cpu_cgroup_css_offline, 8364 .css_offline = cpu_cgroup_css_offline,
8365 .fork = cpu_cgroup_fork,
8129 .can_attach = cpu_cgroup_can_attach, 8366 .can_attach = cpu_cgroup_can_attach,
8130 .attach = cpu_cgroup_attach, 8367 .attach = cpu_cgroup_attach,
8131 .exit = cpu_cgroup_exit, 8368 .exit = cpu_cgroup_exit,
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index bd95963dae80..539ca3ce071b 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -107,9 +107,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
107 int best_cpu = -1; 107 int best_cpu = -1;
108 const struct sched_dl_entity *dl_se = &p->dl; 108 const struct sched_dl_entity *dl_se = &p->dl;
109 109
110 if (later_mask && cpumask_and(later_mask, cp->free_cpus, 110 if (later_mask && cpumask_and(later_mask, later_mask, cp->free_cpus)) {
111 &p->cpus_allowed) && cpumask_and(later_mask,
112 later_mask, cpu_active_mask)) {
113 best_cpu = cpumask_any(later_mask); 111 best_cpu = cpumask_any(later_mask);
114 goto out; 112 goto out;
115 } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) && 113 } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) &&
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index 538c9796ad4a..020039bd1326 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -25,9 +25,6 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
25void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid); 25void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid);
26int cpudl_init(struct cpudl *cp); 26int cpudl_init(struct cpudl *cp);
27void cpudl_cleanup(struct cpudl *cp); 27void cpudl_cleanup(struct cpudl *cp);
28#else
29#define cpudl_set(cp, cpu, dl) do { } while (0)
30#define cpudl_init() do { } while (0)
31#endif /* CONFIG_SMP */ 28#endif /* CONFIG_SMP */
32 29
33#endif /* _LINUX_CPUDL_H */ 30#endif /* _LINUX_CPUDL_H */
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h
index 6b033347fdfd..63cbb9ca0496 100644
--- a/kernel/sched/cpupri.h
+++ b/kernel/sched/cpupri.h
@@ -26,9 +26,6 @@ int cpupri_find(struct cpupri *cp,
26void cpupri_set(struct cpupri *cp, int cpu, int pri); 26void cpupri_set(struct cpupri *cp, int cpu, int pri);
27int cpupri_init(struct cpupri *cp); 27int cpupri_init(struct cpupri *cp);
28void cpupri_cleanup(struct cpupri *cp); 28void cpupri_cleanup(struct cpupri *cp);
29#else
30#define cpupri_set(cp, cpu, pri) do { } while (0)
31#define cpupri_init() do { } while (0)
32#endif 29#endif
33 30
34#endif /* _LINUX_CPUPRI_H */ 31#endif /* _LINUX_CPUPRI_H */
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 72fdf06ef865..8394b1ee600c 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -288,24 +288,29 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
288 struct signal_struct *sig = tsk->signal; 288 struct signal_struct *sig = tsk->signal;
289 cputime_t utime, stime; 289 cputime_t utime, stime;
290 struct task_struct *t; 290 struct task_struct *t;
291 291 unsigned int seq, nextseq;
292 times->utime = sig->utime; 292 unsigned long flags;
293 times->stime = sig->stime;
294 times->sum_exec_runtime = sig->sum_sched_runtime;
295 293
296 rcu_read_lock(); 294 rcu_read_lock();
297 /* make sure we can trust tsk->thread_group list */ 295 /* Attempt a lockless read on the first round. */
298 if (!likely(pid_alive(tsk))) 296 nextseq = 0;
299 goto out;
300
301 t = tsk;
302 do { 297 do {
303 task_cputime(t, &utime, &stime); 298 seq = nextseq;
304 times->utime += utime; 299 flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
305 times->stime += stime; 300 times->utime = sig->utime;
306 times->sum_exec_runtime += task_sched_runtime(t); 301 times->stime = sig->stime;
307 } while_each_thread(tsk, t); 302 times->sum_exec_runtime = sig->sum_sched_runtime;
308out: 303
304 for_each_thread(tsk, t) {
305 task_cputime(t, &utime, &stime);
306 times->utime += utime;
307 times->stime += stime;
308 times->sum_exec_runtime += task_sched_runtime(t);
309 }
310 /* If lockless access failed, take the lock. */
311 nextseq = 1;
312 } while (need_seqretry(&sig->stats_lock, seq));
313 done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
309 rcu_read_unlock(); 314 rcu_read_unlock();
310} 315}
311 316
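
The rewritten thread_group_cputime() follows a two-round pattern: attempt a lockless, seqcount-validated read first, and only take sig->stats_lock when that first round raced an update. Below is a loose userspace analogue of the same pattern built on C11 atomics and a pthread mutex; the structure, its field names and the single fallback round are assumptions made for the example, not the kernel's API.

/* hypothetical userspace sketch: gcc -std=c11 -pthread -o seqread seqread.c */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct group_stats {
	atomic_uint seq;		/* even = stable, odd = update in flight */
	pthread_mutex_t lock;		/* writers hold it; readers only on retry */
	atomic_ulong utime, stime;
};

static void stats_add(struct group_stats *s, unsigned long u, unsigned long st)
{
	pthread_mutex_lock(&s->lock);
	atomic_fetch_add(&s->seq, 1);		/* sequence goes odd */
	atomic_fetch_add(&s->utime, u);
	atomic_fetch_add(&s->stime, st);
	atomic_fetch_add(&s->seq, 1);		/* back to even */
	pthread_mutex_unlock(&s->lock);
}

static void stats_read(struct group_stats *s, unsigned long *u, unsigned long *st)
{
	/* Attempt a lockless read on the first round. */
	unsigned int seq = atomic_load(&s->seq);

	if (!(seq & 1)) {
		*u = atomic_load(&s->utime);
		*st = atomic_load(&s->stime);
		if (atomic_load(&s->seq) == seq)
			return;			/* consistent snapshot */
	}

	/* The lockless access raced a writer: take the lock and redo it. */
	pthread_mutex_lock(&s->lock);
	*u = atomic_load(&s->utime);
	*st = atomic_load(&s->stime);
	pthread_mutex_unlock(&s->lock);
}

int main(void)
{
	struct group_stats s = { .lock = PTHREAD_MUTEX_INITIALIZER };
	unsigned long u, st;

	stats_add(&s, 10, 3);
	stats_read(&s, &u, &st);
	printf("utime=%lu stime=%lu\n", u, st);
	return 0;
}

A writer bumps the sequence count to odd before touching the totals and back to even afterwards, so a reader that sees the same even value before and after copying the fields knows its snapshot is consistent without ever taking the lock.
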
@@ -550,6 +555,23 @@ drop_precision:
550} 555}
551 556
552/* 557/*
558 * Atomically advance counter to the new value. Interrupts, vcpu
559 * scheduling, and scaling inaccuracies can cause cputime_advance
560 * to be occasionally called with a new value smaller than counter.
561 * Let's enforce atomicity.
562 *
563 * Normally a caller will only go through this loop once, or not
564 * at all in case a previous caller updated counter the same jiffy.
565 */
566static void cputime_advance(cputime_t *counter, cputime_t new)
567{
568 cputime_t old;
569
570 while (new > (old = ACCESS_ONCE(*counter)))
571 cmpxchg_cputime(counter, old, new);
572}
573
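
cputime_advance() replaces the old max()-based clamping with an advance-only compare-and-swap loop, so concurrent updaters can never move the accumulated value backwards. A minimal userspace sketch of the same idea with C11 atomics, using invented type and function names:

/* hypothetical userspace sketch: gcc -std=c11 -o advance advance.c */
#include <stdatomic.h>
#include <stdio.h>

typedef unsigned long long cputime;

/* Move *counter forward to new_val, never backward, even under races. */
static void counter_advance(_Atomic cputime *counter, cputime new_val)
{
	cputime old = atomic_load(counter);

	while (new_val > old) {
		/* on failure 'old' is refreshed with the value that won */
		if (atomic_compare_exchange_weak(counter, &old, new_val))
			break;
	}
}

int main(void)
{
	_Atomic cputime prev = 100;

	counter_advance(&prev, 120);	/* advances */
	counter_advance(&prev, 90);	/* stale sample: ignored */
	printf("prev = %llu\n", (unsigned long long)atomic_load(&prev));
	return 0;
}

The second call in main() carries a stale, smaller sample and is simply dropped, which is exactly the monotonicity the comment above asks for.
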
574/*
553 * Adjust tick based cputime random precision against scheduler 575 * Adjust tick based cputime random precision against scheduler
554 * runtime accounting. 576 * runtime accounting.
555 */ 577 */
@@ -594,13 +616,8 @@ static void cputime_adjust(struct task_cputime *curr,
594 utime = rtime - stime; 616 utime = rtime - stime;
595 } 617 }
596 618
597 /* 619 cputime_advance(&prev->stime, stime);
598 * If the tick based count grows faster than the scheduler one, 620 cputime_advance(&prev->utime, utime);
599 * the result of the scaling may go backward.
600 * Let's enforce monotonicity.
601 */
602 prev->stime = max(prev->stime, stime);
603 prev->utime = max(prev->utime, utime);
604 621
605out: 622out:
606 *ut = prev->utime; 623 *ut = prev->utime;
@@ -617,9 +634,6 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
617 cputime_adjust(&cputime, &p->prev_cputime, ut, st); 634 cputime_adjust(&cputime, &p->prev_cputime, ut, st);
618} 635}
619 636
620/*
621 * Must be called with siglock held.
622 */
623void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) 637void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
624{ 638{
625 struct task_cputime cputime; 639 struct task_cputime cputime;
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 255ce138b652..b52092f2636d 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -518,21 +518,29 @@ again:
518 } 518 }
519 519
520 /* 520 /*
521 * We need to take care of a possible races here. In fact, the 521 * We need to take care of several possible races here:
522 * task might have changed its scheduling policy to something 522 *
523 * different from SCHED_DEADLINE or changed its reservation 523 * - the task might have changed its scheduling policy
524 * parameters (through sched_setattr()). 524 * to something different than SCHED_DEADLINE
525 * - the task might have changed its reservation parameters
526 * (through sched_setattr())
527 * - the task might have been boosted by someone else and
528 * might be in the boosting/deboosting path
529 *
                                                      530	 * In all these cases we bail out, as the task is already
531 * in the runqueue or is going to be enqueued back anyway.
525 */ 532 */
526 if (!dl_task(p) || dl_se->dl_new) 533 if (!dl_task(p) || dl_se->dl_new ||
534 dl_se->dl_boosted || !dl_se->dl_throttled)
527 goto unlock; 535 goto unlock;
528 536
529 sched_clock_tick(); 537 sched_clock_tick();
530 update_rq_clock(rq); 538 update_rq_clock(rq);
531 dl_se->dl_throttled = 0; 539 dl_se->dl_throttled = 0;
532 dl_se->dl_yielded = 0; 540 dl_se->dl_yielded = 0;
533 if (p->on_rq) { 541 if (task_on_rq_queued(p)) {
534 enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); 542 enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
535 if (task_has_dl_policy(rq->curr)) 543 if (dl_task(rq->curr))
536 check_preempt_curr_dl(rq, p, 0); 544 check_preempt_curr_dl(rq, p, 0);
537 else 545 else
538 resched_curr(rq); 546 resched_curr(rq);
@@ -555,11 +563,6 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
555{ 563{
556 struct hrtimer *timer = &dl_se->dl_timer; 564 struct hrtimer *timer = &dl_se->dl_timer;
557 565
558 if (hrtimer_active(timer)) {
559 hrtimer_try_to_cancel(timer);
560 return;
561 }
562
563 hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 566 hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
564 timer->function = dl_task_timer; 567 timer->function = dl_task_timer;
565} 568}
@@ -567,24 +570,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
567static 570static
568int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se) 571int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se)
569{ 572{
570 int dmiss = dl_time_before(dl_se->deadline, rq_clock(rq)); 573 return (dl_se->runtime <= 0);
571 int rorun = dl_se->runtime <= 0;
572
573 if (!rorun && !dmiss)
574 return 0;
575
576 /*
577 * If we are beyond our current deadline and we are still
578 * executing, then we have already used some of the runtime of
579 * the next instance. Thus, if we do not account that, we are
580 * stealing bandwidth from the system at each deadline miss!
581 */
582 if (dmiss) {
583 dl_se->runtime = rorun ? dl_se->runtime : 0;
584 dl_se->runtime -= rq_clock(rq) - dl_se->deadline;
585 }
586
587 return 1;
588} 574}
589 575
590extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); 576extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
@@ -625,7 +611,7 @@ static void update_curr_dl(struct rq *rq)
625 611
626 sched_rt_avg_update(rq, delta_exec); 612 sched_rt_avg_update(rq, delta_exec);
627 613
628 dl_se->runtime -= delta_exec; 614 dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec;
629 if (dl_runtime_exceeded(rq, dl_se)) { 615 if (dl_runtime_exceeded(rq, dl_se)) {
630 __dequeue_task_dl(rq, curr, 0); 616 __dequeue_task_dl(rq, curr, 0);
631 if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted))) 617 if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted)))
@@ -823,10 +809,10 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se,
823 * parameters of the task might need updating. Otherwise, 809 * parameters of the task might need updating. Otherwise,
824 * we want a replenishment of its runtime. 810 * we want a replenishment of its runtime.
825 */ 811 */
826 if (!dl_se->dl_new && flags & ENQUEUE_REPLENISH) 812 if (dl_se->dl_new || flags & ENQUEUE_WAKEUP)
827 replenish_dl_entity(dl_se, pi_se);
828 else
829 update_dl_entity(dl_se, pi_se); 813 update_dl_entity(dl_se, pi_se);
814 else if (flags & ENQUEUE_REPLENISH)
815 replenish_dl_entity(dl_se, pi_se);
830 816
831 __enqueue_dl_entity(dl_se); 817 __enqueue_dl_entity(dl_se);
832} 818}
@@ -847,8 +833,19 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
847 * smaller than our one... OTW we keep our runtime and 833 * smaller than our one... OTW we keep our runtime and
848 * deadline. 834 * deadline.
849 */ 835 */
850 if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) 836 if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) {
851 pi_se = &pi_task->dl; 837 pi_se = &pi_task->dl;
838 } else if (!dl_prio(p->normal_prio)) {
839 /*
840 * Special case in which we have a !SCHED_DEADLINE task
                                                      841		 * that is going to be deboosted, but exceeds its
842 * runtime while doing so. No point in replenishing
843 * it, as it's going to return back to its original
844 * scheduling class after this.
845 */
846 BUG_ON(!p->dl.dl_boosted || flags != ENQUEUE_REPLENISH);
847 return;
848 }
852 849
853 /* 850 /*
854 * If p is throttled, we do nothing. In fact, if it exhausted 851 * If p is throttled, we do nothing. In fact, if it exhausted
@@ -914,7 +911,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
914 struct task_struct *curr; 911 struct task_struct *curr;
915 struct rq *rq; 912 struct rq *rq;
916 913
917 if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) 914 if (sd_flag != SD_BALANCE_WAKE)
918 goto out; 915 goto out;
919 916
920 rq = cpu_rq(cpu); 917 rq = cpu_rq(cpu);
@@ -997,10 +994,11 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
997#ifdef CONFIG_SCHED_HRTICK 994#ifdef CONFIG_SCHED_HRTICK
998static void start_hrtick_dl(struct rq *rq, struct task_struct *p) 995static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
999{ 996{
1000 s64 delta = p->dl.dl_runtime - p->dl.runtime; 997 hrtick_start(rq, p->dl.runtime);
1001 998}
1002 if (delta > 10000) 999#else /* !CONFIG_SCHED_HRTICK */
1003 hrtick_start(rq, p->dl.runtime); 1000static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
1001{
1004} 1002}
1005#endif 1003#endif
1006 1004
@@ -1030,7 +1028,7 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
1030 * means a stop task can slip in, in which case we need to 1028 * means a stop task can slip in, in which case we need to
1031 * re-start task selection. 1029 * re-start task selection.
1032 */ 1030 */
1033 if (rq->stop && rq->stop->on_rq) 1031 if (rq->stop && task_on_rq_queued(rq->stop))
1034 return RETRY_TASK; 1032 return RETRY_TASK;
1035 } 1033 }
1036 1034
@@ -1055,10 +1053,8 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
1055 /* Running task will never be pushed. */ 1053 /* Running task will never be pushed. */
1056 dequeue_pushable_dl_task(rq, p); 1054 dequeue_pushable_dl_task(rq, p);
1057 1055
1058#ifdef CONFIG_SCHED_HRTICK
1059 if (hrtick_enabled(rq)) 1056 if (hrtick_enabled(rq))
1060 start_hrtick_dl(rq, p); 1057 start_hrtick_dl(rq, p);
1061#endif
1062 1058
1063 set_post_schedule(rq); 1059 set_post_schedule(rq);
1064 1060
@@ -1077,10 +1073,8 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
1077{ 1073{
1078 update_curr_dl(rq); 1074 update_curr_dl(rq);
1079 1075
1080#ifdef CONFIG_SCHED_HRTICK
1081 if (hrtick_enabled(rq) && queued && p->dl.runtime > 0) 1076 if (hrtick_enabled(rq) && queued && p->dl.runtime > 0)
1082 start_hrtick_dl(rq, p); 1077 start_hrtick_dl(rq, p);
1083#endif
1084} 1078}
1085 1079
1086static void task_fork_dl(struct task_struct *p) 1080static void task_fork_dl(struct task_struct *p)
@@ -1124,10 +1118,8 @@ static void set_curr_task_dl(struct rq *rq)
1124static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) 1118static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
1125{ 1119{
1126 if (!task_running(rq, p) && 1120 if (!task_running(rq, p) &&
1127 (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) && 1121 cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
1128 (p->nr_cpus_allowed > 1))
1129 return 1; 1122 return 1;
1130
1131 return 0; 1123 return 0;
1132} 1124}
1133 1125
@@ -1158,7 +1150,7 @@ static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl);
1158static int find_later_rq(struct task_struct *task) 1150static int find_later_rq(struct task_struct *task)
1159{ 1151{
1160 struct sched_domain *sd; 1152 struct sched_domain *sd;
1161 struct cpumask *later_mask = __get_cpu_var(local_cpu_mask_dl); 1153 struct cpumask *later_mask = this_cpu_cpumask_var_ptr(local_cpu_mask_dl);
1162 int this_cpu = smp_processor_id(); 1154 int this_cpu = smp_processor_id();
1163 int best_cpu, cpu = task_cpu(task); 1155 int best_cpu, cpu = task_cpu(task);
1164 1156
@@ -1169,6 +1161,13 @@ static int find_later_rq(struct task_struct *task)
1169 if (task->nr_cpus_allowed == 1) 1161 if (task->nr_cpus_allowed == 1)
1170 return -1; 1162 return -1;
1171 1163
1164 /*
1165 * We have to consider system topology and task affinity
1166 * first, then we can look for a suitable cpu.
1167 */
1168 cpumask_copy(later_mask, task_rq(task)->rd->span);
1169 cpumask_and(later_mask, later_mask, cpu_active_mask);
1170 cpumask_and(later_mask, later_mask, &task->cpus_allowed);
1172 best_cpu = cpudl_find(&task_rq(task)->rd->cpudl, 1171 best_cpu = cpudl_find(&task_rq(task)->rd->cpudl,
1173 task, later_mask); 1172 task, later_mask);
1174 if (best_cpu == -1) 1173 if (best_cpu == -1)
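
The three cpumask operations added to find_later_rq() narrow the candidate set to root-domain CPUs that are both active and inside the task's affinity mask before cpudl_find() is consulted. The toy bitmask arithmetic below merely restates that intersection order with invented mask values.

/* hypothetical userspace sketch: gcc -std=c11 -o later_mask later_mask.c */
#include <stdio.h>

int main(void)
{
	/* 8-CPU toy masks, bit i == cpu i */
	unsigned int rd_span      = 0x0f;	/* root domain: cpus 0-3   */
	unsigned int cpu_active   = 0xfe;	/* cpu 0 is offline        */
	unsigned int cpus_allowed = 0x0a;	/* task affinity: cpus 1,3 */

	/* topology and affinity first; the deadline heap picks among these */
	unsigned int later_mask = rd_span & cpu_active & cpus_allowed;

	printf("candidate cpus: 0x%02x\n", later_mask);	/* prints 0x0a */
	return 0;
}
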
@@ -1257,7 +1256,8 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
1257 if (unlikely(task_rq(task) != rq || 1256 if (unlikely(task_rq(task) != rq ||
1258 !cpumask_test_cpu(later_rq->cpu, 1257 !cpumask_test_cpu(later_rq->cpu,
1259 &task->cpus_allowed) || 1258 &task->cpus_allowed) ||
1260 task_running(rq, task) || !task->on_rq)) { 1259 task_running(rq, task) ||
1260 !task_on_rq_queued(task))) {
1261 double_unlock_balance(rq, later_rq); 1261 double_unlock_balance(rq, later_rq);
1262 later_rq = NULL; 1262 later_rq = NULL;
1263 break; 1263 break;
@@ -1296,7 +1296,7 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
1296 BUG_ON(task_current(rq, p)); 1296 BUG_ON(task_current(rq, p));
1297 BUG_ON(p->nr_cpus_allowed <= 1); 1297 BUG_ON(p->nr_cpus_allowed <= 1);
1298 1298
1299 BUG_ON(!p->on_rq); 1299 BUG_ON(!task_on_rq_queued(p));
1300 BUG_ON(!dl_task(p)); 1300 BUG_ON(!dl_task(p));
1301 1301
1302 return p; 1302 return p;
@@ -1311,6 +1311,7 @@ static int push_dl_task(struct rq *rq)
1311{ 1311{
1312 struct task_struct *next_task; 1312 struct task_struct *next_task;
1313 struct rq *later_rq; 1313 struct rq *later_rq;
1314 int ret = 0;
1314 1315
1315 if (!rq->dl.overloaded) 1316 if (!rq->dl.overloaded)
1316 return 0; 1317 return 0;
@@ -1356,7 +1357,6 @@ retry:
1356 * The task is still there. We don't try 1357 * The task is still there. We don't try
1357 * again, some other cpu will pull it when ready. 1358 * again, some other cpu will pull it when ready.
1358 */ 1359 */
1359 dequeue_pushable_dl_task(rq, next_task);
1360 goto out; 1360 goto out;
1361 } 1361 }
1362 1362
@@ -1372,6 +1372,7 @@ retry:
1372 deactivate_task(rq, next_task, 0); 1372 deactivate_task(rq, next_task, 0);
1373 set_task_cpu(next_task, later_rq->cpu); 1373 set_task_cpu(next_task, later_rq->cpu);
1374 activate_task(later_rq, next_task, 0); 1374 activate_task(later_rq, next_task, 0);
1375 ret = 1;
1375 1376
1376 resched_curr(later_rq); 1377 resched_curr(later_rq);
1377 1378
@@ -1380,7 +1381,7 @@ retry:
1380out: 1381out:
1381 put_task_struct(next_task); 1382 put_task_struct(next_task);
1382 1383
1383 return 1; 1384 return ret;
1384} 1385}
1385 1386
1386static void push_dl_tasks(struct rq *rq) 1387static void push_dl_tasks(struct rq *rq)
@@ -1443,7 +1444,7 @@ static int pull_dl_task(struct rq *this_rq)
1443 dl_time_before(p->dl.deadline, 1444 dl_time_before(p->dl.deadline,
1444 this_rq->dl.earliest_dl.curr))) { 1445 this_rq->dl.earliest_dl.curr))) {
1445 WARN_ON(p == src_rq->curr); 1446 WARN_ON(p == src_rq->curr);
1446 WARN_ON(!p->on_rq); 1447 WARN_ON(!task_on_rq_queued(p));
1447 1448
1448 /* 1449 /*
1449 * Then we pull iff p has actually an earlier 1450 * Then we pull iff p has actually an earlier
@@ -1486,7 +1487,7 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p)
1486 p->nr_cpus_allowed > 1 && 1487 p->nr_cpus_allowed > 1 &&
1487 dl_task(rq->curr) && 1488 dl_task(rq->curr) &&
1488 (rq->curr->nr_cpus_allowed < 2 || 1489 (rq->curr->nr_cpus_allowed < 2 ||
1489 dl_entity_preempt(&rq->curr->dl, &p->dl))) { 1490 !dl_entity_preempt(&p->dl, &rq->curr->dl))) {
1490 push_dl_tasks(rq); 1491 push_dl_tasks(rq);
1491 } 1492 }
1492} 1493}
@@ -1495,10 +1496,33 @@ static void set_cpus_allowed_dl(struct task_struct *p,
1495 const struct cpumask *new_mask) 1496 const struct cpumask *new_mask)
1496{ 1497{
1497 struct rq *rq; 1498 struct rq *rq;
1499 struct root_domain *src_rd;
1498 int weight; 1500 int weight;
1499 1501
1500 BUG_ON(!dl_task(p)); 1502 BUG_ON(!dl_task(p));
1501 1503
1504 rq = task_rq(p);
1505 src_rd = rq->rd;
1506 /*
1507 * Migrating a SCHED_DEADLINE task between exclusive
1508 * cpusets (different root_domains) entails a bandwidth
1509 * update. We already made space for us in the destination
1510 * domain (see cpuset_can_attach()).
1511 */
1512 if (!cpumask_intersects(src_rd->span, new_mask)) {
1513 struct dl_bw *src_dl_b;
1514
1515 src_dl_b = dl_bw_of(cpu_of(rq));
1516 /*
1517 * We now free resources of the root_domain we are migrating
                                                      1518		 * off. In the worst case, sched_setattr() may temporarily fail
1519 * until we complete the update.
1520 */
1521 raw_spin_lock(&src_dl_b->lock);
1522 __dl_clear(src_dl_b, p->dl.dl_bw);
1523 raw_spin_unlock(&src_dl_b->lock);
1524 }
1525
1502 /* 1526 /*
1503 * Update only if the task is actually running (i.e., 1527 * Update only if the task is actually running (i.e.,
1504 * it is on the rq AND it is not throttled). 1528 * it is on the rq AND it is not throttled).
@@ -1515,8 +1539,6 @@ static void set_cpus_allowed_dl(struct task_struct *p,
1515 if ((p->nr_cpus_allowed > 1) == (weight > 1)) 1539 if ((p->nr_cpus_allowed > 1) == (weight > 1))
1516 return; 1540 return;
1517 1541
1518 rq = task_rq(p);
1519
1520 /* 1542 /*
1521 * The process used to be able to migrate OR it can now migrate 1543 * The process used to be able to migrate OR it can now migrate
1522 */ 1544 */
@@ -1564,20 +1586,48 @@ void init_sched_dl_class(void)
1564 1586
1565#endif /* CONFIG_SMP */ 1587#endif /* CONFIG_SMP */
1566 1588
1589/*
1590 * Ensure p's dl_timer is cancelled. May drop rq->lock for a while.
1591 */
1592static void cancel_dl_timer(struct rq *rq, struct task_struct *p)
1593{
1594 struct hrtimer *dl_timer = &p->dl.dl_timer;
1595
1596 /* Nobody will change task's class if pi_lock is held */
1597 lockdep_assert_held(&p->pi_lock);
1598
1599 if (hrtimer_active(dl_timer)) {
1600 int ret = hrtimer_try_to_cancel(dl_timer);
1601
1602 if (unlikely(ret == -1)) {
1603 /*
1604 * Note, p may migrate OR new deadline tasks
1605 * may appear in rq when we are unlocking it.
1606 * A caller of us must be fine with that.
1607 */
1608 raw_spin_unlock(&rq->lock);
1609 hrtimer_cancel(dl_timer);
1610 raw_spin_lock(&rq->lock);
1611 }
1612 }
1613}
1614
1567static void switched_from_dl(struct rq *rq, struct task_struct *p) 1615static void switched_from_dl(struct rq *rq, struct task_struct *p)
1568{ 1616{
1569 if (hrtimer_active(&p->dl.dl_timer) && !dl_policy(p->policy)) 1617 cancel_dl_timer(rq, p);
1570 hrtimer_try_to_cancel(&p->dl.dl_timer); 1618
1619 __dl_clear_params(p);
1571 1620
1572#ifdef CONFIG_SMP
1573 /* 1621 /*
1574 * Since this might be the only -deadline task on the rq, 1622 * Since this might be the only -deadline task on the rq,
1575 * this is the right place to try to pull some other one 1623 * this is the right place to try to pull some other one
1576 * from an overloaded cpu, if any. 1624 * from an overloaded cpu, if any.
1577 */ 1625 */
1578 if (!rq->dl.dl_nr_running) 1626 if (!task_on_rq_queued(p) || rq->dl.dl_nr_running)
1579 pull_dl_task(rq); 1627 return;
1580#endif 1628
1629 if (pull_dl_task(rq))
1630 resched_curr(rq);
1581} 1631}
1582 1632
1583/* 1633/*
@@ -1596,14 +1646,19 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
1596 if (unlikely(p->dl.dl_throttled)) 1646 if (unlikely(p->dl.dl_throttled))
1597 return; 1647 return;
1598 1648
1599 if (p->on_rq && rq->curr != p) { 1649 if (task_on_rq_queued(p) && rq->curr != p) {
1600#ifdef CONFIG_SMP 1650#ifdef CONFIG_SMP
1601 if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p)) 1651 if (p->nr_cpus_allowed > 1 && rq->dl.overloaded &&
1652 push_dl_task(rq) && rq != task_rq(p))
1602 /* Only reschedule if pushing failed */ 1653 /* Only reschedule if pushing failed */
1603 check_resched = 0; 1654 check_resched = 0;
1604#endif /* CONFIG_SMP */ 1655#endif /* CONFIG_SMP */
1605 if (check_resched && task_has_dl_policy(rq->curr)) 1656 if (check_resched) {
1606 check_preempt_curr_dl(rq, p, 0); 1657 if (dl_task(rq->curr))
1658 check_preempt_curr_dl(rq, p, 0);
1659 else
1660 resched_curr(rq);
1661 }
1607 } 1662 }
1608} 1663}
1609 1664
@@ -1614,7 +1669,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
1614static void prio_changed_dl(struct rq *rq, struct task_struct *p, 1669static void prio_changed_dl(struct rq *rq, struct task_struct *p,
1615 int oldprio) 1670 int oldprio)
1616{ 1671{
1617 if (p->on_rq || rq->curr == p) { 1672 if (task_on_rq_queued(p) || rq->curr == p) {
1618#ifdef CONFIG_SMP 1673#ifdef CONFIG_SMP
1619 /* 1674 /*
1620 * This might be too much, but unfortunately 1675 * This might be too much, but unfortunately
@@ -1673,4 +1728,15 @@ const struct sched_class dl_sched_class = {
1673 .prio_changed = prio_changed_dl, 1728 .prio_changed = prio_changed_dl,
1674 .switched_from = switched_from_dl, 1729 .switched_from = switched_from_dl,
1675 .switched_to = switched_to_dl, 1730 .switched_to = switched_to_dl,
1731
1732 .update_curr = update_curr_dl,
1676}; 1733};
1734
1735#ifdef CONFIG_SCHED_DEBUG
1736extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq);
1737
1738void print_dl_stats(struct seq_file *m, int cpu)
1739{
1740 print_dl_rq(m, cpu, &cpu_rq(cpu)->dl);
1741}
1742#endif /* CONFIG_SCHED_DEBUG */
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 627b3c34b821..92cc52001e74 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -150,7 +150,6 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
150static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) 150static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
151{ 151{
152 struct task_struct *g, *p; 152 struct task_struct *g, *p;
153 unsigned long flags;
154 153
155 SEQ_printf(m, 154 SEQ_printf(m,
156 "\nrunnable tasks:\n" 155 "\nrunnable tasks:\n"
@@ -159,16 +158,14 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
159 "------------------------------------------------------" 158 "------------------------------------------------------"
160 "----------------------------------------------------\n"); 159 "----------------------------------------------------\n");
161 160
162 read_lock_irqsave(&tasklist_lock, flags); 161 rcu_read_lock();
163 162 for_each_process_thread(g, p) {
164 do_each_thread(g, p) {
165 if (task_cpu(p) != rq_cpu) 163 if (task_cpu(p) != rq_cpu)
166 continue; 164 continue;
167 165
168 print_task(m, rq, p); 166 print_task(m, rq, p);
169 } while_each_thread(g, p); 167 }
170 168 rcu_read_unlock();
171 read_unlock_irqrestore(&tasklist_lock, flags);
172} 169}
173 170
174void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) 171void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
@@ -264,6 +261,12 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
264#undef P 261#undef P
265} 262}
266 263
264void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq)
265{
266 SEQ_printf(m, "\ndl_rq[%d]:\n", cpu);
267 SEQ_printf(m, " .%-30s: %ld\n", "dl_nr_running", dl_rq->dl_nr_running);
268}
269
267extern __read_mostly int sched_clock_running; 270extern __read_mostly int sched_clock_running;
268 271
269static void print_cpu(struct seq_file *m, int cpu) 272static void print_cpu(struct seq_file *m, int cpu)
@@ -332,10 +335,9 @@ do { \
332 spin_lock_irqsave(&sched_debug_lock, flags); 335 spin_lock_irqsave(&sched_debug_lock, flags);
333 print_cfs_stats(m, cpu); 336 print_cfs_stats(m, cpu);
334 print_rt_stats(m, cpu); 337 print_rt_stats(m, cpu);
338 print_dl_stats(m, cpu);
335 339
336 rcu_read_lock();
337 print_rq(m, rq, cpu); 340 print_rq(m, rq, cpu);
338 rcu_read_unlock();
339 spin_unlock_irqrestore(&sched_debug_lock, flags); 341 spin_unlock_irqrestore(&sched_debug_lock, flags);
340 SEQ_printf(m, "\n"); 342 SEQ_printf(m, "\n");
341} 343}
@@ -533,8 +535,8 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m)
533 unsigned long nr_faults = -1; 535 unsigned long nr_faults = -1;
534 int cpu_current, home_node; 536 int cpu_current, home_node;
535 537
536 if (p->numa_faults_memory) 538 if (p->numa_faults)
537 nr_faults = p->numa_faults_memory[2*node + i]; 539 nr_faults = p->numa_faults[2*node + i];
538 540
539 cpu_current = !i ? (task_node(p) == node) : 541 cpu_current = !i ? (task_node(p) == node) :
540 (pol && node_isset(node, pol->v.nodes)); 542 (pol && node_isset(node, pol->v.nodes));
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index bfa3c86d0d68..40667cbf371b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -23,6 +23,7 @@
23#include <linux/latencytop.h> 23#include <linux/latencytop.h>
24#include <linux/sched.h> 24#include <linux/sched.h>
25#include <linux/cpumask.h> 25#include <linux/cpumask.h>
26#include <linux/cpuidle.h>
26#include <linux/slab.h> 27#include <linux/slab.h>
27#include <linux/profile.h> 28#include <linux/profile.h>
28#include <linux/interrupt.h> 29#include <linux/interrupt.h>
@@ -665,6 +666,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
665} 666}
666 667
667#ifdef CONFIG_SMP 668#ifdef CONFIG_SMP
669static int select_idle_sibling(struct task_struct *p, int cpu);
668static unsigned long task_h_load(struct task_struct *p); 670static unsigned long task_h_load(struct task_struct *p);
669 671
670static inline void __update_task_entity_contrib(struct sched_entity *se); 672static inline void __update_task_entity_contrib(struct sched_entity *se);
@@ -724,6 +726,11 @@ static void update_curr(struct cfs_rq *cfs_rq)
724 account_cfs_rq_runtime(cfs_rq, delta_exec); 726 account_cfs_rq_runtime(cfs_rq, delta_exec);
725} 727}
726 728
729static void update_curr_fair(struct rq *rq)
730{
731 update_curr(cfs_rq_of(&rq->curr->se));
732}
733
727static inline void 734static inline void
728update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) 735update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
729{ 736{
@@ -826,11 +833,12 @@ static unsigned int task_nr_scan_windows(struct task_struct *p)
826 833
827static unsigned int task_scan_min(struct task_struct *p) 834static unsigned int task_scan_min(struct task_struct *p)
828{ 835{
836 unsigned int scan_size = ACCESS_ONCE(sysctl_numa_balancing_scan_size);
829 unsigned int scan, floor; 837 unsigned int scan, floor;
830 unsigned int windows = 1; 838 unsigned int windows = 1;
831 839
832 if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW) 840 if (scan_size < MAX_SCAN_WINDOW)
833 windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size; 841 windows = MAX_SCAN_WINDOW / scan_size;
834 floor = 1000 / windows; 842 floor = 1000 / windows;
835 843
836 scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p); 844 scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
@@ -865,7 +873,6 @@ struct numa_group {
865 spinlock_t lock; /* nr_tasks, tasks */ 873 spinlock_t lock; /* nr_tasks, tasks */
866 int nr_tasks; 874 int nr_tasks;
867 pid_t gid; 875 pid_t gid;
868 struct list_head task_list;
869 876
870 struct rcu_head rcu; 877 struct rcu_head rcu;
871 nodemask_t active_nodes; 878 nodemask_t active_nodes;
@@ -893,18 +900,24 @@ pid_t task_numa_group_id(struct task_struct *p)
893 return p->numa_group ? p->numa_group->gid : 0; 900 return p->numa_group ? p->numa_group->gid : 0;
894} 901}
895 902
896static inline int task_faults_idx(int nid, int priv) 903/*
904 * The averaged statistics, shared & private, memory & cpu,
905 * occupy the first half of the array. The second half of the
906 * array is for current counters, which are averaged into the
907 * first set by task_numa_placement.
908 */
909static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
897{ 910{
898 return NR_NUMA_HINT_FAULT_TYPES * nid + priv; 911 return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
899} 912}
900 913
901static inline unsigned long task_faults(struct task_struct *p, int nid) 914static inline unsigned long task_faults(struct task_struct *p, int nid)
902{ 915{
903 if (!p->numa_faults_memory) 916 if (!p->numa_faults)
904 return 0; 917 return 0;
905 918
906 return p->numa_faults_memory[task_faults_idx(nid, 0)] + 919 return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
907 p->numa_faults_memory[task_faults_idx(nid, 1)]; 920 p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
908} 921}
909 922
910static inline unsigned long group_faults(struct task_struct *p, int nid) 923static inline unsigned long group_faults(struct task_struct *p, int nid)
@@ -912,14 +925,79 @@ static inline unsigned long group_faults(struct task_struct *p, int nid)
912 if (!p->numa_group) 925 if (!p->numa_group)
913 return 0; 926 return 0;
914 927
915 return p->numa_group->faults[task_faults_idx(nid, 0)] + 928 return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
916 p->numa_group->faults[task_faults_idx(nid, 1)]; 929 p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)];
917} 930}
918 931
919static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) 932static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
920{ 933{
921 return group->faults_cpu[task_faults_idx(nid, 0)] + 934 return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
922 group->faults_cpu[task_faults_idx(nid, 1)]; 935 group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
936}
937
938/* Handle placement on systems where not all nodes are directly connected. */
939static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
940 int maxdist, bool task)
941{
942 unsigned long score = 0;
943 int node;
944
945 /*
946 * All nodes are directly connected, and the same distance
947 * from each other. No need for fancy placement algorithms.
948 */
949 if (sched_numa_topology_type == NUMA_DIRECT)
950 return 0;
951
952 /*
953 * This code is called for each node, introducing N^2 complexity,
954 * which should be ok given the number of nodes rarely exceeds 8.
955 */
956 for_each_online_node(node) {
957 unsigned long faults;
958 int dist = node_distance(nid, node);
959
960 /*
961 * The furthest away nodes in the system are not interesting
962 * for placement; nid was already counted.
963 */
964 if (dist == sched_max_numa_distance || node == nid)
965 continue;
966
967 /*
968 * On systems with a backplane NUMA topology, compare groups
969 * of nodes, and move tasks towards the group with the most
970 * memory accesses. When comparing two nodes at distance
971 * "hoplimit", only nodes closer by than "hoplimit" are part
972 * of each group. Skip other nodes.
973 */
974 if (sched_numa_topology_type == NUMA_BACKPLANE &&
975 dist > maxdist)
976 continue;
977
978 /* Add up the faults from nearby nodes. */
979 if (task)
980 faults = task_faults(p, node);
981 else
982 faults = group_faults(p, node);
983
984 /*
985 * On systems with a glueless mesh NUMA topology, there are
986 * no fixed "groups of nodes". Instead, nodes that are not
987 * directly connected bounce traffic through intermediate
988 * nodes; a numa_group can occupy any set of nodes.
989 * The further away a node is, the less the faults count.
990 * This seems to result in good task placement.
991 */
992 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
993 faults *= (sched_max_numa_distance - dist);
994 faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
995 }
996
997 score += faults;
998 }
999
1000 return score;
923} 1001}
924 1002
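
On glueless-mesh systems, score_nearby_nodes() scales each nearby node's faults linearly with distance, so closer nodes count almost in full and the furthest ones not at all. The helper below reproduces just that scaling with invented distances, assuming the conventional SLIT self-distance of 10 for LOCAL_DISTANCE.

/* hypothetical userspace sketch: gcc -std=c11 -o numa_score numa_score.c */
#include <stdio.h>

#define LOCAL_DISTANCE	10	/* assumed SLIT distance of a node to itself */

/*
 * Glueless-mesh weighting as in the hunk above: nearby nodes contribute
 * their full fault count, the furthest nodes are skipped entirely, and
 * everything in between is scaled linearly by distance.
 */
static unsigned long scaled_faults(unsigned long faults, int dist, int max_dist)
{
	if (dist >= max_dist)
		return 0;
	faults *= (unsigned long)(max_dist - dist);
	faults /= (unsigned long)(max_dist - LOCAL_DISTANCE);
	return faults;
}

int main(void)
{
	int max_dist = 40;		/* toy sched_max_numa_distance */
	int dists[] = { 10, 20, 30, 40 };

	for (unsigned int i = 0; i < sizeof(dists) / sizeof(dists[0]); i++)
		printf("dist %2d -> %lu of 1000 faults counted\n",
		       dists[i], scaled_faults(1000, dists[i], max_dist));
	return 0;
}

For a 1000-fault node and a maximum distance of 40, distances 10, 20, 30 and 40 contribute 1000, 666, 333 and 0 faults respectively.
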
925/* 1003/*
@@ -928,11 +1006,12 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
928 * larger multiplier, in order to group tasks together that are almost 1006 * larger multiplier, in order to group tasks together that are almost
929 * evenly spread out between numa nodes. 1007 * evenly spread out between numa nodes.
930 */ 1008 */
931static inline unsigned long task_weight(struct task_struct *p, int nid) 1009static inline unsigned long task_weight(struct task_struct *p, int nid,
1010 int dist)
932{ 1011{
933 unsigned long total_faults; 1012 unsigned long faults, total_faults;
934 1013
935 if (!p->numa_faults_memory) 1014 if (!p->numa_faults)
936 return 0; 1015 return 0;
937 1016
938 total_faults = p->total_numa_faults; 1017 total_faults = p->total_numa_faults;
@@ -940,15 +1019,29 @@ static inline unsigned long task_weight(struct task_struct *p, int nid)
940 if (!total_faults) 1019 if (!total_faults)
941 return 0; 1020 return 0;
942 1021
943 return 1000 * task_faults(p, nid) / total_faults; 1022 faults = task_faults(p, nid);
1023 faults += score_nearby_nodes(p, nid, dist, true);
1024
1025 return 1000 * faults / total_faults;
944} 1026}
945 1027
946static inline unsigned long group_weight(struct task_struct *p, int nid) 1028static inline unsigned long group_weight(struct task_struct *p, int nid,
1029 int dist)
947{ 1030{
948 if (!p->numa_group || !p->numa_group->total_faults) 1031 unsigned long faults, total_faults;
1032
1033 if (!p->numa_group)
949 return 0; 1034 return 0;
950 1035
951 return 1000 * group_faults(p, nid) / p->numa_group->total_faults; 1036 total_faults = p->numa_group->total_faults;
1037
1038 if (!total_faults)
1039 return 0;
1040
1041 faults = group_faults(p, nid);
1042 faults += score_nearby_nodes(p, nid, dist, false);
1043
1044 return 1000 * faults / total_faults;
952} 1045}
953 1046
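With score_nearby_nodes() folded in, task_weight() and group_weight() reduce to the same normalization: own-node faults plus the nearby score, expressed in thousandths of the total. A toy calculation under assumed numbers (not taken from any real workload):

#include <stdio.h>

/* weight in [0, 1000]: share of all faults attributable to this node and its neighbourhood */
static unsigned long node_weight(unsigned long node_faults,
				 unsigned long nearby_score,
				 unsigned long total_faults)
{
	if (!total_faults)
		return 0;
	return 1000 * (node_faults + nearby_score) / total_faults;
}

int main(void)
{
	/* e.g. 300 faults on the node itself, 600 distance-scaled nearby, 3000 total */
	printf("weight = %lu\n", node_weight(300, 600, 3000));	/* prints 300 */
	return 0;
}
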
954bool should_numa_migrate_memory(struct task_struct *p, struct page * page, 1047bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
@@ -1038,7 +1131,8 @@ struct numa_stats {
1038 */ 1131 */
1039static void update_numa_stats(struct numa_stats *ns, int nid) 1132static void update_numa_stats(struct numa_stats *ns, int nid)
1040{ 1133{
1041 int cpu, cpus = 0; 1134 int smt, cpu, cpus = 0;
1135 unsigned long capacity;
1042 1136
1043 memset(ns, 0, sizeof(*ns)); 1137 memset(ns, 0, sizeof(*ns));
1044 for_each_cpu(cpu, cpumask_of_node(nid)) { 1138 for_each_cpu(cpu, cpumask_of_node(nid)) {
@@ -1062,8 +1156,12 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
1062 if (!cpus) 1156 if (!cpus)
1063 return; 1157 return;
1064 1158
1065 ns->task_capacity = 1159 /* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */
1066 DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE); 1160 smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
1161 capacity = cpus / smt; /* cores */
1162
1163 ns->task_capacity = min_t(unsigned, capacity,
1164 DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
1067 ns->has_free_capacity = (ns->nr_running < ns->task_capacity); 1165 ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
1068} 1166}
1069 1167
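The new task_capacity computation first estimates the SMT fan-out of the node from its total compute capacity (a core with hyperthreads contributes a bit more than SCHED_CAPACITY_SCALE, but less than twice as much), then caps task_capacity at the number of cores. A worked example with invented numbers:

#include <stdio.h>

#define SCHED_CAPACITY_SCALE	1024
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
#define DIV_ROUND_CLOSEST(n, d)	(((n) + (d) / 2) / (d))

int main(void)
{
	unsigned long cpus = 8;				/* 4 cores, 2 SMT siblings each */
	unsigned long compute_capacity = 4 * 1178;	/* ~1.15 * SCALE per core (assumed) */
	unsigned long smt, capacity, by_capacity, task_capacity;

	/* smt := ceil(cpus / (compute_capacity / SCALE)), i.e. threads per core */
	smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, compute_capacity);
	capacity = cpus / smt;				/* cores */
	by_capacity = DIV_ROUND_CLOSEST(compute_capacity, SCHED_CAPACITY_SCALE);
	task_capacity = capacity < by_capacity ? capacity : by_capacity;

	/* prints smt=2 cores=4 task_capacity=4: capped at the core count, not 5 */
	printf("smt=%lu cores=%lu task_capacity=%lu\n", smt, capacity, task_capacity);
	return 0;
}
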
@@ -1076,6 +1174,7 @@ struct task_numa_env {
1076 struct numa_stats src_stats, dst_stats; 1174 struct numa_stats src_stats, dst_stats;
1077 1175
1078 int imbalance_pct; 1176 int imbalance_pct;
1177 int dist;
1079 1178
1080 struct task_struct *best_task; 1179 struct task_struct *best_task;
1081 long best_imp; 1180 long best_imp;
@@ -1155,11 +1254,29 @@ static void task_numa_compare(struct task_numa_env *env,
1155 long load; 1254 long load;
1156 long imp = env->p->numa_group ? groupimp : taskimp; 1255 long imp = env->p->numa_group ? groupimp : taskimp;
1157 long moveimp = imp; 1256 long moveimp = imp;
1257 int dist = env->dist;
1158 1258
1159 rcu_read_lock(); 1259 rcu_read_lock();
1160 cur = ACCESS_ONCE(dst_rq->curr); 1260
1161 if (cur->pid == 0) /* idle */ 1261 raw_spin_lock_irq(&dst_rq->lock);
1262 cur = dst_rq->curr;
1263 /*
1264 * No need to move the exiting task, and this ensures that ->curr
1265 * wasn't reaped and thus get_task_struct() in task_numa_assign()
1266 * is safe under RCU read lock.
1267 * Note that rcu_read_lock() itself can't protect from the final
1268 * put_task_struct() after the last schedule().
1269 */
1270 if ((cur->flags & PF_EXITING) || is_idle_task(cur))
1162 cur = NULL; 1271 cur = NULL;
1272 raw_spin_unlock_irq(&dst_rq->lock);
1273
1274 /*
1275 * Because we have preemption enabled we can get migrated around and
 1276 * end up trying to select ourselves (current == env->p) as a swap candidate.
1277 */
1278 if (cur == env->p)
1279 goto unlock;
1163 1280
1164 /* 1281 /*
1165 * "imp" is the fault differential for the source task between the 1282 * "imp" is the fault differential for the source task between the
@@ -1178,8 +1295,8 @@ static void task_numa_compare(struct task_numa_env *env,
1178 * in any group then look only at task weights. 1295 * in any group then look only at task weights.
1179 */ 1296 */
1180 if (cur->numa_group == env->p->numa_group) { 1297 if (cur->numa_group == env->p->numa_group) {
1181 imp = taskimp + task_weight(cur, env->src_nid) - 1298 imp = taskimp + task_weight(cur, env->src_nid, dist) -
1182 task_weight(cur, env->dst_nid); 1299 task_weight(cur, env->dst_nid, dist);
1183 /* 1300 /*
1184 * Add some hysteresis to prevent swapping the 1301 * Add some hysteresis to prevent swapping the
1185 * tasks within a group over tiny differences. 1302 * tasks within a group over tiny differences.
@@ -1193,11 +1310,11 @@ static void task_numa_compare(struct task_numa_env *env,
1193 * instead. 1310 * instead.
1194 */ 1311 */
1195 if (cur->numa_group) 1312 if (cur->numa_group)
1196 imp += group_weight(cur, env->src_nid) - 1313 imp += group_weight(cur, env->src_nid, dist) -
1197 group_weight(cur, env->dst_nid); 1314 group_weight(cur, env->dst_nid, dist);
1198 else 1315 else
1199 imp += task_weight(cur, env->src_nid) - 1316 imp += task_weight(cur, env->src_nid, dist) -
1200 task_weight(cur, env->dst_nid); 1317 task_weight(cur, env->dst_nid, dist);
1201 } 1318 }
1202 } 1319 }
1203 1320
@@ -1206,7 +1323,7 @@ static void task_numa_compare(struct task_numa_env *env,
1206 1323
1207 if (!cur) { 1324 if (!cur) {
1208 /* Is there capacity at our destination? */ 1325 /* Is there capacity at our destination? */
1209 if (env->src_stats.has_free_capacity && 1326 if (env->src_stats.nr_running <= env->src_stats.task_capacity &&
1210 !env->dst_stats.has_free_capacity) 1327 !env->dst_stats.has_free_capacity)
1211 goto unlock; 1328 goto unlock;
1212 1329
@@ -1252,6 +1369,13 @@ balance:
1252 if (load_too_imbalanced(src_load, dst_load, env)) 1369 if (load_too_imbalanced(src_load, dst_load, env))
1253 goto unlock; 1370 goto unlock;
1254 1371
1372 /*
1373 * One idle CPU per node is evaluated for a task numa move.
1374 * Call select_idle_sibling to maybe find a better one.
1375 */
1376 if (!cur)
1377 env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
1378
1255assign: 1379assign:
1256 task_numa_assign(env, cur, imp); 1380 task_numa_assign(env, cur, imp);
1257unlock: 1381unlock:
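
task_numa_compare() condenses the swap decision into one signed number: the source task's improvement (taskimp or groupimp) plus how much the task currently on the destination would gain by moving the other way, all computed with the new distance-aware weights. A self-contained sketch of that bookkeeping, with hypothetical values standing in for task_weight()/group_weight():

#include <stdio.h>

/*
 * Improvement of swapping two tasks: the gain of the task being placed
 * plus the gain of the destination's current task if it moved to src
 * (its weight at src minus its weight at its current node).
 */
static long swap_improvement(long imp_move,
			     unsigned long cur_w_src, unsigned long cur_w_dst)
{
	return imp_move + (long)cur_w_src - (long)cur_w_dst;
}

int main(void)
{
	long taskimp = 150;		/* p prefers dst by 150/1000 */
	unsigned long cur_src = 400;	/* cur's weight on p's source node */
	unsigned long cur_dst = 300;	/* cur's weight on its current node */

	/* cur would also be better off on src, so the swap looks even better */
	printf("imp = %ld\n", swap_improvement(taskimp, cur_src, cur_dst));	/* 250 */
	return 0;
}
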
@@ -1289,7 +1413,7 @@ static int task_numa_migrate(struct task_struct *p)
1289 }; 1413 };
1290 struct sched_domain *sd; 1414 struct sched_domain *sd;
1291 unsigned long taskweight, groupweight; 1415 unsigned long taskweight, groupweight;
1292 int nid, ret; 1416 int nid, ret, dist;
1293 long taskimp, groupimp; 1417 long taskimp, groupimp;
1294 1418
1295 /* 1419 /*
@@ -1317,29 +1441,45 @@ static int task_numa_migrate(struct task_struct *p)
1317 return -EINVAL; 1441 return -EINVAL;
1318 } 1442 }
1319 1443
1320 taskweight = task_weight(p, env.src_nid);
1321 groupweight = group_weight(p, env.src_nid);
1322 update_numa_stats(&env.src_stats, env.src_nid);
1323 env.dst_nid = p->numa_preferred_nid; 1444 env.dst_nid = p->numa_preferred_nid;
1324 taskimp = task_weight(p, env.dst_nid) - taskweight; 1445 dist = env.dist = node_distance(env.src_nid, env.dst_nid);
1325 groupimp = group_weight(p, env.dst_nid) - groupweight; 1446 taskweight = task_weight(p, env.src_nid, dist);
1447 groupweight = group_weight(p, env.src_nid, dist);
1448 update_numa_stats(&env.src_stats, env.src_nid);
1449 taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
1450 groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
1326 update_numa_stats(&env.dst_stats, env.dst_nid); 1451 update_numa_stats(&env.dst_stats, env.dst_nid);
1327 1452
1328 /* Try to find a spot on the preferred nid. */ 1453 /* Try to find a spot on the preferred nid. */
1329 task_numa_find_cpu(&env, taskimp, groupimp); 1454 task_numa_find_cpu(&env, taskimp, groupimp);
1330 1455
1331 /* No space available on the preferred nid. Look elsewhere. */ 1456 /*
1332 if (env.best_cpu == -1) { 1457 * Look at other nodes in these cases:
1458 * - there is no space available on the preferred_nid
1459 * - the task is part of a numa_group that is interleaved across
1460 * multiple NUMA nodes; in order to better consolidate the group,
1461 * we need to check other locations.
1462 */
1463 if (env.best_cpu == -1 || (p->numa_group &&
1464 nodes_weight(p->numa_group->active_nodes) > 1)) {
1333 for_each_online_node(nid) { 1465 for_each_online_node(nid) {
1334 if (nid == env.src_nid || nid == p->numa_preferred_nid) 1466 if (nid == env.src_nid || nid == p->numa_preferred_nid)
1335 continue; 1467 continue;
1336 1468
1469 dist = node_distance(env.src_nid, env.dst_nid);
1470 if (sched_numa_topology_type == NUMA_BACKPLANE &&
1471 dist != env.dist) {
1472 taskweight = task_weight(p, env.src_nid, dist);
1473 groupweight = group_weight(p, env.src_nid, dist);
1474 }
1475
1337 /* Only consider nodes where both task and groups benefit */ 1476 /* Only consider nodes where both task and groups benefit */
1338 taskimp = task_weight(p, nid) - taskweight; 1477 taskimp = task_weight(p, nid, dist) - taskweight;
1339 groupimp = group_weight(p, nid) - groupweight; 1478 groupimp = group_weight(p, nid, dist) - groupweight;
1340 if (taskimp < 0 && groupimp < 0) 1479 if (taskimp < 0 && groupimp < 0)
1341 continue; 1480 continue;
1342 1481
1482 env.dist = dist;
1343 env.dst_nid = nid; 1483 env.dst_nid = nid;
1344 update_numa_stats(&env.dst_stats, env.dst_nid); 1484 update_numa_stats(&env.dst_stats, env.dst_nid);
1345 task_numa_find_cpu(&env, taskimp, groupimp); 1485 task_numa_find_cpu(&env, taskimp, groupimp);
@@ -1394,7 +1534,7 @@ static void numa_migrate_preferred(struct task_struct *p)
1394 unsigned long interval = HZ; 1534 unsigned long interval = HZ;
1395 1535
1396 /* This task has no NUMA fault statistics yet */ 1536 /* This task has no NUMA fault statistics yet */
1397 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory)) 1537 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
1398 return; 1538 return;
1399 1539
1400 /* Periodically retry migrating the task to the preferred node */ 1540 /* Periodically retry migrating the task to the preferred node */
@@ -1506,7 +1646,7 @@ static void update_task_scan_period(struct task_struct *p,
1506 * scanning faster if shared accesses dominate as it may 1646 * scanning faster if shared accesses dominate as it may
1507 * simply bounce migrations uselessly 1647 * simply bounce migrations uselessly
1508 */ 1648 */
1509 ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared)); 1649 ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1));
1510 diff = (diff * ratio) / NUMA_PERIOD_SLOTS; 1650 diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
1511 } 1651 }
1512 1652
@@ -1543,6 +1683,92 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
1543 return delta; 1683 return delta;
1544} 1684}
1545 1685
1686/*
1687 * Determine the preferred nid for a task in a numa_group. This needs to
1688 * be done in a way that produces consistent results with group_weight,
1689 * otherwise workloads might not converge.
1690 */
1691static int preferred_group_nid(struct task_struct *p, int nid)
1692{
1693 nodemask_t nodes;
1694 int dist;
1695
1696 /* Direct connections between all NUMA nodes. */
1697 if (sched_numa_topology_type == NUMA_DIRECT)
1698 return nid;
1699
1700 /*
1701 * On a system with glueless mesh NUMA topology, group_weight
1702 * scores nodes according to the number of NUMA hinting faults on
1703 * both the node itself, and on nearby nodes.
1704 */
1705 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
1706 unsigned long score, max_score = 0;
1707 int node, max_node = nid;
1708
1709 dist = sched_max_numa_distance;
1710
1711 for_each_online_node(node) {
1712 score = group_weight(p, node, dist);
1713 if (score > max_score) {
1714 max_score = score;
1715 max_node = node;
1716 }
1717 }
1718 return max_node;
1719 }
1720
1721 /*
1722 * Finding the preferred nid in a system with NUMA backplane
1723 * interconnect topology is more involved. The goal is to locate
1724 * tasks from numa_groups near each other in the system, and
1725 * untangle workloads from different sides of the system. This requires
1726 * searching down the hierarchy of node groups, recursively searching
1727 * inside the highest scoring group of nodes. The nodemask tricks
1728 * keep the complexity of the search down.
1729 */
1730 nodes = node_online_map;
1731 for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
1732 unsigned long max_faults = 0;
1733 nodemask_t max_group;
1734 int a, b;
1735
1736 /* Are there nodes at this distance from each other? */
1737 if (!find_numa_distance(dist))
1738 continue;
1739
1740 for_each_node_mask(a, nodes) {
1741 unsigned long faults = 0;
1742 nodemask_t this_group;
1743 nodes_clear(this_group);
1744
1745 /* Sum group's NUMA faults; includes a==b case. */
1746 for_each_node_mask(b, nodes) {
1747 if (node_distance(a, b) < dist) {
1748 faults += group_faults(p, b);
1749 node_set(b, this_group);
1750 node_clear(b, nodes);
1751 }
1752 }
1753
1754 /* Remember the top group. */
1755 if (faults > max_faults) {
1756 max_faults = faults;
1757 max_group = this_group;
1758 /*
1759 * subtle: at the smallest distance there is
1760 * just one node left in each "group", the
1761 * winner is the preferred nid.
1762 */
1763 nid = a;
1764 }
1765 }
1766 /* Next round, evaluate the nodes within max_group. */
1767 nodes = max_group;
1768 }
1769 return nid;
1770}
1771
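preferred_group_nid()'s backplane search is easier to see outside the diff: at each distance it partitions the remaining nodes into groups of nodes closer than that distance, keeps the group with the most faults, and repeats on that group until a single node is left. A compact userspace model using a plain bitmask in place of a nodemask_t; the 4-node distance table and fault counts are invented:

#include <stdio.h>

#define NR_NODES 4

static const int dist[NR_NODES][NR_NODES] = {	/* hypothetical SLIT-style table */
	{ 10, 20, 40, 40 },
	{ 20, 10, 40, 40 },
	{ 40, 40, 10, 20 },
	{ 40, 40, 20, 10 },
};
static const unsigned long faults[NR_NODES] = { 100, 300, 250, 200 };

static int preferred_node(void)
{
	unsigned int nodes = (1u << NR_NODES) - 1;	/* all nodes in play */
	int nid = -1;

	for (int d = 40; d > 10; d--) {
		unsigned long max_faults = 0;
		unsigned int max_group = 0, remaining = nodes;

		for (int a = 0; a < NR_NODES; a++) {
			unsigned long f = 0;
			unsigned int group = 0;

			if (!(remaining & (1u << a)))
				continue;
			/* group = nodes still in play closer than d to node a */
			for (int b = 0; b < NR_NODES; b++) {
				if ((remaining & (1u << b)) && dist[a][b] < d) {
					f += faults[b];
					group |= 1u << b;
					remaining &= ~(1u << b);
				}
			}
			if (f > max_faults) {
				max_faults = f;
				max_group = group;
				nid = a;	/* at the smallest distance each group is one node */
			}
		}
		if (max_group)
			nodes = max_group;	/* narrow the search to the best group */
	}
	return nid;
}

int main(void)
{
	/* nodes {2,3} have 450 faults vs 400 for {0,1}; within {2,3}, node 2 wins */
	printf("preferred nid: %d\n", preferred_node());	/* 2 */
	return 0;
}
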
1546static void task_numa_placement(struct task_struct *p) 1772static void task_numa_placement(struct task_struct *p)
1547{ 1773{
1548 int seq, nid, max_nid = -1, max_group_nid = -1; 1774 int seq, nid, max_nid = -1, max_group_nid = -1;
@@ -1570,18 +1796,23 @@ static void task_numa_placement(struct task_struct *p)
1570 1796
1571 /* Find the node with the highest number of faults */ 1797 /* Find the node with the highest number of faults */
1572 for_each_online_node(nid) { 1798 for_each_online_node(nid) {
1799 /* Keep track of the offsets in numa_faults array */
1800 int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
1573 unsigned long faults = 0, group_faults = 0; 1801 unsigned long faults = 0, group_faults = 0;
1574 int priv, i; 1802 int priv;
1575 1803
1576 for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) { 1804 for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
1577 long diff, f_diff, f_weight; 1805 long diff, f_diff, f_weight;
1578 1806
1579 i = task_faults_idx(nid, priv); 1807 mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
1808 membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
1809 cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
1810 cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
1580 1811
1581 /* Decay existing window, copy faults since last scan */ 1812 /* Decay existing window, copy faults since last scan */
1582 diff = p->numa_faults_buffer_memory[i] - p->numa_faults_memory[i] / 2; 1813 diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
1583 fault_types[priv] += p->numa_faults_buffer_memory[i]; 1814 fault_types[priv] += p->numa_faults[membuf_idx];
1584 p->numa_faults_buffer_memory[i] = 0; 1815 p->numa_faults[membuf_idx] = 0;
1585 1816
1586 /* 1817 /*
1587 * Normalize the faults_from, so all tasks in a group 1818 * Normalize the faults_from, so all tasks in a group
@@ -1591,21 +1822,27 @@ static void task_numa_placement(struct task_struct *p)
1591 * faults are less important. 1822 * faults are less important.
1592 */ 1823 */
1593 f_weight = div64_u64(runtime << 16, period + 1); 1824 f_weight = div64_u64(runtime << 16, period + 1);
1594 f_weight = (f_weight * p->numa_faults_buffer_cpu[i]) / 1825 f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
1595 (total_faults + 1); 1826 (total_faults + 1);
1596 f_diff = f_weight - p->numa_faults_cpu[i] / 2; 1827 f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
1597 p->numa_faults_buffer_cpu[i] = 0; 1828 p->numa_faults[cpubuf_idx] = 0;
1598 1829
1599 p->numa_faults_memory[i] += diff; 1830 p->numa_faults[mem_idx] += diff;
1600 p->numa_faults_cpu[i] += f_diff; 1831 p->numa_faults[cpu_idx] += f_diff;
1601 faults += p->numa_faults_memory[i]; 1832 faults += p->numa_faults[mem_idx];
1602 p->total_numa_faults += diff; 1833 p->total_numa_faults += diff;
1603 if (p->numa_group) { 1834 if (p->numa_group) {
1604 /* safe because we can only change our own group */ 1835 /*
1605 p->numa_group->faults[i] += diff; 1836 * safe because we can only change our own group
1606 p->numa_group->faults_cpu[i] += f_diff; 1837 *
1838 * mem_idx represents the offset for a given
1839 * nid and priv in a specific region because it
1840 * is at the beginning of the numa_faults array.
1841 */
1842 p->numa_group->faults[mem_idx] += diff;
1843 p->numa_group->faults_cpu[mem_idx] += f_diff;
1607 p->numa_group->total_faults += diff; 1844 p->numa_group->total_faults += diff;
1608 group_faults += p->numa_group->faults[i]; 1845 group_faults += p->numa_group->faults[mem_idx];
1609 } 1846 }
1610 } 1847 }
1611 1848
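The placement loop folds the per-scan buffers into the long-term counters with an exponential decay: each pass keeps half of the old value, adds everything accumulated since the last scan, and zeroes the buffer. A standalone sketch of just that step; the two counters stand in for one (nid, priv) slot of the flat numa_faults array, however the real task_faults_idx() helper addresses it:

#include <stdio.h>

/* one decay/fold step for a single slot */
static long fold_faults(unsigned long *stat, unsigned long *buf)
{
	long diff = (long)*buf - (long)(*stat / 2);	/* new samples minus half the history */

	*stat += diff;		/* i.e. *stat = *stat/2 + *buf */
	*buf = 0;		/* buffer restarts for the next scan window */
	return diff;		/* callers add this to the group and total counters */
}

int main(void)
{
	unsigned long stat = 400, buf = 100;

	long diff = fold_faults(&stat, &buf);
	printf("stat=%lu buf=%lu diff=%ld\n", stat, buf, diff);	/* stat=300 buf=0 diff=-100 */
	return 0;
}
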
@@ -1625,7 +1862,7 @@ static void task_numa_placement(struct task_struct *p)
1625 if (p->numa_group) { 1862 if (p->numa_group) {
1626 update_numa_active_node_mask(p->numa_group); 1863 update_numa_active_node_mask(p->numa_group);
1627 spin_unlock_irq(group_lock); 1864 spin_unlock_irq(group_lock);
1628 max_nid = max_group_nid; 1865 max_nid = preferred_group_nid(p, max_group_nid);
1629 } 1866 }
1630 1867
1631 if (max_faults) { 1868 if (max_faults) {
@@ -1668,7 +1905,6 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1668 1905
1669 atomic_set(&grp->refcount, 1); 1906 atomic_set(&grp->refcount, 1);
1670 spin_lock_init(&grp->lock); 1907 spin_lock_init(&grp->lock);
1671 INIT_LIST_HEAD(&grp->task_list);
1672 grp->gid = p->pid; 1908 grp->gid = p->pid;
1673 /* Second half of the array tracks nids where faults happen */ 1909 /* Second half of the array tracks nids where faults happen */
1674 grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES * 1910 grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
@@ -1677,11 +1913,10 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1677 node_set(task_node(current), grp->active_nodes); 1913 node_set(task_node(current), grp->active_nodes);
1678 1914
1679 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) 1915 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
1680 grp->faults[i] = p->numa_faults_memory[i]; 1916 grp->faults[i] = p->numa_faults[i];
1681 1917
1682 grp->total_faults = p->total_numa_faults; 1918 grp->total_faults = p->total_numa_faults;
1683 1919
1684 list_add(&p->numa_entry, &grp->task_list);
1685 grp->nr_tasks++; 1920 grp->nr_tasks++;
1686 rcu_assign_pointer(p->numa_group, grp); 1921 rcu_assign_pointer(p->numa_group, grp);
1687 } 1922 }
@@ -1736,13 +1971,12 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1736 double_lock_irq(&my_grp->lock, &grp->lock); 1971 double_lock_irq(&my_grp->lock, &grp->lock);
1737 1972
1738 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) { 1973 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
1739 my_grp->faults[i] -= p->numa_faults_memory[i]; 1974 my_grp->faults[i] -= p->numa_faults[i];
1740 grp->faults[i] += p->numa_faults_memory[i]; 1975 grp->faults[i] += p->numa_faults[i];
1741 } 1976 }
1742 my_grp->total_faults -= p->total_numa_faults; 1977 my_grp->total_faults -= p->total_numa_faults;
1743 grp->total_faults += p->total_numa_faults; 1978 grp->total_faults += p->total_numa_faults;
1744 1979
1745 list_move(&p->numa_entry, &grp->task_list);
1746 my_grp->nr_tasks--; 1980 my_grp->nr_tasks--;
1747 grp->nr_tasks++; 1981 grp->nr_tasks++;
1748 1982
@@ -1762,27 +1996,23 @@ no_join:
1762void task_numa_free(struct task_struct *p) 1996void task_numa_free(struct task_struct *p)
1763{ 1997{
1764 struct numa_group *grp = p->numa_group; 1998 struct numa_group *grp = p->numa_group;
1765 void *numa_faults = p->numa_faults_memory; 1999 void *numa_faults = p->numa_faults;
1766 unsigned long flags; 2000 unsigned long flags;
1767 int i; 2001 int i;
1768 2002
1769 if (grp) { 2003 if (grp) {
1770 spin_lock_irqsave(&grp->lock, flags); 2004 spin_lock_irqsave(&grp->lock, flags);
1771 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) 2005 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
1772 grp->faults[i] -= p->numa_faults_memory[i]; 2006 grp->faults[i] -= p->numa_faults[i];
1773 grp->total_faults -= p->total_numa_faults; 2007 grp->total_faults -= p->total_numa_faults;
1774 2008
1775 list_del(&p->numa_entry);
1776 grp->nr_tasks--; 2009 grp->nr_tasks--;
1777 spin_unlock_irqrestore(&grp->lock, flags); 2010 spin_unlock_irqrestore(&grp->lock, flags);
1778 rcu_assign_pointer(p->numa_group, NULL); 2011 RCU_INIT_POINTER(p->numa_group, NULL);
1779 put_numa_group(grp); 2012 put_numa_group(grp);
1780 } 2013 }
1781 2014
1782 p->numa_faults_memory = NULL; 2015 p->numa_faults = NULL;
1783 p->numa_faults_buffer_memory = NULL;
1784 p->numa_faults_cpu= NULL;
1785 p->numa_faults_buffer_cpu = NULL;
1786 kfree(numa_faults); 2016 kfree(numa_faults);
1787} 2017}
1788 2018
@@ -1804,29 +2034,15 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
1804 if (!p->mm) 2034 if (!p->mm)
1805 return; 2035 return;
1806 2036
1807 /* Do not worry about placement if exiting */
1808 if (p->state == TASK_DEAD)
1809 return;
1810
1811 /* Allocate buffer to track faults on a per-node basis */ 2037 /* Allocate buffer to track faults on a per-node basis */
1812 if (unlikely(!p->numa_faults_memory)) { 2038 if (unlikely(!p->numa_faults)) {
1813 int size = sizeof(*p->numa_faults_memory) * 2039 int size = sizeof(*p->numa_faults) *
1814 NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids; 2040 NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
1815 2041
1816 p->numa_faults_memory = kzalloc(size, GFP_KERNEL|__GFP_NOWARN); 2042 p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
1817 if (!p->numa_faults_memory) 2043 if (!p->numa_faults)
1818 return; 2044 return;
1819 2045
1820 BUG_ON(p->numa_faults_buffer_memory);
1821 /*
1822 * The averaged statistics, shared & private, memory & cpu,
1823 * occupy the first half of the array. The second half of the
1824 * array is for current counters, which are averaged into the
1825 * first set by task_numa_placement.
1826 */
1827 p->numa_faults_cpu = p->numa_faults_memory + (2 * nr_node_ids);
1828 p->numa_faults_buffer_memory = p->numa_faults_memory + (4 * nr_node_ids);
1829 p->numa_faults_buffer_cpu = p->numa_faults_memory + (6 * nr_node_ids);
1830 p->total_numa_faults = 0; 2046 p->total_numa_faults = 0;
1831 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); 2047 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1832 } 2048 }
@@ -1866,8 +2082,8 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
1866 if (migrated) 2082 if (migrated)
1867 p->numa_pages_migrated += pages; 2083 p->numa_pages_migrated += pages;
1868 2084
1869 p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages; 2085 p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
1870 p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages; 2086 p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
1871 p->numa_faults_locality[local] += pages; 2087 p->numa_faults_locality[local] += pages;
1872} 2088}
1873 2089
@@ -1946,7 +2162,7 @@ void task_numa_work(struct callback_head *work)
1946 vma = mm->mmap; 2162 vma = mm->mmap;
1947 } 2163 }
1948 for (; vma; vma = vma->vm_next) { 2164 for (; vma; vma = vma->vm_next) {
1949 if (!vma_migratable(vma) || !vma_policy_mof(p, vma)) 2165 if (!vma_migratable(vma) || !vma_policy_mof(vma))
1950 continue; 2166 continue;
1951 2167
1952 /* 2168 /*
@@ -2211,8 +2427,8 @@ static __always_inline u64 decay_load(u64 val, u64 n)
2211 2427
2212 /* 2428 /*
2213 * As y^PERIOD = 1/2, we can combine 2429 * As y^PERIOD = 1/2, we can combine
2214 * y^n = 1/2^(n/PERIOD) * k^(n%PERIOD) 2430 * y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
2215 * With a look-up table which covers k^n (n<PERIOD) 2431 * With a look-up table which covers y^n (n<PERIOD)
2216 * 2432 *
2217 * To achieve constant time decay_load. 2433 * To achieve constant time decay_load.
2218 */ 2434 */
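
The corrected comment describes the usual trick for computing y^n in constant time when y^PERIOD = 1/2: shift away whole half-life periods, then look up the fractional remainder. A floating-point model of the identity; the real code uses fixed-point and a precomputed table, and PERIOD = 32 here is only the conventional value, treat it as an assumption (build with -lm):

#include <stdio.h>
#include <math.h>

#define PERIOD 32

int main(void)
{
	double y = pow(0.5, 1.0 / PERIOD);	/* y^PERIOD == 1/2 */
	unsigned int n = 100;

	/* y^n = (1/2)^(n/PERIOD) * y^(n%PERIOD): whole halvings plus a small lookup */
	double fast = ldexp(pow(y, n % PERIOD), -(int)(n / PERIOD));
	double slow = pow(y, n);

	printf("fast=%.9f slow=%.9f\n", fast, slow);	/* both ~0.114 */
	return 0;
}
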
@@ -2377,6 +2593,9 @@ static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
2377 tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg; 2593 tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
2378 tg_contrib -= cfs_rq->tg_load_contrib; 2594 tg_contrib -= cfs_rq->tg_load_contrib;
2379 2595
2596 if (!tg_contrib)
2597 return;
2598
2380 if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) { 2599 if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
2381 atomic_long_add(tg_contrib, &tg->load_avg); 2600 atomic_long_add(tg_contrib, &tg->load_avg);
2382 cfs_rq->tg_load_contrib += tg_contrib; 2601 cfs_rq->tg_load_contrib += tg_contrib;
@@ -3786,6 +4005,10 @@ void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force)
3786 4005
3787static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) 4006static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
3788{ 4007{
4008 /* init_cfs_bandwidth() was not called */
4009 if (!cfs_b->throttled_cfs_rq.next)
4010 return;
4011
3789 hrtimer_cancel(&cfs_b->period_timer); 4012 hrtimer_cancel(&cfs_b->period_timer);
3790 hrtimer_cancel(&cfs_b->slack_timer); 4013 hrtimer_cancel(&cfs_b->slack_timer);
3791} 4014}
@@ -3892,14 +4115,6 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
3892 resched_curr(rq); 4115 resched_curr(rq);
3893 return; 4116 return;
3894 } 4117 }
3895
3896 /*
3897 * Don't schedule slices shorter than 10000ns, that just
3898 * doesn't make sense. Rely on vruntime for fairness.
3899 */
3900 if (rq->curr != p)
3901 delta = max_t(s64, 10000LL, delta);
3902
3903 hrtick_start(rq, delta); 4118 hrtick_start(rq, delta);
3904 } 4119 }
3905} 4120}
@@ -4087,7 +4302,7 @@ static unsigned long capacity_of(int cpu)
4087static unsigned long cpu_avg_load_per_task(int cpu) 4302static unsigned long cpu_avg_load_per_task(int cpu)
4088{ 4303{
4089 struct rq *rq = cpu_rq(cpu); 4304 struct rq *rq = cpu_rq(cpu);
4090 unsigned long nr_running = ACCESS_ONCE(rq->nr_running); 4305 unsigned long nr_running = ACCESS_ONCE(rq->cfs.h_nr_running);
4091 unsigned long load_avg = rq->cfs.runnable_load_avg; 4306 unsigned long load_avg = rq->cfs.runnable_load_avg;
4092 4307
4093 if (nr_running) 4308 if (nr_running)
@@ -4213,7 +4428,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
4213 * wl = S * s'_i; see (2) 4428 * wl = S * s'_i; see (2)
4214 */ 4429 */
4215 if (W > 0 && w < W) 4430 if (W > 0 && w < W)
4216 wl = (w * tg->shares) / W; 4431 wl = (w * (long)tg->shares) / W;
4217 else 4432 else
4218 wl = tg->shares; 4433 wl = tg->shares;
4219 4434
@@ -4276,8 +4491,8 @@ static int wake_wide(struct task_struct *p)
4276static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) 4491static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
4277{ 4492{
4278 s64 this_load, load; 4493 s64 this_load, load;
4494 s64 this_eff_load, prev_eff_load;
4279 int idx, this_cpu, prev_cpu; 4495 int idx, this_cpu, prev_cpu;
4280 unsigned long tl_per_task;
4281 struct task_group *tg; 4496 struct task_group *tg;
4282 unsigned long weight; 4497 unsigned long weight;
4283 int balanced; 4498 int balanced;
@@ -4320,47 +4535,30 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
4320 * Otherwise check if either cpus are near enough in load to allow this 4535 * Otherwise check if either cpus are near enough in load to allow this
4321 * task to be woken on this_cpu. 4536 * task to be woken on this_cpu.
4322 */ 4537 */
4323 if (this_load > 0) { 4538 this_eff_load = 100;
4324 s64 this_eff_load, prev_eff_load; 4539 this_eff_load *= capacity_of(prev_cpu);
4540
4541 prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
4542 prev_eff_load *= capacity_of(this_cpu);
4325 4543
4326 this_eff_load = 100; 4544 if (this_load > 0) {
4327 this_eff_load *= capacity_of(prev_cpu);
4328 this_eff_load *= this_load + 4545 this_eff_load *= this_load +
4329 effective_load(tg, this_cpu, weight, weight); 4546 effective_load(tg, this_cpu, weight, weight);
4330 4547
4331 prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
4332 prev_eff_load *= capacity_of(this_cpu);
4333 prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight); 4548 prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
4549 }
4334 4550
4335 balanced = this_eff_load <= prev_eff_load; 4551 balanced = this_eff_load <= prev_eff_load;
4336 } else
4337 balanced = true;
4338
4339 /*
4340 * If the currently running task will sleep within
4341 * a reasonable amount of time then attract this newly
4342 * woken task:
4343 */
4344 if (sync && balanced)
4345 return 1;
4346 4552
4347 schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts); 4553 schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
4348 tl_per_task = cpu_avg_load_per_task(this_cpu);
4349 4554
4350 if (balanced || 4555 if (!balanced)
4351 (this_load <= load && 4556 return 0;
4352 this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
4353 /*
4354 * This domain has SD_WAKE_AFFINE and
4355 * p is cache cold in this domain, and
4356 * there is no bad imbalance.
4357 */
4358 schedstat_inc(sd, ttwu_move_affine);
4359 schedstat_inc(p, se.statistics.nr_wakeups_affine);
4360 4557
4361 return 1; 4558 schedstat_inc(sd, ttwu_move_affine);
4362 } 4559 schedstat_inc(p, se.statistics.nr_wakeups_affine);
4363 return 0; 4560
4561 return 1;
4364} 4562}
4365 4563
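After the restructuring, wake_affine() comes down to comparing two scaled quantities: the load this_cpu would carry with the waking task, weighted by prev_cpu's capacity, against prev_cpu's remaining load padded by half the imbalance percentage and weighted by this_cpu's capacity. A sketch of that comparison with invented loads and capacities; the effective_load() group-scheduling corrections are assumed to be folded into the load arguments:

#include <stdio.h>
#include <stdbool.h>

static bool wake_affine_balanced(long this_load, long prev_load,
				 unsigned long cap_this, unsigned long cap_prev,
				 int imbalance_pct)
{
	long this_eff = 100 * (long)cap_prev;
	long prev_eff = (100 + (imbalance_pct - 100) / 2) * (long)cap_this;

	if (this_load > 0) {
		this_eff *= this_load;	/* load on this_cpu with the waking task added */
		prev_eff *= prev_load;	/* load left on prev_cpu once the task leaves */
	}
	return this_eff <= prev_eff;	/* balanced: pulling the task here is acceptable */
}

int main(void)
{
	/* e.g. imbalance_pct=125 gives prev_cpu a 12.5% handicap in the comparison */
	printf("%d\n", wake_affine_balanced(850, 800, 1024, 1024, 125));	/* 1: affine wakeup */
	printf("%d\n", wake_affine_balanced(1200, 800, 1024, 1024, 125));	/* 0: stay on prev */
	return 0;
}
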
4366/* 4564/*
@@ -4428,20 +4626,46 @@ static int
4428find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) 4626find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
4429{ 4627{
4430 unsigned long load, min_load = ULONG_MAX; 4628 unsigned long load, min_load = ULONG_MAX;
4431 int idlest = -1; 4629 unsigned int min_exit_latency = UINT_MAX;
4630 u64 latest_idle_timestamp = 0;
4631 int least_loaded_cpu = this_cpu;
4632 int shallowest_idle_cpu = -1;
4432 int i; 4633 int i;
4433 4634
4434 /* Traverse only the allowed CPUs */ 4635 /* Traverse only the allowed CPUs */
4435 for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { 4636 for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
4436 load = weighted_cpuload(i); 4637 if (idle_cpu(i)) {
4437 4638 struct rq *rq = cpu_rq(i);
4438 if (load < min_load || (load == min_load && i == this_cpu)) { 4639 struct cpuidle_state *idle = idle_get_state(rq);
4439 min_load = load; 4640 if (idle && idle->exit_latency < min_exit_latency) {
4440 idlest = i; 4641 /*
4642 * We give priority to a CPU whose idle state
4643 * has the smallest exit latency irrespective
4644 * of any idle timestamp.
4645 */
4646 min_exit_latency = idle->exit_latency;
4647 latest_idle_timestamp = rq->idle_stamp;
4648 shallowest_idle_cpu = i;
4649 } else if ((!idle || idle->exit_latency == min_exit_latency) &&
4650 rq->idle_stamp > latest_idle_timestamp) {
4651 /*
4652 * If equal or no active idle state, then
4653 * the most recently idled CPU might have
4654 * a warmer cache.
4655 */
4656 latest_idle_timestamp = rq->idle_stamp;
4657 shallowest_idle_cpu = i;
4658 }
4659 } else if (shallowest_idle_cpu == -1) {
4660 load = weighted_cpuload(i);
4661 if (load < min_load || (load == min_load && i == this_cpu)) {
4662 min_load = load;
4663 least_loaded_cpu = i;
4664 }
4441 } 4665 }
4442 } 4666 }
4443 4667
4444 return idlest; 4668 return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
4445} 4669}
4446 4670
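find_idlest_cpu() now prefers an idle CPU in the shallowest idle state (smallest exit latency), breaks ties by the most recent idle timestamp, and only falls back to the least-loaded CPU when nothing is idle. A standalone model of that selection over a small made-up table of CPUs; it ignores the "no active idle state" case for brevity:

#include <stdio.h>

struct cpu_info {
	int idle;			/* is the CPU idle? */
	unsigned int exit_latency;	/* of its current idle state */
	unsigned long long idle_stamp;	/* when it went idle */
	unsigned long load;		/* weighted load if busy */
};

static int find_idlest(const struct cpu_info *c, int n, int this_cpu)
{
	unsigned int min_exit_latency = ~0u;
	unsigned long long latest_idle_timestamp = 0;
	unsigned long min_load = ~0ul;
	int shallowest_idle_cpu = -1, least_loaded_cpu = this_cpu;

	for (int i = 0; i < n; i++) {
		if (c[i].idle) {
			if (c[i].exit_latency < min_exit_latency) {
				/* a shallower idle state wins outright */
				min_exit_latency = c[i].exit_latency;
				latest_idle_timestamp = c[i].idle_stamp;
				shallowest_idle_cpu = i;
			} else if (c[i].exit_latency == min_exit_latency &&
				   c[i].idle_stamp > latest_idle_timestamp) {
				/* equally shallow: the most recently idled cache is warmer */
				latest_idle_timestamp = c[i].idle_stamp;
				shallowest_idle_cpu = i;
			}
		} else if (shallowest_idle_cpu == -1) {
			if (c[i].load < min_load ||
			    (c[i].load == min_load && i == this_cpu)) {
				min_load = c[i].load;
				least_loaded_cpu = i;
			}
		}
	}
	return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
}

int main(void)
{
	struct cpu_info cpus[] = {
		{ .idle = 0, .load = 700 },
		{ .idle = 1, .exit_latency = 100, .idle_stamp = 50 },
		{ .idle = 1, .exit_latency =  10, .idle_stamp = 40 },	/* shallowest */
		{ .idle = 1, .exit_latency =  10, .idle_stamp = 60 },	/* shallow and warmer */
	};

	printf("picked cpu %d\n", find_idlest(cpus, 4, 0));	/* 3 */
	return 0;
}
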
4447/* 4671/*
@@ -4510,14 +4734,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
4510 int want_affine = 0; 4734 int want_affine = 0;
4511 int sync = wake_flags & WF_SYNC; 4735 int sync = wake_flags & WF_SYNC;
4512 4736
4513 if (p->nr_cpus_allowed == 1) 4737 if (sd_flag & SD_BALANCE_WAKE)
4514 return prev_cpu; 4738 want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
4515
4516 if (sd_flag & SD_BALANCE_WAKE) {
4517 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
4518 want_affine = 1;
4519 new_cpu = prev_cpu;
4520 }
4521 4739
4522 rcu_read_lock(); 4740 rcu_read_lock();
4523 for_each_domain(cpu, tmp) { 4741 for_each_domain(cpu, tmp) {
@@ -4704,7 +4922,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
4704 return; 4922 return;
4705 4923
4706 /* 4924 /*
4707 * This is possible from callers such as move_task(), in which we 4925 * This is possible from callers such as attach_tasks(), in which we
 4708 * unconditionally check_preempt_curr() after an enqueue (which may have 4926 * unconditionally check_preempt_curr() after an enqueue (which may have
 4709 * led to a throttle). This both saves work and prevents false 4927 * led to a throttle). This both saves work and prevents false
4710 * next-buddy nomination below. 4928 * next-buddy nomination below.
@@ -5112,27 +5330,18 @@ struct lb_env {
5112 unsigned int loop_max; 5330 unsigned int loop_max;
5113 5331
5114 enum fbq_type fbq_type; 5332 enum fbq_type fbq_type;
5333 struct list_head tasks;
5115}; 5334};
5116 5335
5117/* 5336/*
5118 * move_task - move a task from one runqueue to another runqueue.
5119 * Both runqueues must be locked.
5120 */
5121static void move_task(struct task_struct *p, struct lb_env *env)
5122{
5123 deactivate_task(env->src_rq, p, 0);
5124 set_task_cpu(p, env->dst_cpu);
5125 activate_task(env->dst_rq, p, 0);
5126 check_preempt_curr(env->dst_rq, p, 0);
5127}
5128
5129/*
5130 * Is this task likely cache-hot: 5337 * Is this task likely cache-hot:
5131 */ 5338 */
5132static int task_hot(struct task_struct *p, struct lb_env *env) 5339static int task_hot(struct task_struct *p, struct lb_env *env)
5133{ 5340{
5134 s64 delta; 5341 s64 delta;
5135 5342
5343 lockdep_assert_held(&env->src_rq->lock);
5344
5136 if (p->sched_class != &fair_sched_class) 5345 if (p->sched_class != &fair_sched_class)
5137 return 0; 5346 return 0;
5138 5347
@@ -5164,7 +5373,7 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
5164 struct numa_group *numa_group = rcu_dereference(p->numa_group); 5373 struct numa_group *numa_group = rcu_dereference(p->numa_group);
5165 int src_nid, dst_nid; 5374 int src_nid, dst_nid;
5166 5375
5167 if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory || 5376 if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
5168 !(env->sd->flags & SD_NUMA)) { 5377 !(env->sd->flags & SD_NUMA)) {
5169 return false; 5378 return false;
5170 } 5379 }
@@ -5203,7 +5412,7 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
5203 if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) 5412 if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
5204 return false; 5413 return false;
5205 5414
5206 if (!p->numa_faults_memory || !(env->sd->flags & SD_NUMA)) 5415 if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
5207 return false; 5416 return false;
5208 5417
5209 src_nid = cpu_to_node(env->src_cpu); 5418 src_nid = cpu_to_node(env->src_cpu);
@@ -5252,6 +5461,9 @@ static
5252int can_migrate_task(struct task_struct *p, struct lb_env *env) 5461int can_migrate_task(struct task_struct *p, struct lb_env *env)
5253{ 5462{
5254 int tsk_cache_hot = 0; 5463 int tsk_cache_hot = 0;
5464
5465 lockdep_assert_held(&env->src_rq->lock);
5466
5255 /* 5467 /*
5256 * We do not migrate tasks that are: 5468 * We do not migrate tasks that are:
5257 * 1) throttled_lb_pair, or 5469 * 1) throttled_lb_pair, or
@@ -5310,24 +5522,12 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
5310 if (!tsk_cache_hot) 5522 if (!tsk_cache_hot)
5311 tsk_cache_hot = migrate_degrades_locality(p, env); 5523 tsk_cache_hot = migrate_degrades_locality(p, env);
5312 5524
5313 if (migrate_improves_locality(p, env)) { 5525 if (migrate_improves_locality(p, env) || !tsk_cache_hot ||
5314#ifdef CONFIG_SCHEDSTATS 5526 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
5315 if (tsk_cache_hot) { 5527 if (tsk_cache_hot) {
5316 schedstat_inc(env->sd, lb_hot_gained[env->idle]); 5528 schedstat_inc(env->sd, lb_hot_gained[env->idle]);
5317 schedstat_inc(p, se.statistics.nr_forced_migrations); 5529 schedstat_inc(p, se.statistics.nr_forced_migrations);
5318 } 5530 }
5319#endif
5320 return 1;
5321 }
5322
5323 if (!tsk_cache_hot ||
5324 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
5325
5326 if (tsk_cache_hot) {
5327 schedstat_inc(env->sd, lb_hot_gained[env->idle]);
5328 schedstat_inc(p, se.statistics.nr_forced_migrations);
5329 }
5330
5331 return 1; 5531 return 1;
5332 } 5532 }
5333 5533
@@ -5336,47 +5536,63 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
5336} 5536}
5337 5537
5338/* 5538/*
5339 * move_one_task tries to move exactly one task from busiest to this_rq, as 5539 * detach_task() -- detach the task for the migration specified in env
5540 */
5541static void detach_task(struct task_struct *p, struct lb_env *env)
5542{
5543 lockdep_assert_held(&env->src_rq->lock);
5544
5545 deactivate_task(env->src_rq, p, 0);
5546 p->on_rq = TASK_ON_RQ_MIGRATING;
5547 set_task_cpu(p, env->dst_cpu);
5548}
5549
5550/*
5551 * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
5340 * part of active balancing operations within "domain". 5552 * part of active balancing operations within "domain".
5341 * Returns 1 if successful and 0 otherwise.
5342 * 5553 *
5343 * Called with both runqueues locked. 5554 * Returns a task if successful and NULL otherwise.
5344 */ 5555 */
5345static int move_one_task(struct lb_env *env) 5556static struct task_struct *detach_one_task(struct lb_env *env)
5346{ 5557{
5347 struct task_struct *p, *n; 5558 struct task_struct *p, *n;
5348 5559
5560 lockdep_assert_held(&env->src_rq->lock);
5561
5349 list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { 5562 list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
5350 if (!can_migrate_task(p, env)) 5563 if (!can_migrate_task(p, env))
5351 continue; 5564 continue;
5352 5565
5353 move_task(p, env); 5566 detach_task(p, env);
5567
5354 /* 5568 /*
5355 * Right now, this is only the second place move_task() 5569 * Right now, this is only the second place where
5356 * is called, so we can safely collect move_task() 5570 * lb_gained[env->idle] is updated (other is detach_tasks)
5357 * stats here rather than inside move_task(). 5571 * so we can safely collect stats here rather than
5572 * inside detach_tasks().
5358 */ 5573 */
5359 schedstat_inc(env->sd, lb_gained[env->idle]); 5574 schedstat_inc(env->sd, lb_gained[env->idle]);
5360 return 1; 5575 return p;
5361 } 5576 }
5362 return 0; 5577 return NULL;
5363} 5578}
5364 5579
5365static const unsigned int sched_nr_migrate_break = 32; 5580static const unsigned int sched_nr_migrate_break = 32;
5366 5581
5367/* 5582/*
5368 * move_tasks tries to move up to imbalance weighted load from busiest to 5583 * detach_tasks() -- tries to detach up to imbalance weighted load from
5369 * this_rq, as part of a balancing operation within domain "sd". 5584 * busiest_rq, as part of a balancing operation within domain "sd".
5370 * Returns 1 if successful and 0 otherwise.
5371 * 5585 *
5372 * Called with both runqueues locked. 5586 * Returns number of detached tasks if successful and 0 otherwise.
5373 */ 5587 */
5374static int move_tasks(struct lb_env *env) 5588static int detach_tasks(struct lb_env *env)
5375{ 5589{
5376 struct list_head *tasks = &env->src_rq->cfs_tasks; 5590 struct list_head *tasks = &env->src_rq->cfs_tasks;
5377 struct task_struct *p; 5591 struct task_struct *p;
5378 unsigned long load; 5592 unsigned long load;
5379 int pulled = 0; 5593 int detached = 0;
5594
5595 lockdep_assert_held(&env->src_rq->lock);
5380 5596
5381 if (env->imbalance <= 0) 5597 if (env->imbalance <= 0)
5382 return 0; 5598 return 0;
@@ -5407,14 +5623,16 @@ static int move_tasks(struct lb_env *env)
5407 if ((load / 2) > env->imbalance) 5623 if ((load / 2) > env->imbalance)
5408 goto next; 5624 goto next;
5409 5625
5410 move_task(p, env); 5626 detach_task(p, env);
5411 pulled++; 5627 list_add(&p->se.group_node, &env->tasks);
5628
5629 detached++;
5412 env->imbalance -= load; 5630 env->imbalance -= load;
5413 5631
5414#ifdef CONFIG_PREEMPT 5632#ifdef CONFIG_PREEMPT
5415 /* 5633 /*
5416 * NEWIDLE balancing is a source of latency, so preemptible 5634 * NEWIDLE balancing is a source of latency, so preemptible
5417 * kernels will stop after the first task is pulled to minimize 5635 * kernels will stop after the first task is detached to minimize
5418 * the critical section. 5636 * the critical section.
5419 */ 5637 */
5420 if (env->idle == CPU_NEWLY_IDLE) 5638 if (env->idle == CPU_NEWLY_IDLE)
@@ -5434,13 +5652,58 @@ next:
5434 } 5652 }
5435 5653
5436 /* 5654 /*
5437 * Right now, this is one of only two places move_task() is called, 5655 * Right now, this is one of only two places we collect this stat
5438 * so we can safely collect move_task() stats here rather than 5656 * so we can safely collect detach_one_task() stats here rather
5439 * inside move_task(). 5657 * than inside detach_one_task().
5440 */ 5658 */
5441 schedstat_add(env->sd, lb_gained[env->idle], pulled); 5659 schedstat_add(env->sd, lb_gained[env->idle], detached);
5660
5661 return detached;
5662}
5663
5664/*
5665 * attach_task() -- attach the task detached by detach_task() to its new rq.
5666 */
5667static void attach_task(struct rq *rq, struct task_struct *p)
5668{
5669 lockdep_assert_held(&rq->lock);
5442 5670
5443 return pulled; 5671 BUG_ON(task_rq(p) != rq);
5672 p->on_rq = TASK_ON_RQ_QUEUED;
5673 activate_task(rq, p, 0);
5674 check_preempt_curr(rq, p, 0);
5675}
5676
5677/*
5678 * attach_one_task() -- attaches the task returned from detach_one_task() to
5679 * its new rq.
5680 */
5681static void attach_one_task(struct rq *rq, struct task_struct *p)
5682{
5683 raw_spin_lock(&rq->lock);
5684 attach_task(rq, p);
5685 raw_spin_unlock(&rq->lock);
5686}
5687
5688/*
5689 * attach_tasks() -- attaches all tasks detached by detach_tasks() to their
5690 * new rq.
5691 */
5692static void attach_tasks(struct lb_env *env)
5693{
5694 struct list_head *tasks = &env->tasks;
5695 struct task_struct *p;
5696
5697 raw_spin_lock(&env->dst_rq->lock);
5698
5699 while (!list_empty(tasks)) {
5700 p = list_first_entry(tasks, struct task_struct, se.group_node);
5701 list_del_init(&p->se.group_node);
5702
5703 attach_task(env->dst_rq, p);
5704 }
5705
5706 raw_spin_unlock(&env->dst_rq->lock);
5444} 5707}
5445 5708
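The detach/attach split removes move_task()'s need to hold both runqueue locks at once: tasks are dequeued and marked migrating under the source lock, parked on a private list in the lb_env, and only then enqueued under the destination lock. A much-simplified single-threaded model of that two-phase move; the locks are only comments here, while the real code holds rq->lock exactly as shown in the diff:

#include <stdio.h>

#define MAX_TASKS 8

struct runqueue {
	int tasks[MAX_TASKS];
	int nr;
};

/* Phase 1: under src->lock only — pull up to 'want' tasks onto a private list. */
static int detach(struct runqueue *src, int *stash, int want)
{
	int detached = 0;

	while (src->nr && detached < want)
		stash[detached++] = src->tasks[--src->nr];	/* now owned by the balancer */
	return detached;
}

/* Phase 2: under dst->lock only — make the detached tasks runnable on dst. */
static void attach(struct runqueue *dst, const int *stash, int n)
{
	for (int i = 0; i < n; i++)
		dst->tasks[dst->nr++] = stash[i];
}

int main(void)
{
	struct runqueue src = { .tasks = { 1, 2, 3, 4 }, .nr = 4 };
	struct runqueue dst = { .nr = 0 };
	int stash[MAX_TASKS];

	int moved = detach(&src, stash, 2);
	/* neither lock is held here; the stashed tasks are invisible to both rqs */
	attach(&dst, stash, moved);

	printf("src=%d dst=%d moved=%d\n", src.nr, dst.nr, moved);	/* src=2 dst=2 moved=2 */
	return 0;
}
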
5446#ifdef CONFIG_FAIR_GROUP_SCHED 5709#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -5559,6 +5822,13 @@ static unsigned long task_h_load(struct task_struct *p)
5559#endif 5822#endif
5560 5823
5561/********** Helpers for find_busiest_group ************************/ 5824/********** Helpers for find_busiest_group ************************/
5825
5826enum group_type {
5827 group_other = 0,
5828 group_imbalanced,
5829 group_overloaded,
5830};
5831
5562/* 5832/*
5563 * sg_lb_stats - stats of a sched_group required for load_balancing 5833 * sg_lb_stats - stats of a sched_group required for load_balancing
5564 */ 5834 */
@@ -5572,7 +5842,7 @@ struct sg_lb_stats {
5572 unsigned int group_capacity_factor; 5842 unsigned int group_capacity_factor;
5573 unsigned int idle_cpus; 5843 unsigned int idle_cpus;
5574 unsigned int group_weight; 5844 unsigned int group_weight;
5575 int group_imb; /* Is there an imbalance in the group ? */ 5845 enum group_type group_type;
5576 int group_has_free_capacity; 5846 int group_has_free_capacity;
5577#ifdef CONFIG_NUMA_BALANCING 5847#ifdef CONFIG_NUMA_BALANCING
5578 unsigned int nr_numa_running; 5848 unsigned int nr_numa_running;
@@ -5610,6 +5880,8 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
5610 .total_capacity = 0UL, 5880 .total_capacity = 0UL,
5611 .busiest_stat = { 5881 .busiest_stat = {
5612 .avg_load = 0UL, 5882 .avg_load = 0UL,
5883 .sum_nr_running = 0,
5884 .group_type = group_other,
5613 }, 5885 },
5614 }; 5886 };
5615} 5887}
@@ -5652,19 +5924,17 @@ unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
5652 return default_scale_capacity(sd, cpu); 5924 return default_scale_capacity(sd, cpu);
5653} 5925}
5654 5926
5655static unsigned long default_scale_smt_capacity(struct sched_domain *sd, int cpu) 5927static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu)
5656{ 5928{
5657 unsigned long weight = sd->span_weight; 5929 if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1))
5658 unsigned long smt_gain = sd->smt_gain; 5930 return sd->smt_gain / sd->span_weight;
5659
5660 smt_gain /= weight;
5661 5931
5662 return smt_gain; 5932 return SCHED_CAPACITY_SCALE;
5663} 5933}
5664 5934
5665unsigned long __weak arch_scale_smt_capacity(struct sched_domain *sd, int cpu) 5935unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
5666{ 5936{
5667 return default_scale_smt_capacity(sd, cpu); 5937 return default_scale_cpu_capacity(sd, cpu);
5668} 5938}
5669 5939
5670static unsigned long scale_rt_capacity(int cpu) 5940static unsigned long scale_rt_capacity(int cpu)
@@ -5703,18 +5973,15 @@ static unsigned long scale_rt_capacity(int cpu)
5703 5973
5704static void update_cpu_capacity(struct sched_domain *sd, int cpu) 5974static void update_cpu_capacity(struct sched_domain *sd, int cpu)
5705{ 5975{
5706 unsigned long weight = sd->span_weight;
5707 unsigned long capacity = SCHED_CAPACITY_SCALE; 5976 unsigned long capacity = SCHED_CAPACITY_SCALE;
5708 struct sched_group *sdg = sd->groups; 5977 struct sched_group *sdg = sd->groups;
5709 5978
5710 if ((sd->flags & SD_SHARE_CPUCAPACITY) && weight > 1) { 5979 if (sched_feat(ARCH_CAPACITY))
5711 if (sched_feat(ARCH_CAPACITY)) 5980 capacity *= arch_scale_cpu_capacity(sd, cpu);
5712 capacity *= arch_scale_smt_capacity(sd, cpu); 5981 else
5713 else 5982 capacity *= default_scale_cpu_capacity(sd, cpu);
5714 capacity *= default_scale_smt_capacity(sd, cpu);
5715 5983
5716 capacity >>= SCHED_CAPACITY_SHIFT; 5984 capacity >>= SCHED_CAPACITY_SHIFT;
5717 }
5718 5985
5719 sdg->sgc->capacity_orig = capacity; 5986 sdg->sgc->capacity_orig = capacity;
5720 5987
@@ -5891,6 +6158,18 @@ static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *gro
5891 return capacity_factor; 6158 return capacity_factor;
5892} 6159}
5893 6160
6161static enum group_type
6162group_classify(struct sched_group *group, struct sg_lb_stats *sgs)
6163{
6164 if (sgs->sum_nr_running > sgs->group_capacity_factor)
6165 return group_overloaded;
6166
6167 if (sg_imbalanced(group))
6168 return group_imbalanced;
6169
6170 return group_other;
6171}
6172
5894/** 6173/**
5895 * update_sg_lb_stats - Update sched_group's statistics for load balancing. 6174 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
5896 * @env: The load balancing environment. 6175 * @env: The load balancing environment.
@@ -5920,7 +6199,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
5920 load = source_load(i, load_idx); 6199 load = source_load(i, load_idx);
5921 6200
5922 sgs->group_load += load; 6201 sgs->group_load += load;
5923 sgs->sum_nr_running += rq->nr_running; 6202 sgs->sum_nr_running += rq->cfs.h_nr_running;
5924 6203
5925 if (rq->nr_running > 1) 6204 if (rq->nr_running > 1)
5926 *overload = true; 6205 *overload = true;
@@ -5942,9 +6221,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
5942 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; 6221 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
5943 6222
5944 sgs->group_weight = group->group_weight; 6223 sgs->group_weight = group->group_weight;
5945
5946 sgs->group_imb = sg_imbalanced(group);
5947 sgs->group_capacity_factor = sg_capacity_factor(env, group); 6224 sgs->group_capacity_factor = sg_capacity_factor(env, group);
6225 sgs->group_type = group_classify(group, sgs);
5948 6226
5949 if (sgs->group_capacity_factor > sgs->sum_nr_running) 6227 if (sgs->group_capacity_factor > sgs->sum_nr_running)
5950 sgs->group_has_free_capacity = 1; 6228 sgs->group_has_free_capacity = 1;
@@ -5968,13 +6246,19 @@ static bool update_sd_pick_busiest(struct lb_env *env,
5968 struct sched_group *sg, 6246 struct sched_group *sg,
5969 struct sg_lb_stats *sgs) 6247 struct sg_lb_stats *sgs)
5970{ 6248{
5971 if (sgs->avg_load <= sds->busiest_stat.avg_load) 6249 struct sg_lb_stats *busiest = &sds->busiest_stat;
5972 return false;
5973 6250
5974 if (sgs->sum_nr_running > sgs->group_capacity_factor) 6251 if (sgs->group_type > busiest->group_type)
5975 return true; 6252 return true;
5976 6253
5977 if (sgs->group_imb) 6254 if (sgs->group_type < busiest->group_type)
6255 return false;
6256
6257 if (sgs->avg_load <= busiest->avg_load)
6258 return false;
6259
6260 /* This is the busiest node in its class. */
6261 if (!(env->sd->flags & SD_ASYM_PACKING))
5978 return true; 6262 return true;
5979 6263
5980 /* 6264 /*
@@ -5982,8 +6266,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
5982 * numbered CPUs in the group, therefore mark all groups 6266 * numbered CPUs in the group, therefore mark all groups
5983 * higher than ourself as busy. 6267 * higher than ourself as busy.
5984 */ 6268 */
5985 if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running && 6269 if (sgs->sum_nr_running && env->dst_cpu < group_first_cpu(sg)) {
5986 env->dst_cpu < group_first_cpu(sg)) {
5987 if (!sds->busiest) 6270 if (!sds->busiest)
5988 return true; 6271 return true;
5989 6272
@@ -6073,8 +6356,10 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
6073 * with a large weight task outweighs the tasks on the system). 6356 * with a large weight task outweighs the tasks on the system).
6074 */ 6357 */
6075 if (prefer_sibling && sds->local && 6358 if (prefer_sibling && sds->local &&
6076 sds->local_stat.group_has_free_capacity) 6359 sds->local_stat.group_has_free_capacity) {
6077 sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U); 6360 sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U);
6361 sgs->group_type = group_classify(sg, sgs);
6362 }
6078 6363
6079 if (update_sd_pick_busiest(env, sds, sg, sgs)) { 6364 if (update_sd_pick_busiest(env, sds, sg, sgs)) {
6080 sds->busiest = sg; 6365 sds->busiest = sg;
@@ -6228,7 +6513,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
6228 local = &sds->local_stat; 6513 local = &sds->local_stat;
6229 busiest = &sds->busiest_stat; 6514 busiest = &sds->busiest_stat;
6230 6515
6231 if (busiest->group_imb) { 6516 if (busiest->group_type == group_imbalanced) {
6232 /* 6517 /*
6233 * In the group_imb case we cannot rely on group-wide averages 6518 * In the group_imb case we cannot rely on group-wide averages
6234 * to ensure cpu-load equilibrium, look at wider averages. XXX 6519 * to ensure cpu-load equilibrium, look at wider averages. XXX
@@ -6248,12 +6533,11 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
6248 return fix_small_imbalance(env, sds); 6533 return fix_small_imbalance(env, sds);
6249 } 6534 }
6250 6535
6251 if (!busiest->group_imb) { 6536 /*
6252 /* 6537 * If there aren't any idle cpus, avoid creating some.
6253 * Don't want to pull so many tasks that a group would go idle. 6538 */
6254 * Except of course for the group_imb case, since then we might 6539 if (busiest->group_type == group_overloaded &&
6255 * have to drop below capacity to reach cpu-load equilibrium. 6540 local->group_type == group_overloaded) {
6256 */
6257 load_above_capacity = 6541 load_above_capacity =
6258 (busiest->sum_nr_running - busiest->group_capacity_factor); 6542 (busiest->sum_nr_running - busiest->group_capacity_factor);
6259 6543
@@ -6337,7 +6621,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
6337 * work because they assume all things are equal, which typically 6621 * work because they assume all things are equal, which typically
6338 * isn't true due to cpus_allowed constraints and the like. 6622 * isn't true due to cpus_allowed constraints and the like.
6339 */ 6623 */
6340 if (busiest->group_imb) 6624 if (busiest->group_type == group_imbalanced)
6341 goto force_balance; 6625 goto force_balance;
6342 6626
6343 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ 6627 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
@@ -6346,7 +6630,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
6346 goto force_balance; 6630 goto force_balance;
6347 6631
6348 /* 6632 /*
6349 * If the local group is more busy than the selected busiest group 6633 * If the local group is busier than the selected busiest group
6350 * don't try and pull any tasks. 6634 * don't try and pull any tasks.
6351 */ 6635 */
6352 if (local->avg_load >= busiest->avg_load) 6636 if (local->avg_load >= busiest->avg_load)
@@ -6361,13 +6645,14 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
6361 6645
6362 if (env->idle == CPU_IDLE) { 6646 if (env->idle == CPU_IDLE) {
6363 /* 6647 /*
6364 * This cpu is idle. If the busiest group load doesn't 6648 * This cpu is idle. If the busiest group is not overloaded
6365 * have more tasks than the number of available cpu's and 6649 * and there is no imbalance between this and busiest group
6366 * there is no imbalance between this and busiest group 6650 * wrt idle cpus, it is balanced. The imbalance becomes
 6367 * wrt to idle cpu's, it is balanced. 6651 * significant if the diff is greater than 1; otherwise we
 6652 * might end up just moving the imbalance to another group
6368 */ 6653 */
6369 if ((local->idle_cpus < busiest->idle_cpus) && 6654 if ((busiest->group_type != group_overloaded) &&
6370 busiest->sum_nr_running <= busiest->group_weight) 6655 (local->idle_cpus <= (busiest->idle_cpus + 1)))
6371 goto out_balanced; 6656 goto out_balanced;
6372 } else { 6657 } else {
6373 /* 6658 /*
@@ -6539,7 +6824,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
6539 struct sched_group *group; 6824 struct sched_group *group;
6540 struct rq *busiest; 6825 struct rq *busiest;
6541 unsigned long flags; 6826 unsigned long flags;
6542 struct cpumask *cpus = __get_cpu_var(load_balance_mask); 6827 struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
6543 6828
6544 struct lb_env env = { 6829 struct lb_env env = {
6545 .sd = sd, 6830 .sd = sd,
@@ -6550,6 +6835,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
6550 .loop_break = sched_nr_migrate_break, 6835 .loop_break = sched_nr_migrate_break,
6551 .cpus = cpus, 6836 .cpus = cpus,
6552 .fbq_type = all, 6837 .fbq_type = all,
6838 .tasks = LIST_HEAD_INIT(env.tasks),
6553 }; 6839 };
6554 6840
6555 /* 6841 /*
@@ -6599,23 +6885,30 @@ redo:
6599 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); 6885 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
6600 6886
6601more_balance: 6887more_balance:
6602 local_irq_save(flags); 6888 raw_spin_lock_irqsave(&busiest->lock, flags);
6603 double_rq_lock(env.dst_rq, busiest);
6604 6889
6605 /* 6890 /*
6606 * cur_ld_moved - load moved in current iteration 6891 * cur_ld_moved - load moved in current iteration
6607 * ld_moved - cumulative load moved across iterations 6892 * ld_moved - cumulative load moved across iterations
6608 */ 6893 */
6609 cur_ld_moved = move_tasks(&env); 6894 cur_ld_moved = detach_tasks(&env);
6610 ld_moved += cur_ld_moved;
6611 double_rq_unlock(env.dst_rq, busiest);
6612 local_irq_restore(flags);
6613 6895
6614 /* 6896 /*
6615 * some other cpu did the load balance for us. 6897 * We've detached some tasks from busiest_rq. Every
6898 * task is masked "TASK_ON_RQ_MIGRATING", so we can safely
6899 * unlock busiest->lock, and we are able to be sure
6900 * that nobody can manipulate the tasks in parallel.
6901 * See task_rq_lock() family for the details.
6616 */ 6902 */
6617 if (cur_ld_moved && env.dst_cpu != smp_processor_id()) 6903
6618 resched_cpu(env.dst_cpu); 6904 raw_spin_unlock(&busiest->lock);
6905
6906 if (cur_ld_moved) {
6907 attach_tasks(&env);
6908 ld_moved += cur_ld_moved;
6909 }
6910
6911 local_irq_restore(flags);
6619 6912
6620 if (env.flags & LBF_NEED_BREAK) { 6913 if (env.flags & LBF_NEED_BREAK) {
6621 env.flags &= ~LBF_NEED_BREAK; 6914 env.flags &= ~LBF_NEED_BREAK;
@@ -6665,10 +6958,8 @@ more_balance:
6665 if (sd_parent) { 6958 if (sd_parent) {
6666 int *group_imbalance = &sd_parent->groups->sgc->imbalance; 6959 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
6667 6960
6668 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { 6961 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
6669 *group_imbalance = 1; 6962 *group_imbalance = 1;
6670 } else if (*group_imbalance)
6671 *group_imbalance = 0;
6672 } 6963 }
6673 6964
6674 /* All tasks on this runqueue were pinned by CPU affinity */ 6965 /* All tasks on this runqueue were pinned by CPU affinity */
@@ -6679,7 +6970,7 @@ more_balance:
6679 env.loop_break = sched_nr_migrate_break; 6970 env.loop_break = sched_nr_migrate_break;
6680 goto redo; 6971 goto redo;
6681 } 6972 }
6682 goto out_balanced; 6973 goto out_all_pinned;
6683 } 6974 }
6684 } 6975 }
6685 6976
@@ -6744,7 +7035,7 @@ more_balance:
6744 * If we've begun active balancing, start to back off. This 7035 * If we've begun active balancing, start to back off. This
6745 * case may not be covered by the all_pinned logic if there 7036 * case may not be covered by the all_pinned logic if there
6746 * is only 1 task on the busy runqueue (because we don't call 7037 * is only 1 task on the busy runqueue (because we don't call
6747 * move_tasks). 7038 * detach_tasks).
6748 */ 7039 */
6749 if (sd->balance_interval < sd->max_interval) 7040 if (sd->balance_interval < sd->max_interval)
6750 sd->balance_interval *= 2; 7041 sd->balance_interval *= 2;
@@ -6753,6 +7044,23 @@ more_balance:
6753 goto out; 7044 goto out;
6754 7045
6755out_balanced: 7046out_balanced:
7047 /*
7048 * We reach balance although we may have faced some affinity
7049 * constraints. Clear the imbalance flag if it was set.
7050 */
7051 if (sd_parent) {
7052 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
7053
7054 if (*group_imbalance)
7055 *group_imbalance = 0;
7056 }
7057
7058out_all_pinned:
7059 /*
7060 * We reach balance because all tasks are pinned at this level so
 7061 * we can't migrate them. Leave the imbalance flag set so parent level
7062 * can try to migrate them.
7063 */
6756 schedstat_inc(sd, lb_balanced[idle]); 7064 schedstat_inc(sd, lb_balanced[idle]);
6757 7065
6758 sd->nr_balance_failed = 0; 7066 sd->nr_balance_failed = 0;
@@ -6914,6 +7222,7 @@ static int active_load_balance_cpu_stop(void *data)
6914 int target_cpu = busiest_rq->push_cpu; 7222 int target_cpu = busiest_rq->push_cpu;
6915 struct rq *target_rq = cpu_rq(target_cpu); 7223 struct rq *target_rq = cpu_rq(target_cpu);
6916 struct sched_domain *sd; 7224 struct sched_domain *sd;
7225 struct task_struct *p = NULL;
6917 7226
6918 raw_spin_lock_irq(&busiest_rq->lock); 7227 raw_spin_lock_irq(&busiest_rq->lock);
6919 7228
@@ -6933,9 +7242,6 @@ static int active_load_balance_cpu_stop(void *data)
6933 */ 7242 */
6934 BUG_ON(busiest_rq == target_rq); 7243 BUG_ON(busiest_rq == target_rq);
6935 7244
6936 /* move a task from busiest_rq to target_rq */
6937 double_lock_balance(busiest_rq, target_rq);
6938
6939 /* Search for an sd spanning us and the target CPU. */ 7245 /* Search for an sd spanning us and the target CPU. */
6940 rcu_read_lock(); 7246 rcu_read_lock();
6941 for_each_domain(target_cpu, sd) { 7247 for_each_domain(target_cpu, sd) {
@@ -6956,16 +7262,22 @@ static int active_load_balance_cpu_stop(void *data)
6956 7262
6957 schedstat_inc(sd, alb_count); 7263 schedstat_inc(sd, alb_count);
6958 7264
6959 if (move_one_task(&env)) 7265 p = detach_one_task(&env);
7266 if (p)
6960 schedstat_inc(sd, alb_pushed); 7267 schedstat_inc(sd, alb_pushed);
6961 else 7268 else
6962 schedstat_inc(sd, alb_failed); 7269 schedstat_inc(sd, alb_failed);
6963 } 7270 }
6964 rcu_read_unlock(); 7271 rcu_read_unlock();
6965 double_unlock_balance(busiest_rq, target_rq);
6966out_unlock: 7272out_unlock:
6967 busiest_rq->active_balance = 0; 7273 busiest_rq->active_balance = 0;
6968 raw_spin_unlock_irq(&busiest_rq->lock); 7274 raw_spin_unlock(&busiest_rq->lock);
7275
7276 if (p)
7277 attach_one_task(target_rq, p);
7278
7279 local_irq_enable();
7280
6969 return 0; 7281 return 0;
6970} 7282}
6971 7283
@@ -7465,7 +7777,7 @@ static void task_fork_fair(struct task_struct *p)
7465static void 7777static void
7466prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) 7778prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
7467{ 7779{
7468 if (!p->se.on_rq) 7780 if (!task_on_rq_queued(p))
7469 return; 7781 return;
7470 7782
7471 /* 7783 /*
@@ -7490,11 +7802,11 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
7490 * switched back to the fair class the enqueue_entity(.flags=0) will 7802 * switched back to the fair class the enqueue_entity(.flags=0) will
7491 * do the right thing. 7803 * do the right thing.
7492 * 7804 *
7493 * If it's on_rq, then the dequeue_entity(.flags=0) will already 7805 * If it's queued, then the dequeue_entity(.flags=0) will already
7494 * have normalized the vruntime, if it's !on_rq, then only when 7806 * have normalized the vruntime, if it's !queued, then only when
7495 * the task is sleeping will it still have non-normalized vruntime. 7807 * the task is sleeping will it still have non-normalized vruntime.
7496 */ 7808 */
7497 if (!p->on_rq && p->state != TASK_RUNNING) { 7809 if (!task_on_rq_queued(p) && p->state != TASK_RUNNING) {
7498 /* 7810 /*
7499 * Fix up our vruntime so that the current sleep doesn't 7811 * Fix up our vruntime so that the current sleep doesn't
7500 * cause 'unlimited' sleep bonus. 7812 * cause 'unlimited' sleep bonus.
@@ -7521,15 +7833,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
7521 */ 7833 */
7522static void switched_to_fair(struct rq *rq, struct task_struct *p) 7834static void switched_to_fair(struct rq *rq, struct task_struct *p)
7523{ 7835{
7524 struct sched_entity *se = &p->se;
7525#ifdef CONFIG_FAIR_GROUP_SCHED 7836#ifdef CONFIG_FAIR_GROUP_SCHED
7837 struct sched_entity *se = &p->se;
7526 /* 7838 /*
7527 * Since the real-depth could have been changed (only FAIR 7839 * Since the real-depth could have been changed (only FAIR
7528 * class maintain depth value), reset depth properly. 7840 * class maintain depth value), reset depth properly.
7529 */ 7841 */
7530 se->depth = se->parent ? se->parent->depth + 1 : 0; 7842 se->depth = se->parent ? se->parent->depth + 1 : 0;
7531#endif 7843#endif
7532 if (!se->on_rq) 7844 if (!task_on_rq_queued(p))
7533 return; 7845 return;
7534 7846
7535 /* 7847 /*
@@ -7575,7 +7887,7 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
7575} 7887}
7576 7888
7577#ifdef CONFIG_FAIR_GROUP_SCHED 7889#ifdef CONFIG_FAIR_GROUP_SCHED
7578static void task_move_group_fair(struct task_struct *p, int on_rq) 7890static void task_move_group_fair(struct task_struct *p, int queued)
7579{ 7891{
7580 struct sched_entity *se = &p->se; 7892 struct sched_entity *se = &p->se;
7581 struct cfs_rq *cfs_rq; 7893 struct cfs_rq *cfs_rq;
@@ -7594,7 +7906,7 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
7594 * fair sleeper stuff for the first placement, but who cares. 7906 * fair sleeper stuff for the first placement, but who cares.
7595 */ 7907 */
7596 /* 7908 /*
7597 * When !on_rq, vruntime of the task has usually NOT been normalized. 7909 * When !queued, vruntime of the task has usually NOT been normalized.
7598 * But there are some cases where it has already been normalized: 7910 * But there are some cases where it has already been normalized:
7599 * 7911 *
7600 * - Moving a forked child which is waiting for being woken up by 7912 * - Moving a forked child which is waiting for being woken up by
@@ -7605,14 +7917,14 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
7605 * To prevent boost or penalty in the new cfs_rq caused by delta 7917 * To prevent boost or penalty in the new cfs_rq caused by delta
7606 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment. 7918 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
7607 */ 7919 */
7608 if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING)) 7920 if (!queued && (!se->sum_exec_runtime || p->state == TASK_WAKING))
7609 on_rq = 1; 7921 queued = 1;
7610 7922
7611 if (!on_rq) 7923 if (!queued)
7612 se->vruntime -= cfs_rq_of(se)->min_vruntime; 7924 se->vruntime -= cfs_rq_of(se)->min_vruntime;
7613 set_task_rq(p, task_cpu(p)); 7925 set_task_rq(p, task_cpu(p));
7614 se->depth = se->parent ? se->parent->depth + 1 : 0; 7926 se->depth = se->parent ? se->parent->depth + 1 : 0;
7615 if (!on_rq) { 7927 if (!queued) {
7616 cfs_rq = cfs_rq_of(se); 7928 cfs_rq = cfs_rq_of(se);
7617 se->vruntime += cfs_rq->min_vruntime; 7929 se->vruntime += cfs_rq->min_vruntime;
7618#ifdef CONFIG_SMP 7930#ifdef CONFIG_SMP
@@ -7835,6 +8147,8 @@ const struct sched_class fair_sched_class = {
7835 8147
7836 .get_rr_interval = get_rr_interval_fair, 8148 .get_rr_interval = get_rr_interval_fair,
7837 8149
8150 .update_curr = update_curr_fair,
8151
7838#ifdef CONFIG_FAIR_GROUP_SCHED 8152#ifdef CONFIG_FAIR_GROUP_SCHED
7839 .task_move_group = task_move_group_fair, 8153 .task_move_group = task_move_group_fair,
7840#endif 8154#endif
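The load_balance() and active_load_balance_cpu_stop() hunks above replace move_tasks()/move_one_task() with a detach/attach split so that busiest->lock can be dropped while migrated tasks are in flight. A minimal sketch of that pattern, assuming the usual CFS helpers (deactivate_task(), set_task_cpu(), activate_task(), check_preempt_curr()); the _sketch names are illustrative, not this commit's exact bodies:

static void detach_task_sketch(struct task_struct *p, struct lb_env *env)
{
        lockdep_assert_held(&env->src_rq->lock);

        deactivate_task(env->src_rq, p, 0);
        /*
         * Mark the task as in flight before the source lock is dropped,
         * so task_rq_lock() callers keep waiting until attach finishes.
         */
        p->on_rq = TASK_ON_RQ_MIGRATING;
        set_task_cpu(p, env->dst_cpu);
}

static void attach_task_sketch(struct rq *rq, struct task_struct *p)
{
        raw_spin_lock(&rq->lock);
        BUG_ON(task_rq(p) != rq);
        p->on_rq = TASK_ON_RQ_QUEUED;
        activate_task(rq, p, 0);
        check_preempt_curr(rq, p, 0);
        raw_spin_unlock(&rq->lock);
}

The payoff is visible in the more_balance hunk above: only busiest->lock is taken around detach_tasks(), instead of double_rq_lock() on both runqueues.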
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 11e7bc434f43..c47fce75e666 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -147,6 +147,9 @@ use_default:
147 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu)) 147 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu))
148 goto use_default; 148 goto use_default;
149 149
150 /* Take note of the planned idle state. */
151 idle_set_state(this_rq(), &drv->states[next_state]);
152
150 /* 153 /*
151 * Enter the idle state previously returned by the governor decision. 154 * Enter the idle state previously returned by the governor decision.
152 * This function will block until an interrupt occurs and will take 155 * This function will block until an interrupt occurs and will take
@@ -154,6 +157,9 @@ use_default:
154 */ 157 */
155 entered_state = cpuidle_enter(drv, dev, next_state); 158 entered_state = cpuidle_enter(drv, dev, next_state);
156 159
160 /* The cpu is no longer idle or about to enter idle. */
161 idle_set_state(this_rq(), NULL);
162
157 if (broadcast) 163 if (broadcast)
158 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu); 164 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu);
159 165
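The two idle_set_state() calls bracket cpuidle_enter() so that rq->idle_state reflects the C-state the CPU is actually sitting in. A hypothetical consumer using the idle_get_state() accessor added to sched.h later in this diff (sketch_exit_latency() is a made-up name):

static unsigned int sketch_exit_latency(int cpu)
{
        struct cpuidle_state *state;
        unsigned int latency = 0;

        rcu_read_lock();                        /* idle_get_state() warns without it */
        state = idle_get_state(cpu_rq(cpu));
        if (state)
                latency = state->exit_latency;  /* in microseconds */
        rcu_read_unlock();

        return latency;
}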
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 67ad4e7f506a..c65dac8c97cd 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -75,6 +75,10 @@ static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task
75 return 0; 75 return 0;
76} 76}
77 77
78static void update_curr_idle(struct rq *rq)
79{
80}
81
78/* 82/*
79 * Simple, special scheduling class for the per-CPU idle tasks: 83 * Simple, special scheduling class for the per-CPU idle tasks:
80 */ 84 */
@@ -101,4 +105,5 @@ const struct sched_class idle_sched_class = {
101 105
102 .prio_changed = prio_changed_idle, 106 .prio_changed = prio_changed_idle,
103 .switched_to = switched_to_idle, 107 .switched_to = switched_to_idle,
108 .update_curr = update_curr_idle,
104}; 109};
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 5f6edca4fafd..ee15f5a0d1c1 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1301,9 +1301,6 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
1301 struct task_struct *curr; 1301 struct task_struct *curr;
1302 struct rq *rq; 1302 struct rq *rq;
1303 1303
1304 if (p->nr_cpus_allowed == 1)
1305 goto out;
1306
1307 /* For anything but wake ups, just return the task_cpu */ 1304 /* For anything but wake ups, just return the task_cpu */
1308 if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) 1305 if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
1309 goto out; 1306 goto out;
@@ -1351,16 +1348,22 @@ out:
1351 1348
1352static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) 1349static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
1353{ 1350{
1354 if (rq->curr->nr_cpus_allowed == 1) 1351 /*
1352 * Current can't be migrated, useless to reschedule,
1353 * let's hope p can move out.
1354 */
1355 if (rq->curr->nr_cpus_allowed == 1 ||
1356 !cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
1355 return; 1357 return;
1356 1358
1359 /*
1360 * p is migratable, so let's not schedule it and
1361 * see if it is pushed or pulled somewhere else.
1362 */
1357 if (p->nr_cpus_allowed != 1 1363 if (p->nr_cpus_allowed != 1
1358 && cpupri_find(&rq->rd->cpupri, p, NULL)) 1364 && cpupri_find(&rq->rd->cpupri, p, NULL))
1359 return; 1365 return;
1360 1366
1361 if (!cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
1362 return;
1363
1364 /* 1367 /*
1365 * There appears to be other cpus that can accept 1368 * There appears to be other cpus that can accept
1366 * current and none to run 'p', so lets reschedule 1369 * current and none to run 'p', so lets reschedule
@@ -1448,7 +1451,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
1448 * means a dl or stop task can slip in, in which case we need 1451 * means a dl or stop task can slip in, in which case we need
1449 * to re-start task selection. 1452 * to re-start task selection.
1450 */ 1453 */
1451 if (unlikely((rq->stop && rq->stop->on_rq) || 1454 if (unlikely((rq->stop && task_on_rq_queued(rq->stop)) ||
1452 rq->dl.dl_nr_running)) 1455 rq->dl.dl_nr_running))
1453 return RETRY_TASK; 1456 return RETRY_TASK;
1454 } 1457 }
@@ -1468,8 +1471,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
1468 p = _pick_next_task_rt(rq); 1471 p = _pick_next_task_rt(rq);
1469 1472
1470 /* The running task is never eligible for pushing */ 1473 /* The running task is never eligible for pushing */
1471 if (p) 1474 dequeue_pushable_task(rq, p);
1472 dequeue_pushable_task(rq, p);
1473 1475
1474 set_post_schedule(rq); 1476 set_post_schedule(rq);
1475 1477
@@ -1526,7 +1528,7 @@ static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
1526static int find_lowest_rq(struct task_struct *task) 1528static int find_lowest_rq(struct task_struct *task)
1527{ 1529{
1528 struct sched_domain *sd; 1530 struct sched_domain *sd;
1529 struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask); 1531 struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask);
1530 int this_cpu = smp_processor_id(); 1532 int this_cpu = smp_processor_id();
1531 int cpu = task_cpu(task); 1533 int cpu = task_cpu(task);
1532 1534
@@ -1624,7 +1626,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1624 !cpumask_test_cpu(lowest_rq->cpu, 1626 !cpumask_test_cpu(lowest_rq->cpu,
1625 tsk_cpus_allowed(task)) || 1627 tsk_cpus_allowed(task)) ||
1626 task_running(rq, task) || 1628 task_running(rq, task) ||
1627 !task->on_rq)) { 1629 !task_on_rq_queued(task))) {
1628 1630
1629 double_unlock_balance(rq, lowest_rq); 1631 double_unlock_balance(rq, lowest_rq);
1630 lowest_rq = NULL; 1632 lowest_rq = NULL;
@@ -1658,7 +1660,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
1658 BUG_ON(task_current(rq, p)); 1660 BUG_ON(task_current(rq, p));
1659 BUG_ON(p->nr_cpus_allowed <= 1); 1661 BUG_ON(p->nr_cpus_allowed <= 1);
1660 1662
1661 BUG_ON(!p->on_rq); 1663 BUG_ON(!task_on_rq_queued(p));
1662 BUG_ON(!rt_task(p)); 1664 BUG_ON(!rt_task(p));
1663 1665
1664 return p; 1666 return p;
@@ -1809,7 +1811,7 @@ static int pull_rt_task(struct rq *this_rq)
1809 */ 1811 */
1810 if (p && (p->prio < this_rq->rt.highest_prio.curr)) { 1812 if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
1811 WARN_ON(p == src_rq->curr); 1813 WARN_ON(p == src_rq->curr);
1812 WARN_ON(!p->on_rq); 1814 WARN_ON(!task_on_rq_queued(p));
1813 1815
1814 /* 1816 /*
1815 * There's a chance that p is higher in priority 1817 * There's a chance that p is higher in priority
@@ -1870,7 +1872,7 @@ static void set_cpus_allowed_rt(struct task_struct *p,
1870 1872
1871 BUG_ON(!rt_task(p)); 1873 BUG_ON(!rt_task(p));
1872 1874
1873 if (!p->on_rq) 1875 if (!task_on_rq_queued(p))
1874 return; 1876 return;
1875 1877
1876 weight = cpumask_weight(new_mask); 1878 weight = cpumask_weight(new_mask);
@@ -1936,7 +1938,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
1936 * we may need to handle the pulling of RT tasks 1938 * we may need to handle the pulling of RT tasks
1937 * now. 1939 * now.
1938 */ 1940 */
1939 if (!p->on_rq || rq->rt.rt_nr_running) 1941 if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
1940 return; 1942 return;
1941 1943
1942 if (pull_rt_task(rq)) 1944 if (pull_rt_task(rq))
@@ -1970,7 +1972,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
1970 * If that current running task is also an RT task 1972 * If that current running task is also an RT task
1971 * then see if we can move to another run queue. 1973 * then see if we can move to another run queue.
1972 */ 1974 */
1973 if (p->on_rq && rq->curr != p) { 1975 if (task_on_rq_queued(p) && rq->curr != p) {
1974#ifdef CONFIG_SMP 1976#ifdef CONFIG_SMP
1975 if (p->nr_cpus_allowed > 1 && rq->rt.overloaded && 1977 if (p->nr_cpus_allowed > 1 && rq->rt.overloaded &&
1976 /* Don't resched if we changed runqueues */ 1978 /* Don't resched if we changed runqueues */
@@ -1989,7 +1991,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
1989static void 1991static void
1990prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) 1992prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
1991{ 1993{
1992 if (!p->on_rq) 1994 if (!task_on_rq_queued(p))
1993 return; 1995 return;
1994 1996
1995 if (rq->curr == p) { 1997 if (rq->curr == p) {
@@ -2073,7 +2075,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
2073 for_each_sched_rt_entity(rt_se) { 2075 for_each_sched_rt_entity(rt_se) {
2074 if (rt_se->run_list.prev != rt_se->run_list.next) { 2076 if (rt_se->run_list.prev != rt_se->run_list.next) {
2075 requeue_task_rt(rq, p, 0); 2077 requeue_task_rt(rq, p, 0);
2076 set_tsk_need_resched(p); 2078 resched_curr(rq);
2077 return; 2079 return;
2078 } 2080 }
2079 } 2081 }
@@ -2129,6 +2131,8 @@ const struct sched_class rt_sched_class = {
2129 2131
2130 .prio_changed = prio_changed_rt, 2132 .prio_changed = prio_changed_rt,
2131 .switched_to = switched_to_rt, 2133 .switched_to = switched_to_rt,
2134
2135 .update_curr = update_curr_rt,
2132}; 2136};
2133 2137
2134#ifdef CONFIG_SCHED_DEBUG 2138#ifdef CONFIG_SCHED_DEBUG
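Like fair.c above and the idle/stop classes below, rt.c now wires an update_curr callback into its sched_class. The hook lets core code bring the running task's accounted runtime up to date without caring which class it belongs to, roughly as follows (sketch only, not the caller this series actually adds):

static void sketch_sync_curr_runtime(struct rq *rq)
{
        /* e.g. before sampling rq->curr->se.sum_exec_runtime */
        if (rq->curr->sched_class->update_curr)
                rq->curr->sched_class->update_curr(rq);
}

After this series every class implements the hook, so the NULL check is only defensive.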
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 579712f4e9d5..9a2a45c970e7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -14,6 +14,11 @@
14#include "cpuacct.h" 14#include "cpuacct.h"
15 15
16struct rq; 16struct rq;
17struct cpuidle_state;
18
19/* task_struct::on_rq states: */
20#define TASK_ON_RQ_QUEUED 1
21#define TASK_ON_RQ_MIGRATING 2
17 22
18extern __read_mostly int scheduler_running; 23extern __read_mostly int scheduler_running;
19 24
@@ -126,6 +131,9 @@ struct rt_bandwidth {
126 u64 rt_runtime; 131 u64 rt_runtime;
127 struct hrtimer rt_period_timer; 132 struct hrtimer rt_period_timer;
128}; 133};
134
135void __dl_clear_params(struct task_struct *p);
136
129/* 137/*
130 * To keep the bandwidth of -deadline tasks and groups under control 138 * To keep the bandwidth of -deadline tasks and groups under control
131 * we need some place where: 139 * we need some place where:
@@ -168,6 +176,25 @@ struct dl_bw {
168 u64 bw, total_bw; 176 u64 bw, total_bw;
169}; 177};
170 178
179static inline
180void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
181{
182 dl_b->total_bw -= tsk_bw;
183}
184
185static inline
186void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
187{
188 dl_b->total_bw += tsk_bw;
189}
190
191static inline
192bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
193{
194 return dl_b->bw != -1 &&
195 dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
196}
197
171extern struct mutex sched_domains_mutex; 198extern struct mutex sched_domains_mutex;
172 199
173#ifdef CONFIG_CGROUP_SCHED 200#ifdef CONFIG_CGROUP_SCHED
@@ -184,7 +211,7 @@ struct cfs_bandwidth {
184 raw_spinlock_t lock; 211 raw_spinlock_t lock;
185 ktime_t period; 212 ktime_t period;
186 u64 quota, runtime; 213 u64 quota, runtime;
187 s64 hierarchal_quota; 214 s64 hierarchical_quota;
188 u64 runtime_expires; 215 u64 runtime_expires;
189 216
190 int idle, timer_active; 217 int idle, timer_active;
@@ -636,6 +663,11 @@ struct rq {
636#ifdef CONFIG_SMP 663#ifdef CONFIG_SMP
637 struct llist_head wake_list; 664 struct llist_head wake_list;
638#endif 665#endif
666
667#ifdef CONFIG_CPU_IDLE
 668 /* Must be inspected within an RCU lock section */
669 struct cpuidle_state *idle_state;
670#endif
639}; 671};
640 672
641static inline int cpu_of(struct rq *rq) 673static inline int cpu_of(struct rq *rq)
@@ -647,13 +679,13 @@ static inline int cpu_of(struct rq *rq)
647#endif 679#endif
648} 680}
649 681
650DECLARE_PER_CPU(struct rq, runqueues); 682DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
651 683
652#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) 684#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
653#define this_rq() (&__get_cpu_var(runqueues)) 685#define this_rq() this_cpu_ptr(&runqueues)
654#define task_rq(p) cpu_rq(task_cpu(p)) 686#define task_rq(p) cpu_rq(task_cpu(p))
655#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 687#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
656#define raw_rq() (&__raw_get_cpu_var(runqueues)) 688#define raw_rq() raw_cpu_ptr(&runqueues)
657 689
658static inline u64 rq_clock(struct rq *rq) 690static inline u64 rq_clock(struct rq *rq)
659{ 691{
@@ -665,7 +697,25 @@ static inline u64 rq_clock_task(struct rq *rq)
665 return rq->clock_task; 697 return rq->clock_task;
666} 698}
667 699
700#ifdef CONFIG_NUMA
701enum numa_topology_type {
702 NUMA_DIRECT,
703 NUMA_GLUELESS_MESH,
704 NUMA_BACKPLANE,
705};
706extern enum numa_topology_type sched_numa_topology_type;
707extern int sched_max_numa_distance;
708extern bool find_numa_distance(int distance);
709#endif
710
668#ifdef CONFIG_NUMA_BALANCING 711#ifdef CONFIG_NUMA_BALANCING
712/* The regions in numa_faults array from task_struct */
713enum numa_faults_stats {
714 NUMA_MEM = 0,
715 NUMA_CPU,
716 NUMA_MEMBUF,
717 NUMA_CPUBUF
718};
669extern void sched_setnuma(struct task_struct *p, int node); 719extern void sched_setnuma(struct task_struct *p, int node);
670extern int migrate_task_to(struct task_struct *p, int cpu); 720extern int migrate_task_to(struct task_struct *p, int cpu);
671extern int migrate_swap(struct task_struct *, struct task_struct *); 721extern int migrate_swap(struct task_struct *, struct task_struct *);
@@ -942,6 +992,15 @@ static inline int task_running(struct rq *rq, struct task_struct *p)
942#endif 992#endif
943} 993}
944 994
995static inline int task_on_rq_queued(struct task_struct *p)
996{
997 return p->on_rq == TASK_ON_RQ_QUEUED;
998}
999
1000static inline int task_on_rq_migrating(struct task_struct *p)
1001{
1002 return p->on_rq == TASK_ON_RQ_MIGRATING;
1003}
945 1004
946#ifndef prepare_arch_switch 1005#ifndef prepare_arch_switch
947# define prepare_arch_switch(next) do { } while (0) 1006# define prepare_arch_switch(next) do { } while (0)
@@ -953,7 +1012,6 @@ static inline int task_running(struct rq *rq, struct task_struct *p)
953# define finish_arch_post_lock_switch() do { } while (0) 1012# define finish_arch_post_lock_switch() do { } while (0)
954#endif 1013#endif
955 1014
956#ifndef __ARCH_WANT_UNLOCKED_CTXSW
957static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 1015static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
958{ 1016{
959#ifdef CONFIG_SMP 1017#ifdef CONFIG_SMP
@@ -991,35 +1049,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
991 raw_spin_unlock_irq(&rq->lock); 1049 raw_spin_unlock_irq(&rq->lock);
992} 1050}
993 1051
994#else /* __ARCH_WANT_UNLOCKED_CTXSW */
995static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
996{
997#ifdef CONFIG_SMP
998 /*
999 * We can optimise this out completely for !SMP, because the
1000 * SMP rebalancing from interrupt is the only thing that cares
1001 * here.
1002 */
1003 next->on_cpu = 1;
1004#endif
1005 raw_spin_unlock(&rq->lock);
1006}
1007
1008static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
1009{
1010#ifdef CONFIG_SMP
1011 /*
1012 * After ->on_cpu is cleared, the task can be moved to a different CPU.
1013 * We must ensure this doesn't happen until the switch is completely
1014 * finished.
1015 */
1016 smp_wmb();
1017 prev->on_cpu = 0;
1018#endif
1019 local_irq_enable();
1020}
1021#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
1022
1023/* 1052/*
1024 * wake flags 1053 * wake flags
1025 */ 1054 */
@@ -1135,6 +1164,11 @@ struct sched_class {
1135 void (*task_fork) (struct task_struct *p); 1164 void (*task_fork) (struct task_struct *p);
1136 void (*task_dead) (struct task_struct *p); 1165 void (*task_dead) (struct task_struct *p);
1137 1166
1167 /*
1168 * The switched_from() call is allowed to drop rq->lock, therefore we
 1169 * cannot assume the switched_from/switched_to pair is serialized by
1170 * rq->lock. They are however serialized by p->pi_lock.
1171 */
1138 void (*switched_from) (struct rq *this_rq, struct task_struct *task); 1172 void (*switched_from) (struct rq *this_rq, struct task_struct *task);
1139 void (*switched_to) (struct rq *this_rq, struct task_struct *task); 1173 void (*switched_to) (struct rq *this_rq, struct task_struct *task);
1140 void (*prio_changed) (struct rq *this_rq, struct task_struct *task, 1174 void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
@@ -1143,6 +1177,8 @@ struct sched_class {
1143 unsigned int (*get_rr_interval) (struct rq *rq, 1177 unsigned int (*get_rr_interval) (struct rq *rq,
1144 struct task_struct *task); 1178 struct task_struct *task);
1145 1179
1180 void (*update_curr) (struct rq *rq);
1181
1146#ifdef CONFIG_FAIR_GROUP_SCHED 1182#ifdef CONFIG_FAIR_GROUP_SCHED
1147 void (*task_move_group) (struct task_struct *p, int on_rq); 1183 void (*task_move_group) (struct task_struct *p, int on_rq);
1148#endif 1184#endif
@@ -1180,6 +1216,30 @@ static inline void idle_exit_fair(struct rq *rq) { }
1180 1216
1181#endif 1217#endif
1182 1218
1219#ifdef CONFIG_CPU_IDLE
1220static inline void idle_set_state(struct rq *rq,
1221 struct cpuidle_state *idle_state)
1222{
1223 rq->idle_state = idle_state;
1224}
1225
1226static inline struct cpuidle_state *idle_get_state(struct rq *rq)
1227{
1228 WARN_ON(!rcu_read_lock_held());
1229 return rq->idle_state;
1230}
1231#else
1232static inline void idle_set_state(struct rq *rq,
1233 struct cpuidle_state *idle_state)
1234{
1235}
1236
1237static inline struct cpuidle_state *idle_get_state(struct rq *rq)
1238{
1239 return NULL;
1240}
1241#endif
1242
1183extern void sysrq_sched_debug_show(void); 1243extern void sysrq_sched_debug_show(void);
1184extern void sched_init_granularity(void); 1244extern void sched_init_granularity(void);
1185extern void update_max_interval(void); 1245extern void update_max_interval(void);
@@ -1486,6 +1546,7 @@ extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
1486extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq); 1546extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
1487extern void print_cfs_stats(struct seq_file *m, int cpu); 1547extern void print_cfs_stats(struct seq_file *m, int cpu);
1488extern void print_rt_stats(struct seq_file *m, int cpu); 1548extern void print_rt_stats(struct seq_file *m, int cpu);
1549extern void print_dl_stats(struct seq_file *m, int cpu);
1489 1550
1490extern void init_cfs_rq(struct cfs_rq *cfs_rq); 1551extern void init_cfs_rq(struct cfs_rq *cfs_rq);
1491extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); 1552extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
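The __dl_add()/__dl_clear()/__dl_overflow() inlines made visible above implement deadline-bandwidth admission control: a request is refused when bw * cpus would be exceeded by total_bw - old_bw + new_bw. A hedged usage sketch (sketch_dl_admit() is not a function in this series, and the caller is assumed to already serialize access to the dl_bw):

static int sketch_dl_admit(struct dl_bw *dl_b, int cpus, u64 new_bw)
{
        if (__dl_overflow(dl_b, cpus, 0, new_bw))
                return -EBUSY;          /* would exceed bw * cpus */

        __dl_add(dl_b, new_bw);         /* account the admitted bandwidth */
        return 0;
}

With made-up numbers: at 100% allowed per CPU on a 4-CPU root domain whose deadline tasks already consume 3.9 CPUs worth of bandwidth, admitting another 0.25 CPU fails because 4.0 < 3.9 + 0.25.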
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index bfe0edadbfbb..79ffec45a6ac 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -28,7 +28,7 @@ pick_next_task_stop(struct rq *rq, struct task_struct *prev)
28{ 28{
29 struct task_struct *stop = rq->stop; 29 struct task_struct *stop = rq->stop;
30 30
31 if (!stop || !stop->on_rq) 31 if (!stop || !task_on_rq_queued(stop))
32 return NULL; 32 return NULL;
33 33
34 put_prev_task(rq, prev); 34 put_prev_task(rq, prev);
@@ -102,6 +102,10 @@ get_rr_interval_stop(struct rq *rq, struct task_struct *task)
102 return 0; 102 return 0;
103} 103}
104 104
105static void update_curr_stop(struct rq *rq)
106{
107}
108
105/* 109/*
106 * Simple, special scheduling class for the per-CPU stop tasks: 110 * Simple, special scheduling class for the per-CPU stop tasks:
107 */ 111 */
@@ -128,4 +132,5 @@ const struct sched_class stop_sched_class = {
128 132
129 .prio_changed = prio_changed_stop, 133 .prio_changed = prio_changed_stop,
130 .switched_to = switched_to_stop, 134 .switched_to = switched_to_stop,
135 .update_curr = update_curr_stop,
131}; 136};
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 15cab1a4f84e..852143a79f36 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -9,6 +9,7 @@
9#include <linux/mm.h> 9#include <linux/mm.h>
10#include <linux/wait.h> 10#include <linux/wait.h>
11#include <linux/hash.h> 11#include <linux/hash.h>
12#include <linux/kthread.h>
12 13
13void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key) 14void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key)
14{ 15{
@@ -297,6 +298,71 @@ int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *
297} 298}
298EXPORT_SYMBOL(autoremove_wake_function); 299EXPORT_SYMBOL(autoremove_wake_function);
299 300
301static inline bool is_kthread_should_stop(void)
302{
303 return (current->flags & PF_KTHREAD) && kthread_should_stop();
304}
305
306/*
307 * DEFINE_WAIT_FUNC(wait, woken_wake_func);
308 *
309 * add_wait_queue(&wq, &wait);
310 * for (;;) {
311 * if (condition)
312 * break;
313 *
314 * p->state = mode; condition = true;
315 * smp_mb(); // A smp_wmb(); // C
316 * if (!wait->flags & WQ_FLAG_WOKEN) wait->flags |= WQ_FLAG_WOKEN;
317 * schedule() try_to_wake_up();
318 * p->state = TASK_RUNNING; ~~~~~~~~~~~~~~~~~~
319 * wait->flags &= ~WQ_FLAG_WOKEN; condition = true;
320 * smp_mb() // B smp_wmb(); // C
321 * wait->flags |= WQ_FLAG_WOKEN;
322 * }
323 * remove_wait_queue(&wq, &wait);
324 *
325 */
326long wait_woken(wait_queue_t *wait, unsigned mode, long timeout)
327{
328 set_current_state(mode); /* A */
329 /*
330 * The above implies an smp_mb(), which matches with the smp_wmb() from
331 * woken_wake_function() such that if we observe WQ_FLAG_WOKEN we must
332 * also observe all state before the wakeup.
333 */
334 if (!(wait->flags & WQ_FLAG_WOKEN) && !is_kthread_should_stop())
335 timeout = schedule_timeout(timeout);
336 __set_current_state(TASK_RUNNING);
337
338 /*
339 * The below implies an smp_mb(), it too pairs with the smp_wmb() from
340 * woken_wake_function() such that we must either observe the wait
341 * condition being true _OR_ WQ_FLAG_WOKEN such that we will not miss
342 * an event.
343 */
344 set_mb(wait->flags, wait->flags & ~WQ_FLAG_WOKEN); /* B */
345
346 return timeout;
347}
348EXPORT_SYMBOL(wait_woken);
349
350int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
351{
352 /*
353 * Although this function is called under waitqueue lock, LOCK
354 * doesn't imply write barrier and the users expects write
355 * barrier semantics on wakeup functions. The following
356 * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
357 * and is paired with set_mb() in wait_woken().
358 */
359 smp_wmb(); /* C */
360 wait->flags |= WQ_FLAG_WOKEN;
361
362 return default_wake_function(wait, mode, sync, key);
363}
364EXPORT_SYMBOL(woken_wake_function);
365
300int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg) 366int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg)
301{ 367{
302 struct wait_bit_key *key = arg; 368 struct wait_bit_key *key = arg;
@@ -343,6 +409,18 @@ int __sched out_of_line_wait_on_bit(void *word, int bit,
343} 409}
344EXPORT_SYMBOL(out_of_line_wait_on_bit); 410EXPORT_SYMBOL(out_of_line_wait_on_bit);
345 411
412int __sched out_of_line_wait_on_bit_timeout(
413 void *word, int bit, wait_bit_action_f *action,
414 unsigned mode, unsigned long timeout)
415{
416 wait_queue_head_t *wq = bit_waitqueue(word, bit);
417 DEFINE_WAIT_BIT(wait, word, bit);
418
419 wait.key.timeout = jiffies + timeout;
420 return __wait_on_bit(wq, &wait, action, mode);
421}
422EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout);
423
346int __sched 424int __sched
347__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, 425__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
348 wait_bit_action_f *action, unsigned mode) 426 wait_bit_action_f *action, unsigned mode)
@@ -520,3 +598,27 @@ __sched int bit_wait_io(struct wait_bit_key *word)
520 return 0; 598 return 0;
521} 599}
522EXPORT_SYMBOL(bit_wait_io); 600EXPORT_SYMBOL(bit_wait_io);
601
602__sched int bit_wait_timeout(struct wait_bit_key *word)
603{
604 unsigned long now = ACCESS_ONCE(jiffies);
605 if (signal_pending_state(current->state, current))
606 return 1;
607 if (time_after_eq(now, word->timeout))
608 return -EAGAIN;
609 schedule_timeout(word->timeout - now);
610 return 0;
611}
612EXPORT_SYMBOL_GPL(bit_wait_timeout);
613
614__sched int bit_wait_io_timeout(struct wait_bit_key *word)
615{
616 unsigned long now = ACCESS_ONCE(jiffies);
617 if (signal_pending_state(current->state, current))
618 return 1;
619 if (time_after_eq(now, word->timeout))
620 return -EAGAIN;
621 io_schedule_timeout(word->timeout - now);
622 return 0;
623}
624EXPORT_SYMBOL_GPL(bit_wait_io_timeout);
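wait_woken() and woken_wake_function() package the barrier dance documented in the comment block above, so a waiter no longer needs its own set_current_state()/test/schedule() loop. A usage sketch under assumed names (the condition here is a plain bool pointer; real users typically test socket or device state):

static int sketch_wait_for_flag(wait_queue_head_t *wq, bool *flag)
{
        DEFINE_WAIT_FUNC(wait, woken_wake_function);
        long timeout = HZ;                      /* arbitrary one-second budget */

        add_wait_queue(wq, &wait);
        while (!*flag) {
                if (signal_pending(current) || !timeout)
                        break;
                timeout = wait_woken(&wait, TASK_INTERRUPTIBLE, timeout);
        }
        remove_wait_queue(wq, &wait);

        if (*flag)
                return 0;
        return signal_pending(current) ? -ERESTARTSYS : -ETIMEDOUT;
}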
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 44eb005c6695..4ef9687ac115 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -21,10 +21,11 @@
21#include <linux/slab.h> 21#include <linux/slab.h>
22#include <linux/syscalls.h> 22#include <linux/syscalls.h>
23 23
24/* #define SECCOMP_DEBUG 1 */ 24#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
25#include <asm/syscall.h>
26#endif
25 27
26#ifdef CONFIG_SECCOMP_FILTER 28#ifdef CONFIG_SECCOMP_FILTER
27#include <asm/syscall.h>
28#include <linux/filter.h> 29#include <linux/filter.h>
29#include <linux/pid.h> 30#include <linux/pid.h>
30#include <linux/ptrace.h> 31#include <linux/ptrace.h>
@@ -172,10 +173,10 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
172 * 173 *
173 * Returns valid seccomp BPF response codes. 174 * Returns valid seccomp BPF response codes.
174 */ 175 */
175static u32 seccomp_run_filters(int syscall) 176static u32 seccomp_run_filters(struct seccomp_data *sd)
176{ 177{
177 struct seccomp_filter *f = ACCESS_ONCE(current->seccomp.filter); 178 struct seccomp_filter *f = ACCESS_ONCE(current->seccomp.filter);
178 struct seccomp_data sd; 179 struct seccomp_data sd_local;
179 u32 ret = SECCOMP_RET_ALLOW; 180 u32 ret = SECCOMP_RET_ALLOW;
180 181
181 /* Ensure unexpected behavior doesn't result in failing open. */ 182 /* Ensure unexpected behavior doesn't result in failing open. */
@@ -185,14 +186,17 @@ static u32 seccomp_run_filters(int syscall)
185 /* Make sure cross-thread synced filter points somewhere sane. */ 186 /* Make sure cross-thread synced filter points somewhere sane. */
186 smp_read_barrier_depends(); 187 smp_read_barrier_depends();
187 188
188 populate_seccomp_data(&sd); 189 if (!sd) {
190 populate_seccomp_data(&sd_local);
191 sd = &sd_local;
192 }
189 193
190 /* 194 /*
191 * All filters in the list are evaluated and the lowest BPF return 195 * All filters in the list are evaluated and the lowest BPF return
192 * value always takes priority (ignoring the DATA). 196 * value always takes priority (ignoring the DATA).
193 */ 197 */
194 for (; f; f = f->prev) { 198 for (; f; f = f->prev) {
195 u32 cur_ret = BPF_PROG_RUN(f->prog, (void *)&sd); 199 u32 cur_ret = BPF_PROG_RUN(f->prog, (void *)sd);
196 200
197 if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) 201 if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
198 ret = cur_ret; 202 ret = cur_ret;
@@ -395,16 +399,15 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
395 if (!filter) 399 if (!filter)
396 goto free_prog; 400 goto free_prog;
397 401
398 filter->prog = kzalloc(bpf_prog_size(new_len), 402 filter->prog = bpf_prog_alloc(bpf_prog_size(new_len), __GFP_NOWARN);
399 GFP_KERNEL|__GFP_NOWARN);
400 if (!filter->prog) 403 if (!filter->prog)
401 goto free_filter; 404 goto free_filter;
402 405
403 ret = bpf_convert_filter(fp, fprog->len, filter->prog->insnsi, &new_len); 406 ret = bpf_convert_filter(fp, fprog->len, filter->prog->insnsi, &new_len);
404 if (ret) 407 if (ret)
405 goto free_filter_prog; 408 goto free_filter_prog;
406 kfree(fp);
407 409
410 kfree(fp);
408 atomic_set(&filter->usage, 1); 411 atomic_set(&filter->usage, 1);
409 filter->prog->len = new_len; 412 filter->prog->len = new_len;
410 413
@@ -413,7 +416,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
413 return filter; 416 return filter;
414 417
415free_filter_prog: 418free_filter_prog:
416 kfree(filter->prog); 419 __bpf_prog_free(filter->prog);
417free_filter: 420free_filter:
418 kfree(filter); 421 kfree(filter);
419free_prog: 422free_prog:
@@ -564,11 +567,55 @@ static int mode1_syscalls_32[] = {
564}; 567};
565#endif 568#endif
566 569
567int __secure_computing(int this_syscall) 570static void __secure_computing_strict(int this_syscall)
571{
572 int *syscall_whitelist = mode1_syscalls;
573#ifdef CONFIG_COMPAT
574 if (is_compat_task())
575 syscall_whitelist = mode1_syscalls_32;
576#endif
577 do {
578 if (*syscall_whitelist == this_syscall)
579 return;
580 } while (*++syscall_whitelist);
581
582#ifdef SECCOMP_DEBUG
583 dump_stack();
584#endif
585 audit_seccomp(this_syscall, SIGKILL, SECCOMP_RET_KILL);
586 do_exit(SIGKILL);
587}
588
589#ifndef CONFIG_HAVE_ARCH_SECCOMP_FILTER
590void secure_computing_strict(int this_syscall)
591{
592 int mode = current->seccomp.mode;
593
594 if (mode == 0)
595 return;
596 else if (mode == SECCOMP_MODE_STRICT)
597 __secure_computing_strict(this_syscall);
598 else
599 BUG();
600}
601#else
602int __secure_computing(void)
603{
604 u32 phase1_result = seccomp_phase1(NULL);
605
606 if (likely(phase1_result == SECCOMP_PHASE1_OK))
607 return 0;
608 else if (likely(phase1_result == SECCOMP_PHASE1_SKIP))
609 return -1;
610 else
611 return seccomp_phase2(phase1_result);
612}
613
614#ifdef CONFIG_SECCOMP_FILTER
615static u32 __seccomp_phase1_filter(int this_syscall, struct seccomp_data *sd)
568{ 616{
569 int exit_sig = 0; 617 u32 filter_ret, action;
570 int *syscall; 618 int data;
571 u32 ret;
572 619
573 /* 620 /*
574 * Make sure that any changes to mode from another thread have 621 * Make sure that any changes to mode from another thread have
@@ -576,85 +623,127 @@ int __secure_computing(int this_syscall)
576 */ 623 */
577 rmb(); 624 rmb();
578 625
579 switch (current->seccomp.mode) { 626 filter_ret = seccomp_run_filters(sd);
580 case SECCOMP_MODE_STRICT: 627 data = filter_ret & SECCOMP_RET_DATA;
581 syscall = mode1_syscalls; 628 action = filter_ret & SECCOMP_RET_ACTION;
582#ifdef CONFIG_COMPAT 629
583 if (is_compat_task()) 630 switch (action) {
584 syscall = mode1_syscalls_32; 631 case SECCOMP_RET_ERRNO:
 632 /* Set the low-order 16 bits as an errno. */
633 syscall_set_return_value(current, task_pt_regs(current),
634 -data, 0);
635 goto skip;
636
637 case SECCOMP_RET_TRAP:
638 /* Show the handler the original registers. */
639 syscall_rollback(current, task_pt_regs(current));
640 /* Let the filter pass back 16 bits of data. */
641 seccomp_send_sigsys(this_syscall, data);
642 goto skip;
643
644 case SECCOMP_RET_TRACE:
645 return filter_ret; /* Save the rest for phase 2. */
646
647 case SECCOMP_RET_ALLOW:
648 return SECCOMP_PHASE1_OK;
649
650 case SECCOMP_RET_KILL:
651 default:
652 audit_seccomp(this_syscall, SIGSYS, action);
653 do_exit(SIGSYS);
654 }
655
656 unreachable();
657
658skip:
659 audit_seccomp(this_syscall, 0, action);
660 return SECCOMP_PHASE1_SKIP;
661}
585#endif 662#endif
586 do { 663
587 if (*syscall == this_syscall) 664/**
588 return 0; 665 * seccomp_phase1() - run fast path seccomp checks on the current syscall
589 } while (*++syscall); 666 * @arg sd: The seccomp_data or NULL
590 exit_sig = SIGKILL; 667 *
591 ret = SECCOMP_RET_KILL; 668 * This only reads pt_regs via the syscall_xyz helpers. The only change
592 break; 669 * it will make to pt_regs is via syscall_set_return_value, and it will
670 * only do that if it returns SECCOMP_PHASE1_SKIP.
671 *
672 * If sd is provided, it will not read pt_regs at all.
673 *
674 * It may also call do_exit or force a signal; these actions must be
675 * safe.
676 *
677 * If it returns SECCOMP_PHASE1_OK, the syscall passes checks and should
678 * be processed normally.
679 *
680 * If it returns SECCOMP_PHASE1_SKIP, then the syscall should not be
681 * invoked. In this case, seccomp_phase1 will have set the return value
682 * using syscall_set_return_value.
683 *
684 * If it returns anything else, then the return value should be passed
685 * to seccomp_phase2 from a context in which ptrace hooks are safe.
686 */
687u32 seccomp_phase1(struct seccomp_data *sd)
688{
689 int mode = current->seccomp.mode;
690 int this_syscall = sd ? sd->nr :
691 syscall_get_nr(current, task_pt_regs(current));
692
693 switch (mode) {
694 case SECCOMP_MODE_STRICT:
695 __secure_computing_strict(this_syscall); /* may call do_exit */
696 return SECCOMP_PHASE1_OK;
593#ifdef CONFIG_SECCOMP_FILTER 697#ifdef CONFIG_SECCOMP_FILTER
594 case SECCOMP_MODE_FILTER: { 698 case SECCOMP_MODE_FILTER:
595 int data; 699 return __seccomp_phase1_filter(this_syscall, sd);
596 struct pt_regs *regs = task_pt_regs(current);
597 ret = seccomp_run_filters(this_syscall);
598 data = ret & SECCOMP_RET_DATA;
599 ret &= SECCOMP_RET_ACTION;
600 switch (ret) {
601 case SECCOMP_RET_ERRNO:
602 /* Set the low-order 16-bits as a errno. */
603 syscall_set_return_value(current, regs,
604 -data, 0);
605 goto skip;
606 case SECCOMP_RET_TRAP:
607 /* Show the handler the original registers. */
608 syscall_rollback(current, regs);
609 /* Let the filter pass back 16 bits of data. */
610 seccomp_send_sigsys(this_syscall, data);
611 goto skip;
612 case SECCOMP_RET_TRACE:
613 /* Skip these calls if there is no tracer. */
614 if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
615 syscall_set_return_value(current, regs,
616 -ENOSYS, 0);
617 goto skip;
618 }
619 /* Allow the BPF to provide the event message */
620 ptrace_event(PTRACE_EVENT_SECCOMP, data);
621 /*
622 * The delivery of a fatal signal during event
623 * notification may silently skip tracer notification.
624 * Terminating the task now avoids executing a system
625 * call that may not be intended.
626 */
627 if (fatal_signal_pending(current))
628 break;
629 if (syscall_get_nr(current, regs) < 0)
630 goto skip; /* Explicit request to skip. */
631
632 return 0;
633 case SECCOMP_RET_ALLOW:
634 return 0;
635 case SECCOMP_RET_KILL:
636 default:
637 break;
638 }
639 exit_sig = SIGSYS;
640 break;
641 }
642#endif 700#endif
643 default: 701 default:
644 BUG(); 702 BUG();
645 } 703 }
704}
646 705
647#ifdef SECCOMP_DEBUG 706/**
648 dump_stack(); 707 * seccomp_phase2() - finish slow path seccomp work for the current syscall
649#endif 708 * @phase1_result: The return value from seccomp_phase1()
650 audit_seccomp(this_syscall, exit_sig, ret); 709 *
651 do_exit(exit_sig); 710 * This must be called from a context in which ptrace hooks can be used.
652#ifdef CONFIG_SECCOMP_FILTER 711 *
653skip: 712 * Returns 0 if the syscall should be processed or -1 to skip the syscall.
654 audit_seccomp(this_syscall, exit_sig, ret); 713 */
655#endif 714int seccomp_phase2(u32 phase1_result)
656 return -1; 715{
716 struct pt_regs *regs = task_pt_regs(current);
717 u32 action = phase1_result & SECCOMP_RET_ACTION;
718 int data = phase1_result & SECCOMP_RET_DATA;
719
720 BUG_ON(action != SECCOMP_RET_TRACE);
721
722 audit_seccomp(syscall_get_nr(current, regs), 0, action);
723
724 /* Skip these calls if there is no tracer. */
725 if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
726 syscall_set_return_value(current, regs,
727 -ENOSYS, 0);
728 return -1;
729 }
730
731 /* Allow the BPF to provide the event message */
732 ptrace_event(PTRACE_EVENT_SECCOMP, data);
733 /*
734 * The delivery of a fatal signal during event
735 * notification may silently skip tracer notification.
736 * Terminating the task now avoids executing a system
737 * call that may not be intended.
738 */
739 if (fatal_signal_pending(current))
740 do_exit(SIGSYS);
741 if (syscall_get_nr(current, regs) < 0)
742 return -1; /* Explicit request to skip. */
743
744 return 0;
657} 745}
746#endif /* CONFIG_HAVE_ARCH_SECCOMP_FILTER */
658 747
659long prctl_get_seccomp(void) 748long prctl_get_seccomp(void)
660{ 749{
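On CONFIG_HAVE_ARCH_SECCOMP_FILTER architectures, secure computing is now a two-phase protocol: seccomp_phase1() is cheap and regs-light (it can run from a fast syscall-entry path, taking a prebuilt seccomp_data or reading pt_regs itself), and only a SECCOMP_RET_TRACE result needs seccomp_phase2(), which must run where ptrace hooks are usable. The generic glue is the new __secure_computing() above; a hedged sketch of the same shape for a caller that already has a seccomp_data (names are illustrative):

static long sketch_handle_seccomp(struct seccomp_data *sd)
{
        u32 phase1 = seccomp_phase1(sd);        /* fast, regs-light checks */

        if (phase1 == SECCOMP_PHASE1_OK)
                return 0;                       /* run the syscall normally */
        if (phase1 == SECCOMP_PHASE1_SKIP)
                return -1;                      /* return value already set */

        /* SECCOMP_RET_TRACE: finish where ptrace hooks are safe */
        return seccomp_phase2(phase1);
}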
diff --git a/kernel/signal.c b/kernel/signal.c
index 8f0876f9f6dd..16a305295256 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1275,7 +1275,17 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
1275 local_irq_restore(*flags); 1275 local_irq_restore(*flags);
1276 break; 1276 break;
1277 } 1277 }
1278 1278 /*
1279 * This sighand can be already freed and even reused, but
1280 * we rely on SLAB_DESTROY_BY_RCU and sighand_ctor() which
1281 * initializes ->siglock: this slab can't go away, it has
1282 * the same object type, ->siglock can't be reinitialized.
1283 *
1284 * We need to ensure that tsk->sighand is still the same
1285 * after we take the lock, we can race with de_thread() or
1286 * __exit_signal(). In the latter case the next iteration
1287 * must see ->sighand == NULL.
1288 */
1279 spin_lock(&sighand->siglock); 1289 spin_lock(&sighand->siglock);
1280 if (likely(sighand == tsk->sighand)) { 1290 if (likely(sighand == tsk->sighand)) {
1281 rcu_read_unlock(); 1291 rcu_read_unlock();
@@ -1331,23 +1341,21 @@ int kill_pid_info(int sig, struct siginfo *info, struct pid *pid)
1331 int error = -ESRCH; 1341 int error = -ESRCH;
1332 struct task_struct *p; 1342 struct task_struct *p;
1333 1343
1334 rcu_read_lock(); 1344 for (;;) {
1335retry: 1345 rcu_read_lock();
1336 p = pid_task(pid, PIDTYPE_PID); 1346 p = pid_task(pid, PIDTYPE_PID);
1337 if (p) { 1347 if (p)
1338 error = group_send_sig_info(sig, info, p); 1348 error = group_send_sig_info(sig, info, p);
1339 if (unlikely(error == -ESRCH)) 1349 rcu_read_unlock();
1340 /* 1350 if (likely(!p || error != -ESRCH))
1341 * The task was unhashed in between, try again. 1351 return error;
1342 * If it is dead, pid_task() will return NULL,
1343 * if we race with de_thread() it will find the
1344 * new leader.
1345 */
1346 goto retry;
1347 }
1348 rcu_read_unlock();
1349 1352
1350 return error; 1353 /*
1354 * The task was unhashed in between, try again. If it
1355 * is dead, pid_task() will return NULL, if we race with
1356 * de_thread() it will find the new leader.
1357 */
1358 }
1351} 1359}
1352 1360
1353int kill_proc_info(int sig, struct siginfo *info, pid_t pid) 1361int kill_proc_info(int sig, struct siginfo *info, pid_t pid)
@@ -2748,6 +2756,10 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)
2748 if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO) 2756 if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO)
2749 err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); 2757 err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb);
2750#endif 2758#endif
2759#ifdef SEGV_BNDERR
2760 err |= __put_user(from->si_lower, &to->si_lower);
2761 err |= __put_user(from->si_upper, &to->si_upper);
2762#endif
2751 break; 2763 break;
2752 case __SI_CHLD: 2764 case __SI_CHLD:
2753 err |= __put_user(from->si_pid, &to->si_pid); 2765 err |= __put_user(from->si_pid, &to->si_pid);
diff --git a/kernel/smp.c b/kernel/smp.c
index aff8aa14f547..f38a1e692259 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -13,6 +13,7 @@
13#include <linux/gfp.h> 13#include <linux/gfp.h>
14#include <linux/smp.h> 14#include <linux/smp.h>
15#include <linux/cpu.h> 15#include <linux/cpu.h>
16#include <linux/sched.h>
16 17
17#include "smpboot.h" 18#include "smpboot.h"
18 19
@@ -164,7 +165,7 @@ static int generic_exec_single(int cpu, struct call_single_data *csd,
164 if (!csd) { 165 if (!csd) {
165 csd = &csd_stack; 166 csd = &csd_stack;
166 if (!wait) 167 if (!wait)
167 csd = &__get_cpu_var(csd_data); 168 csd = this_cpu_ptr(&csd_data);
168 } 169 }
169 170
170 csd_lock(csd); 171 csd_lock(csd);
@@ -229,7 +230,7 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
229 230
230 WARN_ON(!irqs_disabled()); 231 WARN_ON(!irqs_disabled());
231 232
232 head = &__get_cpu_var(call_single_queue); 233 head = this_cpu_ptr(&call_single_queue);
233 entry = llist_del_all(head); 234 entry = llist_del_all(head);
234 entry = llist_reverse_order(entry); 235 entry = llist_reverse_order(entry);
235 236
@@ -419,7 +420,7 @@ void smp_call_function_many(const struct cpumask *mask,
419 return; 420 return;
420 } 421 }
421 422
422 cfd = &__get_cpu_var(cfd_data); 423 cfd = this_cpu_ptr(&cfd_data);
423 424
424 cpumask_and(cfd->cpumask, mask, cpu_online_mask); 425 cpumask_and(cfd->cpumask, mask, cpu_online_mask);
425 cpumask_clear_cpu(this_cpu, cfd->cpumask); 426 cpumask_clear_cpu(this_cpu, cfd->cpumask);
@@ -699,3 +700,24 @@ void kick_all_cpus_sync(void)
699 smp_call_function(do_nothing, NULL, 1); 700 smp_call_function(do_nothing, NULL, 1);
700} 701}
701EXPORT_SYMBOL_GPL(kick_all_cpus_sync); 702EXPORT_SYMBOL_GPL(kick_all_cpus_sync);
703
704/**
705 * wake_up_all_idle_cpus - break all cpus out of idle
706 * wake_up_all_idle_cpus try to break all cpus which is in idle state even
707 * including idle polling cpus, for non-idle cpus, we will do nothing
708 * for them.
709 */
710void wake_up_all_idle_cpus(void)
711{
712 int cpu;
713
714 preempt_disable();
715 for_each_online_cpu(cpu) {
716 if (cpu == smp_processor_id())
717 continue;
718
719 wake_up_if_idle(cpu);
720 }
721 preempt_enable();
722}
723EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus);
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index eb89e1807408..f032fb5284e3 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -110,7 +110,7 @@ static int smpboot_thread_fn(void *data)
110 set_current_state(TASK_INTERRUPTIBLE); 110 set_current_state(TASK_INTERRUPTIBLE);
111 preempt_disable(); 111 preempt_disable();
112 if (kthread_should_stop()) { 112 if (kthread_should_stop()) {
113 set_current_state(TASK_RUNNING); 113 __set_current_state(TASK_RUNNING);
114 preempt_enable(); 114 preempt_enable();
115 if (ht->cleanup) 115 if (ht->cleanup)
116 ht->cleanup(td->cpu, cpu_online(td->cpu)); 116 ht->cleanup(td->cpu, cpu_online(td->cpu));
@@ -136,26 +136,27 @@ static int smpboot_thread_fn(void *data)
136 /* Check for state change setup */ 136 /* Check for state change setup */
137 switch (td->status) { 137 switch (td->status) {
138 case HP_THREAD_NONE: 138 case HP_THREAD_NONE:
139 __set_current_state(TASK_RUNNING);
139 preempt_enable(); 140 preempt_enable();
140 if (ht->setup) 141 if (ht->setup)
141 ht->setup(td->cpu); 142 ht->setup(td->cpu);
142 td->status = HP_THREAD_ACTIVE; 143 td->status = HP_THREAD_ACTIVE;
143 preempt_disable(); 144 continue;
144 break; 145
145 case HP_THREAD_PARKED: 146 case HP_THREAD_PARKED:
147 __set_current_state(TASK_RUNNING);
146 preempt_enable(); 148 preempt_enable();
147 if (ht->unpark) 149 if (ht->unpark)
148 ht->unpark(td->cpu); 150 ht->unpark(td->cpu);
149 td->status = HP_THREAD_ACTIVE; 151 td->status = HP_THREAD_ACTIVE;
150 preempt_disable(); 152 continue;
151 break;
152 } 153 }
153 154
154 if (!ht->thread_should_run(td->cpu)) { 155 if (!ht->thread_should_run(td->cpu)) {
155 preempt_enable(); 156 preempt_enable_no_resched();
156 schedule(); 157 schedule();
157 } else { 158 } else {
158 set_current_state(TASK_RUNNING); 159 __set_current_state(TASK_RUNNING);
159 preempt_enable(); 160 preempt_enable();
160 ht->thread_fn(td->cpu); 161 ht->thread_fn(td->cpu);
161 } 162 }
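The smpboot changes above all enforce the same rule: after set_current_state(TASK_INTERRUPTIBLE), every path that decides not to sleep must return to TASK_RUNNING explicitly (__set_current_state() suffices because no memory barrier is needed there), and preempt_enable_no_resched() pairs with an immediate schedule(). The underlying wait pattern, as a stand-alone sketch with a hypothetical work counter:

static int sketch_kthread_fn(void *data)
{
        atomic_t *pending = data;       /* hypothetical per-thread work count */

        while (!kthread_should_stop()) {
                /*
                 * Set the sleep state before testing the condition so a
                 * wake-up between the test and schedule() is not lost.
                 */
                set_current_state(TASK_INTERRUPTIBLE);

                if (!atomic_read(pending)) {
                        schedule();
                        continue;
                }

                __set_current_state(TASK_RUNNING);
                atomic_dec(pending);    /* stand-in for the real work */
        }
        return 0;
}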
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 5918d227730f..501baa9ac1be 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -278,7 +278,7 @@ restart:
278 pending >>= softirq_bit; 278 pending >>= softirq_bit;
279 } 279 }
280 280
281 rcu_bh_qs(smp_processor_id()); 281 rcu_bh_qs();
282 local_irq_disable(); 282 local_irq_disable();
283 283
284 pending = local_softirq_pending(); 284 pending = local_softirq_pending();
@@ -485,7 +485,7 @@ static void tasklet_action(struct softirq_action *a)
485 local_irq_disable(); 485 local_irq_disable();
486 list = __this_cpu_read(tasklet_vec.head); 486 list = __this_cpu_read(tasklet_vec.head);
487 __this_cpu_write(tasklet_vec.head, NULL); 487 __this_cpu_write(tasklet_vec.head, NULL);
488 __this_cpu_write(tasklet_vec.tail, &__get_cpu_var(tasklet_vec).head); 488 __this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head));
489 local_irq_enable(); 489 local_irq_enable();
490 490
491 while (list) { 491 while (list) {
@@ -521,7 +521,7 @@ static void tasklet_hi_action(struct softirq_action *a)
521 local_irq_disable(); 521 local_irq_disable();
522 list = __this_cpu_read(tasklet_hi_vec.head); 522 list = __this_cpu_read(tasklet_hi_vec.head);
523 __this_cpu_write(tasklet_hi_vec.head, NULL); 523 __this_cpu_write(tasklet_hi_vec.head, NULL);
524 __this_cpu_write(tasklet_hi_vec.tail, &__get_cpu_var(tasklet_hi_vec).head); 524 __this_cpu_write(tasklet_hi_vec.tail, this_cpu_ptr(&tasklet_hi_vec.head));
525 local_irq_enable(); 525 local_irq_enable();
526 526
527 while (list) { 527 while (list) {
@@ -656,7 +656,7 @@ static void run_ksoftirqd(unsigned int cpu)
656 * in the task stack here. 656 * in the task stack here.
657 */ 657 */
658 __do_softirq(); 658 __do_softirq();
659 rcu_note_context_switch(cpu); 659 rcu_note_context_switch();
660 local_irq_enable(); 660 local_irq_enable();
661 cond_resched(); 661 cond_resched();
662 return; 662 return;
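The tasklet hunks above are part of the tree-wide retirement of __get_cpu_var(); the same substitution shows up earlier in this diff in fair.c, rt.c, sched.h and smp.c (with this_cpu_cpumask_var_ptr() as the variant for per-CPU cpumask_var_t). A hedged summary of the mapping, using a hypothetical per-CPU variable:

static DEFINE_PER_CPU(struct llist_head, sketch_queue);

static void sketch_percpu_accessors(void)
{
        struct llist_head *q;

        q = this_cpu_ptr(&sketch_queue);        /* was &__get_cpu_var(sketch_queue) */
        q = raw_cpu_ptr(&sketch_queue);         /* was &__raw_get_cpu_var(sketch_queue) */
        q = per_cpu_ptr(&sketch_queue, 0);      /* remote-CPU access is unchanged */
        (void)q;
}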
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index 00fe55cc5a82..b6e4c16377c7 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -25,6 +25,38 @@ void print_stack_trace(struct stack_trace *trace, int spaces)
25} 25}
26EXPORT_SYMBOL_GPL(print_stack_trace); 26EXPORT_SYMBOL_GPL(print_stack_trace);
27 27
28int snprint_stack_trace(char *buf, size_t size,
29 struct stack_trace *trace, int spaces)
30{
31 int i;
32 unsigned long ip;
33 int generated;
34 int total = 0;
35
36 if (WARN_ON(!trace->entries))
37 return 0;
38
39 for (i = 0; i < trace->nr_entries; i++) {
40 ip = trace->entries[i];
41 generated = snprintf(buf, size, "%*c[<%p>] %pS\n",
42 1 + spaces, ' ', (void *) ip, (void *) ip);
43
44 total += generated;
45
46 /* Assume that generated isn't a negative number */
47 if (generated >= size) {
48 buf += size;
49 size = 0;
50 } else {
51 buf += generated;
52 size -= generated;
53 }
54 }
55
56 return total;
57}
58EXPORT_SYMBOL_GPL(snprint_stack_trace);
59
28/* 60/*
29 * Architectures that do not implement save_stack_trace_tsk or 61 * Architectures that do not implement save_stack_trace_tsk or
30 * save_stack_trace_regs get this weak alias and a once-per-bootup warning 62 * save_stack_trace_regs get this weak alias and a once-per-bootup warning
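snprint_stack_trace() renders an already-captured trace into a caller-supplied buffer and, like snprintf(), returns the length that would have been generated even when the buffer is too small. A hypothetical caller (buffer and entry counts are arbitrary):

static void sketch_log_current_stack(void)
{
        unsigned long entries[16];
        char buf[512];
        struct stack_trace trace = {
                .entries        = entries,
                .max_entries    = ARRAY_SIZE(entries),
                .skip           = 1,            /* drop this helper's own frame */
        };

        save_stack_trace(&trace);
        snprint_stack_trace(buf, sizeof(buf), &trace, 0);
        pr_info("%s", buf);
}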
diff --git a/kernel/sys.c b/kernel/sys.c
index ce8129192a26..a8c9f5a7dda6 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -62,28 +62,28 @@
62#include <asm/unistd.h> 62#include <asm/unistd.h>
63 63
64#ifndef SET_UNALIGN_CTL 64#ifndef SET_UNALIGN_CTL
65# define SET_UNALIGN_CTL(a,b) (-EINVAL) 65# define SET_UNALIGN_CTL(a, b) (-EINVAL)
66#endif 66#endif
67#ifndef GET_UNALIGN_CTL 67#ifndef GET_UNALIGN_CTL
68# define GET_UNALIGN_CTL(a,b) (-EINVAL) 68# define GET_UNALIGN_CTL(a, b) (-EINVAL)
69#endif 69#endif
70#ifndef SET_FPEMU_CTL 70#ifndef SET_FPEMU_CTL
71# define SET_FPEMU_CTL(a,b) (-EINVAL) 71# define SET_FPEMU_CTL(a, b) (-EINVAL)
72#endif 72#endif
73#ifndef GET_FPEMU_CTL 73#ifndef GET_FPEMU_CTL
74# define GET_FPEMU_CTL(a,b) (-EINVAL) 74# define GET_FPEMU_CTL(a, b) (-EINVAL)
75#endif 75#endif
76#ifndef SET_FPEXC_CTL 76#ifndef SET_FPEXC_CTL
77# define SET_FPEXC_CTL(a,b) (-EINVAL) 77# define SET_FPEXC_CTL(a, b) (-EINVAL)
78#endif 78#endif
79#ifndef GET_FPEXC_CTL 79#ifndef GET_FPEXC_CTL
80# define GET_FPEXC_CTL(a,b) (-EINVAL) 80# define GET_FPEXC_CTL(a, b) (-EINVAL)
81#endif 81#endif
82#ifndef GET_ENDIAN 82#ifndef GET_ENDIAN
83# define GET_ENDIAN(a,b) (-EINVAL) 83# define GET_ENDIAN(a, b) (-EINVAL)
84#endif 84#endif
85#ifndef SET_ENDIAN 85#ifndef SET_ENDIAN
86# define SET_ENDIAN(a,b) (-EINVAL) 86# define SET_ENDIAN(a, b) (-EINVAL)
87#endif 87#endif
88#ifndef GET_TSC_CTL 88#ifndef GET_TSC_CTL
89# define GET_TSC_CTL(a) (-EINVAL) 89# define GET_TSC_CTL(a) (-EINVAL)
@@ -91,6 +91,12 @@
91#ifndef SET_TSC_CTL 91#ifndef SET_TSC_CTL
92# define SET_TSC_CTL(a) (-EINVAL) 92# define SET_TSC_CTL(a) (-EINVAL)
93#endif 93#endif
94#ifndef MPX_ENABLE_MANAGEMENT
95# define MPX_ENABLE_MANAGEMENT(a) (-EINVAL)
96#endif
97#ifndef MPX_DISABLE_MANAGEMENT
98# define MPX_DISABLE_MANAGEMENT(a) (-EINVAL)
99#endif
94 100
95/* 101/*
96 * this is where the system-wide overflow UID and GID are defined, for 102 * this is where the system-wide overflow UID and GID are defined, for
@@ -182,39 +188,40 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
182 rcu_read_lock(); 188 rcu_read_lock();
183 read_lock(&tasklist_lock); 189 read_lock(&tasklist_lock);
184 switch (which) { 190 switch (which) {
185 case PRIO_PROCESS: 191 case PRIO_PROCESS:
186 if (who) 192 if (who)
187 p = find_task_by_vpid(who); 193 p = find_task_by_vpid(who);
188 else 194 else
189 p = current; 195 p = current;
190 if (p) 196 if (p)
191 error = set_one_prio(p, niceval, error); 197 error = set_one_prio(p, niceval, error);
192 break; 198 break;
193 case PRIO_PGRP: 199 case PRIO_PGRP:
194 if (who) 200 if (who)
195 pgrp = find_vpid(who); 201 pgrp = find_vpid(who);
196 else 202 else
197 pgrp = task_pgrp(current); 203 pgrp = task_pgrp(current);
198 do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { 204 do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
199 error = set_one_prio(p, niceval, error); 205 error = set_one_prio(p, niceval, error);
200 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); 206 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
201 break; 207 break;
202 case PRIO_USER: 208 case PRIO_USER:
203 uid = make_kuid(cred->user_ns, who); 209 uid = make_kuid(cred->user_ns, who);
204 user = cred->user; 210 user = cred->user;
205 if (!who) 211 if (!who)
206 uid = cred->uid; 212 uid = cred->uid;
207 else if (!uid_eq(uid, cred->uid) && 213 else if (!uid_eq(uid, cred->uid)) {
208 !(user = find_user(uid))) 214 user = find_user(uid);
215 if (!user)
209 goto out_unlock; /* No processes for this user */ 216 goto out_unlock; /* No processes for this user */
210 217 }
211 do_each_thread(g, p) { 218 do_each_thread(g, p) {
212 if (uid_eq(task_uid(p), uid)) 219 if (uid_eq(task_uid(p), uid))
213 error = set_one_prio(p, niceval, error); 220 error = set_one_prio(p, niceval, error);
214 } while_each_thread(g, p); 221 } while_each_thread(g, p);
215 if (!uid_eq(uid, cred->uid)) 222 if (!uid_eq(uid, cred->uid))
216 free_uid(user); /* For find_user() */ 223 free_uid(user); /* For find_user() */
217 break; 224 break;
218 } 225 }
219out_unlock: 226out_unlock:
220 read_unlock(&tasklist_lock); 227 read_unlock(&tasklist_lock);
@@ -244,47 +251,48 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
244 rcu_read_lock(); 251 rcu_read_lock();
245 read_lock(&tasklist_lock); 252 read_lock(&tasklist_lock);
246 switch (which) { 253 switch (which) {
247 case PRIO_PROCESS: 254 case PRIO_PROCESS:
248 if (who) 255 if (who)
249 p = find_task_by_vpid(who); 256 p = find_task_by_vpid(who);
250 else 257 else
251 p = current; 258 p = current;
252 if (p) { 259 if (p) {
260 niceval = nice_to_rlimit(task_nice(p));
261 if (niceval > retval)
262 retval = niceval;
263 }
264 break;
265 case PRIO_PGRP:
266 if (who)
267 pgrp = find_vpid(who);
268 else
269 pgrp = task_pgrp(current);
270 do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
271 niceval = nice_to_rlimit(task_nice(p));
272 if (niceval > retval)
273 retval = niceval;
274 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
275 break;
276 case PRIO_USER:
277 uid = make_kuid(cred->user_ns, who);
278 user = cred->user;
279 if (!who)
280 uid = cred->uid;
281 else if (!uid_eq(uid, cred->uid)) {
282 user = find_user(uid);
283 if (!user)
284 goto out_unlock; /* No processes for this user */
285 }
286 do_each_thread(g, p) {
287 if (uid_eq(task_uid(p), uid)) {
253 niceval = nice_to_rlimit(task_nice(p)); 288 niceval = nice_to_rlimit(task_nice(p));
254 if (niceval > retval) 289 if (niceval > retval)
255 retval = niceval; 290 retval = niceval;
256 } 291 }
257 break; 292 } while_each_thread(g, p);
258 case PRIO_PGRP: 293 if (!uid_eq(uid, cred->uid))
259 if (who) 294 free_uid(user); /* for find_user() */
260 pgrp = find_vpid(who); 295 break;
261 else
262 pgrp = task_pgrp(current);
263 do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
264 niceval = nice_to_rlimit(task_nice(p));
265 if (niceval > retval)
266 retval = niceval;
267 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
268 break;
269 case PRIO_USER:
270 uid = make_kuid(cred->user_ns, who);
271 user = cred->user;
272 if (!who)
273 uid = cred->uid;
274 else if (!uid_eq(uid, cred->uid) &&
275 !(user = find_user(uid)))
276 goto out_unlock; /* No processes for this user */
277
278 do_each_thread(g, p) {
279 if (uid_eq(task_uid(p), uid)) {
280 niceval = nice_to_rlimit(task_nice(p));
281 if (niceval > retval)
282 retval = niceval;
283 }
284 } while_each_thread(g, p);
285 if (!uid_eq(uid, cred->uid))
286 free_uid(user); /* for find_user() */
287 break;
288 } 296 }
289out_unlock: 297out_unlock:
290 read_unlock(&tasklist_lock); 298 read_unlock(&tasklist_lock);
@@ -306,7 +314,7 @@ out_unlock:
306 * 314 *
307 * The general idea is that a program which uses just setregid() will be 315 * The general idea is that a program which uses just setregid() will be
308 * 100% compatible with BSD. A program which uses just setgid() will be 316 * 100% compatible with BSD. A program which uses just setgid() will be
309 * 100% compatible with POSIX with saved IDs. 317 * 100% compatible with POSIX with saved IDs.
310 * 318 *
311 * SMP: There are not races, the GIDs are checked only by filesystem 319 * SMP: There are not races, the GIDs are checked only by filesystem
312 * operations (as far as semantic preservation is concerned). 320 * operations (as far as semantic preservation is concerned).
@@ -364,7 +372,7 @@ error:
364} 372}
365 373
366/* 374/*
367 * setgid() is implemented like SysV w/ SAVED_IDS 375 * setgid() is implemented like SysV w/ SAVED_IDS
368 * 376 *
369 * SMP: Same implicit races as above. 377 * SMP: Same implicit races as above.
370 */ 378 */
@@ -442,7 +450,7 @@ static int set_user(struct cred *new)
442 * 450 *
443 * The general idea is that a program which uses just setreuid() will be 451 * The general idea is that a program which uses just setreuid() will be
444 * 100% compatible with BSD. A program which uses just setuid() will be 452 * 100% compatible with BSD. A program which uses just setuid() will be
445 * 100% compatible with POSIX with saved IDs. 453 * 100% compatible with POSIX with saved IDs.
446 */ 454 */
447SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) 455SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
448{ 456{
@@ -503,17 +511,17 @@ error:
503 abort_creds(new); 511 abort_creds(new);
504 return retval; 512 return retval;
505} 513}
506 514
507/* 515/*
508 * setuid() is implemented like SysV with SAVED_IDS 516 * setuid() is implemented like SysV with SAVED_IDS
509 * 517 *
510 * Note that SAVED_ID's is deficient in that a setuid root program 518 * Note that SAVED_ID's is deficient in that a setuid root program
511 * like sendmail, for example, cannot set its uid to be a normal 519 * like sendmail, for example, cannot set its uid to be a normal
512 * user and then switch back, because if you're root, setuid() sets 520 * user and then switch back, because if you're root, setuid() sets
513 * the saved uid too. If you don't like this, blame the bright people 521 * the saved uid too. If you don't like this, blame the bright people
514 * in the POSIX committee and/or USG. Note that the BSD-style setreuid() 522 * in the POSIX committee and/or USG. Note that the BSD-style setreuid()
515 * will allow a root program to temporarily drop privileges and be able to 523 * will allow a root program to temporarily drop privileges and be able to
516 * regain them by swapping the real and effective uid. 524 * regain them by swapping the real and effective uid.
517 */ 525 */
518SYSCALL_DEFINE1(setuid, uid_t, uid) 526SYSCALL_DEFINE1(setuid, uid_t, uid)
519{ 527{
@@ -637,10 +645,12 @@ SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t _
637 euid = from_kuid_munged(cred->user_ns, cred->euid); 645 euid = from_kuid_munged(cred->user_ns, cred->euid);
638 suid = from_kuid_munged(cred->user_ns, cred->suid); 646 suid = from_kuid_munged(cred->user_ns, cred->suid);
639 647
640 if (!(retval = put_user(ruid, ruidp)) && 648 retval = put_user(ruid, ruidp);
641 !(retval = put_user(euid, euidp))) 649 if (!retval) {
642 retval = put_user(suid, suidp); 650 retval = put_user(euid, euidp);
643 651 if (!retval)
652 return put_user(suid, suidp);
653 }
644 return retval; 654 return retval;
645} 655}
646 656
@@ -709,9 +719,12 @@ SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t _
709 egid = from_kgid_munged(cred->user_ns, cred->egid); 719 egid = from_kgid_munged(cred->user_ns, cred->egid);
710 sgid = from_kgid_munged(cred->user_ns, cred->sgid); 720 sgid = from_kgid_munged(cred->user_ns, cred->sgid);
711 721
712 if (!(retval = put_user(rgid, rgidp)) && 722 retval = put_user(rgid, rgidp);
713 !(retval = put_user(egid, egidp))) 723 if (!retval) {
714 retval = put_user(sgid, sgidp); 724 retval = put_user(egid, egidp);
725 if (!retval)
726 retval = put_user(sgid, sgidp);
727 }
715 728
716 return retval; 729 return retval;
717} 730}
@@ -862,11 +875,9 @@ void do_sys_times(struct tms *tms)
862{ 875{
863 cputime_t tgutime, tgstime, cutime, cstime; 876 cputime_t tgutime, tgstime, cutime, cstime;
864 877
865 spin_lock_irq(&current->sighand->siglock);
866 thread_group_cputime_adjusted(current, &tgutime, &tgstime); 878 thread_group_cputime_adjusted(current, &tgutime, &tgstime);
867 cutime = current->signal->cutime; 879 cutime = current->signal->cutime;
868 cstime = current->signal->cstime; 880 cstime = current->signal->cstime;
869 spin_unlock_irq(&current->sighand->siglock);
870 tms->tms_utime = cputime_to_clock_t(tgutime); 881 tms->tms_utime = cputime_to_clock_t(tgutime);
871 tms->tms_stime = cputime_to_clock_t(tgstime); 882 tms->tms_stime = cputime_to_clock_t(tgstime);
872 tms->tms_cutime = cputime_to_clock_t(cutime); 883 tms->tms_cutime = cputime_to_clock_t(cutime);
@@ -1284,7 +1295,6 @@ SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim)
1284/* 1295/*
1285 * Back compatibility for getrlimit. Needed for some apps. 1296 * Back compatibility for getrlimit. Needed for some apps.
1286 */ 1297 */
1287
1288SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, 1298SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
1289 struct rlimit __user *, rlim) 1299 struct rlimit __user *, rlim)
1290{ 1300{
@@ -1299,7 +1309,7 @@ SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
1299 x.rlim_cur = 0x7FFFFFFF; 1309 x.rlim_cur = 0x7FFFFFFF;
1300 if (x.rlim_max > 0x7FFFFFFF) 1310 if (x.rlim_max > 0x7FFFFFFF)
1301 x.rlim_max = 0x7FFFFFFF; 1311 x.rlim_max = 0x7FFFFFFF;
1302 return copy_to_user(rlim, &x, sizeof(x))?-EFAULT:0; 1312 return copy_to_user(rlim, &x, sizeof(x)) ? -EFAULT : 0;
1303} 1313}
1304 1314
1305#endif 1315#endif
@@ -1527,7 +1537,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1527 cputime_t tgutime, tgstime, utime, stime; 1537 cputime_t tgutime, tgstime, utime, stime;
1528 unsigned long maxrss = 0; 1538 unsigned long maxrss = 0;
1529 1539
1530 memset((char *) r, 0, sizeof *r); 1540 memset((char *)r, 0, sizeof (*r));
1531 utime = stime = 0; 1541 utime = stime = 0;
1532 1542
1533 if (who == RUSAGE_THREAD) { 1543 if (who == RUSAGE_THREAD) {
@@ -1541,41 +1551,41 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1541 return; 1551 return;
1542 1552
1543 switch (who) { 1553 switch (who) {
1544 case RUSAGE_BOTH: 1554 case RUSAGE_BOTH:
1545 case RUSAGE_CHILDREN: 1555 case RUSAGE_CHILDREN:
1546 utime = p->signal->cutime; 1556 utime = p->signal->cutime;
1547 stime = p->signal->cstime; 1557 stime = p->signal->cstime;
1548 r->ru_nvcsw = p->signal->cnvcsw; 1558 r->ru_nvcsw = p->signal->cnvcsw;
1549 r->ru_nivcsw = p->signal->cnivcsw; 1559 r->ru_nivcsw = p->signal->cnivcsw;
1550 r->ru_minflt = p->signal->cmin_flt; 1560 r->ru_minflt = p->signal->cmin_flt;
1551 r->ru_majflt = p->signal->cmaj_flt; 1561 r->ru_majflt = p->signal->cmaj_flt;
1552 r->ru_inblock = p->signal->cinblock; 1562 r->ru_inblock = p->signal->cinblock;
1553 r->ru_oublock = p->signal->coublock; 1563 r->ru_oublock = p->signal->coublock;
1554 maxrss = p->signal->cmaxrss; 1564 maxrss = p->signal->cmaxrss;
1555 1565
1556 if (who == RUSAGE_CHILDREN) 1566 if (who == RUSAGE_CHILDREN)
1557 break;
1558
1559 case RUSAGE_SELF:
1560 thread_group_cputime_adjusted(p, &tgutime, &tgstime);
1561 utime += tgutime;
1562 stime += tgstime;
1563 r->ru_nvcsw += p->signal->nvcsw;
1564 r->ru_nivcsw += p->signal->nivcsw;
1565 r->ru_minflt += p->signal->min_flt;
1566 r->ru_majflt += p->signal->maj_flt;
1567 r->ru_inblock += p->signal->inblock;
1568 r->ru_oublock += p->signal->oublock;
1569 if (maxrss < p->signal->maxrss)
1570 maxrss = p->signal->maxrss;
1571 t = p;
1572 do {
1573 accumulate_thread_rusage(t, r);
1574 } while_each_thread(p, t);
1575 break; 1567 break;
1576 1568
1577 default: 1569 case RUSAGE_SELF:
1578 BUG(); 1570 thread_group_cputime_adjusted(p, &tgutime, &tgstime);
1571 utime += tgutime;
1572 stime += tgstime;
1573 r->ru_nvcsw += p->signal->nvcsw;
1574 r->ru_nivcsw += p->signal->nivcsw;
1575 r->ru_minflt += p->signal->min_flt;
1576 r->ru_majflt += p->signal->maj_flt;
1577 r->ru_inblock += p->signal->inblock;
1578 r->ru_oublock += p->signal->oublock;
1579 if (maxrss < p->signal->maxrss)
1580 maxrss = p->signal->maxrss;
1581 t = p;
1582 do {
1583 accumulate_thread_rusage(t, r);
1584 } while_each_thread(p, t);
1585 break;
1586
1587 default:
1588 BUG();
1579 } 1589 }
1580 unlock_task_sighand(p, &flags); 1590 unlock_task_sighand(p, &flags);
1581 1591
@@ -1585,6 +1595,7 @@ out:
1585 1595
1586 if (who != RUSAGE_CHILDREN) { 1596 if (who != RUSAGE_CHILDREN) {
1587 struct mm_struct *mm = get_task_mm(p); 1597 struct mm_struct *mm = get_task_mm(p);
1598
1588 if (mm) { 1599 if (mm) {
1589 setmax_mm_hiwater_rss(&maxrss, mm); 1600 setmax_mm_hiwater_rss(&maxrss, mm);
1590 mmput(mm); 1601 mmput(mm);
@@ -1596,6 +1607,7 @@ out:
1596int getrusage(struct task_struct *p, int who, struct rusage __user *ru) 1607int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
1597{ 1608{
1598 struct rusage r; 1609 struct rusage r;
1610
1599 k_getrusage(p, who, &r); 1611 k_getrusage(p, who, &r);
1600 return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0; 1612 return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0;
1601} 1613}
@@ -1628,12 +1640,14 @@ SYSCALL_DEFINE1(umask, int, mask)
1628 return mask; 1640 return mask;
1629} 1641}
1630 1642
1631static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) 1643static int prctl_set_mm_exe_file_locked(struct mm_struct *mm, unsigned int fd)
1632{ 1644{
1633 struct fd exe; 1645 struct fd exe;
1634 struct inode *inode; 1646 struct inode *inode;
1635 int err; 1647 int err;
1636 1648
1649 VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);
1650
1637 exe = fdget(fd); 1651 exe = fdget(fd);
1638 if (!exe.file) 1652 if (!exe.file)
1639 return -EBADF; 1653 return -EBADF;
@@ -1654,8 +1668,6 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1654 if (err) 1668 if (err)
1655 goto exit; 1669 goto exit;
1656 1670
1657 down_write(&mm->mmap_sem);
1658
1659 /* 1671 /*
1660 * Forbid mm->exe_file change if old file still mapped. 1672 * Forbid mm->exe_file change if old file still mapped.
1661 */ 1673 */
@@ -1667,7 +1679,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1667 if (vma->vm_file && 1679 if (vma->vm_file &&
1668 path_equal(&vma->vm_file->f_path, 1680 path_equal(&vma->vm_file->f_path,
1669 &mm->exe_file->f_path)) 1681 &mm->exe_file->f_path))
1670 goto exit_unlock; 1682 goto exit;
1671 } 1683 }
1672 1684
1673 /* 1685 /*
@@ -1678,34 +1690,222 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1678 */ 1690 */
1679 err = -EPERM; 1691 err = -EPERM;
1680 if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags)) 1692 if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags))
1681 goto exit_unlock; 1693 goto exit;
1682 1694
1683 err = 0; 1695 err = 0;
1684 set_mm_exe_file(mm, exe.file); /* this grabs a reference to exe.file */ 1696 set_mm_exe_file(mm, exe.file); /* this grabs a reference to exe.file */
1685exit_unlock:
1686 up_write(&mm->mmap_sem);
1687
1688exit: 1697exit:
1689 fdput(exe); 1698 fdput(exe);
1690 return err; 1699 return err;
1691} 1700}
1692 1701
1702#ifdef CONFIG_CHECKPOINT_RESTORE
1703/*
1704	 * WARNING: we don't require any capability here, so be very careful
1705	 * about what userspace is allowed to modify.
1706 */
1707static int validate_prctl_map(struct prctl_mm_map *prctl_map)
1708{
1709 unsigned long mmap_max_addr = TASK_SIZE;
1710 struct mm_struct *mm = current->mm;
1711 int error = -EINVAL, i;
1712
1713 static const unsigned char offsets[] = {
1714 offsetof(struct prctl_mm_map, start_code),
1715 offsetof(struct prctl_mm_map, end_code),
1716 offsetof(struct prctl_mm_map, start_data),
1717 offsetof(struct prctl_mm_map, end_data),
1718 offsetof(struct prctl_mm_map, start_brk),
1719 offsetof(struct prctl_mm_map, brk),
1720 offsetof(struct prctl_mm_map, start_stack),
1721 offsetof(struct prctl_mm_map, arg_start),
1722 offsetof(struct prctl_mm_map, arg_end),
1723 offsetof(struct prctl_mm_map, env_start),
1724 offsetof(struct prctl_mm_map, env_end),
1725 };
1726
1727 /*
1728	 * Make sure the members do not lie outside
1729	 * of the allowed address space.
1730 */
1731 for (i = 0; i < ARRAY_SIZE(offsets); i++) {
1732 u64 val = *(u64 *)((char *)prctl_map + offsets[i]);
1733
1734 if ((unsigned long)val >= mmap_max_addr ||
1735 (unsigned long)val < mmap_min_addr)
1736 goto out;
1737 }
1738
1739 /*
1740 * Make sure the pairs are ordered.
1741 */
1742#define __prctl_check_order(__m1, __op, __m2) \
1743 ((unsigned long)prctl_map->__m1 __op \
1744 (unsigned long)prctl_map->__m2) ? 0 : -EINVAL
1745 error = __prctl_check_order(start_code, <, end_code);
1746 error |= __prctl_check_order(start_data, <, end_data);
1747 error |= __prctl_check_order(start_brk, <=, brk);
1748 error |= __prctl_check_order(arg_start, <=, arg_end);
1749 error |= __prctl_check_order(env_start, <=, env_end);
1750 if (error)
1751 goto out;
1752#undef __prctl_check_order
1753
1754 error = -EINVAL;
1755
1756 /*
1757 * @brk should be after @end_data in traditional maps.
1758 */
1759 if (prctl_map->start_brk <= prctl_map->end_data ||
1760 prctl_map->brk <= prctl_map->end_data)
1761 goto out;
1762
1763 /*
1764	 * Nor should we allow the limits to be bypassed if they are already set.
1765 */
1766 if (check_data_rlimit(rlimit(RLIMIT_DATA), prctl_map->brk,
1767 prctl_map->start_brk, prctl_map->end_data,
1768 prctl_map->start_data))
1769 goto out;
1770
1771 /*
1772 * Someone is trying to cheat the auxv vector.
1773 */
1774 if (prctl_map->auxv_size) {
1775 if (!prctl_map->auxv || prctl_map->auxv_size > sizeof(mm->saved_auxv))
1776 goto out;
1777 }
1778
1779 /*
1780	 * Finally, make sure the caller has the right to
1781	 * change the /proc/pid/exe link: only local root should
1782	 * be allowed to do so.
1783 */
1784 if (prctl_map->exe_fd != (u32)-1) {
1785 struct user_namespace *ns = current_user_ns();
1786 const struct cred *cred = current_cred();
1787
1788 if (!uid_eq(cred->uid, make_kuid(ns, 0)) ||
1789 !gid_eq(cred->gid, make_kgid(ns, 0)))
1790 goto out;
1791 }
1792
1793 error = 0;
1794out:
1795 return error;
1796}
1797
1798static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data_size)
1799{
1800 struct prctl_mm_map prctl_map = { .exe_fd = (u32)-1, };
1801 unsigned long user_auxv[AT_VECTOR_SIZE];
1802 struct mm_struct *mm = current->mm;
1803 int error;
1804
1805 BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv));
1806 BUILD_BUG_ON(sizeof(struct prctl_mm_map) > 256);
1807
1808 if (opt == PR_SET_MM_MAP_SIZE)
1809 return put_user((unsigned int)sizeof(prctl_map),
1810 (unsigned int __user *)addr);
1811
1812 if (data_size != sizeof(prctl_map))
1813 return -EINVAL;
1814
1815 if (copy_from_user(&prctl_map, addr, sizeof(prctl_map)))
1816 return -EFAULT;
1817
1818 error = validate_prctl_map(&prctl_map);
1819 if (error)
1820 return error;
1821
1822 if (prctl_map.auxv_size) {
1823 memset(user_auxv, 0, sizeof(user_auxv));
1824 if (copy_from_user(user_auxv,
1825 (const void __user *)prctl_map.auxv,
1826 prctl_map.auxv_size))
1827 return -EFAULT;
1828
1829	 /* Last entry must be AT_NULL, as the specification requires */
1830 user_auxv[AT_VECTOR_SIZE - 2] = AT_NULL;
1831 user_auxv[AT_VECTOR_SIZE - 1] = AT_NULL;
1832 }
1833
1834 down_write(&mm->mmap_sem);
1835 if (prctl_map.exe_fd != (u32)-1)
1836 error = prctl_set_mm_exe_file_locked(mm, prctl_map.exe_fd);
1837 downgrade_write(&mm->mmap_sem);
1838 if (error)
1839 goto out;
1840
1841 /*
1842	 * We don't validate whether these members point to
1843	 * real, present VMAs, because the application may already have
1844	 * the corresponding VMAs unmapped; the kernel uses these members
1845	 * mostly for statistics output in procfs, except for
1846	 *
1847	 * - @start_brk/@brk, which are used in do_brk; the kernel looks up
1848	 * VMAs when updating these members, so anything wrong written
1849	 * here makes the kernel complain at the userspace program but
1850	 * won't lead to any problem in the kernel itself
1851 */
1852
1853 mm->start_code = prctl_map.start_code;
1854 mm->end_code = prctl_map.end_code;
1855 mm->start_data = prctl_map.start_data;
1856 mm->end_data = prctl_map.end_data;
1857 mm->start_brk = prctl_map.start_brk;
1858 mm->brk = prctl_map.brk;
1859 mm->start_stack = prctl_map.start_stack;
1860 mm->arg_start = prctl_map.arg_start;
1861 mm->arg_end = prctl_map.arg_end;
1862 mm->env_start = prctl_map.env_start;
1863 mm->env_end = prctl_map.env_end;
1864
1865 /*
1866	 * Note that this update of @saved_auxv is lockless, so
1867	 * if someone reads this member in procfs while we're
1868	 * updating it, they may see partially updated results. This is
1869	 * a known and acceptable trade-off: we leave it as is rather
1870	 * than introduce additional locks here and make the kernel
1871	 * more complex.
1872 */
1873 if (prctl_map.auxv_size)
1874 memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv));
1875
1876 error = 0;
1877out:
1878 up_read(&mm->mmap_sem);
1879 return error;
1880}
1881#endif /* CONFIG_CHECKPOINT_RESTORE */
1882
1693static int prctl_set_mm(int opt, unsigned long addr, 1883static int prctl_set_mm(int opt, unsigned long addr,
1694 unsigned long arg4, unsigned long arg5) 1884 unsigned long arg4, unsigned long arg5)
1695{ 1885{
1696 unsigned long rlim = rlimit(RLIMIT_DATA);
1697 struct mm_struct *mm = current->mm; 1886 struct mm_struct *mm = current->mm;
1698 struct vm_area_struct *vma; 1887 struct vm_area_struct *vma;
1699 int error; 1888 int error;
1700 1889
1701 if (arg5 || (arg4 && opt != PR_SET_MM_AUXV)) 1890 if (arg5 || (arg4 && (opt != PR_SET_MM_AUXV &&
1891 opt != PR_SET_MM_MAP &&
1892 opt != PR_SET_MM_MAP_SIZE)))
1702 return -EINVAL; 1893 return -EINVAL;
1703 1894
1895#ifdef CONFIG_CHECKPOINT_RESTORE
1896 if (opt == PR_SET_MM_MAP || opt == PR_SET_MM_MAP_SIZE)
1897 return prctl_set_mm_map(opt, (const void __user *)addr, arg4);
1898#endif
1899
1704 if (!capable(CAP_SYS_RESOURCE)) 1900 if (!capable(CAP_SYS_RESOURCE))
1705 return -EPERM; 1901 return -EPERM;
1706 1902
1707 if (opt == PR_SET_MM_EXE_FILE) 1903 if (opt == PR_SET_MM_EXE_FILE) {
1708 return prctl_set_mm_exe_file(mm, (unsigned int)addr); 1904 down_write(&mm->mmap_sem);
1905 error = prctl_set_mm_exe_file_locked(mm, (unsigned int)addr);
1906 up_write(&mm->mmap_sem);
1907 return error;
1908 }
1709 1909
1710 if (addr >= TASK_SIZE || addr < mmap_min_addr) 1910 if (addr >= TASK_SIZE || addr < mmap_min_addr)
1711 return -EINVAL; 1911 return -EINVAL;
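
For reference, the new PR_SET_MM_MAP path above is driven from userspace as prctl(PR_SET_MM, PR_SET_MM_MAP, &map, sizeof(map), 0). A hedged sketch of a restore-style caller, assuming a uapi <linux/prctl.h> recent enough to carry struct prctl_mm_map and the PR_SET_MM_MAP* constants, and a kernel built with CONFIG_CHECKPOINT_RESTORE; install_mm_map() and its error handling are illustrative, not taken from any real tool:

	#include <stdio.h>
	#include <sys/prctl.h>
	#include <linux/prctl.h>	/* struct prctl_mm_map, PR_SET_MM_MAP* */

	/* Illustrative restore-side helper: ask the kernel which structure size
	 * it expects, then install the whole map description in one call. */
	static int install_mm_map(struct prctl_mm_map *map)
	{
		unsigned int size;

		if (prctl(PR_SET_MM, PR_SET_MM_MAP_SIZE, &size, 0, 0))
			return -1;
		if (size != sizeof(*map)) {
			fprintf(stderr, "prctl_mm_map: kernel expects %u bytes, built with %zu\n",
				size, sizeof(*map));
			return -1;
		}

		map->exe_fd = (unsigned int)-1;	/* keep the current /proc/self/exe */

		/* Fails with EINVAL unless validate_prctl_map() above is satisfied. */
		return prctl(PR_SET_MM, PR_SET_MM_MAP, map, sizeof(*map), 0);
	}

	int main(void)
	{
		struct prctl_mm_map map = { .exe_fd = (unsigned int)-1 };

		/* An all-zero map is rejected by validate_prctl_map(). */
		if (install_mm_map(&map))
			perror("PR_SET_MM_MAP");
		return 0;
	}
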
@@ -1733,9 +1933,8 @@ static int prctl_set_mm(int opt, unsigned long addr,
1733 if (addr <= mm->end_data) 1933 if (addr <= mm->end_data)
1734 goto out; 1934 goto out;
1735 1935
1736 if (rlim < RLIM_INFINITY && 1936 if (check_data_rlimit(rlimit(RLIMIT_DATA), mm->brk, addr,
1737 (mm->brk - addr) + 1937 mm->end_data, mm->start_data))
1738 (mm->end_data - mm->start_data) > rlim)
1739 goto out; 1938 goto out;
1740 1939
1741 mm->start_brk = addr; 1940 mm->start_brk = addr;
@@ -1745,9 +1944,8 @@ static int prctl_set_mm(int opt, unsigned long addr,
1745 if (addr <= mm->end_data) 1944 if (addr <= mm->end_data)
1746 goto out; 1945 goto out;
1747 1946
1748 if (rlim < RLIM_INFINITY && 1947 if (check_data_rlimit(rlimit(RLIMIT_DATA), addr, mm->start_brk,
1749 (addr - mm->start_brk) + 1948 mm->end_data, mm->start_data))
1750 (mm->end_data - mm->start_data) > rlim)
1751 goto out; 1949 goto out;
1752 1950
1753 mm->brk = addr; 1951 mm->brk = addr;
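
check_data_rlimit() is a small helper, introduced alongside this change in <linux/mm.h>, that packages the RLIMIT_DATA test the removed lines open-coded. A sketch of the check it performs, reconstructed from the removed code rather than from the helper's actual source, so treat the name and return convention as illustrative:

	#include <sys/resource.h>	/* RLIM_INFINITY */

	/* Reconstructed from the open-coded test removed above: reject the
	 * update when the prospective heap span plus the data segment would
	 * exceed the RLIMIT_DATA value 'rlim'. The in-tree helper may differ
	 * in detail. */
	static int data_limit_exceeded(unsigned long rlim,
				       unsigned long new_brk, unsigned long start_brk,
				       unsigned long end_data, unsigned long start_data)
	{
		if (rlim >= RLIM_INFINITY)
			return 0;	/* no limit configured */

		return (new_brk - start_brk) + (end_data - start_data) > rlim;
	}
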
@@ -2011,6 +2209,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2011 me->mm->def_flags &= ~VM_NOHUGEPAGE; 2209 me->mm->def_flags &= ~VM_NOHUGEPAGE;
2012 up_write(&me->mm->mmap_sem); 2210 up_write(&me->mm->mmap_sem);
2013 break; 2211 break;
2212 case PR_MPX_ENABLE_MANAGEMENT:
2213 error = MPX_ENABLE_MANAGEMENT(me);
2214 break;
2215 case PR_MPX_DISABLE_MANAGEMENT:
2216 error = MPX_DISABLE_MANAGEMENT(me);
2217 break;
2014 default: 2218 default:
2015 error = -EINVAL; 2219 error = -EINVAL;
2016 break; 2220 break;
@@ -2023,6 +2227,7 @@ SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep,
2023{ 2227{
2024 int err = 0; 2228 int err = 0;
2025 int cpu = raw_smp_processor_id(); 2229 int cpu = raw_smp_processor_id();
2230
2026 if (cpup) 2231 if (cpup)
2027 err |= put_user(cpu, cpup); 2232 err |= put_user(cpu, cpup);
2028 if (nodep) 2233 if (nodep)
@@ -2135,7 +2340,7 @@ COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info)
2135 /* Check to see if any memory value is too large for 32-bit and scale 2340 /* Check to see if any memory value is too large for 32-bit and scale
2136 * down if needed 2341 * down if needed
2137 */ 2342 */
2138 if ((s.totalram >> 32) || (s.totalswap >> 32)) { 2343 if (upper_32_bits(s.totalram) || upper_32_bits(s.totalswap)) {
2139 int bitcount = 0; 2344 int bitcount = 0;
2140 2345
2141 while (s.mem_unit < PAGE_SIZE) { 2346 while (s.mem_unit < PAGE_SIZE) {
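
The sysinfo hunk just above swaps an open-coded ">> 32" for upper_32_bits(), which makes the intent explicit and stays well defined even if the field is only 32 bits wide (a full-width shift of a 32-bit integer is undefined in C). A small userspace illustration of the macro's shape; the value is arbitrary:

	#include <stdio.h>
	#include <stdint.h>

	/* Shape of the kernel's upper_32_bits(): two 16-bit shifts stay defined
	 * even when 'n' is only 32 bits wide, where a single '>> 32' would not. */
	#define upper_32_bits(n)	((uint32_t)(((n) >> 16) >> 16))

	int main(void)
	{
		uint64_t totalram = 0x00000002fff00000ULL;	/* ~12 GiB, arbitrary */

		if (upper_32_bits(totalram))
			printf("memory counters need scaling for a 32-bit sysinfo\n");
		return 0;
	}
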
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 391d4ddb6f4b..5adcb0ae3a58 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -156,6 +156,9 @@ cond_syscall(sys_process_vm_writev);
156cond_syscall(compat_sys_process_vm_readv); 156cond_syscall(compat_sys_process_vm_readv);
157cond_syscall(compat_sys_process_vm_writev); 157cond_syscall(compat_sys_process_vm_writev);
158cond_syscall(sys_uselib); 158cond_syscall(sys_uselib);
159cond_syscall(sys_fadvise64);
160cond_syscall(sys_fadvise64_64);
161cond_syscall(sys_madvise);
159 162
160/* arch-specific weak syscall entries */ 163/* arch-specific weak syscall entries */
161cond_syscall(sys_pciconfig_read); 164cond_syscall(sys_pciconfig_read);
@@ -166,6 +169,8 @@ cond_syscall(ppc_rtas);
166cond_syscall(sys_spu_run); 169cond_syscall(sys_spu_run);
167cond_syscall(sys_spu_create); 170cond_syscall(sys_spu_create);
168cond_syscall(sys_subpage_prot); 171cond_syscall(sys_subpage_prot);
172cond_syscall(sys_s390_pci_mmio_read);
173cond_syscall(sys_s390_pci_mmio_write);
169 174
170/* mmu depending weak syscall entries */ 175/* mmu depending weak syscall entries */
171cond_syscall(sys_mprotect); 176cond_syscall(sys_mprotect);
@@ -218,3 +223,9 @@ cond_syscall(sys_kcmp);
218 223
219/* operate on Secure Computing state */ 224/* operate on Secure Computing state */
220cond_syscall(sys_seccomp); 225cond_syscall(sys_seccomp);
226
227/* access BPF programs and maps */
228cond_syscall(sys_bpf);
229
230/* execveat */
231cond_syscall(sys_execveat);
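
The cond_syscall() entries added here (sys_bpf, sys_execveat, the s390 PCI mmio calls) rely on weak linkage: when the real implementation is not compiled in, the symbol falls back to sys_ni_syscall() and the syscall returns -ENOSYS. A userspace analogue of that mechanism using the GCC/Clang weak-alias attribute; the kernel's macro is spelled differently and varies by architecture:

	#include <errno.h>
	#include <stdio.h>

	long fallback_niscall(void)
	{
		return -ENOSYS;
	}

	/* Weak alias: any strong definition of optional_syscall() elsewhere in
	 * the link overrides this, and the fallback is never used. */
	long optional_syscall(void) __attribute__((weak, alias("fallback_niscall")));

	int main(void)
	{
		/* Prints -ENOSYS (-38 on Linux) when no strong definition exists. */
		printf("optional_syscall() -> %ld\n", optional_syscall());
		return 0;
	}
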
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 75875a741b5e..137c7f69b264 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -387,7 +387,8 @@ static struct ctl_table kern_table[] = {
387 .data = &sysctl_numa_balancing_scan_size, 387 .data = &sysctl_numa_balancing_scan_size,
388 .maxlen = sizeof(unsigned int), 388 .maxlen = sizeof(unsigned int),
389 .mode = 0644, 389 .mode = 0644,
390 .proc_handler = proc_dointvec, 390 .proc_handler = proc_dointvec_minmax,
391 .extra1 = &one,
391 }, 392 },
392 { 393 {
393 .procname = "numa_balancing", 394 .procname = "numa_balancing",
@@ -622,6 +623,13 @@ static struct ctl_table kern_table[] = {
622 .mode = 0644, 623 .mode = 0644,
623 .proc_handler = proc_dointvec, 624 .proc_handler = proc_dointvec,
624 }, 625 },
626 {
627 .procname = "tracepoint_printk",
628 .data = &tracepoint_printk,
629 .maxlen = sizeof(tracepoint_printk),
630 .mode = 0644,
631 .proc_handler = proc_dointvec,
632 },
625#endif 633#endif
626#ifdef CONFIG_KEXEC 634#ifdef CONFIG_KEXEC
627 { 635 {
@@ -1055,15 +1063,6 @@ static struct ctl_table kern_table[] = {
1055 .child = key_sysctls, 1063 .child = key_sysctls,
1056 }, 1064 },
1057#endif 1065#endif
1058#ifdef CONFIG_RCU_TORTURE_TEST
1059 {
1060 .procname = "rcutorture_runnable",
1061 .data = &rcutorture_runnable,
1062 .maxlen = sizeof(int),
1063 .mode = 0644,
1064 .proc_handler = proc_dointvec,
1065 },
1066#endif
1067#ifdef CONFIG_PERF_EVENTS 1066#ifdef CONFIG_PERF_EVENTS
1068 /* 1067 /*
1069 * User-space scripts rely on the existence of this file 1068 * User-space scripts rely on the existence of this file
@@ -1112,6 +1111,15 @@ static struct ctl_table kern_table[] = {
1112 .proc_handler = proc_dointvec, 1111 .proc_handler = proc_dointvec,
1113 }, 1112 },
1114#endif 1113#endif
1114 {
1115 .procname = "panic_on_warn",
1116 .data = &panic_on_warn,
1117 .maxlen = sizeof(int),
1118 .mode = 0644,
1119 .proc_handler = proc_dointvec_minmax,
1120 .extra1 = &zero,
1121 .extra2 = &one,
1122 },
1115 { } 1123 { }
1116}; 1124};
1117 1125
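
The new panic_on_warn entry is registered with proc_dointvec_minmax and extra1/extra2 bounds of zero and one, so only 0 or 1 are accepted through /proc/sys/kernel/panic_on_warn (the same knob also gains a legacy binary sysctl name in sysctl_binary.c below). A small sketch of toggling it from userspace; the proc path is real, the helper is illustrative:

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	/* Toggle kernel.panic_on_warn; proc_dointvec_minmax rejects anything
	 * outside 0..1. Needs root, since the file is mode 0644. */
	static int set_panic_on_warn(int on)
	{
		int fd = open("/proc/sys/kernel/panic_on_warn", O_WRONLY);
		int ret = -1;

		if (fd < 0)
			return -1;
		if (write(fd, on ? "1\n" : "0\n", 2) == 2)
			ret = 0;
		close(fd);
		return ret;
	}

	int main(void)
	{
		if (set_panic_on_warn(0))
			perror("panic_on_warn");
		return 0;
	}
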
@@ -1460,13 +1468,6 @@ static struct ctl_table vm_table[] = {
1460 .extra2 = &one, 1468 .extra2 = &one,
1461 }, 1469 },
1462#endif 1470#endif
1463 {
1464 .procname = "scan_unevictable_pages",
1465 .data = &scan_unevictable_pages,
1466 .maxlen = sizeof(scan_unevictable_pages),
1467 .mode = 0644,
1468 .proc_handler = scan_unevictable_handler,
1469 },
1470#ifdef CONFIG_MEMORY_FAILURE 1471#ifdef CONFIG_MEMORY_FAILURE
1471 { 1472 {
1472 .procname = "memory_failure_early_kill", 1473 .procname = "memory_failure_early_kill",
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index e4ba9a5a5ccb..7e7746a42a62 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -137,6 +137,7 @@ static const struct bin_table bin_kern_table[] = {
137 { CTL_INT, KERN_COMPAT_LOG, "compat-log" }, 137 { CTL_INT, KERN_COMPAT_LOG, "compat-log" },
138 { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" }, 138 { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" },
139 { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" }, 139 { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" },
140 { CTL_INT, KERN_PANIC_ON_WARN, "panic_on_warn" },
140 {} 141 {}
141}; 142};
142 143
@@ -390,7 +391,6 @@ static const struct bin_table bin_net_ipv4_table[] = {
390 { CTL_INT, NET_TCP_MTU_PROBING, "tcp_mtu_probing" }, 391 { CTL_INT, NET_TCP_MTU_PROBING, "tcp_mtu_probing" },
391 { CTL_INT, NET_TCP_BASE_MSS, "tcp_base_mss" }, 392 { CTL_INT, NET_TCP_BASE_MSS, "tcp_base_mss" },
392 { CTL_INT, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" }, 393 { CTL_INT, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" },
393 { CTL_INT, NET_TCP_DMA_COPYBREAK, "tcp_dma_copybreak" },
394 { CTL_INT, NET_TCP_SLOW_START_AFTER_IDLE, "tcp_slow_start_after_idle" }, 394 { CTL_INT, NET_TCP_SLOW_START_AFTER_IDLE, "tcp_slow_start_after_idle" },
395 { CTL_INT, NET_CIPSOV4_CACHE_ENABLE, "cipso_cache_enable" }, 395 { CTL_INT, NET_CIPSOV4_CACHE_ENABLE, "cipso_cache_enable" },
396 { CTL_INT, NET_CIPSOV4_CACHE_BUCKET_SIZE, "cipso_cache_bucket_size" }, 396 { CTL_INT, NET_CIPSOV4_CACHE_BUCKET_SIZE, "cipso_cache_bucket_size" },
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 13d2f7cd65db..670fff88a961 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -459,7 +459,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
459 stats = nla_data(na); 459 stats = nla_data(na);
460 memset(stats, 0, sizeof(*stats)); 460 memset(stats, 0, sizeof(*stats));
461 461
462 rc = cgroupstats_build(stats, f.file->f_dentry); 462 rc = cgroupstats_build(stats, f.file->f_path.dentry);
463 if (rc < 0) { 463 if (rc < 0) {
464 nlmsg_free(rep_skb); 464 nlmsg_free(rep_skb);
465 goto err; 465 goto err;
@@ -638,7 +638,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
638 fill_tgid_exit(tsk); 638 fill_tgid_exit(tsk);
639 } 639 }
640 640
641 listeners = __this_cpu_ptr(&listener_array); 641 listeners = raw_cpu_ptr(&listener_array);
642 if (list_empty(&listeners->list)) 642 if (list_empty(&listeners->list))
643 return; 643 return;
644 644
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 7347426fa68d..f622cf28628a 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -13,7 +13,7 @@ obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o
13obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o 13obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o
14obj-$(CONFIG_TIMER_STATS) += timer_stats.o 14obj-$(CONFIG_TIMER_STATS) += timer_stats.o
15obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o 15obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o
16obj-$(CONFIG_TEST_UDELAY) += udelay_test.o 16obj-$(CONFIG_TEST_UDELAY) += test_udelay.o
17 17
18$(obj)/time.o: $(obj)/timeconst.h 18$(obj)/time.o: $(obj)/timeconst.h
19 19
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 9c94c19f1305..55449909f114 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -72,7 +72,7 @@ static u64 cev_delta2ns(unsigned long latch, struct clock_event_device *evt,
72 * Also omit the add if it would overflow the u64 boundary. 72 * Also omit the add if it would overflow the u64 boundary.
73 */ 73 */
74 if ((~0ULL - clc > rnd) && 74 if ((~0ULL - clc > rnd) &&
75 (!ismax || evt->mult <= (1U << evt->shift))) 75 (!ismax || evt->mult <= (1ULL << evt->shift)))
76 clc += rnd; 76 clc += rnd;
77 77
78 do_div(clc, evt->mult); 78 do_div(clc, evt->mult);
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 2e949cc9c9f1..b79f39bda7e1 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -792,7 +792,7 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
792 /* Initialize mult/shift and max_idle_ns */ 792 /* Initialize mult/shift and max_idle_ns */
793 __clocksource_updatefreq_scale(cs, scale, freq); 793 __clocksource_updatefreq_scale(cs, scale, freq);
794 794
795 /* Add clocksource to the clcoksource list */ 795 /* Add clocksource to the clocksource list */
796 mutex_lock(&clocksource_mutex); 796 mutex_lock(&clocksource_mutex);
797 clocksource_enqueue(cs); 797 clocksource_enqueue(cs);
798 clocksource_enqueue_watchdog(cs); 798 clocksource_enqueue_watchdog(cs);
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 1c2fe7de2842..37e50aadd471 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -558,7 +558,7 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
558static int hrtimer_reprogram(struct hrtimer *timer, 558static int hrtimer_reprogram(struct hrtimer *timer,
559 struct hrtimer_clock_base *base) 559 struct hrtimer_clock_base *base)
560{ 560{
561 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); 561 struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
562 ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); 562 ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
563 int res; 563 int res;
564 564
@@ -629,7 +629,7 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
629 */ 629 */
630static void retrigger_next_event(void *arg) 630static void retrigger_next_event(void *arg)
631{ 631{
632 struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); 632 struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);
633 633
634 if (!hrtimer_hres_active()) 634 if (!hrtimer_hres_active())
635 return; 635 return;
@@ -903,7 +903,7 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
903 */ 903 */
904 debug_deactivate(timer); 904 debug_deactivate(timer);
905 timer_stats_hrtimer_clear_start_info(timer); 905 timer_stats_hrtimer_clear_start_info(timer);
906 reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases); 906 reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
907 /* 907 /*
908 * We must preserve the CALLBACK state flag here, 908 * We must preserve the CALLBACK state flag here,
909 * otherwise we could move the timer base in 909 * otherwise we could move the timer base in
@@ -963,7 +963,7 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
963 * on dynticks target. 963 * on dynticks target.
964 */ 964 */
965 wake_up_nohz_cpu(new_base->cpu_base->cpu); 965 wake_up_nohz_cpu(new_base->cpu_base->cpu);
966 } else if (new_base->cpu_base == &__get_cpu_var(hrtimer_bases) && 966 } else if (new_base->cpu_base == this_cpu_ptr(&hrtimer_bases) &&
967 hrtimer_reprogram(timer, new_base)) { 967 hrtimer_reprogram(timer, new_base)) {
968 /* 968 /*
969 * Only allow reprogramming if the new base is on this CPU. 969 * Only allow reprogramming if the new base is on this CPU.
@@ -1103,7 +1103,7 @@ EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
1103 */ 1103 */
1104ktime_t hrtimer_get_next_event(void) 1104ktime_t hrtimer_get_next_event(void)
1105{ 1105{
1106 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); 1106 struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
1107 struct hrtimer_clock_base *base = cpu_base->clock_base; 1107 struct hrtimer_clock_base *base = cpu_base->clock_base;
1108 ktime_t delta, mindelta = { .tv64 = KTIME_MAX }; 1108 ktime_t delta, mindelta = { .tv64 = KTIME_MAX };
1109 unsigned long flags; 1109 unsigned long flags;
@@ -1144,7 +1144,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1144 1144
1145 memset(timer, 0, sizeof(struct hrtimer)); 1145 memset(timer, 0, sizeof(struct hrtimer));
1146 1146
1147 cpu_base = &__raw_get_cpu_var(hrtimer_bases); 1147 cpu_base = raw_cpu_ptr(&hrtimer_bases);
1148 1148
1149 if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS) 1149 if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS)
1150 clock_id = CLOCK_MONOTONIC; 1150 clock_id = CLOCK_MONOTONIC;
@@ -1187,7 +1187,7 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
1187 struct hrtimer_cpu_base *cpu_base; 1187 struct hrtimer_cpu_base *cpu_base;
1188 int base = hrtimer_clockid_to_base(which_clock); 1188 int base = hrtimer_clockid_to_base(which_clock);
1189 1189
1190 cpu_base = &__raw_get_cpu_var(hrtimer_bases); 1190 cpu_base = raw_cpu_ptr(&hrtimer_bases);
1191 *tp = ktime_to_timespec(cpu_base->clock_base[base].resolution); 1191 *tp = ktime_to_timespec(cpu_base->clock_base[base].resolution);
1192 1192
1193 return 0; 1193 return 0;
@@ -1242,7 +1242,7 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
1242 */ 1242 */
1243void hrtimer_interrupt(struct clock_event_device *dev) 1243void hrtimer_interrupt(struct clock_event_device *dev)
1244{ 1244{
1245 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); 1245 struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
1246 ktime_t expires_next, now, entry_time, delta; 1246 ktime_t expires_next, now, entry_time, delta;
1247 int i, retries = 0; 1247 int i, retries = 0;
1248 1248
@@ -1376,7 +1376,7 @@ static void __hrtimer_peek_ahead_timers(void)
1376 if (!hrtimer_hres_active()) 1376 if (!hrtimer_hres_active())
1377 return; 1377 return;
1378 1378
1379 td = &__get_cpu_var(tick_cpu_device); 1379 td = this_cpu_ptr(&tick_cpu_device);
1380 if (td && td->evtdev) 1380 if (td && td->evtdev)
1381 hrtimer_interrupt(td->evtdev); 1381 hrtimer_interrupt(td->evtdev);
1382} 1382}
@@ -1440,7 +1440,7 @@ void hrtimer_run_pending(void)
1440void hrtimer_run_queues(void) 1440void hrtimer_run_queues(void)
1441{ 1441{
1442 struct timerqueue_node *node; 1442 struct timerqueue_node *node;
1443 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); 1443 struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
1444 struct hrtimer_clock_base *base; 1444 struct hrtimer_clock_base *base;
1445 int index, gettime = 1; 1445 int index, gettime = 1;
1446 1446
@@ -1679,7 +1679,7 @@ static void migrate_hrtimers(int scpu)
1679 1679
1680 local_irq_disable(); 1680 local_irq_disable();
1681 old_base = &per_cpu(hrtimer_bases, scpu); 1681 old_base = &per_cpu(hrtimer_bases, scpu);
1682 new_base = &__get_cpu_var(hrtimer_bases); 1682 new_base = this_cpu_ptr(&hrtimer_bases);
1683 /* 1683 /*
1684 * The caller is globally serialized and nobody else 1684 * The caller is globally serialized and nobody else
1685 * takes two locks at once, deadlock is not possible. 1685 * takes two locks at once, deadlock is not possible.
@@ -1776,7 +1776,6 @@ schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta,
1776 */ 1776 */
1777 if (!expires) { 1777 if (!expires) {
1778 schedule(); 1778 schedule();
1779 __set_current_state(TASK_RUNNING);
1780 return -EINTR; 1779 return -EINTR;
1781 } 1780 }
1782 1781
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 3b8946416a5f..a16b67859e2a 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -272,22 +272,8 @@ static int posix_cpu_clock_get_task(struct task_struct *tsk,
272 if (same_thread_group(tsk, current)) 272 if (same_thread_group(tsk, current))
273 err = cpu_clock_sample(which_clock, tsk, &rtn); 273 err = cpu_clock_sample(which_clock, tsk, &rtn);
274 } else { 274 } else {
275 unsigned long flags;
276 struct sighand_struct *sighand;
277
278 /*
279 * while_each_thread() is not yet entirely RCU safe,
280 * keep locking the group while sampling process
281 * clock for now.
282 */
283 sighand = lock_task_sighand(tsk, &flags);
284 if (!sighand)
285 return err;
286
287 if (tsk == current || thread_group_leader(tsk)) 275 if (tsk == current || thread_group_leader(tsk))
288 err = cpu_clock_sample_group(which_clock, tsk, &rtn); 276 err = cpu_clock_sample_group(which_clock, tsk, &rtn);
289
290 unlock_task_sighand(tsk, &flags);
291 } 277 }
292 278
293 if (!err) 279 if (!err)
@@ -567,7 +553,7 @@ static int cpu_timer_sample_group(const clockid_t which_clock,
567 *sample = cputime_to_expires(cputime.utime); 553 *sample = cputime_to_expires(cputime.utime);
568 break; 554 break;
569 case CPUCLOCK_SCHED: 555 case CPUCLOCK_SCHED:
570 *sample = cputime.sum_exec_runtime + task_delta_exec(p); 556 *sample = cputime.sum_exec_runtime;
571 break; 557 break;
572 } 558 }
573 return 0; 559 return 0;
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 42b463ad90f2..31ea01f42e1f 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -636,6 +636,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
636 goto out; 636 goto out;
637 } 637 }
638 } else { 638 } else {
639 memset(&event.sigev_value, 0, sizeof(event.sigev_value));
639 event.sigev_notify = SIGEV_SIGNAL; 640 event.sigev_notify = SIGEV_SIGNAL;
640 event.sigev_signo = SIGALRM; 641 event.sigev_signo = SIGALRM;
641 event.sigev_value.sival_int = new_timer->it_id; 642 event.sigev_value.sival_int = new_timer->it_id;
diff --git a/kernel/time/udelay_test.c b/kernel/time/test_udelay.c
index e622ba365a13..e622ba365a13 100644
--- a/kernel/time/udelay_test.c
+++ b/kernel/time/test_udelay.c
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 64c5990fd500..066f0ec05e48 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -554,7 +554,7 @@ int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
554void tick_check_oneshot_broadcast_this_cpu(void) 554void tick_check_oneshot_broadcast_this_cpu(void)
555{ 555{
556 if (cpumask_test_cpu(smp_processor_id(), tick_broadcast_oneshot_mask)) { 556 if (cpumask_test_cpu(smp_processor_id(), tick_broadcast_oneshot_mask)) {
557 struct tick_device *td = &__get_cpu_var(tick_cpu_device); 557 struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
558 558
559 /* 559 /*
560 * We might be in the middle of switching over from 560 * We might be in the middle of switching over from
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 0a0608edeb26..7efeedf53ebd 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -224,7 +224,7 @@ static void tick_setup_device(struct tick_device *td,
224 224
225void tick_install_replacement(struct clock_event_device *newdev) 225void tick_install_replacement(struct clock_event_device *newdev)
226{ 226{
227 struct tick_device *td = &__get_cpu_var(tick_cpu_device); 227 struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
228 int cpu = smp_processor_id(); 228 int cpu = smp_processor_id();
229 229
230 clockevents_exchange_device(td->evtdev, newdev); 230 clockevents_exchange_device(td->evtdev, newdev);
@@ -374,14 +374,14 @@ void tick_shutdown(unsigned int *cpup)
374 374
375void tick_suspend(void) 375void tick_suspend(void)
376{ 376{
377 struct tick_device *td = &__get_cpu_var(tick_cpu_device); 377 struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
378 378
379 clockevents_shutdown(td->evtdev); 379 clockevents_shutdown(td->evtdev);
380} 380}
381 381
382void tick_resume(void) 382void tick_resume(void)
383{ 383{
384 struct tick_device *td = &__get_cpu_var(tick_cpu_device); 384 struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
385 int broadcast = tick_resume_broadcast(); 385 int broadcast = tick_resume_broadcast();
386 386
387 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME); 387 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME);
@@ -400,4 +400,5 @@ void tick_resume(void)
400void __init tick_init(void) 400void __init tick_init(void)
401{ 401{
402 tick_broadcast_init(); 402 tick_broadcast_init();
403 tick_nohz_init();
403} 404}
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index c19c1d84b6f3..366aeb4f2c66 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -99,6 +99,13 @@ static inline int tick_broadcast_oneshot_active(void) { return 0; }
99static inline bool tick_broadcast_oneshot_available(void) { return false; } 99static inline bool tick_broadcast_oneshot_available(void) { return false; }
100#endif /* !TICK_ONESHOT */ 100#endif /* !TICK_ONESHOT */
101 101
102/* NO_HZ_FULL internal */
103#ifdef CONFIG_NO_HZ_FULL
104extern void tick_nohz_init(void);
105# else
106static inline void tick_nohz_init(void) { }
107#endif
108
102/* 109/*
103 * Broadcasting support 110 * Broadcasting support
104 */ 111 */
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 824109060a33..7ce740e78e1b 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -59,7 +59,7 @@ void tick_setup_oneshot(struct clock_event_device *newdev,
59 */ 59 */
60int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)) 60int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))
61{ 61{
62 struct tick_device *td = &__get_cpu_var(tick_cpu_device); 62 struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
63 struct clock_event_device *dev = td->evtdev; 63 struct clock_event_device *dev = td->evtdev;
64 64
65 if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) || 65 if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) ||
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index f654a8a298fa..1363d58f07e9 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -205,7 +205,7 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now);
205 */ 205 */
206void __tick_nohz_full_check(void) 206void __tick_nohz_full_check(void)
207{ 207{
208 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 208 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
209 209
210 if (tick_nohz_full_cpu(smp_processor_id())) { 210 if (tick_nohz_full_cpu(smp_processor_id())) {
211 if (ts->tick_stopped && !is_idle_task(current)) { 211 if (ts->tick_stopped && !is_idle_task(current)) {
@@ -235,7 +235,7 @@ void tick_nohz_full_kick(void)
235 if (!tick_nohz_full_cpu(smp_processor_id())) 235 if (!tick_nohz_full_cpu(smp_processor_id()))
236 return; 236 return;
237 237
238 irq_work_queue(&__get_cpu_var(nohz_full_kick_work)); 238 irq_work_queue(this_cpu_ptr(&nohz_full_kick_work));
239} 239}
240 240
241/* 241/*
@@ -295,22 +295,12 @@ out:
295/* Parse the boot-time nohz CPU list from the kernel parameters. */ 295/* Parse the boot-time nohz CPU list from the kernel parameters. */
296static int __init tick_nohz_full_setup(char *str) 296static int __init tick_nohz_full_setup(char *str)
297{ 297{
298 int cpu;
299
300 alloc_bootmem_cpumask_var(&tick_nohz_full_mask); 298 alloc_bootmem_cpumask_var(&tick_nohz_full_mask);
301 alloc_bootmem_cpumask_var(&housekeeping_mask);
302 if (cpulist_parse(str, tick_nohz_full_mask) < 0) { 299 if (cpulist_parse(str, tick_nohz_full_mask) < 0) {
303 pr_warning("NOHZ: Incorrect nohz_full cpumask\n"); 300 pr_warning("NOHZ: Incorrect nohz_full cpumask\n");
301 free_bootmem_cpumask_var(tick_nohz_full_mask);
304 return 1; 302 return 1;
305 } 303 }
306
307 cpu = smp_processor_id();
308 if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) {
309 pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu);
310 cpumask_clear_cpu(cpu, tick_nohz_full_mask);
311 }
312 cpumask_andnot(housekeeping_mask,
313 cpu_possible_mask, tick_nohz_full_mask);
314 tick_nohz_full_running = true; 304 tick_nohz_full_running = true;
315 305
316 return 1; 306 return 1;
@@ -349,18 +339,11 @@ static int tick_nohz_init_all(void)
349 339
350#ifdef CONFIG_NO_HZ_FULL_ALL 340#ifdef CONFIG_NO_HZ_FULL_ALL
351 if (!alloc_cpumask_var(&tick_nohz_full_mask, GFP_KERNEL)) { 341 if (!alloc_cpumask_var(&tick_nohz_full_mask, GFP_KERNEL)) {
352 pr_err("NO_HZ: Can't allocate full dynticks cpumask\n"); 342 WARN(1, "NO_HZ: Can't allocate full dynticks cpumask\n");
353 return err;
354 }
355 if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) {
356 pr_err("NO_HZ: Can't allocate not-full dynticks cpumask\n");
357 return err; 343 return err;
358 } 344 }
359 err = 0; 345 err = 0;
360 cpumask_setall(tick_nohz_full_mask); 346 cpumask_setall(tick_nohz_full_mask);
361 cpumask_clear_cpu(smp_processor_id(), tick_nohz_full_mask);
362 cpumask_clear(housekeeping_mask);
363 cpumask_set_cpu(smp_processor_id(), housekeeping_mask);
364 tick_nohz_full_running = true; 347 tick_nohz_full_running = true;
365#endif 348#endif
366 return err; 349 return err;
@@ -375,6 +358,37 @@ void __init tick_nohz_init(void)
375 return; 358 return;
376 } 359 }
377 360
361 if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) {
362 WARN(1, "NO_HZ: Can't allocate not-full dynticks cpumask\n");
363 cpumask_clear(tick_nohz_full_mask);
364 tick_nohz_full_running = false;
365 return;
366 }
367
368 /*
369	 * Full dynticks uses irq work to drive the tick rescheduling in safe
370	 * locking contexts. But then we need irq work to raise its own
371	 * interrupts to avoid a circular dependency on the tick.
372 */
373 if (!arch_irq_work_has_interrupt()) {
374 pr_warning("NO_HZ: Can't run full dynticks because arch doesn't "
375 "support irq work self-IPIs\n");
376 cpumask_clear(tick_nohz_full_mask);
377 cpumask_copy(housekeeping_mask, cpu_possible_mask);
378 tick_nohz_full_running = false;
379 return;
380 }
381
382 cpu = smp_processor_id();
383
384 if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) {
385 pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu);
386 cpumask_clear_cpu(cpu, tick_nohz_full_mask);
387 }
388
389 cpumask_andnot(housekeeping_mask,
390 cpu_possible_mask, tick_nohz_full_mask);
391
378 for_each_cpu(cpu, tick_nohz_full_mask) 392 for_each_cpu(cpu, tick_nohz_full_mask)
379 context_tracking_cpu_set(cpu); 393 context_tracking_cpu_set(cpu);
380 394
@@ -559,7 +573,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
559 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; 573 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies;
560 ktime_t last_update, expires, ret = { .tv64 = 0 }; 574 ktime_t last_update, expires, ret = { .tv64 = 0 };
561 unsigned long rcu_delta_jiffies; 575 unsigned long rcu_delta_jiffies;
562 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; 576 struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
563 u64 time_delta; 577 u64 time_delta;
564 578
565 time_delta = timekeeping_max_deferment(); 579 time_delta = timekeeping_max_deferment();
@@ -571,8 +585,8 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
571 last_jiffies = jiffies; 585 last_jiffies = jiffies;
572 } while (read_seqretry(&jiffies_lock, seq)); 586 } while (read_seqretry(&jiffies_lock, seq));
573 587
574 if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || 588 if (rcu_needs_cpu(&rcu_delta_jiffies) ||
575 arch_needs_cpu(cpu) || irq_work_needs_cpu()) { 589 arch_needs_cpu() || irq_work_needs_cpu()) {
576 next_jiffies = last_jiffies + 1; 590 next_jiffies = last_jiffies + 1;
577 delta_jiffies = 1; 591 delta_jiffies = 1;
578 } else { 592 } else {
@@ -827,13 +841,12 @@ void tick_nohz_idle_enter(void)
827 841
828 local_irq_disable(); 842 local_irq_disable();
829 843
830 ts = &__get_cpu_var(tick_cpu_sched); 844 ts = this_cpu_ptr(&tick_cpu_sched);
831 ts->inidle = 1; 845 ts->inidle = 1;
832 __tick_nohz_idle_enter(ts); 846 __tick_nohz_idle_enter(ts);
833 847
834 local_irq_enable(); 848 local_irq_enable();
835} 849}
836EXPORT_SYMBOL_GPL(tick_nohz_idle_enter);
837 850
838/** 851/**
839 * tick_nohz_irq_exit - update next tick event from interrupt exit 852 * tick_nohz_irq_exit - update next tick event from interrupt exit
@@ -845,7 +858,7 @@ EXPORT_SYMBOL_GPL(tick_nohz_idle_enter);
845 */ 858 */
846void tick_nohz_irq_exit(void) 859void tick_nohz_irq_exit(void)
847{ 860{
848 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 861 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
849 862
850 if (ts->inidle) 863 if (ts->inidle)
851 __tick_nohz_idle_enter(ts); 864 __tick_nohz_idle_enter(ts);
@@ -860,7 +873,7 @@ void tick_nohz_irq_exit(void)
860 */ 873 */
861ktime_t tick_nohz_get_sleep_length(void) 874ktime_t tick_nohz_get_sleep_length(void)
862{ 875{
863 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 876 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
864 877
865 return ts->sleep_length; 878 return ts->sleep_length;
866} 879}
@@ -938,7 +951,7 @@ static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
938 */ 951 */
939void tick_nohz_idle_exit(void) 952void tick_nohz_idle_exit(void)
940{ 953{
941 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 954 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
942 ktime_t now; 955 ktime_t now;
943 956
944 local_irq_disable(); 957 local_irq_disable();
@@ -960,7 +973,6 @@ void tick_nohz_idle_exit(void)
960 973
961 local_irq_enable(); 974 local_irq_enable();
962} 975}
963EXPORT_SYMBOL_GPL(tick_nohz_idle_exit);
964 976
965static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now) 977static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now)
966{ 978{
@@ -973,7 +985,7 @@ static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now)
973 */ 985 */
974static void tick_nohz_handler(struct clock_event_device *dev) 986static void tick_nohz_handler(struct clock_event_device *dev)
975{ 987{
976 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 988 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
977 struct pt_regs *regs = get_irq_regs(); 989 struct pt_regs *regs = get_irq_regs();
978 ktime_t now = ktime_get(); 990 ktime_t now = ktime_get();
979 991
@@ -982,6 +994,10 @@ static void tick_nohz_handler(struct clock_event_device *dev)
982 tick_sched_do_timer(now); 994 tick_sched_do_timer(now);
983 tick_sched_handle(ts, regs); 995 tick_sched_handle(ts, regs);
984 996
997 /* No need to reprogram if we are running tickless */
998 if (unlikely(ts->tick_stopped))
999 return;
1000
985 while (tick_nohz_reprogram(ts, now)) { 1001 while (tick_nohz_reprogram(ts, now)) {
986 now = ktime_get(); 1002 now = ktime_get();
987 tick_do_update_jiffies64(now); 1003 tick_do_update_jiffies64(now);
@@ -993,7 +1009,7 @@ static void tick_nohz_handler(struct clock_event_device *dev)
993 */ 1009 */
994static void tick_nohz_switch_to_nohz(void) 1010static void tick_nohz_switch_to_nohz(void)
995{ 1011{
996 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 1012 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
997 ktime_t next; 1013 ktime_t next;
998 1014
999 if (!tick_nohz_enabled) 1015 if (!tick_nohz_enabled)
@@ -1055,7 +1071,7 @@ static void tick_nohz_kick_tick(struct tick_sched *ts, ktime_t now)
1055 1071
1056static inline void tick_nohz_irq_enter(void) 1072static inline void tick_nohz_irq_enter(void)
1057{ 1073{
1058 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 1074 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1059 ktime_t now; 1075 ktime_t now;
1060 1076
1061 if (!ts->idle_active && !ts->tick_stopped) 1077 if (!ts->idle_active && !ts->tick_stopped)
@@ -1109,6 +1125,10 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
1109 if (regs) 1125 if (regs)
1110 tick_sched_handle(ts, regs); 1126 tick_sched_handle(ts, regs);
1111 1127
1128 /* No need to reprogram if we are in idle or full dynticks mode */
1129 if (unlikely(ts->tick_stopped))
1130 return HRTIMER_NORESTART;
1131
1112 hrtimer_forward(timer, now, tick_period); 1132 hrtimer_forward(timer, now, tick_period);
1113 1133
1114 return HRTIMER_RESTART; 1134 return HRTIMER_RESTART;
@@ -1129,7 +1149,7 @@ early_param("skew_tick", skew_tick);
1129 */ 1149 */
1130void tick_setup_sched_timer(void) 1150void tick_setup_sched_timer(void)
1131{ 1151{
1132 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 1152 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1133 ktime_t now = ktime_get(); 1153 ktime_t now = ktime_get();
1134 1154
1135 /* 1155 /*
@@ -1198,7 +1218,7 @@ void tick_clock_notify(void)
1198 */ 1218 */
1199void tick_oneshot_notify(void) 1219void tick_oneshot_notify(void)
1200{ 1220{
1201 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 1221 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1202 1222
1203 set_bit(0, &ts->check_clocks); 1223 set_bit(0, &ts->check_clocks);
1204} 1224}
@@ -1213,7 +1233,7 @@ void tick_oneshot_notify(void)
1213 */ 1233 */
1214int tick_check_oneshot_change(int allow_nohz) 1234int tick_check_oneshot_change(int allow_nohz)
1215{ 1235{
1216 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 1236 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1217 1237
1218 if (!test_and_clear_bit(0, &ts->check_clocks)) 1238 if (!test_and_clear_bit(0, &ts->check_clocks))
1219 return 0; 1239 return 0;
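
Note on the per-CPU accessor change running through the tick-sched.c hunks above: __get_cpu_var() took the address of the local CPU's instance, while this_cpu_ptr() returns that pointer directly, as part of the tree-wide removal of the old accessors. A minimal sketch of the pattern, using a hypothetical per-CPU variable rather than anything from this patch:

    #include <linux/percpu.h>

    struct my_stats {
            unsigned long hits;
    };
    static DEFINE_PER_CPU(struct my_stats, my_stats);

    /* Assumed to run with preemption disabled (e.g. irq context), like the tick code. */
    static void my_stats_bump(void)
    {
            /* old style:  struct my_stats *s = &__get_cpu_var(my_stats); */
            struct my_stats *s = this_cpu_ptr(&my_stats);

            s->hits++;
    }
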
diff --git a/kernel/time/time.c b/kernel/time/time.c
index a9ae20fb0b11..6390517e77d4 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -304,7 +304,9 @@ struct timespec timespec_trunc(struct timespec t, unsigned gran)
304} 304}
305EXPORT_SYMBOL(timespec_trunc); 305EXPORT_SYMBOL(timespec_trunc);
306 306
307/* Converts Gregorian date to seconds since 1970-01-01 00:00:00. 307/*
308 * mktime64 - Converts date to seconds.
309 * Converts Gregorian date to seconds since 1970-01-01 00:00:00.
308 * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 310 * Assumes input in normal date format, i.e. 1980-12-31 23:59:59
309 * => year=1980, mon=12, day=31, hour=23, min=59, sec=59. 311 * => year=1980, mon=12, day=31, hour=23, min=59, sec=59.
310 * 312 *
@@ -314,15 +316,10 @@ EXPORT_SYMBOL(timespec_trunc);
314 * -year/100+year/400 terms, and add 10.] 316 * -year/100+year/400 terms, and add 10.]
315 * 317 *
316 * This algorithm was first published by Gauss (I think). 318 * This algorithm was first published by Gauss (I think).
317 *
318 * WARNING: this function will overflow on 2106-02-07 06:28:16 on
319 * machines where long is 32-bit! (However, as time_t is signed, we
320 * will already get problems at other places on 2038-01-19 03:14:08)
321 */ 319 */
322unsigned long 320time64_t mktime64(const unsigned int year0, const unsigned int mon0,
323mktime(const unsigned int year0, const unsigned int mon0, 321 const unsigned int day, const unsigned int hour,
324 const unsigned int day, const unsigned int hour, 322 const unsigned int min, const unsigned int sec)
325 const unsigned int min, const unsigned int sec)
326{ 323{
327 unsigned int mon = mon0, year = year0; 324 unsigned int mon = mon0, year = year0;
328 325
@@ -332,15 +329,14 @@ mktime(const unsigned int year0, const unsigned int mon0,
332 year -= 1; 329 year -= 1;
333 } 330 }
334 331
335 return ((((unsigned long) 332 return ((((time64_t)
336 (year/4 - year/100 + year/400 + 367*mon/12 + day) + 333 (year/4 - year/100 + year/400 + 367*mon/12 + day) +
337 year*365 - 719499 334 year*365 - 719499
338 )*24 + hour /* now have hours */ 335 )*24 + hour /* now have hours */
339 )*60 + min /* now have minutes */ 336 )*60 + min /* now have minutes */
340 )*60 + sec; /* finally seconds */ 337 )*60 + sec; /* finally seconds */
341} 338}
342 339EXPORT_SYMBOL(mktime64);
343EXPORT_SYMBOL(mktime);
344 340
345/** 341/**
346 * set_normalized_timespec - set timespec sec and nsec parts and normalize 342 * set_normalized_timespec - set timespec sec and nsec parts and normalize
@@ -745,6 +741,7 @@ u64 nsecs_to_jiffies64(u64 n)
745 return div_u64(n * 9, (9ull * NSEC_PER_SEC + HZ / 2) / HZ); 741 return div_u64(n * 9, (9ull * NSEC_PER_SEC + HZ / 2) / HZ);
746#endif 742#endif
747} 743}
744EXPORT_SYMBOL(nsecs_to_jiffies64);
748 745
749/** 746/**
750 * nsecs_to_jiffies - Convert nsecs in u64 to jiffies 747 * nsecs_to_jiffies - Convert nsecs in u64 to jiffies
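
The mktime64() hunk above can drop the 2106 overflow warning because the result is now a 64-bit time64_t; the conversion algorithm itself is unchanged. An illustrative call follows; the date, the expected value, and the header choice are mine, not part of the patch:

    #include <linux/time.h>         /* declares mktime64() as of this series */

    static time64_t example_mktime64(void)
    {
            /* 1980-12-31 23:59:59 => 347155199 seconds since 1970-01-01 */
            return mktime64(1980, 12, 31, 23, 59, 59);
    }
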
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index ec1791fae965..6a931852082f 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -417,7 +417,8 @@ EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier);
417 */ 417 */
418static inline void tk_update_ktime_data(struct timekeeper *tk) 418static inline void tk_update_ktime_data(struct timekeeper *tk)
419{ 419{
420 s64 nsec; 420 u64 seconds;
421 u32 nsec;
421 422
422 /* 423 /*
423 * The xtime based monotonic readout is: 424 * The xtime based monotonic readout is:
@@ -426,13 +427,22 @@ static inline void tk_update_ktime_data(struct timekeeper *tk)
426 * nsec = base_mono + now(); 427 * nsec = base_mono + now();
427 * ==> base_mono = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec 428 * ==> base_mono = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec
428 */ 429 */
429 nsec = (s64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec); 430 seconds = (u64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec);
430 nsec *= NSEC_PER_SEC; 431 nsec = (u32) tk->wall_to_monotonic.tv_nsec;
431 nsec += tk->wall_to_monotonic.tv_nsec; 432 tk->tkr.base_mono = ns_to_ktime(seconds * NSEC_PER_SEC + nsec);
432 tk->tkr.base_mono = ns_to_ktime(nsec);
433 433
434 /* Update the monotonic raw base */ 434 /* Update the monotonic raw base */
435 tk->base_raw = timespec64_to_ktime(tk->raw_time); 435 tk->base_raw = timespec64_to_ktime(tk->raw_time);
436
437 /*
438 * The sum of the nanoseconds portions of xtime and
439 * wall_to_monotonic can be greater than or equal to one second. Take
440 * this into account before updating tk->ktime_sec.
441 */
442 nsec += (u32)(tk->tkr.xtime_nsec >> tk->tkr.shift);
443 if (nsec >= NSEC_PER_SEC)
444 seconds++;
445 tk->ktime_sec = seconds;
436} 446}
437 447
438/* must hold timekeeper_lock */ 448/* must hold timekeeper_lock */
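
For readers of the carry logic added above: the nanosecond parts of xtime and wall_to_monotonic can each be below one second yet sum to a full second or more, so the cached ktime_sec must be rounded up. A worked example with made-up values:

    #include <linux/ktime.h>        /* NSEC_PER_SEC */
    #include <linux/types.h>

    static u64 example_ktime_sec(void)
    {
            u64 seconds = 1000;             /* xtime_sec + wall_to_monotonic.tv_sec */
            u32 nsec = 700000000;           /* wall_to_monotonic.tv_nsec */

            nsec += 400000000;              /* nanoseconds held in xtime_nsec >> shift */
            if (nsec >= NSEC_PER_SEC)       /* 1.1e9 >= 1e9, so carry one second */
                    seconds++;

            return seconds;                 /* 1001, not 1000 */
    }
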
@@ -519,9 +529,9 @@ EXPORT_SYMBOL(__getnstimeofday64);
519 529
520/** 530/**
521 * getnstimeofday64 - Returns the time of day in a timespec64. 531 * getnstimeofday64 - Returns the time of day in a timespec64.
522 * @ts: pointer to the timespec to be set 532 * @ts: pointer to the timespec64 to be set
523 * 533 *
524 * Returns the time of day in a timespec (WARN if suspended). 534 * Returns the time of day in a timespec64 (WARN if suspended).
525 */ 535 */
526void getnstimeofday64(struct timespec64 *ts) 536void getnstimeofday64(struct timespec64 *ts)
527{ 537{
@@ -623,7 +633,7 @@ EXPORT_SYMBOL_GPL(ktime_get_raw);
623 * 633 *
624 * The function calculates the monotonic clock from the realtime 634 * The function calculates the monotonic clock from the realtime
625 * clock and the wall_to_monotonic offset and stores the result 635 * clock and the wall_to_monotonic offset and stores the result
626 * in normalized timespec format in the variable pointed to by @ts. 636 * in normalized timespec64 format in the variable pointed to by @ts.
627 */ 637 */
628void ktime_get_ts64(struct timespec64 *ts) 638void ktime_get_ts64(struct timespec64 *ts)
629{ 639{
@@ -648,6 +658,54 @@ void ktime_get_ts64(struct timespec64 *ts)
648} 658}
649EXPORT_SYMBOL_GPL(ktime_get_ts64); 659EXPORT_SYMBOL_GPL(ktime_get_ts64);
650 660
661/**
662 * ktime_get_seconds - Get the seconds portion of CLOCK_MONOTONIC
663 *
 664 * Returns the seconds portion of CLOCK_MONOTONIC with a single
 665 * non-serialized read. tk->ktime_sec is of type 'unsigned long' so this
666 * works on both 32 and 64 bit systems. On 32 bit systems the readout
667 * covers ~136 years of uptime which should be enough to prevent
668 * premature wrap arounds.
669 */
670time64_t ktime_get_seconds(void)
671{
672 struct timekeeper *tk = &tk_core.timekeeper;
673
674 WARN_ON(timekeeping_suspended);
675 return tk->ktime_sec;
676}
677EXPORT_SYMBOL_GPL(ktime_get_seconds);
678
679/**
680 * ktime_get_real_seconds - Get the seconds portion of CLOCK_REALTIME
681 *
682 * Returns the wall clock seconds since 1970. This replaces the
683 * get_seconds() interface which is not y2038 safe on 32bit systems.
684 *
685 * For 64bit systems the fast access to tk->xtime_sec is preserved. On
686 * 32bit systems the access must be protected with the sequence
687 * counter to provide "atomic" access to the 64bit tk->xtime_sec
688 * value.
689 */
690time64_t ktime_get_real_seconds(void)
691{
692 struct timekeeper *tk = &tk_core.timekeeper;
693 time64_t seconds;
694 unsigned int seq;
695
696 if (IS_ENABLED(CONFIG_64BIT))
697 return tk->xtime_sec;
698
699 do {
700 seq = read_seqcount_begin(&tk_core.seq);
701 seconds = tk->xtime_sec;
702
703 } while (read_seqcount_retry(&tk_core.seq, seq));
704
705 return seconds;
706}
707EXPORT_SYMBOL_GPL(ktime_get_real_seconds);
708
651#ifdef CONFIG_NTP_PPS 709#ifdef CONFIG_NTP_PPS
652 710
653/** 711/**
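
Usage note on the two accessors added above: on 64-bit kernels both are plain loads, while on 32-bit the seqcount loop (or the unsigned long ktime_sec) keeps the 64-bit read consistent against concurrent updates. A hypothetical caller replacing get_seconds():

    #include <linux/timekeeping.h>

    static void my_stamp_event(time64_t *wall, time64_t *uptime)
    {
            *wall = ktime_get_real_seconds();       /* y2038-safe wall-clock seconds */
            *uptime = ktime_get_seconds();          /* CLOCK_MONOTONIC seconds */
    }
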
@@ -703,18 +761,18 @@ void do_gettimeofday(struct timeval *tv)
703EXPORT_SYMBOL(do_gettimeofday); 761EXPORT_SYMBOL(do_gettimeofday);
704 762
705/** 763/**
706 * do_settimeofday - Sets the time of day 764 * do_settimeofday64 - Sets the time of day.
707 * @tv: pointer to the timespec variable containing the new time 765 * @ts: pointer to the timespec64 variable containing the new time
708 * 766 *
709 * Sets the time of day to the new time and update NTP and notify hrtimers 767 * Sets the time of day to the new time and update NTP and notify hrtimers
710 */ 768 */
711int do_settimeofday(const struct timespec *tv) 769int do_settimeofday64(const struct timespec64 *ts)
712{ 770{
713 struct timekeeper *tk = &tk_core.timekeeper; 771 struct timekeeper *tk = &tk_core.timekeeper;
714 struct timespec64 ts_delta, xt, tmp; 772 struct timespec64 ts_delta, xt;
715 unsigned long flags; 773 unsigned long flags;
716 774
717 if (!timespec_valid_strict(tv)) 775 if (!timespec64_valid_strict(ts))
718 return -EINVAL; 776 return -EINVAL;
719 777
720 raw_spin_lock_irqsave(&timekeeper_lock, flags); 778 raw_spin_lock_irqsave(&timekeeper_lock, flags);
@@ -723,13 +781,12 @@ int do_settimeofday(const struct timespec *tv)
723 timekeeping_forward_now(tk); 781 timekeeping_forward_now(tk);
724 782
725 xt = tk_xtime(tk); 783 xt = tk_xtime(tk);
726 ts_delta.tv_sec = tv->tv_sec - xt.tv_sec; 784 ts_delta.tv_sec = ts->tv_sec - xt.tv_sec;
727 ts_delta.tv_nsec = tv->tv_nsec - xt.tv_nsec; 785 ts_delta.tv_nsec = ts->tv_nsec - xt.tv_nsec;
728 786
729 tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts_delta)); 787 tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts_delta));
730 788
731 tmp = timespec_to_timespec64(*tv); 789 tk_set_xtime(tk, ts);
732 tk_set_xtime(tk, &tmp);
733 790
734 timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); 791 timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
735 792
@@ -741,7 +798,7 @@ int do_settimeofday(const struct timespec *tv)
741 798
742 return 0; 799 return 0;
743} 800}
744EXPORT_SYMBOL(do_settimeofday); 801EXPORT_SYMBOL(do_settimeofday64);
745 802
746/** 803/**
747 * timekeeping_inject_offset - Adds or subtracts from the current time. 804 * timekeeping_inject_offset - Adds or subtracts from the current time.
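
With the do_settimeofday64() conversion above, callers hand in a struct timespec64 directly instead of round-tripping through struct timespec. A hypothetical caller sketch (function name and header choice are assumptions):

    #include <linux/timekeeping.h>
    #include <linux/time64.h>

    static int my_set_wall_clock(time64_t sec, long nsec)
    {
            struct timespec64 ts = {
                    .tv_sec  = sec,
                    .tv_nsec = nsec,
            };

            /* returns -EINVAL if ts is not a valid, normalized time */
            return do_settimeofday64(&ts);
    }
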
@@ -895,12 +952,12 @@ int timekeeping_notify(struct clocksource *clock)
895} 952}
896 953
897/** 954/**
898 * getrawmonotonic - Returns the raw monotonic time in a timespec 955 * getrawmonotonic64 - Returns the raw monotonic time in a timespec
899 * @ts: pointer to the timespec to be set 956 * @ts: pointer to the timespec64 to be set
900 * 957 *
901 * Returns the raw monotonic time (completely un-modified by ntp) 958 * Returns the raw monotonic time (completely un-modified by ntp)
902 */ 959 */
903void getrawmonotonic(struct timespec *ts) 960void getrawmonotonic64(struct timespec64 *ts)
904{ 961{
905 struct timekeeper *tk = &tk_core.timekeeper; 962 struct timekeeper *tk = &tk_core.timekeeper;
906 struct timespec64 ts64; 963 struct timespec64 ts64;
@@ -915,9 +972,10 @@ void getrawmonotonic(struct timespec *ts)
915 } while (read_seqcount_retry(&tk_core.seq, seq)); 972 } while (read_seqcount_retry(&tk_core.seq, seq));
916 973
917 timespec64_add_ns(&ts64, nsecs); 974 timespec64_add_ns(&ts64, nsecs);
918 *ts = timespec64_to_timespec(ts64); 975 *ts = ts64;
919} 976}
920EXPORT_SYMBOL(getrawmonotonic); 977EXPORT_SYMBOL(getrawmonotonic64);
978
921 979
922/** 980/**
923 * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres 981 * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres
@@ -1068,8 +1126,8 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
1068} 1126}
1069 1127
1070/** 1128/**
1071 * timekeeping_inject_sleeptime - Adds suspend interval to timekeeping values 1129 * timekeeping_inject_sleeptime64 - Adds suspend interval to timekeeping values
1072 * @delta: pointer to a timespec delta value 1130 * @delta: pointer to a timespec64 delta value
1073 * 1131 *
1074 * This hook is for architectures that cannot support read_persistent_clock 1132 * This hook is for architectures that cannot support read_persistent_clock
1075 * because their RTC/persistent clock is only accessible when irqs are enabled. 1133 * because their RTC/persistent clock is only accessible when irqs are enabled.
@@ -1077,10 +1135,9 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
1077 * This function should only be called by rtc_resume(), and allows 1135 * This function should only be called by rtc_resume(), and allows
1078 * a suspend offset to be injected into the timekeeping values. 1136 * a suspend offset to be injected into the timekeeping values.
1079 */ 1137 */
1080void timekeeping_inject_sleeptime(struct timespec *delta) 1138void timekeeping_inject_sleeptime64(struct timespec64 *delta)
1081{ 1139{
1082 struct timekeeper *tk = &tk_core.timekeeper; 1140 struct timekeeper *tk = &tk_core.timekeeper;
1083 struct timespec64 tmp;
1084 unsigned long flags; 1141 unsigned long flags;
1085 1142
1086 /* 1143 /*
@@ -1095,8 +1152,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
1095 1152
1096 timekeeping_forward_now(tk); 1153 timekeeping_forward_now(tk);
1097 1154
1098 tmp = timespec_to_timespec64(*delta); 1155 __timekeeping_inject_sleeptime(tk, delta);
1099 __timekeeping_inject_sleeptime(tk, &tmp);
1100 1156
1101 timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); 1157 timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
1102 1158
@@ -1332,6 +1388,12 @@ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk,
1332 * 1388 *
1333 * XXX - TODO: Doc ntp_error calculation. 1389 * XXX - TODO: Doc ntp_error calculation.
1334 */ 1390 */
1391 if ((mult_adj > 0) && (tk->tkr.mult + mult_adj < mult_adj)) {
1392 /* NTP adjustment caused clocksource mult overflow */
1393 WARN_ON_ONCE(1);
1394 return;
1395 }
1396
1335 tk->tkr.mult += mult_adj; 1397 tk->tkr.mult += mult_adj;
1336 tk->xtime_interval += interval; 1398 tk->xtime_interval += interval;
1337 tk->tkr.xtime_nsec -= offset; 1399 tk->tkr.xtime_nsec -= offset;
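
The check added above guards the u32 clocksource multiplier against wrapping when a large positive NTP adjustment is applied: in unsigned arithmetic a wrapped sum is smaller than the (positive) addend. The test in isolation, with illustrative names:

    #include <linux/types.h>

    static bool mult_would_overflow(u32 mult, s32 mult_adj)
    {
            /* e.g. 0xffffff00 + 0x200 wraps to 0x100, which is < 0x200 */
            return (mult_adj > 0) && (mult + mult_adj < (u32)mult_adj);
    }
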
@@ -1397,7 +1459,8 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
1397 } 1459 }
1398 1460
1399 if (unlikely(tk->tkr.clock->maxadj && 1461 if (unlikely(tk->tkr.clock->maxadj &&
1400 (tk->tkr.mult > tk->tkr.clock->mult + tk->tkr.clock->maxadj))) { 1462 (abs(tk->tkr.mult - tk->tkr.clock->mult)
1463 > tk->tkr.clock->maxadj))) {
1401 printk_once(KERN_WARNING 1464 printk_once(KERN_WARNING
1402 "Adjusting %s more than 11%% (%ld vs %ld)\n", 1465 "Adjusting %s more than 11%% (%ld vs %ld)\n",
1403 tk->tkr.clock->name, (long)tk->tkr.mult, 1466 tk->tkr.clock->name, (long)tk->tkr.mult,
@@ -1646,7 +1709,7 @@ struct timespec current_kernel_time(void)
1646} 1709}
1647EXPORT_SYMBOL(current_kernel_time); 1710EXPORT_SYMBOL(current_kernel_time);
1648 1711
1649struct timespec get_monotonic_coarse(void) 1712struct timespec64 get_monotonic_coarse64(void)
1650{ 1713{
1651 struct timekeeper *tk = &tk_core.timekeeper; 1714 struct timekeeper *tk = &tk_core.timekeeper;
1652 struct timespec64 now, mono; 1715 struct timespec64 now, mono;
@@ -1662,7 +1725,7 @@ struct timespec get_monotonic_coarse(void)
1662 set_normalized_timespec64(&now, now.tv_sec + mono.tv_sec, 1725 set_normalized_timespec64(&now, now.tv_sec + mono.tv_sec,
1663 now.tv_nsec + mono.tv_nsec); 1726 now.tv_nsec + mono.tv_nsec);
1664 1727
1665 return timespec64_to_timespec(now); 1728 return now;
1666} 1729}
1667 1730
1668/* 1731/*
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index aca5dfe2fa3d..2d3f5c504939 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -655,7 +655,7 @@ static inline void debug_assert_init(struct timer_list *timer)
655static void do_init_timer(struct timer_list *timer, unsigned int flags, 655static void do_init_timer(struct timer_list *timer, unsigned int flags,
656 const char *name, struct lock_class_key *key) 656 const char *name, struct lock_class_key *key)
657{ 657{
658 struct tvec_base *base = __raw_get_cpu_var(tvec_bases); 658 struct tvec_base *base = raw_cpu_read(tvec_bases);
659 659
660 timer->entry.next = NULL; 660 timer->entry.next = NULL;
661 timer->base = (void *)((unsigned long)base | flags); 661 timer->base = (void *)((unsigned long)base | flags);
@@ -1377,15 +1377,14 @@ unsigned long get_next_timer_interrupt(unsigned long now)
1377void update_process_times(int user_tick) 1377void update_process_times(int user_tick)
1378{ 1378{
1379 struct task_struct *p = current; 1379 struct task_struct *p = current;
1380 int cpu = smp_processor_id();
1381 1380
1382 /* Note: this timer irq context must be accounted for as well. */ 1381 /* Note: this timer irq context must be accounted for as well. */
1383 account_process_tick(p, user_tick); 1382 account_process_tick(p, user_tick);
1384 run_local_timers(); 1383 run_local_timers();
1385 rcu_check_callbacks(cpu, user_tick); 1384 rcu_check_callbacks(user_tick);
1386#ifdef CONFIG_IRQ_WORK 1385#ifdef CONFIG_IRQ_WORK
1387 if (in_irq()) 1386 if (in_irq())
1388 irq_work_run(); 1387 irq_work_tick();
1389#endif 1388#endif
1390 scheduler_tick(); 1389 scheduler_tick();
1391 run_posix_cpu_timers(p); 1390 run_posix_cpu_timers(p);
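
Two of the timer.c changes above belong to wider conversions: rcu_check_callbacks() now derives the CPU itself, and per-CPU values are read with raw_cpu_read() instead of taking an address via __raw_get_cpu_var(). A sketch of the latter for a per-CPU pointer, with hypothetical names:

    #include <linux/percpu.h>

    struct my_base {
            int dummy;
    };
    static DEFINE_PER_CPU(struct my_base *, my_bases);

    static struct my_base *my_get_base(void)
    {
            /* the per-CPU slot holds a pointer, so read the value directly */
            return raw_cpu_read(my_bases);
    }
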
diff --git a/kernel/torture.c b/kernel/torture.c
index d600af21f022..dd70993c266c 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -211,18 +211,16 @@ EXPORT_SYMBOL_GPL(torture_onoff_cleanup);
211/* 211/*
212 * Print online/offline testing statistics. 212 * Print online/offline testing statistics.
213 */ 213 */
214char *torture_onoff_stats(char *page) 214void torture_onoff_stats(void)
215{ 215{
216#ifdef CONFIG_HOTPLUG_CPU 216#ifdef CONFIG_HOTPLUG_CPU
217 page += sprintf(page, 217 pr_cont("onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ",
218 "onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ", 218 n_online_successes, n_online_attempts,
219 n_online_successes, n_online_attempts, 219 n_offline_successes, n_offline_attempts,
220 n_offline_successes, n_offline_attempts, 220 min_online, max_online,
221 min_online, max_online, 221 min_offline, max_offline,
222 min_offline, max_offline, 222 sum_online, sum_offline, HZ);
223 sum_online, sum_offline, HZ);
224#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 223#endif /* #ifdef CONFIG_HOTPLUG_CPU */
225 return page;
226} 224}
227EXPORT_SYMBOL_GPL(torture_onoff_stats); 225EXPORT_SYMBOL_GPL(torture_onoff_stats);
228 226
@@ -635,8 +633,13 @@ EXPORT_SYMBOL_GPL(torture_init_end);
635 * 633 *
636 * This must be called before the caller starts shutting down its own 634 * This must be called before the caller starts shutting down its own
637 * kthreads. 635 * kthreads.
636 *
 637 * Both torture_cleanup_begin() and torture_cleanup_end() must be paired,
 638 * in order to perform the cleanup correctly. They are separated because
 639 * other threads may still need to reference torture_type, so it is
 640 * nullified only after all other relevant cleanup calls have completed.
638 */ 641 */
639bool torture_cleanup(void) 642bool torture_cleanup_begin(void)
640{ 643{
641 mutex_lock(&fullstop_mutex); 644 mutex_lock(&fullstop_mutex);
642 if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { 645 if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
@@ -651,12 +654,17 @@ bool torture_cleanup(void)
651 torture_shuffle_cleanup(); 654 torture_shuffle_cleanup();
652 torture_stutter_cleanup(); 655 torture_stutter_cleanup();
653 torture_onoff_cleanup(); 656 torture_onoff_cleanup();
657 return false;
658}
659EXPORT_SYMBOL_GPL(torture_cleanup_begin);
660
661void torture_cleanup_end(void)
662{
654 mutex_lock(&fullstop_mutex); 663 mutex_lock(&fullstop_mutex);
655 torture_type = NULL; 664 torture_type = NULL;
656 mutex_unlock(&fullstop_mutex); 665 mutex_unlock(&fullstop_mutex);
657 return false;
658} 666}
659EXPORT_SYMBOL_GPL(torture_cleanup); 667EXPORT_SYMBOL_GPL(torture_cleanup_end);
660 668
661/* 669/*
662 * Is it time for the current torture test to stop? 670 * Is it time for the current torture test to stop?
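
Caller-side sketch of the split above: a torture module's teardown is expected to bracket its own kthread cleanup with the begin/end pair. The function below is hypothetical, not taken from rcutorture or locktorture:

    #include <linux/torture.h>

    static void my_torture_cleanup(void)
    {
            if (torture_cleanup_begin())
                    return;                 /* a shutdown is already in flight */

            /* stop this module's own kthreads and free its state here */

            torture_cleanup_end();          /* clears torture_type last */
    }
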
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 67d6369ddf83..979ccde26720 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -55,7 +55,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
55obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o 55obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o
56obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o 56obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
57obj-$(CONFIG_TRACEPOINTS) += power-traces.o 57obj-$(CONFIG_TRACEPOINTS) += power-traces.o
58ifeq ($(CONFIG_PM_RUNTIME),y) 58ifeq ($(CONFIG_PM),y)
59obj-$(CONFIG_TRACEPOINTS) += rpm-traces.o 59obj-$(CONFIG_TRACEPOINTS) += rpm-traces.o
60endif 60endif
61ifeq ($(CONFIG_TRACING),y) 61ifeq ($(CONFIG_TRACING),y)
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index c1bd4ada2a04..483cecfa5c17 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1142,9 +1142,9 @@ static void get_pdu_remap(const struct trace_entry *ent,
1142 r->sector_from = be64_to_cpu(sector_from); 1142 r->sector_from = be64_to_cpu(sector_from);
1143} 1143}
1144 1144
1145typedef int (blk_log_action_t) (struct trace_iterator *iter, const char *act); 1145typedef void (blk_log_action_t) (struct trace_iterator *iter, const char *act);
1146 1146
1147static int blk_log_action_classic(struct trace_iterator *iter, const char *act) 1147static void blk_log_action_classic(struct trace_iterator *iter, const char *act)
1148{ 1148{
1149 char rwbs[RWBS_LEN]; 1149 char rwbs[RWBS_LEN];
1150 unsigned long long ts = iter->ts; 1150 unsigned long long ts = iter->ts;
@@ -1154,33 +1154,33 @@ static int blk_log_action_classic(struct trace_iterator *iter, const char *act)
1154 1154
1155 fill_rwbs(rwbs, t); 1155 fill_rwbs(rwbs, t);
1156 1156
1157 return trace_seq_printf(&iter->seq, 1157 trace_seq_printf(&iter->seq,
1158 "%3d,%-3d %2d %5d.%09lu %5u %2s %3s ", 1158 "%3d,%-3d %2d %5d.%09lu %5u %2s %3s ",
1159 MAJOR(t->device), MINOR(t->device), iter->cpu, 1159 MAJOR(t->device), MINOR(t->device), iter->cpu,
1160 secs, nsec_rem, iter->ent->pid, act, rwbs); 1160 secs, nsec_rem, iter->ent->pid, act, rwbs);
1161} 1161}
1162 1162
1163static int blk_log_action(struct trace_iterator *iter, const char *act) 1163static void blk_log_action(struct trace_iterator *iter, const char *act)
1164{ 1164{
1165 char rwbs[RWBS_LEN]; 1165 char rwbs[RWBS_LEN];
1166 const struct blk_io_trace *t = te_blk_io_trace(iter->ent); 1166 const struct blk_io_trace *t = te_blk_io_trace(iter->ent);
1167 1167
1168 fill_rwbs(rwbs, t); 1168 fill_rwbs(rwbs, t);
1169 return trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ", 1169 trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ",
1170 MAJOR(t->device), MINOR(t->device), act, rwbs); 1170 MAJOR(t->device), MINOR(t->device), act, rwbs);
1171} 1171}
1172 1172
1173static int blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent) 1173static void blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent)
1174{ 1174{
1175 const unsigned char *pdu_buf; 1175 const unsigned char *pdu_buf;
1176 int pdu_len; 1176 int pdu_len;
1177 int i, end, ret; 1177 int i, end;
1178 1178
1179 pdu_buf = pdu_start(ent); 1179 pdu_buf = pdu_start(ent);
1180 pdu_len = te_blk_io_trace(ent)->pdu_len; 1180 pdu_len = te_blk_io_trace(ent)->pdu_len;
1181 1181
1182 if (!pdu_len) 1182 if (!pdu_len)
1183 return 1; 1183 return;
1184 1184
1185 /* find the last zero that needs to be printed */ 1185 /* find the last zero that needs to be printed */
1186 for (end = pdu_len - 1; end >= 0; end--) 1186 for (end = pdu_len - 1; end >= 0; end--)
@@ -1188,119 +1188,107 @@ static int blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent)
1188 break; 1188 break;
1189 end++; 1189 end++;
1190 1190
1191 if (!trace_seq_putc(s, '(')) 1191 trace_seq_putc(s, '(');
1192 return 0;
1193 1192
1194 for (i = 0; i < pdu_len; i++) { 1193 for (i = 0; i < pdu_len; i++) {
1195 1194
1196 ret = trace_seq_printf(s, "%s%02x", 1195 trace_seq_printf(s, "%s%02x",
1197 i == 0 ? "" : " ", pdu_buf[i]); 1196 i == 0 ? "" : " ", pdu_buf[i]);
1198 if (!ret)
1199 return ret;
1200 1197
1201 /* 1198 /*
1202 * stop when the rest is just zeroes and indicate so 1199 * stop when the rest is just zeroes and indicate so
1203 * with a ".." appended 1200 * with a ".." appended
1204 */ 1201 */
1205 if (i == end && end != pdu_len - 1) 1202 if (i == end && end != pdu_len - 1) {
1206 return trace_seq_puts(s, " ..) "); 1203 trace_seq_puts(s, " ..) ");
1204 return;
1205 }
1207 } 1206 }
1208 1207
1209 return trace_seq_puts(s, ") "); 1208 trace_seq_puts(s, ") ");
1210} 1209}
1211 1210
1212static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) 1211static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent)
1213{ 1212{
1214 char cmd[TASK_COMM_LEN]; 1213 char cmd[TASK_COMM_LEN];
1215 1214
1216 trace_find_cmdline(ent->pid, cmd); 1215 trace_find_cmdline(ent->pid, cmd);
1217 1216
1218 if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { 1217 if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
1219 int ret; 1218 trace_seq_printf(s, "%u ", t_bytes(ent));
1220 1219 blk_log_dump_pdu(s, ent);
1221 ret = trace_seq_printf(s, "%u ", t_bytes(ent)); 1220 trace_seq_printf(s, "[%s]\n", cmd);
1222 if (!ret)
1223 return 0;
1224 ret = blk_log_dump_pdu(s, ent);
1225 if (!ret)
1226 return 0;
1227 return trace_seq_printf(s, "[%s]\n", cmd);
1228 } else { 1221 } else {
1229 if (t_sec(ent)) 1222 if (t_sec(ent))
1230 return trace_seq_printf(s, "%llu + %u [%s]\n", 1223 trace_seq_printf(s, "%llu + %u [%s]\n",
1231 t_sector(ent), t_sec(ent), cmd); 1224 t_sector(ent), t_sec(ent), cmd);
1232 return trace_seq_printf(s, "[%s]\n", cmd); 1225 else
1226 trace_seq_printf(s, "[%s]\n", cmd);
1233 } 1227 }
1234} 1228}
1235 1229
1236static int blk_log_with_error(struct trace_seq *s, 1230static void blk_log_with_error(struct trace_seq *s,
1237 const struct trace_entry *ent) 1231 const struct trace_entry *ent)
1238{ 1232{
1239 if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { 1233 if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
1240 int ret; 1234 blk_log_dump_pdu(s, ent);
1241 1235 trace_seq_printf(s, "[%d]\n", t_error(ent));
1242 ret = blk_log_dump_pdu(s, ent);
1243 if (ret)
1244 return trace_seq_printf(s, "[%d]\n", t_error(ent));
1245 return 0;
1246 } else { 1236 } else {
1247 if (t_sec(ent)) 1237 if (t_sec(ent))
1248 return trace_seq_printf(s, "%llu + %u [%d]\n", 1238 trace_seq_printf(s, "%llu + %u [%d]\n",
1249 t_sector(ent), 1239 t_sector(ent),
1250 t_sec(ent), t_error(ent)); 1240 t_sec(ent), t_error(ent));
1251 return trace_seq_printf(s, "%llu [%d]\n", 1241 else
1252 t_sector(ent), t_error(ent)); 1242 trace_seq_printf(s, "%llu [%d]\n",
1243 t_sector(ent), t_error(ent));
1253 } 1244 }
1254} 1245}
1255 1246
1256static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent) 1247static void blk_log_remap(struct trace_seq *s, const struct trace_entry *ent)
1257{ 1248{
1258 struct blk_io_trace_remap r = { .device_from = 0, }; 1249 struct blk_io_trace_remap r = { .device_from = 0, };
1259 1250
1260 get_pdu_remap(ent, &r); 1251 get_pdu_remap(ent, &r);
1261 return trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n", 1252 trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n",
1262 t_sector(ent), t_sec(ent), 1253 t_sector(ent), t_sec(ent),
1263 MAJOR(r.device_from), MINOR(r.device_from), 1254 MAJOR(r.device_from), MINOR(r.device_from),
1264 (unsigned long long)r.sector_from); 1255 (unsigned long long)r.sector_from);
1265} 1256}
1266 1257
1267static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent) 1258static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent)
1268{ 1259{
1269 char cmd[TASK_COMM_LEN]; 1260 char cmd[TASK_COMM_LEN];
1270 1261
1271 trace_find_cmdline(ent->pid, cmd); 1262 trace_find_cmdline(ent->pid, cmd);
1272 1263
1273 return trace_seq_printf(s, "[%s]\n", cmd); 1264 trace_seq_printf(s, "[%s]\n", cmd);
1274} 1265}
1275 1266
1276static int blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent) 1267static void blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent)
1277{ 1268{
1278 char cmd[TASK_COMM_LEN]; 1269 char cmd[TASK_COMM_LEN];
1279 1270
1280 trace_find_cmdline(ent->pid, cmd); 1271 trace_find_cmdline(ent->pid, cmd);
1281 1272
1282 return trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent)); 1273 trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent));
1283} 1274}
1284 1275
1285static int blk_log_split(struct trace_seq *s, const struct trace_entry *ent) 1276static void blk_log_split(struct trace_seq *s, const struct trace_entry *ent)
1286{ 1277{
1287 char cmd[TASK_COMM_LEN]; 1278 char cmd[TASK_COMM_LEN];
1288 1279
1289 trace_find_cmdline(ent->pid, cmd); 1280 trace_find_cmdline(ent->pid, cmd);
1290 1281
1291 return trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent), 1282 trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent),
1292 get_pdu_int(ent), cmd); 1283 get_pdu_int(ent), cmd);
1293} 1284}
1294 1285
1295static int blk_log_msg(struct trace_seq *s, const struct trace_entry *ent) 1286static void blk_log_msg(struct trace_seq *s, const struct trace_entry *ent)
1296{ 1287{
1297 int ret;
1298 const struct blk_io_trace *t = te_blk_io_trace(ent); 1288 const struct blk_io_trace *t = te_blk_io_trace(ent);
1299 1289
1300 ret = trace_seq_putmem(s, t + 1, t->pdu_len); 1290 trace_seq_putmem(s, t + 1, t->pdu_len);
1301 if (ret) 1291 trace_seq_putc(s, '\n');
1302 return trace_seq_putc(s, '\n');
1303 return ret;
1304} 1292}
1305 1293
1306/* 1294/*
@@ -1339,7 +1327,7 @@ static void blk_tracer_reset(struct trace_array *tr)
1339 1327
1340static const struct { 1328static const struct {
1341 const char *act[2]; 1329 const char *act[2];
1342 int (*print)(struct trace_seq *s, const struct trace_entry *ent); 1330 void (*print)(struct trace_seq *s, const struct trace_entry *ent);
1343} what2act[] = { 1331} what2act[] = {
1344 [__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic }, 1332 [__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic },
1345 [__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic }, 1333 [__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic },
@@ -1364,7 +1352,6 @@ static enum print_line_t print_one_line(struct trace_iterator *iter,
1364 struct trace_seq *s = &iter->seq; 1352 struct trace_seq *s = &iter->seq;
1365 const struct blk_io_trace *t; 1353 const struct blk_io_trace *t;
1366 u16 what; 1354 u16 what;
1367 int ret;
1368 bool long_act; 1355 bool long_act;
1369 blk_log_action_t *log_action; 1356 blk_log_action_t *log_action;
1370 1357
@@ -1374,21 +1361,18 @@ static enum print_line_t print_one_line(struct trace_iterator *iter,
1374 log_action = classic ? &blk_log_action_classic : &blk_log_action; 1361 log_action = classic ? &blk_log_action_classic : &blk_log_action;
1375 1362
1376 if (t->action == BLK_TN_MESSAGE) { 1363 if (t->action == BLK_TN_MESSAGE) {
1377 ret = log_action(iter, long_act ? "message" : "m"); 1364 log_action(iter, long_act ? "message" : "m");
1378 if (ret) 1365 blk_log_msg(s, iter->ent);
1379 ret = blk_log_msg(s, iter->ent);
1380 goto out;
1381 } 1366 }
1382 1367
1383 if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act))) 1368 if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act)))
1384 ret = trace_seq_printf(s, "Unknown action %x\n", what); 1369 trace_seq_printf(s, "Unknown action %x\n", what);
1385 else { 1370 else {
1386 ret = log_action(iter, what2act[what].act[long_act]); 1371 log_action(iter, what2act[what].act[long_act]);
1387 if (ret) 1372 what2act[what].print(s, iter->ent);
1388 ret = what2act[what].print(s, iter->ent);
1389 } 1373 }
1390out: 1374
1391 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; 1375 return trace_handle_return(s);
1392} 1376}
1393 1377
1394static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, 1378static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
@@ -1397,7 +1381,7 @@ static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
1397 return print_one_line(iter, false); 1381 return print_one_line(iter, false);
1398} 1382}
1399 1383
1400static int blk_trace_synthesize_old_trace(struct trace_iterator *iter) 1384static void blk_trace_synthesize_old_trace(struct trace_iterator *iter)
1401{ 1385{
1402 struct trace_seq *s = &iter->seq; 1386 struct trace_seq *s = &iter->seq;
1403 struct blk_io_trace *t = (struct blk_io_trace *)iter->ent; 1387 struct blk_io_trace *t = (struct blk_io_trace *)iter->ent;
@@ -1407,18 +1391,18 @@ static int blk_trace_synthesize_old_trace(struct trace_iterator *iter)
1407 .time = iter->ts, 1391 .time = iter->ts,
1408 }; 1392 };
1409 1393
1410 if (!trace_seq_putmem(s, &old, offset)) 1394 trace_seq_putmem(s, &old, offset);
1411 return 0; 1395 trace_seq_putmem(s, &t->sector,
1412 return trace_seq_putmem(s, &t->sector, 1396 sizeof(old) - offset + t->pdu_len);
1413 sizeof(old) - offset + t->pdu_len);
1414} 1397}
1415 1398
1416static enum print_line_t 1399static enum print_line_t
1417blk_trace_event_print_binary(struct trace_iterator *iter, int flags, 1400blk_trace_event_print_binary(struct trace_iterator *iter, int flags,
1418 struct trace_event *event) 1401 struct trace_event *event)
1419{ 1402{
1420 return blk_trace_synthesize_old_trace(iter) ? 1403 blk_trace_synthesize_old_trace(iter);
1421 TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; 1404
1405 return trace_handle_return(&iter->seq);
1422} 1406}
1423 1407
1424static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter) 1408static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter)
@@ -1493,9 +1477,6 @@ static int blk_trace_remove_queue(struct request_queue *q)
1493 if (atomic_dec_and_test(&blk_probes_ref)) 1477 if (atomic_dec_and_test(&blk_probes_ref))
1494 blk_unregister_tracepoints(); 1478 blk_unregister_tracepoints();
1495 1479
1496 spin_lock_irq(&running_trace_lock);
1497 list_del(&bt->running_list);
1498 spin_unlock_irq(&running_trace_lock);
1499 blk_trace_free(bt); 1480 blk_trace_free(bt);
1500 return 0; 1481 return 0;
1501} 1482}
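
The blktrace conversion above follows the trace_seq rework in this series: the trace_seq_*() helpers no longer return a length or failure code, and overflow is checked once at the end via trace_handle_return(). A minimal sketch of the new calling convention; the event fields and the header placement of trace_handle_return() are my assumptions:

    #include <linux/ftrace_event.h>         /* enum print_line_t, trace_handle_return() */
    #include <linux/trace_seq.h>

    static enum print_line_t my_print_event(struct trace_seq *s, u64 sector, int error)
    {
            trace_seq_printf(s, "%llu [%d]\n",
                             (unsigned long long)sector, error);

            /* TRACE_TYPE_HANDLED unless the seq buffer overflowed */
            return trace_handle_return(s);
    }
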
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 5916a8e59e87..929a733d302e 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -113,6 +113,9 @@ ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
113static struct ftrace_ops global_ops; 113static struct ftrace_ops global_ops;
114static struct ftrace_ops control_ops; 114static struct ftrace_ops control_ops;
115 115
116static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip,
117 struct ftrace_ops *op, struct pt_regs *regs);
118
116#if ARCH_SUPPORTS_FTRACE_OPS 119#if ARCH_SUPPORTS_FTRACE_OPS
117static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, 120static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
118 struct ftrace_ops *op, struct pt_regs *regs); 121 struct ftrace_ops *op, struct pt_regs *regs);
@@ -251,18 +254,24 @@ static void update_ftrace_function(void)
251 ftrace_func_t func; 254 ftrace_func_t func;
252 255
253 /* 256 /*
257 * Prepare the ftrace_ops that the arch callback will use.
258 * If there's only one ftrace_ops registered, the ftrace_ops_list
259 * will point to the ops we want.
260 */
261 set_function_trace_op = ftrace_ops_list;
262
263 /* If there's no ftrace_ops registered, just call the stub function */
264 if (ftrace_ops_list == &ftrace_list_end) {
265 func = ftrace_stub;
266
267 /*
254 * If we are at the end of the list and this ops is 268 * If we are at the end of the list and this ops is
255 * recursion safe and not dynamic and the arch supports passing ops, 269 * recursion safe and not dynamic and the arch supports passing ops,
256 * then have the mcount trampoline call the function directly. 270 * then have the mcount trampoline call the function directly.
257 */ 271 */
258 if (ftrace_ops_list == &ftrace_list_end || 272 } else if (ftrace_ops_list->next == &ftrace_list_end) {
259 (ftrace_ops_list->next == &ftrace_list_end && 273 func = ftrace_ops_get_func(ftrace_ops_list);
260 !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC) && 274
261 (ftrace_ops_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) &&
262 !FTRACE_FORCE_LIST_FUNC)) {
263 /* Set the ftrace_ops that the arch callback uses */
264 set_function_trace_op = ftrace_ops_list;
265 func = ftrace_ops_list->func;
266 } else { 275 } else {
267 /* Just use the default ftrace_ops */ 276 /* Just use the default ftrace_ops */
268 set_function_trace_op = &ftrace_list_end; 277 set_function_trace_op = &ftrace_list_end;
@@ -378,6 +387,8 @@ static int remove_ftrace_list_ops(struct ftrace_ops **list,
378 return ret; 387 return ret;
379} 388}
380 389
390static void ftrace_update_trampoline(struct ftrace_ops *ops);
391
381static int __register_ftrace_function(struct ftrace_ops *ops) 392static int __register_ftrace_function(struct ftrace_ops *ops)
382{ 393{
383 if (ops->flags & FTRACE_OPS_FL_DELETED) 394 if (ops->flags & FTRACE_OPS_FL_DELETED)
@@ -407,9 +418,13 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
407 if (control_ops_alloc(ops)) 418 if (control_ops_alloc(ops))
408 return -ENOMEM; 419 return -ENOMEM;
409 add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops); 420 add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops);
421 /* The control_ops needs the trampoline update */
422 ops = &control_ops;
410 } else 423 } else
411 add_ftrace_ops(&ftrace_ops_list, ops); 424 add_ftrace_ops(&ftrace_ops_list, ops);
412 425
426 ftrace_update_trampoline(ops);
427
413 if (ftrace_enabled) 428 if (ftrace_enabled)
414 update_ftrace_function(); 429 update_ftrace_function();
415 430
@@ -556,13 +571,13 @@ static int function_stat_cmp(void *p1, void *p2)
556static int function_stat_headers(struct seq_file *m) 571static int function_stat_headers(struct seq_file *m)
557{ 572{
558#ifdef CONFIG_FUNCTION_GRAPH_TRACER 573#ifdef CONFIG_FUNCTION_GRAPH_TRACER
559 seq_printf(m, " Function " 574 seq_puts(m, " Function "
560 "Hit Time Avg s^2\n" 575 "Hit Time Avg s^2\n"
561 " -------- " 576 " -------- "
562 "--- ---- --- ---\n"); 577 "--- ---- --- ---\n");
563#else 578#else
564 seq_printf(m, " Function Hit\n" 579 seq_puts(m, " Function Hit\n"
565 " -------- ---\n"); 580 " -------- ---\n");
566#endif 581#endif
567 return 0; 582 return 0;
568} 583}
@@ -589,7 +604,7 @@ static int function_stat_show(struct seq_file *m, void *v)
589 seq_printf(m, " %-30.30s %10lu", str, rec->counter); 604 seq_printf(m, " %-30.30s %10lu", str, rec->counter);
590 605
591#ifdef CONFIG_FUNCTION_GRAPH_TRACER 606#ifdef CONFIG_FUNCTION_GRAPH_TRACER
592 seq_printf(m, " "); 607 seq_puts(m, " ");
593 avg = rec->time; 608 avg = rec->time;
594 do_div(avg, rec->counter); 609 do_div(avg, rec->counter);
595 610
@@ -1048,6 +1063,12 @@ static struct pid * const ftrace_swapper_pid = &init_struct_pid;
1048 1063
1049static struct ftrace_ops *removed_ops; 1064static struct ftrace_ops *removed_ops;
1050 1065
1066/*
1067 * Set when doing a global update, like enabling all recs or disabling them.
1068 * It is not set when just updating a single ftrace_ops.
1069 */
1070static bool update_all_ops;
1071
1051#ifndef CONFIG_FTRACE_MCOUNT_RECORD 1072#ifndef CONFIG_FTRACE_MCOUNT_RECORD
1052# error Dynamic ftrace depends on MCOUNT_RECORD 1073# error Dynamic ftrace depends on MCOUNT_RECORD
1053#endif 1074#endif
@@ -1096,6 +1117,43 @@ static struct ftrace_ops global_ops = {
1096 FTRACE_OPS_FL_INITIALIZED, 1117 FTRACE_OPS_FL_INITIALIZED,
1097}; 1118};
1098 1119
1120/*
1121 * This is used by __kernel_text_address() to return true if the
1122 * address is on a dynamically allocated trampoline that would
1123 * not return true for either core_kernel_text() or
1124 * is_module_text_address().
1125 */
1126bool is_ftrace_trampoline(unsigned long addr)
1127{
1128 struct ftrace_ops *op;
1129 bool ret = false;
1130
1131 /*
1132 * Some of the ops may be dynamically allocated,
1133 * they are freed after a synchronize_sched().
1134 */
1135 preempt_disable_notrace();
1136
1137 do_for_each_ftrace_op(op, ftrace_ops_list) {
1138 /*
1139 * This is to check for dynamically allocated trampolines.
1140 * Trampolines that are in kernel text will have
1141 * core_kernel_text() return true.
1142 */
1143 if (op->trampoline && op->trampoline_size)
1144 if (addr >= op->trampoline &&
1145 addr < op->trampoline + op->trampoline_size) {
1146 ret = true;
1147 goto out;
1148 }
1149 } while_for_each_ftrace_op(op);
1150
1151 out:
1152 preempt_enable_notrace();
1153
1154 return ret;
1155}
1156
1099struct ftrace_page { 1157struct ftrace_page {
1100 struct ftrace_page *next; 1158 struct ftrace_page *next;
1101 struct dyn_ftrace *records; 1159 struct dyn_ftrace *records;
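
The helper added above lets address-validity checks accept dynamically allocated trampolines. A sketch of the intended caller pattern, modelled on __kernel_text_address(); the wrapper name is mine:

    #include <linux/ftrace.h>
    #include <linux/kernel.h>
    #include <linux/module.h>

    static bool my_text_address(unsigned long addr)
    {
            if (core_kernel_text(addr))
                    return true;
            if (is_module_text_address(addr))
                    return true;
            /* generated ftrace trampolines live outside both of the above */
            return is_ftrace_trampoline(addr);
    }
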
@@ -1300,6 +1358,9 @@ ftrace_hash_rec_disable_modify(struct ftrace_ops *ops, int filter_hash);
1300static void 1358static void
1301ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, int filter_hash); 1359ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, int filter_hash);
1302 1360
1361static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops,
1362 struct ftrace_hash *new_hash);
1363
1303static int 1364static int
1304ftrace_hash_move(struct ftrace_ops *ops, int enable, 1365ftrace_hash_move(struct ftrace_ops *ops, int enable,
1305 struct ftrace_hash **dst, struct ftrace_hash *src) 1366 struct ftrace_hash **dst, struct ftrace_hash *src)
@@ -1307,12 +1368,16 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
1307 struct ftrace_func_entry *entry; 1368 struct ftrace_func_entry *entry;
1308 struct hlist_node *tn; 1369 struct hlist_node *tn;
1309 struct hlist_head *hhd; 1370 struct hlist_head *hhd;
1310 struct ftrace_hash *old_hash;
1311 struct ftrace_hash *new_hash; 1371 struct ftrace_hash *new_hash;
1312 int size = src->count; 1372 int size = src->count;
1313 int bits = 0; 1373 int bits = 0;
1374 int ret;
1314 int i; 1375 int i;
1315 1376
1377 /* Reject setting notrace hash on IPMODIFY ftrace_ops */
1378 if (ops->flags & FTRACE_OPS_FL_IPMODIFY && !enable)
1379 return -EINVAL;
1380
1316 /* 1381 /*
1317 * If the new source is empty, just free dst and assign it 1382 * If the new source is empty, just free dst and assign it
1318 * the empty_hash. 1383 * the empty_hash.
@@ -1346,21 +1411,44 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
1346 } 1411 }
1347 1412
1348update: 1413update:
1414 /* Make sure this can be applied if it is IPMODIFY ftrace_ops */
1415 if (enable) {
1416 /* IPMODIFY should be updated only when filter_hash updating */
1417 ret = ftrace_hash_ipmodify_update(ops, new_hash);
1418 if (ret < 0) {
1419 free_ftrace_hash(new_hash);
1420 return ret;
1421 }
1422 }
1423
1349 /* 1424 /*
1350 * Remove the current set, update the hash and add 1425 * Remove the current set, update the hash and add
1351 * them back. 1426 * them back.
1352 */ 1427 */
1353 ftrace_hash_rec_disable_modify(ops, enable); 1428 ftrace_hash_rec_disable_modify(ops, enable);
1354 1429
1355 old_hash = *dst;
1356 rcu_assign_pointer(*dst, new_hash); 1430 rcu_assign_pointer(*dst, new_hash);
1357 free_ftrace_hash_rcu(old_hash);
1358 1431
1359 ftrace_hash_rec_enable_modify(ops, enable); 1432 ftrace_hash_rec_enable_modify(ops, enable);
1360 1433
1361 return 0; 1434 return 0;
1362} 1435}
1363 1436
1437static bool hash_contains_ip(unsigned long ip,
1438 struct ftrace_ops_hash *hash)
1439{
1440 /*
1441 * The function record is a match if it exists in the filter
1442 * hash and not in the notrace hash. Note, an empty hash is
1443 * considered a match for the filter hash, but an empty
1444 * notrace hash is considered not in the notrace hash.
1445 */
1446 return (ftrace_hash_empty(hash->filter_hash) ||
1447 ftrace_lookup_ip(hash->filter_hash, ip)) &&
1448 (ftrace_hash_empty(hash->notrace_hash) ||
1449 !ftrace_lookup_ip(hash->notrace_hash, ip));
1450}
1451
1364/* 1452/*
1365 * Test the hashes for this ops to see if we want to call 1453 * Test the hashes for this ops to see if we want to call
1366 * the ops->func or not. 1454 * the ops->func or not.
@@ -1376,8 +1464,7 @@ update:
1376static int 1464static int
1377ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) 1465ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs)
1378{ 1466{
1379 struct ftrace_hash *filter_hash; 1467 struct ftrace_ops_hash hash;
1380 struct ftrace_hash *notrace_hash;
1381 int ret; 1468 int ret;
1382 1469
1383#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS 1470#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
@@ -1390,13 +1477,10 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs)
1390 return 0; 1477 return 0;
1391#endif 1478#endif
1392 1479
1393 filter_hash = rcu_dereference_raw_notrace(ops->func_hash->filter_hash); 1480 hash.filter_hash = rcu_dereference_raw_notrace(ops->func_hash->filter_hash);
1394 notrace_hash = rcu_dereference_raw_notrace(ops->func_hash->notrace_hash); 1481 hash.notrace_hash = rcu_dereference_raw_notrace(ops->func_hash->notrace_hash);
1395 1482
1396 if ((ftrace_hash_empty(filter_hash) || 1483 if (hash_contains_ip(ip, &hash))
1397 ftrace_lookup_ip(filter_hash, ip)) &&
1398 (ftrace_hash_empty(notrace_hash) ||
1399 !ftrace_lookup_ip(notrace_hash, ip)))
1400 ret = 1; 1484 ret = 1;
1401 else 1485 else
1402 ret = 0; 1486 ret = 0;
@@ -1508,46 +1592,6 @@ static bool test_rec_ops_needs_regs(struct dyn_ftrace *rec)
1508 return keep_regs; 1592 return keep_regs;
1509} 1593}
1510 1594
1511static void ftrace_remove_tramp(struct ftrace_ops *ops,
1512 struct dyn_ftrace *rec)
1513{
1514 /* If TRAMP is not set, no ops should have a trampoline for this */
1515 if (!(rec->flags & FTRACE_FL_TRAMP))
1516 return;
1517
1518 rec->flags &= ~FTRACE_FL_TRAMP;
1519
1520 if ((!ftrace_hash_empty(ops->func_hash->filter_hash) &&
1521 !ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip)) ||
1522 ftrace_lookup_ip(ops->func_hash->notrace_hash, rec->ip))
1523 return;
1524 /*
1525 * The tramp_hash entry will be removed at time
1526 * of update.
1527 */
1528 ops->nr_trampolines--;
1529}
1530
1531static void ftrace_clear_tramps(struct dyn_ftrace *rec, struct ftrace_ops *ops)
1532{
1533 struct ftrace_ops *op;
1534
1535 /* If TRAMP is not set, no ops should have a trampoline for this */
1536 if (!(rec->flags & FTRACE_FL_TRAMP))
1537 return;
1538
1539 do_for_each_ftrace_op(op, ftrace_ops_list) {
1540 /*
1541 * This function is called to clear other tramps
1542 * not the one that is being updated.
1543 */
1544 if (op == ops)
1545 continue;
1546 if (op->nr_trampolines)
1547 ftrace_remove_tramp(op, rec);
1548 } while_for_each_ftrace_op(op);
1549}
1550
1551static void __ftrace_hash_rec_update(struct ftrace_ops *ops, 1595static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1552 int filter_hash, 1596 int filter_hash,
1553 bool inc) 1597 bool inc)
@@ -1636,18 +1680,16 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1636 * function, and the ops has a trampoline registered 1680 * function, and the ops has a trampoline registered
1637 * for it, then we can call it directly. 1681 * for it, then we can call it directly.
1638 */ 1682 */
1639 if (ftrace_rec_count(rec) == 1 && ops->trampoline) { 1683 if (ftrace_rec_count(rec) == 1 && ops->trampoline)
1640 rec->flags |= FTRACE_FL_TRAMP; 1684 rec->flags |= FTRACE_FL_TRAMP;
1641 ops->nr_trampolines++; 1685 else
1642 } else {
1643 /* 1686 /*
1644 * If we are adding another function callback 1687 * If we are adding another function callback
1645 * to this function, and the previous had a 1688 * to this function, and the previous had a
1646 * custom trampoline in use, then we need to go 1689 * custom trampoline in use, then we need to go
1647 * back to the default trampoline. 1690 * back to the default trampoline.
1648 */ 1691 */
1649 ftrace_clear_tramps(rec, ops); 1692 rec->flags &= ~FTRACE_FL_TRAMP;
1650 }
1651 1693
1652 /* 1694 /*
1653 * If any ops wants regs saved for this function 1695 * If any ops wants regs saved for this function
@@ -1660,9 +1702,6 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1660 return; 1702 return;
1661 rec->flags--; 1703 rec->flags--;
1662 1704
1663 if (ops->trampoline && !ftrace_rec_count(rec))
1664 ftrace_remove_tramp(ops, rec);
1665
1666 /* 1705 /*
1667 * If the rec had REGS enabled and the ops that is 1706 * If the rec had REGS enabled and the ops that is
1668 * being removed had REGS set, then see if there is 1707 * being removed had REGS set, then see if there is
@@ -1677,6 +1716,17 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1677 } 1716 }
1678 1717
1679 /* 1718 /*
1719 * If the rec had TRAMP enabled, then it needs to
1720 * be cleared, as TRAMP can only be enabled when
1721 * there is a single ops attached to it.
1722 * In other words, always disable it on decrementing.
1723 * In the future, we may set it if rec count is
1724 * decremented to one, and the ops that is left
1725 * has a trampoline.
1726 */
1727 rec->flags &= ~FTRACE_FL_TRAMP;
1728
1729 /*
1680 * flags will be cleared in ftrace_check_record() 1730 * flags will be cleared in ftrace_check_record()
1681 * if rec count is zero. 1731 * if rec count is zero.
1682 */ 1732 */
@@ -1735,6 +1785,114 @@ static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops,
1735 ftrace_hash_rec_update_modify(ops, filter_hash, 1); 1785 ftrace_hash_rec_update_modify(ops, filter_hash, 1);
1736} 1786}
1737 1787
1788/*
1789 * Try to update the IPMODIFY flag on each ftrace_rec. Return 0 if the update
1790 * succeeds or no update is needed, -EBUSY if it detects a conflict of the flag
1791 * on an ftrace_rec, and -EINVAL if the new_hash tries to trace all recs.
1792 * Note that old_hash and new_hash have the following meanings:
1793 * - If the hash is NULL, it hits all recs (if IPMODIFY is set, this is rejected)
1794 * - If the hash is EMPTY_HASH, it hits nothing
1795 * - Anything else hits the recs which match the hash entries.
1796 */
1797static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops,
1798 struct ftrace_hash *old_hash,
1799 struct ftrace_hash *new_hash)
1800{
1801 struct ftrace_page *pg;
1802 struct dyn_ftrace *rec, *end = NULL;
1803 int in_old, in_new;
1804
1805 /* Only update if the ops has been registered */
1806 if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
1807 return 0;
1808
1809 if (!(ops->flags & FTRACE_OPS_FL_IPMODIFY))
1810 return 0;
1811
1812 /*
1813 * Since the IPMODIFY is a very address sensitive action, we do not
1814 * allow ftrace_ops to set all functions to new hash.
1815 */
1816 if (!new_hash || !old_hash)
1817 return -EINVAL;
1818
1819 /* Update rec->flags */
1820 do_for_each_ftrace_rec(pg, rec) {
1821 /* We need to update only differences of filter_hash */
1822 in_old = !!ftrace_lookup_ip(old_hash, rec->ip);
1823 in_new = !!ftrace_lookup_ip(new_hash, rec->ip);
1824 if (in_old == in_new)
1825 continue;
1826
1827 if (in_new) {
1828 /* New entries must ensure no others are using it */
1829 if (rec->flags & FTRACE_FL_IPMODIFY)
1830 goto rollback;
1831 rec->flags |= FTRACE_FL_IPMODIFY;
1832 } else /* Removed entry */
1833 rec->flags &= ~FTRACE_FL_IPMODIFY;
1834 } while_for_each_ftrace_rec();
1835
1836 return 0;
1837
1838rollback:
1839 end = rec;
1840
1841 /* Roll back what we did above */
1842 do_for_each_ftrace_rec(pg, rec) {
1843 if (rec == end)
1844 goto err_out;
1845
1846 in_old = !!ftrace_lookup_ip(old_hash, rec->ip);
1847 in_new = !!ftrace_lookup_ip(new_hash, rec->ip);
1848 if (in_old == in_new)
1849 continue;
1850
1851 if (in_new)
1852 rec->flags &= ~FTRACE_FL_IPMODIFY;
1853 else
1854 rec->flags |= FTRACE_FL_IPMODIFY;
1855 } while_for_each_ftrace_rec();
1856
1857err_out:
1858 return -EBUSY;
1859}
1860
1861static int ftrace_hash_ipmodify_enable(struct ftrace_ops *ops)
1862{
1863 struct ftrace_hash *hash = ops->func_hash->filter_hash;
1864
1865 if (ftrace_hash_empty(hash))
1866 hash = NULL;
1867
1868 return __ftrace_hash_update_ipmodify(ops, EMPTY_HASH, hash);
1869}
1870
1871/* Disabling always succeeds */
1872static void ftrace_hash_ipmodify_disable(struct ftrace_ops *ops)
1873{
1874 struct ftrace_hash *hash = ops->func_hash->filter_hash;
1875
1876 if (ftrace_hash_empty(hash))
1877 hash = NULL;
1878
1879 __ftrace_hash_update_ipmodify(ops, hash, EMPTY_HASH);
1880}
1881
1882static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops,
1883 struct ftrace_hash *new_hash)
1884{
1885 struct ftrace_hash *old_hash = ops->func_hash->filter_hash;
1886
1887 if (ftrace_hash_empty(old_hash))
1888 old_hash = NULL;
1889
1890 if (ftrace_hash_empty(new_hash))
1891 new_hash = NULL;
1892
1893 return __ftrace_hash_update_ipmodify(ops, old_hash, new_hash);
1894}
1895
1738static void print_ip_ins(const char *fmt, unsigned char *p) 1896static void print_ip_ins(const char *fmt, unsigned char *p)
1739{ 1897{
1740 int i; 1898 int i;
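
To make the IPMODIFY machinery above concrete: an ftrace_ops that may rewrite the traced instruction pointer sets FTRACE_OPS_FL_IPMODIFY, and registering it is expected to fail with -EBUSY when another IPMODIFY user already claims one of the filtered functions. The sketch below is hypothetical (names and the choice of filter are mine); real users of the flag are kprobes/live-patching style frameworks:

    #include <linux/ftrace.h>
    #include <linux/ptrace.h>
    #include <linux/string.h>

    static void notrace my_handler(unsigned long ip, unsigned long parent_ip,
                                   struct ftrace_ops *op, struct pt_regs *regs)
    {
            /* may redirect execution by rewriting the saved ip, hence IPMODIFY */
    }

    static struct ftrace_ops my_ops = {
            .func  = my_handler,
            .flags = FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_IPMODIFY,
    };

    static int my_attach(char *func)
    {
            int ret = ftrace_set_filter(&my_ops, (unsigned char *)func, strlen(func), 0);

            if (ret)
                    return ret;
            /* -EBUSY here means another IPMODIFY ops already owns the function */
            return register_ftrace_function(&my_ops);
    }
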
@@ -1745,10 +1903,13 @@ static void print_ip_ins(const char *fmt, unsigned char *p)
1745 printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]); 1903 printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]);
1746} 1904}
1747 1905
1906static struct ftrace_ops *
1907ftrace_find_tramp_ops_any(struct dyn_ftrace *rec);
1908
1748/** 1909/**
1749 * ftrace_bug - report and shutdown function tracer 1910 * ftrace_bug - report and shutdown function tracer
1750 * @failed: The failed type (EFAULT, EINVAL, EPERM) 1911 * @failed: The failed type (EFAULT, EINVAL, EPERM)
1751 * @ip: The address that failed 1912 * @rec: The record that failed
1752 * 1913 *
1753 * The arch code that enables or disables the function tracing 1914 * The arch code that enables or disables the function tracing
1754 * can call ftrace_bug() when it has detected a problem in 1915 * can call ftrace_bug() when it has detected a problem in
@@ -1757,8 +1918,10 @@ static void print_ip_ins(const char *fmt, unsigned char *p)
1757 * EINVAL - if what is read at @ip is not what was expected 1918 * EINVAL - if what is read at @ip is not what was expected
1758 * EPERM - if the problem happens on writing to the @ip address 1919 * EPERM - if the problem happens on writing to the @ip address
1759 */ 1920 */
1760void ftrace_bug(int failed, unsigned long ip) 1921void ftrace_bug(int failed, struct dyn_ftrace *rec)
1761{ 1922{
1923 unsigned long ip = rec ? rec->ip : 0;
1924
1762 switch (failed) { 1925 switch (failed) {
1763 case -EFAULT: 1926 case -EFAULT:
1764 FTRACE_WARN_ON_ONCE(1); 1927 FTRACE_WARN_ON_ONCE(1);
@@ -1770,7 +1933,7 @@ void ftrace_bug(int failed, unsigned long ip)
1770 pr_info("ftrace failed to modify "); 1933 pr_info("ftrace failed to modify ");
1771 print_ip_sym(ip); 1934 print_ip_sym(ip);
1772 print_ip_ins(" actual: ", (unsigned char *)ip); 1935 print_ip_ins(" actual: ", (unsigned char *)ip);
1773 printk(KERN_CONT "\n"); 1936 pr_cont("\n");
1774 break; 1937 break;
1775 case -EPERM: 1938 case -EPERM:
1776 FTRACE_WARN_ON_ONCE(1); 1939 FTRACE_WARN_ON_ONCE(1);
@@ -1782,6 +1945,24 @@ void ftrace_bug(int failed, unsigned long ip)
1782 pr_info("ftrace faulted on unknown error "); 1945 pr_info("ftrace faulted on unknown error ");
1783 print_ip_sym(ip); 1946 print_ip_sym(ip);
1784 } 1947 }
1948 if (rec) {
1949 struct ftrace_ops *ops = NULL;
1950
1951 pr_info("ftrace record flags: %lx\n", rec->flags);
1952 pr_cont(" (%ld)%s", ftrace_rec_count(rec),
1953 rec->flags & FTRACE_FL_REGS ? " R" : " ");
1954 if (rec->flags & FTRACE_FL_TRAMP_EN) {
1955 ops = ftrace_find_tramp_ops_any(rec);
1956 if (ops)
1957 pr_cont("\ttramp: %pS",
1958 (void *)ops->trampoline);
1959 else
1960 pr_cont("\ttramp: ERROR!");
1961
1962 }
1963 ip = ftrace_get_addr_curr(rec);
1964 pr_cont(" expected tramp: %lx\n", ip);
1965 }
1785} 1966}
1786 1967
1787static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) 1968static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
@@ -1895,21 +2076,86 @@ int ftrace_test_record(struct dyn_ftrace *rec, int enable)
1895} 2076}
1896 2077
1897static struct ftrace_ops * 2078static struct ftrace_ops *
2079ftrace_find_tramp_ops_any(struct dyn_ftrace *rec)
2080{
2081 struct ftrace_ops *op;
2082 unsigned long ip = rec->ip;
2083
2084 do_for_each_ftrace_op(op, ftrace_ops_list) {
2085
2086 if (!op->trampoline)
2087 continue;
2088
2089 if (hash_contains_ip(ip, op->func_hash))
2090 return op;
2091 } while_for_each_ftrace_op(op);
2092
2093 return NULL;
2094}
2095
2096static struct ftrace_ops *
1898ftrace_find_tramp_ops_curr(struct dyn_ftrace *rec) 2097ftrace_find_tramp_ops_curr(struct dyn_ftrace *rec)
1899{ 2098{
1900 struct ftrace_ops *op; 2099 struct ftrace_ops *op;
2100 unsigned long ip = rec->ip;
1901 2101
1902 /* Removed ops need to be tested first */ 2102 /*
1903 if (removed_ops && removed_ops->tramp_hash) { 2103 * Need to check removed ops first.
1904 if (ftrace_lookup_ip(removed_ops->tramp_hash, rec->ip)) 2104 * If they are being removed, and this rec has a tramp,
2105 * and this rec is in the ops list, then it would be the
2106 * one with the tramp.
2107 */
2108 if (removed_ops) {
2109 if (hash_contains_ip(ip, &removed_ops->old_hash))
1905 return removed_ops; 2110 return removed_ops;
1906 } 2111 }
1907 2112
2113 /*
2114 * Need to find the current trampoline for a rec.
2115 * Now, a trampoline is only attached to a rec if there
2116 * was a single 'ops' attached to it. But this can be called
2117 * when we are adding another op to the rec or removing the
2118 * current one. Thus, if the op is being added, we can
2119 * ignore it because it hasn't attached itself to the rec
2120 * yet.
2121 *
2122 * If an ops is being modified (hooking to different functions)
2123 * then we don't care about the new functions that are being
2124 * added, just the old ones (that are probably being removed).
2125 *
2126 * If we are adding an ops to a function that already is using
2127 * a trampoline, it needs to be removed (trampolines are only
2128 * for single ops connected), then an ops that is not being
2129 * modified also needs to be checked.
2130 */
1908 do_for_each_ftrace_op(op, ftrace_ops_list) { 2131 do_for_each_ftrace_op(op, ftrace_ops_list) {
1909 if (!op->tramp_hash) 2132
2133 if (!op->trampoline)
1910 continue; 2134 continue;
1911 2135
1912 if (ftrace_lookup_ip(op->tramp_hash, rec->ip)) 2136 /*
2137 * If the ops is being added, it hasn't gotten to
2138 * the point to be removed from this tree yet.
2139 */
2140 if (op->flags & FTRACE_OPS_FL_ADDING)
2141 continue;
2142
2143
2144 /*
2145 * If the ops is being modified and is in the old
2146 * hash, then it is probably being removed from this
2147 * function.
2148 */
2149 if ((op->flags & FTRACE_OPS_FL_MODIFYING) &&
2150 hash_contains_ip(ip, &op->old_hash))
2151 return op;
2152 /*
2153 * If the ops is not being added or modified, and it's
2154 * in its normal filter hash, then this must be the one
2155 * we want!
2156 */
2157 if (!(op->flags & FTRACE_OPS_FL_MODIFYING) &&
2158 hash_contains_ip(ip, op->func_hash))
1913 return op; 2159 return op;
1914 2160
1915 } while_for_each_ftrace_op(op); 2161 } while_for_each_ftrace_op(op);
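
Illustrative aside: the comment block above boils down to a priority order for finding the ops that owns a record's trampoline: a removed ops matches through its old hash, an ops still being added is skipped, a modifying ops matches through the hash it is moving away from, and only then does a stable ops match through its normal filter hash. A small standalone C sketch of that selection order follows; the flag names and per-ops predicates are stand-ins for the kernel's hashes, not the real API.

#include <stdbool.h>
#include <stdio.h>

#define FL_ADDING    (1u << 0)
#define FL_MODIFYING (1u << 1)

struct ops {
	const char  *name;
	unsigned int flags;
	bool         has_trampoline;
	bool       (*in_old_hash)(unsigned long ip);
	bool       (*in_filter_hash)(unsigned long ip);
	struct ops  *next;
};

/* Mirrors the order described in the comment above; the predicates stand
 * in for hash_contains_ip() on the filter/old hashes. */
static struct ops *find_tramp_ops_curr(struct ops *removed_ops,
				       struct ops *ops_list, unsigned long ip)
{
	struct ops *op;

	/* Removed ops are tested first, through their old hash. */
	if (removed_ops && removed_ops->in_old_hash(ip))
		return removed_ops;

	for (op = ops_list; op; op = op->next) {
		if (!op->has_trampoline)
			continue;
		/* An ops still being added has not attached to the record. */
		if (op->flags & FL_ADDING)
			continue;
		/* A modifying ops matches via the hash it is leaving. */
		if ((op->flags & FL_MODIFYING) && op->in_old_hash(ip))
			return op;
		/* Otherwise a stable ops matches via its filter hash. */
		if (!(op->flags & FL_MODIFYING) && op->in_filter_hash(ip))
			return op;
	}
	return NULL;
}

static bool never(unsigned long ip)  { (void)ip; return false; }
static bool always(unsigned long ip) { (void)ip; return true;  }

int main(void)
{
	struct ops stable = { "stable", 0,         true, never, always, NULL };
	struct ops adding = { "adding", FL_ADDING, true, never, always, &stable };
	struct ops *hit = find_tramp_ops_curr(NULL, &adding, 0x1234);

	printf("owner: %s\n", hit ? hit->name : "(none)");
	return 0;
}
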
@@ -1921,10 +2167,11 @@ static struct ftrace_ops *
1921ftrace_find_tramp_ops_new(struct dyn_ftrace *rec) 2167ftrace_find_tramp_ops_new(struct dyn_ftrace *rec)
1922{ 2168{
1923 struct ftrace_ops *op; 2169 struct ftrace_ops *op;
2170 unsigned long ip = rec->ip;
1924 2171
1925 do_for_each_ftrace_op(op, ftrace_ops_list) { 2172 do_for_each_ftrace_op(op, ftrace_ops_list) {
1926 /* pass rec in as regs to have non-NULL val */ 2173 /* pass rec in as regs to have non-NULL val */
1927 if (ftrace_ops_test(op, rec->ip, rec)) 2174 if (hash_contains_ip(ip, op->func_hash))
1928 return op; 2175 return op;
1929 } while_for_each_ftrace_op(op); 2176 } while_for_each_ftrace_op(op);
1930 2177
@@ -2038,7 +2285,7 @@ void __weak ftrace_replace_code(int enable)
2038 do_for_each_ftrace_rec(pg, rec) { 2285 do_for_each_ftrace_rec(pg, rec) {
2039 failed = __ftrace_replace_code(rec, enable); 2286 failed = __ftrace_replace_code(rec, enable);
2040 if (failed) { 2287 if (failed) {
2041 ftrace_bug(failed, rec->ip); 2288 ftrace_bug(failed, rec);
2042 /* Stop processing */ 2289 /* Stop processing */
2043 return; 2290 return;
2044 } 2291 }
@@ -2120,17 +2367,14 @@ struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter)
2120static int 2367static int
2121ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec) 2368ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec)
2122{ 2369{
2123 unsigned long ip;
2124 int ret; 2370 int ret;
2125 2371
2126 ip = rec->ip;
2127
2128 if (unlikely(ftrace_disabled)) 2372 if (unlikely(ftrace_disabled))
2129 return 0; 2373 return 0;
2130 2374
2131 ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR); 2375 ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR);
2132 if (ret) { 2376 if (ret) {
2133 ftrace_bug(ret, ip); 2377 ftrace_bug(ret, rec);
2134 return 0; 2378 return 0;
2135 } 2379 }
2136 return 1; 2380 return 1;
@@ -2231,92 +2475,6 @@ void __weak arch_ftrace_update_code(int command)
2231 ftrace_run_stop_machine(command); 2475 ftrace_run_stop_machine(command);
2232} 2476}
2233 2477
2234static int ftrace_save_ops_tramp_hash(struct ftrace_ops *ops)
2235{
2236 struct ftrace_page *pg;
2237 struct dyn_ftrace *rec;
2238 int size, bits;
2239 int ret;
2240
2241 size = ops->nr_trampolines;
2242 bits = 0;
2243 /*
2244 * Make the hash size about 1/2 the # found
2245 */
2246 for (size /= 2; size; size >>= 1)
2247 bits++;
2248
2249 ops->tramp_hash = alloc_ftrace_hash(bits);
2250 /*
2251 * TODO: a failed allocation is going to screw up
2252 * the accounting of what needs to be modified
2253 * and not. For now, we kill ftrace if we fail
2254 * to allocate here. But there are ways around this,
2255 * but that will take a little more work.
2256 */
2257 if (!ops->tramp_hash)
2258 return -ENOMEM;
2259
2260 do_for_each_ftrace_rec(pg, rec) {
2261 if (ftrace_rec_count(rec) == 1 &&
2262 ftrace_ops_test(ops, rec->ip, rec)) {
2263
2264 /*
2265 * If another ops adds to a rec, the rec will
2266 * lose its trampoline and never get it back
2267 * until all ops are off of it.
2268 */
2269 if (!(rec->flags & FTRACE_FL_TRAMP))
2270 continue;
2271
2272 /* This record had better have a trampoline */
2273 if (FTRACE_WARN_ON(!(rec->flags & FTRACE_FL_TRAMP_EN)))
2274 return -1;
2275
2276 ret = add_hash_entry(ops->tramp_hash, rec->ip);
2277 if (ret < 0)
2278 return ret;
2279 }
2280 } while_for_each_ftrace_rec();
2281
2282 /* The number of recs in the hash must match nr_trampolines */
2283 if (FTRACE_WARN_ON(ops->tramp_hash->count != ops->nr_trampolines))
2284 pr_warn("count=%ld trampolines=%d\n",
2285 ops->tramp_hash->count,
2286 ops->nr_trampolines);
2287
2288 return 0;
2289}
2290
2291static int ftrace_save_tramp_hashes(void)
2292{
2293 struct ftrace_ops *op;
2294 int ret;
2295
2296 /*
2297 * Now that any trampoline is being used, we need to save the
2298 * hashes for the ops that have them. This allows the mapping
2299 * back from the record to the ops that has the trampoline to
2300 * know what code is being replaced. Modifying code must always
2301 * verify what it is changing.
2302 */
2303 do_for_each_ftrace_op(op, ftrace_ops_list) {
2304
2305 /* The tramp_hash is recreated each time. */
2306 free_ftrace_hash(op->tramp_hash);
2307 op->tramp_hash = NULL;
2308
2309 if (op->nr_trampolines) {
2310 ret = ftrace_save_ops_tramp_hash(op);
2311 if (ret)
2312 return ret;
2313 }
2314
2315 } while_for_each_ftrace_op(op);
2316
2317 return 0;
2318}
2319
2320static void ftrace_run_update_code(int command) 2478static void ftrace_run_update_code(int command)
2321{ 2479{
2322 int ret; 2480 int ret;
@@ -2336,14 +2494,25 @@ static void ftrace_run_update_code(int command)
2336 2494
2337 ret = ftrace_arch_code_modify_post_process(); 2495 ret = ftrace_arch_code_modify_post_process();
2338 FTRACE_WARN_ON(ret); 2496 FTRACE_WARN_ON(ret);
2497}
2339 2498
2340 ret = ftrace_save_tramp_hashes(); 2499static void ftrace_run_modify_code(struct ftrace_ops *ops, int command,
2341 FTRACE_WARN_ON(ret); 2500 struct ftrace_hash *old_hash)
2501{
2502 ops->flags |= FTRACE_OPS_FL_MODIFYING;
2503 ops->old_hash.filter_hash = old_hash;
2504 ftrace_run_update_code(command);
2505 ops->old_hash.filter_hash = NULL;
2506 ops->flags &= ~FTRACE_OPS_FL_MODIFYING;
2342} 2507}
2343 2508
2344static ftrace_func_t saved_ftrace_func; 2509static ftrace_func_t saved_ftrace_func;
2345static int ftrace_start_up; 2510static int ftrace_start_up;
2346 2511
2512void __weak arch_ftrace_trampoline_free(struct ftrace_ops *ops)
2513{
2514}
2515
2347static void control_ops_free(struct ftrace_ops *ops) 2516static void control_ops_free(struct ftrace_ops *ops)
2348{ 2517{
2349 free_percpu(ops->disabled); 2518 free_percpu(ops->disabled);
@@ -2362,6 +2531,13 @@ static void ftrace_startup_enable(int command)
2362 ftrace_run_update_code(command); 2531 ftrace_run_update_code(command);
2363} 2532}
2364 2533
2534static void ftrace_startup_all(int command)
2535{
2536 update_all_ops = true;
2537 ftrace_startup_enable(command);
2538 update_all_ops = false;
2539}
2540
2365static int ftrace_startup(struct ftrace_ops *ops, int command) 2541static int ftrace_startup(struct ftrace_ops *ops, int command)
2366{ 2542{
2367 int ret; 2543 int ret;
@@ -2376,12 +2552,31 @@ static int ftrace_startup(struct ftrace_ops *ops, int command)
2376 ftrace_start_up++; 2552 ftrace_start_up++;
2377 command |= FTRACE_UPDATE_CALLS; 2553 command |= FTRACE_UPDATE_CALLS;
2378 2554
2379 ops->flags |= FTRACE_OPS_FL_ENABLED; 2555 /*
 2556 * Note that ftrace probes use this to start up
2557 * and modify functions it will probe. But we still
2558 * set the ADDING flag for modification, as probes
2559 * do not have trampolines. If they add them in the
2560 * future, then the probes will need to distinguish
2561 * between adding and updating probes.
2562 */
2563 ops->flags |= FTRACE_OPS_FL_ENABLED | FTRACE_OPS_FL_ADDING;
2564
2565 ret = ftrace_hash_ipmodify_enable(ops);
2566 if (ret < 0) {
2567 /* Rollback registration process */
2568 __unregister_ftrace_function(ops);
2569 ftrace_start_up--;
2570 ops->flags &= ~FTRACE_OPS_FL_ENABLED;
2571 return ret;
2572 }
2380 2573
2381 ftrace_hash_rec_enable(ops, 1); 2574 ftrace_hash_rec_enable(ops, 1);
2382 2575
2383 ftrace_startup_enable(command); 2576 ftrace_startup_enable(command);
2384 2577
2578 ops->flags &= ~FTRACE_OPS_FL_ADDING;
2579
2385 return 0; 2580 return 0;
2386} 2581}
2387 2582
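
Illustrative aside: the startup path above now brackets the update with FTRACE_OPS_FL_ADDING and unwinds the registration if the ipmodify check refuses the new ops. A compact userspace sketch of that enable-with-rollback shape; the helper name and failure condition are invented, and the kernel's actual rollback differs in detail.

#include <errno.h>
#include <stdio.h>

#define FL_ENABLED (1u << 0)
#define FL_ADDING  (1u << 1)

struct ops { unsigned int flags; int ipmodify_conflict; };

static int start_up;

/* Placeholder for the ipmodify check: it fails when two ops that both
 * rewrite the instruction pointer would attach to the same function. */
static int ipmodify_enable(struct ops *ops)
{
	return ops->ipmodify_conflict ? -EBUSY : 0;
}

static int startup(struct ops *ops)
{
	int ret;

	start_up++;
	ops->flags |= FL_ENABLED | FL_ADDING;

	ret = ipmodify_enable(ops);
	if (ret < 0) {
		/* Roll back what was done so far. */
		start_up--;
		ops->flags &= ~(FL_ENABLED | FL_ADDING);
		return ret;
	}

	/* ... update call sites here ... */

	ops->flags &= ~FL_ADDING;	/* adding phase is over */
	return 0;
}

int main(void)
{
	struct ops good = { 0, 0 }, bad = { 0, 1 };

	printf("good: %d flags=%x start_up=%d\n", startup(&good), good.flags, start_up);
	printf("bad:  %d flags=%x start_up=%d\n", startup(&bad), bad.flags, start_up);
	return 0;
}
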
@@ -2404,6 +2599,8 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
2404 */ 2599 */
2405 WARN_ON_ONCE(ftrace_start_up < 0); 2600 WARN_ON_ONCE(ftrace_start_up < 0);
2406 2601
2602 /* Disabling ipmodify never fails */
2603 ftrace_hash_ipmodify_disable(ops);
2407 ftrace_hash_rec_disable(ops, 1); 2604 ftrace_hash_rec_disable(ops, 1);
2408 2605
2409 ops->flags &= ~FTRACE_OPS_FL_ENABLED; 2606 ops->flags &= ~FTRACE_OPS_FL_ENABLED;
@@ -2431,11 +2628,35 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
2431 * If the ops uses a trampoline, then it needs to be 2628 * If the ops uses a trampoline, then it needs to be
2432 * tested first on update. 2629 * tested first on update.
2433 */ 2630 */
2631 ops->flags |= FTRACE_OPS_FL_REMOVING;
2434 removed_ops = ops; 2632 removed_ops = ops;
2435 2633
2634 /* The trampoline logic checks the old hashes */
2635 ops->old_hash.filter_hash = ops->func_hash->filter_hash;
2636 ops->old_hash.notrace_hash = ops->func_hash->notrace_hash;
2637
2436 ftrace_run_update_code(command); 2638 ftrace_run_update_code(command);
2437 2639
2640 /*
2641 * If there's no more ops registered with ftrace, run a
2642 * sanity check to make sure all rec flags are cleared.
2643 */
2644 if (ftrace_ops_list == &ftrace_list_end) {
2645 struct ftrace_page *pg;
2646 struct dyn_ftrace *rec;
2647
2648 do_for_each_ftrace_rec(pg, rec) {
2649 if (FTRACE_WARN_ON_ONCE(rec->flags))
2650 pr_warn(" %pS flags:%lx\n",
2651 (void *)rec->ip, rec->flags);
2652 } while_for_each_ftrace_rec();
2653 }
2654
2655 ops->old_hash.filter_hash = NULL;
2656 ops->old_hash.notrace_hash = NULL;
2657
2438 removed_ops = NULL; 2658 removed_ops = NULL;
2659 ops->flags &= ~FTRACE_OPS_FL_REMOVING;
2439 2660
2440 /* 2661 /*
2441 * Dynamic ops may be freed, we must make sure that all 2662 * Dynamic ops may be freed, we must make sure that all
@@ -2454,6 +2675,8 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
2454 if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_CONTROL)) { 2675 if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_CONTROL)) {
2455 schedule_on_each_cpu(ftrace_sync); 2676 schedule_on_each_cpu(ftrace_sync);
2456 2677
2678 arch_ftrace_trampoline_free(ops);
2679
2457 if (ops->flags & FTRACE_OPS_FL_CONTROL) 2680 if (ops->flags & FTRACE_OPS_FL_CONTROL)
2458 control_ops_free(ops); 2681 control_ops_free(ops);
2459 } 2682 }
@@ -2606,7 +2829,7 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
2606 if (ftrace_start_up && cnt) { 2829 if (ftrace_start_up && cnt) {
2607 int failed = __ftrace_replace_code(p, 1); 2830 int failed = __ftrace_replace_code(p, 1);
2608 if (failed) 2831 if (failed)
2609 ftrace_bug(failed, p->ip); 2832 ftrace_bug(failed, p);
2610 } 2833 }
2611 } 2834 }
2612 } 2835 }
@@ -2931,6 +3154,22 @@ static void t_stop(struct seq_file *m, void *p)
2931 mutex_unlock(&ftrace_lock); 3154 mutex_unlock(&ftrace_lock);
2932} 3155}
2933 3156
3157void * __weak
3158arch_ftrace_trampoline_func(struct ftrace_ops *ops, struct dyn_ftrace *rec)
3159{
3160 return NULL;
3161}
3162
3163static void add_trampoline_func(struct seq_file *m, struct ftrace_ops *ops,
3164 struct dyn_ftrace *rec)
3165{
3166 void *ptr;
3167
3168 ptr = arch_ftrace_trampoline_func(ops, rec);
3169 if (ptr)
3170 seq_printf(m, " ->%pS", ptr);
3171}
3172
2934static int t_show(struct seq_file *m, void *v) 3173static int t_show(struct seq_file *m, void *v)
2935{ 3174{
2936 struct ftrace_iterator *iter = m->private; 3175 struct ftrace_iterator *iter = m->private;
@@ -2941,9 +3180,9 @@ static int t_show(struct seq_file *m, void *v)
2941 3180
2942 if (iter->flags & FTRACE_ITER_PRINTALL) { 3181 if (iter->flags & FTRACE_ITER_PRINTALL) {
2943 if (iter->flags & FTRACE_ITER_NOTRACE) 3182 if (iter->flags & FTRACE_ITER_NOTRACE)
2944 seq_printf(m, "#### no functions disabled ####\n"); 3183 seq_puts(m, "#### no functions disabled ####\n");
2945 else 3184 else
2946 seq_printf(m, "#### all functions enabled ####\n"); 3185 seq_puts(m, "#### all functions enabled ####\n");
2947 return 0; 3186 return 0;
2948 } 3187 }
2949 3188
@@ -2954,22 +3193,25 @@ static int t_show(struct seq_file *m, void *v)
2954 3193
2955 seq_printf(m, "%ps", (void *)rec->ip); 3194 seq_printf(m, "%ps", (void *)rec->ip);
2956 if (iter->flags & FTRACE_ITER_ENABLED) { 3195 if (iter->flags & FTRACE_ITER_ENABLED) {
2957 seq_printf(m, " (%ld)%s", 3196 struct ftrace_ops *ops = NULL;
3197
3198 seq_printf(m, " (%ld)%s%s",
2958 ftrace_rec_count(rec), 3199 ftrace_rec_count(rec),
2959 rec->flags & FTRACE_FL_REGS ? " R" : " "); 3200 rec->flags & FTRACE_FL_REGS ? " R" : " ",
3201 rec->flags & FTRACE_FL_IPMODIFY ? " I" : " ");
2960 if (rec->flags & FTRACE_FL_TRAMP_EN) { 3202 if (rec->flags & FTRACE_FL_TRAMP_EN) {
2961 struct ftrace_ops *ops; 3203 ops = ftrace_find_tramp_ops_any(rec);
2962 3204 if (ops)
2963 ops = ftrace_find_tramp_ops_curr(rec);
2964 if (ops && ops->trampoline)
2965 seq_printf(m, "\ttramp: %pS", 3205 seq_printf(m, "\ttramp: %pS",
2966 (void *)ops->trampoline); 3206 (void *)ops->trampoline);
2967 else 3207 else
2968 seq_printf(m, "\ttramp: ERROR!"); 3208 seq_puts(m, "\ttramp: ERROR!");
3209
2969 } 3210 }
3211 add_trampoline_func(m, ops, rec);
2970 } 3212 }
2971 3213
2972 seq_printf(m, "\n"); 3214 seq_putc(m, '\n');
2973 3215
2974 return 0; 3216 return 0;
2975} 3217}
@@ -3003,9 +3245,6 @@ ftrace_enabled_open(struct inode *inode, struct file *file)
3003{ 3245{
3004 struct ftrace_iterator *iter; 3246 struct ftrace_iterator *iter;
3005 3247
3006 if (unlikely(ftrace_disabled))
3007 return -ENODEV;
3008
3009 iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); 3248 iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter));
3010 if (iter) { 3249 if (iter) {
3011 iter->pg = ftrace_pages_start; 3250 iter->pg = ftrace_pages_start;
@@ -3340,7 +3579,7 @@ static struct ftrace_ops trace_probe_ops __read_mostly =
3340 3579
3341static int ftrace_probe_registered; 3580static int ftrace_probe_registered;
3342 3581
3343static void __enable_ftrace_function_probe(void) 3582static void __enable_ftrace_function_probe(struct ftrace_hash *old_hash)
3344{ 3583{
3345 int ret; 3584 int ret;
3346 int i; 3585 int i;
@@ -3348,7 +3587,8 @@ static void __enable_ftrace_function_probe(void)
3348 if (ftrace_probe_registered) { 3587 if (ftrace_probe_registered) {
3349 /* still need to update the function call sites */ 3588 /* still need to update the function call sites */
3350 if (ftrace_enabled) 3589 if (ftrace_enabled)
3351 ftrace_run_update_code(FTRACE_UPDATE_CALLS); 3590 ftrace_run_modify_code(&trace_probe_ops, FTRACE_UPDATE_CALLS,
3591 old_hash);
3352 return; 3592 return;
3353 } 3593 }
3354 3594
@@ -3399,6 +3639,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3399{ 3639{
3400 struct ftrace_func_probe *entry; 3640 struct ftrace_func_probe *entry;
3401 struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash; 3641 struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash;
3642 struct ftrace_hash *old_hash = *orig_hash;
3402 struct ftrace_hash *hash; 3643 struct ftrace_hash *hash;
3403 struct ftrace_page *pg; 3644 struct ftrace_page *pg;
3404 struct dyn_ftrace *rec; 3645 struct dyn_ftrace *rec;
@@ -3417,7 +3658,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3417 3658
3418 mutex_lock(&trace_probe_ops.func_hash->regex_lock); 3659 mutex_lock(&trace_probe_ops.func_hash->regex_lock);
3419 3660
3420 hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); 3661 hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash);
3421 if (!hash) { 3662 if (!hash) {
3422 count = -ENOMEM; 3663 count = -ENOMEM;
3423 goto out; 3664 goto out;
@@ -3476,10 +3717,13 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3476 } while_for_each_ftrace_rec(); 3717 } while_for_each_ftrace_rec();
3477 3718
3478 ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); 3719 ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash);
3479 if (ret < 0)
3480 count = ret;
3481 3720
3482 __enable_ftrace_function_probe(); 3721 __enable_ftrace_function_probe(old_hash);
3722
3723 if (!ret)
3724 free_ftrace_hash_rcu(old_hash);
3725 else
3726 count = ret;
3483 3727
3484 out_unlock: 3728 out_unlock:
3485 mutex_unlock(&ftrace_lock); 3729 mutex_unlock(&ftrace_lock);
@@ -3503,6 +3747,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3503 struct ftrace_func_probe *entry; 3747 struct ftrace_func_probe *entry;
3504 struct ftrace_func_probe *p; 3748 struct ftrace_func_probe *p;
3505 struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash; 3749 struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash;
3750 struct ftrace_hash *old_hash = *orig_hash;
3506 struct list_head free_list; 3751 struct list_head free_list;
3507 struct ftrace_hash *hash; 3752 struct ftrace_hash *hash;
3508 struct hlist_node *tmp; 3753 struct hlist_node *tmp;
@@ -3510,6 +3755,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3510 int type = MATCH_FULL; 3755 int type = MATCH_FULL;
3511 int i, len = 0; 3756 int i, len = 0;
3512 char *search; 3757 char *search;
3758 int ret;
3513 3759
3514 if (glob && (strcmp(glob, "*") == 0 || !strlen(glob))) 3760 if (glob && (strcmp(glob, "*") == 0 || !strlen(glob)))
3515 glob = NULL; 3761 glob = NULL;
@@ -3568,8 +3814,11 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3568 * Remove after the disable is called. Otherwise, if the last 3814 * Remove after the disable is called. Otherwise, if the last
3569 * probe is removed, a null hash means *all enabled*. 3815 * probe is removed, a null hash means *all enabled*.
3570 */ 3816 */
3571 ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); 3817 ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash);
3572 synchronize_sched(); 3818 synchronize_sched();
3819 if (!ret)
3820 free_ftrace_hash_rcu(old_hash);
3821
3573 list_for_each_entry_safe(entry, p, &free_list, free_list) { 3822 list_for_each_entry_safe(entry, p, &free_list, free_list) {
3574 list_del(&entry->free_list); 3823 list_del(&entry->free_list);
3575 ftrace_free_entry(entry); 3824 ftrace_free_entry(entry);
@@ -3756,10 +4005,11 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove)
3756 return add_hash_entry(hash, ip); 4005 return add_hash_entry(hash, ip);
3757} 4006}
3758 4007
3759static void ftrace_ops_update_code(struct ftrace_ops *ops) 4008static void ftrace_ops_update_code(struct ftrace_ops *ops,
4009 struct ftrace_hash *old_hash)
3760{ 4010{
3761 if (ops->flags & FTRACE_OPS_FL_ENABLED && ftrace_enabled) 4011 if (ops->flags & FTRACE_OPS_FL_ENABLED && ftrace_enabled)
3762 ftrace_run_update_code(FTRACE_UPDATE_CALLS); 4012 ftrace_run_modify_code(ops, FTRACE_UPDATE_CALLS, old_hash);
3763} 4013}
3764 4014
3765static int 4015static int
@@ -3767,6 +4017,7 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
3767 unsigned long ip, int remove, int reset, int enable) 4017 unsigned long ip, int remove, int reset, int enable)
3768{ 4018{
3769 struct ftrace_hash **orig_hash; 4019 struct ftrace_hash **orig_hash;
4020 struct ftrace_hash *old_hash;
3770 struct ftrace_hash *hash; 4021 struct ftrace_hash *hash;
3771 int ret; 4022 int ret;
3772 4023
@@ -3801,10 +4052,12 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
3801 } 4052 }
3802 4053
3803 mutex_lock(&ftrace_lock); 4054 mutex_lock(&ftrace_lock);
4055 old_hash = *orig_hash;
3804 ret = ftrace_hash_move(ops, enable, orig_hash, hash); 4056 ret = ftrace_hash_move(ops, enable, orig_hash, hash);
3805 if (!ret) 4057 if (!ret) {
3806 ftrace_ops_update_code(ops); 4058 ftrace_ops_update_code(ops, old_hash);
3807 4059 free_ftrace_hash_rcu(old_hash);
4060 }
3808 mutex_unlock(&ftrace_lock); 4061 mutex_unlock(&ftrace_lock);
3809 4062
3810 out_regex_unlock: 4063 out_regex_unlock:
@@ -3944,6 +4197,9 @@ static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata;
3944static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata; 4197static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata;
3945static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer); 4198static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer);
3946 4199
4200static unsigned long save_global_trampoline;
4201static unsigned long save_global_flags;
4202
3947static int __init set_graph_function(char *str) 4203static int __init set_graph_function(char *str)
3948{ 4204{
3949 strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); 4205 strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE);
@@ -4013,6 +4269,7 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
4013 struct seq_file *m = (struct seq_file *)file->private_data; 4269 struct seq_file *m = (struct seq_file *)file->private_data;
4014 struct ftrace_iterator *iter; 4270 struct ftrace_iterator *iter;
4015 struct ftrace_hash **orig_hash; 4271 struct ftrace_hash **orig_hash;
4272 struct ftrace_hash *old_hash;
4016 struct trace_parser *parser; 4273 struct trace_parser *parser;
4017 int filter_hash; 4274 int filter_hash;
4018 int ret; 4275 int ret;
@@ -4042,11 +4299,13 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
4042 orig_hash = &iter->ops->func_hash->notrace_hash; 4299 orig_hash = &iter->ops->func_hash->notrace_hash;
4043 4300
4044 mutex_lock(&ftrace_lock); 4301 mutex_lock(&ftrace_lock);
4302 old_hash = *orig_hash;
4045 ret = ftrace_hash_move(iter->ops, filter_hash, 4303 ret = ftrace_hash_move(iter->ops, filter_hash,
4046 orig_hash, iter->hash); 4304 orig_hash, iter->hash);
4047 if (!ret) 4305 if (!ret) {
4048 ftrace_ops_update_code(iter->ops); 4306 ftrace_ops_update_code(iter->ops, old_hash);
4049 4307 free_ftrace_hash_rcu(old_hash);
4308 }
4050 mutex_unlock(&ftrace_lock); 4309 mutex_unlock(&ftrace_lock);
4051 } 4310 }
4052 4311
@@ -4149,9 +4408,9 @@ static int g_show(struct seq_file *m, void *v)
4149 struct ftrace_graph_data *fgd = m->private; 4408 struct ftrace_graph_data *fgd = m->private;
4150 4409
4151 if (fgd->table == ftrace_graph_funcs) 4410 if (fgd->table == ftrace_graph_funcs)
4152 seq_printf(m, "#### all functions enabled ####\n"); 4411 seq_puts(m, "#### all functions enabled ####\n");
4153 else 4412 else
4154 seq_printf(m, "#### no functions disabled ####\n"); 4413 seq_puts(m, "#### no functions disabled ####\n");
4155 return 0; 4414 return 0;
4156 } 4415 }
4157 4416
@@ -4662,6 +4921,32 @@ void __init ftrace_init(void)
4662 ftrace_disabled = 1; 4921 ftrace_disabled = 1;
4663} 4922}
4664 4923
4924/* Do nothing if arch does not support this */
4925void __weak arch_ftrace_update_trampoline(struct ftrace_ops *ops)
4926{
4927}
4928
4929static void ftrace_update_trampoline(struct ftrace_ops *ops)
4930{
4931
4932/*
4933 * Currently there's no safe way to free a trampoline when the kernel
4934 * is configured with PREEMPT. That is because a task could be preempted
4935 * while it is on the trampoline, and it may stay preempted for a long time
4936 * depending on the system load, and currently there's no way to know
4937 * when it will be off the trampoline. If the trampoline is freed
4938 * too early, when the task runs again, it will be executing on freed
4939 * memory and crash.
4940 */
4941#ifdef CONFIG_PREEMPT
4942 /* Currently, only non dynamic ops can have a trampoline */
4943 if (ops->flags & FTRACE_OPS_FL_DYNAMIC)
4944 return;
4945#endif
4946
4947 arch_ftrace_update_trampoline(ops);
4948}
4949
4665#else 4950#else
4666 4951
4667static struct ftrace_ops global_ops = { 4952static struct ftrace_ops global_ops = {
@@ -4678,6 +4963,7 @@ core_initcall(ftrace_nodyn_init);
4678 4963
4679static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } 4964static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; }
4680static inline void ftrace_startup_enable(int command) { } 4965static inline void ftrace_startup_enable(int command) { }
4966static inline void ftrace_startup_all(int command) { }
4681/* Keep as macros so we do not need to define the commands */ 4967/* Keep as macros so we do not need to define the commands */
4682# define ftrace_startup(ops, command) \ 4968# define ftrace_startup(ops, command) \
4683 ({ \ 4969 ({ \
@@ -4703,6 +4989,10 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs)
4703 return 1; 4989 return 1;
4704} 4990}
4705 4991
4992static void ftrace_update_trampoline(struct ftrace_ops *ops)
4993{
4994}
4995
4706#endif /* CONFIG_DYNAMIC_FTRACE */ 4996#endif /* CONFIG_DYNAMIC_FTRACE */
4707 4997
4708__init void ftrace_init_global_array_ops(struct trace_array *tr) 4998__init void ftrace_init_global_array_ops(struct trace_array *tr)
@@ -4827,6 +5117,56 @@ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip)
4827} 5117}
4828#endif 5118#endif
4829 5119
5120/*
5121 * If there's only one function registered but it does not support
5122 * recursion, this function will be called by the mcount trampoline.
5123 * This function will handle recursion protection.
5124 */
5125static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip,
5126 struct ftrace_ops *op, struct pt_regs *regs)
5127{
5128 int bit;
5129
5130 bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX);
5131 if (bit < 0)
5132 return;
5133
5134 op->func(ip, parent_ip, op, regs);
5135
5136 trace_clear_recursion(bit);
5137}
5138
5139/**
5140 * ftrace_ops_get_func - get the function a trampoline should call
5141 * @ops: the ops to get the function for
5142 *
5143 * Normally the mcount trampoline will call the ops->func, but there
5144 * are times that it should not. For example, if the ops does not
5145 * have its own recursion protection, then it should call the
5146 * ftrace_ops_recurs_func() instead.
5147 *
5148 * Returns the function that the trampoline should call for @ops.
5149 */
5150ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops)
5151{
5152 /*
5153 * If this is a dynamic ops or we force list func,
5154 * then it needs to call the list anyway.
5155 */
5156 if (ops->flags & FTRACE_OPS_FL_DYNAMIC || FTRACE_FORCE_LIST_FUNC)
5157 return ftrace_ops_list_func;
5158
5159 /*
5160 * If the func handles its own recursion, call it directly.
5161 * Otherwise call the recursion protected function that
5162 * will call the ftrace ops function.
5163 */
5164 if (!(ops->flags & FTRACE_OPS_FL_RECURSION_SAFE))
5165 return ftrace_ops_recurs_func;
5166
5167 return ops->func;
5168}
5169
4830static void clear_ftrace_swapper(void) 5170static void clear_ftrace_swapper(void)
4831{ 5171{
4832 struct task_struct *p; 5172 struct task_struct *p;
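
Illustrative aside: ftrace_ops_get_func() above chooses between the list function, a recursion-protected wrapper, and the raw callback. The sketch below models that choice and the test-and-set guard with a single flag rather than the kernel's per-context recursion bits; all names here are illustrative.

#include <stdio.h>

#define FL_DYNAMIC        (1u << 0)
#define FL_RECURSION_SAFE (1u << 1)

typedef void (*func_t)(unsigned long ip);

struct ops { unsigned int flags; func_t func; };

/* One flag instead of the kernel's per-context recursion bitmask. */
static int in_trace;
static struct ops *cur_ops;

static void list_func(unsigned long ip) { printf("list func at %lx\n", ip); }

static void recurs_func(unsigned long ip)
{
	if (in_trace)		/* already tracing: bail out instead of recursing */
		return;
	in_trace = 1;
	cur_ops->func(ip);
	in_trace = 0;
}

static func_t get_func(struct ops *ops)
{
	if (ops->flags & FL_DYNAMIC)		/* dynamic ops go through the list */
		return list_func;
	if (!(ops->flags & FL_RECURSION_SAFE))	/* wrap callbacks without their own protection */
		return recurs_func;
	return ops->func;			/* recursion-safe callbacks are called directly */
}

static void my_cb(unsigned long ip) { printf("callback at %lx\n", ip); }

int main(void)
{
	struct ops ops = { 0, my_cb };

	cur_ops = &ops;
	get_func(&ops)(0xdeadbeef);
	return 0;
}
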
@@ -4927,7 +5267,8 @@ static int ftrace_pid_add(int p)
4927 set_ftrace_pid_task(pid); 5267 set_ftrace_pid_task(pid);
4928 5268
4929 ftrace_update_pid_func(); 5269 ftrace_update_pid_func();
4930 ftrace_startup_enable(0); 5270
5271 ftrace_startup_all(0);
4931 5272
4932 mutex_unlock(&ftrace_lock); 5273 mutex_unlock(&ftrace_lock);
4933 return 0; 5274 return 0;
@@ -4956,7 +5297,7 @@ static void ftrace_pid_reset(void)
4956 } 5297 }
4957 5298
4958 ftrace_update_pid_func(); 5299 ftrace_update_pid_func();
4959 ftrace_startup_enable(0); 5300 ftrace_startup_all(0);
4960 5301
4961 mutex_unlock(&ftrace_lock); 5302 mutex_unlock(&ftrace_lock);
4962} 5303}
@@ -4989,12 +5330,12 @@ static int fpid_show(struct seq_file *m, void *v)
4989 const struct ftrace_pid *fpid = list_entry(v, struct ftrace_pid, list); 5330 const struct ftrace_pid *fpid = list_entry(v, struct ftrace_pid, list);
4990 5331
4991 if (v == (void *)1) { 5332 if (v == (void *)1) {
4992 seq_printf(m, "no pid\n"); 5333 seq_puts(m, "no pid\n");
4993 return 0; 5334 return 0;
4994 } 5335 }
4995 5336
4996 if (fpid->pid == ftrace_swapper_pid) 5337 if (fpid->pid == ftrace_swapper_pid)
4997 seq_printf(m, "swapper tasks\n"); 5338 seq_puts(m, "swapper tasks\n");
4998 else 5339 else
4999 seq_printf(m, "%u\n", pid_vnr(fpid->pid)); 5340 seq_printf(m, "%u\n", pid_vnr(fpid->pid));
5000 5341
@@ -5207,6 +5548,7 @@ static struct ftrace_ops graph_ops = {
5207 FTRACE_OPS_FL_STUB, 5548 FTRACE_OPS_FL_STUB,
5208#ifdef FTRACE_GRAPH_TRAMP_ADDR 5549#ifdef FTRACE_GRAPH_TRAMP_ADDR
5209 .trampoline = FTRACE_GRAPH_TRAMP_ADDR, 5550 .trampoline = FTRACE_GRAPH_TRAMP_ADDR,
5551 /* trampoline_size is only needed for dynamically allocated tramps */
5210#endif 5552#endif
5211 ASSIGN_OPS_HASH(graph_ops, &global_ops.local_hash) 5553 ASSIGN_OPS_HASH(graph_ops, &global_ops.local_hash)
5212}; 5554};
@@ -5436,7 +5778,6 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
5436 update_function_graph_func(); 5778 update_function_graph_func();
5437 5779
5438 ret = ftrace_startup(&graph_ops, FTRACE_START_FUNC_RET); 5780 ret = ftrace_startup(&graph_ops, FTRACE_START_FUNC_RET);
5439
5440out: 5781out:
5441 mutex_unlock(&ftrace_lock); 5782 mutex_unlock(&ftrace_lock);
5442 return ret; 5783 return ret;
@@ -5457,6 +5798,17 @@ void unregister_ftrace_graph(void)
5457 unregister_pm_notifier(&ftrace_suspend_notifier); 5798 unregister_pm_notifier(&ftrace_suspend_notifier);
5458 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); 5799 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
5459 5800
5801#ifdef CONFIG_DYNAMIC_FTRACE
5802 /*
5803 * Function graph does not allocate the trampoline, but
5804 * other global_ops do. We need to reset the ALLOC_TRAMP flag
5805 * if one was used.
5806 */
5807 global_ops.trampoline = save_global_trampoline;
5808 if (save_global_flags & FTRACE_OPS_FL_ALLOC_TRAMP)
5809 global_ops.flags |= FTRACE_OPS_FL_ALLOC_TRAMP;
5810#endif
5811
5460 out: 5812 out:
5461 mutex_unlock(&ftrace_lock); 5813 mutex_unlock(&ftrace_lock);
5462} 5814}
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 2d75c94ae87d..7a4104cb95cb 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -34,21 +34,19 @@ static void update_pages_handler(struct work_struct *work);
34 */ 34 */
35int ring_buffer_print_entry_header(struct trace_seq *s) 35int ring_buffer_print_entry_header(struct trace_seq *s)
36{ 36{
37 int ret; 37 trace_seq_puts(s, "# compressed entry header\n");
38 38 trace_seq_puts(s, "\ttype_len : 5 bits\n");
39 ret = trace_seq_puts(s, "# compressed entry header\n"); 39 trace_seq_puts(s, "\ttime_delta : 27 bits\n");
40 ret = trace_seq_puts(s, "\ttype_len : 5 bits\n"); 40 trace_seq_puts(s, "\tarray : 32 bits\n");
41 ret = trace_seq_puts(s, "\ttime_delta : 27 bits\n"); 41 trace_seq_putc(s, '\n');
42 ret = trace_seq_puts(s, "\tarray : 32 bits\n"); 42 trace_seq_printf(s, "\tpadding : type == %d\n",
43 ret = trace_seq_putc(s, '\n'); 43 RINGBUF_TYPE_PADDING);
44 ret = trace_seq_printf(s, "\tpadding : type == %d\n", 44 trace_seq_printf(s, "\ttime_extend : type == %d\n",
45 RINGBUF_TYPE_PADDING); 45 RINGBUF_TYPE_TIME_EXTEND);
46 ret = trace_seq_printf(s, "\ttime_extend : type == %d\n", 46 trace_seq_printf(s, "\tdata max type_len == %d\n",
47 RINGBUF_TYPE_TIME_EXTEND); 47 RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
48 ret = trace_seq_printf(s, "\tdata max type_len == %d\n",
49 RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
50 48
51 return ret; 49 return !trace_seq_has_overflowed(s);
52} 50}
53 51
54/* 52/*
@@ -419,32 +417,31 @@ static inline int test_time_stamp(u64 delta)
419int ring_buffer_print_page_header(struct trace_seq *s) 417int ring_buffer_print_page_header(struct trace_seq *s)
420{ 418{
421 struct buffer_data_page field; 419 struct buffer_data_page field;
422 int ret;
423 420
424 ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t" 421 trace_seq_printf(s, "\tfield: u64 timestamp;\t"
425 "offset:0;\tsize:%u;\tsigned:%u;\n", 422 "offset:0;\tsize:%u;\tsigned:%u;\n",
426 (unsigned int)sizeof(field.time_stamp), 423 (unsigned int)sizeof(field.time_stamp),
427 (unsigned int)is_signed_type(u64)); 424 (unsigned int)is_signed_type(u64));
428
429 ret = trace_seq_printf(s, "\tfield: local_t commit;\t"
430 "offset:%u;\tsize:%u;\tsigned:%u;\n",
431 (unsigned int)offsetof(typeof(field), commit),
432 (unsigned int)sizeof(field.commit),
433 (unsigned int)is_signed_type(long));
434
435 ret = trace_seq_printf(s, "\tfield: int overwrite;\t"
436 "offset:%u;\tsize:%u;\tsigned:%u;\n",
437 (unsigned int)offsetof(typeof(field), commit),
438 1,
439 (unsigned int)is_signed_type(long));
440
441 ret = trace_seq_printf(s, "\tfield: char data;\t"
442 "offset:%u;\tsize:%u;\tsigned:%u;\n",
443 (unsigned int)offsetof(typeof(field), data),
444 (unsigned int)BUF_PAGE_SIZE,
445 (unsigned int)is_signed_type(char));
446 425
447 return ret; 426 trace_seq_printf(s, "\tfield: local_t commit;\t"
427 "offset:%u;\tsize:%u;\tsigned:%u;\n",
428 (unsigned int)offsetof(typeof(field), commit),
429 (unsigned int)sizeof(field.commit),
430 (unsigned int)is_signed_type(long));
431
432 trace_seq_printf(s, "\tfield: int overwrite;\t"
433 "offset:%u;\tsize:%u;\tsigned:%u;\n",
434 (unsigned int)offsetof(typeof(field), commit),
435 1,
436 (unsigned int)is_signed_type(long));
437
438 trace_seq_printf(s, "\tfield: char data;\t"
439 "offset:%u;\tsize:%u;\tsigned:%u;\n",
440 (unsigned int)offsetof(typeof(field), data),
441 (unsigned int)BUF_PAGE_SIZE,
442 (unsigned int)is_signed_type(char));
443
444 return !trace_seq_has_overflowed(s);
448} 445}
449 446
450struct rb_irq_work { 447struct rb_irq_work {
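
Illustrative aside: both conversions above stop checking every trace_seq_*() return value and instead ask trace_seq_has_overflowed() once at the end. A tiny standalone buffer type shows why that is safe: a write that does not fit sets a sticky overflow flag and later writes become no-ops. The struct and helpers here are invented for the sketch, not the real trace_seq.

#include <stdarg.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

struct seq {
	char   buf[64];
	size_t len;
	bool   overflowed;	/* sticky: once set, it stays set */
};

static void seq_puts(struct seq *s, const char *str)
{
	size_t n = strlen(str);

	if (s->overflowed || s->len + n >= sizeof(s->buf)) {
		s->overflowed = true;	/* record the failure, keep going */
		return;
	}
	memcpy(s->buf + s->len, str, n + 1);
	s->len += n;
}

static void seq_printf(struct seq *s, const char *fmt, ...)
{
	char tmp[64];
	va_list ap;

	va_start(ap, fmt);
	vsnprintf(tmp, sizeof(tmp), fmt, ap);
	va_end(ap);
	seq_puts(s, tmp);
}

static bool seq_has_overflowed(const struct seq *s)
{
	return s->overflowed;
}

int main(void)
{
	struct seq s = { .len = 0, .overflowed = false };

	seq_puts(&s, "# compressed entry header\n");
	seq_printf(&s, "\tpadding : type == %d\n", 29);

	/* One check at the end replaces checking every call's return value. */
	printf("%sok=%d\n", s.buf, !seq_has_overflowed(&s));
	return 0;
}
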
@@ -538,16 +535,18 @@ static void rb_wake_up_waiters(struct irq_work *work)
538 * ring_buffer_wait - wait for input to the ring buffer 535 * ring_buffer_wait - wait for input to the ring buffer
539 * @buffer: buffer to wait on 536 * @buffer: buffer to wait on
540 * @cpu: the cpu buffer to wait on 537 * @cpu: the cpu buffer to wait on
538 * @full: wait until a full page is available, if @cpu != RING_BUFFER_ALL_CPUS
541 * 539 *
542 * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon 540 * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon
543 * as data is added to any of the @buffer's cpu buffers. Otherwise 541 * as data is added to any of the @buffer's cpu buffers. Otherwise
544 * it will wait for data to be added to a specific cpu buffer. 542 * it will wait for data to be added to a specific cpu buffer.
545 */ 543 */
546int ring_buffer_wait(struct ring_buffer *buffer, int cpu) 544int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full)
547{ 545{
548 struct ring_buffer_per_cpu *cpu_buffer; 546 struct ring_buffer_per_cpu *uninitialized_var(cpu_buffer);
549 DEFINE_WAIT(wait); 547 DEFINE_WAIT(wait);
550 struct rb_irq_work *work; 548 struct rb_irq_work *work;
549 int ret = 0;
551 550
552 /* 551 /*
553 * Depending on what the caller is waiting for, either any 552 * Depending on what the caller is waiting for, either any
@@ -564,36 +563,61 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu)
564 } 563 }
565 564
566 565
567 prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE); 566 while (true) {
567 prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE);
568 568
569 /* 569 /*
570 * The events can happen in critical sections where 570 * The events can happen in critical sections where
571 * checking a work queue can cause deadlocks. 571 * checking a work queue can cause deadlocks.
572 * After adding a task to the queue, this flag is set 572 * After adding a task to the queue, this flag is set
573 * only to notify events to try to wake up the queue 573 * only to notify events to try to wake up the queue
574 * using irq_work. 574 * using irq_work.
575 * 575 *
576 * We don't clear it even if the buffer is no longer 576 * We don't clear it even if the buffer is no longer
577 * empty. The flag only causes the next event to run 577 * empty. The flag only causes the next event to run
578 * irq_work to do the work queue wake up. The worse 578 * irq_work to do the work queue wake up. The worse
579 * that can happen if we race with !trace_empty() is that 579 * that can happen if we race with !trace_empty() is that
580 * an event will cause an irq_work to try to wake up 580 * an event will cause an irq_work to try to wake up
581 * an empty queue. 581 * an empty queue.
582 * 582 *
583 * There's no reason to protect this flag either, as 583 * There's no reason to protect this flag either, as
584 * the work queue and irq_work logic will do the necessary 584 * the work queue and irq_work logic will do the necessary
585 * synchronization for the wake ups. The only thing 585 * synchronization for the wake ups. The only thing
586 * that is necessary is that the wake up happens after 586 * that is necessary is that the wake up happens after
587 * a task has been queued. It's OK for spurious wake ups. 587 * a task has been queued. It's OK for spurious wake ups.
588 */ 588 */
589 work->waiters_pending = true; 589 work->waiters_pending = true;
590
591 if (signal_pending(current)) {
592 ret = -EINTR;
593 break;
594 }
595
596 if (cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer))
597 break;
598
599 if (cpu != RING_BUFFER_ALL_CPUS &&
600 !ring_buffer_empty_cpu(buffer, cpu)) {
601 unsigned long flags;
602 bool pagebusy;
603
604 if (!full)
605 break;
606
607 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
608 pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page;
609 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
610
611 if (!pagebusy)
612 break;
613 }
590 614
591 if ((cpu == RING_BUFFER_ALL_CPUS && ring_buffer_empty(buffer)) ||
592 (cpu != RING_BUFFER_ALL_CPUS && ring_buffer_empty_cpu(buffer, cpu)))
593 schedule(); 615 schedule();
616 }
594 617
595 finish_wait(&work->waiters, &wait); 618 finish_wait(&work->waiters, &wait);
596 return 0; 619
620 return ret;
597} 621}
598 622
599/** 623/**
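
Illustrative aside: the rewritten ring_buffer_wait() above turns a single sleep into a loop of prepare-to-wait, set waiters_pending, re-check the wake condition (signal, any data, or a full page), and only then schedule. A pthread sketch of that prepare/check/sleep shape, with a condition variable standing in for the wait queue and irq_work; it is a model of the pattern, not the kernel code.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  waiters = PTHREAD_COND_INITIALIZER;
static bool waiters_pending;	/* hint for the producer to signal */
static int  entries;		/* stand-in for "buffer not empty" */

static void *producer(void *arg)
{
	(void)arg;
	usleep(100 * 1000);
	pthread_mutex_lock(&lock);
	entries++;
	if (waiters_pending) {		/* mirrors the irq_work wake-up hint */
		waiters_pending = false;
		pthread_cond_broadcast(&waiters);
	}
	pthread_mutex_unlock(&lock);
	return NULL;
}

static int wait_for_data(void)
{
	pthread_mutex_lock(&lock);
	while (true) {
		waiters_pending = true;		/* set before checking the condition */
		if (entries > 0)		/* condition re-checked each iteration */
			break;
		pthread_cond_wait(&waiters, &lock);	/* the "schedule()" step */
	}
	pthread_mutex_unlock(&lock);
	return 0;
}

int main(void)
{
	pthread_t tid;

	pthread_create(&tid, NULL, producer, NULL);
	wait_for_data();
	printf("woken with %d entries\n", entries);
	pthread_join(tid, NULL);
	return 0;
}
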
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 0434ff1b808e..3f9e328c30b5 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -205,7 +205,6 @@ static void ring_buffer_consumer(void)
205 break; 205 break;
206 206
207 schedule(); 207 schedule();
208 __set_current_state(TASK_RUNNING);
209 } 208 }
210 reader_finish = 0; 209 reader_finish = 0;
211 complete(&read_done); 210 complete(&read_done);
@@ -379,7 +378,6 @@ static int ring_buffer_consumer_thread(void *arg)
379 break; 378 break;
380 379
381 schedule(); 380 schedule();
382 __set_current_state(TASK_RUNNING);
383 } 381 }
384 __set_current_state(TASK_RUNNING); 382 __set_current_state(TASK_RUNNING);
385 383
@@ -407,7 +405,6 @@ static int ring_buffer_producer_thread(void *arg)
407 trace_printk("Sleeping for 10 secs\n"); 405 trace_printk("Sleeping for 10 secs\n");
408 set_current_state(TASK_INTERRUPTIBLE); 406 set_current_state(TASK_INTERRUPTIBLE);
409 schedule_timeout(HZ * SLEEP_TIME); 407 schedule_timeout(HZ * SLEEP_TIME);
410 __set_current_state(TASK_RUNNING);
411 } 408 }
412 409
413 if (kill_test) 410 if (kill_test)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8a528392b1f4..2e767972e99c 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -63,6 +63,10 @@ static bool __read_mostly tracing_selftest_running;
63 */ 63 */
64bool __read_mostly tracing_selftest_disabled; 64bool __read_mostly tracing_selftest_disabled;
65 65
66/* Pipe tracepoints to printk */
67struct trace_iterator *tracepoint_print_iter;
68int tracepoint_printk;
69
66/* For tracers that don't implement custom flags */ 70/* For tracers that don't implement custom flags */
67static struct tracer_opt dummy_tracer_opt[] = { 71static struct tracer_opt dummy_tracer_opt[] = {
68 { } 72 { }
@@ -155,10 +159,11 @@ __setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
155 159
156static int __init stop_trace_on_warning(char *str) 160static int __init stop_trace_on_warning(char *str)
157{ 161{
158 __disable_trace_on_warning = 1; 162 if ((strcmp(str, "=0") != 0 && strcmp(str, "=off") != 0))
163 __disable_trace_on_warning = 1;
159 return 1; 164 return 1;
160} 165}
161__setup("traceoff_on_warning=", stop_trace_on_warning); 166__setup("traceoff_on_warning", stop_trace_on_warning);
162 167
163static int __init boot_alloc_snapshot(char *str) 168static int __init boot_alloc_snapshot(char *str)
164{ 169{
@@ -192,6 +197,13 @@ static int __init set_trace_boot_clock(char *str)
192} 197}
193__setup("trace_clock=", set_trace_boot_clock); 198__setup("trace_clock=", set_trace_boot_clock);
194 199
200static int __init set_tracepoint_printk(char *str)
201{
202 if ((strcmp(str, "=0") != 0 && strcmp(str, "=off") != 0))
203 tracepoint_printk = 1;
204 return 1;
205}
206__setup("tp_printk", set_tracepoint_printk);
195 207
196unsigned long long ns2usecs(cycle_t nsec) 208unsigned long long ns2usecs(cycle_t nsec)
197{ 209{
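
Illustrative aside: because the __setup() names above no longer end in '=', the handlers now treat a bare parameter or anything other than an "=0"/"=off" suffix as enabling the option. A minimal model of that rule; the function name is made up.

#include <stdio.h>
#include <string.h>

/* Returns 1 when the option should be enabled: bare "tp_printk",
 * "tp_printk=1", and similar; "=0" and "=off" disable it. */
static int parse_bool_param(const char *suffix)
{
	if (strcmp(suffix, "=0") == 0 || strcmp(suffix, "=off") == 0)
		return 0;
	return 1;
}

int main(void)
{
	const char *cases[] = { "", "=1", "=0", "=off" };

	for (size_t i = 0; i < sizeof(cases) / sizeof(cases[0]); i++)
		printf("tp_printk%s -> %d\n", cases[i], parse_bool_param(cases[i]));
	return 0;
}
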
@@ -938,19 +950,20 @@ out:
938 return ret; 950 return ret;
939} 951}
940 952
953/* TODO add a seq_buf_to_buffer() */
941static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) 954static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
942{ 955{
943 int len; 956 int len;
944 957
945 if (s->len <= s->readpos) 958 if (trace_seq_used(s) <= s->seq.readpos)
946 return -EBUSY; 959 return -EBUSY;
947 960
948 len = s->len - s->readpos; 961 len = trace_seq_used(s) - s->seq.readpos;
949 if (cnt > len) 962 if (cnt > len)
950 cnt = len; 963 cnt = len;
951 memcpy(buf, s->buffer + s->readpos, cnt); 964 memcpy(buf, s->buffer + s->seq.readpos, cnt);
952 965
953 s->readpos += cnt; 966 s->seq.readpos += cnt;
954 return cnt; 967 return cnt;
955} 968}
956 969
@@ -1076,13 +1089,14 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
1076} 1089}
1077#endif /* CONFIG_TRACER_MAX_TRACE */ 1090#endif /* CONFIG_TRACER_MAX_TRACE */
1078 1091
1079static int wait_on_pipe(struct trace_iterator *iter) 1092static int wait_on_pipe(struct trace_iterator *iter, bool full)
1080{ 1093{
1081 /* Iterators are static, they should be filled or empty */ 1094 /* Iterators are static, they should be filled or empty */
1082 if (trace_buffer_iter(iter, iter->cpu_file)) 1095 if (trace_buffer_iter(iter, iter->cpu_file))
1083 return 0; 1096 return 0;
1084 1097
1085 return ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file); 1098 return ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file,
1099 full);
1086} 1100}
1087 1101
1088#ifdef CONFIG_FTRACE_STARTUP_TEST 1102#ifdef CONFIG_FTRACE_STARTUP_TEST
@@ -2028,7 +2042,7 @@ void trace_printk_init_buffers(void)
2028 pr_warning("** trace_printk() being used. Allocating extra memory. **\n"); 2042 pr_warning("** trace_printk() being used. Allocating extra memory. **\n");
2029 pr_warning("** **\n"); 2043 pr_warning("** **\n");
2030 pr_warning("** This means that this is a DEBUG kernel and it is **\n"); 2044 pr_warning("** This means that this is a DEBUG kernel and it is **\n");
2031 pr_warning("** unsafe for produciton use. **\n"); 2045 pr_warning("** unsafe for production use. **\n");
2032 pr_warning("** **\n"); 2046 pr_warning("** **\n");
2033 pr_warning("** If you see this message and you are not debugging **\n"); 2047 pr_warning("** If you see this message and you are not debugging **\n");
2034 pr_warning("** the kernel, report this immediately to your vendor! **\n"); 2048 pr_warning("** the kernel, report this immediately to your vendor! **\n");
@@ -2157,9 +2171,7 @@ __trace_array_vprintk(struct ring_buffer *buffer,
2157 goto out; 2171 goto out;
2158 } 2172 }
2159 2173
2160 len = vsnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args); 2174 len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args);
2161 if (len > TRACE_BUF_SIZE)
2162 goto out;
2163 2175
2164 local_save_flags(flags); 2176 local_save_flags(flags);
2165 size = sizeof(*entry) + len + 1; 2177 size = sizeof(*entry) + len + 1;
@@ -2170,8 +2182,7 @@ __trace_array_vprintk(struct ring_buffer *buffer,
2170 entry = ring_buffer_event_data(event); 2182 entry = ring_buffer_event_data(event);
2171 entry->ip = ip; 2183 entry->ip = ip;
2172 2184
2173 memcpy(&entry->buf, tbuffer, len); 2185 memcpy(&entry->buf, tbuffer, len + 1);
2174 entry->buf[len] = '\0';
2175 if (!call_filter_check_discard(call, entry, buffer, event)) { 2186 if (!call_filter_check_discard(call, entry, buffer, event)) {
2176 __buffer_unlock_commit(buffer, event); 2187 __buffer_unlock_commit(buffer, event);
2177 ftrace_trace_stack(buffer, flags, 6, pc); 2188 ftrace_trace_stack(buffer, flags, 6, pc);
@@ -2508,14 +2519,14 @@ get_total_entries(struct trace_buffer *buf,
2508 2519
2509static void print_lat_help_header(struct seq_file *m) 2520static void print_lat_help_header(struct seq_file *m)
2510{ 2521{
2511 seq_puts(m, "# _------=> CPU# \n"); 2522 seq_puts(m, "# _------=> CPU# \n"
2512 seq_puts(m, "# / _-----=> irqs-off \n"); 2523 "# / _-----=> irqs-off \n"
2513 seq_puts(m, "# | / _----=> need-resched \n"); 2524 "# | / _----=> need-resched \n"
2514 seq_puts(m, "# || / _---=> hardirq/softirq \n"); 2525 "# || / _---=> hardirq/softirq \n"
2515 seq_puts(m, "# ||| / _--=> preempt-depth \n"); 2526 "# ||| / _--=> preempt-depth \n"
2516 seq_puts(m, "# |||| / delay \n"); 2527 "# |||| / delay \n"
2517 seq_puts(m, "# cmd pid ||||| time | caller \n"); 2528 "# cmd pid ||||| time | caller \n"
2518 seq_puts(m, "# \\ / ||||| \\ | / \n"); 2529 "# \\ / ||||| \\ | / \n");
2519} 2530}
2520 2531
2521static void print_event_info(struct trace_buffer *buf, struct seq_file *m) 2532static void print_event_info(struct trace_buffer *buf, struct seq_file *m)
@@ -2532,20 +2543,20 @@ static void print_event_info(struct trace_buffer *buf, struct seq_file *m)
2532static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m) 2543static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m)
2533{ 2544{
2534 print_event_info(buf, m); 2545 print_event_info(buf, m);
2535 seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); 2546 seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"
2536 seq_puts(m, "# | | | | |\n"); 2547 "# | | | | |\n");
2537} 2548}
2538 2549
2539static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m) 2550static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m)
2540{ 2551{
2541 print_event_info(buf, m); 2552 print_event_info(buf, m);
2542 seq_puts(m, "# _-----=> irqs-off\n"); 2553 seq_puts(m, "# _-----=> irqs-off\n"
2543 seq_puts(m, "# / _----=> need-resched\n"); 2554 "# / _----=> need-resched\n"
2544 seq_puts(m, "# | / _---=> hardirq/softirq\n"); 2555 "# | / _---=> hardirq/softirq\n"
2545 seq_puts(m, "# || / _--=> preempt-depth\n"); 2556 "# || / _--=> preempt-depth\n"
2546 seq_puts(m, "# ||| / delay\n"); 2557 "# ||| / delay\n"
2547 seq_puts(m, "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n"); 2558 "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n"
2548 seq_puts(m, "# | | | |||| | |\n"); 2559 "# | | | |||| | |\n");
2549} 2560}
2550 2561
2551void 2562void
@@ -2648,24 +2659,21 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
2648 event = ftrace_find_event(entry->type); 2659 event = ftrace_find_event(entry->type);
2649 2660
2650 if (trace_flags & TRACE_ITER_CONTEXT_INFO) { 2661 if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
2651 if (iter->iter_flags & TRACE_FILE_LAT_FMT) { 2662 if (iter->iter_flags & TRACE_FILE_LAT_FMT)
2652 if (!trace_print_lat_context(iter)) 2663 trace_print_lat_context(iter);
2653 goto partial; 2664 else
2654 } else { 2665 trace_print_context(iter);
2655 if (!trace_print_context(iter))
2656 goto partial;
2657 }
2658 } 2666 }
2659 2667
2668 if (trace_seq_has_overflowed(s))
2669 return TRACE_TYPE_PARTIAL_LINE;
2670
2660 if (event) 2671 if (event)
2661 return event->funcs->trace(iter, sym_flags, event); 2672 return event->funcs->trace(iter, sym_flags, event);
2662 2673
2663 if (!trace_seq_printf(s, "Unknown type %d\n", entry->type)) 2674 trace_seq_printf(s, "Unknown type %d\n", entry->type);
2664 goto partial;
2665 2675
2666 return TRACE_TYPE_HANDLED; 2676 return trace_handle_return(s);
2667partial:
2668 return TRACE_TYPE_PARTIAL_LINE;
2669} 2677}
2670 2678
2671static enum print_line_t print_raw_fmt(struct trace_iterator *iter) 2679static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
@@ -2676,22 +2684,20 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
2676 2684
2677 entry = iter->ent; 2685 entry = iter->ent;
2678 2686
2679 if (trace_flags & TRACE_ITER_CONTEXT_INFO) { 2687 if (trace_flags & TRACE_ITER_CONTEXT_INFO)
2680 if (!trace_seq_printf(s, "%d %d %llu ", 2688 trace_seq_printf(s, "%d %d %llu ",
2681 entry->pid, iter->cpu, iter->ts)) 2689 entry->pid, iter->cpu, iter->ts);
2682 goto partial; 2690
2683 } 2691 if (trace_seq_has_overflowed(s))
2692 return TRACE_TYPE_PARTIAL_LINE;
2684 2693
2685 event = ftrace_find_event(entry->type); 2694 event = ftrace_find_event(entry->type);
2686 if (event) 2695 if (event)
2687 return event->funcs->raw(iter, 0, event); 2696 return event->funcs->raw(iter, 0, event);
2688 2697
2689 if (!trace_seq_printf(s, "%d ?\n", entry->type)) 2698 trace_seq_printf(s, "%d ?\n", entry->type);
2690 goto partial;
2691 2699
2692 return TRACE_TYPE_HANDLED; 2700 return trace_handle_return(s);
2693partial:
2694 return TRACE_TYPE_PARTIAL_LINE;
2695} 2701}
2696 2702
2697static enum print_line_t print_hex_fmt(struct trace_iterator *iter) 2703static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
@@ -2704,9 +2710,11 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
2704 entry = iter->ent; 2710 entry = iter->ent;
2705 2711
2706 if (trace_flags & TRACE_ITER_CONTEXT_INFO) { 2712 if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
2707 SEQ_PUT_HEX_FIELD_RET(s, entry->pid); 2713 SEQ_PUT_HEX_FIELD(s, entry->pid);
2708 SEQ_PUT_HEX_FIELD_RET(s, iter->cpu); 2714 SEQ_PUT_HEX_FIELD(s, iter->cpu);
2709 SEQ_PUT_HEX_FIELD_RET(s, iter->ts); 2715 SEQ_PUT_HEX_FIELD(s, iter->ts);
2716 if (trace_seq_has_overflowed(s))
2717 return TRACE_TYPE_PARTIAL_LINE;
2710 } 2718 }
2711 2719
2712 event = ftrace_find_event(entry->type); 2720 event = ftrace_find_event(entry->type);
@@ -2716,9 +2724,9 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
2716 return ret; 2724 return ret;
2717 } 2725 }
2718 2726
2719 SEQ_PUT_FIELD_RET(s, newline); 2727 SEQ_PUT_FIELD(s, newline);
2720 2728
2721 return TRACE_TYPE_HANDLED; 2729 return trace_handle_return(s);
2722} 2730}
2723 2731
2724static enum print_line_t print_bin_fmt(struct trace_iterator *iter) 2732static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
@@ -2730,9 +2738,11 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
2730 entry = iter->ent; 2738 entry = iter->ent;
2731 2739
2732 if (trace_flags & TRACE_ITER_CONTEXT_INFO) { 2740 if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
2733 SEQ_PUT_FIELD_RET(s, entry->pid); 2741 SEQ_PUT_FIELD(s, entry->pid);
2734 SEQ_PUT_FIELD_RET(s, iter->cpu); 2742 SEQ_PUT_FIELD(s, iter->cpu);
2735 SEQ_PUT_FIELD_RET(s, iter->ts); 2743 SEQ_PUT_FIELD(s, iter->ts);
2744 if (trace_seq_has_overflowed(s))
2745 return TRACE_TYPE_PARTIAL_LINE;
2736 } 2746 }
2737 2747
2738 event = ftrace_find_event(entry->type); 2748 event = ftrace_find_event(entry->type);
@@ -2778,10 +2788,12 @@ enum print_line_t print_trace_line(struct trace_iterator *iter)
2778{ 2788{
2779 enum print_line_t ret; 2789 enum print_line_t ret;
2780 2790
2781 if (iter->lost_events && 2791 if (iter->lost_events) {
2782 !trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n", 2792 trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n",
2783 iter->cpu, iter->lost_events)) 2793 iter->cpu, iter->lost_events);
2784 return TRACE_TYPE_PARTIAL_LINE; 2794 if (trace_seq_has_overflowed(&iter->seq))
2795 return TRACE_TYPE_PARTIAL_LINE;
2796 }
2785 2797
2786 if (iter->trace && iter->trace->print_line) { 2798 if (iter->trace && iter->trace->print_line) {
2787 ret = iter->trace->print_line(iter); 2799 ret = iter->trace->print_line(iter);
@@ -2859,44 +2871,44 @@ static void test_ftrace_alive(struct seq_file *m)
2859{ 2871{
2860 if (!ftrace_is_dead()) 2872 if (!ftrace_is_dead())
2861 return; 2873 return;
2862 seq_printf(m, "# WARNING: FUNCTION TRACING IS CORRUPTED\n"); 2874 seq_puts(m, "# WARNING: FUNCTION TRACING IS CORRUPTED\n"
2863 seq_printf(m, "# MAY BE MISSING FUNCTION EVENTS\n"); 2875 "# MAY BE MISSING FUNCTION EVENTS\n");
2864} 2876}
2865 2877
2866#ifdef CONFIG_TRACER_MAX_TRACE 2878#ifdef CONFIG_TRACER_MAX_TRACE
2867static void show_snapshot_main_help(struct seq_file *m) 2879static void show_snapshot_main_help(struct seq_file *m)
2868{ 2880{
2869 seq_printf(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n"); 2881 seq_puts(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n"
2870 seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"); 2882 "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"
2871 seq_printf(m, "# Takes a snapshot of the main buffer.\n"); 2883 "# Takes a snapshot of the main buffer.\n"
2872 seq_printf(m, "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free)\n"); 2884 "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free)\n"
2873 seq_printf(m, "# (Doesn't have to be '2' works with any number that\n"); 2885 "# (Doesn't have to be '2' works with any number that\n"
2874 seq_printf(m, "# is not a '0' or '1')\n"); 2886 "# is not a '0' or '1')\n");
2875} 2887}
2876 2888
2877static void show_snapshot_percpu_help(struct seq_file *m) 2889static void show_snapshot_percpu_help(struct seq_file *m)
2878{ 2890{
2879 seq_printf(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n"); 2891 seq_puts(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n");
2880#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP 2892#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
2881 seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"); 2893 seq_puts(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"
2882 seq_printf(m, "# Takes a snapshot of the main buffer for this cpu.\n"); 2894 "# Takes a snapshot of the main buffer for this cpu.\n");
2883#else 2895#else
2884 seq_printf(m, "# echo 1 > snapshot : Not supported with this kernel.\n"); 2896 seq_puts(m, "# echo 1 > snapshot : Not supported with this kernel.\n"
2885 seq_printf(m, "# Must use main snapshot file to allocate.\n"); 2897 "# Must use main snapshot file to allocate.\n");
2886#endif 2898#endif
2887 seq_printf(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n"); 2899 seq_puts(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n"
2888 seq_printf(m, "# (Doesn't have to be '2' works with any number that\n"); 2900 "# (Doesn't have to be '2' works with any number that\n"
2889 seq_printf(m, "# is not a '0' or '1')\n"); 2901 "# is not a '0' or '1')\n");
2890} 2902}
2891 2903
2892static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) 2904static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter)
2893{ 2905{
2894 if (iter->tr->allocated_snapshot) 2906 if (iter->tr->allocated_snapshot)
2895 seq_printf(m, "#\n# * Snapshot is allocated *\n#\n"); 2907 seq_puts(m, "#\n# * Snapshot is allocated *\n#\n");
2896 else 2908 else
2897 seq_printf(m, "#\n# * Snapshot is freed *\n#\n"); 2909 seq_puts(m, "#\n# * Snapshot is freed *\n#\n");
2898 2910
2899 seq_printf(m, "# Snapshot commands:\n"); 2911 seq_puts(m, "# Snapshot commands:\n");
2900 if (iter->cpu_file == RING_BUFFER_ALL_CPUS) 2912 if (iter->cpu_file == RING_BUFFER_ALL_CPUS)
2901 show_snapshot_main_help(m); 2913 show_snapshot_main_help(m);
2902 else 2914 else
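The seq_printf() to seq_puts() rewrites in the two snapshot help functions above lean on a plain C feature: adjacent string literals are concatenated at compile time, so several fixed lines can be emitted by one call. A tiny standalone illustration:

#include <stdio.h>

int main(void)
{
	/*
	 * These two literals are joined by the compiler into one string,
	 * so a single fputs() emits both lines, which is the same trick
	 * the seq_puts() conversions above rely on.
	 */
	fputs("# echo 0 > snapshot : Clears and frees snapshot buffer\n"
	      "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n",
	      stdout);
	return 0;
}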
@@ -3250,7 +3262,7 @@ static int t_show(struct seq_file *m, void *v)
3250 if (!t) 3262 if (!t)
3251 return 0; 3263 return 0;
3252 3264
3253 seq_printf(m, "%s", t->name); 3265 seq_puts(m, t->name);
3254 if (t->next) 3266 if (t->next)
3255 seq_putc(m, ' '); 3267 seq_putc(m, ' ');
3256 else 3268 else
@@ -4313,6 +4325,8 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
4313 goto out; 4325 goto out;
4314 } 4326 }
4315 4327
4328 trace_seq_init(&iter->seq);
4329
4316 /* 4330 /*
4317 * We make a copy of the current tracer to avoid concurrent 4331 * We make a copy of the current tracer to avoid concurrent
4318 * changes on it while we are reading. 4332 * changes on it while we are reading.
@@ -4434,15 +4448,12 @@ static int tracing_wait_pipe(struct file *filp)
4434 4448
4435 mutex_unlock(&iter->mutex); 4449 mutex_unlock(&iter->mutex);
4436 4450
4437 ret = wait_on_pipe(iter); 4451 ret = wait_on_pipe(iter, false);
4438 4452
4439 mutex_lock(&iter->mutex); 4453 mutex_lock(&iter->mutex);
4440 4454
4441 if (ret) 4455 if (ret)
4442 return ret; 4456 return ret;
4443
4444 if (signal_pending(current))
4445 return -EINTR;
4446 } 4457 }
4447 4458
4448 return 1; 4459 return 1;
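The deleted signal_pending() check here (and the matching deletions in the two buffer-read paths later in this file) reflects wait_on_pipe() growing a second argument and reporting pending signals itself, so callers only inspect the return value. The sketch below only illustrates that calling convention; the fake pipe state and helper names are invented, and wait_on_pipe()'s real internals are not shown in this diff.

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/* Pretend pipe state, just for the sketch. */
static bool data_ready = false;
static bool signal_arrived = true;	/* pretend a signal came in */

/*
 * The wait helper owns the signal check and reports it as -EINTR,
 * mirroring how callers now treat wait_on_pipe(iter, full).
 */
static int wait_on_pipe_sketch(bool full)
{
	(void)full;	/* "wait for a full page" vs "any data" */
	if (signal_arrived)
		return -EINTR;
	data_ready = true;	/* pretend the writer filled the buffer */
	return 0;
}

static int tracing_wait_pipe_sketch(void)
{
	while (!data_ready) {
		int ret = wait_on_pipe_sketch(false);

		/* The caller no longer checks signals itself. */
		if (ret)
			return ret;
	}
	return 1;
}

int main(void)
{
	printf("tracing_wait_pipe_sketch() = %d (expect %d)\n",
	       tracing_wait_pipe_sketch(), -EINTR);
	return 0;
}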
@@ -4509,18 +4520,18 @@ waitagain:
4509 trace_access_lock(iter->cpu_file); 4520 trace_access_lock(iter->cpu_file);
4510 while (trace_find_next_entry_inc(iter) != NULL) { 4521 while (trace_find_next_entry_inc(iter) != NULL) {
4511 enum print_line_t ret; 4522 enum print_line_t ret;
4512 int len = iter->seq.len; 4523 int save_len = iter->seq.seq.len;
4513 4524
4514 ret = print_trace_line(iter); 4525 ret = print_trace_line(iter);
4515 if (ret == TRACE_TYPE_PARTIAL_LINE) { 4526 if (ret == TRACE_TYPE_PARTIAL_LINE) {
4516 /* don't print partial lines */ 4527 /* don't print partial lines */
4517 iter->seq.len = len; 4528 iter->seq.seq.len = save_len;
4518 break; 4529 break;
4519 } 4530 }
4520 if (ret != TRACE_TYPE_NO_CONSUME) 4531 if (ret != TRACE_TYPE_NO_CONSUME)
4521 trace_consume(iter); 4532 trace_consume(iter);
4522 4533
4523 if (iter->seq.len >= cnt) 4534 if (trace_seq_used(&iter->seq) >= cnt)
4524 break; 4535 break;
4525 4536
4526 /* 4537 /*
@@ -4536,7 +4547,7 @@ waitagain:
4536 4547
4537 /* Now copy what we have to the user */ 4548 /* Now copy what we have to the user */
4538 sret = trace_seq_to_user(&iter->seq, ubuf, cnt); 4549 sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
4539 if (iter->seq.readpos >= iter->seq.len) 4550 if (iter->seq.seq.readpos >= trace_seq_used(&iter->seq))
4540 trace_seq_init(&iter->seq); 4551 trace_seq_init(&iter->seq);
4541 4552
4542 /* 4553 /*
@@ -4570,20 +4581,33 @@ static size_t
4570tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter) 4581tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter)
4571{ 4582{
4572 size_t count; 4583 size_t count;
4584 int save_len;
4573 int ret; 4585 int ret;
4574 4586
4575 /* Seq buffer is page-sized, exactly what we need. */ 4587 /* Seq buffer is page-sized, exactly what we need. */
4576 for (;;) { 4588 for (;;) {
4577 count = iter->seq.len; 4589 save_len = iter->seq.seq.len;
4578 ret = print_trace_line(iter); 4590 ret = print_trace_line(iter);
4579 count = iter->seq.len - count; 4591
4580 if (rem < count) { 4592 if (trace_seq_has_overflowed(&iter->seq)) {
4581 rem = 0; 4593 iter->seq.seq.len = save_len;
4582 iter->seq.len -= count;
4583 break; 4594 break;
4584 } 4595 }
4596
4597 /*
4598 * This should not be hit, because it should only
4599 * be set if the iter->seq overflowed. But check it
4600 * anyway to be safe.
4601 */
4585 if (ret == TRACE_TYPE_PARTIAL_LINE) { 4602 if (ret == TRACE_TYPE_PARTIAL_LINE) {
4586 iter->seq.len -= count; 4603 iter->seq.seq.len = save_len;
4604 break;
4605 }
4606
4607 count = trace_seq_used(&iter->seq) - save_len;
4608 if (rem < count) {
4609 rem = 0;
4610 iter->seq.seq.len = save_len;
4587 break; 4611 break;
4588 } 4612 }
4589 4613
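tracing_fill_pipe_page() now snapshots the sequence length before rendering each line and rolls back to that checkpoint when the line overflows or will not fit in the remaining page. A small self-contained sketch of the checkpoint-and-rollback idea; the page_buf type is a stand-in, not the kernel's trace_seq.

#include <stdio.h>
#include <string.h>

struct page_buf {
	char data[32];		/* deliberately tiny "page" */
	size_t len;
};

/* Append one rendered line; return 0 on success, -1 if it did not fit. */
static int emit_line(struct page_buf *p, const char *line)
{
	size_t save_len = p->len;		/* checkpoint before rendering */
	size_t room = sizeof(p->data) - p->len;
	size_t n = strlen(line);

	/* Render first (possibly truncated), decide afterwards. */
	snprintf(p->data + p->len, room, "%s", line);

	if (n >= room) {			/* the line did not fully fit */
		p->len = save_len;		/* roll the partial line back */
		p->data[p->len] = '\0';
		return -1;
	}
	p->len += n;
	return 0;
}

int main(void)
{
	struct page_buf p = { .len = 0 };
	const char *lines[] = {
		"line one\n", "line two\n",
		"a line that is far too long to fit\n",
	};

	for (size_t i = 0; i < sizeof(lines) / sizeof(lines[0]); i++)
		if (emit_line(&p, lines[i]))
			break;	/* stop at the first line that does not fit */

	fwrite(p.data, 1, p.len, stdout);
	return 0;
}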
@@ -4664,13 +4688,13 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
4664 /* Copy the data into the page, so we can start over. */ 4688 /* Copy the data into the page, so we can start over. */
4665 ret = trace_seq_to_buffer(&iter->seq, 4689 ret = trace_seq_to_buffer(&iter->seq,
4666 page_address(spd.pages[i]), 4690 page_address(spd.pages[i]),
4667 iter->seq.len); 4691 trace_seq_used(&iter->seq));
4668 if (ret < 0) { 4692 if (ret < 0) {
4669 __free_page(spd.pages[i]); 4693 __free_page(spd.pages[i]);
4670 break; 4694 break;
4671 } 4695 }
4672 spd.partial[i].offset = 0; 4696 spd.partial[i].offset = 0;
4673 spd.partial[i].len = iter->seq.len; 4697 spd.partial[i].len = trace_seq_used(&iter->seq);
4674 4698
4675 trace_seq_init(&iter->seq); 4699 trace_seq_init(&iter->seq);
4676 } 4700 }
@@ -5372,16 +5396,12 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
5372 goto out_unlock; 5396 goto out_unlock;
5373 } 5397 }
5374 mutex_unlock(&trace_types_lock); 5398 mutex_unlock(&trace_types_lock);
5375 ret = wait_on_pipe(iter); 5399 ret = wait_on_pipe(iter, false);
5376 mutex_lock(&trace_types_lock); 5400 mutex_lock(&trace_types_lock);
5377 if (ret) { 5401 if (ret) {
5378 size = ret; 5402 size = ret;
5379 goto out_unlock; 5403 goto out_unlock;
5380 } 5404 }
5381 if (signal_pending(current)) {
5382 size = -EINTR;
5383 goto out_unlock;
5384 }
5385 goto again; 5405 goto again;
5386 } 5406 }
5387 size = 0; 5407 size = 0;
@@ -5500,7 +5520,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
5500 }; 5520 };
5501 struct buffer_ref *ref; 5521 struct buffer_ref *ref;
5502 int entries, size, i; 5522 int entries, size, i;
5503 ssize_t ret; 5523 ssize_t ret = 0;
5504 5524
5505 mutex_lock(&trace_types_lock); 5525 mutex_lock(&trace_types_lock);
5506 5526
@@ -5538,13 +5558,16 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
5538 int r; 5558 int r;
5539 5559
5540 ref = kzalloc(sizeof(*ref), GFP_KERNEL); 5560 ref = kzalloc(sizeof(*ref), GFP_KERNEL);
5541 if (!ref) 5561 if (!ref) {
5562 ret = -ENOMEM;
5542 break; 5563 break;
5564 }
5543 5565
5544 ref->ref = 1; 5566 ref->ref = 1;
5545 ref->buffer = iter->trace_buffer->buffer; 5567 ref->buffer = iter->trace_buffer->buffer;
5546 ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file); 5568 ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file);
5547 if (!ref->page) { 5569 if (!ref->page) {
5570 ret = -ENOMEM;
5548 kfree(ref); 5571 kfree(ref);
5549 break; 5572 break;
5550 } 5573 }
@@ -5582,19 +5605,19 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
5582 5605
5583 /* did we read anything? */ 5606 /* did we read anything? */
5584 if (!spd.nr_pages) { 5607 if (!spd.nr_pages) {
5608 if (ret)
5609 goto out;
5610
5585 if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) { 5611 if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) {
5586 ret = -EAGAIN; 5612 ret = -EAGAIN;
5587 goto out; 5613 goto out;
5588 } 5614 }
5589 mutex_unlock(&trace_types_lock); 5615 mutex_unlock(&trace_types_lock);
5590 ret = wait_on_pipe(iter); 5616 ret = wait_on_pipe(iter, true);
5591 mutex_lock(&trace_types_lock); 5617 mutex_lock(&trace_types_lock);
5592 if (ret) 5618 if (ret)
5593 goto out; 5619 goto out;
5594 if (signal_pending(current)) { 5620
5595 ret = -EINTR;
5596 goto out;
5597 }
5598 goto again; 5621 goto again;
5599 } 5622 }
5600 5623
@@ -5671,7 +5694,8 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
5671 cnt = ring_buffer_read_events_cpu(trace_buf->buffer, cpu); 5694 cnt = ring_buffer_read_events_cpu(trace_buf->buffer, cpu);
5672 trace_seq_printf(s, "read events: %ld\n", cnt); 5695 trace_seq_printf(s, "read events: %ld\n", cnt);
5673 5696
5674 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); 5697 count = simple_read_from_buffer(ubuf, count, ppos,
5698 s->buffer, trace_seq_used(s));
5675 5699
5676 kfree(s); 5700 kfree(s);
5677 5701
@@ -5752,10 +5776,10 @@ ftrace_snapshot_print(struct seq_file *m, unsigned long ip,
5752 5776
5753 seq_printf(m, "%ps:", (void *)ip); 5777 seq_printf(m, "%ps:", (void *)ip);
5754 5778
5755 seq_printf(m, "snapshot"); 5779 seq_puts(m, "snapshot");
5756 5780
5757 if (count == -1) 5781 if (count == -1)
5758 seq_printf(m, ":unlimited\n"); 5782 seq_puts(m, ":unlimited\n");
5759 else 5783 else
5760 seq_printf(m, ":count=%ld\n", count); 5784 seq_printf(m, ":count=%ld\n", count);
5761 5785
@@ -6420,7 +6444,7 @@ static int instance_mkdir (struct inode *inode, struct dentry *dentry, umode_t m
6420 int ret; 6444 int ret;
6421 6445
6422 /* Paranoid: Make sure the parent is the "instances" directory */ 6446 /* Paranoid: Make sure the parent is the "instances" directory */
6423 parent = hlist_entry(inode->i_dentry.first, struct dentry, d_alias); 6447 parent = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias);
6424 if (WARN_ON_ONCE(parent != trace_instance_dir)) 6448 if (WARN_ON_ONCE(parent != trace_instance_dir))
6425 return -ENOENT; 6449 return -ENOENT;
6426 6450
@@ -6447,7 +6471,7 @@ static int instance_rmdir(struct inode *inode, struct dentry *dentry)
6447 int ret; 6471 int ret;
6448 6472
6449 /* Paranoid: Make sure the parent is the "instances" directory */ 6473 /* Paranoid: Make sure the parent is the "instances" directory */
6450 parent = hlist_entry(inode->i_dentry.first, struct dentry, d_alias); 6474 parent = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias);
6451 if (WARN_ON_ONCE(parent != trace_instance_dir)) 6475 if (WARN_ON_ONCE(parent != trace_instance_dir))
6452 return -ENOENT; 6476 return -ENOENT;
6453 6477
@@ -6634,11 +6658,19 @@ void
6634trace_printk_seq(struct trace_seq *s) 6658trace_printk_seq(struct trace_seq *s)
6635{ 6659{
6636 /* Probably should print a warning here. */ 6660 /* Probably should print a warning here. */
6637 if (s->len >= TRACE_MAX_PRINT) 6661 if (s->seq.len >= TRACE_MAX_PRINT)
6638 s->len = TRACE_MAX_PRINT; 6662 s->seq.len = TRACE_MAX_PRINT;
6663
6664 /*
6665 * More paranoid code. Although the buffer size is set to
6666 * PAGE_SIZE, and TRACE_MAX_PRINT is 1000, this is just
6667 * an extra layer of protection.
6668 */
6669 if (WARN_ON_ONCE(s->seq.len >= s->seq.size))
6670 s->seq.len = s->seq.size - 1;
6639 6671
6640 /* should be zero ended, but we are paranoid. */ 6672 /* should be zero ended, but we are paranoid. */
6641 s->buffer[s->len] = 0; 6673 s->buffer[s->seq.len] = 0;
6642 6674
6643 printk(KERN_TRACE "%s", s->buffer); 6675 printk(KERN_TRACE "%s", s->buffer);
6644 6676
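The repeated s->len to s->seq.len and trace_seq_used() substitutions exist because trace_seq now embeds a generic seq_buf. The actual structure definitions are not part of this diff, so the sketch below is only an assumed shape, chosen to match the fields the hunks dereference, and to show why the paranoid clamp in trace_printk_seq() and the trace_seq_used() accessor make sense.

#include <stdio.h>

/* Assumed layout only: field names mirror what the diff dereferences. */
struct seq_buf {
	char		*buffer;
	size_t		size;
	size_t		len;
	size_t		readpos;
};

struct trace_seq_sketch {
	char		buffer[64];
	struct seq_buf	seq;		/* hence s->seq.len in the hunks */
};

/* Never report more than the buffer can actually hold. */
static size_t trace_seq_used_sketch(const struct trace_seq_sketch *s)
{
	return s->seq.len > s->seq.size ? s->seq.size : s->seq.len;
}

int main(void)
{
	struct trace_seq_sketch s = {
		.seq = { .size = sizeof(s.buffer), .len = 100, .readpos = 0 },
	};

	s.seq.buffer = s.buffer;
	printf("len=%zu used=%zu\n", s.seq.len, trace_seq_used_sketch(&s));
	return 0;
}

Reporting the smaller of len and size means a caller can never copy past the end of the buffer even if a writer overshot the length field.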
@@ -6877,6 +6909,19 @@ out:
6877 return ret; 6909 return ret;
6878} 6910}
6879 6911
6912void __init trace_init(void)
6913{
6914 if (tracepoint_printk) {
6915 tracepoint_print_iter =
6916 kmalloc(sizeof(*tracepoint_print_iter), GFP_KERNEL);
6917 if (WARN_ON(!tracepoint_print_iter))
6918 tracepoint_printk = 0;
6919 }
6920 tracer_alloc_buffers();
6921 init_ftrace_syscalls();
6922 trace_event_init();
6923}
6924
6880__init static int clear_boot_tracer(void) 6925__init static int clear_boot_tracer(void)
6881{ 6926{
6882 /* 6927 /*
@@ -6896,6 +6941,5 @@ __init static int clear_boot_tracer(void)
6896 return 0; 6941 return 0;
6897} 6942}
6898 6943
6899early_initcall(tracer_alloc_buffers);
6900fs_initcall(tracer_init_debugfs); 6944fs_initcall(tracer_init_debugfs);
6901late_initcall(clear_boot_tracer); 6945late_initcall(clear_boot_tracer);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 385391fb1d3b..8de48bac1ce2 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -14,6 +14,7 @@
14#include <linux/trace_seq.h> 14#include <linux/trace_seq.h>
15#include <linux/ftrace_event.h> 15#include <linux/ftrace_event.h>
16#include <linux/compiler.h> 16#include <linux/compiler.h>
17#include <linux/trace_seq.h>
17 18
18#ifdef CONFIG_FTRACE_SYSCALLS 19#ifdef CONFIG_FTRACE_SYSCALLS
19#include <asm/unistd.h> /* For NR_SYSCALLS */ 20#include <asm/unistd.h> /* For NR_SYSCALLS */
@@ -569,15 +570,6 @@ void trace_init_global_iter(struct trace_iterator *iter);
569 570
570void tracing_iter_reset(struct trace_iterator *iter, int cpu); 571void tracing_iter_reset(struct trace_iterator *iter, int cpu);
571 572
572void tracing_sched_switch_trace(struct trace_array *tr,
573 struct task_struct *prev,
574 struct task_struct *next,
575 unsigned long flags, int pc);
576
577void tracing_sched_wakeup_trace(struct trace_array *tr,
578 struct task_struct *wakee,
579 struct task_struct *cur,
580 unsigned long flags, int pc);
581void trace_function(struct trace_array *tr, 573void trace_function(struct trace_array *tr,
582 unsigned long ip, 574 unsigned long ip,
583 unsigned long parent_ip, 575 unsigned long parent_ip,
@@ -597,9 +589,6 @@ void set_graph_array(struct trace_array *tr);
597 589
598void tracing_start_cmdline_record(void); 590void tracing_start_cmdline_record(void);
599void tracing_stop_cmdline_record(void); 591void tracing_stop_cmdline_record(void);
600void tracing_sched_switch_assign_trace(struct trace_array *tr);
601void tracing_stop_sched_switch_record(void);
602void tracing_start_sched_switch_record(void);
603int register_tracer(struct tracer *type); 592int register_tracer(struct tracer *type);
604int is_tracing_stopped(void); 593int is_tracing_stopped(void);
605 594
@@ -719,6 +708,8 @@ enum print_line_t print_trace_line(struct trace_iterator *iter);
719 708
720extern unsigned long trace_flags; 709extern unsigned long trace_flags;
721 710
711extern char trace_find_mark(unsigned long long duration);
712
722/* Standard output formatting function used for function return traces */ 713/* Standard output formatting function used for function return traces */
723#ifdef CONFIG_FUNCTION_GRAPH_TRACER 714#ifdef CONFIG_FUNCTION_GRAPH_TRACER
724 715
@@ -737,7 +728,7 @@ extern unsigned long trace_flags;
737extern enum print_line_t 728extern enum print_line_t
738print_graph_function_flags(struct trace_iterator *iter, u32 flags); 729print_graph_function_flags(struct trace_iterator *iter, u32 flags);
739extern void print_graph_headers_flags(struct seq_file *s, u32 flags); 730extern void print_graph_headers_flags(struct seq_file *s, u32 flags);
740extern enum print_line_t 731extern void
741trace_print_graph_duration(unsigned long long duration, struct trace_seq *s); 732trace_print_graph_duration(unsigned long long duration, struct trace_seq *s);
742extern void graph_trace_open(struct trace_iterator *iter); 733extern void graph_trace_open(struct trace_iterator *iter);
743extern void graph_trace_close(struct trace_iterator *iter); 734extern void graph_trace_close(struct trace_iterator *iter);
@@ -1310,4 +1301,18 @@ int perf_ftrace_event_register(struct ftrace_event_call *call,
1310#define perf_ftrace_event_register NULL 1301#define perf_ftrace_event_register NULL
1311#endif 1302#endif
1312 1303
1304#ifdef CONFIG_FTRACE_SYSCALLS
1305void init_ftrace_syscalls(void);
1306#else
1307static inline void init_ftrace_syscalls(void) { }
1308#endif
1309
1310#ifdef CONFIG_EVENT_TRACING
1311void trace_event_init(void);
1312#else
1313static inline void __init trace_event_init(void) { }
1314#endif
1315
1316extern struct trace_iterator *tracepoint_print_iter;
1317
1313#endif /* _LINUX_KERNEL_TRACE_H */ 1318#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 697fb9bac8f0..7d6e2afde669 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -151,22 +151,21 @@ static enum print_line_t trace_branch_print(struct trace_iterator *iter,
151 151
152 trace_assign_type(field, iter->ent); 152 trace_assign_type(field, iter->ent);
153 153
154 if (trace_seq_printf(&iter->seq, "[%s] %s:%s:%d\n", 154 trace_seq_printf(&iter->seq, "[%s] %s:%s:%d\n",
155 field->correct ? " ok " : " MISS ", 155 field->correct ? " ok " : " MISS ",
156 field->func, 156 field->func,
157 field->file, 157 field->file,
158 field->line)) 158 field->line);
159 return TRACE_TYPE_PARTIAL_LINE; 159
160 160 return trace_handle_return(&iter->seq);
161 return TRACE_TYPE_HANDLED;
162} 161}
163 162
164static void branch_print_header(struct seq_file *s) 163static void branch_print_header(struct seq_file *s)
165{ 164{
166 seq_puts(s, "# TASK-PID CPU# TIMESTAMP CORRECT" 165 seq_puts(s, "# TASK-PID CPU# TIMESTAMP CORRECT"
167 " FUNC:FILE:LINE\n"); 166 " FUNC:FILE:LINE\n"
168 seq_puts(s, "# | | | | | " 167 "# | | | | | "
169 " |\n"); 168 " |\n");
170} 169}
171 170
172static struct trace_event_functions trace_branch_funcs = { 171static struct trace_event_functions trace_branch_funcs = {
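trace_handle_return(), used in trace_branch_print() above and again in the function-graph changes below, is the glue between the two worlds: the print callbacks still return enum print_line_t, but the decision now comes from the overflow state of the seq rather than from individual return codes. Its definition is not shown in this diff; the sketch below is the one-liner it is assumed to amount to, written with stand-in types.

#include <stdbool.h>
#include <stdio.h>

enum print_line_t {
	TRACE_TYPE_PARTIAL_LINE = 0,
	TRACE_TYPE_HANDLED	= 1,
};

/* Minimal stand-in for a trace_seq: only the overflow state matters here. */
struct seq_sketch {
	bool overflowed;
};

/*
 * Assumed equivalent of trace_handle_return(): fold the overflow check
 * into the enum the print callbacks still have to return.
 */
static enum print_line_t handle_return_sketch(struct seq_sketch *s)
{
	return s->overflowed ? TRACE_TYPE_PARTIAL_LINE : TRACE_TYPE_HANDLED;
}

int main(void)
{
	struct seq_sketch ok = { .overflowed = false };
	struct seq_sketch full = { .overflowed = true };

	printf("ok:   %d\n", handle_return_sketch(&ok));	/* HANDLED */
	printf("full: %d\n", handle_return_sketch(&full));	/* PARTIAL_LINE */
	return 0;
}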
@@ -233,12 +232,12 @@ extern unsigned long __stop_annotated_branch_profile[];
233 232
234static int annotated_branch_stat_headers(struct seq_file *m) 233static int annotated_branch_stat_headers(struct seq_file *m)
235{ 234{
236 seq_printf(m, " correct incorrect %% "); 235 seq_puts(m, " correct incorrect % "
237 seq_printf(m, " Function " 236 " Function "
238 " File Line\n" 237 " File Line\n"
239 " ------- --------- - " 238 " ------- --------- - "
240 " -------- " 239 " -------- "
241 " ---- ----\n"); 240 " ---- ----\n");
242 return 0; 241 return 0;
243} 242}
244 243
@@ -274,7 +273,7 @@ static int branch_stat_show(struct seq_file *m, void *v)
274 273
275 seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect); 274 seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect);
276 if (percent < 0) 275 if (percent < 0)
277 seq_printf(m, " X "); 276 seq_puts(m, " X ");
278 else 277 else
279 seq_printf(m, "%3ld ", percent); 278 seq_printf(m, "%3ld ", percent);
280 seq_printf(m, "%-30.30s %-20.20s %d\n", p->func, f, p->line); 279 seq_printf(m, "%-30.30s %-20.20s %d\n", p->func, f, p->line);
@@ -362,12 +361,12 @@ extern unsigned long __stop_branch_profile[];
362 361
363static int all_branch_stat_headers(struct seq_file *m) 362static int all_branch_stat_headers(struct seq_file *m)
364{ 363{
365 seq_printf(m, " miss hit %% "); 364 seq_puts(m, " miss hit % "
366 seq_printf(m, " Function " 365 " Function "
367 " File Line\n" 366 " File Line\n"
368 " ------- --------- - " 367 " ------- --------- - "
369 " -------- " 368 " -------- "
370 " ---- ----\n"); 369 " ---- ----\n");
371 return 0; 370 return 0;
372} 371}
373 372
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index ef06ce7e9cf8..366a78a3e61e 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -212,8 +212,40 @@ void *ftrace_event_buffer_reserve(struct ftrace_event_buffer *fbuffer,
212} 212}
213EXPORT_SYMBOL_GPL(ftrace_event_buffer_reserve); 213EXPORT_SYMBOL_GPL(ftrace_event_buffer_reserve);
214 214
215static DEFINE_SPINLOCK(tracepoint_iter_lock);
216
217static void output_printk(struct ftrace_event_buffer *fbuffer)
218{
219 struct ftrace_event_call *event_call;
220 struct trace_event *event;
221 unsigned long flags;
222 struct trace_iterator *iter = tracepoint_print_iter;
223
224 if (!iter)
225 return;
226
227 event_call = fbuffer->ftrace_file->event_call;
228 if (!event_call || !event_call->event.funcs ||
229 !event_call->event.funcs->trace)
230 return;
231
232 event = &fbuffer->ftrace_file->event_call->event;
233
234 spin_lock_irqsave(&tracepoint_iter_lock, flags);
235 trace_seq_init(&iter->seq);
236 iter->ent = fbuffer->entry;
237 event_call->event.funcs->trace(iter, 0, event);
238 trace_seq_putc(&iter->seq, 0);
239 printk("%s", iter->seq.buffer);
240
241 spin_unlock_irqrestore(&tracepoint_iter_lock, flags);
242}
243
215void ftrace_event_buffer_commit(struct ftrace_event_buffer *fbuffer) 244void ftrace_event_buffer_commit(struct ftrace_event_buffer *fbuffer)
216{ 245{
246 if (tracepoint_printk)
247 output_printk(fbuffer);
248
217 event_trigger_unlock_commit(fbuffer->ftrace_file, fbuffer->buffer, 249 event_trigger_unlock_commit(fbuffer->ftrace_file, fbuffer->buffer,
218 fbuffer->event, fbuffer->entry, 250 fbuffer->event, fbuffer->entry,
219 fbuffer->flags, fbuffer->pc); 251 fbuffer->flags, fbuffer->pc);
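output_printk() above implements the new tracepoint_printk path: take a spinlock, reuse the single shared iterator, let the event's own ->trace() callback render into the iterator's trace_seq, terminate the string, and hand it to printk(). A rough userspace analogue of that render-under-lock structure follows; the callback type, buffer, and event used below are invented for the sketch.

#include <pthread.h>
#include <stdio.h>

/* One shared render buffer, like the single tracepoint_print_iter. */
static char render_buf[128];
static pthread_mutex_t render_lock = PTHREAD_MUTEX_INITIALIZER;

/* Stand-in for event_call->event.funcs->trace(). */
typedef void (*render_fn)(char *buf, size_t size, const void *entry);

static void render_example_event(char *buf, size_t size, const void *entry)
{
	snprintf(buf, size, "example_event: arg=%s", (const char *)entry);
}

static void output_printk_sketch(render_fn render, const void *entry)
{
	pthread_mutex_lock(&render_lock);
	render_buf[0] = '\0';			/* trace_seq_init() analogue */
	render(render_buf, sizeof(render_buf), entry);
	printf("%s\n", render_buf);		/* printk("%s", ...) analogue */
	pthread_mutex_unlock(&render_lock);
}

int main(void)
{
	output_printk_sketch(render_example_event, "hello");
	return 0;
}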
@@ -461,7 +493,7 @@ static void remove_event_file_dir(struct ftrace_event_file *file)
461 493
462 if (dir) { 494 if (dir) {
463 spin_lock(&dir->d_lock); /* probably unneeded */ 495 spin_lock(&dir->d_lock); /* probably unneeded */
464 list_for_each_entry(child, &dir->d_subdirs, d_u.d_child) { 496 list_for_each_entry(child, &dir->d_subdirs, d_child) {
465 if (child->d_inode) /* probably unneeded */ 497 if (child->d_inode) /* probably unneeded */
466 child->d_inode->i_private = NULL; 498 child->d_inode->i_private = NULL;
467 } 499 }
@@ -918,7 +950,7 @@ static int f_show(struct seq_file *m, void *v)
918 case FORMAT_HEADER: 950 case FORMAT_HEADER:
919 seq_printf(m, "name: %s\n", ftrace_event_name(call)); 951 seq_printf(m, "name: %s\n", ftrace_event_name(call));
920 seq_printf(m, "ID: %d\n", call->event.type); 952 seq_printf(m, "ID: %d\n", call->event.type);
921 seq_printf(m, "format:\n"); 953 seq_puts(m, "format:\n");
922 return 0; 954 return 0;
923 955
924 case FORMAT_FIELD_SEPERATOR: 956 case FORMAT_FIELD_SEPERATOR:
@@ -1044,7 +1076,8 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
1044 mutex_unlock(&event_mutex); 1076 mutex_unlock(&event_mutex);
1045 1077
1046 if (file) 1078 if (file)
1047 r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); 1079 r = simple_read_from_buffer(ubuf, cnt, ppos,
1080 s->buffer, trace_seq_used(s));
1048 1081
1049 kfree(s); 1082 kfree(s);
1050 1083
@@ -1210,7 +1243,8 @@ subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
1210 trace_seq_init(s); 1243 trace_seq_init(s);
1211 1244
1212 print_subsystem_event_filter(system, s); 1245 print_subsystem_event_filter(system, s);
1213 r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); 1246 r = simple_read_from_buffer(ubuf, cnt, ppos,
1247 s->buffer, trace_seq_used(s));
1214 1248
1215 kfree(s); 1249 kfree(s);
1216 1250
@@ -1265,7 +1299,8 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
1265 trace_seq_init(s); 1299 trace_seq_init(s);
1266 1300
1267 func(s); 1301 func(s);
1268 r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); 1302 r = simple_read_from_buffer(ubuf, cnt, ppos,
1303 s->buffer, trace_seq_used(s));
1269 1304
1270 kfree(s); 1305 kfree(s);
1271 1306
@@ -1988,7 +2023,7 @@ event_enable_print(struct seq_file *m, unsigned long ip,
1988 ftrace_event_name(data->file->event_call)); 2023 ftrace_event_name(data->file->event_call));
1989 2024
1990 if (data->count == -1) 2025 if (data->count == -1)
1991 seq_printf(m, ":unlimited\n"); 2026 seq_puts(m, ":unlimited\n");
1992 else 2027 else
1993 seq_printf(m, ":count=%ld\n", data->count); 2028 seq_printf(m, ":count=%ld\n", data->count);
1994 2029
@@ -2477,8 +2512,14 @@ static __init int event_trace_init(void)
2477#endif 2512#endif
2478 return 0; 2513 return 0;
2479} 2514}
2480early_initcall(event_trace_memsetup); 2515
2481core_initcall(event_trace_enable); 2516void __init trace_event_init(void)
2517{
2518 event_trace_memsetup();
2519 init_ftrace_syscalls();
2520 event_trace_enable();
2521}
2522
2482fs_initcall(event_trace_init); 2523fs_initcall(event_trace_init);
2483 2524
2484#ifdef CONFIG_FTRACE_STARTUP_TEST 2525#ifdef CONFIG_FTRACE_STARTUP_TEST
@@ -2513,8 +2554,11 @@ static __init int event_test_thread(void *unused)
2513 kfree(test_malloc); 2554 kfree(test_malloc);
2514 2555
2515 set_current_state(TASK_INTERRUPTIBLE); 2556 set_current_state(TASK_INTERRUPTIBLE);
2516 while (!kthread_should_stop()) 2557 while (!kthread_should_stop()) {
2517 schedule(); 2558 schedule();
2559 set_current_state(TASK_INTERRUPTIBLE);
2560 }
2561 __set_current_state(TASK_RUNNING);
2518 2562
2519 return 0; 2563 return 0;
2520} 2564}
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 7a8c1528e141..ced69da0ff55 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -45,6 +45,7 @@ enum filter_op_ids
45 OP_GT, 45 OP_GT,
46 OP_GE, 46 OP_GE,
47 OP_BAND, 47 OP_BAND,
48 OP_NOT,
48 OP_NONE, 49 OP_NONE,
49 OP_OPEN_PAREN, 50 OP_OPEN_PAREN,
50}; 51};
@@ -67,6 +68,7 @@ static struct filter_op filter_ops[] = {
67 { OP_GT, ">", 5 }, 68 { OP_GT, ">", 5 },
68 { OP_GE, ">=", 5 }, 69 { OP_GE, ">=", 5 },
69 { OP_BAND, "&", 6 }, 70 { OP_BAND, "&", 6 },
71 { OP_NOT, "!", 6 },
70 { OP_NONE, "OP_NONE", 0 }, 72 { OP_NONE, "OP_NONE", 0 },
71 { OP_OPEN_PAREN, "(", 0 }, 73 { OP_OPEN_PAREN, "(", 0 },
72}; 74};
@@ -85,6 +87,7 @@ enum {
85 FILT_ERR_MISSING_FIELD, 87 FILT_ERR_MISSING_FIELD,
86 FILT_ERR_INVALID_FILTER, 88 FILT_ERR_INVALID_FILTER,
87 FILT_ERR_IP_FIELD_ONLY, 89 FILT_ERR_IP_FIELD_ONLY,
90 FILT_ERR_ILLEGAL_NOT_OP,
88}; 91};
89 92
90static char *err_text[] = { 93static char *err_text[] = {
@@ -101,6 +104,7 @@ static char *err_text[] = {
101 "Missing field name and/or value", 104 "Missing field name and/or value",
102 "Meaningless filter expression", 105 "Meaningless filter expression",
103 "Only 'ip' field is supported for function trace", 106 "Only 'ip' field is supported for function trace",
107 "Illegal use of '!'",
104}; 108};
105 109
106struct opstack_op { 110struct opstack_op {
@@ -139,6 +143,7 @@ struct pred_stack {
139 int index; 143 int index;
140}; 144};
141 145
146/* If not of not match is equal to not of not, then it is a match */
142#define DEFINE_COMPARISON_PRED(type) \ 147#define DEFINE_COMPARISON_PRED(type) \
143static int filter_pred_##type(struct filter_pred *pred, void *event) \ 148static int filter_pred_##type(struct filter_pred *pred, void *event) \
144{ \ 149{ \
@@ -166,7 +171,7 @@ static int filter_pred_##type(struct filter_pred *pred, void *event) \
166 break; \ 171 break; \
167 } \ 172 } \
168 \ 173 \
169 return match; \ 174 return !!match == !pred->not; \
170} 175}
171 176
172#define DEFINE_EQUALITY_PRED(size) \ 177#define DEFINE_EQUALITY_PRED(size) \
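The new return statement in the comparison predicates, "!!match == !pred->not", normalizes match to 0 or 1 and flips it when the predicate carries a NOT, which is also why init_pred() now toggles pred->not with ^= instead of overwriting it. The truth table is easier to see in a few lines of plain C:

#include <stdio.h>

/* Same expression the comparison predicates now return. */
static int pred_result(int match, int not)
{
	return !!match == !not;
}

int main(void)
{
	/* match can be any non-zero value; not is 0 or 1. */
	printf("match=5, not=0 -> %d (hit)\n",                pred_result(5, 0));
	printf("match=0, not=0 -> %d (miss)\n",               pred_result(0, 0));
	printf("match=5, not=1 -> %d (negated hit -> miss)\n", pred_result(5, 1));
	printf("match=0, not=1 -> %d (negated miss -> hit)\n", pred_result(0, 1));
	return 0;
}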
@@ -484,9 +489,10 @@ static int process_ops(struct filter_pred *preds,
484 if (!WARN_ON_ONCE(!pred->fn)) 489 if (!WARN_ON_ONCE(!pred->fn))
485 match = pred->fn(pred, rec); 490 match = pred->fn(pred, rec);
486 if (!!match == type) 491 if (!!match == type)
487 return match; 492 break;
488 } 493 }
489 return match; 494 /* If not of not match is equal to not of not, then it is a match */
495 return !!match == !op->not;
490} 496}
491 497
492struct filter_match_preds_data { 498struct filter_match_preds_data {
@@ -735,10 +741,10 @@ static int filter_set_pred(struct event_filter *filter,
735 * then this op can be folded. 741 * then this op can be folded.
736 */ 742 */
737 if (left->index & FILTER_PRED_FOLD && 743 if (left->index & FILTER_PRED_FOLD &&
738 (left->op == dest->op || 744 ((left->op == dest->op && !left->not) ||
739 left->left == FILTER_PRED_INVALID) && 745 left->left == FILTER_PRED_INVALID) &&
740 right->index & FILTER_PRED_FOLD && 746 right->index & FILTER_PRED_FOLD &&
741 (right->op == dest->op || 747 ((right->op == dest->op && !right->not) ||
742 right->left == FILTER_PRED_INVALID)) 748 right->left == FILTER_PRED_INVALID))
743 dest->index |= FILTER_PRED_FOLD; 749 dest->index |= FILTER_PRED_FOLD;
744 750
@@ -1028,7 +1034,7 @@ static int init_pred(struct filter_parse_state *ps,
1028 } 1034 }
1029 1035
1030 if (pred->op == OP_NE) 1036 if (pred->op == OP_NE)
1031 pred->not = 1; 1037 pred->not ^= 1;
1032 1038
1033 pred->fn = fn; 1039 pred->fn = fn;
1034 return 0; 1040 return 0;
@@ -1590,6 +1596,17 @@ static int replace_preds(struct ftrace_event_call *call,
1590 continue; 1596 continue;
1591 } 1597 }
1592 1598
1599 if (elt->op == OP_NOT) {
1600 if (!n_preds || operand1 || operand2) {
1601 parse_error(ps, FILT_ERR_ILLEGAL_NOT_OP, 0);
1602 err = -EINVAL;
1603 goto fail;
1604 }
1605 if (!dry_run)
1606 filter->preds[n_preds - 1].not ^= 1;
1607 continue;
1608 }
1609
1593 if (WARN_ON(n_preds++ == MAX_FILTER_PRED)) { 1610 if (WARN_ON(n_preds++ == MAX_FILTER_PRED)) {
1594 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); 1611 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
1595 err = -ENOSPC; 1612 err = -ENOSPC;
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 4747b476a030..8712df9decb4 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -373,7 +373,7 @@ event_trigger_print(const char *name, struct seq_file *m,
373{ 373{
374 long count = (long)data; 374 long count = (long)data;
375 375
376 seq_printf(m, "%s", name); 376 seq_puts(m, name);
377 377
378 if (count == -1) 378 if (count == -1)
379 seq_puts(m, ":unlimited"); 379 seq_puts(m, ":unlimited");
@@ -383,7 +383,7 @@ event_trigger_print(const char *name, struct seq_file *m,
383 if (filter_str) 383 if (filter_str)
384 seq_printf(m, " if %s\n", filter_str); 384 seq_printf(m, " if %s\n", filter_str);
385 else 385 else
386 seq_puts(m, "\n"); 386 seq_putc(m, '\n');
387 387
388 return 0; 388 return 0;
389} 389}
@@ -1105,7 +1105,7 @@ event_enable_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
1105 if (data->filter_str) 1105 if (data->filter_str)
1106 seq_printf(m, " if %s\n", data->filter_str); 1106 seq_printf(m, " if %s\n", data->filter_str);
1107 else 1107 else
1108 seq_puts(m, "\n"); 1108 seq_putc(m, '\n');
1109 1109
1110 return 0; 1110 return 0;
1111} 1111}
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 57f0ec962d2c..fcd41a166405 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -261,37 +261,74 @@ static struct tracer function_trace __tracer_data =
261}; 261};
262 262
263#ifdef CONFIG_DYNAMIC_FTRACE 263#ifdef CONFIG_DYNAMIC_FTRACE
264static int update_count(void **data) 264static void update_traceon_count(void **data, bool on)
265{ 265{
266 unsigned long *count = (long *)data; 266 long *count = (long *)data;
267 long old_count = *count;
267 268
268 if (!*count) 269 /*
269 return 0; 270 * Tracing gets disabled (or enabled) once per count.
271 * This function can be called at the same time on multiple CPUs.
272 * It is fine if both disable (or enable) tracing, as disabling
273 * (or enabling) the second time doesn't do anything as the
274 * state of the tracer is already disabled (or enabled).
275 * What needs to be synchronized in this case is that the count
276 * only gets decremented once, even if the tracer is disabled
277 * (or enabled) twice, as the second one is really a nop.
278 *
279 * The memory barriers guarantee that we only decrement the
280 * counter once. First the count is read to a local variable
281 * and a read barrier is used to make sure that it is loaded
282 * before checking if the tracer is in the state we want.
283 * If the tracer is not in the state we want, then the count
284 * is guaranteed to be the old count.
285 *
286 * Next the tracer is set to the state we want (disabled or enabled)
287 * then a write memory barrier is used to make sure that
288 * the new state is visible before changing the counter by
289 * one minus the old counter. This guarantees that another CPU
290 * executing this code will see the new state before seeing
291 * the new counter value, and would not do anything if the new
292 * counter is seen.
293 *
294 * Note, there is no synchronization between this and a user
295 * setting the tracing_on file. But we currently don't care
296 * about that.
297 */
298 if (!old_count)
299 return;
270 300
271 if (*count != -1) 301 /* Make sure we see count before checking tracing state */
272 (*count)--; 302 smp_rmb();
273 303
274 return 1; 304 if (on == !!tracing_is_on())
305 return;
306
307 if (on)
308 tracing_on();
309 else
310 tracing_off();
311
312 /* unlimited? */
313 if (old_count == -1)
314 return;
315
316 /* Make sure tracing state is visible before updating count */
317 smp_wmb();
318
319 *count = old_count - 1;
275} 320}
276 321
277static void 322static void
278ftrace_traceon_count(unsigned long ip, unsigned long parent_ip, void **data) 323ftrace_traceon_count(unsigned long ip, unsigned long parent_ip, void **data)
279{ 324{
280 if (tracing_is_on()) 325 update_traceon_count(data, 1);
281 return;
282
283 if (update_count(data))
284 tracing_on();
285} 326}
286 327
287static void 328static void
288ftrace_traceoff_count(unsigned long ip, unsigned long parent_ip, void **data) 329ftrace_traceoff_count(unsigned long ip, unsigned long parent_ip, void **data)
289{ 330{
290 if (!tracing_is_on()) 331 update_traceon_count(data, 0);
291 return;
292
293 if (update_count(data))
294 tracing_off();
295} 332}
296 333
297static void 334static void
@@ -330,11 +367,49 @@ ftrace_stacktrace(unsigned long ip, unsigned long parent_ip, void **data)
330static void 367static void
331ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, void **data) 368ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, void **data)
332{ 369{
333 if (!tracing_is_on()) 370 long *count = (long *)data;
334 return; 371 long old_count;
372 long new_count;
335 373
336 if (update_count(data)) 374 /*
337 trace_dump_stack(STACK_SKIP); 375 * Stack traces should only execute the number of times the
376 * user specified in the counter.
377 */
378 do {
379
380 if (!tracing_is_on())
381 return;
382
383 old_count = *count;
384
385 if (!old_count)
386 return;
387
388 /* unlimited? */
389 if (old_count == -1) {
390 trace_dump_stack(STACK_SKIP);
391 return;
392 }
393
394 new_count = old_count - 1;
395 new_count = cmpxchg(count, old_count, new_count);
396 if (new_count == old_count)
397 trace_dump_stack(STACK_SKIP);
398
399 } while (new_count != old_count);
400}
401
402static int update_count(void **data)
403{
404 unsigned long *count = (long *)data;
405
406 if (!*count)
407 return 0;
408
409 if (*count != -1)
410 (*count)--;
411
412 return 1;
338} 413}
339 414
340static void 415static void
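ftrace_stacktrace_count() now burns its budget with a cmpxchg loop, so that when several CPUs hit the probe simultaneously each stack dump consumes exactly one count, with -1 still meaning unlimited. Below is a userspace sketch of the same loop using C11 atomics; note the kernel's cmpxchg() returns the old value, while atomic_compare_exchange_strong() reports success and rewrites the expected value, so the retry condition is phrased differently.

#include <stdatomic.h>
#include <stdio.h>

static _Atomic long stack_count = 3;

/* Returns 1 if this call "won" a count and should dump the stack. */
static int consume_one_count(void)
{
	long old_count = atomic_load(&stack_count);

	do {
		if (!old_count)
			return 0;		/* budget exhausted */
		if (old_count == -1)
			return 1;		/* unlimited */
		/* On failure, old_count is reloaded with the current value. */
	} while (!atomic_compare_exchange_strong(&stack_count,
						 &old_count, old_count - 1));

	return 1;
}

int main(void)
{
	for (int i = 0; i < 5; i++)
		printf("attempt %d: %s\n", i,
		       consume_one_count() ? "dump stack" : "skip");
	return 0;
}

Either way, only the CPU whose compare-and-swap succeeds treats a count as consumed, which is the property the rewrite is after.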
@@ -361,7 +436,7 @@ ftrace_probe_print(const char *name, struct seq_file *m,
361 seq_printf(m, "%ps:%s", (void *)ip, name); 436 seq_printf(m, "%ps:%s", (void *)ip, name);
362 437
363 if (count == -1) 438 if (count == -1)
364 seq_printf(m, ":unlimited\n"); 439 seq_puts(m, ":unlimited\n");
365 else 440 else
366 seq_printf(m, ":count=%ld\n", count); 441 seq_printf(m, ":count=%ld\n", count);
367 442
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index f0a0c982cde3..ba476009e5de 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -107,7 +107,7 @@ enum {
107 FLAGS_FILL_END = 3 << TRACE_GRAPH_PRINT_FILL_SHIFT, 107 FLAGS_FILL_END = 3 << TRACE_GRAPH_PRINT_FILL_SHIFT,
108}; 108};
109 109
110static enum print_line_t 110static void
111print_graph_duration(unsigned long long duration, struct trace_seq *s, 111print_graph_duration(unsigned long long duration, struct trace_seq *s,
112 u32 flags); 112 u32 flags);
113 113
@@ -483,33 +483,24 @@ static int graph_trace_update_thresh(struct trace_array *tr)
483 483
484static int max_bytes_for_cpu; 484static int max_bytes_for_cpu;
485 485
486static enum print_line_t 486static void print_graph_cpu(struct trace_seq *s, int cpu)
487print_graph_cpu(struct trace_seq *s, int cpu)
488{ 487{
489 int ret;
490
491 /* 488 /*
492 * Start with a space character - to make it stand out 489 * Start with a space character - to make it stand out
493 * to the right a bit when trace output is pasted into 490 * to the right a bit when trace output is pasted into
494 * email: 491 * email:
495 */ 492 */
496 ret = trace_seq_printf(s, " %*d) ", max_bytes_for_cpu, cpu); 493 trace_seq_printf(s, " %*d) ", max_bytes_for_cpu, cpu);
497 if (!ret)
498 return TRACE_TYPE_PARTIAL_LINE;
499
500 return TRACE_TYPE_HANDLED;
501} 494}
502 495
503#define TRACE_GRAPH_PROCINFO_LENGTH 14 496#define TRACE_GRAPH_PROCINFO_LENGTH 14
504 497
505static enum print_line_t 498static void print_graph_proc(struct trace_seq *s, pid_t pid)
506print_graph_proc(struct trace_seq *s, pid_t pid)
507{ 499{
508 char comm[TASK_COMM_LEN]; 500 char comm[TASK_COMM_LEN];
509 /* sign + log10(MAX_INT) + '\0' */ 501 /* sign + log10(MAX_INT) + '\0' */
510 char pid_str[11]; 502 char pid_str[11];
511 int spaces = 0; 503 int spaces = 0;
512 int ret;
513 int len; 504 int len;
514 int i; 505 int i;
515 506
@@ -524,56 +515,43 @@ print_graph_proc(struct trace_seq *s, pid_t pid)
524 spaces = TRACE_GRAPH_PROCINFO_LENGTH - len; 515 spaces = TRACE_GRAPH_PROCINFO_LENGTH - len;
525 516
526 /* First spaces to align center */ 517 /* First spaces to align center */
527 for (i = 0; i < spaces / 2; i++) { 518 for (i = 0; i < spaces / 2; i++)
528 ret = trace_seq_putc(s, ' '); 519 trace_seq_putc(s, ' ');
529 if (!ret)
530 return TRACE_TYPE_PARTIAL_LINE;
531 }
532 520
533 ret = trace_seq_printf(s, "%s-%s", comm, pid_str); 521 trace_seq_printf(s, "%s-%s", comm, pid_str);
534 if (!ret)
535 return TRACE_TYPE_PARTIAL_LINE;
536 522
537 /* Last spaces to align center */ 523 /* Last spaces to align center */
538 for (i = 0; i < spaces - (spaces / 2); i++) { 524 for (i = 0; i < spaces - (spaces / 2); i++)
539 ret = trace_seq_putc(s, ' '); 525 trace_seq_putc(s, ' ');
540 if (!ret)
541 return TRACE_TYPE_PARTIAL_LINE;
542 }
543 return TRACE_TYPE_HANDLED;
544} 526}
545 527
546 528
547static enum print_line_t 529static void print_graph_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
548print_graph_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
549{ 530{
550 if (!trace_seq_putc(s, ' ')) 531 trace_seq_putc(s, ' ');
551 return 0; 532 trace_print_lat_fmt(s, entry);
552
553 return trace_print_lat_fmt(s, entry);
554} 533}
555 534
556/* If the pid changed since the last trace, output this event */ 535/* If the pid changed since the last trace, output this event */
557static enum print_line_t 536static void
558verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data) 537verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)
559{ 538{
560 pid_t prev_pid; 539 pid_t prev_pid;
561 pid_t *last_pid; 540 pid_t *last_pid;
562 int ret;
563 541
564 if (!data) 542 if (!data)
565 return TRACE_TYPE_HANDLED; 543 return;
566 544
567 last_pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid); 545 last_pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid);
568 546
569 if (*last_pid == pid) 547 if (*last_pid == pid)
570 return TRACE_TYPE_HANDLED; 548 return;
571 549
572 prev_pid = *last_pid; 550 prev_pid = *last_pid;
573 *last_pid = pid; 551 *last_pid = pid;
574 552
575 if (prev_pid == -1) 553 if (prev_pid == -1)
576 return TRACE_TYPE_HANDLED; 554 return;
577/* 555/*
578 * Context-switch trace line: 556 * Context-switch trace line:
579 557
@@ -582,33 +560,12 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)
582 ------------------------------------------ 560 ------------------------------------------
583 561
584 */ 562 */
585 ret = trace_seq_puts(s, 563 trace_seq_puts(s, " ------------------------------------------\n");
586 " ------------------------------------------\n"); 564 print_graph_cpu(s, cpu);
587 if (!ret) 565 print_graph_proc(s, prev_pid);
588 return TRACE_TYPE_PARTIAL_LINE; 566 trace_seq_puts(s, " => ");
589 567 print_graph_proc(s, pid);
590 ret = print_graph_cpu(s, cpu); 568 trace_seq_puts(s, "\n ------------------------------------------\n\n");
591 if (ret == TRACE_TYPE_PARTIAL_LINE)
592 return TRACE_TYPE_PARTIAL_LINE;
593
594 ret = print_graph_proc(s, prev_pid);
595 if (ret == TRACE_TYPE_PARTIAL_LINE)
596 return TRACE_TYPE_PARTIAL_LINE;
597
598 ret = trace_seq_puts(s, " => ");
599 if (!ret)
600 return TRACE_TYPE_PARTIAL_LINE;
601
602 ret = print_graph_proc(s, pid);
603 if (ret == TRACE_TYPE_PARTIAL_LINE)
604 return TRACE_TYPE_PARTIAL_LINE;
605
606 ret = trace_seq_puts(s,
607 "\n ------------------------------------------\n\n");
608 if (!ret)
609 return TRACE_TYPE_PARTIAL_LINE;
610
611 return TRACE_TYPE_HANDLED;
612} 569}
613 570
614static struct ftrace_graph_ret_entry * 571static struct ftrace_graph_ret_entry *
@@ -682,175 +639,122 @@ get_return_for_leaf(struct trace_iterator *iter,
682 return next; 639 return next;
683} 640}
684 641
685static int print_graph_abs_time(u64 t, struct trace_seq *s) 642static void print_graph_abs_time(u64 t, struct trace_seq *s)
686{ 643{
687 unsigned long usecs_rem; 644 unsigned long usecs_rem;
688 645
689 usecs_rem = do_div(t, NSEC_PER_SEC); 646 usecs_rem = do_div(t, NSEC_PER_SEC);
690 usecs_rem /= 1000; 647 usecs_rem /= 1000;
691 648
692 return trace_seq_printf(s, "%5lu.%06lu | ", 649 trace_seq_printf(s, "%5lu.%06lu | ",
693 (unsigned long)t, usecs_rem); 650 (unsigned long)t, usecs_rem);
694} 651}
695 652
696static enum print_line_t 653static void
697print_graph_irq(struct trace_iterator *iter, unsigned long addr, 654print_graph_irq(struct trace_iterator *iter, unsigned long addr,
698 enum trace_type type, int cpu, pid_t pid, u32 flags) 655 enum trace_type type, int cpu, pid_t pid, u32 flags)
699{ 656{
700 int ret;
701 struct trace_seq *s = &iter->seq; 657 struct trace_seq *s = &iter->seq;
658 struct trace_entry *ent = iter->ent;
702 659
703 if (addr < (unsigned long)__irqentry_text_start || 660 if (addr < (unsigned long)__irqentry_text_start ||
704 addr >= (unsigned long)__irqentry_text_end) 661 addr >= (unsigned long)__irqentry_text_end)
705 return TRACE_TYPE_UNHANDLED; 662 return;
706 663
707 if (trace_flags & TRACE_ITER_CONTEXT_INFO) { 664 if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
708 /* Absolute time */ 665 /* Absolute time */
709 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { 666 if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
710 ret = print_graph_abs_time(iter->ts, s); 667 print_graph_abs_time(iter->ts, s);
711 if (!ret)
712 return TRACE_TYPE_PARTIAL_LINE;
713 }
714 668
715 /* Cpu */ 669 /* Cpu */
716 if (flags & TRACE_GRAPH_PRINT_CPU) { 670 if (flags & TRACE_GRAPH_PRINT_CPU)
717 ret = print_graph_cpu(s, cpu); 671 print_graph_cpu(s, cpu);
718 if (ret == TRACE_TYPE_PARTIAL_LINE)
719 return TRACE_TYPE_PARTIAL_LINE;
720 }
721 672
722 /* Proc */ 673 /* Proc */
723 if (flags & TRACE_GRAPH_PRINT_PROC) { 674 if (flags & TRACE_GRAPH_PRINT_PROC) {
724 ret = print_graph_proc(s, pid); 675 print_graph_proc(s, pid);
725 if (ret == TRACE_TYPE_PARTIAL_LINE) 676 trace_seq_puts(s, " | ");
726 return TRACE_TYPE_PARTIAL_LINE;
727 ret = trace_seq_puts(s, " | ");
728 if (!ret)
729 return TRACE_TYPE_PARTIAL_LINE;
730 } 677 }
678
679 /* Latency format */
680 if (trace_flags & TRACE_ITER_LATENCY_FMT)
681 print_graph_lat_fmt(s, ent);
731 } 682 }
732 683
733 /* No overhead */ 684 /* No overhead */
734 ret = print_graph_duration(0, s, flags | FLAGS_FILL_START); 685 print_graph_duration(0, s, flags | FLAGS_FILL_START);
735 if (ret != TRACE_TYPE_HANDLED)
736 return ret;
737 686
738 if (type == TRACE_GRAPH_ENT) 687 if (type == TRACE_GRAPH_ENT)
739 ret = trace_seq_puts(s, "==========>"); 688 trace_seq_puts(s, "==========>");
740 else 689 else
741 ret = trace_seq_puts(s, "<=========="); 690 trace_seq_puts(s, "<==========");
742
743 if (!ret)
744 return TRACE_TYPE_PARTIAL_LINE;
745
746 ret = print_graph_duration(0, s, flags | FLAGS_FILL_END);
747 if (ret != TRACE_TYPE_HANDLED)
748 return ret;
749
750 ret = trace_seq_putc(s, '\n');
751 691
752 if (!ret) 692 print_graph_duration(0, s, flags | FLAGS_FILL_END);
753 return TRACE_TYPE_PARTIAL_LINE; 693 trace_seq_putc(s, '\n');
754 return TRACE_TYPE_HANDLED;
755} 694}
756 695
757enum print_line_t 696void
758trace_print_graph_duration(unsigned long long duration, struct trace_seq *s) 697trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)
759{ 698{
760 unsigned long nsecs_rem = do_div(duration, 1000); 699 unsigned long nsecs_rem = do_div(duration, 1000);
761 /* log10(ULONG_MAX) + '\0' */ 700 /* log10(ULONG_MAX) + '\0' */
762 char msecs_str[21]; 701 char usecs_str[21];
763 char nsecs_str[5]; 702 char nsecs_str[5];
764 int ret, len; 703 int len;
765 int i; 704 int i;
766 705
767 sprintf(msecs_str, "%lu", (unsigned long) duration); 706 sprintf(usecs_str, "%lu", (unsigned long) duration);
768 707
769 /* Print msecs */ 708 /* Print msecs */
770 ret = trace_seq_printf(s, "%s", msecs_str); 709 trace_seq_printf(s, "%s", usecs_str);
771 if (!ret)
772 return TRACE_TYPE_PARTIAL_LINE;
773 710
774 len = strlen(msecs_str); 711 len = strlen(usecs_str);
775 712
776 /* Print nsecs (we don't want to exceed 7 numbers) */ 713 /* Print nsecs (we don't want to exceed 7 numbers) */
777 if (len < 7) { 714 if (len < 7) {
778 size_t slen = min_t(size_t, sizeof(nsecs_str), 8UL - len); 715 size_t slen = min_t(size_t, sizeof(nsecs_str), 8UL - len);
779 716
780 snprintf(nsecs_str, slen, "%03lu", nsecs_rem); 717 snprintf(nsecs_str, slen, "%03lu", nsecs_rem);
781 ret = trace_seq_printf(s, ".%s", nsecs_str); 718 trace_seq_printf(s, ".%s", nsecs_str);
782 if (!ret)
783 return TRACE_TYPE_PARTIAL_LINE;
784 len += strlen(nsecs_str); 719 len += strlen(nsecs_str);
785 } 720 }
786 721
787 ret = trace_seq_puts(s, " us "); 722 trace_seq_puts(s, " us ");
788 if (!ret)
789 return TRACE_TYPE_PARTIAL_LINE;
790 723
791 /* Print remaining spaces to fit the row's width */ 724 /* Print remaining spaces to fit the row's width */
792 for (i = len; i < 7; i++) { 725 for (i = len; i < 7; i++)
793 ret = trace_seq_putc(s, ' '); 726 trace_seq_putc(s, ' ');
794 if (!ret)
795 return TRACE_TYPE_PARTIAL_LINE;
796 }
797 return TRACE_TYPE_HANDLED;
798} 727}
799 728
800static enum print_line_t 729static void
801print_graph_duration(unsigned long long duration, struct trace_seq *s, 730print_graph_duration(unsigned long long duration, struct trace_seq *s,
802 u32 flags) 731 u32 flags)
803{ 732{
804 int ret = -1;
805
806 if (!(flags & TRACE_GRAPH_PRINT_DURATION) || 733 if (!(flags & TRACE_GRAPH_PRINT_DURATION) ||
807 !(trace_flags & TRACE_ITER_CONTEXT_INFO)) 734 !(trace_flags & TRACE_ITER_CONTEXT_INFO))
808 return TRACE_TYPE_HANDLED; 735 return;
809 736
810 /* No real adata, just filling the column with spaces */ 737 /* No real adata, just filling the column with spaces */
811 switch (flags & TRACE_GRAPH_PRINT_FILL_MASK) { 738 switch (flags & TRACE_GRAPH_PRINT_FILL_MASK) {
812 case FLAGS_FILL_FULL: 739 case FLAGS_FILL_FULL:
813 ret = trace_seq_puts(s, " | "); 740 trace_seq_puts(s, " | ");
814 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; 741 return;
815 case FLAGS_FILL_START: 742 case FLAGS_FILL_START:
816 ret = trace_seq_puts(s, " "); 743 trace_seq_puts(s, " ");
817 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; 744 return;
818 case FLAGS_FILL_END: 745 case FLAGS_FILL_END:
819 ret = trace_seq_puts(s, " |"); 746 trace_seq_puts(s, " |");
820 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; 747 return;
821 } 748 }
822 749
823 /* Signal a overhead of time execution to the output */ 750 /* Signal a overhead of time execution to the output */
824 if (flags & TRACE_GRAPH_PRINT_OVERHEAD) { 751 if (flags & TRACE_GRAPH_PRINT_OVERHEAD)
825 /* Duration exceeded 100 msecs */ 752 trace_seq_printf(s, "%c ", trace_find_mark(duration));
826 if (duration > 100000ULL) 753 else
827 ret = trace_seq_puts(s, "! "); 754 trace_seq_puts(s, " ");
828 /* Duration exceeded 10 msecs */
829 else if (duration > 10000ULL)
830 ret = trace_seq_puts(s, "+ ");
831 }
832
833 /*
834 * The -1 means we either did not exceed the duration tresholds
835 * or we dont want to print out the overhead. Either way we need
836 * to fill out the space.
837 */
838 if (ret == -1)
839 ret = trace_seq_puts(s, " ");
840
841 /* Catching here any failure happenned above */
842 if (!ret)
843 return TRACE_TYPE_PARTIAL_LINE;
844
845 ret = trace_print_graph_duration(duration, s);
846 if (ret != TRACE_TYPE_HANDLED)
847 return ret;
848
849 ret = trace_seq_puts(s, "| ");
850 if (!ret)
851 return TRACE_TYPE_PARTIAL_LINE;
852 755
853 return TRACE_TYPE_HANDLED; 756 trace_print_graph_duration(duration, s);
757 trace_seq_puts(s, "| ");
854} 758}
855 759
856/* Case of a leaf function on its call entry */ 760/* Case of a leaf function on its call entry */
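print_graph_duration() previously open-coded the overhead markers, '!' above the larger threshold and '+' above the smaller one, and now delegates to trace_find_mark(), whose declaration appears in the trace.h hunk earlier. trace_find_mark() itself is not included in this diff, so the sketch below merely reproduces the two thresholds visible in the removed lines and should not be read as the real implementation, which may use additional marks.

#include <stdio.h>

/*
 * Illustrative only: map a duration to an overhead marker using the two
 * thresholds from the removed code (in the same units the old comparisons
 * used).
 */
static char find_mark_sketch(unsigned long long duration)
{
	if (duration > 100000ULL)
		return '!';
	if (duration > 10000ULL)
		return '+';
	return ' ';
}

int main(void)
{
	unsigned long long samples[] = { 500ULL, 25000ULL, 2000000ULL };

	for (int i = 0; i < 3; i++)
		printf("%8llu -> '%c'\n", samples[i], find_mark_sketch(samples[i]));
	return 0;
}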
@@ -864,7 +768,6 @@ print_graph_entry_leaf(struct trace_iterator *iter,
864 struct ftrace_graph_ret *graph_ret; 768 struct ftrace_graph_ret *graph_ret;
865 struct ftrace_graph_ent *call; 769 struct ftrace_graph_ent *call;
866 unsigned long long duration; 770 unsigned long long duration;
867 int ret;
868 int i; 771 int i;
869 772
870 graph_ret = &ret_entry->ret; 773 graph_ret = &ret_entry->ret;
@@ -890,22 +793,15 @@ print_graph_entry_leaf(struct trace_iterator *iter,
890 } 793 }
891 794
892 /* Overhead and duration */ 795 /* Overhead and duration */
893 ret = print_graph_duration(duration, s, flags); 796 print_graph_duration(duration, s, flags);
894 if (ret == TRACE_TYPE_PARTIAL_LINE)
895 return TRACE_TYPE_PARTIAL_LINE;
896 797
897 /* Function */ 798 /* Function */
898 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { 799 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++)
899 ret = trace_seq_putc(s, ' '); 800 trace_seq_putc(s, ' ');
900 if (!ret)
901 return TRACE_TYPE_PARTIAL_LINE;
902 }
903 801
904 ret = trace_seq_printf(s, "%ps();\n", (void *)call->func); 802 trace_seq_printf(s, "%ps();\n", (void *)call->func);
905 if (!ret)
906 return TRACE_TYPE_PARTIAL_LINE;
907 803
908 return TRACE_TYPE_HANDLED; 804 return trace_handle_return(s);
909} 805}
910 806
911static enum print_line_t 807static enum print_line_t
@@ -915,7 +811,6 @@ print_graph_entry_nested(struct trace_iterator *iter,
915{ 811{
916 struct ftrace_graph_ent *call = &entry->graph_ent; 812 struct ftrace_graph_ent *call = &entry->graph_ent;
917 struct fgraph_data *data = iter->private; 813 struct fgraph_data *data = iter->private;
918 int ret;
919 int i; 814 int i;
920 815
921 if (data) { 816 if (data) {
@@ -931,19 +826,15 @@ print_graph_entry_nested(struct trace_iterator *iter,
931 } 826 }
932 827
933 /* No time */ 828 /* No time */
934 ret = print_graph_duration(0, s, flags | FLAGS_FILL_FULL); 829 print_graph_duration(0, s, flags | FLAGS_FILL_FULL);
935 if (ret != TRACE_TYPE_HANDLED)
936 return ret;
937 830
938 /* Function */ 831 /* Function */
939 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { 832 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++)
940 ret = trace_seq_putc(s, ' '); 833 trace_seq_putc(s, ' ');
941 if (!ret) 834
942 return TRACE_TYPE_PARTIAL_LINE; 835 trace_seq_printf(s, "%ps() {\n", (void *)call->func);
943 }
944 836
945 ret = trace_seq_printf(s, "%ps() {\n", (void *)call->func); 837 if (trace_seq_has_overflowed(s))
946 if (!ret)
947 return TRACE_TYPE_PARTIAL_LINE; 838 return TRACE_TYPE_PARTIAL_LINE;
948 839
949 /* 840 /*
@@ -953,62 +844,43 @@ print_graph_entry_nested(struct trace_iterator *iter,
953 return TRACE_TYPE_NO_CONSUME; 844 return TRACE_TYPE_NO_CONSUME;
954} 845}
955 846
956static enum print_line_t 847static void
957print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, 848print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
958 int type, unsigned long addr, u32 flags) 849 int type, unsigned long addr, u32 flags)
959{ 850{
960 struct fgraph_data *data = iter->private; 851 struct fgraph_data *data = iter->private;
961 struct trace_entry *ent = iter->ent; 852 struct trace_entry *ent = iter->ent;
962 int cpu = iter->cpu; 853 int cpu = iter->cpu;
963 int ret;
964 854
965 /* Pid */ 855 /* Pid */
966 if (verif_pid(s, ent->pid, cpu, data) == TRACE_TYPE_PARTIAL_LINE) 856 verif_pid(s, ent->pid, cpu, data);
967 return TRACE_TYPE_PARTIAL_LINE;
968 857
969 if (type) { 858 if (type)
970 /* Interrupt */ 859 /* Interrupt */
971 ret = print_graph_irq(iter, addr, type, cpu, ent->pid, flags); 860 print_graph_irq(iter, addr, type, cpu, ent->pid, flags);
972 if (ret == TRACE_TYPE_PARTIAL_LINE)
973 return TRACE_TYPE_PARTIAL_LINE;
974 }
975 861
976 if (!(trace_flags & TRACE_ITER_CONTEXT_INFO)) 862 if (!(trace_flags & TRACE_ITER_CONTEXT_INFO))
977 return 0; 863 return;
978 864
979 /* Absolute time */ 865 /* Absolute time */
980 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { 866 if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
981 ret = print_graph_abs_time(iter->ts, s); 867 print_graph_abs_time(iter->ts, s);
982 if (!ret)
983 return TRACE_TYPE_PARTIAL_LINE;
984 }
985 868
986 /* Cpu */ 869 /* Cpu */
987 if (flags & TRACE_GRAPH_PRINT_CPU) { 870 if (flags & TRACE_GRAPH_PRINT_CPU)
988 ret = print_graph_cpu(s, cpu); 871 print_graph_cpu(s, cpu);
989 if (ret == TRACE_TYPE_PARTIAL_LINE)
990 return TRACE_TYPE_PARTIAL_LINE;
991 }
992 872
993 /* Proc */ 873 /* Proc */
994 if (flags & TRACE_GRAPH_PRINT_PROC) { 874 if (flags & TRACE_GRAPH_PRINT_PROC) {
995 ret = print_graph_proc(s, ent->pid); 875 print_graph_proc(s, ent->pid);
996 if (ret == TRACE_TYPE_PARTIAL_LINE) 876 trace_seq_puts(s, " | ");
997 return TRACE_TYPE_PARTIAL_LINE;
998
999 ret = trace_seq_puts(s, " | ");
1000 if (!ret)
1001 return TRACE_TYPE_PARTIAL_LINE;
1002 } 877 }
1003 878
1004 /* Latency format */ 879 /* Latency format */
1005 if (trace_flags & TRACE_ITER_LATENCY_FMT) { 880 if (trace_flags & TRACE_ITER_LATENCY_FMT)
1006 ret = print_graph_lat_fmt(s, ent); 881 print_graph_lat_fmt(s, ent);
1007 if (ret == TRACE_TYPE_PARTIAL_LINE)
1008 return TRACE_TYPE_PARTIAL_LINE;
1009 }
1010 882
1011 return 0; 883 return;
1012} 884}
1013 885
1014/* 886/*
@@ -1126,8 +998,7 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
1126 if (check_irq_entry(iter, flags, call->func, call->depth)) 998 if (check_irq_entry(iter, flags, call->func, call->depth))
1127 return TRACE_TYPE_HANDLED; 999 return TRACE_TYPE_HANDLED;
1128 1000
1129 if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags)) 1001 print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags);
1130 return TRACE_TYPE_PARTIAL_LINE;
1131 1002
1132 leaf_ret = get_return_for_leaf(iter, field); 1003 leaf_ret = get_return_for_leaf(iter, field);
1133 if (leaf_ret) 1004 if (leaf_ret)
@@ -1160,7 +1031,6 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
1160 pid_t pid = ent->pid; 1031 pid_t pid = ent->pid;
1161 int cpu = iter->cpu; 1032 int cpu = iter->cpu;
1162 int func_match = 1; 1033 int func_match = 1;
1163 int ret;
1164 int i; 1034 int i;
1165 1035
1166 if (check_irq_return(iter, flags, trace->depth)) 1036 if (check_irq_return(iter, flags, trace->depth))
@@ -1186,20 +1056,14 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
1186 } 1056 }
1187 } 1057 }
1188 1058
1189 if (print_graph_prologue(iter, s, 0, 0, flags)) 1059 print_graph_prologue(iter, s, 0, 0, flags);
1190 return TRACE_TYPE_PARTIAL_LINE;
1191 1060
1192 /* Overhead and duration */ 1061 /* Overhead and duration */
1193 ret = print_graph_duration(duration, s, flags); 1062 print_graph_duration(duration, s, flags);
1194 if (ret == TRACE_TYPE_PARTIAL_LINE)
1195 return TRACE_TYPE_PARTIAL_LINE;
1196 1063
1197 /* Closing brace */ 1064 /* Closing brace */
1198 for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) { 1065 for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++)
1199 ret = trace_seq_putc(s, ' '); 1066 trace_seq_putc(s, ' ');
1200 if (!ret)
1201 return TRACE_TYPE_PARTIAL_LINE;
1202 }
1203 1067
1204 /* 1068 /*
1205 * If the return function does not have a matching entry, 1069 * If the return function does not have a matching entry,
@@ -1208,30 +1072,20 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
1208 * belongs to, write out the function name. Always do 1072 * belongs to, write out the function name. Always do
1209 * that if the funcgraph-tail option is enabled. 1073 * that if the funcgraph-tail option is enabled.
1210 */ 1074 */
1211 if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL)) { 1075 if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL))
1212 ret = trace_seq_puts(s, "}\n"); 1076 trace_seq_puts(s, "}\n");
1213 if (!ret) 1077 else
1214 return TRACE_TYPE_PARTIAL_LINE; 1078 trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func);
1215 } else {
1216 ret = trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func);
1217 if (!ret)
1218 return TRACE_TYPE_PARTIAL_LINE;
1219 }
1220 1079
1221 /* Overrun */ 1080 /* Overrun */
1222 if (flags & TRACE_GRAPH_PRINT_OVERRUN) { 1081 if (flags & TRACE_GRAPH_PRINT_OVERRUN)
1223 ret = trace_seq_printf(s, " (Overruns: %lu)\n", 1082 trace_seq_printf(s, " (Overruns: %lu)\n",
1224 trace->overrun); 1083 trace->overrun);
1225 if (!ret)
1226 return TRACE_TYPE_PARTIAL_LINE;
1227 }
1228 1084
1229 ret = print_graph_irq(iter, trace->func, TRACE_GRAPH_RET, 1085 print_graph_irq(iter, trace->func, TRACE_GRAPH_RET,
1230 cpu, pid, flags); 1086 cpu, pid, flags);
1231 if (ret == TRACE_TYPE_PARTIAL_LINE)
1232 return TRACE_TYPE_PARTIAL_LINE;
1233 1087
1234 return TRACE_TYPE_HANDLED; 1088 return trace_handle_return(s);
1235} 1089}
1236 1090
1237static enum print_line_t 1091static enum print_line_t
@@ -1248,26 +1102,18 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
1248 if (data) 1102 if (data)
1249 depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth; 1103 depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth;
1250 1104
1251 if (print_graph_prologue(iter, s, 0, 0, flags)) 1105 print_graph_prologue(iter, s, 0, 0, flags);
1252 return TRACE_TYPE_PARTIAL_LINE;
1253 1106
1254 /* No time */ 1107 /* No time */
1255 ret = print_graph_duration(0, s, flags | FLAGS_FILL_FULL); 1108 print_graph_duration(0, s, flags | FLAGS_FILL_FULL);
1256 if (ret != TRACE_TYPE_HANDLED)
1257 return ret;
1258 1109
1259 /* Indentation */ 1110 /* Indentation */
1260 if (depth > 0) 1111 if (depth > 0)
1261 for (i = 0; i < (depth + 1) * TRACE_GRAPH_INDENT; i++) { 1112 for (i = 0; i < (depth + 1) * TRACE_GRAPH_INDENT; i++)
1262 ret = trace_seq_putc(s, ' '); 1113 trace_seq_putc(s, ' ');
1263 if (!ret)
1264 return TRACE_TYPE_PARTIAL_LINE;
1265 }
1266 1114
1267 /* The comment */ 1115 /* The comment */
1268 ret = trace_seq_puts(s, "/* "); 1116 trace_seq_puts(s, "/* ");
1269 if (!ret)
1270 return TRACE_TYPE_PARTIAL_LINE;
1271 1117
1272 switch (iter->ent->type) { 1118 switch (iter->ent->type) {
1273 case TRACE_BPRINT: 1119 case TRACE_BPRINT:
@@ -1290,17 +1136,18 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
1290 return ret; 1136 return ret;
1291 } 1137 }
1292 1138
1139 if (trace_seq_has_overflowed(s))
1140 goto out;
1141
1293 /* Strip ending newline */ 1142 /* Strip ending newline */
1294 if (s->buffer[s->len - 1] == '\n') { 1143 if (s->buffer[s->seq.len - 1] == '\n') {
1295 s->buffer[s->len - 1] = '\0'; 1144 s->buffer[s->seq.len - 1] = '\0';
1296 s->len--; 1145 s->seq.len--;
1297 } 1146 }
1298 1147
1299 ret = trace_seq_puts(s, " */\n"); 1148 trace_seq_puts(s, " */\n");
1300 if (!ret) 1149 out:
1301 return TRACE_TYPE_PARTIAL_LINE; 1150 return trace_handle_return(s);
1302
1303 return TRACE_TYPE_HANDLED;
1304} 1151}
1305 1152
1306 1153
@@ -1407,32 +1254,32 @@ static void __print_graph_headers_flags(struct seq_file *s, u32 flags)
1407 print_lat_header(s, flags); 1254 print_lat_header(s, flags);
1408 1255
1409 /* 1st line */ 1256 /* 1st line */
1410 seq_printf(s, "#"); 1257 seq_putc(s, '#');
1411 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) 1258 if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
1412 seq_printf(s, " TIME "); 1259 seq_puts(s, " TIME ");
1413 if (flags & TRACE_GRAPH_PRINT_CPU) 1260 if (flags & TRACE_GRAPH_PRINT_CPU)
1414 seq_printf(s, " CPU"); 1261 seq_puts(s, " CPU");
1415 if (flags & TRACE_GRAPH_PRINT_PROC) 1262 if (flags & TRACE_GRAPH_PRINT_PROC)
1416 seq_printf(s, " TASK/PID "); 1263 seq_puts(s, " TASK/PID ");
1417 if (lat) 1264 if (lat)
1418 seq_printf(s, "||||"); 1265 seq_puts(s, "||||");
1419 if (flags & TRACE_GRAPH_PRINT_DURATION) 1266 if (flags & TRACE_GRAPH_PRINT_DURATION)
1420 seq_printf(s, " DURATION "); 1267 seq_puts(s, " DURATION ");
1421 seq_printf(s, " FUNCTION CALLS\n"); 1268 seq_puts(s, " FUNCTION CALLS\n");
1422 1269
1423 /* 2nd line */ 1270 /* 2nd line */
1424 seq_printf(s, "#"); 1271 seq_putc(s, '#');
1425 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) 1272 if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
1426 seq_printf(s, " | "); 1273 seq_puts(s, " | ");
1427 if (flags & TRACE_GRAPH_PRINT_CPU) 1274 if (flags & TRACE_GRAPH_PRINT_CPU)
1428 seq_printf(s, " | "); 1275 seq_puts(s, " | ");
1429 if (flags & TRACE_GRAPH_PRINT_PROC) 1276 if (flags & TRACE_GRAPH_PRINT_PROC)
1430 seq_printf(s, " | | "); 1277 seq_puts(s, " | | ");
1431 if (lat) 1278 if (lat)
1432 seq_printf(s, "||||"); 1279 seq_puts(s, "||||");
1433 if (flags & TRACE_GRAPH_PRINT_DURATION) 1280 if (flags & TRACE_GRAPH_PRINT_DURATION)
1434 seq_printf(s, " | | "); 1281 seq_puts(s, " | | ");
1435 seq_printf(s, " | | | |\n"); 1282 seq_puts(s, " | | | |\n");
1436} 1283}
1437 1284
1438static void print_graph_headers(struct seq_file *s) 1285static void print_graph_headers(struct seq_file *s)
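The graph-tracer hunks above are the template for the rest of this patch: trace_seq writers no longer report a per-call status, the sequence instead carries an overflow flag, and each printer reports success or failure once at the end. The two helpers the converted code leans on look roughly like the following sketch; the exact definitions introduced by this series live in include/linux/trace_seq.h and include/linux/ftrace_event.h, not in the hunks shown here.

/* Approximate definitions, for orientation only. */
static inline bool trace_seq_has_overflowed(struct trace_seq *s)
{
	return s->full || seq_buf_has_overflowed(&s->seq);
}

static inline enum print_line_t trace_handle_return(struct trace_seq *s)
{
	return trace_seq_has_overflowed(s) ?
		TRACE_TYPE_PARTIAL_LINE : TRACE_TYPE_HANDLED;
}

A printer can therefore emit everything unconditionally; only the final trace_handle_return() decides between TRACE_TYPE_HANDLED and TRACE_TYPE_PARTIAL_LINE.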
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
index bd90e1b06088..3ccf5c2c1320 100644
--- a/kernel/trace/trace_kdb.c
+++ b/kernel/trace/trace_kdb.c
@@ -20,10 +20,12 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file)
20{ 20{
21 /* use static because iter can be a bit big for the stack */ 21 /* use static because iter can be a bit big for the stack */
22 static struct trace_iterator iter; 22 static struct trace_iterator iter;
23 static struct ring_buffer_iter *buffer_iter[CONFIG_NR_CPUS];
23 unsigned int old_userobj; 24 unsigned int old_userobj;
24 int cnt = 0, cpu; 25 int cnt = 0, cpu;
25 26
26 trace_init_global_iter(&iter); 27 trace_init_global_iter(&iter);
28 iter.buffer_iter = buffer_iter;
27 29
28 for_each_tracing_cpu(cpu) { 30 for_each_tracing_cpu(cpu) {
29 atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); 31 atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
@@ -57,19 +59,19 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file)
57 ring_buffer_read_start(iter.buffer_iter[cpu_file]); 59 ring_buffer_read_start(iter.buffer_iter[cpu_file]);
58 tracing_iter_reset(&iter, cpu_file); 60 tracing_iter_reset(&iter, cpu_file);
59 } 61 }
60 if (!trace_empty(&iter)) 62
61 trace_find_next_entry_inc(&iter); 63 while (trace_find_next_entry_inc(&iter)) {
62 while (!trace_empty(&iter)) {
63 if (!cnt) 64 if (!cnt)
64 kdb_printf("---------------------------------\n"); 65 kdb_printf("---------------------------------\n");
65 cnt++; 66 cnt++;
66 67
67 if (trace_find_next_entry_inc(&iter) != NULL && !skip_lines) 68 if (!skip_lines) {
68 print_trace_line(&iter); 69 print_trace_line(&iter);
69 if (!skip_lines)
70 trace_printk_seq(&iter.seq); 70 trace_printk_seq(&iter.seq);
71 else 71 } else {
72 skip_lines--; 72 skip_lines--;
73 }
74
73 if (KDB_FLAG(CMD_INTERRUPT)) 75 if (KDB_FLAG(CMD_INTERRUPT))
74 goto out; 76 goto out;
75 } 77 }
@@ -86,9 +88,12 @@ out:
86 atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); 88 atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
87 } 89 }
88 90
89 for_each_tracing_cpu(cpu) 91 for_each_tracing_cpu(cpu) {
90 if (iter.buffer_iter[cpu]) 92 if (iter.buffer_iter[cpu]) {
91 ring_buffer_read_finish(iter.buffer_iter[cpu]); 93 ring_buffer_read_finish(iter.buffer_iter[cpu]);
94 iter.buffer_iter[cpu] = NULL;
95 }
96 }
92} 97}
93 98
94/* 99/*
@@ -127,8 +132,8 @@ static int kdb_ftdump(int argc, const char **argv)
127 132
128static __init int kdb_ftrace_register(void) 133static __init int kdb_ftrace_register(void)
129{ 134{
130 kdb_register_repeat("ftdump", kdb_ftdump, "[skip_#lines] [cpu]", 135 kdb_register_flags("ftdump", kdb_ftdump, "[skip_#lines] [cpu]",
131 "Dump ftrace log", 0, KDB_REPEAT_NONE); 136 "Dump ftrace log", 0, KDB_ENABLE_ALWAYS_SAFE);
132 return 0; 137 return 0;
133} 138}
134 139
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 282f6e4e5539..5edb518be345 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -826,7 +826,7 @@ static int probes_seq_show(struct seq_file *m, void *v)
826 struct trace_kprobe *tk = v; 826 struct trace_kprobe *tk = v;
827 int i; 827 int i;
828 828
829 seq_printf(m, "%c", trace_kprobe_is_return(tk) ? 'r' : 'p'); 829 seq_putc(m, trace_kprobe_is_return(tk) ? 'r' : 'p');
830 seq_printf(m, ":%s/%s", tk->tp.call.class->system, 830 seq_printf(m, ":%s/%s", tk->tp.call.class->system,
831 ftrace_event_name(&tk->tp.call)); 831 ftrace_event_name(&tk->tp.call));
832 832
@@ -840,7 +840,7 @@ static int probes_seq_show(struct seq_file *m, void *v)
840 840
841 for (i = 0; i < tk->tp.nr_args; i++) 841 for (i = 0; i < tk->tp.nr_args; i++)
842 seq_printf(m, " %s=%s", tk->tp.args[i].name, tk->tp.args[i].comm); 842 seq_printf(m, " %s=%s", tk->tp.args[i].name, tk->tp.args[i].comm);
843 seq_printf(m, "\n"); 843 seq_putc(m, '\n');
844 844
845 return 0; 845 return 0;
846} 846}
@@ -1024,27 +1024,22 @@ print_kprobe_event(struct trace_iterator *iter, int flags,
1024 field = (struct kprobe_trace_entry_head *)iter->ent; 1024 field = (struct kprobe_trace_entry_head *)iter->ent;
1025 tp = container_of(event, struct trace_probe, call.event); 1025 tp = container_of(event, struct trace_probe, call.event);
1026 1026
1027 if (!trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call))) 1027 trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call));
1028 goto partial;
1029 1028
1030 if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET)) 1029 if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET))
1031 goto partial; 1030 goto out;
1032 1031
1033 if (!trace_seq_puts(s, ")")) 1032 trace_seq_putc(s, ')');
1034 goto partial;
1035 1033
1036 data = (u8 *)&field[1]; 1034 data = (u8 *)&field[1];
1037 for (i = 0; i < tp->nr_args; i++) 1035 for (i = 0; i < tp->nr_args; i++)
1038 if (!tp->args[i].type->print(s, tp->args[i].name, 1036 if (!tp->args[i].type->print(s, tp->args[i].name,
1039 data + tp->args[i].offset, field)) 1037 data + tp->args[i].offset, field))
1040 goto partial; 1038 goto out;
1041
1042 if (!trace_seq_puts(s, "\n"))
1043 goto partial;
1044 1039
1045 return TRACE_TYPE_HANDLED; 1040 trace_seq_putc(s, '\n');
1046partial: 1041 out:
1047 return TRACE_TYPE_PARTIAL_LINE; 1042 return trace_handle_return(s);
1048} 1043}
1049 1044
1050static enum print_line_t 1045static enum print_line_t
@@ -1060,33 +1055,28 @@ print_kretprobe_event(struct trace_iterator *iter, int flags,
1060 field = (struct kretprobe_trace_entry_head *)iter->ent; 1055 field = (struct kretprobe_trace_entry_head *)iter->ent;
1061 tp = container_of(event, struct trace_probe, call.event); 1056 tp = container_of(event, struct trace_probe, call.event);
1062 1057
1063 if (!trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call))) 1058 trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call));
1064 goto partial;
1065 1059
1066 if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET)) 1060 if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET))
1067 goto partial; 1061 goto out;
1068 1062
1069 if (!trace_seq_puts(s, " <- ")) 1063 trace_seq_puts(s, " <- ");
1070 goto partial;
1071 1064
1072 if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET)) 1065 if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET))
1073 goto partial; 1066 goto out;
1074 1067
1075 if (!trace_seq_puts(s, ")")) 1068 trace_seq_putc(s, ')');
1076 goto partial;
1077 1069
1078 data = (u8 *)&field[1]; 1070 data = (u8 *)&field[1];
1079 for (i = 0; i < tp->nr_args; i++) 1071 for (i = 0; i < tp->nr_args; i++)
1080 if (!tp->args[i].type->print(s, tp->args[i].name, 1072 if (!tp->args[i].type->print(s, tp->args[i].name,
1081 data + tp->args[i].offset, field)) 1073 data + tp->args[i].offset, field))
1082 goto partial; 1074 goto out;
1083 1075
1084 if (!trace_seq_puts(s, "\n")) 1076 trace_seq_putc(s, '\n');
1085 goto partial;
1086 1077
1087 return TRACE_TYPE_HANDLED; 1078 out:
1088partial: 1079 return trace_handle_return(s);
1089 return TRACE_TYPE_PARTIAL_LINE;
1090} 1080}
1091 1081
1092 1082
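The kprobe and kretprobe printers above show the calling convention that results: void trace_seq_* calls are made unconditionally, helpers that still return a status (seq_print_ip_sym(), the per-argument type->print() callbacks) jump to a single out label, and trace_handle_return() gives the final verdict. A minimal sketch of a printer in this style for a hypothetical event carrying an instruction pointer and a message; example_entry and its fields are illustrative, the helpers are the ones used in the hunks above.

struct example_entry {			/* hypothetical event payload */
	unsigned long	ip;
	char		msg[32];
};

static enum print_line_t
example_event_print(struct trace_iterator *iter, int flags,
		    struct trace_event *event)
{
	struct example_entry *field = (void *)iter->ent;
	struct trace_seq *s = &iter->seq;

	trace_seq_puts(s, "example: (");		/* no return value to check */
	if (!seq_print_ip_sym(s, field->ip, flags))	/* still reports status */
		goto out;
	trace_seq_printf(s, ") %s\n", field->msg);
 out:
	return trace_handle_return(s);			/* one check at the end */
}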
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 0abd9b863474..7a9ba62e9fef 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -59,17 +59,15 @@ static void mmio_trace_start(struct trace_array *tr)
59 mmio_reset_data(tr); 59 mmio_reset_data(tr);
60} 60}
61 61
62static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev) 62static void mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev)
63{ 63{
64 int ret = 0;
65 int i; 64 int i;
66 resource_size_t start, end; 65 resource_size_t start, end;
67 const struct pci_driver *drv = pci_dev_driver(dev); 66 const struct pci_driver *drv = pci_dev_driver(dev);
68 67
69 /* XXX: incomplete checks for trace_seq_printf() return value */ 68 trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x",
70 ret += trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x", 69 dev->bus->number, dev->devfn,
71 dev->bus->number, dev->devfn, 70 dev->vendor, dev->device, dev->irq);
72 dev->vendor, dev->device, dev->irq);
73 /* 71 /*
74 * XXX: is pci_resource_to_user() appropriate, since we are 72 * XXX: is pci_resource_to_user() appropriate, since we are
75 * supposed to interpret the __ioremap() phys_addr argument based on 73 * supposed to interpret the __ioremap() phys_addr argument based on
@@ -77,21 +75,20 @@ static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev)
77 */ 75 */
78 for (i = 0; i < 7; i++) { 76 for (i = 0; i < 7; i++) {
79 pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); 77 pci_resource_to_user(dev, i, &dev->resource[i], &start, &end);
80 ret += trace_seq_printf(s, " %llx", 78 trace_seq_printf(s, " %llx",
81 (unsigned long long)(start | 79 (unsigned long long)(start |
82 (dev->resource[i].flags & PCI_REGION_FLAG_MASK))); 80 (dev->resource[i].flags & PCI_REGION_FLAG_MASK)));
83 } 81 }
84 for (i = 0; i < 7; i++) { 82 for (i = 0; i < 7; i++) {
85 pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); 83 pci_resource_to_user(dev, i, &dev->resource[i], &start, &end);
86 ret += trace_seq_printf(s, " %llx", 84 trace_seq_printf(s, " %llx",
87 dev->resource[i].start < dev->resource[i].end ? 85 dev->resource[i].start < dev->resource[i].end ?
88 (unsigned long long)(end - start) + 1 : 0); 86 (unsigned long long)(end - start) + 1 : 0);
89 } 87 }
90 if (drv) 88 if (drv)
91 ret += trace_seq_printf(s, " %s\n", drv->name); 89 trace_seq_printf(s, " %s\n", drv->name);
92 else 90 else
93 ret += trace_seq_puts(s, " \n"); 91 trace_seq_puts(s, " \n");
94 return ret;
95} 92}
96 93
97static void destroy_header_iter(struct header_iter *hiter) 94static void destroy_header_iter(struct header_iter *hiter)
@@ -179,28 +176,27 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter)
179 unsigned long long t = ns2usecs(iter->ts); 176 unsigned long long t = ns2usecs(iter->ts);
180 unsigned long usec_rem = do_div(t, USEC_PER_SEC); 177 unsigned long usec_rem = do_div(t, USEC_PER_SEC);
181 unsigned secs = (unsigned long)t; 178 unsigned secs = (unsigned long)t;
182 int ret = 1;
183 179
184 trace_assign_type(field, entry); 180 trace_assign_type(field, entry);
185 rw = &field->rw; 181 rw = &field->rw;
186 182
187 switch (rw->opcode) { 183 switch (rw->opcode) {
188 case MMIO_READ: 184 case MMIO_READ:
189 ret = trace_seq_printf(s, 185 trace_seq_printf(s,
190 "R %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", 186 "R %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
191 rw->width, secs, usec_rem, rw->map_id, 187 rw->width, secs, usec_rem, rw->map_id,
192 (unsigned long long)rw->phys, 188 (unsigned long long)rw->phys,
193 rw->value, rw->pc, 0); 189 rw->value, rw->pc, 0);
194 break; 190 break;
195 case MMIO_WRITE: 191 case MMIO_WRITE:
196 ret = trace_seq_printf(s, 192 trace_seq_printf(s,
197 "W %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", 193 "W %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
198 rw->width, secs, usec_rem, rw->map_id, 194 rw->width, secs, usec_rem, rw->map_id,
199 (unsigned long long)rw->phys, 195 (unsigned long long)rw->phys,
200 rw->value, rw->pc, 0); 196 rw->value, rw->pc, 0);
201 break; 197 break;
202 case MMIO_UNKNOWN_OP: 198 case MMIO_UNKNOWN_OP:
203 ret = trace_seq_printf(s, 199 trace_seq_printf(s,
204 "UNKNOWN %u.%06lu %d 0x%llx %02lx,%02lx," 200 "UNKNOWN %u.%06lu %d 0x%llx %02lx,%02lx,"
205 "%02lx 0x%lx %d\n", 201 "%02lx 0x%lx %d\n",
206 secs, usec_rem, rw->map_id, 202 secs, usec_rem, rw->map_id,
@@ -209,12 +205,11 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter)
209 (rw->value >> 0) & 0xff, rw->pc, 0); 205 (rw->value >> 0) & 0xff, rw->pc, 0);
210 break; 206 break;
211 default: 207 default:
212 ret = trace_seq_puts(s, "rw what?\n"); 208 trace_seq_puts(s, "rw what?\n");
213 break; 209 break;
214 } 210 }
215 if (ret) 211
216 return TRACE_TYPE_HANDLED; 212 return trace_handle_return(s);
217 return TRACE_TYPE_PARTIAL_LINE;
218} 213}
219 214
220static enum print_line_t mmio_print_map(struct trace_iterator *iter) 215static enum print_line_t mmio_print_map(struct trace_iterator *iter)
@@ -226,31 +221,29 @@ static enum print_line_t mmio_print_map(struct trace_iterator *iter)
226 unsigned long long t = ns2usecs(iter->ts); 221 unsigned long long t = ns2usecs(iter->ts);
227 unsigned long usec_rem = do_div(t, USEC_PER_SEC); 222 unsigned long usec_rem = do_div(t, USEC_PER_SEC);
228 unsigned secs = (unsigned long)t; 223 unsigned secs = (unsigned long)t;
229 int ret;
230 224
231 trace_assign_type(field, entry); 225 trace_assign_type(field, entry);
232 m = &field->map; 226 m = &field->map;
233 227
234 switch (m->opcode) { 228 switch (m->opcode) {
235 case MMIO_PROBE: 229 case MMIO_PROBE:
236 ret = trace_seq_printf(s, 230 trace_seq_printf(s,
237 "MAP %u.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n", 231 "MAP %u.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n",
238 secs, usec_rem, m->map_id, 232 secs, usec_rem, m->map_id,
239 (unsigned long long)m->phys, m->virt, m->len, 233 (unsigned long long)m->phys, m->virt, m->len,
240 0UL, 0); 234 0UL, 0);
241 break; 235 break;
242 case MMIO_UNPROBE: 236 case MMIO_UNPROBE:
243 ret = trace_seq_printf(s, 237 trace_seq_printf(s,
244 "UNMAP %u.%06lu %d 0x%lx %d\n", 238 "UNMAP %u.%06lu %d 0x%lx %d\n",
245 secs, usec_rem, m->map_id, 0UL, 0); 239 secs, usec_rem, m->map_id, 0UL, 0);
246 break; 240 break;
247 default: 241 default:
248 ret = trace_seq_puts(s, "map what?\n"); 242 trace_seq_puts(s, "map what?\n");
249 break; 243 break;
250 } 244 }
251 if (ret) 245
252 return TRACE_TYPE_HANDLED; 246 return trace_handle_return(s);
253 return TRACE_TYPE_PARTIAL_LINE;
254} 247}
255 248
256static enum print_line_t mmio_print_mark(struct trace_iterator *iter) 249static enum print_line_t mmio_print_mark(struct trace_iterator *iter)
@@ -262,14 +255,11 @@ static enum print_line_t mmio_print_mark(struct trace_iterator *iter)
262 unsigned long long t = ns2usecs(iter->ts); 255 unsigned long long t = ns2usecs(iter->ts);
263 unsigned long usec_rem = do_div(t, USEC_PER_SEC); 256 unsigned long usec_rem = do_div(t, USEC_PER_SEC);
264 unsigned secs = (unsigned long)t; 257 unsigned secs = (unsigned long)t;
265 int ret;
266 258
267 /* The trailing newline must be in the message. */ 259 /* The trailing newline must be in the message. */
268 ret = trace_seq_printf(s, "MARK %u.%06lu %s", secs, usec_rem, msg); 260 trace_seq_printf(s, "MARK %u.%06lu %s", secs, usec_rem, msg);
269 if (!ret)
270 return TRACE_TYPE_PARTIAL_LINE;
271 261
272 return TRACE_TYPE_HANDLED; 262 return trace_handle_return(s);
273} 263}
274 264
275static enum print_line_t mmio_print_line(struct trace_iterator *iter) 265static enum print_line_t mmio_print_line(struct trace_iterator *iter)
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index c6977d5a9b12..b77b9a697619 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -25,15 +25,12 @@ enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter)
25 struct trace_seq *s = &iter->seq; 25 struct trace_seq *s = &iter->seq;
26 struct trace_entry *entry = iter->ent; 26 struct trace_entry *entry = iter->ent;
27 struct bputs_entry *field; 27 struct bputs_entry *field;
28 int ret;
29 28
30 trace_assign_type(field, entry); 29 trace_assign_type(field, entry);
31 30
32 ret = trace_seq_puts(s, field->str); 31 trace_seq_puts(s, field->str);
33 if (!ret)
34 return TRACE_TYPE_PARTIAL_LINE;
35 32
36 return TRACE_TYPE_HANDLED; 33 return trace_handle_return(s);
37} 34}
38 35
39enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter) 36enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter)
@@ -41,15 +38,12 @@ enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter)
41 struct trace_seq *s = &iter->seq; 38 struct trace_seq *s = &iter->seq;
42 struct trace_entry *entry = iter->ent; 39 struct trace_entry *entry = iter->ent;
43 struct bprint_entry *field; 40 struct bprint_entry *field;
44 int ret;
45 41
46 trace_assign_type(field, entry); 42 trace_assign_type(field, entry);
47 43
48 ret = trace_seq_bprintf(s, field->fmt, field->buf); 44 trace_seq_bprintf(s, field->fmt, field->buf);
49 if (!ret)
50 return TRACE_TYPE_PARTIAL_LINE;
51 45
52 return TRACE_TYPE_HANDLED; 46 return trace_handle_return(s);
53} 47}
54 48
55enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter) 49enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter)
@@ -57,15 +51,12 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter)
57 struct trace_seq *s = &iter->seq; 51 struct trace_seq *s = &iter->seq;
58 struct trace_entry *entry = iter->ent; 52 struct trace_entry *entry = iter->ent;
59 struct print_entry *field; 53 struct print_entry *field;
60 int ret;
61 54
62 trace_assign_type(field, entry); 55 trace_assign_type(field, entry);
63 56
64 ret = trace_seq_puts(s, field->buf); 57 trace_seq_puts(s, field->buf);
65 if (!ret)
66 return TRACE_TYPE_PARTIAL_LINE;
67 58
68 return TRACE_TYPE_HANDLED; 59 return trace_handle_return(s);
69} 60}
70 61
71const char * 62const char *
@@ -124,7 +115,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
124 115
125 if (ret == (const char *)(trace_seq_buffer_ptr(p))) 116 if (ret == (const char *)(trace_seq_buffer_ptr(p)))
126 trace_seq_printf(p, "0x%lx", val); 117 trace_seq_printf(p, "0x%lx", val);
127 118
128 trace_seq_putc(p, 0); 119 trace_seq_putc(p, 0);
129 120
130 return ret; 121 return ret;
@@ -193,7 +184,6 @@ int ftrace_raw_output_prep(struct trace_iterator *iter,
193 struct trace_seq *s = &iter->seq; 184 struct trace_seq *s = &iter->seq;
194 struct trace_seq *p = &iter->tmp_seq; 185 struct trace_seq *p = &iter->tmp_seq;
195 struct trace_entry *entry; 186 struct trace_entry *entry;
196 int ret;
197 187
198 event = container_of(trace_event, struct ftrace_event_call, event); 188 event = container_of(trace_event, struct ftrace_event_call, event);
199 entry = iter->ent; 189 entry = iter->ent;
@@ -204,11 +194,9 @@ int ftrace_raw_output_prep(struct trace_iterator *iter,
204 } 194 }
205 195
206 trace_seq_init(p); 196 trace_seq_init(p);
207 ret = trace_seq_printf(s, "%s: ", ftrace_event_name(event)); 197 trace_seq_printf(s, "%s: ", ftrace_event_name(event));
208 if (!ret)
209 return TRACE_TYPE_PARTIAL_LINE;
210 198
211 return 0; 199 return trace_handle_return(s);
212} 200}
213EXPORT_SYMBOL(ftrace_raw_output_prep); 201EXPORT_SYMBOL(ftrace_raw_output_prep);
214 202
@@ -216,18 +204,11 @@ static int ftrace_output_raw(struct trace_iterator *iter, char *name,
216 char *fmt, va_list ap) 204 char *fmt, va_list ap)
217{ 205{
218 struct trace_seq *s = &iter->seq; 206 struct trace_seq *s = &iter->seq;
219 int ret;
220
221 ret = trace_seq_printf(s, "%s: ", name);
222 if (!ret)
223 return TRACE_TYPE_PARTIAL_LINE;
224
225 ret = trace_seq_vprintf(s, fmt, ap);
226 207
227 if (!ret) 208 trace_seq_printf(s, "%s: ", name);
228 return TRACE_TYPE_PARTIAL_LINE; 209 trace_seq_vprintf(s, fmt, ap);
229 210
230 return TRACE_TYPE_HANDLED; 211 return trace_handle_return(s);
231} 212}
232 213
233int ftrace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...) 214int ftrace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...)
@@ -260,7 +241,7 @@ static inline const char *kretprobed(const char *name)
260} 241}
261#endif /* CONFIG_KRETPROBES */ 242#endif /* CONFIG_KRETPROBES */
262 243
263static int 244static void
264seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address) 245seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address)
265{ 246{
266#ifdef CONFIG_KALLSYMS 247#ifdef CONFIG_KALLSYMS
@@ -271,12 +252,11 @@ seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address)
271 252
272 name = kretprobed(str); 253 name = kretprobed(str);
273 254
274 return trace_seq_printf(s, fmt, name); 255 trace_seq_printf(s, fmt, name);
275#endif 256#endif
276 return 1;
277} 257}
278 258
279static int 259static void
280seq_print_sym_offset(struct trace_seq *s, const char *fmt, 260seq_print_sym_offset(struct trace_seq *s, const char *fmt,
281 unsigned long address) 261 unsigned long address)
282{ 262{
@@ -287,9 +267,8 @@ seq_print_sym_offset(struct trace_seq *s, const char *fmt,
287 sprint_symbol(str, address); 267 sprint_symbol(str, address);
288 name = kretprobed(str); 268 name = kretprobed(str);
289 269
290 return trace_seq_printf(s, fmt, name); 270 trace_seq_printf(s, fmt, name);
291#endif 271#endif
292 return 1;
293} 272}
294 273
295#ifndef CONFIG_64BIT 274#ifndef CONFIG_64BIT
@@ -320,14 +299,14 @@ int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
320 if (file) { 299 if (file) {
321 ret = trace_seq_path(s, &file->f_path); 300 ret = trace_seq_path(s, &file->f_path);
322 if (ret) 301 if (ret)
323 ret = trace_seq_printf(s, "[+0x%lx]", 302 trace_seq_printf(s, "[+0x%lx]",
324 ip - vmstart); 303 ip - vmstart);
325 } 304 }
326 up_read(&mm->mmap_sem); 305 up_read(&mm->mmap_sem);
327 } 306 }
328 if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file)) 307 if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file))
329 ret = trace_seq_printf(s, " <" IP_FMT ">", ip); 308 trace_seq_printf(s, " <" IP_FMT ">", ip);
330 return ret; 309 return !trace_seq_has_overflowed(s);
331} 310}
332 311
333int 312int
@@ -335,7 +314,6 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
335 unsigned long sym_flags) 314 unsigned long sym_flags)
336{ 315{
337 struct mm_struct *mm = NULL; 316 struct mm_struct *mm = NULL;
338 int ret = 1;
339 unsigned int i; 317 unsigned int i;
340 318
341 if (trace_flags & TRACE_ITER_SYM_USEROBJ) { 319 if (trace_flags & TRACE_ITER_SYM_USEROBJ) {
@@ -354,48 +332,45 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
354 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { 332 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
355 unsigned long ip = entry->caller[i]; 333 unsigned long ip = entry->caller[i];
356 334
357 if (ip == ULONG_MAX || !ret) 335 if (ip == ULONG_MAX || trace_seq_has_overflowed(s))
358 break; 336 break;
359 if (ret) 337
360 ret = trace_seq_puts(s, " => "); 338 trace_seq_puts(s, " => ");
339
361 if (!ip) { 340 if (!ip) {
362 if (ret) 341 trace_seq_puts(s, "??");
363 ret = trace_seq_puts(s, "??"); 342 trace_seq_putc(s, '\n');
364 if (ret)
365 ret = trace_seq_putc(s, '\n');
366 continue; 343 continue;
367 } 344 }
368 if (!ret) 345
369 break; 346 seq_print_user_ip(s, mm, ip, sym_flags);
370 if (ret) 347 trace_seq_putc(s, '\n');
371 ret = seq_print_user_ip(s, mm, ip, sym_flags);
372 ret = trace_seq_putc(s, '\n');
373 } 348 }
374 349
375 if (mm) 350 if (mm)
376 mmput(mm); 351 mmput(mm);
377 return ret; 352
353 return !trace_seq_has_overflowed(s);
378} 354}
379 355
380int 356int
381seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) 357seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
382{ 358{
383 int ret; 359 if (!ip) {
384 360 trace_seq_putc(s, '0');
385 if (!ip) 361 goto out;
386 return trace_seq_putc(s, '0'); 362 }
387 363
388 if (sym_flags & TRACE_ITER_SYM_OFFSET) 364 if (sym_flags & TRACE_ITER_SYM_OFFSET)
389 ret = seq_print_sym_offset(s, "%s", ip); 365 seq_print_sym_offset(s, "%s", ip);
390 else 366 else
391 ret = seq_print_sym_short(s, "%s", ip); 367 seq_print_sym_short(s, "%s", ip);
392
393 if (!ret)
394 return 0;
395 368
396 if (sym_flags & TRACE_ITER_SYM_ADDR) 369 if (sym_flags & TRACE_ITER_SYM_ADDR)
397 ret = trace_seq_printf(s, " <" IP_FMT ">", ip); 370 trace_seq_printf(s, " <" IP_FMT ">", ip);
398 return ret; 371
372 out:
373 return !trace_seq_has_overflowed(s);
399} 374}
400 375
401/** 376/**
@@ -413,7 +388,6 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
413 char irqs_off; 388 char irqs_off;
414 int hardirq; 389 int hardirq;
415 int softirq; 390 int softirq;
416 int ret;
417 391
418 hardirq = entry->flags & TRACE_FLAG_HARDIRQ; 392 hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
419 softirq = entry->flags & TRACE_FLAG_SOFTIRQ; 393 softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
@@ -445,16 +419,15 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
445 softirq ? 's' : 419 softirq ? 's' :
446 '.'; 420 '.';
447 421
448 if (!trace_seq_printf(s, "%c%c%c", 422 trace_seq_printf(s, "%c%c%c",
449 irqs_off, need_resched, hardsoft_irq)) 423 irqs_off, need_resched, hardsoft_irq);
450 return 0;
451 424
452 if (entry->preempt_count) 425 if (entry->preempt_count)
453 ret = trace_seq_printf(s, "%x", entry->preempt_count); 426 trace_seq_printf(s, "%x", entry->preempt_count);
454 else 427 else
455 ret = trace_seq_putc(s, '.'); 428 trace_seq_putc(s, '.');
456 429
457 return ret; 430 return !trace_seq_has_overflowed(s);
458} 431}
459 432
460static int 433static int
@@ -464,14 +437,38 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
464 437
465 trace_find_cmdline(entry->pid, comm); 438 trace_find_cmdline(entry->pid, comm);
466 439
467 if (!trace_seq_printf(s, "%8.8s-%-5d %3d", 440 trace_seq_printf(s, "%8.8s-%-5d %3d",
468 comm, entry->pid, cpu)) 441 comm, entry->pid, cpu);
469 return 0;
470 442
471 return trace_print_lat_fmt(s, entry); 443 return trace_print_lat_fmt(s, entry);
472} 444}
473 445
474static unsigned long preempt_mark_thresh_us = 100; 446#undef MARK
447#define MARK(v, s) {.val = v, .sym = s}
448/* trace overhead mark */
449static const struct trace_mark {
450 unsigned long long val; /* unit: nsec */
451 char sym;
452} mark[] = {
453 MARK(1000000000ULL , '$'), /* 1 sec */
454 MARK(1000000ULL , '#'), /* 1000 usecs */
455 MARK(100000ULL , '!'), /* 100 usecs */
456 MARK(10000ULL , '+'), /* 10 usecs */
457};
458#undef MARK
459
460char trace_find_mark(unsigned long long d)
461{
462 int i;
463 int size = ARRAY_SIZE(mark);
464
465 for (i = 0; i < size; i++) {
466 if (d >= mark[i].val)
467 break;
468 }
469
470 return (i == size) ? ' ' : mark[i].sym;
471}
475 472
476static int 473static int
477lat_print_timestamp(struct trace_iterator *iter, u64 next_ts) 474lat_print_timestamp(struct trace_iterator *iter, u64 next_ts)
@@ -493,24 +490,28 @@ lat_print_timestamp(struct trace_iterator *iter, u64 next_ts)
493 unsigned long rel_usec = do_div(rel_ts, USEC_PER_MSEC); 490 unsigned long rel_usec = do_div(rel_ts, USEC_PER_MSEC);
494 unsigned long rel_msec = (unsigned long)rel_ts; 491 unsigned long rel_msec = (unsigned long)rel_ts;
495 492
496 return trace_seq_printf( 493 trace_seq_printf(
497 s, "[%08llx] %ld.%03ldms (+%ld.%03ldms): ", 494 s, "[%08llx] %ld.%03ldms (+%ld.%03ldms): ",
498 ns2usecs(iter->ts), 495 ns2usecs(iter->ts),
499 abs_msec, abs_usec, 496 abs_msec, abs_usec,
500 rel_msec, rel_usec); 497 rel_msec, rel_usec);
498
501 } else if (verbose && !in_ns) { 499 } else if (verbose && !in_ns) {
502 return trace_seq_printf( 500 trace_seq_printf(
503 s, "[%016llx] %lld (+%lld): ", 501 s, "[%016llx] %lld (+%lld): ",
504 iter->ts, abs_ts, rel_ts); 502 iter->ts, abs_ts, rel_ts);
503
505 } else if (!verbose && in_ns) { 504 } else if (!verbose && in_ns) {
506 return trace_seq_printf( 505 trace_seq_printf(
507 s, " %4lldus%c: ", 506 s, " %4lldus%c: ",
508 abs_ts, 507 abs_ts,
509 rel_ts > preempt_mark_thresh_us ? '!' : 508 trace_find_mark(rel_ts * NSEC_PER_USEC));
510 rel_ts > 1 ? '+' : ' '); 509
511 } else { /* !verbose && !in_ns */ 510 } else { /* !verbose && !in_ns */
512 return trace_seq_printf(s, " %4lld: ", abs_ts); 511 trace_seq_printf(s, " %4lld: ", abs_ts);
513 } 512 }
513
514 return !trace_seq_has_overflowed(s);
514} 515}
515 516
516int trace_print_context(struct trace_iterator *iter) 517int trace_print_context(struct trace_iterator *iter)
@@ -520,34 +521,29 @@ int trace_print_context(struct trace_iterator *iter)
520 unsigned long long t; 521 unsigned long long t;
521 unsigned long secs, usec_rem; 522 unsigned long secs, usec_rem;
522 char comm[TASK_COMM_LEN]; 523 char comm[TASK_COMM_LEN];
523 int ret;
524 524
525 trace_find_cmdline(entry->pid, comm); 525 trace_find_cmdline(entry->pid, comm);
526 526
527 ret = trace_seq_printf(s, "%16s-%-5d [%03d] ", 527 trace_seq_printf(s, "%16s-%-5d [%03d] ",
528 comm, entry->pid, iter->cpu); 528 comm, entry->pid, iter->cpu);
529 if (!ret)
530 return 0;
531 529
532 if (trace_flags & TRACE_ITER_IRQ_INFO) { 530 if (trace_flags & TRACE_ITER_IRQ_INFO)
533 ret = trace_print_lat_fmt(s, entry); 531 trace_print_lat_fmt(s, entry);
534 if (!ret)
535 return 0;
536 }
537 532
538 if (iter->iter_flags & TRACE_FILE_TIME_IN_NS) { 533 if (iter->iter_flags & TRACE_FILE_TIME_IN_NS) {
539 t = ns2usecs(iter->ts); 534 t = ns2usecs(iter->ts);
540 usec_rem = do_div(t, USEC_PER_SEC); 535 usec_rem = do_div(t, USEC_PER_SEC);
541 secs = (unsigned long)t; 536 secs = (unsigned long)t;
542 return trace_seq_printf(s, " %5lu.%06lu: ", secs, usec_rem); 537 trace_seq_printf(s, " %5lu.%06lu: ", secs, usec_rem);
543 } else 538 } else
544 return trace_seq_printf(s, " %12llu: ", iter->ts); 539 trace_seq_printf(s, " %12llu: ", iter->ts);
540
541 return !trace_seq_has_overflowed(s);
545} 542}
546 543
547int trace_print_lat_context(struct trace_iterator *iter) 544int trace_print_lat_context(struct trace_iterator *iter)
548{ 545{
549 u64 next_ts; 546 u64 next_ts;
550 int ret;
551 /* trace_find_next_entry will reset ent_size */ 547 /* trace_find_next_entry will reset ent_size */
552 int ent_size = iter->ent_size; 548 int ent_size = iter->ent_size;
553 struct trace_seq *s = &iter->seq; 549 struct trace_seq *s = &iter->seq;
@@ -567,18 +563,17 @@ int trace_print_lat_context(struct trace_iterator *iter)
567 563
568 trace_find_cmdline(entry->pid, comm); 564 trace_find_cmdline(entry->pid, comm);
569 565
570 ret = trace_seq_printf( 566 trace_seq_printf(
571 s, "%16s %5d %3d %d %08x %08lx ", 567 s, "%16s %5d %3d %d %08x %08lx ",
572 comm, entry->pid, iter->cpu, entry->flags, 568 comm, entry->pid, iter->cpu, entry->flags,
573 entry->preempt_count, iter->idx); 569 entry->preempt_count, iter->idx);
574 } else { 570 } else {
575 ret = lat_print_generic(s, entry, iter->cpu); 571 lat_print_generic(s, entry, iter->cpu);
576 } 572 }
577 573
578 if (ret) 574 lat_print_timestamp(iter, next_ts);
579 ret = lat_print_timestamp(iter, next_ts);
580 575
581 return ret; 576 return !trace_seq_has_overflowed(s);
582} 577}
583 578
584static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; 579static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
@@ -692,7 +687,7 @@ int register_ftrace_event(struct trace_event *event)
692 goto out; 687 goto out;
693 688
694 } else { 689 } else {
695 690
696 event->type = next_event_type++; 691 event->type = next_event_type++;
697 list = &ftrace_event_list; 692 list = &ftrace_event_list;
698 } 693 }
@@ -764,10 +759,9 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_event);
764enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags, 759enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags,
765 struct trace_event *event) 760 struct trace_event *event)
766{ 761{
767 if (!trace_seq_printf(&iter->seq, "type: %d\n", iter->ent->type)) 762 trace_seq_printf(&iter->seq, "type: %d\n", iter->ent->type);
768 return TRACE_TYPE_PARTIAL_LINE;
769 763
770 return TRACE_TYPE_HANDLED; 764 return trace_handle_return(&iter->seq);
771} 765}
772 766
773/* TRACE_FN */ 767/* TRACE_FN */
@@ -779,24 +773,16 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags,
779 773
780 trace_assign_type(field, iter->ent); 774 trace_assign_type(field, iter->ent);
781 775
782 if (!seq_print_ip_sym(s, field->ip, flags)) 776 seq_print_ip_sym(s, field->ip, flags);
783 goto partial;
784 777
785 if ((flags & TRACE_ITER_PRINT_PARENT) && field->parent_ip) { 778 if ((flags & TRACE_ITER_PRINT_PARENT) && field->parent_ip) {
786 if (!trace_seq_puts(s, " <-")) 779 trace_seq_puts(s, " <-");
787 goto partial; 780 seq_print_ip_sym(s, field->parent_ip, flags);
788 if (!seq_print_ip_sym(s,
789 field->parent_ip,
790 flags))
791 goto partial;
792 } 781 }
793 if (!trace_seq_putc(s, '\n'))
794 goto partial;
795 782
796 return TRACE_TYPE_HANDLED; 783 trace_seq_putc(s, '\n');
797 784
798 partial: 785 return trace_handle_return(s);
799 return TRACE_TYPE_PARTIAL_LINE;
800} 786}
801 787
802static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags, 788static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags,
@@ -806,12 +792,11 @@ static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags,
806 792
807 trace_assign_type(field, iter->ent); 793 trace_assign_type(field, iter->ent);
808 794
809 if (!trace_seq_printf(&iter->seq, "%lx %lx\n", 795 trace_seq_printf(&iter->seq, "%lx %lx\n",
810 field->ip, 796 field->ip,
811 field->parent_ip)) 797 field->parent_ip);
812 return TRACE_TYPE_PARTIAL_LINE;
813 798
814 return TRACE_TYPE_HANDLED; 799 return trace_handle_return(&iter->seq);
815} 800}
816 801
817static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags, 802static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags,
@@ -822,10 +807,10 @@ static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags,
822 807
823 trace_assign_type(field, iter->ent); 808 trace_assign_type(field, iter->ent);
824 809
825 SEQ_PUT_HEX_FIELD_RET(s, field->ip); 810 SEQ_PUT_HEX_FIELD(s, field->ip);
826 SEQ_PUT_HEX_FIELD_RET(s, field->parent_ip); 811 SEQ_PUT_HEX_FIELD(s, field->parent_ip);
827 812
828 return TRACE_TYPE_HANDLED; 813 return trace_handle_return(s);
829} 814}
830 815
831static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags, 816static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags,
@@ -836,10 +821,10 @@ static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags,
836 821
837 trace_assign_type(field, iter->ent); 822 trace_assign_type(field, iter->ent);
838 823
839 SEQ_PUT_FIELD_RET(s, field->ip); 824 SEQ_PUT_FIELD(s, field->ip);
840 SEQ_PUT_FIELD_RET(s, field->parent_ip); 825 SEQ_PUT_FIELD(s, field->parent_ip);
841 826
842 return TRACE_TYPE_HANDLED; 827 return trace_handle_return(s);
843} 828}
844 829
845static struct trace_event_functions trace_fn_funcs = { 830static struct trace_event_functions trace_fn_funcs = {
@@ -868,18 +853,17 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
868 T = task_state_char(field->next_state); 853 T = task_state_char(field->next_state);
869 S = task_state_char(field->prev_state); 854 S = task_state_char(field->prev_state);
870 trace_find_cmdline(field->next_pid, comm); 855 trace_find_cmdline(field->next_pid, comm);
871 if (!trace_seq_printf(&iter->seq, 856 trace_seq_printf(&iter->seq,
872 " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", 857 " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
873 field->prev_pid, 858 field->prev_pid,
874 field->prev_prio, 859 field->prev_prio,
875 S, delim, 860 S, delim,
876 field->next_cpu, 861 field->next_cpu,
877 field->next_pid, 862 field->next_pid,
878 field->next_prio, 863 field->next_prio,
879 T, comm)) 864 T, comm);
880 return TRACE_TYPE_PARTIAL_LINE; 865
881 866 return trace_handle_return(&iter->seq);
882 return TRACE_TYPE_HANDLED;
883} 867}
884 868
885static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags, 869static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags,
@@ -904,17 +888,16 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S)
904 if (!S) 888 if (!S)
905 S = task_state_char(field->prev_state); 889 S = task_state_char(field->prev_state);
906 T = task_state_char(field->next_state); 890 T = task_state_char(field->next_state);
907 if (!trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n", 891 trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n",
908 field->prev_pid, 892 field->prev_pid,
909 field->prev_prio, 893 field->prev_prio,
910 S, 894 S,
911 field->next_cpu, 895 field->next_cpu,
912 field->next_pid, 896 field->next_pid,
913 field->next_prio, 897 field->next_prio,
914 T)) 898 T);
915 return TRACE_TYPE_PARTIAL_LINE; 899
916 900 return trace_handle_return(&iter->seq);
917 return TRACE_TYPE_HANDLED;
918} 901}
919 902
920static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags, 903static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags,
@@ -942,15 +925,15 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S)
942 S = task_state_char(field->prev_state); 925 S = task_state_char(field->prev_state);
943 T = task_state_char(field->next_state); 926 T = task_state_char(field->next_state);
944 927
945 SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid); 928 SEQ_PUT_HEX_FIELD(s, field->prev_pid);
946 SEQ_PUT_HEX_FIELD_RET(s, field->prev_prio); 929 SEQ_PUT_HEX_FIELD(s, field->prev_prio);
947 SEQ_PUT_HEX_FIELD_RET(s, S); 930 SEQ_PUT_HEX_FIELD(s, S);
948 SEQ_PUT_HEX_FIELD_RET(s, field->next_cpu); 931 SEQ_PUT_HEX_FIELD(s, field->next_cpu);
949 SEQ_PUT_HEX_FIELD_RET(s, field->next_pid); 932 SEQ_PUT_HEX_FIELD(s, field->next_pid);
950 SEQ_PUT_HEX_FIELD_RET(s, field->next_prio); 933 SEQ_PUT_HEX_FIELD(s, field->next_prio);
951 SEQ_PUT_HEX_FIELD_RET(s, T); 934 SEQ_PUT_HEX_FIELD(s, T);
952 935
953 return TRACE_TYPE_HANDLED; 936 return trace_handle_return(s);
954} 937}
955 938
956static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags, 939static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags,
@@ -973,14 +956,15 @@ static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter,
973 956
974 trace_assign_type(field, iter->ent); 957 trace_assign_type(field, iter->ent);
975 958
976 SEQ_PUT_FIELD_RET(s, field->prev_pid); 959 SEQ_PUT_FIELD(s, field->prev_pid);
977 SEQ_PUT_FIELD_RET(s, field->prev_prio); 960 SEQ_PUT_FIELD(s, field->prev_prio);
978 SEQ_PUT_FIELD_RET(s, field->prev_state); 961 SEQ_PUT_FIELD(s, field->prev_state);
979 SEQ_PUT_FIELD_RET(s, field->next_pid); 962 SEQ_PUT_FIELD(s, field->next_cpu);
980 SEQ_PUT_FIELD_RET(s, field->next_prio); 963 SEQ_PUT_FIELD(s, field->next_pid);
981 SEQ_PUT_FIELD_RET(s, field->next_state); 964 SEQ_PUT_FIELD(s, field->next_prio);
965 SEQ_PUT_FIELD(s, field->next_state);
982 966
983 return TRACE_TYPE_HANDLED; 967 return trace_handle_return(s);
984} 968}
985 969
986static struct trace_event_functions trace_ctx_funcs = { 970static struct trace_event_functions trace_ctx_funcs = {
@@ -1020,23 +1004,19 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
1020 trace_assign_type(field, iter->ent); 1004 trace_assign_type(field, iter->ent);
1021 end = (unsigned long *)((long)iter->ent + iter->ent_size); 1005 end = (unsigned long *)((long)iter->ent + iter->ent_size);
1022 1006
1023 if (!trace_seq_puts(s, "<stack trace>\n")) 1007 trace_seq_puts(s, "<stack trace>\n");
1024 goto partial;
1025 1008
1026 for (p = field->caller; p && *p != ULONG_MAX && p < end; p++) { 1009 for (p = field->caller; p && *p != ULONG_MAX && p < end; p++) {
1027 if (!trace_seq_puts(s, " => "))
1028 goto partial;
1029 1010
1030 if (!seq_print_ip_sym(s, *p, flags)) 1011 if (trace_seq_has_overflowed(s))
1031 goto partial; 1012 break;
1032 if (!trace_seq_putc(s, '\n'))
1033 goto partial;
1034 }
1035 1013
1036 return TRACE_TYPE_HANDLED; 1014 trace_seq_puts(s, " => ");
1015 seq_print_ip_sym(s, *p, flags);
1016 trace_seq_putc(s, '\n');
1017 }
1037 1018
1038 partial: 1019 return trace_handle_return(s);
1039 return TRACE_TYPE_PARTIAL_LINE;
1040} 1020}
1041 1021
1042static struct trace_event_functions trace_stack_funcs = { 1022static struct trace_event_functions trace_stack_funcs = {
@@ -1057,16 +1037,10 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
1057 1037
1058 trace_assign_type(field, iter->ent); 1038 trace_assign_type(field, iter->ent);
1059 1039
1060 if (!trace_seq_puts(s, "<user stack trace>\n")) 1040 trace_seq_puts(s, "<user stack trace>\n");
1061 goto partial; 1041 seq_print_userip_objs(field, s, flags);
1062
1063 if (!seq_print_userip_objs(field, s, flags))
1064 goto partial;
1065
1066 return TRACE_TYPE_HANDLED;
1067 1042
1068 partial: 1043 return trace_handle_return(s);
1069 return TRACE_TYPE_PARTIAL_LINE;
1070} 1044}
1071 1045
1072static struct trace_event_functions trace_user_stack_funcs = { 1046static struct trace_event_functions trace_user_stack_funcs = {
@@ -1089,19 +1063,11 @@ trace_bputs_print(struct trace_iterator *iter, int flags,
1089 1063
1090 trace_assign_type(field, entry); 1064 trace_assign_type(field, entry);
1091 1065
1092 if (!seq_print_ip_sym(s, field->ip, flags)) 1066 seq_print_ip_sym(s, field->ip, flags);
1093 goto partial; 1067 trace_seq_puts(s, ": ");
1068 trace_seq_puts(s, field->str);
1094 1069
1095 if (!trace_seq_puts(s, ": ")) 1070 return trace_handle_return(s);
1096 goto partial;
1097
1098 if (!trace_seq_puts(s, field->str))
1099 goto partial;
1100
1101 return TRACE_TYPE_HANDLED;
1102
1103 partial:
1104 return TRACE_TYPE_PARTIAL_LINE;
1105} 1071}
1106 1072
1107 1073
@@ -1114,16 +1080,10 @@ trace_bputs_raw(struct trace_iterator *iter, int flags,
1114 1080
1115 trace_assign_type(field, iter->ent); 1081 trace_assign_type(field, iter->ent);
1116 1082
1117 if (!trace_seq_printf(s, ": %lx : ", field->ip)) 1083 trace_seq_printf(s, ": %lx : ", field->ip);
1118 goto partial; 1084 trace_seq_puts(s, field->str);
1119
1120 if (!trace_seq_puts(s, field->str))
1121 goto partial;
1122 1085
1123 return TRACE_TYPE_HANDLED; 1086 return trace_handle_return(s);
1124
1125 partial:
1126 return TRACE_TYPE_PARTIAL_LINE;
1127} 1087}
1128 1088
1129static struct trace_event_functions trace_bputs_funcs = { 1089static struct trace_event_functions trace_bputs_funcs = {
@@ -1147,19 +1107,11 @@ trace_bprint_print(struct trace_iterator *iter, int flags,
1147 1107
1148 trace_assign_type(field, entry); 1108 trace_assign_type(field, entry);
1149 1109
1150 if (!seq_print_ip_sym(s, field->ip, flags)) 1110 seq_print_ip_sym(s, field->ip, flags);
1151 goto partial; 1111 trace_seq_puts(s, ": ");
1152 1112 trace_seq_bprintf(s, field->fmt, field->buf);
1153 if (!trace_seq_puts(s, ": "))
1154 goto partial;
1155
1156 if (!trace_seq_bprintf(s, field->fmt, field->buf))
1157 goto partial;
1158 1113
1159 return TRACE_TYPE_HANDLED; 1114 return trace_handle_return(s);
1160
1161 partial:
1162 return TRACE_TYPE_PARTIAL_LINE;
1163} 1115}
1164 1116
1165 1117
@@ -1172,16 +1124,10 @@ trace_bprint_raw(struct trace_iterator *iter, int flags,
1172 1124
1173 trace_assign_type(field, iter->ent); 1125 trace_assign_type(field, iter->ent);
1174 1126
1175 if (!trace_seq_printf(s, ": %lx : ", field->ip)) 1127 trace_seq_printf(s, ": %lx : ", field->ip);
1176 goto partial; 1128 trace_seq_bprintf(s, field->fmt, field->buf);
1177
1178 if (!trace_seq_bprintf(s, field->fmt, field->buf))
1179 goto partial;
1180 1129
1181 return TRACE_TYPE_HANDLED; 1130 return trace_handle_return(s);
1182
1183 partial:
1184 return TRACE_TYPE_PARTIAL_LINE;
1185} 1131}
1186 1132
1187static struct trace_event_functions trace_bprint_funcs = { 1133static struct trace_event_functions trace_bprint_funcs = {
@@ -1203,16 +1149,10 @@ static enum print_line_t trace_print_print(struct trace_iterator *iter,
1203 1149
1204 trace_assign_type(field, iter->ent); 1150 trace_assign_type(field, iter->ent);
1205 1151
1206 if (!seq_print_ip_sym(s, field->ip, flags)) 1152 seq_print_ip_sym(s, field->ip, flags);
1207 goto partial; 1153 trace_seq_printf(s, ": %s", field->buf);
1208
1209 if (!trace_seq_printf(s, ": %s", field->buf))
1210 goto partial;
1211 1154
1212 return TRACE_TYPE_HANDLED; 1155 return trace_handle_return(s);
1213
1214 partial:
1215 return TRACE_TYPE_PARTIAL_LINE;
1216} 1156}
1217 1157
1218static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags, 1158static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags,
@@ -1222,13 +1162,9 @@ static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags,
1222 1162
1223 trace_assign_type(field, iter->ent); 1163 trace_assign_type(field, iter->ent);
1224 1164
1225 if (!trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf)) 1165 trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf);
1226 goto partial;
1227
1228 return TRACE_TYPE_HANDLED;
1229 1166
1230 partial: 1167 return trace_handle_return(&iter->seq);
1231 return TRACE_TYPE_PARTIAL_LINE;
1232} 1168}
1233 1169
1234static struct trace_event_functions trace_print_funcs = { 1170static struct trace_event_functions trace_print_funcs = {
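Besides the mechanical return-value conversion, the trace_output.c hunk replaces the single preempt_mark_thresh_us cutoff with the mark[] table and trace_find_mark(), so the latency annotation now distinguishes several orders of magnitude of overhead. Illustrative calls only, with the thresholds taken from the table added above:

	trace_find_mark(5   * NSEC_PER_USEC);	/* below 10 usecs      -> ' ' */
	trace_find_mark(12  * NSEC_PER_USEC);	/* at least 10 usecs   -> '+' */
	trace_find_mark(250 * NSEC_PER_USEC);	/* at least 100 usecs  -> '!' */
	trace_find_mark(3   * NSEC_PER_MSEC);	/* at least 1000 usecs -> '#' */
	trace_find_mark(2   * NSEC_PER_SEC);	/* at least 1 second   -> '$' */

This matches the lat_print_timestamp() change above, where the relative timestamp in microseconds is scaled by NSEC_PER_USEC before the lookup.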
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 80b25b585a70..8ef2c40efb3c 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -35,17 +35,11 @@ trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry);
35extern int __unregister_ftrace_event(struct trace_event *event); 35extern int __unregister_ftrace_event(struct trace_event *event);
36extern struct rw_semaphore trace_event_sem; 36extern struct rw_semaphore trace_event_sem;
37 37
38#define SEQ_PUT_FIELD_RET(s, x) \ 38#define SEQ_PUT_FIELD(s, x) \
39do { \ 39 trace_seq_putmem(s, &(x), sizeof(x))
40 if (!trace_seq_putmem(s, &(x), sizeof(x))) \ 40
41 return TRACE_TYPE_PARTIAL_LINE; \ 41#define SEQ_PUT_HEX_FIELD(s, x) \
42} while (0) 42 trace_seq_putmem_hex(s, &(x), sizeof(x))
43
44#define SEQ_PUT_HEX_FIELD_RET(s, x) \
45do { \
46 if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \
47 return TRACE_TYPE_PARTIAL_LINE; \
48} while (0)
49 43
50#endif 44#endif
51 45
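The old SEQ_PUT_FIELD_RET()/SEQ_PUT_HEX_FIELD_RET() macros hid a return TRACE_TYPE_PARTIAL_LINE inside a do/while, so they could only appear in functions returning enum print_line_t and bailed out on the first full buffer. The replacements are ordinary statements, which is why callers such as trace_fn_bin() and trace_ctxwake_hex() above now end with a single trace_handle_return(). Schematically, the caller shape changes as follows (modeled on trace_fn_bin() above):

	/* before: each macro could return early on its own */
	SEQ_PUT_FIELD_RET(s, field->ip);
	SEQ_PUT_FIELD_RET(s, field->parent_ip);
	return TRACE_TYPE_HANDLED;

	/* after: plain statements, overflow checked once */
	SEQ_PUT_FIELD(s, field->ip);
	SEQ_PUT_FIELD(s, field->parent_ip);
	return trace_handle_return(s);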
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 2900817ba65c..c4e70b6bd7fa 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -305,7 +305,7 @@ static int t_show(struct seq_file *m, void *v)
305 seq_puts(m, "\\t"); 305 seq_puts(m, "\\t");
306 break; 306 break;
307 case '\\': 307 case '\\':
308 seq_puts(m, "\\"); 308 seq_putc(m, '\\');
309 break; 309 break;
310 case '"': 310 case '"':
311 seq_puts(m, "\\\""); 311 seq_puts(m, "\\\"");
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index d4b9fc22cd27..b983b2fd2ca1 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -40,7 +40,8 @@ const char *reserved_field_names[] = {
40int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name, \ 40int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name, \
41 void *data, void *ent) \ 41 void *data, void *ent) \
42{ \ 42{ \
43 return trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \ 43 trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \
44 return !trace_seq_has_overflowed(s); \
44} \ 45} \
45const char PRINT_TYPE_FMT_NAME(type)[] = fmt; \ 46const char PRINT_TYPE_FMT_NAME(type)[] = fmt; \
46NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(type)); 47NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(type));
@@ -61,10 +62,11 @@ int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, const char *name,
61 int len = *(u32 *)data >> 16; 62 int len = *(u32 *)data >> 16;
62 63
63 if (!len) 64 if (!len)
64 return trace_seq_printf(s, " %s=(fault)", name); 65 trace_seq_printf(s, " %s=(fault)", name);
65 else 66 else
66 return trace_seq_printf(s, " %s=\"%s\"", name, 67 trace_seq_printf(s, " %s=\"%s\"", name,
67 (const char *)get_loc_data(data, ent)); 68 (const char *)get_loc_data(data, ent));
69 return !trace_seq_has_overflowed(s);
68} 70}
69NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(string)); 71NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(string));
70 72
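The probe print helpers keep an int return because their callers (the per-argument loops in trace_kprobe.c above) still test it, but the value is now derived from the overflow flag rather than from trace_seq_printf(). For one integer type, the type-printing macro above expands to roughly the following; the print_type_u32 name and the hexadecimal format string are assumptions based on trace_probe.h and are not shown in this diff.

	int print_type_u32(struct trace_seq *s, const char *name,
			   void *data, void *ent)
	{
		trace_seq_printf(s, " %s=0x%x", name, *(u32 *)data);
		return !trace_seq_has_overflowed(s);	/* 1 = printed in full, 0 = truncated */
	}
	NOKPROBE_SYMBOL(print_type_u32);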
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 3f34dc9b40f3..2e293beb186e 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -14,122 +14,26 @@
14 14
15#include "trace.h" 15#include "trace.h"
16 16
17static struct trace_array *ctx_trace;
18static int __read_mostly tracer_enabled;
19static int sched_ref; 17static int sched_ref;
20static DEFINE_MUTEX(sched_register_mutex); 18static DEFINE_MUTEX(sched_register_mutex);
21static int sched_stopped;
22
23
24void
25tracing_sched_switch_trace(struct trace_array *tr,
26 struct task_struct *prev,
27 struct task_struct *next,
28 unsigned long flags, int pc)
29{
30 struct ftrace_event_call *call = &event_context_switch;
31 struct ring_buffer *buffer = tr->trace_buffer.buffer;
32 struct ring_buffer_event *event;
33 struct ctx_switch_entry *entry;
34
35 event = trace_buffer_lock_reserve(buffer, TRACE_CTX,
36 sizeof(*entry), flags, pc);
37 if (!event)
38 return;
39 entry = ring_buffer_event_data(event);
40 entry->prev_pid = prev->pid;
41 entry->prev_prio = prev->prio;
42 entry->prev_state = prev->state;
43 entry->next_pid = next->pid;
44 entry->next_prio = next->prio;
45 entry->next_state = next->state;
46 entry->next_cpu = task_cpu(next);
47
48 if (!call_filter_check_discard(call, entry, buffer, event))
49 trace_buffer_unlock_commit(buffer, event, flags, pc);
50}
51 19
52static void 20static void
53probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *next) 21probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *next)
54{ 22{
55 struct trace_array_cpu *data;
56 unsigned long flags;
57 int cpu;
58 int pc;
59
60 if (unlikely(!sched_ref)) 23 if (unlikely(!sched_ref))
61 return; 24 return;
62 25
63 tracing_record_cmdline(prev); 26 tracing_record_cmdline(prev);
64 tracing_record_cmdline(next); 27 tracing_record_cmdline(next);
65
66 if (!tracer_enabled || sched_stopped)
67 return;
68
69 pc = preempt_count();
70 local_irq_save(flags);
71 cpu = raw_smp_processor_id();
72 data = per_cpu_ptr(ctx_trace->trace_buffer.data, cpu);
73
74 if (likely(!atomic_read(&data->disabled)))
75 tracing_sched_switch_trace(ctx_trace, prev, next, flags, pc);
76
77 local_irq_restore(flags);
78}
79
80void
81tracing_sched_wakeup_trace(struct trace_array *tr,
82 struct task_struct *wakee,
83 struct task_struct *curr,
84 unsigned long flags, int pc)
85{
86 struct ftrace_event_call *call = &event_wakeup;
87 struct ring_buffer_event *event;
88 struct ctx_switch_entry *entry;
89 struct ring_buffer *buffer = tr->trace_buffer.buffer;
90
91 event = trace_buffer_lock_reserve(buffer, TRACE_WAKE,
92 sizeof(*entry), flags, pc);
93 if (!event)
94 return;
95 entry = ring_buffer_event_data(event);
96 entry->prev_pid = curr->pid;
97 entry->prev_prio = curr->prio;
98 entry->prev_state = curr->state;
99 entry->next_pid = wakee->pid;
100 entry->next_prio = wakee->prio;
101 entry->next_state = wakee->state;
102 entry->next_cpu = task_cpu(wakee);
103
104 if (!call_filter_check_discard(call, entry, buffer, event))
105 trace_buffer_unlock_commit(buffer, event, flags, pc);
106} 28}
107 29
108static void 30static void
109probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success) 31probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success)
110{ 32{
111 struct trace_array_cpu *data;
112 unsigned long flags;
113 int cpu, pc;
114
115 if (unlikely(!sched_ref)) 33 if (unlikely(!sched_ref))
116 return; 34 return;
117 35
118 tracing_record_cmdline(current); 36 tracing_record_cmdline(current);
119
120 if (!tracer_enabled || sched_stopped)
121 return;
122
123 pc = preempt_count();
124 local_irq_save(flags);
125 cpu = raw_smp_processor_id();
126 data = per_cpu_ptr(ctx_trace->trace_buffer.data, cpu);
127
128 if (likely(!atomic_read(&data->disabled)))
129 tracing_sched_wakeup_trace(ctx_trace, wakee, current,
130 flags, pc);
131
132 local_irq_restore(flags);
133} 37}
134 38
135static int tracing_sched_register(void) 39static int tracing_sched_register(void)
@@ -197,51 +101,3 @@ void tracing_stop_cmdline_record(void)
197{ 101{
198 tracing_stop_sched_switch(); 102 tracing_stop_sched_switch();
199} 103}
200
201/**
202 * tracing_start_sched_switch_record - start tracing context switches
203 *
204 * Turns on context switch tracing for a tracer.
205 */
206void tracing_start_sched_switch_record(void)
207{
208 if (unlikely(!ctx_trace)) {
209 WARN_ON(1);
210 return;
211 }
212
213 tracing_start_sched_switch();
214
215 mutex_lock(&sched_register_mutex);
216 tracer_enabled++;
217 mutex_unlock(&sched_register_mutex);
218}
219
220/**
221 * tracing_stop_sched_switch_record - start tracing context switches
222 *
223 * Turns off context switch tracing for a tracer.
224 */
225void tracing_stop_sched_switch_record(void)
226{
227 mutex_lock(&sched_register_mutex);
228 tracer_enabled--;
229 WARN_ON(tracer_enabled < 0);
230 mutex_unlock(&sched_register_mutex);
231
232 tracing_stop_sched_switch();
233}
234
235/**
236 * tracing_sched_switch_assign_trace - assign a trace array for ctx switch
237 * @tr: trace array pointer to assign
238 *
239 * Some tracers might want to record the context switches in their
240 * trace. This function lets those tracers assign the trace array
241 * to use.
242 */
243void tracing_sched_switch_assign_trace(struct trace_array *tr)
244{
245 ctx_trace = tr;
246}
247
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 19bd8928ce94..8fb84b362816 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -365,6 +365,62 @@ probe_wakeup_migrate_task(void *ignore, struct task_struct *task, int cpu)
365 wakeup_current_cpu = cpu; 365 wakeup_current_cpu = cpu;
366} 366}
367 367
368static void
369tracing_sched_switch_trace(struct trace_array *tr,
370 struct task_struct *prev,
371 struct task_struct *next,
372 unsigned long flags, int pc)
373{
374 struct ftrace_event_call *call = &event_context_switch;
375 struct ring_buffer *buffer = tr->trace_buffer.buffer;
376 struct ring_buffer_event *event;
377 struct ctx_switch_entry *entry;
378
379 event = trace_buffer_lock_reserve(buffer, TRACE_CTX,
380 sizeof(*entry), flags, pc);
381 if (!event)
382 return;
383 entry = ring_buffer_event_data(event);
384 entry->prev_pid = prev->pid;
385 entry->prev_prio = prev->prio;
386 entry->prev_state = prev->state;
387 entry->next_pid = next->pid;
388 entry->next_prio = next->prio;
389 entry->next_state = next->state;
390 entry->next_cpu = task_cpu(next);
391
392 if (!call_filter_check_discard(call, entry, buffer, event))
393 trace_buffer_unlock_commit(buffer, event, flags, pc);
394}
395
396static void
397tracing_sched_wakeup_trace(struct trace_array *tr,
398 struct task_struct *wakee,
399 struct task_struct *curr,
400 unsigned long flags, int pc)
401{
402 struct ftrace_event_call *call = &event_wakeup;
403 struct ring_buffer_event *event;
404 struct ctx_switch_entry *entry;
405 struct ring_buffer *buffer = tr->trace_buffer.buffer;
406
407 event = trace_buffer_lock_reserve(buffer, TRACE_WAKE,
408 sizeof(*entry), flags, pc);
409 if (!event)
410 return;
411 entry = ring_buffer_event_data(event);
412 entry->prev_pid = curr->pid;
413 entry->prev_prio = curr->prio;
414 entry->prev_state = curr->state;
415 entry->next_pid = wakee->pid;
416 entry->next_prio = wakee->prio;
417 entry->next_state = wakee->state;
418 entry->next_cpu = task_cpu(wakee);
419
420 if (!call_filter_check_discard(call, entry, buffer, event))
421 trace_buffer_unlock_commit(buffer, event, flags, pc);
422}
423
368static void notrace 424static void notrace
369probe_wakeup_sched_switch(void *ignore, 425probe_wakeup_sched_switch(void *ignore,
370 struct task_struct *prev, struct task_struct *next) 426 struct task_struct *prev, struct task_struct *next)
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 5ef60499dc8e..b0f86ea77881 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -382,6 +382,8 @@ static int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
382 382
383 /* check the trace buffer */ 383 /* check the trace buffer */
384 ret = trace_test_buffer(&tr->trace_buffer, &count); 384 ret = trace_test_buffer(&tr->trace_buffer, &count);
385
386 ftrace_enabled = 1;
385 tracing_start(); 387 tracing_start();
386 388
387 /* we should only have one item */ 389 /* we should only have one item */
@@ -679,6 +681,8 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
679 681
680 /* check the trace buffer */ 682 /* check the trace buffer */
681 ret = trace_test_buffer(&tr->trace_buffer, &count); 683 ret = trace_test_buffer(&tr->trace_buffer, &count);
684
685 ftrace_enabled = 1;
682 trace->reset(tr); 686 trace->reset(tr);
683 tracing_start(); 687 tracing_start();
684 688
@@ -1025,6 +1029,12 @@ trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr)
1025#endif 1029#endif
1026 1030
1027#ifdef CONFIG_SCHED_TRACER 1031#ifdef CONFIG_SCHED_TRACER
1032
1033struct wakeup_test_data {
1034 struct completion is_ready;
1035 int go;
1036};
1037
1028static int trace_wakeup_test_thread(void *data) 1038static int trace_wakeup_test_thread(void *data)
1029{ 1039{
1030 /* Make this a -deadline thread */ 1040 /* Make this a -deadline thread */
@@ -1034,51 +1044,56 @@ static int trace_wakeup_test_thread(void *data)
1034 .sched_deadline = 10000000ULL, 1044 .sched_deadline = 10000000ULL,
1035 .sched_period = 10000000ULL 1045 .sched_period = 10000000ULL
1036 }; 1046 };
1037 struct completion *x = data; 1047 struct wakeup_test_data *x = data;
1038 1048
1039 sched_setattr(current, &attr); 1049 sched_setattr(current, &attr);
1040 1050
1041 /* Make it know we have a new prio */ 1051 /* Make it know we have a new prio */
1042 complete(x); 1052 complete(&x->is_ready);
1043 1053
1044 /* now go to sleep and let the test wake us up */ 1054 /* now go to sleep and let the test wake us up */
1045 set_current_state(TASK_INTERRUPTIBLE); 1055 set_current_state(TASK_INTERRUPTIBLE);
1046 schedule(); 1056 while (!x->go) {
1057 schedule();
1058 set_current_state(TASK_INTERRUPTIBLE);
1059 }
1047 1060
1048 complete(x); 1061 complete(&x->is_ready);
1062
1063 set_current_state(TASK_INTERRUPTIBLE);
1049 1064
1050 /* we are awake, now wait to disappear */ 1065 /* we are awake, now wait to disappear */
1051 while (!kthread_should_stop()) { 1066 while (!kthread_should_stop()) {
1052 /* 1067 schedule();
1053 * This will likely be the system top priority 1068 set_current_state(TASK_INTERRUPTIBLE);
1054 * task, do short sleeps to let others run.
1055 */
1056 msleep(100);
1057 } 1069 }
1058 1070
1071 __set_current_state(TASK_RUNNING);
1072
1059 return 0; 1073 return 0;
1060} 1074}
1061
1062int 1075int
1063trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) 1076trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
1064{ 1077{
1065 unsigned long save_max = tr->max_latency; 1078 unsigned long save_max = tr->max_latency;
1066 struct task_struct *p; 1079 struct task_struct *p;
1067 struct completion is_ready; 1080 struct wakeup_test_data data;
1068 unsigned long count; 1081 unsigned long count;
1069 int ret; 1082 int ret;
1070 1083
1071 init_completion(&is_ready); 1084 memset(&data, 0, sizeof(data));
1085
1086 init_completion(&data.is_ready);
1072 1087
1073 /* create a -deadline thread */ 1088 /* create a -deadline thread */
1074 p = kthread_run(trace_wakeup_test_thread, &is_ready, "ftrace-test"); 1089 p = kthread_run(trace_wakeup_test_thread, &data, "ftrace-test");
1075 if (IS_ERR(p)) { 1090 if (IS_ERR(p)) {
1076 printk(KERN_CONT "Failed to create ftrace wakeup test thread "); 1091 printk(KERN_CONT "Failed to create ftrace wakeup test thread ");
1077 return -1; 1092 return -1;
1078 } 1093 }
1079 1094
1080 /* make sure the thread is running at -deadline policy */ 1095 /* make sure the thread is running at -deadline policy */
1081 wait_for_completion(&is_ready); 1096 wait_for_completion(&data.is_ready);
1082 1097
1083 /* start the tracing */ 1098 /* start the tracing */
1084 ret = tracer_init(trace, tr); 1099 ret = tracer_init(trace, tr);
@@ -1099,18 +1114,20 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
1099 msleep(100); 1114 msleep(100);
1100 } 1115 }
1101 1116
1102 init_completion(&is_ready); 1117 init_completion(&data.is_ready);
1118
1119 data.go = 1;
1120 /* memory barrier is in the wake_up_process() */
1103 1121
1104 wake_up_process(p); 1122 wake_up_process(p);
1105 1123
1106 /* Wait for the task to wake up */ 1124 /* Wait for the task to wake up */
1107 wait_for_completion(&is_ready); 1125 wait_for_completion(&data.is_ready);
1108 1126
1109 /* stop the tracing. */ 1127 /* stop the tracing. */
1110 tracing_stop(); 1128 tracing_stop();
1111 /* check both trace buffers */ 1129 /* check both trace buffers */
1112 ret = trace_test_buffer(&tr->trace_buffer, NULL); 1130 ret = trace_test_buffer(&tr->trace_buffer, NULL);
1113 printk("ret = %d\n", ret);
1114 if (!ret) 1131 if (!ret)
1115 ret = trace_test_buffer(&tr->max_buffer, &count); 1132 ret = trace_test_buffer(&tr->max_buffer, &count);
1116 1133
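
The selftest change above replaces a bare completion with a small wakeup_test_data struct so the -deadline kthread tolerates spurious wake-ups: it re-checks a flag around every schedule() call instead of assuming a single schedule() only returns when the test wakes it on purpose. Condensed from the hunk above, the generic shape of that wait loop looks like the sketch below (the struct and function names here are illustrative, not part of the patch):

#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/completion.h>

struct waiter_data {
        struct completion is_ready;
        int go;
};

static int waiter_thread(void *arg)
{
        struct waiter_data *d = arg;

        complete(&d->is_ready);                 /* tell the creator we are up */

        set_current_state(TASK_INTERRUPTIBLE);
        while (!d->go) {                        /* tolerate spurious wake-ups */
                schedule();
                set_current_state(TASK_INTERRUPTIBLE);
        }
        __set_current_state(TASK_RUNNING);

        complete(&d->is_ready);                 /* acknowledge the real wake-up */

        set_current_state(TASK_INTERRUPTIBLE);
        while (!kthread_should_stop()) {        /* idle until the test reaps us */
                schedule();
                set_current_state(TASK_INTERRUPTIBLE);
        }
        __set_current_state(TASK_RUNNING);
        return 0;
}

As the hunk's comment notes, the ordering between setting the go flag and the thread seeing it is provided by the memory barrier inside wake_up_process() on the waker side.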
diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c
index 1f24ed99dca2..f8b45d8792f9 100644
--- a/kernel/trace/trace_seq.c
+++ b/kernel/trace/trace_seq.c
@@ -27,10 +27,19 @@
27#include <linux/trace_seq.h> 27#include <linux/trace_seq.h>
28 28
29/* How much buffer is left on the trace_seq? */ 29/* How much buffer is left on the trace_seq? */
30#define TRACE_SEQ_BUF_LEFT(s) ((PAGE_SIZE - 1) - (s)->len) 30#define TRACE_SEQ_BUF_LEFT(s) seq_buf_buffer_left(&(s)->seq)
31 31
32/* How much buffer is written? */ 32/* How much buffer is written? */
33#define TRACE_SEQ_BUF_USED(s) min((s)->len, (unsigned int)(PAGE_SIZE - 1)) 33#define TRACE_SEQ_BUF_USED(s) seq_buf_used(&(s)->seq)
34
35/*
36 * trace_seq should work with being initialized with 0s.
37 */
38static inline void __trace_seq_init(struct trace_seq *s)
39{
40 if (unlikely(!s->seq.size))
41 trace_seq_init(s);
42}
34 43
35/** 44/**
36 * trace_print_seq - move the contents of trace_seq into a seq_file 45 * trace_print_seq - move the contents of trace_seq into a seq_file
@@ -43,10 +52,11 @@
43 */ 52 */
44int trace_print_seq(struct seq_file *m, struct trace_seq *s) 53int trace_print_seq(struct seq_file *m, struct trace_seq *s)
45{ 54{
46 unsigned int len = TRACE_SEQ_BUF_USED(s);
47 int ret; 55 int ret;
48 56
49 ret = seq_write(m, s->buffer, len); 57 __trace_seq_init(s);
58
59 ret = seq_buf_print_seq(m, &s->seq);
50 60
51 /* 61 /*
52 * Only reset this buffer if we successfully wrote to the 62 * Only reset this buffer if we successfully wrote to the
@@ -69,34 +79,26 @@ int trace_print_seq(struct seq_file *m, struct trace_seq *s)
69 * trace_seq_printf() is used to store strings into a special 79 * trace_seq_printf() is used to store strings into a special
70 * buffer (@s). Then the output may be either used by 80 * buffer (@s). Then the output may be either used by
71 * the sequencer or pulled into another buffer. 81 * the sequencer or pulled into another buffer.
72 *
73 * Returns 1 if we successfully written all the contents to
74 * the buffer.
75 * Returns 0 if we the length to write is bigger than the
76 * reserved buffer space. In this case, nothing gets written.
77 */ 82 */
78int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) 83void trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
79{ 84{
80 unsigned int len = TRACE_SEQ_BUF_LEFT(s); 85 unsigned int save_len = s->seq.len;
81 va_list ap; 86 va_list ap;
82 int ret;
83 87
84 if (s->full || !len) 88 if (s->full)
85 return 0; 89 return;
90
91 __trace_seq_init(s);
86 92
87 va_start(ap, fmt); 93 va_start(ap, fmt);
88 ret = vsnprintf(s->buffer + s->len, len, fmt, ap); 94 seq_buf_vprintf(&s->seq, fmt, ap);
89 va_end(ap); 95 va_end(ap);
90 96
91 /* If we can't write it all, don't bother writing anything */ 97 /* If we can't write it all, don't bother writing anything */
92 if (ret >= len) { 98 if (unlikely(seq_buf_has_overflowed(&s->seq))) {
99 s->seq.len = save_len;
93 s->full = 1; 100 s->full = 1;
94 return 0;
95 } 101 }
96
97 s->len += ret;
98
99 return 1;
100} 102}
101EXPORT_SYMBOL_GPL(trace_seq_printf); 103EXPORT_SYMBOL_GPL(trace_seq_printf);
102 104
@@ -107,25 +109,23 @@ EXPORT_SYMBOL_GPL(trace_seq_printf);
107 * @nmaskbits: The number of bits that are valid in @maskp 109 * @nmaskbits: The number of bits that are valid in @maskp
108 * 110 *
109 * Writes a ASCII representation of a bitmask string into @s. 111 * Writes a ASCII representation of a bitmask string into @s.
110 *
111 * Returns 1 if we successfully written all the contents to
112 * the buffer.
113 * Returns 0 if we the length to write is bigger than the
114 * reserved buffer space. In this case, nothing gets written.
115 */ 112 */
116int trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp, 113void trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp,
117 int nmaskbits) 114 int nmaskbits)
118{ 115{
119 unsigned int len = TRACE_SEQ_BUF_LEFT(s); 116 unsigned int save_len = s->seq.len;
120 int ret;
121 117
122 if (s->full || !len) 118 if (s->full)
123 return 0; 119 return;
124 120
125 ret = bitmap_scnprintf(s->buffer, len, maskp, nmaskbits); 121 __trace_seq_init(s);
126 s->len += ret;
127 122
128 return 1; 123 seq_buf_bitmask(&s->seq, maskp, nmaskbits);
124
125 if (unlikely(seq_buf_has_overflowed(&s->seq))) {
126 s->seq.len = save_len;
127 s->full = 1;
128 }
129} 129}
130EXPORT_SYMBOL_GPL(trace_seq_bitmask); 130EXPORT_SYMBOL_GPL(trace_seq_bitmask);
131 131
@@ -139,28 +139,23 @@ EXPORT_SYMBOL_GPL(trace_seq_bitmask);
139 * trace_seq_printf is used to store strings into a special 139 * trace_seq_printf is used to store strings into a special
140 * buffer (@s). Then the output may be either used by 140 * buffer (@s). Then the output may be either used by
141 * the sequencer or pulled into another buffer. 141 * the sequencer or pulled into another buffer.
142 *
143 * Returns how much it wrote to the buffer.
144 */ 142 */
145int trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args) 143void trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args)
146{ 144{
147 unsigned int len = TRACE_SEQ_BUF_LEFT(s); 145 unsigned int save_len = s->seq.len;
148 int ret;
149 146
150 if (s->full || !len) 147 if (s->full)
151 return 0; 148 return;
152 149
153 ret = vsnprintf(s->buffer + s->len, len, fmt, args); 150 __trace_seq_init(s);
151
152 seq_buf_vprintf(&s->seq, fmt, args);
154 153
155 /* If we can't write it all, don't bother writing anything */ 154 /* If we can't write it all, don't bother writing anything */
156 if (ret >= len) { 155 if (unlikely(seq_buf_has_overflowed(&s->seq))) {
156 s->seq.len = save_len;
157 s->full = 1; 157 s->full = 1;
158 return 0;
159 } 158 }
160
161 s->len += ret;
162
163 return len;
164} 159}
165EXPORT_SYMBOL_GPL(trace_seq_vprintf); 160EXPORT_SYMBOL_GPL(trace_seq_vprintf);
166 161
@@ -178,28 +173,24 @@ EXPORT_SYMBOL_GPL(trace_seq_vprintf);
178 * 173 *
179 * This function will take the format and the binary array and finish 174 * This function will take the format and the binary array and finish
180 * the conversion into the ASCII string within the buffer. 175 * the conversion into the ASCII string within the buffer.
181 *
182 * Returns how much it wrote to the buffer.
183 */ 176 */
184int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) 177void trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
185{ 178{
186 unsigned int len = TRACE_SEQ_BUF_LEFT(s); 179 unsigned int save_len = s->seq.len;
187 int ret;
188 180
189 if (s->full || !len) 181 if (s->full)
190 return 0; 182 return;
183
184 __trace_seq_init(s);
191 185
192 ret = bstr_printf(s->buffer + s->len, len, fmt, binary); 186 seq_buf_bprintf(&s->seq, fmt, binary);
193 187
194 /* If we can't write it all, don't bother writing anything */ 188 /* If we can't write it all, don't bother writing anything */
195 if (ret >= len) { 189 if (unlikely(seq_buf_has_overflowed(&s->seq))) {
190 s->seq.len = save_len;
196 s->full = 1; 191 s->full = 1;
197 return 0; 192 return;
198 } 193 }
199
200 s->len += ret;
201
202 return len;
203} 194}
204EXPORT_SYMBOL_GPL(trace_seq_bprintf); 195EXPORT_SYMBOL_GPL(trace_seq_bprintf);
205 196
@@ -212,25 +203,22 @@ EXPORT_SYMBOL_GPL(trace_seq_bprintf);
212 * copy to user routines. This function records a simple string 203 * copy to user routines. This function records a simple string
213 * into a special buffer (@s) for later retrieval by a sequencer 204 * into a special buffer (@s) for later retrieval by a sequencer
214 * or other mechanism. 205 * or other mechanism.
215 *
216 * Returns how much it wrote to the buffer.
217 */ 206 */
218int trace_seq_puts(struct trace_seq *s, const char *str) 207void trace_seq_puts(struct trace_seq *s, const char *str)
219{ 208{
220 unsigned int len = strlen(str); 209 unsigned int len = strlen(str);
221 210
222 if (s->full) 211 if (s->full)
223 return 0; 212 return;
213
214 __trace_seq_init(s);
224 215
225 if (len > TRACE_SEQ_BUF_LEFT(s)) { 216 if (len > TRACE_SEQ_BUF_LEFT(s)) {
226 s->full = 1; 217 s->full = 1;
227 return 0; 218 return;
228 } 219 }
229 220
230 memcpy(s->buffer + s->len, str, len); 221 seq_buf_putmem(&s->seq, str, len);
231 s->len += len;
232
233 return len;
234} 222}
235EXPORT_SYMBOL_GPL(trace_seq_puts); 223EXPORT_SYMBOL_GPL(trace_seq_puts);
236 224
@@ -243,22 +231,20 @@ EXPORT_SYMBOL_GPL(trace_seq_puts);
243 * copy to user routines. This function records a simple charater 231 * copy to user routines. This function records a simple charater
244 * into a special buffer (@s) for later retrieval by a sequencer 232 * into a special buffer (@s) for later retrieval by a sequencer
245 * or other mechanism. 233 * or other mechanism.
246 *
247 * Returns how much it wrote to the buffer.
248 */ 234 */
249int trace_seq_putc(struct trace_seq *s, unsigned char c) 235void trace_seq_putc(struct trace_seq *s, unsigned char c)
250{ 236{
251 if (s->full) 237 if (s->full)
252 return 0; 238 return;
239
240 __trace_seq_init(s);
253 241
254 if (TRACE_SEQ_BUF_LEFT(s) < 1) { 242 if (TRACE_SEQ_BUF_LEFT(s) < 1) {
255 s->full = 1; 243 s->full = 1;
256 return 0; 244 return;
257 } 245 }
258 246
259 s->buffer[s->len++] = c; 247 seq_buf_putc(&s->seq, c);
260
261 return 1;
262} 248}
263EXPORT_SYMBOL_GPL(trace_seq_putc); 249EXPORT_SYMBOL_GPL(trace_seq_putc);
264 250
@@ -271,29 +257,23 @@ EXPORT_SYMBOL_GPL(trace_seq_putc);
271 * There may be cases where raw memory needs to be written into the 257 * There may be cases where raw memory needs to be written into the
272 * buffer and a strcpy() would not work. Using this function allows 258 * buffer and a strcpy() would not work. Using this function allows
273 * for such cases. 259 * for such cases.
274 *
275 * Returns how much it wrote to the buffer.
276 */ 260 */
277int trace_seq_putmem(struct trace_seq *s, const void *mem, unsigned int len) 261void trace_seq_putmem(struct trace_seq *s, const void *mem, unsigned int len)
278{ 262{
279 if (s->full) 263 if (s->full)
280 return 0; 264 return;
265
266 __trace_seq_init(s);
281 267
282 if (len > TRACE_SEQ_BUF_LEFT(s)) { 268 if (len > TRACE_SEQ_BUF_LEFT(s)) {
283 s->full = 1; 269 s->full = 1;
284 return 0; 270 return;
285 } 271 }
286 272
287 memcpy(s->buffer + s->len, mem, len); 273 seq_buf_putmem(&s->seq, mem, len);
288 s->len += len;
289
290 return len;
291} 274}
292EXPORT_SYMBOL_GPL(trace_seq_putmem); 275EXPORT_SYMBOL_GPL(trace_seq_putmem);
293 276
294#define MAX_MEMHEX_BYTES 8U
295#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1)
296
297/** 277/**
298 * trace_seq_putmem_hex - write raw memory into the buffer in ASCII hex 278 * trace_seq_putmem_hex - write raw memory into the buffer in ASCII hex
299 * @s: trace sequence descriptor 279 * @s: trace sequence descriptor
@@ -303,41 +283,31 @@ EXPORT_SYMBOL_GPL(trace_seq_putmem);
303 * This is similar to trace_seq_putmem() except instead of just copying the 283 * This is similar to trace_seq_putmem() except instead of just copying the
304 * raw memory into the buffer it writes its ASCII representation of it 284 * raw memory into the buffer it writes its ASCII representation of it
305 * in hex characters. 285 * in hex characters.
306 *
307 * Returns how much it wrote to the buffer.
308 */ 286 */
309int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, 287void trace_seq_putmem_hex(struct trace_seq *s, const void *mem,
310 unsigned int len) 288 unsigned int len)
311{ 289{
312 unsigned char hex[HEX_CHARS]; 290 unsigned int save_len = s->seq.len;
313 const unsigned char *data = mem;
314 unsigned int start_len;
315 int i, j;
316 int cnt = 0;
317 291
318 if (s->full) 292 if (s->full)
319 return 0; 293 return;
320 294
321 while (len) { 295 __trace_seq_init(s);
322 start_len = min(len, HEX_CHARS - 1); 296
323#ifdef __BIG_ENDIAN 297 /* Each byte is represented by two chars */
324 for (i = 0, j = 0; i < start_len; i++) { 298 if (len * 2 > TRACE_SEQ_BUF_LEFT(s)) {
325#else 299 s->full = 1;
326 for (i = start_len-1, j = 0; i >= 0; i--) { 300 return;
327#endif 301 }
328 hex[j++] = hex_asc_hi(data[i]); 302
329 hex[j++] = hex_asc_lo(data[i]); 303 /* The added spaces can still cause an overflow */
330 } 304 seq_buf_putmem_hex(&s->seq, mem, len);
331 if (WARN_ON_ONCE(j == 0 || j/2 > len)) 305
332 break; 306 if (unlikely(seq_buf_has_overflowed(&s->seq))) {
333 307 s->seq.len = save_len;
334 /* j increments twice per loop */ 308 s->full = 1;
335 len -= j / 2; 309 return;
336 hex[j++] = ' ';
337
338 cnt += trace_seq_putmem(s, hex, j);
339 } 310 }
340 return cnt;
341} 311}
342EXPORT_SYMBOL_GPL(trace_seq_putmem_hex); 312EXPORT_SYMBOL_GPL(trace_seq_putmem_hex);
343 313
@@ -355,30 +325,27 @@ EXPORT_SYMBOL_GPL(trace_seq_putmem_hex);
355 */ 325 */
356int trace_seq_path(struct trace_seq *s, const struct path *path) 326int trace_seq_path(struct trace_seq *s, const struct path *path)
357{ 327{
358 unsigned char *p; 328 unsigned int save_len = s->seq.len;
359 329
360 if (s->full) 330 if (s->full)
361 return 0; 331 return 0;
362 332
333 __trace_seq_init(s);
334
363 if (TRACE_SEQ_BUF_LEFT(s) < 1) { 335 if (TRACE_SEQ_BUF_LEFT(s) < 1) {
364 s->full = 1; 336 s->full = 1;
365 return 0; 337 return 0;
366 } 338 }
367 339
368 p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len); 340 seq_buf_path(&s->seq, path, "\n");
369 if (!IS_ERR(p)) { 341
370 p = mangle_path(s->buffer + s->len, p, "\n"); 342 if (unlikely(seq_buf_has_overflowed(&s->seq))) {
371 if (p) { 343 s->seq.len = save_len;
372 s->len = p - s->buffer; 344 s->full = 1;
373 return 1; 345 return 0;
374 }
375 } else {
376 s->buffer[s->len++] = '?';
377 return 1;
378 } 346 }
379 347
380 s->full = 1; 348 return 1;
381 return 0;
382} 349}
383EXPORT_SYMBOL_GPL(trace_seq_path); 350EXPORT_SYMBOL_GPL(trace_seq_path);
384 351
@@ -404,25 +371,7 @@ EXPORT_SYMBOL_GPL(trace_seq_path);
404 */ 371 */
405int trace_seq_to_user(struct trace_seq *s, char __user *ubuf, int cnt) 372int trace_seq_to_user(struct trace_seq *s, char __user *ubuf, int cnt)
406{ 373{
407 int len; 374 __trace_seq_init(s);
408 int ret; 375 return seq_buf_to_user(&s->seq, ubuf, cnt);
409
410 if (!cnt)
411 return 0;
412
413 if (s->len <= s->readpos)
414 return -EBUSY;
415
416 len = s->len - s->readpos;
417 if (cnt > len)
418 cnt = len;
419 ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt);
420 if (ret == cnt)
421 return -EFAULT;
422
423 cnt -= ret;
424
425 s->readpos += cnt;
426 return cnt;
427} 376}
428EXPORT_SYMBOL_GPL(trace_seq_to_user); 377EXPORT_SYMBOL_GPL(trace_seq_to_user);
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 8a4e5cb66a4c..16eddb308c33 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -13,7 +13,6 @@
13#include <linux/sysctl.h> 13#include <linux/sysctl.h>
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/fs.h> 15#include <linux/fs.h>
16#include <linux/magic.h>
17 16
18#include <asm/setup.h> 17#include <asm/setup.h>
19 18
@@ -171,8 +170,7 @@ check_stack(unsigned long ip, unsigned long *stack)
171 i++; 170 i++;
172 } 171 }
173 172
174 if ((current != &init_task && 173 if (task_stack_end_corrupted(current)) {
175 *(end_of_stack(current)) != STACK_END_MAGIC)) {
176 print_max_stack(); 174 print_max_stack();
177 BUG(); 175 BUG();
178 } 176 }
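
task_stack_end_corrupted() folds the open-coded end_of_stack()/STACK_END_MAGIC comparison into a single helper from <linux/sched.h>, which also removes the need to special-case init_task here. A minimal illustrative use in a debug path (the surrounding function is made up):

#include <linux/sched.h>
#include <linux/printk.h>
#include <linux/bug.h>

/* illustrative debug check, not an in-tree function */
static void demo_check_stack_canary(struct task_struct *tsk)
{
        /* The magic value at the far end of the task stack must be intact. */
        if (WARN_ON_ONCE(task_stack_end_corrupted(tsk)))
                pr_err("task stack end overwritten: %s/%d\n",
                       tsk->comm, task_pid_nr(tsk));
}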
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 759d5e004517..c6ee36fcbf90 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -114,7 +114,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags,
114 struct trace_entry *ent = iter->ent; 114 struct trace_entry *ent = iter->ent;
115 struct syscall_trace_enter *trace; 115 struct syscall_trace_enter *trace;
116 struct syscall_metadata *entry; 116 struct syscall_metadata *entry;
117 int i, ret, syscall; 117 int i, syscall;
118 118
119 trace = (typeof(trace))ent; 119 trace = (typeof(trace))ent;
120 syscall = trace->nr; 120 syscall = trace->nr;
@@ -128,35 +128,28 @@ print_syscall_enter(struct trace_iterator *iter, int flags,
128 goto end; 128 goto end;
129 } 129 }
130 130
131 ret = trace_seq_printf(s, "%s(", entry->name); 131 trace_seq_printf(s, "%s(", entry->name);
132 if (!ret)
133 return TRACE_TYPE_PARTIAL_LINE;
134 132
135 for (i = 0; i < entry->nb_args; i++) { 133 for (i = 0; i < entry->nb_args; i++) {
134
135 if (trace_seq_has_overflowed(s))
136 goto end;
137
136 /* parameter types */ 138 /* parameter types */
137 if (trace_flags & TRACE_ITER_VERBOSE) { 139 if (trace_flags & TRACE_ITER_VERBOSE)
138 ret = trace_seq_printf(s, "%s ", entry->types[i]); 140 trace_seq_printf(s, "%s ", entry->types[i]);
139 if (!ret) 141
140 return TRACE_TYPE_PARTIAL_LINE;
141 }
142 /* parameter values */ 142 /* parameter values */
143 ret = trace_seq_printf(s, "%s: %lx%s", entry->args[i], 143 trace_seq_printf(s, "%s: %lx%s", entry->args[i],
144 trace->args[i], 144 trace->args[i],
145 i == entry->nb_args - 1 ? "" : ", "); 145 i == entry->nb_args - 1 ? "" : ", ");
146 if (!ret)
147 return TRACE_TYPE_PARTIAL_LINE;
148 } 146 }
149 147
150 ret = trace_seq_putc(s, ')'); 148 trace_seq_putc(s, ')');
151 if (!ret)
152 return TRACE_TYPE_PARTIAL_LINE;
153
154end: 149end:
155 ret = trace_seq_putc(s, '\n'); 150 trace_seq_putc(s, '\n');
156 if (!ret)
157 return TRACE_TYPE_PARTIAL_LINE;
158 151
159 return TRACE_TYPE_HANDLED; 152 return trace_handle_return(s);
160} 153}
161 154
162static enum print_line_t 155static enum print_line_t
@@ -168,7 +161,6 @@ print_syscall_exit(struct trace_iterator *iter, int flags,
168 struct syscall_trace_exit *trace; 161 struct syscall_trace_exit *trace;
169 int syscall; 162 int syscall;
170 struct syscall_metadata *entry; 163 struct syscall_metadata *entry;
171 int ret;
172 164
173 trace = (typeof(trace))ent; 165 trace = (typeof(trace))ent;
174 syscall = trace->nr; 166 syscall = trace->nr;
@@ -176,7 +168,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags,
176 168
177 if (!entry) { 169 if (!entry) {
178 trace_seq_putc(s, '\n'); 170 trace_seq_putc(s, '\n');
179 return TRACE_TYPE_HANDLED; 171 goto out;
180 } 172 }
181 173
182 if (entry->exit_event->event.type != ent->type) { 174 if (entry->exit_event->event.type != ent->type) {
@@ -184,12 +176,11 @@ print_syscall_exit(struct trace_iterator *iter, int flags,
184 return TRACE_TYPE_UNHANDLED; 176 return TRACE_TYPE_UNHANDLED;
185 } 177 }
186 178
187 ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name, 179 trace_seq_printf(s, "%s -> 0x%lx\n", entry->name,
188 trace->ret); 180 trace->ret);
189 if (!ret)
190 return TRACE_TYPE_PARTIAL_LINE;
191 181
192 return TRACE_TYPE_HANDLED; 182 out:
183 return trace_handle_return(s);
193} 184}
194 185
195extern char *__bad_type_size(void); 186extern char *__bad_type_size(void);
@@ -313,7 +304,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
313 int size; 304 int size;
314 305
315 syscall_nr = trace_get_syscall_nr(current, regs); 306 syscall_nr = trace_get_syscall_nr(current, regs);
316 if (syscall_nr < 0) 307 if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
317 return; 308 return;
318 309
319 /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE) */ 310 /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE) */
@@ -360,7 +351,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
360 int syscall_nr; 351 int syscall_nr;
361 352
362 syscall_nr = trace_get_syscall_nr(current, regs); 353 syscall_nr = trace_get_syscall_nr(current, regs);
363 if (syscall_nr < 0) 354 if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
364 return; 355 return;
365 356
366 /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE()) */ 357 /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE()) */
@@ -425,7 +416,7 @@ static void unreg_event_syscall_enter(struct ftrace_event_file *file,
425 return; 416 return;
426 mutex_lock(&syscall_trace_lock); 417 mutex_lock(&syscall_trace_lock);
427 tr->sys_refcount_enter--; 418 tr->sys_refcount_enter--;
428 rcu_assign_pointer(tr->enter_syscall_files[num], NULL); 419 RCU_INIT_POINTER(tr->enter_syscall_files[num], NULL);
429 if (!tr->sys_refcount_enter) 420 if (!tr->sys_refcount_enter)
430 unregister_trace_sys_enter(ftrace_syscall_enter, tr); 421 unregister_trace_sys_enter(ftrace_syscall_enter, tr);
431 mutex_unlock(&syscall_trace_lock); 422 mutex_unlock(&syscall_trace_lock);
@@ -463,7 +454,7 @@ static void unreg_event_syscall_exit(struct ftrace_event_file *file,
463 return; 454 return;
464 mutex_lock(&syscall_trace_lock); 455 mutex_lock(&syscall_trace_lock);
465 tr->sys_refcount_exit--; 456 tr->sys_refcount_exit--;
466 rcu_assign_pointer(tr->exit_syscall_files[num], NULL); 457 RCU_INIT_POINTER(tr->exit_syscall_files[num], NULL);
467 if (!tr->sys_refcount_exit) 458 if (!tr->sys_refcount_exit)
468 unregister_trace_sys_exit(ftrace_syscall_exit, tr); 459 unregister_trace_sys_exit(ftrace_syscall_exit, tr);
469 mutex_unlock(&syscall_trace_lock); 460 mutex_unlock(&syscall_trace_lock);
@@ -523,7 +514,7 @@ unsigned long __init __weak arch_syscall_addr(int nr)
523 return (unsigned long)sys_call_table[nr]; 514 return (unsigned long)sys_call_table[nr];
524} 515}
525 516
526static int __init init_ftrace_syscalls(void) 517void __init init_ftrace_syscalls(void)
527{ 518{
528 struct syscall_metadata *meta; 519 struct syscall_metadata *meta;
529 unsigned long addr; 520 unsigned long addr;
@@ -533,7 +524,7 @@ static int __init init_ftrace_syscalls(void)
533 GFP_KERNEL); 524 GFP_KERNEL);
534 if (!syscalls_metadata) { 525 if (!syscalls_metadata) {
535 WARN_ON(1); 526 WARN_ON(1);
536 return -ENOMEM; 527 return;
537 } 528 }
538 529
539 for (i = 0; i < NR_syscalls; i++) { 530 for (i = 0; i < NR_syscalls; i++) {
@@ -545,10 +536,7 @@ static int __init init_ftrace_syscalls(void)
545 meta->syscall_nr = i; 536 meta->syscall_nr = i;
546 syscalls_metadata[i] = meta; 537 syscalls_metadata[i] = meta;
547 } 538 }
548
549 return 0;
550} 539}
551early_initcall(init_ftrace_syscalls);
552 540
553#ifdef CONFIG_PERF_EVENTS 541#ifdef CONFIG_PERF_EVENTS
554 542
@@ -567,7 +555,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
567 int size; 555 int size;
568 556
569 syscall_nr = trace_get_syscall_nr(current, regs); 557 syscall_nr = trace_get_syscall_nr(current, regs);
570 if (syscall_nr < 0) 558 if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
571 return; 559 return;
572 if (!test_bit(syscall_nr, enabled_perf_enter_syscalls)) 560 if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
573 return; 561 return;
@@ -641,7 +629,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
641 int size; 629 int size;
642 630
643 syscall_nr = trace_get_syscall_nr(current, regs); 631 syscall_nr = trace_get_syscall_nr(current, regs);
644 if (syscall_nr < 0) 632 if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
645 return; 633 return;
646 if (!test_bit(syscall_nr, enabled_perf_exit_syscalls)) 634 if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
647 return; 635 return;
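
With the void trace_seq API, an event's print callback no longer propagates a return value per write: it emits everything, optionally bails out of loops early via trace_seq_has_overflowed(), and lets trace_handle_return() translate the final buffer state into TRACE_TYPE_HANDLED or TRACE_TYPE_PARTIAL_LINE. A minimal sketch of that shape (the event payload and field names are invented for illustration; the trace_seq calls match the hunks above):

#include <linux/ftrace_event.h>
#include <linux/trace_seq.h>

/* hypothetical event payload, for illustration only */
struct demo_entry {
        struct trace_entry      ent;
        unsigned long           args[4];
        int                     nr_args;
};

static enum print_line_t demo_print(struct trace_iterator *iter, int flags,
                                    struct trace_event *event)
{
        struct trace_seq *s = &iter->seq;
        struct demo_entry *entry = (struct demo_entry *)iter->ent;
        int i;

        trace_seq_puts(s, "demo(");

        for (i = 0; i < entry->nr_args; i++) {
                if (trace_seq_has_overflowed(s))
                        break;
                trace_seq_printf(s, "%lx%s", entry->args[i],
                                 i == entry->nr_args - 1 ? "" : ", ");
        }

        trace_seq_puts(s, ")\n");

        return trace_handle_return(s);  /* PARTIAL_LINE iff the seq overflowed */
}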
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 33ff6a24b802..8520acc34b18 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -552,8 +552,7 @@ error:
552 return ret; 552 return ret;
553 553
554fail_address_parse: 554fail_address_parse:
555 if (inode) 555 iput(inode);
556 iput(inode);
557 556
558 pr_info("Failed to parse address or file.\n"); 557 pr_info("Failed to parse address or file.\n");
559 558
@@ -606,7 +605,7 @@ static int probes_seq_show(struct seq_file *m, void *v)
606 for (i = 0; i < tu->tp.nr_args; i++) 605 for (i = 0; i < tu->tp.nr_args; i++)
607 seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm); 606 seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm);
608 607
609 seq_printf(m, "\n"); 608 seq_putc(m, '\n');
610 return 0; 609 return 0;
611} 610}
612 611
@@ -852,16 +851,14 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e
852 tu = container_of(event, struct trace_uprobe, tp.call.event); 851 tu = container_of(event, struct trace_uprobe, tp.call.event);
853 852
854 if (is_ret_probe(tu)) { 853 if (is_ret_probe(tu)) {
855 if (!trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)", 854 trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)",
856 ftrace_event_name(&tu->tp.call), 855 ftrace_event_name(&tu->tp.call),
857 entry->vaddr[1], entry->vaddr[0])) 856 entry->vaddr[1], entry->vaddr[0]);
858 goto partial;
859 data = DATAOF_TRACE_ENTRY(entry, true); 857 data = DATAOF_TRACE_ENTRY(entry, true);
860 } else { 858 } else {
861 if (!trace_seq_printf(s, "%s: (0x%lx)", 859 trace_seq_printf(s, "%s: (0x%lx)",
862 ftrace_event_name(&tu->tp.call), 860 ftrace_event_name(&tu->tp.call),
863 entry->vaddr[0])) 861 entry->vaddr[0]);
864 goto partial;
865 data = DATAOF_TRACE_ENTRY(entry, false); 862 data = DATAOF_TRACE_ENTRY(entry, false);
866 } 863 }
867 864
@@ -869,14 +866,13 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e
869 struct probe_arg *parg = &tu->tp.args[i]; 866 struct probe_arg *parg = &tu->tp.args[i];
870 867
871 if (!parg->type->print(s, parg->name, data + parg->offset, entry)) 868 if (!parg->type->print(s, parg->name, data + parg->offset, entry))
872 goto partial; 869 goto out;
873 } 870 }
874 871
875 if (trace_seq_puts(s, "\n")) 872 trace_seq_putc(s, '\n');
876 return TRACE_TYPE_HANDLED;
877 873
878partial: 874 out:
879 return TRACE_TYPE_PARTIAL_LINE; 875 return trace_handle_return(s);
880} 876}
881 877
882typedef bool (*filter_func_t)(struct uprobe_consumer *self, 878typedef bool (*filter_func_t)(struct uprobe_consumer *self,
diff --git a/kernel/uid16.c b/kernel/uid16.c
index 602e5bbbceff..d58cc4d8f0d1 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -176,7 +176,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist)
176 struct group_info *group_info; 176 struct group_info *group_info;
177 int retval; 177 int retval;
178 178
179 if (!ns_capable(current_user_ns(), CAP_SETGID)) 179 if (!may_setgroups())
180 return -EPERM; 180 return -EPERM;
181 if ((unsigned)gidsetsize > NGROUPS_MAX) 181 if ((unsigned)gidsetsize > NGROUPS_MAX)
182 return -EINVAL; 182 return -EINVAL;
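
may_setgroups() replaces the bare CAP_SETGID test here; per the kernel/groups.c entry in the diffstat and the userns_may_setgroups() helper added in the kernel/user_namespace.c hunk further down, it additionally requires that setgroups is still permitted in the caller's user namespace. A plausible sketch of what that combined check amounts to (the exact in-tree body may differ slightly):

#include <linux/capability.h>
#include <linux/cred.h>
#include <linux/user_namespace.h>

/* sketch of the combined check behind may_setgroups() */
static bool demo_may_setgroups(void)
{
        struct user_namespace *user_ns = current_user_ns();

        /*
         * CAP_SETGID in the current user namespace is no longer enough:
         * a gid mapping must exist and setgroups must not have been
         * disabled via /proc/<pid>/setgroups.
         */
        return ns_capable(user_ns, CAP_SETGID) &&
               userns_may_setgroups(user_ns);
}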
diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c
index 394f70b17162..9586b670a5b2 100644
--- a/kernel/user-return-notifier.c
+++ b/kernel/user-return-notifier.c
@@ -14,7 +14,7 @@ static DEFINE_PER_CPU(struct hlist_head, return_notifier_list);
14void user_return_notifier_register(struct user_return_notifier *urn) 14void user_return_notifier_register(struct user_return_notifier *urn)
15{ 15{
16 set_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY); 16 set_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY);
17 hlist_add_head(&urn->link, &__get_cpu_var(return_notifier_list)); 17 hlist_add_head(&urn->link, this_cpu_ptr(&return_notifier_list));
18} 18}
19EXPORT_SYMBOL_GPL(user_return_notifier_register); 19EXPORT_SYMBOL_GPL(user_return_notifier_register);
20 20
@@ -25,7 +25,7 @@ EXPORT_SYMBOL_GPL(user_return_notifier_register);
25void user_return_notifier_unregister(struct user_return_notifier *urn) 25void user_return_notifier_unregister(struct user_return_notifier *urn)
26{ 26{
27 hlist_del(&urn->link); 27 hlist_del(&urn->link);
28 if (hlist_empty(&__get_cpu_var(return_notifier_list))) 28 if (hlist_empty(this_cpu_ptr(&return_notifier_list)))
29 clear_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY); 29 clear_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY);
30} 30}
31EXPORT_SYMBOL_GPL(user_return_notifier_unregister); 31EXPORT_SYMBOL_GPL(user_return_notifier_unregister);
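
This hunk is part of the tree-wide removal of __get_cpu_var(): taking the address of a per-CPU variable becomes this_cpu_ptr(&var) (or raw_cpu_ptr() where preemption checks must be skipped), and plain loads and stores become __this_cpu_read()/__this_cpu_write() or raw_cpu_write(), as the watchdog.c hunks later in this patch also show. A tiny sketch of the mapping (the per-CPU variables are illustrative):

#include <linux/percpu.h>
#include <linux/list.h>

static DEFINE_PER_CPU(struct hlist_head, demo_list);
static DEFINE_PER_CPU(bool, demo_flag);

static void demo_percpu_accessors(void)
{
        /* old: &__get_cpu_var(demo_list) */
        struct hlist_head *head = this_cpu_ptr(&demo_list);

        /* old: __raw_get_cpu_var(demo_flag) = true; */
        __this_cpu_write(demo_flag, true);

        if (hlist_empty(head))
                __this_cpu_write(demo_flag, false);
}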
diff --git a/kernel/user.c b/kernel/user.c
index 4efa39350e44..b069ccbfb0b0 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -50,7 +50,11 @@ struct user_namespace init_user_ns = {
50 .count = ATOMIC_INIT(3), 50 .count = ATOMIC_INIT(3),
51 .owner = GLOBAL_ROOT_UID, 51 .owner = GLOBAL_ROOT_UID,
52 .group = GLOBAL_ROOT_GID, 52 .group = GLOBAL_ROOT_GID,
53 .proc_inum = PROC_USER_INIT_INO, 53 .ns.inum = PROC_USER_INIT_INO,
54#ifdef CONFIG_USER_NS
55 .ns.ops = &userns_operations,
56#endif
57 .flags = USERNS_INIT_FLAGS,
54#ifdef CONFIG_PERSISTENT_KEYRINGS 58#ifdef CONFIG_PERSISTENT_KEYRINGS
55 .persistent_keyring_register_sem = 59 .persistent_keyring_register_sem =
56 __RWSEM_INITIALIZER(init_user_ns.persistent_keyring_register_sem), 60 __RWSEM_INITIALIZER(init_user_ns.persistent_keyring_register_sem),
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index aa312b0dc3ec..4109f8320684 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -24,6 +24,7 @@
24#include <linux/fs_struct.h> 24#include <linux/fs_struct.h>
25 25
26static struct kmem_cache *user_ns_cachep __read_mostly; 26static struct kmem_cache *user_ns_cachep __read_mostly;
27static DEFINE_MUTEX(userns_state_mutex);
27 28
28static bool new_idmap_permitted(const struct file *file, 29static bool new_idmap_permitted(const struct file *file,
29 struct user_namespace *ns, int cap_setid, 30 struct user_namespace *ns, int cap_setid,
@@ -86,11 +87,12 @@ int create_user_ns(struct cred *new)
86 if (!ns) 87 if (!ns)
87 return -ENOMEM; 88 return -ENOMEM;
88 89
89 ret = proc_alloc_inum(&ns->proc_inum); 90 ret = ns_alloc_inum(&ns->ns);
90 if (ret) { 91 if (ret) {
91 kmem_cache_free(user_ns_cachep, ns); 92 kmem_cache_free(user_ns_cachep, ns);
92 return ret; 93 return ret;
93 } 94 }
95 ns->ns.ops = &userns_operations;
94 96
95 atomic_set(&ns->count, 1); 97 atomic_set(&ns->count, 1);
96 /* Leave the new->user_ns reference with the new user namespace. */ 98 /* Leave the new->user_ns reference with the new user namespace. */
@@ -99,6 +101,11 @@ int create_user_ns(struct cred *new)
99 ns->owner = owner; 101 ns->owner = owner;
100 ns->group = group; 102 ns->group = group;
101 103
104 /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */
105 mutex_lock(&userns_state_mutex);
106 ns->flags = parent_ns->flags;
107 mutex_unlock(&userns_state_mutex);
108
102 set_cred_user_ns(new, ns); 109 set_cred_user_ns(new, ns);
103 110
104#ifdef CONFIG_PERSISTENT_KEYRINGS 111#ifdef CONFIG_PERSISTENT_KEYRINGS
@@ -136,7 +143,7 @@ void free_user_ns(struct user_namespace *ns)
136#ifdef CONFIG_PERSISTENT_KEYRINGS 143#ifdef CONFIG_PERSISTENT_KEYRINGS
137 key_put(ns->persistent_keyring_register); 144 key_put(ns->persistent_keyring_register);
138#endif 145#endif
139 proc_free_inum(ns->proc_inum); 146 ns_free_inum(&ns->ns);
140 kmem_cache_free(user_ns_cachep, ns); 147 kmem_cache_free(user_ns_cachep, ns);
141 ns = parent; 148 ns = parent;
142 } while (atomic_dec_and_test(&parent->count)); 149 } while (atomic_dec_and_test(&parent->count));
@@ -583,9 +590,6 @@ static bool mappings_overlap(struct uid_gid_map *new_map,
583 return false; 590 return false;
584} 591}
585 592
586
587static DEFINE_MUTEX(id_map_mutex);
588
589static ssize_t map_write(struct file *file, const char __user *buf, 593static ssize_t map_write(struct file *file, const char __user *buf,
590 size_t count, loff_t *ppos, 594 size_t count, loff_t *ppos,
591 int cap_setid, 595 int cap_setid,
@@ -602,7 +606,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
602 ssize_t ret = -EINVAL; 606 ssize_t ret = -EINVAL;
603 607
604 /* 608 /*
605 * The id_map_mutex serializes all writes to any given map. 609 * The userns_state_mutex serializes all writes to any given map.
606 * 610 *
607 * Any map is only ever written once. 611 * Any map is only ever written once.
608 * 612 *
@@ -620,7 +624,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
620 * order and smp_rmb() is guaranteed that we don't have crazy 624 * order and smp_rmb() is guaranteed that we don't have crazy
621 * architectures returning stale data. 625 * architectures returning stale data.
622 */ 626 */
623 mutex_lock(&id_map_mutex); 627 mutex_lock(&userns_state_mutex);
624 628
625 ret = -EPERM; 629 ret = -EPERM;
626 /* Only allow one successful write to the map */ 630 /* Only allow one successful write to the map */
@@ -640,7 +644,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
640 if (!page) 644 if (!page)
641 goto out; 645 goto out;
642 646
643 /* Only allow <= page size writes at the beginning of the file */ 647 /* Only allow < page size writes at the beginning of the file */
644 ret = -EINVAL; 648 ret = -EINVAL;
645 if ((*ppos != 0) || (count >= PAGE_SIZE)) 649 if ((*ppos != 0) || (count >= PAGE_SIZE))
646 goto out; 650 goto out;
@@ -750,7 +754,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
750 *ppos = count; 754 *ppos = count;
751 ret = count; 755 ret = count;
752out: 756out:
753 mutex_unlock(&id_map_mutex); 757 mutex_unlock(&userns_state_mutex);
754 if (page) 758 if (page)
755 free_page(page); 759 free_page(page);
756 return ret; 760 return ret;
@@ -812,16 +816,21 @@ static bool new_idmap_permitted(const struct file *file,
812 struct user_namespace *ns, int cap_setid, 816 struct user_namespace *ns, int cap_setid,
813 struct uid_gid_map *new_map) 817 struct uid_gid_map *new_map)
814{ 818{
815 /* Allow mapping to your own filesystem ids */ 819 const struct cred *cred = file->f_cred;
816 if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1)) { 820 /* Don't allow mappings that would allow anything that wouldn't
821 * be allowed without the establishment of unprivileged mappings.
822 */
823 if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1) &&
824 uid_eq(ns->owner, cred->euid)) {
817 u32 id = new_map->extent[0].lower_first; 825 u32 id = new_map->extent[0].lower_first;
818 if (cap_setid == CAP_SETUID) { 826 if (cap_setid == CAP_SETUID) {
819 kuid_t uid = make_kuid(ns->parent, id); 827 kuid_t uid = make_kuid(ns->parent, id);
820 if (uid_eq(uid, file->f_cred->fsuid)) 828 if (uid_eq(uid, cred->euid))
821 return true; 829 return true;
822 } else if (cap_setid == CAP_SETGID) { 830 } else if (cap_setid == CAP_SETGID) {
823 kgid_t gid = make_kgid(ns->parent, id); 831 kgid_t gid = make_kgid(ns->parent, id);
824 if (gid_eq(gid, file->f_cred->fsgid)) 832 if (!(ns->flags & USERNS_SETGROUPS_ALLOWED) &&
833 gid_eq(gid, cred->egid))
825 return true; 834 return true;
826 } 835 }
827 } 836 }
@@ -841,7 +850,106 @@ static bool new_idmap_permitted(const struct file *file,
841 return false; 850 return false;
842} 851}
843 852
844static void *userns_get(struct task_struct *task) 853int proc_setgroups_show(struct seq_file *seq, void *v)
854{
855 struct user_namespace *ns = seq->private;
856 unsigned long userns_flags = ACCESS_ONCE(ns->flags);
857
858 seq_printf(seq, "%s\n",
859 (userns_flags & USERNS_SETGROUPS_ALLOWED) ?
860 "allow" : "deny");
861 return 0;
862}
863
864ssize_t proc_setgroups_write(struct file *file, const char __user *buf,
865 size_t count, loff_t *ppos)
866{
867 struct seq_file *seq = file->private_data;
868 struct user_namespace *ns = seq->private;
869 char kbuf[8], *pos;
870 bool setgroups_allowed;
871 ssize_t ret;
872
873 /* Only allow a very narrow range of strings to be written */
874 ret = -EINVAL;
875 if ((*ppos != 0) || (count >= sizeof(kbuf)))
876 goto out;
877
878 /* What was written? */
879 ret = -EFAULT;
880 if (copy_from_user(kbuf, buf, count))
881 goto out;
882 kbuf[count] = '\0';
883 pos = kbuf;
884
885 /* What is being requested? */
886 ret = -EINVAL;
887 if (strncmp(pos, "allow", 5) == 0) {
888 pos += 5;
889 setgroups_allowed = true;
890 }
891 else if (strncmp(pos, "deny", 4) == 0) {
892 pos += 4;
893 setgroups_allowed = false;
894 }
895 else
896 goto out;
897
898 /* Verify there is not trailing junk on the line */
899 pos = skip_spaces(pos);
900 if (*pos != '\0')
901 goto out;
902
903 ret = -EPERM;
904 mutex_lock(&userns_state_mutex);
905 if (setgroups_allowed) {
906 /* Enabling setgroups after setgroups has been disabled
907 * is not allowed.
908 */
909 if (!(ns->flags & USERNS_SETGROUPS_ALLOWED))
910 goto out_unlock;
911 } else {
912 /* Permanently disabling setgroups after setgroups has
913 * been enabled by writing the gid_map is not allowed.
914 */
915 if (ns->gid_map.nr_extents != 0)
916 goto out_unlock;
917 ns->flags &= ~USERNS_SETGROUPS_ALLOWED;
918 }
919 mutex_unlock(&userns_state_mutex);
920
921 /* Report a successful write */
922 *ppos = count;
923 ret = count;
924out:
925 return ret;
926out_unlock:
927 mutex_unlock(&userns_state_mutex);
928 goto out;
929}
930
931bool userns_may_setgroups(const struct user_namespace *ns)
932{
933 bool allowed;
934
935 mutex_lock(&userns_state_mutex);
936 /* It is not safe to use setgroups until a gid mapping in
937 * the user namespace has been established.
938 */
939 allowed = ns->gid_map.nr_extents != 0;
940 /* Is setgroups allowed? */
941 allowed = allowed && (ns->flags & USERNS_SETGROUPS_ALLOWED);
942 mutex_unlock(&userns_state_mutex);
943
944 return allowed;
945}
946
947static inline struct user_namespace *to_user_ns(struct ns_common *ns)
948{
949 return container_of(ns, struct user_namespace, ns);
950}
951
952static struct ns_common *userns_get(struct task_struct *task)
845{ 953{
846 struct user_namespace *user_ns; 954 struct user_namespace *user_ns;
847 955
@@ -849,17 +957,17 @@ static void *userns_get(struct task_struct *task)
849 user_ns = get_user_ns(__task_cred(task)->user_ns); 957 user_ns = get_user_ns(__task_cred(task)->user_ns);
850 rcu_read_unlock(); 958 rcu_read_unlock();
851 959
852 return user_ns; 960 return user_ns ? &user_ns->ns : NULL;
853} 961}
854 962
855static void userns_put(void *ns) 963static void userns_put(struct ns_common *ns)
856{ 964{
857 put_user_ns(ns); 965 put_user_ns(to_user_ns(ns));
858} 966}
859 967
860static int userns_install(struct nsproxy *nsproxy, void *ns) 968static int userns_install(struct nsproxy *nsproxy, struct ns_common *ns)
861{ 969{
862 struct user_namespace *user_ns = ns; 970 struct user_namespace *user_ns = to_user_ns(ns);
863 struct cred *cred; 971 struct cred *cred;
864 972
865 /* Don't allow gaining capabilities by reentering 973 /* Don't allow gaining capabilities by reentering
@@ -888,19 +996,12 @@ static int userns_install(struct nsproxy *nsproxy, void *ns)
888 return commit_creds(cred); 996 return commit_creds(cred);
889} 997}
890 998
891static unsigned int userns_inum(void *ns)
892{
893 struct user_namespace *user_ns = ns;
894 return user_ns->proc_inum;
895}
896
897const struct proc_ns_operations userns_operations = { 999const struct proc_ns_operations userns_operations = {
898 .name = "user", 1000 .name = "user",
899 .type = CLONE_NEWUSER, 1001 .type = CLONE_NEWUSER,
900 .get = userns_get, 1002 .get = userns_get,
901 .put = userns_put, 1003 .put = userns_put,
902 .install = userns_install, 1004 .install = userns_install,
903 .inum = userns_inum,
904}; 1005};
905 1006
906static __init int user_namespaces_init(void) 1007static __init int user_namespaces_init(void)
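
The proc_setgroups_show()/proc_setgroups_write() pair above backs the /proc/[pid]/setgroups file: "deny" can only be written while no gid mapping exists, and installing the gid mapping afterwards locks the choice in. An unprivileged process that creates a user namespace therefore writes "deny" before writing its gid_map. A hedged userspace sketch of that sequence (error handling trimmed; paths and the 0-to-current-ID mapping are illustrative):

#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static void write_file(const char *path, const char *buf)
{
        int fd = open(path, O_WRONLY);

        if (fd < 0 || write(fd, buf, strlen(buf)) < 0)
                perror(path);
        if (fd >= 0)
                close(fd);
}

int main(void)
{
        uid_t uid = getuid();   /* record IDs before entering the namespace */
        gid_t gid = getgid();
        char map[64];

        if (unshare(CLONE_NEWUSER) < 0) {
                perror("unshare");
                return 1;
        }

        /* Must precede gid_map for an unprivileged writer. */
        write_file("/proc/self/setgroups", "deny");

        snprintf(map, sizeof(map), "0 %d 1", (int)uid);
        write_file("/proc/self/uid_map", map);
        snprintf(map, sizeof(map), "0 %d 1", (int)gid);
        write_file("/proc/self/gid_map", map);

        printf("in-namespace uid=%d gid=%d\n", (int)getuid(), (int)getgid());
        return 0;
}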
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 883aaaa7de8a..831ea7108232 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -42,12 +42,14 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns,
42 if (!ns) 42 if (!ns)
43 return ERR_PTR(-ENOMEM); 43 return ERR_PTR(-ENOMEM);
44 44
45 err = proc_alloc_inum(&ns->proc_inum); 45 err = ns_alloc_inum(&ns->ns);
46 if (err) { 46 if (err) {
47 kfree(ns); 47 kfree(ns);
48 return ERR_PTR(err); 48 return ERR_PTR(err);
49 } 49 }
50 50
51 ns->ns.ops = &utsns_operations;
52
51 down_read(&uts_sem); 53 down_read(&uts_sem);
52 memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); 54 memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
53 ns->user_ns = get_user_ns(user_ns); 55 ns->user_ns = get_user_ns(user_ns);
@@ -84,11 +86,16 @@ void free_uts_ns(struct kref *kref)
84 86
85 ns = container_of(kref, struct uts_namespace, kref); 87 ns = container_of(kref, struct uts_namespace, kref);
86 put_user_ns(ns->user_ns); 88 put_user_ns(ns->user_ns);
87 proc_free_inum(ns->proc_inum); 89 ns_free_inum(&ns->ns);
88 kfree(ns); 90 kfree(ns);
89} 91}
90 92
91static void *utsns_get(struct task_struct *task) 93static inline struct uts_namespace *to_uts_ns(struct ns_common *ns)
94{
95 return container_of(ns, struct uts_namespace, ns);
96}
97
98static struct ns_common *utsns_get(struct task_struct *task)
92{ 99{
93 struct uts_namespace *ns = NULL; 100 struct uts_namespace *ns = NULL;
94 struct nsproxy *nsproxy; 101 struct nsproxy *nsproxy;
@@ -101,17 +108,17 @@ static void *utsns_get(struct task_struct *task)
101 } 108 }
102 task_unlock(task); 109 task_unlock(task);
103 110
104 return ns; 111 return ns ? &ns->ns : NULL;
105} 112}
106 113
107static void utsns_put(void *ns) 114static void utsns_put(struct ns_common *ns)
108{ 115{
109 put_uts_ns(ns); 116 put_uts_ns(to_uts_ns(ns));
110} 117}
111 118
112static int utsns_install(struct nsproxy *nsproxy, void *new) 119static int utsns_install(struct nsproxy *nsproxy, struct ns_common *new)
113{ 120{
114 struct uts_namespace *ns = new; 121 struct uts_namespace *ns = to_uts_ns(new);
115 122
116 if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || 123 if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) ||
117 !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) 124 !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
@@ -123,18 +130,10 @@ static int utsns_install(struct nsproxy *nsproxy, void *new)
123 return 0; 130 return 0;
124} 131}
125 132
126static unsigned int utsns_inum(void *vp)
127{
128 struct uts_namespace *ns = vp;
129
130 return ns->proc_inum;
131}
132
133const struct proc_ns_operations utsns_operations = { 133const struct proc_ns_operations utsns_operations = {
134 .name = "uts", 134 .name = "uts",
135 .type = CLONE_NEWUTS, 135 .type = CLONE_NEWUTS,
136 .get = utsns_get, 136 .get = utsns_get,
137 .put = utsns_put, 137 .put = utsns_put,
138 .install = utsns_install, 138 .install = utsns_install,
139 .inum = utsns_inum,
140}; 139};
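
Both this hunk and the user_namespace.c one follow the same ns_common conversion recipe: embed a struct ns_common in the namespace, set ns.ops at creation time, change the proc_ns_operations callbacks to take and return struct ns_common * and recover the real namespace with container_of(), and drop the per-namespace .inum callback in favour of the shared inum field managed by ns_alloc_inum()/ns_free_inum(). A generic sketch of that shape for a hypothetical namespace type (all names here are invented):

#include <linux/ns_common.h>
#include <linux/proc_ns.h>
#include <linux/kref.h>
#include <linux/slab.h>

/* hypothetical namespace type, illustrating the conversion pattern */
struct demo_namespace {
        struct kref             kref;
        struct ns_common        ns;
};

static inline struct demo_namespace *to_demo_ns(struct ns_common *ns)
{
        return container_of(ns, struct demo_namespace, ns);
}

static void demo_ns_release(struct kref *kref)
{
        struct demo_namespace *demo =
                container_of(kref, struct demo_namespace, kref);

        ns_free_inum(&demo->ns);        /* was proc_free_inum(demo->proc_inum) */
        kfree(demo);
}

/* .put now takes the embedded ns_common instead of a void pointer */
static void demons_put(struct ns_common *ns)
{
        kref_put(&to_demo_ns(ns)->kref, demo_ns_release);
}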
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index a8d6914030fe..70bf11815f84 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -15,11 +15,6 @@
15#include <linux/cpu.h> 15#include <linux/cpu.h>
16#include <linux/nmi.h> 16#include <linux/nmi.h>
17#include <linux/init.h> 17#include <linux/init.h>
18#include <linux/delay.h>
19#include <linux/freezer.h>
20#include <linux/kthread.h>
21#include <linux/lockdep.h>
22#include <linux/notifier.h>
23#include <linux/module.h> 18#include <linux/module.h>
24#include <linux/sysctl.h> 19#include <linux/sysctl.h>
25#include <linux/smpboot.h> 20#include <linux/smpboot.h>
@@ -47,6 +42,7 @@ static DEFINE_PER_CPU(bool, softlockup_touch_sync);
47static DEFINE_PER_CPU(bool, soft_watchdog_warn); 42static DEFINE_PER_CPU(bool, soft_watchdog_warn);
48static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); 43static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
49static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt); 44static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt);
45static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved);
50#ifdef CONFIG_HARDLOCKUP_DETECTOR 46#ifdef CONFIG_HARDLOCKUP_DETECTOR
51static DEFINE_PER_CPU(bool, hard_watchdog_warn); 47static DEFINE_PER_CPU(bool, hard_watchdog_warn);
52static DEFINE_PER_CPU(bool, watchdog_nmi_touch); 48static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
@@ -63,6 +59,25 @@ static unsigned long soft_lockup_nmi_warn;
63static int hardlockup_panic = 59static int hardlockup_panic =
64 CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; 60 CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
65 61
62static bool hardlockup_detector_enabled = true;
63/*
64 * We may not want to enable hard lockup detection by default in all cases,
65 * for example when running the kernel as a guest on a hypervisor. In these
66 * cases this function can be called to disable hard lockup detection. This
67 * function should only be executed once by the boot processor before the
68 * kernel command line parameters are parsed, because otherwise it is not
69 * possible to override this in hardlockup_panic_setup().
70 */
71void watchdog_enable_hardlockup_detector(bool val)
72{
73 hardlockup_detector_enabled = val;
74}
75
76bool watchdog_hardlockup_detector_is_enabled(void)
77{
78 return hardlockup_detector_enabled;
79}
80
66static int __init hardlockup_panic_setup(char *str) 81static int __init hardlockup_panic_setup(char *str)
67{ 82{
68 if (!strncmp(str, "panic", 5)) 83 if (!strncmp(str, "panic", 5))
@@ -71,6 +86,14 @@ static int __init hardlockup_panic_setup(char *str)
71 hardlockup_panic = 0; 86 hardlockup_panic = 0;
72 else if (!strncmp(str, "0", 1)) 87 else if (!strncmp(str, "0", 1))
73 watchdog_user_enabled = 0; 88 watchdog_user_enabled = 0;
89 else if (!strncmp(str, "1", 1) || !strncmp(str, "2", 1)) {
90 /*
91 * Setting 'nmi_watchdog=1' or 'nmi_watchdog=2' (legacy option)
92 * has the same effect.
93 */
94 watchdog_user_enabled = 1;
95 watchdog_enable_hardlockup_detector(true);
96 }
74 return 1; 97 return 1;
75} 98}
76__setup("nmi_watchdog=", hardlockup_panic_setup); 99__setup("nmi_watchdog=", hardlockup_panic_setup);
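
The new watchdog_enable_hardlockup_detector() hook is meant to be called once, on the boot CPU, before command-line parsing, typically from platform or guest setup code that knows PMU-based hard lockup detection is unreliable there; as the hunk above shows, nmi_watchdog=1 (or =2) on the command line can still force it back on. A hedged sketch of such a caller, assuming the declaration lands in <linux/nmi.h> alongside the other watchdog hooks (the init function itself is invented for illustration):

#include <linux/nmi.h>
#include <linux/init.h>

/* illustrative early platform setup, not an in-tree function */
static void __init demo_guest_early_setup(void)
{
        /*
         * Perf NMI watchdog events are often emulated poorly (or not at
         * all) under a hypervisor, so default the hard lockup detector
         * to off; the admin can re-enable it with nmi_watchdog=1.
         */
        watchdog_enable_hardlockup_detector(false);
}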
@@ -185,7 +208,7 @@ void touch_nmi_watchdog(void)
185 * case we shouldn't have to worry about the watchdog 208 * case we shouldn't have to worry about the watchdog
186 * going off. 209 * going off.
187 */ 210 */
188 __raw_get_cpu_var(watchdog_nmi_touch) = true; 211 raw_cpu_write(watchdog_nmi_touch, true);
189 touch_softlockup_watchdog(); 212 touch_softlockup_watchdog();
190} 213}
191EXPORT_SYMBOL(touch_nmi_watchdog); 214EXPORT_SYMBOL(touch_nmi_watchdog);
@@ -194,8 +217,8 @@ EXPORT_SYMBOL(touch_nmi_watchdog);
194 217
195void touch_softlockup_watchdog_sync(void) 218void touch_softlockup_watchdog_sync(void)
196{ 219{
197 __raw_get_cpu_var(softlockup_touch_sync) = true; 220 __this_cpu_write(softlockup_touch_sync, true);
198 __raw_get_cpu_var(watchdog_touch_ts) = 0; 221 __this_cpu_write(watchdog_touch_ts, 0);
199} 222}
200 223
201#ifdef CONFIG_HARDLOCKUP_DETECTOR 224#ifdef CONFIG_HARDLOCKUP_DETECTOR
@@ -333,8 +356,22 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
333 return HRTIMER_RESTART; 356 return HRTIMER_RESTART;
334 357
335 /* only warn once */ 358 /* only warn once */
336 if (__this_cpu_read(soft_watchdog_warn) == true) 359 if (__this_cpu_read(soft_watchdog_warn) == true) {
360 /*
361 * When multiple processes are causing softlockups the
362 * softlockup detector only warns on the first one
363 * because the code relies on a full quiet cycle to
364 * re-arm. The second process prevents the quiet cycle
365 * and never gets reported. Use task pointers to detect
366 * this.
367 */
368 if (__this_cpu_read(softlockup_task_ptr_saved) !=
369 current) {
370 __this_cpu_write(soft_watchdog_warn, false);
371 __touch_watchdog();
372 }
337 return HRTIMER_RESTART; 373 return HRTIMER_RESTART;
374 }
338 375
339 if (softlockup_all_cpu_backtrace) { 376 if (softlockup_all_cpu_backtrace) {
340 /* Prevent multiple soft-lockup reports if one cpu is already 377 /* Prevent multiple soft-lockup reports if one cpu is already
@@ -350,6 +387,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
350 pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", 387 pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
351 smp_processor_id(), duration, 388 smp_processor_id(), duration,
352 current->comm, task_pid_nr(current)); 389 current->comm, task_pid_nr(current));
390 __this_cpu_write(softlockup_task_ptr_saved, current);
353 print_modules(); 391 print_modules();
354 print_irqtrace_events(current); 392 print_irqtrace_events(current);
355 if (regs) 393 if (regs)
@@ -387,7 +425,7 @@ static void watchdog_set_prio(unsigned int policy, unsigned int prio)
387 425
388static void watchdog_enable(unsigned int cpu) 426static void watchdog_enable(unsigned int cpu)
389{ 427{
390 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); 428 struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
391 429
392 /* kick off the timer for the hardlockup detector */ 430 /* kick off the timer for the hardlockup detector */
393 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 431 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
@@ -407,7 +445,7 @@ static void watchdog_enable(unsigned int cpu)
407 445
408static void watchdog_disable(unsigned int cpu) 446static void watchdog_disable(unsigned int cpu)
409{ 447{
410 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); 448 struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
411 449
412 watchdog_set_prio(SCHED_NORMAL, 0); 450 watchdog_set_prio(SCHED_NORMAL, 0);
413 hrtimer_cancel(hrtimer); 451 hrtimer_cancel(hrtimer);
@@ -454,6 +492,15 @@ static int watchdog_nmi_enable(unsigned int cpu)
454 struct perf_event_attr *wd_attr; 492 struct perf_event_attr *wd_attr;
455 struct perf_event *event = per_cpu(watchdog_ev, cpu); 493 struct perf_event *event = per_cpu(watchdog_ev, cpu);
456 494
495 /*
496 * Some kernels need to default hard lockup detection to
497 * 'disabled', for example a guest on a hypervisor.
498 */
499 if (!watchdog_hardlockup_detector_is_enabled()) {
500 event = ERR_PTR(-ENOENT);
501 goto handle_err;
502 }
503
457 /* is it already setup and enabled? */ 504 /* is it already setup and enabled? */
458 if (event && event->state > PERF_EVENT_STATE_OFF) 505 if (event && event->state > PERF_EVENT_STATE_OFF)
459 goto out; 506 goto out;
@@ -468,6 +515,7 @@ static int watchdog_nmi_enable(unsigned int cpu)
 	/* Try to register using hardware perf events */
 	event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
 
+handle_err:
 	/* save cpu0 error for future comparision */
 	if (cpu == 0 && IS_ERR(event))
 		cpu0_err = PTR_ERR(event);
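[Editor's note] Jumping to handle_err with event set to ERR_PTR(-ENOENT) lets the existing error path treat "hard lockup detection disabled by default" exactly like a failed perf counter allocation. The ERR_PTR()/IS_ERR()/PTR_ERR() trio from <linux/err.h> is the standard way to fold an errno into a pointer return. A hedged, self-contained sketch of the same shape; struct wd_counter and both functions are hypothetical:

#include <linux/err.h>
#include <linux/slab.h>

struct wd_counter { int id; };

static struct wd_counter *wd_counter_create(bool allowed)
{
	struct wd_counter *c;

	if (!allowed)
		return ERR_PTR(-ENOENT);	/* encode the errno in the pointer */

	c = kzalloc(sizeof(*c), GFP_KERNEL);
	if (!c)
		return ERR_PTR(-ENOMEM);
	return c;
}

static int wd_counter_setup(bool allowed)
{
	struct wd_counter *c = wd_counter_create(allowed);

	if (IS_ERR(c))
		return PTR_ERR(c);	/* -ENOENT here means "intentionally off" */
	/* ... use c ... */
	kfree(c);
	return 0;
}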
@@ -514,7 +562,10 @@ static void watchdog_nmi_disable(unsigned int cpu)
 		/* should be in cleanup, but blocks oprofile */
 		perf_event_release_kernel(event);
 	}
-	return;
+	if (cpu == 0) {
+		/* watchdog_nmi_enable() expects this to be zero initially. */
+		cpu0_err = 0;
+	}
 }
 #else
 static int watchdog_nmi_enable(unsigned int cpu) { return 0; }
@@ -534,7 +585,7 @@ static struct smp_hotplug_thread watchdog_threads = {
 
 static void restart_watchdog_hrtimer(void *info)
 {
-	struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
+	struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
 	int ret;
 
 	/*
@@ -610,11 +661,13 @@ int proc_dowatchdog(struct ctl_table *table, int write,
 		    void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	int err, old_thresh, old_enabled;
+	bool old_hardlockup;
 	static DEFINE_MUTEX(watchdog_proc_mutex);
 
 	mutex_lock(&watchdog_proc_mutex);
 	old_thresh = ACCESS_ONCE(watchdog_thresh);
 	old_enabled = ACCESS_ONCE(watchdog_user_enabled);
+	old_hardlockup = watchdog_hardlockup_detector_is_enabled();
 
 	err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 	if (err || !write)
@@ -626,15 +679,22 @@ int proc_dowatchdog(struct ctl_table *table, int write,
 	 * disabled. The 'watchdog_running' variable check in
 	 * watchdog_*_all_cpus() function takes care of this.
 	 */
-	if (watchdog_user_enabled && watchdog_thresh)
+	if (watchdog_user_enabled && watchdog_thresh) {
+		/*
+		 * Prevent a change in watchdog_thresh accidentally overriding
+		 * the enablement of the hardlockup detector.
+		 */
+		if (watchdog_user_enabled != old_enabled)
+			watchdog_enable_hardlockup_detector(true);
 		err = watchdog_enable_all_cpus(old_thresh != watchdog_thresh);
-	else
+	} else
 		watchdog_disable_all_cpus();
 
 	/* Restore old values on failure */
 	if (err) {
 		watchdog_thresh = old_thresh;
 		watchdog_user_enabled = old_enabled;
+		watchdog_enable_hardlockup_detector(old_hardlockup);
 	}
 out:
 	mutex_unlock(&watchdog_proc_mutex);
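[Editor's note] proc_dowatchdog() now snapshots the hardlockup enablement next to the old threshold and enable flag, so a failed write rolls all three back; and because watchdog_enable_hardlockup_detector(true) is called only when the enable flag itself changed, a write that merely adjusts watchdog_thresh does not re-enable a hardlockup detector that was defaulted off. The general snapshot/apply/rollback shape, sketched for a hypothetical sysctl handler (my_knob, my_apply() and proc_my_knob() are made up; the proc helpers, mutex and ACCESS_ONCE are real APIs of this era):

static DEFINE_MUTEX(my_proc_mutex);
static int my_knob;

static int my_apply(int new_value)
{
	/* stands in for watchdog_enable_all_cpus()/watchdog_disable_all_cpus() */
	return 0;
}

static int proc_my_knob(struct ctl_table *table, int write,
			void __user *buffer, size_t *lenp, loff_t *ppos)
{
	int err, old_knob;

	mutex_lock(&my_proc_mutex);
	old_knob = ACCESS_ONCE(my_knob);	/* snapshot before the write */

	err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	if (err || !write)
		goto out;

	err = my_apply(my_knob);		/* act on the new value */
	if (err)
		my_knob = old_knob;		/* restore the old value on failure */
out:
	mutex_unlock(&my_proc_mutex);
	return err;
}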
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 5dbe22aa3efd..6202b08f1933 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1804,8 +1804,8 @@ static void pool_mayday_timeout(unsigned long __pool)
 	struct worker_pool *pool = (void *)__pool;
 	struct work_struct *work;
 
-	spin_lock_irq(&wq_mayday_lock);		/* for wq->maydays */
-	spin_lock(&pool->lock);
+	spin_lock_irq(&pool->lock);
+	spin_lock(&wq_mayday_lock);		/* for wq->maydays */
 
 	if (need_to_create_worker(pool)) {
 		/*
1811 /* 1811 /*
@@ -1818,8 +1818,8 @@ static void pool_mayday_timeout(unsigned long __pool)
 			send_mayday(work);
 	}
 
-	spin_unlock(&pool->lock);
-	spin_unlock_irq(&wq_mayday_lock);
+	spin_unlock(&wq_mayday_lock);
+	spin_unlock_irq(&pool->lock);
 
 	mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL);
 }
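[Editor's note] The two pool_mayday_timeout() hunks simply invert the nesting of pool->lock and wq_mayday_lock. This appears to prepare for the rescuer change further down, which now takes wq_mayday_lock while already holding pool->lock; keeping one consistent ordering (pool->lock outer, wq_mayday_lock inner) on every path is what rules out an AB-BA deadlock. A self-contained sketch of that rule, with lock_outer/lock_inner and both functions hypothetical:

/* two paths that nest the same pair of locks must agree on the order */
static DEFINE_SPINLOCK(lock_outer);	/* plays the role of pool->lock */
static DEFINE_SPINLOCK(lock_inner);	/* plays the role of wq_mayday_lock */

static void timer_path(void)		/* e.g. the mayday timer */
{
	spin_lock_irq(&lock_outer);
	spin_lock(&lock_inner);
	/* ... queue mayday requests ... */
	spin_unlock(&lock_inner);
	spin_unlock_irq(&lock_outer);
}

static void rescuer_path(void)		/* e.g. the rescuer re-queueing a pwq */
{
	spin_lock_irq(&lock_outer);
	/* taking the inner lock here is safe only because timer_path() nests the same way */
	spin_lock(&lock_inner);
	/* ... move the request back onto the mayday list ... */
	spin_unlock(&lock_inner);
	spin_unlock_irq(&lock_outer);
}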
@@ -2043,9 +2043,10 @@ __acquires(&pool->lock)
 	 * kernels, where a requeueing work item waiting for something to
 	 * happen could deadlock with stop_machine as such work item could
 	 * indefinitely requeue itself while all other CPUs are trapped in
-	 * stop_machine.
+	 * stop_machine. At the same time, report a quiescent RCU state so
+	 * the same condition doesn't freeze RCU.
 	 */
-	cond_resched();
+	cond_resched_rcu_qs();
 
 	spin_lock_irq(&pool->lock);
 
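[Editor's note] Per the comment added above, cond_resched_rcu_qs() behaves like cond_resched() but additionally tells RCU that this CPU has passed a quiescent state, so a work item that requeues itself indefinitely cannot stall grace periods the way it could stall stop_machine. A hedged sketch of the typical call site, a long-running loop in process context; my_worker_thread() and do_one_unit_of_work() are hypothetical:

static void do_one_unit_of_work(void)
{
	/* placeholder for the real per-iteration work */
}

static int my_worker_thread(void *unused)
{
	while (!kthread_should_stop()) {
		do_one_unit_of_work();

		/* yield if needed and report an RCU quiescent state */
		cond_resched_rcu_qs();
	}
	return 0;
}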
@@ -2247,12 +2248,30 @@ repeat:
 		 * Slurp in all works issued via this workqueue and
 		 * process'em.
 		 */
-		WARN_ON_ONCE(!list_empty(&rescuer->scheduled));
+		WARN_ON_ONCE(!list_empty(scheduled));
 		list_for_each_entry_safe(work, n, &pool->worklist, entry)
 			if (get_work_pwq(work) == pwq)
 				move_linked_works(work, scheduled, &n);
 
-		process_scheduled_works(rescuer);
+		if (!list_empty(scheduled)) {
+			process_scheduled_works(rescuer);
+
+			/*
+			 * The above execution of rescued work items could
+			 * have created more to rescue through
+			 * pwq_activate_first_delayed() or chained
+			 * queueing. Let's put @pwq back on mayday list so
+			 * that such back-to-back work items, which may be
+			 * being used to relieve memory pressure, don't
+			 * incur MAYDAY_INTERVAL delay inbetween.
+			 */
+			if (need_to_create_worker(pool)) {
+				spin_lock(&wq_mayday_lock);
+				get_pwq(pwq);
+				list_move_tail(&pwq->mayday_node, &wq->maydays);
+				spin_unlock(&wq_mayday_lock);
+			}
+		}
 
 		/*
 		 * Put the reference grabbed by send_mayday(). @pool won't