Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.locks | 7
-rw-r--r--  kernel/acct.c | 6
-rw-r--r--  kernel/audit.c | 70
-rw-r--r--  kernel/auditsc.c | 27
-rw-r--r--  kernel/backtracetest.c | 18
-rw-r--r--  kernel/capability.c | 26
-rw-r--r--  kernel/cgroup.c | 1831
-rw-r--r--  kernel/cgroup_freezer.c | 138
-rw-r--r--  kernel/compat.c | 8
-rw-r--r--  kernel/context_tracking.c | 2
-rw-r--r--  kernel/cpu.c | 42
-rw-r--r--  kernel/cpuset.c | 60
-rw-r--r--  kernel/debug/debug_core.c | 4
-rw-r--r--  kernel/debug/kdb/kdb_bt.c | 2
-rw-r--r--  kernel/debug/kdb/kdb_io.c | 2
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 2
-rw-r--r--  kernel/events/core.c | 329
-rw-r--r--  kernel/events/uprobes.c | 108
-rw-r--r--  kernel/exec_domain.c | 14
-rw-r--r--  kernel/exit.c | 61
-rw-r--r--  kernel/fork.c | 20
-rw-r--r--  kernel/futex.c | 243
-rw-r--r--  kernel/gcov/base.c | 6
-rw-r--r--  kernel/gcov/gcc_4_7.c | 5
-rw-r--r--  kernel/hrtimer.c | 9
-rw-r--r--  kernel/hung_task.c | 4
-rw-r--r--  kernel/irq/Kconfig | 9
-rw-r--r--  kernel/irq/chip.c | 5
-rw-r--r--  kernel/irq/internals.h | 8
-rw-r--r--  kernel/irq/irqdesc.c | 95
-rw-r--r--  kernel/irq/irqdomain.c | 6
-rw-r--r--  kernel/irq/manage.c | 4
-rw-r--r--  kernel/irq/spurious.c | 106
-rw-r--r--  kernel/kexec.c | 77
-rw-r--r--  kernel/kmod.c | 7
-rw-r--r--  kernel/kprobes.c | 392
-rw-r--r--  kernel/ksysfs.c | 5
-rw-r--r--  kernel/kthread.c | 4
-rw-r--r--  kernel/latencytop.c | 5
-rw-r--r--  kernel/locking/Makefile | 1
-rw-r--r--  kernel/locking/lockdep.c | 2
-rw-r--r--  kernel/locking/lockdep_internals.h | 6
-rw-r--r--  kernel/locking/locktorture.c | 12
-rw-r--r--  kernel/locking/qrwlock.c | 133
-rw-r--r--  kernel/locking/rtmutex.c | 32
-rw-r--r--  kernel/locking/rwsem-xadd.c | 274
-rw-r--r--  kernel/locking/rwsem.c | 31
-rw-r--r--  kernel/module.c | 44
-rw-r--r--  kernel/notifier.c | 22
-rw-r--r--  kernel/panic.c | 23
-rw-r--r--  kernel/params.c | 25
-rw-r--r--  kernel/power/Kconfig | 3
-rw-r--r--  kernel/power/hibernate.c | 30
-rw-r--r--  kernel/power/main.c | 33
-rw-r--r--  kernel/power/power.h | 9
-rw-r--r--  kernel/power/process.c | 3
-rw-r--r--  kernel/power/snapshot.c | 2
-rw-r--r--  kernel/power/suspend.c | 125
-rw-r--r--  kernel/power/suspend_test.c | 24
-rw-r--r--  kernel/power/swap.c | 2
-rw-r--r--  kernel/printk/printk.c | 348
-rw-r--r--  kernel/profile.c | 20
-rw-r--r--  kernel/rcu/rcutorture.c | 217
-rw-r--r--  kernel/rcu/tiny_plugin.h | 8
-rw-r--r--  kernel/rcu/tree.c | 331
-rw-r--r--  kernel/rcu/tree.h | 11
-rw-r--r--  kernel/rcu/tree_plugin.h | 144
-rw-r--r--  kernel/rcu/update.c | 30
-rw-r--r--  kernel/reboot.c | 21
-rw-r--r--  kernel/res_counter.c | 7
-rw-r--r--  kernel/resource.c | 7
-rw-r--r--  kernel/sched/core.c | 613
-rw-r--r--  kernel/sched/cpuacct.c | 2
-rw-r--r--  kernel/sched/cpudeadline.c | 37
-rw-r--r--  kernel/sched/cpudeadline.h | 6
-rw-r--r--  kernel/sched/cpupri.c | 16
-rw-r--r--  kernel/sched/cpupri.h | 2
-rw-r--r--  kernel/sched/cputime.c | 32
-rw-r--r--  kernel/sched/deadline.c | 30
-rw-r--r--  kernel/sched/fair.c | 647
-rw-r--r--  kernel/sched/features.h | 8
-rw-r--r--  kernel/sched/idle.c | 170
-rw-r--r--  kernel/sched/rt.c | 130
-rw-r--r--  kernel/sched/sched.h | 52
-rw-r--r--  kernel/sched/stop_task.c | 4
-rw-r--r--  kernel/sched/wait.c | 2
-rw-r--r--  kernel/seccomp.c | 114
-rw-r--r--  kernel/signal.c | 95
-rw-r--r--  kernel/smp.c | 18
-rw-r--r--  kernel/softirq.c | 8
-rw-r--r--  kernel/stop_machine.c | 1
-rw-r--r--  kernel/sys.c | 6
-rw-r--r--  kernel/sys_ni.c | 2
-rw-r--r--  kernel/sysctl.c | 107
-rw-r--r--  kernel/time/ntp.c | 32
-rw-r--r--  kernel/time/sched_clock.c | 13
-rw-r--r--  kernel/time/timekeeping.c | 7
-rw-r--r--  kernel/torture.c | 40
-rw-r--r--  kernel/trace/Kconfig | 30
-rw-r--r--  kernel/trace/Makefile | 3
-rw-r--r--  kernel/trace/ftrace.c | 267
-rw-r--r--  kernel/trace/ring_buffer.c | 5
-rw-r--r--  kernel/trace/trace.c | 458
-rw-r--r--  kernel/trace/trace.h | 46
-rw-r--r--  kernel/trace/trace_benchmark.c | 198
-rw-r--r--  kernel/trace/trace_benchmark.h | 41
-rw-r--r--  kernel/trace/trace_event_perf.c | 5
-rw-r--r--  kernel/trace/trace_events.c | 13
-rw-r--r--  kernel/trace/trace_functions.c | 56
-rw-r--r--  kernel/trace/trace_functions_graph.c | 19
-rw-r--r--  kernel/trace/trace_irqsoff.c | 71
-rw-r--r--  kernel/trace/trace_kprobe.c | 74
-rw-r--r--  kernel/trace/trace_nop.c | 1
-rw-r--r--  kernel/trace/trace_output.c | 41
-rw-r--r--  kernel/trace/trace_probe.c | 65
-rw-r--r--  kernel/trace/trace_probe.h | 15
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 70
-rw-r--r--  kernel/trace/trace_selftest.c | 69
-rw-r--r--  kernel/trace/trace_stack.c | 42
-rw-r--r--  kernel/trace/trace_uprobe.c | 66
-rw-r--r--  kernel/tracepoint.c | 6
-rw-r--r--  kernel/user.c | 1
-rw-r--r--  kernel/user_namespace.c | 33
-rw-r--r--  kernel/utsname_sysctl.c | 10
-rw-r--r--  kernel/workqueue.c | 490
-rw-r--r--  kernel/workqueue_internal.h | 2
126 files changed, 6383 insertions, 3755 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index d2b32ac27a39..35536d9c0964 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -223,3 +223,10 @@ endif
223config MUTEX_SPIN_ON_OWNER 223config MUTEX_SPIN_ON_OWNER
224 def_bool y 224 def_bool y
225 depends on SMP && !DEBUG_MUTEXES 225 depends on SMP && !DEBUG_MUTEXES
226
227config ARCH_USE_QUEUE_RWLOCK
228 bool
229
230config QUEUE_RWLOCK
231 def_bool y if ARCH_USE_QUEUE_RWLOCK
232 depends on SMP
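
The two new options select the queue-based (fair) rwlock implementation added in kernel/locking/qrwlock.c (see the diffstat above) for architectures that opt in via ARCH_USE_QUEUE_RWLOCK. The change is transparent to lock users: rwlock_t and the read_lock()/write_lock() API stay the same, only the implementation behind them changes. A minimal caller-side sketch, with hypothetical data names but the real locking API:

#include <linux/spinlock.h>

/* Example data only; the lock calls below are the unchanged rwlock API. */
static DEFINE_RWLOCK(demo_lock);
static unsigned long demo_counter;

static unsigned long demo_read(void)
{
	unsigned long val;

	read_lock(&demo_lock);		/* readers may run concurrently */
	val = demo_counter;
	read_unlock(&demo_lock);
	return val;
}

static void demo_update(unsigned long val)
{
	write_lock(&demo_lock);		/* writers are exclusive */
	demo_counter = val;
	write_unlock(&demo_lock);
}

Whether CONFIG_QUEUE_RWLOCK is enabled decides only which implementation backs demo_lock; this calling code is identical either way.
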
diff --git a/kernel/acct.c b/kernel/acct.c
index 8d6e145138bb..808a86ff229d 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -55,7 +55,7 @@
55#include <linux/times.h> 55#include <linux/times.h>
56#include <linux/syscalls.h> 56#include <linux/syscalls.h>
57#include <linux/mount.h> 57#include <linux/mount.h>
58#include <asm/uaccess.h> 58#include <linux/uaccess.h>
59#include <asm/div64.h> 59#include <asm/div64.h>
60#include <linux/blkdev.h> /* sector_div */ 60#include <linux/blkdev.h> /* sector_div */
61#include <linux/pid_namespace.h> 61#include <linux/pid_namespace.h>
@@ -134,7 +134,7 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
134 spin_lock(&acct_lock); 134 spin_lock(&acct_lock);
135 if (file != acct->file) { 135 if (file != acct->file) {
136 if (act) 136 if (act)
137 res = act>0; 137 res = act > 0;
138 goto out; 138 goto out;
139 } 139 }
140 140
@@ -262,7 +262,7 @@ SYSCALL_DEFINE1(acct, const char __user *, name)
262 if (name) { 262 if (name) {
263 struct filename *tmp = getname(name); 263 struct filename *tmp = getname(name);
264 if (IS_ERR(tmp)) 264 if (IS_ERR(tmp))
265 return (PTR_ERR(tmp)); 265 return PTR_ERR(tmp);
266 error = acct_on(tmp); 266 error = acct_on(tmp);
267 putname(tmp); 267 putname(tmp);
268 } else { 268 } else {
diff --git a/kernel/audit.c b/kernel/audit.c
index 7c2893602d06..3ef2e0e797e8 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -44,7 +44,7 @@
44#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 44#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
45 45
46#include <linux/init.h> 46#include <linux/init.h>
47#include <asm/types.h> 47#include <linux/types.h>
48#include <linux/atomic.h> 48#include <linux/atomic.h>
49#include <linux/mm.h> 49#include <linux/mm.h>
50#include <linux/export.h> 50#include <linux/export.h>
@@ -424,6 +424,38 @@ static void kauditd_send_skb(struct sk_buff *skb)
424} 424}
425 425
426/* 426/*
427 * kauditd_send_multicast_skb - send the skb to multicast userspace listeners
428 *
429 * This function doesn't consume an skb as might be expected since it has to
430 * copy it anyways.
431 */
432static void kauditd_send_multicast_skb(struct sk_buff *skb)
433{
434 struct sk_buff *copy;
435 struct audit_net *aunet = net_generic(&init_net, audit_net_id);
436 struct sock *sock = aunet->nlsk;
437
438 if (!netlink_has_listeners(sock, AUDIT_NLGRP_READLOG))
439 return;
440
441 /*
442 * The seemingly wasteful skb_copy() rather than bumping the refcount
443 * using skb_get() is necessary because non-standard mods are made to
444 * the skb by the original kaudit unicast socket send routine. The
445 * existing auditd daemon assumes this breakage. Fixing this would
446 * require co-ordinating a change in the established protocol between
447 * the kaudit kernel subsystem and the auditd userspace code. There is
448 * no reason for new multicast clients to continue with this
449 * non-compliance.
450 */
451 copy = skb_copy(skb, GFP_KERNEL);
452 if (!copy)
453 return;
454
455 nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, GFP_KERNEL);
456}
457
458/*
427 * flush_hold_queue - empty the hold queue if auditd appears 459 * flush_hold_queue - empty the hold queue if auditd appears
428 * 460 *
429 * If auditd just started, drain the queue of messages already 461 * If auditd just started, drain the queue of messages already
@@ -643,13 +675,13 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
643 if ((task_active_pid_ns(current) != &init_pid_ns)) 675 if ((task_active_pid_ns(current) != &init_pid_ns))
644 return -EPERM; 676 return -EPERM;
645 677
646 if (!capable(CAP_AUDIT_CONTROL)) 678 if (!netlink_capable(skb, CAP_AUDIT_CONTROL))
647 err = -EPERM; 679 err = -EPERM;
648 break; 680 break;
649 case AUDIT_USER: 681 case AUDIT_USER:
650 case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG: 682 case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG:
651 case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2: 683 case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2:
652 if (!capable(CAP_AUDIT_WRITE)) 684 if (!netlink_capable(skb, CAP_AUDIT_WRITE))
653 err = -EPERM; 685 err = -EPERM;
654 break; 686 break;
655 default: /* bad msg */ 687 default: /* bad msg */
@@ -1076,10 +1108,22 @@ static void audit_receive(struct sk_buff *skb)
1076 mutex_unlock(&audit_cmd_mutex); 1108 mutex_unlock(&audit_cmd_mutex);
1077} 1109}
1078 1110
1111/* Run custom bind function on netlink socket group connect or bind requests. */
1112static int audit_bind(int group)
1113{
1114 if (!capable(CAP_AUDIT_READ))
1115 return -EPERM;
1116
1117 return 0;
1118}
1119
1079static int __net_init audit_net_init(struct net *net) 1120static int __net_init audit_net_init(struct net *net)
1080{ 1121{
1081 struct netlink_kernel_cfg cfg = { 1122 struct netlink_kernel_cfg cfg = {
1082 .input = audit_receive, 1123 .input = audit_receive,
1124 .bind = audit_bind,
1125 .flags = NL_CFG_F_NONROOT_RECV,
1126 .groups = AUDIT_NLGRP_MAX,
1083 }; 1127 };
1084 1128
1085 struct audit_net *aunet = net_generic(net, audit_net_id); 1129 struct audit_net *aunet = net_generic(net, audit_net_id);
@@ -1901,10 +1945,10 @@ out:
1901 * audit_log_end - end one audit record 1945 * audit_log_end - end one audit record
1902 * @ab: the audit_buffer 1946 * @ab: the audit_buffer
1903 * 1947 *
1904 * The netlink_* functions cannot be called inside an irq context, so 1948 * netlink_unicast() cannot be called inside an irq context because it blocks
1905 * the audit buffer is placed on a queue and a tasklet is scheduled to 1949 * (last arg, flags, is not set to MSG_DONTWAIT), so the audit buffer is placed
1906 * remove them from the queue outside the irq context. May be called in 1950 * on a queue and a tasklet is scheduled to remove them from the queue outside
1907 * any context. 1951 * the irq context. May be called in any context.
1908 */ 1952 */
1909void audit_log_end(struct audit_buffer *ab) 1953void audit_log_end(struct audit_buffer *ab)
1910{ 1954{
@@ -1914,6 +1958,18 @@ void audit_log_end(struct audit_buffer *ab)
1914 audit_log_lost("rate limit exceeded"); 1958 audit_log_lost("rate limit exceeded");
1915 } else { 1959 } else {
1916 struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); 1960 struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
1961
1962 kauditd_send_multicast_skb(ab->skb);
1963
1964 /*
1965 * The original kaudit unicast socket sends up messages with
1966 * nlmsg_len set to the payload length rather than the entire
1967 * message length. This breaks the standard set by netlink.
1968 * The existing auditd daemon assumes this breakage. Fixing
1969 * this would require co-ordinating a change in the established
1970 * protocol between the kaudit kernel subsystem and the auditd
1971 * userspace code.
1972 */
1917 nlh->nlmsg_len = ab->skb->len - NLMSG_HDRLEN; 1973 nlh->nlmsg_len = ab->skb->len - NLMSG_HDRLEN;
1918 1974
1919 if (audit_pid) { 1975 if (audit_pid) {
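
The kauditd_send_multicast_skb()/audit_bind() additions expose each audit record on a read-only netlink multicast group, with membership gated by CAP_AUDIT_READ. As a rough illustration of the consumer side, here is a hedged userspace sketch of subscribing to that group; AUDIT_NLGRP_READLOG is assumed to come from the uapi <linux/audit.h> of a kernel carrying this series, and error handling is abbreviated:

#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/audit.h>

#ifndef SOL_NETLINK
#define SOL_NETLINK 270
#endif

int main(void)
{
	unsigned int grp = AUDIT_NLGRP_READLOG;	/* group added by this series */
	char buf[8192];
	ssize_t len;
	int fd;

	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_AUDIT);
	if (fd < 0)
		return 1;

	/* audit_bind() above rejects this unless we hold CAP_AUDIT_READ. */
	if (setsockopt(fd, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP,
		       &grp, sizeof(grp)) < 0)
		return 1;

	while ((len = recv(fd, buf, sizeof(buf), 0)) > 0)
		printf("received %zd bytes of audit record\n", len);

	close(fd);
	return 0;
}

Because the kernel side uses skb_copy(), such a subscriber gets its own copy of each record and does not interfere with the established auditd unicast protocol described in the comments above.
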
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index f251a5e8d17a..21eae3c05ec0 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -728,6 +728,22 @@ static enum audit_state audit_filter_task(struct task_struct *tsk, char **key)
728 return AUDIT_BUILD_CONTEXT; 728 return AUDIT_BUILD_CONTEXT;
729} 729}
730 730
731static int audit_in_mask(const struct audit_krule *rule, unsigned long val)
732{
733 int word, bit;
734
735 if (val > 0xffffffff)
736 return false;
737
738 word = AUDIT_WORD(val);
739 if (word >= AUDIT_BITMASK_SIZE)
740 return false;
741
742 bit = AUDIT_BIT(val);
743
744 return rule->mask[word] & bit;
745}
746
731/* At syscall entry and exit time, this filter is called if the 747/* At syscall entry and exit time, this filter is called if the
732 * audit_state is not low enough that auditing cannot take place, but is 748 * audit_state is not low enough that auditing cannot take place, but is
733 * also not high enough that we already know we have to write an audit 749 * also not high enough that we already know we have to write an audit
@@ -745,11 +761,8 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
745 761
746 rcu_read_lock(); 762 rcu_read_lock();
747 if (!list_empty(list)) { 763 if (!list_empty(list)) {
748 int word = AUDIT_WORD(ctx->major);
749 int bit = AUDIT_BIT(ctx->major);
750
751 list_for_each_entry_rcu(e, list, list) { 764 list_for_each_entry_rcu(e, list, list) {
752 if ((e->rule.mask[word] & bit) == bit && 765 if (audit_in_mask(&e->rule, ctx->major) &&
753 audit_filter_rules(tsk, &e->rule, ctx, NULL, 766 audit_filter_rules(tsk, &e->rule, ctx, NULL,
754 &state, false)) { 767 &state, false)) {
755 rcu_read_unlock(); 768 rcu_read_unlock();
@@ -769,20 +782,16 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
769static int audit_filter_inode_name(struct task_struct *tsk, 782static int audit_filter_inode_name(struct task_struct *tsk,
770 struct audit_names *n, 783 struct audit_names *n,
771 struct audit_context *ctx) { 784 struct audit_context *ctx) {
772 int word, bit;
773 int h = audit_hash_ino((u32)n->ino); 785 int h = audit_hash_ino((u32)n->ino);
774 struct list_head *list = &audit_inode_hash[h]; 786 struct list_head *list = &audit_inode_hash[h];
775 struct audit_entry *e; 787 struct audit_entry *e;
776 enum audit_state state; 788 enum audit_state state;
777 789
778 word = AUDIT_WORD(ctx->major);
779 bit = AUDIT_BIT(ctx->major);
780
781 if (list_empty(list)) 790 if (list_empty(list))
782 return 0; 791 return 0;
783 792
784 list_for_each_entry_rcu(e, list, list) { 793 list_for_each_entry_rcu(e, list, list) {
785 if ((e->rule.mask[word] & bit) == bit && 794 if (audit_in_mask(&e->rule, ctx->major) &&
786 audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) { 795 audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) {
787 ctx->current_state = state; 796 ctx->current_state = state;
788 return 1; 797 return 1;
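
audit_in_mask() factors out the word/bit test that audit_filter_syscall() and audit_filter_inode_name() previously open-coded, and adds the range checks. As a self-contained restatement (a sketch only: AUDIT_BITMASK_SIZE, AUDIT_WORD() and AUDIT_BIT() are reproduced here with their usual uapi definitions), the mapping from syscall number to mask word and bit looks like this:

#include <stdbool.h>
#include <stdint.h>

#define AUDIT_BITMASK_SIZE	64			/* words in a rule's syscall mask */
#define AUDIT_WORD(nr)		((uint32_t)((nr) / 32))	/* which 32-bit word */
#define AUDIT_BIT(nr)		(1 << ((nr) - AUDIT_WORD(nr) * 32))

/* Mirrors audit_in_mask(): is syscall number @val selected by @mask? */
static bool syscall_in_mask(const uint32_t mask[AUDIT_BITMASK_SIZE],
			    unsigned long val)
{
	if (val > 0xffffffff)			/* too large for the bitmap */
		return false;
	if (AUDIT_WORD(val) >= AUDIT_BITMASK_SIZE)
		return false;
	return mask[AUDIT_WORD(val)] & AUDIT_BIT(val);
}

The removed code compared (mask[word] & bit) == bit; since AUDIT_BIT() yields a single bit, testing for a non-zero result is equivalent.
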
diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c
index a5e026bc45c4..1323360d90e3 100644
--- a/kernel/backtracetest.c
+++ b/kernel/backtracetest.c
@@ -19,8 +19,8 @@
19 19
20static void backtrace_test_normal(void) 20static void backtrace_test_normal(void)
21{ 21{
22 printk("Testing a backtrace from process context.\n"); 22 pr_info("Testing a backtrace from process context.\n");
23 printk("The following trace is a kernel self test and not a bug!\n"); 23 pr_info("The following trace is a kernel self test and not a bug!\n");
24 24
25 dump_stack(); 25 dump_stack();
26} 26}
@@ -37,8 +37,8 @@ static DECLARE_TASKLET(backtrace_tasklet, &backtrace_test_irq_callback, 0);
37 37
38static void backtrace_test_irq(void) 38static void backtrace_test_irq(void)
39{ 39{
40 printk("Testing a backtrace from irq context.\n"); 40 pr_info("Testing a backtrace from irq context.\n");
41 printk("The following trace is a kernel self test and not a bug!\n"); 41 pr_info("The following trace is a kernel self test and not a bug!\n");
42 42
43 init_completion(&backtrace_work); 43 init_completion(&backtrace_work);
44 tasklet_schedule(&backtrace_tasklet); 44 tasklet_schedule(&backtrace_tasklet);
@@ -51,8 +51,8 @@ static void backtrace_test_saved(void)
51 struct stack_trace trace; 51 struct stack_trace trace;
52 unsigned long entries[8]; 52 unsigned long entries[8];
53 53
54 printk("Testing a saved backtrace.\n"); 54 pr_info("Testing a saved backtrace.\n");
55 printk("The following trace is a kernel self test and not a bug!\n"); 55 pr_info("The following trace is a kernel self test and not a bug!\n");
56 56
57 trace.nr_entries = 0; 57 trace.nr_entries = 0;
58 trace.max_entries = ARRAY_SIZE(entries); 58 trace.max_entries = ARRAY_SIZE(entries);
@@ -65,19 +65,19 @@ static void backtrace_test_saved(void)
65#else 65#else
66static void backtrace_test_saved(void) 66static void backtrace_test_saved(void)
67{ 67{
68 printk("Saved backtrace test skipped.\n"); 68 pr_info("Saved backtrace test skipped.\n");
69} 69}
70#endif 70#endif
71 71
72static int backtrace_regression_test(void) 72static int backtrace_regression_test(void)
73{ 73{
74 printk("====[ backtrace testing ]===========\n"); 74 pr_info("====[ backtrace testing ]===========\n");
75 75
76 backtrace_test_normal(); 76 backtrace_test_normal();
77 backtrace_test_irq(); 77 backtrace_test_irq();
78 backtrace_test_saved(); 78 backtrace_test_saved();
79 79
80 printk("====[ end of backtrace testing ]====\n"); 80 pr_info("====[ end of backtrace testing ]====\n");
81 return 0; 81 return 0;
82} 82}
83 83
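
These hunks replace bare printk() calls with pr_info(), which gives the messages an explicit KERN_INFO level and lets a file attach a common prefix by defining pr_fmt(), as the audit.c hunk above does. Purely as an illustration (this module is not part of the series), the idiom looks like:

/* pr_fmt() must be defined before the includes so pr_info() picks it up. */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>

static int __init prefix_demo_init(void)
{
	pr_info("loaded\n");	/* printed at KERN_INFO, prefixed with the module name */
	return 0;
}

static void __exit prefix_demo_exit(void)
{
	pr_info("unloaded\n");
}

module_init(prefix_demo_init);
module_exit(prefix_demo_exit);
MODULE_LICENSE("GPL");
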
diff --git a/kernel/capability.c b/kernel/capability.c
index a8d63df0c322..a5cf13c018ce 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -24,7 +24,6 @@
24 */ 24 */
25 25
26const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET; 26const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET;
27
28EXPORT_SYMBOL(__cap_empty_set); 27EXPORT_SYMBOL(__cap_empty_set);
29 28
30int file_caps_enabled = 1; 29int file_caps_enabled = 1;
@@ -189,7 +188,7 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
189 * 188 *
190 * An alternative would be to return an error here 189 * An alternative would be to return an error here
191 * (-ERANGE), but that causes legacy applications to 190 * (-ERANGE), but that causes legacy applications to
192 * unexpectidly fail; the capget/modify/capset aborts 191 * unexpectedly fail; the capget/modify/capset aborts
193 * before modification is attempted and the application 192 * before modification is attempted and the application
194 * fails. 193 * fails.
195 */ 194 */
@@ -395,7 +394,8 @@ EXPORT_SYMBOL(ns_capable);
395 * This does not set PF_SUPERPRIV because the caller may not 394 * This does not set PF_SUPERPRIV because the caller may not
396 * actually be privileged. 395 * actually be privileged.
397 */ 396 */
398bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap) 397bool file_ns_capable(const struct file *file, struct user_namespace *ns,
398 int cap)
399{ 399{
400 if (WARN_ON_ONCE(!cap_valid(cap))) 400 if (WARN_ON_ONCE(!cap_valid(cap)))
401 return false; 401 return false;
@@ -424,23 +424,19 @@ bool capable(int cap)
424EXPORT_SYMBOL(capable); 424EXPORT_SYMBOL(capable);
425 425
426/** 426/**
427 * inode_capable - Check superior capability over inode 427 * capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped
428 * @inode: The inode in question 428 * @inode: The inode in question
429 * @cap: The capability in question 429 * @cap: The capability in question
430 * 430 *
431 * Return true if the current task has the given superior capability 431 * Return true if the current task has the given capability targeted at
432 * targeted at it's own user namespace and that the given inode is owned 432 * its own user namespace and that the given inode's uid and gid are
433 * by the current user namespace or a child namespace. 433 * mapped into the current user namespace.
434 *
435 * Currently we check to see if an inode is owned by the current
436 * user namespace by seeing if the inode's owner maps into the
437 * current user namespace.
438 *
439 */ 434 */
440bool inode_capable(const struct inode *inode, int cap) 435bool capable_wrt_inode_uidgid(const struct inode *inode, int cap)
441{ 436{
442 struct user_namespace *ns = current_user_ns(); 437 struct user_namespace *ns = current_user_ns();
443 438
444 return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid); 439 return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid) &&
440 kgid_has_mapping(ns, inode->i_gid);
445} 441}
446EXPORT_SYMBOL(inode_capable); 442EXPORT_SYMBOL(capable_wrt_inode_uidgid);
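
Besides the rename, capable_wrt_inode_uidgid() tightens the old inode_capable() check: the inode's gid, and not only its uid, must now map into the caller's user namespace before a namespace capability is honored. A hedged sketch of the caller pattern this guards (the helper name is hypothetical; capable_wrt_inode_uidgid(), uid_eq() and current_fsuid() are the real interfaces):

#include <linux/capability.h>
#include <linux/cred.h>
#include <linux/fs.h>

/*
 * Hypothetical helper: may the caller act as the owner of @inode?
 * Either it owns the inode outright, or it holds CAP_FOWNER in a user
 * namespace into which the inode's uid and gid both map.
 */
static bool may_act_as_owner(const struct inode *inode)
{
	if (uid_eq(current_fsuid(), inode->i_uid))
		return true;
	return capable_wrt_inode_uidgid(inode, CAP_FOWNER);
}
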
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 9fcdaa705b6c..7868fc3c0bc5 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -26,6 +26,8 @@
26 * distribution for more details. 26 * distribution for more details.
27 */ 27 */
28 28
29#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
30
29#include <linux/cgroup.h> 31#include <linux/cgroup.h>
30#include <linux/cred.h> 32#include <linux/cred.h>
31#include <linux/ctype.h> 33#include <linux/ctype.h>
@@ -33,6 +35,7 @@
33#include <linux/init_task.h> 35#include <linux/init_task.h>
34#include <linux/kernel.h> 36#include <linux/kernel.h>
35#include <linux/list.h> 37#include <linux/list.h>
38#include <linux/magic.h>
36#include <linux/mm.h> 39#include <linux/mm.h>
37#include <linux/mutex.h> 40#include <linux/mutex.h>
38#include <linux/mount.h> 41#include <linux/mount.h>
@@ -69,15 +72,6 @@
69 MAX_CFTYPE_NAME + 2) 72 MAX_CFTYPE_NAME + 2)
70 73
71/* 74/*
72 * cgroup_tree_mutex nests above cgroup_mutex and protects cftypes, file
73 * creation/removal and hierarchy changing operations including cgroup
74 * creation, removal, css association and controller rebinding. This outer
75 * lock is needed mainly to resolve the circular dependency between kernfs
76 * active ref and cgroup_mutex. cgroup_tree_mutex nests above both.
77 */
78static DEFINE_MUTEX(cgroup_tree_mutex);
79
80/*
81 * cgroup_mutex is the master lock. Any modification to cgroup or its 75 * cgroup_mutex is the master lock. Any modification to cgroup or its
82 * hierarchy must be performed while holding it. 76 * hierarchy must be performed while holding it.
83 * 77 *
@@ -98,16 +92,21 @@ static DECLARE_RWSEM(css_set_rwsem);
98#endif 92#endif
99 93
100/* 94/*
95 * Protects cgroup_idr and css_idr so that IDs can be released without
96 * grabbing cgroup_mutex.
97 */
98static DEFINE_SPINLOCK(cgroup_idr_lock);
99
100/*
101 * Protects cgroup_subsys->release_agent_path. Modifying it also requires 101 * Protects cgroup_subsys->release_agent_path. Modifying it also requires
102 * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock. 102 * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock.
103 */ 103 */
104static DEFINE_SPINLOCK(release_agent_path_lock); 104static DEFINE_SPINLOCK(release_agent_path_lock);
105 105
106#define cgroup_assert_mutexes_or_rcu_locked() \ 106#define cgroup_assert_mutex_or_rcu_locked() \
107 rcu_lockdep_assert(rcu_read_lock_held() || \ 107 rcu_lockdep_assert(rcu_read_lock_held() || \
108 lockdep_is_held(&cgroup_tree_mutex) || \
109 lockdep_is_held(&cgroup_mutex), \ 108 lockdep_is_held(&cgroup_mutex), \
110 "cgroup_[tree_]mutex or RCU read lock required"); 109 "cgroup_mutex or RCU read lock required");
111 110
112/* 111/*
113 * cgroup destruction makes heavy use of work items and there can be a lot 112 * cgroup destruction makes heavy use of work items and there can be a lot
@@ -150,6 +149,13 @@ struct cgroup_root cgrp_dfl_root;
150 */ 149 */
151static bool cgrp_dfl_root_visible; 150static bool cgrp_dfl_root_visible;
152 151
152/* some controllers are not supported in the default hierarchy */
153static const unsigned int cgrp_dfl_root_inhibit_ss_mask = 0
154#ifdef CONFIG_CGROUP_DEBUG
155 | (1 << debug_cgrp_id)
156#endif
157 ;
158
153/* The list of hierarchy roots */ 159/* The list of hierarchy roots */
154 160
155static LIST_HEAD(cgroup_roots); 161static LIST_HEAD(cgroup_roots);
@@ -159,14 +165,13 @@ static int cgroup_root_count;
159static DEFINE_IDR(cgroup_hierarchy_idr); 165static DEFINE_IDR(cgroup_hierarchy_idr);
160 166
161/* 167/*
162 * Assign a monotonically increasing serial number to cgroups. It 168 * Assign a monotonically increasing serial number to csses. It guarantees
163 * guarantees cgroups with bigger numbers are newer than those with smaller 169 * cgroups with bigger numbers are newer than those with smaller numbers.
164 * numbers. Also, as cgroups are always appended to the parent's 170 * Also, as csses are always appended to the parent's ->children list, it
165 * ->children list, it guarantees that sibling cgroups are always sorted in 171 * guarantees that sibling csses are always sorted in the ascending serial
166 * the ascending serial number order on the list. Protected by 172 * number order on the list. Protected by cgroup_mutex.
167 * cgroup_mutex.
168 */ 173 */
169static u64 cgroup_serial_nr_next = 1; 174static u64 css_serial_nr_next = 1;
170 175
171/* This flag indicates whether tasks in the fork and exit paths should 176/* This flag indicates whether tasks in the fork and exit paths should
172 * check for fork/exit handlers to call. This avoids us having to do 177 * check for fork/exit handlers to call. This avoids us having to do
@@ -179,17 +184,59 @@ static struct cftype cgroup_base_files[];
179 184
180static void cgroup_put(struct cgroup *cgrp); 185static void cgroup_put(struct cgroup *cgrp);
181static int rebind_subsystems(struct cgroup_root *dst_root, 186static int rebind_subsystems(struct cgroup_root *dst_root,
182 unsigned long ss_mask); 187 unsigned int ss_mask);
183static void cgroup_destroy_css_killed(struct cgroup *cgrp);
184static int cgroup_destroy_locked(struct cgroup *cgrp); 188static int cgroup_destroy_locked(struct cgroup *cgrp);
189static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss);
190static void css_release(struct percpu_ref *ref);
191static void kill_css(struct cgroup_subsys_state *css);
185static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], 192static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
186 bool is_add); 193 bool is_add);
187static void cgroup_pidlist_destroy_all(struct cgroup *cgrp); 194static void cgroup_pidlist_destroy_all(struct cgroup *cgrp);
188 195
196/* IDR wrappers which synchronize using cgroup_idr_lock */
197static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
198 gfp_t gfp_mask)
199{
200 int ret;
201
202 idr_preload(gfp_mask);
203 spin_lock_bh(&cgroup_idr_lock);
204 ret = idr_alloc(idr, ptr, start, end, gfp_mask);
205 spin_unlock_bh(&cgroup_idr_lock);
206 idr_preload_end();
207 return ret;
208}
209
210static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id)
211{
212 void *ret;
213
214 spin_lock_bh(&cgroup_idr_lock);
215 ret = idr_replace(idr, ptr, id);
216 spin_unlock_bh(&cgroup_idr_lock);
217 return ret;
218}
219
220static void cgroup_idr_remove(struct idr *idr, int id)
221{
222 spin_lock_bh(&cgroup_idr_lock);
223 idr_remove(idr, id);
224 spin_unlock_bh(&cgroup_idr_lock);
225}
226
227static struct cgroup *cgroup_parent(struct cgroup *cgrp)
228{
229 struct cgroup_subsys_state *parent_css = cgrp->self.parent;
230
231 if (parent_css)
232 return container_of(parent_css, struct cgroup, self);
233 return NULL;
234}
235
189/** 236/**
190 * cgroup_css - obtain a cgroup's css for the specified subsystem 237 * cgroup_css - obtain a cgroup's css for the specified subsystem
191 * @cgrp: the cgroup of interest 238 * @cgrp: the cgroup of interest
192 * @ss: the subsystem of interest (%NULL returns the dummy_css) 239 * @ss: the subsystem of interest (%NULL returns @cgrp->self)
193 * 240 *
194 * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This 241 * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This
195 * function must be called either under cgroup_mutex or rcu_read_lock() and 242 * function must be called either under cgroup_mutex or rcu_read_lock() and
@@ -202,23 +249,49 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
202{ 249{
203 if (ss) 250 if (ss)
204 return rcu_dereference_check(cgrp->subsys[ss->id], 251 return rcu_dereference_check(cgrp->subsys[ss->id],
205 lockdep_is_held(&cgroup_tree_mutex) ||
206 lockdep_is_held(&cgroup_mutex)); 252 lockdep_is_held(&cgroup_mutex));
207 else 253 else
208 return &cgrp->dummy_css; 254 return &cgrp->self;
255}
256
257/**
258 * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
259 * @cgrp: the cgroup of interest
260 * @ss: the subsystem of interest (%NULL returns @cgrp->self)
261 *
262 * Similar to cgroup_css() but returns the effctive css, which is defined
263 * as the matching css of the nearest ancestor including self which has @ss
264 * enabled. If @ss is associated with the hierarchy @cgrp is on, this
265 * function is guaranteed to return non-NULL css.
266 */
267static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
268 struct cgroup_subsys *ss)
269{
270 lockdep_assert_held(&cgroup_mutex);
271
272 if (!ss)
273 return &cgrp->self;
274
275 if (!(cgrp->root->subsys_mask & (1 << ss->id)))
276 return NULL;
277
278 while (cgroup_parent(cgrp) &&
279 !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ss->id)))
280 cgrp = cgroup_parent(cgrp);
281
282 return cgroup_css(cgrp, ss);
209} 283}
210 284
211/* convenient tests for these bits */ 285/* convenient tests for these bits */
212static inline bool cgroup_is_dead(const struct cgroup *cgrp) 286static inline bool cgroup_is_dead(const struct cgroup *cgrp)
213{ 287{
214 return test_bit(CGRP_DEAD, &cgrp->flags); 288 return !(cgrp->self.flags & CSS_ONLINE);
215} 289}
216 290
217struct cgroup_subsys_state *seq_css(struct seq_file *seq) 291struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
218{ 292{
219 struct kernfs_open_file *of = seq->private;
220 struct cgroup *cgrp = of->kn->parent->priv; 293 struct cgroup *cgrp = of->kn->parent->priv;
221 struct cftype *cft = seq_cft(seq); 294 struct cftype *cft = of_cft(of);
222 295
223 /* 296 /*
224 * This is open and unprotected implementation of cgroup_css(). 297 * This is open and unprotected implementation of cgroup_css().
@@ -231,9 +304,9 @@ struct cgroup_subsys_state *seq_css(struct seq_file *seq)
231 if (cft->ss) 304 if (cft->ss)
232 return rcu_dereference_raw(cgrp->subsys[cft->ss->id]); 305 return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
233 else 306 else
234 return &cgrp->dummy_css; 307 return &cgrp->self;
235} 308}
236EXPORT_SYMBOL_GPL(seq_css); 309EXPORT_SYMBOL_GPL(of_css);
237 310
238/** 311/**
239 * cgroup_is_descendant - test ancestry 312 * cgroup_is_descendant - test ancestry
@@ -249,7 +322,7 @@ bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
249 while (cgrp) { 322 while (cgrp) {
250 if (cgrp == ancestor) 323 if (cgrp == ancestor)
251 return true; 324 return true;
252 cgrp = cgrp->parent; 325 cgrp = cgroup_parent(cgrp);
253 } 326 }
254 return false; 327 return false;
255} 328}
@@ -273,17 +346,30 @@ static int notify_on_release(const struct cgroup *cgrp)
273 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end 346 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
274 * @cgrp: the target cgroup to iterate css's of 347 * @cgrp: the target cgroup to iterate css's of
275 * 348 *
276 * Should be called under cgroup_mutex. 349 * Should be called under cgroup_[tree_]mutex.
277 */ 350 */
278#define for_each_css(css, ssid, cgrp) \ 351#define for_each_css(css, ssid, cgrp) \
279 for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ 352 for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
280 if (!((css) = rcu_dereference_check( \ 353 if (!((css) = rcu_dereference_check( \
281 (cgrp)->subsys[(ssid)], \ 354 (cgrp)->subsys[(ssid)], \
282 lockdep_is_held(&cgroup_tree_mutex) || \
283 lockdep_is_held(&cgroup_mutex)))) { } \ 355 lockdep_is_held(&cgroup_mutex)))) { } \
284 else 356 else
285 357
286/** 358/**
359 * for_each_e_css - iterate all effective css's of a cgroup
360 * @css: the iteration cursor
361 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
362 * @cgrp: the target cgroup to iterate css's of
363 *
364 * Should be called under cgroup_[tree_]mutex.
365 */
366#define for_each_e_css(css, ssid, cgrp) \
367 for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
368 if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \
369 ; \
370 else
371
372/**
287 * for_each_subsys - iterate all enabled cgroup subsystems 373 * for_each_subsys - iterate all enabled cgroup subsystems
288 * @ss: the iteration cursor 374 * @ss: the iteration cursor
289 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end 375 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
@@ -296,22 +382,13 @@ static int notify_on_release(const struct cgroup *cgrp)
296#define for_each_root(root) \ 382#define for_each_root(root) \
297 list_for_each_entry((root), &cgroup_roots, root_list) 383 list_for_each_entry((root), &cgroup_roots, root_list)
298 384
299/** 385/* iterate over child cgrps, lock should be held throughout iteration */
300 * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. 386#define cgroup_for_each_live_child(child, cgrp) \
301 * @cgrp: the cgroup to be checked for liveness 387 list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
302 * 388 if (({ lockdep_assert_held(&cgroup_mutex); \
303 * On success, returns true; the mutex should be later unlocked. On 389 cgroup_is_dead(child); })) \
304 * failure returns false with no lock held. 390 ; \
305 */ 391 else
306static bool cgroup_lock_live_group(struct cgroup *cgrp)
307{
308 mutex_lock(&cgroup_mutex);
309 if (cgroup_is_dead(cgrp)) {
310 mutex_unlock(&cgroup_mutex);
311 return false;
312 }
313 return true;
314}
315 392
316/* the list of cgroups eligible for automatic release. Protected by 393/* the list of cgroups eligible for automatic release. Protected by
317 * release_list_lock */ 394 * release_list_lock */
@@ -348,7 +425,7 @@ struct cgrp_cset_link {
348 * reference-counted, to improve performance when child cgroups 425 * reference-counted, to improve performance when child cgroups
349 * haven't been created. 426 * haven't been created.
350 */ 427 */
351static struct css_set init_css_set = { 428struct css_set init_css_set = {
352 .refcount = ATOMIC_INIT(1), 429 .refcount = ATOMIC_INIT(1),
353 .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), 430 .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
354 .tasks = LIST_HEAD_INIT(init_css_set.tasks), 431 .tasks = LIST_HEAD_INIT(init_css_set.tasks),
@@ -359,6 +436,43 @@ static struct css_set init_css_set = {
359 436
360static int css_set_count = 1; /* 1 for init_css_set */ 437static int css_set_count = 1; /* 1 for init_css_set */
361 438
439/**
440 * cgroup_update_populated - updated populated count of a cgroup
441 * @cgrp: the target cgroup
442 * @populated: inc or dec populated count
443 *
444 * @cgrp is either getting the first task (css_set) or losing the last.
445 * Update @cgrp->populated_cnt accordingly. The count is propagated
446 * towards root so that a given cgroup's populated_cnt is zero iff the
447 * cgroup and all its descendants are empty.
448 *
449 * @cgrp's interface file "cgroup.populated" is zero if
450 * @cgrp->populated_cnt is zero and 1 otherwise. When @cgrp->populated_cnt
451 * changes from or to zero, userland is notified that the content of the
452 * interface file has changed. This can be used to detect when @cgrp and
453 * its descendants become populated or empty.
454 */
455static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
456{
457 lockdep_assert_held(&css_set_rwsem);
458
459 do {
460 bool trigger;
461
462 if (populated)
463 trigger = !cgrp->populated_cnt++;
464 else
465 trigger = !--cgrp->populated_cnt;
466
467 if (!trigger)
468 break;
469
470 if (cgrp->populated_kn)
471 kernfs_notify(cgrp->populated_kn);
472 cgrp = cgroup_parent(cgrp);
473 } while (cgrp);
474}
475
362/* 476/*
363 * hash table for cgroup groups. This improves the performance to find 477 * hash table for cgroup groups. This improves the performance to find
364 * an existing css_set. This hash doesn't (currently) take into 478 * an existing css_set. This hash doesn't (currently) take into
@@ -383,6 +497,8 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
383static void put_css_set_locked(struct css_set *cset, bool taskexit) 497static void put_css_set_locked(struct css_set *cset, bool taskexit)
384{ 498{
385 struct cgrp_cset_link *link, *tmp_link; 499 struct cgrp_cset_link *link, *tmp_link;
500 struct cgroup_subsys *ss;
501 int ssid;
386 502
387 lockdep_assert_held(&css_set_rwsem); 503 lockdep_assert_held(&css_set_rwsem);
388 504
@@ -390,6 +506,8 @@ static void put_css_set_locked(struct css_set *cset, bool taskexit)
390 return; 506 return;
391 507
392 /* This css_set is dead. unlink it and release cgroup refcounts */ 508 /* This css_set is dead. unlink it and release cgroup refcounts */
509 for_each_subsys(ss, ssid)
510 list_del(&cset->e_cset_node[ssid]);
393 hash_del(&cset->hlist); 511 hash_del(&cset->hlist);
394 css_set_count--; 512 css_set_count--;
395 513
@@ -400,10 +518,13 @@ static void put_css_set_locked(struct css_set *cset, bool taskexit)
400 list_del(&link->cgrp_link); 518 list_del(&link->cgrp_link);
401 519
402 /* @cgrp can't go away while we're holding css_set_rwsem */ 520 /* @cgrp can't go away while we're holding css_set_rwsem */
403 if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) { 521 if (list_empty(&cgrp->cset_links)) {
404 if (taskexit) 522 cgroup_update_populated(cgrp, false);
405 set_bit(CGRP_RELEASABLE, &cgrp->flags); 523 if (notify_on_release(cgrp)) {
406 check_for_release(cgrp); 524 if (taskexit)
525 set_bit(CGRP_RELEASABLE, &cgrp->flags);
526 check_for_release(cgrp);
527 }
407 } 528 }
408 529
409 kfree(link); 530 kfree(link);
@@ -452,20 +573,20 @@ static bool compare_css_sets(struct css_set *cset,
452{ 573{
453 struct list_head *l1, *l2; 574 struct list_head *l1, *l2;
454 575
455 if (memcmp(template, cset->subsys, sizeof(cset->subsys))) { 576 /*
456 /* Not all subsystems matched */ 577 * On the default hierarchy, there can be csets which are
578 * associated with the same set of cgroups but different csses.
579 * Let's first ensure that csses match.
580 */
581 if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
457 return false; 582 return false;
458 }
459 583
460 /* 584 /*
461 * Compare cgroup pointers in order to distinguish between 585 * Compare cgroup pointers in order to distinguish between
462 * different cgroups in heirarchies with no subsystems. We 586 * different cgroups in hierarchies. As different cgroups may
463 * could get by with just this check alone (and skip the 587 * share the same effective css, this comparison is always
464 * memcmp above) but on most setups the memcmp check will 588 * necessary.
465 * avoid the need for this more expensive check on almost all
466 * candidates.
467 */ 589 */
468
469 l1 = &cset->cgrp_links; 590 l1 = &cset->cgrp_links;
470 l2 = &old_cset->cgrp_links; 591 l2 = &old_cset->cgrp_links;
471 while (1) { 592 while (1) {
@@ -529,14 +650,17 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset,
529 * won't change, so no need for locking. 650 * won't change, so no need for locking.
530 */ 651 */
531 for_each_subsys(ss, i) { 652 for_each_subsys(ss, i) {
532 if (root->cgrp.subsys_mask & (1UL << i)) { 653 if (root->subsys_mask & (1UL << i)) {
533 /* Subsystem is in this hierarchy. So we want 654 /*
534 * the subsystem state from the new 655 * @ss is in this hierarchy, so we want the
535 * cgroup */ 656 * effective css from @cgrp.
536 template[i] = cgroup_css(cgrp, ss); 657 */
658 template[i] = cgroup_e_css(cgrp, ss);
537 } else { 659 } else {
538 /* Subsystem is not in this hierarchy, so we 660 /*
539 * don't want to change the subsystem state */ 661 * @ss is not in this hierarchy, so we don't want
662 * to change the css.
663 */
540 template[i] = old_cset->subsys[i]; 664 template[i] = old_cset->subsys[i];
541 } 665 }
542 } 666 }
@@ -602,10 +726,18 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
602 struct cgrp_cset_link *link; 726 struct cgrp_cset_link *link;
603 727
604 BUG_ON(list_empty(tmp_links)); 728 BUG_ON(list_empty(tmp_links));
729
730 if (cgroup_on_dfl(cgrp))
731 cset->dfl_cgrp = cgrp;
732
605 link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link); 733 link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
606 link->cset = cset; 734 link->cset = cset;
607 link->cgrp = cgrp; 735 link->cgrp = cgrp;
736
737 if (list_empty(&cgrp->cset_links))
738 cgroup_update_populated(cgrp, true);
608 list_move(&link->cset_link, &cgrp->cset_links); 739 list_move(&link->cset_link, &cgrp->cset_links);
740
609 /* 741 /*
610 * Always add links to the tail of the list so that the list 742 * Always add links to the tail of the list so that the list
611 * is sorted by order of hierarchy creation 743 * is sorted by order of hierarchy creation
@@ -628,7 +760,9 @@ static struct css_set *find_css_set(struct css_set *old_cset,
628 struct css_set *cset; 760 struct css_set *cset;
629 struct list_head tmp_links; 761 struct list_head tmp_links;
630 struct cgrp_cset_link *link; 762 struct cgrp_cset_link *link;
763 struct cgroup_subsys *ss;
631 unsigned long key; 764 unsigned long key;
765 int ssid;
632 766
633 lockdep_assert_held(&cgroup_mutex); 767 lockdep_assert_held(&cgroup_mutex);
634 768
@@ -679,10 +813,14 @@ static struct css_set *find_css_set(struct css_set *old_cset,
679 813
680 css_set_count++; 814 css_set_count++;
681 815
682 /* Add this cgroup group to the hash table */ 816 /* Add @cset to the hash table */
683 key = css_set_hash(cset->subsys); 817 key = css_set_hash(cset->subsys);
684 hash_add(css_set_table, &cset->hlist, key); 818 hash_add(css_set_table, &cset->hlist, key);
685 819
820 for_each_subsys(ss, ssid)
821 list_add_tail(&cset->e_cset_node[ssid],
822 &cset->subsys[ssid]->cgroup->e_csets[ssid]);
823
686 up_write(&css_set_rwsem); 824 up_write(&css_set_rwsem);
687 825
688 return cset; 826 return cset;
@@ -735,14 +873,13 @@ static void cgroup_destroy_root(struct cgroup_root *root)
735 struct cgroup *cgrp = &root->cgrp; 873 struct cgroup *cgrp = &root->cgrp;
736 struct cgrp_cset_link *link, *tmp_link; 874 struct cgrp_cset_link *link, *tmp_link;
737 875
738 mutex_lock(&cgroup_tree_mutex);
739 mutex_lock(&cgroup_mutex); 876 mutex_lock(&cgroup_mutex);
740 877
741 BUG_ON(atomic_read(&root->nr_cgrps)); 878 BUG_ON(atomic_read(&root->nr_cgrps));
742 BUG_ON(!list_empty(&cgrp->children)); 879 BUG_ON(!list_empty(&cgrp->self.children));
743 880
744 /* Rebind all subsystems back to the default hierarchy */ 881 /* Rebind all subsystems back to the default hierarchy */
745 rebind_subsystems(&cgrp_dfl_root, cgrp->subsys_mask); 882 rebind_subsystems(&cgrp_dfl_root, root->subsys_mask);
746 883
747 /* 884 /*
748 * Release all the links from cset_links to this hierarchy's 885 * Release all the links from cset_links to this hierarchy's
@@ -765,7 +902,6 @@ static void cgroup_destroy_root(struct cgroup_root *root)
765 cgroup_exit_root_id(root); 902 cgroup_exit_root_id(root);
766 903
767 mutex_unlock(&cgroup_mutex); 904 mutex_unlock(&cgroup_mutex);
768 mutex_unlock(&cgroup_tree_mutex);
769 905
770 kernfs_destroy_root(root->kf_root); 906 kernfs_destroy_root(root->kf_root);
771 cgroup_free_root(root); 907 cgroup_free_root(root);
@@ -848,7 +984,7 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
848 * update of a tasks cgroup pointer by cgroup_attach_task() 984 * update of a tasks cgroup pointer by cgroup_attach_task()
849 */ 985 */
850 986
851static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask); 987static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask);
852static struct kernfs_syscall_ops cgroup_kf_syscall_ops; 988static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
853static const struct file_operations proc_cgroupstats_operations; 989static const struct file_operations proc_cgroupstats_operations;
854 990
@@ -883,79 +1019,95 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
883 if (cft->read_u64 || cft->read_s64 || cft->seq_show) 1019 if (cft->read_u64 || cft->read_s64 || cft->seq_show)
884 mode |= S_IRUGO; 1020 mode |= S_IRUGO;
885 1021
886 if (cft->write_u64 || cft->write_s64 || cft->write_string || 1022 if (cft->write_u64 || cft->write_s64 || cft->write)
887 cft->trigger)
888 mode |= S_IWUSR; 1023 mode |= S_IWUSR;
889 1024
890 return mode; 1025 return mode;
891} 1026}
892 1027
893static void cgroup_free_fn(struct work_struct *work) 1028static void cgroup_get(struct cgroup *cgrp)
894{ 1029{
895 struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); 1030 WARN_ON_ONCE(cgroup_is_dead(cgrp));
896 1031 css_get(&cgrp->self);
897 atomic_dec(&cgrp->root->nr_cgrps);
898 cgroup_pidlist_destroy_all(cgrp);
899
900 if (cgrp->parent) {
901 /*
902 * We get a ref to the parent, and put the ref when this
903 * cgroup is being freed, so it's guaranteed that the
904 * parent won't be destroyed before its children.
905 */
906 cgroup_put(cgrp->parent);
907 kernfs_put(cgrp->kn);
908 kfree(cgrp);
909 } else {
910 /*
911 * This is root cgroup's refcnt reaching zero, which
912 * indicates that the root should be released.
913 */
914 cgroup_destroy_root(cgrp->root);
915 }
916} 1032}
917 1033
918static void cgroup_free_rcu(struct rcu_head *head) 1034static void cgroup_put(struct cgroup *cgrp)
919{ 1035{
920 struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head); 1036 css_put(&cgrp->self);
921
922 INIT_WORK(&cgrp->destroy_work, cgroup_free_fn);
923 queue_work(cgroup_destroy_wq, &cgrp->destroy_work);
924} 1037}
925 1038
926static void cgroup_get(struct cgroup *cgrp) 1039/**
1040 * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods
1041 * @kn: the kernfs_node being serviced
1042 *
1043 * This helper undoes cgroup_kn_lock_live() and should be invoked before
1044 * the method finishes if locking succeeded. Note that once this function
1045 * returns the cgroup returned by cgroup_kn_lock_live() may become
1046 * inaccessible any time. If the caller intends to continue to access the
1047 * cgroup, it should pin it before invoking this function.
1048 */
1049static void cgroup_kn_unlock(struct kernfs_node *kn)
927{ 1050{
928 WARN_ON_ONCE(cgroup_is_dead(cgrp)); 1051 struct cgroup *cgrp;
929 WARN_ON_ONCE(atomic_read(&cgrp->refcnt) <= 0); 1052
930 atomic_inc(&cgrp->refcnt); 1053 if (kernfs_type(kn) == KERNFS_DIR)
1054 cgrp = kn->priv;
1055 else
1056 cgrp = kn->parent->priv;
1057
1058 mutex_unlock(&cgroup_mutex);
1059
1060 kernfs_unbreak_active_protection(kn);
1061 cgroup_put(cgrp);
931} 1062}
932 1063
933static void cgroup_put(struct cgroup *cgrp) 1064/**
1065 * cgroup_kn_lock_live - locking helper for cgroup kernfs methods
1066 * @kn: the kernfs_node being serviced
1067 *
1068 * This helper is to be used by a cgroup kernfs method currently servicing
1069 * @kn. It breaks the active protection, performs cgroup locking and
1070 * verifies that the associated cgroup is alive. Returns the cgroup if
1071 * alive; otherwise, %NULL. A successful return should be undone by a
1072 * matching cgroup_kn_unlock() invocation.
1073 *
1074 * Any cgroup kernfs method implementation which requires locking the
1075 * associated cgroup should use this helper. It avoids nesting cgroup
1076 * locking under kernfs active protection and allows all kernfs operations
1077 * including self-removal.
1078 */
1079static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn)
934{ 1080{
935 if (!atomic_dec_and_test(&cgrp->refcnt)) 1081 struct cgroup *cgrp;
936 return; 1082
937 if (WARN_ON_ONCE(cgrp->parent && !cgroup_is_dead(cgrp))) 1083 if (kernfs_type(kn) == KERNFS_DIR)
938 return; 1084 cgrp = kn->priv;
1085 else
1086 cgrp = kn->parent->priv;
939 1087
940 /* 1088 /*
941 * XXX: cgrp->id is only used to look up css's. As cgroup and 1089 * We're gonna grab cgroup_mutex which nests outside kernfs
942 * css's lifetimes will be decoupled, it should be made 1090 * active_ref. cgroup liveliness check alone provides enough
943 * per-subsystem and moved to css->id so that lookups are 1091 * protection against removal. Ensure @cgrp stays accessible and
944 * successful until the target css is released. 1092 * break the active_ref protection.
945 */ 1093 */
1094 cgroup_get(cgrp);
1095 kernfs_break_active_protection(kn);
1096
946 mutex_lock(&cgroup_mutex); 1097 mutex_lock(&cgroup_mutex);
947 idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
948 mutex_unlock(&cgroup_mutex);
949 cgrp->id = -1;
950 1098
951 call_rcu(&cgrp->rcu_head, cgroup_free_rcu); 1099 if (!cgroup_is_dead(cgrp))
1100 return cgrp;
1101
1102 cgroup_kn_unlock(kn);
1103 return NULL;
952} 1104}
953 1105
954static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) 1106static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
955{ 1107{
956 char name[CGROUP_FILE_NAME_MAX]; 1108 char name[CGROUP_FILE_NAME_MAX];
957 1109
958 lockdep_assert_held(&cgroup_tree_mutex); 1110 lockdep_assert_held(&cgroup_mutex);
959 kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name)); 1111 kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
960} 1112}
961 1113
@@ -964,7 +1116,7 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
964 * @cgrp: target cgroup 1116 * @cgrp: target cgroup
965 * @subsys_mask: mask of the subsystem ids whose files should be removed 1117 * @subsys_mask: mask of the subsystem ids whose files should be removed
966 */ 1118 */
967static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) 1119static void cgroup_clear_dir(struct cgroup *cgrp, unsigned int subsys_mask)
968{ 1120{
969 struct cgroup_subsys *ss; 1121 struct cgroup_subsys *ss;
970 int i; 1122 int i;
@@ -972,40 +1124,40 @@ static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask)
972 for_each_subsys(ss, i) { 1124 for_each_subsys(ss, i) {
973 struct cftype *cfts; 1125 struct cftype *cfts;
974 1126
975 if (!test_bit(i, &subsys_mask)) 1127 if (!(subsys_mask & (1 << i)))
976 continue; 1128 continue;
977 list_for_each_entry(cfts, &ss->cfts, node) 1129 list_for_each_entry(cfts, &ss->cfts, node)
978 cgroup_addrm_files(cgrp, cfts, false); 1130 cgroup_addrm_files(cgrp, cfts, false);
979 } 1131 }
980} 1132}
981 1133
982static int rebind_subsystems(struct cgroup_root *dst_root, 1134static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask)
983 unsigned long ss_mask)
984{ 1135{
985 struct cgroup_subsys *ss; 1136 struct cgroup_subsys *ss;
986 int ssid, ret; 1137 unsigned int tmp_ss_mask;
1138 int ssid, i, ret;
987 1139
988 lockdep_assert_held(&cgroup_tree_mutex);
989 lockdep_assert_held(&cgroup_mutex); 1140 lockdep_assert_held(&cgroup_mutex);
990 1141
991 for_each_subsys(ss, ssid) { 1142 for_each_subsys(ss, ssid) {
992 if (!(ss_mask & (1 << ssid))) 1143 if (!(ss_mask & (1 << ssid)))
993 continue; 1144 continue;
994 1145
995 /* if @ss is on the dummy_root, we can always move it */ 1146 /* if @ss has non-root csses attached to it, can't move */
996 if (ss->root == &cgrp_dfl_root) 1147 if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)))
997 continue;
998
999 /* if @ss has non-root cgroups attached to it, can't move */
1000 if (!list_empty(&ss->root->cgrp.children))
1001 return -EBUSY; 1148 return -EBUSY;
1002 1149
1003 /* can't move between two non-dummy roots either */ 1150 /* can't move between two non-dummy roots either */
1004 if (dst_root != &cgrp_dfl_root) 1151 if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
1005 return -EBUSY; 1152 return -EBUSY;
1006 } 1153 }
1007 1154
1008 ret = cgroup_populate_dir(&dst_root->cgrp, ss_mask); 1155 /* skip creating root files on dfl_root for inhibited subsystems */
1156 tmp_ss_mask = ss_mask;
1157 if (dst_root == &cgrp_dfl_root)
1158 tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask;
1159
1160 ret = cgroup_populate_dir(&dst_root->cgrp, tmp_ss_mask);
1009 if (ret) { 1161 if (ret) {
1010 if (dst_root != &cgrp_dfl_root) 1162 if (dst_root != &cgrp_dfl_root)
1011 return ret; 1163 return ret;
@@ -1017,9 +1169,9 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
1017 * Just warn about it and continue. 1169 * Just warn about it and continue.
1018 */ 1170 */
1019 if (cgrp_dfl_root_visible) { 1171 if (cgrp_dfl_root_visible) {
1020 pr_warning("cgroup: failed to create files (%d) while rebinding 0x%lx to default root\n", 1172 pr_warn("failed to create files (%d) while rebinding 0x%x to default root\n",
1021 ret, ss_mask); 1173 ret, ss_mask);
1022 pr_warning("cgroup: you may retry by moving them to a different hierarchy and unbinding\n"); 1174 pr_warn("you may retry by moving them to a different hierarchy and unbinding\n");
1023 } 1175 }
1024 } 1176 }
1025 1177
@@ -1027,15 +1179,14 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
1027 * Nothing can fail from this point on. Remove files for the 1179 * Nothing can fail from this point on. Remove files for the
1028 * removed subsystems and rebind each subsystem. 1180 * removed subsystems and rebind each subsystem.
1029 */ 1181 */
1030 mutex_unlock(&cgroup_mutex);
1031 for_each_subsys(ss, ssid) 1182 for_each_subsys(ss, ssid)
1032 if (ss_mask & (1 << ssid)) 1183 if (ss_mask & (1 << ssid))
1033 cgroup_clear_dir(&ss->root->cgrp, 1 << ssid); 1184 cgroup_clear_dir(&ss->root->cgrp, 1 << ssid);
1034 mutex_lock(&cgroup_mutex);
1035 1185
1036 for_each_subsys(ss, ssid) { 1186 for_each_subsys(ss, ssid) {
1037 struct cgroup_root *src_root; 1187 struct cgroup_root *src_root;
1038 struct cgroup_subsys_state *css; 1188 struct cgroup_subsys_state *css;
1189 struct css_set *cset;
1039 1190
1040 if (!(ss_mask & (1 << ssid))) 1191 if (!(ss_mask & (1 << ssid)))
1041 continue; 1192 continue;
@@ -1050,8 +1201,19 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
1050 ss->root = dst_root; 1201 ss->root = dst_root;
1051 css->cgroup = &dst_root->cgrp; 1202 css->cgroup = &dst_root->cgrp;
1052 1203
1053 src_root->cgrp.subsys_mask &= ~(1 << ssid); 1204 down_write(&css_set_rwsem);
1054 dst_root->cgrp.subsys_mask |= 1 << ssid; 1205 hash_for_each(css_set_table, i, cset, hlist)
1206 list_move_tail(&cset->e_cset_node[ss->id],
1207 &dst_root->cgrp.e_csets[ss->id]);
1208 up_write(&css_set_rwsem);
1209
1210 src_root->subsys_mask &= ~(1 << ssid);
1211 src_root->cgrp.child_subsys_mask &= ~(1 << ssid);
1212
1213 /* default hierarchy doesn't enable controllers by default */
1214 dst_root->subsys_mask |= 1 << ssid;
1215 if (dst_root != &cgrp_dfl_root)
1216 dst_root->cgrp.child_subsys_mask |= 1 << ssid;
1055 1217
1056 if (ss->bind) 1218 if (ss->bind)
1057 ss->bind(css); 1219 ss->bind(css);
@@ -1069,7 +1231,7 @@ static int cgroup_show_options(struct seq_file *seq,
1069 int ssid; 1231 int ssid;
1070 1232
1071 for_each_subsys(ss, ssid) 1233 for_each_subsys(ss, ssid)
1072 if (root->cgrp.subsys_mask & (1 << ssid)) 1234 if (root->subsys_mask & (1 << ssid))
1073 seq_printf(seq, ",%s", ss->name); 1235 seq_printf(seq, ",%s", ss->name);
1074 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) 1236 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
1075 seq_puts(seq, ",sane_behavior"); 1237 seq_puts(seq, ",sane_behavior");
@@ -1091,8 +1253,8 @@ static int cgroup_show_options(struct seq_file *seq,
1091} 1253}
1092 1254
1093struct cgroup_sb_opts { 1255struct cgroup_sb_opts {
1094 unsigned long subsys_mask; 1256 unsigned int subsys_mask;
1095 unsigned long flags; 1257 unsigned int flags;
1096 char *release_agent; 1258 char *release_agent;
1097 bool cpuset_clone_children; 1259 bool cpuset_clone_children;
1098 char *name; 1260 char *name;
@@ -1100,24 +1262,16 @@ struct cgroup_sb_opts {
1100 bool none; 1262 bool none;
1101}; 1263};
1102 1264
1103/*
1104 * Convert a hierarchy specifier into a bitmask of subsystems and
1105 * flags. Call with cgroup_mutex held to protect the cgroup_subsys[]
1106 * array. This function takes refcounts on subsystems to be used, unless it
1107 * returns error, in which case no refcounts are taken.
1108 */
1109static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) 1265static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1110{ 1266{
1111 char *token, *o = data; 1267 char *token, *o = data;
1112 bool all_ss = false, one_ss = false; 1268 bool all_ss = false, one_ss = false;
1113 unsigned long mask = (unsigned long)-1; 1269 unsigned int mask = -1U;
1114 struct cgroup_subsys *ss; 1270 struct cgroup_subsys *ss;
1115 int i; 1271 int i;
1116 1272
1117 BUG_ON(!mutex_is_locked(&cgroup_mutex));
1118
1119#ifdef CONFIG_CPUSETS 1273#ifdef CONFIG_CPUSETS
1120 mask = ~(1UL << cpuset_cgrp_id); 1274 mask = ~(1U << cpuset_cgrp_id);
1121#endif 1275#endif
1122 1276
1123 memset(opts, 0, sizeof(*opts)); 1277 memset(opts, 0, sizeof(*opts));
@@ -1198,7 +1352,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1198 /* Mutually exclusive option 'all' + subsystem name */ 1352 /* Mutually exclusive option 'all' + subsystem name */
1199 if (all_ss) 1353 if (all_ss)
1200 return -EINVAL; 1354 return -EINVAL;
1201 set_bit(i, &opts->subsys_mask); 1355 opts->subsys_mask |= (1 << i);
1202 one_ss = true; 1356 one_ss = true;
1203 1357
1204 break; 1358 break;
@@ -1210,12 +1364,12 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1210 /* Consistency checks */ 1364 /* Consistency checks */
1211 1365
1212 if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { 1366 if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
1213 pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); 1367 pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
1214 1368
1215 if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) || 1369 if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) ||
1216 opts->cpuset_clone_children || opts->release_agent || 1370 opts->cpuset_clone_children || opts->release_agent ||
1217 opts->name) { 1371 opts->name) {
1218 pr_err("cgroup: sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n"); 1372 pr_err("sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n");
1219 return -EINVAL; 1373 return -EINVAL;
1220 } 1374 }
1221 } else { 1375 } else {
@@ -1227,7 +1381,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1227 if (all_ss || (!one_ss && !opts->none && !opts->name)) 1381 if (all_ss || (!one_ss && !opts->none && !opts->name))
1228 for_each_subsys(ss, i) 1382 for_each_subsys(ss, i)
1229 if (!ss->disabled) 1383 if (!ss->disabled)
1230 set_bit(i, &opts->subsys_mask); 1384 opts->subsys_mask |= (1 << i);
1231 1385
1232 /* 1386 /*
1233 * We either have to specify by name or by subsystems. (So 1387 * We either have to specify by name or by subsystems. (So
@@ -1258,14 +1412,13 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
1258 int ret = 0; 1412 int ret = 0;
1259 struct cgroup_root *root = cgroup_root_from_kf(kf_root); 1413 struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1260 struct cgroup_sb_opts opts; 1414 struct cgroup_sb_opts opts;
1261 unsigned long added_mask, removed_mask; 1415 unsigned int added_mask, removed_mask;
1262 1416
1263 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) { 1417 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) {
1264 pr_err("cgroup: sane_behavior: remount is not allowed\n"); 1418 pr_err("sane_behavior: remount is not allowed\n");
1265 return -EINVAL; 1419 return -EINVAL;
1266 } 1420 }
1267 1421
1268 mutex_lock(&cgroup_tree_mutex);
1269 mutex_lock(&cgroup_mutex); 1422 mutex_lock(&cgroup_mutex);
1270 1423
1271 /* See what subsystems are wanted */ 1424 /* See what subsystems are wanted */
@@ -1273,17 +1426,17 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
1273 if (ret) 1426 if (ret)
1274 goto out_unlock; 1427 goto out_unlock;
1275 1428
1276 if (opts.subsys_mask != root->cgrp.subsys_mask || opts.release_agent) 1429 if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
1277 pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", 1430 pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
1278 task_tgid_nr(current), current->comm); 1431 task_tgid_nr(current), current->comm);
1279 1432
1280 added_mask = opts.subsys_mask & ~root->cgrp.subsys_mask; 1433 added_mask = opts.subsys_mask & ~root->subsys_mask;
1281 removed_mask = root->cgrp.subsys_mask & ~opts.subsys_mask; 1434 removed_mask = root->subsys_mask & ~opts.subsys_mask;
1282 1435
1283 /* Don't allow flags or name to change at remount */ 1436 /* Don't allow flags or name to change at remount */
1284 if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) || 1437 if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) ||
1285 (opts.name && strcmp(opts.name, root->name))) { 1438 (opts.name && strcmp(opts.name, root->name))) {
1286 pr_err("cgroup: option or name mismatch, new: 0x%lx \"%s\", old: 0x%lx \"%s\"\n", 1439 pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
1287 opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "", 1440 opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "",
1288 root->flags & CGRP_ROOT_OPTION_MASK, root->name); 1441 root->flags & CGRP_ROOT_OPTION_MASK, root->name);
1289 ret = -EINVAL; 1442 ret = -EINVAL;
@@ -1291,7 +1444,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
1291 } 1444 }
1292 1445
1293 /* remounting is not allowed for populated hierarchies */ 1446 /* remounting is not allowed for populated hierarchies */
1294 if (!list_empty(&root->cgrp.children)) { 1447 if (!list_empty(&root->cgrp.self.children)) {
1295 ret = -EBUSY; 1448 ret = -EBUSY;
1296 goto out_unlock; 1449 goto out_unlock;
1297 } 1450 }
@@ -1311,7 +1464,6 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
1311 kfree(opts.release_agent); 1464 kfree(opts.release_agent);
1312 kfree(opts.name); 1465 kfree(opts.name);
1313 mutex_unlock(&cgroup_mutex); 1466 mutex_unlock(&cgroup_mutex);
1314 mutex_unlock(&cgroup_tree_mutex);
1315 return ret; 1467 return ret;
1316} 1468}
1317 1469
@@ -1369,14 +1521,22 @@ out_unlock:
1369 1521
1370static void init_cgroup_housekeeping(struct cgroup *cgrp) 1522static void init_cgroup_housekeeping(struct cgroup *cgrp)
1371{ 1523{
1372 atomic_set(&cgrp->refcnt, 1); 1524 struct cgroup_subsys *ss;
1373 INIT_LIST_HEAD(&cgrp->sibling); 1525 int ssid;
1374 INIT_LIST_HEAD(&cgrp->children); 1526
1527 INIT_LIST_HEAD(&cgrp->self.sibling);
1528 INIT_LIST_HEAD(&cgrp->self.children);
1375 INIT_LIST_HEAD(&cgrp->cset_links); 1529 INIT_LIST_HEAD(&cgrp->cset_links);
1376 INIT_LIST_HEAD(&cgrp->release_list); 1530 INIT_LIST_HEAD(&cgrp->release_list);
1377 INIT_LIST_HEAD(&cgrp->pidlists); 1531 INIT_LIST_HEAD(&cgrp->pidlists);
1378 mutex_init(&cgrp->pidlist_mutex); 1532 mutex_init(&cgrp->pidlist_mutex);
1379 cgrp->dummy_css.cgroup = cgrp; 1533 cgrp->self.cgroup = cgrp;
1534 cgrp->self.flags |= CSS_ONLINE;
1535
1536 for_each_subsys(ss, ssid)
1537 INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
1538
1539 init_waitqueue_head(&cgrp->offline_waitq);
1380} 1540}
1381 1541
1382static void init_cgroup_root(struct cgroup_root *root, 1542static void init_cgroup_root(struct cgroup_root *root,
@@ -1399,21 +1559,24 @@ static void init_cgroup_root(struct cgroup_root *root,
1399 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); 1559 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
1400} 1560}
1401 1561
1402static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) 1562static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)
1403{ 1563{
1404 LIST_HEAD(tmp_links); 1564 LIST_HEAD(tmp_links);
1405 struct cgroup *root_cgrp = &root->cgrp; 1565 struct cgroup *root_cgrp = &root->cgrp;
1406 struct css_set *cset; 1566 struct css_set *cset;
1407 int i, ret; 1567 int i, ret;
1408 1568
1409 lockdep_assert_held(&cgroup_tree_mutex);
1410 lockdep_assert_held(&cgroup_mutex); 1569 lockdep_assert_held(&cgroup_mutex);
1411 1570
1412 ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL); 1571 ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_NOWAIT);
1413 if (ret < 0) 1572 if (ret < 0)
1414 goto out; 1573 goto out;
1415 root_cgrp->id = ret; 1574 root_cgrp->id = ret;
1416 1575
1576 ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release);
1577 if (ret)
1578 goto out;
1579
1417 /* 1580 /*
1418 * We're accessing css_set_count without locking css_set_rwsem here, 1581 * We're accessing css_set_count without locking css_set_rwsem here,
1419 * but that's OK - it can only be increased by someone holding 1582 * but that's OK - it can only be increased by someone holding
@@ -1422,11 +1585,11 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
1422 */ 1585 */
1423 ret = allocate_cgrp_cset_links(css_set_count, &tmp_links); 1586 ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
1424 if (ret) 1587 if (ret)
1425 goto out; 1588 goto cancel_ref;
1426 1589
1427 ret = cgroup_init_root_id(root); 1590 ret = cgroup_init_root_id(root);
1428 if (ret) 1591 if (ret)
1429 goto out; 1592 goto cancel_ref;
1430 1593
1431 root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops, 1594 root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops,
1432 KERNFS_ROOT_CREATE_DEACTIVATED, 1595 KERNFS_ROOT_CREATE_DEACTIVATED,
@@ -1462,7 +1625,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
1462 link_css_set(&tmp_links, cset, root_cgrp); 1625 link_css_set(&tmp_links, cset, root_cgrp);
1463 up_write(&css_set_rwsem); 1626 up_write(&css_set_rwsem);
1464 1627
1465 BUG_ON(!list_empty(&root_cgrp->children)); 1628 BUG_ON(!list_empty(&root_cgrp->self.children));
1466 BUG_ON(atomic_read(&root->nr_cgrps) != 1); 1629 BUG_ON(atomic_read(&root->nr_cgrps) != 1);
1467 1630
1468 kernfs_activate(root_cgrp->kn); 1631 kernfs_activate(root_cgrp->kn);
@@ -1474,6 +1637,8 @@ destroy_root:
1474 root->kf_root = NULL; 1637 root->kf_root = NULL;
1475exit_root_id: 1638exit_root_id:
1476 cgroup_exit_root_id(root); 1639 cgroup_exit_root_id(root);
1640cancel_ref:
1641 percpu_ref_cancel_init(&root_cgrp->self.refcnt);
1477out: 1642out:
1478 free_cgrp_cset_links(&tmp_links); 1643 free_cgrp_cset_links(&tmp_links);
1479 return ret; 1644 return ret;
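The new percpu ref on the root cgroup's self css follows the usual init/cancel error-unwind pattern: initialize the ref with a release callback up front and cancel it on any later setup failure, since the ref was never made visible. A hedged sketch of just that pattern; my_obj, my_release() and do_more_setup() are placeholders, and the two-argument percpu_ref_init() matches this era of the kernel (later kernels added flags and GFP arguments).

#include <linux/percpu-refcount.h>

/* Sketch only: the struct, the release callback and do_more_setup() are
 * placeholders, not cgroup code. */
struct my_obj {
	struct percpu_ref refcnt;
};

static void my_release(struct percpu_ref *ref)
{
	/* free the object embedding @ref here */
}

static int do_more_setup(struct my_obj *obj)
{
	return 0;	/* placeholder for the later setup steps */
}

static int setup_object(struct my_obj *obj)
{
	int ret;

	ret = percpu_ref_init(&obj->refcnt, my_release);
	if (ret)
		return ret;

	ret = do_more_setup(obj);
	if (ret)
		goto cancel_ref;

	return 0;

cancel_ref:
	/* the ref was never published, so undo the init rather than kill it */
	percpu_ref_cancel_init(&obj->refcnt);
	return ret;
}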
@@ -1495,8 +1660,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1495 */ 1660 */
1496 if (!use_task_css_set_links) 1661 if (!use_task_css_set_links)
1497 cgroup_enable_task_cg_lists(); 1662 cgroup_enable_task_cg_lists();
1498retry: 1663
1499 mutex_lock(&cgroup_tree_mutex);
1500 mutex_lock(&cgroup_mutex); 1664 mutex_lock(&cgroup_mutex);
1501 1665
1502 /* First find the desired set of subsystems */ 1666 /* First find the desired set of subsystems */
@@ -1535,7 +1699,7 @@ retry:
1535 * subsystems) then they must match. 1699 * subsystems) then they must match.
1536 */ 1700 */
1537 if ((opts.subsys_mask || opts.none) && 1701 if ((opts.subsys_mask || opts.none) &&
1538 (opts.subsys_mask != root->cgrp.subsys_mask)) { 1702 (opts.subsys_mask != root->subsys_mask)) {
1539 if (!name_match) 1703 if (!name_match)
1540 continue; 1704 continue;
1541 ret = -EBUSY; 1705 ret = -EBUSY;
@@ -1544,28 +1708,27 @@ retry:
1544 1708
1545 if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) { 1709 if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) {
1546 if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) { 1710 if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) {
1547 pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n"); 1711 pr_err("sane_behavior: new mount options should match the existing superblock\n");
1548 ret = -EINVAL; 1712 ret = -EINVAL;
1549 goto out_unlock; 1713 goto out_unlock;
1550 } else { 1714 } else {
1551 pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n"); 1715 pr_warn("new mount options do not match the existing superblock, will be ignored\n");
1552 } 1716 }
1553 } 1717 }
1554 1718
1555 /* 1719 /*
1556 * A root's lifetime is governed by its root cgroup. Zero 1720 * A root's lifetime is governed by its root cgroup.
1557 * ref indicate that the root is being destroyed. Wait for 1721 * tryget_live failure indicates that the root is being
1558 * destruction to complete so that the subsystems are free. 1722 * destroyed. Wait for destruction to complete so that the
1559 * We can use wait_queue for the wait but this path is 1723 * subsystems are free. We can use wait_queue for the wait
1560 * super cold. Let's just sleep for a bit and retry. 1724 * but this path is super cold. Let's just sleep for a bit
1725 * and retry.
1561 */ 1726 */
1562 if (!atomic_inc_not_zero(&root->cgrp.refcnt)) { 1727 if (!percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
1563 mutex_unlock(&cgroup_mutex); 1728 mutex_unlock(&cgroup_mutex);
1564 mutex_unlock(&cgroup_tree_mutex);
1565 kfree(opts.release_agent);
1566 kfree(opts.name);
1567 msleep(10); 1729 msleep(10);
1568 goto retry; 1730 ret = restart_syscall();
1731 goto out_free;
1569 } 1732 }
1570 1733
1571 ret = 0; 1734 ret = 0;
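The mount path now uses a pattern worth calling out: when percpu_ref_tryget_live() fails because the root is dying, it drops the lock, sleeps briefly and restarts the whole syscall via restart_syscall() instead of looping in place as the removed retry: label did. A hedged sketch of that shape; my_root and my_mutex are placeholders.

#include <linux/delay.h>
#include <linux/mutex.h>
#include <linux/percpu-refcount.h>
#include <linux/sched.h>	/* restart_syscall() */

/* Sketch only: my_root and my_mutex stand in for the real objects. */
struct my_root {
	struct percpu_ref refcnt;
};

static DEFINE_MUTEX(my_mutex);

static int grab_live_root(struct my_root *root)
{
	/* caller holds my_mutex */
	if (!percpu_ref_tryget_live(&root->refcnt)) {
		/*
		 * The root is being destroyed.  This path is cold, so a
		 * short sleep plus a full syscall restart is simpler than
		 * waiting on a waitqueue.
		 */
		mutex_unlock(&my_mutex);
		msleep(10);
		return restart_syscall();
	}
	return 0;
}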
@@ -1596,15 +1759,15 @@ retry:
1596 1759
1597out_unlock: 1760out_unlock:
1598 mutex_unlock(&cgroup_mutex); 1761 mutex_unlock(&cgroup_mutex);
1599 mutex_unlock(&cgroup_tree_mutex); 1762out_free:
1600
1601 kfree(opts.release_agent); 1763 kfree(opts.release_agent);
1602 kfree(opts.name); 1764 kfree(opts.name);
1603 1765
1604 if (ret) 1766 if (ret)
1605 return ERR_PTR(ret); 1767 return ERR_PTR(ret);
1606 1768
1607 dentry = kernfs_mount(fs_type, flags, root->kf_root, &new_sb); 1769 dentry = kernfs_mount(fs_type, flags, root->kf_root,
1770 CGROUP_SUPER_MAGIC, &new_sb);
1608 if (IS_ERR(dentry) || !new_sb) 1771 if (IS_ERR(dentry) || !new_sb)
1609 cgroup_put(&root->cgrp); 1772 cgroup_put(&root->cgrp);
1610 return dentry; 1773 return dentry;
@@ -1615,7 +1778,19 @@ static void cgroup_kill_sb(struct super_block *sb)
1615 struct kernfs_root *kf_root = kernfs_root_from_sb(sb); 1778 struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
1616 struct cgroup_root *root = cgroup_root_from_kf(kf_root); 1779 struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1617 1780
1618 cgroup_put(&root->cgrp); 1781 /*
1782 * If @root doesn't have any mounts or children, start killing it.
1783 * This prevents new mounts by disabling percpu_ref_tryget_live().
1784 * cgroup_mount() may wait for @root's release.
1785 *
1786 * And don't kill the default root.
1787 */
1788 if (css_has_online_children(&root->cgrp.self) ||
1789 root == &cgrp_dfl_root)
1790 cgroup_put(&root->cgrp);
1791 else
1792 percpu_ref_kill(&root->cgrp.self.refcnt);
1793
1619 kernfs_kill_sb(sb); 1794 kernfs_kill_sb(sb);
1620} 1795}
1621 1796
@@ -1737,7 +1912,7 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
1737 1912
1738/** 1913/**
1739 * cgroup_task_migrate - move a task from one cgroup to another. 1914 * cgroup_task_migrate - move a task from one cgroup to another.
1740 * @old_cgrp; the cgroup @tsk is being migrated from 1915 * @old_cgrp: the cgroup @tsk is being migrated from
1741 * @tsk: the task being migrated 1916 * @tsk: the task being migrated
1742 * @new_cset: the new css_set @tsk is being attached to 1917 * @new_cset: the new css_set @tsk is being attached to
1743 * 1918 *
@@ -1829,10 +2004,6 @@ static void cgroup_migrate_add_src(struct css_set *src_cset,
1829 2004
1830 src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); 2005 src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
1831 2006
1832 /* nothing to do if this cset already belongs to the cgroup */
1833 if (src_cgrp == dst_cgrp)
1834 return;
1835
1836 if (!list_empty(&src_cset->mg_preload_node)) 2007 if (!list_empty(&src_cset->mg_preload_node))
1837 return; 2008 return;
1838 2009
@@ -1847,13 +2018,14 @@ static void cgroup_migrate_add_src(struct css_set *src_cset,
1847 2018
1848/** 2019/**
1849 * cgroup_migrate_prepare_dst - prepare destination css_sets for migration 2020 * cgroup_migrate_prepare_dst - prepare destination css_sets for migration
1850 * @dst_cgrp: the destination cgroup 2021 * @dst_cgrp: the destination cgroup (may be %NULL)
1851 * @preloaded_csets: list of preloaded source css_sets 2022 * @preloaded_csets: list of preloaded source css_sets
1852 * 2023 *
1853 * Tasks are about to be moved to @dst_cgrp and all the source css_sets 2024 * Tasks are about to be moved to @dst_cgrp and all the source css_sets
1854 * have been preloaded to @preloaded_csets. This function looks up and 2025 * have been preloaded to @preloaded_csets. This function looks up and
1855 * pins all destination css_sets, links each to its source, and put them on 2026 * pins all destination css_sets, links each to its source, and appends them
1856 * @preloaded_csets. 2027 * to @preloaded_csets. If @dst_cgrp is %NULL, the destination of each
2028 * source css_set is assumed to be its cgroup on the default hierarchy.
1857 * 2029 *
1858 * This function must be called after cgroup_migrate_add_src() has been 2030 * This function must be called after cgroup_migrate_add_src() has been
1859 * called on each migration source css_set. After migration is performed 2031 * called on each migration source css_set. After migration is performed
@@ -1864,19 +2036,42 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
1864 struct list_head *preloaded_csets) 2036 struct list_head *preloaded_csets)
1865{ 2037{
1866 LIST_HEAD(csets); 2038 LIST_HEAD(csets);
1867 struct css_set *src_cset; 2039 struct css_set *src_cset, *tmp_cset;
1868 2040
1869 lockdep_assert_held(&cgroup_mutex); 2041 lockdep_assert_held(&cgroup_mutex);
1870 2042
2043 /*
2044 * Except for the root, child_subsys_mask must be zero for a cgroup
2045 * with tasks so that child cgroups don't compete against tasks.
2046 */
2047 if (dst_cgrp && cgroup_on_dfl(dst_cgrp) && cgroup_parent(dst_cgrp) &&
2048 dst_cgrp->child_subsys_mask)
2049 return -EBUSY;
2050
1871 /* look up the dst cset for each src cset and link it to src */ 2051 /* look up the dst cset for each src cset and link it to src */
1872 list_for_each_entry(src_cset, preloaded_csets, mg_preload_node) { 2052 list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) {
1873 struct css_set *dst_cset; 2053 struct css_set *dst_cset;
1874 2054
1875 dst_cset = find_css_set(src_cset, dst_cgrp); 2055 dst_cset = find_css_set(src_cset,
2056 dst_cgrp ?: src_cset->dfl_cgrp);
1876 if (!dst_cset) 2057 if (!dst_cset)
1877 goto err; 2058 goto err;
1878 2059
1879 WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset); 2060 WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);
2061
2062 /*
2063 * If src cset equals dst, it's noop. Drop the src.
2064 * cgroup_migrate() will skip the cset too. Note that we
2065 * can't handle src == dst as some nodes are used by both.
2066 */
2067 if (src_cset == dst_cset) {
2068 src_cset->mg_src_cgrp = NULL;
2069 list_del_init(&src_cset->mg_preload_node);
2070 put_css_set(src_cset, false);
2071 put_css_set(dst_cset, false);
2072 continue;
2073 }
2074
1880 src_cset->mg_dst_cset = dst_cset; 2075 src_cset->mg_dst_cset = dst_cset;
1881 2076
1882 if (list_empty(&dst_cset->mg_preload_node)) 2077 if (list_empty(&dst_cset->mg_preload_node))
@@ -1885,7 +2080,7 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
1885 put_css_set(dst_cset, false); 2080 put_css_set(dst_cset, false);
1886 } 2081 }
1887 2082
1888 list_splice(&csets, preloaded_csets); 2083 list_splice_tail(&csets, preloaded_csets);
1889 return 0; 2084 return 0;
1890err: 2085err:
1891 cgroup_migrate_finish(&csets); 2086 cgroup_migrate_finish(&csets);
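Per the kernel-doc above, the migration helpers are meant to be driven in a fixed order: preload the source css_sets with cgroup_migrate_add_src(), pin the destinations with cgroup_migrate_prepare_dst(), perform cgroup_migrate(), and always clean up with cgroup_migrate_finish(). A hedged sketch of that sequence, mirroring how cgroup_attach_task() uses it; error handling is trimmed and the function name is illustrative.

/* Sketch of the documented call order (caller holds cgroup_mutex). */
static int migrate_one_leader(struct cgroup *dst_cgrp, struct task_struct *leader)
{
	LIST_HEAD(preloaded_csets);
	int ret;

	/* 1. preload the source css_set(s) */
	down_read(&css_set_rwsem);
	cgroup_migrate_add_src(task_css_set(leader), dst_cgrp, &preloaded_csets);
	up_read(&css_set_rwsem);

	/* 2. look up and pin the matching destination css_sets */
	ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets);
	if (!ret)
		/* 3. move the whole thread group */
		ret = cgroup_migrate(dst_cgrp, leader, true);

	/* 4. always drop the preloaded csets */
	cgroup_migrate_finish(&preloaded_csets);
	return ret;
}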
@@ -1966,7 +2161,7 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,
1966 return 0; 2161 return 0;
1967 2162
1968 /* check that we can legitimately attach to the cgroup */ 2163 /* check that we can legitimately attach to the cgroup */
1969 for_each_css(css, i, cgrp) { 2164 for_each_e_css(css, i, cgrp) {
1970 if (css->ss->can_attach) { 2165 if (css->ss->can_attach) {
1971 ret = css->ss->can_attach(css, &tset); 2166 ret = css->ss->can_attach(css, &tset);
1972 if (ret) { 2167 if (ret) {
@@ -1996,7 +2191,7 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,
1996 */ 2191 */
1997 tset.csets = &tset.dst_csets; 2192 tset.csets = &tset.dst_csets;
1998 2193
1999 for_each_css(css, i, cgrp) 2194 for_each_e_css(css, i, cgrp)
2000 if (css->ss->attach) 2195 if (css->ss->attach)
2001 css->ss->attach(css, &tset); 2196 css->ss->attach(css, &tset);
2002 2197
@@ -2004,7 +2199,7 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,
2004 goto out_release_tset; 2199 goto out_release_tset;
2005 2200
2006out_cancel_attach: 2201out_cancel_attach:
2007 for_each_css(css, i, cgrp) { 2202 for_each_e_css(css, i, cgrp) {
2008 if (css == failed_css) 2203 if (css == failed_css)
2009 break; 2204 break;
2010 if (css->ss->cancel_attach) 2205 if (css->ss->cancel_attach)
@@ -2063,13 +2258,20 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
2063 * function to attach either it or all tasks in its threadgroup. Will lock 2258 * function to attach either it or all tasks in its threadgroup. Will lock
2064 * cgroup_mutex and threadgroup. 2259 * cgroup_mutex and threadgroup.
2065 */ 2260 */
2066static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) 2261static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
2262 size_t nbytes, loff_t off, bool threadgroup)
2067{ 2263{
2068 struct task_struct *tsk; 2264 struct task_struct *tsk;
2069 const struct cred *cred = current_cred(), *tcred; 2265 const struct cred *cred = current_cred(), *tcred;
2266 struct cgroup *cgrp;
2267 pid_t pid;
2070 int ret; 2268 int ret;
2071 2269
2072 if (!cgroup_lock_live_group(cgrp)) 2270 if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
2271 return -EINVAL;
2272
2273 cgrp = cgroup_kn_lock_live(of->kn);
2274 if (!cgrp)
2073 return -ENODEV; 2275 return -ENODEV;
2074 2276
2075retry_find_task: 2277retry_find_task:
@@ -2135,8 +2337,8 @@ retry_find_task:
2135 2337
2136 put_task_struct(tsk); 2338 put_task_struct(tsk);
2137out_unlock_cgroup: 2339out_unlock_cgroup:
2138 mutex_unlock(&cgroup_mutex); 2340 cgroup_kn_unlock(of->kn);
2139 return ret; 2341 return ret ?: nbytes;
2140} 2342}
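For context, the rewritten __cgroup_procs_write() is what user space hits when it writes a single decimal PID into a cgroup's cgroup.procs (or tasks) file. A minimal user-space sketch; the directory passed in is wherever the hierarchy happens to be mounted, which is an assumption of the example.

#include <stdio.h>
#include <sys/types.h>

/* Move a whole thread group into a cgroup by writing its TGID to the
 * cgroup.procs file inside that cgroup's directory. */
static int move_pid_to_cgroup(const char *cgroup_dir, pid_t pid)
{
	char path[4096];
	FILE *f;

	snprintf(path, sizeof(path), "%s/cgroup.procs", cgroup_dir);
	f = fopen(path, "w");
	if (!f)
		return -1;
	if (fprintf(f, "%d\n", (int)pid) < 0) {
		fclose(f);
		return -1;
	}
	return fclose(f);	/* 0 on success */
}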
2141 2343
2142/** 2344/**
@@ -2170,43 +2372,44 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
2170} 2372}
2171EXPORT_SYMBOL_GPL(cgroup_attach_task_all); 2373EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
2172 2374
2173static int cgroup_tasks_write(struct cgroup_subsys_state *css, 2375static ssize_t cgroup_tasks_write(struct kernfs_open_file *of,
2174 struct cftype *cft, u64 pid) 2376 char *buf, size_t nbytes, loff_t off)
2175{ 2377{
2176 return attach_task_by_pid(css->cgroup, pid, false); 2378 return __cgroup_procs_write(of, buf, nbytes, off, false);
2177} 2379}
2178 2380
2179static int cgroup_procs_write(struct cgroup_subsys_state *css, 2381static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
2180 struct cftype *cft, u64 tgid) 2382 char *buf, size_t nbytes, loff_t off)
2181{ 2383{
2182 return attach_task_by_pid(css->cgroup, tgid, true); 2384 return __cgroup_procs_write(of, buf, nbytes, off, true);
2183} 2385}
2184 2386
2185static int cgroup_release_agent_write(struct cgroup_subsys_state *css, 2387static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
2186 struct cftype *cft, char *buffer) 2388 char *buf, size_t nbytes, loff_t off)
2187{ 2389{
2188 struct cgroup_root *root = css->cgroup->root; 2390 struct cgroup *cgrp;
2189 2391
2190 BUILD_BUG_ON(sizeof(root->release_agent_path) < PATH_MAX); 2392 BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
2191 if (!cgroup_lock_live_group(css->cgroup)) 2393
2394 cgrp = cgroup_kn_lock_live(of->kn);
2395 if (!cgrp)
2192 return -ENODEV; 2396 return -ENODEV;
2193 spin_lock(&release_agent_path_lock); 2397 spin_lock(&release_agent_path_lock);
2194 strlcpy(root->release_agent_path, buffer, 2398 strlcpy(cgrp->root->release_agent_path, strstrip(buf),
2195 sizeof(root->release_agent_path)); 2399 sizeof(cgrp->root->release_agent_path));
2196 spin_unlock(&release_agent_path_lock); 2400 spin_unlock(&release_agent_path_lock);
2197 mutex_unlock(&cgroup_mutex); 2401 cgroup_kn_unlock(of->kn);
2198 return 0; 2402 return nbytes;
2199} 2403}
2200 2404
2201static int cgroup_release_agent_show(struct seq_file *seq, void *v) 2405static int cgroup_release_agent_show(struct seq_file *seq, void *v)
2202{ 2406{
2203 struct cgroup *cgrp = seq_css(seq)->cgroup; 2407 struct cgroup *cgrp = seq_css(seq)->cgroup;
2204 2408
2205 if (!cgroup_lock_live_group(cgrp)) 2409 spin_lock(&release_agent_path_lock);
2206 return -ENODEV;
2207 seq_puts(seq, cgrp->root->release_agent_path); 2410 seq_puts(seq, cgrp->root->release_agent_path);
2411 spin_unlock(&release_agent_path_lock);
2208 seq_putc(seq, '\n'); 2412 seq_putc(seq, '\n');
2209 mutex_unlock(&cgroup_mutex);
2210 return 0; 2413 return 0;
2211} 2414}
2212 2415
@@ -2218,6 +2421,320 @@ static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
2218 return 0; 2421 return 0;
2219} 2422}
2220 2423
2424static void cgroup_print_ss_mask(struct seq_file *seq, unsigned int ss_mask)
2425{
2426 struct cgroup_subsys *ss;
2427 bool printed = false;
2428 int ssid;
2429
2430 for_each_subsys(ss, ssid) {
2431 if (ss_mask & (1 << ssid)) {
2432 if (printed)
2433 seq_putc(seq, ' ');
2434 seq_printf(seq, "%s", ss->name);
2435 printed = true;
2436 }
2437 }
2438 if (printed)
2439 seq_putc(seq, '\n');
2440}
2441
2442/* show controllers which are currently attached to the default hierarchy */
2443static int cgroup_root_controllers_show(struct seq_file *seq, void *v)
2444{
2445 struct cgroup *cgrp = seq_css(seq)->cgroup;
2446
2447 cgroup_print_ss_mask(seq, cgrp->root->subsys_mask &
2448 ~cgrp_dfl_root_inhibit_ss_mask);
2449 return 0;
2450}
2451
2452/* show controllers which are enabled from the parent */
2453static int cgroup_controllers_show(struct seq_file *seq, void *v)
2454{
2455 struct cgroup *cgrp = seq_css(seq)->cgroup;
2456
2457 cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->child_subsys_mask);
2458 return 0;
2459}
2460
2461/* show controllers which are enabled for a given cgroup's children */
2462static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
2463{
2464 struct cgroup *cgrp = seq_css(seq)->cgroup;
2465
2466 cgroup_print_ss_mask(seq, cgrp->child_subsys_mask);
2467 return 0;
2468}
2469
2470/**
2471 * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy
2472 * @cgrp: root of the subtree to update csses for
2473 *
2474 * @cgrp's child_subsys_mask has changed and its subtree's (self excluded)
2475 * css associations need to be updated accordingly. This function looks up
2476 * all css_sets which are attached to the subtree, creates the matching
2477 * updated css_sets and migrates the tasks to the new ones.
2478 */
2479static int cgroup_update_dfl_csses(struct cgroup *cgrp)
2480{
2481 LIST_HEAD(preloaded_csets);
2482 struct cgroup_subsys_state *css;
2483 struct css_set *src_cset;
2484 int ret;
2485
2486 lockdep_assert_held(&cgroup_mutex);
2487
2488 /* look up all csses currently attached to @cgrp's subtree */
2489 down_read(&css_set_rwsem);
2490 css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) {
2491 struct cgrp_cset_link *link;
2492
2493 /* self is not affected by child_subsys_mask change */
2494 if (css->cgroup == cgrp)
2495 continue;
2496
2497 list_for_each_entry(link, &css->cgroup->cset_links, cset_link)
2498 cgroup_migrate_add_src(link->cset, cgrp,
2499 &preloaded_csets);
2500 }
2501 up_read(&css_set_rwsem);
2502
2503 /* NULL dst indicates self on default hierarchy */
2504 ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets);
2505 if (ret)
2506 goto out_finish;
2507
2508 list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) {
2509 struct task_struct *last_task = NULL, *task;
2510
2511 /* src_csets precede dst_csets, break on the first dst_cset */
2512 if (!src_cset->mg_src_cgrp)
2513 break;
2514
2515 /*
2516 * All tasks in src_cset need to be migrated to the
2517 * matching dst_cset. Empty it process by process. We
2518 * walk tasks but migrate processes. The leader might even
2519 * belong to a different cset but such src_cset would also
2520 * be among the target src_csets because the default
2521 * hierarchy enforces per-process membership.
2522 */
2523 while (true) {
2524 down_read(&css_set_rwsem);
2525 task = list_first_entry_or_null(&src_cset->tasks,
2526 struct task_struct, cg_list);
2527 if (task) {
2528 task = task->group_leader;
2529 WARN_ON_ONCE(!task_css_set(task)->mg_src_cgrp);
2530 get_task_struct(task);
2531 }
2532 up_read(&css_set_rwsem);
2533
2534 if (!task)
2535 break;
2536
2537 /* guard against possible infinite loop */
2538 if (WARN(last_task == task,
2539 "cgroup: update_dfl_csses failed to make progress, aborting in inconsistent state\n"))
2540 goto out_finish;
2541 last_task = task;
2542
2543 threadgroup_lock(task);
2544 /* raced against de_thread() from another thread? */
2545 if (!thread_group_leader(task)) {
2546 threadgroup_unlock(task);
2547 put_task_struct(task);
2548 continue;
2549 }
2550
2551 ret = cgroup_migrate(src_cset->dfl_cgrp, task, true);
2552
2553 threadgroup_unlock(task);
2554 put_task_struct(task);
2555
2556 if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret))
2557 goto out_finish;
2558 }
2559 }
2560
2561out_finish:
2562 cgroup_migrate_finish(&preloaded_csets);
2563 return ret;
2564}
2565
2566/* change the enabled child controllers for a cgroup in the default hierarchy */
2567static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
2568 char *buf, size_t nbytes,
2569 loff_t off)
2570{
2571 unsigned int enable = 0, disable = 0;
2572 struct cgroup *cgrp, *child;
2573 struct cgroup_subsys *ss;
2574 char *tok;
2575 int ssid, ret;
2576
2577 /*
2578 * Parse input - space separated list of subsystem names prefixed
2579 * with either + or -.
2580 */
2581 buf = strstrip(buf);
2582 while ((tok = strsep(&buf, " "))) {
2583 if (tok[0] == '\0')
2584 continue;
2585 for_each_subsys(ss, ssid) {
2586 if (ss->disabled || strcmp(tok + 1, ss->name) ||
2587 ((1 << ss->id) & cgrp_dfl_root_inhibit_ss_mask))
2588 continue;
2589
2590 if (*tok == '+') {
2591 enable |= 1 << ssid;
2592 disable &= ~(1 << ssid);
2593 } else if (*tok == '-') {
2594 disable |= 1 << ssid;
2595 enable &= ~(1 << ssid);
2596 } else {
2597 return -EINVAL;
2598 }
2599 break;
2600 }
2601 if (ssid == CGROUP_SUBSYS_COUNT)
2602 return -EINVAL;
2603 }
2604
2605 cgrp = cgroup_kn_lock_live(of->kn);
2606 if (!cgrp)
2607 return -ENODEV;
2608
2609 for_each_subsys(ss, ssid) {
2610 if (enable & (1 << ssid)) {
2611 if (cgrp->child_subsys_mask & (1 << ssid)) {
2612 enable &= ~(1 << ssid);
2613 continue;
2614 }
2615
2616 /*
2617 * Because css offlining is asynchronous, userland
2618 * might try to re-enable the same controller while
2619 * the previous instance is still around. In such
2620 * cases, wait till it's gone using offline_waitq.
2621 */
2622 cgroup_for_each_live_child(child, cgrp) {
2623 DEFINE_WAIT(wait);
2624
2625 if (!cgroup_css(child, ss))
2626 continue;
2627
2628 cgroup_get(child);
2629 prepare_to_wait(&child->offline_waitq, &wait,
2630 TASK_UNINTERRUPTIBLE);
2631 cgroup_kn_unlock(of->kn);
2632 schedule();
2633 finish_wait(&child->offline_waitq, &wait);
2634 cgroup_put(child);
2635
2636 return restart_syscall();
2637 }
2638
2639 /* unavailable or not enabled on the parent? */
2640 if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) ||
2641 (cgroup_parent(cgrp) &&
2642 !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ssid)))) {
2643 ret = -ENOENT;
2644 goto out_unlock;
2645 }
2646 } else if (disable & (1 << ssid)) {
2647 if (!(cgrp->child_subsys_mask & (1 << ssid))) {
2648 disable &= ~(1 << ssid);
2649 continue;
2650 }
2651
2652 /* a child has it enabled? */
2653 cgroup_for_each_live_child(child, cgrp) {
2654 if (child->child_subsys_mask & (1 << ssid)) {
2655 ret = -EBUSY;
2656 goto out_unlock;
2657 }
2658 }
2659 }
2660 }
2661
2662 if (!enable && !disable) {
2663 ret = 0;
2664 goto out_unlock;
2665 }
2666
2667 /*
2668 * Except for the root, child_subsys_mask must be zero for a cgroup
2669 * with tasks so that child cgroups don't compete against tasks.
2670 */
2671 if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) {
2672 ret = -EBUSY;
2673 goto out_unlock;
2674 }
2675
2676 /*
2677 * Create csses for enables and update child_subsys_mask. This
2678 * changes cgroup_e_css() results which in turn makes the
2679 * subsequent cgroup_update_dfl_csses() associate all tasks in the
2680 * subtree to the updated csses.
2681 */
2682 for_each_subsys(ss, ssid) {
2683 if (!(enable & (1 << ssid)))
2684 continue;
2685
2686 cgroup_for_each_live_child(child, cgrp) {
2687 ret = create_css(child, ss);
2688 if (ret)
2689 goto err_undo_css;
2690 }
2691 }
2692
2693 cgrp->child_subsys_mask |= enable;
2694 cgrp->child_subsys_mask &= ~disable;
2695
2696 ret = cgroup_update_dfl_csses(cgrp);
2697 if (ret)
2698 goto err_undo_css;
2699
2700 /* all tasks are now migrated away from the old csses, kill them */
2701 for_each_subsys(ss, ssid) {
2702 if (!(disable & (1 << ssid)))
2703 continue;
2704
2705 cgroup_for_each_live_child(child, cgrp)
2706 kill_css(cgroup_css(child, ss));
2707 }
2708
2709 kernfs_activate(cgrp->kn);
2710 ret = 0;
2711out_unlock:
2712 cgroup_kn_unlock(of->kn);
2713 return ret ?: nbytes;
2714
2715err_undo_css:
2716 cgrp->child_subsys_mask &= ~enable;
2717 cgrp->child_subsys_mask |= disable;
2718
2719 for_each_subsys(ss, ssid) {
2720 if (!(enable & (1 << ssid)))
2721 continue;
2722
2723 cgroup_for_each_live_child(child, cgrp) {
2724 struct cgroup_subsys_state *css = cgroup_css(child, ss);
2725 if (css)
2726 kill_css(css);
2727 }
2728 }
2729 goto out_unlock;
2730}
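cgroup.subtree_control takes a space-separated list of controller names, each prefixed with '+' to enable or '-' to disable, e.g. "+memory -cpu". A hedged, standalone sketch of just the token parsing mirrored from the loop above; the controller name table is an example, not the kernel's.

#include <string.h>

static const char * const names[] = { "cpu", "memory", "io" }; /* example set */

/* Parse "+name"/"-name" tokens into enable/disable bitmasks.
 * Returns 0 on success, -1 on an unknown or malformed token. */
static int parse_subtree_control(char *buf, unsigned int *enable,
				 unsigned int *disable)
{
	char *tok;

	*enable = *disable = 0;
	while ((tok = strsep(&buf, " "))) {
		size_t i;

		if (tok[0] == '\0')
			continue;
		for (i = 0; i < sizeof(names) / sizeof(names[0]); i++) {
			if (strcmp(tok + 1, names[i]))
				continue;
			if (tok[0] == '+') {
				*enable |= 1U << i;
				*disable &= ~(1U << i);
			} else if (tok[0] == '-') {
				*disable |= 1U << i;
				*enable &= ~(1U << i);
			} else {
				return -1;
			}
			break;
		}
		if (i == sizeof(names) / sizeof(names[0]))
			return -1;
	}
	return 0;
}

From user space, enabling a controller for a cgroup's children on the default hierarchy then amounts to writing such a token list into that cgroup's cgroup.subtree_control file.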
2731
2732static int cgroup_populated_show(struct seq_file *seq, void *v)
2733{
2734 seq_printf(seq, "%d\n", (bool)seq_css(seq)->cgroup->populated_cnt);
2735 return 0;
2736}
2737
2221static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, 2738static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
2222 size_t nbytes, loff_t off) 2739 size_t nbytes, loff_t off)
2223{ 2740{
@@ -2226,6 +2743,9 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
2226 struct cgroup_subsys_state *css; 2743 struct cgroup_subsys_state *css;
2227 int ret; 2744 int ret;
2228 2745
2746 if (cft->write)
2747 return cft->write(of, buf, nbytes, off);
2748
2229 /* 2749 /*
2230 * kernfs guarantees that a file isn't deleted with operations in 2750 * kernfs guarantees that a file isn't deleted with operations in
2231 * flight, which means that the matching css is and stays alive and 2751 * flight, which means that the matching css is and stays alive and
@@ -2236,9 +2756,7 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
2236 css = cgroup_css(cgrp, cft->ss); 2756 css = cgroup_css(cgrp, cft->ss);
2237 rcu_read_unlock(); 2757 rcu_read_unlock();
2238 2758
2239 if (cft->write_string) { 2759 if (cft->write_u64) {
2240 ret = cft->write_string(css, cft, strstrip(buf));
2241 } else if (cft->write_u64) {
2242 unsigned long long v; 2760 unsigned long long v;
2243 ret = kstrtoull(buf, 0, &v); 2761 ret = kstrtoull(buf, 0, &v);
2244 if (!ret) 2762 if (!ret)
@@ -2248,8 +2766,6 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
2248 ret = kstrtoll(buf, 0, &v); 2766 ret = kstrtoll(buf, 0, &v);
2249 if (!ret) 2767 if (!ret)
2250 ret = cft->write_s64(css, cft, v); 2768 ret = cft->write_s64(css, cft, v);
2251 } else if (cft->trigger) {
2252 ret = cft->trigger(css, (unsigned int)cft->private);
2253 } else { 2769 } else {
2254 ret = -EINVAL; 2770 ret = -EINVAL;
2255 } 2771 }
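After this hunk a cftype's ->write() method, when set, takes precedence over the typed helpers, and the removed ->write_string()/->trigger() users are expected to move to ->write(). A hedged sketch of the two registration styles that remain; the handler names and file names are placeholders.

#include <linux/cgroup.h>

/* Sketch: two cftype styles after this change. */
static ssize_t my_raw_write(struct kernfs_open_file *of, char *buf,
			    size_t nbytes, loff_t off)
{
	/* full control over parsing; return nbytes on success */
	return nbytes;
}

static int my_u64_write(struct cgroup_subsys_state *css, struct cftype *cft,
			u64 val)
{
	/* kstrtoull() already done by cgroup_file_write() */
	return 0;
}

static struct cftype my_files[] = {
	{
		.name = "example.raw",
		.write = my_raw_write,		/* takes precedence */
	},
	{
		.name = "example.value",
		.write_u64 = my_u64_write,	/* parsed as u64 by the core */
	},
	{ }	/* terminate */
};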
@@ -2326,20 +2842,18 @@ static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
2326 return -EPERM; 2842 return -EPERM;
2327 2843
2328 /* 2844 /*
2329 * We're gonna grab cgroup_tree_mutex which nests outside kernfs 2845 * We're gonna grab cgroup_mutex which nests outside kernfs
2330 * active_ref. kernfs_rename() doesn't require active_ref 2846 * active_ref. kernfs_rename() doesn't require active_ref
2331 * protection. Break them before grabbing cgroup_tree_mutex. 2847 * protection. Break them before grabbing cgroup_mutex.
2332 */ 2848 */
2333 kernfs_break_active_protection(new_parent); 2849 kernfs_break_active_protection(new_parent);
2334 kernfs_break_active_protection(kn); 2850 kernfs_break_active_protection(kn);
2335 2851
2336 mutex_lock(&cgroup_tree_mutex);
2337 mutex_lock(&cgroup_mutex); 2852 mutex_lock(&cgroup_mutex);
2338 2853
2339 ret = kernfs_rename(kn, new_parent, new_name_str); 2854 ret = kernfs_rename(kn, new_parent, new_name_str);
2340 2855
2341 mutex_unlock(&cgroup_mutex); 2856 mutex_unlock(&cgroup_mutex);
2342 mutex_unlock(&cgroup_tree_mutex);
2343 2857
2344 kernfs_unbreak_active_protection(kn); 2858 kernfs_unbreak_active_protection(kn);
2345 kernfs_unbreak_active_protection(new_parent); 2859 kernfs_unbreak_active_protection(new_parent);
@@ -2377,9 +2891,14 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
2377 return PTR_ERR(kn); 2891 return PTR_ERR(kn);
2378 2892
2379 ret = cgroup_kn_set_ugid(kn); 2893 ret = cgroup_kn_set_ugid(kn);
2380 if (ret) 2894 if (ret) {
2381 kernfs_remove(kn); 2895 kernfs_remove(kn);
2382 return ret; 2896 return ret;
2897 }
2898
2899 if (cft->seq_show == cgroup_populated_show)
2900 cgrp->populated_kn = kn;
2901 return 0;
2383} 2902}
2384 2903
2385/** 2904/**
@@ -2399,7 +2918,7 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
2399 struct cftype *cft; 2918 struct cftype *cft;
2400 int ret; 2919 int ret;
2401 2920
2402 lockdep_assert_held(&cgroup_tree_mutex); 2921 lockdep_assert_held(&cgroup_mutex);
2403 2922
2404 for (cft = cfts; cft->name[0] != '\0'; cft++) { 2923 for (cft = cfts; cft->name[0] != '\0'; cft++) {
2405 /* does cft->flags tell us to skip this file on @cgrp? */ 2924 /* does cft->flags tell us to skip this file on @cgrp? */
@@ -2407,16 +2926,16 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
2407 continue; 2926 continue;
2408 if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp)) 2927 if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp))
2409 continue; 2928 continue;
2410 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) 2929 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp))
2411 continue; 2930 continue;
2412 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) 2931 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp))
2413 continue; 2932 continue;
2414 2933
2415 if (is_add) { 2934 if (is_add) {
2416 ret = cgroup_add_file(cgrp, cft); 2935 ret = cgroup_add_file(cgrp, cft);
2417 if (ret) { 2936 if (ret) {
2418 pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n", 2937 pr_warn("%s: failed to add %s, err=%d\n",
2419 cft->name, ret); 2938 __func__, cft->name, ret);
2420 return ret; 2939 return ret;
2421 } 2940 }
2422 } else { 2941 } else {
@@ -2434,11 +2953,7 @@ static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
2434 struct cgroup_subsys_state *css; 2953 struct cgroup_subsys_state *css;
2435 int ret = 0; 2954 int ret = 0;
2436 2955
2437 lockdep_assert_held(&cgroup_tree_mutex); 2956 lockdep_assert_held(&cgroup_mutex);
2438
2439 /* don't bother if @ss isn't attached */
2440 if (ss->root == &cgrp_dfl_root)
2441 return 0;
2442 2957
2443 /* add/rm files for all cgroups created before */ 2958 /* add/rm files for all cgroups created before */
2444 css_for_each_descendant_pre(css, cgroup_css(root, ss)) { 2959 css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
@@ -2506,7 +3021,7 @@ static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2506 3021
2507static int cgroup_rm_cftypes_locked(struct cftype *cfts) 3022static int cgroup_rm_cftypes_locked(struct cftype *cfts)
2508{ 3023{
2509 lockdep_assert_held(&cgroup_tree_mutex); 3024 lockdep_assert_held(&cgroup_mutex);
2510 3025
2511 if (!cfts || !cfts[0].ss) 3026 if (!cfts || !cfts[0].ss)
2512 return -ENOENT; 3027 return -ENOENT;
@@ -2532,9 +3047,9 @@ int cgroup_rm_cftypes(struct cftype *cfts)
2532{ 3047{
2533 int ret; 3048 int ret;
2534 3049
2535 mutex_lock(&cgroup_tree_mutex); 3050 mutex_lock(&cgroup_mutex);
2536 ret = cgroup_rm_cftypes_locked(cfts); 3051 ret = cgroup_rm_cftypes_locked(cfts);
2537 mutex_unlock(&cgroup_tree_mutex); 3052 mutex_unlock(&cgroup_mutex);
2538 return ret; 3053 return ret;
2539} 3054}
2540 3055
@@ -2556,6 +3071,9 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2556{ 3071{
2557 int ret; 3072 int ret;
2558 3073
3074 if (ss->disabled)
3075 return 0;
3076
2559 if (!cfts || cfts[0].name[0] == '\0') 3077 if (!cfts || cfts[0].name[0] == '\0')
2560 return 0; 3078 return 0;
2561 3079
@@ -2563,14 +3081,14 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2563 if (ret) 3081 if (ret)
2564 return ret; 3082 return ret;
2565 3083
2566 mutex_lock(&cgroup_tree_mutex); 3084 mutex_lock(&cgroup_mutex);
2567 3085
2568 list_add_tail(&cfts->node, &ss->cfts); 3086 list_add_tail(&cfts->node, &ss->cfts);
2569 ret = cgroup_apply_cftypes(cfts, true); 3087 ret = cgroup_apply_cftypes(cfts, true);
2570 if (ret) 3088 if (ret)
2571 cgroup_rm_cftypes_locked(cfts); 3089 cgroup_rm_cftypes_locked(cfts);
2572 3090
2573 mutex_unlock(&cgroup_tree_mutex); 3091 mutex_unlock(&cgroup_mutex);
2574 return ret; 3092 return ret;
2575} 3093}
2576 3094
@@ -2594,57 +3112,65 @@ static int cgroup_task_count(const struct cgroup *cgrp)
2594 3112
2595/** 3113/**
2596 * css_next_child - find the next child of a given css 3114 * css_next_child - find the next child of a given css
2597 * @pos_css: the current position (%NULL to initiate traversal) 3115 * @pos: the current position (%NULL to initiate traversal)
2598 * @parent_css: css whose children to walk 3116 * @parent: css whose children to walk
2599 * 3117 *
2600 * This function returns the next child of @parent_css and should be called 3118 * This function returns the next child of @parent and should be called
2601 * under either cgroup_mutex or RCU read lock. The only requirement is 3119 * under either cgroup_mutex or RCU read lock. The only requirement is
2602 * that @parent_css and @pos_css are accessible. The next sibling is 3120 * that @parent and @pos are accessible. The next sibling is guaranteed to
2603 * guaranteed to be returned regardless of their states. 3121 * be returned regardless of their states.
3122 *
3123 * If a subsystem synchronizes ->css_online() and the start of iteration, a
3124 * css which finished ->css_online() is guaranteed to be visible in the
3125 * future iterations and will stay visible until the last reference is put.
3126 * A css which hasn't finished ->css_online() or already finished
3127 * ->css_offline() may show up during traversal. It's each subsystem's
3128 * responsibility to synchronize against on/offlining.
2604 */ 3129 */
2605struct cgroup_subsys_state * 3130struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
2606css_next_child(struct cgroup_subsys_state *pos_css, 3131 struct cgroup_subsys_state *parent)
2607 struct cgroup_subsys_state *parent_css)
2608{ 3132{
2609 struct cgroup *pos = pos_css ? pos_css->cgroup : NULL; 3133 struct cgroup_subsys_state *next;
2610 struct cgroup *cgrp = parent_css->cgroup;
2611 struct cgroup *next;
2612 3134
2613 cgroup_assert_mutexes_or_rcu_locked(); 3135 cgroup_assert_mutex_or_rcu_locked();
2614 3136
2615 /* 3137 /*
2616 * @pos could already have been removed. Once a cgroup is removed, 3138 * @pos could already have been unlinked from the sibling list.
2617 * its ->sibling.next is no longer updated when its next sibling 3139 * Once a cgroup is removed, its ->sibling.next is no longer
2618 * changes. As CGRP_DEAD assertion is serialized and happens 3140 * updated when its next sibling changes. CSS_RELEASED is set when
2619 * before the cgroup is taken off the ->sibling list, if we see it 3141 * @pos is taken off list, at which time its next pointer is valid,
2620 * unasserted, it's guaranteed that the next sibling hasn't 3142 * and, as releases are serialized, the one pointed to by the next
2621 * finished its grace period even if it's already removed, and thus 3143 * pointer is guaranteed to not have started release yet. This
2622 * safe to dereference from this RCU critical section. If 3144 * implies that if we observe !CSS_RELEASED on @pos in this RCU
2623 * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed 3145 * critical section, the one pointed to by its next pointer is
2624 * to be visible as %true here. 3147 * have dropped rcu_read_lock() in between iterations.
3147 * have dropped rcu_read_lock() inbetween iterations.
2625 * 3148 *
2626 * If @pos is dead, its next pointer can't be dereferenced; 3149 * If @pos has CSS_RELEASED set, its next pointer can't be
2627 * however, as each cgroup is given a monotonically increasing 3150 * dereferenced; however, as each css is given a monotonically
2628 * unique serial number and always appended to the sibling list, 3151 * increasing unique serial number and always appended to the
2629 * the next one can be found by walking the parent's children until 3152 * sibling list, the next one can be found by walking the parent's
2630 * we see a cgroup with higher serial number than @pos's. While 3153 * children until the first css with higher serial number than
2631 * this path can be slower, it's taken only when either the current 3154 * @pos's. While this path can be slower, it happens iff iteration
2632 * cgroup is removed or iteration and removal race. 3155 * races against release and the race window is very small.
2633 */ 3156 */
2634 if (!pos) { 3157 if (!pos) {
2635 next = list_entry_rcu(cgrp->children.next, struct cgroup, sibling); 3158 next = list_entry_rcu(parent->children.next, struct cgroup_subsys_state, sibling);
2636 } else if (likely(!cgroup_is_dead(pos))) { 3159 } else if (likely(!(pos->flags & CSS_RELEASED))) {
2637 next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); 3160 next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling);
2638 } else { 3161 } else {
2639 list_for_each_entry_rcu(next, &cgrp->children, sibling) 3162 list_for_each_entry_rcu(next, &parent->children, sibling)
2640 if (next->serial_nr > pos->serial_nr) 3163 if (next->serial_nr > pos->serial_nr)
2641 break; 3164 break;
2642 } 3165 }
2643 3166
2644 if (&next->sibling == &cgrp->children) 3167 /*
2645 return NULL; 3168 * @next, if not pointing to the head, can be dereferenced and is
2646 3169 * the next sibling.
2647 return cgroup_css(next, parent_css->ss); 3170 */
3171 if (&next->sibling != &parent->children)
3172 return next;
3173 return NULL;
2648} 3174}
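css_next_child() is normally consumed through the css_for_each_child() wrapper, and per the comment above the walk itself only needs the cursor and parent to stay accessible, so an RCU read lock is sufficient. A hedged usage sketch; do_something_with() is a placeholder.

#include <linux/cgroup.h>
#include <linux/rcupdate.h>

static void do_something_with(struct cgroup_subsys_state *child)
{
	/* placeholder for per-child work */
}

/* Sketch: iterate the children of @parent under RCU. */
static void visit_children(struct cgroup_subsys_state *parent)
{
	struct cgroup_subsys_state *child;

	rcu_read_lock();
	css_for_each_child(child, parent) {
		/*
		 * @child may be between ->css_online() and ->css_offline();
		 * a real caller synchronizes against that as the comment
		 * above describes.
		 */
		do_something_with(child);
	}
	rcu_read_unlock();
}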
2649 3175
2650/** 3176/**
@@ -2660,6 +3186,13 @@ css_next_child(struct cgroup_subsys_state *pos_css,
2660 * doesn't require the whole traversal to be contained in a single critical 3186 * doesn't require the whole traversal to be contained in a single critical
2661 * section. This function will return the correct next descendant as long 3187 * section. This function will return the correct next descendant as long
2662 * as both @pos and @root are accessible and @pos is a descendant of @root. 3188 * as both @pos and @root are accessible and @pos is a descendant of @root.
3189 *
3190 * If a subsystem synchronizes ->css_online() and the start of iteration, a
3191 * css which finished ->css_online() is guaranteed to be visible in the
3192 * future iterations and will stay visible until the last reference is put.
3193 * A css which hasn't finished ->css_online() or already finished
3194 * ->css_offline() may show up during traversal. It's each subsystem's
3195 * responsibility to synchronize against on/offlining.
2663 */ 3196 */
2664struct cgroup_subsys_state * 3197struct cgroup_subsys_state *
2665css_next_descendant_pre(struct cgroup_subsys_state *pos, 3198css_next_descendant_pre(struct cgroup_subsys_state *pos,
@@ -2667,7 +3200,7 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos,
2667{ 3200{
2668 struct cgroup_subsys_state *next; 3201 struct cgroup_subsys_state *next;
2669 3202
2670 cgroup_assert_mutexes_or_rcu_locked(); 3203 cgroup_assert_mutex_or_rcu_locked();
2671 3204
2672 /* if first iteration, visit @root */ 3205 /* if first iteration, visit @root */
2673 if (!pos) 3206 if (!pos)
@@ -2680,10 +3213,10 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos,
2680 3213
2681 /* no child, visit my or the closest ancestor's next sibling */ 3214 /* no child, visit my or the closest ancestor's next sibling */
2682 while (pos != root) { 3215 while (pos != root) {
2683 next = css_next_child(pos, css_parent(pos)); 3216 next = css_next_child(pos, pos->parent);
2684 if (next) 3217 if (next)
2685 return next; 3218 return next;
2686 pos = css_parent(pos); 3219 pos = pos->parent;
2687 } 3220 }
2688 3221
2689 return NULL; 3222 return NULL;
@@ -2707,7 +3240,7 @@ css_rightmost_descendant(struct cgroup_subsys_state *pos)
2707{ 3240{
2708 struct cgroup_subsys_state *last, *tmp; 3241 struct cgroup_subsys_state *last, *tmp;
2709 3242
2710 cgroup_assert_mutexes_or_rcu_locked(); 3243 cgroup_assert_mutex_or_rcu_locked();
2711 3244
2712 do { 3245 do {
2713 last = pos; 3246 last = pos;
@@ -2747,6 +3280,13 @@ css_leftmost_descendant(struct cgroup_subsys_state *pos)
2747 * section. This function will return the correct next descendant as long 3280 * section. This function will return the correct next descendant as long
2748 * as both @pos and @cgroup are accessible and @pos is a descendant of 3281 * as both @pos and @cgroup are accessible and @pos is a descendant of
2749 * @cgroup. 3282 * @cgroup.
3283 *
3284 * If a subsystem synchronizes ->css_online() and the start of iteration, a
3285 * css which finished ->css_online() is guaranteed to be visible in the
3286 * future iterations and will stay visible until the last reference is put.
3287 * A css which hasn't finished ->css_online() or already finished
3288 * ->css_offline() may show up during traversal. It's each subsystem's
3289 * responsibility to synchronize against on/offlining.
2750 */ 3290 */
2751struct cgroup_subsys_state * 3291struct cgroup_subsys_state *
2752css_next_descendant_post(struct cgroup_subsys_state *pos, 3292css_next_descendant_post(struct cgroup_subsys_state *pos,
@@ -2754,7 +3294,7 @@ css_next_descendant_post(struct cgroup_subsys_state *pos,
2754{ 3294{
2755 struct cgroup_subsys_state *next; 3295 struct cgroup_subsys_state *next;
2756 3296
2757 cgroup_assert_mutexes_or_rcu_locked(); 3297 cgroup_assert_mutex_or_rcu_locked();
2758 3298
2759 /* if first iteration, visit leftmost descendant which may be @root */ 3299 /* if first iteration, visit leftmost descendant which may be @root */
2760 if (!pos) 3300 if (!pos)
@@ -2765,12 +3305,36 @@ css_next_descendant_post(struct cgroup_subsys_state *pos,
2765 return NULL; 3305 return NULL;
2766 3306
2767 /* if there's an unvisited sibling, visit its leftmost descendant */ 3307 /* if there's an unvisited sibling, visit its leftmost descendant */
2768 next = css_next_child(pos, css_parent(pos)); 3308 next = css_next_child(pos, pos->parent);
2769 if (next) 3309 if (next)
2770 return css_leftmost_descendant(next); 3310 return css_leftmost_descendant(next);
2771 3311
2772 /* no sibling left, visit parent */ 3312 /* no sibling left, visit parent */
2773 return css_parent(pos); 3313 return pos->parent;
3314}
3315
3316/**
3317 * css_has_online_children - does a css have online children
3318 * @css: the target css
3319 *
3320 * Returns %true if @css has any online children; otherwise, %false. This
3321 * function can be called from any context but the caller is responsible
3322 * for synchronizing against on/offlining as necessary.
3323 */
3324bool css_has_online_children(struct cgroup_subsys_state *css)
3325{
3326 struct cgroup_subsys_state *child;
3327 bool ret = false;
3328
3329 rcu_read_lock();
3330 css_for_each_child(child, css) {
3331 if (child->flags & CSS_ONLINE) {
3332 ret = true;
3333 break;
3334 }
3335 }
3336 rcu_read_unlock();
3337 return ret;
2774} 3338}
2775 3339
2776/** 3340/**
@@ -2781,27 +3345,36 @@ css_next_descendant_post(struct cgroup_subsys_state *pos,
2781 */ 3345 */
2782static void css_advance_task_iter(struct css_task_iter *it) 3346static void css_advance_task_iter(struct css_task_iter *it)
2783{ 3347{
2784 struct list_head *l = it->cset_link; 3348 struct list_head *l = it->cset_pos;
2785 struct cgrp_cset_link *link; 3349 struct cgrp_cset_link *link;
2786 struct css_set *cset; 3350 struct css_set *cset;
2787 3351
2788 /* Advance to the next non-empty css_set */ 3352 /* Advance to the next non-empty css_set */
2789 do { 3353 do {
2790 l = l->next; 3354 l = l->next;
2791 if (l == &it->origin_css->cgroup->cset_links) { 3355 if (l == it->cset_head) {
2792 it->cset_link = NULL; 3356 it->cset_pos = NULL;
2793 return; 3357 return;
2794 } 3358 }
2795 link = list_entry(l, struct cgrp_cset_link, cset_link); 3359
2796 cset = link->cset; 3360 if (it->ss) {
3361 cset = container_of(l, struct css_set,
3362 e_cset_node[it->ss->id]);
3363 } else {
3364 link = list_entry(l, struct cgrp_cset_link, cset_link);
3365 cset = link->cset;
3366 }
2797 } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks)); 3367 } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks));
2798 3368
2799 it->cset_link = l; 3369 it->cset_pos = l;
2800 3370
2801 if (!list_empty(&cset->tasks)) 3371 if (!list_empty(&cset->tasks))
2802 it->task = cset->tasks.next; 3372 it->task_pos = cset->tasks.next;
2803 else 3373 else
2804 it->task = cset->mg_tasks.next; 3374 it->task_pos = cset->mg_tasks.next;
3375
3376 it->tasks_head = &cset->tasks;
3377 it->mg_tasks_head = &cset->mg_tasks;
2805} 3378}
2806 3379
2807/** 3380/**
@@ -2827,8 +3400,14 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
2827 3400
2828 down_read(&css_set_rwsem); 3401 down_read(&css_set_rwsem);
2829 3402
2830 it->origin_css = css; 3403 it->ss = css->ss;
2831 it->cset_link = &css->cgroup->cset_links; 3404
3405 if (it->ss)
3406 it->cset_pos = &css->cgroup->e_csets[css->ss->id];
3407 else
3408 it->cset_pos = &css->cgroup->cset_links;
3409
3410 it->cset_head = it->cset_pos;
2832 3411
2833 css_advance_task_iter(it); 3412 css_advance_task_iter(it);
2834} 3413}
@@ -2844,12 +3423,10 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
2844struct task_struct *css_task_iter_next(struct css_task_iter *it) 3423struct task_struct *css_task_iter_next(struct css_task_iter *it)
2845{ 3424{
2846 struct task_struct *res; 3425 struct task_struct *res;
2847 struct list_head *l = it->task; 3426 struct list_head *l = it->task_pos;
2848 struct cgrp_cset_link *link = list_entry(it->cset_link,
2849 struct cgrp_cset_link, cset_link);
2850 3427
2851 /* If the iterator cg is NULL, we have no tasks */ 3428 /* If the iterator cg is NULL, we have no tasks */
2852 if (!it->cset_link) 3429 if (!it->cset_pos)
2853 return NULL; 3430 return NULL;
2854 res = list_entry(l, struct task_struct, cg_list); 3431 res = list_entry(l, struct task_struct, cg_list);
2855 3432
@@ -2860,13 +3437,13 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
2860 */ 3437 */
2861 l = l->next; 3438 l = l->next;
2862 3439
2863 if (l == &link->cset->tasks) 3440 if (l == it->tasks_head)
2864 l = link->cset->mg_tasks.next; 3441 l = it->mg_tasks_head->next;
2865 3442
2866 if (l == &link->cset->mg_tasks) 3443 if (l == it->mg_tasks_head)
2867 css_advance_task_iter(it); 3444 css_advance_task_iter(it);
2868 else 3445 else
2869 it->task = l; 3446 it->task_pos = l;
2870 3447
2871 return res; 3448 return res;
2872} 3449}
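The renamed iterator fields do not change the calling convention; the pattern is still start/next/end, and with this series the iterator also follows the effective csets when @css belongs to a specific subsystem. A hedged usage sketch; the per-task work (a simple count) is a placeholder.

#include <linux/cgroup.h>
#include <linux/printk.h>
#include <linux/sched.h>

/* Sketch: walk every task associated with @css. */
static void count_css_tasks(struct cgroup_subsys_state *css)
{
	struct css_task_iter it;
	struct task_struct *task;
	int n = 0;

	css_task_iter_start(css, &it);
	while ((task = css_task_iter_next(&it)))
		n++;
	css_task_iter_end(&it);

	pr_info("cgroup sketch: %d tasks\n", n);
}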
@@ -2919,7 +3496,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
2919 * ->can_attach() fails. 3496 * ->can_attach() fails.
2920 */ 3497 */
2921 do { 3498 do {
2922 css_task_iter_start(&from->dummy_css, &it); 3499 css_task_iter_start(&from->self, &it);
2923 task = css_task_iter_next(&it); 3500 task = css_task_iter_next(&it);
2924 if (task) 3501 if (task)
2925 get_task_struct(task); 3502 get_task_struct(task);
@@ -3184,7 +3761,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3184 if (!array) 3761 if (!array)
3185 return -ENOMEM; 3762 return -ENOMEM;
3186 /* now, populate the array */ 3763 /* now, populate the array */
3187 css_task_iter_start(&cgrp->dummy_css, &it); 3764 css_task_iter_start(&cgrp->self, &it);
3188 while ((tsk = css_task_iter_next(&it))) { 3765 while ((tsk = css_task_iter_next(&it))) {
3189 if (unlikely(n == length)) 3766 if (unlikely(n == length))
3190 break; 3767 break;
@@ -3246,7 +3823,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3246 3823
3247 /* 3824 /*
3248 * We aren't being called from kernfs and there's no guarantee on 3825 * We aren't being called from kernfs and there's no guarantee on
3249 * @kn->priv's validity. For this and css_tryget_from_dir(), 3826 * @kn->priv's validity. For this and css_tryget_online_from_dir(),
3250 * @kn->priv is RCU safe. Let's do the RCU dancing. 3827 * @kn->priv is RCU safe. Let's do the RCU dancing.
3251 */ 3828 */
3252 rcu_read_lock(); 3829 rcu_read_lock();
@@ -3258,7 +3835,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3258 } 3835 }
3259 rcu_read_unlock(); 3836 rcu_read_unlock();
3260 3837
3261 css_task_iter_start(&cgrp->dummy_css, &it); 3838 css_task_iter_start(&cgrp->self, &it);
3262 while ((tsk = css_task_iter_next(&it))) { 3839 while ((tsk = css_task_iter_next(&it))) {
3263 switch (tsk->state) { 3840 switch (tsk->state) {
3264 case TASK_RUNNING: 3841 case TASK_RUNNING:
@@ -3388,17 +3965,6 @@ static int cgroup_pidlist_show(struct seq_file *s, void *v)
3388 return seq_printf(s, "%d\n", *(int *)v); 3965 return seq_printf(s, "%d\n", *(int *)v);
3389} 3966}
3390 3967
3391/*
3392 * seq_operations functions for iterating on pidlists through seq_file -
3393 * independent of whether it's tasks or procs
3394 */
3395static const struct seq_operations cgroup_pidlist_seq_operations = {
3396 .start = cgroup_pidlist_start,
3397 .stop = cgroup_pidlist_stop,
3398 .next = cgroup_pidlist_next,
3399 .show = cgroup_pidlist_show,
3400};
3401
3402static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, 3968static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
3403 struct cftype *cft) 3969 struct cftype *cft)
3404{ 3970{
@@ -3440,7 +4006,7 @@ static struct cftype cgroup_base_files[] = {
3440 .seq_stop = cgroup_pidlist_stop, 4006 .seq_stop = cgroup_pidlist_stop,
3441 .seq_show = cgroup_pidlist_show, 4007 .seq_show = cgroup_pidlist_show,
3442 .private = CGROUP_FILE_PROCS, 4008 .private = CGROUP_FILE_PROCS,
3443 .write_u64 = cgroup_procs_write, 4009 .write = cgroup_procs_write,
3444 .mode = S_IRUGO | S_IWUSR, 4010 .mode = S_IRUGO | S_IWUSR,
3445 }, 4011 },
3446 { 4012 {
@@ -3454,6 +4020,27 @@ static struct cftype cgroup_base_files[] = {
3454 .flags = CFTYPE_ONLY_ON_ROOT, 4020 .flags = CFTYPE_ONLY_ON_ROOT,
3455 .seq_show = cgroup_sane_behavior_show, 4021 .seq_show = cgroup_sane_behavior_show,
3456 }, 4022 },
4023 {
4024 .name = "cgroup.controllers",
4025 .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_ONLY_ON_ROOT,
4026 .seq_show = cgroup_root_controllers_show,
4027 },
4028 {
4029 .name = "cgroup.controllers",
4030 .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT,
4031 .seq_show = cgroup_controllers_show,
4032 },
4033 {
4034 .name = "cgroup.subtree_control",
4035 .flags = CFTYPE_ONLY_ON_DFL,
4036 .seq_show = cgroup_subtree_control_show,
4037 .write = cgroup_subtree_control_write,
4038 },
4039 {
4040 .name = "cgroup.populated",
4041 .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT,
4042 .seq_show = cgroup_populated_show,
4043 },
3457 4044
3458 /* 4045 /*
3459 * Historical crazy stuff. These don't have "cgroup." prefix and 4046 * Historical crazy stuff. These don't have "cgroup." prefix and
@@ -3468,7 +4055,7 @@ static struct cftype cgroup_base_files[] = {
3468 .seq_stop = cgroup_pidlist_stop, 4055 .seq_stop = cgroup_pidlist_stop,
3469 .seq_show = cgroup_pidlist_show, 4056 .seq_show = cgroup_pidlist_show,
3470 .private = CGROUP_FILE_TASKS, 4057 .private = CGROUP_FILE_TASKS,
3471 .write_u64 = cgroup_tasks_write, 4058 .write = cgroup_tasks_write,
3472 .mode = S_IRUGO | S_IWUSR, 4059 .mode = S_IRUGO | S_IWUSR,
3473 }, 4060 },
3474 { 4061 {
@@ -3481,7 +4068,7 @@ static struct cftype cgroup_base_files[] = {
3481 .name = "release_agent", 4068 .name = "release_agent",
3482 .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, 4069 .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT,
3483 .seq_show = cgroup_release_agent_show, 4070 .seq_show = cgroup_release_agent_show,
3484 .write_string = cgroup_release_agent_write, 4071 .write = cgroup_release_agent_write,
3485 .max_write_len = PATH_MAX - 1, 4072 .max_write_len = PATH_MAX - 1,
3486 }, 4073 },
3487 { } /* terminate */ 4074 { } /* terminate */
@@ -3494,7 +4081,7 @@ static struct cftype cgroup_base_files[] = {
3494 * 4081 *
3495 * On failure, no file is added. 4082 * On failure, no file is added.
3496 */ 4083 */
3497static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) 4084static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask)
3498{ 4085{
3499 struct cgroup_subsys *ss; 4086 struct cgroup_subsys *ss;
3500 int i, ret = 0; 4087 int i, ret = 0;
@@ -3503,7 +4090,7 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
3503 for_each_subsys(ss, i) { 4090 for_each_subsys(ss, i) {
3504 struct cftype *cfts; 4091 struct cftype *cfts;
3505 4092
3506 if (!test_bit(i, &subsys_mask)) 4093 if (!(subsys_mask & (1 << i)))
3507 continue; 4094 continue;
3508 4095
3509 list_for_each_entry(cfts, &ss->cfts, node) { 4096 list_for_each_entry(cfts, &ss->cfts, node) {
@@ -3525,9 +4112,9 @@ err:
3525 * Implemented in kill_css(). 4112 * Implemented in kill_css().
3526 * 4113 *
3527 * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs 4114 * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs
3528 * and thus css_tryget() is guaranteed to fail, the css can be offlined 4115 * and thus css_tryget_online() is guaranteed to fail, the css can be
3529 * by invoking offline_css(). After offlining, the base ref is put. 4116 * offlined by invoking offline_css(). After offlining, the base ref is
3530 * Implemented in css_killed_work_fn(). 4117 * put. Implemented in css_killed_work_fn().
3531 * 4118 *
3532 * 3. When the percpu_ref reaches zero, the only possible remaining 4119 * 3. When the percpu_ref reaches zero, the only possible remaining
3533 * accessors are inside RCU read sections. css_release() schedules the 4120 * accessors are inside RCU read sections. css_release() schedules the
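
The three steps described above are plain percpu_ref usage; the cgroup-specific part is only where the offline and free work gets queued. A compressed sketch of the same life cycle on a stand-in object (struct obj and the obj_* names are illustrative; the percpu_ref_* and RCU calls are the ones the patch uses):

#include <linux/kernel.h>
#include <linux/percpu-refcount.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct obj {
        struct percpu_ref refcnt;
        struct rcu_head rcu_head;
};

static void obj_free_rcu(struct rcu_head *rcu)
{
        kfree(container_of(rcu, struct obj, rcu_head)); /* step 3: free after a grace period */
}

static void obj_release(struct percpu_ref *ref)         /* last reference dropped */
{
        call_rcu(&container_of(ref, struct obj, refcnt)->rcu_head, obj_free_rcu);
}

static void obj_confirm_kill(struct percpu_ref *ref)    /* step 2: tryget now fails everywhere */
{
        /* real code bounces to a workqueue here, as css_killed_ref_fn() does */
}

static struct obj *obj_create(void)
{
        struct obj *obj = kzalloc(sizeof(*obj), GFP_KERNEL);

        if (!obj)
                return NULL;
        if (percpu_ref_init(&obj->refcnt, obj_release)) {       /* two-argument form in this kernel */
                kfree(obj);
                return NULL;
        }
        return obj;
}

static void obj_kill(struct obj *obj)                   /* step 1 */
{
        /* drops the base reference taken by percpu_ref_init() */
        percpu_ref_kill_and_confirm(&obj->refcnt, obj_confirm_kill);
}
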
@@ -3546,11 +4133,37 @@ static void css_free_work_fn(struct work_struct *work)
3546 container_of(work, struct cgroup_subsys_state, destroy_work); 4133 container_of(work, struct cgroup_subsys_state, destroy_work);
3547 struct cgroup *cgrp = css->cgroup; 4134 struct cgroup *cgrp = css->cgroup;
3548 4135
3549 if (css->parent) 4136 if (css->ss) {
3550 css_put(css->parent); 4137 /* css free path */
4138 if (css->parent)
4139 css_put(css->parent);
3551 4140
3552 css->ss->css_free(css); 4141 css->ss->css_free(css);
3553 cgroup_put(cgrp); 4142 cgroup_put(cgrp);
4143 } else {
4144 /* cgroup free path */
4145 atomic_dec(&cgrp->root->nr_cgrps);
4146 cgroup_pidlist_destroy_all(cgrp);
4147
4148 if (cgroup_parent(cgrp)) {
4149 /*
4150 * We get a ref to the parent, and put the ref when
4151 * this cgroup is being freed, so it's guaranteed
4152 * that the parent won't be destroyed before its
4153 * children.
4154 */
4155 cgroup_put(cgroup_parent(cgrp));
4156 kernfs_put(cgrp->kn);
4157 kfree(cgrp);
4158 } else {
4159 /*
4160 * This is root cgroup's refcnt reaching zero,
4161 * which indicates that the root should be
4162 * released.
4163 */
4164 cgroup_destroy_root(cgrp->root);
4165 }
4166 }
3554} 4167}
3555 4168
3556static void css_free_rcu_fn(struct rcu_head *rcu_head) 4169static void css_free_rcu_fn(struct rcu_head *rcu_head)
@@ -3562,26 +4175,59 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head)
3562 queue_work(cgroup_destroy_wq, &css->destroy_work); 4175 queue_work(cgroup_destroy_wq, &css->destroy_work);
3563} 4176}
3564 4177
4178static void css_release_work_fn(struct work_struct *work)
4179{
4180 struct cgroup_subsys_state *css =
4181 container_of(work, struct cgroup_subsys_state, destroy_work);
4182 struct cgroup_subsys *ss = css->ss;
4183 struct cgroup *cgrp = css->cgroup;
4184
4185 mutex_lock(&cgroup_mutex);
4186
4187 css->flags |= CSS_RELEASED;
4188 list_del_rcu(&css->sibling);
4189
4190 if (ss) {
4191 /* css release path */
4192 cgroup_idr_remove(&ss->css_idr, css->id);
4193 } else {
4194 /* cgroup release path */
4195 cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
4196 cgrp->id = -1;
4197 }
4198
4199 mutex_unlock(&cgroup_mutex);
4200
4201 call_rcu(&css->rcu_head, css_free_rcu_fn);
4202}
4203
3565static void css_release(struct percpu_ref *ref) 4204static void css_release(struct percpu_ref *ref)
3566{ 4205{
3567 struct cgroup_subsys_state *css = 4206 struct cgroup_subsys_state *css =
3568 container_of(ref, struct cgroup_subsys_state, refcnt); 4207 container_of(ref, struct cgroup_subsys_state, refcnt);
3569 4208
3570 RCU_INIT_POINTER(css->cgroup->subsys[css->ss->id], NULL); 4209 INIT_WORK(&css->destroy_work, css_release_work_fn);
3571 call_rcu(&css->rcu_head, css_free_rcu_fn); 4210 queue_work(cgroup_destroy_wq, &css->destroy_work);
3572} 4211}
3573 4212
3574static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, 4213static void init_and_link_css(struct cgroup_subsys_state *css,
3575 struct cgroup *cgrp) 4214 struct cgroup_subsys *ss, struct cgroup *cgrp)
3576{ 4215{
4216 lockdep_assert_held(&cgroup_mutex);
4217
4218 cgroup_get(cgrp);
4219
4220 memset(css, 0, sizeof(*css));
3577 css->cgroup = cgrp; 4221 css->cgroup = cgrp;
3578 css->ss = ss; 4222 css->ss = ss;
3579 css->flags = 0; 4223 INIT_LIST_HEAD(&css->sibling);
4224 INIT_LIST_HEAD(&css->children);
4225 css->serial_nr = css_serial_nr_next++;
3580 4226
3581 if (cgrp->parent) 4227 if (cgroup_parent(cgrp)) {
3582 css->parent = cgroup_css(cgrp->parent, ss); 4228 css->parent = cgroup_css(cgroup_parent(cgrp), ss);
3583 else 4229 css_get(css->parent);
3584 css->flags |= CSS_ROOT; 4230 }
3585 4231
3586 BUG_ON(cgroup_css(cgrp, ss)); 4232 BUG_ON(cgroup_css(cgrp, ss));
3587} 4233}
@@ -3592,14 +4238,12 @@ static int online_css(struct cgroup_subsys_state *css)
3592 struct cgroup_subsys *ss = css->ss; 4238 struct cgroup_subsys *ss = css->ss;
3593 int ret = 0; 4239 int ret = 0;
3594 4240
3595 lockdep_assert_held(&cgroup_tree_mutex);
3596 lockdep_assert_held(&cgroup_mutex); 4241 lockdep_assert_held(&cgroup_mutex);
3597 4242
3598 if (ss->css_online) 4243 if (ss->css_online)
3599 ret = ss->css_online(css); 4244 ret = ss->css_online(css);
3600 if (!ret) { 4245 if (!ret) {
3601 css->flags |= CSS_ONLINE; 4246 css->flags |= CSS_ONLINE;
3602 css->cgroup->nr_css++;
3603 rcu_assign_pointer(css->cgroup->subsys[ss->id], css); 4247 rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
3604 } 4248 }
3605 return ret; 4249 return ret;
@@ -3610,7 +4254,6 @@ static void offline_css(struct cgroup_subsys_state *css)
3610{ 4254{
3611 struct cgroup_subsys *ss = css->ss; 4255 struct cgroup_subsys *ss = css->ss;
3612 4256
3613 lockdep_assert_held(&cgroup_tree_mutex);
3614 lockdep_assert_held(&cgroup_mutex); 4257 lockdep_assert_held(&cgroup_mutex);
3615 4258
3616 if (!(css->flags & CSS_ONLINE)) 4259 if (!(css->flags & CSS_ONLINE))
@@ -3620,8 +4263,9 @@ static void offline_css(struct cgroup_subsys_state *css)
3620 ss->css_offline(css); 4263 ss->css_offline(css);
3621 4264
3622 css->flags &= ~CSS_ONLINE; 4265 css->flags &= ~CSS_ONLINE;
3623 css->cgroup->nr_css--; 4266 RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);
3624 RCU_INIT_POINTER(css->cgroup->subsys[ss->id], css); 4267
4268 wake_up_all(&css->cgroup->offline_waitq);
3625} 4269}
3626 4270
3627/** 4271/**
@@ -3635,111 +4279,102 @@ static void offline_css(struct cgroup_subsys_state *css)
3635 */ 4279 */
3636static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) 4280static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
3637{ 4281{
3638 struct cgroup *parent = cgrp->parent; 4282 struct cgroup *parent = cgroup_parent(cgrp);
4283 struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss);
3639 struct cgroup_subsys_state *css; 4284 struct cgroup_subsys_state *css;
3640 int err; 4285 int err;
3641 4286
3642 lockdep_assert_held(&cgroup_mutex); 4287 lockdep_assert_held(&cgroup_mutex);
3643 4288
3644 css = ss->css_alloc(cgroup_css(parent, ss)); 4289 css = ss->css_alloc(parent_css);
3645 if (IS_ERR(css)) 4290 if (IS_ERR(css))
3646 return PTR_ERR(css); 4291 return PTR_ERR(css);
3647 4292
4293 init_and_link_css(css, ss, cgrp);
4294
3648 err = percpu_ref_init(&css->refcnt, css_release); 4295 err = percpu_ref_init(&css->refcnt, css_release);
3649 if (err) 4296 if (err)
3650 goto err_free_css; 4297 goto err_free_css;
3651 4298
3652 init_css(css, ss, cgrp); 4299 err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_NOWAIT);
4300 if (err < 0)
4301 goto err_free_percpu_ref;
4302 css->id = err;
3653 4303
3654 err = cgroup_populate_dir(cgrp, 1 << ss->id); 4304 err = cgroup_populate_dir(cgrp, 1 << ss->id);
3655 if (err) 4305 if (err)
3656 goto err_free_percpu_ref; 4306 goto err_free_id;
4307
4308 /* @css is ready to be brought online now, make it visible */
4309 list_add_tail_rcu(&css->sibling, &parent_css->children);
4310 cgroup_idr_replace(&ss->css_idr, css, css->id);
3657 4311
3658 err = online_css(css); 4312 err = online_css(css);
3659 if (err) 4313 if (err)
3660 goto err_clear_dir; 4314 goto err_list_del;
3661
3662 cgroup_get(cgrp);
3663 css_get(css->parent);
3664
3665 cgrp->subsys_mask |= 1 << ss->id;
3666 4315
3667 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && 4316 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
3668 parent->parent) { 4317 cgroup_parent(parent)) {
3669 pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", 4318 pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
3670 current->comm, current->pid, ss->name); 4319 current->comm, current->pid, ss->name);
3671 if (!strcmp(ss->name, "memory")) 4320 if (!strcmp(ss->name, "memory"))
3672 pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n"); 4321 pr_warn("\"memory\" requires setting use_hierarchy to 1 on the root\n");
3673 ss->warned_broken_hierarchy = true; 4322 ss->warned_broken_hierarchy = true;
3674 } 4323 }
3675 4324
3676 return 0; 4325 return 0;
3677 4326
3678err_clear_dir: 4327err_list_del:
4328 list_del_rcu(&css->sibling);
3679 cgroup_clear_dir(css->cgroup, 1 << css->ss->id); 4329 cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
4330err_free_id:
4331 cgroup_idr_remove(&ss->css_idr, css->id);
3680err_free_percpu_ref: 4332err_free_percpu_ref:
3681 percpu_ref_cancel_init(&css->refcnt); 4333 percpu_ref_cancel_init(&css->refcnt);
3682err_free_css: 4334err_free_css:
3683 ss->css_free(css); 4335 call_rcu(&css->rcu_head, css_free_rcu_fn);
3684 return err; 4336 return err;
3685} 4337}
3686 4338
3687/** 4339static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
3688 * cgroup_create - create a cgroup 4340 umode_t mode)
3689 * @parent: cgroup that will be parent of the new cgroup
3690 * @name: name of the new cgroup
3691 * @mode: mode to set on new cgroup
3692 */
3693static long cgroup_create(struct cgroup *parent, const char *name,
3694 umode_t mode)
3695{ 4341{
3696 struct cgroup *cgrp; 4342 struct cgroup *parent, *cgrp;
3697 struct cgroup_root *root = parent->root; 4343 struct cgroup_root *root;
3698 int ssid, err;
3699 struct cgroup_subsys *ss; 4344 struct cgroup_subsys *ss;
3700 struct kernfs_node *kn; 4345 struct kernfs_node *kn;
4346 int ssid, ret;
3701 4347
3702 /* 4348 parent = cgroup_kn_lock_live(parent_kn);
3703 * XXX: The default hierarchy isn't fully implemented yet. Block 4349 if (!parent)
3704 * !root cgroup creation on it for now. 4350 return -ENODEV;
3705 */ 4351 root = parent->root;
3706 if (root == &cgrp_dfl_root)
3707 return -EINVAL;
3708 4352
3709 /* allocate the cgroup and its ID, 0 is reserved for the root */ 4353 /* allocate the cgroup and its ID, 0 is reserved for the root */
3710 cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); 4354 cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
3711 if (!cgrp) 4355 if (!cgrp) {
3712 return -ENOMEM; 4356 ret = -ENOMEM;
3713 4357 goto out_unlock;
3714 mutex_lock(&cgroup_tree_mutex);
3715
3716 /*
3717 * Only live parents can have children. Note that the liveliness
3718 * check isn't strictly necessary because cgroup_mkdir() and
3719 * cgroup_rmdir() are fully synchronized by i_mutex; however, do it
3720 * anyway so that locking is contained inside cgroup proper and we
3721 * don't get nasty surprises if we ever grow another caller.
3722 */
3723 if (!cgroup_lock_live_group(parent)) {
3724 err = -ENODEV;
3725 goto err_unlock_tree;
3726 } 4358 }
3727 4359
4360 ret = percpu_ref_init(&cgrp->self.refcnt, css_release);
4361 if (ret)
4362 goto out_free_cgrp;
4363
3728 /* 4364 /*
3729 * Temporarily set the pointer to NULL, so idr_find() won't return 4365 * Temporarily set the pointer to NULL, so idr_find() won't return
3730 * a half-baked cgroup. 4366 * a half-baked cgroup.
3731 */ 4367 */
3732 cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL); 4368 cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_NOWAIT);
3733 if (cgrp->id < 0) { 4369 if (cgrp->id < 0) {
3734 err = -ENOMEM; 4370 ret = -ENOMEM;
3735 goto err_unlock; 4371 goto out_cancel_ref;
3736 } 4372 }
3737 4373
3738 init_cgroup_housekeeping(cgrp); 4374 init_cgroup_housekeeping(cgrp);
3739 4375
3740 cgrp->parent = parent; 4376 cgrp->self.parent = &parent->self;
3741 cgrp->dummy_css.parent = &parent->dummy_css; 4377 cgrp->root = root;
3742 cgrp->root = parent->root;
3743 4378
3744 if (notify_on_release(parent)) 4379 if (notify_on_release(parent))
3745 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 4380 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
@@ -3750,8 +4385,8 @@ static long cgroup_create(struct cgroup *parent, const char *name,
3750 /* create the directory */ 4385 /* create the directory */
3751 kn = kernfs_create_dir(parent->kn, name, mode, cgrp); 4386 kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
3752 if (IS_ERR(kn)) { 4387 if (IS_ERR(kn)) {
3753 err = PTR_ERR(kn); 4388 ret = PTR_ERR(kn);
3754 goto err_free_id; 4389 goto out_free_id;
3755 } 4390 }
3756 cgrp->kn = kn; 4391 cgrp->kn = kn;
3757 4392
@@ -3761,10 +4396,10 @@ static long cgroup_create(struct cgroup *parent, const char *name,
3761 */ 4396 */
3762 kernfs_get(kn); 4397 kernfs_get(kn);
3763 4398
3764 cgrp->serial_nr = cgroup_serial_nr_next++; 4399 cgrp->self.serial_nr = css_serial_nr_next++;
3765 4400
3766 /* allocation complete, commit to creation */ 4401 /* allocation complete, commit to creation */
3767 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); 4402 list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children);
3768 atomic_inc(&root->nr_cgrps); 4403 atomic_inc(&root->nr_cgrps);
3769 cgroup_get(parent); 4404 cgroup_get(parent);
3770 4405
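
The ID handling around this point is a reserve-then-publish idiom: cgroup_idr_alloc() above registers the slot with a NULL pointer so concurrent idr_find() users can never see a half-initialized cgroup, and cgroup_idr_replace() in the next hunk publishes the pointer only once @cgrp is fully operational (the cgroup_idr_* helpers are assumed to be thin locking wrappers over the plain idr calls). The bare-idr shape of the idiom, with struct thing and publish_thing() made up for illustration:

#include <linux/gfp.h>
#include <linux/idr.h>

struct thing {
        int id;
};

static int publish_thing(struct idr *registry, struct thing *thing)
{
        int id;

        id = idr_alloc(registry, NULL, 1, 0, GFP_KERNEL);       /* reserve; lookups see NULL */
        if (id < 0)
                return id;
        thing->id = id;

        /*
         * ... long, possibly failing initialization; idr_find(registry, id)
         * keeps returning NULL, so nobody acts on a half-baked @thing.
         * On failure, idr_remove(registry, id) releases the slot.
         */

        idr_replace(registry, thing, id);                       /* publish */
        return 0;
}
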
@@ -3772,107 +4407,66 @@ static long cgroup_create(struct cgroup *parent, const char *name,
3772 * @cgrp is now fully operational. If something fails after this 4407 * @cgrp is now fully operational. If something fails after this
3773 * point, it'll be released via the normal destruction path. 4408 * point, it'll be released via the normal destruction path.
3774 */ 4409 */
3775 idr_replace(&root->cgroup_idr, cgrp, cgrp->id); 4410 cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
3776 4411
3777 err = cgroup_kn_set_ugid(kn); 4412 ret = cgroup_kn_set_ugid(kn);
3778 if (err) 4413 if (ret)
3779 goto err_destroy; 4414 goto out_destroy;
3780 4415
3781 err = cgroup_addrm_files(cgrp, cgroup_base_files, true); 4416 ret = cgroup_addrm_files(cgrp, cgroup_base_files, true);
3782 if (err) 4417 if (ret)
3783 goto err_destroy; 4418 goto out_destroy;
3784 4419
3785 /* let's create and online css's */ 4420 /* let's create and online css's */
3786 for_each_subsys(ss, ssid) { 4421 for_each_subsys(ss, ssid) {
3787 if (root->cgrp.subsys_mask & (1 << ssid)) { 4422 if (parent->child_subsys_mask & (1 << ssid)) {
3788 err = create_css(cgrp, ss); 4423 ret = create_css(cgrp, ss);
3789 if (err) 4424 if (ret)
3790 goto err_destroy; 4425 goto out_destroy;
3791 } 4426 }
3792 } 4427 }
3793 4428
3794 kernfs_activate(kn); 4429 /*
4430 * On the default hierarchy, a child doesn't automatically inherit
4431 * child_subsys_mask from the parent. Each is configured manually.
4432 */
4433 if (!cgroup_on_dfl(cgrp))
4434 cgrp->child_subsys_mask = parent->child_subsys_mask;
3795 4435
3796 mutex_unlock(&cgroup_mutex); 4436 kernfs_activate(kn);
3797 mutex_unlock(&cgroup_tree_mutex);
3798 4437
3799 return 0; 4438 ret = 0;
4439 goto out_unlock;
3800 4440
3801err_free_id: 4441out_free_id:
3802 idr_remove(&root->cgroup_idr, cgrp->id); 4442 cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
3803err_unlock: 4443out_cancel_ref:
3804 mutex_unlock(&cgroup_mutex); 4444 percpu_ref_cancel_init(&cgrp->self.refcnt);
3805err_unlock_tree: 4445out_free_cgrp:
3806 mutex_unlock(&cgroup_tree_mutex);
3807 kfree(cgrp); 4446 kfree(cgrp);
3808 return err; 4447out_unlock:
4448 cgroup_kn_unlock(parent_kn);
4449 return ret;
3809 4450
3810err_destroy: 4451out_destroy:
3811 cgroup_destroy_locked(cgrp); 4452 cgroup_destroy_locked(cgrp);
3812 mutex_unlock(&cgroup_mutex); 4453 goto out_unlock;
3813 mutex_unlock(&cgroup_tree_mutex);
3814 return err;
3815}
3816
3817static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
3818 umode_t mode)
3819{
3820 struct cgroup *parent = parent_kn->priv;
3821 int ret;
3822
3823 /*
3824 * cgroup_create() grabs cgroup_tree_mutex which nests outside
3825 * kernfs active_ref and cgroup_create() already synchronizes
3826 * properly against removal through cgroup_lock_live_group().
3827 * Break it before calling cgroup_create().
3828 */
3829 cgroup_get(parent);
3830 kernfs_break_active_protection(parent_kn);
3831
3832 ret = cgroup_create(parent, name, mode);
3833
3834 kernfs_unbreak_active_protection(parent_kn);
3835 cgroup_put(parent);
3836 return ret;
3837} 4454}
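
The new cgroup_mkdir() above leans on cgroup_kn_lock_live()/cgroup_kn_unlock(), which fold in the pin-then-break-active-protection dance that the removed wrapper spelled out by hand. A sketch of their assumed shape, reconstructed from the removed code (the function name is illustrative, not the helper body from the patch):

static struct cgroup *example_kn_lock_live(struct kernfs_node *kn)
{
        struct cgroup *cgrp = kn->priv;         /* directory nodes carry their cgroup */

        /*
         * cgroup_mutex nests outside the kernfs active ref, so pin the
         * cgroup and break active protection before taking it; otherwise
         * this would deadlock against kernfs_remove() of @kn.
         */
        cgroup_get(cgrp);
        kernfs_break_active_protection(kn);

        mutex_lock(&cgroup_mutex);
        if (!(cgrp->self.flags & CSS_ONLINE)) { /* raced with cgroup_destroy_locked() */
                mutex_unlock(&cgroup_mutex);
                kernfs_unbreak_active_protection(kn);
                cgroup_put(cgrp);
                return NULL;
        }
        return cgrp;
}

cgroup_kn_unlock() is assumed to undo the three steps in reverse, which is why every later error path in cgroup_mkdir() can funnel through out_unlock instead of unwinding the locking by hand.
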
3838 4455
3839/* 4456/*
3840 * This is called when the refcnt of a css is confirmed to be killed. 4457 * This is called when the refcnt of a css is confirmed to be killed.
3841 * css_tryget() is now guaranteed to fail. 4458 * css_tryget_online() is now guaranteed to fail. Tell the subsystem to
4459 * initate destruction and put the css ref from kill_css().
3842 */ 4460 */
3843static void css_killed_work_fn(struct work_struct *work) 4461static void css_killed_work_fn(struct work_struct *work)
3844{ 4462{
3845 struct cgroup_subsys_state *css = 4463 struct cgroup_subsys_state *css =
3846 container_of(work, struct cgroup_subsys_state, destroy_work); 4464 container_of(work, struct cgroup_subsys_state, destroy_work);
3847 struct cgroup *cgrp = css->cgroup;
3848 4465
3849 mutex_lock(&cgroup_tree_mutex);
3850 mutex_lock(&cgroup_mutex); 4466 mutex_lock(&cgroup_mutex);
3851
3852 /*
3853 * css_tryget() is guaranteed to fail now. Tell subsystems to
3854 * initate destruction.
3855 */
3856 offline_css(css); 4467 offline_css(css);
3857
3858 /*
3859 * If @cgrp is marked dead, it's waiting for refs of all css's to
3860 * be disabled before proceeding to the second phase of cgroup
3861 * destruction. If we are the last one, kick it off.
3862 */
3863 if (!cgrp->nr_css && cgroup_is_dead(cgrp))
3864 cgroup_destroy_css_killed(cgrp);
3865
3866 mutex_unlock(&cgroup_mutex); 4468 mutex_unlock(&cgroup_mutex);
3867 mutex_unlock(&cgroup_tree_mutex);
3868 4469
3869 /*
3870 * Put the css refs from kill_css(). Each css holds an extra
3871 * reference to the cgroup's dentry and cgroup removal proceeds
3872 * regardless of css refs. On the last put of each css, whenever
3873 * that may be, the extra dentry ref is put so that dentry
3874 * destruction happens only after all css's are released.
3875 */
3876 css_put(css); 4470 css_put(css);
3877} 4471}
3878 4472
@@ -3886,9 +4480,18 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
3886 queue_work(cgroup_destroy_wq, &css->destroy_work); 4480 queue_work(cgroup_destroy_wq, &css->destroy_work);
3887} 4481}
3888 4482
3889static void __kill_css(struct cgroup_subsys_state *css) 4483/**
4484 * kill_css - destroy a css
4485 * @css: css to destroy
4486 *
4487 * This function initiates destruction of @css by removing cgroup interface
4488 * files and putting its base reference. ->css_offline() will be invoked
4489 * asynchronously once css_tryget_online() is guaranteed to fail and when
4490 * the reference count reaches zero, @css will be released.
4491 */
4492static void kill_css(struct cgroup_subsys_state *css)
3890{ 4493{
3891 lockdep_assert_held(&cgroup_tree_mutex); 4494 lockdep_assert_held(&cgroup_mutex);
3892 4495
3893 /* 4496 /*
3894 * This must happen before css is disassociated with its cgroup. 4497 * This must happen before css is disassociated with its cgroup.
@@ -3905,7 +4508,7 @@ static void __kill_css(struct cgroup_subsys_state *css)
3905 /* 4508 /*
3906 * cgroup core guarantees that, by the time ->css_offline() is 4509 * cgroup core guarantees that, by the time ->css_offline() is
3907 * invoked, no new css reference will be given out via 4510 * invoked, no new css reference will be given out via
3908 * css_tryget(). We can't simply call percpu_ref_kill() and 4511 * css_tryget_online(). We can't simply call percpu_ref_kill() and
3909 * proceed to offlining css's because percpu_ref_kill() doesn't 4512 * proceed to offlining css's because percpu_ref_kill() doesn't
3910 * guarantee that the ref is seen as killed on all CPUs on return. 4513 * guarantee that the ref is seen as killed on all CPUs on return.
3911 * 4514 *
@@ -3916,36 +4519,14 @@ static void __kill_css(struct cgroup_subsys_state *css)
3916} 4519}
3917 4520
3918/** 4521/**
3919 * kill_css - destroy a css
3920 * @css: css to destroy
3921 *
3922 * This function initiates destruction of @css by removing cgroup interface
3923 * files and putting its base reference. ->css_offline() will be invoked
3924 * asynchronously once css_tryget() is guaranteed to fail and when the
3925 * reference count reaches zero, @css will be released.
3926 */
3927static void kill_css(struct cgroup_subsys_state *css)
3928{
3929 struct cgroup *cgrp = css->cgroup;
3930
3931 lockdep_assert_held(&cgroup_tree_mutex);
3932
3933 /* if already killed, noop */
3934 if (cgrp->subsys_mask & (1 << css->ss->id)) {
3935 cgrp->subsys_mask &= ~(1 << css->ss->id);
3936 __kill_css(css);
3937 }
3938}
3939
3940/**
3941 * cgroup_destroy_locked - the first stage of cgroup destruction 4522 * cgroup_destroy_locked - the first stage of cgroup destruction
3942 * @cgrp: cgroup to be destroyed 4523 * @cgrp: cgroup to be destroyed
3943 * 4524 *
3944 * css's make use of percpu refcnts whose killing latency shouldn't be 4525 * css's make use of percpu refcnts whose killing latency shouldn't be
3945 * exposed to userland and are RCU protected. Also, cgroup core needs to 4526 * exposed to userland and are RCU protected. Also, cgroup core needs to
3946 * guarantee that css_tryget() won't succeed by the time ->css_offline() is 4527 * guarantee that css_tryget_online() won't succeed by the time
3947 * invoked. To satisfy all the requirements, destruction is implemented in 4528 * ->css_offline() is invoked. To satisfy all the requirements,
3948 * the following two steps. 4529 * destruction is implemented in the following two steps.
3949 * 4530 *
3950 * s1. Verify @cgrp can be destroyed and mark it dying. Remove all 4531 * s1. Verify @cgrp can be destroyed and mark it dying. Remove all
3951 * userland visible parts and start killing the percpu refcnts of 4532 * userland visible parts and start killing the percpu refcnts of
@@ -3964,12 +4545,10 @@ static void kill_css(struct cgroup_subsys_state *css)
3964static int cgroup_destroy_locked(struct cgroup *cgrp) 4545static int cgroup_destroy_locked(struct cgroup *cgrp)
3965 __releases(&cgroup_mutex) __acquires(&cgroup_mutex) 4546 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
3966{ 4547{
3967 struct cgroup *child;
3968 struct cgroup_subsys_state *css; 4548 struct cgroup_subsys_state *css;
3969 bool empty; 4549 bool empty;
3970 int ssid; 4550 int ssid;
3971 4551
3972 lockdep_assert_held(&cgroup_tree_mutex);
3973 lockdep_assert_held(&cgroup_mutex); 4552 lockdep_assert_held(&cgroup_mutex);
3974 4553
3975 /* 4554 /*
@@ -3983,127 +4562,68 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
3983 return -EBUSY; 4562 return -EBUSY;
3984 4563
3985 /* 4564 /*
3986 * Make sure there's no live children. We can't test ->children 4565 * Make sure there's no live children. We can't test emptiness of
3987 * emptiness as dead children linger on it while being destroyed; 4566 * ->self.children as dead children linger on it while being
3988 * otherwise, "rmdir parent/child parent" may fail with -EBUSY. 4567 * drained; otherwise, "rmdir parent/child parent" may fail.
3989 */ 4568 */
3990 empty = true; 4569 if (css_has_online_children(&cgrp->self))
3991 rcu_read_lock();
3992 list_for_each_entry_rcu(child, &cgrp->children, sibling) {
3993 empty = cgroup_is_dead(child);
3994 if (!empty)
3995 break;
3996 }
3997 rcu_read_unlock();
3998 if (!empty)
3999 return -EBUSY; 4570 return -EBUSY;
4000 4571
4001 /* 4572 /*
4002 * Mark @cgrp dead. This prevents further task migration and child 4573 * Mark @cgrp dead. This prevents further task migration and child
4003 * creation by disabling cgroup_lock_live_group(). Note that 4574 * creation by disabling cgroup_lock_live_group().
4004 * CGRP_DEAD assertion is depended upon by css_next_child() to
4005 * resume iteration after dropping RCU read lock. See
4006 * css_next_child() for details.
4007 */ 4575 */
4008 set_bit(CGRP_DEAD, &cgrp->flags); 4576 cgrp->self.flags &= ~CSS_ONLINE;
4009 4577
4010 /* 4578 /* initiate massacre of all css's */
4011 * Initiate massacre of all css's. cgroup_destroy_css_killed()
4012 * will be invoked to perform the rest of destruction once the
4013 * percpu refs of all css's are confirmed to be killed. This
4014 * involves removing the subsystem's files, drop cgroup_mutex.
4015 */
4016 mutex_unlock(&cgroup_mutex);
4017 for_each_css(css, ssid, cgrp) 4579 for_each_css(css, ssid, cgrp)
4018 kill_css(css); 4580 kill_css(css);
4019 mutex_lock(&cgroup_mutex);
4020 4581
4021 /* CGRP_DEAD is set, remove from ->release_list for the last time */ 4582 /* CSS_ONLINE is clear, remove from ->release_list for the last time */
4022 raw_spin_lock(&release_list_lock); 4583 raw_spin_lock(&release_list_lock);
4023 if (!list_empty(&cgrp->release_list)) 4584 if (!list_empty(&cgrp->release_list))
4024 list_del_init(&cgrp->release_list); 4585 list_del_init(&cgrp->release_list);
4025 raw_spin_unlock(&release_list_lock); 4586 raw_spin_unlock(&release_list_lock);
4026 4587
4027 /* 4588 /*
4028 * If @cgrp has css's attached, the second stage of cgroup 4589 * Remove @cgrp directory along with the base files. @cgrp has an
4029 * destruction is kicked off from css_killed_work_fn() after the 4590 * extra ref on its kn.
4030 * refs of all attached css's are killed. If @cgrp doesn't have
4031 * any css, we kick it off here.
4032 */ 4591 */
4033 if (!cgrp->nr_css) 4592 kernfs_remove(cgrp->kn);
4034 cgroup_destroy_css_killed(cgrp);
4035
4036 /* remove @cgrp directory along with the base files */
4037 mutex_unlock(&cgroup_mutex);
4038 4593
4039 /* 4594 set_bit(CGRP_RELEASABLE, &cgroup_parent(cgrp)->flags);
4040 * There are two control paths which try to determine cgroup from 4595 check_for_release(cgroup_parent(cgrp));
4041 * dentry without going through kernfs - cgroupstats_build() and
4042 * css_tryget_from_dir(). Those are supported by RCU protecting
4043 * clearing of cgrp->kn->priv backpointer, which should happen
4044 * after all files under it have been removed.
4045 */
4046 kernfs_remove(cgrp->kn); /* @cgrp has an extra ref on its kn */
4047 RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL);
4048 4596
4049 mutex_lock(&cgroup_mutex); 4597 /* put the base reference */
4598 percpu_ref_kill(&cgrp->self.refcnt);
4050 4599
4051 return 0; 4600 return 0;
4052}; 4601};
4053 4602
4054/**
4055 * cgroup_destroy_css_killed - the second step of cgroup destruction
4056 * @work: cgroup->destroy_free_work
4057 *
4058 * This function is invoked from a work item for a cgroup which is being
4059 * destroyed after all css's are offlined and performs the rest of
4060 * destruction. This is the second step of destruction described in the
4061 * comment above cgroup_destroy_locked().
4062 */
4063static void cgroup_destroy_css_killed(struct cgroup *cgrp)
4064{
4065 struct cgroup *parent = cgrp->parent;
4066
4067 lockdep_assert_held(&cgroup_tree_mutex);
4068 lockdep_assert_held(&cgroup_mutex);
4069
4070 /* delete this cgroup from parent->children */
4071 list_del_rcu(&cgrp->sibling);
4072
4073 cgroup_put(cgrp);
4074
4075 set_bit(CGRP_RELEASABLE, &parent->flags);
4076 check_for_release(parent);
4077}
4078
4079static int cgroup_rmdir(struct kernfs_node *kn) 4603static int cgroup_rmdir(struct kernfs_node *kn)
4080{ 4604{
4081 struct cgroup *cgrp = kn->priv; 4605 struct cgroup *cgrp;
4082 int ret = 0; 4606 int ret = 0;
4083 4607
4084 /* 4608 cgrp = cgroup_kn_lock_live(kn);
4085 * This is self-destruction but @kn can't be removed while this 4609 if (!cgrp)
4086 * callback is in progress. Let's break active protection. Once 4610 return 0;
4087 * the protection is broken, @cgrp can be destroyed at any point. 4611 cgroup_get(cgrp); /* for @kn->priv clearing */
4088 * Pin it so that it stays accessible.
4089 */
4090 cgroup_get(cgrp);
4091 kernfs_break_active_protection(kn);
4092 4612
4093 mutex_lock(&cgroup_tree_mutex); 4613 ret = cgroup_destroy_locked(cgrp);
4094 mutex_lock(&cgroup_mutex); 4614
4615 cgroup_kn_unlock(kn);
4095 4616
4096 /* 4617 /*
4097 * @cgrp might already have been destroyed while we're trying to 4618 * There are two control paths which try to determine cgroup from
4098 * grab the mutexes. 4619 * dentry without going through kernfs - cgroupstats_build() and
4620 * css_tryget_online_from_dir(). Those are supported by RCU
4621 * protecting clearing of cgrp->kn->priv backpointer, which should
4622 * happen after all files under it have been removed.
4099 */ 4623 */
4100 if (!cgroup_is_dead(cgrp)) 4624 if (!ret)
4101 ret = cgroup_destroy_locked(cgrp); 4625 RCU_INIT_POINTER(*(void __rcu __force **)&kn->priv, NULL);
4102
4103 mutex_unlock(&cgroup_mutex);
4104 mutex_unlock(&cgroup_tree_mutex);
4105 4626
4106 kernfs_unbreak_active_protection(kn);
4107 cgroup_put(cgrp); 4627 cgroup_put(cgrp);
4108 return ret; 4628 return ret;
4109} 4629}
@@ -4116,15 +4636,15 @@ static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
4116 .rename = cgroup_rename, 4636 .rename = cgroup_rename,
4117}; 4637};
4118 4638
4119static void __init cgroup_init_subsys(struct cgroup_subsys *ss) 4639static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
4120{ 4640{
4121 struct cgroup_subsys_state *css; 4641 struct cgroup_subsys_state *css;
4122 4642
4123 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); 4643 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
4124 4644
4125 mutex_lock(&cgroup_tree_mutex);
4126 mutex_lock(&cgroup_mutex); 4645 mutex_lock(&cgroup_mutex);
4127 4646
4647 idr_init(&ss->css_idr);
4128 INIT_LIST_HEAD(&ss->cfts); 4648 INIT_LIST_HEAD(&ss->cfts);
4129 4649
4130 /* Create the root cgroup state for this subsystem */ 4650 /* Create the root cgroup state for this subsystem */
@@ -4132,7 +4652,21 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4132 css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss)); 4652 css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));
4133 /* We don't handle early failures gracefully */ 4653 /* We don't handle early failures gracefully */
4134 BUG_ON(IS_ERR(css)); 4654 BUG_ON(IS_ERR(css));
4135 init_css(css, ss, &cgrp_dfl_root.cgrp); 4655 init_and_link_css(css, ss, &cgrp_dfl_root.cgrp);
4656
4657 /*
4658 * Root csses are never destroyed and we can't initialize
4659 * percpu_ref during early init. Disable refcnting.
4660 */
4661 css->flags |= CSS_NO_REF;
4662
4663 if (early) {
4664 /* allocation can't be done safely during early init */
4665 css->id = 1;
4666 } else {
4667 css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL);
4668 BUG_ON(css->id < 0);
4669 }
4136 4670
4137 /* Update the init_css_set to contain a subsys 4671 /* Update the init_css_set to contain a subsys
4138 * pointer to this state - since the subsystem is 4672 * pointer to this state - since the subsystem is
@@ -4149,10 +4683,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4149 4683
4150 BUG_ON(online_css(css)); 4684 BUG_ON(online_css(css));
4151 4685
4152 cgrp_dfl_root.cgrp.subsys_mask |= 1 << ss->id;
4153
4154 mutex_unlock(&cgroup_mutex); 4686 mutex_unlock(&cgroup_mutex);
4155 mutex_unlock(&cgroup_tree_mutex);
4156} 4687}
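
CSS_NO_REF set above means root csses opt out of reference counting entirely; css_get()/css_put() are assumed to short-circuit on the flag, roughly (these helper bodies live in cgroup.h, not in this diff):

static inline void example_css_get(struct cgroup_subsys_state *css)
{
        if (!(css->flags & CSS_NO_REF))
                percpu_ref_get(&css->refcnt);
}

static inline void example_css_put(struct cgroup_subsys_state *css)
{
        if (!(css->flags & CSS_NO_REF))
                percpu_ref_put(&css->refcnt);
}

That is what lets cgroup_init_early() below mark the default root before percpu allocation is available.
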
4157 4688
4158/** 4689/**
@@ -4169,6 +4700,8 @@ int __init cgroup_init_early(void)
4169 int i; 4700 int i;
4170 4701
4171 init_cgroup_root(&cgrp_dfl_root, &opts); 4702 init_cgroup_root(&cgrp_dfl_root, &opts);
4703 cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF;
4704
4172 RCU_INIT_POINTER(init_task.cgroups, &init_css_set); 4705 RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
4173 4706
4174 for_each_subsys(ss, i) { 4707 for_each_subsys(ss, i) {
@@ -4183,7 +4716,7 @@ int __init cgroup_init_early(void)
4183 ss->name = cgroup_subsys_name[i]; 4716 ss->name = cgroup_subsys_name[i];
4184 4717
4185 if (ss->early_init) 4718 if (ss->early_init)
4186 cgroup_init_subsys(ss); 4719 cgroup_init_subsys(ss, true);
4187 } 4720 }
4188 return 0; 4721 return 0;
4189} 4722}
@@ -4202,7 +4735,6 @@ int __init cgroup_init(void)
4202 4735
4203 BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); 4736 BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
4204 4737
4205 mutex_lock(&cgroup_tree_mutex);
4206 mutex_lock(&cgroup_mutex); 4738 mutex_lock(&cgroup_mutex);
4207 4739
4208 /* Add init_css_set to the hash table */ 4740 /* Add init_css_set to the hash table */
@@ -4212,18 +4744,31 @@ int __init cgroup_init(void)
4212 BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0)); 4744 BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
4213 4745
4214 mutex_unlock(&cgroup_mutex); 4746 mutex_unlock(&cgroup_mutex);
4215 mutex_unlock(&cgroup_tree_mutex);
4216 4747
4217 for_each_subsys(ss, ssid) { 4748 for_each_subsys(ss, ssid) {
4218 if (!ss->early_init) 4749 if (ss->early_init) {
4219 cgroup_init_subsys(ss); 4750 struct cgroup_subsys_state *css =
4751 init_css_set.subsys[ss->id];
4752
4753 css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2,
4754 GFP_KERNEL);
4755 BUG_ON(css->id < 0);
4756 } else {
4757 cgroup_init_subsys(ss, false);
4758 }
4759
4760 list_add_tail(&init_css_set.e_cset_node[ssid],
4761 &cgrp_dfl_root.cgrp.e_csets[ssid]);
4220 4762
4221 /* 4763 /*
4222 * cftype registration needs kmalloc and can't be done 4764 * Setting dfl_root subsys_mask needs to consider the
4223 * during early_init. Register base cftypes separately. 4765 * disabled flag and cftype registration needs kmalloc,
4766 * both of which aren't available during early_init.
4224 */ 4767 */
4225 if (ss->base_cftypes) 4768 if (!ss->disabled) {
4769 cgrp_dfl_root.subsys_mask |= 1 << ss->id;
4226 WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes)); 4770 WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes));
4771 }
4227 } 4772 }
4228 4773
4229 cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); 4774 cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
@@ -4306,7 +4851,7 @@ int proc_cgroup_show(struct seq_file *m, void *v)
4306 4851
4307 seq_printf(m, "%d:", root->hierarchy_id); 4852 seq_printf(m, "%d:", root->hierarchy_id);
4308 for_each_subsys(ss, ssid) 4853 for_each_subsys(ss, ssid)
4309 if (root->cgrp.subsys_mask & (1 << ssid)) 4854 if (root->subsys_mask & (1 << ssid))
4310 seq_printf(m, "%s%s", count++ ? "," : "", ss->name); 4855 seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
4311 if (strlen(root->name)) 4856 if (strlen(root->name))
4312 seq_printf(m, "%sname=%s", count ? "," : "", 4857 seq_printf(m, "%sname=%s", count ? "," : "",
@@ -4501,8 +5046,8 @@ void cgroup_exit(struct task_struct *tsk)
4501 5046
4502static void check_for_release(struct cgroup *cgrp) 5047static void check_for_release(struct cgroup *cgrp)
4503{ 5048{
4504 if (cgroup_is_releasable(cgrp) && 5049 if (cgroup_is_releasable(cgrp) && list_empty(&cgrp->cset_links) &&
4505 list_empty(&cgrp->cset_links) && list_empty(&cgrp->children)) { 5050 !css_has_online_children(&cgrp->self)) {
4506 /* 5051 /*
4507 * Control Group is currently removeable. If it's not 5052 * Control Group is currently removeable. If it's not
4508 * already queued for a userspace notification, queue 5053 * already queued for a userspace notification, queue
@@ -4619,7 +5164,7 @@ static int __init cgroup_disable(char *str)
4619__setup("cgroup_disable=", cgroup_disable); 5164__setup("cgroup_disable=", cgroup_disable);
4620 5165
4621/** 5166/**
4622 * css_tryget_from_dir - get corresponding css from the dentry of a cgroup dir 5167 * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
4623 * @dentry: directory dentry of interest 5168 * @dentry: directory dentry of interest
4624 * @ss: subsystem of interest 5169 * @ss: subsystem of interest
4625 * 5170 *
@@ -4627,8 +5172,8 @@ __setup("cgroup_disable=", cgroup_disable);
4627 * to get the corresponding css and return it. If such css doesn't exist 5172 * to get the corresponding css and return it. If such css doesn't exist
4628 * or can't be pinned, an ERR_PTR value is returned. 5173 * or can't be pinned, an ERR_PTR value is returned.
4629 */ 5174 */
4630struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry, 5175struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
4631 struct cgroup_subsys *ss) 5176 struct cgroup_subsys *ss)
4632{ 5177{
4633 struct kernfs_node *kn = kernfs_node_from_dentry(dentry); 5178 struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
4634 struct cgroup_subsys_state *css = NULL; 5179 struct cgroup_subsys_state *css = NULL;
@@ -4644,13 +5189,13 @@ struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry,
4644 /* 5189 /*
4645 * This path doesn't originate from kernfs and @kn could already 5190 * This path doesn't originate from kernfs and @kn could already
4646 * have been or be removed at any point. @kn->priv is RCU 5191 * have been or be removed at any point. @kn->priv is RCU
4647 * protected for this access. See destroy_locked() for details. 5192 * protected for this access. See cgroup_rmdir() for details.
4648 */ 5193 */
4649 cgrp = rcu_dereference(kn->priv); 5194 cgrp = rcu_dereference(kn->priv);
4650 if (cgrp) 5195 if (cgrp)
4651 css = cgroup_css(cgrp, ss); 5196 css = cgroup_css(cgrp, ss);
4652 5197
4653 if (!css || !css_tryget(css)) 5198 if (!css || !css_tryget_online(css))
4654 css = ERR_PTR(-ENOENT); 5199 css = ERR_PTR(-ENOENT);
4655 5200
4656 rcu_read_unlock(); 5201 rcu_read_unlock();
@@ -4667,14 +5212,8 @@ struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry,
4667 */ 5212 */
4668struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) 5213struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
4669{ 5214{
4670 struct cgroup *cgrp; 5215 WARN_ON_ONCE(!rcu_read_lock_held());
4671 5216 return idr_find(&ss->css_idr, id);
4672 cgroup_assert_mutexes_or_rcu_locked();
4673
4674 cgrp = idr_find(&ss->root->cgroup_idr, id);
4675 if (cgrp)
4676 return cgroup_css(cgrp, ss);
4677 return NULL;
4678} 5217}
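
css_from_id() now indexes the per-subsystem css_idr directly, and the WARN_ON_ONCE() makes the locking contract explicit: callers must be inside an RCU read-side critical section and must pin the result before leaving it. A minimal caller (the function name is illustrative):

static struct cgroup_subsys_state *
example_pin_css_by_id(int id, struct cgroup_subsys *ss)
{
        struct cgroup_subsys_state *css;

        rcu_read_lock();
        css = css_from_id(id, ss);              /* NULL when the id is unused */
        if (css && !css_tryget_online(css))     /* pin before leaving the RCU section */
                css = NULL;
        rcu_read_unlock();

        return css;
}
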
4679 5218
4680#ifdef CONFIG_CGROUP_DEBUG 5219#ifdef CONFIG_CGROUP_DEBUG
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 2bc4a2256444..a79e40f9d700 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -21,6 +21,7 @@
21#include <linux/uaccess.h> 21#include <linux/uaccess.h>
22#include <linux/freezer.h> 22#include <linux/freezer.h>
23#include <linux/seq_file.h> 23#include <linux/seq_file.h>
24#include <linux/mutex.h>
24 25
25/* 26/*
26 * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is 27 * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is
@@ -42,9 +43,10 @@ enum freezer_state_flags {
42struct freezer { 43struct freezer {
43 struct cgroup_subsys_state css; 44 struct cgroup_subsys_state css;
44 unsigned int state; 45 unsigned int state;
45 spinlock_t lock;
46}; 46};
47 47
48static DEFINE_MUTEX(freezer_mutex);
49
48static inline struct freezer *css_freezer(struct cgroup_subsys_state *css) 50static inline struct freezer *css_freezer(struct cgroup_subsys_state *css)
49{ 51{
50 return css ? container_of(css, struct freezer, css) : NULL; 52 return css ? container_of(css, struct freezer, css) : NULL;
@@ -57,7 +59,7 @@ static inline struct freezer *task_freezer(struct task_struct *task)
57 59
58static struct freezer *parent_freezer(struct freezer *freezer) 60static struct freezer *parent_freezer(struct freezer *freezer)
59{ 61{
60 return css_freezer(css_parent(&freezer->css)); 62 return css_freezer(freezer->css.parent);
61} 63}
62 64
63bool cgroup_freezing(struct task_struct *task) 65bool cgroup_freezing(struct task_struct *task)
@@ -71,10 +73,6 @@ bool cgroup_freezing(struct task_struct *task)
71 return ret; 73 return ret;
72} 74}
73 75
74/*
75 * cgroups_write_string() limits the size of freezer state strings to
76 * CGROUP_LOCAL_BUFFER_SIZE
77 */
78static const char *freezer_state_strs(unsigned int state) 76static const char *freezer_state_strs(unsigned int state)
79{ 77{
80 if (state & CGROUP_FROZEN) 78 if (state & CGROUP_FROZEN)
@@ -93,7 +91,6 @@ freezer_css_alloc(struct cgroup_subsys_state *parent_css)
93 if (!freezer) 91 if (!freezer)
94 return ERR_PTR(-ENOMEM); 92 return ERR_PTR(-ENOMEM);
95 93
96 spin_lock_init(&freezer->lock);
97 return &freezer->css; 94 return &freezer->css;
98} 95}
99 96
@@ -110,14 +107,7 @@ static int freezer_css_online(struct cgroup_subsys_state *css)
110 struct freezer *freezer = css_freezer(css); 107 struct freezer *freezer = css_freezer(css);
111 struct freezer *parent = parent_freezer(freezer); 108 struct freezer *parent = parent_freezer(freezer);
112 109
113 /* 110 mutex_lock(&freezer_mutex);
114 * The following double locking and freezing state inheritance
115 * guarantee that @cgroup can never escape ancestors' freezing
116 * states. See css_for_each_descendant_pre() for details.
117 */
118 if (parent)
119 spin_lock_irq(&parent->lock);
120 spin_lock_nested(&freezer->lock, SINGLE_DEPTH_NESTING);
121 111
122 freezer->state |= CGROUP_FREEZER_ONLINE; 112 freezer->state |= CGROUP_FREEZER_ONLINE;
123 113
@@ -126,10 +116,7 @@ static int freezer_css_online(struct cgroup_subsys_state *css)
126 atomic_inc(&system_freezing_cnt); 116 atomic_inc(&system_freezing_cnt);
127 } 117 }
128 118
129 spin_unlock(&freezer->lock); 119 mutex_unlock(&freezer_mutex);
130 if (parent)
131 spin_unlock_irq(&parent->lock);
132
133 return 0; 120 return 0;
134} 121}
135 122
@@ -144,14 +131,14 @@ static void freezer_css_offline(struct cgroup_subsys_state *css)
144{ 131{
145 struct freezer *freezer = css_freezer(css); 132 struct freezer *freezer = css_freezer(css);
146 133
147 spin_lock_irq(&freezer->lock); 134 mutex_lock(&freezer_mutex);
148 135
149 if (freezer->state & CGROUP_FREEZING) 136 if (freezer->state & CGROUP_FREEZING)
150 atomic_dec(&system_freezing_cnt); 137 atomic_dec(&system_freezing_cnt);
151 138
152 freezer->state = 0; 139 freezer->state = 0;
153 140
154 spin_unlock_irq(&freezer->lock); 141 mutex_unlock(&freezer_mutex);
155} 142}
156 143
157static void freezer_css_free(struct cgroup_subsys_state *css) 144static void freezer_css_free(struct cgroup_subsys_state *css)
@@ -175,7 +162,7 @@ static void freezer_attach(struct cgroup_subsys_state *new_css,
175 struct task_struct *task; 162 struct task_struct *task;
176 bool clear_frozen = false; 163 bool clear_frozen = false;
177 164
178 spin_lock_irq(&freezer->lock); 165 mutex_lock(&freezer_mutex);
179 166
180 /* 167 /*
181 * Make the new tasks conform to the current state of @new_css. 168 * Make the new tasks conform to the current state of @new_css.
@@ -197,21 +184,13 @@ static void freezer_attach(struct cgroup_subsys_state *new_css,
197 } 184 }
198 } 185 }
199 186
200 spin_unlock_irq(&freezer->lock); 187 /* propagate FROZEN clearing upwards */
201
202 /*
203 * Propagate FROZEN clearing upwards. We may race with
204 * update_if_frozen(), but as long as both work bottom-up, either
205 * update_if_frozen() sees child's FROZEN cleared or we clear the
206 * parent's FROZEN later. No parent w/ !FROZEN children can be
207 * left FROZEN.
208 */
209 while (clear_frozen && (freezer = parent_freezer(freezer))) { 188 while (clear_frozen && (freezer = parent_freezer(freezer))) {
210 spin_lock_irq(&freezer->lock);
211 freezer->state &= ~CGROUP_FROZEN; 189 freezer->state &= ~CGROUP_FROZEN;
212 clear_frozen = freezer->state & CGROUP_FREEZING; 190 clear_frozen = freezer->state & CGROUP_FREEZING;
213 spin_unlock_irq(&freezer->lock);
214 } 191 }
192
193 mutex_unlock(&freezer_mutex);
215} 194}
216 195
217/** 196/**
@@ -228,9 +207,6 @@ static void freezer_fork(struct task_struct *task)
228{ 207{
229 struct freezer *freezer; 208 struct freezer *freezer;
230 209
231 rcu_read_lock();
232 freezer = task_freezer(task);
233
234 /* 210 /*
235 * The root cgroup is non-freezable, so we can skip locking the 211 * The root cgroup is non-freezable, so we can skip locking the
236 * freezer. This is safe regardless of race with task migration. 212 * freezer. This is safe regardless of race with task migration.
@@ -238,24 +214,18 @@ static void freezer_fork(struct task_struct *task)
238 * to do. If we lost and root is the new cgroup, noop is still the 214 * to do. If we lost and root is the new cgroup, noop is still the
239 * right thing to do. 215 * right thing to do.
240 */ 216 */
241 if (!parent_freezer(freezer)) 217 if (task_css_is_root(task, freezer_cgrp_id))
242 goto out; 218 return;
243 219
244 /* 220 mutex_lock(&freezer_mutex);
245 * Grab @freezer->lock and freeze @task after verifying @task still 221 rcu_read_lock();
246 * belongs to @freezer and it's freezing. The former is for the 222
247 * case where we have raced against task migration and lost and 223 freezer = task_freezer(task);
248 * @task is already in a different cgroup which may not be frozen. 224 if (freezer->state & CGROUP_FREEZING)
249 * This isn't strictly necessary as freeze_task() is allowed to be
250 * called spuriously but let's do it anyway for, if nothing else,
251 * documentation.
252 */
253 spin_lock_irq(&freezer->lock);
254 if (freezer == task_freezer(task) && (freezer->state & CGROUP_FREEZING))
255 freeze_task(task); 225 freeze_task(task);
256 spin_unlock_irq(&freezer->lock); 226
257out:
258 rcu_read_unlock(); 227 rcu_read_unlock();
228 mutex_unlock(&freezer_mutex);
259} 229}
260 230
261/** 231/**
@@ -281,22 +251,24 @@ static void update_if_frozen(struct cgroup_subsys_state *css)
281 struct css_task_iter it; 251 struct css_task_iter it;
282 struct task_struct *task; 252 struct task_struct *task;
283 253
284 WARN_ON_ONCE(!rcu_read_lock_held()); 254 lockdep_assert_held(&freezer_mutex);
285
286 spin_lock_irq(&freezer->lock);
287 255
288 if (!(freezer->state & CGROUP_FREEZING) || 256 if (!(freezer->state & CGROUP_FREEZING) ||
289 (freezer->state & CGROUP_FROZEN)) 257 (freezer->state & CGROUP_FROZEN))
290 goto out_unlock; 258 return;
291 259
292 /* are all (live) children frozen? */ 260 /* are all (live) children frozen? */
261 rcu_read_lock();
293 css_for_each_child(pos, css) { 262 css_for_each_child(pos, css) {
294 struct freezer *child = css_freezer(pos); 263 struct freezer *child = css_freezer(pos);
295 264
296 if ((child->state & CGROUP_FREEZER_ONLINE) && 265 if ((child->state & CGROUP_FREEZER_ONLINE) &&
297 !(child->state & CGROUP_FROZEN)) 266 !(child->state & CGROUP_FROZEN)) {
298 goto out_unlock; 267 rcu_read_unlock();
268 return;
269 }
299 } 270 }
271 rcu_read_unlock();
300 272
301 /* are all tasks frozen? */ 273 /* are all tasks frozen? */
302 css_task_iter_start(css, &it); 274 css_task_iter_start(css, &it);
@@ -317,21 +289,29 @@ static void update_if_frozen(struct cgroup_subsys_state *css)
317 freezer->state |= CGROUP_FROZEN; 289 freezer->state |= CGROUP_FROZEN;
318out_iter_end: 290out_iter_end:
319 css_task_iter_end(&it); 291 css_task_iter_end(&it);
320out_unlock:
321 spin_unlock_irq(&freezer->lock);
322} 292}
323 293
324static int freezer_read(struct seq_file *m, void *v) 294static int freezer_read(struct seq_file *m, void *v)
325{ 295{
326 struct cgroup_subsys_state *css = seq_css(m), *pos; 296 struct cgroup_subsys_state *css = seq_css(m), *pos;
327 297
298 mutex_lock(&freezer_mutex);
328 rcu_read_lock(); 299 rcu_read_lock();
329 300
330 /* update states bottom-up */ 301 /* update states bottom-up */
331 css_for_each_descendant_post(pos, css) 302 css_for_each_descendant_post(pos, css) {
303 if (!css_tryget_online(pos))
304 continue;
305 rcu_read_unlock();
306
332 update_if_frozen(pos); 307 update_if_frozen(pos);
333 308
309 rcu_read_lock();
310 css_put(pos);
311 }
312
334 rcu_read_unlock(); 313 rcu_read_unlock();
314 mutex_unlock(&freezer_mutex);
335 315
336 seq_puts(m, freezer_state_strs(css_freezer(css)->state)); 316 seq_puts(m, freezer_state_strs(css_freezer(css)->state));
337 seq_putc(m, '\n'); 317 seq_putc(m, '\n');
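
The loop in freezer_read() above is the pattern this series uses whenever a descendant walk has to call something that can sleep while only RCU protects the iteration: pin the position with css_tryget_online(), drop the RCU lock for the blocking work, then retake it and css_put() before advancing. The generic shape, with do_blocking_work() standing in for update_if_frozen():

static void do_blocking_work(struct cgroup_subsys_state *pos)
{
        /* placeholder for work that may sleep */
}

static void example_descendant_walk(struct cgroup_subsys_state *root)
{
        struct cgroup_subsys_state *pos;

        rcu_read_lock();
        css_for_each_descendant_post(pos, root) {
                if (!css_tryget_online(pos))    /* skip csses going offline */
                        continue;
                rcu_read_unlock();

                do_blocking_work(pos);          /* may sleep */

                rcu_read_lock();
                css_put(pos);                   /* iteration can resume from @pos */
        }
        rcu_read_unlock();
}

freezer_change_state() below applies the same idiom to css_for_each_descendant_pre().
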
@@ -373,7 +353,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze,
373 unsigned int state) 353 unsigned int state)
374{ 354{
375 /* also synchronizes against task migration, see freezer_attach() */ 355 /* also synchronizes against task migration, see freezer_attach() */
376 lockdep_assert_held(&freezer->lock); 356 lockdep_assert_held(&freezer_mutex);
377 357
378 if (!(freezer->state & CGROUP_FREEZER_ONLINE)) 358 if (!(freezer->state & CGROUP_FREEZER_ONLINE))
379 return; 359 return;
@@ -414,47 +394,47 @@ static void freezer_change_state(struct freezer *freezer, bool freeze)
414 * descendant will try to inherit its parent's FREEZING state as 394 * descendant will try to inherit its parent's FREEZING state as
415 * CGROUP_FREEZING_PARENT. 395 * CGROUP_FREEZING_PARENT.
416 */ 396 */
397 mutex_lock(&freezer_mutex);
417 rcu_read_lock(); 398 rcu_read_lock();
418 css_for_each_descendant_pre(pos, &freezer->css) { 399 css_for_each_descendant_pre(pos, &freezer->css) {
419 struct freezer *pos_f = css_freezer(pos); 400 struct freezer *pos_f = css_freezer(pos);
420 struct freezer *parent = parent_freezer(pos_f); 401 struct freezer *parent = parent_freezer(pos_f);
421 402
422 spin_lock_irq(&pos_f->lock); 403 if (!css_tryget_online(pos))
404 continue;
405 rcu_read_unlock();
423 406
424 if (pos_f == freezer) { 407 if (pos_f == freezer)
425 freezer_apply_state(pos_f, freeze, 408 freezer_apply_state(pos_f, freeze,
426 CGROUP_FREEZING_SELF); 409 CGROUP_FREEZING_SELF);
427 } else { 410 else
428 /*
429 * Our update to @parent->state is already visible
430 * which is all we need. No need to lock @parent.
431 * For more info on synchronization, see
432 * freezer_post_create().
433 */
434 freezer_apply_state(pos_f, 411 freezer_apply_state(pos_f,
435 parent->state & CGROUP_FREEZING, 412 parent->state & CGROUP_FREEZING,
436 CGROUP_FREEZING_PARENT); 413 CGROUP_FREEZING_PARENT);
437 }
438 414
439 spin_unlock_irq(&pos_f->lock); 415 rcu_read_lock();
416 css_put(pos);
440 } 417 }
441 rcu_read_unlock(); 418 rcu_read_unlock();
419 mutex_unlock(&freezer_mutex);
442} 420}
443 421
444static int freezer_write(struct cgroup_subsys_state *css, struct cftype *cft, 422static ssize_t freezer_write(struct kernfs_open_file *of,
445 char *buffer) 423 char *buf, size_t nbytes, loff_t off)
446{ 424{
447 bool freeze; 425 bool freeze;
448 426
449 if (strcmp(buffer, freezer_state_strs(0)) == 0) 427 buf = strstrip(buf);
428
429 if (strcmp(buf, freezer_state_strs(0)) == 0)
450 freeze = false; 430 freeze = false;
451 else if (strcmp(buffer, freezer_state_strs(CGROUP_FROZEN)) == 0) 431 else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0)
452 freeze = true; 432 freeze = true;
453 else 433 else
454 return -EINVAL; 434 return -EINVAL;
455 435
456 freezer_change_state(css_freezer(css), freeze); 436 freezer_change_state(css_freezer(of_css(of)), freeze);
457 return 0; 437 return nbytes;
458} 438}
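
freezer_write() is converted from the old ->write_string() cftype callback to the kernfs-style ->write() used throughout this series: the handler receives the kernfs_open_file plus the raw buffer and length, strips the input itself, and returns the number of bytes consumed rather than 0. The same skeleton for a hypothetical one-word control file (example_state_write(), apply_state() and the "on"/"off" keywords are made up):

static void apply_state(struct cgroup_subsys_state *css, bool enable)
{
        /* hypothetical consumer of the parsed value */
}

static ssize_t example_state_write(struct kernfs_open_file *of,
                                   char *buf, size_t nbytes, loff_t off)
{
        struct cgroup_subsys_state *css = of_css(of);   /* css owning the file */
        bool enable;

        buf = strstrip(buf);                            /* kernfs hands us a writable copy */
        if (!strcmp(buf, "on"))
                enable = true;
        else if (!strcmp(buf, "off"))
                enable = false;
        else
                return -EINVAL;

        apply_state(css, enable);
        return nbytes;                                  /* consumed everything */
}

The matching cftype entry would set .write = example_state_write, exactly as the hunk below switches freezer's "state" file from .write_string to .write.
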
459 439
460static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css, 440static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css,
@@ -478,7 +458,7 @@ static struct cftype files[] = {
478 .name = "state", 458 .name = "state",
479 .flags = CFTYPE_NOT_ON_ROOT, 459 .flags = CFTYPE_NOT_ON_ROOT,
480 .seq_show = freezer_read, 460 .seq_show = freezer_read,
481 .write_string = freezer_write, 461 .write = freezer_write,
482 }, 462 },
483 { 463 {
484 .name = "self_freezing", 464 .name = "self_freezing",
diff --git a/kernel/compat.c b/kernel/compat.c
index e40b0430b562..633394f442f8 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -157,7 +157,7 @@ static int __compat_put_timespec(const struct timespec *ts, struct compat_timesp
157int compat_get_timeval(struct timeval *tv, const void __user *utv) 157int compat_get_timeval(struct timeval *tv, const void __user *utv)
158{ 158{
159 if (COMPAT_USE_64BIT_TIME) 159 if (COMPAT_USE_64BIT_TIME)
160 return copy_from_user(tv, utv, sizeof *tv) ? -EFAULT : 0; 160 return copy_from_user(tv, utv, sizeof(*tv)) ? -EFAULT : 0;
161 else 161 else
162 return __compat_get_timeval(tv, utv); 162 return __compat_get_timeval(tv, utv);
163} 163}
@@ -166,7 +166,7 @@ EXPORT_SYMBOL_GPL(compat_get_timeval);
166int compat_put_timeval(const struct timeval *tv, void __user *utv) 166int compat_put_timeval(const struct timeval *tv, void __user *utv)
167{ 167{
168 if (COMPAT_USE_64BIT_TIME) 168 if (COMPAT_USE_64BIT_TIME)
169 return copy_to_user(utv, tv, sizeof *tv) ? -EFAULT : 0; 169 return copy_to_user(utv, tv, sizeof(*tv)) ? -EFAULT : 0;
170 else 170 else
171 return __compat_put_timeval(tv, utv); 171 return __compat_put_timeval(tv, utv);
172} 172}
@@ -175,7 +175,7 @@ EXPORT_SYMBOL_GPL(compat_put_timeval);
175int compat_get_timespec(struct timespec *ts, const void __user *uts) 175int compat_get_timespec(struct timespec *ts, const void __user *uts)
176{ 176{
177 if (COMPAT_USE_64BIT_TIME) 177 if (COMPAT_USE_64BIT_TIME)
178 return copy_from_user(ts, uts, sizeof *ts) ? -EFAULT : 0; 178 return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0;
179 else 179 else
180 return __compat_get_timespec(ts, uts); 180 return __compat_get_timespec(ts, uts);
181} 181}
@@ -184,7 +184,7 @@ EXPORT_SYMBOL_GPL(compat_get_timespec);
184int compat_put_timespec(const struct timespec *ts, void __user *uts) 184int compat_put_timespec(const struct timespec *ts, void __user *uts)
185{ 185{
186 if (COMPAT_USE_64BIT_TIME) 186 if (COMPAT_USE_64BIT_TIME)
187 return copy_to_user(uts, ts, sizeof *ts) ? -EFAULT : 0; 187 return copy_to_user(uts, ts, sizeof(*ts)) ? -EFAULT : 0;
188 else 188 else
189 return __compat_put_timespec(ts, uts); 189 return __compat_put_timespec(ts, uts);
190} 190}
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 6cb20d2e7ee0..019d45008448 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -120,7 +120,7 @@ void context_tracking_user_enter(void)
120 * instead of preempt_schedule() to exit user context if needed before 120 * instead of preempt_schedule() to exit user context if needed before
121 * calling the scheduler. 121 * calling the scheduler.
122 */ 122 */
123asmlinkage void __sched notrace preempt_schedule_context(void) 123asmlinkage __visible void __sched notrace preempt_schedule_context(void)
124{ 124{
125 enum ctx_state prev_ctx; 125 enum ctx_state prev_ctx;
126 126
diff --git a/kernel/cpu.c b/kernel/cpu.c
index a9e710eef0e2..a343bde710b1 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -20,6 +20,7 @@
20#include <linux/gfp.h> 20#include <linux/gfp.h>
21#include <linux/suspend.h> 21#include <linux/suspend.h>
22#include <linux/lockdep.h> 22#include <linux/lockdep.h>
23#include <trace/events/power.h>
23 24
24#include "smpboot.h" 25#include "smpboot.h"
25 26
@@ -283,8 +284,7 @@ static inline void check_for_tasks(int cpu)
283 task_cputime(p, &utime, &stime); 284 task_cputime(p, &utime, &stime);
284 if (task_cpu(p) == cpu && p->state == TASK_RUNNING && 285 if (task_cpu(p) == cpu && p->state == TASK_RUNNING &&
285 (utime || stime)) 286 (utime || stime))
286 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d " 287 pr_warn("Task %s (pid = %d) is on cpu %d (state = %ld, flags = %x)\n",
287 "(state = %ld, flags = %x)\n",
288 p->comm, task_pid_nr(p), cpu, 288 p->comm, task_pid_nr(p), cpu,
289 p->state, p->flags); 289 p->state, p->flags);
290 } 290 }
@@ -336,8 +336,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
336 if (err) { 336 if (err) {
337 nr_calls--; 337 nr_calls--;
338 __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL); 338 __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL);
339 printk("%s: attempt to take down CPU %u failed\n", 339 pr_warn("%s: attempt to take down CPU %u failed\n",
340 __func__, cpu); 340 __func__, cpu);
341 goto out_release; 341 goto out_release;
342 } 342 }
343 343
@@ -444,8 +444,8 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen)
444 ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); 444 ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
445 if (ret) { 445 if (ret) {
446 nr_calls--; 446 nr_calls--;
447 printk(KERN_WARNING "%s: attempt to bring up CPU %u failed\n", 447 pr_warn("%s: attempt to bring up CPU %u failed\n",
448 __func__, cpu); 448 __func__, cpu);
449 goto out_notify; 449 goto out_notify;
450 } 450 }
451 451
@@ -475,11 +475,10 @@ int cpu_up(unsigned int cpu)
475 int err = 0; 475 int err = 0;
476 476
477 if (!cpu_possible(cpu)) { 477 if (!cpu_possible(cpu)) {
478 printk(KERN_ERR "can't online cpu %d because it is not " 478 pr_err("can't online cpu %d because it is not configured as may-hotadd at boot time\n",
479 "configured as may-hotadd at boot time\n", cpu); 479 cpu);
480#if defined(CONFIG_IA64) 480#if defined(CONFIG_IA64)
481 printk(KERN_ERR "please check additional_cpus= boot " 481 pr_err("please check additional_cpus= boot parameter\n");
482 "parameter\n");
483#endif 482#endif
484 return -EINVAL; 483 return -EINVAL;
485 } 484 }
@@ -518,16 +517,17 @@ int disable_nonboot_cpus(void)
518 */ 517 */
519 cpumask_clear(frozen_cpus); 518 cpumask_clear(frozen_cpus);
520 519
521 printk("Disabling non-boot CPUs ...\n"); 520 pr_info("Disabling non-boot CPUs ...\n");
522 for_each_online_cpu(cpu) { 521 for_each_online_cpu(cpu) {
523 if (cpu == first_cpu) 522 if (cpu == first_cpu)
524 continue; 523 continue;
524 trace_suspend_resume(TPS("CPU_OFF"), cpu, true);
525 error = _cpu_down(cpu, 1); 525 error = _cpu_down(cpu, 1);
526 trace_suspend_resume(TPS("CPU_OFF"), cpu, false);
526 if (!error) 527 if (!error)
527 cpumask_set_cpu(cpu, frozen_cpus); 528 cpumask_set_cpu(cpu, frozen_cpus);
528 else { 529 else {
529 printk(KERN_ERR "Error taking CPU%d down: %d\n", 530 pr_err("Error taking CPU%d down: %d\n", cpu, error);
530 cpu, error);
531 break; 531 break;
532 } 532 }
533 } 533 }
@@ -537,7 +537,7 @@ int disable_nonboot_cpus(void)
537 /* Make sure the CPUs won't be enabled by someone else */ 537 /* Make sure the CPUs won't be enabled by someone else */
538 cpu_hotplug_disabled = 1; 538 cpu_hotplug_disabled = 1;
539 } else { 539 } else {
540 printk(KERN_ERR "Non-boot CPUs are not disabled\n"); 540 pr_err("Non-boot CPUs are not disabled\n");
541 } 541 }
542 cpu_maps_update_done(); 542 cpu_maps_update_done();
543 return error; 543 return error;
@@ -561,17 +561,19 @@ void __ref enable_nonboot_cpus(void)
561 if (cpumask_empty(frozen_cpus)) 561 if (cpumask_empty(frozen_cpus))
562 goto out; 562 goto out;
563 563
564 printk(KERN_INFO "Enabling non-boot CPUs ...\n"); 564 pr_info("Enabling non-boot CPUs ...\n");
565 565
566 arch_enable_nonboot_cpus_begin(); 566 arch_enable_nonboot_cpus_begin();
567 567
568 for_each_cpu(cpu, frozen_cpus) { 568 for_each_cpu(cpu, frozen_cpus) {
569 trace_suspend_resume(TPS("CPU_ON"), cpu, true);
569 error = _cpu_up(cpu, 1); 570 error = _cpu_up(cpu, 1);
571 trace_suspend_resume(TPS("CPU_ON"), cpu, false);
570 if (!error) { 572 if (!error) {
571 printk(KERN_INFO "CPU%d is up\n", cpu); 573 pr_info("CPU%d is up\n", cpu);
572 continue; 574 continue;
573 } 575 }
574 printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); 576 pr_warn("Error taking CPU%d up: %d\n", cpu, error);
575 } 577 }
576 578
577 arch_enable_nonboot_cpus_end(); 579 arch_enable_nonboot_cpus_end();
@@ -726,10 +728,12 @@ void set_cpu_present(unsigned int cpu, bool present)
726 728
727void set_cpu_online(unsigned int cpu, bool online) 729void set_cpu_online(unsigned int cpu, bool online)
728{ 730{
729 if (online) 731 if (online) {
730 cpumask_set_cpu(cpu, to_cpumask(cpu_online_bits)); 732 cpumask_set_cpu(cpu, to_cpumask(cpu_online_bits));
731 else 733 cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits));
734 } else {
732 cpumask_clear_cpu(cpu, to_cpumask(cpu_online_bits)); 735 cpumask_clear_cpu(cpu, to_cpumask(cpu_online_bits));
736 }
733} 737}
734 738
735void set_cpu_active(unsigned int cpu, bool active) 739void set_cpu_active(unsigned int cpu, bool active)
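The cpu.c hunks are mostly mechanical printk(KERN_*) to pr_warn()/pr_err()/pr_info() conversions, with the format strings left unbroken so they stay greppable, plus suspend/resume tracepoints wrapped around _cpu_down()/_cpu_up(). A rough sketch of the pr_*() idiom the conversion relies on; the "cpu-hotplug:" prefix here is invented purely for illustration:

	#define pr_fmt(fmt) "cpu-hotplug: " fmt	/* must precede the includes */

	#include <linux/printk.h>

	static void report_down_failure(unsigned int cpu, int err)
	{
		/* expands to printk(KERN_WARNING pr_fmt("...")) */
		pr_warn("attempt to take down CPU %u failed: %d\n", cpu, err);
	}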
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 3d54c418bd06..f6b33c696224 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -61,12 +61,7 @@
61#include <linux/cgroup.h> 61#include <linux/cgroup.h>
62#include <linux/wait.h> 62#include <linux/wait.h>
63 63
64/* 64struct static_key cpusets_enabled_key __read_mostly = STATIC_KEY_INIT_FALSE;
65 * Tracks how many cpusets are currently defined in system.
66 * When there is only one cpuset (the root cpuset) we can
67 * short circuit some hooks.
68 */
69int number_of_cpusets __read_mostly;
70 65
71/* See "Frequency meter" comments, below. */ 66/* See "Frequency meter" comments, below. */
72 67
@@ -124,7 +119,7 @@ static inline struct cpuset *task_cs(struct task_struct *task)
124 119
125static inline struct cpuset *parent_cs(struct cpuset *cs) 120static inline struct cpuset *parent_cs(struct cpuset *cs)
126{ 121{
127 return css_cs(css_parent(&cs->css)); 122 return css_cs(cs->css.parent);
128} 123}
129 124
130#ifdef CONFIG_NUMA 125#ifdef CONFIG_NUMA
@@ -611,7 +606,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
611 goto done; 606 goto done;
612 } 607 }
613 608
614 csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL); 609 csa = kmalloc(nr_cpusets() * sizeof(cp), GFP_KERNEL);
615 if (!csa) 610 if (!csa)
616 goto done; 611 goto done;
617 csn = 0; 612 csn = 0;
@@ -696,11 +691,8 @@ restart:
696 if (nslot == ndoms) { 691 if (nslot == ndoms) {
697 static int warnings = 10; 692 static int warnings = 10;
698 if (warnings) { 693 if (warnings) {
699 printk(KERN_WARNING 694 pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n",
700 "rebuild_sched_domains confused:" 695 nslot, ndoms, csn, i, apn);
701 " nslot %d, ndoms %d, csn %d, i %d,"
702 " apn %d\n",
703 nslot, ndoms, csn, i, apn);
704 warnings--; 696 warnings--;
705 } 697 }
706 continue; 698 continue;
@@ -875,7 +867,7 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root)
875 continue; 867 continue;
876 } 868 }
877 } 869 }
878 if (!css_tryget(&cp->css)) 870 if (!css_tryget_online(&cp->css))
879 continue; 871 continue;
880 rcu_read_unlock(); 872 rcu_read_unlock();
881 873
@@ -890,6 +882,7 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root)
890/** 882/**
891 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it 883 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
892 * @cs: the cpuset to consider 884 * @cs: the cpuset to consider
885 * @trialcs: trial cpuset
893 * @buf: buffer of cpu numbers written to this cpuset 886 * @buf: buffer of cpu numbers written to this cpuset
894 */ 887 */
895static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, 888static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
@@ -1110,7 +1103,7 @@ static void update_tasks_nodemask_hier(struct cpuset *root_cs, bool update_root)
1110 continue; 1103 continue;
1111 } 1104 }
1112 } 1105 }
1113 if (!css_tryget(&cp->css)) 1106 if (!css_tryget_online(&cp->css))
1114 continue; 1107 continue;
1115 rcu_read_unlock(); 1108 rcu_read_unlock();
1116 1109
@@ -1605,13 +1598,15 @@ out_unlock:
1605/* 1598/*
1606 * Common handling for a write to a "cpus" or "mems" file. 1599 * Common handling for a write to a "cpus" or "mems" file.
1607 */ 1600 */
1608static int cpuset_write_resmask(struct cgroup_subsys_state *css, 1601static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
1609 struct cftype *cft, char *buf) 1602 char *buf, size_t nbytes, loff_t off)
1610{ 1603{
1611 struct cpuset *cs = css_cs(css); 1604 struct cpuset *cs = css_cs(of_css(of));
1612 struct cpuset *trialcs; 1605 struct cpuset *trialcs;
1613 int retval = -ENODEV; 1606 int retval = -ENODEV;
1614 1607
1608 buf = strstrip(buf);
1609
1615 /* 1610 /*
1616 * CPU or memory hotunplug may leave @cs w/o any execution 1611 * CPU or memory hotunplug may leave @cs w/o any execution
1617 * resources, in which case the hotplug code asynchronously updates 1612 * resources, in which case the hotplug code asynchronously updates
@@ -1635,7 +1630,7 @@ static int cpuset_write_resmask(struct cgroup_subsys_state *css,
1635 goto out_unlock; 1630 goto out_unlock;
1636 } 1631 }
1637 1632
1638 switch (cft->private) { 1633 switch (of_cft(of)->private) {
1639 case FILE_CPULIST: 1634 case FILE_CPULIST:
1640 retval = update_cpumask(cs, trialcs, buf); 1635 retval = update_cpumask(cs, trialcs, buf);
1641 break; 1636 break;
@@ -1650,7 +1645,7 @@ static int cpuset_write_resmask(struct cgroup_subsys_state *css,
1650 free_trial_cpuset(trialcs); 1645 free_trial_cpuset(trialcs);
1651out_unlock: 1646out_unlock:
1652 mutex_unlock(&cpuset_mutex); 1647 mutex_unlock(&cpuset_mutex);
1653 return retval; 1648 return retval ?: nbytes;
1654} 1649}
1655 1650
1656/* 1651/*
@@ -1752,7 +1747,7 @@ static struct cftype files[] = {
1752 { 1747 {
1753 .name = "cpus", 1748 .name = "cpus",
1754 .seq_show = cpuset_common_seq_show, 1749 .seq_show = cpuset_common_seq_show,
1755 .write_string = cpuset_write_resmask, 1750 .write = cpuset_write_resmask,
1756 .max_write_len = (100U + 6 * NR_CPUS), 1751 .max_write_len = (100U + 6 * NR_CPUS),
1757 .private = FILE_CPULIST, 1752 .private = FILE_CPULIST,
1758 }, 1753 },
@@ -1760,7 +1755,7 @@ static struct cftype files[] = {
1760 { 1755 {
1761 .name = "mems", 1756 .name = "mems",
1762 .seq_show = cpuset_common_seq_show, 1757 .seq_show = cpuset_common_seq_show,
1763 .write_string = cpuset_write_resmask, 1758 .write = cpuset_write_resmask,
1764 .max_write_len = (100U + 6 * MAX_NUMNODES), 1759 .max_write_len = (100U + 6 * MAX_NUMNODES),
1765 .private = FILE_MEMLIST, 1760 .private = FILE_MEMLIST,
1766 }, 1761 },
@@ -1888,7 +1883,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
1888 if (is_spread_slab(parent)) 1883 if (is_spread_slab(parent))
1889 set_bit(CS_SPREAD_SLAB, &cs->flags); 1884 set_bit(CS_SPREAD_SLAB, &cs->flags);
1890 1885
1891 number_of_cpusets++; 1886 cpuset_inc();
1892 1887
1893 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) 1888 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
1894 goto out_unlock; 1889 goto out_unlock;
@@ -1939,7 +1934,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
1939 if (is_sched_load_balance(cs)) 1934 if (is_sched_load_balance(cs))
1940 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); 1935 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
1941 1936
1942 number_of_cpusets--; 1937 cpuset_dec();
1943 clear_bit(CS_ONLINE, &cs->flags); 1938 clear_bit(CS_ONLINE, &cs->flags);
1944 1939
1945 mutex_unlock(&cpuset_mutex); 1940 mutex_unlock(&cpuset_mutex);
@@ -1992,7 +1987,6 @@ int __init cpuset_init(void)
1992 if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)) 1987 if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL))
1993 BUG(); 1988 BUG();
1994 1989
1995 number_of_cpusets = 1;
1996 return 0; 1990 return 0;
1997} 1991}
1998 1992
@@ -2017,7 +2011,7 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
2017 parent = parent_cs(parent); 2011 parent = parent_cs(parent);
2018 2012
2019 if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) { 2013 if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
2020 printk(KERN_ERR "cpuset: failed to transfer tasks out of empty cpuset "); 2014 pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
2021 pr_cont_cgroup_name(cs->css.cgroup); 2015 pr_cont_cgroup_name(cs->css.cgroup);
2022 pr_cont("\n"); 2016 pr_cont("\n");
2023 } 2017 }
@@ -2155,7 +2149,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
2155 2149
2156 rcu_read_lock(); 2150 rcu_read_lock();
2157 cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { 2151 cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
2158 if (cs == &top_cpuset || !css_tryget(&cs->css)) 2152 if (cs == &top_cpuset || !css_tryget_online(&cs->css))
2159 continue; 2153 continue;
2160 rcu_read_unlock(); 2154 rcu_read_unlock();
2161 2155
@@ -2536,7 +2530,7 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
2536 2530
2537/** 2531/**
2538 * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed 2532 * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed
2539 * @task: pointer to task_struct of some task. 2533 * @tsk: pointer to task_struct of some task.
2540 * 2534 *
2541 * Description: Prints @task's name, cpuset name, and cached copy of its 2535 * Description: Prints @task's name, cpuset name, and cached copy of its
2542 * mems_allowed to the kernel log. 2536 * mems_allowed to the kernel log.
@@ -2554,7 +2548,7 @@ void cpuset_print_task_mems_allowed(struct task_struct *tsk)
2554 cgrp = task_cs(tsk)->css.cgroup; 2548 cgrp = task_cs(tsk)->css.cgroup;
2555 nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN, 2549 nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
2556 tsk->mems_allowed); 2550 tsk->mems_allowed);
2557 printk(KERN_INFO "%s cpuset=", tsk->comm); 2551 pr_info("%s cpuset=", tsk->comm);
2558 pr_cont_cgroup_name(cgrp); 2552 pr_cont_cgroup_name(cgrp);
2559 pr_cont(" mems_allowed=%s\n", cpuset_nodelist); 2553 pr_cont(" mems_allowed=%s\n", cpuset_nodelist);
2560 2554
@@ -2646,10 +2640,10 @@ out:
2646/* Display task mems_allowed in /proc/<pid>/status file. */ 2640/* Display task mems_allowed in /proc/<pid>/status file. */
2647void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) 2641void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
2648{ 2642{
2649 seq_printf(m, "Mems_allowed:\t"); 2643 seq_puts(m, "Mems_allowed:\t");
2650 seq_nodemask(m, &task->mems_allowed); 2644 seq_nodemask(m, &task->mems_allowed);
2651 seq_printf(m, "\n"); 2645 seq_puts(m, "\n");
2652 seq_printf(m, "Mems_allowed_list:\t"); 2646 seq_puts(m, "Mems_allowed_list:\t");
2653 seq_nodemask_list(m, &task->mems_allowed); 2647 seq_nodemask_list(m, &task->mems_allowed);
2654 seq_printf(m, "\n"); 2648 seq_puts(m, "\n");
2655} 2649}
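The cpuset changes replace the open-coded number_of_cpusets counter with a static key, so the common "only the root cpuset exists" case costs a patched jump rather than a load and compare. A sketch of what the header side presumably looks like, reconstructed from the symbols in the hunks (cpusets_enabled_key, cpuset_inc(), cpuset_dec(), nr_cpusets()); treat the bodies as illustrative:

	#include <linux/atomic.h>
	#include <linux/jump_label.h>

	extern struct static_key cpusets_enabled_key;

	static inline bool cpusets_enabled(void)
	{
		return static_key_false(&cpusets_enabled_key);	/* patched branch */
	}

	static inline int nr_cpusets(void)
	{
		/* the root cpuset is never accounted in the key */
		return atomic_read(&cpusets_enabled_key.enabled) + 1;
	}

	static inline void cpuset_inc(void)
	{
		static_key_slow_inc(&cpusets_enabled_key);
	}

	static inline void cpuset_dec(void)
	{
		static_key_slow_dec(&cpusets_enabled_key);
	}

This is also why cpuset_init() no longer sets number_of_cpusets = 1: the root cpuset is implicit in nr_cpusets().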
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 2956c8da1605..1adf62b39b96 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -534,7 +534,7 @@ return_normal:
534 kgdb_info[cpu].exception_state &= 534 kgdb_info[cpu].exception_state &=
535 ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE); 535 ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE);
536 kgdb_info[cpu].enter_kgdb--; 536 kgdb_info[cpu].enter_kgdb--;
537 smp_mb__before_atomic_dec(); 537 smp_mb__before_atomic();
538 atomic_dec(&slaves_in_kgdb); 538 atomic_dec(&slaves_in_kgdb);
539 dbg_touch_watchdogs(); 539 dbg_touch_watchdogs();
540 local_irq_restore(flags); 540 local_irq_restore(flags);
@@ -662,7 +662,7 @@ kgdb_restore:
662 kgdb_info[cpu].exception_state &= 662 kgdb_info[cpu].exception_state &=
663 ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE); 663 ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE);
664 kgdb_info[cpu].enter_kgdb--; 664 kgdb_info[cpu].enter_kgdb--;
665 smp_mb__before_atomic_dec(); 665 smp_mb__before_atomic();
666 atomic_dec(&masters_in_kgdb); 666 atomic_dec(&masters_in_kgdb);
667 /* Free kgdb_active */ 667 /* Free kgdb_active */
668 atomic_set(&kgdb_active, -1); 668 atomic_set(&kgdb_active, -1);
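The debug_core change is part of the tree-wide barrier rename: smp_mb__before_atomic_dec() and friends collapse into smp_mb__before_atomic()/smp_mb__after_atomic(), which pair with any value-less atomic RMW. A tiny illustrative pairing:

	#include <linux/atomic.h>

	static void leave_and_signal(atomic_t *nr_in, int *done)
	{
		*done = 1;			/* publish state ... */
		smp_mb__before_atomic();	/* ... before the decrement is observed */
		atomic_dec(nr_in);
	}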
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c
index b03e0e814e43..fe15fff5df53 100644
--- a/kernel/debug/kdb/kdb_bt.c
+++ b/kernel/debug/kdb/kdb_bt.c
@@ -21,7 +21,7 @@
21static void kdb_show_stack(struct task_struct *p, void *addr) 21static void kdb_show_stack(struct task_struct *p, void *addr)
22{ 22{
23 int old_lvl = console_loglevel; 23 int old_lvl = console_loglevel;
24 console_loglevel = 15; 24 console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH;
25 kdb_trap_printk++; 25 kdb_trap_printk++;
26 kdb_set_current_task(p); 26 kdb_set_current_task(p);
27 if (addr) { 27 if (addr) {
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 14ff4849262c..7c70812caea5 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -710,7 +710,7 @@ kdb_printit:
710 } 710 }
711 if (logging) { 711 if (logging) {
712 saved_loglevel = console_loglevel; 712 saved_loglevel = console_loglevel;
713 console_loglevel = 0; 713 console_loglevel = CONSOLE_LOGLEVEL_SILENT;
714 printk(KERN_INFO "%s", kdb_buffer); 714 printk(KERN_INFO "%s", kdb_buffer);
715 } 715 }
716 716
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 0b097c8a1e50..2f7c760305ca 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -1091,7 +1091,7 @@ static int kdb_reboot(int argc, const char **argv)
1091static void kdb_dumpregs(struct pt_regs *regs) 1091static void kdb_dumpregs(struct pt_regs *regs)
1092{ 1092{
1093 int old_lvl = console_loglevel; 1093 int old_lvl = console_loglevel;
1094 console_loglevel = 15; 1094 console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH;
1095 kdb_trap_printk++; 1095 kdb_trap_printk++;
1096 show_regs(regs); 1096 show_regs(regs);
1097 kdb_trap_printk--; 1097 kdb_trap_printk--;
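The kdb hunks swap bare loglevel numbers for the named CONSOLE_LOGLEVEL_* constants from printk.h; the save/raise/restore pattern itself is unchanged. Condensed and illustrative only:

	#include <linux/printk.h>

	static void dump_verbosely(void)
	{
		int old_lvl = console_loglevel;

		console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH;	/* let everything through */
		/* ... emit diagnostics via printk() ... */
		console_loglevel = old_lvl;
	}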
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f83a71a3e46d..5fa58e4cffac 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -39,6 +39,7 @@
39#include <linux/hw_breakpoint.h> 39#include <linux/hw_breakpoint.h>
40#include <linux/mm_types.h> 40#include <linux/mm_types.h>
41#include <linux/cgroup.h> 41#include <linux/cgroup.h>
42#include <linux/module.h>
42 43
43#include "internal.h" 44#include "internal.h"
44 45
@@ -607,7 +608,8 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
607 if (!f.file) 608 if (!f.file)
608 return -EBADF; 609 return -EBADF;
609 610
610 css = css_tryget_from_dir(f.file->f_dentry, &perf_event_cgrp_subsys); 611 css = css_tryget_online_from_dir(f.file->f_dentry,
612 &perf_event_cgrp_subsys);
611 if (IS_ERR(css)) { 613 if (IS_ERR(css)) {
612 ret = PTR_ERR(css); 614 ret = PTR_ERR(css);
613 goto out; 615 goto out;
@@ -1443,6 +1445,11 @@ group_sched_out(struct perf_event *group_event,
1443 cpuctx->exclusive = 0; 1445 cpuctx->exclusive = 0;
1444} 1446}
1445 1447
1448struct remove_event {
1449 struct perf_event *event;
1450 bool detach_group;
1451};
1452
1446/* 1453/*
1447 * Cross CPU call to remove a performance event 1454 * Cross CPU call to remove a performance event
1448 * 1455 *
@@ -1451,12 +1458,15 @@ group_sched_out(struct perf_event *group_event,
1451 */ 1458 */
1452static int __perf_remove_from_context(void *info) 1459static int __perf_remove_from_context(void *info)
1453{ 1460{
1454 struct perf_event *event = info; 1461 struct remove_event *re = info;
1462 struct perf_event *event = re->event;
1455 struct perf_event_context *ctx = event->ctx; 1463 struct perf_event_context *ctx = event->ctx;
1456 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 1464 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1457 1465
1458 raw_spin_lock(&ctx->lock); 1466 raw_spin_lock(&ctx->lock);
1459 event_sched_out(event, cpuctx, ctx); 1467 event_sched_out(event, cpuctx, ctx);
1468 if (re->detach_group)
1469 perf_group_detach(event);
1460 list_del_event(event, ctx); 1470 list_del_event(event, ctx);
1461 if (!ctx->nr_events && cpuctx->task_ctx == ctx) { 1471 if (!ctx->nr_events && cpuctx->task_ctx == ctx) {
1462 ctx->is_active = 0; 1472 ctx->is_active = 0;
@@ -1481,10 +1491,14 @@ static int __perf_remove_from_context(void *info)
1481 * When called from perf_event_exit_task, it's OK because the 1491 * When called from perf_event_exit_task, it's OK because the
1482 * context has been detached from its task. 1492 * context has been detached from its task.
1483 */ 1493 */
1484static void perf_remove_from_context(struct perf_event *event) 1494static void perf_remove_from_context(struct perf_event *event, bool detach_group)
1485{ 1495{
1486 struct perf_event_context *ctx = event->ctx; 1496 struct perf_event_context *ctx = event->ctx;
1487 struct task_struct *task = ctx->task; 1497 struct task_struct *task = ctx->task;
1498 struct remove_event re = {
1499 .event = event,
1500 .detach_group = detach_group,
1501 };
1488 1502
1489 lockdep_assert_held(&ctx->mutex); 1503 lockdep_assert_held(&ctx->mutex);
1490 1504
@@ -1493,12 +1507,12 @@ static void perf_remove_from_context(struct perf_event *event)
1493 * Per cpu events are removed via an smp call and 1507 * Per cpu events are removed via an smp call and
1494 * the removal is always successful. 1508 * the removal is always successful.
1495 */ 1509 */
1496 cpu_function_call(event->cpu, __perf_remove_from_context, event); 1510 cpu_function_call(event->cpu, __perf_remove_from_context, &re);
1497 return; 1511 return;
1498 } 1512 }
1499 1513
1500retry: 1514retry:
1501 if (!task_function_call(task, __perf_remove_from_context, event)) 1515 if (!task_function_call(task, __perf_remove_from_context, &re))
1502 return; 1516 return;
1503 1517
1504 raw_spin_lock_irq(&ctx->lock); 1518 raw_spin_lock_irq(&ctx->lock);
@@ -1515,6 +1529,8 @@ retry:
1515 * Since the task isn't running, its safe to remove the event, us 1529 * Since the task isn't running, its safe to remove the event, us
1516 * holding the ctx->lock ensures the task won't get scheduled in. 1530 * holding the ctx->lock ensures the task won't get scheduled in.
1517 */ 1531 */
1532 if (detach_group)
1533 perf_group_detach(event);
1518 list_del_event(event, ctx); 1534 list_del_event(event, ctx);
1519 raw_spin_unlock_irq(&ctx->lock); 1535 raw_spin_unlock_irq(&ctx->lock);
1520} 1536}
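The perf hunks above thread a new struct remove_event through the cross-CPU call so the same callback can optionally detach the whole group; previously only a bare event pointer fit through the void *info argument. A stripped-down sketch of the pattern, with demo_cb() and demo_remove() as hypothetical stand-ins and the ctx->lock handling elided:

	struct remove_event {
		struct perf_event *event;
		bool detach_group;
	};

	static int demo_cb(void *info)
	{
		struct remove_event *re = info;

		/* ctx->lock and the is_active bookkeeping are elided */
		if (re->detach_group)
			perf_group_detach(re->event);
		list_del_event(re->event, re->event->ctx);
		return 0;
	}

	static void demo_remove(struct perf_event *event, bool detach_group)
	{
		struct remove_event re = {
			.event		= event,
			.detach_group	= detach_group,
		};

		/* the struct lives on the caller's stack for the synchronous call */
		cpu_function_call(event->cpu, demo_cb, &re);
	}

Because cpu_function_call()/task_function_call() run the callback synchronously, the on-stack struct is safe, which is also why __perf_event_exit_context() further down builds its remove_event on the stack.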
@@ -1663,6 +1679,8 @@ event_sched_in(struct perf_event *event,
1663 u64 tstamp = perf_event_time(event); 1679 u64 tstamp = perf_event_time(event);
1664 int ret = 0; 1680 int ret = 0;
1665 1681
1682 lockdep_assert_held(&ctx->lock);
1683
1666 if (event->state <= PERF_EVENT_STATE_OFF) 1684 if (event->state <= PERF_EVENT_STATE_OFF)
1667 return 0; 1685 return 0;
1668 1686
@@ -2956,6 +2974,22 @@ out:
2956 local_irq_restore(flags); 2974 local_irq_restore(flags);
2957} 2975}
2958 2976
2977void perf_event_exec(void)
2978{
2979 struct perf_event_context *ctx;
2980 int ctxn;
2981
2982 rcu_read_lock();
2983 for_each_task_context_nr(ctxn) {
2984 ctx = current->perf_event_ctxp[ctxn];
2985 if (!ctx)
2986 continue;
2987
2988 perf_event_enable_on_exec(ctx);
2989 }
2990 rcu_read_unlock();
2991}
2992
2959/* 2993/*
2960 * Cross CPU call to read the hardware event 2994 * Cross CPU call to read the hardware event
2961 */ 2995 */
@@ -3178,7 +3212,8 @@ static void free_event_rcu(struct rcu_head *head)
3178} 3212}
3179 3213
3180static void ring_buffer_put(struct ring_buffer *rb); 3214static void ring_buffer_put(struct ring_buffer *rb);
3181static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb); 3215static void ring_buffer_attach(struct perf_event *event,
3216 struct ring_buffer *rb);
3182 3217
3183static void unaccount_event_cpu(struct perf_event *event, int cpu) 3218static void unaccount_event_cpu(struct perf_event *event, int cpu)
3184{ 3219{
@@ -3229,17 +3264,19 @@ static void __free_event(struct perf_event *event)
3229 if (event->ctx) 3264 if (event->ctx)
3230 put_ctx(event->ctx); 3265 put_ctx(event->ctx);
3231 3266
3267 if (event->pmu)
3268 module_put(event->pmu->module);
3269
3232 call_rcu(&event->rcu_head, free_event_rcu); 3270 call_rcu(&event->rcu_head, free_event_rcu);
3233} 3271}
3234static void free_event(struct perf_event *event) 3272
3273static void _free_event(struct perf_event *event)
3235{ 3274{
3236 irq_work_sync(&event->pending); 3275 irq_work_sync(&event->pending);
3237 3276
3238 unaccount_event(event); 3277 unaccount_event(event);
3239 3278
3240 if (event->rb) { 3279 if (event->rb) {
3241 struct ring_buffer *rb;
3242
3243 /* 3280 /*
3244 * Can happen when we close an event with re-directed output. 3281 * Can happen when we close an event with re-directed output.
3245 * 3282 *
@@ -3247,57 +3284,38 @@ static void free_event(struct perf_event *event)
3247 * over us; possibly making our ring_buffer_put() the last. 3284 * over us; possibly making our ring_buffer_put() the last.
3248 */ 3285 */
3249 mutex_lock(&event->mmap_mutex); 3286 mutex_lock(&event->mmap_mutex);
3250 rb = event->rb; 3287 ring_buffer_attach(event, NULL);
3251 if (rb) {
3252 rcu_assign_pointer(event->rb, NULL);
3253 ring_buffer_detach(event, rb);
3254 ring_buffer_put(rb); /* could be last */
3255 }
3256 mutex_unlock(&event->mmap_mutex); 3288 mutex_unlock(&event->mmap_mutex);
3257 } 3289 }
3258 3290
3259 if (is_cgroup_event(event)) 3291 if (is_cgroup_event(event))
3260 perf_detach_cgroup(event); 3292 perf_detach_cgroup(event);
3261 3293
3262
3263 __free_event(event); 3294 __free_event(event);
3264} 3295}
3265 3296
3266int perf_event_release_kernel(struct perf_event *event) 3297/*
3298 * Used to free events which have a known refcount of 1, such as in error paths
3299 * where the event isn't exposed yet and inherited events.
3300 */
3301static void free_event(struct perf_event *event)
3267{ 3302{
3268 struct perf_event_context *ctx = event->ctx; 3303 if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
3269 3304 "unexpected event refcount: %ld; ptr=%p\n",
3270 WARN_ON_ONCE(ctx->parent_ctx); 3305 atomic_long_read(&event->refcount), event)) {
3271 /* 3306 /* leak to avoid use-after-free */
3272 * There are two ways this annotation is useful: 3307 return;
3273 * 3308 }
3274 * 1) there is a lock recursion from perf_event_exit_task
3275 * see the comment there.
3276 *
3277 * 2) there is a lock-inversion with mmap_sem through
3278 * perf_event_read_group(), which takes faults while
3279 * holding ctx->mutex, however this is called after
3280 * the last filedesc died, so there is no possibility
3281 * to trigger the AB-BA case.
3282 */
3283 mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
3284 raw_spin_lock_irq(&ctx->lock);
3285 perf_group_detach(event);
3286 raw_spin_unlock_irq(&ctx->lock);
3287 perf_remove_from_context(event);
3288 mutex_unlock(&ctx->mutex);
3289
3290 free_event(event);
3291 3309
3292 return 0; 3310 _free_event(event);
3293} 3311}
3294EXPORT_SYMBOL_GPL(perf_event_release_kernel);
3295 3312
3296/* 3313/*
3297 * Called when the last reference to the file is gone. 3314 * Called when the last reference to the file is gone.
3298 */ 3315 */
3299static void put_event(struct perf_event *event) 3316static void put_event(struct perf_event *event)
3300{ 3317{
3318 struct perf_event_context *ctx = event->ctx;
3301 struct task_struct *owner; 3319 struct task_struct *owner;
3302 3320
3303 if (!atomic_long_dec_and_test(&event->refcount)) 3321 if (!atomic_long_dec_and_test(&event->refcount))
@@ -3336,9 +3354,33 @@ static void put_event(struct perf_event *event)
3336 put_task_struct(owner); 3354 put_task_struct(owner);
3337 } 3355 }
3338 3356
3339 perf_event_release_kernel(event); 3357 WARN_ON_ONCE(ctx->parent_ctx);
3358 /*
3359 * There are two ways this annotation is useful:
3360 *
3361 * 1) there is a lock recursion from perf_event_exit_task
3362 * see the comment there.
3363 *
3364 * 2) there is a lock-inversion with mmap_sem through
3365 * perf_event_read_group(), which takes faults while
3366 * holding ctx->mutex, however this is called after
3367 * the last filedesc died, so there is no possibility
3368 * to trigger the AB-BA case.
3369 */
3370 mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
3371 perf_remove_from_context(event, true);
3372 mutex_unlock(&ctx->mutex);
3373
3374 _free_event(event);
3340} 3375}
3341 3376
3377int perf_event_release_kernel(struct perf_event *event)
3378{
3379 put_event(event);
3380 return 0;
3381}
3382EXPORT_SYMBOL_GPL(perf_event_release_kernel);
3383
3342static int perf_release(struct inode *inode, struct file *file) 3384static int perf_release(struct inode *inode, struct file *file)
3343{ 3385{
3344 put_event(file->private_data); 3386 put_event(file->private_data);
@@ -3839,28 +3881,47 @@ unlock:
3839static void ring_buffer_attach(struct perf_event *event, 3881static void ring_buffer_attach(struct perf_event *event,
3840 struct ring_buffer *rb) 3882 struct ring_buffer *rb)
3841{ 3883{
3884 struct ring_buffer *old_rb = NULL;
3842 unsigned long flags; 3885 unsigned long flags;
3843 3886
3844 if (!list_empty(&event->rb_entry)) 3887 if (event->rb) {
3845 return; 3888 /*
3889 * Should be impossible, we set this when removing
3890 * event->rb_entry and wait/clear when adding event->rb_entry.
3891 */
3892 WARN_ON_ONCE(event->rcu_pending);
3846 3893
3847 spin_lock_irqsave(&rb->event_lock, flags); 3894 old_rb = event->rb;
3848 if (list_empty(&event->rb_entry)) 3895 event->rcu_batches = get_state_synchronize_rcu();
3849 list_add(&event->rb_entry, &rb->event_list); 3896 event->rcu_pending = 1;
3850 spin_unlock_irqrestore(&rb->event_lock, flags);
3851}
3852 3897
3853static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb) 3898 spin_lock_irqsave(&old_rb->event_lock, flags);
3854{ 3899 list_del_rcu(&event->rb_entry);
3855 unsigned long flags; 3900 spin_unlock_irqrestore(&old_rb->event_lock, flags);
3901 }
3856 3902
3857 if (list_empty(&event->rb_entry)) 3903 if (event->rcu_pending && rb) {
3858 return; 3904 cond_synchronize_rcu(event->rcu_batches);
3905 event->rcu_pending = 0;
3906 }
3859 3907
3860 spin_lock_irqsave(&rb->event_lock, flags); 3908 if (rb) {
3861 list_del_init(&event->rb_entry); 3909 spin_lock_irqsave(&rb->event_lock, flags);
3862 wake_up_all(&event->waitq); 3910 list_add_rcu(&event->rb_entry, &rb->event_list);
3863 spin_unlock_irqrestore(&rb->event_lock, flags); 3911 spin_unlock_irqrestore(&rb->event_lock, flags);
3912 }
3913
3914 rcu_assign_pointer(event->rb, rb);
3915
3916 if (old_rb) {
3917 ring_buffer_put(old_rb);
3918 /*
3919 * Since we detached before setting the new rb, so that we
3920 * could attach the new rb, we could have missed a wakeup.
3921 * Provide it now.
3922 */
3923 wake_up_all(&event->waitq);
3924 }
3864} 3925}
3865 3926
3866static void ring_buffer_wakeup(struct perf_event *event) 3927static void ring_buffer_wakeup(struct perf_event *event)
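ring_buffer_attach() now handles both directions (a NULL rb detaches), replacing the separate ring_buffer_detach() and the open-coded rcu_assign_pointer() at every call site. The interesting piece is the conditional grace period; a sketch of that idiom, with the rcu_batches/rcu_pending field names taken from the hunk and the locking elided:

	static void demo_detach(struct perf_event *event)
	{
		/* snapshot RCU state when leaving the RCU-protected list */
		event->rcu_batches = get_state_synchronize_rcu();
		event->rcu_pending = 1;
		list_del_rcu(&event->rb_entry);
	}

	static void demo_attach(struct perf_event *event, struct ring_buffer *rb)
	{
		if (event->rcu_pending) {
			/* blocks only if that grace period has not yet elapsed */
			cond_synchronize_rcu(event->rcu_batches);
			event->rcu_pending = 0;
		}
		list_add_rcu(&event->rb_entry, &rb->event_list);
	}

Most of the time a full grace period has already passed between detach and re-attach, so cond_synchronize_rcu() returns immediately instead of paying for a synchronize_rcu() on every buffer redirect.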
@@ -3929,7 +3990,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
3929{ 3990{
3930 struct perf_event *event = vma->vm_file->private_data; 3991 struct perf_event *event = vma->vm_file->private_data;
3931 3992
3932 struct ring_buffer *rb = event->rb; 3993 struct ring_buffer *rb = ring_buffer_get(event);
3933 struct user_struct *mmap_user = rb->mmap_user; 3994 struct user_struct *mmap_user = rb->mmap_user;
3934 int mmap_locked = rb->mmap_locked; 3995 int mmap_locked = rb->mmap_locked;
3935 unsigned long size = perf_data_size(rb); 3996 unsigned long size = perf_data_size(rb);
@@ -3937,18 +3998,14 @@ static void perf_mmap_close(struct vm_area_struct *vma)
3937 atomic_dec(&rb->mmap_count); 3998 atomic_dec(&rb->mmap_count);
3938 3999
3939 if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) 4000 if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
3940 return; 4001 goto out_put;
3941 4002
3942 /* Detach current event from the buffer. */ 4003 ring_buffer_attach(event, NULL);
3943 rcu_assign_pointer(event->rb, NULL);
3944 ring_buffer_detach(event, rb);
3945 mutex_unlock(&event->mmap_mutex); 4004 mutex_unlock(&event->mmap_mutex);
3946 4005
3947 /* If there's still other mmap()s of this buffer, we're done. */ 4006 /* If there's still other mmap()s of this buffer, we're done. */
3948 if (atomic_read(&rb->mmap_count)) { 4007 if (atomic_read(&rb->mmap_count))
3949 ring_buffer_put(rb); /* can't be last */ 4008 goto out_put;
3950 return;
3951 }
3952 4009
3953 /* 4010 /*
3954 * No other mmap()s, detach from all other events that might redirect 4011 * No other mmap()s, detach from all other events that might redirect
@@ -3978,11 +4035,9 @@ again:
3978 * still restart the iteration to make sure we're not now 4035 * still restart the iteration to make sure we're not now
3979 * iterating the wrong list. 4036 * iterating the wrong list.
3980 */ 4037 */
3981 if (event->rb == rb) { 4038 if (event->rb == rb)
3982 rcu_assign_pointer(event->rb, NULL); 4039 ring_buffer_attach(event, NULL);
3983 ring_buffer_detach(event, rb); 4040
3984 ring_buffer_put(rb); /* can't be last, we still have one */
3985 }
3986 mutex_unlock(&event->mmap_mutex); 4041 mutex_unlock(&event->mmap_mutex);
3987 put_event(event); 4042 put_event(event);
3988 4043
@@ -4007,6 +4062,7 @@ again:
4007 vma->vm_mm->pinned_vm -= mmap_locked; 4062 vma->vm_mm->pinned_vm -= mmap_locked;
4008 free_uid(mmap_user); 4063 free_uid(mmap_user);
4009 4064
4065out_put:
4010 ring_buffer_put(rb); /* could be last */ 4066 ring_buffer_put(rb); /* could be last */
4011} 4067}
4012 4068
@@ -4124,7 +4180,6 @@ again:
4124 vma->vm_mm->pinned_vm += extra; 4180 vma->vm_mm->pinned_vm += extra;
4125 4181
4126 ring_buffer_attach(event, rb); 4182 ring_buffer_attach(event, rb);
4127 rcu_assign_pointer(event->rb, rb);
4128 4183
4129 perf_event_init_userpage(event); 4184 perf_event_init_userpage(event);
4130 perf_event_update_userpage(event); 4185 perf_event_update_userpage(event);
@@ -5036,21 +5091,9 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
5036 NULL); 5091 NULL);
5037} 5092}
5038 5093
5039void perf_event_comm(struct task_struct *task) 5094void perf_event_comm(struct task_struct *task, bool exec)
5040{ 5095{
5041 struct perf_comm_event comm_event; 5096 struct perf_comm_event comm_event;
5042 struct perf_event_context *ctx;
5043 int ctxn;
5044
5045 rcu_read_lock();
5046 for_each_task_context_nr(ctxn) {
5047 ctx = task->perf_event_ctxp[ctxn];
5048 if (!ctx)
5049 continue;
5050
5051 perf_event_enable_on_exec(ctx);
5052 }
5053 rcu_read_unlock();
5054 5097
5055 if (!atomic_read(&nr_comm_events)) 5098 if (!atomic_read(&nr_comm_events))
5056 return; 5099 return;
@@ -5062,7 +5105,7 @@ void perf_event_comm(struct task_struct *task)
5062 .event_id = { 5105 .event_id = {
5063 .header = { 5106 .header = {
5064 .type = PERF_RECORD_COMM, 5107 .type = PERF_RECORD_COMM,
5065 .misc = 0, 5108 .misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0,
5066 /* .size */ 5109 /* .size */
5067 }, 5110 },
5068 /* .pid */ 5111 /* .pid */
@@ -5408,6 +5451,9 @@ struct swevent_htable {
5408 5451
5409 /* Recursion avoidance in each contexts */ 5452 /* Recursion avoidance in each contexts */
5410 int recursion[PERF_NR_CONTEXTS]; 5453 int recursion[PERF_NR_CONTEXTS];
5454
5455 /* Keeps track of cpu being initialized/exited */
5456 bool online;
5411}; 5457};
5412 5458
5413static DEFINE_PER_CPU(struct swevent_htable, swevent_htable); 5459static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
@@ -5654,8 +5700,14 @@ static int perf_swevent_add(struct perf_event *event, int flags)
5654 hwc->state = !(flags & PERF_EF_START); 5700 hwc->state = !(flags & PERF_EF_START);
5655 5701
5656 head = find_swevent_head(swhash, event); 5702 head = find_swevent_head(swhash, event);
5657 if (WARN_ON_ONCE(!head)) 5703 if (!head) {
5704 /*
5705 * We can race with cpu hotplug code. Do not
5706 * WARN if the cpu just got unplugged.
5707 */
5708 WARN_ON_ONCE(swhash->online);
5658 return -EINVAL; 5709 return -EINVAL;
5710 }
5659 5711
5660 hlist_add_head_rcu(&event->hlist_entry, head); 5712 hlist_add_head_rcu(&event->hlist_entry, head);
5661 5713
@@ -6551,6 +6603,7 @@ free_pdc:
6551 free_percpu(pmu->pmu_disable_count); 6603 free_percpu(pmu->pmu_disable_count);
6552 goto unlock; 6604 goto unlock;
6553} 6605}
6606EXPORT_SYMBOL_GPL(perf_pmu_register);
6554 6607
6555void perf_pmu_unregister(struct pmu *pmu) 6608void perf_pmu_unregister(struct pmu *pmu)
6556{ 6609{
@@ -6572,6 +6625,7 @@ void perf_pmu_unregister(struct pmu *pmu)
6572 put_device(pmu->dev); 6625 put_device(pmu->dev);
6573 free_pmu_context(pmu); 6626 free_pmu_context(pmu);
6574} 6627}
6628EXPORT_SYMBOL_GPL(perf_pmu_unregister);
6575 6629
6576struct pmu *perf_init_event(struct perf_event *event) 6630struct pmu *perf_init_event(struct perf_event *event)
6577{ 6631{
@@ -6585,6 +6639,10 @@ struct pmu *perf_init_event(struct perf_event *event)
6585 pmu = idr_find(&pmu_idr, event->attr.type); 6639 pmu = idr_find(&pmu_idr, event->attr.type);
6586 rcu_read_unlock(); 6640 rcu_read_unlock();
6587 if (pmu) { 6641 if (pmu) {
6642 if (!try_module_get(pmu->module)) {
6643 pmu = ERR_PTR(-ENODEV);
6644 goto unlock;
6645 }
6588 event->pmu = pmu; 6646 event->pmu = pmu;
6589 ret = pmu->event_init(event); 6647 ret = pmu->event_init(event);
6590 if (ret) 6648 if (ret)
@@ -6593,6 +6651,10 @@ struct pmu *perf_init_event(struct perf_event *event)
6593 } 6651 }
6594 6652
6595 list_for_each_entry_rcu(pmu, &pmus, entry) { 6653 list_for_each_entry_rcu(pmu, &pmus, entry) {
6654 if (!try_module_get(pmu->module)) {
6655 pmu = ERR_PTR(-ENODEV);
6656 goto unlock;
6657 }
6596 event->pmu = pmu; 6658 event->pmu = pmu;
6597 ret = pmu->event_init(event); 6659 ret = pmu->event_init(event);
6598 if (!ret) 6660 if (!ret)
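Both PMU lookup paths now pin the PMU's owning module before calling event_init(), and __free_event() plus the error path drop it with module_put(); this is what makes modular PMUs safe to unload while events exist. A hedged sketch of the init-side pairing, with demo_init() as an invented wrapper:

	static int demo_init(struct perf_event *event, struct pmu *pmu)
	{
		int ret;

		/* NULL ->module (built-in PMU) is fine: try_module_get(NULL) succeeds */
		if (!try_module_get(pmu->module))
			return -ENODEV;

		event->pmu = pmu;
		ret = pmu->event_init(event);
		if (ret)
			module_put(pmu->module);	/* undo the pin on failure */
		return ret;
	}

The matching module_put() for the success case sits in __free_event(), shown earlier in this diff.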
@@ -6771,6 +6833,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
6771err_pmu: 6833err_pmu:
6772 if (event->destroy) 6834 if (event->destroy)
6773 event->destroy(event); 6835 event->destroy(event);
6836 module_put(pmu->module);
6774err_ns: 6837err_ns:
6775 if (event->ns) 6838 if (event->ns)
6776 put_pid_ns(event->ns); 6839 put_pid_ns(event->ns);
@@ -6914,7 +6977,7 @@ err_size:
6914static int 6977static int
6915perf_event_set_output(struct perf_event *event, struct perf_event *output_event) 6978perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
6916{ 6979{
6917 struct ring_buffer *rb = NULL, *old_rb = NULL; 6980 struct ring_buffer *rb = NULL;
6918 int ret = -EINVAL; 6981 int ret = -EINVAL;
6919 6982
6920 if (!output_event) 6983 if (!output_event)
@@ -6942,8 +7005,6 @@ set:
6942 if (atomic_read(&event->mmap_count)) 7005 if (atomic_read(&event->mmap_count))
6943 goto unlock; 7006 goto unlock;
6944 7007
6945 old_rb = event->rb;
6946
6947 if (output_event) { 7008 if (output_event) {
6948 /* get the rb we want to redirect to */ 7009 /* get the rb we want to redirect to */
6949 rb = ring_buffer_get(output_event); 7010 rb = ring_buffer_get(output_event);
@@ -6951,23 +7012,7 @@ set:
6951 goto unlock; 7012 goto unlock;
6952 } 7013 }
6953 7014
6954 if (old_rb) 7015 ring_buffer_attach(event, rb);
6955 ring_buffer_detach(event, old_rb);
6956
6957 if (rb)
6958 ring_buffer_attach(event, rb);
6959
6960 rcu_assign_pointer(event->rb, rb);
6961
6962 if (old_rb) {
6963 ring_buffer_put(old_rb);
6964 /*
6965 * Since we detached before setting the new rb, so that we
6966 * could attach the new rb, we could have missed a wakeup.
6967 * Provide it now.
6968 */
6969 wake_up_all(&event->waitq);
6970 }
6971 7016
6972 ret = 0; 7017 ret = 0;
6973unlock: 7018unlock:
@@ -7018,6 +7063,9 @@ SYSCALL_DEFINE5(perf_event_open,
7018 if (attr.freq) { 7063 if (attr.freq) {
7019 if (attr.sample_freq > sysctl_perf_event_sample_rate) 7064 if (attr.sample_freq > sysctl_perf_event_sample_rate)
7020 return -EINVAL; 7065 return -EINVAL;
7066 } else {
7067 if (attr.sample_period & (1ULL << 63))
7068 return -EINVAL;
7021 } 7069 }
7022 7070
7023 /* 7071 /*
@@ -7055,20 +7103,33 @@ SYSCALL_DEFINE5(perf_event_open,
7055 } 7103 }
7056 } 7104 }
7057 7105
7106 if (task && group_leader &&
7107 group_leader->attr.inherit != attr.inherit) {
7108 err = -EINVAL;
7109 goto err_task;
7110 }
7111
7058 get_online_cpus(); 7112 get_online_cpus();
7059 7113
7060 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, 7114 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
7061 NULL, NULL); 7115 NULL, NULL);
7062 if (IS_ERR(event)) { 7116 if (IS_ERR(event)) {
7063 err = PTR_ERR(event); 7117 err = PTR_ERR(event);
7064 goto err_task; 7118 goto err_cpus;
7065 } 7119 }
7066 7120
7067 if (flags & PERF_FLAG_PID_CGROUP) { 7121 if (flags & PERF_FLAG_PID_CGROUP) {
7068 err = perf_cgroup_connect(pid, event, &attr, group_leader); 7122 err = perf_cgroup_connect(pid, event, &attr, group_leader);
7069 if (err) { 7123 if (err) {
7070 __free_event(event); 7124 __free_event(event);
7071 goto err_task; 7125 goto err_cpus;
7126 }
7127 }
7128
7129 if (is_sampling_event(event)) {
7130 if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
7131 err = -ENOTSUPP;
7132 goto err_alloc;
7072 } 7133 }
7073 } 7134 }
7074 7135
@@ -7165,7 +7226,7 @@ SYSCALL_DEFINE5(perf_event_open,
7165 struct perf_event_context *gctx = group_leader->ctx; 7226 struct perf_event_context *gctx = group_leader->ctx;
7166 7227
7167 mutex_lock(&gctx->mutex); 7228 mutex_lock(&gctx->mutex);
7168 perf_remove_from_context(group_leader); 7229 perf_remove_from_context(group_leader, false);
7169 7230
7170 /* 7231 /*
7171 * Removing from the context ends up with disabled 7232 * Removing from the context ends up with disabled
@@ -7175,7 +7236,7 @@ SYSCALL_DEFINE5(perf_event_open,
7175 perf_event__state_init(group_leader); 7236 perf_event__state_init(group_leader);
7176 list_for_each_entry(sibling, &group_leader->sibling_list, 7237 list_for_each_entry(sibling, &group_leader->sibling_list,
7177 group_entry) { 7238 group_entry) {
7178 perf_remove_from_context(sibling); 7239 perf_remove_from_context(sibling, false);
7179 perf_event__state_init(sibling); 7240 perf_event__state_init(sibling);
7180 put_ctx(gctx); 7241 put_ctx(gctx);
7181 } 7242 }
@@ -7230,8 +7291,9 @@ err_context:
7230 put_ctx(ctx); 7291 put_ctx(ctx);
7231err_alloc: 7292err_alloc:
7232 free_event(event); 7293 free_event(event);
7233err_task: 7294err_cpus:
7234 put_online_cpus(); 7295 put_online_cpus();
7296err_task:
7235 if (task) 7297 if (task)
7236 put_task_struct(task); 7298 put_task_struct(task);
7237err_group_fd: 7299err_group_fd:
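The new group_leader->attr.inherit check fires before get_online_cpus(), so the error labels get split: err_task only drops the task reference, while failures after the CPU pin fall through err_cpus first. A generic sketch of that unwind ordering; demo_alloc() is hypothetical:

	static int demo_alloc(void)
	{
		return 0;	/* hypothetical, may fail with -errno */
	}

	static int demo_open(struct task_struct *task, bool bad_attr)
	{
		int err;

		if (bad_attr) {			/* checked before pinning CPUs */
			err = -EINVAL;
			goto err_task;
		}

		get_online_cpus();

		err = demo_alloc();
		if (err)
			goto err_cpus;

		put_online_cpus();
		return 0;

	err_cpus:
		put_online_cpus();		/* undo only what was actually taken */
	err_task:
		if (task)
			put_task_struct(task);
		return err;
	}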
@@ -7305,7 +7367,7 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
7305 mutex_lock(&src_ctx->mutex); 7367 mutex_lock(&src_ctx->mutex);
7306 list_for_each_entry_safe(event, tmp, &src_ctx->event_list, 7368 list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
7307 event_entry) { 7369 event_entry) {
7308 perf_remove_from_context(event); 7370 perf_remove_from_context(event, false);
7309 unaccount_event_cpu(event, src_cpu); 7371 unaccount_event_cpu(event, src_cpu);
7310 put_ctx(src_ctx); 7372 put_ctx(src_ctx);
7311 list_add(&event->migrate_entry, &events); 7373 list_add(&event->migrate_entry, &events);
@@ -7367,13 +7429,7 @@ __perf_event_exit_task(struct perf_event *child_event,
7367 struct perf_event_context *child_ctx, 7429 struct perf_event_context *child_ctx,
7368 struct task_struct *child) 7430 struct task_struct *child)
7369{ 7431{
7370 if (child_event->parent) { 7432 perf_remove_from_context(child_event, true);
7371 raw_spin_lock_irq(&child_ctx->lock);
7372 perf_group_detach(child_event);
7373 raw_spin_unlock_irq(&child_ctx->lock);
7374 }
7375
7376 perf_remove_from_context(child_event);
7377 7433
7378 /* 7434 /*
7379 * It can happen that the parent exits first, and has events 7435 * It can happen that the parent exits first, and has events
@@ -7388,7 +7444,7 @@ __perf_event_exit_task(struct perf_event *child_event,
7388 7444
7389static void perf_event_exit_task_context(struct task_struct *child, int ctxn) 7445static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
7390{ 7446{
7391 struct perf_event *child_event, *tmp; 7447 struct perf_event *child_event, *next;
7392 struct perf_event_context *child_ctx; 7448 struct perf_event_context *child_ctx;
7393 unsigned long flags; 7449 unsigned long flags;
7394 7450
@@ -7442,24 +7498,9 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
7442 */ 7498 */
7443 mutex_lock(&child_ctx->mutex); 7499 mutex_lock(&child_ctx->mutex);
7444 7500
7445again: 7501 list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
7446 list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups,
7447 group_entry)
7448 __perf_event_exit_task(child_event, child_ctx, child); 7502 __perf_event_exit_task(child_event, child_ctx, child);
7449 7503
7450 list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups,
7451 group_entry)
7452 __perf_event_exit_task(child_event, child_ctx, child);
7453
7454 /*
7455 * If the last event was a group event, it will have appended all
7456 * its siblings to the list, but we obtained 'tmp' before that which
7457 * will still point to the list head terminating the iteration.
7458 */
7459 if (!list_empty(&child_ctx->pinned_groups) ||
7460 !list_empty(&child_ctx->flexible_groups))
7461 goto again;
7462
7463 mutex_unlock(&child_ctx->mutex); 7504 mutex_unlock(&child_ctx->mutex);
7464 7505
7465 put_ctx(child_ctx); 7506 put_ctx(child_ctx);
@@ -7724,6 +7765,8 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
7724 * swapped under us. 7765 * swapped under us.
7725 */ 7766 */
7726 parent_ctx = perf_pin_task_context(parent, ctxn); 7767 parent_ctx = perf_pin_task_context(parent, ctxn);
7768 if (!parent_ctx)
7769 return 0;
7727 7770
7728 /* 7771 /*
7729 * No need to check if parent_ctx != NULL here; since we saw 7772 * No need to check if parent_ctx != NULL here; since we saw
@@ -7835,6 +7878,7 @@ static void perf_event_init_cpu(int cpu)
7835 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); 7878 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
7836 7879
7837 mutex_lock(&swhash->hlist_mutex); 7880 mutex_lock(&swhash->hlist_mutex);
7881 swhash->online = true;
7838 if (swhash->hlist_refcount > 0) { 7882 if (swhash->hlist_refcount > 0) {
7839 struct swevent_hlist *hlist; 7883 struct swevent_hlist *hlist;
7840 7884
@@ -7857,14 +7901,14 @@ static void perf_pmu_rotate_stop(struct pmu *pmu)
7857 7901
7858static void __perf_event_exit_context(void *__info) 7902static void __perf_event_exit_context(void *__info)
7859{ 7903{
7904 struct remove_event re = { .detach_group = false };
7860 struct perf_event_context *ctx = __info; 7905 struct perf_event_context *ctx = __info;
7861 struct perf_event *event;
7862 7906
7863 perf_pmu_rotate_stop(ctx->pmu); 7907 perf_pmu_rotate_stop(ctx->pmu);
7864 7908
7865 rcu_read_lock(); 7909 rcu_read_lock();
7866 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) 7910 list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry)
7867 __perf_remove_from_context(event); 7911 __perf_remove_from_context(&re);
7868 rcu_read_unlock(); 7912 rcu_read_unlock();
7869} 7913}
7870 7914
@@ -7892,6 +7936,7 @@ static void perf_event_exit_cpu(int cpu)
7892 perf_event_exit_cpu_context(cpu); 7936 perf_event_exit_cpu_context(cpu);
7893 7937
7894 mutex_lock(&swhash->hlist_mutex); 7938 mutex_lock(&swhash->hlist_mutex);
7939 swhash->online = false;
7895 swevent_hlist_release(swhash); 7940 swevent_hlist_release(swhash);
7896 mutex_unlock(&swhash->hlist_mutex); 7941 mutex_unlock(&swhash->hlist_mutex);
7897} 7942}
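The last two events/core.c hunks add a per-cpu online flag to the software-event hash, flipped under hlist_mutex from the CPU init/exit callbacks, so perf_swevent_add() can tell an expected hotplug race from a real bug before warning. Condensed, with demo_add() standing in for the add path:

	static int demo_add(struct swevent_htable *swhash, struct hlist_head *head)
	{
		if (!head) {
			/* a missing hlist is only a bug while the CPU is online */
			WARN_ON_ONCE(swhash->online);
			return -EINVAL;
		}
		/* ... hlist_add_head_rcu() etc. elided ... */
		return 0;
	}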
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 04709b66369d..c445e392e93f 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -36,6 +36,7 @@
36#include "../../mm/internal.h" /* munlock_vma_page */ 36#include "../../mm/internal.h" /* munlock_vma_page */
37#include <linux/percpu-rwsem.h> 37#include <linux/percpu-rwsem.h>
38#include <linux/task_work.h> 38#include <linux/task_work.h>
39#include <linux/shmem_fs.h>
39 40
40#include <linux/uprobes.h> 41#include <linux/uprobes.h>
41 42
@@ -60,8 +61,6 @@ static struct percpu_rw_semaphore dup_mmap_sem;
60 61
61/* Have a copy of original instruction */ 62/* Have a copy of original instruction */
62#define UPROBE_COPY_INSN 0 63#define UPROBE_COPY_INSN 0
63/* Can skip singlestep */
64#define UPROBE_SKIP_SSTEP 1
65 64
66struct uprobe { 65struct uprobe {
67 struct rb_node rb_node; /* node in the rb tree */ 66 struct rb_node rb_node; /* node in the rb tree */
@@ -129,7 +128,7 @@ struct xol_area {
129 */ 128 */
130static bool valid_vma(struct vm_area_struct *vma, bool is_register) 129static bool valid_vma(struct vm_area_struct *vma, bool is_register)
131{ 130{
132 vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_SHARED; 131 vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE;
133 132
134 if (is_register) 133 if (is_register)
135 flags |= VM_WRITE; 134 flags |= VM_WRITE;
@@ -281,18 +280,13 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
281 * supported by that architecture then we need to modify is_trap_at_addr and 280 * supported by that architecture then we need to modify is_trap_at_addr and
282 * uprobe_write_opcode accordingly. This would never be a problem for archs 281 * uprobe_write_opcode accordingly. This would never be a problem for archs
283 * that have fixed length instructions. 282 * that have fixed length instructions.
284 */ 283 *
285
286/*
287 * uprobe_write_opcode - write the opcode at a given virtual address. 284 * uprobe_write_opcode - write the opcode at a given virtual address.
288 * @mm: the probed process address space. 285 * @mm: the probed process address space.
289 * @vaddr: the virtual address to store the opcode. 286 * @vaddr: the virtual address to store the opcode.
290 * @opcode: opcode to be written at @vaddr. 287 * @opcode: opcode to be written at @vaddr.
291 * 288 *
292 * Called with mm->mmap_sem held (for read and with a reference to 289 * Called with mm->mmap_sem held for write.
293 * mm).
294 *
295 * For mm @mm, write the opcode at @vaddr.
296 * Return 0 (success) or a negative errno. 290 * Return 0 (success) or a negative errno.
297 */ 291 */
298int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, 292int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr,
@@ -312,21 +306,25 @@ retry:
312 if (ret <= 0) 306 if (ret <= 0)
313 goto put_old; 307 goto put_old;
314 308
309 ret = anon_vma_prepare(vma);
310 if (ret)
311 goto put_old;
312
315 ret = -ENOMEM; 313 ret = -ENOMEM;
316 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr); 314 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr);
317 if (!new_page) 315 if (!new_page)
318 goto put_old; 316 goto put_old;
319 317
320 __SetPageUptodate(new_page); 318 if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL))
319 goto put_new;
321 320
321 __SetPageUptodate(new_page);
322 copy_highpage(new_page, old_page); 322 copy_highpage(new_page, old_page);
323 copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); 323 copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
324 324
325 ret = anon_vma_prepare(vma);
326 if (ret)
327 goto put_new;
328
329 ret = __replace_page(vma, vaddr, old_page, new_page); 325 ret = __replace_page(vma, vaddr, old_page, new_page);
326 if (ret)
327 mem_cgroup_uncharge_page(new_page);
330 328
331put_new: 329put_new:
332 page_cache_release(new_page); 330 page_cache_release(new_page);
@@ -491,12 +489,9 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
491 uprobe->offset = offset; 489 uprobe->offset = offset;
492 init_rwsem(&uprobe->register_rwsem); 490 init_rwsem(&uprobe->register_rwsem);
493 init_rwsem(&uprobe->consumer_rwsem); 491 init_rwsem(&uprobe->consumer_rwsem);
494 /* For now assume that the instruction need not be single-stepped */
495 __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags);
496 492
497 /* add to uprobes_tree, sorted on inode:offset */ 493 /* add to uprobes_tree, sorted on inode:offset */
498 cur_uprobe = insert_uprobe(uprobe); 494 cur_uprobe = insert_uprobe(uprobe);
499
500 /* a uprobe exists for this inode:offset combination */ 495 /* a uprobe exists for this inode:offset combination */
501 if (cur_uprobe) { 496 if (cur_uprobe) {
502 kfree(uprobe); 497 kfree(uprobe);
@@ -542,14 +537,15 @@ static int __copy_insn(struct address_space *mapping, struct file *filp,
542 void *insn, int nbytes, loff_t offset) 537 void *insn, int nbytes, loff_t offset)
543{ 538{
544 struct page *page; 539 struct page *page;
545
546 if (!mapping->a_ops->readpage)
547 return -EIO;
548 /* 540 /*
549 * Ensure that the page that has the original instruction is 541 * Ensure that the page that has the original instruction is populated
550 * populated and in page-cache. 542 * and in page-cache. If ->readpage == NULL it must be shmem_mapping(),
543 * see uprobe_register().
551 */ 544 */
552 page = read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT, filp); 545 if (mapping->a_ops->readpage)
546 page = read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT, filp);
547 else
548 page = shmem_read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT);
553 if (IS_ERR(page)) 549 if (IS_ERR(page))
554 return PTR_ERR(page); 550 return PTR_ERR(page);
555 551
@@ -885,6 +881,9 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *
885 if (!uc->handler && !uc->ret_handler) 881 if (!uc->handler && !uc->ret_handler)
886 return -EINVAL; 882 return -EINVAL;
887 883
884 /* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */
885 if (!inode->i_mapping->a_ops->readpage && !shmem_mapping(inode->i_mapping))
886 return -EIO;
888 /* Racy, just to catch the obvious mistakes */ 887 /* Racy, just to catch the obvious mistakes */
889 if (offset > i_size_read(inode)) 888 if (offset > i_size_read(inode))
890 return -EINVAL; 889 return -EINVAL;
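uprobe_register() now accepts shmem-backed mappings even though they have no ->readpage, and __copy_insn() picks the matching page lookup. A small sketch of that selection, with demo_get_page() as an invented helper:

	#include <linux/pagemap.h>
	#include <linux/shmem_fs.h>

	static struct page *demo_get_page(struct address_space *mapping,
					  struct file *filp, loff_t offset)
	{
		pgoff_t index = offset >> PAGE_CACHE_SHIFT;

		if (mapping->a_ops->readpage)
			return read_mapping_page(mapping, index, filp);
		if (shmem_mapping(mapping))
			return shmem_read_mapping_page(mapping, index);
		return ERR_PTR(-EIO);	/* already rejected up front in uprobe_register() */
	}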
@@ -1296,14 +1295,8 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
1296 if (unlikely(!xol_vaddr)) 1295 if (unlikely(!xol_vaddr))
1297 return 0; 1296 return 0;
1298 1297
1299 /* Initialize the slot */ 1298 arch_uprobe_copy_ixol(area->page, xol_vaddr,
1300 copy_to_page(area->page, xol_vaddr, 1299 &uprobe->arch.ixol, sizeof(uprobe->arch.ixol));
1301 &uprobe->arch.ixol, sizeof(uprobe->arch.ixol));
1302 /*
1303 * We probably need flush_icache_user_range() but it needs vma.
1304 * This should work on supported architectures too.
1305 */
1306 flush_dcache_page(area->page);
1307 1300
1308 return xol_vaddr; 1301 return xol_vaddr;
1309} 1302}
@@ -1346,6 +1339,21 @@ static void xol_free_insn_slot(struct task_struct *tsk)
1346 } 1339 }
1347} 1340}
1348 1341
1342void __weak arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr,
1343 void *src, unsigned long len)
1344{
1345 /* Initialize the slot */
1346 copy_to_page(page, vaddr, src, len);
1347
1348 /*
1349 * We probably need flush_icache_user_range() but it needs vma.
 1350 * This should work on most architectures by default. If an
 1351 * architecture needs to do something different it can define
1352 * its own version of the function.
1353 */
1354 flush_dcache_page(page);
1355}
1356
1349/** 1357/**
1350 * uprobe_get_swbp_addr - compute address of swbp given post-swbp regs 1358 * uprobe_get_swbp_addr - compute address of swbp given post-swbp regs
1351 * @regs: Reflects the saved state of the task after it has hit a breakpoint 1359 * @regs: Reflects the saved state of the task after it has hit a breakpoint
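A hypothetical architecture override of the __weak arch_uprobe_copy_ixol() added above might look roughly like this sketch; the cache-maintenance helper is a placeholder, not an existing kernel function.

    #include <linux/highmem.h>
    #include <linux/string.h>
    #include <linux/uprobes.h>

    /* Hypothetical arch-specific override of the weak default above. */
    void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr,
                               void *src, unsigned long len)
    {
            void *kaddr = kmap_atomic(page);
            void *dst = kaddr + (vaddr & ~PAGE_MASK);

            /* Initialize the XOL slot. */
            memcpy(dst, src, len);

            /* Placeholder for whatever D-cache/I-cache maintenance the
             * architecture needs so the probed task sees the instruction. */
            arch_sync_icache_for_uprobe(page, vaddr, dst, len);

            kunmap_atomic(kaddr);
    }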
@@ -1357,6 +1365,16 @@ unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs)
1357 return instruction_pointer(regs) - UPROBE_SWBP_INSN_SIZE; 1365 return instruction_pointer(regs) - UPROBE_SWBP_INSN_SIZE;
1358} 1366}
1359 1367
1368unsigned long uprobe_get_trap_addr(struct pt_regs *regs)
1369{
1370 struct uprobe_task *utask = current->utask;
1371
1372 if (unlikely(utask && utask->active_uprobe))
1373 return utask->vaddr;
1374
1375 return instruction_pointer(regs);
1376}
1377
1360/* 1378/*
1361 * Called with no locks held. 1379 * Called with no locks held.
 1362 * Called in context of an exiting or an exec-ing thread. 1380 * Called in context of an exiting or an exec-ing thread.
@@ -1628,20 +1646,6 @@ bool uprobe_deny_signal(void)
1628 return true; 1646 return true;
1629} 1647}
1630 1648
1631/*
1632 * Avoid singlestepping the original instruction if the original instruction
1633 * is a NOP or can be emulated.
1634 */
1635static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs)
1636{
1637 if (test_bit(UPROBE_SKIP_SSTEP, &uprobe->flags)) {
1638 if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
1639 return true;
1640 clear_bit(UPROBE_SKIP_SSTEP, &uprobe->flags);
1641 }
1642 return false;
1643}
1644
1645static void mmf_recalc_uprobes(struct mm_struct *mm) 1649static void mmf_recalc_uprobes(struct mm_struct *mm)
1646{ 1650{
1647 struct vm_area_struct *vma; 1651 struct vm_area_struct *vma;
@@ -1868,13 +1872,13 @@ static void handle_swbp(struct pt_regs *regs)
1868 1872
1869 handler_chain(uprobe, regs); 1873 handler_chain(uprobe, regs);
1870 1874
1871 if (can_skip_sstep(uprobe, regs)) 1875 if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
1872 goto out; 1876 goto out;
1873 1877
1874 if (!pre_ssout(uprobe, regs, bp_vaddr)) 1878 if (!pre_ssout(uprobe, regs, bp_vaddr))
1875 return; 1879 return;
1876 1880
1877 /* can_skip_sstep() succeeded, or restart if can't singlestep */ 1881 /* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */
1878out: 1882out:
1879 put_uprobe(uprobe); 1883 put_uprobe(uprobe);
1880} 1884}
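handle_swbp() above now calls arch_uprobe_skip_sstep() unconditionally; the assumption is that the generic weak fallback simply declines to emulate, roughly as sketched here (not quoted from this patch):

    /* Assumed weak fallback: never emulate, always single-step out of line. */
    bool __weak arch_uprobe_skip_sstep(struct arch_uprobe *aup, struct pt_regs *regs)
    {
            return false;
    }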
@@ -1886,10 +1890,11 @@ out:
1886static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs) 1890static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
1887{ 1891{
1888 struct uprobe *uprobe; 1892 struct uprobe *uprobe;
1893 int err = 0;
1889 1894
1890 uprobe = utask->active_uprobe; 1895 uprobe = utask->active_uprobe;
1891 if (utask->state == UTASK_SSTEP_ACK) 1896 if (utask->state == UTASK_SSTEP_ACK)
1892 arch_uprobe_post_xol(&uprobe->arch, regs); 1897 err = arch_uprobe_post_xol(&uprobe->arch, regs);
1893 else if (utask->state == UTASK_SSTEP_TRAPPED) 1898 else if (utask->state == UTASK_SSTEP_TRAPPED)
1894 arch_uprobe_abort_xol(&uprobe->arch, regs); 1899 arch_uprobe_abort_xol(&uprobe->arch, regs);
1895 else 1900 else
@@ -1903,6 +1908,11 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
1903 spin_lock_irq(&current->sighand->siglock); 1908 spin_lock_irq(&current->sighand->siglock);
1904 recalc_sigpending(); /* see uprobe_deny_signal() */ 1909 recalc_sigpending(); /* see uprobe_deny_signal() */
1905 spin_unlock_irq(&current->sighand->siglock); 1910 spin_unlock_irq(&current->sighand->siglock);
1911
1912 if (unlikely(err)) {
1913 uprobe_warn(current, "execute the probed insn, sending SIGILL.");
1914 force_sig_info(SIGILL, SEND_SIG_FORCED, current);
1915 }
1906} 1916}
1907 1917
1908/* 1918/*
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index 0dbeae374225..83d4382f5699 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -37,7 +37,7 @@ static unsigned long ident_map[32] = {
37struct exec_domain default_exec_domain = { 37struct exec_domain default_exec_domain = {
38 .name = "Linux", /* name */ 38 .name = "Linux", /* name */
39 .handler = default_handler, /* lcall7 causes a seg fault. */ 39 .handler = default_handler, /* lcall7 causes a seg fault. */
40 .pers_low = 0, /* PER_LINUX personality. */ 40 .pers_low = 0, /* PER_LINUX personality. */
41 .pers_high = 0, /* PER_LINUX personality. */ 41 .pers_high = 0, /* PER_LINUX personality. */
42 .signal_map = ident_map, /* Identity map signals. */ 42 .signal_map = ident_map, /* Identity map signals. */
43 .signal_invmap = ident_map, /* - both ways. */ 43 .signal_invmap = ident_map, /* - both ways. */
@@ -83,7 +83,7 @@ lookup_exec_domain(unsigned int personality)
83 ep = &default_exec_domain; 83 ep = &default_exec_domain;
84out: 84out:
85 read_unlock(&exec_domains_lock); 85 read_unlock(&exec_domains_lock);
86 return (ep); 86 return ep;
87} 87}
88 88
89int 89int
@@ -110,8 +110,9 @@ register_exec_domain(struct exec_domain *ep)
110 110
111out: 111out:
112 write_unlock(&exec_domains_lock); 112 write_unlock(&exec_domains_lock);
113 return (err); 113 return err;
114} 114}
115EXPORT_SYMBOL(register_exec_domain);
115 116
116int 117int
117unregister_exec_domain(struct exec_domain *ep) 118unregister_exec_domain(struct exec_domain *ep)
@@ -133,6 +134,7 @@ unregister:
133 write_unlock(&exec_domains_lock); 134 write_unlock(&exec_domains_lock);
134 return 0; 135 return 0;
135} 136}
137EXPORT_SYMBOL(unregister_exec_domain);
136 138
137int __set_personality(unsigned int personality) 139int __set_personality(unsigned int personality)
138{ 140{
@@ -144,6 +146,7 @@ int __set_personality(unsigned int personality)
144 146
145 return 0; 147 return 0;
146} 148}
149EXPORT_SYMBOL(__set_personality);
147 150
148#ifdef CONFIG_PROC_FS 151#ifdef CONFIG_PROC_FS
149static int execdomains_proc_show(struct seq_file *m, void *v) 152static int execdomains_proc_show(struct seq_file *m, void *v)
@@ -188,8 +191,3 @@ SYSCALL_DEFINE1(personality, unsigned int, personality)
188 191
189 return old; 192 return old;
190} 193}
191
192
193EXPORT_SYMBOL(register_exec_domain);
194EXPORT_SYMBOL(unregister_exec_domain);
195EXPORT_SYMBOL(__set_personality);
diff --git a/kernel/exit.c b/kernel/exit.c
index 6ed6a1d552b5..e5c4668f1799 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -313,46 +313,7 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
313 } 313 }
314} 314}
315 315
316/* 316#ifdef CONFIG_MEMCG
317 * Let kernel threads use this to say that they allow a certain signal.
318 * Must not be used if kthread was cloned with CLONE_SIGHAND.
319 */
320int allow_signal(int sig)
321{
322 if (!valid_signal(sig) || sig < 1)
323 return -EINVAL;
324
325 spin_lock_irq(&current->sighand->siglock);
326 /* This is only needed for daemonize()'ed kthreads */
327 sigdelset(&current->blocked, sig);
328 /*
329 * Kernel threads handle their own signals. Let the signal code
330 * know it'll be handled, so that they don't get converted to
331 * SIGKILL or just silently dropped.
332 */
333 current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2;
334 recalc_sigpending();
335 spin_unlock_irq(&current->sighand->siglock);
336 return 0;
337}
338
339EXPORT_SYMBOL(allow_signal);
340
341int disallow_signal(int sig)
342{
343 if (!valid_signal(sig) || sig < 1)
344 return -EINVAL;
345
346 spin_lock_irq(&current->sighand->siglock);
347 current->sighand->action[(sig)-1].sa.sa_handler = SIG_IGN;
348 recalc_sigpending();
349 spin_unlock_irq(&current->sighand->siglock);
350 return 0;
351}
352
353EXPORT_SYMBOL(disallow_signal);
354
355#ifdef CONFIG_MM_OWNER
356/* 317/*
357 * A task is exiting. If it owned this mm, find a new owner for the mm. 318 * A task is exiting. If it owned this mm, find a new owner for the mm.
358 */ 319 */
@@ -395,14 +356,18 @@ retry:
395 } 356 }
396 357
397 /* 358 /*
398 * Search through everything else. We should not get 359 * Search through everything else, we should not get here often.
399 * here often
400 */ 360 */
401 do_each_thread(g, c) { 361 for_each_process(g) {
402 if (c->mm == mm) 362 if (g->flags & PF_KTHREAD)
403 goto assign_new_owner; 363 continue;
404 } while_each_thread(g, c); 364 for_each_thread(g, c) {
405 365 if (c->mm == mm)
366 goto assign_new_owner;
367 if (c->mm)
368 break;
369 }
370 }
406 read_unlock(&tasklist_lock); 371 read_unlock(&tasklist_lock);
407 /* 372 /*
408 * We found no owner yet mm_users > 1: this implies that we are 373 * We found no owner yet mm_users > 1: this implies that we are
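The rewritten loop above uses the for_each_process()/for_each_thread() idiom; a minimal, hypothetical example of the same pattern, which must run under read_lock(&tasklist_lock):

    #include <linux/sched.h>

    /* Illustrative only: count user threads that share a given mm. */
    static int count_mm_users_locked(struct mm_struct *mm)
    {
            struct task_struct *g, *t;
            int count = 0;

            for_each_process(g) {
                    if (g->flags & PF_KTHREAD)      /* kernel threads have no mm of their own */
                            continue;
                    for_each_thread(g, t) {
                            if (t->mm == mm)
                                    count++;
                    }
            }
            return count;
    }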
@@ -434,7 +399,7 @@ assign_new_owner:
434 task_unlock(c); 399 task_unlock(c);
435 put_task_struct(c); 400 put_task_struct(c);
436} 401}
437#endif /* CONFIG_MM_OWNER */ 402#endif /* CONFIG_MEMCG */
438 403
439/* 404/*
440 * Turn us into a lazy TLB process if we 405 * Turn us into a lazy TLB process if we
diff --git a/kernel/fork.c b/kernel/fork.c
index 54a8d26f612f..d2799d1fc952 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -150,15 +150,15 @@ void __weak arch_release_thread_info(struct thread_info *ti)
150static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, 150static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
151 int node) 151 int node)
152{ 152{
153 struct page *page = alloc_pages_node(node, THREADINFO_GFP_ACCOUNTED, 153 struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP,
154 THREAD_SIZE_ORDER); 154 THREAD_SIZE_ORDER);
155 155
156 return page ? page_address(page) : NULL; 156 return page ? page_address(page) : NULL;
157} 157}
158 158
159static inline void free_thread_info(struct thread_info *ti) 159static inline void free_thread_info(struct thread_info *ti)
160{ 160{
161 free_memcg_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER); 161 free_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER);
162} 162}
163# else 163# else
164static struct kmem_cache *thread_info_cache; 164static struct kmem_cache *thread_info_cache;
@@ -1099,12 +1099,12 @@ static void rt_mutex_init_task(struct task_struct *p)
1099#endif 1099#endif
1100} 1100}
1101 1101
1102#ifdef CONFIG_MM_OWNER 1102#ifdef CONFIG_MEMCG
1103void mm_init_owner(struct mm_struct *mm, struct task_struct *p) 1103void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
1104{ 1104{
1105 mm->owner = p; 1105 mm->owner = p;
1106} 1106}
1107#endif /* CONFIG_MM_OWNER */ 1107#endif /* CONFIG_MEMCG */
1108 1108
1109/* 1109/*
1110 * Initialize POSIX timer handling for a single task. 1110 * Initialize POSIX timer handling for a single task.
@@ -1606,10 +1606,12 @@ long do_fork(unsigned long clone_flags,
1606 */ 1606 */
1607 if (!IS_ERR(p)) { 1607 if (!IS_ERR(p)) {
1608 struct completion vfork; 1608 struct completion vfork;
1609 struct pid *pid;
1609 1610
1610 trace_sched_process_fork(current, p); 1611 trace_sched_process_fork(current, p);
1611 1612
1612 nr = task_pid_vnr(p); 1613 pid = get_task_pid(p, PIDTYPE_PID);
1614 nr = pid_vnr(pid);
1613 1615
1614 if (clone_flags & CLONE_PARENT_SETTID) 1616 if (clone_flags & CLONE_PARENT_SETTID)
1615 put_user(nr, parent_tidptr); 1617 put_user(nr, parent_tidptr);
@@ -1624,12 +1626,14 @@ long do_fork(unsigned long clone_flags,
1624 1626
1625 /* forking complete and child started to run, tell ptracer */ 1627 /* forking complete and child started to run, tell ptracer */
1626 if (unlikely(trace)) 1628 if (unlikely(trace))
1627 ptrace_event(trace, nr); 1629 ptrace_event_pid(trace, pid);
1628 1630
1629 if (clone_flags & CLONE_VFORK) { 1631 if (clone_flags & CLONE_VFORK) {
1630 if (!wait_for_vfork_done(p, &vfork)) 1632 if (!wait_for_vfork_done(p, &vfork))
1631 ptrace_event(PTRACE_EVENT_VFORK_DONE, nr); 1633 ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
1632 } 1634 }
1635
1636 put_pid(pid);
1633 } else { 1637 } else {
1634 nr = PTR_ERR(p); 1638 nr = PTR_ERR(p);
1635 } 1639 }
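The switch from a raw pid_t to a pinned struct pid avoids handing out a number that may already have been recycled by the time it is used; the pattern in isolation (illustrative only):

    #include <linux/pid.h>
    #include <linux/printk.h>
    #include <linux/sched.h>

    /* Illustrative only: pin the pid so later use cannot race with pid reuse. */
    static void report_child(struct task_struct *child)
    {
            struct pid *pid = get_task_pid(child, PIDTYPE_PID); /* takes a reference */
            pid_t nr = pid_vnr(pid);        /* numeric pid in current's namespace */

            pr_info("child pid %d\n", nr);  /* hypothetical consumer of the value */

            put_pid(pid);                   /* drop the reference when done */
    }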
diff --git a/kernel/futex.c b/kernel/futex.c
index 5f589279e462..b632b5f3f094 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -267,7 +267,7 @@ static inline void futex_get_mm(union futex_key *key)
267 * get_futex_key() implies a full barrier. This is relied upon 267 * get_futex_key() implies a full barrier. This is relied upon
268 * as full barrier (B), see the ordering comment above. 268 * as full barrier (B), see the ordering comment above.
269 */ 269 */
270 smp_mb__after_atomic_inc(); 270 smp_mb__after_atomic();
271} 271}
272 272
273/* 273/*
@@ -280,7 +280,7 @@ static inline void hb_waiters_inc(struct futex_hash_bucket *hb)
280 /* 280 /*
281 * Full barrier (A), see the ordering comment above. 281 * Full barrier (A), see the ordering comment above.
282 */ 282 */
283 smp_mb__after_atomic_inc(); 283 smp_mb__after_atomic();
284#endif 284#endif
285} 285}
286 286
@@ -743,6 +743,55 @@ void exit_pi_state_list(struct task_struct *curr)
743 raw_spin_unlock_irq(&curr->pi_lock); 743 raw_spin_unlock_irq(&curr->pi_lock);
744} 744}
745 745
746/*
747 * We need to check the following states:
748 *
749 * Waiter | pi_state | pi->owner | uTID | uODIED | ?
750 *
751 * [1] NULL | --- | --- | 0 | 0/1 | Valid
752 * [2] NULL | --- | --- | >0 | 0/1 | Valid
753 *
754 * [3] Found | NULL | -- | Any | 0/1 | Invalid
755 *
756 * [4] Found | Found | NULL | 0 | 1 | Valid
757 * [5] Found | Found | NULL | >0 | 1 | Invalid
758 *
759 * [6] Found | Found | task | 0 | 1 | Valid
760 *
761 * [7] Found | Found | NULL | Any | 0 | Invalid
762 *
763 * [8] Found | Found | task | ==taskTID | 0/1 | Valid
764 * [9] Found | Found | task | 0 | 0 | Invalid
765 * [10] Found | Found | task | !=taskTID | 0/1 | Invalid
766 *
767 * [1] Indicates that the kernel can acquire the futex atomically. We
 768 * came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
769 *
770 * [2] Valid, if TID does not belong to a kernel thread. If no matching
771 * thread is found then it indicates that the owner TID has died.
772 *
773 * [3] Invalid. The waiter is queued on a non PI futex
774 *
775 * [4] Valid state after exit_robust_list(), which sets the user space
776 * value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
777 *
778 * [5] The user space value got manipulated between exit_robust_list()
779 * and exit_pi_state_list()
780 *
781 * [6] Valid state after exit_pi_state_list() which sets the new owner in
782 * the pi_state but cannot access the user space value.
783 *
784 * [7] pi_state->owner can only be NULL when the OWNER_DIED bit is set.
785 *
786 * [8] Owner and user space value match
787 *
788 * [9] There is no transient state which sets the user space TID to 0
789 * except exit_robust_list(), but this is indicated by the
790 * FUTEX_OWNER_DIED bit. See [4]
791 *
792 * [10] There is no transient state which leaves owner and user space
793 * TID out of sync.
794 */
746static int 795static int
747lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, 796lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
748 union futex_key *key, struct futex_pi_state **ps) 797 union futex_key *key, struct futex_pi_state **ps)
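The state table above reasons about the user-space futex word (uval): the owner TID lives in the low bits, and FUTEX_WAITERS plus FUTEX_OWNER_DIED occupy the top bits. A minimal sketch mapping the table's columns onto uval; the helper names are illustrative:

    #include <linux/futex.h>        /* FUTEX_TID_MASK, FUTEX_WAITERS, FUTEX_OWNER_DIED */
    #include <linux/types.h>

    static inline pid_t futex_uval_tid(u32 uval)
    {
            return uval & FUTEX_TID_MASK;           /* "uTID" column, 0 means unowned */
    }

    static inline bool futex_uval_owner_died(u32 uval)
    {
            return uval & FUTEX_OWNER_DIED;         /* "uODIED" column */
    }

    static inline bool futex_uval_has_waiters(u32 uval)
    {
            return uval & FUTEX_WAITERS;            /* kernel state exists (or is stale) */
    }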
@@ -755,12 +804,13 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
755 plist_for_each_entry_safe(this, next, &hb->chain, list) { 804 plist_for_each_entry_safe(this, next, &hb->chain, list) {
756 if (match_futex(&this->key, key)) { 805 if (match_futex(&this->key, key)) {
757 /* 806 /*
758 * Another waiter already exists - bump up 807 * Sanity check the waiter before increasing
759 * the refcount and return its pi_state: 808 * the refcount and attaching to it.
760 */ 809 */
761 pi_state = this->pi_state; 810 pi_state = this->pi_state;
762 /* 811 /*
763 * Userspace might have messed up non-PI and PI futexes 812 * Userspace might have messed up non-PI and
813 * PI futexes [3]
764 */ 814 */
765 if (unlikely(!pi_state)) 815 if (unlikely(!pi_state))
766 return -EINVAL; 816 return -EINVAL;
@@ -768,34 +818,70 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
768 WARN_ON(!atomic_read(&pi_state->refcount)); 818 WARN_ON(!atomic_read(&pi_state->refcount));
769 819
770 /* 820 /*
771 * When pi_state->owner is NULL then the owner died 821 * Handle the owner died case:
772 * and another waiter is on the fly. pi_state->owner
773 * is fixed up by the task which acquires
774 * pi_state->rt_mutex.
775 *
776 * We do not check for pid == 0 which can happen when
777 * the owner died and robust_list_exit() cleared the
778 * TID.
779 */ 822 */
780 if (pid && pi_state->owner) { 823 if (uval & FUTEX_OWNER_DIED) {
824 /*
825 * exit_pi_state_list sets owner to NULL and
826 * wakes the topmost waiter. The task which
827 * acquires the pi_state->rt_mutex will fixup
828 * owner.
829 */
830 if (!pi_state->owner) {
831 /*
832 * No pi state owner, but the user
833 * space TID is not 0. Inconsistent
834 * state. [5]
835 */
836 if (pid)
837 return -EINVAL;
838 /*
839 * Take a ref on the state and
840 * return. [4]
841 */
842 goto out_state;
843 }
844
781 /* 845 /*
782 * Bail out if user space manipulated the 846 * If TID is 0, then either the dying owner
783 * futex value. 847 * has not yet executed exit_pi_state_list()
848 * or some waiter acquired the rtmutex in the
849 * pi state, but did not yet fixup the TID in
850 * user space.
851 *
852 * Take a ref on the state and return. [6]
784 */ 853 */
785 if (pid != task_pid_vnr(pi_state->owner)) 854 if (!pid)
855 goto out_state;
856 } else {
857 /*
858 * If the owner died bit is not set,
859 * then the pi_state must have an
860 * owner. [7]
861 */
862 if (!pi_state->owner)
786 return -EINVAL; 863 return -EINVAL;
787 } 864 }
788 865
866 /*
867 * Bail out if user space manipulated the
868 * futex value. If pi state exists then the
869 * owner TID must be the same as the user
870 * space TID. [9/10]
871 */
872 if (pid != task_pid_vnr(pi_state->owner))
873 return -EINVAL;
874
875 out_state:
789 atomic_inc(&pi_state->refcount); 876 atomic_inc(&pi_state->refcount);
790 *ps = pi_state; 877 *ps = pi_state;
791
792 return 0; 878 return 0;
793 } 879 }
794 } 880 }
795 881
796 /* 882 /*
797 * We are the first waiter - try to look up the real owner and attach 883 * We are the first waiter - try to look up the real owner and attach
798 * the new pi_state to it, but bail out when TID = 0 884 * the new pi_state to it, but bail out when TID = 0 [1]
799 */ 885 */
800 if (!pid) 886 if (!pid)
801 return -ESRCH; 887 return -ESRCH;
@@ -803,6 +889,11 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
803 if (!p) 889 if (!p)
804 return -ESRCH; 890 return -ESRCH;
805 891
892 if (!p->mm) {
893 put_task_struct(p);
894 return -EPERM;
895 }
896
806 /* 897 /*
807 * We need to look at the task state flags to figure out, 898 * We need to look at the task state flags to figure out,
808 * whether the task is exiting. To protect against the do_exit 899 * whether the task is exiting. To protect against the do_exit
@@ -823,6 +914,9 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
823 return ret; 914 return ret;
824 } 915 }
825 916
917 /*
918 * No existing pi state. First waiter. [2]
919 */
826 pi_state = alloc_pi_state(); 920 pi_state = alloc_pi_state();
827 921
828 /* 922 /*
@@ -894,10 +988,18 @@ retry:
894 return -EDEADLK; 988 return -EDEADLK;
895 989
896 /* 990 /*
897 * Surprise - we got the lock. Just return to userspace: 991 * Surprise - we got the lock, but we do not trust user space at all.
898 */ 992 */
899 if (unlikely(!curval)) 993 if (unlikely(!curval)) {
900 return 1; 994 /*
995 * We verify whether there is kernel state for this
996 * futex. If not, we can safely assume, that the 0 ->
997 * TID transition is correct. If state exists, we do
998 * not bother to fixup the user space state as it was
999 * corrupted already.
1000 */
1001 return futex_top_waiter(hb, key) ? -EINVAL : 1;
1002 }
901 1003
902 uval = curval; 1004 uval = curval;
903 1005
@@ -1028,6 +1130,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
1028 struct task_struct *new_owner; 1130 struct task_struct *new_owner;
1029 struct futex_pi_state *pi_state = this->pi_state; 1131 struct futex_pi_state *pi_state = this->pi_state;
1030 u32 uninitialized_var(curval), newval; 1132 u32 uninitialized_var(curval), newval;
1133 int ret = 0;
1031 1134
1032 if (!pi_state) 1135 if (!pi_state)
1033 return -EINVAL; 1136 return -EINVAL;
@@ -1051,23 +1154,19 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
1051 new_owner = this->task; 1154 new_owner = this->task;
1052 1155
1053 /* 1156 /*
1054 * We pass it to the next owner. (The WAITERS bit is always 1157 * We pass it to the next owner. The WAITERS bit is always
1055 * kept enabled while there is PI state around. We must also 1158 * kept enabled while there is PI state around. We cleanup the
1056 * preserve the owner died bit.) 1159 * owner died bit, because we are the owner.
1057 */ 1160 */
1058 if (!(uval & FUTEX_OWNER_DIED)) { 1161 newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
1059 int ret = 0;
1060 1162
1061 newval = FUTEX_WAITERS | task_pid_vnr(new_owner); 1163 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
1062 1164 ret = -EFAULT;
1063 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) 1165 else if (curval != uval)
1064 ret = -EFAULT; 1166 ret = -EINVAL;
1065 else if (curval != uval) 1167 if (ret) {
1066 ret = -EINVAL; 1168 raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
1067 if (ret) { 1169 return ret;
1068 raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
1069 return ret;
1070 }
1071 } 1170 }
1072 1171
1073 raw_spin_lock_irq(&pi_state->owner->pi_lock); 1172 raw_spin_lock_irq(&pi_state->owner->pi_lock);
@@ -1347,7 +1446,7 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
1347 * 1446 *
1348 * Return: 1447 * Return:
1349 * 0 - failed to acquire the lock atomically; 1448 * 0 - failed to acquire the lock atomically;
1350 * 1 - acquired the lock; 1449 * >0 - acquired the lock, return value is vpid of the top_waiter
1351 * <0 - error 1450 * <0 - error
1352 */ 1451 */
1353static int futex_proxy_trylock_atomic(u32 __user *pifutex, 1452static int futex_proxy_trylock_atomic(u32 __user *pifutex,
@@ -1358,7 +1457,7 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1358{ 1457{
1359 struct futex_q *top_waiter = NULL; 1458 struct futex_q *top_waiter = NULL;
1360 u32 curval; 1459 u32 curval;
1361 int ret; 1460 int ret, vpid;
1362 1461
1363 if (get_futex_value_locked(&curval, pifutex)) 1462 if (get_futex_value_locked(&curval, pifutex))
1364 return -EFAULT; 1463 return -EFAULT;
@@ -1386,11 +1485,13 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1386 * the contended case or if set_waiters is 1. The pi_state is returned 1485 * the contended case or if set_waiters is 1. The pi_state is returned
1387 * in ps in contended cases. 1486 * in ps in contended cases.
1388 */ 1487 */
1488 vpid = task_pid_vnr(top_waiter->task);
1389 ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, 1489 ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
1390 set_waiters); 1490 set_waiters);
1391 if (ret == 1) 1491 if (ret == 1) {
1392 requeue_pi_wake_futex(top_waiter, key2, hb2); 1492 requeue_pi_wake_futex(top_waiter, key2, hb2);
1393 1493 return vpid;
1494 }
1394 return ret; 1495 return ret;
1395} 1496}
1396 1497
@@ -1421,10 +1522,16 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
1421 struct futex_pi_state *pi_state = NULL; 1522 struct futex_pi_state *pi_state = NULL;
1422 struct futex_hash_bucket *hb1, *hb2; 1523 struct futex_hash_bucket *hb1, *hb2;
1423 struct futex_q *this, *next; 1524 struct futex_q *this, *next;
1424 u32 curval2;
1425 1525
1426 if (requeue_pi) { 1526 if (requeue_pi) {
1427 /* 1527 /*
1528 * Requeue PI only works on two distinct uaddrs. This
1529 * check is only valid for private futexes. See below.
1530 */
1531 if (uaddr1 == uaddr2)
1532 return -EINVAL;
1533
1534 /*
1428 * requeue_pi requires a pi_state, try to allocate it now 1535 * requeue_pi requires a pi_state, try to allocate it now
1429 * without any locks in case it fails. 1536 * without any locks in case it fails.
1430 */ 1537 */
@@ -1462,6 +1569,15 @@ retry:
1462 if (unlikely(ret != 0)) 1569 if (unlikely(ret != 0))
1463 goto out_put_key1; 1570 goto out_put_key1;
1464 1571
1572 /*
1573 * The check above which compares uaddrs is not sufficient for
1574 * shared futexes. We need to compare the keys:
1575 */
1576 if (requeue_pi && match_futex(&key1, &key2)) {
1577 ret = -EINVAL;
1578 goto out_put_keys;
1579 }
1580
1465 hb1 = hash_futex(&key1); 1581 hb1 = hash_futex(&key1);
1466 hb2 = hash_futex(&key2); 1582 hb2 = hash_futex(&key2);
1467 1583
@@ -1509,16 +1625,25 @@ retry_private:
1509 * At this point the top_waiter has either taken uaddr2 or is 1625 * At this point the top_waiter has either taken uaddr2 or is
1510 * waiting on it. If the former, then the pi_state will not 1626 * waiting on it. If the former, then the pi_state will not
1511 * exist yet, look it up one more time to ensure we have a 1627 * exist yet, look it up one more time to ensure we have a
1512 * reference to it. 1628 * reference to it. If the lock was taken, ret contains the
1629 * vpid of the top waiter task.
1513 */ 1630 */
1514 if (ret == 1) { 1631 if (ret > 0) {
1515 WARN_ON(pi_state); 1632 WARN_ON(pi_state);
1516 drop_count++; 1633 drop_count++;
1517 task_count++; 1634 task_count++;
1518 ret = get_futex_value_locked(&curval2, uaddr2); 1635 /*
1519 if (!ret) 1636 * If we acquired the lock, then the user
1520 ret = lookup_pi_state(curval2, hb2, &key2, 1637 * space value of uaddr2 should be vpid. It
1521 &pi_state); 1638 * cannot be changed by the top waiter as it
1639 * is blocked on hb2 lock if it tries to do
1640 * so. If something fiddled with it behind our
1641 * back the pi state lookup might unearth
1642 * it. So we rather use the known value than
1643 * rereading and handing potential crap to
1644 * lookup_pi_state.
1645 */
1646 ret = lookup_pi_state(ret, hb2, &key2, &pi_state);
1522 } 1647 }
1523 1648
1524 switch (ret) { 1649 switch (ret) {
@@ -2301,9 +2426,10 @@ retry:
2301 /* 2426 /*
2302 * To avoid races, try to do the TID -> 0 atomic transition 2427 * To avoid races, try to do the TID -> 0 atomic transition
2303 * again. If it succeeds then we can return without waking 2428 * again. If it succeeds then we can return without waking
2304 * anyone else up: 2429 * anyone else up. We only try this if neither the waiters nor
 2430 * the owner died bit is set.
2305 */ 2431 */
2306 if (!(uval & FUTEX_OWNER_DIED) && 2432 if (!(uval & ~FUTEX_TID_MASK) &&
2307 cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0)) 2433 cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0))
2308 goto pi_faulted; 2434 goto pi_faulted;
2309 /* 2435 /*
@@ -2333,11 +2459,9 @@ retry:
2333 /* 2459 /*
2334 * No waiters - kernel unlocks the futex: 2460 * No waiters - kernel unlocks the futex:
2335 */ 2461 */
2336 if (!(uval & FUTEX_OWNER_DIED)) { 2462 ret = unlock_futex_pi(uaddr, uval);
2337 ret = unlock_futex_pi(uaddr, uval); 2463 if (ret == -EFAULT)
2338 if (ret == -EFAULT) 2464 goto pi_faulted;
2339 goto pi_faulted;
2340 }
2341 2465
2342out_unlock: 2466out_unlock:
2343 spin_unlock(&hb->lock); 2467 spin_unlock(&hb->lock);
@@ -2499,6 +2623,15 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2499 if (ret) 2623 if (ret)
2500 goto out_key2; 2624 goto out_key2;
2501 2625
2626 /*
2627 * The check above which compares uaddrs is not sufficient for
2628 * shared futexes. We need to compare the keys:
2629 */
2630 if (match_futex(&q.key, &key2)) {
2631 ret = -EINVAL;
2632 goto out_put_keys;
2633 }
2634
2502 /* Queue the futex_q, drop the hb lock, wait for wakeup. */ 2635 /* Queue the futex_q, drop the hb lock, wait for wakeup. */
2503 futex_wait_queue_me(hb, &q, to); 2636 futex_wait_queue_me(hb, &q, to);
2504 2637
diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c
index f45b75b713c0..b358a802fd18 100644
--- a/kernel/gcov/base.c
+++ b/kernel/gcov/base.c
@@ -85,6 +85,12 @@ void __gcov_merge_ior(gcov_type *counters, unsigned int n_counters)
85} 85}
86EXPORT_SYMBOL(__gcov_merge_ior); 86EXPORT_SYMBOL(__gcov_merge_ior);
87 87
88void __gcov_merge_time_profile(gcov_type *counters, unsigned int n_counters)
89{
90 /* Unused. */
91}
92EXPORT_SYMBOL(__gcov_merge_time_profile);
93
88/** 94/**
89 * gcov_enable_events - enable event reporting through gcov_event() 95 * gcov_enable_events - enable event reporting through gcov_event()
90 * 96 *
diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c
index 2c6e4631c814..826ba9fb5e32 100644
--- a/kernel/gcov/gcc_4_7.c
+++ b/kernel/gcov/gcc_4_7.c
@@ -18,7 +18,12 @@
18#include <linux/vmalloc.h> 18#include <linux/vmalloc.h>
19#include "gcov.h" 19#include "gcov.h"
20 20
21#if __GNUC__ == 4 && __GNUC_MINOR__ >= 9
22#define GCOV_COUNTERS 9
23#else
21#define GCOV_COUNTERS 8 24#define GCOV_COUNTERS 8
25#endif
26
22#define GCOV_TAG_FUNCTION_LENGTH 3 27#define GCOV_TAG_FUNCTION_LENGTH 3
23 28
24static struct gcov_info *gcov_info_head; 29static struct gcov_info *gcov_info_head;
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 6b715c0af1b1..3ab28993f6e0 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -990,11 +990,8 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
990 /* Remove an active timer from the queue: */ 990 /* Remove an active timer from the queue: */
991 ret = remove_hrtimer(timer, base); 991 ret = remove_hrtimer(timer, base);
992 992
993 /* Switch the timer base, if necessary: */
994 new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
995
996 if (mode & HRTIMER_MODE_REL) { 993 if (mode & HRTIMER_MODE_REL) {
997 tim = ktime_add_safe(tim, new_base->get_time()); 994 tim = ktime_add_safe(tim, base->get_time());
998 /* 995 /*
999 * CONFIG_TIME_LOW_RES is a temporary way for architectures 996 * CONFIG_TIME_LOW_RES is a temporary way for architectures
1000 * to signal that they simply return xtime in 997 * to signal that they simply return xtime in
@@ -1009,6 +1006,9 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
1009 1006
1010 hrtimer_set_expires_range_ns(timer, tim, delta_ns); 1007 hrtimer_set_expires_range_ns(timer, tim, delta_ns);
1011 1008
1009 /* Switch the timer base, if necessary: */
1010 new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
1011
1012 timer_stats_hrtimer_set_start_info(timer); 1012 timer_stats_hrtimer_set_start_info(timer);
1013 1013
1014 leftmost = enqueue_hrtimer(timer, new_base); 1014 leftmost = enqueue_hrtimer(timer, new_base);
@@ -1039,6 +1039,7 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
1039 1039
1040 return ret; 1040 return ret;
1041} 1041}
1042EXPORT_SYMBOL_GPL(__hrtimer_start_range_ns);
1042 1043
1043/** 1044/**
1044 * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU 1045 * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 06bb1417b063..06db12434d72 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -52,8 +52,10 @@ unsigned int __read_mostly sysctl_hung_task_panic =
52 52
53static int __init hung_task_panic_setup(char *str) 53static int __init hung_task_panic_setup(char *str)
54{ 54{
55 sysctl_hung_task_panic = simple_strtoul(str, NULL, 0); 55 int rc = kstrtouint(str, 0, &sysctl_hung_task_panic);
56 56
57 if (rc)
58 return rc;
57 return 1; 59 return 1;
58} 60}
59__setup("hung_task_panic=", hung_task_panic_setup); 61__setup("hung_task_panic=", hung_task_panic_setup);
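The same conversion, kstrtouint() instead of simple_strtoul(), applies to any __setup() handler that should reject malformed input instead of silently parsing it as 0; a generic sketch with a hypothetical parameter name:

    #include <linux/init.h>
    #include <linux/kernel.h>

    static unsigned int example_threshold;          /* hypothetical boot parameter */

    static int __init example_threshold_setup(char *str)
    {
            int rc = kstrtouint(str, 0, &example_threshold);

            if (rc)
                    return rc;      /* malformed value: report the error */
            return 1;               /* option consumed */
    }
    __setup("example_threshold=", example_threshold_setup);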
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 07cbdfea9ae2..d269cecdfbf0 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -5,6 +5,10 @@ menu "IRQ subsystem"
5config MAY_HAVE_SPARSE_IRQ 5config MAY_HAVE_SPARSE_IRQ
6 bool 6 bool
7 7
8# Legacy support, required for itanic
9config GENERIC_IRQ_LEGACY
10 bool
11
8# Enable the generic irq autoprobe mechanism 12# Enable the generic irq autoprobe mechanism
9config GENERIC_IRQ_PROBE 13config GENERIC_IRQ_PROBE
10 bool 14 bool
@@ -17,6 +21,11 @@ config GENERIC_IRQ_SHOW
17config GENERIC_IRQ_SHOW_LEVEL 21config GENERIC_IRQ_SHOW_LEVEL
18 bool 22 bool
19 23
24# Facility to allocate a hardware interrupt. This is legacy support
25# and should not be used in new code. Use irq domains instead.
26config GENERIC_IRQ_LEGACY_ALLOC_HWIRQ
27 bool
28
20# Support for delayed migration from interrupt context 29# Support for delayed migration from interrupt context
21config GENERIC_PENDING_IRQ 30config GENERIC_PENDING_IRQ
22 bool 31 bool
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 6397df2d6945..a2b28a2fd7b1 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -40,10 +40,9 @@ int irq_set_chip(unsigned int irq, struct irq_chip *chip)
40 irq_put_desc_unlock(desc, flags); 40 irq_put_desc_unlock(desc, flags);
41 /* 41 /*
42 * For !CONFIG_SPARSE_IRQ make the irq show up in 42 * For !CONFIG_SPARSE_IRQ make the irq show up in
43 * allocated_irqs. For the CONFIG_SPARSE_IRQ case, it is 43 * allocated_irqs.
44 * already marked, and this call is harmless.
45 */ 44 */
46 irq_reserve_irq(irq); 45 irq_mark_irq(irq);
47 return 0; 46 return 0;
48} 47}
49EXPORT_SYMBOL(irq_set_chip); 48EXPORT_SYMBOL(irq_set_chip);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index ddf1ffeb79f1..099ea2e0eb88 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -33,7 +33,7 @@ enum {
33}; 33};
34 34
35/* 35/*
36 * Bit masks for desc->state 36 * Bit masks for desc->core_internal_state__do_not_mess_with_it
37 * 37 *
38 * IRQS_AUTODETECT - autodetection in progress 38 * IRQS_AUTODETECT - autodetection in progress
39 * IRQS_SPURIOUS_DISABLED - was disabled due to spurious interrupt 39 * IRQS_SPURIOUS_DISABLED - was disabled due to spurious interrupt
@@ -76,6 +76,12 @@ extern void mask_irq(struct irq_desc *desc);
76extern void unmask_irq(struct irq_desc *desc); 76extern void unmask_irq(struct irq_desc *desc);
77extern void unmask_threaded_irq(struct irq_desc *desc); 77extern void unmask_threaded_irq(struct irq_desc *desc);
78 78
79#ifdef CONFIG_SPARSE_IRQ
80static inline void irq_mark_irq(unsigned int irq) { }
81#else
82extern void irq_mark_irq(unsigned int irq);
83#endif
84
79extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); 85extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
80 86
81irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action); 87irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action);
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index bb07f2928f4b..7339e42a85ab 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -278,7 +278,12 @@ EXPORT_SYMBOL(irq_to_desc);
278 278
279static void free_desc(unsigned int irq) 279static void free_desc(unsigned int irq)
280{ 280{
281 dynamic_irq_cleanup(irq); 281 struct irq_desc *desc = irq_to_desc(irq);
282 unsigned long flags;
283
284 raw_spin_lock_irqsave(&desc->lock, flags);
285 desc_set_defaults(irq, desc, desc_node(desc), NULL);
286 raw_spin_unlock_irqrestore(&desc->lock, flags);
282} 287}
283 288
284static inline int alloc_descs(unsigned int start, unsigned int cnt, int node, 289static inline int alloc_descs(unsigned int start, unsigned int cnt, int node,
@@ -299,6 +304,20 @@ static int irq_expand_nr_irqs(unsigned int nr)
299 return -ENOMEM; 304 return -ENOMEM;
300} 305}
301 306
307void irq_mark_irq(unsigned int irq)
308{
309 mutex_lock(&sparse_irq_lock);
310 bitmap_set(allocated_irqs, irq, 1);
311 mutex_unlock(&sparse_irq_lock);
312}
313
314#ifdef CONFIG_GENERIC_IRQ_LEGACY
315void irq_init_desc(unsigned int irq)
316{
317 free_desc(irq);
318}
319#endif
320
302#endif /* !CONFIG_SPARSE_IRQ */ 321#endif /* !CONFIG_SPARSE_IRQ */
303 322
304/** 323/**
@@ -396,30 +415,56 @@ err:
396} 415}
397EXPORT_SYMBOL_GPL(__irq_alloc_descs); 416EXPORT_SYMBOL_GPL(__irq_alloc_descs);
398 417
418#ifdef CONFIG_GENERIC_IRQ_LEGACY_ALLOC_HWIRQ
399/** 419/**
400 * irq_reserve_irqs - mark irqs allocated 420 * irq_alloc_hwirqs - Allocate an irq descriptor and initialize the hardware
401 * @from: mark from irq number 421 * @cnt: number of interrupts to allocate
402 * @cnt: number of irqs to mark 422 * @node: node on which to allocate
403 * 423 *
404 * Returns 0 on success or an appropriate error code 424 * Returns an interrupt number > 0 or 0, if the allocation fails.
405 */ 425 */
406int irq_reserve_irqs(unsigned int from, unsigned int cnt) 426unsigned int irq_alloc_hwirqs(int cnt, int node)
407{ 427{
408 unsigned int start; 428 int i, irq = __irq_alloc_descs(-1, 0, cnt, node, NULL);
409 int ret = 0;
410 429
411 if (!cnt || (from + cnt) > nr_irqs) 430 if (irq < 0)
412 return -EINVAL; 431 return 0;
413 432
414 mutex_lock(&sparse_irq_lock); 433 for (i = irq; cnt > 0; i++, cnt--) {
415 start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0); 434 if (arch_setup_hwirq(i, node))
416 if (start == from) 435 goto err;
417 bitmap_set(allocated_irqs, start, cnt); 436 irq_clear_status_flags(i, _IRQ_NOREQUEST);
418 else 437 }
419 ret = -EEXIST; 438 return irq;
420 mutex_unlock(&sparse_irq_lock); 439
421 return ret; 440err:
441 for (i--; i >= irq; i--) {
442 irq_set_status_flags(i, _IRQ_NOREQUEST | _IRQ_NOPROBE);
443 arch_teardown_hwirq(i);
444 }
445 irq_free_descs(irq, cnt);
446 return 0;
447}
448EXPORT_SYMBOL_GPL(irq_alloc_hwirqs);
449
450/**
451 * irq_free_hwirqs - Free irq descriptor and cleanup the hardware
452 * @from: Free from irq number
453 * @cnt: number of interrupts to free
454 *
455 */
456void irq_free_hwirqs(unsigned int from, int cnt)
457{
458 int i;
459
460 for (i = from; cnt > 0; i++, cnt--) {
461 irq_set_status_flags(i, _IRQ_NOREQUEST | _IRQ_NOPROBE);
462 arch_teardown_hwirq(i);
463 }
464 irq_free_descs(from, cnt);
422} 465}
466EXPORT_SYMBOL_GPL(irq_free_hwirqs);
467#endif
423 468
424/** 469/**
425 * irq_get_next_irq - get next allocated irq number 470 * irq_get_next_irq - get next allocated irq number
@@ -482,20 +527,6 @@ int irq_set_percpu_devid(unsigned int irq)
482 return 0; 527 return 0;
483} 528}
484 529
485/**
486 * dynamic_irq_cleanup - cleanup a dynamically allocated irq
487 * @irq: irq number to initialize
488 */
489void dynamic_irq_cleanup(unsigned int irq)
490{
491 struct irq_desc *desc = irq_to_desc(irq);
492 unsigned long flags;
493
494 raw_spin_lock_irqsave(&desc->lock, flags);
495 desc_set_defaults(irq, desc, desc_node(desc), NULL);
496 raw_spin_unlock_irqrestore(&desc->lock, flags);
497}
498
499void kstat_incr_irq_this_cpu(unsigned int irq) 530void kstat_incr_irq_this_cpu(unsigned int irq)
500{ 531{
501 kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq)); 532 kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
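A sketch of how legacy architecture code is expected to consume the new helpers above; the controller-programming step is a placeholder and the function names are illustrative:

    #include <linux/errno.h>
    #include <linux/irq.h>

    /* Illustrative legacy-style allocation of two hardware interrupts. */
    static int example_setup_irqs(int node)
    {
            unsigned int irq = irq_alloc_hwirqs(2, node);

            if (!irq)                       /* 0 means the allocation failed */
                    return -ENOMEM;

            /* ... program the interrupt controller for irq and irq + 1 ... */

            return irq;
    }

    static void example_teardown_irqs(unsigned int irq)
    {
            irq_free_hwirqs(irq, 2);        /* tears down the hw setup and frees the descs */
    }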
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index f14033700c25..eb5e10e32e05 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -27,14 +27,14 @@ static struct irq_domain *irq_default_domain;
27 * __irq_domain_add() - Allocate a new irq_domain data structure 27 * __irq_domain_add() - Allocate a new irq_domain data structure
28 * @of_node: optional device-tree node of the interrupt controller 28 * @of_node: optional device-tree node of the interrupt controller
29 * @size: Size of linear map; 0 for radix mapping only 29 * @size: Size of linear map; 0 for radix mapping only
30 * @hwirq_max: Maximum number of interrupts supported by controller
30 * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no 31 * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no
31 * direct mapping 32 * direct mapping
32 * @ops: map/unmap domain callbacks 33 * @ops: map/unmap domain callbacks
33 * @host_data: Controller private data pointer 34 * @host_data: Controller private data pointer
34 * 35 *
 35 * Allocates and initializes an irq_domain structure. Caller is expected to 36 * Allocates and initializes an irq_domain structure.
36 * register allocated irq_domain with irq_domain_register(). Returns pointer 37 * Returns pointer to IRQ domain, or NULL on failure.
37 * to IRQ domain, or NULL on failure.
38 */ 38 */
39struct irq_domain *__irq_domain_add(struct device_node *of_node, int size, 39struct irq_domain *__irq_domain_add(struct device_node *of_node, int size,
40 irq_hw_number_t hwirq_max, int direct_max, 40 irq_hw_number_t hwirq_max, int direct_max,
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index d34131ca372b..3dc6a61bf06a 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -886,8 +886,8 @@ static int irq_thread(void *data)
886 irq_thread_check_affinity(desc, action); 886 irq_thread_check_affinity(desc, action);
887 887
888 action_ret = handler_fn(desc, action); 888 action_ret = handler_fn(desc, action);
889 if (!noirqdebug) 889 if (action_ret == IRQ_HANDLED)
890 note_interrupt(action->irq, desc, action_ret); 890 atomic_inc(&desc->threads_handled);
891 891
892 wake_threads_waitq(desc); 892 wake_threads_waitq(desc);
893 } 893 }
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index a1d8cc63b56e..e2514b0e439e 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -270,6 +270,8 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc,
270 return action && (action->flags & IRQF_IRQPOLL); 270 return action && (action->flags & IRQF_IRQPOLL);
271} 271}
272 272
273#define SPURIOUS_DEFERRED 0x80000000
274
273void note_interrupt(unsigned int irq, struct irq_desc *desc, 275void note_interrupt(unsigned int irq, struct irq_desc *desc,
274 irqreturn_t action_ret) 276 irqreturn_t action_ret)
275{ 277{
@@ -277,15 +279,111 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
277 irq_settings_is_polled(desc)) 279 irq_settings_is_polled(desc))
278 return; 280 return;
279 281
280 /* we get here again via the threaded handler */
281 if (action_ret == IRQ_WAKE_THREAD)
282 return;
283
284 if (bad_action_ret(action_ret)) { 282 if (bad_action_ret(action_ret)) {
285 report_bad_irq(irq, desc, action_ret); 283 report_bad_irq(irq, desc, action_ret);
286 return; 284 return;
287 } 285 }
288 286
287 /*
288 * We cannot call note_interrupt from the threaded handler
289 * because we need to look at the compound of all handlers
290 * (primary and threaded). Aside of that in the threaded
291 * shared case we have no serialization against an incoming
292 * hardware interrupt while we are dealing with a threaded
293 * result.
294 *
295 * So in case a thread is woken, we just note the fact and
296 * defer the analysis to the next hardware interrupt.
297 *
 298 * The threaded handlers store whether they successfully
299 * handled an interrupt and we check whether that number
300 * changed versus the last invocation.
301 *
302 * We could handle all interrupts with the delayed by one
303 * mechanism, but for the non forced threaded case we'd just
304 * add pointless overhead to the straight hardirq interrupts
305 * for the sake of a few lines less code.
306 */
307 if (action_ret & IRQ_WAKE_THREAD) {
308 /*
309 * There is a thread woken. Check whether one of the
310 * shared primary handlers returned IRQ_HANDLED. If
311 * not we defer the spurious detection to the next
312 * interrupt.
313 */
314 if (action_ret == IRQ_WAKE_THREAD) {
315 int handled;
316 /*
317 * We use bit 31 of thread_handled_last to
318 * denote the deferred spurious detection
319 * active. No locking necessary as
320 * thread_handled_last is only accessed here
321 * and we have the guarantee that hard
322 * interrupts are not reentrant.
323 */
324 if (!(desc->threads_handled_last & SPURIOUS_DEFERRED)) {
325 desc->threads_handled_last |= SPURIOUS_DEFERRED;
326 return;
327 }
328 /*
329 * Check whether one of the threaded handlers
330 * returned IRQ_HANDLED since the last
331 * interrupt happened.
332 *
333 * For simplicity we just set bit 31, as it is
334 * set in threads_handled_last as well. So we
335 * avoid extra masking. And we really do not
336 * care about the high bits of the handled
337 * count. We just care about the count being
338 * different than the one we saw before.
339 */
340 handled = atomic_read(&desc->threads_handled);
341 handled |= SPURIOUS_DEFERRED;
342 if (handled != desc->threads_handled_last) {
343 action_ret = IRQ_HANDLED;
344 /*
345 * Note: We keep the SPURIOUS_DEFERRED
346 * bit set. We are handling the
347 * previous invocation right now.
348 * Keep it for the current one, so the
349 * next hardware interrupt will
350 * account for it.
351 */
352 desc->threads_handled_last = handled;
353 } else {
354 /*
355 * None of the threaded handlers felt
356 * responsible for the last interrupt
357 *
358 * We keep the SPURIOUS_DEFERRED bit
359 * set in threads_handled_last as we
360 * need to account for the current
361 * interrupt as well.
362 */
363 action_ret = IRQ_NONE;
364 }
365 } else {
366 /*
367 * One of the primary handlers returned
368 * IRQ_HANDLED. So we don't care about the
369 * threaded handlers on the same line. Clear
370 * the deferred detection bit.
371 *
372 * In theory we could/should check whether the
373 * deferred bit is set and take the result of
374 * the previous run into account here as
375 * well. But it's really not worth the
376 * trouble. If every other interrupt is
377 * handled we never trigger the spurious
378 * detector. And if this is just the one out
379 * of 100k unhandled ones which is handled
 380 * then we merely delay the spurious detection
381 * by one hard interrupt. Not a real problem.
382 */
383 desc->threads_handled_last &= ~SPURIOUS_DEFERRED;
384 }
385 }
386
289 if (unlikely(action_ret == IRQ_NONE)) { 387 if (unlikely(action_ret == IRQ_NONE)) {
290 /* 388 /*
291 * If we are seeing only the odd spurious IRQ caused by 389 * If we are seeing only the odd spurious IRQ caused by
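A toy, user-space illustration of the bookkeeping described above: the threaded side bumps a counter when it handles an interrupt, and the hard-interrupt side keeps a snapshot whose bit 31 doubles as the "verdict deferred" marker. Names and structure are illustrative, not kernel code.

    #include <stdatomic.h>
    #include <stdbool.h>

    #define DEFERRED 0x80000000U            /* mirrors SPURIOUS_DEFERRED above */

    static atomic_uint threads_handled;         /* bumped by the "threaded handler" */
    static unsigned int threads_handled_last;   /* snapshot kept by the "hard irq" */

    /* Returns true if the interrupt that woke the thread should count as handled. */
    static bool deferred_verdict(void)
    {
            unsigned int handled = atomic_load(&threads_handled) | DEFERRED;

            if (!(threads_handled_last & DEFERRED)) {
                    threads_handled_last |= DEFERRED;   /* first wakeup, postpone verdict */
                    return true;
            }
            if (handled != threads_handled_last) {
                    threads_handled_last = handled;     /* counter moved, it was handled */
                    return true;
            }
            return false;   /* counter unchanged, treat the previous interrupt as unhandled */
    }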
diff --git a/kernel/kexec.c b/kernel/kexec.c
index c8380ad203bc..6748688813d0 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -125,8 +125,8 @@ static struct page *kimage_alloc_page(struct kimage *image,
125 unsigned long dest); 125 unsigned long dest);
126 126
127static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, 127static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
128 unsigned long nr_segments, 128 unsigned long nr_segments,
129 struct kexec_segment __user *segments) 129 struct kexec_segment __user *segments)
130{ 130{
131 size_t segment_bytes; 131 size_t segment_bytes;
132 struct kimage *image; 132 struct kimage *image;
@@ -257,13 +257,13 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
257 image->control_code_page = kimage_alloc_control_pages(image, 257 image->control_code_page = kimage_alloc_control_pages(image,
258 get_order(KEXEC_CONTROL_PAGE_SIZE)); 258 get_order(KEXEC_CONTROL_PAGE_SIZE));
259 if (!image->control_code_page) { 259 if (!image->control_code_page) {
260 printk(KERN_ERR "Could not allocate control_code_buffer\n"); 260 pr_err("Could not allocate control_code_buffer\n");
261 goto out_free; 261 goto out_free;
262 } 262 }
263 263
264 image->swap_page = kimage_alloc_control_pages(image, 0); 264 image->swap_page = kimage_alloc_control_pages(image, 0);
265 if (!image->swap_page) { 265 if (!image->swap_page) {
266 printk(KERN_ERR "Could not allocate swap buffer\n"); 266 pr_err("Could not allocate swap buffer\n");
267 goto out_free; 267 goto out_free;
268 } 268 }
269 269
@@ -332,7 +332,7 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
332 image->control_code_page = kimage_alloc_control_pages(image, 332 image->control_code_page = kimage_alloc_control_pages(image,
333 get_order(KEXEC_CONTROL_PAGE_SIZE)); 333 get_order(KEXEC_CONTROL_PAGE_SIZE));
334 if (!image->control_code_page) { 334 if (!image->control_code_page) {
335 printk(KERN_ERR "Could not allocate control_code_buffer\n"); 335 pr_err("Could not allocate control_code_buffer\n");
336 goto out_free; 336 goto out_free;
337 } 337 }
338 338
@@ -621,8 +621,8 @@ static void kimage_terminate(struct kimage *image)
621 621
622#define for_each_kimage_entry(image, ptr, entry) \ 622#define for_each_kimage_entry(image, ptr, entry) \
623 for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \ 623 for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
624 ptr = (entry & IND_INDIRECTION)? \ 624 ptr = (entry & IND_INDIRECTION) ? \
625 phys_to_virt((entry & PAGE_MASK)): ptr +1) 625 phys_to_virt((entry & PAGE_MASK)) : ptr + 1)
626 626
627static void kimage_free_entry(kimage_entry_t entry) 627static void kimage_free_entry(kimage_entry_t entry)
628{ 628{
@@ -650,8 +650,7 @@ static void kimage_free(struct kimage *image)
650 * done with it. 650 * done with it.
651 */ 651 */
652 ind = entry; 652 ind = entry;
653 } 653 } else if (entry & IND_SOURCE)
654 else if (entry & IND_SOURCE)
655 kimage_free_entry(entry); 654 kimage_free_entry(entry);
656 } 655 }
657 /* Free the final indirection page */ 656 /* Free the final indirection page */
@@ -774,8 +773,7 @@ static struct page *kimage_alloc_page(struct kimage *image,
774 addr = old_addr; 773 addr = old_addr;
775 page = old_page; 774 page = old_page;
776 break; 775 break;
777 } 776 } else {
778 else {
779 /* Place the page on the destination list I 777 /* Place the page on the destination list I
780 * will use it later. 778 * will use it later.
781 */ 779 */
@@ -1059,7 +1057,7 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
1059 return -EINVAL; 1057 return -EINVAL;
1060 1058
1061 ksegments = compat_alloc_user_space(nr_segments * sizeof(out)); 1059 ksegments = compat_alloc_user_space(nr_segments * sizeof(out));
1062 for (i=0; i < nr_segments; i++) { 1060 for (i = 0; i < nr_segments; i++) {
1063 result = copy_from_user(&in, &segments[i], sizeof(in)); 1061 result = copy_from_user(&in, &segments[i], sizeof(in));
1064 if (result) 1062 if (result)
1065 return -EFAULT; 1063 return -EFAULT;
@@ -1214,14 +1212,14 @@ void crash_save_cpu(struct pt_regs *regs, int cpu)
1214 * squirrelled away. ELF notes happen to provide 1212 * squirrelled away. ELF notes happen to provide
1215 * all of that, so there is no need to invent something new. 1213 * all of that, so there is no need to invent something new.
1216 */ 1214 */
1217 buf = (u32*)per_cpu_ptr(crash_notes, cpu); 1215 buf = (u32 *)per_cpu_ptr(crash_notes, cpu);
1218 if (!buf) 1216 if (!buf)
1219 return; 1217 return;
1220 memset(&prstatus, 0, sizeof(prstatus)); 1218 memset(&prstatus, 0, sizeof(prstatus));
1221 prstatus.pr_pid = current->pid; 1219 prstatus.pr_pid = current->pid;
1222 elf_core_copy_kernel_regs(&prstatus.pr_reg, regs); 1220 elf_core_copy_kernel_regs(&prstatus.pr_reg, regs);
1223 buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, 1221 buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
1224 &prstatus, sizeof(prstatus)); 1222 &prstatus, sizeof(prstatus));
1225 final_note(buf); 1223 final_note(buf);
1226} 1224}
1227 1225
@@ -1230,8 +1228,7 @@ static int __init crash_notes_memory_init(void)
1230 /* Allocate memory for saving cpu registers. */ 1228 /* Allocate memory for saving cpu registers. */
1231 crash_notes = alloc_percpu(note_buf_t); 1229 crash_notes = alloc_percpu(note_buf_t);
1232 if (!crash_notes) { 1230 if (!crash_notes) {
1233 printk("Kexec: Memory allocation for saving cpu register" 1231 pr_warn("Kexec: Memory allocation for saving cpu register states failed\n");
1234 " states failed\n");
1235 return -ENOMEM; 1232 return -ENOMEM;
1236 } 1233 }
1237 return 0; 1234 return 0;
@@ -1253,10 +1250,10 @@ subsys_initcall(crash_notes_memory_init);
1253 * 1250 *
1254 * The function returns 0 on success and -EINVAL on failure. 1251 * The function returns 0 on success and -EINVAL on failure.
1255 */ 1252 */
1256static int __init parse_crashkernel_mem(char *cmdline, 1253static int __init parse_crashkernel_mem(char *cmdline,
1257 unsigned long long system_ram, 1254 unsigned long long system_ram,
1258 unsigned long long *crash_size, 1255 unsigned long long *crash_size,
1259 unsigned long long *crash_base) 1256 unsigned long long *crash_base)
1260{ 1257{
1261 char *cur = cmdline, *tmp; 1258 char *cur = cmdline, *tmp;
1262 1259
@@ -1267,12 +1264,12 @@ static int __init parse_crashkernel_mem(char *cmdline,
1267 /* get the start of the range */ 1264 /* get the start of the range */
1268 start = memparse(cur, &tmp); 1265 start = memparse(cur, &tmp);
1269 if (cur == tmp) { 1266 if (cur == tmp) {
1270 pr_warning("crashkernel: Memory value expected\n"); 1267 pr_warn("crashkernel: Memory value expected\n");
1271 return -EINVAL; 1268 return -EINVAL;
1272 } 1269 }
1273 cur = tmp; 1270 cur = tmp;
1274 if (*cur != '-') { 1271 if (*cur != '-') {
1275 pr_warning("crashkernel: '-' expected\n"); 1272 pr_warn("crashkernel: '-' expected\n");
1276 return -EINVAL; 1273 return -EINVAL;
1277 } 1274 }
1278 cur++; 1275 cur++;
@@ -1281,31 +1278,30 @@ static int __init parse_crashkernel_mem(char *cmdline,
1281 if (*cur != ':') { 1278 if (*cur != ':') {
1282 end = memparse(cur, &tmp); 1279 end = memparse(cur, &tmp);
1283 if (cur == tmp) { 1280 if (cur == tmp) {
1284 pr_warning("crashkernel: Memory " 1281 pr_warn("crashkernel: Memory value expected\n");
1285 "value expected\n");
1286 return -EINVAL; 1282 return -EINVAL;
1287 } 1283 }
1288 cur = tmp; 1284 cur = tmp;
1289 if (end <= start) { 1285 if (end <= start) {
1290 pr_warning("crashkernel: end <= start\n"); 1286 pr_warn("crashkernel: end <= start\n");
1291 return -EINVAL; 1287 return -EINVAL;
1292 } 1288 }
1293 } 1289 }
1294 1290
1295 if (*cur != ':') { 1291 if (*cur != ':') {
1296 pr_warning("crashkernel: ':' expected\n"); 1292 pr_warn("crashkernel: ':' expected\n");
1297 return -EINVAL; 1293 return -EINVAL;
1298 } 1294 }
1299 cur++; 1295 cur++;
1300 1296
1301 size = memparse(cur, &tmp); 1297 size = memparse(cur, &tmp);
1302 if (cur == tmp) { 1298 if (cur == tmp) {
1303 pr_warning("Memory value expected\n"); 1299 pr_warn("Memory value expected\n");
1304 return -EINVAL; 1300 return -EINVAL;
1305 } 1301 }
1306 cur = tmp; 1302 cur = tmp;
1307 if (size >= system_ram) { 1303 if (size >= system_ram) {
1308 pr_warning("crashkernel: invalid size\n"); 1304 pr_warn("crashkernel: invalid size\n");
1309 return -EINVAL; 1305 return -EINVAL;
1310 } 1306 }
1311 1307
@@ -1323,8 +1319,7 @@ static int __init parse_crashkernel_mem(char *cmdline,
1323 cur++; 1319 cur++;
1324 *crash_base = memparse(cur, &tmp); 1320 *crash_base = memparse(cur, &tmp);
1325 if (cur == tmp) { 1321 if (cur == tmp) {
1326 pr_warning("Memory value expected " 1322 pr_warn("Memory value expected after '@'\n");
1327 "after '@'\n");
1328 return -EINVAL; 1323 return -EINVAL;
1329 } 1324 }
1330 } 1325 }
@@ -1336,26 +1331,26 @@ static int __init parse_crashkernel_mem(char *cmdline,
1336/* 1331/*
1337 * That function parses "simple" (old) crashkernel command lines like 1332 * That function parses "simple" (old) crashkernel command lines like
1338 * 1333 *
1339 * crashkernel=size[@offset] 1334 * crashkernel=size[@offset]
1340 * 1335 *
1341 * It returns 0 on success and -EINVAL on failure. 1336 * It returns 0 on success and -EINVAL on failure.
1342 */ 1337 */
1343static int __init parse_crashkernel_simple(char *cmdline, 1338static int __init parse_crashkernel_simple(char *cmdline,
1344 unsigned long long *crash_size, 1339 unsigned long long *crash_size,
1345 unsigned long long *crash_base) 1340 unsigned long long *crash_base)
1346{ 1341{
1347 char *cur = cmdline; 1342 char *cur = cmdline;
1348 1343
1349 *crash_size = memparse(cmdline, &cur); 1344 *crash_size = memparse(cmdline, &cur);
1350 if (cmdline == cur) { 1345 if (cmdline == cur) {
1351 pr_warning("crashkernel: memory value expected\n"); 1346 pr_warn("crashkernel: memory value expected\n");
1352 return -EINVAL; 1347 return -EINVAL;
1353 } 1348 }
1354 1349
1355 if (*cur == '@') 1350 if (*cur == '@')
1356 *crash_base = memparse(cur+1, &cur); 1351 *crash_base = memparse(cur+1, &cur);
1357 else if (*cur != ' ' && *cur != '\0') { 1352 else if (*cur != ' ' && *cur != '\0') {
1358 pr_warning("crashkernel: unrecognized char\n"); 1353 pr_warn("crashkernel: unrecognized char\n");
1359 return -EINVAL; 1354 return -EINVAL;
1360 } 1355 }
1361 1356
@@ -1683,7 +1678,15 @@ int kernel_kexec(void)
1683 kexec_in_progress = true; 1678 kexec_in_progress = true;
1684 kernel_restart_prepare(NULL); 1679 kernel_restart_prepare(NULL);
1685 migrate_to_reboot_cpu(); 1680 migrate_to_reboot_cpu();
1686 printk(KERN_EMERG "Starting new kernel\n"); 1681
1682 /*
1683 * migrate_to_reboot_cpu() disables CPU hotplug assuming that
1684 * no further code needs to use CPU hotplug (which is true in
1685 * the reboot case). However, the kexec path depends on using
1686 * CPU hotplug again; so re-enable it here.
1687 */
1688 cpu_hotplug_enable();
1689 pr_emerg("Starting new kernel\n");
1687 machine_shutdown(); 1690 machine_shutdown();
1688 } 1691 }
1689 1692
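
The new comment spells out why cpu_hotplug_enable() is needed here: migrate_to_reboot_cpu() pins the task to the reboot CPU and disables CPU hotplug, which is fine for a plain reboot but not for kexec, where machine_shutdown() typically offlines the non-boot CPUs through the hotplug code and would otherwise be refused. The following stand-alone model is illustrative only, not the kernel/cpu.c implementation:

/* Illustrative model only: hotplug requests are refused while the
 * "disabled" flag set by the reboot path is still in effect. */
#include <stdbool.h>
#include <stdio.h>

static bool hotplug_disabled;

static void model_cpu_hotplug_disable(void) { hotplug_disabled = true; }
static void model_cpu_hotplug_enable(void)  { hotplug_disabled = false; }

static int model_cpu_down(int cpu)
{
	if (hotplug_disabled)
		return -1;		/* the kernel returns -EBUSY in this case */
	printf("offlining CPU %d\n", cpu);
	return 0;
}

int main(void)
{
	model_cpu_hotplug_disable();	/* done inside migrate_to_reboot_cpu() */
	model_cpu_hotplug_enable();	/* the call added to kernel_kexec() */
	return model_cpu_down(1);	/* machine_shutdown() can offline CPUs again */
}
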
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 6b375af4958d..8637e041a247 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -285,10 +285,7 @@ static int wait_for_helper(void *data)
285 pid_t pid; 285 pid_t pid;
286 286
287 /* If SIGCLD is ignored sys_wait4 won't populate the status. */ 287 /* If SIGCLD is ignored sys_wait4 won't populate the status. */
288 spin_lock_irq(&current->sighand->siglock); 288 kernel_sigaction(SIGCHLD, SIG_DFL);
289 current->sighand->action[SIGCHLD-1].sa.sa_handler = SIG_DFL;
290 spin_unlock_irq(&current->sighand->siglock);
291
292 pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD); 289 pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD);
293 if (pid < 0) { 290 if (pid < 0) {
294 sub_info->retval = pid; 291 sub_info->retval = pid;
@@ -498,7 +495,7 @@ int __usermodehelper_disable(enum umh_disable_depth depth)
498static void helper_lock(void) 495static void helper_lock(void)
499{ 496{
500 atomic_inc(&running_helpers); 497 atomic_inc(&running_helpers);
501 smp_mb__after_atomic_inc(); 498 smp_mb__after_atomic();
502} 499}
503 500
504static void helper_unlock(void) 501static void helper_unlock(void)
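
Two unrelated cleanups in kmod.c: wait_for_helper() no longer open-codes the siglock dance to reset the SIGCHLD handler, relying instead on the kernel_sigaction() helper, and the barrier after atomic_inc() uses the generic smp_mb__after_atomic() name rather than the retired smp_mb__after_atomic_inc(). A sketch of what a kernel_sigaction()-style helper has to do, reconstructed from the open-coded lines removed above (the real helper lives in kernel/signal.c and, when the handler is set to SIG_IGN, presumably also drops already-queued instances of the signal; that part is an assumption, not visible in this hunk):

/* Sketch only: assumes kernel context ("current" task with a sighand). */
static void kernel_sigaction_sketch(int sig, __sighandler_t action)
{
	spin_lock_irq(&current->sighand->siglock);
	current->sighand->action[sig - 1].sa.sa_handler = action;
	/* for SIG_IGN the real helper is expected to also flush pending
	 * instances of sig and recalculate TIF_SIGPENDING */
	spin_unlock_irq(&current->sighand->siglock);
}
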
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index ceeadfcabb76..3214289df5a7 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -86,21 +86,8 @@ static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
86 return &(kretprobe_table_locks[hash].lock); 86 return &(kretprobe_table_locks[hash].lock);
87} 87}
88 88
89/* 89/* Blacklist -- list of struct kprobe_blacklist_entry */
90 * Normally, functions that we'd want to prohibit kprobes in, are marked 90static LIST_HEAD(kprobe_blacklist);
91 * __kprobes. But, there are cases where such functions already belong to
92 * a different section (__sched for preempt_schedule)
93 *
94 * For such cases, we now have a blacklist
95 */
96static struct kprobe_blackpoint kprobe_blacklist[] = {
97 {"preempt_schedule",},
98 {"native_get_debugreg",},
99 {"irq_entries_start",},
100 {"common_interrupt",},
101 {"mcount",}, /* mcount can be called from everywhere */
102 {NULL} /* Terminator */
103};
104 91
105#ifdef __ARCH_WANT_KPROBES_INSN_SLOT 92#ifdef __ARCH_WANT_KPROBES_INSN_SLOT
106/* 93/*
@@ -151,13 +138,13 @@ struct kprobe_insn_cache kprobe_insn_slots = {
151 .insn_size = MAX_INSN_SIZE, 138 .insn_size = MAX_INSN_SIZE,
152 .nr_garbage = 0, 139 .nr_garbage = 0,
153}; 140};
154static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c); 141static int collect_garbage_slots(struct kprobe_insn_cache *c);
155 142
156/** 143/**
157 * __get_insn_slot() - Find a slot on an executable page for an instruction. 144 * __get_insn_slot() - Find a slot on an executable page for an instruction.
158 * We allocate an executable page if there's no room on existing ones. 145 * We allocate an executable page if there's no room on existing ones.
159 */ 146 */
160kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c) 147kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c)
161{ 148{
162 struct kprobe_insn_page *kip; 149 struct kprobe_insn_page *kip;
163 kprobe_opcode_t *slot = NULL; 150 kprobe_opcode_t *slot = NULL;
@@ -214,7 +201,7 @@ out:
214} 201}
215 202
216/* Return 1 if all garbages are collected, otherwise 0. */ 203/* Return 1 if all garbages are collected, otherwise 0. */
217static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx) 204static int collect_one_slot(struct kprobe_insn_page *kip, int idx)
218{ 205{
219 kip->slot_used[idx] = SLOT_CLEAN; 206 kip->slot_used[idx] = SLOT_CLEAN;
220 kip->nused--; 207 kip->nused--;
@@ -235,7 +222,7 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
235 return 0; 222 return 0;
236} 223}
237 224
238static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c) 225static int collect_garbage_slots(struct kprobe_insn_cache *c)
239{ 226{
240 struct kprobe_insn_page *kip, *next; 227 struct kprobe_insn_page *kip, *next;
241 228
@@ -257,8 +244,8 @@ static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c)
257 return 0; 244 return 0;
258} 245}
259 246
260void __kprobes __free_insn_slot(struct kprobe_insn_cache *c, 247void __free_insn_slot(struct kprobe_insn_cache *c,
261 kprobe_opcode_t *slot, int dirty) 248 kprobe_opcode_t *slot, int dirty)
262{ 249{
263 struct kprobe_insn_page *kip; 250 struct kprobe_insn_page *kip;
264 251
@@ -314,7 +301,7 @@ static inline void reset_kprobe_instance(void)
314 * OR 301 * OR
315 * - with preemption disabled - from arch/xxx/kernel/kprobes.c 302 * - with preemption disabled - from arch/xxx/kernel/kprobes.c
316 */ 303 */
317struct kprobe __kprobes *get_kprobe(void *addr) 304struct kprobe *get_kprobe(void *addr)
318{ 305{
319 struct hlist_head *head; 306 struct hlist_head *head;
320 struct kprobe *p; 307 struct kprobe *p;
@@ -327,8 +314,9 @@ struct kprobe __kprobes *get_kprobe(void *addr)
327 314
328 return NULL; 315 return NULL;
329} 316}
317NOKPROBE_SYMBOL(get_kprobe);
330 318
331static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs); 319static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs);
332 320
333/* Return true if the kprobe is an aggregator */ 321/* Return true if the kprobe is an aggregator */
334static inline int kprobe_aggrprobe(struct kprobe *p) 322static inline int kprobe_aggrprobe(struct kprobe *p)
@@ -360,7 +348,7 @@ static bool kprobes_allow_optimization;
360 * Call all pre_handler on the list, but ignores its return value. 348 * Call all pre_handler on the list, but ignores its return value.
361 * This must be called from arch-dep optimized caller. 349 * This must be called from arch-dep optimized caller.
362 */ 350 */
363void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs) 351void opt_pre_handler(struct kprobe *p, struct pt_regs *regs)
364{ 352{
365 struct kprobe *kp; 353 struct kprobe *kp;
366 354
@@ -372,9 +360,10 @@ void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs)
372 reset_kprobe_instance(); 360 reset_kprobe_instance();
373 } 361 }
374} 362}
363NOKPROBE_SYMBOL(opt_pre_handler);
375 364
376/* Free optimized instructions and optimized_kprobe */ 365/* Free optimized instructions and optimized_kprobe */
377static __kprobes void free_aggr_kprobe(struct kprobe *p) 366static void free_aggr_kprobe(struct kprobe *p)
378{ 367{
379 struct optimized_kprobe *op; 368 struct optimized_kprobe *op;
380 369
@@ -412,7 +401,7 @@ static inline int kprobe_disarmed(struct kprobe *p)
412} 401}
413 402
414/* Return true(!0) if the probe is queued on (un)optimizing lists */ 403/* Return true(!0) if the probe is queued on (un)optimizing lists */
415static int __kprobes kprobe_queued(struct kprobe *p) 404static int kprobe_queued(struct kprobe *p)
416{ 405{
417 struct optimized_kprobe *op; 406 struct optimized_kprobe *op;
418 407
@@ -428,7 +417,7 @@ static int __kprobes kprobe_queued(struct kprobe *p)
428 * Return an optimized kprobe whose optimizing code replaces 417 * Return an optimized kprobe whose optimizing code replaces
429 * instructions including addr (exclude breakpoint). 418 * instructions including addr (exclude breakpoint).
430 */ 419 */
431static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr) 420static struct kprobe *get_optimized_kprobe(unsigned long addr)
432{ 421{
433 int i; 422 int i;
434 struct kprobe *p = NULL; 423 struct kprobe *p = NULL;
@@ -460,7 +449,7 @@ static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
460 * Optimize (replace a breakpoint with a jump) kprobes listed on 449 * Optimize (replace a breakpoint with a jump) kprobes listed on
461 * optimizing_list. 450 * optimizing_list.
462 */ 451 */
463static __kprobes void do_optimize_kprobes(void) 452static void do_optimize_kprobes(void)
464{ 453{
465 /* Optimization never be done when disarmed */ 454 /* Optimization never be done when disarmed */
466 if (kprobes_all_disarmed || !kprobes_allow_optimization || 455 if (kprobes_all_disarmed || !kprobes_allow_optimization ||
@@ -488,7 +477,7 @@ static __kprobes void do_optimize_kprobes(void)
488 * Unoptimize (replace a jump with a breakpoint and remove the breakpoint 477 * Unoptimize (replace a jump with a breakpoint and remove the breakpoint
489 * if need) kprobes listed on unoptimizing_list. 478 * if need) kprobes listed on unoptimizing_list.
490 */ 479 */
491static __kprobes void do_unoptimize_kprobes(void) 480static void do_unoptimize_kprobes(void)
492{ 481{
493 struct optimized_kprobe *op, *tmp; 482 struct optimized_kprobe *op, *tmp;
494 483
@@ -520,7 +509,7 @@ static __kprobes void do_unoptimize_kprobes(void)
520} 509}
521 510
522/* Reclaim all kprobes on the free_list */ 511/* Reclaim all kprobes on the free_list */
523static __kprobes void do_free_cleaned_kprobes(void) 512static void do_free_cleaned_kprobes(void)
524{ 513{
525 struct optimized_kprobe *op, *tmp; 514 struct optimized_kprobe *op, *tmp;
526 515
@@ -532,13 +521,13 @@ static __kprobes void do_free_cleaned_kprobes(void)
532} 521}
533 522
534/* Start optimizer after OPTIMIZE_DELAY passed */ 523/* Start optimizer after OPTIMIZE_DELAY passed */
535static __kprobes void kick_kprobe_optimizer(void) 524static void kick_kprobe_optimizer(void)
536{ 525{
537 schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY); 526 schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
538} 527}
539 528
540/* Kprobe jump optimizer */ 529/* Kprobe jump optimizer */
541static __kprobes void kprobe_optimizer(struct work_struct *work) 530static void kprobe_optimizer(struct work_struct *work)
542{ 531{
543 mutex_lock(&kprobe_mutex); 532 mutex_lock(&kprobe_mutex);
544 /* Lock modules while optimizing kprobes */ 533 /* Lock modules while optimizing kprobes */
@@ -574,7 +563,7 @@ static __kprobes void kprobe_optimizer(struct work_struct *work)
574} 563}
575 564
576/* Wait for completing optimization and unoptimization */ 565/* Wait for completing optimization and unoptimization */
577static __kprobes void wait_for_kprobe_optimizer(void) 566static void wait_for_kprobe_optimizer(void)
578{ 567{
579 mutex_lock(&kprobe_mutex); 568 mutex_lock(&kprobe_mutex);
580 569
@@ -593,7 +582,7 @@ static __kprobes void wait_for_kprobe_optimizer(void)
593} 582}
594 583
595/* Optimize kprobe if p is ready to be optimized */ 584/* Optimize kprobe if p is ready to be optimized */
596static __kprobes void optimize_kprobe(struct kprobe *p) 585static void optimize_kprobe(struct kprobe *p)
597{ 586{
598 struct optimized_kprobe *op; 587 struct optimized_kprobe *op;
599 588
@@ -627,7 +616,7 @@ static __kprobes void optimize_kprobe(struct kprobe *p)
627} 616}
628 617
629/* Short cut to direct unoptimizing */ 618/* Short cut to direct unoptimizing */
630static __kprobes void force_unoptimize_kprobe(struct optimized_kprobe *op) 619static void force_unoptimize_kprobe(struct optimized_kprobe *op)
631{ 620{
632 get_online_cpus(); 621 get_online_cpus();
633 arch_unoptimize_kprobe(op); 622 arch_unoptimize_kprobe(op);
@@ -637,7 +626,7 @@ static __kprobes void force_unoptimize_kprobe(struct optimized_kprobe *op)
637} 626}
638 627
639/* Unoptimize a kprobe if p is optimized */ 628/* Unoptimize a kprobe if p is optimized */
640static __kprobes void unoptimize_kprobe(struct kprobe *p, bool force) 629static void unoptimize_kprobe(struct kprobe *p, bool force)
641{ 630{
642 struct optimized_kprobe *op; 631 struct optimized_kprobe *op;
643 632
@@ -697,7 +686,7 @@ static void reuse_unused_kprobe(struct kprobe *ap)
697} 686}
698 687
699/* Remove optimized instructions */ 688/* Remove optimized instructions */
700static void __kprobes kill_optimized_kprobe(struct kprobe *p) 689static void kill_optimized_kprobe(struct kprobe *p)
701{ 690{
702 struct optimized_kprobe *op; 691 struct optimized_kprobe *op;
703 692
@@ -723,7 +712,7 @@ static void __kprobes kill_optimized_kprobe(struct kprobe *p)
723} 712}
724 713
725/* Try to prepare optimized instructions */ 714/* Try to prepare optimized instructions */
726static __kprobes void prepare_optimized_kprobe(struct kprobe *p) 715static void prepare_optimized_kprobe(struct kprobe *p)
727{ 716{
728 struct optimized_kprobe *op; 717 struct optimized_kprobe *op;
729 718
@@ -732,7 +721,7 @@ static __kprobes void prepare_optimized_kprobe(struct kprobe *p)
732} 721}
733 722
734/* Allocate new optimized_kprobe and try to prepare optimized instructions */ 723/* Allocate new optimized_kprobe and try to prepare optimized instructions */
735static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) 724static struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
736{ 725{
737 struct optimized_kprobe *op; 726 struct optimized_kprobe *op;
738 727
@@ -747,13 +736,13 @@ static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
747 return &op->kp; 736 return &op->kp;
748} 737}
749 738
750static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p); 739static void init_aggr_kprobe(struct kprobe *ap, struct kprobe *p);
751 740
752/* 741/*
753 * Prepare an optimized_kprobe and optimize it 742 * Prepare an optimized_kprobe and optimize it
754 * NOTE: p must be a normal registered kprobe 743 * NOTE: p must be a normal registered kprobe
755 */ 744 */
756static __kprobes void try_to_optimize_kprobe(struct kprobe *p) 745static void try_to_optimize_kprobe(struct kprobe *p)
757{ 746{
758 struct kprobe *ap; 747 struct kprobe *ap;
759 struct optimized_kprobe *op; 748 struct optimized_kprobe *op;
@@ -787,7 +776,7 @@ out:
787} 776}
788 777
789#ifdef CONFIG_SYSCTL 778#ifdef CONFIG_SYSCTL
790static void __kprobes optimize_all_kprobes(void) 779static void optimize_all_kprobes(void)
791{ 780{
792 struct hlist_head *head; 781 struct hlist_head *head;
793 struct kprobe *p; 782 struct kprobe *p;
@@ -810,7 +799,7 @@ out:
810 mutex_unlock(&kprobe_mutex); 799 mutex_unlock(&kprobe_mutex);
811} 800}
812 801
813static void __kprobes unoptimize_all_kprobes(void) 802static void unoptimize_all_kprobes(void)
814{ 803{
815 struct hlist_head *head; 804 struct hlist_head *head;
816 struct kprobe *p; 805 struct kprobe *p;
@@ -861,7 +850,7 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
861#endif /* CONFIG_SYSCTL */ 850#endif /* CONFIG_SYSCTL */
862 851
863/* Put a breakpoint for a probe. Must be called with text_mutex locked */ 852/* Put a breakpoint for a probe. Must be called with text_mutex locked */
864static void __kprobes __arm_kprobe(struct kprobe *p) 853static void __arm_kprobe(struct kprobe *p)
865{ 854{
866 struct kprobe *_p; 855 struct kprobe *_p;
867 856
@@ -876,7 +865,7 @@ static void __kprobes __arm_kprobe(struct kprobe *p)
876} 865}
877 866
878/* Remove the breakpoint of a probe. Must be called with text_mutex locked */ 867/* Remove the breakpoint of a probe. Must be called with text_mutex locked */
879static void __kprobes __disarm_kprobe(struct kprobe *p, bool reopt) 868static void __disarm_kprobe(struct kprobe *p, bool reopt)
880{ 869{
881 struct kprobe *_p; 870 struct kprobe *_p;
882 871
@@ -911,13 +900,13 @@ static void reuse_unused_kprobe(struct kprobe *ap)
911 BUG_ON(kprobe_unused(ap)); 900 BUG_ON(kprobe_unused(ap));
912} 901}
913 902
914static __kprobes void free_aggr_kprobe(struct kprobe *p) 903static void free_aggr_kprobe(struct kprobe *p)
915{ 904{
916 arch_remove_kprobe(p); 905 arch_remove_kprobe(p);
917 kfree(p); 906 kfree(p);
918} 907}
919 908
920static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) 909static struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
921{ 910{
922 return kzalloc(sizeof(struct kprobe), GFP_KERNEL); 911 return kzalloc(sizeof(struct kprobe), GFP_KERNEL);
923} 912}
@@ -931,7 +920,7 @@ static struct ftrace_ops kprobe_ftrace_ops __read_mostly = {
931static int kprobe_ftrace_enabled; 920static int kprobe_ftrace_enabled;
932 921
933/* Must ensure p->addr is really on ftrace */ 922/* Must ensure p->addr is really on ftrace */
934static int __kprobes prepare_kprobe(struct kprobe *p) 923static int prepare_kprobe(struct kprobe *p)
935{ 924{
936 if (!kprobe_ftrace(p)) 925 if (!kprobe_ftrace(p))
937 return arch_prepare_kprobe(p); 926 return arch_prepare_kprobe(p);
@@ -940,7 +929,7 @@ static int __kprobes prepare_kprobe(struct kprobe *p)
940} 929}
941 930
942/* Caller must lock kprobe_mutex */ 931/* Caller must lock kprobe_mutex */
943static void __kprobes arm_kprobe_ftrace(struct kprobe *p) 932static void arm_kprobe_ftrace(struct kprobe *p)
944{ 933{
945 int ret; 934 int ret;
946 935
@@ -955,7 +944,7 @@ static void __kprobes arm_kprobe_ftrace(struct kprobe *p)
955} 944}
956 945
957/* Caller must lock kprobe_mutex */ 946/* Caller must lock kprobe_mutex */
958static void __kprobes disarm_kprobe_ftrace(struct kprobe *p) 947static void disarm_kprobe_ftrace(struct kprobe *p)
959{ 948{
960 int ret; 949 int ret;
961 950
@@ -975,7 +964,7 @@ static void __kprobes disarm_kprobe_ftrace(struct kprobe *p)
975#endif 964#endif
976 965
977/* Arm a kprobe with text_mutex */ 966/* Arm a kprobe with text_mutex */
978static void __kprobes arm_kprobe(struct kprobe *kp) 967static void arm_kprobe(struct kprobe *kp)
979{ 968{
980 if (unlikely(kprobe_ftrace(kp))) { 969 if (unlikely(kprobe_ftrace(kp))) {
981 arm_kprobe_ftrace(kp); 970 arm_kprobe_ftrace(kp);
@@ -992,7 +981,7 @@ static void __kprobes arm_kprobe(struct kprobe *kp)
992} 981}
993 982
994/* Disarm a kprobe with text_mutex */ 983/* Disarm a kprobe with text_mutex */
995static void __kprobes disarm_kprobe(struct kprobe *kp, bool reopt) 984static void disarm_kprobe(struct kprobe *kp, bool reopt)
996{ 985{
997 if (unlikely(kprobe_ftrace(kp))) { 986 if (unlikely(kprobe_ftrace(kp))) {
998 disarm_kprobe_ftrace(kp); 987 disarm_kprobe_ftrace(kp);
@@ -1008,7 +997,7 @@ static void __kprobes disarm_kprobe(struct kprobe *kp, bool reopt)
1008 * Aggregate handlers for multiple kprobes support - these handlers 997 * Aggregate handlers for multiple kprobes support - these handlers
1009 * take care of invoking the individual kprobe handlers on p->list 998 * take care of invoking the individual kprobe handlers on p->list
1010 */ 999 */
1011static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) 1000static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
1012{ 1001{
1013 struct kprobe *kp; 1002 struct kprobe *kp;
1014 1003
@@ -1022,9 +1011,10 @@ static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
1022 } 1011 }
1023 return 0; 1012 return 0;
1024} 1013}
1014NOKPROBE_SYMBOL(aggr_pre_handler);
1025 1015
1026static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs, 1016static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
1027 unsigned long flags) 1017 unsigned long flags)
1028{ 1018{
1029 struct kprobe *kp; 1019 struct kprobe *kp;
1030 1020
@@ -1036,9 +1026,10 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
1036 } 1026 }
1037 } 1027 }
1038} 1028}
1029NOKPROBE_SYMBOL(aggr_post_handler);
1039 1030
1040static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, 1031static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
1041 int trapnr) 1032 int trapnr)
1042{ 1033{
1043 struct kprobe *cur = __this_cpu_read(kprobe_instance); 1034 struct kprobe *cur = __this_cpu_read(kprobe_instance);
1044 1035
@@ -1052,8 +1043,9 @@ static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
1052 } 1043 }
1053 return 0; 1044 return 0;
1054} 1045}
1046NOKPROBE_SYMBOL(aggr_fault_handler);
1055 1047
1056static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs) 1048static int aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
1057{ 1049{
1058 struct kprobe *cur = __this_cpu_read(kprobe_instance); 1050 struct kprobe *cur = __this_cpu_read(kprobe_instance);
1059 int ret = 0; 1051 int ret = 0;
@@ -1065,9 +1057,10 @@ static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
1065 reset_kprobe_instance(); 1057 reset_kprobe_instance();
1066 return ret; 1058 return ret;
1067} 1059}
1060NOKPROBE_SYMBOL(aggr_break_handler);
1068 1061
1069/* Walks the list and increments nmissed count for multiprobe case */ 1062/* Walks the list and increments nmissed count for multiprobe case */
1070void __kprobes kprobes_inc_nmissed_count(struct kprobe *p) 1063void kprobes_inc_nmissed_count(struct kprobe *p)
1071{ 1064{
1072 struct kprobe *kp; 1065 struct kprobe *kp;
1073 if (!kprobe_aggrprobe(p)) { 1066 if (!kprobe_aggrprobe(p)) {
@@ -1078,9 +1071,10 @@ void __kprobes kprobes_inc_nmissed_count(struct kprobe *p)
1078 } 1071 }
1079 return; 1072 return;
1080} 1073}
1074NOKPROBE_SYMBOL(kprobes_inc_nmissed_count);
1081 1075
1082void __kprobes recycle_rp_inst(struct kretprobe_instance *ri, 1076void recycle_rp_inst(struct kretprobe_instance *ri,
1083 struct hlist_head *head) 1077 struct hlist_head *head)
1084{ 1078{
1085 struct kretprobe *rp = ri->rp; 1079 struct kretprobe *rp = ri->rp;
1086 1080
@@ -1095,8 +1089,9 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri,
1095 /* Unregistering */ 1089 /* Unregistering */
1096 hlist_add_head(&ri->hlist, head); 1090 hlist_add_head(&ri->hlist, head);
1097} 1091}
1092NOKPROBE_SYMBOL(recycle_rp_inst);
1098 1093
1099void __kprobes kretprobe_hash_lock(struct task_struct *tsk, 1094void kretprobe_hash_lock(struct task_struct *tsk,
1100 struct hlist_head **head, unsigned long *flags) 1095 struct hlist_head **head, unsigned long *flags)
1101__acquires(hlist_lock) 1096__acquires(hlist_lock)
1102{ 1097{
@@ -1107,17 +1102,19 @@ __acquires(hlist_lock)
1107 hlist_lock = kretprobe_table_lock_ptr(hash); 1102 hlist_lock = kretprobe_table_lock_ptr(hash);
1108 raw_spin_lock_irqsave(hlist_lock, *flags); 1103 raw_spin_lock_irqsave(hlist_lock, *flags);
1109} 1104}
1105NOKPROBE_SYMBOL(kretprobe_hash_lock);
1110 1106
1111static void __kprobes kretprobe_table_lock(unsigned long hash, 1107static void kretprobe_table_lock(unsigned long hash,
1112 unsigned long *flags) 1108 unsigned long *flags)
1113__acquires(hlist_lock) 1109__acquires(hlist_lock)
1114{ 1110{
1115 raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); 1111 raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
1116 raw_spin_lock_irqsave(hlist_lock, *flags); 1112 raw_spin_lock_irqsave(hlist_lock, *flags);
1117} 1113}
1114NOKPROBE_SYMBOL(kretprobe_table_lock);
1118 1115
1119void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, 1116void kretprobe_hash_unlock(struct task_struct *tsk,
1120 unsigned long *flags) 1117 unsigned long *flags)
1121__releases(hlist_lock) 1118__releases(hlist_lock)
1122{ 1119{
1123 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); 1120 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
@@ -1126,14 +1123,16 @@ __releases(hlist_lock)
1126 hlist_lock = kretprobe_table_lock_ptr(hash); 1123 hlist_lock = kretprobe_table_lock_ptr(hash);
1127 raw_spin_unlock_irqrestore(hlist_lock, *flags); 1124 raw_spin_unlock_irqrestore(hlist_lock, *flags);
1128} 1125}
1126NOKPROBE_SYMBOL(kretprobe_hash_unlock);
1129 1127
1130static void __kprobes kretprobe_table_unlock(unsigned long hash, 1128static void kretprobe_table_unlock(unsigned long hash,
1131 unsigned long *flags) 1129 unsigned long *flags)
1132__releases(hlist_lock) 1130__releases(hlist_lock)
1133{ 1131{
1134 raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); 1132 raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
1135 raw_spin_unlock_irqrestore(hlist_lock, *flags); 1133 raw_spin_unlock_irqrestore(hlist_lock, *flags);
1136} 1134}
1135NOKPROBE_SYMBOL(kretprobe_table_unlock);
1137 1136
1138/* 1137/*
1139 * This function is called from finish_task_switch when task tk becomes dead, 1138 * This function is called from finish_task_switch when task tk becomes dead,
@@ -1141,7 +1140,7 @@ __releases(hlist_lock)
1141 * with this task. These left over instances represent probed functions 1140 * with this task. These left over instances represent probed functions
1142 * that have been called but will never return. 1141 * that have been called but will never return.
1143 */ 1142 */
1144void __kprobes kprobe_flush_task(struct task_struct *tk) 1143void kprobe_flush_task(struct task_struct *tk)
1145{ 1144{
1146 struct kretprobe_instance *ri; 1145 struct kretprobe_instance *ri;
1147 struct hlist_head *head, empty_rp; 1146 struct hlist_head *head, empty_rp;
@@ -1166,6 +1165,7 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
1166 kfree(ri); 1165 kfree(ri);
1167 } 1166 }
1168} 1167}
1168NOKPROBE_SYMBOL(kprobe_flush_task);
1169 1169
1170static inline void free_rp_inst(struct kretprobe *rp) 1170static inline void free_rp_inst(struct kretprobe *rp)
1171{ 1171{
@@ -1178,7 +1178,7 @@ static inline void free_rp_inst(struct kretprobe *rp)
1178 } 1178 }
1179} 1179}
1180 1180
1181static void __kprobes cleanup_rp_inst(struct kretprobe *rp) 1181static void cleanup_rp_inst(struct kretprobe *rp)
1182{ 1182{
1183 unsigned long flags, hash; 1183 unsigned long flags, hash;
1184 struct kretprobe_instance *ri; 1184 struct kretprobe_instance *ri;
@@ -1197,12 +1197,13 @@ static void __kprobes cleanup_rp_inst(struct kretprobe *rp)
1197 } 1197 }
1198 free_rp_inst(rp); 1198 free_rp_inst(rp);
1199} 1199}
1200NOKPROBE_SYMBOL(cleanup_rp_inst);
1200 1201
1201/* 1202/*
1202* Add the new probe to ap->list. Fail if this is the 1203* Add the new probe to ap->list. Fail if this is the
1203* second jprobe at the address - two jprobes can't coexist 1204* second jprobe at the address - two jprobes can't coexist
1204*/ 1205*/
1205static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) 1206static int add_new_kprobe(struct kprobe *ap, struct kprobe *p)
1206{ 1207{
1207 BUG_ON(kprobe_gone(ap) || kprobe_gone(p)); 1208 BUG_ON(kprobe_gone(ap) || kprobe_gone(p));
1208 1209
@@ -1226,7 +1227,7 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
1226 * Fill in the required fields of the "manager kprobe". Replace the 1227 * Fill in the required fields of the "manager kprobe". Replace the
1227 * earlier kprobe in the hlist with the manager kprobe 1228 * earlier kprobe in the hlist with the manager kprobe
1228 */ 1229 */
1229static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p) 1230static void init_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
1230{ 1231{
1231 /* Copy p's insn slot to ap */ 1232 /* Copy p's insn slot to ap */
1232 copy_kprobe(p, ap); 1233 copy_kprobe(p, ap);
@@ -1252,8 +1253,7 @@ static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
1252 * This is the second or subsequent kprobe at the address - handle 1253 * This is the second or subsequent kprobe at the address - handle
1253 * the intricacies 1254 * the intricacies
1254 */ 1255 */
1255static int __kprobes register_aggr_kprobe(struct kprobe *orig_p, 1256static int register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p)
1256 struct kprobe *p)
1257{ 1257{
1258 int ret = 0; 1258 int ret = 0;
1259 struct kprobe *ap = orig_p; 1259 struct kprobe *ap = orig_p;
@@ -1324,25 +1324,29 @@ out:
1324 return ret; 1324 return ret;
1325} 1325}
1326 1326
1327static int __kprobes in_kprobes_functions(unsigned long addr) 1327bool __weak arch_within_kprobe_blacklist(unsigned long addr)
1328{ 1328{
1329 struct kprobe_blackpoint *kb; 1329 /* The __kprobes marked functions and entry code must not be probed */
1330 return addr >= (unsigned long)__kprobes_text_start &&
1331 addr < (unsigned long)__kprobes_text_end;
1332}
1330 1333
1331 if (addr >= (unsigned long)__kprobes_text_start && 1334static bool within_kprobe_blacklist(unsigned long addr)
1332 addr < (unsigned long)__kprobes_text_end) 1335{
1333 return -EINVAL; 1336 struct kprobe_blacklist_entry *ent;
1337
1338 if (arch_within_kprobe_blacklist(addr))
1339 return true;
1334 /* 1340 /*
1335 * If there exists a kprobe_blacklist, verify and 1341 * If there exists a kprobe_blacklist, verify and
1336 * fail any probe registration in the prohibited area 1342 * fail any probe registration in the prohibited area
1337 */ 1343 */
1338 for (kb = kprobe_blacklist; kb->name != NULL; kb++) { 1344 list_for_each_entry(ent, &kprobe_blacklist, list) {
1339 if (kb->start_addr) { 1345 if (addr >= ent->start_addr && addr < ent->end_addr)
1340 if (addr >= kb->start_addr && 1346 return true;
1341 addr < (kb->start_addr + kb->range))
1342 return -EINVAL;
1343 }
1344 } 1347 }
1345 return 0; 1348
1349 return false;
1346} 1350}
1347 1351
1348/* 1352/*
@@ -1351,7 +1355,7 @@ static int __kprobes in_kprobes_functions(unsigned long addr)
1351 * This returns encoded errors if it fails to look up symbol or invalid 1355 * This returns encoded errors if it fails to look up symbol or invalid
1352 * combination of parameters. 1356 * combination of parameters.
1353 */ 1357 */
1354static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p) 1358static kprobe_opcode_t *kprobe_addr(struct kprobe *p)
1355{ 1359{
1356 kprobe_opcode_t *addr = p->addr; 1360 kprobe_opcode_t *addr = p->addr;
1357 1361
@@ -1374,7 +1378,7 @@ invalid:
1374} 1378}
1375 1379
1376/* Check passed kprobe is valid and return kprobe in kprobe_table. */ 1380/* Check passed kprobe is valid and return kprobe in kprobe_table. */
1377static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p) 1381static struct kprobe *__get_valid_kprobe(struct kprobe *p)
1378{ 1382{
1379 struct kprobe *ap, *list_p; 1383 struct kprobe *ap, *list_p;
1380 1384
@@ -1406,8 +1410,8 @@ static inline int check_kprobe_rereg(struct kprobe *p)
1406 return ret; 1410 return ret;
1407} 1411}
1408 1412
1409static __kprobes int check_kprobe_address_safe(struct kprobe *p, 1413static int check_kprobe_address_safe(struct kprobe *p,
1410 struct module **probed_mod) 1414 struct module **probed_mod)
1411{ 1415{
1412 int ret = 0; 1416 int ret = 0;
1413 unsigned long ftrace_addr; 1417 unsigned long ftrace_addr;
@@ -1433,7 +1437,7 @@ static __kprobes int check_kprobe_address_safe(struct kprobe *p,
1433 1437
1434 /* Ensure it is not in reserved area nor out of text */ 1438 /* Ensure it is not in reserved area nor out of text */
1435 if (!kernel_text_address((unsigned long) p->addr) || 1439 if (!kernel_text_address((unsigned long) p->addr) ||
1436 in_kprobes_functions((unsigned long) p->addr) || 1440 within_kprobe_blacklist((unsigned long) p->addr) ||
1437 jump_label_text_reserved(p->addr, p->addr)) { 1441 jump_label_text_reserved(p->addr, p->addr)) {
1438 ret = -EINVAL; 1442 ret = -EINVAL;
1439 goto out; 1443 goto out;
@@ -1469,7 +1473,7 @@ out:
1469 return ret; 1473 return ret;
1470} 1474}
1471 1475
1472int __kprobes register_kprobe(struct kprobe *p) 1476int register_kprobe(struct kprobe *p)
1473{ 1477{
1474 int ret; 1478 int ret;
1475 struct kprobe *old_p; 1479 struct kprobe *old_p;
@@ -1531,7 +1535,7 @@ out:
1531EXPORT_SYMBOL_GPL(register_kprobe); 1535EXPORT_SYMBOL_GPL(register_kprobe);
1532 1536
1533/* Check if all probes on the aggrprobe are disabled */ 1537/* Check if all probes on the aggrprobe are disabled */
1534static int __kprobes aggr_kprobe_disabled(struct kprobe *ap) 1538static int aggr_kprobe_disabled(struct kprobe *ap)
1535{ 1539{
1536 struct kprobe *kp; 1540 struct kprobe *kp;
1537 1541
@@ -1547,7 +1551,7 @@ static int __kprobes aggr_kprobe_disabled(struct kprobe *ap)
1547} 1551}
1548 1552
1549/* Disable one kprobe: Make sure called under kprobe_mutex is locked */ 1553/* Disable one kprobe: Make sure called under kprobe_mutex is locked */
1550static struct kprobe *__kprobes __disable_kprobe(struct kprobe *p) 1554static struct kprobe *__disable_kprobe(struct kprobe *p)
1551{ 1555{
1552 struct kprobe *orig_p; 1556 struct kprobe *orig_p;
1553 1557
@@ -1574,7 +1578,7 @@ static struct kprobe *__kprobes __disable_kprobe(struct kprobe *p)
1574/* 1578/*
1575 * Unregister a kprobe without a scheduler synchronization. 1579 * Unregister a kprobe without a scheduler synchronization.
1576 */ 1580 */
1577static int __kprobes __unregister_kprobe_top(struct kprobe *p) 1581static int __unregister_kprobe_top(struct kprobe *p)
1578{ 1582{
1579 struct kprobe *ap, *list_p; 1583 struct kprobe *ap, *list_p;
1580 1584
@@ -1631,7 +1635,7 @@ disarmed:
1631 return 0; 1635 return 0;
1632} 1636}
1633 1637
1634static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) 1638static void __unregister_kprobe_bottom(struct kprobe *p)
1635{ 1639{
1636 struct kprobe *ap; 1640 struct kprobe *ap;
1637 1641
@@ -1647,7 +1651,7 @@ static void __kprobes __unregister_kprobe_bottom(struct kprobe *p)
1647 /* Otherwise, do nothing. */ 1651 /* Otherwise, do nothing. */
1648} 1652}
1649 1653
1650int __kprobes register_kprobes(struct kprobe **kps, int num) 1654int register_kprobes(struct kprobe **kps, int num)
1651{ 1655{
1652 int i, ret = 0; 1656 int i, ret = 0;
1653 1657
@@ -1665,13 +1669,13 @@ int __kprobes register_kprobes(struct kprobe **kps, int num)
1665} 1669}
1666EXPORT_SYMBOL_GPL(register_kprobes); 1670EXPORT_SYMBOL_GPL(register_kprobes);
1667 1671
1668void __kprobes unregister_kprobe(struct kprobe *p) 1672void unregister_kprobe(struct kprobe *p)
1669{ 1673{
1670 unregister_kprobes(&p, 1); 1674 unregister_kprobes(&p, 1);
1671} 1675}
1672EXPORT_SYMBOL_GPL(unregister_kprobe); 1676EXPORT_SYMBOL_GPL(unregister_kprobe);
1673 1677
1674void __kprobes unregister_kprobes(struct kprobe **kps, int num) 1678void unregister_kprobes(struct kprobe **kps, int num)
1675{ 1679{
1676 int i; 1680 int i;
1677 1681
@@ -1700,7 +1704,7 @@ unsigned long __weak arch_deref_entry_point(void *entry)
1700 return (unsigned long)entry; 1704 return (unsigned long)entry;
1701} 1705}
1702 1706
1703int __kprobes register_jprobes(struct jprobe **jps, int num) 1707int register_jprobes(struct jprobe **jps, int num)
1704{ 1708{
1705 struct jprobe *jp; 1709 struct jprobe *jp;
1706 int ret = 0, i; 1710 int ret = 0, i;
@@ -1731,19 +1735,19 @@ int __kprobes register_jprobes(struct jprobe **jps, int num)
1731} 1735}
1732EXPORT_SYMBOL_GPL(register_jprobes); 1736EXPORT_SYMBOL_GPL(register_jprobes);
1733 1737
1734int __kprobes register_jprobe(struct jprobe *jp) 1738int register_jprobe(struct jprobe *jp)
1735{ 1739{
1736 return register_jprobes(&jp, 1); 1740 return register_jprobes(&jp, 1);
1737} 1741}
1738EXPORT_SYMBOL_GPL(register_jprobe); 1742EXPORT_SYMBOL_GPL(register_jprobe);
1739 1743
1740void __kprobes unregister_jprobe(struct jprobe *jp) 1744void unregister_jprobe(struct jprobe *jp)
1741{ 1745{
1742 unregister_jprobes(&jp, 1); 1746 unregister_jprobes(&jp, 1);
1743} 1747}
1744EXPORT_SYMBOL_GPL(unregister_jprobe); 1748EXPORT_SYMBOL_GPL(unregister_jprobe);
1745 1749
1746void __kprobes unregister_jprobes(struct jprobe **jps, int num) 1750void unregister_jprobes(struct jprobe **jps, int num)
1747{ 1751{
1748 int i; 1752 int i;
1749 1753
@@ -1768,8 +1772,7 @@ EXPORT_SYMBOL_GPL(unregister_jprobes);
1768 * This kprobe pre_handler is registered with every kretprobe. When probe 1772 * This kprobe pre_handler is registered with every kretprobe. When probe
1769 * hits it will set up the return probe. 1773 * hits it will set up the return probe.
1770 */ 1774 */
1771static int __kprobes pre_handler_kretprobe(struct kprobe *p, 1775static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
1772 struct pt_regs *regs)
1773{ 1776{
1774 struct kretprobe *rp = container_of(p, struct kretprobe, kp); 1777 struct kretprobe *rp = container_of(p, struct kretprobe, kp);
1775 unsigned long hash, flags = 0; 1778 unsigned long hash, flags = 0;
@@ -1807,8 +1810,9 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
1807 } 1810 }
1808 return 0; 1811 return 0;
1809} 1812}
1813NOKPROBE_SYMBOL(pre_handler_kretprobe);
1810 1814
1811int __kprobes register_kretprobe(struct kretprobe *rp) 1815int register_kretprobe(struct kretprobe *rp)
1812{ 1816{
1813 int ret = 0; 1817 int ret = 0;
1814 struct kretprobe_instance *inst; 1818 struct kretprobe_instance *inst;
@@ -1861,7 +1865,7 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
1861} 1865}
1862EXPORT_SYMBOL_GPL(register_kretprobe); 1866EXPORT_SYMBOL_GPL(register_kretprobe);
1863 1867
1864int __kprobes register_kretprobes(struct kretprobe **rps, int num) 1868int register_kretprobes(struct kretprobe **rps, int num)
1865{ 1869{
1866 int ret = 0, i; 1870 int ret = 0, i;
1867 1871
@@ -1879,13 +1883,13 @@ int __kprobes register_kretprobes(struct kretprobe **rps, int num)
1879} 1883}
1880EXPORT_SYMBOL_GPL(register_kretprobes); 1884EXPORT_SYMBOL_GPL(register_kretprobes);
1881 1885
1882void __kprobes unregister_kretprobe(struct kretprobe *rp) 1886void unregister_kretprobe(struct kretprobe *rp)
1883{ 1887{
1884 unregister_kretprobes(&rp, 1); 1888 unregister_kretprobes(&rp, 1);
1885} 1889}
1886EXPORT_SYMBOL_GPL(unregister_kretprobe); 1890EXPORT_SYMBOL_GPL(unregister_kretprobe);
1887 1891
1888void __kprobes unregister_kretprobes(struct kretprobe **rps, int num) 1892void unregister_kretprobes(struct kretprobe **rps, int num)
1889{ 1893{
1890 int i; 1894 int i;
1891 1895
@@ -1908,38 +1912,38 @@ void __kprobes unregister_kretprobes(struct kretprobe **rps, int num)
1908EXPORT_SYMBOL_GPL(unregister_kretprobes); 1912EXPORT_SYMBOL_GPL(unregister_kretprobes);
1909 1913
1910#else /* CONFIG_KRETPROBES */ 1914#else /* CONFIG_KRETPROBES */
1911int __kprobes register_kretprobe(struct kretprobe *rp) 1915int register_kretprobe(struct kretprobe *rp)
1912{ 1916{
1913 return -ENOSYS; 1917 return -ENOSYS;
1914} 1918}
1915EXPORT_SYMBOL_GPL(register_kretprobe); 1919EXPORT_SYMBOL_GPL(register_kretprobe);
1916 1920
1917int __kprobes register_kretprobes(struct kretprobe **rps, int num) 1921int register_kretprobes(struct kretprobe **rps, int num)
1918{ 1922{
1919 return -ENOSYS; 1923 return -ENOSYS;
1920} 1924}
1921EXPORT_SYMBOL_GPL(register_kretprobes); 1925EXPORT_SYMBOL_GPL(register_kretprobes);
1922 1926
1923void __kprobes unregister_kretprobe(struct kretprobe *rp) 1927void unregister_kretprobe(struct kretprobe *rp)
1924{ 1928{
1925} 1929}
1926EXPORT_SYMBOL_GPL(unregister_kretprobe); 1930EXPORT_SYMBOL_GPL(unregister_kretprobe);
1927 1931
1928void __kprobes unregister_kretprobes(struct kretprobe **rps, int num) 1932void unregister_kretprobes(struct kretprobe **rps, int num)
1929{ 1933{
1930} 1934}
1931EXPORT_SYMBOL_GPL(unregister_kretprobes); 1935EXPORT_SYMBOL_GPL(unregister_kretprobes);
1932 1936
1933static int __kprobes pre_handler_kretprobe(struct kprobe *p, 1937static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
1934 struct pt_regs *regs)
1935{ 1938{
1936 return 0; 1939 return 0;
1937} 1940}
1941NOKPROBE_SYMBOL(pre_handler_kretprobe);
1938 1942
1939#endif /* CONFIG_KRETPROBES */ 1943#endif /* CONFIG_KRETPROBES */
1940 1944
1941/* Set the kprobe gone and remove its instruction buffer. */ 1945/* Set the kprobe gone and remove its instruction buffer. */
1942static void __kprobes kill_kprobe(struct kprobe *p) 1946static void kill_kprobe(struct kprobe *p)
1943{ 1947{
1944 struct kprobe *kp; 1948 struct kprobe *kp;
1945 1949
@@ -1963,7 +1967,7 @@ static void __kprobes kill_kprobe(struct kprobe *p)
1963} 1967}
1964 1968
1965/* Disable one kprobe */ 1969/* Disable one kprobe */
1966int __kprobes disable_kprobe(struct kprobe *kp) 1970int disable_kprobe(struct kprobe *kp)
1967{ 1971{
1968 int ret = 0; 1972 int ret = 0;
1969 1973
@@ -1979,7 +1983,7 @@ int __kprobes disable_kprobe(struct kprobe *kp)
1979EXPORT_SYMBOL_GPL(disable_kprobe); 1983EXPORT_SYMBOL_GPL(disable_kprobe);
1980 1984
1981/* Enable one kprobe */ 1985/* Enable one kprobe */
1982int __kprobes enable_kprobe(struct kprobe *kp) 1986int enable_kprobe(struct kprobe *kp)
1983{ 1987{
1984 int ret = 0; 1988 int ret = 0;
1985 struct kprobe *p; 1989 struct kprobe *p;
@@ -2012,16 +2016,49 @@ out:
2012} 2016}
2013EXPORT_SYMBOL_GPL(enable_kprobe); 2017EXPORT_SYMBOL_GPL(enable_kprobe);
2014 2018
2015void __kprobes dump_kprobe(struct kprobe *kp) 2019void dump_kprobe(struct kprobe *kp)
2016{ 2020{
2017 printk(KERN_WARNING "Dumping kprobe:\n"); 2021 printk(KERN_WARNING "Dumping kprobe:\n");
2018 printk(KERN_WARNING "Name: %s\nAddress: %p\nOffset: %x\n", 2022 printk(KERN_WARNING "Name: %s\nAddress: %p\nOffset: %x\n",
2019 kp->symbol_name, kp->addr, kp->offset); 2023 kp->symbol_name, kp->addr, kp->offset);
2020} 2024}
2025NOKPROBE_SYMBOL(dump_kprobe);
2026
2027/*
2028 * Lookup and populate the kprobe_blacklist.
2029 *
2030 * Unlike the kretprobe blacklist, we'll need to determine
2031 * the range of addresses that belong to the said functions,
2032 * since a kprobe need not necessarily be at the beginning
2033 * of a function.
2034 */
2035static int __init populate_kprobe_blacklist(unsigned long *start,
2036 unsigned long *end)
2037{
2038 unsigned long *iter;
2039 struct kprobe_blacklist_entry *ent;
2040 unsigned long offset = 0, size = 0;
2041
2042 for (iter = start; iter < end; iter++) {
2043 if (!kallsyms_lookup_size_offset(*iter, &size, &offset)) {
2044 pr_err("Failed to find blacklist %p\n", (void *)*iter);
2045 continue;
2046 }
2047
2048 ent = kmalloc(sizeof(*ent), GFP_KERNEL);
2049 if (!ent)
2050 return -ENOMEM;
2051 ent->start_addr = *iter;
2052 ent->end_addr = *iter + size;
2053 INIT_LIST_HEAD(&ent->list);
2054 list_add_tail(&ent->list, &kprobe_blacklist);
2055 }
2056 return 0;
2057}
2021 2058
2022/* Module notifier call back, checking kprobes on the module */ 2059/* Module notifier call back, checking kprobes on the module */
2023static int __kprobes kprobes_module_callback(struct notifier_block *nb, 2060static int kprobes_module_callback(struct notifier_block *nb,
2024 unsigned long val, void *data) 2061 unsigned long val, void *data)
2025{ 2062{
2026 struct module *mod = data; 2063 struct module *mod = data;
2027 struct hlist_head *head; 2064 struct hlist_head *head;
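
The change that repeats through this file is the removal of the __kprobes function attribute, which placed a function in the .kprobes.text section, in favour of the NOKPROBE_SYMBOL() macro, which records the function's address in a dedicated _kprobe_blacklist section. populate_kprobe_blacklist() above walks that section, resolves each address to a symbol size with kallsyms_lookup_size_offset(), and builds the kprobe_blacklist list that within_kprobe_blacklist() consults when a probe is registered; init_kprobes() (further below) passes in the linker-provided __start_kprobe_blacklist/__stop_kprobe_blacklist markers. The supporting definitions live in include/linux/kprobes.h, not in this file; the following is a reconstruction for orientation, not a quotation:

/* Reconstructed sketch; see include/linux/kprobes.h for the real thing. */
struct kprobe_blacklist_entry {
	struct list_head list;
	unsigned long start_addr;
	unsigned long end_addr;
};

/* Emit the symbol's address into the _kprobe_blacklist section so the
 * boot-time scan between __start_kprobe_blacklist and
 * __stop_kprobe_blacklist can find it. */
#define NOKPROBE_SYMBOL(fname)						\
	static unsigned long __used					\
		__attribute__((section("_kprobe_blacklist")))		\
		_kbl_addr_##fname = (unsigned long)fname;
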
@@ -2062,14 +2099,13 @@ static struct notifier_block kprobe_module_nb = {
2062 .priority = 0 2099 .priority = 0
2063}; 2100};
2064 2101
2102/* Markers of _kprobe_blacklist section */
2103extern unsigned long __start_kprobe_blacklist[];
2104extern unsigned long __stop_kprobe_blacklist[];
2105
2065static int __init init_kprobes(void) 2106static int __init init_kprobes(void)
2066{ 2107{
2067 int i, err = 0; 2108 int i, err = 0;
2068 unsigned long offset = 0, size = 0;
2069 char *modname, namebuf[KSYM_NAME_LEN];
2070 const char *symbol_name;
2071 void *addr;
2072 struct kprobe_blackpoint *kb;
2073 2109
2074 /* FIXME allocate the probe table, currently defined statically */ 2110 /* FIXME allocate the probe table, currently defined statically */
2075 /* initialize all list heads */ 2111 /* initialize all list heads */
@@ -2079,26 +2115,11 @@ static int __init init_kprobes(void)
2079 raw_spin_lock_init(&(kretprobe_table_locks[i].lock)); 2115 raw_spin_lock_init(&(kretprobe_table_locks[i].lock));
2080 } 2116 }
2081 2117
2082 /* 2118 err = populate_kprobe_blacklist(__start_kprobe_blacklist,
2083 * Lookup and populate the kprobe_blacklist. 2119 __stop_kprobe_blacklist);
2084 * 2120 if (err) {
2085 * Unlike the kretprobe blacklist, we'll need to determine 2121 pr_err("kprobes: failed to populate blacklist: %d\n", err);
2086 * the range of addresses that belong to the said functions, 2122 pr_err("Please take care of using kprobes.\n");
2087 * since a kprobe need not necessarily be at the beginning
2088 * of a function.
2089 */
2090 for (kb = kprobe_blacklist; kb->name != NULL; kb++) {
2091 kprobe_lookup_name(kb->name, addr);
2092 if (!addr)
2093 continue;
2094
2095 kb->start_addr = (unsigned long)addr;
2096 symbol_name = kallsyms_lookup(kb->start_addr,
2097 &size, &offset, &modname, namebuf);
2098 if (!symbol_name)
2099 kb->range = 0;
2100 else
2101 kb->range = size;
2102 } 2123 }
2103 2124
2104 if (kretprobe_blacklist_size) { 2125 if (kretprobe_blacklist_size) {
@@ -2138,7 +2159,7 @@ static int __init init_kprobes(void)
2138} 2159}
2139 2160
2140#ifdef CONFIG_DEBUG_FS 2161#ifdef CONFIG_DEBUG_FS
2141static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, 2162static void report_probe(struct seq_file *pi, struct kprobe *p,
2142 const char *sym, int offset, char *modname, struct kprobe *pp) 2163 const char *sym, int offset, char *modname, struct kprobe *pp)
2143{ 2164{
2144 char *kprobe_type; 2165 char *kprobe_type;
@@ -2167,12 +2188,12 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
2167 (kprobe_ftrace(pp) ? "[FTRACE]" : "")); 2188 (kprobe_ftrace(pp) ? "[FTRACE]" : ""));
2168} 2189}
2169 2190
2170static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos) 2191static void *kprobe_seq_start(struct seq_file *f, loff_t *pos)
2171{ 2192{
2172 return (*pos < KPROBE_TABLE_SIZE) ? pos : NULL; 2193 return (*pos < KPROBE_TABLE_SIZE) ? pos : NULL;
2173} 2194}
2174 2195
2175static void __kprobes *kprobe_seq_next(struct seq_file *f, void *v, loff_t *pos) 2196static void *kprobe_seq_next(struct seq_file *f, void *v, loff_t *pos)
2176{ 2197{
2177 (*pos)++; 2198 (*pos)++;
2178 if (*pos >= KPROBE_TABLE_SIZE) 2199 if (*pos >= KPROBE_TABLE_SIZE)
@@ -2180,12 +2201,12 @@ static void __kprobes *kprobe_seq_next(struct seq_file *f, void *v, loff_t *pos)
2180 return pos; 2201 return pos;
2181} 2202}
2182 2203
2183static void __kprobes kprobe_seq_stop(struct seq_file *f, void *v) 2204static void kprobe_seq_stop(struct seq_file *f, void *v)
2184{ 2205{
2185 /* Nothing to do */ 2206 /* Nothing to do */
2186} 2207}
2187 2208
2188static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v) 2209static int show_kprobe_addr(struct seq_file *pi, void *v)
2189{ 2210{
2190 struct hlist_head *head; 2211 struct hlist_head *head;
2191 struct kprobe *p, *kp; 2212 struct kprobe *p, *kp;
@@ -2216,7 +2237,7 @@ static const struct seq_operations kprobes_seq_ops = {
2216 .show = show_kprobe_addr 2237 .show = show_kprobe_addr
2217}; 2238};
2218 2239
2219static int __kprobes kprobes_open(struct inode *inode, struct file *filp) 2240static int kprobes_open(struct inode *inode, struct file *filp)
2220{ 2241{
2221 return seq_open(filp, &kprobes_seq_ops); 2242 return seq_open(filp, &kprobes_seq_ops);
2222} 2243}
@@ -2228,7 +2249,47 @@ static const struct file_operations debugfs_kprobes_operations = {
2228 .release = seq_release, 2249 .release = seq_release,
2229}; 2250};
2230 2251
2231static void __kprobes arm_all_kprobes(void) 2252/* kprobes/blacklist -- shows which functions can not be probed */
2253static void *kprobe_blacklist_seq_start(struct seq_file *m, loff_t *pos)
2254{
2255 return seq_list_start(&kprobe_blacklist, *pos);
2256}
2257
2258static void *kprobe_blacklist_seq_next(struct seq_file *m, void *v, loff_t *pos)
2259{
2260 return seq_list_next(v, &kprobe_blacklist, pos);
2261}
2262
2263static int kprobe_blacklist_seq_show(struct seq_file *m, void *v)
2264{
2265 struct kprobe_blacklist_entry *ent =
2266 list_entry(v, struct kprobe_blacklist_entry, list);
2267
2268 seq_printf(m, "0x%p-0x%p\t%ps\n", (void *)ent->start_addr,
2269 (void *)ent->end_addr, (void *)ent->start_addr);
2270 return 0;
2271}
2272
2273static const struct seq_operations kprobe_blacklist_seq_ops = {
2274 .start = kprobe_blacklist_seq_start,
2275 .next = kprobe_blacklist_seq_next,
2276 .stop = kprobe_seq_stop, /* Reuse void function */
2277 .show = kprobe_blacklist_seq_show,
2278};
2279
2280static int kprobe_blacklist_open(struct inode *inode, struct file *filp)
2281{
2282 return seq_open(filp, &kprobe_blacklist_seq_ops);
2283}
2284
2285static const struct file_operations debugfs_kprobe_blacklist_ops = {
2286 .open = kprobe_blacklist_open,
2287 .read = seq_read,
2288 .llseek = seq_lseek,
2289 .release = seq_release,
2290};
2291
2292static void arm_all_kprobes(void)
2232{ 2293{
2233 struct hlist_head *head; 2294 struct hlist_head *head;
2234 struct kprobe *p; 2295 struct kprobe *p;
@@ -2256,7 +2317,7 @@ already_enabled:
2256 return; 2317 return;
2257} 2318}
2258 2319
2259static void __kprobes disarm_all_kprobes(void) 2320static void disarm_all_kprobes(void)
2260{ 2321{
2261 struct hlist_head *head; 2322 struct hlist_head *head;
2262 struct kprobe *p; 2323 struct kprobe *p;
@@ -2340,7 +2401,7 @@ static const struct file_operations fops_kp = {
2340 .llseek = default_llseek, 2401 .llseek = default_llseek,
2341}; 2402};
2342 2403
2343static int __kprobes debugfs_kprobe_init(void) 2404static int __init debugfs_kprobe_init(void)
2344{ 2405{
2345 struct dentry *dir, *file; 2406 struct dentry *dir, *file;
2346 unsigned int value = 1; 2407 unsigned int value = 1;
@@ -2351,19 +2412,24 @@ static int __kprobes debugfs_kprobe_init(void)
2351 2412
2352 file = debugfs_create_file("list", 0444, dir, NULL, 2413 file = debugfs_create_file("list", 0444, dir, NULL,
2353 &debugfs_kprobes_operations); 2414 &debugfs_kprobes_operations);
2354 if (!file) { 2415 if (!file)
2355 debugfs_remove(dir); 2416 goto error;
2356 return -ENOMEM;
2357 }
2358 2417
2359 file = debugfs_create_file("enabled", 0600, dir, 2418 file = debugfs_create_file("enabled", 0600, dir,
2360 &value, &fops_kp); 2419 &value, &fops_kp);
2361 if (!file) { 2420 if (!file)
2362 debugfs_remove(dir); 2421 goto error;
2363 return -ENOMEM; 2422
2364 } 2423 file = debugfs_create_file("blacklist", 0444, dir, NULL,
2424 &debugfs_kprobe_blacklist_ops);
2425 if (!file)
2426 goto error;
2365 2427
2366 return 0; 2428 return 0;
2429
2430error:
2431 debugfs_remove(dir);
2432 return -ENOMEM;
2367} 2433}
2368 2434
2369late_initcall(debugfs_kprobe_init); 2435late_initcall(debugfs_kprobe_init);
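
The new kprobes/blacklist debugfs file is a textbook seq_file over a linked list: seq_list_start()/seq_list_next() iterate kprobe_blacklist and each entry prints as "0x<start>-0x<end>\t<symbol>"; debugfs_kprobe_init() also gains a single error label now that it creates three files. For reference, a minimal, self-contained module using the same pattern on a list of its own (all names below are illustrative, nothing here is from this patch):

#include <linux/module.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/list.h>

struct demo_entry {			/* stand-in for kprobe_blacklist_entry */
	struct list_head list;
	unsigned long start, end;
};

static LIST_HEAD(demo_list);
static struct dentry *demo_dir;

static void *demo_seq_start(struct seq_file *m, loff_t *pos)
{
	return seq_list_start(&demo_list, *pos);
}

static void *demo_seq_next(struct seq_file *m, void *v, loff_t *pos)
{
	return seq_list_next(v, &demo_list, pos);
}

static void demo_seq_stop(struct seq_file *m, void *v)
{
	/* nothing to unlock in this toy example */
}

static int demo_seq_show(struct seq_file *m, void *v)
{
	struct demo_entry *e = list_entry(v, struct demo_entry, list);

	seq_printf(m, "0x%lx-0x%lx\n", e->start, e->end);
	return 0;
}

static const struct seq_operations demo_seq_ops = {
	.start = demo_seq_start,
	.next  = demo_seq_next,
	.stop  = demo_seq_stop,
	.show  = demo_seq_show,
};

static int demo_open(struct inode *inode, struct file *filp)
{
	return seq_open(filp, &demo_seq_ops);
}

static const struct file_operations demo_fops = {
	.open    = demo_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release,
};

static int __init demo_init(void)
{
	demo_dir = debugfs_create_dir("seqfile-demo", NULL);
	if (!demo_dir)
		return -ENOMEM;
	if (!debugfs_create_file("list", 0444, demo_dir, NULL, &demo_fops)) {
		debugfs_remove_recursive(demo_dir);
		return -ENOMEM;
	}
	return 0;
}

static void __exit demo_exit(void)
{
	debugfs_remove_recursive(demo_dir);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
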
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 2495a9b14ac8..6683ccef9fff 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -37,6 +37,7 @@ static ssize_t uevent_seqnum_show(struct kobject *kobj,
37} 37}
38KERNEL_ATTR_RO(uevent_seqnum); 38KERNEL_ATTR_RO(uevent_seqnum);
39 39
40#ifdef CONFIG_UEVENT_HELPER
40/* uevent helper program, used during early boot */ 41/* uevent helper program, used during early boot */
41static ssize_t uevent_helper_show(struct kobject *kobj, 42static ssize_t uevent_helper_show(struct kobject *kobj,
42 struct kobj_attribute *attr, char *buf) 43 struct kobj_attribute *attr, char *buf)
@@ -56,7 +57,7 @@ static ssize_t uevent_helper_store(struct kobject *kobj,
56 return count; 57 return count;
57} 58}
58KERNEL_ATTR_RW(uevent_helper); 59KERNEL_ATTR_RW(uevent_helper);
59 60#endif
60 61
61#ifdef CONFIG_PROFILING 62#ifdef CONFIG_PROFILING
62static ssize_t profiling_show(struct kobject *kobj, 63static ssize_t profiling_show(struct kobject *kobj,
@@ -189,7 +190,9 @@ EXPORT_SYMBOL_GPL(kernel_kobj);
189static struct attribute * kernel_attrs[] = { 190static struct attribute * kernel_attrs[] = {
190 &fscaps_attr.attr, 191 &fscaps_attr.attr,
191 &uevent_seqnum_attr.attr, 192 &uevent_seqnum_attr.attr,
193#ifdef CONFIG_UEVENT_HELPER
192 &uevent_helper_attr.attr, 194 &uevent_helper_attr.attr,
195#endif
193#ifdef CONFIG_PROFILING 196#ifdef CONFIG_PROFILING
194 &profiling_attr.attr, 197 &profiling_attr.attr,
195#endif 198#endif
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 9a130ec06f7a..c2390f41307b 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -262,7 +262,7 @@ static void create_kthread(struct kthread_create_info *create)
262 * kthread_stop() has been called). The return value should be zero 262 * kthread_stop() has been called). The return value should be zero
263 * or a negative error number; it will be passed to kthread_stop(). 263 * or a negative error number; it will be passed to kthread_stop().
264 * 264 *
265 * Returns a task_struct or ERR_PTR(-ENOMEM). 265 * Returns a task_struct or ERR_PTR(-ENOMEM) or ERR_PTR(-EINTR).
266 */ 266 */
267struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), 267struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
268 void *data, int node, 268 void *data, int node,
@@ -298,7 +298,7 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
298 * that thread. 298 * that thread.
299 */ 299 */
300 if (xchg(&create->done, NULL)) 300 if (xchg(&create->done, NULL))
301 return ERR_PTR(-ENOMEM); 301 return ERR_PTR(-EINTR);
302 /* 302 /*
303 * kthreadd (or new kernel thread) will call complete() 303 * kthreadd (or new kernel thread) will call complete()
304 * shortly. 304 * shortly.
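
kthread_create_on_node() now reports -EINTR instead of -ENOMEM when the requesting task receives a fatal signal before kthreadd finishes creating the thread, and the kernel-doc is updated to match. Callers that only test IS_ERR() need no change; a generic usage sketch (names are illustrative):

#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/delay.h>
#include <linux/numa.h>
#include <linux/err.h>

static int demo_threadfn(void *data)
{
	while (!kthread_should_stop())
		msleep(1000);
	return 0;
}

static int demo_start(void)
{
	struct task_struct *tsk;

	tsk = kthread_create_on_node(demo_threadfn, NULL, NUMA_NO_NODE, "demo");
	if (IS_ERR(tsk))	/* may now be ERR_PTR(-EINTR) as well as -ENOMEM */
		return PTR_ERR(tsk);
	wake_up_process(tsk);
	return 0;
}
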
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index a462b317f9a0..a02812743a7e 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -88,7 +88,8 @@ static void clear_global_latency_tracing(void)
88} 88}
89 89
90static void __sched 90static void __sched
91account_global_scheduler_latency(struct task_struct *tsk, struct latency_record *lat) 91account_global_scheduler_latency(struct task_struct *tsk,
92 struct latency_record *lat)
92{ 93{
93 int firstnonnull = MAXLR + 1; 94 int firstnonnull = MAXLR + 1;
94 int i; 95 int i;
@@ -255,7 +256,7 @@ static int lstats_show(struct seq_file *m, void *v)
255 break; 256 break;
256 seq_printf(m, " %ps", (void *)bt); 257 seq_printf(m, " %ps", (void *)bt);
257 } 258 }
258 seq_printf(m, "\n"); 259 seq_puts(m, "\n");
259 } 260 }
260 } 261 }
261 return 0; 262 return 0;
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index b8bdcd4785b7..8541bfdfd232 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -24,4 +24,5 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
24obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o 24obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
25obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o 25obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
26obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o 26obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o
27obj-$(CONFIG_QUEUE_RWLOCK) += qrwlock.o
27obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o 28obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index b0e9467922e1..d24e4339b46d 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -4188,7 +4188,7 @@ void debug_show_held_locks(struct task_struct *task)
4188} 4188}
4189EXPORT_SYMBOL_GPL(debug_show_held_locks); 4189EXPORT_SYMBOL_GPL(debug_show_held_locks);
4190 4190
4191asmlinkage void lockdep_sys_exit(void) 4191asmlinkage __visible void lockdep_sys_exit(void)
4192{ 4192{
4193 struct task_struct *curr = current; 4193 struct task_struct *curr = current;
4194 4194
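
lockdep_sys_exit() is entered from the syscall-exit assembly rather than from C, so it gains __visible next to asmlinkage: without it, gcc's whole-program/LTO analysis may decide the function has no callers and drop or localise it. Roughly how the annotation is spelled in the kernel's compiler headers (the exact gcc-version guard below is an assumption):

#if defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6))
#define __visible __attribute__((externally_visible))
#else
#define __visible			/* older compilers: no-op */
#endif
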
diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h
index 4f560cfedc8f..51c4b24b6328 100644
--- a/kernel/locking/lockdep_internals.h
+++ b/kernel/locking/lockdep_internals.h
@@ -54,9 +54,9 @@ enum {
54 * table (if it's not there yet), and we check it for lock order 54 * table (if it's not there yet), and we check it for lock order
55 * conflicts and deadlocks. 55 * conflicts and deadlocks.
56 */ 56 */
57#define MAX_LOCKDEP_ENTRIES 16384UL 57#define MAX_LOCKDEP_ENTRIES 32768UL
58 58
59#define MAX_LOCKDEP_CHAINS_BITS 15 59#define MAX_LOCKDEP_CHAINS_BITS 16
60#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) 60#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS)
61 61
62#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5) 62#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5)
@@ -65,7 +65,7 @@ enum {
65 * Stack-trace: tightly packed array of stack backtrace 65 * Stack-trace: tightly packed array of stack backtrace
66 * addresses. Protected by the hash_lock. 66 * addresses. Protected by the hash_lock.
67 */ 67 */
68#define MAX_STACK_TRACE_ENTRIES 262144UL 68#define MAX_STACK_TRACE_ENTRIES 524288UL
69 69
70extern struct list_head all_lock_classes; 70extern struct list_head all_lock_classes;
71extern struct lock_chain lock_chains[]; 71extern struct lock_chain lock_chains[];
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index f26b1a18e34e..0955b885d0dc 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -82,14 +82,14 @@ struct lock_writer_stress_stats {
82}; 82};
83static struct lock_writer_stress_stats *lwsa; 83static struct lock_writer_stress_stats *lwsa;
84 84
85#if defined(MODULE) || defined(CONFIG_LOCK_TORTURE_TEST_RUNNABLE) 85#if defined(MODULE)
86#define LOCKTORTURE_RUNNABLE_INIT 1 86#define LOCKTORTURE_RUNNABLE_INIT 1
87#else 87#else
88#define LOCKTORTURE_RUNNABLE_INIT 0 88#define LOCKTORTURE_RUNNABLE_INIT 0
89#endif 89#endif
90int locktorture_runnable = LOCKTORTURE_RUNNABLE_INIT; 90int locktorture_runnable = LOCKTORTURE_RUNNABLE_INIT;
91module_param(locktorture_runnable, int, 0444); 91module_param(locktorture_runnable, int, 0444);
92MODULE_PARM_DESC(locktorture_runnable, "Start locktorture at boot"); 92MODULE_PARM_DESC(locktorture_runnable, "Start locktorture at module init");
93 93
94/* Forward reference. */ 94/* Forward reference. */
95static void lock_torture_cleanup(void); 95static void lock_torture_cleanup(void);
@@ -216,10 +216,11 @@ static int lock_torture_writer(void *arg)
216 static DEFINE_TORTURE_RANDOM(rand); 216 static DEFINE_TORTURE_RANDOM(rand);
217 217
218 VERBOSE_TOROUT_STRING("lock_torture_writer task started"); 218 VERBOSE_TOROUT_STRING("lock_torture_writer task started");
219 set_user_nice(current, 19); 219 set_user_nice(current, MAX_NICE);
220 220
221 do { 221 do {
222 schedule_timeout_uninterruptible(1); 222 if ((torture_random(&rand) & 0xfffff) == 0)
223 schedule_timeout_uninterruptible(1);
223 cur_ops->writelock(); 224 cur_ops->writelock();
224 if (WARN_ON_ONCE(lock_is_write_held)) 225 if (WARN_ON_ONCE(lock_is_write_held))
225 lwsp->n_write_lock_fail++; 226 lwsp->n_write_lock_fail++;
@@ -354,7 +355,8 @@ static int __init lock_torture_init(void)
354 &lock_busted_ops, &spin_lock_ops, &spin_lock_irq_ops, 355 &lock_busted_ops, &spin_lock_ops, &spin_lock_irq_ops,
355 }; 356 };
356 357
357 torture_init_begin(torture_type, verbose, &locktorture_runnable); 358 if (!torture_init_begin(torture_type, verbose, &locktorture_runnable))
359 return -EBUSY;
358 360
359 /* Process args and tell the world that the torturer is on the job. */ 361 /* Process args and tell the world that the torturer is on the job. */
360 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { 362 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) {
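Note on the writer loop change above: the 1-jiffy sleep is no longer taken on every pass but only when the low 20 bits of the torture RNG are zero, i.e. on roughly one iteration in 2^20, so the lock under test sees far more back-to-back contention per unit time. A minimal user-space sketch of the same "rarely yield" shape (the lock/unlock callbacks, random() and sched_yield() are stand-ins, not the kernel API):

    #include <sched.h>
    #include <stdlib.h>

    /* Hammer the lock almost continuously, yielding only about once per
     * 1,048,576 iterations -- the shape of the new lock_torture_writer() loop. */
    static void writer_loop(void (*lock)(void), void (*unlock)(void))
    {
        for (;;) {
            if ((random() & 0xfffff) == 0)   /* ~1 pass in 2^20 */
                sched_yield();               /* stand-in for the 1-jiffy sleep */
            lock();
            /* critical section under test */
            unlock();
        }
    }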
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
new file mode 100644
index 000000000000..fb5b8ac411a5
--- /dev/null
+++ b/kernel/locking/qrwlock.c
@@ -0,0 +1,133 @@
1/*
2 * Queue read/write lock
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * (C) Copyright 2013-2014 Hewlett-Packard Development Company, L.P.
15 *
16 * Authors: Waiman Long <waiman.long@hp.com>
17 */
18#include <linux/smp.h>
19#include <linux/bug.h>
20#include <linux/cpumask.h>
21#include <linux/percpu.h>
22#include <linux/hardirq.h>
23#include <linux/mutex.h>
24#include <asm/qrwlock.h>
25
26/**
27 * rspin_until_writer_unlock - inc reader count & spin until writer is gone
28 * @lock : Pointer to queue rwlock structure
29 * @writer: Current queue rwlock writer status byte
30 *
31 * In interrupt context or at the head of the queue, the reader will just
32 * increment the reader count & wait until the writer releases the lock.
33 */
34static __always_inline void
35rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts)
36{
37 while ((cnts & _QW_WMASK) == _QW_LOCKED) {
38 arch_mutex_cpu_relax();
39 cnts = smp_load_acquire((u32 *)&lock->cnts);
40 }
41}
42
43/**
44 * queue_read_lock_slowpath - acquire read lock of a queue rwlock
45 * @lock: Pointer to queue rwlock structure
46 */
47void queue_read_lock_slowpath(struct qrwlock *lock)
48{
49 u32 cnts;
50
51 /*
52 * Readers come here when they cannot get the lock without waiting
53 */
54 if (unlikely(in_interrupt())) {
55 /*
56 * Readers in interrupt context will spin until the lock is
57 * available without waiting in the queue.
58 */
59 cnts = smp_load_acquire((u32 *)&lock->cnts);
60 rspin_until_writer_unlock(lock, cnts);
61 return;
62 }
63 atomic_sub(_QR_BIAS, &lock->cnts);
64
65 /*
66 * Put the reader into the wait queue
67 */
68 arch_spin_lock(&lock->lock);
69
70 /*
71 * At the head of the wait queue now, wait until the writer state
72 * goes to 0 and then try to increment the reader count and get
73 * the lock. It is possible that an incoming writer may steal the
74 * lock in the interim, so it is necessary to check the writer byte
75 * to make sure that the write lock isn't taken.
76 */
77 while (atomic_read(&lock->cnts) & _QW_WMASK)
78 arch_mutex_cpu_relax();
79
80 cnts = atomic_add_return(_QR_BIAS, &lock->cnts) - _QR_BIAS;
81 rspin_until_writer_unlock(lock, cnts);
82
83 /*
84 * Signal the next one in queue to become queue head
85 */
86 arch_spin_unlock(&lock->lock);
87}
88EXPORT_SYMBOL(queue_read_lock_slowpath);
89
90/**
91 * queue_write_lock_slowpath - acquire write lock of a queue rwlock
92 * @lock : Pointer to queue rwlock structure
93 */
94void queue_write_lock_slowpath(struct qrwlock *lock)
95{
96 u32 cnts;
97
98 /* Put the writer into the wait queue */
99 arch_spin_lock(&lock->lock);
100
101 /* Try to acquire the lock directly if no reader is present */
102 if (!atomic_read(&lock->cnts) &&
103 (atomic_cmpxchg(&lock->cnts, 0, _QW_LOCKED) == 0))
104 goto unlock;
105
106 /*
107 * Set the waiting flag to notify readers that a writer is pending,
108 * or wait for a previous writer to go away.
109 */
110 for (;;) {
111 cnts = atomic_read(&lock->cnts);
112 if (!(cnts & _QW_WMASK) &&
113 (atomic_cmpxchg(&lock->cnts, cnts,
114 cnts | _QW_WAITING) == cnts))
115 break;
116
117 arch_mutex_cpu_relax();
118 }
119
120 /* When no more readers, set the locked flag */
121 for (;;) {
122 cnts = atomic_read(&lock->cnts);
123 if ((cnts == _QW_WAITING) &&
124 (atomic_cmpxchg(&lock->cnts, _QW_WAITING,
125 _QW_LOCKED) == _QW_WAITING))
126 break;
127
128 arch_mutex_cpu_relax();
129 }
130unlock:
131 arch_spin_unlock(&lock->lock);
132}
133EXPORT_SYMBOL(queue_write_lock_slowpath);
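The new qrwlock keeps the whole lock in one 32-bit word: the low byte holds the writer state and the bits above it count readers in units of _QR_BIAS, while the embedded arch spinlock only serializes the queue of waiters. The write slowpath above is two-phased: publish a waiting flag so the writer byte becomes non-zero, then upgrade it to the locked value once every reader has dropped its bias. Below is a minimal user-space model of that state machine using C11 atomics; the constant values mirror what the companion asm-generic header is expected to define and are illustrative only, and the queueing spinlock plus the fairness it provides are omitted.

    #include <stdatomic.h>
    #include <stdint.h>

    #define QW_WAITING  0x01u    /* a writer is waiting (assumed encoding)     */
    #define QW_LOCKED   0xffu    /* a writer holds the lock (assumed encoding) */
    #define QW_WMASK    0xffu    /* writer byte                                */
    #define QR_BIAS     0x100u   /* one reader, counted above the writer byte  */

    struct model_qrwlock {
        _Atomic uint32_t cnts;   /* readers * QR_BIAS + writer byte */
    };

    /* Mirror of the two phases in queue_write_lock_slowpath(): announce the
     * writer, then claim the lock once the reader count has drained to zero. */
    static void model_write_lock(struct model_qrwlock *l)
    {
        uint32_t c;

        for (;;) {                           /* phase 1: set the waiting flag */
            c = atomic_load(&l->cnts);
            if (!(c & QW_WMASK) &&
                atomic_compare_exchange_weak(&l->cnts, &c, c | QW_WAITING))
                break;
        }
        for (;;) {                           /* phase 2: WAITING -> LOCKED */
            c = QW_WAITING;
            if (atomic_compare_exchange_weak(&l->cnts, &c, QW_LOCKED))
                break;
        }
    }

    /* Reader in the style of the interrupt-context path above: take a
     * reference, then wait only while a writer actually holds the lock
     * (a merely waiting writer does not block it; queued fairness lives in
     * the spinlock, which this model leaves out). */
    static void model_read_lock(struct model_qrwlock *l)
    {
        uint32_t c = atomic_fetch_add(&l->cnts, QR_BIAS) + QR_BIAS;

        while ((c & QW_WMASK) == QW_LOCKED)
            c = atomic_load(&l->cnts);
    }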
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index aa4dff04b594..a620d4d08ca6 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -343,9 +343,16 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
343 * top_waiter can be NULL, when we are in the deboosting 343 * top_waiter can be NULL, when we are in the deboosting
344 * mode! 344 * mode!
345 */ 345 */
346 if (top_waiter && (!task_has_pi_waiters(task) || 346 if (top_waiter) {
347 top_waiter != task_top_pi_waiter(task))) 347 if (!task_has_pi_waiters(task))
348 goto out_unlock_pi; 348 goto out_unlock_pi;
349 /*
350 * If deadlock detection is off, we stop here if we
351 * are not the top pi waiter of the task.
352 */
353 if (!detect_deadlock && top_waiter != task_top_pi_waiter(task))
354 goto out_unlock_pi;
355 }
349 356
350 /* 357 /*
351 * When deadlock detection is off then we check, if further 358 * When deadlock detection is off then we check, if further
@@ -361,7 +368,12 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
361 goto retry; 368 goto retry;
362 } 369 }
363 370
364 /* Deadlock detection */ 371 /*
372 * Deadlock detection. If the lock is the same as the original
373 * lock which caused us to walk the lock chain or if the
374 * current lock is owned by the task which initiated the chain
375 * walk, we detected a deadlock.
376 */
365 if (lock == orig_lock || rt_mutex_owner(lock) == top_task) { 377 if (lock == orig_lock || rt_mutex_owner(lock) == top_task) {
366 debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock); 378 debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock);
367 raw_spin_unlock(&lock->wait_lock); 379 raw_spin_unlock(&lock->wait_lock);
@@ -527,6 +539,18 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
527 unsigned long flags; 539 unsigned long flags;
528 int chain_walk = 0, res; 540 int chain_walk = 0, res;
529 541
542 /*
543 * Early deadlock detection. We really don't want the task to
544 * enqueue on itself just to untangle the mess later. It's not
545 * only an optimization. We drop the locks, so another waiter
546 * can come in before the chain walk detects the deadlock. So
547 * the other will detect the deadlock and return -EDEADLOCK,
548 * which is wrong, as the other waiter is not in a deadlock
549 * situation.
550 */
551 if (detect_deadlock && owner == task)
552 return -EDEADLK;
553
530 raw_spin_lock_irqsave(&task->pi_lock, flags); 554 raw_spin_lock_irqsave(&task->pi_lock, flags);
531 __rt_mutex_adjust_prio(task); 555 __rt_mutex_adjust_prio(task);
532 waiter->task = task; 556 waiter->task = task;
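The early check added to task_blocks_on_rt_mutex() matters because the chain walk drops its locks as it goes: if task T were allowed to enqueue itself as a waiter on a lock T already owns, an unrelated task blocking on the same lock afterwards could be the one to discover the cycle and be handed -EDEADLK, even though it is not in a deadlock at all. Catching the self-block before any waiter state exists pins the error on the right task. A condensed, hypothetical sketch of that guard (the types and names are stand-ins, not the rtmutex API):

    #include <errno.h>
    #include <stdbool.h>

    struct task;                                    /* opaque stand-in */
    struct rt_mutex_model { struct task *owner; };

    /* Refuse to enqueue a waiter on a lock the caller already owns, before
     * any shared state is touched. */
    static int model_task_blocks(struct rt_mutex_model *lock,
                                 struct task *task, bool detect_deadlock)
    {
        if (detect_deadlock && lock->owner == task)
            return -EDEADLK;

        /* ... otherwise enqueue the waiter and walk the PI chain ... */
        return 0;
    }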
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 1d66e08e897d..dacc32142fcc 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -5,11 +5,66 @@
5 * 5 *
6 * Writer lock-stealing by Alex Shi <alex.shi@intel.com> 6 * Writer lock-stealing by Alex Shi <alex.shi@intel.com>
7 * and Michel Lespinasse <walken@google.com> 7 * and Michel Lespinasse <walken@google.com>
8 *
9 * Optimistic spinning by Tim Chen <tim.c.chen@intel.com>
10 * and Davidlohr Bueso <davidlohr@hp.com>. Based on mutexes.
8 */ 11 */
9#include <linux/rwsem.h> 12#include <linux/rwsem.h>
10#include <linux/sched.h> 13#include <linux/sched.h>
11#include <linux/init.h> 14#include <linux/init.h>
12#include <linux/export.h> 15#include <linux/export.h>
16#include <linux/sched/rt.h>
17
18#include "mcs_spinlock.h"
19
20/*
21 * Guide to the rw_semaphore's count field for common values.
22 * (32-bit case illustrated, similar for 64-bit)
23 *
24 * 0x0000000X (1) X readers active or attempting lock, no writer waiting
25 * X = #active_readers + #readers attempting to lock
26 * (X*ACTIVE_BIAS)
27 *
28 * 0x00000000 rwsem is unlocked, and no one is waiting for the lock or
29 * attempting to read lock or write lock.
30 *
31 * 0xffff000X (1) X readers active or attempting lock, with waiters for lock
32 * X = #active readers + # readers attempting lock
33 * (X*ACTIVE_BIAS + WAITING_BIAS)
34 * (2) 1 writer attempting lock, no waiters for lock
35 * X-1 = #active readers + #readers attempting lock
36 * ((X-1)*ACTIVE_BIAS + ACTIVE_WRITE_BIAS)
37 * (3) 1 writer active, no waiters for lock
38 * X-1 = #active readers + #readers attempting lock
39 * ((X-1)*ACTIVE_BIAS + ACTIVE_WRITE_BIAS)
40 *
41 * 0xffff0001 (1) 1 reader active or attempting lock, waiters for lock
42 * (WAITING_BIAS + ACTIVE_BIAS)
43 * (2) 1 writer active or attempting lock, no waiters for lock
44 * (ACTIVE_WRITE_BIAS)
45 *
46 * 0xffff0000 (1) There are writers or readers queued but none active
47 * or in the process of attempting lock.
48 * (WAITING_BIAS)
49 * Note: writer can attempt to steal lock for this count by adding
50 * ACTIVE_WRITE_BIAS in cmpxchg and checking the old count
51 *
52 * 0xfffe0001 (1) 1 writer active, or attempting lock. Waiters on queue.
53 * (ACTIVE_WRITE_BIAS + WAITING_BIAS)
54 *
55 * Note: Readers attempt to lock by adding ACTIVE_BIAS in down_read and checking
56 * the count becomes more than 0 for successful lock acquisition,
57 * i.e. the case where there are only readers or nobody has lock.
58 * (1st and 2nd case above).
59 *
60 * Writers attempt to lock by adding ACTIVE_WRITE_BIAS in down_write and
61 * checking the count becomes ACTIVE_WRITE_BIAS for successful lock
62 * acquisition (i.e. nobody else has lock or attempts lock). If
63 * unsuccessful, in rwsem_down_write_failed, we'll check to see if there
64 * are only waiters but none active (5th case above), and attempt to
65 * steal the lock.
66 *
67 */
13 68
14/* 69/*
15 * Initialize an rwsem: 70 * Initialize an rwsem:
@@ -27,6 +82,10 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name,
27 sem->count = RWSEM_UNLOCKED_VALUE; 82 sem->count = RWSEM_UNLOCKED_VALUE;
28 raw_spin_lock_init(&sem->wait_lock); 83 raw_spin_lock_init(&sem->wait_lock);
29 INIT_LIST_HEAD(&sem->wait_list); 84 INIT_LIST_HEAD(&sem->wait_list);
85#ifdef CONFIG_SMP
86 sem->owner = NULL;
87 sem->osq = NULL;
88#endif
30} 89}
31 90
32EXPORT_SYMBOL(__init_rwsem); 91EXPORT_SYMBOL(__init_rwsem);
@@ -141,7 +200,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type)
141} 200}
142 201
143/* 202/*
144 * wait for the read lock to be granted 203 * Wait for the read lock to be granted
145 */ 204 */
146__visible 205__visible
147struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) 206struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
@@ -188,64 +247,221 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
188 return sem; 247 return sem;
189} 248}
190 249
250static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
251{
252 if (!(count & RWSEM_ACTIVE_MASK)) {
253 /* try acquiring the write lock */
254 if (sem->count == RWSEM_WAITING_BIAS &&
255 cmpxchg(&sem->count, RWSEM_WAITING_BIAS,
256 RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) {
257 if (!list_is_singular(&sem->wait_list))
258 rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
259 return true;
260 }
261 }
262 return false;
263}
264
265#ifdef CONFIG_SMP
191/* 266/*
192 * wait until we successfully acquire the write lock 267 * Try to acquire write lock before the writer has been put on wait queue.
268 */
269static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
270{
271 long old, count = ACCESS_ONCE(sem->count);
272
273 while (true) {
274 if (!(count == 0 || count == RWSEM_WAITING_BIAS))
275 return false;
276
277 old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS);
278 if (old == count)
279 return true;
280
281 count = old;
282 }
283}
284
285static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
286{
287 struct task_struct *owner;
288 bool on_cpu = true;
289
290 if (need_resched())
291 return 0;
292
293 rcu_read_lock();
294 owner = ACCESS_ONCE(sem->owner);
295 if (owner)
296 on_cpu = owner->on_cpu;
297 rcu_read_unlock();
298
299 /*
300 * If sem->owner is not set, the rwsem owner may have
301 * just acquired it and not set the owner yet or the rwsem
302 * has been released.
303 */
304 return on_cpu;
305}
306
307static inline bool owner_running(struct rw_semaphore *sem,
308 struct task_struct *owner)
309{
310 if (sem->owner != owner)
311 return false;
312
313 /*
314 * Ensure we emit the owner->on_cpu, dereference _after_ checking
315 * sem->owner still matches owner, if that fails, owner might
316 * point to free()d memory, if it still matches, the rcu_read_lock()
317 * ensures the memory stays valid.
318 */
319 barrier();
320
321 return owner->on_cpu;
322}
323
324static noinline
325bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner)
326{
327 rcu_read_lock();
328 while (owner_running(sem, owner)) {
329 if (need_resched())
330 break;
331
332 arch_mutex_cpu_relax();
333 }
334 rcu_read_unlock();
335
336 /*
337 * We break out the loop above on need_resched() or when the
338 * owner changed, which is a sign for heavy contention. Return
339 * success only when sem->owner is NULL.
340 */
341 return sem->owner == NULL;
342}
343
344static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
345{
346 struct task_struct *owner;
347 bool taken = false;
348
349 preempt_disable();
350
351 /* sem->wait_lock should not be held when doing optimistic spinning */
352 if (!rwsem_can_spin_on_owner(sem))
353 goto done;
354
355 if (!osq_lock(&sem->osq))
356 goto done;
357
358 while (true) {
359 owner = ACCESS_ONCE(sem->owner);
360 if (owner && !rwsem_spin_on_owner(sem, owner))
361 break;
362
363 /* wait_lock will be acquired if write_lock is obtained */
364 if (rwsem_try_write_lock_unqueued(sem)) {
365 taken = true;
366 break;
367 }
368
369 /*
370 * When there's no owner, we might have preempted between the
371 * owner acquiring the lock and setting the owner field. If
372 * we're an RT task that will live-lock because we won't let
373 * the owner complete.
374 */
375 if (!owner && (need_resched() || rt_task(current)))
376 break;
377
378 /*
379 * The cpu_relax() call is a compiler barrier which forces
380 * everything in this loop to be re-loaded. We don't need
381 * memory barriers as we'll eventually observe the right
382 * values at the cost of a few extra spins.
383 */
384 arch_mutex_cpu_relax();
385 }
386 osq_unlock(&sem->osq);
387done:
388 preempt_enable();
389 return taken;
390}
391
392#else
393static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
394{
395 return false;
396}
397#endif
398
399/*
400 * Wait until we successfully acquire the write lock
193 */ 401 */
194__visible 402__visible
195struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) 403struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
196{ 404{
197 long count, adjustment = -RWSEM_ACTIVE_WRITE_BIAS; 405 long count;
406 bool waiting = true; /* any queued threads before us */
198 struct rwsem_waiter waiter; 407 struct rwsem_waiter waiter;
199 struct task_struct *tsk = current;
200 408
201 /* set up my own style of waitqueue */ 409 /* undo write bias from down_write operation, stop active locking */
202 waiter.task = tsk; 410 count = rwsem_atomic_update(-RWSEM_ACTIVE_WRITE_BIAS, sem);
411
412 /* do optimistic spinning and steal lock if possible */
413 if (rwsem_optimistic_spin(sem))
414 return sem;
415
416 /*
417 * Optimistic spinning failed, proceed to the slowpath
418 * and block until we can acquire the sem.
419 */
420 waiter.task = current;
203 waiter.type = RWSEM_WAITING_FOR_WRITE; 421 waiter.type = RWSEM_WAITING_FOR_WRITE;
204 422
205 raw_spin_lock_irq(&sem->wait_lock); 423 raw_spin_lock_irq(&sem->wait_lock);
424
425 /* account for this before adding a new element to the list */
206 if (list_empty(&sem->wait_list)) 426 if (list_empty(&sem->wait_list))
207 adjustment += RWSEM_WAITING_BIAS; 427 waiting = false;
428
208 list_add_tail(&waiter.list, &sem->wait_list); 429 list_add_tail(&waiter.list, &sem->wait_list);
209 430
210 /* we're now waiting on the lock, but no longer actively locking */ 431 /* we're now waiting on the lock, but no longer actively locking */
211 count = rwsem_atomic_update(adjustment, sem); 432 if (waiting) {
433 count = ACCESS_ONCE(sem->count);
434
435 /*
436 * If there were already threads queued before us and there are
437 * no active writers, the lock must be read owned; so we try to
438 * wake any read locks that were queued ahead of us.
439 */
440 if (count > RWSEM_WAITING_BIAS)
441 sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS);
212 442
213 /* If there were already threads queued before us and there are no 443 } else
214 * active writers, the lock must be read owned; so we try to wake 444 count = rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
215 * any read locks that were queued ahead of us. */
216 if (count > RWSEM_WAITING_BIAS &&
217 adjustment == -RWSEM_ACTIVE_WRITE_BIAS)
218 sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS);
219 445
220 /* wait until we successfully acquire the lock */ 446 /* wait until we successfully acquire the lock */
221 set_task_state(tsk, TASK_UNINTERRUPTIBLE); 447 set_current_state(TASK_UNINTERRUPTIBLE);
222 while (true) { 448 while (true) {
223 if (!(count & RWSEM_ACTIVE_MASK)) { 449 if (rwsem_try_write_lock(count, sem))
224 /* Try acquiring the write lock. */ 450 break;
225 count = RWSEM_ACTIVE_WRITE_BIAS;
226 if (!list_is_singular(&sem->wait_list))
227 count += RWSEM_WAITING_BIAS;
228
229 if (sem->count == RWSEM_WAITING_BIAS &&
230 cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) ==
231 RWSEM_WAITING_BIAS)
232 break;
233 }
234
235 raw_spin_unlock_irq(&sem->wait_lock); 451 raw_spin_unlock_irq(&sem->wait_lock);
236 452
237 /* Block until there are no active lockers. */ 453 /* Block until there are no active lockers. */
238 do { 454 do {
239 schedule(); 455 schedule();
240 set_task_state(tsk, TASK_UNINTERRUPTIBLE); 456 set_current_state(TASK_UNINTERRUPTIBLE);
241 } while ((count = sem->count) & RWSEM_ACTIVE_MASK); 457 } while ((count = sem->count) & RWSEM_ACTIVE_MASK);
242 458
243 raw_spin_lock_irq(&sem->wait_lock); 459 raw_spin_lock_irq(&sem->wait_lock);
244 } 460 }
461 __set_current_state(TASK_RUNNING);
245 462
246 list_del(&waiter.list); 463 list_del(&waiter.list);
247 raw_spin_unlock_irq(&sem->wait_lock); 464 raw_spin_unlock_irq(&sem->wait_lock);
248 tsk->state = TASK_RUNNING;
249 465
250 return sem; 466 return sem;
251} 467}
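The new header comment describes the whole rwsem state as one signed word built from three biases. With the usual 32-bit definitions (assumed here from the generic rwsem header: RWSEM_ACTIVE_BIAS = 1, RWSEM_WAITING_BIAS = -0x00010000, RWSEM_ACTIVE_WRITE_BIAS = WAITING + ACTIVE), the hex values in that table fall out of plain addition, as this small stand-alone program shows on a system with 32-bit int:

    #include <stdio.h>

    #define ACTIVE_BIAS        1
    #define WAITING_BIAS       (-0x00010000)
    #define ACTIVE_WRITE_BIAS  (WAITING_BIAS + ACTIVE_BIAS)

    int main(void)
    {
        int c;

        c = 3 * ACTIVE_BIAS;                   /* three readers, empty queue   */
        printf("3 readers           : 0x%08x\n", (unsigned int)c);
        c = 2 * ACTIVE_BIAS + WAITING_BIAS;    /* two readers, non-empty queue */
        printf("2 readers + waiters : 0x%08x\n", (unsigned int)c);
        c = ACTIVE_WRITE_BIAS;                 /* one writer, empty queue      */
        printf("writer              : 0x%08x\n", (unsigned int)c);
        c = ACTIVE_WRITE_BIAS + WAITING_BIAS;  /* one writer, non-empty queue  */
        printf("writer + waiters    : 0x%08x\n", (unsigned int)c);
        c = WAITING_BIAS;                      /* waiters queued, none active  */
        printf("waiters only        : 0x%08x\n", (unsigned int)c);
        return 0;
    }

This prints 0x00000003, 0xffff0002, 0xffff0001, 0xfffe0001 and 0xffff0000, matching the cases in the comment. The other half of the patch is the writer-side optimistic spin: rwsem_down_write_failed() now undoes its bias and calls rwsem_optimistic_spin(), which keeps trying to steal the lock while sem->owner (maintained by the rwsem.c wrappers below) is observed running on a CPU, in the same spirit as the mutex spinner.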
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index cfff1435bdfb..42f806de49d4 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -12,6 +12,27 @@
12 12
13#include <linux/atomic.h> 13#include <linux/atomic.h>
14 14
15#if defined(CONFIG_SMP) && defined(CONFIG_RWSEM_XCHGADD_ALGORITHM)
16static inline void rwsem_set_owner(struct rw_semaphore *sem)
17{
18 sem->owner = current;
19}
20
21static inline void rwsem_clear_owner(struct rw_semaphore *sem)
22{
23 sem->owner = NULL;
24}
25
26#else
27static inline void rwsem_set_owner(struct rw_semaphore *sem)
28{
29}
30
31static inline void rwsem_clear_owner(struct rw_semaphore *sem)
32{
33}
34#endif
35
15/* 36/*
16 * lock for reading 37 * lock for reading
17 */ 38 */
@@ -48,6 +69,7 @@ void __sched down_write(struct rw_semaphore *sem)
48 rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); 69 rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
49 70
50 LOCK_CONTENDED(sem, __down_write_trylock, __down_write); 71 LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
72 rwsem_set_owner(sem);
51} 73}
52 74
53EXPORT_SYMBOL(down_write); 75EXPORT_SYMBOL(down_write);
@@ -59,8 +81,11 @@ int down_write_trylock(struct rw_semaphore *sem)
59{ 81{
60 int ret = __down_write_trylock(sem); 82 int ret = __down_write_trylock(sem);
61 83
62 if (ret == 1) 84 if (ret == 1) {
63 rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_); 85 rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_);
86 rwsem_set_owner(sem);
87 }
88
64 return ret; 89 return ret;
65} 90}
66 91
@@ -85,6 +110,7 @@ void up_write(struct rw_semaphore *sem)
85{ 110{
86 rwsem_release(&sem->dep_map, 1, _RET_IP_); 111 rwsem_release(&sem->dep_map, 1, _RET_IP_);
87 112
113 rwsem_clear_owner(sem);
88 __up_write(sem); 114 __up_write(sem);
89} 115}
90 116
@@ -99,6 +125,7 @@ void downgrade_write(struct rw_semaphore *sem)
99 * lockdep: a downgraded write will live on as a write 125 * lockdep: a downgraded write will live on as a write
100 * dependency. 126 * dependency.
101 */ 127 */
128 rwsem_clear_owner(sem);
102 __downgrade_write(sem); 129 __downgrade_write(sem);
103} 130}
104 131
@@ -122,6 +149,7 @@ void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest)
122 rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_); 149 rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_);
123 150
124 LOCK_CONTENDED(sem, __down_write_trylock, __down_write); 151 LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
152 rwsem_set_owner(sem);
125} 153}
126 154
127EXPORT_SYMBOL(_down_write_nest_lock); 155EXPORT_SYMBOL(_down_write_nest_lock);
@@ -141,6 +169,7 @@ void down_write_nested(struct rw_semaphore *sem, int subclass)
141 rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); 169 rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
142 170
143 LOCK_CONTENDED(sem, __down_write_trylock, __down_write); 171 LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
172 rwsem_set_owner(sem);
144} 173}
145 174
146EXPORT_SYMBOL(down_write_nested); 175EXPORT_SYMBOL(down_write_nested);
diff --git a/kernel/module.c b/kernel/module.c
index 079c4615607d..81e727cf6df9 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3020,21 +3020,6 @@ static int do_init_module(struct module *mod)
3020 */ 3020 */
3021 current->flags &= ~PF_USED_ASYNC; 3021 current->flags &= ~PF_USED_ASYNC;
3022 3022
3023 blocking_notifier_call_chain(&module_notify_list,
3024 MODULE_STATE_COMING, mod);
3025
3026 /* Set RO and NX regions for core */
3027 set_section_ro_nx(mod->module_core,
3028 mod->core_text_size,
3029 mod->core_ro_size,
3030 mod->core_size);
3031
3032 /* Set RO and NX regions for init */
3033 set_section_ro_nx(mod->module_init,
3034 mod->init_text_size,
3035 mod->init_ro_size,
3036 mod->init_size);
3037
3038 do_mod_ctors(mod); 3023 do_mod_ctors(mod);
3039 /* Start the module */ 3024 /* Start the module */
3040 if (mod->init != NULL) 3025 if (mod->init != NULL)
@@ -3165,9 +3150,26 @@ static int complete_formation(struct module *mod, struct load_info *info)
3165 /* This relies on module_mutex for list integrity. */ 3150 /* This relies on module_mutex for list integrity. */
3166 module_bug_finalize(info->hdr, info->sechdrs, mod); 3151 module_bug_finalize(info->hdr, info->sechdrs, mod);
3167 3152
3153 /* Set RO and NX regions for core */
3154 set_section_ro_nx(mod->module_core,
3155 mod->core_text_size,
3156 mod->core_ro_size,
3157 mod->core_size);
3158
3159 /* Set RO and NX regions for init */
3160 set_section_ro_nx(mod->module_init,
3161 mod->init_text_size,
3162 mod->init_ro_size,
3163 mod->init_size);
3164
3168 /* Mark state as coming so strong_try_module_get() ignores us, 3165 /* Mark state as coming so strong_try_module_get() ignores us,
3169 * but kallsyms etc. can see us. */ 3166 * but kallsyms etc. can see us. */
3170 mod->state = MODULE_STATE_COMING; 3167 mod->state = MODULE_STATE_COMING;
3168 mutex_unlock(&module_mutex);
3169
3170 blocking_notifier_call_chain(&module_notify_list,
3171 MODULE_STATE_COMING, mod);
3172 return 0;
3171 3173
3172out: 3174out:
3173 mutex_unlock(&module_mutex); 3175 mutex_unlock(&module_mutex);
@@ -3190,6 +3192,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
3190{ 3192{
3191 struct module *mod; 3193 struct module *mod;
3192 long err; 3194 long err;
3195 char *after_dashes;
3193 3196
3194 err = module_sig_check(info); 3197 err = module_sig_check(info);
3195 if (err) 3198 if (err)
@@ -3277,10 +3280,15 @@ static int load_module(struct load_info *info, const char __user *uargs,
3277 goto ddebug_cleanup; 3280 goto ddebug_cleanup;
3278 3281
3279 /* Module is ready to execute: parsing args may do that. */ 3282 /* Module is ready to execute: parsing args may do that. */
3280 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, 3283 after_dashes = parse_args(mod->name, mod->args, mod->kp, mod->num_kp,
3281 -32768, 32767, unknown_module_param_cb); 3284 -32768, 32767, unknown_module_param_cb);
3282 if (err < 0) 3285 if (IS_ERR(after_dashes)) {
3286 err = PTR_ERR(after_dashes);
3283 goto bug_cleanup; 3287 goto bug_cleanup;
3288 } else if (after_dashes) {
3289 pr_warn("%s: parameters '%s' after `--' ignored\n",
3290 mod->name, after_dashes);
3291 }
3284 3292
3285 /* Link in to syfs. */ 3293 /* Link in to syfs. */
3286 err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp); 3294 err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp);
diff --git a/kernel/notifier.c b/kernel/notifier.c
index db4c8b08a50c..4803da6eab62 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -71,9 +71,9 @@ static int notifier_chain_unregister(struct notifier_block **nl,
71 * @returns: notifier_call_chain returns the value returned by the 71 * @returns: notifier_call_chain returns the value returned by the
72 * last notifier function called. 72 * last notifier function called.
73 */ 73 */
74static int __kprobes notifier_call_chain(struct notifier_block **nl, 74static int notifier_call_chain(struct notifier_block **nl,
75 unsigned long val, void *v, 75 unsigned long val, void *v,
76 int nr_to_call, int *nr_calls) 76 int nr_to_call, int *nr_calls)
77{ 77{
78 int ret = NOTIFY_DONE; 78 int ret = NOTIFY_DONE;
79 struct notifier_block *nb, *next_nb; 79 struct notifier_block *nb, *next_nb;
@@ -102,6 +102,7 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl,
102 } 102 }
103 return ret; 103 return ret;
104} 104}
105NOKPROBE_SYMBOL(notifier_call_chain);
105 106
106/* 107/*
107 * Atomic notifier chain routines. Registration and unregistration 108 * Atomic notifier chain routines. Registration and unregistration
@@ -172,9 +173,9 @@ EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);
172 * Otherwise the return value is the return value 173 * Otherwise the return value is the return value
173 * of the last notifier function called. 174 * of the last notifier function called.
174 */ 175 */
175int __kprobes __atomic_notifier_call_chain(struct atomic_notifier_head *nh, 176int __atomic_notifier_call_chain(struct atomic_notifier_head *nh,
176 unsigned long val, void *v, 177 unsigned long val, void *v,
177 int nr_to_call, int *nr_calls) 178 int nr_to_call, int *nr_calls)
178{ 179{
179 int ret; 180 int ret;
180 181
@@ -184,13 +185,15 @@ int __kprobes __atomic_notifier_call_chain(struct atomic_notifier_head *nh,
184 return ret; 185 return ret;
185} 186}
186EXPORT_SYMBOL_GPL(__atomic_notifier_call_chain); 187EXPORT_SYMBOL_GPL(__atomic_notifier_call_chain);
188NOKPROBE_SYMBOL(__atomic_notifier_call_chain);
187 189
188int __kprobes atomic_notifier_call_chain(struct atomic_notifier_head *nh, 190int atomic_notifier_call_chain(struct atomic_notifier_head *nh,
189 unsigned long val, void *v) 191 unsigned long val, void *v)
190{ 192{
191 return __atomic_notifier_call_chain(nh, val, v, -1, NULL); 193 return __atomic_notifier_call_chain(nh, val, v, -1, NULL);
192} 194}
193EXPORT_SYMBOL_GPL(atomic_notifier_call_chain); 195EXPORT_SYMBOL_GPL(atomic_notifier_call_chain);
196NOKPROBE_SYMBOL(atomic_notifier_call_chain);
194 197
195/* 198/*
196 * Blocking notifier chain routines. All access to the chain is 199 * Blocking notifier chain routines. All access to the chain is
@@ -527,7 +530,7 @@ EXPORT_SYMBOL_GPL(srcu_init_notifier_head);
527 530
528static ATOMIC_NOTIFIER_HEAD(die_chain); 531static ATOMIC_NOTIFIER_HEAD(die_chain);
529 532
530int notrace __kprobes notify_die(enum die_val val, const char *str, 533int notrace notify_die(enum die_val val, const char *str,
531 struct pt_regs *regs, long err, int trap, int sig) 534 struct pt_regs *regs, long err, int trap, int sig)
532{ 535{
533 struct die_args args = { 536 struct die_args args = {
@@ -540,6 +543,7 @@ int notrace __kprobes notify_die(enum die_val val, const char *str,
540 }; 543 };
541 return atomic_notifier_call_chain(&die_chain, val, &args); 544 return atomic_notifier_call_chain(&die_chain, val, &args);
542} 545}
546NOKPROBE_SYMBOL(notify_die);
543 547
544int register_die_notifier(struct notifier_block *nb) 548int register_die_notifier(struct notifier_block *nb)
545{ 549{
diff --git a/kernel/panic.c b/kernel/panic.c
index d02fa9fef46a..62e16cef9cc2 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -32,6 +32,7 @@ static unsigned long tainted_mask;
32static int pause_on_oops; 32static int pause_on_oops;
33static int pause_on_oops_flag; 33static int pause_on_oops_flag;
34static DEFINE_SPINLOCK(pause_on_oops_lock); 34static DEFINE_SPINLOCK(pause_on_oops_lock);
35static bool crash_kexec_post_notifiers;
35 36
36int panic_timeout = CONFIG_PANIC_TIMEOUT; 37int panic_timeout = CONFIG_PANIC_TIMEOUT;
37EXPORT_SYMBOL_GPL(panic_timeout); 38EXPORT_SYMBOL_GPL(panic_timeout);
@@ -112,9 +113,11 @@ void panic(const char *fmt, ...)
112 /* 113 /*
113 * If we have crashed and we have a crash kernel loaded let it handle 114 * If we have crashed and we have a crash kernel loaded let it handle
114 * everything else. 115 * everything else.
115 * Do we want to call this before we try to display a message? 116 * If we want to run this after calling panic_notifiers, pass
117 * the "crash_kexec_post_notifiers" option to the kernel.
116 */ 118 */
117 crash_kexec(NULL); 119 if (!crash_kexec_post_notifiers)
120 crash_kexec(NULL);
118 121
119 /* 122 /*
120 * Note smp_send_stop is the usual smp shutdown function, which 123 * Note smp_send_stop is the usual smp shutdown function, which
@@ -131,6 +134,15 @@ void panic(const char *fmt, ...)
131 134
132 kmsg_dump(KMSG_DUMP_PANIC); 135 kmsg_dump(KMSG_DUMP_PANIC);
133 136
137 /*
138 * If you doubt kdump always works fine in any situation,
139 * "crash_kexec_post_notifiers" offers you a chance to run
140 * panic_notifiers and dumping kmsg before kdump.
141 * Note: since some panic_notifiers can make crashed kernel
142 * more unstable, it can increase risks of the kdump failure too.
143 */
144 crash_kexec(NULL);
145
134 bust_spinlocks(0); 146 bust_spinlocks(0);
135 147
136 if (!panic_blink) 148 if (!panic_blink)
@@ -472,6 +484,13 @@ EXPORT_SYMBOL(__stack_chk_fail);
472core_param(panic, panic_timeout, int, 0644); 484core_param(panic, panic_timeout, int, 0644);
473core_param(pause_on_oops, pause_on_oops, int, 0644); 485core_param(pause_on_oops, pause_on_oops, int, 0644);
474 486
487static int __init setup_crash_kexec_post_notifiers(char *s)
488{
489 crash_kexec_post_notifiers = true;
490 return 0;
491}
492early_param("crash_kexec_post_notifiers", setup_crash_kexec_post_notifiers);
493
475static int __init oops_setup(char *s) 494static int __init oops_setup(char *s)
476{ 495{
477 if (!s) 496 if (!s)
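The new crash_kexec_post_notifiers boot parameter only changes ordering: by default panic() still jumps straight into the crash kernel before doing anything else, while with the parameter set it first stops the other CPUs, runs the panic notifiers and kmsg dumpers, and only then reaches crash_kexec(). A condensed view of the resulting flow, sketched from the hunks above rather than the full function:

    if (!crash_kexec_post_notifiers)
        crash_kexec(NULL);      /* default: hand over to kdump immediately */

    smp_send_stop();
    atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
    kmsg_dump(KMSG_DUMP_PANIC);

    crash_kexec(NULL);          /* effective only in the opt-in case, or when
                                   the first call found no crash kernel */

Since it is an early_param, the deferred ordering is selected by adding crash_kexec_post_notifiers to the kernel command line; the comment above warns that running the notifiers first can itself make a kdump less likely to succeed.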
diff --git a/kernel/params.c b/kernel/params.c
index b00142e7f3ba..1e52ca233fd9 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -177,13 +177,13 @@ static char *next_arg(char *args, char **param, char **val)
177} 177}
178 178
179/* Args looks like "foo=bar,bar2 baz=fuz wiz". */ 179/* Args looks like "foo=bar,bar2 baz=fuz wiz". */
180int parse_args(const char *doing, 180char *parse_args(const char *doing,
181 char *args, 181 char *args,
182 const struct kernel_param *params, 182 const struct kernel_param *params,
183 unsigned num, 183 unsigned num,
184 s16 min_level, 184 s16 min_level,
185 s16 max_level, 185 s16 max_level,
186 int (*unknown)(char *param, char *val, const char *doing)) 186 int (*unknown)(char *param, char *val, const char *doing))
187{ 187{
188 char *param, *val; 188 char *param, *val;
189 189
@@ -198,6 +198,9 @@ int parse_args(const char *doing,
198 int irq_was_disabled; 198 int irq_was_disabled;
199 199
200 args = next_arg(args, &param, &val); 200 args = next_arg(args, &param, &val);
201 /* Stop at -- */
202 if (!val && strcmp(param, "--") == 0)
203 return args;
201 irq_was_disabled = irqs_disabled(); 204 irq_was_disabled = irqs_disabled();
202 ret = parse_one(param, val, doing, params, num, 205 ret = parse_one(param, val, doing, params, num,
203 min_level, max_level, unknown); 206 min_level, max_level, unknown);
@@ -208,22 +211,22 @@ int parse_args(const char *doing,
208 switch (ret) { 211 switch (ret) {
209 case -ENOENT: 212 case -ENOENT:
210 pr_err("%s: Unknown parameter `%s'\n", doing, param); 213 pr_err("%s: Unknown parameter `%s'\n", doing, param);
211 return ret; 214 return ERR_PTR(ret);
212 case -ENOSPC: 215 case -ENOSPC:
213 pr_err("%s: `%s' too large for parameter `%s'\n", 216 pr_err("%s: `%s' too large for parameter `%s'\n",
214 doing, val ?: "", param); 217 doing, val ?: "", param);
215 return ret; 218 return ERR_PTR(ret);
216 case 0: 219 case 0:
217 break; 220 break;
218 default: 221 default:
219 pr_err("%s: `%s' invalid for parameter `%s'\n", 222 pr_err("%s: `%s' invalid for parameter `%s'\n",
220 doing, val ?: "", param); 223 doing, val ?: "", param);
221 return ret; 224 return ERR_PTR(ret);
222 } 225 }
223 } 226 }
224 227
225 /* All parsed OK. */ 228 /* All parsed OK. */
226 return 0; 229 return NULL;
227} 230}
228 231
229/* Lazy bastard, eh? */ 232/* Lazy bastard, eh? */
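parse_args() changes its return type from int to char *: NULL means every argument was consumed, an ERR_PTR(-E...) reports a parse failure, and a non-NULL, non-error pointer points just past a bare "--", leaving the remaining text for the caller. The module loader hunk earlier in this patch is the caller-side pattern; condensed:

    char *after_dashes;

    after_dashes = parse_args(mod->name, mod->args, mod->kp, mod->num_kp,
                              -32768, 32767, unknown_module_param_cb);
    if (IS_ERR(after_dashes)) {
        err = PTR_ERR(after_dashes);        /* a parameter failed to parse */
        goto bug_cleanup;
    } else if (after_dashes) {
        pr_warn("%s: parameters '%s' after `--' ignored\n",
                mod->name, after_dashes);   /* text following a bare "--"  */
    }
    /* after_dashes == NULL: everything was parsed successfully */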
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 2fac9cc79b3d..9a83d780facd 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -257,8 +257,7 @@ config ARCH_HAS_OPP
257 bool 257 bool
258 258
259config PM_OPP 259config PM_OPP
260 bool "Operating Performance Point (OPP) Layer library" 260 bool
261 depends on ARCH_HAS_OPP
262 ---help--- 261 ---help---
263 SOCs have a standard set of tuples consisting of frequency and 262 SOCs have a standard set of tuples consisting of frequency and
264 voltage pairs that the device will support per voltage domain. This 263 voltage pairs that the device will support per voltage domain. This
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index f4f2073711d3..49e0a20fd010 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -28,6 +28,7 @@
28#include <linux/syscore_ops.h> 28#include <linux/syscore_ops.h>
29#include <linux/ctype.h> 29#include <linux/ctype.h>
30#include <linux/genhd.h> 30#include <linux/genhd.h>
31#include <trace/events/power.h>
31 32
32#include "power.h" 33#include "power.h"
33 34
@@ -35,7 +36,7 @@
35static int nocompress; 36static int nocompress;
36static int noresume; 37static int noresume;
37static int resume_wait; 38static int resume_wait;
38static int resume_delay; 39static unsigned int resume_delay;
39static char resume_file[256] = CONFIG_PM_STD_PARTITION; 40static char resume_file[256] = CONFIG_PM_STD_PARTITION;
40dev_t swsusp_resume_device; 41dev_t swsusp_resume_device;
41sector_t swsusp_resume_block; 42sector_t swsusp_resume_block;
@@ -228,19 +229,23 @@ static void platform_recover(int platform_mode)
228void swsusp_show_speed(struct timeval *start, struct timeval *stop, 229void swsusp_show_speed(struct timeval *start, struct timeval *stop,
229 unsigned nr_pages, char *msg) 230 unsigned nr_pages, char *msg)
230{ 231{
231 s64 elapsed_centisecs64; 232 u64 elapsed_centisecs64;
232 int centisecs; 233 unsigned int centisecs;
233 int k; 234 unsigned int k;
234 int kps; 235 unsigned int kps;
235 236
236 elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start); 237 elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start);
238 /*
239 * If "(s64)elapsed_centisecs64 < 0", it will print long elapsed time,
240 * it is obvious enough for what went wrong.
241 */
237 do_div(elapsed_centisecs64, NSEC_PER_SEC / 100); 242 do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
238 centisecs = elapsed_centisecs64; 243 centisecs = elapsed_centisecs64;
239 if (centisecs == 0) 244 if (centisecs == 0)
240 centisecs = 1; /* avoid div-by-zero */ 245 centisecs = 1; /* avoid div-by-zero */
241 k = nr_pages * (PAGE_SIZE / 1024); 246 k = nr_pages * (PAGE_SIZE / 1024);
242 kps = (k * 100) / centisecs; 247 kps = (k * 100) / centisecs;
243 printk(KERN_INFO "PM: %s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", 248 printk(KERN_INFO "PM: %s %u kbytes in %u.%02u seconds (%u.%02u MB/s)\n",
244 msg, k, 249 msg, k,
245 centisecs / 100, centisecs % 100, 250 centisecs / 100, centisecs % 100,
246 kps / 1000, (kps % 1000) / 10); 251 kps / 1000, (kps % 1000) / 10);
@@ -288,7 +293,9 @@ static int create_image(int platform_mode)
288 293
289 in_suspend = 1; 294 in_suspend = 1;
290 save_processor_state(); 295 save_processor_state();
296 trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, true);
291 error = swsusp_arch_suspend(); 297 error = swsusp_arch_suspend();
298 trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, false);
292 if (error) 299 if (error)
293 printk(KERN_ERR "PM: Error %d creating hibernation image\n", 300 printk(KERN_ERR "PM: Error %d creating hibernation image\n",
294 error); 301 error);
@@ -595,7 +602,8 @@ static void power_down(void)
595 case HIBERNATION_PLATFORM: 602 case HIBERNATION_PLATFORM:
596 hibernation_platform_enter(); 603 hibernation_platform_enter();
597 case HIBERNATION_SHUTDOWN: 604 case HIBERNATION_SHUTDOWN:
598 kernel_power_off(); 605 if (pm_power_off)
606 kernel_power_off();
599 break; 607 break;
600#ifdef CONFIG_SUSPEND 608#ifdef CONFIG_SUSPEND
601 case HIBERNATION_SUSPEND: 609 case HIBERNATION_SUSPEND:
@@ -623,7 +631,8 @@ static void power_down(void)
623 * corruption after resume. 631 * corruption after resume.
624 */ 632 */
625 printk(KERN_CRIT "PM: Please power down manually\n"); 633 printk(KERN_CRIT "PM: Please power down manually\n");
626 while(1); 634 while (1)
635 cpu_relax();
627} 636}
628 637
629/** 638/**
@@ -1109,7 +1118,10 @@ static int __init resumewait_setup(char *str)
1109 1118
1110static int __init resumedelay_setup(char *str) 1119static int __init resumedelay_setup(char *str)
1111{ 1120{
1112 resume_delay = simple_strtoul(str, NULL, 0); 1121 int rc = kstrtouint(str, 0, &resume_delay);
1122
1123 if (rc)
1124 return rc;
1113 return 1; 1125 return 1;
1114} 1126}
1115 1127
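swsusp_show_speed() now does the whole throughput calculation in unsigned arithmetic (do_div() is defined for unsigned 64-bit dividends, and a negative elapsed time would only ever have printed confusing values, as the new comment notes). A worked example of the formula with assumed numbers, 128000 pages of 4 KiB written in 2.5 seconds:

    #include <stdio.h>

    int main(void)
    {
        unsigned int nr_pages  = 128000;            /* assumption           */
        unsigned int page_kib  = 4;                 /* PAGE_SIZE / 1024     */
        unsigned int centisecs = 250;               /* 2.5 s elapsed        */
        unsigned int k   = nr_pages * page_kib;     /* 512000 KiB           */
        unsigned int kps = (k * 100) / centisecs;   /* 204800 KiB/s         */

        printf("PM: wrote %u kbytes in %u.%02u seconds (%u.%02u MB/s)\n",
               k, centisecs / 100, centisecs % 100,
               kps / 1000, (kps % 1000) / 10);      /* -> 204.80 MB/s       */
        return 0;
    }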
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 6271bc4073ef..573410d6647e 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -279,26 +279,26 @@ static inline void pm_print_times_init(void) {}
279struct kobject *power_kobj; 279struct kobject *power_kobj;
280 280
281/** 281/**
282 * state - control system power state. 282 * state - control system sleep states.
283 * 283 *
284 * show() returns what states are supported, which is hard-coded to 284 * show() returns available sleep state labels, which may be "mem", "standby",
285 * 'freeze' (Low-Power Idle), 'standby' (Power-On Suspend), 285 * "freeze" and "disk" (hibernation). See Documentation/power/states.txt for a
286 * 'mem' (Suspend-to-RAM), and 'disk' (Suspend-to-Disk). 286 * description of what they mean.
287 * 287 *
288 * store() accepts one of those strings, translates it into the 288 * store() accepts one of those strings, translates it into the proper
289 * proper enumerated value, and initiates a suspend transition. 289 * enumerated value, and initiates a suspend transition.
290 */ 290 */
291static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, 291static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
292 char *buf) 292 char *buf)
293{ 293{
294 char *s = buf; 294 char *s = buf;
295#ifdef CONFIG_SUSPEND 295#ifdef CONFIG_SUSPEND
296 int i; 296 suspend_state_t i;
297
298 for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++)
299 if (pm_states[i].state)
300 s += sprintf(s,"%s ", pm_states[i].label);
297 301
298 for (i = 0; i < PM_SUSPEND_MAX; i++) {
299 if (pm_states[i] && valid_state(i))
300 s += sprintf(s,"%s ", pm_states[i]);
301 }
302#endif 302#endif
303#ifdef CONFIG_HIBERNATION 303#ifdef CONFIG_HIBERNATION
304 s += sprintf(s, "%s\n", "disk"); 304 s += sprintf(s, "%s\n", "disk");
@@ -314,7 +314,7 @@ static suspend_state_t decode_state(const char *buf, size_t n)
314{ 314{
315#ifdef CONFIG_SUSPEND 315#ifdef CONFIG_SUSPEND
316 suspend_state_t state = PM_SUSPEND_MIN; 316 suspend_state_t state = PM_SUSPEND_MIN;
317 const char * const *s; 317 struct pm_sleep_state *s;
318#endif 318#endif
319 char *p; 319 char *p;
320 int len; 320 int len;
@@ -328,8 +328,9 @@ static suspend_state_t decode_state(const char *buf, size_t n)
328 328
329#ifdef CONFIG_SUSPEND 329#ifdef CONFIG_SUSPEND
330 for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) 330 for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++)
331 if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) 331 if (s->state && len == strlen(s->label)
332 return state; 332 && !strncmp(buf, s->label, len))
333 return s->state;
333#endif 334#endif
334 335
335 return PM_SUSPEND_ON; 336 return PM_SUSPEND_ON;
@@ -447,8 +448,8 @@ static ssize_t autosleep_show(struct kobject *kobj,
447 448
448#ifdef CONFIG_SUSPEND 449#ifdef CONFIG_SUSPEND
449 if (state < PM_SUSPEND_MAX) 450 if (state < PM_SUSPEND_MAX)
450 return sprintf(buf, "%s\n", valid_state(state) ? 451 return sprintf(buf, "%s\n", pm_states[state].state ?
451 pm_states[state] : "error"); 452 pm_states[state].label : "error");
452#endif 453#endif
453#ifdef CONFIG_HIBERNATION 454#ifdef CONFIG_HIBERNATION
454 return sprintf(buf, "disk\n"); 455 return sprintf(buf, "disk\n");
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 15f37ea08719..c60f13b5270a 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -178,17 +178,20 @@ extern void swsusp_show_speed(struct timeval *, struct timeval *,
178 unsigned int, char *); 178 unsigned int, char *);
179 179
180#ifdef CONFIG_SUSPEND 180#ifdef CONFIG_SUSPEND
181struct pm_sleep_state {
182 const char *label;
183 suspend_state_t state;
184};
185
181/* kernel/power/suspend.c */ 186/* kernel/power/suspend.c */
182extern const char *const pm_states[]; 187extern struct pm_sleep_state pm_states[];
183 188
184extern bool valid_state(suspend_state_t state);
185extern int suspend_devices_and_enter(suspend_state_t state); 189extern int suspend_devices_and_enter(suspend_state_t state);
186#else /* !CONFIG_SUSPEND */ 190#else /* !CONFIG_SUSPEND */
187static inline int suspend_devices_and_enter(suspend_state_t state) 191static inline int suspend_devices_and_enter(suspend_state_t state)
188{ 192{
189 return -ENOSYS; 193 return -ENOSYS;
190} 194}
191static inline bool valid_state(suspend_state_t state) { return false; }
192#endif /* !CONFIG_SUSPEND */ 195#endif /* !CONFIG_SUSPEND */
193 196
194#ifdef CONFIG_PM_TEST_SUSPEND 197#ifdef CONFIG_PM_TEST_SUSPEND
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 06ec8869dbf1..0ca8d83e2369 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -17,6 +17,7 @@
17#include <linux/delay.h> 17#include <linux/delay.h>
18#include <linux/workqueue.h> 18#include <linux/workqueue.h>
19#include <linux/kmod.h> 19#include <linux/kmod.h>
20#include <trace/events/power.h>
20 21
21/* 22/*
22 * Timeout for stopping processes 23 * Timeout for stopping processes
@@ -175,6 +176,7 @@ void thaw_processes(void)
175 struct task_struct *g, *p; 176 struct task_struct *g, *p;
176 struct task_struct *curr = current; 177 struct task_struct *curr = current;
177 178
179 trace_suspend_resume(TPS("thaw_processes"), 0, true);
178 if (pm_freezing) 180 if (pm_freezing)
179 atomic_dec(&system_freezing_cnt); 181 atomic_dec(&system_freezing_cnt);
180 pm_freezing = false; 182 pm_freezing = false;
@@ -201,6 +203,7 @@ void thaw_processes(void)
201 203
202 schedule(); 204 schedule();
203 printk("done.\n"); 205 printk("done.\n");
206 trace_suspend_resume(TPS("thaw_processes"), 0, false);
204} 207}
205 208
206void thaw_kernel_threads(void) 209void thaw_kernel_threads(void)
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 18fb7a2fb14b..1ea328aafdc9 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1586,7 +1586,7 @@ swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
1586 return -ENOMEM; 1586 return -ENOMEM;
1587} 1587}
1588 1588
1589asmlinkage int swsusp_save(void) 1589asmlinkage __visible int swsusp_save(void)
1590{ 1590{
1591 unsigned int nr_pages, nr_highmem; 1591 unsigned int nr_pages, nr_highmem;
1592 1592
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 8233cd4047d7..4dd8822f732a 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -31,13 +31,14 @@
31 31
32#include "power.h" 32#include "power.h"
33 33
34const char *const pm_states[PM_SUSPEND_MAX] = { 34struct pm_sleep_state pm_states[PM_SUSPEND_MAX] = {
35 [PM_SUSPEND_FREEZE] = "freeze", 35 [PM_SUSPEND_FREEZE] = { .label = "freeze", .state = PM_SUSPEND_FREEZE },
36 [PM_SUSPEND_STANDBY] = "standby", 36 [PM_SUSPEND_STANDBY] = { .label = "standby", },
37 [PM_SUSPEND_MEM] = "mem", 37 [PM_SUSPEND_MEM] = { .label = "mem", },
38}; 38};
39 39
40static const struct platform_suspend_ops *suspend_ops; 40static const struct platform_suspend_ops *suspend_ops;
41static const struct platform_freeze_ops *freeze_ops;
41 42
42static bool need_suspend_ops(suspend_state_t state) 43static bool need_suspend_ops(suspend_state_t state)
43{ 44{
@@ -47,6 +48,13 @@ static bool need_suspend_ops(suspend_state_t state)
47static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head); 48static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head);
48static bool suspend_freeze_wake; 49static bool suspend_freeze_wake;
49 50
51void freeze_set_ops(const struct platform_freeze_ops *ops)
52{
53 lock_system_sleep();
54 freeze_ops = ops;
55 unlock_system_sleep();
56}
57
50static void freeze_begin(void) 58static void freeze_begin(void)
51{ 59{
52 suspend_freeze_wake = false; 60 suspend_freeze_wake = false;
@@ -54,9 +62,11 @@ static void freeze_begin(void)
54 62
55static void freeze_enter(void) 63static void freeze_enter(void)
56{ 64{
65 cpuidle_use_deepest_state(true);
57 cpuidle_resume(); 66 cpuidle_resume();
58 wait_event(suspend_freeze_wait_head, suspend_freeze_wake); 67 wait_event(suspend_freeze_wait_head, suspend_freeze_wake);
59 cpuidle_pause(); 68 cpuidle_pause();
69 cpuidle_use_deepest_state(false);
60} 70}
61 71
62void freeze_wake(void) 72void freeze_wake(void)
@@ -66,42 +76,62 @@ void freeze_wake(void)
66} 76}
67EXPORT_SYMBOL_GPL(freeze_wake); 77EXPORT_SYMBOL_GPL(freeze_wake);
68 78
79static bool valid_state(suspend_state_t state)
80{
81 /*
82 * PM_SUSPEND_STANDBY and PM_SUSPEND_MEM states need low level
83 * support and need to be valid to the low level
84 * implementation, no valid callback implies that none are valid.
85 */
86 return suspend_ops && suspend_ops->valid && suspend_ops->valid(state);
87}
88
89/*
90 * If this is set, the "mem" label always corresponds to the deepest sleep state
91 * available, the "standby" label corresponds to the second deepest sleep state
92 * available (if any), and the "freeze" label corresponds to the remaining
93 * available sleep state (if there is one).
94 */
95static bool relative_states;
96
97static int __init sleep_states_setup(char *str)
98{
99 relative_states = !strncmp(str, "1", 1);
100 if (relative_states) {
101 pm_states[PM_SUSPEND_MEM].state = PM_SUSPEND_FREEZE;
102 pm_states[PM_SUSPEND_FREEZE].state = 0;
103 }
104 return 1;
105}
106
107__setup("relative_sleep_states=", sleep_states_setup);
108
69/** 109/**
70 * suspend_set_ops - Set the global suspend method table. 110 * suspend_set_ops - Set the global suspend method table.
71 * @ops: Suspend operations to use. 111 * @ops: Suspend operations to use.
72 */ 112 */
73void suspend_set_ops(const struct platform_suspend_ops *ops) 113void suspend_set_ops(const struct platform_suspend_ops *ops)
74{ 114{
115 suspend_state_t i;
116 int j = PM_SUSPEND_MAX - 1;
117
75 lock_system_sleep(); 118 lock_system_sleep();
119
76 suspend_ops = ops; 120 suspend_ops = ops;
121 for (i = PM_SUSPEND_MEM; i >= PM_SUSPEND_STANDBY; i--)
122 if (valid_state(i))
123 pm_states[j--].state = i;
124 else if (!relative_states)
125 pm_states[j--].state = 0;
126
127 pm_states[j--].state = PM_SUSPEND_FREEZE;
128 while (j >= PM_SUSPEND_MIN)
129 pm_states[j--].state = 0;
130
77 unlock_system_sleep(); 131 unlock_system_sleep();
78} 132}
79EXPORT_SYMBOL_GPL(suspend_set_ops); 133EXPORT_SYMBOL_GPL(suspend_set_ops);
80 134
81bool valid_state(suspend_state_t state)
82{
83 if (state == PM_SUSPEND_FREEZE) {
84#ifdef CONFIG_PM_DEBUG
85 if (pm_test_level != TEST_NONE &&
86 pm_test_level != TEST_FREEZER &&
87 pm_test_level != TEST_DEVICES &&
88 pm_test_level != TEST_PLATFORM) {
89 printk(KERN_WARNING "Unsupported pm_test mode for "
90 "freeze state, please choose "
91 "none/freezer/devices/platform.\n");
92 return false;
93 }
94#endif
95 return true;
96 }
97 /*
98 * PM_SUSPEND_STANDBY and PM_SUSPEND_MEMORY states need lowlevel
99 * support and need to be valid to the lowlevel
100 * implementation, no valid callback implies that none are valid.
101 */
102 return suspend_ops && suspend_ops->valid && suspend_ops->valid(state);
103}
104
105/** 135/**
106 * suspend_valid_only_mem - Generic memory-only valid callback. 136 * suspend_valid_only_mem - Generic memory-only valid callback.
107 * 137 *
@@ -147,7 +177,9 @@ static int suspend_prepare(suspend_state_t state)
147 if (error) 177 if (error)
148 goto Finish; 178 goto Finish;
149 179
180 trace_suspend_resume(TPS("freeze_processes"), 0, true);
150 error = suspend_freeze_processes(); 181 error = suspend_freeze_processes();
182 trace_suspend_resume(TPS("freeze_processes"), 0, false);
151 if (!error) 183 if (!error)
152 return 0; 184 return 0;
153 185
@@ -210,7 +242,9 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
210 * all the devices are suspended. 242 * all the devices are suspended.
211 */ 243 */
212 if (state == PM_SUSPEND_FREEZE) { 244 if (state == PM_SUSPEND_FREEZE) {
245 trace_suspend_resume(TPS("machine_suspend"), state, true);
213 freeze_enter(); 246 freeze_enter();
247 trace_suspend_resume(TPS("machine_suspend"), state, false);
214 goto Platform_wake; 248 goto Platform_wake;
215 } 249 }
216 250
@@ -226,7 +260,11 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
226 if (!error) { 260 if (!error) {
227 *wakeup = pm_wakeup_pending(); 261 *wakeup = pm_wakeup_pending();
228 if (!(suspend_test(TEST_CORE) || *wakeup)) { 262 if (!(suspend_test(TEST_CORE) || *wakeup)) {
263 trace_suspend_resume(TPS("machine_suspend"),
264 state, true);
229 error = suspend_ops->enter(state); 265 error = suspend_ops->enter(state);
266 trace_suspend_resume(TPS("machine_suspend"),
267 state, false);
230 events_check_enabled = false; 268 events_check_enabled = false;
231 } 269 }
232 syscore_resume(); 270 syscore_resume();
@@ -264,11 +302,14 @@ int suspend_devices_and_enter(suspend_state_t state)
264 if (need_suspend_ops(state) && !suspend_ops) 302 if (need_suspend_ops(state) && !suspend_ops)
265 return -ENOSYS; 303 return -ENOSYS;
266 304
267 trace_machine_suspend(state);
268 if (need_suspend_ops(state) && suspend_ops->begin) { 305 if (need_suspend_ops(state) && suspend_ops->begin) {
269 error = suspend_ops->begin(state); 306 error = suspend_ops->begin(state);
270 if (error) 307 if (error)
271 goto Close; 308 goto Close;
309 } else if (state == PM_SUSPEND_FREEZE && freeze_ops->begin) {
310 error = freeze_ops->begin();
311 if (error)
312 goto Close;
272 } 313 }
273 suspend_console(); 314 suspend_console();
274 suspend_test_start(); 315 suspend_test_start();
@@ -294,7 +335,9 @@ int suspend_devices_and_enter(suspend_state_t state)
294 Close: 335 Close:
295 if (need_suspend_ops(state) && suspend_ops->end) 336 if (need_suspend_ops(state) && suspend_ops->end)
296 suspend_ops->end(); 337 suspend_ops->end();
297 trace_machine_suspend(PWR_EVENT_EXIT); 338 else if (state == PM_SUSPEND_FREEZE && freeze_ops->end)
339 freeze_ops->end();
340
298 return error; 341 return error;
299 342
300 Recover_platform: 343 Recover_platform:
@@ -328,20 +371,31 @@ static int enter_state(suspend_state_t state)
328{ 371{
329 int error; 372 int error;
330 373
331 if (!valid_state(state)) 374 trace_suspend_resume(TPS("suspend_enter"), state, true);
332 return -ENODEV; 375 if (state == PM_SUSPEND_FREEZE) {
333 376#ifdef CONFIG_PM_DEBUG
377 if (pm_test_level != TEST_NONE && pm_test_level <= TEST_CPUS) {
378 pr_warning("PM: Unsupported test mode for freeze state,"
379 "please choose none/freezer/devices/platform.\n");
380 return -EAGAIN;
381 }
382#endif
383 } else if (!valid_state(state)) {
384 return -EINVAL;
385 }
334 if (!mutex_trylock(&pm_mutex)) 386 if (!mutex_trylock(&pm_mutex))
335 return -EBUSY; 387 return -EBUSY;
336 388
337 if (state == PM_SUSPEND_FREEZE) 389 if (state == PM_SUSPEND_FREEZE)
338 freeze_begin(); 390 freeze_begin();
339 391
392 trace_suspend_resume(TPS("sync_filesystems"), 0, true);
340 printk(KERN_INFO "PM: Syncing filesystems ... "); 393 printk(KERN_INFO "PM: Syncing filesystems ... ");
341 sys_sync(); 394 sys_sync();
342 printk("done.\n"); 395 printk("done.\n");
396 trace_suspend_resume(TPS("sync_filesystems"), 0, false);
343 397
344 pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); 398 pr_debug("PM: Preparing system for %s sleep\n", pm_states[state].label);
345 error = suspend_prepare(state); 399 error = suspend_prepare(state);
346 if (error) 400 if (error)
347 goto Unlock; 401 goto Unlock;
@@ -349,7 +403,8 @@ static int enter_state(suspend_state_t state)
349 if (suspend_test(TEST_FREEZER)) 403 if (suspend_test(TEST_FREEZER))
350 goto Finish; 404 goto Finish;
351 405
352 pr_debug("PM: Entering %s sleep\n", pm_states[state]); 406 trace_suspend_resume(TPS("suspend_enter"), state, false);
407 pr_debug("PM: Entering %s sleep\n", pm_states[state].label);
353 pm_restrict_gfp_mask(); 408 pm_restrict_gfp_mask();
354 error = suspend_devices_and_enter(state); 409 error = suspend_devices_and_enter(state);
355 pm_restore_gfp_mask(); 410 pm_restore_gfp_mask();
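pm_states[] now pairs each sysfs label with the suspend_state_t it currently maps to, and suspend_set_ops() recomputes that mapping from the platform's ->valid() callback. With relative_sleep_states=1 the labels slide so that "mem" always names the deepest state the platform actually supports. A user-space model of the assignment loop, assuming the usual enum ordering (FREEZE < STANDBY < MEM) and a hypothetical platform that only validates standby:

    #include <stdio.h>
    #include <stdbool.h>

    enum { S_ON, S_FREEZE, S_STANDBY, S_MEM, S_MAX };
    static const char *label[S_MAX] = { "", "freeze", "standby", "mem" };
    static int state[S_MAX];                  /* 0 means "not offered" */

    static bool platform_valid(int s) { return s == S_STANDBY; }  /* assumption */

    int main(void)
    {
        bool relative = true;                 /* relative_sleep_states=1 */
        int i, j = S_MAX - 1;

        for (i = S_MEM; i >= S_STANDBY; i--)
            if (platform_valid(i))
                state[j--] = i;               /* deepest valid state first */
            else if (!relative)
                state[j--] = 0;               /* absolute mode: keep the slot */
        state[j--] = S_FREEZE;                /* freeze always exists */
        while (j >= S_FREEZE)
            state[j--] = 0;

        for (i = S_FREEZE; i < S_MAX; i++)
            printf("%-8s -> %s\n", label[i],
                   state[i] ? label[state[i]] : "(not offered)");
        return 0;
    }

With these assumptions it prints "mem -> standby", "standby -> freeze" and "freeze -> (not offered)", i.e. the relabelling described in the relative_states comment; in the default absolute mode the labels keep their traditional meanings and an unsupported state is simply not offered.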
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index 9b2a1d58558d..269b097e78ea 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -92,13 +92,13 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
92 } 92 }
93 93
94 if (state == PM_SUSPEND_MEM) { 94 if (state == PM_SUSPEND_MEM) {
95 printk(info_test, pm_states[state]); 95 printk(info_test, pm_states[state].label);
96 status = pm_suspend(state); 96 status = pm_suspend(state);
97 if (status == -ENODEV) 97 if (status == -ENODEV)
98 state = PM_SUSPEND_STANDBY; 98 state = PM_SUSPEND_STANDBY;
99 } 99 }
100 if (state == PM_SUSPEND_STANDBY) { 100 if (state == PM_SUSPEND_STANDBY) {
101 printk(info_test, pm_states[state]); 101 printk(info_test, pm_states[state].label);
102 status = pm_suspend(state); 102 status = pm_suspend(state);
103 } 103 }
104 if (status < 0) 104 if (status < 0)
@@ -136,18 +136,16 @@ static char warn_bad_state[] __initdata =
136 136
137static int __init setup_test_suspend(char *value) 137static int __init setup_test_suspend(char *value)
138{ 138{
139 unsigned i; 139 suspend_state_t i;
140 140
141 /* "=mem" ==> "mem" */ 141 /* "=mem" ==> "mem" */
142 value++; 142 value++;
143 for (i = 0; i < PM_SUSPEND_MAX; i++) { 143 for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++)
144 if (!pm_states[i]) 144 if (!strcmp(pm_states[i].label, value)) {
145 continue; 145 test_state = pm_states[i].state;
146 if (strcmp(pm_states[i], value) != 0) 146 return 0;
147 continue; 147 }
148 test_state = (__force suspend_state_t) i; 148
149 return 0;
150 }
151 printk(warn_bad_state, value); 149 printk(warn_bad_state, value);
152 return 0; 150 return 0;
153} 151}
@@ -164,8 +162,8 @@ static int __init test_suspend(void)
164 /* PM is initialized by now; is that state testable? */ 162 /* PM is initialized by now; is that state testable? */
165 if (test_state == PM_SUSPEND_ON) 163 if (test_state == PM_SUSPEND_ON)
166 goto done; 164 goto done;
167 if (!valid_state(test_state)) { 165 if (!pm_states[test_state].state) {
168 printk(warn_bad_state, pm_states[test_state]); 166 printk(warn_bad_state, pm_states[test_state].label);
169 goto done; 167 goto done;
170 } 168 }
171 169
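
A self-contained sketch of the new label matching in setup_test_suspend() above: a `test_suspend=mem` boot argument reaches the handler as "=mem", the leading '=' is skipped, and the value is matched by string against the labels instead of being cast from an array index. The labels below are illustrative.

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		const char *labels[] = { "freeze", "standby", "mem" };
		char arg[] = "=mem";		/* as passed for "test_suspend=mem" */
		char *value = arg + 1;		/* "=mem" ==> "mem" */

		for (size_t i = 0; i < sizeof(labels) / sizeof(labels[0]); i++)
			if (!strcmp(labels[i], value)) {
				printf("test state: %s\n", labels[i]);
				return 0;
			}
		printf("unrecognized test_suspend value: %s\n", value);
		return 0;
	}
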
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 8c9a4819f798..aaa3261dea5d 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -567,7 +567,7 @@ static int lzo_compress_threadfn(void *data)
567 567
568/** 568/**
569 * save_image_lzo - Save the suspend image data compressed with LZO. 569 * save_image_lzo - Save the suspend image data compressed with LZO.
570 * @handle: Swap mam handle to use for saving the image. 570 * @handle: Swap map handle to use for saving the image.
571 * @snapshot: Image to read data from. 571 * @snapshot: Image to read data from.
572 * @nr_to_write: Number of pages to save. 572 * @nr_to_write: Number of pages to save.
573 */ 573 */
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index a45b50962295..ea2d5f6962ed 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -54,20 +54,16 @@
54#include "console_cmdline.h" 54#include "console_cmdline.h"
55#include "braille.h" 55#include "braille.h"
56 56
57/* printk's without a loglevel use this.. */
58#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL
59
60/* We show everything that is MORE important than this.. */
61#define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */
62#define DEFAULT_CONSOLE_LOGLEVEL 7 /* anything MORE serious than KERN_DEBUG */
63
64int console_printk[4] = { 57int console_printk[4] = {
65 DEFAULT_CONSOLE_LOGLEVEL, /* console_loglevel */ 58 CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */
66 DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */ 59 DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */
67 MINIMUM_CONSOLE_LOGLEVEL, /* minimum_console_loglevel */ 60 CONSOLE_LOGLEVEL_MIN, /* minimum_console_loglevel */
68 DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ 61 CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */
69}; 62};
70 63
 64/* Deferred messages from sched code are marked by this special level */
65#define SCHED_MESSAGE_LOGLEVEL -2
66
71/* 67/*
72 * Low level drivers may need that to know if they can schedule in 68 * Low level drivers may need that to know if they can schedule in
73 * their unblank() callback or not. So let's export it. 69 * their unblank() callback or not. So let's export it.
@@ -91,6 +87,29 @@ static struct lockdep_map console_lock_dep_map = {
91#endif 87#endif
92 88
93/* 89/*
90 * Helper macros to handle lockdep when locking/unlocking console_sem. We use
91 * macros instead of functions so that _RET_IP_ contains useful information.
92 */
93#define down_console_sem() do { \
94 down(&console_sem);\
95 mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_);\
96} while (0)
97
98static int __down_trylock_console_sem(unsigned long ip)
99{
100 if (down_trylock(&console_sem))
101 return 1;
102 mutex_acquire(&console_lock_dep_map, 0, 1, ip);
103 return 0;
104}
105#define down_trylock_console_sem() __down_trylock_console_sem(_RET_IP_)
106
107#define up_console_sem() do { \
108 mutex_release(&console_lock_dep_map, 1, _RET_IP_);\
109 up(&console_sem);\
110} while (0)
111
112/*
94 * This is used for debugging the mess that is the VT code by 113 * This is used for debugging the mess that is the VT code by
95 * keeping track if we have the console semaphore held. It's 114 * keeping track if we have the console semaphore held. It's
96 * definitely not the perfect debug tool (we don't know if _WE_ 115 * definitely not the perfect debug tool (we don't know if _WE_
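
A hedged userspace sketch of why down_console_sem()/up_console_sem() above are macros rather than functions: _RET_IP_ is __builtin_return_address(0), so inside a function wrapper it always points just past the call in this file's own helpers, while a macro expands in the caller and reports that caller's own call site, which is the more useful address for lockdep to record. Every name below is hypothetical; build with -O0 so nothing is inlined away.

	#include <stdio.h>

	static void annotate(void *ip)
	{
		printf("acquired at %p\n", ip);
	}

	static void down_wrapper_fn(void)
	{
		annotate(__builtin_return_address(0));	/* return address inside take_lock() */
	}

	/* expands in the caller's frame, so it reports take_lock()'s own caller */
	#define down_wrapper_macro() annotate(__builtin_return_address(0))

	static void take_lock(void)
	{
		down_wrapper_fn();
		down_wrapper_macro();
	}

	int main(void)
	{
		take_lock();		/* prints two different addresses */
		return 0;
	}
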
@@ -206,8 +225,9 @@ struct printk_log {
206}; 225};
207 226
208/* 227/*
209 * The logbuf_lock protects kmsg buffer, indices, counters. It is also 228 * The logbuf_lock protects kmsg buffer, indices, counters. This can be taken
210 * used in interesting ways to provide interlocking in console_unlock(); 229 * within the scheduler's rq lock. It must be released before calling
230 * console_unlock() or anything else that might wake up a process.
211 */ 231 */
212static DEFINE_RAW_SPINLOCK(logbuf_lock); 232static DEFINE_RAW_SPINLOCK(logbuf_lock);
213 233
@@ -250,9 +270,6 @@ static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
250static char *log_buf = __log_buf; 270static char *log_buf = __log_buf;
251static u32 log_buf_len = __LOG_BUF_LEN; 271static u32 log_buf_len = __LOG_BUF_LEN;
252 272
253/* cpu currently holding logbuf_lock */
254static volatile unsigned int logbuf_cpu = UINT_MAX;
255
256/* human readable text of the record */ 273/* human readable text of the record */
257static char *log_text(const struct printk_log *msg) 274static char *log_text(const struct printk_log *msg)
258{ 275{
@@ -297,34 +314,106 @@ static u32 log_next(u32 idx)
297 return idx + msg->len; 314 return idx + msg->len;
298} 315}
299 316
300/* insert record into the buffer, discard old ones, update heads */ 317/*
301static void log_store(int facility, int level, 318 * Check whether there is enough free space for the given message.
302 enum log_flags flags, u64 ts_nsec, 319 *
303 const char *dict, u16 dict_len, 320 * The same values of first_idx and next_idx mean that the buffer
304 const char *text, u16 text_len) 321 * is either empty or full.
322 *
323 * If the buffer is empty, we must respect the position of the indexes.
324 * They cannot be reset to the beginning of the buffer.
325 */
326static int logbuf_has_space(u32 msg_size, bool empty)
305{ 327{
306 struct printk_log *msg; 328 u32 free;
307 u32 size, pad_len;
308 329
309 /* number of '\0' padding bytes to next message */ 330 if (log_next_idx > log_first_idx || empty)
310 size = sizeof(struct printk_log) + text_len + dict_len; 331 free = max(log_buf_len - log_next_idx, log_first_idx);
311 pad_len = (-size) & (LOG_ALIGN - 1); 332 else
312 size += pad_len; 333 free = log_first_idx - log_next_idx;
334
335 /*
 336	 * We also need space for an empty header that signals wrapping
337 * of the buffer.
338 */
339 return free >= msg_size + sizeof(struct printk_log);
340}
313 341
342static int log_make_free_space(u32 msg_size)
343{
314 while (log_first_seq < log_next_seq) { 344 while (log_first_seq < log_next_seq) {
315 u32 free; 345 if (logbuf_has_space(msg_size, false))
346 return 0;
347 /* drop old messages until we have enough continuous space */
348 log_first_idx = log_next(log_first_idx);
349 log_first_seq++;
350 }
316 351
317 if (log_next_idx > log_first_idx) 352 /* sequence numbers are equal, so the log buffer is empty */
318 free = max(log_buf_len - log_next_idx, log_first_idx); 353 if (logbuf_has_space(msg_size, true))
319 else 354 return 0;
320 free = log_first_idx - log_next_idx;
321 355
322 if (free >= size + sizeof(struct printk_log)) 356 return -ENOMEM;
323 break; 357}
324 358
325 /* drop old messages until we have enough contiuous space */ 359/* compute the message size including the padding bytes */
326 log_first_idx = log_next(log_first_idx); 360static u32 msg_used_size(u16 text_len, u16 dict_len, u32 *pad_len)
327 log_first_seq++; 361{
362 u32 size;
363
364 size = sizeof(struct printk_log) + text_len + dict_len;
365 *pad_len = (-size) & (LOG_ALIGN - 1);
366 size += *pad_len;
367
368 return size;
369}
370
371/*
372 * Define how much of the log buffer we could take at maximum. The value
373 * must be greater than two. Note that only half of the buffer is available
374 * when the index points to the middle.
375 */
376#define MAX_LOG_TAKE_PART 4
377static const char trunc_msg[] = "<truncated>";
378
379static u32 truncate_msg(u16 *text_len, u16 *trunc_msg_len,
380 u16 *dict_len, u32 *pad_len)
381{
382 /*
383 * The message should not take the whole buffer. Otherwise, it might
384 * get removed too soon.
385 */
386 u32 max_text_len = log_buf_len / MAX_LOG_TAKE_PART;
387 if (*text_len > max_text_len)
388 *text_len = max_text_len;
389 /* enable the warning message */
390 *trunc_msg_len = strlen(trunc_msg);
391 /* disable the "dict" completely */
392 *dict_len = 0;
393 /* compute the size again, count also the warning message */
394 return msg_used_size(*text_len + *trunc_msg_len, 0, pad_len);
395}
396
397/* insert record into the buffer, discard old ones, update heads */
398static int log_store(int facility, int level,
399 enum log_flags flags, u64 ts_nsec,
400 const char *dict, u16 dict_len,
401 const char *text, u16 text_len)
402{
403 struct printk_log *msg;
404 u32 size, pad_len;
405 u16 trunc_msg_len = 0;
406
407 /* number of '\0' padding bytes to next message */
408 size = msg_used_size(text_len, dict_len, &pad_len);
409
410 if (log_make_free_space(size)) {
411 /* truncate the message if it is too long for empty buffer */
412 size = truncate_msg(&text_len, &trunc_msg_len,
413 &dict_len, &pad_len);
414 /* survive when the log buffer is too small for trunc_msg */
415 if (log_make_free_space(size))
416 return 0;
328 } 417 }
329 418
330 if (log_next_idx + size + sizeof(struct printk_log) > log_buf_len) { 419 if (log_next_idx + size + sizeof(struct printk_log) > log_buf_len) {
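
A standalone model of the space accounting introduced above (logbuf_has_space()/msg_used_size()): equal first/next indexes need the explicit `empty` flag, records are padded to LOG_ALIGN, and one extra header's worth of space is reserved so a wrap back to the start of the buffer can be marked. The constants and the simplified header size are assumptions of the sketch, not printk's real values.

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	#define LOG_ALIGN	8u
	#define LOG_BUF_LEN	64u
	#define HDR_SIZE	8u	/* stands in for sizeof(struct printk_log) */

	static uint32_t first_idx, next_idx;	/* oldest record / write position */

	static uint32_t used_size(uint32_t text_len, uint32_t *pad_len)
	{
		uint32_t size = HDR_SIZE + text_len;

		*pad_len = (-size) & (LOG_ALIGN - 1);	/* keep records aligned */
		return size + *pad_len;
	}

	static bool has_space(uint32_t msg_size, bool empty)
	{
		uint32_t free;

		if (next_idx > first_idx || empty)
			free = (LOG_BUF_LEN - next_idx) > first_idx ?
			       (LOG_BUF_LEN - next_idx) : first_idx;
		else
			free = first_idx - next_idx;

		/* one extra header is reserved to mark a wrap to the start */
		return free >= msg_size + HDR_SIZE;
	}

	int main(void)
	{
		uint32_t pad, size = used_size(13, &pad);

		first_idx = 40;
		next_idx = 40;			/* equal indexes, buffer empty */
		printf("size=%u pad=%u fits=%d\n", size, pad, has_space(size, true));
		return 0;
	}
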
@@ -341,6 +430,10 @@ static void log_store(int facility, int level,
341 msg = (struct printk_log *)(log_buf + log_next_idx); 430 msg = (struct printk_log *)(log_buf + log_next_idx);
342 memcpy(log_text(msg), text, text_len); 431 memcpy(log_text(msg), text, text_len);
343 msg->text_len = text_len; 432 msg->text_len = text_len;
433 if (trunc_msg_len) {
434 memcpy(log_text(msg) + text_len, trunc_msg, trunc_msg_len);
435 msg->text_len += trunc_msg_len;
436 }
344 memcpy(log_dict(msg), dict, dict_len); 437 memcpy(log_dict(msg), dict, dict_len);
345 msg->dict_len = dict_len; 438 msg->dict_len = dict_len;
346 msg->facility = facility; 439 msg->facility = facility;
@@ -356,6 +449,8 @@ static void log_store(int facility, int level,
356 /* insert message */ 449 /* insert message */
357 log_next_idx += msg->len; 450 log_next_idx += msg->len;
358 log_next_seq++; 451 log_next_seq++;
452
453 return msg->text_len;
359} 454}
360 455
361#ifdef CONFIG_SECURITY_DMESG_RESTRICT 456#ifdef CONFIG_SECURITY_DMESG_RESTRICT
@@ -1303,7 +1398,10 @@ static void zap_locks(void)
1303 sema_init(&console_sem, 1); 1398 sema_init(&console_sem, 1);
1304} 1399}
1305 1400
1306/* Check if we have any console registered that can be called early in boot. */ 1401/*
 1402 * Check if we have any console that is capable of printing while the CPU is
1403 * booting or shutting down. Requires console_sem.
1404 */
1307static int have_callable_console(void) 1405static int have_callable_console(void)
1308{ 1406{
1309 struct console *con; 1407 struct console *con;
@@ -1318,10 +1416,9 @@ static int have_callable_console(void)
1318/* 1416/*
1319 * Can we actually use the console at this time on this cpu? 1417 * Can we actually use the console at this time on this cpu?
1320 * 1418 *
1321 * Console drivers may assume that per-cpu resources have 1419 * Console drivers may assume that per-cpu resources have been allocated. So
1322 * been allocated. So unless they're explicitly marked as 1420 * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't
1323 * being able to cope (CON_ANYTIME) don't call them until 1421 * call them until this CPU is officially up.
1324 * this CPU is officially up.
1325 */ 1422 */
1326static inline int can_use_console(unsigned int cpu) 1423static inline int can_use_console(unsigned int cpu)
1327{ 1424{
@@ -1333,36 +1430,24 @@ static inline int can_use_console(unsigned int cpu)
1333 * messages from a 'printk'. Return true (and with the 1430 * messages from a 'printk'. Return true (and with the
1334 * console_lock held, and 'console_locked' set) if it 1431 * console_lock held, and 'console_locked' set) if it
1335 * is successful, false otherwise. 1432 * is successful, false otherwise.
1336 *
1337 * This gets called with the 'logbuf_lock' spinlock held and
1338 * interrupts disabled. It should return with 'lockbuf_lock'
1339 * released but interrupts still disabled.
1340 */ 1433 */
1341static int console_trylock_for_printk(unsigned int cpu) 1434static int console_trylock_for_printk(void)
1342 __releases(&logbuf_lock)
1343{ 1435{
1344 int retval = 0, wake = 0; 1436 unsigned int cpu = smp_processor_id();
1345 1437
1346 if (console_trylock()) { 1438 if (!console_trylock())
1347 retval = 1; 1439 return 0;
1348 1440 /*
1349 /* 1441 * If we can't use the console, we need to release the console
1350 * If we can't use the console, we need to release 1442 * semaphore by hand to avoid flushing the buffer. We need to hold the
1351 * the console semaphore by hand to avoid flushing 1443 * console semaphore in order to do this test safely.
1352 * the buffer. We need to hold the console semaphore 1444 */
1353 * in order to do this test safely. 1445 if (!can_use_console(cpu)) {
1354 */ 1446 console_locked = 0;
1355 if (!can_use_console(cpu)) { 1447 up_console_sem();
1356 console_locked = 0; 1448 return 0;
1357 wake = 1;
1358 retval = 0;
1359 }
1360 } 1449 }
1361 logbuf_cpu = UINT_MAX; 1450 return 1;
1362 raw_spin_unlock(&logbuf_lock);
1363 if (wake)
1364 up(&console_sem);
1365 return retval;
1366} 1451}
1367 1452
1368int printk_delay_msec __read_mostly; 1453int printk_delay_msec __read_mostly;
@@ -1490,11 +1575,19 @@ asmlinkage int vprintk_emit(int facility, int level,
1490 static int recursion_bug; 1575 static int recursion_bug;
1491 static char textbuf[LOG_LINE_MAX]; 1576 static char textbuf[LOG_LINE_MAX];
1492 char *text = textbuf; 1577 char *text = textbuf;
1493 size_t text_len; 1578 size_t text_len = 0;
1494 enum log_flags lflags = 0; 1579 enum log_flags lflags = 0;
1495 unsigned long flags; 1580 unsigned long flags;
1496 int this_cpu; 1581 int this_cpu;
1497 int printed_len = 0; 1582 int printed_len = 0;
1583 bool in_sched = false;
1584 /* cpu currently holding logbuf_lock in this function */
1585 static volatile unsigned int logbuf_cpu = UINT_MAX;
1586
1587 if (level == SCHED_MESSAGE_LOGLEVEL) {
1588 level = -1;
1589 in_sched = true;
1590 }
1498 1591
1499 boot_delay_msec(level); 1592 boot_delay_msec(level);
1500 printk_delay(); 1593 printk_delay();
@@ -1516,7 +1609,8 @@ asmlinkage int vprintk_emit(int facility, int level,
1516 */ 1609 */
1517 if (!oops_in_progress && !lockdep_recursing(current)) { 1610 if (!oops_in_progress && !lockdep_recursing(current)) {
1518 recursion_bug = 1; 1611 recursion_bug = 1;
1519 goto out_restore_irqs; 1612 local_irq_restore(flags);
1613 return 0;
1520 } 1614 }
1521 zap_locks(); 1615 zap_locks();
1522 } 1616 }
@@ -1530,17 +1624,22 @@ asmlinkage int vprintk_emit(int facility, int level,
1530 "BUG: recent printk recursion!"; 1624 "BUG: recent printk recursion!";
1531 1625
1532 recursion_bug = 0; 1626 recursion_bug = 0;
1533 printed_len += strlen(recursion_msg); 1627 text_len = strlen(recursion_msg);
1534 /* emit KERN_CRIT message */ 1628 /* emit KERN_CRIT message */
1535 log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, 1629 printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0,
1536 NULL, 0, recursion_msg, printed_len); 1630 NULL, 0, recursion_msg, text_len);
1537 } 1631 }
1538 1632
1539 /* 1633 /*
1540 * The printf needs to come first; we need the syslog 1634 * The printf needs to come first; we need the syslog
1541 * prefix which might be passed-in as a parameter. 1635 * prefix which might be passed-in as a parameter.
1542 */ 1636 */
1543 text_len = vscnprintf(text, sizeof(textbuf), fmt, args); 1637 if (in_sched)
1638 text_len = scnprintf(text, sizeof(textbuf),
1639 KERN_WARNING "[sched_delayed] ");
1640
1641 text_len += vscnprintf(text + text_len,
1642 sizeof(textbuf) - text_len, fmt, args);
1544 1643
1545 /* mark and strip a trailing newline */ 1644 /* mark and strip a trailing newline */
1546 if (text_len && text[text_len-1] == '\n') { 1645 if (text_len && text[text_len-1] == '\n') {
@@ -1586,9 +1685,12 @@ asmlinkage int vprintk_emit(int facility, int level,
1586 cont_flush(LOG_NEWLINE); 1685 cont_flush(LOG_NEWLINE);
1587 1686
1588 /* buffer line if possible, otherwise store it right away */ 1687 /* buffer line if possible, otherwise store it right away */
1589 if (!cont_add(facility, level, text, text_len)) 1688 if (cont_add(facility, level, text, text_len))
1590 log_store(facility, level, lflags | LOG_CONT, 0, 1689 printed_len += text_len;
1591 dict, dictlen, text, text_len); 1690 else
1691 printed_len += log_store(facility, level,
1692 lflags | LOG_CONT, 0,
1693 dict, dictlen, text, text_len);
1592 } else { 1694 } else {
1593 bool stored = false; 1695 bool stored = false;
1594 1696
@@ -1607,26 +1709,35 @@ asmlinkage int vprintk_emit(int facility, int level,
1607 cont_flush(LOG_NEWLINE); 1709 cont_flush(LOG_NEWLINE);
1608 } 1710 }
1609 1711
1610 if (!stored) 1712 if (stored)
1611 log_store(facility, level, lflags, 0, 1713 printed_len += text_len;
1612 dict, dictlen, text, text_len); 1714 else
1715 printed_len += log_store(facility, level, lflags, 0,
1716 dict, dictlen, text, text_len);
1613 } 1717 }
1614 printed_len += text_len; 1718
1719 logbuf_cpu = UINT_MAX;
1720 raw_spin_unlock(&logbuf_lock);
1721 lockdep_on();
1722 local_irq_restore(flags);
1723
1724 /* If called from the scheduler, we can not call up(). */
1725 if (in_sched)
1726 return printed_len;
1615 1727
1616 /* 1728 /*
1729 * Disable preemption to avoid being preempted while holding
1730 * console_sem which would prevent anyone from printing to console
1731 */
1732 preempt_disable();
1733 /*
1617 * Try to acquire and then immediately release the console semaphore. 1734 * Try to acquire and then immediately release the console semaphore.
1618 * The release will print out buffers and wake up /dev/kmsg and syslog() 1735 * The release will print out buffers and wake up /dev/kmsg and syslog()
1619 * users. 1736 * users.
1620 *
1621 * The console_trylock_for_printk() function will release 'logbuf_lock'
1622 * regardless of whether it actually gets the console semaphore or not.
1623 */ 1737 */
1624 if (console_trylock_for_printk(this_cpu)) 1738 if (console_trylock_for_printk())
1625 console_unlock(); 1739 console_unlock();
1626 1740 preempt_enable();
1627 lockdep_on();
1628out_restore_irqs:
1629 local_irq_restore(flags);
1630 1741
1631 return printed_len; 1742 return printed_len;
1632} 1743}
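
A userspace analogue of the restructured tail of vprintk_emit() above: the message is stored unconditionally, a caller in scheduler context returns without touching the console lock, and any other caller opportunistically trylocks and flushes on behalf of everyone. pthread primitives stand in for console_sem purely for illustration, and there is no userspace analogue of the preempt_disable()/preempt_enable() pair, so that part is omitted. Build with -pthread.

	#include <pthread.h>
	#include <stdbool.h>
	#include <stdio.h>

	static pthread_mutex_t console_lock = PTHREAD_MUTEX_INITIALIZER;

	static void console_flush(void)
	{
		puts("flushing buffered messages to the console");
	}

	static int emit(const char *msg, bool in_sched)
	{
		int printed = printf("stored: %s\n", msg);	/* stands in for log_store() */

		if (in_sched)			/* cannot take console_lock from here; */
			return printed;		/* an irq_work will flush later        */

		if (pthread_mutex_trylock(&console_lock) == 0) {
			console_flush();
			pthread_mutex_unlock(&console_lock);
		}				/* else: the current holder flushes */
		return printed;
	}

	int main(void)
	{
		emit("hello from process context", false);
		emit("hello from scheduler context", true);
		return 0;
	}
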
@@ -1674,7 +1785,7 @@ EXPORT_SYMBOL(printk_emit);
1674 * 1785 *
1675 * See the vsnprintf() documentation for format string extensions over C99. 1786 * See the vsnprintf() documentation for format string extensions over C99.
1676 */ 1787 */
1677asmlinkage int printk(const char *fmt, ...) 1788asmlinkage __visible int printk(const char *fmt, ...)
1678{ 1789{
1679 va_list args; 1790 va_list args;
1680 int r; 1791 int r;
@@ -1737,7 +1848,7 @@ void early_vprintk(const char *fmt, va_list ap)
1737 } 1848 }
1738} 1849}
1739 1850
1740asmlinkage void early_printk(const char *fmt, ...) 1851asmlinkage __visible void early_printk(const char *fmt, ...)
1741{ 1852{
1742 va_list ap; 1853 va_list ap;
1743 1854
@@ -1882,16 +1993,14 @@ void suspend_console(void)
1882 printk("Suspending console(s) (use no_console_suspend to debug)\n"); 1993 printk("Suspending console(s) (use no_console_suspend to debug)\n");
1883 console_lock(); 1994 console_lock();
1884 console_suspended = 1; 1995 console_suspended = 1;
1885 up(&console_sem); 1996 up_console_sem();
1886 mutex_release(&console_lock_dep_map, 1, _RET_IP_);
1887} 1997}
1888 1998
1889void resume_console(void) 1999void resume_console(void)
1890{ 2000{
1891 if (!console_suspend_enabled) 2001 if (!console_suspend_enabled)
1892 return; 2002 return;
1893 down(&console_sem); 2003 down_console_sem();
1894 mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_);
1895 console_suspended = 0; 2004 console_suspended = 0;
1896 console_unlock(); 2005 console_unlock();
1897} 2006}
@@ -1933,12 +2042,11 @@ void console_lock(void)
1933{ 2042{
1934 might_sleep(); 2043 might_sleep();
1935 2044
1936 down(&console_sem); 2045 down_console_sem();
1937 if (console_suspended) 2046 if (console_suspended)
1938 return; 2047 return;
1939 console_locked = 1; 2048 console_locked = 1;
1940 console_may_schedule = 1; 2049 console_may_schedule = 1;
1941 mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_);
1942} 2050}
1943EXPORT_SYMBOL(console_lock); 2051EXPORT_SYMBOL(console_lock);
1944 2052
@@ -1952,15 +2060,14 @@ EXPORT_SYMBOL(console_lock);
1952 */ 2060 */
1953int console_trylock(void) 2061int console_trylock(void)
1954{ 2062{
1955 if (down_trylock(&console_sem)) 2063 if (down_trylock_console_sem())
1956 return 0; 2064 return 0;
1957 if (console_suspended) { 2065 if (console_suspended) {
1958 up(&console_sem); 2066 up_console_sem();
1959 return 0; 2067 return 0;
1960 } 2068 }
1961 console_locked = 1; 2069 console_locked = 1;
1962 console_may_schedule = 0; 2070 console_may_schedule = 0;
1963 mutex_acquire(&console_lock_dep_map, 0, 1, _RET_IP_);
1964 return 1; 2071 return 1;
1965} 2072}
1966EXPORT_SYMBOL(console_trylock); 2073EXPORT_SYMBOL(console_trylock);
@@ -2022,7 +2129,7 @@ void console_unlock(void)
2022 bool retry; 2129 bool retry;
2023 2130
2024 if (console_suspended) { 2131 if (console_suspended) {
2025 up(&console_sem); 2132 up_console_sem();
2026 return; 2133 return;
2027 } 2134 }
2028 2135
@@ -2043,10 +2150,15 @@ again:
2043 } 2150 }
2044 2151
2045 if (console_seq < log_first_seq) { 2152 if (console_seq < log_first_seq) {
2153 len = sprintf(text, "** %u printk messages dropped ** ",
2154 (unsigned)(log_first_seq - console_seq));
2155
2046 /* messages are gone, move to first one */ 2156 /* messages are gone, move to first one */
2047 console_seq = log_first_seq; 2157 console_seq = log_first_seq;
2048 console_idx = log_first_idx; 2158 console_idx = log_first_idx;
2049 console_prev = 0; 2159 console_prev = 0;
2160 } else {
2161 len = 0;
2050 } 2162 }
2051skip: 2163skip:
2052 if (console_seq == log_next_seq) 2164 if (console_seq == log_next_seq)
@@ -2071,8 +2183,8 @@ skip:
2071 } 2183 }
2072 2184
2073 level = msg->level; 2185 level = msg->level;
2074 len = msg_print_text(msg, console_prev, false, 2186 len += msg_print_text(msg, console_prev, false,
2075 text, sizeof(text)); 2187 text + len, sizeof(text) - len);
2076 console_idx = log_next(console_idx); 2188 console_idx = log_next(console_idx);
2077 console_seq++; 2189 console_seq++;
2078 console_prev = msg->flags; 2190 console_prev = msg->flags;
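
A small sketch of the new dropped-messages handling above: the "** %u printk messages dropped **" notice is formatted first and the next record's text is appended behind it, so both leave in a single console write. The buffer size and sequence numbers are illustrative.

	#include <stdio.h>

	int main(void)
	{
		char text[128];
		unsigned long log_first_seq = 110, console_seq = 100;
		size_t len = 0;

		if (console_seq < log_first_seq) {
			len = sprintf(text, "** %u printk messages dropped ** ",
				      (unsigned)(log_first_seq - console_seq));
			console_seq = log_first_seq;	/* jump to the oldest survivor */
		}
		len += snprintf(text + len, sizeof(text) - len, "%s", "next record text");
		printf("%s (len=%zu)\n", text, len);
		return 0;
	}
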
@@ -2084,7 +2196,6 @@ skip:
2084 local_irq_restore(flags); 2196 local_irq_restore(flags);
2085 } 2197 }
2086 console_locked = 0; 2198 console_locked = 0;
2087 mutex_release(&console_lock_dep_map, 1, _RET_IP_);
2088 2199
2089 /* Release the exclusive_console once it is used */ 2200 /* Release the exclusive_console once it is used */
2090 if (unlikely(exclusive_console)) 2201 if (unlikely(exclusive_console))
@@ -2092,7 +2203,7 @@ skip:
2092 2203
2093 raw_spin_unlock(&logbuf_lock); 2204 raw_spin_unlock(&logbuf_lock);
2094 2205
2095 up(&console_sem); 2206 up_console_sem();
2096 2207
2097 /* 2208 /*
2098 * Someone could have filled up the buffer again, so re-check if there's 2209 * Someone could have filled up the buffer again, so re-check if there's
@@ -2137,7 +2248,7 @@ void console_unblank(void)
2137 * oops_in_progress is set to 1.. 2248 * oops_in_progress is set to 1..
2138 */ 2249 */
2139 if (oops_in_progress) { 2250 if (oops_in_progress) {
2140 if (down_trylock(&console_sem) != 0) 2251 if (down_trylock_console_sem() != 0)
2141 return; 2252 return;
2142 } else 2253 } else
2143 console_lock(); 2254 console_lock();
@@ -2413,6 +2524,7 @@ int unregister_console(struct console *console)
2413 if (console_drivers != NULL && console->flags & CON_CONSDEV) 2524 if (console_drivers != NULL && console->flags & CON_CONSDEV)
2414 console_drivers->flags |= CON_CONSDEV; 2525 console_drivers->flags |= CON_CONSDEV;
2415 2526
2527 console->flags &= ~CON_ENABLED;
2416 console_unlock(); 2528 console_unlock();
2417 console_sysfs_notify(); 2529 console_sysfs_notify();
2418 return res; 2530 return res;
@@ -2437,21 +2549,19 @@ late_initcall(printk_late_init);
2437/* 2549/*
2438 * Delayed printk version, for scheduler-internal messages: 2550 * Delayed printk version, for scheduler-internal messages:
2439 */ 2551 */
2440#define PRINTK_BUF_SIZE 512
2441
2442#define PRINTK_PENDING_WAKEUP 0x01 2552#define PRINTK_PENDING_WAKEUP 0x01
2443#define PRINTK_PENDING_SCHED 0x02 2553#define PRINTK_PENDING_OUTPUT 0x02
2444 2554
2445static DEFINE_PER_CPU(int, printk_pending); 2555static DEFINE_PER_CPU(int, printk_pending);
2446static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf);
2447 2556
2448static void wake_up_klogd_work_func(struct irq_work *irq_work) 2557static void wake_up_klogd_work_func(struct irq_work *irq_work)
2449{ 2558{
2450 int pending = __this_cpu_xchg(printk_pending, 0); 2559 int pending = __this_cpu_xchg(printk_pending, 0);
2451 2560
2452 if (pending & PRINTK_PENDING_SCHED) { 2561 if (pending & PRINTK_PENDING_OUTPUT) {
2453 char *buf = __get_cpu_var(printk_sched_buf); 2562 /* If trylock fails, someone else is doing the printing */
2454 pr_warn("[sched_delayed] %s", buf); 2563 if (console_trylock())
2564 console_unlock();
2455 } 2565 }
2456 2566
2457 if (pending & PRINTK_PENDING_WAKEUP) 2567 if (pending & PRINTK_PENDING_WAKEUP)
@@ -2473,23 +2583,19 @@ void wake_up_klogd(void)
2473 preempt_enable(); 2583 preempt_enable();
2474} 2584}
2475 2585
2476int printk_sched(const char *fmt, ...) 2586int printk_deferred(const char *fmt, ...)
2477{ 2587{
2478 unsigned long flags;
2479 va_list args; 2588 va_list args;
2480 char *buf;
2481 int r; 2589 int r;
2482 2590
2483 local_irq_save(flags); 2591 preempt_disable();
2484 buf = __get_cpu_var(printk_sched_buf);
2485
2486 va_start(args, fmt); 2592 va_start(args, fmt);
2487 r = vsnprintf(buf, PRINTK_BUF_SIZE, fmt, args); 2593 r = vprintk_emit(0, SCHED_MESSAGE_LOGLEVEL, NULL, 0, fmt, args);
2488 va_end(args); 2594 va_end(args);
2489 2595
2490 __this_cpu_or(printk_pending, PRINTK_PENDING_SCHED); 2596 __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT);
2491 irq_work_queue(&__get_cpu_var(wake_up_klogd_work)); 2597 irq_work_queue(&__get_cpu_var(wake_up_klogd_work));
2492 local_irq_restore(flags); 2598 preempt_enable();
2493 2599
2494 return r; 2600 return r;
2495} 2601}
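
A hedged usage sketch of the printk_deferred() interface introduced above. It builds only in-tree, and the surrounding function and message are illustrative rather than part of this diff; the point is that code which may hold the scheduler's rq lock stores the record immediately and leaves the console flush to the irq_work, instead of risking recursion on console_sem via plain printk().

	#include <linux/printk.h>

	/* Illustrative caller, not an actual kernel/sched/ hunk from this diff. */
	static void note_throttling(int cpu)
	{
		printk_deferred("sched: RT throttling activated on CPU %d\n", cpu);
	}
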
diff --git a/kernel/profile.c b/kernel/profile.c
index cb980f0c731b..54bf5ba26420 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -52,9 +52,9 @@ static DEFINE_MUTEX(profile_flip_mutex);
52 52
53int profile_setup(char *str) 53int profile_setup(char *str)
54{ 54{
55 static char schedstr[] = "schedule"; 55 static const char schedstr[] = "schedule";
56 static char sleepstr[] = "sleep"; 56 static const char sleepstr[] = "sleep";
57 static char kvmstr[] = "kvm"; 57 static const char kvmstr[] = "kvm";
58 int par; 58 int par;
59 59
60 if (!strncmp(str, sleepstr, strlen(sleepstr))) { 60 if (!strncmp(str, sleepstr, strlen(sleepstr))) {
@@ -64,12 +64,10 @@ int profile_setup(char *str)
64 str += strlen(sleepstr) + 1; 64 str += strlen(sleepstr) + 1;
65 if (get_option(&str, &par)) 65 if (get_option(&str, &par))
66 prof_shift = par; 66 prof_shift = par;
67 printk(KERN_INFO 67 pr_info("kernel sleep profiling enabled (shift: %ld)\n",
68 "kernel sleep profiling enabled (shift: %ld)\n",
69 prof_shift); 68 prof_shift);
70#else 69#else
71 printk(KERN_WARNING 70 pr_warn("kernel sleep profiling requires CONFIG_SCHEDSTATS\n");
72 "kernel sleep profiling requires CONFIG_SCHEDSTATS\n");
73#endif /* CONFIG_SCHEDSTATS */ 71#endif /* CONFIG_SCHEDSTATS */
74 } else if (!strncmp(str, schedstr, strlen(schedstr))) { 72 } else if (!strncmp(str, schedstr, strlen(schedstr))) {
75 prof_on = SCHED_PROFILING; 73 prof_on = SCHED_PROFILING;
@@ -77,8 +75,7 @@ int profile_setup(char *str)
77 str += strlen(schedstr) + 1; 75 str += strlen(schedstr) + 1;
78 if (get_option(&str, &par)) 76 if (get_option(&str, &par))
79 prof_shift = par; 77 prof_shift = par;
80 printk(KERN_INFO 78 pr_info("kernel schedule profiling enabled (shift: %ld)\n",
81 "kernel schedule profiling enabled (shift: %ld)\n",
82 prof_shift); 79 prof_shift);
83 } else if (!strncmp(str, kvmstr, strlen(kvmstr))) { 80 } else if (!strncmp(str, kvmstr, strlen(kvmstr))) {
84 prof_on = KVM_PROFILING; 81 prof_on = KVM_PROFILING;
@@ -86,13 +83,12 @@ int profile_setup(char *str)
86 str += strlen(kvmstr) + 1; 83 str += strlen(kvmstr) + 1;
87 if (get_option(&str, &par)) 84 if (get_option(&str, &par))
88 prof_shift = par; 85 prof_shift = par;
89 printk(KERN_INFO 86 pr_info("kernel KVM profiling enabled (shift: %ld)\n",
90 "kernel KVM profiling enabled (shift: %ld)\n",
91 prof_shift); 87 prof_shift);
92 } else if (get_option(&str, &par)) { 88 } else if (get_option(&str, &par)) {
93 prof_shift = par; 89 prof_shift = par;
94 prof_on = CPU_PROFILING; 90 prof_on = CPU_PROFILING;
95 printk(KERN_INFO "kernel profiling enabled (shift: %ld)\n", 91 pr_info("kernel profiling enabled (shift: %ld)\n",
96 prof_shift); 92 prof_shift);
97 } 93 }
98 return 1; 94 return 1;
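
For reference, the strings parsed above belong to the `profile=` boot parameter; hedged examples of the accepted forms (the shift value after the comma is optional where the code calls get_option(), and the sleep form needs CONFIG_SCHEDSTATS, as the warning above says):

	profile=2             (plain CPU profiling, shift 2)
	profile=schedule,5    (scheduler profiling)
	profile=sleep,4       (sleep profiling; requires CONFIG_SCHEDSTATS)
	profile=kvm,3         (KVM profiling)
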
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index bd30bc61bc05..7fa34f86e5ba 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -58,9 +58,11 @@ torture_param(int, fqs_duration, 0,
58 "Duration of fqs bursts (us), 0 to disable"); 58 "Duration of fqs bursts (us), 0 to disable");
59torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)"); 59torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)");
60torture_param(int, fqs_stutter, 3, "Wait time between fqs bursts (s)"); 60torture_param(int, fqs_stutter, 3, "Wait time between fqs bursts (s)");
61torture_param(bool, gp_cond, false, "Use conditional/async GP wait primitives");
61torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); 62torture_param(bool, gp_exp, false, "Use expedited GP wait primitives");
62torture_param(bool, gp_normal, false, 63torture_param(bool, gp_normal, false,
63 "Use normal (non-expedited) GP wait primitives"); 64 "Use normal (non-expedited) GP wait primitives");
65torture_param(bool, gp_sync, false, "Use synchronous GP wait primitives");
64torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers"); 66torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers");
65torture_param(int, n_barrier_cbs, 0, 67torture_param(int, n_barrier_cbs, 0,
66 "# of callbacks/kthreads for barrier testing"); 68 "# of callbacks/kthreads for barrier testing");
@@ -138,6 +140,18 @@ static long n_barrier_attempts;
138static long n_barrier_successes; 140static long n_barrier_successes;
139static struct list_head rcu_torture_removed; 141static struct list_head rcu_torture_removed;
140 142
143static int rcu_torture_writer_state;
144#define RTWS_FIXED_DELAY 0
145#define RTWS_DELAY 1
146#define RTWS_REPLACE 2
147#define RTWS_DEF_FREE 3
148#define RTWS_EXP_SYNC 4
149#define RTWS_COND_GET 5
150#define RTWS_COND_SYNC 6
151#define RTWS_SYNC 7
152#define RTWS_STUTTER 8
153#define RTWS_STOPPING 9
154
141#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) 155#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE)
142#define RCUTORTURE_RUNNABLE_INIT 1 156#define RCUTORTURE_RUNNABLE_INIT 1
143#else 157#else
@@ -214,6 +228,7 @@ rcu_torture_free(struct rcu_torture *p)
214 */ 228 */
215 229
216struct rcu_torture_ops { 230struct rcu_torture_ops {
231 int ttype;
217 void (*init)(void); 232 void (*init)(void);
218 int (*readlock)(void); 233 int (*readlock)(void);
219 void (*read_delay)(struct torture_random_state *rrsp); 234 void (*read_delay)(struct torture_random_state *rrsp);
@@ -222,6 +237,8 @@ struct rcu_torture_ops {
222 void (*deferred_free)(struct rcu_torture *p); 237 void (*deferred_free)(struct rcu_torture *p);
223 void (*sync)(void); 238 void (*sync)(void);
224 void (*exp_sync)(void); 239 void (*exp_sync)(void);
240 unsigned long (*get_state)(void);
241 void (*cond_sync)(unsigned long oldstate);
225 void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); 242 void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
226 void (*cb_barrier)(void); 243 void (*cb_barrier)(void);
227 void (*fqs)(void); 244 void (*fqs)(void);
@@ -273,10 +290,48 @@ static int rcu_torture_completed(void)
273 return rcu_batches_completed(); 290 return rcu_batches_completed();
274} 291}
275 292
293/*
294 * Update callback in the pipe. This should be invoked after a grace period.
295 */
296static bool
297rcu_torture_pipe_update_one(struct rcu_torture *rp)
298{
299 int i;
300
301 i = rp->rtort_pipe_count;
302 if (i > RCU_TORTURE_PIPE_LEN)
303 i = RCU_TORTURE_PIPE_LEN;
304 atomic_inc(&rcu_torture_wcount[i]);
305 if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
306 rp->rtort_mbtest = 0;
307 return true;
308 }
309 return false;
310}
311
312/*
313 * Update all callbacks in the pipe. Suitable for synchronous grace-period
314 * primitives.
315 */
316static void
317rcu_torture_pipe_update(struct rcu_torture *old_rp)
318{
319 struct rcu_torture *rp;
320 struct rcu_torture *rp1;
321
322 if (old_rp)
323 list_add(&old_rp->rtort_free, &rcu_torture_removed);
324 list_for_each_entry_safe(rp, rp1, &rcu_torture_removed, rtort_free) {
325 if (rcu_torture_pipe_update_one(rp)) {
326 list_del(&rp->rtort_free);
327 rcu_torture_free(rp);
328 }
329 }
330}
331
276static void 332static void
277rcu_torture_cb(struct rcu_head *p) 333rcu_torture_cb(struct rcu_head *p)
278{ 334{
279 int i;
280 struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); 335 struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu);
281 336
282 if (torture_must_stop_irq()) { 337 if (torture_must_stop_irq()) {
@@ -284,16 +339,10 @@ rcu_torture_cb(struct rcu_head *p)
284 /* The next initialization will pick up the pieces. */ 339 /* The next initialization will pick up the pieces. */
285 return; 340 return;
286 } 341 }
287 i = rp->rtort_pipe_count; 342 if (rcu_torture_pipe_update_one(rp))
288 if (i > RCU_TORTURE_PIPE_LEN)
289 i = RCU_TORTURE_PIPE_LEN;
290 atomic_inc(&rcu_torture_wcount[i]);
291 if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
292 rp->rtort_mbtest = 0;
293 rcu_torture_free(rp); 343 rcu_torture_free(rp);
294 } else { 344 else
295 cur_ops->deferred_free(rp); 345 cur_ops->deferred_free(rp);
296 }
297} 346}
298 347
299static int rcu_no_completed(void) 348static int rcu_no_completed(void)
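
A standalone model of the rcu_torture_pipe_update_one() helper factored out above: an element advances one slot per grace period through a fixed-length pipe, a histogram records how far each element got, and the element may be freed once it has traversed the whole pipe. The constants are illustrative.

	#include <stdbool.h>
	#include <stdio.h>

	#define PIPE_LEN 10

	static int wcount[PIPE_LEN + 1];	/* histogram of how far elements got */

	struct elem { int pipe_count; int mbtest; };

	static bool pipe_update_one(struct elem *e)
	{
		int i = e->pipe_count;

		if (i > PIPE_LEN)
			i = PIPE_LEN;
		wcount[i]++;
		if (++e->pipe_count >= PIPE_LEN) {
			e->mbtest = 0;		/* no longer participating in the test */
			return true;		/* caller may free it now */
		}
		return false;
	}

	int main(void)
	{
		struct elem e = { .pipe_count = 0, .mbtest = 1 };
		int gps = 1;

		while (!pipe_update_one(&e))
			gps++;
		printf("freed after %d grace periods\n", gps);
		return 0;
	}
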
@@ -312,6 +361,7 @@ static void rcu_sync_torture_init(void)
312} 361}
313 362
314static struct rcu_torture_ops rcu_ops = { 363static struct rcu_torture_ops rcu_ops = {
364 .ttype = RCU_FLAVOR,
315 .init = rcu_sync_torture_init, 365 .init = rcu_sync_torture_init,
316 .readlock = rcu_torture_read_lock, 366 .readlock = rcu_torture_read_lock,
317 .read_delay = rcu_read_delay, 367 .read_delay = rcu_read_delay,
@@ -320,6 +370,8 @@ static struct rcu_torture_ops rcu_ops = {
320 .deferred_free = rcu_torture_deferred_free, 370 .deferred_free = rcu_torture_deferred_free,
321 .sync = synchronize_rcu, 371 .sync = synchronize_rcu,
322 .exp_sync = synchronize_rcu_expedited, 372 .exp_sync = synchronize_rcu_expedited,
373 .get_state = get_state_synchronize_rcu,
374 .cond_sync = cond_synchronize_rcu,
323 .call = call_rcu, 375 .call = call_rcu,
324 .cb_barrier = rcu_barrier, 376 .cb_barrier = rcu_barrier,
325 .fqs = rcu_force_quiescent_state, 377 .fqs = rcu_force_quiescent_state,
@@ -355,6 +407,7 @@ static void rcu_bh_torture_deferred_free(struct rcu_torture *p)
355} 407}
356 408
357static struct rcu_torture_ops rcu_bh_ops = { 409static struct rcu_torture_ops rcu_bh_ops = {
410 .ttype = RCU_BH_FLAVOR,
358 .init = rcu_sync_torture_init, 411 .init = rcu_sync_torture_init,
359 .readlock = rcu_bh_torture_read_lock, 412 .readlock = rcu_bh_torture_read_lock,
360 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 413 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
@@ -397,6 +450,7 @@ call_rcu_busted(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
397} 450}
398 451
399static struct rcu_torture_ops rcu_busted_ops = { 452static struct rcu_torture_ops rcu_busted_ops = {
453 .ttype = INVALID_RCU_FLAVOR,
400 .init = rcu_sync_torture_init, 454 .init = rcu_sync_torture_init,
401 .readlock = rcu_torture_read_lock, 455 .readlock = rcu_torture_read_lock,
402 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 456 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
@@ -479,9 +533,11 @@ static void srcu_torture_stats(char *page)
479 page += sprintf(page, "%s%s per-CPU(idx=%d):", 533 page += sprintf(page, "%s%s per-CPU(idx=%d):",
480 torture_type, TORTURE_FLAG, idx); 534 torture_type, TORTURE_FLAG, idx);
481 for_each_possible_cpu(cpu) { 535 for_each_possible_cpu(cpu) {
482 page += sprintf(page, " %d(%lu,%lu)", cpu, 536 long c0, c1;
483 per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx], 537
484 per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]); 538 c0 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx];
539 c1 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx];
540 page += sprintf(page, " %d(%ld,%ld)", cpu, c0, c1);
485 } 541 }
486 sprintf(page, "\n"); 542 sprintf(page, "\n");
487} 543}
@@ -492,6 +548,7 @@ static void srcu_torture_synchronize_expedited(void)
492} 548}
493 549
494static struct rcu_torture_ops srcu_ops = { 550static struct rcu_torture_ops srcu_ops = {
551 .ttype = SRCU_FLAVOR,
495 .init = rcu_sync_torture_init, 552 .init = rcu_sync_torture_init,
496 .readlock = srcu_torture_read_lock, 553 .readlock = srcu_torture_read_lock,
497 .read_delay = srcu_read_delay, 554 .read_delay = srcu_read_delay,
@@ -527,6 +584,7 @@ static void rcu_sched_torture_deferred_free(struct rcu_torture *p)
527} 584}
528 585
529static struct rcu_torture_ops sched_ops = { 586static struct rcu_torture_ops sched_ops = {
587 .ttype = RCU_SCHED_FLAVOR,
530 .init = rcu_sync_torture_init, 588 .init = rcu_sync_torture_init,
531 .readlock = sched_torture_read_lock, 589 .readlock = sched_torture_read_lock,
532 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 590 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
@@ -688,23 +746,59 @@ rcu_torture_fqs(void *arg)
688static int 746static int
689rcu_torture_writer(void *arg) 747rcu_torture_writer(void *arg)
690{ 748{
691 bool exp; 749 unsigned long gp_snap;
750 bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal;
751 bool gp_sync1 = gp_sync;
692 int i; 752 int i;
693 struct rcu_torture *rp; 753 struct rcu_torture *rp;
694 struct rcu_torture *rp1;
695 struct rcu_torture *old_rp; 754 struct rcu_torture *old_rp;
696 static DEFINE_TORTURE_RANDOM(rand); 755 static DEFINE_TORTURE_RANDOM(rand);
756 int synctype[] = { RTWS_DEF_FREE, RTWS_EXP_SYNC,
757 RTWS_COND_GET, RTWS_SYNC };
758 int nsynctypes = 0;
697 759
698 VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); 760 VERBOSE_TOROUT_STRING("rcu_torture_writer task started");
699 set_user_nice(current, MAX_NICE); 761
762 /* Initialize synctype[] array. If none set, take default. */
763 if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync)
764 gp_cond1 = gp_exp1 = gp_normal1 = gp_sync1 = true;
765 if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync)
766 synctype[nsynctypes++] = RTWS_COND_GET;
767 else if (gp_cond && (!cur_ops->get_state || !cur_ops->cond_sync))
768 pr_alert("rcu_torture_writer: gp_cond without primitives.\n");
769 if (gp_exp1 && cur_ops->exp_sync)
770 synctype[nsynctypes++] = RTWS_EXP_SYNC;
771 else if (gp_exp && !cur_ops->exp_sync)
772 pr_alert("rcu_torture_writer: gp_exp without primitives.\n");
773 if (gp_normal1 && cur_ops->deferred_free)
774 synctype[nsynctypes++] = RTWS_DEF_FREE;
775 else if (gp_normal && !cur_ops->deferred_free)
776 pr_alert("rcu_torture_writer: gp_normal without primitives.\n");
777 if (gp_sync1 && cur_ops->sync)
778 synctype[nsynctypes++] = RTWS_SYNC;
779 else if (gp_sync && !cur_ops->sync)
780 pr_alert("rcu_torture_writer: gp_sync without primitives.\n");
781 if (WARN_ONCE(nsynctypes == 0,
782 "rcu_torture_writer: No update-side primitives.\n")) {
783 /*
 784		 * No update-side primitives, so don't try updating.
785 * The resulting test won't be testing much, hence the
786 * above WARN_ONCE().
787 */
788 rcu_torture_writer_state = RTWS_STOPPING;
789 torture_kthread_stopping("rcu_torture_writer");
790 }
700 791
701 do { 792 do {
793 rcu_torture_writer_state = RTWS_FIXED_DELAY;
702 schedule_timeout_uninterruptible(1); 794 schedule_timeout_uninterruptible(1);
703 rp = rcu_torture_alloc(); 795 rp = rcu_torture_alloc();
704 if (rp == NULL) 796 if (rp == NULL)
705 continue; 797 continue;
706 rp->rtort_pipe_count = 0; 798 rp->rtort_pipe_count = 0;
799 rcu_torture_writer_state = RTWS_DELAY;
707 udelay(torture_random(&rand) & 0x3ff); 800 udelay(torture_random(&rand) & 0x3ff);
801 rcu_torture_writer_state = RTWS_REPLACE;
708 old_rp = rcu_dereference_check(rcu_torture_current, 802 old_rp = rcu_dereference_check(rcu_torture_current,
709 current == writer_task); 803 current == writer_task);
710 rp->rtort_mbtest = 1; 804 rp->rtort_mbtest = 1;
@@ -716,35 +810,42 @@ rcu_torture_writer(void *arg)
716 i = RCU_TORTURE_PIPE_LEN; 810 i = RCU_TORTURE_PIPE_LEN;
717 atomic_inc(&rcu_torture_wcount[i]); 811 atomic_inc(&rcu_torture_wcount[i]);
718 old_rp->rtort_pipe_count++; 812 old_rp->rtort_pipe_count++;
719 if (gp_normal == gp_exp) 813 switch (synctype[torture_random(&rand) % nsynctypes]) {
720 exp = !!(torture_random(&rand) & 0x80); 814 case RTWS_DEF_FREE:
721 else 815 rcu_torture_writer_state = RTWS_DEF_FREE;
722 exp = gp_exp;
723 if (!exp) {
724 cur_ops->deferred_free(old_rp); 816 cur_ops->deferred_free(old_rp);
725 } else { 817 break;
818 case RTWS_EXP_SYNC:
819 rcu_torture_writer_state = RTWS_EXP_SYNC;
726 cur_ops->exp_sync(); 820 cur_ops->exp_sync();
727 list_add(&old_rp->rtort_free, 821 rcu_torture_pipe_update(old_rp);
728 &rcu_torture_removed); 822 break;
729 list_for_each_entry_safe(rp, rp1, 823 case RTWS_COND_GET:
730 &rcu_torture_removed, 824 rcu_torture_writer_state = RTWS_COND_GET;
731 rtort_free) { 825 gp_snap = cur_ops->get_state();
732 i = rp->rtort_pipe_count; 826 i = torture_random(&rand) % 16;
733 if (i > RCU_TORTURE_PIPE_LEN) 827 if (i != 0)
734 i = RCU_TORTURE_PIPE_LEN; 828 schedule_timeout_interruptible(i);
735 atomic_inc(&rcu_torture_wcount[i]); 829 udelay(torture_random(&rand) % 1000);
736 if (++rp->rtort_pipe_count >= 830 rcu_torture_writer_state = RTWS_COND_SYNC;
737 RCU_TORTURE_PIPE_LEN) { 831 cur_ops->cond_sync(gp_snap);
738 rp->rtort_mbtest = 0; 832 rcu_torture_pipe_update(old_rp);
739 list_del(&rp->rtort_free); 833 break;
740 rcu_torture_free(rp); 834 case RTWS_SYNC:
741 } 835 rcu_torture_writer_state = RTWS_SYNC;
742 } 836 cur_ops->sync();
837 rcu_torture_pipe_update(old_rp);
838 break;
839 default:
840 WARN_ON_ONCE(1);
841 break;
743 } 842 }
744 } 843 }
745 rcutorture_record_progress(++rcu_torture_current_version); 844 rcutorture_record_progress(++rcu_torture_current_version);
845 rcu_torture_writer_state = RTWS_STUTTER;
746 stutter_wait("rcu_torture_writer"); 846 stutter_wait("rcu_torture_writer");
747 } while (!torture_must_stop()); 847 } while (!torture_must_stop());
848 rcu_torture_writer_state = RTWS_STOPPING;
748 torture_kthread_stopping("rcu_torture_writer"); 849 torture_kthread_stopping("rcu_torture_writer");
749 return 0; 850 return 0;
750} 851}
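
A userspace sketch of the writer's new dispatch above: only the grace-period-wait methods that the current flavor actually provides are entered into synctype[], and each update then picks one of them at random. The flags and method names below are stand-ins for the cur_ops checks and RTWS_* states.

	#include <stdbool.h>
	#include <stdio.h>
	#include <stdlib.h>

	enum { WS_DEF_FREE, WS_EXP_SYNC, WS_COND, WS_SYNC };

	int main(void)
	{
		/* pretend this flavor lacks the conditional primitives */
		bool have_cond = false, have_exp = true;
		bool have_def_free = true, have_sync = true;
		int synctype[4], n = 0;

		if (have_cond)
			synctype[n++] = WS_COND;
		if (have_exp)
			synctype[n++] = WS_EXP_SYNC;
		if (have_def_free)
			synctype[n++] = WS_DEF_FREE;
		if (have_sync)
			synctype[n++] = WS_SYNC;
		if (n == 0) {
			fprintf(stderr, "no update-side primitives, nothing to test\n");
			return 1;
		}
		for (int i = 0; i < 5; i++)
			printf("update %d uses method %d\n", i, synctype[rand() % n]);
		return 0;
	}
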
@@ -784,7 +885,7 @@ rcu_torture_fakewriter(void *arg)
784 return 0; 885 return 0;
785} 886}
786 887
787void rcutorture_trace_dump(void) 888static void rcutorture_trace_dump(void)
788{ 889{
789 static atomic_t beenhere = ATOMIC_INIT(0); 890 static atomic_t beenhere = ATOMIC_INIT(0);
790 891
@@ -918,11 +1019,13 @@ rcu_torture_reader(void *arg)
918 __this_cpu_inc(rcu_torture_batch[completed]); 1019 __this_cpu_inc(rcu_torture_batch[completed]);
919 preempt_enable(); 1020 preempt_enable();
920 cur_ops->readunlock(idx); 1021 cur_ops->readunlock(idx);
921 schedule(); 1022 cond_resched();
922 stutter_wait("rcu_torture_reader"); 1023 stutter_wait("rcu_torture_reader");
923 } while (!torture_must_stop()); 1024 } while (!torture_must_stop());
924 if (irqreader && cur_ops->irq_capable) 1025 if (irqreader && cur_ops->irq_capable) {
925 del_timer_sync(&t); 1026 del_timer_sync(&t);
1027 destroy_timer_on_stack(&t);
1028 }
926 torture_kthread_stopping("rcu_torture_reader"); 1029 torture_kthread_stopping("rcu_torture_reader");
927 return 0; 1030 return 0;
928} 1031}
@@ -937,6 +1040,7 @@ rcu_torture_printk(char *page)
937 int i; 1040 int i;
938 long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; 1041 long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
939 long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; 1042 long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
1043 static unsigned long rtcv_snap = ULONG_MAX;
940 1044
941 for_each_possible_cpu(cpu) { 1045 for_each_possible_cpu(cpu) {
942 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { 1046 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
@@ -997,6 +1101,22 @@ rcu_torture_printk(char *page)
997 page += sprintf(page, "\n"); 1101 page += sprintf(page, "\n");
998 if (cur_ops->stats) 1102 if (cur_ops->stats)
999 cur_ops->stats(page); 1103 cur_ops->stats(page);
1104 if (rtcv_snap == rcu_torture_current_version &&
1105 rcu_torture_current != NULL) {
1106 int __maybe_unused flags;
1107 unsigned long __maybe_unused gpnum;
1108 unsigned long __maybe_unused completed;
1109
1110 rcutorture_get_gp_data(cur_ops->ttype,
1111 &flags, &gpnum, &completed);
1112 page += sprintf(page,
1113 "??? Writer stall state %d g%lu c%lu f%#x\n",
1114 rcu_torture_writer_state,
1115 gpnum, completed, flags);
1116 show_rcu_gp_kthreads();
1117 rcutorture_trace_dump();
1118 }
1119 rtcv_snap = rcu_torture_current_version;
1000} 1120}
1001 1121
1002/* 1122/*
@@ -1146,7 +1266,7 @@ static int __init rcu_torture_stall_init(void)
1146} 1266}
1147 1267
1148/* Callback function for RCU barrier testing. */ 1268/* Callback function for RCU barrier testing. */
1149void rcu_torture_barrier_cbf(struct rcu_head *rcu) 1269static void rcu_torture_barrier_cbf(struct rcu_head *rcu)
1150{ 1270{
1151 atomic_inc(&barrier_cbs_invoked); 1271 atomic_inc(&barrier_cbs_invoked);
1152} 1272}
@@ -1416,7 +1536,8 @@ rcu_torture_init(void)
1416 &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &sched_ops, 1536 &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &sched_ops,
1417 }; 1537 };
1418 1538
1419 torture_init_begin(torture_type, verbose, &rcutorture_runnable); 1539 if (!torture_init_begin(torture_type, verbose, &rcutorture_runnable))
1540 return -EBUSY;
1420 1541
1421 /* Process args and tell the world that the torturer is on the job. */ 1542 /* Process args and tell the world that the torturer is on the job. */
1422 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { 1543 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) {
@@ -1441,10 +1562,13 @@ rcu_torture_init(void)
1441 if (cur_ops->init) 1562 if (cur_ops->init)
1442 cur_ops->init(); /* no "goto unwind" prior to this point!!! */ 1563 cur_ops->init(); /* no "goto unwind" prior to this point!!! */
1443 1564
1444 if (nreaders >= 0) 1565 if (nreaders >= 0) {
1445 nrealreaders = nreaders; 1566 nrealreaders = nreaders;
1446 else 1567 } else {
1447 nrealreaders = 2 * num_online_cpus(); 1568 nrealreaders = num_online_cpus() - 1;
1569 if (nrealreaders <= 0)
1570 nrealreaders = 1;
1571 }
1448 rcu_torture_print_module_parms(cur_ops, "Start of test"); 1572 rcu_torture_print_module_parms(cur_ops, "Start of test");
1449 1573
1450 /* Set up the freelist. */ 1574 /* Set up the freelist. */
@@ -1533,7 +1657,8 @@ rcu_torture_init(void)
1533 fqs_duration = 0; 1657 fqs_duration = 0;
1534 if (fqs_duration) { 1658 if (fqs_duration) {
1535 /* Create the fqs thread */ 1659 /* Create the fqs thread */
1536 torture_create_kthread(rcu_torture_fqs, NULL, fqs_task); 1660 firsterr = torture_create_kthread(rcu_torture_fqs, NULL,
1661 fqs_task);
1537 if (firsterr) 1662 if (firsterr)
1538 goto unwind; 1663 goto unwind;
1539 } 1664 }
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h
index 431528520562..858c56569127 100644
--- a/kernel/rcu/tiny_plugin.h
+++ b/kernel/rcu/tiny_plugin.h
@@ -144,7 +144,7 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp)
144 return; 144 return;
145 rcp->ticks_this_gp++; 145 rcp->ticks_this_gp++;
146 j = jiffies; 146 j = jiffies;
147 js = rcp->jiffies_stall; 147 js = ACCESS_ONCE(rcp->jiffies_stall);
148 if (*rcp->curtail && ULONG_CMP_GE(j, js)) { 148 if (*rcp->curtail && ULONG_CMP_GE(j, js)) {
149 pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n", 149 pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n",
150 rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting, 150 rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting,
@@ -152,17 +152,17 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp)
152 dump_stack(); 152 dump_stack();
153 } 153 }
154 if (*rcp->curtail && ULONG_CMP_GE(j, js)) 154 if (*rcp->curtail && ULONG_CMP_GE(j, js))
155 rcp->jiffies_stall = jiffies + 155 ACCESS_ONCE(rcp->jiffies_stall) = jiffies +
156 3 * rcu_jiffies_till_stall_check() + 3; 156 3 * rcu_jiffies_till_stall_check() + 3;
157 else if (ULONG_CMP_GE(j, js)) 157 else if (ULONG_CMP_GE(j, js))
158 rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); 158 ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check();
159} 159}
160 160
161static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp) 161static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp)
162{ 162{
163 rcp->ticks_this_gp = 0; 163 rcp->ticks_this_gp = 0;
164 rcp->gp_start = jiffies; 164 rcp->gp_start = jiffies;
165 rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); 165 ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check();
166} 166}
167 167
168static void check_cpu_stalls(void) 168static void check_cpu_stalls(void)
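
The conversions above wrap jiffies_stall in ACCESS_ONCE() so the compiler emits exactly one load or store at each marked access instead of caching, refetching, or tearing it. A minimal userspace sketch using the classic volatile-cast definition (the kernel's real macro lives in compiler.h):

	#include <stdio.h>

	#define ACCESS_ONCE(x) (*(volatile __typeof__(x) *)&(x))

	static unsigned long jiffies_stall;

	int main(void)
	{
		ACCESS_ONCE(jiffies_stall) = 42;		/* single volatile store */
		unsigned long js = ACCESS_ONCE(jiffies_stall);	/* single volatile load  */

		printf("js=%lu\n", js);
		return 0;
	}
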
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 0c47e300210a..f1ba77363fbb 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -101,7 +101,7 @@ DEFINE_PER_CPU(struct rcu_data, sname##_data)
101RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); 101RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
102RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); 102RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh);
103 103
104static struct rcu_state *rcu_state; 104static struct rcu_state *rcu_state_p;
105LIST_HEAD(rcu_struct_flavors); 105LIST_HEAD(rcu_struct_flavors);
106 106
107/* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */ 107/* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */
@@ -243,7 +243,7 @@ static ulong jiffies_till_next_fqs = ULONG_MAX;
243module_param(jiffies_till_first_fqs, ulong, 0644); 243module_param(jiffies_till_first_fqs, ulong, 0644);
244module_param(jiffies_till_next_fqs, ulong, 0644); 244module_param(jiffies_till_next_fqs, ulong, 0644);
245 245
246static void rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, 246static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
247 struct rcu_data *rdp); 247 struct rcu_data *rdp);
248static void force_qs_rnp(struct rcu_state *rsp, 248static void force_qs_rnp(struct rcu_state *rsp,
249 int (*f)(struct rcu_data *rsp, bool *isidle, 249 int (*f)(struct rcu_data *rsp, bool *isidle,
@@ -271,6 +271,15 @@ long rcu_batches_completed_bh(void)
271EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); 271EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
272 272
273/* 273/*
274 * Force a quiescent state.
275 */
276void rcu_force_quiescent_state(void)
277{
278 force_quiescent_state(rcu_state_p);
279}
280EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
281
282/*
274 * Force a quiescent state for RCU BH. 283 * Force a quiescent state for RCU BH.
275 */ 284 */
276void rcu_bh_force_quiescent_state(void) 285void rcu_bh_force_quiescent_state(void)
@@ -280,6 +289,21 @@ void rcu_bh_force_quiescent_state(void)
280EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); 289EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
281 290
282/* 291/*
292 * Show the state of the grace-period kthreads.
293 */
294void show_rcu_gp_kthreads(void)
295{
296 struct rcu_state *rsp;
297
298 for_each_rcu_flavor(rsp) {
299 pr_info("%s: wait state: %d ->state: %#lx\n",
300 rsp->name, rsp->gp_state, rsp->gp_kthread->state);
301 /* sched_show_task(rsp->gp_kthread); */
302 }
303}
304EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads);
305
306/*
283 * Record the number of times rcutorture tests have been initiated and 307 * Record the number of times rcutorture tests have been initiated and
284 * terminated. This information allows the debugfs tracing stats to be 308 * terminated. This information allows the debugfs tracing stats to be
285 * correlated to the rcutorture messages, even when the rcutorture module 309 * correlated to the rcutorture messages, even when the rcutorture module
@@ -294,6 +318,39 @@ void rcutorture_record_test_transition(void)
294EXPORT_SYMBOL_GPL(rcutorture_record_test_transition); 318EXPORT_SYMBOL_GPL(rcutorture_record_test_transition);
295 319
296/* 320/*
321 * Send along grace-period-related data for rcutorture diagnostics.
322 */
323void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
324 unsigned long *gpnum, unsigned long *completed)
325{
326 struct rcu_state *rsp = NULL;
327
328 switch (test_type) {
329 case RCU_FLAVOR:
330 rsp = rcu_state_p;
331 break;
332 case RCU_BH_FLAVOR:
333 rsp = &rcu_bh_state;
334 break;
335 case RCU_SCHED_FLAVOR:
336 rsp = &rcu_sched_state;
337 break;
338 default:
339 break;
340 }
341 if (rsp != NULL) {
342 *flags = ACCESS_ONCE(rsp->gp_flags);
343 *gpnum = ACCESS_ONCE(rsp->gpnum);
344 *completed = ACCESS_ONCE(rsp->completed);
345 return;
346 }
347 *flags = 0;
348 *gpnum = 0;
349 *completed = 0;
350}
351EXPORT_SYMBOL_GPL(rcutorture_get_gp_data);
352
353/*
297 * Record the number of writer passes through the current rcutorture test. 354 * Record the number of writer passes through the current rcutorture test.
298 * This is also used to correlate debugfs tracing stats with the rcutorture 355 * This is also used to correlate debugfs tracing stats with the rcutorture
299 * messages. 356 * messages.
@@ -324,6 +381,28 @@ cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
324} 381}
325 382
326/* 383/*
384 * Return the root node of the specified rcu_state structure.
385 */
386static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
387{
388 return &rsp->node[0];
389}
390
391/*
392 * Is there any need for future grace periods?
393 * Interrupts must be disabled. If the caller does not hold the root
 394 * rcu_node structure's ->lock, the results are advisory only.
395 */
396static int rcu_future_needs_gp(struct rcu_state *rsp)
397{
398 struct rcu_node *rnp = rcu_get_root(rsp);
399 int idx = (ACCESS_ONCE(rnp->completed) + 1) & 0x1;
400 int *fp = &rnp->need_future_gp[idx];
401
402 return ACCESS_ONCE(*fp);
403}
404
405/*
327 * Does the current CPU require a not-yet-started grace period? 406 * Does the current CPU require a not-yet-started grace period?
328 * The caller must have disabled interrupts to prevent races with 407 * The caller must have disabled interrupts to prevent races with
329 * normal callback registry. 408 * normal callback registry.
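
A small sketch of the index computation in rcu_future_needs_gp() above: need_future_gp[] has two slots, and `(completed + 1) & 0x1` selects the slot that tracks the grace period after the one most recently completed. The values below are illustrative.

	#include <stdio.h>

	int main(void)
	{
		int need_future_gp[2] = { 0, 1 };	/* say a need was queued in slot 1 */
		unsigned long completed = 4;		/* grace period #4 has completed   */
		int idx = (completed + 1) & 0x1;	/* -> slot 1, i.e. grace period #5 */

		printf("future GP needed: %d (slot %d)\n", need_future_gp[idx], idx);
		return 0;
	}
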
@@ -335,7 +414,7 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
335 414
336 if (rcu_gp_in_progress(rsp)) 415 if (rcu_gp_in_progress(rsp))
337 return 0; /* No, a grace period is already in progress. */ 416 return 0; /* No, a grace period is already in progress. */
338 if (rcu_nocb_needs_gp(rsp)) 417 if (rcu_future_needs_gp(rsp))
339 return 1; /* Yes, a no-CBs CPU needs one. */ 418 return 1; /* Yes, a no-CBs CPU needs one. */
340 if (!rdp->nxttail[RCU_NEXT_TAIL]) 419 if (!rdp->nxttail[RCU_NEXT_TAIL])
341 return 0; /* No, this is a no-CBs (or offline) CPU. */ 420 return 0; /* No, this is a no-CBs (or offline) CPU. */
@@ -350,14 +429,6 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
350} 429}
351 430
352/* 431/*
353 * Return the root node of the specified rcu_state structure.
354 */
355static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
356{
357 return &rsp->node[0];
358}
359
360/*
361 * rcu_eqs_enter_common - current CPU is moving towards extended quiescent state 432 * rcu_eqs_enter_common - current CPU is moving towards extended quiescent state
362 * 433 *
363 * If the new value of the ->dynticks_nesting counter now is zero, 434 * If the new value of the ->dynticks_nesting counter now is zero,
@@ -387,9 +458,9 @@ static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
387 } 458 }
388 rcu_prepare_for_idle(smp_processor_id()); 459 rcu_prepare_for_idle(smp_processor_id());
389 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ 460 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
390 smp_mb__before_atomic_inc(); /* See above. */ 461 smp_mb__before_atomic(); /* See above. */
391 atomic_inc(&rdtp->dynticks); 462 atomic_inc(&rdtp->dynticks);
392 smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */ 463 smp_mb__after_atomic(); /* Force ordering with next sojourn. */
393 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); 464 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
394 465
395 /* 466 /*
@@ -507,10 +578,10 @@ void rcu_irq_exit(void)
507static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval, 578static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval,
508 int user) 579 int user)
509{ 580{
510 smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */ 581 smp_mb__before_atomic(); /* Force ordering w/previous sojourn. */
511 atomic_inc(&rdtp->dynticks); 582 atomic_inc(&rdtp->dynticks);
512 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ 583 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
513 smp_mb__after_atomic_inc(); /* See above. */ 584 smp_mb__after_atomic(); /* See above. */
514 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); 585 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
515 rcu_cleanup_after_idle(smp_processor_id()); 586 rcu_cleanup_after_idle(smp_processor_id());
516 trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting); 587 trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting);
@@ -635,10 +706,10 @@ void rcu_nmi_enter(void)
635 (atomic_read(&rdtp->dynticks) & 0x1)) 706 (atomic_read(&rdtp->dynticks) & 0x1))
636 return; 707 return;
637 rdtp->dynticks_nmi_nesting++; 708 rdtp->dynticks_nmi_nesting++;
638 smp_mb__before_atomic_inc(); /* Force delay from prior write. */ 709 smp_mb__before_atomic(); /* Force delay from prior write. */
639 atomic_inc(&rdtp->dynticks); 710 atomic_inc(&rdtp->dynticks);
640 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ 711 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
641 smp_mb__after_atomic_inc(); /* See above. */ 712 smp_mb__after_atomic(); /* See above. */
642 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); 713 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
643} 714}
644 715
@@ -657,9 +728,9 @@ void rcu_nmi_exit(void)
657 --rdtp->dynticks_nmi_nesting != 0) 728 --rdtp->dynticks_nmi_nesting != 0)
658 return; 729 return;
659 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ 730 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
660 smp_mb__before_atomic_inc(); /* See above. */ 731 smp_mb__before_atomic(); /* See above. */
661 atomic_inc(&rdtp->dynticks); 732 atomic_inc(&rdtp->dynticks);
662 smp_mb__after_atomic_inc(); /* Force delay to next write. */ 733 smp_mb__after_atomic(); /* Force delay to next write. */
663 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); 734 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
664} 735}
665 736
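The smp_mb__before_atomic()/smp_mb__after_atomic() calls adopted throughout these hunks are the generic spellings of the old *_atomic_inc() barriers: atomic_inc() by itself does not order memory, so full barriers are placed on both sides of the dynticks increment. A rough userspace analogue using C11 atomics follows; it is an analogy only, not the kernel primitives.

#include <stdatomic.h>
#include <stdio.h>

int main(void)
{
	atomic_int dynticks = 0;

	atomic_thread_fence(memory_order_seq_cst);	/* ~ smp_mb__before_atomic() */
	atomic_fetch_add_explicit(&dynticks, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* ~ smp_mb__after_atomic() */
	printf("dynticks=%d (odd means non-idle)\n",
	       atomic_load_explicit(&dynticks, memory_order_relaxed));
	return 0;
}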
@@ -758,7 +829,12 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp,
758{ 829{
759 rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); 830 rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks);
760 rcu_sysidle_check_cpu(rdp, isidle, maxj); 831 rcu_sysidle_check_cpu(rdp, isidle, maxj);
761 return (rdp->dynticks_snap & 0x1) == 0; 832 if ((rdp->dynticks_snap & 0x1) == 0) {
833 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
834 return 1;
835 } else {
836 return 0;
837 }
762} 838}
763 839
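For context on the parity test that the new tracepoint branch wraps: ->dynticks is incremented on every transition into and out of an extended quiescent state, so an even snapshot means the sampled CPU was idle from RCU's point of view. A toy illustration, with the counter modeled as a plain integer rather than an atomic_t:

#include <stdio.h>

static int snapshot_says_idle(unsigned int dynticks_snap)
{
	return (dynticks_snap & 0x1) == 0;	/* Even snapshot: CPU was in EQS. */
}

int main(void)
{
	unsigned int dynticks = 0;		/* Model starts out "idle" (even). */

	dynticks++;				/* Leave idle: counter becomes odd. */
	printf("running: idle=%d\n", snapshot_says_idle(dynticks));
	dynticks++;				/* Enter idle: even again. */
	printf("idle:    idle=%d\n", snapshot_says_idle(dynticks));
	return 0;
}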
764/* 840/*
@@ -834,7 +910,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
834 * we will beat on the first one until it gets unstuck, then move 910 * we will beat on the first one until it gets unstuck, then move
835 * to the next. Only do this for the primary flavor of RCU. 911 * to the next. Only do this for the primary flavor of RCU.
836 */ 912 */
837 if (rdp->rsp == rcu_state && 913 if (rdp->rsp == rcu_state_p &&
838 ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) { 914 ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) {
839 rdp->rsp->jiffies_resched += 5; 915 rdp->rsp->jiffies_resched += 5;
840 resched_cpu(rdp->cpu); 916 resched_cpu(rdp->cpu);
@@ -851,7 +927,7 @@ static void record_gp_stall_check_time(struct rcu_state *rsp)
851 rsp->gp_start = j; 927 rsp->gp_start = j;
852 smp_wmb(); /* Record start time before stall time. */ 928 smp_wmb(); /* Record start time before stall time. */
853 j1 = rcu_jiffies_till_stall_check(); 929 j1 = rcu_jiffies_till_stall_check();
854 rsp->jiffies_stall = j + j1; 930 ACCESS_ONCE(rsp->jiffies_stall) = j + j1;
855 rsp->jiffies_resched = j + j1 / 2; 931 rsp->jiffies_resched = j + j1 / 2;
856} 932}
857 933
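The ACCESS_ONCE() annotations being added to ->jiffies_stall here and below matter because the field is also sampled without the writer's lock, presumably by the stall-detection fastpath. What the macro actually does is a volatile cast that keeps the compiler from caching or refetching the access; the model below follows the kernel's definition of that era and runs in userspace.

#include <stdio.h>

#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))	/* GCC typeof extension. */

static unsigned long jiffies_stall;

int main(void)
{
	ACCESS_ONCE(jiffies_stall) = 100;	/* Single, non-refetched store. */
	printf("next stall check at jiffies=%lu\n", ACCESS_ONCE(jiffies_stall));
	return 0;
}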
@@ -890,12 +966,12 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
890 /* Only let one CPU complain about others per time interval. */ 966 /* Only let one CPU complain about others per time interval. */
891 967
892 raw_spin_lock_irqsave(&rnp->lock, flags); 968 raw_spin_lock_irqsave(&rnp->lock, flags);
893 delta = jiffies - rsp->jiffies_stall; 969 delta = jiffies - ACCESS_ONCE(rsp->jiffies_stall);
894 if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { 970 if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) {
895 raw_spin_unlock_irqrestore(&rnp->lock, flags); 971 raw_spin_unlock_irqrestore(&rnp->lock, flags);
896 return; 972 return;
897 } 973 }
898 rsp->jiffies_stall = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; 974 ACCESS_ONCE(rsp->jiffies_stall) = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
899 raw_spin_unlock_irqrestore(&rnp->lock, flags); 975 raw_spin_unlock_irqrestore(&rnp->lock, flags);
900 976
901 /* 977 /*
@@ -932,9 +1008,9 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
932 print_cpu_stall_info_end(); 1008 print_cpu_stall_info_end();
933 for_each_possible_cpu(cpu) 1009 for_each_possible_cpu(cpu)
934 totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; 1010 totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen;
935 pr_cont("(detected by %d, t=%ld jiffies, g=%lu, c=%lu, q=%lu)\n", 1011 pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n",
936 smp_processor_id(), (long)(jiffies - rsp->gp_start), 1012 smp_processor_id(), (long)(jiffies - rsp->gp_start),
937 rsp->gpnum, rsp->completed, totqlen); 1013 (long)rsp->gpnum, (long)rsp->completed, totqlen);
938 if (ndetected == 0) 1014 if (ndetected == 0)
939 pr_err("INFO: Stall ended before state dump start\n"); 1015 pr_err("INFO: Stall ended before state dump start\n");
940 else if (!trigger_all_cpu_backtrace()) 1016 else if (!trigger_all_cpu_backtrace())
@@ -947,12 +1023,6 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
947 force_quiescent_state(rsp); /* Kick them all. */ 1023 force_quiescent_state(rsp); /* Kick them all. */
948} 1024}
949 1025
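The switch from %lu to %ld with explicit (long) casts in the stall message is not cosmetic: the grace-period counters are initialized a little below zero (near ULONG_MAX), and signed formatting prints such values as small negative numbers instead of twenty-digit ones. That initialization detail is recalled from the surrounding code rather than shown in this hunk; the formatting effect itself is easy to reproduce in userspace:

#include <limits.h>
#include <stdio.h>

int main(void)
{
	unsigned long gpnum = ULONG_MAX - 1;	/* A "just below zero" counter value. */

	printf("as %%lu: g=%lu\n", gpnum);		/* Huge, hard-to-read number. */
	printf("as %%ld: g=%ld\n", (long)gpnum);	/* Prints -2. */
	return 0;
}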
950/*
951 * This function really isn't for public consumption, but RCU is special in
952 * that context switches can allow the state machine to make progress.
953 */
954extern void resched_cpu(int cpu);
955
956static void print_cpu_stall(struct rcu_state *rsp) 1026static void print_cpu_stall(struct rcu_state *rsp)
957{ 1027{
958 int cpu; 1028 int cpu;
@@ -971,14 +1041,15 @@ static void print_cpu_stall(struct rcu_state *rsp)
971 print_cpu_stall_info_end(); 1041 print_cpu_stall_info_end();
972 for_each_possible_cpu(cpu) 1042 for_each_possible_cpu(cpu)
973 totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; 1043 totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen;
974 pr_cont(" (t=%lu jiffies g=%lu c=%lu q=%lu)\n", 1044 pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n",
975 jiffies - rsp->gp_start, rsp->gpnum, rsp->completed, totqlen); 1045 jiffies - rsp->gp_start,
1046 (long)rsp->gpnum, (long)rsp->completed, totqlen);
976 if (!trigger_all_cpu_backtrace()) 1047 if (!trigger_all_cpu_backtrace())
977 dump_stack(); 1048 dump_stack();
978 1049
979 raw_spin_lock_irqsave(&rnp->lock, flags); 1050 raw_spin_lock_irqsave(&rnp->lock, flags);
980 if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall)) 1051 if (ULONG_CMP_GE(jiffies, ACCESS_ONCE(rsp->jiffies_stall)))
981 rsp->jiffies_stall = jiffies + 1052 ACCESS_ONCE(rsp->jiffies_stall) = jiffies +
982 3 * rcu_jiffies_till_stall_check() + 3; 1053 3 * rcu_jiffies_till_stall_check() + 3;
983 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1054 raw_spin_unlock_irqrestore(&rnp->lock, flags);
984 1055
@@ -1062,7 +1133,7 @@ void rcu_cpu_stall_reset(void)
1062 struct rcu_state *rsp; 1133 struct rcu_state *rsp;
1063 1134
1064 for_each_rcu_flavor(rsp) 1135 for_each_rcu_flavor(rsp)
1065 rsp->jiffies_stall = jiffies + ULONG_MAX / 2; 1136 ACCESS_ONCE(rsp->jiffies_stall) = jiffies + ULONG_MAX / 2;
1066} 1137}
1067 1138
1068/* 1139/*
@@ -1123,15 +1194,18 @@ static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
1123/* 1194/*
1124 * Start some future grace period, as needed to handle newly arrived 1195 * Start some future grace period, as needed to handle newly arrived
1125 * callbacks. The required future grace periods are recorded in each 1196 * callbacks. The required future grace periods are recorded in each
1126 * rcu_node structure's ->need_future_gp field. 1197 * rcu_node structure's ->need_future_gp field. Returns true if there
1198 * is reason to awaken the grace-period kthread.
1127 * 1199 *
1128 * The caller must hold the specified rcu_node structure's ->lock. 1200 * The caller must hold the specified rcu_node structure's ->lock.
1129 */ 1201 */
1130static unsigned long __maybe_unused 1202static bool __maybe_unused
1131rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) 1203rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
1204 unsigned long *c_out)
1132{ 1205{
1133 unsigned long c; 1206 unsigned long c;
1134 int i; 1207 int i;
1208 bool ret = false;
1135 struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); 1209 struct rcu_node *rnp_root = rcu_get_root(rdp->rsp);
1136 1210
1137 /* 1211 /*
@@ -1142,7 +1216,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
1142 trace_rcu_future_gp(rnp, rdp, c, TPS("Startleaf")); 1216 trace_rcu_future_gp(rnp, rdp, c, TPS("Startleaf"));
1143 if (rnp->need_future_gp[c & 0x1]) { 1217 if (rnp->need_future_gp[c & 0x1]) {
1144 trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartleaf")); 1218 trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartleaf"));
1145 return c; 1219 goto out;
1146 } 1220 }
1147 1221
1148 /* 1222 /*
@@ -1156,7 +1230,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
1156 ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) { 1230 ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) {
1157 rnp->need_future_gp[c & 0x1]++; 1231 rnp->need_future_gp[c & 0x1]++;
1158 trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf")); 1232 trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf"));
1159 return c; 1233 goto out;
1160 } 1234 }
1161 1235
1162 /* 1236 /*
@@ -1197,12 +1271,15 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
1197 trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleafroot")); 1271 trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleafroot"));
1198 } else { 1272 } else {
1199 trace_rcu_future_gp(rnp, rdp, c, TPS("Startedroot")); 1273 trace_rcu_future_gp(rnp, rdp, c, TPS("Startedroot"));
1200 rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp); 1274 ret = rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp);
1201 } 1275 }
1202unlock_out: 1276unlock_out:
1203 if (rnp != rnp_root) 1277 if (rnp != rnp_root)
1204 raw_spin_unlock(&rnp_root->lock); 1278 raw_spin_unlock(&rnp_root->lock);
1205 return c; 1279out:
1280 if (c_out != NULL)
1281 *c_out = c;
1282 return ret;
1206} 1283}
1207 1284
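Note the calling convention rcu_start_future_gp() ends up with in this hunk: the return value now says whether to awaken the grace-period kthread, while the grace-period number is reported through an optional c_out pointer, passed as NULL by rcu_accelerate_cbs() and as &c by rcu_nocb_wait_gp() later in this patch. A tiny standalone illustration of that optional out-parameter convention; request_future_gp() is a hypothetical helper named for the example only.

#include <stdbool.h>
#include <stdio.h>

static bool request_future_gp(unsigned long next_gp, unsigned long *c_out)
{
	bool needwake = true;		/* Pretend the kthread must be awakened. */

	if (c_out != NULL)		/* Report the GP number only on request. */
		*c_out = next_gp;
	return needwake;
}

int main(void)
{
	unsigned long c;
	bool needwake;

	needwake = request_future_gp(8, NULL);	/* Caller ignores the number. */
	printf("needwake=%d\n", needwake);
	needwake = request_future_gp(8, &c);	/* Caller wants the number too. */
	printf("needwake=%d c=%lu\n", needwake, c);
	return 0;
}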
1208/* 1285/*
@@ -1226,25 +1303,43 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
1226} 1303}
1227 1304
1228/* 1305/*
1306 * Awaken the grace-period kthread for the specified flavor of RCU.
1307 * Don't do a self-awaken, and don't bother awakening when there is
1308 * nothing for the grace-period kthread to do (as in several CPUs
1309 * raced to awaken, and we lost), and finally don't try to awaken
1310 * a kthread that has not yet been created.
1311 */
1312static void rcu_gp_kthread_wake(struct rcu_state *rsp)
1313{
1314 if (current == rsp->gp_kthread ||
1315 !ACCESS_ONCE(rsp->gp_flags) ||
1316 !rsp->gp_kthread)
1317 return;
1318 wake_up(&rsp->gp_wq);
1319}
1320
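rcu_gp_kthread_wake() above encodes three guards: never self-awaken, skip the wakeup when ->gp_flags is clear (another CPU won the race and there is nothing to do), and skip it while the kthread does not yet exist. The guard logic alone, rewritten as a self-contained model with stand-in fields:

#include <stdbool.h>
#include <stdio.h>

struct gp_model { void *gp_kthread; int gp_flags; };	/* Stand-ins, not rcu_state. */

static bool should_wake(const struct gp_model *m, const void *current_task)
{
	if (current_task == m->gp_kthread)	/* The kthread is already running. */
		return false;
	if (!m->gp_flags)			/* Nothing for it to do. */
		return false;
	if (!m->gp_kthread)			/* Not created yet. */
		return false;
	return true;
}

int main(void)
{
	int kthread_token, other_token;
	struct gp_model m = { &kthread_token, 0x1 };

	printf("other task, work pending: %d\n", should_wake(&m, &other_token));
	printf("the kthread itself:       %d\n", should_wake(&m, &kthread_token));
	m.gp_flags = 0;
	printf("no work pending:          %d\n", should_wake(&m, &other_token));
	return 0;
}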
1321/*
1229 * If there is room, assign a ->completed number to any callbacks on 1322 * If there is room, assign a ->completed number to any callbacks on
1230 * this CPU that have not already been assigned. Also accelerate any 1323 * this CPU that have not already been assigned. Also accelerate any
1231 * callbacks that were previously assigned a ->completed number that has 1324 * callbacks that were previously assigned a ->completed number that has
1232 * since proven to be too conservative, which can happen if callbacks get 1325 * since proven to be too conservative, which can happen if callbacks get
1233 * assigned a ->completed number while RCU is idle, but with reference to 1326 * assigned a ->completed number while RCU is idle, but with reference to
1234 * a non-root rcu_node structure. This function is idempotent, so it does 1327 * a non-root rcu_node structure. This function is idempotent, so it does
 1235 * not hurt to call it repeatedly. 1328 * not hurt to call it repeatedly. Returns a flag saying that we should
1329 * awaken the RCU grace-period kthread.
1236 * 1330 *
1237 * The caller must hold rnp->lock with interrupts disabled. 1331 * The caller must hold rnp->lock with interrupts disabled.
1238 */ 1332 */
1239static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, 1333static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1240 struct rcu_data *rdp) 1334 struct rcu_data *rdp)
1241{ 1335{
1242 unsigned long c; 1336 unsigned long c;
1243 int i; 1337 int i;
1338 bool ret;
1244 1339
1245 /* If the CPU has no callbacks, nothing to do. */ 1340 /* If the CPU has no callbacks, nothing to do. */
1246 if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL]) 1341 if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL])
1247 return; 1342 return false;
1248 1343
1249 /* 1344 /*
1250 * Starting from the sublist containing the callbacks most 1345 * Starting from the sublist containing the callbacks most
@@ -1273,7 +1368,7 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1273 * be grouped into. 1368 * be grouped into.
1274 */ 1369 */
1275 if (++i >= RCU_NEXT_TAIL) 1370 if (++i >= RCU_NEXT_TAIL)
1276 return; 1371 return false;
1277 1372
1278 /* 1373 /*
1279 * Assign all subsequent callbacks' ->completed number to the next 1374 * Assign all subsequent callbacks' ->completed number to the next
@@ -1285,13 +1380,14 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1285 rdp->nxtcompleted[i] = c; 1380 rdp->nxtcompleted[i] = c;
1286 } 1381 }
1287 /* Record any needed additional grace periods. */ 1382 /* Record any needed additional grace periods. */
1288 rcu_start_future_gp(rnp, rdp); 1383 ret = rcu_start_future_gp(rnp, rdp, NULL);
1289 1384
1290 /* Trace depending on how much we were able to accelerate. */ 1385 /* Trace depending on how much we were able to accelerate. */
1291 if (!*rdp->nxttail[RCU_WAIT_TAIL]) 1386 if (!*rdp->nxttail[RCU_WAIT_TAIL])
1292 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB")); 1387 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB"));
1293 else 1388 else
1294 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB")); 1389 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB"));
1390 return ret;
1295} 1391}
1296 1392
1297/* 1393/*
@@ -1300,17 +1396,18 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1300 * assign ->completed numbers to any callbacks in the RCU_NEXT_TAIL 1396 * assign ->completed numbers to any callbacks in the RCU_NEXT_TAIL
1301 * sublist. This function is idempotent, so it does not hurt to 1397 * sublist. This function is idempotent, so it does not hurt to
1302 * invoke it repeatedly. As long as it is not invoked -too- often... 1398 * invoke it repeatedly. As long as it is not invoked -too- often...
1399 * Returns true if the RCU grace-period kthread needs to be awakened.
1303 * 1400 *
1304 * The caller must hold rnp->lock with interrupts disabled. 1401 * The caller must hold rnp->lock with interrupts disabled.
1305 */ 1402 */
1306static void rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, 1403static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1307 struct rcu_data *rdp) 1404 struct rcu_data *rdp)
1308{ 1405{
1309 int i, j; 1406 int i, j;
1310 1407
1311 /* If the CPU has no callbacks, nothing to do. */ 1408 /* If the CPU has no callbacks, nothing to do. */
1312 if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL]) 1409 if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL])
1313 return; 1410 return false;
1314 1411
1315 /* 1412 /*
1316 * Find all callbacks whose ->completed numbers indicate that they 1413 * Find all callbacks whose ->completed numbers indicate that they
@@ -1334,26 +1431,30 @@ static void rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1334 } 1431 }
1335 1432
1336 /* Classify any remaining callbacks. */ 1433 /* Classify any remaining callbacks. */
1337 rcu_accelerate_cbs(rsp, rnp, rdp); 1434 return rcu_accelerate_cbs(rsp, rnp, rdp);
1338} 1435}
1339 1436
1340/* 1437/*
1341 * Update CPU-local rcu_data state to record the beginnings and ends of 1438 * Update CPU-local rcu_data state to record the beginnings and ends of
1342 * grace periods. The caller must hold the ->lock of the leaf rcu_node 1439 * grace periods. The caller must hold the ->lock of the leaf rcu_node
1343 * structure corresponding to the current CPU, and must have irqs disabled. 1440 * structure corresponding to the current CPU, and must have irqs disabled.
1441 * Returns true if the grace-period kthread needs to be awakened.
1344 */ 1442 */
1345static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) 1443static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
1444 struct rcu_data *rdp)
1346{ 1445{
1446 bool ret;
1447
1347 /* Handle the ends of any preceding grace periods first. */ 1448 /* Handle the ends of any preceding grace periods first. */
1348 if (rdp->completed == rnp->completed) { 1449 if (rdp->completed == rnp->completed) {
1349 1450
1350 /* No grace period end, so just accelerate recent callbacks. */ 1451 /* No grace period end, so just accelerate recent callbacks. */
1351 rcu_accelerate_cbs(rsp, rnp, rdp); 1452 ret = rcu_accelerate_cbs(rsp, rnp, rdp);
1352 1453
1353 } else { 1454 } else {
1354 1455
1355 /* Advance callbacks. */ 1456 /* Advance callbacks. */
1356 rcu_advance_cbs(rsp, rnp, rdp); 1457 ret = rcu_advance_cbs(rsp, rnp, rdp);
1357 1458
1358 /* Remember that we saw this grace-period completion. */ 1459 /* Remember that we saw this grace-period completion. */
1359 rdp->completed = rnp->completed; 1460 rdp->completed = rnp->completed;
@@ -1372,11 +1473,13 @@ static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struc
1372 rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); 1473 rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask);
1373 zero_cpu_stall_ticks(rdp); 1474 zero_cpu_stall_ticks(rdp);
1374 } 1475 }
1476 return ret;
1375} 1477}
1376 1478
1377static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) 1479static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
1378{ 1480{
1379 unsigned long flags; 1481 unsigned long flags;
1482 bool needwake;
1380 struct rcu_node *rnp; 1483 struct rcu_node *rnp;
1381 1484
1382 local_irq_save(flags); 1485 local_irq_save(flags);
@@ -1388,8 +1491,10 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
1388 return; 1491 return;
1389 } 1492 }
1390 smp_mb__after_unlock_lock(); 1493 smp_mb__after_unlock_lock();
1391 __note_gp_changes(rsp, rnp, rdp); 1494 needwake = __note_gp_changes(rsp, rnp, rdp);
1392 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1495 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1496 if (needwake)
1497 rcu_gp_kthread_wake(rsp);
1393} 1498}
1394 1499
1395/* 1500/*
@@ -1403,12 +1508,12 @@ static int rcu_gp_init(struct rcu_state *rsp)
1403 rcu_bind_gp_kthread(); 1508 rcu_bind_gp_kthread();
1404 raw_spin_lock_irq(&rnp->lock); 1509 raw_spin_lock_irq(&rnp->lock);
1405 smp_mb__after_unlock_lock(); 1510 smp_mb__after_unlock_lock();
1406 if (rsp->gp_flags == 0) { 1511 if (!ACCESS_ONCE(rsp->gp_flags)) {
1407 /* Spurious wakeup, tell caller to go back to sleep. */ 1512 /* Spurious wakeup, tell caller to go back to sleep. */
1408 raw_spin_unlock_irq(&rnp->lock); 1513 raw_spin_unlock_irq(&rnp->lock);
1409 return 0; 1514 return 0;
1410 } 1515 }
1411 rsp->gp_flags = 0; /* Clear all flags: New grace period. */ 1516 ACCESS_ONCE(rsp->gp_flags) = 0; /* Clear all flags: New grace period. */
1412 1517
1413 if (WARN_ON_ONCE(rcu_gp_in_progress(rsp))) { 1518 if (WARN_ON_ONCE(rcu_gp_in_progress(rsp))) {
1414 /* 1519 /*
@@ -1453,7 +1558,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
1453 WARN_ON_ONCE(rnp->completed != rsp->completed); 1558 WARN_ON_ONCE(rnp->completed != rsp->completed);
1454 ACCESS_ONCE(rnp->completed) = rsp->completed; 1559 ACCESS_ONCE(rnp->completed) = rsp->completed;
1455 if (rnp == rdp->mynode) 1560 if (rnp == rdp->mynode)
1456 __note_gp_changes(rsp, rnp, rdp); 1561 (void)__note_gp_changes(rsp, rnp, rdp);
1457 rcu_preempt_boost_start_gp(rnp); 1562 rcu_preempt_boost_start_gp(rnp);
1458 trace_rcu_grace_period_init(rsp->name, rnp->gpnum, 1563 trace_rcu_grace_period_init(rsp->name, rnp->gpnum,
1459 rnp->level, rnp->grplo, 1564 rnp->level, rnp->grplo,
@@ -1501,7 +1606,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
1501 if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { 1606 if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
1502 raw_spin_lock_irq(&rnp->lock); 1607 raw_spin_lock_irq(&rnp->lock);
1503 smp_mb__after_unlock_lock(); 1608 smp_mb__after_unlock_lock();
1504 rsp->gp_flags &= ~RCU_GP_FLAG_FQS; 1609 ACCESS_ONCE(rsp->gp_flags) &= ~RCU_GP_FLAG_FQS;
1505 raw_spin_unlock_irq(&rnp->lock); 1610 raw_spin_unlock_irq(&rnp->lock);
1506 } 1611 }
1507 return fqs_state; 1612 return fqs_state;
@@ -1513,6 +1618,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
1513static void rcu_gp_cleanup(struct rcu_state *rsp) 1618static void rcu_gp_cleanup(struct rcu_state *rsp)
1514{ 1619{
1515 unsigned long gp_duration; 1620 unsigned long gp_duration;
1621 bool needgp = false;
1516 int nocb = 0; 1622 int nocb = 0;
1517 struct rcu_data *rdp; 1623 struct rcu_data *rdp;
1518 struct rcu_node *rnp = rcu_get_root(rsp); 1624 struct rcu_node *rnp = rcu_get_root(rsp);
@@ -1548,7 +1654,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
1548 ACCESS_ONCE(rnp->completed) = rsp->gpnum; 1654 ACCESS_ONCE(rnp->completed) = rsp->gpnum;
1549 rdp = this_cpu_ptr(rsp->rda); 1655 rdp = this_cpu_ptr(rsp->rda);
1550 if (rnp == rdp->mynode) 1656 if (rnp == rdp->mynode)
1551 __note_gp_changes(rsp, rnp, rdp); 1657 needgp = __note_gp_changes(rsp, rnp, rdp) || needgp;
1552 /* smp_mb() provided by prior unlock-lock pair. */ 1658 /* smp_mb() provided by prior unlock-lock pair. */
1553 nocb += rcu_future_gp_cleanup(rsp, rnp); 1659 nocb += rcu_future_gp_cleanup(rsp, rnp);
1554 raw_spin_unlock_irq(&rnp->lock); 1660 raw_spin_unlock_irq(&rnp->lock);
@@ -1564,9 +1670,10 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
1564 trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end")); 1670 trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end"));
1565 rsp->fqs_state = RCU_GP_IDLE; 1671 rsp->fqs_state = RCU_GP_IDLE;
1566 rdp = this_cpu_ptr(rsp->rda); 1672 rdp = this_cpu_ptr(rsp->rda);
1567 rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */ 1673 /* Advance CBs to reduce false positives below. */
1568 if (cpu_needs_another_gp(rsp, rdp)) { 1674 needgp = rcu_advance_cbs(rsp, rnp, rdp) || needgp;
1569 rsp->gp_flags = RCU_GP_FLAG_INIT; 1675 if (needgp || cpu_needs_another_gp(rsp, rdp)) {
1676 ACCESS_ONCE(rsp->gp_flags) = RCU_GP_FLAG_INIT;
1570 trace_rcu_grace_period(rsp->name, 1677 trace_rcu_grace_period(rsp->name,
1571 ACCESS_ONCE(rsp->gpnum), 1678 ACCESS_ONCE(rsp->gpnum),
1572 TPS("newreq")); 1679 TPS("newreq"));
@@ -1593,6 +1700,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
1593 trace_rcu_grace_period(rsp->name, 1700 trace_rcu_grace_period(rsp->name,
1594 ACCESS_ONCE(rsp->gpnum), 1701 ACCESS_ONCE(rsp->gpnum),
1595 TPS("reqwait")); 1702 TPS("reqwait"));
1703 rsp->gp_state = RCU_GP_WAIT_GPS;
1596 wait_event_interruptible(rsp->gp_wq, 1704 wait_event_interruptible(rsp->gp_wq,
1597 ACCESS_ONCE(rsp->gp_flags) & 1705 ACCESS_ONCE(rsp->gp_flags) &
1598 RCU_GP_FLAG_INIT); 1706 RCU_GP_FLAG_INIT);
@@ -1620,6 +1728,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
1620 trace_rcu_grace_period(rsp->name, 1728 trace_rcu_grace_period(rsp->name,
1621 ACCESS_ONCE(rsp->gpnum), 1729 ACCESS_ONCE(rsp->gpnum),
1622 TPS("fqswait")); 1730 TPS("fqswait"));
1731 rsp->gp_state = RCU_GP_WAIT_FQS;
1623 ret = wait_event_interruptible_timeout(rsp->gp_wq, 1732 ret = wait_event_interruptible_timeout(rsp->gp_wq,
1624 ((gf = ACCESS_ONCE(rsp->gp_flags)) & 1733 ((gf = ACCESS_ONCE(rsp->gp_flags)) &
1625 RCU_GP_FLAG_FQS) || 1734 RCU_GP_FLAG_FQS) ||
@@ -1665,14 +1774,6 @@ static int __noreturn rcu_gp_kthread(void *arg)
1665 } 1774 }
1666} 1775}
1667 1776
1668static void rsp_wakeup(struct irq_work *work)
1669{
1670 struct rcu_state *rsp = container_of(work, struct rcu_state, wakeup_work);
1671
1672 /* Wake up rcu_gp_kthread() to start the grace period. */
1673 wake_up(&rsp->gp_wq);
1674}
1675
1676/* 1777/*
1677 * Start a new RCU grace period if warranted, re-initializing the hierarchy 1778 * Start a new RCU grace period if warranted, re-initializing the hierarchy
1678 * in preparation for detecting the next grace period. The caller must hold 1779 * in preparation for detecting the next grace period. The caller must hold
@@ -1681,8 +1782,10 @@ static void rsp_wakeup(struct irq_work *work)
1681 * Note that it is legal for a dying CPU (which is marked as offline) to 1782 * Note that it is legal for a dying CPU (which is marked as offline) to
1682 * invoke this function. This can happen when the dying CPU reports its 1783 * invoke this function. This can happen when the dying CPU reports its
1683 * quiescent state. 1784 * quiescent state.
1785 *
1786 * Returns true if the grace-period kthread must be awakened.
1684 */ 1787 */
1685static void 1788static bool
1686rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, 1789rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
1687 struct rcu_data *rdp) 1790 struct rcu_data *rdp)
1688{ 1791{
@@ -1693,20 +1796,18 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
1693 * or a grace period is already in progress. 1796 * or a grace period is already in progress.
1694 * Either way, don't start a new grace period. 1797 * Either way, don't start a new grace period.
1695 */ 1798 */
1696 return; 1799 return false;
1697 } 1800 }
1698 rsp->gp_flags = RCU_GP_FLAG_INIT; 1801 ACCESS_ONCE(rsp->gp_flags) = RCU_GP_FLAG_INIT;
1699 trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), 1802 trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum),
1700 TPS("newreq")); 1803 TPS("newreq"));
1701 1804
1702 /* 1805 /*
1703 * We can't do wakeups while holding the rnp->lock, as that 1806 * We can't do wakeups while holding the rnp->lock, as that
1704 * could cause possible deadlocks with the rq->lock. Defer 1807 * could cause possible deadlocks with the rq->lock. Defer
1705 * the wakeup to interrupt context. And don't bother waking 1808 * the wakeup to our caller.
1706 * up the running kthread.
1707 */ 1809 */
1708 if (current != rsp->gp_kthread) 1810 return true;
1709 irq_work_queue(&rsp->wakeup_work);
1710} 1811}
1711 1812
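The comment just above states the reason for the needwake plumbing throughout this patch: wake_up() while holding rnp->lock risks lock-ordering trouble against the scheduler's rq->lock, so the decision is made under the lock and the wakeup happens in the caller after the lock is dropped, replacing the old irq_work deferral. A generic pthread sketch of that shape, as a userspace model only:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t waitq = PTHREAD_COND_INITIALIZER;
static bool work_flag;

/* Called with "lock" held: record the request, never wake from here. */
static bool record_request_locked(void)
{
	if (work_flag)
		return false;		/* Lost the race; someone already asked. */
	work_flag = true;
	return true;			/* Caller should wake after unlocking. */
}

int main(void)
{
	bool needwake;

	pthread_mutex_lock(&lock);
	needwake = record_request_locked();
	pthread_mutex_unlock(&lock);
	if (needwake)
		pthread_cond_signal(&waitq);	/* Wakeup issued with the lock dropped. */
	printf("needwake=%d\n", needwake);
	return 0;
}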
1712/* 1813/*
@@ -1715,12 +1816,14 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
1715 * is invoked indirectly from rcu_advance_cbs(), which would result in 1816 * is invoked indirectly from rcu_advance_cbs(), which would result in
1716 * endless recursion -- or would do so if it wasn't for the self-deadlock 1817 * endless recursion -- or would do so if it wasn't for the self-deadlock
1717 * that is encountered beforehand. 1818 * that is encountered beforehand.
1819 *
1820 * Returns true if the grace-period kthread needs to be awakened.
1718 */ 1821 */
1719static void 1822static bool rcu_start_gp(struct rcu_state *rsp)
1720rcu_start_gp(struct rcu_state *rsp)
1721{ 1823{
1722 struct rcu_data *rdp = this_cpu_ptr(rsp->rda); 1824 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
1723 struct rcu_node *rnp = rcu_get_root(rsp); 1825 struct rcu_node *rnp = rcu_get_root(rsp);
1826 bool ret = false;
1724 1827
1725 /* 1828 /*
1726 * If there is no grace period in progress right now, any 1829 * If there is no grace period in progress right now, any
@@ -1730,8 +1833,9 @@ rcu_start_gp(struct rcu_state *rsp)
1730 * resulting in pointless grace periods. So, advance callbacks 1833 * resulting in pointless grace periods. So, advance callbacks
1731 * then start the grace period! 1834 * then start the grace period!
1732 */ 1835 */
1733 rcu_advance_cbs(rsp, rnp, rdp); 1836 ret = rcu_advance_cbs(rsp, rnp, rdp) || ret;
1734 rcu_start_gp_advanced(rsp, rnp, rdp); 1837 ret = rcu_start_gp_advanced(rsp, rnp, rdp) || ret;
1838 return ret;
1735} 1839}
1736 1840
1737/* 1841/*
@@ -1820,6 +1924,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
1820{ 1924{
1821 unsigned long flags; 1925 unsigned long flags;
1822 unsigned long mask; 1926 unsigned long mask;
1927 bool needwake;
1823 struct rcu_node *rnp; 1928 struct rcu_node *rnp;
1824 1929
1825 rnp = rdp->mynode; 1930 rnp = rdp->mynode;
@@ -1848,9 +1953,11 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
1848 * This GP can't end until cpu checks in, so all of our 1953 * This GP can't end until cpu checks in, so all of our
1849 * callbacks can be processed during the next GP. 1954 * callbacks can be processed during the next GP.
1850 */ 1955 */
1851 rcu_accelerate_cbs(rsp, rnp, rdp); 1956 needwake = rcu_accelerate_cbs(rsp, rnp, rdp);
1852 1957
1853 rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */ 1958 rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */
1959 if (needwake)
1960 rcu_gp_kthread_wake(rsp);
1854 } 1961 }
1855} 1962}
1856 1963
@@ -1951,7 +2058,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
1951static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) 2058static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags)
1952{ 2059{
1953 int i; 2060 int i;
1954 struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); 2061 struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
1955 2062
1956 /* No-CBs CPUs are handled specially. */ 2063 /* No-CBs CPUs are handled specially. */
1957 if (rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags)) 2064 if (rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags))
@@ -2320,7 +2427,7 @@ static void force_quiescent_state(struct rcu_state *rsp)
2320 raw_spin_unlock_irqrestore(&rnp_old->lock, flags); 2427 raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
2321 return; /* Someone beat us to it. */ 2428 return; /* Someone beat us to it. */
2322 } 2429 }
2323 rsp->gp_flags |= RCU_GP_FLAG_FQS; 2430 ACCESS_ONCE(rsp->gp_flags) |= RCU_GP_FLAG_FQS;
2324 raw_spin_unlock_irqrestore(&rnp_old->lock, flags); 2431 raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
2325 wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */ 2432 wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */
2326} 2433}
@@ -2334,7 +2441,8 @@ static void
2334__rcu_process_callbacks(struct rcu_state *rsp) 2441__rcu_process_callbacks(struct rcu_state *rsp)
2335{ 2442{
2336 unsigned long flags; 2443 unsigned long flags;
2337 struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); 2444 bool needwake;
2445 struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
2338 2446
2339 WARN_ON_ONCE(rdp->beenonline == 0); 2447 WARN_ON_ONCE(rdp->beenonline == 0);
2340 2448
@@ -2345,8 +2453,10 @@ __rcu_process_callbacks(struct rcu_state *rsp)
2345 local_irq_save(flags); 2453 local_irq_save(flags);
2346 if (cpu_needs_another_gp(rsp, rdp)) { 2454 if (cpu_needs_another_gp(rsp, rdp)) {
2347 raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */ 2455 raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */
2348 rcu_start_gp(rsp); 2456 needwake = rcu_start_gp(rsp);
2349 raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); 2457 raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
2458 if (needwake)
2459 rcu_gp_kthread_wake(rsp);
2350 } else { 2460 } else {
2351 local_irq_restore(flags); 2461 local_irq_restore(flags);
2352 } 2462 }
@@ -2404,6 +2514,8 @@ static void invoke_rcu_core(void)
2404static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, 2514static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
2405 struct rcu_head *head, unsigned long flags) 2515 struct rcu_head *head, unsigned long flags)
2406{ 2516{
2517 bool needwake;
2518
2407 /* 2519 /*
2408 * If called from an extended quiescent state, invoke the RCU 2520 * If called from an extended quiescent state, invoke the RCU
2409 * core in order to force a re-evaluation of RCU's idleness. 2521 * core in order to force a re-evaluation of RCU's idleness.
@@ -2433,8 +2545,10 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
2433 2545
2434 raw_spin_lock(&rnp_root->lock); 2546 raw_spin_lock(&rnp_root->lock);
2435 smp_mb__after_unlock_lock(); 2547 smp_mb__after_unlock_lock();
2436 rcu_start_gp(rsp); 2548 needwake = rcu_start_gp(rsp);
2437 raw_spin_unlock(&rnp_root->lock); 2549 raw_spin_unlock(&rnp_root->lock);
2550 if (needwake)
2551 rcu_gp_kthread_wake(rsp);
2438 } else { 2552 } else {
2439 /* Give the grace period a kick. */ 2553 /* Give the grace period a kick. */
2440 rdp->blimit = LONG_MAX; 2554 rdp->blimit = LONG_MAX;
@@ -2537,6 +2651,20 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
2537EXPORT_SYMBOL_GPL(call_rcu_bh); 2651EXPORT_SYMBOL_GPL(call_rcu_bh);
2538 2652
2539/* 2653/*
2654 * Queue an RCU callback for lazy invocation after a grace period.
2655 * This will likely be later named something like "call_rcu_lazy()",
2656 * but this change will require some way of tagging the lazy RCU
2657 * callbacks in the list of pending callbacks. Until then, this
2658 * function may only be called from __kfree_rcu().
2659 */
2660void kfree_call_rcu(struct rcu_head *head,
2661 void (*func)(struct rcu_head *rcu))
2662{
2663 __call_rcu(head, func, rcu_state_p, -1, 1);
2664}
2665EXPORT_SYMBOL_GPL(kfree_call_rcu);
2666
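kfree_call_rcu() moves here from tree_plugin.h so that a single copy, keyed off rcu_state_p, serves both the preemptible and non-preemptible builds. For orientation only, here is a kernel-style fragment of how it is normally reached, via the kfree_rcu() wrapper; struct foo and its fields are invented for the example, and none of this is part of the patch:

struct foo {
	int data;
	struct rcu_head rcu;		/* Needed for the deferred kfree(). */
};

static void release_foo(struct foo *p)
{
	/* Expands to __kfree_rcu()/kfree_call_rcu(); p is freed after a GP. */
	kfree_rcu(p, rcu);
}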
2667/*
2540 * Because a context switch is a grace period for RCU-sched and RCU-bh, 2668 * Because a context switch is a grace period for RCU-sched and RCU-bh,
2541 * any blocking grace-period wait automatically implies a grace period 2669 * any blocking grace-period wait automatically implies a grace period
2542 * if there is only one CPU online at any point time during execution 2670 * if there is only one CPU online at any point time during execution
@@ -2659,7 +2787,7 @@ unsigned long get_state_synchronize_rcu(void)
2659 * time-consuming work between get_state_synchronize_rcu() 2787 * time-consuming work between get_state_synchronize_rcu()
2660 * and cond_synchronize_rcu(). 2788 * and cond_synchronize_rcu().
2661 */ 2789 */
2662 return smp_load_acquire(&rcu_state->gpnum); 2790 return smp_load_acquire(&rcu_state_p->gpnum);
2663} 2791}
2664EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); 2792EXPORT_SYMBOL_GPL(get_state_synchronize_rcu);
2665 2793
@@ -2685,7 +2813,7 @@ void cond_synchronize_rcu(unsigned long oldstate)
2685 * Ensure that this load happens before any RCU-destructive 2813 * Ensure that this load happens before any RCU-destructive
2686 * actions the caller might carry out after we return. 2814 * actions the caller might carry out after we return.
2687 */ 2815 */
2688 newstate = smp_load_acquire(&rcu_state->completed); 2816 newstate = smp_load_acquire(&rcu_state_p->completed);
2689 if (ULONG_CMP_GE(oldstate, newstate)) 2817 if (ULONG_CMP_GE(oldstate, newstate))
2690 synchronize_rcu(); 2818 synchronize_rcu();
2691} 2819}
@@ -2790,7 +2918,7 @@ void synchronize_sched_expedited(void)
2790 s = atomic_long_read(&rsp->expedited_done); 2918 s = atomic_long_read(&rsp->expedited_done);
2791 if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) { 2919 if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
2792 /* ensure test happens before caller kfree */ 2920 /* ensure test happens before caller kfree */
2793 smp_mb__before_atomic_inc(); /* ^^^ */ 2921 smp_mb__before_atomic(); /* ^^^ */
2794 atomic_long_inc(&rsp->expedited_workdone1); 2922 atomic_long_inc(&rsp->expedited_workdone1);
2795 return; 2923 return;
2796 } 2924 }
@@ -2808,7 +2936,7 @@ void synchronize_sched_expedited(void)
2808 s = atomic_long_read(&rsp->expedited_done); 2936 s = atomic_long_read(&rsp->expedited_done);
2809 if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) { 2937 if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
2810 /* ensure test happens before caller kfree */ 2938 /* ensure test happens before caller kfree */
2811 smp_mb__before_atomic_inc(); /* ^^^ */ 2939 smp_mb__before_atomic(); /* ^^^ */
2812 atomic_long_inc(&rsp->expedited_workdone2); 2940 atomic_long_inc(&rsp->expedited_workdone2);
2813 return; 2941 return;
2814 } 2942 }
@@ -2837,7 +2965,7 @@ void synchronize_sched_expedited(void)
2837 s = atomic_long_read(&rsp->expedited_done); 2965 s = atomic_long_read(&rsp->expedited_done);
2838 if (ULONG_CMP_GE((ulong)s, (ulong)snap)) { 2966 if (ULONG_CMP_GE((ulong)s, (ulong)snap)) {
2839 /* ensure test happens before caller kfree */ 2967 /* ensure test happens before caller kfree */
2840 smp_mb__before_atomic_inc(); /* ^^^ */ 2968 smp_mb__before_atomic(); /* ^^^ */
2841 atomic_long_inc(&rsp->expedited_done_lost); 2969 atomic_long_inc(&rsp->expedited_done_lost);
2842 break; 2970 break;
2843 } 2971 }
@@ -2988,7 +3116,7 @@ static void rcu_barrier_callback(struct rcu_head *rhp)
2988static void rcu_barrier_func(void *type) 3116static void rcu_barrier_func(void *type)
2989{ 3117{
2990 struct rcu_state *rsp = type; 3118 struct rcu_state *rsp = type;
2991 struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); 3119 struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
2992 3120
2993 _rcu_barrier_trace(rsp, "IRQ", -1, rsp->n_barrier_done); 3121 _rcu_barrier_trace(rsp, "IRQ", -1, rsp->n_barrier_done);
2994 atomic_inc(&rsp->barrier_cpu_count); 3122 atomic_inc(&rsp->barrier_cpu_count);
@@ -3160,7 +3288,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
3160 * that this CPU cannot possibly have any RCU callbacks in flight yet. 3288 * that this CPU cannot possibly have any RCU callbacks in flight yet.
3161 */ 3289 */
3162static void 3290static void
3163rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) 3291rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
3164{ 3292{
3165 unsigned long flags; 3293 unsigned long flags;
3166 unsigned long mask; 3294 unsigned long mask;
@@ -3173,7 +3301,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
3173 /* Set up local state, ensuring consistent view of global state. */ 3301 /* Set up local state, ensuring consistent view of global state. */
3174 raw_spin_lock_irqsave(&rnp->lock, flags); 3302 raw_spin_lock_irqsave(&rnp->lock, flags);
3175 rdp->beenonline = 1; /* We have now been online. */ 3303 rdp->beenonline = 1; /* We have now been online. */
3176 rdp->preemptible = preemptible;
3177 rdp->qlen_last_fqs_check = 0; 3304 rdp->qlen_last_fqs_check = 0;
3178 rdp->n_force_qs_snap = rsp->n_force_qs; 3305 rdp->n_force_qs_snap = rsp->n_force_qs;
3179 rdp->blimit = blimit; 3306 rdp->blimit = blimit;
@@ -3217,8 +3344,7 @@ static void rcu_prepare_cpu(int cpu)
3217 struct rcu_state *rsp; 3344 struct rcu_state *rsp;
3218 3345
3219 for_each_rcu_flavor(rsp) 3346 for_each_rcu_flavor(rsp)
3220 rcu_init_percpu_data(cpu, rsp, 3347 rcu_init_percpu_data(cpu, rsp);
3221 strcmp(rsp->name, "rcu_preempt") == 0);
3222} 3348}
3223 3349
3224/* 3350/*
@@ -3228,7 +3354,7 @@ static int rcu_cpu_notify(struct notifier_block *self,
3228 unsigned long action, void *hcpu) 3354 unsigned long action, void *hcpu)
3229{ 3355{
3230 long cpu = (long)hcpu; 3356 long cpu = (long)hcpu;
3231 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); 3357 struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
3232 struct rcu_node *rnp = rdp->mynode; 3358 struct rcu_node *rnp = rdp->mynode;
3233 struct rcu_state *rsp; 3359 struct rcu_state *rsp;
3234 3360
@@ -3402,8 +3528,8 @@ static void __init rcu_init_one(struct rcu_state *rsp,
3402 rnp->qsmaskinit = 0; 3528 rnp->qsmaskinit = 0;
3403 rnp->grplo = j * cpustride; 3529 rnp->grplo = j * cpustride;
3404 rnp->grphi = (j + 1) * cpustride - 1; 3530 rnp->grphi = (j + 1) * cpustride - 1;
3405 if (rnp->grphi >= NR_CPUS) 3531 if (rnp->grphi >= nr_cpu_ids)
3406 rnp->grphi = NR_CPUS - 1; 3532 rnp->grphi = nr_cpu_ids - 1;
3407 if (i == 0) { 3533 if (i == 0) {
3408 rnp->grpnum = 0; 3534 rnp->grpnum = 0;
3409 rnp->grpmask = 0; 3535 rnp->grpmask = 0;
@@ -3422,7 +3548,6 @@ static void __init rcu_init_one(struct rcu_state *rsp,
3422 3548
3423 rsp->rda = rda; 3549 rsp->rda = rda;
3424 init_waitqueue_head(&rsp->gp_wq); 3550 init_waitqueue_head(&rsp->gp_wq);
3425 init_irq_work(&rsp->wakeup_work, rsp_wakeup);
3426 rnp = rsp->level[rcu_num_lvls - 1]; 3551 rnp = rsp->level[rcu_num_lvls - 1];
3427 for_each_possible_cpu(i) { 3552 for_each_possible_cpu(i) {
3428 while (i > rnp->grphi) 3553 while (i > rnp->grphi)
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 75dc3c39a02a..bf2c1e669691 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -252,7 +252,6 @@ struct rcu_data {
252 bool passed_quiesce; /* User-mode/idle loop etc. */ 252 bool passed_quiesce; /* User-mode/idle loop etc. */
253 bool qs_pending; /* Core waits for quiesc state. */ 253 bool qs_pending; /* Core waits for quiesc state. */
254 bool beenonline; /* CPU online at least once. */ 254 bool beenonline; /* CPU online at least once. */
255 bool preemptible; /* Preemptible RCU? */
256 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ 255 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
257 unsigned long grpmask; /* Mask to apply to leaf qsmask. */ 256 unsigned long grpmask; /* Mask to apply to leaf qsmask. */
258#ifdef CONFIG_RCU_CPU_STALL_INFO 257#ifdef CONFIG_RCU_CPU_STALL_INFO
@@ -406,7 +405,8 @@ struct rcu_state {
406 unsigned long completed; /* # of last completed gp. */ 405 unsigned long completed; /* # of last completed gp. */
407 struct task_struct *gp_kthread; /* Task for grace periods. */ 406 struct task_struct *gp_kthread; /* Task for grace periods. */
408 wait_queue_head_t gp_wq; /* Where GP task waits. */ 407 wait_queue_head_t gp_wq; /* Where GP task waits. */
409 int gp_flags; /* Commands for GP task. */ 408 short gp_flags; /* Commands for GP task. */
409 short gp_state; /* GP kthread sleep state. */
410 410
411 /* End of fields guarded by root rcu_node's lock. */ 411 /* End of fields guarded by root rcu_node's lock. */
412 412
@@ -462,13 +462,17 @@ struct rcu_state {
462 const char *name; /* Name of structure. */ 462 const char *name; /* Name of structure. */
463 char abbr; /* Abbreviated name. */ 463 char abbr; /* Abbreviated name. */
464 struct list_head flavors; /* List of RCU flavors. */ 464 struct list_head flavors; /* List of RCU flavors. */
465 struct irq_work wakeup_work; /* Postponed wakeups */
466}; 465};
467 466
468/* Values for rcu_state structure's gp_flags field. */ 467/* Values for rcu_state structure's gp_flags field. */
469#define RCU_GP_FLAG_INIT 0x1 /* Need grace-period initialization. */ 468#define RCU_GP_FLAG_INIT 0x1 /* Need grace-period initialization. */
470#define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */ 469#define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */
471 470
 471/* Values for rcu_state structure's gp_state field. */
472#define RCU_GP_WAIT_INIT 0 /* Initial state. */
473#define RCU_GP_WAIT_GPS 1 /* Wait for grace-period start. */
474#define RCU_GP_WAIT_FQS 2 /* Wait for force-quiescent-state time. */
475
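These RCU_GP_WAIT_* values back the new ->gp_state field, which rcu_gp_kthread() sets just before each of its two sleeps (see the rsp->gp_state assignments earlier in this patch), presumably so debugging code can tell which wait a seemingly stuck kthread is in. A trivial userspace decoding sketch; the defines are copied for the model and the state names are paraphrased:

#include <stdio.h>

#define RCU_GP_WAIT_INIT 0	/* Initial state. */
#define RCU_GP_WAIT_GPS  1	/* Wait for grace-period start. */
#define RCU_GP_WAIT_FQS  2	/* Wait for force-quiescent-state time. */

static const char *gp_state_name(short gp_state)
{
	switch (gp_state) {
	case RCU_GP_WAIT_INIT: return "initial";
	case RCU_GP_WAIT_GPS:  return "waiting for a grace-period request";
	case RCU_GP_WAIT_FQS:  return "waiting for the next FQS scan time";
	default:               return "unknown";
	}
}

int main(void)
{
	short gp_state = RCU_GP_WAIT_GPS;	/* As set before the reqwait sleep. */

	printf("gp_state=%d: %s\n", gp_state, gp_state_name(gp_state));
	return 0;
}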
472extern struct list_head rcu_struct_flavors; 476extern struct list_head rcu_struct_flavors;
473 477
474/* Sequence through rcu_state structures for each RCU flavor. */ 478/* Sequence through rcu_state structures for each RCU flavor. */
@@ -547,7 +551,6 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu);
547static void print_cpu_stall_info_end(void); 551static void print_cpu_stall_info_end(void);
548static void zero_cpu_stall_ticks(struct rcu_data *rdp); 552static void zero_cpu_stall_ticks(struct rcu_data *rdp);
549static void increment_cpu_stall_ticks(void); 553static void increment_cpu_stall_ticks(void);
550static int rcu_nocb_needs_gp(struct rcu_state *rsp);
551static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); 554static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq);
552static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp); 555static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp);
553static void rcu_init_one_nocb(struct rcu_node *rnp); 556static void rcu_init_one_nocb(struct rcu_node *rnp);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 962d1d589929..cbc2c45265e2 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -116,7 +116,7 @@ static void __init rcu_bootup_announce_oddness(void)
116#ifdef CONFIG_TREE_PREEMPT_RCU 116#ifdef CONFIG_TREE_PREEMPT_RCU
117 117
118RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu); 118RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
119static struct rcu_state *rcu_state = &rcu_preempt_state; 119static struct rcu_state *rcu_state_p = &rcu_preempt_state;
120 120
121static int rcu_preempted_readers_exp(struct rcu_node *rnp); 121static int rcu_preempted_readers_exp(struct rcu_node *rnp);
122 122
@@ -149,15 +149,6 @@ long rcu_batches_completed(void)
149EXPORT_SYMBOL_GPL(rcu_batches_completed); 149EXPORT_SYMBOL_GPL(rcu_batches_completed);
150 150
151/* 151/*
152 * Force a quiescent state for preemptible RCU.
153 */
154void rcu_force_quiescent_state(void)
155{
156 force_quiescent_state(&rcu_preempt_state);
157}
158EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
159
160/*
161 * Record a preemptible-RCU quiescent state for the specified CPU. Note 152 * Record a preemptible-RCU quiescent state for the specified CPU. Note
162 * that this just means that the task currently running on the CPU is 153 * that this just means that the task currently running on the CPU is
163 * not in a quiescent state. There might be any number of tasks blocked 154 * not in a quiescent state. There might be any number of tasks blocked
@@ -688,20 +679,6 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
688} 679}
689EXPORT_SYMBOL_GPL(call_rcu); 680EXPORT_SYMBOL_GPL(call_rcu);
690 681
691/*
692 * Queue an RCU callback for lazy invocation after a grace period.
693 * This will likely be later named something like "call_rcu_lazy()",
694 * but this change will require some way of tagging the lazy RCU
695 * callbacks in the list of pending callbacks. Until then, this
696 * function may only be called from __kfree_rcu().
697 */
698void kfree_call_rcu(struct rcu_head *head,
699 void (*func)(struct rcu_head *rcu))
700{
701 __call_rcu(head, func, &rcu_preempt_state, -1, 1);
702}
703EXPORT_SYMBOL_GPL(kfree_call_rcu);
704
705/** 682/**
706 * synchronize_rcu - wait until a grace period has elapsed. 683 * synchronize_rcu - wait until a grace period has elapsed.
707 * 684 *
@@ -970,7 +947,7 @@ void exit_rcu(void)
970 947
971#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 948#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
972 949
973static struct rcu_state *rcu_state = &rcu_sched_state; 950static struct rcu_state *rcu_state_p = &rcu_sched_state;
974 951
975/* 952/*
976 * Tell them what RCU they are running. 953 * Tell them what RCU they are running.
@@ -991,16 +968,6 @@ long rcu_batches_completed(void)
991EXPORT_SYMBOL_GPL(rcu_batches_completed); 968EXPORT_SYMBOL_GPL(rcu_batches_completed);
992 969
993/* 970/*
994 * Force a quiescent state for RCU, which, because there is no preemptible
995 * RCU, becomes the same as rcu-sched.
996 */
997void rcu_force_quiescent_state(void)
998{
999 rcu_sched_force_quiescent_state();
1000}
1001EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
1002
1003/*
1004 * Because preemptible RCU does not exist, we never have to check for 971 * Because preemptible RCU does not exist, we never have to check for
1005 * CPUs being in quiescent states. 972 * CPUs being in quiescent states.
1006 */ 973 */
@@ -1080,22 +1047,6 @@ static void rcu_preempt_check_callbacks(int cpu)
1080} 1047}
1081 1048
1082/* 1049/*
1083 * Queue an RCU callback for lazy invocation after a grace period.
1084 * This will likely be later named something like "call_rcu_lazy()",
1085 * but this change will require some way of tagging the lazy RCU
1086 * callbacks in the list of pending callbacks. Until then, this
1087 * function may only be called from __kfree_rcu().
1088 *
1089 * Because there is no preemptible RCU, we use RCU-sched instead.
1090 */
1091void kfree_call_rcu(struct rcu_head *head,
1092 void (*func)(struct rcu_head *rcu))
1093{
1094 __call_rcu(head, func, &rcu_sched_state, -1, 1);
1095}
1096EXPORT_SYMBOL_GPL(kfree_call_rcu);
1097
1098/*
1099 * Wait for an rcu-preempt grace period, but make it happen quickly. 1050 * Wait for an rcu-preempt grace period, but make it happen quickly.
1100 * But because preemptible RCU does not exist, map to rcu-sched. 1051 * But because preemptible RCU does not exist, map to rcu-sched.
1101 */ 1052 */
@@ -1517,11 +1468,11 @@ static int __init rcu_spawn_kthreads(void)
1517 for_each_possible_cpu(cpu) 1468 for_each_possible_cpu(cpu)
1518 per_cpu(rcu_cpu_has_work, cpu) = 0; 1469 per_cpu(rcu_cpu_has_work, cpu) = 0;
1519 BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec)); 1470 BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
1520 rnp = rcu_get_root(rcu_state); 1471 rnp = rcu_get_root(rcu_state_p);
1521 (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); 1472 (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
1522 if (NUM_RCU_NODES > 1) { 1473 if (NUM_RCU_NODES > 1) {
1523 rcu_for_each_leaf_node(rcu_state, rnp) 1474 rcu_for_each_leaf_node(rcu_state_p, rnp)
1524 (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); 1475 (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
1525 } 1476 }
1526 return 0; 1477 return 0;
1527} 1478}
@@ -1529,12 +1480,12 @@ early_initcall(rcu_spawn_kthreads);
1529 1480
1530static void rcu_prepare_kthreads(int cpu) 1481static void rcu_prepare_kthreads(int cpu)
1531{ 1482{
1532 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); 1483 struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
1533 struct rcu_node *rnp = rdp->mynode; 1484 struct rcu_node *rnp = rdp->mynode;
1534 1485
1535 /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */ 1486 /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
1536 if (rcu_scheduler_fully_active) 1487 if (rcu_scheduler_fully_active)
1537 (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); 1488 (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
1538} 1489}
1539 1490
1540#else /* #ifdef CONFIG_RCU_BOOST */ 1491#else /* #ifdef CONFIG_RCU_BOOST */
@@ -1744,6 +1695,7 @@ int rcu_needs_cpu(int cpu, unsigned long *dj)
1744static void rcu_prepare_for_idle(int cpu) 1695static void rcu_prepare_for_idle(int cpu)
1745{ 1696{
1746#ifndef CONFIG_RCU_NOCB_CPU_ALL 1697#ifndef CONFIG_RCU_NOCB_CPU_ALL
1698 bool needwake;
1747 struct rcu_data *rdp; 1699 struct rcu_data *rdp;
1748 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); 1700 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
1749 struct rcu_node *rnp; 1701 struct rcu_node *rnp;
@@ -1792,8 +1744,10 @@ static void rcu_prepare_for_idle(int cpu)
1792 rnp = rdp->mynode; 1744 rnp = rdp->mynode;
1793 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 1745 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
1794 smp_mb__after_unlock_lock(); 1746 smp_mb__after_unlock_lock();
1795 rcu_accelerate_cbs(rsp, rnp, rdp); 1747 needwake = rcu_accelerate_cbs(rsp, rnp, rdp);
1796 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1748 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1749 if (needwake)
1750 rcu_gp_kthread_wake(rsp);
1797 } 1751 }
1798#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ 1752#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
1799} 1753}
@@ -1855,7 +1809,7 @@ static void rcu_oom_notify_cpu(void *unused)
1855 struct rcu_data *rdp; 1809 struct rcu_data *rdp;
1856 1810
1857 for_each_rcu_flavor(rsp) { 1811 for_each_rcu_flavor(rsp) {
1858 rdp = __this_cpu_ptr(rsp->rda); 1812 rdp = raw_cpu_ptr(rsp->rda);
1859 if (rdp->qlen_lazy != 0) { 1813 if (rdp->qlen_lazy != 0) {
1860 atomic_inc(&oom_callback_count); 1814 atomic_inc(&oom_callback_count);
1861 rsp->call(&rdp->oom_head, rcu_oom_callback); 1815 rsp->call(&rdp->oom_head, rcu_oom_callback);
@@ -1997,7 +1951,7 @@ static void increment_cpu_stall_ticks(void)
1997 struct rcu_state *rsp; 1951 struct rcu_state *rsp;
1998 1952
1999 for_each_rcu_flavor(rsp) 1953 for_each_rcu_flavor(rsp)
2000 __this_cpu_ptr(rsp->rda)->ticks_this_gp++; 1954 raw_cpu_inc(rsp->rda->ticks_this_gp);
2001} 1955}
2002 1956
2003#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ 1957#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
@@ -2068,19 +2022,6 @@ static int __init parse_rcu_nocb_poll(char *arg)
2068early_param("rcu_nocb_poll", parse_rcu_nocb_poll); 2022early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
2069 2023
2070/* 2024/*
2071 * Do any no-CBs CPUs need another grace period?
2072 *
2073 * Interrupts must be disabled. If the caller does not hold the root
2074 * rnp_node structure's ->lock, the results are advisory only.
2075 */
2076static int rcu_nocb_needs_gp(struct rcu_state *rsp)
2077{
2078 struct rcu_node *rnp = rcu_get_root(rsp);
2079
2080 return rnp->need_future_gp[(ACCESS_ONCE(rnp->completed) + 1) & 0x1];
2081}
2082
2083/*
2084 * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended 2025 * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended
2085 * grace period. 2026 * grace period.
2086 */ 2027 */
@@ -2109,7 +2050,7 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)
2109} 2050}
2110 2051
2111#ifndef CONFIG_RCU_NOCB_CPU_ALL 2052#ifndef CONFIG_RCU_NOCB_CPU_ALL
2112/* Is the specified CPU a no-CPUs CPU? */ 2053/* Is the specified CPU a no-CBs CPU? */
2113bool rcu_is_nocb_cpu(int cpu) 2054bool rcu_is_nocb_cpu(int cpu)
2114{ 2055{
2115 if (have_rcu_nocb_mask) 2056 if (have_rcu_nocb_mask)
@@ -2243,12 +2184,15 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
2243 unsigned long c; 2184 unsigned long c;
2244 bool d; 2185 bool d;
2245 unsigned long flags; 2186 unsigned long flags;
2187 bool needwake;
2246 struct rcu_node *rnp = rdp->mynode; 2188 struct rcu_node *rnp = rdp->mynode;
2247 2189
2248 raw_spin_lock_irqsave(&rnp->lock, flags); 2190 raw_spin_lock_irqsave(&rnp->lock, flags);
2249 smp_mb__after_unlock_lock(); 2191 smp_mb__after_unlock_lock();
2250 c = rcu_start_future_gp(rnp, rdp); 2192 needwake = rcu_start_future_gp(rnp, rdp, &c);
2251 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2193 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2194 if (needwake)
2195 rcu_gp_kthread_wake(rdp->rsp);
2252 2196
2253 /* 2197 /*
2254 * Wait for the grace period. Do so interruptibly to avoid messing 2198 * Wait for the grace period. Do so interruptibly to avoid messing
@@ -2402,11 +2346,6 @@ static bool init_nocb_callback_list(struct rcu_data *rdp)
2402 2346
2403#else /* #ifdef CONFIG_RCU_NOCB_CPU */ 2347#else /* #ifdef CONFIG_RCU_NOCB_CPU */
2404 2348
2405static int rcu_nocb_needs_gp(struct rcu_state *rsp)
2406{
2407 return 0;
2408}
2409
2410static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) 2349static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
2411{ 2350{
2412} 2351}
@@ -2523,9 +2462,9 @@ static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq)
2523 /* Record start of fully idle period. */ 2462 /* Record start of fully idle period. */
2524 j = jiffies; 2463 j = jiffies;
2525 ACCESS_ONCE(rdtp->dynticks_idle_jiffies) = j; 2464 ACCESS_ONCE(rdtp->dynticks_idle_jiffies) = j;
2526 smp_mb__before_atomic_inc(); 2465 smp_mb__before_atomic();
2527 atomic_inc(&rdtp->dynticks_idle); 2466 atomic_inc(&rdtp->dynticks_idle);
2528 smp_mb__after_atomic_inc(); 2467 smp_mb__after_atomic();
2529 WARN_ON_ONCE(atomic_read(&rdtp->dynticks_idle) & 0x1); 2468 WARN_ON_ONCE(atomic_read(&rdtp->dynticks_idle) & 0x1);
2530} 2469}
2531 2470
@@ -2590,9 +2529,9 @@ static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq)
2590 } 2529 }
2591 2530
2592 /* Record end of idle period. */ 2531 /* Record end of idle period. */
2593 smp_mb__before_atomic_inc(); 2532 smp_mb__before_atomic();
2594 atomic_inc(&rdtp->dynticks_idle); 2533 atomic_inc(&rdtp->dynticks_idle);
2595 smp_mb__after_atomic_inc(); 2534 smp_mb__after_atomic();
2596 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks_idle) & 0x1)); 2535 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks_idle) & 0x1));
2597 2536
2598 /* 2537 /*
@@ -2657,20 +2596,6 @@ static bool is_sysidle_rcu_state(struct rcu_state *rsp)
2657} 2596}
2658 2597
2659/* 2598/*
2660 * Bind the grace-period kthread for the sysidle flavor of RCU to the
2661 * timekeeping CPU.
2662 */
2663static void rcu_bind_gp_kthread(void)
2664{
2665 int cpu = ACCESS_ONCE(tick_do_timer_cpu);
2666
2667 if (cpu < 0 || cpu >= nr_cpu_ids)
2668 return;
2669 if (raw_smp_processor_id() != cpu)
2670 set_cpus_allowed_ptr(current, cpumask_of(cpu));
2671}
2672
2673/*
2674 * Return a delay in jiffies based on the number of CPUs, rcu_node 2599 * Return a delay in jiffies based on the number of CPUs, rcu_node
2675 * leaf fanout, and jiffies tick rate. The idea is to allow larger 2600 * leaf fanout, and jiffies tick rate. The idea is to allow larger
2676 * systems more time to transition to full-idle state in order to 2601 * systems more time to transition to full-idle state in order to
@@ -2734,7 +2659,8 @@ static void rcu_sysidle(unsigned long j)
2734static void rcu_sysidle_cancel(void) 2659static void rcu_sysidle_cancel(void)
2735{ 2660{
2736 smp_mb(); 2661 smp_mb();
2737 ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT; 2662 if (full_sysidle_state > RCU_SYSIDLE_SHORT)
2663 ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT;
2738} 2664}
2739 2665
2740/* 2666/*
@@ -2880,10 +2806,6 @@ static bool is_sysidle_rcu_state(struct rcu_state *rsp)
2880 return false; 2806 return false;
2881} 2807}
2882 2808
2883static void rcu_bind_gp_kthread(void)
2884{
2885}
2886
2887static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, 2809static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
2888 unsigned long maxj) 2810 unsigned long maxj)
2889{ 2811{
@@ -2914,3 +2836,19 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp)
2914#endif /* #ifdef CONFIG_NO_HZ_FULL */ 2836#endif /* #ifdef CONFIG_NO_HZ_FULL */
2915 return 0; 2837 return 0;
2916} 2838}
2839
2840/*
2841 * Bind the grace-period kthread for the sysidle flavor of RCU to the
2842 * timekeeping CPU.
2843 */
2844static void rcu_bind_gp_kthread(void)
2845{
2846#ifdef CONFIG_NO_HZ_FULL
2847 int cpu = ACCESS_ONCE(tick_do_timer_cpu);
2848
2849 if (cpu < 0 || cpu >= nr_cpu_ids)
2850 return;
2851 if (raw_smp_processor_id() != cpu)
2852 set_cpus_allowed_ptr(current, cpumask_of(cpu));
2853#endif /* #ifdef CONFIG_NO_HZ_FULL */
2854}
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 4c0a9b0af469..a2aeb4df0f60 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -320,6 +320,18 @@ int rcu_jiffies_till_stall_check(void)
320 return till_stall_check * HZ + RCU_STALL_DELAY_DELTA; 320 return till_stall_check * HZ + RCU_STALL_DELAY_DELTA;
321} 321}
322 322
323void rcu_sysrq_start(void)
324{
325 if (!rcu_cpu_stall_suppress)
326 rcu_cpu_stall_suppress = 2;
327}
328
329void rcu_sysrq_end(void)
330{
331 if (rcu_cpu_stall_suppress == 2)
332 rcu_cpu_stall_suppress = 0;
333}
334
323static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) 335static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
324{ 336{
325 rcu_cpu_stall_suppress = 1; 337 rcu_cpu_stall_suppress = 1;
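The hunk above adds rcu_sysrq_start()/rcu_sysrq_end(), which raise the stall-suppression flag to 2 around a long console dump and restore it afterwards while leaving an explicit user setting of 1 alone. A hedged sketch of the intended call pattern, assuming the companion patch wires these into the sysrq show-state handler and that the declarations land in linux/rcupdate.h; the handler below is only an illustration:

    #include <linux/rcupdate.h>
    #include <linux/sched.h>

    static void sysrq_show_tasks(int key)
    {
            rcu_sysrq_start();      /* raise rcu_cpu_stall_suppress to 2 */
            show_state();           /* slow, console-bound dump of all tasks */
            rcu_sysrq_end();        /* drop back to 0 unless the user set 1 */
    }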
@@ -338,3 +350,21 @@ static int __init check_cpu_stall_init(void)
338early_initcall(check_cpu_stall_init); 350early_initcall(check_cpu_stall_init);
339 351
340#endif /* #ifdef CONFIG_RCU_STALL_COMMON */ 352#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
353
354/*
355 * Hooks for cond_resched() and friends to avoid RCU CPU stall warnings.
356 */
357
358DEFINE_PER_CPU(int, rcu_cond_resched_count);
359
360/*
361 * Report a set of RCU quiescent states, for use by cond_resched()
362 * and friends. Out of line due to being called infrequently.
363 */
364void rcu_resched(void)
365{
366 preempt_disable();
367 __this_cpu_write(rcu_cond_resched_count, 0);
368 rcu_note_context_switch(smp_processor_id());
369 preempt_enable();
370}
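The new per-CPU rcu_cond_resched_count plus the out-of-line rcu_resched() give cond_resched() a cheap way to report an RCU quiescent state once a CPU has gone too long without one. A hedged sketch of what the companion inline helpers used by kernel/sched/core.c further down plausibly look like; the threshold constant and exact spelling are assumptions, not taken from this diff:

    #include <linux/percpu.h>

    #define RCU_COND_RESCHED_LIM    256     /* assumed threshold */

    DECLARE_PER_CPU(int, rcu_cond_resched_count);
    void rcu_resched(void);

    /* Cheap check, used by __cond_resched_lock() below. */
    static inline bool rcu_should_resched(void)
    {
            return raw_cpu_read(rcu_cond_resched_count) >= RCU_COND_RESCHED_LIM;
    }

    /* Called from _cond_resched() and friends on every invocation. */
    static inline void rcu_cond_resched(void)
    {
            if (unlikely(raw_cpu_inc_return(rcu_cond_resched_count) >=
                         RCU_COND_RESCHED_LIM))
                    rcu_resched();  /* resets the counter, reports a QS */
    }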
diff --git a/kernel/reboot.c b/kernel/reboot.c
index 662c83fc16b7..a3a9e240fcdb 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -388,15 +388,22 @@ static int __init reboot_setup(char *str)
388 break; 388 break;
389 389
390 case 's': 390 case 's':
391 if (isdigit(*(str+1))) 391 {
392 reboot_cpu = simple_strtoul(str+1, NULL, 0); 392 int rc;
393 else if (str[1] == 'm' && str[2] == 'p' && 393
394 isdigit(*(str+3))) 394 if (isdigit(*(str+1))) {
395 reboot_cpu = simple_strtoul(str+3, NULL, 0); 395 rc = kstrtoint(str+1, 0, &reboot_cpu);
396 else 396 if (rc)
397 return rc;
398 } else if (str[1] == 'm' && str[2] == 'p' &&
399 isdigit(*(str+3))) {
400 rc = kstrtoint(str+3, 0, &reboot_cpu);
401 if (rc)
402 return rc;
403 } else
397 reboot_mode = REBOOT_SOFT; 404 reboot_mode = REBOOT_SOFT;
398 break; 405 break;
399 406 }
400 case 'g': 407 case 'g':
401 reboot_mode = REBOOT_GPIO; 408 reboot_mode = REBOOT_GPIO;
402 break; 409 break;
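The reboot_setup() change replaces simple_strtoul(), which cannot report malformed input, with kstrtoint(), which returns -EINVAL or -ERANGE so the parse error can be propagated. A minimal sketch of the checked-parsing pattern; the helper name is illustrative:

    #include <linux/kernel.h>       /* kstrtoint() */

    static int parse_reboot_cpu(const char *s, int *cpu)
    {
            int rc = kstrtoint(s, 0, cpu);  /* base 0: decimal, 0x.. or 0.. */

            if (rc)
                    return rc;              /* -EINVAL or -ERANGE */
            return 0;
    }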
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index 51dbac6a3633..e791130f85a7 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -186,8 +186,11 @@ int res_counter_memparse_write_strategy(const char *buf,
186 186
187 /* return RES_COUNTER_MAX(unlimited) if "-1" is specified */ 187 /* return RES_COUNTER_MAX(unlimited) if "-1" is specified */
188 if (*buf == '-') { 188 if (*buf == '-') {
189 res = simple_strtoull(buf + 1, &end, 10); 189 int rc = kstrtoull(buf + 1, 10, &res);
190 if (res != 1 || *end != '\0') 190
191 if (rc)
192 return rc;
193 if (res != 1)
191 return -EINVAL; 194 return -EINVAL;
192 *resp = RES_COUNTER_MAX; 195 *resp = RES_COUNTER_MAX;
193 return 0; 196 return 0;
diff --git a/kernel/resource.c b/kernel/resource.c
index 8957d686e29b..3c2237ac32db 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -1288,13 +1288,10 @@ int iomem_map_sanity_check(resource_size_t addr, unsigned long size)
1288 if (p->flags & IORESOURCE_BUSY) 1288 if (p->flags & IORESOURCE_BUSY)
1289 continue; 1289 continue;
1290 1290
1291 printk(KERN_WARNING "resource map sanity check conflict: " 1291 printk(KERN_WARNING "resource sanity check: requesting [mem %#010llx-%#010llx], which spans more than %s %pR\n",
1292 "0x%llx 0x%llx 0x%llx 0x%llx %s\n",
1293 (unsigned long long)addr, 1292 (unsigned long long)addr,
1294 (unsigned long long)(addr + size - 1), 1293 (unsigned long long)(addr + size - 1),
1295 (unsigned long long)p->start, 1294 p->name, p);
1296 (unsigned long long)p->end,
1297 p->name);
1298 err = -1; 1295 err = -1;
1299 break; 1296 break;
1300 } 1297 }
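The resource.c hunk folds a multi-line hex dump into a single line by using the %pR printk extension, which formats a struct resource (type, range, flags) on its own. A hedged sketch of the idiom; the helper name is illustrative:

    #include <linux/ioport.h>
    #include <linux/printk.h>

    static void report_span_conflict(resource_size_t addr, resource_size_t size,
                                     struct resource *p)
    {
            pr_warn("request [mem %#010llx-%#010llx] spans more than %s %pR\n",
                    (unsigned long long)addr,
                    (unsigned long long)(addr + size - 1),
                    p->name, p);
    }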
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 268a45ea238c..3bdf01b494fe 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -90,6 +90,22 @@
90#define CREATE_TRACE_POINTS 90#define CREATE_TRACE_POINTS
91#include <trace/events/sched.h> 91#include <trace/events/sched.h>
92 92
93#ifdef smp_mb__before_atomic
94void __smp_mb__before_atomic(void)
95{
96 smp_mb__before_atomic();
97}
98EXPORT_SYMBOL(__smp_mb__before_atomic);
99#endif
100
101#ifdef smp_mb__after_atomic
102void __smp_mb__after_atomic(void)
103{
104 smp_mb__after_atomic();
105}
106EXPORT_SYMBOL(__smp_mb__after_atomic);
107#endif
108
93void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) 109void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
94{ 110{
95 unsigned long delta; 111 unsigned long delta;
@@ -506,6 +522,71 @@ static inline void init_hrtick(void)
506#endif /* CONFIG_SCHED_HRTICK */ 522#endif /* CONFIG_SCHED_HRTICK */
507 523
508/* 524/*
525 * cmpxchg based fetch_or, macro so it works for different integer types
526 */
527#define fetch_or(ptr, val) \
528({ typeof(*(ptr)) __old, __val = *(ptr); \
529 for (;;) { \
530 __old = cmpxchg((ptr), __val, __val | (val)); \
531 if (__old == __val) \
532 break; \
533 __val = __old; \
534 } \
535 __old; \
536})
537
538#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
539/*
540 * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
541 * this avoids any races wrt polling state changes and thereby avoids
542 * spurious IPIs.
543 */
544static bool set_nr_and_not_polling(struct task_struct *p)
545{
546 struct thread_info *ti = task_thread_info(p);
547 return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
548}
549
550/*
551 * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set.
552 *
553 * If this returns true, then the idle task promises to call
554 * sched_ttwu_pending() and reschedule soon.
555 */
556static bool set_nr_if_polling(struct task_struct *p)
557{
558 struct thread_info *ti = task_thread_info(p);
559 typeof(ti->flags) old, val = ACCESS_ONCE(ti->flags);
560
561 for (;;) {
562 if (!(val & _TIF_POLLING_NRFLAG))
563 return false;
564 if (val & _TIF_NEED_RESCHED)
565 return true;
566 old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
567 if (old == val)
568 break;
569 val = old;
570 }
571 return true;
572}
573
574#else
575static bool set_nr_and_not_polling(struct task_struct *p)
576{
577 set_tsk_need_resched(p);
578 return true;
579}
580
581#ifdef CONFIG_SMP
582static bool set_nr_if_polling(struct task_struct *p)
583{
584 return false;
585}
586#endif
587#endif
588
589/*
509 * resched_task - mark a task 'to be rescheduled now'. 590 * resched_task - mark a task 'to be rescheduled now'.
510 * 591 *
511 * On UP this means the setting of the need_resched flag, on SMP it 592 * On UP this means the setting of the need_resched flag, on SMP it
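The hunk above introduces a cmpxchg()-based fetch_or() plus set_nr_and_not_polling()/set_nr_if_polling(), so a remote wakeup can set TIF_NEED_RESCHED and learn in one atomic step whether the target CPU is polling and the IPI can be skipped. A hedged userspace analogue of the fetch_or() loop, written with C11 atomics for a fixed type (the kernel macro works on any integer type):

    #include <stdatomic.h>
    #include <stdio.h>

    /* Atomically OR 'val' into *p and return the value seen beforehand. */
    static unsigned long fetch_or(_Atomic unsigned long *p, unsigned long val)
    {
            unsigned long old = atomic_load(p);

            /* On failure, 'old' is reloaded with the current value. */
            while (!atomic_compare_exchange_weak(p, &old, old | val))
                    ;
            return old;
    }

    int main(void)
    {
            _Atomic unsigned long flags = 0x1;

            printf("previous: %#lx\n", fetch_or(&flags, 0x4));  /* 0x1 */
            printf("current:  %#lx\n", atomic_load(&flags));    /* 0x5 */
            return 0;
    }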
@@ -521,18 +602,18 @@ void resched_task(struct task_struct *p)
521 if (test_tsk_need_resched(p)) 602 if (test_tsk_need_resched(p))
522 return; 603 return;
523 604
524 set_tsk_need_resched(p);
525
526 cpu = task_cpu(p); 605 cpu = task_cpu(p);
606
527 if (cpu == smp_processor_id()) { 607 if (cpu == smp_processor_id()) {
608 set_tsk_need_resched(p);
528 set_preempt_need_resched(); 609 set_preempt_need_resched();
529 return; 610 return;
530 } 611 }
531 612
532 /* NEED_RESCHED must be visible before we test polling */ 613 if (set_nr_and_not_polling(p))
533 smp_mb();
534 if (!tsk_is_polling(p))
535 smp_send_reschedule(cpu); 614 smp_send_reschedule(cpu);
615 else
616 trace_sched_wake_idle_without_ipi(cpu);
536} 617}
537 618
538void resched_cpu(int cpu) 619void resched_cpu(int cpu)
@@ -595,27 +676,10 @@ static void wake_up_idle_cpu(int cpu)
595 if (cpu == smp_processor_id()) 676 if (cpu == smp_processor_id())
596 return; 677 return;
597 678
598 /* 679 if (set_nr_and_not_polling(rq->idle))
599 * This is safe, as this function is called with the timer
600 * wheel base lock of (cpu) held. When the CPU is on the way
601 * to idle and has not yet set rq->curr to idle then it will
602 * be serialized on the timer wheel base lock and take the new
603 * timer into account automatically.
604 */
605 if (rq->curr != rq->idle)
606 return;
607
608 /*
609 * We can set TIF_RESCHED on the idle task of the other CPU
610 * lockless. The worst case is that the other CPU runs the
611 * idle task through an additional NOOP schedule()
612 */
613 set_tsk_need_resched(rq->idle);
614
615 /* NEED_RESCHED must be visible before we test polling */
616 smp_mb();
617 if (!tsk_is_polling(rq->idle))
618 smp_send_reschedule(cpu); 680 smp_send_reschedule(cpu);
681 else
682 trace_sched_wake_idle_without_ipi(cpu);
619} 683}
620 684
621static bool wake_up_full_nohz_cpu(int cpu) 685static bool wake_up_full_nohz_cpu(int cpu)
@@ -841,7 +905,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
841 rq->clock_task += delta; 905 rq->clock_task += delta;
842 906
843#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) 907#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
844 if ((irq_delta + steal) && sched_feat(NONTASK_POWER)) 908 if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
845 sched_rt_avg_update(rq, irq_delta + steal); 909 sched_rt_avg_update(rq, irq_delta + steal);
846#endif 910#endif
847} 911}
@@ -1320,7 +1384,7 @@ out:
1320 * leave kernel. 1384 * leave kernel.
1321 */ 1385 */
1322 if (p->mm && printk_ratelimit()) { 1386 if (p->mm && printk_ratelimit()) {
1323 printk_sched("process %d (%s) no longer affine to cpu%d\n", 1387 printk_deferred("process %d (%s) no longer affine to cpu%d\n",
1324 task_pid_nr(p), p->comm, cpu); 1388 task_pid_nr(p), p->comm, cpu);
1325 } 1389 }
1326 } 1390 }
@@ -1474,13 +1538,17 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
1474} 1538}
1475 1539
1476#ifdef CONFIG_SMP 1540#ifdef CONFIG_SMP
1477static void sched_ttwu_pending(void) 1541void sched_ttwu_pending(void)
1478{ 1542{
1479 struct rq *rq = this_rq(); 1543 struct rq *rq = this_rq();
1480 struct llist_node *llist = llist_del_all(&rq->wake_list); 1544 struct llist_node *llist = llist_del_all(&rq->wake_list);
1481 struct task_struct *p; 1545 struct task_struct *p;
1546 unsigned long flags;
1482 1547
1483 raw_spin_lock(&rq->lock); 1548 if (!llist)
1549 return;
1550
1551 raw_spin_lock_irqsave(&rq->lock, flags);
1484 1552
1485 while (llist) { 1553 while (llist) {
1486 p = llist_entry(llist, struct task_struct, wake_entry); 1554 p = llist_entry(llist, struct task_struct, wake_entry);
@@ -1488,7 +1556,7 @@ static void sched_ttwu_pending(void)
1488 ttwu_do_activate(rq, p, 0); 1556 ttwu_do_activate(rq, p, 0);
1489 } 1557 }
1490 1558
1491 raw_spin_unlock(&rq->lock); 1559 raw_spin_unlock_irqrestore(&rq->lock, flags);
1492} 1560}
1493 1561
1494void scheduler_ipi(void) 1562void scheduler_ipi(void)
@@ -1534,8 +1602,14 @@ void scheduler_ipi(void)
1534 1602
1535static void ttwu_queue_remote(struct task_struct *p, int cpu) 1603static void ttwu_queue_remote(struct task_struct *p, int cpu)
1536{ 1604{
1537 if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) 1605 struct rq *rq = cpu_rq(cpu);
1538 smp_send_reschedule(cpu); 1606
1607 if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
1608 if (!set_nr_if_polling(rq->idle))
1609 smp_send_reschedule(cpu);
1610 else
1611 trace_sched_wake_idle_without_ipi(cpu);
1612 }
1539} 1613}
1540 1614
1541bool cpus_share_cache(int this_cpu, int that_cpu) 1615bool cpus_share_cache(int this_cpu, int that_cpu)
@@ -2192,7 +2266,7 @@ static inline void post_schedule(struct rq *rq)
2192 * schedule_tail - first thing a freshly forked thread must call. 2266 * schedule_tail - first thing a freshly forked thread must call.
2193 * @prev: the thread we just switched away from. 2267 * @prev: the thread we just switched away from.
2194 */ 2268 */
2195asmlinkage void schedule_tail(struct task_struct *prev) 2269asmlinkage __visible void schedule_tail(struct task_struct *prev)
2196 __releases(rq->lock) 2270 __releases(rq->lock)
2197{ 2271{
2198 struct rq *rq = this_rq(); 2272 struct rq *rq = this_rq();
@@ -2480,7 +2554,7 @@ notrace unsigned long get_parent_ip(unsigned long addr)
2480#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ 2554#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
2481 defined(CONFIG_PREEMPT_TRACER)) 2555 defined(CONFIG_PREEMPT_TRACER))
2482 2556
2483void __kprobes preempt_count_add(int val) 2557void preempt_count_add(int val)
2484{ 2558{
2485#ifdef CONFIG_DEBUG_PREEMPT 2559#ifdef CONFIG_DEBUG_PREEMPT
2486 /* 2560 /*
@@ -2506,8 +2580,9 @@ void __kprobes preempt_count_add(int val)
2506 } 2580 }
2507} 2581}
2508EXPORT_SYMBOL(preempt_count_add); 2582EXPORT_SYMBOL(preempt_count_add);
2583NOKPROBE_SYMBOL(preempt_count_add);
2509 2584
2510void __kprobes preempt_count_sub(int val) 2585void preempt_count_sub(int val)
2511{ 2586{
2512#ifdef CONFIG_DEBUG_PREEMPT 2587#ifdef CONFIG_DEBUG_PREEMPT
2513 /* 2588 /*
@@ -2528,6 +2603,7 @@ void __kprobes preempt_count_sub(int val)
2528 __preempt_count_sub(val); 2603 __preempt_count_sub(val);
2529} 2604}
2530EXPORT_SYMBOL(preempt_count_sub); 2605EXPORT_SYMBOL(preempt_count_sub);
2606NOKPROBE_SYMBOL(preempt_count_sub);
2531 2607
2532#endif 2608#endif
2533 2609
@@ -2592,8 +2668,14 @@ pick_next_task(struct rq *rq, struct task_struct *prev)
2592 if (likely(prev->sched_class == class && 2668 if (likely(prev->sched_class == class &&
2593 rq->nr_running == rq->cfs.h_nr_running)) { 2669 rq->nr_running == rq->cfs.h_nr_running)) {
2594 p = fair_sched_class.pick_next_task(rq, prev); 2670 p = fair_sched_class.pick_next_task(rq, prev);
2595 if (likely(p && p != RETRY_TASK)) 2671 if (unlikely(p == RETRY_TASK))
2596 return p; 2672 goto again;
2673
2674 /* assumes fair_sched_class->next == idle_sched_class */
2675 if (unlikely(!p))
2676 p = idle_sched_class.pick_next_task(rq, prev);
2677
2678 return p;
2597 } 2679 }
2598 2680
2599again: 2681again:
@@ -2741,7 +2823,7 @@ static inline void sched_submit_work(struct task_struct *tsk)
2741 blk_schedule_flush_plug(tsk); 2823 blk_schedule_flush_plug(tsk);
2742} 2824}
2743 2825
2744asmlinkage void __sched schedule(void) 2826asmlinkage __visible void __sched schedule(void)
2745{ 2827{
2746 struct task_struct *tsk = current; 2828 struct task_struct *tsk = current;
2747 2829
@@ -2751,7 +2833,7 @@ asmlinkage void __sched schedule(void)
2751EXPORT_SYMBOL(schedule); 2833EXPORT_SYMBOL(schedule);
2752 2834
2753#ifdef CONFIG_CONTEXT_TRACKING 2835#ifdef CONFIG_CONTEXT_TRACKING
2754asmlinkage void __sched schedule_user(void) 2836asmlinkage __visible void __sched schedule_user(void)
2755{ 2837{
2756 /* 2838 /*
2757 * If we come here after a random call to set_need_resched(), 2839 * If we come here after a random call to set_need_resched(),
@@ -2783,7 +2865,7 @@ void __sched schedule_preempt_disabled(void)
2783 * off of preempt_enable. Kernel preemptions off return from interrupt 2865 * off of preempt_enable. Kernel preemptions off return from interrupt
2784 * occur there and call schedule directly. 2866 * occur there and call schedule directly.
2785 */ 2867 */
2786asmlinkage void __sched notrace preempt_schedule(void) 2868asmlinkage __visible void __sched notrace preempt_schedule(void)
2787{ 2869{
2788 /* 2870 /*
2789 * If there is a non-zero preempt_count or interrupts are disabled, 2871 * If there is a non-zero preempt_count or interrupts are disabled,
@@ -2804,6 +2886,7 @@ asmlinkage void __sched notrace preempt_schedule(void)
2804 barrier(); 2886 barrier();
2805 } while (need_resched()); 2887 } while (need_resched());
2806} 2888}
2889NOKPROBE_SYMBOL(preempt_schedule);
2807EXPORT_SYMBOL(preempt_schedule); 2890EXPORT_SYMBOL(preempt_schedule);
2808#endif /* CONFIG_PREEMPT */ 2891#endif /* CONFIG_PREEMPT */
2809 2892
@@ -2813,7 +2896,7 @@ EXPORT_SYMBOL(preempt_schedule);
2813 * Note, that this is called and return with irqs disabled. This will 2896 * Note, that this is called and return with irqs disabled. This will
2814 * protect us against recursive calling from irq. 2897 * protect us against recursive calling from irq.
2815 */ 2898 */
2816asmlinkage void __sched preempt_schedule_irq(void) 2899asmlinkage __visible void __sched preempt_schedule_irq(void)
2817{ 2900{
2818 enum ctx_state prev_state; 2901 enum ctx_state prev_state;
2819 2902
@@ -2996,7 +3079,7 @@ EXPORT_SYMBOL(set_user_nice);
2996int can_nice(const struct task_struct *p, const int nice) 3079int can_nice(const struct task_struct *p, const int nice)
2997{ 3080{
2998 /* convert nice value [19,-20] to rlimit style value [1,40] */ 3081 /* convert nice value [19,-20] to rlimit style value [1,40] */
2999 int nice_rlim = 20 - nice; 3082 int nice_rlim = nice_to_rlimit(nice);
3000 3083
3001 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || 3084 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
3002 capable(CAP_SYS_NICE)); 3085 capable(CAP_SYS_NICE));
@@ -3020,17 +3103,10 @@ SYSCALL_DEFINE1(nice, int, increment)
3020 * We don't have to worry. Conceptually one call occurs first 3103 * We don't have to worry. Conceptually one call occurs first
3021 * and we have a single winner. 3104 * and we have a single winner.
3022 */ 3105 */
3023 if (increment < -40) 3106 increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
3024 increment = -40;
3025 if (increment > 40)
3026 increment = 40;
3027
3028 nice = task_nice(current) + increment; 3107 nice = task_nice(current) + increment;
3029 if (nice < MIN_NICE)
3030 nice = MIN_NICE;
3031 if (nice > MAX_NICE)
3032 nice = MAX_NICE;
3033 3108
3109 nice = clamp_val(nice, MIN_NICE, MAX_NICE);
3034 if (increment < 0 && !can_nice(current, nice)) 3110 if (increment < 0 && !can_nice(current, nice))
3035 return -EPERM; 3111 return -EPERM;
3036 3112
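can_nice() now calls nice_to_rlimit() instead of open-coding 20 - nice, and sys_nice() clamps with clamp()/clamp_val(). A hedged sketch of the helper this series presumably introduces alongside MIN_NICE/MAX_NICE; its exact location and spelling are assumed:

    /* Map nice [MIN_NICE..MAX_NICE] = [-20..19] onto the RLIMIT_NICE
     * scale [40..1]; equivalent to the old open-coded "20 - nice". */
    static inline long nice_to_rlimit(long nice)
    {
            return MAX_NICE - nice + 1;
    }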
@@ -3124,6 +3200,7 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
3124 dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); 3200 dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
3125 dl_se->dl_throttled = 0; 3201 dl_se->dl_throttled = 0;
3126 dl_se->dl_new = 1; 3202 dl_se->dl_new = 1;
3203 dl_se->dl_yielded = 0;
3127} 3204}
3128 3205
3129static void __setscheduler_params(struct task_struct *p, 3206static void __setscheduler_params(struct task_struct *p,
@@ -3188,17 +3265,40 @@ __getparam_dl(struct task_struct *p, struct sched_attr *attr)
3188 * We ask for the deadline not being zero, and greater or equal 3265 * We ask for the deadline not being zero, and greater or equal
3189 * than the runtime, as well as the period of being zero or 3266 * than the runtime, as well as the period of being zero or
3190 * greater than deadline. Furthermore, we have to be sure that 3267 * greater than deadline. Furthermore, we have to be sure that
3191 * user parameters are above the internal resolution (1us); we 3268 * user parameters are above the internal resolution of 1us (we
3192 * check sched_runtime only since it is always the smaller one. 3269 * check sched_runtime only since it is always the smaller one) and
3270 * below 2^63 ns (we have to check both sched_deadline and
3271 * sched_period, as the latter can be zero).
3193 */ 3272 */
3194static bool 3273static bool
3195__checkparam_dl(const struct sched_attr *attr) 3274__checkparam_dl(const struct sched_attr *attr)
3196{ 3275{
3197 return attr && attr->sched_deadline != 0 && 3276 /* deadline != 0 */
3198 (attr->sched_period == 0 || 3277 if (attr->sched_deadline == 0)
3199 (s64)(attr->sched_period - attr->sched_deadline) >= 0) && 3278 return false;
3200 (s64)(attr->sched_deadline - attr->sched_runtime ) >= 0 && 3279
3201 attr->sched_runtime >= (2 << (DL_SCALE - 1)); 3280 /*
3281 * Since we truncate DL_SCALE bits, make sure we're at least
3282 * that big.
3283 */
3284 if (attr->sched_runtime < (1ULL << DL_SCALE))
3285 return false;
3286
3287 /*
3288 * Since we use the MSB for wrap-around and sign issues, make
3289 * sure it's not set (mind that period can be equal to zero).
3290 */
3291 if (attr->sched_deadline & (1ULL << 63) ||
3292 attr->sched_period & (1ULL << 63))
3293 return false;
3294
3295 /* runtime <= deadline <= period (if period != 0) */
3296 if ((attr->sched_period != 0 &&
3297 attr->sched_period < attr->sched_deadline) ||
3298 attr->sched_deadline < attr->sched_runtime)
3299 return false;
3300
3301 return true;
3202} 3302}
3203 3303
3204/* 3304/*
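The rewritten __checkparam_dl() spells the admission rules out: a nonzero deadline, a runtime of at least 2^DL_SCALE ns (about 1us), the top bit clear in both deadline and period, and runtime <= deadline <= period (when period is nonzero). A hedged userspace sketch of parameters that satisfy these checks, assuming <sys/syscall.h> provides SYS_sched_setattr; there is no glibc wrapper, so the raw syscall is used, the struct mirrors the uapi layout, the numbers are purely illustrative, and CAP_SYS_NICE or root is required:

    #define _GNU_SOURCE
    #include <stdint.h>
    #include <stdio.h>
    #include <unistd.h>
    #include <sys/syscall.h>

    #ifndef SCHED_DEADLINE
    #define SCHED_DEADLINE  6
    #endif

    struct sched_attr {                     /* mirrors the uapi layout */
            uint32_t size;
            uint32_t sched_policy;
            uint64_t sched_flags;
            int32_t  sched_nice;
            uint32_t sched_priority;
            uint64_t sched_runtime;
            uint64_t sched_deadline;
            uint64_t sched_period;
    };

    int main(void)
    {
            struct sched_attr attr = {
                    .size           = sizeof(attr),
                    .sched_policy   = SCHED_DEADLINE,
                    .sched_runtime  =  10 * 1000 * 1000,    /*  10 ms */
                    .sched_deadline =  30 * 1000 * 1000,    /*  30 ms */
                    .sched_period   = 100 * 1000 * 1000,    /* 100 ms */
            };

            if (syscall(SYS_sched_setattr, 0, &attr, 0))
                    perror("sched_setattr");
            return 0;
    }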
@@ -3596,13 +3696,11 @@ static int sched_copy_attr(struct sched_attr __user *uattr,
3596 */ 3696 */
3597 attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE); 3697 attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
3598 3698
3599out: 3699 return 0;
3600 return ret;
3601 3700
3602err_size: 3701err_size:
3603 put_user(sizeof(*attr), &uattr->size); 3702 put_user(sizeof(*attr), &uattr->size);
3604 ret = -E2BIG; 3703 return -E2BIG;
3605 goto out;
3606} 3704}
3607 3705
3608/** 3706/**
@@ -3639,6 +3737,7 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
3639 * sys_sched_setattr - same as above, but with extended sched_attr 3737 * sys_sched_setattr - same as above, but with extended sched_attr
3640 * @pid: the pid in question. 3738 * @pid: the pid in question.
3641 * @uattr: structure containing the extended parameters. 3739 * @uattr: structure containing the extended parameters.
3740 * @flags: for future extension.
3642 */ 3741 */
3643SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, 3742SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
3644 unsigned int, flags) 3743 unsigned int, flags)
@@ -3650,8 +3749,12 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
3650 if (!uattr || pid < 0 || flags) 3749 if (!uattr || pid < 0 || flags)
3651 return -EINVAL; 3750 return -EINVAL;
3652 3751
3653 if (sched_copy_attr(uattr, &attr)) 3752 retval = sched_copy_attr(uattr, &attr);
3654 return -EFAULT; 3753 if (retval)
3754 return retval;
3755
3756 if ((int)attr.sched_policy < 0)
3757 return -EINVAL;
3655 3758
3656 rcu_read_lock(); 3759 rcu_read_lock();
3657 retval = -ESRCH; 3760 retval = -ESRCH;
@@ -3701,7 +3804,7 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
3701 */ 3804 */
3702SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) 3805SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
3703{ 3806{
3704 struct sched_param lp; 3807 struct sched_param lp = { .sched_priority = 0 };
3705 struct task_struct *p; 3808 struct task_struct *p;
3706 int retval; 3809 int retval;
3707 3810
@@ -3718,11 +3821,8 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
3718 if (retval) 3821 if (retval)
3719 goto out_unlock; 3822 goto out_unlock;
3720 3823
3721 if (task_has_dl_policy(p)) { 3824 if (task_has_rt_policy(p))
3722 retval = -EINVAL; 3825 lp.sched_priority = p->rt_priority;
3723 goto out_unlock;
3724 }
3725 lp.sched_priority = p->rt_priority;
3726 rcu_read_unlock(); 3826 rcu_read_unlock();
3727 3827
3728 /* 3828 /*
@@ -3760,7 +3860,7 @@ static int sched_read_attr(struct sched_attr __user *uattr,
3760 3860
3761 for (; addr < end; addr++) { 3861 for (; addr < end; addr++) {
3762 if (*addr) 3862 if (*addr)
3763 goto err_size; 3863 return -EFBIG;
3764 } 3864 }
3765 3865
3766 attr->size = usize; 3866 attr->size = usize;
@@ -3770,12 +3870,7 @@ static int sched_read_attr(struct sched_attr __user *uattr,
3770 if (ret) 3870 if (ret)
3771 return -EFAULT; 3871 return -EFAULT;
3772 3872
3773out: 3873 return 0;
3774 return ret;
3775
3776err_size:
3777 ret = -E2BIG;
3778 goto out;
3779} 3874}
3780 3875
3781/** 3876/**
@@ -3783,6 +3878,7 @@ err_size:
3783 * @pid: the pid in question. 3878 * @pid: the pid in question.
3784 * @uattr: structure containing the extended parameters. 3879 * @uattr: structure containing the extended parameters.
3785 * @size: sizeof(attr) for fwd/bwd comp. 3880 * @size: sizeof(attr) for fwd/bwd comp.
3881 * @flags: for future extension.
3786 */ 3882 */
3787SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, 3883SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
3788 unsigned int, size, unsigned int, flags) 3884 unsigned int, size, unsigned int, flags)
@@ -4051,6 +4147,7 @@ static void __cond_resched(void)
4051 4147
4052int __sched _cond_resched(void) 4148int __sched _cond_resched(void)
4053{ 4149{
4150 rcu_cond_resched();
4054 if (should_resched()) { 4151 if (should_resched()) {
4055 __cond_resched(); 4152 __cond_resched();
4056 return 1; 4153 return 1;
@@ -4069,15 +4166,18 @@ EXPORT_SYMBOL(_cond_resched);
4069 */ 4166 */
4070int __cond_resched_lock(spinlock_t *lock) 4167int __cond_resched_lock(spinlock_t *lock)
4071{ 4168{
4169 bool need_rcu_resched = rcu_should_resched();
4072 int resched = should_resched(); 4170 int resched = should_resched();
4073 int ret = 0; 4171 int ret = 0;
4074 4172
4075 lockdep_assert_held(lock); 4173 lockdep_assert_held(lock);
4076 4174
4077 if (spin_needbreak(lock) || resched) { 4175 if (spin_needbreak(lock) || resched || need_rcu_resched) {
4078 spin_unlock(lock); 4176 spin_unlock(lock);
4079 if (resched) 4177 if (resched)
4080 __cond_resched(); 4178 __cond_resched();
4179 else if (unlikely(need_rcu_resched))
4180 rcu_resched();
4081 else 4181 else
4082 cpu_relax(); 4182 cpu_relax();
4083 ret = 1; 4183 ret = 1;
@@ -4091,6 +4191,7 @@ int __sched __cond_resched_softirq(void)
4091{ 4191{
4092 BUG_ON(!in_softirq()); 4192 BUG_ON(!in_softirq());
4093 4193
4194 rcu_cond_resched(); /* BH disabled OK, just recording QSes. */
4094 if (should_resched()) { 4195 if (should_resched()) {
4095 local_bh_enable(); 4196 local_bh_enable();
4096 __cond_resched(); 4197 __cond_resched();
@@ -4145,7 +4246,7 @@ EXPORT_SYMBOL(yield);
4145 * false (0) if we failed to boost the target. 4246 * false (0) if we failed to boost the target.
4146 * -ESRCH if there's no task to yield to. 4247 * -ESRCH if there's no task to yield to.
4147 */ 4248 */
4148bool __sched yield_to(struct task_struct *p, bool preempt) 4249int __sched yield_to(struct task_struct *p, bool preempt)
4149{ 4250{
4150 struct task_struct *curr = current; 4251 struct task_struct *curr = current;
4151 struct rq *rq, *p_rq; 4252 struct rq *rq, *p_rq;
@@ -5039,11 +5140,20 @@ static struct notifier_block migration_notifier = {
5039 .priority = CPU_PRI_MIGRATION, 5140 .priority = CPU_PRI_MIGRATION,
5040}; 5141};
5041 5142
5143static void __cpuinit set_cpu_rq_start_time(void)
5144{
5145 int cpu = smp_processor_id();
5146 struct rq *rq = cpu_rq(cpu);
5147 rq->age_stamp = sched_clock_cpu(cpu);
5148}
5149
5042static int sched_cpu_active(struct notifier_block *nfb, 5150static int sched_cpu_active(struct notifier_block *nfb,
5043 unsigned long action, void *hcpu) 5151 unsigned long action, void *hcpu)
5044{ 5152{
5045 switch (action & ~CPU_TASKS_FROZEN) { 5153 switch (action & ~CPU_TASKS_FROZEN) {
5046 case CPU_STARTING: 5154 case CPU_STARTING:
5155 set_cpu_rq_start_time();
5156 return NOTIFY_OK;
5047 case CPU_DOWN_FAILED: 5157 case CPU_DOWN_FAILED:
5048 set_cpu_active((long)hcpu, true); 5158 set_cpu_active((long)hcpu, true);
5049 return NOTIFY_OK; 5159 return NOTIFY_OK;
@@ -5162,14 +5272,13 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5162 } 5272 }
5163 5273
5164 /* 5274 /*
5165 * Even though we initialize ->power to something semi-sane, 5275 * Even though we initialize ->capacity to something semi-sane,
5166 * we leave power_orig unset. This allows us to detect if 5276 * we leave capacity_orig unset. This allows us to detect if
5167 * domain iteration is still funny without causing /0 traps. 5277 * domain iteration is still funny without causing /0 traps.
5168 */ 5278 */
5169 if (!group->sgp->power_orig) { 5279 if (!group->sgc->capacity_orig) {
5170 printk(KERN_CONT "\n"); 5280 printk(KERN_CONT "\n");
5171 printk(KERN_ERR "ERROR: domain->cpu_power not " 5281 printk(KERN_ERR "ERROR: domain->cpu_capacity not set\n");
5172 "set\n");
5173 break; 5282 break;
5174 } 5283 }
5175 5284
@@ -5191,9 +5300,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5191 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); 5300 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
5192 5301
5193 printk(KERN_CONT " %s", str); 5302 printk(KERN_CONT " %s", str);
5194 if (group->sgp->power != SCHED_POWER_SCALE) { 5303 if (group->sgc->capacity != SCHED_CAPACITY_SCALE) {
5195 printk(KERN_CONT " (cpu_power = %d)", 5304 printk(KERN_CONT " (cpu_capacity = %d)",
5196 group->sgp->power); 5305 group->sgc->capacity);
5197 } 5306 }
5198 5307
5199 group = group->next; 5308 group = group->next;
@@ -5251,8 +5360,9 @@ static int sd_degenerate(struct sched_domain *sd)
5251 SD_BALANCE_NEWIDLE | 5360 SD_BALANCE_NEWIDLE |
5252 SD_BALANCE_FORK | 5361 SD_BALANCE_FORK |
5253 SD_BALANCE_EXEC | 5362 SD_BALANCE_EXEC |
5254 SD_SHARE_CPUPOWER | 5363 SD_SHARE_CPUCAPACITY |
5255 SD_SHARE_PKG_RESOURCES)) { 5364 SD_SHARE_PKG_RESOURCES |
5365 SD_SHARE_POWERDOMAIN)) {
5256 if (sd->groups != sd->groups->next) 5366 if (sd->groups != sd->groups->next)
5257 return 0; 5367 return 0;
5258 } 5368 }
@@ -5281,9 +5391,10 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
5281 SD_BALANCE_NEWIDLE | 5391 SD_BALANCE_NEWIDLE |
5282 SD_BALANCE_FORK | 5392 SD_BALANCE_FORK |
5283 SD_BALANCE_EXEC | 5393 SD_BALANCE_EXEC |
5284 SD_SHARE_CPUPOWER | 5394 SD_SHARE_CPUCAPACITY |
5285 SD_SHARE_PKG_RESOURCES | 5395 SD_SHARE_PKG_RESOURCES |
5286 SD_PREFER_SIBLING); 5396 SD_PREFER_SIBLING |
5397 SD_SHARE_POWERDOMAIN);
5287 if (nr_node_ids == 1) 5398 if (nr_node_ids == 1)
5288 pflags &= ~SD_SERIALIZE; 5399 pflags &= ~SD_SERIALIZE;
5289 } 5400 }
@@ -5405,7 +5516,7 @@ static struct root_domain *alloc_rootdomain(void)
5405 return rd; 5516 return rd;
5406} 5517}
5407 5518
5408static void free_sched_groups(struct sched_group *sg, int free_sgp) 5519static void free_sched_groups(struct sched_group *sg, int free_sgc)
5409{ 5520{
5410 struct sched_group *tmp, *first; 5521 struct sched_group *tmp, *first;
5411 5522
@@ -5416,8 +5527,8 @@ static void free_sched_groups(struct sched_group *sg, int free_sgp)
5416 do { 5527 do {
5417 tmp = sg->next; 5528 tmp = sg->next;
5418 5529
5419 if (free_sgp && atomic_dec_and_test(&sg->sgp->ref)) 5530 if (free_sgc && atomic_dec_and_test(&sg->sgc->ref))
5420 kfree(sg->sgp); 5531 kfree(sg->sgc);
5421 5532
5422 kfree(sg); 5533 kfree(sg);
5423 sg = tmp; 5534 sg = tmp;
@@ -5435,7 +5546,7 @@ static void free_sched_domain(struct rcu_head *rcu)
5435 if (sd->flags & SD_OVERLAP) { 5546 if (sd->flags & SD_OVERLAP) {
5436 free_sched_groups(sd->groups, 1); 5547 free_sched_groups(sd->groups, 1);
5437 } else if (atomic_dec_and_test(&sd->groups->ref)) { 5548 } else if (atomic_dec_and_test(&sd->groups->ref)) {
5438 kfree(sd->groups->sgp); 5549 kfree(sd->groups->sgc);
5439 kfree(sd->groups); 5550 kfree(sd->groups);
5440 } 5551 }
5441 kfree(sd); 5552 kfree(sd);
@@ -5557,17 +5668,6 @@ static int __init isolated_cpu_setup(char *str)
5557 5668
5558__setup("isolcpus=", isolated_cpu_setup); 5669__setup("isolcpus=", isolated_cpu_setup);
5559 5670
5560static const struct cpumask *cpu_cpu_mask(int cpu)
5561{
5562 return cpumask_of_node(cpu_to_node(cpu));
5563}
5564
5565struct sd_data {
5566 struct sched_domain **__percpu sd;
5567 struct sched_group **__percpu sg;
5568 struct sched_group_power **__percpu sgp;
5569};
5570
5571struct s_data { 5671struct s_data {
5572 struct sched_domain ** __percpu sd; 5672 struct sched_domain ** __percpu sd;
5573 struct root_domain *rd; 5673 struct root_domain *rd;
@@ -5580,21 +5680,6 @@ enum s_alloc {
5580 sa_none, 5680 sa_none,
5581}; 5681};
5582 5682
5583struct sched_domain_topology_level;
5584
5585typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
5586typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
5587
5588#define SDTL_OVERLAP 0x01
5589
5590struct sched_domain_topology_level {
5591 sched_domain_init_f init;
5592 sched_domain_mask_f mask;
5593 int flags;
5594 int numa_level;
5595 struct sd_data data;
5596};
5597
5598/* 5683/*
5599 * Build an iteration mask that can exclude certain CPUs from the upwards 5684 * Build an iteration mask that can exclude certain CPUs from the upwards
5600 * domain traversal. 5685 * domain traversal.
@@ -5672,17 +5757,17 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5672 5757
5673 cpumask_or(covered, covered, sg_span); 5758 cpumask_or(covered, covered, sg_span);
5674 5759
5675 sg->sgp = *per_cpu_ptr(sdd->sgp, i); 5760 sg->sgc = *per_cpu_ptr(sdd->sgc, i);
5676 if (atomic_inc_return(&sg->sgp->ref) == 1) 5761 if (atomic_inc_return(&sg->sgc->ref) == 1)
5677 build_group_mask(sd, sg); 5762 build_group_mask(sd, sg);
5678 5763
5679 /* 5764 /*
5680 * Initialize sgp->power such that even if we mess up the 5765 * Initialize sgc->capacity such that even if we mess up the
5681 * domains and no possible iteration will get us here, we won't 5766 * domains and no possible iteration will get us here, we won't
5682 * die on a /0 trap. 5767 * die on a /0 trap.
5683 */ 5768 */
5684 sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span); 5769 sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
5685 sg->sgp->power_orig = sg->sgp->power; 5770 sg->sgc->capacity_orig = sg->sgc->capacity;
5686 5771
5687 /* 5772 /*
5688 * Make sure the first group of this domain contains the 5773 * Make sure the first group of this domain contains the
@@ -5720,8 +5805,8 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
5720 5805
5721 if (sg) { 5806 if (sg) {
5722 *sg = *per_cpu_ptr(sdd->sg, cpu); 5807 *sg = *per_cpu_ptr(sdd->sg, cpu);
5723 (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu); 5808 (*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu);
5724 atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */ 5809 atomic_set(&(*sg)->sgc->ref, 1); /* for claim_allocations */
5725 } 5810 }
5726 5811
5727 return cpu; 5812 return cpu;
@@ -5730,7 +5815,7 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
5730/* 5815/*
5731 * build_sched_groups will build a circular linked list of the groups 5816 * build_sched_groups will build a circular linked list of the groups
5732 * covered by the given span, and will set each group's ->cpumask correctly, 5817 * covered by the given span, and will set each group's ->cpumask correctly,
5733 * and ->cpu_power to 0. 5818 * and ->cpu_capacity to 0.
5734 * 5819 *
5735 * Assumes the sched_domain tree is fully constructed 5820 * Assumes the sched_domain tree is fully constructed
5736 */ 5821 */
@@ -5762,8 +5847,6 @@ build_sched_groups(struct sched_domain *sd, int cpu)
5762 continue; 5847 continue;
5763 5848
5764 group = get_group(i, sdd, &sg); 5849 group = get_group(i, sdd, &sg);
5765 cpumask_clear(sched_group_cpus(sg));
5766 sg->sgp->power = 0;
5767 cpumask_setall(sched_group_mask(sg)); 5850 cpumask_setall(sched_group_mask(sg));
5768 5851
5769 for_each_cpu(j, span) { 5852 for_each_cpu(j, span) {
@@ -5786,16 +5869,16 @@ build_sched_groups(struct sched_domain *sd, int cpu)
5786} 5869}
5787 5870
5788/* 5871/*
5789 * Initialize sched groups cpu_power. 5872 * Initialize sched groups cpu_capacity.
5790 * 5873 *
5791 * cpu_power indicates the capacity of sched group, which is used while 5874 * cpu_capacity indicates the capacity of sched group, which is used while
5792 * distributing the load between different sched groups in a sched domain. 5875 * distributing the load between different sched groups in a sched domain.
5793 * Typically cpu_power for all the groups in a sched domain will be same unless 5876 * Typically cpu_capacity for all the groups in a sched domain will be same
5794 * there are asymmetries in the topology. If there are asymmetries, group 5877 * unless there are asymmetries in the topology. If there are asymmetries,
5795 * having more cpu_power will pickup more load compared to the group having 5878 * group having more cpu_capacity will pickup more load compared to the
5796 * less cpu_power. 5879 * group having less cpu_capacity.
5797 */ 5880 */
5798static void init_sched_groups_power(int cpu, struct sched_domain *sd) 5881static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
5799{ 5882{
5800 struct sched_group *sg = sd->groups; 5883 struct sched_group *sg = sd->groups;
5801 5884
@@ -5809,13 +5892,8 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
5809 if (cpu != group_balance_cpu(sg)) 5892 if (cpu != group_balance_cpu(sg))
5810 return; 5893 return;
5811 5894
5812 update_group_power(sd, cpu); 5895 update_group_capacity(sd, cpu);
5813 atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight); 5896 atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight);
5814}
5815
5816int __weak arch_sd_sibling_asym_packing(void)
5817{
5818 return 0*SD_ASYM_PACKING;
5819} 5897}
5820 5898
5821/* 5899/*
@@ -5823,34 +5901,6 @@ int __weak arch_sd_sibling_asym_packing(void)
5823 * Non-inlined to reduce accumulated stack pressure in build_sched_domains() 5901 * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
5824 */ 5902 */
5825 5903
5826#ifdef CONFIG_SCHED_DEBUG
5827# define SD_INIT_NAME(sd, type) sd->name = #type
5828#else
5829# define SD_INIT_NAME(sd, type) do { } while (0)
5830#endif
5831
5832#define SD_INIT_FUNC(type) \
5833static noinline struct sched_domain * \
5834sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
5835{ \
5836 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \
5837 *sd = SD_##type##_INIT; \
5838 SD_INIT_NAME(sd, type); \
5839 sd->private = &tl->data; \
5840 return sd; \
5841}
5842
5843SD_INIT_FUNC(CPU)
5844#ifdef CONFIG_SCHED_SMT
5845 SD_INIT_FUNC(SIBLING)
5846#endif
5847#ifdef CONFIG_SCHED_MC
5848 SD_INIT_FUNC(MC)
5849#endif
5850#ifdef CONFIG_SCHED_BOOK
5851 SD_INIT_FUNC(BOOK)
5852#endif
5853
5854static int default_relax_domain_level = -1; 5904static int default_relax_domain_level = -1;
5855int sched_domain_level_max; 5905int sched_domain_level_max;
5856 5906
@@ -5934,101 +5984,158 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
5934 if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) 5984 if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
5935 *per_cpu_ptr(sdd->sg, cpu) = NULL; 5985 *per_cpu_ptr(sdd->sg, cpu) = NULL;
5936 5986
5937 if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref)) 5987 if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))
5938 *per_cpu_ptr(sdd->sgp, cpu) = NULL; 5988 *per_cpu_ptr(sdd->sgc, cpu) = NULL;
5939} 5989}
5940 5990
5941#ifdef CONFIG_SCHED_SMT
5942static const struct cpumask *cpu_smt_mask(int cpu)
5943{
5944 return topology_thread_cpumask(cpu);
5945}
5946#endif
5947
5948/*
5949 * Topology list, bottom-up.
5950 */
5951static struct sched_domain_topology_level default_topology[] = {
5952#ifdef CONFIG_SCHED_SMT
5953 { sd_init_SIBLING, cpu_smt_mask, },
5954#endif
5955#ifdef CONFIG_SCHED_MC
5956 { sd_init_MC, cpu_coregroup_mask, },
5957#endif
5958#ifdef CONFIG_SCHED_BOOK
5959 { sd_init_BOOK, cpu_book_mask, },
5960#endif
5961 { sd_init_CPU, cpu_cpu_mask, },
5962 { NULL, },
5963};
5964
5965static struct sched_domain_topology_level *sched_domain_topology = default_topology;
5966
5967#define for_each_sd_topology(tl) \
5968 for (tl = sched_domain_topology; tl->init; tl++)
5969
5970#ifdef CONFIG_NUMA 5991#ifdef CONFIG_NUMA
5971
5972static int sched_domains_numa_levels; 5992static int sched_domains_numa_levels;
5973static int *sched_domains_numa_distance; 5993static int *sched_domains_numa_distance;
5974static struct cpumask ***sched_domains_numa_masks; 5994static struct cpumask ***sched_domains_numa_masks;
5975static int sched_domains_curr_level; 5995static int sched_domains_curr_level;
5996#endif
5976 5997
5977static inline int sd_local_flags(int level) 5998/*
5978{ 5999 * SD_flags allowed in topology descriptions.
5979 if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE) 6000 *
5980 return 0; 6001 * SD_SHARE_CPUCAPACITY - describes SMT topologies
5981 6002 * SD_SHARE_PKG_RESOURCES - describes shared caches
5982 return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE; 6003 * SD_NUMA - describes NUMA topologies
5983} 6004 * SD_SHARE_POWERDOMAIN - describes shared power domain
6005 *
6006 * Odd one out:
6007 * SD_ASYM_PACKING - describes SMT quirks
6008 */
6009#define TOPOLOGY_SD_FLAGS \
6010 (SD_SHARE_CPUCAPACITY | \
6011 SD_SHARE_PKG_RESOURCES | \
6012 SD_NUMA | \
6013 SD_ASYM_PACKING | \
6014 SD_SHARE_POWERDOMAIN)
5984 6015
5985static struct sched_domain * 6016static struct sched_domain *
5986sd_numa_init(struct sched_domain_topology_level *tl, int cpu) 6017sd_init(struct sched_domain_topology_level *tl, int cpu)
5987{ 6018{
5988 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); 6019 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
5989 int level = tl->numa_level; 6020 int sd_weight, sd_flags = 0;
5990 int sd_weight = cpumask_weight( 6021
5991 sched_domains_numa_masks[level][cpu_to_node(cpu)]); 6022#ifdef CONFIG_NUMA
6023 /*
6024 * Ugly hack to pass state to sd_numa_mask()...
6025 */
6026 sched_domains_curr_level = tl->numa_level;
6027#endif
6028
6029 sd_weight = cpumask_weight(tl->mask(cpu));
6030
6031 if (tl->sd_flags)
6032 sd_flags = (*tl->sd_flags)();
6033 if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
6034 "wrong sd_flags in topology description\n"))
6035 sd_flags &= ~TOPOLOGY_SD_FLAGS;
5992 6036
5993 *sd = (struct sched_domain){ 6037 *sd = (struct sched_domain){
5994 .min_interval = sd_weight, 6038 .min_interval = sd_weight,
5995 .max_interval = 2*sd_weight, 6039 .max_interval = 2*sd_weight,
5996 .busy_factor = 32, 6040 .busy_factor = 32,
5997 .imbalance_pct = 125, 6041 .imbalance_pct = 125,
5998 .cache_nice_tries = 2, 6042
5999 .busy_idx = 3, 6043 .cache_nice_tries = 0,
6000 .idle_idx = 2, 6044 .busy_idx = 0,
6045 .idle_idx = 0,
6001 .newidle_idx = 0, 6046 .newidle_idx = 0,
6002 .wake_idx = 0, 6047 .wake_idx = 0,
6003 .forkexec_idx = 0, 6048 .forkexec_idx = 0,
6004 6049
6005 .flags = 1*SD_LOAD_BALANCE 6050 .flags = 1*SD_LOAD_BALANCE
6006 | 1*SD_BALANCE_NEWIDLE 6051 | 1*SD_BALANCE_NEWIDLE
6007 | 0*SD_BALANCE_EXEC 6052 | 1*SD_BALANCE_EXEC
6008 | 0*SD_BALANCE_FORK 6053 | 1*SD_BALANCE_FORK
6009 | 0*SD_BALANCE_WAKE 6054 | 0*SD_BALANCE_WAKE
6010 | 0*SD_WAKE_AFFINE 6055 | 1*SD_WAKE_AFFINE
6011 | 0*SD_SHARE_CPUPOWER 6056 | 0*SD_SHARE_CPUCAPACITY
6012 | 0*SD_SHARE_PKG_RESOURCES 6057 | 0*SD_SHARE_PKG_RESOURCES
6013 | 1*SD_SERIALIZE 6058 | 0*SD_SERIALIZE
6014 | 0*SD_PREFER_SIBLING 6059 | 0*SD_PREFER_SIBLING
6015 | 1*SD_NUMA 6060 | 0*SD_NUMA
6016 | sd_local_flags(level) 6061 | sd_flags
6017 , 6062 ,
6063
6018 .last_balance = jiffies, 6064 .last_balance = jiffies,
6019 .balance_interval = sd_weight, 6065 .balance_interval = sd_weight,
6066 .smt_gain = 0,
6067 .max_newidle_lb_cost = 0,
6068 .next_decay_max_lb_cost = jiffies,
6069#ifdef CONFIG_SCHED_DEBUG
6070 .name = tl->name,
6071#endif
6020 }; 6072 };
6021 SD_INIT_NAME(sd, NUMA);
6022 sd->private = &tl->data;
6023 6073
6024 /* 6074 /*
6025 * Ugly hack to pass state to sd_numa_mask()... 6075 * Convert topological properties into behaviour.
6026 */ 6076 */
6027 sched_domains_curr_level = tl->numa_level; 6077
6078 if (sd->flags & SD_SHARE_CPUCAPACITY) {
6079 sd->imbalance_pct = 110;
6080 sd->smt_gain = 1178; /* ~15% */
6081
6082 } else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
6083 sd->imbalance_pct = 117;
6084 sd->cache_nice_tries = 1;
6085 sd->busy_idx = 2;
6086
6087#ifdef CONFIG_NUMA
6088 } else if (sd->flags & SD_NUMA) {
6089 sd->cache_nice_tries = 2;
6090 sd->busy_idx = 3;
6091 sd->idle_idx = 2;
6092
6093 sd->flags |= SD_SERIALIZE;
6094 if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
6095 sd->flags &= ~(SD_BALANCE_EXEC |
6096 SD_BALANCE_FORK |
6097 SD_WAKE_AFFINE);
6098 }
6099
6100#endif
6101 } else {
6102 sd->flags |= SD_PREFER_SIBLING;
6103 sd->cache_nice_tries = 1;
6104 sd->busy_idx = 2;
6105 sd->idle_idx = 1;
6106 }
6107
6108 sd->private = &tl->data;
6028 6109
6029 return sd; 6110 return sd;
6030} 6111}
6031 6112
6113/*
6114 * Topology list, bottom-up.
6115 */
6116static struct sched_domain_topology_level default_topology[] = {
6117#ifdef CONFIG_SCHED_SMT
6118 { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
6119#endif
6120#ifdef CONFIG_SCHED_MC
6121 { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
6122#endif
6123 { cpu_cpu_mask, SD_INIT_NAME(DIE) },
6124 { NULL, },
6125};
6126
6127struct sched_domain_topology_level *sched_domain_topology = default_topology;
6128
6129#define for_each_sd_topology(tl) \
6130 for (tl = sched_domain_topology; tl->mask; tl++)
6131
6132void set_sched_topology(struct sched_domain_topology_level *tl)
6133{
6134 sched_domain_topology = tl;
6135}
6136
6137#ifdef CONFIG_NUMA
6138
6032static const struct cpumask *sd_numa_mask(int cpu) 6139static const struct cpumask *sd_numa_mask(int cpu)
6033{ 6140{
6034 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; 6141 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
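The large hunk above replaces the per-level sd_init_*() constructors with a single table-driven sd_init() and an overridable topology list; set_sched_topology() lets an architecture install its own table. A hedged sketch of how an arch might use that, reusing the mask/flags helpers referenced by default_topology above (where those helpers live after this series is assumed), with a hypothetical init hook name:

    #include <linux/sched.h>
    #include <linux/topology.h>

    static struct sched_domain_topology_level my_arch_topology[] = {
    #ifdef CONFIG_SCHED_SMT
            { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
    #endif
    #ifdef CONFIG_SCHED_MC
            { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
    #endif
            { cpu_cpu_mask, SD_INIT_NAME(DIE) },
            { NULL, },
    };

    void __init my_arch_init_topology(void)
    {
            /* Replaces default_topology for all subsequent domain rebuilds. */
            set_sched_topology(my_arch_topology);
    }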
@@ -6172,7 +6279,10 @@ static void sched_init_numa(void)
6172 } 6279 }
6173 } 6280 }
6174 6281
6175 tl = kzalloc((ARRAY_SIZE(default_topology) + level) * 6282 /* Compute default topology size */
6283 for (i = 0; sched_domain_topology[i].mask; i++);
6284
6285 tl = kzalloc((i + level + 1) *
6176 sizeof(struct sched_domain_topology_level), GFP_KERNEL); 6286 sizeof(struct sched_domain_topology_level), GFP_KERNEL);
6177 if (!tl) 6287 if (!tl)
6178 return; 6288 return;
@@ -6180,18 +6290,19 @@ static void sched_init_numa(void)
6180 /* 6290 /*
6181 * Copy the default topology bits.. 6291 * Copy the default topology bits..
6182 */ 6292 */
6183 for (i = 0; default_topology[i].init; i++) 6293 for (i = 0; sched_domain_topology[i].mask; i++)
6184 tl[i] = default_topology[i]; 6294 tl[i] = sched_domain_topology[i];
6185 6295
6186 /* 6296 /*
6187 * .. and append 'j' levels of NUMA goodness. 6297 * .. and append 'j' levels of NUMA goodness.
6188 */ 6298 */
6189 for (j = 0; j < level; i++, j++) { 6299 for (j = 0; j < level; i++, j++) {
6190 tl[i] = (struct sched_domain_topology_level){ 6300 tl[i] = (struct sched_domain_topology_level){
6191 .init = sd_numa_init,
6192 .mask = sd_numa_mask, 6301 .mask = sd_numa_mask,
6302 .sd_flags = cpu_numa_flags,
6193 .flags = SDTL_OVERLAP, 6303 .flags = SDTL_OVERLAP,
6194 .numa_level = j, 6304 .numa_level = j,
6305 SD_INIT_NAME(NUMA)
6195 }; 6306 };
6196 } 6307 }
6197 6308
@@ -6276,14 +6387,14 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
6276 if (!sdd->sg) 6387 if (!sdd->sg)
6277 return -ENOMEM; 6388 return -ENOMEM;
6278 6389
6279 sdd->sgp = alloc_percpu(struct sched_group_power *); 6390 sdd->sgc = alloc_percpu(struct sched_group_capacity *);
6280 if (!sdd->sgp) 6391 if (!sdd->sgc)
6281 return -ENOMEM; 6392 return -ENOMEM;
6282 6393
6283 for_each_cpu(j, cpu_map) { 6394 for_each_cpu(j, cpu_map) {
6284 struct sched_domain *sd; 6395 struct sched_domain *sd;
6285 struct sched_group *sg; 6396 struct sched_group *sg;
6286 struct sched_group_power *sgp; 6397 struct sched_group_capacity *sgc;
6287 6398
6288 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), 6399 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
6289 GFP_KERNEL, cpu_to_node(j)); 6400 GFP_KERNEL, cpu_to_node(j));
@@ -6301,12 +6412,12 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
6301 6412
6302 *per_cpu_ptr(sdd->sg, j) = sg; 6413 *per_cpu_ptr(sdd->sg, j) = sg;
6303 6414
6304 sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(), 6415 sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(),
6305 GFP_KERNEL, cpu_to_node(j)); 6416 GFP_KERNEL, cpu_to_node(j));
6306 if (!sgp) 6417 if (!sgc)
6307 return -ENOMEM; 6418 return -ENOMEM;
6308 6419
6309 *per_cpu_ptr(sdd->sgp, j) = sgp; 6420 *per_cpu_ptr(sdd->sgc, j) = sgc;
6310 } 6421 }
6311 } 6422 }
6312 6423
@@ -6333,15 +6444,15 @@ static void __sdt_free(const struct cpumask *cpu_map)
6333 6444
6334 if (sdd->sg) 6445 if (sdd->sg)
6335 kfree(*per_cpu_ptr(sdd->sg, j)); 6446 kfree(*per_cpu_ptr(sdd->sg, j));
6336 if (sdd->sgp) 6447 if (sdd->sgc)
6337 kfree(*per_cpu_ptr(sdd->sgp, j)); 6448 kfree(*per_cpu_ptr(sdd->sgc, j));
6338 } 6449 }
6339 free_percpu(sdd->sd); 6450 free_percpu(sdd->sd);
6340 sdd->sd = NULL; 6451 sdd->sd = NULL;
6341 free_percpu(sdd->sg); 6452 free_percpu(sdd->sg);
6342 sdd->sg = NULL; 6453 sdd->sg = NULL;
6343 free_percpu(sdd->sgp); 6454 free_percpu(sdd->sgc);
6344 sdd->sgp = NULL; 6455 sdd->sgc = NULL;
6345 } 6456 }
6346} 6457}
6347 6458
@@ -6349,7 +6460,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6349 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 6460 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
6350 struct sched_domain *child, int cpu) 6461 struct sched_domain *child, int cpu)
6351{ 6462{
6352 struct sched_domain *sd = tl->init(tl, cpu); 6463 struct sched_domain *sd = sd_init(tl, cpu);
6353 if (!sd) 6464 if (!sd)
6354 return child; 6465 return child;
6355 6466
@@ -6411,14 +6522,14 @@ static int build_sched_domains(const struct cpumask *cpu_map,
6411 } 6522 }
6412 } 6523 }
6413 6524
6414 /* Calculate CPU power for physical packages and nodes */ 6525 /* Calculate CPU capacity for physical packages and nodes */
6415 for (i = nr_cpumask_bits-1; i >= 0; i--) { 6526 for (i = nr_cpumask_bits-1; i >= 0; i--) {
6416 if (!cpumask_test_cpu(i, cpu_map)) 6527 if (!cpumask_test_cpu(i, cpu_map))
6417 continue; 6528 continue;
6418 6529
6419 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { 6530 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
6420 claim_allocations(i, sd); 6531 claim_allocations(i, sd);
6421 init_sched_groups_power(i, sd); 6532 init_sched_groups_capacity(i, sd);
6422 } 6533 }
6423 } 6534 }
6424 6535
@@ -6861,7 +6972,7 @@ void __init sched_init(void)
6861#ifdef CONFIG_SMP 6972#ifdef CONFIG_SMP
6862 rq->sd = NULL; 6973 rq->sd = NULL;
6863 rq->rd = NULL; 6974 rq->rd = NULL;
6864 rq->cpu_power = SCHED_POWER_SCALE; 6975 rq->cpu_capacity = SCHED_CAPACITY_SCALE;
6865 rq->post_schedule = 0; 6976 rq->post_schedule = 0;
6866 rq->active_balance = 0; 6977 rq->active_balance = 0;
6867 rq->next_balance = jiffies; 6978 rq->next_balance = jiffies;
@@ -6919,6 +7030,7 @@ void __init sched_init(void)
6919 if (cpu_isolated_map == NULL) 7030 if (cpu_isolated_map == NULL)
6920 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 7031 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
6921 idle_thread_set_boot_cpu(); 7032 idle_thread_set_boot_cpu();
7033 set_cpu_rq_start_time();
6922#endif 7034#endif
6923 init_sched_fair_class(); 7035 init_sched_fair_class();
6924 7036
@@ -7586,7 +7698,7 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
7586static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) 7698static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
7587{ 7699{
7588 struct task_group *tg = css_tg(css); 7700 struct task_group *tg = css_tg(css);
7589 struct task_group *parent = css_tg(css_parent(css)); 7701 struct task_group *parent = css_tg(css->parent);
7590 7702
7591 if (parent) 7703 if (parent)
7592 sched_online_group(tg, parent); 7704 sched_online_group(tg, parent);
@@ -7717,8 +7829,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
7717 /* restart the period timer (if active) to handle new period expiry */ 7829 /* restart the period timer (if active) to handle new period expiry */
7718 if (runtime_enabled && cfs_b->timer_active) { 7830 if (runtime_enabled && cfs_b->timer_active) {
7719 /* force a reprogram */ 7831 /* force a reprogram */
7720 cfs_b->timer_active = 0; 7832 __start_cfs_bandwidth(cfs_b, true);
7721 __start_cfs_bandwidth(cfs_b);
7722 } 7833 }
7723 raw_spin_unlock_irq(&cfs_b->lock); 7834 raw_spin_unlock_irq(&cfs_b->lock);
7724 7835
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index c143ee380e3a..9cf350c94ec4 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -46,7 +46,7 @@ static inline struct cpuacct *task_ca(struct task_struct *tsk)
46 46
47static inline struct cpuacct *parent_ca(struct cpuacct *ca) 47static inline struct cpuacct *parent_ca(struct cpuacct *ca)
48{ 48{
49 return css_ca(css_parent(&ca->css)); 49 return css_ca(ca->css.parent);
50} 50}
51 51
52static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); 52static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage);
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 5b9bb42b2d47..bd95963dae80 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -13,6 +13,7 @@
13 13
14#include <linux/gfp.h> 14#include <linux/gfp.h>
15#include <linux/kernel.h> 15#include <linux/kernel.h>
16#include <linux/slab.h>
16#include "cpudeadline.h" 17#include "cpudeadline.h"
17 18
18static inline int parent(int i) 19static inline int parent(int i)
@@ -39,8 +40,10 @@ static void cpudl_exchange(struct cpudl *cp, int a, int b)
39{ 40{
40 int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu; 41 int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu;
41 42
42 swap(cp->elements[a], cp->elements[b]); 43 swap(cp->elements[a].cpu, cp->elements[b].cpu);
43 swap(cp->cpu_to_idx[cpu_a], cp->cpu_to_idx[cpu_b]); 44 swap(cp->elements[a].dl , cp->elements[b].dl );
45
46 swap(cp->elements[cpu_a].idx, cp->elements[cpu_b].idx);
44} 47}
45 48
46static void cpudl_heapify(struct cpudl *cp, int idx) 49static void cpudl_heapify(struct cpudl *cp, int idx)
@@ -140,7 +143,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
140 WARN_ON(!cpu_present(cpu)); 143 WARN_ON(!cpu_present(cpu));
141 144
142 raw_spin_lock_irqsave(&cp->lock, flags); 145 raw_spin_lock_irqsave(&cp->lock, flags);
143 old_idx = cp->cpu_to_idx[cpu]; 146 old_idx = cp->elements[cpu].idx;
144 if (!is_valid) { 147 if (!is_valid) {
145 /* remove item */ 148 /* remove item */
146 if (old_idx == IDX_INVALID) { 149 if (old_idx == IDX_INVALID) {
@@ -155,8 +158,8 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
155 cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl; 158 cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl;
156 cp->elements[old_idx].cpu = new_cpu; 159 cp->elements[old_idx].cpu = new_cpu;
157 cp->size--; 160 cp->size--;
158 cp->cpu_to_idx[new_cpu] = old_idx; 161 cp->elements[new_cpu].idx = old_idx;
159 cp->cpu_to_idx[cpu] = IDX_INVALID; 162 cp->elements[cpu].idx = IDX_INVALID;
160 while (old_idx > 0 && dl_time_before( 163 while (old_idx > 0 && dl_time_before(
161 cp->elements[parent(old_idx)].dl, 164 cp->elements[parent(old_idx)].dl,
162 cp->elements[old_idx].dl)) { 165 cp->elements[old_idx].dl)) {
@@ -173,7 +176,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
173 cp->size++; 176 cp->size++;
174 cp->elements[cp->size - 1].dl = 0; 177 cp->elements[cp->size - 1].dl = 0;
175 cp->elements[cp->size - 1].cpu = cpu; 178 cp->elements[cp->size - 1].cpu = cpu;
176 cp->cpu_to_idx[cpu] = cp->size - 1; 179 cp->elements[cpu].idx = cp->size - 1;
177 cpudl_change_key(cp, cp->size - 1, dl); 180 cpudl_change_key(cp, cp->size - 1, dl);
178 cpumask_clear_cpu(cpu, cp->free_cpus); 181 cpumask_clear_cpu(cpu, cp->free_cpus);
179 } else { 182 } else {
@@ -195,10 +198,21 @@ int cpudl_init(struct cpudl *cp)
195 memset(cp, 0, sizeof(*cp)); 198 memset(cp, 0, sizeof(*cp));
196 raw_spin_lock_init(&cp->lock); 199 raw_spin_lock_init(&cp->lock);
197 cp->size = 0; 200 cp->size = 0;
198 for (i = 0; i < NR_CPUS; i++) 201
199 cp->cpu_to_idx[i] = IDX_INVALID; 202 cp->elements = kcalloc(nr_cpu_ids,
200 if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) 203 sizeof(struct cpudl_item),
204 GFP_KERNEL);
205 if (!cp->elements)
206 return -ENOMEM;
207
208 if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) {
209 kfree(cp->elements);
201 return -ENOMEM; 210 return -ENOMEM;
211 }
212
213 for_each_possible_cpu(i)
214 cp->elements[i].idx = IDX_INVALID;
215
202 cpumask_setall(cp->free_cpus); 216 cpumask_setall(cp->free_cpus);
203 217
204 return 0; 218 return 0;
@@ -210,7 +224,6 @@ int cpudl_init(struct cpudl *cp)
210 */ 224 */
211void cpudl_cleanup(struct cpudl *cp) 225void cpudl_cleanup(struct cpudl *cp)
212{ 226{
213 /* 227 free_cpumask_var(cp->free_cpus);
214 * nothing to do for the moment 228 kfree(cp->elements);
215 */
216} 229}
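cpudl_init()/cpudl_cleanup() above replace two static NR_CPUS arrays with one array sized by the real CPU count, and the cleanup path now actually frees it. A simplified userspace sketch of the allocation, error unwind, and teardown; calloc/free stand in for kcalloc/kfree and the cpumask handling is reduced to a plain byte array:

#include <stdlib.h>

#define IDX_INVALID -1

struct cpudl_item {
	unsigned long long dl;
	int cpu;
	int idx;			/* replaces the separate cpu_to_idx[] */
};

struct cpudl {
	int size;
	struct cpudl_item *elements;	/* sized by the running system, not NR_CPUS */
	unsigned char *free_cpus;	/* crude stand-in for cpumask_var_t */
};

static int cpudl_init(struct cpudl *cp, int nr_cpu_ids)
{
	int i;

	cp->size = 0;
	cp->elements = calloc(nr_cpu_ids, sizeof(*cp->elements));
	if (!cp->elements)
		return -1;			/* -ENOMEM in the kernel */

	cp->free_cpus = calloc(nr_cpu_ids, 1);
	if (!cp->free_cpus) {
		free(cp->elements);		/* unwind the first allocation */
		return -1;
	}

	for (i = 0; i < nr_cpu_ids; i++)
		cp->elements[i].idx = IDX_INVALID;

	return 0;
}

static void cpudl_cleanup(struct cpudl *cp)
{
	free(cp->free_cpus);
	free(cp->elements);			/* no longer "nothing to do" */
}

int main(void)
{
	struct cpudl cp;

	if (cpudl_init(&cp, 8) == 0)
		cpudl_cleanup(&cp);
	return 0;
}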
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index a202789a412c..538c9796ad4a 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -5,17 +5,17 @@
5 5
6#define IDX_INVALID -1 6#define IDX_INVALID -1
7 7
8struct array_item { 8struct cpudl_item {
9 u64 dl; 9 u64 dl;
10 int cpu; 10 int cpu;
11 int idx;
11}; 12};
12 13
13struct cpudl { 14struct cpudl {
14 raw_spinlock_t lock; 15 raw_spinlock_t lock;
15 int size; 16 int size;
16 int cpu_to_idx[NR_CPUS];
17 struct array_item elements[NR_CPUS];
18 cpumask_var_t free_cpus; 17 cpumask_var_t free_cpus;
18 struct cpudl_item *elements;
19}; 19};
20 20
21 21
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 8b836b376d91..981fcd7dc394 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -30,6 +30,7 @@
30#include <linux/gfp.h> 30#include <linux/gfp.h>
31#include <linux/sched.h> 31#include <linux/sched.h>
32#include <linux/sched/rt.h> 32#include <linux/sched/rt.h>
33#include <linux/slab.h>
33#include "cpupri.h" 34#include "cpupri.h"
34 35
35/* Convert between a 140 based task->prio, and our 102 based cpupri */ 36/* Convert between a 140 based task->prio, and our 102 based cpupri */
@@ -70,8 +71,7 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
70 int idx = 0; 71 int idx = 0;
71 int task_pri = convert_prio(p->prio); 72 int task_pri = convert_prio(p->prio);
72 73
73 if (task_pri >= MAX_RT_PRIO) 74 BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES);
74 return 0;
75 75
76 for (idx = 0; idx < task_pri; idx++) { 76 for (idx = 0; idx < task_pri; idx++) {
77 struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; 77 struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
@@ -165,7 +165,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
165 * do a write memory barrier, and then update the count, to 165 * do a write memory barrier, and then update the count, to
166 * make sure the vector is visible when count is set. 166 * make sure the vector is visible when count is set.
167 */ 167 */
168 smp_mb__before_atomic_inc(); 168 smp_mb__before_atomic();
169 atomic_inc(&(vec)->count); 169 atomic_inc(&(vec)->count);
170 do_mb = 1; 170 do_mb = 1;
171 } 171 }
@@ -185,14 +185,14 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
185 * the new priority vec. 185 * the new priority vec.
186 */ 186 */
187 if (do_mb) 187 if (do_mb)
188 smp_mb__after_atomic_inc(); 188 smp_mb__after_atomic();
189 189
190 /* 190 /*
191 * When removing from the vector, we decrement the counter first 191 * When removing from the vector, we decrement the counter first
192 * do a memory barrier and then clear the mask. 192 * do a memory barrier and then clear the mask.
193 */ 193 */
194 atomic_dec(&(vec)->count); 194 atomic_dec(&(vec)->count);
195 smp_mb__after_atomic_inc(); 195 smp_mb__after_atomic();
196 cpumask_clear_cpu(cpu, vec->mask); 196 cpumask_clear_cpu(cpu, vec->mask);
197 } 197 }
198 198
@@ -219,8 +219,13 @@ int cpupri_init(struct cpupri *cp)
219 goto cleanup; 219 goto cleanup;
220 } 220 }
221 221
222 cp->cpu_to_pri = kcalloc(nr_cpu_ids, sizeof(int), GFP_KERNEL);
223 if (!cp->cpu_to_pri)
224 goto cleanup;
225
222 for_each_possible_cpu(i) 226 for_each_possible_cpu(i)
223 cp->cpu_to_pri[i] = CPUPRI_INVALID; 227 cp->cpu_to_pri[i] = CPUPRI_INVALID;
228
224 return 0; 229 return 0;
225 230
226cleanup: 231cleanup:
@@ -237,6 +242,7 @@ void cpupri_cleanup(struct cpupri *cp)
237{ 242{
238 int i; 243 int i;
239 244
245 kfree(cp->cpu_to_pri);
240 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) 246 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++)
241 free_cpumask_var(cp->pri_to_cpu[i].mask); 247 free_cpumask_var(cp->pri_to_cpu[i].mask);
242} 248}
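Beyond the smp_mb__before_atomic()/smp_mb__after_atomic() renames, the ordering the comments above describe is: make the mask visible, then bump the count, so a reader that sees the count also sees the mask. A rough C11-atomics analogue of that publish order; this is a userspace approximation, not the kernel's barrier API, and the reader side is simplified:

#include <stdatomic.h>
#include <stdbool.h>

struct vec {
	atomic_int count;
	atomic_ulong mask;		/* stand-in for the per-priority cpumask */
};

static void vec_add_cpu(struct vec *v, int cpu)
{
	atomic_fetch_or_explicit(&v->mask, 1UL << cpu, memory_order_relaxed);
	/* publish the mask before the count becomes nonzero */
	atomic_thread_fence(memory_order_release);
	atomic_fetch_add_explicit(&v->count, 1, memory_order_relaxed);
}

static bool vec_has_cpu(struct vec *v, int cpu)
{
	if (atomic_load_explicit(&v->count, memory_order_relaxed) == 0)
		return false;
	/* pairs with the writer's fence: count seen => mask bit visible */
	atomic_thread_fence(memory_order_acquire);
	return atomic_load_explicit(&v->mask, memory_order_relaxed) & (1UL << cpu);
}

int main(void)
{
	static struct vec v;		/* zero-initialized */

	vec_add_cpu(&v, 3);
	return vec_has_cpu(&v, 3) ? 0 : 1;
}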
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h
index f6d756173491..6b033347fdfd 100644
--- a/kernel/sched/cpupri.h
+++ b/kernel/sched/cpupri.h
@@ -17,7 +17,7 @@ struct cpupri_vec {
17 17
18struct cpupri { 18struct cpupri {
19 struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; 19 struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
20 int cpu_to_pri[NR_CPUS]; 20 int *cpu_to_pri;
21}; 21};
22 22
23#ifdef CONFIG_SMP 23#ifdef CONFIG_SMP
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index a95097cb4591..72fdf06ef865 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -332,50 +332,50 @@ out:
332 * softirq as those do not count in task exec_runtime any more. 332 * softirq as those do not count in task exec_runtime any more.
333 */ 333 */
334static void irqtime_account_process_tick(struct task_struct *p, int user_tick, 334static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
335 struct rq *rq) 335 struct rq *rq, int ticks)
336{ 336{
337 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); 337 cputime_t scaled = cputime_to_scaled(cputime_one_jiffy);
338 u64 cputime = (__force u64) cputime_one_jiffy;
338 u64 *cpustat = kcpustat_this_cpu->cpustat; 339 u64 *cpustat = kcpustat_this_cpu->cpustat;
339 340
340 if (steal_account_process_tick()) 341 if (steal_account_process_tick())
341 return; 342 return;
342 343
344 cputime *= ticks;
345 scaled *= ticks;
346
343 if (irqtime_account_hi_update()) { 347 if (irqtime_account_hi_update()) {
344 cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy; 348 cpustat[CPUTIME_IRQ] += cputime;
345 } else if (irqtime_account_si_update()) { 349 } else if (irqtime_account_si_update()) {
346 cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy; 350 cpustat[CPUTIME_SOFTIRQ] += cputime;
347 } else if (this_cpu_ksoftirqd() == p) { 351 } else if (this_cpu_ksoftirqd() == p) {
348 /* 352 /*
349 * ksoftirqd time do not get accounted in cpu_softirq_time. 353 * ksoftirqd time do not get accounted in cpu_softirq_time.
350 * So, we have to handle it separately here. 354 * So, we have to handle it separately here.
351 * Also, p->stime needs to be updated for ksoftirqd. 355 * Also, p->stime needs to be updated for ksoftirqd.
352 */ 356 */
353 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, 357 __account_system_time(p, cputime, scaled, CPUTIME_SOFTIRQ);
354 CPUTIME_SOFTIRQ);
355 } else if (user_tick) { 358 } else if (user_tick) {
356 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); 359 account_user_time(p, cputime, scaled);
357 } else if (p == rq->idle) { 360 } else if (p == rq->idle) {
358 account_idle_time(cputime_one_jiffy); 361 account_idle_time(cputime);
359 } else if (p->flags & PF_VCPU) { /* System time or guest time */ 362 } else if (p->flags & PF_VCPU) { /* System time or guest time */
360 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); 363 account_guest_time(p, cputime, scaled);
361 } else { 364 } else {
362 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, 365 __account_system_time(p, cputime, scaled, CPUTIME_SYSTEM);
363 CPUTIME_SYSTEM);
364 } 366 }
365} 367}
366 368
367static void irqtime_account_idle_ticks(int ticks) 369static void irqtime_account_idle_ticks(int ticks)
368{ 370{
369 int i;
370 struct rq *rq = this_rq(); 371 struct rq *rq = this_rq();
371 372
372 for (i = 0; i < ticks; i++) 373 irqtime_account_process_tick(current, 0, rq, ticks);
373 irqtime_account_process_tick(current, 0, rq);
374} 374}
375#else /* CONFIG_IRQ_TIME_ACCOUNTING */ 375#else /* CONFIG_IRQ_TIME_ACCOUNTING */
376static inline void irqtime_account_idle_ticks(int ticks) {} 376static inline void irqtime_account_idle_ticks(int ticks) {}
377static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick, 377static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
378 struct rq *rq) {} 378 struct rq *rq, int nr_ticks) {}
379#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ 379#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
380 380
381/* 381/*
@@ -464,7 +464,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
464 return; 464 return;
465 465
466 if (sched_clock_irqtime) { 466 if (sched_clock_irqtime) {
467 irqtime_account_process_tick(p, user_tick, rq); 467 irqtime_account_process_tick(p, user_tick, rq, 1);
468 return; 468 return;
469 } 469 }
470 470
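irqtime_account_process_tick() above now takes the number of ticks and scales the per-jiffy amounts once, instead of irqtime_account_idle_ticks() calling it in a loop. A toy userspace sketch of that batching; the tick length, struct, and buckets are invented for the example, and only the idle and user paths are shown:

#include <stdio.h>

#define ONE_JIFFY_NS 1000000ULL		/* assumed tick length for the example */

struct cpustat {
	unsigned long long user;
	unsigned long long idle;
};

static void account_process_ticks(struct cpustat *st, int user_tick, int ticks)
{
	unsigned long long cputime = ONE_JIFFY_NS;

	cputime *= ticks;		/* was: charge one jiffy per loop iteration */

	if (user_tick)
		st->user += cputime;
	else
		st->idle += cputime;	/* other buckets (irq, softirq, ...) elided */
}

static void account_idle_ticks(struct cpustat *st, int ticks)
{
	account_process_ticks(st, 0, ticks);	/* one call instead of a loop */
}

int main(void)
{
	struct cpustat st = { 0 };

	account_idle_ticks(&st, 5);
	printf("idle: %llu ns\n", st.idle);	/* 5 ticks charged at once */
	return 0;
}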
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index b08095786cb8..fc4f98b1258f 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -57,8 +57,6 @@ void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime)
57 dl_b->dl_runtime = runtime; 57 dl_b->dl_runtime = runtime;
58} 58}
59 59
60extern unsigned long to_ratio(u64 period, u64 runtime);
61
62void init_dl_bw(struct dl_bw *dl_b) 60void init_dl_bw(struct dl_bw *dl_b)
63{ 61{
64 raw_spin_lock_init(&dl_b->lock); 62 raw_spin_lock_init(&dl_b->lock);
@@ -348,12 +346,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
348 * entity. 346 * entity.
349 */ 347 */
350 if (dl_time_before(dl_se->deadline, rq_clock(rq))) { 348 if (dl_time_before(dl_se->deadline, rq_clock(rq))) {
351 static bool lag_once = false; 349 printk_deferred_once("sched: DL replenish lagged to much\n");
352
353 if (!lag_once) {
354 lag_once = true;
355 printk_sched("sched: DL replenish lagged to much\n");
356 }
357 dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; 350 dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
358 dl_se->runtime = pi_se->dl_runtime; 351 dl_se->runtime = pi_se->dl_runtime;
359 } 352 }
@@ -513,14 +506,22 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
513 struct sched_dl_entity, 506 struct sched_dl_entity,
514 dl_timer); 507 dl_timer);
515 struct task_struct *p = dl_task_of(dl_se); 508 struct task_struct *p = dl_task_of(dl_se);
516 struct rq *rq = task_rq(p); 509 struct rq *rq;
510again:
511 rq = task_rq(p);
517 raw_spin_lock(&rq->lock); 512 raw_spin_lock(&rq->lock);
518 513
514 if (rq != task_rq(p)) {
515 /* Task was moved, retrying. */
516 raw_spin_unlock(&rq->lock);
517 goto again;
518 }
519
519 /* 520 /*
520 * We need to take care of a possible races here. In fact, the 521 * We need to take care of a possible races here. In fact, the
521 * task might have changed its scheduling policy to something 522 * task might have changed its scheduling policy to something
522 * different from SCHED_DEADLINE or changed its reservation 523 * different from SCHED_DEADLINE or changed its reservation
523 * parameters (through sched_setscheduler()). 524 * parameters (through sched_setattr()).
524 */ 525 */
525 if (!dl_task(p) || dl_se->dl_new) 526 if (!dl_task(p) || dl_se->dl_new)
526 goto unlock; 527 goto unlock;
@@ -528,6 +529,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
528 sched_clock_tick(); 529 sched_clock_tick();
529 update_rq_clock(rq); 530 update_rq_clock(rq);
530 dl_se->dl_throttled = 0; 531 dl_se->dl_throttled = 0;
532 dl_se->dl_yielded = 0;
531 if (p->on_rq) { 533 if (p->on_rq) {
532 enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); 534 enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
533 if (task_has_dl_policy(rq->curr)) 535 if (task_has_dl_policy(rq->curr))
@@ -740,7 +742,7 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
740 742
741 WARN_ON(!dl_prio(prio)); 743 WARN_ON(!dl_prio(prio));
742 dl_rq->dl_nr_running++; 744 dl_rq->dl_nr_running++;
743 inc_nr_running(rq_of_dl_rq(dl_rq)); 745 add_nr_running(rq_of_dl_rq(dl_rq), 1);
744 746
745 inc_dl_deadline(dl_rq, deadline); 747 inc_dl_deadline(dl_rq, deadline);
746 inc_dl_migration(dl_se, dl_rq); 748 inc_dl_migration(dl_se, dl_rq);
@@ -754,7 +756,7 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
754 WARN_ON(!dl_prio(prio)); 756 WARN_ON(!dl_prio(prio));
755 WARN_ON(!dl_rq->dl_nr_running); 757 WARN_ON(!dl_rq->dl_nr_running);
756 dl_rq->dl_nr_running--; 758 dl_rq->dl_nr_running--;
757 dec_nr_running(rq_of_dl_rq(dl_rq)); 759 sub_nr_running(rq_of_dl_rq(dl_rq), 1);
758 760
759 dec_dl_deadline(dl_rq, dl_se->deadline); 761 dec_dl_deadline(dl_rq, dl_se->deadline);
760 dec_dl_migration(dl_se, dl_rq); 762 dec_dl_migration(dl_se, dl_rq);
@@ -893,10 +895,10 @@ static void yield_task_dl(struct rq *rq)
893 * We make the task go to sleep until its current deadline by 895 * We make the task go to sleep until its current deadline by
894 * forcing its runtime to zero. This way, update_curr_dl() stops 896 * forcing its runtime to zero. This way, update_curr_dl() stops
895 * it and the bandwidth timer will wake it up and will give it 897 * it and the bandwidth timer will wake it up and will give it
896 * new scheduling parameters (thanks to dl_new=1). 898 * new scheduling parameters (thanks to dl_yielded=1).
897 */ 899 */
898 if (p->dl.runtime > 0) { 900 if (p->dl.runtime > 0) {
899 rq->curr->dl.dl_new = 1; 901 rq->curr->dl.dl_yielded = 1;
900 p->dl.runtime = 0; 902 p->dl.runtime = 0;
901 } 903 }
902 update_curr_dl(rq); 904 update_curr_dl(rq);
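The dl_task_timer() hunk above re-reads task_rq(p) after taking the lock and retries if the task migrated in the meantime. The same lock-then-recheck pattern, modelled with pthread mutexes; the unsynchronized read of p->rq is a simplification, the kernel has its own rules for that access:

#include <pthread.h>

struct runqueue {
	pthread_mutex_t lock;
};

struct task {
	struct runqueue *rq;	/* may change while we are not holding rq->lock */
};

static struct runqueue *lock_task_rq(struct task *p)
{
	struct runqueue *rq;

again:
	rq = p->rq;
	pthread_mutex_lock(&rq->lock);
	if (rq != p->rq) {	/* task moved while we waited for the lock */
		pthread_mutex_unlock(&rq->lock);
		goto again;
	}
	return rq;		/* caller unlocks when done */
}

int main(void)
{
	struct runqueue rq = { .lock = PTHREAD_MUTEX_INITIALIZER };
	struct task p = { .rq = &rq };
	struct runqueue *locked = lock_task_rq(&p);

	pthread_mutex_unlock(&locked->lock);
	return 0;
}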
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7570dd969c28..fea7d3335e1f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1017,7 +1017,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
1017static unsigned long weighted_cpuload(const int cpu); 1017static unsigned long weighted_cpuload(const int cpu);
1018static unsigned long source_load(int cpu, int type); 1018static unsigned long source_load(int cpu, int type);
1019static unsigned long target_load(int cpu, int type); 1019static unsigned long target_load(int cpu, int type);
1020static unsigned long power_of(int cpu); 1020static unsigned long capacity_of(int cpu);
1021static long effective_load(struct task_group *tg, int cpu, long wl, long wg); 1021static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
1022 1022
1023/* Cached statistics for all CPUs within a node */ 1023/* Cached statistics for all CPUs within a node */
@@ -1026,11 +1026,11 @@ struct numa_stats {
1026 unsigned long load; 1026 unsigned long load;
1027 1027
1028 /* Total compute capacity of CPUs on a node */ 1028 /* Total compute capacity of CPUs on a node */
1029 unsigned long power; 1029 unsigned long compute_capacity;
1030 1030
1031 /* Approximate capacity in terms of runnable tasks on a node */ 1031 /* Approximate capacity in terms of runnable tasks on a node */
1032 unsigned long capacity; 1032 unsigned long task_capacity;
1033 int has_capacity; 1033 int has_free_capacity;
1034}; 1034};
1035 1035
1036/* 1036/*
@@ -1046,7 +1046,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
1046 1046
1047 ns->nr_running += rq->nr_running; 1047 ns->nr_running += rq->nr_running;
1048 ns->load += weighted_cpuload(cpu); 1048 ns->load += weighted_cpuload(cpu);
1049 ns->power += power_of(cpu); 1049 ns->compute_capacity += capacity_of(cpu);
1050 1050
1051 cpus++; 1051 cpus++;
1052 } 1052 }
@@ -1056,15 +1056,16 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
1056 * the @ns structure is NULL'ed and task_numa_compare() will 1056 * the @ns structure is NULL'ed and task_numa_compare() will
1057 * not find this node attractive. 1057 * not find this node attractive.
1058 * 1058 *
1059 * We'll either bail at !has_capacity, or we'll detect a huge imbalance 1059 * We'll either bail at !has_free_capacity, or we'll detect a huge
1060 * and bail there. 1060 * imbalance and bail there.
1061 */ 1061 */
1062 if (!cpus) 1062 if (!cpus)
1063 return; 1063 return;
1064 1064
1065 ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power; 1065 ns->load = (ns->load * SCHED_CAPACITY_SCALE) / ns->compute_capacity;
1066 ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE); 1066 ns->task_capacity =
1067 ns->has_capacity = (ns->nr_running < ns->capacity); 1067 DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE);
1068 ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
1068} 1069}
1069 1070
1070struct task_numa_env { 1071struct task_numa_env {
@@ -1095,6 +1096,34 @@ static void task_numa_assign(struct task_numa_env *env,
1095 env->best_cpu = env->dst_cpu; 1096 env->best_cpu = env->dst_cpu;
1096} 1097}
1097 1098
1099static bool load_too_imbalanced(long orig_src_load, long orig_dst_load,
1100 long src_load, long dst_load,
1101 struct task_numa_env *env)
1102{
1103 long imb, old_imb;
1104
1105 /* We care about the slope of the imbalance, not the direction. */
1106 if (dst_load < src_load)
1107 swap(dst_load, src_load);
1108
1109 /* Is the difference below the threshold? */
1110 imb = dst_load * 100 - src_load * env->imbalance_pct;
1111 if (imb <= 0)
1112 return false;
1113
1114 /*
1115 * The imbalance is above the allowed threshold.
1116 * Compare it with the old imbalance.
1117 */
1118 if (orig_dst_load < orig_src_load)
1119 swap(orig_dst_load, orig_src_load);
1120
1121 old_imb = orig_dst_load * 100 - orig_src_load * env->imbalance_pct;
1122
1123 /* Would this change make things worse? */
1124 return (imb > old_imb);
1125}
1126
1098/* 1127/*
1099 * This checks if the overall compute and NUMA accesses of the system would 1128 * This checks if the overall compute and NUMA accesses of the system would
1100 * be improved if the source tasks was migrated to the target dst_cpu taking 1129 * be improved if the source tasks was migrated to the target dst_cpu taking
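A standalone version of the helper added above, driven with made-up load figures, to show the intent: a move is refused only when it both exceeds the imbalance threshold and makes the existing imbalance worse.

#include <stdbool.h>
#include <stdio.h>

static bool load_too_imbalanced(long orig_src, long orig_dst,
				long src, long dst, int imbalance_pct)
{
	long imb, old_imb;

	if (dst < src) {		/* slope of the imbalance, not direction */
		long t = dst; dst = src; src = t;
	}
	imb = dst * 100 - src * imbalance_pct;
	if (imb <= 0)
		return false;		/* still within the allowed threshold */

	if (orig_dst < orig_src) {
		long t = orig_dst; orig_dst = orig_src; orig_src = t;
	}
	old_imb = orig_dst * 100 - orig_src * imbalance_pct;

	return imb > old_imb;		/* worse than before: refuse the move */
}

int main(void)
{
	/*
	 * Nodes start out at 1000 vs 400; moving a task of load 100 off the
	 * busy node leaves 900 vs 500.  Still above a 125% threshold, but the
	 * imbalance shrank, so the move is allowed (prints 0).
	 */
	printf("%d\n", load_too_imbalanced(1000, 400, 900, 500, 125));
	return 0;
}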
@@ -1107,7 +1136,8 @@ static void task_numa_compare(struct task_numa_env *env,
1107 struct rq *src_rq = cpu_rq(env->src_cpu); 1136 struct rq *src_rq = cpu_rq(env->src_cpu);
1108 struct rq *dst_rq = cpu_rq(env->dst_cpu); 1137 struct rq *dst_rq = cpu_rq(env->dst_cpu);
1109 struct task_struct *cur; 1138 struct task_struct *cur;
1110 long dst_load, src_load; 1139 long orig_src_load, src_load;
1140 long orig_dst_load, dst_load;
1111 long load; 1141 long load;
1112 long imp = (groupimp > 0) ? groupimp : taskimp; 1142 long imp = (groupimp > 0) ? groupimp : taskimp;
1113 1143
@@ -1166,8 +1196,8 @@ static void task_numa_compare(struct task_numa_env *env,
1166 1196
1167 if (!cur) { 1197 if (!cur) {
1168 /* Is there capacity at our destination? */ 1198 /* Is there capacity at our destination? */
1169 if (env->src_stats.has_capacity && 1199 if (env->src_stats.has_free_capacity &&
1170 !env->dst_stats.has_capacity) 1200 !env->dst_stats.has_free_capacity)
1171 goto unlock; 1201 goto unlock;
1172 1202
1173 goto balance; 1203 goto balance;
@@ -1181,13 +1211,13 @@ static void task_numa_compare(struct task_numa_env *env,
1181 * In the overloaded case, try and keep the load balanced. 1211 * In the overloaded case, try and keep the load balanced.
1182 */ 1212 */
1183balance: 1213balance:
1184 dst_load = env->dst_stats.load; 1214 orig_dst_load = env->dst_stats.load;
1185 src_load = env->src_stats.load; 1215 orig_src_load = env->src_stats.load;
1186 1216
1187 /* XXX missing power terms */ 1217 /* XXX missing capacity terms */
1188 load = task_h_load(env->p); 1218 load = task_h_load(env->p);
1189 dst_load += load; 1219 dst_load = orig_dst_load + load;
1190 src_load -= load; 1220 src_load = orig_src_load - load;
1191 1221
1192 if (cur) { 1222 if (cur) {
1193 load = task_h_load(cur); 1223 load = task_h_load(cur);
@@ -1195,11 +1225,8 @@ balance:
1195 src_load += load; 1225 src_load += load;
1196 } 1226 }
1197 1227
1198 /* make src_load the smaller */ 1228 if (load_too_imbalanced(orig_src_load, orig_dst_load,
1199 if (dst_load < src_load) 1229 src_load, dst_load, env))
1200 swap(dst_load, src_load);
1201
1202 if (src_load * env->imbalance_pct < dst_load * 100)
1203 goto unlock; 1230 goto unlock;
1204 1231
1205assign: 1232assign:
@@ -1275,8 +1302,8 @@ static int task_numa_migrate(struct task_struct *p)
1275 groupimp = group_weight(p, env.dst_nid) - groupweight; 1302 groupimp = group_weight(p, env.dst_nid) - groupweight;
1276 update_numa_stats(&env.dst_stats, env.dst_nid); 1303 update_numa_stats(&env.dst_stats, env.dst_nid);
1277 1304
1278 /* If the preferred nid has capacity, try to use it. */ 1305 /* If the preferred nid has free capacity, try to use it. */
1279 if (env.dst_stats.has_capacity) 1306 if (env.dst_stats.has_free_capacity)
1280 task_numa_find_cpu(&env, taskimp, groupimp); 1307 task_numa_find_cpu(&env, taskimp, groupimp);
1281 1308
1282 /* No space available on the preferred nid. Look elsewhere. */ 1309 /* No space available on the preferred nid. Look elsewhere. */
@@ -1301,7 +1328,16 @@ static int task_numa_migrate(struct task_struct *p)
1301 if (env.best_cpu == -1) 1328 if (env.best_cpu == -1)
1302 return -EAGAIN; 1329 return -EAGAIN;
1303 1330
1304 sched_setnuma(p, env.dst_nid); 1331 /*
1332 * If the task is part of a workload that spans multiple NUMA nodes,
1333 * and is migrating into one of the workload's active nodes, remember
1334 * this node as the task's preferred numa node, so the workload can
1335 * settle down.
1336 * A task that migrated to a second choice node will be better off
1337 * trying for a better one later. Do not set the preferred node here.
1338 */
1339 if (p->numa_group && node_isset(env.dst_nid, p->numa_group->active_nodes))
1340 sched_setnuma(p, env.dst_nid);
1305 1341
1306 /* 1342 /*
1307 * Reset the scan period if the task is being rescheduled on an 1343 * Reset the scan period if the task is being rescheduled on an
@@ -1326,12 +1362,15 @@ static int task_numa_migrate(struct task_struct *p)
1326/* Attempt to migrate a task to a CPU on the preferred node. */ 1362/* Attempt to migrate a task to a CPU on the preferred node. */
1327static void numa_migrate_preferred(struct task_struct *p) 1363static void numa_migrate_preferred(struct task_struct *p)
1328{ 1364{
1365 unsigned long interval = HZ;
1366
1329 /* This task has no NUMA fault statistics yet */ 1367 /* This task has no NUMA fault statistics yet */
1330 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory)) 1368 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory))
1331 return; 1369 return;
1332 1370
1333 /* Periodically retry migrating the task to the preferred node */ 1371 /* Periodically retry migrating the task to the preferred node */
1334 p->numa_migrate_retry = jiffies + HZ; 1372 interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
1373 p->numa_migrate_retry = jiffies + interval;
1335 1374
1336 /* Success if task is already running on preferred CPU */ 1375 /* Success if task is already running on preferred CPU */
1337 if (task_node(p) == p->numa_preferred_nid) 1376 if (task_node(p) == p->numa_preferred_nid)
@@ -1707,18 +1746,19 @@ no_join:
1707void task_numa_free(struct task_struct *p) 1746void task_numa_free(struct task_struct *p)
1708{ 1747{
1709 struct numa_group *grp = p->numa_group; 1748 struct numa_group *grp = p->numa_group;
1710 int i;
1711 void *numa_faults = p->numa_faults_memory; 1749 void *numa_faults = p->numa_faults_memory;
1750 unsigned long flags;
1751 int i;
1712 1752
1713 if (grp) { 1753 if (grp) {
1714 spin_lock_irq(&grp->lock); 1754 spin_lock_irqsave(&grp->lock, flags);
1715 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) 1755 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
1716 grp->faults[i] -= p->numa_faults_memory[i]; 1756 grp->faults[i] -= p->numa_faults_memory[i];
1717 grp->total_faults -= p->total_numa_faults; 1757 grp->total_faults -= p->total_numa_faults;
1718 1758
1719 list_del(&p->numa_entry); 1759 list_del(&p->numa_entry);
1720 grp->nr_tasks--; 1760 grp->nr_tasks--;
1721 spin_unlock_irq(&grp->lock); 1761 spin_unlock_irqrestore(&grp->lock, flags);
1722 rcu_assign_pointer(p->numa_group, NULL); 1762 rcu_assign_pointer(p->numa_group, NULL);
1723 put_numa_group(grp); 1763 put_numa_group(grp);
1724 } 1764 }
@@ -1738,6 +1778,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
1738 struct task_struct *p = current; 1778 struct task_struct *p = current;
1739 bool migrated = flags & TNF_MIGRATED; 1779 bool migrated = flags & TNF_MIGRATED;
1740 int cpu_node = task_node(current); 1780 int cpu_node = task_node(current);
1781 int local = !!(flags & TNF_FAULT_LOCAL);
1741 int priv; 1782 int priv;
1742 1783
1743 if (!numabalancing_enabled) 1784 if (!numabalancing_enabled)
@@ -1786,6 +1827,17 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
1786 task_numa_group(p, last_cpupid, flags, &priv); 1827 task_numa_group(p, last_cpupid, flags, &priv);
1787 } 1828 }
1788 1829
1830 /*
1831 * If a workload spans multiple NUMA nodes, a shared fault that
1832 * occurs wholly within the set of nodes that the workload is
1833 * actively using should be counted as local. This allows the
1834 * scan rate to slow down when a workload has settled down.
1835 */
1836 if (!priv && !local && p->numa_group &&
1837 node_isset(cpu_node, p->numa_group->active_nodes) &&
1838 node_isset(mem_node, p->numa_group->active_nodes))
1839 local = 1;
1840
1789 task_numa_placement(p); 1841 task_numa_placement(p);
1790 1842
1791 /* 1843 /*
@@ -1800,7 +1852,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
1800 1852
1801 p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages; 1853 p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages;
1802 p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages; 1854 p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages;
1803 p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages; 1855 p->numa_faults_locality[local] += pages;
1804} 1856}
1805 1857
1806static void reset_ptenuma_scan(struct task_struct *p) 1858static void reset_ptenuma_scan(struct task_struct *p)
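The two hunks above widen what counts as a local fault: a shared fault whose CPU node and memory node both sit inside the numa_group's active node set is booked as local, which lets the scan rate back off once a multi-node workload settles. A toy version of that test, with a plain bitmask standing in for the nodemask and same-node standing in for TNF_FAULT_LOCAL:

#include <stdio.h>

static int node_isset(int nid, unsigned long mask)
{
	return !!(mask & (1UL << nid));
}

static int fault_counts_as_local(int cpu_node, int mem_node,
				 unsigned long active_nodes, int priv)
{
	if (cpu_node == mem_node)
		return 1;			/* genuinely local access */

	/* shared fault wholly inside the group's active node set */
	if (!priv &&
	    node_isset(cpu_node, active_nodes) &&
	    node_isset(mem_node, active_nodes))
		return 1;

	return 0;
}

int main(void)
{
	unsigned long active = (1UL << 0) | (1UL << 1);	/* nodes 0 and 1 */

	/* shared fault, CPU on node 0, memory on node 1: counted as local */
	printf("%d\n", fault_counts_as_local(0, 1, active, 0));
	return 0;
}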
@@ -3129,7 +3181,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3129 */ 3181 */
3130 if (!cfs_b->timer_active) { 3182 if (!cfs_b->timer_active) {
3131 __refill_cfs_bandwidth_runtime(cfs_b); 3183 __refill_cfs_bandwidth_runtime(cfs_b);
3132 __start_cfs_bandwidth(cfs_b); 3184 __start_cfs_bandwidth(cfs_b, false);
3133 } 3185 }
3134 3186
3135 if (cfs_b->runtime > 0) { 3187 if (cfs_b->runtime > 0) {
@@ -3174,10 +3226,12 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3174 * has not truly expired. 3226 * has not truly expired.
3175 * 3227 *
3176 * Fortunately we can check determine whether this the case by checking 3228 * Fortunately we can check determine whether this the case by checking
3177 * whether the global deadline has advanced. 3229 * whether the global deadline has advanced. It is valid to compare
3230 * cfs_b->runtime_expires without any locks since we only care about
3231 * exact equality, so a partial write will still work.
3178 */ 3232 */
3179 3233
3180 if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) { 3234 if (cfs_rq->runtime_expires != cfs_b->runtime_expires) {
3181 /* extend local deadline, drift is bounded above by 2 ticks */ 3235 /* extend local deadline, drift is bounded above by 2 ticks */
3182 cfs_rq->runtime_expires += TICK_NSEC; 3236 cfs_rq->runtime_expires += TICK_NSEC;
3183 } else { 3237 } else {
@@ -3301,14 +3355,14 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
3301 } 3355 }
3302 3356
3303 if (!se) 3357 if (!se)
3304 rq->nr_running -= task_delta; 3358 sub_nr_running(rq, task_delta);
3305 3359
3306 cfs_rq->throttled = 1; 3360 cfs_rq->throttled = 1;
3307 cfs_rq->throttled_clock = rq_clock(rq); 3361 cfs_rq->throttled_clock = rq_clock(rq);
3308 raw_spin_lock(&cfs_b->lock); 3362 raw_spin_lock(&cfs_b->lock);
3309 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); 3363 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
3310 if (!cfs_b->timer_active) 3364 if (!cfs_b->timer_active)
3311 __start_cfs_bandwidth(cfs_b); 3365 __start_cfs_bandwidth(cfs_b, false);
3312 raw_spin_unlock(&cfs_b->lock); 3366 raw_spin_unlock(&cfs_b->lock);
3313} 3367}
3314 3368
@@ -3352,7 +3406,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
3352 } 3406 }
3353 3407
3354 if (!se) 3408 if (!se)
3355 rq->nr_running += task_delta; 3409 add_nr_running(rq, task_delta);
3356 3410
3357 /* determine whether we need to wake up potentially idle cpu */ 3411 /* determine whether we need to wake up potentially idle cpu */
3358 if (rq->curr == rq->idle && rq->cfs.nr_running) 3412 if (rq->curr == rq->idle && rq->cfs.nr_running)
@@ -3406,21 +3460,21 @@ next:
3406static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) 3460static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
3407{ 3461{
3408 u64 runtime, runtime_expires; 3462 u64 runtime, runtime_expires;
3409 int idle = 1, throttled; 3463 int throttled;
3410 3464
3411 raw_spin_lock(&cfs_b->lock);
3412 /* no need to continue the timer with no bandwidth constraint */ 3465 /* no need to continue the timer with no bandwidth constraint */
3413 if (cfs_b->quota == RUNTIME_INF) 3466 if (cfs_b->quota == RUNTIME_INF)
3414 goto out_unlock; 3467 goto out_deactivate;
3415 3468
3416 throttled = !list_empty(&cfs_b->throttled_cfs_rq); 3469 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
3417 /* idle depends on !throttled (for the case of a large deficit) */
3418 idle = cfs_b->idle && !throttled;
3419 cfs_b->nr_periods += overrun; 3470 cfs_b->nr_periods += overrun;
3420 3471
3421 /* if we're going inactive then everything else can be deferred */ 3472 /*
3422 if (idle) 3473 * idle depends on !throttled (for the case of a large deficit), and if
3423 goto out_unlock; 3474 * we're going inactive then everything else can be deferred
3475 */
3476 if (cfs_b->idle && !throttled)
3477 goto out_deactivate;
3424 3478
3425 /* 3479 /*
3426 * if we have relooped after returning idle once, we need to update our 3480 * if we have relooped after returning idle once, we need to update our
@@ -3434,7 +3488,7 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
3434 if (!throttled) { 3488 if (!throttled) {
3435 /* mark as potentially idle for the upcoming period */ 3489 /* mark as potentially idle for the upcoming period */
3436 cfs_b->idle = 1; 3490 cfs_b->idle = 1;
3437 goto out_unlock; 3491 return 0;
3438 } 3492 }
3439 3493
3440 /* account preceding periods in which throttling occurred */ 3494 /* account preceding periods in which throttling occurred */
@@ -3474,12 +3528,12 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
3474 * timer to remain active while there are any throttled entities.) 3528 * timer to remain active while there are any throttled entities.)
3475 */ 3529 */
3476 cfs_b->idle = 0; 3530 cfs_b->idle = 0;
3477out_unlock:
3478 if (idle)
3479 cfs_b->timer_active = 0;
3480 raw_spin_unlock(&cfs_b->lock);
3481 3531
3482 return idle; 3532 return 0;
3533
3534out_deactivate:
3535 cfs_b->timer_active = 0;
3536 return 1;
3483} 3537}
3484 3538
3485/* a cfs_rq won't donate quota below this amount */ 3539/* a cfs_rq won't donate quota below this amount */
@@ -3656,6 +3710,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
3656 int overrun; 3710 int overrun;
3657 int idle = 0; 3711 int idle = 0;
3658 3712
3713 raw_spin_lock(&cfs_b->lock);
3659 for (;;) { 3714 for (;;) {
3660 now = hrtimer_cb_get_time(timer); 3715 now = hrtimer_cb_get_time(timer);
3661 overrun = hrtimer_forward(timer, now, cfs_b->period); 3716 overrun = hrtimer_forward(timer, now, cfs_b->period);
@@ -3665,6 +3720,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
3665 3720
3666 idle = do_sched_cfs_period_timer(cfs_b, overrun); 3721 idle = do_sched_cfs_period_timer(cfs_b, overrun);
3667 } 3722 }
3723 raw_spin_unlock(&cfs_b->lock);
3668 3724
3669 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; 3725 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
3670} 3726}
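Taken together with the do_sched_cfs_period_timer() hunks further up, the change above moves cfs_b->lock out to the hrtimer callback, so the whole overrun catch-up loop runs under a single acquisition and the helper simply reports whether the timer should go idle. A pthread-based sketch of that shape; field names and the refill step are placeholders:

#include <pthread.h>

struct bandwidth {
	pthread_mutex_t lock;
	int idle;
	int throttled;
};

/* Caller holds b->lock, mirroring the reworked do_sched_cfs_period_timer(). */
static int do_period(struct bandwidth *b)
{
	if (b->idle && !b->throttled)
		return 1;		/* going inactive: caller stops the timer */

	/* ... refill runtime and distribute to throttled queues ... */
	b->idle = 1;
	return 0;
}

static int period_timer(struct bandwidth *b, int overruns)
{
	int idle = 0;

	pthread_mutex_lock(&b->lock);	/* one lock round-trip for the whole loop */
	while (overruns-- > 0)
		idle = do_period(b);
	pthread_mutex_unlock(&b->lock);

	return idle;			/* nonzero: do not restart the timer */
}

int main(void)
{
	struct bandwidth b = { .lock = PTHREAD_MUTEX_INITIALIZER };

	return period_timer(&b, 2) ? 0 : 1;
}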
@@ -3690,7 +3746,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3690} 3746}
3691 3747
3692/* requires cfs_b->lock, may release to reprogram timer */ 3748/* requires cfs_b->lock, may release to reprogram timer */
3693void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) 3749void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force)
3694{ 3750{
3695 /* 3751 /*
3696 * The timer may be active because we're trying to set a new bandwidth 3752 * The timer may be active because we're trying to set a new bandwidth
@@ -3705,7 +3761,7 @@ void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
3705 cpu_relax(); 3761 cpu_relax();
3706 raw_spin_lock(&cfs_b->lock); 3762 raw_spin_lock(&cfs_b->lock);
3707 /* if someone else restarted the timer then we're done */ 3763 /* if someone else restarted the timer then we're done */
3708 if (cfs_b->timer_active) 3764 if (!force && cfs_b->timer_active)
3709 return; 3765 return;
3710 } 3766 }
3711 3767
@@ -3724,8 +3780,6 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
3724 struct cfs_rq *cfs_rq; 3780 struct cfs_rq *cfs_rq;
3725 3781
3726 for_each_leaf_cfs_rq(rq, cfs_rq) { 3782 for_each_leaf_cfs_rq(rq, cfs_rq) {
3727 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3728
3729 if (!cfs_rq->runtime_enabled) 3783 if (!cfs_rq->runtime_enabled)
3730 continue; 3784 continue;
3731 3785
@@ -3733,7 +3787,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
3733 * clock_task is not advancing so we just need to make sure 3787 * clock_task is not advancing so we just need to make sure
3734 * there's some valid quota amount 3788 * there's some valid quota amount
3735 */ 3789 */
3736 cfs_rq->runtime_remaining = cfs_b->quota; 3790 cfs_rq->runtime_remaining = 1;
3737 if (cfs_rq_throttled(cfs_rq)) 3791 if (cfs_rq_throttled(cfs_rq))
3738 unthrottle_cfs_rq(cfs_rq); 3792 unthrottle_cfs_rq(cfs_rq);
3739 } 3793 }
@@ -3884,7 +3938,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
3884 3938
3885 if (!se) { 3939 if (!se) {
3886 update_rq_runnable_avg(rq, rq->nr_running); 3940 update_rq_runnable_avg(rq, rq->nr_running);
3887 inc_nr_running(rq); 3941 add_nr_running(rq, 1);
3888 } 3942 }
3889 hrtick_update(rq); 3943 hrtick_update(rq);
3890} 3944}
@@ -3944,7 +3998,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
3944 } 3998 }
3945 3999
3946 if (!se) { 4000 if (!se) {
3947 dec_nr_running(rq); 4001 sub_nr_running(rq, 1);
3948 update_rq_runnable_avg(rq, 1); 4002 update_rq_runnable_avg(rq, 1);
3949 } 4003 }
3950 hrtick_update(rq); 4004 hrtick_update(rq);
@@ -3990,9 +4044,9 @@ static unsigned long target_load(int cpu, int type)
3990 return max(rq->cpu_load[type-1], total); 4044 return max(rq->cpu_load[type-1], total);
3991} 4045}
3992 4046
3993static unsigned long power_of(int cpu) 4047static unsigned long capacity_of(int cpu)
3994{ 4048{
3995 return cpu_rq(cpu)->cpu_power; 4049 return cpu_rq(cpu)->cpu_capacity;
3996} 4050}
3997 4051
3998static unsigned long cpu_avg_load_per_task(int cpu) 4052static unsigned long cpu_avg_load_per_task(int cpu)
@@ -4014,8 +4068,8 @@ static void record_wakee(struct task_struct *p)
4014 * about the boundary, really active task won't care 4068 * about the boundary, really active task won't care
4015 * about the loss. 4069 * about the loss.
4016 */ 4070 */
4017 if (jiffies > current->wakee_flip_decay_ts + HZ) { 4071 if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
4018 current->wakee_flips = 0; 4072 current->wakee_flips >>= 1;
4019 current->wakee_flip_decay_ts = jiffies; 4073 current->wakee_flip_decay_ts = jiffies;
4020 } 4074 }
4021 4075
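record_wakee() above now halves wakee_flips once per second instead of zeroing it, and uses time_after() so the comparison survives jiffies wraparound. A small self-contained model of that decay; HZ and the counter values are invented for the example:

#include <stdio.h>

static int time_after(unsigned long a, unsigned long b)
{
	return (long)(b - a) < 0;	/* wrap-safe ordering check */
}

struct wakee {
	unsigned long flips;
	unsigned long decay_ts;
};

static void record_wakee(struct wakee *w, unsigned long now, unsigned long hz)
{
	if (time_after(now, w->decay_ts + hz)) {
		w->flips >>= 1;		/* was: w->flips = 0 */
		w->decay_ts = now;
	}
	w->flips++;
}

int main(void)
{
	struct wakee w = { .flips = 64, .decay_ts = 0 };

	record_wakee(&w, 1001, 1000);		/* one decay window has passed */
	printf("flips: %lu\n", w.flips);	/* 33 = 64/2 + 1, not reset to 1 */
	return 0;
}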
@@ -4235,12 +4289,12 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
4235 s64 this_eff_load, prev_eff_load; 4289 s64 this_eff_load, prev_eff_load;
4236 4290
4237 this_eff_load = 100; 4291 this_eff_load = 100;
4238 this_eff_load *= power_of(prev_cpu); 4292 this_eff_load *= capacity_of(prev_cpu);
4239 this_eff_load *= this_load + 4293 this_eff_load *= this_load +
4240 effective_load(tg, this_cpu, weight, weight); 4294 effective_load(tg, this_cpu, weight, weight);
4241 4295
4242 prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; 4296 prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
4243 prev_eff_load *= power_of(this_cpu); 4297 prev_eff_load *= capacity_of(this_cpu);
4244 prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight); 4298 prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
4245 4299
4246 balanced = this_eff_load <= prev_eff_load; 4300 balanced = this_eff_load <= prev_eff_load;
@@ -4316,8 +4370,8 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
4316 avg_load += load; 4370 avg_load += load;
4317 } 4371 }
4318 4372
4319 /* Adjust by relative CPU power of the group */ 4373 /* Adjust by relative CPU capacity of the group */
4320 avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power; 4374 avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity;
4321 4375
4322 if (local_group) { 4376 if (local_group) {
4323 this_load = avg_load; 4377 this_load = avg_load;
@@ -4449,10 +4503,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
4449 sd = tmp; 4503 sd = tmp;
4450 } 4504 }
4451 4505
4452 if (affine_sd) { 4506 if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync))
4453 if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) 4507 prev_cpu = cpu;
4454 prev_cpu = cpu;
4455 4508
4509 if (sd_flag & SD_BALANCE_WAKE) {
4456 new_cpu = select_idle_sibling(p, prev_cpu); 4510 new_cpu = select_idle_sibling(p, prev_cpu);
4457 goto unlock; 4511 goto unlock;
4458 } 4512 }
@@ -4520,6 +4574,9 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu)
4520 atomic_long_add(se->avg.load_avg_contrib, 4574 atomic_long_add(se->avg.load_avg_contrib,
4521 &cfs_rq->removed_load); 4575 &cfs_rq->removed_load);
4522 } 4576 }
4577
4578 /* We have migrated, no longer consider this task hot */
4579 se->exec_start = 0;
4523} 4580}
4524#endif /* CONFIG_SMP */ 4581#endif /* CONFIG_SMP */
4525 4582
@@ -4894,14 +4951,14 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
4894 * 4951 *
4895 * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3) 4952 * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
4896 * 4953 *
4897 * P_i is the cpu power (or compute capacity) of cpu i, typically it is the 4954 * C_i is the compute capacity of cpu i, typically it is the
4898 * fraction of 'recent' time available for SCHED_OTHER task execution. But it 4955 * fraction of 'recent' time available for SCHED_OTHER task execution. But it
4899 * can also include other factors [XXX]. 4956 * can also include other factors [XXX].
4900 * 4957 *
4901 * To achieve this balance we define a measure of imbalance which follows 4958 * To achieve this balance we define a measure of imbalance which follows
4902 * directly from (1): 4959 * directly from (1):
4903 * 4960 *
4904 * imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j } (4) 4961 * imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j } (4)
4905 * 4962 *
4906 * We them move tasks around to minimize the imbalance. In the continuous 4963 * We them move tasks around to minimize the imbalance. In the continuous
4907 * function space it is obvious this converges, in the discrete case we get 4964 * function space it is obvious this converges, in the discrete case we get
@@ -5070,6 +5127,7 @@ task_hot(struct task_struct *p, u64 now)
5070/* Returns true if the destination node has incurred more faults */ 5127/* Returns true if the destination node has incurred more faults */
5071static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) 5128static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
5072{ 5129{
5130 struct numa_group *numa_group = rcu_dereference(p->numa_group);
5073 int src_nid, dst_nid; 5131 int src_nid, dst_nid;
5074 5132
5075 if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory || 5133 if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory ||
@@ -5083,21 +5141,29 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
5083 if (src_nid == dst_nid) 5141 if (src_nid == dst_nid)
5084 return false; 5142 return false;
5085 5143
5086 /* Always encourage migration to the preferred node. */ 5144 if (numa_group) {
5087 if (dst_nid == p->numa_preferred_nid) 5145 /* Task is already in the group's interleave set. */
5088 return true; 5146 if (node_isset(src_nid, numa_group->active_nodes))
5147 return false;
5148
5149 /* Task is moving into the group's interleave set. */
5150 if (node_isset(dst_nid, numa_group->active_nodes))
5151 return true;
5152
5153 return group_faults(p, dst_nid) > group_faults(p, src_nid);
5154 }
5089 5155
5090 /* If both task and group weight improve, this move is a winner. */ 5156 /* Encourage migration to the preferred node. */
5091 if (task_weight(p, dst_nid) > task_weight(p, src_nid) && 5157 if (dst_nid == p->numa_preferred_nid)
5092 group_weight(p, dst_nid) > group_weight(p, src_nid))
5093 return true; 5158 return true;
5094 5159
5095 return false; 5160 return task_faults(p, dst_nid) > task_faults(p, src_nid);
5096} 5161}
5097 5162
5098 5163
5099static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) 5164static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
5100{ 5165{
5166 struct numa_group *numa_group = rcu_dereference(p->numa_group);
5101 int src_nid, dst_nid; 5167 int src_nid, dst_nid;
5102 5168
5103 if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) 5169 if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
@@ -5112,16 +5178,23 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
5112 if (src_nid == dst_nid) 5178 if (src_nid == dst_nid)
5113 return false; 5179 return false;
5114 5180
5181 if (numa_group) {
5182 /* Task is moving within/into the group's interleave set. */
5183 if (node_isset(dst_nid, numa_group->active_nodes))
5184 return false;
5185
5186 /* Task is moving out of the group's interleave set. */
5187 if (node_isset(src_nid, numa_group->active_nodes))
5188 return true;
5189
5190 return group_faults(p, dst_nid) < group_faults(p, src_nid);
5191 }
5192
5115 /* Migrating away from the preferred node is always bad. */ 5193 /* Migrating away from the preferred node is always bad. */
5116 if (src_nid == p->numa_preferred_nid) 5194 if (src_nid == p->numa_preferred_nid)
5117 return true; 5195 return true;
5118 5196
5119 /* If either task or group weight get worse, don't do it. */ 5197 return task_faults(p, dst_nid) < task_faults(p, src_nid);
5120 if (task_weight(p, dst_nid) < task_weight(p, src_nid) ||
5121 group_weight(p, dst_nid) < group_weight(p, src_nid))
5122 return true;
5123
5124 return false;
5125} 5198}
5126 5199
5127#else 5200#else
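For tasks in a numa_group, the two helpers above now decide on the group's active node set first and only fall back to fault counts when the node set does not settle the question. A merged, simplified stand-in for that ordering, with a bitmask for the node set and plain integers for the fault counters:

#include <stdio.h>

static int node_isset(int nid, unsigned long mask)
{
	return !!(mask & (1UL << nid));
}

/* Returns 1 when moving from src_nid to dst_nid looks like a locality win. */
static int move_improves_locality(int src_nid, int dst_nid,
				  unsigned long active_nodes,
				  long faults_src, long faults_dst)
{
	if (src_nid == dst_nid)
		return 0;

	if (active_nodes) {			/* task belongs to a numa_group */
		if (node_isset(src_nid, active_nodes))
			return 0;		/* already inside the interleave set */
		if (node_isset(dst_nid, active_nodes))
			return 1;		/* moving into the interleave set */
	}

	return faults_dst > faults_src;		/* fall back to fault counts */
}

int main(void)
{
	unsigned long active = (1UL << 1) | (1UL << 2);

	/* node 0 is outside the set, node 2 inside: the move is encouraged */
	printf("%d\n", move_improves_locality(0, 2, active, 10, 3));
	return 0;
}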
@@ -5460,13 +5533,13 @@ struct sg_lb_stats {
5460 unsigned long group_load; /* Total load over the CPUs of the group */ 5533 unsigned long group_load; /* Total load over the CPUs of the group */
5461 unsigned long sum_weighted_load; /* Weighted load of group's tasks */ 5534 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
5462 unsigned long load_per_task; 5535 unsigned long load_per_task;
5463 unsigned long group_power; 5536 unsigned long group_capacity;
5464 unsigned int sum_nr_running; /* Nr tasks running in the group */ 5537 unsigned int sum_nr_running; /* Nr tasks running in the group */
5465 unsigned int group_capacity; 5538 unsigned int group_capacity_factor;
5466 unsigned int idle_cpus; 5539 unsigned int idle_cpus;
5467 unsigned int group_weight; 5540 unsigned int group_weight;
5468 int group_imb; /* Is there an imbalance in the group ? */ 5541 int group_imb; /* Is there an imbalance in the group ? */
5469 int group_has_capacity; /* Is there extra capacity in the group? */ 5542 int group_has_free_capacity;
5470#ifdef CONFIG_NUMA_BALANCING 5543#ifdef CONFIG_NUMA_BALANCING
5471 unsigned int nr_numa_running; 5544 unsigned int nr_numa_running;
5472 unsigned int nr_preferred_running; 5545 unsigned int nr_preferred_running;
@@ -5481,7 +5554,7 @@ struct sd_lb_stats {
5481 struct sched_group *busiest; /* Busiest group in this sd */ 5554 struct sched_group *busiest; /* Busiest group in this sd */
5482 struct sched_group *local; /* Local group in this sd */ 5555 struct sched_group *local; /* Local group in this sd */
5483 unsigned long total_load; /* Total load of all groups in sd */ 5556 unsigned long total_load; /* Total load of all groups in sd */
5484 unsigned long total_pwr; /* Total power of all groups in sd */ 5557 unsigned long total_capacity; /* Total capacity of all groups in sd */
5485 unsigned long avg_load; /* Average load across all groups in sd */ 5558 unsigned long avg_load; /* Average load across all groups in sd */
5486 5559
5487 struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */ 5560 struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
@@ -5500,7 +5573,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
5500 .busiest = NULL, 5573 .busiest = NULL,
5501 .local = NULL, 5574 .local = NULL,
5502 .total_load = 0UL, 5575 .total_load = 0UL,
5503 .total_pwr = 0UL, 5576 .total_capacity = 0UL,
5504 .busiest_stat = { 5577 .busiest_stat = {
5505 .avg_load = 0UL, 5578 .avg_load = 0UL,
5506 }, 5579 },
@@ -5535,17 +5608,17 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
5535 return load_idx; 5608 return load_idx;
5536} 5609}
5537 5610
5538static unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) 5611static unsigned long default_scale_capacity(struct sched_domain *sd, int cpu)
5539{ 5612{
5540 return SCHED_POWER_SCALE; 5613 return SCHED_CAPACITY_SCALE;
5541} 5614}
5542 5615
5543unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) 5616unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
5544{ 5617{
5545 return default_scale_freq_power(sd, cpu); 5618 return default_scale_capacity(sd, cpu);
5546} 5619}
5547 5620
5548static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) 5621static unsigned long default_scale_smt_capacity(struct sched_domain *sd, int cpu)
5549{ 5622{
5550 unsigned long weight = sd->span_weight; 5623 unsigned long weight = sd->span_weight;
5551 unsigned long smt_gain = sd->smt_gain; 5624 unsigned long smt_gain = sd->smt_gain;
@@ -5555,15 +5628,16 @@ static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
5555 return smt_gain; 5628 return smt_gain;
5556} 5629}
5557 5630
5558unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) 5631unsigned long __weak arch_scale_smt_capacity(struct sched_domain *sd, int cpu)
5559{ 5632{
5560 return default_scale_smt_power(sd, cpu); 5633 return default_scale_smt_capacity(sd, cpu);
5561} 5634}
5562 5635
5563static unsigned long scale_rt_power(int cpu) 5636static unsigned long scale_rt_capacity(int cpu)
5564{ 5637{
5565 struct rq *rq = cpu_rq(cpu); 5638 struct rq *rq = cpu_rq(cpu);
5566 u64 total, available, age_stamp, avg; 5639 u64 total, available, age_stamp, avg;
5640 s64 delta;
5567 5641
5568 /* 5642 /*
5569 * Since we're reading these variables without serialization make sure 5643 * Since we're reading these variables without serialization make sure
@@ -5572,74 +5646,78 @@ static unsigned long scale_rt_power(int cpu)
5572 age_stamp = ACCESS_ONCE(rq->age_stamp); 5646 age_stamp = ACCESS_ONCE(rq->age_stamp);
5573 avg = ACCESS_ONCE(rq->rt_avg); 5647 avg = ACCESS_ONCE(rq->rt_avg);
5574 5648
5575 total = sched_avg_period() + (rq_clock(rq) - age_stamp); 5649 delta = rq_clock(rq) - age_stamp;
5650 if (unlikely(delta < 0))
5651 delta = 0;
5652
5653 total = sched_avg_period() + delta;
5576 5654
5577 if (unlikely(total < avg)) { 5655 if (unlikely(total < avg)) {
5578 /* Ensures that power won't end up being negative */ 5656 /* Ensures that capacity won't end up being negative */
5579 available = 0; 5657 available = 0;
5580 } else { 5658 } else {
5581 available = total - avg; 5659 available = total - avg;
5582 } 5660 }
5583 5661
5584 if (unlikely((s64)total < SCHED_POWER_SCALE)) 5662 if (unlikely((s64)total < SCHED_CAPACITY_SCALE))
5585 total = SCHED_POWER_SCALE; 5663 total = SCHED_CAPACITY_SCALE;
5586 5664
5587 total >>= SCHED_POWER_SHIFT; 5665 total >>= SCHED_CAPACITY_SHIFT;
5588 5666
5589 return div_u64(available, total); 5667 return div_u64(available, total);
5590} 5668}
5591 5669
5592static void update_cpu_power(struct sched_domain *sd, int cpu) 5670static void update_cpu_capacity(struct sched_domain *sd, int cpu)
5593{ 5671{
5594 unsigned long weight = sd->span_weight; 5672 unsigned long weight = sd->span_weight;
5595 unsigned long power = SCHED_POWER_SCALE; 5673 unsigned long capacity = SCHED_CAPACITY_SCALE;
5596 struct sched_group *sdg = sd->groups; 5674 struct sched_group *sdg = sd->groups;
5597 5675
5598 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { 5676 if ((sd->flags & SD_SHARE_CPUCAPACITY) && weight > 1) {
5599 if (sched_feat(ARCH_POWER)) 5677 if (sched_feat(ARCH_CAPACITY))
5600 power *= arch_scale_smt_power(sd, cpu); 5678 capacity *= arch_scale_smt_capacity(sd, cpu);
5601 else 5679 else
5602 power *= default_scale_smt_power(sd, cpu); 5680 capacity *= default_scale_smt_capacity(sd, cpu);
5603 5681
5604 power >>= SCHED_POWER_SHIFT; 5682 capacity >>= SCHED_CAPACITY_SHIFT;
5605 } 5683 }
5606 5684
5607 sdg->sgp->power_orig = power; 5685 sdg->sgc->capacity_orig = capacity;
5608 5686
5609 if (sched_feat(ARCH_POWER)) 5687 if (sched_feat(ARCH_CAPACITY))
5610 power *= arch_scale_freq_power(sd, cpu); 5688 capacity *= arch_scale_freq_capacity(sd, cpu);
5611 else 5689 else
5612 power *= default_scale_freq_power(sd, cpu); 5690 capacity *= default_scale_capacity(sd, cpu);
5613 5691
5614 power >>= SCHED_POWER_SHIFT; 5692 capacity >>= SCHED_CAPACITY_SHIFT;
5615 5693
5616 power *= scale_rt_power(cpu); 5694 capacity *= scale_rt_capacity(cpu);
5617 power >>= SCHED_POWER_SHIFT; 5695 capacity >>= SCHED_CAPACITY_SHIFT;
5618 5696
5619 if (!power) 5697 if (!capacity)
5620 power = 1; 5698 capacity = 1;
5621 5699
5622 cpu_rq(cpu)->cpu_power = power; 5700 cpu_rq(cpu)->cpu_capacity = capacity;
5623 sdg->sgp->power = power; 5701 sdg->sgc->capacity = capacity;
5624} 5702}
5625 5703
5626void update_group_power(struct sched_domain *sd, int cpu) 5704void update_group_capacity(struct sched_domain *sd, int cpu)
5627{ 5705{
5628 struct sched_domain *child = sd->child; 5706 struct sched_domain *child = sd->child;
5629 struct sched_group *group, *sdg = sd->groups; 5707 struct sched_group *group, *sdg = sd->groups;
5630 unsigned long power, power_orig; 5708 unsigned long capacity, capacity_orig;
5631 unsigned long interval; 5709 unsigned long interval;
5632 5710
5633 interval = msecs_to_jiffies(sd->balance_interval); 5711 interval = msecs_to_jiffies(sd->balance_interval);
5634 interval = clamp(interval, 1UL, max_load_balance_interval); 5712 interval = clamp(interval, 1UL, max_load_balance_interval);
5635 sdg->sgp->next_update = jiffies + interval; 5713 sdg->sgc->next_update = jiffies + interval;
5636 5714
5637 if (!child) { 5715 if (!child) {
5638 update_cpu_power(sd, cpu); 5716 update_cpu_capacity(sd, cpu);
5639 return; 5717 return;
5640 } 5718 }
5641 5719
5642 power_orig = power = 0; 5720 capacity_orig = capacity = 0;
5643 5721
5644 if (child->flags & SD_OVERLAP) { 5722 if (child->flags & SD_OVERLAP) {
5645 /* 5723 /*
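The scale_rt_capacity() hunk above reads age_stamp and the rq clock without serialization, so their difference can come out negative; the new s64 delta is clamped to zero before it feeds the available/total ratio. A freestanding sketch of that computation with invented numbers, where 1024 plays the role of SCHED_CAPACITY_SCALE:

#include <stdio.h>
#include <stdint.h>

static unsigned long scale_rt_capacity(uint64_t clock, uint64_t age_stamp,
				       uint64_t rt_avg, uint64_t avg_period)
{
	int64_t delta = (int64_t)(clock - age_stamp);
	uint64_t total, available;

	if (delta < 0)			/* racy read: clock appears to lag */
		delta = 0;

	total = avg_period + (uint64_t)delta;
	available = total > rt_avg ? total - rt_avg : 0;

	if (total < 1024)		/* SCHED_CAPACITY_SCALE stand-in */
		total = 1024;
	total >>= 10;			/* SCHED_CAPACITY_SHIFT stand-in */

	return (unsigned long)(available / total);
}

int main(void)
{
	/* clock sampled just before age_stamp was pushed past it */
	printf("%lu\n", scale_rt_capacity(1000, 2000, 500000, 1000000));
	return 0;
}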
@@ -5648,31 +5726,31 @@ void update_group_power(struct sched_domain *sd, int cpu)
5648 */ 5726 */
5649 5727
5650 for_each_cpu(cpu, sched_group_cpus(sdg)) { 5728 for_each_cpu(cpu, sched_group_cpus(sdg)) {
5651 struct sched_group_power *sgp; 5729 struct sched_group_capacity *sgc;
5652 struct rq *rq = cpu_rq(cpu); 5730 struct rq *rq = cpu_rq(cpu);
5653 5731
5654 /* 5732 /*
5655 * build_sched_domains() -> init_sched_groups_power() 5733 * build_sched_domains() -> init_sched_groups_capacity()
5656 * gets here before we've attached the domains to the 5734 * gets here before we've attached the domains to the
5657 * runqueues. 5735 * runqueues.
5658 * 5736 *
5659 * Use power_of(), which is set irrespective of domains 5737 * Use capacity_of(), which is set irrespective of domains
5660 * in update_cpu_power(). 5738 * in update_cpu_capacity().
5661 * 5739 *
5662 * This avoids power/power_orig from being 0 and 5740 * This avoids capacity/capacity_orig from being 0 and
5663 * causing divide-by-zero issues on boot. 5741 * causing divide-by-zero issues on boot.
5664 * 5742 *
5665 * Runtime updates will correct power_orig. 5743 * Runtime updates will correct capacity_orig.
5666 */ 5744 */
5667 if (unlikely(!rq->sd)) { 5745 if (unlikely(!rq->sd)) {
5668 power_orig += power_of(cpu); 5746 capacity_orig += capacity_of(cpu);
5669 power += power_of(cpu); 5747 capacity += capacity_of(cpu);
5670 continue; 5748 continue;
5671 } 5749 }
5672 5750
5673 sgp = rq->sd->groups->sgp; 5751 sgc = rq->sd->groups->sgc;
5674 power_orig += sgp->power_orig; 5752 capacity_orig += sgc->capacity_orig;
5675 power += sgp->power; 5753 capacity += sgc->capacity;
5676 } 5754 }
5677 } else { 5755 } else {
5678 /* 5756 /*
@@ -5682,14 +5760,14 @@ void update_group_power(struct sched_domain *sd, int cpu)
5682 5760
5683 group = child->groups; 5761 group = child->groups;
5684 do { 5762 do {
5685 power_orig += group->sgp->power_orig; 5763 capacity_orig += group->sgc->capacity_orig;
5686 power += group->sgp->power; 5764 capacity += group->sgc->capacity;
5687 group = group->next; 5765 group = group->next;
5688 } while (group != child->groups); 5766 } while (group != child->groups);
5689 } 5767 }
5690 5768
5691 sdg->sgp->power_orig = power_orig; 5769 sdg->sgc->capacity_orig = capacity_orig;
5692 sdg->sgp->power = power; 5770 sdg->sgc->capacity = capacity;
5693} 5771}
5694 5772
5695/* 5773/*
@@ -5703,15 +5781,15 @@ static inline int
5703fix_small_capacity(struct sched_domain *sd, struct sched_group *group) 5781fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
5704{ 5782{
5705 /* 5783 /*
5706 * Only siblings can have significantly less than SCHED_POWER_SCALE 5784 * Only siblings can have significantly less than SCHED_CAPACITY_SCALE
5707 */ 5785 */
5708 if (!(sd->flags & SD_SHARE_CPUPOWER)) 5786 if (!(sd->flags & SD_SHARE_CPUCAPACITY))
5709 return 0; 5787 return 0;
5710 5788
5711 /* 5789 /*
5712 * If ~90% of the cpu_power is still there, we're good. 5790 * If ~90% of the cpu_capacity is still there, we're good.
5713 */ 5791 */
5714 if (group->sgp->power * 32 > group->sgp->power_orig * 29) 5792 if (group->sgc->capacity * 32 > group->sgc->capacity_orig * 29)
5715 return 1; 5793 return 1;
5716 5794
5717 return 0; 5795 return 0;
@@ -5748,34 +5826,35 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
5748 5826
5749static inline int sg_imbalanced(struct sched_group *group) 5827static inline int sg_imbalanced(struct sched_group *group)
5750{ 5828{
5751 return group->sgp->imbalance; 5829 return group->sgc->imbalance;
5752} 5830}
5753 5831
5754/* 5832/*
5755 * Compute the group capacity. 5833 * Compute the group capacity factor.
5756 * 5834 *
5757 * Avoid the issue where N*frac(smt_power) >= 1 creates 'phantom' cores by 5835 * Avoid the issue where N*frac(smt_capacity) >= 1 creates 'phantom' cores by
5758 * first dividing out the smt factor and computing the actual number of cores 5836 * first dividing out the smt factor and computing the actual number of cores
5759 * and limit power unit capacity with that. 5837 * and limit unit capacity with that.
5760 */ 5838 */
5761static inline int sg_capacity(struct lb_env *env, struct sched_group *group) 5839static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *group)
5762{ 5840{
5763 unsigned int capacity, smt, cpus; 5841 unsigned int capacity_factor, smt, cpus;
5764 unsigned int power, power_orig; 5842 unsigned int capacity, capacity_orig;
5765 5843
5766 power = group->sgp->power; 5844 capacity = group->sgc->capacity;
5767 power_orig = group->sgp->power_orig; 5845 capacity_orig = group->sgc->capacity_orig;
5768 cpus = group->group_weight; 5846 cpus = group->group_weight;
5769 5847
5770 /* smt := ceil(cpus / power), assumes: 1 < smt_power < 2 */ 5848 /* smt := ceil(cpus / capacity), assumes: 1 < smt_capacity < 2 */
5771 smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig); 5849 smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, capacity_orig);
5772 capacity = cpus / smt; /* cores */ 5850 capacity_factor = cpus / smt; /* cores */
5773 5851
5774 capacity = min_t(unsigned, capacity, DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE)); 5852 capacity_factor = min_t(unsigned,
5775 if (!capacity) 5853 capacity_factor, DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE));
5776 capacity = fix_small_capacity(env->sd, group); 5854 if (!capacity_factor)
5855 capacity_factor = fix_small_capacity(env->sd, group);
5777 5856
5778 return capacity; 5857 return capacity_factor;
5779} 5858}
5780 5859
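Editor's note: a hedged, self-contained model of the sg_capacity_factor() arithmetic above. SCHED_CAPACITY_SCALE is assumed to be 1024, the DIV_ROUND_* helpers are re-derived locally rather than taken from kernel headers, and the sample inputs are invented.

        /* Hedged sketch: divide out the SMT factor so N * frac(smt_capacity) >= 1
         * cannot manufacture a phantom core. */
        #include <stdio.h>

        #define SCHED_CAPACITY_SCALE 1024
        #define DIV_ROUND_UP(n, d)       (((n) + (d) - 1) / (d))
        #define DIV_ROUND_CLOSEST(n, d)  (((n) + (d) / 2) / (d))

        static unsigned int capacity_factor(unsigned int cpus,
                                            unsigned int capacity,
                                            unsigned int capacity_orig)
        {
                /* smt := ceil(cpus / capacity), assuming 1 < smt_capacity < 2 */
                unsigned int smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, capacity_orig);
                unsigned int factor = cpus / smt;       /* whole cores */

                if (factor > DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE))
                        factor = DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE);
                return factor;  /* 0 is handed to fix_small_capacity() in the kernel */
        }

        int main(void)
        {
                /* 2 SMT siblings, capacity_orig ~ 1178 (1.15 * 1024): one real core */
                printf("factor = %u\n", capacity_factor(2, 1178, 1178));
                /* 4 independent CPUs at full capacity: four cores */
                printf("factor = %u\n", capacity_factor(4, 4096, 4096));
                return 0;
        }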
5781/** 5860/**
@@ -5815,9 +5894,9 @@ static inline void update_sg_lb_stats(struct lb_env *env,
5815 sgs->idle_cpus++; 5894 sgs->idle_cpus++;
5816 } 5895 }
5817 5896
5818 /* Adjust by relative CPU power of the group */ 5897 /* Adjust by relative CPU capacity of the group */
5819 sgs->group_power = group->sgp->power; 5898 sgs->group_capacity = group->sgc->capacity;
5820 sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power; 5899 sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
5821 5900
5822 if (sgs->sum_nr_running) 5901 if (sgs->sum_nr_running)
5823 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; 5902 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
@@ -5825,10 +5904,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
5825 sgs->group_weight = group->group_weight; 5904 sgs->group_weight = group->group_weight;
5826 5905
5827 sgs->group_imb = sg_imbalanced(group); 5906 sgs->group_imb = sg_imbalanced(group);
5828 sgs->group_capacity = sg_capacity(env, group); 5907 sgs->group_capacity_factor = sg_capacity_factor(env, group);
5829 5908
5830 if (sgs->group_capacity > sgs->sum_nr_running) 5909 if (sgs->group_capacity_factor > sgs->sum_nr_running)
5831 sgs->group_has_capacity = 1; 5910 sgs->group_has_free_capacity = 1;
5832} 5911}
5833 5912
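Editor's note: a worked example of the avg_load scaling in update_sg_lb_stats() above, under the assumption SCHED_CAPACITY_SCALE == 1024; the load and capacity figures are made up purely to show the normalization.

        /* Hedged example: avg_load is group load normalized by group capacity,
         * so a half-capacity group carrying the same raw load reports twice
         * the avg_load. */
        #include <stdio.h>

        #define SCHED_CAPACITY_SCALE 1024UL

        int main(void)
        {
                unsigned long group_load = 2048;        /* summed weighted runqueue load */

                unsigned long full = (group_load * SCHED_CAPACITY_SCALE) / 2048; /* capacity 2048 */
                unsigned long half = (group_load * SCHED_CAPACITY_SCALE) / 1024; /* capacity 1024 */

                printf("avg_load, full-capacity group: %lu\n", full);   /* 1024 */
                printf("avg_load, half-capacity group: %lu\n", half);   /* 2048 */
                return 0;
        }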
5834/** 5913/**
@@ -5852,7 +5931,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
5852 if (sgs->avg_load <= sds->busiest_stat.avg_load) 5931 if (sgs->avg_load <= sds->busiest_stat.avg_load)
5853 return false; 5932 return false;
5854 5933
5855 if (sgs->sum_nr_running > sgs->group_capacity) 5934 if (sgs->sum_nr_running > sgs->group_capacity_factor)
5856 return true; 5935 return true;
5857 5936
5858 if (sgs->group_imb) 5937 if (sgs->group_imb)
@@ -5932,8 +6011,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
5932 sgs = &sds->local_stat; 6011 sgs = &sds->local_stat;
5933 6012
5934 if (env->idle != CPU_NEWLY_IDLE || 6013 if (env->idle != CPU_NEWLY_IDLE ||
5935 time_after_eq(jiffies, sg->sgp->next_update)) 6014 time_after_eq(jiffies, sg->sgc->next_update))
5936 update_group_power(env->sd, env->dst_cpu); 6015 update_group_capacity(env->sd, env->dst_cpu);
5937 } 6016 }
5938 6017
5939 update_sg_lb_stats(env, sg, load_idx, local_group, sgs); 6018 update_sg_lb_stats(env, sg, load_idx, local_group, sgs);
@@ -5943,17 +6022,17 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
5943 6022
5944 /* 6023 /*
5945 * In case the child domain prefers tasks go to siblings 6024 * In case the child domain prefers tasks go to siblings
5946 * first, lower the sg capacity to one so that we'll try 6025 * first, lower the sg capacity factor to one so that we'll try
5947 * and move all the excess tasks away. We lower the capacity 6026 * and move all the excess tasks away. We lower the capacity
5948 * of a group only if the local group has the capacity to fit 6027 * of a group only if the local group has the capacity to fit
5949 * these excess tasks, i.e. nr_running < group_capacity. The 6028 * these excess tasks, i.e. nr_running < group_capacity_factor. The
5950 * extra check prevents the case where you always pull from the 6029 * extra check prevents the case where you always pull from the
5951 * heaviest group when it is already under-utilized (possible 6030 * heaviest group when it is already under-utilized (possible
 5952 * when a large weight task outweighs the tasks on the system). 6031 * when a large weight task outweighs the tasks on the system).
5953 */ 6032 */
5954 if (prefer_sibling && sds->local && 6033 if (prefer_sibling && sds->local &&
5955 sds->local_stat.group_has_capacity) 6034 sds->local_stat.group_has_free_capacity)
5956 sgs->group_capacity = min(sgs->group_capacity, 1U); 6035 sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U);
5957 6036
5958 if (update_sd_pick_busiest(env, sds, sg, sgs)) { 6037 if (update_sd_pick_busiest(env, sds, sg, sgs)) {
5959 sds->busiest = sg; 6038 sds->busiest = sg;
@@ -5963,7 +6042,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
5963next_group: 6042next_group:
5964 /* Now, start updating sd_lb_stats */ 6043 /* Now, start updating sd_lb_stats */
5965 sds->total_load += sgs->group_load; 6044 sds->total_load += sgs->group_load;
5966 sds->total_pwr += sgs->group_power; 6045 sds->total_capacity += sgs->group_capacity;
5967 6046
5968 sg = sg->next; 6047 sg = sg->next;
5969 } while (sg != env->sd->groups); 6048 } while (sg != env->sd->groups);
@@ -6010,8 +6089,8 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
6010 return 0; 6089 return 0;
6011 6090
6012 env->imbalance = DIV_ROUND_CLOSEST( 6091 env->imbalance = DIV_ROUND_CLOSEST(
6013 sds->busiest_stat.avg_load * sds->busiest_stat.group_power, 6092 sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity,
6014 SCHED_POWER_SCALE); 6093 SCHED_CAPACITY_SCALE);
6015 6094
6016 return 1; 6095 return 1;
6017} 6096}
@@ -6026,7 +6105,7 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
6026static inline 6105static inline
6027void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) 6106void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
6028{ 6107{
6029 unsigned long tmp, pwr_now = 0, pwr_move = 0; 6108 unsigned long tmp, capa_now = 0, capa_move = 0;
6030 unsigned int imbn = 2; 6109 unsigned int imbn = 2;
6031 unsigned long scaled_busy_load_per_task; 6110 unsigned long scaled_busy_load_per_task;
6032 struct sg_lb_stats *local, *busiest; 6111 struct sg_lb_stats *local, *busiest;
@@ -6040,8 +6119,8 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
6040 imbn = 1; 6119 imbn = 1;
6041 6120
6042 scaled_busy_load_per_task = 6121 scaled_busy_load_per_task =
6043 (busiest->load_per_task * SCHED_POWER_SCALE) / 6122 (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
6044 busiest->group_power; 6123 busiest->group_capacity;
6045 6124
6046 if (busiest->avg_load + scaled_busy_load_per_task >= 6125 if (busiest->avg_load + scaled_busy_load_per_task >=
6047 local->avg_load + (scaled_busy_load_per_task * imbn)) { 6126 local->avg_load + (scaled_busy_load_per_task * imbn)) {
@@ -6051,38 +6130,38 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
6051 6130
6052 /* 6131 /*
6053 * OK, we don't have enough imbalance to justify moving tasks, 6132 * OK, we don't have enough imbalance to justify moving tasks,
6054 * however we may be able to increase total CPU power used by 6133 * however we may be able to increase total CPU capacity used by
6055 * moving them. 6134 * moving them.
6056 */ 6135 */
6057 6136
6058 pwr_now += busiest->group_power * 6137 capa_now += busiest->group_capacity *
6059 min(busiest->load_per_task, busiest->avg_load); 6138 min(busiest->load_per_task, busiest->avg_load);
6060 pwr_now += local->group_power * 6139 capa_now += local->group_capacity *
6061 min(local->load_per_task, local->avg_load); 6140 min(local->load_per_task, local->avg_load);
6062 pwr_now /= SCHED_POWER_SCALE; 6141 capa_now /= SCHED_CAPACITY_SCALE;
6063 6142
6064 /* Amount of load we'd subtract */ 6143 /* Amount of load we'd subtract */
6065 if (busiest->avg_load > scaled_busy_load_per_task) { 6144 if (busiest->avg_load > scaled_busy_load_per_task) {
6066 pwr_move += busiest->group_power * 6145 capa_move += busiest->group_capacity *
6067 min(busiest->load_per_task, 6146 min(busiest->load_per_task,
6068 busiest->avg_load - scaled_busy_load_per_task); 6147 busiest->avg_load - scaled_busy_load_per_task);
6069 } 6148 }
6070 6149
6071 /* Amount of load we'd add */ 6150 /* Amount of load we'd add */
6072 if (busiest->avg_load * busiest->group_power < 6151 if (busiest->avg_load * busiest->group_capacity <
6073 busiest->load_per_task * SCHED_POWER_SCALE) { 6152 busiest->load_per_task * SCHED_CAPACITY_SCALE) {
6074 tmp = (busiest->avg_load * busiest->group_power) / 6153 tmp = (busiest->avg_load * busiest->group_capacity) /
6075 local->group_power; 6154 local->group_capacity;
6076 } else { 6155 } else {
6077 tmp = (busiest->load_per_task * SCHED_POWER_SCALE) / 6156 tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
6078 local->group_power; 6157 local->group_capacity;
6079 } 6158 }
6080 pwr_move += local->group_power * 6159 capa_move += local->group_capacity *
6081 min(local->load_per_task, local->avg_load + tmp); 6160 min(local->load_per_task, local->avg_load + tmp);
6082 pwr_move /= SCHED_POWER_SCALE; 6161 capa_move /= SCHED_CAPACITY_SCALE;
6083 6162
6084 /* Move if we gain throughput */ 6163 /* Move if we gain throughput */
6085 if (pwr_move > pwr_now) 6164 if (capa_move > capa_now)
6086 env->imbalance = busiest->load_per_task; 6165 env->imbalance = busiest->load_per_task;
6087} 6166}
6088 6167
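Editor's note: the throughput test in fix_small_imbalance() can be modelled in plain C. The sketch below uses invented inputs and is not the kernel routine; it skips the early "imbalance already big enough" exit and only shows the capa_now versus capa_move comparison.

        /* Hedged model: estimate capacity actually used now vs. after moving
         * one task's worth of load; only move if throughput improves. */
        #include <stdio.h>

        #define SCHED_CAPACITY_SCALE 1024UL

        static unsigned long min_ul(unsigned long a, unsigned long b)
        {
                return a < b ? a : b;
        }

        int main(void)
        {
                /* per-group stats, load units scaled by SCHED_CAPACITY_SCALE */
                unsigned long busiest_cap = 1024, busiest_avg = 1536, busiest_lpt = 512;
                unsigned long local_cap   = 1024, local_avg   = 256,  local_lpt   = 512;

                unsigned long scaled_lpt = busiest_lpt * SCHED_CAPACITY_SCALE / busiest_cap;
                unsigned long capa_now, capa_move = 0, tmp;

                capa_now  = busiest_cap * min_ul(busiest_lpt, busiest_avg);
                capa_now += local_cap   * min_ul(local_lpt, local_avg);
                capa_now /= SCHED_CAPACITY_SCALE;

                /* load we'd subtract from the busiest group */
                if (busiest_avg > scaled_lpt)
                        capa_move += busiest_cap *
                                     min_ul(busiest_lpt, busiest_avg - scaled_lpt);

                /* load we'd add to the local group */
                if (busiest_avg * busiest_cap < busiest_lpt * SCHED_CAPACITY_SCALE)
                        tmp = busiest_avg * busiest_cap / local_cap;
                else
                        tmp = busiest_lpt * SCHED_CAPACITY_SCALE / local_cap;
                capa_move += local_cap * min_ul(local_lpt, local_avg + tmp);
                capa_move /= SCHED_CAPACITY_SCALE;

                printf("capa_now=%lu capa_move=%lu -> %s\n", capa_now, capa_move,
                       capa_move > capa_now ? "move one task" : "leave it");
                return 0;
        }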
@@ -6112,7 +6191,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
6112 /* 6191 /*
6113 * In the presence of smp nice balancing, certain scenarios can have 6192 * In the presence of smp nice balancing, certain scenarios can have
6114 * max load less than avg load(as we skip the groups at or below 6193 * max load less than avg load(as we skip the groups at or below
6115 * its cpu_power, while calculating max_load..) 6194 * its cpu_capacity, while calculating max_load..)
6116 */ 6195 */
6117 if (busiest->avg_load <= sds->avg_load || 6196 if (busiest->avg_load <= sds->avg_load ||
6118 local->avg_load >= sds->avg_load) { 6197 local->avg_load >= sds->avg_load) {
@@ -6127,10 +6206,10 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
6127 * have to drop below capacity to reach cpu-load equilibrium. 6206 * have to drop below capacity to reach cpu-load equilibrium.
6128 */ 6207 */
6129 load_above_capacity = 6208 load_above_capacity =
6130 (busiest->sum_nr_running - busiest->group_capacity); 6209 (busiest->sum_nr_running - busiest->group_capacity_factor);
6131 6210
6132 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE); 6211 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_CAPACITY_SCALE);
6133 load_above_capacity /= busiest->group_power; 6212 load_above_capacity /= busiest->group_capacity;
6134 } 6213 }
6135 6214
6136 /* 6215 /*
@@ -6145,9 +6224,9 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
6145 6224
6146 /* How much load to actually move to equalise the imbalance */ 6225 /* How much load to actually move to equalise the imbalance */
6147 env->imbalance = min( 6226 env->imbalance = min(
6148 max_pull * busiest->group_power, 6227 max_pull * busiest->group_capacity,
6149 (sds->avg_load - local->avg_load) * local->group_power 6228 (sds->avg_load - local->avg_load) * local->group_capacity
6150 ) / SCHED_POWER_SCALE; 6229 ) / SCHED_CAPACITY_SCALE;
6151 6230
6152 /* 6231 /*
6153 * if *imbalance is less than the average load per runnable task 6232 * if *imbalance is less than the average load per runnable task
@@ -6201,7 +6280,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
6201 if (!sds.busiest || busiest->sum_nr_running == 0) 6280 if (!sds.busiest || busiest->sum_nr_running == 0)
6202 goto out_balanced; 6281 goto out_balanced;
6203 6282
6204 sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr; 6283 sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
6284 / sds.total_capacity;
6205 6285
6206 /* 6286 /*
6207 * If the busiest group is imbalanced the below checks don't 6287 * If the busiest group is imbalanced the below checks don't
@@ -6212,8 +6292,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
6212 goto force_balance; 6292 goto force_balance;
6213 6293
6214 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ 6294 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
6215 if (env->idle == CPU_NEWLY_IDLE && local->group_has_capacity && 6295 if (env->idle == CPU_NEWLY_IDLE && local->group_has_free_capacity &&
6216 !busiest->group_has_capacity) 6296 !busiest->group_has_free_capacity)
6217 goto force_balance; 6297 goto force_balance;
6218 6298
6219 /* 6299 /*
@@ -6267,11 +6347,11 @@ static struct rq *find_busiest_queue(struct lb_env *env,
6267 struct sched_group *group) 6347 struct sched_group *group)
6268{ 6348{
6269 struct rq *busiest = NULL, *rq; 6349 struct rq *busiest = NULL, *rq;
6270 unsigned long busiest_load = 0, busiest_power = 1; 6350 unsigned long busiest_load = 0, busiest_capacity = 1;
6271 int i; 6351 int i;
6272 6352
6273 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { 6353 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
6274 unsigned long power, capacity, wl; 6354 unsigned long capacity, capacity_factor, wl;
6275 enum fbq_type rt; 6355 enum fbq_type rt;
6276 6356
6277 rq = cpu_rq(i); 6357 rq = cpu_rq(i);
@@ -6299,34 +6379,34 @@ static struct rq *find_busiest_queue(struct lb_env *env,
6299 if (rt > env->fbq_type) 6379 if (rt > env->fbq_type)
6300 continue; 6380 continue;
6301 6381
6302 power = power_of(i); 6382 capacity = capacity_of(i);
6303 capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); 6383 capacity_factor = DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE);
6304 if (!capacity) 6384 if (!capacity_factor)
6305 capacity = fix_small_capacity(env->sd, group); 6385 capacity_factor = fix_small_capacity(env->sd, group);
6306 6386
6307 wl = weighted_cpuload(i); 6387 wl = weighted_cpuload(i);
6308 6388
6309 /* 6389 /*
6310 * When comparing with imbalance, use weighted_cpuload() 6390 * When comparing with imbalance, use weighted_cpuload()
6311 * which is not scaled with the cpu power. 6391 * which is not scaled with the cpu capacity.
6312 */ 6392 */
6313 if (capacity && rq->nr_running == 1 && wl > env->imbalance) 6393 if (capacity_factor && rq->nr_running == 1 && wl > env->imbalance)
6314 continue; 6394 continue;
6315 6395
6316 /* 6396 /*
6317 * For the load comparisons with the other cpu's, consider 6397 * For the load comparisons with the other cpu's, consider
6318 * the weighted_cpuload() scaled with the cpu power, so that 6398 * the weighted_cpuload() scaled with the cpu capacity, so
6319 * the load can be moved away from the cpu that is potentially 6399 * that the load can be moved away from the cpu that is
6320 * running at a lower capacity. 6400 * potentially running at a lower capacity.
6321 * 6401 *
6322 * Thus we're looking for max(wl_i / power_i), crosswise 6402 * Thus we're looking for max(wl_i / capacity_i), crosswise
6323 * multiplication to rid ourselves of the division works out 6403 * multiplication to rid ourselves of the division works out
6324 * to: wl_i * power_j > wl_j * power_i; where j is our 6404 * to: wl_i * capacity_j > wl_j * capacity_i; where j is
6325 * previous maximum. 6405 * our previous maximum.
6326 */ 6406 */
6327 if (wl * busiest_power > busiest_load * power) { 6407 if (wl * busiest_capacity > busiest_load * capacity) {
6328 busiest_load = wl; 6408 busiest_load = wl;
6329 busiest_power = power; 6409 busiest_capacity = capacity;
6330 busiest = rq; 6410 busiest = rq;
6331 } 6411 }
6332 } 6412 }
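Editor's note: a small illustration of the crosswise multiplication described in the comment above, with invented wl/capacity pairs. It shows that the comparison picks the CPU with the highest wl/capacity ratio without performing any division.

        /* Hedged sketch: compare wl_i/capacity_i by cross-multiplying, avoiding
         * integer division and its rounding. */
        #include <stdio.h>

        struct cand {
                unsigned long wl;       /* weighted_cpuload() */
                unsigned long capacity; /* capacity_of() */
        };

        int main(void)
        {
                struct cand cpus[] = { { 900, 1024 }, { 700, 512 }, { 800, 1024 } };
                unsigned long busiest_load = 0, busiest_capacity = 1;
                int i, busiest = -1;

                for (i = 0; i < 3; i++) {
                        /* wl_i * capacity_j > wl_j * capacity_i, j = current maximum */
                        if (cpus[i].wl * busiest_capacity >
                            busiest_load * cpus[i].capacity) {
                                busiest_load = cpus[i].wl;
                                busiest_capacity = cpus[i].capacity;
                                busiest = i;
                        }
                }
                printf("busiest cpu index: %d\n", busiest);     /* picks cpu 1: 700/512 */
                return 0;
        }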
@@ -6534,7 +6614,7 @@ more_balance:
6534 * We failed to reach balance because of affinity. 6614 * We failed to reach balance because of affinity.
6535 */ 6615 */
6536 if (sd_parent) { 6616 if (sd_parent) {
6537 int *group_imbalance = &sd_parent->groups->sgp->imbalance; 6617 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
6538 6618
6539 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { 6619 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
6540 *group_imbalance = 1; 6620 *group_imbalance = 1;
@@ -6640,27 +6720,62 @@ out:
6640 return ld_moved; 6720 return ld_moved;
6641} 6721}
6642 6722
6723static inline unsigned long
6724get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
6725{
6726 unsigned long interval = sd->balance_interval;
6727
6728 if (cpu_busy)
6729 interval *= sd->busy_factor;
6730
6731 /* scale ms to jiffies */
6732 interval = msecs_to_jiffies(interval);
6733 interval = clamp(interval, 1UL, max_load_balance_interval);
6734
6735 return interval;
6736}
6737
6738static inline void
6739update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance)
6740{
6741 unsigned long interval, next;
6742
6743 interval = get_sd_balance_interval(sd, cpu_busy);
6744 next = sd->last_balance + interval;
6745
6746 if (time_after(*next_balance, next))
6747 *next_balance = next;
6748}
6749
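Editor's note: a userspace model of the new get_sd_balance_interval() helper above. HZ, busy_factor and max_load_balance_interval are assumptions chosen only to show the scaling and clamping, and msecs_to_jiffies() is approximated with a truncating division.

        /* Hedged sketch: the balance interval is in milliseconds, optionally
         * stretched by busy_factor, then converted to jiffies and clamped. */
        #include <stdio.h>

        #define HZ 250UL
        #define msecs_to_jiffies(ms)    ((ms) * HZ / 1000)

        static const unsigned long max_load_balance_interval = HZ / 10;        /* 0.1s, assumed */

        static unsigned long clamp_ul(unsigned long v, unsigned long lo, unsigned long hi)
        {
                return v < lo ? lo : (v > hi ? hi : v);
        }

        static unsigned long get_interval(unsigned long balance_interval_ms,
                                          unsigned int busy_factor, int cpu_busy)
        {
                unsigned long interval = balance_interval_ms;

                if (cpu_busy)
                        interval *= busy_factor;

                interval = msecs_to_jiffies(interval);
                return clamp_ul(interval, 1UL, max_load_balance_interval);
        }

        int main(void)
        {
                /* 8ms base interval, busy_factor 32: busy CPUs hit the clamp */
                printf("idle: %lu jiffies\n", get_interval(8, 32, 0));  /* 2 */
                printf("busy: %lu jiffies\n", get_interval(8, 32, 1));  /* clamped to 25 */
                return 0;
        }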
6643/* 6750/*
6644 * idle_balance is called by schedule() if this_cpu is about to become 6751 * idle_balance is called by schedule() if this_cpu is about to become
6645 * idle. Attempts to pull tasks from other CPUs. 6752 * idle. Attempts to pull tasks from other CPUs.
6646 */ 6753 */
6647static int idle_balance(struct rq *this_rq) 6754static int idle_balance(struct rq *this_rq)
6648{ 6755{
6756 unsigned long next_balance = jiffies + HZ;
6757 int this_cpu = this_rq->cpu;
6649 struct sched_domain *sd; 6758 struct sched_domain *sd;
6650 int pulled_task = 0; 6759 int pulled_task = 0;
6651 unsigned long next_balance = jiffies + HZ;
6652 u64 curr_cost = 0; 6760 u64 curr_cost = 0;
6653 int this_cpu = this_rq->cpu;
6654 6761
6655 idle_enter_fair(this_rq); 6762 idle_enter_fair(this_rq);
6763
6656 /* 6764 /*
6657 * We must set idle_stamp _before_ calling idle_balance(), such that we 6765 * We must set idle_stamp _before_ calling idle_balance(), such that we
6658 * measure the duration of idle_balance() as idle time. 6766 * measure the duration of idle_balance() as idle time.
6659 */ 6767 */
6660 this_rq->idle_stamp = rq_clock(this_rq); 6768 this_rq->idle_stamp = rq_clock(this_rq);
6661 6769
6662 if (this_rq->avg_idle < sysctl_sched_migration_cost) 6770 if (this_rq->avg_idle < sysctl_sched_migration_cost) {
6771 rcu_read_lock();
6772 sd = rcu_dereference_check_sched_domain(this_rq->sd);
6773 if (sd)
6774 update_next_balance(sd, 0, &next_balance);
6775 rcu_read_unlock();
6776
6663 goto out; 6777 goto out;
6778 }
6664 6779
6665 /* 6780 /*
6666 * Drop the rq->lock, but keep IRQ/preempt disabled. 6781 * Drop the rq->lock, but keep IRQ/preempt disabled.
@@ -6670,20 +6785,20 @@ static int idle_balance(struct rq *this_rq)
6670 update_blocked_averages(this_cpu); 6785 update_blocked_averages(this_cpu);
6671 rcu_read_lock(); 6786 rcu_read_lock();
6672 for_each_domain(this_cpu, sd) { 6787 for_each_domain(this_cpu, sd) {
6673 unsigned long interval;
6674 int continue_balancing = 1; 6788 int continue_balancing = 1;
6675 u64 t0, domain_cost; 6789 u64 t0, domain_cost;
6676 6790
6677 if (!(sd->flags & SD_LOAD_BALANCE)) 6791 if (!(sd->flags & SD_LOAD_BALANCE))
6678 continue; 6792 continue;
6679 6793
6680 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) 6794 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
6795 update_next_balance(sd, 0, &next_balance);
6681 break; 6796 break;
6797 }
6682 6798
6683 if (sd->flags & SD_BALANCE_NEWIDLE) { 6799 if (sd->flags & SD_BALANCE_NEWIDLE) {
6684 t0 = sched_clock_cpu(this_cpu); 6800 t0 = sched_clock_cpu(this_cpu);
6685 6801
6686 /* If we've pulled tasks over stop searching: */
6687 pulled_task = load_balance(this_cpu, this_rq, 6802 pulled_task = load_balance(this_cpu, this_rq,
6688 sd, CPU_NEWLY_IDLE, 6803 sd, CPU_NEWLY_IDLE,
6689 &continue_balancing); 6804 &continue_balancing);
@@ -6695,42 +6810,37 @@ static int idle_balance(struct rq *this_rq)
6695 curr_cost += domain_cost; 6810 curr_cost += domain_cost;
6696 } 6811 }
6697 6812
6698 interval = msecs_to_jiffies(sd->balance_interval); 6813 update_next_balance(sd, 0, &next_balance);
6699 if (time_after(next_balance, sd->last_balance + interval)) 6814
6700 next_balance = sd->last_balance + interval; 6815 /*
6701 if (pulled_task) 6816 * Stop searching for tasks to pull if there are
6817 * now runnable tasks on this rq.
6818 */
6819 if (pulled_task || this_rq->nr_running > 0)
6702 break; 6820 break;
6703 } 6821 }
6704 rcu_read_unlock(); 6822 rcu_read_unlock();
6705 6823
6706 raw_spin_lock(&this_rq->lock); 6824 raw_spin_lock(&this_rq->lock);
6707 6825
6826 if (curr_cost > this_rq->max_idle_balance_cost)
6827 this_rq->max_idle_balance_cost = curr_cost;
6828
6708 /* 6829 /*
 6709 * While browsing the domains, we released the rq lock. 6830 * While browsing the domains, we released the rq lock; a task could
 6710 * A task could have been enqueued in the meantime 6831 * have been enqueued in the meantime. Since we're not going idle,
6832 * pretend we pulled a task.
6711 */ 6833 */
6712 if (this_rq->cfs.h_nr_running && !pulled_task) { 6834 if (this_rq->cfs.h_nr_running && !pulled_task)
6713 pulled_task = 1; 6835 pulled_task = 1;
6714 goto out;
6715 }
6716 6836
6717 if (pulled_task || time_after(jiffies, this_rq->next_balance)) { 6837out:
6718 /* 6838 /* Move the next balance forward */
6719 * We are going idle. next_balance may be set based on 6839 if (time_after(this_rq->next_balance, next_balance))
6720 * a busy processor. So reset next_balance.
6721 */
6722 this_rq->next_balance = next_balance; 6840 this_rq->next_balance = next_balance;
6723 }
6724 6841
6725 if (curr_cost > this_rq->max_idle_balance_cost)
6726 this_rq->max_idle_balance_cost = curr_cost;
6727
6728out:
6729 /* Is there a task of a high priority class? */ 6842 /* Is there a task of a high priority class? */
6730 if (this_rq->nr_running != this_rq->cfs.h_nr_running && 6843 if (this_rq->nr_running != this_rq->cfs.h_nr_running)
6731 ((this_rq->stop && this_rq->stop->on_rq) ||
6732 this_rq->dl.dl_nr_running ||
6733 (this_rq->rt.rt_nr_running && !rt_rq_throttled(&this_rq->rt))))
6734 pulled_task = -1; 6844 pulled_task = -1;
6735 6845
6736 if (pulled_task) { 6846 if (pulled_task) {
@@ -6891,7 +7001,7 @@ static inline void set_cpu_sd_state_busy(void)
6891 goto unlock; 7001 goto unlock;
6892 sd->nohz_idle = 0; 7002 sd->nohz_idle = 0;
6893 7003
6894 atomic_inc(&sd->groups->sgp->nr_busy_cpus); 7004 atomic_inc(&sd->groups->sgc->nr_busy_cpus);
6895unlock: 7005unlock:
6896 rcu_read_unlock(); 7006 rcu_read_unlock();
6897} 7007}
@@ -6908,7 +7018,7 @@ void set_cpu_sd_state_idle(void)
6908 goto unlock; 7018 goto unlock;
6909 sd->nohz_idle = 1; 7019 sd->nohz_idle = 1;
6910 7020
6911 atomic_dec(&sd->groups->sgp->nr_busy_cpus); 7021 atomic_dec(&sd->groups->sgc->nr_busy_cpus);
6912unlock: 7022unlock:
6913 rcu_read_unlock(); 7023 rcu_read_unlock();
6914} 7024}
@@ -7011,16 +7121,9 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
7011 break; 7121 break;
7012 } 7122 }
7013 7123
7014 interval = sd->balance_interval; 7124 interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
7015 if (idle != CPU_IDLE)
7016 interval *= sd->busy_factor;
7017
7018 /* scale ms to jiffies */
7019 interval = msecs_to_jiffies(interval);
7020 interval = clamp(interval, 1UL, max_load_balance_interval);
7021 7125
7022 need_serialize = sd->flags & SD_SERIALIZE; 7126 need_serialize = sd->flags & SD_SERIALIZE;
7023
7024 if (need_serialize) { 7127 if (need_serialize) {
7025 if (!spin_trylock(&balancing)) 7128 if (!spin_trylock(&balancing))
7026 goto out; 7129 goto out;
@@ -7036,6 +7139,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
7036 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE; 7139 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
7037 } 7140 }
7038 sd->last_balance = jiffies; 7141 sd->last_balance = jiffies;
7142 interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
7039 } 7143 }
7040 if (need_serialize) 7144 if (need_serialize)
7041 spin_unlock(&balancing); 7145 spin_unlock(&balancing);
@@ -7093,12 +7197,17 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
7093 7197
7094 rq = cpu_rq(balance_cpu); 7198 rq = cpu_rq(balance_cpu);
7095 7199
7096 raw_spin_lock_irq(&rq->lock); 7200 /*
7097 update_rq_clock(rq); 7201 * If time for next balance is due,
7098 update_idle_cpu_load(rq); 7202 * do the balance.
7099 raw_spin_unlock_irq(&rq->lock); 7203 */
7100 7204 if (time_after_eq(jiffies, rq->next_balance)) {
7101 rebalance_domains(rq, CPU_IDLE); 7205 raw_spin_lock_irq(&rq->lock);
7206 update_rq_clock(rq);
7207 update_idle_cpu_load(rq);
7208 raw_spin_unlock_irq(&rq->lock);
7209 rebalance_domains(rq, CPU_IDLE);
7210 }
7102 7211
7103 if (time_after(this_rq->next_balance, rq->next_balance)) 7212 if (time_after(this_rq->next_balance, rq->next_balance))
7104 this_rq->next_balance = rq->next_balance; 7213 this_rq->next_balance = rq->next_balance;
@@ -7113,7 +7222,7 @@ end:
7113 * of an idle cpu is the system. 7222 * of an idle cpu is the system.
7114 * - This rq has more than one task. 7223 * - This rq has more than one task.
7115 * - At any scheduler domain level, this cpu's scheduler group has multiple 7224 * - At any scheduler domain level, this cpu's scheduler group has multiple
7116 * busy cpu's exceeding the group's power. 7225 * busy cpu's exceeding the group's capacity.
7117 * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler 7226 * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
7118 * domain span are idle. 7227 * domain span are idle.
7119 */ 7228 */
@@ -7121,7 +7230,7 @@ static inline int nohz_kick_needed(struct rq *rq)
7121{ 7230{
7122 unsigned long now = jiffies; 7231 unsigned long now = jiffies;
7123 struct sched_domain *sd; 7232 struct sched_domain *sd;
7124 struct sched_group_power *sgp; 7233 struct sched_group_capacity *sgc;
7125 int nr_busy, cpu = rq->cpu; 7234 int nr_busy, cpu = rq->cpu;
7126 7235
7127 if (unlikely(rq->idle_balance)) 7236 if (unlikely(rq->idle_balance))
@@ -7151,8 +7260,8 @@ static inline int nohz_kick_needed(struct rq *rq)
7151 sd = rcu_dereference(per_cpu(sd_busy, cpu)); 7260 sd = rcu_dereference(per_cpu(sd_busy, cpu));
7152 7261
7153 if (sd) { 7262 if (sd) {
7154 sgp = sd->groups->sgp; 7263 sgc = sd->groups->sgc;
7155 nr_busy = atomic_read(&sgp->nr_busy_cpus); 7264 nr_busy = atomic_read(&sgc->nr_busy_cpus);
7156 7265
7157 if (nr_busy > 1) 7266 if (nr_busy > 1)
7158 goto need_kick_unlock; 7267 goto need_kick_unlock;
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 5716929a2e3a..90284d117fe6 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -37,18 +37,18 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true)
37SCHED_FEAT(WAKEUP_PREEMPTION, true) 37SCHED_FEAT(WAKEUP_PREEMPTION, true)
38 38
39/* 39/*
40 * Use arch dependent cpu power functions 40 * Use arch dependent cpu capacity functions
41 */ 41 */
42SCHED_FEAT(ARCH_POWER, true) 42SCHED_FEAT(ARCH_CAPACITY, true)
43 43
44SCHED_FEAT(HRTICK, false) 44SCHED_FEAT(HRTICK, false)
45SCHED_FEAT(DOUBLE_TICK, false) 45SCHED_FEAT(DOUBLE_TICK, false)
46SCHED_FEAT(LB_BIAS, true) 46SCHED_FEAT(LB_BIAS, true)
47 47
48/* 48/*
49 * Decrement CPU power based on time not spent running tasks 49 * Decrement CPU capacity based on time not spent running tasks
50 */ 50 */
51SCHED_FEAT(NONTASK_POWER, true) 51SCHED_FEAT(NONTASK_CAPACITY, true)
52 52
53/* 53/*
54 * Queue remote wakeups on the target CPU and process them 54 * Queue remote wakeups on the target CPU and process them
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 8f4390a079c7..cf009fb0bc25 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -12,6 +12,8 @@
12 12
13#include <trace/events/power.h> 13#include <trace/events/power.h>
14 14
15#include "sched.h"
16
15static int __read_mostly cpu_idle_force_poll; 17static int __read_mostly cpu_idle_force_poll;
16 18
17void cpu_idle_poll_ctrl(bool enable) 19void cpu_idle_poll_ctrl(bool enable)
@@ -67,24 +69,25 @@ void __weak arch_cpu_idle(void)
67 * cpuidle_idle_call - the main idle function 69 * cpuidle_idle_call - the main idle function
68 * 70 *
69 * NOTE: no locks or semaphores should be used here 71 * NOTE: no locks or semaphores should be used here
70 * return non-zero on failure 72 *
 73 * On archs that support TIF_POLLING_NRFLAG, this function is called with polling
74 * set, and it returns with polling set. If it ever stops polling, it
75 * must clear the polling bit.
71 */ 76 */
72static int cpuidle_idle_call(void) 77static void cpuidle_idle_call(void)
73{ 78{
74 struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); 79 struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
75 struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); 80 struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
76 int next_state, entered_state, ret; 81 int next_state, entered_state;
77 bool broadcast; 82 bool broadcast;
78 83
79 /* 84 /*
80 * Check if the idle task must be rescheduled. If it is the 85 * Check if the idle task must be rescheduled. If it is the
81 * case, exit the function after re-enabling the local irq and 86 * case, exit the function after re-enabling the local irq.
82 * set again the polling flag
83 */ 87 */
84 if (current_clr_polling_and_test()) { 88 if (need_resched()) {
85 local_irq_enable(); 89 local_irq_enable();
86 __current_set_polling(); 90 return;
87 return 0;
88 } 91 }
89 92
90 /* 93 /*
@@ -101,104 +104,99 @@ static int cpuidle_idle_call(void)
101 rcu_idle_enter(); 104 rcu_idle_enter();
102 105
103 /* 106 /*
104 * Check if the cpuidle framework is ready, otherwise fallback 107 * Ask the cpuidle framework to choose a convenient idle state.
105 * to the default arch specific idle method 108 * Fall back to the default arch idle method on errors.
106 */ 109 */
107 ret = cpuidle_enabled(drv, dev); 110 next_state = cpuidle_select(drv, dev);
108 111 if (next_state < 0) {
109 if (!ret) { 112use_default:
110 /* 113 /*
111 * Ask the governor to choose an idle state it thinks 114 * We can't use the cpuidle framework, let's use the default
112 * it is convenient to go to. There is *always* a 115 * idle routine.
113 * convenient idle state
114 */ 116 */
115 next_state = cpuidle_select(drv, dev); 117 if (current_clr_polling_and_test())
116
117 /*
118 * The idle task must be scheduled, it is pointless to
119 * go to idle, just update no idle residency and get
120 * out of this function
121 */
122 if (current_clr_polling_and_test()) {
123 dev->last_residency = 0;
124 entered_state = next_state;
125 local_irq_enable(); 118 local_irq_enable();
126 } else { 119 else
127 broadcast = !!(drv->states[next_state].flags & 120 arch_cpu_idle();
128 CPUIDLE_FLAG_TIMER_STOP); 121
129 122 goto exit_idle;
130 if (broadcast)
131 /*
132 * Tell the time framework to switch
133 * to a broadcast timer because our
134 * local timer will be shutdown. If a
135 * local timer is used from another
136 * cpu as a broadcast timer, this call
137 * may fail if it is not available
138 */
139 ret = clockevents_notify(
140 CLOCK_EVT_NOTIFY_BROADCAST_ENTER,
141 &dev->cpu);
142
143 if (!ret) {
144 trace_cpu_idle_rcuidle(next_state, dev->cpu);
145
146 /*
147 * Enter the idle state previously
148 * returned by the governor
149 * decision. This function will block
150 * until an interrupt occurs and will
151 * take care of re-enabling the local
152 * interrupts
153 */
154 entered_state = cpuidle_enter(drv, dev,
155 next_state);
156
157 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT,
158 dev->cpu);
159
160 if (broadcast)
161 clockevents_notify(
162 CLOCK_EVT_NOTIFY_BROADCAST_EXIT,
163 &dev->cpu);
164
165 /*
166 * Give the governor an opportunity to reflect on the
167 * outcome
168 */
169 cpuidle_reflect(dev, entered_state);
170 }
171 }
172 } 123 }
173 124
125
126 /*
127 * The idle task must be scheduled, it is pointless to
128 * go to idle, just update no idle residency and get
129 * out of this function
130 */
131 if (current_clr_polling_and_test()) {
132 dev->last_residency = 0;
133 entered_state = next_state;
134 local_irq_enable();
135 goto exit_idle;
136 }
137
138 broadcast = !!(drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP);
139
174 /* 140 /*
175 * We can't use the cpuidle framework, let's use the default 141 * Tell the time framework to switch to a broadcast timer
176 * idle routine 142 * because our local timer will be shutdown. If a local timer
143 * is used from another cpu as a broadcast timer, this call may
144 * fail if it is not available
177 */ 145 */
178 if (ret) 146 if (broadcast &&
179 arch_cpu_idle(); 147 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu))
148 goto use_default;
180 149
150 trace_cpu_idle_rcuidle(next_state, dev->cpu);
151
152 /*
153 * Enter the idle state previously returned by the governor decision.
154 * This function will block until an interrupt occurs and will take
155 * care of re-enabling the local interrupts
156 */
157 entered_state = cpuidle_enter(drv, dev, next_state);
158
159 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu);
160
161 if (broadcast)
162 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu);
163
164 /*
165 * Give the governor an opportunity to reflect on the outcome
166 */
167 cpuidle_reflect(dev, entered_state);
168
169exit_idle:
181 __current_set_polling(); 170 __current_set_polling();
182 171
183 /* 172 /*
184 * It is up to the idle functions to enable back the local 173 * It is up to the idle functions to reenable local interrupts
185 * interrupt
186 */ 174 */
187 if (WARN_ON_ONCE(irqs_disabled())) 175 if (WARN_ON_ONCE(irqs_disabled()))
188 local_irq_enable(); 176 local_irq_enable();
189 177
190 rcu_idle_exit(); 178 rcu_idle_exit();
191 start_critical_timings(); 179 start_critical_timings();
192
193 return 0;
194} 180}
195 181
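Editor's note: a condensed, hedged restatement of the reworked cpuidle_idle_call() flow above, with every cpuidle and arch call replaced by a stub so the control flow compiles on its own; the stub names and return values are placeholders, not the real interfaces.

        /* Hedged sketch of the new control flow: select an idle state, fall back
         * to the default arch idle on error or on broadcast-timer failure, and
         * let the governor reflect on the outcome otherwise. */
        #include <stdio.h>
        #include <stdbool.h>

        static int  cpuidle_select(void)        { return -1; }   /* pretend cpuidle is unavailable */
        static int  cpuidle_enter(int state)    { return state; }
        static void cpuidle_reflect(int state)  { (void)state; }
        static bool need_resched_now(void)      { return false; }
        static bool broadcast_enter_fails(void) { return false; }
        static void arch_cpu_idle(void)         { puts("default arch idle"); }

        static void idle_call(void)
        {
                int next_state, entered_state;

                if (need_resched_now())
                        return;                         /* nothing to do, go reschedule */

                next_state = cpuidle_select();
                if (next_state < 0) {
        use_default:
                        arch_cpu_idle();                /* fall back to the arch default */
                        goto exit_idle;
                }

                if (broadcast_enter_fails())            /* local timer can't be handed off */
                        goto use_default;

                entered_state = cpuidle_enter(next_state);
                cpuidle_reflect(entered_state);         /* let the governor learn */

        exit_idle:
                ;                                       /* restore polling, re-enable irqs */
        }

        int main(void)
        {
                idle_call();
                return 0;
        }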
196/* 182/*
197 * Generic idle loop implementation 183 * Generic idle loop implementation
184 *
185 * Called with polling cleared.
198 */ 186 */
199static void cpu_idle_loop(void) 187static void cpu_idle_loop(void)
200{ 188{
201 while (1) { 189 while (1) {
190 /*
191 * If the arch has a polling bit, we maintain an invariant:
192 *
193 * Our polling bit is clear if we're not scheduled (i.e. if
194 * rq->curr != rq->idle). This means that, if rq->idle has
195 * the polling bit set, then setting need_resched is
196 * guaranteed to cause the cpu to reschedule.
197 */
198
199 __current_set_polling();
202 tick_nohz_idle_enter(); 200 tick_nohz_idle_enter();
203 201
204 while (!need_resched()) { 202 while (!need_resched()) {
@@ -238,6 +236,17 @@ static void cpu_idle_loop(void)
238 */ 236 */
239 preempt_set_need_resched(); 237 preempt_set_need_resched();
240 tick_nohz_idle_exit(); 238 tick_nohz_idle_exit();
239 __current_clr_polling();
240
241 /*
242 * We promise to call sched_ttwu_pending and reschedule
243 * if need_resched is set while polling is set. That
244 * means that clearing polling needs to be visible
245 * before doing these things.
246 */
247 smp_mb__after_atomic();
248
249 sched_ttwu_pending();
241 schedule_preempt_disabled(); 250 schedule_preempt_disabled();
242 } 251 }
243} 252}
@@ -259,7 +268,6 @@ void cpu_startup_entry(enum cpuhp_state state)
259 */ 268 */
260 boot_init_stack_canary(); 269 boot_init_stack_canary();
261#endif 270#endif
262 __current_set_polling();
263 arch_cpu_idle_prepare(); 271 arch_cpu_idle_prepare();
264 cpu_idle_loop(); 272 cpu_idle_loop();
265} 273}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index bd2267ad404f..a49083192c64 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -79,6 +79,8 @@ void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
79 rt_rq->overloaded = 0; 79 rt_rq->overloaded = 0;
80 plist_head_init(&rt_rq->pushable_tasks); 80 plist_head_init(&rt_rq->pushable_tasks);
81#endif 81#endif
 82 /* We start in dequeued state, because no RT tasks are queued */
83 rt_rq->rt_queued = 0;
82 84
83 rt_rq->rt_time = 0; 85 rt_rq->rt_time = 0;
84 rt_rq->rt_throttled = 0; 86 rt_rq->rt_throttled = 0;
@@ -112,6 +114,13 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
112 return rt_se->rt_rq; 114 return rt_se->rt_rq;
113} 115}
114 116
117static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
118{
119 struct rt_rq *rt_rq = rt_se->rt_rq;
120
121 return rt_rq->rq;
122}
123
115void free_rt_sched_group(struct task_group *tg) 124void free_rt_sched_group(struct task_group *tg)
116{ 125{
117 int i; 126 int i;
@@ -211,10 +220,16 @@ static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
211 return container_of(rt_rq, struct rq, rt); 220 return container_of(rt_rq, struct rq, rt);
212} 221}
213 222
214static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) 223static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
215{ 224{
216 struct task_struct *p = rt_task_of(rt_se); 225 struct task_struct *p = rt_task_of(rt_se);
217 struct rq *rq = task_rq(p); 226
227 return task_rq(p);
228}
229
230static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
231{
232 struct rq *rq = rq_of_rt_se(rt_se);
218 233
219 return &rq->rt; 234 return &rq->rt;
220} 235}
@@ -391,6 +406,9 @@ static inline void set_post_schedule(struct rq *rq)
391} 406}
392#endif /* CONFIG_SMP */ 407#endif /* CONFIG_SMP */
393 408
409static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
410static void dequeue_top_rt_rq(struct rt_rq *rt_rq);
411
394static inline int on_rt_rq(struct sched_rt_entity *rt_se) 412static inline int on_rt_rq(struct sched_rt_entity *rt_se)
395{ 413{
396 return !list_empty(&rt_se->run_list); 414 return !list_empty(&rt_se->run_list);
@@ -452,8 +470,11 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
452 rt_se = rt_rq->tg->rt_se[cpu]; 470 rt_se = rt_rq->tg->rt_se[cpu];
453 471
454 if (rt_rq->rt_nr_running) { 472 if (rt_rq->rt_nr_running) {
455 if (rt_se && !on_rt_rq(rt_se)) 473 if (!rt_se)
474 enqueue_top_rt_rq(rt_rq);
475 else if (!on_rt_rq(rt_se))
456 enqueue_rt_entity(rt_se, false); 476 enqueue_rt_entity(rt_se, false);
477
457 if (rt_rq->highest_prio.curr < curr->prio) 478 if (rt_rq->highest_prio.curr < curr->prio)
458 resched_task(curr); 479 resched_task(curr);
459 } 480 }
@@ -466,10 +487,17 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
466 487
467 rt_se = rt_rq->tg->rt_se[cpu]; 488 rt_se = rt_rq->tg->rt_se[cpu];
468 489
469 if (rt_se && on_rt_rq(rt_se)) 490 if (!rt_se)
491 dequeue_top_rt_rq(rt_rq);
492 else if (on_rt_rq(rt_se))
470 dequeue_rt_entity(rt_se); 493 dequeue_rt_entity(rt_se);
471} 494}
472 495
496static inline int rt_rq_throttled(struct rt_rq *rt_rq)
497{
498 return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
499}
500
473static int rt_se_boosted(struct sched_rt_entity *rt_se) 501static int rt_se_boosted(struct sched_rt_entity *rt_se)
474{ 502{
475 struct rt_rq *rt_rq = group_rt_rq(rt_se); 503 struct rt_rq *rt_rq = group_rt_rq(rt_se);
@@ -532,12 +560,23 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
532 560
533static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) 561static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
534{ 562{
535 if (rt_rq->rt_nr_running) 563 struct rq *rq = rq_of_rt_rq(rt_rq);
536 resched_task(rq_of_rt_rq(rt_rq)->curr); 564
565 if (!rt_rq->rt_nr_running)
566 return;
567
568 enqueue_top_rt_rq(rt_rq);
569 resched_task(rq->curr);
537} 570}
538 571
539static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) 572static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
540{ 573{
574 dequeue_top_rt_rq(rt_rq);
575}
576
577static inline int rt_rq_throttled(struct rt_rq *rt_rq)
578{
579 return rt_rq->rt_throttled;
541} 580}
542 581
543static inline const struct cpumask *sched_rt_period_mask(void) 582static inline const struct cpumask *sched_rt_period_mask(void)
@@ -851,14 +890,8 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
851 * but accrue some time due to boosting. 890 * but accrue some time due to boosting.
852 */ 891 */
853 if (likely(rt_b->rt_runtime)) { 892 if (likely(rt_b->rt_runtime)) {
854 static bool once = false;
855
856 rt_rq->rt_throttled = 1; 893 rt_rq->rt_throttled = 1;
857 894 printk_deferred_once("sched: RT throttling activated\n");
858 if (!once) {
859 once = true;
860 printk_sched("sched: RT throttling activated\n");
861 }
862 } else { 895 } else {
863 /* 896 /*
864 * In case we did anyway, make it go away, 897 * In case we did anyway, make it go away,
@@ -885,7 +918,6 @@ static void update_curr_rt(struct rq *rq)
885{ 918{
886 struct task_struct *curr = rq->curr; 919 struct task_struct *curr = rq->curr;
887 struct sched_rt_entity *rt_se = &curr->rt; 920 struct sched_rt_entity *rt_se = &curr->rt;
888 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
889 u64 delta_exec; 921 u64 delta_exec;
890 922
891 if (curr->sched_class != &rt_sched_class) 923 if (curr->sched_class != &rt_sched_class)
@@ -910,7 +942,7 @@ static void update_curr_rt(struct rq *rq)
910 return; 942 return;
911 943
912 for_each_sched_rt_entity(rt_se) { 944 for_each_sched_rt_entity(rt_se) {
913 rt_rq = rt_rq_of_se(rt_se); 945 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
914 946
915 if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { 947 if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
916 raw_spin_lock(&rt_rq->rt_runtime_lock); 948 raw_spin_lock(&rt_rq->rt_runtime_lock);
@@ -922,6 +954,38 @@ static void update_curr_rt(struct rq *rq)
922 } 954 }
923} 955}
924 956
957static void
958dequeue_top_rt_rq(struct rt_rq *rt_rq)
959{
960 struct rq *rq = rq_of_rt_rq(rt_rq);
961
962 BUG_ON(&rq->rt != rt_rq);
963
964 if (!rt_rq->rt_queued)
965 return;
966
967 BUG_ON(!rq->nr_running);
968
969 sub_nr_running(rq, rt_rq->rt_nr_running);
970 rt_rq->rt_queued = 0;
971}
972
973static void
974enqueue_top_rt_rq(struct rt_rq *rt_rq)
975{
976 struct rq *rq = rq_of_rt_rq(rt_rq);
977
978 BUG_ON(&rq->rt != rt_rq);
979
980 if (rt_rq->rt_queued)
981 return;
982 if (rt_rq_throttled(rt_rq) || !rt_rq->rt_nr_running)
983 return;
984
985 add_nr_running(rq, rt_rq->rt_nr_running);
986 rt_rq->rt_queued = 1;
987}
988
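Editor's note: a trimmed sketch of the new accounting: a group entity contributes its child rt_rq's rt_nr_running, a task entity contributes 1, and only the top-level rt_rq is folded into rq->nr_running via enqueue_top_rt_rq()/dequeue_top_rt_rq(). The structs below are stand-ins, not the kernel's.

        /* Hedged sketch of the rt_nr_running propagation introduced above. */
        #include <stdio.h>

        struct rt_rq {
                unsigned int rt_nr_running;
                int rt_queued;
                int rt_throttled;
        };

        struct rq {
                unsigned int nr_running;
                struct rt_rq rt;
        };

        /* task entity -> 1, group entity -> its child runqueue's count */
        static unsigned int rt_se_nr_running(struct rt_rq *group_rq)
        {
                return group_rq ? group_rq->rt_nr_running : 1;
        }

        static void enqueue_top_rt_rq(struct rq *rq)
        {
                struct rt_rq *rt_rq = &rq->rt;

                if (rt_rq->rt_queued || rt_rq->rt_throttled || !rt_rq->rt_nr_running)
                        return;
                rq->nr_running += rt_rq->rt_nr_running;
                rt_rq->rt_queued = 1;
        }

        static void dequeue_top_rt_rq(struct rq *rq)
        {
                struct rt_rq *rt_rq = &rq->rt;

                if (!rt_rq->rt_queued)
                        return;
                rq->nr_running -= rt_rq->rt_nr_running;
                rt_rq->rt_queued = 0;
        }

        int main(void)
        {
                struct rq rq = { 0, { 0, 0, 0 } };
                struct rt_rq group = { 3, 0, 0 };       /* a group with 3 queued RT tasks */

                /* enqueue the group entity onto the top-level rt_rq */
                rq.rt.rt_nr_running += rt_se_nr_running(&group);
                enqueue_top_rt_rq(&rq);
                printf("rq->nr_running = %u\n", rq.nr_running); /* 3 */

                dequeue_top_rt_rq(&rq);
                printf("rq->nr_running = %u\n", rq.nr_running); /* 0 */
                return 0;
        }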
925#if defined CONFIG_SMP 989#if defined CONFIG_SMP
926 990
927static void 991static void
@@ -1045,12 +1109,23 @@ void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
1045#endif /* CONFIG_RT_GROUP_SCHED */ 1109#endif /* CONFIG_RT_GROUP_SCHED */
1046 1110
1047static inline 1111static inline
1112unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
1113{
1114 struct rt_rq *group_rq = group_rt_rq(rt_se);
1115
1116 if (group_rq)
1117 return group_rq->rt_nr_running;
1118 else
1119 return 1;
1120}
1121
1122static inline
1048void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 1123void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1049{ 1124{
1050 int prio = rt_se_prio(rt_se); 1125 int prio = rt_se_prio(rt_se);
1051 1126
1052 WARN_ON(!rt_prio(prio)); 1127 WARN_ON(!rt_prio(prio));
1053 rt_rq->rt_nr_running++; 1128 rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
1054 1129
1055 inc_rt_prio(rt_rq, prio); 1130 inc_rt_prio(rt_rq, prio);
1056 inc_rt_migration(rt_se, rt_rq); 1131 inc_rt_migration(rt_se, rt_rq);
@@ -1062,7 +1137,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1062{ 1137{
1063 WARN_ON(!rt_prio(rt_se_prio(rt_se))); 1138 WARN_ON(!rt_prio(rt_se_prio(rt_se)));
1064 WARN_ON(!rt_rq->rt_nr_running); 1139 WARN_ON(!rt_rq->rt_nr_running);
1065 rt_rq->rt_nr_running--; 1140 rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
1066 1141
1067 dec_rt_prio(rt_rq, rt_se_prio(rt_se)); 1142 dec_rt_prio(rt_rq, rt_se_prio(rt_se));
1068 dec_rt_migration(rt_se, rt_rq); 1143 dec_rt_migration(rt_se, rt_rq);
@@ -1119,6 +1194,8 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
1119 back = rt_se; 1194 back = rt_se;
1120 } 1195 }
1121 1196
1197 dequeue_top_rt_rq(rt_rq_of_se(back));
1198
1122 for (rt_se = back; rt_se; rt_se = rt_se->back) { 1199 for (rt_se = back; rt_se; rt_se = rt_se->back) {
1123 if (on_rt_rq(rt_se)) 1200 if (on_rt_rq(rt_se))
1124 __dequeue_rt_entity(rt_se); 1201 __dequeue_rt_entity(rt_se);
@@ -1127,13 +1204,18 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
1127 1204
1128static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) 1205static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
1129{ 1206{
1207 struct rq *rq = rq_of_rt_se(rt_se);
1208
1130 dequeue_rt_stack(rt_se); 1209 dequeue_rt_stack(rt_se);
1131 for_each_sched_rt_entity(rt_se) 1210 for_each_sched_rt_entity(rt_se)
1132 __enqueue_rt_entity(rt_se, head); 1211 __enqueue_rt_entity(rt_se, head);
1212 enqueue_top_rt_rq(&rq->rt);
1133} 1213}
1134 1214
1135static void dequeue_rt_entity(struct sched_rt_entity *rt_se) 1215static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
1136{ 1216{
1217 struct rq *rq = rq_of_rt_se(rt_se);
1218
1137 dequeue_rt_stack(rt_se); 1219 dequeue_rt_stack(rt_se);
1138 1220
1139 for_each_sched_rt_entity(rt_se) { 1221 for_each_sched_rt_entity(rt_se) {
@@ -1142,6 +1224,7 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
1142 if (rt_rq && rt_rq->rt_nr_running) 1224 if (rt_rq && rt_rq->rt_nr_running)
1143 __enqueue_rt_entity(rt_se, false); 1225 __enqueue_rt_entity(rt_se, false);
1144 } 1226 }
1227 enqueue_top_rt_rq(&rq->rt);
1145} 1228}
1146 1229
1147/* 1230/*
@@ -1159,8 +1242,6 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1159 1242
1160 if (!task_current(rq, p) && p->nr_cpus_allowed > 1) 1243 if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
1161 enqueue_pushable_task(rq, p); 1244 enqueue_pushable_task(rq, p);
1162
1163 inc_nr_running(rq);
1164} 1245}
1165 1246
1166static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) 1247static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
@@ -1171,8 +1252,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1171 dequeue_rt_entity(rt_se); 1252 dequeue_rt_entity(rt_se);
1172 1253
1173 dequeue_pushable_task(rq, p); 1254 dequeue_pushable_task(rq, p);
1174
1175 dec_nr_running(rq);
1176} 1255}
1177 1256
1178/* 1257/*
@@ -1377,10 +1456,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
1377 if (prev->sched_class == &rt_sched_class) 1456 if (prev->sched_class == &rt_sched_class)
1378 update_curr_rt(rq); 1457 update_curr_rt(rq);
1379 1458
1380 if (!rt_rq->rt_nr_running) 1459 if (!rt_rq->rt_queued)
1381 return NULL;
1382
1383 if (rt_rq_throttled(rt_rq))
1384 return NULL; 1460 return NULL;
1385 1461
1386 put_prev_task(rq, prev); 1462 put_prev_task(rq, prev);
@@ -1892,9 +1968,9 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
1892 */ 1968 */
1893 if (p->on_rq && rq->curr != p) { 1969 if (p->on_rq && rq->curr != p) {
1894#ifdef CONFIG_SMP 1970#ifdef CONFIG_SMP
1895 if (rq->rt.overloaded && push_rt_task(rq) && 1971 if (p->nr_cpus_allowed > 1 && rq->rt.overloaded &&
1896 /* Don't resched if we changed runqueues */ 1972 /* Don't resched if we changed runqueues */
1897 rq != task_rq(p)) 1973 push_rt_task(rq) && rq != task_rq(p))
1898 check_resched = 0; 1974 check_resched = 0;
1899#endif /* CONFIG_SMP */ 1975#endif /* CONFIG_SMP */
1900 if (check_resched && p->prio < rq->curr->prio) 1976 if (check_resched && p->prio < rq->curr->prio)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 456e492a3dca..31cc02ebc54e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -278,7 +278,7 @@ extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
278extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); 278extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
279 279
280extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b); 280extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
281extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b); 281extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force);
282extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq); 282extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);
283 283
284extern void free_rt_sched_group(struct task_group *tg); 284extern void free_rt_sched_group(struct task_group *tg);
@@ -409,6 +409,8 @@ struct rt_rq {
409 int overloaded; 409 int overloaded;
410 struct plist_head pushable_tasks; 410 struct plist_head pushable_tasks;
411#endif 411#endif
412 int rt_queued;
413
412 int rt_throttled; 414 int rt_throttled;
413 u64 rt_time; 415 u64 rt_time;
414 u64 rt_runtime; 416 u64 rt_runtime;
@@ -423,18 +425,6 @@ struct rt_rq {
423#endif 425#endif
424}; 426};
425 427
426#ifdef CONFIG_RT_GROUP_SCHED
427static inline int rt_rq_throttled(struct rt_rq *rt_rq)
428{
429 return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
430}
431#else
432static inline int rt_rq_throttled(struct rt_rq *rt_rq)
433{
434 return rt_rq->rt_throttled;
435}
436#endif
437
438/* Deadline class' related fields in a runqueue */ 428/* Deadline class' related fields in a runqueue */
439struct dl_rq { 429struct dl_rq {
440 /* runqueue is an rbtree, ordered by deadline */ 430 /* runqueue is an rbtree, ordered by deadline */
@@ -577,7 +567,7 @@ struct rq {
577 struct root_domain *rd; 567 struct root_domain *rd;
578 struct sched_domain *sd; 568 struct sched_domain *sd;
579 569
580 unsigned long cpu_power; 570 unsigned long cpu_capacity;
581 571
582 unsigned char idle_balance; 572 unsigned char idle_balance;
583 /* For active balancing */ 573 /* For active balancing */
@@ -680,6 +670,8 @@ extern int migrate_swap(struct task_struct *, struct task_struct *);
680 670
681#ifdef CONFIG_SMP 671#ifdef CONFIG_SMP
682 672
673extern void sched_ttwu_pending(void);
674
683#define rcu_dereference_check_sched_domain(p) \ 675#define rcu_dereference_check_sched_domain(p) \
684 rcu_dereference_check((p), \ 676 rcu_dereference_check((p), \
685 lockdep_is_held(&sched_domains_mutex)) 677 lockdep_is_held(&sched_domains_mutex))
@@ -738,15 +730,15 @@ DECLARE_PER_CPU(struct sched_domain *, sd_numa);
738DECLARE_PER_CPU(struct sched_domain *, sd_busy); 730DECLARE_PER_CPU(struct sched_domain *, sd_busy);
739DECLARE_PER_CPU(struct sched_domain *, sd_asym); 731DECLARE_PER_CPU(struct sched_domain *, sd_asym);
740 732
741struct sched_group_power { 733struct sched_group_capacity {
742 atomic_t ref; 734 atomic_t ref;
743 /* 735 /*
744 * CPU power of this group, SCHED_LOAD_SCALE being max power for a 736 * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity
745 * single CPU. 737 * for a single CPU.
746 */ 738 */
747 unsigned int power, power_orig; 739 unsigned int capacity, capacity_orig;
748 unsigned long next_update; 740 unsigned long next_update;
749 int imbalance; /* XXX unrelated to power but shared group state */ 741 int imbalance; /* XXX unrelated to capacity but shared group state */
750 /* 742 /*
751 * Number of busy cpus in this group. 743 * Number of busy cpus in this group.
752 */ 744 */
@@ -760,7 +752,7 @@ struct sched_group {
760 atomic_t ref; 752 atomic_t ref;
761 753
762 unsigned int group_weight; 754 unsigned int group_weight;
763 struct sched_group_power *sgp; 755 struct sched_group_capacity *sgc;
764 756
765 /* 757 /*
766 * The CPUs this group covers. 758 * The CPUs this group covers.
@@ -783,7 +775,7 @@ static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
783 */ 775 */
784static inline struct cpumask *sched_group_mask(struct sched_group *sg) 776static inline struct cpumask *sched_group_mask(struct sched_group *sg)
785{ 777{
786 return to_cpumask(sg->sgp->cpumask); 778 return to_cpumask(sg->sgc->cpumask);
787} 779}
788 780
789/** 781/**
@@ -797,6 +789,10 @@ static inline unsigned int group_first_cpu(struct sched_group *group)
797 789
798extern int group_balance_cpu(struct sched_group *sg); 790extern int group_balance_cpu(struct sched_group *sg);
799 791
792#else
793
794static inline void sched_ttwu_pending(void) { }
795
800#endif /* CONFIG_SMP */ 796#endif /* CONFIG_SMP */
801 797
802#include "stats.h" 798#include "stats.h"
@@ -1177,7 +1173,7 @@ extern const struct sched_class idle_sched_class;
1177 1173
1178#ifdef CONFIG_SMP 1174#ifdef CONFIG_SMP
1179 1175
1180extern void update_group_power(struct sched_domain *sd, int cpu); 1176extern void update_group_capacity(struct sched_domain *sd, int cpu);
1181 1177
1182extern void trigger_load_balance(struct rq *rq); 1178extern void trigger_load_balance(struct rq *rq);
1183 1179
@@ -1216,12 +1212,14 @@ extern void update_idle_cpu_load(struct rq *this_rq);
1216 1212
1217extern void init_task_runnable_average(struct task_struct *p); 1213extern void init_task_runnable_average(struct task_struct *p);
1218 1214
1219static inline void inc_nr_running(struct rq *rq) 1215static inline void add_nr_running(struct rq *rq, unsigned count)
1220{ 1216{
1221 rq->nr_running++; 1217 unsigned prev_nr = rq->nr_running;
1218
1219 rq->nr_running = prev_nr + count;
1222 1220
1223#ifdef CONFIG_NO_HZ_FULL 1221#ifdef CONFIG_NO_HZ_FULL
1224 if (rq->nr_running == 2) { 1222 if (prev_nr < 2 && rq->nr_running >= 2) {
1225 if (tick_nohz_full_cpu(rq->cpu)) { 1223 if (tick_nohz_full_cpu(rq->cpu)) {
1226 /* Order rq->nr_running write against the IPI */ 1224 /* Order rq->nr_running write against the IPI */
1227 smp_wmb(); 1225 smp_wmb();
@@ -1231,9 +1229,9 @@ static inline void inc_nr_running(struct rq *rq)
1231#endif 1229#endif
1232} 1230}
1233 1231
1234static inline void dec_nr_running(struct rq *rq) 1232static inline void sub_nr_running(struct rq *rq, unsigned count)
1235{ 1233{
1236 rq->nr_running--; 1234 rq->nr_running -= count;
1237} 1235}
1238 1236
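Editor's note: a minimal illustration of why add_nr_running() above tests prev_nr < 2 && nr_running >= 2 rather than == 2; the nohz-full kick is stubbed with a printf and the rq struct is reduced to the one field that matters here.

        /* Hedged sketch: once counts can grow by more than one at a time (an RT
         * group enqueue), a jump from 1 straight to 4 must still kick the tick. */
        #include <stdio.h>

        struct rq {
                unsigned int nr_running;
        };

        static void add_nr_running(struct rq *rq, unsigned int count)
        {
                unsigned int prev_nr = rq->nr_running;

                rq->nr_running = prev_nr + count;

                /* crossed the 1 -> >=2 boundary: the CPU is no longer single-task */
                if (prev_nr < 2 && rq->nr_running >= 2)
                        printf("kick nohz-full tick (was %u, now %u)\n",
                               prev_nr, rq->nr_running);
        }

        int main(void)
        {
                struct rq rq = { 1 };

                add_nr_running(&rq, 3);         /* 1 -> 4: an "== 2" test would miss this */
                add_nr_running(&rq, 1);         /* 4 -> 5: no kick needed */
                return 0;
        }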
1239static inline void rq_last_tick_reset(struct rq *rq) 1237static inline void rq_last_tick_reset(struct rq *rq)
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index d6ce65dde541..bfe0edadbfbb 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -41,13 +41,13 @@ pick_next_task_stop(struct rq *rq, struct task_struct *prev)
41static void 41static void
42enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) 42enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
43{ 43{
44 inc_nr_running(rq); 44 add_nr_running(rq, 1);
45} 45}
46 46
47static void 47static void
48dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) 48dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
49{ 49{
50 dec_nr_running(rq); 50 sub_nr_running(rq, 1);
51} 51}
52 52
53static void yield_task_stop(struct rq *rq) 53static void yield_task_stop(struct rq *rq)
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 7d50f794e248..0ffa20ae657b 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -394,7 +394,7 @@ EXPORT_SYMBOL(__wake_up_bit);
394 * 394 *
395 * In order for this to function properly, as it uses waitqueue_active() 395 * In order for this to function properly, as it uses waitqueue_active()
396 * internally, some kind of memory barrier must be done prior to calling 396 * internally, some kind of memory barrier must be done prior to calling
397 * this. Typically, this will be smp_mb__after_clear_bit(), but in some 397 * this. Typically, this will be smp_mb__after_atomic(), but in some
398 * cases where bitflags are manipulated non-atomically under a lock, one 398 * cases where bitflags are manipulated non-atomically under a lock, one
399 * may need to use a less regular barrier, such fs/inode.c's smp_mb(), 399 * may need to use a less regular barrier, such fs/inode.c's smp_mb(),
400 * because spin_unlock() does not guarantee a memory barrier. 400 * because spin_unlock() does not guarantee a memory barrier.
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index b35c21503a36..301bbc24739c 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -39,7 +39,7 @@
39 * is only needed for handling filters shared across tasks. 39 * is only needed for handling filters shared across tasks.
40 * @prev: points to a previously installed, or inherited, filter 40 * @prev: points to a previously installed, or inherited, filter
41 * @len: the number of instructions in the program 41 * @len: the number of instructions in the program
42 * @insns: the BPF program instructions to evaluate 42 * @insnsi: the BPF program instructions to evaluate
43 * 43 *
44 * seccomp_filter objects are organized in a tree linked via the @prev 44 * seccomp_filter objects are organized in a tree linked via the @prev
45 * pointer. For any task, it appears to be a singly-linked list starting 45 * pointer. For any task, it appears to be a singly-linked list starting
@@ -54,8 +54,7 @@
54struct seccomp_filter { 54struct seccomp_filter {
55 atomic_t usage; 55 atomic_t usage;
56 struct seccomp_filter *prev; 56 struct seccomp_filter *prev;
57 unsigned short len; /* Instruction count */ 57 struct sk_filter *prog;
58 struct sock_filter_int insnsi[];
59}; 58};
60 59
61/* Limit any path through the tree to 256KB worth of instructions. */ 60/* Limit any path through the tree to 256KB worth of instructions. */
@@ -104,60 +103,59 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
104 u32 k = ftest->k; 103 u32 k = ftest->k;
105 104
106 switch (code) { 105 switch (code) {
107 case BPF_S_LD_W_ABS: 106 case BPF_LD | BPF_W | BPF_ABS:
108 ftest->code = BPF_LDX | BPF_W | BPF_ABS; 107 ftest->code = BPF_LDX | BPF_W | BPF_ABS;
109 /* 32-bit aligned and not out of bounds. */ 108 /* 32-bit aligned and not out of bounds. */
110 if (k >= sizeof(struct seccomp_data) || k & 3) 109 if (k >= sizeof(struct seccomp_data) || k & 3)
111 return -EINVAL; 110 return -EINVAL;
112 continue; 111 continue;
113 case BPF_S_LD_W_LEN: 112 case BPF_LD | BPF_W | BPF_LEN:
114 ftest->code = BPF_LD | BPF_IMM; 113 ftest->code = BPF_LD | BPF_IMM;
115 ftest->k = sizeof(struct seccomp_data); 114 ftest->k = sizeof(struct seccomp_data);
116 continue; 115 continue;
117 case BPF_S_LDX_W_LEN: 116 case BPF_LDX | BPF_W | BPF_LEN:
118 ftest->code = BPF_LDX | BPF_IMM; 117 ftest->code = BPF_LDX | BPF_IMM;
119 ftest->k = sizeof(struct seccomp_data); 118 ftest->k = sizeof(struct seccomp_data);
120 continue; 119 continue;
121 /* Explicitly include allowed calls. */ 120 /* Explicitly include allowed calls. */
122 case BPF_S_RET_K: 121 case BPF_RET | BPF_K:
123 case BPF_S_RET_A: 122 case BPF_RET | BPF_A:
124 case BPF_S_ALU_ADD_K: 123 case BPF_ALU | BPF_ADD | BPF_K:
125 case BPF_S_ALU_ADD_X: 124 case BPF_ALU | BPF_ADD | BPF_X:
126 case BPF_S_ALU_SUB_K: 125 case BPF_ALU | BPF_SUB | BPF_K:
127 case BPF_S_ALU_SUB_X: 126 case BPF_ALU | BPF_SUB | BPF_X:
128 case BPF_S_ALU_MUL_K: 127 case BPF_ALU | BPF_MUL | BPF_K:
129 case BPF_S_ALU_MUL_X: 128 case BPF_ALU | BPF_MUL | BPF_X:
130 case BPF_S_ALU_DIV_X: 129 case BPF_ALU | BPF_DIV | BPF_K:
131 case BPF_S_ALU_AND_K: 130 case BPF_ALU | BPF_DIV | BPF_X:
132 case BPF_S_ALU_AND_X: 131 case BPF_ALU | BPF_AND | BPF_K:
133 case BPF_S_ALU_OR_K: 132 case BPF_ALU | BPF_AND | BPF_X:
134 case BPF_S_ALU_OR_X: 133 case BPF_ALU | BPF_OR | BPF_K:
135 case BPF_S_ALU_XOR_K: 134 case BPF_ALU | BPF_OR | BPF_X:
136 case BPF_S_ALU_XOR_X: 135 case BPF_ALU | BPF_XOR | BPF_K:
137 case BPF_S_ALU_LSH_K: 136 case BPF_ALU | BPF_XOR | BPF_X:
138 case BPF_S_ALU_LSH_X: 137 case BPF_ALU | BPF_LSH | BPF_K:
139 case BPF_S_ALU_RSH_K: 138 case BPF_ALU | BPF_LSH | BPF_X:
140 case BPF_S_ALU_RSH_X: 139 case BPF_ALU | BPF_RSH | BPF_K:
141 case BPF_S_ALU_NEG: 140 case BPF_ALU | BPF_RSH | BPF_X:
142 case BPF_S_LD_IMM: 141 case BPF_ALU | BPF_NEG:
143 case BPF_S_LDX_IMM: 142 case BPF_LD | BPF_IMM:
144 case BPF_S_MISC_TAX: 143 case BPF_LDX | BPF_IMM:
145 case BPF_S_MISC_TXA: 144 case BPF_MISC | BPF_TAX:
146 case BPF_S_ALU_DIV_K: 145 case BPF_MISC | BPF_TXA:
147 case BPF_S_LD_MEM: 146 case BPF_LD | BPF_MEM:
148 case BPF_S_LDX_MEM: 147 case BPF_LDX | BPF_MEM:
149 case BPF_S_ST: 148 case BPF_ST:
150 case BPF_S_STX: 149 case BPF_STX:
151 case BPF_S_JMP_JA: 150 case BPF_JMP | BPF_JA:
152 case BPF_S_JMP_JEQ_K: 151 case BPF_JMP | BPF_JEQ | BPF_K:
153 case BPF_S_JMP_JEQ_X: 152 case BPF_JMP | BPF_JEQ | BPF_X:
154 case BPF_S_JMP_JGE_K: 153 case BPF_JMP | BPF_JGE | BPF_K:
155 case BPF_S_JMP_JGE_X: 154 case BPF_JMP | BPF_JGE | BPF_X:
156 case BPF_S_JMP_JGT_K: 155 case BPF_JMP | BPF_JGT | BPF_K:
157 case BPF_S_JMP_JGT_X: 156 case BPF_JMP | BPF_JGT | BPF_X:
158 case BPF_S_JMP_JSET_K: 157 case BPF_JMP | BPF_JSET | BPF_K:
159 case BPF_S_JMP_JSET_X: 158 case BPF_JMP | BPF_JSET | BPF_X:
160 sk_decode_filter(ftest, ftest);
161 continue; 159 continue;
162 default: 160 default:
163 return -EINVAL; 161 return -EINVAL;
@@ -189,7 +187,8 @@ static u32 seccomp_run_filters(int syscall)
189 * value always takes priority (ignoring the DATA). 187 * value always takes priority (ignoring the DATA).
190 */ 188 */
191 for (f = current->seccomp.filter; f; f = f->prev) { 189 for (f = current->seccomp.filter; f; f = f->prev) {
192 u32 cur_ret = sk_run_filter_int_seccomp(&sd, f->insnsi); 190 u32 cur_ret = SK_RUN_FILTER(f->prog, (void *)&sd);
191
193 if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) 192 if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
194 ret = cur_ret; 193 ret = cur_ret;
195 } 194 }
@@ -215,12 +214,12 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
215 return -EINVAL; 214 return -EINVAL;
216 215
217 for (filter = current->seccomp.filter; filter; filter = filter->prev) 216 for (filter = current->seccomp.filter; filter; filter = filter->prev)
218 total_insns += filter->len + 4; /* include a 4 instr penalty */ 217 total_insns += filter->prog->len + 4; /* include a 4 instr penalty */
219 if (total_insns > MAX_INSNS_PER_PATH) 218 if (total_insns > MAX_INSNS_PER_PATH)
220 return -ENOMEM; 219 return -ENOMEM;
221 220
222 /* 221 /*
223 * Installing a seccomp filter requires that the task have 222 * Installing a seccomp filter requires that the task has
224 * CAP_SYS_ADMIN in its namespace or be running with no_new_privs. 223 * CAP_SYS_ADMIN in its namespace or be running with no_new_privs.
225 * This avoids scenarios where unprivileged tasks can affect the 224 * This avoids scenarios where unprivileged tasks can affect the
226 * behavior of privileged children. 225 * behavior of privileged children.
@@ -256,19 +255,25 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
256 255
257 /* Allocate a new seccomp_filter */ 256 /* Allocate a new seccomp_filter */
258 ret = -ENOMEM; 257 ret = -ENOMEM;
259 filter = kzalloc(sizeof(struct seccomp_filter) + 258 filter = kzalloc(sizeof(struct seccomp_filter),
260 sizeof(struct sock_filter_int) * new_len,
261 GFP_KERNEL|__GFP_NOWARN); 259 GFP_KERNEL|__GFP_NOWARN);
262 if (!filter) 260 if (!filter)
263 goto free_prog; 261 goto free_prog;
264 262
265 ret = sk_convert_filter(fp, fprog->len, filter->insnsi, &new_len); 263 filter->prog = kzalloc(sk_filter_size(new_len),
266 if (ret) 264 GFP_KERNEL|__GFP_NOWARN);
265 if (!filter->prog)
267 goto free_filter; 266 goto free_filter;
267
268 ret = sk_convert_filter(fp, fprog->len, filter->prog->insnsi, &new_len);
269 if (ret)
270 goto free_filter_prog;
268 kfree(fp); 271 kfree(fp);
269 272
270 atomic_set(&filter->usage, 1); 273 atomic_set(&filter->usage, 1);
271 filter->len = new_len; 274 filter->prog->len = new_len;
275
276 sk_filter_select_runtime(filter->prog);
272 277
273 /* 278 /*
274 * If there is an existing filter, make it the prev and don't drop its 279 * If there is an existing filter, make it the prev and don't drop its
@@ -278,6 +283,8 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
278 current->seccomp.filter = filter; 283 current->seccomp.filter = filter;
279 return 0; 284 return 0;
280 285
286free_filter_prog:
287 kfree(filter->prog);
281free_filter: 288free_filter:
282 kfree(filter); 289 kfree(filter);
283free_prog: 290free_prog:
@@ -330,6 +337,7 @@ void put_seccomp_filter(struct task_struct *tsk)
330 while (orig && atomic_dec_and_test(&orig->usage)) { 337 while (orig && atomic_dec_and_test(&orig->usage)) {
331 struct seccomp_filter *freeme = orig; 338 struct seccomp_filter *freeme = orig;
332 orig = orig->prev; 339 orig = orig->prev;
340 sk_filter_free(freeme->prog);
333 kfree(freeme); 341 kfree(freeme);
334 } 342 }
335} 343}
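The raw BPF_LD/BPF_JMP/BPF_RET opcodes that seccomp_check_filter() now matches directly are the same ones userspace uses to build a classic filter. A minimal, hypothetical installer for illustration (error handling trimmed; as the comment above notes, this needs no_new_privs or CAP_SYS_ADMIN):

#include <stddef.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <linux/filter.h>
#include <linux/seccomp.h>

static int install_filter(void)
{
	struct sock_filter insns[] = {
		/* A = seccomp_data.nr (the syscall number) */
		BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
			 offsetof(struct seccomp_data, nr)),
		/* kill on ptrace(2), allow everything else */
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_ptrace, 0, 1),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog prog = {
		.len = sizeof(insns) / sizeof(insns[0]),
		.filter = insns,
	};

	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
		return -1;
	return prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
}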
diff --git a/kernel/signal.c b/kernel/signal.c
index 6ea13c09ae56..a4077e90f19f 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -277,6 +277,7 @@ void task_clear_jobctl_trapping(struct task_struct *task)
277{ 277{
278 if (unlikely(task->jobctl & JOBCTL_TRAPPING)) { 278 if (unlikely(task->jobctl & JOBCTL_TRAPPING)) {
279 task->jobctl &= ~JOBCTL_TRAPPING; 279 task->jobctl &= ~JOBCTL_TRAPPING;
280 smp_mb(); /* advised by wake_up_bit() */
280 wake_up_bit(&task->jobctl, JOBCTL_TRAPPING_BIT); 281 wake_up_bit(&task->jobctl, JOBCTL_TRAPPING_BIT);
281 } 282 }
282} 283}
@@ -705,11 +706,8 @@ void signal_wake_up_state(struct task_struct *t, unsigned int state)
705 * Returns 1 if any signals were found. 706 * Returns 1 if any signals were found.
706 * 707 *
707 * All callers must be holding the siglock. 708 * All callers must be holding the siglock.
708 *
709 * This version takes a sigset mask and looks at all signals,
710 * not just those in the first mask word.
711 */ 709 */
712static int rm_from_queue_full(sigset_t *mask, struct sigpending *s) 710static int flush_sigqueue_mask(sigset_t *mask, struct sigpending *s)
713{ 711{
714 struct sigqueue *q, *n; 712 struct sigqueue *q, *n;
715 sigset_t m; 713 sigset_t m;
@@ -727,29 +725,6 @@ static int rm_from_queue_full(sigset_t *mask, struct sigpending *s)
727 } 725 }
728 return 1; 726 return 1;
729} 727}
730/*
731 * Remove signals in mask from the pending set and queue.
732 * Returns 1 if any signals were found.
733 *
734 * All callers must be holding the siglock.
735 */
736static int rm_from_queue(unsigned long mask, struct sigpending *s)
737{
738 struct sigqueue *q, *n;
739
740 if (!sigtestsetmask(&s->signal, mask))
741 return 0;
742
743 sigdelsetmask(&s->signal, mask);
744 list_for_each_entry_safe(q, n, &s->list, list) {
745 if (q->info.si_signo < SIGRTMIN &&
746 (mask & sigmask(q->info.si_signo))) {
747 list_del_init(&q->list);
748 __sigqueue_free(q);
749 }
750 }
751 return 1;
752}
753 728
754static inline int is_si_special(const struct siginfo *info) 729static inline int is_si_special(const struct siginfo *info)
755{ 730{
@@ -861,6 +836,7 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force)
861{ 836{
862 struct signal_struct *signal = p->signal; 837 struct signal_struct *signal = p->signal;
863 struct task_struct *t; 838 struct task_struct *t;
839 sigset_t flush;
864 840
865 if (signal->flags & (SIGNAL_GROUP_EXIT | SIGNAL_GROUP_COREDUMP)) { 841 if (signal->flags & (SIGNAL_GROUP_EXIT | SIGNAL_GROUP_COREDUMP)) {
866 if (signal->flags & SIGNAL_GROUP_COREDUMP) 842 if (signal->flags & SIGNAL_GROUP_COREDUMP)
@@ -872,26 +848,25 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force)
872 /* 848 /*
873 * This is a stop signal. Remove SIGCONT from all queues. 849 * This is a stop signal. Remove SIGCONT from all queues.
874 */ 850 */
875 rm_from_queue(sigmask(SIGCONT), &signal->shared_pending); 851 siginitset(&flush, sigmask(SIGCONT));
876 t = p; 852 flush_sigqueue_mask(&flush, &signal->shared_pending);
877 do { 853 for_each_thread(p, t)
878 rm_from_queue(sigmask(SIGCONT), &t->pending); 854 flush_sigqueue_mask(&flush, &t->pending);
879 } while_each_thread(p, t);
880 } else if (sig == SIGCONT) { 855 } else if (sig == SIGCONT) {
881 unsigned int why; 856 unsigned int why;
882 /* 857 /*
883 * Remove all stop signals from all queues, wake all threads. 858 * Remove all stop signals from all queues, wake all threads.
884 */ 859 */
885 rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending); 860 siginitset(&flush, SIG_KERNEL_STOP_MASK);
886 t = p; 861 flush_sigqueue_mask(&flush, &signal->shared_pending);
887 do { 862 for_each_thread(p, t) {
863 flush_sigqueue_mask(&flush, &t->pending);
888 task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING); 864 task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING);
889 rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending);
890 if (likely(!(t->ptrace & PT_SEIZED))) 865 if (likely(!(t->ptrace & PT_SEIZED)))
891 wake_up_state(t, __TASK_STOPPED); 866 wake_up_state(t, __TASK_STOPPED);
892 else 867 else
893 ptrace_trap_notify(t); 868 ptrace_trap_notify(t);
894 } while_each_thread(p, t); 869 }
895 870
896 /* 871 /*
897 * Notify the parent with CLD_CONTINUED if we were stopped. 872 * Notify the parent with CLD_CONTINUED if we were stopped.
@@ -2854,7 +2829,7 @@ int do_sigtimedwait(const sigset_t *which, siginfo_t *info,
2854 2829
2855 spin_lock_irq(&tsk->sighand->siglock); 2830 spin_lock_irq(&tsk->sighand->siglock);
2856 __set_task_blocked(tsk, &tsk->real_blocked); 2831 __set_task_blocked(tsk, &tsk->real_blocked);
2857 siginitset(&tsk->real_blocked, 0); 2832 sigemptyset(&tsk->real_blocked);
2858 sig = dequeue_signal(tsk, &mask, info); 2833 sig = dequeue_signal(tsk, &mask, info);
2859 } 2834 }
2860 spin_unlock_irq(&tsk->sighand->siglock); 2835 spin_unlock_irq(&tsk->sighand->siglock);
@@ -3091,18 +3066,39 @@ COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo,
3091} 3066}
3092#endif 3067#endif
3093 3068
3069/*
3070 * For kthreads only, must not be used if cloned with CLONE_SIGHAND
3071 */
3072void kernel_sigaction(int sig, __sighandler_t action)
3073{
3074 spin_lock_irq(&current->sighand->siglock);
3075 current->sighand->action[sig - 1].sa.sa_handler = action;
3076 if (action == SIG_IGN) {
3077 sigset_t mask;
3078
3079 sigemptyset(&mask);
3080 sigaddset(&mask, sig);
3081
3082 flush_sigqueue_mask(&mask, &current->signal->shared_pending);
3083 flush_sigqueue_mask(&mask, &current->pending);
3084 recalc_sigpending();
3085 }
3086 spin_unlock_irq(&current->sighand->siglock);
3087}
3088EXPORT_SYMBOL(kernel_sigaction);
3089
3094int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) 3090int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
3095{ 3091{
3096 struct task_struct *t = current; 3092 struct task_struct *p = current, *t;
3097 struct k_sigaction *k; 3093 struct k_sigaction *k;
3098 sigset_t mask; 3094 sigset_t mask;
3099 3095
3100 if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig))) 3096 if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig)))
3101 return -EINVAL; 3097 return -EINVAL;
3102 3098
3103 k = &t->sighand->action[sig-1]; 3099 k = &p->sighand->action[sig-1];
3104 3100
3105 spin_lock_irq(&current->sighand->siglock); 3101 spin_lock_irq(&p->sighand->siglock);
3106 if (oact) 3102 if (oact)
3107 *oact = *k; 3103 *oact = *k;
3108 3104
@@ -3121,21 +3117,20 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
3121 * (for example, SIGCHLD), shall cause the pending signal to 3117 * (for example, SIGCHLD), shall cause the pending signal to
3122 * be discarded, whether or not it is blocked" 3118 * be discarded, whether or not it is blocked"
3123 */ 3119 */
3124 if (sig_handler_ignored(sig_handler(t, sig), sig)) { 3120 if (sig_handler_ignored(sig_handler(p, sig), sig)) {
3125 sigemptyset(&mask); 3121 sigemptyset(&mask);
3126 sigaddset(&mask, sig); 3122 sigaddset(&mask, sig);
3127 rm_from_queue_full(&mask, &t->signal->shared_pending); 3123 flush_sigqueue_mask(&mask, &p->signal->shared_pending);
3128 do { 3124 for_each_thread(p, t)
3129 rm_from_queue_full(&mask, &t->pending); 3125 flush_sigqueue_mask(&mask, &t->pending);
3130 } while_each_thread(current, t);
3131 } 3126 }
3132 } 3127 }
3133 3128
3134 spin_unlock_irq(&current->sighand->siglock); 3129 spin_unlock_irq(&p->sighand->siglock);
3135 return 0; 3130 return 0;
3136} 3131}
3137 3132
3138static int 3133static int
3139do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long sp) 3134do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long sp)
3140{ 3135{
3141 stack_t oss; 3136 stack_t oss;
@@ -3496,7 +3491,7 @@ COMPAT_SYSCALL_DEFINE3(sigaction, int, sig,
3496} 3491}
3497#endif 3492#endif
3498 3493
3499#ifdef __ARCH_WANT_SYS_SGETMASK 3494#ifdef CONFIG_SGETMASK_SYSCALL
3500 3495
3501/* 3496/*
3502 * For backwards compatibility. Functionality superseded by sigprocmask. 3497 * For backwards compatibility. Functionality superseded by sigprocmask.
@@ -3517,7 +3512,7 @@ SYSCALL_DEFINE1(ssetmask, int, newmask)
3517 3512
3518 return old; 3513 return old;
3519} 3514}
3520#endif /* __ARCH_WANT_SGETMASK */ 3515#endif /* CONFIG_SGETMASK_SYSCALL */
3521 3516
3522#ifdef __ARCH_WANT_SYS_SIGNAL 3517#ifdef __ARCH_WANT_SYS_SIGNAL
3523/* 3518/*
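kernel_sigaction() is only for kthreads whose signal handlers are not shared (no CLONE_SIGHAND); a minimal sketch of how such a thread might use it (SIGCHLD is an arbitrary choice here):

	kernel_sigaction(SIGCHLD, SIG_IGN);	/* ignore, and flush anything already queued */
	/* ... work that may generate SIGCHLD ... */
	kernel_sigaction(SIGCHLD, SIG_DFL);	/* restore the default disposition */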
diff --git a/kernel/smp.c b/kernel/smp.c
index 06d574e42c72..306f8180b0d5 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -185,14 +185,26 @@ void generic_smp_call_function_single_interrupt(void)
185{ 185{
186 struct llist_node *entry; 186 struct llist_node *entry;
187 struct call_single_data *csd, *csd_next; 187 struct call_single_data *csd, *csd_next;
188 static bool warned;
189
190 entry = llist_del_all(&__get_cpu_var(call_single_queue));
191 entry = llist_reverse_order(entry);
188 192
189 /* 193 /*
190 * Shouldn't receive this interrupt on a cpu that is not yet online. 194 * Shouldn't receive this interrupt on a cpu that is not yet online.
191 */ 195 */
192 WARN_ON_ONCE(!cpu_online(smp_processor_id())); 196 if (unlikely(!cpu_online(smp_processor_id()) && !warned)) {
197 warned = true;
198 WARN(1, "IPI on offline CPU %d\n", smp_processor_id());
193 199
194 entry = llist_del_all(&__get_cpu_var(call_single_queue)); 200 /*
195 entry = llist_reverse_order(entry); 201 * We don't have to use the _safe() variant here
202 * because we are not invoking the IPI handlers yet.
203 */
204 llist_for_each_entry(csd, entry, llist)
205 pr_warn("IPI callback %pS sent to offline CPU\n",
206 csd->func);
207 }
196 208
197 llist_for_each_entry_safe(csd, csd_next, entry, llist) { 209 llist_for_each_entry_safe(csd, csd_next, entry, llist) {
198 csd->func(csd->info); 210 csd->func(csd->info);
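The new diagnostics cover work queued through the usual cross-call helpers; a hedged sketch of the sender side that ends up in this handler (function and variable names are made up):

	static void read_remote_id(void *info)
	{
		/* runs on the target CPU, called from the IPI handler above */
		*(int *)info = smp_processor_id();
	}

	static int poke_cpu(int target_cpu)
	{
		int id = -1;

		/* queues a csd on target_cpu's call_single_queue and sends the IPI */
		if (smp_call_function_single(target_cpu, read_remote_id, &id, 1))
			return -ENXIO;	/* e.g. CPU not online */
		return id;
	}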
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 33e4648ae0e7..5918d227730f 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -223,7 +223,7 @@ static inline bool lockdep_softirq_start(void) { return false; }
223static inline void lockdep_softirq_end(bool in_hardirq) { } 223static inline void lockdep_softirq_end(bool in_hardirq) { }
224#endif 224#endif
225 225
226asmlinkage void __do_softirq(void) 226asmlinkage __visible void __do_softirq(void)
227{ 227{
228 unsigned long end = jiffies + MAX_SOFTIRQ_TIME; 228 unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
229 unsigned long old_flags = current->flags; 229 unsigned long old_flags = current->flags;
@@ -232,7 +232,6 @@ asmlinkage void __do_softirq(void)
232 bool in_hardirq; 232 bool in_hardirq;
233 __u32 pending; 233 __u32 pending;
234 int softirq_bit; 234 int softirq_bit;
235 int cpu;
236 235
237 /* 236 /*
238 * Mask out PF_MEMALLOC s current task context is borrowed for the 237 * Mask out PF_MEMALLOC s current task context is borrowed for the
@@ -247,7 +246,6 @@ asmlinkage void __do_softirq(void)
247 __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET); 246 __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET);
248 in_hardirq = lockdep_softirq_start(); 247 in_hardirq = lockdep_softirq_start();
249 248
250 cpu = smp_processor_id();
251restart: 249restart:
252 /* Reset the pending bitmask before enabling irqs */ 250 /* Reset the pending bitmask before enabling irqs */
253 set_softirq_pending(0); 251 set_softirq_pending(0);
@@ -276,11 +274,11 @@ restart:
276 prev_count, preempt_count()); 274 prev_count, preempt_count());
277 preempt_count_set(prev_count); 275 preempt_count_set(prev_count);
278 } 276 }
279 rcu_bh_qs(cpu);
280 h++; 277 h++;
281 pending >>= softirq_bit; 278 pending >>= softirq_bit;
282 } 279 }
283 280
281 rcu_bh_qs(smp_processor_id());
284 local_irq_disable(); 282 local_irq_disable();
285 283
286 pending = local_softirq_pending(); 284 pending = local_softirq_pending();
@@ -299,7 +297,7 @@ restart:
299 tsk_restore_flags(current, old_flags, PF_MEMALLOC); 297 tsk_restore_flags(current, old_flags, PF_MEMALLOC);
300} 298}
301 299
302asmlinkage void do_softirq(void) 300asmlinkage __visible void do_softirq(void)
303{ 301{
304 __u32 pending; 302 __u32 pending;
305 unsigned long flags; 303 unsigned long flags;
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 01fbae5b97b7..695f0c6cd169 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -307,6 +307,7 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
307 * @cpu: cpu to stop 307 * @cpu: cpu to stop
308 * @fn: function to execute 308 * @fn: function to execute
309 * @arg: argument to @fn 309 * @arg: argument to @fn
310 * @work_buf: pointer to cpu_stop_work structure
310 * 311 *
311 * Similar to stop_one_cpu() but doesn't wait for completion. The 312 * Similar to stop_one_cpu() but doesn't wait for completion. The
312 * caller is responsible for ensuring @work_buf is currently unused 313 * caller is responsible for ensuring @work_buf is currently unused
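A sketch of the documented calling convention, under the assumption that the caller keeps @work_buf untouched until the callback has run (callback and buffer names are made up):

	static int my_stop_fn(void *arg)
	{
		/* runs in the target CPU's stopper thread context */
		return 0;
	}

	static struct cpu_stop_work my_stop_work;

	/* fire and forget; nothing waits for completion */
	stop_one_cpu_nowait(cpu, my_stop_fn, NULL, &my_stop_work);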
diff --git a/kernel/sys.c b/kernel/sys.c
index fba0f29401ea..66a751ebf9d9 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -250,7 +250,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
250 else 250 else
251 p = current; 251 p = current;
252 if (p) { 252 if (p) {
253 niceval = 20 - task_nice(p); 253 niceval = nice_to_rlimit(task_nice(p));
254 if (niceval > retval) 254 if (niceval > retval)
255 retval = niceval; 255 retval = niceval;
256 } 256 }
@@ -261,7 +261,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
261 else 261 else
262 pgrp = task_pgrp(current); 262 pgrp = task_pgrp(current);
263 do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { 263 do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
264 niceval = 20 - task_nice(p); 264 niceval = nice_to_rlimit(task_nice(p));
265 if (niceval > retval) 265 if (niceval > retval)
266 retval = niceval; 266 retval = niceval;
267 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); 267 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
@@ -277,7 +277,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
277 277
278 do_each_thread(g, p) { 278 do_each_thread(g, p) {
279 if (uid_eq(task_uid(p), uid)) { 279 if (uid_eq(task_uid(p), uid)) {
280 niceval = 20 - task_nice(p); 280 niceval = nice_to_rlimit(task_nice(p));
281 if (niceval > retval) 281 if (niceval > retval)
282 retval = niceval; 282 retval = niceval;
283 } 283 }
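nice_to_rlimit() is just the named form of the open-coded conversion it replaces: it maps the nice range [-20, 19] onto the RLIMIT_NICE-style range [1, 40], i.e. 20 - nice. A few spot values:

	nice_to_rlimit(-20);	/* -> 40, strongest boost */
	nice_to_rlimit(0);	/* -> 20, the default */
	nice_to_rlimit(19);	/* ->  1, weakest */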
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index bc8d1b74a6b9..36441b51b5df 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -135,6 +135,8 @@ cond_syscall(sys_setresgid16);
135cond_syscall(sys_setresuid16); 135cond_syscall(sys_setresuid16);
136cond_syscall(sys_setreuid16); 136cond_syscall(sys_setreuid16);
137cond_syscall(sys_setuid16); 137cond_syscall(sys_setuid16);
138cond_syscall(sys_sgetmask);
139cond_syscall(sys_ssetmask);
138cond_syscall(sys_vm86old); 140cond_syscall(sys_vm86old);
139cond_syscall(sys_vm86); 141cond_syscall(sys_vm86);
140cond_syscall(sys_ipc); 142cond_syscall(sys_ipc);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 74f5b580fe34..ba9ed453c4ed 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -173,6 +173,13 @@ extern int no_unaligned_warning;
173#endif 173#endif
174 174
175#ifdef CONFIG_PROC_SYSCTL 175#ifdef CONFIG_PROC_SYSCTL
176
177#define SYSCTL_WRITES_LEGACY -1
178#define SYSCTL_WRITES_WARN 0
179#define SYSCTL_WRITES_STRICT 1
180
181static int sysctl_writes_strict = SYSCTL_WRITES_WARN;
182
176static int proc_do_cad_pid(struct ctl_table *table, int write, 183static int proc_do_cad_pid(struct ctl_table *table, int write,
177 void __user *buffer, size_t *lenp, loff_t *ppos); 184 void __user *buffer, size_t *lenp, loff_t *ppos);
178static int proc_taint(struct ctl_table *table, int write, 185static int proc_taint(struct ctl_table *table, int write,
@@ -195,7 +202,7 @@ static int proc_dostring_coredump(struct ctl_table *table, int write,
195/* Note: sysrq code uses it's own private copy */ 202/* Note: sysrq code uses it's own private copy */
196static int __sysrq_enabled = CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE; 203static int __sysrq_enabled = CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE;
197 204
198static int sysrq_sysctl_handler(ctl_table *table, int write, 205static int sysrq_sysctl_handler(struct ctl_table *table, int write,
199 void __user *buffer, size_t *lenp, 206 void __user *buffer, size_t *lenp,
200 loff_t *ppos) 207 loff_t *ppos)
201{ 208{
@@ -495,6 +502,15 @@ static struct ctl_table kern_table[] = {
495 .mode = 0644, 502 .mode = 0644,
496 .proc_handler = proc_taint, 503 .proc_handler = proc_taint,
497 }, 504 },
505 {
506 .procname = "sysctl_writes_strict",
507 .data = &sysctl_writes_strict,
508 .maxlen = sizeof(int),
509 .mode = 0644,
510 .proc_handler = proc_dointvec_minmax,
511 .extra1 = &neg_one,
512 .extra2 = &one,
513 },
498#endif 514#endif
499#ifdef CONFIG_LATENCYTOP 515#ifdef CONFIG_LATENCYTOP
500 { 516 {
@@ -643,7 +659,7 @@ static struct ctl_table kern_table[] = {
643 .extra2 = &one, 659 .extra2 = &one,
644 }, 660 },
645#endif 661#endif
646 662#ifdef CONFIG_UEVENT_HELPER
647 { 663 {
648 .procname = "hotplug", 664 .procname = "hotplug",
649 .data = &uevent_helper, 665 .data = &uevent_helper,
@@ -651,7 +667,7 @@ static struct ctl_table kern_table[] = {
651 .mode = 0644, 667 .mode = 0644,
652 .proc_handler = proc_dostring, 668 .proc_handler = proc_dostring,
653 }, 669 },
654 670#endif
655#ifdef CONFIG_CHR_DEV_SG 671#ifdef CONFIG_CHR_DEV_SG
656 { 672 {
657 .procname = "sg-big-buff", 673 .procname = "sg-big-buff",
@@ -1418,8 +1434,13 @@ static struct ctl_table vm_table[] = {
1418 (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) 1434 (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL))
1419 { 1435 {
1420 .procname = "vdso_enabled", 1436 .procname = "vdso_enabled",
1437#ifdef CONFIG_X86_32
1438 .data = &vdso32_enabled,
1439 .maxlen = sizeof(vdso32_enabled),
1440#else
1421 .data = &vdso_enabled, 1441 .data = &vdso_enabled,
1422 .maxlen = sizeof(vdso_enabled), 1442 .maxlen = sizeof(vdso_enabled),
1443#endif
1423 .mode = 0644, 1444 .mode = 0644,
1424 .proc_handler = proc_dointvec, 1445 .proc_handler = proc_dointvec,
1425 .extra1 = &zero, 1446 .extra1 = &zero,
@@ -1698,8 +1719,8 @@ int __init sysctl_init(void)
1698 1719
1699#ifdef CONFIG_PROC_SYSCTL 1720#ifdef CONFIG_PROC_SYSCTL
1700 1721
1701static int _proc_do_string(void* data, int maxlen, int write, 1722static int _proc_do_string(char *data, int maxlen, int write,
1702 void __user *buffer, 1723 char __user *buffer,
1703 size_t *lenp, loff_t *ppos) 1724 size_t *lenp, loff_t *ppos)
1704{ 1725{
1705 size_t len; 1726 size_t len;
@@ -1712,21 +1733,30 @@ static int _proc_do_string(void* data, int maxlen, int write,
1712 } 1733 }
1713 1734
1714 if (write) { 1735 if (write) {
1715 len = 0; 1736 if (sysctl_writes_strict == SYSCTL_WRITES_STRICT) {
1737 /* Only continue writes not past the end of buffer. */
1738 len = strlen(data);
1739 if (len > maxlen - 1)
1740 len = maxlen - 1;
1741
1742 if (*ppos > len)
1743 return 0;
1744 len = *ppos;
1745 } else {
1746 /* Start writing from beginning of buffer. */
1747 len = 0;
1748 }
1749
1750 *ppos += *lenp;
1716 p = buffer; 1751 p = buffer;
1717 while (len < *lenp) { 1752 while ((p - buffer) < *lenp && len < maxlen - 1) {
1718 if (get_user(c, p++)) 1753 if (get_user(c, p++))
1719 return -EFAULT; 1754 return -EFAULT;
1720 if (c == 0 || c == '\n') 1755 if (c == 0 || c == '\n')
1721 break; 1756 break;
1722 len++; 1757 data[len++] = c;
1723 } 1758 }
1724 if (len >= maxlen) 1759 data[len] = 0;
1725 len = maxlen-1;
1726 if(copy_from_user(data, buffer, len))
1727 return -EFAULT;
1728 ((char *) data)[len] = 0;
1729 *ppos += *lenp;
1730 } else { 1760 } else {
1731 len = strlen(data); 1761 len = strlen(data);
1732 if (len > maxlen) 1762 if (len > maxlen)
@@ -1743,10 +1773,10 @@ static int _proc_do_string(void* data, int maxlen, int write,
1743 if (len > *lenp) 1773 if (len > *lenp)
1744 len = *lenp; 1774 len = *lenp;
1745 if (len) 1775 if (len)
1746 if(copy_to_user(buffer, data, len)) 1776 if (copy_to_user(buffer, data, len))
1747 return -EFAULT; 1777 return -EFAULT;
1748 if (len < *lenp) { 1778 if (len < *lenp) {
1749 if(put_user('\n', ((char __user *) buffer) + len)) 1779 if (put_user('\n', buffer + len))
1750 return -EFAULT; 1780 return -EFAULT;
1751 len++; 1781 len++;
1752 } 1782 }
@@ -1756,6 +1786,14 @@ static int _proc_do_string(void* data, int maxlen, int write,
1756 return 0; 1786 return 0;
1757} 1787}
1758 1788
1789static void warn_sysctl_write(struct ctl_table *table)
1790{
1791 pr_warn_once("%s wrote to %s when file position was not 0!\n"
1792 "This will not be supported in the future. To silence this\n"
1793 "warning, set kernel.sysctl_writes_strict = -1\n",
1794 current->comm, table->procname);
1795}
1796
1759/** 1797/**
1760 * proc_dostring - read a string sysctl 1798 * proc_dostring - read a string sysctl
1761 * @table: the sysctl table 1799 * @table: the sysctl table
@@ -1776,8 +1814,11 @@ static int _proc_do_string(void* data, int maxlen, int write,
1776int proc_dostring(struct ctl_table *table, int write, 1814int proc_dostring(struct ctl_table *table, int write,
1777 void __user *buffer, size_t *lenp, loff_t *ppos) 1815 void __user *buffer, size_t *lenp, loff_t *ppos)
1778{ 1816{
1779 return _proc_do_string(table->data, table->maxlen, write, 1817 if (write && *ppos && sysctl_writes_strict == SYSCTL_WRITES_WARN)
1780 buffer, lenp, ppos); 1818 warn_sysctl_write(table);
1819
1820 return _proc_do_string((char *)(table->data), table->maxlen, write,
1821 (char __user *)buffer, lenp, ppos);
1781} 1822}
1782 1823
1783static size_t proc_skip_spaces(char **buf) 1824static size_t proc_skip_spaces(char **buf)
@@ -1951,6 +1992,18 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
1951 conv = do_proc_dointvec_conv; 1992 conv = do_proc_dointvec_conv;
1952 1993
1953 if (write) { 1994 if (write) {
1995 if (*ppos) {
1996 switch (sysctl_writes_strict) {
1997 case SYSCTL_WRITES_STRICT:
1998 goto out;
1999 case SYSCTL_WRITES_WARN:
2000 warn_sysctl_write(table);
2001 break;
2002 default:
2003 break;
2004 }
2005 }
2006
1954 if (left > PAGE_SIZE - 1) 2007 if (left > PAGE_SIZE - 1)
1955 left = PAGE_SIZE - 1; 2008 left = PAGE_SIZE - 1;
1956 page = __get_free_page(GFP_TEMPORARY); 2009 page = __get_free_page(GFP_TEMPORARY);
@@ -2008,6 +2061,7 @@ free:
2008 return err ? : -EINVAL; 2061 return err ? : -EINVAL;
2009 } 2062 }
2010 *lenp -= left; 2063 *lenp -= left;
2064out:
2011 *ppos += *lenp; 2065 *ppos += *lenp;
2012 return err; 2066 return err;
2013} 2067}
@@ -2200,6 +2254,18 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
2200 left = *lenp; 2254 left = *lenp;
2201 2255
2202 if (write) { 2256 if (write) {
2257 if (*ppos) {
2258 switch (sysctl_writes_strict) {
2259 case SYSCTL_WRITES_STRICT:
2260 goto out;
2261 case SYSCTL_WRITES_WARN:
2262 warn_sysctl_write(table);
2263 break;
2264 default:
2265 break;
2266 }
2267 }
2268
2203 if (left > PAGE_SIZE - 1) 2269 if (left > PAGE_SIZE - 1)
2204 left = PAGE_SIZE - 1; 2270 left = PAGE_SIZE - 1;
2205 page = __get_free_page(GFP_TEMPORARY); 2271 page = __get_free_page(GFP_TEMPORARY);
@@ -2255,6 +2321,7 @@ free:
2255 return err ? : -EINVAL; 2321 return err ? : -EINVAL;
2256 } 2322 }
2257 *lenp -= left; 2323 *lenp -= left;
2324out:
2258 *ppos += *lenp; 2325 *ppos += *lenp;
2259 return err; 2326 return err;
2260} 2327}
@@ -2501,11 +2568,11 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
2501 bool first = 1; 2568 bool first = 1;
2502 size_t left = *lenp; 2569 size_t left = *lenp;
2503 unsigned long bitmap_len = table->maxlen; 2570 unsigned long bitmap_len = table->maxlen;
2504 unsigned long *bitmap = (unsigned long *) table->data; 2571 unsigned long *bitmap = *(unsigned long **) table->data;
2505 unsigned long *tmp_bitmap = NULL; 2572 unsigned long *tmp_bitmap = NULL;
2506 char tr_a[] = { '-', ',', '\n' }, tr_b[] = { ',', '\n', 0 }, c; 2573 char tr_a[] = { '-', ',', '\n' }, tr_b[] = { ',', '\n', 0 }, c;
2507 2574
2508 if (!bitmap_len || !left || (*ppos && !write)) { 2575 if (!bitmap || !bitmap_len || !left || (*ppos && !write)) {
2509 *lenp = 0; 2576 *lenp = 0;
2510 return 0; 2577 return 0;
2511 } 2578 }
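From userspace the new kernel.sysctl_writes_strict knob only changes behaviour for writes at a non-zero file offset; a hypothetical demonstration (the chosen /proc path is just an example):

#include <fcntl.h>
#include <unistd.h>

int demo_offset_write(void)
{
	int fd = open("/proc/sys/kernel/domainname", O_WRONLY);

	if (fd < 0)
		return -1;
	/*
	 * Offset 2 != 0: with the default setting (0) this now triggers the
	 * pr_warn_once() above; with 1 a string write continues at that
	 * offset (numeric tables ignore it); with -1 the old
	 * rewind-to-start behaviour is kept.
	 */
	pwrite(fd, "xy", 2, 2);
	return close(fd);
}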
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 419a52cecd20..33db43a39515 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -165,21 +165,21 @@ static inline void pps_set_freq(s64 freq)
165 165
166static inline int is_error_status(int status) 166static inline int is_error_status(int status)
167{ 167{
168 return (time_status & (STA_UNSYNC|STA_CLOCKERR)) 168 return (status & (STA_UNSYNC|STA_CLOCKERR))
169 /* PPS signal lost when either PPS time or 169 /* PPS signal lost when either PPS time or
170 * PPS frequency synchronization requested 170 * PPS frequency synchronization requested
171 */ 171 */
172 || ((time_status & (STA_PPSFREQ|STA_PPSTIME)) 172 || ((status & (STA_PPSFREQ|STA_PPSTIME))
173 && !(time_status & STA_PPSSIGNAL)) 173 && !(status & STA_PPSSIGNAL))
174 /* PPS jitter exceeded when 174 /* PPS jitter exceeded when
175 * PPS time synchronization requested */ 175 * PPS time synchronization requested */
176 || ((time_status & (STA_PPSTIME|STA_PPSJITTER)) 176 || ((status & (STA_PPSTIME|STA_PPSJITTER))
177 == (STA_PPSTIME|STA_PPSJITTER)) 177 == (STA_PPSTIME|STA_PPSJITTER))
178 /* PPS wander exceeded or calibration error when 178 /* PPS wander exceeded or calibration error when
179 * PPS frequency synchronization requested 179 * PPS frequency synchronization requested
180 */ 180 */
181 || ((time_status & STA_PPSFREQ) 181 || ((status & STA_PPSFREQ)
182 && (time_status & (STA_PPSWANDER|STA_PPSERROR))); 182 && (status & (STA_PPSWANDER|STA_PPSERROR)));
183} 183}
184 184
185static inline void pps_fill_timex(struct timex *txc) 185static inline void pps_fill_timex(struct timex *txc)
@@ -786,8 +786,9 @@ static long hardpps_update_freq(struct pps_normtime freq_norm)
786 time_status |= STA_PPSERROR; 786 time_status |= STA_PPSERROR;
787 pps_errcnt++; 787 pps_errcnt++;
788 pps_dec_freq_interval(); 788 pps_dec_freq_interval();
789 pr_err("hardpps: PPSERROR: interval too long - %ld s\n", 789 printk_deferred(KERN_ERR
790 freq_norm.sec); 790 "hardpps: PPSERROR: interval too long - %ld s\n",
791 freq_norm.sec);
791 return 0; 792 return 0;
792 } 793 }
793 794
@@ -800,7 +801,8 @@ static long hardpps_update_freq(struct pps_normtime freq_norm)
800 delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT); 801 delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT);
801 pps_freq = ftemp; 802 pps_freq = ftemp;
802 if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) { 803 if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) {
803 pr_warning("hardpps: PPSWANDER: change=%ld\n", delta); 804 printk_deferred(KERN_WARNING
805 "hardpps: PPSWANDER: change=%ld\n", delta);
804 time_status |= STA_PPSWANDER; 806 time_status |= STA_PPSWANDER;
805 pps_stbcnt++; 807 pps_stbcnt++;
806 pps_dec_freq_interval(); 808 pps_dec_freq_interval();
@@ -844,8 +846,9 @@ static void hardpps_update_phase(long error)
844 * the time offset is updated. 846 * the time offset is updated.
845 */ 847 */
846 if (jitter > (pps_jitter << PPS_POPCORN)) { 848 if (jitter > (pps_jitter << PPS_POPCORN)) {
847 pr_warning("hardpps: PPSJITTER: jitter=%ld, limit=%ld\n", 849 printk_deferred(KERN_WARNING
848 jitter, (pps_jitter << PPS_POPCORN)); 850 "hardpps: PPSJITTER: jitter=%ld, limit=%ld\n",
851 jitter, (pps_jitter << PPS_POPCORN));
849 time_status |= STA_PPSJITTER; 852 time_status |= STA_PPSJITTER;
850 pps_jitcnt++; 853 pps_jitcnt++;
851 } else if (time_status & STA_PPSTIME) { 854 } else if (time_status & STA_PPSTIME) {
@@ -902,7 +905,7 @@ void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
902 time_status |= STA_PPSJITTER; 905 time_status |= STA_PPSJITTER;
903 /* restart the frequency calibration interval */ 906 /* restart the frequency calibration interval */
904 pps_fbase = *raw_ts; 907 pps_fbase = *raw_ts;
905 pr_err("hardpps: PPSJITTER: bad pulse\n"); 908 printk_deferred(KERN_ERR "hardpps: PPSJITTER: bad pulse\n");
906 return; 909 return;
907 } 910 }
908 911
@@ -923,7 +926,10 @@ void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
923 926
924static int __init ntp_tick_adj_setup(char *str) 927static int __init ntp_tick_adj_setup(char *str)
925{ 928{
926 ntp_tick_adj = simple_strtol(str, NULL, 0); 929 int rc = kstrtol(str, 0, (long *)&ntp_tick_adj);
930
931 if (rc)
932 return rc;
927 ntp_tick_adj <<= NTP_SCALE_SHIFT; 933 ntp_tick_adj <<= NTP_SCALE_SHIFT;
928 934
929 return 1; 935 return 1;
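Unlike simple_strtol(), kstrtol() reports malformed or out-of-range input instead of silently returning a partial value; the usual calling pattern, for reference:

	long val;
	int err = kstrtol(str, 0, &val);	/* base 0 accepts decimal, octal and hex */

	if (err)
		return err;			/* -EINVAL or -ERANGE */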
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 4d23dc4d8139..445106d2c729 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -49,13 +49,6 @@ static u64 notrace jiffy_sched_clock_read(void)
49 return (u64)(jiffies - INITIAL_JIFFIES); 49 return (u64)(jiffies - INITIAL_JIFFIES);
50} 50}
51 51
52static u32 __read_mostly (*read_sched_clock_32)(void);
53
54static u64 notrace read_sched_clock_32_wrapper(void)
55{
56 return read_sched_clock_32();
57}
58
59static u64 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read; 52static u64 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read;
60 53
61static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) 54static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
@@ -176,12 +169,6 @@ void __init sched_clock_register(u64 (*read)(void), int bits,
176 pr_debug("Registered %pF as sched_clock source\n", read); 169 pr_debug("Registered %pF as sched_clock source\n", read);
177} 170}
178 171
179void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate)
180{
181 read_sched_clock_32 = read;
182 sched_clock_register(read_sched_clock_32_wrapper, bits, rate);
183}
184
185void __init sched_clock_postinit(void) 172void __init sched_clock_postinit(void)
186{ 173{
187 /* 174 /*
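With the 32-bit setup_sched_clock() wrapper gone, drivers hand a 64-bit read callback straight to sched_clock_register(); a hypothetical example for a 56-bit counter ticking at 24 MHz (register layout and base pointer are made up, hi/lo read tearing is ignored for brevity):

	static void __iomem *my_timer_base;

	static u64 notrace my_sched_clock_read(void)
	{
		u64 lo = readl_relaxed(my_timer_base + 0x00);	/* COUNTER_LO */
		u64 hi = readl_relaxed(my_timer_base + 0x04);	/* COUNTER_HI */

		return (hi << 32) | lo;
	}

	sched_clock_register(my_sched_clock_read, 56, 24000000);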
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index f7df8ea21707..32d8d6aaedb8 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -852,8 +852,9 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
852 struct timespec *delta) 852 struct timespec *delta)
853{ 853{
854 if (!timespec_valid_strict(delta)) { 854 if (!timespec_valid_strict(delta)) {
855 printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid " 855 printk_deferred(KERN_WARNING
856 "sleep delta value!\n"); 856 "__timekeeping_inject_sleeptime: Invalid "
857 "sleep delta value!\n");
857 return; 858 return;
858 } 859 }
859 tk_xtime_add(tk, delta); 860 tk_xtime_add(tk, delta);
@@ -1157,7 +1158,7 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
1157 1158
1158 if (unlikely(tk->clock->maxadj && 1159 if (unlikely(tk->clock->maxadj &&
1159 (tk->mult + adj > tk->clock->mult + tk->clock->maxadj))) { 1160 (tk->mult + adj > tk->clock->mult + tk->clock->maxadj))) {
1160 printk_once(KERN_WARNING 1161 printk_deferred_once(KERN_WARNING
1161 "Adjusting %s more than 11%% (%ld vs %ld)\n", 1162 "Adjusting %s more than 11%% (%ld vs %ld)\n",
1162 tk->clock->name, (long)tk->mult + adj, 1163 tk->clock->name, (long)tk->mult + adj,
1163 (long)tk->clock->mult + tk->clock->maxadj); 1164 (long)tk->clock->mult + tk->clock->maxadj);
diff --git a/kernel/torture.c b/kernel/torture.c
index acc9afc2f26e..40bb511cca48 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -335,13 +335,8 @@ static void torture_shuffle_tasks(void)
335 shuffle_idle_cpu = cpumask_next(shuffle_idle_cpu, shuffle_tmp_mask); 335 shuffle_idle_cpu = cpumask_next(shuffle_idle_cpu, shuffle_tmp_mask);
336 if (shuffle_idle_cpu >= nr_cpu_ids) 336 if (shuffle_idle_cpu >= nr_cpu_ids)
337 shuffle_idle_cpu = -1; 337 shuffle_idle_cpu = -1;
338 if (shuffle_idle_cpu != -1) { 338 else
339 cpumask_clear_cpu(shuffle_idle_cpu, shuffle_tmp_mask); 339 cpumask_clear_cpu(shuffle_idle_cpu, shuffle_tmp_mask);
340 if (cpumask_empty(shuffle_tmp_mask)) {
341 put_online_cpus();
342 return;
343 }
344 }
345 340
346 mutex_lock(&shuffle_task_mutex); 341 mutex_lock(&shuffle_task_mutex);
347 list_for_each_entry(stp, &shuffle_task_list, st_l) 342 list_for_each_entry(stp, &shuffle_task_list, st_l)
@@ -533,7 +528,11 @@ void stutter_wait(const char *title)
533 while (ACCESS_ONCE(stutter_pause_test) || 528 while (ACCESS_ONCE(stutter_pause_test) ||
534 (torture_runnable && !ACCESS_ONCE(*torture_runnable))) { 529 (torture_runnable && !ACCESS_ONCE(*torture_runnable))) {
535 if (stutter_pause_test) 530 if (stutter_pause_test)
536 schedule_timeout_interruptible(1); 531 if (ACCESS_ONCE(stutter_pause_test) == 1)
532 schedule_timeout_interruptible(1);
533 else
534 while (ACCESS_ONCE(stutter_pause_test))
535 cond_resched();
537 else 536 else
538 schedule_timeout_interruptible(round_jiffies_relative(HZ)); 537 schedule_timeout_interruptible(round_jiffies_relative(HZ));
539 torture_shutdown_absorb(title); 538 torture_shutdown_absorb(title);
@@ -550,7 +549,11 @@ static int torture_stutter(void *arg)
550 VERBOSE_TOROUT_STRING("torture_stutter task started"); 549 VERBOSE_TOROUT_STRING("torture_stutter task started");
551 do { 550 do {
552 if (!torture_must_stop()) { 551 if (!torture_must_stop()) {
553 schedule_timeout_interruptible(stutter); 552 if (stutter > 1) {
553 schedule_timeout_interruptible(stutter - 1);
554 ACCESS_ONCE(stutter_pause_test) = 2;
555 }
556 schedule_timeout_interruptible(1);
554 ACCESS_ONCE(stutter_pause_test) = 1; 557 ACCESS_ONCE(stutter_pause_test) = 1;
555 } 558 }
556 if (!torture_must_stop()) 559 if (!torture_must_stop())
@@ -596,21 +599,27 @@ static void torture_stutter_cleanup(void)
596 * The runnable parameter points to a flag that controls whether or not 599 * The runnable parameter points to a flag that controls whether or not
597 * the test is currently runnable. If there is no such flag, pass in NULL. 600 * the test is currently runnable. If there is no such flag, pass in NULL.
598 */ 601 */
599void __init torture_init_begin(char *ttype, bool v, int *runnable) 602bool torture_init_begin(char *ttype, bool v, int *runnable)
600{ 603{
601 mutex_lock(&fullstop_mutex); 604 mutex_lock(&fullstop_mutex);
605 if (torture_type != NULL) {
606 pr_alert("torture_init_begin: refusing %s init: %s running",
607 ttype, torture_type);
608 mutex_unlock(&fullstop_mutex);
609 return false;
610 }
602 torture_type = ttype; 611 torture_type = ttype;
603 verbose = v; 612 verbose = v;
604 torture_runnable = runnable; 613 torture_runnable = runnable;
605 fullstop = FULLSTOP_DONTSTOP; 614 fullstop = FULLSTOP_DONTSTOP;
606 615 return true;
607} 616}
608EXPORT_SYMBOL_GPL(torture_init_begin); 617EXPORT_SYMBOL_GPL(torture_init_begin);
609 618
610/* 619/*
611 * Tell the torture module that initialization is complete. 620 * Tell the torture module that initialization is complete.
612 */ 621 */
613void __init torture_init_end(void) 622void torture_init_end(void)
614{ 623{
615 mutex_unlock(&fullstop_mutex); 624 mutex_unlock(&fullstop_mutex);
616 register_reboot_notifier(&torture_shutdown_nb); 625 register_reboot_notifier(&torture_shutdown_nb);
@@ -642,6 +651,9 @@ bool torture_cleanup(void)
642 torture_shuffle_cleanup(); 651 torture_shuffle_cleanup();
643 torture_stutter_cleanup(); 652 torture_stutter_cleanup();
644 torture_onoff_cleanup(); 653 torture_onoff_cleanup();
654 mutex_lock(&fullstop_mutex);
655 torture_type = NULL;
656 mutex_unlock(&fullstop_mutex);
645 return false; 657 return false;
646} 658}
647EXPORT_SYMBOL_GPL(torture_cleanup); 659EXPORT_SYMBOL_GPL(torture_cleanup);
@@ -674,8 +686,10 @@ EXPORT_SYMBOL_GPL(torture_must_stop_irq);
674 */ 686 */
675void torture_kthread_stopping(char *title) 687void torture_kthread_stopping(char *title)
676{ 688{
677 if (verbose) 689 char buf[128];
678 VERBOSE_TOROUT_STRING(title); 690
691 snprintf(buf, sizeof(buf), "Stopping %s", title);
692 VERBOSE_TOROUT_STRING(buf);
679 while (!kthread_should_stop()) { 693 while (!kthread_should_stop()) {
680 torture_shutdown_absorb(title); 694 torture_shutdown_absorb(title);
681 schedule_timeout_uninterruptible(1); 695 schedule_timeout_uninterruptible(1);
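Since torture_init_begin() can now refuse to run while another torture test owns the framework, callers are expected to check its result; a sketch of a hypothetical module's init (verbose and mytorture_runnable would be module parameters):

	static int __init mytorture_init(void)
	{
		if (!torture_init_begin("mytorture", verbose, &mytorture_runnable))
			return -EBUSY;

		/* ... register callbacks, spawn kthreads ... */

		torture_init_end();
		return 0;
	}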
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 8639819f6cef..d4409356f40d 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -535,6 +535,36 @@ config MMIOTRACE_TEST
535 535
536 Say N, unless you absolutely know what you are doing. 536 Say N, unless you absolutely know what you are doing.
537 537
538config TRACEPOINT_BENCHMARK
539 bool "Add tracepoint that benchmarks tracepoints"
540 help
541 This option creates the tracepoint "benchmark:benchmark_event".
542 When the tracepoint is enabled, it kicks off a kernel thread that
543	  goes into an infinite loop (calling cond_resched() to let other tasks
544 run), and calls the tracepoint. Each iteration will record the time
545	  it took to write to the tracepoint, and on the next iteration that
546 data will be passed to the tracepoint itself. That is, the tracepoint
547 will report the time it took to do the previous tracepoint.
548 The string written to the tracepoint is a static string of 128 bytes
549 to keep the time the same. The initial string is simply a write of
550 "START". The second string records the cold cache time of the first
551 write which is not added to the rest of the calculations.
552
553 As it is a tight loop, it benchmarks as hot cache. That's fine because
554 we care most about hot paths that are probably in cache already.
555
556 An example of the output:
557
558 START
559 first=3672 [COLD CACHED]
560 last=632 first=3672 max=632 min=632 avg=316 std=446 std^2=199712
561 last=278 first=3672 max=632 min=278 avg=303 std=316 std^2=100337
562 last=277 first=3672 max=632 min=277 avg=296 std=258 std^2=67064
563 last=273 first=3672 max=632 min=273 avg=292 std=224 std^2=50411
564 last=273 first=3672 max=632 min=273 avg=288 std=200 std^2=40389
565 last=281 first=3672 max=632 min=273 avg=287 std=183 std^2=33666
566
567
538config RING_BUFFER_BENCHMARK 568config RING_BUFFER_BENCHMARK
539 tristate "Ring buffer benchmark stress tester" 569 tristate "Ring buffer benchmark stress tester"
540 depends on RING_BUFFER 570 depends on RING_BUFFER
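The avg/std/std^2 columns in the sample output above are plain running statistics; roughly the per-iteration bookkeeping involved (illustrative only, not the actual trace_benchmark.c code, variable names are made up):

	u64 delta = stop - start;		/* ns spent writing the event */

	bm_total += delta;
	bm_totalsq += delta * delta;
	bm_cnt++;
	if (delta > bm_max)
		bm_max = delta;
	if (!bm_min || delta < bm_min)
		bm_min = delta;
	bm_avg = div64_u64(bm_total, bm_cnt);
	/* variance = E[x^2] - E[x]^2; the reported "std" is its square root */
	bm_std2 = div64_u64(bm_totalsq, bm_cnt) - bm_avg * bm_avg;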
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 1378e84fbe39..2611613f14f1 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -17,6 +17,7 @@ ifdef CONFIG_TRACING_BRANCHES
17KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING 17KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
18endif 18endif
19 19
20CFLAGS_trace_benchmark.o := -I$(src)
20CFLAGS_trace_events_filter.o := -I$(src) 21CFLAGS_trace_events_filter.o := -I$(src)
21 22
22obj-$(CONFIG_TRACE_CLOCK) += trace_clock.o 23obj-$(CONFIG_TRACE_CLOCK) += trace_clock.o
@@ -62,4 +63,6 @@ endif
62obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o 63obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o
63obj-$(CONFIG_UPROBE_EVENT) += trace_uprobe.o 64obj-$(CONFIG_UPROBE_EVENT) += trace_uprobe.o
64 65
66obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o
67
65libftrace-y := ftrace.o 68libftrace-y := ftrace.o
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 4a54a25afa2f..5b372e3ed675 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -62,7 +62,7 @@
62#define FTRACE_HASH_DEFAULT_BITS 10 62#define FTRACE_HASH_DEFAULT_BITS 10
63#define FTRACE_HASH_MAX_BITS 12 63#define FTRACE_HASH_MAX_BITS 12
64 64
65#define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_CONTROL) 65#define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_CONTROL)
66 66
67#ifdef CONFIG_DYNAMIC_FTRACE 67#ifdef CONFIG_DYNAMIC_FTRACE
68#define INIT_REGEX_LOCK(opsname) \ 68#define INIT_REGEX_LOCK(opsname) \
@@ -103,7 +103,6 @@ static int ftrace_disabled __read_mostly;
103 103
104static DEFINE_MUTEX(ftrace_lock); 104static DEFINE_MUTEX(ftrace_lock);
105 105
106static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end;
107static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end; 106static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end;
108static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; 107static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end;
109ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; 108ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
@@ -171,23 +170,6 @@ int ftrace_nr_registered_ops(void)
171 return cnt; 170 return cnt;
172} 171}
173 172
174static void
175ftrace_global_list_func(unsigned long ip, unsigned long parent_ip,
176 struct ftrace_ops *op, struct pt_regs *regs)
177{
178 int bit;
179
180 bit = trace_test_and_set_recursion(TRACE_GLOBAL_START, TRACE_GLOBAL_MAX);
181 if (bit < 0)
182 return;
183
184 do_for_each_ftrace_op(op, ftrace_global_list) {
185 op->func(ip, parent_ip, op, regs);
186 } while_for_each_ftrace_op(op);
187
188 trace_clear_recursion(bit);
189}
190
191static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, 173static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip,
192 struct ftrace_ops *op, struct pt_regs *regs) 174 struct ftrace_ops *op, struct pt_regs *regs)
193{ 175{
@@ -237,43 +219,6 @@ static int control_ops_alloc(struct ftrace_ops *ops)
237 return 0; 219 return 0;
238} 220}
239 221
240static void update_global_ops(void)
241{
242 ftrace_func_t func = ftrace_global_list_func;
243 void *private = NULL;
244
245 /* The list has its own recursion protection. */
246 global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE;
247
248 /*
249 * If there's only one function registered, then call that
250 * function directly. Otherwise, we need to iterate over the
251 * registered callers.
252 */
253 if (ftrace_global_list == &ftrace_list_end ||
254 ftrace_global_list->next == &ftrace_list_end) {
255 func = ftrace_global_list->func;
256 private = ftrace_global_list->private;
257 /*
258 * As we are calling the function directly.
259 * If it does not have recursion protection,
260 * the function_trace_op needs to be updated
261 * accordingly.
262 */
263 if (!(ftrace_global_list->flags & FTRACE_OPS_FL_RECURSION_SAFE))
264 global_ops.flags &= ~FTRACE_OPS_FL_RECURSION_SAFE;
265 }
266
267 /* If we filter on pids, update to use the pid function */
268 if (!list_empty(&ftrace_pids)) {
269 set_ftrace_pid_function(func);
270 func = ftrace_pid_func;
271 }
272
273 global_ops.func = func;
274 global_ops.private = private;
275}
276
277static void ftrace_sync(struct work_struct *work) 222static void ftrace_sync(struct work_struct *work)
278{ 223{
279 /* 224 /*
@@ -301,8 +246,6 @@ static void update_ftrace_function(void)
301{ 246{
302 ftrace_func_t func; 247 ftrace_func_t func;
303 248
304 update_global_ops();
305
306 /* 249 /*
307 * If we are at the end of the list and this ops is 250 * If we are at the end of the list and this ops is
308 * recursion safe and not dynamic and the arch supports passing ops, 251 * recursion safe and not dynamic and the arch supports passing ops,
@@ -314,10 +257,7 @@ static void update_ftrace_function(void)
314 (ftrace_ops_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) && 257 (ftrace_ops_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) &&
315 !FTRACE_FORCE_LIST_FUNC)) { 258 !FTRACE_FORCE_LIST_FUNC)) {
316 /* Set the ftrace_ops that the arch callback uses */ 259 /* Set the ftrace_ops that the arch callback uses */
317 if (ftrace_ops_list == &global_ops) 260 set_function_trace_op = ftrace_ops_list;
318 set_function_trace_op = ftrace_global_list;
319 else
320 set_function_trace_op = ftrace_ops_list;
321 func = ftrace_ops_list->func; 261 func = ftrace_ops_list->func;
322 } else { 262 } else {
323 /* Just use the default ftrace_ops */ 263 /* Just use the default ftrace_ops */
@@ -373,6 +313,11 @@ static void update_ftrace_function(void)
373 ftrace_trace_function = func; 313 ftrace_trace_function = func;
374} 314}
375 315
316int using_ftrace_ops_list_func(void)
317{
318 return ftrace_trace_function == ftrace_ops_list_func;
319}
320
376static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) 321static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)
377{ 322{
378 ops->next = *list; 323 ops->next = *list;
@@ -434,16 +379,9 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
434 if (ops->flags & FTRACE_OPS_FL_DELETED) 379 if (ops->flags & FTRACE_OPS_FL_DELETED)
435 return -EINVAL; 380 return -EINVAL;
436 381
437 if (FTRACE_WARN_ON(ops == &global_ops))
438 return -EINVAL;
439
440 if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED)) 382 if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED))
441 return -EBUSY; 383 return -EBUSY;
442 384
443 /* We don't support both control and global flags set. */
444 if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK)
445 return -EINVAL;
446
447#ifndef CONFIG_DYNAMIC_FTRACE_WITH_REGS 385#ifndef CONFIG_DYNAMIC_FTRACE_WITH_REGS
448 /* 386 /*
449 * If the ftrace_ops specifies SAVE_REGS, then it only can be used 387 * If the ftrace_ops specifies SAVE_REGS, then it only can be used
@@ -461,10 +399,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
461 if (!core_kernel_data((unsigned long)ops)) 399 if (!core_kernel_data((unsigned long)ops))
462 ops->flags |= FTRACE_OPS_FL_DYNAMIC; 400 ops->flags |= FTRACE_OPS_FL_DYNAMIC;
463 401
464 if (ops->flags & FTRACE_OPS_FL_GLOBAL) { 402 if (ops->flags & FTRACE_OPS_FL_CONTROL) {
465 add_ftrace_list_ops(&ftrace_global_list, &global_ops, ops);
466 ops->flags |= FTRACE_OPS_FL_ENABLED;
467 } else if (ops->flags & FTRACE_OPS_FL_CONTROL) {
468 if (control_ops_alloc(ops)) 403 if (control_ops_alloc(ops))
469 return -ENOMEM; 404 return -ENOMEM;
470 add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops); 405 add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops);
@@ -484,15 +419,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
484 if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED))) 419 if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED)))
485 return -EBUSY; 420 return -EBUSY;
486 421
487 if (FTRACE_WARN_ON(ops == &global_ops)) 422 if (ops->flags & FTRACE_OPS_FL_CONTROL) {
488 return -EINVAL;
489
490 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
491 ret = remove_ftrace_list_ops(&ftrace_global_list,
492 &global_ops, ops);
493 if (!ret)
494 ops->flags &= ~FTRACE_OPS_FL_ENABLED;
495 } else if (ops->flags & FTRACE_OPS_FL_CONTROL) {
496 ret = remove_ftrace_list_ops(&ftrace_control_list, 423 ret = remove_ftrace_list_ops(&ftrace_control_list,
497 &control_ops, ops); 424 &control_ops, ops);
498 } else 425 } else
@@ -895,7 +822,7 @@ function_profile_call(unsigned long ip, unsigned long parent_ip,
895 822
896 local_irq_save(flags); 823 local_irq_save(flags);
897 824
898 stat = &__get_cpu_var(ftrace_profile_stats); 825 stat = this_cpu_ptr(&ftrace_profile_stats);
899 if (!stat->hash || !ftrace_profile_enabled) 826 if (!stat->hash || !ftrace_profile_enabled)
900 goto out; 827 goto out;
901 828
@@ -926,7 +853,7 @@ static void profile_graph_return(struct ftrace_graph_ret *trace)
926 unsigned long flags; 853 unsigned long flags;
927 854
928 local_irq_save(flags); 855 local_irq_save(flags);
929 stat = &__get_cpu_var(ftrace_profile_stats); 856 stat = this_cpu_ptr(&ftrace_profile_stats);
930 if (!stat->hash || !ftrace_profile_enabled) 857 if (!stat->hash || !ftrace_profile_enabled)
931 goto out; 858 goto out;
932 859
@@ -1178,7 +1105,7 @@ struct ftrace_page {
1178static struct ftrace_page *ftrace_pages_start; 1105static struct ftrace_page *ftrace_pages_start;
1179static struct ftrace_page *ftrace_pages; 1106static struct ftrace_page *ftrace_pages;
1180 1107
1181static bool ftrace_hash_empty(struct ftrace_hash *hash) 1108static bool __always_inline ftrace_hash_empty(struct ftrace_hash *hash)
1182{ 1109{
1183 return !hash || !hash->count; 1110 return !hash || !hash->count;
1184} 1111}
@@ -1625,7 +1552,14 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1625 in_other_hash = !!ftrace_lookup_ip(other_hash, rec->ip); 1552 in_other_hash = !!ftrace_lookup_ip(other_hash, rec->ip);
1626 1553
1627 /* 1554 /*
1555 * If filter_hash is set, we want to match all functions
1556 * that are in the hash but not in the other hash.
1628 * 1557 *
1558 * If filter_hash is not set, then we are decrementing.
1559 * That means we match anything that is in the hash
1560 * and also in the other_hash. That is, we need to turn
1561 * off functions in the other hash because they are disabled
1562 * by this hash.
1629 */ 1563 */
1630 if (filter_hash && in_hash && !in_other_hash) 1564 if (filter_hash && in_hash && !in_other_hash)
1631 match = 1; 1565 match = 1;
@@ -1767,19 +1701,15 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
1767 /* 1701 /*
1768 * If this record is being updated from a nop, then 1702 * If this record is being updated from a nop, then
1769 * return UPDATE_MAKE_CALL. 1703 * return UPDATE_MAKE_CALL.
1770 * Otherwise, if the EN flag is set, then return
1771 * UPDATE_MODIFY_CALL_REGS to tell the caller to convert
1772 * from the non-save regs, to a save regs function.
1773 * Otherwise, 1704 * Otherwise,
1774 * return UPDATE_MODIFY_CALL to tell the caller to convert 1705 * return UPDATE_MODIFY_CALL to tell the caller to convert
1775 * from the save regs, to a non-save regs function. 1706 * from the save regs, to a non-save regs function or
1707 * vice versa.
1776 */ 1708 */
1777 if (flag & FTRACE_FL_ENABLED) 1709 if (flag & FTRACE_FL_ENABLED)
1778 return FTRACE_UPDATE_MAKE_CALL; 1710 return FTRACE_UPDATE_MAKE_CALL;
1779 else if (rec->flags & FTRACE_FL_REGS_EN) 1711
1780 return FTRACE_UPDATE_MODIFY_CALL_REGS; 1712 return FTRACE_UPDATE_MODIFY_CALL;
1781 else
1782 return FTRACE_UPDATE_MODIFY_CALL;
1783 } 1713 }
1784 1714
1785 if (update) { 1715 if (update) {
@@ -1821,6 +1751,42 @@ int ftrace_test_record(struct dyn_ftrace *rec, int enable)
1821 return ftrace_check_record(rec, enable, 0); 1751 return ftrace_check_record(rec, enable, 0);
1822} 1752}
1823 1753
1754/**
1755 * ftrace_get_addr_new - Get the call address to set to
1756 * @rec: The ftrace record descriptor
1757 *
1758 * If the record has the FTRACE_FL_REGS set, that means that it
1759 * wants to convert to a callback that saves all regs. If FTRACE_FL_REGS
 1760 * is not set, then it wants to convert to the normal callback.
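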
1761 *
1762 * Returns the address of the trampoline to set to
1763 */
1764unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec)
1765{
1766 if (rec->flags & FTRACE_FL_REGS)
1767 return (unsigned long)FTRACE_REGS_ADDR;
1768 else
1769 return (unsigned long)FTRACE_ADDR;
1770}
1771
1772/**
1773 * ftrace_get_addr_curr - Get the call address that is already there
1774 * @rec: The ftrace record descriptor
1775 *
1776 * The FTRACE_FL_REGS_EN is set when the record already points to
1777 * a function that saves all the regs. Basically the '_EN' version
1778 * represents the current state of the function.
1779 *
1780 * Returns the address of the trampoline that is currently being called
1781 */
1782unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec)
1783{
1784 if (rec->flags & FTRACE_FL_REGS_EN)
1785 return (unsigned long)FTRACE_REGS_ADDR;
1786 else
1787 return (unsigned long)FTRACE_ADDR;
1788}
1789
1824static int 1790static int
1825__ftrace_replace_code(struct dyn_ftrace *rec, int enable) 1791__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
1826{ 1792{
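
The pair of helpers splits the old inline flag checks into "what the record wants next" (FTRACE_FL_REGS) and "what is installed now" (FTRACE_FL_REGS_EN). A hedged sketch of how a caller pairs them for the modify case; example_modify is illustrative, the real caller is __ftrace_replace_code() in the next hunk:

static int example_modify(struct dyn_ftrace *rec)
{
	/* wanted trampoline: follows FTRACE_FL_REGS */
	unsigned long new_addr = ftrace_get_addr_new(rec);
	/* installed trampoline: follows FTRACE_FL_REGS_EN */
	unsigned long old_addr = ftrace_get_addr_curr(rec);

	if (new_addr == old_addr)
		return 0;	/* nothing to convert */

	/* switch between the regs-saving and plain trampolines */
	return ftrace_modify_call(rec, old_addr, new_addr);
}
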
@@ -1828,12 +1794,12 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
1828 unsigned long ftrace_addr; 1794 unsigned long ftrace_addr;
1829 int ret; 1795 int ret;
1830 1796
1831 ret = ftrace_update_record(rec, enable); 1797 ftrace_addr = ftrace_get_addr_new(rec);
1832 1798
1833 if (rec->flags & FTRACE_FL_REGS) 1799 /* This needs to be done before we call ftrace_update_record */
1834 ftrace_addr = (unsigned long)FTRACE_REGS_ADDR; 1800 ftrace_old_addr = ftrace_get_addr_curr(rec);
1835 else 1801
1836 ftrace_addr = (unsigned long)FTRACE_ADDR; 1802 ret = ftrace_update_record(rec, enable);
1837 1803
1838 switch (ret) { 1804 switch (ret) {
1839 case FTRACE_UPDATE_IGNORE: 1805 case FTRACE_UPDATE_IGNORE:
@@ -1845,13 +1811,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
1845 case FTRACE_UPDATE_MAKE_NOP: 1811 case FTRACE_UPDATE_MAKE_NOP:
1846 return ftrace_make_nop(NULL, rec, ftrace_addr); 1812 return ftrace_make_nop(NULL, rec, ftrace_addr);
1847 1813
1848 case FTRACE_UPDATE_MODIFY_CALL_REGS:
1849 case FTRACE_UPDATE_MODIFY_CALL: 1814 case FTRACE_UPDATE_MODIFY_CALL:
1850 if (rec->flags & FTRACE_FL_REGS)
1851 ftrace_old_addr = (unsigned long)FTRACE_ADDR;
1852 else
1853 ftrace_old_addr = (unsigned long)FTRACE_REGS_ADDR;
1854
1855 return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr); 1815 return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr);
1856 } 1816 }
1857 1817
@@ -2115,7 +2075,6 @@ static void ftrace_startup_enable(int command)
2115 2075
2116static int ftrace_startup(struct ftrace_ops *ops, int command) 2076static int ftrace_startup(struct ftrace_ops *ops, int command)
2117{ 2077{
2118 bool hash_enable = true;
2119 int ret; 2078 int ret;
2120 2079
2121 if (unlikely(ftrace_disabled)) 2080 if (unlikely(ftrace_disabled))
@@ -2128,18 +2087,9 @@ static int ftrace_startup(struct ftrace_ops *ops, int command)
2128 ftrace_start_up++; 2087 ftrace_start_up++;
2129 command |= FTRACE_UPDATE_CALLS; 2088 command |= FTRACE_UPDATE_CALLS;
2130 2089
2131 /* ops marked global share the filter hashes */
2132 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
2133 ops = &global_ops;
2134 /* Don't update hash if global is already set */
2135 if (global_start_up)
2136 hash_enable = false;
2137 global_start_up++;
2138 }
2139
2140 ops->flags |= FTRACE_OPS_FL_ENABLED; 2090 ops->flags |= FTRACE_OPS_FL_ENABLED;
2141 if (hash_enable) 2091
2142 ftrace_hash_rec_enable(ops, 1); 2092 ftrace_hash_rec_enable(ops, 1);
2143 2093
2144 ftrace_startup_enable(command); 2094 ftrace_startup_enable(command);
2145 2095
@@ -2148,7 +2098,6 @@ static int ftrace_startup(struct ftrace_ops *ops, int command)
2148 2098
2149static int ftrace_shutdown(struct ftrace_ops *ops, int command) 2099static int ftrace_shutdown(struct ftrace_ops *ops, int command)
2150{ 2100{
2151 bool hash_disable = true;
2152 int ret; 2101 int ret;
2153 2102
2154 if (unlikely(ftrace_disabled)) 2103 if (unlikely(ftrace_disabled))
@@ -2166,21 +2115,9 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
2166 */ 2115 */
2167 WARN_ON_ONCE(ftrace_start_up < 0); 2116 WARN_ON_ONCE(ftrace_start_up < 0);
2168 2117
2169 if (ops->flags & FTRACE_OPS_FL_GLOBAL) { 2118 ftrace_hash_rec_disable(ops, 1);
2170 ops = &global_ops;
2171 global_start_up--;
2172 WARN_ON_ONCE(global_start_up < 0);
2173 /* Don't update hash if global still has users */
2174 if (global_start_up) {
2175 WARN_ON_ONCE(!ftrace_start_up);
2176 hash_disable = false;
2177 }
2178 }
2179
2180 if (hash_disable)
2181 ftrace_hash_rec_disable(ops, 1);
2182 2119
2183 if (ops != &global_ops || !global_start_up) 2120 if (!global_start_up)
2184 ops->flags &= ~FTRACE_OPS_FL_ENABLED; 2121 ops->flags &= ~FTRACE_OPS_FL_ENABLED;
2185 2122
2186 command |= FTRACE_UPDATE_CALLS; 2123 command |= FTRACE_UPDATE_CALLS;
@@ -3524,10 +3461,6 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
3524 struct ftrace_hash *hash; 3461 struct ftrace_hash *hash;
3525 int ret; 3462 int ret;
3526 3463
3527 /* All global ops uses the global ops filters */
3528 if (ops->flags & FTRACE_OPS_FL_GLOBAL)
3529 ops = &global_ops;
3530
3531 if (unlikely(ftrace_disabled)) 3464 if (unlikely(ftrace_disabled))
3532 return -ENODEV; 3465 return -ENODEV;
3533 3466
@@ -3639,8 +3572,7 @@ int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf,
3639} 3572}
3640EXPORT_SYMBOL_GPL(ftrace_set_notrace); 3573EXPORT_SYMBOL_GPL(ftrace_set_notrace);
3641/** 3574/**
3642 * ftrace_set_filter - set a function to filter on in ftrace 3575 * ftrace_set_global_filter - set a function to filter on with global tracers
3643 * @ops - the ops to set the filter with
3644 * @buf - the string that holds the function filter text. 3576 * @buf - the string that holds the function filter text.
3645 * @len - the length of the string. 3577 * @len - the length of the string.
3646 * @reset - non zero to reset all filters before applying this filter. 3578 * @reset - non zero to reset all filters before applying this filter.
@@ -3655,8 +3587,7 @@ void ftrace_set_global_filter(unsigned char *buf, int len, int reset)
3655EXPORT_SYMBOL_GPL(ftrace_set_global_filter); 3587EXPORT_SYMBOL_GPL(ftrace_set_global_filter);
3656 3588
3657/** 3589/**
3658 * ftrace_set_notrace - set a function to not trace in ftrace 3590 * ftrace_set_global_notrace - set a function to not trace with global tracers
3659 * @ops - the ops to set the notrace filter with
3660 * @buf - the string that holds the function notrace text. 3591 * @buf - the string that holds the function notrace text.
3661 * @len - the length of the string. 3592 * @len - the length of the string.
3662 * @reset - non zero to reset all filters before applying this filter. 3593 * @reset - non zero to reset all filters before applying this filter.
@@ -4443,6 +4374,34 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs)
4443 4374
4444#endif /* CONFIG_DYNAMIC_FTRACE */ 4375#endif /* CONFIG_DYNAMIC_FTRACE */
4445 4376
4377__init void ftrace_init_global_array_ops(struct trace_array *tr)
4378{
4379 tr->ops = &global_ops;
4380 tr->ops->private = tr;
4381}
4382
4383void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func)
4384{
4385 /* If we filter on pids, update to use the pid function */
4386 if (tr->flags & TRACE_ARRAY_FL_GLOBAL) {
4387 if (WARN_ON(tr->ops->func != ftrace_stub))
4388 printk("ftrace ops had %pS for function\n",
4389 tr->ops->func);
4390 /* Only the top level instance does pid tracing */
4391 if (!list_empty(&ftrace_pids)) {
4392 set_ftrace_pid_function(func);
4393 func = ftrace_pid_func;
4394 }
4395 }
4396 tr->ops->func = func;
4397 tr->ops->private = tr;
4398}
4399
4400void ftrace_reset_array_ops(struct trace_array *tr)
4401{
4402 tr->ops->func = ftrace_stub;
4403}
4404
4446static void 4405static void
4447ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip, 4406ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip,
4448 struct ftrace_ops *op, struct pt_regs *regs) 4407 struct ftrace_ops *op, struct pt_regs *regs)
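
With ftrace_init_array_ops() and ftrace_reset_array_ops(), a per-instance tracer hangs its callback off tr->ops instead of carrying a private global ops. A sketch of a hypothetical tracer wired up this way (my_tracer_* names are illustrative; op->private is set to the trace_array by the helper above):

static void my_tracer_call(unsigned long ip, unsigned long parent_ip,
			   struct ftrace_ops *op, struct pt_regs *regs)
{
	struct trace_array *tr = op->private;	/* filled in by the helper */

	if (!tr)
		return;
	/* record the hit into tr->trace_buffer here */
}

static int my_tracer_init(struct trace_array *tr)
{
	ftrace_init_array_ops(tr, my_tracer_call);
	return 0;
}

static void my_tracer_reset(struct trace_array *tr)
{
	ftrace_reset_array_ops(tr);	/* put ftrace_stub back */
}
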
@@ -4501,9 +4460,16 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
4501 */ 4460 */
4502 preempt_disable_notrace(); 4461 preempt_disable_notrace();
4503 do_for_each_ftrace_op(op, ftrace_ops_list) { 4462 do_for_each_ftrace_op(op, ftrace_ops_list) {
4504 if (ftrace_ops_test(op, ip, regs)) 4463 if (ftrace_ops_test(op, ip, regs)) {
4464 if (WARN_ON(!op->func)) {
4465 function_trace_stop = 1;
4466 printk("op=%p %pS\n", op, op);
4467 goto out;
4468 }
4505 op->func(ip, parent_ip, op, regs); 4469 op->func(ip, parent_ip, op, regs);
4470 }
4506 } while_for_each_ftrace_op(op); 4471 } while_for_each_ftrace_op(op);
4472out:
4507 preempt_enable_notrace(); 4473 preempt_enable_notrace();
4508 trace_clear_recursion(bit); 4474 trace_clear_recursion(bit);
4509} 4475}
@@ -4908,7 +4874,6 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
4908#ifdef CONFIG_FUNCTION_GRAPH_TRACER 4874#ifdef CONFIG_FUNCTION_GRAPH_TRACER
4909 4875
4910static int ftrace_graph_active; 4876static int ftrace_graph_active;
4911static struct notifier_block ftrace_suspend_notifier;
4912 4877
4913int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) 4878int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
4914{ 4879{
@@ -5054,13 +5019,6 @@ ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state,
5054 return NOTIFY_DONE; 5019 return NOTIFY_DONE;
5055} 5020}
5056 5021
5057/* Just a place holder for function graph */
5058static struct ftrace_ops fgraph_ops __read_mostly = {
5059 .func = ftrace_stub,
5060 .flags = FTRACE_OPS_FL_STUB | FTRACE_OPS_FL_GLOBAL |
5061 FTRACE_OPS_FL_RECURSION_SAFE,
5062};
5063
5064static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace) 5022static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace)
5065{ 5023{
5066 if (!ftrace_ops_test(&global_ops, trace->func, NULL)) 5024 if (!ftrace_ops_test(&global_ops, trace->func, NULL))
@@ -5085,6 +5043,10 @@ static void update_function_graph_func(void)
5085 ftrace_graph_entry = ftrace_graph_entry_test; 5043 ftrace_graph_entry = ftrace_graph_entry_test;
5086} 5044}
5087 5045
5046static struct notifier_block ftrace_suspend_notifier = {
5047 .notifier_call = ftrace_suspend_notifier_call,
5048};
5049
5088int register_ftrace_graph(trace_func_graph_ret_t retfunc, 5050int register_ftrace_graph(trace_func_graph_ret_t retfunc,
5089 trace_func_graph_ent_t entryfunc) 5051 trace_func_graph_ent_t entryfunc)
5090{ 5052{
@@ -5098,7 +5060,6 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
5098 goto out; 5060 goto out;
5099 } 5061 }
5100 5062
5101 ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call;
5102 register_pm_notifier(&ftrace_suspend_notifier); 5063 register_pm_notifier(&ftrace_suspend_notifier);
5103 5064
5104 ftrace_graph_active++; 5065 ftrace_graph_active++;
@@ -5120,7 +5081,10 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
5120 ftrace_graph_entry = ftrace_graph_entry_test; 5081 ftrace_graph_entry = ftrace_graph_entry_test;
5121 update_function_graph_func(); 5082 update_function_graph_func();
5122 5083
5123 ret = ftrace_startup(&fgraph_ops, FTRACE_START_FUNC_RET); 5084 /* Function graph doesn't use the .func field of global_ops */
5085 global_ops.flags |= FTRACE_OPS_FL_STUB;
5086
5087 ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET);
5124 5088
5125out: 5089out:
5126 mutex_unlock(&ftrace_lock); 5090 mutex_unlock(&ftrace_lock);
@@ -5138,7 +5102,8 @@ void unregister_ftrace_graph(void)
5138 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; 5102 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
5139 ftrace_graph_entry = ftrace_graph_entry_stub; 5103 ftrace_graph_entry = ftrace_graph_entry_stub;
5140 __ftrace_graph_entry = ftrace_graph_entry_stub; 5104 __ftrace_graph_entry = ftrace_graph_entry_stub;
5141 ftrace_shutdown(&fgraph_ops, FTRACE_STOP_FUNC_RET); 5105 ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET);
5106 global_ops.flags &= ~FTRACE_OPS_FL_STUB;
5142 unregister_pm_notifier(&ftrace_suspend_notifier); 5107 unregister_pm_notifier(&ftrace_suspend_notifier);
5143 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); 5108 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
5144 5109
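
register_ftrace_graph()'s interface is unchanged by the switch from the fgraph_ops placeholder to a STUB-flagged global_ops. For reference, a minimal graph user (my_graph_* names are illustrative):

#include <linux/ftrace.h>
#include <linux/module.h>

static int my_graph_entry(struct ftrace_graph_ent *trace)
{
	return 1;	/* non-zero: also trace this function's return */
}

static void my_graph_return(struct ftrace_graph_ret *trace)
{
	/* trace->rettime - trace->calltime is the duration */
}

static int __init my_graph_init(void)
{
	return register_ftrace_graph(my_graph_return, my_graph_entry);
}

static void __exit my_graph_exit(void)
{
	unregister_ftrace_graph();
}

module_init(my_graph_init);
module_exit(my_graph_exit);
MODULE_LICENSE("GPL");
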
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index c634868c2921..7c56c3d06943 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -543,7 +543,7 @@ static void rb_wake_up_waiters(struct irq_work *work)
543 * as data is added to any of the @buffer's cpu buffers. Otherwise 543 * as data is added to any of the @buffer's cpu buffers. Otherwise
544 * it will wait for data to be added to a specific cpu buffer. 544 * it will wait for data to be added to a specific cpu buffer.
545 */ 545 */
546void ring_buffer_wait(struct ring_buffer *buffer, int cpu) 546int ring_buffer_wait(struct ring_buffer *buffer, int cpu)
547{ 547{
548 struct ring_buffer_per_cpu *cpu_buffer; 548 struct ring_buffer_per_cpu *cpu_buffer;
549 DEFINE_WAIT(wait); 549 DEFINE_WAIT(wait);
@@ -557,6 +557,8 @@ void ring_buffer_wait(struct ring_buffer *buffer, int cpu)
557 if (cpu == RING_BUFFER_ALL_CPUS) 557 if (cpu == RING_BUFFER_ALL_CPUS)
558 work = &buffer->irq_work; 558 work = &buffer->irq_work;
559 else { 559 else {
560 if (!cpumask_test_cpu(cpu, buffer->cpumask))
561 return -ENODEV;
560 cpu_buffer = buffer->buffers[cpu]; 562 cpu_buffer = buffer->buffers[cpu];
561 work = &cpu_buffer->irq_work; 563 work = &cpu_buffer->irq_work;
562 } 564 }
@@ -591,6 +593,7 @@ void ring_buffer_wait(struct ring_buffer *buffer, int cpu)
591 schedule(); 593 schedule();
592 594
593 finish_wait(&work->waiters, &wait); 595 finish_wait(&work->waiters, &wait);
596 return 0;
594} 597}
595 598
596/** 599/**
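
Callers of ring_buffer_wait() are now expected to propagate the new error code; the in-tree caller is wait_on_pipe() in trace.c below. A hedged sketch of the pattern:

static int example_wait(struct ring_buffer *buffer, int cpu)
{
	int ret;

	ret = ring_buffer_wait(buffer, cpu);
	if (ret)	/* -ENODEV: cpu is not in the buffer's cpumask */
		return ret;

	return 0;	/* woken up; data should be available */
}
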
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 737b0efa1a62..384ede311717 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -275,7 +275,7 @@ int call_filter_check_discard(struct ftrace_event_call *call, void *rec,
275} 275}
276EXPORT_SYMBOL_GPL(call_filter_check_discard); 276EXPORT_SYMBOL_GPL(call_filter_check_discard);
277 277
278cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu) 278static cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu)
279{ 279{
280 u64 ts; 280 u64 ts;
281 281
@@ -599,7 +599,7 @@ static int alloc_snapshot(struct trace_array *tr)
599 return 0; 599 return 0;
600} 600}
601 601
602void free_snapshot(struct trace_array *tr) 602static void free_snapshot(struct trace_array *tr)
603{ 603{
604 /* 604 /*
605 * We don't free the ring buffer. instead, resize it because 605 * We don't free the ring buffer. instead, resize it because
@@ -963,27 +963,9 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
963 return cnt; 963 return cnt;
964} 964}
965 965
966/*
967 * ftrace_max_lock is used to protect the swapping of buffers
968 * when taking a max snapshot. The buffers themselves are
969 * protected by per_cpu spinlocks. But the action of the swap
970 * needs its own lock.
971 *
972 * This is defined as a arch_spinlock_t in order to help
973 * with performance when lockdep debugging is enabled.
974 *
975 * It is also used in other places outside the update_max_tr
976 * so it needs to be defined outside of the
977 * CONFIG_TRACER_MAX_TRACE.
978 */
979static arch_spinlock_t ftrace_max_lock =
980 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
981
982unsigned long __read_mostly tracing_thresh; 966unsigned long __read_mostly tracing_thresh;
983 967
984#ifdef CONFIG_TRACER_MAX_TRACE 968#ifdef CONFIG_TRACER_MAX_TRACE
985unsigned long __read_mostly tracing_max_latency;
986
987/* 969/*
988 * Copy the new maximum trace into the separate maximum-trace 970 * Copy the new maximum trace into the separate maximum-trace
989 * structure. (this way the maximum trace is permanently saved, 971 * structure. (this way the maximum trace is permanently saved,
@@ -1000,7 +982,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
1000 max_buf->cpu = cpu; 982 max_buf->cpu = cpu;
1001 max_buf->time_start = data->preempt_timestamp; 983 max_buf->time_start = data->preempt_timestamp;
1002 984
1003 max_data->saved_latency = tracing_max_latency; 985 max_data->saved_latency = tr->max_latency;
1004 max_data->critical_start = data->critical_start; 986 max_data->critical_start = data->critical_start;
1005 max_data->critical_end = data->critical_end; 987 max_data->critical_end = data->critical_end;
1006 988
@@ -1048,14 +1030,14 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
1048 return; 1030 return;
1049 } 1031 }
1050 1032
1051 arch_spin_lock(&ftrace_max_lock); 1033 arch_spin_lock(&tr->max_lock);
1052 1034
1053 buf = tr->trace_buffer.buffer; 1035 buf = tr->trace_buffer.buffer;
1054 tr->trace_buffer.buffer = tr->max_buffer.buffer; 1036 tr->trace_buffer.buffer = tr->max_buffer.buffer;
1055 tr->max_buffer.buffer = buf; 1037 tr->max_buffer.buffer = buf;
1056 1038
1057 __update_max_tr(tr, tsk, cpu); 1039 __update_max_tr(tr, tsk, cpu);
1058 arch_spin_unlock(&ftrace_max_lock); 1040 arch_spin_unlock(&tr->max_lock);
1059} 1041}
1060 1042
1061/** 1043/**
@@ -1081,7 +1063,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
1081 return; 1063 return;
1082 } 1064 }
1083 1065
1084 arch_spin_lock(&ftrace_max_lock); 1066 arch_spin_lock(&tr->max_lock);
1085 1067
1086 ret = ring_buffer_swap_cpu(tr->max_buffer.buffer, tr->trace_buffer.buffer, cpu); 1068 ret = ring_buffer_swap_cpu(tr->max_buffer.buffer, tr->trace_buffer.buffer, cpu);
1087 1069
@@ -1099,17 +1081,17 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
1099 WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY); 1081 WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY);
1100 1082
1101 __update_max_tr(tr, tsk, cpu); 1083 __update_max_tr(tr, tsk, cpu);
1102 arch_spin_unlock(&ftrace_max_lock); 1084 arch_spin_unlock(&tr->max_lock);
1103} 1085}
1104#endif /* CONFIG_TRACER_MAX_TRACE */ 1086#endif /* CONFIG_TRACER_MAX_TRACE */
1105 1087
1106static void default_wait_pipe(struct trace_iterator *iter) 1088static int wait_on_pipe(struct trace_iterator *iter)
1107{ 1089{
1108 /* Iterators are static, they should be filled or empty */ 1090 /* Iterators are static, they should be filled or empty */
1109 if (trace_buffer_iter(iter, iter->cpu_file)) 1091 if (trace_buffer_iter(iter, iter->cpu_file))
1110 return; 1092 return 0;
1111 1093
1112 ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file); 1094 return ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file);
1113} 1095}
1114 1096
1115#ifdef CONFIG_FTRACE_STARTUP_TEST 1097#ifdef CONFIG_FTRACE_STARTUP_TEST
@@ -1220,8 +1202,6 @@ int register_tracer(struct tracer *type)
1220 else 1202 else
1221 if (!type->flags->opts) 1203 if (!type->flags->opts)
1222 type->flags->opts = dummy_tracer_opt; 1204 type->flags->opts = dummy_tracer_opt;
1223 if (!type->wait_pipe)
1224 type->wait_pipe = default_wait_pipe;
1225 1205
1226 ret = run_tracer_selftest(type); 1206 ret = run_tracer_selftest(type);
1227 if (ret < 0) 1207 if (ret < 0)
@@ -1305,22 +1285,71 @@ void tracing_reset_all_online_cpus(void)
1305 } 1285 }
1306} 1286}
1307 1287
1308#define SAVED_CMDLINES 128 1288#define SAVED_CMDLINES_DEFAULT 128
1309#define NO_CMDLINE_MAP UINT_MAX 1289#define NO_CMDLINE_MAP UINT_MAX
1310static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
1311static unsigned map_cmdline_to_pid[SAVED_CMDLINES];
1312static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN];
1313static int cmdline_idx;
1314static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED; 1290static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED;
1291struct saved_cmdlines_buffer {
1292 unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
1293 unsigned *map_cmdline_to_pid;
1294 unsigned cmdline_num;
1295 int cmdline_idx;
1296 char *saved_cmdlines;
1297};
1298static struct saved_cmdlines_buffer *savedcmd;
1315 1299
1316/* temporary disable recording */ 1300/* temporary disable recording */
1317static atomic_t trace_record_cmdline_disabled __read_mostly; 1301static atomic_t trace_record_cmdline_disabled __read_mostly;
1318 1302
1319static void trace_init_cmdlines(void) 1303static inline char *get_saved_cmdlines(int idx)
1304{
1305 return &savedcmd->saved_cmdlines[idx * TASK_COMM_LEN];
1306}
1307
1308static inline void set_cmdline(int idx, const char *cmdline)
1309{
1310 memcpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN);
1311}
1312
1313static int allocate_cmdlines_buffer(unsigned int val,
1314 struct saved_cmdlines_buffer *s)
1320{ 1315{
1321 memset(&map_pid_to_cmdline, NO_CMDLINE_MAP, sizeof(map_pid_to_cmdline)); 1316 s->map_cmdline_to_pid = kmalloc(val * sizeof(*s->map_cmdline_to_pid),
1322 memset(&map_cmdline_to_pid, NO_CMDLINE_MAP, sizeof(map_cmdline_to_pid)); 1317 GFP_KERNEL);
1323 cmdline_idx = 0; 1318 if (!s->map_cmdline_to_pid)
1319 return -ENOMEM;
1320
1321 s->saved_cmdlines = kmalloc(val * TASK_COMM_LEN, GFP_KERNEL);
1322 if (!s->saved_cmdlines) {
1323 kfree(s->map_cmdline_to_pid);
1324 return -ENOMEM;
1325 }
1326
1327 s->cmdline_idx = 0;
1328 s->cmdline_num = val;
1329 memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP,
1330 sizeof(s->map_pid_to_cmdline));
1331 memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP,
1332 val * sizeof(*s->map_cmdline_to_pid));
1333
1334 return 0;
1335}
1336
1337static int trace_create_savedcmd(void)
1338{
1339 int ret;
1340
1341 savedcmd = kmalloc(sizeof(*savedcmd), GFP_KERNEL);
1342 if (!savedcmd)
1343 return -ENOMEM;
1344
1345 ret = allocate_cmdlines_buffer(SAVED_CMDLINES_DEFAULT, savedcmd);
1346 if (ret < 0) {
1347 kfree(savedcmd);
1348 savedcmd = NULL;
1349 return -ENOMEM;
1350 }
1351
1352 return 0;
1324} 1353}
1325 1354
1326int is_tracing_stopped(void) 1355int is_tracing_stopped(void)
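
The new saved_cmdlines_buffer keeps the same two-way pid/slot mapping as the old static arrays, but the comms now live in one flat, resizable buffer indexed by slot * TASK_COMM_LEN. A simplified sketch of the lookup path as it works file-locally in trace.c (locking omitted; the real code holds trace_cmdline_lock):

static void example_lookup_comm(struct saved_cmdlines_buffer *s,
				int pid, char comm[TASK_COMM_LEN])
{
	unsigned idx = s->map_pid_to_cmdline[pid];	/* pid -> slot */

	if (idx == NO_CMDLINE_MAP) {
		strcpy(comm, "<...>");			/* never recorded */
		return;
	}

	/* slot -> comm: one TASK_COMM_LEN-sized record per slot */
	memcpy(comm, &s->saved_cmdlines[idx * TASK_COMM_LEN], TASK_COMM_LEN);
}
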
@@ -1353,7 +1382,7 @@ void tracing_start(void)
1353 } 1382 }
1354 1383
1355 /* Prevent the buffers from switching */ 1384 /* Prevent the buffers from switching */
1356 arch_spin_lock(&ftrace_max_lock); 1385 arch_spin_lock(&global_trace.max_lock);
1357 1386
1358 buffer = global_trace.trace_buffer.buffer; 1387 buffer = global_trace.trace_buffer.buffer;
1359 if (buffer) 1388 if (buffer)
@@ -1365,7 +1394,7 @@ void tracing_start(void)
1365 ring_buffer_record_enable(buffer); 1394 ring_buffer_record_enable(buffer);
1366#endif 1395#endif
1367 1396
1368 arch_spin_unlock(&ftrace_max_lock); 1397 arch_spin_unlock(&global_trace.max_lock);
1369 1398
1370 ftrace_start(); 1399 ftrace_start();
1371 out: 1400 out:
@@ -1420,7 +1449,7 @@ void tracing_stop(void)
1420 goto out; 1449 goto out;
1421 1450
1422 /* Prevent the buffers from switching */ 1451 /* Prevent the buffers from switching */
1423 arch_spin_lock(&ftrace_max_lock); 1452 arch_spin_lock(&global_trace.max_lock);
1424 1453
1425 buffer = global_trace.trace_buffer.buffer; 1454 buffer = global_trace.trace_buffer.buffer;
1426 if (buffer) 1455 if (buffer)
@@ -1432,7 +1461,7 @@ void tracing_stop(void)
1432 ring_buffer_record_disable(buffer); 1461 ring_buffer_record_disable(buffer);
1433#endif 1462#endif
1434 1463
1435 arch_spin_unlock(&ftrace_max_lock); 1464 arch_spin_unlock(&global_trace.max_lock);
1436 1465
1437 out: 1466 out:
1438 raw_spin_unlock_irqrestore(&global_trace.start_lock, flags); 1467 raw_spin_unlock_irqrestore(&global_trace.start_lock, flags);
@@ -1461,12 +1490,12 @@ static void tracing_stop_tr(struct trace_array *tr)
1461 1490
1462void trace_stop_cmdline_recording(void); 1491void trace_stop_cmdline_recording(void);
1463 1492
1464static void trace_save_cmdline(struct task_struct *tsk) 1493static int trace_save_cmdline(struct task_struct *tsk)
1465{ 1494{
1466 unsigned pid, idx; 1495 unsigned pid, idx;
1467 1496
1468 if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT)) 1497 if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT))
1469 return; 1498 return 0;
1470 1499
1471 /* 1500 /*
1472 * It's not the end of the world if we don't get 1501 * It's not the end of the world if we don't get
@@ -1475,11 +1504,11 @@ static void trace_save_cmdline(struct task_struct *tsk)
1475 * so if we miss here, then better luck next time. 1504 * so if we miss here, then better luck next time.
1476 */ 1505 */
1477 if (!arch_spin_trylock(&trace_cmdline_lock)) 1506 if (!arch_spin_trylock(&trace_cmdline_lock))
1478 return; 1507 return 0;
1479 1508
1480 idx = map_pid_to_cmdline[tsk->pid]; 1509 idx = savedcmd->map_pid_to_cmdline[tsk->pid];
1481 if (idx == NO_CMDLINE_MAP) { 1510 if (idx == NO_CMDLINE_MAP) {
1482 idx = (cmdline_idx + 1) % SAVED_CMDLINES; 1511 idx = (savedcmd->cmdline_idx + 1) % savedcmd->cmdline_num;
1483 1512
1484 /* 1513 /*
1485 * Check whether the cmdline buffer at idx has a pid 1514 * Check whether the cmdline buffer at idx has a pid
@@ -1487,22 +1516,24 @@ static void trace_save_cmdline(struct task_struct *tsk)
1487 * need to clear the map_pid_to_cmdline. Otherwise we 1516 * need to clear the map_pid_to_cmdline. Otherwise we
1488 * would read the new comm for the old pid. 1517 * would read the new comm for the old pid.
1489 */ 1518 */
1490 pid = map_cmdline_to_pid[idx]; 1519 pid = savedcmd->map_cmdline_to_pid[idx];
1491 if (pid != NO_CMDLINE_MAP) 1520 if (pid != NO_CMDLINE_MAP)
1492 map_pid_to_cmdline[pid] = NO_CMDLINE_MAP; 1521 savedcmd->map_pid_to_cmdline[pid] = NO_CMDLINE_MAP;
1493 1522
1494 map_cmdline_to_pid[idx] = tsk->pid; 1523 savedcmd->map_cmdline_to_pid[idx] = tsk->pid;
1495 map_pid_to_cmdline[tsk->pid] = idx; 1524 savedcmd->map_pid_to_cmdline[tsk->pid] = idx;
1496 1525
1497 cmdline_idx = idx; 1526 savedcmd->cmdline_idx = idx;
1498 } 1527 }
1499 1528
1500 memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN); 1529 set_cmdline(idx, tsk->comm);
1501 1530
1502 arch_spin_unlock(&trace_cmdline_lock); 1531 arch_spin_unlock(&trace_cmdline_lock);
1532
1533 return 1;
1503} 1534}
1504 1535
1505void trace_find_cmdline(int pid, char comm[]) 1536static void __trace_find_cmdline(int pid, char comm[])
1506{ 1537{
1507 unsigned map; 1538 unsigned map;
1508 1539
@@ -1521,13 +1552,19 @@ void trace_find_cmdline(int pid, char comm[])
1521 return; 1552 return;
1522 } 1553 }
1523 1554
1524 preempt_disable(); 1555 map = savedcmd->map_pid_to_cmdline[pid];
1525 arch_spin_lock(&trace_cmdline_lock);
1526 map = map_pid_to_cmdline[pid];
1527 if (map != NO_CMDLINE_MAP) 1556 if (map != NO_CMDLINE_MAP)
1528 strcpy(comm, saved_cmdlines[map]); 1557 strcpy(comm, get_saved_cmdlines(map));
1529 else 1558 else
1530 strcpy(comm, "<...>"); 1559 strcpy(comm, "<...>");
1560}
1561
1562void trace_find_cmdline(int pid, char comm[])
1563{
1564 preempt_disable();
1565 arch_spin_lock(&trace_cmdline_lock);
1566
1567 __trace_find_cmdline(pid, comm);
1531 1568
1532 arch_spin_unlock(&trace_cmdline_lock); 1569 arch_spin_unlock(&trace_cmdline_lock);
1533 preempt_enable(); 1570 preempt_enable();
@@ -1541,9 +1578,8 @@ void tracing_record_cmdline(struct task_struct *tsk)
1541 if (!__this_cpu_read(trace_cmdline_save)) 1578 if (!__this_cpu_read(trace_cmdline_save))
1542 return; 1579 return;
1543 1580
1544 __this_cpu_write(trace_cmdline_save, false); 1581 if (trace_save_cmdline(tsk))
1545 1582 __this_cpu_write(trace_cmdline_save, false);
1546 trace_save_cmdline(tsk);
1547} 1583}
1548 1584
1549void 1585void
@@ -1746,7 +1782,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
1746 */ 1782 */
1747 barrier(); 1783 barrier();
1748 if (use_stack == 1) { 1784 if (use_stack == 1) {
1749 trace.entries = &__get_cpu_var(ftrace_stack).calls[0]; 1785 trace.entries = this_cpu_ptr(ftrace_stack.calls);
1750 trace.max_entries = FTRACE_STACK_MAX_ENTRIES; 1786 trace.max_entries = FTRACE_STACK_MAX_ENTRIES;
1751 1787
1752 if (regs) 1788 if (regs)
@@ -1995,7 +2031,21 @@ void trace_printk_init_buffers(void)
1995 if (alloc_percpu_trace_buffer()) 2031 if (alloc_percpu_trace_buffer())
1996 return; 2032 return;
1997 2033
1998 pr_info("ftrace: Allocated trace_printk buffers\n"); 2034 /* trace_printk() is for debug use only. Don't use it in production. */
2035
2036 pr_warning("\n**********************************************************\n");
2037 pr_warning("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n");
2038 pr_warning("** **\n");
2039 pr_warning("** trace_printk() being used. Allocating extra memory. **\n");
2040 pr_warning("** **\n");
2041 pr_warning("** This means that this is a DEBUG kernel and it is **\n");
 2042 pr_warning("** unsafe for production use. **\n");
2043 pr_warning("** **\n");
2044 pr_warning("** If you see this message and you are not debugging **\n");
2045 pr_warning("** the kernel, report this immediately to your vendor! **\n");
2046 pr_warning("** **\n");
2047 pr_warning("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n");
2048 pr_warning("**********************************************************\n");
1999 2049
2000 /* Expand the buffers to set size */ 2050 /* Expand the buffers to set size */
2001 tracing_update_buffers(); 2051 tracing_update_buffers();
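
For context, the call the banner warns about is the ordinary trace_printk() debug helper; a one-function usage sketch (names and format are illustrative):

static void example_debug(int count)
{
	/* lands in the ftrace ring buffer (the trace file), not the printk log */
	trace_printk("processed %d entries in %s\n", count, __func__);
}
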
@@ -3333,7 +3383,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
3333 mutex_lock(&tracing_cpumask_update_lock); 3383 mutex_lock(&tracing_cpumask_update_lock);
3334 3384
3335 local_irq_disable(); 3385 local_irq_disable();
3336 arch_spin_lock(&ftrace_max_lock); 3386 arch_spin_lock(&tr->max_lock);
3337 for_each_tracing_cpu(cpu) { 3387 for_each_tracing_cpu(cpu) {
3338 /* 3388 /*
3339 * Increase/decrease the disabled counter if we are 3389 * Increase/decrease the disabled counter if we are
@@ -3350,7 +3400,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
3350 ring_buffer_record_enable_cpu(tr->trace_buffer.buffer, cpu); 3400 ring_buffer_record_enable_cpu(tr->trace_buffer.buffer, cpu);
3351 } 3401 }
3352 } 3402 }
3353 arch_spin_unlock(&ftrace_max_lock); 3403 arch_spin_unlock(&tr->max_lock);
3354 local_irq_enable(); 3404 local_irq_enable();
3355 3405
3356 cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new); 3406 cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new);
@@ -3592,6 +3642,7 @@ static const char readme_msg[] =
3592 " trace_options\t\t- Set format or modify how tracing happens\n" 3642 " trace_options\t\t- Set format or modify how tracing happens\n"
3593 "\t\t\t Disable an option by adding a suffix 'no' to the\n" 3643 "\t\t\t Disable an option by adding a suffix 'no' to the\n"
3594 "\t\t\t option name\n" 3644 "\t\t\t option name\n"
3645 " saved_cmdlines_size\t- echo command number in here to store comm-pid list\n"
3595#ifdef CONFIG_DYNAMIC_FTRACE 3646#ifdef CONFIG_DYNAMIC_FTRACE
3596 "\n available_filter_functions - list of functions that can be filtered on\n" 3647 "\n available_filter_functions - list of functions that can be filtered on\n"
3597 " set_ftrace_filter\t- echo function name in here to only trace these\n" 3648 " set_ftrace_filter\t- echo function name in here to only trace these\n"
@@ -3705,55 +3756,153 @@ static const struct file_operations tracing_readme_fops = {
3705 .llseek = generic_file_llseek, 3756 .llseek = generic_file_llseek,
3706}; 3757};
3707 3758
3759static void *saved_cmdlines_next(struct seq_file *m, void *v, loff_t *pos)
3760{
3761 unsigned int *ptr = v;
3762
3763 if (*pos || m->count)
3764 ptr++;
3765
3766 (*pos)++;
3767
3768 for (; ptr < &savedcmd->map_cmdline_to_pid[savedcmd->cmdline_num];
3769 ptr++) {
3770 if (*ptr == -1 || *ptr == NO_CMDLINE_MAP)
3771 continue;
3772
3773 return ptr;
3774 }
3775
3776 return NULL;
3777}
3778
3779static void *saved_cmdlines_start(struct seq_file *m, loff_t *pos)
3780{
3781 void *v;
3782 loff_t l = 0;
3783
3784 preempt_disable();
3785 arch_spin_lock(&trace_cmdline_lock);
3786
3787 v = &savedcmd->map_cmdline_to_pid[0];
3788 while (l <= *pos) {
3789 v = saved_cmdlines_next(m, v, &l);
3790 if (!v)
3791 return NULL;
3792 }
3793
3794 return v;
3795}
3796
3797static void saved_cmdlines_stop(struct seq_file *m, void *v)
3798{
3799 arch_spin_unlock(&trace_cmdline_lock);
3800 preempt_enable();
3801}
3802
3803static int saved_cmdlines_show(struct seq_file *m, void *v)
3804{
3805 char buf[TASK_COMM_LEN];
3806 unsigned int *pid = v;
3807
3808 __trace_find_cmdline(*pid, buf);
3809 seq_printf(m, "%d %s\n", *pid, buf);
3810 return 0;
3811}
3812
3813static const struct seq_operations tracing_saved_cmdlines_seq_ops = {
3814 .start = saved_cmdlines_start,
3815 .next = saved_cmdlines_next,
3816 .stop = saved_cmdlines_stop,
3817 .show = saved_cmdlines_show,
3818};
3819
3820static int tracing_saved_cmdlines_open(struct inode *inode, struct file *filp)
3821{
3822 if (tracing_disabled)
3823 return -ENODEV;
3824
3825 return seq_open(filp, &tracing_saved_cmdlines_seq_ops);
3826}
3827
3828static const struct file_operations tracing_saved_cmdlines_fops = {
3829 .open = tracing_saved_cmdlines_open,
3830 .read = seq_read,
3831 .llseek = seq_lseek,
3832 .release = seq_release,
3833};
3834
3708static ssize_t 3835static ssize_t
3709tracing_saved_cmdlines_read(struct file *file, char __user *ubuf, 3836tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf,
3710 size_t cnt, loff_t *ppos) 3837 size_t cnt, loff_t *ppos)
3711{ 3838{
3712 char *buf_comm; 3839 char buf[64];
3713 char *file_buf; 3840 int r;
3714 char *buf;
3715 int len = 0;
3716 int pid;
3717 int i;
3718 3841
3719 file_buf = kmalloc(SAVED_CMDLINES*(16+TASK_COMM_LEN), GFP_KERNEL); 3842 arch_spin_lock(&trace_cmdline_lock);
3720 if (!file_buf) 3843 r = scnprintf(buf, sizeof(buf), "%u\n", savedcmd->cmdline_num);
3844 arch_spin_unlock(&trace_cmdline_lock);
3845
3846 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
3847}
3848
3849static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s)
3850{
3851 kfree(s->saved_cmdlines);
3852 kfree(s->map_cmdline_to_pid);
3853 kfree(s);
3854}
3855
3856static int tracing_resize_saved_cmdlines(unsigned int val)
3857{
3858 struct saved_cmdlines_buffer *s, *savedcmd_temp;
3859
3860 s = kmalloc(sizeof(*s), GFP_KERNEL);
3861 if (!s)
3721 return -ENOMEM; 3862 return -ENOMEM;
3722 3863
3723 buf_comm = kmalloc(TASK_COMM_LEN, GFP_KERNEL); 3864 if (allocate_cmdlines_buffer(val, s) < 0) {
3724 if (!buf_comm) { 3865 kfree(s);
3725 kfree(file_buf);
3726 return -ENOMEM; 3866 return -ENOMEM;
3727 } 3867 }
3728 3868
3729 buf = file_buf; 3869 arch_spin_lock(&trace_cmdline_lock);
3870 savedcmd_temp = savedcmd;
3871 savedcmd = s;
3872 arch_spin_unlock(&trace_cmdline_lock);
3873 free_saved_cmdlines_buffer(savedcmd_temp);
3730 3874
3731 for (i = 0; i < SAVED_CMDLINES; i++) { 3875 return 0;
3732 int r; 3876}
3733 3877
3734 pid = map_cmdline_to_pid[i]; 3878static ssize_t
3735 if (pid == -1 || pid == NO_CMDLINE_MAP) 3879tracing_saved_cmdlines_size_write(struct file *filp, const char __user *ubuf,
3736 continue; 3880 size_t cnt, loff_t *ppos)
3881{
3882 unsigned long val;
3883 int ret;
3737 3884
3738 trace_find_cmdline(pid, buf_comm); 3885 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
3739 r = sprintf(buf, "%d %s\n", pid, buf_comm); 3886 if (ret)
3740 buf += r; 3887 return ret;
3741 len += r;
3742 }
3743 3888
 3744 len = simple_read_from_buffer(ubuf, cnt, ppos, 3889 /* must have at least 1 entry and at most PID_MAX_DEFAULT */
3745 file_buf, len); 3890 if (!val || val > PID_MAX_DEFAULT)
3891 return -EINVAL;
3746 3892
3747 kfree(file_buf); 3893 ret = tracing_resize_saved_cmdlines((unsigned int)val);
3748 kfree(buf_comm); 3894 if (ret < 0)
3895 return ret;
3749 3896
3750 return len; 3897 *ppos += cnt;
3898
3899 return cnt;
3751} 3900}
3752 3901
3753static const struct file_operations tracing_saved_cmdlines_fops = { 3902static const struct file_operations tracing_saved_cmdlines_size_fops = {
3754 .open = tracing_open_generic, 3903 .open = tracing_open_generic,
3755 .read = tracing_saved_cmdlines_read, 3904 .read = tracing_saved_cmdlines_size_read,
3756 .llseek = generic_file_llseek, 3905 .write = tracing_saved_cmdlines_size_write,
3757}; 3906};
3758 3907
3759static ssize_t 3908static ssize_t
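
The knob behind these handlers is tracing/saved_cmdlines_size. A small user-space sketch of resizing the comm cache and reading the pid/comm list back (assumes debugfs is mounted at /sys/kernel/debug; illustrative only):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	ssize_t n;
	int fd;

	/* grow the comm cache from the default 128 slots to 1024 */
	fd = open("/sys/kernel/debug/tracing/saved_cmdlines_size", O_WRONLY);
	if (fd >= 0) {
		if (write(fd, "1024\n", 5) != 5)
			perror("write");
		close(fd);
	}

	/* saved_cmdlines is now a seq_file; dump its "pid comm" lines */
	fd = open("/sys/kernel/debug/tracing/saved_cmdlines", O_RDONLY);
	if (fd < 0)
		return 1;
	while ((n = read(fd, buf, sizeof(buf))) > 0)
		fwrite(buf, 1, n, stdout);
	close(fd);
	return 0;
}
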
@@ -4225,29 +4374,11 @@ tracing_poll_pipe(struct file *filp, poll_table *poll_table)
4225 return trace_poll(iter, filp, poll_table); 4374 return trace_poll(iter, filp, poll_table);
4226} 4375}
4227 4376
4228/*
4229 * This is a make-shift waitqueue.
4230 * A tracer might use this callback on some rare cases:
4231 *
4232 * 1) the current tracer might hold the runqueue lock when it wakes up
4233 * a reader, hence a deadlock (sched, function, and function graph tracers)
4234 * 2) the function tracers, trace all functions, we don't want
4235 * the overhead of calling wake_up and friends
4236 * (and tracing them too)
4237 *
4238 * Anyway, this is really very primitive wakeup.
4239 */
4240void poll_wait_pipe(struct trace_iterator *iter)
4241{
4242 set_current_state(TASK_INTERRUPTIBLE);
4243 /* sleep for 100 msecs, and try again. */
4244 schedule_timeout(HZ / 10);
4245}
4246
4247/* Must be called with trace_types_lock mutex held. */ 4377/* Must be called with trace_types_lock mutex held. */
4248static int tracing_wait_pipe(struct file *filp) 4378static int tracing_wait_pipe(struct file *filp)
4249{ 4379{
4250 struct trace_iterator *iter = filp->private_data; 4380 struct trace_iterator *iter = filp->private_data;
4381 int ret;
4251 4382
4252 while (trace_empty(iter)) { 4383 while (trace_empty(iter)) {
4253 4384
@@ -4255,15 +4386,6 @@ static int tracing_wait_pipe(struct file *filp)
4255 return -EAGAIN; 4386 return -EAGAIN;
4256 } 4387 }
4257 4388
4258 mutex_unlock(&iter->mutex);
4259
4260 iter->trace->wait_pipe(iter);
4261
4262 mutex_lock(&iter->mutex);
4263
4264 if (signal_pending(current))
4265 return -EINTR;
4266
4267 /* 4389 /*
4268 * We block until we read something and tracing is disabled. 4390 * We block until we read something and tracing is disabled.
4269 * We still block if tracing is disabled, but we have never 4391 * We still block if tracing is disabled, but we have never
@@ -4275,6 +4397,18 @@ static int tracing_wait_pipe(struct file *filp)
4275 */ 4397 */
4276 if (!tracing_is_on() && iter->pos) 4398 if (!tracing_is_on() && iter->pos)
4277 break; 4399 break;
4400
4401 mutex_unlock(&iter->mutex);
4402
4403 ret = wait_on_pipe(iter);
4404
4405 mutex_lock(&iter->mutex);
4406
4407 if (ret)
4408 return ret;
4409
4410 if (signal_pending(current))
4411 return -EINTR;
4278 } 4412 }
4279 4413
4280 return 1; 4414 return 1;
@@ -5197,8 +5331,12 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
5197 goto out_unlock; 5331 goto out_unlock;
5198 } 5332 }
5199 mutex_unlock(&trace_types_lock); 5333 mutex_unlock(&trace_types_lock);
5200 iter->trace->wait_pipe(iter); 5334 ret = wait_on_pipe(iter);
5201 mutex_lock(&trace_types_lock); 5335 mutex_lock(&trace_types_lock);
5336 if (ret) {
5337 size = ret;
5338 goto out_unlock;
5339 }
5202 if (signal_pending(current)) { 5340 if (signal_pending(current)) {
5203 size = -EINTR; 5341 size = -EINTR;
5204 goto out_unlock; 5342 goto out_unlock;
@@ -5408,8 +5546,10 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
5408 goto out; 5546 goto out;
5409 } 5547 }
5410 mutex_unlock(&trace_types_lock); 5548 mutex_unlock(&trace_types_lock);
5411 iter->trace->wait_pipe(iter); 5549 ret = wait_on_pipe(iter);
5412 mutex_lock(&trace_types_lock); 5550 mutex_lock(&trace_types_lock);
5551 if (ret)
5552 goto out;
5413 if (signal_pending(current)) { 5553 if (signal_pending(current)) {
5414 ret = -EINTR; 5554 ret = -EINTR;
5415 goto out; 5555 goto out;
@@ -6102,6 +6242,28 @@ static int allocate_trace_buffers(struct trace_array *tr, int size)
6102 return 0; 6242 return 0;
6103} 6243}
6104 6244
6245static void free_trace_buffer(struct trace_buffer *buf)
6246{
6247 if (buf->buffer) {
6248 ring_buffer_free(buf->buffer);
6249 buf->buffer = NULL;
6250 free_percpu(buf->data);
6251 buf->data = NULL;
6252 }
6253}
6254
6255static void free_trace_buffers(struct trace_array *tr)
6256{
6257 if (!tr)
6258 return;
6259
6260 free_trace_buffer(&tr->trace_buffer);
6261
6262#ifdef CONFIG_TRACER_MAX_TRACE
6263 free_trace_buffer(&tr->max_buffer);
6264#endif
6265}
6266
6105static int new_instance_create(const char *name) 6267static int new_instance_create(const char *name)
6106{ 6268{
6107 struct trace_array *tr; 6269 struct trace_array *tr;
@@ -6131,6 +6293,8 @@ static int new_instance_create(const char *name)
6131 6293
6132 raw_spin_lock_init(&tr->start_lock); 6294 raw_spin_lock_init(&tr->start_lock);
6133 6295
6296 tr->max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
6297
6134 tr->current_trace = &nop_trace; 6298 tr->current_trace = &nop_trace;
6135 6299
6136 INIT_LIST_HEAD(&tr->systems); 6300 INIT_LIST_HEAD(&tr->systems);
@@ -6158,8 +6322,7 @@ static int new_instance_create(const char *name)
6158 return 0; 6322 return 0;
6159 6323
6160 out_free_tr: 6324 out_free_tr:
6161 if (tr->trace_buffer.buffer) 6325 free_trace_buffers(tr);
6162 ring_buffer_free(tr->trace_buffer.buffer);
6163 free_cpumask_var(tr->tracing_cpumask); 6326 free_cpumask_var(tr->tracing_cpumask);
6164 kfree(tr->name); 6327 kfree(tr->name);
6165 kfree(tr); 6328 kfree(tr);
@@ -6199,8 +6362,7 @@ static int instance_delete(const char *name)
6199 event_trace_del_tracer(tr); 6362 event_trace_del_tracer(tr);
6200 ftrace_destroy_function_files(tr); 6363 ftrace_destroy_function_files(tr);
6201 debugfs_remove_recursive(tr->dir); 6364 debugfs_remove_recursive(tr->dir);
6202 free_percpu(tr->trace_buffer.data); 6365 free_trace_buffers(tr);
6203 ring_buffer_free(tr->trace_buffer.buffer);
6204 6366
6205 kfree(tr->name); 6367 kfree(tr->name);
6206 kfree(tr); 6368 kfree(tr);
@@ -6328,6 +6490,11 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer)
6328 trace_create_file("tracing_on", 0644, d_tracer, 6490 trace_create_file("tracing_on", 0644, d_tracer,
6329 tr, &rb_simple_fops); 6491 tr, &rb_simple_fops);
6330 6492
6493#ifdef CONFIG_TRACER_MAX_TRACE
6494 trace_create_file("tracing_max_latency", 0644, d_tracer,
6495 &tr->max_latency, &tracing_max_lat_fops);
6496#endif
6497
6331 if (ftrace_create_function_files(tr, d_tracer)) 6498 if (ftrace_create_function_files(tr, d_tracer))
6332 WARN(1, "Could not allocate function filter files"); 6499 WARN(1, "Could not allocate function filter files");
6333 6500
@@ -6353,11 +6520,6 @@ static __init int tracer_init_debugfs(void)
6353 6520
6354 init_tracer_debugfs(&global_trace, d_tracer); 6521 init_tracer_debugfs(&global_trace, d_tracer);
6355 6522
6356#ifdef CONFIG_TRACER_MAX_TRACE
6357 trace_create_file("tracing_max_latency", 0644, d_tracer,
6358 &tracing_max_latency, &tracing_max_lat_fops);
6359#endif
6360
6361 trace_create_file("tracing_thresh", 0644, d_tracer, 6523 trace_create_file("tracing_thresh", 0644, d_tracer,
6362 &tracing_thresh, &tracing_max_lat_fops); 6524 &tracing_thresh, &tracing_max_lat_fops);
6363 6525
@@ -6367,6 +6529,9 @@ static __init int tracer_init_debugfs(void)
6367 trace_create_file("saved_cmdlines", 0444, d_tracer, 6529 trace_create_file("saved_cmdlines", 0444, d_tracer,
6368 NULL, &tracing_saved_cmdlines_fops); 6530 NULL, &tracing_saved_cmdlines_fops);
6369 6531
6532 trace_create_file("saved_cmdlines_size", 0644, d_tracer,
6533 NULL, &tracing_saved_cmdlines_size_fops);
6534
6370#ifdef CONFIG_DYNAMIC_FTRACE 6535#ifdef CONFIG_DYNAMIC_FTRACE
6371 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, 6536 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
6372 &ftrace_update_tot_cnt, &tracing_dyn_info_fops); 6537 &ftrace_update_tot_cnt, &tracing_dyn_info_fops);
@@ -6603,18 +6768,19 @@ __init static int tracer_alloc_buffers(void)
6603 if (!temp_buffer) 6768 if (!temp_buffer)
6604 goto out_free_cpumask; 6769 goto out_free_cpumask;
6605 6770
6771 if (trace_create_savedcmd() < 0)
6772 goto out_free_temp_buffer;
6773
6606 /* TODO: make the number of buffers hot pluggable with CPUS */ 6774 /* TODO: make the number of buffers hot pluggable with CPUS */
6607 if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) { 6775 if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) {
6608 printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); 6776 printk(KERN_ERR "tracer: failed to allocate ring buffer!\n");
6609 WARN_ON(1); 6777 WARN_ON(1);
6610 goto out_free_temp_buffer; 6778 goto out_free_savedcmd;
6611 } 6779 }
6612 6780
6613 if (global_trace.buffer_disabled) 6781 if (global_trace.buffer_disabled)
6614 tracing_off(); 6782 tracing_off();
6615 6783
6616 trace_init_cmdlines();
6617
6618 if (trace_boot_clock) { 6784 if (trace_boot_clock) {
6619 ret = tracing_set_clock(&global_trace, trace_boot_clock); 6785 ret = tracing_set_clock(&global_trace, trace_boot_clock);
6620 if (ret < 0) 6786 if (ret < 0)
@@ -6629,6 +6795,10 @@ __init static int tracer_alloc_buffers(void)
6629 */ 6795 */
6630 global_trace.current_trace = &nop_trace; 6796 global_trace.current_trace = &nop_trace;
6631 6797
6798 global_trace.max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
6799
6800 ftrace_init_global_array_ops(&global_trace);
6801
6632 register_tracer(&nop_trace); 6802 register_tracer(&nop_trace);
6633 6803
6634 /* All seems OK, enable tracing */ 6804 /* All seems OK, enable tracing */
@@ -6656,13 +6826,11 @@ __init static int tracer_alloc_buffers(void)
6656 6826
6657 return 0; 6827 return 0;
6658 6828
6829out_free_savedcmd:
6830 free_saved_cmdlines_buffer(savedcmd);
6659out_free_temp_buffer: 6831out_free_temp_buffer:
6660 ring_buffer_free(temp_buffer); 6832 ring_buffer_free(temp_buffer);
6661out_free_cpumask: 6833out_free_cpumask:
6662 free_percpu(global_trace.trace_buffer.data);
6663#ifdef CONFIG_TRACER_MAX_TRACE
6664 free_percpu(global_trace.max_buffer.data);
6665#endif
6666 free_cpumask_var(global_trace.tracing_cpumask); 6834 free_cpumask_var(global_trace.tracing_cpumask);
6667out_free_buffer_mask: 6835out_free_buffer_mask:
6668 free_cpumask_var(tracing_buffer_mask); 6836 free_cpumask_var(tracing_buffer_mask);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 2e29d7ba5a52..9258f5a815db 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -190,7 +190,22 @@ struct trace_array {
190 */ 190 */
191 struct trace_buffer max_buffer; 191 struct trace_buffer max_buffer;
192 bool allocated_snapshot; 192 bool allocated_snapshot;
193 unsigned long max_latency;
193#endif 194#endif
195 /*
196 * max_lock is used to protect the swapping of buffers
197 * when taking a max snapshot. The buffers themselves are
198 * protected by per_cpu spinlocks. But the action of the swap
199 * needs its own lock.
200 *
201 * This is defined as a arch_spinlock_t in order to help
202 * with performance when lockdep debugging is enabled.
203 *
204 * It is also used in other places outside the update_max_tr
205 * so it needs to be defined outside of the
206 * CONFIG_TRACER_MAX_TRACE.
207 */
208 arch_spinlock_t max_lock;
194 int buffer_disabled; 209 int buffer_disabled;
195#ifdef CONFIG_FTRACE_SYSCALLS 210#ifdef CONFIG_FTRACE_SYSCALLS
196 int sys_refcount_enter; 211 int sys_refcount_enter;
@@ -237,6 +252,9 @@ static inline struct trace_array *top_trace_array(void)
237{ 252{
238 struct trace_array *tr; 253 struct trace_array *tr;
239 254
255 if (list_empty(&ftrace_trace_arrays))
256 return NULL;
257
240 tr = list_entry(ftrace_trace_arrays.prev, 258 tr = list_entry(ftrace_trace_arrays.prev,
241 typeof(*tr), list); 259 typeof(*tr), list);
242 WARN_ON(!(tr->flags & TRACE_ARRAY_FL_GLOBAL)); 260 WARN_ON(!(tr->flags & TRACE_ARRAY_FL_GLOBAL));
@@ -323,7 +341,6 @@ struct tracer_flags {
323 * @stop: called when tracing is paused (echo 0 > tracing_enabled) 341 * @stop: called when tracing is paused (echo 0 > tracing_enabled)
324 * @open: called when the trace file is opened 342 * @open: called when the trace file is opened
325 * @pipe_open: called when the trace_pipe file is opened 343 * @pipe_open: called when the trace_pipe file is opened
326 * @wait_pipe: override how the user waits for traces on trace_pipe
327 * @close: called when the trace file is released 344 * @close: called when the trace file is released
328 * @pipe_close: called when the trace_pipe file is released 345 * @pipe_close: called when the trace_pipe file is released
329 * @read: override the default read callback on trace_pipe 346 * @read: override the default read callback on trace_pipe
@@ -342,7 +359,6 @@ struct tracer {
342 void (*stop)(struct trace_array *tr); 359 void (*stop)(struct trace_array *tr);
343 void (*open)(struct trace_iterator *iter); 360 void (*open)(struct trace_iterator *iter);
344 void (*pipe_open)(struct trace_iterator *iter); 361 void (*pipe_open)(struct trace_iterator *iter);
345 void (*wait_pipe)(struct trace_iterator *iter);
346 void (*close)(struct trace_iterator *iter); 362 void (*close)(struct trace_iterator *iter);
347 void (*pipe_close)(struct trace_iterator *iter); 363 void (*pipe_close)(struct trace_iterator *iter);
348 ssize_t (*read)(struct trace_iterator *iter, 364 ssize_t (*read)(struct trace_iterator *iter,
@@ -416,13 +432,7 @@ enum {
416 TRACE_FTRACE_IRQ_BIT, 432 TRACE_FTRACE_IRQ_BIT,
417 TRACE_FTRACE_SIRQ_BIT, 433 TRACE_FTRACE_SIRQ_BIT,
418 434
419 /* GLOBAL_BITs must be greater than FTRACE_BITs */ 435 /* INTERNAL_BITs must be greater than FTRACE_BITs */
420 TRACE_GLOBAL_BIT,
421 TRACE_GLOBAL_NMI_BIT,
422 TRACE_GLOBAL_IRQ_BIT,
423 TRACE_GLOBAL_SIRQ_BIT,
424
425 /* INTERNAL_BITs must be greater than GLOBAL_BITs */
426 TRACE_INTERNAL_BIT, 436 TRACE_INTERNAL_BIT,
427 TRACE_INTERNAL_NMI_BIT, 437 TRACE_INTERNAL_NMI_BIT,
428 TRACE_INTERNAL_IRQ_BIT, 438 TRACE_INTERNAL_IRQ_BIT,
@@ -449,9 +459,6 @@ enum {
449#define TRACE_FTRACE_START TRACE_FTRACE_BIT 459#define TRACE_FTRACE_START TRACE_FTRACE_BIT
450#define TRACE_FTRACE_MAX ((1 << (TRACE_FTRACE_START + TRACE_CONTEXT_BITS)) - 1) 460#define TRACE_FTRACE_MAX ((1 << (TRACE_FTRACE_START + TRACE_CONTEXT_BITS)) - 1)
451 461
452#define TRACE_GLOBAL_START TRACE_GLOBAL_BIT
453#define TRACE_GLOBAL_MAX ((1 << (TRACE_GLOBAL_START + TRACE_CONTEXT_BITS)) - 1)
454
455#define TRACE_LIST_START TRACE_INTERNAL_BIT 462#define TRACE_LIST_START TRACE_INTERNAL_BIT
456#define TRACE_LIST_MAX ((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1) 463#define TRACE_LIST_MAX ((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1)
457 464
@@ -560,8 +567,6 @@ void trace_init_global_iter(struct trace_iterator *iter);
560 567
561void tracing_iter_reset(struct trace_iterator *iter, int cpu); 568void tracing_iter_reset(struct trace_iterator *iter, int cpu);
562 569
563void poll_wait_pipe(struct trace_iterator *iter);
564
565void tracing_sched_switch_trace(struct trace_array *tr, 570void tracing_sched_switch_trace(struct trace_array *tr,
566 struct task_struct *prev, 571 struct task_struct *prev,
567 struct task_struct *next, 572 struct task_struct *next,
@@ -608,8 +613,6 @@ extern unsigned long nsecs_to_usecs(unsigned long nsecs);
608extern unsigned long tracing_thresh; 613extern unsigned long tracing_thresh;
609 614
610#ifdef CONFIG_TRACER_MAX_TRACE 615#ifdef CONFIG_TRACER_MAX_TRACE
611extern unsigned long tracing_max_latency;
612
613void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); 616void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
614void update_max_tr_single(struct trace_array *tr, 617void update_max_tr_single(struct trace_array *tr,
615 struct task_struct *tsk, int cpu); 618 struct task_struct *tsk, int cpu);
@@ -724,6 +727,8 @@ extern unsigned long trace_flags;
724#define TRACE_GRAPH_PRINT_PROC 0x8 727#define TRACE_GRAPH_PRINT_PROC 0x8
725#define TRACE_GRAPH_PRINT_DURATION 0x10 728#define TRACE_GRAPH_PRINT_DURATION 0x10
726#define TRACE_GRAPH_PRINT_ABS_TIME 0x20 729#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
730#define TRACE_GRAPH_PRINT_IRQS 0x40
731#define TRACE_GRAPH_PRINT_TAIL 0x80
727#define TRACE_GRAPH_PRINT_FILL_SHIFT 28 732#define TRACE_GRAPH_PRINT_FILL_SHIFT 28
728#define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT) 733#define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT)
729 734
@@ -823,6 +828,10 @@ extern int ftrace_is_dead(void);
823int ftrace_create_function_files(struct trace_array *tr, 828int ftrace_create_function_files(struct trace_array *tr,
824 struct dentry *parent); 829 struct dentry *parent);
825void ftrace_destroy_function_files(struct trace_array *tr); 830void ftrace_destroy_function_files(struct trace_array *tr);
831void ftrace_init_global_array_ops(struct trace_array *tr);
832void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func);
833void ftrace_reset_array_ops(struct trace_array *tr);
834int using_ftrace_ops_list_func(void);
826#else 835#else
827static inline int ftrace_trace_task(struct task_struct *task) 836static inline int ftrace_trace_task(struct task_struct *task)
828{ 837{
@@ -836,6 +845,11 @@ ftrace_create_function_files(struct trace_array *tr,
836 return 0; 845 return 0;
837} 846}
838static inline void ftrace_destroy_function_files(struct trace_array *tr) { } 847static inline void ftrace_destroy_function_files(struct trace_array *tr) { }
848static inline __init void
849ftrace_init_global_array_ops(struct trace_array *tr) { }
850static inline void ftrace_reset_array_ops(struct trace_array *tr) { }
 851/* ftrace_func_t type is not defined, use macro instead of static inline */
852#define ftrace_init_array_ops(tr, func) do { } while (0)
839#endif /* CONFIG_FUNCTION_TRACER */ 853#endif /* CONFIG_FUNCTION_TRACER */
840 854
841#if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE) 855#if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE)
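
Since max_lock is an arch_spinlock_t rather than a spinlock_t, taking it does not disable preemption or interrupts by itself; every user in this patch brackets it explicitly. A condensed sketch of the pattern (swap details elided):

static void example_swap_max(struct trace_array *tr)
{
	local_irq_disable();		/* arch_spin_lock() will not do this */
	arch_spin_lock(&tr->max_lock);

	/* ... swap tr->trace_buffer.buffer and tr->max_buffer.buffer ... */

	arch_spin_unlock(&tr->max_lock);
	local_irq_enable();
}
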
diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c
new file mode 100644
index 000000000000..40a14cbcf8e0
--- /dev/null
+++ b/kernel/trace/trace_benchmark.c
@@ -0,0 +1,198 @@
1#include <linux/delay.h>
2#include <linux/module.h>
3#include <linux/kthread.h>
4#include <linux/trace_clock.h>
5
6#define CREATE_TRACE_POINTS
7#include "trace_benchmark.h"
8
9static struct task_struct *bm_event_thread;
10
11static char bm_str[BENCHMARK_EVENT_STRLEN] = "START";
12
13static u64 bm_total;
14static u64 bm_totalsq;
15static u64 bm_last;
16static u64 bm_max;
17static u64 bm_min;
18static u64 bm_first;
19static u64 bm_cnt;
20static u64 bm_stddev;
21static unsigned int bm_avg;
22static unsigned int bm_std;
23
24/*
25 * This gets called in a loop recording the time it took to write
26 * the tracepoint. What it writes is the time statistics of the last
27 * tracepoint write. As there is nothing to write the first time
28 * it simply writes "START". As the first write is cold cache and
29 * the rest is hot, we save off that time in bm_first and it is
30 * reported as "first", which is shown in the second write to the
31 * tracepoint. The "first" field is writen within the statics from
32 * then on but never changes.
33 */
34static void trace_do_benchmark(void)
35{
36 u64 start;
37 u64 stop;
38 u64 delta;
39 u64 stddev;
40 u64 seed;
41 u64 last_seed;
42 unsigned int avg;
43 unsigned int std = 0;
44
45 /* Only run if the tracepoint is actually active */
46 if (!trace_benchmark_event_enabled())
47 return;
48
49 local_irq_disable();
50 start = trace_clock_local();
51 trace_benchmark_event(bm_str);
52 stop = trace_clock_local();
53 local_irq_enable();
54
55 bm_cnt++;
56
57 delta = stop - start;
58
59 /*
60 * The first read is cold cached, keep it separate from the
61 * other calculations.
62 */
63 if (bm_cnt == 1) {
64 bm_first = delta;
65 scnprintf(bm_str, BENCHMARK_EVENT_STRLEN,
66 "first=%llu [COLD CACHED]", bm_first);
67 return;
68 }
69
70 bm_last = delta;
71
72 if (delta > bm_max)
73 bm_max = delta;
74 if (!bm_min || delta < bm_min)
75 bm_min = delta;
76
77 /*
78 * When bm_cnt is greater than UINT_MAX, it breaks the statistics
79 * accounting. Freeze the statistics when that happens.
80 * We should have enough data for the avg and stddev anyway.
81 */
82 if (bm_cnt > UINT_MAX) {
83 scnprintf(bm_str, BENCHMARK_EVENT_STRLEN,
84 "last=%llu first=%llu max=%llu min=%llu ** avg=%u std=%d std^2=%lld",
85 bm_last, bm_first, bm_max, bm_min, bm_avg, bm_std, bm_stddev);
86 return;
87 }
88
89 bm_total += delta;
90 bm_totalsq += delta * delta;
91
92
93 if (bm_cnt > 1) {
94 /*
95 * Compute the sample variance from the running sums:
96 * s^2 = 1 / (n * (n-1)) * (n * \Sum (x_i)^2 - (\Sum x_i)^2)
97 */
98 stddev = (u64)bm_cnt * bm_totalsq - bm_total * bm_total;
99 do_div(stddev, (u32)bm_cnt);
100 do_div(stddev, (u32)bm_cnt - 1);
101 } else
102 stddev = 0;
103
104 delta = bm_total;
105 do_div(delta, bm_cnt);
106 avg = delta;
107
108 if (stddev > 0) {
109 int i = 0;
110 /*
111 * stddev is the square of standard deviation but
112 * we want the actual number. Use the average
113 * as our seed to find the std.
114 *
115 * The next try is:
116 * x = (x + N/x) / 2
117 *
118 * Where N is the squared number to find the square
119 * root of.
120 */
121 seed = avg;
122 do {
123 last_seed = seed;
124 seed = stddev;
125 if (!last_seed)
126 break;
127 do_div(seed, last_seed);
128 seed += last_seed;
129 do_div(seed, 2);
130 } while (i++ < 10 && last_seed != seed);
131
132 std = seed;
133 }
134
135 scnprintf(bm_str, BENCHMARK_EVENT_STRLEN,
136 "last=%llu first=%llu max=%llu min=%llu avg=%u std=%d std^2=%lld",
137 bm_last, bm_first, bm_max, bm_min, avg, std, stddev);
138
139 bm_std = std;
140 bm_avg = avg;
141 bm_stddev = stddev;
142}
143
144static int benchmark_event_kthread(void *arg)
145{
146 /* sleep a bit to make sure the tracepoint gets activated */
147 msleep(100);
148
149 while (!kthread_should_stop()) {
150
151 trace_do_benchmark();
152
153 /*
154 * We don't go to sleep, but let others
155 * run as well.
156 */
157 cond_resched();
158 }
159
160 return 0;
161}
162
163/*
164 * When the benchmark tracepoint is enabled, it calls this
165 * function and the thread that calls the tracepoint is created.
166 */
167void trace_benchmark_reg(void)
168{
169 bm_event_thread = kthread_run(benchmark_event_kthread,
170 NULL, "event_benchmark");
171 WARN_ON(!bm_event_thread);
172}
173
174/*
175 * When the benchmark tracepoint is disabled, it calls this
176 * function and the thread that calls the tracepoint is deleted
177 * and all the numbers are reset.
178 */
179void trace_benchmark_unreg(void)
180{
181 if (!bm_event_thread)
182 return;
183
184 kthread_stop(bm_event_thread);
185
186 strcpy(bm_str, "START");
187 bm_total = 0;
188 bm_totalsq = 0;
189 bm_last = 0;
190 bm_max = 0;
191 bm_min = 0;
192 bm_cnt = 0;
193 /* These don't need to be reset but reset them anyway */
194 bm_first = 0;
195 bm_std = 0;
196 bm_avg = 0;
197 bm_stddev = 0;
198}
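
The statistics kept by trace_do_benchmark() above can be sanity-checked outside the kernel: the loop maintains a running sum and sum of squares, derives the sample variance as s^2 = (n * sum(x^2) - (sum x)^2) / (n * (n - 1)), and then approximates the square root with the seeded x = (x + N/x) / 2 iteration. Below is a minimal user-space sketch of the same arithmetic; it is an illustration only, not part of this patch, and the sample latencies are made up.

#include <stdio.h>
#include <stdint.h>

/* Same seeded iteration as trace_do_benchmark(): x = (x + N/x) / 2, capped at ~10 rounds. */
static uint64_t isqrt_seeded(uint64_t n, uint64_t seed)
{
	uint64_t last, x = seed ? seed : 1;
	int i = 0;

	do {
		last = x;
		x = (n / last + last) / 2;
	} while (i++ < 10 && x != last);

	return x;
}

int main(void)
{
	/* Pretend these are tracepoint write latencies in nanoseconds. */
	const uint64_t samples[] = { 120, 135, 128, 400, 126, 131 };
	uint64_t total = 0, totalsq = 0;
	uint64_t n = sizeof(samples) / sizeof(samples[0]);

	for (uint64_t i = 0; i < n; i++) {
		total += samples[i];
		totalsq += samples[i] * samples[i];
	}

	uint64_t avg = total / n;
	/* s^2 = (n * sum(x^2) - (sum x)^2) / (n * (n - 1)) */
	uint64_t var = (n * totalsq - total * total) / (n * (n - 1));

	printf("avg=%llu std=%llu std^2=%llu\n",
	       (unsigned long long)avg,
	       (unsigned long long)isqrt_seeded(var, avg),
	       (unsigned long long)var);
	return 0;
}
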
diff --git a/kernel/trace/trace_benchmark.h b/kernel/trace/trace_benchmark.h
new file mode 100644
index 000000000000..3c1df1df4e29
--- /dev/null
+++ b/kernel/trace/trace_benchmark.h
@@ -0,0 +1,41 @@
1#undef TRACE_SYSTEM
2#define TRACE_SYSTEM benchmark
3
4#if !defined(_TRACE_BENCHMARK_H) || defined(TRACE_HEADER_MULTI_READ)
5#define _TRACE_BENCHMARK_H
6
7#include <linux/tracepoint.h>
8
9extern void trace_benchmark_reg(void);
10extern void trace_benchmark_unreg(void);
11
12#define BENCHMARK_EVENT_STRLEN 128
13
14TRACE_EVENT_FN(benchmark_event,
15
16 TP_PROTO(const char *str),
17
18 TP_ARGS(str),
19
20 TP_STRUCT__entry(
21 __array( char, str, BENCHMARK_EVENT_STRLEN )
22 ),
23
24 TP_fast_assign(
25 memcpy(__entry->str, str, BENCHMARK_EVENT_STRLEN);
26 ),
27
28 TP_printk("%s", __entry->str),
29
30 trace_benchmark_reg, trace_benchmark_unreg
31);
32
33#endif /* _TRACE_BENCHMARK_H */
34
35#undef TRACE_INCLUDE_FILE
36#undef TRACE_INCLUDE_PATH
37#define TRACE_INCLUDE_PATH .
38#define TRACE_INCLUDE_FILE trace_benchmark
39
40/* This part must be outside protection */
41#include <trace/define_trace.h>
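
With the header above pulled in under CREATE_TRACE_POINTS, the new event shows up in tracefs as events/benchmark/benchmark_event, and enabling it is what triggers trace_benchmark_reg() and spawns the event_benchmark kthread. The following user-space sketch drives it end to end; the /sys/kernel/debug/tracing path and the five-line read are assumptions for illustration, so adjust for wherever tracefs is mounted on your system.

#include <stdio.h>

/* Assumed tracing directory; tracefs may be mounted elsewhere. */
#define TRACEDIR "/sys/kernel/debug/tracing"

static int write_str(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fputs(val, f);
	fclose(f);
	return 0;
}

int main(void)
{
	char line[512];
	FILE *pipe;
	int i;

	/* Enabling the event calls trace_benchmark_reg() in the kernel. */
	if (write_str(TRACEDIR "/events/benchmark/benchmark_event/enable", "1"))
		return 1;

	/* Stream a few benchmark_event lines carrying the running statistics. */
	pipe = fopen(TRACEDIR "/trace_pipe", "r");
	if (pipe) {
		for (i = 0; i < 5 && fgets(line, sizeof(line), pipe); i++)
			fputs(line, stdout);
		fclose(pipe);
	}

	/* Disabling stops the kthread and resets the numbers (trace_benchmark_unreg). */
	write_str(TRACEDIR "/events/benchmark/benchmark_event/enable", "0");
	return 0;
}
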
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index c894614de14d..5d12bb407b44 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -248,8 +248,8 @@ void perf_trace_del(struct perf_event *p_event, int flags)
248 tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event); 248 tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event);
249} 249}
250 250
251__kprobes void *perf_trace_buf_prepare(int size, unsigned short type, 251void *perf_trace_buf_prepare(int size, unsigned short type,
252 struct pt_regs *regs, int *rctxp) 252 struct pt_regs *regs, int *rctxp)
253{ 253{
254 struct trace_entry *entry; 254 struct trace_entry *entry;
255 unsigned long flags; 255 unsigned long flags;
@@ -281,6 +281,7 @@ __kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
281 return raw_data; 281 return raw_data;
282} 282}
283EXPORT_SYMBOL_GPL(perf_trace_buf_prepare); 283EXPORT_SYMBOL_GPL(perf_trace_buf_prepare);
284NOKPROBE_SYMBOL(perf_trace_buf_prepare);
284 285
285#ifdef CONFIG_FUNCTION_TRACER 286#ifdef CONFIG_FUNCTION_TRACER
286static void 287static void
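
The perf_trace_buf_prepare() change above is the first of many conversions in this series: the __kprobes annotation is dropped from the definition and the function is instead listed with NOKPROBE_SYMBOL() after its body, which records its address in the kprobe blacklist rather than relocating the whole function into a special text section. A rough sketch of the idiom with a hypothetical helper (not taken from this patch):

#include <linux/kprobes.h>
#include <linux/ptrace.h>

/*
 * Hypothetical helper that may run from kprobe context and therefore
 * must never be probed itself.
 */
static int my_probe_helper(struct pt_regs *regs)
{
	return regs && instruction_pointer(regs) != 0;
}
/*
 * Old style: "static int __kprobes my_probe_helper(...)".
 * New style: plain definition plus an explicit blacklist entry.
 */
NOKPROBE_SYMBOL(my_probe_helper);

Static inline helpers get the nokprobe_inline variant instead, as can be seen in the trace_kprobe.c and trace_probe.h hunks further down.
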
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 3ddfd8f62c05..f99e0b3bca8c 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -574,6 +574,9 @@ int trace_set_clr_event(const char *system, const char *event, int set)
574{ 574{
575 struct trace_array *tr = top_trace_array(); 575 struct trace_array *tr = top_trace_array();
576 576
577 if (!tr)
578 return -ENODEV;
579
577 return __ftrace_set_clr_event(tr, NULL, system, event, set); 580 return __ftrace_set_clr_event(tr, NULL, system, event, set);
578} 581}
579EXPORT_SYMBOL_GPL(trace_set_clr_event); 582EXPORT_SYMBOL_GPL(trace_set_clr_event);
@@ -2065,6 +2068,9 @@ event_enable_func(struct ftrace_hash *hash,
2065 bool enable; 2068 bool enable;
2066 int ret; 2069 int ret;
2067 2070
2071 if (!tr)
2072 return -ENODEV;
2073
2068 /* hash funcs only work with set_ftrace_filter */ 2074 /* hash funcs only work with set_ftrace_filter */
2069 if (!enabled || !param) 2075 if (!enabled || !param)
2070 return -EINVAL; 2076 return -EINVAL;
@@ -2396,6 +2402,9 @@ static __init int event_trace_enable(void)
2396 char *token; 2402 char *token;
2397 int ret; 2403 int ret;
2398 2404
2405 if (!tr)
2406 return -ENODEV;
2407
2399 for_each_event(iter, __start_ftrace_events, __stop_ftrace_events) { 2408 for_each_event(iter, __start_ftrace_events, __stop_ftrace_events) {
2400 2409
2401 call = *iter; 2410 call = *iter;
@@ -2442,6 +2451,8 @@ static __init int event_trace_init(void)
2442 int ret; 2451 int ret;
2443 2452
2444 tr = top_trace_array(); 2453 tr = top_trace_array();
2454 if (!tr)
2455 return -ENODEV;
2445 2456
2446 d_tracer = tracing_init_dentry(); 2457 d_tracer = tracing_init_dentry();
2447 if (!d_tracer) 2458 if (!d_tracer)
@@ -2535,6 +2546,8 @@ static __init void event_trace_self_tests(void)
2535 int ret; 2546 int ret;
2536 2547
2537 tr = top_trace_array(); 2548 tr = top_trace_array();
2549 if (!tr)
2550 return;
2538 2551
2539 pr_info("Running tests on trace events:\n"); 2552 pr_info("Running tests on trace events:\n");
2540 2553
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index ffd56351b521..57f0ec962d2c 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -26,8 +26,6 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,
26static void 26static void
27function_stack_trace_call(unsigned long ip, unsigned long parent_ip, 27function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
28 struct ftrace_ops *op, struct pt_regs *pt_regs); 28 struct ftrace_ops *op, struct pt_regs *pt_regs);
29static struct ftrace_ops trace_ops;
30static struct ftrace_ops trace_stack_ops;
31static struct tracer_flags func_flags; 29static struct tracer_flags func_flags;
32 30
33/* Our option */ 31/* Our option */
@@ -83,28 +81,24 @@ void ftrace_destroy_function_files(struct trace_array *tr)
83 81
84static int function_trace_init(struct trace_array *tr) 82static int function_trace_init(struct trace_array *tr)
85{ 83{
86 struct ftrace_ops *ops; 84 ftrace_func_t func;
87
88 if (tr->flags & TRACE_ARRAY_FL_GLOBAL) {
89 /* There's only one global tr */
90 if (!trace_ops.private) {
91 trace_ops.private = tr;
92 trace_stack_ops.private = tr;
93 }
94 85
95 if (func_flags.val & TRACE_FUNC_OPT_STACK) 86 /*
96 ops = &trace_stack_ops; 87 * Instance trace_arrays get their ops allocated
97 else 88 * at instance creation. Unless it failed
98 ops = &trace_ops; 89 * the allocation.
99 tr->ops = ops; 90 */
100 } else if (!tr->ops) { 91 if (!tr->ops)
101 /*
102 * Instance trace_arrays get their ops allocated
103 * at instance creation. Unless it failed
104 * the allocation.
105 */
106 return -ENOMEM; 92 return -ENOMEM;
107 } 93
94 /* Currently only the global instance can do stack tracing */
95 if (tr->flags & TRACE_ARRAY_FL_GLOBAL &&
96 func_flags.val & TRACE_FUNC_OPT_STACK)
97 func = function_stack_trace_call;
98 else
99 func = function_trace_call;
100
101 ftrace_init_array_ops(tr, func);
108 102
109 tr->trace_buffer.cpu = get_cpu(); 103 tr->trace_buffer.cpu = get_cpu();
110 put_cpu(); 104 put_cpu();
@@ -118,6 +112,7 @@ static void function_trace_reset(struct trace_array *tr)
118{ 112{
119 tracing_stop_function_trace(tr); 113 tracing_stop_function_trace(tr);
120 tracing_stop_cmdline_record(); 114 tracing_stop_cmdline_record();
115 ftrace_reset_array_ops(tr);
121} 116}
122 117
123static void function_trace_start(struct trace_array *tr) 118static void function_trace_start(struct trace_array *tr)
@@ -199,18 +194,6 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
199 local_irq_restore(flags); 194 local_irq_restore(flags);
200} 195}
201 196
202static struct ftrace_ops trace_ops __read_mostly =
203{
204 .func = function_trace_call,
205 .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE,
206};
207
208static struct ftrace_ops trace_stack_ops __read_mostly =
209{
210 .func = function_stack_trace_call,
211 .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE,
212};
213
214static struct tracer_opt func_opts[] = { 197static struct tracer_opt func_opts[] = {
215#ifdef CONFIG_STACKTRACE 198#ifdef CONFIG_STACKTRACE
216 { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) }, 199 { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) },
@@ -248,10 +231,10 @@ func_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
248 unregister_ftrace_function(tr->ops); 231 unregister_ftrace_function(tr->ops);
249 232
250 if (set) { 233 if (set) {
251 tr->ops = &trace_stack_ops; 234 tr->ops->func = function_stack_trace_call;
252 register_ftrace_function(tr->ops); 235 register_ftrace_function(tr->ops);
253 } else { 236 } else {
254 tr->ops = &trace_ops; 237 tr->ops->func = function_trace_call;
255 register_ftrace_function(tr->ops); 238 register_ftrace_function(tr->ops);
256 } 239 }
257 240
@@ -269,7 +252,6 @@ static struct tracer function_trace __tracer_data =
269 .init = function_trace_init, 252 .init = function_trace_init,
270 .reset = function_trace_reset, 253 .reset = function_trace_reset,
271 .start = function_trace_start, 254 .start = function_trace_start,
272 .wait_pipe = poll_wait_pipe,
273 .flags = &func_flags, 255 .flags = &func_flags,
274 .set_flag = func_set_flag, 256 .set_flag = func_set_flag,
275 .allow_instances = true, 257 .allow_instances = true,
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index deff11200261..4de3e57f723c 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -38,15 +38,6 @@ struct fgraph_data {
38 38
39#define TRACE_GRAPH_INDENT 2 39#define TRACE_GRAPH_INDENT 2
40 40
41/* Flag options */
42#define TRACE_GRAPH_PRINT_OVERRUN 0x1
43#define TRACE_GRAPH_PRINT_CPU 0x2
44#define TRACE_GRAPH_PRINT_OVERHEAD 0x4
45#define TRACE_GRAPH_PRINT_PROC 0x8
46#define TRACE_GRAPH_PRINT_DURATION 0x10
47#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
48#define TRACE_GRAPH_PRINT_IRQS 0x40
49
50static unsigned int max_depth; 41static unsigned int max_depth;
51 42
52static struct tracer_opt trace_opts[] = { 43static struct tracer_opt trace_opts[] = {
@@ -64,11 +55,13 @@ static struct tracer_opt trace_opts[] = {
64 { TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) }, 55 { TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) },
65 /* Display interrupts */ 56 /* Display interrupts */
66 { TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) }, 57 { TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) },
58 /* Display function name after trailing } */
59 { TRACER_OPT(funcgraph-tail, TRACE_GRAPH_PRINT_TAIL) },
67 { } /* Empty entry */ 60 { } /* Empty entry */
68}; 61};
69 62
70static struct tracer_flags tracer_flags = { 63static struct tracer_flags tracer_flags = {
71 /* Don't display overruns and proc by default */ 64 /* Don't display overruns, proc, or tail by default */
72 .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD | 65 .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD |
73 TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS, 66 TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS,
74 .opts = trace_opts 67 .opts = trace_opts
@@ -1176,9 +1169,10 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
1176 * If the return function does not have a matching entry, 1169 * If the return function does not have a matching entry,
1177 * then the entry was lost. Instead of just printing 1170 * then the entry was lost. Instead of just printing
1178 * the '}' and letting the user guess what function this 1171 * the '}' and letting the user guess what function this
1179 * belongs to, write out the function name. 1172 * belongs to, write out the function name. Always do
1173 * that if the funcgraph-tail option is enabled.
1180 */ 1174 */
1181 if (func_match) { 1175 if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL)) {
1182 ret = trace_seq_puts(s, "}\n"); 1176 ret = trace_seq_puts(s, "}\n");
1183 if (!ret) 1177 if (!ret)
1184 return TRACE_TYPE_PARTIAL_LINE; 1178 return TRACE_TYPE_PARTIAL_LINE;
@@ -1505,7 +1499,6 @@ static struct tracer graph_trace __tracer_data = {
1505 .pipe_open = graph_trace_open, 1499 .pipe_open = graph_trace_open,
1506 .close = graph_trace_close, 1500 .close = graph_trace_close,
1507 .pipe_close = graph_trace_close, 1501 .pipe_close = graph_trace_close,
1508 .wait_pipe = poll_wait_pipe,
1509 .init = graph_trace_init, 1502 .init = graph_trace_init,
1510 .reset = graph_trace_reset, 1503 .reset = graph_trace_reset,
1511 .print_line = print_graph_function, 1504 .print_line = print_graph_function,
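
A note on the new funcgraph-tail option added above: when it is set, the function_graph output keeps the function name on the closing line instead of a bare brace, so the tail of a traced function looks roughly like

  2)   1.234 us    |  } /* kmem_cache_free */

(the timing and function name here are illustrative). The default flag mask leaves the option off, so existing output is unchanged unless funcgraph-tail is explicitly enabled.
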
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 8ff02cbb892f..9bb104f748d0 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -151,12 +151,6 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip,
151 151
152 atomic_dec(&data->disabled); 152 atomic_dec(&data->disabled);
153} 153}
154
155static struct ftrace_ops trace_ops __read_mostly =
156{
157 .func = irqsoff_tracer_call,
158 .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE,
159};
160#endif /* CONFIG_FUNCTION_TRACER */ 154#endif /* CONFIG_FUNCTION_TRACER */
161 155
162#ifdef CONFIG_FUNCTION_GRAPH_TRACER 156#ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -176,7 +170,7 @@ irqsoff_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
176 for_each_possible_cpu(cpu) 170 for_each_possible_cpu(cpu)
177 per_cpu(tracing_cpu, cpu) = 0; 171 per_cpu(tracing_cpu, cpu) = 0;
178 172
179 tracing_max_latency = 0; 173 tr->max_latency = 0;
180 tracing_reset_online_cpus(&irqsoff_trace->trace_buffer); 174 tracing_reset_online_cpus(&irqsoff_trace->trace_buffer);
181 175
182 return start_irqsoff_tracer(irqsoff_trace, set); 176 return start_irqsoff_tracer(irqsoff_trace, set);
@@ -303,13 +297,13 @@ static void irqsoff_print_header(struct seq_file *s)
303/* 297/*
304 * Should this new latency be reported/recorded? 298 * Should this new latency be reported/recorded?
305 */ 299 */
306static int report_latency(cycle_t delta) 300static int report_latency(struct trace_array *tr, cycle_t delta)
307{ 301{
308 if (tracing_thresh) { 302 if (tracing_thresh) {
309 if (delta < tracing_thresh) 303 if (delta < tracing_thresh)
310 return 0; 304 return 0;
311 } else { 305 } else {
312 if (delta <= tracing_max_latency) 306 if (delta <= tr->max_latency)
313 return 0; 307 return 0;
314 } 308 }
315 return 1; 309 return 1;
@@ -333,13 +327,13 @@ check_critical_timing(struct trace_array *tr,
333 327
334 pc = preempt_count(); 328 pc = preempt_count();
335 329
336 if (!report_latency(delta)) 330 if (!report_latency(tr, delta))
337 goto out; 331 goto out;
338 332
339 raw_spin_lock_irqsave(&max_trace_lock, flags); 333 raw_spin_lock_irqsave(&max_trace_lock, flags);
340 334
341 /* check if we are still the max latency */ 335 /* check if we are still the max latency */
342 if (!report_latency(delta)) 336 if (!report_latency(tr, delta))
343 goto out_unlock; 337 goto out_unlock;
344 338
345 __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); 339 __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
@@ -352,7 +346,7 @@ check_critical_timing(struct trace_array *tr,
352 data->critical_end = parent_ip; 346 data->critical_end = parent_ip;
353 347
354 if (likely(!is_tracing_stopped())) { 348 if (likely(!is_tracing_stopped())) {
355 tracing_max_latency = delta; 349 tr->max_latency = delta;
356 update_max_tr_single(tr, current, cpu); 350 update_max_tr_single(tr, current, cpu);
357 } 351 }
358 352
@@ -531,7 +525,7 @@ void trace_preempt_off(unsigned long a0, unsigned long a1)
531} 525}
532#endif /* CONFIG_PREEMPT_TRACER */ 526#endif /* CONFIG_PREEMPT_TRACER */
533 527
534static int register_irqsoff_function(int graph, int set) 528static int register_irqsoff_function(struct trace_array *tr, int graph, int set)
535{ 529{
536 int ret; 530 int ret;
537 531
@@ -543,7 +537,7 @@ static int register_irqsoff_function(int graph, int set)
543 ret = register_ftrace_graph(&irqsoff_graph_return, 537 ret = register_ftrace_graph(&irqsoff_graph_return,
544 &irqsoff_graph_entry); 538 &irqsoff_graph_entry);
545 else 539 else
546 ret = register_ftrace_function(&trace_ops); 540 ret = register_ftrace_function(tr->ops);
547 541
548 if (!ret) 542 if (!ret)
549 function_enabled = true; 543 function_enabled = true;
@@ -551,7 +545,7 @@ static int register_irqsoff_function(int graph, int set)
551 return ret; 545 return ret;
552} 546}
553 547
554static void unregister_irqsoff_function(int graph) 548static void unregister_irqsoff_function(struct trace_array *tr, int graph)
555{ 549{
556 if (!function_enabled) 550 if (!function_enabled)
557 return; 551 return;
@@ -559,17 +553,17 @@ static void unregister_irqsoff_function(int graph)
559 if (graph) 553 if (graph)
560 unregister_ftrace_graph(); 554 unregister_ftrace_graph();
561 else 555 else
562 unregister_ftrace_function(&trace_ops); 556 unregister_ftrace_function(tr->ops);
563 557
564 function_enabled = false; 558 function_enabled = false;
565} 559}
566 560
567static void irqsoff_function_set(int set) 561static void irqsoff_function_set(struct trace_array *tr, int set)
568{ 562{
569 if (set) 563 if (set)
570 register_irqsoff_function(is_graph(), 1); 564 register_irqsoff_function(tr, is_graph(), 1);
571 else 565 else
572 unregister_irqsoff_function(is_graph()); 566 unregister_irqsoff_function(tr, is_graph());
573} 567}
574 568
575static int irqsoff_flag_changed(struct trace_array *tr, u32 mask, int set) 569static int irqsoff_flag_changed(struct trace_array *tr, u32 mask, int set)
@@ -577,7 +571,7 @@ static int irqsoff_flag_changed(struct trace_array *tr, u32 mask, int set)
577 struct tracer *tracer = tr->current_trace; 571 struct tracer *tracer = tr->current_trace;
578 572
579 if (mask & TRACE_ITER_FUNCTION) 573 if (mask & TRACE_ITER_FUNCTION)
580 irqsoff_function_set(set); 574 irqsoff_function_set(tr, set);
581 575
582 return trace_keep_overwrite(tracer, mask, set); 576 return trace_keep_overwrite(tracer, mask, set);
583} 577}
@@ -586,7 +580,7 @@ static int start_irqsoff_tracer(struct trace_array *tr, int graph)
586{ 580{
587 int ret; 581 int ret;
588 582
589 ret = register_irqsoff_function(graph, 0); 583 ret = register_irqsoff_function(tr, graph, 0);
590 584
591 if (!ret && tracing_is_enabled()) 585 if (!ret && tracing_is_enabled())
592 tracer_enabled = 1; 586 tracer_enabled = 1;
@@ -600,25 +594,37 @@ static void stop_irqsoff_tracer(struct trace_array *tr, int graph)
600{ 594{
601 tracer_enabled = 0; 595 tracer_enabled = 0;
602 596
603 unregister_irqsoff_function(graph); 597 unregister_irqsoff_function(tr, graph);
604} 598}
605 599
606static void __irqsoff_tracer_init(struct trace_array *tr) 600static bool irqsoff_busy;
601
602static int __irqsoff_tracer_init(struct trace_array *tr)
607{ 603{
604 if (irqsoff_busy)
605 return -EBUSY;
606
608 save_flags = trace_flags; 607 save_flags = trace_flags;
609 608
610 /* non overwrite screws up the latency tracers */ 609 /* non overwrite screws up the latency tracers */
611 set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1); 610 set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1);
612 set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1); 611 set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1);
613 612
614 tracing_max_latency = 0; 613 tr->max_latency = 0;
615 irqsoff_trace = tr; 614 irqsoff_trace = tr;
616 /* make sure that the tracer is visible */ 615 /* make sure that the tracer is visible */
617 smp_wmb(); 616 smp_wmb();
618 tracing_reset_online_cpus(&tr->trace_buffer); 617 tracing_reset_online_cpus(&tr->trace_buffer);
619 618
620 if (start_irqsoff_tracer(tr, is_graph())) 619 ftrace_init_array_ops(tr, irqsoff_tracer_call);
620
621 /* Only toplevel instance supports graph tracing */
622 if (start_irqsoff_tracer(tr, (tr->flags & TRACE_ARRAY_FL_GLOBAL &&
623 is_graph())))
621 printk(KERN_ERR "failed to start irqsoff tracer\n"); 624 printk(KERN_ERR "failed to start irqsoff tracer\n");
625
626 irqsoff_busy = true;
627 return 0;
622} 628}
623 629
624static void irqsoff_tracer_reset(struct trace_array *tr) 630static void irqsoff_tracer_reset(struct trace_array *tr)
@@ -630,6 +636,9 @@ static void irqsoff_tracer_reset(struct trace_array *tr)
630 636
631 set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag); 637 set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag);
632 set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag); 638 set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag);
639 ftrace_reset_array_ops(tr);
640
641 irqsoff_busy = false;
633} 642}
634 643
635static void irqsoff_tracer_start(struct trace_array *tr) 644static void irqsoff_tracer_start(struct trace_array *tr)
@@ -647,8 +656,7 @@ static int irqsoff_tracer_init(struct trace_array *tr)
647{ 656{
648 trace_type = TRACER_IRQS_OFF; 657 trace_type = TRACER_IRQS_OFF;
649 658
650 __irqsoff_tracer_init(tr); 659 return __irqsoff_tracer_init(tr);
651 return 0;
652} 660}
653static struct tracer irqsoff_tracer __read_mostly = 661static struct tracer irqsoff_tracer __read_mostly =
654{ 662{
@@ -668,6 +676,7 @@ static struct tracer irqsoff_tracer __read_mostly =
668#endif 676#endif
669 .open = irqsoff_trace_open, 677 .open = irqsoff_trace_open,
670 .close = irqsoff_trace_close, 678 .close = irqsoff_trace_close,
679 .allow_instances = true,
671 .use_max_tr = true, 680 .use_max_tr = true,
672}; 681};
673# define register_irqsoff(trace) register_tracer(&trace) 682# define register_irqsoff(trace) register_tracer(&trace)
@@ -680,8 +689,7 @@ static int preemptoff_tracer_init(struct trace_array *tr)
680{ 689{
681 trace_type = TRACER_PREEMPT_OFF; 690 trace_type = TRACER_PREEMPT_OFF;
682 691
683 __irqsoff_tracer_init(tr); 692 return __irqsoff_tracer_init(tr);
684 return 0;
685} 693}
686 694
687static struct tracer preemptoff_tracer __read_mostly = 695static struct tracer preemptoff_tracer __read_mostly =
@@ -702,6 +710,7 @@ static struct tracer preemptoff_tracer __read_mostly =
702#endif 710#endif
703 .open = irqsoff_trace_open, 711 .open = irqsoff_trace_open,
704 .close = irqsoff_trace_close, 712 .close = irqsoff_trace_close,
713 .allow_instances = true,
705 .use_max_tr = true, 714 .use_max_tr = true,
706}; 715};
707# define register_preemptoff(trace) register_tracer(&trace) 716# define register_preemptoff(trace) register_tracer(&trace)
@@ -716,8 +725,7 @@ static int preemptirqsoff_tracer_init(struct trace_array *tr)
716{ 725{
717 trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF; 726 trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF;
718 727
719 __irqsoff_tracer_init(tr); 728 return __irqsoff_tracer_init(tr);
720 return 0;
721} 729}
722 730
723static struct tracer preemptirqsoff_tracer __read_mostly = 731static struct tracer preemptirqsoff_tracer __read_mostly =
@@ -738,6 +746,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
738#endif 746#endif
739 .open = irqsoff_trace_open, 747 .open = irqsoff_trace_open,
740 .close = irqsoff_trace_close, 748 .close = irqsoff_trace_close,
749 .allow_instances = true,
741 .use_max_tr = true, 750 .use_max_tr = true,
742}; 751};
743 752
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 903ae28962be..282f6e4e5539 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -40,27 +40,27 @@ struct trace_kprobe {
40 (sizeof(struct probe_arg) * (n))) 40 (sizeof(struct probe_arg) * (n)))
41 41
42 42
43static __kprobes bool trace_kprobe_is_return(struct trace_kprobe *tk) 43static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk)
44{ 44{
45 return tk->rp.handler != NULL; 45 return tk->rp.handler != NULL;
46} 46}
47 47
48static __kprobes const char *trace_kprobe_symbol(struct trace_kprobe *tk) 48static nokprobe_inline const char *trace_kprobe_symbol(struct trace_kprobe *tk)
49{ 49{
50 return tk->symbol ? tk->symbol : "unknown"; 50 return tk->symbol ? tk->symbol : "unknown";
51} 51}
52 52
53static __kprobes unsigned long trace_kprobe_offset(struct trace_kprobe *tk) 53static nokprobe_inline unsigned long trace_kprobe_offset(struct trace_kprobe *tk)
54{ 54{
55 return tk->rp.kp.offset; 55 return tk->rp.kp.offset;
56} 56}
57 57
58static __kprobes bool trace_kprobe_has_gone(struct trace_kprobe *tk) 58static nokprobe_inline bool trace_kprobe_has_gone(struct trace_kprobe *tk)
59{ 59{
60 return !!(kprobe_gone(&tk->rp.kp)); 60 return !!(kprobe_gone(&tk->rp.kp));
61} 61}
62 62
63static __kprobes bool trace_kprobe_within_module(struct trace_kprobe *tk, 63static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk,
64 struct module *mod) 64 struct module *mod)
65{ 65{
66 int len = strlen(mod->name); 66 int len = strlen(mod->name);
@@ -68,7 +68,7 @@ static __kprobes bool trace_kprobe_within_module(struct trace_kprobe *tk,
68 return strncmp(mod->name, name, len) == 0 && name[len] == ':'; 68 return strncmp(mod->name, name, len) == 0 && name[len] == ':';
69} 69}
70 70
71static __kprobes bool trace_kprobe_is_on_module(struct trace_kprobe *tk) 71static nokprobe_inline bool trace_kprobe_is_on_module(struct trace_kprobe *tk)
72{ 72{
73 return !!strchr(trace_kprobe_symbol(tk), ':'); 73 return !!strchr(trace_kprobe_symbol(tk), ':');
74} 74}
@@ -132,19 +132,21 @@ struct symbol_cache *alloc_symbol_cache(const char *sym, long offset)
132 * Kprobes-specific fetch functions 132 * Kprobes-specific fetch functions
133 */ 133 */
134#define DEFINE_FETCH_stack(type) \ 134#define DEFINE_FETCH_stack(type) \
135static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ 135static void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs, \
136 void *offset, void *dest) \ 136 void *offset, void *dest) \
137{ \ 137{ \
138 *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \ 138 *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \
139 (unsigned int)((unsigned long)offset)); \ 139 (unsigned int)((unsigned long)offset)); \
140} 140} \
141NOKPROBE_SYMBOL(FETCH_FUNC_NAME(stack, type));
142
141DEFINE_BASIC_FETCH_FUNCS(stack) 143DEFINE_BASIC_FETCH_FUNCS(stack)
142/* No string on the stack entry */ 144/* No string on the stack entry */
143#define fetch_stack_string NULL 145#define fetch_stack_string NULL
144#define fetch_stack_string_size NULL 146#define fetch_stack_string_size NULL
145 147
146#define DEFINE_FETCH_memory(type) \ 148#define DEFINE_FETCH_memory(type) \
147static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ 149static void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs, \
148 void *addr, void *dest) \ 150 void *addr, void *dest) \
149{ \ 151{ \
150 type retval; \ 152 type retval; \
@@ -152,14 +154,16 @@ static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\
152 *(type *)dest = 0; \ 154 *(type *)dest = 0; \
153 else \ 155 else \
154 *(type *)dest = retval; \ 156 *(type *)dest = retval; \
155} 157} \
158NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, type));
159
156DEFINE_BASIC_FETCH_FUNCS(memory) 160DEFINE_BASIC_FETCH_FUNCS(memory)
157/* 161/*
158 * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max 162 * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max
159 * length and relative data location. 163 * length and relative data location.
160 */ 164 */
161static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, 165static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
162 void *addr, void *dest) 166 void *addr, void *dest)
163{ 167{
164 long ret; 168 long ret;
165 int maxlen = get_rloc_len(*(u32 *)dest); 169 int maxlen = get_rloc_len(*(u32 *)dest);
@@ -193,10 +197,11 @@ static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
193 get_rloc_offs(*(u32 *)dest)); 197 get_rloc_offs(*(u32 *)dest));
194 } 198 }
195} 199}
200NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, string));
196 201
197/* Return the length of string -- including null terminal byte */ 202/* Return the length of string -- including null terminal byte */
198static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, 203static void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,
199 void *addr, void *dest) 204 void *addr, void *dest)
200{ 205{
201 mm_segment_t old_fs; 206 mm_segment_t old_fs;
202 int ret, len = 0; 207 int ret, len = 0;
@@ -219,17 +224,19 @@ static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,
219 else 224 else
220 *(u32 *)dest = len; 225 *(u32 *)dest = len;
221} 226}
227NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, string_size));
222 228
223#define DEFINE_FETCH_symbol(type) \ 229#define DEFINE_FETCH_symbol(type) \
224__kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs, \ 230void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs, void *data, void *dest)\
225 void *data, void *dest) \
226{ \ 231{ \
227 struct symbol_cache *sc = data; \ 232 struct symbol_cache *sc = data; \
228 if (sc->addr) \ 233 if (sc->addr) \
229 fetch_memory_##type(regs, (void *)sc->addr, dest); \ 234 fetch_memory_##type(regs, (void *)sc->addr, dest); \
230 else \ 235 else \
231 *(type *)dest = 0; \ 236 *(type *)dest = 0; \
232} 237} \
238NOKPROBE_SYMBOL(FETCH_FUNC_NAME(symbol, type));
239
233DEFINE_BASIC_FETCH_FUNCS(symbol) 240DEFINE_BASIC_FETCH_FUNCS(symbol)
234DEFINE_FETCH_symbol(string) 241DEFINE_FETCH_symbol(string)
235DEFINE_FETCH_symbol(string_size) 242DEFINE_FETCH_symbol(string_size)
@@ -907,7 +914,7 @@ static const struct file_operations kprobe_profile_ops = {
907}; 914};
908 915
909/* Kprobe handler */ 916/* Kprobe handler */
910static __kprobes void 917static nokprobe_inline void
911__kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs, 918__kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs,
912 struct ftrace_event_file *ftrace_file) 919 struct ftrace_event_file *ftrace_file)
913{ 920{
@@ -943,7 +950,7 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs,
943 entry, irq_flags, pc, regs); 950 entry, irq_flags, pc, regs);
944} 951}
945 952
946static __kprobes void 953static void
947kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs) 954kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs)
948{ 955{
949 struct event_file_link *link; 956 struct event_file_link *link;
@@ -951,9 +958,10 @@ kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs)
951 list_for_each_entry_rcu(link, &tk->tp.files, list) 958 list_for_each_entry_rcu(link, &tk->tp.files, list)
952 __kprobe_trace_func(tk, regs, link->file); 959 __kprobe_trace_func(tk, regs, link->file);
953} 960}
961NOKPROBE_SYMBOL(kprobe_trace_func);
954 962
955/* Kretprobe handler */ 963/* Kretprobe handler */
956static __kprobes void 964static nokprobe_inline void
957__kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, 965__kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
958 struct pt_regs *regs, 966 struct pt_regs *regs,
959 struct ftrace_event_file *ftrace_file) 967 struct ftrace_event_file *ftrace_file)
@@ -991,7 +999,7 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
991 entry, irq_flags, pc, regs); 999 entry, irq_flags, pc, regs);
992} 1000}
993 1001
994static __kprobes void 1002static void
995kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, 1003kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
996 struct pt_regs *regs) 1004 struct pt_regs *regs)
997{ 1005{
@@ -1000,6 +1008,7 @@ kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
1000 list_for_each_entry_rcu(link, &tk->tp.files, list) 1008 list_for_each_entry_rcu(link, &tk->tp.files, list)
1001 __kretprobe_trace_func(tk, ri, regs, link->file); 1009 __kretprobe_trace_func(tk, ri, regs, link->file);
1002} 1010}
1011NOKPROBE_SYMBOL(kretprobe_trace_func);
1003 1012
1004/* Event entry printers */ 1013/* Event entry printers */
1005static enum print_line_t 1014static enum print_line_t
@@ -1131,7 +1140,7 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
1131#ifdef CONFIG_PERF_EVENTS 1140#ifdef CONFIG_PERF_EVENTS
1132 1141
1133/* Kprobe profile handler */ 1142/* Kprobe profile handler */
1134static __kprobes void 1143static void
1135kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) 1144kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
1136{ 1145{
1137 struct ftrace_event_call *call = &tk->tp.call; 1146 struct ftrace_event_call *call = &tk->tp.call;
@@ -1158,9 +1167,10 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
1158 store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); 1167 store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
1159 perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); 1168 perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
1160} 1169}
1170NOKPROBE_SYMBOL(kprobe_perf_func);
1161 1171
1162/* Kretprobe profile handler */ 1172/* Kretprobe profile handler */
1163static __kprobes void 1173static void
1164kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, 1174kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
1165 struct pt_regs *regs) 1175 struct pt_regs *regs)
1166{ 1176{
@@ -1188,6 +1198,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
1188 store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); 1198 store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
1189 perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); 1199 perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
1190} 1200}
1201NOKPROBE_SYMBOL(kretprobe_perf_func);
1191#endif /* CONFIG_PERF_EVENTS */ 1202#endif /* CONFIG_PERF_EVENTS */
1192 1203
1193/* 1204/*
@@ -1196,9 +1207,8 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
1196 * kprobe_trace_self_tests_init() does enable_trace_probe/disable_trace_probe 1207 * kprobe_trace_self_tests_init() does enable_trace_probe/disable_trace_probe
1197 * lockless, but we can't race with this __init function. 1208 * lockless, but we can't race with this __init function.
1198 */ 1209 */
1199static __kprobes 1210static int kprobe_register(struct ftrace_event_call *event,
1200int kprobe_register(struct ftrace_event_call *event, 1211 enum trace_reg type, void *data)
1201 enum trace_reg type, void *data)
1202{ 1212{
1203 struct trace_kprobe *tk = (struct trace_kprobe *)event->data; 1213 struct trace_kprobe *tk = (struct trace_kprobe *)event->data;
1204 struct ftrace_event_file *file = data; 1214 struct ftrace_event_file *file = data;
@@ -1224,8 +1234,7 @@ int kprobe_register(struct ftrace_event_call *event,
1224 return 0; 1234 return 0;
1225} 1235}
1226 1236
1227static __kprobes 1237static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
1228int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
1229{ 1238{
1230 struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp); 1239 struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp);
1231 1240
@@ -1239,9 +1248,10 @@ int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
1239#endif 1248#endif
1240 return 0; /* We don't tweak the kernel, so just return 0 */ 1249 return 0; /* We don't tweak the kernel, so just return 0 */
1241} 1250}
1251NOKPROBE_SYMBOL(kprobe_dispatcher);
1242 1252
1243static __kprobes 1253static int
1244int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) 1254kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
1245{ 1255{
1246 struct trace_kprobe *tk = container_of(ri->rp, struct trace_kprobe, rp); 1256 struct trace_kprobe *tk = container_of(ri->rp, struct trace_kprobe, rp);
1247 1257
@@ -1255,6 +1265,7 @@ int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
1255#endif 1265#endif
1256 return 0; /* We don't tweek kernel, so just return 0 */ 1266 return 0; /* We don't tweek kernel, so just return 0 */
1257} 1267}
1268NOKPROBE_SYMBOL(kretprobe_dispatcher);
1258 1269
1259static struct trace_event_functions kretprobe_funcs = { 1270static struct trace_event_functions kretprobe_funcs = {
1260 .trace = print_kretprobe_event 1271 .trace = print_kretprobe_event
@@ -1377,6 +1388,9 @@ static __init int kprobe_trace_self_tests_init(void)
1377 struct trace_kprobe *tk; 1388 struct trace_kprobe *tk;
1378 struct ftrace_event_file *file; 1389 struct ftrace_event_file *file;
1379 1390
1391 if (tracing_is_disabled())
1392 return -ENODEV;
1393
1380 target = kprobe_trace_selftest_target; 1394 target = kprobe_trace_selftest_target;
1381 1395
1382 pr_info("Testing kprobe tracing: "); 1396 pr_info("Testing kprobe tracing: ");
diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c
index 69a5cc94c01a..fcf0a9e48916 100644
--- a/kernel/trace/trace_nop.c
+++ b/kernel/trace/trace_nop.c
@@ -91,7 +91,6 @@ struct tracer nop_trace __read_mostly =
91 .name = "nop", 91 .name = "nop",
92 .init = nop_trace_init, 92 .init = nop_trace_init,
93 .reset = nop_trace_reset, 93 .reset = nop_trace_reset,
94 .wait_pipe = poll_wait_pipe,
95#ifdef CONFIG_FTRACE_SELFTEST 94#ifdef CONFIG_FTRACE_SELFTEST
96 .selftest = trace_selftest_startup_nop, 95 .selftest = trace_selftest_startup_nop,
97#endif 96#endif
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index a436de18aa99..f3dad80c20b2 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -126,6 +126,34 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
126EXPORT_SYMBOL_GPL(trace_seq_printf); 126EXPORT_SYMBOL_GPL(trace_seq_printf);
127 127
128/** 128/**
129 * trace_seq_bitmask - put a list of longs as a bitmask print output
130 * @s: trace sequence descriptor
131 * @maskp: points to an array of unsigned longs that represent a bitmask
132 * @nmaskbits: The number of bits that are valid in @maskp
133 *
134 * Returns 0 if the bitmask does not fit in the buffer's free
135 * space, 1 otherwise.
136 *
137 * Writes an ASCII representation of the bitmask into @s.
138 */
139int
140trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp,
141 int nmaskbits)
142{
143 int len = (PAGE_SIZE - 1) - s->len;
144 int ret;
145
146 if (s->full || !len)
147 return 0;
148
149 ret = bitmap_scnprintf(s->buffer + s->len, len, maskp, nmaskbits);
150 s->len += ret;
151
152 return 1;
153}
154EXPORT_SYMBOL_GPL(trace_seq_bitmask);
155
156/**
129 * trace_seq_vprintf - sequence printing of trace information 157 * trace_seq_vprintf - sequence printing of trace information
130 * @s: trace sequence descriptor 158 * @s: trace sequence descriptor
131 * @fmt: printf format string 159 * @fmt: printf format string
@@ -399,6 +427,19 @@ EXPORT_SYMBOL(ftrace_print_symbols_seq_u64);
399#endif 427#endif
400 428
401const char * 429const char *
430ftrace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr,
431 unsigned int bitmask_size)
432{
433 const char *ret = p->buffer + p->len;
434
435 trace_seq_bitmask(p, bitmask_ptr, bitmask_size * 8);
436 trace_seq_putc(p, 0);
437
438 return ret;
439}
440EXPORT_SYMBOL_GPL(ftrace_print_bitmask_seq);
441
442const char *
402ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) 443ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
403{ 444{
404 int i; 445 int i;
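
trace_seq_bitmask() above gives tracers a way to append a human-readable bitmask to a trace_seq without going through trace_seq_printf(), and ftrace_print_bitmask_seq() wraps it for event print formats. A hedged sketch of a direct caller follows; the helper and its cpumask argument are illustrative, not from this patch.

#include <linux/cpumask.h>
#include <linux/trace_seq.h>

/* Illustrative helper: append "cpus=<mask>" to an existing trace_seq. */
static void example_seq_print_cpus(struct trace_seq *s,
				   const struct cpumask *mask)
{
	trace_seq_puts(s, "cpus=");
	if (!trace_seq_bitmask(s, cpumask_bits(mask), nr_cpumask_bits))
		trace_seq_puts(s, "<overflow>");	/* buffer was full */
	trace_seq_putc(s, '\n');
}
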
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 8364a421b4df..d4b9fc22cd27 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -37,13 +37,13 @@ const char *reserved_field_names[] = {
37 37
38/* Printing in basic type function template */ 38/* Printing in basic type function template */
39#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt) \ 39#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt) \
40__kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \ 40int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name, \
41 const char *name, \ 41 void *data, void *ent) \
42 void *data, void *ent) \
43{ \ 42{ \
44 return trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \ 43 return trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \
45} \ 44} \
46const char PRINT_TYPE_FMT_NAME(type)[] = fmt; 45const char PRINT_TYPE_FMT_NAME(type)[] = fmt; \
46NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(type));
47 47
48DEFINE_BASIC_PRINT_TYPE_FUNC(u8 , "0x%x") 48DEFINE_BASIC_PRINT_TYPE_FUNC(u8 , "0x%x")
49DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "0x%x") 49DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "0x%x")
@@ -55,9 +55,8 @@ DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%d")
55DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%Ld") 55DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%Ld")
56 56
57/* Print type function for string type */ 57/* Print type function for string type */
58__kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, 58int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, const char *name,
59 const char *name, 59 void *data, void *ent)
60 void *data, void *ent)
61{ 60{
62 int len = *(u32 *)data >> 16; 61 int len = *(u32 *)data >> 16;
63 62
@@ -67,6 +66,7 @@ __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s,
67 return trace_seq_printf(s, " %s=\"%s\"", name, 66 return trace_seq_printf(s, " %s=\"%s\"", name,
68 (const char *)get_loc_data(data, ent)); 67 (const char *)get_loc_data(data, ent));
69} 68}
69NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(string));
70 70
71const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\""; 71const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\"";
72 72
@@ -81,23 +81,24 @@ const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\"";
81 81
82/* Data fetch function templates */ 82/* Data fetch function templates */
83#define DEFINE_FETCH_reg(type) \ 83#define DEFINE_FETCH_reg(type) \
84__kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \ 84void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, void *offset, void *dest) \
85 void *offset, void *dest) \
86{ \ 85{ \
87 *(type *)dest = (type)regs_get_register(regs, \ 86 *(type *)dest = (type)regs_get_register(regs, \
88 (unsigned int)((unsigned long)offset)); \ 87 (unsigned int)((unsigned long)offset)); \
89} 88} \
89NOKPROBE_SYMBOL(FETCH_FUNC_NAME(reg, type));
90DEFINE_BASIC_FETCH_FUNCS(reg) 90DEFINE_BASIC_FETCH_FUNCS(reg)
91/* No string on the register */ 91/* No string on the register */
92#define fetch_reg_string NULL 92#define fetch_reg_string NULL
93#define fetch_reg_string_size NULL 93#define fetch_reg_string_size NULL
94 94
95#define DEFINE_FETCH_retval(type) \ 95#define DEFINE_FETCH_retval(type) \
96__kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs, \ 96void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs, \
97 void *dummy, void *dest) \ 97 void *dummy, void *dest) \
98{ \ 98{ \
99 *(type *)dest = (type)regs_return_value(regs); \ 99 *(type *)dest = (type)regs_return_value(regs); \
100} 100} \
101NOKPROBE_SYMBOL(FETCH_FUNC_NAME(retval, type));
101DEFINE_BASIC_FETCH_FUNCS(retval) 102DEFINE_BASIC_FETCH_FUNCS(retval)
102/* No string on the retval */ 103/* No string on the retval */
103#define fetch_retval_string NULL 104#define fetch_retval_string NULL
@@ -112,8 +113,8 @@ struct deref_fetch_param {
112}; 113};
113 114
114#define DEFINE_FETCH_deref(type) \ 115#define DEFINE_FETCH_deref(type) \
115__kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs, \ 116void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs, \
116 void *data, void *dest) \ 117 void *data, void *dest) \
117{ \ 118{ \
118 struct deref_fetch_param *dprm = data; \ 119 struct deref_fetch_param *dprm = data; \
119 unsigned long addr; \ 120 unsigned long addr; \
@@ -123,12 +124,13 @@ __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs, \
123 dprm->fetch(regs, (void *)addr, dest); \ 124 dprm->fetch(regs, (void *)addr, dest); \
124 } else \ 125 } else \
125 *(type *)dest = 0; \ 126 *(type *)dest = 0; \
126} 127} \
128NOKPROBE_SYMBOL(FETCH_FUNC_NAME(deref, type));
127DEFINE_BASIC_FETCH_FUNCS(deref) 129DEFINE_BASIC_FETCH_FUNCS(deref)
128DEFINE_FETCH_deref(string) 130DEFINE_FETCH_deref(string)
129 131
130__kprobes void FETCH_FUNC_NAME(deref, string_size)(struct pt_regs *regs, 132void FETCH_FUNC_NAME(deref, string_size)(struct pt_regs *regs,
131 void *data, void *dest) 133 void *data, void *dest)
132{ 134{
133 struct deref_fetch_param *dprm = data; 135 struct deref_fetch_param *dprm = data;
134 unsigned long addr; 136 unsigned long addr;
@@ -140,16 +142,18 @@ __kprobes void FETCH_FUNC_NAME(deref, string_size)(struct pt_regs *regs,
140 } else 142 } else
141 *(string_size *)dest = 0; 143 *(string_size *)dest = 0;
142} 144}
145NOKPROBE_SYMBOL(FETCH_FUNC_NAME(deref, string_size));
143 146
144static __kprobes void update_deref_fetch_param(struct deref_fetch_param *data) 147static void update_deref_fetch_param(struct deref_fetch_param *data)
145{ 148{
146 if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) 149 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
147 update_deref_fetch_param(data->orig.data); 150 update_deref_fetch_param(data->orig.data);
148 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) 151 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
149 update_symbol_cache(data->orig.data); 152 update_symbol_cache(data->orig.data);
150} 153}
154NOKPROBE_SYMBOL(update_deref_fetch_param);
151 155
152static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) 156static void free_deref_fetch_param(struct deref_fetch_param *data)
153{ 157{
154 if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) 158 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
155 free_deref_fetch_param(data->orig.data); 159 free_deref_fetch_param(data->orig.data);
@@ -157,6 +161,7 @@ static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
157 free_symbol_cache(data->orig.data); 161 free_symbol_cache(data->orig.data);
158 kfree(data); 162 kfree(data);
159} 163}
164NOKPROBE_SYMBOL(free_deref_fetch_param);
160 165
161/* Bitfield fetch function */ 166/* Bitfield fetch function */
162struct bitfield_fetch_param { 167struct bitfield_fetch_param {
@@ -166,8 +171,8 @@ struct bitfield_fetch_param {
166}; 171};
167 172
168#define DEFINE_FETCH_bitfield(type) \ 173#define DEFINE_FETCH_bitfield(type) \
169__kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs, \ 174void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs, \
170 void *data, void *dest) \ 175 void *data, void *dest) \
171{ \ 176{ \
172 struct bitfield_fetch_param *bprm = data; \ 177 struct bitfield_fetch_param *bprm = data; \
173 type buf = 0; \ 178 type buf = 0; \
@@ -177,13 +182,13 @@ __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs, \
177 buf >>= bprm->low_shift; \ 182 buf >>= bprm->low_shift; \
178 } \ 183 } \
179 *(type *)dest = buf; \ 184 *(type *)dest = buf; \
180} 185} \
181 186NOKPROBE_SYMBOL(FETCH_FUNC_NAME(bitfield, type));
182DEFINE_BASIC_FETCH_FUNCS(bitfield) 187DEFINE_BASIC_FETCH_FUNCS(bitfield)
183#define fetch_bitfield_string NULL 188#define fetch_bitfield_string NULL
184#define fetch_bitfield_string_size NULL 189#define fetch_bitfield_string_size NULL
185 190
186static __kprobes void 191static void
187update_bitfield_fetch_param(struct bitfield_fetch_param *data) 192update_bitfield_fetch_param(struct bitfield_fetch_param *data)
188{ 193{
189 /* 194 /*
@@ -196,7 +201,7 @@ update_bitfield_fetch_param(struct bitfield_fetch_param *data)
196 update_symbol_cache(data->orig.data); 201 update_symbol_cache(data->orig.data);
197} 202}
198 203
199static __kprobes void 204static void
200free_bitfield_fetch_param(struct bitfield_fetch_param *data) 205free_bitfield_fetch_param(struct bitfield_fetch_param *data)
201{ 206{
202 /* 207 /*
@@ -255,17 +260,17 @@ fail:
255} 260}
256 261
257/* Special function : only accept unsigned long */ 262/* Special function : only accept unsigned long */
258static __kprobes void fetch_kernel_stack_address(struct pt_regs *regs, 263static void fetch_kernel_stack_address(struct pt_regs *regs, void *dummy, void *dest)
259 void *dummy, void *dest)
260{ 264{
261 *(unsigned long *)dest = kernel_stack_pointer(regs); 265 *(unsigned long *)dest = kernel_stack_pointer(regs);
262} 266}
267NOKPROBE_SYMBOL(fetch_kernel_stack_address);
263 268
264static __kprobes void fetch_user_stack_address(struct pt_regs *regs, 269static void fetch_user_stack_address(struct pt_regs *regs, void *dummy, void *dest)
265 void *dummy, void *dest)
266{ 270{
267 *(unsigned long *)dest = user_stack_pointer(regs); 271 *(unsigned long *)dest = user_stack_pointer(regs);
268} 272}
273NOKPROBE_SYMBOL(fetch_user_stack_address);
269 274
270static fetch_func_t get_fetch_size_function(const struct fetch_type *type, 275static fetch_func_t get_fetch_size_function(const struct fetch_type *type,
271 fetch_func_t orig_fn, 276 fetch_func_t orig_fn,
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index fb1ab5dfbd42..4f815fbce16d 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -81,13 +81,13 @@
81 */ 81 */
82#define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs)) 82#define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs))
83 83
84static inline void *get_rloc_data(u32 *dl) 84static nokprobe_inline void *get_rloc_data(u32 *dl)
85{ 85{
86 return (u8 *)dl + get_rloc_offs(*dl); 86 return (u8 *)dl + get_rloc_offs(*dl);
87} 87}
88 88
89/* For data_loc conversion */ 89/* For data_loc conversion */
90static inline void *get_loc_data(u32 *dl, void *ent) 90static nokprobe_inline void *get_loc_data(u32 *dl, void *ent)
91{ 91{
92 return (u8 *)ent + get_rloc_offs(*dl); 92 return (u8 *)ent + get_rloc_offs(*dl);
93} 93}
@@ -136,9 +136,8 @@ typedef u32 string_size;
136 136
137/* Printing in basic type function template */ 137/* Printing in basic type function template */
138#define DECLARE_BASIC_PRINT_TYPE_FUNC(type) \ 138#define DECLARE_BASIC_PRINT_TYPE_FUNC(type) \
139__kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \ 139int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name, \
140 const char *name, \ 140 void *data, void *ent); \
141 void *data, void *ent); \
142extern const char PRINT_TYPE_FMT_NAME(type)[] 141extern const char PRINT_TYPE_FMT_NAME(type)[]
143 142
144DECLARE_BASIC_PRINT_TYPE_FUNC(u8); 143DECLARE_BASIC_PRINT_TYPE_FUNC(u8);
@@ -303,7 +302,7 @@ static inline bool trace_probe_is_registered(struct trace_probe *tp)
303 return !!(tp->flags & TP_FLAG_REGISTERED); 302 return !!(tp->flags & TP_FLAG_REGISTERED);
304} 303}
305 304
306static inline __kprobes void call_fetch(struct fetch_param *fprm, 305static nokprobe_inline void call_fetch(struct fetch_param *fprm,
307 struct pt_regs *regs, void *dest) 306 struct pt_regs *regs, void *dest)
308{ 307{
309 return fprm->fn(regs, fprm->data, dest); 308 return fprm->fn(regs, fprm->data, dest);
@@ -351,7 +350,7 @@ extern ssize_t traceprobe_probes_write(struct file *file,
351extern int traceprobe_command(const char *buf, int (*createfn)(int, char**)); 350extern int traceprobe_command(const char *buf, int (*createfn)(int, char**));
352 351
353/* Sum up total data length for dynamic arrays (strings) */ 352/* Sum up total data length for dynamic arrays (strings) */
354static inline __kprobes int 353static nokprobe_inline int
355__get_data_size(struct trace_probe *tp, struct pt_regs *regs) 354__get_data_size(struct trace_probe *tp, struct pt_regs *regs)
356{ 355{
357 int i, ret = 0; 356 int i, ret = 0;
@@ -367,7 +366,7 @@ __get_data_size(struct trace_probe *tp, struct pt_regs *regs)
367} 366}
368 367
369/* Store the value of each argument */ 368/* Store the value of each argument */
370static inline __kprobes void 369static nokprobe_inline void
371store_trace_args(int ent_size, struct trace_probe *tp, struct pt_regs *regs, 370store_trace_args(int ent_size, struct trace_probe *tp, struct pt_regs *regs,
372 u8 *data, int maxlen) 371 u8 *data, int maxlen)
373{ 372{
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index e14da5e97a69..19bd8928ce94 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -130,15 +130,9 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip,
130 atomic_dec(&data->disabled); 130 atomic_dec(&data->disabled);
131 preempt_enable_notrace(); 131 preempt_enable_notrace();
132} 132}
133
134static struct ftrace_ops trace_ops __read_mostly =
135{
136 .func = wakeup_tracer_call,
137 .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE,
138};
139#endif /* CONFIG_FUNCTION_TRACER */ 133#endif /* CONFIG_FUNCTION_TRACER */
140 134
141static int register_wakeup_function(int graph, int set) 135static int register_wakeup_function(struct trace_array *tr, int graph, int set)
142{ 136{
143 int ret; 137 int ret;
144 138
@@ -150,7 +144,7 @@ static int register_wakeup_function(int graph, int set)
150 ret = register_ftrace_graph(&wakeup_graph_return, 144 ret = register_ftrace_graph(&wakeup_graph_return,
151 &wakeup_graph_entry); 145 &wakeup_graph_entry);
152 else 146 else
153 ret = register_ftrace_function(&trace_ops); 147 ret = register_ftrace_function(tr->ops);
154 148
155 if (!ret) 149 if (!ret)
156 function_enabled = true; 150 function_enabled = true;
@@ -158,7 +152,7 @@ static int register_wakeup_function(int graph, int set)
158 return ret; 152 return ret;
159} 153}
160 154
161static void unregister_wakeup_function(int graph) 155static void unregister_wakeup_function(struct trace_array *tr, int graph)
162{ 156{
163 if (!function_enabled) 157 if (!function_enabled)
164 return; 158 return;
@@ -166,17 +160,17 @@ static void unregister_wakeup_function(int graph)
166 if (graph) 160 if (graph)
167 unregister_ftrace_graph(); 161 unregister_ftrace_graph();
168 else 162 else
169 unregister_ftrace_function(&trace_ops); 163 unregister_ftrace_function(tr->ops);
170 164
171 function_enabled = false; 165 function_enabled = false;
172} 166}
173 167
174static void wakeup_function_set(int set) 168static void wakeup_function_set(struct trace_array *tr, int set)
175{ 169{
176 if (set) 170 if (set)
177 register_wakeup_function(is_graph(), 1); 171 register_wakeup_function(tr, is_graph(), 1);
178 else 172 else
179 unregister_wakeup_function(is_graph()); 173 unregister_wakeup_function(tr, is_graph());
180} 174}
181 175
182static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set) 176static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set)
@@ -184,16 +178,16 @@ static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set)
184 struct tracer *tracer = tr->current_trace; 178 struct tracer *tracer = tr->current_trace;
185 179
186 if (mask & TRACE_ITER_FUNCTION) 180 if (mask & TRACE_ITER_FUNCTION)
187 wakeup_function_set(set); 181 wakeup_function_set(tr, set);
188 182
189 return trace_keep_overwrite(tracer, mask, set); 183 return trace_keep_overwrite(tracer, mask, set);
190} 184}
191 185
192static int start_func_tracer(int graph) 186static int start_func_tracer(struct trace_array *tr, int graph)
193{ 187{
194 int ret; 188 int ret;
195 189
196 ret = register_wakeup_function(graph, 0); 190 ret = register_wakeup_function(tr, graph, 0);
197 191
198 if (!ret && tracing_is_enabled()) 192 if (!ret && tracing_is_enabled())
199 tracer_enabled = 1; 193 tracer_enabled = 1;
@@ -203,11 +197,11 @@ static int start_func_tracer(int graph)
203 return ret; 197 return ret;
204} 198}
205 199
206static void stop_func_tracer(int graph) 200static void stop_func_tracer(struct trace_array *tr, int graph)
207{ 201{
208 tracer_enabled = 0; 202 tracer_enabled = 0;
209 203
210 unregister_wakeup_function(graph); 204 unregister_wakeup_function(tr, graph);
211} 205}
212 206
213#ifdef CONFIG_FUNCTION_GRAPH_TRACER 207#ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -221,12 +215,12 @@ wakeup_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
221 if (!(is_graph() ^ set)) 215 if (!(is_graph() ^ set))
222 return 0; 216 return 0;
223 217
224 stop_func_tracer(!set); 218 stop_func_tracer(tr, !set);
225 219
226 wakeup_reset(wakeup_trace); 220 wakeup_reset(wakeup_trace);
227 tracing_max_latency = 0; 221 tr->max_latency = 0;
228 222
229 return start_func_tracer(set); 223 return start_func_tracer(tr, set);
230} 224}
231 225
232static int wakeup_graph_entry(struct ftrace_graph_ent *trace) 226static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
@@ -350,13 +344,13 @@ static void wakeup_print_header(struct seq_file *s)
350/* 344/*
351 * Should this new latency be reported/recorded? 345 * Should this new latency be reported/recorded?
352 */ 346 */
353static int report_latency(cycle_t delta) 347static int report_latency(struct trace_array *tr, cycle_t delta)
354{ 348{
355 if (tracing_thresh) { 349 if (tracing_thresh) {
356 if (delta < tracing_thresh) 350 if (delta < tracing_thresh)
357 return 0; 351 return 0;
358 } else { 352 } else {
359 if (delta <= tracing_max_latency) 353 if (delta <= tr->max_latency)
360 return 0; 354 return 0;
361 } 355 }
362 return 1; 356 return 1;
@@ -424,11 +418,11 @@ probe_wakeup_sched_switch(void *ignore,
424 T1 = ftrace_now(cpu); 418 T1 = ftrace_now(cpu);
425 delta = T1-T0; 419 delta = T1-T0;
426 420
427 if (!report_latency(delta)) 421 if (!report_latency(wakeup_trace, delta))
428 goto out_unlock; 422 goto out_unlock;
429 423
430 if (likely(!is_tracing_stopped())) { 424 if (likely(!is_tracing_stopped())) {
431 tracing_max_latency = delta; 425 wakeup_trace->max_latency = delta;
432 update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu); 426 update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu);
433 } 427 }
434 428
@@ -587,7 +581,7 @@ static void start_wakeup_tracer(struct trace_array *tr)
587 */ 581 */
588 smp_wmb(); 582 smp_wmb();
589 583
590 if (start_func_tracer(is_graph())) 584 if (start_func_tracer(tr, is_graph()))
591 printk(KERN_ERR "failed to start wakeup tracer\n"); 585 printk(KERN_ERR "failed to start wakeup tracer\n");
592 586
593 return; 587 return;
@@ -600,13 +594,15 @@ fail_deprobe:
600static void stop_wakeup_tracer(struct trace_array *tr) 594static void stop_wakeup_tracer(struct trace_array *tr)
601{ 595{
602 tracer_enabled = 0; 596 tracer_enabled = 0;
603 stop_func_tracer(is_graph()); 597 stop_func_tracer(tr, is_graph());
604 unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL); 598 unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL);
605 unregister_trace_sched_wakeup_new(probe_wakeup, NULL); 599 unregister_trace_sched_wakeup_new(probe_wakeup, NULL);
606 unregister_trace_sched_wakeup(probe_wakeup, NULL); 600 unregister_trace_sched_wakeup(probe_wakeup, NULL);
607 unregister_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL); 601 unregister_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL);
608} 602}
609 603
604static bool wakeup_busy;
605
610static int __wakeup_tracer_init(struct trace_array *tr) 606static int __wakeup_tracer_init(struct trace_array *tr)
611{ 607{
612 save_flags = trace_flags; 608 save_flags = trace_flags;
@@ -615,14 +611,20 @@ static int __wakeup_tracer_init(struct trace_array *tr)
615 set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1); 611 set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1);
616 set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1); 612 set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1);
617 613
618 tracing_max_latency = 0; 614 tr->max_latency = 0;
619 wakeup_trace = tr; 615 wakeup_trace = tr;
616 ftrace_init_array_ops(tr, wakeup_tracer_call);
620 start_wakeup_tracer(tr); 617 start_wakeup_tracer(tr);
618
619 wakeup_busy = true;
621 return 0; 620 return 0;
622} 621}
623 622
624static int wakeup_tracer_init(struct trace_array *tr) 623static int wakeup_tracer_init(struct trace_array *tr)
625{ 624{
625 if (wakeup_busy)
626 return -EBUSY;
627
626 wakeup_dl = 0; 628 wakeup_dl = 0;
627 wakeup_rt = 0; 629 wakeup_rt = 0;
628 return __wakeup_tracer_init(tr); 630 return __wakeup_tracer_init(tr);
@@ -630,6 +632,9 @@ static int wakeup_tracer_init(struct trace_array *tr)
630 632
631static int wakeup_rt_tracer_init(struct trace_array *tr) 633static int wakeup_rt_tracer_init(struct trace_array *tr)
632{ 634{
635 if (wakeup_busy)
636 return -EBUSY;
637
633 wakeup_dl = 0; 638 wakeup_dl = 0;
634 wakeup_rt = 1; 639 wakeup_rt = 1;
635 return __wakeup_tracer_init(tr); 640 return __wakeup_tracer_init(tr);
@@ -637,6 +642,9 @@ static int wakeup_rt_tracer_init(struct trace_array *tr)
637 642
638static int wakeup_dl_tracer_init(struct trace_array *tr) 643static int wakeup_dl_tracer_init(struct trace_array *tr)
639{ 644{
645 if (wakeup_busy)
646 return -EBUSY;
647
640 wakeup_dl = 1; 648 wakeup_dl = 1;
641 wakeup_rt = 0; 649 wakeup_rt = 0;
642 return __wakeup_tracer_init(tr); 650 return __wakeup_tracer_init(tr);
@@ -653,6 +661,8 @@ static void wakeup_tracer_reset(struct trace_array *tr)
653 661
654 set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag); 662 set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag);
655 set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag); 663 set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag);
664 ftrace_reset_array_ops(tr);
665 wakeup_busy = false;
656} 666}
657 667
658static void wakeup_tracer_start(struct trace_array *tr) 668static void wakeup_tracer_start(struct trace_array *tr)
@@ -684,6 +694,7 @@ static struct tracer wakeup_tracer __read_mostly =
684#endif 694#endif
685 .open = wakeup_trace_open, 695 .open = wakeup_trace_open,
686 .close = wakeup_trace_close, 696 .close = wakeup_trace_close,
697 .allow_instances = true,
687 .use_max_tr = true, 698 .use_max_tr = true,
688}; 699};
689 700
@@ -694,7 +705,6 @@ static struct tracer wakeup_rt_tracer __read_mostly =
694 .reset = wakeup_tracer_reset, 705 .reset = wakeup_tracer_reset,
695 .start = wakeup_tracer_start, 706 .start = wakeup_tracer_start,
696 .stop = wakeup_tracer_stop, 707 .stop = wakeup_tracer_stop,
697 .wait_pipe = poll_wait_pipe,
698 .print_max = true, 708 .print_max = true,
699 .print_header = wakeup_print_header, 709 .print_header = wakeup_print_header,
700 .print_line = wakeup_print_line, 710 .print_line = wakeup_print_line,
@@ -706,6 +716,7 @@ static struct tracer wakeup_rt_tracer __read_mostly =
706#endif 716#endif
707 .open = wakeup_trace_open, 717 .open = wakeup_trace_open,
708 .close = wakeup_trace_close, 718 .close = wakeup_trace_close,
719 .allow_instances = true,
709 .use_max_tr = true, 720 .use_max_tr = true,
710}; 721};
711 722
@@ -716,7 +727,6 @@ static struct tracer wakeup_dl_tracer __read_mostly =
716 .reset = wakeup_tracer_reset, 727 .reset = wakeup_tracer_reset,
717 .start = wakeup_tracer_start, 728 .start = wakeup_tracer_start,
718 .stop = wakeup_tracer_stop, 729 .stop = wakeup_tracer_stop,
719 .wait_pipe = poll_wait_pipe,
720 .print_max = true, 730 .print_max = true,
721 .print_header = wakeup_print_header, 731 .print_header = wakeup_print_header,
722 .print_line = wakeup_print_line, 732 .print_line = wakeup_print_line,
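
The trace_sched_wakeup.c changes above drop the file-scope ftrace_ops (which carried FTRACE_OPS_FL_GLOBAL) in favour of the per-instance tr->ops built by ftrace_init_array_ops(), move the latency watermark from the removed global tracing_max_latency to tr->max_latency, and use a wakeup_busy flag together with .allow_instances so only one trace instance runs the tracer at a time. A minimal sketch of that init/reset shape, assuming the declarations from kernel/trace/trace.h; the names my_tracer_call, my_tracer_init, my_tracer_reset and my_busy are illustrative, not part of the patch:

	static bool my_busy;

	static void my_tracer_call(unsigned long ip, unsigned long parent_ip,
				   struct ftrace_ops *op, struct pt_regs *regs)
	{
		/* per-instance tracing work goes here */
	}

	static int my_tracer_init(struct trace_array *tr)
	{
		int ret;

		if (my_busy)			/* one instance at a time */
			return -EBUSY;

		tr->max_latency = 0;		/* latency state is per trace_array now */
		ftrace_init_array_ops(tr, my_tracer_call);	/* populates tr->ops */

		ret = register_ftrace_function(tr->ops);	/* was: &trace_ops */
		if (ret) {
			ftrace_reset_array_ops(tr);
			return ret;
		}

		my_busy = true;
		return 0;
	}

	static void my_tracer_reset(struct trace_array *tr)
	{
		unregister_ftrace_function(tr->ops);
		ftrace_reset_array_ops(tr);	/* undo ftrace_init_array_ops() */
		my_busy = false;
	}
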
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index e98fca60974f..5ef60499dc8e 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -65,7 +65,7 @@ static int trace_test_buffer(struct trace_buffer *buf, unsigned long *count)
65 65
66 /* Don't allow flipping of max traces now */ 66 /* Don't allow flipping of max traces now */
67 local_irq_save(flags); 67 local_irq_save(flags);
68 arch_spin_lock(&ftrace_max_lock); 68 arch_spin_lock(&buf->tr->max_lock);
69 69
70 cnt = ring_buffer_entries(buf->buffer); 70 cnt = ring_buffer_entries(buf->buffer);
71 71
@@ -83,7 +83,7 @@ static int trace_test_buffer(struct trace_buffer *buf, unsigned long *count)
83 break; 83 break;
84 } 84 }
85 tracing_on(); 85 tracing_on();
86 arch_spin_unlock(&ftrace_max_lock); 86 arch_spin_unlock(&buf->tr->max_lock);
87 local_irq_restore(flags); 87 local_irq_restore(flags);
88 88
89 if (count) 89 if (count)
@@ -161,11 +161,6 @@ static struct ftrace_ops test_probe3 = {
161 .flags = FTRACE_OPS_FL_RECURSION_SAFE, 161 .flags = FTRACE_OPS_FL_RECURSION_SAFE,
162}; 162};
163 163
164static struct ftrace_ops test_global = {
165 .func = trace_selftest_test_global_func,
166 .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE,
167};
168
169static void print_counts(void) 164static void print_counts(void)
170{ 165{
171 printk("(%d %d %d %d %d) ", 166 printk("(%d %d %d %d %d) ",
@@ -185,7 +180,7 @@ static void reset_counts(void)
185 trace_selftest_test_dyn_cnt = 0; 180 trace_selftest_test_dyn_cnt = 0;
186} 181}
187 182
188static int trace_selftest_ops(int cnt) 183static int trace_selftest_ops(struct trace_array *tr, int cnt)
189{ 184{
190 int save_ftrace_enabled = ftrace_enabled; 185 int save_ftrace_enabled = ftrace_enabled;
191 struct ftrace_ops *dyn_ops; 186 struct ftrace_ops *dyn_ops;
@@ -220,7 +215,11 @@ static int trace_selftest_ops(int cnt)
220 register_ftrace_function(&test_probe1); 215 register_ftrace_function(&test_probe1);
221 register_ftrace_function(&test_probe2); 216 register_ftrace_function(&test_probe2);
222 register_ftrace_function(&test_probe3); 217 register_ftrace_function(&test_probe3);
223 register_ftrace_function(&test_global); 218 /* First time we are running with main function */
219 if (cnt > 1) {
220 ftrace_init_array_ops(tr, trace_selftest_test_global_func);
221 register_ftrace_function(tr->ops);
222 }
224 223
225 DYN_FTRACE_TEST_NAME(); 224 DYN_FTRACE_TEST_NAME();
226 225
@@ -232,8 +231,10 @@ static int trace_selftest_ops(int cnt)
232 goto out; 231 goto out;
233 if (trace_selftest_test_probe3_cnt != 1) 232 if (trace_selftest_test_probe3_cnt != 1)
234 goto out; 233 goto out;
235 if (trace_selftest_test_global_cnt == 0) 234 if (cnt > 1) {
236 goto out; 235 if (trace_selftest_test_global_cnt == 0)
236 goto out;
237 }
237 238
238 DYN_FTRACE_TEST_NAME2(); 239 DYN_FTRACE_TEST_NAME2();
239 240
@@ -269,8 +270,10 @@ static int trace_selftest_ops(int cnt)
269 goto out_free; 270 goto out_free;
270 if (trace_selftest_test_probe3_cnt != 3) 271 if (trace_selftest_test_probe3_cnt != 3)
271 goto out_free; 272 goto out_free;
272 if (trace_selftest_test_global_cnt == 0) 273 if (cnt > 1) {
273 goto out; 274 if (trace_selftest_test_global_cnt == 0)
275 goto out;
276 }
274 if (trace_selftest_test_dyn_cnt == 0) 277 if (trace_selftest_test_dyn_cnt == 0)
275 goto out_free; 278 goto out_free;
276 279
@@ -295,7 +298,9 @@ static int trace_selftest_ops(int cnt)
295 unregister_ftrace_function(&test_probe1); 298 unregister_ftrace_function(&test_probe1);
296 unregister_ftrace_function(&test_probe2); 299 unregister_ftrace_function(&test_probe2);
297 unregister_ftrace_function(&test_probe3); 300 unregister_ftrace_function(&test_probe3);
298 unregister_ftrace_function(&test_global); 301 if (cnt > 1)
302 unregister_ftrace_function(tr->ops);
303 ftrace_reset_array_ops(tr);
299 304
300 /* Make sure everything is off */ 305 /* Make sure everything is off */
301 reset_counts(); 306 reset_counts();
@@ -315,9 +320,9 @@ static int trace_selftest_ops(int cnt)
315} 320}
316 321
317/* Test dynamic code modification and ftrace filters */ 322/* Test dynamic code modification and ftrace filters */
318int trace_selftest_startup_dynamic_tracing(struct tracer *trace, 323static int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
319 struct trace_array *tr, 324 struct trace_array *tr,
320 int (*func)(void)) 325 int (*func)(void))
321{ 326{
322 int save_ftrace_enabled = ftrace_enabled; 327 int save_ftrace_enabled = ftrace_enabled;
323 unsigned long count; 328 unsigned long count;
@@ -388,7 +393,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
388 } 393 }
389 394
390 /* Test the ops with global tracing running */ 395 /* Test the ops with global tracing running */
391 ret = trace_selftest_ops(1); 396 ret = trace_selftest_ops(tr, 1);
392 trace->reset(tr); 397 trace->reset(tr);
393 398
394 out: 399 out:
@@ -399,7 +404,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
399 404
400 /* Test the ops with global tracing off */ 405 /* Test the ops with global tracing off */
401 if (!ret) 406 if (!ret)
402 ret = trace_selftest_ops(2); 407 ret = trace_selftest_ops(tr, 2);
403 408
404 return ret; 409 return ret;
405} 410}
@@ -802,7 +807,7 @@ out:
802int 807int
803trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) 808trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
804{ 809{
805 unsigned long save_max = tracing_max_latency; 810 unsigned long save_max = tr->max_latency;
806 unsigned long count; 811 unsigned long count;
807 int ret; 812 int ret;
808 813
@@ -814,7 +819,7 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
814 } 819 }
815 820
816 /* reset the max latency */ 821 /* reset the max latency */
817 tracing_max_latency = 0; 822 tr->max_latency = 0;
818 /* disable interrupts for a bit */ 823 /* disable interrupts for a bit */
819 local_irq_disable(); 824 local_irq_disable();
820 udelay(100); 825 udelay(100);
@@ -841,7 +846,7 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
841 ret = -1; 846 ret = -1;
842 } 847 }
843 848
844 tracing_max_latency = save_max; 849 tr->max_latency = save_max;
845 850
846 return ret; 851 return ret;
847} 852}
@@ -851,7 +856,7 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
851int 856int
852trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) 857trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
853{ 858{
854 unsigned long save_max = tracing_max_latency; 859 unsigned long save_max = tr->max_latency;
855 unsigned long count; 860 unsigned long count;
856 int ret; 861 int ret;
857 862
@@ -876,7 +881,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
876 } 881 }
877 882
878 /* reset the max latency */ 883 /* reset the max latency */
879 tracing_max_latency = 0; 884 tr->max_latency = 0;
880 /* disable preemption for a bit */ 885 /* disable preemption for a bit */
881 preempt_disable(); 886 preempt_disable();
882 udelay(100); 887 udelay(100);
@@ -903,7 +908,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
903 ret = -1; 908 ret = -1;
904 } 909 }
905 910
906 tracing_max_latency = save_max; 911 tr->max_latency = save_max;
907 912
908 return ret; 913 return ret;
909} 914}
@@ -913,7 +918,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
913int 918int
914trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *tr) 919trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *tr)
915{ 920{
916 unsigned long save_max = tracing_max_latency; 921 unsigned long save_max = tr->max_latency;
917 unsigned long count; 922 unsigned long count;
918 int ret; 923 int ret;
919 924
@@ -938,7 +943,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
938 } 943 }
939 944
940 /* reset the max latency */ 945 /* reset the max latency */
941 tracing_max_latency = 0; 946 tr->max_latency = 0;
942 947
943 /* disable preemption and interrupts for a bit */ 948 /* disable preemption and interrupts for a bit */
944 preempt_disable(); 949 preempt_disable();
@@ -973,7 +978,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
973 } 978 }
974 979
975 /* do the test by disabling interrupts first this time */ 980 /* do the test by disabling interrupts first this time */
976 tracing_max_latency = 0; 981 tr->max_latency = 0;
977 tracing_start(); 982 tracing_start();
978 trace->start(tr); 983 trace->start(tr);
979 984
@@ -1004,7 +1009,7 @@ out:
1004 tracing_start(); 1009 tracing_start();
1005out_no_start: 1010out_no_start:
1006 trace->reset(tr); 1011 trace->reset(tr);
1007 tracing_max_latency = save_max; 1012 tr->max_latency = save_max;
1008 1013
1009 return ret; 1014 return ret;
1010} 1015}
@@ -1057,7 +1062,7 @@ static int trace_wakeup_test_thread(void *data)
1057int 1062int
1058trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) 1063trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
1059{ 1064{
1060 unsigned long save_max = tracing_max_latency; 1065 unsigned long save_max = tr->max_latency;
1061 struct task_struct *p; 1066 struct task_struct *p;
1062 struct completion is_ready; 1067 struct completion is_ready;
1063 unsigned long count; 1068 unsigned long count;
@@ -1083,7 +1088,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
1083 } 1088 }
1084 1089
1085 /* reset the max latency */ 1090 /* reset the max latency */
1086 tracing_max_latency = 0; 1091 tr->max_latency = 0;
1087 1092
1088 while (p->on_rq) { 1093 while (p->on_rq) {
1089 /* 1094 /*
@@ -1113,7 +1118,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
1113 trace->reset(tr); 1118 trace->reset(tr);
1114 tracing_start(); 1119 tracing_start();
1115 1120
1116 tracing_max_latency = save_max; 1121 tr->max_latency = save_max;
1117 1122
1118 /* kill the thread */ 1123 /* kill the thread */
1119 kthread_stop(p); 1124 kthread_stop(p);
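
In trace_selftest.c the latency selftests now save and restore tr->max_latency instead of the removed global tracing_max_latency, and trace_selftest_ops() takes the trace_array so the second pass can exercise tr->ops set up through ftrace_init_array_ops(). The save/restore shape common to the irqsoff, preemptoff and wakeup tests, reduced to a sketch; selftest_with_saved_max and do_latency_selftest are illustrative names, not functions from the patch:

	static int do_latency_selftest(struct trace_array *tr)
	{
		/* stand-in for the real body: disable irqs/preemption briefly,
		 * then check the ring buffer contents */
		return 0;
	}

	static int selftest_with_saved_max(struct trace_array *tr)
	{
		unsigned long save_max = tr->max_latency;	/* was: tracing_max_latency */
		int ret;

		tr->max_latency = 0;		/* start from a clean watermark */
		ret = do_latency_selftest(tr);
		tr->max_latency = save_max;	/* restore whatever was recorded before */

		return ret;
	}
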
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 21b320e5d163..8a4e5cb66a4c 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -51,11 +51,33 @@ static DEFINE_MUTEX(stack_sysctl_mutex);
51int stack_tracer_enabled; 51int stack_tracer_enabled;
52static int last_stack_tracer_enabled; 52static int last_stack_tracer_enabled;
53 53
54static inline void print_max_stack(void)
55{
56 long i;
57 int size;
58
59 pr_emerg(" Depth Size Location (%d entries)\n"
60 " ----- ---- --------\n",
61 max_stack_trace.nr_entries - 1);
62
63 for (i = 0; i < max_stack_trace.nr_entries; i++) {
64 if (stack_dump_trace[i] == ULONG_MAX)
65 break;
66 if (i+1 == max_stack_trace.nr_entries ||
67 stack_dump_trace[i+1] == ULONG_MAX)
68 size = stack_dump_index[i];
69 else
70 size = stack_dump_index[i] - stack_dump_index[i+1];
71
72 pr_emerg("%3ld) %8d %5d %pS\n", i, stack_dump_index[i],
73 size, (void *)stack_dump_trace[i]);
74 }
75}
76
54static inline void 77static inline void
55check_stack(unsigned long ip, unsigned long *stack) 78check_stack(unsigned long ip, unsigned long *stack)
56{ 79{
57 unsigned long this_size, flags; 80 unsigned long this_size, flags; unsigned long *p, *top, *start;
58 unsigned long *p, *top, *start;
59 static int tracer_frame; 81 static int tracer_frame;
60 int frame_size = ACCESS_ONCE(tracer_frame); 82 int frame_size = ACCESS_ONCE(tracer_frame);
61 int i; 83 int i;
@@ -85,8 +107,12 @@ check_stack(unsigned long ip, unsigned long *stack)
85 107
86 max_stack_size = this_size; 108 max_stack_size = this_size;
87 109
88 max_stack_trace.nr_entries = 0; 110 max_stack_trace.nr_entries = 0;
89 max_stack_trace.skip = 3; 111
112 if (using_ftrace_ops_list_func())
113 max_stack_trace.skip = 4;
114 else
115 max_stack_trace.skip = 3;
90 116
91 save_stack_trace(&max_stack_trace); 117 save_stack_trace(&max_stack_trace);
92 118
@@ -145,8 +171,12 @@ check_stack(unsigned long ip, unsigned long *stack)
145 i++; 171 i++;
146 } 172 }
147 173
148 BUG_ON(current != &init_task && 174 if ((current != &init_task &&
149 *(end_of_stack(current)) != STACK_END_MAGIC); 175 *(end_of_stack(current)) != STACK_END_MAGIC)) {
176 print_max_stack();
177 BUG();
178 }
179
150 out: 180 out:
151 arch_spin_unlock(&max_stack_lock); 181 arch_spin_unlock(&max_stack_lock);
152 local_irq_restore(flags); 182 local_irq_restore(flags);
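
The new print_max_stack() in trace_stack.c reports each frame's size as the difference between adjacent stack_dump_index[] entries, using the entry itself for the deepest frame: recorded depths of 400, 320 and 250 bytes print as sizes 80, 70 and 250. Trimmed from the added function, the per-frame computation is:

	if (i + 1 == max_stack_trace.nr_entries ||
	    stack_dump_trace[i + 1] == ULONG_MAX)
		size = stack_dump_index[i];	/* deepest frame: all remaining depth */
	else
		size = stack_dump_index[i] - stack_dump_index[i + 1];
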
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index c082a7441345..04fdb5de823c 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -108,8 +108,8 @@ static unsigned long get_user_stack_nth(struct pt_regs *regs, unsigned int n)
108 * Uprobes-specific fetch functions 108 * Uprobes-specific fetch functions
109 */ 109 */
110#define DEFINE_FETCH_stack(type) \ 110#define DEFINE_FETCH_stack(type) \
111static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ 111static void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs, \
112 void *offset, void *dest) \ 112 void *offset, void *dest) \
113{ \ 113{ \
114 *(type *)dest = (type)get_user_stack_nth(regs, \ 114 *(type *)dest = (type)get_user_stack_nth(regs, \
115 ((unsigned long)offset)); \ 115 ((unsigned long)offset)); \
@@ -120,8 +120,8 @@ DEFINE_BASIC_FETCH_FUNCS(stack)
120#define fetch_stack_string_size NULL 120#define fetch_stack_string_size NULL
121 121
122#define DEFINE_FETCH_memory(type) \ 122#define DEFINE_FETCH_memory(type) \
123static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ 123static void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs, \
124 void *addr, void *dest) \ 124 void *addr, void *dest) \
125{ \ 125{ \
126 type retval; \ 126 type retval; \
127 void __user *vaddr = (void __force __user *) addr; \ 127 void __user *vaddr = (void __force __user *) addr; \
@@ -136,8 +136,8 @@ DEFINE_BASIC_FETCH_FUNCS(memory)
136 * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max 136 * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max
137 * length and relative data location. 137 * length and relative data location.
138 */ 138 */
139static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, 139static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
140 void *addr, void *dest) 140 void *addr, void *dest)
141{ 141{
142 long ret; 142 long ret;
143 u32 rloc = *(u32 *)dest; 143 u32 rloc = *(u32 *)dest;
@@ -158,8 +158,8 @@ static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
158 } 158 }
159} 159}
160 160
161static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, 161static void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,
162 void *addr, void *dest) 162 void *addr, void *dest)
163{ 163{
164 int len; 164 int len;
165 void __user *vaddr = (void __force __user *) addr; 165 void __user *vaddr = (void __force __user *) addr;
@@ -184,8 +184,8 @@ static unsigned long translate_user_vaddr(void *file_offset)
184} 184}
185 185
186#define DEFINE_FETCH_file_offset(type) \ 186#define DEFINE_FETCH_file_offset(type) \
187static __kprobes void FETCH_FUNC_NAME(file_offset, type)(struct pt_regs *regs,\ 187static void FETCH_FUNC_NAME(file_offset, type)(struct pt_regs *regs, \
188 void *offset, void *dest) \ 188 void *offset, void *dest)\
189{ \ 189{ \
190 void *vaddr = (void *)translate_user_vaddr(offset); \ 190 void *vaddr = (void *)translate_user_vaddr(offset); \
191 \ 191 \
@@ -1009,56 +1009,60 @@ uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event)
1009 return __uprobe_perf_filter(&tu->filter, event->hw.tp_target->mm); 1009 return __uprobe_perf_filter(&tu->filter, event->hw.tp_target->mm);
1010} 1010}
1011 1011
1012static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event) 1012static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event)
1013{ 1013{
1014 bool done; 1014 bool done;
1015 1015
1016 write_lock(&tu->filter.rwlock); 1016 write_lock(&tu->filter.rwlock);
1017 if (event->hw.tp_target) { 1017 if (event->hw.tp_target) {
1018 /* 1018 list_del(&event->hw.tp_list);
1019 * event->parent != NULL means copy_process(), we can avoid
1020 * uprobe_apply(). current->mm must be probed and we can rely
1021 * on dup_mmap() which preserves the already installed bp's.
1022 *
1023 * attr.enable_on_exec means that exec/mmap will install the
1024 * breakpoints we need.
1025 */
1026 done = tu->filter.nr_systemwide || 1019 done = tu->filter.nr_systemwide ||
1027 event->parent || event->attr.enable_on_exec || 1020 (event->hw.tp_target->flags & PF_EXITING) ||
1028 uprobe_filter_event(tu, event); 1021 uprobe_filter_event(tu, event);
1029 list_add(&event->hw.tp_list, &tu->filter.perf_events);
1030 } else { 1022 } else {
1023 tu->filter.nr_systemwide--;
1031 done = tu->filter.nr_systemwide; 1024 done = tu->filter.nr_systemwide;
1032 tu->filter.nr_systemwide++;
1033 } 1025 }
1034 write_unlock(&tu->filter.rwlock); 1026 write_unlock(&tu->filter.rwlock);
1035 1027
1036 if (!done) 1028 if (!done)
1037 uprobe_apply(tu->inode, tu->offset, &tu->consumer, true); 1029 return uprobe_apply(tu->inode, tu->offset, &tu->consumer, false);
1038 1030
1039 return 0; 1031 return 0;
1040} 1032}
1041 1033
1042static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) 1034static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event)
1043{ 1035{
1044 bool done; 1036 bool done;
1037 int err;
1045 1038
1046 write_lock(&tu->filter.rwlock); 1039 write_lock(&tu->filter.rwlock);
1047 if (event->hw.tp_target) { 1040 if (event->hw.tp_target) {
1048 list_del(&event->hw.tp_list); 1041 /*
1042 * event->parent != NULL means copy_process(), we can avoid
1043 * uprobe_apply(). current->mm must be probed and we can rely
1044 * on dup_mmap() which preserves the already installed bp's.
1045 *
1046 * attr.enable_on_exec means that exec/mmap will install the
1047 * breakpoints we need.
1048 */
1049 done = tu->filter.nr_systemwide || 1049 done = tu->filter.nr_systemwide ||
1050 (event->hw.tp_target->flags & PF_EXITING) || 1050 event->parent || event->attr.enable_on_exec ||
1051 uprobe_filter_event(tu, event); 1051 uprobe_filter_event(tu, event);
1052 list_add(&event->hw.tp_list, &tu->filter.perf_events);
1052 } else { 1053 } else {
1053 tu->filter.nr_systemwide--;
1054 done = tu->filter.nr_systemwide; 1054 done = tu->filter.nr_systemwide;
1055 tu->filter.nr_systemwide++;
1055 } 1056 }
1056 write_unlock(&tu->filter.rwlock); 1057 write_unlock(&tu->filter.rwlock);
1057 1058
1058 if (!done) 1059 err = 0;
1059 uprobe_apply(tu->inode, tu->offset, &tu->consumer, false); 1060 if (!done) {
1060 1061 err = uprobe_apply(tu->inode, tu->offset, &tu->consumer, true);
1061 return 0; 1062 if (err)
1063 uprobe_perf_close(tu, event);
1064 }
1065 return err;
1062} 1066}
1063 1067
1064static bool uprobe_perf_filter(struct uprobe_consumer *uc, 1068static bool uprobe_perf_filter(struct uprobe_consumer *uc,
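
In trace_uprobe.c, uprobe_perf_close() is moved ahead of uprobe_perf_open() so the open path can use it for error handling: open now propagates a failure from uprobe_apply() and first unwinds the filter bookkeeping it performed under tu->filter.rwlock. Condensed from the hunk above, the tail of uprobe_perf_open() becomes:

	err = 0;
	if (!done) {
		err = uprobe_apply(tu->inode, tu->offset, &tu->consumer, true);
		if (err)
			uprobe_perf_close(tu, event);	/* roll back the list/counter update */
	}
	return err;

Previously the return value of uprobe_apply() was ignored, so a failed breakpoint installation still left the perf event registered.
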
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index ac5b23cf7212..33cbd8c203f8 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -188,7 +188,6 @@ static int tracepoint_add_func(struct tracepoint *tp,
188 WARN_ON_ONCE(1); 188 WARN_ON_ONCE(1);
189 return PTR_ERR(old); 189 return PTR_ERR(old);
190 } 190 }
191 release_probes(old);
192 191
193 /* 192 /*
194 * rcu_assign_pointer has a smp_wmb() which makes sure that the new 193 * rcu_assign_pointer has a smp_wmb() which makes sure that the new
@@ -200,6 +199,7 @@ static int tracepoint_add_func(struct tracepoint *tp,
200 rcu_assign_pointer(tp->funcs, tp_funcs); 199 rcu_assign_pointer(tp->funcs, tp_funcs);
201 if (!static_key_enabled(&tp->key)) 200 if (!static_key_enabled(&tp->key))
202 static_key_slow_inc(&tp->key); 201 static_key_slow_inc(&tp->key);
202 release_probes(old);
203 return 0; 203 return 0;
204} 204}
205 205
@@ -221,7 +221,6 @@ static int tracepoint_remove_func(struct tracepoint *tp,
221 WARN_ON_ONCE(1); 221 WARN_ON_ONCE(1);
222 return PTR_ERR(old); 222 return PTR_ERR(old);
223 } 223 }
224 release_probes(old);
225 224
226 if (!tp_funcs) { 225 if (!tp_funcs) {
227 /* Removed last function */ 226 /* Removed last function */
@@ -232,6 +231,7 @@ static int tracepoint_remove_func(struct tracepoint *tp,
232 static_key_slow_dec(&tp->key); 231 static_key_slow_dec(&tp->key);
233 } 232 }
234 rcu_assign_pointer(tp->funcs, tp_funcs); 233 rcu_assign_pointer(tp->funcs, tp_funcs);
234 release_probes(old);
235 return 0; 235 return 0;
236} 236}
237 237
@@ -239,6 +239,7 @@ static int tracepoint_remove_func(struct tracepoint *tp,
239 * tracepoint_probe_register - Connect a probe to a tracepoint 239 * tracepoint_probe_register - Connect a probe to a tracepoint
240 * @tp: tracepoint 240 * @tp: tracepoint
241 * @probe: probe handler 241 * @probe: probe handler
242 * @data: tracepoint data
242 * 243 *
243 * Returns 0 if ok, error value on error. 244 * Returns 0 if ok, error value on error.
244 * Note: if @tp is within a module, the caller is responsible for 245 * Note: if @tp is within a module, the caller is responsible for
@@ -264,6 +265,7 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_register);
264 * tracepoint_probe_unregister - Disconnect a probe from a tracepoint 265 * tracepoint_probe_unregister - Disconnect a probe from a tracepoint
265 * @tp: tracepoint 266 * @tp: tracepoint
266 * @probe: probe function pointer 267 * @probe: probe function pointer
268 * @data: tracepoint data
267 * 269 *
268 * Returns 0 if ok, error value on error. 270 * Returns 0 if ok, error value on error.
269 */ 271 */
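
The kernel/tracepoint.c change moves release_probes(old) in both tracepoint_add_func() and tracepoint_remove_func() from before the RCU publication to after it. release_probes() queues the old function array for freeing after a grace period, so calling it while tp->funcs still pointed at that array left a window where a reader could still fetch a pointer whose delayed free had already been scheduled. The add path now ends with:

	rcu_assign_pointer(tp->funcs, tp_funcs);	/* publish the new array first */
	if (!static_key_enabled(&tp->key))
		static_key_slow_inc(&tp->key);
	release_probes(old);	/* only now queue the RCU-delayed free of the old array */
	return 0;
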
diff --git a/kernel/user.c b/kernel/user.c
index 294fc6a94168..4efa39350e44 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -87,7 +87,6 @@ static DEFINE_SPINLOCK(uidhash_lock);
87struct user_struct root_user = { 87struct user_struct root_user = {
88 .__count = ATOMIC_INIT(1), 88 .__count = ATOMIC_INIT(1),
89 .processes = ATOMIC_INIT(1), 89 .processes = ATOMIC_INIT(1),
90 .files = ATOMIC_INIT(0),
91 .sigpending = ATOMIC_INIT(0), 90 .sigpending = ATOMIC_INIT(0),
92 .locked_shm = 0, 91 .locked_shm = 0,
93 .uid = GLOBAL_ROOT_UID, 92 .uid = GLOBAL_ROOT_UID,
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index bf71b4b2d632..fcc02560fd6b 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -286,7 +286,7 @@ EXPORT_SYMBOL(from_kuid_munged);
286/** 286/**
287 * make_kgid - Map a user-namespace gid pair into a kgid. 287 * make_kgid - Map a user-namespace gid pair into a kgid.
288 * @ns: User namespace that the gid is in 288 * @ns: User namespace that the gid is in
289 * @uid: group identifier 289 * @gid: group identifier
290 * 290 *
291 * Maps a user-namespace gid pair into a kernel internal kgid, 291 * Maps a user-namespace gid pair into a kernel internal kgid,
292 * and returns that kgid. 292 * and returns that kgid.
@@ -482,7 +482,8 @@ static int projid_m_show(struct seq_file *seq, void *v)
482 return 0; 482 return 0;
483} 483}
484 484
485static void *m_start(struct seq_file *seq, loff_t *ppos, struct uid_gid_map *map) 485static void *m_start(struct seq_file *seq, loff_t *ppos,
486 struct uid_gid_map *map)
486{ 487{
487 struct uid_gid_extent *extent = NULL; 488 struct uid_gid_extent *extent = NULL;
488 loff_t pos = *ppos; 489 loff_t pos = *ppos;
@@ -546,7 +547,8 @@ struct seq_operations proc_projid_seq_operations = {
546 .show = projid_m_show, 547 .show = projid_m_show,
547}; 548};
548 549
549static bool mappings_overlap(struct uid_gid_map *new_map, struct uid_gid_extent *extent) 550static bool mappings_overlap(struct uid_gid_map *new_map,
551 struct uid_gid_extent *extent)
550{ 552{
551 u32 upper_first, lower_first, upper_last, lower_last; 553 u32 upper_first, lower_first, upper_last, lower_last;
552 unsigned idx; 554 unsigned idx;
@@ -653,7 +655,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
653 ret = -EINVAL; 655 ret = -EINVAL;
654 pos = kbuf; 656 pos = kbuf;
655 new_map.nr_extents = 0; 657 new_map.nr_extents = 0;
656 for (;pos; pos = next_line) { 658 for (; pos; pos = next_line) {
657 extent = &new_map.extent[new_map.nr_extents]; 659 extent = &new_map.extent[new_map.nr_extents];
658 660
659 /* Find the end of line and ensure I don't look past it */ 661 /* Find the end of line and ensure I don't look past it */
@@ -687,13 +689,16 @@ static ssize_t map_write(struct file *file, const char __user *buf,
687 689
688 /* Verify we have been given valid starting values */ 690 /* Verify we have been given valid starting values */
689 if ((extent->first == (u32) -1) || 691 if ((extent->first == (u32) -1) ||
690 (extent->lower_first == (u32) -1 )) 692 (extent->lower_first == (u32) -1))
691 goto out; 693 goto out;
692 694
693 /* Verify count is not zero and does not cause the extent to wrap */ 695 /* Verify count is not zero and does not cause the
696 * extent to wrap
697 */
694 if ((extent->first + extent->count) <= extent->first) 698 if ((extent->first + extent->count) <= extent->first)
695 goto out; 699 goto out;
696 if ((extent->lower_first + extent->count) <= extent->lower_first) 700 if ((extent->lower_first + extent->count) <=
701 extent->lower_first)
697 goto out; 702 goto out;
698 703
699 /* Do the ranges in extent overlap any previous extents? */ 704 /* Do the ranges in extent overlap any previous extents? */
@@ -751,7 +756,8 @@ out:
751 return ret; 756 return ret;
752} 757}
753 758
754ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) 759ssize_t proc_uid_map_write(struct file *file, const char __user *buf,
760 size_t size, loff_t *ppos)
755{ 761{
756 struct seq_file *seq = file->private_data; 762 struct seq_file *seq = file->private_data;
757 struct user_namespace *ns = seq->private; 763 struct user_namespace *ns = seq->private;
@@ -767,7 +773,8 @@ ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t siz
767 &ns->uid_map, &ns->parent->uid_map); 773 &ns->uid_map, &ns->parent->uid_map);
768} 774}
769 775
770ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) 776ssize_t proc_gid_map_write(struct file *file, const char __user *buf,
777 size_t size, loff_t *ppos)
771{ 778{
772 struct seq_file *seq = file->private_data; 779 struct seq_file *seq = file->private_data;
773 struct user_namespace *ns = seq->private; 780 struct user_namespace *ns = seq->private;
@@ -783,7 +790,8 @@ ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t siz
783 &ns->gid_map, &ns->parent->gid_map); 790 &ns->gid_map, &ns->parent->gid_map);
784} 791}
785 792
786ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) 793ssize_t proc_projid_map_write(struct file *file, const char __user *buf,
794 size_t size, loff_t *ppos)
787{ 795{
788 struct seq_file *seq = file->private_data; 796 struct seq_file *seq = file->private_data;
789 struct user_namespace *ns = seq->private; 797 struct user_namespace *ns = seq->private;
@@ -800,7 +808,7 @@ ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t
800 &ns->projid_map, &ns->parent->projid_map); 808 &ns->projid_map, &ns->parent->projid_map);
801} 809}
802 810
803static bool new_idmap_permitted(const struct file *file, 811static bool new_idmap_permitted(const struct file *file,
804 struct user_namespace *ns, int cap_setid, 812 struct user_namespace *ns, int cap_setid,
805 struct uid_gid_map *new_map) 813 struct uid_gid_map *new_map)
806{ 814{
@@ -811,8 +819,7 @@ static bool new_idmap_permitted(const struct file *file,
811 kuid_t uid = make_kuid(ns->parent, id); 819 kuid_t uid = make_kuid(ns->parent, id);
812 if (uid_eq(uid, file->f_cred->fsuid)) 820 if (uid_eq(uid, file->f_cred->fsuid))
813 return true; 821 return true;
814 } 822 } else if (cap_setid == CAP_SETGID) {
815 else if (cap_setid == CAP_SETGID) {
816 kgid_t gid = make_kgid(ns->parent, id); 823 kgid_t gid = make_kgid(ns->parent, id);
817 if (gid_eq(gid, file->f_cred->fsgid)) 824 if (gid_eq(gid, file->f_cred->fsgid))
818 return true; 825 return true;
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index 4f69f9a5e221..c8eac43267e9 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -17,7 +17,7 @@
17 17
18#ifdef CONFIG_PROC_SYSCTL 18#ifdef CONFIG_PROC_SYSCTL
19 19
20static void *get_uts(ctl_table *table, int write) 20static void *get_uts(struct ctl_table *table, int write)
21{ 21{
22 char *which = table->data; 22 char *which = table->data;
23 struct uts_namespace *uts_ns; 23 struct uts_namespace *uts_ns;
@@ -32,7 +32,7 @@ static void *get_uts(ctl_table *table, int write)
32 return which; 32 return which;
33} 33}
34 34
35static void put_uts(ctl_table *table, int write, void *which) 35static void put_uts(struct ctl_table *table, int write, void *which)
36{ 36{
37 if (!write) 37 if (!write)
38 up_read(&uts_sem); 38 up_read(&uts_sem);
@@ -44,14 +44,14 @@ static void put_uts(ctl_table *table, int write, void *which)
44 * Special case of dostring for the UTS structure. This has locks 44 * Special case of dostring for the UTS structure. This has locks
45 * to observe. Should this be in kernel/sys.c ???? 45 * to observe. Should this be in kernel/sys.c ????
46 */ 46 */
47static int proc_do_uts_string(ctl_table *table, int write, 47static int proc_do_uts_string(struct ctl_table *table, int write,
48 void __user *buffer, size_t *lenp, loff_t *ppos) 48 void __user *buffer, size_t *lenp, loff_t *ppos)
49{ 49{
50 struct ctl_table uts_table; 50 struct ctl_table uts_table;
51 int r; 51 int r;
52 memcpy(&uts_table, table, sizeof(uts_table)); 52 memcpy(&uts_table, table, sizeof(uts_table));
53 uts_table.data = get_uts(table, write); 53 uts_table.data = get_uts(table, write);
54 r = proc_dostring(&uts_table,write,buffer,lenp, ppos); 54 r = proc_dostring(&uts_table, write, buffer, lenp, ppos);
55 put_uts(table, write, uts_table.data); 55 put_uts(table, write, uts_table.data);
56 56
57 if (write) 57 if (write)
@@ -135,4 +135,4 @@ static int __init utsname_sysctl_init(void)
135 return 0; 135 return 0;
136} 136}
137 137
138__initcall(utsname_sysctl_init); 138device_initcall(utsname_sysctl_init);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 0ee63af30bd1..6203d2900877 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -65,15 +65,12 @@ enum {
65 * be executing on any CPU. The pool behaves as an unbound one. 65 * be executing on any CPU. The pool behaves as an unbound one.
66 * 66 *
67 * Note that DISASSOCIATED should be flipped only while holding 67 * Note that DISASSOCIATED should be flipped only while holding
68 * manager_mutex to avoid changing binding state while 68 * attach_mutex to avoid changing binding state while
69 * create_worker() is in progress. 69 * worker_attach_to_pool() is in progress.
70 */ 70 */
71 POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */
72 POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ 71 POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */
73 POOL_FREEZING = 1 << 3, /* freeze in progress */
74 72
75 /* worker flags */ 73 /* worker flags */
76 WORKER_STARTED = 1 << 0, /* started */
77 WORKER_DIE = 1 << 1, /* die die die */ 74 WORKER_DIE = 1 << 1, /* die die die */
78 WORKER_IDLE = 1 << 2, /* is idle */ 75 WORKER_IDLE = 1 << 2, /* is idle */
79 WORKER_PREP = 1 << 3, /* preparing to run works */ 76 WORKER_PREP = 1 << 3, /* preparing to run works */
@@ -100,10 +97,10 @@ enum {
100 97
101 /* 98 /*
102 * Rescue workers are used only on emergencies and shared by 99 * Rescue workers are used only on emergencies and shared by
103 * all cpus. Give -20. 100 * all cpus. Give MIN_NICE.
104 */ 101 */
105 RESCUER_NICE_LEVEL = -20, 102 RESCUER_NICE_LEVEL = MIN_NICE,
106 HIGHPRI_NICE_LEVEL = -20, 103 HIGHPRI_NICE_LEVEL = MIN_NICE,
107 104
108 WQ_NAME_LEN = 24, 105 WQ_NAME_LEN = 24,
109}; 106};
@@ -124,8 +121,7 @@ enum {
124 * cpu or grabbing pool->lock is enough for read access. If 121 * cpu or grabbing pool->lock is enough for read access. If
125 * POOL_DISASSOCIATED is set, it's identical to L. 122 * POOL_DISASSOCIATED is set, it's identical to L.
126 * 123 *
127 * MG: pool->manager_mutex and pool->lock protected. Writes require both 124 * A: pool->attach_mutex protected.
128 * locks. Reads can happen under either lock.
129 * 125 *
130 * PL: wq_pool_mutex protected. 126 * PL: wq_pool_mutex protected.
131 * 127 *
@@ -163,8 +159,11 @@ struct worker_pool {
163 159
164 /* see manage_workers() for details on the two manager mutexes */ 160 /* see manage_workers() for details on the two manager mutexes */
165 struct mutex manager_arb; /* manager arbitration */ 161 struct mutex manager_arb; /* manager arbitration */
166 struct mutex manager_mutex; /* manager exclusion */ 162 struct mutex attach_mutex; /* attach/detach exclusion */
167 struct idr worker_idr; /* MG: worker IDs and iteration */ 163 struct list_head workers; /* A: attached workers */
164 struct completion *detach_completion; /* all workers detached */
165
166 struct ida worker_ida; /* worker IDs for task name */
168 167
169 struct workqueue_attrs *attrs; /* I: worker attributes */ 168 struct workqueue_attrs *attrs; /* I: worker attributes */
170 struct hlist_node hash_node; /* PL: unbound_pool_hash node */ 169 struct hlist_node hash_node; /* PL: unbound_pool_hash node */
@@ -340,16 +339,6 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to,
340 lockdep_is_held(&wq->mutex), \ 339 lockdep_is_held(&wq->mutex), \
341 "sched RCU or wq->mutex should be held") 340 "sched RCU or wq->mutex should be held")
342 341
343#ifdef CONFIG_LOCKDEP
344#define assert_manager_or_pool_lock(pool) \
345 WARN_ONCE(debug_locks && \
346 !lockdep_is_held(&(pool)->manager_mutex) && \
347 !lockdep_is_held(&(pool)->lock), \
348 "pool->manager_mutex or ->lock should be held")
349#else
350#define assert_manager_or_pool_lock(pool) do { } while (0)
351#endif
352
353#define for_each_cpu_worker_pool(pool, cpu) \ 342#define for_each_cpu_worker_pool(pool, cpu) \
354 for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \ 343 for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \
355 (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \ 344 (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \
@@ -375,17 +364,16 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to,
375/** 364/**
376 * for_each_pool_worker - iterate through all workers of a worker_pool 365 * for_each_pool_worker - iterate through all workers of a worker_pool
377 * @worker: iteration cursor 366 * @worker: iteration cursor
378 * @wi: integer used for iteration
379 * @pool: worker_pool to iterate workers of 367 * @pool: worker_pool to iterate workers of
380 * 368 *
381 * This must be called with either @pool->manager_mutex or ->lock held. 369 * This must be called with @pool->attach_mutex.
382 * 370 *
383 * The if/else clause exists only for the lockdep assertion and can be 371 * The if/else clause exists only for the lockdep assertion and can be
384 * ignored. 372 * ignored.
385 */ 373 */
386#define for_each_pool_worker(worker, wi, pool) \ 374#define for_each_pool_worker(worker, pool) \
387 idr_for_each_entry(&(pool)->worker_idr, (worker), (wi)) \ 375 list_for_each_entry((worker), &(pool)->workers, node) \
388 if (({ assert_manager_or_pool_lock((pool)); false; })) { } \ 376 if (({ lockdep_assert_held(&pool->attach_mutex); false; })) { } \
389 else 377 else
390 378
391/** 379/**
@@ -763,13 +751,6 @@ static bool need_to_create_worker(struct worker_pool *pool)
763 return need_more_worker(pool) && !may_start_working(pool); 751 return need_more_worker(pool) && !may_start_working(pool);
764} 752}
765 753
766/* Do I need to be the manager? */
767static bool need_to_manage_workers(struct worker_pool *pool)
768{
769 return need_to_create_worker(pool) ||
770 (pool->flags & POOL_MANAGE_WORKERS);
771}
772
773/* Do we have too many workers and should some go away? */ 754/* Do we have too many workers and should some go away? */
774static bool too_many_workers(struct worker_pool *pool) 755static bool too_many_workers(struct worker_pool *pool)
775{ 756{
@@ -791,8 +772,8 @@ static bool too_many_workers(struct worker_pool *pool)
791 * Wake up functions. 772 * Wake up functions.
792 */ 773 */
793 774
794/* Return the first worker. Safe with preemption disabled */ 775/* Return the first idle worker. Safe with preemption disabled */
795static struct worker *first_worker(struct worker_pool *pool) 776static struct worker *first_idle_worker(struct worker_pool *pool)
796{ 777{
797 if (unlikely(list_empty(&pool->idle_list))) 778 if (unlikely(list_empty(&pool->idle_list)))
798 return NULL; 779 return NULL;
@@ -811,7 +792,7 @@ static struct worker *first_worker(struct worker_pool *pool)
811 */ 792 */
812static void wake_up_worker(struct worker_pool *pool) 793static void wake_up_worker(struct worker_pool *pool)
813{ 794{
814 struct worker *worker = first_worker(pool); 795 struct worker *worker = first_idle_worker(pool);
815 796
816 if (likely(worker)) 797 if (likely(worker))
817 wake_up_process(worker->task); 798 wake_up_process(worker->task);
@@ -885,7 +866,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu)
885 */ 866 */
886 if (atomic_dec_and_test(&pool->nr_running) && 867 if (atomic_dec_and_test(&pool->nr_running) &&
887 !list_empty(&pool->worklist)) 868 !list_empty(&pool->worklist))
888 to_wakeup = first_worker(pool); 869 to_wakeup = first_idle_worker(pool);
889 return to_wakeup ? to_wakeup->task : NULL; 870 return to_wakeup ? to_wakeup->task : NULL;
890} 871}
891 872
@@ -1621,70 +1602,6 @@ static void worker_leave_idle(struct worker *worker)
1621 list_del_init(&worker->entry); 1602 list_del_init(&worker->entry);
1622} 1603}
1623 1604
1624/**
1625 * worker_maybe_bind_and_lock - try to bind %current to worker_pool and lock it
1626 * @pool: target worker_pool
1627 *
1628 * Bind %current to the cpu of @pool if it is associated and lock @pool.
1629 *
1630 * Works which are scheduled while the cpu is online must at least be
1631 * scheduled to a worker which is bound to the cpu so that if they are
1632 * flushed from cpu callbacks while cpu is going down, they are
1633 * guaranteed to execute on the cpu.
1634 *
1635 * This function is to be used by unbound workers and rescuers to bind
1636 * themselves to the target cpu and may race with cpu going down or
1637 * coming online. kthread_bind() can't be used because it may put the
1638 * worker to already dead cpu and set_cpus_allowed_ptr() can't be used
1639 * verbatim as it's best effort and blocking and pool may be
1640 * [dis]associated in the meantime.
1641 *
1642 * This function tries set_cpus_allowed() and locks pool and verifies the
1643 * binding against %POOL_DISASSOCIATED which is set during
1644 * %CPU_DOWN_PREPARE and cleared during %CPU_ONLINE, so if the worker
1645 * enters idle state or fetches works without dropping lock, it can
1646 * guarantee the scheduling requirement described in the first paragraph.
1647 *
1648 * CONTEXT:
1649 * Might sleep. Called without any lock but returns with pool->lock
1650 * held.
1651 *
1652 * Return:
1653 * %true if the associated pool is online (@worker is successfully
1654 * bound), %false if offline.
1655 */
1656static bool worker_maybe_bind_and_lock(struct worker_pool *pool)
1657__acquires(&pool->lock)
1658{
1659 while (true) {
1660 /*
1661 * The following call may fail, succeed or succeed
1662 * without actually migrating the task to the cpu if
1663 * it races with cpu hotunplug operation. Verify
1664 * against POOL_DISASSOCIATED.
1665 */
1666 if (!(pool->flags & POOL_DISASSOCIATED))
1667 set_cpus_allowed_ptr(current, pool->attrs->cpumask);
1668
1669 spin_lock_irq(&pool->lock);
1670 if (pool->flags & POOL_DISASSOCIATED)
1671 return false;
1672 if (task_cpu(current) == pool->cpu &&
1673 cpumask_equal(&current->cpus_allowed, pool->attrs->cpumask))
1674 return true;
1675 spin_unlock_irq(&pool->lock);
1676
1677 /*
1678 * We've raced with CPU hot[un]plug. Give it a breather
1679 * and retry migration. cond_resched() is required here;
1680 * otherwise, we might deadlock against cpu_stop trying to
1681 * bring down the CPU on non-preemptive kernel.
1682 */
1683 cpu_relax();
1684 cond_resched();
1685 }
1686}
1687
1688static struct worker *alloc_worker(void) 1605static struct worker *alloc_worker(void)
1689{ 1606{
1690 struct worker *worker; 1607 struct worker *worker;
@@ -1693,6 +1610,7 @@ static struct worker *alloc_worker(void)
1693 if (worker) { 1610 if (worker) {
1694 INIT_LIST_HEAD(&worker->entry); 1611 INIT_LIST_HEAD(&worker->entry);
1695 INIT_LIST_HEAD(&worker->scheduled); 1612 INIT_LIST_HEAD(&worker->scheduled);
1613 INIT_LIST_HEAD(&worker->node);
1696 /* on creation a worker is in !idle && prep state */ 1614 /* on creation a worker is in !idle && prep state */
1697 worker->flags = WORKER_PREP; 1615 worker->flags = WORKER_PREP;
1698 } 1616 }
@@ -1700,12 +1618,68 @@ static struct worker *alloc_worker(void)
1700} 1618}
1701 1619
1702/** 1620/**
1621 * worker_attach_to_pool() - attach a worker to a pool
1622 * @worker: worker to be attached
1623 * @pool: the target pool
1624 *
1625 * Attach @worker to @pool. Once attached, the %WORKER_UNBOUND flag and
1626 * cpu-binding of @worker are kept coordinated with the pool across
1627 * cpu-[un]hotplugs.
1628 */
1629static void worker_attach_to_pool(struct worker *worker,
1630 struct worker_pool *pool)
1631{
1632 mutex_lock(&pool->attach_mutex);
1633
1634 /*
1635 * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any
1636 * online CPUs. It'll be re-applied when any of the CPUs come up.
1637 */
1638 set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
1639
1640 /*
1641 * The pool->attach_mutex ensures %POOL_DISASSOCIATED remains
1642 * stable across this function. See the comments above the
1643 * flag definition for details.
1644 */
1645 if (pool->flags & POOL_DISASSOCIATED)
1646 worker->flags |= WORKER_UNBOUND;
1647
1648 list_add_tail(&worker->node, &pool->workers);
1649
1650 mutex_unlock(&pool->attach_mutex);
1651}
1652
1653/**
1654 * worker_detach_from_pool() - detach a worker from its pool
1655 * @worker: worker which is attached to its pool
1656 * @pool: the pool @worker is attached to
1657 *
1658 * Undo the attaching which had been done in worker_attach_to_pool(). The
1659 * caller worker shouldn't access to the pool after detached except it has
1660 * other reference to the pool.
1661 */
1662static void worker_detach_from_pool(struct worker *worker,
1663 struct worker_pool *pool)
1664{
1665 struct completion *detach_completion = NULL;
1666
1667 mutex_lock(&pool->attach_mutex);
1668 list_del(&worker->node);
1669 if (list_empty(&pool->workers))
1670 detach_completion = pool->detach_completion;
1671 mutex_unlock(&pool->attach_mutex);
1672
1673 if (detach_completion)
1674 complete(detach_completion);
1675}
1676
1677/**
1703 * create_worker - create a new workqueue worker 1678 * create_worker - create a new workqueue worker
1704 * @pool: pool the new worker will belong to 1679 * @pool: pool the new worker will belong to
1705 * 1680 *
1706 * Create a new worker which is bound to @pool. The returned worker 1681 * Create a new worker which is attached to @pool. The new worker must be
1707 * can be started by calling start_worker() or destroyed using 1682 * started by start_worker().
1708 * destroy_worker().
1709 * 1683 *
1710 * CONTEXT: 1684 * CONTEXT:
1711 * Might sleep. Does GFP_KERNEL allocations. 1685 * Might sleep. Does GFP_KERNEL allocations.
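
The workqueue.c hunk above replaces the worker_idr (which needed manager_mutex and pool->lock) with a plain pool->workers list protected by the new attach_mutex, and lets the last detaching worker complete pool->detach_completion. A teardown path can then park a completion there and block until every worker has left; the sketch below shows that waiting side, as an illustration of how detach_completion is meant to be used rather than code quoted from this hunk, and assumes it runs in pool-destruction context:

	DECLARE_COMPLETION_ONSTACK(detach_completion);

	mutex_lock(&pool->attach_mutex);
	if (!list_empty(&pool->workers))
		pool->detach_completion = &detach_completion;	/* ask the last worker to signal us */
	mutex_unlock(&pool->attach_mutex);

	if (pool->detach_completion)
		wait_for_completion(pool->detach_completion);	/* all workers ran worker_detach_from_pool() */

Because the completion is armed under attach_mutex and each detaching worker checks list emptiness under the same mutex, the waiter cannot miss the final complete() call.
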
@@ -1719,19 +1693,8 @@ static struct worker *create_worker(struct worker_pool *pool)
1719 int id = -1; 1693 int id = -1;
1720 char id_buf[16]; 1694 char id_buf[16];
1721 1695
1722 lockdep_assert_held(&pool->manager_mutex); 1696 /* ID is needed to determine kthread name */
1723 1697 id = ida_simple_get(&pool->worker_ida, 0, 0, GFP_KERNEL);
1724 /*
1725 * ID is needed to determine kthread name. Allocate ID first
1726 * without installing the pointer.
1727 */
1728 idr_preload(GFP_KERNEL);
1729 spin_lock_irq(&pool->lock);
1730
1731 id = idr_alloc(&pool->worker_idr, NULL, 0, 0, GFP_NOWAIT);
1732
1733 spin_unlock_irq(&pool->lock);
1734 idr_preload_end();
1735 if (id < 0) 1698 if (id < 0)
1736 goto fail; 1699 goto fail;
1737 1700
@@ -1758,33 +1721,14 @@ static struct worker *create_worker(struct worker_pool *pool)
1758 /* prevent userland from meddling with cpumask of workqueue workers */ 1721 /* prevent userland from meddling with cpumask of workqueue workers */
1759 worker->task->flags |= PF_NO_SETAFFINITY; 1722 worker->task->flags |= PF_NO_SETAFFINITY;
1760 1723
1761 /* 1724 /* successful, attach the worker to the pool */
1762 * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any 1725 worker_attach_to_pool(worker, pool);
1763 * online CPUs. It'll be re-applied when any of the CPUs come up.
1764 */
1765 set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
1766
1767 /*
1768 * The caller is responsible for ensuring %POOL_DISASSOCIATED
1769 * remains stable across this function. See the comments above the
1770 * flag definition for details.
1771 */
1772 if (pool->flags & POOL_DISASSOCIATED)
1773 worker->flags |= WORKER_UNBOUND;
1774
1775 /* successful, commit the pointer to idr */
1776 spin_lock_irq(&pool->lock);
1777 idr_replace(&pool->worker_idr, worker, worker->id);
1778 spin_unlock_irq(&pool->lock);
1779 1726
1780 return worker; 1727 return worker;
1781 1728
1782fail: 1729fail:
1783 if (id >= 0) { 1730 if (id >= 0)
1784 spin_lock_irq(&pool->lock); 1731 ida_simple_remove(&pool->worker_ida, id);
1785 idr_remove(&pool->worker_idr, id);
1786 spin_unlock_irq(&pool->lock);
1787 }
1788 kfree(worker); 1732 kfree(worker);
1789 return NULL; 1733 return NULL;
1790} 1734}
@@ -1800,7 +1744,6 @@ fail:
1800 */ 1744 */
1801static void start_worker(struct worker *worker) 1745static void start_worker(struct worker *worker)
1802{ 1746{
1803 worker->flags |= WORKER_STARTED;
1804 worker->pool->nr_workers++; 1747 worker->pool->nr_workers++;
1805 worker_enter_idle(worker); 1748 worker_enter_idle(worker);
1806 wake_up_process(worker->task); 1749 wake_up_process(worker->task);
@@ -1818,8 +1761,6 @@ static int create_and_start_worker(struct worker_pool *pool)
1818{ 1761{
1819 struct worker *worker; 1762 struct worker *worker;
1820 1763
1821 mutex_lock(&pool->manager_mutex);
1822
1823 worker = create_worker(pool); 1764 worker = create_worker(pool);
1824 if (worker) { 1765 if (worker) {
1825 spin_lock_irq(&pool->lock); 1766 spin_lock_irq(&pool->lock);
@@ -1827,8 +1768,6 @@ static int create_and_start_worker(struct worker_pool *pool)
1827 spin_unlock_irq(&pool->lock); 1768 spin_unlock_irq(&pool->lock);
1828 } 1769 }
1829 1770
1830 mutex_unlock(&pool->manager_mutex);
1831
1832 return worker ? 0 : -ENOMEM; 1771 return worker ? 0 : -ENOMEM;
1833} 1772}
1834 1773
@@ -1836,46 +1775,30 @@ static int create_and_start_worker(struct worker_pool *pool)
1836 * destroy_worker - destroy a workqueue worker 1775 * destroy_worker - destroy a workqueue worker
1837 * @worker: worker to be destroyed 1776 * @worker: worker to be destroyed
1838 * 1777 *
1839 * Destroy @worker and adjust @pool stats accordingly. 1778 * Destroy @worker and adjust @pool stats accordingly. The worker should
1779 * be idle.
1840 * 1780 *
1841 * CONTEXT: 1781 * CONTEXT:
1842 * spin_lock_irq(pool->lock) which is released and regrabbed. 1782 * spin_lock_irq(pool->lock).
1843 */ 1783 */
1844static void destroy_worker(struct worker *worker) 1784static void destroy_worker(struct worker *worker)
1845{ 1785{
1846 struct worker_pool *pool = worker->pool; 1786 struct worker_pool *pool = worker->pool;
1847 1787
1848 lockdep_assert_held(&pool->manager_mutex);
1849 lockdep_assert_held(&pool->lock); 1788 lockdep_assert_held(&pool->lock);
1850 1789
1851 /* sanity check frenzy */ 1790 /* sanity check frenzy */
1852 if (WARN_ON(worker->current_work) || 1791 if (WARN_ON(worker->current_work) ||
1853 WARN_ON(!list_empty(&worker->scheduled))) 1792 WARN_ON(!list_empty(&worker->scheduled)) ||
1793 WARN_ON(!(worker->flags & WORKER_IDLE)))
1854 return; 1794 return;
1855 1795
1856 if (worker->flags & WORKER_STARTED) 1796 pool->nr_workers--;
1857 pool->nr_workers--; 1797 pool->nr_idle--;
1858 if (worker->flags & WORKER_IDLE)
1859 pool->nr_idle--;
1860
1861 /*
1862 * Once WORKER_DIE is set, the kworker may destroy itself at any
1863 * point. Pin to ensure the task stays until we're done with it.
1864 */
1865 get_task_struct(worker->task);
1866 1798
1867 list_del_init(&worker->entry); 1799 list_del_init(&worker->entry);
1868 worker->flags |= WORKER_DIE; 1800 worker->flags |= WORKER_DIE;
1869 1801 wake_up_process(worker->task);
1870 idr_remove(&pool->worker_idr, worker->id);
1871
1872 spin_unlock_irq(&pool->lock);
1873
1874 kthread_stop(worker->task);
1875 put_task_struct(worker->task);
1876 kfree(worker);
1877
1878 spin_lock_irq(&pool->lock);
1879} 1802}
1880 1803
1881static void idle_worker_timeout(unsigned long __pool) 1804static void idle_worker_timeout(unsigned long __pool)
@@ -1884,7 +1807,7 @@ static void idle_worker_timeout(unsigned long __pool)
1884 1807
1885 spin_lock_irq(&pool->lock); 1808 spin_lock_irq(&pool->lock);
1886 1809
1887 if (too_many_workers(pool)) { 1810 while (too_many_workers(pool)) {
1888 struct worker *worker; 1811 struct worker *worker;
1889 unsigned long expires; 1812 unsigned long expires;
1890 1813
@@ -1892,13 +1815,12 @@ static void idle_worker_timeout(unsigned long __pool)
1892 worker = list_entry(pool->idle_list.prev, struct worker, entry); 1815 worker = list_entry(pool->idle_list.prev, struct worker, entry);
1893 expires = worker->last_active + IDLE_WORKER_TIMEOUT; 1816 expires = worker->last_active + IDLE_WORKER_TIMEOUT;
1894 1817
1895 if (time_before(jiffies, expires)) 1818 if (time_before(jiffies, expires)) {
1896 mod_timer(&pool->idle_timer, expires); 1819 mod_timer(&pool->idle_timer, expires);
1897 else { 1820 break;
1898 /* it's been idle for too long, wake up manager */
1899 pool->flags |= POOL_MANAGE_WORKERS;
1900 wake_up_worker(pool);
1901 } 1821 }
1822
1823 destroy_worker(worker);
1902 } 1824 }
1903 1825
1904 spin_unlock_irq(&pool->lock); 1826 spin_unlock_irq(&pool->lock);
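With this hunk, idle_worker_timeout() reaps surplus idle workers directly instead of setting POOL_MANAGE_WORKERS and waking the manager: it walks from the coldest end of the idle list, destroys workers whose idle time exceeded IDLE_WORKER_TIMEOUT, and re-arms the timer for the first survivor. A condensed, single-threaded model of that loop follows; fake_pool, the array layout, and the spare-worker threshold are illustrative assumptions, not kernel code.

/* Minimal model of the new idle_worker_timeout() loop. */
#include <stdio.h>

#define IDLE_TIMEOUT 300                   /* stand-in for IDLE_WORKER_TIMEOUT */

struct fake_pool {
        int nr_idle;
        int nr_workers;
        long last_active[8];               /* idle workers, coldest last, like idle_list.prev */
};

static int too_many_workers(struct fake_pool *p)
{
        /* the real check also applies MAX_IDLE_WORKERS_RATIO; keep 2 spares here */
        return p->nr_idle > 2;
}

static void idle_timeout(struct fake_pool *p, long now)
{
        while (too_many_workers(p)) {
                long expires = p->last_active[p->nr_idle - 1] + IDLE_TIMEOUT;

                if (now < expires) {
                        printf("re-arm timer for t=%ld\n", expires);
                        break;
                }
                /* destroy_worker(): caller holds pool->lock, worker must be idle */
                p->nr_idle--;
                p->nr_workers--;
                printf("reaped one idle worker, %d idle left\n", p->nr_idle);
        }
}

int main(void)
{
        struct fake_pool p = {
                .nr_idle = 5, .nr_workers = 5,
                .last_active = { 900, 800, 500, 200, 100 },
        };

        idle_timeout(&p, 700);             /* two coldest reaped, timer re-armed for the third */
        return 0;
}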
@@ -1916,6 +1838,12 @@ static void send_mayday(struct work_struct *work)
1916 1838
1917 /* mayday mayday mayday */ 1839 /* mayday mayday mayday */
1918 if (list_empty(&pwq->mayday_node)) { 1840 if (list_empty(&pwq->mayday_node)) {
1841 /*
1842 * If @pwq is for an unbound wq, its base ref may be put at
1843 * any time due to an attribute change. Pin @pwq until the
1844 * rescuer is done with it.
1845 */
1846 get_pwq(pwq);
1919 list_add_tail(&pwq->mayday_node, &wq->maydays); 1847 list_add_tail(&pwq->mayday_node, &wq->maydays);
1920 wake_up_process(wq->rescuer->task); 1848 wake_up_process(wq->rescuer->task);
1921 } 1849 }
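The new comment in send_mayday() describes a pin-before-publish refcount rule: take a reference on the pwq before putting it on wq->maydays, because an unbound pwq's base reference can be dropped at any time, and let the rescuer drop that reference once it is done. A small stand-alone sketch of that pattern with a hypothetical refcounted object (fake_pwq), not the real pool_workqueue:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct fake_pwq {
        atomic_int refcnt;
        int queued;                        /* stands in for mayday_node list membership */
};

static void get_pwq(struct fake_pwq *pwq)
{
        atomic_fetch_add(&pwq->refcnt, 1);
}

static void put_pwq(struct fake_pwq *pwq)
{
        if (atomic_fetch_sub(&pwq->refcnt, 1) == 1) {
                printf("last ref dropped, freeing pwq\n");
                free(pwq);
        }
}

/* producer side: mirrors send_mayday() */
static void send_mayday(struct fake_pwq *pwq)
{
        if (!pwq->queued) {
                get_pwq(pwq);              /* pin: the base ref may go away at any time */
                pwq->queued = 1;
        }
}

/* consumer side: mirrors the rescuer finishing with the pwq */
static void rescuer_done(struct fake_pwq *pwq)
{
        pwq->queued = 0;
        put_pwq(pwq);                      /* drop the ref taken in send_mayday() */
}

int main(void)
{
        struct fake_pwq *pwq = calloc(1, sizeof(*pwq));

        atomic_init(&pwq->refcnt, 1);      /* base reference */
        send_mayday(pwq);
        put_pwq(pwq);                      /* base ref dropped, e.g. attrs change */
        rescuer_done(pwq);                 /* object stays alive until here */
        return 0;
}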
@@ -2011,44 +1939,6 @@ restart:
2011} 1939}
2012 1940
2013/** 1941/**
2014 * maybe_destroy_worker - destroy workers which have been idle for a while
2015 * @pool: pool to destroy workers for
2016 *
2017 * Destroy @pool workers which have been idle for longer than
2018 * IDLE_WORKER_TIMEOUT.
2019 *
2020 * LOCKING:
2021 * spin_lock_irq(pool->lock) which may be released and regrabbed
2022 * multiple times. Called only from manager.
2023 *
2024 * Return:
2025 * %false if no action was taken and pool->lock stayed locked, %true
2026 * otherwise.
2027 */
2028static bool maybe_destroy_workers(struct worker_pool *pool)
2029{
2030 bool ret = false;
2031
2032 while (too_many_workers(pool)) {
2033 struct worker *worker;
2034 unsigned long expires;
2035
2036 worker = list_entry(pool->idle_list.prev, struct worker, entry);
2037 expires = worker->last_active + IDLE_WORKER_TIMEOUT;
2038
2039 if (time_before(jiffies, expires)) {
2040 mod_timer(&pool->idle_timer, expires);
2041 break;
2042 }
2043
2044 destroy_worker(worker);
2045 ret = true;
2046 }
2047
2048 return ret;
2049}
2050
2051/**
2052 * manage_workers - manage worker pool 1942 * manage_workers - manage worker pool
2053 * @worker: self 1943 * @worker: self
2054 * 1944 *
@@ -2077,8 +1967,6 @@ static bool manage_workers(struct worker *worker)
2077 bool ret = false; 1967 bool ret = false;
2078 1968
2079 /* 1969 /*
2080 * Managership is governed by two mutexes - manager_arb and
2081 * manager_mutex. manager_arb handles arbitration of manager role.
2082 * Anyone who successfully grabs manager_arb wins the arbitration 1970 * Anyone who successfully grabs manager_arb wins the arbitration
2083 * and becomes the manager. mutex_trylock() on pool->manager_arb 1971 * and becomes the manager. mutex_trylock() on pool->manager_arb
2084 * failure while holding pool->lock reliably indicates that someone 1972 * failure while holding pool->lock reliably indicates that someone
@@ -2087,40 +1975,12 @@ static bool manage_workers(struct worker *worker)
2087 * grabbing manager_arb is responsible for actually performing 1975 * grabbing manager_arb is responsible for actually performing
2088 * manager duties. If manager_arb is grabbed and released without 1976 * manager duties. If manager_arb is grabbed and released without
2089 * actual management, the pool may stall indefinitely. 1977 * actual management, the pool may stall indefinitely.
2090 *
2091 * manager_mutex is used for exclusion of actual management
2092 * operations. The holder of manager_mutex can be sure that none
2093 * of management operations, including creation and destruction of
2094 * workers, won't take place until the mutex is released. Because
2095 * manager_mutex doesn't interfere with manager role arbitration,
2096 * it is guaranteed that the pool's management, while may be
2097 * delayed, won't be disturbed by someone else grabbing
2098 * manager_mutex.
2099 */ 1978 */
2100 if (!mutex_trylock(&pool->manager_arb)) 1979 if (!mutex_trylock(&pool->manager_arb))
2101 return ret; 1980 return ret;
2102 1981
2103 /*
2104 * With manager arbitration won, manager_mutex would be free in
2105 * most cases. trylock first without dropping @pool->lock.
2106 */
2107 if (unlikely(!mutex_trylock(&pool->manager_mutex))) {
2108 spin_unlock_irq(&pool->lock);
2109 mutex_lock(&pool->manager_mutex);
2110 spin_lock_irq(&pool->lock);
2111 ret = true;
2112 }
2113
2114 pool->flags &= ~POOL_MANAGE_WORKERS;
2115
2116 /*
2117 * Destroy and then create so that may_start_working() is true
2118 * on return.
2119 */
2120 ret |= maybe_destroy_workers(pool);
2121 ret |= maybe_create_worker(pool); 1982 ret |= maybe_create_worker(pool);
2122 1983
2123 mutex_unlock(&pool->manager_mutex);
2124 mutex_unlock(&pool->manager_arb); 1984 mutex_unlock(&pool->manager_arb);
2125 return ret; 1985 return ret;
2126} 1986}
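With manager_mutex gone, the comment above is all that is left of manager arbitration: workers race on mutex_trylock(&pool->manager_arb), the winner creates workers, and a failed trylock while holding pool->lock reliably means someone else holds the manager role. A user-space sketch of that trylock arbitration using pthreads; the names and the four-thread race are made up for illustration. Build it with something like cc sketch.c -lpthread.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t manager_arb = PTHREAD_MUTEX_INITIALIZER;

static int manage_workers(int self)
{
        if (pthread_mutex_trylock(&manager_arb) != 0) {
                /* a failed trylock means another worker already won the
                 * manager role, so just back off */
                return 0;
        }

        printf("worker %d won arbitration, creating workers\n", self);
        /* maybe_create_worker() would run here */

        pthread_mutex_unlock(&manager_arb);
        return 1;
}

static void *worker_thread(void *arg)
{
        manage_workers((int)(long)arg);
        return NULL;
}

int main(void)
{
        pthread_t t[4];

        for (long i = 0; i < 4; i++)
                pthread_create(&t[i], NULL, worker_thread, (void *)i);
        for (int i = 0; i < 4; i++)
                pthread_join(t[i], NULL);
        return 0;
}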
@@ -2308,6 +2168,11 @@ woke_up:
2308 spin_unlock_irq(&pool->lock); 2168 spin_unlock_irq(&pool->lock);
2309 WARN_ON_ONCE(!list_empty(&worker->entry)); 2169 WARN_ON_ONCE(!list_empty(&worker->entry));
2310 worker->task->flags &= ~PF_WQ_WORKER; 2170 worker->task->flags &= ~PF_WQ_WORKER;
2171
2172 set_task_comm(worker->task, "kworker/dying");
2173 ida_simple_remove(&pool->worker_ida, worker->id);
2174 worker_detach_from_pool(worker, pool);
2175 kfree(worker);
2311 return 0; 2176 return 0;
2312 } 2177 }
2313 2178
@@ -2355,9 +2220,6 @@ recheck:
2355 2220
2356 worker_set_flags(worker, WORKER_PREP, false); 2221 worker_set_flags(worker, WORKER_PREP, false);
2357sleep: 2222sleep:
2358 if (unlikely(need_to_manage_workers(pool)) && manage_workers(worker))
2359 goto recheck;
2360
2361 /* 2223 /*
2362 * pool->lock is held and there's no work to process and no need to 2224 * pool->lock is held and there's no work to process and no need to
2363 * manage, sleep. Workers are woken up only while holding 2225 * manage, sleep. Workers are woken up only while holding
@@ -2398,6 +2260,7 @@ static int rescuer_thread(void *__rescuer)
2398 struct worker *rescuer = __rescuer; 2260 struct worker *rescuer = __rescuer;
2399 struct workqueue_struct *wq = rescuer->rescue_wq; 2261 struct workqueue_struct *wq = rescuer->rescue_wq;
2400 struct list_head *scheduled = &rescuer->scheduled; 2262 struct list_head *scheduled = &rescuer->scheduled;
2263 bool should_stop;
2401 2264
2402 set_user_nice(current, RESCUER_NICE_LEVEL); 2265 set_user_nice(current, RESCUER_NICE_LEVEL);
2403 2266
@@ -2409,11 +2272,15 @@ static int rescuer_thread(void *__rescuer)
2409repeat: 2272repeat:
2410 set_current_state(TASK_INTERRUPTIBLE); 2273 set_current_state(TASK_INTERRUPTIBLE);
2411 2274
2412 if (kthread_should_stop()) { 2275 /*
2413 __set_current_state(TASK_RUNNING); 2276 * By the time the rescuer is requested to stop, the workqueue
2414 rescuer->task->flags &= ~PF_WQ_WORKER; 2277 * shouldn't have any work pending, but @wq->maydays may still have
2415 return 0; 2278 * pwq(s) queued. This can happen by non-rescuer workers consuming
2416 } 2279 * all the work items before the rescuer got to them. Go through
2280 * @wq->maydays processing before acting on should_stop so that the
2281 * list is always empty on exit.
2282 */
2283 should_stop = kthread_should_stop();
2417 2284
2418 /* see whether any pwq is asking for help */ 2285 /* see whether any pwq is asking for help */
2419 spin_lock_irq(&wq_mayday_lock); 2286 spin_lock_irq(&wq_mayday_lock);
@@ -2429,8 +2296,9 @@ repeat:
2429 2296
2430 spin_unlock_irq(&wq_mayday_lock); 2297 spin_unlock_irq(&wq_mayday_lock);
2431 2298
2432 /* migrate to the target cpu if possible */ 2299 worker_attach_to_pool(rescuer, pool);
2433 worker_maybe_bind_and_lock(pool); 2300
2301 spin_lock_irq(&pool->lock);
2434 rescuer->pool = pool; 2302 rescuer->pool = pool;
2435 2303
2436 /* 2304 /*
@@ -2443,6 +2311,17 @@ repeat:
2443 move_linked_works(work, scheduled, &n); 2311 move_linked_works(work, scheduled, &n);
2444 2312
2445 process_scheduled_works(rescuer); 2313 process_scheduled_works(rescuer);
2314 spin_unlock_irq(&pool->lock);
2315
2316 worker_detach_from_pool(rescuer, pool);
2317
2318 spin_lock_irq(&pool->lock);
2319
2320 /*
2321 * Put the reference grabbed by send_mayday(). @pool won't
2322 * go away while we're holding its lock.
2323 */
2324 put_pwq(pwq);
2446 2325
2447 /* 2326 /*
2448 * Leave this pool. If keep_working() is %true, notify a 2327 * Leave this pool. If keep_working() is %true, notify a
@@ -2459,6 +2338,12 @@ repeat:
2459 2338
2460 spin_unlock_irq(&wq_mayday_lock); 2339 spin_unlock_irq(&wq_mayday_lock);
2461 2340
2341 if (should_stop) {
2342 __set_current_state(TASK_RUNNING);
2343 rescuer->task->flags &= ~PF_WQ_WORKER;
2344 return 0;
2345 }
2346
2462 /* rescuers should never participate in concurrency management */ 2347 /* rescuers should never participate in concurrency management */
2463 WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING)); 2348 WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING));
2464 schedule(); 2349 schedule();
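The rescuer comment above spells out a shutdown ordering: sample kthread_should_stop() first, drain whatever is on wq->maydays, and only act on the stop request afterwards, so the mayday list is provably empty when the rescuer exits. A single-threaded model of that ordering with invented names; the queue, the ids, and the stop flag are placeholders, not the kernel loop.

#include <stdio.h>

#define QLEN 4

static int maydays[QLEN] = { 11, 12, 13, 0 };  /* pending pwq ids, 0 = empty slot */
static int stop_requested = 1;                 /* pretend the stop was already requested */

static int rescuer(void)
{
        for (;;) {
                /* snapshot the stop request *before* looking at the queue */
                int should_stop = stop_requested;

                for (int i = 0; i < QLEN; i++) {
                        if (maydays[i]) {
                                printf("rescuing pwq %d\n", maydays[i]);
                                maydays[i] = 0;        /* process_scheduled_works() */
                        }
                }

                if (should_stop)
                        return 0;                      /* queue is known empty here */
        }
}

int main(void)
{
        return rescuer();
}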
@@ -3527,9 +3412,10 @@ static int init_worker_pool(struct worker_pool *pool)
3527 (unsigned long)pool); 3412 (unsigned long)pool);
3528 3413
3529 mutex_init(&pool->manager_arb); 3414 mutex_init(&pool->manager_arb);
3530 mutex_init(&pool->manager_mutex); 3415 mutex_init(&pool->attach_mutex);
3531 idr_init(&pool->worker_idr); 3416 INIT_LIST_HEAD(&pool->workers);
3532 3417
3418 ida_init(&pool->worker_ida);
3533 INIT_HLIST_NODE(&pool->hash_node); 3419 INIT_HLIST_NODE(&pool->hash_node);
3534 pool->refcnt = 1; 3420 pool->refcnt = 1;
3535 3421
@@ -3544,7 +3430,7 @@ static void rcu_free_pool(struct rcu_head *rcu)
3544{ 3430{
3545 struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu); 3431 struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu);
3546 3432
3547 idr_destroy(&pool->worker_idr); 3433 ida_destroy(&pool->worker_ida);
3548 free_workqueue_attrs(pool->attrs); 3434 free_workqueue_attrs(pool->attrs);
3549 kfree(pool); 3435 kfree(pool);
3550} 3436}
@@ -3562,6 +3448,7 @@ static void rcu_free_pool(struct rcu_head *rcu)
3562 */ 3448 */
3563static void put_unbound_pool(struct worker_pool *pool) 3449static void put_unbound_pool(struct worker_pool *pool)
3564{ 3450{
3451 DECLARE_COMPLETION_ONSTACK(detach_completion);
3565 struct worker *worker; 3452 struct worker *worker;
3566 3453
3567 lockdep_assert_held(&wq_pool_mutex); 3454 lockdep_assert_held(&wq_pool_mutex);
@@ -3582,18 +3469,24 @@ static void put_unbound_pool(struct worker_pool *pool)
3582 /* 3469 /*
3583 * Become the manager and destroy all workers. Grabbing 3470 * Become the manager and destroy all workers. Grabbing
3584 * manager_arb prevents @pool's workers from blocking on 3471 * manager_arb prevents @pool's workers from blocking on
3585 * manager_mutex. 3472 * attach_mutex.
3586 */ 3473 */
3587 mutex_lock(&pool->manager_arb); 3474 mutex_lock(&pool->manager_arb);
3588 mutex_lock(&pool->manager_mutex);
3589 spin_lock_irq(&pool->lock);
3590 3475
3591 while ((worker = first_worker(pool))) 3476 spin_lock_irq(&pool->lock);
3477 while ((worker = first_idle_worker(pool)))
3592 destroy_worker(worker); 3478 destroy_worker(worker);
3593 WARN_ON(pool->nr_workers || pool->nr_idle); 3479 WARN_ON(pool->nr_workers || pool->nr_idle);
3594
3595 spin_unlock_irq(&pool->lock); 3480 spin_unlock_irq(&pool->lock);
3596 mutex_unlock(&pool->manager_mutex); 3481
3482 mutex_lock(&pool->attach_mutex);
3483 if (!list_empty(&pool->workers))
3484 pool->detach_completion = &detach_completion;
3485 mutex_unlock(&pool->attach_mutex);
3486
3487 if (pool->detach_completion)
3488 wait_for_completion(pool->detach_completion);
3489
3597 mutex_unlock(&pool->manager_arb); 3490 mutex_unlock(&pool->manager_arb);
3598 3491
3599 /* shut down the timers */ 3492 /* shut down the timers */
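put_unbound_pool() now destroys only the idle workers under pool->lock and then, if anything is still attached, parks a detach_completion that the last detaching worker completes. A condensed pthread model of that handshake, assuming a counter plus condition variable in place of pool->workers and struct completion; all names are illustrative. Build with something like cc sketch.c -lpthread.

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

struct fake_pool {
        pthread_mutex_t attach_mutex;
        pthread_cond_t detach_done;        /* plays the role of detach_completion */
        int nr_attached;
};

static struct fake_pool pool = {
        .attach_mutex = PTHREAD_MUTEX_INITIALIZER,
        .detach_done  = PTHREAD_COND_INITIALIZER,
        .nr_attached  = 2,
};

/* worker_detach_from_pool(): the last worker out signals the waiter */
static void *worker_exit(void *arg)
{
        (void)arg;
        usleep(1000);                      /* pretend to finish the WORKER_DIE path */
        pthread_mutex_lock(&pool.attach_mutex);
        if (--pool.nr_attached == 0)
                pthread_cond_signal(&pool.detach_done);
        pthread_mutex_unlock(&pool.attach_mutex);
        return NULL;
}

/* put_unbound_pool(): wait for the list of attached workers to empty */
static void put_unbound_pool(void)
{
        pthread_mutex_lock(&pool.attach_mutex);
        while (pool.nr_attached)
                pthread_cond_wait(&pool.detach_done, &pool.attach_mutex);
        pthread_mutex_unlock(&pool.attach_mutex);
        printf("all workers detached, pool can be freed\n");
}

int main(void)
{
        pthread_t w[2];

        for (int i = 0; i < 2; i++)
                pthread_create(&w[i], NULL, worker_exit, NULL);
        put_unbound_pool();
        for (int i = 0; i < 2; i++)
                pthread_join(w[i], NULL);
        return 0;
}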
@@ -3639,9 +3532,6 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
3639 if (!pool || init_worker_pool(pool) < 0) 3532 if (!pool || init_worker_pool(pool) < 0)
3640 goto fail; 3533 goto fail;
3641 3534
3642 if (workqueue_freezing)
3643 pool->flags |= POOL_FREEZING;
3644
3645 lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */ 3535 lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */
3646 copy_workqueue_attrs(pool->attrs, attrs); 3536 copy_workqueue_attrs(pool->attrs, attrs);
3647 3537
@@ -3748,7 +3638,12 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq)
3748 3638
3749 spin_lock_irq(&pwq->pool->lock); 3639 spin_lock_irq(&pwq->pool->lock);
3750 3640
3751 if (!freezable || !(pwq->pool->flags & POOL_FREEZING)) { 3641 /*
3642 * During [un]freezing, the caller is responsible for ensuring that
3643 * this function is called at least once after @workqueue_freezing
3644 * is updated and visible.
3645 */
3646 if (!freezable || !workqueue_freezing) {
3752 pwq->max_active = wq->saved_max_active; 3647 pwq->max_active = wq->saved_max_active;
3753 3648
3754 while (!list_empty(&pwq->delayed_works) && 3649 while (!list_empty(&pwq->delayed_works) &&
@@ -4080,17 +3975,13 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
4080 * Let's determine what needs to be done. If the target cpumask is 3975 * Let's determine what needs to be done. If the target cpumask is
4081 * different from wq's, we need to compare it to @pwq's and create 3976 * different from wq's, we need to compare it to @pwq's and create
4082 * a new one if they don't match. If the target cpumask equals 3977 * a new one if they don't match. If the target cpumask equals
4083 * wq's, the default pwq should be used. If @pwq is already the 3978 * wq's, the default pwq should be used.
4084 * default one, nothing to do; otherwise, install the default one.
4085 */ 3979 */
4086 if (wq_calc_node_cpumask(wq->unbound_attrs, node, cpu_off, cpumask)) { 3980 if (wq_calc_node_cpumask(wq->unbound_attrs, node, cpu_off, cpumask)) {
4087 if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask)) 3981 if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask))
4088 goto out_unlock; 3982 goto out_unlock;
4089 } else { 3983 } else {
4090 if (pwq == wq->dfl_pwq) 3984 goto use_dfl_pwq;
4091 goto out_unlock;
4092 else
4093 goto use_dfl_pwq;
4094 } 3985 }
4095 3986
4096 mutex_unlock(&wq->mutex); 3987 mutex_unlock(&wq->mutex);
@@ -4098,9 +3989,10 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
4098 /* create a new pwq */ 3989 /* create a new pwq */
4099 pwq = alloc_unbound_pwq(wq, target_attrs); 3990 pwq = alloc_unbound_pwq(wq, target_attrs);
4100 if (!pwq) { 3991 if (!pwq) {
4101 pr_warning("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n", 3992 pr_warn("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n",
4102 wq->name); 3993 wq->name);
4103 goto out_unlock; 3994 mutex_lock(&wq->mutex);
3995 goto use_dfl_pwq;
4104 } 3996 }
4105 3997
4106 /* 3998 /*
@@ -4575,28 +4467,27 @@ static void wq_unbind_fn(struct work_struct *work)
4575 int cpu = smp_processor_id(); 4467 int cpu = smp_processor_id();
4576 struct worker_pool *pool; 4468 struct worker_pool *pool;
4577 struct worker *worker; 4469 struct worker *worker;
4578 int wi;
4579 4470
4580 for_each_cpu_worker_pool(pool, cpu) { 4471 for_each_cpu_worker_pool(pool, cpu) {
4581 WARN_ON_ONCE(cpu != smp_processor_id()); 4472 WARN_ON_ONCE(cpu != smp_processor_id());
4582 4473
4583 mutex_lock(&pool->manager_mutex); 4474 mutex_lock(&pool->attach_mutex);
4584 spin_lock_irq(&pool->lock); 4475 spin_lock_irq(&pool->lock);
4585 4476
4586 /* 4477 /*
4587 * We've blocked all manager operations. Make all workers 4478 * We've blocked all attach/detach operations. Make all workers
4588 * unbound and set DISASSOCIATED. Before this, all workers 4479 * unbound and set DISASSOCIATED. Before this, all workers
4589 * except for the ones which are still executing works from 4480 * except for the ones which are still executing works from
4590 * before the last CPU down must be on the cpu. After 4481 * before the last CPU down must be on the cpu. After
4591 * this, they may become diasporas. 4482 * this, they may become diasporas.
4592 */ 4483 */
4593 for_each_pool_worker(worker, wi, pool) 4484 for_each_pool_worker(worker, pool)
4594 worker->flags |= WORKER_UNBOUND; 4485 worker->flags |= WORKER_UNBOUND;
4595 4486
4596 pool->flags |= POOL_DISASSOCIATED; 4487 pool->flags |= POOL_DISASSOCIATED;
4597 4488
4598 spin_unlock_irq(&pool->lock); 4489 spin_unlock_irq(&pool->lock);
4599 mutex_unlock(&pool->manager_mutex); 4490 mutex_unlock(&pool->attach_mutex);
4600 4491
4601 /* 4492 /*
4602 * Call schedule() so that we cross rq->lock and thus can 4493 * Call schedule() so that we cross rq->lock and thus can
@@ -4636,9 +4527,8 @@ static void wq_unbind_fn(struct work_struct *work)
4636static void rebind_workers(struct worker_pool *pool) 4527static void rebind_workers(struct worker_pool *pool)
4637{ 4528{
4638 struct worker *worker; 4529 struct worker *worker;
4639 int wi;
4640 4530
4641 lockdep_assert_held(&pool->manager_mutex); 4531 lockdep_assert_held(&pool->attach_mutex);
4642 4532
4643 /* 4533 /*
4644 * Restore CPU affinity of all workers. As all idle workers should 4534 * Restore CPU affinity of all workers. As all idle workers should
@@ -4647,13 +4537,13 @@ static void rebind_workers(struct worker_pool *pool)
4647 * of all workers first and then clear UNBOUND. As we're called 4537 * of all workers first and then clear UNBOUND. As we're called
4648 * from CPU_ONLINE, the following shouldn't fail. 4538 * from CPU_ONLINE, the following shouldn't fail.
4649 */ 4539 */
4650 for_each_pool_worker(worker, wi, pool) 4540 for_each_pool_worker(worker, pool)
4651 WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, 4541 WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
4652 pool->attrs->cpumask) < 0); 4542 pool->attrs->cpumask) < 0);
4653 4543
4654 spin_lock_irq(&pool->lock); 4544 spin_lock_irq(&pool->lock);
4655 4545
4656 for_each_pool_worker(worker, wi, pool) { 4546 for_each_pool_worker(worker, pool) {
4657 unsigned int worker_flags = worker->flags; 4547 unsigned int worker_flags = worker->flags;
4658 4548
4659 /* 4549 /*
@@ -4705,9 +4595,8 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)
4705{ 4595{
4706 static cpumask_t cpumask; 4596 static cpumask_t cpumask;
4707 struct worker *worker; 4597 struct worker *worker;
4708 int wi;
4709 4598
4710 lockdep_assert_held(&pool->manager_mutex); 4599 lockdep_assert_held(&pool->attach_mutex);
4711 4600
4712 /* is @cpu allowed for @pool? */ 4601 /* is @cpu allowed for @pool? */
4713 if (!cpumask_test_cpu(cpu, pool->attrs->cpumask)) 4602 if (!cpumask_test_cpu(cpu, pool->attrs->cpumask))
@@ -4719,7 +4608,7 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)
4719 return; 4608 return;
4720 4609
4721 /* as we're called from CPU_ONLINE, the following shouldn't fail */ 4610 /* as we're called from CPU_ONLINE, the following shouldn't fail */
4722 for_each_pool_worker(worker, wi, pool) 4611 for_each_pool_worker(worker, pool)
4723 WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, 4612 WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
4724 pool->attrs->cpumask) < 0); 4613 pool->attrs->cpumask) < 0);
4725} 4614}
@@ -4752,7 +4641,7 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb,
4752 mutex_lock(&wq_pool_mutex); 4641 mutex_lock(&wq_pool_mutex);
4753 4642
4754 for_each_pool(pool, pi) { 4643 for_each_pool(pool, pi) {
4755 mutex_lock(&pool->manager_mutex); 4644 mutex_lock(&pool->attach_mutex);
4756 4645
4757 if (pool->cpu == cpu) { 4646 if (pool->cpu == cpu) {
4758 spin_lock_irq(&pool->lock); 4647 spin_lock_irq(&pool->lock);
@@ -4764,7 +4653,7 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb,
4764 restore_unbound_workers_cpumask(pool, cpu); 4653 restore_unbound_workers_cpumask(pool, cpu);
4765 } 4654 }
4766 4655
4767 mutex_unlock(&pool->manager_mutex); 4656 mutex_unlock(&pool->attach_mutex);
4768 } 4657 }
4769 4658
4770 /* update NUMA affinity of unbound workqueues */ 4659 /* update NUMA affinity of unbound workqueues */
@@ -4863,24 +4752,14 @@ EXPORT_SYMBOL_GPL(work_on_cpu);
4863 */ 4752 */
4864void freeze_workqueues_begin(void) 4753void freeze_workqueues_begin(void)
4865{ 4754{
4866 struct worker_pool *pool;
4867 struct workqueue_struct *wq; 4755 struct workqueue_struct *wq;
4868 struct pool_workqueue *pwq; 4756 struct pool_workqueue *pwq;
4869 int pi;
4870 4757
4871 mutex_lock(&wq_pool_mutex); 4758 mutex_lock(&wq_pool_mutex);
4872 4759
4873 WARN_ON_ONCE(workqueue_freezing); 4760 WARN_ON_ONCE(workqueue_freezing);
4874 workqueue_freezing = true; 4761 workqueue_freezing = true;
4875 4762
4876 /* set FREEZING */
4877 for_each_pool(pool, pi) {
4878 spin_lock_irq(&pool->lock);
4879 WARN_ON_ONCE(pool->flags & POOL_FREEZING);
4880 pool->flags |= POOL_FREEZING;
4881 spin_unlock_irq(&pool->lock);
4882 }
4883
4884 list_for_each_entry(wq, &workqueues, list) { 4763 list_for_each_entry(wq, &workqueues, list) {
4885 mutex_lock(&wq->mutex); 4764 mutex_lock(&wq->mutex);
4886 for_each_pwq(pwq, wq) 4765 for_each_pwq(pwq, wq)
@@ -4950,21 +4829,13 @@ void thaw_workqueues(void)
4950{ 4829{
4951 struct workqueue_struct *wq; 4830 struct workqueue_struct *wq;
4952 struct pool_workqueue *pwq; 4831 struct pool_workqueue *pwq;
4953 struct worker_pool *pool;
4954 int pi;
4955 4832
4956 mutex_lock(&wq_pool_mutex); 4833 mutex_lock(&wq_pool_mutex);
4957 4834
4958 if (!workqueue_freezing) 4835 if (!workqueue_freezing)
4959 goto out_unlock; 4836 goto out_unlock;
4960 4837
4961 /* clear FREEZING */ 4838 workqueue_freezing = false;
4962 for_each_pool(pool, pi) {
4963 spin_lock_irq(&pool->lock);
4964 WARN_ON_ONCE(!(pool->flags & POOL_FREEZING));
4965 pool->flags &= ~POOL_FREEZING;
4966 spin_unlock_irq(&pool->lock);
4967 }
4968 4839
4969 /* restore max_active and repopulate worklist */ 4840 /* restore max_active and repopulate worklist */
4970 list_for_each_entry(wq, &workqueues, list) { 4841 list_for_each_entry(wq, &workqueues, list) {
@@ -4974,7 +4845,6 @@ void thaw_workqueues(void)
4974 mutex_unlock(&wq->mutex); 4845 mutex_unlock(&wq->mutex);
4975 } 4846 }
4976 4847
4977 workqueue_freezing = false;
4978out_unlock: 4848out_unlock:
4979 mutex_unlock(&wq_pool_mutex); 4849 mutex_unlock(&wq_pool_mutex);
4980} 4850}
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
index 7e2204db0b1a..45215870ac6c 100644
--- a/kernel/workqueue_internal.h
+++ b/kernel/workqueue_internal.h
@@ -37,6 +37,8 @@ struct worker {
37 struct task_struct *task; /* I: worker task */ 37 struct task_struct *task; /* I: worker task */
38 struct worker_pool *pool; /* I: the associated pool */ 38 struct worker_pool *pool; /* I: the associated pool */
39 /* L: for rescuers */ 39 /* L: for rescuers */
40 struct list_head node; /* A: anchored at pool->workers */
41 /* A: runs through worker->node */
40 42
41 unsigned long last_active; /* L: last active timestamp */ 43 unsigned long last_active; /* L: last active timestamp */
42 unsigned int flags; /* X: flags */ 44 unsigned int flags; /* X: flags */