Diffstat (limited to 'kernel')
-rw-r--r--  kernel/acct.c | 6
-rw-r--r--  kernel/audit.c | 70
-rw-r--r--  kernel/auditsc.c | 27
-rw-r--r--  kernel/backtracetest.c | 18
-rw-r--r--  kernel/capability.c | 26
-rw-r--r--  kernel/cgroup.c | 1831
-rw-r--r--  kernel/cgroup_freezer.c | 138
-rw-r--r--  kernel/compat.c | 8
-rw-r--r--  kernel/context_tracking.c | 2
-rw-r--r--  kernel/cpu.c | 42
-rw-r--r--  kernel/cpuset.c | 60
-rw-r--r--  kernel/debug/kdb/kdb_bt.c | 2
-rw-r--r--  kernel/debug/kdb/kdb_io.c | 2
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 2
-rw-r--r--  kernel/events/core.c | 290
-rw-r--r--  kernel/events/uprobes.c | 56
-rw-r--r--  kernel/exec_domain.c | 14
-rw-r--r--  kernel/exit.c | 61
-rw-r--r--  kernel/fork.c | 20
-rw-r--r--  kernel/futex.c | 239
-rw-r--r--  kernel/gcov/base.c | 6
-rw-r--r--  kernel/gcov/gcc_4_7.c | 5
-rw-r--r--  kernel/hrtimer.c | 31
-rw-r--r--  kernel/hung_task.c | 4
-rw-r--r--  kernel/irq/Kconfig | 9
-rw-r--r--  kernel/irq/chip.c | 5
-rw-r--r--  kernel/irq/internals.h | 8
-rw-r--r--  kernel/irq/irqdesc.c | 102
-rw-r--r--  kernel/irq/irqdomain.c | 6
-rw-r--r--  kernel/irq/manage.c | 21
-rw-r--r--  kernel/irq/spurious.c | 106
-rw-r--r--  kernel/kexec.c | 77
-rw-r--r--  kernel/kmod.c | 5
-rw-r--r--  kernel/ksysfs.c | 5
-rw-r--r--  kernel/kthread.c | 4
-rw-r--r--  kernel/latencytop.c | 5
-rw-r--r--  kernel/locking/lockdep.c | 2
-rw-r--r--  kernel/locking/locktorture.c | 12
-rw-r--r--  kernel/locking/rtmutex.c | 32
-rw-r--r--  kernel/module.c | 50
-rw-r--r--  kernel/panic.c | 23
-rw-r--r--  kernel/params.c | 25
-rw-r--r--  kernel/power/Kconfig | 3
-rw-r--r--  kernel/power/hibernate.c | 30
-rw-r--r--  kernel/power/main.c | 33
-rw-r--r--  kernel/power/power.h | 9
-rw-r--r--  kernel/power/process.c | 3
-rw-r--r--  kernel/power/snapshot.c | 2
-rw-r--r--  kernel/power/suspend.c | 128
-rw-r--r--  kernel/power/suspend_test.c | 24
-rw-r--r--  kernel/power/swap.c | 2
-rw-r--r--  kernel/printk/printk.c | 348
-rw-r--r--  kernel/profile.c | 20
-rw-r--r--  kernel/rcu/rcutorture.c | 217
-rw-r--r--  kernel/rcu/tiny_plugin.h | 8
-rw-r--r--  kernel/rcu/tree.c | 309
-rw-r--r--  kernel/rcu/tree.h | 11
-rw-r--r--  kernel/rcu/tree_plugin.h | 136
-rw-r--r--  kernel/rcu/update.c | 30
-rw-r--r--  kernel/reboot.c | 21
-rw-r--r--  kernel/res_counter.c | 7
-rw-r--r--  kernel/resource.c | 7
-rw-r--r--  kernel/sched/core.c | 416
-rw-r--r--  kernel/sched/cpuacct.c | 2
-rw-r--r--  kernel/sched/cpudeadline.c | 37
-rw-r--r--  kernel/sched/cpudeadline.h | 6
-rw-r--r--  kernel/sched/cpupri.c | 10
-rw-r--r--  kernel/sched/cpupri.h | 2
-rw-r--r--  kernel/sched/cputime.c | 32
-rw-r--r--  kernel/sched/deadline.c | 39
-rw-r--r--  kernel/sched/fair.c | 265
-rw-r--r--  kernel/sched/idle.c | 140
-rw-r--r--  kernel/sched/rt.c | 134
-rw-r--r--  kernel/sched/sched.h | 37
-rw-r--r--  kernel/sched/stop_task.c | 4
-rw-r--r--  kernel/seccomp.c | 116
-rw-r--r--  kernel/signal.c | 95
-rw-r--r--  kernel/smp.c | 18
-rw-r--r--  kernel/softirq.c | 13
-rw-r--r--  kernel/stop_machine.c | 1
-rw-r--r--  kernel/sys.c | 6
-rw-r--r--  kernel/sys_ni.c | 2
-rw-r--r--  kernel/sysctl.c | 107
-rw-r--r--  kernel/time/ntp.c | 32
-rw-r--r--  kernel/time/sched_clock.c | 13
-rw-r--r--  kernel/time/tick-common.c | 2
-rw-r--r--  kernel/time/tick-sched.c | 5
-rw-r--r--  kernel/time/timekeeping.c | 7
-rw-r--r--  kernel/timer.c | 2
-rw-r--r--  kernel/torture.c | 40
-rw-r--r--  kernel/trace/Kconfig | 30
-rw-r--r--  kernel/trace/Makefile | 3
-rw-r--r--  kernel/trace/ftrace.c | 294
-rw-r--r--  kernel/trace/trace.c | 441
-rw-r--r--  kernel/trace/trace.h | 46
-rw-r--r--  kernel/trace/trace_benchmark.c | 198
-rw-r--r--  kernel/trace/trace_benchmark.h | 41
-rw-r--r--  kernel/trace/trace_events.c | 13
-rw-r--r--  kernel/trace/trace_events_trigger.c | 2
-rw-r--r--  kernel/trace/trace_functions.c | 72
-rw-r--r--  kernel/trace/trace_functions_graph.c | 19
-rw-r--r--  kernel/trace/trace_irqsoff.c | 71
-rw-r--r--  kernel/trace/trace_kprobe.c | 3
-rw-r--r--  kernel/trace/trace_nop.c | 1
-rw-r--r--  kernel/trace/trace_output.c | 41
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 70
-rw-r--r--  kernel/trace/trace_selftest.c | 69
-rw-r--r--  kernel/trace/trace_stack.c | 42
-rw-r--r--  kernel/trace/trace_uprobe.c | 6
-rw-r--r--  kernel/tracepoint.c | 6
-rw-r--r--  kernel/user.c | 1
-rw-r--r--  kernel/user_namespace.c | 33
-rw-r--r--  kernel/utsname_sysctl.c | 10
-rw-r--r--  kernel/watchdog.c | 6
-rw-r--r--  kernel/workqueue.c | 490
-rw-r--r--  kernel/workqueue_internal.h | 2
116 files changed, 5238 insertions, 3160 deletions
diff --git a/kernel/acct.c b/kernel/acct.c
index 8d6e145138bb..808a86ff229d 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -55,7 +55,7 @@
55#include <linux/times.h> 55#include <linux/times.h>
56#include <linux/syscalls.h> 56#include <linux/syscalls.h>
57#include <linux/mount.h> 57#include <linux/mount.h>
58#include <asm/uaccess.h> 58#include <linux/uaccess.h>
59#include <asm/div64.h> 59#include <asm/div64.h>
60#include <linux/blkdev.h> /* sector_div */ 60#include <linux/blkdev.h> /* sector_div */
61#include <linux/pid_namespace.h> 61#include <linux/pid_namespace.h>
@@ -134,7 +134,7 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
134 spin_lock(&acct_lock); 134 spin_lock(&acct_lock);
135 if (file != acct->file) { 135 if (file != acct->file) {
136 if (act) 136 if (act)
137 res = act>0; 137 res = act > 0;
138 goto out; 138 goto out;
139 } 139 }
140 140
@@ -262,7 +262,7 @@ SYSCALL_DEFINE1(acct, const char __user *, name)
262 if (name) { 262 if (name) {
263 struct filename *tmp = getname(name); 263 struct filename *tmp = getname(name);
264 if (IS_ERR(tmp)) 264 if (IS_ERR(tmp))
265 return (PTR_ERR(tmp)); 265 return PTR_ERR(tmp);
266 error = acct_on(tmp); 266 error = acct_on(tmp);
267 putname(tmp); 267 putname(tmp);
268 } else { 268 } else {
diff --git a/kernel/audit.c b/kernel/audit.c
index 7c2893602d06..3ef2e0e797e8 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -44,7 +44,7 @@
44#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 44#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
45 45
46#include <linux/init.h> 46#include <linux/init.h>
47#include <asm/types.h> 47#include <linux/types.h>
48#include <linux/atomic.h> 48#include <linux/atomic.h>
49#include <linux/mm.h> 49#include <linux/mm.h>
50#include <linux/export.h> 50#include <linux/export.h>
@@ -424,6 +424,38 @@ static void kauditd_send_skb(struct sk_buff *skb)
424} 424}
425 425
426/* 426/*
427 * kauditd_send_multicast_skb - send the skb to multicast userspace listeners
428 *
429 * This function doesn't consume an skb as might be expected since it has to
430 * copy it anyways.
431 */
432static void kauditd_send_multicast_skb(struct sk_buff *skb)
433{
434 struct sk_buff *copy;
435 struct audit_net *aunet = net_generic(&init_net, audit_net_id);
436 struct sock *sock = aunet->nlsk;
437
438 if (!netlink_has_listeners(sock, AUDIT_NLGRP_READLOG))
439 return;
440
441 /*
442 * The seemingly wasteful skb_copy() rather than bumping the refcount
443 * using skb_get() is necessary because non-standard mods are made to
444 * the skb by the original kaudit unicast socket send routine. The
445 * existing auditd daemon assumes this breakage. Fixing this would
446 * require co-ordinating a change in the established protocol between
447 * the kaudit kernel subsystem and the auditd userspace code. There is
448 * no reason for new multicast clients to continue with this
449 * non-compliance.
450 */
451 copy = skb_copy(skb, GFP_KERNEL);
452 if (!copy)
453 return;
454
455 nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, GFP_KERNEL);
456}
457
458/*
427 * flush_hold_queue - empty the hold queue if auditd appears 459 * flush_hold_queue - empty the hold queue if auditd appears
428 * 460 *
429 * If auditd just started, drain the queue of messages already 461 * If auditd just started, drain the queue of messages already
@@ -643,13 +675,13 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
643 if ((task_active_pid_ns(current) != &init_pid_ns)) 675 if ((task_active_pid_ns(current) != &init_pid_ns))
644 return -EPERM; 676 return -EPERM;
645 677
646 if (!capable(CAP_AUDIT_CONTROL)) 678 if (!netlink_capable(skb, CAP_AUDIT_CONTROL))
647 err = -EPERM; 679 err = -EPERM;
648 break; 680 break;
649 case AUDIT_USER: 681 case AUDIT_USER:
650 case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG: 682 case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG:
651 case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2: 683 case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2:
652 if (!capable(CAP_AUDIT_WRITE)) 684 if (!netlink_capable(skb, CAP_AUDIT_WRITE))
653 err = -EPERM; 685 err = -EPERM;
654 break; 686 break;
655 default: /* bad msg */ 687 default: /* bad msg */
@@ -1076,10 +1108,22 @@ static void audit_receive(struct sk_buff *skb)
1076 mutex_unlock(&audit_cmd_mutex); 1108 mutex_unlock(&audit_cmd_mutex);
1077} 1109}
1078 1110
1111/* Run custom bind function on netlink socket group connect or bind requests. */
1112static int audit_bind(int group)
1113{
1114 if (!capable(CAP_AUDIT_READ))
1115 return -EPERM;
1116
1117 return 0;
1118}
1119
1079static int __net_init audit_net_init(struct net *net) 1120static int __net_init audit_net_init(struct net *net)
1080{ 1121{
1081 struct netlink_kernel_cfg cfg = { 1122 struct netlink_kernel_cfg cfg = {
1082 .input = audit_receive, 1123 .input = audit_receive,
1124 .bind = audit_bind,
1125 .flags = NL_CFG_F_NONROOT_RECV,
1126 .groups = AUDIT_NLGRP_MAX,
1083 }; 1127 };
1084 1128
1085 struct audit_net *aunet = net_generic(net, audit_net_id); 1129 struct audit_net *aunet = net_generic(net, audit_net_id);
@@ -1901,10 +1945,10 @@ out:
1901 * audit_log_end - end one audit record 1945 * audit_log_end - end one audit record
1902 * @ab: the audit_buffer 1946 * @ab: the audit_buffer
1903 * 1947 *
1904 * The netlink_* functions cannot be called inside an irq context, so 1948 * netlink_unicast() cannot be called inside an irq context because it blocks
1905 * the audit buffer is placed on a queue and a tasklet is scheduled to 1949 * (last arg, flags, is not set to MSG_DONTWAIT), so the audit buffer is placed
1906 * remove them from the queue outside the irq context. May be called in 1950 * on a queue and a tasklet is scheduled to remove them from the queue outside
1907 * any context. 1951 * the irq context. May be called in any context.
1908 */ 1952 */
1909void audit_log_end(struct audit_buffer *ab) 1953void audit_log_end(struct audit_buffer *ab)
1910{ 1954{
@@ -1914,6 +1958,18 @@ void audit_log_end(struct audit_buffer *ab)
1914 audit_log_lost("rate limit exceeded"); 1958 audit_log_lost("rate limit exceeded");
1915 } else { 1959 } else {
1916 struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); 1960 struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
1961
1962 kauditd_send_multicast_skb(ab->skb);
1963
1964 /*
1965 * The original kaudit unicast socket sends up messages with
1966 * nlmsg_len set to the payload length rather than the entire
1967 * message length. This breaks the standard set by netlink.
1968 * The existing auditd daemon assumes this breakage. Fixing
1969 * this would require co-ordinating a change in the established
1970 * protocol between the kaudit kernel subsystem and the auditd
1971 * userspace code.
1972 */
1917 nlh->nlmsg_len = ab->skb->len - NLMSG_HDRLEN; 1973 nlh->nlmsg_len = ab->skb->len - NLMSG_HDRLEN;
1918 1974
1919 if (audit_pid) { 1975 if (audit_pid) {
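
The audit.c hunks above add a read-only multicast path: kauditd_send_multicast_skb() copies each record to the AUDIT_NLGRP_READLOG group, and audit_bind() gates subscription on CAP_AUDIT_READ. A minimal userspace listener, sketched here on the assumption that the running kernel's <linux/audit.h> exports AUDIT_NLGRP_READLOG and that the caller holds CAP_AUDIT_READ, could look like this:

#include <stdio.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/audit.h>

#ifndef SOL_NETLINK
#define SOL_NETLINK 270		/* not exported by every libc */
#endif

int main(void)
{
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_AUDIT);
	unsigned int grp = AUDIT_NLGRP_READLOG;	/* assumed present in linux/audit.h */
	char buf[8192];

	if (fd < 0 || setsockopt(fd, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP,
				 &grp, sizeof(grp)) < 0) {
		perror("audit multicast subscribe");	/* needs CAP_AUDIT_READ */
		return 1;
	}

	for (;;) {
		ssize_t len = recv(fd, buf, sizeof(buf), 0);
		struct nlmsghdr *nlh = (struct nlmsghdr *)buf;

		if (len <= 0)
			break;
		/* a datagram may carry one or more netlink messages */
		for (; NLMSG_OK(nlh, len); nlh = NLMSG_NEXT(nlh, len))
			printf("type=%u len=%u\n",
			       (unsigned)nlh->nlmsg_type, nlh->nlmsg_len);
	}
	return 0;
}

Such a reader sits alongside auditd: the multicast copy is independent of the unicast socket auditd already consumes, which is why the hunk above uses skb_copy() rather than skb_get().
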
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index f251a5e8d17a..21eae3c05ec0 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -728,6 +728,22 @@ static enum audit_state audit_filter_task(struct task_struct *tsk, char **key)
728 return AUDIT_BUILD_CONTEXT; 728 return AUDIT_BUILD_CONTEXT;
729} 729}
730 730
731static int audit_in_mask(const struct audit_krule *rule, unsigned long val)
732{
733 int word, bit;
734
735 if (val > 0xffffffff)
736 return false;
737
738 word = AUDIT_WORD(val);
739 if (word >= AUDIT_BITMASK_SIZE)
740 return false;
741
742 bit = AUDIT_BIT(val);
743
744 return rule->mask[word] & bit;
745}
746
731/* At syscall entry and exit time, this filter is called if the 747/* At syscall entry and exit time, this filter is called if the
732 * audit_state is not low enough that auditing cannot take place, but is 748 * audit_state is not low enough that auditing cannot take place, but is
733 * also not high enough that we already know we have to write an audit 749 * also not high enough that we already know we have to write an audit
@@ -745,11 +761,8 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
745 761
746 rcu_read_lock(); 762 rcu_read_lock();
747 if (!list_empty(list)) { 763 if (!list_empty(list)) {
748 int word = AUDIT_WORD(ctx->major);
749 int bit = AUDIT_BIT(ctx->major);
750
751 list_for_each_entry_rcu(e, list, list) { 764 list_for_each_entry_rcu(e, list, list) {
752 if ((e->rule.mask[word] & bit) == bit && 765 if (audit_in_mask(&e->rule, ctx->major) &&
753 audit_filter_rules(tsk, &e->rule, ctx, NULL, 766 audit_filter_rules(tsk, &e->rule, ctx, NULL,
754 &state, false)) { 767 &state, false)) {
755 rcu_read_unlock(); 768 rcu_read_unlock();
@@ -769,20 +782,16 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
769static int audit_filter_inode_name(struct task_struct *tsk, 782static int audit_filter_inode_name(struct task_struct *tsk,
770 struct audit_names *n, 783 struct audit_names *n,
771 struct audit_context *ctx) { 784 struct audit_context *ctx) {
772 int word, bit;
773 int h = audit_hash_ino((u32)n->ino); 785 int h = audit_hash_ino((u32)n->ino);
774 struct list_head *list = &audit_inode_hash[h]; 786 struct list_head *list = &audit_inode_hash[h];
775 struct audit_entry *e; 787 struct audit_entry *e;
776 enum audit_state state; 788 enum audit_state state;
777 789
778 word = AUDIT_WORD(ctx->major);
779 bit = AUDIT_BIT(ctx->major);
780
781 if (list_empty(list)) 790 if (list_empty(list))
782 return 0; 791 return 0;
783 792
784 list_for_each_entry_rcu(e, list, list) { 793 list_for_each_entry_rcu(e, list, list) {
785 if ((e->rule.mask[word] & bit) == bit && 794 if (audit_in_mask(&e->rule, ctx->major) &&
786 audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) { 795 audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) {
787 ctx->current_state = state; 796 ctx->current_state = state;
788 return 1; 797 return 1;
diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c
index a5e026bc45c4..1323360d90e3 100644
--- a/kernel/backtracetest.c
+++ b/kernel/backtracetest.c
@@ -19,8 +19,8 @@
19 19
20static void backtrace_test_normal(void) 20static void backtrace_test_normal(void)
21{ 21{
22 printk("Testing a backtrace from process context.\n"); 22 pr_info("Testing a backtrace from process context.\n");
23 printk("The following trace is a kernel self test and not a bug!\n"); 23 pr_info("The following trace is a kernel self test and not a bug!\n");
24 24
25 dump_stack(); 25 dump_stack();
26} 26}
@@ -37,8 +37,8 @@ static DECLARE_TASKLET(backtrace_tasklet, &backtrace_test_irq_callback, 0);
37 37
38static void backtrace_test_irq(void) 38static void backtrace_test_irq(void)
39{ 39{
40 printk("Testing a backtrace from irq context.\n"); 40 pr_info("Testing a backtrace from irq context.\n");
41 printk("The following trace is a kernel self test and not a bug!\n"); 41 pr_info("The following trace is a kernel self test and not a bug!\n");
42 42
43 init_completion(&backtrace_work); 43 init_completion(&backtrace_work);
44 tasklet_schedule(&backtrace_tasklet); 44 tasklet_schedule(&backtrace_tasklet);
@@ -51,8 +51,8 @@ static void backtrace_test_saved(void)
51 struct stack_trace trace; 51 struct stack_trace trace;
52 unsigned long entries[8]; 52 unsigned long entries[8];
53 53
54 printk("Testing a saved backtrace.\n"); 54 pr_info("Testing a saved backtrace.\n");
55 printk("The following trace is a kernel self test and not a bug!\n"); 55 pr_info("The following trace is a kernel self test and not a bug!\n");
56 56
57 trace.nr_entries = 0; 57 trace.nr_entries = 0;
58 trace.max_entries = ARRAY_SIZE(entries); 58 trace.max_entries = ARRAY_SIZE(entries);
@@ -65,19 +65,19 @@ static void backtrace_test_saved(void)
65#else 65#else
66static void backtrace_test_saved(void) 66static void backtrace_test_saved(void)
67{ 67{
68 printk("Saved backtrace test skipped.\n"); 68 pr_info("Saved backtrace test skipped.\n");
69} 69}
70#endif 70#endif
71 71
72static int backtrace_regression_test(void) 72static int backtrace_regression_test(void)
73{ 73{
74 printk("====[ backtrace testing ]===========\n"); 74 pr_info("====[ backtrace testing ]===========\n");
75 75
76 backtrace_test_normal(); 76 backtrace_test_normal();
77 backtrace_test_irq(); 77 backtrace_test_irq();
78 backtrace_test_saved(); 78 backtrace_test_saved();
79 79
80 printk("====[ end of backtrace testing ]====\n"); 80 pr_info("====[ end of backtrace testing ]====\n");
81 return 0; 81 return 0;
82} 82}
83 83
diff --git a/kernel/capability.c b/kernel/capability.c
index a8d63df0c322..a5cf13c018ce 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -24,7 +24,6 @@
24 */ 24 */
25 25
26const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET; 26const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET;
27
28EXPORT_SYMBOL(__cap_empty_set); 27EXPORT_SYMBOL(__cap_empty_set);
29 28
30int file_caps_enabled = 1; 29int file_caps_enabled = 1;
@@ -189,7 +188,7 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
189 * 188 *
190 * An alternative would be to return an error here 189 * An alternative would be to return an error here
191 * (-ERANGE), but that causes legacy applications to 190 * (-ERANGE), but that causes legacy applications to
192 * unexpectidly fail; the capget/modify/capset aborts 191 * unexpectedly fail; the capget/modify/capset aborts
193 * before modification is attempted and the application 192 * before modification is attempted and the application
194 * fails. 193 * fails.
195 */ 194 */
@@ -395,7 +394,8 @@ EXPORT_SYMBOL(ns_capable);
395 * This does not set PF_SUPERPRIV because the caller may not 394 * This does not set PF_SUPERPRIV because the caller may not
396 * actually be privileged. 395 * actually be privileged.
397 */ 396 */
398bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap) 397bool file_ns_capable(const struct file *file, struct user_namespace *ns,
398 int cap)
399{ 399{
400 if (WARN_ON_ONCE(!cap_valid(cap))) 400 if (WARN_ON_ONCE(!cap_valid(cap)))
401 return false; 401 return false;
@@ -424,23 +424,19 @@ bool capable(int cap)
424EXPORT_SYMBOL(capable); 424EXPORT_SYMBOL(capable);
425 425
426/** 426/**
427 * inode_capable - Check superior capability over inode 427 * capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped
428 * @inode: The inode in question 428 * @inode: The inode in question
429 * @cap: The capability in question 429 * @cap: The capability in question
430 * 430 *
431 * Return true if the current task has the given superior capability 431 * Return true if the current task has the given capability targeted at
432 * targeted at it's own user namespace and that the given inode is owned 432 * its own user namespace and that the given inode's uid and gid are
433 * by the current user namespace or a child namespace. 433 * mapped into the current user namespace.
434 *
435 * Currently we check to see if an inode is owned by the current
436 * user namespace by seeing if the inode's owner maps into the
437 * current user namespace.
438 *
439 */ 434 */
440bool inode_capable(const struct inode *inode, int cap) 435bool capable_wrt_inode_uidgid(const struct inode *inode, int cap)
441{ 436{
442 struct user_namespace *ns = current_user_ns(); 437 struct user_namespace *ns = current_user_ns();
443 438
444 return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid); 439 return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid) &&
440 kgid_has_mapping(ns, inode->i_gid);
445} 441}
446EXPORT_SYMBOL(inode_capable); 442EXPORT_SYMBOL(capable_wrt_inode_uidgid);
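
Besides the rename, the hunk above tightens the semantics: privilege over an inode now counts only if both the inode's uid and gid map into the caller's user namespace. A hedged sketch of the kind of call site the helper targets (example_setattr_check is hypothetical and not part of this patch):

#include <linux/capability.h>
#include <linux/cred.h>
#include <linux/fs.h>

/* hypothetical caller, shown only to illustrate the new helper's use */
static int example_setattr_check(struct inode *inode)
{
	/* the owner may proceed directly */
	if (uid_eq(current_fsuid(), inode->i_uid))
		return 0;
	/* otherwise CAP_FOWNER must be meaningful for this inode's uid and gid */
	if (capable_wrt_inode_uidgid(inode, CAP_FOWNER))
		return 0;
	return -EPERM;
}
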
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 9fcdaa705b6c..7868fc3c0bc5 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -26,6 +26,8 @@
26 * distribution for more details. 26 * distribution for more details.
27 */ 27 */
28 28
29#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
30
29#include <linux/cgroup.h> 31#include <linux/cgroup.h>
30#include <linux/cred.h> 32#include <linux/cred.h>
31#include <linux/ctype.h> 33#include <linux/ctype.h>
@@ -33,6 +35,7 @@
33#include <linux/init_task.h> 35#include <linux/init_task.h>
34#include <linux/kernel.h> 36#include <linux/kernel.h>
35#include <linux/list.h> 37#include <linux/list.h>
38#include <linux/magic.h>
36#include <linux/mm.h> 39#include <linux/mm.h>
37#include <linux/mutex.h> 40#include <linux/mutex.h>
38#include <linux/mount.h> 41#include <linux/mount.h>
@@ -69,15 +72,6 @@
69 MAX_CFTYPE_NAME + 2) 72 MAX_CFTYPE_NAME + 2)
70 73
71/* 74/*
72 * cgroup_tree_mutex nests above cgroup_mutex and protects cftypes, file
73 * creation/removal and hierarchy changing operations including cgroup
74 * creation, removal, css association and controller rebinding. This outer
75 * lock is needed mainly to resolve the circular dependency between kernfs
76 * active ref and cgroup_mutex. cgroup_tree_mutex nests above both.
77 */
78static DEFINE_MUTEX(cgroup_tree_mutex);
79
80/*
81 * cgroup_mutex is the master lock. Any modification to cgroup or its 75 * cgroup_mutex is the master lock. Any modification to cgroup or its
82 * hierarchy must be performed while holding it. 76 * hierarchy must be performed while holding it.
83 * 77 *
@@ -98,16 +92,21 @@ static DECLARE_RWSEM(css_set_rwsem);
98#endif 92#endif
99 93
100/* 94/*
95 * Protects cgroup_idr and css_idr so that IDs can be released without
96 * grabbing cgroup_mutex.
97 */
98static DEFINE_SPINLOCK(cgroup_idr_lock);
99
100/*
101 * Protects cgroup_subsys->release_agent_path. Modifying it also requires 101 * Protects cgroup_subsys->release_agent_path. Modifying it also requires
102 * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock. 102 * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock.
103 */ 103 */
104static DEFINE_SPINLOCK(release_agent_path_lock); 104static DEFINE_SPINLOCK(release_agent_path_lock);
105 105
106#define cgroup_assert_mutexes_or_rcu_locked() \ 106#define cgroup_assert_mutex_or_rcu_locked() \
107 rcu_lockdep_assert(rcu_read_lock_held() || \ 107 rcu_lockdep_assert(rcu_read_lock_held() || \
108 lockdep_is_held(&cgroup_tree_mutex) || \
109 lockdep_is_held(&cgroup_mutex), \ 108 lockdep_is_held(&cgroup_mutex), \
110 "cgroup_[tree_]mutex or RCU read lock required"); 109 "cgroup_mutex or RCU read lock required");
111 110
112/* 111/*
113 * cgroup destruction makes heavy use of work items and there can be a lot 112 * cgroup destruction makes heavy use of work items and there can be a lot
@@ -150,6 +149,13 @@ struct cgroup_root cgrp_dfl_root;
150 */ 149 */
151static bool cgrp_dfl_root_visible; 150static bool cgrp_dfl_root_visible;
152 151
152/* some controllers are not supported in the default hierarchy */
153static const unsigned int cgrp_dfl_root_inhibit_ss_mask = 0
154#ifdef CONFIG_CGROUP_DEBUG
155 | (1 << debug_cgrp_id)
156#endif
157 ;
158
153/* The list of hierarchy roots */ 159/* The list of hierarchy roots */
154 160
155static LIST_HEAD(cgroup_roots); 161static LIST_HEAD(cgroup_roots);
@@ -159,14 +165,13 @@ static int cgroup_root_count;
159static DEFINE_IDR(cgroup_hierarchy_idr); 165static DEFINE_IDR(cgroup_hierarchy_idr);
160 166
161/* 167/*
162 * Assign a monotonically increasing serial number to cgroups. It 168 * Assign a monotonically increasing serial number to csses. It guarantees
163 * guarantees cgroups with bigger numbers are newer than those with smaller 169 * cgroups with bigger numbers are newer than those with smaller numbers.
164 * numbers. Also, as cgroups are always appended to the parent's 170 * Also, as csses are always appended to the parent's ->children list, it
165 * ->children list, it guarantees that sibling cgroups are always sorted in 171 * guarantees that sibling csses are always sorted in the ascending serial
166 * the ascending serial number order on the list. Protected by 172 * number order on the list. Protected by cgroup_mutex.
167 * cgroup_mutex.
168 */ 173 */
169static u64 cgroup_serial_nr_next = 1; 174static u64 css_serial_nr_next = 1;
170 175
171/* This flag indicates whether tasks in the fork and exit paths should 176/* This flag indicates whether tasks in the fork and exit paths should
172 * check for fork/exit handlers to call. This avoids us having to do 177 * check for fork/exit handlers to call. This avoids us having to do
@@ -179,17 +184,59 @@ static struct cftype cgroup_base_files[];
179 184
180static void cgroup_put(struct cgroup *cgrp); 185static void cgroup_put(struct cgroup *cgrp);
181static int rebind_subsystems(struct cgroup_root *dst_root, 186static int rebind_subsystems(struct cgroup_root *dst_root,
182 unsigned long ss_mask); 187 unsigned int ss_mask);
183static void cgroup_destroy_css_killed(struct cgroup *cgrp);
184static int cgroup_destroy_locked(struct cgroup *cgrp); 188static int cgroup_destroy_locked(struct cgroup *cgrp);
189static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss);
190static void css_release(struct percpu_ref *ref);
191static void kill_css(struct cgroup_subsys_state *css);
185static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], 192static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
186 bool is_add); 193 bool is_add);
187static void cgroup_pidlist_destroy_all(struct cgroup *cgrp); 194static void cgroup_pidlist_destroy_all(struct cgroup *cgrp);
188 195
196/* IDR wrappers which synchronize using cgroup_idr_lock */
197static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
198 gfp_t gfp_mask)
199{
200 int ret;
201
202 idr_preload(gfp_mask);
203 spin_lock_bh(&cgroup_idr_lock);
204 ret = idr_alloc(idr, ptr, start, end, gfp_mask);
205 spin_unlock_bh(&cgroup_idr_lock);
206 idr_preload_end();
207 return ret;
208}
209
210static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id)
211{
212 void *ret;
213
214 spin_lock_bh(&cgroup_idr_lock);
215 ret = idr_replace(idr, ptr, id);
216 spin_unlock_bh(&cgroup_idr_lock);
217 return ret;
218}
219
220static void cgroup_idr_remove(struct idr *idr, int id)
221{
222 spin_lock_bh(&cgroup_idr_lock);
223 idr_remove(idr, id);
224 spin_unlock_bh(&cgroup_idr_lock);
225}
226
227static struct cgroup *cgroup_parent(struct cgroup *cgrp)
228{
229 struct cgroup_subsys_state *parent_css = cgrp->self.parent;
230
231 if (parent_css)
232 return container_of(parent_css, struct cgroup, self);
233 return NULL;
234}
235
189/** 236/**
190 * cgroup_css - obtain a cgroup's css for the specified subsystem 237 * cgroup_css - obtain a cgroup's css for the specified subsystem
191 * @cgrp: the cgroup of interest 238 * @cgrp: the cgroup of interest
192 * @ss: the subsystem of interest (%NULL returns the dummy_css) 239 * @ss: the subsystem of interest (%NULL returns @cgrp->self)
193 * 240 *
194 * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This 241 * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This
195 * function must be called either under cgroup_mutex or rcu_read_lock() and 242 * function must be called either under cgroup_mutex or rcu_read_lock() and
@@ -202,23 +249,49 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
202{ 249{
203 if (ss) 250 if (ss)
204 return rcu_dereference_check(cgrp->subsys[ss->id], 251 return rcu_dereference_check(cgrp->subsys[ss->id],
205 lockdep_is_held(&cgroup_tree_mutex) ||
206 lockdep_is_held(&cgroup_mutex)); 252 lockdep_is_held(&cgroup_mutex));
207 else 253 else
208 return &cgrp->dummy_css; 254 return &cgrp->self;
255}
256
257/**
258 * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
259 * @cgrp: the cgroup of interest
260 * @ss: the subsystem of interest (%NULL returns @cgrp->self)
261 *
262 * Similar to cgroup_css() but returns the effctive css, which is defined
263 * as the matching css of the nearest ancestor including self which has @ss
264 * enabled. If @ss is associated with the hierarchy @cgrp is on, this
265 * function is guaranteed to return non-NULL css.
266 */
267static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
268 struct cgroup_subsys *ss)
269{
270 lockdep_assert_held(&cgroup_mutex);
271
272 if (!ss)
273 return &cgrp->self;
274
275 if (!(cgrp->root->subsys_mask & (1 << ss->id)))
276 return NULL;
277
278 while (cgroup_parent(cgrp) &&
279 !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ss->id)))
280 cgrp = cgroup_parent(cgrp);
281
282 return cgroup_css(cgrp, ss);
209} 283}
210 284
211/* convenient tests for these bits */ 285/* convenient tests for these bits */
212static inline bool cgroup_is_dead(const struct cgroup *cgrp) 286static inline bool cgroup_is_dead(const struct cgroup *cgrp)
213{ 287{
214 return test_bit(CGRP_DEAD, &cgrp->flags); 288 return !(cgrp->self.flags & CSS_ONLINE);
215} 289}
216 290
217struct cgroup_subsys_state *seq_css(struct seq_file *seq) 291struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
218{ 292{
219 struct kernfs_open_file *of = seq->private;
220 struct cgroup *cgrp = of->kn->parent->priv; 293 struct cgroup *cgrp = of->kn->parent->priv;
221 struct cftype *cft = seq_cft(seq); 294 struct cftype *cft = of_cft(of);
222 295
223 /* 296 /*
224 * This is open and unprotected implementation of cgroup_css(). 297 * This is open and unprotected implementation of cgroup_css().
@@ -231,9 +304,9 @@ struct cgroup_subsys_state *seq_css(struct seq_file *seq)
231 if (cft->ss) 304 if (cft->ss)
232 return rcu_dereference_raw(cgrp->subsys[cft->ss->id]); 305 return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
233 else 306 else
234 return &cgrp->dummy_css; 307 return &cgrp->self;
235} 308}
236EXPORT_SYMBOL_GPL(seq_css); 309EXPORT_SYMBOL_GPL(of_css);
237 310
238/** 311/**
239 * cgroup_is_descendant - test ancestry 312 * cgroup_is_descendant - test ancestry
@@ -249,7 +322,7 @@ bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
249 while (cgrp) { 322 while (cgrp) {
250 if (cgrp == ancestor) 323 if (cgrp == ancestor)
251 return true; 324 return true;
252 cgrp = cgrp->parent; 325 cgrp = cgroup_parent(cgrp);
253 } 326 }
254 return false; 327 return false;
255} 328}
@@ -273,17 +346,30 @@ static int notify_on_release(const struct cgroup *cgrp)
273 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end 346 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
274 * @cgrp: the target cgroup to iterate css's of 347 * @cgrp: the target cgroup to iterate css's of
275 * 348 *
276 * Should be called under cgroup_mutex. 349 * Should be called under cgroup_[tree_]mutex.
277 */ 350 */
278#define for_each_css(css, ssid, cgrp) \ 351#define for_each_css(css, ssid, cgrp) \
279 for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ 352 for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
280 if (!((css) = rcu_dereference_check( \ 353 if (!((css) = rcu_dereference_check( \
281 (cgrp)->subsys[(ssid)], \ 354 (cgrp)->subsys[(ssid)], \
282 lockdep_is_held(&cgroup_tree_mutex) || \
283 lockdep_is_held(&cgroup_mutex)))) { } \ 355 lockdep_is_held(&cgroup_mutex)))) { } \
284 else 356 else
285 357
286/** 358/**
359 * for_each_e_css - iterate all effective css's of a cgroup
360 * @css: the iteration cursor
361 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
362 * @cgrp: the target cgroup to iterate css's of
363 *
364 * Should be called under cgroup_[tree_]mutex.
365 */
366#define for_each_e_css(css, ssid, cgrp) \
367 for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
368 if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \
369 ; \
370 else
371
372/**
287 * for_each_subsys - iterate all enabled cgroup subsystems 373 * for_each_subsys - iterate all enabled cgroup subsystems
288 * @ss: the iteration cursor 374 * @ss: the iteration cursor
289 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end 375 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
@@ -296,22 +382,13 @@ static int notify_on_release(const struct cgroup *cgrp)
296#define for_each_root(root) \ 382#define for_each_root(root) \
297 list_for_each_entry((root), &cgroup_roots, root_list) 383 list_for_each_entry((root), &cgroup_roots, root_list)
298 384
299/** 385/* iterate over child cgrps, lock should be held throughout iteration */
300 * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. 386#define cgroup_for_each_live_child(child, cgrp) \
301 * @cgrp: the cgroup to be checked for liveness 387 list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
302 * 388 if (({ lockdep_assert_held(&cgroup_mutex); \
303 * On success, returns true; the mutex should be later unlocked. On 389 cgroup_is_dead(child); })) \
304 * failure returns false with no lock held. 390 ; \
305 */ 391 else
306static bool cgroup_lock_live_group(struct cgroup *cgrp)
307{
308 mutex_lock(&cgroup_mutex);
309 if (cgroup_is_dead(cgrp)) {
310 mutex_unlock(&cgroup_mutex);
311 return false;
312 }
313 return true;
314}
315 392
316/* the list of cgroups eligible for automatic release. Protected by 393/* the list of cgroups eligible for automatic release. Protected by
317 * release_list_lock */ 394 * release_list_lock */
@@ -348,7 +425,7 @@ struct cgrp_cset_link {
348 * reference-counted, to improve performance when child cgroups 425 * reference-counted, to improve performance when child cgroups
349 * haven't been created. 426 * haven't been created.
350 */ 427 */
351static struct css_set init_css_set = { 428struct css_set init_css_set = {
352 .refcount = ATOMIC_INIT(1), 429 .refcount = ATOMIC_INIT(1),
353 .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), 430 .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
354 .tasks = LIST_HEAD_INIT(init_css_set.tasks), 431 .tasks = LIST_HEAD_INIT(init_css_set.tasks),
@@ -359,6 +436,43 @@ static struct css_set init_css_set = {
359 436
360static int css_set_count = 1; /* 1 for init_css_set */ 437static int css_set_count = 1; /* 1 for init_css_set */
361 438
439/**
440 * cgroup_update_populated - updated populated count of a cgroup
441 * @cgrp: the target cgroup
442 * @populated: inc or dec populated count
443 *
444 * @cgrp is either getting the first task (css_set) or losing the last.
445 * Update @cgrp->populated_cnt accordingly. The count is propagated
446 * towards root so that a given cgroup's populated_cnt is zero iff the
447 * cgroup and all its descendants are empty.
448 *
449 * @cgrp's interface file "cgroup.populated" is zero if
450 * @cgrp->populated_cnt is zero and 1 otherwise. When @cgrp->populated_cnt
451 * changes from or to zero, userland is notified that the content of the
452 * interface file has changed. This can be used to detect when @cgrp and
453 * its descendants become populated or empty.
454 */
455static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
456{
457 lockdep_assert_held(&css_set_rwsem);
458
459 do {
460 bool trigger;
461
462 if (populated)
463 trigger = !cgrp->populated_cnt++;
464 else
465 trigger = !--cgrp->populated_cnt;
466
467 if (!trigger)
468 break;
469
470 if (cgrp->populated_kn)
471 kernfs_notify(cgrp->populated_kn);
472 cgrp = cgroup_parent(cgrp);
473 } while (cgrp);
474}
475
362/* 476/*
363 * hash table for cgroup groups. This improves the performance to find 477 * hash table for cgroup groups. This improves the performance to find
364 * an existing css_set. This hash doesn't (currently) take into 478 * an existing css_set. This hash doesn't (currently) take into
@@ -383,6 +497,8 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
383static void put_css_set_locked(struct css_set *cset, bool taskexit) 497static void put_css_set_locked(struct css_set *cset, bool taskexit)
384{ 498{
385 struct cgrp_cset_link *link, *tmp_link; 499 struct cgrp_cset_link *link, *tmp_link;
500 struct cgroup_subsys *ss;
501 int ssid;
386 502
387 lockdep_assert_held(&css_set_rwsem); 503 lockdep_assert_held(&css_set_rwsem);
388 504
@@ -390,6 +506,8 @@ static void put_css_set_locked(struct css_set *cset, bool taskexit)
390 return; 506 return;
391 507
392 /* This css_set is dead. unlink it and release cgroup refcounts */ 508 /* This css_set is dead. unlink it and release cgroup refcounts */
509 for_each_subsys(ss, ssid)
510 list_del(&cset->e_cset_node[ssid]);
393 hash_del(&cset->hlist); 511 hash_del(&cset->hlist);
394 css_set_count--; 512 css_set_count--;
395 513
@@ -400,10 +518,13 @@ static void put_css_set_locked(struct css_set *cset, bool taskexit)
400 list_del(&link->cgrp_link); 518 list_del(&link->cgrp_link);
401 519
402 /* @cgrp can't go away while we're holding css_set_rwsem */ 520 /* @cgrp can't go away while we're holding css_set_rwsem */
403 if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) { 521 if (list_empty(&cgrp->cset_links)) {
404 if (taskexit) 522 cgroup_update_populated(cgrp, false);
405 set_bit(CGRP_RELEASABLE, &cgrp->flags); 523 if (notify_on_release(cgrp)) {
406 check_for_release(cgrp); 524 if (taskexit)
525 set_bit(CGRP_RELEASABLE, &cgrp->flags);
526 check_for_release(cgrp);
527 }
407 } 528 }
408 529
409 kfree(link); 530 kfree(link);
@@ -452,20 +573,20 @@ static bool compare_css_sets(struct css_set *cset,
452{ 573{
453 struct list_head *l1, *l2; 574 struct list_head *l1, *l2;
454 575
455 if (memcmp(template, cset->subsys, sizeof(cset->subsys))) { 576 /*
456 /* Not all subsystems matched */ 577 * On the default hierarchy, there can be csets which are
578 * associated with the same set of cgroups but different csses.
579 * Let's first ensure that csses match.
580 */
581 if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
457 return false; 582 return false;
458 }
459 583
460 /* 584 /*
461 * Compare cgroup pointers in order to distinguish between 585 * Compare cgroup pointers in order to distinguish between
462 * different cgroups in heirarchies with no subsystems. We 586 * different cgroups in hierarchies. As different cgroups may
463 * could get by with just this check alone (and skip the 587 * share the same effective css, this comparison is always
464 * memcmp above) but on most setups the memcmp check will 588 * necessary.
465 * avoid the need for this more expensive check on almost all
466 * candidates.
467 */ 589 */
468
469 l1 = &cset->cgrp_links; 590 l1 = &cset->cgrp_links;
470 l2 = &old_cset->cgrp_links; 591 l2 = &old_cset->cgrp_links;
471 while (1) { 592 while (1) {
@@ -529,14 +650,17 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset,
529 * won't change, so no need for locking. 650 * won't change, so no need for locking.
530 */ 651 */
531 for_each_subsys(ss, i) { 652 for_each_subsys(ss, i) {
532 if (root->cgrp.subsys_mask & (1UL << i)) { 653 if (root->subsys_mask & (1UL << i)) {
533 /* Subsystem is in this hierarchy. So we want 654 /*
534 * the subsystem state from the new 655 * @ss is in this hierarchy, so we want the
535 * cgroup */ 656 * effective css from @cgrp.
536 template[i] = cgroup_css(cgrp, ss); 657 */
658 template[i] = cgroup_e_css(cgrp, ss);
537 } else { 659 } else {
538 /* Subsystem is not in this hierarchy, so we 660 /*
539 * don't want to change the subsystem state */ 661 * @ss is not in this hierarchy, so we don't want
662 * to change the css.
663 */
540 template[i] = old_cset->subsys[i]; 664 template[i] = old_cset->subsys[i];
541 } 665 }
542 } 666 }
@@ -602,10 +726,18 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
602 struct cgrp_cset_link *link; 726 struct cgrp_cset_link *link;
603 727
604 BUG_ON(list_empty(tmp_links)); 728 BUG_ON(list_empty(tmp_links));
729
730 if (cgroup_on_dfl(cgrp))
731 cset->dfl_cgrp = cgrp;
732
605 link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link); 733 link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
606 link->cset = cset; 734 link->cset = cset;
607 link->cgrp = cgrp; 735 link->cgrp = cgrp;
736
737 if (list_empty(&cgrp->cset_links))
738 cgroup_update_populated(cgrp, true);
608 list_move(&link->cset_link, &cgrp->cset_links); 739 list_move(&link->cset_link, &cgrp->cset_links);
740
609 /* 741 /*
610 * Always add links to the tail of the list so that the list 742 * Always add links to the tail of the list so that the list
611 * is sorted by order of hierarchy creation 743 * is sorted by order of hierarchy creation
@@ -628,7 +760,9 @@ static struct css_set *find_css_set(struct css_set *old_cset,
628 struct css_set *cset; 760 struct css_set *cset;
629 struct list_head tmp_links; 761 struct list_head tmp_links;
630 struct cgrp_cset_link *link; 762 struct cgrp_cset_link *link;
763 struct cgroup_subsys *ss;
631 unsigned long key; 764 unsigned long key;
765 int ssid;
632 766
633 lockdep_assert_held(&cgroup_mutex); 767 lockdep_assert_held(&cgroup_mutex);
634 768
@@ -679,10 +813,14 @@ static struct css_set *find_css_set(struct css_set *old_cset,
679 813
680 css_set_count++; 814 css_set_count++;
681 815
682 /* Add this cgroup group to the hash table */ 816 /* Add @cset to the hash table */
683 key = css_set_hash(cset->subsys); 817 key = css_set_hash(cset->subsys);
684 hash_add(css_set_table, &cset->hlist, key); 818 hash_add(css_set_table, &cset->hlist, key);
685 819
820 for_each_subsys(ss, ssid)
821 list_add_tail(&cset->e_cset_node[ssid],
822 &cset->subsys[ssid]->cgroup->e_csets[ssid]);
823
686 up_write(&css_set_rwsem); 824 up_write(&css_set_rwsem);
687 825
688 return cset; 826 return cset;
@@ -735,14 +873,13 @@ static void cgroup_destroy_root(struct cgroup_root *root)
735 struct cgroup *cgrp = &root->cgrp; 873 struct cgroup *cgrp = &root->cgrp;
736 struct cgrp_cset_link *link, *tmp_link; 874 struct cgrp_cset_link *link, *tmp_link;
737 875
738 mutex_lock(&cgroup_tree_mutex);
739 mutex_lock(&cgroup_mutex); 876 mutex_lock(&cgroup_mutex);
740 877
741 BUG_ON(atomic_read(&root->nr_cgrps)); 878 BUG_ON(atomic_read(&root->nr_cgrps));
742 BUG_ON(!list_empty(&cgrp->children)); 879 BUG_ON(!list_empty(&cgrp->self.children));
743 880
744 /* Rebind all subsystems back to the default hierarchy */ 881 /* Rebind all subsystems back to the default hierarchy */
745 rebind_subsystems(&cgrp_dfl_root, cgrp->subsys_mask); 882 rebind_subsystems(&cgrp_dfl_root, root->subsys_mask);
746 883
747 /* 884 /*
748 * Release all the links from cset_links to this hierarchy's 885 * Release all the links from cset_links to this hierarchy's
@@ -765,7 +902,6 @@ static void cgroup_destroy_root(struct cgroup_root *root)
765 cgroup_exit_root_id(root); 902 cgroup_exit_root_id(root);
766 903
767 mutex_unlock(&cgroup_mutex); 904 mutex_unlock(&cgroup_mutex);
768 mutex_unlock(&cgroup_tree_mutex);
769 905
770 kernfs_destroy_root(root->kf_root); 906 kernfs_destroy_root(root->kf_root);
771 cgroup_free_root(root); 907 cgroup_free_root(root);
@@ -848,7 +984,7 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
848 * update of a tasks cgroup pointer by cgroup_attach_task() 984 * update of a tasks cgroup pointer by cgroup_attach_task()
849 */ 985 */
850 986
851static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask); 987static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask);
852static struct kernfs_syscall_ops cgroup_kf_syscall_ops; 988static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
853static const struct file_operations proc_cgroupstats_operations; 989static const struct file_operations proc_cgroupstats_operations;
854 990
@@ -883,79 +1019,95 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
883 if (cft->read_u64 || cft->read_s64 || cft->seq_show) 1019 if (cft->read_u64 || cft->read_s64 || cft->seq_show)
884 mode |= S_IRUGO; 1020 mode |= S_IRUGO;
885 1021
886 if (cft->write_u64 || cft->write_s64 || cft->write_string || 1022 if (cft->write_u64 || cft->write_s64 || cft->write)
887 cft->trigger)
888 mode |= S_IWUSR; 1023 mode |= S_IWUSR;
889 1024
890 return mode; 1025 return mode;
891} 1026}
892 1027
893static void cgroup_free_fn(struct work_struct *work) 1028static void cgroup_get(struct cgroup *cgrp)
894{ 1029{
895 struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); 1030 WARN_ON_ONCE(cgroup_is_dead(cgrp));
896 1031 css_get(&cgrp->self);
897 atomic_dec(&cgrp->root->nr_cgrps);
898 cgroup_pidlist_destroy_all(cgrp);
899
900 if (cgrp->parent) {
901 /*
902 * We get a ref to the parent, and put the ref when this
903 * cgroup is being freed, so it's guaranteed that the
904 * parent won't be destroyed before its children.
905 */
906 cgroup_put(cgrp->parent);
907 kernfs_put(cgrp->kn);
908 kfree(cgrp);
909 } else {
910 /*
911 * This is root cgroup's refcnt reaching zero, which
912 * indicates that the root should be released.
913 */
914 cgroup_destroy_root(cgrp->root);
915 }
916} 1032}
917 1033
918static void cgroup_free_rcu(struct rcu_head *head) 1034static void cgroup_put(struct cgroup *cgrp)
919{ 1035{
920 struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head); 1036 css_put(&cgrp->self);
921
922 INIT_WORK(&cgrp->destroy_work, cgroup_free_fn);
923 queue_work(cgroup_destroy_wq, &cgrp->destroy_work);
924} 1037}
925 1038
926static void cgroup_get(struct cgroup *cgrp) 1039/**
1040 * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods
1041 * @kn: the kernfs_node being serviced
1042 *
1043 * This helper undoes cgroup_kn_lock_live() and should be invoked before
1044 * the method finishes if locking succeeded. Note that once this function
1045 * returns the cgroup returned by cgroup_kn_lock_live() may become
1046 * inaccessible any time. If the caller intends to continue to access the
1047 * cgroup, it should pin it before invoking this function.
1048 */
1049static void cgroup_kn_unlock(struct kernfs_node *kn)
927{ 1050{
928 WARN_ON_ONCE(cgroup_is_dead(cgrp)); 1051 struct cgroup *cgrp;
929 WARN_ON_ONCE(atomic_read(&cgrp->refcnt) <= 0); 1052
930 atomic_inc(&cgrp->refcnt); 1053 if (kernfs_type(kn) == KERNFS_DIR)
1054 cgrp = kn->priv;
1055 else
1056 cgrp = kn->parent->priv;
1057
1058 mutex_unlock(&cgroup_mutex);
1059
1060 kernfs_unbreak_active_protection(kn);
1061 cgroup_put(cgrp);
931} 1062}
932 1063
933static void cgroup_put(struct cgroup *cgrp) 1064/**
1065 * cgroup_kn_lock_live - locking helper for cgroup kernfs methods
1066 * @kn: the kernfs_node being serviced
1067 *
1068 * This helper is to be used by a cgroup kernfs method currently servicing
1069 * @kn. It breaks the active protection, performs cgroup locking and
1070 * verifies that the associated cgroup is alive. Returns the cgroup if
1071 * alive; otherwise, %NULL. A successful return should be undone by a
1072 * matching cgroup_kn_unlock() invocation.
1073 *
1074 * Any cgroup kernfs method implementation which requires locking the
1075 * associated cgroup should use this helper. It avoids nesting cgroup
1076 * locking under kernfs active protection and allows all kernfs operations
1077 * including self-removal.
1078 */
1079static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn)
934{ 1080{
935 if (!atomic_dec_and_test(&cgrp->refcnt)) 1081 struct cgroup *cgrp;
936 return; 1082
937 if (WARN_ON_ONCE(cgrp->parent && !cgroup_is_dead(cgrp))) 1083 if (kernfs_type(kn) == KERNFS_DIR)
938 return; 1084 cgrp = kn->priv;
1085 else
1086 cgrp = kn->parent->priv;
939 1087
940 /* 1088 /*
941 * XXX: cgrp->id is only used to look up css's. As cgroup and 1089 * We're gonna grab cgroup_mutex which nests outside kernfs
942 * css's lifetimes will be decoupled, it should be made 1090 * active_ref. cgroup liveliness check alone provides enough
943 * per-subsystem and moved to css->id so that lookups are 1091 * protection against removal. Ensure @cgrp stays accessible and
944 * successful until the target css is released. 1092 * break the active_ref protection.
945 */ 1093 */
1094 cgroup_get(cgrp);
1095 kernfs_break_active_protection(kn);
1096
946 mutex_lock(&cgroup_mutex); 1097 mutex_lock(&cgroup_mutex);
947 idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
948 mutex_unlock(&cgroup_mutex);
949 cgrp->id = -1;
950 1098
951 call_rcu(&cgrp->rcu_head, cgroup_free_rcu); 1099 if (!cgroup_is_dead(cgrp))
1100 return cgrp;
1101
1102 cgroup_kn_unlock(kn);
1103 return NULL;
952} 1104}
953 1105
954static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) 1106static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
955{ 1107{
956 char name[CGROUP_FILE_NAME_MAX]; 1108 char name[CGROUP_FILE_NAME_MAX];
957 1109
958 lockdep_assert_held(&cgroup_tree_mutex); 1110 lockdep_assert_held(&cgroup_mutex);
959 kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name)); 1111 kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
960} 1112}
961 1113
@@ -964,7 +1116,7 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
964 * @cgrp: target cgroup 1116 * @cgrp: target cgroup
965 * @subsys_mask: mask of the subsystem ids whose files should be removed 1117 * @subsys_mask: mask of the subsystem ids whose files should be removed
966 */ 1118 */
967static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) 1119static void cgroup_clear_dir(struct cgroup *cgrp, unsigned int subsys_mask)
968{ 1120{
969 struct cgroup_subsys *ss; 1121 struct cgroup_subsys *ss;
970 int i; 1122 int i;
@@ -972,40 +1124,40 @@ static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask)
972 for_each_subsys(ss, i) { 1124 for_each_subsys(ss, i) {
973 struct cftype *cfts; 1125 struct cftype *cfts;
974 1126
975 if (!test_bit(i, &subsys_mask)) 1127 if (!(subsys_mask & (1 << i)))
976 continue; 1128 continue;
977 list_for_each_entry(cfts, &ss->cfts, node) 1129 list_for_each_entry(cfts, &ss->cfts, node)
978 cgroup_addrm_files(cgrp, cfts, false); 1130 cgroup_addrm_files(cgrp, cfts, false);
979 } 1131 }
980} 1132}
981 1133
982static int rebind_subsystems(struct cgroup_root *dst_root, 1134static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask)
983 unsigned long ss_mask)
984{ 1135{
985 struct cgroup_subsys *ss; 1136 struct cgroup_subsys *ss;
986 int ssid, ret; 1137 unsigned int tmp_ss_mask;
1138 int ssid, i, ret;
987 1139
988 lockdep_assert_held(&cgroup_tree_mutex);
989 lockdep_assert_held(&cgroup_mutex); 1140 lockdep_assert_held(&cgroup_mutex);
990 1141
991 for_each_subsys(ss, ssid) { 1142 for_each_subsys(ss, ssid) {
992 if (!(ss_mask & (1 << ssid))) 1143 if (!(ss_mask & (1 << ssid)))
993 continue; 1144 continue;
994 1145
995 /* if @ss is on the dummy_root, we can always move it */ 1146 /* if @ss has non-root csses attached to it, can't move */
996 if (ss->root == &cgrp_dfl_root) 1147 if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)))
997 continue;
998
999 /* if @ss has non-root cgroups attached to it, can't move */
1000 if (!list_empty(&ss->root->cgrp.children))
1001 return -EBUSY; 1148 return -EBUSY;
1002 1149
1003 /* can't move between two non-dummy roots either */ 1150 /* can't move between two non-dummy roots either */
1004 if (dst_root != &cgrp_dfl_root) 1151 if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
1005 return -EBUSY; 1152 return -EBUSY;
1006 } 1153 }
1007 1154
1008 ret = cgroup_populate_dir(&dst_root->cgrp, ss_mask); 1155 /* skip creating root files on dfl_root for inhibited subsystems */
1156 tmp_ss_mask = ss_mask;
1157 if (dst_root == &cgrp_dfl_root)
1158 tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask;
1159
1160 ret = cgroup_populate_dir(&dst_root->cgrp, tmp_ss_mask);
1009 if (ret) { 1161 if (ret) {
1010 if (dst_root != &cgrp_dfl_root) 1162 if (dst_root != &cgrp_dfl_root)
1011 return ret; 1163 return ret;
@@ -1017,9 +1169,9 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
1017 * Just warn about it and continue. 1169 * Just warn about it and continue.
1018 */ 1170 */
1019 if (cgrp_dfl_root_visible) { 1171 if (cgrp_dfl_root_visible) {
1020 pr_warning("cgroup: failed to create files (%d) while rebinding 0x%lx to default root\n", 1172 pr_warn("failed to create files (%d) while rebinding 0x%x to default root\n",
1021 ret, ss_mask); 1173 ret, ss_mask);
1022 pr_warning("cgroup: you may retry by moving them to a different hierarchy and unbinding\n"); 1174 pr_warn("you may retry by moving them to a different hierarchy and unbinding\n");
1023 } 1175 }
1024 } 1176 }
1025 1177
@@ -1027,15 +1179,14 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
1027 * Nothing can fail from this point on. Remove files for the 1179 * Nothing can fail from this point on. Remove files for the
1028 * removed subsystems and rebind each subsystem. 1180 * removed subsystems and rebind each subsystem.
1029 */ 1181 */
1030 mutex_unlock(&cgroup_mutex);
1031 for_each_subsys(ss, ssid) 1182 for_each_subsys(ss, ssid)
1032 if (ss_mask & (1 << ssid)) 1183 if (ss_mask & (1 << ssid))
1033 cgroup_clear_dir(&ss->root->cgrp, 1 << ssid); 1184 cgroup_clear_dir(&ss->root->cgrp, 1 << ssid);
1034 mutex_lock(&cgroup_mutex);
1035 1185
1036 for_each_subsys(ss, ssid) { 1186 for_each_subsys(ss, ssid) {
1037 struct cgroup_root *src_root; 1187 struct cgroup_root *src_root;
1038 struct cgroup_subsys_state *css; 1188 struct cgroup_subsys_state *css;
1189 struct css_set *cset;
1039 1190
1040 if (!(ss_mask & (1 << ssid))) 1191 if (!(ss_mask & (1 << ssid)))
1041 continue; 1192 continue;
@@ -1050,8 +1201,19 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
1050 ss->root = dst_root; 1201 ss->root = dst_root;
1051 css->cgroup = &dst_root->cgrp; 1202 css->cgroup = &dst_root->cgrp;
1052 1203
1053 src_root->cgrp.subsys_mask &= ~(1 << ssid); 1204 down_write(&css_set_rwsem);
1054 dst_root->cgrp.subsys_mask |= 1 << ssid; 1205 hash_for_each(css_set_table, i, cset, hlist)
1206 list_move_tail(&cset->e_cset_node[ss->id],
1207 &dst_root->cgrp.e_csets[ss->id]);
1208 up_write(&css_set_rwsem);
1209
1210 src_root->subsys_mask &= ~(1 << ssid);
1211 src_root->cgrp.child_subsys_mask &= ~(1 << ssid);
1212
1213 /* default hierarchy doesn't enable controllers by default */
1214 dst_root->subsys_mask |= 1 << ssid;
1215 if (dst_root != &cgrp_dfl_root)
1216 dst_root->cgrp.child_subsys_mask |= 1 << ssid;
1055 1217
1056 if (ss->bind) 1218 if (ss->bind)
1057 ss->bind(css); 1219 ss->bind(css);
@@ -1069,7 +1231,7 @@ static int cgroup_show_options(struct seq_file *seq,
1069 int ssid; 1231 int ssid;
1070 1232
1071 for_each_subsys(ss, ssid) 1233 for_each_subsys(ss, ssid)
1072 if (root->cgrp.subsys_mask & (1 << ssid)) 1234 if (root->subsys_mask & (1 << ssid))
1073 seq_printf(seq, ",%s", ss->name); 1235 seq_printf(seq, ",%s", ss->name);
1074 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) 1236 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
1075 seq_puts(seq, ",sane_behavior"); 1237 seq_puts(seq, ",sane_behavior");
@@ -1091,8 +1253,8 @@ static int cgroup_show_options(struct seq_file *seq,
1091} 1253}
1092 1254
1093struct cgroup_sb_opts { 1255struct cgroup_sb_opts {
1094 unsigned long subsys_mask; 1256 unsigned int subsys_mask;
1095 unsigned long flags; 1257 unsigned int flags;
1096 char *release_agent; 1258 char *release_agent;
1097 bool cpuset_clone_children; 1259 bool cpuset_clone_children;
1098 char *name; 1260 char *name;
@@ -1100,24 +1262,16 @@ struct cgroup_sb_opts {
1100 bool none; 1262 bool none;
1101}; 1263};
1102 1264
1103/*
1104 * Convert a hierarchy specifier into a bitmask of subsystems and
1105 * flags. Call with cgroup_mutex held to protect the cgroup_subsys[]
1106 * array. This function takes refcounts on subsystems to be used, unless it
1107 * returns error, in which case no refcounts are taken.
1108 */
1109static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) 1265static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1110{ 1266{
1111 char *token, *o = data; 1267 char *token, *o = data;
1112 bool all_ss = false, one_ss = false; 1268 bool all_ss = false, one_ss = false;
1113 unsigned long mask = (unsigned long)-1; 1269 unsigned int mask = -1U;
1114 struct cgroup_subsys *ss; 1270 struct cgroup_subsys *ss;
1115 int i; 1271 int i;
1116 1272
1117 BUG_ON(!mutex_is_locked(&cgroup_mutex));
1118
1119#ifdef CONFIG_CPUSETS 1273#ifdef CONFIG_CPUSETS
1120 mask = ~(1UL << cpuset_cgrp_id); 1274 mask = ~(1U << cpuset_cgrp_id);
1121#endif 1275#endif
1122 1276
1123 memset(opts, 0, sizeof(*opts)); 1277 memset(opts, 0, sizeof(*opts));
@@ -1198,7 +1352,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1198 /* Mutually exclusive option 'all' + subsystem name */ 1352 /* Mutually exclusive option 'all' + subsystem name */
1199 if (all_ss) 1353 if (all_ss)
1200 return -EINVAL; 1354 return -EINVAL;
1201 set_bit(i, &opts->subsys_mask); 1355 opts->subsys_mask |= (1 << i);
1202 one_ss = true; 1356 one_ss = true;
1203 1357
1204 break; 1358 break;
@@ -1210,12 +1364,12 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1210 /* Consistency checks */ 1364 /* Consistency checks */
1211 1365
1212 if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { 1366 if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
1213 pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); 1367 pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
1214 1368
1215 if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) || 1369 if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) ||
1216 opts->cpuset_clone_children || opts->release_agent || 1370 opts->cpuset_clone_children || opts->release_agent ||
1217 opts->name) { 1371 opts->name) {
1218 pr_err("cgroup: sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n"); 1372 pr_err("sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n");
1219 return -EINVAL; 1373 return -EINVAL;
1220 } 1374 }
1221 } else { 1375 } else {
@@ -1227,7 +1381,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1227 if (all_ss || (!one_ss && !opts->none && !opts->name)) 1381 if (all_ss || (!one_ss && !opts->none && !opts->name))
1228 for_each_subsys(ss, i) 1382 for_each_subsys(ss, i)
1229 if (!ss->disabled) 1383 if (!ss->disabled)
1230 set_bit(i, &opts->subsys_mask); 1384 opts->subsys_mask |= (1 << i);
1231 1385
1232 /* 1386 /*
1233 * We either have to specify by name or by subsystems. (So 1387 * We either have to specify by name or by subsystems. (So
@@ -1258,14 +1412,13 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
1258 int ret = 0; 1412 int ret = 0;
1259 struct cgroup_root *root = cgroup_root_from_kf(kf_root); 1413 struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1260 struct cgroup_sb_opts opts; 1414 struct cgroup_sb_opts opts;
1261 unsigned long added_mask, removed_mask; 1415 unsigned int added_mask, removed_mask;
1262 1416
1263 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) { 1417 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) {
1264 pr_err("cgroup: sane_behavior: remount is not allowed\n"); 1418 pr_err("sane_behavior: remount is not allowed\n");
1265 return -EINVAL; 1419 return -EINVAL;
1266 } 1420 }
1267 1421
1268 mutex_lock(&cgroup_tree_mutex);
1269 mutex_lock(&cgroup_mutex); 1422 mutex_lock(&cgroup_mutex);
1270 1423
1271 /* See what subsystems are wanted */ 1424 /* See what subsystems are wanted */
@@ -1273,17 +1426,17 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
1273 if (ret) 1426 if (ret)
1274 goto out_unlock; 1427 goto out_unlock;
1275 1428
1276 if (opts.subsys_mask != root->cgrp.subsys_mask || opts.release_agent) 1429 if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
1277 pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", 1430 pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
1278 task_tgid_nr(current), current->comm); 1431 task_tgid_nr(current), current->comm);
1279 1432
1280 added_mask = opts.subsys_mask & ~root->cgrp.subsys_mask; 1433 added_mask = opts.subsys_mask & ~root->subsys_mask;
1281 removed_mask = root->cgrp.subsys_mask & ~opts.subsys_mask; 1434 removed_mask = root->subsys_mask & ~opts.subsys_mask;
1282 1435
1283 /* Don't allow flags or name to change at remount */ 1436 /* Don't allow flags or name to change at remount */
1284 if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) || 1437 if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) ||
1285 (opts.name && strcmp(opts.name, root->name))) { 1438 (opts.name && strcmp(opts.name, root->name))) {
1286 pr_err("cgroup: option or name mismatch, new: 0x%lx \"%s\", old: 0x%lx \"%s\"\n", 1439 pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
1287 opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "", 1440 opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "",
1288 root->flags & CGRP_ROOT_OPTION_MASK, root->name); 1441 root->flags & CGRP_ROOT_OPTION_MASK, root->name);
1289 ret = -EINVAL; 1442 ret = -EINVAL;
@@ -1291,7 +1444,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
1291 } 1444 }
1292 1445
1293 /* remounting is not allowed for populated hierarchies */ 1446 /* remounting is not allowed for populated hierarchies */
1294 if (!list_empty(&root->cgrp.children)) { 1447 if (!list_empty(&root->cgrp.self.children)) {
1295 ret = -EBUSY; 1448 ret = -EBUSY;
1296 goto out_unlock; 1449 goto out_unlock;
1297 } 1450 }
@@ -1311,7 +1464,6 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
1311 kfree(opts.release_agent); 1464 kfree(opts.release_agent);
1312 kfree(opts.name); 1465 kfree(opts.name);
1313 mutex_unlock(&cgroup_mutex); 1466 mutex_unlock(&cgroup_mutex);
1314 mutex_unlock(&cgroup_tree_mutex);
1315 return ret; 1467 return ret;
1316} 1468}
1317 1469
@@ -1369,14 +1521,22 @@ out_unlock:
1369 1521
1370static void init_cgroup_housekeeping(struct cgroup *cgrp) 1522static void init_cgroup_housekeeping(struct cgroup *cgrp)
1371{ 1523{
1372 atomic_set(&cgrp->refcnt, 1); 1524 struct cgroup_subsys *ss;
1373 INIT_LIST_HEAD(&cgrp->sibling); 1525 int ssid;
1374 INIT_LIST_HEAD(&cgrp->children); 1526
1527 INIT_LIST_HEAD(&cgrp->self.sibling);
1528 INIT_LIST_HEAD(&cgrp->self.children);
1375 INIT_LIST_HEAD(&cgrp->cset_links); 1529 INIT_LIST_HEAD(&cgrp->cset_links);
1376 INIT_LIST_HEAD(&cgrp->release_list); 1530 INIT_LIST_HEAD(&cgrp->release_list);
1377 INIT_LIST_HEAD(&cgrp->pidlists); 1531 INIT_LIST_HEAD(&cgrp->pidlists);
1378 mutex_init(&cgrp->pidlist_mutex); 1532 mutex_init(&cgrp->pidlist_mutex);
1379 cgrp->dummy_css.cgroup = cgrp; 1533 cgrp->self.cgroup = cgrp;
1534 cgrp->self.flags |= CSS_ONLINE;
1535
1536 for_each_subsys(ss, ssid)
1537 INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
1538
1539 init_waitqueue_head(&cgrp->offline_waitq);
1380} 1540}
1381 1541
1382static void init_cgroup_root(struct cgroup_root *root, 1542static void init_cgroup_root(struct cgroup_root *root,
@@ -1399,21 +1559,24 @@ static void init_cgroup_root(struct cgroup_root *root,
1399 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); 1559 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
1400} 1560}
1401 1561
1402static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) 1562static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)
1403{ 1563{
1404 LIST_HEAD(tmp_links); 1564 LIST_HEAD(tmp_links);
1405 struct cgroup *root_cgrp = &root->cgrp; 1565 struct cgroup *root_cgrp = &root->cgrp;
1406 struct css_set *cset; 1566 struct css_set *cset;
1407 int i, ret; 1567 int i, ret;
1408 1568
1409 lockdep_assert_held(&cgroup_tree_mutex);
1410 lockdep_assert_held(&cgroup_mutex); 1569 lockdep_assert_held(&cgroup_mutex);
1411 1570
1412 ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL); 1571 ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_NOWAIT);
1413 if (ret < 0) 1572 if (ret < 0)
1414 goto out; 1573 goto out;
1415 root_cgrp->id = ret; 1574 root_cgrp->id = ret;
1416 1575
1576 ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release);
1577 if (ret)
1578 goto out;
1579
1417 /* 1580 /*
1418 * We're accessing css_set_count without locking css_set_rwsem here, 1581 * We're accessing css_set_count without locking css_set_rwsem here,
1419 * but that's OK - it can only be increased by someone holding 1582 * but that's OK - it can only be increased by someone holding
@@ -1422,11 +1585,11 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
1422 */ 1585 */
1423 ret = allocate_cgrp_cset_links(css_set_count, &tmp_links); 1586 ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
1424 if (ret) 1587 if (ret)
1425 goto out; 1588 goto cancel_ref;
1426 1589
1427 ret = cgroup_init_root_id(root); 1590 ret = cgroup_init_root_id(root);
1428 if (ret) 1591 if (ret)
1429 goto out; 1592 goto cancel_ref;
1430 1593
1431 root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops, 1594 root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops,
1432 KERNFS_ROOT_CREATE_DEACTIVATED, 1595 KERNFS_ROOT_CREATE_DEACTIVATED,
@@ -1462,7 +1625,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
1462 link_css_set(&tmp_links, cset, root_cgrp); 1625 link_css_set(&tmp_links, cset, root_cgrp);
1463 up_write(&css_set_rwsem); 1626 up_write(&css_set_rwsem);
1464 1627
1465 BUG_ON(!list_empty(&root_cgrp->children)); 1628 BUG_ON(!list_empty(&root_cgrp->self.children));
1466 BUG_ON(atomic_read(&root->nr_cgrps) != 1); 1629 BUG_ON(atomic_read(&root->nr_cgrps) != 1);
1467 1630
1468 kernfs_activate(root_cgrp->kn); 1631 kernfs_activate(root_cgrp->kn);
@@ -1474,6 +1637,8 @@ destroy_root:
1474 root->kf_root = NULL; 1637 root->kf_root = NULL;
1475exit_root_id: 1638exit_root_id:
1476 cgroup_exit_root_id(root); 1639 cgroup_exit_root_id(root);
1640cancel_ref:
1641 percpu_ref_cancel_init(&root_cgrp->self.refcnt);
1477out: 1642out:
1478 free_cgrp_cset_links(&tmp_links); 1643 free_cgrp_cset_links(&tmp_links);
1479 return ret; 1644 return ret;
@@ -1495,8 +1660,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1495 */ 1660 */
1496 if (!use_task_css_set_links) 1661 if (!use_task_css_set_links)
1497 cgroup_enable_task_cg_lists(); 1662 cgroup_enable_task_cg_lists();
1498retry: 1663
1499 mutex_lock(&cgroup_tree_mutex);
1500 mutex_lock(&cgroup_mutex); 1664 mutex_lock(&cgroup_mutex);
1501 1665
1502 /* First find the desired set of subsystems */ 1666 /* First find the desired set of subsystems */
@@ -1535,7 +1699,7 @@ retry:
1535 * subsystems) then they must match. 1699 * subsystems) then they must match.
1536 */ 1700 */
1537 if ((opts.subsys_mask || opts.none) && 1701 if ((opts.subsys_mask || opts.none) &&
1538 (opts.subsys_mask != root->cgrp.subsys_mask)) { 1702 (opts.subsys_mask != root->subsys_mask)) {
1539 if (!name_match) 1703 if (!name_match)
1540 continue; 1704 continue;
1541 ret = -EBUSY; 1705 ret = -EBUSY;
@@ -1544,28 +1708,27 @@ retry:
1544 1708
1545 if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) { 1709 if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) {
1546 if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) { 1710 if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) {
1547 pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n"); 1711 pr_err("sane_behavior: new mount options should match the existing superblock\n");
1548 ret = -EINVAL; 1712 ret = -EINVAL;
1549 goto out_unlock; 1713 goto out_unlock;
1550 } else { 1714 } else {
1551 pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n"); 1715 pr_warn("new mount options do not match the existing superblock, will be ignored\n");
1552 } 1716 }
1553 } 1717 }
1554 1718
1555 /* 1719 /*
1556 * A root's lifetime is governed by its root cgroup. Zero 1720 * A root's lifetime is governed by its root cgroup.
1557	 * ref indicate that the root is being			1721	 * tryget_live failure indicates that the root is being
1558 * destruction to complete so that the subsystems are free. 1722 * destroyed. Wait for destruction to complete so that the
1559 * We can use wait_queue for the wait but this path is 1723 * subsystems are free. We can use wait_queue for the wait
1560 * super cold. Let's just sleep for a bit and retry. 1724 * but this path is super cold. Let's just sleep for a bit
1725 * and retry.
1561 */ 1726 */
1562 if (!atomic_inc_not_zero(&root->cgrp.refcnt)) { 1727 if (!percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
1563 mutex_unlock(&cgroup_mutex); 1728 mutex_unlock(&cgroup_mutex);
1564 mutex_unlock(&cgroup_tree_mutex);
1565 kfree(opts.release_agent);
1566 kfree(opts.name);
1567 msleep(10); 1729 msleep(10);
1568 goto retry; 1730 ret = restart_syscall();
1731 goto out_free;
1569 } 1732 }
1570 1733
1571 ret = 0; 1734 ret = 0;
@@ -1596,15 +1759,15 @@ retry:
1596 1759
1597out_unlock: 1760out_unlock:
1598 mutex_unlock(&cgroup_mutex); 1761 mutex_unlock(&cgroup_mutex);
1599 mutex_unlock(&cgroup_tree_mutex); 1762out_free:
1600
1601 kfree(opts.release_agent); 1763 kfree(opts.release_agent);
1602 kfree(opts.name); 1764 kfree(opts.name);
1603 1765
1604 if (ret) 1766 if (ret)
1605 return ERR_PTR(ret); 1767 return ERR_PTR(ret);
1606 1768
1607 dentry = kernfs_mount(fs_type, flags, root->kf_root, &new_sb); 1769 dentry = kernfs_mount(fs_type, flags, root->kf_root,
1770 CGROUP_SUPER_MAGIC, &new_sb);
1608 if (IS_ERR(dentry) || !new_sb) 1771 if (IS_ERR(dentry) || !new_sb)
1609 cgroup_put(&root->cgrp); 1772 cgroup_put(&root->cgrp);
1610 return dentry; 1773 return dentry;
@@ -1615,7 +1778,19 @@ static void cgroup_kill_sb(struct super_block *sb)
1615 struct kernfs_root *kf_root = kernfs_root_from_sb(sb); 1778 struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
1616 struct cgroup_root *root = cgroup_root_from_kf(kf_root); 1779 struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1617 1780
1618 cgroup_put(&root->cgrp); 1781 /*
1782 * If @root doesn't have any mounts or children, start killing it.
1783 * This prevents new mounts by disabling percpu_ref_tryget_live().
1784 * cgroup_mount() may wait for @root's release.
1785 *
1786 * And don't kill the default root.
1787 */
1788 if (css_has_online_children(&root->cgrp.self) ||
1789 root == &cgrp_dfl_root)
1790 cgroup_put(&root->cgrp);
1791 else
1792 percpu_ref_kill(&root->cgrp.self.refcnt);
1793
1619 kernfs_kill_sb(sb); 1794 kernfs_kill_sb(sb);
1620} 1795}
1621 1796
@@ -1737,7 +1912,7 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
1737 1912
1738/** 1913/**
1739 * cgroup_task_migrate - move a task from one cgroup to another. 1914 * cgroup_task_migrate - move a task from one cgroup to another.
1740 * @old_cgrp; the cgroup @tsk is being migrated from 1915 * @old_cgrp: the cgroup @tsk is being migrated from
1741 * @tsk: the task being migrated 1916 * @tsk: the task being migrated
1742 * @new_cset: the new css_set @tsk is being attached to 1917 * @new_cset: the new css_set @tsk is being attached to
1743 * 1918 *
@@ -1829,10 +2004,6 @@ static void cgroup_migrate_add_src(struct css_set *src_cset,
1829 2004
1830 src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); 2005 src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
1831 2006
1832 /* nothing to do if this cset already belongs to the cgroup */
1833 if (src_cgrp == dst_cgrp)
1834 return;
1835
1836 if (!list_empty(&src_cset->mg_preload_node)) 2007 if (!list_empty(&src_cset->mg_preload_node))
1837 return; 2008 return;
1838 2009
@@ -1847,13 +2018,14 @@ static void cgroup_migrate_add_src(struct css_set *src_cset,
1847 2018
1848/** 2019/**
1849 * cgroup_migrate_prepare_dst - prepare destination css_sets for migration 2020 * cgroup_migrate_prepare_dst - prepare destination css_sets for migration
1850 * @dst_cgrp: the destination cgroup 2021 * @dst_cgrp: the destination cgroup (may be %NULL)
1851 * @preloaded_csets: list of preloaded source css_sets 2022 * @preloaded_csets: list of preloaded source css_sets
1852 * 2023 *
1853 * Tasks are about to be moved to @dst_cgrp and all the source css_sets 2024 * Tasks are about to be moved to @dst_cgrp and all the source css_sets
1854 * have been preloaded to @preloaded_csets. This function looks up and 2025 * have been preloaded to @preloaded_csets. This function looks up and
1855	 * pins all destination css_sets, links each to its source, and put them on	2026	 * pins all destination css_sets, links each to its source, and appends them
1856 * @preloaded_csets. 2027 * to @preloaded_csets. If @dst_cgrp is %NULL, the destination of each
2028 * source css_set is assumed to be its cgroup on the default hierarchy.
1857 * 2029 *
1858 * This function must be called after cgroup_migrate_add_src() has been 2030 * This function must be called after cgroup_migrate_add_src() has been
1859 * called on each migration source css_set. After migration is performed 2031 * called on each migration source css_set. After migration is performed
@@ -1864,19 +2036,42 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
1864 struct list_head *preloaded_csets) 2036 struct list_head *preloaded_csets)
1865{ 2037{
1866 LIST_HEAD(csets); 2038 LIST_HEAD(csets);
1867 struct css_set *src_cset; 2039 struct css_set *src_cset, *tmp_cset;
1868 2040
1869 lockdep_assert_held(&cgroup_mutex); 2041 lockdep_assert_held(&cgroup_mutex);
1870 2042
2043 /*
2044 * Except for the root, child_subsys_mask must be zero for a cgroup
2045 * with tasks so that child cgroups don't compete against tasks.
2046 */
2047 if (dst_cgrp && cgroup_on_dfl(dst_cgrp) && cgroup_parent(dst_cgrp) &&
2048 dst_cgrp->child_subsys_mask)
2049 return -EBUSY;
2050
1871 /* look up the dst cset for each src cset and link it to src */ 2051 /* look up the dst cset for each src cset and link it to src */
1872 list_for_each_entry(src_cset, preloaded_csets, mg_preload_node) { 2052 list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) {
1873 struct css_set *dst_cset; 2053 struct css_set *dst_cset;
1874 2054
1875 dst_cset = find_css_set(src_cset, dst_cgrp); 2055 dst_cset = find_css_set(src_cset,
2056 dst_cgrp ?: src_cset->dfl_cgrp);
1876 if (!dst_cset) 2057 if (!dst_cset)
1877 goto err; 2058 goto err;
1878 2059
1879 WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset); 2060 WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);
2061
2062 /*
	2063		 * If src cset equals dst, it's a noop.  Drop the src.
2064 * cgroup_migrate() will skip the cset too. Note that we
2065 * can't handle src == dst as some nodes are used by both.
2066 */
2067 if (src_cset == dst_cset) {
2068 src_cset->mg_src_cgrp = NULL;
2069 list_del_init(&src_cset->mg_preload_node);
2070 put_css_set(src_cset, false);
2071 put_css_set(dst_cset, false);
2072 continue;
2073 }
2074
1880 src_cset->mg_dst_cset = dst_cset; 2075 src_cset->mg_dst_cset = dst_cset;
1881 2076
1882 if (list_empty(&dst_cset->mg_preload_node)) 2077 if (list_empty(&dst_cset->mg_preload_node))
@@ -1885,7 +2080,7 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
1885 put_css_set(dst_cset, false); 2080 put_css_set(dst_cset, false);
1886 } 2081 }
1887 2082
1888 list_splice(&csets, preloaded_csets); 2083 list_splice_tail(&csets, preloaded_csets);
1889 return 0; 2084 return 0;
1890err: 2085err:
1891 cgroup_migrate_finish(&csets); 2086 cgroup_migrate_finish(&csets);
@@ -1966,7 +2161,7 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,
1966 return 0; 2161 return 0;
1967 2162
1968 /* check that we can legitimately attach to the cgroup */ 2163 /* check that we can legitimately attach to the cgroup */
1969 for_each_css(css, i, cgrp) { 2164 for_each_e_css(css, i, cgrp) {
1970 if (css->ss->can_attach) { 2165 if (css->ss->can_attach) {
1971 ret = css->ss->can_attach(css, &tset); 2166 ret = css->ss->can_attach(css, &tset);
1972 if (ret) { 2167 if (ret) {
@@ -1996,7 +2191,7 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,
1996 */ 2191 */
1997 tset.csets = &tset.dst_csets; 2192 tset.csets = &tset.dst_csets;
1998 2193
1999 for_each_css(css, i, cgrp) 2194 for_each_e_css(css, i, cgrp)
2000 if (css->ss->attach) 2195 if (css->ss->attach)
2001 css->ss->attach(css, &tset); 2196 css->ss->attach(css, &tset);
2002 2197
@@ -2004,7 +2199,7 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,
2004 goto out_release_tset; 2199 goto out_release_tset;
2005 2200
2006out_cancel_attach: 2201out_cancel_attach:
2007 for_each_css(css, i, cgrp) { 2202 for_each_e_css(css, i, cgrp) {
2008 if (css == failed_css) 2203 if (css == failed_css)
2009 break; 2204 break;
2010 if (css->ss->cancel_attach) 2205 if (css->ss->cancel_attach)
@@ -2063,13 +2258,20 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
2063 * function to attach either it or all tasks in its threadgroup. Will lock 2258 * function to attach either it or all tasks in its threadgroup. Will lock
2064 * cgroup_mutex and threadgroup. 2259 * cgroup_mutex and threadgroup.
2065 */ 2260 */
2066static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) 2261static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
2262 size_t nbytes, loff_t off, bool threadgroup)
2067{ 2263{
2068 struct task_struct *tsk; 2264 struct task_struct *tsk;
2069 const struct cred *cred = current_cred(), *tcred; 2265 const struct cred *cred = current_cred(), *tcred;
2266 struct cgroup *cgrp;
2267 pid_t pid;
2070 int ret; 2268 int ret;
2071 2269
2072 if (!cgroup_lock_live_group(cgrp)) 2270 if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
2271 return -EINVAL;
2272
2273 cgrp = cgroup_kn_lock_live(of->kn);
2274 if (!cgrp)
2073 return -ENODEV; 2275 return -ENODEV;
2074 2276
2075retry_find_task: 2277retry_find_task:
@@ -2135,8 +2337,8 @@ retry_find_task:
2135 2337
2136 put_task_struct(tsk); 2338 put_task_struct(tsk);
2137out_unlock_cgroup: 2339out_unlock_cgroup:
2138 mutex_unlock(&cgroup_mutex); 2340 cgroup_kn_unlock(of->kn);
2139 return ret; 2341 return ret ?: nbytes;
2140} 2342}
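After this conversion, "cgroup.procs" (and "tasks" below) are backed by a raw ->write() callback that strips the buffer and parses one non-negative PID, so attaching a process from userspace is a single write(2). A minimal sketch; the mount point and group name are assumptions:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	char buf[32];
	int len, fd;

	/* assumes a hierarchy mounted at /sys/fs/cgroup/demo with a child group "grp" */
	fd = open("/sys/fs/cgroup/demo/grp/cgroup.procs", O_WRONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	len = snprintf(buf, sizeof(buf), "%d\n", getpid());
	if (write(fd, buf, len) != len)		/* the kernel returns nbytes on success */
		perror("write");

	close(fd);
	return 0;
}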
2141 2343
2142/** 2344/**
@@ -2170,43 +2372,44 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
2170} 2372}
2171EXPORT_SYMBOL_GPL(cgroup_attach_task_all); 2373EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
2172 2374
2173static int cgroup_tasks_write(struct cgroup_subsys_state *css, 2375static ssize_t cgroup_tasks_write(struct kernfs_open_file *of,
2174 struct cftype *cft, u64 pid) 2376 char *buf, size_t nbytes, loff_t off)
2175{ 2377{
2176 return attach_task_by_pid(css->cgroup, pid, false); 2378 return __cgroup_procs_write(of, buf, nbytes, off, false);
2177} 2379}
2178 2380
2179static int cgroup_procs_write(struct cgroup_subsys_state *css, 2381static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
2180 struct cftype *cft, u64 tgid) 2382 char *buf, size_t nbytes, loff_t off)
2181{ 2383{
2182 return attach_task_by_pid(css->cgroup, tgid, true); 2384 return __cgroup_procs_write(of, buf, nbytes, off, true);
2183} 2385}
2184 2386
2185static int cgroup_release_agent_write(struct cgroup_subsys_state *css, 2387static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
2186 struct cftype *cft, char *buffer) 2388 char *buf, size_t nbytes, loff_t off)
2187{ 2389{
2188 struct cgroup_root *root = css->cgroup->root; 2390 struct cgroup *cgrp;
2189 2391
2190 BUILD_BUG_ON(sizeof(root->release_agent_path) < PATH_MAX); 2392 BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
2191 if (!cgroup_lock_live_group(css->cgroup)) 2393
2394 cgrp = cgroup_kn_lock_live(of->kn);
2395 if (!cgrp)
2192 return -ENODEV; 2396 return -ENODEV;
2193 spin_lock(&release_agent_path_lock); 2397 spin_lock(&release_agent_path_lock);
2194 strlcpy(root->release_agent_path, buffer, 2398 strlcpy(cgrp->root->release_agent_path, strstrip(buf),
2195 sizeof(root->release_agent_path)); 2399 sizeof(cgrp->root->release_agent_path));
2196 spin_unlock(&release_agent_path_lock); 2400 spin_unlock(&release_agent_path_lock);
2197 mutex_unlock(&cgroup_mutex); 2401 cgroup_kn_unlock(of->kn);
2198 return 0; 2402 return nbytes;
2199} 2403}
2200 2404
2201static int cgroup_release_agent_show(struct seq_file *seq, void *v) 2405static int cgroup_release_agent_show(struct seq_file *seq, void *v)
2202{ 2406{
2203 struct cgroup *cgrp = seq_css(seq)->cgroup; 2407 struct cgroup *cgrp = seq_css(seq)->cgroup;
2204 2408
2205 if (!cgroup_lock_live_group(cgrp)) 2409 spin_lock(&release_agent_path_lock);
2206 return -ENODEV;
2207 seq_puts(seq, cgrp->root->release_agent_path); 2410 seq_puts(seq, cgrp->root->release_agent_path);
2411 spin_unlock(&release_agent_path_lock);
2208 seq_putc(seq, '\n'); 2412 seq_putc(seq, '\n');
2209 mutex_unlock(&cgroup_mutex);
2210 return 0; 2413 return 0;
2211} 2414}
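release_agent_path is now written and read under release_agent_path_lock alone, without cgroup_mutex. The configured agent itself is an ordinary userspace executable: when a notify_on_release cgroup becomes empty, the kernel runs it with the released cgroup's hierarchy-relative path as its only argument. A minimal sketch of such an agent (the log file name is arbitrary):

#include <stdio.h>

int main(int argc, char **argv)
{
	FILE *log;

	if (argc < 2)			/* argv[1] is the path of the emptied cgroup */
		return 1;

	log = fopen("/var/log/cgroup-release.log", "a");
	if (!log)
		return 1;

	fprintf(log, "released: %s\n", argv[1]);
	fclose(log);
	return 0;
}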
2212 2415
@@ -2218,6 +2421,320 @@ static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
2218 return 0; 2421 return 0;
2219} 2422}
2220 2423
2424static void cgroup_print_ss_mask(struct seq_file *seq, unsigned int ss_mask)
2425{
2426 struct cgroup_subsys *ss;
2427 bool printed = false;
2428 int ssid;
2429
2430 for_each_subsys(ss, ssid) {
2431 if (ss_mask & (1 << ssid)) {
2432 if (printed)
2433 seq_putc(seq, ' ');
2434 seq_printf(seq, "%s", ss->name);
2435 printed = true;
2436 }
2437 }
2438 if (printed)
2439 seq_putc(seq, '\n');
2440}
2441
2442/* show controllers which are currently attached to the default hierarchy */
2443static int cgroup_root_controllers_show(struct seq_file *seq, void *v)
2444{
2445 struct cgroup *cgrp = seq_css(seq)->cgroup;
2446
2447 cgroup_print_ss_mask(seq, cgrp->root->subsys_mask &
2448 ~cgrp_dfl_root_inhibit_ss_mask);
2449 return 0;
2450}
2451
2452/* show controllers which are enabled from the parent */
2453static int cgroup_controllers_show(struct seq_file *seq, void *v)
2454{
2455 struct cgroup *cgrp = seq_css(seq)->cgroup;
2456
2457 cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->child_subsys_mask);
2458 return 0;
2459}
2460
2461/* show controllers which are enabled for a given cgroup's children */
2462static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
2463{
2464 struct cgroup *cgrp = seq_css(seq)->cgroup;
2465
2466 cgroup_print_ss_mask(seq, cgrp->child_subsys_mask);
2467 return 0;
2468}
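All three show handlers above emit a single line of space-separated controller names through cgroup_print_ss_mask(). Parsing it from userspace is a matter of tokenizing one line; the mount path below is an assumption:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char buf[256], *tok;
	FILE *f = fopen("/sys/fs/cgroup/cgroup.controllers", "r");

	if (!f)
		return 1;
	if (!fgets(buf, sizeof(buf), f)) {	/* empty when no controller is attached */
		fclose(f);
		return 0;
	}
	fclose(f);

	buf[strcspn(buf, "\n")] = '\0';
	for (tok = strtok(buf, " "); tok; tok = strtok(NULL, " "))
		printf("controller: %s\n", tok);
	return 0;
}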
2469
2470/**
2471 * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy
2472 * @cgrp: root of the subtree to update csses for
2473 *
2474 * @cgrp's child_subsys_mask has changed and its subtree's (self excluded)
2475 * css associations need to be updated accordingly. This function looks up
2476 * all css_sets which are attached to the subtree, creates the matching
2477 * updated css_sets and migrates the tasks to the new ones.
2478 */
2479static int cgroup_update_dfl_csses(struct cgroup *cgrp)
2480{
2481 LIST_HEAD(preloaded_csets);
2482 struct cgroup_subsys_state *css;
2483 struct css_set *src_cset;
2484 int ret;
2485
2486 lockdep_assert_held(&cgroup_mutex);
2487
2488 /* look up all csses currently attached to @cgrp's subtree */
2489 down_read(&css_set_rwsem);
2490 css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) {
2491 struct cgrp_cset_link *link;
2492
2493 /* self is not affected by child_subsys_mask change */
2494 if (css->cgroup == cgrp)
2495 continue;
2496
2497 list_for_each_entry(link, &css->cgroup->cset_links, cset_link)
2498 cgroup_migrate_add_src(link->cset, cgrp,
2499 &preloaded_csets);
2500 }
2501 up_read(&css_set_rwsem);
2502
2503 /* NULL dst indicates self on default hierarchy */
2504 ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets);
2505 if (ret)
2506 goto out_finish;
2507
2508 list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) {
2509 struct task_struct *last_task = NULL, *task;
2510
2511 /* src_csets precede dst_csets, break on the first dst_cset */
2512 if (!src_cset->mg_src_cgrp)
2513 break;
2514
2515 /*
2516 * All tasks in src_cset need to be migrated to the
2517 * matching dst_cset. Empty it process by process. We
2518 * walk tasks but migrate processes. The leader might even
2519 * belong to a different cset but such src_cset would also
2520 * be among the target src_csets because the default
2521 * hierarchy enforces per-process membership.
2522 */
2523 while (true) {
2524 down_read(&css_set_rwsem);
2525 task = list_first_entry_or_null(&src_cset->tasks,
2526 struct task_struct, cg_list);
2527 if (task) {
2528 task = task->group_leader;
2529 WARN_ON_ONCE(!task_css_set(task)->mg_src_cgrp);
2530 get_task_struct(task);
2531 }
2532 up_read(&css_set_rwsem);
2533
2534 if (!task)
2535 break;
2536
2537 /* guard against possible infinite loop */
2538 if (WARN(last_task == task,
2539 "cgroup: update_dfl_csses failed to make progress, aborting in inconsistent state\n"))
2540 goto out_finish;
2541 last_task = task;
2542
2543 threadgroup_lock(task);
2544 /* raced against de_thread() from another thread? */
2545 if (!thread_group_leader(task)) {
2546 threadgroup_unlock(task);
2547 put_task_struct(task);
2548 continue;
2549 }
2550
2551 ret = cgroup_migrate(src_cset->dfl_cgrp, task, true);
2552
2553 threadgroup_unlock(task);
2554 put_task_struct(task);
2555
2556 if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret))
2557 goto out_finish;
2558 }
2559 }
2560
2561out_finish:
2562 cgroup_migrate_finish(&preloaded_csets);
2563 return ret;
2564}
2565
2566/* change the enabled child controllers for a cgroup in the default hierarchy */
2567static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
2568 char *buf, size_t nbytes,
2569 loff_t off)
2570{
2571 unsigned int enable = 0, disable = 0;
2572 struct cgroup *cgrp, *child;
2573 struct cgroup_subsys *ss;
2574 char *tok;
2575 int ssid, ret;
2576
2577 /*
2578 * Parse input - space separated list of subsystem names prefixed
2579 * with either + or -.
2580 */
2581 buf = strstrip(buf);
2582 while ((tok = strsep(&buf, " "))) {
2583 if (tok[0] == '\0')
2584 continue;
2585 for_each_subsys(ss, ssid) {
2586 if (ss->disabled || strcmp(tok + 1, ss->name) ||
2587 ((1 << ss->id) & cgrp_dfl_root_inhibit_ss_mask))
2588 continue;
2589
2590 if (*tok == '+') {
2591 enable |= 1 << ssid;
2592 disable &= ~(1 << ssid);
2593 } else if (*tok == '-') {
2594 disable |= 1 << ssid;
2595 enable &= ~(1 << ssid);
2596 } else {
2597 return -EINVAL;
2598 }
2599 break;
2600 }
2601 if (ssid == CGROUP_SUBSYS_COUNT)
2602 return -EINVAL;
2603 }
2604
2605 cgrp = cgroup_kn_lock_live(of->kn);
2606 if (!cgrp)
2607 return -ENODEV;
2608
2609 for_each_subsys(ss, ssid) {
2610 if (enable & (1 << ssid)) {
2611 if (cgrp->child_subsys_mask & (1 << ssid)) {
2612 enable &= ~(1 << ssid);
2613 continue;
2614 }
2615
2616 /*
2617 * Because css offlining is asynchronous, userland
2618 * might try to re-enable the same controller while
2619 * the previous instance is still around. In such
2620 * cases, wait till it's gone using offline_waitq.
2621 */
2622 cgroup_for_each_live_child(child, cgrp) {
2623 DEFINE_WAIT(wait);
2624
2625 if (!cgroup_css(child, ss))
2626 continue;
2627
2628 cgroup_get(child);
2629 prepare_to_wait(&child->offline_waitq, &wait,
2630 TASK_UNINTERRUPTIBLE);
2631 cgroup_kn_unlock(of->kn);
2632 schedule();
2633 finish_wait(&child->offline_waitq, &wait);
2634 cgroup_put(child);
2635
2636 return restart_syscall();
2637 }
2638
2639 /* unavailable or not enabled on the parent? */
2640 if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) ||
2641 (cgroup_parent(cgrp) &&
2642 !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ssid)))) {
2643 ret = -ENOENT;
2644 goto out_unlock;
2645 }
2646 } else if (disable & (1 << ssid)) {
2647 if (!(cgrp->child_subsys_mask & (1 << ssid))) {
2648 disable &= ~(1 << ssid);
2649 continue;
2650 }
2651
2652 /* a child has it enabled? */
2653 cgroup_for_each_live_child(child, cgrp) {
2654 if (child->child_subsys_mask & (1 << ssid)) {
2655 ret = -EBUSY;
2656 goto out_unlock;
2657 }
2658 }
2659 }
2660 }
2661
2662 if (!enable && !disable) {
2663 ret = 0;
2664 goto out_unlock;
2665 }
2666
2667 /*
2668 * Except for the root, child_subsys_mask must be zero for a cgroup
2669 * with tasks so that child cgroups don't compete against tasks.
2670 */
2671 if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) {
2672 ret = -EBUSY;
2673 goto out_unlock;
2674 }
2675
2676 /*
	2678	 * Create csses for the enabled controllers and update child_subsys_mask. This
2678 * changes cgroup_e_css() results which in turn makes the
2679 * subsequent cgroup_update_dfl_csses() associate all tasks in the
2680 * subtree to the updated csses.
2681 */
2682 for_each_subsys(ss, ssid) {
2683 if (!(enable & (1 << ssid)))
2684 continue;
2685
2686 cgroup_for_each_live_child(child, cgrp) {
2687 ret = create_css(child, ss);
2688 if (ret)
2689 goto err_undo_css;
2690 }
2691 }
2692
2693 cgrp->child_subsys_mask |= enable;
2694 cgrp->child_subsys_mask &= ~disable;
2695
2696 ret = cgroup_update_dfl_csses(cgrp);
2697 if (ret)
2698 goto err_undo_css;
2699
2700 /* all tasks are now migrated away from the old csses, kill them */
2701 for_each_subsys(ss, ssid) {
2702 if (!(disable & (1 << ssid)))
2703 continue;
2704
2705 cgroup_for_each_live_child(child, cgrp)
2706 kill_css(cgroup_css(child, ss));
2707 }
2708
2709 kernfs_activate(cgrp->kn);
2710 ret = 0;
2711out_unlock:
2712 cgroup_kn_unlock(of->kn);
2713 return ret ?: nbytes;
2714
2715err_undo_css:
2716 cgrp->child_subsys_mask &= ~enable;
2717 cgrp->child_subsys_mask |= disable;
2718
2719 for_each_subsys(ss, ssid) {
2720 if (!(enable & (1 << ssid)))
2721 continue;
2722
2723 cgroup_for_each_live_child(child, cgrp) {
2724 struct cgroup_subsys_state *css = cgroup_css(child, ss);
2725 if (css)
2726 kill_css(css);
2727 }
2728 }
2729 goto out_unlock;
2730}
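From userspace, enabling or disabling controllers for a cgroup's children is a single write of '+name' / '-name' tokens to cgroup.subtree_control, exactly as the parser above expects. A minimal sketch; the path and the choice of the memory controller are assumptions:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *req = "+memory\n";	/* "-memory" would disable it again */
	int fd = open("/sys/fs/cgroup/parent/cgroup.subtree_control", O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* EBUSY if this cgroup still has tasks, ENOENT if the parent hasn't enabled it */
	if (write(fd, req, strlen(req)) < 0)
		perror("write");

	close(fd);
	return 0;
}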
2731
2732static int cgroup_populated_show(struct seq_file *seq, void *v)
2733{
2734 seq_printf(seq, "%d\n", (bool)seq_css(seq)->cgroup->populated_cnt);
2735 return 0;
2736}
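cgroup.populated reports a single '0' or '1' depending on whether any cgroup in the subtree has live tasks; because the kernfs node is remembered in populated_kn (see cgroup_add_file below), changes can also be watched via poll/inotify once the notification side is wired up. A read-side sketch (path assumed):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char c = '0';
	int fd = open("/sys/fs/cgroup/parent/child/cgroup.populated", O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (read(fd, &c, 1) == 1)
		printf("subtree %s tasks\n", c == '1' ? "contains" : "has no");

	close(fd);
	return 0;
}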
2737
2221static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, 2738static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
2222 size_t nbytes, loff_t off) 2739 size_t nbytes, loff_t off)
2223{ 2740{
@@ -2226,6 +2743,9 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
2226 struct cgroup_subsys_state *css; 2743 struct cgroup_subsys_state *css;
2227 int ret; 2744 int ret;
2228 2745
2746 if (cft->write)
2747 return cft->write(of, buf, nbytes, off);
2748
2229 /* 2749 /*
2230 * kernfs guarantees that a file isn't deleted with operations in 2750 * kernfs guarantees that a file isn't deleted with operations in
2231 * flight, which means that the matching css is and stays alive and 2751 * flight, which means that the matching css is and stays alive and
@@ -2236,9 +2756,7 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
2236 css = cgroup_css(cgrp, cft->ss); 2756 css = cgroup_css(cgrp, cft->ss);
2237 rcu_read_unlock(); 2757 rcu_read_unlock();
2238 2758
2239 if (cft->write_string) { 2759 if (cft->write_u64) {
2240 ret = cft->write_string(css, cft, strstrip(buf));
2241 } else if (cft->write_u64) {
2242 unsigned long long v; 2760 unsigned long long v;
2243 ret = kstrtoull(buf, 0, &v); 2761 ret = kstrtoull(buf, 0, &v);
2244 if (!ret) 2762 if (!ret)
@@ -2248,8 +2766,6 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
2248 ret = kstrtoll(buf, 0, &v); 2766 ret = kstrtoll(buf, 0, &v);
2249 if (!ret) 2767 if (!ret)
2250 ret = cft->write_s64(css, cft, v); 2768 ret = cft->write_s64(css, cft, v);
2251 } else if (cft->trigger) {
2252 ret = cft->trigger(css, (unsigned int)cft->private);
2253 } else { 2769 } else {
2254 ret = -EINVAL; 2770 ret = -EINVAL;
2255 } 2771 }
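With ->write_string() and ->trigger() removed, a controller that needs raw buffer access now supplies ->write() with the kernfs-style prototype used above; the core no longer strips the buffer, so the handler does its own trimming. A hypothetical file definition (all names invented):

static ssize_t demo_command_write(struct kernfs_open_file *of, char *buf,
				  size_t nbytes, loff_t off)
{
	buf = strstrip(buf);		/* handler-side trimming, as in cgroup_procs_write() */
	pr_info("demo: got command '%s'\n", buf);
	return nbytes;			/* consume the whole write on success */
}

static struct cftype demo_files[] = {
	{
		.name = "demo.command",
		.write = demo_command_write,
	},
	{ }	/* terminate */
};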
@@ -2326,20 +2842,18 @@ static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
2326 return -EPERM; 2842 return -EPERM;
2327 2843
2328 /* 2844 /*
2329 * We're gonna grab cgroup_tree_mutex which nests outside kernfs 2845 * We're gonna grab cgroup_mutex which nests outside kernfs
2330 * active_ref. kernfs_rename() doesn't require active_ref 2846 * active_ref. kernfs_rename() doesn't require active_ref
2331 * protection. Break them before grabbing cgroup_tree_mutex. 2847 * protection. Break them before grabbing cgroup_mutex.
2332 */ 2848 */
2333 kernfs_break_active_protection(new_parent); 2849 kernfs_break_active_protection(new_parent);
2334 kernfs_break_active_protection(kn); 2850 kernfs_break_active_protection(kn);
2335 2851
2336 mutex_lock(&cgroup_tree_mutex);
2337 mutex_lock(&cgroup_mutex); 2852 mutex_lock(&cgroup_mutex);
2338 2853
2339 ret = kernfs_rename(kn, new_parent, new_name_str); 2854 ret = kernfs_rename(kn, new_parent, new_name_str);
2340 2855
2341 mutex_unlock(&cgroup_mutex); 2856 mutex_unlock(&cgroup_mutex);
2342 mutex_unlock(&cgroup_tree_mutex);
2343 2857
2344 kernfs_unbreak_active_protection(kn); 2858 kernfs_unbreak_active_protection(kn);
2345 kernfs_unbreak_active_protection(new_parent); 2859 kernfs_unbreak_active_protection(new_parent);
@@ -2377,9 +2891,14 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
2377 return PTR_ERR(kn); 2891 return PTR_ERR(kn);
2378 2892
2379 ret = cgroup_kn_set_ugid(kn); 2893 ret = cgroup_kn_set_ugid(kn);
2380 if (ret) 2894 if (ret) {
2381 kernfs_remove(kn); 2895 kernfs_remove(kn);
2382 return ret; 2896 return ret;
2897 }
2898
2899 if (cft->seq_show == cgroup_populated_show)
2900 cgrp->populated_kn = kn;
2901 return 0;
2383} 2902}
2384 2903
2385/** 2904/**
@@ -2399,7 +2918,7 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
2399 struct cftype *cft; 2918 struct cftype *cft;
2400 int ret; 2919 int ret;
2401 2920
2402 lockdep_assert_held(&cgroup_tree_mutex); 2921 lockdep_assert_held(&cgroup_mutex);
2403 2922
2404 for (cft = cfts; cft->name[0] != '\0'; cft++) { 2923 for (cft = cfts; cft->name[0] != '\0'; cft++) {
2405 /* does cft->flags tell us to skip this file on @cgrp? */ 2924 /* does cft->flags tell us to skip this file on @cgrp? */
@@ -2407,16 +2926,16 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
2407 continue; 2926 continue;
2408 if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp)) 2927 if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp))
2409 continue; 2928 continue;
2410 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) 2929 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp))
2411 continue; 2930 continue;
2412 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) 2931 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp))
2413 continue; 2932 continue;
2414 2933
2415 if (is_add) { 2934 if (is_add) {
2416 ret = cgroup_add_file(cgrp, cft); 2935 ret = cgroup_add_file(cgrp, cft);
2417 if (ret) { 2936 if (ret) {
2418 pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n", 2937 pr_warn("%s: failed to add %s, err=%d\n",
2419 cft->name, ret); 2938 __func__, cft->name, ret);
2420 return ret; 2939 return ret;
2421 } 2940 }
2422 } else { 2941 } else {
@@ -2434,11 +2953,7 @@ static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
2434 struct cgroup_subsys_state *css; 2953 struct cgroup_subsys_state *css;
2435 int ret = 0; 2954 int ret = 0;
2436 2955
2437 lockdep_assert_held(&cgroup_tree_mutex); 2956 lockdep_assert_held(&cgroup_mutex);
2438
2439 /* don't bother if @ss isn't attached */
2440 if (ss->root == &cgrp_dfl_root)
2441 return 0;
2442 2957
2443 /* add/rm files for all cgroups created before */ 2958 /* add/rm files for all cgroups created before */
2444 css_for_each_descendant_pre(css, cgroup_css(root, ss)) { 2959 css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
@@ -2506,7 +3021,7 @@ static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2506 3021
2507static int cgroup_rm_cftypes_locked(struct cftype *cfts) 3022static int cgroup_rm_cftypes_locked(struct cftype *cfts)
2508{ 3023{
2509 lockdep_assert_held(&cgroup_tree_mutex); 3024 lockdep_assert_held(&cgroup_mutex);
2510 3025
2511 if (!cfts || !cfts[0].ss) 3026 if (!cfts || !cfts[0].ss)
2512 return -ENOENT; 3027 return -ENOENT;
@@ -2532,9 +3047,9 @@ int cgroup_rm_cftypes(struct cftype *cfts)
2532{ 3047{
2533 int ret; 3048 int ret;
2534 3049
2535 mutex_lock(&cgroup_tree_mutex); 3050 mutex_lock(&cgroup_mutex);
2536 ret = cgroup_rm_cftypes_locked(cfts); 3051 ret = cgroup_rm_cftypes_locked(cfts);
2537 mutex_unlock(&cgroup_tree_mutex); 3052 mutex_unlock(&cgroup_mutex);
2538 return ret; 3053 return ret;
2539} 3054}
2540 3055
@@ -2556,6 +3071,9 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2556{ 3071{
2557 int ret; 3072 int ret;
2558 3073
3074 if (ss->disabled)
3075 return 0;
3076
2559 if (!cfts || cfts[0].name[0] == '\0') 3077 if (!cfts || cfts[0].name[0] == '\0')
2560 return 0; 3078 return 0;
2561 3079
@@ -2563,14 +3081,14 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2563 if (ret) 3081 if (ret)
2564 return ret; 3082 return ret;
2565 3083
2566 mutex_lock(&cgroup_tree_mutex); 3084 mutex_lock(&cgroup_mutex);
2567 3085
2568 list_add_tail(&cfts->node, &ss->cfts); 3086 list_add_tail(&cfts->node, &ss->cfts);
2569 ret = cgroup_apply_cftypes(cfts, true); 3087 ret = cgroup_apply_cftypes(cfts, true);
2570 if (ret) 3088 if (ret)
2571 cgroup_rm_cftypes_locked(cfts); 3089 cgroup_rm_cftypes_locked(cfts);
2572 3090
2573 mutex_unlock(&cgroup_tree_mutex); 3091 mutex_unlock(&cgroup_mutex);
2574 return ret; 3092 return ret;
2575} 3093}
2576 3094
@@ -2594,57 +3112,65 @@ static int cgroup_task_count(const struct cgroup *cgrp)
2594 3112
2595/** 3113/**
2596 * css_next_child - find the next child of a given css 3114 * css_next_child - find the next child of a given css
2597 * @pos_css: the current position (%NULL to initiate traversal) 3115 * @pos: the current position (%NULL to initiate traversal)
2598 * @parent_css: css whose children to walk 3116 * @parent: css whose children to walk
2599 * 3117 *
2600 * This function returns the next child of @parent_css and should be called 3118 * This function returns the next child of @parent and should be called
2601 * under either cgroup_mutex or RCU read lock. The only requirement is 3119 * under either cgroup_mutex or RCU read lock. The only requirement is
2602 * that @parent_css and @pos_css are accessible. The next sibling is 3120 * that @parent and @pos are accessible. The next sibling is guaranteed to
2603 * guaranteed to be returned regardless of their states. 3121 * be returned regardless of their states.
3122 *
3123 * If a subsystem synchronizes ->css_online() and the start of iteration, a
3124 * css which finished ->css_online() is guaranteed to be visible in the
3125 * future iterations and will stay visible until the last reference is put.
3126 * A css which hasn't finished ->css_online() or already finished
3127 * ->css_offline() may show up during traversal. It's each subsystem's
3128 * responsibility to synchronize against on/offlining.
2604 */ 3129 */
2605struct cgroup_subsys_state * 3130struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
2606css_next_child(struct cgroup_subsys_state *pos_css, 3131 struct cgroup_subsys_state *parent)
2607 struct cgroup_subsys_state *parent_css)
2608{ 3132{
2609 struct cgroup *pos = pos_css ? pos_css->cgroup : NULL; 3133 struct cgroup_subsys_state *next;
2610 struct cgroup *cgrp = parent_css->cgroup;
2611 struct cgroup *next;
2612 3134
2613 cgroup_assert_mutexes_or_rcu_locked(); 3135 cgroup_assert_mutex_or_rcu_locked();
2614 3136
2615 /* 3137 /*
2616 * @pos could already have been removed. Once a cgroup is removed, 3138 * @pos could already have been unlinked from the sibling list.
2617 * its ->sibling.next is no longer updated when its next sibling 3139 * Once a cgroup is removed, its ->sibling.next is no longer
2618 * changes. As CGRP_DEAD assertion is serialized and happens 3140 * updated when its next sibling changes. CSS_RELEASED is set when
2619 * before the cgroup is taken off the ->sibling list, if we see it 3141 * @pos is taken off list, at which time its next pointer is valid,
2620 * unasserted, it's guaranteed that the next sibling hasn't 3142 * and, as releases are serialized, the one pointed to by the next
2621 * finished its grace period even if it's already removed, and thus 3143 * pointer is guaranteed to not have started release yet. This
2622 * safe to dereference from this RCU critical section. If 3144 * implies that if we observe !CSS_RELEASED on @pos in this RCU
2623 * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed 3145 * critical section, the one pointed to by its next pointer is
2624 * to be visible as %true here. 3146 * guaranteed to not have finished its RCU grace period even if we
	3147	 * have dropped rcu_read_lock() in between iterations.
2625 * 3148 *
2626 * If @pos is dead, its next pointer can't be dereferenced; 3149 * If @pos has CSS_RELEASED set, its next pointer can't be
2627 * however, as each cgroup is given a monotonically increasing 3150 * dereferenced; however, as each css is given a monotonically
2628 * unique serial number and always appended to the sibling list, 3151 * increasing unique serial number and always appended to the
2629 * the next one can be found by walking the parent's children until 3152 * sibling list, the next one can be found by walking the parent's
2630 * we see a cgroup with higher serial number than @pos's. While 3153 * children until the first css with higher serial number than
2631 * this path can be slower, it's taken only when either the current 3154 * @pos's. While this path can be slower, it happens iff iteration
2632 * cgroup is removed or iteration and removal race. 3155 * races against release and the race window is very small.
2633 */ 3156 */
2634 if (!pos) { 3157 if (!pos) {
2635 next = list_entry_rcu(cgrp->children.next, struct cgroup, sibling); 3158 next = list_entry_rcu(parent->children.next, struct cgroup_subsys_state, sibling);
2636 } else if (likely(!cgroup_is_dead(pos))) { 3159 } else if (likely(!(pos->flags & CSS_RELEASED))) {
2637 next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); 3160 next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling);
2638 } else { 3161 } else {
2639 list_for_each_entry_rcu(next, &cgrp->children, sibling) 3162 list_for_each_entry_rcu(next, &parent->children, sibling)
2640 if (next->serial_nr > pos->serial_nr) 3163 if (next->serial_nr > pos->serial_nr)
2641 break; 3164 break;
2642 } 3165 }
2643 3166
2644 if (&next->sibling == &cgrp->children) 3167 /*
2645 return NULL; 3168 * @next, if not pointing to the head, can be dereferenced and is
2646 3169 * the next sibling.
2647 return cgroup_css(next, parent_css->ss); 3170 */
3171 if (&next->sibling != &parent->children)
3172 return next;
3173 return NULL;
2648} 3174}
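css_next_child() is normally consumed through the css_for_each_child() wrapper. A sketch of an iteration under the RCU read lock, per the locking rules in the comment above (the counting purpose is illustrative only):

#include <linux/cgroup.h>
#include <linux/rcupdate.h>

static int demo_count_children(struct cgroup_subsys_state *parent)
{
	struct cgroup_subsys_state *child;
	int n = 0;

	rcu_read_lock();
	/* csses that are mid-online/offline may show up; callers must tolerate them */
	css_for_each_child(child, parent)
		n++;
	rcu_read_unlock();

	return n;
}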
2649 3175
2650/** 3176/**
@@ -2660,6 +3186,13 @@ css_next_child(struct cgroup_subsys_state *pos_css,
2660 * doesn't require the whole traversal to be contained in a single critical 3186 * doesn't require the whole traversal to be contained in a single critical
2661 * section. This function will return the correct next descendant as long 3187 * section. This function will return the correct next descendant as long
2662 * as both @pos and @root are accessible and @pos is a descendant of @root. 3188 * as both @pos and @root are accessible and @pos is a descendant of @root.
3189 *
3190 * If a subsystem synchronizes ->css_online() and the start of iteration, a
3191 * css which finished ->css_online() is guaranteed to be visible in the
3192 * future iterations and will stay visible until the last reference is put.
3193 * A css which hasn't finished ->css_online() or already finished
3194 * ->css_offline() may show up during traversal. It's each subsystem's
3195 * responsibility to synchronize against on/offlining.
2663 */ 3196 */
2664struct cgroup_subsys_state * 3197struct cgroup_subsys_state *
2665css_next_descendant_pre(struct cgroup_subsys_state *pos, 3198css_next_descendant_pre(struct cgroup_subsys_state *pos,
@@ -2667,7 +3200,7 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos,
2667{ 3200{
2668 struct cgroup_subsys_state *next; 3201 struct cgroup_subsys_state *next;
2669 3202
2670 cgroup_assert_mutexes_or_rcu_locked(); 3203 cgroup_assert_mutex_or_rcu_locked();
2671 3204
2672 /* if first iteration, visit @root */ 3205 /* if first iteration, visit @root */
2673 if (!pos) 3206 if (!pos)
@@ -2680,10 +3213,10 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos,
2680 3213
2681 /* no child, visit my or the closest ancestor's next sibling */ 3214 /* no child, visit my or the closest ancestor's next sibling */
2682 while (pos != root) { 3215 while (pos != root) {
2683 next = css_next_child(pos, css_parent(pos)); 3216 next = css_next_child(pos, pos->parent);
2684 if (next) 3217 if (next)
2685 return next; 3218 return next;
2686 pos = css_parent(pos); 3219 pos = pos->parent;
2687 } 3220 }
2688 3221
2689 return NULL; 3222 return NULL;
@@ -2707,7 +3240,7 @@ css_rightmost_descendant(struct cgroup_subsys_state *pos)
2707{ 3240{
2708 struct cgroup_subsys_state *last, *tmp; 3241 struct cgroup_subsys_state *last, *tmp;
2709 3242
2710 cgroup_assert_mutexes_or_rcu_locked(); 3243 cgroup_assert_mutex_or_rcu_locked();
2711 3244
2712 do { 3245 do {
2713 last = pos; 3246 last = pos;
@@ -2747,6 +3280,13 @@ css_leftmost_descendant(struct cgroup_subsys_state *pos)
2747 * section. This function will return the correct next descendant as long 3280 * section. This function will return the correct next descendant as long
2748 * as both @pos and @cgroup are accessible and @pos is a descendant of 3281 * as both @pos and @cgroup are accessible and @pos is a descendant of
2749 * @cgroup. 3282 * @cgroup.
3283 *
3284 * If a subsystem synchronizes ->css_online() and the start of iteration, a
3285 * css which finished ->css_online() is guaranteed to be visible in the
3286 * future iterations and will stay visible until the last reference is put.
3287 * A css which hasn't finished ->css_online() or already finished
3288 * ->css_offline() may show up during traversal. It's each subsystem's
3289 * responsibility to synchronize against on/offlining.
2750 */ 3290 */
2751struct cgroup_subsys_state * 3291struct cgroup_subsys_state *
2752css_next_descendant_post(struct cgroup_subsys_state *pos, 3292css_next_descendant_post(struct cgroup_subsys_state *pos,
@@ -2754,7 +3294,7 @@ css_next_descendant_post(struct cgroup_subsys_state *pos,
2754{ 3294{
2755 struct cgroup_subsys_state *next; 3295 struct cgroup_subsys_state *next;
2756 3296
2757 cgroup_assert_mutexes_or_rcu_locked(); 3297 cgroup_assert_mutex_or_rcu_locked();
2758 3298
2759 /* if first iteration, visit leftmost descendant which may be @root */ 3299 /* if first iteration, visit leftmost descendant which may be @root */
2760 if (!pos) 3300 if (!pos)
@@ -2765,12 +3305,36 @@ css_next_descendant_post(struct cgroup_subsys_state *pos,
2765 return NULL; 3305 return NULL;
2766 3306
2767 /* if there's an unvisited sibling, visit its leftmost descendant */ 3307 /* if there's an unvisited sibling, visit its leftmost descendant */
2768 next = css_next_child(pos, css_parent(pos)); 3308 next = css_next_child(pos, pos->parent);
2769 if (next) 3309 if (next)
2770 return css_leftmost_descendant(next); 3310 return css_leftmost_descendant(next);
2771 3311
2772 /* no sibling left, visit parent */ 3312 /* no sibling left, visit parent */
2773 return css_parent(pos); 3313 return pos->parent;
3314}
3315
3316/**
3317 * css_has_online_children - does a css have online children
3318 * @css: the target css
3319 *
3320 * Returns %true if @css has any online children; otherwise, %false. This
3321 * function can be called from any context but the caller is responsible
3322 * for synchronizing against on/offlining as necessary.
3323 */
3324bool css_has_online_children(struct cgroup_subsys_state *css)
3325{
3326 struct cgroup_subsys_state *child;
3327 bool ret = false;
3328
3329 rcu_read_lock();
3330 css_for_each_child(child, css) {
	3331		if (child->flags & CSS_ONLINE) {
3332 ret = true;
3333 break;
3334 }
3335 }
3336 rcu_read_unlock();
3337 return ret;
2774} 3338}
2775 3339
2776/** 3340/**
@@ -2781,27 +3345,36 @@ css_next_descendant_post(struct cgroup_subsys_state *pos,
2781 */ 3345 */
2782static void css_advance_task_iter(struct css_task_iter *it) 3346static void css_advance_task_iter(struct css_task_iter *it)
2783{ 3347{
2784 struct list_head *l = it->cset_link; 3348 struct list_head *l = it->cset_pos;
2785 struct cgrp_cset_link *link; 3349 struct cgrp_cset_link *link;
2786 struct css_set *cset; 3350 struct css_set *cset;
2787 3351
2788 /* Advance to the next non-empty css_set */ 3352 /* Advance to the next non-empty css_set */
2789 do { 3353 do {
2790 l = l->next; 3354 l = l->next;
2791 if (l == &it->origin_css->cgroup->cset_links) { 3355 if (l == it->cset_head) {
2792 it->cset_link = NULL; 3356 it->cset_pos = NULL;
2793 return; 3357 return;
2794 } 3358 }
2795 link = list_entry(l, struct cgrp_cset_link, cset_link); 3359
2796 cset = link->cset; 3360 if (it->ss) {
3361 cset = container_of(l, struct css_set,
3362 e_cset_node[it->ss->id]);
3363 } else {
3364 link = list_entry(l, struct cgrp_cset_link, cset_link);
3365 cset = link->cset;
3366 }
2797 } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks)); 3367 } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks));
2798 3368
2799 it->cset_link = l; 3369 it->cset_pos = l;
2800 3370
2801 if (!list_empty(&cset->tasks)) 3371 if (!list_empty(&cset->tasks))
2802 it->task = cset->tasks.next; 3372 it->task_pos = cset->tasks.next;
2803 else 3373 else
2804 it->task = cset->mg_tasks.next; 3374 it->task_pos = cset->mg_tasks.next;
3375
3376 it->tasks_head = &cset->tasks;
3377 it->mg_tasks_head = &cset->mg_tasks;
2805} 3378}
2806 3379
2807/** 3380/**
@@ -2827,8 +3400,14 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
2827 3400
2828 down_read(&css_set_rwsem); 3401 down_read(&css_set_rwsem);
2829 3402
2830 it->origin_css = css; 3403 it->ss = css->ss;
2831 it->cset_link = &css->cgroup->cset_links; 3404
3405 if (it->ss)
3406 it->cset_pos = &css->cgroup->e_csets[css->ss->id];
3407 else
3408 it->cset_pos = &css->cgroup->cset_links;
3409
3410 it->cset_head = it->cset_pos;
2832 3411
2833 css_advance_task_iter(it); 3412 css_advance_task_iter(it);
2834} 3413}
@@ -2844,12 +3423,10 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
2844struct task_struct *css_task_iter_next(struct css_task_iter *it) 3423struct task_struct *css_task_iter_next(struct css_task_iter *it)
2845{ 3424{
2846 struct task_struct *res; 3425 struct task_struct *res;
2847 struct list_head *l = it->task; 3426 struct list_head *l = it->task_pos;
2848 struct cgrp_cset_link *link = list_entry(it->cset_link,
2849 struct cgrp_cset_link, cset_link);
2850 3427
2851 /* If the iterator cg is NULL, we have no tasks */ 3428 /* If the iterator cg is NULL, we have no tasks */
2852 if (!it->cset_link) 3429 if (!it->cset_pos)
2853 return NULL; 3430 return NULL;
2854 res = list_entry(l, struct task_struct, cg_list); 3431 res = list_entry(l, struct task_struct, cg_list);
2855 3432
@@ -2860,13 +3437,13 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
2860 */ 3437 */
2861 l = l->next; 3438 l = l->next;
2862 3439
2863 if (l == &link->cset->tasks) 3440 if (l == it->tasks_head)
2864 l = link->cset->mg_tasks.next; 3441 l = it->mg_tasks_head->next;
2865 3442
2866 if (l == &link->cset->mg_tasks) 3443 if (l == it->mg_tasks_head)
2867 css_advance_task_iter(it); 3444 css_advance_task_iter(it);
2868 else 3445 else
2869 it->task = l; 3446 it->task_pos = l;
2870 3447
2871 return res; 3448 return res;
2872} 3449}
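The reworked iterator is used in the usual start/next/end sequence; css_task_iter_end() (not part of this hunk) drops css_set_rwsem when the walk is done. A sketch that counts the tasks reachable from a css (the counting itself is illustrative):

#include <linux/cgroup.h>
#include <linux/sched.h>

static int demo_count_css_tasks(struct cgroup_subsys_state *css)
{
	struct css_task_iter it;
	struct task_struct *task;
	int n = 0;

	css_task_iter_start(css, &it);
	while ((task = css_task_iter_next(&it)))
		n++;
	css_task_iter_end(&it);

	return n;
}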
@@ -2919,7 +3496,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
2919 * ->can_attach() fails. 3496 * ->can_attach() fails.
2920 */ 3497 */
2921 do { 3498 do {
2922 css_task_iter_start(&from->dummy_css, &it); 3499 css_task_iter_start(&from->self, &it);
2923 task = css_task_iter_next(&it); 3500 task = css_task_iter_next(&it);
2924 if (task) 3501 if (task)
2925 get_task_struct(task); 3502 get_task_struct(task);
@@ -3184,7 +3761,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3184 if (!array) 3761 if (!array)
3185 return -ENOMEM; 3762 return -ENOMEM;
3186 /* now, populate the array */ 3763 /* now, populate the array */
3187 css_task_iter_start(&cgrp->dummy_css, &it); 3764 css_task_iter_start(&cgrp->self, &it);
3188 while ((tsk = css_task_iter_next(&it))) { 3765 while ((tsk = css_task_iter_next(&it))) {
3189 if (unlikely(n == length)) 3766 if (unlikely(n == length))
3190 break; 3767 break;
@@ -3246,7 +3823,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3246 3823
3247 /* 3824 /*
3248 * We aren't being called from kernfs and there's no guarantee on 3825 * We aren't being called from kernfs and there's no guarantee on
3249 * @kn->priv's validity. For this and css_tryget_from_dir(), 3826 * @kn->priv's validity. For this and css_tryget_online_from_dir(),
3250 * @kn->priv is RCU safe. Let's do the RCU dancing. 3827 * @kn->priv is RCU safe. Let's do the RCU dancing.
3251 */ 3828 */
3252 rcu_read_lock(); 3829 rcu_read_lock();
@@ -3258,7 +3835,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3258 } 3835 }
3259 rcu_read_unlock(); 3836 rcu_read_unlock();
3260 3837
3261 css_task_iter_start(&cgrp->dummy_css, &it); 3838 css_task_iter_start(&cgrp->self, &it);
3262 while ((tsk = css_task_iter_next(&it))) { 3839 while ((tsk = css_task_iter_next(&it))) {
3263 switch (tsk->state) { 3840 switch (tsk->state) {
3264 case TASK_RUNNING: 3841 case TASK_RUNNING:
@@ -3388,17 +3965,6 @@ static int cgroup_pidlist_show(struct seq_file *s, void *v)
3388 return seq_printf(s, "%d\n", *(int *)v); 3965 return seq_printf(s, "%d\n", *(int *)v);
3389} 3966}
3390 3967
3391/*
3392 * seq_operations functions for iterating on pidlists through seq_file -
3393 * independent of whether it's tasks or procs
3394 */
3395static const struct seq_operations cgroup_pidlist_seq_operations = {
3396 .start = cgroup_pidlist_start,
3397 .stop = cgroup_pidlist_stop,
3398 .next = cgroup_pidlist_next,
3399 .show = cgroup_pidlist_show,
3400};
3401
3402static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, 3968static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
3403 struct cftype *cft) 3969 struct cftype *cft)
3404{ 3970{
@@ -3440,7 +4006,7 @@ static struct cftype cgroup_base_files[] = {
3440 .seq_stop = cgroup_pidlist_stop, 4006 .seq_stop = cgroup_pidlist_stop,
3441 .seq_show = cgroup_pidlist_show, 4007 .seq_show = cgroup_pidlist_show,
3442 .private = CGROUP_FILE_PROCS, 4008 .private = CGROUP_FILE_PROCS,
3443 .write_u64 = cgroup_procs_write, 4009 .write = cgroup_procs_write,
3444 .mode = S_IRUGO | S_IWUSR, 4010 .mode = S_IRUGO | S_IWUSR,
3445 }, 4011 },
3446 { 4012 {
@@ -3454,6 +4020,27 @@ static struct cftype cgroup_base_files[] = {
3454 .flags = CFTYPE_ONLY_ON_ROOT, 4020 .flags = CFTYPE_ONLY_ON_ROOT,
3455 .seq_show = cgroup_sane_behavior_show, 4021 .seq_show = cgroup_sane_behavior_show,
3456 }, 4022 },
4023 {
4024 .name = "cgroup.controllers",
4025 .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_ONLY_ON_ROOT,
4026 .seq_show = cgroup_root_controllers_show,
4027 },
4028 {
4029 .name = "cgroup.controllers",
4030 .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT,
4031 .seq_show = cgroup_controllers_show,
4032 },
4033 {
4034 .name = "cgroup.subtree_control",
4035 .flags = CFTYPE_ONLY_ON_DFL,
4036 .seq_show = cgroup_subtree_control_show,
4037 .write = cgroup_subtree_control_write,
4038 },
4039 {
4040 .name = "cgroup.populated",
4041 .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT,
4042 .seq_show = cgroup_populated_show,
4043 },
3457 4044
3458 /* 4045 /*
3459 * Historical crazy stuff. These don't have "cgroup." prefix and 4046 * Historical crazy stuff. These don't have "cgroup." prefix and
@@ -3468,7 +4055,7 @@ static struct cftype cgroup_base_files[] = {
3468 .seq_stop = cgroup_pidlist_stop, 4055 .seq_stop = cgroup_pidlist_stop,
3469 .seq_show = cgroup_pidlist_show, 4056 .seq_show = cgroup_pidlist_show,
3470 .private = CGROUP_FILE_TASKS, 4057 .private = CGROUP_FILE_TASKS,
3471 .write_u64 = cgroup_tasks_write, 4058 .write = cgroup_tasks_write,
3472 .mode = S_IRUGO | S_IWUSR, 4059 .mode = S_IRUGO | S_IWUSR,
3473 }, 4060 },
3474 { 4061 {
@@ -3481,7 +4068,7 @@ static struct cftype cgroup_base_files[] = {
3481 .name = "release_agent", 4068 .name = "release_agent",
3482 .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, 4069 .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT,
3483 .seq_show = cgroup_release_agent_show, 4070 .seq_show = cgroup_release_agent_show,
3484 .write_string = cgroup_release_agent_write, 4071 .write = cgroup_release_agent_write,
3485 .max_write_len = PATH_MAX - 1, 4072 .max_write_len = PATH_MAX - 1,
3486 }, 4073 },
3487 { } /* terminate */ 4074 { } /* terminate */
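
The table above also moves the interface files from the typed .write_u64/.write_string callbacks to the single kernfs-backed .write method. A hypothetical controller file using the new prototype could look like the sketch below; everything named example_* is made up, but the signature and return convention match the freezer_write() conversion later in this diff:

static ssize_t example_write(struct kernfs_open_file *of,
			     char *buf, size_t nbytes, loff_t off)
{
	struct cgroup_subsys_state *css = of_css(of);
	bool enable;

	buf = strstrip(buf);
	if (!strcmp(buf, "1"))
		enable = true;
	else if (!strcmp(buf, "0"))
		enable = false;
	else
		return -EINVAL;

	pr_debug("example: css %p <- %d\n", css, enable);
	/* a real controller would update its state here, under its own lock */

	return nbytes;	/* bytes consumed on success, -errno on failure */
}

static struct cftype example_files[] = {
	{
		.name = "example.enable",
		.flags = CFTYPE_NOT_ON_ROOT,
		.write = example_write,
	},
	{ }	/* terminate */
};
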
@@ -3494,7 +4081,7 @@ static struct cftype cgroup_base_files[] = {
3494 * 4081 *
3495 * On failure, no file is added. 4082 * On failure, no file is added.
3496 */ 4083 */
3497static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) 4084static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask)
3498{ 4085{
3499 struct cgroup_subsys *ss; 4086 struct cgroup_subsys *ss;
3500 int i, ret = 0; 4087 int i, ret = 0;
@@ -3503,7 +4090,7 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
3503 for_each_subsys(ss, i) { 4090 for_each_subsys(ss, i) {
3504 struct cftype *cfts; 4091 struct cftype *cfts;
3505 4092
3506 if (!test_bit(i, &subsys_mask)) 4093 if (!(subsys_mask & (1 << i)))
3507 continue; 4094 continue;
3508 4095
3509 list_for_each_entry(cfts, &ss->cfts, node) { 4096 list_for_each_entry(cfts, &ss->cfts, node) {
@@ -3525,9 +4112,9 @@ err:
3525 * Implemented in kill_css(). 4112 * Implemented in kill_css().
3526 * 4113 *
3527 * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs 4114 * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs
3528 * and thus css_tryget() is guaranteed to fail, the css can be offlined 4115 * and thus css_tryget_online() is guaranteed to fail, the css can be
3529 * by invoking offline_css(). After offlining, the base ref is put. 4116 * offlined by invoking offline_css(). After offlining, the base ref is
3530 * Implemented in css_killed_work_fn(). 4117 * put. Implemented in css_killed_work_fn().
3531 * 4118 *
3532 * 3. When the percpu_ref reaches zero, the only possible remaining 4119 * 3. When the percpu_ref reaches zero, the only possible remaining
3533 * accessors are inside RCU read sections. css_release() schedules the 4120 * accessors are inside RCU read sections. css_release() schedules the
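
The three steps spelled out in the comment above are the stock percpu_ref lifecycle; only the css/cgroup specifics are new. A condensed sketch of the same pattern, using the two-argument percpu_ref_init() this kernel provides (struct obj and its callbacks are illustrative, not part of the patch; the usual percpu-refcount.h/slab.h includes are assumed):

struct obj {
	struct percpu_ref ref;
	struct rcu_head rcu_head;
};

static void obj_free_rcu_fn(struct rcu_head *head)
{
	kfree(container_of(head, struct obj, rcu_head));
}

/* runs when the refcount finally hits zero (step 3) */
static void obj_release_fn(struct percpu_ref *ref)
{
	struct obj *o = container_of(ref, struct obj, ref);

	call_rcu(&o->rcu_head, obj_free_rcu_fn);
}

/* runs once the kill is visible on all CPUs and tryget is guaranteed to fail (step 2) */
static void obj_confirm_kill_fn(struct percpu_ref *ref)
{
	/* the css equivalent queues css_killed_work_fn() to do offline_css() here */
}

static int obj_init(struct obj *o)
{
	return percpu_ref_init(&o->ref, obj_release_fn);
}

static void obj_destroy(struct obj *o)
{
	/* step 1: reject new trygets and arrange the confirmation callback */
	percpu_ref_kill_and_confirm(&o->ref, obj_confirm_kill_fn);
}
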
@@ -3546,11 +4133,37 @@ static void css_free_work_fn(struct work_struct *work)
3546 container_of(work, struct cgroup_subsys_state, destroy_work); 4133 container_of(work, struct cgroup_subsys_state, destroy_work);
3547 struct cgroup *cgrp = css->cgroup; 4134 struct cgroup *cgrp = css->cgroup;
3548 4135
3549 if (css->parent) 4136 if (css->ss) {
3550 css_put(css->parent); 4137 /* css free path */
4138 if (css->parent)
4139 css_put(css->parent);
3551 4140
3552 css->ss->css_free(css); 4141 css->ss->css_free(css);
3553 cgroup_put(cgrp); 4142 cgroup_put(cgrp);
4143 } else {
4144 /* cgroup free path */
4145 atomic_dec(&cgrp->root->nr_cgrps);
4146 cgroup_pidlist_destroy_all(cgrp);
4147
4148 if (cgroup_parent(cgrp)) {
4149 /*
4150 * We get a ref to the parent, and put the ref when
4151 * this cgroup is being freed, so it's guaranteed
4152 * that the parent won't be destroyed before its
4153 * children.
4154 */
4155 cgroup_put(cgroup_parent(cgrp));
4156 kernfs_put(cgrp->kn);
4157 kfree(cgrp);
4158 } else {
4159 /*
4160 * This is root cgroup's refcnt reaching zero,
4161 * which indicates that the root should be
4162 * released.
4163 */
4164 cgroup_destroy_root(cgrp->root);
4165 }
4166 }
3554} 4167}
3555 4168
3556static void css_free_rcu_fn(struct rcu_head *rcu_head) 4169static void css_free_rcu_fn(struct rcu_head *rcu_head)
@@ -3562,26 +4175,59 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head)
3562 queue_work(cgroup_destroy_wq, &css->destroy_work); 4175 queue_work(cgroup_destroy_wq, &css->destroy_work);
3563} 4176}
3564 4177
4178static void css_release_work_fn(struct work_struct *work)
4179{
4180 struct cgroup_subsys_state *css =
4181 container_of(work, struct cgroup_subsys_state, destroy_work);
4182 struct cgroup_subsys *ss = css->ss;
4183 struct cgroup *cgrp = css->cgroup;
4184
4185 mutex_lock(&cgroup_mutex);
4186
4187 css->flags |= CSS_RELEASED;
4188 list_del_rcu(&css->sibling);
4189
4190 if (ss) {
4191 /* css release path */
4192 cgroup_idr_remove(&ss->css_idr, css->id);
4193 } else {
4194 /* cgroup release path */
4195 cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
4196 cgrp->id = -1;
4197 }
4198
4199 mutex_unlock(&cgroup_mutex);
4200
4201 call_rcu(&css->rcu_head, css_free_rcu_fn);
4202}
4203
3565static void css_release(struct percpu_ref *ref) 4204static void css_release(struct percpu_ref *ref)
3566{ 4205{
3567 struct cgroup_subsys_state *css = 4206 struct cgroup_subsys_state *css =
3568 container_of(ref, struct cgroup_subsys_state, refcnt); 4207 container_of(ref, struct cgroup_subsys_state, refcnt);
3569 4208
3570 RCU_INIT_POINTER(css->cgroup->subsys[css->ss->id], NULL); 4209 INIT_WORK(&css->destroy_work, css_release_work_fn);
3571 call_rcu(&css->rcu_head, css_free_rcu_fn); 4210 queue_work(cgroup_destroy_wq, &css->destroy_work);
3572} 4211}
3573 4212
3574static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, 4213static void init_and_link_css(struct cgroup_subsys_state *css,
3575 struct cgroup *cgrp) 4214 struct cgroup_subsys *ss, struct cgroup *cgrp)
3576{ 4215{
4216 lockdep_assert_held(&cgroup_mutex);
4217
4218 cgroup_get(cgrp);
4219
4220 memset(css, 0, sizeof(*css));
3577 css->cgroup = cgrp; 4221 css->cgroup = cgrp;
3578 css->ss = ss; 4222 css->ss = ss;
3579 css->flags = 0; 4223 INIT_LIST_HEAD(&css->sibling);
4224 INIT_LIST_HEAD(&css->children);
4225 css->serial_nr = css_serial_nr_next++;
3580 4226
3581 if (cgrp->parent) 4227 if (cgroup_parent(cgrp)) {
3582 css->parent = cgroup_css(cgrp->parent, ss); 4228 css->parent = cgroup_css(cgroup_parent(cgrp), ss);
3583 else 4229 css_get(css->parent);
3584 css->flags |= CSS_ROOT; 4230 }
3585 4231
3586 BUG_ON(cgroup_css(cgrp, ss)); 4232 BUG_ON(cgroup_css(cgrp, ss));
3587} 4233}
@@ -3592,14 +4238,12 @@ static int online_css(struct cgroup_subsys_state *css)
3592 struct cgroup_subsys *ss = css->ss; 4238 struct cgroup_subsys *ss = css->ss;
3593 int ret = 0; 4239 int ret = 0;
3594 4240
3595 lockdep_assert_held(&cgroup_tree_mutex);
3596 lockdep_assert_held(&cgroup_mutex); 4241 lockdep_assert_held(&cgroup_mutex);
3597 4242
3598 if (ss->css_online) 4243 if (ss->css_online)
3599 ret = ss->css_online(css); 4244 ret = ss->css_online(css);
3600 if (!ret) { 4245 if (!ret) {
3601 css->flags |= CSS_ONLINE; 4246 css->flags |= CSS_ONLINE;
3602 css->cgroup->nr_css++;
3603 rcu_assign_pointer(css->cgroup->subsys[ss->id], css); 4247 rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
3604 } 4248 }
3605 return ret; 4249 return ret;
@@ -3610,7 +4254,6 @@ static void offline_css(struct cgroup_subsys_state *css)
3610{ 4254{
3611 struct cgroup_subsys *ss = css->ss; 4255 struct cgroup_subsys *ss = css->ss;
3612 4256
3613 lockdep_assert_held(&cgroup_tree_mutex);
3614 lockdep_assert_held(&cgroup_mutex); 4257 lockdep_assert_held(&cgroup_mutex);
3615 4258
3616 if (!(css->flags & CSS_ONLINE)) 4259 if (!(css->flags & CSS_ONLINE))
@@ -3620,8 +4263,9 @@ static void offline_css(struct cgroup_subsys_state *css)
3620 ss->css_offline(css); 4263 ss->css_offline(css);
3621 4264
3622 css->flags &= ~CSS_ONLINE; 4265 css->flags &= ~CSS_ONLINE;
3623 css->cgroup->nr_css--; 4266 RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);
3624 RCU_INIT_POINTER(css->cgroup->subsys[ss->id], css); 4267
4268 wake_up_all(&css->cgroup->offline_waitq);
3625} 4269}
3626 4270
3627/** 4271/**
@@ -3635,111 +4279,102 @@ static void offline_css(struct cgroup_subsys_state *css)
3635 */ 4279 */
3636static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) 4280static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
3637{ 4281{
3638 struct cgroup *parent = cgrp->parent; 4282 struct cgroup *parent = cgroup_parent(cgrp);
4283 struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss);
3639 struct cgroup_subsys_state *css; 4284 struct cgroup_subsys_state *css;
3640 int err; 4285 int err;
3641 4286
3642 lockdep_assert_held(&cgroup_mutex); 4287 lockdep_assert_held(&cgroup_mutex);
3643 4288
3644 css = ss->css_alloc(cgroup_css(parent, ss)); 4289 css = ss->css_alloc(parent_css);
3645 if (IS_ERR(css)) 4290 if (IS_ERR(css))
3646 return PTR_ERR(css); 4291 return PTR_ERR(css);
3647 4292
4293 init_and_link_css(css, ss, cgrp);
4294
3648 err = percpu_ref_init(&css->refcnt, css_release); 4295 err = percpu_ref_init(&css->refcnt, css_release);
3649 if (err) 4296 if (err)
3650 goto err_free_css; 4297 goto err_free_css;
3651 4298
3652 init_css(css, ss, cgrp); 4299 err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_NOWAIT);
4300 if (err < 0)
4301 goto err_free_percpu_ref;
4302 css->id = err;
3653 4303
3654 err = cgroup_populate_dir(cgrp, 1 << ss->id); 4304 err = cgroup_populate_dir(cgrp, 1 << ss->id);
3655 if (err) 4305 if (err)
3656 goto err_free_percpu_ref; 4306 goto err_free_id;
4307
4308 /* @css is ready to be brought online now, make it visible */
4309 list_add_tail_rcu(&css->sibling, &parent_css->children);
4310 cgroup_idr_replace(&ss->css_idr, css, css->id);
3657 4311
3658 err = online_css(css); 4312 err = online_css(css);
3659 if (err) 4313 if (err)
3660 goto err_clear_dir; 4314 goto err_list_del;
3661
3662 cgroup_get(cgrp);
3663 css_get(css->parent);
3664
3665 cgrp->subsys_mask |= 1 << ss->id;
3666 4315
3667 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && 4316 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
3668 parent->parent) { 4317 cgroup_parent(parent)) {
3669 pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", 4318 pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
3670 current->comm, current->pid, ss->name); 4319 current->comm, current->pid, ss->name);
3671 if (!strcmp(ss->name, "memory")) 4320 if (!strcmp(ss->name, "memory"))
3672 pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n"); 4321 pr_warn("\"memory\" requires setting use_hierarchy to 1 on the root\n");
3673 ss->warned_broken_hierarchy = true; 4322 ss->warned_broken_hierarchy = true;
3674 } 4323 }
3675 4324
3676 return 0; 4325 return 0;
3677 4326
3678err_clear_dir: 4327err_list_del:
4328 list_del_rcu(&css->sibling);
3679 cgroup_clear_dir(css->cgroup, 1 << css->ss->id); 4329 cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
4330err_free_id:
4331 cgroup_idr_remove(&ss->css_idr, css->id);
3680err_free_percpu_ref: 4332err_free_percpu_ref:
3681 percpu_ref_cancel_init(&css->refcnt); 4333 percpu_ref_cancel_init(&css->refcnt);
3682err_free_css: 4334err_free_css:
3683 ss->css_free(css); 4335 call_rcu(&css->rcu_head, css_free_rcu_fn);
3684 return err; 4336 return err;
3685} 4337}
3686 4338
3687/** 4339static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
3688 * cgroup_create - create a cgroup 4340 umode_t mode)
3689 * @parent: cgroup that will be parent of the new cgroup
3690 * @name: name of the new cgroup
3691 * @mode: mode to set on new cgroup
3692 */
3693static long cgroup_create(struct cgroup *parent, const char *name,
3694 umode_t mode)
3695{ 4341{
3696 struct cgroup *cgrp; 4342 struct cgroup *parent, *cgrp;
3697 struct cgroup_root *root = parent->root; 4343 struct cgroup_root *root;
3698 int ssid, err;
3699 struct cgroup_subsys *ss; 4344 struct cgroup_subsys *ss;
3700 struct kernfs_node *kn; 4345 struct kernfs_node *kn;
4346 int ssid, ret;
3701 4347
3702 /* 4348 parent = cgroup_kn_lock_live(parent_kn);
3703 * XXX: The default hierarchy isn't fully implemented yet. Block 4349 if (!parent)
3704 * !root cgroup creation on it for now. 4350 return -ENODEV;
3705 */ 4351 root = parent->root;
3706 if (root == &cgrp_dfl_root)
3707 return -EINVAL;
3708 4352
3709 /* allocate the cgroup and its ID, 0 is reserved for the root */ 4353 /* allocate the cgroup and its ID, 0 is reserved for the root */
3710 cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); 4354 cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
3711 if (!cgrp) 4355 if (!cgrp) {
3712 return -ENOMEM; 4356 ret = -ENOMEM;
3713 4357 goto out_unlock;
3714 mutex_lock(&cgroup_tree_mutex);
3715
3716 /*
3717 * Only live parents can have children. Note that the liveliness
3718 * check isn't strictly necessary because cgroup_mkdir() and
3719 * cgroup_rmdir() are fully synchronized by i_mutex; however, do it
3720 * anyway so that locking is contained inside cgroup proper and we
3721 * don't get nasty surprises if we ever grow another caller.
3722 */
3723 if (!cgroup_lock_live_group(parent)) {
3724 err = -ENODEV;
3725 goto err_unlock_tree;
3726 } 4358 }
3727 4359
4360 ret = percpu_ref_init(&cgrp->self.refcnt, css_release);
4361 if (ret)
4362 goto out_free_cgrp;
4363
3728 /* 4364 /*
3729 * Temporarily set the pointer to NULL, so idr_find() won't return 4365 * Temporarily set the pointer to NULL, so idr_find() won't return
3730 * a half-baked cgroup. 4366 * a half-baked cgroup.
3731 */ 4367 */
3732 cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL); 4368 cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_NOWAIT);
3733 if (cgrp->id < 0) { 4369 if (cgrp->id < 0) {
3734 err = -ENOMEM; 4370 ret = -ENOMEM;
3735 goto err_unlock; 4371 goto out_cancel_ref;
3736 } 4372 }
3737 4373
3738 init_cgroup_housekeeping(cgrp); 4374 init_cgroup_housekeeping(cgrp);
3739 4375
3740 cgrp->parent = parent; 4376 cgrp->self.parent = &parent->self;
3741 cgrp->dummy_css.parent = &parent->dummy_css; 4377 cgrp->root = root;
3742 cgrp->root = parent->root;
3743 4378
3744 if (notify_on_release(parent)) 4379 if (notify_on_release(parent))
3745 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 4380 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
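
The "temporarily set the pointer to NULL" comment above describes a stock idr idiom that the new cgroup_idr_alloc()/cgroup_idr_replace() wrappers reuse: reserve the id first, publish the object only once it is fully set up, so a concurrent idr_find() never sees a half-initialized pointer. Stripped of the cgroup specifics (example_idr and obj are placeholders), the idiom is just:

	int id;

	/* reserve an id; lookups return NULL until we publish */
	id = idr_alloc(&example_idr, NULL, 1, 0, GFP_KERNEL);
	if (id < 0)
		return id;
	obj->id = id;

	/* ... finish initializing @obj ... */

	/* publish: from now on idr_find(&example_idr, id) returns @obj */
	idr_replace(&example_idr, obj, id);
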
@@ -3750,8 +4385,8 @@ static long cgroup_create(struct cgroup *parent, const char *name,
3750 /* create the directory */ 4385 /* create the directory */
3751 kn = kernfs_create_dir(parent->kn, name, mode, cgrp); 4386 kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
3752 if (IS_ERR(kn)) { 4387 if (IS_ERR(kn)) {
3753 err = PTR_ERR(kn); 4388 ret = PTR_ERR(kn);
3754 goto err_free_id; 4389 goto out_free_id;
3755 } 4390 }
3756 cgrp->kn = kn; 4391 cgrp->kn = kn;
3757 4392
@@ -3761,10 +4396,10 @@ static long cgroup_create(struct cgroup *parent, const char *name,
3761 */ 4396 */
3762 kernfs_get(kn); 4397 kernfs_get(kn);
3763 4398
3764 cgrp->serial_nr = cgroup_serial_nr_next++; 4399 cgrp->self.serial_nr = css_serial_nr_next++;
3765 4400
3766 /* allocation complete, commit to creation */ 4401 /* allocation complete, commit to creation */
3767 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); 4402 list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children);
3768 atomic_inc(&root->nr_cgrps); 4403 atomic_inc(&root->nr_cgrps);
3769 cgroup_get(parent); 4404 cgroup_get(parent);
3770 4405
@@ -3772,107 +4407,66 @@ static long cgroup_create(struct cgroup *parent, const char *name,
3772 * @cgrp is now fully operational. If something fails after this 4407 * @cgrp is now fully operational. If something fails after this
3773 * point, it'll be released via the normal destruction path. 4408 * point, it'll be released via the normal destruction path.
3774 */ 4409 */
3775 idr_replace(&root->cgroup_idr, cgrp, cgrp->id); 4410 cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
3776 4411
3777 err = cgroup_kn_set_ugid(kn); 4412 ret = cgroup_kn_set_ugid(kn);
3778 if (err) 4413 if (ret)
3779 goto err_destroy; 4414 goto out_destroy;
3780 4415
3781 err = cgroup_addrm_files(cgrp, cgroup_base_files, true); 4416 ret = cgroup_addrm_files(cgrp, cgroup_base_files, true);
3782 if (err) 4417 if (ret)
3783 goto err_destroy; 4418 goto out_destroy;
3784 4419
3785 /* let's create and online css's */ 4420 /* let's create and online css's */
3786 for_each_subsys(ss, ssid) { 4421 for_each_subsys(ss, ssid) {
3787 if (root->cgrp.subsys_mask & (1 << ssid)) { 4422 if (parent->child_subsys_mask & (1 << ssid)) {
3788 err = create_css(cgrp, ss); 4423 ret = create_css(cgrp, ss);
3789 if (err) 4424 if (ret)
3790 goto err_destroy; 4425 goto out_destroy;
3791 } 4426 }
3792 } 4427 }
3793 4428
3794 kernfs_activate(kn); 4429 /*
4430 * On the default hierarchy, a child doesn't automatically inherit
4431 * child_subsys_mask from the parent. Each is configured manually.
4432 */
4433 if (!cgroup_on_dfl(cgrp))
4434 cgrp->child_subsys_mask = parent->child_subsys_mask;
3795 4435
3796 mutex_unlock(&cgroup_mutex); 4436 kernfs_activate(kn);
3797 mutex_unlock(&cgroup_tree_mutex);
3798 4437
3799 return 0; 4438 ret = 0;
4439 goto out_unlock;
3800 4440
3801err_free_id: 4441out_free_id:
3802 idr_remove(&root->cgroup_idr, cgrp->id); 4442 cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
3803err_unlock: 4443out_cancel_ref:
3804 mutex_unlock(&cgroup_mutex); 4444 percpu_ref_cancel_init(&cgrp->self.refcnt);
3805err_unlock_tree: 4445out_free_cgrp:
3806 mutex_unlock(&cgroup_tree_mutex);
3807 kfree(cgrp); 4446 kfree(cgrp);
3808 return err; 4447out_unlock:
4448 cgroup_kn_unlock(parent_kn);
4449 return ret;
3809 4450
3810err_destroy: 4451out_destroy:
3811 cgroup_destroy_locked(cgrp); 4452 cgroup_destroy_locked(cgrp);
3812 mutex_unlock(&cgroup_mutex); 4453 goto out_unlock;
3813 mutex_unlock(&cgroup_tree_mutex);
3814 return err;
3815}
3816
3817static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
3818 umode_t mode)
3819{
3820 struct cgroup *parent = parent_kn->priv;
3821 int ret;
3822
3823 /*
3824 * cgroup_create() grabs cgroup_tree_mutex which nests outside
3825 * kernfs active_ref and cgroup_create() already synchronizes
3826 * properly against removal through cgroup_lock_live_group().
3827 * Break it before calling cgroup_create().
3828 */
3829 cgroup_get(parent);
3830 kernfs_break_active_protection(parent_kn);
3831
3832 ret = cgroup_create(parent, name, mode);
3833
3834 kernfs_unbreak_active_protection(parent_kn);
3835 cgroup_put(parent);
3836 return ret;
3837} 4454}
3838 4455
3839/* 4456/*
3840 * This is called when the refcnt of a css is confirmed to be killed. 4457 * This is called when the refcnt of a css is confirmed to be killed.
3841 * css_tryget() is now guaranteed to fail. 4458 * css_tryget_online() is now guaranteed to fail. Tell the subsystem to
4459 * initate destruction and put the css ref from kill_css().
3842 */ 4460 */
3843static void css_killed_work_fn(struct work_struct *work) 4461static void css_killed_work_fn(struct work_struct *work)
3844{ 4462{
3845 struct cgroup_subsys_state *css = 4463 struct cgroup_subsys_state *css =
3846 container_of(work, struct cgroup_subsys_state, destroy_work); 4464 container_of(work, struct cgroup_subsys_state, destroy_work);
3847 struct cgroup *cgrp = css->cgroup;
3848 4465
3849 mutex_lock(&cgroup_tree_mutex);
3850 mutex_lock(&cgroup_mutex); 4466 mutex_lock(&cgroup_mutex);
3851
3852 /*
3853 * css_tryget() is guaranteed to fail now. Tell subsystems to
3854 * initate destruction.
3855 */
3856 offline_css(css); 4467 offline_css(css);
3857
3858 /*
3859 * If @cgrp is marked dead, it's waiting for refs of all css's to
3860 * be disabled before proceeding to the second phase of cgroup
3861 * destruction. If we are the last one, kick it off.
3862 */
3863 if (!cgrp->nr_css && cgroup_is_dead(cgrp))
3864 cgroup_destroy_css_killed(cgrp);
3865
3866 mutex_unlock(&cgroup_mutex); 4468 mutex_unlock(&cgroup_mutex);
3867 mutex_unlock(&cgroup_tree_mutex);
3868 4469
3869 /*
3870 * Put the css refs from kill_css(). Each css holds an extra
3871 * reference to the cgroup's dentry and cgroup removal proceeds
3872 * regardless of css refs. On the last put of each css, whenever
3873 * that may be, the extra dentry ref is put so that dentry
3874 * destruction happens only after all css's are released.
3875 */
3876 css_put(css); 4470 css_put(css);
3877} 4471}
3878 4472
@@ -3886,9 +4480,18 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
3886 queue_work(cgroup_destroy_wq, &css->destroy_work); 4480 queue_work(cgroup_destroy_wq, &css->destroy_work);
3887} 4481}
3888 4482
3889static void __kill_css(struct cgroup_subsys_state *css) 4483/**
4484 * kill_css - destroy a css
4485 * @css: css to destroy
4486 *
4487 * This function initiates destruction of @css by removing cgroup interface
4488 * files and putting its base reference. ->css_offline() will be invoked
4489 * asynchronously once css_tryget_online() is guaranteed to fail and when
4490 * the reference count reaches zero, @css will be released.
4491 */
4492static void kill_css(struct cgroup_subsys_state *css)
3890{ 4493{
3891 lockdep_assert_held(&cgroup_tree_mutex); 4494 lockdep_assert_held(&cgroup_mutex);
3892 4495
3893 /* 4496 /*
3894 * This must happen before css is disassociated with its cgroup. 4497 * This must happen before css is disassociated with its cgroup.
@@ -3905,7 +4508,7 @@ static void __kill_css(struct cgroup_subsys_state *css)
3905 /* 4508 /*
3906 * cgroup core guarantees that, by the time ->css_offline() is 4509 * cgroup core guarantees that, by the time ->css_offline() is
3907 * invoked, no new css reference will be given out via 4510 * invoked, no new css reference will be given out via
3908 * css_tryget(). We can't simply call percpu_ref_kill() and 4511 * css_tryget_online(). We can't simply call percpu_ref_kill() and
3909 * proceed to offlining css's because percpu_ref_kill() doesn't 4512 * proceed to offlining css's because percpu_ref_kill() doesn't
3910 * guarantee that the ref is seen as killed on all CPUs on return. 4513 * guarantee that the ref is seen as killed on all CPUs on return.
3911 * 4514 *
@@ -3916,36 +4519,14 @@ static void __kill_css(struct cgroup_subsys_state *css)
3916} 4519}
3917 4520
3918/** 4521/**
3919 * kill_css - destroy a css
3920 * @css: css to destroy
3921 *
3922 * This function initiates destruction of @css by removing cgroup interface
3923 * files and putting its base reference. ->css_offline() will be invoked
3924 * asynchronously once css_tryget() is guaranteed to fail and when the
3925 * reference count reaches zero, @css will be released.
3926 */
3927static void kill_css(struct cgroup_subsys_state *css)
3928{
3929 struct cgroup *cgrp = css->cgroup;
3930
3931 lockdep_assert_held(&cgroup_tree_mutex);
3932
3933 /* if already killed, noop */
3934 if (cgrp->subsys_mask & (1 << css->ss->id)) {
3935 cgrp->subsys_mask &= ~(1 << css->ss->id);
3936 __kill_css(css);
3937 }
3938}
3939
3940/**
3941 * cgroup_destroy_locked - the first stage of cgroup destruction 4522 * cgroup_destroy_locked - the first stage of cgroup destruction
3942 * @cgrp: cgroup to be destroyed 4523 * @cgrp: cgroup to be destroyed
3943 * 4524 *
3944 * css's make use of percpu refcnts whose killing latency shouldn't be 4525 * css's make use of percpu refcnts whose killing latency shouldn't be
3945 * exposed to userland and are RCU protected. Also, cgroup core needs to 4526 * exposed to userland and are RCU protected. Also, cgroup core needs to
3946 * guarantee that css_tryget() won't succeed by the time ->css_offline() is 4527 * guarantee that css_tryget_online() won't succeed by the time
3947 * invoked. To satisfy all the requirements, destruction is implemented in 4528 * ->css_offline() is invoked. To satisfy all the requirements,
3948 * the following two steps. 4529 * destruction is implemented in the following two steps.
3949 * 4530 *
3950 * s1. Verify @cgrp can be destroyed and mark it dying. Remove all 4531 * s1. Verify @cgrp can be destroyed and mark it dying. Remove all
3951 * userland visible parts and start killing the percpu refcnts of 4532 * userland visible parts and start killing the percpu refcnts of
@@ -3964,12 +4545,10 @@ static void kill_css(struct cgroup_subsys_state *css)
3964static int cgroup_destroy_locked(struct cgroup *cgrp) 4545static int cgroup_destroy_locked(struct cgroup *cgrp)
3965 __releases(&cgroup_mutex) __acquires(&cgroup_mutex) 4546 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
3966{ 4547{
3967 struct cgroup *child;
3968 struct cgroup_subsys_state *css; 4548 struct cgroup_subsys_state *css;
3969 bool empty; 4549 bool empty;
3970 int ssid; 4550 int ssid;
3971 4551
3972 lockdep_assert_held(&cgroup_tree_mutex);
3973 lockdep_assert_held(&cgroup_mutex); 4552 lockdep_assert_held(&cgroup_mutex);
3974 4553
3975 /* 4554 /*
@@ -3983,127 +4562,68 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
3983 return -EBUSY; 4562 return -EBUSY;
3984 4563
3985 /* 4564 /*
3986 * Make sure there's no live children. We can't test ->children 4565 * Make sure there's no live children. We can't test emptiness of
3987 * emptiness as dead children linger on it while being destroyed; 4566 * ->self.children as dead children linger on it while being
3988 * otherwise, "rmdir parent/child parent" may fail with -EBUSY. 4567 * drained; otherwise, "rmdir parent/child parent" may fail.
3989 */ 4568 */
3990 empty = true; 4569 if (css_has_online_children(&cgrp->self))
3991 rcu_read_lock();
3992 list_for_each_entry_rcu(child, &cgrp->children, sibling) {
3993 empty = cgroup_is_dead(child);
3994 if (!empty)
3995 break;
3996 }
3997 rcu_read_unlock();
3998 if (!empty)
3999 return -EBUSY; 4570 return -EBUSY;
4000 4571
4001 /* 4572 /*
4002 * Mark @cgrp dead. This prevents further task migration and child 4573 * Mark @cgrp dead. This prevents further task migration and child
4003 * creation by disabling cgroup_lock_live_group(). Note that 4574 * creation by disabling cgroup_lock_live_group().
4004 * CGRP_DEAD assertion is depended upon by css_next_child() to
4005 * resume iteration after dropping RCU read lock. See
4006 * css_next_child() for details.
4007 */ 4575 */
4008 set_bit(CGRP_DEAD, &cgrp->flags); 4576 cgrp->self.flags &= ~CSS_ONLINE;
4009 4577
4010 /* 4578 /* initiate massacre of all css's */
4011 * Initiate massacre of all css's. cgroup_destroy_css_killed()
4012 * will be invoked to perform the rest of destruction once the
4013 * percpu refs of all css's are confirmed to be killed. This
4014 * involves removing the subsystem's files, drop cgroup_mutex.
4015 */
4016 mutex_unlock(&cgroup_mutex);
4017 for_each_css(css, ssid, cgrp) 4579 for_each_css(css, ssid, cgrp)
4018 kill_css(css); 4580 kill_css(css);
4019 mutex_lock(&cgroup_mutex);
4020 4581
4021 /* CGRP_DEAD is set, remove from ->release_list for the last time */ 4582 /* CSS_ONLINE is clear, remove from ->release_list for the last time */
4022 raw_spin_lock(&release_list_lock); 4583 raw_spin_lock(&release_list_lock);
4023 if (!list_empty(&cgrp->release_list)) 4584 if (!list_empty(&cgrp->release_list))
4024 list_del_init(&cgrp->release_list); 4585 list_del_init(&cgrp->release_list);
4025 raw_spin_unlock(&release_list_lock); 4586 raw_spin_unlock(&release_list_lock);
4026 4587
4027 /* 4588 /*
4028 * If @cgrp has css's attached, the second stage of cgroup 4589 * Remove @cgrp directory along with the base files. @cgrp has an
4029 * destruction is kicked off from css_killed_work_fn() after the 4590 * extra ref on its kn.
4030 * refs of all attached css's are killed. If @cgrp doesn't have
4031 * any css, we kick it off here.
4032 */ 4591 */
4033 if (!cgrp->nr_css) 4592 kernfs_remove(cgrp->kn);
4034 cgroup_destroy_css_killed(cgrp);
4035
4036 /* remove @cgrp directory along with the base files */
4037 mutex_unlock(&cgroup_mutex);
4038 4593
4039 /* 4594 set_bit(CGRP_RELEASABLE, &cgroup_parent(cgrp)->flags);
4040 * There are two control paths which try to determine cgroup from 4595 check_for_release(cgroup_parent(cgrp));
4041 * dentry without going through kernfs - cgroupstats_build() and
4042 * css_tryget_from_dir(). Those are supported by RCU protecting
4043 * clearing of cgrp->kn->priv backpointer, which should happen
4044 * after all files under it have been removed.
4045 */
4046 kernfs_remove(cgrp->kn); /* @cgrp has an extra ref on its kn */
4047 RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL);
4048 4596
4049 mutex_lock(&cgroup_mutex); 4597 /* put the base reference */
4598 percpu_ref_kill(&cgrp->self.refcnt);
4050 4599
4051 return 0; 4600 return 0;
4052}; 4601};
4053 4602
4054/**
4055 * cgroup_destroy_css_killed - the second step of cgroup destruction
4056 * @work: cgroup->destroy_free_work
4057 *
4058 * This function is invoked from a work item for a cgroup which is being
4059 * destroyed after all css's are offlined and performs the rest of
4060 * destruction. This is the second step of destruction described in the
4061 * comment above cgroup_destroy_locked().
4062 */
4063static void cgroup_destroy_css_killed(struct cgroup *cgrp)
4064{
4065 struct cgroup *parent = cgrp->parent;
4066
4067 lockdep_assert_held(&cgroup_tree_mutex);
4068 lockdep_assert_held(&cgroup_mutex);
4069
4070 /* delete this cgroup from parent->children */
4071 list_del_rcu(&cgrp->sibling);
4072
4073 cgroup_put(cgrp);
4074
4075 set_bit(CGRP_RELEASABLE, &parent->flags);
4076 check_for_release(parent);
4077}
4078
4079static int cgroup_rmdir(struct kernfs_node *kn) 4603static int cgroup_rmdir(struct kernfs_node *kn)
4080{ 4604{
4081 struct cgroup *cgrp = kn->priv; 4605 struct cgroup *cgrp;
4082 int ret = 0; 4606 int ret = 0;
4083 4607
4084 /* 4608 cgrp = cgroup_kn_lock_live(kn);
4085 * This is self-destruction but @kn can't be removed while this 4609 if (!cgrp)
4086 * callback is in progress. Let's break active protection. Once 4610 return 0;
4087 * the protection is broken, @cgrp can be destroyed at any point. 4611 cgroup_get(cgrp); /* for @kn->priv clearing */
4088 * Pin it so that it stays accessible.
4089 */
4090 cgroup_get(cgrp);
4091 kernfs_break_active_protection(kn);
4092 4612
4093 mutex_lock(&cgroup_tree_mutex); 4613 ret = cgroup_destroy_locked(cgrp);
4094 mutex_lock(&cgroup_mutex); 4614
4615 cgroup_kn_unlock(kn);
4095 4616
4096 /* 4617 /*
4097 * @cgrp might already have been destroyed while we're trying to 4618 * There are two control paths which try to determine cgroup from
4098 * grab the mutexes. 4619 * dentry without going through kernfs - cgroupstats_build() and
4620 * css_tryget_online_from_dir(). Those are supported by RCU
4621 * protecting clearing of cgrp->kn->priv backpointer, which should
4622 * happen after all files under it have been removed.
4099 */ 4623 */
4100 if (!cgroup_is_dead(cgrp)) 4624 if (!ret)
4101 ret = cgroup_destroy_locked(cgrp); 4625 RCU_INIT_POINTER(*(void __rcu __force **)&kn->priv, NULL);
4102
4103 mutex_unlock(&cgroup_mutex);
4104 mutex_unlock(&cgroup_tree_mutex);
4105 4626
4106 kernfs_unbreak_active_protection(kn);
4107 cgroup_put(cgrp); 4627 cgroup_put(cgrp);
4108 return ret; 4628 return ret;
4109} 4629}
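
cgroup_mkdir() and cgroup_rmdir() above now share a single locking bracket instead of the old break-active-protection dance. For a kernfs syscall op written against this version, the pattern reduces to the skeleton below (cgroup_example_op() and do_locked_work() are stand-ins for the real bodies):

static int cgroup_example_op(struct kernfs_node *kn)
{
	struct cgroup *cgrp;
	int ret;

	/* pins the cgroup, breaks kernfs active protection, takes cgroup_mutex */
	cgrp = cgroup_kn_lock_live(kn);
	if (!cgrp)
		return -ENODEV;		/* the cgroup is already being removed */

	ret = do_locked_work(cgrp);	/* everything here runs under cgroup_mutex */

	/* drops cgroup_mutex, restores active protection, puts the cgroup */
	cgroup_kn_unlock(kn);
	return ret;
}
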
@@ -4116,15 +4636,15 @@ static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
4116 .rename = cgroup_rename, 4636 .rename = cgroup_rename,
4117}; 4637};
4118 4638
4119static void __init cgroup_init_subsys(struct cgroup_subsys *ss) 4639static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
4120{ 4640{
4121 struct cgroup_subsys_state *css; 4641 struct cgroup_subsys_state *css;
4122 4642
4123 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); 4643 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
4124 4644
4125 mutex_lock(&cgroup_tree_mutex);
4126 mutex_lock(&cgroup_mutex); 4645 mutex_lock(&cgroup_mutex);
4127 4646
4647 idr_init(&ss->css_idr);
4128 INIT_LIST_HEAD(&ss->cfts); 4648 INIT_LIST_HEAD(&ss->cfts);
4129 4649
4130 /* Create the root cgroup state for this subsystem */ 4650 /* Create the root cgroup state for this subsystem */
@@ -4132,7 +4652,21 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4132 css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss)); 4652 css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));
4133 /* We don't handle early failures gracefully */ 4653 /* We don't handle early failures gracefully */
4134 BUG_ON(IS_ERR(css)); 4654 BUG_ON(IS_ERR(css));
4135 init_css(css, ss, &cgrp_dfl_root.cgrp); 4655 init_and_link_css(css, ss, &cgrp_dfl_root.cgrp);
4656
4657 /*
4658 * Root csses are never destroyed and we can't initialize
4659 * percpu_ref during early init. Disable refcnting.
4660 */
4661 css->flags |= CSS_NO_REF;
4662
4663 if (early) {
4664 /* allocation can't be done safely during early init */
4665 css->id = 1;
4666 } else {
4667 css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL);
4668 BUG_ON(css->id < 0);
4669 }
4136 4670
4137 /* Update the init_css_set to contain a subsys 4671 /* Update the init_css_set to contain a subsys
4138 * pointer to this state - since the subsystem is 4672 * pointer to this state - since the subsystem is
@@ -4149,10 +4683,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4149 4683
4150 BUG_ON(online_css(css)); 4684 BUG_ON(online_css(css));
4151 4685
4152 cgrp_dfl_root.cgrp.subsys_mask |= 1 << ss->id;
4153
4154 mutex_unlock(&cgroup_mutex); 4686 mutex_unlock(&cgroup_mutex);
4155 mutex_unlock(&cgroup_tree_mutex);
4156} 4687}
4157 4688
4158/** 4689/**
@@ -4169,6 +4700,8 @@ int __init cgroup_init_early(void)
4169 int i; 4700 int i;
4170 4701
4171 init_cgroup_root(&cgrp_dfl_root, &opts); 4702 init_cgroup_root(&cgrp_dfl_root, &opts);
4703 cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF;
4704
4172 RCU_INIT_POINTER(init_task.cgroups, &init_css_set); 4705 RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
4173 4706
4174 for_each_subsys(ss, i) { 4707 for_each_subsys(ss, i) {
@@ -4183,7 +4716,7 @@ int __init cgroup_init_early(void)
4183 ss->name = cgroup_subsys_name[i]; 4716 ss->name = cgroup_subsys_name[i];
4184 4717
4185 if (ss->early_init) 4718 if (ss->early_init)
4186 cgroup_init_subsys(ss); 4719 cgroup_init_subsys(ss, true);
4187 } 4720 }
4188 return 0; 4721 return 0;
4189} 4722}
@@ -4202,7 +4735,6 @@ int __init cgroup_init(void)
4202 4735
4203 BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); 4736 BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
4204 4737
4205 mutex_lock(&cgroup_tree_mutex);
4206 mutex_lock(&cgroup_mutex); 4738 mutex_lock(&cgroup_mutex);
4207 4739
4208 /* Add init_css_set to the hash table */ 4740 /* Add init_css_set to the hash table */
@@ -4212,18 +4744,31 @@ int __init cgroup_init(void)
4212 BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0)); 4744 BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
4213 4745
4214 mutex_unlock(&cgroup_mutex); 4746 mutex_unlock(&cgroup_mutex);
4215 mutex_unlock(&cgroup_tree_mutex);
4216 4747
4217 for_each_subsys(ss, ssid) { 4748 for_each_subsys(ss, ssid) {
4218 if (!ss->early_init) 4749 if (ss->early_init) {
4219 cgroup_init_subsys(ss); 4750 struct cgroup_subsys_state *css =
4751 init_css_set.subsys[ss->id];
4752
4753 css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2,
4754 GFP_KERNEL);
4755 BUG_ON(css->id < 0);
4756 } else {
4757 cgroup_init_subsys(ss, false);
4758 }
4759
4760 list_add_tail(&init_css_set.e_cset_node[ssid],
4761 &cgrp_dfl_root.cgrp.e_csets[ssid]);
4220 4762
4221 /* 4763 /*
4222 * cftype registration needs kmalloc and can't be done 4764 * Setting dfl_root subsys_mask needs to consider the
4223 * during early_init. Register base cftypes separately. 4765 * disabled flag and cftype registration needs kmalloc,
4766 * both of which aren't available during early_init.
4224 */ 4767 */
4225 if (ss->base_cftypes) 4768 if (!ss->disabled) {
4769 cgrp_dfl_root.subsys_mask |= 1 << ss->id;
4226 WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes)); 4770 WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes));
4771 }
4227 } 4772 }
4228 4773
4229 cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); 4774 cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
@@ -4306,7 +4851,7 @@ int proc_cgroup_show(struct seq_file *m, void *v)
4306 4851
4307 seq_printf(m, "%d:", root->hierarchy_id); 4852 seq_printf(m, "%d:", root->hierarchy_id);
4308 for_each_subsys(ss, ssid) 4853 for_each_subsys(ss, ssid)
4309 if (root->cgrp.subsys_mask & (1 << ssid)) 4854 if (root->subsys_mask & (1 << ssid))
4310 seq_printf(m, "%s%s", count++ ? "," : "", ss->name); 4855 seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
4311 if (strlen(root->name)) 4856 if (strlen(root->name))
4312 seq_printf(m, "%sname=%s", count ? "," : "", 4857 seq_printf(m, "%sname=%s", count ? "," : "",
@@ -4501,8 +5046,8 @@ void cgroup_exit(struct task_struct *tsk)
4501 5046
4502static void check_for_release(struct cgroup *cgrp) 5047static void check_for_release(struct cgroup *cgrp)
4503{ 5048{
4504 if (cgroup_is_releasable(cgrp) && 5049 if (cgroup_is_releasable(cgrp) && list_empty(&cgrp->cset_links) &&
4505 list_empty(&cgrp->cset_links) && list_empty(&cgrp->children)) { 5050 !css_has_online_children(&cgrp->self)) {
4506 /* 5051 /*
4507 * Control Group is currently removeable. If it's not 5052 * Control Group is currently removeable. If it's not
4508 * already queued for a userspace notification, queue 5053 * already queued for a userspace notification, queue
@@ -4619,7 +5164,7 @@ static int __init cgroup_disable(char *str)
4619__setup("cgroup_disable=", cgroup_disable); 5164__setup("cgroup_disable=", cgroup_disable);
4620 5165
4621/** 5166/**
4622 * css_tryget_from_dir - get corresponding css from the dentry of a cgroup dir 5167 * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
4623 * @dentry: directory dentry of interest 5168 * @dentry: directory dentry of interest
4624 * @ss: subsystem of interest 5169 * @ss: subsystem of interest
4625 * 5170 *
@@ -4627,8 +5172,8 @@ __setup("cgroup_disable=", cgroup_disable);
4627 * to get the corresponding css and return it. If such css doesn't exist 5172 * to get the corresponding css and return it. If such css doesn't exist
4628 * or can't be pinned, an ERR_PTR value is returned. 5173 * or can't be pinned, an ERR_PTR value is returned.
4629 */ 5174 */
4630struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry, 5175struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
4631 struct cgroup_subsys *ss) 5176 struct cgroup_subsys *ss)
4632{ 5177{
4633 struct kernfs_node *kn = kernfs_node_from_dentry(dentry); 5178 struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
4634 struct cgroup_subsys_state *css = NULL; 5179 struct cgroup_subsys_state *css = NULL;
@@ -4644,13 +5189,13 @@ struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry,
4644 /* 5189 /*
4645 * This path doesn't originate from kernfs and @kn could already 5190 * This path doesn't originate from kernfs and @kn could already
4646 * have been or be removed at any point. @kn->priv is RCU 5191 * have been or be removed at any point. @kn->priv is RCU
4647 * protected for this access. See destroy_locked() for details. 5192 * protected for this access. See cgroup_rmdir() for details.
4648 */ 5193 */
4649 cgrp = rcu_dereference(kn->priv); 5194 cgrp = rcu_dereference(kn->priv);
4650 if (cgrp) 5195 if (cgrp)
4651 css = cgroup_css(cgrp, ss); 5196 css = cgroup_css(cgrp, ss);
4652 5197
4653 if (!css || !css_tryget(css)) 5198 if (!css || !css_tryget_online(css))
4654 css = ERR_PTR(-ENOENT); 5199 css = ERR_PTR(-ENOENT);
4655 5200
4656 rcu_read_unlock(); 5201 rcu_read_unlock();
@@ -4667,14 +5212,8 @@ struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry,
4667 */ 5212 */
4668struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) 5213struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
4669{ 5214{
4670 struct cgroup *cgrp; 5215 WARN_ON_ONCE(!rcu_read_lock_held());
4671 5216 return idr_find(&ss->css_idr, id);
4672 cgroup_assert_mutexes_or_rcu_locked();
4673
4674 cgrp = idr_find(&ss->root->cgroup_idr, id);
4675 if (cgrp)
4676 return cgroup_css(cgrp, ss);
4677 return NULL;
4678} 5217}
4679 5218
4680#ifdef CONFIG_CGROUP_DEBUG 5219#ifdef CONFIG_CGROUP_DEBUG
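
Two lookup paths change with the css_idr switch at the end of the cgroup.c hunks: css_from_id() is now a bare idr lookup that is only valid under rcu_read_lock(), and pinning the result requires css_tryget_online(). A caller that wants to keep the css around therefore ends up with something like the following sketch (freezer_cgrp_subsys is just an example subsystem):

	struct cgroup_subsys_state *css;

	rcu_read_lock();
	css = css_from_id(id, &freezer_cgrp_subsys);
	if (css && !css_tryget_online(css))
		css = NULL;			/* racing with offlining, treat as gone */
	rcu_read_unlock();

	if (css) {
		/* ... safe to use @css here ... */
		css_put(css);
	}
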
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 2bc4a2256444..a79e40f9d700 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -21,6 +21,7 @@
21#include <linux/uaccess.h> 21#include <linux/uaccess.h>
22#include <linux/freezer.h> 22#include <linux/freezer.h>
23#include <linux/seq_file.h> 23#include <linux/seq_file.h>
24#include <linux/mutex.h>
24 25
25/* 26/*
26 * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is 27 * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is
@@ -42,9 +43,10 @@ enum freezer_state_flags {
42struct freezer { 43struct freezer {
43 struct cgroup_subsys_state css; 44 struct cgroup_subsys_state css;
44 unsigned int state; 45 unsigned int state;
45 spinlock_t lock;
46}; 46};
47 47
48static DEFINE_MUTEX(freezer_mutex);
49
48static inline struct freezer *css_freezer(struct cgroup_subsys_state *css) 50static inline struct freezer *css_freezer(struct cgroup_subsys_state *css)
49{ 51{
50 return css ? container_of(css, struct freezer, css) : NULL; 52 return css ? container_of(css, struct freezer, css) : NULL;
@@ -57,7 +59,7 @@ static inline struct freezer *task_freezer(struct task_struct *task)
57 59
58static struct freezer *parent_freezer(struct freezer *freezer) 60static struct freezer *parent_freezer(struct freezer *freezer)
59{ 61{
60 return css_freezer(css_parent(&freezer->css)); 62 return css_freezer(freezer->css.parent);
61} 63}
62 64
63bool cgroup_freezing(struct task_struct *task) 65bool cgroup_freezing(struct task_struct *task)
@@ -71,10 +73,6 @@ bool cgroup_freezing(struct task_struct *task)
71 return ret; 73 return ret;
72} 74}
73 75
74/*
75 * cgroups_write_string() limits the size of freezer state strings to
76 * CGROUP_LOCAL_BUFFER_SIZE
77 */
78static const char *freezer_state_strs(unsigned int state) 76static const char *freezer_state_strs(unsigned int state)
79{ 77{
80 if (state & CGROUP_FROZEN) 78 if (state & CGROUP_FROZEN)
@@ -93,7 +91,6 @@ freezer_css_alloc(struct cgroup_subsys_state *parent_css)
93 if (!freezer) 91 if (!freezer)
94 return ERR_PTR(-ENOMEM); 92 return ERR_PTR(-ENOMEM);
95 93
96 spin_lock_init(&freezer->lock);
97 return &freezer->css; 94 return &freezer->css;
98} 95}
99 96
@@ -110,14 +107,7 @@ static int freezer_css_online(struct cgroup_subsys_state *css)
110 struct freezer *freezer = css_freezer(css); 107 struct freezer *freezer = css_freezer(css);
111 struct freezer *parent = parent_freezer(freezer); 108 struct freezer *parent = parent_freezer(freezer);
112 109
113 /* 110 mutex_lock(&freezer_mutex);
114 * The following double locking and freezing state inheritance
115 * guarantee that @cgroup can never escape ancestors' freezing
116 * states. See css_for_each_descendant_pre() for details.
117 */
118 if (parent)
119 spin_lock_irq(&parent->lock);
120 spin_lock_nested(&freezer->lock, SINGLE_DEPTH_NESTING);
121 111
122 freezer->state |= CGROUP_FREEZER_ONLINE; 112 freezer->state |= CGROUP_FREEZER_ONLINE;
123 113
@@ -126,10 +116,7 @@ static int freezer_css_online(struct cgroup_subsys_state *css)
126 atomic_inc(&system_freezing_cnt); 116 atomic_inc(&system_freezing_cnt);
127 } 117 }
128 118
129 spin_unlock(&freezer->lock); 119 mutex_unlock(&freezer_mutex);
130 if (parent)
131 spin_unlock_irq(&parent->lock);
132
133 return 0; 120 return 0;
134} 121}
135 122
@@ -144,14 +131,14 @@ static void freezer_css_offline(struct cgroup_subsys_state *css)
144{ 131{
145 struct freezer *freezer = css_freezer(css); 132 struct freezer *freezer = css_freezer(css);
146 133
147 spin_lock_irq(&freezer->lock); 134 mutex_lock(&freezer_mutex);
148 135
149 if (freezer->state & CGROUP_FREEZING) 136 if (freezer->state & CGROUP_FREEZING)
150 atomic_dec(&system_freezing_cnt); 137 atomic_dec(&system_freezing_cnt);
151 138
152 freezer->state = 0; 139 freezer->state = 0;
153 140
154 spin_unlock_irq(&freezer->lock); 141 mutex_unlock(&freezer_mutex);
155} 142}
156 143
157static void freezer_css_free(struct cgroup_subsys_state *css) 144static void freezer_css_free(struct cgroup_subsys_state *css)
@@ -175,7 +162,7 @@ static void freezer_attach(struct cgroup_subsys_state *new_css,
175 struct task_struct *task; 162 struct task_struct *task;
176 bool clear_frozen = false; 163 bool clear_frozen = false;
177 164
178 spin_lock_irq(&freezer->lock); 165 mutex_lock(&freezer_mutex);
179 166
180 /* 167 /*
181 * Make the new tasks conform to the current state of @new_css. 168 * Make the new tasks conform to the current state of @new_css.
@@ -197,21 +184,13 @@ static void freezer_attach(struct cgroup_subsys_state *new_css,
197 } 184 }
198 } 185 }
199 186
200 spin_unlock_irq(&freezer->lock); 187 /* propagate FROZEN clearing upwards */
201
202 /*
203 * Propagate FROZEN clearing upwards. We may race with
204 * update_if_frozen(), but as long as both work bottom-up, either
205 * update_if_frozen() sees child's FROZEN cleared or we clear the
206 * parent's FROZEN later. No parent w/ !FROZEN children can be
207 * left FROZEN.
208 */
209 while (clear_frozen && (freezer = parent_freezer(freezer))) { 188 while (clear_frozen && (freezer = parent_freezer(freezer))) {
210 spin_lock_irq(&freezer->lock);
211 freezer->state &= ~CGROUP_FROZEN; 189 freezer->state &= ~CGROUP_FROZEN;
212 clear_frozen = freezer->state & CGROUP_FREEZING; 190 clear_frozen = freezer->state & CGROUP_FREEZING;
213 spin_unlock_irq(&freezer->lock);
214 } 191 }
192
193 mutex_unlock(&freezer_mutex);
215} 194}
216 195
217/** 196/**
@@ -228,9 +207,6 @@ static void freezer_fork(struct task_struct *task)
228{ 207{
229 struct freezer *freezer; 208 struct freezer *freezer;
230 209
231 rcu_read_lock();
232 freezer = task_freezer(task);
233
234 /* 210 /*
235 * The root cgroup is non-freezable, so we can skip locking the 211 * The root cgroup is non-freezable, so we can skip locking the
236 * freezer. This is safe regardless of race with task migration. 212 * freezer. This is safe regardless of race with task migration.
@@ -238,24 +214,18 @@ static void freezer_fork(struct task_struct *task)
238 * to do. If we lost and root is the new cgroup, noop is still the 214 * to do. If we lost and root is the new cgroup, noop is still the
239 * right thing to do. 215 * right thing to do.
240 */ 216 */
241 if (!parent_freezer(freezer)) 217 if (task_css_is_root(task, freezer_cgrp_id))
242 goto out; 218 return;
243 219
244 /* 220 mutex_lock(&freezer_mutex);
245 * Grab @freezer->lock and freeze @task after verifying @task still 221 rcu_read_lock();
246 * belongs to @freezer and it's freezing. The former is for the 222
247 * case where we have raced against task migration and lost and 223 freezer = task_freezer(task);
248 * @task is already in a different cgroup which may not be frozen. 224 if (freezer->state & CGROUP_FREEZING)
249 * This isn't strictly necessary as freeze_task() is allowed to be
250 * called spuriously but let's do it anyway for, if nothing else,
251 * documentation.
252 */
253 spin_lock_irq(&freezer->lock);
254 if (freezer == task_freezer(task) && (freezer->state & CGROUP_FREEZING))
255 freeze_task(task); 225 freeze_task(task);
256 spin_unlock_irq(&freezer->lock); 226
257out:
258 rcu_read_unlock(); 227 rcu_read_unlock();
228 mutex_unlock(&freezer_mutex);
259} 229}
260 230
261/** 231/**
@@ -281,22 +251,24 @@ static void update_if_frozen(struct cgroup_subsys_state *css)
281 struct css_task_iter it; 251 struct css_task_iter it;
282 struct task_struct *task; 252 struct task_struct *task;
283 253
284 WARN_ON_ONCE(!rcu_read_lock_held()); 254 lockdep_assert_held(&freezer_mutex);
285
286 spin_lock_irq(&freezer->lock);
287 255
288 if (!(freezer->state & CGROUP_FREEZING) || 256 if (!(freezer->state & CGROUP_FREEZING) ||
289 (freezer->state & CGROUP_FROZEN)) 257 (freezer->state & CGROUP_FROZEN))
290 goto out_unlock; 258 return;
291 259
292 /* are all (live) children frozen? */ 260 /* are all (live) children frozen? */
261 rcu_read_lock();
293 css_for_each_child(pos, css) { 262 css_for_each_child(pos, css) {
294 struct freezer *child = css_freezer(pos); 263 struct freezer *child = css_freezer(pos);
295 264
296 if ((child->state & CGROUP_FREEZER_ONLINE) && 265 if ((child->state & CGROUP_FREEZER_ONLINE) &&
297 !(child->state & CGROUP_FROZEN)) 266 !(child->state & CGROUP_FROZEN)) {
298 goto out_unlock; 267 rcu_read_unlock();
268 return;
269 }
299 } 270 }
271 rcu_read_unlock();
300 272
301 /* are all tasks frozen? */ 273 /* are all tasks frozen? */
302 css_task_iter_start(css, &it); 274 css_task_iter_start(css, &it);
@@ -317,21 +289,29 @@ static void update_if_frozen(struct cgroup_subsys_state *css)
317 freezer->state |= CGROUP_FROZEN; 289 freezer->state |= CGROUP_FROZEN;
318out_iter_end: 290out_iter_end:
319 css_task_iter_end(&it); 291 css_task_iter_end(&it);
320out_unlock:
321 spin_unlock_irq(&freezer->lock);
322} 292}
323 293
324static int freezer_read(struct seq_file *m, void *v) 294static int freezer_read(struct seq_file *m, void *v)
325{ 295{
326 struct cgroup_subsys_state *css = seq_css(m), *pos; 296 struct cgroup_subsys_state *css = seq_css(m), *pos;
327 297
298 mutex_lock(&freezer_mutex);
328 rcu_read_lock(); 299 rcu_read_lock();
329 300
330 /* update states bottom-up */ 301 /* update states bottom-up */
331 css_for_each_descendant_post(pos, css) 302 css_for_each_descendant_post(pos, css) {
303 if (!css_tryget_online(pos))
304 continue;
305 rcu_read_unlock();
306
332 update_if_frozen(pos); 307 update_if_frozen(pos);
333 308
309 rcu_read_lock();
310 css_put(pos);
311 }
312
334 rcu_read_unlock(); 313 rcu_read_unlock();
314 mutex_unlock(&freezer_mutex);
335 315
336 seq_puts(m, freezer_state_strs(css_freezer(css)->state)); 316 seq_puts(m, freezer_state_strs(css_freezer(css)->state));
337 seq_putc(m, '\n'); 317 seq_putc(m, '\n');
@@ -373,7 +353,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze,
373 unsigned int state) 353 unsigned int state)
374{ 354{
375 /* also synchronizes against task migration, see freezer_attach() */ 355 /* also synchronizes against task migration, see freezer_attach() */
376 lockdep_assert_held(&freezer->lock); 356 lockdep_assert_held(&freezer_mutex);
377 357
378 if (!(freezer->state & CGROUP_FREEZER_ONLINE)) 358 if (!(freezer->state & CGROUP_FREEZER_ONLINE))
379 return; 359 return;
@@ -414,47 +394,47 @@ static void freezer_change_state(struct freezer *freezer, bool freeze)
414 * descendant will try to inherit its parent's FREEZING state as 394 * descendant will try to inherit its parent's FREEZING state as
415 * CGROUP_FREEZING_PARENT. 395 * CGROUP_FREEZING_PARENT.
416 */ 396 */
397 mutex_lock(&freezer_mutex);
417 rcu_read_lock(); 398 rcu_read_lock();
418 css_for_each_descendant_pre(pos, &freezer->css) { 399 css_for_each_descendant_pre(pos, &freezer->css) {
419 struct freezer *pos_f = css_freezer(pos); 400 struct freezer *pos_f = css_freezer(pos);
420 struct freezer *parent = parent_freezer(pos_f); 401 struct freezer *parent = parent_freezer(pos_f);
421 402
422 spin_lock_irq(&pos_f->lock); 403 if (!css_tryget_online(pos))
404 continue;
405 rcu_read_unlock();
423 406
424 if (pos_f == freezer) { 407 if (pos_f == freezer)
425 freezer_apply_state(pos_f, freeze, 408 freezer_apply_state(pos_f, freeze,
426 CGROUP_FREEZING_SELF); 409 CGROUP_FREEZING_SELF);
427 } else { 410 else
428 /*
429 * Our update to @parent->state is already visible
430 * which is all we need. No need to lock @parent.
431 * For more info on synchronization, see
432 * freezer_post_create().
433 */
434 freezer_apply_state(pos_f, 411 freezer_apply_state(pos_f,
435 parent->state & CGROUP_FREEZING, 412 parent->state & CGROUP_FREEZING,
436 CGROUP_FREEZING_PARENT); 413 CGROUP_FREEZING_PARENT);
437 }
438 414
439 spin_unlock_irq(&pos_f->lock); 415 rcu_read_lock();
416 css_put(pos);
440 } 417 }
441 rcu_read_unlock(); 418 rcu_read_unlock();
419 mutex_unlock(&freezer_mutex);
442} 420}
443 421
444static int freezer_write(struct cgroup_subsys_state *css, struct cftype *cft, 422static ssize_t freezer_write(struct kernfs_open_file *of,
445 char *buffer) 423 char *buf, size_t nbytes, loff_t off)
446{ 424{
447 bool freeze; 425 bool freeze;
448 426
449 if (strcmp(buffer, freezer_state_strs(0)) == 0) 427 buf = strstrip(buf);
428
429 if (strcmp(buf, freezer_state_strs(0)) == 0)
450 freeze = false; 430 freeze = false;
451 else if (strcmp(buffer, freezer_state_strs(CGROUP_FROZEN)) == 0) 431 else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0)
452 freeze = true; 432 freeze = true;
453 else 433 else
454 return -EINVAL; 434 return -EINVAL;
455 435
456 freezer_change_state(css_freezer(css), freeze); 436 freezer_change_state(css_freezer(of_css(of)), freeze);
457 return 0; 437 return nbytes;
458} 438}
459 439
460static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css, 440static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css,
@@ -478,7 +458,7 @@ static struct cftype files[] = {
478 .name = "state", 458 .name = "state",
479 .flags = CFTYPE_NOT_ON_ROOT, 459 .flags = CFTYPE_NOT_ON_ROOT,
480 .seq_show = freezer_read, 460 .seq_show = freezer_read,
481 .write_string = freezer_write, 461 .write = freezer_write,
482 }, 462 },
483 { 463 {
484 .name = "self_freezing", 464 .name = "self_freezing",
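
The freezer conversion above leans on one idiom twice, in freezer_change_state() and freezer_read(): walk the css hierarchy under RCU, pin each position with css_tryget_online(), drop the RCU lock around work that may block while freezer_mutex is held, then resume the walk. The skeleton, with root_css as a placeholder for &freezer->css and do_blocking_work() standing in for freezer_apply_state()/update_if_frozen(); freezer_read() walks post-order instead, but the locking shape is identical:

	struct cgroup_subsys_state *pos;

	mutex_lock(&freezer_mutex);
	rcu_read_lock();
	css_for_each_descendant_pre(pos, root_css) {
		if (!css_tryget_online(pos))
			continue;		/* already on its way out, skip it */
		rcu_read_unlock();

		do_blocking_work(css_freezer(pos));	/* may sleep; RCU lock is dropped */

		rcu_read_lock();
		css_put(pos);
	}
	rcu_read_unlock();
	mutex_unlock(&freezer_mutex);
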
diff --git a/kernel/compat.c b/kernel/compat.c
index e40b0430b562..633394f442f8 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -157,7 +157,7 @@ static int __compat_put_timespec(const struct timespec *ts, struct compat_timesp
157int compat_get_timeval(struct timeval *tv, const void __user *utv) 157int compat_get_timeval(struct timeval *tv, const void __user *utv)
158{ 158{
159 if (COMPAT_USE_64BIT_TIME) 159 if (COMPAT_USE_64BIT_TIME)
160 return copy_from_user(tv, utv, sizeof *tv) ? -EFAULT : 0; 160 return copy_from_user(tv, utv, sizeof(*tv)) ? -EFAULT : 0;
161 else 161 else
162 return __compat_get_timeval(tv, utv); 162 return __compat_get_timeval(tv, utv);
163} 163}
@@ -166,7 +166,7 @@ EXPORT_SYMBOL_GPL(compat_get_timeval);
166int compat_put_timeval(const struct timeval *tv, void __user *utv) 166int compat_put_timeval(const struct timeval *tv, void __user *utv)
167{ 167{
168 if (COMPAT_USE_64BIT_TIME) 168 if (COMPAT_USE_64BIT_TIME)
169 return copy_to_user(utv, tv, sizeof *tv) ? -EFAULT : 0; 169 return copy_to_user(utv, tv, sizeof(*tv)) ? -EFAULT : 0;
170 else 170 else
171 return __compat_put_timeval(tv, utv); 171 return __compat_put_timeval(tv, utv);
172} 172}
@@ -175,7 +175,7 @@ EXPORT_SYMBOL_GPL(compat_put_timeval);
175int compat_get_timespec(struct timespec *ts, const void __user *uts) 175int compat_get_timespec(struct timespec *ts, const void __user *uts)
176{ 176{
177 if (COMPAT_USE_64BIT_TIME) 177 if (COMPAT_USE_64BIT_TIME)
178 return copy_from_user(ts, uts, sizeof *ts) ? -EFAULT : 0; 178 return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0;
179 else 179 else
180 return __compat_get_timespec(ts, uts); 180 return __compat_get_timespec(ts, uts);
181} 181}
@@ -184,7 +184,7 @@ EXPORT_SYMBOL_GPL(compat_get_timespec);
184int compat_put_timespec(const struct timespec *ts, void __user *uts) 184int compat_put_timespec(const struct timespec *ts, void __user *uts)
185{ 185{
186 if (COMPAT_USE_64BIT_TIME) 186 if (COMPAT_USE_64BIT_TIME)
187 return copy_to_user(uts, ts, sizeof *ts) ? -EFAULT : 0; 187 return copy_to_user(uts, ts, sizeof(*ts)) ? -EFAULT : 0;
188 else 188 else
189 return __compat_put_timespec(ts, uts); 189 return __compat_put_timespec(ts, uts);
190} 190}
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 6cb20d2e7ee0..019d45008448 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -120,7 +120,7 @@ void context_tracking_user_enter(void)
120 * instead of preempt_schedule() to exit user context if needed before 120 * instead of preempt_schedule() to exit user context if needed before
121 * calling the scheduler. 121 * calling the scheduler.
122 */ 122 */
123asmlinkage void __sched notrace preempt_schedule_context(void) 123asmlinkage __visible void __sched notrace preempt_schedule_context(void)
124{ 124{
125 enum ctx_state prev_ctx; 125 enum ctx_state prev_ctx;
126 126
diff --git a/kernel/cpu.c b/kernel/cpu.c
index a9e710eef0e2..a343bde710b1 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -20,6 +20,7 @@
20#include <linux/gfp.h> 20#include <linux/gfp.h>
21#include <linux/suspend.h> 21#include <linux/suspend.h>
22#include <linux/lockdep.h> 22#include <linux/lockdep.h>
23#include <trace/events/power.h>
23 24
24#include "smpboot.h" 25#include "smpboot.h"
25 26
@@ -283,8 +284,7 @@ static inline void check_for_tasks(int cpu)
283 task_cputime(p, &utime, &stime); 284 task_cputime(p, &utime, &stime);
284 if (task_cpu(p) == cpu && p->state == TASK_RUNNING && 285 if (task_cpu(p) == cpu && p->state == TASK_RUNNING &&
285 (utime || stime)) 286 (utime || stime))
286 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d " 287 pr_warn("Task %s (pid = %d) is on cpu %d (state = %ld, flags = %x)\n",
287 "(state = %ld, flags = %x)\n",
288 p->comm, task_pid_nr(p), cpu, 288 p->comm, task_pid_nr(p), cpu,
289 p->state, p->flags); 289 p->state, p->flags);
290 } 290 }
@@ -336,8 +336,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
336 if (err) { 336 if (err) {
337 nr_calls--; 337 nr_calls--;
338 __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL); 338 __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL);
339 printk("%s: attempt to take down CPU %u failed\n", 339 pr_warn("%s: attempt to take down CPU %u failed\n",
340 __func__, cpu); 340 __func__, cpu);
341 goto out_release; 341 goto out_release;
342 } 342 }
343 343
@@ -444,8 +444,8 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen)
444 ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); 444 ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
445 if (ret) { 445 if (ret) {
446 nr_calls--; 446 nr_calls--;
447 printk(KERN_WARNING "%s: attempt to bring up CPU %u failed\n", 447 pr_warn("%s: attempt to bring up CPU %u failed\n",
448 __func__, cpu); 448 __func__, cpu);
449 goto out_notify; 449 goto out_notify;
450 } 450 }
451 451
@@ -475,11 +475,10 @@ int cpu_up(unsigned int cpu)
475 int err = 0; 475 int err = 0;
476 476
477 if (!cpu_possible(cpu)) { 477 if (!cpu_possible(cpu)) {
478 printk(KERN_ERR "can't online cpu %d because it is not " 478 pr_err("can't online cpu %d because it is not configured as may-hotadd at boot time\n",
479 "configured as may-hotadd at boot time\n", cpu); 479 cpu);
480#if defined(CONFIG_IA64) 480#if defined(CONFIG_IA64)
481 printk(KERN_ERR "please check additional_cpus= boot " 481 pr_err("please check additional_cpus= boot parameter\n");
482 "parameter\n");
483#endif 482#endif
484 return -EINVAL; 483 return -EINVAL;
485 } 484 }
@@ -518,16 +517,17 @@ int disable_nonboot_cpus(void)
518 */ 517 */
519 cpumask_clear(frozen_cpus); 518 cpumask_clear(frozen_cpus);
520 519
521 printk("Disabling non-boot CPUs ...\n"); 520 pr_info("Disabling non-boot CPUs ...\n");
522 for_each_online_cpu(cpu) { 521 for_each_online_cpu(cpu) {
523 if (cpu == first_cpu) 522 if (cpu == first_cpu)
524 continue; 523 continue;
524 trace_suspend_resume(TPS("CPU_OFF"), cpu, true);
525 error = _cpu_down(cpu, 1); 525 error = _cpu_down(cpu, 1);
526 trace_suspend_resume(TPS("CPU_OFF"), cpu, false);
526 if (!error) 527 if (!error)
527 cpumask_set_cpu(cpu, frozen_cpus); 528 cpumask_set_cpu(cpu, frozen_cpus);
528 else { 529 else {
529 printk(KERN_ERR "Error taking CPU%d down: %d\n", 530 pr_err("Error taking CPU%d down: %d\n", cpu, error);
530 cpu, error);
531 break; 531 break;
532 } 532 }
533 } 533 }
@@ -537,7 +537,7 @@ int disable_nonboot_cpus(void)
537 /* Make sure the CPUs won't be enabled by someone else */ 537 /* Make sure the CPUs won't be enabled by someone else */
538 cpu_hotplug_disabled = 1; 538 cpu_hotplug_disabled = 1;
539 } else { 539 } else {
540 printk(KERN_ERR "Non-boot CPUs are not disabled\n"); 540 pr_err("Non-boot CPUs are not disabled\n");
541 } 541 }
542 cpu_maps_update_done(); 542 cpu_maps_update_done();
543 return error; 543 return error;
@@ -561,17 +561,19 @@ void __ref enable_nonboot_cpus(void)
561 if (cpumask_empty(frozen_cpus)) 561 if (cpumask_empty(frozen_cpus))
562 goto out; 562 goto out;
563 563
564 printk(KERN_INFO "Enabling non-boot CPUs ...\n"); 564 pr_info("Enabling non-boot CPUs ...\n");
565 565
566 arch_enable_nonboot_cpus_begin(); 566 arch_enable_nonboot_cpus_begin();
567 567
568 for_each_cpu(cpu, frozen_cpus) { 568 for_each_cpu(cpu, frozen_cpus) {
569 trace_suspend_resume(TPS("CPU_ON"), cpu, true);
569 error = _cpu_up(cpu, 1); 570 error = _cpu_up(cpu, 1);
571 trace_suspend_resume(TPS("CPU_ON"), cpu, false);
570 if (!error) { 572 if (!error) {
571 printk(KERN_INFO "CPU%d is up\n", cpu); 573 pr_info("CPU%d is up\n", cpu);
572 continue; 574 continue;
573 } 575 }
574 printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); 576 pr_warn("Error taking CPU%d up: %d\n", cpu, error);
575 } 577 }
576 578
577 arch_enable_nonboot_cpus_end(); 579 arch_enable_nonboot_cpus_end();
@@ -726,10 +728,12 @@ void set_cpu_present(unsigned int cpu, bool present)
726 728
727void set_cpu_online(unsigned int cpu, bool online) 729void set_cpu_online(unsigned int cpu, bool online)
728{ 730{
729 if (online) 731 if (online) {
730 cpumask_set_cpu(cpu, to_cpumask(cpu_online_bits)); 732 cpumask_set_cpu(cpu, to_cpumask(cpu_online_bits));
731 else 733 cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits));
734 } else {
732 cpumask_clear_cpu(cpu, to_cpumask(cpu_online_bits)); 735 cpumask_clear_cpu(cpu, to_cpumask(cpu_online_bits));
736 }
733} 737}
734 738
735void set_cpu_active(unsigned int cpu, bool active) 739void set_cpu_active(unsigned int cpu, bool active)
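/*
 * Illustrative sketch, not part of this patch: the begin/end convention used
 * by the trace_suspend_resume() calls added around _cpu_down()/_cpu_up()
 * above, so suspend time can be attributed per step and per CPU.
 * example_quiesce_device() is a made-up step; the tracepoint and the TPS()
 * string wrapper are the ones used in the hunks above.
 */
#include <trace/events/power.h>

static int example_quiesce_device(int id)
{
	return 0;			/* placeholder for real work */
}

static int example_traced_step(int id)
{
	int error;

	trace_suspend_resume(TPS("example_step"), id, true);	/* start marker */
	error = example_quiesce_device(id);
	trace_suspend_resume(TPS("example_step"), id, false);	/* end marker */

	return error;
}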
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 3d54c418bd06..f6b33c696224 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -61,12 +61,7 @@
61#include <linux/cgroup.h> 61#include <linux/cgroup.h>
62#include <linux/wait.h> 62#include <linux/wait.h>
63 63
64/* 64struct static_key cpusets_enabled_key __read_mostly = STATIC_KEY_INIT_FALSE;
65 * Tracks how many cpusets are currently defined in system.
66 * When there is only one cpuset (the root cpuset) we can
67 * short circuit some hooks.
68 */
69int number_of_cpusets __read_mostly;
70 65
71/* See "Frequency meter" comments, below. */ 66/* See "Frequency meter" comments, below. */
72 67
@@ -124,7 +119,7 @@ static inline struct cpuset *task_cs(struct task_struct *task)
124 119
125static inline struct cpuset *parent_cs(struct cpuset *cs) 120static inline struct cpuset *parent_cs(struct cpuset *cs)
126{ 121{
127 return css_cs(css_parent(&cs->css)); 122 return css_cs(cs->css.parent);
128} 123}
129 124
130#ifdef CONFIG_NUMA 125#ifdef CONFIG_NUMA
@@ -611,7 +606,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
611 goto done; 606 goto done;
612 } 607 }
613 608
614 csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL); 609 csa = kmalloc(nr_cpusets() * sizeof(cp), GFP_KERNEL);
615 if (!csa) 610 if (!csa)
616 goto done; 611 goto done;
617 csn = 0; 612 csn = 0;
@@ -696,11 +691,8 @@ restart:
696 if (nslot == ndoms) { 691 if (nslot == ndoms) {
697 static int warnings = 10; 692 static int warnings = 10;
698 if (warnings) { 693 if (warnings) {
699 printk(KERN_WARNING 694 pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n",
700 "rebuild_sched_domains confused:" 695 nslot, ndoms, csn, i, apn);
701 " nslot %d, ndoms %d, csn %d, i %d,"
702 " apn %d\n",
703 nslot, ndoms, csn, i, apn);
704 warnings--; 696 warnings--;
705 } 697 }
706 continue; 698 continue;
@@ -875,7 +867,7 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root)
875 continue; 867 continue;
876 } 868 }
877 } 869 }
878 if (!css_tryget(&cp->css)) 870 if (!css_tryget_online(&cp->css))
879 continue; 871 continue;
880 rcu_read_unlock(); 872 rcu_read_unlock();
881 873
@@ -890,6 +882,7 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root)
890/** 882/**
891 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it 883 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
892 * @cs: the cpuset to consider 884 * @cs: the cpuset to consider
885 * @trialcs: trial cpuset
893 * @buf: buffer of cpu numbers written to this cpuset 886 * @buf: buffer of cpu numbers written to this cpuset
894 */ 887 */
895static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, 888static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
@@ -1110,7 +1103,7 @@ static void update_tasks_nodemask_hier(struct cpuset *root_cs, bool update_root)
1110 continue; 1103 continue;
1111 } 1104 }
1112 } 1105 }
1113 if (!css_tryget(&cp->css)) 1106 if (!css_tryget_online(&cp->css))
1114 continue; 1107 continue;
1115 rcu_read_unlock(); 1108 rcu_read_unlock();
1116 1109
@@ -1605,13 +1598,15 @@ out_unlock:
1605/* 1598/*
1606 * Common handling for a write to a "cpus" or "mems" file. 1599 * Common handling for a write to a "cpus" or "mems" file.
1607 */ 1600 */
1608static int cpuset_write_resmask(struct cgroup_subsys_state *css, 1601static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
1609 struct cftype *cft, char *buf) 1602 char *buf, size_t nbytes, loff_t off)
1610{ 1603{
1611 struct cpuset *cs = css_cs(css); 1604 struct cpuset *cs = css_cs(of_css(of));
1612 struct cpuset *trialcs; 1605 struct cpuset *trialcs;
1613 int retval = -ENODEV; 1606 int retval = -ENODEV;
1614 1607
1608 buf = strstrip(buf);
1609
1615 /* 1610 /*
1616 * CPU or memory hotunplug may leave @cs w/o any execution 1611 * CPU or memory hotunplug may leave @cs w/o any execution
1617 * resources, in which case the hotplug code asynchronously updates 1612 * resources, in which case the hotplug code asynchronously updates
@@ -1635,7 +1630,7 @@ static int cpuset_write_resmask(struct cgroup_subsys_state *css,
1635 goto out_unlock; 1630 goto out_unlock;
1636 } 1631 }
1637 1632
1638 switch (cft->private) { 1633 switch (of_cft(of)->private) {
1639 case FILE_CPULIST: 1634 case FILE_CPULIST:
1640 retval = update_cpumask(cs, trialcs, buf); 1635 retval = update_cpumask(cs, trialcs, buf);
1641 break; 1636 break;
@@ -1650,7 +1645,7 @@ static int cpuset_write_resmask(struct cgroup_subsys_state *css,
1650 free_trial_cpuset(trialcs); 1645 free_trial_cpuset(trialcs);
1651out_unlock: 1646out_unlock:
1652 mutex_unlock(&cpuset_mutex); 1647 mutex_unlock(&cpuset_mutex);
1653 return retval; 1648 return retval ?: nbytes;
1654} 1649}
1655 1650
1656/* 1651/*
@@ -1752,7 +1747,7 @@ static struct cftype files[] = {
1752 { 1747 {
1753 .name = "cpus", 1748 .name = "cpus",
1754 .seq_show = cpuset_common_seq_show, 1749 .seq_show = cpuset_common_seq_show,
1755 .write_string = cpuset_write_resmask, 1750 .write = cpuset_write_resmask,
1756 .max_write_len = (100U + 6 * NR_CPUS), 1751 .max_write_len = (100U + 6 * NR_CPUS),
1757 .private = FILE_CPULIST, 1752 .private = FILE_CPULIST,
1758 }, 1753 },
@@ -1760,7 +1755,7 @@ static struct cftype files[] = {
1760 { 1755 {
1761 .name = "mems", 1756 .name = "mems",
1762 .seq_show = cpuset_common_seq_show, 1757 .seq_show = cpuset_common_seq_show,
1763 .write_string = cpuset_write_resmask, 1758 .write = cpuset_write_resmask,
1764 .max_write_len = (100U + 6 * MAX_NUMNODES), 1759 .max_write_len = (100U + 6 * MAX_NUMNODES),
1765 .private = FILE_MEMLIST, 1760 .private = FILE_MEMLIST,
1766 }, 1761 },
@@ -1888,7 +1883,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
1888 if (is_spread_slab(parent)) 1883 if (is_spread_slab(parent))
1889 set_bit(CS_SPREAD_SLAB, &cs->flags); 1884 set_bit(CS_SPREAD_SLAB, &cs->flags);
1890 1885
1891 number_of_cpusets++; 1886 cpuset_inc();
1892 1887
1893 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) 1888 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
1894 goto out_unlock; 1889 goto out_unlock;
@@ -1939,7 +1934,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
1939 if (is_sched_load_balance(cs)) 1934 if (is_sched_load_balance(cs))
1940 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); 1935 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
1941 1936
1942 number_of_cpusets--; 1937 cpuset_dec();
1943 clear_bit(CS_ONLINE, &cs->flags); 1938 clear_bit(CS_ONLINE, &cs->flags);
1944 1939
1945 mutex_unlock(&cpuset_mutex); 1940 mutex_unlock(&cpuset_mutex);
@@ -1992,7 +1987,6 @@ int __init cpuset_init(void)
1992 if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)) 1987 if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL))
1993 BUG(); 1988 BUG();
1994 1989
1995 number_of_cpusets = 1;
1996 return 0; 1990 return 0;
1997} 1991}
1998 1992
@@ -2017,7 +2011,7 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
2017 parent = parent_cs(parent); 2011 parent = parent_cs(parent);
2018 2012
2019 if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) { 2013 if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
2020 printk(KERN_ERR "cpuset: failed to transfer tasks out of empty cpuset "); 2014 pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
2021 pr_cont_cgroup_name(cs->css.cgroup); 2015 pr_cont_cgroup_name(cs->css.cgroup);
2022 pr_cont("\n"); 2016 pr_cont("\n");
2023 } 2017 }
@@ -2155,7 +2149,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
2155 2149
2156 rcu_read_lock(); 2150 rcu_read_lock();
2157 cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { 2151 cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
2158 if (cs == &top_cpuset || !css_tryget(&cs->css)) 2152 if (cs == &top_cpuset || !css_tryget_online(&cs->css))
2159 continue; 2153 continue;
2160 rcu_read_unlock(); 2154 rcu_read_unlock();
2161 2155
@@ -2536,7 +2530,7 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
2536 2530
2537/** 2531/**
2538 * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed 2532 * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed
2539 * @task: pointer to task_struct of some task. 2533 * @tsk: pointer to task_struct of some task.
2540 * 2534 *
2541 * Description: Prints @task's name, cpuset name, and cached copy of its 2535 * Description: Prints @task's name, cpuset name, and cached copy of its
2542 * mems_allowed to the kernel log. 2536 * mems_allowed to the kernel log.
@@ -2554,7 +2548,7 @@ void cpuset_print_task_mems_allowed(struct task_struct *tsk)
2554 cgrp = task_cs(tsk)->css.cgroup; 2548 cgrp = task_cs(tsk)->css.cgroup;
2555 nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN, 2549 nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
2556 tsk->mems_allowed); 2550 tsk->mems_allowed);
2557 printk(KERN_INFO "%s cpuset=", tsk->comm); 2551 pr_info("%s cpuset=", tsk->comm);
2558 pr_cont_cgroup_name(cgrp); 2552 pr_cont_cgroup_name(cgrp);
2559 pr_cont(" mems_allowed=%s\n", cpuset_nodelist); 2553 pr_cont(" mems_allowed=%s\n", cpuset_nodelist);
2560 2554
@@ -2646,10 +2640,10 @@ out:
2646/* Display task mems_allowed in /proc/<pid>/status file. */ 2640/* Display task mems_allowed in /proc/<pid>/status file. */
2647void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) 2641void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
2648{ 2642{
2649 seq_printf(m, "Mems_allowed:\t"); 2643 seq_puts(m, "Mems_allowed:\t");
2650 seq_nodemask(m, &task->mems_allowed); 2644 seq_nodemask(m, &task->mems_allowed);
2651 seq_printf(m, "\n"); 2645 seq_puts(m, "\n");
2652 seq_printf(m, "Mems_allowed_list:\t"); 2646 seq_puts(m, "Mems_allowed_list:\t");
2653 seq_nodemask_list(m, &task->mems_allowed); 2647 seq_nodemask_list(m, &task->mems_allowed);
2654 seq_printf(m, "\n"); 2648 seq_puts(m, "\n");
2655} 2649}
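/*
 * Illustrative sketch, not part of this patch: one plausible shape for the
 * helpers the cpuset hunks above switch to (cpusets_enabled(), cpuset_inc(),
 * cpuset_dec(), nr_cpusets()); the real definitions live in
 * include/linux/cpuset.h and may differ.  The point of replacing the
 * number_of_cpusets counter with a static key is that the common-case check
 * becomes a runtime-patched branch instead of a load of a global counter.
 */
#include <linux/atomic.h>
#include <linux/jump_label.h>

extern struct static_key cpusets_enabled_key;

static inline bool cpusets_enabled(void)
{
	return static_key_false(&cpusets_enabled_key);	/* patched out while false */
}

static inline int nr_cpusets(void)
{
	/* jump label reference count + the always-present root cpuset */
	return atomic_read(&cpusets_enabled_key.enabled) + 1;
}

static inline void cpuset_inc(void)
{
	static_key_slow_inc(&cpusets_enabled_key);
}

static inline void cpuset_dec(void)
{
	static_key_slow_dec(&cpusets_enabled_key);
}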
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c
index b03e0e814e43..fe15fff5df53 100644
--- a/kernel/debug/kdb/kdb_bt.c
+++ b/kernel/debug/kdb/kdb_bt.c
@@ -21,7 +21,7 @@
21static void kdb_show_stack(struct task_struct *p, void *addr) 21static void kdb_show_stack(struct task_struct *p, void *addr)
22{ 22{
23 int old_lvl = console_loglevel; 23 int old_lvl = console_loglevel;
24 console_loglevel = 15; 24 console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH;
25 kdb_trap_printk++; 25 kdb_trap_printk++;
26 kdb_set_current_task(p); 26 kdb_set_current_task(p);
27 if (addr) { 27 if (addr) {
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 14ff4849262c..7c70812caea5 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -710,7 +710,7 @@ kdb_printit:
710 } 710 }
711 if (logging) { 711 if (logging) {
712 saved_loglevel = console_loglevel; 712 saved_loglevel = console_loglevel;
713 console_loglevel = 0; 713 console_loglevel = CONSOLE_LOGLEVEL_SILENT;
714 printk(KERN_INFO "%s", kdb_buffer); 714 printk(KERN_INFO "%s", kdb_buffer);
715 } 715 }
716 716
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 0b097c8a1e50..2f7c760305ca 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -1091,7 +1091,7 @@ static int kdb_reboot(int argc, const char **argv)
1091static void kdb_dumpregs(struct pt_regs *regs) 1091static void kdb_dumpregs(struct pt_regs *regs)
1092{ 1092{
1093 int old_lvl = console_loglevel; 1093 int old_lvl = console_loglevel;
1094 console_loglevel = 15; 1094 console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH;
1095 kdb_trap_printk++; 1095 kdb_trap_printk++;
1096 show_regs(regs); 1096 show_regs(regs);
1097 kdb_trap_printk--; 1097 kdb_trap_printk--;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f83a71a3e46d..24d35cc38e42 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -39,6 +39,7 @@
39#include <linux/hw_breakpoint.h> 39#include <linux/hw_breakpoint.h>
40#include <linux/mm_types.h> 40#include <linux/mm_types.h>
41#include <linux/cgroup.h> 41#include <linux/cgroup.h>
42#include <linux/module.h>
42 43
43#include "internal.h" 44#include "internal.h"
44 45
@@ -607,7 +608,8 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
607 if (!f.file) 608 if (!f.file)
608 return -EBADF; 609 return -EBADF;
609 610
610 css = css_tryget_from_dir(f.file->f_dentry, &perf_event_cgrp_subsys); 611 css = css_tryget_online_from_dir(f.file->f_dentry,
612 &perf_event_cgrp_subsys);
611 if (IS_ERR(css)) { 613 if (IS_ERR(css)) {
612 ret = PTR_ERR(css); 614 ret = PTR_ERR(css);
613 goto out; 615 goto out;
@@ -1443,6 +1445,11 @@ group_sched_out(struct perf_event *group_event,
1443 cpuctx->exclusive = 0; 1445 cpuctx->exclusive = 0;
1444} 1446}
1445 1447
1448struct remove_event {
1449 struct perf_event *event;
1450 bool detach_group;
1451};
1452
1446/* 1453/*
1447 * Cross CPU call to remove a performance event 1454 * Cross CPU call to remove a performance event
1448 * 1455 *
@@ -1451,12 +1458,15 @@ group_sched_out(struct perf_event *group_event,
1451 */ 1458 */
1452static int __perf_remove_from_context(void *info) 1459static int __perf_remove_from_context(void *info)
1453{ 1460{
1454 struct perf_event *event = info; 1461 struct remove_event *re = info;
1462 struct perf_event *event = re->event;
1455 struct perf_event_context *ctx = event->ctx; 1463 struct perf_event_context *ctx = event->ctx;
1456 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 1464 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1457 1465
1458 raw_spin_lock(&ctx->lock); 1466 raw_spin_lock(&ctx->lock);
1459 event_sched_out(event, cpuctx, ctx); 1467 event_sched_out(event, cpuctx, ctx);
1468 if (re->detach_group)
1469 perf_group_detach(event);
1460 list_del_event(event, ctx); 1470 list_del_event(event, ctx);
1461 if (!ctx->nr_events && cpuctx->task_ctx == ctx) { 1471 if (!ctx->nr_events && cpuctx->task_ctx == ctx) {
1462 ctx->is_active = 0; 1472 ctx->is_active = 0;
@@ -1481,10 +1491,14 @@ static int __perf_remove_from_context(void *info)
1481 * When called from perf_event_exit_task, it's OK because the 1491 * When called from perf_event_exit_task, it's OK because the
1482 * context has been detached from its task. 1492 * context has been detached from its task.
1483 */ 1493 */
1484static void perf_remove_from_context(struct perf_event *event) 1494static void perf_remove_from_context(struct perf_event *event, bool detach_group)
1485{ 1495{
1486 struct perf_event_context *ctx = event->ctx; 1496 struct perf_event_context *ctx = event->ctx;
1487 struct task_struct *task = ctx->task; 1497 struct task_struct *task = ctx->task;
1498 struct remove_event re = {
1499 .event = event,
1500 .detach_group = detach_group,
1501 };
1488 1502
1489 lockdep_assert_held(&ctx->mutex); 1503 lockdep_assert_held(&ctx->mutex);
1490 1504
@@ -1493,12 +1507,12 @@ static void perf_remove_from_context(struct perf_event *event)
1493 * Per cpu events are removed via an smp call and 1507 * Per cpu events are removed via an smp call and
1494 * the removal is always successful. 1508 * the removal is always successful.
1495 */ 1509 */
1496 cpu_function_call(event->cpu, __perf_remove_from_context, event); 1510 cpu_function_call(event->cpu, __perf_remove_from_context, &re);
1497 return; 1511 return;
1498 } 1512 }
1499 1513
1500retry: 1514retry:
1501 if (!task_function_call(task, __perf_remove_from_context, event)) 1515 if (!task_function_call(task, __perf_remove_from_context, &re))
1502 return; 1516 return;
1503 1517
1504 raw_spin_lock_irq(&ctx->lock); 1518 raw_spin_lock_irq(&ctx->lock);
@@ -1515,6 +1529,8 @@ retry:
1515 * Since the task isn't running, its safe to remove the event, us 1529 * Since the task isn't running, its safe to remove the event, us
1516 * holding the ctx->lock ensures the task won't get scheduled in. 1530 * holding the ctx->lock ensures the task won't get scheduled in.
1517 */ 1531 */
1532 if (detach_group)
1533 perf_group_detach(event);
1518 list_del_event(event, ctx); 1534 list_del_event(event, ctx);
1519 raw_spin_unlock_irq(&ctx->lock); 1535 raw_spin_unlock_irq(&ctx->lock);
1520} 1536}
@@ -1663,6 +1679,8 @@ event_sched_in(struct perf_event *event,
1663 u64 tstamp = perf_event_time(event); 1679 u64 tstamp = perf_event_time(event);
1664 int ret = 0; 1680 int ret = 0;
1665 1681
1682 lockdep_assert_held(&ctx->lock);
1683
1666 if (event->state <= PERF_EVENT_STATE_OFF) 1684 if (event->state <= PERF_EVENT_STATE_OFF)
1667 return 0; 1685 return 0;
1668 1686
@@ -3178,7 +3196,8 @@ static void free_event_rcu(struct rcu_head *head)
3178} 3196}
3179 3197
3180static void ring_buffer_put(struct ring_buffer *rb); 3198static void ring_buffer_put(struct ring_buffer *rb);
3181static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb); 3199static void ring_buffer_attach(struct perf_event *event,
3200 struct ring_buffer *rb);
3182 3201
3183static void unaccount_event_cpu(struct perf_event *event, int cpu) 3202static void unaccount_event_cpu(struct perf_event *event, int cpu)
3184{ 3203{
@@ -3229,17 +3248,19 @@ static void __free_event(struct perf_event *event)
3229 if (event->ctx) 3248 if (event->ctx)
3230 put_ctx(event->ctx); 3249 put_ctx(event->ctx);
3231 3250
3251 if (event->pmu)
3252 module_put(event->pmu->module);
3253
3232 call_rcu(&event->rcu_head, free_event_rcu); 3254 call_rcu(&event->rcu_head, free_event_rcu);
3233} 3255}
3234static void free_event(struct perf_event *event) 3256
3257static void _free_event(struct perf_event *event)
3235{ 3258{
3236 irq_work_sync(&event->pending); 3259 irq_work_sync(&event->pending);
3237 3260
3238 unaccount_event(event); 3261 unaccount_event(event);
3239 3262
3240 if (event->rb) { 3263 if (event->rb) {
3241 struct ring_buffer *rb;
3242
3243 /* 3264 /*
3244 * Can happen when we close an event with re-directed output. 3265 * Can happen when we close an event with re-directed output.
3245 * 3266 *
@@ -3247,57 +3268,38 @@ static void free_event(struct perf_event *event)
3247 * over us; possibly making our ring_buffer_put() the last. 3268 * over us; possibly making our ring_buffer_put() the last.
3248 */ 3269 */
3249 mutex_lock(&event->mmap_mutex); 3270 mutex_lock(&event->mmap_mutex);
3250 rb = event->rb; 3271 ring_buffer_attach(event, NULL);
3251 if (rb) {
3252 rcu_assign_pointer(event->rb, NULL);
3253 ring_buffer_detach(event, rb);
3254 ring_buffer_put(rb); /* could be last */
3255 }
3256 mutex_unlock(&event->mmap_mutex); 3272 mutex_unlock(&event->mmap_mutex);
3257 } 3273 }
3258 3274
3259 if (is_cgroup_event(event)) 3275 if (is_cgroup_event(event))
3260 perf_detach_cgroup(event); 3276 perf_detach_cgroup(event);
3261 3277
3262
3263 __free_event(event); 3278 __free_event(event);
3264} 3279}
3265 3280
3266int perf_event_release_kernel(struct perf_event *event) 3281/*
3282 * Used to free events which have a known refcount of 1, such as in error paths
3283 * where the event isn't exposed yet and inherited events.
3284 */
3285static void free_event(struct perf_event *event)
3267{ 3286{
3268 struct perf_event_context *ctx = event->ctx; 3287 if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
3269 3288 "unexpected event refcount: %ld; ptr=%p\n",
3270 WARN_ON_ONCE(ctx->parent_ctx); 3289 atomic_long_read(&event->refcount), event)) {
3271 /* 3290 /* leak to avoid use-after-free */
3272 * There are two ways this annotation is useful: 3291 return;
3273 * 3292 }
3274 * 1) there is a lock recursion from perf_event_exit_task
3275 * see the comment there.
3276 *
3277 * 2) there is a lock-inversion with mmap_sem through
3278 * perf_event_read_group(), which takes faults while
3279 * holding ctx->mutex, however this is called after
3280 * the last filedesc died, so there is no possibility
3281 * to trigger the AB-BA case.
3282 */
3283 mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
3284 raw_spin_lock_irq(&ctx->lock);
3285 perf_group_detach(event);
3286 raw_spin_unlock_irq(&ctx->lock);
3287 perf_remove_from_context(event);
3288 mutex_unlock(&ctx->mutex);
3289
3290 free_event(event);
3291 3293
3292 return 0; 3294 _free_event(event);
3293} 3295}
3294EXPORT_SYMBOL_GPL(perf_event_release_kernel);
3295 3296
3296/* 3297/*
3297 * Called when the last reference to the file is gone. 3298 * Called when the last reference to the file is gone.
3298 */ 3299 */
3299static void put_event(struct perf_event *event) 3300static void put_event(struct perf_event *event)
3300{ 3301{
3302 struct perf_event_context *ctx = event->ctx;
3301 struct task_struct *owner; 3303 struct task_struct *owner;
3302 3304
3303 if (!atomic_long_dec_and_test(&event->refcount)) 3305 if (!atomic_long_dec_and_test(&event->refcount))
@@ -3336,9 +3338,33 @@ static void put_event(struct perf_event *event)
3336 put_task_struct(owner); 3338 put_task_struct(owner);
3337 } 3339 }
3338 3340
3339 perf_event_release_kernel(event); 3341 WARN_ON_ONCE(ctx->parent_ctx);
3342 /*
3343 * There are two ways this annotation is useful:
3344 *
3345 * 1) there is a lock recursion from perf_event_exit_task
3346 * see the comment there.
3347 *
3348 * 2) there is a lock-inversion with mmap_sem through
3349 * perf_event_read_group(), which takes faults while
3350 * holding ctx->mutex, however this is called after
3351 * the last filedesc died, so there is no possibility
3352 * to trigger the AB-BA case.
3353 */
3354 mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
3355 perf_remove_from_context(event, true);
3356 mutex_unlock(&ctx->mutex);
3357
3358 _free_event(event);
3340} 3359}
3341 3360
3361int perf_event_release_kernel(struct perf_event *event)
3362{
3363 put_event(event);
3364 return 0;
3365}
3366EXPORT_SYMBOL_GPL(perf_event_release_kernel);
3367
3342static int perf_release(struct inode *inode, struct file *file) 3368static int perf_release(struct inode *inode, struct file *file)
3343{ 3369{
3344 put_event(file->private_data); 3370 put_event(file->private_data);
@@ -3839,28 +3865,47 @@ unlock:
3839static void ring_buffer_attach(struct perf_event *event, 3865static void ring_buffer_attach(struct perf_event *event,
3840 struct ring_buffer *rb) 3866 struct ring_buffer *rb)
3841{ 3867{
3868 struct ring_buffer *old_rb = NULL;
3842 unsigned long flags; 3869 unsigned long flags;
3843 3870
3844 if (!list_empty(&event->rb_entry)) 3871 if (event->rb) {
3845 return; 3872 /*
3873 * Should be impossible, we set this when removing
3874 * event->rb_entry and wait/clear when adding event->rb_entry.
3875 */
3876 WARN_ON_ONCE(event->rcu_pending);
3846 3877
3847 spin_lock_irqsave(&rb->event_lock, flags); 3878 old_rb = event->rb;
3848 if (list_empty(&event->rb_entry)) 3879 event->rcu_batches = get_state_synchronize_rcu();
3849 list_add(&event->rb_entry, &rb->event_list); 3880 event->rcu_pending = 1;
3850 spin_unlock_irqrestore(&rb->event_lock, flags);
3851}
3852 3881
3853static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb) 3882 spin_lock_irqsave(&old_rb->event_lock, flags);
3854{ 3883 list_del_rcu(&event->rb_entry);
3855 unsigned long flags; 3884 spin_unlock_irqrestore(&old_rb->event_lock, flags);
3885 }
3856 3886
3857 if (list_empty(&event->rb_entry)) 3887 if (event->rcu_pending && rb) {
3858 return; 3888 cond_synchronize_rcu(event->rcu_batches);
3889 event->rcu_pending = 0;
3890 }
3891
3892 if (rb) {
3893 spin_lock_irqsave(&rb->event_lock, flags);
3894 list_add_rcu(&event->rb_entry, &rb->event_list);
3895 spin_unlock_irqrestore(&rb->event_lock, flags);
3896 }
3897
3898 rcu_assign_pointer(event->rb, rb);
3859 3899
3860 spin_lock_irqsave(&rb->event_lock, flags); 3900 if (old_rb) {
3861 list_del_init(&event->rb_entry); 3901 ring_buffer_put(old_rb);
3862 wake_up_all(&event->waitq); 3902 /*
3863 spin_unlock_irqrestore(&rb->event_lock, flags); 3903 * Since we detached before setting the new rb, so that we
3904 * could attach the new rb, we could have missed a wakeup.
3905 * Provide it now.
3906 */
3907 wake_up_all(&event->waitq);
3908 }
3864} 3909}
3865 3910
3866static void ring_buffer_wakeup(struct perf_event *event) 3911static void ring_buffer_wakeup(struct perf_event *event)
@@ -3929,7 +3974,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
3929{ 3974{
3930 struct perf_event *event = vma->vm_file->private_data; 3975 struct perf_event *event = vma->vm_file->private_data;
3931 3976
3932 struct ring_buffer *rb = event->rb; 3977 struct ring_buffer *rb = ring_buffer_get(event);
3933 struct user_struct *mmap_user = rb->mmap_user; 3978 struct user_struct *mmap_user = rb->mmap_user;
3934 int mmap_locked = rb->mmap_locked; 3979 int mmap_locked = rb->mmap_locked;
3935 unsigned long size = perf_data_size(rb); 3980 unsigned long size = perf_data_size(rb);
@@ -3937,18 +3982,14 @@ static void perf_mmap_close(struct vm_area_struct *vma)
3937 atomic_dec(&rb->mmap_count); 3982 atomic_dec(&rb->mmap_count);
3938 3983
3939 if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) 3984 if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
3940 return; 3985 goto out_put;
3941 3986
3942 /* Detach current event from the buffer. */ 3987 ring_buffer_attach(event, NULL);
3943 rcu_assign_pointer(event->rb, NULL);
3944 ring_buffer_detach(event, rb);
3945 mutex_unlock(&event->mmap_mutex); 3988 mutex_unlock(&event->mmap_mutex);
3946 3989
3947 /* If there's still other mmap()s of this buffer, we're done. */ 3990 /* If there's still other mmap()s of this buffer, we're done. */
3948 if (atomic_read(&rb->mmap_count)) { 3991 if (atomic_read(&rb->mmap_count))
3949 ring_buffer_put(rb); /* can't be last */ 3992 goto out_put;
3950 return;
3951 }
3952 3993
3953 /* 3994 /*
3954 * No other mmap()s, detach from all other events that might redirect 3995 * No other mmap()s, detach from all other events that might redirect
@@ -3978,11 +4019,9 @@ again:
3978 * still restart the iteration to make sure we're not now 4019 * still restart the iteration to make sure we're not now
3979 * iterating the wrong list. 4020 * iterating the wrong list.
3980 */ 4021 */
3981 if (event->rb == rb) { 4022 if (event->rb == rb)
3982 rcu_assign_pointer(event->rb, NULL); 4023 ring_buffer_attach(event, NULL);
3983 ring_buffer_detach(event, rb); 4024
3984 ring_buffer_put(rb); /* can't be last, we still have one */
3985 }
3986 mutex_unlock(&event->mmap_mutex); 4025 mutex_unlock(&event->mmap_mutex);
3987 put_event(event); 4026 put_event(event);
3988 4027
@@ -4007,6 +4046,7 @@ again:
4007 vma->vm_mm->pinned_vm -= mmap_locked; 4046 vma->vm_mm->pinned_vm -= mmap_locked;
4008 free_uid(mmap_user); 4047 free_uid(mmap_user);
4009 4048
4049out_put:
4010 ring_buffer_put(rb); /* could be last */ 4050 ring_buffer_put(rb); /* could be last */
4011} 4051}
4012 4052
@@ -4124,7 +4164,6 @@ again:
4124 vma->vm_mm->pinned_vm += extra; 4164 vma->vm_mm->pinned_vm += extra;
4125 4165
4126 ring_buffer_attach(event, rb); 4166 ring_buffer_attach(event, rb);
4127 rcu_assign_pointer(event->rb, rb);
4128 4167
4129 perf_event_init_userpage(event); 4168 perf_event_init_userpage(event);
4130 perf_event_update_userpage(event); 4169 perf_event_update_userpage(event);
@@ -5408,6 +5447,9 @@ struct swevent_htable {
5408 5447
5409 /* Recursion avoidance in each contexts */ 5448 /* Recursion avoidance in each contexts */
5410 int recursion[PERF_NR_CONTEXTS]; 5449 int recursion[PERF_NR_CONTEXTS];
5450
5451 /* Keeps track of cpu being initialized/exited */
5452 bool online;
5411}; 5453};
5412 5454
5413static DEFINE_PER_CPU(struct swevent_htable, swevent_htable); 5455static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
@@ -5654,8 +5696,14 @@ static int perf_swevent_add(struct perf_event *event, int flags)
5654 hwc->state = !(flags & PERF_EF_START); 5696 hwc->state = !(flags & PERF_EF_START);
5655 5697
5656 head = find_swevent_head(swhash, event); 5698 head = find_swevent_head(swhash, event);
5657 if (WARN_ON_ONCE(!head)) 5699 if (!head) {
5700 /*
5701 * We can race with cpu hotplug code. Do not
5702 * WARN if the cpu just got unplugged.
5703 */
5704 WARN_ON_ONCE(swhash->online);
5658 return -EINVAL; 5705 return -EINVAL;
5706 }
5659 5707
5660 hlist_add_head_rcu(&event->hlist_entry, head); 5708 hlist_add_head_rcu(&event->hlist_entry, head);
5661 5709
@@ -6551,6 +6599,7 @@ free_pdc:
6551 free_percpu(pmu->pmu_disable_count); 6599 free_percpu(pmu->pmu_disable_count);
6552 goto unlock; 6600 goto unlock;
6553} 6601}
6602EXPORT_SYMBOL_GPL(perf_pmu_register);
6554 6603
6555void perf_pmu_unregister(struct pmu *pmu) 6604void perf_pmu_unregister(struct pmu *pmu)
6556{ 6605{
@@ -6572,6 +6621,7 @@ void perf_pmu_unregister(struct pmu *pmu)
6572 put_device(pmu->dev); 6621 put_device(pmu->dev);
6573 free_pmu_context(pmu); 6622 free_pmu_context(pmu);
6574} 6623}
6624EXPORT_SYMBOL_GPL(perf_pmu_unregister);
6575 6625
6576struct pmu *perf_init_event(struct perf_event *event) 6626struct pmu *perf_init_event(struct perf_event *event)
6577{ 6627{
@@ -6585,6 +6635,10 @@ struct pmu *perf_init_event(struct perf_event *event)
6585 pmu = idr_find(&pmu_idr, event->attr.type); 6635 pmu = idr_find(&pmu_idr, event->attr.type);
6586 rcu_read_unlock(); 6636 rcu_read_unlock();
6587 if (pmu) { 6637 if (pmu) {
6638 if (!try_module_get(pmu->module)) {
6639 pmu = ERR_PTR(-ENODEV);
6640 goto unlock;
6641 }
6588 event->pmu = pmu; 6642 event->pmu = pmu;
6589 ret = pmu->event_init(event); 6643 ret = pmu->event_init(event);
6590 if (ret) 6644 if (ret)
@@ -6593,6 +6647,10 @@ struct pmu *perf_init_event(struct perf_event *event)
6593 } 6647 }
6594 6648
6595 list_for_each_entry_rcu(pmu, &pmus, entry) { 6649 list_for_each_entry_rcu(pmu, &pmus, entry) {
6650 if (!try_module_get(pmu->module)) {
6651 pmu = ERR_PTR(-ENODEV);
6652 goto unlock;
6653 }
6596 event->pmu = pmu; 6654 event->pmu = pmu;
6597 ret = pmu->event_init(event); 6655 ret = pmu->event_init(event);
6598 if (!ret) 6656 if (!ret)
@@ -6771,6 +6829,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
6771err_pmu: 6829err_pmu:
6772 if (event->destroy) 6830 if (event->destroy)
6773 event->destroy(event); 6831 event->destroy(event);
6832 module_put(pmu->module);
6774err_ns: 6833err_ns:
6775 if (event->ns) 6834 if (event->ns)
6776 put_pid_ns(event->ns); 6835 put_pid_ns(event->ns);
@@ -6914,7 +6973,7 @@ err_size:
6914static int 6973static int
6915perf_event_set_output(struct perf_event *event, struct perf_event *output_event) 6974perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
6916{ 6975{
6917 struct ring_buffer *rb = NULL, *old_rb = NULL; 6976 struct ring_buffer *rb = NULL;
6918 int ret = -EINVAL; 6977 int ret = -EINVAL;
6919 6978
6920 if (!output_event) 6979 if (!output_event)
@@ -6942,8 +7001,6 @@ set:
6942 if (atomic_read(&event->mmap_count)) 7001 if (atomic_read(&event->mmap_count))
6943 goto unlock; 7002 goto unlock;
6944 7003
6945 old_rb = event->rb;
6946
6947 if (output_event) { 7004 if (output_event) {
6948 /* get the rb we want to redirect to */ 7005 /* get the rb we want to redirect to */
6949 rb = ring_buffer_get(output_event); 7006 rb = ring_buffer_get(output_event);
@@ -6951,23 +7008,7 @@ set:
6951 goto unlock; 7008 goto unlock;
6952 } 7009 }
6953 7010
6954 if (old_rb) 7011 ring_buffer_attach(event, rb);
6955 ring_buffer_detach(event, old_rb);
6956
6957 if (rb)
6958 ring_buffer_attach(event, rb);
6959
6960 rcu_assign_pointer(event->rb, rb);
6961
6962 if (old_rb) {
6963 ring_buffer_put(old_rb);
6964 /*
6965 * Since we detached before setting the new rb, so that we
6966 * could attach the new rb, we could have missed a wakeup.
6967 * Provide it now.
6968 */
6969 wake_up_all(&event->waitq);
6970 }
6971 7012
6972 ret = 0; 7013 ret = 0;
6973unlock: 7014unlock:
@@ -7018,6 +7059,9 @@ SYSCALL_DEFINE5(perf_event_open,
7018 if (attr.freq) { 7059 if (attr.freq) {
7019 if (attr.sample_freq > sysctl_perf_event_sample_rate) 7060 if (attr.sample_freq > sysctl_perf_event_sample_rate)
7020 return -EINVAL; 7061 return -EINVAL;
7062 } else {
7063 if (attr.sample_period & (1ULL << 63))
7064 return -EINVAL;
7021 } 7065 }
7022 7066
7023 /* 7067 /*
@@ -7055,20 +7099,26 @@ SYSCALL_DEFINE5(perf_event_open,
7055 } 7099 }
7056 } 7100 }
7057 7101
7102 if (task && group_leader &&
7103 group_leader->attr.inherit != attr.inherit) {
7104 err = -EINVAL;
7105 goto err_task;
7106 }
7107
7058 get_online_cpus(); 7108 get_online_cpus();
7059 7109
7060 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, 7110 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
7061 NULL, NULL); 7111 NULL, NULL);
7062 if (IS_ERR(event)) { 7112 if (IS_ERR(event)) {
7063 err = PTR_ERR(event); 7113 err = PTR_ERR(event);
7064 goto err_task; 7114 goto err_cpus;
7065 } 7115 }
7066 7116
7067 if (flags & PERF_FLAG_PID_CGROUP) { 7117 if (flags & PERF_FLAG_PID_CGROUP) {
7068 err = perf_cgroup_connect(pid, event, &attr, group_leader); 7118 err = perf_cgroup_connect(pid, event, &attr, group_leader);
7069 if (err) { 7119 if (err) {
7070 __free_event(event); 7120 __free_event(event);
7071 goto err_task; 7121 goto err_cpus;
7072 } 7122 }
7073 } 7123 }
7074 7124
@@ -7165,7 +7215,7 @@ SYSCALL_DEFINE5(perf_event_open,
7165 struct perf_event_context *gctx = group_leader->ctx; 7215 struct perf_event_context *gctx = group_leader->ctx;
7166 7216
7167 mutex_lock(&gctx->mutex); 7217 mutex_lock(&gctx->mutex);
7168 perf_remove_from_context(group_leader); 7218 perf_remove_from_context(group_leader, false);
7169 7219
7170 /* 7220 /*
7171 * Removing from the context ends up with disabled 7221 * Removing from the context ends up with disabled
@@ -7175,7 +7225,7 @@ SYSCALL_DEFINE5(perf_event_open,
7175 perf_event__state_init(group_leader); 7225 perf_event__state_init(group_leader);
7176 list_for_each_entry(sibling, &group_leader->sibling_list, 7226 list_for_each_entry(sibling, &group_leader->sibling_list,
7177 group_entry) { 7227 group_entry) {
7178 perf_remove_from_context(sibling); 7228 perf_remove_from_context(sibling, false);
7179 perf_event__state_init(sibling); 7229 perf_event__state_init(sibling);
7180 put_ctx(gctx); 7230 put_ctx(gctx);
7181 } 7231 }
@@ -7230,8 +7280,9 @@ err_context:
7230 put_ctx(ctx); 7280 put_ctx(ctx);
7231err_alloc: 7281err_alloc:
7232 free_event(event); 7282 free_event(event);
7233err_task: 7283err_cpus:
7234 put_online_cpus(); 7284 put_online_cpus();
7285err_task:
7235 if (task) 7286 if (task)
7236 put_task_struct(task); 7287 put_task_struct(task);
7237err_group_fd: 7288err_group_fd:
@@ -7305,7 +7356,7 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
7305 mutex_lock(&src_ctx->mutex); 7356 mutex_lock(&src_ctx->mutex);
7306 list_for_each_entry_safe(event, tmp, &src_ctx->event_list, 7357 list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
7307 event_entry) { 7358 event_entry) {
7308 perf_remove_from_context(event); 7359 perf_remove_from_context(event, false);
7309 unaccount_event_cpu(event, src_cpu); 7360 unaccount_event_cpu(event, src_cpu);
7310 put_ctx(src_ctx); 7361 put_ctx(src_ctx);
7311 list_add(&event->migrate_entry, &events); 7362 list_add(&event->migrate_entry, &events);
@@ -7367,13 +7418,7 @@ __perf_event_exit_task(struct perf_event *child_event,
7367 struct perf_event_context *child_ctx, 7418 struct perf_event_context *child_ctx,
7368 struct task_struct *child) 7419 struct task_struct *child)
7369{ 7420{
7370 if (child_event->parent) { 7421 perf_remove_from_context(child_event, true);
7371 raw_spin_lock_irq(&child_ctx->lock);
7372 perf_group_detach(child_event);
7373 raw_spin_unlock_irq(&child_ctx->lock);
7374 }
7375
7376 perf_remove_from_context(child_event);
7377 7422
7378 /* 7423 /*
7379 * It can happen that the parent exits first, and has events 7424 * It can happen that the parent exits first, and has events
@@ -7388,7 +7433,7 @@ __perf_event_exit_task(struct perf_event *child_event,
7388 7433
7389static void perf_event_exit_task_context(struct task_struct *child, int ctxn) 7434static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
7390{ 7435{
7391 struct perf_event *child_event, *tmp; 7436 struct perf_event *child_event;
7392 struct perf_event_context *child_ctx; 7437 struct perf_event_context *child_ctx;
7393 unsigned long flags; 7438 unsigned long flags;
7394 7439
@@ -7442,24 +7487,9 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
7442 */ 7487 */
7443 mutex_lock(&child_ctx->mutex); 7488 mutex_lock(&child_ctx->mutex);
7444 7489
7445again: 7490 list_for_each_entry_rcu(child_event, &child_ctx->event_list, event_entry)
7446 list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups,
7447 group_entry)
7448 __perf_event_exit_task(child_event, child_ctx, child); 7491 __perf_event_exit_task(child_event, child_ctx, child);
7449 7492
7450 list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups,
7451 group_entry)
7452 __perf_event_exit_task(child_event, child_ctx, child);
7453
7454 /*
7455 * If the last event was a group event, it will have appended all
7456 * its siblings to the list, but we obtained 'tmp' before that which
7457 * will still point to the list head terminating the iteration.
7458 */
7459 if (!list_empty(&child_ctx->pinned_groups) ||
7460 !list_empty(&child_ctx->flexible_groups))
7461 goto again;
7462
7463 mutex_unlock(&child_ctx->mutex); 7493 mutex_unlock(&child_ctx->mutex);
7464 7494
7465 put_ctx(child_ctx); 7495 put_ctx(child_ctx);
@@ -7724,6 +7754,8 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
7724 * swapped under us. 7754 * swapped under us.
7725 */ 7755 */
7726 parent_ctx = perf_pin_task_context(parent, ctxn); 7756 parent_ctx = perf_pin_task_context(parent, ctxn);
7757 if (!parent_ctx)
7758 return 0;
7727 7759
7728 /* 7760 /*
7729 * No need to check if parent_ctx != NULL here; since we saw 7761 * No need to check if parent_ctx != NULL here; since we saw
@@ -7835,6 +7867,7 @@ static void perf_event_init_cpu(int cpu)
7835 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); 7867 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
7836 7868
7837 mutex_lock(&swhash->hlist_mutex); 7869 mutex_lock(&swhash->hlist_mutex);
7870 swhash->online = true;
7838 if (swhash->hlist_refcount > 0) { 7871 if (swhash->hlist_refcount > 0) {
7839 struct swevent_hlist *hlist; 7872 struct swevent_hlist *hlist;
7840 7873
@@ -7857,14 +7890,14 @@ static void perf_pmu_rotate_stop(struct pmu *pmu)
7857 7890
7858static void __perf_event_exit_context(void *__info) 7891static void __perf_event_exit_context(void *__info)
7859{ 7892{
7893 struct remove_event re = { .detach_group = false };
7860 struct perf_event_context *ctx = __info; 7894 struct perf_event_context *ctx = __info;
7861 struct perf_event *event;
7862 7895
7863 perf_pmu_rotate_stop(ctx->pmu); 7896 perf_pmu_rotate_stop(ctx->pmu);
7864 7897
7865 rcu_read_lock(); 7898 rcu_read_lock();
7866 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) 7899 list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry)
7867 __perf_remove_from_context(event); 7900 __perf_remove_from_context(&re);
7868 rcu_read_unlock(); 7901 rcu_read_unlock();
7869} 7902}
7870 7903
@@ -7892,6 +7925,7 @@ static void perf_event_exit_cpu(int cpu)
7892 perf_event_exit_cpu_context(cpu); 7925 perf_event_exit_cpu_context(cpu);
7893 7926
7894 mutex_lock(&swhash->hlist_mutex); 7927 mutex_lock(&swhash->hlist_mutex);
7928 swhash->online = false;
7895 swevent_hlist_release(swhash); 7929 swevent_hlist_release(swhash);
7896 mutex_unlock(&swhash->hlist_mutex); 7930 mutex_unlock(&swhash->hlist_mutex);
7897} 7931}
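/*
 * Illustrative sketch, not part of this patch: the argument-bundling pattern
 * behind "struct remove_event" above.  Cross-CPU helpers only pass a single
 * void *info to the callback, so extra flags travel in an on-stack struct.
 * All "example_*" names are made up; smp_call_function_single() is the
 * generic kernel primitive for running a function on another CPU.
 */
#include <linux/printk.h>
#include <linux/smp.h>

struct example_args {
	int	id;
	bool	detach;
};

static void example_remote_func(void *info)
{
	struct example_args *args = info;

	/* runs on the target CPU with both arguments available */
	if (args->detach)
		pr_debug("example: detaching %d\n", args->id);
}

static int example_call_on_cpu(int cpu, int id, bool detach)
{
	struct example_args args = {
		.id	= id,
		.detach	= detach,
	};

	/* wait=1: the on-stack struct must outlive the remote call */
	return smp_call_function_single(cpu, example_remote_func, &args, 1);
}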
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 04709b66369d..adcd76a96839 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -60,8 +60,6 @@ static struct percpu_rw_semaphore dup_mmap_sem;
60 60
61/* Have a copy of original instruction */ 61/* Have a copy of original instruction */
62#define UPROBE_COPY_INSN 0 62#define UPROBE_COPY_INSN 0
63/* Can skip singlestep */
64#define UPROBE_SKIP_SSTEP 1
65 63
66struct uprobe { 64struct uprobe {
67 struct rb_node rb_node; /* node in the rb tree */ 65 struct rb_node rb_node; /* node in the rb tree */
@@ -491,12 +489,9 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
491 uprobe->offset = offset; 489 uprobe->offset = offset;
492 init_rwsem(&uprobe->register_rwsem); 490 init_rwsem(&uprobe->register_rwsem);
493 init_rwsem(&uprobe->consumer_rwsem); 491 init_rwsem(&uprobe->consumer_rwsem);
494 /* For now assume that the instruction need not be single-stepped */
495 __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags);
496 492
497 /* add to uprobes_tree, sorted on inode:offset */ 493 /* add to uprobes_tree, sorted on inode:offset */
498 cur_uprobe = insert_uprobe(uprobe); 494 cur_uprobe = insert_uprobe(uprobe);
499
500 /* a uprobe exists for this inode:offset combination */ 495 /* a uprobe exists for this inode:offset combination */
501 if (cur_uprobe) { 496 if (cur_uprobe) {
502 kfree(uprobe); 497 kfree(uprobe);
@@ -1296,14 +1291,8 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
1296 if (unlikely(!xol_vaddr)) 1291 if (unlikely(!xol_vaddr))
1297 return 0; 1292 return 0;
1298 1293
1299 /* Initialize the slot */ 1294 arch_uprobe_copy_ixol(area->page, xol_vaddr,
1300 copy_to_page(area->page, xol_vaddr, 1295 &uprobe->arch.ixol, sizeof(uprobe->arch.ixol));
1301 &uprobe->arch.ixol, sizeof(uprobe->arch.ixol));
1302 /*
1303 * We probably need flush_icache_user_range() but it needs vma.
1304 * This should work on supported architectures too.
1305 */
1306 flush_dcache_page(area->page);
1307 1296
1308 return xol_vaddr; 1297 return xol_vaddr;
1309} 1298}
@@ -1346,6 +1335,21 @@ static void xol_free_insn_slot(struct task_struct *tsk)
1346 } 1335 }
1347} 1336}
1348 1337
1338void __weak arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr,
1339 void *src, unsigned long len)
1340{
1341 /* Initialize the slot */
1342 copy_to_page(page, vaddr, src, len);
1343
1344 /*
1345 * We probably need flush_icache_user_range() but it needs vma.
1346 * This should work on most of architectures by default. If
1347 * architecture needs to do something different it can define
1348 * its own version of the function.
1349 */
1350 flush_dcache_page(page);
1351}
1352
1349/** 1353/**
1350 * uprobe_get_swbp_addr - compute address of swbp given post-swbp regs 1354 * uprobe_get_swbp_addr - compute address of swbp given post-swbp regs
1351 * @regs: Reflects the saved state of the task after it has hit a breakpoint 1355 * @regs: Reflects the saved state of the task after it has hit a breakpoint
@@ -1628,20 +1632,6 @@ bool uprobe_deny_signal(void)
1628 return true; 1632 return true;
1629} 1633}
1630 1634
1631/*
1632 * Avoid singlestepping the original instruction if the original instruction
1633 * is a NOP or can be emulated.
1634 */
1635static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs)
1636{
1637 if (test_bit(UPROBE_SKIP_SSTEP, &uprobe->flags)) {
1638 if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
1639 return true;
1640 clear_bit(UPROBE_SKIP_SSTEP, &uprobe->flags);
1641 }
1642 return false;
1643}
1644
1645static void mmf_recalc_uprobes(struct mm_struct *mm) 1635static void mmf_recalc_uprobes(struct mm_struct *mm)
1646{ 1636{
1647 struct vm_area_struct *vma; 1637 struct vm_area_struct *vma;
@@ -1868,13 +1858,13 @@ static void handle_swbp(struct pt_regs *regs)
1868 1858
1869 handler_chain(uprobe, regs); 1859 handler_chain(uprobe, regs);
1870 1860
1871 if (can_skip_sstep(uprobe, regs)) 1861 if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
1872 goto out; 1862 goto out;
1873 1863
1874 if (!pre_ssout(uprobe, regs, bp_vaddr)) 1864 if (!pre_ssout(uprobe, regs, bp_vaddr))
1875 return; 1865 return;
1876 1866
1877 /* can_skip_sstep() succeeded, or restart if can't singlestep */ 1867 /* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */
1878out: 1868out:
1879 put_uprobe(uprobe); 1869 put_uprobe(uprobe);
1880} 1870}
@@ -1886,10 +1876,11 @@ out:
1886static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs) 1876static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
1887{ 1877{
1888 struct uprobe *uprobe; 1878 struct uprobe *uprobe;
1879 int err = 0;
1889 1880
1890 uprobe = utask->active_uprobe; 1881 uprobe = utask->active_uprobe;
1891 if (utask->state == UTASK_SSTEP_ACK) 1882 if (utask->state == UTASK_SSTEP_ACK)
1892 arch_uprobe_post_xol(&uprobe->arch, regs); 1883 err = arch_uprobe_post_xol(&uprobe->arch, regs);
1893 else if (utask->state == UTASK_SSTEP_TRAPPED) 1884 else if (utask->state == UTASK_SSTEP_TRAPPED)
1894 arch_uprobe_abort_xol(&uprobe->arch, regs); 1885 arch_uprobe_abort_xol(&uprobe->arch, regs);
1895 else 1886 else
@@ -1903,6 +1894,11 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
1903 spin_lock_irq(&current->sighand->siglock); 1894 spin_lock_irq(&current->sighand->siglock);
1904 recalc_sigpending(); /* see uprobe_deny_signal() */ 1895 recalc_sigpending(); /* see uprobe_deny_signal() */
1905 spin_unlock_irq(&current->sighand->siglock); 1896 spin_unlock_irq(&current->sighand->siglock);
1897
1898 if (unlikely(err)) {
1899 uprobe_warn(current, "execute the probed insn, sending SIGILL.");
1900 force_sig_info(SIGILL, SEND_SIG_FORCED, current);
1901 }
1906} 1902}
1907 1903
1908/* 1904/*
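/*
 * Illustrative sketch, not part of this patch: how an architecture could
 * override the new __weak arch_uprobe_copy_ixol() hook above.  The body is
 * hypothetical (a made-up arch wanting an explicit I-cache flush); real
 * overrides live under arch/<arch>/ and may need to flush through the user
 * mapping rather than the kernel one.
 */
#include <linux/highmem.h>
#include <linux/string.h>
#include <linux/uprobes.h>
#include <asm/cacheflush.h>

void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr,
			   void *src, unsigned long len)
{
	void *kaddr = kmap_atomic(page);
	void *dst = kaddr + (vaddr & ~PAGE_MASK);

	memcpy(dst, src, len);
	/* hypothetical: push the copied instructions out to the I-cache */
	flush_icache_range((unsigned long)dst, (unsigned long)dst + len);

	kunmap_atomic(kaddr);
}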
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index 0dbeae374225..83d4382f5699 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -37,7 +37,7 @@ static unsigned long ident_map[32] = {
37struct exec_domain default_exec_domain = { 37struct exec_domain default_exec_domain = {
38 .name = "Linux", /* name */ 38 .name = "Linux", /* name */
39 .handler = default_handler, /* lcall7 causes a seg fault. */ 39 .handler = default_handler, /* lcall7 causes a seg fault. */
40 .pers_low = 0, /* PER_LINUX personality. */ 40 .pers_low = 0, /* PER_LINUX personality. */
41 .pers_high = 0, /* PER_LINUX personality. */ 41 .pers_high = 0, /* PER_LINUX personality. */
42 .signal_map = ident_map, /* Identity map signals. */ 42 .signal_map = ident_map, /* Identity map signals. */
43 .signal_invmap = ident_map, /* - both ways. */ 43 .signal_invmap = ident_map, /* - both ways. */
@@ -83,7 +83,7 @@ lookup_exec_domain(unsigned int personality)
83 ep = &default_exec_domain; 83 ep = &default_exec_domain;
84out: 84out:
85 read_unlock(&exec_domains_lock); 85 read_unlock(&exec_domains_lock);
86 return (ep); 86 return ep;
87} 87}
88 88
89int 89int
@@ -110,8 +110,9 @@ register_exec_domain(struct exec_domain *ep)
110 110
111out: 111out:
112 write_unlock(&exec_domains_lock); 112 write_unlock(&exec_domains_lock);
113 return (err); 113 return err;
114} 114}
115EXPORT_SYMBOL(register_exec_domain);
115 116
116int 117int
117unregister_exec_domain(struct exec_domain *ep) 118unregister_exec_domain(struct exec_domain *ep)
@@ -133,6 +134,7 @@ unregister:
133 write_unlock(&exec_domains_lock); 134 write_unlock(&exec_domains_lock);
134 return 0; 135 return 0;
135} 136}
137EXPORT_SYMBOL(unregister_exec_domain);
136 138
137int __set_personality(unsigned int personality) 139int __set_personality(unsigned int personality)
138{ 140{
@@ -144,6 +146,7 @@ int __set_personality(unsigned int personality)
144 146
145 return 0; 147 return 0;
146} 148}
149EXPORT_SYMBOL(__set_personality);
147 150
148#ifdef CONFIG_PROC_FS 151#ifdef CONFIG_PROC_FS
149static int execdomains_proc_show(struct seq_file *m, void *v) 152static int execdomains_proc_show(struct seq_file *m, void *v)
@@ -188,8 +191,3 @@ SYSCALL_DEFINE1(personality, unsigned int, personality)
188 191
189 return old; 192 return old;
190} 193}
191
192
193EXPORT_SYMBOL(register_exec_domain);
194EXPORT_SYMBOL(unregister_exec_domain);
195EXPORT_SYMBOL(__set_personality);
diff --git a/kernel/exit.c b/kernel/exit.c
index 6ed6a1d552b5..e5c4668f1799 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -313,46 +313,7 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
313 } 313 }
314} 314}
315 315
316/* 316#ifdef CONFIG_MEMCG
317 * Let kernel threads use this to say that they allow a certain signal.
318 * Must not be used if kthread was cloned with CLONE_SIGHAND.
319 */
320int allow_signal(int sig)
321{
322 if (!valid_signal(sig) || sig < 1)
323 return -EINVAL;
324
325 spin_lock_irq(&current->sighand->siglock);
326 /* This is only needed for daemonize()'ed kthreads */
327 sigdelset(&current->blocked, sig);
328 /*
329 * Kernel threads handle their own signals. Let the signal code
330 * know it'll be handled, so that they don't get converted to
331 * SIGKILL or just silently dropped.
332 */
333 current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2;
334 recalc_sigpending();
335 spin_unlock_irq(&current->sighand->siglock);
336 return 0;
337}
338
339EXPORT_SYMBOL(allow_signal);
340
341int disallow_signal(int sig)
342{
343 if (!valid_signal(sig) || sig < 1)
344 return -EINVAL;
345
346 spin_lock_irq(&current->sighand->siglock);
347 current->sighand->action[(sig)-1].sa.sa_handler = SIG_IGN;
348 recalc_sigpending();
349 spin_unlock_irq(&current->sighand->siglock);
350 return 0;
351}
352
353EXPORT_SYMBOL(disallow_signal);
354
355#ifdef CONFIG_MM_OWNER
356/* 317/*
357 * A task is exiting. If it owned this mm, find a new owner for the mm. 318 * A task is exiting. If it owned this mm, find a new owner for the mm.
358 */ 319 */
@@ -395,14 +356,18 @@ retry:
395 } 356 }
396 357
397 /* 358 /*
398 * Search through everything else. We should not get 359 * Search through everything else, we should not get here often.
399 * here often
400 */ 360 */
401 do_each_thread(g, c) { 361 for_each_process(g) {
402 if (c->mm == mm) 362 if (g->flags & PF_KTHREAD)
403 goto assign_new_owner; 363 continue;
404 } while_each_thread(g, c); 364 for_each_thread(g, c) {
405 365 if (c->mm == mm)
366 goto assign_new_owner;
367 if (c->mm)
368 break;
369 }
370 }
406 read_unlock(&tasklist_lock); 371 read_unlock(&tasklist_lock);
407 /* 372 /*
408 * We found no owner yet mm_users > 1: this implies that we are 373 * We found no owner yet mm_users > 1: this implies that we are
@@ -434,7 +399,7 @@ assign_new_owner:
434 task_unlock(c); 399 task_unlock(c);
435 put_task_struct(c); 400 put_task_struct(c);
436} 401}
437#endif /* CONFIG_MM_OWNER */ 402#endif /* CONFIG_MEMCG */
438 403
439/* 404/*
440 * Turn us into a lazy TLB process if we 405 * Turn us into a lazy TLB process if we
diff --git a/kernel/fork.c b/kernel/fork.c
index 54a8d26f612f..d2799d1fc952 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -150,15 +150,15 @@ void __weak arch_release_thread_info(struct thread_info *ti)
150static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, 150static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
151 int node) 151 int node)
152{ 152{
153 struct page *page = alloc_pages_node(node, THREADINFO_GFP_ACCOUNTED, 153 struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP,
154 THREAD_SIZE_ORDER); 154 THREAD_SIZE_ORDER);
155 155
156 return page ? page_address(page) : NULL; 156 return page ? page_address(page) : NULL;
157} 157}
158 158
159static inline void free_thread_info(struct thread_info *ti) 159static inline void free_thread_info(struct thread_info *ti)
160{ 160{
161 free_memcg_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER); 161 free_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER);
162} 162}
163# else 163# else
164static struct kmem_cache *thread_info_cache; 164static struct kmem_cache *thread_info_cache;
@@ -1099,12 +1099,12 @@ static void rt_mutex_init_task(struct task_struct *p)
1099#endif 1099#endif
1100} 1100}
1101 1101
1102#ifdef CONFIG_MM_OWNER 1102#ifdef CONFIG_MEMCG
1103void mm_init_owner(struct mm_struct *mm, struct task_struct *p) 1103void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
1104{ 1104{
1105 mm->owner = p; 1105 mm->owner = p;
1106} 1106}
1107#endif /* CONFIG_MM_OWNER */ 1107#endif /* CONFIG_MEMCG */
1108 1108
1109/* 1109/*
1110 * Initialize POSIX timer handling for a single task. 1110 * Initialize POSIX timer handling for a single task.
@@ -1606,10 +1606,12 @@ long do_fork(unsigned long clone_flags,
1606 */ 1606 */
1607 if (!IS_ERR(p)) { 1607 if (!IS_ERR(p)) {
1608 struct completion vfork; 1608 struct completion vfork;
1609 struct pid *pid;
1609 1610
1610 trace_sched_process_fork(current, p); 1611 trace_sched_process_fork(current, p);
1611 1612
1612 nr = task_pid_vnr(p); 1613 pid = get_task_pid(p, PIDTYPE_PID);
1614 nr = pid_vnr(pid);
1613 1615
1614 if (clone_flags & CLONE_PARENT_SETTID) 1616 if (clone_flags & CLONE_PARENT_SETTID)
1615 put_user(nr, parent_tidptr); 1617 put_user(nr, parent_tidptr);
@@ -1624,12 +1626,14 @@ long do_fork(unsigned long clone_flags,
1624 1626
1625 /* forking complete and child started to run, tell ptracer */ 1627 /* forking complete and child started to run, tell ptracer */
1626 if (unlikely(trace)) 1628 if (unlikely(trace))
1627 ptrace_event(trace, nr); 1629 ptrace_event_pid(trace, pid);
1628 1630
1629 if (clone_flags & CLONE_VFORK) { 1631 if (clone_flags & CLONE_VFORK) {
1630 if (!wait_for_vfork_done(p, &vfork)) 1632 if (!wait_for_vfork_done(p, &vfork))
1631 ptrace_event(PTRACE_EVENT_VFORK_DONE, nr); 1633 ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
1632 } 1634 }
1635
1636 put_pid(pid);
1633 } else { 1637 } else {
1634 nr = PTR_ERR(p); 1638 nr = PTR_ERR(p);
1635 } 1639 }
diff --git a/kernel/futex.c b/kernel/futex.c
index b991ec05b8f9..b632b5f3f094 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -743,6 +743,55 @@ void exit_pi_state_list(struct task_struct *curr)
743 raw_spin_unlock_irq(&curr->pi_lock); 743 raw_spin_unlock_irq(&curr->pi_lock);
744} 744}
745 745
746/*
747 * We need to check the following states:
748 *
749 * Waiter | pi_state | pi->owner | uTID | uODIED | ?
750 *
751 * [1] NULL | --- | --- | 0 | 0/1 | Valid
752 * [2] NULL | --- | --- | >0 | 0/1 | Valid
753 *
754 * [3] Found | NULL | -- | Any | 0/1 | Invalid
755 *
756 * [4] Found | Found | NULL | 0 | 1 | Valid
757 * [5] Found | Found | NULL | >0 | 1 | Invalid
758 *
759 * [6] Found | Found | task | 0 | 1 | Valid
760 *
761 * [7] Found | Found | NULL | Any | 0 | Invalid
762 *
763 * [8] Found | Found | task | ==taskTID | 0/1 | Valid
764 * [9] Found | Found | task | 0 | 0 | Invalid
765 * [10] Found | Found | task | !=taskTID | 0/1 | Invalid
766 *
767 * [1] Indicates that the kernel can acquire the futex atomically. We
 768 * came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
769 *
770 * [2] Valid, if TID does not belong to a kernel thread. If no matching
771 * thread is found then it indicates that the owner TID has died.
772 *
773 * [3] Invalid. The waiter is queued on a non PI futex
774 *
775 * [4] Valid state after exit_robust_list(), which sets the user space
776 * value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
777 *
778 * [5] The user space value got manipulated between exit_robust_list()
779 * and exit_pi_state_list()
780 *
781 * [6] Valid state after exit_pi_state_list() which sets the new owner in
782 * the pi_state but cannot access the user space value.
783 *
784 * [7] pi_state->owner can only be NULL when the OWNER_DIED bit is set.
785 *
786 * [8] Owner and user space value match
787 *
788 * [9] There is no transient state which sets the user space TID to 0
789 * except exit_robust_list(), but this is indicated by the
790 * FUTEX_OWNER_DIED bit. See [4]
791 *
792 * [10] There is no transient state which leaves owner and user space
793 * TID out of sync.
794 */
746static int 795static int
747lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, 796lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
748 union futex_key *key, struct futex_pi_state **ps) 797 union futex_key *key, struct futex_pi_state **ps)
@@ -755,12 +804,13 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
755 plist_for_each_entry_safe(this, next, &hb->chain, list) { 804 plist_for_each_entry_safe(this, next, &hb->chain, list) {
756 if (match_futex(&this->key, key)) { 805 if (match_futex(&this->key, key)) {
757 /* 806 /*
758 * Another waiter already exists - bump up 807 * Sanity check the waiter before increasing
759 * the refcount and return its pi_state: 808 * the refcount and attaching to it.
760 */ 809 */
761 pi_state = this->pi_state; 810 pi_state = this->pi_state;
762 /* 811 /*
763 * Userspace might have messed up non-PI and PI futexes 812 * Userspace might have messed up non-PI and
813 * PI futexes [3]
764 */ 814 */
765 if (unlikely(!pi_state)) 815 if (unlikely(!pi_state))
766 return -EINVAL; 816 return -EINVAL;
@@ -768,34 +818,70 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
768 WARN_ON(!atomic_read(&pi_state->refcount)); 818 WARN_ON(!atomic_read(&pi_state->refcount));
769 819
770 /* 820 /*
771 * When pi_state->owner is NULL then the owner died 821 * Handle the owner died case:
772 * and another waiter is on the fly. pi_state->owner
773 * is fixed up by the task which acquires
774 * pi_state->rt_mutex.
775 *
776 * We do not check for pid == 0 which can happen when
777 * the owner died and robust_list_exit() cleared the
778 * TID.
779 */ 822 */
780 if (pid && pi_state->owner) { 823 if (uval & FUTEX_OWNER_DIED) {
824 /*
825 * exit_pi_state_list sets owner to NULL and
826 * wakes the topmost waiter. The task which
827 * acquires the pi_state->rt_mutex will fixup
828 * owner.
829 */
830 if (!pi_state->owner) {
831 /*
832 * No pi state owner, but the user
833 * space TID is not 0. Inconsistent
834 * state. [5]
835 */
836 if (pid)
837 return -EINVAL;
838 /*
839 * Take a ref on the state and
840 * return. [4]
841 */
842 goto out_state;
843 }
844
781 /* 845 /*
782 * Bail out if user space manipulated the 846 * If TID is 0, then either the dying owner
783 * futex value. 847 * has not yet executed exit_pi_state_list()
848 * or some waiter acquired the rtmutex in the
849 * pi state, but did not yet fixup the TID in
850 * user space.
851 *
852 * Take a ref on the state and return. [6]
784 */ 853 */
785 if (pid != task_pid_vnr(pi_state->owner)) 854 if (!pid)
855 goto out_state;
856 } else {
857 /*
858 * If the owner died bit is not set,
859 * then the pi_state must have an
860 * owner. [7]
861 */
862 if (!pi_state->owner)
786 return -EINVAL; 863 return -EINVAL;
787 } 864 }
788 865
866 /*
867 * Bail out if user space manipulated the
868 * futex value. If pi state exists then the
869 * owner TID must be the same as the user
870 * space TID. [9/10]
871 */
872 if (pid != task_pid_vnr(pi_state->owner))
873 return -EINVAL;
874
875 out_state:
789 atomic_inc(&pi_state->refcount); 876 atomic_inc(&pi_state->refcount);
790 *ps = pi_state; 877 *ps = pi_state;
791
792 return 0; 878 return 0;
793 } 879 }
794 } 880 }
795 881
796 /* 882 /*
797 * We are the first waiter - try to look up the real owner and attach 883 * We are the first waiter - try to look up the real owner and attach
798 * the new pi_state to it, but bail out when TID = 0 884 * the new pi_state to it, but bail out when TID = 0 [1]
799 */ 885 */
800 if (!pid) 886 if (!pid)
801 return -ESRCH; 887 return -ESRCH;
@@ -803,6 +889,11 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
803 if (!p) 889 if (!p)
804 return -ESRCH; 890 return -ESRCH;
805 891
892 if (!p->mm) {
893 put_task_struct(p);
894 return -EPERM;
895 }
896
806 /* 897 /*
807 * We need to look at the task state flags to figure out, 898 * We need to look at the task state flags to figure out,
808 * whether the task is exiting. To protect against the do_exit 899 * whether the task is exiting. To protect against the do_exit
@@ -823,6 +914,9 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
823 return ret; 914 return ret;
824 } 915 }
825 916
917 /*
918 * No existing pi state. First waiter. [2]
919 */
826 pi_state = alloc_pi_state(); 920 pi_state = alloc_pi_state();
827 921
828 /* 922 /*
@@ -894,10 +988,18 @@ retry:
894 return -EDEADLK; 988 return -EDEADLK;
895 989
896 /* 990 /*
897 * Surprise - we got the lock. Just return to userspace: 991 * Surprise - we got the lock, but we do not trust user space at all.
898 */ 992 */
899 if (unlikely(!curval)) 993 if (unlikely(!curval)) {
900 return 1; 994 /*
995 * We verify whether there is kernel state for this
996 * futex. If not, we can safely assume, that the 0 ->
997 * TID transition is correct. If state exists, we do
998 * not bother to fixup the user space state as it was
999 * corrupted already.
1000 */
1001 return futex_top_waiter(hb, key) ? -EINVAL : 1;
1002 }
901 1003
902 uval = curval; 1004 uval = curval;
903 1005
@@ -1028,6 +1130,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
1028 struct task_struct *new_owner; 1130 struct task_struct *new_owner;
1029 struct futex_pi_state *pi_state = this->pi_state; 1131 struct futex_pi_state *pi_state = this->pi_state;
1030 u32 uninitialized_var(curval), newval; 1132 u32 uninitialized_var(curval), newval;
1133 int ret = 0;
1031 1134
1032 if (!pi_state) 1135 if (!pi_state)
1033 return -EINVAL; 1136 return -EINVAL;
@@ -1051,23 +1154,19 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
1051 new_owner = this->task; 1154 new_owner = this->task;
1052 1155
1053 /* 1156 /*
1054 * We pass it to the next owner. (The WAITERS bit is always 1157 * We pass it to the next owner. The WAITERS bit is always
1055 * kept enabled while there is PI state around. We must also 1158 * kept enabled while there is PI state around. We cleanup the
1056 * preserve the owner died bit.) 1159 * owner died bit, because we are the owner.
1057 */ 1160 */
1058 if (!(uval & FUTEX_OWNER_DIED)) { 1161 newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
1059 int ret = 0;
1060 1162
1061 newval = FUTEX_WAITERS | task_pid_vnr(new_owner); 1163 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
1062 1164 ret = -EFAULT;
1063 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) 1165 else if (curval != uval)
1064 ret = -EFAULT; 1166 ret = -EINVAL;
1065 else if (curval != uval) 1167 if (ret) {
1066 ret = -EINVAL; 1168 raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
1067 if (ret) { 1169 return ret;
1068 raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
1069 return ret;
1070 }
1071 } 1170 }
1072 1171
1073 raw_spin_lock_irq(&pi_state->owner->pi_lock); 1172 raw_spin_lock_irq(&pi_state->owner->pi_lock);
@@ -1347,7 +1446,7 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
1347 * 1446 *
1348 * Return: 1447 * Return:
1349 * 0 - failed to acquire the lock atomically; 1448 * 0 - failed to acquire the lock atomically;
1350 * 1 - acquired the lock; 1449 * >0 - acquired the lock, return value is vpid of the top_waiter
1351 * <0 - error 1450 * <0 - error
1352 */ 1451 */
1353static int futex_proxy_trylock_atomic(u32 __user *pifutex, 1452static int futex_proxy_trylock_atomic(u32 __user *pifutex,
@@ -1358,7 +1457,7 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1358{ 1457{
1359 struct futex_q *top_waiter = NULL; 1458 struct futex_q *top_waiter = NULL;
1360 u32 curval; 1459 u32 curval;
1361 int ret; 1460 int ret, vpid;
1362 1461
1363 if (get_futex_value_locked(&curval, pifutex)) 1462 if (get_futex_value_locked(&curval, pifutex))
1364 return -EFAULT; 1463 return -EFAULT;
@@ -1386,11 +1485,13 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1386 * the contended case or if set_waiters is 1. The pi_state is returned 1485 * the contended case or if set_waiters is 1. The pi_state is returned
1387 * in ps in contended cases. 1486 * in ps in contended cases.
1388 */ 1487 */
1488 vpid = task_pid_vnr(top_waiter->task);
1389 ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, 1489 ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
1390 set_waiters); 1490 set_waiters);
1391 if (ret == 1) 1491 if (ret == 1) {
1392 requeue_pi_wake_futex(top_waiter, key2, hb2); 1492 requeue_pi_wake_futex(top_waiter, key2, hb2);
1393 1493 return vpid;
1494 }
1394 return ret; 1495 return ret;
1395} 1496}
1396 1497
@@ -1421,10 +1522,16 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
1421 struct futex_pi_state *pi_state = NULL; 1522 struct futex_pi_state *pi_state = NULL;
1422 struct futex_hash_bucket *hb1, *hb2; 1523 struct futex_hash_bucket *hb1, *hb2;
1423 struct futex_q *this, *next; 1524 struct futex_q *this, *next;
1424 u32 curval2;
1425 1525
1426 if (requeue_pi) { 1526 if (requeue_pi) {
1427 /* 1527 /*
1528 * Requeue PI only works on two distinct uaddrs. This
1529 * check is only valid for private futexes. See below.
1530 */
1531 if (uaddr1 == uaddr2)
1532 return -EINVAL;
1533
1534 /*
1428 * requeue_pi requires a pi_state, try to allocate it now 1535 * requeue_pi requires a pi_state, try to allocate it now
1429 * without any locks in case it fails. 1536 * without any locks in case it fails.
1430 */ 1537 */
@@ -1462,6 +1569,15 @@ retry:
1462 if (unlikely(ret != 0)) 1569 if (unlikely(ret != 0))
1463 goto out_put_key1; 1570 goto out_put_key1;
1464 1571
1572 /*
1573 * The check above which compares uaddrs is not sufficient for
1574 * shared futexes. We need to compare the keys:
1575 */
1576 if (requeue_pi && match_futex(&key1, &key2)) {
1577 ret = -EINVAL;
1578 goto out_put_keys;
1579 }
1580
1465 hb1 = hash_futex(&key1); 1581 hb1 = hash_futex(&key1);
1466 hb2 = hash_futex(&key2); 1582 hb2 = hash_futex(&key2);
1467 1583
@@ -1509,16 +1625,25 @@ retry_private:
1509 * At this point the top_waiter has either taken uaddr2 or is 1625 * At this point the top_waiter has either taken uaddr2 or is
1510 * waiting on it. If the former, then the pi_state will not 1626 * waiting on it. If the former, then the pi_state will not
1511 * exist yet, look it up one more time to ensure we have a 1627 * exist yet, look it up one more time to ensure we have a
1512 * reference to it. 1628 * reference to it. If the lock was taken, ret contains the
1629 * vpid of the top waiter task.
1513 */ 1630 */
1514 if (ret == 1) { 1631 if (ret > 0) {
1515 WARN_ON(pi_state); 1632 WARN_ON(pi_state);
1516 drop_count++; 1633 drop_count++;
1517 task_count++; 1634 task_count++;
1518 ret = get_futex_value_locked(&curval2, uaddr2); 1635 /*
1519 if (!ret) 1636 * If we acquired the lock, then the user
1520 ret = lookup_pi_state(curval2, hb2, &key2, 1637 * space value of uaddr2 should be vpid. It
1521 &pi_state); 1638 * cannot be changed by the top waiter as it
1639 * is blocked on hb2 lock if it tries to do
1640 * so. If something fiddled with it behind our
1641 * back the pi state lookup might unearth
1642 * it. So we rather use the known value than
1643 * rereading and handing potential crap to
1644 * lookup_pi_state.
1645 */
1646 ret = lookup_pi_state(ret, hb2, &key2, &pi_state);
1522 } 1647 }
1523 1648
1524 switch (ret) { 1649 switch (ret) {
@@ -2301,9 +2426,10 @@ retry:
2301 /* 2426 /*
2302 * To avoid races, try to do the TID -> 0 atomic transition 2427 * To avoid races, try to do the TID -> 0 atomic transition
2303 * again. If it succeeds then we can return without waking 2428 * again. If it succeeds then we can return without waking
2304 * anyone else up: 2429 * anyone else up. We only try this if neither the waiters nor
2430 * the owner died bit are set.
2305 */ 2431 */
2306 if (!(uval & FUTEX_OWNER_DIED) && 2432 if (!(uval & ~FUTEX_TID_MASK) &&
2307 cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0)) 2433 cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0))
2308 goto pi_faulted; 2434 goto pi_faulted;
2309 /* 2435 /*
@@ -2333,11 +2459,9 @@ retry:
2333 /* 2459 /*
2334 * No waiters - kernel unlocks the futex: 2460 * No waiters - kernel unlocks the futex:
2335 */ 2461 */
2336 if (!(uval & FUTEX_OWNER_DIED)) { 2462 ret = unlock_futex_pi(uaddr, uval);
2337 ret = unlock_futex_pi(uaddr, uval); 2463 if (ret == -EFAULT)
2338 if (ret == -EFAULT) 2464 goto pi_faulted;
2339 goto pi_faulted;
2340 }
2341 2465
2342out_unlock: 2466out_unlock:
2343 spin_unlock(&hb->lock); 2467 spin_unlock(&hb->lock);
@@ -2499,6 +2623,15 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2499 if (ret) 2623 if (ret)
2500 goto out_key2; 2624 goto out_key2;
2501 2625
2626 /*
2627 * The check above which compares uaddrs is not sufficient for
2628 * shared futexes. We need to compare the keys:
2629 */
2630 if (match_futex(&q.key, &key2)) {
2631 ret = -EINVAL;
2632 goto out_put_keys;
2633 }
2634
2502 /* Queue the futex_q, drop the hb lock, wait for wakeup. */ 2635 /* Queue the futex_q, drop the hb lock, wait for wakeup. */
2503 futex_wait_queue_me(hb, &q, to); 2636 futex_wait_queue_me(hb, &q, to);
2504 2637
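
All of the new sanity checks above (the state table, the TID/OWNER_DIED comparisons, the uaddr1 == uaddr2 rejection) police the user-space half of the PI-futex protocol: the futex word holds the owner's TID, with FUTEX_WAITERS and FUTEX_OWNER_DIED in the upper bits, and the kernel is entered only on contention. A deliberately minimal user-space sketch of that protocol (single thread, no robust-list or error handling, helper names are our own):

    #include <linux/futex.h>
    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static _Atomic uint32_t futex_word;  /* 0 = unlocked, else owner TID (+ flag bits) */

    static void pi_lock(void)
    {
            uint32_t zero = 0;
            uint32_t tid = (uint32_t)syscall(SYS_gettid);

            /* Fast path: 0 -> TID transition in user space, no syscall. */
            if (atomic_compare_exchange_strong(&futex_word, &zero, tid))
                    return;
            /* Contended: the kernel sets FUTEX_WAITERS and builds pi_state. */
            syscall(SYS_futex, &futex_word, FUTEX_LOCK_PI, 0, NULL, NULL, 0);
    }

    static void pi_unlock(void)
    {
            uint32_t tid = (uint32_t)syscall(SYS_gettid);

            /* Fast path: TID -> 0, only valid with no waiter/owner-died bits set. */
            if (atomic_compare_exchange_strong(&futex_word, &tid, 0))
                    return;
            /* Waiters present: let the kernel hand the lock to the top waiter. */
            syscall(SYS_futex, &futex_word, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0);
    }

    int main(void)
    {
            pi_lock();
            printf("locked by %u\n", (unsigned)futex_word);
            pi_unlock();
            return 0;
    }

The checks in lookup_pi_state() are essentially about whether the TID written by this fast path still agrees with the kernel's pi_state for the same futex.
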
diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c
index f45b75b713c0..b358a802fd18 100644
--- a/kernel/gcov/base.c
+++ b/kernel/gcov/base.c
@@ -85,6 +85,12 @@ void __gcov_merge_ior(gcov_type *counters, unsigned int n_counters)
85} 85}
86EXPORT_SYMBOL(__gcov_merge_ior); 86EXPORT_SYMBOL(__gcov_merge_ior);
87 87
88void __gcov_merge_time_profile(gcov_type *counters, unsigned int n_counters)
89{
90 /* Unused. */
91}
92EXPORT_SYMBOL(__gcov_merge_time_profile);
93
88/** 94/**
89 * gcov_enable_events - enable event reporting through gcov_event() 95 * gcov_enable_events - enable event reporting through gcov_event()
90 * 96 *
diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c
index 2c6e4631c814..826ba9fb5e32 100644
--- a/kernel/gcov/gcc_4_7.c
+++ b/kernel/gcov/gcc_4_7.c
@@ -18,7 +18,12 @@
18#include <linux/vmalloc.h> 18#include <linux/vmalloc.h>
19#include "gcov.h" 19#include "gcov.h"
20 20
21#if __GNUC__ == 4 && __GNUC_MINOR__ >= 9
22#define GCOV_COUNTERS 9
23#else
21#define GCOV_COUNTERS 8 24#define GCOV_COUNTERS 8
25#endif
26
22#define GCOV_TAG_FUNCTION_LENGTH 3 27#define GCOV_TAG_FUNCTION_LENGTH 3
23 28
24static struct gcov_info *gcov_info_head; 29static struct gcov_info *gcov_info_head;
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index d55092ceee29..3ab28993f6e0 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -234,6 +234,11 @@ again:
234 goto again; 234 goto again;
235 } 235 }
236 timer->base = new_base; 236 timer->base = new_base;
237 } else {
238 if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) {
239 cpu = this_cpu;
240 goto again;
241 }
237 } 242 }
238 return new_base; 243 return new_base;
239} 244}
@@ -569,6 +574,23 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
569 574
570 cpu_base->expires_next.tv64 = expires_next.tv64; 575 cpu_base->expires_next.tv64 = expires_next.tv64;
571 576
577 /*
578 * If a hang was detected in the last timer interrupt then we
579 * leave the hang delay active in the hardware. We want the
580 * system to make progress. That also prevents the following
581 * scenario:
582 * T1 expires 50ms from now
583 * T2 expires 5s from now
584 *
585 * T1 is removed, so this code is called and would reprogram
586 * the hardware to 5s from now. Any hrtimer_start after that
587 * will not reprogram the hardware due to hang_detected being
 588 * set. So we'd effectively block all timers until the T2 event
589 * fires.
590 */
591 if (cpu_base->hang_detected)
592 return;
593
572 if (cpu_base->expires_next.tv64 != KTIME_MAX) 594 if (cpu_base->expires_next.tv64 != KTIME_MAX)
573 tick_program_event(cpu_base->expires_next, 1); 595 tick_program_event(cpu_base->expires_next, 1);
574} 596}
@@ -968,11 +990,8 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
968 /* Remove an active timer from the queue: */ 990 /* Remove an active timer from the queue: */
969 ret = remove_hrtimer(timer, base); 991 ret = remove_hrtimer(timer, base);
970 992
971 /* Switch the timer base, if necessary: */
972 new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
973
974 if (mode & HRTIMER_MODE_REL) { 993 if (mode & HRTIMER_MODE_REL) {
975 tim = ktime_add_safe(tim, new_base->get_time()); 994 tim = ktime_add_safe(tim, base->get_time());
976 /* 995 /*
977 * CONFIG_TIME_LOW_RES is a temporary way for architectures 996 * CONFIG_TIME_LOW_RES is a temporary way for architectures
978 * to signal that they simply return xtime in 997 * to signal that they simply return xtime in
@@ -987,6 +1006,9 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
987 1006
988 hrtimer_set_expires_range_ns(timer, tim, delta_ns); 1007 hrtimer_set_expires_range_ns(timer, tim, delta_ns);
989 1008
1009 /* Switch the timer base, if necessary: */
1010 new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
1011
990 timer_stats_hrtimer_set_start_info(timer); 1012 timer_stats_hrtimer_set_start_info(timer);
991 1013
992 leftmost = enqueue_hrtimer(timer, new_base); 1014 leftmost = enqueue_hrtimer(timer, new_base);
@@ -1017,6 +1039,7 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
1017 1039
1018 return ret; 1040 return ret;
1019} 1041}
1042EXPORT_SYMBOL_GPL(__hrtimer_start_range_ns);
1020 1043
1021/** 1044/**
1022 * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU 1045 * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 06bb1417b063..06db12434d72 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -52,8 +52,10 @@ unsigned int __read_mostly sysctl_hung_task_panic =
52 52
53static int __init hung_task_panic_setup(char *str) 53static int __init hung_task_panic_setup(char *str)
54{ 54{
55 sysctl_hung_task_panic = simple_strtoul(str, NULL, 0); 55 int rc = kstrtouint(str, 0, &sysctl_hung_task_panic);
56 56
57 if (rc)
58 return rc;
57 return 1; 59 return 1;
58} 60}
59__setup("hung_task_panic=", hung_task_panic_setup); 61__setup("hung_task_panic=", hung_task_panic_setup);
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 07cbdfea9ae2..d269cecdfbf0 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -5,6 +5,10 @@ menu "IRQ subsystem"
5config MAY_HAVE_SPARSE_IRQ 5config MAY_HAVE_SPARSE_IRQ
6 bool 6 bool
7 7
8# Legacy support, required for itanic
9config GENERIC_IRQ_LEGACY
10 bool
11
8# Enable the generic irq autoprobe mechanism 12# Enable the generic irq autoprobe mechanism
9config GENERIC_IRQ_PROBE 13config GENERIC_IRQ_PROBE
10 bool 14 bool
@@ -17,6 +21,11 @@ config GENERIC_IRQ_SHOW
17config GENERIC_IRQ_SHOW_LEVEL 21config GENERIC_IRQ_SHOW_LEVEL
18 bool 22 bool
19 23
24# Facility to allocate a hardware interrupt. This is legacy support
25# and should not be used in new code. Use irq domains instead.
26config GENERIC_IRQ_LEGACY_ALLOC_HWIRQ
27 bool
28
20# Support for delayed migration from interrupt context 29# Support for delayed migration from interrupt context
21config GENERIC_PENDING_IRQ 30config GENERIC_PENDING_IRQ
22 bool 31 bool
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 6397df2d6945..a2b28a2fd7b1 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -40,10 +40,9 @@ int irq_set_chip(unsigned int irq, struct irq_chip *chip)
40 irq_put_desc_unlock(desc, flags); 40 irq_put_desc_unlock(desc, flags);
41 /* 41 /*
42 * For !CONFIG_SPARSE_IRQ make the irq show up in 42 * For !CONFIG_SPARSE_IRQ make the irq show up in
43 * allocated_irqs. For the CONFIG_SPARSE_IRQ case, it is 43 * allocated_irqs.
44 * already marked, and this call is harmless.
45 */ 44 */
46 irq_reserve_irq(irq); 45 irq_mark_irq(irq);
47 return 0; 46 return 0;
48} 47}
49EXPORT_SYMBOL(irq_set_chip); 48EXPORT_SYMBOL(irq_set_chip);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index ddf1ffeb79f1..099ea2e0eb88 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -33,7 +33,7 @@ enum {
33}; 33};
34 34
35/* 35/*
36 * Bit masks for desc->state 36 * Bit masks for desc->core_internal_state__do_not_mess_with_it
37 * 37 *
38 * IRQS_AUTODETECT - autodetection in progress 38 * IRQS_AUTODETECT - autodetection in progress
39 * IRQS_SPURIOUS_DISABLED - was disabled due to spurious interrupt 39 * IRQS_SPURIOUS_DISABLED - was disabled due to spurious interrupt
@@ -76,6 +76,12 @@ extern void mask_irq(struct irq_desc *desc);
76extern void unmask_irq(struct irq_desc *desc); 76extern void unmask_irq(struct irq_desc *desc);
77extern void unmask_threaded_irq(struct irq_desc *desc); 77extern void unmask_threaded_irq(struct irq_desc *desc);
78 78
79#ifdef CONFIG_SPARSE_IRQ
80static inline void irq_mark_irq(unsigned int irq) { }
81#else
82extern void irq_mark_irq(unsigned int irq);
83#endif
84
79extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); 85extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
80 86
81irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action); 87irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action);
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index a7174617616b..7339e42a85ab 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -278,7 +278,12 @@ EXPORT_SYMBOL(irq_to_desc);
278 278
279static void free_desc(unsigned int irq) 279static void free_desc(unsigned int irq)
280{ 280{
281 dynamic_irq_cleanup(irq); 281 struct irq_desc *desc = irq_to_desc(irq);
282 unsigned long flags;
283
284 raw_spin_lock_irqsave(&desc->lock, flags);
285 desc_set_defaults(irq, desc, desc_node(desc), NULL);
286 raw_spin_unlock_irqrestore(&desc->lock, flags);
282} 287}
283 288
284static inline int alloc_descs(unsigned int start, unsigned int cnt, int node, 289static inline int alloc_descs(unsigned int start, unsigned int cnt, int node,
@@ -299,6 +304,20 @@ static int irq_expand_nr_irqs(unsigned int nr)
299 return -ENOMEM; 304 return -ENOMEM;
300} 305}
301 306
307void irq_mark_irq(unsigned int irq)
308{
309 mutex_lock(&sparse_irq_lock);
310 bitmap_set(allocated_irqs, irq, 1);
311 mutex_unlock(&sparse_irq_lock);
312}
313
314#ifdef CONFIG_GENERIC_IRQ_LEGACY
315void irq_init_desc(unsigned int irq)
316{
317 free_desc(irq);
318}
319#endif
320
302#endif /* !CONFIG_SPARSE_IRQ */ 321#endif /* !CONFIG_SPARSE_IRQ */
303 322
304/** 323/**
@@ -363,6 +382,13 @@ __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
363 if (from > irq) 382 if (from > irq)
364 return -EINVAL; 383 return -EINVAL;
365 from = irq; 384 from = irq;
385 } else {
386 /*
387 * For interrupts which are freely allocated the
388 * architecture can force a lower bound to the @from
389 * argument. x86 uses this to exclude the GSI space.
390 */
391 from = arch_dynirq_lower_bound(from);
366 } 392 }
367 393
368 mutex_lock(&sparse_irq_lock); 394 mutex_lock(&sparse_irq_lock);
@@ -389,30 +415,56 @@ err:
389} 415}
390EXPORT_SYMBOL_GPL(__irq_alloc_descs); 416EXPORT_SYMBOL_GPL(__irq_alloc_descs);
391 417
418#ifdef CONFIG_GENERIC_IRQ_LEGACY_ALLOC_HWIRQ
392/** 419/**
393 * irq_reserve_irqs - mark irqs allocated 420 * irq_alloc_hwirqs - Allocate an irq descriptor and initialize the hardware
394 * @from: mark from irq number 421 * @cnt: number of interrupts to allocate
395 * @cnt: number of irqs to mark 422 * @node: node on which to allocate
396 * 423 *
397 * Returns 0 on success or an appropriate error code 424 * Returns an interrupt number > 0 or 0, if the allocation fails.
398 */ 425 */
399int irq_reserve_irqs(unsigned int from, unsigned int cnt) 426unsigned int irq_alloc_hwirqs(int cnt, int node)
400{ 427{
401 unsigned int start; 428 int i, irq = __irq_alloc_descs(-1, 0, cnt, node, NULL);
402 int ret = 0;
403 429
404 if (!cnt || (from + cnt) > nr_irqs) 430 if (irq < 0)
405 return -EINVAL; 431 return 0;
406 432
407 mutex_lock(&sparse_irq_lock); 433 for (i = irq; cnt > 0; i++, cnt--) {
408 start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0); 434 if (arch_setup_hwirq(i, node))
409 if (start == from) 435 goto err;
410 bitmap_set(allocated_irqs, start, cnt); 436 irq_clear_status_flags(i, _IRQ_NOREQUEST);
411 else 437 }
412 ret = -EEXIST; 438 return irq;
413 mutex_unlock(&sparse_irq_lock); 439
414 return ret; 440err:
441 for (i--; i >= irq; i--) {
442 irq_set_status_flags(i, _IRQ_NOREQUEST | _IRQ_NOPROBE);
443 arch_teardown_hwirq(i);
444 }
445 irq_free_descs(irq, cnt);
446 return 0;
447}
448EXPORT_SYMBOL_GPL(irq_alloc_hwirqs);
449
450/**
451 * irq_free_hwirqs - Free irq descriptor and cleanup the hardware
452 * @from: Free from irq number
453 * @cnt: number of interrupts to free
454 *
455 */
456void irq_free_hwirqs(unsigned int from, int cnt)
457{
458 int i;
459
460 for (i = from; cnt > 0; i++, cnt--) {
461 irq_set_status_flags(i, _IRQ_NOREQUEST | _IRQ_NOPROBE);
462 arch_teardown_hwirq(i);
463 }
464 irq_free_descs(from, cnt);
415} 465}
466EXPORT_SYMBOL_GPL(irq_free_hwirqs);
467#endif
416 468
417/** 469/**
418 * irq_get_next_irq - get next allocated irq number 470 * irq_get_next_irq - get next allocated irq number
@@ -475,20 +527,6 @@ int irq_set_percpu_devid(unsigned int irq)
475 return 0; 527 return 0;
476} 528}
477 529
478/**
479 * dynamic_irq_cleanup - cleanup a dynamically allocated irq
480 * @irq: irq number to initialize
481 */
482void dynamic_irq_cleanup(unsigned int irq)
483{
484 struct irq_desc *desc = irq_to_desc(irq);
485 unsigned long flags;
486
487 raw_spin_lock_irqsave(&desc->lock, flags);
488 desc_set_defaults(irq, desc, desc_node(desc), NULL);
489 raw_spin_unlock_irqrestore(&desc->lock, flags);
490}
491
492void kstat_incr_irq_this_cpu(unsigned int irq) 530void kstat_incr_irq_this_cpu(unsigned int irq)
493{ 531{
494 kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq)); 532 kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
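
irq_alloc_hwirqs() above follows a set-up-or-roll-back shape: configure each hardware interrupt in turn and, on the first failure, tear down the ones already configured in reverse order before freeing the descriptors. A toy user-space model of just that shape (toy_setup()/toy_teardown() and the failure rule are invented):

    #include <stdio.h>

    /* Invented stand-ins for arch_setup_hwirq()/arch_teardown_hwirq(). */
    static int toy_setup(int i)     { return (i == 5) ? -1 : 0; }
    static void toy_teardown(int i) { printf("teardown %d\n", i); }

    /* Set up [base, base + cnt); on failure, undo what was already done,
     * in reverse order, and report the failure to the caller. */
    static int setup_range(int base, int cnt)
    {
            int i;

            for (i = base; cnt > 0; i++, cnt--) {
                    if (toy_setup(i))
                            goto err;
            }
            return base;
    err:
            for (i--; i >= base; i--)
                    toy_teardown(i);
            return -1;
    }

    int main(void)
    {
            printf("result: %d\n", setup_range(3, 4)); /* fails at 5, tears down 4, 3 */
            return 0;
    }
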
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index f14033700c25..eb5e10e32e05 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -27,14 +27,14 @@ static struct irq_domain *irq_default_domain;
27 * __irq_domain_add() - Allocate a new irq_domain data structure 27 * __irq_domain_add() - Allocate a new irq_domain data structure
28 * @of_node: optional device-tree node of the interrupt controller 28 * @of_node: optional device-tree node of the interrupt controller
29 * @size: Size of linear map; 0 for radix mapping only 29 * @size: Size of linear map; 0 for radix mapping only
30 * @hwirq_max: Maximum number of interrupts supported by controller
30 * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no 31 * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no
31 * direct mapping 32 * direct mapping
32 * @ops: map/unmap domain callbacks 33 * @ops: map/unmap domain callbacks
33 * @host_data: Controller private data pointer 34 * @host_data: Controller private data pointer
34 * 35 *
 35 * Allocates and initializes an irq_domain structure. Caller is expected to 36 * Allocates and initializes an irq_domain structure.
36 * register allocated irq_domain with irq_domain_register(). Returns pointer 37 * Returns pointer to IRQ domain, or NULL on failure.
37 * to IRQ domain, or NULL on failure.
38 */ 38 */
39struct irq_domain *__irq_domain_add(struct device_node *of_node, int size, 39struct irq_domain *__irq_domain_add(struct device_node *of_node, int size,
40 irq_hw_number_t hwirq_max, int direct_max, 40 irq_hw_number_t hwirq_max, int direct_max,
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 2486a4c1a710..3dc6a61bf06a 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -180,7 +180,7 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
180 struct irq_chip *chip = irq_data_get_irq_chip(data); 180 struct irq_chip *chip = irq_data_get_irq_chip(data);
181 int ret; 181 int ret;
182 182
183 ret = chip->irq_set_affinity(data, mask, false); 183 ret = chip->irq_set_affinity(data, mask, force);
184 switch (ret) { 184 switch (ret) {
185 case IRQ_SET_MASK_OK: 185 case IRQ_SET_MASK_OK:
186 cpumask_copy(data->affinity, mask); 186 cpumask_copy(data->affinity, mask);
@@ -192,7 +192,8 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
192 return ret; 192 return ret;
193} 193}
194 194
195int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) 195int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask,
196 bool force)
196{ 197{
197 struct irq_chip *chip = irq_data_get_irq_chip(data); 198 struct irq_chip *chip = irq_data_get_irq_chip(data);
198 struct irq_desc *desc = irq_data_to_desc(data); 199 struct irq_desc *desc = irq_data_to_desc(data);
@@ -202,7 +203,7 @@ int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask)
202 return -EINVAL; 203 return -EINVAL;
203 204
204 if (irq_can_move_pcntxt(data)) { 205 if (irq_can_move_pcntxt(data)) {
205 ret = irq_do_set_affinity(data, mask, false); 206 ret = irq_do_set_affinity(data, mask, force);
206 } else { 207 } else {
207 irqd_set_move_pending(data); 208 irqd_set_move_pending(data);
208 irq_copy_pending(desc, mask); 209 irq_copy_pending(desc, mask);
@@ -217,13 +218,7 @@ int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask)
217 return ret; 218 return ret;
218} 219}
219 220
220/** 221int __irq_set_affinity(unsigned int irq, const struct cpumask *mask, bool force)
221 * irq_set_affinity - Set the irq affinity of a given irq
222 * @irq: Interrupt to set affinity
223 * @mask: cpumask
224 *
225 */
226int irq_set_affinity(unsigned int irq, const struct cpumask *mask)
227{ 222{
228 struct irq_desc *desc = irq_to_desc(irq); 223 struct irq_desc *desc = irq_to_desc(irq);
229 unsigned long flags; 224 unsigned long flags;
@@ -233,7 +228,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *mask)
233 return -EINVAL; 228 return -EINVAL;
234 229
235 raw_spin_lock_irqsave(&desc->lock, flags); 230 raw_spin_lock_irqsave(&desc->lock, flags);
236 ret = __irq_set_affinity_locked(irq_desc_get_irq_data(desc), mask); 231 ret = irq_set_affinity_locked(irq_desc_get_irq_data(desc), mask, force);
237 raw_spin_unlock_irqrestore(&desc->lock, flags); 232 raw_spin_unlock_irqrestore(&desc->lock, flags);
238 return ret; 233 return ret;
239} 234}
@@ -891,8 +886,8 @@ static int irq_thread(void *data)
891 irq_thread_check_affinity(desc, action); 886 irq_thread_check_affinity(desc, action);
892 887
893 action_ret = handler_fn(desc, action); 888 action_ret = handler_fn(desc, action);
894 if (!noirqdebug) 889 if (action_ret == IRQ_HANDLED)
895 note_interrupt(action->irq, desc, action_ret); 890 atomic_inc(&desc->threads_handled);
896 891
897 wake_threads_waitq(desc); 892 wake_threads_waitq(desc);
898 } 893 }
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index a1d8cc63b56e..e2514b0e439e 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -270,6 +270,8 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc,
270 return action && (action->flags & IRQF_IRQPOLL); 270 return action && (action->flags & IRQF_IRQPOLL);
271} 271}
272 272
273#define SPURIOUS_DEFERRED 0x80000000
274
273void note_interrupt(unsigned int irq, struct irq_desc *desc, 275void note_interrupt(unsigned int irq, struct irq_desc *desc,
274 irqreturn_t action_ret) 276 irqreturn_t action_ret)
275{ 277{
@@ -277,15 +279,111 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
277 irq_settings_is_polled(desc)) 279 irq_settings_is_polled(desc))
278 return; 280 return;
279 281
280 /* we get here again via the threaded handler */
281 if (action_ret == IRQ_WAKE_THREAD)
282 return;
283
284 if (bad_action_ret(action_ret)) { 282 if (bad_action_ret(action_ret)) {
285 report_bad_irq(irq, desc, action_ret); 283 report_bad_irq(irq, desc, action_ret);
286 return; 284 return;
287 } 285 }
288 286
287 /*
288 * We cannot call note_interrupt from the threaded handler
289 * because we need to look at the compound of all handlers
290 * (primary and threaded). Aside of that in the threaded
291 * shared case we have no serialization against an incoming
292 * hardware interrupt while we are dealing with a threaded
293 * result.
294 *
295 * So in case a thread is woken, we just note the fact and
296 * defer the analysis to the next hardware interrupt.
297 *
 298 * The threaded handlers store whether they successfully
299 * handled an interrupt and we check whether that number
300 * changed versus the last invocation.
301 *
302 * We could handle all interrupts with the delayed by one
303 * mechanism, but for the non forced threaded case we'd just
304 * add pointless overhead to the straight hardirq interrupts
305 * for the sake of a few lines less code.
306 */
307 if (action_ret & IRQ_WAKE_THREAD) {
308 /*
309 * There is a thread woken. Check whether one of the
310 * shared primary handlers returned IRQ_HANDLED. If
311 * not we defer the spurious detection to the next
312 * interrupt.
313 */
314 if (action_ret == IRQ_WAKE_THREAD) {
315 int handled;
316 /*
317 * We use bit 31 of thread_handled_last to
318 * denote the deferred spurious detection
319 * active. No locking necessary as
320 * thread_handled_last is only accessed here
321 * and we have the guarantee that hard
322 * interrupts are not reentrant.
323 */
324 if (!(desc->threads_handled_last & SPURIOUS_DEFERRED)) {
325 desc->threads_handled_last |= SPURIOUS_DEFERRED;
326 return;
327 }
328 /*
329 * Check whether one of the threaded handlers
330 * returned IRQ_HANDLED since the last
331 * interrupt happened.
332 *
333 * For simplicity we just set bit 31, as it is
334 * set in threads_handled_last as well. So we
335 * avoid extra masking. And we really do not
336 * care about the high bits of the handled
337 * count. We just care about the count being
338 * different than the one we saw before.
339 */
340 handled = atomic_read(&desc->threads_handled);
341 handled |= SPURIOUS_DEFERRED;
342 if (handled != desc->threads_handled_last) {
343 action_ret = IRQ_HANDLED;
344 /*
345 * Note: We keep the SPURIOUS_DEFERRED
346 * bit set. We are handling the
347 * previous invocation right now.
348 * Keep it for the current one, so the
349 * next hardware interrupt will
350 * account for it.
351 */
352 desc->threads_handled_last = handled;
353 } else {
354 /*
355 * None of the threaded handlers felt
356 * responsible for the last interrupt
357 *
358 * We keep the SPURIOUS_DEFERRED bit
359 * set in threads_handled_last as we
360 * need to account for the current
361 * interrupt as well.
362 */
363 action_ret = IRQ_NONE;
364 }
365 } else {
366 /*
367 * One of the primary handlers returned
368 * IRQ_HANDLED. So we don't care about the
369 * threaded handlers on the same line. Clear
370 * the deferred detection bit.
371 *
372 * In theory we could/should check whether the
373 * deferred bit is set and take the result of
374 * the previous run into account here as
375 * well. But it's really not worth the
376 * trouble. If every other interrupt is
377 * handled we never trigger the spurious
378 * detector. And if this is just the one out
379 * of 100k unhandled ones which is handled
 380 * then we merely delay the spurious detection
381 * by one hard interrupt. Not a real problem.
382 */
383 desc->threads_handled_last &= ~SPURIOUS_DEFERRED;
384 }
385 }
386
289 if (unlikely(action_ret == IRQ_NONE)) { 387 if (unlikely(action_ret == IRQ_NONE)) {
290 /* 388 /*
291 * If we are seeing only the odd spurious IRQ caused by 389 * If we are seeing only the odd spurious IRQ caused by
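
The deferred detection added above is, at its core, a small per-descriptor state machine: threaded handlers increment a counter when they handle something, and the next hard interrupt compares that counter, with bit 31 doubling as the "deferred" flag, against the last snapshot. A stripped-down user-space model of only that bookkeeping (invented names, none of the kernel's locking or reentrancy context):

    #include <stdatomic.h>
    #include <stdio.h>

    #define SPURIOUS_DEFERRED 0x80000000u

    struct toy_desc {
            _Atomic unsigned int threads_handled;   /* bumped by threaded handlers */
            unsigned int threads_handled_last;      /* snapshot + deferred flag */
    };

    /* Called from the "hard interrupt" path when only a thread was woken.
     * Returns -1 for "no verdict yet" (first deferral), 1 if the previous
     * interrupt turned out to be handled by a thread, 0 if it did not. */
    static int deferred_check(struct toy_desc *desc)
    {
            unsigned int handled;

            if (!(desc->threads_handled_last & SPURIOUS_DEFERRED)) {
                    desc->threads_handled_last |= SPURIOUS_DEFERRED;
                    return -1;
            }
            handled = atomic_load(&desc->threads_handled) | SPURIOUS_DEFERRED;
            if (handled != desc->threads_handled_last) {
                    desc->threads_handled_last = handled;
                    return 1;
            }
            return 0;
    }

    int main(void)
    {
            struct toy_desc d = { 0 };

            printf("%d\n", deferred_check(&d));       /* -1: arms the deferred flag */
            atomic_fetch_add(&d.threads_handled, 1);  /* a threaded handler ran */
            printf("%d\n", deferred_check(&d));       /*  1: counted as handled */
            printf("%d\n", deferred_check(&d));       /*  0: counted as unhandled */
            return 0;
    }
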
diff --git a/kernel/kexec.c b/kernel/kexec.c
index c8380ad203bc..6748688813d0 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -125,8 +125,8 @@ static struct page *kimage_alloc_page(struct kimage *image,
125 unsigned long dest); 125 unsigned long dest);
126 126
127static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, 127static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
128 unsigned long nr_segments, 128 unsigned long nr_segments,
129 struct kexec_segment __user *segments) 129 struct kexec_segment __user *segments)
130{ 130{
131 size_t segment_bytes; 131 size_t segment_bytes;
132 struct kimage *image; 132 struct kimage *image;
@@ -257,13 +257,13 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
257 image->control_code_page = kimage_alloc_control_pages(image, 257 image->control_code_page = kimage_alloc_control_pages(image,
258 get_order(KEXEC_CONTROL_PAGE_SIZE)); 258 get_order(KEXEC_CONTROL_PAGE_SIZE));
259 if (!image->control_code_page) { 259 if (!image->control_code_page) {
260 printk(KERN_ERR "Could not allocate control_code_buffer\n"); 260 pr_err("Could not allocate control_code_buffer\n");
261 goto out_free; 261 goto out_free;
262 } 262 }
263 263
264 image->swap_page = kimage_alloc_control_pages(image, 0); 264 image->swap_page = kimage_alloc_control_pages(image, 0);
265 if (!image->swap_page) { 265 if (!image->swap_page) {
266 printk(KERN_ERR "Could not allocate swap buffer\n"); 266 pr_err("Could not allocate swap buffer\n");
267 goto out_free; 267 goto out_free;
268 } 268 }
269 269
@@ -332,7 +332,7 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
332 image->control_code_page = kimage_alloc_control_pages(image, 332 image->control_code_page = kimage_alloc_control_pages(image,
333 get_order(KEXEC_CONTROL_PAGE_SIZE)); 333 get_order(KEXEC_CONTROL_PAGE_SIZE));
334 if (!image->control_code_page) { 334 if (!image->control_code_page) {
335 printk(KERN_ERR "Could not allocate control_code_buffer\n"); 335 pr_err("Could not allocate control_code_buffer\n");
336 goto out_free; 336 goto out_free;
337 } 337 }
338 338
@@ -621,8 +621,8 @@ static void kimage_terminate(struct kimage *image)
621 621
622#define for_each_kimage_entry(image, ptr, entry) \ 622#define for_each_kimage_entry(image, ptr, entry) \
623 for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \ 623 for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
624 ptr = (entry & IND_INDIRECTION)? \ 624 ptr = (entry & IND_INDIRECTION) ? \
625 phys_to_virt((entry & PAGE_MASK)): ptr +1) 625 phys_to_virt((entry & PAGE_MASK)) : ptr + 1)
626 626
627static void kimage_free_entry(kimage_entry_t entry) 627static void kimage_free_entry(kimage_entry_t entry)
628{ 628{
@@ -650,8 +650,7 @@ static void kimage_free(struct kimage *image)
650 * done with it. 650 * done with it.
651 */ 651 */
652 ind = entry; 652 ind = entry;
653 } 653 } else if (entry & IND_SOURCE)
654 else if (entry & IND_SOURCE)
655 kimage_free_entry(entry); 654 kimage_free_entry(entry);
656 } 655 }
657 /* Free the final indirection page */ 656 /* Free the final indirection page */
@@ -774,8 +773,7 @@ static struct page *kimage_alloc_page(struct kimage *image,
774 addr = old_addr; 773 addr = old_addr;
775 page = old_page; 774 page = old_page;
776 break; 775 break;
777 } 776 } else {
778 else {
779 /* Place the page on the destination list I 777 /* Place the page on the destination list I
780 * will use it later. 778 * will use it later.
781 */ 779 */
@@ -1059,7 +1057,7 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
1059 return -EINVAL; 1057 return -EINVAL;
1060 1058
1061 ksegments = compat_alloc_user_space(nr_segments * sizeof(out)); 1059 ksegments = compat_alloc_user_space(nr_segments * sizeof(out));
1062 for (i=0; i < nr_segments; i++) { 1060 for (i = 0; i < nr_segments; i++) {
1063 result = copy_from_user(&in, &segments[i], sizeof(in)); 1061 result = copy_from_user(&in, &segments[i], sizeof(in));
1064 if (result) 1062 if (result)
1065 return -EFAULT; 1063 return -EFAULT;
@@ -1214,14 +1212,14 @@ void crash_save_cpu(struct pt_regs *regs, int cpu)
1214 * squirrelled away. ELF notes happen to provide 1212 * squirrelled away. ELF notes happen to provide
1215 * all of that, so there is no need to invent something new. 1213 * all of that, so there is no need to invent something new.
1216 */ 1214 */
1217 buf = (u32*)per_cpu_ptr(crash_notes, cpu); 1215 buf = (u32 *)per_cpu_ptr(crash_notes, cpu);
1218 if (!buf) 1216 if (!buf)
1219 return; 1217 return;
1220 memset(&prstatus, 0, sizeof(prstatus)); 1218 memset(&prstatus, 0, sizeof(prstatus));
1221 prstatus.pr_pid = current->pid; 1219 prstatus.pr_pid = current->pid;
1222 elf_core_copy_kernel_regs(&prstatus.pr_reg, regs); 1220 elf_core_copy_kernel_regs(&prstatus.pr_reg, regs);
1223 buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, 1221 buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
1224 &prstatus, sizeof(prstatus)); 1222 &prstatus, sizeof(prstatus));
1225 final_note(buf); 1223 final_note(buf);
1226} 1224}
1227 1225
@@ -1230,8 +1228,7 @@ static int __init crash_notes_memory_init(void)
1230 /* Allocate memory for saving cpu registers. */ 1228 /* Allocate memory for saving cpu registers. */
1231 crash_notes = alloc_percpu(note_buf_t); 1229 crash_notes = alloc_percpu(note_buf_t);
1232 if (!crash_notes) { 1230 if (!crash_notes) {
1233 printk("Kexec: Memory allocation for saving cpu register" 1231 pr_warn("Kexec: Memory allocation for saving cpu register states failed\n");
1234 " states failed\n");
1235 return -ENOMEM; 1232 return -ENOMEM;
1236 } 1233 }
1237 return 0; 1234 return 0;
@@ -1253,10 +1250,10 @@ subsys_initcall(crash_notes_memory_init);
1253 * 1250 *
1254 * The function returns 0 on success and -EINVAL on failure. 1251 * The function returns 0 on success and -EINVAL on failure.
1255 */ 1252 */
1256static int __init parse_crashkernel_mem(char *cmdline, 1253static int __init parse_crashkernel_mem(char *cmdline,
1257 unsigned long long system_ram, 1254 unsigned long long system_ram,
1258 unsigned long long *crash_size, 1255 unsigned long long *crash_size,
1259 unsigned long long *crash_base) 1256 unsigned long long *crash_base)
1260{ 1257{
1261 char *cur = cmdline, *tmp; 1258 char *cur = cmdline, *tmp;
1262 1259
@@ -1267,12 +1264,12 @@ static int __init parse_crashkernel_mem(char *cmdline,
1267 /* get the start of the range */ 1264 /* get the start of the range */
1268 start = memparse(cur, &tmp); 1265 start = memparse(cur, &tmp);
1269 if (cur == tmp) { 1266 if (cur == tmp) {
1270 pr_warning("crashkernel: Memory value expected\n"); 1267 pr_warn("crashkernel: Memory value expected\n");
1271 return -EINVAL; 1268 return -EINVAL;
1272 } 1269 }
1273 cur = tmp; 1270 cur = tmp;
1274 if (*cur != '-') { 1271 if (*cur != '-') {
1275 pr_warning("crashkernel: '-' expected\n"); 1272 pr_warn("crashkernel: '-' expected\n");
1276 return -EINVAL; 1273 return -EINVAL;
1277 } 1274 }
1278 cur++; 1275 cur++;
@@ -1281,31 +1278,30 @@ static int __init parse_crashkernel_mem(char *cmdline,
1281 if (*cur != ':') { 1278 if (*cur != ':') {
1282 end = memparse(cur, &tmp); 1279 end = memparse(cur, &tmp);
1283 if (cur == tmp) { 1280 if (cur == tmp) {
1284 pr_warning("crashkernel: Memory " 1281 pr_warn("crashkernel: Memory value expected\n");
1285 "value expected\n");
1286 return -EINVAL; 1282 return -EINVAL;
1287 } 1283 }
1288 cur = tmp; 1284 cur = tmp;
1289 if (end <= start) { 1285 if (end <= start) {
1290 pr_warning("crashkernel: end <= start\n"); 1286 pr_warn("crashkernel: end <= start\n");
1291 return -EINVAL; 1287 return -EINVAL;
1292 } 1288 }
1293 } 1289 }
1294 1290
1295 if (*cur != ':') { 1291 if (*cur != ':') {
1296 pr_warning("crashkernel: ':' expected\n"); 1292 pr_warn("crashkernel: ':' expected\n");
1297 return -EINVAL; 1293 return -EINVAL;
1298 } 1294 }
1299 cur++; 1295 cur++;
1300 1296
1301 size = memparse(cur, &tmp); 1297 size = memparse(cur, &tmp);
1302 if (cur == tmp) { 1298 if (cur == tmp) {
1303 pr_warning("Memory value expected\n"); 1299 pr_warn("Memory value expected\n");
1304 return -EINVAL; 1300 return -EINVAL;
1305 } 1301 }
1306 cur = tmp; 1302 cur = tmp;
1307 if (size >= system_ram) { 1303 if (size >= system_ram) {
1308 pr_warning("crashkernel: invalid size\n"); 1304 pr_warn("crashkernel: invalid size\n");
1309 return -EINVAL; 1305 return -EINVAL;
1310 } 1306 }
1311 1307
@@ -1323,8 +1319,7 @@ static int __init parse_crashkernel_mem(char *cmdline,
1323 cur++; 1319 cur++;
1324 *crash_base = memparse(cur, &tmp); 1320 *crash_base = memparse(cur, &tmp);
1325 if (cur == tmp) { 1321 if (cur == tmp) {
1326 pr_warning("Memory value expected " 1322 pr_warn("Memory value expected after '@'\n");
1327 "after '@'\n");
1328 return -EINVAL; 1323 return -EINVAL;
1329 } 1324 }
1330 } 1325 }
@@ -1336,26 +1331,26 @@ static int __init parse_crashkernel_mem(char *cmdline,
1336/* 1331/*
1337 * That function parses "simple" (old) crashkernel command lines like 1332 * That function parses "simple" (old) crashkernel command lines like
1338 * 1333 *
1339 * crashkernel=size[@offset] 1334 * crashkernel=size[@offset]
1340 * 1335 *
1341 * It returns 0 on success and -EINVAL on failure. 1336 * It returns 0 on success and -EINVAL on failure.
1342 */ 1337 */
1343static int __init parse_crashkernel_simple(char *cmdline, 1338static int __init parse_crashkernel_simple(char *cmdline,
1344 unsigned long long *crash_size, 1339 unsigned long long *crash_size,
1345 unsigned long long *crash_base) 1340 unsigned long long *crash_base)
1346{ 1341{
1347 char *cur = cmdline; 1342 char *cur = cmdline;
1348 1343
1349 *crash_size = memparse(cmdline, &cur); 1344 *crash_size = memparse(cmdline, &cur);
1350 if (cmdline == cur) { 1345 if (cmdline == cur) {
1351 pr_warning("crashkernel: memory value expected\n"); 1346 pr_warn("crashkernel: memory value expected\n");
1352 return -EINVAL; 1347 return -EINVAL;
1353 } 1348 }
1354 1349
1355 if (*cur == '@') 1350 if (*cur == '@')
1356 *crash_base = memparse(cur+1, &cur); 1351 *crash_base = memparse(cur+1, &cur);
1357 else if (*cur != ' ' && *cur != '\0') { 1352 else if (*cur != ' ' && *cur != '\0') {
1358 pr_warning("crashkernel: unrecognized char\n"); 1353 pr_warn("crashkernel: unrecognized char\n");
1359 return -EINVAL; 1354 return -EINVAL;
1360 } 1355 }
1361 1356
@@ -1683,7 +1678,15 @@ int kernel_kexec(void)
1683 kexec_in_progress = true; 1678 kexec_in_progress = true;
1684 kernel_restart_prepare(NULL); 1679 kernel_restart_prepare(NULL);
1685 migrate_to_reboot_cpu(); 1680 migrate_to_reboot_cpu();
1686 printk(KERN_EMERG "Starting new kernel\n"); 1681
1682 /*
1683 * migrate_to_reboot_cpu() disables CPU hotplug assuming that
1684 * no further code needs to use CPU hotplug (which is true in
1685 * the reboot case). However, the kexec path depends on using
1686 * CPU hotplug again; so re-enable it here.
1687 */
1688 cpu_hotplug_enable();
1689 pr_emerg("Starting new kernel\n");
1687 machine_shutdown(); 1690 machine_shutdown();
1688 } 1691 }
1689 1692
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 0ac67a5861c5..8637e041a247 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -285,10 +285,7 @@ static int wait_for_helper(void *data)
285 pid_t pid; 285 pid_t pid;
286 286
287 /* If SIGCLD is ignored sys_wait4 won't populate the status. */ 287 /* If SIGCLD is ignored sys_wait4 won't populate the status. */
288 spin_lock_irq(&current->sighand->siglock); 288 kernel_sigaction(SIGCHLD, SIG_DFL);
289 current->sighand->action[SIGCHLD-1].sa.sa_handler = SIG_DFL;
290 spin_unlock_irq(&current->sighand->siglock);
291
292 pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD); 289 pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD);
293 if (pid < 0) { 290 if (pid < 0) {
294 sub_info->retval = pid; 291 sub_info->retval = pid;
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 2495a9b14ac8..6683ccef9fff 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -37,6 +37,7 @@ static ssize_t uevent_seqnum_show(struct kobject *kobj,
37} 37}
38KERNEL_ATTR_RO(uevent_seqnum); 38KERNEL_ATTR_RO(uevent_seqnum);
39 39
40#ifdef CONFIG_UEVENT_HELPER
40/* uevent helper program, used during early boot */ 41/* uevent helper program, used during early boot */
41static ssize_t uevent_helper_show(struct kobject *kobj, 42static ssize_t uevent_helper_show(struct kobject *kobj,
42 struct kobj_attribute *attr, char *buf) 43 struct kobj_attribute *attr, char *buf)
@@ -56,7 +57,7 @@ static ssize_t uevent_helper_store(struct kobject *kobj,
56 return count; 57 return count;
57} 58}
58KERNEL_ATTR_RW(uevent_helper); 59KERNEL_ATTR_RW(uevent_helper);
59 60#endif
60 61
61#ifdef CONFIG_PROFILING 62#ifdef CONFIG_PROFILING
62static ssize_t profiling_show(struct kobject *kobj, 63static ssize_t profiling_show(struct kobject *kobj,
@@ -189,7 +190,9 @@ EXPORT_SYMBOL_GPL(kernel_kobj);
189static struct attribute * kernel_attrs[] = { 190static struct attribute * kernel_attrs[] = {
190 &fscaps_attr.attr, 191 &fscaps_attr.attr,
191 &uevent_seqnum_attr.attr, 192 &uevent_seqnum_attr.attr,
193#ifdef CONFIG_UEVENT_HELPER
192 &uevent_helper_attr.attr, 194 &uevent_helper_attr.attr,
195#endif
193#ifdef CONFIG_PROFILING 196#ifdef CONFIG_PROFILING
194 &profiling_attr.attr, 197 &profiling_attr.attr,
195#endif 198#endif
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 9a130ec06f7a..c2390f41307b 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -262,7 +262,7 @@ static void create_kthread(struct kthread_create_info *create)
262 * kthread_stop() has been called). The return value should be zero 262 * kthread_stop() has been called). The return value should be zero
263 * or a negative error number; it will be passed to kthread_stop(). 263 * or a negative error number; it will be passed to kthread_stop().
264 * 264 *
265 * Returns a task_struct or ERR_PTR(-ENOMEM). 265 * Returns a task_struct or ERR_PTR(-ENOMEM) or ERR_PTR(-EINTR).
266 */ 266 */
267struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), 267struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
268 void *data, int node, 268 void *data, int node,
@@ -298,7 +298,7 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
298 * that thread. 298 * that thread.
299 */ 299 */
300 if (xchg(&create->done, NULL)) 300 if (xchg(&create->done, NULL))
301 return ERR_PTR(-ENOMEM); 301 return ERR_PTR(-EINTR);
302 /* 302 /*
303 * kthreadd (or new kernel thread) will call complete() 303 * kthreadd (or new kernel thread) will call complete()
304 * shortly. 304 * shortly.
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index a462b317f9a0..a02812743a7e 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -88,7 +88,8 @@ static void clear_global_latency_tracing(void)
88} 88}
89 89
90static void __sched 90static void __sched
91account_global_scheduler_latency(struct task_struct *tsk, struct latency_record *lat) 91account_global_scheduler_latency(struct task_struct *tsk,
92 struct latency_record *lat)
92{ 93{
93 int firstnonnull = MAXLR + 1; 94 int firstnonnull = MAXLR + 1;
94 int i; 95 int i;
@@ -255,7 +256,7 @@ static int lstats_show(struct seq_file *m, void *v)
255 break; 256 break;
256 seq_printf(m, " %ps", (void *)bt); 257 seq_printf(m, " %ps", (void *)bt);
257 } 258 }
258 seq_printf(m, "\n"); 259 seq_puts(m, "\n");
259 } 260 }
260 } 261 }
261 return 0; 262 return 0;
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index b0e9467922e1..d24e4339b46d 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -4188,7 +4188,7 @@ void debug_show_held_locks(struct task_struct *task)
4188} 4188}
4189EXPORT_SYMBOL_GPL(debug_show_held_locks); 4189EXPORT_SYMBOL_GPL(debug_show_held_locks);
4190 4190
4191asmlinkage void lockdep_sys_exit(void) 4191asmlinkage __visible void lockdep_sys_exit(void)
4192{ 4192{
4193 struct task_struct *curr = current; 4193 struct task_struct *curr = current;
4194 4194
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index f26b1a18e34e..0955b885d0dc 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -82,14 +82,14 @@ struct lock_writer_stress_stats {
82}; 82};
83static struct lock_writer_stress_stats *lwsa; 83static struct lock_writer_stress_stats *lwsa;
84 84
85#if defined(MODULE) || defined(CONFIG_LOCK_TORTURE_TEST_RUNNABLE) 85#if defined(MODULE)
86#define LOCKTORTURE_RUNNABLE_INIT 1 86#define LOCKTORTURE_RUNNABLE_INIT 1
87#else 87#else
88#define LOCKTORTURE_RUNNABLE_INIT 0 88#define LOCKTORTURE_RUNNABLE_INIT 0
89#endif 89#endif
90int locktorture_runnable = LOCKTORTURE_RUNNABLE_INIT; 90int locktorture_runnable = LOCKTORTURE_RUNNABLE_INIT;
91module_param(locktorture_runnable, int, 0444); 91module_param(locktorture_runnable, int, 0444);
92MODULE_PARM_DESC(locktorture_runnable, "Start locktorture at boot"); 92MODULE_PARM_DESC(locktorture_runnable, "Start locktorture at module init");
93 93
94/* Forward reference. */ 94/* Forward reference. */
95static void lock_torture_cleanup(void); 95static void lock_torture_cleanup(void);
@@ -216,10 +216,11 @@ static int lock_torture_writer(void *arg)
216 static DEFINE_TORTURE_RANDOM(rand); 216 static DEFINE_TORTURE_RANDOM(rand);
217 217
218 VERBOSE_TOROUT_STRING("lock_torture_writer task started"); 218 VERBOSE_TOROUT_STRING("lock_torture_writer task started");
219 set_user_nice(current, 19); 219 set_user_nice(current, MAX_NICE);
220 220
221 do { 221 do {
222 schedule_timeout_uninterruptible(1); 222 if ((torture_random(&rand) & 0xfffff) == 0)
223 schedule_timeout_uninterruptible(1);
223 cur_ops->writelock(); 224 cur_ops->writelock();
224 if (WARN_ON_ONCE(lock_is_write_held)) 225 if (WARN_ON_ONCE(lock_is_write_held))
225 lwsp->n_write_lock_fail++; 226 lwsp->n_write_lock_fail++;
@@ -354,7 +355,8 @@ static int __init lock_torture_init(void)
354 &lock_busted_ops, &spin_lock_ops, &spin_lock_irq_ops, 355 &lock_busted_ops, &spin_lock_ops, &spin_lock_irq_ops,
355 }; 356 };
356 357
357 torture_init_begin(torture_type, verbose, &locktorture_runnable); 358 if (!torture_init_begin(torture_type, verbose, &locktorture_runnable))
359 return -EBUSY;
358 360
359 /* Process args and tell the world that the torturer is on the job. */ 361 /* Process args and tell the world that the torturer is on the job. */
360 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { 362 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) {
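
torture_init_begin() now tells its caller whether another torture test already owns the shared torture framework, and locktorture backs off with -EBUSY if so. A sketch of that init pattern for a hypothetical test module, assuming the three-argument torture_init_begin() used in the hunk above; the names are placeholders:

#include <linux/torture.h>
#include <linux/init.h>

static bool verbose = true;
static int example_runnable;

static int __init example_torture_init(void)
{
        if (!torture_init_begin("example_lock", verbose, &example_runnable))
                return -EBUSY;  /* another torture test owns the framework */

        /* ... allocate statistics, spawn reader/writer kthreads ... */

        torture_init_end();
        return 0;
}
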
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index aa4dff04b594..a620d4d08ca6 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -343,9 +343,16 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
343 * top_waiter can be NULL, when we are in the deboosting 343 * top_waiter can be NULL, when we are in the deboosting
344 * mode! 344 * mode!
345 */ 345 */
346 if (top_waiter && (!task_has_pi_waiters(task) || 346 if (top_waiter) {
347 top_waiter != task_top_pi_waiter(task))) 347 if (!task_has_pi_waiters(task))
348 goto out_unlock_pi; 348 goto out_unlock_pi;
349 /*
350 * If deadlock detection is off, we stop here if we
351 * are not the top pi waiter of the task.
352 */
353 if (!detect_deadlock && top_waiter != task_top_pi_waiter(task))
354 goto out_unlock_pi;
355 }
349 356
350 /* 357 /*
351 * When deadlock detection is off then we check, if further 358 * When deadlock detection is off then we check, if further
@@ -361,7 +368,12 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
361 goto retry; 368 goto retry;
362 } 369 }
363 370
364 /* Deadlock detection */ 371 /*
372 * Deadlock detection. If the lock is the same as the original
373 * lock which caused us to walk the lock chain or if the
374 * current lock is owned by the task which initiated the chain
375 * walk, we detected a deadlock.
376 */
365 if (lock == orig_lock || rt_mutex_owner(lock) == top_task) { 377 if (lock == orig_lock || rt_mutex_owner(lock) == top_task) {
366 debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock); 378 debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock);
367 raw_spin_unlock(&lock->wait_lock); 379 raw_spin_unlock(&lock->wait_lock);
@@ -527,6 +539,18 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
527 unsigned long flags; 539 unsigned long flags;
528 int chain_walk = 0, res; 540 int chain_walk = 0, res;
529 541
542 /*
543 * Early deadlock detection. We really don't want the task to
544 * enqueue on itself just to untangle the mess later. It's not
545 * only an optimization. We drop the locks, so another waiter
546 * can come in before the chain walk detects the deadlock. So
547 * the other will detect the deadlock and return -EDEADLOCK,
548 * which is wrong, as the other waiter is not in a deadlock
549 * situation.
550 */
551 if (detect_deadlock && owner == task)
552 return -EDEADLK;
553
530 raw_spin_lock_irqsave(&task->pi_lock, flags); 554 raw_spin_lock_irqsave(&task->pi_lock, flags);
531 __rt_mutex_adjust_prio(task); 555 __rt_mutex_adjust_prio(task);
532 waiter->task = task; 556 waiter->task = task;
diff --git a/kernel/module.c b/kernel/module.c
index 11869408f79b..81e727cf6df9 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -815,9 +815,6 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
815 return -EFAULT; 815 return -EFAULT;
816 name[MODULE_NAME_LEN-1] = '\0'; 816 name[MODULE_NAME_LEN-1] = '\0';
817 817
818 if (!(flags & O_NONBLOCK))
819 pr_warn("waiting module removal not supported: please upgrade\n");
820
821 if (mutex_lock_interruptible(&module_mutex) != 0) 818 if (mutex_lock_interruptible(&module_mutex) != 0)
822 return -EINTR; 819 return -EINTR;
823 820
@@ -3023,21 +3020,6 @@ static int do_init_module(struct module *mod)
3023 */ 3020 */
3024 current->flags &= ~PF_USED_ASYNC; 3021 current->flags &= ~PF_USED_ASYNC;
3025 3022
3026 blocking_notifier_call_chain(&module_notify_list,
3027 MODULE_STATE_COMING, mod);
3028
3029 /* Set RO and NX regions for core */
3030 set_section_ro_nx(mod->module_core,
3031 mod->core_text_size,
3032 mod->core_ro_size,
3033 mod->core_size);
3034
3035 /* Set RO and NX regions for init */
3036 set_section_ro_nx(mod->module_init,
3037 mod->init_text_size,
3038 mod->init_ro_size,
3039 mod->init_size);
3040
3041 do_mod_ctors(mod); 3023 do_mod_ctors(mod);
3042 /* Start the module */ 3024 /* Start the module */
3043 if (mod->init != NULL) 3025 if (mod->init != NULL)
@@ -3168,9 +3150,26 @@ static int complete_formation(struct module *mod, struct load_info *info)
3168 /* This relies on module_mutex for list integrity. */ 3150 /* This relies on module_mutex for list integrity. */
3169 module_bug_finalize(info->hdr, info->sechdrs, mod); 3151 module_bug_finalize(info->hdr, info->sechdrs, mod);
3170 3152
3153 /* Set RO and NX regions for core */
3154 set_section_ro_nx(mod->module_core,
3155 mod->core_text_size,
3156 mod->core_ro_size,
3157 mod->core_size);
3158
3159 /* Set RO and NX regions for init */
3160 set_section_ro_nx(mod->module_init,
3161 mod->init_text_size,
3162 mod->init_ro_size,
3163 mod->init_size);
3164
3171 /* Mark state as coming so strong_try_module_get() ignores us, 3165 /* Mark state as coming so strong_try_module_get() ignores us,
3172 * but kallsyms etc. can see us. */ 3166 * but kallsyms etc. can see us. */
3173 mod->state = MODULE_STATE_COMING; 3167 mod->state = MODULE_STATE_COMING;
3168 mutex_unlock(&module_mutex);
3169
3170 blocking_notifier_call_chain(&module_notify_list,
3171 MODULE_STATE_COMING, mod);
3172 return 0;
3174 3173
3175out: 3174out:
3176 mutex_unlock(&module_mutex); 3175 mutex_unlock(&module_mutex);
@@ -3193,6 +3192,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
3193{ 3192{
3194 struct module *mod; 3193 struct module *mod;
3195 long err; 3194 long err;
3195 char *after_dashes;
3196 3196
3197 err = module_sig_check(info); 3197 err = module_sig_check(info);
3198 if (err) 3198 if (err)
@@ -3271,16 +3271,24 @@ static int load_module(struct load_info *info, const char __user *uargs,
3271 3271
3272 dynamic_debug_setup(info->debug, info->num_debug); 3272 dynamic_debug_setup(info->debug, info->num_debug);
3273 3273
3274 /* Ftrace init must be called in the MODULE_STATE_UNFORMED state */
3275 ftrace_module_init(mod);
3276
3274 /* Finally it's fully formed, ready to start executing. */ 3277 /* Finally it's fully formed, ready to start executing. */
3275 err = complete_formation(mod, info); 3278 err = complete_formation(mod, info);
3276 if (err) 3279 if (err)
3277 goto ddebug_cleanup; 3280 goto ddebug_cleanup;
3278 3281
3279 /* Module is ready to execute: parsing args may do that. */ 3282 /* Module is ready to execute: parsing args may do that. */
3280 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, 3283 after_dashes = parse_args(mod->name, mod->args, mod->kp, mod->num_kp,
3281 -32768, 32767, unknown_module_param_cb); 3284 -32768, 32767, unknown_module_param_cb);
3282 if (err < 0) 3285 if (IS_ERR(after_dashes)) {
3286 err = PTR_ERR(after_dashes);
3283 goto bug_cleanup; 3287 goto bug_cleanup;
3288 } else if (after_dashes) {
3289 pr_warn("%s: parameters '%s' after `--' ignored\n",
3290 mod->name, after_dashes);
3291 }
3284 3292
3285 /* Link in to sysfs. */ 3293 /* Link in to sysfs. */
3286 err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp); 3294 err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp);
diff --git a/kernel/panic.c b/kernel/panic.c
index d02fa9fef46a..62e16cef9cc2 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -32,6 +32,7 @@ static unsigned long tainted_mask;
32static int pause_on_oops; 32static int pause_on_oops;
33static int pause_on_oops_flag; 33static int pause_on_oops_flag;
34static DEFINE_SPINLOCK(pause_on_oops_lock); 34static DEFINE_SPINLOCK(pause_on_oops_lock);
35static bool crash_kexec_post_notifiers;
35 36
36int panic_timeout = CONFIG_PANIC_TIMEOUT; 37int panic_timeout = CONFIG_PANIC_TIMEOUT;
37EXPORT_SYMBOL_GPL(panic_timeout); 38EXPORT_SYMBOL_GPL(panic_timeout);
@@ -112,9 +113,11 @@ void panic(const char *fmt, ...)
112 /* 113 /*
113 * If we have crashed and we have a crash kernel loaded let it handle 114 * If we have crashed and we have a crash kernel loaded let it handle
114 * everything else. 115 * everything else.
115 * Do we want to call this before we try to display a message? 116 * If we want to run this after calling panic_notifiers, pass
117 * the "crash_kexec_post_notifiers" option to the kernel.
116 */ 118 */
117 crash_kexec(NULL); 119 if (!crash_kexec_post_notifiers)
120 crash_kexec(NULL);
118 121
119 /* 122 /*
120 * Note smp_send_stop is the usual smp shutdown function, which 123 * Note smp_send_stop is the usual smp shutdown function, which
@@ -131,6 +134,15 @@ void panic(const char *fmt, ...)
131 134
132 kmsg_dump(KMSG_DUMP_PANIC); 135 kmsg_dump(KMSG_DUMP_PANIC);
133 136
137 /*
138 * If you doubt that kdump always works in every situation,
139 * "crash_kexec_post_notifiers" offers a chance to run the
140 * panic notifiers and dump kmsg before kdump.
141 * Note: since some panic notifiers can make the crashed kernel
142 * more unstable, this can also increase the risk of kdump failing.
143 */
144 crash_kexec(NULL);
145
134 bust_spinlocks(0); 146 bust_spinlocks(0);
135 147
136 if (!panic_blink) 148 if (!panic_blink)
@@ -472,6 +484,13 @@ EXPORT_SYMBOL(__stack_chk_fail);
472core_param(panic, panic_timeout, int, 0644); 484core_param(panic, panic_timeout, int, 0644);
473core_param(pause_on_oops, pause_on_oops, int, 0644); 485core_param(pause_on_oops, pause_on_oops, int, 0644);
474 486
487static int __init setup_crash_kexec_post_notifiers(char *s)
488{
489 crash_kexec_post_notifiers = true;
490 return 0;
491}
492early_param("crash_kexec_post_notifiers", setup_crash_kexec_post_notifiers);
493
475static int __init oops_setup(char *s) 494static int __init oops_setup(char *s)
476{ 495{
477 if (!s) 496 if (!s)
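
By default crash_kexec() still runs before anything else; booting with "crash_kexec_post_notifiers" defers it until after the panic notifier chain and kmsg_dump(). A sketch of a notifier that benefits from the new ordering; the notifier itself is hypothetical and not part of the patch:

#include <linux/kernel.h>
#include <linux/notifier.h>
#include <linux/init.h>

static int example_panic_notify(struct notifier_block *nb,
                                unsigned long action, void *data)
{
        /* e.g. kick an external watchdog or log to persistent storage */
        return NOTIFY_DONE;
}

static struct notifier_block example_panic_nb = {
        .notifier_call = example_panic_notify,
};

static int __init example_panic_init(void)
{
        atomic_notifier_chain_register(&panic_notifier_list, &example_panic_nb);
        return 0;
}
late_initcall(example_panic_init);
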
diff --git a/kernel/params.c b/kernel/params.c
index b00142e7f3ba..1e52ca233fd9 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -177,13 +177,13 @@ static char *next_arg(char *args, char **param, char **val)
177} 177}
178 178
179/* Args looks like "foo=bar,bar2 baz=fuz wiz". */ 179/* Args looks like "foo=bar,bar2 baz=fuz wiz". */
180int parse_args(const char *doing, 180char *parse_args(const char *doing,
181 char *args, 181 char *args,
182 const struct kernel_param *params, 182 const struct kernel_param *params,
183 unsigned num, 183 unsigned num,
184 s16 min_level, 184 s16 min_level,
185 s16 max_level, 185 s16 max_level,
186 int (*unknown)(char *param, char *val, const char *doing)) 186 int (*unknown)(char *param, char *val, const char *doing))
187{ 187{
188 char *param, *val; 188 char *param, *val;
189 189
@@ -198,6 +198,9 @@ int parse_args(const char *doing,
198 int irq_was_disabled; 198 int irq_was_disabled;
199 199
200 args = next_arg(args, &param, &val); 200 args = next_arg(args, &param, &val);
201 /* Stop at -- */
202 if (!val && strcmp(param, "--") == 0)
203 return args;
201 irq_was_disabled = irqs_disabled(); 204 irq_was_disabled = irqs_disabled();
202 ret = parse_one(param, val, doing, params, num, 205 ret = parse_one(param, val, doing, params, num,
203 min_level, max_level, unknown); 206 min_level, max_level, unknown);
@@ -208,22 +211,22 @@ int parse_args(const char *doing,
208 switch (ret) { 211 switch (ret) {
209 case -ENOENT: 212 case -ENOENT:
210 pr_err("%s: Unknown parameter `%s'\n", doing, param); 213 pr_err("%s: Unknown parameter `%s'\n", doing, param);
211 return ret; 214 return ERR_PTR(ret);
212 case -ENOSPC: 215 case -ENOSPC:
213 pr_err("%s: `%s' too large for parameter `%s'\n", 216 pr_err("%s: `%s' too large for parameter `%s'\n",
214 doing, val ?: "", param); 217 doing, val ?: "", param);
215 return ret; 218 return ERR_PTR(ret);
216 case 0: 219 case 0:
217 break; 220 break;
218 default: 221 default:
219 pr_err("%s: `%s' invalid for parameter `%s'\n", 222 pr_err("%s: `%s' invalid for parameter `%s'\n",
220 doing, val ?: "", param); 223 doing, val ?: "", param);
221 return ret; 224 return ERR_PTR(ret);
222 } 225 }
223 } 226 }
224 227
225 /* All parsed OK. */ 228 /* All parsed OK. */
226 return 0; 229 return NULL;
227} 230}
228 231
229/* Lazy bastard, eh? */ 232/* Lazy bastard, eh? */
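
parse_args() now returns NULL when every argument was consumed, an ERR_PTR() on failure, or a pointer to whatever followed a bare "--" token. A minimal caller sketch mirroring the module loader's use above; the parameter table and the unknown-parameter callback are placeholders:

#include <linux/moduleparam.h>
#include <linux/err.h>
#include <linux/printk.h>

static int example_unknown_cb(char *param, char *val, const char *doing)
{
        pr_debug("%s: unknown parameter '%s'\n", doing, param);
        return 0;
}

static int example_parse(char *cmdline, const struct kernel_param *params,
                         unsigned int num)
{
        char *rest;

        rest = parse_args("example", cmdline, params, num,
                          -32768, 32767, example_unknown_cb);
        if (IS_ERR(rest))
                return PTR_ERR(rest);   /* a parameter failed to parse */
        if (rest)
                pr_warn("example: ignoring arguments after '--': %s\n", rest);
        return 0;
}
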
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 2fac9cc79b3d..9a83d780facd 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -257,8 +257,7 @@ config ARCH_HAS_OPP
257 bool 257 bool
258 258
259config PM_OPP 259config PM_OPP
260 bool "Operating Performance Point (OPP) Layer library" 260 bool
261 depends on ARCH_HAS_OPP
262 ---help--- 261 ---help---
263 SOCs have a standard set of tuples consisting of frequency and 262 SOCs have a standard set of tuples consisting of frequency and
264 voltage pairs that the device will support per voltage domain. This 263 voltage pairs that the device will support per voltage domain. This
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index f4f2073711d3..49e0a20fd010 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -28,6 +28,7 @@
28#include <linux/syscore_ops.h> 28#include <linux/syscore_ops.h>
29#include <linux/ctype.h> 29#include <linux/ctype.h>
30#include <linux/genhd.h> 30#include <linux/genhd.h>
31#include <trace/events/power.h>
31 32
32#include "power.h" 33#include "power.h"
33 34
@@ -35,7 +36,7 @@
35static int nocompress; 36static int nocompress;
36static int noresume; 37static int noresume;
37static int resume_wait; 38static int resume_wait;
38static int resume_delay; 39static unsigned int resume_delay;
39static char resume_file[256] = CONFIG_PM_STD_PARTITION; 40static char resume_file[256] = CONFIG_PM_STD_PARTITION;
40dev_t swsusp_resume_device; 41dev_t swsusp_resume_device;
41sector_t swsusp_resume_block; 42sector_t swsusp_resume_block;
@@ -228,19 +229,23 @@ static void platform_recover(int platform_mode)
228void swsusp_show_speed(struct timeval *start, struct timeval *stop, 229void swsusp_show_speed(struct timeval *start, struct timeval *stop,
229 unsigned nr_pages, char *msg) 230 unsigned nr_pages, char *msg)
230{ 231{
231 s64 elapsed_centisecs64; 232 u64 elapsed_centisecs64;
232 int centisecs; 233 unsigned int centisecs;
233 int k; 234 unsigned int k;
234 int kps; 235 unsigned int kps;
235 236
236 elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start); 237 elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start);
238 /*
239 * If "(s64)elapsed_centisecs64 < 0", it will print long elapsed time,
240 * it is obvious enough for what went wrong.
241 */
237 do_div(elapsed_centisecs64, NSEC_PER_SEC / 100); 242 do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
238 centisecs = elapsed_centisecs64; 243 centisecs = elapsed_centisecs64;
239 if (centisecs == 0) 244 if (centisecs == 0)
240 centisecs = 1; /* avoid div-by-zero */ 245 centisecs = 1; /* avoid div-by-zero */
241 k = nr_pages * (PAGE_SIZE / 1024); 246 k = nr_pages * (PAGE_SIZE / 1024);
242 kps = (k * 100) / centisecs; 247 kps = (k * 100) / centisecs;
243 printk(KERN_INFO "PM: %s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", 248 printk(KERN_INFO "PM: %s %u kbytes in %u.%02u seconds (%u.%02u MB/s)\n",
244 msg, k, 249 msg, k,
245 centisecs / 100, centisecs % 100, 250 centisecs / 100, centisecs % 100,
246 kps / 1000, (kps % 1000) / 10); 251 kps / 1000, (kps % 1000) / 10);
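
With the switch to unsigned arithmetic the throughput computation in swsusp_show_speed() is easy to check by hand. A small userspace illustration with made-up numbers, 50000 pages of 4 KiB written in 2.5 seconds:

#include <stdio.h>

int main(void)
{
        unsigned int nr_pages = 50000, centisecs = 250;     /* 2.5 s       */
        unsigned int k   = nr_pages * (4096 / 1024);        /* 200000 KiB  */
        unsigned int kps = (k * 100) / centisecs;           /* 80000 KiB/s */

        printf("PM: Wrote %u kbytes in %u.%02u seconds (%u.%02u MB/s)\n",
               k, centisecs / 100, centisecs % 100,
               kps / 1000, (kps % 1000) / 10);
        return 0;   /* prints: 200000 kbytes in 2.50 seconds (80.00 MB/s) */
}
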
@@ -288,7 +293,9 @@ static int create_image(int platform_mode)
288 293
289 in_suspend = 1; 294 in_suspend = 1;
290 save_processor_state(); 295 save_processor_state();
296 trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, true);
291 error = swsusp_arch_suspend(); 297 error = swsusp_arch_suspend();
298 trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, false);
292 if (error) 299 if (error)
293 printk(KERN_ERR "PM: Error %d creating hibernation image\n", 300 printk(KERN_ERR "PM: Error %d creating hibernation image\n",
294 error); 301 error);
@@ -595,7 +602,8 @@ static void power_down(void)
595 case HIBERNATION_PLATFORM: 602 case HIBERNATION_PLATFORM:
596 hibernation_platform_enter(); 603 hibernation_platform_enter();
597 case HIBERNATION_SHUTDOWN: 604 case HIBERNATION_SHUTDOWN:
598 kernel_power_off(); 605 if (pm_power_off)
606 kernel_power_off();
599 break; 607 break;
600#ifdef CONFIG_SUSPEND 608#ifdef CONFIG_SUSPEND
601 case HIBERNATION_SUSPEND: 609 case HIBERNATION_SUSPEND:
@@ -623,7 +631,8 @@ static void power_down(void)
623 * corruption after resume. 631 * corruption after resume.
624 */ 632 */
625 printk(KERN_CRIT "PM: Please power down manually\n"); 633 printk(KERN_CRIT "PM: Please power down manually\n");
626 while(1); 634 while (1)
635 cpu_relax();
627} 636}
628 637
629/** 638/**
@@ -1109,7 +1118,10 @@ static int __init resumewait_setup(char *str)
1109 1118
1110static int __init resumedelay_setup(char *str) 1119static int __init resumedelay_setup(char *str)
1111{ 1120{
1112 resume_delay = simple_strtoul(str, NULL, 0); 1121 int rc = kstrtouint(str, 0, &resume_delay);
1122
1123 if (rc)
1124 return rc;
1113 return 1; 1125 return 1;
1114} 1126}
1115 1127
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 6271bc4073ef..573410d6647e 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -279,26 +279,26 @@ static inline void pm_print_times_init(void) {}
279struct kobject *power_kobj; 279struct kobject *power_kobj;
280 280
281/** 281/**
282 * state - control system power state. 282 * state - control system sleep states.
283 * 283 *
284 * show() returns what states are supported, which is hard-coded to 284 * show() returns available sleep state labels, which may be "mem", "standby",
285 * 'freeze' (Low-Power Idle), 'standby' (Power-On Suspend), 285 * "freeze" and "disk" (hibernation). See Documentation/power/states.txt for a
286 * 'mem' (Suspend-to-RAM), and 'disk' (Suspend-to-Disk). 286 * description of what they mean.
287 * 287 *
288 * store() accepts one of those strings, translates it into the 288 * store() accepts one of those strings, translates it into the proper
289 * proper enumerated value, and initiates a suspend transition. 289 * enumerated value, and initiates a suspend transition.
290 */ 290 */
291static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, 291static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
292 char *buf) 292 char *buf)
293{ 293{
294 char *s = buf; 294 char *s = buf;
295#ifdef CONFIG_SUSPEND 295#ifdef CONFIG_SUSPEND
296 int i; 296 suspend_state_t i;
297
298 for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++)
299 if (pm_states[i].state)
300 s += sprintf(s,"%s ", pm_states[i].label);
297 301
298 for (i = 0; i < PM_SUSPEND_MAX; i++) {
299 if (pm_states[i] && valid_state(i))
300 s += sprintf(s,"%s ", pm_states[i]);
301 }
302#endif 302#endif
303#ifdef CONFIG_HIBERNATION 303#ifdef CONFIG_HIBERNATION
304 s += sprintf(s, "%s\n", "disk"); 304 s += sprintf(s, "%s\n", "disk");
@@ -314,7 +314,7 @@ static suspend_state_t decode_state(const char *buf, size_t n)
314{ 314{
315#ifdef CONFIG_SUSPEND 315#ifdef CONFIG_SUSPEND
316 suspend_state_t state = PM_SUSPEND_MIN; 316 suspend_state_t state = PM_SUSPEND_MIN;
317 const char * const *s; 317 struct pm_sleep_state *s;
318#endif 318#endif
319 char *p; 319 char *p;
320 int len; 320 int len;
@@ -328,8 +328,9 @@ static suspend_state_t decode_state(const char *buf, size_t n)
328 328
329#ifdef CONFIG_SUSPEND 329#ifdef CONFIG_SUSPEND
330 for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) 330 for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++)
331 if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) 331 if (s->state && len == strlen(s->label)
332 return state; 332 && !strncmp(buf, s->label, len))
333 return s->state;
333#endif 334#endif
334 335
335 return PM_SUSPEND_ON; 336 return PM_SUSPEND_ON;
@@ -447,8 +448,8 @@ static ssize_t autosleep_show(struct kobject *kobj,
447 448
448#ifdef CONFIG_SUSPEND 449#ifdef CONFIG_SUSPEND
449 if (state < PM_SUSPEND_MAX) 450 if (state < PM_SUSPEND_MAX)
450 return sprintf(buf, "%s\n", valid_state(state) ? 451 return sprintf(buf, "%s\n", pm_states[state].state ?
451 pm_states[state] : "error"); 452 pm_states[state].label : "error");
452#endif 453#endif
453#ifdef CONFIG_HIBERNATION 454#ifdef CONFIG_HIBERNATION
454 return sprintf(buf, "disk\n"); 455 return sprintf(buf, "disk\n");
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 15f37ea08719..c60f13b5270a 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -178,17 +178,20 @@ extern void swsusp_show_speed(struct timeval *, struct timeval *,
178 unsigned int, char *); 178 unsigned int, char *);
179 179
180#ifdef CONFIG_SUSPEND 180#ifdef CONFIG_SUSPEND
181struct pm_sleep_state {
182 const char *label;
183 suspend_state_t state;
184};
185
181/* kernel/power/suspend.c */ 186/* kernel/power/suspend.c */
182extern const char *const pm_states[]; 187extern struct pm_sleep_state pm_states[];
183 188
184extern bool valid_state(suspend_state_t state);
185extern int suspend_devices_and_enter(suspend_state_t state); 189extern int suspend_devices_and_enter(suspend_state_t state);
186#else /* !CONFIG_SUSPEND */ 190#else /* !CONFIG_SUSPEND */
187static inline int suspend_devices_and_enter(suspend_state_t state) 191static inline int suspend_devices_and_enter(suspend_state_t state)
188{ 192{
189 return -ENOSYS; 193 return -ENOSYS;
190} 194}
191static inline bool valid_state(suspend_state_t state) { return false; }
192#endif /* !CONFIG_SUSPEND */ 195#endif /* !CONFIG_SUSPEND */
193 196
194#ifdef CONFIG_PM_TEST_SUSPEND 197#ifdef CONFIG_PM_TEST_SUSPEND
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 06ec8869dbf1..0ca8d83e2369 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -17,6 +17,7 @@
17#include <linux/delay.h> 17#include <linux/delay.h>
18#include <linux/workqueue.h> 18#include <linux/workqueue.h>
19#include <linux/kmod.h> 19#include <linux/kmod.h>
20#include <trace/events/power.h>
20 21
21/* 22/*
22 * Timeout for stopping processes 23 * Timeout for stopping processes
@@ -175,6 +176,7 @@ void thaw_processes(void)
175 struct task_struct *g, *p; 176 struct task_struct *g, *p;
176 struct task_struct *curr = current; 177 struct task_struct *curr = current;
177 178
179 trace_suspend_resume(TPS("thaw_processes"), 0, true);
178 if (pm_freezing) 180 if (pm_freezing)
179 atomic_dec(&system_freezing_cnt); 181 atomic_dec(&system_freezing_cnt);
180 pm_freezing = false; 182 pm_freezing = false;
@@ -201,6 +203,7 @@ void thaw_processes(void)
201 203
202 schedule(); 204 schedule();
203 printk("done.\n"); 205 printk("done.\n");
206 trace_suspend_resume(TPS("thaw_processes"), 0, false);
204} 207}
205 208
206void thaw_kernel_threads(void) 209void thaw_kernel_threads(void)
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 18fb7a2fb14b..1ea328aafdc9 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1586,7 +1586,7 @@ swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
1586 return -ENOMEM; 1586 return -ENOMEM;
1587} 1587}
1588 1588
1589asmlinkage int swsusp_save(void) 1589asmlinkage __visible int swsusp_save(void)
1590{ 1590{
1591 unsigned int nr_pages, nr_highmem; 1591 unsigned int nr_pages, nr_highmem;
1592 1592
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index c3ad9cafe930..4dd8822f732a 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -14,6 +14,7 @@
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/console.h> 15#include <linux/console.h>
16#include <linux/cpu.h> 16#include <linux/cpu.h>
17#include <linux/cpuidle.h>
17#include <linux/syscalls.h> 18#include <linux/syscalls.h>
18#include <linux/gfp.h> 19#include <linux/gfp.h>
19#include <linux/io.h> 20#include <linux/io.h>
@@ -30,13 +31,14 @@
30 31
31#include "power.h" 32#include "power.h"
32 33
33const char *const pm_states[PM_SUSPEND_MAX] = { 34struct pm_sleep_state pm_states[PM_SUSPEND_MAX] = {
34 [PM_SUSPEND_FREEZE] = "freeze", 35 [PM_SUSPEND_FREEZE] = { .label = "freeze", .state = PM_SUSPEND_FREEZE },
35 [PM_SUSPEND_STANDBY] = "standby", 36 [PM_SUSPEND_STANDBY] = { .label = "standby", },
36 [PM_SUSPEND_MEM] = "mem", 37 [PM_SUSPEND_MEM] = { .label = "mem", },
37}; 38};
38 39
39static const struct platform_suspend_ops *suspend_ops; 40static const struct platform_suspend_ops *suspend_ops;
41static const struct platform_freeze_ops *freeze_ops;
40 42
41static bool need_suspend_ops(suspend_state_t state) 43static bool need_suspend_ops(suspend_state_t state)
42{ 44{
@@ -46,6 +48,13 @@ static bool need_suspend_ops(suspend_state_t state)
46static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head); 48static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head);
47static bool suspend_freeze_wake; 49static bool suspend_freeze_wake;
48 50
51void freeze_set_ops(const struct platform_freeze_ops *ops)
52{
53 lock_system_sleep();
54 freeze_ops = ops;
55 unlock_system_sleep();
56}
57
49static void freeze_begin(void) 58static void freeze_begin(void)
50{ 59{
51 suspend_freeze_wake = false; 60 suspend_freeze_wake = false;
@@ -53,7 +62,11 @@ static void freeze_begin(void)
53 62
54static void freeze_enter(void) 63static void freeze_enter(void)
55{ 64{
65 cpuidle_use_deepest_state(true);
66 cpuidle_resume();
56 wait_event(suspend_freeze_wait_head, suspend_freeze_wake); 67 wait_event(suspend_freeze_wait_head, suspend_freeze_wake);
68 cpuidle_pause();
69 cpuidle_use_deepest_state(false);
57} 70}
58 71
59void freeze_wake(void) 72void freeze_wake(void)
@@ -63,42 +76,62 @@ void freeze_wake(void)
63} 76}
64EXPORT_SYMBOL_GPL(freeze_wake); 77EXPORT_SYMBOL_GPL(freeze_wake);
65 78
79static bool valid_state(suspend_state_t state)
80{
81 /*
82 * PM_SUSPEND_STANDBY and PM_SUSPEND_MEM states need low-level
83 * support and must be declared valid by the low-level
84 * implementation; no valid callback implies that none are valid.
85 */
86 return suspend_ops && suspend_ops->valid && suspend_ops->valid(state);
87}
88
89/*
90 * If this is set, the "mem" label always corresponds to the deepest sleep state
91 * available, the "standby" label corresponds to the second deepest sleep state
92 * available (if any), and the "freeze" label corresponds to the remaining
93 * available sleep state (if there is one).
94 */
95static bool relative_states;
96
97static int __init sleep_states_setup(char *str)
98{
99 relative_states = !strncmp(str, "1", 1);
100 if (relative_states) {
101 pm_states[PM_SUSPEND_MEM].state = PM_SUSPEND_FREEZE;
102 pm_states[PM_SUSPEND_FREEZE].state = 0;
103 }
104 return 1;
105}
106
107__setup("relative_sleep_states=", sleep_states_setup);
108
66/** 109/**
67 * suspend_set_ops - Set the global suspend method table. 110 * suspend_set_ops - Set the global suspend method table.
68 * @ops: Suspend operations to use. 111 * @ops: Suspend operations to use.
69 */ 112 */
70void suspend_set_ops(const struct platform_suspend_ops *ops) 113void suspend_set_ops(const struct platform_suspend_ops *ops)
71{ 114{
115 suspend_state_t i;
116 int j = PM_SUSPEND_MAX - 1;
117
72 lock_system_sleep(); 118 lock_system_sleep();
119
73 suspend_ops = ops; 120 suspend_ops = ops;
121 for (i = PM_SUSPEND_MEM; i >= PM_SUSPEND_STANDBY; i--)
122 if (valid_state(i))
123 pm_states[j--].state = i;
124 else if (!relative_states)
125 pm_states[j--].state = 0;
126
127 pm_states[j--].state = PM_SUSPEND_FREEZE;
128 while (j >= PM_SUSPEND_MIN)
129 pm_states[j--].state = 0;
130
74 unlock_system_sleep(); 131 unlock_system_sleep();
75} 132}
76EXPORT_SYMBOL_GPL(suspend_set_ops); 133EXPORT_SYMBOL_GPL(suspend_set_ops);
77 134
78bool valid_state(suspend_state_t state)
79{
80 if (state == PM_SUSPEND_FREEZE) {
81#ifdef CONFIG_PM_DEBUG
82 if (pm_test_level != TEST_NONE &&
83 pm_test_level != TEST_FREEZER &&
84 pm_test_level != TEST_DEVICES &&
85 pm_test_level != TEST_PLATFORM) {
86 printk(KERN_WARNING "Unsupported pm_test mode for "
87 "freeze state, please choose "
88 "none/freezer/devices/platform.\n");
89 return false;
90 }
91#endif
92 return true;
93 }
94 /*
95 * PM_SUSPEND_STANDBY and PM_SUSPEND_MEMORY states need lowlevel
96 * support and need to be valid to the lowlevel
97 * implementation, no valid callback implies that none are valid.
98 */
99 return suspend_ops && suspend_ops->valid && suspend_ops->valid(state);
100}
101
102/** 135/**
103 * suspend_valid_only_mem - Generic memory-only valid callback. 136 * suspend_valid_only_mem - Generic memory-only valid callback.
104 * 137 *
@@ -144,7 +177,9 @@ static int suspend_prepare(suspend_state_t state)
144 if (error) 177 if (error)
145 goto Finish; 178 goto Finish;
146 179
180 trace_suspend_resume(TPS("freeze_processes"), 0, true);
147 error = suspend_freeze_processes(); 181 error = suspend_freeze_processes();
182 trace_suspend_resume(TPS("freeze_processes"), 0, false);
148 if (!error) 183 if (!error)
149 return 0; 184 return 0;
150 185
@@ -207,7 +242,9 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
207 * all the devices are suspended. 242 * all the devices are suspended.
208 */ 243 */
209 if (state == PM_SUSPEND_FREEZE) { 244 if (state == PM_SUSPEND_FREEZE) {
245 trace_suspend_resume(TPS("machine_suspend"), state, true);
210 freeze_enter(); 246 freeze_enter();
247 trace_suspend_resume(TPS("machine_suspend"), state, false);
211 goto Platform_wake; 248 goto Platform_wake;
212 } 249 }
213 250
@@ -223,7 +260,11 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
223 if (!error) { 260 if (!error) {
224 *wakeup = pm_wakeup_pending(); 261 *wakeup = pm_wakeup_pending();
225 if (!(suspend_test(TEST_CORE) || *wakeup)) { 262 if (!(suspend_test(TEST_CORE) || *wakeup)) {
263 trace_suspend_resume(TPS("machine_suspend"),
264 state, true);
226 error = suspend_ops->enter(state); 265 error = suspend_ops->enter(state);
266 trace_suspend_resume(TPS("machine_suspend"),
267 state, false);
227 events_check_enabled = false; 268 events_check_enabled = false;
228 } 269 }
229 syscore_resume(); 270 syscore_resume();
@@ -261,11 +302,14 @@ int suspend_devices_and_enter(suspend_state_t state)
261 if (need_suspend_ops(state) && !suspend_ops) 302 if (need_suspend_ops(state) && !suspend_ops)
262 return -ENOSYS; 303 return -ENOSYS;
263 304
264 trace_machine_suspend(state);
265 if (need_suspend_ops(state) && suspend_ops->begin) { 305 if (need_suspend_ops(state) && suspend_ops->begin) {
266 error = suspend_ops->begin(state); 306 error = suspend_ops->begin(state);
267 if (error) 307 if (error)
268 goto Close; 308 goto Close;
309 } else if (state == PM_SUSPEND_FREEZE && freeze_ops->begin) {
310 error = freeze_ops->begin();
311 if (error)
312 goto Close;
269 } 313 }
270 suspend_console(); 314 suspend_console();
271 suspend_test_start(); 315 suspend_test_start();
@@ -291,7 +335,9 @@ int suspend_devices_and_enter(suspend_state_t state)
291 Close: 335 Close:
292 if (need_suspend_ops(state) && suspend_ops->end) 336 if (need_suspend_ops(state) && suspend_ops->end)
293 suspend_ops->end(); 337 suspend_ops->end();
294 trace_machine_suspend(PWR_EVENT_EXIT); 338 else if (state == PM_SUSPEND_FREEZE && freeze_ops->end)
339 freeze_ops->end();
340
295 return error; 341 return error;
296 342
297 Recover_platform: 343 Recover_platform:
@@ -325,20 +371,31 @@ static int enter_state(suspend_state_t state)
325{ 371{
326 int error; 372 int error;
327 373
328 if (!valid_state(state)) 374 trace_suspend_resume(TPS("suspend_enter"), state, true);
329 return -ENODEV; 375 if (state == PM_SUSPEND_FREEZE) {
330 376#ifdef CONFIG_PM_DEBUG
377 if (pm_test_level != TEST_NONE && pm_test_level <= TEST_CPUS) {
378 pr_warning("PM: Unsupported test mode for freeze state,"
379 "please choose none/freezer/devices/platform.\n");
380 return -EAGAIN;
381 }
382#endif
383 } else if (!valid_state(state)) {
384 return -EINVAL;
385 }
331 if (!mutex_trylock(&pm_mutex)) 386 if (!mutex_trylock(&pm_mutex))
332 return -EBUSY; 387 return -EBUSY;
333 388
334 if (state == PM_SUSPEND_FREEZE) 389 if (state == PM_SUSPEND_FREEZE)
335 freeze_begin(); 390 freeze_begin();
336 391
392 trace_suspend_resume(TPS("sync_filesystems"), 0, true);
337 printk(KERN_INFO "PM: Syncing filesystems ... "); 393 printk(KERN_INFO "PM: Syncing filesystems ... ");
338 sys_sync(); 394 sys_sync();
339 printk("done.\n"); 395 printk("done.\n");
396 trace_suspend_resume(TPS("sync_filesystems"), 0, false);
340 397
341 pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); 398 pr_debug("PM: Preparing system for %s sleep\n", pm_states[state].label);
342 error = suspend_prepare(state); 399 error = suspend_prepare(state);
343 if (error) 400 if (error)
344 goto Unlock; 401 goto Unlock;
@@ -346,7 +403,8 @@ static int enter_state(suspend_state_t state)
346 if (suspend_test(TEST_FREEZER)) 403 if (suspend_test(TEST_FREEZER))
347 goto Finish; 404 goto Finish;
348 405
349 pr_debug("PM: Entering %s sleep\n", pm_states[state]); 406 trace_suspend_resume(TPS("suspend_enter"), state, false);
407 pr_debug("PM: Entering %s sleep\n", pm_states[state].label);
350 pm_restrict_gfp_mask(); 408 pm_restrict_gfp_mask();
351 error = suspend_devices_and_enter(state); 409 error = suspend_devices_and_enter(state);
352 pm_restore_gfp_mask(); 410 pm_restore_gfp_mask();
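
suspend_set_ops() now fills pm_states[] so that each sysfs label maps to a state the platform's ->valid() callback accepted, and the relative_sleep_states=1 boot option packs the valid states toward the "mem" label instead of hiding the invalid ones. A worked example, derived from the loop above, for a platform that validates only PM_SUSPEND_MEM:

/*
 *   label       relative_sleep_states=0      relative_sleep_states=1
 *   "mem"       PM_SUSPEND_MEM               PM_SUSPEND_MEM
 *   "standby"   hidden (state == 0)          PM_SUSPEND_FREEZE
 *   "freeze"    PM_SUSPEND_FREEZE            hidden (state == 0)
 */
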
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index 9b2a1d58558d..269b097e78ea 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -92,13 +92,13 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
92 } 92 }
93 93
94 if (state == PM_SUSPEND_MEM) { 94 if (state == PM_SUSPEND_MEM) {
95 printk(info_test, pm_states[state]); 95 printk(info_test, pm_states[state].label);
96 status = pm_suspend(state); 96 status = pm_suspend(state);
97 if (status == -ENODEV) 97 if (status == -ENODEV)
98 state = PM_SUSPEND_STANDBY; 98 state = PM_SUSPEND_STANDBY;
99 } 99 }
100 if (state == PM_SUSPEND_STANDBY) { 100 if (state == PM_SUSPEND_STANDBY) {
101 printk(info_test, pm_states[state]); 101 printk(info_test, pm_states[state].label);
102 status = pm_suspend(state); 102 status = pm_suspend(state);
103 } 103 }
104 if (status < 0) 104 if (status < 0)
@@ -136,18 +136,16 @@ static char warn_bad_state[] __initdata =
136 136
137static int __init setup_test_suspend(char *value) 137static int __init setup_test_suspend(char *value)
138{ 138{
139 unsigned i; 139 suspend_state_t i;
140 140
141 /* "=mem" ==> "mem" */ 141 /* "=mem" ==> "mem" */
142 value++; 142 value++;
143 for (i = 0; i < PM_SUSPEND_MAX; i++) { 143 for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++)
144 if (!pm_states[i]) 144 if (!strcmp(pm_states[i].label, value)) {
145 continue; 145 test_state = pm_states[i].state;
146 if (strcmp(pm_states[i], value) != 0) 146 return 0;
147 continue; 147 }
148 test_state = (__force suspend_state_t) i; 148
149 return 0;
150 }
151 printk(warn_bad_state, value); 149 printk(warn_bad_state, value);
152 return 0; 150 return 0;
153} 151}
@@ -164,8 +162,8 @@ static int __init test_suspend(void)
164 /* PM is initialized by now; is that state testable? */ 162 /* PM is initialized by now; is that state testable? */
165 if (test_state == PM_SUSPEND_ON) 163 if (test_state == PM_SUSPEND_ON)
166 goto done; 164 goto done;
167 if (!valid_state(test_state)) { 165 if (!pm_states[test_state].state) {
168 printk(warn_bad_state, pm_states[test_state]); 166 printk(warn_bad_state, pm_states[test_state].label);
169 goto done; 167 goto done;
170 } 168 }
171 169
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 8c9a4819f798..aaa3261dea5d 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -567,7 +567,7 @@ static int lzo_compress_threadfn(void *data)
567 567
568/** 568/**
569 * save_image_lzo - Save the suspend image data compressed with LZO. 569 * save_image_lzo - Save the suspend image data compressed with LZO.
570 * @handle: Swap mam handle to use for saving the image. 570 * @handle: Swap map handle to use for saving the image.
571 * @snapshot: Image to read data from. 571 * @snapshot: Image to read data from.
572 * @nr_to_write: Number of pages to save. 572 * @nr_to_write: Number of pages to save.
573 */ 573 */
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index a45b50962295..ea2d5f6962ed 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -54,20 +54,16 @@
54#include "console_cmdline.h" 54#include "console_cmdline.h"
55#include "braille.h" 55#include "braille.h"
56 56
57/* printk's without a loglevel use this.. */
58#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL
59
60/* We show everything that is MORE important than this.. */
61#define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */
62#define DEFAULT_CONSOLE_LOGLEVEL 7 /* anything MORE serious than KERN_DEBUG */
63
64int console_printk[4] = { 57int console_printk[4] = {
65 DEFAULT_CONSOLE_LOGLEVEL, /* console_loglevel */ 58 CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */
66 DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */ 59 DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */
67 MINIMUM_CONSOLE_LOGLEVEL, /* minimum_console_loglevel */ 60 CONSOLE_LOGLEVEL_MIN, /* minimum_console_loglevel */
68 DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ 61 CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */
69}; 62};
70 63
64/* Deferred messages from sched code are marked by this special level */
65#define SCHED_MESSAGE_LOGLEVEL -2
66
71/* 67/*
72 * Low level drivers may need that to know if they can schedule in 68 * Low level drivers may need that to know if they can schedule in
73 * their unblank() callback or not. So let's export it. 69 * their unblank() callback or not. So let's export it.
@@ -91,6 +87,29 @@ static struct lockdep_map console_lock_dep_map = {
91#endif 87#endif
92 88
93/* 89/*
90 * Helper macros to handle lockdep when locking/unlocking console_sem. We use
91 * macros instead of functions so that _RET_IP_ contains useful information.
92 */
93#define down_console_sem() do { \
94 down(&console_sem);\
95 mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_);\
96} while (0)
97
98static int __down_trylock_console_sem(unsigned long ip)
99{
100 if (down_trylock(&console_sem))
101 return 1;
102 mutex_acquire(&console_lock_dep_map, 0, 1, ip);
103 return 0;
104}
105#define down_trylock_console_sem() __down_trylock_console_sem(_RET_IP_)
106
107#define up_console_sem() do { \
108 mutex_release(&console_lock_dep_map, 1, _RET_IP_);\
109 up(&console_sem);\
110} while (0)
111
112/*
94 * This is used for debugging the mess that is the VT code by 113 * This is used for debugging the mess that is the VT code by
95 * keeping track if we have the console semaphore held. It's 114 * keeping track if we have the console semaphore held. It's
96 * definitely not the perfect debug tool (we don't know if _WE_ 115 * definitely not the perfect debug tool (we don't know if _WE_
@@ -206,8 +225,9 @@ struct printk_log {
206}; 225};
207 226
208/* 227/*
209 * The logbuf_lock protects kmsg buffer, indices, counters. It is also 228 * The logbuf_lock protects kmsg buffer, indices, counters. This can be taken
210 * used in interesting ways to provide interlocking in console_unlock(); 229 * within the scheduler's rq lock. It must be released before calling
230 * console_unlock() or anything else that might wake up a process.
211 */ 231 */
212static DEFINE_RAW_SPINLOCK(logbuf_lock); 232static DEFINE_RAW_SPINLOCK(logbuf_lock);
213 233
@@ -250,9 +270,6 @@ static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
250static char *log_buf = __log_buf; 270static char *log_buf = __log_buf;
251static u32 log_buf_len = __LOG_BUF_LEN; 271static u32 log_buf_len = __LOG_BUF_LEN;
252 272
253/* cpu currently holding logbuf_lock */
254static volatile unsigned int logbuf_cpu = UINT_MAX;
255
256/* human readable text of the record */ 273/* human readable text of the record */
257static char *log_text(const struct printk_log *msg) 274static char *log_text(const struct printk_log *msg)
258{ 275{
@@ -297,34 +314,106 @@ static u32 log_next(u32 idx)
297 return idx + msg->len; 314 return idx + msg->len;
298} 315}
299 316
300/* insert record into the buffer, discard old ones, update heads */ 317/*
301static void log_store(int facility, int level, 318 * Check whether there is enough free space for the given message.
302 enum log_flags flags, u64 ts_nsec, 319 *
303 const char *dict, u16 dict_len, 320 * The same values of first_idx and next_idx mean that the buffer
304 const char *text, u16 text_len) 321 * is either empty or full.
322 *
323 * If the buffer is empty, we must respect the position of the indexes.
324 * They cannot be reset to the beginning of the buffer.
325 */
326static int logbuf_has_space(u32 msg_size, bool empty)
305{ 327{
306 struct printk_log *msg; 328 u32 free;
307 u32 size, pad_len;
308 329
309 /* number of '\0' padding bytes to next message */ 330 if (log_next_idx > log_first_idx || empty)
310 size = sizeof(struct printk_log) + text_len + dict_len; 331 free = max(log_buf_len - log_next_idx, log_first_idx);
311 pad_len = (-size) & (LOG_ALIGN - 1); 332 else
312 size += pad_len; 333 free = log_first_idx - log_next_idx;
334
335 /*
336 * We also need space for an empty header that signals wrapping
337 * of the buffer.
338 */
339 return free >= msg_size + sizeof(struct printk_log);
340}
313 341
342static int log_make_free_space(u32 msg_size)
343{
314 while (log_first_seq < log_next_seq) { 344 while (log_first_seq < log_next_seq) {
315 u32 free; 345 if (logbuf_has_space(msg_size, false))
346 return 0;
347 /* drop old messages until we have enough continuous space */
348 log_first_idx = log_next(log_first_idx);
349 log_first_seq++;
350 }
316 351
317 if (log_next_idx > log_first_idx) 352 /* sequence numbers are equal, so the log buffer is empty */
318 free = max(log_buf_len - log_next_idx, log_first_idx); 353 if (logbuf_has_space(msg_size, true))
319 else 354 return 0;
320 free = log_first_idx - log_next_idx;
321 355
322 if (free >= size + sizeof(struct printk_log)) 356 return -ENOMEM;
323 break; 357}
324 358
325 /* drop old messages until we have enough contiuous space */ 359/* compute the message size including the padding bytes */
326 log_first_idx = log_next(log_first_idx); 360static u32 msg_used_size(u16 text_len, u16 dict_len, u32 *pad_len)
327 log_first_seq++; 361{
362 u32 size;
363
364 size = sizeof(struct printk_log) + text_len + dict_len;
365 *pad_len = (-size) & (LOG_ALIGN - 1);
366 size += *pad_len;
367
368 return size;
369}
370
371/*
372 * Define how much of the log buffer we could take at maximum. The value
373 * must be greater than two. Note that only half of the buffer is available
374 * when the index points to the middle.
375 */
376#define MAX_LOG_TAKE_PART 4
377static const char trunc_msg[] = "<truncated>";
378
379static u32 truncate_msg(u16 *text_len, u16 *trunc_msg_len,
380 u16 *dict_len, u32 *pad_len)
381{
382 /*
383 * The message should not take the whole buffer. Otherwise, it might
384 * get removed too soon.
385 */
386 u32 max_text_len = log_buf_len / MAX_LOG_TAKE_PART;
387 if (*text_len > max_text_len)
388 *text_len = max_text_len;
389 /* enable the warning message */
390 *trunc_msg_len = strlen(trunc_msg);
391 /* disable the "dict" completely */
392 *dict_len = 0;
393 /* compute the size again, count also the warning message */
394 return msg_used_size(*text_len + *trunc_msg_len, 0, pad_len);
395}
396
397/* insert record into the buffer, discard old ones, update heads */
398static int log_store(int facility, int level,
399 enum log_flags flags, u64 ts_nsec,
400 const char *dict, u16 dict_len,
401 const char *text, u16 text_len)
402{
403 struct printk_log *msg;
404 u32 size, pad_len;
405 u16 trunc_msg_len = 0;
406
407 /* number of '\0' padding bytes to next message */
408 size = msg_used_size(text_len, dict_len, &pad_len);
409
410 if (log_make_free_space(size)) {
411 /* truncate the message if it is too long for empty buffer */
412 size = truncate_msg(&text_len, &trunc_msg_len,
413 &dict_len, &pad_len);
414 /* survive when the log buffer is too small for trunc_msg */
415 if (log_make_free_space(size))
416 return 0;
328 } 417 }
329 418
330 if (log_next_idx + size + sizeof(struct printk_log) > log_buf_len) { 419 if (log_next_idx + size + sizeof(struct printk_log) > log_buf_len) {
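
logbuf_has_space() looks for one continuous run of free bytes: either the gap between the newest and oldest record, or the larger of the two ends of the flat buffer when the records have not wrapped. A small userspace sketch of the same computation; this is plain C for illustration, not kernel code:

#include <stdio.h>
#include <stdbool.h>

/* continuous free space in a flat ring buffer, mirroring logbuf_has_space() */
static unsigned int ring_free(unsigned int buf_len, unsigned int first_idx,
                              unsigned int next_idx, bool empty)
{
        if (next_idx > first_idx || empty) {
                unsigned int tail = buf_len - next_idx; /* space after newest */
                return tail > first_idx ? tail : first_idx;
        }
        /* records wrapped around: the gap sits between next and oldest */
        return first_idx - next_idx;
}

int main(void)
{
        /* 1 KiB buffer, oldest record at 100, next record written at 900 */
        printf("%u\n", ring_free(1024, 100, 900, false));   /* prints 124 */
        return 0;
}
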
@@ -341,6 +430,10 @@ static void log_store(int facility, int level,
341 msg = (struct printk_log *)(log_buf + log_next_idx); 430 msg = (struct printk_log *)(log_buf + log_next_idx);
342 memcpy(log_text(msg), text, text_len); 431 memcpy(log_text(msg), text, text_len);
343 msg->text_len = text_len; 432 msg->text_len = text_len;
433 if (trunc_msg_len) {
434 memcpy(log_text(msg) + text_len, trunc_msg, trunc_msg_len);
435 msg->text_len += trunc_msg_len;
436 }
344 memcpy(log_dict(msg), dict, dict_len); 437 memcpy(log_dict(msg), dict, dict_len);
345 msg->dict_len = dict_len; 438 msg->dict_len = dict_len;
346 msg->facility = facility; 439 msg->facility = facility;
@@ -356,6 +449,8 @@ static void log_store(int facility, int level,
356 /* insert message */ 449 /* insert message */
357 log_next_idx += msg->len; 450 log_next_idx += msg->len;
358 log_next_seq++; 451 log_next_seq++;
452
453 return msg->text_len;
359} 454}
360 455
361#ifdef CONFIG_SECURITY_DMESG_RESTRICT 456#ifdef CONFIG_SECURITY_DMESG_RESTRICT
@@ -1303,7 +1398,10 @@ static void zap_locks(void)
1303 sema_init(&console_sem, 1); 1398 sema_init(&console_sem, 1);
1304} 1399}
1305 1400
1306/* Check if we have any console registered that can be called early in boot. */ 1401/*
1402 * Check if we have any console that is capable of printing while cpu is
1403 * booting or shutting down. Requires console_sem.
1404 */
1307static int have_callable_console(void) 1405static int have_callable_console(void)
1308{ 1406{
1309 struct console *con; 1407 struct console *con;
@@ -1318,10 +1416,9 @@ static int have_callable_console(void)
1318/* 1416/*
1319 * Can we actually use the console at this time on this cpu? 1417 * Can we actually use the console at this time on this cpu?
1320 * 1418 *
1321 * Console drivers may assume that per-cpu resources have 1419 * Console drivers may assume that per-cpu resources have been allocated. So
1322 * been allocated. So unless they're explicitly marked as 1420 * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't
1323 * being able to cope (CON_ANYTIME) don't call them until 1421 * call them until this CPU is officially up.
1324 * this CPU is officially up.
1325 */ 1422 */
1326static inline int can_use_console(unsigned int cpu) 1423static inline int can_use_console(unsigned int cpu)
1327{ 1424{
@@ -1333,36 +1430,24 @@ static inline int can_use_console(unsigned int cpu)
1333 * messages from a 'printk'. Return true (and with the 1430 * messages from a 'printk'. Return true (and with the
1334 * console_lock held, and 'console_locked' set) if it 1431 * console_lock held, and 'console_locked' set) if it
1335 * is successful, false otherwise. 1432 * is successful, false otherwise.
1336 *
1337 * This gets called with the 'logbuf_lock' spinlock held and
1338 * interrupts disabled. It should return with 'lockbuf_lock'
1339 * released but interrupts still disabled.
1340 */ 1433 */
1341static int console_trylock_for_printk(unsigned int cpu) 1434static int console_trylock_for_printk(void)
1342 __releases(&logbuf_lock)
1343{ 1435{
1344 int retval = 0, wake = 0; 1436 unsigned int cpu = smp_processor_id();
1345 1437
1346 if (console_trylock()) { 1438 if (!console_trylock())
1347 retval = 1; 1439 return 0;
1348 1440 /*
1349 /* 1441 * If we can't use the console, we need to release the console
1350 * If we can't use the console, we need to release 1442 * semaphore by hand to avoid flushing the buffer. We need to hold the
1351 * the console semaphore by hand to avoid flushing 1443 * console semaphore in order to do this test safely.
1352 * the buffer. We need to hold the console semaphore 1444 */
1353 * in order to do this test safely. 1445 if (!can_use_console(cpu)) {
1354 */ 1446 console_locked = 0;
1355 if (!can_use_console(cpu)) { 1447 up_console_sem();
1356 console_locked = 0; 1448 return 0;
1357 wake = 1;
1358 retval = 0;
1359 }
1360 } 1449 }
1361 logbuf_cpu = UINT_MAX; 1450 return 1;
1362 raw_spin_unlock(&logbuf_lock);
1363 if (wake)
1364 up(&console_sem);
1365 return retval;
1366} 1451}
1367 1452
1368int printk_delay_msec __read_mostly; 1453int printk_delay_msec __read_mostly;
@@ -1490,11 +1575,19 @@ asmlinkage int vprintk_emit(int facility, int level,
1490 static int recursion_bug; 1575 static int recursion_bug;
1491 static char textbuf[LOG_LINE_MAX]; 1576 static char textbuf[LOG_LINE_MAX];
1492 char *text = textbuf; 1577 char *text = textbuf;
1493 size_t text_len; 1578 size_t text_len = 0;
1494 enum log_flags lflags = 0; 1579 enum log_flags lflags = 0;
1495 unsigned long flags; 1580 unsigned long flags;
1496 int this_cpu; 1581 int this_cpu;
1497 int printed_len = 0; 1582 int printed_len = 0;
1583 bool in_sched = false;
1584 /* cpu currently holding logbuf_lock in this function */
1585 static volatile unsigned int logbuf_cpu = UINT_MAX;
1586
1587 if (level == SCHED_MESSAGE_LOGLEVEL) {
1588 level = -1;
1589 in_sched = true;
1590 }
1498 1591
1499 boot_delay_msec(level); 1592 boot_delay_msec(level);
1500 printk_delay(); 1593 printk_delay();
@@ -1516,7 +1609,8 @@ asmlinkage int vprintk_emit(int facility, int level,
1516 */ 1609 */
1517 if (!oops_in_progress && !lockdep_recursing(current)) { 1610 if (!oops_in_progress && !lockdep_recursing(current)) {
1518 recursion_bug = 1; 1611 recursion_bug = 1;
1519 goto out_restore_irqs; 1612 local_irq_restore(flags);
1613 return 0;
1520 } 1614 }
1521 zap_locks(); 1615 zap_locks();
1522 } 1616 }
@@ -1530,17 +1624,22 @@ asmlinkage int vprintk_emit(int facility, int level,
1530 "BUG: recent printk recursion!"; 1624 "BUG: recent printk recursion!";
1531 1625
1532 recursion_bug = 0; 1626 recursion_bug = 0;
1533 printed_len += strlen(recursion_msg); 1627 text_len = strlen(recursion_msg);
1534 /* emit KERN_CRIT message */ 1628 /* emit KERN_CRIT message */
1535 log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, 1629 printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0,
1536 NULL, 0, recursion_msg, printed_len); 1630 NULL, 0, recursion_msg, text_len);
1537 } 1631 }
1538 1632
1539 /* 1633 /*
1540 * The printf needs to come first; we need the syslog 1634 * The printf needs to come first; we need the syslog
1541 * prefix which might be passed-in as a parameter. 1635 * prefix which might be passed-in as a parameter.
1542 */ 1636 */
1543 text_len = vscnprintf(text, sizeof(textbuf), fmt, args); 1637 if (in_sched)
1638 text_len = scnprintf(text, sizeof(textbuf),
1639 KERN_WARNING "[sched_delayed] ");
1640
1641 text_len += vscnprintf(text + text_len,
1642 sizeof(textbuf) - text_len, fmt, args);
1544 1643
1545 /* mark and strip a trailing newline */ 1644 /* mark and strip a trailing newline */
1546 if (text_len && text[text_len-1] == '\n') { 1645 if (text_len && text[text_len-1] == '\n') {
@@ -1586,9 +1685,12 @@ asmlinkage int vprintk_emit(int facility, int level,
1586 cont_flush(LOG_NEWLINE); 1685 cont_flush(LOG_NEWLINE);
1587 1686
1588 /* buffer line if possible, otherwise store it right away */ 1687 /* buffer line if possible, otherwise store it right away */
1589 if (!cont_add(facility, level, text, text_len)) 1688 if (cont_add(facility, level, text, text_len))
1590 log_store(facility, level, lflags | LOG_CONT, 0, 1689 printed_len += text_len;
1591 dict, dictlen, text, text_len); 1690 else
1691 printed_len += log_store(facility, level,
1692 lflags | LOG_CONT, 0,
1693 dict, dictlen, text, text_len);
1592 } else { 1694 } else {
1593 bool stored = false; 1695 bool stored = false;
1594 1696
@@ -1607,26 +1709,35 @@ asmlinkage int vprintk_emit(int facility, int level,
1607 cont_flush(LOG_NEWLINE); 1709 cont_flush(LOG_NEWLINE);
1608 } 1710 }
1609 1711
1610 if (!stored) 1712 if (stored)
1611 log_store(facility, level, lflags, 0, 1713 printed_len += text_len;
1612 dict, dictlen, text, text_len); 1714 else
1715 printed_len += log_store(facility, level, lflags, 0,
1716 dict, dictlen, text, text_len);
1613 } 1717 }
1614 printed_len += text_len; 1718
1719 logbuf_cpu = UINT_MAX;
1720 raw_spin_unlock(&logbuf_lock);
1721 lockdep_on();
1722 local_irq_restore(flags);
1723
1724 /* If called from the scheduler, we can not call up(). */
1725 if (in_sched)
1726 return printed_len;
1615 1727
1616 /* 1728 /*
1729 * Disable preemption to avoid being preempted while holding
1730 * console_sem which would prevent anyone from printing to console
1731 */
1732 preempt_disable();
1733 /*
1617 * Try to acquire and then immediately release the console semaphore. 1734 * Try to acquire and then immediately release the console semaphore.
1618 * The release will print out buffers and wake up /dev/kmsg and syslog() 1735 * The release will print out buffers and wake up /dev/kmsg and syslog()
1619 * users. 1736 * users.
1620 *
1621 * The console_trylock_for_printk() function will release 'logbuf_lock'
1622 * regardless of whether it actually gets the console semaphore or not.
1623 */ 1737 */
1624 if (console_trylock_for_printk(this_cpu)) 1738 if (console_trylock_for_printk())
1625 console_unlock(); 1739 console_unlock();
1626 1740 preempt_enable();
1627 lockdep_on();
1628out_restore_irqs:
1629 local_irq_restore(flags);
1630 1741
1631 return printed_len; 1742 return printed_len;
1632} 1743}
@@ -1674,7 +1785,7 @@ EXPORT_SYMBOL(printk_emit);
1674 * 1785 *
1675 * See the vsnprintf() documentation for format string extensions over C99. 1786 * See the vsnprintf() documentation for format string extensions over C99.
1676 */ 1787 */
1677asmlinkage int printk(const char *fmt, ...) 1788asmlinkage __visible int printk(const char *fmt, ...)
1678{ 1789{
1679 va_list args; 1790 va_list args;
1680 int r; 1791 int r;
@@ -1737,7 +1848,7 @@ void early_vprintk(const char *fmt, va_list ap)
1737 } 1848 }
1738} 1849}
1739 1850
1740asmlinkage void early_printk(const char *fmt, ...) 1851asmlinkage __visible void early_printk(const char *fmt, ...)
1741{ 1852{
1742 va_list ap; 1853 va_list ap;
1743 1854
@@ -1882,16 +1993,14 @@ void suspend_console(void)
1882 printk("Suspending console(s) (use no_console_suspend to debug)\n"); 1993 printk("Suspending console(s) (use no_console_suspend to debug)\n");
1883 console_lock(); 1994 console_lock();
1884 console_suspended = 1; 1995 console_suspended = 1;
1885 up(&console_sem); 1996 up_console_sem();
1886 mutex_release(&console_lock_dep_map, 1, _RET_IP_);
1887} 1997}
1888 1998
1889void resume_console(void) 1999void resume_console(void)
1890{ 2000{
1891 if (!console_suspend_enabled) 2001 if (!console_suspend_enabled)
1892 return; 2002 return;
1893 down(&console_sem); 2003 down_console_sem();
1894 mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_);
1895 console_suspended = 0; 2004 console_suspended = 0;
1896 console_unlock(); 2005 console_unlock();
1897} 2006}
@@ -1933,12 +2042,11 @@ void console_lock(void)
1933{ 2042{
1934 might_sleep(); 2043 might_sleep();
1935 2044
1936 down(&console_sem); 2045 down_console_sem();
1937 if (console_suspended) 2046 if (console_suspended)
1938 return; 2047 return;
1939 console_locked = 1; 2048 console_locked = 1;
1940 console_may_schedule = 1; 2049 console_may_schedule = 1;
1941 mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_);
1942} 2050}
1943EXPORT_SYMBOL(console_lock); 2051EXPORT_SYMBOL(console_lock);
1944 2052
@@ -1952,15 +2060,14 @@ EXPORT_SYMBOL(console_lock);
1952 */ 2060 */
1953int console_trylock(void) 2061int console_trylock(void)
1954{ 2062{
1955 if (down_trylock(&console_sem)) 2063 if (down_trylock_console_sem())
1956 return 0; 2064 return 0;
1957 if (console_suspended) { 2065 if (console_suspended) {
1958 up(&console_sem); 2066 up_console_sem();
1959 return 0; 2067 return 0;
1960 } 2068 }
1961 console_locked = 1; 2069 console_locked = 1;
1962 console_may_schedule = 0; 2070 console_may_schedule = 0;
1963 mutex_acquire(&console_lock_dep_map, 0, 1, _RET_IP_);
1964 return 1; 2071 return 1;
1965} 2072}
1966EXPORT_SYMBOL(console_trylock); 2073EXPORT_SYMBOL(console_trylock);
@@ -2022,7 +2129,7 @@ void console_unlock(void)
2022 bool retry; 2129 bool retry;
2023 2130
2024 if (console_suspended) { 2131 if (console_suspended) {
2025 up(&console_sem); 2132 up_console_sem();
2026 return; 2133 return;
2027 } 2134 }
2028 2135
@@ -2043,10 +2150,15 @@ again:
2043 } 2150 }
2044 2151
2045 if (console_seq < log_first_seq) { 2152 if (console_seq < log_first_seq) {
2153 len = sprintf(text, "** %u printk messages dropped ** ",
2154 (unsigned)(log_first_seq - console_seq));
2155
2046 /* messages are gone, move to first one */ 2156 /* messages are gone, move to first one */
2047 console_seq = log_first_seq; 2157 console_seq = log_first_seq;
2048 console_idx = log_first_idx; 2158 console_idx = log_first_idx;
2049 console_prev = 0; 2159 console_prev = 0;
2160 } else {
2161 len = 0;
2050 } 2162 }
2051skip: 2163skip:
2052 if (console_seq == log_next_seq) 2164 if (console_seq == log_next_seq)
@@ -2071,8 +2183,8 @@ skip:
2071 } 2183 }
2072 2184
2073 level = msg->level; 2185 level = msg->level;
2074 len = msg_print_text(msg, console_prev, false, 2186 len += msg_print_text(msg, console_prev, false,
2075 text, sizeof(text)); 2187 text + len, sizeof(text) - len);
2076 console_idx = log_next(console_idx); 2188 console_idx = log_next(console_idx);
2077 console_seq++; 2189 console_seq++;
2078 console_prev = msg->flags; 2190 console_prev = msg->flags;
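The two hunks above change console_unlock() so that, when records have been overwritten, a "** N printk messages dropped **" banner is prepended to the next line that does make it out, instead of the gap being skipped silently. The buffer handling is a plain prefix-then-append into the same text[] buffer; in isolation (userspace illustration, with snprintf() standing in for msg_print_text()):

#include <stdio.h>

int main(void)
{
	char text[128];
	unsigned int dropped = 3;	/* log_first_seq - console_seq in the kernel code */
	size_t len = 0;

	if (dropped)			/* corresponds to console_seq < log_first_seq */
		len = sprintf(text, "** %u printk messages dropped ** ", dropped);

	/* msg_print_text() appends at text + len, bounded by sizeof(text) - len */
	len += snprintf(text + len, sizeof(text) - len, "%s", "example log record");

	puts(text);			/* "** 3 printk messages dropped ** example log record" */
	return 0;
}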
@@ -2084,7 +2196,6 @@ skip:
2084 local_irq_restore(flags); 2196 local_irq_restore(flags);
2085 } 2197 }
2086 console_locked = 0; 2198 console_locked = 0;
2087 mutex_release(&console_lock_dep_map, 1, _RET_IP_);
2088 2199
2089 /* Release the exclusive_console once it is used */ 2200 /* Release the exclusive_console once it is used */
2090 if (unlikely(exclusive_console)) 2201 if (unlikely(exclusive_console))
@@ -2092,7 +2203,7 @@ skip:
2092 2203
2093 raw_spin_unlock(&logbuf_lock); 2204 raw_spin_unlock(&logbuf_lock);
2094 2205
2095 up(&console_sem); 2206 up_console_sem();
2096 2207
2097 /* 2208 /*
2098 * Someone could have filled up the buffer again, so re-check if there's 2209 * Someone could have filled up the buffer again, so re-check if there's
@@ -2137,7 +2248,7 @@ void console_unblank(void)
2137 * oops_in_progress is set to 1.. 2248 * oops_in_progress is set to 1..
2138 */ 2249 */
2139 if (oops_in_progress) { 2250 if (oops_in_progress) {
2140 if (down_trylock(&console_sem) != 0) 2251 if (down_trylock_console_sem() != 0)
2141 return; 2252 return;
2142 } else 2253 } else
2143 console_lock(); 2254 console_lock();
@@ -2413,6 +2524,7 @@ int unregister_console(struct console *console)
2413 if (console_drivers != NULL && console->flags & CON_CONSDEV) 2524 if (console_drivers != NULL && console->flags & CON_CONSDEV)
2414 console_drivers->flags |= CON_CONSDEV; 2525 console_drivers->flags |= CON_CONSDEV;
2415 2526
2527 console->flags &= ~CON_ENABLED;
2416 console_unlock(); 2528 console_unlock();
2417 console_sysfs_notify(); 2529 console_sysfs_notify();
2418 return res; 2530 return res;
@@ -2437,21 +2549,19 @@ late_initcall(printk_late_init);
2437/* 2549/*
2438 * Delayed printk version, for scheduler-internal messages: 2550 * Delayed printk version, for scheduler-internal messages:
2439 */ 2551 */
2440#define PRINTK_BUF_SIZE 512
2441
2442#define PRINTK_PENDING_WAKEUP 0x01 2552#define PRINTK_PENDING_WAKEUP 0x01
2443#define PRINTK_PENDING_SCHED 0x02 2553#define PRINTK_PENDING_OUTPUT 0x02
2444 2554
2445static DEFINE_PER_CPU(int, printk_pending); 2555static DEFINE_PER_CPU(int, printk_pending);
2446static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf);
2447 2556
2448static void wake_up_klogd_work_func(struct irq_work *irq_work) 2557static void wake_up_klogd_work_func(struct irq_work *irq_work)
2449{ 2558{
2450 int pending = __this_cpu_xchg(printk_pending, 0); 2559 int pending = __this_cpu_xchg(printk_pending, 0);
2451 2560
2452 if (pending & PRINTK_PENDING_SCHED) { 2561 if (pending & PRINTK_PENDING_OUTPUT) {
2453 char *buf = __get_cpu_var(printk_sched_buf); 2562 /* If trylock fails, someone else is doing the printing */
2454 pr_warn("[sched_delayed] %s", buf); 2563 if (console_trylock())
2564 console_unlock();
2455 } 2565 }
2456 2566
2457 if (pending & PRINTK_PENDING_WAKEUP) 2567 if (pending & PRINTK_PENDING_WAKEUP)
@@ -2473,23 +2583,19 @@ void wake_up_klogd(void)
2473 preempt_enable(); 2583 preempt_enable();
2474} 2584}
2475 2585
2476int printk_sched(const char *fmt, ...) 2586int printk_deferred(const char *fmt, ...)
2477{ 2587{
2478 unsigned long flags;
2479 va_list args; 2588 va_list args;
2480 char *buf;
2481 int r; 2589 int r;
2482 2590
2483 local_irq_save(flags); 2591 preempt_disable();
2484 buf = __get_cpu_var(printk_sched_buf);
2485
2486 va_start(args, fmt); 2592 va_start(args, fmt);
2487 r = vsnprintf(buf, PRINTK_BUF_SIZE, fmt, args); 2593 r = vprintk_emit(0, SCHED_MESSAGE_LOGLEVEL, NULL, 0, fmt, args);
2488 va_end(args); 2594 va_end(args);
2489 2595
2490 __this_cpu_or(printk_pending, PRINTK_PENDING_SCHED); 2596 __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT);
2491 irq_work_queue(&__get_cpu_var(wake_up_klogd_work)); 2597 irq_work_queue(&__get_cpu_var(wake_up_klogd_work));
2492 local_irq_restore(flags); 2598 preempt_enable();
2493 2599
2494 return r; 2600 return r;
2495} 2601}
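printk_sched() becomes printk_deferred() and loses its private 512-byte per-CPU buffer: the text now goes straight into the normal log via vprintk_emit() (which, as shown above, returns early for in_sched messages), and only the console flush is deferred to irq_work. A sketch of a caller from a context where taking console_sem would deadlock, such as under a runqueue lock (the surrounding function is hypothetical):

#include <linux/printk.h>

static void report_clock_skew(int cpu, unsigned long delta_ns)
{
	/*
	 * Stores the record immediately; the console itself is only
	 * poked later from the per-CPU wake_up_klogd irq_work.
	 */
	printk_deferred(KERN_WARNING "cpu %d: clock skewed by %lu ns\n",
			cpu, delta_ns);
}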
diff --git a/kernel/profile.c b/kernel/profile.c
index cb980f0c731b..54bf5ba26420 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -52,9 +52,9 @@ static DEFINE_MUTEX(profile_flip_mutex);
52 52
53int profile_setup(char *str) 53int profile_setup(char *str)
54{ 54{
55 static char schedstr[] = "schedule"; 55 static const char schedstr[] = "schedule";
56 static char sleepstr[] = "sleep"; 56 static const char sleepstr[] = "sleep";
57 static char kvmstr[] = "kvm"; 57 static const char kvmstr[] = "kvm";
58 int par; 58 int par;
59 59
60 if (!strncmp(str, sleepstr, strlen(sleepstr))) { 60 if (!strncmp(str, sleepstr, strlen(sleepstr))) {
@@ -64,12 +64,10 @@ int profile_setup(char *str)
64 str += strlen(sleepstr) + 1; 64 str += strlen(sleepstr) + 1;
65 if (get_option(&str, &par)) 65 if (get_option(&str, &par))
66 prof_shift = par; 66 prof_shift = par;
67 printk(KERN_INFO 67 pr_info("kernel sleep profiling enabled (shift: %ld)\n",
68 "kernel sleep profiling enabled (shift: %ld)\n",
69 prof_shift); 68 prof_shift);
70#else 69#else
71 printk(KERN_WARNING 70 pr_warn("kernel sleep profiling requires CONFIG_SCHEDSTATS\n");
72 "kernel sleep profiling requires CONFIG_SCHEDSTATS\n");
73#endif /* CONFIG_SCHEDSTATS */ 71#endif /* CONFIG_SCHEDSTATS */
74 } else if (!strncmp(str, schedstr, strlen(schedstr))) { 72 } else if (!strncmp(str, schedstr, strlen(schedstr))) {
75 prof_on = SCHED_PROFILING; 73 prof_on = SCHED_PROFILING;
@@ -77,8 +75,7 @@ int profile_setup(char *str)
77 str += strlen(schedstr) + 1; 75 str += strlen(schedstr) + 1;
78 if (get_option(&str, &par)) 76 if (get_option(&str, &par))
79 prof_shift = par; 77 prof_shift = par;
80 printk(KERN_INFO 78 pr_info("kernel schedule profiling enabled (shift: %ld)\n",
81 "kernel schedule profiling enabled (shift: %ld)\n",
82 prof_shift); 79 prof_shift);
83 } else if (!strncmp(str, kvmstr, strlen(kvmstr))) { 80 } else if (!strncmp(str, kvmstr, strlen(kvmstr))) {
84 prof_on = KVM_PROFILING; 81 prof_on = KVM_PROFILING;
@@ -86,13 +83,12 @@ int profile_setup(char *str)
86 str += strlen(kvmstr) + 1; 83 str += strlen(kvmstr) + 1;
87 if (get_option(&str, &par)) 84 if (get_option(&str, &par))
88 prof_shift = par; 85 prof_shift = par;
89 printk(KERN_INFO 86 pr_info("kernel KVM profiling enabled (shift: %ld)\n",
90 "kernel KVM profiling enabled (shift: %ld)\n",
91 prof_shift); 87 prof_shift);
92 } else if (get_option(&str, &par)) { 88 } else if (get_option(&str, &par)) {
93 prof_shift = par; 89 prof_shift = par;
94 prof_on = CPU_PROFILING; 90 prof_on = CPU_PROFILING;
95 printk(KERN_INFO "kernel profiling enabled (shift: %ld)\n", 91 pr_info("kernel profiling enabled (shift: %ld)\n",
96 prof_shift); 92 prof_shift);
97 } 93 }
98 return 1; 94 return 1;
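The profile.c hunks are a mechanical conversion from wrapped printk(KERN_INFO ...)/printk(KERN_WARNING ...) calls to the pr_info()/pr_warn() helpers. The helpers expand to the same printk() call with the level prefix attached (pr_fmt() defaults to the bare format string here), so the output is unchanged; roughly:

#include <linux/printk.h>

static void example(long prof_shift)
{
	/* before */
	printk(KERN_INFO "kernel profiling enabled (shift: %ld)\n", prof_shift);
	/* after -- expands to printk(KERN_INFO pr_fmt(...), ...) */
	pr_info("kernel profiling enabled (shift: %ld)\n", prof_shift);
}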
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index bd30bc61bc05..7fa34f86e5ba 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -58,9 +58,11 @@ torture_param(int, fqs_duration, 0,
58 "Duration of fqs bursts (us), 0 to disable"); 58 "Duration of fqs bursts (us), 0 to disable");
59torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)"); 59torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)");
60torture_param(int, fqs_stutter, 3, "Wait time between fqs bursts (s)"); 60torture_param(int, fqs_stutter, 3, "Wait time between fqs bursts (s)");
61torture_param(bool, gp_cond, false, "Use conditional/async GP wait primitives");
61torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); 62torture_param(bool, gp_exp, false, "Use expedited GP wait primitives");
62torture_param(bool, gp_normal, false, 63torture_param(bool, gp_normal, false,
63 "Use normal (non-expedited) GP wait primitives"); 64 "Use normal (non-expedited) GP wait primitives");
65torture_param(bool, gp_sync, false, "Use synchronous GP wait primitives");
64torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers"); 66torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers");
65torture_param(int, n_barrier_cbs, 0, 67torture_param(int, n_barrier_cbs, 0,
66 "# of callbacks/kthreads for barrier testing"); 68 "# of callbacks/kthreads for barrier testing");
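gp_cond and gp_sync join the existing gp_exp/gp_normal switches. torture_param() is a thin wrapper that turns each of these into a read-only module parameter, roughly:

#include <linux/module.h>

/* Approximate expansion of torture_param(bool, gp_cond, false, "Use conditional/async GP wait primitives") */
static bool gp_cond = false;
module_param(gp_cond, bool, 0444);
MODULE_PARM_DESC(gp_cond, "Use conditional/async GP wait primitives");

Loading the module with, say, gp_cond=1 gp_sync=1 (hypothetical invocation) then restricts the writer kthread below to those grace-period primitives; with none of the four set, every primitive supported by the chosen flavor is exercised.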
@@ -138,6 +140,18 @@ static long n_barrier_attempts;
138static long n_barrier_successes; 140static long n_barrier_successes;
139static struct list_head rcu_torture_removed; 141static struct list_head rcu_torture_removed;
140 142
143static int rcu_torture_writer_state;
144#define RTWS_FIXED_DELAY 0
145#define RTWS_DELAY 1
146#define RTWS_REPLACE 2
147#define RTWS_DEF_FREE 3
148#define RTWS_EXP_SYNC 4
149#define RTWS_COND_GET 5
150#define RTWS_COND_SYNC 6
151#define RTWS_SYNC 7
152#define RTWS_STUTTER 8
153#define RTWS_STOPPING 9
154
141#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) 155#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE)
142#define RCUTORTURE_RUNNABLE_INIT 1 156#define RCUTORTURE_RUNNABLE_INIT 1
143#else 157#else
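rcu_torture_writer_state plus the RTWS_* values give the writer kthread a coarse progress indicator: the variable is set to the phase the writer is about to enter, and the stats code further down prints it when the test appears stalled. A hypothetical decoder for the numeric value, using the defines above:

/* Hypothetical helper; names mirror the RTWS_* defines introduced above. */
static const char *rtws_name(int state)
{
	static const char * const names[] = {
		"FIXED_DELAY", "DELAY", "REPLACE", "DEF_FREE", "EXP_SYNC",
		"COND_GET", "COND_SYNC", "SYNC", "STUTTER", "STOPPING",
	};

	if (state < RTWS_FIXED_DELAY || state > RTWS_STOPPING)
		return "UNKNOWN";
	return names[state];
}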
@@ -214,6 +228,7 @@ rcu_torture_free(struct rcu_torture *p)
214 */ 228 */
215 229
216struct rcu_torture_ops { 230struct rcu_torture_ops {
231 int ttype;
217 void (*init)(void); 232 void (*init)(void);
218 int (*readlock)(void); 233 int (*readlock)(void);
219 void (*read_delay)(struct torture_random_state *rrsp); 234 void (*read_delay)(struct torture_random_state *rrsp);
@@ -222,6 +237,8 @@ struct rcu_torture_ops {
222 void (*deferred_free)(struct rcu_torture *p); 237 void (*deferred_free)(struct rcu_torture *p);
223 void (*sync)(void); 238 void (*sync)(void);
224 void (*exp_sync)(void); 239 void (*exp_sync)(void);
240 unsigned long (*get_state)(void);
241 void (*cond_sync)(unsigned long oldstate);
225 void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); 242 void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
226 void (*cb_barrier)(void); 243 void (*cb_barrier)(void);
227 void (*fqs)(void); 244 void (*fqs)(void);
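The new ->get_state/->cond_sync pair exposes RCU's conditional grace-period API to the torture writer: snapshot the grace-period counter, do unrelated work, and block only if no full grace period elapsed on its own in the meantime. For the vanilla flavor the pair is get_state_synchronize_rcu()/cond_synchronize_rcu(), used like this (the surrounding function and the freed object are hypothetical):

#include <linux/rcupdate.h>
#include <linux/slab.h>

static void free_after_grace_period(void *obj)
{
	unsigned long snap = get_state_synchronize_rcu();	/* cheap snapshot */

	/* ... unrelated work; a grace period may complete by itself here ... */

	cond_synchronize_rcu(snap);	/* waits only if that did not happen */
	kfree(obj);
}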
@@ -273,10 +290,48 @@ static int rcu_torture_completed(void)
273 return rcu_batches_completed(); 290 return rcu_batches_completed();
274} 291}
275 292
293/*
294 * Update callback in the pipe. This should be invoked after a grace period.
295 */
296static bool
297rcu_torture_pipe_update_one(struct rcu_torture *rp)
298{
299 int i;
300
301 i = rp->rtort_pipe_count;
302 if (i > RCU_TORTURE_PIPE_LEN)
303 i = RCU_TORTURE_PIPE_LEN;
304 atomic_inc(&rcu_torture_wcount[i]);
305 if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
306 rp->rtort_mbtest = 0;
307 return true;
308 }
309 return false;
310}
311
312/*
313 * Update all callbacks in the pipe. Suitable for synchronous grace-period
314 * primitives.
315 */
316static void
317rcu_torture_pipe_update(struct rcu_torture *old_rp)
318{
319 struct rcu_torture *rp;
320 struct rcu_torture *rp1;
321
322 if (old_rp)
323 list_add(&old_rp->rtort_free, &rcu_torture_removed);
324 list_for_each_entry_safe(rp, rp1, &rcu_torture_removed, rtort_free) {
325 if (rcu_torture_pipe_update_one(rp)) {
326 list_del(&rp->rtort_free);
327 rcu_torture_free(rp);
328 }
329 }
330}
331
276static void 332static void
277rcu_torture_cb(struct rcu_head *p) 333rcu_torture_cb(struct rcu_head *p)
278{ 334{
279 int i;
280 struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); 335 struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu);
281 336
282 if (torture_must_stop_irq()) { 337 if (torture_must_stop_irq()) {
@@ -284,16 +339,10 @@ rcu_torture_cb(struct rcu_head *p)
284 /* The next initialization will pick up the pieces. */ 339 /* The next initialization will pick up the pieces. */
285 return; 340 return;
286 } 341 }
287 i = rp->rtort_pipe_count; 342 if (rcu_torture_pipe_update_one(rp))
288 if (i > RCU_TORTURE_PIPE_LEN)
289 i = RCU_TORTURE_PIPE_LEN;
290 atomic_inc(&rcu_torture_wcount[i]);
291 if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
292 rp->rtort_mbtest = 0;
293 rcu_torture_free(rp); 343 rcu_torture_free(rp);
294 } else { 344 else
295 cur_ops->deferred_free(rp); 345 cur_ops->deferred_free(rp);
296 }
297} 346}
298 347
299static int rcu_no_completed(void) 348static int rcu_no_completed(void)
@@ -312,6 +361,7 @@ static void rcu_sync_torture_init(void)
312} 361}
313 362
314static struct rcu_torture_ops rcu_ops = { 363static struct rcu_torture_ops rcu_ops = {
364 .ttype = RCU_FLAVOR,
315 .init = rcu_sync_torture_init, 365 .init = rcu_sync_torture_init,
316 .readlock = rcu_torture_read_lock, 366 .readlock = rcu_torture_read_lock,
317 .read_delay = rcu_read_delay, 367 .read_delay = rcu_read_delay,
@@ -320,6 +370,8 @@ static struct rcu_torture_ops rcu_ops = {
320 .deferred_free = rcu_torture_deferred_free, 370 .deferred_free = rcu_torture_deferred_free,
321 .sync = synchronize_rcu, 371 .sync = synchronize_rcu,
322 .exp_sync = synchronize_rcu_expedited, 372 .exp_sync = synchronize_rcu_expedited,
373 .get_state = get_state_synchronize_rcu,
374 .cond_sync = cond_synchronize_rcu,
323 .call = call_rcu, 375 .call = call_rcu,
324 .cb_barrier = rcu_barrier, 376 .cb_barrier = rcu_barrier,
325 .fqs = rcu_force_quiescent_state, 377 .fqs = rcu_force_quiescent_state,
@@ -355,6 +407,7 @@ static void rcu_bh_torture_deferred_free(struct rcu_torture *p)
355} 407}
356 408
357static struct rcu_torture_ops rcu_bh_ops = { 409static struct rcu_torture_ops rcu_bh_ops = {
410 .ttype = RCU_BH_FLAVOR,
358 .init = rcu_sync_torture_init, 411 .init = rcu_sync_torture_init,
359 .readlock = rcu_bh_torture_read_lock, 412 .readlock = rcu_bh_torture_read_lock,
360 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 413 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
@@ -397,6 +450,7 @@ call_rcu_busted(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
397} 450}
398 451
399static struct rcu_torture_ops rcu_busted_ops = { 452static struct rcu_torture_ops rcu_busted_ops = {
453 .ttype = INVALID_RCU_FLAVOR,
400 .init = rcu_sync_torture_init, 454 .init = rcu_sync_torture_init,
401 .readlock = rcu_torture_read_lock, 455 .readlock = rcu_torture_read_lock,
402 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 456 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
@@ -479,9 +533,11 @@ static void srcu_torture_stats(char *page)
479 page += sprintf(page, "%s%s per-CPU(idx=%d):", 533 page += sprintf(page, "%s%s per-CPU(idx=%d):",
480 torture_type, TORTURE_FLAG, idx); 534 torture_type, TORTURE_FLAG, idx);
481 for_each_possible_cpu(cpu) { 535 for_each_possible_cpu(cpu) {
482 page += sprintf(page, " %d(%lu,%lu)", cpu, 536 long c0, c1;
483 per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx], 537
484 per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]); 538 c0 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx];
539 c1 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx];
540 page += sprintf(page, " %d(%ld,%ld)", cpu, c0, c1);
485 } 541 }
486 sprintf(page, "\n"); 542 sprintf(page, "\n");
487} 543}
@@ -492,6 +548,7 @@ static void srcu_torture_synchronize_expedited(void)
492} 548}
493 549
494static struct rcu_torture_ops srcu_ops = { 550static struct rcu_torture_ops srcu_ops = {
551 .ttype = SRCU_FLAVOR,
495 .init = rcu_sync_torture_init, 552 .init = rcu_sync_torture_init,
496 .readlock = srcu_torture_read_lock, 553 .readlock = srcu_torture_read_lock,
497 .read_delay = srcu_read_delay, 554 .read_delay = srcu_read_delay,
@@ -527,6 +584,7 @@ static void rcu_sched_torture_deferred_free(struct rcu_torture *p)
527} 584}
528 585
529static struct rcu_torture_ops sched_ops = { 586static struct rcu_torture_ops sched_ops = {
587 .ttype = RCU_SCHED_FLAVOR,
530 .init = rcu_sync_torture_init, 588 .init = rcu_sync_torture_init,
531 .readlock = sched_torture_read_lock, 589 .readlock = sched_torture_read_lock,
532 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 590 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
@@ -688,23 +746,59 @@ rcu_torture_fqs(void *arg)
688static int 746static int
689rcu_torture_writer(void *arg) 747rcu_torture_writer(void *arg)
690{ 748{
691 bool exp; 749 unsigned long gp_snap;
750 bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal;
751 bool gp_sync1 = gp_sync;
692 int i; 752 int i;
693 struct rcu_torture *rp; 753 struct rcu_torture *rp;
694 struct rcu_torture *rp1;
695 struct rcu_torture *old_rp; 754 struct rcu_torture *old_rp;
696 static DEFINE_TORTURE_RANDOM(rand); 755 static DEFINE_TORTURE_RANDOM(rand);
756 int synctype[] = { RTWS_DEF_FREE, RTWS_EXP_SYNC,
757 RTWS_COND_GET, RTWS_SYNC };
758 int nsynctypes = 0;
697 759
698 VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); 760 VERBOSE_TOROUT_STRING("rcu_torture_writer task started");
699 set_user_nice(current, MAX_NICE); 761
762 /* Initialize synctype[] array. If none set, take default. */
763 if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync)
764 gp_cond1 = gp_exp1 = gp_normal1 = gp_sync1 = true;
765 if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync)
766 synctype[nsynctypes++] = RTWS_COND_GET;
767 else if (gp_cond && (!cur_ops->get_state || !cur_ops->cond_sync))
768 pr_alert("rcu_torture_writer: gp_cond without primitives.\n");
769 if (gp_exp1 && cur_ops->exp_sync)
770 synctype[nsynctypes++] = RTWS_EXP_SYNC;
771 else if (gp_exp && !cur_ops->exp_sync)
772 pr_alert("rcu_torture_writer: gp_exp without primitives.\n");
773 if (gp_normal1 && cur_ops->deferred_free)
774 synctype[nsynctypes++] = RTWS_DEF_FREE;
775 else if (gp_normal && !cur_ops->deferred_free)
776 pr_alert("rcu_torture_writer: gp_normal without primitives.\n");
777 if (gp_sync1 && cur_ops->sync)
778 synctype[nsynctypes++] = RTWS_SYNC;
779 else if (gp_sync && !cur_ops->sync)
780 pr_alert("rcu_torture_writer: gp_sync without primitives.\n");
781 if (WARN_ONCE(nsynctypes == 0,
782 "rcu_torture_writer: No update-side primitives.\n")) {
783 /*
784 * No update-side primitives, so don't try updating.
785 * The resulting test won't be testing much, hence the
786 * above WARN_ONCE().
787 */
788 rcu_torture_writer_state = RTWS_STOPPING;
789 torture_kthread_stopping("rcu_torture_writer");
790 }
700 791
701 do { 792 do {
793 rcu_torture_writer_state = RTWS_FIXED_DELAY;
702 schedule_timeout_uninterruptible(1); 794 schedule_timeout_uninterruptible(1);
703 rp = rcu_torture_alloc(); 795 rp = rcu_torture_alloc();
704 if (rp == NULL) 796 if (rp == NULL)
705 continue; 797 continue;
706 rp->rtort_pipe_count = 0; 798 rp->rtort_pipe_count = 0;
799 rcu_torture_writer_state = RTWS_DELAY;
707 udelay(torture_random(&rand) & 0x3ff); 800 udelay(torture_random(&rand) & 0x3ff);
801 rcu_torture_writer_state = RTWS_REPLACE;
708 old_rp = rcu_dereference_check(rcu_torture_current, 802 old_rp = rcu_dereference_check(rcu_torture_current,
709 current == writer_task); 803 current == writer_task);
710 rp->rtort_mbtest = 1; 804 rp->rtort_mbtest = 1;
@@ -716,35 +810,42 @@ rcu_torture_writer(void *arg)
716 i = RCU_TORTURE_PIPE_LEN; 810 i = RCU_TORTURE_PIPE_LEN;
717 atomic_inc(&rcu_torture_wcount[i]); 811 atomic_inc(&rcu_torture_wcount[i]);
718 old_rp->rtort_pipe_count++; 812 old_rp->rtort_pipe_count++;
719 if (gp_normal == gp_exp) 813 switch (synctype[torture_random(&rand) % nsynctypes]) {
720 exp = !!(torture_random(&rand) & 0x80); 814 case RTWS_DEF_FREE:
721 else 815 rcu_torture_writer_state = RTWS_DEF_FREE;
722 exp = gp_exp;
723 if (!exp) {
724 cur_ops->deferred_free(old_rp); 816 cur_ops->deferred_free(old_rp);
725 } else { 817 break;
818 case RTWS_EXP_SYNC:
819 rcu_torture_writer_state = RTWS_EXP_SYNC;
726 cur_ops->exp_sync(); 820 cur_ops->exp_sync();
727 list_add(&old_rp->rtort_free, 821 rcu_torture_pipe_update(old_rp);
728 &rcu_torture_removed); 822 break;
729 list_for_each_entry_safe(rp, rp1, 823 case RTWS_COND_GET:
730 &rcu_torture_removed, 824 rcu_torture_writer_state = RTWS_COND_GET;
731 rtort_free) { 825 gp_snap = cur_ops->get_state();
732 i = rp->rtort_pipe_count; 826 i = torture_random(&rand) % 16;
733 if (i > RCU_TORTURE_PIPE_LEN) 827 if (i != 0)
734 i = RCU_TORTURE_PIPE_LEN; 828 schedule_timeout_interruptible(i);
735 atomic_inc(&rcu_torture_wcount[i]); 829 udelay(torture_random(&rand) % 1000);
736 if (++rp->rtort_pipe_count >= 830 rcu_torture_writer_state = RTWS_COND_SYNC;
737 RCU_TORTURE_PIPE_LEN) { 831 cur_ops->cond_sync(gp_snap);
738 rp->rtort_mbtest = 0; 832 rcu_torture_pipe_update(old_rp);
739 list_del(&rp->rtort_free); 833 break;
740 rcu_torture_free(rp); 834 case RTWS_SYNC:
741 } 835 rcu_torture_writer_state = RTWS_SYNC;
742 } 836 cur_ops->sync();
837 rcu_torture_pipe_update(old_rp);
838 break;
839 default:
840 WARN_ON_ONCE(1);
841 break;
743 } 842 }
744 } 843 }
745 rcutorture_record_progress(++rcu_torture_current_version); 844 rcutorture_record_progress(++rcu_torture_current_version);
845 rcu_torture_writer_state = RTWS_STUTTER;
746 stutter_wait("rcu_torture_writer"); 846 stutter_wait("rcu_torture_writer");
747 } while (!torture_must_stop()); 847 } while (!torture_must_stop());
848 rcu_torture_writer_state = RTWS_STOPPING;
748 torture_kthread_stopping("rcu_torture_writer"); 849 torture_kthread_stopping("rcu_torture_writer");
749 return 0; 850 return 0;
750} 851}
@@ -784,7 +885,7 @@ rcu_torture_fakewriter(void *arg)
784 return 0; 885 return 0;
785} 886}
786 887
787void rcutorture_trace_dump(void) 888static void rcutorture_trace_dump(void)
788{ 889{
789 static atomic_t beenhere = ATOMIC_INIT(0); 890 static atomic_t beenhere = ATOMIC_INIT(0);
790 891
@@ -918,11 +1019,13 @@ rcu_torture_reader(void *arg)
918 __this_cpu_inc(rcu_torture_batch[completed]); 1019 __this_cpu_inc(rcu_torture_batch[completed]);
919 preempt_enable(); 1020 preempt_enable();
920 cur_ops->readunlock(idx); 1021 cur_ops->readunlock(idx);
921 schedule(); 1022 cond_resched();
922 stutter_wait("rcu_torture_reader"); 1023 stutter_wait("rcu_torture_reader");
923 } while (!torture_must_stop()); 1024 } while (!torture_must_stop());
924 if (irqreader && cur_ops->irq_capable) 1025 if (irqreader && cur_ops->irq_capable) {
925 del_timer_sync(&t); 1026 del_timer_sync(&t);
1027 destroy_timer_on_stack(&t);
1028 }
926 torture_kthread_stopping("rcu_torture_reader"); 1029 torture_kthread_stopping("rcu_torture_reader");
927 return 0; 1030 return 0;
928} 1031}
@@ -937,6 +1040,7 @@ rcu_torture_printk(char *page)
937 int i; 1040 int i;
938 long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; 1041 long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
939 long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; 1042 long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
1043 static unsigned long rtcv_snap = ULONG_MAX;
940 1044
941 for_each_possible_cpu(cpu) { 1045 for_each_possible_cpu(cpu) {
942 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { 1046 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
@@ -997,6 +1101,22 @@ rcu_torture_printk(char *page)
997 page += sprintf(page, "\n"); 1101 page += sprintf(page, "\n");
998 if (cur_ops->stats) 1102 if (cur_ops->stats)
999 cur_ops->stats(page); 1103 cur_ops->stats(page);
1104 if (rtcv_snap == rcu_torture_current_version &&
1105 rcu_torture_current != NULL) {
1106 int __maybe_unused flags;
1107 unsigned long __maybe_unused gpnum;
1108 unsigned long __maybe_unused completed;
1109
1110 rcutorture_get_gp_data(cur_ops->ttype,
1111 &flags, &gpnum, &completed);
1112 page += sprintf(page,
1113 "??? Writer stall state %d g%lu c%lu f%#x\n",
1114 rcu_torture_writer_state,
1115 gpnum, completed, flags);
1116 show_rcu_gp_kthreads();
1117 rcutorture_trace_dump();
1118 }
1119 rtcv_snap = rcu_torture_current_version;
1000} 1120}
1001 1121
1002/* 1122/*
@@ -1146,7 +1266,7 @@ static int __init rcu_torture_stall_init(void)
1146} 1266}
1147 1267
1148/* Callback function for RCU barrier testing. */ 1268/* Callback function for RCU barrier testing. */
1149void rcu_torture_barrier_cbf(struct rcu_head *rcu) 1269static void rcu_torture_barrier_cbf(struct rcu_head *rcu)
1150{ 1270{
1151 atomic_inc(&barrier_cbs_invoked); 1271 atomic_inc(&barrier_cbs_invoked);
1152} 1272}
@@ -1416,7 +1536,8 @@ rcu_torture_init(void)
1416 &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &sched_ops, 1536 &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &sched_ops,
1417 }; 1537 };
1418 1538
1419 torture_init_begin(torture_type, verbose, &rcutorture_runnable); 1539 if (!torture_init_begin(torture_type, verbose, &rcutorture_runnable))
1540 return -EBUSY;
1420 1541
1421 /* Process args and tell the world that the torturer is on the job. */ 1542 /* Process args and tell the world that the torturer is on the job. */
1422 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { 1543 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) {
@@ -1441,10 +1562,13 @@ rcu_torture_init(void)
1441 if (cur_ops->init) 1562 if (cur_ops->init)
1442 cur_ops->init(); /* no "goto unwind" prior to this point!!! */ 1563 cur_ops->init(); /* no "goto unwind" prior to this point!!! */
1443 1564
1444 if (nreaders >= 0) 1565 if (nreaders >= 0) {
1445 nrealreaders = nreaders; 1566 nrealreaders = nreaders;
1446 else 1567 } else {
1447 nrealreaders = 2 * num_online_cpus(); 1568 nrealreaders = num_online_cpus() - 1;
1569 if (nrealreaders <= 0)
1570 nrealreaders = 1;
1571 }
1448 rcu_torture_print_module_parms(cur_ops, "Start of test"); 1572 rcu_torture_print_module_parms(cur_ops, "Start of test");
1449 1573
1450 /* Set up the freelist. */ 1574 /* Set up the freelist. */
@@ -1533,7 +1657,8 @@ rcu_torture_init(void)
1533 fqs_duration = 0; 1657 fqs_duration = 0;
1534 if (fqs_duration) { 1658 if (fqs_duration) {
1535 /* Create the fqs thread */ 1659 /* Create the fqs thread */
1536 torture_create_kthread(rcu_torture_fqs, NULL, fqs_task); 1660 firsterr = torture_create_kthread(rcu_torture_fqs, NULL,
1661 fqs_task);
1537 if (firsterr) 1662 if (firsterr)
1538 goto unwind; 1663 goto unwind;
1539 } 1664 }
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h
index 431528520562..858c56569127 100644
--- a/kernel/rcu/tiny_plugin.h
+++ b/kernel/rcu/tiny_plugin.h
@@ -144,7 +144,7 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp)
144 return; 144 return;
145 rcp->ticks_this_gp++; 145 rcp->ticks_this_gp++;
146 j = jiffies; 146 j = jiffies;
147 js = rcp->jiffies_stall; 147 js = ACCESS_ONCE(rcp->jiffies_stall);
148 if (*rcp->curtail && ULONG_CMP_GE(j, js)) { 148 if (*rcp->curtail && ULONG_CMP_GE(j, js)) {
149 pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n", 149 pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n",
150 rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting, 150 rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting,
@@ -152,17 +152,17 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp)
152 dump_stack(); 152 dump_stack();
153 } 153 }
154 if (*rcp->curtail && ULONG_CMP_GE(j, js)) 154 if (*rcp->curtail && ULONG_CMP_GE(j, js))
155 rcp->jiffies_stall = jiffies + 155 ACCESS_ONCE(rcp->jiffies_stall) = jiffies +
156 3 * rcu_jiffies_till_stall_check() + 3; 156 3 * rcu_jiffies_till_stall_check() + 3;
157 else if (ULONG_CMP_GE(j, js)) 157 else if (ULONG_CMP_GE(j, js))
158 rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); 158 ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check();
159} 159}
160 160
161static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp) 161static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp)
162{ 162{
163 rcp->ticks_this_gp = 0; 163 rcp->ticks_this_gp = 0;
164 rcp->gp_start = jiffies; 164 rcp->gp_start = jiffies;
165 rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); 165 ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check();
166} 166}
167 167
168static void check_cpu_stalls(void) 168static void check_cpu_stalls(void)
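The tiny_plugin.h change wraps every cross-CPU-visible access to rcp->jiffies_stall in ACCESS_ONCE(), forcing a single, untorn volatile load or store so a racing reader sees either the old or the new deadline, never a half-written one. ACCESS_ONCE() of that era is just a volatile cast; the read/write pairing looks like this (stall_sketch and bump_stall_deadline() are stand-in names):

#include <linux/compiler.h>	/* provides ACCESS_ONCE(), essentially (*(volatile typeof(x) *)&(x)) */
#include <linux/jiffies.h>
#include <linux/rcupdate.h>	/* ULONG_CMP_GE() */

struct stall_sketch {			/* stand-in for struct rcu_ctrlblk */
	unsigned long jiffies_stall;
};

static void bump_stall_deadline(struct stall_sketch *p, unsigned long delay)
{
	unsigned long js = ACCESS_ONCE(p->jiffies_stall);	/* one untorn load */

	if (ULONG_CMP_GE(jiffies, js))
		ACCESS_ONCE(p->jiffies_stall) = jiffies + delay; /* one untorn store */
}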
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 88b4a1dcb58c..f1ba77363fbb 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -101,7 +101,7 @@ DEFINE_PER_CPU(struct rcu_data, sname##_data)
101RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); 101RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
102RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); 102RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh);
103 103
104static struct rcu_state *rcu_state; 104static struct rcu_state *rcu_state_p;
105LIST_HEAD(rcu_struct_flavors); 105LIST_HEAD(rcu_struct_flavors);
106 106
107/* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */ 107/* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */
@@ -243,7 +243,7 @@ static ulong jiffies_till_next_fqs = ULONG_MAX;
243module_param(jiffies_till_first_fqs, ulong, 0644); 243module_param(jiffies_till_first_fqs, ulong, 0644);
244module_param(jiffies_till_next_fqs, ulong, 0644); 244module_param(jiffies_till_next_fqs, ulong, 0644);
245 245
246static void rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, 246static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
247 struct rcu_data *rdp); 247 struct rcu_data *rdp);
248static void force_qs_rnp(struct rcu_state *rsp, 248static void force_qs_rnp(struct rcu_state *rsp,
249 int (*f)(struct rcu_data *rsp, bool *isidle, 249 int (*f)(struct rcu_data *rsp, bool *isidle,
@@ -271,6 +271,15 @@ long rcu_batches_completed_bh(void)
271EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); 271EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
272 272
273/* 273/*
274 * Force a quiescent state.
275 */
276void rcu_force_quiescent_state(void)
277{
278 force_quiescent_state(rcu_state_p);
279}
280EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
281
282/*
274 * Force a quiescent state for RCU BH. 283 * Force a quiescent state for RCU BH.
275 */ 284 */
276void rcu_bh_force_quiescent_state(void) 285void rcu_bh_force_quiescent_state(void)
@@ -280,6 +289,21 @@ void rcu_bh_force_quiescent_state(void)
280EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); 289EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
281 290
282/* 291/*
292 * Show the state of the grace-period kthreads.
293 */
294void show_rcu_gp_kthreads(void)
295{
296 struct rcu_state *rsp;
297
298 for_each_rcu_flavor(rsp) {
299 pr_info("%s: wait state: %d ->state: %#lx\n",
300 rsp->name, rsp->gp_state, rsp->gp_kthread->state);
301 /* sched_show_task(rsp->gp_kthread); */
302 }
303}
304EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads);
305
306/*
283 * Record the number of times rcutorture tests have been initiated and 307 * Record the number of times rcutorture tests have been initiated and
284 * terminated. This information allows the debugfs tracing stats to be 308 * terminated. This information allows the debugfs tracing stats to be
285 * correlated to the rcutorture messages, even when the rcutorture module 309 * correlated to the rcutorture messages, even when the rcutorture module
@@ -294,6 +318,39 @@ void rcutorture_record_test_transition(void)
294EXPORT_SYMBOL_GPL(rcutorture_record_test_transition); 318EXPORT_SYMBOL_GPL(rcutorture_record_test_transition);
295 319
296/* 320/*
321 * Send along grace-period-related data for rcutorture diagnostics.
322 */
323void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
324 unsigned long *gpnum, unsigned long *completed)
325{
326 struct rcu_state *rsp = NULL;
327
328 switch (test_type) {
329 case RCU_FLAVOR:
330 rsp = rcu_state_p;
331 break;
332 case RCU_BH_FLAVOR:
333 rsp = &rcu_bh_state;
334 break;
335 case RCU_SCHED_FLAVOR:
336 rsp = &rcu_sched_state;
337 break;
338 default:
339 break;
340 }
341 if (rsp != NULL) {
342 *flags = ACCESS_ONCE(rsp->gp_flags);
343 *gpnum = ACCESS_ONCE(rsp->gpnum);
344 *completed = ACCESS_ONCE(rsp->completed);
345 return;
346 }
347 *flags = 0;
348 *gpnum = 0;
349 *completed = 0;
350}
351EXPORT_SYMBOL_GPL(rcutorture_get_gp_data);
352
353/*
297 * Record the number of writer passes through the current rcutorture test. 354 * Record the number of writer passes through the current rcutorture test.
298 * This is also used to correlate debugfs tracing stats with the rcutorture 355 * This is also used to correlate debugfs tracing stats with the rcutorture
299 * messages. 356 * messages.
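rcutorture_get_gp_data() gives the torture module a read-only view of the chosen flavor's ->gp_flags/->gpnum/->completed without handing out rcu_state_p itself; its caller in this series is the writer-stall report added to rcu_torture_printk() in the rcutorture hunks above. A sketch of a call site (the wrapper function is hypothetical, and the header carrying the declaration is assumed):

#include <linux/rcupdate.h>	/* enum rcutorture_type; declaration location assumed */
#include <linux/printk.h>

static void dump_gp_progress(enum rcutorture_type ttype)
{
	int flags;
	unsigned long gpnum, completed;

	rcutorture_get_gp_data(ttype, &flags, &gpnum, &completed);
	pr_info("gp progress: g%lu c%lu f%#x\n", gpnum, completed, flags);
}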
@@ -324,6 +381,28 @@ cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
324} 381}
325 382
326/* 383/*
384 * Return the root node of the specified rcu_state structure.
385 */
386static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
387{
388 return &rsp->node[0];
389}
390
391/*
392 * Is there any need for future grace periods?
393 * Interrupts must be disabled. If the caller does not hold the root
394 * rnp_node structure's ->lock, the results are advisory only.
395 */
396static int rcu_future_needs_gp(struct rcu_state *rsp)
397{
398 struct rcu_node *rnp = rcu_get_root(rsp);
399 int idx = (ACCESS_ONCE(rnp->completed) + 1) & 0x1;
400 int *fp = &rnp->need_future_gp[idx];
401
402 return ACCESS_ONCE(*fp);
403}
404
405/*
327 * Does the current CPU require a not-yet-started grace period? 406 * Does the current CPU require a not-yet-started grace period?
328 * The caller must have disabled interrupts to prevent races with 407 * The caller must have disabled interrupts to prevent races with
329 * normal callback registry. 408 * normal callback registry.
@@ -335,7 +414,7 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
335 414
336 if (rcu_gp_in_progress(rsp)) 415 if (rcu_gp_in_progress(rsp))
337 return 0; /* No, a grace period is already in progress. */ 416 return 0; /* No, a grace period is already in progress. */
338 if (rcu_nocb_needs_gp(rsp)) 417 if (rcu_future_needs_gp(rsp))
339 return 1; /* Yes, a no-CBs CPU needs one. */ 418 return 1; /* Yes, a no-CBs CPU needs one. */
340 if (!rdp->nxttail[RCU_NEXT_TAIL]) 419 if (!rdp->nxttail[RCU_NEXT_TAIL])
341 return 0; /* No, this is a no-CBs (or offline) CPU. */ 420 return 0; /* No, this is a no-CBs (or offline) CPU. */
@@ -350,14 +429,6 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
350} 429}
351 430
352/* 431/*
353 * Return the root node of the specified rcu_state structure.
354 */
355static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
356{
357 return &rsp->node[0];
358}
359
360/*
361 * rcu_eqs_enter_common - current CPU is moving towards extended quiescent state 432 * rcu_eqs_enter_common - current CPU is moving towards extended quiescent state
362 * 433 *
363 * If the new value of the ->dynticks_nesting counter now is zero, 434 * If the new value of the ->dynticks_nesting counter now is zero,
@@ -758,7 +829,12 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp,
758{ 829{
759 rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); 830 rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks);
760 rcu_sysidle_check_cpu(rdp, isidle, maxj); 831 rcu_sysidle_check_cpu(rdp, isidle, maxj);
761 return (rdp->dynticks_snap & 0x1) == 0; 832 if ((rdp->dynticks_snap & 0x1) == 0) {
833 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
834 return 1;
835 } else {
836 return 0;
837 }
762} 838}
763 839
764/* 840/*
@@ -834,7 +910,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
834 * we will beat on the first one until it gets unstuck, then move 910 * we will beat on the first one until it gets unstuck, then move
835 * to the next. Only do this for the primary flavor of RCU. 911 * to the next. Only do this for the primary flavor of RCU.
836 */ 912 */
837 if (rdp->rsp == rcu_state && 913 if (rdp->rsp == rcu_state_p &&
838 ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) { 914 ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) {
839 rdp->rsp->jiffies_resched += 5; 915 rdp->rsp->jiffies_resched += 5;
840 resched_cpu(rdp->cpu); 916 resched_cpu(rdp->cpu);
@@ -851,7 +927,7 @@ static void record_gp_stall_check_time(struct rcu_state *rsp)
851 rsp->gp_start = j; 927 rsp->gp_start = j;
852 smp_wmb(); /* Record start time before stall time. */ 928 smp_wmb(); /* Record start time before stall time. */
853 j1 = rcu_jiffies_till_stall_check(); 929 j1 = rcu_jiffies_till_stall_check();
854 rsp->jiffies_stall = j + j1; 930 ACCESS_ONCE(rsp->jiffies_stall) = j + j1;
855 rsp->jiffies_resched = j + j1 / 2; 931 rsp->jiffies_resched = j + j1 / 2;
856} 932}
857 933
@@ -890,12 +966,12 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
890 /* Only let one CPU complain about others per time interval. */ 966 /* Only let one CPU complain about others per time interval. */
891 967
892 raw_spin_lock_irqsave(&rnp->lock, flags); 968 raw_spin_lock_irqsave(&rnp->lock, flags);
893 delta = jiffies - rsp->jiffies_stall; 969 delta = jiffies - ACCESS_ONCE(rsp->jiffies_stall);
894 if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { 970 if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) {
895 raw_spin_unlock_irqrestore(&rnp->lock, flags); 971 raw_spin_unlock_irqrestore(&rnp->lock, flags);
896 return; 972 return;
897 } 973 }
898 rsp->jiffies_stall = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; 974 ACCESS_ONCE(rsp->jiffies_stall) = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
899 raw_spin_unlock_irqrestore(&rnp->lock, flags); 975 raw_spin_unlock_irqrestore(&rnp->lock, flags);
900 976
901 /* 977 /*
@@ -932,9 +1008,9 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
932 print_cpu_stall_info_end(); 1008 print_cpu_stall_info_end();
933 for_each_possible_cpu(cpu) 1009 for_each_possible_cpu(cpu)
934 totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; 1010 totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen;
935 pr_cont("(detected by %d, t=%ld jiffies, g=%lu, c=%lu, q=%lu)\n", 1011 pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n",
936 smp_processor_id(), (long)(jiffies - rsp->gp_start), 1012 smp_processor_id(), (long)(jiffies - rsp->gp_start),
937 rsp->gpnum, rsp->completed, totqlen); 1013 (long)rsp->gpnum, (long)rsp->completed, totqlen);
938 if (ndetected == 0) 1014 if (ndetected == 0)
939 pr_err("INFO: Stall ended before state dump start\n"); 1015 pr_err("INFO: Stall ended before state dump start\n");
940 else if (!trigger_all_cpu_backtrace()) 1016 else if (!trigger_all_cpu_backtrace())
@@ -947,12 +1023,6 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
947 force_quiescent_state(rsp); /* Kick them all. */ 1023 force_quiescent_state(rsp); /* Kick them all. */
948} 1024}
949 1025
950/*
951 * This function really isn't for public consumption, but RCU is special in
952 * that context switches can allow the state machine to make progress.
953 */
954extern void resched_cpu(int cpu);
955
956static void print_cpu_stall(struct rcu_state *rsp) 1026static void print_cpu_stall(struct rcu_state *rsp)
957{ 1027{
958 int cpu; 1028 int cpu;
@@ -971,14 +1041,15 @@ static void print_cpu_stall(struct rcu_state *rsp)
971 print_cpu_stall_info_end(); 1041 print_cpu_stall_info_end();
972 for_each_possible_cpu(cpu) 1042 for_each_possible_cpu(cpu)
973 totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; 1043 totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen;
974 pr_cont(" (t=%lu jiffies g=%lu c=%lu q=%lu)\n", 1044 pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n",
975 jiffies - rsp->gp_start, rsp->gpnum, rsp->completed, totqlen); 1045 jiffies - rsp->gp_start,
1046 (long)rsp->gpnum, (long)rsp->completed, totqlen);
976 if (!trigger_all_cpu_backtrace()) 1047 if (!trigger_all_cpu_backtrace())
977 dump_stack(); 1048 dump_stack();
978 1049
979 raw_spin_lock_irqsave(&rnp->lock, flags); 1050 raw_spin_lock_irqsave(&rnp->lock, flags);
980 if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall)) 1051 if (ULONG_CMP_GE(jiffies, ACCESS_ONCE(rsp->jiffies_stall)))
981 rsp->jiffies_stall = jiffies + 1052 ACCESS_ONCE(rsp->jiffies_stall) = jiffies +
982 3 * rcu_jiffies_till_stall_check() + 3; 1053 3 * rcu_jiffies_till_stall_check() + 3;
983 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1054 raw_spin_unlock_irqrestore(&rnp->lock, flags);
984 1055
@@ -1062,7 +1133,7 @@ void rcu_cpu_stall_reset(void)
1062 struct rcu_state *rsp; 1133 struct rcu_state *rsp;
1063 1134
1064 for_each_rcu_flavor(rsp) 1135 for_each_rcu_flavor(rsp)
1065 rsp->jiffies_stall = jiffies + ULONG_MAX / 2; 1136 ACCESS_ONCE(rsp->jiffies_stall) = jiffies + ULONG_MAX / 2;
1066} 1137}
1067 1138
1068/* 1139/*
@@ -1123,15 +1194,18 @@ static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
1123/* 1194/*
1124 * Start some future grace period, as needed to handle newly arrived 1195 * Start some future grace period, as needed to handle newly arrived
1125 * callbacks. The required future grace periods are recorded in each 1196 * callbacks. The required future grace periods are recorded in each
1126 * rcu_node structure's ->need_future_gp field. 1197 * rcu_node structure's ->need_future_gp field. Returns true if there
1198 * is reason to awaken the grace-period kthread.
1127 * 1199 *
1128 * The caller must hold the specified rcu_node structure's ->lock. 1200 * The caller must hold the specified rcu_node structure's ->lock.
1129 */ 1201 */
1130static unsigned long __maybe_unused 1202static bool __maybe_unused
1131rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) 1203rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
1204 unsigned long *c_out)
1132{ 1205{
1133 unsigned long c; 1206 unsigned long c;
1134 int i; 1207 int i;
1208 bool ret = false;
1135 struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); 1209 struct rcu_node *rnp_root = rcu_get_root(rdp->rsp);
1136 1210
1137 /* 1211 /*
@@ -1142,7 +1216,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
1142 trace_rcu_future_gp(rnp, rdp, c, TPS("Startleaf")); 1216 trace_rcu_future_gp(rnp, rdp, c, TPS("Startleaf"));
1143 if (rnp->need_future_gp[c & 0x1]) { 1217 if (rnp->need_future_gp[c & 0x1]) {
1144 trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartleaf")); 1218 trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartleaf"));
1145 return c; 1219 goto out;
1146 } 1220 }
1147 1221
1148 /* 1222 /*
@@ -1156,7 +1230,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
1156 ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) { 1230 ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) {
1157 rnp->need_future_gp[c & 0x1]++; 1231 rnp->need_future_gp[c & 0x1]++;
1158 trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf")); 1232 trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf"));
1159 return c; 1233 goto out;
1160 } 1234 }
1161 1235
1162 /* 1236 /*
@@ -1197,12 +1271,15 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
1197 trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleafroot")); 1271 trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleafroot"));
1198 } else { 1272 } else {
1199 trace_rcu_future_gp(rnp, rdp, c, TPS("Startedroot")); 1273 trace_rcu_future_gp(rnp, rdp, c, TPS("Startedroot"));
1200 rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp); 1274 ret = rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp);
1201 } 1275 }
1202unlock_out: 1276unlock_out:
1203 if (rnp != rnp_root) 1277 if (rnp != rnp_root)
1204 raw_spin_unlock(&rnp_root->lock); 1278 raw_spin_unlock(&rnp_root->lock);
1205 return c; 1279out:
1280 if (c_out != NULL)
1281 *c_out = c;
1282 return ret;
1206} 1283}
1207 1284
1208/* 1285/*
@@ -1226,25 +1303,43 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
1226} 1303}
1227 1304
1228/* 1305/*
1306 * Awaken the grace-period kthread for the specified flavor of RCU.
1307 * Don't do a self-awaken, and don't bother awakening when there is
1308 * nothing for the grace-period kthread to do (as in several CPUs
1309 * raced to awaken, and we lost), and finally don't try to awaken
1310 * a kthread that has not yet been created.
1311 */
1312static void rcu_gp_kthread_wake(struct rcu_state *rsp)
1313{
1314 if (current == rsp->gp_kthread ||
1315 !ACCESS_ONCE(rsp->gp_flags) ||
1316 !rsp->gp_kthread)
1317 return;
1318 wake_up(&rsp->gp_wq);
1319}
1320
1321/*
1229 * If there is room, assign a ->completed number to any callbacks on 1322 * If there is room, assign a ->completed number to any callbacks on
1230 * this CPU that have not already been assigned. Also accelerate any 1323 * this CPU that have not already been assigned. Also accelerate any
1231 * callbacks that were previously assigned a ->completed number that has 1324 * callbacks that were previously assigned a ->completed number that has
1232 * since proven to be too conservative, which can happen if callbacks get 1325 * since proven to be too conservative, which can happen if callbacks get
1233 * assigned a ->completed number while RCU is idle, but with reference to 1326 * assigned a ->completed number while RCU is idle, but with reference to
1234 * a non-root rcu_node structure. This function is idempotent, so it does 1327 * a non-root rcu_node structure. This function is idempotent, so it does
1235 * not hurt to call it repeatedly. 1328 * not hurt to call it repeatedly. Returns a flag saying that we should
1329 * awaken the RCU grace-period kthread.
1236 * 1330 *
1237 * The caller must hold rnp->lock with interrupts disabled. 1331 * The caller must hold rnp->lock with interrupts disabled.
1238 */ 1332 */
1239static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, 1333static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1240 struct rcu_data *rdp) 1334 struct rcu_data *rdp)
1241{ 1335{
1242 unsigned long c; 1336 unsigned long c;
1243 int i; 1337 int i;
1338 bool ret;
1244 1339
1245 /* If the CPU has no callbacks, nothing to do. */ 1340 /* If the CPU has no callbacks, nothing to do. */
1246 if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL]) 1341 if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL])
1247 return; 1342 return false;
1248 1343
1249 /* 1344 /*
1250 * Starting from the sublist containing the callbacks most 1345 * Starting from the sublist containing the callbacks most
@@ -1273,7 +1368,7 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1273 * be grouped into. 1368 * be grouped into.
1274 */ 1369 */
1275 if (++i >= RCU_NEXT_TAIL) 1370 if (++i >= RCU_NEXT_TAIL)
1276 return; 1371 return false;
1277 1372
1278 /* 1373 /*
1279 * Assign all subsequent callbacks' ->completed number to the next 1374 * Assign all subsequent callbacks' ->completed number to the next
@@ -1285,13 +1380,14 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1285 rdp->nxtcompleted[i] = c; 1380 rdp->nxtcompleted[i] = c;
1286 } 1381 }
1287 /* Record any needed additional grace periods. */ 1382 /* Record any needed additional grace periods. */
1288 rcu_start_future_gp(rnp, rdp); 1383 ret = rcu_start_future_gp(rnp, rdp, NULL);
1289 1384
1290 /* Trace depending on how much we were able to accelerate. */ 1385 /* Trace depending on how much we were able to accelerate. */
1291 if (!*rdp->nxttail[RCU_WAIT_TAIL]) 1386 if (!*rdp->nxttail[RCU_WAIT_TAIL])
1292 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB")); 1387 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB"));
1293 else 1388 else
1294 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB")); 1389 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB"));
1390 return ret;
1295} 1391}
1296 1392
1297/* 1393/*
@@ -1300,17 +1396,18 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1300 * assign ->completed numbers to any callbacks in the RCU_NEXT_TAIL 1396 * assign ->completed numbers to any callbacks in the RCU_NEXT_TAIL
1301 * sublist. This function is idempotent, so it does not hurt to 1397 * sublist. This function is idempotent, so it does not hurt to
1302 * invoke it repeatedly. As long as it is not invoked -too- often... 1398 * invoke it repeatedly. As long as it is not invoked -too- often...
1399 * Returns true if the RCU grace-period kthread needs to be awakened.
1303 * 1400 *
1304 * The caller must hold rnp->lock with interrupts disabled. 1401 * The caller must hold rnp->lock with interrupts disabled.
1305 */ 1402 */
1306static void rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, 1403static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1307 struct rcu_data *rdp) 1404 struct rcu_data *rdp)
1308{ 1405{
1309 int i, j; 1406 int i, j;
1310 1407
1311 /* If the CPU has no callbacks, nothing to do. */ 1408 /* If the CPU has no callbacks, nothing to do. */
1312 if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL]) 1409 if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL])
1313 return; 1410 return false;
1314 1411
1315 /* 1412 /*
1316 * Find all callbacks whose ->completed numbers indicate that they 1413 * Find all callbacks whose ->completed numbers indicate that they
@@ -1334,26 +1431,30 @@ static void rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1334 } 1431 }
1335 1432
1336 /* Classify any remaining callbacks. */ 1433 /* Classify any remaining callbacks. */
1337 rcu_accelerate_cbs(rsp, rnp, rdp); 1434 return rcu_accelerate_cbs(rsp, rnp, rdp);
1338} 1435}
1339 1436
1340/* 1437/*
1341 * Update CPU-local rcu_data state to record the beginnings and ends of 1438 * Update CPU-local rcu_data state to record the beginnings and ends of
1342 * grace periods. The caller must hold the ->lock of the leaf rcu_node 1439 * grace periods. The caller must hold the ->lock of the leaf rcu_node
1343 * structure corresponding to the current CPU, and must have irqs disabled. 1440 * structure corresponding to the current CPU, and must have irqs disabled.
1441 * Returns true if the grace-period kthread needs to be awakened.
1344 */ 1442 */
1345static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) 1443static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
1444 struct rcu_data *rdp)
1346{ 1445{
1446 bool ret;
1447
1347 /* Handle the ends of any preceding grace periods first. */ 1448 /* Handle the ends of any preceding grace periods first. */
1348 if (rdp->completed == rnp->completed) { 1449 if (rdp->completed == rnp->completed) {
1349 1450
1350 /* No grace period end, so just accelerate recent callbacks. */ 1451 /* No grace period end, so just accelerate recent callbacks. */
1351 rcu_accelerate_cbs(rsp, rnp, rdp); 1452 ret = rcu_accelerate_cbs(rsp, rnp, rdp);
1352 1453
1353 } else { 1454 } else {
1354 1455
1355 /* Advance callbacks. */ 1456 /* Advance callbacks. */
1356 rcu_advance_cbs(rsp, rnp, rdp); 1457 ret = rcu_advance_cbs(rsp, rnp, rdp);
1357 1458
1358 /* Remember that we saw this grace-period completion. */ 1459 /* Remember that we saw this grace-period completion. */
1359 rdp->completed = rnp->completed; 1460 rdp->completed = rnp->completed;
@@ -1372,11 +1473,13 @@ static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struc
1372 rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); 1473 rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask);
1373 zero_cpu_stall_ticks(rdp); 1474 zero_cpu_stall_ticks(rdp);
1374 } 1475 }
1476 return ret;
1375} 1477}
1376 1478
1377static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) 1479static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
1378{ 1480{
1379 unsigned long flags; 1481 unsigned long flags;
1482 bool needwake;
1380 struct rcu_node *rnp; 1483 struct rcu_node *rnp;
1381 1484
1382 local_irq_save(flags); 1485 local_irq_save(flags);
@@ -1388,8 +1491,10 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
1388 return; 1491 return;
1389 } 1492 }
1390 smp_mb__after_unlock_lock(); 1493 smp_mb__after_unlock_lock();
1391 __note_gp_changes(rsp, rnp, rdp); 1494 needwake = __note_gp_changes(rsp, rnp, rdp);
1392 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1495 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1496 if (needwake)
1497 rcu_gp_kthread_wake(rsp);
1393} 1498}
1394 1499
1395/* 1500/*
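The recurring shape of these rcu_accelerate_cbs()/rcu_advance_cbs()/__note_gp_changes() changes is that the decision to wake the grace-period kthread is made while rnp->lock is held, but the wake itself (rcu_gp_kthread_wake()) is issued only after the lock is dropped, replacing the rsp_wakeup() irq_work that is deleted further down. The pattern in isolation, with stand-in names:

#include <linux/spinlock.h>

static bool update_state_locked(void)	/* stand-in for __note_gp_changes() */
{
	return true;			/* pretend a new grace period is needed */
}

static void gp_kthread_wake(void)	/* stand-in for rcu_gp_kthread_wake(rsp) */
{
}

static void note_changes_sketch(spinlock_t *lock)
{
	unsigned long flags;
	bool needwake;

	spin_lock_irqsave(lock, flags);
	needwake = update_state_locked();	/* decide under the lock */
	spin_unlock_irqrestore(lock, flags);
	if (needwake)				/* wake with the lock already released */
		gp_kthread_wake();
}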
@@ -1403,12 +1508,12 @@ static int rcu_gp_init(struct rcu_state *rsp)
1403 rcu_bind_gp_kthread(); 1508 rcu_bind_gp_kthread();
1404 raw_spin_lock_irq(&rnp->lock); 1509 raw_spin_lock_irq(&rnp->lock);
1405 smp_mb__after_unlock_lock(); 1510 smp_mb__after_unlock_lock();
1406 if (rsp->gp_flags == 0) { 1511 if (!ACCESS_ONCE(rsp->gp_flags)) {
1407 /* Spurious wakeup, tell caller to go back to sleep. */ 1512 /* Spurious wakeup, tell caller to go back to sleep. */
1408 raw_spin_unlock_irq(&rnp->lock); 1513 raw_spin_unlock_irq(&rnp->lock);
1409 return 0; 1514 return 0;
1410 } 1515 }
1411 rsp->gp_flags = 0; /* Clear all flags: New grace period. */ 1516 ACCESS_ONCE(rsp->gp_flags) = 0; /* Clear all flags: New grace period. */
1412 1517
1413 if (WARN_ON_ONCE(rcu_gp_in_progress(rsp))) { 1518 if (WARN_ON_ONCE(rcu_gp_in_progress(rsp))) {
1414 /* 1519 /*
@@ -1453,7 +1558,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
1453 WARN_ON_ONCE(rnp->completed != rsp->completed); 1558 WARN_ON_ONCE(rnp->completed != rsp->completed);
1454 ACCESS_ONCE(rnp->completed) = rsp->completed; 1559 ACCESS_ONCE(rnp->completed) = rsp->completed;
1455 if (rnp == rdp->mynode) 1560 if (rnp == rdp->mynode)
1456 __note_gp_changes(rsp, rnp, rdp); 1561 (void)__note_gp_changes(rsp, rnp, rdp);
1457 rcu_preempt_boost_start_gp(rnp); 1562 rcu_preempt_boost_start_gp(rnp);
1458 trace_rcu_grace_period_init(rsp->name, rnp->gpnum, 1563 trace_rcu_grace_period_init(rsp->name, rnp->gpnum,
1459 rnp->level, rnp->grplo, 1564 rnp->level, rnp->grplo,
@@ -1501,7 +1606,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
1501 if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { 1606 if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
1502 raw_spin_lock_irq(&rnp->lock); 1607 raw_spin_lock_irq(&rnp->lock);
1503 smp_mb__after_unlock_lock(); 1608 smp_mb__after_unlock_lock();
1504 rsp->gp_flags &= ~RCU_GP_FLAG_FQS; 1609 ACCESS_ONCE(rsp->gp_flags) &= ~RCU_GP_FLAG_FQS;
1505 raw_spin_unlock_irq(&rnp->lock); 1610 raw_spin_unlock_irq(&rnp->lock);
1506 } 1611 }
1507 return fqs_state; 1612 return fqs_state;
@@ -1513,6 +1618,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
1513static void rcu_gp_cleanup(struct rcu_state *rsp) 1618static void rcu_gp_cleanup(struct rcu_state *rsp)
1514{ 1619{
1515 unsigned long gp_duration; 1620 unsigned long gp_duration;
1621 bool needgp = false;
1516 int nocb = 0; 1622 int nocb = 0;
1517 struct rcu_data *rdp; 1623 struct rcu_data *rdp;
1518 struct rcu_node *rnp = rcu_get_root(rsp); 1624 struct rcu_node *rnp = rcu_get_root(rsp);
@@ -1548,7 +1654,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
1548 ACCESS_ONCE(rnp->completed) = rsp->gpnum; 1654 ACCESS_ONCE(rnp->completed) = rsp->gpnum;
1549 rdp = this_cpu_ptr(rsp->rda); 1655 rdp = this_cpu_ptr(rsp->rda);
1550 if (rnp == rdp->mynode) 1656 if (rnp == rdp->mynode)
1551 __note_gp_changes(rsp, rnp, rdp); 1657 needgp = __note_gp_changes(rsp, rnp, rdp) || needgp;
1552 /* smp_mb() provided by prior unlock-lock pair. */ 1658 /* smp_mb() provided by prior unlock-lock pair. */
1553 nocb += rcu_future_gp_cleanup(rsp, rnp); 1659 nocb += rcu_future_gp_cleanup(rsp, rnp);
1554 raw_spin_unlock_irq(&rnp->lock); 1660 raw_spin_unlock_irq(&rnp->lock);
@@ -1564,9 +1670,10 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
1564 trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end")); 1670 trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end"));
1565 rsp->fqs_state = RCU_GP_IDLE; 1671 rsp->fqs_state = RCU_GP_IDLE;
1566 rdp = this_cpu_ptr(rsp->rda); 1672 rdp = this_cpu_ptr(rsp->rda);
1567 rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */ 1673 /* Advance CBs to reduce false positives below. */
1568 if (cpu_needs_another_gp(rsp, rdp)) { 1674 needgp = rcu_advance_cbs(rsp, rnp, rdp) || needgp;
1569 rsp->gp_flags = RCU_GP_FLAG_INIT; 1675 if (needgp || cpu_needs_another_gp(rsp, rdp)) {
1676 ACCESS_ONCE(rsp->gp_flags) = RCU_GP_FLAG_INIT;
1570 trace_rcu_grace_period(rsp->name, 1677 trace_rcu_grace_period(rsp->name,
1571 ACCESS_ONCE(rsp->gpnum), 1678 ACCESS_ONCE(rsp->gpnum),
1572 TPS("newreq")); 1679 TPS("newreq"));
@@ -1593,6 +1700,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
1593 trace_rcu_grace_period(rsp->name, 1700 trace_rcu_grace_period(rsp->name,
1594 ACCESS_ONCE(rsp->gpnum), 1701 ACCESS_ONCE(rsp->gpnum),
1595 TPS("reqwait")); 1702 TPS("reqwait"));
1703 rsp->gp_state = RCU_GP_WAIT_GPS;
1596 wait_event_interruptible(rsp->gp_wq, 1704 wait_event_interruptible(rsp->gp_wq,
1597 ACCESS_ONCE(rsp->gp_flags) & 1705 ACCESS_ONCE(rsp->gp_flags) &
1598 RCU_GP_FLAG_INIT); 1706 RCU_GP_FLAG_INIT);
@@ -1620,6 +1728,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
1620 trace_rcu_grace_period(rsp->name, 1728 trace_rcu_grace_period(rsp->name,
1621 ACCESS_ONCE(rsp->gpnum), 1729 ACCESS_ONCE(rsp->gpnum),
1622 TPS("fqswait")); 1730 TPS("fqswait"));
1731 rsp->gp_state = RCU_GP_WAIT_FQS;
1623 ret = wait_event_interruptible_timeout(rsp->gp_wq, 1732 ret = wait_event_interruptible_timeout(rsp->gp_wq,
1624 ((gf = ACCESS_ONCE(rsp->gp_flags)) & 1733 ((gf = ACCESS_ONCE(rsp->gp_flags)) &
1625 RCU_GP_FLAG_FQS) || 1734 RCU_GP_FLAG_FQS) ||
@@ -1665,14 +1774,6 @@ static int __noreturn rcu_gp_kthread(void *arg)
1665 } 1774 }
1666} 1775}
1667 1776
1668static void rsp_wakeup(struct irq_work *work)
1669{
1670 struct rcu_state *rsp = container_of(work, struct rcu_state, wakeup_work);
1671
1672 /* Wake up rcu_gp_kthread() to start the grace period. */
1673 wake_up(&rsp->gp_wq);
1674}
1675
1676/* 1777/*
1677 * Start a new RCU grace period if warranted, re-initializing the hierarchy 1778 * Start a new RCU grace period if warranted, re-initializing the hierarchy
1678 * in preparation for detecting the next grace period. The caller must hold 1779 * in preparation for detecting the next grace period. The caller must hold
@@ -1681,8 +1782,10 @@ static void rsp_wakeup(struct irq_work *work)
1681 * Note that it is legal for a dying CPU (which is marked as offline) to 1782 * Note that it is legal for a dying CPU (which is marked as offline) to
1682 * invoke this function. This can happen when the dying CPU reports its 1783 * invoke this function. This can happen when the dying CPU reports its
1683 * quiescent state. 1784 * quiescent state.
1785 *
1786 * Returns true if the grace-period kthread must be awakened.
1684 */ 1787 */
1685static void 1788static bool
1686rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, 1789rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
1687 struct rcu_data *rdp) 1790 struct rcu_data *rdp)
1688{ 1791{
@@ -1693,20 +1796,18 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
1693 * or a grace period is already in progress. 1796 * or a grace period is already in progress.
1694 * Either way, don't start a new grace period. 1797 * Either way, don't start a new grace period.
1695 */ 1798 */
1696 return; 1799 return false;
1697 } 1800 }
1698 rsp->gp_flags = RCU_GP_FLAG_INIT; 1801 ACCESS_ONCE(rsp->gp_flags) = RCU_GP_FLAG_INIT;
1699 trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), 1802 trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum),
1700 TPS("newreq")); 1803 TPS("newreq"));
1701 1804
1702 /* 1805 /*
1703 * We can't do wakeups while holding the rnp->lock, as that 1806 * We can't do wakeups while holding the rnp->lock, as that
1704 * could cause possible deadlocks with the rq->lock. Defer 1807 * could cause possible deadlocks with the rq->lock. Defer
1705 * the wakeup to interrupt context. And don't bother waking 1808 * the wakeup to our caller.
1706 * up the running kthread.
1707 */ 1809 */
1708 if (current != rsp->gp_kthread) 1810 return true;
1709 irq_work_queue(&rsp->wakeup_work);
1710} 1811}
1711 1812
1712/* 1813/*
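The hunk above replaces the irq_work based wakeup with a return value: the decision to wake the grace-period kthread is made while rnp->lock is held, but the wake_up() itself is left to the caller once the lock has been dropped, which sidesteps the rnp->lock vs. rq->lock ordering problem named in the comment. A rough userspace analogue of that decide-under-the-lock, wake-after-unlock shape (pthreads here, not the kernel's rnp->lock/gp_wq machinery; all names below are illustrative):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t wq = PTHREAD_COND_INITIALIZER;
static bool work_ready;

static void *worker(void *arg)
{
	pthread_mutex_lock(&lock);
	while (!work_ready)
		pthread_cond_wait(&wq, &lock);
	work_ready = false;
	pthread_mutex_unlock(&lock);
	puts("worker: got work");
	return NULL;
}

/* Returns true if the worker must be awakened; the caller wakes it
 * only after the lock has been released. */
static bool queue_work_locked(void)
{
	bool needwake = !work_ready;

	work_ready = true;
	return needwake;
}

int main(void)
{
	pthread_t tid;
	bool needwake;

	pthread_create(&tid, NULL, worker, NULL);

	pthread_mutex_lock(&lock);
	needwake = queue_work_locked();
	pthread_mutex_unlock(&lock);
	if (needwake)			/* wakeup happens outside the lock */
		pthread_cond_signal(&wq);

	pthread_join(tid, NULL);
	return 0;
}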
@@ -1715,12 +1816,14 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
1715 * is invoked indirectly from rcu_advance_cbs(), which would result in 1816 * is invoked indirectly from rcu_advance_cbs(), which would result in
1716 * endless recursion -- or would do so if it wasn't for the self-deadlock 1817 * endless recursion -- or would do so if it wasn't for the self-deadlock
1717 * that is encountered beforehand. 1818 * that is encountered beforehand.
1819 *
1820 * Returns true if the grace-period kthread needs to be awakened.
1718 */ 1821 */
1719static void 1822static bool rcu_start_gp(struct rcu_state *rsp)
1720rcu_start_gp(struct rcu_state *rsp)
1721{ 1823{
1722 struct rcu_data *rdp = this_cpu_ptr(rsp->rda); 1824 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
1723 struct rcu_node *rnp = rcu_get_root(rsp); 1825 struct rcu_node *rnp = rcu_get_root(rsp);
1826 bool ret = false;
1724 1827
1725 /* 1828 /*
1726 * If there is no grace period in progress right now, any 1829 * If there is no grace period in progress right now, any
@@ -1730,8 +1833,9 @@ rcu_start_gp(struct rcu_state *rsp)
1730 * resulting in pointless grace periods. So, advance callbacks 1833 * resulting in pointless grace periods. So, advance callbacks
1731 * then start the grace period! 1834 * then start the grace period!
1732 */ 1835 */
1733 rcu_advance_cbs(rsp, rnp, rdp); 1836 ret = rcu_advance_cbs(rsp, rnp, rdp) || ret;
1734 rcu_start_gp_advanced(rsp, rnp, rdp); 1837 ret = rcu_start_gp_advanced(rsp, rnp, rdp) || ret;
1838 return ret;
1735} 1839}
1736 1840
1737/* 1841/*
@@ -1820,6 +1924,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
1820{ 1924{
1821 unsigned long flags; 1925 unsigned long flags;
1822 unsigned long mask; 1926 unsigned long mask;
1927 bool needwake;
1823 struct rcu_node *rnp; 1928 struct rcu_node *rnp;
1824 1929
1825 rnp = rdp->mynode; 1930 rnp = rdp->mynode;
@@ -1848,9 +1953,11 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
1848 * This GP can't end until cpu checks in, so all of our 1953 * This GP can't end until cpu checks in, so all of our
1849 * callbacks can be processed during the next GP. 1954 * callbacks can be processed during the next GP.
1850 */ 1955 */
1851 rcu_accelerate_cbs(rsp, rnp, rdp); 1956 needwake = rcu_accelerate_cbs(rsp, rnp, rdp);
1852 1957
1853 rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */ 1958 rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */
1959 if (needwake)
1960 rcu_gp_kthread_wake(rsp);
1854 } 1961 }
1855} 1962}
1856 1963
@@ -1951,7 +2058,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
1951static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) 2058static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags)
1952{ 2059{
1953 int i; 2060 int i;
1954 struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); 2061 struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
1955 2062
1956 /* No-CBs CPUs are handled specially. */ 2063 /* No-CBs CPUs are handled specially. */
1957 if (rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags)) 2064 if (rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags))
@@ -2320,7 +2427,7 @@ static void force_quiescent_state(struct rcu_state *rsp)
2320 raw_spin_unlock_irqrestore(&rnp_old->lock, flags); 2427 raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
2321 return; /* Someone beat us to it. */ 2428 return; /* Someone beat us to it. */
2322 } 2429 }
2323 rsp->gp_flags |= RCU_GP_FLAG_FQS; 2430 ACCESS_ONCE(rsp->gp_flags) |= RCU_GP_FLAG_FQS;
2324 raw_spin_unlock_irqrestore(&rnp_old->lock, flags); 2431 raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
2325 wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */ 2432 wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */
2326} 2433}
@@ -2334,7 +2441,8 @@ static void
2334__rcu_process_callbacks(struct rcu_state *rsp) 2441__rcu_process_callbacks(struct rcu_state *rsp)
2335{ 2442{
2336 unsigned long flags; 2443 unsigned long flags;
2337 struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); 2444 bool needwake;
2445 struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
2338 2446
2339 WARN_ON_ONCE(rdp->beenonline == 0); 2447 WARN_ON_ONCE(rdp->beenonline == 0);
2340 2448
@@ -2345,8 +2453,10 @@ __rcu_process_callbacks(struct rcu_state *rsp)
2345 local_irq_save(flags); 2453 local_irq_save(flags);
2346 if (cpu_needs_another_gp(rsp, rdp)) { 2454 if (cpu_needs_another_gp(rsp, rdp)) {
2347 raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */ 2455 raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */
2348 rcu_start_gp(rsp); 2456 needwake = rcu_start_gp(rsp);
2349 raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); 2457 raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
2458 if (needwake)
2459 rcu_gp_kthread_wake(rsp);
2350 } else { 2460 } else {
2351 local_irq_restore(flags); 2461 local_irq_restore(flags);
2352 } 2462 }
@@ -2404,6 +2514,8 @@ static void invoke_rcu_core(void)
2404static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, 2514static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
2405 struct rcu_head *head, unsigned long flags) 2515 struct rcu_head *head, unsigned long flags)
2406{ 2516{
2517 bool needwake;
2518
2407 /* 2519 /*
2408 * If called from an extended quiescent state, invoke the RCU 2520 * If called from an extended quiescent state, invoke the RCU
2409 * core in order to force a re-evaluation of RCU's idleness. 2521 * core in order to force a re-evaluation of RCU's idleness.
@@ -2433,8 +2545,10 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
2433 2545
2434 raw_spin_lock(&rnp_root->lock); 2546 raw_spin_lock(&rnp_root->lock);
2435 smp_mb__after_unlock_lock(); 2547 smp_mb__after_unlock_lock();
2436 rcu_start_gp(rsp); 2548 needwake = rcu_start_gp(rsp);
2437 raw_spin_unlock(&rnp_root->lock); 2549 raw_spin_unlock(&rnp_root->lock);
2550 if (needwake)
2551 rcu_gp_kthread_wake(rsp);
2438 } else { 2552 } else {
2439 /* Give the grace period a kick. */ 2553 /* Give the grace period a kick. */
2440 rdp->blimit = LONG_MAX; 2554 rdp->blimit = LONG_MAX;
@@ -2537,6 +2651,20 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
2537EXPORT_SYMBOL_GPL(call_rcu_bh); 2651EXPORT_SYMBOL_GPL(call_rcu_bh);
2538 2652
2539/* 2653/*
2654 * Queue an RCU callback for lazy invocation after a grace period.
2655 * This will likely be later named something like "call_rcu_lazy()",
2656 * but this change will require some way of tagging the lazy RCU
2657 * callbacks in the list of pending callbacks. Until then, this
2658 * function may only be called from __kfree_rcu().
2659 */
2660void kfree_call_rcu(struct rcu_head *head,
2661 void (*func)(struct rcu_head *rcu))
2662{
2663 __call_rcu(head, func, rcu_state_p, -1, 1);
2664}
2665EXPORT_SYMBOL_GPL(kfree_call_rcu);
2666
2667/*
2540 * Because a context switch is a grace period for RCU-sched and RCU-bh, 2668 * Because a context switch is a grace period for RCU-sched and RCU-bh,
2541 * any blocking grace-period wait automatically implies a grace period 2669 * any blocking grace-period wait automatically implies a grace period
2542 * if there is only one CPU online at any point time during execution 2670 * if there is only one CPU online at any point time during execution
@@ -2659,7 +2787,7 @@ unsigned long get_state_synchronize_rcu(void)
2659 * time-consuming work between get_state_synchronize_rcu() 2787 * time-consuming work between get_state_synchronize_rcu()
2660 * and cond_synchronize_rcu(). 2788 * and cond_synchronize_rcu().
2661 */ 2789 */
2662 return smp_load_acquire(&rcu_state->gpnum); 2790 return smp_load_acquire(&rcu_state_p->gpnum);
2663} 2791}
2664EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); 2792EXPORT_SYMBOL_GPL(get_state_synchronize_rcu);
2665 2793
@@ -2685,7 +2813,7 @@ void cond_synchronize_rcu(unsigned long oldstate)
2685 * Ensure that this load happens before any RCU-destructive 2813 * Ensure that this load happens before any RCU-destructive
2686 * actions the caller might carry out after we return. 2814 * actions the caller might carry out after we return.
2687 */ 2815 */
2688 newstate = smp_load_acquire(&rcu_state->completed); 2816 newstate = smp_load_acquire(&rcu_state_p->completed);
2689 if (ULONG_CMP_GE(oldstate, newstate)) 2817 if (ULONG_CMP_GE(oldstate, newstate))
2690 synchronize_rcu(); 2818 synchronize_rcu();
2691} 2819}
@@ -2988,7 +3116,7 @@ static void rcu_barrier_callback(struct rcu_head *rhp)
2988static void rcu_barrier_func(void *type) 3116static void rcu_barrier_func(void *type)
2989{ 3117{
2990 struct rcu_state *rsp = type; 3118 struct rcu_state *rsp = type;
2991 struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); 3119 struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
2992 3120
2993 _rcu_barrier_trace(rsp, "IRQ", -1, rsp->n_barrier_done); 3121 _rcu_barrier_trace(rsp, "IRQ", -1, rsp->n_barrier_done);
2994 atomic_inc(&rsp->barrier_cpu_count); 3122 atomic_inc(&rsp->barrier_cpu_count);
@@ -3160,7 +3288,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
3160 * that this CPU cannot possibly have any RCU callbacks in flight yet. 3288 * that this CPU cannot possibly have any RCU callbacks in flight yet.
3161 */ 3289 */
3162static void 3290static void
3163rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) 3291rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
3164{ 3292{
3165 unsigned long flags; 3293 unsigned long flags;
3166 unsigned long mask; 3294 unsigned long mask;
@@ -3173,7 +3301,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
3173 /* Set up local state, ensuring consistent view of global state. */ 3301 /* Set up local state, ensuring consistent view of global state. */
3174 raw_spin_lock_irqsave(&rnp->lock, flags); 3302 raw_spin_lock_irqsave(&rnp->lock, flags);
3175 rdp->beenonline = 1; /* We have now been online. */ 3303 rdp->beenonline = 1; /* We have now been online. */
3176 rdp->preemptible = preemptible;
3177 rdp->qlen_last_fqs_check = 0; 3304 rdp->qlen_last_fqs_check = 0;
3178 rdp->n_force_qs_snap = rsp->n_force_qs; 3305 rdp->n_force_qs_snap = rsp->n_force_qs;
3179 rdp->blimit = blimit; 3306 rdp->blimit = blimit;
@@ -3217,8 +3344,7 @@ static void rcu_prepare_cpu(int cpu)
3217 struct rcu_state *rsp; 3344 struct rcu_state *rsp;
3218 3345
3219 for_each_rcu_flavor(rsp) 3346 for_each_rcu_flavor(rsp)
3220 rcu_init_percpu_data(cpu, rsp, 3347 rcu_init_percpu_data(cpu, rsp);
3221 strcmp(rsp->name, "rcu_preempt") == 0);
3222} 3348}
3223 3349
3224/* 3350/*
@@ -3228,7 +3354,7 @@ static int rcu_cpu_notify(struct notifier_block *self,
3228 unsigned long action, void *hcpu) 3354 unsigned long action, void *hcpu)
3229{ 3355{
3230 long cpu = (long)hcpu; 3356 long cpu = (long)hcpu;
3231 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); 3357 struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
3232 struct rcu_node *rnp = rdp->mynode; 3358 struct rcu_node *rnp = rdp->mynode;
3233 struct rcu_state *rsp; 3359 struct rcu_state *rsp;
3234 3360
@@ -3402,8 +3528,8 @@ static void __init rcu_init_one(struct rcu_state *rsp,
3402 rnp->qsmaskinit = 0; 3528 rnp->qsmaskinit = 0;
3403 rnp->grplo = j * cpustride; 3529 rnp->grplo = j * cpustride;
3404 rnp->grphi = (j + 1) * cpustride - 1; 3530 rnp->grphi = (j + 1) * cpustride - 1;
3405 if (rnp->grphi >= NR_CPUS) 3531 if (rnp->grphi >= nr_cpu_ids)
3406 rnp->grphi = NR_CPUS - 1; 3532 rnp->grphi = nr_cpu_ids - 1;
3407 if (i == 0) { 3533 if (i == 0) {
3408 rnp->grpnum = 0; 3534 rnp->grpnum = 0;
3409 rnp->grpmask = 0; 3535 rnp->grpmask = 0;
@@ -3422,7 +3548,6 @@ static void __init rcu_init_one(struct rcu_state *rsp,
3422 3548
3423 rsp->rda = rda; 3549 rsp->rda = rda;
3424 init_waitqueue_head(&rsp->gp_wq); 3550 init_waitqueue_head(&rsp->gp_wq);
3425 init_irq_work(&rsp->wakeup_work, rsp_wakeup);
3426 rnp = rsp->level[rcu_num_lvls - 1]; 3551 rnp = rsp->level[rcu_num_lvls - 1];
3427 for_each_possible_cpu(i) { 3552 for_each_possible_cpu(i) {
3428 while (i > rnp->grphi) 3553 while (i > rnp->grphi)
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 75dc3c39a02a..bf2c1e669691 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -252,7 +252,6 @@ struct rcu_data {
252 bool passed_quiesce; /* User-mode/idle loop etc. */ 252 bool passed_quiesce; /* User-mode/idle loop etc. */
253 bool qs_pending; /* Core waits for quiesc state. */ 253 bool qs_pending; /* Core waits for quiesc state. */
254 bool beenonline; /* CPU online at least once. */ 254 bool beenonline; /* CPU online at least once. */
255 bool preemptible; /* Preemptible RCU? */
256 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ 255 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
257 unsigned long grpmask; /* Mask to apply to leaf qsmask. */ 256 unsigned long grpmask; /* Mask to apply to leaf qsmask. */
258#ifdef CONFIG_RCU_CPU_STALL_INFO 257#ifdef CONFIG_RCU_CPU_STALL_INFO
@@ -406,7 +405,8 @@ struct rcu_state {
406 unsigned long completed; /* # of last completed gp. */ 405 unsigned long completed; /* # of last completed gp. */
407 struct task_struct *gp_kthread; /* Task for grace periods. */ 406 struct task_struct *gp_kthread; /* Task for grace periods. */
408 wait_queue_head_t gp_wq; /* Where GP task waits. */ 407 wait_queue_head_t gp_wq; /* Where GP task waits. */
409 int gp_flags; /* Commands for GP task. */ 408 short gp_flags; /* Commands for GP task. */
409 short gp_state; /* GP kthread sleep state. */
410 410
411 /* End of fields guarded by root rcu_node's lock. */ 411 /* End of fields guarded by root rcu_node's lock. */
412 412
@@ -462,13 +462,17 @@ struct rcu_state {
462 const char *name; /* Name of structure. */ 462 const char *name; /* Name of structure. */
463 char abbr; /* Abbreviated name. */ 463 char abbr; /* Abbreviated name. */
464 struct list_head flavors; /* List of RCU flavors. */ 464 struct list_head flavors; /* List of RCU flavors. */
465 struct irq_work wakeup_work; /* Postponed wakeups */
466}; 465};
467 466
468/* Values for rcu_state structure's gp_flags field. */ 467/* Values for rcu_state structure's gp_flags field. */
469#define RCU_GP_FLAG_INIT 0x1 /* Need grace-period initialization. */ 468#define RCU_GP_FLAG_INIT 0x1 /* Need grace-period initialization. */
470#define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */ 469#define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */
471 470
 471/* Values for rcu_state structure's gp_state field. */
472#define RCU_GP_WAIT_INIT 0 /* Initial state. */
473#define RCU_GP_WAIT_GPS 1 /* Wait for grace-period start. */
474#define RCU_GP_WAIT_FQS 2 /* Wait for force-quiescent-state time. */
475
472extern struct list_head rcu_struct_flavors; 476extern struct list_head rcu_struct_flavors;
473 477
474/* Sequence through rcu_state structures for each RCU flavor. */ 478/* Sequence through rcu_state structures for each RCU flavor. */
@@ -547,7 +551,6 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu);
547static void print_cpu_stall_info_end(void); 551static void print_cpu_stall_info_end(void);
548static void zero_cpu_stall_ticks(struct rcu_data *rdp); 552static void zero_cpu_stall_ticks(struct rcu_data *rdp);
549static void increment_cpu_stall_ticks(void); 553static void increment_cpu_stall_ticks(void);
550static int rcu_nocb_needs_gp(struct rcu_state *rsp);
551static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); 554static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq);
552static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp); 555static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp);
553static void rcu_init_one_nocb(struct rcu_node *rnp); 556static void rcu_init_one_nocb(struct rcu_node *rnp);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 56db2f853e43..cbc2c45265e2 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -116,7 +116,7 @@ static void __init rcu_bootup_announce_oddness(void)
116#ifdef CONFIG_TREE_PREEMPT_RCU 116#ifdef CONFIG_TREE_PREEMPT_RCU
117 117
118RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu); 118RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
119static struct rcu_state *rcu_state = &rcu_preempt_state; 119static struct rcu_state *rcu_state_p = &rcu_preempt_state;
120 120
121static int rcu_preempted_readers_exp(struct rcu_node *rnp); 121static int rcu_preempted_readers_exp(struct rcu_node *rnp);
122 122
@@ -149,15 +149,6 @@ long rcu_batches_completed(void)
149EXPORT_SYMBOL_GPL(rcu_batches_completed); 149EXPORT_SYMBOL_GPL(rcu_batches_completed);
150 150
151/* 151/*
152 * Force a quiescent state for preemptible RCU.
153 */
154void rcu_force_quiescent_state(void)
155{
156 force_quiescent_state(&rcu_preempt_state);
157}
158EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
159
160/*
161 * Record a preemptible-RCU quiescent state for the specified CPU. Note 152 * Record a preemptible-RCU quiescent state for the specified CPU. Note
162 * that this just means that the task currently running on the CPU is 153 * that this just means that the task currently running on the CPU is
163 * not in a quiescent state. There might be any number of tasks blocked 154 * not in a quiescent state. There might be any number of tasks blocked
@@ -688,20 +679,6 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
688} 679}
689EXPORT_SYMBOL_GPL(call_rcu); 680EXPORT_SYMBOL_GPL(call_rcu);
690 681
691/*
692 * Queue an RCU callback for lazy invocation after a grace period.
693 * This will likely be later named something like "call_rcu_lazy()",
694 * but this change will require some way of tagging the lazy RCU
695 * callbacks in the list of pending callbacks. Until then, this
696 * function may only be called from __kfree_rcu().
697 */
698void kfree_call_rcu(struct rcu_head *head,
699 void (*func)(struct rcu_head *rcu))
700{
701 __call_rcu(head, func, &rcu_preempt_state, -1, 1);
702}
703EXPORT_SYMBOL_GPL(kfree_call_rcu);
704
705/** 682/**
706 * synchronize_rcu - wait until a grace period has elapsed. 683 * synchronize_rcu - wait until a grace period has elapsed.
707 * 684 *
@@ -970,7 +947,7 @@ void exit_rcu(void)
970 947
971#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 948#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
972 949
973static struct rcu_state *rcu_state = &rcu_sched_state; 950static struct rcu_state *rcu_state_p = &rcu_sched_state;
974 951
975/* 952/*
976 * Tell them what RCU they are running. 953 * Tell them what RCU they are running.
@@ -991,16 +968,6 @@ long rcu_batches_completed(void)
991EXPORT_SYMBOL_GPL(rcu_batches_completed); 968EXPORT_SYMBOL_GPL(rcu_batches_completed);
992 969
993/* 970/*
994 * Force a quiescent state for RCU, which, because there is no preemptible
995 * RCU, becomes the same as rcu-sched.
996 */
997void rcu_force_quiescent_state(void)
998{
999 rcu_sched_force_quiescent_state();
1000}
1001EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
1002
1003/*
1004 * Because preemptible RCU does not exist, we never have to check for 971 * Because preemptible RCU does not exist, we never have to check for
1005 * CPUs being in quiescent states. 972 * CPUs being in quiescent states.
1006 */ 973 */
@@ -1080,22 +1047,6 @@ static void rcu_preempt_check_callbacks(int cpu)
1080} 1047}
1081 1048
1082/* 1049/*
1083 * Queue an RCU callback for lazy invocation after a grace period.
1084 * This will likely be later named something like "call_rcu_lazy()",
1085 * but this change will require some way of tagging the lazy RCU
1086 * callbacks in the list of pending callbacks. Until then, this
1087 * function may only be called from __kfree_rcu().
1088 *
1089 * Because there is no preemptible RCU, we use RCU-sched instead.
1090 */
1091void kfree_call_rcu(struct rcu_head *head,
1092 void (*func)(struct rcu_head *rcu))
1093{
1094 __call_rcu(head, func, &rcu_sched_state, -1, 1);
1095}
1096EXPORT_SYMBOL_GPL(kfree_call_rcu);
1097
1098/*
1099 * Wait for an rcu-preempt grace period, but make it happen quickly. 1050 * Wait for an rcu-preempt grace period, but make it happen quickly.
1100 * But because preemptible RCU does not exist, map to rcu-sched. 1051 * But because preemptible RCU does not exist, map to rcu-sched.
1101 */ 1052 */
@@ -1517,11 +1468,11 @@ static int __init rcu_spawn_kthreads(void)
1517 for_each_possible_cpu(cpu) 1468 for_each_possible_cpu(cpu)
1518 per_cpu(rcu_cpu_has_work, cpu) = 0; 1469 per_cpu(rcu_cpu_has_work, cpu) = 0;
1519 BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec)); 1470 BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
1520 rnp = rcu_get_root(rcu_state); 1471 rnp = rcu_get_root(rcu_state_p);
1521 (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); 1472 (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
1522 if (NUM_RCU_NODES > 1) { 1473 if (NUM_RCU_NODES > 1) {
1523 rcu_for_each_leaf_node(rcu_state, rnp) 1474 rcu_for_each_leaf_node(rcu_state_p, rnp)
1524 (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); 1475 (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
1525 } 1476 }
1526 return 0; 1477 return 0;
1527} 1478}
@@ -1529,12 +1480,12 @@ early_initcall(rcu_spawn_kthreads);
1529 1480
1530static void rcu_prepare_kthreads(int cpu) 1481static void rcu_prepare_kthreads(int cpu)
1531{ 1482{
1532 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); 1483 struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
1533 struct rcu_node *rnp = rdp->mynode; 1484 struct rcu_node *rnp = rdp->mynode;
1534 1485
1535 /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */ 1486 /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
1536 if (rcu_scheduler_fully_active) 1487 if (rcu_scheduler_fully_active)
1537 (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); 1488 (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
1538} 1489}
1539 1490
1540#else /* #ifdef CONFIG_RCU_BOOST */ 1491#else /* #ifdef CONFIG_RCU_BOOST */
@@ -1744,6 +1695,7 @@ int rcu_needs_cpu(int cpu, unsigned long *dj)
1744static void rcu_prepare_for_idle(int cpu) 1695static void rcu_prepare_for_idle(int cpu)
1745{ 1696{
1746#ifndef CONFIG_RCU_NOCB_CPU_ALL 1697#ifndef CONFIG_RCU_NOCB_CPU_ALL
1698 bool needwake;
1747 struct rcu_data *rdp; 1699 struct rcu_data *rdp;
1748 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); 1700 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
1749 struct rcu_node *rnp; 1701 struct rcu_node *rnp;
@@ -1792,8 +1744,10 @@ static void rcu_prepare_for_idle(int cpu)
1792 rnp = rdp->mynode; 1744 rnp = rdp->mynode;
1793 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 1745 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
1794 smp_mb__after_unlock_lock(); 1746 smp_mb__after_unlock_lock();
1795 rcu_accelerate_cbs(rsp, rnp, rdp); 1747 needwake = rcu_accelerate_cbs(rsp, rnp, rdp);
1796 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1748 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1749 if (needwake)
1750 rcu_gp_kthread_wake(rsp);
1797 } 1751 }
1798#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ 1752#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
1799} 1753}
@@ -1855,7 +1809,7 @@ static void rcu_oom_notify_cpu(void *unused)
1855 struct rcu_data *rdp; 1809 struct rcu_data *rdp;
1856 1810
1857 for_each_rcu_flavor(rsp) { 1811 for_each_rcu_flavor(rsp) {
1858 rdp = __this_cpu_ptr(rsp->rda); 1812 rdp = raw_cpu_ptr(rsp->rda);
1859 if (rdp->qlen_lazy != 0) { 1813 if (rdp->qlen_lazy != 0) {
1860 atomic_inc(&oom_callback_count); 1814 atomic_inc(&oom_callback_count);
1861 rsp->call(&rdp->oom_head, rcu_oom_callback); 1815 rsp->call(&rdp->oom_head, rcu_oom_callback);
@@ -1997,7 +1951,7 @@ static void increment_cpu_stall_ticks(void)
1997 struct rcu_state *rsp; 1951 struct rcu_state *rsp;
1998 1952
1999 for_each_rcu_flavor(rsp) 1953 for_each_rcu_flavor(rsp)
2000 __this_cpu_ptr(rsp->rda)->ticks_this_gp++; 1954 raw_cpu_inc(rsp->rda->ticks_this_gp);
2001} 1955}
2002 1956
2003#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ 1957#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
@@ -2068,19 +2022,6 @@ static int __init parse_rcu_nocb_poll(char *arg)
2068early_param("rcu_nocb_poll", parse_rcu_nocb_poll); 2022early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
2069 2023
2070/* 2024/*
2071 * Do any no-CBs CPUs need another grace period?
2072 *
2073 * Interrupts must be disabled. If the caller does not hold the root
2074 * rnp_node structure's ->lock, the results are advisory only.
2075 */
2076static int rcu_nocb_needs_gp(struct rcu_state *rsp)
2077{
2078 struct rcu_node *rnp = rcu_get_root(rsp);
2079
2080 return rnp->need_future_gp[(ACCESS_ONCE(rnp->completed) + 1) & 0x1];
2081}
2082
2083/*
2084 * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended 2025 * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended
2085 * grace period. 2026 * grace period.
2086 */ 2027 */
@@ -2109,7 +2050,7 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)
2109} 2050}
2110 2051
2111#ifndef CONFIG_RCU_NOCB_CPU_ALL 2052#ifndef CONFIG_RCU_NOCB_CPU_ALL
2112/* Is the specified CPU a no-CPUs CPU? */ 2053/* Is the specified CPU a no-CBs CPU? */
2113bool rcu_is_nocb_cpu(int cpu) 2054bool rcu_is_nocb_cpu(int cpu)
2114{ 2055{
2115 if (have_rcu_nocb_mask) 2056 if (have_rcu_nocb_mask)
@@ -2243,12 +2184,15 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
2243 unsigned long c; 2184 unsigned long c;
2244 bool d; 2185 bool d;
2245 unsigned long flags; 2186 unsigned long flags;
2187 bool needwake;
2246 struct rcu_node *rnp = rdp->mynode; 2188 struct rcu_node *rnp = rdp->mynode;
2247 2189
2248 raw_spin_lock_irqsave(&rnp->lock, flags); 2190 raw_spin_lock_irqsave(&rnp->lock, flags);
2249 smp_mb__after_unlock_lock(); 2191 smp_mb__after_unlock_lock();
2250 c = rcu_start_future_gp(rnp, rdp); 2192 needwake = rcu_start_future_gp(rnp, rdp, &c);
2251 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2193 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2194 if (needwake)
2195 rcu_gp_kthread_wake(rdp->rsp);
2252 2196
2253 /* 2197 /*
2254 * Wait for the grace period. Do so interruptibly to avoid messing 2198 * Wait for the grace period. Do so interruptibly to avoid messing
@@ -2402,11 +2346,6 @@ static bool init_nocb_callback_list(struct rcu_data *rdp)
2402 2346
2403#else /* #ifdef CONFIG_RCU_NOCB_CPU */ 2347#else /* #ifdef CONFIG_RCU_NOCB_CPU */
2404 2348
2405static int rcu_nocb_needs_gp(struct rcu_state *rsp)
2406{
2407 return 0;
2408}
2409
2410static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) 2349static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
2411{ 2350{
2412} 2351}
@@ -2657,20 +2596,6 @@ static bool is_sysidle_rcu_state(struct rcu_state *rsp)
2657} 2596}
2658 2597
2659/* 2598/*
2660 * Bind the grace-period kthread for the sysidle flavor of RCU to the
2661 * timekeeping CPU.
2662 */
2663static void rcu_bind_gp_kthread(void)
2664{
2665 int cpu = ACCESS_ONCE(tick_do_timer_cpu);
2666
2667 if (cpu < 0 || cpu >= nr_cpu_ids)
2668 return;
2669 if (raw_smp_processor_id() != cpu)
2670 set_cpus_allowed_ptr(current, cpumask_of(cpu));
2671}
2672
2673/*
2674 * Return a delay in jiffies based on the number of CPUs, rcu_node 2599 * Return a delay in jiffies based on the number of CPUs, rcu_node
2675 * leaf fanout, and jiffies tick rate. The idea is to allow larger 2600 * leaf fanout, and jiffies tick rate. The idea is to allow larger
2676 * systems more time to transition to full-idle state in order to 2601 * systems more time to transition to full-idle state in order to
@@ -2734,7 +2659,8 @@ static void rcu_sysidle(unsigned long j)
2734static void rcu_sysidle_cancel(void) 2659static void rcu_sysidle_cancel(void)
2735{ 2660{
2736 smp_mb(); 2661 smp_mb();
2737 ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT; 2662 if (full_sysidle_state > RCU_SYSIDLE_SHORT)
2663 ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT;
2738} 2664}
2739 2665
2740/* 2666/*
@@ -2880,10 +2806,6 @@ static bool is_sysidle_rcu_state(struct rcu_state *rsp)
2880 return false; 2806 return false;
2881} 2807}
2882 2808
2883static void rcu_bind_gp_kthread(void)
2884{
2885}
2886
2887static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, 2809static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
2888 unsigned long maxj) 2810 unsigned long maxj)
2889{ 2811{
@@ -2914,3 +2836,19 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp)
2914#endif /* #ifdef CONFIG_NO_HZ_FULL */ 2836#endif /* #ifdef CONFIG_NO_HZ_FULL */
2915 return 0; 2837 return 0;
2916} 2838}
2839
2840/*
2841 * Bind the grace-period kthread for the sysidle flavor of RCU to the
2842 * timekeeping CPU.
2843 */
2844static void rcu_bind_gp_kthread(void)
2845{
2846#ifdef CONFIG_NO_HZ_FULL
2847 int cpu = ACCESS_ONCE(tick_do_timer_cpu);
2848
2849 if (cpu < 0 || cpu >= nr_cpu_ids)
2850 return;
2851 if (raw_smp_processor_id() != cpu)
2852 set_cpus_allowed_ptr(current, cpumask_of(cpu));
2853#endif /* #ifdef CONFIG_NO_HZ_FULL */
2854}
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 4c0a9b0af469..a2aeb4df0f60 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -320,6 +320,18 @@ int rcu_jiffies_till_stall_check(void)
320 return till_stall_check * HZ + RCU_STALL_DELAY_DELTA; 320 return till_stall_check * HZ + RCU_STALL_DELAY_DELTA;
321} 321}
322 322
323void rcu_sysrq_start(void)
324{
325 if (!rcu_cpu_stall_suppress)
326 rcu_cpu_stall_suppress = 2;
327}
328
329void rcu_sysrq_end(void)
330{
331 if (rcu_cpu_stall_suppress == 2)
332 rcu_cpu_stall_suppress = 0;
333}
334
323static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) 335static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
324{ 336{
325 rcu_cpu_stall_suppress = 1; 337 rcu_cpu_stall_suppress = 1;
@@ -338,3 +350,21 @@ static int __init check_cpu_stall_init(void)
338early_initcall(check_cpu_stall_init); 350early_initcall(check_cpu_stall_init);
339 351
340#endif /* #ifdef CONFIG_RCU_STALL_COMMON */ 352#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
353
354/*
355 * Hooks for cond_resched() and friends to avoid RCU CPU stall warnings.
356 */
357
358DEFINE_PER_CPU(int, rcu_cond_resched_count);
359
360/*
361 * Report a set of RCU quiescent states, for use by cond_resched()
362 * and friends. Out of line due to being called infrequently.
363 */
364void rcu_resched(void)
365{
366 preempt_disable();
367 __this_cpu_write(rcu_cond_resched_count, 0);
368 rcu_note_context_switch(smp_processor_id());
369 preempt_enable();
370}
diff --git a/kernel/reboot.c b/kernel/reboot.c
index 662c83fc16b7..a3a9e240fcdb 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -388,15 +388,22 @@ static int __init reboot_setup(char *str)
388 break; 388 break;
389 389
390 case 's': 390 case 's':
391 if (isdigit(*(str+1))) 391 {
392 reboot_cpu = simple_strtoul(str+1, NULL, 0); 392 int rc;
393 else if (str[1] == 'm' && str[2] == 'p' && 393
394 isdigit(*(str+3))) 394 if (isdigit(*(str+1))) {
395 reboot_cpu = simple_strtoul(str+3, NULL, 0); 395 rc = kstrtoint(str+1, 0, &reboot_cpu);
396 else 396 if (rc)
397 return rc;
398 } else if (str[1] == 'm' && str[2] == 'p' &&
399 isdigit(*(str+3))) {
400 rc = kstrtoint(str+3, 0, &reboot_cpu);
401 if (rc)
402 return rc;
403 } else
397 reboot_mode = REBOOT_SOFT; 404 reboot_mode = REBOOT_SOFT;
398 break; 405 break;
399 406 }
400 case 'g': 407 case 'g':
401 reboot_mode = REBOOT_GPIO; 408 reboot_mode = REBOOT_GPIO;
402 break; 409 break;
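The reboot_setup() change above swaps simple_strtoul(), which cannot report a malformed number, for kstrtoint(), which returns -EINVAL or -ERANGE and lets the parser bail out early. A minimal sketch of that call pattern (the helper name is illustrative, not from reboot.c):

#include <linux/kernel.h>

static int parse_cpu_arg(const char *s, int *cpu_out)
{
	int rc = kstrtoint(s, 0, cpu_out);	/* base 0: accepts 0x..., 0..., decimal */

	if (rc)
		return rc;	/* error is propagated, unlike simple_strtoul() */
	return 0;
}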
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index 51dbac6a3633..e791130f85a7 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -186,8 +186,11 @@ int res_counter_memparse_write_strategy(const char *buf,
186 186
187 /* return RES_COUNTER_MAX(unlimited) if "-1" is specified */ 187 /* return RES_COUNTER_MAX(unlimited) if "-1" is specified */
188 if (*buf == '-') { 188 if (*buf == '-') {
189 res = simple_strtoull(buf + 1, &end, 10); 189 int rc = kstrtoull(buf + 1, 10, &res);
190 if (res != 1 || *end != '\0') 190
191 if (rc)
192 return rc;
193 if (res != 1)
191 return -EINVAL; 194 return -EINVAL;
192 *resp = RES_COUNTER_MAX; 195 *resp = RES_COUNTER_MAX;
193 return 0; 196 return 0;
diff --git a/kernel/resource.c b/kernel/resource.c
index 8957d686e29b..3c2237ac32db 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -1288,13 +1288,10 @@ int iomem_map_sanity_check(resource_size_t addr, unsigned long size)
1288 if (p->flags & IORESOURCE_BUSY) 1288 if (p->flags & IORESOURCE_BUSY)
1289 continue; 1289 continue;
1290 1290
1291 printk(KERN_WARNING "resource map sanity check conflict: " 1291 printk(KERN_WARNING "resource sanity check: requesting [mem %#010llx-%#010llx], which spans more than %s %pR\n",
1292 "0x%llx 0x%llx 0x%llx 0x%llx %s\n",
1293 (unsigned long long)addr, 1292 (unsigned long long)addr,
1294 (unsigned long long)(addr + size - 1), 1293 (unsigned long long)(addr + size - 1),
1295 (unsigned long long)p->start, 1294 p->name, p);
1296 (unsigned long long)p->end,
1297 p->name);
1298 err = -1; 1295 err = -1;
1299 break; 1296 break;
1300 } 1297 }
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8a70ec091760..c6b98793d647 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -522,6 +522,39 @@ static inline void init_hrtick(void)
522#endif /* CONFIG_SCHED_HRTICK */ 522#endif /* CONFIG_SCHED_HRTICK */
523 523
524/* 524/*
525 * cmpxchg based fetch_or, macro so it works for different integer types
526 */
527#define fetch_or(ptr, val) \
528({ typeof(*(ptr)) __old, __val = *(ptr); \
529 for (;;) { \
530 __old = cmpxchg((ptr), __val, __val | (val)); \
531 if (__old == __val) \
532 break; \
533 __val = __old; \
534 } \
535 __old; \
536})
537
538#ifdef TIF_POLLING_NRFLAG
539/*
540 * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
541 * this avoids any races wrt polling state changes and thereby avoids
542 * spurious IPIs.
543 */
544static bool set_nr_and_not_polling(struct task_struct *p)
545{
546 struct thread_info *ti = task_thread_info(p);
547 return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
548}
549#else
550static bool set_nr_and_not_polling(struct task_struct *p)
551{
552 set_tsk_need_resched(p);
553 return true;
554}
555#endif
556
557/*
525 * resched_task - mark a task 'to be rescheduled now'. 558 * resched_task - mark a task 'to be rescheduled now'.
526 * 559 *
527 * On UP this means the setting of the need_resched flag, on SMP it 560 * On UP this means the setting of the need_resched flag, on SMP it
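fetch_or() in the hunk above is an atomic "OR in some bits and hand back the previous value"; set_nr_and_not_polling() relies on that previous value to learn whether TIF_POLLING_NRFLAG was already set, in which case the polling idle loop will notice TIF_NEED_RESCHED on its own and no reschedule IPI is needed. A runnable userspace sketch of the same idea, using GCC __atomic builtins in place of the kernel's cmpxchg() (the flag values below are made up for illustration):

#include <stdbool.h>
#include <stdio.h>

#define NEED_RESCHED	0x1UL
#define POLLING		0x2UL

static unsigned long fetch_or(unsigned long *ptr, unsigned long val)
{
	unsigned long old = *ptr, new;

	for (;;) {
		new = old | val;
		/* compare-and-swap loop: mirrors the cmpxchg() loop above */
		if (__atomic_compare_exchange_n(ptr, &old, new, false,
						__ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST))
			return old;	/* 'old' holds the pre-OR value */
		/* on failure, 'old' was reloaded with the current value */
	}
}

int main(void)
{
	unsigned long flags = POLLING;
	bool need_ipi;

	/* Set NEED_RESCHED and test POLLING in one atomic step. */
	need_ipi = !(fetch_or(&flags, NEED_RESCHED) & POLLING);
	printf("flags=%#lx, send IPI? %s\n", flags, need_ipi ? "yes" : "no");
	return 0;
}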
@@ -537,17 +570,15 @@ void resched_task(struct task_struct *p)
537 if (test_tsk_need_resched(p)) 570 if (test_tsk_need_resched(p))
538 return; 571 return;
539 572
540 set_tsk_need_resched(p);
541
542 cpu = task_cpu(p); 573 cpu = task_cpu(p);
574
543 if (cpu == smp_processor_id()) { 575 if (cpu == smp_processor_id()) {
576 set_tsk_need_resched(p);
544 set_preempt_need_resched(); 577 set_preempt_need_resched();
545 return; 578 return;
546 } 579 }
547 580
548 /* NEED_RESCHED must be visible before we test polling */ 581 if (set_nr_and_not_polling(p))
549 smp_mb();
550 if (!tsk_is_polling(p))
551 smp_send_reschedule(cpu); 582 smp_send_reschedule(cpu);
552} 583}
553 584
@@ -1336,7 +1367,7 @@ out:
1336 * leave kernel. 1367 * leave kernel.
1337 */ 1368 */
1338 if (p->mm && printk_ratelimit()) { 1369 if (p->mm && printk_ratelimit()) {
1339 printk_sched("process %d (%s) no longer affine to cpu%d\n", 1370 printk_deferred("process %d (%s) no longer affine to cpu%d\n",
1340 task_pid_nr(p), p->comm, cpu); 1371 task_pid_nr(p), p->comm, cpu);
1341 } 1372 }
1342 } 1373 }
@@ -2208,7 +2239,7 @@ static inline void post_schedule(struct rq *rq)
2208 * schedule_tail - first thing a freshly forked thread must call. 2239 * schedule_tail - first thing a freshly forked thread must call.
2209 * @prev: the thread we just switched away from. 2240 * @prev: the thread we just switched away from.
2210 */ 2241 */
2211asmlinkage void schedule_tail(struct task_struct *prev) 2242asmlinkage __visible void schedule_tail(struct task_struct *prev)
2212 __releases(rq->lock) 2243 __releases(rq->lock)
2213{ 2244{
2214 struct rq *rq = this_rq(); 2245 struct rq *rq = this_rq();
@@ -2608,8 +2639,14 @@ pick_next_task(struct rq *rq, struct task_struct *prev)
2608 if (likely(prev->sched_class == class && 2639 if (likely(prev->sched_class == class &&
2609 rq->nr_running == rq->cfs.h_nr_running)) { 2640 rq->nr_running == rq->cfs.h_nr_running)) {
2610 p = fair_sched_class.pick_next_task(rq, prev); 2641 p = fair_sched_class.pick_next_task(rq, prev);
2611 if (likely(p && p != RETRY_TASK)) 2642 if (unlikely(p == RETRY_TASK))
2612 return p; 2643 goto again;
2644
2645 /* assumes fair_sched_class->next == idle_sched_class */
2646 if (unlikely(!p))
2647 p = idle_sched_class.pick_next_task(rq, prev);
2648
2649 return p;
2613 } 2650 }
2614 2651
2615again: 2652again:
@@ -2757,7 +2794,7 @@ static inline void sched_submit_work(struct task_struct *tsk)
2757 blk_schedule_flush_plug(tsk); 2794 blk_schedule_flush_plug(tsk);
2758} 2795}
2759 2796
2760asmlinkage void __sched schedule(void) 2797asmlinkage __visible void __sched schedule(void)
2761{ 2798{
2762 struct task_struct *tsk = current; 2799 struct task_struct *tsk = current;
2763 2800
@@ -2767,7 +2804,7 @@ asmlinkage void __sched schedule(void)
2767EXPORT_SYMBOL(schedule); 2804EXPORT_SYMBOL(schedule);
2768 2805
2769#ifdef CONFIG_CONTEXT_TRACKING 2806#ifdef CONFIG_CONTEXT_TRACKING
2770asmlinkage void __sched schedule_user(void) 2807asmlinkage __visible void __sched schedule_user(void)
2771{ 2808{
2772 /* 2809 /*
2773 * If we come here after a random call to set_need_resched(), 2810 * If we come here after a random call to set_need_resched(),
@@ -2799,7 +2836,7 @@ void __sched schedule_preempt_disabled(void)
2799 * off of preempt_enable. Kernel preemptions off return from interrupt 2836 * off of preempt_enable. Kernel preemptions off return from interrupt
2800 * occur there and call schedule directly. 2837 * occur there and call schedule directly.
2801 */ 2838 */
2802asmlinkage void __sched notrace preempt_schedule(void) 2839asmlinkage __visible void __sched notrace preempt_schedule(void)
2803{ 2840{
2804 /* 2841 /*
2805 * If there is a non-zero preempt_count or interrupts are disabled, 2842 * If there is a non-zero preempt_count or interrupts are disabled,
@@ -2829,7 +2866,7 @@ EXPORT_SYMBOL(preempt_schedule);
2829 * Note, that this is called and return with irqs disabled. This will 2866 * Note, that this is called and return with irqs disabled. This will
2830 * protect us against recursive calling from irq. 2867 * protect us against recursive calling from irq.
2831 */ 2868 */
2832asmlinkage void __sched preempt_schedule_irq(void) 2869asmlinkage __visible void __sched preempt_schedule_irq(void)
2833{ 2870{
2834 enum ctx_state prev_state; 2871 enum ctx_state prev_state;
2835 2872
@@ -3012,7 +3049,7 @@ EXPORT_SYMBOL(set_user_nice);
3012int can_nice(const struct task_struct *p, const int nice) 3049int can_nice(const struct task_struct *p, const int nice)
3013{ 3050{
3014 /* convert nice value [19,-20] to rlimit style value [1,40] */ 3051 /* convert nice value [19,-20] to rlimit style value [1,40] */
3015 int nice_rlim = 20 - nice; 3052 int nice_rlim = nice_to_rlimit(nice);
3016 3053
3017 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || 3054 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
3018 capable(CAP_SYS_NICE)); 3055 capable(CAP_SYS_NICE));
@@ -3036,17 +3073,10 @@ SYSCALL_DEFINE1(nice, int, increment)
3036 * We don't have to worry. Conceptually one call occurs first 3073 * We don't have to worry. Conceptually one call occurs first
3037 * and we have a single winner. 3074 * and we have a single winner.
3038 */ 3075 */
3039 if (increment < -40) 3076 increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
3040 increment = -40;
3041 if (increment > 40)
3042 increment = 40;
3043
3044 nice = task_nice(current) + increment; 3077 nice = task_nice(current) + increment;
3045 if (nice < MIN_NICE)
3046 nice = MIN_NICE;
3047 if (nice > MAX_NICE)
3048 nice = MAX_NICE;
3049 3078
3079 nice = clamp_val(nice, MIN_NICE, MAX_NICE);
3050 if (increment < 0 && !can_nice(current, nice)) 3080 if (increment < 0 && !can_nice(current, nice))
3051 return -EPERM; 3081 return -EPERM;
3052 3082
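can_nice() above now uses nice_to_rlimit() instead of the open-coded "20 - nice". Assuming the helper is the usual MAX_NICE - nice + 1 mapping (an assumption about <linux/sched/prio.h>, which is not part of this hunk), the endpoints line up with the comment's [19,-20] to [1,40] conversion:

/* Assumed definition, consistent with the old "20 - nice" arithmetic. */
#define MAX_NICE	19

static inline long nice_to_rlimit(long nice)
{
	return MAX_NICE - nice + 1;	/* nice 19 -> 1, nice -20 -> 40 */
}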
@@ -3140,6 +3170,7 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
3140 dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); 3170 dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
3141 dl_se->dl_throttled = 0; 3171 dl_se->dl_throttled = 0;
3142 dl_se->dl_new = 1; 3172 dl_se->dl_new = 1;
3173 dl_se->dl_yielded = 0;
3143} 3174}
3144 3175
3145static void __setscheduler_params(struct task_struct *p, 3176static void __setscheduler_params(struct task_struct *p,
@@ -3204,17 +3235,40 @@ __getparam_dl(struct task_struct *p, struct sched_attr *attr)
3204 * We ask for the deadline not being zero, and greater or equal 3235 * We ask for the deadline not being zero, and greater or equal
3205 * than the runtime, as well as the period of being zero or 3236 * than the runtime, as well as the period of being zero or
3206 * greater than deadline. Furthermore, we have to be sure that 3237 * greater than deadline. Furthermore, we have to be sure that
3207 * user parameters are above the internal resolution (1us); we 3238 * user parameters are above the internal resolution of 1us (we
3208 * check sched_runtime only since it is always the smaller one. 3239 * check sched_runtime only since it is always the smaller one) and
3240 * below 2^63 ns (we have to check both sched_deadline and
3241 * sched_period, as the latter can be zero).
3209 */ 3242 */
3210static bool 3243static bool
3211__checkparam_dl(const struct sched_attr *attr) 3244__checkparam_dl(const struct sched_attr *attr)
3212{ 3245{
3213 return attr && attr->sched_deadline != 0 && 3246 /* deadline != 0 */
3214 (attr->sched_period == 0 || 3247 if (attr->sched_deadline == 0)
3215 (s64)(attr->sched_period - attr->sched_deadline) >= 0) && 3248 return false;
3216 (s64)(attr->sched_deadline - attr->sched_runtime ) >= 0 && 3249
3217 attr->sched_runtime >= (2 << (DL_SCALE - 1)); 3250 /*
3251 * Since we truncate DL_SCALE bits, make sure we're at least
3252 * that big.
3253 */
3254 if (attr->sched_runtime < (1ULL << DL_SCALE))
3255 return false;
3256
3257 /*
3258 * Since we use the MSB for wrap-around and sign issues, make
3259 * sure it's not set (mind that period can be equal to zero).
3260 */
3261 if (attr->sched_deadline & (1ULL << 63) ||
3262 attr->sched_period & (1ULL << 63))
3263 return false;
3264
3265 /* runtime <= deadline <= period (if period != 0) */
3266 if ((attr->sched_period != 0 &&
3267 attr->sched_period < attr->sched_deadline) ||
3268 attr->sched_deadline < attr->sched_runtime)
3269 return false;
3270
3271 return true;
3218} 3272}
3219 3273
3220/* 3274/*
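The rewritten __checkparam_dl() above spells out four separate rules: a non-zero deadline, a runtime no smaller than the 1ULL << DL_SCALE internal resolution, a clear most-significant bit in both deadline and period, and runtime <= deadline <= period (with period == 0 allowed). A standalone restatement with two sample parameter sets (DL_SCALE = 10 is assumed from the scheduler headers; the nanosecond values are illustrative):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define DL_SCALE 10	/* assumed kernel value: runtime must be >= 1024ns */

static bool checkparam_dl(uint64_t runtime, uint64_t deadline, uint64_t period)
{
	if (deadline == 0)
		return false;
	if (runtime < (1ULL << DL_SCALE))		/* below internal resolution */
		return false;
	if ((deadline & (1ULL << 63)) || (period & (1ULL << 63)))
		return false;				/* MSB reserved for wrap/sign */
	if ((period != 0 && period < deadline) || deadline < runtime)
		return false;				/* need runtime <= deadline <= period */
	return true;
}

int main(void)
{
	/* 10ms runtime, 30ms deadline, 100ms period: valid. */
	printf("%d\n", checkparam_dl(10000000ULL, 30000000ULL, 100000000ULL));
	/* deadline shorter than runtime: rejected. */
	printf("%d\n", checkparam_dl(30000000ULL, 10000000ULL, 100000000ULL));
	return 0;
}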
@@ -3612,13 +3666,11 @@ static int sched_copy_attr(struct sched_attr __user *uattr,
3612 */ 3666 */
3613 attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE); 3667 attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
3614 3668
3615out: 3669 return 0;
3616 return ret;
3617 3670
3618err_size: 3671err_size:
3619 put_user(sizeof(*attr), &uattr->size); 3672 put_user(sizeof(*attr), &uattr->size);
3620 ret = -E2BIG; 3673 return -E2BIG;
3621 goto out;
3622} 3674}
3623 3675
3624/** 3676/**
@@ -3655,6 +3707,7 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
3655 * sys_sched_setattr - same as above, but with extended sched_attr 3707 * sys_sched_setattr - same as above, but with extended sched_attr
3656 * @pid: the pid in question. 3708 * @pid: the pid in question.
3657 * @uattr: structure containing the extended parameters. 3709 * @uattr: structure containing the extended parameters.
3710 * @flags: for future extension.
3658 */ 3711 */
3659SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, 3712SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
3660 unsigned int, flags) 3713 unsigned int, flags)
@@ -3666,8 +3719,12 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
3666 if (!uattr || pid < 0 || flags) 3719 if (!uattr || pid < 0 || flags)
3667 return -EINVAL; 3720 return -EINVAL;
3668 3721
3669 if (sched_copy_attr(uattr, &attr)) 3722 retval = sched_copy_attr(uattr, &attr);
3670 return -EFAULT; 3723 if (retval)
3724 return retval;
3725
3726 if ((int)attr.sched_policy < 0)
3727 return -EINVAL;
3671 3728
3672 rcu_read_lock(); 3729 rcu_read_lock();
3673 retval = -ESRCH; 3730 retval = -ESRCH;
@@ -3717,7 +3774,7 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
3717 */ 3774 */
3718SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) 3775SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
3719{ 3776{
3720 struct sched_param lp; 3777 struct sched_param lp = { .sched_priority = 0 };
3721 struct task_struct *p; 3778 struct task_struct *p;
3722 int retval; 3779 int retval;
3723 3780
@@ -3734,11 +3791,8 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
3734 if (retval) 3791 if (retval)
3735 goto out_unlock; 3792 goto out_unlock;
3736 3793
3737 if (task_has_dl_policy(p)) { 3794 if (task_has_rt_policy(p))
3738 retval = -EINVAL; 3795 lp.sched_priority = p->rt_priority;
3739 goto out_unlock;
3740 }
3741 lp.sched_priority = p->rt_priority;
3742 rcu_read_unlock(); 3796 rcu_read_unlock();
3743 3797
3744 /* 3798 /*
@@ -3776,7 +3830,7 @@ static int sched_read_attr(struct sched_attr __user *uattr,
3776 3830
3777 for (; addr < end; addr++) { 3831 for (; addr < end; addr++) {
3778 if (*addr) 3832 if (*addr)
3779 goto err_size; 3833 return -EFBIG;
3780 } 3834 }
3781 3835
3782 attr->size = usize; 3836 attr->size = usize;
@@ -3786,12 +3840,7 @@ static int sched_read_attr(struct sched_attr __user *uattr,
3786 if (ret) 3840 if (ret)
3787 return -EFAULT; 3841 return -EFAULT;
3788 3842
3789out: 3843 return 0;
3790 return ret;
3791
3792err_size:
3793 ret = -E2BIG;
3794 goto out;
3795} 3844}
3796 3845
3797/** 3846/**
@@ -3799,6 +3848,7 @@ err_size:
3799 * @pid: the pid in question. 3848 * @pid: the pid in question.
3800 * @uattr: structure containing the extended parameters. 3849 * @uattr: structure containing the extended parameters.
3801 * @size: sizeof(attr) for fwd/bwd comp. 3850 * @size: sizeof(attr) for fwd/bwd comp.
3851 * @flags: for future extension.
3802 */ 3852 */
3803SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, 3853SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
3804 unsigned int, size, unsigned int, flags) 3854 unsigned int, size, unsigned int, flags)
@@ -4067,6 +4117,7 @@ static void __cond_resched(void)
4067 4117
4068int __sched _cond_resched(void) 4118int __sched _cond_resched(void)
4069{ 4119{
4120 rcu_cond_resched();
4070 if (should_resched()) { 4121 if (should_resched()) {
4071 __cond_resched(); 4122 __cond_resched();
4072 return 1; 4123 return 1;
@@ -4085,15 +4136,18 @@ EXPORT_SYMBOL(_cond_resched);
4085 */ 4136 */
4086int __cond_resched_lock(spinlock_t *lock) 4137int __cond_resched_lock(spinlock_t *lock)
4087{ 4138{
4139 bool need_rcu_resched = rcu_should_resched();
4088 int resched = should_resched(); 4140 int resched = should_resched();
4089 int ret = 0; 4141 int ret = 0;
4090 4142
4091 lockdep_assert_held(lock); 4143 lockdep_assert_held(lock);
4092 4144
4093 if (spin_needbreak(lock) || resched) { 4145 if (spin_needbreak(lock) || resched || need_rcu_resched) {
4094 spin_unlock(lock); 4146 spin_unlock(lock);
4095 if (resched) 4147 if (resched)
4096 __cond_resched(); 4148 __cond_resched();
4149 else if (unlikely(need_rcu_resched))
4150 rcu_resched();
4097 else 4151 else
4098 cpu_relax(); 4152 cpu_relax();
4099 ret = 1; 4153 ret = 1;
@@ -4107,6 +4161,7 @@ int __sched __cond_resched_softirq(void)
4107{ 4161{
4108 BUG_ON(!in_softirq()); 4162 BUG_ON(!in_softirq());
4109 4163
4164 rcu_cond_resched(); /* BH disabled OK, just recording QSes. */
4110 if (should_resched()) { 4165 if (should_resched()) {
4111 local_bh_enable(); 4166 local_bh_enable();
4112 __cond_resched(); 4167 __cond_resched();
@@ -5055,11 +5110,20 @@ static struct notifier_block migration_notifier = {
5055 .priority = CPU_PRI_MIGRATION, 5110 .priority = CPU_PRI_MIGRATION,
5056}; 5111};
5057 5112
5113static void __cpuinit set_cpu_rq_start_time(void)
5114{
5115 int cpu = smp_processor_id();
5116 struct rq *rq = cpu_rq(cpu);
5117 rq->age_stamp = sched_clock_cpu(cpu);
5118}
5119
5058static int sched_cpu_active(struct notifier_block *nfb, 5120static int sched_cpu_active(struct notifier_block *nfb,
5059 unsigned long action, void *hcpu) 5121 unsigned long action, void *hcpu)
5060{ 5122{
5061 switch (action & ~CPU_TASKS_FROZEN) { 5123 switch (action & ~CPU_TASKS_FROZEN) {
5062 case CPU_STARTING: 5124 case CPU_STARTING:
5125 set_cpu_rq_start_time();
5126 return NOTIFY_OK;
5063 case CPU_DOWN_FAILED: 5127 case CPU_DOWN_FAILED:
5064 set_cpu_active((long)hcpu, true); 5128 set_cpu_active((long)hcpu, true);
5065 return NOTIFY_OK; 5129 return NOTIFY_OK;
@@ -5268,7 +5332,8 @@ static int sd_degenerate(struct sched_domain *sd)
5268 SD_BALANCE_FORK | 5332 SD_BALANCE_FORK |
5269 SD_BALANCE_EXEC | 5333 SD_BALANCE_EXEC |
5270 SD_SHARE_CPUPOWER | 5334 SD_SHARE_CPUPOWER |
5271 SD_SHARE_PKG_RESOURCES)) { 5335 SD_SHARE_PKG_RESOURCES |
5336 SD_SHARE_POWERDOMAIN)) {
5272 if (sd->groups != sd->groups->next) 5337 if (sd->groups != sd->groups->next)
5273 return 0; 5338 return 0;
5274 } 5339 }
@@ -5299,7 +5364,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
5299 SD_BALANCE_EXEC | 5364 SD_BALANCE_EXEC |
5300 SD_SHARE_CPUPOWER | 5365 SD_SHARE_CPUPOWER |
5301 SD_SHARE_PKG_RESOURCES | 5366 SD_SHARE_PKG_RESOURCES |
5302 SD_PREFER_SIBLING); 5367 SD_PREFER_SIBLING |
5368 SD_SHARE_POWERDOMAIN);
5303 if (nr_node_ids == 1) 5369 if (nr_node_ids == 1)
5304 pflags &= ~SD_SERIALIZE; 5370 pflags &= ~SD_SERIALIZE;
5305 } 5371 }
@@ -5573,17 +5639,6 @@ static int __init isolated_cpu_setup(char *str)
5573 5639
5574__setup("isolcpus=", isolated_cpu_setup); 5640__setup("isolcpus=", isolated_cpu_setup);
5575 5641
5576static const struct cpumask *cpu_cpu_mask(int cpu)
5577{
5578 return cpumask_of_node(cpu_to_node(cpu));
5579}
5580
5581struct sd_data {
5582 struct sched_domain **__percpu sd;
5583 struct sched_group **__percpu sg;
5584 struct sched_group_power **__percpu sgp;
5585};
5586
5587struct s_data { 5642struct s_data {
5588 struct sched_domain ** __percpu sd; 5643 struct sched_domain ** __percpu sd;
5589 struct root_domain *rd; 5644 struct root_domain *rd;
@@ -5596,21 +5651,6 @@ enum s_alloc {
5596 sa_none, 5651 sa_none,
5597}; 5652};
5598 5653
5599struct sched_domain_topology_level;
5600
5601typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
5602typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
5603
5604#define SDTL_OVERLAP 0x01
5605
5606struct sched_domain_topology_level {
5607 sched_domain_init_f init;
5608 sched_domain_mask_f mask;
5609 int flags;
5610 int numa_level;
5611 struct sd_data data;
5612};
5613
5614/* 5654/*
5615 * Build an iteration mask that can exclude certain CPUs from the upwards 5655 * Build an iteration mask that can exclude certain CPUs from the upwards
5616 * domain traversal. 5656 * domain traversal.
@@ -5778,8 +5818,6 @@ build_sched_groups(struct sched_domain *sd, int cpu)
5778 continue; 5818 continue;
5779 5819
5780 group = get_group(i, sdd, &sg); 5820 group = get_group(i, sdd, &sg);
5781 cpumask_clear(sched_group_cpus(sg));
5782 sg->sgp->power = 0;
5783 cpumask_setall(sched_group_mask(sg)); 5821 cpumask_setall(sched_group_mask(sg));
5784 5822
5785 for_each_cpu(j, span) { 5823 for_each_cpu(j, span) {
@@ -5829,44 +5867,11 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
5829 atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight); 5867 atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
5830} 5868}
5831 5869
5832int __weak arch_sd_sibling_asym_packing(void)
5833{
5834 return 0*SD_ASYM_PACKING;
5835}
5836
5837/* 5870/*
5838 * Initializers for schedule domains 5871 * Initializers for schedule domains
5839 * Non-inlined to reduce accumulated stack pressure in build_sched_domains() 5872 * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
5840 */ 5873 */
5841 5874
5842#ifdef CONFIG_SCHED_DEBUG
5843# define SD_INIT_NAME(sd, type) sd->name = #type
5844#else
5845# define SD_INIT_NAME(sd, type) do { } while (0)
5846#endif
5847
5848#define SD_INIT_FUNC(type) \
5849static noinline struct sched_domain * \
5850sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
5851{ \
5852 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \
5853 *sd = SD_##type##_INIT; \
5854 SD_INIT_NAME(sd, type); \
5855 sd->private = &tl->data; \
5856 return sd; \
5857}
5858
5859SD_INIT_FUNC(CPU)
5860#ifdef CONFIG_SCHED_SMT
5861 SD_INIT_FUNC(SIBLING)
5862#endif
5863#ifdef CONFIG_SCHED_MC
5864 SD_INIT_FUNC(MC)
5865#endif
5866#ifdef CONFIG_SCHED_BOOK
5867 SD_INIT_FUNC(BOOK)
5868#endif
5869
5870static int default_relax_domain_level = -1; 5875static int default_relax_domain_level = -1;
5871int sched_domain_level_max; 5876int sched_domain_level_max;
5872 5877
@@ -5954,97 +5959,154 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
5954 *per_cpu_ptr(sdd->sgp, cpu) = NULL; 5959 *per_cpu_ptr(sdd->sgp, cpu) = NULL;
5955} 5960}
5956 5961
5957#ifdef CONFIG_SCHED_SMT
5958static const struct cpumask *cpu_smt_mask(int cpu)
5959{
5960 return topology_thread_cpumask(cpu);
5961}
5962#endif
5963
5964/*
5965 * Topology list, bottom-up.
5966 */
5967static struct sched_domain_topology_level default_topology[] = {
5968#ifdef CONFIG_SCHED_SMT
5969 { sd_init_SIBLING, cpu_smt_mask, },
5970#endif
5971#ifdef CONFIG_SCHED_MC
5972 { sd_init_MC, cpu_coregroup_mask, },
5973#endif
5974#ifdef CONFIG_SCHED_BOOK
5975 { sd_init_BOOK, cpu_book_mask, },
5976#endif
5977 { sd_init_CPU, cpu_cpu_mask, },
5978 { NULL, },
5979};
5980
5981static struct sched_domain_topology_level *sched_domain_topology = default_topology;
5982
5983#define for_each_sd_topology(tl) \
5984 for (tl = sched_domain_topology; tl->init; tl++)
5985
5986#ifdef CONFIG_NUMA 5962#ifdef CONFIG_NUMA
5987
5988static int sched_domains_numa_levels; 5963static int sched_domains_numa_levels;
5989static int *sched_domains_numa_distance; 5964static int *sched_domains_numa_distance;
5990static struct cpumask ***sched_domains_numa_masks; 5965static struct cpumask ***sched_domains_numa_masks;
5991static int sched_domains_curr_level; 5966static int sched_domains_curr_level;
5967#endif
5992 5968
5993static inline int sd_local_flags(int level) 5969/*
5994{ 5970 * SD_flags allowed in topology descriptions.
5995 if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE) 5971 *
5996 return 0; 5972 * SD_SHARE_CPUPOWER - describes SMT topologies
5997 5973 * SD_SHARE_PKG_RESOURCES - describes shared caches
5998 return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE; 5974 * SD_NUMA - describes NUMA topologies
5999} 5975 * SD_SHARE_POWERDOMAIN - describes shared power domain
5976 *
5977 * Odd one out:
5978 * SD_ASYM_PACKING - describes SMT quirks
5979 */
5980#define TOPOLOGY_SD_FLAGS \
5981 (SD_SHARE_CPUPOWER | \
5982 SD_SHARE_PKG_RESOURCES | \
5983 SD_NUMA | \
5984 SD_ASYM_PACKING | \
5985 SD_SHARE_POWERDOMAIN)
6000 5986
6001static struct sched_domain * 5987static struct sched_domain *
6002sd_numa_init(struct sched_domain_topology_level *tl, int cpu) 5988sd_init(struct sched_domain_topology_level *tl, int cpu)
6003{ 5989{
6004 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); 5990 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
6005 int level = tl->numa_level; 5991 int sd_weight, sd_flags = 0;
6006 int sd_weight = cpumask_weight( 5992
6007 sched_domains_numa_masks[level][cpu_to_node(cpu)]); 5993#ifdef CONFIG_NUMA
5994 /*
5995 * Ugly hack to pass state to sd_numa_mask()...
5996 */
5997 sched_domains_curr_level = tl->numa_level;
5998#endif
5999
6000 sd_weight = cpumask_weight(tl->mask(cpu));
6001
6002 if (tl->sd_flags)
6003 sd_flags = (*tl->sd_flags)();
6004 if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
6005 "wrong sd_flags in topology description\n"))
6006 sd_flags &= ~TOPOLOGY_SD_FLAGS;
6008 6007
6009 *sd = (struct sched_domain){ 6008 *sd = (struct sched_domain){
6010 .min_interval = sd_weight, 6009 .min_interval = sd_weight,
6011 .max_interval = 2*sd_weight, 6010 .max_interval = 2*sd_weight,
6012 .busy_factor = 32, 6011 .busy_factor = 32,
6013 .imbalance_pct = 125, 6012 .imbalance_pct = 125,
6014 .cache_nice_tries = 2, 6013
6015 .busy_idx = 3, 6014 .cache_nice_tries = 0,
6016 .idle_idx = 2, 6015 .busy_idx = 0,
6016 .idle_idx = 0,
6017 .newidle_idx = 0, 6017 .newidle_idx = 0,
6018 .wake_idx = 0, 6018 .wake_idx = 0,
6019 .forkexec_idx = 0, 6019 .forkexec_idx = 0,
6020 6020
6021 .flags = 1*SD_LOAD_BALANCE 6021 .flags = 1*SD_LOAD_BALANCE
6022 | 1*SD_BALANCE_NEWIDLE 6022 | 1*SD_BALANCE_NEWIDLE
6023 | 0*SD_BALANCE_EXEC 6023 | 1*SD_BALANCE_EXEC
6024 | 0*SD_BALANCE_FORK 6024 | 1*SD_BALANCE_FORK
6025 | 0*SD_BALANCE_WAKE 6025 | 0*SD_BALANCE_WAKE
6026 | 0*SD_WAKE_AFFINE 6026 | 1*SD_WAKE_AFFINE
6027 | 0*SD_SHARE_CPUPOWER 6027 | 0*SD_SHARE_CPUPOWER
6028 | 0*SD_SHARE_PKG_RESOURCES 6028 | 0*SD_SHARE_PKG_RESOURCES
6029 | 1*SD_SERIALIZE 6029 | 0*SD_SERIALIZE
6030 | 0*SD_PREFER_SIBLING 6030 | 0*SD_PREFER_SIBLING
6031 | 1*SD_NUMA 6031 | 0*SD_NUMA
6032 | sd_local_flags(level) 6032 | sd_flags
6033 , 6033 ,
6034
6034 .last_balance = jiffies, 6035 .last_balance = jiffies,
6035 .balance_interval = sd_weight, 6036 .balance_interval = sd_weight,
6037 .smt_gain = 0,
6038 .max_newidle_lb_cost = 0,
6039 .next_decay_max_lb_cost = jiffies,
6040#ifdef CONFIG_SCHED_DEBUG
6041 .name = tl->name,
6042#endif
6036 }; 6043 };
6037 SD_INIT_NAME(sd, NUMA);
6038 sd->private = &tl->data;
6039 6044
6040 /* 6045 /*
6041 * Ugly hack to pass state to sd_numa_mask()... 6046 * Convert topological properties into behaviour.
6042 */ 6047 */
6043 sched_domains_curr_level = tl->numa_level; 6048
6049 if (sd->flags & SD_SHARE_CPUPOWER) {
6050 sd->imbalance_pct = 110;
6051 sd->smt_gain = 1178; /* ~15% */
6052
6053 } else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
6054 sd->imbalance_pct = 117;
6055 sd->cache_nice_tries = 1;
6056 sd->busy_idx = 2;
6057
6058#ifdef CONFIG_NUMA
6059 } else if (sd->flags & SD_NUMA) {
6060 sd->cache_nice_tries = 2;
6061 sd->busy_idx = 3;
6062 sd->idle_idx = 2;
6063
6064 sd->flags |= SD_SERIALIZE;
6065 if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
6066 sd->flags &= ~(SD_BALANCE_EXEC |
6067 SD_BALANCE_FORK |
6068 SD_WAKE_AFFINE);
6069 }
6070
6071#endif
6072 } else {
6073 sd->flags |= SD_PREFER_SIBLING;
6074 sd->cache_nice_tries = 1;
6075 sd->busy_idx = 2;
6076 sd->idle_idx = 1;
6077 }
6078
6079 sd->private = &tl->data;
6044 6080
6045 return sd; 6081 return sd;
6046} 6082}
6047 6083
6084/*
6085 * Topology list, bottom-up.
6086 */
6087static struct sched_domain_topology_level default_topology[] = {
6088#ifdef CONFIG_SCHED_SMT
6089 { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
6090#endif
6091#ifdef CONFIG_SCHED_MC
6092 { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
6093#endif
6094 { cpu_cpu_mask, SD_INIT_NAME(DIE) },
6095 { NULL, },
6096};
6097
6098struct sched_domain_topology_level *sched_domain_topology = default_topology;
6099
6100#define for_each_sd_topology(tl) \
6101 for (tl = sched_domain_topology; tl->mask; tl++)
6102
6103void set_sched_topology(struct sched_domain_topology_level *tl)
6104{
6105 sched_domain_topology = tl;
6106}
6107
6108#ifdef CONFIG_NUMA
6109
6048static const struct cpumask *sd_numa_mask(int cpu) 6110static const struct cpumask *sd_numa_mask(int cpu)
6049{ 6111{
6050 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; 6112 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
@@ -6188,7 +6250,10 @@ static void sched_init_numa(void)
6188 } 6250 }
6189 } 6251 }
6190 6252
6191 tl = kzalloc((ARRAY_SIZE(default_topology) + level) * 6253 /* Compute default topology size */
6254 for (i = 0; sched_domain_topology[i].mask; i++);
6255
6256 tl = kzalloc((i + level + 1) *
6192 sizeof(struct sched_domain_topology_level), GFP_KERNEL); 6257 sizeof(struct sched_domain_topology_level), GFP_KERNEL);
6193 if (!tl) 6258 if (!tl)
6194 return; 6259 return;
@@ -6196,18 +6261,19 @@ static void sched_init_numa(void)
6196 /* 6261 /*
6197 * Copy the default topology bits.. 6262 * Copy the default topology bits..
6198 */ 6263 */
6199 for (i = 0; default_topology[i].init; i++) 6264 for (i = 0; sched_domain_topology[i].mask; i++)
6200 tl[i] = default_topology[i]; 6265 tl[i] = sched_domain_topology[i];
6201 6266
6202 /* 6267 /*
6203 * .. and append 'j' levels of NUMA goodness. 6268 * .. and append 'j' levels of NUMA goodness.
6204 */ 6269 */
6205 for (j = 0; j < level; i++, j++) { 6270 for (j = 0; j < level; i++, j++) {
6206 tl[i] = (struct sched_domain_topology_level){ 6271 tl[i] = (struct sched_domain_topology_level){
6207 .init = sd_numa_init,
6208 .mask = sd_numa_mask, 6272 .mask = sd_numa_mask,
6273 .sd_flags = cpu_numa_flags,
6209 .flags = SDTL_OVERLAP, 6274 .flags = SDTL_OVERLAP,
6210 .numa_level = j, 6275 .numa_level = j,
6276 SD_INIT_NAME(NUMA)
6211 }; 6277 };
6212 } 6278 }
6213 6279
@@ -6365,7 +6431,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6365 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 6431 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
6366 struct sched_domain *child, int cpu) 6432 struct sched_domain *child, int cpu)
6367{ 6433{
6368 struct sched_domain *sd = tl->init(tl, cpu); 6434 struct sched_domain *sd = sd_init(tl, cpu);
6369 if (!sd) 6435 if (!sd)
6370 return child; 6436 return child;
6371 6437
@@ -6935,6 +7001,7 @@ void __init sched_init(void)
6935 if (cpu_isolated_map == NULL) 7001 if (cpu_isolated_map == NULL)
6936 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 7002 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
6937 idle_thread_set_boot_cpu(); 7003 idle_thread_set_boot_cpu();
7004 set_cpu_rq_start_time();
6938#endif 7005#endif
6939 init_sched_fair_class(); 7006 init_sched_fair_class();
6940 7007
@@ -7602,7 +7669,7 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
7602static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) 7669static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
7603{ 7670{
7604 struct task_group *tg = css_tg(css); 7671 struct task_group *tg = css_tg(css);
7605 struct task_group *parent = css_tg(css_parent(css)); 7672 struct task_group *parent = css_tg(css->parent);
7606 7673
7607 if (parent) 7674 if (parent)
7608 sched_online_group(tg, parent); 7675 sched_online_group(tg, parent);
@@ -7733,8 +7800,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
7733 /* restart the period timer (if active) to handle new period expiry */ 7800 /* restart the period timer (if active) to handle new period expiry */
7734 if (runtime_enabled && cfs_b->timer_active) { 7801 if (runtime_enabled && cfs_b->timer_active) {
7735 /* force a reprogram */ 7802 /* force a reprogram */
7736 cfs_b->timer_active = 0; 7803 __start_cfs_bandwidth(cfs_b, true);
7737 __start_cfs_bandwidth(cfs_b);
7738 } 7804 }
7739 raw_spin_unlock_irq(&cfs_b->lock); 7805 raw_spin_unlock_irq(&cfs_b->lock);
7740 7806
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index c143ee380e3a..9cf350c94ec4 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -46,7 +46,7 @@ static inline struct cpuacct *task_ca(struct task_struct *tsk)
46 46
47static inline struct cpuacct *parent_ca(struct cpuacct *ca) 47static inline struct cpuacct *parent_ca(struct cpuacct *ca)
48{ 48{
49 return css_ca(css_parent(&ca->css)); 49 return css_ca(ca->css.parent);
50} 50}
51 51
52static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); 52static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage);
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 5b9bb42b2d47..bd95963dae80 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -13,6 +13,7 @@
13 13
14#include <linux/gfp.h> 14#include <linux/gfp.h>
15#include <linux/kernel.h> 15#include <linux/kernel.h>
16#include <linux/slab.h>
16#include "cpudeadline.h" 17#include "cpudeadline.h"
17 18
18static inline int parent(int i) 19static inline int parent(int i)
@@ -39,8 +40,10 @@ static void cpudl_exchange(struct cpudl *cp, int a, int b)
39{ 40{
40 int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu; 41 int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu;
41 42
42 swap(cp->elements[a], cp->elements[b]); 43 swap(cp->elements[a].cpu, cp->elements[b].cpu);
43 swap(cp->cpu_to_idx[cpu_a], cp->cpu_to_idx[cpu_b]); 44 swap(cp->elements[a].dl , cp->elements[b].dl );
45
46 swap(cp->elements[cpu_a].idx, cp->elements[cpu_b].idx);
44} 47}
45 48
46static void cpudl_heapify(struct cpudl *cp, int idx) 49static void cpudl_heapify(struct cpudl *cp, int idx)
@@ -140,7 +143,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
140 WARN_ON(!cpu_present(cpu)); 143 WARN_ON(!cpu_present(cpu));
141 144
142 raw_spin_lock_irqsave(&cp->lock, flags); 145 raw_spin_lock_irqsave(&cp->lock, flags);
143 old_idx = cp->cpu_to_idx[cpu]; 146 old_idx = cp->elements[cpu].idx;
144 if (!is_valid) { 147 if (!is_valid) {
145 /* remove item */ 148 /* remove item */
146 if (old_idx == IDX_INVALID) { 149 if (old_idx == IDX_INVALID) {
@@ -155,8 +158,8 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
155 cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl; 158 cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl;
156 cp->elements[old_idx].cpu = new_cpu; 159 cp->elements[old_idx].cpu = new_cpu;
157 cp->size--; 160 cp->size--;
158 cp->cpu_to_idx[new_cpu] = old_idx; 161 cp->elements[new_cpu].idx = old_idx;
159 cp->cpu_to_idx[cpu] = IDX_INVALID; 162 cp->elements[cpu].idx = IDX_INVALID;
160 while (old_idx > 0 && dl_time_before( 163 while (old_idx > 0 && dl_time_before(
161 cp->elements[parent(old_idx)].dl, 164 cp->elements[parent(old_idx)].dl,
162 cp->elements[old_idx].dl)) { 165 cp->elements[old_idx].dl)) {
@@ -173,7 +176,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
173 cp->size++; 176 cp->size++;
174 cp->elements[cp->size - 1].dl = 0; 177 cp->elements[cp->size - 1].dl = 0;
175 cp->elements[cp->size - 1].cpu = cpu; 178 cp->elements[cp->size - 1].cpu = cpu;
176 cp->cpu_to_idx[cpu] = cp->size - 1; 179 cp->elements[cpu].idx = cp->size - 1;
177 cpudl_change_key(cp, cp->size - 1, dl); 180 cpudl_change_key(cp, cp->size - 1, dl);
178 cpumask_clear_cpu(cpu, cp->free_cpus); 181 cpumask_clear_cpu(cpu, cp->free_cpus);
179 } else { 182 } else {
@@ -195,10 +198,21 @@ int cpudl_init(struct cpudl *cp)
195 memset(cp, 0, sizeof(*cp)); 198 memset(cp, 0, sizeof(*cp));
196 raw_spin_lock_init(&cp->lock); 199 raw_spin_lock_init(&cp->lock);
197 cp->size = 0; 200 cp->size = 0;
198 for (i = 0; i < NR_CPUS; i++) 201
199 cp->cpu_to_idx[i] = IDX_INVALID; 202 cp->elements = kcalloc(nr_cpu_ids,
200 if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) 203 sizeof(struct cpudl_item),
204 GFP_KERNEL);
205 if (!cp->elements)
206 return -ENOMEM;
207
208 if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) {
209 kfree(cp->elements);
201 return -ENOMEM; 210 return -ENOMEM;
211 }
212
213 for_each_possible_cpu(i)
214 cp->elements[i].idx = IDX_INVALID;
215
202 cpumask_setall(cp->free_cpus); 216 cpumask_setall(cp->free_cpus);
203 217
204 return 0; 218 return 0;
@@ -210,7 +224,6 @@ int cpudl_init(struct cpudl *cp)
210 */ 224 */
211void cpudl_cleanup(struct cpudl *cp) 225void cpudl_cleanup(struct cpudl *cp)
212{ 226{
213 /* 227 free_cpumask_var(cp->free_cpus);
214 * nothing to do for the moment 228 kfree(cp->elements);
215 */
216} 229}
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index a202789a412c..538c9796ad4a 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -5,17 +5,17 @@
5 5
6#define IDX_INVALID -1 6#define IDX_INVALID -1
7 7
8struct array_item { 8struct cpudl_item {
9 u64 dl; 9 u64 dl;
10 int cpu; 10 int cpu;
11 int idx;
11}; 12};
12 13
13struct cpudl { 14struct cpudl {
14 raw_spinlock_t lock; 15 raw_spinlock_t lock;
15 int size; 16 int size;
16 int cpu_to_idx[NR_CPUS];
17 struct array_item elements[NR_CPUS];
18 cpumask_var_t free_cpus; 17 cpumask_var_t free_cpus;
18 struct cpudl_item *elements;
19}; 19};
20 20
21 21
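The cpudeadline change folds the old cpu_to_idx[NR_CPUS] array into the now dynamically allocated elements, so cpudl_exchange() swaps the heap payload and the per-CPU back-pointer separately. A minimal userspace model of that bookkeeping, with made-up deadline values; struct item and SWAP are local stand-ins, not kernel code.

#include <stdio.h>

/* local stand-in for the new cpudl_item layout */
struct item {
	unsigned long long dl;	/* deadline, indexed by heap position  */
	int cpu;		/* cpu, indexed by heap position       */
	int idx;		/* heap position, indexed by cpu number */
};

#define SWAP(a, b) do { __typeof__(a) _t = (a); (a) = (b); (b) = _t; } while (0)

static void exchange(struct item *e, int a, int b)
{
	int cpu_a = e[a].cpu, cpu_b = e[b].cpu;

	/* swap the heap payload ... */
	SWAP(e[a].cpu, e[b].cpu);
	SWAP(e[a].dl,  e[b].dl);
	/* ... and keep the per-CPU back-pointers consistent */
	SWAP(e[cpu_a].idx, e[cpu_b].idx);
}

int main(void)
{
	/* two CPUs: cpu0 in heap slot 0 with dl=100, cpu1 in slot 1 with dl=200 */
	struct item e[2] = { { 100, 0, 0 }, { 200, 1, 1 } };

	exchange(e, 0, 1);
	printf("slot0: cpu=%d dl=%llu   slot1: cpu=%d dl=%llu\n",
	       e[0].cpu, e[0].dl, e[1].cpu, e[1].dl);
	printf("cpu0 is now at heap index %d, cpu1 at %d\n", e[0].idx, e[1].idx);
	return 0;
}
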
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 746bc9344969..981fcd7dc394 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -30,6 +30,7 @@
30#include <linux/gfp.h> 30#include <linux/gfp.h>
31#include <linux/sched.h> 31#include <linux/sched.h>
32#include <linux/sched/rt.h> 32#include <linux/sched/rt.h>
33#include <linux/slab.h>
33#include "cpupri.h" 34#include "cpupri.h"
34 35
35/* Convert between a 140 based task->prio, and our 102 based cpupri */ 36/* Convert between a 140 based task->prio, and our 102 based cpupri */
@@ -70,8 +71,7 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
70 int idx = 0; 71 int idx = 0;
71 int task_pri = convert_prio(p->prio); 72 int task_pri = convert_prio(p->prio);
72 73
73 if (task_pri >= MAX_RT_PRIO) 74 BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES);
74 return 0;
75 75
76 for (idx = 0; idx < task_pri; idx++) { 76 for (idx = 0; idx < task_pri; idx++) {
77 struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; 77 struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
@@ -219,8 +219,13 @@ int cpupri_init(struct cpupri *cp)
219 goto cleanup; 219 goto cleanup;
220 } 220 }
221 221
222 cp->cpu_to_pri = kcalloc(nr_cpu_ids, sizeof(int), GFP_KERNEL);
223 if (!cp->cpu_to_pri)
224 goto cleanup;
225
222 for_each_possible_cpu(i) 226 for_each_possible_cpu(i)
223 cp->cpu_to_pri[i] = CPUPRI_INVALID; 227 cp->cpu_to_pri[i] = CPUPRI_INVALID;
228
224 return 0; 229 return 0;
225 230
226cleanup: 231cleanup:
@@ -237,6 +242,7 @@ void cpupri_cleanup(struct cpupri *cp)
237{ 242{
238 int i; 243 int i;
239 244
245 kfree(cp->cpu_to_pri);
240 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) 246 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++)
241 free_cpumask_var(cp->pri_to_cpu[i].mask); 247 free_cpumask_var(cp->pri_to_cpu[i].mask);
242} 248}
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h
index f6d756173491..6b033347fdfd 100644
--- a/kernel/sched/cpupri.h
+++ b/kernel/sched/cpupri.h
@@ -17,7 +17,7 @@ struct cpupri_vec {
17 17
18struct cpupri { 18struct cpupri {
19 struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; 19 struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
20 int cpu_to_pri[NR_CPUS]; 20 int *cpu_to_pri;
21}; 21};
22 22
23#ifdef CONFIG_SMP 23#ifdef CONFIG_SMP
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index a95097cb4591..72fdf06ef865 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -332,50 +332,50 @@ out:
332 * softirq as those do not count in task exec_runtime any more. 332 * softirq as those do not count in task exec_runtime any more.
333 */ 333 */
334static void irqtime_account_process_tick(struct task_struct *p, int user_tick, 334static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
335 struct rq *rq) 335 struct rq *rq, int ticks)
336{ 336{
337 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); 337 cputime_t scaled = cputime_to_scaled(cputime_one_jiffy);
338 u64 cputime = (__force u64) cputime_one_jiffy;
338 u64 *cpustat = kcpustat_this_cpu->cpustat; 339 u64 *cpustat = kcpustat_this_cpu->cpustat;
339 340
340 if (steal_account_process_tick()) 341 if (steal_account_process_tick())
341 return; 342 return;
342 343
344 cputime *= ticks;
345 scaled *= ticks;
346
343 if (irqtime_account_hi_update()) { 347 if (irqtime_account_hi_update()) {
344 cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy; 348 cpustat[CPUTIME_IRQ] += cputime;
345 } else if (irqtime_account_si_update()) { 349 } else if (irqtime_account_si_update()) {
346 cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy; 350 cpustat[CPUTIME_SOFTIRQ] += cputime;
347 } else if (this_cpu_ksoftirqd() == p) { 351 } else if (this_cpu_ksoftirqd() == p) {
348 /* 352 /*
349 * ksoftirqd time do not get accounted in cpu_softirq_time. 353 * ksoftirqd time do not get accounted in cpu_softirq_time.
350 * So, we have to handle it separately here. 354 * So, we have to handle it separately here.
351 * Also, p->stime needs to be updated for ksoftirqd. 355 * Also, p->stime needs to be updated for ksoftirqd.
352 */ 356 */
353 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, 357 __account_system_time(p, cputime, scaled, CPUTIME_SOFTIRQ);
354 CPUTIME_SOFTIRQ);
355 } else if (user_tick) { 358 } else if (user_tick) {
356 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); 359 account_user_time(p, cputime, scaled);
357 } else if (p == rq->idle) { 360 } else if (p == rq->idle) {
358 account_idle_time(cputime_one_jiffy); 361 account_idle_time(cputime);
359 } else if (p->flags & PF_VCPU) { /* System time or guest time */ 362 } else if (p->flags & PF_VCPU) { /* System time or guest time */
360 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); 363 account_guest_time(p, cputime, scaled);
361 } else { 364 } else {
362 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, 365 __account_system_time(p, cputime, scaled, CPUTIME_SYSTEM);
363 CPUTIME_SYSTEM);
364 } 366 }
365} 367}
366 368
367static void irqtime_account_idle_ticks(int ticks) 369static void irqtime_account_idle_ticks(int ticks)
368{ 370{
369 int i;
370 struct rq *rq = this_rq(); 371 struct rq *rq = this_rq();
371 372
372 for (i = 0; i < ticks; i++) 373 irqtime_account_process_tick(current, 0, rq, ticks);
373 irqtime_account_process_tick(current, 0, rq);
374} 374}
375#else /* CONFIG_IRQ_TIME_ACCOUNTING */ 375#else /* CONFIG_IRQ_TIME_ACCOUNTING */
376static inline void irqtime_account_idle_ticks(int ticks) {} 376static inline void irqtime_account_idle_ticks(int ticks) {}
377static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick, 377static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
378 struct rq *rq) {} 378 struct rq *rq, int nr_ticks) {}
379#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ 379#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
380 380
381/* 381/*
@@ -464,7 +464,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
464 return; 464 return;
465 465
466 if (sched_clock_irqtime) { 466 if (sched_clock_irqtime) {
467 irqtime_account_process_tick(p, user_tick, rq); 467 irqtime_account_process_tick(p, user_tick, rq, 1);
468 return; 468 return;
469 } 469 }
470 470
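The cputime.c hunks let the irq-time path account a whole batch of idle ticks in one call by scaling cputime and its scaled counterpart by the tick count, instead of looping once per tick. A trivial userspace check that the batched accumulation matches the old loop; the jiffy length used here is an arbitrary stand-in.

#include <stdio.h>

int main(void)
{
	/* arbitrary stand-in for cputime_one_jiffy */
	unsigned long long one_jiffy = 10000000ULL;
	unsigned long long looped = 0, batched;
	int ticks = 37, i;

	/* old scheme: irqtime_account_process_tick() called once per tick */
	for (i = 0; i < ticks; i++)
		looped += one_jiffy;

	/* new scheme: one call, with cputime (and scaled) multiplied by ticks */
	batched = one_jiffy * ticks;

	printf("looped=%llu batched=%llu equal=%d\n",
	       looped, batched, looped == batched);
	return 0;
}
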
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 27ef40925525..2b8cbf09d1a4 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -348,12 +348,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
348 * entity. 348 * entity.
349 */ 349 */
350 if (dl_time_before(dl_se->deadline, rq_clock(rq))) { 350 if (dl_time_before(dl_se->deadline, rq_clock(rq))) {
351 static bool lag_once = false; 351 printk_deferred_once("sched: DL replenish lagged to much\n");
352
353 if (!lag_once) {
354 lag_once = true;
355 printk_sched("sched: DL replenish lagged to much\n");
356 }
357 dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; 352 dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
358 dl_se->runtime = pi_se->dl_runtime; 353 dl_se->runtime = pi_se->dl_runtime;
359 } 354 }
@@ -513,14 +508,22 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
513 struct sched_dl_entity, 508 struct sched_dl_entity,
514 dl_timer); 509 dl_timer);
515 struct task_struct *p = dl_task_of(dl_se); 510 struct task_struct *p = dl_task_of(dl_se);
516 struct rq *rq = task_rq(p); 511 struct rq *rq;
512again:
513 rq = task_rq(p);
517 raw_spin_lock(&rq->lock); 514 raw_spin_lock(&rq->lock);
518 515
516 if (rq != task_rq(p)) {
517 /* Task was moved, retrying. */
518 raw_spin_unlock(&rq->lock);
519 goto again;
520 }
521
519 /* 522 /*
520 * We need to take care of a possible races here. In fact, the 523 * We need to take care of a possible races here. In fact, the
521 * task might have changed its scheduling policy to something 524 * task might have changed its scheduling policy to something
522 * different from SCHED_DEADLINE or changed its reservation 525 * different from SCHED_DEADLINE or changed its reservation
523 * parameters (through sched_setscheduler()). 526 * parameters (through sched_setattr()).
524 */ 527 */
525 if (!dl_task(p) || dl_se->dl_new) 528 if (!dl_task(p) || dl_se->dl_new)
526 goto unlock; 529 goto unlock;
@@ -528,6 +531,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
528 sched_clock_tick(); 531 sched_clock_tick();
529 update_rq_clock(rq); 532 update_rq_clock(rq);
530 dl_se->dl_throttled = 0; 533 dl_se->dl_throttled = 0;
534 dl_se->dl_yielded = 0;
531 if (p->on_rq) { 535 if (p->on_rq) {
532 enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); 536 enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
533 if (task_has_dl_policy(rq->curr)) 537 if (task_has_dl_policy(rq->curr))
@@ -740,7 +744,7 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
740 744
741 WARN_ON(!dl_prio(prio)); 745 WARN_ON(!dl_prio(prio));
742 dl_rq->dl_nr_running++; 746 dl_rq->dl_nr_running++;
743 inc_nr_running(rq_of_dl_rq(dl_rq)); 747 add_nr_running(rq_of_dl_rq(dl_rq), 1);
744 748
745 inc_dl_deadline(dl_rq, deadline); 749 inc_dl_deadline(dl_rq, deadline);
746 inc_dl_migration(dl_se, dl_rq); 750 inc_dl_migration(dl_se, dl_rq);
@@ -754,7 +758,7 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
754 WARN_ON(!dl_prio(prio)); 758 WARN_ON(!dl_prio(prio));
755 WARN_ON(!dl_rq->dl_nr_running); 759 WARN_ON(!dl_rq->dl_nr_running);
756 dl_rq->dl_nr_running--; 760 dl_rq->dl_nr_running--;
757 dec_nr_running(rq_of_dl_rq(dl_rq)); 761 sub_nr_running(rq_of_dl_rq(dl_rq), 1);
758 762
759 dec_dl_deadline(dl_rq, dl_se->deadline); 763 dec_dl_deadline(dl_rq, dl_se->deadline);
760 dec_dl_migration(dl_se, dl_rq); 764 dec_dl_migration(dl_se, dl_rq);
@@ -893,10 +897,10 @@ static void yield_task_dl(struct rq *rq)
893 * We make the task go to sleep until its current deadline by 897 * We make the task go to sleep until its current deadline by
894 * forcing its runtime to zero. This way, update_curr_dl() stops 898 * forcing its runtime to zero. This way, update_curr_dl() stops
895 * it and the bandwidth timer will wake it up and will give it 899 * it and the bandwidth timer will wake it up and will give it
896 * new scheduling parameters (thanks to dl_new=1). 900 * new scheduling parameters (thanks to dl_yielded=1).
897 */ 901 */
898 if (p->dl.runtime > 0) { 902 if (p->dl.runtime > 0) {
899 rq->curr->dl.dl_new = 1; 903 rq->curr->dl.dl_yielded = 1;
900 p->dl.runtime = 0; 904 p->dl.runtime = 0;
901 } 905 }
902 update_curr_dl(rq); 906 update_curr_dl(rq);
@@ -1021,8 +1025,17 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
1021 1025
1022 dl_rq = &rq->dl; 1026 dl_rq = &rq->dl;
1023 1027
1024 if (need_pull_dl_task(rq, prev)) 1028 if (need_pull_dl_task(rq, prev)) {
1025 pull_dl_task(rq); 1029 pull_dl_task(rq);
1030 /*
1031 * pull_rt_task() can drop (and re-acquire) rq->lock; this
1032 * means a stop task can slip in, in which case we need to
1033 * re-start task selection.
1034 */
1035 if (rq->stop && rq->stop->on_rq)
1036 return RETRY_TASK;
1037 }
1038
1026 /* 1039 /*
1027 * When prev is DL, we may throttle it in put_prev_task(). 1040 * When prev is DL, we may throttle it in put_prev_task().
1028 * So, we update time before we check for dl_nr_running. 1041 * So, we update time before we check for dl_nr_running.
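The dl_task_timer() fix above must cope with the task migrating between reading task_rq(p) and acquiring the lock, so it re-checks under the lock and retries. The scheduler uses the same idiom elsewhere (e.g. __task_rq_lock()); a minimal sketch of it for reference, where lock_task_rq() is only an illustrative name.

/* Minimal sketch of the lock-then-recheck idiom: task_rq(p) can change
 * until rq->lock is actually held, so verify it under the lock and
 * retry if the task moved in the meantime.
 */
static struct rq *lock_task_rq(struct task_struct *p)
{
	struct rq *rq;

	for (;;) {
		rq = task_rq(p);
		raw_spin_lock(&rq->lock);
		if (likely(rq == task_rq(p)))
			return rq;		/* still the task's rq */
		raw_spin_unlock(&rq->lock);	/* task moved, try again */
	}
}
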
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7e9bd0b1fa9e..9855e87d671a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1095,6 +1095,34 @@ static void task_numa_assign(struct task_numa_env *env,
1095 env->best_cpu = env->dst_cpu; 1095 env->best_cpu = env->dst_cpu;
1096} 1096}
1097 1097
1098static bool load_too_imbalanced(long orig_src_load, long orig_dst_load,
1099 long src_load, long dst_load,
1100 struct task_numa_env *env)
1101{
1102 long imb, old_imb;
1103
1104 /* We care about the slope of the imbalance, not the direction. */
1105 if (dst_load < src_load)
1106 swap(dst_load, src_load);
1107
1108 /* Is the difference below the threshold? */
1109 imb = dst_load * 100 - src_load * env->imbalance_pct;
1110 if (imb <= 0)
1111 return false;
1112
1113 /*
1114 * The imbalance is above the allowed threshold.
1115 * Compare it with the old imbalance.
1116 */
1117 if (orig_dst_load < orig_src_load)
1118 swap(orig_dst_load, orig_src_load);
1119
1120 old_imb = orig_dst_load * 100 - orig_src_load * env->imbalance_pct;
1121
1122 /* Would this change make things worse? */
1123 return (imb > old_imb);
1124}
1125
1098/* 1126/*
1099 * This checks if the overall compute and NUMA accesses of the system would 1127 * This checks if the overall compute and NUMA accesses of the system would
1100 * be improved if the source tasks was migrated to the target dst_cpu taking 1128 * be improved if the source tasks was migrated to the target dst_cpu taking
@@ -1107,7 +1135,8 @@ static void task_numa_compare(struct task_numa_env *env,
1107 struct rq *src_rq = cpu_rq(env->src_cpu); 1135 struct rq *src_rq = cpu_rq(env->src_cpu);
1108 struct rq *dst_rq = cpu_rq(env->dst_cpu); 1136 struct rq *dst_rq = cpu_rq(env->dst_cpu);
1109 struct task_struct *cur; 1137 struct task_struct *cur;
1110 long dst_load, src_load; 1138 long orig_src_load, src_load;
1139 long orig_dst_load, dst_load;
1111 long load; 1140 long load;
1112 long imp = (groupimp > 0) ? groupimp : taskimp; 1141 long imp = (groupimp > 0) ? groupimp : taskimp;
1113 1142
@@ -1181,13 +1210,13 @@ static void task_numa_compare(struct task_numa_env *env,
1181 * In the overloaded case, try and keep the load balanced. 1210 * In the overloaded case, try and keep the load balanced.
1182 */ 1211 */
1183balance: 1212balance:
1184 dst_load = env->dst_stats.load; 1213 orig_dst_load = env->dst_stats.load;
1185 src_load = env->src_stats.load; 1214 orig_src_load = env->src_stats.load;
1186 1215
1187 /* XXX missing power terms */ 1216 /* XXX missing power terms */
1188 load = task_h_load(env->p); 1217 load = task_h_load(env->p);
1189 dst_load += load; 1218 dst_load = orig_dst_load + load;
1190 src_load -= load; 1219 src_load = orig_src_load - load;
1191 1220
1192 if (cur) { 1221 if (cur) {
1193 load = task_h_load(cur); 1222 load = task_h_load(cur);
@@ -1195,11 +1224,8 @@ balance:
1195 src_load += load; 1224 src_load += load;
1196 } 1225 }
1197 1226
1198 /* make src_load the smaller */ 1227 if (load_too_imbalanced(orig_src_load, orig_dst_load,
1199 if (dst_load < src_load) 1228 src_load, dst_load, env))
1200 swap(dst_load, src_load);
1201
1202 if (src_load * env->imbalance_pct < dst_load * 100)
1203 goto unlock; 1229 goto unlock;
1204 1230
1205assign: 1231assign:
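The new load_too_imbalanced() helper used above only vetoes a NUMA move when the resulting imbalance is both over the imbalance_pct threshold and worse than it already was, rather than rejecting any move that ends up over the threshold. A small userspace model with made-up load numbers shows the difference.

#include <stdio.h>

/* Userspace model of load_too_imbalanced(): reject the move only if the
 * resulting imbalance is over the threshold AND worse than before. */
static int too_imbalanced(long orig_src, long orig_dst,
			  long src, long dst, long imbalance_pct)
{
	long imb, old_imb, t;

	if (dst < src) { t = dst; dst = src; src = t; }
	imb = dst * 100 - src * imbalance_pct;
	if (imb <= 0)
		return 0;			/* within the threshold */

	if (orig_dst < orig_src) { t = orig_dst; orig_dst = orig_src; orig_src = t; }
	old_imb = orig_dst * 100 - orig_src * imbalance_pct;

	return imb > old_imb;			/* only if things got worse */
}

int main(void)
{
	/* made-up loads: nodes start at 1000 vs 300; moving a task of
	 * weight 100 gives 900 vs 400 - still over imbalance_pct=125,
	 * but better than before, so the move is no longer rejected */
	printf("rejected=%d\n", too_imbalanced(1000, 300, 900, 400, 125));
	return 0;
}
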
@@ -1301,7 +1327,16 @@ static int task_numa_migrate(struct task_struct *p)
1301 if (env.best_cpu == -1) 1327 if (env.best_cpu == -1)
1302 return -EAGAIN; 1328 return -EAGAIN;
1303 1329
1304 sched_setnuma(p, env.dst_nid); 1330 /*
1331 * If the task is part of a workload that spans multiple NUMA nodes,
1332 * and is migrating into one of the workload's active nodes, remember
1333 * this node as the task's preferred numa node, so the workload can
1334 * settle down.
1335 * A task that migrated to a second choice node will be better off
1336 * trying for a better one later. Do not set the preferred node here.
1337 */
1338 if (p->numa_group && node_isset(env.dst_nid, p->numa_group->active_nodes))
1339 sched_setnuma(p, env.dst_nid);
1305 1340
1306 /* 1341 /*
1307 * Reset the scan period if the task is being rescheduled on an 1342 * Reset the scan period if the task is being rescheduled on an
@@ -1326,12 +1361,15 @@ static int task_numa_migrate(struct task_struct *p)
1326/* Attempt to migrate a task to a CPU on the preferred node. */ 1361/* Attempt to migrate a task to a CPU on the preferred node. */
1327static void numa_migrate_preferred(struct task_struct *p) 1362static void numa_migrate_preferred(struct task_struct *p)
1328{ 1363{
1364 unsigned long interval = HZ;
1365
1329 /* This task has no NUMA fault statistics yet */ 1366 /* This task has no NUMA fault statistics yet */
1330 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory)) 1367 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory))
1331 return; 1368 return;
1332 1369
1333 /* Periodically retry migrating the task to the preferred node */ 1370 /* Periodically retry migrating the task to the preferred node */
1334 p->numa_migrate_retry = jiffies + HZ; 1371 interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
1372 p->numa_migrate_retry = jiffies + interval;
1335 1373
1336 /* Success if task is already running on preferred CPU */ 1374 /* Success if task is already running on preferred CPU */
1337 if (task_node(p) == p->numa_preferred_nid) 1375 if (task_node(p) == p->numa_preferred_nid)
@@ -1497,7 +1535,7 @@ static void task_numa_placement(struct task_struct *p)
1497 /* If the task is part of a group prevent parallel updates to group stats */ 1535 /* If the task is part of a group prevent parallel updates to group stats */
1498 if (p->numa_group) { 1536 if (p->numa_group) {
1499 group_lock = &p->numa_group->lock; 1537 group_lock = &p->numa_group->lock;
1500 spin_lock(group_lock); 1538 spin_lock_irq(group_lock);
1501 } 1539 }
1502 1540
1503 /* Find the node with the highest number of faults */ 1541 /* Find the node with the highest number of faults */
@@ -1572,7 +1610,7 @@ static void task_numa_placement(struct task_struct *p)
1572 } 1610 }
1573 } 1611 }
1574 1612
1575 spin_unlock(group_lock); 1613 spin_unlock_irq(group_lock);
1576 } 1614 }
1577 1615
1578 /* Preferred node as the node with the most faults */ 1616 /* Preferred node as the node with the most faults */
@@ -1677,7 +1715,8 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1677 if (!join) 1715 if (!join)
1678 return; 1716 return;
1679 1717
1680 double_lock(&my_grp->lock, &grp->lock); 1718 BUG_ON(irqs_disabled());
1719 double_lock_irq(&my_grp->lock, &grp->lock);
1681 1720
1682 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) { 1721 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
1683 my_grp->faults[i] -= p->numa_faults_memory[i]; 1722 my_grp->faults[i] -= p->numa_faults_memory[i];
@@ -1691,7 +1730,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1691 grp->nr_tasks++; 1730 grp->nr_tasks++;
1692 1731
1693 spin_unlock(&my_grp->lock); 1732 spin_unlock(&my_grp->lock);
1694 spin_unlock(&grp->lock); 1733 spin_unlock_irq(&grp->lock);
1695 1734
1696 rcu_assign_pointer(p->numa_group, grp); 1735 rcu_assign_pointer(p->numa_group, grp);
1697 1736
@@ -1706,18 +1745,19 @@ no_join:
1706void task_numa_free(struct task_struct *p) 1745void task_numa_free(struct task_struct *p)
1707{ 1746{
1708 struct numa_group *grp = p->numa_group; 1747 struct numa_group *grp = p->numa_group;
1709 int i;
1710 void *numa_faults = p->numa_faults_memory; 1748 void *numa_faults = p->numa_faults_memory;
1749 unsigned long flags;
1750 int i;
1711 1751
1712 if (grp) { 1752 if (grp) {
1713 spin_lock(&grp->lock); 1753 spin_lock_irqsave(&grp->lock, flags);
1714 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) 1754 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
1715 grp->faults[i] -= p->numa_faults_memory[i]; 1755 grp->faults[i] -= p->numa_faults_memory[i];
1716 grp->total_faults -= p->total_numa_faults; 1756 grp->total_faults -= p->total_numa_faults;
1717 1757
1718 list_del(&p->numa_entry); 1758 list_del(&p->numa_entry);
1719 grp->nr_tasks--; 1759 grp->nr_tasks--;
1720 spin_unlock(&grp->lock); 1760 spin_unlock_irqrestore(&grp->lock, flags);
1721 rcu_assign_pointer(p->numa_group, NULL); 1761 rcu_assign_pointer(p->numa_group, NULL);
1722 put_numa_group(grp); 1762 put_numa_group(grp);
1723 } 1763 }
@@ -1737,6 +1777,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
1737 struct task_struct *p = current; 1777 struct task_struct *p = current;
1738 bool migrated = flags & TNF_MIGRATED; 1778 bool migrated = flags & TNF_MIGRATED;
1739 int cpu_node = task_node(current); 1779 int cpu_node = task_node(current);
1780 int local = !!(flags & TNF_FAULT_LOCAL);
1740 int priv; 1781 int priv;
1741 1782
1742 if (!numabalancing_enabled) 1783 if (!numabalancing_enabled)
@@ -1785,6 +1826,17 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
1785 task_numa_group(p, last_cpupid, flags, &priv); 1826 task_numa_group(p, last_cpupid, flags, &priv);
1786 } 1827 }
1787 1828
1829 /*
1830 * If a workload spans multiple NUMA nodes, a shared fault that
1831 * occurs wholly within the set of nodes that the workload is
1832 * actively using should be counted as local. This allows the
1833 * scan rate to slow down when a workload has settled down.
1834 */
1835 if (!priv && !local && p->numa_group &&
1836 node_isset(cpu_node, p->numa_group->active_nodes) &&
1837 node_isset(mem_node, p->numa_group->active_nodes))
1838 local = 1;
1839
1788 task_numa_placement(p); 1840 task_numa_placement(p);
1789 1841
1790 /* 1842 /*
@@ -1799,7 +1851,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
1799 1851
1800 p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages; 1852 p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages;
1801 p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages; 1853 p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages;
1802 p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages; 1854 p->numa_faults_locality[local] += pages;
1803} 1855}
1804 1856
1805static void reset_ptenuma_scan(struct task_struct *p) 1857static void reset_ptenuma_scan(struct task_struct *p)
@@ -3128,7 +3180,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3128 */ 3180 */
3129 if (!cfs_b->timer_active) { 3181 if (!cfs_b->timer_active) {
3130 __refill_cfs_bandwidth_runtime(cfs_b); 3182 __refill_cfs_bandwidth_runtime(cfs_b);
3131 __start_cfs_bandwidth(cfs_b); 3183 __start_cfs_bandwidth(cfs_b, false);
3132 } 3184 }
3133 3185
3134 if (cfs_b->runtime > 0) { 3186 if (cfs_b->runtime > 0) {
@@ -3300,14 +3352,14 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
3300 } 3352 }
3301 3353
3302 if (!se) 3354 if (!se)
3303 rq->nr_running -= task_delta; 3355 sub_nr_running(rq, task_delta);
3304 3356
3305 cfs_rq->throttled = 1; 3357 cfs_rq->throttled = 1;
3306 cfs_rq->throttled_clock = rq_clock(rq); 3358 cfs_rq->throttled_clock = rq_clock(rq);
3307 raw_spin_lock(&cfs_b->lock); 3359 raw_spin_lock(&cfs_b->lock);
3308 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); 3360 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
3309 if (!cfs_b->timer_active) 3361 if (!cfs_b->timer_active)
3310 __start_cfs_bandwidth(cfs_b); 3362 __start_cfs_bandwidth(cfs_b, false);
3311 raw_spin_unlock(&cfs_b->lock); 3363 raw_spin_unlock(&cfs_b->lock);
3312} 3364}
3313 3365
@@ -3351,7 +3403,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
3351 } 3403 }
3352 3404
3353 if (!se) 3405 if (!se)
3354 rq->nr_running += task_delta; 3406 add_nr_running(rq, task_delta);
3355 3407
3356 /* determine whether we need to wake up potentially idle cpu */ 3408 /* determine whether we need to wake up potentially idle cpu */
3357 if (rq->curr == rq->idle && rq->cfs.nr_running) 3409 if (rq->curr == rq->idle && rq->cfs.nr_running)
@@ -3689,7 +3741,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3689} 3741}
3690 3742
3691/* requires cfs_b->lock, may release to reprogram timer */ 3743/* requires cfs_b->lock, may release to reprogram timer */
3692void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) 3744void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force)
3693{ 3745{
3694 /* 3746 /*
3695 * The timer may be active because we're trying to set a new bandwidth 3747 * The timer may be active because we're trying to set a new bandwidth
@@ -3704,7 +3756,7 @@ void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
3704 cpu_relax(); 3756 cpu_relax();
3705 raw_spin_lock(&cfs_b->lock); 3757 raw_spin_lock(&cfs_b->lock);
3706 /* if someone else restarted the timer then we're done */ 3758 /* if someone else restarted the timer then we're done */
3707 if (cfs_b->timer_active) 3759 if (!force && cfs_b->timer_active)
3708 return; 3760 return;
3709 } 3761 }
3710 3762
@@ -3883,7 +3935,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
3883 3935
3884 if (!se) { 3936 if (!se) {
3885 update_rq_runnable_avg(rq, rq->nr_running); 3937 update_rq_runnable_avg(rq, rq->nr_running);
3886 inc_nr_running(rq); 3938 add_nr_running(rq, 1);
3887 } 3939 }
3888 hrtick_update(rq); 3940 hrtick_update(rq);
3889} 3941}
@@ -3943,7 +3995,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
3943 } 3995 }
3944 3996
3945 if (!se) { 3997 if (!se) {
3946 dec_nr_running(rq); 3998 sub_nr_running(rq, 1);
3947 update_rq_runnable_avg(rq, 1); 3999 update_rq_runnable_avg(rq, 1);
3948 } 4000 }
3949 hrtick_update(rq); 4001 hrtick_update(rq);
@@ -4014,7 +4066,7 @@ static void record_wakee(struct task_struct *p)
4014 * about the loss. 4066 * about the loss.
4015 */ 4067 */
4016 if (jiffies > current->wakee_flip_decay_ts + HZ) { 4068 if (jiffies > current->wakee_flip_decay_ts + HZ) {
4017 current->wakee_flips = 0; 4069 current->wakee_flips >>= 1;
4018 current->wakee_flip_decay_ts = jiffies; 4070 current->wakee_flip_decay_ts = jiffies;
4019 } 4071 }
4020 4072
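Above, record_wakee() now halves wakee_flips at each decay interval instead of zeroing it, so a consistently flip-heavy waker keeps some history across the boundary. A tiny userspace comparison of the two decay policies; the per-second flip counts are made up.

#include <stdio.h>

int main(void)
{
	/* made-up per-second flip counts for one waker */
	int flips_per_sec[] = { 40, 40, 0, 0, 40 };
	int n = sizeof(flips_per_sec) / sizeof(flips_per_sec[0]);
	unsigned int reset = 0, halved = 0;
	int i;

	for (i = 0; i < n; i++) {
		/* old policy: forget everything at each decay tick */
		reset = 0;
		/* new policy: keep half of the accumulated history */
		halved >>= 1;

		reset  += flips_per_sec[i];
		halved += flips_per_sec[i];

		printf("second %d: reset=%u halved=%u\n", i, reset, halved);
	}
	return 0;
}
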
@@ -4448,10 +4500,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
4448 sd = tmp; 4500 sd = tmp;
4449 } 4501 }
4450 4502
4451 if (affine_sd) { 4503 if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync))
4452 if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) 4504 prev_cpu = cpu;
4453 prev_cpu = cpu;
4454 4505
4506 if (sd_flag & SD_BALANCE_WAKE) {
4455 new_cpu = select_idle_sibling(p, prev_cpu); 4507 new_cpu = select_idle_sibling(p, prev_cpu);
4456 goto unlock; 4508 goto unlock;
4457 } 4509 }
@@ -4519,6 +4571,9 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu)
4519 atomic_long_add(se->avg.load_avg_contrib, 4571 atomic_long_add(se->avg.load_avg_contrib,
4520 &cfs_rq->removed_load); 4572 &cfs_rq->removed_load);
4521 } 4573 }
4574
4575 /* We have migrated, no longer consider this task hot */
4576 se->exec_start = 0;
4522} 4577}
4523#endif /* CONFIG_SMP */ 4578#endif /* CONFIG_SMP */
4524 4579
@@ -5069,6 +5124,7 @@ task_hot(struct task_struct *p, u64 now)
5069/* Returns true if the destination node has incurred more faults */ 5124/* Returns true if the destination node has incurred more faults */
5070static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) 5125static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
5071{ 5126{
5127 struct numa_group *numa_group = rcu_dereference(p->numa_group);
5072 int src_nid, dst_nid; 5128 int src_nid, dst_nid;
5073 5129
5074 if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory || 5130 if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory ||
@@ -5082,21 +5138,29 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
5082 if (src_nid == dst_nid) 5138 if (src_nid == dst_nid)
5083 return false; 5139 return false;
5084 5140
5085 /* Always encourage migration to the preferred node. */ 5141 if (numa_group) {
5086 if (dst_nid == p->numa_preferred_nid) 5142 /* Task is already in the group's interleave set. */
5087 return true; 5143 if (node_isset(src_nid, numa_group->active_nodes))
5144 return false;
5145
5146 /* Task is moving into the group's interleave set. */
5147 if (node_isset(dst_nid, numa_group->active_nodes))
5148 return true;
5149
5150 return group_faults(p, dst_nid) > group_faults(p, src_nid);
5151 }
5088 5152
5089 /* If both task and group weight improve, this move is a winner. */ 5153 /* Encourage migration to the preferred node. */
5090 if (task_weight(p, dst_nid) > task_weight(p, src_nid) && 5154 if (dst_nid == p->numa_preferred_nid)
5091 group_weight(p, dst_nid) > group_weight(p, src_nid))
5092 return true; 5155 return true;
5093 5156
5094 return false; 5157 return task_faults(p, dst_nid) > task_faults(p, src_nid);
5095} 5158}
5096 5159
5097 5160
5098static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) 5161static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
5099{ 5162{
5163 struct numa_group *numa_group = rcu_dereference(p->numa_group);
5100 int src_nid, dst_nid; 5164 int src_nid, dst_nid;
5101 5165
5102 if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) 5166 if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
@@ -5111,16 +5175,23 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
5111 if (src_nid == dst_nid) 5175 if (src_nid == dst_nid)
5112 return false; 5176 return false;
5113 5177
5178 if (numa_group) {
5179 /* Task is moving within/into the group's interleave set. */
5180 if (node_isset(dst_nid, numa_group->active_nodes))
5181 return false;
5182
5183 /* Task is moving out of the group's interleave set. */
5184 if (node_isset(src_nid, numa_group->active_nodes))
5185 return true;
5186
5187 return group_faults(p, dst_nid) < group_faults(p, src_nid);
5188 }
5189
5114 /* Migrating away from the preferred node is always bad. */ 5190 /* Migrating away from the preferred node is always bad. */
5115 if (src_nid == p->numa_preferred_nid) 5191 if (src_nid == p->numa_preferred_nid)
5116 return true; 5192 return true;
5117 5193
5118 /* If either task or group weight get worse, don't do it. */ 5194 return task_faults(p, dst_nid) < task_faults(p, src_nid);
5119 if (task_weight(p, dst_nid) < task_weight(p, src_nid) ||
5120 group_weight(p, dst_nid) < group_weight(p, src_nid))
5121 return true;
5122
5123 return false;
5124} 5195}
5125 5196
5126#else 5197#else
@@ -5563,6 +5634,7 @@ static unsigned long scale_rt_power(int cpu)
5563{ 5634{
5564 struct rq *rq = cpu_rq(cpu); 5635 struct rq *rq = cpu_rq(cpu);
5565 u64 total, available, age_stamp, avg; 5636 u64 total, available, age_stamp, avg;
5637 s64 delta;
5566 5638
5567 /* 5639 /*
5568 * Since we're reading these variables without serialization make sure 5640 * Since we're reading these variables without serialization make sure
@@ -5571,7 +5643,11 @@ static unsigned long scale_rt_power(int cpu)
5571 age_stamp = ACCESS_ONCE(rq->age_stamp); 5643 age_stamp = ACCESS_ONCE(rq->age_stamp);
5572 avg = ACCESS_ONCE(rq->rt_avg); 5644 avg = ACCESS_ONCE(rq->rt_avg);
5573 5645
5574 total = sched_avg_period() + (rq_clock(rq) - age_stamp); 5646 delta = rq_clock(rq) - age_stamp;
5647 if (unlikely(delta < 0))
5648 delta = 0;
5649
5650 total = sched_avg_period() + delta;
5575 5651
5576 if (unlikely(total < avg)) { 5652 if (unlikely(total < avg)) {
5577 /* Ensures that power won't end up being negative */ 5653 /* Ensures that power won't end up being negative */
@@ -6639,27 +6715,62 @@ out:
6639 return ld_moved; 6715 return ld_moved;
6640} 6716}
6641 6717
6718static inline unsigned long
6719get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
6720{
6721 unsigned long interval = sd->balance_interval;
6722
6723 if (cpu_busy)
6724 interval *= sd->busy_factor;
6725
6726 /* scale ms to jiffies */
6727 interval = msecs_to_jiffies(interval);
6728 interval = clamp(interval, 1UL, max_load_balance_interval);
6729
6730 return interval;
6731}
6732
6733static inline void
6734update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance)
6735{
6736 unsigned long interval, next;
6737
6738 interval = get_sd_balance_interval(sd, cpu_busy);
6739 next = sd->last_balance + interval;
6740
6741 if (time_after(*next_balance, next))
6742 *next_balance = next;
6743}
6744
6642/* 6745/*
6643 * idle_balance is called by schedule() if this_cpu is about to become 6746 * idle_balance is called by schedule() if this_cpu is about to become
6644 * idle. Attempts to pull tasks from other CPUs. 6747 * idle. Attempts to pull tasks from other CPUs.
6645 */ 6748 */
6646static int idle_balance(struct rq *this_rq) 6749static int idle_balance(struct rq *this_rq)
6647{ 6750{
6751 unsigned long next_balance = jiffies + HZ;
6752 int this_cpu = this_rq->cpu;
6648 struct sched_domain *sd; 6753 struct sched_domain *sd;
6649 int pulled_task = 0; 6754 int pulled_task = 0;
6650 unsigned long next_balance = jiffies + HZ;
6651 u64 curr_cost = 0; 6755 u64 curr_cost = 0;
6652 int this_cpu = this_rq->cpu;
6653 6756
6654 idle_enter_fair(this_rq); 6757 idle_enter_fair(this_rq);
6758
6655 /* 6759 /*
6656 * We must set idle_stamp _before_ calling idle_balance(), such that we 6760 * We must set idle_stamp _before_ calling idle_balance(), such that we
6657 * measure the duration of idle_balance() as idle time. 6761 * measure the duration of idle_balance() as idle time.
6658 */ 6762 */
6659 this_rq->idle_stamp = rq_clock(this_rq); 6763 this_rq->idle_stamp = rq_clock(this_rq);
6660 6764
6661 if (this_rq->avg_idle < sysctl_sched_migration_cost) 6765 if (this_rq->avg_idle < sysctl_sched_migration_cost) {
6766 rcu_read_lock();
6767 sd = rcu_dereference_check_sched_domain(this_rq->sd);
6768 if (sd)
6769 update_next_balance(sd, 0, &next_balance);
6770 rcu_read_unlock();
6771
6662 goto out; 6772 goto out;
6773 }
6663 6774
6664 /* 6775 /*
6665 * Drop the rq->lock, but keep IRQ/preempt disabled. 6776 * Drop the rq->lock, but keep IRQ/preempt disabled.
@@ -6669,20 +6780,20 @@ static int idle_balance(struct rq *this_rq)
6669 update_blocked_averages(this_cpu); 6780 update_blocked_averages(this_cpu);
6670 rcu_read_lock(); 6781 rcu_read_lock();
6671 for_each_domain(this_cpu, sd) { 6782 for_each_domain(this_cpu, sd) {
6672 unsigned long interval;
6673 int continue_balancing = 1; 6783 int continue_balancing = 1;
6674 u64 t0, domain_cost; 6784 u64 t0, domain_cost;
6675 6785
6676 if (!(sd->flags & SD_LOAD_BALANCE)) 6786 if (!(sd->flags & SD_LOAD_BALANCE))
6677 continue; 6787 continue;
6678 6788
6679 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) 6789 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
6790 update_next_balance(sd, 0, &next_balance);
6680 break; 6791 break;
6792 }
6681 6793
6682 if (sd->flags & SD_BALANCE_NEWIDLE) { 6794 if (sd->flags & SD_BALANCE_NEWIDLE) {
6683 t0 = sched_clock_cpu(this_cpu); 6795 t0 = sched_clock_cpu(this_cpu);
6684 6796
6685 /* If we've pulled tasks over stop searching: */
6686 pulled_task = load_balance(this_cpu, this_rq, 6797 pulled_task = load_balance(this_cpu, this_rq,
6687 sd, CPU_NEWLY_IDLE, 6798 sd, CPU_NEWLY_IDLE,
6688 &continue_balancing); 6799 &continue_balancing);
@@ -6694,41 +6805,37 @@ static int idle_balance(struct rq *this_rq)
6694 curr_cost += domain_cost; 6805 curr_cost += domain_cost;
6695 } 6806 }
6696 6807
6697 interval = msecs_to_jiffies(sd->balance_interval); 6808 update_next_balance(sd, 0, &next_balance);
6698 if (time_after(next_balance, sd->last_balance + interval)) 6809
6699 next_balance = sd->last_balance + interval; 6810 /*
6700 if (pulled_task) 6811 * Stop searching for tasks to pull if there are
6812 * now runnable tasks on this rq.
6813 */
6814 if (pulled_task || this_rq->nr_running > 0)
6701 break; 6815 break;
6702 } 6816 }
6703 rcu_read_unlock(); 6817 rcu_read_unlock();
6704 6818
6705 raw_spin_lock(&this_rq->lock); 6819 raw_spin_lock(&this_rq->lock);
6706 6820
6821 if (curr_cost > this_rq->max_idle_balance_cost)
6822 this_rq->max_idle_balance_cost = curr_cost;
6823
6707 /* 6824 /*
6708 * While browsing the domains, we released the rq lock. 6825 * While browsing the domains, we released the rq lock, a task could
6709 * A task could have be enqueued in the meantime 6826 * have been enqueued in the meantime. Since we're not going idle,
6827 * pretend we pulled a task.
6710 */ 6828 */
6711 if (this_rq->cfs.h_nr_running && !pulled_task) { 6829 if (this_rq->cfs.h_nr_running && !pulled_task)
6712 pulled_task = 1; 6830 pulled_task = 1;
6713 goto out;
6714 }
6715 6831
6716 if (pulled_task || time_after(jiffies, this_rq->next_balance)) { 6832out:
6717 /* 6833 /* Move the next balance forward */
6718 * We are going idle. next_balance may be set based on 6834 if (time_after(this_rq->next_balance, next_balance))
6719 * a busy processor. So reset next_balance.
6720 */
6721 this_rq->next_balance = next_balance; 6835 this_rq->next_balance = next_balance;
6722 }
6723
6724 if (curr_cost > this_rq->max_idle_balance_cost)
6725 this_rq->max_idle_balance_cost = curr_cost;
6726 6836
6727out:
6728 /* Is there a task of a high priority class? */ 6837 /* Is there a task of a high priority class? */
6729 if (this_rq->nr_running != this_rq->cfs.h_nr_running && 6838 if (this_rq->nr_running != this_rq->cfs.h_nr_running)
6730 (this_rq->dl.dl_nr_running ||
6731 (this_rq->rt.rt_nr_running && !rt_rq_throttled(&this_rq->rt))))
6732 pulled_task = -1; 6839 pulled_task = -1;
6733 6840
6734 if (pulled_task) { 6841 if (pulled_task) {
@@ -7009,16 +7116,9 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
7009 break; 7116 break;
7010 } 7117 }
7011 7118
7012 interval = sd->balance_interval; 7119 interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
7013 if (idle != CPU_IDLE)
7014 interval *= sd->busy_factor;
7015
7016 /* scale ms to jiffies */
7017 interval = msecs_to_jiffies(interval);
7018 interval = clamp(interval, 1UL, max_load_balance_interval);
7019 7120
7020 need_serialize = sd->flags & SD_SERIALIZE; 7121 need_serialize = sd->flags & SD_SERIALIZE;
7021
7022 if (need_serialize) { 7122 if (need_serialize) {
7023 if (!spin_trylock(&balancing)) 7123 if (!spin_trylock(&balancing))
7024 goto out; 7124 goto out;
@@ -7034,6 +7134,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
7034 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE; 7134 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
7035 } 7135 }
7036 sd->last_balance = jiffies; 7136 sd->last_balance = jiffies;
7137 interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
7037 } 7138 }
7038 if (need_serialize) 7139 if (need_serialize)
7039 spin_unlock(&balancing); 7140 spin_unlock(&balancing);
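rebalance_domains() and idle_balance() now share get_sd_balance_interval()/update_next_balance() instead of open-coding the busy_factor scaling and clamping. A rough userspace model of the interval computation; HZ and the clamp ceiling here are arbitrary stand-ins, not the kernel's values.

#include <stdio.h>

#define HZ 250UL
#define MAX_LB_INTERVAL (HZ / 10)	/* arbitrary stand-in for max_load_balance_interval */

static unsigned long msecs_to_jiffies_model(unsigned long ms)
{
	return ms * HZ / 1000;		/* close enough for the illustration */
}

static unsigned long clamp_ul(unsigned long v, unsigned long lo, unsigned long hi)
{
	return v < lo ? lo : (v > hi ? hi : v);
}

/* model of get_sd_balance_interval(): busy CPUs rebalance less often */
static unsigned long balance_interval(unsigned long interval_ms,
				      unsigned long busy_factor, int cpu_busy)
{
	unsigned long interval = interval_ms;

	if (cpu_busy)
		interval *= busy_factor;

	return clamp_ul(msecs_to_jiffies_model(interval), 1UL, MAX_LB_INTERVAL);
}

int main(void)
{
	/* e.g. a domain with balance_interval = 8 ms and busy_factor = 32 */
	printf("idle: %lu jiffies, busy: %lu jiffies\n",
	       balance_interval(8, 32, 0), balance_interval(8, 32, 1));
	return 0;
}
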
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 8f4390a079c7..25b9423abce9 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -67,24 +67,21 @@ void __weak arch_cpu_idle(void)
67 * cpuidle_idle_call - the main idle function 67 * cpuidle_idle_call - the main idle function
68 * 68 *
69 * NOTE: no locks or semaphores should be used here 69 * NOTE: no locks or semaphores should be used here
70 * return non-zero on failure
71 */ 70 */
72static int cpuidle_idle_call(void) 71static void cpuidle_idle_call(void)
73{ 72{
74 struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); 73 struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
75 struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); 74 struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
76 int next_state, entered_state, ret; 75 int next_state, entered_state;
77 bool broadcast; 76 bool broadcast;
78 77
79 /* 78 /*
80 * Check if the idle task must be rescheduled. If it is the 79 * Check if the idle task must be rescheduled. If it is the
81 * case, exit the function after re-enabling the local irq and 80 * case, exit the function after re-enabling the local irq.
82 * set again the polling flag
83 */ 81 */
84 if (current_clr_polling_and_test()) { 82 if (need_resched()) {
85 local_irq_enable(); 83 local_irq_enable();
86 __current_set_polling(); 84 return;
87 return 0;
88 } 85 }
89 86
90 /* 87 /*
@@ -101,96 +98,79 @@ static int cpuidle_idle_call(void)
101 rcu_idle_enter(); 98 rcu_idle_enter();
102 99
103 /* 100 /*
104 * Check if the cpuidle framework is ready, otherwise fallback 101 * Ask the cpuidle framework to choose a convenient idle state.
105 * to the default arch specific idle method 102 * Fall back to the default arch idle method on errors.
106 */ 103 */
107 ret = cpuidle_enabled(drv, dev); 104 next_state = cpuidle_select(drv, dev);
108 105 if (next_state < 0) {
109 if (!ret) { 106use_default:
110 /* 107 /*
111 * Ask the governor to choose an idle state it thinks 108 * We can't use the cpuidle framework, let's use the default
112 * it is convenient to go to. There is *always* a 109 * idle routine.
113 * convenient idle state
114 */ 110 */
115 next_state = cpuidle_select(drv, dev); 111 if (current_clr_polling_and_test())
116
117 /*
118 * The idle task must be scheduled, it is pointless to
119 * go to idle, just update no idle residency and get
120 * out of this function
121 */
122 if (current_clr_polling_and_test()) {
123 dev->last_residency = 0;
124 entered_state = next_state;
125 local_irq_enable(); 112 local_irq_enable();
126 } else { 113 else
127 broadcast = !!(drv->states[next_state].flags & 114 arch_cpu_idle();
128 CPUIDLE_FLAG_TIMER_STOP); 115
129 116 goto exit_idle;
130 if (broadcast)
131 /*
132 * Tell the time framework to switch
133 * to a broadcast timer because our
134 * local timer will be shutdown. If a
135 * local timer is used from another
136 * cpu as a broadcast timer, this call
137 * may fail if it is not available
138 */
139 ret = clockevents_notify(
140 CLOCK_EVT_NOTIFY_BROADCAST_ENTER,
141 &dev->cpu);
142
143 if (!ret) {
144 trace_cpu_idle_rcuidle(next_state, dev->cpu);
145
146 /*
147 * Enter the idle state previously
148 * returned by the governor
149 * decision. This function will block
150 * until an interrupt occurs and will
151 * take care of re-enabling the local
152 * interrupts
153 */
154 entered_state = cpuidle_enter(drv, dev,
155 next_state);
156
157 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT,
158 dev->cpu);
159
160 if (broadcast)
161 clockevents_notify(
162 CLOCK_EVT_NOTIFY_BROADCAST_EXIT,
163 &dev->cpu);
164
165 /*
166 * Give the governor an opportunity to reflect on the
167 * outcome
168 */
169 cpuidle_reflect(dev, entered_state);
170 }
171 }
172 } 117 }
173 118
119
174 /* 120 /*
175 * We can't use the cpuidle framework, let's use the default 121 * The idle task must be scheduled, it is pointless to
176 * idle routine 122 * go to idle, just update no idle residency and get
123 * out of this function
177 */ 124 */
178 if (ret) 125 if (current_clr_polling_and_test()) {
179 arch_cpu_idle(); 126 dev->last_residency = 0;
127 entered_state = next_state;
128 local_irq_enable();
129 goto exit_idle;
130 }
131
132 broadcast = !!(drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP);
180 133
134 /*
135 * Tell the time framework to switch to a broadcast timer
136 * because our local timer will be shutdown. If a local timer
137 * is used from another cpu as a broadcast timer, this call may
138 * fail if it is not available
139 */
140 if (broadcast &&
141 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu))
142 goto use_default;
143
144 trace_cpu_idle_rcuidle(next_state, dev->cpu);
145
146 /*
147 * Enter the idle state previously returned by the governor decision.
148 * This function will block until an interrupt occurs and will take
149 * care of re-enabling the local interrupts
150 */
151 entered_state = cpuidle_enter(drv, dev, next_state);
152
153 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu);
154
155 if (broadcast)
156 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu);
157
158 /*
159 * Give the governor an opportunity to reflect on the outcome
160 */
161 cpuidle_reflect(dev, entered_state);
162
163exit_idle:
181 __current_set_polling(); 164 __current_set_polling();
182 165
183 /* 166 /*
184 * It is up to the idle functions to enable back the local 167 * It is up to the idle functions to reenable local interrupts
185 * interrupt
186 */ 168 */
187 if (WARN_ON_ONCE(irqs_disabled())) 169 if (WARN_ON_ONCE(irqs_disabled()))
188 local_irq_enable(); 170 local_irq_enable();
189 171
190 rcu_idle_exit(); 172 rcu_idle_exit();
191 start_critical_timings(); 173 start_critical_timings();
192
193 return 0;
194} 174}
195 175
196/* 176/*
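For reference, a minimal userspace sketch of the control-flow shape the idle.c rework above gives cpuidle_idle_call(): one shared use_default fallback and one exit_idle unwind label instead of nested success branches. The helpers here (select_state, broadcast_enter_fails, default_idle, enter_state) are invented stand-ins, not kernel APIs.

#include <stdio.h>
#include <stdbool.h>

/* Stand-ins for the cpuidle/arch hooks; the behaviour is made up. */
static int  select_state(void)           { return 2; }
static bool broadcast_enter_fails(void)  { return true; }
static void default_idle(void)           { puts("default arch idle"); }
static void enter_state(int s)           { printf("enter state %d\n", s); }

static void idle_call(void)
{
	int next = select_state();

	if (next < 0) {
use_default:
		/* Framework unavailable: fall back to the architecture idle. */
		default_idle();
		goto exit_idle;
	}

	/* If the broadcast timer cannot be armed, reuse the same fallback. */
	if (broadcast_enter_fails())
		goto use_default;

	enter_state(next);

exit_idle:
	puts("common exit path");
}

int main(void)
{
	idle_call();	/* prints: default arch idle, then common exit path */
	return 0;
}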
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index d8cdf1618551..b3512f1afce9 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -79,6 +79,8 @@ void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
79 rt_rq->overloaded = 0; 79 rt_rq->overloaded = 0;
80 plist_head_init(&rt_rq->pushable_tasks); 80 plist_head_init(&rt_rq->pushable_tasks);
81#endif 81#endif
 82 /* We start in dequeued state, because no RT tasks are queued */
83 rt_rq->rt_queued = 0;
82 84
83 rt_rq->rt_time = 0; 85 rt_rq->rt_time = 0;
84 rt_rq->rt_throttled = 0; 86 rt_rq->rt_throttled = 0;
@@ -112,6 +114,13 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
112 return rt_se->rt_rq; 114 return rt_se->rt_rq;
113} 115}
114 116
117static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
118{
119 struct rt_rq *rt_rq = rt_se->rt_rq;
120
121 return rt_rq->rq;
122}
123
115void free_rt_sched_group(struct task_group *tg) 124void free_rt_sched_group(struct task_group *tg)
116{ 125{
117 int i; 126 int i;
@@ -211,10 +220,16 @@ static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
211 return container_of(rt_rq, struct rq, rt); 220 return container_of(rt_rq, struct rq, rt);
212} 221}
213 222
214static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) 223static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
215{ 224{
216 struct task_struct *p = rt_task_of(rt_se); 225 struct task_struct *p = rt_task_of(rt_se);
217 struct rq *rq = task_rq(p); 226
227 return task_rq(p);
228}
229
230static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
231{
232 struct rq *rq = rq_of_rt_se(rt_se);
218 233
219 return &rq->rt; 234 return &rq->rt;
220} 235}
@@ -391,6 +406,9 @@ static inline void set_post_schedule(struct rq *rq)
391} 406}
392#endif /* CONFIG_SMP */ 407#endif /* CONFIG_SMP */
393 408
409static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
410static void dequeue_top_rt_rq(struct rt_rq *rt_rq);
411
394static inline int on_rt_rq(struct sched_rt_entity *rt_se) 412static inline int on_rt_rq(struct sched_rt_entity *rt_se)
395{ 413{
396 return !list_empty(&rt_se->run_list); 414 return !list_empty(&rt_se->run_list);
@@ -452,8 +470,11 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
452 rt_se = rt_rq->tg->rt_se[cpu]; 470 rt_se = rt_rq->tg->rt_se[cpu];
453 471
454 if (rt_rq->rt_nr_running) { 472 if (rt_rq->rt_nr_running) {
455 if (rt_se && !on_rt_rq(rt_se)) 473 if (!rt_se)
474 enqueue_top_rt_rq(rt_rq);
475 else if (!on_rt_rq(rt_se))
456 enqueue_rt_entity(rt_se, false); 476 enqueue_rt_entity(rt_se, false);
477
457 if (rt_rq->highest_prio.curr < curr->prio) 478 if (rt_rq->highest_prio.curr < curr->prio)
458 resched_task(curr); 479 resched_task(curr);
459 } 480 }
@@ -466,10 +487,17 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
466 487
467 rt_se = rt_rq->tg->rt_se[cpu]; 488 rt_se = rt_rq->tg->rt_se[cpu];
468 489
469 if (rt_se && on_rt_rq(rt_se)) 490 if (!rt_se)
491 dequeue_top_rt_rq(rt_rq);
492 else if (on_rt_rq(rt_se))
470 dequeue_rt_entity(rt_se); 493 dequeue_rt_entity(rt_se);
471} 494}
472 495
496static inline int rt_rq_throttled(struct rt_rq *rt_rq)
497{
498 return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
499}
500
473static int rt_se_boosted(struct sched_rt_entity *rt_se) 501static int rt_se_boosted(struct sched_rt_entity *rt_se)
474{ 502{
475 struct rt_rq *rt_rq = group_rt_rq(rt_se); 503 struct rt_rq *rt_rq = group_rt_rq(rt_se);
@@ -532,12 +560,23 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
532 560
533static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) 561static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
534{ 562{
535 if (rt_rq->rt_nr_running) 563 struct rq *rq = rq_of_rt_rq(rt_rq);
536 resched_task(rq_of_rt_rq(rt_rq)->curr); 564
565 if (!rt_rq->rt_nr_running)
566 return;
567
568 enqueue_top_rt_rq(rt_rq);
569 resched_task(rq->curr);
537} 570}
538 571
539static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) 572static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
540{ 573{
574 dequeue_top_rt_rq(rt_rq);
575}
576
577static inline int rt_rq_throttled(struct rt_rq *rt_rq)
578{
579 return rt_rq->rt_throttled;
541} 580}
542 581
543static inline const struct cpumask *sched_rt_period_mask(void) 582static inline const struct cpumask *sched_rt_period_mask(void)
@@ -851,14 +890,8 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
851 * but accrue some time due to boosting. 890 * but accrue some time due to boosting.
852 */ 891 */
853 if (likely(rt_b->rt_runtime)) { 892 if (likely(rt_b->rt_runtime)) {
854 static bool once = false;
855
856 rt_rq->rt_throttled = 1; 893 rt_rq->rt_throttled = 1;
857 894 printk_deferred_once("sched: RT throttling activated\n");
858 if (!once) {
859 once = true;
860 printk_sched("sched: RT throttling activated\n");
861 }
862 } else { 895 } else {
863 /* 896 /*
864 * In case we did anyway, make it go away, 897 * In case we did anyway, make it go away,
@@ -922,6 +955,38 @@ static void update_curr_rt(struct rq *rq)
922 } 955 }
923} 956}
924 957
958static void
959dequeue_top_rt_rq(struct rt_rq *rt_rq)
960{
961 struct rq *rq = rq_of_rt_rq(rt_rq);
962
963 BUG_ON(&rq->rt != rt_rq);
964
965 if (!rt_rq->rt_queued)
966 return;
967
968 BUG_ON(!rq->nr_running);
969
970 sub_nr_running(rq, rt_rq->rt_nr_running);
971 rt_rq->rt_queued = 0;
972}
973
974static void
975enqueue_top_rt_rq(struct rt_rq *rt_rq)
976{
977 struct rq *rq = rq_of_rt_rq(rt_rq);
978
979 BUG_ON(&rq->rt != rt_rq);
980
981 if (rt_rq->rt_queued)
982 return;
983 if (rt_rq_throttled(rt_rq) || !rt_rq->rt_nr_running)
984 return;
985
986 add_nr_running(rq, rt_rq->rt_nr_running);
987 rt_rq->rt_queued = 1;
988}
989
925#if defined CONFIG_SMP 990#if defined CONFIG_SMP
926 991
927static void 992static void
@@ -1045,12 +1110,23 @@ void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
1045#endif /* CONFIG_RT_GROUP_SCHED */ 1110#endif /* CONFIG_RT_GROUP_SCHED */
1046 1111
1047static inline 1112static inline
1113unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
1114{
1115 struct rt_rq *group_rq = group_rt_rq(rt_se);
1116
1117 if (group_rq)
1118 return group_rq->rt_nr_running;
1119 else
1120 return 1;
1121}
1122
1123static inline
1048void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 1124void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1049{ 1125{
1050 int prio = rt_se_prio(rt_se); 1126 int prio = rt_se_prio(rt_se);
1051 1127
1052 WARN_ON(!rt_prio(prio)); 1128 WARN_ON(!rt_prio(prio));
1053 rt_rq->rt_nr_running++; 1129 rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
1054 1130
1055 inc_rt_prio(rt_rq, prio); 1131 inc_rt_prio(rt_rq, prio);
1056 inc_rt_migration(rt_se, rt_rq); 1132 inc_rt_migration(rt_se, rt_rq);
@@ -1062,7 +1138,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1062{ 1138{
1063 WARN_ON(!rt_prio(rt_se_prio(rt_se))); 1139 WARN_ON(!rt_prio(rt_se_prio(rt_se)));
1064 WARN_ON(!rt_rq->rt_nr_running); 1140 WARN_ON(!rt_rq->rt_nr_running);
1065 rt_rq->rt_nr_running--; 1141 rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
1066 1142
1067 dec_rt_prio(rt_rq, rt_se_prio(rt_se)); 1143 dec_rt_prio(rt_rq, rt_se_prio(rt_se));
1068 dec_rt_migration(rt_se, rt_rq); 1144 dec_rt_migration(rt_se, rt_rq);
@@ -1119,6 +1195,8 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
1119 back = rt_se; 1195 back = rt_se;
1120 } 1196 }
1121 1197
1198 dequeue_top_rt_rq(rt_rq_of_se(back));
1199
1122 for (rt_se = back; rt_se; rt_se = rt_se->back) { 1200 for (rt_se = back; rt_se; rt_se = rt_se->back) {
1123 if (on_rt_rq(rt_se)) 1201 if (on_rt_rq(rt_se))
1124 __dequeue_rt_entity(rt_se); 1202 __dequeue_rt_entity(rt_se);
@@ -1127,13 +1205,18 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
1127 1205
1128static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) 1206static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
1129{ 1207{
1208 struct rq *rq = rq_of_rt_se(rt_se);
1209
1130 dequeue_rt_stack(rt_se); 1210 dequeue_rt_stack(rt_se);
1131 for_each_sched_rt_entity(rt_se) 1211 for_each_sched_rt_entity(rt_se)
1132 __enqueue_rt_entity(rt_se, head); 1212 __enqueue_rt_entity(rt_se, head);
1213 enqueue_top_rt_rq(&rq->rt);
1133} 1214}
1134 1215
1135static void dequeue_rt_entity(struct sched_rt_entity *rt_se) 1216static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
1136{ 1217{
1218 struct rq *rq = rq_of_rt_se(rt_se);
1219
1137 dequeue_rt_stack(rt_se); 1220 dequeue_rt_stack(rt_se);
1138 1221
1139 for_each_sched_rt_entity(rt_se) { 1222 for_each_sched_rt_entity(rt_se) {
@@ -1142,6 +1225,7 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
1142 if (rt_rq && rt_rq->rt_nr_running) 1225 if (rt_rq && rt_rq->rt_nr_running)
1143 __enqueue_rt_entity(rt_se, false); 1226 __enqueue_rt_entity(rt_se, false);
1144 } 1227 }
1228 enqueue_top_rt_rq(&rq->rt);
1145} 1229}
1146 1230
1147/* 1231/*
@@ -1159,8 +1243,6 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1159 1243
1160 if (!task_current(rq, p) && p->nr_cpus_allowed > 1) 1244 if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
1161 enqueue_pushable_task(rq, p); 1245 enqueue_pushable_task(rq, p);
1162
1163 inc_nr_running(rq);
1164} 1246}
1165 1247
1166static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) 1248static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
@@ -1171,8 +1253,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1171 dequeue_rt_entity(rt_se); 1253 dequeue_rt_entity(rt_se);
1172 1254
1173 dequeue_pushable_task(rq, p); 1255 dequeue_pushable_task(rq, p);
1174
1175 dec_nr_running(rq);
1176} 1256}
1177 1257
1178/* 1258/*
@@ -1362,10 +1442,11 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
1362 pull_rt_task(rq); 1442 pull_rt_task(rq);
1363 /* 1443 /*
1364 * pull_rt_task() can drop (and re-acquire) rq->lock; this 1444 * pull_rt_task() can drop (and re-acquire) rq->lock; this
1365 * means a dl task can slip in, in which case we need to 1445 * means a dl or stop task can slip in, in which case we need
1366 * re-start task selection. 1446 * to re-start task selection.
1367 */ 1447 */
1368 if (unlikely(rq->dl.dl_nr_running)) 1448 if (unlikely((rq->stop && rq->stop->on_rq) ||
1449 rq->dl.dl_nr_running))
1369 return RETRY_TASK; 1450 return RETRY_TASK;
1370 } 1451 }
1371 1452
@@ -1376,10 +1457,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
1376 if (prev->sched_class == &rt_sched_class) 1457 if (prev->sched_class == &rt_sched_class)
1377 update_curr_rt(rq); 1458 update_curr_rt(rq);
1378 1459
1379 if (!rt_rq->rt_nr_running) 1460 if (!rt_rq->rt_queued)
1380 return NULL;
1381
1382 if (rt_rq_throttled(rt_rq))
1383 return NULL; 1461 return NULL;
1384 1462
1385 put_prev_task(rq, prev); 1463 put_prev_task(rq, prev);
@@ -1891,9 +1969,9 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
1891 */ 1969 */
1892 if (p->on_rq && rq->curr != p) { 1970 if (p->on_rq && rq->curr != p) {
1893#ifdef CONFIG_SMP 1971#ifdef CONFIG_SMP
1894 if (rq->rt.overloaded && push_rt_task(rq) && 1972 if (p->nr_cpus_allowed > 1 && rq->rt.overloaded &&
1895 /* Don't resched if we changed runqueues */ 1973 /* Don't resched if we changed runqueues */
1896 rq != task_rq(p)) 1974 push_rt_task(rq) && rq != task_rq(p))
1897 check_resched = 0; 1975 check_resched = 0;
1898#endif /* CONFIG_SMP */ 1976#endif /* CONFIG_SMP */
1899 if (check_resched && p->prio < rq->curr->prio) 1977 if (check_resched && p->prio < rq->curr->prio)
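A small standalone model of the rt_queued accounting introduced above: the root rt_rq now contributes its whole rt_nr_running to rq->nr_running in one idempotent step (enqueue_top_rt_rq/dequeue_top_rt_rq) instead of per-task inc/dec in enqueue_task_rt()/dequeue_task_rt(). The structures below are simplified stand-ins, not the scheduler's real types.

#include <assert.h>
#include <stdio.h>

struct rq    { unsigned int nr_running; };
struct rt_rq { unsigned int rt_nr_running; int rt_queued; int rt_throttled; };

static void add_nr_running(struct rq *rq, unsigned int count) { rq->nr_running += count; }
static void sub_nr_running(struct rq *rq, unsigned int count) { rq->nr_running -= count; }

/* Account the whole RT group on the runqueue at once, but only once. */
static void enqueue_top_rt_rq(struct rq *rq, struct rt_rq *rt_rq)
{
	if (rt_rq->rt_queued)
		return;
	if (rt_rq->rt_throttled || !rt_rq->rt_nr_running)
		return;
	add_nr_running(rq, rt_rq->rt_nr_running);
	rt_rq->rt_queued = 1;
}

static void dequeue_top_rt_rq(struct rq *rq, struct rt_rq *rt_rq)
{
	if (!rt_rq->rt_queued)
		return;
	sub_nr_running(rq, rt_rq->rt_nr_running);
	rt_rq->rt_queued = 0;
}

int main(void)
{
	struct rq rq = { 0 };
	struct rt_rq rt = { .rt_nr_running = 3 };

	enqueue_top_rt_rq(&rq, &rt);
	enqueue_top_rt_rq(&rq, &rt);	/* idempotent: still counted once */
	assert(rq.nr_running == 3);

	dequeue_top_rt_rq(&rq, &rt);
	assert(rq.nr_running == 0);
	puts("ok");
	return 0;
}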
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c9007f28d3a2..e47679b04d16 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -278,7 +278,7 @@ extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
278extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); 278extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
279 279
280extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b); 280extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
281extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b); 281extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force);
282extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq); 282extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);
283 283
284extern void free_rt_sched_group(struct task_group *tg); 284extern void free_rt_sched_group(struct task_group *tg);
@@ -409,6 +409,8 @@ struct rt_rq {
409 int overloaded; 409 int overloaded;
410 struct plist_head pushable_tasks; 410 struct plist_head pushable_tasks;
411#endif 411#endif
412 int rt_queued;
413
412 int rt_throttled; 414 int rt_throttled;
413 u64 rt_time; 415 u64 rt_time;
414 u64 rt_runtime; 416 u64 rt_runtime;
@@ -423,18 +425,6 @@ struct rt_rq {
423#endif 425#endif
424}; 426};
425 427
426#ifdef CONFIG_RT_GROUP_SCHED
427static inline int rt_rq_throttled(struct rt_rq *rt_rq)
428{
429 return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
430}
431#else
432static inline int rt_rq_throttled(struct rt_rq *rt_rq)
433{
434 return rt_rq->rt_throttled;
435}
436#endif
437
438/* Deadline class' related fields in a runqueue */ 428/* Deadline class' related fields in a runqueue */
439struct dl_rq { 429struct dl_rq {
440 /* runqueue is an rbtree, ordered by deadline */ 430 /* runqueue is an rbtree, ordered by deadline */
@@ -1216,12 +1206,14 @@ extern void update_idle_cpu_load(struct rq *this_rq);
1216 1206
1217extern void init_task_runnable_average(struct task_struct *p); 1207extern void init_task_runnable_average(struct task_struct *p);
1218 1208
1219static inline void inc_nr_running(struct rq *rq) 1209static inline void add_nr_running(struct rq *rq, unsigned count)
1220{ 1210{
1221 rq->nr_running++; 1211 unsigned prev_nr = rq->nr_running;
1212
1213 rq->nr_running = prev_nr + count;
1222 1214
1223#ifdef CONFIG_NO_HZ_FULL 1215#ifdef CONFIG_NO_HZ_FULL
1224 if (rq->nr_running == 2) { 1216 if (prev_nr < 2 && rq->nr_running >= 2) {
1225 if (tick_nohz_full_cpu(rq->cpu)) { 1217 if (tick_nohz_full_cpu(rq->cpu)) {
1226 /* Order rq->nr_running write against the IPI */ 1218 /* Order rq->nr_running write against the IPI */
1227 smp_wmb(); 1219 smp_wmb();
@@ -1231,9 +1223,9 @@ static inline void inc_nr_running(struct rq *rq)
1231#endif 1223#endif
1232} 1224}
1233 1225
1234static inline void dec_nr_running(struct rq *rq) 1226static inline void sub_nr_running(struct rq *rq, unsigned count)
1235{ 1227{
1236 rq->nr_running--; 1228 rq->nr_running -= count;
1237} 1229}
1238 1230
1239static inline void rq_last_tick_reset(struct rq *rq) 1231static inline void rq_last_tick_reset(struct rq *rq)
@@ -1385,6 +1377,15 @@ static inline void double_lock(spinlock_t *l1, spinlock_t *l2)
1385 spin_lock_nested(l2, SINGLE_DEPTH_NESTING); 1377 spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
1386} 1378}
1387 1379
1380static inline void double_lock_irq(spinlock_t *l1, spinlock_t *l2)
1381{
1382 if (l1 > l2)
1383 swap(l1, l2);
1384
1385 spin_lock_irq(l1);
1386 spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
1387}
1388
1388static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2) 1389static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2)
1389{ 1390{
1390 if (l1 > l2) 1391 if (l1 > l2)
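With counts now added in bulk, the NO_HZ_FULL check in add_nr_running() above has to detect crossing the two-runnable threshold rather than testing for equality with 2. A tiny model of that check; kick_full_tick() is a made-up stand-in for the real notification.

#include <stdio.h>

/* Hypothetical stand-in for the "turn the tick back on" notification. */
static void kick_full_tick(void) { puts("tick re-enabled"); }

static unsigned int nr_running;

static void add_nr_running(unsigned int count)
{
	unsigned int prev = nr_running;

	nr_running = prev + count;

	/*
	 * Testing nr_running == 2 would miss a bulk add such as 1 -> 4;
	 * checking the threshold crossing does not.
	 */
	if (prev < 2 && nr_running >= 2)
		kick_full_tick();
}

int main(void)
{
	add_nr_running(1);	/* 0 -> 1: no kick */
	add_nr_running(3);	/* 1 -> 4: kicks exactly once */
	return 0;
}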
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index d6ce65dde541..bfe0edadbfbb 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -41,13 +41,13 @@ pick_next_task_stop(struct rq *rq, struct task_struct *prev)
41static void 41static void
42enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) 42enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
43{ 43{
44 inc_nr_running(rq); 44 add_nr_running(rq, 1);
45} 45}
46 46
47static void 47static void
48dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) 48dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
49{ 49{
50 dec_nr_running(rq); 50 sub_nr_running(rq, 1);
51} 51}
52 52
53static void yield_task_stop(struct rq *rq) 53static void yield_task_stop(struct rq *rq)
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 590c37925084..301bbc24739c 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -39,7 +39,7 @@
39 * is only needed for handling filters shared across tasks. 39 * is only needed for handling filters shared across tasks.
40 * @prev: points to a previously installed, or inherited, filter 40 * @prev: points to a previously installed, or inherited, filter
41 * @len: the number of instructions in the program 41 * @len: the number of instructions in the program
42 * @insns: the BPF program instructions to evaluate 42 * @insnsi: the BPF program instructions to evaluate
43 * 43 *
44 * seccomp_filter objects are organized in a tree linked via the @prev 44 * seccomp_filter objects are organized in a tree linked via the @prev
45 * pointer. For any task, it appears to be a singly-linked list starting 45 * pointer. For any task, it appears to be a singly-linked list starting
@@ -54,8 +54,7 @@
54struct seccomp_filter { 54struct seccomp_filter {
55 atomic_t usage; 55 atomic_t usage;
56 struct seccomp_filter *prev; 56 struct seccomp_filter *prev;
57 unsigned short len; /* Instruction count */ 57 struct sk_filter *prog;
58 struct sock_filter_int insnsi[];
59}; 58};
60 59
61/* Limit any path through the tree to 256KB worth of instructions. */ 60/* Limit any path through the tree to 256KB worth of instructions. */
@@ -104,60 +103,59 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
104 u32 k = ftest->k; 103 u32 k = ftest->k;
105 104
106 switch (code) { 105 switch (code) {
107 case BPF_S_LD_W_ABS: 106 case BPF_LD | BPF_W | BPF_ABS:
108 ftest->code = BPF_LDX | BPF_W | BPF_ABS; 107 ftest->code = BPF_LDX | BPF_W | BPF_ABS;
109 /* 32-bit aligned and not out of bounds. */ 108 /* 32-bit aligned and not out of bounds. */
110 if (k >= sizeof(struct seccomp_data) || k & 3) 109 if (k >= sizeof(struct seccomp_data) || k & 3)
111 return -EINVAL; 110 return -EINVAL;
112 continue; 111 continue;
113 case BPF_S_LD_W_LEN: 112 case BPF_LD | BPF_W | BPF_LEN:
114 ftest->code = BPF_LD | BPF_IMM; 113 ftest->code = BPF_LD | BPF_IMM;
115 ftest->k = sizeof(struct seccomp_data); 114 ftest->k = sizeof(struct seccomp_data);
116 continue; 115 continue;
117 case BPF_S_LDX_W_LEN: 116 case BPF_LDX | BPF_W | BPF_LEN:
118 ftest->code = BPF_LDX | BPF_IMM; 117 ftest->code = BPF_LDX | BPF_IMM;
119 ftest->k = sizeof(struct seccomp_data); 118 ftest->k = sizeof(struct seccomp_data);
120 continue; 119 continue;
121 /* Explicitly include allowed calls. */ 120 /* Explicitly include allowed calls. */
122 case BPF_S_RET_K: 121 case BPF_RET | BPF_K:
123 case BPF_S_RET_A: 122 case BPF_RET | BPF_A:
124 case BPF_S_ALU_ADD_K: 123 case BPF_ALU | BPF_ADD | BPF_K:
125 case BPF_S_ALU_ADD_X: 124 case BPF_ALU | BPF_ADD | BPF_X:
126 case BPF_S_ALU_SUB_K: 125 case BPF_ALU | BPF_SUB | BPF_K:
127 case BPF_S_ALU_SUB_X: 126 case BPF_ALU | BPF_SUB | BPF_X:
128 case BPF_S_ALU_MUL_K: 127 case BPF_ALU | BPF_MUL | BPF_K:
129 case BPF_S_ALU_MUL_X: 128 case BPF_ALU | BPF_MUL | BPF_X:
130 case BPF_S_ALU_DIV_X: 129 case BPF_ALU | BPF_DIV | BPF_K:
131 case BPF_S_ALU_AND_K: 130 case BPF_ALU | BPF_DIV | BPF_X:
132 case BPF_S_ALU_AND_X: 131 case BPF_ALU | BPF_AND | BPF_K:
133 case BPF_S_ALU_OR_K: 132 case BPF_ALU | BPF_AND | BPF_X:
134 case BPF_S_ALU_OR_X: 133 case BPF_ALU | BPF_OR | BPF_K:
135 case BPF_S_ALU_XOR_K: 134 case BPF_ALU | BPF_OR | BPF_X:
136 case BPF_S_ALU_XOR_X: 135 case BPF_ALU | BPF_XOR | BPF_K:
137 case BPF_S_ALU_LSH_K: 136 case BPF_ALU | BPF_XOR | BPF_X:
138 case BPF_S_ALU_LSH_X: 137 case BPF_ALU | BPF_LSH | BPF_K:
139 case BPF_S_ALU_RSH_K: 138 case BPF_ALU | BPF_LSH | BPF_X:
140 case BPF_S_ALU_RSH_X: 139 case BPF_ALU | BPF_RSH | BPF_K:
141 case BPF_S_ALU_NEG: 140 case BPF_ALU | BPF_RSH | BPF_X:
142 case BPF_S_LD_IMM: 141 case BPF_ALU | BPF_NEG:
143 case BPF_S_LDX_IMM: 142 case BPF_LD | BPF_IMM:
144 case BPF_S_MISC_TAX: 143 case BPF_LDX | BPF_IMM:
145 case BPF_S_MISC_TXA: 144 case BPF_MISC | BPF_TAX:
146 case BPF_S_ALU_DIV_K: 145 case BPF_MISC | BPF_TXA:
147 case BPF_S_LD_MEM: 146 case BPF_LD | BPF_MEM:
148 case BPF_S_LDX_MEM: 147 case BPF_LDX | BPF_MEM:
149 case BPF_S_ST: 148 case BPF_ST:
150 case BPF_S_STX: 149 case BPF_STX:
151 case BPF_S_JMP_JA: 150 case BPF_JMP | BPF_JA:
152 case BPF_S_JMP_JEQ_K: 151 case BPF_JMP | BPF_JEQ | BPF_K:
153 case BPF_S_JMP_JEQ_X: 152 case BPF_JMP | BPF_JEQ | BPF_X:
154 case BPF_S_JMP_JGE_K: 153 case BPF_JMP | BPF_JGE | BPF_K:
155 case BPF_S_JMP_JGE_X: 154 case BPF_JMP | BPF_JGE | BPF_X:
156 case BPF_S_JMP_JGT_K: 155 case BPF_JMP | BPF_JGT | BPF_K:
157 case BPF_S_JMP_JGT_X: 156 case BPF_JMP | BPF_JGT | BPF_X:
158 case BPF_S_JMP_JSET_K: 157 case BPF_JMP | BPF_JSET | BPF_K:
159 case BPF_S_JMP_JSET_X: 158 case BPF_JMP | BPF_JSET | BPF_X:
160 sk_decode_filter(ftest, ftest);
161 continue; 159 continue;
162 default: 160 default:
163 return -EINVAL; 161 return -EINVAL;
@@ -189,7 +187,8 @@ static u32 seccomp_run_filters(int syscall)
189 * value always takes priority (ignoring the DATA). 187 * value always takes priority (ignoring the DATA).
190 */ 188 */
191 for (f = current->seccomp.filter; f; f = f->prev) { 189 for (f = current->seccomp.filter; f; f = f->prev) {
192 u32 cur_ret = sk_run_filter_int_seccomp(&sd, f->insnsi); 190 u32 cur_ret = SK_RUN_FILTER(f->prog, (void *)&sd);
191
193 if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) 192 if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
194 ret = cur_ret; 193 ret = cur_ret;
195 } 194 }
@@ -215,12 +214,12 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
215 return -EINVAL; 214 return -EINVAL;
216 215
217 for (filter = current->seccomp.filter; filter; filter = filter->prev) 216 for (filter = current->seccomp.filter; filter; filter = filter->prev)
218 total_insns += filter->len + 4; /* include a 4 instr penalty */ 217 total_insns += filter->prog->len + 4; /* include a 4 instr penalty */
219 if (total_insns > MAX_INSNS_PER_PATH) 218 if (total_insns > MAX_INSNS_PER_PATH)
220 return -ENOMEM; 219 return -ENOMEM;
221 220
222 /* 221 /*
223 * Installing a seccomp filter requires that the task have 222 * Installing a seccomp filter requires that the task has
224 * CAP_SYS_ADMIN in its namespace or be running with no_new_privs. 223 * CAP_SYS_ADMIN in its namespace or be running with no_new_privs.
225 * This avoids scenarios where unprivileged tasks can affect the 224 * This avoids scenarios where unprivileged tasks can affect the
226 * behavior of privileged children. 225 * behavior of privileged children.
@@ -255,18 +254,26 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
255 goto free_prog; 254 goto free_prog;
256 255
257 /* Allocate a new seccomp_filter */ 256 /* Allocate a new seccomp_filter */
258 filter = kzalloc(sizeof(struct seccomp_filter) + 257 ret = -ENOMEM;
259 sizeof(struct sock_filter_int) * new_len, 258 filter = kzalloc(sizeof(struct seccomp_filter),
260 GFP_KERNEL|__GFP_NOWARN); 259 GFP_KERNEL|__GFP_NOWARN);
261 if (!filter) 260 if (!filter)
262 goto free_prog; 261 goto free_prog;
263 262
264 ret = sk_convert_filter(fp, fprog->len, filter->insnsi, &new_len); 263 filter->prog = kzalloc(sk_filter_size(new_len),
265 if (ret) 264 GFP_KERNEL|__GFP_NOWARN);
265 if (!filter->prog)
266 goto free_filter; 266 goto free_filter;
267 267
268 ret = sk_convert_filter(fp, fprog->len, filter->prog->insnsi, &new_len);
269 if (ret)
270 goto free_filter_prog;
271 kfree(fp);
272
268 atomic_set(&filter->usage, 1); 273 atomic_set(&filter->usage, 1);
269 filter->len = new_len; 274 filter->prog->len = new_len;
275
276 sk_filter_select_runtime(filter->prog);
270 277
271 /* 278 /*
272 * If there is an existing filter, make it the prev and don't drop its 279 * If there is an existing filter, make it the prev and don't drop its
@@ -276,6 +283,8 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
276 current->seccomp.filter = filter; 283 current->seccomp.filter = filter;
277 return 0; 284 return 0;
278 285
286free_filter_prog:
287 kfree(filter->prog);
279free_filter: 288free_filter:
280 kfree(filter); 289 kfree(filter);
281free_prog: 290free_prog:
@@ -328,6 +337,7 @@ void put_seccomp_filter(struct task_struct *tsk)
328 while (orig && atomic_dec_and_test(&orig->usage)) { 337 while (orig && atomic_dec_and_test(&orig->usage)) {
329 struct seccomp_filter *freeme = orig; 338 struct seccomp_filter *freeme = orig;
330 orig = orig->prev; 339 orig = orig->prev;
340 sk_filter_free(freeme->prog);
331 kfree(freeme); 341 kfree(freeme);
332 } 342 }
333} 343}
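The seccomp change above splits the filter into a small struct seccomp_filter plus a separately allocated program, converts the classic BPF with sk_convert_filter(), and frees through an extra unwind label on error. A generic userspace sketch of that allocate/convert/unwind shape; the names below are placeholders, not the kernel's.

#include <stdlib.h>

struct prog   { unsigned short len; };
struct filter { struct prog *prog; };

/* Hypothetical converter: returns 0 on success, nonzero on failure. */
static int convert(struct prog *p) { p->len = 1; return 0; }

static struct filter *attach_filter(void)
{
	struct filter *f = calloc(1, sizeof(*f));
	if (!f)
		return NULL;

	f->prog = calloc(1, sizeof(*f->prog));
	if (!f->prog)
		goto free_filter;

	if (convert(f->prog))
		goto free_prog;

	return f;

	/* Unwind in reverse order of allocation. */
free_prog:
	free(f->prog);
free_filter:
	free(f);
	return NULL;
}

int main(void)
{
	struct filter *f = attach_filter();
	if (f) {
		free(f->prog);
		free(f);
	}
	return 0;
}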
diff --git a/kernel/signal.c b/kernel/signal.c
index 6ea13c09ae56..a4077e90f19f 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -277,6 +277,7 @@ void task_clear_jobctl_trapping(struct task_struct *task)
277{ 277{
278 if (unlikely(task->jobctl & JOBCTL_TRAPPING)) { 278 if (unlikely(task->jobctl & JOBCTL_TRAPPING)) {
279 task->jobctl &= ~JOBCTL_TRAPPING; 279 task->jobctl &= ~JOBCTL_TRAPPING;
280 smp_mb(); /* advised by wake_up_bit() */
280 wake_up_bit(&task->jobctl, JOBCTL_TRAPPING_BIT); 281 wake_up_bit(&task->jobctl, JOBCTL_TRAPPING_BIT);
281 } 282 }
282} 283}
@@ -705,11 +706,8 @@ void signal_wake_up_state(struct task_struct *t, unsigned int state)
705 * Returns 1 if any signals were found. 706 * Returns 1 if any signals were found.
706 * 707 *
707 * All callers must be holding the siglock. 708 * All callers must be holding the siglock.
708 *
709 * This version takes a sigset mask and looks at all signals,
710 * not just those in the first mask word.
711 */ 709 */
712static int rm_from_queue_full(sigset_t *mask, struct sigpending *s) 710static int flush_sigqueue_mask(sigset_t *mask, struct sigpending *s)
713{ 711{
714 struct sigqueue *q, *n; 712 struct sigqueue *q, *n;
715 sigset_t m; 713 sigset_t m;
@@ -727,29 +725,6 @@ static int rm_from_queue_full(sigset_t *mask, struct sigpending *s)
727 } 725 }
728 return 1; 726 return 1;
729} 727}
730/*
731 * Remove signals in mask from the pending set and queue.
732 * Returns 1 if any signals were found.
733 *
734 * All callers must be holding the siglock.
735 */
736static int rm_from_queue(unsigned long mask, struct sigpending *s)
737{
738 struct sigqueue *q, *n;
739
740 if (!sigtestsetmask(&s->signal, mask))
741 return 0;
742
743 sigdelsetmask(&s->signal, mask);
744 list_for_each_entry_safe(q, n, &s->list, list) {
745 if (q->info.si_signo < SIGRTMIN &&
746 (mask & sigmask(q->info.si_signo))) {
747 list_del_init(&q->list);
748 __sigqueue_free(q);
749 }
750 }
751 return 1;
752}
753 728
754static inline int is_si_special(const struct siginfo *info) 729static inline int is_si_special(const struct siginfo *info)
755{ 730{
@@ -861,6 +836,7 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force)
861{ 836{
862 struct signal_struct *signal = p->signal; 837 struct signal_struct *signal = p->signal;
863 struct task_struct *t; 838 struct task_struct *t;
839 sigset_t flush;
864 840
865 if (signal->flags & (SIGNAL_GROUP_EXIT | SIGNAL_GROUP_COREDUMP)) { 841 if (signal->flags & (SIGNAL_GROUP_EXIT | SIGNAL_GROUP_COREDUMP)) {
866 if (signal->flags & SIGNAL_GROUP_COREDUMP) 842 if (signal->flags & SIGNAL_GROUP_COREDUMP)
@@ -872,26 +848,25 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force)
872 /* 848 /*
873 * This is a stop signal. Remove SIGCONT from all queues. 849 * This is a stop signal. Remove SIGCONT from all queues.
874 */ 850 */
875 rm_from_queue(sigmask(SIGCONT), &signal->shared_pending); 851 siginitset(&flush, sigmask(SIGCONT));
876 t = p; 852 flush_sigqueue_mask(&flush, &signal->shared_pending);
877 do { 853 for_each_thread(p, t)
878 rm_from_queue(sigmask(SIGCONT), &t->pending); 854 flush_sigqueue_mask(&flush, &t->pending);
879 } while_each_thread(p, t);
880 } else if (sig == SIGCONT) { 855 } else if (sig == SIGCONT) {
881 unsigned int why; 856 unsigned int why;
882 /* 857 /*
883 * Remove all stop signals from all queues, wake all threads. 858 * Remove all stop signals from all queues, wake all threads.
884 */ 859 */
885 rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending); 860 siginitset(&flush, SIG_KERNEL_STOP_MASK);
886 t = p; 861 flush_sigqueue_mask(&flush, &signal->shared_pending);
887 do { 862 for_each_thread(p, t) {
863 flush_sigqueue_mask(&flush, &t->pending);
888 task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING); 864 task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING);
889 rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending);
890 if (likely(!(t->ptrace & PT_SEIZED))) 865 if (likely(!(t->ptrace & PT_SEIZED)))
891 wake_up_state(t, __TASK_STOPPED); 866 wake_up_state(t, __TASK_STOPPED);
892 else 867 else
893 ptrace_trap_notify(t); 868 ptrace_trap_notify(t);
894 } while_each_thread(p, t); 869 }
895 870
896 /* 871 /*
897 * Notify the parent with CLD_CONTINUED if we were stopped. 872 * Notify the parent with CLD_CONTINUED if we were stopped.
@@ -2854,7 +2829,7 @@ int do_sigtimedwait(const sigset_t *which, siginfo_t *info,
2854 2829
2855 spin_lock_irq(&tsk->sighand->siglock); 2830 spin_lock_irq(&tsk->sighand->siglock);
2856 __set_task_blocked(tsk, &tsk->real_blocked); 2831 __set_task_blocked(tsk, &tsk->real_blocked);
2857 siginitset(&tsk->real_blocked, 0); 2832 sigemptyset(&tsk->real_blocked);
2858 sig = dequeue_signal(tsk, &mask, info); 2833 sig = dequeue_signal(tsk, &mask, info);
2859 } 2834 }
2860 spin_unlock_irq(&tsk->sighand->siglock); 2835 spin_unlock_irq(&tsk->sighand->siglock);
@@ -3091,18 +3066,39 @@ COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo,
3091} 3066}
3092#endif 3067#endif
3093 3068
3069/*
3070 * For kthreads only, must not be used if cloned with CLONE_SIGHAND
3071 */
3072void kernel_sigaction(int sig, __sighandler_t action)
3073{
3074 spin_lock_irq(&current->sighand->siglock);
3075 current->sighand->action[sig - 1].sa.sa_handler = action;
3076 if (action == SIG_IGN) {
3077 sigset_t mask;
3078
3079 sigemptyset(&mask);
3080 sigaddset(&mask, sig);
3081
3082 flush_sigqueue_mask(&mask, &current->signal->shared_pending);
3083 flush_sigqueue_mask(&mask, &current->pending);
3084 recalc_sigpending();
3085 }
3086 spin_unlock_irq(&current->sighand->siglock);
3087}
3088EXPORT_SYMBOL(kernel_sigaction);
3089
3094int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) 3090int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
3095{ 3091{
3096 struct task_struct *t = current; 3092 struct task_struct *p = current, *t;
3097 struct k_sigaction *k; 3093 struct k_sigaction *k;
3098 sigset_t mask; 3094 sigset_t mask;
3099 3095
3100 if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig))) 3096 if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig)))
3101 return -EINVAL; 3097 return -EINVAL;
3102 3098
3103 k = &t->sighand->action[sig-1]; 3099 k = &p->sighand->action[sig-1];
3104 3100
3105 spin_lock_irq(&current->sighand->siglock); 3101 spin_lock_irq(&p->sighand->siglock);
3106 if (oact) 3102 if (oact)
3107 *oact = *k; 3103 *oact = *k;
3108 3104
@@ -3121,21 +3117,20 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
3121 * (for example, SIGCHLD), shall cause the pending signal to 3117 * (for example, SIGCHLD), shall cause the pending signal to
3122 * be discarded, whether or not it is blocked" 3118 * be discarded, whether or not it is blocked"
3123 */ 3119 */
3124 if (sig_handler_ignored(sig_handler(t, sig), sig)) { 3120 if (sig_handler_ignored(sig_handler(p, sig), sig)) {
3125 sigemptyset(&mask); 3121 sigemptyset(&mask);
3126 sigaddset(&mask, sig); 3122 sigaddset(&mask, sig);
3127 rm_from_queue_full(&mask, &t->signal->shared_pending); 3123 flush_sigqueue_mask(&mask, &p->signal->shared_pending);
3128 do { 3124 for_each_thread(p, t)
3129 rm_from_queue_full(&mask, &t->pending); 3125 flush_sigqueue_mask(&mask, &t->pending);
3130 } while_each_thread(current, t);
3131 } 3126 }
3132 } 3127 }
3133 3128
3134 spin_unlock_irq(&current->sighand->siglock); 3129 spin_unlock_irq(&p->sighand->siglock);
3135 return 0; 3130 return 0;
3136} 3131}
3137 3132
3138static int 3133static int
3139do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long sp) 3134do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long sp)
3140{ 3135{
3141 stack_t oss; 3136 stack_t oss;
@@ -3496,7 +3491,7 @@ COMPAT_SYSCALL_DEFINE3(sigaction, int, sig,
3496} 3491}
3497#endif 3492#endif
3498 3493
3499#ifdef __ARCH_WANT_SYS_SGETMASK 3494#ifdef CONFIG_SGETMASK_SYSCALL
3500 3495
3501/* 3496/*
3502 * For backwards compatibility. Functionality superseded by sigprocmask. 3497 * For backwards compatibility. Functionality superseded by sigprocmask.
@@ -3517,7 +3512,7 @@ SYSCALL_DEFINE1(ssetmask, int, newmask)
3517 3512
3518 return old; 3513 return old;
3519} 3514}
3520#endif /* __ARCH_WANT_SGETMASK */ 3515#endif /* CONFIG_SGETMASK_SYSCALL */
3521 3516
3522#ifdef __ARCH_WANT_SYS_SIGNAL 3517#ifdef __ARCH_WANT_SYS_SIGNAL
3523/* 3518/*
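The signal.c changes fold rm_from_queue() and rm_from_queue_full() into one mask-based helper: callers first build a sigset (siginitset() in the kernel) and pass it to flush_sigqueue_mask(). A rough userspace model of the same idea using the POSIX sigset API; the flush logic here is simplified and not the kernel's.

#include <signal.h>
#include <stdio.h>

/* Drop every pending signal that is also present in *mask. */
static int flush_mask(const sigset_t *mask, sigset_t *pending)
{
	int sig, found = 0;

	for (sig = 1; sig < 32; sig++) {
		if (sigismember(mask, sig) && sigismember(pending, sig)) {
			sigdelset(pending, sig);
			found = 1;
		}
	}
	return found;
}

int main(void)
{
	sigset_t mask, pending;

	sigemptyset(&pending);
	sigaddset(&pending, SIGCONT);
	sigaddset(&pending, SIGUSR1);

	sigemptyset(&mask);
	sigaddset(&mask, SIGCONT);	/* e.g. flush only SIGCONT, as prepare_signal() does */

	printf("flushed: %d\n", flush_mask(&mask, &pending));
	printf("SIGUSR1 still pending: %d\n", sigismember(&pending, SIGUSR1));
	return 0;
}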
diff --git a/kernel/smp.c b/kernel/smp.c
index 06d574e42c72..306f8180b0d5 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -185,14 +185,26 @@ void generic_smp_call_function_single_interrupt(void)
185{ 185{
186 struct llist_node *entry; 186 struct llist_node *entry;
187 struct call_single_data *csd, *csd_next; 187 struct call_single_data *csd, *csd_next;
188 static bool warned;
189
190 entry = llist_del_all(&__get_cpu_var(call_single_queue));
191 entry = llist_reverse_order(entry);
188 192
189 /* 193 /*
190 * Shouldn't receive this interrupt on a cpu that is not yet online. 194 * Shouldn't receive this interrupt on a cpu that is not yet online.
191 */ 195 */
192 WARN_ON_ONCE(!cpu_online(smp_processor_id())); 196 if (unlikely(!cpu_online(smp_processor_id()) && !warned)) {
197 warned = true;
198 WARN(1, "IPI on offline CPU %d\n", smp_processor_id());
193 199
194 entry = llist_del_all(&__get_cpu_var(call_single_queue)); 200 /*
195 entry = llist_reverse_order(entry); 201 * We don't have to use the _safe() variant here
202 * because we are not invoking the IPI handlers yet.
203 */
204 llist_for_each_entry(csd, entry, llist)
205 pr_warn("IPI callback %pS sent to offline CPU\n",
206 csd->func);
207 }
196 208
197 llist_for_each_entry_safe(csd, csd_next, entry, llist) { 209 llist_for_each_entry_safe(csd, csd_next, entry, llist) {
198 csd->func(csd->info); 210 csd->func(csd->info);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index b50990a5bea0..5918d227730f 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -223,7 +223,7 @@ static inline bool lockdep_softirq_start(void) { return false; }
223static inline void lockdep_softirq_end(bool in_hardirq) { } 223static inline void lockdep_softirq_end(bool in_hardirq) { }
224#endif 224#endif
225 225
226asmlinkage void __do_softirq(void) 226asmlinkage __visible void __do_softirq(void)
227{ 227{
228 unsigned long end = jiffies + MAX_SOFTIRQ_TIME; 228 unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
229 unsigned long old_flags = current->flags; 229 unsigned long old_flags = current->flags;
@@ -232,7 +232,6 @@ asmlinkage void __do_softirq(void)
232 bool in_hardirq; 232 bool in_hardirq;
233 __u32 pending; 233 __u32 pending;
234 int softirq_bit; 234 int softirq_bit;
235 int cpu;
236 235
237 /* 236 /*
 238 * Mask out PF_MEMALLOC as current task context is borrowed for the 237 * Mask out PF_MEMALLOC as current task context is borrowed for the
@@ -247,7 +246,6 @@ asmlinkage void __do_softirq(void)
247 __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET); 246 __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET);
248 in_hardirq = lockdep_softirq_start(); 247 in_hardirq = lockdep_softirq_start();
249 248
250 cpu = smp_processor_id();
251restart: 249restart:
252 /* Reset the pending bitmask before enabling irqs */ 250 /* Reset the pending bitmask before enabling irqs */
253 set_softirq_pending(0); 251 set_softirq_pending(0);
@@ -276,11 +274,11 @@ restart:
276 prev_count, preempt_count()); 274 prev_count, preempt_count());
277 preempt_count_set(prev_count); 275 preempt_count_set(prev_count);
278 } 276 }
279 rcu_bh_qs(cpu);
280 h++; 277 h++;
281 pending >>= softirq_bit; 278 pending >>= softirq_bit;
282 } 279 }
283 280
281 rcu_bh_qs(smp_processor_id());
284 local_irq_disable(); 282 local_irq_disable();
285 283
286 pending = local_softirq_pending(); 284 pending = local_softirq_pending();
@@ -299,7 +297,7 @@ restart:
299 tsk_restore_flags(current, old_flags, PF_MEMALLOC); 297 tsk_restore_flags(current, old_flags, PF_MEMALLOC);
300} 298}
301 299
302asmlinkage void do_softirq(void) 300asmlinkage __visible void do_softirq(void)
303{ 301{
304 __u32 pending; 302 __u32 pending;
305 unsigned long flags; 303 unsigned long flags;
@@ -779,3 +777,8 @@ int __init __weak arch_early_irq_init(void)
779{ 777{
780 return 0; 778 return 0;
781} 779}
780
781unsigned int __weak arch_dynirq_lower_bound(unsigned int from)
782{
783 return from;
784}
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 01fbae5b97b7..695f0c6cd169 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -307,6 +307,7 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
307 * @cpu: cpu to stop 307 * @cpu: cpu to stop
308 * @fn: function to execute 308 * @fn: function to execute
309 * @arg: argument to @fn 309 * @arg: argument to @fn
310 * @work_buf: pointer to cpu_stop_work structure
310 * 311 *
311 * Similar to stop_one_cpu() but doesn't wait for completion. The 312 * Similar to stop_one_cpu() but doesn't wait for completion. The
312 * caller is responsible for ensuring @work_buf is currently unused 313 * caller is responsible for ensuring @work_buf is currently unused
diff --git a/kernel/sys.c b/kernel/sys.c
index fba0f29401ea..66a751ebf9d9 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -250,7 +250,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
250 else 250 else
251 p = current; 251 p = current;
252 if (p) { 252 if (p) {
253 niceval = 20 - task_nice(p); 253 niceval = nice_to_rlimit(task_nice(p));
254 if (niceval > retval) 254 if (niceval > retval)
255 retval = niceval; 255 retval = niceval;
256 } 256 }
@@ -261,7 +261,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
261 else 261 else
262 pgrp = task_pgrp(current); 262 pgrp = task_pgrp(current);
263 do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { 263 do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
264 niceval = 20 - task_nice(p); 264 niceval = nice_to_rlimit(task_nice(p));
265 if (niceval > retval) 265 if (niceval > retval)
266 retval = niceval; 266 retval = niceval;
267 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); 267 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
@@ -277,7 +277,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
277 277
278 do_each_thread(g, p) { 278 do_each_thread(g, p) {
279 if (uid_eq(task_uid(p), uid)) { 279 if (uid_eq(task_uid(p), uid)) {
280 niceval = 20 - task_nice(p); 280 niceval = nice_to_rlimit(task_nice(p));
281 if (niceval > retval) 281 if (niceval > retval)
282 retval = niceval; 282 retval = niceval;
283 } 283 }
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index bc8d1b74a6b9..36441b51b5df 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -135,6 +135,8 @@ cond_syscall(sys_setresgid16);
135cond_syscall(sys_setresuid16); 135cond_syscall(sys_setresuid16);
136cond_syscall(sys_setreuid16); 136cond_syscall(sys_setreuid16);
137cond_syscall(sys_setuid16); 137cond_syscall(sys_setuid16);
138cond_syscall(sys_sgetmask);
139cond_syscall(sys_ssetmask);
138cond_syscall(sys_vm86old); 140cond_syscall(sys_vm86old);
139cond_syscall(sys_vm86); 141cond_syscall(sys_vm86);
140cond_syscall(sys_ipc); 142cond_syscall(sys_ipc);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 74f5b580fe34..ba9ed453c4ed 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -173,6 +173,13 @@ extern int no_unaligned_warning;
173#endif 173#endif
174 174
175#ifdef CONFIG_PROC_SYSCTL 175#ifdef CONFIG_PROC_SYSCTL
176
177#define SYSCTL_WRITES_LEGACY -1
178#define SYSCTL_WRITES_WARN 0
179#define SYSCTL_WRITES_STRICT 1
180
181static int sysctl_writes_strict = SYSCTL_WRITES_WARN;
182
176static int proc_do_cad_pid(struct ctl_table *table, int write, 183static int proc_do_cad_pid(struct ctl_table *table, int write,
177 void __user *buffer, size_t *lenp, loff_t *ppos); 184 void __user *buffer, size_t *lenp, loff_t *ppos);
178static int proc_taint(struct ctl_table *table, int write, 185static int proc_taint(struct ctl_table *table, int write,
@@ -195,7 +202,7 @@ static int proc_dostring_coredump(struct ctl_table *table, int write,
195/* Note: sysrq code uses its own private copy */ 202/* Note: sysrq code uses its own private copy */
196static int __sysrq_enabled = CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE; 203static int __sysrq_enabled = CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE;
197 204
198static int sysrq_sysctl_handler(ctl_table *table, int write, 205static int sysrq_sysctl_handler(struct ctl_table *table, int write,
199 void __user *buffer, size_t *lenp, 206 void __user *buffer, size_t *lenp,
200 loff_t *ppos) 207 loff_t *ppos)
201{ 208{
@@ -495,6 +502,15 @@ static struct ctl_table kern_table[] = {
495 .mode = 0644, 502 .mode = 0644,
496 .proc_handler = proc_taint, 503 .proc_handler = proc_taint,
497 }, 504 },
505 {
506 .procname = "sysctl_writes_strict",
507 .data = &sysctl_writes_strict,
508 .maxlen = sizeof(int),
509 .mode = 0644,
510 .proc_handler = proc_dointvec_minmax,
511 .extra1 = &neg_one,
512 .extra2 = &one,
513 },
498#endif 514#endif
499#ifdef CONFIG_LATENCYTOP 515#ifdef CONFIG_LATENCYTOP
500 { 516 {
@@ -643,7 +659,7 @@ static struct ctl_table kern_table[] = {
643 .extra2 = &one, 659 .extra2 = &one,
644 }, 660 },
645#endif 661#endif
646 662#ifdef CONFIG_UEVENT_HELPER
647 { 663 {
648 .procname = "hotplug", 664 .procname = "hotplug",
649 .data = &uevent_helper, 665 .data = &uevent_helper,
@@ -651,7 +667,7 @@ static struct ctl_table kern_table[] = {
651 .mode = 0644, 667 .mode = 0644,
652 .proc_handler = proc_dostring, 668 .proc_handler = proc_dostring,
653 }, 669 },
654 670#endif
655#ifdef CONFIG_CHR_DEV_SG 671#ifdef CONFIG_CHR_DEV_SG
656 { 672 {
657 .procname = "sg-big-buff", 673 .procname = "sg-big-buff",
@@ -1418,8 +1434,13 @@ static struct ctl_table vm_table[] = {
1418 (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) 1434 (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL))
1419 { 1435 {
1420 .procname = "vdso_enabled", 1436 .procname = "vdso_enabled",
1437#ifdef CONFIG_X86_32
1438 .data = &vdso32_enabled,
1439 .maxlen = sizeof(vdso32_enabled),
1440#else
1421 .data = &vdso_enabled, 1441 .data = &vdso_enabled,
1422 .maxlen = sizeof(vdso_enabled), 1442 .maxlen = sizeof(vdso_enabled),
1443#endif
1423 .mode = 0644, 1444 .mode = 0644,
1424 .proc_handler = proc_dointvec, 1445 .proc_handler = proc_dointvec,
1425 .extra1 = &zero, 1446 .extra1 = &zero,
@@ -1698,8 +1719,8 @@ int __init sysctl_init(void)
1698 1719
1699#ifdef CONFIG_PROC_SYSCTL 1720#ifdef CONFIG_PROC_SYSCTL
1700 1721
1701static int _proc_do_string(void* data, int maxlen, int write, 1722static int _proc_do_string(char *data, int maxlen, int write,
1702 void __user *buffer, 1723 char __user *buffer,
1703 size_t *lenp, loff_t *ppos) 1724 size_t *lenp, loff_t *ppos)
1704{ 1725{
1705 size_t len; 1726 size_t len;
@@ -1712,21 +1733,30 @@ static int _proc_do_string(void* data, int maxlen, int write,
1712 } 1733 }
1713 1734
1714 if (write) { 1735 if (write) {
1715 len = 0; 1736 if (sysctl_writes_strict == SYSCTL_WRITES_STRICT) {
1737 /* Only continue writes not past the end of buffer. */
1738 len = strlen(data);
1739 if (len > maxlen - 1)
1740 len = maxlen - 1;
1741
1742 if (*ppos > len)
1743 return 0;
1744 len = *ppos;
1745 } else {
1746 /* Start writing from beginning of buffer. */
1747 len = 0;
1748 }
1749
1750 *ppos += *lenp;
1716 p = buffer; 1751 p = buffer;
1717 while (len < *lenp) { 1752 while ((p - buffer) < *lenp && len < maxlen - 1) {
1718 if (get_user(c, p++)) 1753 if (get_user(c, p++))
1719 return -EFAULT; 1754 return -EFAULT;
1720 if (c == 0 || c == '\n') 1755 if (c == 0 || c == '\n')
1721 break; 1756 break;
1722 len++; 1757 data[len++] = c;
1723 } 1758 }
1724 if (len >= maxlen) 1759 data[len] = 0;
1725 len = maxlen-1;
1726 if(copy_from_user(data, buffer, len))
1727 return -EFAULT;
1728 ((char *) data)[len] = 0;
1729 *ppos += *lenp;
1730 } else { 1760 } else {
1731 len = strlen(data); 1761 len = strlen(data);
1732 if (len > maxlen) 1762 if (len > maxlen)
@@ -1743,10 +1773,10 @@ static int _proc_do_string(void* data, int maxlen, int write,
1743 if (len > *lenp) 1773 if (len > *lenp)
1744 len = *lenp; 1774 len = *lenp;
1745 if (len) 1775 if (len)
1746 if(copy_to_user(buffer, data, len)) 1776 if (copy_to_user(buffer, data, len))
1747 return -EFAULT; 1777 return -EFAULT;
1748 if (len < *lenp) { 1778 if (len < *lenp) {
1749 if(put_user('\n', ((char __user *) buffer) + len)) 1779 if (put_user('\n', buffer + len))
1750 return -EFAULT; 1780 return -EFAULT;
1751 len++; 1781 len++;
1752 } 1782 }
@@ -1756,6 +1786,14 @@ static int _proc_do_string(void* data, int maxlen, int write,
1756 return 0; 1786 return 0;
1757} 1787}
1758 1788
1789static void warn_sysctl_write(struct ctl_table *table)
1790{
1791 pr_warn_once("%s wrote to %s when file position was not 0!\n"
1792 "This will not be supported in the future. To silence this\n"
1793 "warning, set kernel.sysctl_writes_strict = -1\n",
1794 current->comm, table->procname);
1795}
1796
1759/** 1797/**
1760 * proc_dostring - read a string sysctl 1798 * proc_dostring - read a string sysctl
1761 * @table: the sysctl table 1799 * @table: the sysctl table
@@ -1776,8 +1814,11 @@ static int _proc_do_string(void* data, int maxlen, int write,
1776int proc_dostring(struct ctl_table *table, int write, 1814int proc_dostring(struct ctl_table *table, int write,
1777 void __user *buffer, size_t *lenp, loff_t *ppos) 1815 void __user *buffer, size_t *lenp, loff_t *ppos)
1778{ 1816{
1779 return _proc_do_string(table->data, table->maxlen, write, 1817 if (write && *ppos && sysctl_writes_strict == SYSCTL_WRITES_WARN)
1780 buffer, lenp, ppos); 1818 warn_sysctl_write(table);
1819
1820 return _proc_do_string((char *)(table->data), table->maxlen, write,
1821 (char __user *)buffer, lenp, ppos);
1781} 1822}
1782 1823
1783static size_t proc_skip_spaces(char **buf) 1824static size_t proc_skip_spaces(char **buf)
@@ -1951,6 +1992,18 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
1951 conv = do_proc_dointvec_conv; 1992 conv = do_proc_dointvec_conv;
1952 1993
1953 if (write) { 1994 if (write) {
1995 if (*ppos) {
1996 switch (sysctl_writes_strict) {
1997 case SYSCTL_WRITES_STRICT:
1998 goto out;
1999 case SYSCTL_WRITES_WARN:
2000 warn_sysctl_write(table);
2001 break;
2002 default:
2003 break;
2004 }
2005 }
2006
1954 if (left > PAGE_SIZE - 1) 2007 if (left > PAGE_SIZE - 1)
1955 left = PAGE_SIZE - 1; 2008 left = PAGE_SIZE - 1;
1956 page = __get_free_page(GFP_TEMPORARY); 2009 page = __get_free_page(GFP_TEMPORARY);
@@ -2008,6 +2061,7 @@ free:
2008 return err ? : -EINVAL; 2061 return err ? : -EINVAL;
2009 } 2062 }
2010 *lenp -= left; 2063 *lenp -= left;
2064out:
2011 *ppos += *lenp; 2065 *ppos += *lenp;
2012 return err; 2066 return err;
2013} 2067}
@@ -2200,6 +2254,18 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
2200 left = *lenp; 2254 left = *lenp;
2201 2255
2202 if (write) { 2256 if (write) {
2257 if (*ppos) {
2258 switch (sysctl_writes_strict) {
2259 case SYSCTL_WRITES_STRICT:
2260 goto out;
2261 case SYSCTL_WRITES_WARN:
2262 warn_sysctl_write(table);
2263 break;
2264 default:
2265 break;
2266 }
2267 }
2268
2203 if (left > PAGE_SIZE - 1) 2269 if (left > PAGE_SIZE - 1)
2204 left = PAGE_SIZE - 1; 2270 left = PAGE_SIZE - 1;
2205 page = __get_free_page(GFP_TEMPORARY); 2271 page = __get_free_page(GFP_TEMPORARY);
@@ -2255,6 +2321,7 @@ free:
2255 return err ? : -EINVAL; 2321 return err ? : -EINVAL;
2256 } 2322 }
2257 *lenp -= left; 2323 *lenp -= left;
2324out:
2258 *ppos += *lenp; 2325 *ppos += *lenp;
2259 return err; 2326 return err;
2260} 2327}
@@ -2501,11 +2568,11 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
2501 bool first = 1; 2568 bool first = 1;
2502 size_t left = *lenp; 2569 size_t left = *lenp;
2503 unsigned long bitmap_len = table->maxlen; 2570 unsigned long bitmap_len = table->maxlen;
2504 unsigned long *bitmap = (unsigned long *) table->data; 2571 unsigned long *bitmap = *(unsigned long **) table->data;
2505 unsigned long *tmp_bitmap = NULL; 2572 unsigned long *tmp_bitmap = NULL;
2506 char tr_a[] = { '-', ',', '\n' }, tr_b[] = { ',', '\n', 0 }, c; 2573 char tr_a[] = { '-', ',', '\n' }, tr_b[] = { ',', '\n', 0 }, c;
2507 2574
2508 if (!bitmap_len || !left || (*ppos && !write)) { 2575 if (!bitmap || !bitmap_len || !left || (*ppos && !write)) {
2509 *lenp = 0; 2576 *lenp = 0;
2510 return 0; 2577 return 0;
2511 } 2578 }
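The new kernel.sysctl_writes_strict knob added above controls what happens when a sysctl file is written at a non-zero file offset. A rough userspace model of the integer-handler behaviour only: the return value merely signals whether the write was applied, while the real handlers also advance the file position either way.

#include <stdio.h>

#define WRITES_LEGACY	(-1)
#define WRITES_WARN	0
#define WRITES_STRICT	1

/* Simplified model of how a nonzero file offset is treated on write. */
static int handle_write(int mode, long ppos)
{
	if (ppos != 0) {
		switch (mode) {
		case WRITES_STRICT:
			return 0;	/* ignore the write entirely */
		case WRITES_WARN:
			fprintf(stderr, "warning: write at offset %ld\n", ppos);
			break;		/* warn, then behave as before */
		default:		/* WRITES_LEGACY */
			break;
		}
	}
	puts("value rewritten from the start of the buffer");
	return 1;
}

int main(void)
{
	handle_write(WRITES_WARN, 3);	/* warns, still writes */
	handle_write(WRITES_STRICT, 3);	/* silently ignored */
	handle_write(WRITES_STRICT, 0);	/* offset 0 always works */
	return 0;
}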
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 419a52cecd20..33db43a39515 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -165,21 +165,21 @@ static inline void pps_set_freq(s64 freq)
165 165
166static inline int is_error_status(int status) 166static inline int is_error_status(int status)
167{ 167{
168 return (time_status & (STA_UNSYNC|STA_CLOCKERR)) 168 return (status & (STA_UNSYNC|STA_CLOCKERR))
169 /* PPS signal lost when either PPS time or 169 /* PPS signal lost when either PPS time or
170 * PPS frequency synchronization requested 170 * PPS frequency synchronization requested
171 */ 171 */
172 || ((time_status & (STA_PPSFREQ|STA_PPSTIME)) 172 || ((status & (STA_PPSFREQ|STA_PPSTIME))
173 && !(time_status & STA_PPSSIGNAL)) 173 && !(status & STA_PPSSIGNAL))
174 /* PPS jitter exceeded when 174 /* PPS jitter exceeded when
175 * PPS time synchronization requested */ 175 * PPS time synchronization requested */
176 || ((time_status & (STA_PPSTIME|STA_PPSJITTER)) 176 || ((status & (STA_PPSTIME|STA_PPSJITTER))
177 == (STA_PPSTIME|STA_PPSJITTER)) 177 == (STA_PPSTIME|STA_PPSJITTER))
178 /* PPS wander exceeded or calibration error when 178 /* PPS wander exceeded or calibration error when
179 * PPS frequency synchronization requested 179 * PPS frequency synchronization requested
180 */ 180 */
181 || ((time_status & STA_PPSFREQ) 181 || ((status & STA_PPSFREQ)
182 && (time_status & (STA_PPSWANDER|STA_PPSERROR))); 182 && (status & (STA_PPSWANDER|STA_PPSERROR)));
183} 183}
184 184
185static inline void pps_fill_timex(struct timex *txc) 185static inline void pps_fill_timex(struct timex *txc)
@@ -786,8 +786,9 @@ static long hardpps_update_freq(struct pps_normtime freq_norm)
786 time_status |= STA_PPSERROR; 786 time_status |= STA_PPSERROR;
787 pps_errcnt++; 787 pps_errcnt++;
788 pps_dec_freq_interval(); 788 pps_dec_freq_interval();
789 pr_err("hardpps: PPSERROR: interval too long - %ld s\n", 789 printk_deferred(KERN_ERR
790 freq_norm.sec); 790 "hardpps: PPSERROR: interval too long - %ld s\n",
791 freq_norm.sec);
791 return 0; 792 return 0;
792 } 793 }
793 794
@@ -800,7 +801,8 @@ static long hardpps_update_freq(struct pps_normtime freq_norm)
800 delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT); 801 delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT);
801 pps_freq = ftemp; 802 pps_freq = ftemp;
802 if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) { 803 if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) {
803 pr_warning("hardpps: PPSWANDER: change=%ld\n", delta); 804 printk_deferred(KERN_WARNING
805 "hardpps: PPSWANDER: change=%ld\n", delta);
804 time_status |= STA_PPSWANDER; 806 time_status |= STA_PPSWANDER;
805 pps_stbcnt++; 807 pps_stbcnt++;
806 pps_dec_freq_interval(); 808 pps_dec_freq_interval();
@@ -844,8 +846,9 @@ static void hardpps_update_phase(long error)
844 * the time offset is updated. 846 * the time offset is updated.
845 */ 847 */
846 if (jitter > (pps_jitter << PPS_POPCORN)) { 848 if (jitter > (pps_jitter << PPS_POPCORN)) {
847 pr_warning("hardpps: PPSJITTER: jitter=%ld, limit=%ld\n", 849 printk_deferred(KERN_WARNING
848 jitter, (pps_jitter << PPS_POPCORN)); 850 "hardpps: PPSJITTER: jitter=%ld, limit=%ld\n",
851 jitter, (pps_jitter << PPS_POPCORN));
849 time_status |= STA_PPSJITTER; 852 time_status |= STA_PPSJITTER;
850 pps_jitcnt++; 853 pps_jitcnt++;
851 } else if (time_status & STA_PPSTIME) { 854 } else if (time_status & STA_PPSTIME) {
@@ -902,7 +905,7 @@ void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
902 time_status |= STA_PPSJITTER; 905 time_status |= STA_PPSJITTER;
903 /* restart the frequency calibration interval */ 906 /* restart the frequency calibration interval */
904 pps_fbase = *raw_ts; 907 pps_fbase = *raw_ts;
905 pr_err("hardpps: PPSJITTER: bad pulse\n"); 908 printk_deferred(KERN_ERR "hardpps: PPSJITTER: bad pulse\n");
906 return; 909 return;
907 } 910 }
908 911
@@ -923,7 +926,10 @@ void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
923 926
924static int __init ntp_tick_adj_setup(char *str) 927static int __init ntp_tick_adj_setup(char *str)
925{ 928{
926 ntp_tick_adj = simple_strtol(str, NULL, 0); 929 int rc = kstrtol(str, 0, (long *)&ntp_tick_adj);
930
931 if (rc)
932 return rc;
927 ntp_tick_adj <<= NTP_SCALE_SHIFT; 933 ntp_tick_adj <<= NTP_SCALE_SHIFT;
928 934
929 return 1; 935 return 1;
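The ntp_tick_adj_setup() change above swaps simple_strtol() for kstrtol(), i.e. parsing that can actually report failure. A userspace equivalent of that style of error-checked parsing, using strtol() rather than the kernel helper:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* Parse like the boot parameter: reject trailing junk and overflow. */
static int parse_long(const char *s, long *out)
{
	char *end;

	errno = 0;
	*out = strtol(s, &end, 0);
	if (errno == ERANGE || end == s || *end != '\0')
		return -1;
	return 0;
}

int main(void)
{
	long v;

	printf("%d\n", parse_long("0x10", &v));		/* 0, v == 16 */
	printf("%d\n", parse_long("12abc", &v));	/* -1: trailing junk */
	return 0;
}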
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 4d23dc4d8139..445106d2c729 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -49,13 +49,6 @@ static u64 notrace jiffy_sched_clock_read(void)
49 return (u64)(jiffies - INITIAL_JIFFIES); 49 return (u64)(jiffies - INITIAL_JIFFIES);
50} 50}
51 51
52static u32 __read_mostly (*read_sched_clock_32)(void);
53
54static u64 notrace read_sched_clock_32_wrapper(void)
55{
56 return read_sched_clock_32();
57}
58
59static u64 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read; 52static u64 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read;
60 53
61static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) 54static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
@@ -176,12 +169,6 @@ void __init sched_clock_register(u64 (*read)(void), int bits,
176 pr_debug("Registered %pF as sched_clock source\n", read); 169 pr_debug("Registered %pF as sched_clock source\n", read);
177} 170}
178 171
179void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate)
180{
181 read_sched_clock_32 = read;
182 sched_clock_register(read_sched_clock_32_wrapper, bits, rate);
183}
184
185void __init sched_clock_postinit(void) 172void __init sched_clock_postinit(void)
186{ 173{
187 /* 174 /*
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 015661279b68..0a0608edeb26 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -276,7 +276,7 @@ static bool tick_check_preferred(struct clock_event_device *curdev,
276bool tick_check_replacement(struct clock_event_device *curdev, 276bool tick_check_replacement(struct clock_event_device *curdev,
277 struct clock_event_device *newdev) 277 struct clock_event_device *newdev)
278{ 278{
279 if (tick_check_percpu(curdev, newdev, smp_processor_id())) 279 if (!tick_check_percpu(curdev, newdev, smp_processor_id()))
280 return false; 280 return false;
281 281
282 return tick_check_preferred(curdev, newdev); 282 return tick_check_preferred(curdev, newdev);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 9f8af69c67ec..6558b7ac112d 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -84,6 +84,9 @@ static void tick_do_update_jiffies64(ktime_t now)
84 84
85 /* Keep the tick_next_period variable up to date */ 85 /* Keep the tick_next_period variable up to date */
86 tick_next_period = ktime_add(last_jiffies_update, tick_period); 86 tick_next_period = ktime_add(last_jiffies_update, tick_period);
87 } else {
88 write_sequnlock(&jiffies_lock);
89 return;
87 } 90 }
88 write_sequnlock(&jiffies_lock); 91 write_sequnlock(&jiffies_lock);
89 update_wall_time(); 92 update_wall_time();
@@ -967,7 +970,7 @@ static void tick_nohz_switch_to_nohz(void)
967 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 970 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
968 ktime_t next; 971 ktime_t next;
969 972
970 if (!tick_nohz_active) 973 if (!tick_nohz_enabled)
971 return; 974 return;
972 975
973 local_irq_disable(); 976 local_irq_disable();
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index f7df8ea21707..32d8d6aaedb8 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -852,8 +852,9 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
852 struct timespec *delta) 852 struct timespec *delta)
853{ 853{
854 if (!timespec_valid_strict(delta)) { 854 if (!timespec_valid_strict(delta)) {
855 printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid " 855 printk_deferred(KERN_WARNING
856 "sleep delta value!\n"); 856 "__timekeeping_inject_sleeptime: Invalid "
857 "sleep delta value!\n");
857 return; 858 return;
858 } 859 }
859 tk_xtime_add(tk, delta); 860 tk_xtime_add(tk, delta);
@@ -1157,7 +1158,7 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
1157 1158
1158 if (unlikely(tk->clock->maxadj && 1159 if (unlikely(tk->clock->maxadj &&
1159 (tk->mult + adj > tk->clock->mult + tk->clock->maxadj))) { 1160 (tk->mult + adj > tk->clock->mult + tk->clock->maxadj))) {
1160 printk_once(KERN_WARNING 1161 printk_deferred_once(KERN_WARNING
1161 "Adjusting %s more than 11%% (%ld vs %ld)\n", 1162 "Adjusting %s more than 11%% (%ld vs %ld)\n",
1162 tk->clock->name, (long)tk->mult + adj, 1163 tk->clock->name, (long)tk->mult + adj,
1163 (long)tk->clock->mult + tk->clock->maxadj); 1164 (long)tk->clock->mult + tk->clock->maxadj);
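The ntp.c and timekeeping.c hunks above convert printk()/pr_warning()/printk_once() calls to their printk_deferred() variants because these messages are emitted while timekeeping locks are held, and a synchronous console print from that context can deadlock against the locks printk itself needs. As a rough user-space analogue (not the kernel implementation; build with -pthread), the sketch below records a message inside the critical section and only does the actual output after the lock is dropped; deferred_log() and friends are made-up names.

        #include <pthread.h>
        #include <stdio.h>

        static pthread_mutex_t state_lock = PTHREAD_MUTEX_INITIALIZER;
        static char deferred_msg[128];          /* one pending message is enough here */

        static void deferred_log(const char *msg)
        {
                /* Record the message; printing happens later, outside the lock. */
                snprintf(deferred_msg, sizeof(deferred_msg), "%s", msg);
        }

        static void flush_deferred_log(void)
        {
                if (deferred_msg[0]) {
                        fprintf(stderr, "%s\n", deferred_msg);
                        deferred_msg[0] = '\0';
                }
        }

        int main(void)
        {
                pthread_mutex_lock(&state_lock);
                /* ... update timekeeping-like state ... */
                deferred_log("invalid sleep delta value");      /* no I/O under the lock */
                pthread_mutex_unlock(&state_lock);

                flush_deferred_log();                           /* console work done here */
                return 0;
        }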
diff --git a/kernel/timer.c b/kernel/timer.c
index 87bd529879c2..3bb01a323b2a 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -838,7 +838,7 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires)
838 838
839 bit = find_last_bit(&mask, BITS_PER_LONG); 839 bit = find_last_bit(&mask, BITS_PER_LONG);
840 840
841 mask = (1 << bit) - 1; 841 mask = (1UL << bit) - 1;
842 842
843 expires_limit = expires_limit & ~(mask); 843 expires_limit = expires_limit & ~(mask);
844 844
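The one-line timer.c fix above matters because the shift count in apply_slack() can exceed 31: with a plain "1 << bit" the constant is an int, so the shift overflows and the mask can never cover the upper half of a 64-bit value. A small stand-alone demonstration of the corrected form, assuming a 64-bit long as on the architectures where this bites:

        #include <stdio.h>

        int main(void)
        {
                unsigned long mask;
                int bit;

                /*
                 * apply_slack() wants a mask of all bits below "bit".  Writing
                 * "1 << bit" performs the shift in int, which is undefined once
                 * bit reaches 31; "1UL << bit" keeps the arithmetic in unsigned
                 * long and is valid for any bit < BITS_PER_LONG (64-bit long
                 * assumed here).
                 */
                for (bit = 8; bit < 64; bit += 24) {
                        mask = (1UL << bit) - 1;
                        printf("bit=%2d mask=%#018lx\n", bit, mask);
                }
                return 0;
        }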
diff --git a/kernel/torture.c b/kernel/torture.c
index acc9afc2f26e..40bb511cca48 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -335,13 +335,8 @@ static void torture_shuffle_tasks(void)
335 shuffle_idle_cpu = cpumask_next(shuffle_idle_cpu, shuffle_tmp_mask); 335 shuffle_idle_cpu = cpumask_next(shuffle_idle_cpu, shuffle_tmp_mask);
336 if (shuffle_idle_cpu >= nr_cpu_ids) 336 if (shuffle_idle_cpu >= nr_cpu_ids)
337 shuffle_idle_cpu = -1; 337 shuffle_idle_cpu = -1;
338 if (shuffle_idle_cpu != -1) { 338 else
339 cpumask_clear_cpu(shuffle_idle_cpu, shuffle_tmp_mask); 339 cpumask_clear_cpu(shuffle_idle_cpu, shuffle_tmp_mask);
340 if (cpumask_empty(shuffle_tmp_mask)) {
341 put_online_cpus();
342 return;
343 }
344 }
345 340
346 mutex_lock(&shuffle_task_mutex); 341 mutex_lock(&shuffle_task_mutex);
347 list_for_each_entry(stp, &shuffle_task_list, st_l) 342 list_for_each_entry(stp, &shuffle_task_list, st_l)
@@ -533,7 +528,11 @@ void stutter_wait(const char *title)
533 while (ACCESS_ONCE(stutter_pause_test) || 528 while (ACCESS_ONCE(stutter_pause_test) ||
534 (torture_runnable && !ACCESS_ONCE(*torture_runnable))) { 529 (torture_runnable && !ACCESS_ONCE(*torture_runnable))) {
535 if (stutter_pause_test) 530 if (stutter_pause_test)
536 schedule_timeout_interruptible(1); 531 if (ACCESS_ONCE(stutter_pause_test) == 1)
532 schedule_timeout_interruptible(1);
533 else
534 while (ACCESS_ONCE(stutter_pause_test))
535 cond_resched();
537 else 536 else
538 schedule_timeout_interruptible(round_jiffies_relative(HZ)); 537 schedule_timeout_interruptible(round_jiffies_relative(HZ));
539 torture_shutdown_absorb(title); 538 torture_shutdown_absorb(title);
@@ -550,7 +549,11 @@ static int torture_stutter(void *arg)
550 VERBOSE_TOROUT_STRING("torture_stutter task started"); 549 VERBOSE_TOROUT_STRING("torture_stutter task started");
551 do { 550 do {
552 if (!torture_must_stop()) { 551 if (!torture_must_stop()) {
553 schedule_timeout_interruptible(stutter); 552 if (stutter > 1) {
553 schedule_timeout_interruptible(stutter - 1);
554 ACCESS_ONCE(stutter_pause_test) = 2;
555 }
556 schedule_timeout_interruptible(1);
554 ACCESS_ONCE(stutter_pause_test) = 1; 557 ACCESS_ONCE(stutter_pause_test) = 1;
555 } 558 }
556 if (!torture_must_stop()) 559 if (!torture_must_stop())
@@ -596,21 +599,27 @@ static void torture_stutter_cleanup(void)
596 * The runnable parameter points to a flag that controls whether or not 599 * The runnable parameter points to a flag that controls whether or not
597 * the test is currently runnable. If there is no such flag, pass in NULL. 600 * the test is currently runnable. If there is no such flag, pass in NULL.
598 */ 601 */
599void __init torture_init_begin(char *ttype, bool v, int *runnable) 602bool torture_init_begin(char *ttype, bool v, int *runnable)
600{ 603{
601 mutex_lock(&fullstop_mutex); 604 mutex_lock(&fullstop_mutex);
605 if (torture_type != NULL) {
606 pr_alert("torture_init_begin: refusing %s init: %s running",
607 ttype, torture_type);
608 mutex_unlock(&fullstop_mutex);
609 return false;
610 }
602 torture_type = ttype; 611 torture_type = ttype;
603 verbose = v; 612 verbose = v;
604 torture_runnable = runnable; 613 torture_runnable = runnable;
605 fullstop = FULLSTOP_DONTSTOP; 614 fullstop = FULLSTOP_DONTSTOP;
606 615 return true;
607} 616}
608EXPORT_SYMBOL_GPL(torture_init_begin); 617EXPORT_SYMBOL_GPL(torture_init_begin);
609 618
610/* 619/*
611 * Tell the torture module that initialization is complete. 620 * Tell the torture module that initialization is complete.
612 */ 621 */
613void __init torture_init_end(void) 622void torture_init_end(void)
614{ 623{
615 mutex_unlock(&fullstop_mutex); 624 mutex_unlock(&fullstop_mutex);
616 register_reboot_notifier(&torture_shutdown_nb); 625 register_reboot_notifier(&torture_shutdown_nb);
@@ -642,6 +651,9 @@ bool torture_cleanup(void)
642 torture_shuffle_cleanup(); 651 torture_shuffle_cleanup();
643 torture_stutter_cleanup(); 652 torture_stutter_cleanup();
644 torture_onoff_cleanup(); 653 torture_onoff_cleanup();
654 mutex_lock(&fullstop_mutex);
655 torture_type = NULL;
656 mutex_unlock(&fullstop_mutex);
645 return false; 657 return false;
646} 658}
647EXPORT_SYMBOL_GPL(torture_cleanup); 659EXPORT_SYMBOL_GPL(torture_cleanup);
@@ -674,8 +686,10 @@ EXPORT_SYMBOL_GPL(torture_must_stop_irq);
674 */ 686 */
675void torture_kthread_stopping(char *title) 687void torture_kthread_stopping(char *title)
676{ 688{
677 if (verbose) 689 char buf[128];
678 VERBOSE_TOROUT_STRING(title); 690
691 snprintf(buf, sizeof(buf), "Stopping %s", title);
692 VERBOSE_TOROUT_STRING(buf);
679 while (!kthread_should_stop()) { 693 while (!kthread_should_stop()) {
680 torture_shutdown_absorb(title); 694 torture_shutdown_absorb(title);
681 schedule_timeout_uninterruptible(1); 695 schedule_timeout_uninterruptible(1);
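torture_init_begin() above now returns bool and refuses to start a second torture test while one is already registered, with torture_cleanup() clearing the registration under the same mutex. A user-space sketch of that claim/refuse/release pattern (hypothetical names, a pthread mutex standing in for fullstop_mutex; build with -pthread):

        #include <pthread.h>
        #include <stdbool.h>
        #include <stdio.h>

        static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER;
        static const char *current_test;        /* stands in for torture_type */

        static bool test_init_begin(const char *type)
        {
                pthread_mutex_lock(&init_lock);
                if (current_test) {
                        fprintf(stderr, "refusing %s init: %s running\n",
                                type, current_test);
                        pthread_mutex_unlock(&init_lock);
                        return false;           /* caller must not proceed */
                }
                current_test = type;
                /* the real code keeps the mutex held until torture_init_end() */
                pthread_mutex_unlock(&init_lock);
                return true;
        }

        static void test_cleanup(void)
        {
                pthread_mutex_lock(&init_lock);
                current_test = NULL;            /* mirrors torture_cleanup() */
                pthread_mutex_unlock(&init_lock);
        }

        int main(void)
        {
                test_init_begin("rcu");         /* succeeds */
                test_init_begin("lock");        /* refused while "rcu" runs */
                test_cleanup();
                test_init_begin("lock");        /* succeeds now */
                return 0;
        }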
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 8639819f6cef..d4409356f40d 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -535,6 +535,36 @@ config MMIOTRACE_TEST
535 535
536 Say N, unless you absolutely know what you are doing. 536 Say N, unless you absolutely know what you are doing.
537 537
538config TRACEPOINT_BENCHMARK
539 bool "Add tracepoint that benchmarks tracepoints"
540 help
541 This option creates the tracepoint "benchmark:benchmark_event".
542 When the tracepoint is enabled, it kicks off a kernel thread that
 543 goes into an infinite loop (calling cond_resched() to let other tasks
 544 run), and calls the tracepoint. Each iteration will record the time
 545 it took to write to the tracepoint, and on the next iteration that
546 data will be passed to the tracepoint itself. That is, the tracepoint
547 will report the time it took to do the previous tracepoint.
548 The string written to the tracepoint is a static string of 128 bytes
549 to keep the time the same. The initial string is simply a write of
550 "START". The second string records the cold cache time of the first
551 write which is not added to the rest of the calculations.
552
553 As it is a tight loop, it benchmarks as hot cache. That's fine because
554 we care most about hot paths that are probably in cache already.
555
556 An example of the output:
557
558 START
559 first=3672 [COLD CACHED]
560 last=632 first=3672 max=632 min=632 avg=316 std=446 std^2=199712
561 last=278 first=3672 max=632 min=278 avg=303 std=316 std^2=100337
562 last=277 first=3672 max=632 min=277 avg=296 std=258 std^2=67064
563 last=273 first=3672 max=632 min=273 avg=292 std=224 std^2=50411
564 last=273 first=3672 max=632 min=273 avg=288 std=200 std^2=40389
565 last=281 first=3672 max=632 min=273 avg=287 std=183 std^2=33666
566
567
538config RING_BUFFER_BENCHMARK 568config RING_BUFFER_BENCHMARK
539 tristate "Ring buffer benchmark stress tester" 569 tristate "Ring buffer benchmark stress tester"
540 depends on RING_BUFFER 570 depends on RING_BUFFER
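The TRACEPOINT_BENCHMARK help text above reports, for each iteration, the last/max/min/avg timings plus std and std^2. The fixed-point arithmetic lives in trace_benchmark.c, added by this series; the floating-point sketch below (compile with -lm) only illustrates what those columns mean, feeding it the "last=" values quoted in the sample output, so its numbers will not exactly match the kernel's own integer math and rounding.

        #include <math.h>
        #include <stdio.h>

        int main(void)
        {
                /* The "last=" values from the sample output above; the cold
                 * first=3672 write is excluded, as the help text explains. */
                const double samples[] = { 632, 278, 277, 273, 273, 281 };
                const int n = sizeof(samples) / sizeof(samples[0]);
                double sum = 0.0, sumsq = 0.0;
                double min = samples[0], max = samples[0];

                for (int i = 0; i < n; i++) {
                        double last = samples[i];
                        double avg, var;

                        sum += last;
                        sumsq += last * last;
                        if (last < min)
                                min = last;
                        if (last > max)
                                max = last;

                        avg = sum / (i + 1);
                        var = sumsq / (i + 1) - avg * avg;      /* the "std^2" column */

                        printf("last=%.0f max=%.0f min=%.0f avg=%.0f std=%.0f std^2=%.0f\n",
                               last, max, min, avg, sqrt(var), var);
                }
                return 0;
        }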
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 1378e84fbe39..2611613f14f1 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -17,6 +17,7 @@ ifdef CONFIG_TRACING_BRANCHES
17KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING 17KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
18endif 18endif
19 19
20CFLAGS_trace_benchmark.o := -I$(src)
20CFLAGS_trace_events_filter.o := -I$(src) 21CFLAGS_trace_events_filter.o := -I$(src)
21 22
22obj-$(CONFIG_TRACE_CLOCK) += trace_clock.o 23obj-$(CONFIG_TRACE_CLOCK) += trace_clock.o
@@ -62,4 +63,6 @@ endif
62obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o 63obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o
63obj-$(CONFIG_UPROBE_EVENT) += trace_uprobe.o 64obj-$(CONFIG_UPROBE_EVENT) += trace_uprobe.o
64 65
66obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o
67
65libftrace-y := ftrace.o 68libftrace-y := ftrace.o
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 1fd4b9479210..5b372e3ed675 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -62,7 +62,7 @@
62#define FTRACE_HASH_DEFAULT_BITS 10 62#define FTRACE_HASH_DEFAULT_BITS 10
63#define FTRACE_HASH_MAX_BITS 12 63#define FTRACE_HASH_MAX_BITS 12
64 64
65#define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_CONTROL) 65#define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_CONTROL)
66 66
67#ifdef CONFIG_DYNAMIC_FTRACE 67#ifdef CONFIG_DYNAMIC_FTRACE
68#define INIT_REGEX_LOCK(opsname) \ 68#define INIT_REGEX_LOCK(opsname) \
@@ -103,7 +103,6 @@ static int ftrace_disabled __read_mostly;
103 103
104static DEFINE_MUTEX(ftrace_lock); 104static DEFINE_MUTEX(ftrace_lock);
105 105
106static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end;
107static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end; 106static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end;
108static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; 107static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end;
109ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; 108ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
@@ -171,23 +170,6 @@ int ftrace_nr_registered_ops(void)
171 return cnt; 170 return cnt;
172} 171}
173 172
174static void
175ftrace_global_list_func(unsigned long ip, unsigned long parent_ip,
176 struct ftrace_ops *op, struct pt_regs *regs)
177{
178 int bit;
179
180 bit = trace_test_and_set_recursion(TRACE_GLOBAL_START, TRACE_GLOBAL_MAX);
181 if (bit < 0)
182 return;
183
184 do_for_each_ftrace_op(op, ftrace_global_list) {
185 op->func(ip, parent_ip, op, regs);
186 } while_for_each_ftrace_op(op);
187
188 trace_clear_recursion(bit);
189}
190
191static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, 173static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip,
192 struct ftrace_ops *op, struct pt_regs *regs) 174 struct ftrace_ops *op, struct pt_regs *regs)
193{ 175{
@@ -237,43 +219,6 @@ static int control_ops_alloc(struct ftrace_ops *ops)
237 return 0; 219 return 0;
238} 220}
239 221
240static void update_global_ops(void)
241{
242 ftrace_func_t func = ftrace_global_list_func;
243 void *private = NULL;
244
245 /* The list has its own recursion protection. */
246 global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE;
247
248 /*
249 * If there's only one function registered, then call that
250 * function directly. Otherwise, we need to iterate over the
251 * registered callers.
252 */
253 if (ftrace_global_list == &ftrace_list_end ||
254 ftrace_global_list->next == &ftrace_list_end) {
255 func = ftrace_global_list->func;
256 private = ftrace_global_list->private;
257 /*
258 * As we are calling the function directly.
259 * If it does not have recursion protection,
260 * the function_trace_op needs to be updated
261 * accordingly.
262 */
263 if (!(ftrace_global_list->flags & FTRACE_OPS_FL_RECURSION_SAFE))
264 global_ops.flags &= ~FTRACE_OPS_FL_RECURSION_SAFE;
265 }
266
267 /* If we filter on pids, update to use the pid function */
268 if (!list_empty(&ftrace_pids)) {
269 set_ftrace_pid_function(func);
270 func = ftrace_pid_func;
271 }
272
273 global_ops.func = func;
274 global_ops.private = private;
275}
276
277static void ftrace_sync(struct work_struct *work) 222static void ftrace_sync(struct work_struct *work)
278{ 223{
279 /* 224 /*
@@ -301,8 +246,6 @@ static void update_ftrace_function(void)
301{ 246{
302 ftrace_func_t func; 247 ftrace_func_t func;
303 248
304 update_global_ops();
305
306 /* 249 /*
307 * If we are at the end of the list and this ops is 250 * If we are at the end of the list and this ops is
308 * recursion safe and not dynamic and the arch supports passing ops, 251 * recursion safe and not dynamic and the arch supports passing ops,
@@ -314,10 +257,7 @@ static void update_ftrace_function(void)
314 (ftrace_ops_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) && 257 (ftrace_ops_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) &&
315 !FTRACE_FORCE_LIST_FUNC)) { 258 !FTRACE_FORCE_LIST_FUNC)) {
316 /* Set the ftrace_ops that the arch callback uses */ 259 /* Set the ftrace_ops that the arch callback uses */
317 if (ftrace_ops_list == &global_ops) 260 set_function_trace_op = ftrace_ops_list;
318 set_function_trace_op = ftrace_global_list;
319 else
320 set_function_trace_op = ftrace_ops_list;
321 func = ftrace_ops_list->func; 261 func = ftrace_ops_list->func;
322 } else { 262 } else {
323 /* Just use the default ftrace_ops */ 263 /* Just use the default ftrace_ops */
@@ -373,6 +313,11 @@ static void update_ftrace_function(void)
373 ftrace_trace_function = func; 313 ftrace_trace_function = func;
374} 314}
375 315
316int using_ftrace_ops_list_func(void)
317{
318 return ftrace_trace_function == ftrace_ops_list_func;
319}
320
376static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) 321static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)
377{ 322{
378 ops->next = *list; 323 ops->next = *list;
@@ -434,16 +379,9 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
434 if (ops->flags & FTRACE_OPS_FL_DELETED) 379 if (ops->flags & FTRACE_OPS_FL_DELETED)
435 return -EINVAL; 380 return -EINVAL;
436 381
437 if (FTRACE_WARN_ON(ops == &global_ops))
438 return -EINVAL;
439
440 if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED)) 382 if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED))
441 return -EBUSY; 383 return -EBUSY;
442 384
443 /* We don't support both control and global flags set. */
444 if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK)
445 return -EINVAL;
446
447#ifndef CONFIG_DYNAMIC_FTRACE_WITH_REGS 385#ifndef CONFIG_DYNAMIC_FTRACE_WITH_REGS
448 /* 386 /*
449 * If the ftrace_ops specifies SAVE_REGS, then it only can be used 387 * If the ftrace_ops specifies SAVE_REGS, then it only can be used
@@ -461,10 +399,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
461 if (!core_kernel_data((unsigned long)ops)) 399 if (!core_kernel_data((unsigned long)ops))
462 ops->flags |= FTRACE_OPS_FL_DYNAMIC; 400 ops->flags |= FTRACE_OPS_FL_DYNAMIC;
463 401
464 if (ops->flags & FTRACE_OPS_FL_GLOBAL) { 402 if (ops->flags & FTRACE_OPS_FL_CONTROL) {
465 add_ftrace_list_ops(&ftrace_global_list, &global_ops, ops);
466 ops->flags |= FTRACE_OPS_FL_ENABLED;
467 } else if (ops->flags & FTRACE_OPS_FL_CONTROL) {
468 if (control_ops_alloc(ops)) 403 if (control_ops_alloc(ops))
469 return -ENOMEM; 404 return -ENOMEM;
470 add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops); 405 add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops);
@@ -484,15 +419,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
484 if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED))) 419 if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED)))
485 return -EBUSY; 420 return -EBUSY;
486 421
487 if (FTRACE_WARN_ON(ops == &global_ops)) 422 if (ops->flags & FTRACE_OPS_FL_CONTROL) {
488 return -EINVAL;
489
490 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
491 ret = remove_ftrace_list_ops(&ftrace_global_list,
492 &global_ops, ops);
493 if (!ret)
494 ops->flags &= ~FTRACE_OPS_FL_ENABLED;
495 } else if (ops->flags & FTRACE_OPS_FL_CONTROL) {
496 ret = remove_ftrace_list_ops(&ftrace_control_list, 423 ret = remove_ftrace_list_ops(&ftrace_control_list,
497 &control_ops, ops); 424 &control_ops, ops);
498 } else 425 } else
@@ -895,7 +822,7 @@ function_profile_call(unsigned long ip, unsigned long parent_ip,
895 822
896 local_irq_save(flags); 823 local_irq_save(flags);
897 824
898 stat = &__get_cpu_var(ftrace_profile_stats); 825 stat = this_cpu_ptr(&ftrace_profile_stats);
899 if (!stat->hash || !ftrace_profile_enabled) 826 if (!stat->hash || !ftrace_profile_enabled)
900 goto out; 827 goto out;
901 828
@@ -926,7 +853,7 @@ static void profile_graph_return(struct ftrace_graph_ret *trace)
926 unsigned long flags; 853 unsigned long flags;
927 854
928 local_irq_save(flags); 855 local_irq_save(flags);
929 stat = &__get_cpu_var(ftrace_profile_stats); 856 stat = this_cpu_ptr(&ftrace_profile_stats);
930 if (!stat->hash || !ftrace_profile_enabled) 857 if (!stat->hash || !ftrace_profile_enabled)
931 goto out; 858 goto out;
932 859
@@ -1178,7 +1105,7 @@ struct ftrace_page {
1178static struct ftrace_page *ftrace_pages_start; 1105static struct ftrace_page *ftrace_pages_start;
1179static struct ftrace_page *ftrace_pages; 1106static struct ftrace_page *ftrace_pages;
1180 1107
1181static bool ftrace_hash_empty(struct ftrace_hash *hash) 1108static bool __always_inline ftrace_hash_empty(struct ftrace_hash *hash)
1182{ 1109{
1183 return !hash || !hash->count; 1110 return !hash || !hash->count;
1184} 1111}
@@ -1625,7 +1552,14 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1625 in_other_hash = !!ftrace_lookup_ip(other_hash, rec->ip); 1552 in_other_hash = !!ftrace_lookup_ip(other_hash, rec->ip);
1626 1553
1627 /* 1554 /*
1555 * If filter_hash is set, we want to match all functions
1556 * that are in the hash but not in the other hash.
1628 * 1557 *
1558 * If filter_hash is not set, then we are decrementing.
1559 * That means we match anything that is in the hash
1560 * and also in the other_hash. That is, we need to turn
1561 * off functions in the other hash because they are disabled
1562 * by this hash.
1629 */ 1563 */
1630 if (filter_hash && in_hash && !in_other_hash) 1564 if (filter_hash && in_hash && !in_other_hash)
1631 match = 1; 1565 match = 1;
@@ -1767,19 +1701,15 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
1767 /* 1701 /*
1768 * If this record is being updated from a nop, then 1702 * If this record is being updated from a nop, then
1769 * return UPDATE_MAKE_CALL. 1703 * return UPDATE_MAKE_CALL.
1770 * Otherwise, if the EN flag is set, then return
1771 * UPDATE_MODIFY_CALL_REGS to tell the caller to convert
1772 * from the non-save regs, to a save regs function.
1773 * Otherwise, 1704 * Otherwise,
1774 * return UPDATE_MODIFY_CALL to tell the caller to convert 1705 * return UPDATE_MODIFY_CALL to tell the caller to convert
1775 * from the save regs, to a non-save regs function. 1706 * from the save regs, to a non-save regs function or
1707 * vice versa.
1776 */ 1708 */
1777 if (flag & FTRACE_FL_ENABLED) 1709 if (flag & FTRACE_FL_ENABLED)
1778 return FTRACE_UPDATE_MAKE_CALL; 1710 return FTRACE_UPDATE_MAKE_CALL;
1779 else if (rec->flags & FTRACE_FL_REGS_EN) 1711
1780 return FTRACE_UPDATE_MODIFY_CALL_REGS; 1712 return FTRACE_UPDATE_MODIFY_CALL;
1781 else
1782 return FTRACE_UPDATE_MODIFY_CALL;
1783 } 1713 }
1784 1714
1785 if (update) { 1715 if (update) {
@@ -1821,6 +1751,42 @@ int ftrace_test_record(struct dyn_ftrace *rec, int enable)
1821 return ftrace_check_record(rec, enable, 0); 1751 return ftrace_check_record(rec, enable, 0);
1822} 1752}
1823 1753
1754/**
1755 * ftrace_get_addr_new - Get the call address to set to
1756 * @rec: The ftrace record descriptor
1757 *
1758 * If the record has the FTRACE_FL_REGS set, that means that it
1759 * wants to convert to a callback that saves all regs. If FTRACE_FL_REGS
 1760 * is not set, then it wants to convert to the normal callback.
1761 *
1762 * Returns the address of the trampoline to set to
1763 */
1764unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec)
1765{
1766 if (rec->flags & FTRACE_FL_REGS)
1767 return (unsigned long)FTRACE_REGS_ADDR;
1768 else
1769 return (unsigned long)FTRACE_ADDR;
1770}
1771
1772/**
1773 * ftrace_get_addr_curr - Get the call address that is already there
1774 * @rec: The ftrace record descriptor
1775 *
1776 * The FTRACE_FL_REGS_EN is set when the record already points to
1777 * a function that saves all the regs. Basically the '_EN' version
1778 * represents the current state of the function.
1779 *
1780 * Returns the address of the trampoline that is currently being called
1781 */
1782unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec)
1783{
1784 if (rec->flags & FTRACE_FL_REGS_EN)
1785 return (unsigned long)FTRACE_REGS_ADDR;
1786 else
1787 return (unsigned long)FTRACE_ADDR;
1788}
1789
1824static int 1790static int
1825__ftrace_replace_code(struct dyn_ftrace *rec, int enable) 1791__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
1826{ 1792{
@@ -1828,12 +1794,12 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
1828 unsigned long ftrace_addr; 1794 unsigned long ftrace_addr;
1829 int ret; 1795 int ret;
1830 1796
1831 ret = ftrace_update_record(rec, enable); 1797 ftrace_addr = ftrace_get_addr_new(rec);
1832 1798
1833 if (rec->flags & FTRACE_FL_REGS) 1799 /* This needs to be done before we call ftrace_update_record */
1834 ftrace_addr = (unsigned long)FTRACE_REGS_ADDR; 1800 ftrace_old_addr = ftrace_get_addr_curr(rec);
1835 else 1801
1836 ftrace_addr = (unsigned long)FTRACE_ADDR; 1802 ret = ftrace_update_record(rec, enable);
1837 1803
1838 switch (ret) { 1804 switch (ret) {
1839 case FTRACE_UPDATE_IGNORE: 1805 case FTRACE_UPDATE_IGNORE:
@@ -1845,13 +1811,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
1845 case FTRACE_UPDATE_MAKE_NOP: 1811 case FTRACE_UPDATE_MAKE_NOP:
1846 return ftrace_make_nop(NULL, rec, ftrace_addr); 1812 return ftrace_make_nop(NULL, rec, ftrace_addr);
1847 1813
1848 case FTRACE_UPDATE_MODIFY_CALL_REGS:
1849 case FTRACE_UPDATE_MODIFY_CALL: 1814 case FTRACE_UPDATE_MODIFY_CALL:
1850 if (rec->flags & FTRACE_FL_REGS)
1851 ftrace_old_addr = (unsigned long)FTRACE_ADDR;
1852 else
1853 ftrace_old_addr = (unsigned long)FTRACE_REGS_ADDR;
1854
1855 return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr); 1815 return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr);
1856 } 1816 }
1857 1817
@@ -2115,7 +2075,6 @@ static void ftrace_startup_enable(int command)
2115 2075
2116static int ftrace_startup(struct ftrace_ops *ops, int command) 2076static int ftrace_startup(struct ftrace_ops *ops, int command)
2117{ 2077{
2118 bool hash_enable = true;
2119 int ret; 2078 int ret;
2120 2079
2121 if (unlikely(ftrace_disabled)) 2080 if (unlikely(ftrace_disabled))
@@ -2128,18 +2087,9 @@ static int ftrace_startup(struct ftrace_ops *ops, int command)
2128 ftrace_start_up++; 2087 ftrace_start_up++;
2129 command |= FTRACE_UPDATE_CALLS; 2088 command |= FTRACE_UPDATE_CALLS;
2130 2089
2131 /* ops marked global share the filter hashes */
2132 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
2133 ops = &global_ops;
2134 /* Don't update hash if global is already set */
2135 if (global_start_up)
2136 hash_enable = false;
2137 global_start_up++;
2138 }
2139
2140 ops->flags |= FTRACE_OPS_FL_ENABLED; 2090 ops->flags |= FTRACE_OPS_FL_ENABLED;
2141 if (hash_enable) 2091
2142 ftrace_hash_rec_enable(ops, 1); 2092 ftrace_hash_rec_enable(ops, 1);
2143 2093
2144 ftrace_startup_enable(command); 2094 ftrace_startup_enable(command);
2145 2095
@@ -2148,7 +2098,6 @@ static int ftrace_startup(struct ftrace_ops *ops, int command)
2148 2098
2149static int ftrace_shutdown(struct ftrace_ops *ops, int command) 2099static int ftrace_shutdown(struct ftrace_ops *ops, int command)
2150{ 2100{
2151 bool hash_disable = true;
2152 int ret; 2101 int ret;
2153 2102
2154 if (unlikely(ftrace_disabled)) 2103 if (unlikely(ftrace_disabled))
@@ -2166,21 +2115,9 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
2166 */ 2115 */
2167 WARN_ON_ONCE(ftrace_start_up < 0); 2116 WARN_ON_ONCE(ftrace_start_up < 0);
2168 2117
2169 if (ops->flags & FTRACE_OPS_FL_GLOBAL) { 2118 ftrace_hash_rec_disable(ops, 1);
2170 ops = &global_ops;
2171 global_start_up--;
2172 WARN_ON_ONCE(global_start_up < 0);
2173 /* Don't update hash if global still has users */
2174 if (global_start_up) {
2175 WARN_ON_ONCE(!ftrace_start_up);
2176 hash_disable = false;
2177 }
2178 }
2179 2119
2180 if (hash_disable) 2120 if (!global_start_up)
2181 ftrace_hash_rec_disable(ops, 1);
2182
2183 if (ops != &global_ops || !global_start_up)
2184 ops->flags &= ~FTRACE_OPS_FL_ENABLED; 2121 ops->flags &= ~FTRACE_OPS_FL_ENABLED;
2185 2122
2186 command |= FTRACE_UPDATE_CALLS; 2123 command |= FTRACE_UPDATE_CALLS;
@@ -3524,10 +3461,6 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
3524 struct ftrace_hash *hash; 3461 struct ftrace_hash *hash;
3525 int ret; 3462 int ret;
3526 3463
3527 /* All global ops uses the global ops filters */
3528 if (ops->flags & FTRACE_OPS_FL_GLOBAL)
3529 ops = &global_ops;
3530
3531 if (unlikely(ftrace_disabled)) 3464 if (unlikely(ftrace_disabled))
3532 return -ENODEV; 3465 return -ENODEV;
3533 3466
@@ -3639,8 +3572,7 @@ int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf,
3639} 3572}
3640EXPORT_SYMBOL_GPL(ftrace_set_notrace); 3573EXPORT_SYMBOL_GPL(ftrace_set_notrace);
3641/** 3574/**
3642 * ftrace_set_filter - set a function to filter on in ftrace 3575 * ftrace_set_global_filter - set a function to filter on with global tracers
3643 * @ops - the ops to set the filter with
3644 * @buf - the string that holds the function filter text. 3576 * @buf - the string that holds the function filter text.
3645 * @len - the length of the string. 3577 * @len - the length of the string.
3646 * @reset - non zero to reset all filters before applying this filter. 3578 * @reset - non zero to reset all filters before applying this filter.
@@ -3655,8 +3587,7 @@ void ftrace_set_global_filter(unsigned char *buf, int len, int reset)
3655EXPORT_SYMBOL_GPL(ftrace_set_global_filter); 3587EXPORT_SYMBOL_GPL(ftrace_set_global_filter);
3656 3588
3657/** 3589/**
3658 * ftrace_set_notrace - set a function to not trace in ftrace 3590 * ftrace_set_global_notrace - set a function to not trace with global tracers
3659 * @ops - the ops to set the notrace filter with
3660 * @buf - the string that holds the function notrace text. 3591 * @buf - the string that holds the function notrace text.
3661 * @len - the length of the string. 3592 * @len - the length of the string.
3662 * @reset - non zero to reset all filters before applying this filter. 3593 * @reset - non zero to reset all filters before applying this filter.
@@ -4330,16 +4261,11 @@ static void ftrace_init_module(struct module *mod,
4330 ftrace_process_locs(mod, start, end); 4261 ftrace_process_locs(mod, start, end);
4331} 4262}
4332 4263
4333static int ftrace_module_notify_enter(struct notifier_block *self, 4264void ftrace_module_init(struct module *mod)
4334 unsigned long val, void *data)
4335{ 4265{
4336 struct module *mod = data; 4266 ftrace_init_module(mod, mod->ftrace_callsites,
4337 4267 mod->ftrace_callsites +
4338 if (val == MODULE_STATE_COMING) 4268 mod->num_ftrace_callsites);
4339 ftrace_init_module(mod, mod->ftrace_callsites,
4340 mod->ftrace_callsites +
4341 mod->num_ftrace_callsites);
4342 return 0;
4343} 4269}
4344 4270
4345static int ftrace_module_notify_exit(struct notifier_block *self, 4271static int ftrace_module_notify_exit(struct notifier_block *self,
@@ -4353,11 +4279,6 @@ static int ftrace_module_notify_exit(struct notifier_block *self,
4353 return 0; 4279 return 0;
4354} 4280}
4355#else 4281#else
4356static int ftrace_module_notify_enter(struct notifier_block *self,
4357 unsigned long val, void *data)
4358{
4359 return 0;
4360}
4361static int ftrace_module_notify_exit(struct notifier_block *self, 4282static int ftrace_module_notify_exit(struct notifier_block *self,
4362 unsigned long val, void *data) 4283 unsigned long val, void *data)
4363{ 4284{
@@ -4365,11 +4286,6 @@ static int ftrace_module_notify_exit(struct notifier_block *self,
4365} 4286}
4366#endif /* CONFIG_MODULES */ 4287#endif /* CONFIG_MODULES */
4367 4288
4368struct notifier_block ftrace_module_enter_nb = {
4369 .notifier_call = ftrace_module_notify_enter,
4370 .priority = INT_MAX, /* Run before anything that can use kprobes */
4371};
4372
4373struct notifier_block ftrace_module_exit_nb = { 4289struct notifier_block ftrace_module_exit_nb = {
4374 .notifier_call = ftrace_module_notify_exit, 4290 .notifier_call = ftrace_module_notify_exit,
4375 .priority = INT_MIN, /* Run after anything that can remove kprobes */ 4291 .priority = INT_MIN, /* Run after anything that can remove kprobes */
@@ -4403,10 +4319,6 @@ void __init ftrace_init(void)
4403 __start_mcount_loc, 4319 __start_mcount_loc,
4404 __stop_mcount_loc); 4320 __stop_mcount_loc);
4405 4321
4406 ret = register_module_notifier(&ftrace_module_enter_nb);
4407 if (ret)
4408 pr_warning("Failed to register trace ftrace module enter notifier\n");
4409
4410 ret = register_module_notifier(&ftrace_module_exit_nb); 4322 ret = register_module_notifier(&ftrace_module_exit_nb);
4411 if (ret) 4323 if (ret)
4412 pr_warning("Failed to register trace ftrace module exit notifier\n"); 4324 pr_warning("Failed to register trace ftrace module exit notifier\n");
@@ -4462,6 +4374,34 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs)
4462 4374
4463#endif /* CONFIG_DYNAMIC_FTRACE */ 4375#endif /* CONFIG_DYNAMIC_FTRACE */
4464 4376
4377__init void ftrace_init_global_array_ops(struct trace_array *tr)
4378{
4379 tr->ops = &global_ops;
4380 tr->ops->private = tr;
4381}
4382
4383void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func)
4384{
4385 /* If we filter on pids, update to use the pid function */
4386 if (tr->flags & TRACE_ARRAY_FL_GLOBAL) {
4387 if (WARN_ON(tr->ops->func != ftrace_stub))
4388 printk("ftrace ops had %pS for function\n",
4389 tr->ops->func);
4390 /* Only the top level instance does pid tracing */
4391 if (!list_empty(&ftrace_pids)) {
4392 set_ftrace_pid_function(func);
4393 func = ftrace_pid_func;
4394 }
4395 }
4396 tr->ops->func = func;
4397 tr->ops->private = tr;
4398}
4399
4400void ftrace_reset_array_ops(struct trace_array *tr)
4401{
4402 tr->ops->func = ftrace_stub;
4403}
4404
4465static void 4405static void
4466ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip, 4406ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip,
4467 struct ftrace_ops *op, struct pt_regs *regs) 4407 struct ftrace_ops *op, struct pt_regs *regs)
@@ -4520,9 +4460,16 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
4520 */ 4460 */
4521 preempt_disable_notrace(); 4461 preempt_disable_notrace();
4522 do_for_each_ftrace_op(op, ftrace_ops_list) { 4462 do_for_each_ftrace_op(op, ftrace_ops_list) {
4523 if (ftrace_ops_test(op, ip, regs)) 4463 if (ftrace_ops_test(op, ip, regs)) {
4464 if (WARN_ON(!op->func)) {
4465 function_trace_stop = 1;
4466 printk("op=%p %pS\n", op, op);
4467 goto out;
4468 }
4524 op->func(ip, parent_ip, op, regs); 4469 op->func(ip, parent_ip, op, regs);
4470 }
4525 } while_for_each_ftrace_op(op); 4471 } while_for_each_ftrace_op(op);
4472out:
4526 preempt_enable_notrace(); 4473 preempt_enable_notrace();
4527 trace_clear_recursion(bit); 4474 trace_clear_recursion(bit);
4528} 4475}
@@ -4927,7 +4874,6 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
4927#ifdef CONFIG_FUNCTION_GRAPH_TRACER 4874#ifdef CONFIG_FUNCTION_GRAPH_TRACER
4928 4875
4929static int ftrace_graph_active; 4876static int ftrace_graph_active;
4930static struct notifier_block ftrace_suspend_notifier;
4931 4877
4932int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) 4878int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
4933{ 4879{
@@ -5073,13 +5019,6 @@ ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state,
5073 return NOTIFY_DONE; 5019 return NOTIFY_DONE;
5074} 5020}
5075 5021
5076/* Just a place holder for function graph */
5077static struct ftrace_ops fgraph_ops __read_mostly = {
5078 .func = ftrace_stub,
5079 .flags = FTRACE_OPS_FL_STUB | FTRACE_OPS_FL_GLOBAL |
5080 FTRACE_OPS_FL_RECURSION_SAFE,
5081};
5082
5083static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace) 5022static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace)
5084{ 5023{
5085 if (!ftrace_ops_test(&global_ops, trace->func, NULL)) 5024 if (!ftrace_ops_test(&global_ops, trace->func, NULL))
@@ -5104,6 +5043,10 @@ static void update_function_graph_func(void)
5104 ftrace_graph_entry = ftrace_graph_entry_test; 5043 ftrace_graph_entry = ftrace_graph_entry_test;
5105} 5044}
5106 5045
5046static struct notifier_block ftrace_suspend_notifier = {
5047 .notifier_call = ftrace_suspend_notifier_call,
5048};
5049
5107int register_ftrace_graph(trace_func_graph_ret_t retfunc, 5050int register_ftrace_graph(trace_func_graph_ret_t retfunc,
5108 trace_func_graph_ent_t entryfunc) 5051 trace_func_graph_ent_t entryfunc)
5109{ 5052{
@@ -5117,7 +5060,6 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
5117 goto out; 5060 goto out;
5118 } 5061 }
5119 5062
5120 ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call;
5121 register_pm_notifier(&ftrace_suspend_notifier); 5063 register_pm_notifier(&ftrace_suspend_notifier);
5122 5064
5123 ftrace_graph_active++; 5065 ftrace_graph_active++;
@@ -5139,7 +5081,10 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
5139 ftrace_graph_entry = ftrace_graph_entry_test; 5081 ftrace_graph_entry = ftrace_graph_entry_test;
5140 update_function_graph_func(); 5082 update_function_graph_func();
5141 5083
5142 ret = ftrace_startup(&fgraph_ops, FTRACE_START_FUNC_RET); 5084 /* Function graph doesn't use the .func field of global_ops */
5085 global_ops.flags |= FTRACE_OPS_FL_STUB;
5086
5087 ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET);
5143 5088
5144out: 5089out:
5145 mutex_unlock(&ftrace_lock); 5090 mutex_unlock(&ftrace_lock);
@@ -5157,7 +5102,8 @@ void unregister_ftrace_graph(void)
5157 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; 5102 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
5158 ftrace_graph_entry = ftrace_graph_entry_stub; 5103 ftrace_graph_entry = ftrace_graph_entry_stub;
5159 __ftrace_graph_entry = ftrace_graph_entry_stub; 5104 __ftrace_graph_entry = ftrace_graph_entry_stub;
5160 ftrace_shutdown(&fgraph_ops, FTRACE_STOP_FUNC_RET); 5105 ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET);
5106 global_ops.flags &= ~FTRACE_OPS_FL_STUB;
5161 unregister_pm_notifier(&ftrace_suspend_notifier); 5107 unregister_pm_notifier(&ftrace_suspend_notifier);
5162 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); 5108 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
5163 5109
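With the GLOBAL flag gone, update_ftrace_function() (earlier in this ftrace.c diff) has only one decision left: install a lone ops' callback directly, or install ftrace_ops_list_func(), which walks the ops list. The user-space sketch below shows just that dispatch choice; struct ops, cb_a() and so on are made-up names, and the real code additionally handles recursion protection, per-ops filtering and RCU-style list updates.

        #include <stdio.h>

        struct ops {
                void (*func)(unsigned long ip);
                struct ops *next;
        };

        static struct ops *ops_list;
        static void (*trace_function)(unsigned long ip);

        /* Wrapper used when more than one ops is registered. */
        static void list_func(unsigned long ip)
        {
                for (struct ops *op = ops_list; op; op = op->next)
                        op->func(ip);
        }

        static void update_function(void)
        {
                if (ops_list && !ops_list->next)
                        trace_function = ops_list->func;        /* single ops: call it directly */
                else
                        trace_function = list_func;             /* otherwise walk the list */
        }

        static void cb_a(unsigned long ip) { printf("A traced %#lx\n", ip); }
        static void cb_b(unsigned long ip) { printf("B traced %#lx\n", ip); }

        int main(void)
        {
                static struct ops a = { cb_a, NULL }, b = { cb_b, NULL };

                a.next = ops_list;
                ops_list = &a;
                update_function();
                trace_function(0x1000);         /* direct call into cb_a */

                b.next = ops_list;
                ops_list = &b;
                update_function();
                trace_function(0x2000);         /* list walk: cb_b then cb_a */
                return 0;
        }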
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 737b0efa1a62..16f7038d1f4d 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -275,7 +275,7 @@ int call_filter_check_discard(struct ftrace_event_call *call, void *rec,
275} 275}
276EXPORT_SYMBOL_GPL(call_filter_check_discard); 276EXPORT_SYMBOL_GPL(call_filter_check_discard);
277 277
278cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu) 278static cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu)
279{ 279{
280 u64 ts; 280 u64 ts;
281 281
@@ -599,7 +599,7 @@ static int alloc_snapshot(struct trace_array *tr)
599 return 0; 599 return 0;
600} 600}
601 601
602void free_snapshot(struct trace_array *tr) 602static void free_snapshot(struct trace_array *tr)
603{ 603{
604 /* 604 /*
 605 * We don't free the ring buffer. Instead, resize it because 605
@@ -963,27 +963,9 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
963 return cnt; 963 return cnt;
964} 964}
965 965
966/*
967 * ftrace_max_lock is used to protect the swapping of buffers
968 * when taking a max snapshot. The buffers themselves are
969 * protected by per_cpu spinlocks. But the action of the swap
970 * needs its own lock.
971 *
972 * This is defined as a arch_spinlock_t in order to help
973 * with performance when lockdep debugging is enabled.
974 *
975 * It is also used in other places outside the update_max_tr
976 * so it needs to be defined outside of the
977 * CONFIG_TRACER_MAX_TRACE.
978 */
979static arch_spinlock_t ftrace_max_lock =
980 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
981
982unsigned long __read_mostly tracing_thresh; 966unsigned long __read_mostly tracing_thresh;
983 967
984#ifdef CONFIG_TRACER_MAX_TRACE 968#ifdef CONFIG_TRACER_MAX_TRACE
985unsigned long __read_mostly tracing_max_latency;
986
987/* 969/*
988 * Copy the new maximum trace into the separate maximum-trace 970 * Copy the new maximum trace into the separate maximum-trace
989 * structure. (this way the maximum trace is permanently saved, 971 * structure. (this way the maximum trace is permanently saved,
@@ -1000,7 +982,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
1000 max_buf->cpu = cpu; 982 max_buf->cpu = cpu;
1001 max_buf->time_start = data->preempt_timestamp; 983 max_buf->time_start = data->preempt_timestamp;
1002 984
1003 max_data->saved_latency = tracing_max_latency; 985 max_data->saved_latency = tr->max_latency;
1004 max_data->critical_start = data->critical_start; 986 max_data->critical_start = data->critical_start;
1005 max_data->critical_end = data->critical_end; 987 max_data->critical_end = data->critical_end;
1006 988
@@ -1048,14 +1030,14 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
1048 return; 1030 return;
1049 } 1031 }
1050 1032
1051 arch_spin_lock(&ftrace_max_lock); 1033 arch_spin_lock(&tr->max_lock);
1052 1034
1053 buf = tr->trace_buffer.buffer; 1035 buf = tr->trace_buffer.buffer;
1054 tr->trace_buffer.buffer = tr->max_buffer.buffer; 1036 tr->trace_buffer.buffer = tr->max_buffer.buffer;
1055 tr->max_buffer.buffer = buf; 1037 tr->max_buffer.buffer = buf;
1056 1038
1057 __update_max_tr(tr, tsk, cpu); 1039 __update_max_tr(tr, tsk, cpu);
1058 arch_spin_unlock(&ftrace_max_lock); 1040 arch_spin_unlock(&tr->max_lock);
1059} 1041}
1060 1042
1061/** 1043/**
@@ -1081,7 +1063,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
1081 return; 1063 return;
1082 } 1064 }
1083 1065
1084 arch_spin_lock(&ftrace_max_lock); 1066 arch_spin_lock(&tr->max_lock);
1085 1067
1086 ret = ring_buffer_swap_cpu(tr->max_buffer.buffer, tr->trace_buffer.buffer, cpu); 1068 ret = ring_buffer_swap_cpu(tr->max_buffer.buffer, tr->trace_buffer.buffer, cpu);
1087 1069
@@ -1099,11 +1081,11 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
1099 WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY); 1081 WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY);
1100 1082
1101 __update_max_tr(tr, tsk, cpu); 1083 __update_max_tr(tr, tsk, cpu);
1102 arch_spin_unlock(&ftrace_max_lock); 1084 arch_spin_unlock(&tr->max_lock);
1103} 1085}
1104#endif /* CONFIG_TRACER_MAX_TRACE */ 1086#endif /* CONFIG_TRACER_MAX_TRACE */
1105 1087
1106static void default_wait_pipe(struct trace_iterator *iter) 1088static void wait_on_pipe(struct trace_iterator *iter)
1107{ 1089{
1108 /* Iterators are static, they should be filled or empty */ 1090 /* Iterators are static, they should be filled or empty */
1109 if (trace_buffer_iter(iter, iter->cpu_file)) 1091 if (trace_buffer_iter(iter, iter->cpu_file))
@@ -1220,8 +1202,6 @@ int register_tracer(struct tracer *type)
1220 else 1202 else
1221 if (!type->flags->opts) 1203 if (!type->flags->opts)
1222 type->flags->opts = dummy_tracer_opt; 1204 type->flags->opts = dummy_tracer_opt;
1223 if (!type->wait_pipe)
1224 type->wait_pipe = default_wait_pipe;
1225 1205
1226 ret = run_tracer_selftest(type); 1206 ret = run_tracer_selftest(type);
1227 if (ret < 0) 1207 if (ret < 0)
@@ -1305,22 +1285,71 @@ void tracing_reset_all_online_cpus(void)
1305 } 1285 }
1306} 1286}
1307 1287
1308#define SAVED_CMDLINES 128 1288#define SAVED_CMDLINES_DEFAULT 128
1309#define NO_CMDLINE_MAP UINT_MAX 1289#define NO_CMDLINE_MAP UINT_MAX
1310static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
1311static unsigned map_cmdline_to_pid[SAVED_CMDLINES];
1312static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN];
1313static int cmdline_idx;
1314static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED; 1290static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED;
1291struct saved_cmdlines_buffer {
1292 unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
1293 unsigned *map_cmdline_to_pid;
1294 unsigned cmdline_num;
1295 int cmdline_idx;
1296 char *saved_cmdlines;
1297};
1298static struct saved_cmdlines_buffer *savedcmd;
1315 1299
1316/* temporary disable recording */ 1300/* temporary disable recording */
1317static atomic_t trace_record_cmdline_disabled __read_mostly; 1301static atomic_t trace_record_cmdline_disabled __read_mostly;
1318 1302
1319static void trace_init_cmdlines(void) 1303static inline char *get_saved_cmdlines(int idx)
1320{ 1304{
1321 memset(&map_pid_to_cmdline, NO_CMDLINE_MAP, sizeof(map_pid_to_cmdline)); 1305 return &savedcmd->saved_cmdlines[idx * TASK_COMM_LEN];
1322 memset(&map_cmdline_to_pid, NO_CMDLINE_MAP, sizeof(map_cmdline_to_pid)); 1306}
1323 cmdline_idx = 0; 1307
1308static inline void set_cmdline(int idx, const char *cmdline)
1309{
1310 memcpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN);
1311}
1312
1313static int allocate_cmdlines_buffer(unsigned int val,
1314 struct saved_cmdlines_buffer *s)
1315{
1316 s->map_cmdline_to_pid = kmalloc(val * sizeof(*s->map_cmdline_to_pid),
1317 GFP_KERNEL);
1318 if (!s->map_cmdline_to_pid)
1319 return -ENOMEM;
1320
1321 s->saved_cmdlines = kmalloc(val * TASK_COMM_LEN, GFP_KERNEL);
1322 if (!s->saved_cmdlines) {
1323 kfree(s->map_cmdline_to_pid);
1324 return -ENOMEM;
1325 }
1326
1327 s->cmdline_idx = 0;
1328 s->cmdline_num = val;
1329 memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP,
1330 sizeof(s->map_pid_to_cmdline));
1331 memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP,
1332 val * sizeof(*s->map_cmdline_to_pid));
1333
1334 return 0;
1335}
1336
1337static int trace_create_savedcmd(void)
1338{
1339 int ret;
1340
1341 savedcmd = kmalloc(sizeof(struct saved_cmdlines_buffer), GFP_KERNEL);
1342 if (!savedcmd)
1343 return -ENOMEM;
1344
1345 ret = allocate_cmdlines_buffer(SAVED_CMDLINES_DEFAULT, savedcmd);
1346 if (ret < 0) {
1347 kfree(savedcmd);
1348 savedcmd = NULL;
1349 return -ENOMEM;
1350 }
1351
1352 return 0;
1324} 1353}
1325 1354
1326int is_tracing_stopped(void) 1355int is_tracing_stopped(void)
@@ -1353,7 +1382,7 @@ void tracing_start(void)
1353 } 1382 }
1354 1383
1355 /* Prevent the buffers from switching */ 1384 /* Prevent the buffers from switching */
1356 arch_spin_lock(&ftrace_max_lock); 1385 arch_spin_lock(&global_trace.max_lock);
1357 1386
1358 buffer = global_trace.trace_buffer.buffer; 1387 buffer = global_trace.trace_buffer.buffer;
1359 if (buffer) 1388 if (buffer)
@@ -1365,7 +1394,7 @@ void tracing_start(void)
1365 ring_buffer_record_enable(buffer); 1394 ring_buffer_record_enable(buffer);
1366#endif 1395#endif
1367 1396
1368 arch_spin_unlock(&ftrace_max_lock); 1397 arch_spin_unlock(&global_trace.max_lock);
1369 1398
1370 ftrace_start(); 1399 ftrace_start();
1371 out: 1400 out:
@@ -1420,7 +1449,7 @@ void tracing_stop(void)
1420 goto out; 1449 goto out;
1421 1450
1422 /* Prevent the buffers from switching */ 1451 /* Prevent the buffers from switching */
1423 arch_spin_lock(&ftrace_max_lock); 1452 arch_spin_lock(&global_trace.max_lock);
1424 1453
1425 buffer = global_trace.trace_buffer.buffer; 1454 buffer = global_trace.trace_buffer.buffer;
1426 if (buffer) 1455 if (buffer)
@@ -1432,7 +1461,7 @@ void tracing_stop(void)
1432 ring_buffer_record_disable(buffer); 1461 ring_buffer_record_disable(buffer);
1433#endif 1462#endif
1434 1463
1435 arch_spin_unlock(&ftrace_max_lock); 1464 arch_spin_unlock(&global_trace.max_lock);
1436 1465
1437 out: 1466 out:
1438 raw_spin_unlock_irqrestore(&global_trace.start_lock, flags); 1467 raw_spin_unlock_irqrestore(&global_trace.start_lock, flags);
@@ -1461,12 +1490,12 @@ static void tracing_stop_tr(struct trace_array *tr)
1461 1490
1462void trace_stop_cmdline_recording(void); 1491void trace_stop_cmdline_recording(void);
1463 1492
1464static void trace_save_cmdline(struct task_struct *tsk) 1493static int trace_save_cmdline(struct task_struct *tsk)
1465{ 1494{
1466 unsigned pid, idx; 1495 unsigned pid, idx;
1467 1496
1468 if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT)) 1497 if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT))
1469 return; 1498 return 0;
1470 1499
1471 /* 1500 /*
1472 * It's not the end of the world if we don't get 1501 * It's not the end of the world if we don't get
@@ -1475,11 +1504,11 @@ static void trace_save_cmdline(struct task_struct *tsk)
1475 * so if we miss here, then better luck next time. 1504 * so if we miss here, then better luck next time.
1476 */ 1505 */
1477 if (!arch_spin_trylock(&trace_cmdline_lock)) 1506 if (!arch_spin_trylock(&trace_cmdline_lock))
1478 return; 1507 return 0;
1479 1508
1480 idx = map_pid_to_cmdline[tsk->pid]; 1509 idx = savedcmd->map_pid_to_cmdline[tsk->pid];
1481 if (idx == NO_CMDLINE_MAP) { 1510 if (idx == NO_CMDLINE_MAP) {
1482 idx = (cmdline_idx + 1) % SAVED_CMDLINES; 1511 idx = (savedcmd->cmdline_idx + 1) % savedcmd->cmdline_num;
1483 1512
1484 /* 1513 /*
1485 * Check whether the cmdline buffer at idx has a pid 1514 * Check whether the cmdline buffer at idx has a pid
@@ -1487,22 +1516,24 @@ static void trace_save_cmdline(struct task_struct *tsk)
1487 * need to clear the map_pid_to_cmdline. Otherwise we 1516 * need to clear the map_pid_to_cmdline. Otherwise we
1488 * would read the new comm for the old pid. 1517 * would read the new comm for the old pid.
1489 */ 1518 */
1490 pid = map_cmdline_to_pid[idx]; 1519 pid = savedcmd->map_cmdline_to_pid[idx];
1491 if (pid != NO_CMDLINE_MAP) 1520 if (pid != NO_CMDLINE_MAP)
1492 map_pid_to_cmdline[pid] = NO_CMDLINE_MAP; 1521 savedcmd->map_pid_to_cmdline[pid] = NO_CMDLINE_MAP;
1493 1522
1494 map_cmdline_to_pid[idx] = tsk->pid; 1523 savedcmd->map_cmdline_to_pid[idx] = tsk->pid;
1495 map_pid_to_cmdline[tsk->pid] = idx; 1524 savedcmd->map_pid_to_cmdline[tsk->pid] = idx;
1496 1525
1497 cmdline_idx = idx; 1526 savedcmd->cmdline_idx = idx;
1498 } 1527 }
1499 1528
1500 memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN); 1529 set_cmdline(idx, tsk->comm);
1501 1530
1502 arch_spin_unlock(&trace_cmdline_lock); 1531 arch_spin_unlock(&trace_cmdline_lock);
1532
1533 return 1;
1503} 1534}
1504 1535
1505void trace_find_cmdline(int pid, char comm[]) 1536static void __trace_find_cmdline(int pid, char comm[])
1506{ 1537{
1507 unsigned map; 1538 unsigned map;
1508 1539
@@ -1521,13 +1552,19 @@ void trace_find_cmdline(int pid, char comm[])
1521 return; 1552 return;
1522 } 1553 }
1523 1554
1524 preempt_disable(); 1555 map = savedcmd->map_pid_to_cmdline[pid];
1525 arch_spin_lock(&trace_cmdline_lock);
1526 map = map_pid_to_cmdline[pid];
1527 if (map != NO_CMDLINE_MAP) 1556 if (map != NO_CMDLINE_MAP)
1528 strcpy(comm, saved_cmdlines[map]); 1557 strcpy(comm, get_saved_cmdlines(map));
1529 else 1558 else
1530 strcpy(comm, "<...>"); 1559 strcpy(comm, "<...>");
1560}
1561
1562void trace_find_cmdline(int pid, char comm[])
1563{
1564 preempt_disable();
1565 arch_spin_lock(&trace_cmdline_lock);
1566
1567 __trace_find_cmdline(pid, comm);
1531 1568
1532 arch_spin_unlock(&trace_cmdline_lock); 1569 arch_spin_unlock(&trace_cmdline_lock);
1533 preempt_enable(); 1570 preempt_enable();
@@ -1541,9 +1578,8 @@ void tracing_record_cmdline(struct task_struct *tsk)
1541 if (!__this_cpu_read(trace_cmdline_save)) 1578 if (!__this_cpu_read(trace_cmdline_save))
1542 return; 1579 return;
1543 1580
1544 __this_cpu_write(trace_cmdline_save, false); 1581 if (trace_save_cmdline(tsk))
1545 1582 __this_cpu_write(trace_cmdline_save, false);
1546 trace_save_cmdline(tsk);
1547} 1583}
1548 1584
1549void 1585void
@@ -1746,7 +1782,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
1746 */ 1782 */
1747 barrier(); 1783 barrier();
1748 if (use_stack == 1) { 1784 if (use_stack == 1) {
1749 trace.entries = &__get_cpu_var(ftrace_stack).calls[0]; 1785 trace.entries = this_cpu_ptr(ftrace_stack.calls);
1750 trace.max_entries = FTRACE_STACK_MAX_ENTRIES; 1786 trace.max_entries = FTRACE_STACK_MAX_ENTRIES;
1751 1787
1752 if (regs) 1788 if (regs)
@@ -1995,7 +2031,21 @@ void trace_printk_init_buffers(void)
1995 if (alloc_percpu_trace_buffer()) 2031 if (alloc_percpu_trace_buffer())
1996 return; 2032 return;
1997 2033
1998 pr_info("ftrace: Allocated trace_printk buffers\n"); 2034 /* trace_printk() is for debug use only. Don't use it in production. */
2035
2036 pr_warning("\n**********************************************************\n");
2037 pr_warning("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n");
2038 pr_warning("** **\n");
2039 pr_warning("** trace_printk() being used. Allocating extra memory. **\n");
2040 pr_warning("** **\n");
2041 pr_warning("** This means that this is a DEBUG kernel and it is **\n");
2042 pr_warning("** unsafe for produciton use. **\n");
2043 pr_warning("** **\n");
2044 pr_warning("** If you see this message and you are not debugging **\n");
2045 pr_warning("** the kernel, report this immediately to your vendor! **\n");
2046 pr_warning("** **\n");
2047 pr_warning("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n");
2048 pr_warning("**********************************************************\n");
1999 2049
2000 /* Expand the buffers to set size */ 2050 /* Expand the buffers to set size */
2001 tracing_update_buffers(); 2051 tracing_update_buffers();
@@ -3333,7 +3383,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
3333 mutex_lock(&tracing_cpumask_update_lock); 3383 mutex_lock(&tracing_cpumask_update_lock);
3334 3384
3335 local_irq_disable(); 3385 local_irq_disable();
3336 arch_spin_lock(&ftrace_max_lock); 3386 arch_spin_lock(&tr->max_lock);
3337 for_each_tracing_cpu(cpu) { 3387 for_each_tracing_cpu(cpu) {
3338 /* 3388 /*
3339 * Increase/decrease the disabled counter if we are 3389 * Increase/decrease the disabled counter if we are
@@ -3350,7 +3400,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
3350 ring_buffer_record_enable_cpu(tr->trace_buffer.buffer, cpu); 3400 ring_buffer_record_enable_cpu(tr->trace_buffer.buffer, cpu);
3351 } 3401 }
3352 } 3402 }
3353 arch_spin_unlock(&ftrace_max_lock); 3403 arch_spin_unlock(&tr->max_lock);
3354 local_irq_enable(); 3404 local_irq_enable();
3355 3405
3356 cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new); 3406 cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new);
@@ -3592,6 +3642,7 @@ static const char readme_msg[] =
3592 " trace_options\t\t- Set format or modify how tracing happens\n" 3642 " trace_options\t\t- Set format or modify how tracing happens\n"
3593 "\t\t\t Disable an option by adding a suffix 'no' to the\n" 3643 "\t\t\t Disable an option by adding a suffix 'no' to the\n"
3594 "\t\t\t option name\n" 3644 "\t\t\t option name\n"
3645 " saved_cmdlines_size\t- echo command number in here to store comm-pid list\n"
3595#ifdef CONFIG_DYNAMIC_FTRACE 3646#ifdef CONFIG_DYNAMIC_FTRACE
3596 "\n available_filter_functions - list of functions that can be filtered on\n" 3647 "\n available_filter_functions - list of functions that can be filtered on\n"
3597 " set_ftrace_filter\t- echo function name in here to only trace these\n" 3648 " set_ftrace_filter\t- echo function name in here to only trace these\n"
@@ -3705,55 +3756,153 @@ static const struct file_operations tracing_readme_fops = {
3705 .llseek = generic_file_llseek, 3756 .llseek = generic_file_llseek,
3706}; 3757};
3707 3758
3759static void *saved_cmdlines_next(struct seq_file *m, void *v, loff_t *pos)
3760{
3761 unsigned int *ptr = v;
3762
3763 if (*pos || m->count)
3764 ptr++;
3765
3766 (*pos)++;
3767
3768 for (; ptr < &savedcmd->map_cmdline_to_pid[savedcmd->cmdline_num];
3769 ptr++) {
3770 if (*ptr == -1 || *ptr == NO_CMDLINE_MAP)
3771 continue;
3772
3773 return ptr;
3774 }
3775
3776 return NULL;
3777}
3778
3779static void *saved_cmdlines_start(struct seq_file *m, loff_t *pos)
3780{
3781 void *v;
3782 loff_t l = 0;
3783
3784 preempt_disable();
3785 arch_spin_lock(&trace_cmdline_lock);
3786
3787 v = &savedcmd->map_cmdline_to_pid[0];
3788 while (l <= *pos) {
3789 v = saved_cmdlines_next(m, v, &l);
3790 if (!v)
3791 return NULL;
3792 }
3793
3794 return v;
3795}
3796
3797static void saved_cmdlines_stop(struct seq_file *m, void *v)
3798{
3799 arch_spin_unlock(&trace_cmdline_lock);
3800 preempt_enable();
3801}
3802
3803static int saved_cmdlines_show(struct seq_file *m, void *v)
3804{
3805 char buf[TASK_COMM_LEN];
3806 unsigned int *pid = v;
3807
3808 __trace_find_cmdline(*pid, buf);
3809 seq_printf(m, "%d %s\n", *pid, buf);
3810 return 0;
3811}
3812
3813static const struct seq_operations tracing_saved_cmdlines_seq_ops = {
3814 .start = saved_cmdlines_start,
3815 .next = saved_cmdlines_next,
3816 .stop = saved_cmdlines_stop,
3817 .show = saved_cmdlines_show,
3818};
3819
3820static int tracing_saved_cmdlines_open(struct inode *inode, struct file *filp)
3821{
3822 if (tracing_disabled)
3823 return -ENODEV;
3824
3825 return seq_open(filp, &tracing_saved_cmdlines_seq_ops);
3826}
3827
3828static const struct file_operations tracing_saved_cmdlines_fops = {
3829 .open = tracing_saved_cmdlines_open,
3830 .read = seq_read,
3831 .llseek = seq_lseek,
3832 .release = seq_release,
3833};
3834
3708static ssize_t 3835static ssize_t
3709tracing_saved_cmdlines_read(struct file *file, char __user *ubuf, 3836tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf,
3710 size_t cnt, loff_t *ppos) 3837 size_t cnt, loff_t *ppos)
3711{ 3838{
3712 char *buf_comm; 3839 char buf[64];
3713 char *file_buf; 3840 int r;
3714 char *buf; 3841
3715 int len = 0; 3842 arch_spin_lock(&trace_cmdline_lock);
3716 int pid; 3843 r = sprintf(buf, "%u\n", savedcmd->cmdline_num);
3717 int i; 3844 arch_spin_unlock(&trace_cmdline_lock);
3845
3846 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
3847}
3848
3849static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s)
3850{
3851 kfree(s->saved_cmdlines);
3852 kfree(s->map_cmdline_to_pid);
3853 kfree(s);
3854}
3855
3856static int tracing_resize_saved_cmdlines(unsigned int val)
3857{
3858 struct saved_cmdlines_buffer *s, *savedcmd_temp;
3718 3859
3719 file_buf = kmalloc(SAVED_CMDLINES*(16+TASK_COMM_LEN), GFP_KERNEL); 3860 s = kmalloc(sizeof(struct saved_cmdlines_buffer), GFP_KERNEL);
3720 if (!file_buf) 3861 if (!s)
3721 return -ENOMEM; 3862 return -ENOMEM;
3722 3863
3723 buf_comm = kmalloc(TASK_COMM_LEN, GFP_KERNEL); 3864 if (allocate_cmdlines_buffer(val, s) < 0) {
3724 if (!buf_comm) { 3865 kfree(s);
3725 kfree(file_buf);
3726 return -ENOMEM; 3866 return -ENOMEM;
3727 } 3867 }
3728 3868
3729 buf = file_buf; 3869 arch_spin_lock(&trace_cmdline_lock);
3870 savedcmd_temp = savedcmd;
3871 savedcmd = s;
3872 arch_spin_unlock(&trace_cmdline_lock);
3873 free_saved_cmdlines_buffer(savedcmd_temp);
3730 3874
3731 for (i = 0; i < SAVED_CMDLINES; i++) { 3875 return 0;
3732 int r; 3876}
3733 3877
3734 pid = map_cmdline_to_pid[i]; 3878static ssize_t
3735 if (pid == -1 || pid == NO_CMDLINE_MAP) 3879tracing_saved_cmdlines_size_write(struct file *filp, const char __user *ubuf,
3736 continue; 3880 size_t cnt, loff_t *ppos)
3881{
3882 unsigned long val;
3883 int ret;
3737 3884
3738 trace_find_cmdline(pid, buf_comm); 3885 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
3739 r = sprintf(buf, "%d %s\n", pid, buf_comm); 3886 if (ret)
3740 buf += r; 3887 return ret;
3741 len += r;
3742 }
3743 3888
3744 len = simple_read_from_buffer(ubuf, cnt, ppos, 3889 /* must have at least 1 entry and at most PID_MAX_DEFAULT */
3745 file_buf, len); 3890 if (!val || val > PID_MAX_DEFAULT)
3891 return -EINVAL;
3746 3892
3747 kfree(file_buf); 3893 ret = tracing_resize_saved_cmdlines((unsigned int)val);
3748 kfree(buf_comm); 3894 if (ret < 0)
3895 return ret;
3749 3896
3750 return len; 3897 *ppos += cnt;
3898
3899 return cnt;
3751} 3900}
3752 3901
3753static const struct file_operations tracing_saved_cmdlines_fops = { 3902static const struct file_operations tracing_saved_cmdlines_size_fops = {
3754 .open = tracing_open_generic, 3903 .open = tracing_open_generic,
3755 .read = tracing_saved_cmdlines_read, 3904 .read = tracing_saved_cmdlines_size_read,
3756 .llseek = generic_file_llseek, 3905 .write = tracing_saved_cmdlines_size_write,
3757}; 3906};
3758 3907
3759static ssize_t 3908static ssize_t
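The resize path above (tracing_resize_saved_cmdlines) follows an allocate-then-publish pattern: the replacement saved_cmdlines_buffer is built outside the lock, the global savedcmd pointer is switched while trace_cmdline_lock is held, and the old buffer is freed only after the lock is dropped. A minimal userspace sketch of the same idea follows; struct buf, buf_lock and resize() are illustrative names, not kernel code.

#include <pthread.h>
#include <stdlib.h>

struct buf {
	unsigned int num;
	int *data;
};

static struct buf *cur;                 /* currently published buffer */
static pthread_mutex_t buf_lock = PTHREAD_MUTEX_INITIALIZER;

/* Allocate outside the lock, publish under it, free the old copy afterwards. */
static int resize(unsigned int num)
{
	struct buf *s, *old;

	s = malloc(sizeof(*s));
	if (!s)
		return -1;
	s->data = calloc(num, sizeof(int));
	if (!s->data) {
		free(s);
		return -1;
	}
	s->num = num;

	pthread_mutex_lock(&buf_lock);
	old = cur;
	cur = s;                        /* readers see old or new, never half of each */
	pthread_mutex_unlock(&buf_lock);

	if (old) {                      /* old buffer is released outside the lock */
		free(old->data);
		free(old);
	}
	return 0;
}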
@@ -4225,25 +4374,6 @@ tracing_poll_pipe(struct file *filp, poll_table *poll_table)
4225 return trace_poll(iter, filp, poll_table); 4374 return trace_poll(iter, filp, poll_table);
4226} 4375}
4227 4376
4228/*
4229 * This is a make-shift waitqueue.
4230 * A tracer might use this callback on some rare cases:
4231 *
4232 * 1) the current tracer might hold the runqueue lock when it wakes up
4233 * a reader, hence a deadlock (sched, function, and function graph tracers)
4234 * 2) the function tracers, trace all functions, we don't want
4235 * the overhead of calling wake_up and friends
4236 * (and tracing them too)
4237 *
4238 * Anyway, this is really very primitive wakeup.
4239 */
4240void poll_wait_pipe(struct trace_iterator *iter)
4241{
4242 set_current_state(TASK_INTERRUPTIBLE);
4243 /* sleep for 100 msecs, and try again. */
4244 schedule_timeout(HZ / 10);
4245}
4246
4247/* Must be called with trace_types_lock mutex held. */ 4377/* Must be called with trace_types_lock mutex held. */
4248static int tracing_wait_pipe(struct file *filp) 4378static int tracing_wait_pipe(struct file *filp)
4249{ 4379{
@@ -4255,15 +4385,6 @@ static int tracing_wait_pipe(struct file *filp)
4255 return -EAGAIN; 4385 return -EAGAIN;
4256 } 4386 }
4257 4387
4258 mutex_unlock(&iter->mutex);
4259
4260 iter->trace->wait_pipe(iter);
4261
4262 mutex_lock(&iter->mutex);
4263
4264 if (signal_pending(current))
4265 return -EINTR;
4266
4267 /* 4388 /*
4268 * We block until we read something and tracing is disabled. 4389 * We block until we read something and tracing is disabled.
4269 * We still block if tracing is disabled, but we have never 4390 * We still block if tracing is disabled, but we have never
@@ -4275,6 +4396,15 @@ static int tracing_wait_pipe(struct file *filp)
4275 */ 4396 */
4276 if (!tracing_is_on() && iter->pos) 4397 if (!tracing_is_on() && iter->pos)
4277 break; 4398 break;
4399
4400 mutex_unlock(&iter->mutex);
4401
4402 wait_on_pipe(iter);
4403
4404 mutex_lock(&iter->mutex);
4405
4406 if (signal_pending(current))
4407 return -EINTR;
4278 } 4408 }
4279 4409
4280 return 1; 4410 return 1;
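The reordering above matters for correctness: in tracing_wait_pipe() every exit condition (nonblocking read, tracing switched off after some data was read) is now evaluated before the reader drops iter->mutex and calls wait_on_pipe(), so the task no longer sleeps one extra round when it should have returned immediately. The self-contained sketch below shows the same check-then-wait shape with a pthread condition variable; the names are illustrative and the analogy to wait_on_pipe() is loose.

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
static int items;
static bool stopping;

/*
 * Consumer side of a check-then-wait loop: every exit condition is
 * re-evaluated before the thread blocks, so it never sleeps when data
 * is already available or when it has been told to stop.
 */
static int wait_for_item(void)
{
	pthread_mutex_lock(&lock);
	while (items == 0) {
		if (stopping) {
			pthread_mutex_unlock(&lock);
			return -1;
		}
		pthread_cond_wait(&cond, &lock);   /* drops and retakes the lock */
	}
	items--;
	pthread_mutex_unlock(&lock);
	return 0;
}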
@@ -5197,7 +5327,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
5197 goto out_unlock; 5327 goto out_unlock;
5198 } 5328 }
5199 mutex_unlock(&trace_types_lock); 5329 mutex_unlock(&trace_types_lock);
5200 iter->trace->wait_pipe(iter); 5330 wait_on_pipe(iter);
5201 mutex_lock(&trace_types_lock); 5331 mutex_lock(&trace_types_lock);
5202 if (signal_pending(current)) { 5332 if (signal_pending(current)) {
5203 size = -EINTR; 5333 size = -EINTR;
@@ -5408,7 +5538,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
5408 goto out; 5538 goto out;
5409 } 5539 }
5410 mutex_unlock(&trace_types_lock); 5540 mutex_unlock(&trace_types_lock);
5411 iter->trace->wait_pipe(iter); 5541 wait_on_pipe(iter);
5412 mutex_lock(&trace_types_lock); 5542 mutex_lock(&trace_types_lock);
5413 if (signal_pending(current)) { 5543 if (signal_pending(current)) {
5414 ret = -EINTR; 5544 ret = -EINTR;
@@ -6102,6 +6232,25 @@ static int allocate_trace_buffers(struct trace_array *tr, int size)
6102 return 0; 6232 return 0;
6103} 6233}
6104 6234
6235static void free_trace_buffers(struct trace_array *tr)
6236{
6237 if (!tr)
6238 return;
6239
6240 if (tr->trace_buffer.buffer) {
6241 ring_buffer_free(tr->trace_buffer.buffer);
6242 tr->trace_buffer.buffer = NULL;
6243 free_percpu(tr->trace_buffer.data);
6244 }
6245
6246#ifdef CONFIG_TRACER_MAX_TRACE
6247 if (tr->max_buffer.buffer) {
6248 ring_buffer_free(tr->max_buffer.buffer);
6249 tr->max_buffer.buffer = NULL;
6250 }
6251#endif
6252}
6253
6105static int new_instance_create(const char *name) 6254static int new_instance_create(const char *name)
6106{ 6255{
6107 struct trace_array *tr; 6256 struct trace_array *tr;
@@ -6131,6 +6280,8 @@ static int new_instance_create(const char *name)
6131 6280
6132 raw_spin_lock_init(&tr->start_lock); 6281 raw_spin_lock_init(&tr->start_lock);
6133 6282
6283 tr->max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
6284
6134 tr->current_trace = &nop_trace; 6285 tr->current_trace = &nop_trace;
6135 6286
6136 INIT_LIST_HEAD(&tr->systems); 6287 INIT_LIST_HEAD(&tr->systems);
@@ -6158,8 +6309,7 @@ static int new_instance_create(const char *name)
6158 return 0; 6309 return 0;
6159 6310
6160 out_free_tr: 6311 out_free_tr:
6161 if (tr->trace_buffer.buffer) 6312 free_trace_buffers(tr);
6162 ring_buffer_free(tr->trace_buffer.buffer);
6163 free_cpumask_var(tr->tracing_cpumask); 6313 free_cpumask_var(tr->tracing_cpumask);
6164 kfree(tr->name); 6314 kfree(tr->name);
6165 kfree(tr); 6315 kfree(tr);
@@ -6199,8 +6349,7 @@ static int instance_delete(const char *name)
6199 event_trace_del_tracer(tr); 6349 event_trace_del_tracer(tr);
6200 ftrace_destroy_function_files(tr); 6350 ftrace_destroy_function_files(tr);
6201 debugfs_remove_recursive(tr->dir); 6351 debugfs_remove_recursive(tr->dir);
6202 free_percpu(tr->trace_buffer.data); 6352 free_trace_buffers(tr);
6203 ring_buffer_free(tr->trace_buffer.buffer);
6204 6353
6205 kfree(tr->name); 6354 kfree(tr->name);
6206 kfree(tr); 6355 kfree(tr);
@@ -6328,6 +6477,11 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer)
6328 trace_create_file("tracing_on", 0644, d_tracer, 6477 trace_create_file("tracing_on", 0644, d_tracer,
6329 tr, &rb_simple_fops); 6478 tr, &rb_simple_fops);
6330 6479
6480#ifdef CONFIG_TRACER_MAX_TRACE
6481 trace_create_file("tracing_max_latency", 0644, d_tracer,
6482 &tr->max_latency, &tracing_max_lat_fops);
6483#endif
6484
6331 if (ftrace_create_function_files(tr, d_tracer)) 6485 if (ftrace_create_function_files(tr, d_tracer))
6332 WARN(1, "Could not allocate function filter files"); 6486 WARN(1, "Could not allocate function filter files");
6333 6487
@@ -6353,11 +6507,6 @@ static __init int tracer_init_debugfs(void)
6353 6507
6354 init_tracer_debugfs(&global_trace, d_tracer); 6508 init_tracer_debugfs(&global_trace, d_tracer);
6355 6509
6356#ifdef CONFIG_TRACER_MAX_TRACE
6357 trace_create_file("tracing_max_latency", 0644, d_tracer,
6358 &tracing_max_latency, &tracing_max_lat_fops);
6359#endif
6360
6361 trace_create_file("tracing_thresh", 0644, d_tracer, 6510 trace_create_file("tracing_thresh", 0644, d_tracer,
6362 &tracing_thresh, &tracing_max_lat_fops); 6511 &tracing_thresh, &tracing_max_lat_fops);
6363 6512
@@ -6367,6 +6516,9 @@ static __init int tracer_init_debugfs(void)
6367 trace_create_file("saved_cmdlines", 0444, d_tracer, 6516 trace_create_file("saved_cmdlines", 0444, d_tracer,
6368 NULL, &tracing_saved_cmdlines_fops); 6517 NULL, &tracing_saved_cmdlines_fops);
6369 6518
6519 trace_create_file("saved_cmdlines_size", 0644, d_tracer,
6520 NULL, &tracing_saved_cmdlines_size_fops);
6521
6370#ifdef CONFIG_DYNAMIC_FTRACE 6522#ifdef CONFIG_DYNAMIC_FTRACE
6371 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, 6523 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
6372 &ftrace_update_tot_cnt, &tracing_dyn_info_fops); 6524 &ftrace_update_tot_cnt, &tracing_dyn_info_fops);
@@ -6603,18 +6755,19 @@ __init static int tracer_alloc_buffers(void)
6603 if (!temp_buffer) 6755 if (!temp_buffer)
6604 goto out_free_cpumask; 6756 goto out_free_cpumask;
6605 6757
6758 if (trace_create_savedcmd() < 0)
6759 goto out_free_temp_buffer;
6760
6606 /* TODO: make the number of buffers hot pluggable with CPUS */ 6761 /* TODO: make the number of buffers hot pluggable with CPUS */
6607 if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) { 6762 if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) {
6608 printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); 6763 printk(KERN_ERR "tracer: failed to allocate ring buffer!\n");
6609 WARN_ON(1); 6764 WARN_ON(1);
6610 goto out_free_temp_buffer; 6765 goto out_free_savedcmd;
6611 } 6766 }
6612 6767
6613 if (global_trace.buffer_disabled) 6768 if (global_trace.buffer_disabled)
6614 tracing_off(); 6769 tracing_off();
6615 6770
6616 trace_init_cmdlines();
6617
6618 if (trace_boot_clock) { 6771 if (trace_boot_clock) {
6619 ret = tracing_set_clock(&global_trace, trace_boot_clock); 6772 ret = tracing_set_clock(&global_trace, trace_boot_clock);
6620 if (ret < 0) 6773 if (ret < 0)
@@ -6629,6 +6782,10 @@ __init static int tracer_alloc_buffers(void)
6629 */ 6782 */
6630 global_trace.current_trace = &nop_trace; 6783 global_trace.current_trace = &nop_trace;
6631 6784
6785 global_trace.max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
6786
6787 ftrace_init_global_array_ops(&global_trace);
6788
6632 register_tracer(&nop_trace); 6789 register_tracer(&nop_trace);
6633 6790
6634 /* All seems OK, enable tracing */ 6791 /* All seems OK, enable tracing */
@@ -6656,13 +6813,11 @@ __init static int tracer_alloc_buffers(void)
6656 6813
6657 return 0; 6814 return 0;
6658 6815
6816out_free_savedcmd:
6817 free_saved_cmdlines_buffer(savedcmd);
6659out_free_temp_buffer: 6818out_free_temp_buffer:
6660 ring_buffer_free(temp_buffer); 6819 ring_buffer_free(temp_buffer);
6661out_free_cpumask: 6820out_free_cpumask:
6662 free_percpu(global_trace.trace_buffer.data);
6663#ifdef CONFIG_TRACER_MAX_TRACE
6664 free_percpu(global_trace.max_buffer.data);
6665#endif
6666 free_cpumask_var(global_trace.tracing_cpumask); 6821 free_cpumask_var(global_trace.tracing_cpumask);
6667out_free_buffer_mask: 6822out_free_buffer_mask:
6668 free_cpumask_var(tracing_buffer_mask); 6823 free_cpumask_var(tracing_buffer_mask);
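The new out_free_savedcmd label above slots into the usual goto-based error unwinding in tracer_alloc_buffers(): each successful allocation adds one more label, and the labels run in reverse allocation order so a late failure releases everything acquired earlier exactly once. A generic sketch of that idiom, with hypothetical ctx/allocation names, is shown below.

#include <stdlib.h>

struct ctx { void *a, *b, *c; };

/* Reverse-order unwind: a later failure frees every earlier allocation once. */
static int ctx_init(struct ctx *ctx)
{
	ctx->a = malloc(64);
	if (!ctx->a)
		goto out;

	ctx->b = malloc(64);
	if (!ctx->b)
		goto out_free_a;

	ctx->c = malloc(64);
	if (!ctx->c)
		goto out_free_b;

	return 0;

out_free_b:
	free(ctx->b);
out_free_a:
	free(ctx->a);
out:
	return -1;
}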
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 2e29d7ba5a52..9e82551dd566 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -190,7 +190,22 @@ struct trace_array {
190 */ 190 */
191 struct trace_buffer max_buffer; 191 struct trace_buffer max_buffer;
192 bool allocated_snapshot; 192 bool allocated_snapshot;
193 unsigned long max_latency;
193#endif 194#endif
195 /*
196 * max_lock is used to protect the swapping of buffers
197 * when taking a max snapshot. The buffers themselves are
198 * protected by per_cpu spinlocks. But the action of the swap
199 * needs its own lock.
200 *
 201 * This is defined as an arch_spinlock_t in order to help
202 * with performance when lockdep debugging is enabled.
203 *
 204 * It is also used in other places outside of update_max_tr(),
 205 * so it needs to be defined outside of the
 206 * CONFIG_TRACER_MAX_TRACE block.
207 */
208 arch_spinlock_t max_lock;
194 int buffer_disabled; 209 int buffer_disabled;
195#ifdef CONFIG_FTRACE_SYSCALLS 210#ifdef CONFIG_FTRACE_SYSCALLS
196 int sys_refcount_enter; 211 int sys_refcount_enter;
@@ -237,6 +252,9 @@ static inline struct trace_array *top_trace_array(void)
237{ 252{
238 struct trace_array *tr; 253 struct trace_array *tr;
239 254
255 if (list_empty(ftrace_trace_arrays.prev))
256 return NULL;
257
240 tr = list_entry(ftrace_trace_arrays.prev, 258 tr = list_entry(ftrace_trace_arrays.prev,
241 typeof(*tr), list); 259 typeof(*tr), list);
242 WARN_ON(!(tr->flags & TRACE_ARRAY_FL_GLOBAL)); 260 WARN_ON(!(tr->flags & TRACE_ARRAY_FL_GLOBAL));
@@ -323,7 +341,6 @@ struct tracer_flags {
323 * @stop: called when tracing is paused (echo 0 > tracing_enabled) 341 * @stop: called when tracing is paused (echo 0 > tracing_enabled)
324 * @open: called when the trace file is opened 342 * @open: called when the trace file is opened
325 * @pipe_open: called when the trace_pipe file is opened 343 * @pipe_open: called when the trace_pipe file is opened
326 * @wait_pipe: override how the user waits for traces on trace_pipe
327 * @close: called when the trace file is released 344 * @close: called when the trace file is released
328 * @pipe_close: called when the trace_pipe file is released 345 * @pipe_close: called when the trace_pipe file is released
329 * @read: override the default read callback on trace_pipe 346 * @read: override the default read callback on trace_pipe
@@ -342,7 +359,6 @@ struct tracer {
342 void (*stop)(struct trace_array *tr); 359 void (*stop)(struct trace_array *tr);
343 void (*open)(struct trace_iterator *iter); 360 void (*open)(struct trace_iterator *iter);
344 void (*pipe_open)(struct trace_iterator *iter); 361 void (*pipe_open)(struct trace_iterator *iter);
345 void (*wait_pipe)(struct trace_iterator *iter);
346 void (*close)(struct trace_iterator *iter); 362 void (*close)(struct trace_iterator *iter);
347 void (*pipe_close)(struct trace_iterator *iter); 363 void (*pipe_close)(struct trace_iterator *iter);
348 ssize_t (*read)(struct trace_iterator *iter, 364 ssize_t (*read)(struct trace_iterator *iter,
@@ -416,13 +432,7 @@ enum {
416 TRACE_FTRACE_IRQ_BIT, 432 TRACE_FTRACE_IRQ_BIT,
417 TRACE_FTRACE_SIRQ_BIT, 433 TRACE_FTRACE_SIRQ_BIT,
418 434
419 /* GLOBAL_BITs must be greater than FTRACE_BITs */ 435 /* INTERNAL_BITs must be greater than FTRACE_BITs */
420 TRACE_GLOBAL_BIT,
421 TRACE_GLOBAL_NMI_BIT,
422 TRACE_GLOBAL_IRQ_BIT,
423 TRACE_GLOBAL_SIRQ_BIT,
424
425 /* INTERNAL_BITs must be greater than GLOBAL_BITs */
426 TRACE_INTERNAL_BIT, 436 TRACE_INTERNAL_BIT,
427 TRACE_INTERNAL_NMI_BIT, 437 TRACE_INTERNAL_NMI_BIT,
428 TRACE_INTERNAL_IRQ_BIT, 438 TRACE_INTERNAL_IRQ_BIT,
@@ -449,9 +459,6 @@ enum {
449#define TRACE_FTRACE_START TRACE_FTRACE_BIT 459#define TRACE_FTRACE_START TRACE_FTRACE_BIT
450#define TRACE_FTRACE_MAX ((1 << (TRACE_FTRACE_START + TRACE_CONTEXT_BITS)) - 1) 460#define TRACE_FTRACE_MAX ((1 << (TRACE_FTRACE_START + TRACE_CONTEXT_BITS)) - 1)
451 461
452#define TRACE_GLOBAL_START TRACE_GLOBAL_BIT
453#define TRACE_GLOBAL_MAX ((1 << (TRACE_GLOBAL_START + TRACE_CONTEXT_BITS)) - 1)
454
455#define TRACE_LIST_START TRACE_INTERNAL_BIT 462#define TRACE_LIST_START TRACE_INTERNAL_BIT
456#define TRACE_LIST_MAX ((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1) 463#define TRACE_LIST_MAX ((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1)
457 464
@@ -560,8 +567,6 @@ void trace_init_global_iter(struct trace_iterator *iter);
560 567
561void tracing_iter_reset(struct trace_iterator *iter, int cpu); 568void tracing_iter_reset(struct trace_iterator *iter, int cpu);
562 569
563void poll_wait_pipe(struct trace_iterator *iter);
564
565void tracing_sched_switch_trace(struct trace_array *tr, 570void tracing_sched_switch_trace(struct trace_array *tr,
566 struct task_struct *prev, 571 struct task_struct *prev,
567 struct task_struct *next, 572 struct task_struct *next,
@@ -608,8 +613,6 @@ extern unsigned long nsecs_to_usecs(unsigned long nsecs);
608extern unsigned long tracing_thresh; 613extern unsigned long tracing_thresh;
609 614
610#ifdef CONFIG_TRACER_MAX_TRACE 615#ifdef CONFIG_TRACER_MAX_TRACE
611extern unsigned long tracing_max_latency;
612
613void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); 616void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
614void update_max_tr_single(struct trace_array *tr, 617void update_max_tr_single(struct trace_array *tr,
615 struct task_struct *tsk, int cpu); 618 struct task_struct *tsk, int cpu);
@@ -724,6 +727,8 @@ extern unsigned long trace_flags;
724#define TRACE_GRAPH_PRINT_PROC 0x8 727#define TRACE_GRAPH_PRINT_PROC 0x8
725#define TRACE_GRAPH_PRINT_DURATION 0x10 728#define TRACE_GRAPH_PRINT_DURATION 0x10
726#define TRACE_GRAPH_PRINT_ABS_TIME 0x20 729#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
730#define TRACE_GRAPH_PRINT_IRQS 0x40
731#define TRACE_GRAPH_PRINT_TAIL 0x80
727#define TRACE_GRAPH_PRINT_FILL_SHIFT 28 732#define TRACE_GRAPH_PRINT_FILL_SHIFT 28
728#define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT) 733#define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT)
729 734
@@ -823,6 +828,10 @@ extern int ftrace_is_dead(void);
823int ftrace_create_function_files(struct trace_array *tr, 828int ftrace_create_function_files(struct trace_array *tr,
824 struct dentry *parent); 829 struct dentry *parent);
825void ftrace_destroy_function_files(struct trace_array *tr); 830void ftrace_destroy_function_files(struct trace_array *tr);
831void ftrace_init_global_array_ops(struct trace_array *tr);
832void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func);
833void ftrace_reset_array_ops(struct trace_array *tr);
834int using_ftrace_ops_list_func(void);
826#else 835#else
827static inline int ftrace_trace_task(struct task_struct *task) 836static inline int ftrace_trace_task(struct task_struct *task)
828{ 837{
@@ -836,6 +845,11 @@ ftrace_create_function_files(struct trace_array *tr,
836 return 0; 845 return 0;
837} 846}
838static inline void ftrace_destroy_function_files(struct trace_array *tr) { } 847static inline void ftrace_destroy_function_files(struct trace_array *tr) { }
848static inline __init void
849ftrace_init_global_array_ops(struct trace_array *tr) { }
850static inline void ftrace_reset_array_ops(struct trace_array *tr) { }
 851/* ftrace_func_t type is not defined, use macro instead of static inline */
852#define ftrace_init_array_ops(tr, func) do { } while (0)
839#endif /* CONFIG_FUNCTION_TRACER */ 853#endif /* CONFIG_FUNCTION_TRACER */
840 854
841#if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE) 855#if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE)
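With max_latency and max_lock folded into struct trace_array above, every tracing instance now carries its own max-snapshot state, and the buffer swap is serialized by the instance's own lock instead of the old global ftrace_max_lock. The sketch below shows the general shape of such a per-instance protected swap using a userspace spinlock; it is an analogy, not the kernel's update_max_tr().

#include <pthread.h>

struct buffer { int dummy; /* stands in for a ring buffer */ };

/* Each instance carries its own lock and its own max-snapshot state. */
struct instance {
	pthread_spinlock_t max_lock;    /* initialize with pthread_spin_init() */
	struct buffer *trace_buf;
	struct buffer *max_buf;
	unsigned long max_latency;
};

/* Swap the live buffer with the snapshot buffer under the per-instance lock. */
static void take_snapshot(struct instance *tr, unsigned long latency)
{
	struct buffer *tmp;

	pthread_spin_lock(&tr->max_lock);
	tmp = tr->trace_buf;
	tr->trace_buf = tr->max_buf;
	tr->max_buf = tmp;
	tr->max_latency = latency;
	pthread_spin_unlock(&tr->max_lock);
}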
diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c
new file mode 100644
index 000000000000..40a14cbcf8e0
--- /dev/null
+++ b/kernel/trace/trace_benchmark.c
@@ -0,0 +1,198 @@
1#include <linux/delay.h>
2#include <linux/module.h>
3#include <linux/kthread.h>
4#include <linux/trace_clock.h>
5
6#define CREATE_TRACE_POINTS
7#include "trace_benchmark.h"
8
9static struct task_struct *bm_event_thread;
10
11static char bm_str[BENCHMARK_EVENT_STRLEN] = "START";
12
13static u64 bm_total;
14static u64 bm_totalsq;
15static u64 bm_last;
16static u64 bm_max;
17static u64 bm_min;
18static u64 bm_first;
19static u64 bm_cnt;
20static u64 bm_stddev;
21static unsigned int bm_avg;
22static unsigned int bm_std;
23
24/*
25 * This gets called in a loop recording the time it took to write
26 * the tracepoint. What it writes is the time statistics of the last
27 * tracepoint write. As there is nothing to write the first time
28 * it simply writes "START". As the first write is cold cache and
29 * the rest is hot, we save off that time in bm_first and it is
30 * reported as "first", which is shown in the second write to the
 31 * tracepoint. The "first" field is written within the statistics from
32 * then on but never changes.
33 */
34static void trace_do_benchmark(void)
35{
36 u64 start;
37 u64 stop;
38 u64 delta;
39 u64 stddev;
40 u64 seed;
41 u64 last_seed;
42 unsigned int avg;
43 unsigned int std = 0;
44
45 /* Only run if the tracepoint is actually active */
46 if (!trace_benchmark_event_enabled())
47 return;
48
49 local_irq_disable();
50 start = trace_clock_local();
51 trace_benchmark_event(bm_str);
52 stop = trace_clock_local();
53 local_irq_enable();
54
55 bm_cnt++;
56
57 delta = stop - start;
58
59 /*
60 * The first read is cold cached, keep it separate from the
61 * other calculations.
62 */
63 if (bm_cnt == 1) {
64 bm_first = delta;
65 scnprintf(bm_str, BENCHMARK_EVENT_STRLEN,
66 "first=%llu [COLD CACHED]", bm_first);
67 return;
68 }
69
70 bm_last = delta;
71
72 if (delta > bm_max)
73 bm_max = delta;
74 if (!bm_min || delta < bm_min)
75 bm_min = delta;
76
77 /*
78 * When bm_cnt is greater than UINT_MAX, it breaks the statistics
79 * accounting. Freeze the statistics when that happens.
80 * We should have enough data for the avg and stddev anyway.
81 */
82 if (bm_cnt > UINT_MAX) {
83 scnprintf(bm_str, BENCHMARK_EVENT_STRLEN,
84 "last=%llu first=%llu max=%llu min=%llu ** avg=%u std=%d std^2=%lld",
85 bm_last, bm_first, bm_max, bm_min, bm_avg, bm_std, bm_stddev);
86 return;
87 }
88
89 bm_total += delta;
90 bm_totalsq += delta * delta;
91
92
93 if (bm_cnt > 1) {
94 /*
95 * Apply Welford's method to calculate standard deviation:
96 * s^2 = 1 / (n * (n-1)) * (n * \Sum (x_i)^2 - (\Sum x_i)^2)
97 */
98 stddev = (u64)bm_cnt * bm_totalsq - bm_total * bm_total;
99 do_div(stddev, (u32)bm_cnt);
100 do_div(stddev, (u32)bm_cnt - 1);
101 } else
102 stddev = 0;
103
104 delta = bm_total;
105 do_div(delta, bm_cnt);
106 avg = delta;
107
108 if (stddev > 0) {
109 int i = 0;
110 /*
111 * stddev is the square of standard deviation but
 112 * we want the actual number. Use the average
113 * as our seed to find the std.
114 *
115 * The next try is:
116 * x = (x + N/x) / 2
117 *
118 * Where N is the squared number to find the square
119 * root of.
120 */
121 seed = avg;
122 do {
123 last_seed = seed;
124 seed = stddev;
125 if (!last_seed)
126 break;
127 do_div(seed, last_seed);
128 seed += last_seed;
129 do_div(seed, 2);
130 } while (i++ < 10 && last_seed != seed);
131
132 std = seed;
133 }
134
135 scnprintf(bm_str, BENCHMARK_EVENT_STRLEN,
136 "last=%llu first=%llu max=%llu min=%llu avg=%u std=%d std^2=%lld",
137 bm_last, bm_first, bm_max, bm_min, avg, std, stddev);
138
139 bm_std = std;
140 bm_avg = avg;
141 bm_stddev = stddev;
142}
143
144static int benchmark_event_kthread(void *arg)
145{
146 /* sleep a bit to make sure the tracepoint gets activated */
147 msleep(100);
148
149 while (!kthread_should_stop()) {
150
151 trace_do_benchmark();
152
153 /*
154 * We don't go to sleep, but let others
155 * run as well.
156 */
157 cond_resched();
158 }
159
160 return 0;
161}
162
163/*
164 * When the benchmark tracepoint is enabled, it calls this
165 * function and the thread that calls the tracepoint is created.
166 */
167void trace_benchmark_reg(void)
168{
169 bm_event_thread = kthread_run(benchmark_event_kthread,
170 NULL, "event_benchmark");
171 WARN_ON(!bm_event_thread);
172}
173
174/*
175 * When the benchmark tracepoint is disabled, it calls this
176 * function and the thread that calls the tracepoint is deleted
177 * and all the numbers are reset.
178 */
179void trace_benchmark_unreg(void)
180{
181 if (!bm_event_thread)
182 return;
183
184 kthread_stop(bm_event_thread);
185
186 strcpy(bm_str, "START");
187 bm_total = 0;
188 bm_totalsq = 0;
189 bm_last = 0;
190 bm_max = 0;
191 bm_min = 0;
192 bm_cnt = 0;
193 /* These don't need to be reset but reset them anyway */
194 bm_first = 0;
195 bm_std = 0;
196 bm_avg = 0;
197 bm_stddev = 0;
198}
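trace_do_benchmark() above keeps two running sums, bm_total and bm_totalsq, derives the variance from the sum-of-squares form (n*Σx² - (Σx)²) / (n*(n-1)), and then takes an integer square root with a few Newton iterations x = (x + N/x) / 2 seeded by the average. The standalone program below reproduces that arithmetic with hypothetical sample latencies so the numbers can be sanity-checked outside the kernel; it mirrors the formulas, not the kernel code itself.

#include <stdio.h>
#include <stdint.h>

/* Integer square root by Newton's iteration: x = (x + N/x) / 2. */
static uint64_t isqrt(uint64_t n, uint64_t seed)
{
	uint64_t x = seed ? seed : 1;
	uint64_t last;
	int i = 0;

	if (!n)
		return 0;
	do {
		last = x;
		x = (x + n / x) / 2;
	} while (i++ < 10 && x != last);

	return x;
}

int main(void)
{
	/* Hypothetical per-event latencies in ns, standing in for the deltas. */
	uint64_t samples[] = { 120, 135, 128, 131, 127, 140, 126, 133 };
	uint64_t n = sizeof(samples) / sizeof(samples[0]);
	uint64_t total = 0, totalsq = 0;

	for (uint64_t i = 0; i < n; i++) {
		total   += samples[i];
		totalsq += samples[i] * samples[i];
	}

	/* Sample variance: (n * sum(x^2) - (sum x)^2) / (n * (n - 1)). */
	uint64_t var = (n * totalsq - total * total) / (n * (n - 1));
	uint64_t avg = total / n;
	uint64_t std = isqrt(var, avg);

	/* Prints: avg=130 std=6 std^2=37 */
	printf("avg=%llu std=%llu std^2=%llu\n",
	       (unsigned long long)avg, (unsigned long long)std,
	       (unsigned long long)var);
	return 0;
}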
diff --git a/kernel/trace/trace_benchmark.h b/kernel/trace/trace_benchmark.h
new file mode 100644
index 000000000000..3c1df1df4e29
--- /dev/null
+++ b/kernel/trace/trace_benchmark.h
@@ -0,0 +1,41 @@
1#undef TRACE_SYSTEM
2#define TRACE_SYSTEM benchmark
3
4#if !defined(_TRACE_BENCHMARK_H) || defined(TRACE_HEADER_MULTI_READ)
5#define _TRACE_BENCHMARK_H
6
7#include <linux/tracepoint.h>
8
9extern void trace_benchmark_reg(void);
10extern void trace_benchmark_unreg(void);
11
12#define BENCHMARK_EVENT_STRLEN 128
13
14TRACE_EVENT_FN(benchmark_event,
15
16 TP_PROTO(const char *str),
17
18 TP_ARGS(str),
19
20 TP_STRUCT__entry(
21 __array( char, str, BENCHMARK_EVENT_STRLEN )
22 ),
23
24 TP_fast_assign(
25 memcpy(__entry->str, str, BENCHMARK_EVENT_STRLEN);
26 ),
27
28 TP_printk("%s", __entry->str),
29
30 trace_benchmark_reg, trace_benchmark_unreg
31);
32
33#endif /* _TRACE_BENCHMARK_H */
34
35#undef TRACE_INCLUDE_FILE
36#undef TRACE_INCLUDE_PATH
37#define TRACE_INCLUDE_PATH .
38#define TRACE_INCLUDE_FILE trace_benchmark
39
40/* This part must be outside protection */
41#include <trace/define_trace.h>
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 3ddfd8f62c05..f99e0b3bca8c 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -574,6 +574,9 @@ int trace_set_clr_event(const char *system, const char *event, int set)
574{ 574{
575 struct trace_array *tr = top_trace_array(); 575 struct trace_array *tr = top_trace_array();
576 576
577 if (!tr)
578 return -ENODEV;
579
577 return __ftrace_set_clr_event(tr, NULL, system, event, set); 580 return __ftrace_set_clr_event(tr, NULL, system, event, set);
578} 581}
579EXPORT_SYMBOL_GPL(trace_set_clr_event); 582EXPORT_SYMBOL_GPL(trace_set_clr_event);
@@ -2065,6 +2068,9 @@ event_enable_func(struct ftrace_hash *hash,
2065 bool enable; 2068 bool enable;
2066 int ret; 2069 int ret;
2067 2070
2071 if (!tr)
2072 return -ENODEV;
2073
2068 /* hash funcs only work with set_ftrace_filter */ 2074 /* hash funcs only work with set_ftrace_filter */
2069 if (!enabled || !param) 2075 if (!enabled || !param)
2070 return -EINVAL; 2076 return -EINVAL;
@@ -2396,6 +2402,9 @@ static __init int event_trace_enable(void)
2396 char *token; 2402 char *token;
2397 int ret; 2403 int ret;
2398 2404
2405 if (!tr)
2406 return -ENODEV;
2407
2399 for_each_event(iter, __start_ftrace_events, __stop_ftrace_events) { 2408 for_each_event(iter, __start_ftrace_events, __stop_ftrace_events) {
2400 2409
2401 call = *iter; 2410 call = *iter;
@@ -2442,6 +2451,8 @@ static __init int event_trace_init(void)
2442 int ret; 2451 int ret;
2443 2452
2444 tr = top_trace_array(); 2453 tr = top_trace_array();
2454 if (!tr)
2455 return -ENODEV;
2445 2456
2446 d_tracer = tracing_init_dentry(); 2457 d_tracer = tracing_init_dentry();
2447 if (!d_tracer) 2458 if (!d_tracer)
@@ -2535,6 +2546,8 @@ static __init void event_trace_self_tests(void)
2535 int ret; 2546 int ret;
2536 2547
2537 tr = top_trace_array(); 2548 tr = top_trace_array();
2549 if (!tr)
2550 return;
2538 2551
2539 pr_info("Running tests on trace events:\n"); 2552 pr_info("Running tests on trace events:\n");
2540 2553
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 925f537f07d1..4747b476a030 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -77,7 +77,7 @@ event_triggers_call(struct ftrace_event_file *file, void *rec)
77 data->ops->func(data); 77 data->ops->func(data);
78 continue; 78 continue;
79 } 79 }
80 filter = rcu_dereference(data->filter); 80 filter = rcu_dereference_sched(data->filter);
81 if (filter && !filter_match_preds(filter, rec)) 81 if (filter && !filter_match_preds(filter, rec))
82 continue; 82 continue;
83 if (data->cmd_ops->post_trigger) { 83 if (data->cmd_ops->post_trigger) {
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 5b781d2be383..57f0ec962d2c 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -26,8 +26,6 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,
26static void 26static void
27function_stack_trace_call(unsigned long ip, unsigned long parent_ip, 27function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
28 struct ftrace_ops *op, struct pt_regs *pt_regs); 28 struct ftrace_ops *op, struct pt_regs *pt_regs);
29static struct ftrace_ops trace_ops;
30static struct ftrace_ops trace_stack_ops;
31static struct tracer_flags func_flags; 29static struct tracer_flags func_flags;
32 30
33/* Our option */ 31/* Our option */
@@ -58,12 +56,16 @@ int ftrace_create_function_files(struct trace_array *tr,
58{ 56{
59 int ret; 57 int ret;
60 58
61 /* The top level array uses the "global_ops". */ 59 /*
62 if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL)) { 60 * The top level array uses the "global_ops", and the files are
63 ret = allocate_ftrace_ops(tr); 61 * created on boot up.
64 if (ret) 62 */
65 return ret; 63 if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
66 } 64 return 0;
65
66 ret = allocate_ftrace_ops(tr);
67 if (ret)
68 return ret;
67 69
68 ftrace_create_filter_files(tr->ops, parent); 70 ftrace_create_filter_files(tr->ops, parent);
69 71
@@ -79,28 +81,24 @@ void ftrace_destroy_function_files(struct trace_array *tr)
79 81
80static int function_trace_init(struct trace_array *tr) 82static int function_trace_init(struct trace_array *tr)
81{ 83{
82 struct ftrace_ops *ops; 84 ftrace_func_t func;
83
84 if (tr->flags & TRACE_ARRAY_FL_GLOBAL) {
85 /* There's only one global tr */
86 if (!trace_ops.private) {
87 trace_ops.private = tr;
88 trace_stack_ops.private = tr;
89 }
90 85
91 if (func_flags.val & TRACE_FUNC_OPT_STACK) 86 /*
92 ops = &trace_stack_ops; 87 * Instance trace_arrays get their ops allocated
93 else 88 * at instance creation. Unless it failed
94 ops = &trace_ops; 89 * the allocation.
95 tr->ops = ops; 90 */
96 } else if (!tr->ops) { 91 if (!tr->ops)
97 /*
98 * Instance trace_arrays get their ops allocated
99 * at instance creation. Unless it failed
100 * the allocation.
101 */
102 return -ENOMEM; 92 return -ENOMEM;
103 } 93
94 /* Currently only the global instance can do stack tracing */
95 if (tr->flags & TRACE_ARRAY_FL_GLOBAL &&
96 func_flags.val & TRACE_FUNC_OPT_STACK)
97 func = function_stack_trace_call;
98 else
99 func = function_trace_call;
100
101 ftrace_init_array_ops(tr, func);
104 102
105 tr->trace_buffer.cpu = get_cpu(); 103 tr->trace_buffer.cpu = get_cpu();
106 put_cpu(); 104 put_cpu();
@@ -114,6 +112,7 @@ static void function_trace_reset(struct trace_array *tr)
114{ 112{
115 tracing_stop_function_trace(tr); 113 tracing_stop_function_trace(tr);
116 tracing_stop_cmdline_record(); 114 tracing_stop_cmdline_record();
115 ftrace_reset_array_ops(tr);
117} 116}
118 117
119static void function_trace_start(struct trace_array *tr) 118static void function_trace_start(struct trace_array *tr)
@@ -195,18 +194,6 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
195 local_irq_restore(flags); 194 local_irq_restore(flags);
196} 195}
197 196
198static struct ftrace_ops trace_ops __read_mostly =
199{
200 .func = function_trace_call,
201 .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE,
202};
203
204static struct ftrace_ops trace_stack_ops __read_mostly =
205{
206 .func = function_stack_trace_call,
207 .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE,
208};
209
210static struct tracer_opt func_opts[] = { 197static struct tracer_opt func_opts[] = {
211#ifdef CONFIG_STACKTRACE 198#ifdef CONFIG_STACKTRACE
212 { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) }, 199 { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) },
@@ -244,10 +231,10 @@ func_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
244 unregister_ftrace_function(tr->ops); 231 unregister_ftrace_function(tr->ops);
245 232
246 if (set) { 233 if (set) {
247 tr->ops = &trace_stack_ops; 234 tr->ops->func = function_stack_trace_call;
248 register_ftrace_function(tr->ops); 235 register_ftrace_function(tr->ops);
249 } else { 236 } else {
250 tr->ops = &trace_ops; 237 tr->ops->func = function_trace_call;
251 register_ftrace_function(tr->ops); 238 register_ftrace_function(tr->ops);
252 } 239 }
253 240
@@ -265,7 +252,6 @@ static struct tracer function_trace __tracer_data =
265 .init = function_trace_init, 252 .init = function_trace_init,
266 .reset = function_trace_reset, 253 .reset = function_trace_reset,
267 .start = function_trace_start, 254 .start = function_trace_start,
268 .wait_pipe = poll_wait_pipe,
269 .flags = &func_flags, 255 .flags = &func_flags,
270 .set_flag = func_set_flag, 256 .set_flag = func_set_flag,
271 .allow_instances = true, 257 .allow_instances = true,
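trace_functions.c above stops keeping two static ftrace_ops and instead unregisters the instance's single tr->ops, repoints its ->func callback, and registers it again when the func_stack_trace option flips. The fragment below sketches that unregister/repoint/re-register sequence with stand-in types and helpers; it is not the real ftrace API.

#include <stddef.h>

typedef void (*trace_fn_t)(unsigned long ip);

struct ops {
	trace_fn_t func;
};

static void plain_trace(unsigned long ip) { (void)ip; /* record ip only     */ }
static void stack_trace(unsigned long ip) { (void)ip; /* record ip + stack  */ }

/* Stand-ins for registering/unregistering a callback with some framework. */
static void register_ops(struct ops *ops)   { (void)ops; }
static void unregister_ops(struct ops *ops) { (void)ops; }

/*
 * Flip the behaviour of one ops structure instead of keeping two:
 * take it out of service, repoint ->func, and put it back.
 */
static void set_stack_tracing(struct ops *ops, int enable)
{
	unregister_ops(ops);
	ops->func = enable ? stack_trace : plain_trace;
	register_ops(ops);
}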
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index deff11200261..4de3e57f723c 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -38,15 +38,6 @@ struct fgraph_data {
38 38
39#define TRACE_GRAPH_INDENT 2 39#define TRACE_GRAPH_INDENT 2
40 40
41/* Flag options */
42#define TRACE_GRAPH_PRINT_OVERRUN 0x1
43#define TRACE_GRAPH_PRINT_CPU 0x2
44#define TRACE_GRAPH_PRINT_OVERHEAD 0x4
45#define TRACE_GRAPH_PRINT_PROC 0x8
46#define TRACE_GRAPH_PRINT_DURATION 0x10
47#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
48#define TRACE_GRAPH_PRINT_IRQS 0x40
49
50static unsigned int max_depth; 41static unsigned int max_depth;
51 42
52static struct tracer_opt trace_opts[] = { 43static struct tracer_opt trace_opts[] = {
@@ -64,11 +55,13 @@ static struct tracer_opt trace_opts[] = {
64 { TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) }, 55 { TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) },
65 /* Display interrupts */ 56 /* Display interrupts */
66 { TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) }, 57 { TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) },
58 /* Display function name after trailing } */
59 { TRACER_OPT(funcgraph-tail, TRACE_GRAPH_PRINT_TAIL) },
67 { } /* Empty entry */ 60 { } /* Empty entry */
68}; 61};
69 62
70static struct tracer_flags tracer_flags = { 63static struct tracer_flags tracer_flags = {
71 /* Don't display overruns and proc by default */ 64 /* Don't display overruns, proc, or tail by default */
72 .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD | 65 .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD |
73 TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS, 66 TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS,
74 .opts = trace_opts 67 .opts = trace_opts
@@ -1176,9 +1169,10 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
1176 * If the return function does not have a matching entry, 1169 * If the return function does not have a matching entry,
1177 * then the entry was lost. Instead of just printing 1170 * then the entry was lost. Instead of just printing
1178 * the '}' and letting the user guess what function this 1171 * the '}' and letting the user guess what function this
1179 * belongs to, write out the function name. 1172 * belongs to, write out the function name. Always do
1173 * that if the funcgraph-tail option is enabled.
1180 */ 1174 */
1181 if (func_match) { 1175 if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL)) {
1182 ret = trace_seq_puts(s, "}\n"); 1176 ret = trace_seq_puts(s, "}\n");
1183 if (!ret) 1177 if (!ret)
1184 return TRACE_TYPE_PARTIAL_LINE; 1178 return TRACE_TYPE_PARTIAL_LINE;
@@ -1505,7 +1499,6 @@ static struct tracer graph_trace __tracer_data = {
1505 .pipe_open = graph_trace_open, 1499 .pipe_open = graph_trace_open,
1506 .close = graph_trace_close, 1500 .close = graph_trace_close,
1507 .pipe_close = graph_trace_close, 1501 .pipe_close = graph_trace_close,
1508 .wait_pipe = poll_wait_pipe,
1509 .init = graph_trace_init, 1502 .init = graph_trace_init,
1510 .reset = graph_trace_reset, 1503 .reset = graph_trace_reset,
1511 .print_line = print_graph_function, 1504 .print_line = print_graph_function,
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 8ff02cbb892f..9bb104f748d0 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -151,12 +151,6 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip,
151 151
152 atomic_dec(&data->disabled); 152 atomic_dec(&data->disabled);
153} 153}
154
155static struct ftrace_ops trace_ops __read_mostly =
156{
157 .func = irqsoff_tracer_call,
158 .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE,
159};
160#endif /* CONFIG_FUNCTION_TRACER */ 154#endif /* CONFIG_FUNCTION_TRACER */
161 155
162#ifdef CONFIG_FUNCTION_GRAPH_TRACER 156#ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -176,7 +170,7 @@ irqsoff_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
176 for_each_possible_cpu(cpu) 170 for_each_possible_cpu(cpu)
177 per_cpu(tracing_cpu, cpu) = 0; 171 per_cpu(tracing_cpu, cpu) = 0;
178 172
179 tracing_max_latency = 0; 173 tr->max_latency = 0;
180 tracing_reset_online_cpus(&irqsoff_trace->trace_buffer); 174 tracing_reset_online_cpus(&irqsoff_trace->trace_buffer);
181 175
182 return start_irqsoff_tracer(irqsoff_trace, set); 176 return start_irqsoff_tracer(irqsoff_trace, set);
@@ -303,13 +297,13 @@ static void irqsoff_print_header(struct seq_file *s)
303/* 297/*
304 * Should this new latency be reported/recorded? 298 * Should this new latency be reported/recorded?
305 */ 299 */
306static int report_latency(cycle_t delta) 300static int report_latency(struct trace_array *tr, cycle_t delta)
307{ 301{
308 if (tracing_thresh) { 302 if (tracing_thresh) {
309 if (delta < tracing_thresh) 303 if (delta < tracing_thresh)
310 return 0; 304 return 0;
311 } else { 305 } else {
312 if (delta <= tracing_max_latency) 306 if (delta <= tr->max_latency)
313 return 0; 307 return 0;
314 } 308 }
315 return 1; 309 return 1;
@@ -333,13 +327,13 @@ check_critical_timing(struct trace_array *tr,
333 327
334 pc = preempt_count(); 328 pc = preempt_count();
335 329
336 if (!report_latency(delta)) 330 if (!report_latency(tr, delta))
337 goto out; 331 goto out;
338 332
339 raw_spin_lock_irqsave(&max_trace_lock, flags); 333 raw_spin_lock_irqsave(&max_trace_lock, flags);
340 334
341 /* check if we are still the max latency */ 335 /* check if we are still the max latency */
342 if (!report_latency(delta)) 336 if (!report_latency(tr, delta))
343 goto out_unlock; 337 goto out_unlock;
344 338
345 __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); 339 __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
@@ -352,7 +346,7 @@ check_critical_timing(struct trace_array *tr,
352 data->critical_end = parent_ip; 346 data->critical_end = parent_ip;
353 347
354 if (likely(!is_tracing_stopped())) { 348 if (likely(!is_tracing_stopped())) {
355 tracing_max_latency = delta; 349 tr->max_latency = delta;
356 update_max_tr_single(tr, current, cpu); 350 update_max_tr_single(tr, current, cpu);
357 } 351 }
358 352
@@ -531,7 +525,7 @@ void trace_preempt_off(unsigned long a0, unsigned long a1)
531} 525}
532#endif /* CONFIG_PREEMPT_TRACER */ 526#endif /* CONFIG_PREEMPT_TRACER */
533 527
534static int register_irqsoff_function(int graph, int set) 528static int register_irqsoff_function(struct trace_array *tr, int graph, int set)
535{ 529{
536 int ret; 530 int ret;
537 531
@@ -543,7 +537,7 @@ static int register_irqsoff_function(int graph, int set)
543 ret = register_ftrace_graph(&irqsoff_graph_return, 537 ret = register_ftrace_graph(&irqsoff_graph_return,
544 &irqsoff_graph_entry); 538 &irqsoff_graph_entry);
545 else 539 else
546 ret = register_ftrace_function(&trace_ops); 540 ret = register_ftrace_function(tr->ops);
547 541
548 if (!ret) 542 if (!ret)
549 function_enabled = true; 543 function_enabled = true;
@@ -551,7 +545,7 @@ static int register_irqsoff_function(int graph, int set)
551 return ret; 545 return ret;
552} 546}
553 547
554static void unregister_irqsoff_function(int graph) 548static void unregister_irqsoff_function(struct trace_array *tr, int graph)
555{ 549{
556 if (!function_enabled) 550 if (!function_enabled)
557 return; 551 return;
@@ -559,17 +553,17 @@ static void unregister_irqsoff_function(int graph)
559 if (graph) 553 if (graph)
560 unregister_ftrace_graph(); 554 unregister_ftrace_graph();
561 else 555 else
562 unregister_ftrace_function(&trace_ops); 556 unregister_ftrace_function(tr->ops);
563 557
564 function_enabled = false; 558 function_enabled = false;
565} 559}
566 560
567static void irqsoff_function_set(int set) 561static void irqsoff_function_set(struct trace_array *tr, int set)
568{ 562{
569 if (set) 563 if (set)
570 register_irqsoff_function(is_graph(), 1); 564 register_irqsoff_function(tr, is_graph(), 1);
571 else 565 else
572 unregister_irqsoff_function(is_graph()); 566 unregister_irqsoff_function(tr, is_graph());
573} 567}
574 568
575static int irqsoff_flag_changed(struct trace_array *tr, u32 mask, int set) 569static int irqsoff_flag_changed(struct trace_array *tr, u32 mask, int set)
@@ -577,7 +571,7 @@ static int irqsoff_flag_changed(struct trace_array *tr, u32 mask, int set)
577 struct tracer *tracer = tr->current_trace; 571 struct tracer *tracer = tr->current_trace;
578 572
579 if (mask & TRACE_ITER_FUNCTION) 573 if (mask & TRACE_ITER_FUNCTION)
580 irqsoff_function_set(set); 574 irqsoff_function_set(tr, set);
581 575
582 return trace_keep_overwrite(tracer, mask, set); 576 return trace_keep_overwrite(tracer, mask, set);
583} 577}
@@ -586,7 +580,7 @@ static int start_irqsoff_tracer(struct trace_array *tr, int graph)
586{ 580{
587 int ret; 581 int ret;
588 582
589 ret = register_irqsoff_function(graph, 0); 583 ret = register_irqsoff_function(tr, graph, 0);
590 584
591 if (!ret && tracing_is_enabled()) 585 if (!ret && tracing_is_enabled())
592 tracer_enabled = 1; 586 tracer_enabled = 1;
@@ -600,25 +594,37 @@ static void stop_irqsoff_tracer(struct trace_array *tr, int graph)
600{ 594{
601 tracer_enabled = 0; 595 tracer_enabled = 0;
602 596
603 unregister_irqsoff_function(graph); 597 unregister_irqsoff_function(tr, graph);
604} 598}
605 599
606static void __irqsoff_tracer_init(struct trace_array *tr) 600static bool irqsoff_busy;
601
602static int __irqsoff_tracer_init(struct trace_array *tr)
607{ 603{
604 if (irqsoff_busy)
605 return -EBUSY;
606
608 save_flags = trace_flags; 607 save_flags = trace_flags;
609 608
610 /* non overwrite screws up the latency tracers */ 609 /* non overwrite screws up the latency tracers */
611 set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1); 610 set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1);
612 set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1); 611 set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1);
613 612
614 tracing_max_latency = 0; 613 tr->max_latency = 0;
615 irqsoff_trace = tr; 614 irqsoff_trace = tr;
616 /* make sure that the tracer is visible */ 615 /* make sure that the tracer is visible */
617 smp_wmb(); 616 smp_wmb();
618 tracing_reset_online_cpus(&tr->trace_buffer); 617 tracing_reset_online_cpus(&tr->trace_buffer);
619 618
620 if (start_irqsoff_tracer(tr, is_graph())) 619 ftrace_init_array_ops(tr, irqsoff_tracer_call);
620
621 /* Only toplevel instance supports graph tracing */
622 if (start_irqsoff_tracer(tr, (tr->flags & TRACE_ARRAY_FL_GLOBAL &&
623 is_graph())))
621 printk(KERN_ERR "failed to start irqsoff tracer\n"); 624 printk(KERN_ERR "failed to start irqsoff tracer\n");
625
626 irqsoff_busy = true;
627 return 0;
622} 628}
623 629
624static void irqsoff_tracer_reset(struct trace_array *tr) 630static void irqsoff_tracer_reset(struct trace_array *tr)
@@ -630,6 +636,9 @@ static void irqsoff_tracer_reset(struct trace_array *tr)
630 636
631 set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag); 637 set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag);
632 set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag); 638 set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag);
639 ftrace_reset_array_ops(tr);
640
641 irqsoff_busy = false;
633} 642}
634 643
635static void irqsoff_tracer_start(struct trace_array *tr) 644static void irqsoff_tracer_start(struct trace_array *tr)
@@ -647,8 +656,7 @@ static int irqsoff_tracer_init(struct trace_array *tr)
647{ 656{
648 trace_type = TRACER_IRQS_OFF; 657 trace_type = TRACER_IRQS_OFF;
649 658
650 __irqsoff_tracer_init(tr); 659 return __irqsoff_tracer_init(tr);
651 return 0;
652} 660}
653static struct tracer irqsoff_tracer __read_mostly = 661static struct tracer irqsoff_tracer __read_mostly =
654{ 662{
@@ -668,6 +676,7 @@ static struct tracer irqsoff_tracer __read_mostly =
668#endif 676#endif
669 .open = irqsoff_trace_open, 677 .open = irqsoff_trace_open,
670 .close = irqsoff_trace_close, 678 .close = irqsoff_trace_close,
679 .allow_instances = true,
671 .use_max_tr = true, 680 .use_max_tr = true,
672}; 681};
673# define register_irqsoff(trace) register_tracer(&trace) 682# define register_irqsoff(trace) register_tracer(&trace)
@@ -680,8 +689,7 @@ static int preemptoff_tracer_init(struct trace_array *tr)
680{ 689{
681 trace_type = TRACER_PREEMPT_OFF; 690 trace_type = TRACER_PREEMPT_OFF;
682 691
683 __irqsoff_tracer_init(tr); 692 return __irqsoff_tracer_init(tr);
684 return 0;
685} 693}
686 694
687static struct tracer preemptoff_tracer __read_mostly = 695static struct tracer preemptoff_tracer __read_mostly =
@@ -702,6 +710,7 @@ static struct tracer preemptoff_tracer __read_mostly =
702#endif 710#endif
703 .open = irqsoff_trace_open, 711 .open = irqsoff_trace_open,
704 .close = irqsoff_trace_close, 712 .close = irqsoff_trace_close,
713 .allow_instances = true,
705 .use_max_tr = true, 714 .use_max_tr = true,
706}; 715};
707# define register_preemptoff(trace) register_tracer(&trace) 716# define register_preemptoff(trace) register_tracer(&trace)
@@ -716,8 +725,7 @@ static int preemptirqsoff_tracer_init(struct trace_array *tr)
716{ 725{
717 trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF; 726 trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF;
718 727
719 __irqsoff_tracer_init(tr); 728 return __irqsoff_tracer_init(tr);
720 return 0;
721} 729}
722 730
723static struct tracer preemptirqsoff_tracer __read_mostly = 731static struct tracer preemptirqsoff_tracer __read_mostly =
@@ -738,6 +746,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
738#endif 746#endif
739 .open = irqsoff_trace_open, 747 .open = irqsoff_trace_open,
740 .close = irqsoff_trace_close, 748 .close = irqsoff_trace_close,
749 .allow_instances = true,
741 .use_max_tr = true, 750 .use_max_tr = true,
742}; 751};
743 752
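Because parts of the latency-tracer machinery are still global (graph tracing, max_trace_lock), __irqsoff_tracer_init() above now refuses to start when another instance already owns the tracer, and irqsoff_tracer_reset() clears the flag. A small sketch of that one-owner-at-a-time guard follows; in the kernel the init/reset paths are serialized by a higher-level mutex, so the bare flag is sufficient there, while a standalone version would need its own locking.

#include <stdbool.h>
#include <errno.h>

static bool tracer_busy;

/* Only one instance may own the tracer at a time. */
static int tracer_init(void)
{
	if (tracer_busy)
		return -EBUSY;

	/* ... set up per-instance state and start tracing ... */

	tracer_busy = true;
	return 0;
}

static void tracer_reset(void)
{
	/* ... stop tracing and tear down per-instance state ... */

	tracer_busy = false;
}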
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 903ae28962be..ef2fba1f46b5 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1377,6 +1377,9 @@ static __init int kprobe_trace_self_tests_init(void)
1377 struct trace_kprobe *tk; 1377 struct trace_kprobe *tk;
1378 struct ftrace_event_file *file; 1378 struct ftrace_event_file *file;
1379 1379
1380 if (tracing_is_disabled())
1381 return -ENODEV;
1382
1380 target = kprobe_trace_selftest_target; 1383 target = kprobe_trace_selftest_target;
1381 1384
1382 pr_info("Testing kprobe tracing: "); 1385 pr_info("Testing kprobe tracing: ");
diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c
index 69a5cc94c01a..fcf0a9e48916 100644
--- a/kernel/trace/trace_nop.c
+++ b/kernel/trace/trace_nop.c
@@ -91,7 +91,6 @@ struct tracer nop_trace __read_mostly =
91 .name = "nop", 91 .name = "nop",
92 .init = nop_trace_init, 92 .init = nop_trace_init,
93 .reset = nop_trace_reset, 93 .reset = nop_trace_reset,
94 .wait_pipe = poll_wait_pipe,
95#ifdef CONFIG_FTRACE_SELFTEST 94#ifdef CONFIG_FTRACE_SELFTEST
96 .selftest = trace_selftest_startup_nop, 95 .selftest = trace_selftest_startup_nop,
97#endif 96#endif
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index a436de18aa99..f3dad80c20b2 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -126,6 +126,34 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
126EXPORT_SYMBOL_GPL(trace_seq_printf); 126EXPORT_SYMBOL_GPL(trace_seq_printf);
127 127
128/** 128/**
129 * trace_seq_bitmask - put a list of longs as a bitmask print output
130 * @s: trace sequence descriptor
131 * @maskp: points to an array of unsigned longs that represent a bitmask
132 * @nmaskbits: The number of bits that are valid in @maskp
133 *
 134 * Returns 0 if the bitmask does not fit in the buffer's free
 135 * space, 1 otherwise.
 136 *
 137 * Writes an ASCII representation of the bitmask into @s.
138 */
139int
140trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp,
141 int nmaskbits)
142{
143 int len = (PAGE_SIZE - 1) - s->len;
144 int ret;
145
146 if (s->full || !len)
147 return 0;
148
149 ret = bitmap_scnprintf(s->buffer, len, maskp, nmaskbits);
150 s->len += ret;
151
152 return 1;
153}
154EXPORT_SYMBOL_GPL(trace_seq_bitmask);
155
156/**
129 * trace_seq_vprintf - sequence printing of trace information 157 * trace_seq_vprintf - sequence printing of trace information
130 * @s: trace sequence descriptor 158 * @s: trace sequence descriptor
131 * @fmt: printf format string 159 * @fmt: printf format string
@@ -399,6 +427,19 @@ EXPORT_SYMBOL(ftrace_print_symbols_seq_u64);
399#endif 427#endif
400 428
401const char * 429const char *
430ftrace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr,
431 unsigned int bitmask_size)
432{
433 const char *ret = p->buffer + p->len;
434
435 trace_seq_bitmask(p, bitmask_ptr, bitmask_size * 8);
436 trace_seq_putc(p, 0);
437
438 return ret;
439}
440EXPORT_SYMBOL_GPL(ftrace_print_bitmask_seq);
441
442const char *
402ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) 443ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
403{ 444{
404 int i; 445 int i;
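trace_seq_bitmask() above renders a bitmap into the sequence buffer as comma-separated hex chunks (via bitmap_scnprintf) and reports 0 when the text does not fit in the remaining space. The standalone program below loosely mirrors that output format for a two-word mask; the helper name and the exact padding of lower chunks are illustrative assumptions, not the kernel implementation.

#include <stdio.h>
#include <stddef.h>

/*
 * Userspace sketch of rendering a bitmask: 32-bit chunks in hex,
 * most significant chunk first, separated by commas (zero-padding of
 * the lower chunks is an assumption about the kernel format).
 */
static int format_bitmask(char *buf, size_t len,
			  const unsigned int *chunks, int nchunks)
{
	size_t pos = 0;
	int i;

	for (i = nchunks - 1; i >= 0; i--) {
		int n = snprintf(buf + pos, len - pos,
				 (i == nchunks - 1) ? "%x" : ",%08x",
				 chunks[i]);
		if (n < 0 || (size_t)n >= len - pos)
			return 0;	/* does not fit in the free space */
		pos += (size_t)n;
	}
	return 1;
}

int main(void)
{
	unsigned int mask[2] = { 0x0000000f, 0x1 };	/* bits 0-3 and 32 set */
	char buf[64];

	if (format_bitmask(buf, sizeof(buf), mask, 2))
		printf("%s\n", buf);			/* prints "1,0000000f" */
	return 0;
}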
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index e14da5e97a69..19bd8928ce94 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -130,15 +130,9 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip,
130 atomic_dec(&data->disabled); 130 atomic_dec(&data->disabled);
131 preempt_enable_notrace(); 131 preempt_enable_notrace();
132} 132}
133
134static struct ftrace_ops trace_ops __read_mostly =
135{
136 .func = wakeup_tracer_call,
137 .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE,
138};
139#endif /* CONFIG_FUNCTION_TRACER */ 133#endif /* CONFIG_FUNCTION_TRACER */
140 134
141static int register_wakeup_function(int graph, int set) 135static int register_wakeup_function(struct trace_array *tr, int graph, int set)
142{ 136{
143 int ret; 137 int ret;
144 138
@@ -150,7 +144,7 @@ static int register_wakeup_function(int graph, int set)
150 ret = register_ftrace_graph(&wakeup_graph_return, 144 ret = register_ftrace_graph(&wakeup_graph_return,
151 &wakeup_graph_entry); 145 &wakeup_graph_entry);
152 else 146 else
153 ret = register_ftrace_function(&trace_ops); 147 ret = register_ftrace_function(tr->ops);
154 148
155 if (!ret) 149 if (!ret)
156 function_enabled = true; 150 function_enabled = true;
@@ -158,7 +152,7 @@ static int register_wakeup_function(int graph, int set)
158 return ret; 152 return ret;
159} 153}
160 154
161static void unregister_wakeup_function(int graph) 155static void unregister_wakeup_function(struct trace_array *tr, int graph)
162{ 156{
163 if (!function_enabled) 157 if (!function_enabled)
164 return; 158 return;
@@ -166,17 +160,17 @@ static void unregister_wakeup_function(int graph)
166 if (graph) 160 if (graph)
167 unregister_ftrace_graph(); 161 unregister_ftrace_graph();
168 else 162 else
169 unregister_ftrace_function(&trace_ops); 163 unregister_ftrace_function(tr->ops);
170 164
171 function_enabled = false; 165 function_enabled = false;
172} 166}
173 167
174static void wakeup_function_set(int set) 168static void wakeup_function_set(struct trace_array *tr, int set)
175{ 169{
176 if (set) 170 if (set)
177 register_wakeup_function(is_graph(), 1); 171 register_wakeup_function(tr, is_graph(), 1);
178 else 172 else
179 unregister_wakeup_function(is_graph()); 173 unregister_wakeup_function(tr, is_graph());
180} 174}
181 175
182static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set) 176static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set)
@@ -184,16 +178,16 @@ static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set)
184 struct tracer *tracer = tr->current_trace; 178 struct tracer *tracer = tr->current_trace;
185 179
186 if (mask & TRACE_ITER_FUNCTION) 180 if (mask & TRACE_ITER_FUNCTION)
187 wakeup_function_set(set); 181 wakeup_function_set(tr, set);
188 182
189 return trace_keep_overwrite(tracer, mask, set); 183 return trace_keep_overwrite(tracer, mask, set);
190} 184}
191 185
192static int start_func_tracer(int graph) 186static int start_func_tracer(struct trace_array *tr, int graph)
193{ 187{
194 int ret; 188 int ret;
195 189
196 ret = register_wakeup_function(graph, 0); 190 ret = register_wakeup_function(tr, graph, 0);
197 191
198 if (!ret && tracing_is_enabled()) 192 if (!ret && tracing_is_enabled())
199 tracer_enabled = 1; 193 tracer_enabled = 1;
@@ -203,11 +197,11 @@ static int start_func_tracer(int graph)
203 return ret; 197 return ret;
204} 198}
205 199
206static void stop_func_tracer(int graph) 200static void stop_func_tracer(struct trace_array *tr, int graph)
207{ 201{
208 tracer_enabled = 0; 202 tracer_enabled = 0;
209 203
210 unregister_wakeup_function(graph); 204 unregister_wakeup_function(tr, graph);
211} 205}
212 206
213#ifdef CONFIG_FUNCTION_GRAPH_TRACER 207#ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -221,12 +215,12 @@ wakeup_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
221 if (!(is_graph() ^ set)) 215 if (!(is_graph() ^ set))
222 return 0; 216 return 0;
223 217
224 stop_func_tracer(!set); 218 stop_func_tracer(tr, !set);
225 219
226 wakeup_reset(wakeup_trace); 220 wakeup_reset(wakeup_trace);
227 tracing_max_latency = 0; 221 tr->max_latency = 0;
228 222
229 return start_func_tracer(set); 223 return start_func_tracer(tr, set);
230} 224}
231 225
232static int wakeup_graph_entry(struct ftrace_graph_ent *trace) 226static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
@@ -350,13 +344,13 @@ static void wakeup_print_header(struct seq_file *s)
350/* 344/*
351 * Should this new latency be reported/recorded? 345 * Should this new latency be reported/recorded?
352 */ 346 */
353static int report_latency(cycle_t delta) 347static int report_latency(struct trace_array *tr, cycle_t delta)
354{ 348{
355 if (tracing_thresh) { 349 if (tracing_thresh) {
356 if (delta < tracing_thresh) 350 if (delta < tracing_thresh)
357 return 0; 351 return 0;
358 } else { 352 } else {
359 if (delta <= tracing_max_latency) 353 if (delta <= tr->max_latency)
360 return 0; 354 return 0;
361 } 355 }
362 return 1; 356 return 1;
@@ -424,11 +418,11 @@ probe_wakeup_sched_switch(void *ignore,
424 T1 = ftrace_now(cpu); 418 T1 = ftrace_now(cpu);
425 delta = T1-T0; 419 delta = T1-T0;
426 420
427 if (!report_latency(delta)) 421 if (!report_latency(wakeup_trace, delta))
428 goto out_unlock; 422 goto out_unlock;
429 423
430 if (likely(!is_tracing_stopped())) { 424 if (likely(!is_tracing_stopped())) {
431 tracing_max_latency = delta; 425 wakeup_trace->max_latency = delta;
432 update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu); 426 update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu);
433 } 427 }
434 428
@@ -587,7 +581,7 @@ static void start_wakeup_tracer(struct trace_array *tr)
587 */ 581 */
588 smp_wmb(); 582 smp_wmb();
589 583
590 if (start_func_tracer(is_graph())) 584 if (start_func_tracer(tr, is_graph()))
591 printk(KERN_ERR "failed to start wakeup tracer\n"); 585 printk(KERN_ERR "failed to start wakeup tracer\n");
592 586
593 return; 587 return;
@@ -600,13 +594,15 @@ fail_deprobe:
600static void stop_wakeup_tracer(struct trace_array *tr) 594static void stop_wakeup_tracer(struct trace_array *tr)
601{ 595{
602 tracer_enabled = 0; 596 tracer_enabled = 0;
603 stop_func_tracer(is_graph()); 597 stop_func_tracer(tr, is_graph());
604 unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL); 598 unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL);
605 unregister_trace_sched_wakeup_new(probe_wakeup, NULL); 599 unregister_trace_sched_wakeup_new(probe_wakeup, NULL);
606 unregister_trace_sched_wakeup(probe_wakeup, NULL); 600 unregister_trace_sched_wakeup(probe_wakeup, NULL);
607 unregister_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL); 601 unregister_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL);
608} 602}
609 603
604static bool wakeup_busy;
605
610static int __wakeup_tracer_init(struct trace_array *tr) 606static int __wakeup_tracer_init(struct trace_array *tr)
611{ 607{
612 save_flags = trace_flags; 608 save_flags = trace_flags;
@@ -615,14 +611,20 @@ static int __wakeup_tracer_init(struct trace_array *tr)
615 set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1); 611 set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1);
616 set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1); 612 set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1);
617 613
618 tracing_max_latency = 0; 614 tr->max_latency = 0;
619 wakeup_trace = tr; 615 wakeup_trace = tr;
616 ftrace_init_array_ops(tr, wakeup_tracer_call);
620 start_wakeup_tracer(tr); 617 start_wakeup_tracer(tr);
618
619 wakeup_busy = true;
621 return 0; 620 return 0;
622} 621}
623 622
624static int wakeup_tracer_init(struct trace_array *tr) 623static int wakeup_tracer_init(struct trace_array *tr)
625{ 624{
625 if (wakeup_busy)
626 return -EBUSY;
627
626 wakeup_dl = 0; 628 wakeup_dl = 0;
627 wakeup_rt = 0; 629 wakeup_rt = 0;
628 return __wakeup_tracer_init(tr); 630 return __wakeup_tracer_init(tr);
@@ -630,6 +632,9 @@ static int wakeup_tracer_init(struct trace_array *tr)
630 632
631static int wakeup_rt_tracer_init(struct trace_array *tr) 633static int wakeup_rt_tracer_init(struct trace_array *tr)
632{ 634{
635 if (wakeup_busy)
636 return -EBUSY;
637
633 wakeup_dl = 0; 638 wakeup_dl = 0;
634 wakeup_rt = 1; 639 wakeup_rt = 1;
635 return __wakeup_tracer_init(tr); 640 return __wakeup_tracer_init(tr);
@@ -637,6 +642,9 @@ static int wakeup_rt_tracer_init(struct trace_array *tr)
637 642
638static int wakeup_dl_tracer_init(struct trace_array *tr) 643static int wakeup_dl_tracer_init(struct trace_array *tr)
639{ 644{
645 if (wakeup_busy)
646 return -EBUSY;
647
640 wakeup_dl = 1; 648 wakeup_dl = 1;
641 wakeup_rt = 0; 649 wakeup_rt = 0;
642 return __wakeup_tracer_init(tr); 650 return __wakeup_tracer_init(tr);
@@ -653,6 +661,8 @@ static void wakeup_tracer_reset(struct trace_array *tr)
653 661
654 set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag); 662 set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag);
655 set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag); 663 set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag);
664 ftrace_reset_array_ops(tr);
665 wakeup_busy = false;
656} 666}
657 667
658static void wakeup_tracer_start(struct trace_array *tr) 668static void wakeup_tracer_start(struct trace_array *tr)
@@ -684,6 +694,7 @@ static struct tracer wakeup_tracer __read_mostly =
684#endif 694#endif
685 .open = wakeup_trace_open, 695 .open = wakeup_trace_open,
686 .close = wakeup_trace_close, 696 .close = wakeup_trace_close,
697 .allow_instances = true,
687 .use_max_tr = true, 698 .use_max_tr = true,
688}; 699};
689 700
@@ -694,7 +705,6 @@ static struct tracer wakeup_rt_tracer __read_mostly =
694 .reset = wakeup_tracer_reset, 705 .reset = wakeup_tracer_reset,
695 .start = wakeup_tracer_start, 706 .start = wakeup_tracer_start,
696 .stop = wakeup_tracer_stop, 707 .stop = wakeup_tracer_stop,
697 .wait_pipe = poll_wait_pipe,
698 .print_max = true, 708 .print_max = true,
699 .print_header = wakeup_print_header, 709 .print_header = wakeup_print_header,
700 .print_line = wakeup_print_line, 710 .print_line = wakeup_print_line,
@@ -706,6 +716,7 @@ static struct tracer wakeup_rt_tracer __read_mostly =
706#endif 716#endif
707 .open = wakeup_trace_open, 717 .open = wakeup_trace_open,
708 .close = wakeup_trace_close, 718 .close = wakeup_trace_close,
719 .allow_instances = true,
709 .use_max_tr = true, 720 .use_max_tr = true,
710}; 721};
711 722
@@ -716,7 +727,6 @@ static struct tracer wakeup_dl_tracer __read_mostly =
716 .reset = wakeup_tracer_reset, 727 .reset = wakeup_tracer_reset,
717 .start = wakeup_tracer_start, 728 .start = wakeup_tracer_start,
718 .stop = wakeup_tracer_stop, 729 .stop = wakeup_tracer_stop,
719 .wait_pipe = poll_wait_pipe,
720 .print_max = true, 730 .print_max = true,
721 .print_header = wakeup_print_header, 731 .print_header = wakeup_print_header,
722 .print_line = wakeup_print_line, 732 .print_line = wakeup_print_line,
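
The wakeup tracer hunks above move the recorded maximum latency from the global tracing_max_latency into the trace_array, and gate initialization on a wakeup_busy flag so that, with .allow_instances set, only one instance can own the tracer at a time (a second init returns -EBUSY). A minimal userspace sketch of that single-owner init/reset pattern follows; the struct, function names, and messages are invented for illustration and are not the kernel's code.

    #include <errno.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct instance {
            const char *name;
            unsigned long max_latency;      /* per-instance, not a global */
    };

    static bool tracer_busy;                /* one instance may own the tracer */

    static int tracer_init(struct instance *inst)
    {
            if (tracer_busy)
                    return -EBUSY;          /* another instance already owns it */

            inst->max_latency = 0;
            tracer_busy = true;
            printf("%s: tracer started\n", inst->name);
            return 0;
    }

    static void tracer_reset(struct instance *inst)
    {
            printf("%s: tracer stopped, max latency %lu\n",
                   inst->name, inst->max_latency);
            tracer_busy = false;            /* release ownership */
    }

    int main(void)
    {
            struct instance top = { .name = "top" }, inst1 = { .name = "instance1" };

            tracer_init(&top);
            if (tracer_init(&inst1) == -EBUSY)      /* second instance is refused */
                    printf("%s: tracer busy\n", inst1.name);
            tracer_reset(&top);
            return 0;
    }
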
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index e98fca60974f..5ef60499dc8e 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -65,7 +65,7 @@ static int trace_test_buffer(struct trace_buffer *buf, unsigned long *count)
65 65
66 /* Don't allow flipping of max traces now */ 66 /* Don't allow flipping of max traces now */
67 local_irq_save(flags); 67 local_irq_save(flags);
68 arch_spin_lock(&ftrace_max_lock); 68 arch_spin_lock(&buf->tr->max_lock);
69 69
70 cnt = ring_buffer_entries(buf->buffer); 70 cnt = ring_buffer_entries(buf->buffer);
71 71
@@ -83,7 +83,7 @@ static int trace_test_buffer(struct trace_buffer *buf, unsigned long *count)
83 break; 83 break;
84 } 84 }
85 tracing_on(); 85 tracing_on();
86 arch_spin_unlock(&ftrace_max_lock); 86 arch_spin_unlock(&buf->tr->max_lock);
87 local_irq_restore(flags); 87 local_irq_restore(flags);
88 88
89 if (count) 89 if (count)
@@ -161,11 +161,6 @@ static struct ftrace_ops test_probe3 = {
161 .flags = FTRACE_OPS_FL_RECURSION_SAFE, 161 .flags = FTRACE_OPS_FL_RECURSION_SAFE,
162}; 162};
163 163
164static struct ftrace_ops test_global = {
165 .func = trace_selftest_test_global_func,
166 .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE,
167};
168
169static void print_counts(void) 164static void print_counts(void)
170{ 165{
171 printk("(%d %d %d %d %d) ", 166 printk("(%d %d %d %d %d) ",
@@ -185,7 +180,7 @@ static void reset_counts(void)
185 trace_selftest_test_dyn_cnt = 0; 180 trace_selftest_test_dyn_cnt = 0;
186} 181}
187 182
188static int trace_selftest_ops(int cnt) 183static int trace_selftest_ops(struct trace_array *tr, int cnt)
189{ 184{
190 int save_ftrace_enabled = ftrace_enabled; 185 int save_ftrace_enabled = ftrace_enabled;
191 struct ftrace_ops *dyn_ops; 186 struct ftrace_ops *dyn_ops;
@@ -220,7 +215,11 @@ static int trace_selftest_ops(int cnt)
220 register_ftrace_function(&test_probe1); 215 register_ftrace_function(&test_probe1);
221 register_ftrace_function(&test_probe2); 216 register_ftrace_function(&test_probe2);
222 register_ftrace_function(&test_probe3); 217 register_ftrace_function(&test_probe3);
223 register_ftrace_function(&test_global); 218 /* First time we are running with main function */
219 if (cnt > 1) {
220 ftrace_init_array_ops(tr, trace_selftest_test_global_func);
221 register_ftrace_function(tr->ops);
222 }
224 223
225 DYN_FTRACE_TEST_NAME(); 224 DYN_FTRACE_TEST_NAME();
226 225
@@ -232,8 +231,10 @@ static int trace_selftest_ops(int cnt)
232 goto out; 231 goto out;
233 if (trace_selftest_test_probe3_cnt != 1) 232 if (trace_selftest_test_probe3_cnt != 1)
234 goto out; 233 goto out;
235 if (trace_selftest_test_global_cnt == 0) 234 if (cnt > 1) {
236 goto out; 235 if (trace_selftest_test_global_cnt == 0)
236 goto out;
237 }
237 238
238 DYN_FTRACE_TEST_NAME2(); 239 DYN_FTRACE_TEST_NAME2();
239 240
@@ -269,8 +270,10 @@ static int trace_selftest_ops(int cnt)
269 goto out_free; 270 goto out_free;
270 if (trace_selftest_test_probe3_cnt != 3) 271 if (trace_selftest_test_probe3_cnt != 3)
271 goto out_free; 272 goto out_free;
272 if (trace_selftest_test_global_cnt == 0) 273 if (cnt > 1) {
273 goto out; 274 if (trace_selftest_test_global_cnt == 0)
275 goto out;
276 }
274 if (trace_selftest_test_dyn_cnt == 0) 277 if (trace_selftest_test_dyn_cnt == 0)
275 goto out_free; 278 goto out_free;
276 279
@@ -295,7 +298,9 @@ static int trace_selftest_ops(int cnt)
295 unregister_ftrace_function(&test_probe1); 298 unregister_ftrace_function(&test_probe1);
296 unregister_ftrace_function(&test_probe2); 299 unregister_ftrace_function(&test_probe2);
297 unregister_ftrace_function(&test_probe3); 300 unregister_ftrace_function(&test_probe3);
298 unregister_ftrace_function(&test_global); 301 if (cnt > 1)
302 unregister_ftrace_function(tr->ops);
303 ftrace_reset_array_ops(tr);
299 304
300 /* Make sure everything is off */ 305 /* Make sure everything is off */
301 reset_counts(); 306 reset_counts();
@@ -315,9 +320,9 @@ static int trace_selftest_ops(int cnt)
315} 320}
316 321
317/* Test dynamic code modification and ftrace filters */ 322/* Test dynamic code modification and ftrace filters */
318int trace_selftest_startup_dynamic_tracing(struct tracer *trace, 323static int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
319 struct trace_array *tr, 324 struct trace_array *tr,
320 int (*func)(void)) 325 int (*func)(void))
321{ 326{
322 int save_ftrace_enabled = ftrace_enabled; 327 int save_ftrace_enabled = ftrace_enabled;
323 unsigned long count; 328 unsigned long count;
@@ -388,7 +393,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
388 } 393 }
389 394
390 /* Test the ops with global tracing running */ 395 /* Test the ops with global tracing running */
391 ret = trace_selftest_ops(1); 396 ret = trace_selftest_ops(tr, 1);
392 trace->reset(tr); 397 trace->reset(tr);
393 398
394 out: 399 out:
@@ -399,7 +404,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
399 404
400 /* Test the ops with global tracing off */ 405 /* Test the ops with global tracing off */
401 if (!ret) 406 if (!ret)
402 ret = trace_selftest_ops(2); 407 ret = trace_selftest_ops(tr, 2);
403 408
404 return ret; 409 return ret;
405} 410}
@@ -802,7 +807,7 @@ out:
802int 807int
803trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) 808trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
804{ 809{
805 unsigned long save_max = tracing_max_latency; 810 unsigned long save_max = tr->max_latency;
806 unsigned long count; 811 unsigned long count;
807 int ret; 812 int ret;
808 813
@@ -814,7 +819,7 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
814 } 819 }
815 820
816 /* reset the max latency */ 821 /* reset the max latency */
817 tracing_max_latency = 0; 822 tr->max_latency = 0;
818 /* disable interrupts for a bit */ 823 /* disable interrupts for a bit */
819 local_irq_disable(); 824 local_irq_disable();
820 udelay(100); 825 udelay(100);
@@ -841,7 +846,7 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
841 ret = -1; 846 ret = -1;
842 } 847 }
843 848
844 tracing_max_latency = save_max; 849 tr->max_latency = save_max;
845 850
846 return ret; 851 return ret;
847} 852}
@@ -851,7 +856,7 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
851int 856int
852trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) 857trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
853{ 858{
854 unsigned long save_max = tracing_max_latency; 859 unsigned long save_max = tr->max_latency;
855 unsigned long count; 860 unsigned long count;
856 int ret; 861 int ret;
857 862
@@ -876,7 +881,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
876 } 881 }
877 882
878 /* reset the max latency */ 883 /* reset the max latency */
879 tracing_max_latency = 0; 884 tr->max_latency = 0;
880 /* disable preemption for a bit */ 885 /* disable preemption for a bit */
881 preempt_disable(); 886 preempt_disable();
882 udelay(100); 887 udelay(100);
@@ -903,7 +908,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
903 ret = -1; 908 ret = -1;
904 } 909 }
905 910
906 tracing_max_latency = save_max; 911 tr->max_latency = save_max;
907 912
908 return ret; 913 return ret;
909} 914}
@@ -913,7 +918,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
913int 918int
914trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *tr) 919trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *tr)
915{ 920{
916 unsigned long save_max = tracing_max_latency; 921 unsigned long save_max = tr->max_latency;
917 unsigned long count; 922 unsigned long count;
918 int ret; 923 int ret;
919 924
@@ -938,7 +943,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
938 } 943 }
939 944
940 /* reset the max latency */ 945 /* reset the max latency */
941 tracing_max_latency = 0; 946 tr->max_latency = 0;
942 947
943 /* disable preemption and interrupts for a bit */ 948 /* disable preemption and interrupts for a bit */
944 preempt_disable(); 949 preempt_disable();
@@ -973,7 +978,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
973 } 978 }
974 979
975 /* do the test by disabling interrupts first this time */ 980 /* do the test by disabling interrupts first this time */
976 tracing_max_latency = 0; 981 tr->max_latency = 0;
977 tracing_start(); 982 tracing_start();
978 trace->start(tr); 983 trace->start(tr);
979 984
@@ -1004,7 +1009,7 @@ out:
1004 tracing_start(); 1009 tracing_start();
1005out_no_start: 1010out_no_start:
1006 trace->reset(tr); 1011 trace->reset(tr);
1007 tracing_max_latency = save_max; 1012 tr->max_latency = save_max;
1008 1013
1009 return ret; 1014 return ret;
1010} 1015}
@@ -1057,7 +1062,7 @@ static int trace_wakeup_test_thread(void *data)
1057int 1062int
1058trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) 1063trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
1059{ 1064{
1060 unsigned long save_max = tracing_max_latency; 1065 unsigned long save_max = tr->max_latency;
1061 struct task_struct *p; 1066 struct task_struct *p;
1062 struct completion is_ready; 1067 struct completion is_ready;
1063 unsigned long count; 1068 unsigned long count;
@@ -1083,7 +1088,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
1083 } 1088 }
1084 1089
1085 /* reset the max latency */ 1090 /* reset the max latency */
1086 tracing_max_latency = 0; 1091 tr->max_latency = 0;
1087 1092
1088 while (p->on_rq) { 1093 while (p->on_rq) {
1089 /* 1094 /*
@@ -1113,7 +1118,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
1113 trace->reset(tr); 1118 trace->reset(tr);
1114 tracing_start(); 1119 tracing_start();
1115 1120
1116 tracing_max_latency = save_max; 1121 tr->max_latency = save_max;
1117 1122
1118 /* kill the thread */ 1123 /* kill the thread */
1119 kthread_stop(p); 1124 kthread_stop(p);
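
Each latency selftest above now saves tr->max_latency, zeroes it for the measurement, and restores it afterwards instead of touching a global. The sketch below shows that save/zero/restore shape in plain C; trace_array here is a stand-in struct and fake_workload is an invented placeholder, not the kernel's tracer.

    #include <stdio.h>

    struct trace_array {
            unsigned long max_latency;
    };

    static void fake_workload(struct trace_array *tr)
    {
            if (123 > tr->max_latency)
                    tr->max_latency = 123;  /* pretend latency, arbitrary units */
    }

    /* Save, zero, measure, restore: the dance each latency selftest now does
     * on tr->max_latency rather than on a shared global. */
    static unsigned long run_latency_selftest(struct trace_array *tr,
                                              void (*workload)(struct trace_array *))
    {
            unsigned long save_max = tr->max_latency;
            unsigned long measured;

            tr->max_latency = 0;            /* start the test with a clean maximum */
            workload(tr);                   /* tracer would record into tr->max_latency */
            measured = tr->max_latency;
            tr->max_latency = save_max;     /* don't clobber the user's real maximum */
            return measured;
    }

    int main(void)
    {
            struct trace_array tr = { .max_latency = 9999 };
            unsigned long measured = run_latency_selftest(&tr, fake_workload);

            printf("measured %lu, restored max %lu\n", measured, tr.max_latency);
            return 0;
    }
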
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 21b320e5d163..8a4e5cb66a4c 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -51,11 +51,33 @@ static DEFINE_MUTEX(stack_sysctl_mutex);
51int stack_tracer_enabled; 51int stack_tracer_enabled;
52static int last_stack_tracer_enabled; 52static int last_stack_tracer_enabled;
53 53
54static inline void print_max_stack(void)
55{
56 long i;
57 int size;
58
59 pr_emerg(" Depth Size Location (%d entries)\n"
60 " ----- ---- --------\n",
61 max_stack_trace.nr_entries - 1);
62
63 for (i = 0; i < max_stack_trace.nr_entries; i++) {
64 if (stack_dump_trace[i] == ULONG_MAX)
65 break;
66 if (i+1 == max_stack_trace.nr_entries ||
67 stack_dump_trace[i+1] == ULONG_MAX)
68 size = stack_dump_index[i];
69 else
70 size = stack_dump_index[i] - stack_dump_index[i+1];
71
72 pr_emerg("%3ld) %8d %5d %pS\n", i, stack_dump_index[i],
73 size, (void *)stack_dump_trace[i]);
74 }
75}
76
54static inline void 77static inline void
55check_stack(unsigned long ip, unsigned long *stack) 78check_stack(unsigned long ip, unsigned long *stack)
56{ 79{
57 unsigned long this_size, flags; 80 unsigned long this_size, flags; unsigned long *p, *top, *start;
58 unsigned long *p, *top, *start;
59 static int tracer_frame; 81 static int tracer_frame;
60 int frame_size = ACCESS_ONCE(tracer_frame); 82 int frame_size = ACCESS_ONCE(tracer_frame);
61 int i; 83 int i;
@@ -85,8 +107,12 @@ check_stack(unsigned long ip, unsigned long *stack)
85 107
86 max_stack_size = this_size; 108 max_stack_size = this_size;
87 109
88 max_stack_trace.nr_entries = 0; 110 max_stack_trace.nr_entries = 0;
89 max_stack_trace.skip = 3; 111
112 if (using_ftrace_ops_list_func())
113 max_stack_trace.skip = 4;
114 else
115 max_stack_trace.skip = 3;
90 116
91 save_stack_trace(&max_stack_trace); 117 save_stack_trace(&max_stack_trace);
92 118
@@ -145,8 +171,12 @@ check_stack(unsigned long ip, unsigned long *stack)
145 i++; 171 i++;
146 } 172 }
147 173
148 BUG_ON(current != &init_task && 174 if ((current != &init_task &&
149 *(end_of_stack(current)) != STACK_END_MAGIC); 175 *(end_of_stack(current)) != STACK_END_MAGIC)) {
176 print_max_stack();
177 BUG();
178 }
179
150 out: 180 out:
151 arch_spin_unlock(&max_stack_lock); 181 arch_spin_unlock(&max_stack_lock);
152 local_irq_restore(flags); 182 local_irq_restore(flags);
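
print_max_stack() reports each frame's size as the difference between adjacent cumulative depth indexes, with the final entry's size being its own depth. A standalone sketch of just that arithmetic, using made-up depth values and names:

    #include <stdio.h>

    /* Illustrative only: cumulative stack depth per traced entry, deepest first.
     * Values and names are made up for the example. */
    static const int depth_index[] = { 480, 352, 208, 96 };
    static const char *location[]  = { "func_d", "func_c", "func_b", "func_a" };
    static const int nr_entries = 4;

    int main(void)
    {
            printf("   Depth    Size   Location (%d entries)\n", nr_entries);
            printf("   -----    ----   --------\n");
            for (int i = 0; i < nr_entries; i++) {
                    /* The last entry's size is its own depth; every other
                     * entry's size is the gap to the next (shallower) one. */
                    int size = (i + 1 == nr_entries)
                                    ? depth_index[i]
                                    : depth_index[i] - depth_index[i + 1];
                    printf("%3d) %8d %5d   %s\n", i, depth_index[i], size, location[i]);
            }
            return 0;
    }
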
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 930e51462dc8..c082a7441345 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -732,9 +732,15 @@ static int uprobe_buffer_enable(void)
732 732
733static void uprobe_buffer_disable(void) 733static void uprobe_buffer_disable(void)
734{ 734{
735 int cpu;
736
735 BUG_ON(!mutex_is_locked(&event_mutex)); 737 BUG_ON(!mutex_is_locked(&event_mutex));
736 738
737 if (--uprobe_buffer_refcnt == 0) { 739 if (--uprobe_buffer_refcnt == 0) {
740 for_each_possible_cpu(cpu)
741 free_page((unsigned long)per_cpu_ptr(uprobe_cpu_buffer,
742 cpu)->buf);
743
738 free_percpu(uprobe_cpu_buffer); 744 free_percpu(uprobe_cpu_buffer);
739 uprobe_cpu_buffer = NULL; 745 uprobe_cpu_buffer = NULL;
740 } 746 }
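
The uprobe fix frees the page owned by every per-CPU slot before calling free_percpu(), since free_percpu() releases only the per-CPU area itself, not the memory each slot points to. A rough userspace analogue of that ownership rule, with plain calloc/malloc standing in for the per-CPU allocator and all names invented:

    #include <stdlib.h>

    #define NR_CPUS 4

    struct cpu_buffer {
            char *buf;                      /* separately allocated payload */
    };

    static struct cpu_buffer *buffers;      /* stand-in for the per-CPU area */

    static int buffers_enable(void)
    {
            buffers = calloc(NR_CPUS, sizeof(*buffers));
            if (!buffers)
                    return -1;
            for (int cpu = 0; cpu < NR_CPUS; cpu++) {
                    buffers[cpu].buf = malloc(4096);
                    if (!buffers[cpu].buf)
                            return -1;      /* error unwinding elided for brevity */
            }
            return 0;
    }

    static void buffers_disable(void)
    {
            /* Free what each slot points to first ... */
            for (int cpu = 0; cpu < NR_CPUS; cpu++)
                    free(buffers[cpu].buf);
            /* ... then the container itself, mirroring free_percpu(). */
            free(buffers);
            buffers = NULL;
    }

    int main(void)
    {
            if (buffers_enable() == 0)
                    buffers_disable();
            return 0;
    }
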
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index ac5b23cf7212..33cbd8c203f8 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -188,7 +188,6 @@ static int tracepoint_add_func(struct tracepoint *tp,
188 WARN_ON_ONCE(1); 188 WARN_ON_ONCE(1);
189 return PTR_ERR(old); 189 return PTR_ERR(old);
190 } 190 }
191 release_probes(old);
192 191
193 /* 192 /*
194 * rcu_assign_pointer has a smp_wmb() which makes sure that the new 193 * rcu_assign_pointer has a smp_wmb() which makes sure that the new
@@ -200,6 +199,7 @@ static int tracepoint_add_func(struct tracepoint *tp,
200 rcu_assign_pointer(tp->funcs, tp_funcs); 199 rcu_assign_pointer(tp->funcs, tp_funcs);
201 if (!static_key_enabled(&tp->key)) 200 if (!static_key_enabled(&tp->key))
202 static_key_slow_inc(&tp->key); 201 static_key_slow_inc(&tp->key);
202 release_probes(old);
203 return 0; 203 return 0;
204} 204}
205 205
@@ -221,7 +221,6 @@ static int tracepoint_remove_func(struct tracepoint *tp,
221 WARN_ON_ONCE(1); 221 WARN_ON_ONCE(1);
222 return PTR_ERR(old); 222 return PTR_ERR(old);
223 } 223 }
224 release_probes(old);
225 224
226 if (!tp_funcs) { 225 if (!tp_funcs) {
227 /* Removed last function */ 226 /* Removed last function */
@@ -232,6 +231,7 @@ static int tracepoint_remove_func(struct tracepoint *tp,
232 static_key_slow_dec(&tp->key); 231 static_key_slow_dec(&tp->key);
233 } 232 }
234 rcu_assign_pointer(tp->funcs, tp_funcs); 233 rcu_assign_pointer(tp->funcs, tp_funcs);
234 release_probes(old);
235 return 0; 235 return 0;
236} 236}
237 237
@@ -239,6 +239,7 @@ static int tracepoint_remove_func(struct tracepoint *tp,
239 * tracepoint_probe_register - Connect a probe to a tracepoint 239 * tracepoint_probe_register - Connect a probe to a tracepoint
240 * @tp: tracepoint 240 * @tp: tracepoint
241 * @probe: probe handler 241 * @probe: probe handler
242 * @data: tracepoint data
242 * 243 *
243 * Returns 0 if ok, error value on error. 244 * Returns 0 if ok, error value on error.
244 * Note: if @tp is within a module, the caller is responsible for 245 * Note: if @tp is within a module, the caller is responsible for
@@ -264,6 +265,7 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_register);
264 * tracepoint_probe_unregister - Disconnect a probe from a tracepoint 265 * tracepoint_probe_unregister - Disconnect a probe from a tracepoint
265 * @tp: tracepoint 266 * @tp: tracepoint
266 * @probe: probe function pointer 267 * @probe: probe function pointer
268 * @data: tracepoint data
267 * 269 *
268 * Returns 0 if ok, error value on error. 270 * Returns 0 if ok, error value on error.
269 */ 271 */
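
Both tracepoint hunks move release_probes(old) to after rcu_assign_pointer(), so the old callback array is released only once the new one is published (release_probes() itself still defers the actual free past an RCU grace period). A simplified, non-RCU sketch of that publish-then-release ordering using C11 atomics; the grace-period wait is only a placeholder comment here and every name is invented:

    #include <stdatomic.h>
    #include <stdlib.h>

    struct funcs {
            int nr;
            void (*probe[8])(void);
    };

    static _Atomic(struct funcs *) active_funcs;

    /* Replace the published array with a copy that has one more probe.  The
     * old array is reclaimed only after the new one is visible, which is the
     * ordering the tracepoint fix restores. */
    static int add_probe(void (*probe)(void))
    {
            struct funcs *old = atomic_load(&active_funcs);
            struct funcs *new = calloc(1, sizeof(*new));

            if (!new)
                    return -1;
            if (old)
                    *new = *old;
            new->probe[new->nr++] = probe;

            /* Publish first (release ordering plays the rcu_assign_pointer role). */
            atomic_store_explicit(&active_funcs, new, memory_order_release);

            /* ... a real implementation waits here until no reader can still
             * hold 'old' (the RCU grace period) before freeing it ... */
            free(old);
            return 0;
    }

    static void dummy_probe(void) { }

    int main(void)
    {
            add_probe(dummy_probe);
            add_probe(dummy_probe);
            free(atomic_load(&active_funcs));
            return 0;
    }
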
diff --git a/kernel/user.c b/kernel/user.c
index 294fc6a94168..4efa39350e44 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -87,7 +87,6 @@ static DEFINE_SPINLOCK(uidhash_lock);
87struct user_struct root_user = { 87struct user_struct root_user = {
88 .__count = ATOMIC_INIT(1), 88 .__count = ATOMIC_INIT(1),
89 .processes = ATOMIC_INIT(1), 89 .processes = ATOMIC_INIT(1),
90 .files = ATOMIC_INIT(0),
91 .sigpending = ATOMIC_INIT(0), 90 .sigpending = ATOMIC_INIT(0),
92 .locked_shm = 0, 91 .locked_shm = 0,
93 .uid = GLOBAL_ROOT_UID, 92 .uid = GLOBAL_ROOT_UID,
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index bf71b4b2d632..fcc02560fd6b 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -286,7 +286,7 @@ EXPORT_SYMBOL(from_kuid_munged);
286/** 286/**
287 * make_kgid - Map a user-namespace gid pair into a kgid. 287 * make_kgid - Map a user-namespace gid pair into a kgid.
288 * @ns: User namespace that the gid is in 288 * @ns: User namespace that the gid is in
289 * @uid: group identifier 289 * @gid: group identifier
290 * 290 *
291 * Maps a user-namespace gid pair into a kernel internal kgid, 291 * Maps a user-namespace gid pair into a kernel internal kgid,
292 * and returns that kgid. 292 * and returns that kgid.
@@ -482,7 +482,8 @@ static int projid_m_show(struct seq_file *seq, void *v)
482 return 0; 482 return 0;
483} 483}
484 484
485static void *m_start(struct seq_file *seq, loff_t *ppos, struct uid_gid_map *map) 485static void *m_start(struct seq_file *seq, loff_t *ppos,
486 struct uid_gid_map *map)
486{ 487{
487 struct uid_gid_extent *extent = NULL; 488 struct uid_gid_extent *extent = NULL;
488 loff_t pos = *ppos; 489 loff_t pos = *ppos;
@@ -546,7 +547,8 @@ struct seq_operations proc_projid_seq_operations = {
546 .show = projid_m_show, 547 .show = projid_m_show,
547}; 548};
548 549
549static bool mappings_overlap(struct uid_gid_map *new_map, struct uid_gid_extent *extent) 550static bool mappings_overlap(struct uid_gid_map *new_map,
551 struct uid_gid_extent *extent)
550{ 552{
551 u32 upper_first, lower_first, upper_last, lower_last; 553 u32 upper_first, lower_first, upper_last, lower_last;
552 unsigned idx; 554 unsigned idx;
@@ -653,7 +655,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
653 ret = -EINVAL; 655 ret = -EINVAL;
654 pos = kbuf; 656 pos = kbuf;
655 new_map.nr_extents = 0; 657 new_map.nr_extents = 0;
656 for (;pos; pos = next_line) { 658 for (; pos; pos = next_line) {
657 extent = &new_map.extent[new_map.nr_extents]; 659 extent = &new_map.extent[new_map.nr_extents];
658 660
659 /* Find the end of line and ensure I don't look past it */ 661 /* Find the end of line and ensure I don't look past it */
@@ -687,13 +689,16 @@ static ssize_t map_write(struct file *file, const char __user *buf,
687 689
688 /* Verify we have been given valid starting values */ 690 /* Verify we have been given valid starting values */
689 if ((extent->first == (u32) -1) || 691 if ((extent->first == (u32) -1) ||
690 (extent->lower_first == (u32) -1 )) 692 (extent->lower_first == (u32) -1))
691 goto out; 693 goto out;
692 694
693 /* Verify count is not zero and does not cause the extent to wrap */ 695 /* Verify count is not zero and does not cause the
696 * extent to wrap
697 */
694 if ((extent->first + extent->count) <= extent->first) 698 if ((extent->first + extent->count) <= extent->first)
695 goto out; 699 goto out;
696 if ((extent->lower_first + extent->count) <= extent->lower_first) 700 if ((extent->lower_first + extent->count) <=
701 extent->lower_first)
697 goto out; 702 goto out;
698 703
699 /* Do the ranges in extent overlap any previous extents? */ 704 /* Do the ranges in extent overlap any previous extents? */
@@ -751,7 +756,8 @@ out:
751 return ret; 756 return ret;
752} 757}
753 758
754ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) 759ssize_t proc_uid_map_write(struct file *file, const char __user *buf,
760 size_t size, loff_t *ppos)
755{ 761{
756 struct seq_file *seq = file->private_data; 762 struct seq_file *seq = file->private_data;
757 struct user_namespace *ns = seq->private; 763 struct user_namespace *ns = seq->private;
@@ -767,7 +773,8 @@ ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t siz
767 &ns->uid_map, &ns->parent->uid_map); 773 &ns->uid_map, &ns->parent->uid_map);
768} 774}
769 775
770ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) 776ssize_t proc_gid_map_write(struct file *file, const char __user *buf,
777 size_t size, loff_t *ppos)
771{ 778{
772 struct seq_file *seq = file->private_data; 779 struct seq_file *seq = file->private_data;
773 struct user_namespace *ns = seq->private; 780 struct user_namespace *ns = seq->private;
@@ -783,7 +790,8 @@ ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t siz
783 &ns->gid_map, &ns->parent->gid_map); 790 &ns->gid_map, &ns->parent->gid_map);
784} 791}
785 792
786ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) 793ssize_t proc_projid_map_write(struct file *file, const char __user *buf,
794 size_t size, loff_t *ppos)
787{ 795{
788 struct seq_file *seq = file->private_data; 796 struct seq_file *seq = file->private_data;
789 struct user_namespace *ns = seq->private; 797 struct user_namespace *ns = seq->private;
@@ -800,7 +808,7 @@ ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t
800 &ns->projid_map, &ns->parent->projid_map); 808 &ns->projid_map, &ns->parent->projid_map);
801} 809}
802 810
803static bool new_idmap_permitted(const struct file *file, 811static bool new_idmap_permitted(const struct file *file,
804 struct user_namespace *ns, int cap_setid, 812 struct user_namespace *ns, int cap_setid,
805 struct uid_gid_map *new_map) 813 struct uid_gid_map *new_map)
806{ 814{
@@ -811,8 +819,7 @@ static bool new_idmap_permitted(const struct file *file,
811 kuid_t uid = make_kuid(ns->parent, id); 819 kuid_t uid = make_kuid(ns->parent, id);
812 if (uid_eq(uid, file->f_cred->fsuid)) 820 if (uid_eq(uid, file->f_cred->fsuid))
813 return true; 821 return true;
814 } 822 } else if (cap_setid == CAP_SETGID) {
815 else if (cap_setid == CAP_SETGID) {
816 kgid_t gid = make_kgid(ns->parent, id); 823 kgid_t gid = make_kgid(ns->parent, id);
817 if (gid_eq(gid, file->f_cred->fsgid)) 824 if (gid_eq(gid, file->f_cred->fsgid))
818 return true; 825 return true;
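
The extent validation above relies on a single unsigned comparison, (first + count) <= first, to reject both a zero count and a count that wraps the 32-bit range. A small standalone demonstration with made-up extent values:

    #include <stdint.h>
    #include <stdio.h>

    struct extent {
            uint32_t first;
            uint32_t lower_first;
            uint32_t count;
    };

    /* Returns 1 when the extent's count is acceptable, 0 otherwise. */
    static int extent_count_ok(const struct extent *e)
    {
            /* A zero count makes first + count == first; a count large enough
             * to wrap the 32-bit range makes it smaller.  Both are rejected. */
            if ((uint32_t)(e->first + e->count) <= e->first)
                    return 0;
            if ((uint32_t)(e->lower_first + e->count) <= e->lower_first)
                    return 0;
            return 1;
    }

    int main(void)
    {
            struct extent ok    = { .first = 1000, .lower_first = 0, .count = 65536 };
            struct extent zero  = { .first = 1000, .lower_first = 0, .count = 0 };
            struct extent wraps = { .first = 0xfffffff0u, .lower_first = 0, .count = 0x20 };

            printf("ok=%d zero=%d wraps=%d\n",
                   extent_count_ok(&ok), extent_count_ok(&zero), extent_count_ok(&wraps));
            return 0;
    }
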
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index 4f69f9a5e221..c8eac43267e9 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -17,7 +17,7 @@
17 17
18#ifdef CONFIG_PROC_SYSCTL 18#ifdef CONFIG_PROC_SYSCTL
19 19
20static void *get_uts(ctl_table *table, int write) 20static void *get_uts(struct ctl_table *table, int write)
21{ 21{
22 char *which = table->data; 22 char *which = table->data;
23 struct uts_namespace *uts_ns; 23 struct uts_namespace *uts_ns;
@@ -32,7 +32,7 @@ static void *get_uts(ctl_table *table, int write)
32 return which; 32 return which;
33} 33}
34 34
35static void put_uts(ctl_table *table, int write, void *which) 35static void put_uts(struct ctl_table *table, int write, void *which)
36{ 36{
37 if (!write) 37 if (!write)
38 up_read(&uts_sem); 38 up_read(&uts_sem);
@@ -44,14 +44,14 @@ static void put_uts(ctl_table *table, int write, void *which)
44 * Special case of dostring for the UTS structure. This has locks 44 * Special case of dostring for the UTS structure. This has locks
45 * to observe. Should this be in kernel/sys.c ???? 45 * to observe. Should this be in kernel/sys.c ????
46 */ 46 */
47static int proc_do_uts_string(ctl_table *table, int write, 47static int proc_do_uts_string(struct ctl_table *table, int write,
48 void __user *buffer, size_t *lenp, loff_t *ppos) 48 void __user *buffer, size_t *lenp, loff_t *ppos)
49{ 49{
50 struct ctl_table uts_table; 50 struct ctl_table uts_table;
51 int r; 51 int r;
52 memcpy(&uts_table, table, sizeof(uts_table)); 52 memcpy(&uts_table, table, sizeof(uts_table));
53 uts_table.data = get_uts(table, write); 53 uts_table.data = get_uts(table, write);
54 r = proc_dostring(&uts_table,write,buffer,lenp, ppos); 54 r = proc_dostring(&uts_table, write, buffer, lenp, ppos);
55 put_uts(table, write, uts_table.data); 55 put_uts(table, write, uts_table.data);
56 56
57 if (write) 57 if (write)
@@ -135,4 +135,4 @@ static int __init utsname_sysctl_init(void)
135 return 0; 135 return 0;
136} 136}
137 137
138__initcall(utsname_sysctl_init); 138device_initcall(utsname_sysctl_init);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index e90089fd78e0..516203e665fc 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -138,7 +138,11 @@ static void __touch_watchdog(void)
138 138
139void touch_softlockup_watchdog(void) 139void touch_softlockup_watchdog(void)
140{ 140{
141 __this_cpu_write(watchdog_touch_ts, 0); 141 /*
142 * Preemption can be enabled. It doesn't matter which CPU's timestamp
143 * gets zeroed here, so use the raw_ operation.
144 */
145 raw_cpu_write(watchdog_touch_ts, 0);
142} 146}
143EXPORT_SYMBOL(touch_softlockup_watchdog); 147EXPORT_SYMBOL(touch_softlockup_watchdog);
144 148
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 0ee63af30bd1..6203d2900877 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -65,15 +65,12 @@ enum {
65 * be executing on any CPU. The pool behaves as an unbound one. 65 * be executing on any CPU. The pool behaves as an unbound one.
66 * 66 *
67 * Note that DISASSOCIATED should be flipped only while holding 67 * Note that DISASSOCIATED should be flipped only while holding
68 * manager_mutex to avoid changing binding state while 68 * attach_mutex to avoid changing binding state while
69 * create_worker() is in progress. 69 * worker_attach_to_pool() is in progress.
70 */ 70 */
71 POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */
72 POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ 71 POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */
73 POOL_FREEZING = 1 << 3, /* freeze in progress */
74 72
75 /* worker flags */ 73 /* worker flags */
76 WORKER_STARTED = 1 << 0, /* started */
77 WORKER_DIE = 1 << 1, /* die die die */ 74 WORKER_DIE = 1 << 1, /* die die die */
78 WORKER_IDLE = 1 << 2, /* is idle */ 75 WORKER_IDLE = 1 << 2, /* is idle */
79 WORKER_PREP = 1 << 3, /* preparing to run works */ 76 WORKER_PREP = 1 << 3, /* preparing to run works */
@@ -100,10 +97,10 @@ enum {
100 97
101 /* 98 /*
102 * Rescue workers are used only on emergencies and shared by 99 * Rescue workers are used only on emergencies and shared by
103 * all cpus. Give -20. 100 * all cpus. Give MIN_NICE.
104 */ 101 */
105 RESCUER_NICE_LEVEL = -20, 102 RESCUER_NICE_LEVEL = MIN_NICE,
106 HIGHPRI_NICE_LEVEL = -20, 103 HIGHPRI_NICE_LEVEL = MIN_NICE,
107 104
108 WQ_NAME_LEN = 24, 105 WQ_NAME_LEN = 24,
109}; 106};
@@ -124,8 +121,7 @@ enum {
124 * cpu or grabbing pool->lock is enough for read access. If 121 * cpu or grabbing pool->lock is enough for read access. If
125 * POOL_DISASSOCIATED is set, it's identical to L. 122 * POOL_DISASSOCIATED is set, it's identical to L.
126 * 123 *
127 * MG: pool->manager_mutex and pool->lock protected. Writes require both 124 * A: pool->attach_mutex protected.
128 * locks. Reads can happen under either lock.
129 * 125 *
130 * PL: wq_pool_mutex protected. 126 * PL: wq_pool_mutex protected.
131 * 127 *
@@ -163,8 +159,11 @@ struct worker_pool {
163 159
164 /* see manage_workers() for details on the two manager mutexes */ 160 /* see manage_workers() for details on the two manager mutexes */
165 struct mutex manager_arb; /* manager arbitration */ 161 struct mutex manager_arb; /* manager arbitration */
166 struct mutex manager_mutex; /* manager exclusion */ 162 struct mutex attach_mutex; /* attach/detach exclusion */
167 struct idr worker_idr; /* MG: worker IDs and iteration */ 163 struct list_head workers; /* A: attached workers */
164 struct completion *detach_completion; /* all workers detached */
165
166 struct ida worker_ida; /* worker IDs for task name */
168 167
169 struct workqueue_attrs *attrs; /* I: worker attributes */ 168 struct workqueue_attrs *attrs; /* I: worker attributes */
170 struct hlist_node hash_node; /* PL: unbound_pool_hash node */ 169 struct hlist_node hash_node; /* PL: unbound_pool_hash node */
@@ -340,16 +339,6 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to,
340 lockdep_is_held(&wq->mutex), \ 339 lockdep_is_held(&wq->mutex), \
341 "sched RCU or wq->mutex should be held") 340 "sched RCU or wq->mutex should be held")
342 341
343#ifdef CONFIG_LOCKDEP
344#define assert_manager_or_pool_lock(pool) \
345 WARN_ONCE(debug_locks && \
346 !lockdep_is_held(&(pool)->manager_mutex) && \
347 !lockdep_is_held(&(pool)->lock), \
348 "pool->manager_mutex or ->lock should be held")
349#else
350#define assert_manager_or_pool_lock(pool) do { } while (0)
351#endif
352
353#define for_each_cpu_worker_pool(pool, cpu) \ 342#define for_each_cpu_worker_pool(pool, cpu) \
354 for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \ 343 for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \
355 (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \ 344 (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \
@@ -375,17 +364,16 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to,
375/** 364/**
376 * for_each_pool_worker - iterate through all workers of a worker_pool 365 * for_each_pool_worker - iterate through all workers of a worker_pool
377 * @worker: iteration cursor 366 * @worker: iteration cursor
378 * @wi: integer used for iteration
379 * @pool: worker_pool to iterate workers of 367 * @pool: worker_pool to iterate workers of
380 * 368 *
381 * This must be called with either @pool->manager_mutex or ->lock held. 369 * This must be called with @pool->attach_mutex.
382 * 370 *
383 * The if/else clause exists only for the lockdep assertion and can be 371 * The if/else clause exists only for the lockdep assertion and can be
384 * ignored. 372 * ignored.
385 */ 373 */
386#define for_each_pool_worker(worker, wi, pool) \ 374#define for_each_pool_worker(worker, pool) \
387 idr_for_each_entry(&(pool)->worker_idr, (worker), (wi)) \ 375 list_for_each_entry((worker), &(pool)->workers, node) \
388 if (({ assert_manager_or_pool_lock((pool)); false; })) { } \ 376 if (({ lockdep_assert_held(&pool->attach_mutex); false; })) { } \
389 else 377 else
390 378
391/** 379/**
@@ -763,13 +751,6 @@ static bool need_to_create_worker(struct worker_pool *pool)
763 return need_more_worker(pool) && !may_start_working(pool); 751 return need_more_worker(pool) && !may_start_working(pool);
764} 752}
765 753
766/* Do I need to be the manager? */
767static bool need_to_manage_workers(struct worker_pool *pool)
768{
769 return need_to_create_worker(pool) ||
770 (pool->flags & POOL_MANAGE_WORKERS);
771}
772
773/* Do we have too many workers and should some go away? */ 754/* Do we have too many workers and should some go away? */
774static bool too_many_workers(struct worker_pool *pool) 755static bool too_many_workers(struct worker_pool *pool)
775{ 756{
@@ -791,8 +772,8 @@ static bool too_many_workers(struct worker_pool *pool)
791 * Wake up functions. 772 * Wake up functions.
792 */ 773 */
793 774
794/* Return the first worker. Safe with preemption disabled */ 775/* Return the first idle worker. Safe with preemption disabled */
795static struct worker *first_worker(struct worker_pool *pool) 776static struct worker *first_idle_worker(struct worker_pool *pool)
796{ 777{
797 if (unlikely(list_empty(&pool->idle_list))) 778 if (unlikely(list_empty(&pool->idle_list)))
798 return NULL; 779 return NULL;
@@ -811,7 +792,7 @@ static struct worker *first_worker(struct worker_pool *pool)
811 */ 792 */
812static void wake_up_worker(struct worker_pool *pool) 793static void wake_up_worker(struct worker_pool *pool)
813{ 794{
814 struct worker *worker = first_worker(pool); 795 struct worker *worker = first_idle_worker(pool);
815 796
816 if (likely(worker)) 797 if (likely(worker))
817 wake_up_process(worker->task); 798 wake_up_process(worker->task);
@@ -885,7 +866,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu)
885 */ 866 */
886 if (atomic_dec_and_test(&pool->nr_running) && 867 if (atomic_dec_and_test(&pool->nr_running) &&
887 !list_empty(&pool->worklist)) 868 !list_empty(&pool->worklist))
888 to_wakeup = first_worker(pool); 869 to_wakeup = first_idle_worker(pool);
889 return to_wakeup ? to_wakeup->task : NULL; 870 return to_wakeup ? to_wakeup->task : NULL;
890} 871}
891 872
@@ -1621,70 +1602,6 @@ static void worker_leave_idle(struct worker *worker)
1621 list_del_init(&worker->entry); 1602 list_del_init(&worker->entry);
1622} 1603}
1623 1604
1624/**
1625 * worker_maybe_bind_and_lock - try to bind %current to worker_pool and lock it
1626 * @pool: target worker_pool
1627 *
1628 * Bind %current to the cpu of @pool if it is associated and lock @pool.
1629 *
1630 * Works which are scheduled while the cpu is online must at least be
1631 * scheduled to a worker which is bound to the cpu so that if they are
1632 * flushed from cpu callbacks while cpu is going down, they are
1633 * guaranteed to execute on the cpu.
1634 *
1635 * This function is to be used by unbound workers and rescuers to bind
1636 * themselves to the target cpu and may race with cpu going down or
1637 * coming online. kthread_bind() can't be used because it may put the
1638 * worker to already dead cpu and set_cpus_allowed_ptr() can't be used
1639 * verbatim as it's best effort and blocking and pool may be
1640 * [dis]associated in the meantime.
1641 *
1642 * This function tries set_cpus_allowed() and locks pool and verifies the
1643 * binding against %POOL_DISASSOCIATED which is set during
1644 * %CPU_DOWN_PREPARE and cleared during %CPU_ONLINE, so if the worker
1645 * enters idle state or fetches works without dropping lock, it can
1646 * guarantee the scheduling requirement described in the first paragraph.
1647 *
1648 * CONTEXT:
1649 * Might sleep. Called without any lock but returns with pool->lock
1650 * held.
1651 *
1652 * Return:
1653 * %true if the associated pool is online (@worker is successfully
1654 * bound), %false if offline.
1655 */
1656static bool worker_maybe_bind_and_lock(struct worker_pool *pool)
1657__acquires(&pool->lock)
1658{
1659 while (true) {
1660 /*
1661 * The following call may fail, succeed or succeed
1662 * without actually migrating the task to the cpu if
1663 * it races with cpu hotunplug operation. Verify
1664 * against POOL_DISASSOCIATED.
1665 */
1666 if (!(pool->flags & POOL_DISASSOCIATED))
1667 set_cpus_allowed_ptr(current, pool->attrs->cpumask);
1668
1669 spin_lock_irq(&pool->lock);
1670 if (pool->flags & POOL_DISASSOCIATED)
1671 return false;
1672 if (task_cpu(current) == pool->cpu &&
1673 cpumask_equal(&current->cpus_allowed, pool->attrs->cpumask))
1674 return true;
1675 spin_unlock_irq(&pool->lock);
1676
1677 /*
1678 * We've raced with CPU hot[un]plug. Give it a breather
1679 * and retry migration. cond_resched() is required here;
1680 * otherwise, we might deadlock against cpu_stop trying to
1681 * bring down the CPU on non-preemptive kernel.
1682 */
1683 cpu_relax();
1684 cond_resched();
1685 }
1686}
1687
1688static struct worker *alloc_worker(void) 1605static struct worker *alloc_worker(void)
1689{ 1606{
1690 struct worker *worker; 1607 struct worker *worker;
@@ -1693,6 +1610,7 @@ static struct worker *alloc_worker(void)
1693 if (worker) { 1610 if (worker) {
1694 INIT_LIST_HEAD(&worker->entry); 1611 INIT_LIST_HEAD(&worker->entry);
1695 INIT_LIST_HEAD(&worker->scheduled); 1612 INIT_LIST_HEAD(&worker->scheduled);
1613 INIT_LIST_HEAD(&worker->node);
1696 /* on creation a worker is in !idle && prep state */ 1614 /* on creation a worker is in !idle && prep state */
1697 worker->flags = WORKER_PREP; 1615 worker->flags = WORKER_PREP;
1698 } 1616 }
@@ -1700,12 +1618,68 @@ static struct worker *alloc_worker(void)
1700} 1618}
1701 1619
1702/** 1620/**
1621 * worker_attach_to_pool() - attach a worker to a pool
1622 * @worker: worker to be attached
1623 * @pool: the target pool
1624 *
1625 * Attach @worker to @pool. Once attached, the %WORKER_UNBOUND flag and
1626 * cpu-binding of @worker are kept coordinated with the pool across
1627 * cpu-[un]hotplugs.
1628 */
1629static void worker_attach_to_pool(struct worker *worker,
1630 struct worker_pool *pool)
1631{
1632 mutex_lock(&pool->attach_mutex);
1633
1634 /*
1635 * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any
1636 * online CPUs. It'll be re-applied when any of the CPUs come up.
1637 */
1638 set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
1639
1640 /*
1641 * The pool->attach_mutex ensures %POOL_DISASSOCIATED remains
1642 * stable across this function. See the comments above the
1643 * flag definition for details.
1644 */
1645 if (pool->flags & POOL_DISASSOCIATED)
1646 worker->flags |= WORKER_UNBOUND;
1647
1648 list_add_tail(&worker->node, &pool->workers);
1649
1650 mutex_unlock(&pool->attach_mutex);
1651}
1652
1653/**
1654 * worker_detach_from_pool() - detach a worker from its pool
1655 * @worker: worker which is attached to its pool
1656 * @pool: the pool @worker is attached to
1657 *
1658 * Undo the attaching which had been done in worker_attach_to_pool(). The
 1659 * caller worker shouldn't access the pool after being detached unless it
 1660 * holds another reference to the pool.
1661 */
1662static void worker_detach_from_pool(struct worker *worker,
1663 struct worker_pool *pool)
1664{
1665 struct completion *detach_completion = NULL;
1666
1667 mutex_lock(&pool->attach_mutex);
1668 list_del(&worker->node);
1669 if (list_empty(&pool->workers))
1670 detach_completion = pool->detach_completion;
1671 mutex_unlock(&pool->attach_mutex);
1672
1673 if (detach_completion)
1674 complete(detach_completion);
1675}
1676
1677/**
1703 * create_worker - create a new workqueue worker 1678 * create_worker - create a new workqueue worker
1704 * @pool: pool the new worker will belong to 1679 * @pool: pool the new worker will belong to
1705 * 1680 *
1706 * Create a new worker which is bound to @pool. The returned worker 1681 * Create a new worker which is attached to @pool. The new worker must be
1707 * can be started by calling start_worker() or destroyed using 1682 * started by start_worker().
1708 * destroy_worker().
1709 * 1683 *
1710 * CONTEXT: 1684 * CONTEXT:
1711 * Might sleep. Does GFP_KERNEL allocations. 1685 * Might sleep. Does GFP_KERNEL allocations.
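
worker_detach_from_pool() completes pool->detach_completion when the last attached worker leaves, which the pool-teardown path later waits on. A rough pthread analogue of that last-one-out-signals pattern, with a draining flag standing in for the detach_completion pointer and all names invented:

    #include <pthread.h>
    #include <stdbool.h>

    struct pool {
            pthread_mutex_t attach_mutex;
            int nr_attached;
            bool draining;                  /* set by the teardown path */
            pthread_cond_t all_detached;    /* stands in for detach_completion */
    };

    static void worker_attach(struct pool *p)
    {
            pthread_mutex_lock(&p->attach_mutex);
            p->nr_attached++;
            pthread_mutex_unlock(&p->attach_mutex);
    }

    static void worker_detach(struct pool *p)
    {
            pthread_mutex_lock(&p->attach_mutex);
            if (--p->nr_attached == 0 && p->draining)
                    pthread_cond_signal(&p->all_detached);  /* last one out */
            pthread_mutex_unlock(&p->attach_mutex);
    }

    /* Teardown: mark the pool draining, then wait for the final detach. */
    static void pool_wait_for_detach(struct pool *p)
    {
            pthread_mutex_lock(&p->attach_mutex);
            p->draining = true;
            while (p->nr_attached)
                    pthread_cond_wait(&p->all_detached, &p->attach_mutex);
            pthread_mutex_unlock(&p->attach_mutex);
    }

    int main(void)
    {
            struct pool p = {
                    .attach_mutex = PTHREAD_MUTEX_INITIALIZER,
                    .all_detached = PTHREAD_COND_INITIALIZER,
            };

            worker_attach(&p);
            worker_detach(&p);              /* not draining yet: no signal needed */
            pool_wait_for_detach(&p);       /* returns at once, nothing attached */
            return 0;
    }
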
@@ -1719,19 +1693,8 @@ static struct worker *create_worker(struct worker_pool *pool)
1719 int id = -1; 1693 int id = -1;
1720 char id_buf[16]; 1694 char id_buf[16];
1721 1695
1722 lockdep_assert_held(&pool->manager_mutex); 1696 /* ID is needed to determine kthread name */
1723 1697 id = ida_simple_get(&pool->worker_ida, 0, 0, GFP_KERNEL);
1724 /*
1725 * ID is needed to determine kthread name. Allocate ID first
1726 * without installing the pointer.
1727 */
1728 idr_preload(GFP_KERNEL);
1729 spin_lock_irq(&pool->lock);
1730
1731 id = idr_alloc(&pool->worker_idr, NULL, 0, 0, GFP_NOWAIT);
1732
1733 spin_unlock_irq(&pool->lock);
1734 idr_preload_end();
1735 if (id < 0) 1698 if (id < 0)
1736 goto fail; 1699 goto fail;
1737 1700
@@ -1758,33 +1721,14 @@ static struct worker *create_worker(struct worker_pool *pool)
1758 /* prevent userland from meddling with cpumask of workqueue workers */ 1721 /* prevent userland from meddling with cpumask of workqueue workers */
1759 worker->task->flags |= PF_NO_SETAFFINITY; 1722 worker->task->flags |= PF_NO_SETAFFINITY;
1760 1723
1761 /* 1724 /* successful, attach the worker to the pool */
1762 * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any 1725 worker_attach_to_pool(worker, pool);
1763 * online CPUs. It'll be re-applied when any of the CPUs come up.
1764 */
1765 set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
1766
1767 /*
1768 * The caller is responsible for ensuring %POOL_DISASSOCIATED
1769 * remains stable across this function. See the comments above the
1770 * flag definition for details.
1771 */
1772 if (pool->flags & POOL_DISASSOCIATED)
1773 worker->flags |= WORKER_UNBOUND;
1774
1775 /* successful, commit the pointer to idr */
1776 spin_lock_irq(&pool->lock);
1777 idr_replace(&pool->worker_idr, worker, worker->id);
1778 spin_unlock_irq(&pool->lock);
1779 1726
1780 return worker; 1727 return worker;
1781 1728
1782fail: 1729fail:
1783 if (id >= 0) { 1730 if (id >= 0)
1784 spin_lock_irq(&pool->lock); 1731 ida_simple_remove(&pool->worker_ida, id);
1785 idr_remove(&pool->worker_idr, id);
1786 spin_unlock_irq(&pool->lock);
1787 }
1788 kfree(worker); 1732 kfree(worker);
1789 return NULL; 1733 return NULL;
1790} 1734}
@@ -1800,7 +1744,6 @@ fail:
1800 */ 1744 */
1801static void start_worker(struct worker *worker) 1745static void start_worker(struct worker *worker)
1802{ 1746{
1803 worker->flags |= WORKER_STARTED;
1804 worker->pool->nr_workers++; 1747 worker->pool->nr_workers++;
1805 worker_enter_idle(worker); 1748 worker_enter_idle(worker);
1806 wake_up_process(worker->task); 1749 wake_up_process(worker->task);
@@ -1818,8 +1761,6 @@ static int create_and_start_worker(struct worker_pool *pool)
1818{ 1761{
1819 struct worker *worker; 1762 struct worker *worker;
1820 1763
1821 mutex_lock(&pool->manager_mutex);
1822
1823 worker = create_worker(pool); 1764 worker = create_worker(pool);
1824 if (worker) { 1765 if (worker) {
1825 spin_lock_irq(&pool->lock); 1766 spin_lock_irq(&pool->lock);
@@ -1827,8 +1768,6 @@ static int create_and_start_worker(struct worker_pool *pool)
1827 spin_unlock_irq(&pool->lock); 1768 spin_unlock_irq(&pool->lock);
1828 } 1769 }
1829 1770
1830 mutex_unlock(&pool->manager_mutex);
1831
1832 return worker ? 0 : -ENOMEM; 1771 return worker ? 0 : -ENOMEM;
1833} 1772}
1834 1773
@@ -1836,46 +1775,30 @@ static int create_and_start_worker(struct worker_pool *pool)
1836 * destroy_worker - destroy a workqueue worker 1775 * destroy_worker - destroy a workqueue worker
1837 * @worker: worker to be destroyed 1776 * @worker: worker to be destroyed
1838 * 1777 *
1839 * Destroy @worker and adjust @pool stats accordingly. 1778 * Destroy @worker and adjust @pool stats accordingly. The worker should
1779 * be idle.
1840 * 1780 *
1841 * CONTEXT: 1781 * CONTEXT:
1842 * spin_lock_irq(pool->lock) which is released and regrabbed. 1782 * spin_lock_irq(pool->lock).
1843 */ 1783 */
1844static void destroy_worker(struct worker *worker) 1784static void destroy_worker(struct worker *worker)
1845{ 1785{
1846 struct worker_pool *pool = worker->pool; 1786 struct worker_pool *pool = worker->pool;
1847 1787
1848 lockdep_assert_held(&pool->manager_mutex);
1849 lockdep_assert_held(&pool->lock); 1788 lockdep_assert_held(&pool->lock);
1850 1789
1851 /* sanity check frenzy */ 1790 /* sanity check frenzy */
1852 if (WARN_ON(worker->current_work) || 1791 if (WARN_ON(worker->current_work) ||
1853 WARN_ON(!list_empty(&worker->scheduled))) 1792 WARN_ON(!list_empty(&worker->scheduled)) ||
1793 WARN_ON(!(worker->flags & WORKER_IDLE)))
1854 return; 1794 return;
1855 1795
1856 if (worker->flags & WORKER_STARTED) 1796 pool->nr_workers--;
1857 pool->nr_workers--; 1797 pool->nr_idle--;
1858 if (worker->flags & WORKER_IDLE)
1859 pool->nr_idle--;
1860
1861 /*
1862 * Once WORKER_DIE is set, the kworker may destroy itself at any
1863 * point. Pin to ensure the task stays until we're done with it.
1864 */
1865 get_task_struct(worker->task);
1866 1798
1867 list_del_init(&worker->entry); 1799 list_del_init(&worker->entry);
1868 worker->flags |= WORKER_DIE; 1800 worker->flags |= WORKER_DIE;
1869 1801 wake_up_process(worker->task);
1870 idr_remove(&pool->worker_idr, worker->id);
1871
1872 spin_unlock_irq(&pool->lock);
1873
1874 kthread_stop(worker->task);
1875 put_task_struct(worker->task);
1876 kfree(worker);
1877
1878 spin_lock_irq(&pool->lock);
1879} 1802}
1880 1803
1881static void idle_worker_timeout(unsigned long __pool) 1804static void idle_worker_timeout(unsigned long __pool)
@@ -1884,7 +1807,7 @@ static void idle_worker_timeout(unsigned long __pool)
1884 1807
1885 spin_lock_irq(&pool->lock); 1808 spin_lock_irq(&pool->lock);
1886 1809
1887 if (too_many_workers(pool)) { 1810 while (too_many_workers(pool)) {
1888 struct worker *worker; 1811 struct worker *worker;
1889 unsigned long expires; 1812 unsigned long expires;
1890 1813
@@ -1892,13 +1815,12 @@ static void idle_worker_timeout(unsigned long __pool)
1892 worker = list_entry(pool->idle_list.prev, struct worker, entry); 1815 worker = list_entry(pool->idle_list.prev, struct worker, entry);
1893 expires = worker->last_active + IDLE_WORKER_TIMEOUT; 1816 expires = worker->last_active + IDLE_WORKER_TIMEOUT;
1894 1817
1895 if (time_before(jiffies, expires)) 1818 if (time_before(jiffies, expires)) {
1896 mod_timer(&pool->idle_timer, expires); 1819 mod_timer(&pool->idle_timer, expires);
1897 else { 1820 break;
1898 /* it's been idle for too long, wake up manager */
1899 pool->flags |= POOL_MANAGE_WORKERS;
1900 wake_up_worker(pool);
1901 } 1821 }
1822
1823 destroy_worker(worker);
1902 } 1824 }
1903 1825
1904 spin_unlock_irq(&pool->lock); 1826 spin_unlock_irq(&pool->lock);
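
idle_worker_timeout() now reaps surplus idle workers itself: while there are too many, it destroys the oldest idle worker whose timeout has passed and re-arms the timer for the first one that has not. A simplified sketch of that loop over an array ordered like pool->idle_list; the limit, names, and timestamps are invented:

    #include <stdio.h>
    #include <time.h>

    #define IDLE_TIMEOUT   300              /* seconds, arbitrary for the sketch */
    #define TOO_MANY_LIMIT 2                /* keep at most this many idle workers */

    struct worker {
            const char *name;
            time_t last_active;
    };

    /* idle[] is ordered most-recently-idle first, like pool->idle_list;
     * the reap loop always looks at the oldest entry (the tail). */
    static void idle_timeout(struct worker *idle, int *nr_idle, time_t now,
                             time_t *rearm_at)
    {
            *rearm_at = 0;

            while (*nr_idle > TOO_MANY_LIMIT) {
                    struct worker *oldest = &idle[*nr_idle - 1];
                    time_t expires = oldest->last_active + IDLE_TIMEOUT;

                    if (now < expires) {
                            *rearm_at = expires;    /* mod_timer() equivalent */
                            break;
                    }
                    printf("destroying %s\n", oldest->name);
                    (*nr_idle)--;                   /* destroy_worker() equivalent */
            }
    }

    int main(void)
    {
            time_t now = time(NULL), rearm;
            struct worker idle[] = {
                    { "kworker/3", now -  10 },
                    { "kworker/2", now - 100 },
                    { "kworker/1", now - 400 },     /* past the timeout */
                    { "kworker/0", now - 900 },     /* past the timeout */
            };
            int nr_idle = 4;

            idle_timeout(idle, &nr_idle, now, &rearm);
            printf("%d idle workers left, rearm in %ld s\n",
                   nr_idle, rearm ? (long)(rearm - now) : 0L);
            return 0;
    }
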
@@ -1916,6 +1838,12 @@ static void send_mayday(struct work_struct *work)
1916 1838
1917 /* mayday mayday mayday */ 1839 /* mayday mayday mayday */
1918 if (list_empty(&pwq->mayday_node)) { 1840 if (list_empty(&pwq->mayday_node)) {
1841 /*
1842 * If @pwq is for an unbound wq, its base ref may be put at
1843 * any time due to an attribute change. Pin @pwq until the
1844 * rescuer is done with it.
1845 */
1846 get_pwq(pwq);
1919 list_add_tail(&pwq->mayday_node, &wq->maydays); 1847 list_add_tail(&pwq->mayday_node, &wq->maydays);
1920 wake_up_process(wq->rescuer->task); 1848 wake_up_process(wq->rescuer->task);
1921 } 1849 }
@@ -2011,44 +1939,6 @@ restart:
2011} 1939}
2012 1940
2013/** 1941/**
2014 * maybe_destroy_worker - destroy workers which have been idle for a while
2015 * @pool: pool to destroy workers for
2016 *
2017 * Destroy @pool workers which have been idle for longer than
2018 * IDLE_WORKER_TIMEOUT.
2019 *
2020 * LOCKING:
2021 * spin_lock_irq(pool->lock) which may be released and regrabbed
2022 * multiple times. Called only from manager.
2023 *
2024 * Return:
2025 * %false if no action was taken and pool->lock stayed locked, %true
2026 * otherwise.
2027 */
2028static bool maybe_destroy_workers(struct worker_pool *pool)
2029{
2030 bool ret = false;
2031
2032 while (too_many_workers(pool)) {
2033 struct worker *worker;
2034 unsigned long expires;
2035
2036 worker = list_entry(pool->idle_list.prev, struct worker, entry);
2037 expires = worker->last_active + IDLE_WORKER_TIMEOUT;
2038
2039 if (time_before(jiffies, expires)) {
2040 mod_timer(&pool->idle_timer, expires);
2041 break;
2042 }
2043
2044 destroy_worker(worker);
2045 ret = true;
2046 }
2047
2048 return ret;
2049}
2050
2051/**
2052 * manage_workers - manage worker pool 1942 * manage_workers - manage worker pool
2053 * @worker: self 1943 * @worker: self
2054 * 1944 *
@@ -2077,8 +1967,6 @@ static bool manage_workers(struct worker *worker)
2077 bool ret = false; 1967 bool ret = false;
2078 1968
2079 /* 1969 /*
2080 * Managership is governed by two mutexes - manager_arb and
2081 * manager_mutex. manager_arb handles arbitration of manager role.
2082 * Anyone who successfully grabs manager_arb wins the arbitration 1970 * Anyone who successfully grabs manager_arb wins the arbitration
2083 * and becomes the manager. mutex_trylock() on pool->manager_arb 1971 * and becomes the manager. mutex_trylock() on pool->manager_arb
2084 * failure while holding pool->lock reliably indicates that someone 1972 * failure while holding pool->lock reliably indicates that someone
@@ -2087,40 +1975,12 @@ static bool manage_workers(struct worker *worker)
2087 * grabbing manager_arb is responsible for actually performing 1975 * grabbing manager_arb is responsible for actually performing
2088 * manager duties. If manager_arb is grabbed and released without 1976 * manager duties. If manager_arb is grabbed and released without
2089 * actual management, the pool may stall indefinitely. 1977 * actual management, the pool may stall indefinitely.
2090 *
2091 * manager_mutex is used for exclusion of actual management
2092 * operations. The holder of manager_mutex can be sure that none
2093 * of management operations, including creation and destruction of
2094 * workers, won't take place until the mutex is released. Because
2095 * manager_mutex doesn't interfere with manager role arbitration,
2096 * it is guaranteed that the pool's management, while may be
2097 * delayed, won't be disturbed by someone else grabbing
2098 * manager_mutex.
2099 */ 1978 */
2100 if (!mutex_trylock(&pool->manager_arb)) 1979 if (!mutex_trylock(&pool->manager_arb))
2101 return ret; 1980 return ret;
2102 1981
2103 /*
2104 * With manager arbitration won, manager_mutex would be free in
2105 * most cases. trylock first without dropping @pool->lock.
2106 */
2107 if (unlikely(!mutex_trylock(&pool->manager_mutex))) {
2108 spin_unlock_irq(&pool->lock);
2109 mutex_lock(&pool->manager_mutex);
2110 spin_lock_irq(&pool->lock);
2111 ret = true;
2112 }
2113
2114 pool->flags &= ~POOL_MANAGE_WORKERS;
2115
2116 /*
2117 * Destroy and then create so that may_start_working() is true
2118 * on return.
2119 */
2120 ret |= maybe_destroy_workers(pool);
2121 ret |= maybe_create_worker(pool); 1982 ret |= maybe_create_worker(pool);
2122 1983
2123 mutex_unlock(&pool->manager_mutex);
2124 mutex_unlock(&pool->manager_arb); 1984 mutex_unlock(&pool->manager_arb);
2125 return ret; 1985 return ret;
2126} 1986}
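
With manager_mutex removed, becoming the manager reduces to winning a trylock on manager_arb and creating workers while holding it; losing the trylock means another worker is already managing. A pthread trylock sketch of that arbitration, with invented names:

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    static pthread_mutex_t manager_arb = PTHREAD_MUTEX_INITIALIZER;

    /* Returns true if this caller became the manager and did the work. */
    static bool manage_workers(const char *who)
    {
            /* Losing the trylock means someone else is already managing and
             * will create the needed workers; go back to processing work. */
            if (pthread_mutex_trylock(&manager_arb) != 0)
                    return false;

            printf("%s: creating workers as the manager\n", who);
            /* maybe_create_worker() would run here */

            pthread_mutex_unlock(&manager_arb);
            return true;
    }

    int main(void)
    {
            manage_workers("worker-0");
            manage_workers("worker-1");
            return 0;
    }
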
@@ -2308,6 +2168,11 @@ woke_up:
2308 spin_unlock_irq(&pool->lock); 2168 spin_unlock_irq(&pool->lock);
2309 WARN_ON_ONCE(!list_empty(&worker->entry)); 2169 WARN_ON_ONCE(!list_empty(&worker->entry));
2310 worker->task->flags &= ~PF_WQ_WORKER; 2170 worker->task->flags &= ~PF_WQ_WORKER;
2171
2172 set_task_comm(worker->task, "kworker/dying");
2173 ida_simple_remove(&pool->worker_ida, worker->id);
2174 worker_detach_from_pool(worker, pool);
2175 kfree(worker);
2311 return 0; 2176 return 0;
2312 } 2177 }
2313 2178
@@ -2355,9 +2220,6 @@ recheck:
2355 2220
2356 worker_set_flags(worker, WORKER_PREP, false); 2221 worker_set_flags(worker, WORKER_PREP, false);
2357sleep: 2222sleep:
2358 if (unlikely(need_to_manage_workers(pool)) && manage_workers(worker))
2359 goto recheck;
2360
2361 /* 2223 /*
2362 * pool->lock is held and there's no work to process and no need to 2224 * pool->lock is held and there's no work to process and no need to
2363 * manage, sleep. Workers are woken up only while holding 2225 * manage, sleep. Workers are woken up only while holding
@@ -2398,6 +2260,7 @@ static int rescuer_thread(void *__rescuer)
2398 struct worker *rescuer = __rescuer; 2260 struct worker *rescuer = __rescuer;
2399 struct workqueue_struct *wq = rescuer->rescue_wq; 2261 struct workqueue_struct *wq = rescuer->rescue_wq;
2400 struct list_head *scheduled = &rescuer->scheduled; 2262 struct list_head *scheduled = &rescuer->scheduled;
2263 bool should_stop;
2401 2264
2402 set_user_nice(current, RESCUER_NICE_LEVEL); 2265 set_user_nice(current, RESCUER_NICE_LEVEL);
2403 2266
@@ -2409,11 +2272,15 @@ static int rescuer_thread(void *__rescuer)
2409repeat: 2272repeat:
2410 set_current_state(TASK_INTERRUPTIBLE); 2273 set_current_state(TASK_INTERRUPTIBLE);
2411 2274
2412 if (kthread_should_stop()) { 2275 /*
2413 __set_current_state(TASK_RUNNING); 2276 * By the time the rescuer is requested to stop, the workqueue
2414 rescuer->task->flags &= ~PF_WQ_WORKER; 2277 * shouldn't have any work pending, but @wq->maydays may still have
 2415 return 0; 2278 * pwq(s) queued. This can happen when non-rescuer workers consume
 2416 } 2279 * all the work items before the rescuer gets to them. Go through
2280 * @wq->maydays processing before acting on should_stop so that the
2281 * list is always empty on exit.
2282 */
2283 should_stop = kthread_should_stop();
2417 2284
2418 /* see whether any pwq is asking for help */ 2285 /* see whether any pwq is asking for help */
2419 spin_lock_irq(&wq_mayday_lock); 2286 spin_lock_irq(&wq_mayday_lock);
@@ -2429,8 +2296,9 @@ repeat:
2429 2296
2430 spin_unlock_irq(&wq_mayday_lock); 2297 spin_unlock_irq(&wq_mayday_lock);
2431 2298
2432 /* migrate to the target cpu if possible */ 2299 worker_attach_to_pool(rescuer, pool);
2433 worker_maybe_bind_and_lock(pool); 2300
2301 spin_lock_irq(&pool->lock);
2434 rescuer->pool = pool; 2302 rescuer->pool = pool;
2435 2303
2436 /* 2304 /*
@@ -2443,6 +2311,17 @@ repeat:
 				move_linked_works(work, scheduled, &n);
 
 		process_scheduled_works(rescuer);
+		spin_unlock_irq(&pool->lock);
+
+		worker_detach_from_pool(rescuer, pool);
+
+		spin_lock_irq(&pool->lock);
+
+		/*
+		 * Put the reference grabbed by send_mayday().  @pool won't
+		 * go away while we're holding its lock.
+		 */
+		put_pwq(pwq);
 
 		/*
 		 * Leave this pool.  If keep_working() is %true, notify a
@@ -2459,6 +2338,12 @@ repeat:
 
 	spin_unlock_irq(&wq_mayday_lock);
 
+	if (should_stop) {
+		__set_current_state(TASK_RUNNING);
+		rescuer->task->flags &= ~PF_WQ_WORKER;
+		return 0;
+	}
+
 	/* rescuers should never participate in concurrency management */
 	WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING));
 	schedule();
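The rescuer change above boils down to one ordering rule: sample kthread_should_stop() before draining @wq->maydays and act on it only afterwards, so the mayday list is guaranteed empty by the time the thread exits. A minimal sketch of that pattern in isolation, with hypothetical queue helpers (my_queue, queue_empty(), drain_one()) standing in for the mayday handling:

#include <linux/kthread.h>
#include <linux/sched.h>

struct my_queue;				/* hypothetical work source */
extern bool queue_empty(struct my_queue *q);	/* hypothetical helper */
extern void drain_one(struct my_queue *q);	/* hypothetical helper */

static int drain_then_stop_thread(void *data)
{
	struct my_queue *q = data;
	bool should_stop;

repeat:
	set_current_state(TASK_INTERRUPTIBLE);

	/*
	 * Sample the stop request before draining so that anything queued
	 * before kthread_stop() was called is still processed and the
	 * queue is guaranteed empty when the thread exits.
	 */
	should_stop = kthread_should_stop();

	while (!queue_empty(q))
		drain_one(q);

	if (should_stop) {
		__set_current_state(TASK_RUNNING);
		return 0;
	}

	schedule();
	goto repeat;
}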
@@ -3527,9 +3412,10 @@ static int init_worker_pool(struct worker_pool *pool)
 		    (unsigned long)pool);
 
 	mutex_init(&pool->manager_arb);
-	mutex_init(&pool->manager_mutex);
-	idr_init(&pool->worker_idr);
+	mutex_init(&pool->attach_mutex);
+	INIT_LIST_HEAD(&pool->workers);
 
+	ida_init(&pool->worker_ida);
 	INIT_HLIST_NODE(&pool->hash_node);
 	pool->refcnt = 1;
 
@@ -3544,7 +3430,7 @@ static void rcu_free_pool(struct rcu_head *rcu)
 {
 	struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu);
 
-	idr_destroy(&pool->worker_idr);
+	ida_destroy(&pool->worker_ida);
 	free_workqueue_attrs(pool->attrs);
 	kfree(pool);
 }
@@ -3562,6 +3448,7 @@ static void rcu_free_pool(struct rcu_head *rcu)
  */
 static void put_unbound_pool(struct worker_pool *pool)
 {
+	DECLARE_COMPLETION_ONSTACK(detach_completion);
 	struct worker *worker;
 
 	lockdep_assert_held(&wq_pool_mutex);
@@ -3582,18 +3469,24 @@ static void put_unbound_pool(struct worker_pool *pool)
 	/*
 	 * Become the manager and destroy all workers.  Grabbing
 	 * manager_arb prevents @pool's workers from blocking on
-	 * manager_mutex.
+	 * attach_mutex.
 	 */
 	mutex_lock(&pool->manager_arb);
-	mutex_lock(&pool->manager_mutex);
-	spin_lock_irq(&pool->lock);
 
-	while ((worker = first_worker(pool)))
+	spin_lock_irq(&pool->lock);
+	while ((worker = first_idle_worker(pool)))
 		destroy_worker(worker);
 	WARN_ON(pool->nr_workers || pool->nr_idle);
-
 	spin_unlock_irq(&pool->lock);
-	mutex_unlock(&pool->manager_mutex);
+
+	mutex_lock(&pool->attach_mutex);
+	if (!list_empty(&pool->workers))
+		pool->detach_completion = &detach_completion;
+	mutex_unlock(&pool->attach_mutex);
+
+	if (pool->detach_completion)
+		wait_for_completion(pool->detach_completion);
+
 	mutex_unlock(&pool->manager_arb);
 
 	/* shut down the timers */
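put_unbound_pool() now synchronizes with exiting workers through a completion instead of holding a manager lock across destruction: the destroyer records, under attach_mutex, whether any worker is still on pool->workers, and the last worker to detach completes it. A rough sketch of both sides of that handshake, assuming the fields introduced in this series; detach_worker() and wait_for_workers() are illustrative names (the detach side mirrors what worker_detach_from_pool() is expected to do):

/* Worker side: called as a worker leaves its pool. */
static void detach_worker(struct worker_pool *pool, struct worker *worker)
{
	struct completion *detach_completion = NULL;

	mutex_lock(&pool->attach_mutex);
	list_del(&worker->node);
	if (list_empty(&pool->workers))
		detach_completion = pool->detach_completion;
	mutex_unlock(&pool->attach_mutex);

	if (detach_completion)
		complete(detach_completion);
}

/* Destroyer side: wait until every worker has detached. */
static void wait_for_workers(struct worker_pool *pool)
{
	DECLARE_COMPLETION_ONSTACK(detach_completion);

	mutex_lock(&pool->attach_mutex);
	if (!list_empty(&pool->workers))
		pool->detach_completion = &detach_completion;
	mutex_unlock(&pool->attach_mutex);

	if (pool->detach_completion)
		wait_for_completion(pool->detach_completion);
}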
@@ -3639,9 +3532,6 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
 	if (!pool || init_worker_pool(pool) < 0)
 		goto fail;
 
-	if (workqueue_freezing)
-		pool->flags |= POOL_FREEZING;
-
 	lockdep_set_subclass(&pool->lock, 1);	/* see put_pwq() */
 	copy_workqueue_attrs(pool->attrs, attrs);
 
@@ -3748,7 +3638,12 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq)
 
 	spin_lock_irq(&pwq->pool->lock);
 
-	if (!freezable || !(pwq->pool->flags & POOL_FREEZING)) {
+	/*
+	 * During [un]freezing, the caller is responsible for ensuring that
+	 * this function is called at least once after @workqueue_freezing
+	 * is updated and visible.
+	 */
+	if (!freezable || !workqueue_freezing) {
 		pwq->max_active = wq->saved_max_active;
 
 		while (!list_empty(&pwq->delayed_works) &&
@@ -4080,17 +3975,13 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
 	 * Let's determine what needs to be done.  If the target cpumask is
 	 * different from wq's, we need to compare it to @pwq's and create
 	 * a new one if they don't match.  If the target cpumask equals
-	 * wq's, the default pwq should be used.  If @pwq is already the
-	 * default one, nothing to do; otherwise, install the default one.
+	 * wq's, the default pwq should be used.
 	 */
 	if (wq_calc_node_cpumask(wq->unbound_attrs, node, cpu_off, cpumask)) {
 		if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask))
 			goto out_unlock;
 	} else {
-		if (pwq == wq->dfl_pwq)
-			goto out_unlock;
-		else
-			goto use_dfl_pwq;
+		goto use_dfl_pwq;
 	}
 
 	mutex_unlock(&wq->mutex);
@@ -4098,9 +3989,10 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
 	/* create a new pwq */
 	pwq = alloc_unbound_pwq(wq, target_attrs);
 	if (!pwq) {
-		pr_warning("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n",
+		pr_warn("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n",
 			wq->name);
-		goto out_unlock;
+		mutex_lock(&wq->mutex);
+		goto use_dfl_pwq;
 	}
 
 	/*
@@ -4575,28 +4467,27 @@ static void wq_unbind_fn(struct work_struct *work)
 	int cpu = smp_processor_id();
 	struct worker_pool *pool;
 	struct worker *worker;
-	int wi;
 
 	for_each_cpu_worker_pool(pool, cpu) {
 		WARN_ON_ONCE(cpu != smp_processor_id());
 
-		mutex_lock(&pool->manager_mutex);
+		mutex_lock(&pool->attach_mutex);
 		spin_lock_irq(&pool->lock);
 
 		/*
-		 * We've blocked all manager operations.  Make all workers
+		 * We've blocked all attach/detach operations.  Make all workers
 		 * unbound and set DISASSOCIATED.  Before this, all workers
 		 * except for the ones which are still executing works from
 		 * before the last CPU down must be on the cpu.  After
 		 * this, they may become diasporas.
 		 */
-		for_each_pool_worker(worker, wi, pool)
+		for_each_pool_worker(worker, pool)
 			worker->flags |= WORKER_UNBOUND;
 
 		pool->flags |= POOL_DISASSOCIATED;
 
 		spin_unlock_irq(&pool->lock);
-		mutex_unlock(&pool->manager_mutex);
+		mutex_unlock(&pool->attach_mutex);
 
 		/*
 		 * Call schedule() so that we cross rq->lock and thus can
@@ -4636,9 +4527,8 @@ static void wq_unbind_fn(struct work_struct *work)
 static void rebind_workers(struct worker_pool *pool)
 {
 	struct worker *worker;
-	int wi;
 
-	lockdep_assert_held(&pool->manager_mutex);
+	lockdep_assert_held(&pool->attach_mutex);
 
 	/*
 	 * Restore CPU affinity of all workers.  As all idle workers should
@@ -4647,13 +4537,13 @@ static void rebind_workers(struct worker_pool *pool)
 	 * of all workers first and then clear UNBOUND.  As we're called
 	 * from CPU_ONLINE, the following shouldn't fail.
 	 */
-	for_each_pool_worker(worker, wi, pool)
+	for_each_pool_worker(worker, pool)
 		WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
 						  pool->attrs->cpumask) < 0);
 
 	spin_lock_irq(&pool->lock);
 
-	for_each_pool_worker(worker, wi, pool) {
+	for_each_pool_worker(worker, pool) {
 		unsigned int worker_flags = worker->flags;
 
 		/*
@@ -4705,9 +4595,8 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)
 {
 	static cpumask_t cpumask;
 	struct worker *worker;
-	int wi;
 
-	lockdep_assert_held(&pool->manager_mutex);
+	lockdep_assert_held(&pool->attach_mutex);
 
 	/* is @cpu allowed for @pool? */
 	if (!cpumask_test_cpu(cpu, pool->attrs->cpumask))
@@ -4719,7 +4608,7 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)
 		return;
 
 	/* as we're called from CPU_ONLINE, the following shouldn't fail */
-	for_each_pool_worker(worker, wi, pool)
+	for_each_pool_worker(worker, pool)
 		WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
 						  pool->attrs->cpumask) < 0);
 }
@@ -4752,7 +4641,7 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb,
 		mutex_lock(&wq_pool_mutex);
 
 		for_each_pool(pool, pi) {
-			mutex_lock(&pool->manager_mutex);
+			mutex_lock(&pool->attach_mutex);
 
 			if (pool->cpu == cpu) {
 				spin_lock_irq(&pool->lock);
@@ -4764,7 +4653,7 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb,
 				restore_unbound_workers_cpumask(pool, cpu);
 			}
 
-			mutex_unlock(&pool->manager_mutex);
+			mutex_unlock(&pool->attach_mutex);
 		}
 
 		/* update NUMA affinity of unbound workqueues */
@@ -4863,24 +4752,14 @@ EXPORT_SYMBOL_GPL(work_on_cpu);
  */
 void freeze_workqueues_begin(void)
 {
-	struct worker_pool *pool;
 	struct workqueue_struct *wq;
 	struct pool_workqueue *pwq;
-	int pi;
 
 	mutex_lock(&wq_pool_mutex);
 
 	WARN_ON_ONCE(workqueue_freezing);
 	workqueue_freezing = true;
 
-	/* set FREEZING */
-	for_each_pool(pool, pi) {
-		spin_lock_irq(&pool->lock);
-		WARN_ON_ONCE(pool->flags & POOL_FREEZING);
-		pool->flags |= POOL_FREEZING;
-		spin_unlock_irq(&pool->lock);
-	}
-
 	list_for_each_entry(wq, &workqueues, list) {
 		mutex_lock(&wq->mutex);
 		for_each_pwq(pwq, wq)
@@ -4950,21 +4829,13 @@ void thaw_workqueues(void)
 {
 	struct workqueue_struct *wq;
 	struct pool_workqueue *pwq;
-	struct worker_pool *pool;
-	int pi;
 
 	mutex_lock(&wq_pool_mutex);
 
 	if (!workqueue_freezing)
 		goto out_unlock;
 
-	/* clear FREEZING */
-	for_each_pool(pool, pi) {
-		spin_lock_irq(&pool->lock);
-		WARN_ON_ONCE(!(pool->flags & POOL_FREEZING));
-		pool->flags &= ~POOL_FREEZING;
-		spin_unlock_irq(&pool->lock);
-	}
+	workqueue_freezing = false;
 
 	/* restore max_active and repopulate worklist */
 	list_for_each_entry(wq, &workqueues, list) {
@@ -4974,7 +4845,6 @@ void thaw_workqueues(void)
 		mutex_unlock(&wq->mutex);
 	}
 
-	workqueue_freezing = false;
 out_unlock:
 	mutex_unlock(&wq_pool_mutex);
 }
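With POOL_FREEZING removed, freeze state is carried only by the global workqueue_freezing flag, and correctness relies on pwq_adjust_max_active() being called for every pwq after the flag changes, which both paths above do under wq_pool_mutex and wq->mutex. A condensed sketch of the resulting freeze path; freeze_sketch() is a hypothetical wrapper name, the calls inside it are the ones visible in the hunks above:

/* Condensed sketch of the post-patch freeze path, not the verbatim kernel code. */
static void freeze_sketch(void)
{
	struct workqueue_struct *wq;
	struct pool_workqueue *pwq;

	mutex_lock(&wq_pool_mutex);
	workqueue_freezing = true;		/* single global flag now */

	list_for_each_entry(wq, &workqueues, list) {
		mutex_lock(&wq->mutex);
		/*
		 * Runs after workqueue_freezing is updated, so every pwq
		 * observes the new value and freezable workqueues stop
		 * admitting new work.
		 */
		for_each_pwq(pwq, wq)
			pwq_adjust_max_active(pwq);
		mutex_unlock(&wq->mutex);
	}

	mutex_unlock(&wq_pool_mutex);
}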
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
index 7e2204db0b1a..45215870ac6c 100644
--- a/kernel/workqueue_internal.h
+++ b/kernel/workqueue_internal.h
@@ -37,6 +37,8 @@ struct worker {
 	struct task_struct	*task;		/* I: worker task */
 	struct worker_pool	*pool;		/* I: the associated pool */
 						/* L: for rescuers */
+	struct list_head	node;		/* A: anchored at pool->workers */
+						/* A: runs through worker->node */
 
 	unsigned long		last_active;	/* L: last active timestamp */
 	unsigned int		flags;		/* X: flags */
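The new node field is what lets a pool keep its workers on a plain list protected by attach_mutex instead of an IDR, which is why for_each_pool_worker() loses its int wi cursor throughout the workqueue.c hunks above. A minimal sketch of the attach side and the simplified iteration; attach_worker() and for_each_pool_worker_sketch() are illustrative names, not the kernel's:

/* Sketch of attaching a worker to its pool under attach_mutex. */
static void attach_worker(struct worker_pool *pool, struct worker *worker)
{
	mutex_lock(&pool->attach_mutex);
	list_add_tail(&worker->node, &pool->workers);
	worker->pool = pool;
	mutex_unlock(&pool->attach_mutex);
}

/* Iteration needs no IDR cursor any more, just the list (attach_mutex held). */
#define for_each_pool_worker_sketch(worker, pool)			\
	list_for_each_entry((worker), &(pool)->workers, node)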