Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 55
-rw-r--r--  kernel/acct.c | 8
-rw-r--r--  kernel/async.c | 159
-rw-r--r--  kernel/audit.c | 40
-rw-r--r--  kernel/audit_tree.c | 36
-rw-r--r--  kernel/audit_watch.c | 6
-rw-r--r--  kernel/auditfilter.c | 1
-rw-r--r--  kernel/auditsc.c | 124
-rw-r--r--  kernel/capability.c | 24
-rw-r--r--  kernel/cgroup.c | 1036
-rw-r--r--  kernel/cgroup_freezer.c | 514
-rw-r--r--  kernel/compat.c | 106
-rw-r--r--  kernel/context_tracking.c | 145
-rw-r--r--  kernel/cpu.c | 19
-rw-r--r--  kernel/cpuset.c | 968
-rw-r--r--  kernel/cred.c | 154
-rw-r--r--  kernel/debug/debug_core.c | 1
-rw-r--r--  kernel/debug/debug_core.h | 2
-rw-r--r--  kernel/debug/gdbstub.c | 4
-rw-r--r--  kernel/debug/kdb/kdb_bp.c | 20
-rw-r--r--  kernel/debug/kdb/kdb_debugger.c | 25
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 137
-rw-r--r--  kernel/debug/kdb/kdb_private.h | 4
-rw-r--r--  kernel/delayacct.c | 7
-rw-r--r--  kernel/events/core.c | 65
-rw-r--r--  kernel/events/hw_breakpoint.c | 14
-rw-r--r--  kernel/events/internal.h | 2
-rw-r--r--  kernel/events/ring_buffer.c | 22
-rw-r--r--  kernel/events/uprobes.c | 499
-rw-r--r--  kernel/exit.c | 122
-rw-r--r--  kernel/fork.c | 181
-rw-r--r--  kernel/freezer.c | 11
-rw-r--r--  kernel/futex.c | 110
-rw-r--r--  kernel/futex_compat.c | 21
-rw-r--r--  kernel/gcov/Kconfig | 2
-rw-r--r--  kernel/hrtimer.c | 41
-rw-r--r--  kernel/irq/chip.c | 31
-rw-r--r--  kernel/irq/irqdomain.c | 4
-rw-r--r--  kernel/irq/manage.c | 46
-rw-r--r--  kernel/irq/proc.c | 2
-rw-r--r--  kernel/irq/resend.c | 8
-rw-r--r--  kernel/irq/spurious.c | 7
-rw-r--r--  kernel/irq_work.c | 150
-rw-r--r--  kernel/kcmp.c | 1
-rw-r--r--  kernel/kexec.c | 78
-rw-r--r--  kernel/kfifo.c | 609
-rw-r--r--  kernel/kmod.c | 15
-rw-r--r--  kernel/kprobes.c | 66
-rw-r--r--  kernel/ksysfs.c | 23
-rw-r--r--  kernel/kthread.c | 54
-rw-r--r--  kernel/lockdep.c | 15
-rw-r--r--  kernel/lockdep_proc.c | 2
-rw-r--r--  kernel/modsign_certificate.S | 19
-rw-r--r--  kernel/modsign_pubkey.c | 21
-rw-r--r--  kernel/module.c | 710
-rw-r--r--  kernel/module_signing.c | 14
-rw-r--r--  kernel/mutex.c | 1
-rw-r--r--  kernel/nsproxy.c | 37
-rw-r--r--  kernel/padata.c | 5
-rw-r--r--  kernel/panic.c | 34
-rw-r--r--  kernel/pid.c | 78
-rw-r--r--  kernel/pid_namespace.c | 118
-rw-r--r--  kernel/posix-cpu-timers.c | 78
-rw-r--r--  kernel/posix-timers.c | 27
-rw-r--r--  kernel/power/autosleep.c | 2
-rw-r--r--  kernel/power/main.c | 31
-rw-r--r--  kernel/power/process.c | 17
-rw-r--r--  kernel/power/qos.c | 74
-rw-r--r--  kernel/power/suspend.c | 69
-rw-r--r--  kernel/power/suspend_test.c | 11
-rw-r--r--  kernel/power/swap.c | 2
-rw-r--r--  kernel/printk.c | 137
-rw-r--r--  kernel/profile.c | 31
-rw-r--r--  kernel/ptrace.c | 93
-rw-r--r--  kernel/rcu.h | 9
-rw-r--r--  kernel/rcupdate.c | 63
-rw-r--r--  kernel/rcutiny.c | 10
-rw-r--r--  kernel/rcutiny_plugin.h | 61
-rw-r--r--  kernel/rcutorture.c | 120
-rw-r--r--  kernel/rcutree.c | 577
-rw-r--r--  kernel/rcutree.h | 78
-rw-r--r--  kernel/rcutree_plugin.h | 422
-rw-r--r--  kernel/rcutree_trace.c | 330
-rw-r--r--  kernel/relay.c | 4
-rw-r--r--  kernel/res_counter.c | 42
-rw-r--r--  kernel/rtmutex-debug.c | 1
-rw-r--r--  kernel/rtmutex-tester.c | 1
-rw-r--r--  kernel/rtmutex.c | 1
-rw-r--r--  kernel/rwsem.c | 10
-rw-r--r--  kernel/sched/auto_group.c | 3
-rw-r--r--  kernel/sched/clock.c | 26
-rw-r--r--  kernel/sched/core.c | 370
-rw-r--r--  kernel/sched/cpupri.c | 2
-rw-r--r--  kernel/sched/cputime.c | 395
-rw-r--r--  kernel/sched/debug.c | 133
-rw-r--r--  kernel/sched/fair.c | 1159
-rw-r--r--  kernel/sched/features.h | 16
-rw-r--r--  kernel/sched/rt.c | 28
-rw-r--r--  kernel/sched/sched.h | 74
-rw-r--r--  kernel/sched/stats.c | 79
-rw-r--r--  kernel/seccomp.c | 13
-rw-r--r--  kernel/signal.c | 493
-rw-r--r--  kernel/smp.c | 192
-rw-r--r--  kernel/smpboot.c | 17
-rw-r--r--  kernel/softirq.c | 44
-rw-r--r--  kernel/srcu.c | 53
-rw-r--r--  kernel/stop_machine.c | 156
-rw-r--r--  kernel/sys.c | 373
-rw-r--r--  kernel/sys_ni.c | 1
-rw-r--r--  kernel/sysctl.c | 76
-rw-r--r--  kernel/sysctl_binary.c | 45
-rw-r--r--  kernel/time.c | 12
-rw-r--r--  kernel/time/Kconfig | 9
-rw-r--r--  kernel/time/Makefile | 2
-rw-r--r--  kernel/time/clockevents.c | 1
-rw-r--r--  kernel/time/jiffies.c | 8
-rw-r--r--  kernel/time/ntp.c | 48
-rw-r--r--  kernel/time/tick-broadcast.c | 41
-rw-r--r--  kernel/time/tick-common.c | 8
-rw-r--r--  kernel/time/tick-internal.h | 1
-rw-r--r--  kernel/time/tick-sched.c | 151
-rw-r--r--  kernel/time/timecompare.c | 193
-rw-r--r--  kernel/time/timekeeping.c | 135
-rw-r--r--  kernel/timeconst.bc | 108
-rw-r--r--  kernel/timeconst.pl | 378
-rw-r--r--  kernel/timer.c | 2
-rw-r--r--  kernel/trace/Kconfig | 56
-rw-r--r--  kernel/trace/blktrace.c | 30
-rw-r--r--  kernel/trace/ftrace.c | 228
-rw-r--r--  kernel/trace/power-traces.c | 3
-rw-r--r--  kernel/trace/ring_buffer.c | 179
-rw-r--r--  kernel/trace/trace.c | 814
-rw-r--r--  kernel/trace/trace.h | 158
-rw-r--r--  kernel/trace/trace_branch.c | 4
-rw-r--r--  kernel/trace/trace_clock.c | 5
-rw-r--r--  kernel/trace/trace_events.c | 52
-rw-r--r--  kernel/trace/trace_events_filter.c | 4
-rw-r--r--  kernel/trace/trace_functions.c | 68
-rw-r--r--  kernel/trace/trace_functions_graph.c | 74
-rw-r--r--  kernel/trace/trace_irqsoff.c | 35
-rw-r--r--  kernel/trace/trace_kprobe.c | 10
-rw-r--r--  kernel/trace/trace_output.c | 81
-rw-r--r--  kernel/trace/trace_probe.c | 14
-rw-r--r--  kernel/trace/trace_probe.h | 1
-rw-r--r--  kernel/trace/trace_sched_switch.c | 4
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 32
-rw-r--r--  kernel/trace/trace_selftest.c | 34
-rw-r--r--  kernel/trace/trace_stack.c | 6
-rw-r--r--  kernel/trace/trace_syscalls.c | 122
-rw-r--r--  kernel/trace/trace_uprobe.c | 229
-rw-r--r--  kernel/tracepoint.c | 6
-rw-r--r--  kernel/tsacct.c | 44
-rw-r--r--  kernel/user-return-notifier.c | 4
-rw-r--r--  kernel/user.c | 11
-rw-r--r--  kernel/user_namespace.c | 220
-rw-r--r--  kernel/utsname.c | 36
-rw-r--r--  kernel/utsname_sysctl.c | 3
-rw-r--r--  kernel/wait.c | 2
-rw-r--r--  kernel/watchdog.c | 32
-rw-r--r--  kernel/workqueue.c | 1586
-rw-r--r--  kernel/workqueue_internal.h | 65
-rw-r--r--  kernel/workqueue_sched.h | 9
162 files changed, 10846 insertions, 7394 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 86e3285ae7e5..bbde5f1a4486 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -7,7 +7,7 @@ obj-y = fork.o exec_domain.o panic.o printk.o \
7 sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ 7 sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \
8 signal.o sys.o kmod.o workqueue.o pid.o task_work.o \ 8 signal.o sys.o kmod.o workqueue.o pid.o task_work.o \
9 rcupdate.o extable.o params.o posix-timers.o \ 9 rcupdate.o extable.o params.o posix-timers.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o cred.o \ 12 notifier.o ksysfs.o cred.o \
13 async.o range.o groups.o lglock.o smpboot.o 13 async.o range.o groups.o lglock.o smpboot.o
@@ -25,9 +25,7 @@ endif
25obj-y += sched/ 25obj-y += sched/
26obj-y += power/ 26obj-y += power/
27 27
28ifeq ($(CONFIG_CHECKPOINT_RESTORE),y) 28obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o
29obj-$(CONFIG_X86) += kcmp.o
30endif
31obj-$(CONFIG_FREEZER) += freezer.o 29obj-$(CONFIG_FREEZER) += freezer.o
32obj-$(CONFIG_PROFILING) += profile.o 30obj-$(CONFIG_PROFILING) += profile.o
33obj-$(CONFIG_STACKTRACE) += stacktrace.o 31obj-$(CONFIG_STACKTRACE) += stacktrace.o
@@ -54,7 +52,7 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
54obj-$(CONFIG_PROVE_LOCKING) += spinlock.o 52obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
55obj-$(CONFIG_UID16) += uid16.o 53obj-$(CONFIG_UID16) += uid16.o
56obj-$(CONFIG_MODULES) += module.o 54obj-$(CONFIG_MODULES) += module.o
57obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o 55obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o modsign_certificate.o
58obj-$(CONFIG_KALLSYMS) += kallsyms.o 56obj-$(CONFIG_KALLSYMS) += kallsyms.o
59obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o 57obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
60obj-$(CONFIG_KEXEC) += kexec.o 58obj-$(CONFIG_KEXEC) += kexec.o
@@ -110,6 +108,7 @@ obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
110obj-$(CONFIG_PADATA) += padata.o 108obj-$(CONFIG_PADATA) += padata.o
111obj-$(CONFIG_CRASH_DUMP) += crash_dump.o 109obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
112obj-$(CONFIG_JUMP_LABEL) += jump_label.o 110obj-$(CONFIG_JUMP_LABEL) += jump_label.o
111obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o
113 112
114$(obj)/configs.o: $(obj)/config_data.h 113$(obj)/configs.o: $(obj)/config_data.h
115 114
@@ -126,20 +125,32 @@ $(obj)/config_data.h: $(obj)/config_data.gz FORCE
126 125
127$(obj)/time.o: $(obj)/timeconst.h 126$(obj)/time.o: $(obj)/timeconst.h
128 127
129quiet_cmd_timeconst = TIMEC $@ 128quiet_cmd_hzfile = HZFILE $@
130 cmd_timeconst = $(PERL) $< $(CONFIG_HZ) > $@ 129 cmd_hzfile = echo "hz=$(CONFIG_HZ)" > $@
130
131targets += hz.bc
132$(obj)/hz.bc: $(objtree)/include/config/hz.h FORCE
133 $(call if_changed,hzfile)
134
135quiet_cmd_bc = BC $@
136 cmd_bc = bc -q $(filter-out FORCE,$^) > $@
137
131targets += timeconst.h 138targets += timeconst.h
132$(obj)/timeconst.h: $(src)/timeconst.pl FORCE 139$(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE
133 $(call if_changed,timeconst) 140 $(call if_changed,bc)
134 141
135ifeq ($(CONFIG_MODULE_SIG),y) 142ifeq ($(CONFIG_MODULE_SIG),y)
136# 143#
137# Pull the signing certificate and any extra certificates into the kernel 144# Pull the signing certificate and any extra certificates into the kernel
138# 145#
146
147quiet_cmd_touch = TOUCH $@
148 cmd_touch = touch $@
149
139extra_certificates: 150extra_certificates:
140 touch $@ 151 $(call cmd,touch)
141 152
142kernel/modsign_pubkey.o: signing_key.x509 extra_certificates 153kernel/modsign_certificate.o: signing_key.x509 extra_certificates
143 154
144############################################################################### 155###############################################################################
145# 156#
@@ -148,23 +159,7 @@ kernel/modsign_pubkey.o: signing_key.x509 extra_certificates
148# fail and that the kernel may be used afterwards. 159# fail and that the kernel may be used afterwards.
149# 160#
150############################################################################### 161###############################################################################
151sign_key_with_hash := 162ifndef CONFIG_MODULE_SIG_HASH
152ifeq ($(CONFIG_MODULE_SIG_SHA1),y)
153sign_key_with_hash := -sha1
154endif
155ifeq ($(CONFIG_MODULE_SIG_SHA224),y)
156sign_key_with_hash := -sha224
157endif
158ifeq ($(CONFIG_MODULE_SIG_SHA256),y)
159sign_key_with_hash := -sha256
160endif
161ifeq ($(CONFIG_MODULE_SIG_SHA384),y)
162sign_key_with_hash := -sha384
163endif
164ifeq ($(CONFIG_MODULE_SIG_SHA512),y)
165sign_key_with_hash := -sha512
166endif
167ifeq ($(sign_key_with_hash),)
168$(error Could not determine digest type to use from kernel config) 163$(error Could not determine digest type to use from kernel config)
169endif 164endif
170 165
@@ -177,8 +172,8 @@ signing_key.priv signing_key.x509: x509.genkey
177 @echo "### needs to be run as root, and uses a hardware random" 172 @echo "### needs to be run as root, and uses a hardware random"
178 @echo "### number generator if one is available." 173 @echo "### number generator if one is available."
179 @echo "###" 174 @echo "###"
180 openssl req -new -nodes -utf8 $(sign_key_with_hash) -days 36500 -batch \ 175 openssl req -new -nodes -utf8 -$(CONFIG_MODULE_SIG_HASH) -days 36500 \
181 -x509 -config x509.genkey \ 176 -batch -x509 -config x509.genkey \
182 -outform DER -out signing_key.x509 \ 177 -outform DER -out signing_key.x509 \
183 -keyout signing_key.priv 178 -keyout signing_key.priv
184 @echo "###" 179 @echo "###"
diff --git a/kernel/acct.c b/kernel/acct.c
index 051e071a06e7..b9bd7f098ee5 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -205,7 +205,7 @@ static int acct_on(struct filename *pathname)
205 if (IS_ERR(file)) 205 if (IS_ERR(file))
206 return PTR_ERR(file); 206 return PTR_ERR(file);
207 207
208 if (!S_ISREG(file->f_path.dentry->d_inode->i_mode)) { 208 if (!S_ISREG(file_inode(file)->i_mode)) {
209 filp_close(file, NULL); 209 filp_close(file, NULL);
210 return -EACCES; 210 return -EACCES;
211 } 211 }
@@ -566,6 +566,7 @@ out:
566void acct_collect(long exitcode, int group_dead) 566void acct_collect(long exitcode, int group_dead)
567{ 567{
568 struct pacct_struct *pacct = &current->signal->pacct; 568 struct pacct_struct *pacct = &current->signal->pacct;
569 cputime_t utime, stime;
569 unsigned long vsize = 0; 570 unsigned long vsize = 0;
570 571
571 if (group_dead && current->mm) { 572 if (group_dead && current->mm) {
@@ -593,8 +594,9 @@ void acct_collect(long exitcode, int group_dead)
593 pacct->ac_flag |= ACORE; 594 pacct->ac_flag |= ACORE;
594 if (current->flags & PF_SIGNALED) 595 if (current->flags & PF_SIGNALED)
595 pacct->ac_flag |= AXSIG; 596 pacct->ac_flag |= AXSIG;
596 pacct->ac_utime += current->utime; 597 task_cputime(current, &utime, &stime);
597 pacct->ac_stime += current->stime; 598 pacct->ac_utime += utime;
599 pacct->ac_stime += stime;
598 pacct->ac_minflt += current->min_flt; 600 pacct->ac_minflt += current->min_flt;
599 pacct->ac_majflt += current->maj_flt; 601 pacct->ac_majflt += current->maj_flt;
600 spin_unlock_irq(&current->sighand->siglock); 602 spin_unlock_irq(&current->sighand->siglock);
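
The acct.c hunk above switches acct_collect() from reading current->utime/current->stime directly to the task_cputime() accessor, which hides how the raw cputime fields are stored. A minimal caller sketch under that assumption; the helper name below is hypothetical, only task_cputime() comes from this tree:

#include <linux/sched.h>	/* struct task_struct, cputime_t, task_cputime() */

/* Hypothetical helper: total CPU time charged to @tsk, read through the
 * accessor instead of touching tsk->utime/tsk->stime directly. */
static cputime_t example_total_cputime(struct task_struct *tsk)
{
	cputime_t utime, stime;

	task_cputime(tsk, &utime, &stime);	/* fills both values */
	return utime + stime;
}
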
diff --git a/kernel/async.c b/kernel/async.c
index 9d3118384858..8ddee2c3e5b0 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -57,56 +57,52 @@ asynchronous and synchronous parts of the kernel.
57#include <linux/slab.h> 57#include <linux/slab.h>
58#include <linux/workqueue.h> 58#include <linux/workqueue.h>
59 59
60#include "workqueue_internal.h"
61
60static async_cookie_t next_cookie = 1; 62static async_cookie_t next_cookie = 1;
61 63
62#define MAX_WORK 32768 64#define MAX_WORK 32768
65#define ASYNC_COOKIE_MAX ULLONG_MAX /* infinity cookie */
63 66
64static LIST_HEAD(async_pending); 67static LIST_HEAD(async_global_pending); /* pending from all registered doms */
65static ASYNC_DOMAIN(async_running); 68static ASYNC_DOMAIN(async_dfl_domain);
66static LIST_HEAD(async_domains);
67static DEFINE_SPINLOCK(async_lock); 69static DEFINE_SPINLOCK(async_lock);
68static DEFINE_MUTEX(async_register_mutex);
69 70
70struct async_entry { 71struct async_entry {
71 struct list_head list; 72 struct list_head domain_list;
73 struct list_head global_list;
72 struct work_struct work; 74 struct work_struct work;
73 async_cookie_t cookie; 75 async_cookie_t cookie;
74 async_func_ptr *func; 76 async_func_ptr *func;
75 void *data; 77 void *data;
76 struct async_domain *running; 78 struct async_domain *domain;
77}; 79};
78 80
79static DECLARE_WAIT_QUEUE_HEAD(async_done); 81static DECLARE_WAIT_QUEUE_HEAD(async_done);
80 82
81static atomic_t entry_count; 83static atomic_t entry_count;
82 84
83 85static async_cookie_t lowest_in_progress(struct async_domain *domain)
84/*
85 * MUST be called with the lock held!
86 */
87static async_cookie_t __lowest_in_progress(struct async_domain *running)
88{ 86{
89 struct async_entry *entry; 87 struct async_entry *first = NULL;
90 88 async_cookie_t ret = ASYNC_COOKIE_MAX;
91 if (!list_empty(&running->domain)) { 89 unsigned long flags;
92 entry = list_first_entry(&running->domain, typeof(*entry), list);
93 return entry->cookie;
94 }
95 90
96 list_for_each_entry(entry, &async_pending, list) 91 spin_lock_irqsave(&async_lock, flags);
97 if (entry->running == running)
98 return entry->cookie;
99 92
100 return next_cookie; /* "infinity" value */ 93 if (domain) {
101} 94 if (!list_empty(&domain->pending))
95 first = list_first_entry(&domain->pending,
96 struct async_entry, domain_list);
97 } else {
98 if (!list_empty(&async_global_pending))
99 first = list_first_entry(&async_global_pending,
100 struct async_entry, global_list);
101 }
102 102
103static async_cookie_t lowest_in_progress(struct async_domain *running) 103 if (first)
104{ 104 ret = first->cookie;
105 unsigned long flags;
106 async_cookie_t ret;
107 105
108 spin_lock_irqsave(&async_lock, flags);
109 ret = __lowest_in_progress(running);
110 spin_unlock_irqrestore(&async_lock, flags); 106 spin_unlock_irqrestore(&async_lock, flags);
111 return ret; 107 return ret;
112} 108}
@@ -120,14 +116,8 @@ static void async_run_entry_fn(struct work_struct *work)
120 container_of(work, struct async_entry, work); 116 container_of(work, struct async_entry, work);
121 unsigned long flags; 117 unsigned long flags;
122 ktime_t uninitialized_var(calltime), delta, rettime; 118 ktime_t uninitialized_var(calltime), delta, rettime;
123 struct async_domain *running = entry->running;
124 119
125 /* 1) move self to the running queue */ 120 /* 1) run (and print duration) */
126 spin_lock_irqsave(&async_lock, flags);
127 list_move_tail(&entry->list, &running->domain);
128 spin_unlock_irqrestore(&async_lock, flags);
129
130 /* 2) run (and print duration) */
131 if (initcall_debug && system_state == SYSTEM_BOOTING) { 121 if (initcall_debug && system_state == SYSTEM_BOOTING) {
132 printk(KERN_DEBUG "calling %lli_%pF @ %i\n", 122 printk(KERN_DEBUG "calling %lli_%pF @ %i\n",
133 (long long)entry->cookie, 123 (long long)entry->cookie,
@@ -144,23 +134,22 @@ static void async_run_entry_fn(struct work_struct *work)
144 (long long)ktime_to_ns(delta) >> 10); 134 (long long)ktime_to_ns(delta) >> 10);
145 } 135 }
146 136
147 /* 3) remove self from the running queue */ 137 /* 2) remove self from the pending queues */
148 spin_lock_irqsave(&async_lock, flags); 138 spin_lock_irqsave(&async_lock, flags);
149 list_del(&entry->list); 139 list_del_init(&entry->domain_list);
150 if (running->registered && --running->count == 0) 140 list_del_init(&entry->global_list);
151 list_del_init(&running->node);
152 141
153 /* 4) free the entry */ 142 /* 3) free the entry */
154 kfree(entry); 143 kfree(entry);
155 atomic_dec(&entry_count); 144 atomic_dec(&entry_count);
156 145
157 spin_unlock_irqrestore(&async_lock, flags); 146 spin_unlock_irqrestore(&async_lock, flags);
158 147
159 /* 5) wake up any waiters */ 148 /* 4) wake up any waiters */
160 wake_up(&async_done); 149 wake_up(&async_done);
161} 150}
162 151
163static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct async_domain *running) 152static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct async_domain *domain)
164{ 153{
165 struct async_entry *entry; 154 struct async_entry *entry;
166 unsigned long flags; 155 unsigned long flags;
@@ -183,19 +172,28 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct a
183 ptr(data, newcookie); 172 ptr(data, newcookie);
184 return newcookie; 173 return newcookie;
185 } 174 }
175 INIT_LIST_HEAD(&entry->domain_list);
176 INIT_LIST_HEAD(&entry->global_list);
186 INIT_WORK(&entry->work, async_run_entry_fn); 177 INIT_WORK(&entry->work, async_run_entry_fn);
187 entry->func = ptr; 178 entry->func = ptr;
188 entry->data = data; 179 entry->data = data;
189 entry->running = running; 180 entry->domain = domain;
190 181
191 spin_lock_irqsave(&async_lock, flags); 182 spin_lock_irqsave(&async_lock, flags);
183
184 /* allocate cookie and queue */
192 newcookie = entry->cookie = next_cookie++; 185 newcookie = entry->cookie = next_cookie++;
193 list_add_tail(&entry->list, &async_pending); 186
194 if (running->registered && running->count++ == 0) 187 list_add_tail(&entry->domain_list, &domain->pending);
195 list_add_tail(&running->node, &async_domains); 188 if (domain->registered)
189 list_add_tail(&entry->global_list, &async_global_pending);
190
196 atomic_inc(&entry_count); 191 atomic_inc(&entry_count);
197 spin_unlock_irqrestore(&async_lock, flags); 192 spin_unlock_irqrestore(&async_lock, flags);
198 193
194 /* mark that this task has queued an async job, used by module init */
195 current->flags |= PF_USED_ASYNC;
196
199 /* schedule for execution */ 197 /* schedule for execution */
200 queue_work(system_unbound_wq, &entry->work); 198 queue_work(system_unbound_wq, &entry->work);
201 199
@@ -212,7 +210,7 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct a
212 */ 210 */
213async_cookie_t async_schedule(async_func_ptr *ptr, void *data) 211async_cookie_t async_schedule(async_func_ptr *ptr, void *data)
214{ 212{
215 return __async_schedule(ptr, data, &async_running); 213 return __async_schedule(ptr, data, &async_dfl_domain);
216} 214}
217EXPORT_SYMBOL_GPL(async_schedule); 215EXPORT_SYMBOL_GPL(async_schedule);
218 216
@@ -220,18 +218,18 @@ EXPORT_SYMBOL_GPL(async_schedule);
220 * async_schedule_domain - schedule a function for asynchronous execution within a certain domain 218 * async_schedule_domain - schedule a function for asynchronous execution within a certain domain
221 * @ptr: function to execute asynchronously 219 * @ptr: function to execute asynchronously
222 * @data: data pointer to pass to the function 220 * @data: data pointer to pass to the function
223 * @running: running list for the domain 221 * @domain: the domain
224 * 222 *
225 * Returns an async_cookie_t that may be used for checkpointing later. 223 * Returns an async_cookie_t that may be used for checkpointing later.
226 * @running may be used in the async_synchronize_*_domain() functions 224 * @domain may be used in the async_synchronize_*_domain() functions to
227 * to wait within a certain synchronization domain rather than globally. 225 * wait within a certain synchronization domain rather than globally. A
228 * A synchronization domain is specified via the running queue @running to use. 226 * synchronization domain is specified via @domain. Note: This function
229 * Note: This function may be called from atomic or non-atomic contexts. 227 * may be called from atomic or non-atomic contexts.
230 */ 228 */
231async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data, 229async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data,
232 struct async_domain *running) 230 struct async_domain *domain)
233{ 231{
234 return __async_schedule(ptr, data, running); 232 return __async_schedule(ptr, data, domain);
235} 233}
236EXPORT_SYMBOL_GPL(async_schedule_domain); 234EXPORT_SYMBOL_GPL(async_schedule_domain);
237 235
@@ -242,18 +240,7 @@ EXPORT_SYMBOL_GPL(async_schedule_domain);
242 */ 240 */
243void async_synchronize_full(void) 241void async_synchronize_full(void)
244{ 242{
245 mutex_lock(&async_register_mutex); 243 async_synchronize_full_domain(NULL);
246 do {
247 struct async_domain *domain = NULL;
248
249 spin_lock_irq(&async_lock);
250 if (!list_empty(&async_domains))
251 domain = list_first_entry(&async_domains, typeof(*domain), node);
252 spin_unlock_irq(&async_lock);
253
254 async_synchronize_cookie_domain(next_cookie, domain);
255 } while (!list_empty(&async_domains));
256 mutex_unlock(&async_register_mutex);
257} 244}
258EXPORT_SYMBOL_GPL(async_synchronize_full); 245EXPORT_SYMBOL_GPL(async_synchronize_full);
259 246
@@ -268,51 +255,45 @@ EXPORT_SYMBOL_GPL(async_synchronize_full);
268 */ 255 */
269void async_unregister_domain(struct async_domain *domain) 256void async_unregister_domain(struct async_domain *domain)
270{ 257{
271 mutex_lock(&async_register_mutex);
272 spin_lock_irq(&async_lock); 258 spin_lock_irq(&async_lock);
273 WARN_ON(!domain->registered || !list_empty(&domain->node) || 259 WARN_ON(!domain->registered || !list_empty(&domain->pending));
274 !list_empty(&domain->domain));
275 domain->registered = 0; 260 domain->registered = 0;
276 spin_unlock_irq(&async_lock); 261 spin_unlock_irq(&async_lock);
277 mutex_unlock(&async_register_mutex);
278} 262}
279EXPORT_SYMBOL_GPL(async_unregister_domain); 263EXPORT_SYMBOL_GPL(async_unregister_domain);
280 264
281/** 265/**
282 * async_synchronize_full_domain - synchronize all asynchronous function within a certain domain 266 * async_synchronize_full_domain - synchronize all asynchronous function within a certain domain
283 * @domain: running list to synchronize on 267 * @domain: the domain to synchronize
284 * 268 *
285 * This function waits until all asynchronous function calls for the 269 * This function waits until all asynchronous function calls for the
286 * synchronization domain specified by the running list @domain have been done. 270 * synchronization domain specified by @domain have been done.
287 */ 271 */
288void async_synchronize_full_domain(struct async_domain *domain) 272void async_synchronize_full_domain(struct async_domain *domain)
289{ 273{
290 async_synchronize_cookie_domain(next_cookie, domain); 274 async_synchronize_cookie_domain(ASYNC_COOKIE_MAX, domain);
291} 275}
292EXPORT_SYMBOL_GPL(async_synchronize_full_domain); 276EXPORT_SYMBOL_GPL(async_synchronize_full_domain);
293 277
294/** 278/**
295 * async_synchronize_cookie_domain - synchronize asynchronous function calls within a certain domain with cookie checkpointing 279 * async_synchronize_cookie_domain - synchronize asynchronous function calls within a certain domain with cookie checkpointing
296 * @cookie: async_cookie_t to use as checkpoint 280 * @cookie: async_cookie_t to use as checkpoint
297 * @running: running list to synchronize on 281 * @domain: the domain to synchronize (%NULL for all registered domains)
298 * 282 *
299 * This function waits until all asynchronous function calls for the 283 * This function waits until all asynchronous function calls for the
300 * synchronization domain specified by running list @running submitted 284 * synchronization domain specified by @domain submitted prior to @cookie
301 * prior to @cookie have been done. 285 * have been done.
302 */ 286 */
303void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *running) 287void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *domain)
304{ 288{
305 ktime_t uninitialized_var(starttime), delta, endtime; 289 ktime_t uninitialized_var(starttime), delta, endtime;
306 290
307 if (!running)
308 return;
309
310 if (initcall_debug && system_state == SYSTEM_BOOTING) { 291 if (initcall_debug && system_state == SYSTEM_BOOTING) {
311 printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current)); 292 printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current));
312 starttime = ktime_get(); 293 starttime = ktime_get();
313 } 294 }
314 295
315 wait_event(async_done, lowest_in_progress(running) >= cookie); 296 wait_event(async_done, lowest_in_progress(domain) >= cookie);
316 297
317 if (initcall_debug && system_state == SYSTEM_BOOTING) { 298 if (initcall_debug && system_state == SYSTEM_BOOTING) {
318 endtime = ktime_get(); 299 endtime = ktime_get();
@@ -334,6 +315,18 @@ EXPORT_SYMBOL_GPL(async_synchronize_cookie_domain);
334 */ 315 */
335void async_synchronize_cookie(async_cookie_t cookie) 316void async_synchronize_cookie(async_cookie_t cookie)
336{ 317{
337 async_synchronize_cookie_domain(cookie, &async_running); 318 async_synchronize_cookie_domain(cookie, &async_dfl_domain);
338} 319}
339EXPORT_SYMBOL_GPL(async_synchronize_cookie); 320EXPORT_SYMBOL_GPL(async_synchronize_cookie);
321
322/**
323 * current_is_async - is %current an async worker task?
324 *
325 * Returns %true if %current is an async worker task.
326 */
327bool current_is_async(void)
328{
329 struct worker *worker = current_wq_worker();
330
331 return worker && worker->current_func == async_run_entry_fn;
332}
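
The async.c rework above keeps entries on per-domain pending lists until they complete, synchronizes against ASYNC_COOKIE_MAX rather than next_cookie, and adds current_is_async(). The public API is unchanged; a hedged usage sketch, assuming <linux/async.h>, with the probe function and domain name made up for illustration:

#include <linux/async.h>	/* async_schedule_domain(), ASYNC_DOMAIN(), ... */
#include <linux/printk.h>

/* Hypothetical private synchronization domain for one driver. */
static ASYNC_DOMAIN(example_domain);

static void example_probe_one(void *data, async_cookie_t cookie)
{
	pr_debug("async probe %lli for %p\n", (long long)cookie, data);
}

static void example_probe_all(void *devs[], int n)
{
	int i;

	for (i = 0; i < n; i++)
		async_schedule_domain(example_probe_one, devs[i],
				      &example_domain);

	/* Wait only for work queued in this domain, not globally. */
	async_synchronize_full_domain(&example_domain);
}
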
diff --git a/kernel/audit.c b/kernel/audit.c
index 40414e9143db..d596e5355f15 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -272,6 +272,8 @@ static int audit_log_config_change(char *function_name, int new, int old,
272 int rc = 0; 272 int rc = 0;
273 273
274 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); 274 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
275 if (unlikely(!ab))
276 return rc;
275 audit_log_format(ab, "%s=%d old=%d auid=%u ses=%u", function_name, new, 277 audit_log_format(ab, "%s=%d old=%d auid=%u ses=%u", function_name, new,
276 old, from_kuid(&init_user_ns, loginuid), sessionid); 278 old, from_kuid(&init_user_ns, loginuid), sessionid);
277 if (sid) { 279 if (sid) {
@@ -619,6 +621,8 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type,
619 } 621 }
620 622
621 *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); 623 *ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
624 if (unlikely(!*ab))
625 return rc;
622 audit_log_format(*ab, "pid=%d uid=%u auid=%u ses=%u", 626 audit_log_format(*ab, "pid=%d uid=%u auid=%u ses=%u",
623 task_tgid_vnr(current), 627 task_tgid_vnr(current),
624 from_kuid(&init_user_ns, current_uid()), 628 from_kuid(&init_user_ns, current_uid()),
@@ -1097,6 +1101,23 @@ static inline void audit_get_stamp(struct audit_context *ctx,
1097 } 1101 }
1098} 1102}
1099 1103
1104/*
1105 * Wait for auditd to drain the queue a little
1106 */
1107static void wait_for_auditd(unsigned long sleep_time)
1108{
1109 DECLARE_WAITQUEUE(wait, current);
1110 set_current_state(TASK_INTERRUPTIBLE);
1111 add_wait_queue(&audit_backlog_wait, &wait);
1112
1113 if (audit_backlog_limit &&
1114 skb_queue_len(&audit_skb_queue) > audit_backlog_limit)
1115 schedule_timeout(sleep_time);
1116
1117 __set_current_state(TASK_RUNNING);
1118 remove_wait_queue(&audit_backlog_wait, &wait);
1119}
1120
1100/* Obtain an audit buffer. This routine does locking to obtain the 1121/* Obtain an audit buffer. This routine does locking to obtain the
1101 * audit buffer, but then no locking is required for calls to 1122 * audit buffer, but then no locking is required for calls to
1102 * audit_log_*format. If the tsk is a task that is currently in a 1123 * audit_log_*format. If the tsk is a task that is currently in a
@@ -1142,20 +1163,13 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
1142 1163
1143 while (audit_backlog_limit 1164 while (audit_backlog_limit
1144 && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) { 1165 && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) {
1145 if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time 1166 if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time) {
1146 && time_before(jiffies, timeout_start + audit_backlog_wait_time)) { 1167 unsigned long sleep_time;
1147 1168
1148 /* Wait for auditd to drain the queue a little */ 1169 sleep_time = timeout_start + audit_backlog_wait_time -
1149 DECLARE_WAITQUEUE(wait, current); 1170 jiffies;
1150 set_current_state(TASK_INTERRUPTIBLE); 1171 if ((long)sleep_time > 0)
1151 add_wait_queue(&audit_backlog_wait, &wait); 1172 wait_for_auditd(sleep_time);
1152
1153 if (audit_backlog_limit &&
1154 skb_queue_len(&audit_skb_queue) > audit_backlog_limit)
1155 schedule_timeout(timeout_start + audit_backlog_wait_time - jiffies);
1156
1157 __set_current_state(TASK_RUNNING);
1158 remove_wait_queue(&audit_backlog_wait, &wait);
1159 continue; 1173 continue;
1160 } 1174 }
1161 if (audit_rate_check() && printk_ratelimit()) 1175 if (audit_rate_check() && printk_ratelimit())
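
Several hunks in this series add the same guard: audit_log_start() can return NULL (audit disabled, or no buffer available under backlog pressure), so callers must bail out before calling audit_log_format(). A minimal sketch of that pattern, assuming <linux/audit.h>; the helper and record contents are illustrative only:

#include <linux/audit.h>	/* audit_log_start(), audit_log_format(), audit_log_end() */
#include <linux/gfp.h>

/* Hypothetical helper emitting a CONFIG_CHANGE record for one integer knob. */
static void example_log_change(const char *name, int new, int old)
{
	struct audit_buffer *ab;

	ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
	if (unlikely(!ab))	/* nothing to log into; do not dereference */
		return;
	audit_log_format(ab, "%s=%d old=%d", name, new, old);
	audit_log_end(ab);
}
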
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index ed206fd88cca..642a89c4f3d6 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -249,7 +249,7 @@ static void untag_chunk(struct node *p)
249 list_del_rcu(&chunk->hash); 249 list_del_rcu(&chunk->hash);
250 spin_unlock(&hash_lock); 250 spin_unlock(&hash_lock);
251 spin_unlock(&entry->lock); 251 spin_unlock(&entry->lock);
252 fsnotify_destroy_mark(entry); 252 fsnotify_destroy_mark(entry, audit_tree_group);
253 goto out; 253 goto out;
254 } 254 }
255 255
@@ -291,7 +291,7 @@ static void untag_chunk(struct node *p)
291 owner->root = new; 291 owner->root = new;
292 spin_unlock(&hash_lock); 292 spin_unlock(&hash_lock);
293 spin_unlock(&entry->lock); 293 spin_unlock(&entry->lock);
294 fsnotify_destroy_mark(entry); 294 fsnotify_destroy_mark(entry, audit_tree_group);
295 fsnotify_put_mark(&new->mark); /* drop initial reference */ 295 fsnotify_put_mark(&new->mark); /* drop initial reference */
296 goto out; 296 goto out;
297 297
@@ -331,7 +331,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
331 spin_unlock(&hash_lock); 331 spin_unlock(&hash_lock);
332 chunk->dead = 1; 332 chunk->dead = 1;
333 spin_unlock(&entry->lock); 333 spin_unlock(&entry->lock);
334 fsnotify_destroy_mark(entry); 334 fsnotify_destroy_mark(entry, audit_tree_group);
335 fsnotify_put_mark(entry); 335 fsnotify_put_mark(entry);
336 return 0; 336 return 0;
337 } 337 }
@@ -412,7 +412,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
412 spin_unlock(&chunk_entry->lock); 412 spin_unlock(&chunk_entry->lock);
413 spin_unlock(&old_entry->lock); 413 spin_unlock(&old_entry->lock);
414 414
415 fsnotify_destroy_mark(chunk_entry); 415 fsnotify_destroy_mark(chunk_entry, audit_tree_group);
416 416
417 fsnotify_put_mark(chunk_entry); 417 fsnotify_put_mark(chunk_entry);
418 fsnotify_put_mark(old_entry); 418 fsnotify_put_mark(old_entry);
@@ -443,17 +443,32 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
443 spin_unlock(&hash_lock); 443 spin_unlock(&hash_lock);
444 spin_unlock(&chunk_entry->lock); 444 spin_unlock(&chunk_entry->lock);
445 spin_unlock(&old_entry->lock); 445 spin_unlock(&old_entry->lock);
446 fsnotify_destroy_mark(old_entry); 446 fsnotify_destroy_mark(old_entry, audit_tree_group);
447 fsnotify_put_mark(chunk_entry); /* drop initial reference */ 447 fsnotify_put_mark(chunk_entry); /* drop initial reference */
448 fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */ 448 fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */
449 return 0; 449 return 0;
450} 450}
451 451
452static void audit_log_remove_rule(struct audit_krule *rule)
453{
454 struct audit_buffer *ab;
455
456 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
457 if (unlikely(!ab))
458 return;
459 audit_log_format(ab, "op=");
460 audit_log_string(ab, "remove rule");
461 audit_log_format(ab, " dir=");
462 audit_log_untrustedstring(ab, rule->tree->pathname);
463 audit_log_key(ab, rule->filterkey);
464 audit_log_format(ab, " list=%d res=1", rule->listnr);
465 audit_log_end(ab);
466}
467
452static void kill_rules(struct audit_tree *tree) 468static void kill_rules(struct audit_tree *tree)
453{ 469{
454 struct audit_krule *rule, *next; 470 struct audit_krule *rule, *next;
455 struct audit_entry *entry; 471 struct audit_entry *entry;
456 struct audit_buffer *ab;
457 472
458 list_for_each_entry_safe(rule, next, &tree->rules, rlist) { 473 list_for_each_entry_safe(rule, next, &tree->rules, rlist) {
459 entry = container_of(rule, struct audit_entry, rule); 474 entry = container_of(rule, struct audit_entry, rule);
@@ -461,14 +476,7 @@ static void kill_rules(struct audit_tree *tree)
461 list_del_init(&rule->rlist); 476 list_del_init(&rule->rlist);
462 if (rule->tree) { 477 if (rule->tree) {
463 /* not a half-baked one */ 478 /* not a half-baked one */
464 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); 479 audit_log_remove_rule(rule);
465 audit_log_format(ab, "op=");
466 audit_log_string(ab, "remove rule");
467 audit_log_format(ab, " dir=");
468 audit_log_untrustedstring(ab, rule->tree->pathname);
469 audit_log_key(ab, rule->filterkey);
470 audit_log_format(ab, " list=%d res=1", rule->listnr);
471 audit_log_end(ab);
472 rule->tree = NULL; 480 rule->tree = NULL;
473 list_del_rcu(&entry->list); 481 list_del_rcu(&entry->list);
474 list_del(&entry->rule.list); 482 list_del(&entry->rule.list);
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 9a9ae6e3d290..22831c4d369c 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -240,6 +240,8 @@ static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watc
240 if (audit_enabled) { 240 if (audit_enabled) {
241 struct audit_buffer *ab; 241 struct audit_buffer *ab;
242 ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE); 242 ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE);
243 if (unlikely(!ab))
244 return;
243 audit_log_format(ab, "auid=%u ses=%u op=", 245 audit_log_format(ab, "auid=%u ses=%u op=",
244 from_kuid(&init_user_ns, audit_get_loginuid(current)), 246 from_kuid(&init_user_ns, audit_get_loginuid(current)),
245 audit_get_sessionid(current)); 247 audit_get_sessionid(current));
@@ -350,7 +352,7 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
350 } 352 }
351 mutex_unlock(&audit_filter_mutex); 353 mutex_unlock(&audit_filter_mutex);
352 354
353 fsnotify_destroy_mark(&parent->mark); 355 fsnotify_destroy_mark(&parent->mark, audit_watch_group);
354} 356}
355 357
356/* Get path information necessary for adding watches. */ 358/* Get path information necessary for adding watches. */
@@ -457,7 +459,7 @@ void audit_remove_watch_rule(struct audit_krule *krule)
457 459
458 if (list_empty(&parent->watches)) { 460 if (list_empty(&parent->watches)) {
459 audit_get_parent(parent); 461 audit_get_parent(parent);
460 fsnotify_destroy_mark(&parent->mark); 462 fsnotify_destroy_mark(&parent->mark, audit_watch_group);
461 audit_put_parent(parent); 463 audit_put_parent(parent);
462 } 464 }
463 } 465 }
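
The audit_tree.c and audit_watch.c hunks above track an fsnotify API change: fsnotify_destroy_mark() now takes the fsnotify_group the mark is attached to, instead of deriving it from the mark. A hedged caller sketch, assuming <linux/fsnotify_backend.h>; the mark and group parameters stand in for whatever references the caller already holds:

#include <linux/fsnotify_backend.h>	/* struct fsnotify_mark, fsnotify_destroy_mark() */

/* Hypothetical teardown: detach @mark from @group, then drop our reference. */
static void example_remove_mark(struct fsnotify_mark *mark,
				struct fsnotify_group *group)
{
	/* The group is passed explicitly rather than looked up via mark->group. */
	fsnotify_destroy_mark(mark, group);
	fsnotify_put_mark(mark);	/* pairs with the caller's reference */
}
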
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 7f19f23d38a3..f9fc54bbe06f 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1144,7 +1144,6 @@ static void audit_log_rule_change(kuid_t loginuid, u32 sessionid, u32 sid,
1144 * audit_receive_filter - apply all rules to the specified message type 1144 * audit_receive_filter - apply all rules to the specified message type
1145 * @type: audit message type 1145 * @type: audit message type
1146 * @pid: target pid for netlink audit messages 1146 * @pid: target pid for netlink audit messages
1147 * @uid: target uid for netlink audit messages
1148 * @seq: netlink audit message sequence (serial) number 1147 * @seq: netlink audit message sequence (serial) number
1149 * @data: payload data 1148 * @data: payload data
1150 * @datasz: size of payload data 1149 * @datasz: size of payload data
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 2f186ed80c40..a371f857a0a9 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -200,7 +200,6 @@ struct audit_context {
200 struct list_head names_list; /* anchor for struct audit_names->list */ 200 struct list_head names_list; /* anchor for struct audit_names->list */
201 char * filterkey; /* key for rule that triggered record */ 201 char * filterkey; /* key for rule that triggered record */
202 struct path pwd; 202 struct path pwd;
203 struct audit_context *previous; /* For nested syscalls */
204 struct audit_aux_data *aux; 203 struct audit_aux_data *aux;
205 struct audit_aux_data *aux_pids; 204 struct audit_aux_data *aux_pids;
206 struct sockaddr_storage *sockaddr; 205 struct sockaddr_storage *sockaddr;
@@ -1091,29 +1090,13 @@ int audit_alloc(struct task_struct *tsk)
1091 1090
1092static inline void audit_free_context(struct audit_context *context) 1091static inline void audit_free_context(struct audit_context *context)
1093{ 1092{
1094 struct audit_context *previous; 1093 audit_free_names(context);
1095 int count = 0; 1094 unroll_tree_refs(context, NULL, 0);
1096 1095 free_tree_refs(context);
1097 do { 1096 audit_free_aux(context);
1098 previous = context->previous; 1097 kfree(context->filterkey);
1099 if (previous || (count && count < 10)) { 1098 kfree(context->sockaddr);
1100 ++count; 1099 kfree(context);
1101 printk(KERN_ERR "audit(:%d): major=%d name_count=%d:"
1102 " freeing multiple contexts (%d)\n",
1103 context->serial, context->major,
1104 context->name_count, count);
1105 }
1106 audit_free_names(context);
1107 unroll_tree_refs(context, NULL, 0);
1108 free_tree_refs(context);
1109 audit_free_aux(context);
1110 kfree(context->filterkey);
1111 kfree(context->sockaddr);
1112 kfree(context);
1113 context = previous;
1114 } while (context);
1115 if (count >= 10)
1116 printk(KERN_ERR "audit: freed %d contexts\n", count);
1117} 1100}
1118 1101
1119void audit_log_task_context(struct audit_buffer *ab) 1102void audit_log_task_context(struct audit_buffer *ab)
@@ -1159,7 +1142,7 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
1159 cred = current_cred(); 1142 cred = current_cred();
1160 1143
1161 spin_lock_irq(&tsk->sighand->siglock); 1144 spin_lock_irq(&tsk->sighand->siglock);
1162 if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name) 1145 if (tsk->signal && tsk->signal->tty)
1163 tty = tsk->signal->tty->name; 1146 tty = tsk->signal->tty->name;
1164 else 1147 else
1165 tty = "(none)"; 1148 tty = "(none)";
@@ -1481,14 +1464,14 @@ static void show_special(struct audit_context *context, int *call_panic)
1481 audit_log_end(ab); 1464 audit_log_end(ab);
1482 ab = audit_log_start(context, GFP_KERNEL, 1465 ab = audit_log_start(context, GFP_KERNEL,
1483 AUDIT_IPC_SET_PERM); 1466 AUDIT_IPC_SET_PERM);
1467 if (unlikely(!ab))
1468 return;
1484 audit_log_format(ab, 1469 audit_log_format(ab,
1485 "qbytes=%lx ouid=%u ogid=%u mode=%#ho", 1470 "qbytes=%lx ouid=%u ogid=%u mode=%#ho",
1486 context->ipc.qbytes, 1471 context->ipc.qbytes,
1487 context->ipc.perm_uid, 1472 context->ipc.perm_uid,
1488 context->ipc.perm_gid, 1473 context->ipc.perm_gid,
1489 context->ipc.perm_mode); 1474 context->ipc.perm_mode);
1490 if (!ab)
1491 return;
1492 } 1475 }
1493 break; } 1476 break; }
1494 case AUDIT_MQ_OPEN: { 1477 case AUDIT_MQ_OPEN: {
@@ -1783,42 +1766,6 @@ void __audit_syscall_entry(int arch, int major,
1783 if (!context) 1766 if (!context)
1784 return; 1767 return;
1785 1768
1786 /*
1787 * This happens only on certain architectures that make system
1788 * calls in kernel_thread via the entry.S interface, instead of
1789 * with direct calls. (If you are porting to a new
1790 * architecture, hitting this condition can indicate that you
1791 * got the _exit/_leave calls backward in entry.S.)
1792 *
1793 * i386 no
1794 * x86_64 no
1795 * ppc64 yes (see arch/powerpc/platforms/iseries/misc.S)
1796 *
1797 * This also happens with vm86 emulation in a non-nested manner
1798 * (entries without exits), so this case must be caught.
1799 */
1800 if (context->in_syscall) {
1801 struct audit_context *newctx;
1802
1803#if AUDIT_DEBUG
1804 printk(KERN_ERR
1805 "audit(:%d) pid=%d in syscall=%d;"
1806 " entering syscall=%d\n",
1807 context->serial, tsk->pid, context->major, major);
1808#endif
1809 newctx = audit_alloc_context(context->state);
1810 if (newctx) {
1811 newctx->previous = context;
1812 context = newctx;
1813 tsk->audit_context = newctx;
1814 } else {
1815 /* If we can't alloc a new context, the best we
1816 * can do is to leak memory (any pending putname
1817 * will be lost). The only other alternative is
1818 * to abandon auditing. */
1819 audit_zero_context(context, context->state);
1820 }
1821 }
1822 BUG_ON(context->in_syscall || context->name_count); 1769 BUG_ON(context->in_syscall || context->name_count);
1823 1770
1824 if (!audit_enabled) 1771 if (!audit_enabled)
@@ -1881,28 +1828,21 @@ void __audit_syscall_exit(int success, long return_code)
1881 if (!list_empty(&context->killed_trees)) 1828 if (!list_empty(&context->killed_trees))
1882 audit_kill_trees(&context->killed_trees); 1829 audit_kill_trees(&context->killed_trees);
1883 1830
1884 if (context->previous) { 1831 audit_free_names(context);
1885 struct audit_context *new_context = context->previous; 1832 unroll_tree_refs(context, NULL, 0);
1886 context->previous = NULL; 1833 audit_free_aux(context);
1887 audit_free_context(context); 1834 context->aux = NULL;
1888 tsk->audit_context = new_context; 1835 context->aux_pids = NULL;
1889 } else { 1836 context->target_pid = 0;
1890 audit_free_names(context); 1837 context->target_sid = 0;
1891 unroll_tree_refs(context, NULL, 0); 1838 context->sockaddr_len = 0;
1892 audit_free_aux(context); 1839 context->type = 0;
1893 context->aux = NULL; 1840 context->fds[0] = -1;
1894 context->aux_pids = NULL; 1841 if (context->state != AUDIT_RECORD_CONTEXT) {
1895 context->target_pid = 0; 1842 kfree(context->filterkey);
1896 context->target_sid = 0; 1843 context->filterkey = NULL;
1897 context->sockaddr_len = 0;
1898 context->type = 0;
1899 context->fds[0] = -1;
1900 if (context->state != AUDIT_RECORD_CONTEXT) {
1901 kfree(context->filterkey);
1902 context->filterkey = NULL;
1903 }
1904 tsk->audit_context = context;
1905 } 1844 }
1845 tsk->audit_context = context;
1906} 1846}
1907 1847
1908static inline void handle_one(const struct inode *inode) 1848static inline void handle_one(const struct inode *inode)
@@ -2735,7 +2675,7 @@ void __audit_mmap_fd(int fd, int flags)
2735 context->type = AUDIT_MMAP; 2675 context->type = AUDIT_MMAP;
2736} 2676}
2737 2677
2738static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr) 2678static void audit_log_task(struct audit_buffer *ab)
2739{ 2679{
2740 kuid_t auid, uid; 2680 kuid_t auid, uid;
2741 kgid_t gid; 2681 kgid_t gid;
@@ -2753,6 +2693,11 @@ static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr)
2753 audit_log_task_context(ab); 2693 audit_log_task_context(ab);
2754 audit_log_format(ab, " pid=%d comm=", current->pid); 2694 audit_log_format(ab, " pid=%d comm=", current->pid);
2755 audit_log_untrustedstring(ab, current->comm); 2695 audit_log_untrustedstring(ab, current->comm);
2696}
2697
2698static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr)
2699{
2700 audit_log_task(ab);
2756 audit_log_format(ab, " reason="); 2701 audit_log_format(ab, " reason=");
2757 audit_log_string(ab, reason); 2702 audit_log_string(ab, reason);
2758 audit_log_format(ab, " sig=%ld", signr); 2703 audit_log_format(ab, " sig=%ld", signr);
@@ -2775,6 +2720,8 @@ void audit_core_dumps(long signr)
2775 return; 2720 return;
2776 2721
2777 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); 2722 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND);
2723 if (unlikely(!ab))
2724 return;
2778 audit_log_abend(ab, "memory violation", signr); 2725 audit_log_abend(ab, "memory violation", signr);
2779 audit_log_end(ab); 2726 audit_log_end(ab);
2780} 2727}
@@ -2783,8 +2730,11 @@ void __audit_seccomp(unsigned long syscall, long signr, int code)
2783{ 2730{
2784 struct audit_buffer *ab; 2731 struct audit_buffer *ab;
2785 2732
2786 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); 2733 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_SECCOMP);
2787 audit_log_abend(ab, "seccomp", signr); 2734 if (unlikely(!ab))
2735 return;
2736 audit_log_task(ab);
2737 audit_log_format(ab, " sig=%ld", signr);
2788 audit_log_format(ab, " syscall=%ld", syscall); 2738 audit_log_format(ab, " syscall=%ld", syscall);
2789 audit_log_format(ab, " compat=%d", is_compat_task()); 2739 audit_log_format(ab, " compat=%d", is_compat_task());
2790 audit_log_format(ab, " ip=0x%lx", KSTK_EIP(current)); 2740 audit_log_format(ab, " ip=0x%lx", KSTK_EIP(current));
diff --git a/kernel/capability.c b/kernel/capability.c
index 493d97259484..f6c2ce5701e1 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -393,6 +393,30 @@ bool ns_capable(struct user_namespace *ns, int cap)
393EXPORT_SYMBOL(ns_capable); 393EXPORT_SYMBOL(ns_capable);
394 394
395/** 395/**
396 * file_ns_capable - Determine if the file's opener had a capability in effect
397 * @file: The file we want to check
398 * @ns: The usernamespace we want the capability in
399 * @cap: The capability to be tested for
400 *
401 * Return true if task that opened the file had a capability in effect
402 * when the file was opened.
403 *
404 * This does not set PF_SUPERPRIV because the caller may not
405 * actually be privileged.
406 */
407bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap)
408{
409 if (WARN_ON_ONCE(!cap_valid(cap)))
410 return false;
411
412 if (security_capable(file->f_cred, ns, cap) == 0)
413 return true;
414
415 return false;
416}
417EXPORT_SYMBOL(file_ns_capable);
418
419/**
396 * capable - Determine if the current task has a superior capability in effect 420 * capable - Determine if the current task has a superior capability in effect
397 * @cap: The capability to be tested for 421 * @cap: The capability to be tested for
398 * 422 *
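
file_ns_capable(), added above, tests the credentials the file was opened with (file->f_cred) rather than the caller's current credentials. A hedged sketch of a caller, assuming <linux/capability.h>; the policy shown (CAP_SYS_ADMIN in the initial user namespace) is illustrative, not taken from this series:

#include <linux/capability.h>	/* file_ns_capable(), CAP_SYS_ADMIN */
#include <linux/fs.h>		/* struct file */
#include <linux/user_namespace.h>	/* init_user_ns */

/* Hypothetical check: was the opener of @file privileged in &init_user_ns?
 * Useful when the fd may have been passed to a less privileged task, so
 * checking current's credentials at use time would be the wrong test. */
static bool example_opener_was_admin(const struct file *file)
{
	return file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN);
}
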
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f24f724620dd..a32f9432666c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -52,7 +52,7 @@
52#include <linux/module.h> 52#include <linux/module.h>
53#include <linux/delayacct.h> 53#include <linux/delayacct.h>
54#include <linux/cgroupstats.h> 54#include <linux/cgroupstats.h>
55#include <linux/hash.h> 55#include <linux/hashtable.h>
56#include <linux/namei.h> 56#include <linux/namei.h>
57#include <linux/pid_namespace.h> 57#include <linux/pid_namespace.h>
58#include <linux/idr.h> 58#include <linux/idr.h>
@@ -138,6 +138,9 @@ struct cgroupfs_root {
138 /* Hierarchy-specific flags */ 138 /* Hierarchy-specific flags */
139 unsigned long flags; 139 unsigned long flags;
140 140
141 /* IDs for cgroups in this hierarchy */
142 struct ida cgroup_ida;
143
141 /* The path to use for release notifications. */ 144 /* The path to use for release notifications. */
142 char release_agent_path[PATH_MAX]; 145 char release_agent_path[PATH_MAX];
143 146
@@ -171,8 +174,8 @@ struct css_id {
171 * The css to which this ID points. This pointer is set to valid value 174 * The css to which this ID points. This pointer is set to valid value
172 * after cgroup is populated. If cgroup is removed, this will be NULL. 175 * after cgroup is populated. If cgroup is removed, this will be NULL.
173 * This pointer is expected to be RCU-safe because destroy() 176 * This pointer is expected to be RCU-safe because destroy()
174 * is called after synchronize_rcu(). But for safe use, css_is_removed() 177 * is called after synchronize_rcu(). But for safe use, css_tryget()
175 * css_tryget() should be used for avoiding race. 178 * should be used for avoiding race.
176 */ 179 */
177 struct cgroup_subsys_state __rcu *css; 180 struct cgroup_subsys_state __rcu *css;
178 /* 181 /*
@@ -242,6 +245,10 @@ static DEFINE_SPINLOCK(hierarchy_id_lock);
242 */ 245 */
243static int need_forkexit_callback __read_mostly; 246static int need_forkexit_callback __read_mostly;
244 247
248static int cgroup_destroy_locked(struct cgroup *cgrp);
249static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
250 struct cftype cfts[], bool is_add);
251
245#ifdef CONFIG_PROVE_LOCKING 252#ifdef CONFIG_PROVE_LOCKING
246int cgroup_lock_is_held(void) 253int cgroup_lock_is_held(void)
247{ 254{
@@ -294,11 +301,6 @@ static int notify_on_release(const struct cgroup *cgrp)
294 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 301 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
295} 302}
296 303
297static int clone_children(const struct cgroup *cgrp)
298{
299 return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
300}
301
302/* 304/*
303 * for_each_subsys() allows you to iterate on each subsystem attached to 305 * for_each_subsys() allows you to iterate on each subsystem attached to
304 * an active hierarchy 306 * an active hierarchy
@@ -374,22 +376,18 @@ static int css_set_count;
374 * account cgroups in empty hierarchies. 376 * account cgroups in empty hierarchies.
375 */ 377 */
376#define CSS_SET_HASH_BITS 7 378#define CSS_SET_HASH_BITS 7
377#define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS) 379static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
378static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE];
379 380
380static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[]) 381static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
381{ 382{
382 int i; 383 int i;
383 int index; 384 unsigned long key = 0UL;
384 unsigned long tmp = 0UL;
385 385
386 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) 386 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
387 tmp += (unsigned long)css[i]; 387 key += (unsigned long)css[i];
388 tmp = (tmp >> 16) ^ tmp; 388 key = (key >> 16) ^ key;
389 389
390 index = hash_long(tmp, CSS_SET_HASH_BITS); 390 return key;
391
392 return &css_set_table[index];
393} 391}
394 392
395/* We don't maintain the lists running through each css_set to its 393/* We don't maintain the lists running through each css_set to its
@@ -416,7 +414,7 @@ static void __put_css_set(struct css_set *cg, int taskexit)
416 } 414 }
417 415
418 /* This css_set is dead. unlink it and release cgroup refcounts */ 416 /* This css_set is dead. unlink it and release cgroup refcounts */
419 hlist_del(&cg->hlist); 417 hash_del(&cg->hlist);
420 css_set_count--; 418 css_set_count--;
421 419
422 list_for_each_entry_safe(link, saved_link, &cg->cg_links, 420 list_for_each_entry_safe(link, saved_link, &cg->cg_links,
@@ -424,12 +422,20 @@ static void __put_css_set(struct css_set *cg, int taskexit)
424 struct cgroup *cgrp = link->cgrp; 422 struct cgroup *cgrp = link->cgrp;
425 list_del(&link->cg_link_list); 423 list_del(&link->cg_link_list);
426 list_del(&link->cgrp_link_list); 424 list_del(&link->cgrp_link_list);
425
426 /*
427 * We may not be holding cgroup_mutex, and if cgrp->count is
428 * dropped to 0 the cgroup can be destroyed at any time, hence
429 * rcu_read_lock is used to keep it alive.
430 */
431 rcu_read_lock();
427 if (atomic_dec_and_test(&cgrp->count) && 432 if (atomic_dec_and_test(&cgrp->count) &&
428 notify_on_release(cgrp)) { 433 notify_on_release(cgrp)) {
429 if (taskexit) 434 if (taskexit)
430 set_bit(CGRP_RELEASABLE, &cgrp->flags); 435 set_bit(CGRP_RELEASABLE, &cgrp->flags);
431 check_for_release(cgrp); 436 check_for_release(cgrp);
432 } 437 }
438 rcu_read_unlock();
433 439
434 kfree(link); 440 kfree(link);
435 } 441 }
@@ -548,9 +554,8 @@ static struct css_set *find_existing_css_set(
548{ 554{
549 int i; 555 int i;
550 struct cgroupfs_root *root = cgrp->root; 556 struct cgroupfs_root *root = cgrp->root;
551 struct hlist_head *hhead;
552 struct hlist_node *node;
553 struct css_set *cg; 557 struct css_set *cg;
558 unsigned long key;
554 559
555 /* 560 /*
556 * Build the set of subsystem state objects that we want to see in the 561 * Build the set of subsystem state objects that we want to see in the
@@ -570,8 +575,8 @@ static struct css_set *find_existing_css_set(
570 } 575 }
571 } 576 }
572 577
573 hhead = css_set_hash(template); 578 key = css_set_hash(template);
574 hlist_for_each_entry(cg, node, hhead, hlist) { 579 hash_for_each_possible(css_set_table, cg, hlist, key) {
575 if (!compare_css_sets(cg, oldcg, cgrp, template)) 580 if (!compare_css_sets(cg, oldcg, cgrp, template))
576 continue; 581 continue;
577 582
@@ -655,8 +660,8 @@ static struct css_set *find_css_set(
655 660
656 struct list_head tmp_cg_links; 661 struct list_head tmp_cg_links;
657 662
658 struct hlist_head *hhead;
659 struct cg_cgroup_link *link; 663 struct cg_cgroup_link *link;
664 unsigned long key;
660 665
661 /* First see if we already have a cgroup group that matches 666 /* First see if we already have a cgroup group that matches
662 * the desired set */ 667 * the desired set */
@@ -702,8 +707,8 @@ static struct css_set *find_css_set(
702 css_set_count++; 707 css_set_count++;
703 708
704 /* Add this cgroup group to the hash table */ 709 /* Add this cgroup group to the hash table */
705 hhead = css_set_hash(res->subsys); 710 key = css_set_hash(res->subsys);
706 hlist_add_head(&res->hlist, hhead); 711 hash_add(css_set_table, &res->hlist, key);
707 712
708 write_unlock(&css_set_lock); 713 write_unlock(&css_set_lock);
709 714
@@ -782,12 +787,12 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
782 * The task_lock() exception 787 * The task_lock() exception
783 * 788 *
784 * The need for this exception arises from the action of 789 * The need for this exception arises from the action of
785 * cgroup_attach_task(), which overwrites one tasks cgroup pointer with 790 * cgroup_attach_task(), which overwrites one task's cgroup pointer with
786 * another. It does so using cgroup_mutex, however there are 791 * another. It does so using cgroup_mutex, however there are
787 * several performance critical places that need to reference 792 * several performance critical places that need to reference
788 * task->cgroup without the expense of grabbing a system global 793 * task->cgroup without the expense of grabbing a system global
789 * mutex. Therefore except as noted below, when dereferencing or, as 794 * mutex. Therefore except as noted below, when dereferencing or, as
790 * in cgroup_attach_task(), modifying a task'ss cgroup pointer we use 795 * in cgroup_attach_task(), modifying a task's cgroup pointer we use
791 * task_lock(), which acts on a spinlock (task->alloc_lock) already in 796 * task_lock(), which acts on a spinlock (task->alloc_lock) already in
792 * the task_struct routinely used for such matters. 797 * the task_struct routinely used for such matters.
793 * 798 *
@@ -854,28 +859,44 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
854 return inode; 859 return inode;
855} 860}
856 861
857/* 862static void cgroup_free_fn(struct work_struct *work)
858 * Call subsys's pre_destroy handler.
859 * This is called before css refcnt check.
860 */
861static int cgroup_call_pre_destroy(struct cgroup *cgrp)
862{ 863{
864 struct cgroup *cgrp = container_of(work, struct cgroup, free_work);
863 struct cgroup_subsys *ss; 865 struct cgroup_subsys *ss;
864 int ret = 0;
865 866
866 for_each_subsys(cgrp->root, ss) { 867 mutex_lock(&cgroup_mutex);
867 if (!ss->pre_destroy) 868 /*
868 continue; 869 * Release the subsystem state objects.
870 */
871 for_each_subsys(cgrp->root, ss)
872 ss->css_free(cgrp);
869 873
870 ret = ss->pre_destroy(cgrp); 874 cgrp->root->number_of_cgroups--;
871 if (ret) { 875 mutex_unlock(&cgroup_mutex);
872 /* ->pre_destroy() failure is being deprecated */
873 WARN_ON_ONCE(!ss->__DEPRECATED_clear_css_refs);
874 break;
875 }
876 }
877 876
878 return ret; 877 /*
878 * Drop the active superblock reference that we took when we
879 * created the cgroup
880 */
881 deactivate_super(cgrp->root->sb);
882
883 /*
884 * if we're getting rid of the cgroup, refcount should ensure
885 * that there are no pidlists left.
886 */
887 BUG_ON(!list_empty(&cgrp->pidlists));
888
889 simple_xattrs_free(&cgrp->xattrs);
890
891 ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
892 kfree(cgrp);
893}
894
895static void cgroup_free_rcu(struct rcu_head *head)
896{
897 struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);
898
899 schedule_work(&cgrp->free_work);
879} 900}
880 901
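
[Editor's note] cgroup_free_fn() and cgroup_free_rcu() split teardown into two deferred stages: call_rcu() waits out readers that may still dereference the cgroup, and the RCU callback merely bounces to a workqueue because the real cleanup (taking cgroup_mutex, deactivate_super()) needs process context. A stripped-down sketch of the same two-stage pattern with a made-up struct obj, assuming the work item was initialized at allocation time just as init_cgroup_housekeeping() does for cgroups:

	#include <linux/rcupdate.h>
	#include <linux/workqueue.h>
	#include <linux/slab.h>

	struct obj {
		struct rcu_head rcu_head;
		struct work_struct free_work;	/* INIT_WORK(..., obj_free_fn) at alloc time */
	};

	static void obj_free_fn(struct work_struct *work)
	{
		struct obj *o = container_of(work, struct obj, free_work);

		/* process context: sleeping, mutexes, dput() and friends are fine here */
		kfree(o);
	}

	static void obj_free_rcu(struct rcu_head *head)
	{
		struct obj *o = container_of(head, struct obj, rcu_head);

		/* RCU callbacks run in softirq context, so defer the heavy lifting */
		schedule_work(&o->free_work);
	}

	static void obj_release(struct obj *o)
	{
		/* a grace period first, then the sleepable teardown */
		call_rcu(&o->rcu_head, obj_free_rcu);
	}
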
881static void cgroup_diput(struct dentry *dentry, struct inode *inode) 902static void cgroup_diput(struct dentry *dentry, struct inode *inode)
@@ -883,41 +904,9 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
883 /* is dentry a directory ? if so, kfree() associated cgroup */ 904 /* is dentry a directory ? if so, kfree() associated cgroup */
884 if (S_ISDIR(inode->i_mode)) { 905 if (S_ISDIR(inode->i_mode)) {
885 struct cgroup *cgrp = dentry->d_fsdata; 906 struct cgroup *cgrp = dentry->d_fsdata;
886 struct cgroup_subsys *ss;
887 BUG_ON(!(cgroup_is_removed(cgrp)));
888 /* It's possible for external users to be holding css
889 * reference counts on a cgroup; css_put() needs to
890 * be able to access the cgroup after decrementing
891 * the reference count in order to know if it needs to
892 * queue the cgroup to be handled by the release
893 * agent */
894 synchronize_rcu();
895
896 mutex_lock(&cgroup_mutex);
897 /*
898 * Release the subsystem state objects.
899 */
900 for_each_subsys(cgrp->root, ss)
901 ss->destroy(cgrp);
902
903 cgrp->root->number_of_cgroups--;
904 mutex_unlock(&cgroup_mutex);
905
906 /*
907 * Drop the active superblock reference that we took when we
908 * created the cgroup
909 */
910 deactivate_super(cgrp->root->sb);
911 907
912 /* 908 BUG_ON(!(cgroup_is_removed(cgrp)));
913 * if we're getting rid of the cgroup, refcount should ensure 909 call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
914 * that there are no pidlists left.
915 */
916 BUG_ON(!list_empty(&cgrp->pidlists));
917
918 simple_xattrs_free(&cgrp->xattrs);
919
920 kfree_rcu(cgrp, rcu_head);
921 } else { 910 } else {
922 struct cfent *cfe = __d_cfe(dentry); 911 struct cfent *cfe = __d_cfe(dentry);
923 struct cgroup *cgrp = dentry->d_parent->d_fsdata; 912 struct cgroup *cgrp = dentry->d_parent->d_fsdata;
@@ -946,13 +935,17 @@ static void remove_dir(struct dentry *d)
946 dput(parent); 935 dput(parent);
947} 936}
948 937
949static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) 938static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
950{ 939{
951 struct cfent *cfe; 940 struct cfent *cfe;
952 941
953 lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); 942 lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
954 lockdep_assert_held(&cgroup_mutex); 943 lockdep_assert_held(&cgroup_mutex);
955 944
945 /*
946 * If we're doing cleanup due to failure of cgroup_create(),
947 * the corresponding @cfe may not exist.
948 */
956 list_for_each_entry(cfe, &cgrp->files, node) { 949 list_for_each_entry(cfe, &cgrp->files, node) {
957 struct dentry *d = cfe->dentry; 950 struct dentry *d = cfe->dentry;
958 951
@@ -965,9 +958,8 @@ static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
965 list_del_init(&cfe->node); 958 list_del_init(&cfe->node);
966 dput(d); 959 dput(d);
967 960
968 return 0; 961 break;
969 } 962 }
970 return -ENOENT;
971} 963}
972 964
973/** 965/**
@@ -987,7 +979,7 @@ static void cgroup_clear_directory(struct dentry *dir, bool base_files,
987 if (!test_bit(ss->subsys_id, &subsys_mask)) 979 if (!test_bit(ss->subsys_id, &subsys_mask))
988 continue; 980 continue;
989 list_for_each_entry(set, &ss->cftsets, node) 981 list_for_each_entry(set, &ss->cftsets, node)
990 cgroup_rm_file(cgrp, set->cfts); 982 cgroup_addrm_files(cgrp, NULL, set->cfts, false);
991 } 983 }
992 if (base_files) { 984 if (base_files) {
993 while (!list_empty(&cgrp->files)) 985 while (!list_empty(&cgrp->files))
@@ -1015,33 +1007,6 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
1015} 1007}
1016 1008
1017/* 1009/*
1018 * A queue for waiters to do rmdir() cgroup. A tasks will sleep when
1019 * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some
1020 * reference to css->refcnt. In general, this refcnt is expected to goes down
1021 * to zero, soon.
1022 *
1023 * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
1024 */
1025static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
1026
1027static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
1028{
1029 if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
1030 wake_up_all(&cgroup_rmdir_waitq);
1031}
1032
1033void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
1034{
1035 css_get(css);
1036}
1037
1038void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
1039{
1040 cgroup_wakeup_rmdir_waiter(css->cgroup);
1041 css_put(css);
1042}
1043
1044/*
1045 * Call with cgroup_mutex held. Drops reference counts on modules, including 1010 * Call with cgroup_mutex held. Drops reference counts on modules, including
1046 * any duplicate ones that parse_cgroupfs_options took. If this function 1011 * any duplicate ones that parse_cgroupfs_options took. If this function
1047 * returns an error, no reference counts are touched. 1012 * returns an error, no reference counts are touched.
@@ -1131,7 +1096,6 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1131 } 1096 }
1132 } 1097 }
1133 root->subsys_mask = root->actual_subsys_mask = final_subsys_mask; 1098 root->subsys_mask = root->actual_subsys_mask = final_subsys_mask;
1134 synchronize_rcu();
1135 1099
1136 return 0; 1100 return 0;
1137} 1101}
@@ -1150,7 +1114,7 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
1150 seq_puts(seq, ",xattr"); 1114 seq_puts(seq, ",xattr");
1151 if (strlen(root->release_agent_path)) 1115 if (strlen(root->release_agent_path))
1152 seq_printf(seq, ",release_agent=%s", root->release_agent_path); 1116 seq_printf(seq, ",release_agent=%s", root->release_agent_path);
1153 if (clone_children(&root->top_cgroup)) 1117 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags))
1154 seq_puts(seq, ",clone_children"); 1118 seq_puts(seq, ",clone_children");
1155 if (strlen(root->name)) 1119 if (strlen(root->name))
1156 seq_printf(seq, ",name=%s", root->name); 1120 seq_printf(seq, ",name=%s", root->name);
@@ -1162,7 +1126,7 @@ struct cgroup_sb_opts {
1162 unsigned long subsys_mask; 1126 unsigned long subsys_mask;
1163 unsigned long flags; 1127 unsigned long flags;
1164 char *release_agent; 1128 char *release_agent;
1165 bool clone_children; 1129 bool cpuset_clone_children;
1166 char *name; 1130 char *name;
1167 /* User explicitly requested empty subsystem */ 1131 /* User explicitly requested empty subsystem */
1168 bool none; 1132 bool none;
@@ -1213,7 +1177,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1213 continue; 1177 continue;
1214 } 1178 }
1215 if (!strcmp(token, "clone_children")) { 1179 if (!strcmp(token, "clone_children")) {
1216 opts->clone_children = true; 1180 opts->cpuset_clone_children = true;
1217 continue; 1181 continue;
1218 } 1182 }
1219 if (!strcmp(token, "xattr")) { 1183 if (!strcmp(token, "xattr")) {
@@ -1381,7 +1345,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1381 if (ret) 1345 if (ret)
1382 goto out_unlock; 1346 goto out_unlock;
1383 1347
1384 /* See feature-removal-schedule.txt */
1385 if (opts.subsys_mask != root->actual_subsys_mask || opts.release_agent) 1348 if (opts.subsys_mask != root->actual_subsys_mask || opts.release_agent)
1386 pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", 1349 pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",
1387 task_tgid_nr(current), current->comm); 1350 task_tgid_nr(current), current->comm);
@@ -1397,14 +1360,21 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1397 goto out_unlock; 1360 goto out_unlock;
1398 } 1361 }
1399 1362
1363 /*
1364 * Clear out the files of subsystems that should be removed, do
1365 * this before rebind_subsystems, since rebind_subsystems may
1366 * change this hierarchy's subsys_list.
1367 */
1368 cgroup_clear_directory(cgrp->dentry, false, removed_mask);
1369
1400 ret = rebind_subsystems(root, opts.subsys_mask); 1370 ret = rebind_subsystems(root, opts.subsys_mask);
1401 if (ret) { 1371 if (ret) {
1372 /* rebind_subsystems failed, re-populate the removed files */
1373 cgroup_populate_dir(cgrp, false, removed_mask);
1402 drop_parsed_module_refcounts(opts.subsys_mask); 1374 drop_parsed_module_refcounts(opts.subsys_mask);
1403 goto out_unlock; 1375 goto out_unlock;
1404 } 1376 }
1405 1377
1406 /* clear out any existing files and repopulate subsystem files */
1407 cgroup_clear_directory(cgrp->dentry, false, removed_mask);
1408 /* re-populate subsystem files */ 1378 /* re-populate subsystem files */
1409 cgroup_populate_dir(cgrp, false, added_mask); 1379 cgroup_populate_dir(cgrp, false, added_mask);
1410 1380
@@ -1432,8 +1402,10 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1432 INIT_LIST_HEAD(&cgrp->children); 1402 INIT_LIST_HEAD(&cgrp->children);
1433 INIT_LIST_HEAD(&cgrp->files); 1403 INIT_LIST_HEAD(&cgrp->files);
1434 INIT_LIST_HEAD(&cgrp->css_sets); 1404 INIT_LIST_HEAD(&cgrp->css_sets);
1405 INIT_LIST_HEAD(&cgrp->allcg_node);
1435 INIT_LIST_HEAD(&cgrp->release_list); 1406 INIT_LIST_HEAD(&cgrp->release_list);
1436 INIT_LIST_HEAD(&cgrp->pidlists); 1407 INIT_LIST_HEAD(&cgrp->pidlists);
1408 INIT_WORK(&cgrp->free_work, cgroup_free_fn);
1437 mutex_init(&cgrp->pidlist_mutex); 1409 mutex_init(&cgrp->pidlist_mutex);
1438 INIT_LIST_HEAD(&cgrp->event_list); 1410 INIT_LIST_HEAD(&cgrp->event_list);
1439 spin_lock_init(&cgrp->event_list_lock); 1411 spin_lock_init(&cgrp->event_list_lock);
@@ -1450,8 +1422,8 @@ static void init_cgroup_root(struct cgroupfs_root *root)
1450 root->number_of_cgroups = 1; 1422 root->number_of_cgroups = 1;
1451 cgrp->root = root; 1423 cgrp->root = root;
1452 cgrp->top_cgroup = cgrp; 1424 cgrp->top_cgroup = cgrp;
1453 list_add_tail(&cgrp->allcg_node, &root->allcg_list);
1454 init_cgroup_housekeeping(cgrp); 1425 init_cgroup_housekeeping(cgrp);
1426 list_add_tail(&cgrp->allcg_node, &root->allcg_list);
1455} 1427}
1456 1428
1457static bool init_root_id(struct cgroupfs_root *root) 1429static bool init_root_id(struct cgroupfs_root *root)
@@ -1518,12 +1490,13 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1518 1490
1519 root->subsys_mask = opts->subsys_mask; 1491 root->subsys_mask = opts->subsys_mask;
1520 root->flags = opts->flags; 1492 root->flags = opts->flags;
1493 ida_init(&root->cgroup_ida);
1521 if (opts->release_agent) 1494 if (opts->release_agent)
1522 strcpy(root->release_agent_path, opts->release_agent); 1495 strcpy(root->release_agent_path, opts->release_agent);
1523 if (opts->name) 1496 if (opts->name)
1524 strcpy(root->name, opts->name); 1497 strcpy(root->name, opts->name);
1525 if (opts->clone_children) 1498 if (opts->cpuset_clone_children)
1526 set_bit(CGRP_CLONE_CHILDREN, &root->top_cgroup.flags); 1499 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags);
1527 return root; 1500 return root;
1528} 1501}
1529 1502
@@ -1536,6 +1509,7 @@ static void cgroup_drop_root(struct cgroupfs_root *root)
1536 spin_lock(&hierarchy_id_lock); 1509 spin_lock(&hierarchy_id_lock);
1537 ida_remove(&hierarchy_ida, root->hierarchy_id); 1510 ida_remove(&hierarchy_ida, root->hierarchy_id);
1538 spin_unlock(&hierarchy_id_lock); 1511 spin_unlock(&hierarchy_id_lock);
1512 ida_destroy(&root->cgroup_ida);
1539 kfree(root); 1513 kfree(root);
1540} 1514}
1541 1515
@@ -1636,6 +1610,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1636 struct cgroupfs_root *existing_root; 1610 struct cgroupfs_root *existing_root;
1637 const struct cred *cred; 1611 const struct cred *cred;
1638 int i; 1612 int i;
1613 struct css_set *cg;
1639 1614
1640 BUG_ON(sb->s_root != NULL); 1615 BUG_ON(sb->s_root != NULL);
1641 1616
@@ -1689,19 +1664,12 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1689 /* Link the top cgroup in this hierarchy into all 1664 /* Link the top cgroup in this hierarchy into all
1690 * the css_set objects */ 1665 * the css_set objects */
1691 write_lock(&css_set_lock); 1666 write_lock(&css_set_lock);
1692 for (i = 0; i < CSS_SET_TABLE_SIZE; i++) { 1667 hash_for_each(css_set_table, i, cg, hlist)
1693 struct hlist_head *hhead = &css_set_table[i]; 1668 link_css_set(&tmp_cg_links, cg, root_cgrp);
1694 struct hlist_node *node;
1695 struct css_set *cg;
1696
1697 hlist_for_each_entry(cg, node, hhead, hlist)
1698 link_css_set(&tmp_cg_links, cg, root_cgrp);
1699 }
1700 write_unlock(&css_set_lock); 1669 write_unlock(&css_set_lock);
1701 1670
1702 free_cg_links(&tmp_cg_links); 1671 free_cg_links(&tmp_cg_links);
1703 1672
1704 BUG_ON(!list_empty(&root_cgrp->sibling));
1705 BUG_ON(!list_empty(&root_cgrp->children)); 1673 BUG_ON(!list_empty(&root_cgrp->children));
1706 BUG_ON(root->number_of_cgroups != 1); 1674 BUG_ON(root->number_of_cgroups != 1);
1707 1675
@@ -1750,7 +1718,6 @@ static void cgroup_kill_sb(struct super_block *sb) {
1750 1718
1751 BUG_ON(root->number_of_cgroups != 1); 1719 BUG_ON(root->number_of_cgroups != 1);
1752 BUG_ON(!list_empty(&cgrp->children)); 1720 BUG_ON(!list_empty(&cgrp->children));
1753 BUG_ON(!list_empty(&cgrp->sibling));
1754 1721
1755 mutex_lock(&cgroup_mutex); 1722 mutex_lock(&cgroup_mutex);
1756 mutex_lock(&cgroup_root_mutex); 1723 mutex_lock(&cgroup_root_mutex);
@@ -1808,11 +1775,13 @@ static struct kobject *cgroup_kobj;
1808 */ 1775 */
1809int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) 1776int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1810{ 1777{
1778 struct dentry *dentry = cgrp->dentry;
1811 char *start; 1779 char *start;
1812 struct dentry *dentry = rcu_dereference_check(cgrp->dentry,
1813 cgroup_lock_is_held());
1814 1780
1815 if (!dentry || cgrp == dummytop) { 1781 rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(),
1782 "cgroup_path() called without proper locking");
1783
1784 if (cgrp == dummytop) {
1816 /* 1785 /*
1817 * Inactive subsystems have no dentry for their root 1786 * Inactive subsystems have no dentry for their root
1818 * cgroup 1787 * cgroup
@@ -1821,9 +1790,9 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1821 return 0; 1790 return 0;
1822 } 1791 }
1823 1792
1824 start = buf + buflen; 1793 start = buf + buflen - 1;
1825 1794
1826 *--start = '\0'; 1795 *start = '\0';
1827 for (;;) { 1796 for (;;) {
1828 int len = dentry->d_name.len; 1797 int len = dentry->d_name.len;
1829 1798
@@ -1834,8 +1803,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1834 if (!cgrp) 1803 if (!cgrp)
1835 break; 1804 break;
1836 1805
1837 dentry = rcu_dereference_check(cgrp->dentry, 1806 dentry = cgrp->dentry;
1838 cgroup_lock_is_held());
1839 if (!cgrp->parent) 1807 if (!cgrp->parent)
1840 continue; 1808 continue;
1841 if (--start < buf) 1809 if (--start < buf)
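
[Editor's note] The cgroup_path() hunks above drop the RCU dereference dance on cgrp->dentry and keep the existing right-to-left fill: the buffer is terminated at its last byte and each name component is prepended in front of what has already been written. A self-contained userspace sketch of that backwards-fill technique (function and variable names are invented, not the kernel's):

	#include <stdio.h>
	#include <string.h>

	/* Walk the component array from the leaf end, filling buf right to left. */
	static int build_path(const char *const *comp, int n, char *buf, int buflen)
	{
		char *start = buf + buflen - 1;

		*start = '\0';
		while (n-- > 0) {
			int len = strlen(comp[n]);

			if (start - buf < len + 1)
				return -1;	/* the kernel returns -ENAMETOOLONG here */
			start -= len;
			memcpy(start, comp[n], len);
			*--start = '/';
		}
		memmove(buf, start, buf + buflen - start);	/* slide the result to the front */
		return 0;
	}

	int main(void)
	{
		const char *comp[] = { "sys", "fs", "cgroup" };
		char buf[64];

		if (!build_path(comp, 3, buf, sizeof(buf)))
			printf("%s\n", buf);	/* prints /sys/fs/cgroup */
		return 0;
	}
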
@@ -1930,9 +1898,7 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_size);
1930/* 1898/*
1931 * cgroup_task_migrate - move a task from one cgroup to another. 1899 * cgroup_task_migrate - move a task from one cgroup to another.
1932 * 1900 *
1933 * 'guarantee' is set if the caller promises that a new css_set for the task 1901 * Must be called with cgroup_mutex and threadgroup locked.
1934 * will already exist. If not set, this function might sleep, and can fail with
1935 * -ENOMEM. Must be called with cgroup_mutex and threadgroup locked.
1936 */ 1902 */
1937static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, 1903static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1938 struct task_struct *tsk, struct css_set *newcg) 1904 struct task_struct *tsk, struct css_set *newcg)
@@ -2024,13 +1990,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
2024 ss->attach(cgrp, &tset); 1990 ss->attach(cgrp, &tset);
2025 } 1991 }
2026 1992
2027 synchronize_rcu();
2028
2029 /*
2030 * wake up rmdir() waiter. the rmdir should fail since the cgroup
2031 * is no longer empty.
2032 */
2033 cgroup_wakeup_rmdir_waiter(cgrp);
2034out: 1993out:
2035 if (retval) { 1994 if (retval) {
2036 for_each_subsys(root, ss) { 1995 for_each_subsys(root, ss) {
@@ -2199,8 +2158,6 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2199 /* 2158 /*
2200 * step 5: success! and cleanup 2159 * step 5: success! and cleanup
2201 */ 2160 */
2202 synchronize_rcu();
2203 cgroup_wakeup_rmdir_waiter(cgrp);
2204 retval = 0; 2161 retval = 0;
2205out_put_css_set_refs: 2162out_put_css_set_refs:
2206 if (retval) { 2163 if (retval) {
@@ -2686,7 +2643,7 @@ static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, un
2686 */ 2643 */
2687static inline struct cftype *__file_cft(struct file *file) 2644static inline struct cftype *__file_cft(struct file *file)
2688{ 2645{
2689 if (file->f_dentry->d_inode->i_fop != &cgroup_file_operations) 2646 if (file_inode(file)->i_fop != &cgroup_file_operations)
2690 return ERR_PTR(-EINVAL); 2647 return ERR_PTR(-EINVAL);
2691 return __d_cft(file->f_dentry); 2648 return __d_cft(file->f_dentry);
2692} 2649}
@@ -2711,10 +2668,17 @@ static int cgroup_create_file(struct dentry *dentry, umode_t mode,
2711 2668
2712 /* start off with i_nlink == 2 (for "." entry) */ 2669 /* start off with i_nlink == 2 (for "." entry) */
2713 inc_nlink(inode); 2670 inc_nlink(inode);
2671 inc_nlink(dentry->d_parent->d_inode);
2714 2672
2715 /* start with the directory inode held, so that we can 2673 /*
2716 * populate it without racing with another mkdir */ 2674 * Control reaches here with cgroup_mutex held.
2717 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); 2675 * @inode->i_mutex should nest outside cgroup_mutex but we
2676 * want to populate it immediately without releasing
2677 * cgroup_mutex. As @inode isn't visible to anyone else
2678 * yet, trylock will always succeed without affecting
2679 * lockdep checks.
2680 */
2681 WARN_ON_ONCE(!mutex_trylock(&inode->i_mutex));
2718 } else if (S_ISREG(mode)) { 2682 } else if (S_ISREG(mode)) {
2719 inode->i_size = 0; 2683 inode->i_size = 0;
2720 inode->i_fop = &cgroup_file_operations; 2684 inode->i_fop = &cgroup_file_operations;
@@ -2725,32 +2689,6 @@ static int cgroup_create_file(struct dentry *dentry, umode_t mode,
2725 return 0; 2689 return 0;
2726} 2690}
2727 2691
2728/*
2729 * cgroup_create_dir - create a directory for an object.
2730 * @cgrp: the cgroup we create the directory for. It must have a valid
2731 * ->parent field. And we are going to fill its ->dentry field.
2732 * @dentry: dentry of the new cgroup
2733 * @mode: mode to set on new directory.
2734 */
2735static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
2736 umode_t mode)
2737{
2738 struct dentry *parent;
2739 int error = 0;
2740
2741 parent = cgrp->parent->dentry;
2742 error = cgroup_create_file(dentry, S_IFDIR | mode, cgrp->root->sb);
2743 if (!error) {
2744 dentry->d_fsdata = cgrp;
2745 inc_nlink(parent->d_inode);
2746 rcu_assign_pointer(cgrp->dentry, dentry);
2747 dget(dentry);
2748 }
2749 dput(dentry);
2750
2751 return error;
2752}
2753
2754/** 2692/**
2755 * cgroup_file_mode - deduce file mode of a control file 2693 * cgroup_file_mode - deduce file mode of a control file
2756 * @cft: the control file in question 2694 * @cft: the control file in question
@@ -2791,12 +2729,6 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2791 2729
2792 simple_xattrs_init(&cft->xattrs); 2730 simple_xattrs_init(&cft->xattrs);
2793 2731
2794 /* does @cft->flags tell us to skip creation on @cgrp? */
2795 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
2796 return 0;
2797 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
2798 return 0;
2799
2800 if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { 2732 if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
2801 strcpy(name, subsys->name); 2733 strcpy(name, subsys->name);
2802 strcat(name, "."); 2734 strcat(name, ".");
@@ -2837,14 +2769,20 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2837 int err, ret = 0; 2769 int err, ret = 0;
2838 2770
2839 for (cft = cfts; cft->name[0] != '\0'; cft++) { 2771 for (cft = cfts; cft->name[0] != '\0'; cft++) {
2840 if (is_add) 2772 /* does cft->flags tell us to skip this file on @cgrp? */
2773 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
2774 continue;
2775 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
2776 continue;
2777
2778 if (is_add) {
2841 err = cgroup_add_file(cgrp, subsys, cft); 2779 err = cgroup_add_file(cgrp, subsys, cft);
2842 else 2780 if (err)
2843 err = cgroup_rm_file(cgrp, cft); 2781 pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n",
2844 if (err) { 2782 cft->name, err);
2845 pr_warning("cgroup_addrm_files: failed to %s %s, err=%d\n",
2846 is_add ? "add" : "remove", cft->name, err);
2847 ret = err; 2783 ret = err;
2784 } else {
2785 cgroup_rm_file(cgrp, cft);
2848 } 2786 }
2849 } 2787 }
2850 return ret; 2788 return ret;
@@ -3044,6 +2982,118 @@ static void cgroup_enable_task_cg_lists(void)
3044 write_unlock(&css_set_lock); 2982 write_unlock(&css_set_lock);
3045} 2983}
3046 2984
2985/**
2986 * cgroup_next_descendant_pre - find the next descendant for pre-order walk
2987 * @pos: the current position (%NULL to initiate traversal)
2988 * @cgroup: cgroup whose descendants to walk
2989 *
2990 * To be used by cgroup_for_each_descendant_pre(). Find the next
2991 * descendant to visit for pre-order traversal of @cgroup's descendants.
2992 */
2993struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
2994 struct cgroup *cgroup)
2995{
2996 struct cgroup *next;
2997
2998 WARN_ON_ONCE(!rcu_read_lock_held());
2999
3000 /* if first iteration, pretend we just visited @cgroup */
3001 if (!pos) {
3002 if (list_empty(&cgroup->children))
3003 return NULL;
3004 pos = cgroup;
3005 }
3006
3007 /* visit the first child if exists */
3008 next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling);
3009 if (next)
3010 return next;
3011
3012 /* no child, visit my or the closest ancestor's next sibling */
3013 do {
3014 next = list_entry_rcu(pos->sibling.next, struct cgroup,
3015 sibling);
3016 if (&next->sibling != &pos->parent->children)
3017 return next;
3018
3019 pos = pos->parent;
3020 } while (pos != cgroup);
3021
3022 return NULL;
3023}
3024EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre);
3025
3026/**
3027 * cgroup_rightmost_descendant - return the rightmost descendant of a cgroup
3028 * @pos: cgroup of interest
3029 *
3030 * Return the rightmost descendant of @pos. If there's no descendant,
3031 * @pos is returned. This can be used during pre-order traversal to skip
3032 * subtree of @pos.
3033 */
3034struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos)
3035{
3036 struct cgroup *last, *tmp;
3037
3038 WARN_ON_ONCE(!rcu_read_lock_held());
3039
3040 do {
3041 last = pos;
3042 /* ->prev isn't RCU safe, walk ->next till the end */
3043 pos = NULL;
3044 list_for_each_entry_rcu(tmp, &last->children, sibling)
3045 pos = tmp;
3046 } while (pos);
3047
3048 return last;
3049}
3050EXPORT_SYMBOL_GPL(cgroup_rightmost_descendant);
3051
3052static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos)
3053{
3054 struct cgroup *last;
3055
3056 do {
3057 last = pos;
3058 pos = list_first_or_null_rcu(&pos->children, struct cgroup,
3059 sibling);
3060 } while (pos);
3061
3062 return last;
3063}
3064
3065/**
3066 * cgroup_next_descendant_post - find the next descendant for post-order walk
3067 * @pos: the current position (%NULL to initiate traversal)
3068 * @cgroup: cgroup whose descendants to walk
3069 *
3070 * To be used by cgroup_for_each_descendant_post(). Find the next
3071 * descendant to visit for post-order traversal of @cgroup's descendants.
3072 */
3073struct cgroup *cgroup_next_descendant_post(struct cgroup *pos,
3074 struct cgroup *cgroup)
3075{
3076 struct cgroup *next;
3077
3078 WARN_ON_ONCE(!rcu_read_lock_held());
3079
3080 /* if first iteration, visit the leftmost descendant */
3081 if (!pos) {
3082 next = cgroup_leftmost_descendant(cgroup);
3083 return next != cgroup ? next : NULL;
3084 }
3085
3086 /* if there's an unvisited sibling, visit its leftmost descendant */
3087 next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling);
3088 if (&next->sibling != &pos->parent->children)
3089 return cgroup_leftmost_descendant(next);
3090
3091 /* no sibling left, visit parent */
3092 next = pos->parent;
3093 return next != cgroup ? next : NULL;
3094}
3095EXPORT_SYMBOL_GPL(cgroup_next_descendant_post);
3096
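
[Editor's note] cgroup_next_descendant_pre() and cgroup_next_descendant_post() added above walk the RCU-protected ->children/->sibling lists, so callers only need rcu_read_lock() around the loop. A rough sketch of how a controller might drive the pre-order variant directly, assuming the declarations these exports imply in <linux/cgroup.h> (the function name and pr_info() body are illustrative only):

	#include <linux/cgroup.h>
	#include <linux/rcupdate.h>
	#include <linux/printk.h>

	static void demo_walk_subtree(struct cgroup *root)
	{
		struct cgroup *pos = NULL;

		rcu_read_lock();
		/* each cgroup is visited before any of its own descendants */
		while ((pos = cgroup_next_descendant_pre(pos, root)))
			pr_info("visiting cgroup id %d\n", pos->id);
		rcu_read_unlock();
	}
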
3047void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) 3097void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
3048 __acquires(css_set_lock) 3098 __acquires(css_set_lock)
3049{ 3099{
@@ -3390,7 +3440,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3390{ 3440{
3391 struct cgroup_pidlist *l; 3441 struct cgroup_pidlist *l;
3392 /* don't need task_nsproxy() if we're looking at ourself */ 3442 /* don't need task_nsproxy() if we're looking at ourself */
3393 struct pid_namespace *ns = current->nsproxy->pid_ns; 3443 struct pid_namespace *ns = task_active_pid_ns(current);
3394 3444
3395 /* 3445 /*
3396 * We can't drop the pidlist_mutex before taking the l->mutex in case 3446 * We can't drop the pidlist_mutex before taking the l->mutex in case
@@ -3734,8 +3784,13 @@ static void cgroup_event_remove(struct work_struct *work)
3734 remove); 3784 remove);
3735 struct cgroup *cgrp = event->cgrp; 3785 struct cgroup *cgrp = event->cgrp;
3736 3786
3787 remove_wait_queue(event->wqh, &event->wait);
3788
3737 event->cft->unregister_event(cgrp, event->cft, event->eventfd); 3789 event->cft->unregister_event(cgrp, event->cft, event->eventfd);
3738 3790
3791 /* Notify userspace the event is going away. */
3792 eventfd_signal(event->eventfd, 1);
3793
3739 eventfd_ctx_put(event->eventfd); 3794 eventfd_ctx_put(event->eventfd);
3740 kfree(event); 3795 kfree(event);
3741 dput(cgrp->dentry); 3796 dput(cgrp->dentry);
@@ -3755,15 +3810,25 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
3755 unsigned long flags = (unsigned long)key; 3810 unsigned long flags = (unsigned long)key;
3756 3811
3757 if (flags & POLLHUP) { 3812 if (flags & POLLHUP) {
3758 __remove_wait_queue(event->wqh, &event->wait);
3759 spin_lock(&cgrp->event_list_lock);
3760 list_del(&event->list);
3761 spin_unlock(&cgrp->event_list_lock);
3762 /* 3813 /*
3763 * We are in atomic context, but cgroup_event_remove() may 3814 * If the event has been detached at cgroup removal, we
3764 * sleep, so we have to call it in workqueue. 3815 * can simply return knowing the other side will cleanup
3816 * for us.
3817 *
3818 * We can't race against event freeing since the other
3819 * side will require wqh->lock via remove_wait_queue(),
3820 * which we hold.
3765 */ 3821 */
3766 schedule_work(&event->remove); 3822 spin_lock(&cgrp->event_list_lock);
3823 if (!list_empty(&event->list)) {
3824 list_del_init(&event->list);
3825 /*
3826 * We are in atomic context, but cgroup_event_remove()
3827 * may sleep, so we have to call it in workqueue.
3828 */
3829 schedule_work(&event->remove);
3830 }
3831 spin_unlock(&cgrp->event_list_lock);
3767 } 3832 }
3768 3833
3769 return 0; 3834 return 0;
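
[Editor's note] With the change above, cgroup_event_remove() detaches from the wait queue and calls eventfd_signal() itself, so userspace is woken even when the notification dies because the cgroup is removed rather than because the monitored file raised POLLHUP. For reference, the userspace side of this interface is an eventfd registered through cgroup.event_control; a minimal, illustrative listener follows (the memory controller path and the threshold value are assumptions for the example, not taken from this patch):

	#include <stdio.h>
	#include <string.h>
	#include <stdint.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/eventfd.h>

	int main(void)
	{
		const char *cgdir = "/sys/fs/cgroup/memory/demo";	/* hypothetical cgroup */
		char path[256], cmd[64];
		uint64_t ticks;
		int efd, cfd, ecfd;

		efd = eventfd(0, 0);
		snprintf(path, sizeof(path), "%s/memory.usage_in_bytes", cgdir);
		cfd = open(path, O_RDONLY);
		snprintf(path, sizeof(path), "%s/cgroup.event_control", cgdir);
		ecfd = open(path, O_WRONLY);
		if (efd < 0 || cfd < 0 || ecfd < 0)
			return 1;

		/* "<event_fd> <control_fd> <args>" arms the notification */
		snprintf(cmd, sizeof(cmd), "%d %d %llu", efd, cfd, 4096ULL * 1024);
		if (write(ecfd, cmd, strlen(cmd)) < 0)
			return 1;

		/* unblocks on a threshold crossing or, after this patch, on rmdir */
		if (read(efd, &ticks, sizeof(ticks)) == (ssize_t)sizeof(ticks))
			printf("notified (count %llu)\n", (unsigned long long)ticks);
		return 0;
	}
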
@@ -3789,6 +3854,7 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
3789 const char *buffer) 3854 const char *buffer)
3790{ 3855{
3791 struct cgroup_event *event = NULL; 3856 struct cgroup_event *event = NULL;
3857 struct cgroup *cgrp_cfile;
3792 unsigned int efd, cfd; 3858 unsigned int efd, cfd;
3793 struct file *efile = NULL; 3859 struct file *efile = NULL;
3794 struct file *cfile = NULL; 3860 struct file *cfile = NULL;
@@ -3834,7 +3900,7 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
3834 3900
3835 /* the process need read permission on control file */ 3901 /* the process need read permission on control file */
3836 /* AV: shouldn't we check that it's been opened for read instead? */ 3902 /* AV: shouldn't we check that it's been opened for read instead? */
3837 ret = inode_permission(cfile->f_path.dentry->d_inode, MAY_READ); 3903 ret = inode_permission(file_inode(cfile), MAY_READ);
3838 if (ret < 0) 3904 if (ret < 0)
3839 goto fail; 3905 goto fail;
3840 3906
@@ -3844,6 +3910,16 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
3844 goto fail; 3910 goto fail;
3845 } 3911 }
3846 3912
3913 /*
3914 * The file to be monitored must be in the same cgroup as
3915 * cgroup.event_control is.
3916 */
3917 cgrp_cfile = __d_cgrp(cfile->f_dentry->d_parent);
3918 if (cgrp_cfile != cgrp) {
3919 ret = -EINVAL;
3920 goto fail;
3921 }
3922
3847 if (!event->cft->register_event || !event->cft->unregister_event) { 3923 if (!event->cft->register_event || !event->cft->unregister_event) {
3848 ret = -EINVAL; 3924 ret = -EINVAL;
3849 goto fail; 3925 goto fail;
@@ -3894,7 +3970,7 @@ fail:
3894static u64 cgroup_clone_children_read(struct cgroup *cgrp, 3970static u64 cgroup_clone_children_read(struct cgroup *cgrp,
3895 struct cftype *cft) 3971 struct cftype *cft)
3896{ 3972{
3897 return clone_children(cgrp); 3973 return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
3898} 3974}
3899 3975
3900static int cgroup_clone_children_write(struct cgroup *cgrp, 3976static int cgroup_clone_children_write(struct cgroup *cgrp,
@@ -3902,9 +3978,9 @@ static int cgroup_clone_children_write(struct cgroup *cgrp,
3902 u64 val) 3978 u64 val)
3903{ 3979{
3904 if (val) 3980 if (val)
3905 set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); 3981 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
3906 else 3982 else
3907 clear_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); 3983 clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
3908 return 0; 3984 return 0;
3909} 3985}
3910 3986
@@ -4017,19 +4093,57 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
4017 css->flags = 0; 4093 css->flags = 0;
4018 css->id = NULL; 4094 css->id = NULL;
4019 if (cgrp == dummytop) 4095 if (cgrp == dummytop)
4020 set_bit(CSS_ROOT, &css->flags); 4096 css->flags |= CSS_ROOT;
4021 BUG_ON(cgrp->subsys[ss->subsys_id]); 4097 BUG_ON(cgrp->subsys[ss->subsys_id]);
4022 cgrp->subsys[ss->subsys_id] = css; 4098 cgrp->subsys[ss->subsys_id] = css;
4023 4099
4024 /* 4100 /*
4025 * If !clear_css_refs, css holds an extra ref to @cgrp->dentry 4101 * css holds an extra ref to @cgrp->dentry which is put on the last
4026 * which is put on the last css_put(). dput() requires process 4102 * css_put(). dput() requires process context, which css_put() may
4027 * context, which css_put() may be called without. @css->dput_work 4103 * be called without. @css->dput_work will be used to invoke
4028 * will be used to invoke dput() asynchronously from css_put(). 4104 * dput() asynchronously from css_put().
4029 */ 4105 */
4030 INIT_WORK(&css->dput_work, css_dput_fn); 4106 INIT_WORK(&css->dput_work, css_dput_fn);
4031 if (ss->__DEPRECATED_clear_css_refs) 4107}
4032 set_bit(CSS_CLEAR_CSS_REFS, &css->flags); 4108
 4109/* invoke ->css_online() on a new CSS and mark it online if successful */
4110static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
4111{
4112 int ret = 0;
4113
4114 lockdep_assert_held(&cgroup_mutex);
4115
4116 if (ss->css_online)
4117 ret = ss->css_online(cgrp);
4118 if (!ret)
4119 cgrp->subsys[ss->subsys_id]->flags |= CSS_ONLINE;
4120 return ret;
4121}
4122
 4123/* if the CSS is online, invoke ->css_offline() on it and mark it offline */
4124static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
4125 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
4126{
4127 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
4128
4129 lockdep_assert_held(&cgroup_mutex);
4130
4131 if (!(css->flags & CSS_ONLINE))
4132 return;
4133
4134 /*
4135 * css_offline() should be called with cgroup_mutex unlocked. See
4136 * 3fa59dfbc3 ("cgroup: fix potential deadlock in pre_destroy") for
4137 * details. This temporary unlocking should go away once
4138 * cgroup_mutex is unexported from controllers.
4139 */
4140 if (ss->css_offline) {
4141 mutex_unlock(&cgroup_mutex);
4142 ss->css_offline(cgrp);
4143 mutex_lock(&cgroup_mutex);
4144 }
4145
4146 cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE;
4033} 4147}
4034 4148
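
[Editor's note] online_css() and offline_css() give every controller a pair of life-cycle hooks: ->css_online() runs under cgroup_mutex once the new cgroup is fully constructed and may veto creation, while ->css_offline() runs exactly once at removal, before the base css references are put. A hypothetical controller would hook in roughly as follows (the "demo" name is invented; .css_alloc/.css_free and the generated subsys_id are omitted for brevity):

	#include <linux/cgroup.h>
	#include <linux/printk.h>

	/* Runs after the cgroup is set up; a non-zero return makes
	 * cgroup_create() unwind through cgroup_destroy_locked(). */
	static int demo_css_online(struct cgroup *cgrp)
	{
		pr_info("demo: cgroup %d brought online\n", cgrp->id);
		return 0;
	}

	/* Runs once per css while the cgroup is being removed. */
	static void demo_css_offline(struct cgroup *cgrp)
	{
		pr_info("demo: cgroup %d going offline\n", cgrp->id);
	}

	struct cgroup_subsys demo_subsys = {
		.name		= "demo",
		.css_online	= demo_css_online,
		.css_offline	= demo_css_offline,
		/* .css_alloc, .css_free and .subsys_id left out of this sketch */
	};
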
4035/* 4149/*
@@ -4049,10 +4163,27 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4049 struct cgroup_subsys *ss; 4163 struct cgroup_subsys *ss;
4050 struct super_block *sb = root->sb; 4164 struct super_block *sb = root->sb;
4051 4165
4166 /* allocate the cgroup and its ID, 0 is reserved for the root */
4052 cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); 4167 cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
4053 if (!cgrp) 4168 if (!cgrp)
4054 return -ENOMEM; 4169 return -ENOMEM;
4055 4170
4171 cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL);
4172 if (cgrp->id < 0)
4173 goto err_free_cgrp;
4174
4175 /*
4176 * Only live parents can have children. Note that the liveliness
4177 * check isn't strictly necessary because cgroup_mkdir() and
4178 * cgroup_rmdir() are fully synchronized by i_mutex; however, do it
4179 * anyway so that locking is contained inside cgroup proper and we
4180 * don't get nasty surprises if we ever grow another caller.
4181 */
4182 if (!cgroup_lock_live_group(parent)) {
4183 err = -ENODEV;
4184 goto err_free_id;
4185 }
4186
4056 /* Grab a reference on the superblock so the hierarchy doesn't 4187 /* Grab a reference on the superblock so the hierarchy doesn't
4057 * get deleted on unmount if there are child cgroups. This 4188 * get deleted on unmount if there are child cgroups. This
4058 * can be done outside cgroup_mutex, since the sb can't 4189 * can be done outside cgroup_mutex, since the sb can't
@@ -4060,10 +4191,11 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4060 * fs */ 4191 * fs */
4061 atomic_inc(&sb->s_active); 4192 atomic_inc(&sb->s_active);
4062 4193
4063 mutex_lock(&cgroup_mutex);
4064
4065 init_cgroup_housekeeping(cgrp); 4194 init_cgroup_housekeeping(cgrp);
4066 4195
4196 dentry->d_fsdata = cgrp;
4197 cgrp->dentry = dentry;
4198
4067 cgrp->parent = parent; 4199 cgrp->parent = parent;
4068 cgrp->root = parent->root; 4200 cgrp->root = parent->root;
4069 cgrp->top_cgroup = parent->top_cgroup; 4201 cgrp->top_cgroup = parent->top_cgroup;
@@ -4071,26 +4203,49 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4071 if (notify_on_release(parent)) 4203 if (notify_on_release(parent))
4072 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 4204 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
4073 4205
4074 if (clone_children(parent)) 4206 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
4075 set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); 4207 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
4076 4208
4077 for_each_subsys(root, ss) { 4209 for_each_subsys(root, ss) {
4078 struct cgroup_subsys_state *css; 4210 struct cgroup_subsys_state *css;
4079 4211
4080 css = ss->create(cgrp); 4212 css = ss->css_alloc(cgrp);
4081 if (IS_ERR(css)) { 4213 if (IS_ERR(css)) {
4082 err = PTR_ERR(css); 4214 err = PTR_ERR(css);
4083 goto err_destroy; 4215 goto err_free_all;
4084 } 4216 }
4085 init_cgroup_css(css, ss, cgrp); 4217 init_cgroup_css(css, ss, cgrp);
4086 if (ss->use_id) { 4218 if (ss->use_id) {
4087 err = alloc_css_id(ss, parent, cgrp); 4219 err = alloc_css_id(ss, parent, cgrp);
4088 if (err) 4220 if (err)
4089 goto err_destroy; 4221 goto err_free_all;
4090 } 4222 }
4091 /* At error, ->destroy() callback has to free assigned ID. */ 4223 }
4092 if (clone_children(parent) && ss->post_clone) 4224
4093 ss->post_clone(cgrp); 4225 /*
4226 * Create directory. cgroup_create_file() returns with the new
4227 * directory locked on success so that it can be populated without
4228 * dropping cgroup_mutex.
4229 */
4230 err = cgroup_create_file(dentry, S_IFDIR | mode, sb);
4231 if (err < 0)
4232 goto err_free_all;
4233 lockdep_assert_held(&dentry->d_inode->i_mutex);
4234
4235 /* allocation complete, commit to creation */
4236 list_add_tail(&cgrp->allcg_node, &root->allcg_list);
4237 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
4238 root->number_of_cgroups++;
4239
4240 /* each css holds a ref to the cgroup's dentry */
4241 for_each_subsys(root, ss)
4242 dget(dentry);
4243
4244 /* creation succeeded, notify subsystems */
4245 for_each_subsys(root, ss) {
4246 err = online_css(ss, cgrp);
4247 if (err)
4248 goto err_destroy;
4094 4249
4095 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && 4250 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
4096 parent->parent) { 4251 parent->parent) {
@@ -4102,50 +4257,34 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4102 } 4257 }
4103 } 4258 }
4104 4259
4105 list_add(&cgrp->sibling, &cgrp->parent->children);
4106 root->number_of_cgroups++;
4107
4108 err = cgroup_create_dir(cgrp, dentry, mode);
4109 if (err < 0)
4110 goto err_remove;
4111
4112 /* If !clear_css_refs, each css holds a ref to the cgroup's dentry */
4113 for_each_subsys(root, ss)
4114 if (!ss->__DEPRECATED_clear_css_refs)
4115 dget(dentry);
4116
4117 /* The cgroup directory was pre-locked for us */
4118 BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
4119
4120 list_add_tail(&cgrp->allcg_node, &root->allcg_list);
4121
4122 err = cgroup_populate_dir(cgrp, true, root->subsys_mask); 4260 err = cgroup_populate_dir(cgrp, true, root->subsys_mask);
4123 /* If err < 0, we have a half-filled directory - oh well ;) */ 4261 if (err)
4262 goto err_destroy;
4124 4263
4125 mutex_unlock(&cgroup_mutex); 4264 mutex_unlock(&cgroup_mutex);
4126 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 4265 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
4127 4266
4128 return 0; 4267 return 0;
4129 4268
4130 err_remove: 4269err_free_all:
4131
4132 list_del(&cgrp->sibling);
4133 root->number_of_cgroups--;
4134
4135 err_destroy:
4136
4137 for_each_subsys(root, ss) { 4270 for_each_subsys(root, ss) {
4138 if (cgrp->subsys[ss->subsys_id]) 4271 if (cgrp->subsys[ss->subsys_id])
4139 ss->destroy(cgrp); 4272 ss->css_free(cgrp);
4140 } 4273 }
4141
4142 mutex_unlock(&cgroup_mutex); 4274 mutex_unlock(&cgroup_mutex);
4143
4144 /* Release the reference count that we took on the superblock */ 4275 /* Release the reference count that we took on the superblock */
4145 deactivate_super(sb); 4276 deactivate_super(sb);
4146 4277err_free_id:
4278 ida_simple_remove(&root->cgroup_ida, cgrp->id);
4279err_free_cgrp:
4147 kfree(cgrp); 4280 kfree(cgrp);
4148 return err; 4281 return err;
4282
4283err_destroy:
4284 cgroup_destroy_locked(cgrp);
4285 mutex_unlock(&cgroup_mutex);
4286 mutex_unlock(&dentry->d_inode->i_mutex);
4287 return err;
4149} 4288}
4150 4289
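
[Editor's note] cgroup_create() now reserves a per-hierarchy id with ida_simple_get() up front (0 stays reserved for the root cgroup) and unwinds through the new err_free_id/err_free_cgrp labels on failure; cgroup_free_fn() hands the id back with ida_simple_remove(). Taken in isolation, the IDA pattern looks like this (struct and function names are made up):

	#include <linux/idr.h>
	#include <linux/slab.h>
	#include <linux/err.h>

	static DEFINE_IDA(demo_ida);

	struct demo_obj {
		int id;
	};

	static struct demo_obj *demo_obj_create(void)
	{
		struct demo_obj *obj;
		int id;

		obj = kzalloc(sizeof(*obj), GFP_KERNEL);
		if (!obj)
			return ERR_PTR(-ENOMEM);

		/* start at 1 so that 0 stays reserved; end=0 means "no upper limit" */
		id = ida_simple_get(&demo_ida, 1, 0, GFP_KERNEL);
		if (id < 0) {
			kfree(obj);
			return ERR_PTR(id);
		}
		obj->id = id;
		return obj;
	}

	static void demo_obj_destroy(struct demo_obj *obj)
	{
		ida_simple_remove(&demo_ida, obj->id);	/* give the id back */
		kfree(obj);
	}
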
4151static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 4290static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
@@ -4197,153 +4336,60 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
4197 return 0; 4336 return 0;
4198} 4337}
4199 4338
4200/* 4339static int cgroup_destroy_locked(struct cgroup *cgrp)
4201 * Atomically mark all (or else none) of the cgroup's CSS objects as 4340 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
4202 * CSS_REMOVED. Return true on success, or false if the cgroup has
4203 * busy subsystems. Call with cgroup_mutex held
4204 *
4205 * Depending on whether a subsys has __DEPRECATED_clear_css_refs set or
4206 * not, cgroup removal behaves differently.
4207 *
4208 * If clear is set, css refcnt for the subsystem should be zero before
4209 * cgroup removal can be committed. This is implemented by
4210 * CGRP_WAIT_ON_RMDIR and retry logic around ->pre_destroy(), which may be
4211 * called multiple times until all css refcnts reach zero and is allowed to
4212 * veto removal on any invocation. This behavior is deprecated and will be
4213 * removed as soon as the existing user (memcg) is updated.
4214 *
4215 * If clear is not set, each css holds an extra reference to the cgroup's
4216 * dentry and cgroup removal proceeds regardless of css refs.
4217 * ->pre_destroy() will be called at least once and is not allowed to fail.
4218 * On the last put of each css, whenever that may be, the extra dentry ref
4219 * is put so that dentry destruction happens only after all css's are
4220 * released.
4221 */
4222static int cgroup_clear_css_refs(struct cgroup *cgrp)
4223{ 4341{
4342 struct dentry *d = cgrp->dentry;
4343 struct cgroup *parent = cgrp->parent;
4344 DEFINE_WAIT(wait);
4345 struct cgroup_event *event, *tmp;
4224 struct cgroup_subsys *ss; 4346 struct cgroup_subsys *ss;
4225 unsigned long flags; 4347 LIST_HEAD(tmp_list);
4226 bool failed = false; 4348
4349 lockdep_assert_held(&d->d_inode->i_mutex);
4350 lockdep_assert_held(&cgroup_mutex);
4227 4351
4228 local_irq_save(flags); 4352 if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children))
4353 return -EBUSY;
4229 4354
4230 /* 4355 /*
4231 * Block new css_tryget() by deactivating refcnt. If all refcnts 4356 * Block new css_tryget() by deactivating refcnt and mark @cgrp
4232 * for subsystems w/ clear_css_refs set were 1 at the moment of 4357 * removed. This makes future css_tryget() and child creation
4233 * deactivation, we succeeded. 4358 * attempts fail thus maintaining the removal conditions verified
4359 * above.
4234 */ 4360 */
4235 for_each_subsys(cgrp->root, ss) { 4361 for_each_subsys(cgrp->root, ss) {
4236 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4362 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
4237 4363
4238 WARN_ON(atomic_read(&css->refcnt) < 0); 4364 WARN_ON(atomic_read(&css->refcnt) < 0);
4239 atomic_add(CSS_DEACT_BIAS, &css->refcnt); 4365 atomic_add(CSS_DEACT_BIAS, &css->refcnt);
4240
4241 if (ss->__DEPRECATED_clear_css_refs)
4242 failed |= css_refcnt(css) != 1;
4243 } 4366 }
4367 set_bit(CGRP_REMOVED, &cgrp->flags);
4244 4368
 4245 /* 4369 /* tell subsystems to initiate destruction */
4246 * If succeeded, set REMOVED and put all the base refs; otherwise, 4370 for_each_subsys(cgrp->root, ss)
4247 * restore refcnts to positive values. Either way, all in-progress 4371 offline_css(ss, cgrp);
4248 * css_tryget() will be released.
4249 */
4250 for_each_subsys(cgrp->root, ss) {
4251 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
4252
4253 if (!failed) {
4254 set_bit(CSS_REMOVED, &css->flags);
4255 css_put(css);
4256 } else {
4257 atomic_sub(CSS_DEACT_BIAS, &css->refcnt);
4258 }
4259 }
4260
4261 local_irq_restore(flags);
4262 return !failed;
4263}
4264
4265static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
4266{
4267 struct cgroup *cgrp = dentry->d_fsdata;
4268 struct dentry *d;
4269 struct cgroup *parent;
4270 DEFINE_WAIT(wait);
4271 struct cgroup_event *event, *tmp;
4272 int ret;
4273
4274 /* the vfs holds both inode->i_mutex already */
4275again:
4276 mutex_lock(&cgroup_mutex);
4277 if (atomic_read(&cgrp->count) != 0) {
4278 mutex_unlock(&cgroup_mutex);
4279 return -EBUSY;
4280 }
4281 if (!list_empty(&cgrp->children)) {
4282 mutex_unlock(&cgroup_mutex);
4283 return -EBUSY;
4284 }
4285 mutex_unlock(&cgroup_mutex);
4286
4287 /*
4288 * In general, subsystem has no css->refcnt after pre_destroy(). But
4289 * in racy cases, subsystem may have to get css->refcnt after
4290 * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes
4291 * make rmdir return -EBUSY too often. To avoid that, we use waitqueue
4292 * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir
4293 * and subsystem's reference count handling. Please see css_get/put
4294 * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation.
4295 */
4296 set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4297 4372
4298 /* 4373 /*
4299 * Call pre_destroy handlers of subsys. Notify subsystems 4374 * Put all the base refs. Each css holds an extra reference to the
4300 * that rmdir() request comes. 4375 * cgroup's dentry and cgroup removal proceeds regardless of css
4376 * refs. On the last put of each css, whenever that may be, the
4377 * extra dentry ref is put so that dentry destruction happens only
4378 * after all css's are released.
4301 */ 4379 */
4302 ret = cgroup_call_pre_destroy(cgrp); 4380 for_each_subsys(cgrp->root, ss)
4303 if (ret) { 4381 css_put(cgrp->subsys[ss->subsys_id]);
4304 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4305 return ret;
4306 }
4307
4308 mutex_lock(&cgroup_mutex);
4309 parent = cgrp->parent;
4310 if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
4311 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4312 mutex_unlock(&cgroup_mutex);
4313 return -EBUSY;
4314 }
4315 prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
4316 if (!cgroup_clear_css_refs(cgrp)) {
4317 mutex_unlock(&cgroup_mutex);
4318 /*
4319 * Because someone may call cgroup_wakeup_rmdir_waiter() before
4320 * prepare_to_wait(), we need to check this flag.
4321 */
4322 if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))
4323 schedule();
4324 finish_wait(&cgroup_rmdir_waitq, &wait);
4325 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4326 if (signal_pending(current))
4327 return -EINTR;
4328 goto again;
4329 }
4330 /* NO css_tryget() can success after here. */
4331 finish_wait(&cgroup_rmdir_waitq, &wait);
4332 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4333 4382
4334 raw_spin_lock(&release_list_lock); 4383 raw_spin_lock(&release_list_lock);
4335 set_bit(CGRP_REMOVED, &cgrp->flags);
4336 if (!list_empty(&cgrp->release_list)) 4384 if (!list_empty(&cgrp->release_list))
4337 list_del_init(&cgrp->release_list); 4385 list_del_init(&cgrp->release_list);
4338 raw_spin_unlock(&release_list_lock); 4386 raw_spin_unlock(&release_list_lock);
4339 4387
4340 /* delete this cgroup from parent->children */ 4388 /* delete this cgroup from parent->children */
4341 list_del_init(&cgrp->sibling); 4389 list_del_rcu(&cgrp->sibling);
4342
4343 list_del_init(&cgrp->allcg_node); 4390 list_del_init(&cgrp->allcg_node);
4344 4391
4345 d = dget(cgrp->dentry); 4392 dget(d);
4346
4347 cgroup_d_remove_dir(d); 4393 cgroup_d_remove_dir(d);
4348 dput(d); 4394 dput(d);
4349 4395
@@ -4353,21 +4399,29 @@ again:
4353 /* 4399 /*
4354 * Unregister events and notify userspace. 4400 * Unregister events and notify userspace.
4355 * Notify userspace about cgroup removing only after rmdir of cgroup 4401 * Notify userspace about cgroup removing only after rmdir of cgroup
4356 * directory to avoid race between userspace and kernelspace 4402 * directory to avoid race between userspace and kernelspace.
4357 */ 4403 */
4358 spin_lock(&cgrp->event_list_lock); 4404 spin_lock(&cgrp->event_list_lock);
4359 list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { 4405 list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
4360 list_del(&event->list); 4406 list_del_init(&event->list);
4361 remove_wait_queue(event->wqh, &event->wait);
4362 eventfd_signal(event->eventfd, 1);
4363 schedule_work(&event->remove); 4407 schedule_work(&event->remove);
4364 } 4408 }
4365 spin_unlock(&cgrp->event_list_lock); 4409 spin_unlock(&cgrp->event_list_lock);
4366 4410
4367 mutex_unlock(&cgroup_mutex);
4368 return 0; 4411 return 0;
4369} 4412}
4370 4413
4414static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
4415{
4416 int ret;
4417
4418 mutex_lock(&cgroup_mutex);
4419 ret = cgroup_destroy_locked(dentry->d_fsdata);
4420 mutex_unlock(&cgroup_mutex);
4421
4422 return ret;
4423}
4424
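
[Editor's note] cgroup_destroy_locked() above replaces the old retry-and-wait rmdir logic with a refcount bias: adding CSS_DEACT_BIAS drives every css refcount negative, so a css_tryget() racing with removal fails, while references already held stay valid until their final css_put(). A compact sketch of that deactivation-bias idea, outside cgroup proper (the names and the cmpxchg loop are illustrative, not the exact css_tryget() implementation):

	#include <linux/atomic.h>
	#include <linux/kernel.h>
	#include <linux/types.h>

	#define DEMO_DEACT_BIAS		INT_MIN		/* same idea as CSS_DEACT_BIAS */

	struct demo_ref {
		atomic_t refcnt;	/* starts at 1: the owner's base reference */
	};

	/* Fails as soon as the owner has started tearing the object down. */
	static bool demo_tryget(struct demo_ref *r)
	{
		int v;

		do {
			v = atomic_read(&r->refcnt);
			if (v <= 0)	/* biased negative: destruction under way */
				return false;
		} while (atomic_cmpxchg(&r->refcnt, v, v + 1) != v);

		return true;
	}

	/* Owner side: after this, no new tryget can succeed. */
	static void demo_deactivate(struct demo_ref *r)
	{
		atomic_add(DEMO_DEACT_BIAS, &r->refcnt);
	}
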
4371static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss) 4425static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss)
4372{ 4426{
4373 INIT_LIST_HEAD(&ss->cftsets); 4427 INIT_LIST_HEAD(&ss->cftsets);
@@ -4388,13 +4442,15 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4388 4442
4389 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); 4443 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
4390 4444
4445 mutex_lock(&cgroup_mutex);
4446
4391 /* init base cftset */ 4447 /* init base cftset */
4392 cgroup_init_cftsets(ss); 4448 cgroup_init_cftsets(ss);
4393 4449
4394 /* Create the top cgroup state for this subsystem */ 4450 /* Create the top cgroup state for this subsystem */
4395 list_add(&ss->sibling, &rootnode.subsys_list); 4451 list_add(&ss->sibling, &rootnode.subsys_list);
4396 ss->root = &rootnode; 4452 ss->root = &rootnode;
4397 css = ss->create(dummytop); 4453 css = ss->css_alloc(dummytop);
4398 /* We don't handle early failures gracefully */ 4454 /* We don't handle early failures gracefully */
4399 BUG_ON(IS_ERR(css)); 4455 BUG_ON(IS_ERR(css));
4400 init_cgroup_css(css, ss, dummytop); 4456 init_cgroup_css(css, ss, dummytop);
@@ -4403,7 +4459,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4403 * pointer to this state - since the subsystem is 4459 * pointer to this state - since the subsystem is
4404 * newly registered, all tasks and hence the 4460 * newly registered, all tasks and hence the
4405 * init_css_set is in the subsystem's top cgroup. */ 4461 * init_css_set is in the subsystem's top cgroup. */
4406 init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id]; 4462 init_css_set.subsys[ss->subsys_id] = css;
4407 4463
4408 need_forkexit_callback |= ss->fork || ss->exit; 4464 need_forkexit_callback |= ss->fork || ss->exit;
4409 4465
@@ -4413,6 +4469,9 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4413 BUG_ON(!list_empty(&init_task.tasks)); 4469 BUG_ON(!list_empty(&init_task.tasks));
4414 4470
4415 ss->active = 1; 4471 ss->active = 1;
4472 BUG_ON(online_css(ss, dummytop));
4473
4474 mutex_unlock(&cgroup_mutex);
4416 4475
4417 /* this function shouldn't be used with modular subsystems, since they 4476 /* this function shouldn't be used with modular subsystems, since they
4418 * need to register a subsys_id, among other things */ 4477 * need to register a subsys_id, among other things */
@@ -4430,12 +4489,15 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4430 */ 4489 */
4431int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) 4490int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4432{ 4491{
4433 int i;
4434 struct cgroup_subsys_state *css; 4492 struct cgroup_subsys_state *css;
4493 int i, ret;
4494 struct hlist_node *tmp;
4495 struct css_set *cg;
4496 unsigned long key;
4435 4497
4436 /* check name and function validity */ 4498 /* check name and function validity */
4437 if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN || 4499 if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
4438 ss->create == NULL || ss->destroy == NULL) 4500 ss->css_alloc == NULL || ss->css_free == NULL)
4439 return -EINVAL; 4501 return -EINVAL;
4440 4502
4441 /* 4503 /*
@@ -4464,10 +4526,11 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4464 subsys[ss->subsys_id] = ss; 4526 subsys[ss->subsys_id] = ss;
4465 4527
4466 /* 4528 /*
4467 * no ss->create seems to need anything important in the ss struct, so 4529 * no ss->css_alloc seems to need anything important in the ss
4468 * this can happen first (i.e. before the rootnode attachment). 4530 * struct, so this can happen first (i.e. before the rootnode
4531 * attachment).
4469 */ 4532 */
4470 css = ss->create(dummytop); 4533 css = ss->css_alloc(dummytop);
4471 if (IS_ERR(css)) { 4534 if (IS_ERR(css)) {
4472 /* failure case - need to deassign the subsys[] slot. */ 4535 /* failure case - need to deassign the subsys[] slot. */
4473 subsys[ss->subsys_id] = NULL; 4536 subsys[ss->subsys_id] = NULL;
@@ -4482,14 +4545,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4482 init_cgroup_css(css, ss, dummytop); 4545 init_cgroup_css(css, ss, dummytop);
4483 /* init_idr must be after init_cgroup_css because it sets css->id. */ 4546 /* init_idr must be after init_cgroup_css because it sets css->id. */
4484 if (ss->use_id) { 4547 if (ss->use_id) {
4485 int ret = cgroup_init_idr(ss, css); 4548 ret = cgroup_init_idr(ss, css);
4486 if (ret) { 4549 if (ret)
4487 dummytop->subsys[ss->subsys_id] = NULL; 4550 goto err_unload;
4488 ss->destroy(dummytop);
4489 subsys[ss->subsys_id] = NULL;
4490 mutex_unlock(&cgroup_mutex);
4491 return ret;
4492 }
4493 } 4551 }
4494 4552
4495 /* 4553 /*
@@ -4501,31 +4559,34 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4501 * this is all done under the css_set_lock. 4559 * this is all done under the css_set_lock.
4502 */ 4560 */
4503 write_lock(&css_set_lock); 4561 write_lock(&css_set_lock);
4504 for (i = 0; i < CSS_SET_TABLE_SIZE; i++) { 4562 hash_for_each_safe(css_set_table, i, tmp, cg, hlist) {
4505 struct css_set *cg; 4563 /* skip entries that we already rehashed */
4506 struct hlist_node *node, *tmp; 4564 if (cg->subsys[ss->subsys_id])
4507 struct hlist_head *bucket = &css_set_table[i], *new_bucket; 4565 continue;
4508 4566 /* remove existing entry */
4509 hlist_for_each_entry_safe(cg, node, tmp, bucket, hlist) { 4567 hash_del(&cg->hlist);
4510 /* skip entries that we already rehashed */ 4568 /* set new value */
4511 if (cg->subsys[ss->subsys_id]) 4569 cg->subsys[ss->subsys_id] = css;
4512 continue; 4570 /* recompute hash and restore entry */
4513 /* remove existing entry */ 4571 key = css_set_hash(cg->subsys);
4514 hlist_del(&cg->hlist); 4572 hash_add(css_set_table, &cg->hlist, key);
4515 /* set new value */
4516 cg->subsys[ss->subsys_id] = css;
4517 /* recompute hash and restore entry */
4518 new_bucket = css_set_hash(cg->subsys);
4519 hlist_add_head(&cg->hlist, new_bucket);
4520 }
4521 } 4573 }
4522 write_unlock(&css_set_lock); 4574 write_unlock(&css_set_lock);
4523 4575
4524 ss->active = 1; 4576 ss->active = 1;
4577 ret = online_css(ss, dummytop);
4578 if (ret)
4579 goto err_unload;
4525 4580
4526 /* success! */ 4581 /* success! */
4527 mutex_unlock(&cgroup_mutex); 4582 mutex_unlock(&cgroup_mutex);
4528 return 0; 4583 return 0;
4584
4585err_unload:
4586 mutex_unlock(&cgroup_mutex);
4587 /* @ss can't be mounted here as try_module_get() would fail */
4588 cgroup_unload_subsys(ss);
4589 return ret;
4529} 4590}
4530EXPORT_SYMBOL_GPL(cgroup_load_subsys); 4591EXPORT_SYMBOL_GPL(cgroup_load_subsys);
4531 4592
@@ -4540,7 +4601,6 @@ EXPORT_SYMBOL_GPL(cgroup_load_subsys);
4540void cgroup_unload_subsys(struct cgroup_subsys *ss) 4601void cgroup_unload_subsys(struct cgroup_subsys *ss)
4541{ 4602{
4542 struct cg_cgroup_link *link; 4603 struct cg_cgroup_link *link;
4543 struct hlist_head *hhead;
4544 4604
4545 BUG_ON(ss->module == NULL); 4605 BUG_ON(ss->module == NULL);
4546 4606
@@ -4552,6 +4612,13 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4552 BUG_ON(ss->root != &rootnode); 4612 BUG_ON(ss->root != &rootnode);
4553 4613
4554 mutex_lock(&cgroup_mutex); 4614 mutex_lock(&cgroup_mutex);
4615
4616 offline_css(ss, dummytop);
4617 ss->active = 0;
4618
4619 if (ss->use_id)
4620 idr_destroy(&ss->idr);
4621
4555 /* deassign the subsys_id */ 4622 /* deassign the subsys_id */
4556 subsys[ss->subsys_id] = NULL; 4623 subsys[ss->subsys_id] = NULL;
4557 4624
@@ -4565,22 +4632,22 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4565 write_lock(&css_set_lock); 4632 write_lock(&css_set_lock);
4566 list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) { 4633 list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) {
4567 struct css_set *cg = link->cg; 4634 struct css_set *cg = link->cg;
4635 unsigned long key;
4568 4636
4569 hlist_del(&cg->hlist); 4637 hash_del(&cg->hlist);
4570 BUG_ON(!cg->subsys[ss->subsys_id]);
4571 cg->subsys[ss->subsys_id] = NULL; 4638 cg->subsys[ss->subsys_id] = NULL;
4572 hhead = css_set_hash(cg->subsys); 4639 key = css_set_hash(cg->subsys);
4573 hlist_add_head(&cg->hlist, hhead); 4640 hash_add(css_set_table, &cg->hlist, key);
4574 } 4641 }
4575 write_unlock(&css_set_lock); 4642 write_unlock(&css_set_lock);
4576 4643
4577 /* 4644 /*
4578 * remove subsystem's css from the dummytop and free it - need to free 4645 * remove subsystem's css from the dummytop and free it - need to
4579 * before marking as null because ss->destroy needs the cgrp->subsys 4646 * free before marking as null because ss->css_free needs the
4580 * pointer to find their state. note that this also takes care of 4647 * cgrp->subsys pointer to find their state. note that this also
4581 * freeing the css_id. 4648 * takes care of freeing the css_id.
4582 */ 4649 */
4583 ss->destroy(dummytop); 4650 ss->css_free(dummytop);
4584 dummytop->subsys[ss->subsys_id] = NULL; 4651 dummytop->subsys[ss->subsys_id] = NULL;
4585 4652
4586 mutex_unlock(&cgroup_mutex); 4653 mutex_unlock(&cgroup_mutex);
@@ -4612,9 +4679,6 @@ int __init cgroup_init_early(void)
4612 list_add(&init_css_set_link.cg_link_list, 4679 list_add(&init_css_set_link.cg_link_list,
4613 &init_css_set.cg_links); 4680 &init_css_set.cg_links);
4614 4681
4615 for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
4616 INIT_HLIST_HEAD(&css_set_table[i]);
4617
4618 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4682 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
4619 struct cgroup_subsys *ss = subsys[i]; 4683 struct cgroup_subsys *ss = subsys[i];
4620 4684
@@ -4624,8 +4688,8 @@ int __init cgroup_init_early(void)
4624 4688
4625 BUG_ON(!ss->name); 4689 BUG_ON(!ss->name);
4626 BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); 4690 BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
4627 BUG_ON(!ss->create); 4691 BUG_ON(!ss->css_alloc);
4628 BUG_ON(!ss->destroy); 4692 BUG_ON(!ss->css_free);
4629 if (ss->subsys_id != i) { 4693 if (ss->subsys_id != i) {
4630 printk(KERN_ERR "cgroup: Subsys %s id == %d\n", 4694 printk(KERN_ERR "cgroup: Subsys %s id == %d\n",
4631 ss->name, ss->subsys_id); 4695 ss->name, ss->subsys_id);
@@ -4648,7 +4712,7 @@ int __init cgroup_init(void)
4648{ 4712{
4649 int err; 4713 int err;
4650 int i; 4714 int i;
4651 struct hlist_head *hhead; 4715 unsigned long key;
4652 4716
4653 err = bdi_init(&cgroup_backing_dev_info); 4717 err = bdi_init(&cgroup_backing_dev_info);
4654 if (err) 4718 if (err)
@@ -4667,8 +4731,8 @@ int __init cgroup_init(void)
4667 } 4731 }
4668 4732
4669 /* Add init_css_set to the hash table */ 4733 /* Add init_css_set to the hash table */
4670 hhead = css_set_hash(init_css_set.subsys); 4734 key = css_set_hash(init_css_set.subsys);
4671 hlist_add_head(&init_css_set.hlist, hhead); 4735 hash_add(css_set_table, &init_css_set.hlist, key);
4672 BUG_ON(!init_root_id(&rootnode)); 4736 BUG_ON(!init_root_id(&rootnode));
4673 4737
4674 cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); 4738 cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
@@ -4832,44 +4896,19 @@ void cgroup_fork(struct task_struct *child)
4832} 4896}
4833 4897
4834/** 4898/**
4835 * cgroup_fork_callbacks - run fork callbacks
4836 * @child: the new task
4837 *
4838 * Called on a new task very soon before adding it to the
4839 * tasklist. No need to take any locks since no-one can
4840 * be operating on this task.
4841 */
4842void cgroup_fork_callbacks(struct task_struct *child)
4843{
4844 if (need_forkexit_callback) {
4845 int i;
4846 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
4847 struct cgroup_subsys *ss = subsys[i];
4848
4849 /*
4850 * forkexit callbacks are only supported for
4851 * builtin subsystems.
4852 */
4853 if (!ss || ss->module)
4854 continue;
4855
4856 if (ss->fork)
4857 ss->fork(child);
4858 }
4859 }
4860}
4861
4862/**
4863 * cgroup_post_fork - called on a new task after adding it to the task list 4899 * cgroup_post_fork - called on a new task after adding it to the task list
4864 * @child: the task in question 4900 * @child: the task in question
4865 * 4901 *
4866 * Adds the task to the list running through its css_set if necessary. 4902 * Adds the task to the list running through its css_set if necessary and
4867 * Has to be after the task is visible on the task list in case we race 4903 * call the subsystem fork() callbacks. Has to be after the task is
4868 * with the first call to cgroup_iter_start() - to guarantee that the 4904 * visible on the task list in case we race with the first call to
4869 * new task ends up on its list. 4905 * cgroup_iter_start() - to guarantee that the new task ends up on its
4906 * list.
4870 */ 4907 */
4871void cgroup_post_fork(struct task_struct *child) 4908void cgroup_post_fork(struct task_struct *child)
4872{ 4909{
4910 int i;
4911
4873 /* 4912 /*
4874 * use_task_css_set_links is set to 1 before we walk the tasklist 4913 * use_task_css_set_links is set to 1 before we walk the tasklist
4875 * under the tasklist_lock and we read it here after we added the child 4914 * under the tasklist_lock and we read it here after we added the child
@@ -4889,7 +4928,30 @@ void cgroup_post_fork(struct task_struct *child)
4889 task_unlock(child); 4928 task_unlock(child);
4890 write_unlock(&css_set_lock); 4929 write_unlock(&css_set_lock);
4891 } 4930 }
4931
4932 /*
4933 * Call ss->fork(). This must happen after @child is linked on
4934 * css_set; otherwise, @child might change state between ->fork()
4935 * and addition to css_set.
4936 */
4937 if (need_forkexit_callback) {
4938 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
4939 struct cgroup_subsys *ss = subsys[i];
4940
4941 /*
4942 * fork/exit callbacks are supported only for
4943 * builtin subsystems and we don't need further
4944 * synchronization as they never go away.
4945 */
4946 if (!ss || ss->module)
4947 continue;
4948
4949 if (ss->fork)
4950 ss->fork(child);
4951 }
4952 }
4892} 4953}
4954
4893/** 4955/**
4894 * cgroup_exit - detach cgroup from exiting task 4956 * cgroup_exit - detach cgroup from exiting task
4895 * @tsk: pointer to task_struct of exiting process 4957 * @tsk: pointer to task_struct of exiting process
@@ -4965,8 +5027,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
4965 } 5027 }
4966 task_unlock(tsk); 5028 task_unlock(tsk);
4967 5029
4968 if (cg) 5030 put_css_set_taskexit(cg);
4969 put_css_set_taskexit(cg);
4970} 5031}
4971 5032
4972/** 5033/**
@@ -5022,15 +5083,17 @@ static void check_for_release(struct cgroup *cgrp)
5022/* Caller must verify that the css is not for root cgroup */ 5083/* Caller must verify that the css is not for root cgroup */
5023bool __css_tryget(struct cgroup_subsys_state *css) 5084bool __css_tryget(struct cgroup_subsys_state *css)
5024{ 5085{
5025 do { 5086 while (true) {
5026 int v = css_refcnt(css); 5087 int t, v;
5027 5088
5028 if (atomic_cmpxchg(&css->refcnt, v, v + 1) == v) 5089 v = css_refcnt(css);
5090 t = atomic_cmpxchg(&css->refcnt, v, v + 1);
5091 if (likely(t == v))
5029 return true; 5092 return true;
5093 else if (t < 0)
5094 return false;
5030 cpu_relax(); 5095 cpu_relax();
5031 } while (!test_bit(CSS_REMOVED, &css->flags)); 5096 }
5032
5033 return false;
5034} 5097}
5035EXPORT_SYMBOL_GPL(__css_tryget); 5098EXPORT_SYMBOL_GPL(__css_tryget);
5036 5099
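The rewritten __css_tryget() above is a classic compare-and-swap retry loop, with one refinement: instead of testing a separate CSS_REMOVED bit, it gives up as soon as the observed refcount goes negative, which is how a dying css is now marked. A stand-alone C11 sketch of the same shape (simplified: the kernel goes through css_refcnt(), which also masks a deactivation bias; try_get and refcnt here are illustrative):

#include <stdatomic.h>
#include <stdbool.h>

/* refcount >= 0: object is live; refcount < 0: destruction has begun */
static bool try_get(atomic_int *refcnt)
{
	int v = atomic_load(refcnt);

	for (;;) {
		if (v < 0)
			return false;	/* already marked dead, give up */
		/* on failure, v is refreshed with the current value */
		if (atomic_compare_exchange_weak(refcnt, &v, v + 1))
			return true;
	}
}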
@@ -5049,11 +5112,9 @@ void __css_put(struct cgroup_subsys_state *css)
5049 set_bit(CGRP_RELEASABLE, &cgrp->flags); 5112 set_bit(CGRP_RELEASABLE, &cgrp->flags);
5050 check_for_release(cgrp); 5113 check_for_release(cgrp);
5051 } 5114 }
5052 cgroup_wakeup_rmdir_waiter(cgrp);
5053 break; 5115 break;
5054 case 0: 5116 case 0:
5055 if (!test_bit(CSS_CLEAR_CSS_REFS, &css->flags)) 5117 schedule_work(&css->dput_work);
5056 schedule_work(&css->dput_work);
5057 break; 5118 break;
5058 } 5119 }
5059 rcu_read_unlock(); 5120 rcu_read_unlock();
@@ -5257,7 +5318,7 @@ EXPORT_SYMBOL_GPL(free_css_id);
5257static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) 5318static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
5258{ 5319{
5259 struct css_id *newid; 5320 struct css_id *newid;
5260 int myid, error, size; 5321 int ret, size;
5261 5322
5262 BUG_ON(!ss->use_id); 5323 BUG_ON(!ss->use_id);
5263 5324
@@ -5265,35 +5326,24 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
5265 newid = kzalloc(size, GFP_KERNEL); 5326 newid = kzalloc(size, GFP_KERNEL);
5266 if (!newid) 5327 if (!newid)
5267 return ERR_PTR(-ENOMEM); 5328 return ERR_PTR(-ENOMEM);
5268 /* get id */ 5329
5269 if (unlikely(!idr_pre_get(&ss->idr, GFP_KERNEL))) { 5330 idr_preload(GFP_KERNEL);
5270 error = -ENOMEM;
5271 goto err_out;
5272 }
5273 spin_lock(&ss->id_lock); 5331 spin_lock(&ss->id_lock);
5274 /* Don't use 0. allocates an ID of 1-65535 */ 5332 /* Don't use 0. allocates an ID of 1-65535 */
5275 error = idr_get_new_above(&ss->idr, newid, 1, &myid); 5333 ret = idr_alloc(&ss->idr, newid, 1, CSS_ID_MAX + 1, GFP_NOWAIT);
5276 spin_unlock(&ss->id_lock); 5334 spin_unlock(&ss->id_lock);
5335 idr_preload_end();
5277 5336
5278 /* Returns error when there are no free spaces for new ID.*/ 5337 /* Returns error when there are no free spaces for new ID.*/
5279 if (error) { 5338 if (ret < 0)
5280 error = -ENOSPC;
5281 goto err_out; 5339 goto err_out;
5282 }
5283 if (myid > CSS_ID_MAX)
5284 goto remove_idr;
5285 5340
5286 newid->id = myid; 5341 newid->id = ret;
5287 newid->depth = depth; 5342 newid->depth = depth;
5288 return newid; 5343 return newid;
5289remove_idr:
5290 error = -ENOSPC;
5291 spin_lock(&ss->id_lock);
5292 idr_remove(&ss->idr, myid);
5293 spin_unlock(&ss->id_lock);
5294err_out: 5344err_out:
5295 kfree(newid); 5345 kfree(newid);
5296 return ERR_PTR(error); 5346 return ERR_PTR(ret);
5297 5347
5298} 5348}
5299 5349
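get_new_cssid() above drops the old idr_pre_get() + idr_get_new_above() two-step in favour of the idr_preload()/idr_alloc() pattern: preallocation happens while sleeping is still allowed, the allocation itself runs under the spinlock with GFP_NOWAIT, and the upper bound is enforced by idr_alloc() directly (its end argument is exclusive, hence CSS_ID_MAX + 1). Condensed to its essentials, the allocation path now has this shape:

	int id;

	idr_preload(GFP_KERNEL);		/* may sleep: preallocate here */
	spin_lock(&ss->id_lock);		/* then allocate under the lock */
	id = idr_alloc(&ss->idr, newid, 1, CSS_ID_MAX + 1, GFP_NOWAIT);
	spin_unlock(&ss->id_lock);
	idr_preload_end();

	if (id < 0)				/* -ENOMEM or -ENOSPC */
		return ERR_PTR(id);
	newid->id = id;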
@@ -5424,7 +5474,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
5424 struct inode *inode; 5474 struct inode *inode;
5425 struct cgroup_subsys_state *css; 5475 struct cgroup_subsys_state *css;
5426 5476
5427 inode = f->f_dentry->d_inode; 5477 inode = file_inode(f);
5428 /* check in cgroup filesystem dir */ 5478 /* check in cgroup filesystem dir */
5429 if (inode->i_op != &cgroup_dir_inode_operations) 5479 if (inode->i_op != &cgroup_dir_inode_operations)
5430 return ERR_PTR(-EBADF); 5480 return ERR_PTR(-EBADF);
@@ -5439,7 +5489,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
5439} 5489}
5440 5490
5441#ifdef CONFIG_CGROUP_DEBUG 5491#ifdef CONFIG_CGROUP_DEBUG
5442static struct cgroup_subsys_state *debug_create(struct cgroup *cont) 5492static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cont)
5443{ 5493{
5444 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); 5494 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
5445 5495
@@ -5449,7 +5499,7 @@ static struct cgroup_subsys_state *debug_create(struct cgroup *cont)
5449 return css; 5499 return css;
5450} 5500}
5451 5501
5452static void debug_destroy(struct cgroup *cont) 5502static void debug_css_free(struct cgroup *cont)
5453{ 5503{
5454 kfree(cont->subsys[debug_subsys_id]); 5504 kfree(cont->subsys[debug_subsys_id]);
5455} 5505}
@@ -5578,8 +5628,8 @@ static struct cftype debug_files[] = {
5578 5628
5579struct cgroup_subsys debug_subsys = { 5629struct cgroup_subsys debug_subsys = {
5580 .name = "debug", 5630 .name = "debug",
5581 .create = debug_create, 5631 .css_alloc = debug_css_alloc,
5582 .destroy = debug_destroy, 5632 .css_free = debug_css_free,
5583 .subsys_id = debug_subsys_id, 5633 .subsys_id = debug_subsys_id,
5584 .base_cftypes = debug_files, 5634 .base_cftypes = debug_files,
5585}; 5635};
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index b1724ce98981..75dda1ea5026 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -22,24 +22,33 @@
22#include <linux/freezer.h> 22#include <linux/freezer.h>
23#include <linux/seq_file.h> 23#include <linux/seq_file.h>
24 24
25enum freezer_state { 25/*
26 CGROUP_THAWED = 0, 26 * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is
27 CGROUP_FREEZING, 27 * set if "FROZEN" is written to freezer.state cgroupfs file, and cleared
28 CGROUP_FROZEN, 28 * for "THAWED". FREEZING_PARENT is set if the parent freezer is FREEZING
29 * for whatever reason. IOW, a cgroup has FREEZING_PARENT set if one of
30 * its ancestors has FREEZING_SELF set.
31 */
32enum freezer_state_flags {
33 CGROUP_FREEZER_ONLINE = (1 << 0), /* freezer is fully online */
34 CGROUP_FREEZING_SELF = (1 << 1), /* this freezer is freezing */
35 CGROUP_FREEZING_PARENT = (1 << 2), /* the parent freezer is freezing */
36 CGROUP_FROZEN = (1 << 3), /* this and its descendants frozen */
37
38 /* mask for all FREEZING flags */
39 CGROUP_FREEZING = CGROUP_FREEZING_SELF | CGROUP_FREEZING_PARENT,
29}; 40};
30 41
31struct freezer { 42struct freezer {
32 struct cgroup_subsys_state css; 43 struct cgroup_subsys_state css;
33 enum freezer_state state; 44 unsigned int state;
34 spinlock_t lock; /* protects _writes_ to state */ 45 spinlock_t lock;
35}; 46};
36 47
37static inline struct freezer *cgroup_freezer( 48static inline struct freezer *cgroup_freezer(struct cgroup *cgroup)
38 struct cgroup *cgroup)
39{ 49{
40 return container_of( 50 return container_of(cgroup_subsys_state(cgroup, freezer_subsys_id),
41 cgroup_subsys_state(cgroup, freezer_subsys_id), 51 struct freezer, css);
42 struct freezer, css);
43} 52}
44 53
45static inline struct freezer *task_freezer(struct task_struct *task) 54static inline struct freezer *task_freezer(struct task_struct *task)
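The enum above replaces the old three-value state machine with independent flag bits: a freezer can be FREEZING on its own behalf (SELF), because an ancestor is freezing (PARENT), or both, while FROZEN is a separate completion bit. A small stand-alone model of how those bits decode back into the three user-visible strings, using the same precedence the patch's freezer_state_strs() applies later in this diff (names shortened for the sketch):

#include <stdio.h>

enum {
	ONLINE		= 1 << 0,
	FREEZING_SELF	= 1 << 1,
	FREEZING_PARENT	= 1 << 2,
	FROZEN		= 1 << 3,
	FREEZING	= FREEZING_SELF | FREEZING_PARENT,
};

static const char *state_str(unsigned int state)
{
	if (state & FROZEN)
		return "FROZEN";	/* completion wins over in-progress */
	if (state & FREEZING)
		return "FREEZING";
	return "THAWED";
}

int main(void)
{
	/* freezing only because of an ancestor, not yet fully frozen */
	printf("%s\n", state_str(ONLINE | FREEZING_PARENT));		/* FREEZING */
	/* self-initiated freeze that has completed */
	printf("%s\n", state_str(ONLINE | FREEZING_SELF | FROZEN));	/* FROZEN */
	return 0;
}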
@@ -48,14 +57,21 @@ static inline struct freezer *task_freezer(struct task_struct *task)
48 struct freezer, css); 57 struct freezer, css);
49} 58}
50 59
60static struct freezer *parent_freezer(struct freezer *freezer)
61{
62 struct cgroup *pcg = freezer->css.cgroup->parent;
63
64 if (pcg)
65 return cgroup_freezer(pcg);
66 return NULL;
67}
68
51bool cgroup_freezing(struct task_struct *task) 69bool cgroup_freezing(struct task_struct *task)
52{ 70{
53 enum freezer_state state;
54 bool ret; 71 bool ret;
55 72
56 rcu_read_lock(); 73 rcu_read_lock();
57 state = task_freezer(task)->state; 74 ret = task_freezer(task)->state & CGROUP_FREEZING;
58 ret = state == CGROUP_FREEZING || state == CGROUP_FROZEN;
59 rcu_read_unlock(); 75 rcu_read_unlock();
60 76
61 return ret; 77 return ret;
@@ -65,70 +81,18 @@ bool cgroup_freezing(struct task_struct *task)
65 * cgroups_write_string() limits the size of freezer state strings to 81 * cgroups_write_string() limits the size of freezer state strings to
66 * CGROUP_LOCAL_BUFFER_SIZE 82 * CGROUP_LOCAL_BUFFER_SIZE
67 */ 83 */
68static const char *freezer_state_strs[] = { 84static const char *freezer_state_strs(unsigned int state)
69 "THAWED", 85{
70 "FREEZING", 86 if (state & CGROUP_FROZEN)
71 "FROZEN", 87 return "FROZEN";
88 if (state & CGROUP_FREEZING)
89 return "FREEZING";
90 return "THAWED";
72}; 91};
73 92
74/*
75 * State diagram
76 * Transitions are caused by userspace writes to the freezer.state file.
77 * The values in parenthesis are state labels. The rest are edge labels.
78 *
79 * (THAWED) --FROZEN--> (FREEZING) --FROZEN--> (FROZEN)
80 * ^ ^ | |
81 * | \_______THAWED_______/ |
82 * \__________________________THAWED____________/
83 */
84
85struct cgroup_subsys freezer_subsys; 93struct cgroup_subsys freezer_subsys;
86 94
87/* Locks taken and their ordering 95static struct cgroup_subsys_state *freezer_css_alloc(struct cgroup *cgroup)
88 * ------------------------------
89 * cgroup_mutex (AKA cgroup_lock)
90 * freezer->lock
91 * css_set_lock
92 * task->alloc_lock (AKA task_lock)
93 * task->sighand->siglock
94 *
95 * cgroup code forces css_set_lock to be taken before task->alloc_lock
96 *
97 * freezer_create(), freezer_destroy():
98 * cgroup_mutex [ by cgroup core ]
99 *
100 * freezer_can_attach():
101 * cgroup_mutex (held by caller of can_attach)
102 *
103 * freezer_fork() (preserving fork() performance means can't take cgroup_mutex):
104 * freezer->lock
105 * sighand->siglock (if the cgroup is freezing)
106 *
107 * freezer_read():
108 * cgroup_mutex
109 * freezer->lock
110 * write_lock css_set_lock (cgroup iterator start)
111 * task->alloc_lock
112 * read_lock css_set_lock (cgroup iterator start)
113 *
114 * freezer_write() (freeze):
115 * cgroup_mutex
116 * freezer->lock
117 * write_lock css_set_lock (cgroup iterator start)
118 * task->alloc_lock
119 * read_lock css_set_lock (cgroup iterator start)
120 * sighand->siglock (fake signal delivery inside freeze_task())
121 *
122 * freezer_write() (unfreeze):
123 * cgroup_mutex
124 * freezer->lock
125 * write_lock css_set_lock (cgroup iterator start)
126 * task->alloc_lock
127 * read_lock css_set_lock (cgroup iterator start)
128 * task->alloc_lock (inside __thaw_task(), prevents race with refrigerator())
129 * sighand->siglock
130 */
131static struct cgroup_subsys_state *freezer_create(struct cgroup *cgroup)
132{ 96{
133 struct freezer *freezer; 97 struct freezer *freezer;
134 98
@@ -137,160 +101,244 @@ static struct cgroup_subsys_state *freezer_create(struct cgroup *cgroup)
137 return ERR_PTR(-ENOMEM); 101 return ERR_PTR(-ENOMEM);
138 102
139 spin_lock_init(&freezer->lock); 103 spin_lock_init(&freezer->lock);
140 freezer->state = CGROUP_THAWED;
141 return &freezer->css; 104 return &freezer->css;
142} 105}
143 106
144static void freezer_destroy(struct cgroup *cgroup) 107/**
108 * freezer_css_online - commit creation of a freezer cgroup
109 * @cgroup: cgroup being created
110 *
111 * We're committing to creation of @cgroup. Mark it online and inherit
112 * parent's freezing state while holding both parent's and our
113 * freezer->lock.
114 */
115static int freezer_css_online(struct cgroup *cgroup)
116{
117 struct freezer *freezer = cgroup_freezer(cgroup);
118 struct freezer *parent = parent_freezer(freezer);
119
120 /*
121 * The following double locking and freezing state inheritance
122 * guarantee that @cgroup can never escape ancestors' freezing
123 * states. See cgroup_for_each_descendant_pre() for details.
124 */
125 if (parent)
126 spin_lock_irq(&parent->lock);
127 spin_lock_nested(&freezer->lock, SINGLE_DEPTH_NESTING);
128
129 freezer->state |= CGROUP_FREEZER_ONLINE;
130
131 if (parent && (parent->state & CGROUP_FREEZING)) {
132 freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN;
133 atomic_inc(&system_freezing_cnt);
134 }
135
136 spin_unlock(&freezer->lock);
137 if (parent)
138 spin_unlock_irq(&parent->lock);
139
140 return 0;
141}
142
143/**
144 * freezer_css_offline - initiate destruction of @cgroup
145 * @cgroup: cgroup being destroyed
146 *
147 * @cgroup is going away. Mark it dead and decrement system_freezing_count
148 * if it was holding one.
149 */
150static void freezer_css_offline(struct cgroup *cgroup)
145{ 151{
146 struct freezer *freezer = cgroup_freezer(cgroup); 152 struct freezer *freezer = cgroup_freezer(cgroup);
147 153
148 if (freezer->state != CGROUP_THAWED) 154 spin_lock_irq(&freezer->lock);
155
156 if (freezer->state & CGROUP_FREEZING)
149 atomic_dec(&system_freezing_cnt); 157 atomic_dec(&system_freezing_cnt);
150 kfree(freezer); 158
159 freezer->state = 0;
160
161 spin_unlock_irq(&freezer->lock);
151} 162}
152 163
153/* task is frozen or will freeze immediately when next it gets woken */ 164static void freezer_css_free(struct cgroup *cgroup)
154static bool is_task_frozen_enough(struct task_struct *task)
155{ 165{
156 return frozen(task) || 166 kfree(cgroup_freezer(cgroup));
157 (task_is_stopped_or_traced(task) && freezing(task));
158} 167}
159 168
160/* 169/*
161 * The call to cgroup_lock() in the freezer.state write method prevents 170 * Tasks can be migrated into a different freezer anytime regardless of its
162 * a write to that file racing against an attach, and hence the 171 * current state. freezer_attach() is responsible for making new tasks
163 * can_attach() result will remain valid until the attach completes. 172 * conform to the current state.
173 *
174 * Freezer state changes and task migration are synchronized via
175 * @freezer->lock. freezer_attach() makes the new tasks conform to the
176 * current state and all following state changes can see the new tasks.
164 */ 177 */
165static int freezer_can_attach(struct cgroup *new_cgroup, 178static void freezer_attach(struct cgroup *new_cgrp, struct cgroup_taskset *tset)
166 struct cgroup_taskset *tset)
167{ 179{
168 struct freezer *freezer; 180 struct freezer *freezer = cgroup_freezer(new_cgrp);
169 struct task_struct *task; 181 struct task_struct *task;
182 bool clear_frozen = false;
183
184 spin_lock_irq(&freezer->lock);
170 185
171 /* 186 /*
172 * Anything frozen can't move or be moved to/from. 187 * Make the new tasks conform to the current state of @new_cgrp.
188 * For simplicity, when migrating any task to a FROZEN cgroup, we
189 * revert it to FREEZING and let update_if_frozen() determine the
190 * correct state later.
191 *
192 * Tasks in @tset are on @new_cgrp but may not conform to its
193 * current state before executing the following - !frozen tasks may
194 * be visible in a FROZEN cgroup and frozen tasks in a THAWED one.
173 */ 195 */
174 cgroup_taskset_for_each(task, new_cgroup, tset) 196 cgroup_taskset_for_each(task, new_cgrp, tset) {
175 if (cgroup_freezing(task)) 197 if (!(freezer->state & CGROUP_FREEZING)) {
176 return -EBUSY; 198 __thaw_task(task);
199 } else {
200 freeze_task(task);
201 freezer->state &= ~CGROUP_FROZEN;
202 clear_frozen = true;
203 }
204 }
177 205
178 freezer = cgroup_freezer(new_cgroup); 206 spin_unlock_irq(&freezer->lock);
179 if (freezer->state != CGROUP_THAWED)
180 return -EBUSY;
181 207
182 return 0; 208 /*
209 * Propagate FROZEN clearing upwards. We may race with
210 * update_if_frozen(), but as long as both work bottom-up, either
211 * update_if_frozen() sees child's FROZEN cleared or we clear the
212 * parent's FROZEN later. No parent w/ !FROZEN children can be
213 * left FROZEN.
214 */
215 while (clear_frozen && (freezer = parent_freezer(freezer))) {
216 spin_lock_irq(&freezer->lock);
217 freezer->state &= ~CGROUP_FROZEN;
218 clear_frozen = freezer->state & CGROUP_FREEZING;
219 spin_unlock_irq(&freezer->lock);
220 }
183} 221}
184 222
185static void freezer_fork(struct task_struct *task) 223static void freezer_fork(struct task_struct *task)
186{ 224{
187 struct freezer *freezer; 225 struct freezer *freezer;
188 226
189 /*
190 * No lock is needed, since the task isn't on tasklist yet,
191 * so it can't be moved to another cgroup, which means the
192 * freezer won't be removed and will be valid during this
193 * function call. Nevertheless, apply RCU read-side critical
194 * section to suppress RCU lockdep false positives.
195 */
196 rcu_read_lock(); 227 rcu_read_lock();
197 freezer = task_freezer(task); 228 freezer = task_freezer(task);
198 rcu_read_unlock();
199 229
200 /* 230 /*
201 * The root cgroup is non-freezable, so we can skip the 231 * The root cgroup is non-freezable, so we can skip the
202 * following check. 232 * following check.
203 */ 233 */
204 if (!freezer->css.cgroup->parent) 234 if (!freezer->css.cgroup->parent)
205 return; 235 goto out;
206 236
207 spin_lock_irq(&freezer->lock); 237 spin_lock_irq(&freezer->lock);
208 BUG_ON(freezer->state == CGROUP_FROZEN); 238 if (freezer->state & CGROUP_FREEZING)
209
210 /* Locking avoids race with FREEZING -> THAWED transitions. */
211 if (freezer->state == CGROUP_FREEZING)
212 freeze_task(task); 239 freeze_task(task);
213 spin_unlock_irq(&freezer->lock); 240 spin_unlock_irq(&freezer->lock);
241out:
242 rcu_read_unlock();
214} 243}
215 244
216/* 245/**
217 * caller must hold freezer->lock 246 * update_if_frozen - update whether a cgroup finished freezing
247 * @cgroup: cgroup of interest
248 *
249 * Once FREEZING is initiated, transition to FROZEN is lazily updated by
250 * calling this function. If the current state is FREEZING but not FROZEN,
251 * this function checks whether all tasks of this cgroup and the descendant
252 * cgroups finished freezing and, if so, sets FROZEN.
253 *
254 * The caller is responsible for grabbing RCU read lock and calling
255 * update_if_frozen() on all descendants prior to invoking this function.
256 *
257 * Task states and freezer state might disagree while tasks are being
258 * migrated into or out of @cgroup, so we can't verify task states against
259 * @freezer state here. See freezer_attach() for details.
218 */ 260 */
219static void update_if_frozen(struct cgroup *cgroup, 261static void update_if_frozen(struct cgroup *cgroup)
220 struct freezer *freezer)
221{ 262{
263 struct freezer *freezer = cgroup_freezer(cgroup);
264 struct cgroup *pos;
222 struct cgroup_iter it; 265 struct cgroup_iter it;
223 struct task_struct *task; 266 struct task_struct *task;
224 unsigned int nfrozen = 0, ntotal = 0;
225 enum freezer_state old_state = freezer->state;
226 267
227 cgroup_iter_start(cgroup, &it); 268 WARN_ON_ONCE(!rcu_read_lock_held());
228 while ((task = cgroup_iter_next(cgroup, &it))) { 269
229 ntotal++; 270 spin_lock_irq(&freezer->lock);
230 if (freezing(task) && is_task_frozen_enough(task)) 271
231 nfrozen++; 272 if (!(freezer->state & CGROUP_FREEZING) ||
273 (freezer->state & CGROUP_FROZEN))
274 goto out_unlock;
275
276 /* are all (live) children frozen? */
277 cgroup_for_each_child(pos, cgroup) {
278 struct freezer *child = cgroup_freezer(pos);
279
280 if ((child->state & CGROUP_FREEZER_ONLINE) &&
281 !(child->state & CGROUP_FROZEN))
282 goto out_unlock;
232 } 283 }
233 284
234 if (old_state == CGROUP_THAWED) { 285 /* are all tasks frozen? */
235 BUG_ON(nfrozen > 0); 286 cgroup_iter_start(cgroup, &it);
236 } else if (old_state == CGROUP_FREEZING) { 287
237 if (nfrozen == ntotal) 288 while ((task = cgroup_iter_next(cgroup, &it))) {
238 freezer->state = CGROUP_FROZEN; 289 if (freezing(task)) {
239 } else { /* old_state == CGROUP_FROZEN */ 290 /*
240 BUG_ON(nfrozen != ntotal); 291 * freezer_should_skip() indicates that the task
292 * should be skipped when determining freezing
293 * completion. Consider it frozen in addition to
294 * the usual frozen condition.
295 */
296 if (!frozen(task) && !freezer_should_skip(task))
297 goto out_iter_end;
298 }
241 } 299 }
242 300
301 freezer->state |= CGROUP_FROZEN;
302out_iter_end:
243 cgroup_iter_end(cgroup, &it); 303 cgroup_iter_end(cgroup, &it);
304out_unlock:
305 spin_unlock_irq(&freezer->lock);
244} 306}
245 307
246static int freezer_read(struct cgroup *cgroup, struct cftype *cft, 308static int freezer_read(struct cgroup *cgroup, struct cftype *cft,
247 struct seq_file *m) 309 struct seq_file *m)
248{ 310{
249 struct freezer *freezer; 311 struct cgroup *pos;
250 enum freezer_state state;
251 312
252 if (!cgroup_lock_live_group(cgroup)) 313 rcu_read_lock();
253 return -ENODEV;
254 314
255 freezer = cgroup_freezer(cgroup); 315 /* update states bottom-up */
256 spin_lock_irq(&freezer->lock); 316 cgroup_for_each_descendant_post(pos, cgroup)
257 state = freezer->state; 317 update_if_frozen(pos);
258 if (state == CGROUP_FREEZING) { 318 update_if_frozen(cgroup);
259 /* We change from FREEZING to FROZEN lazily if the cgroup was 319
260 * only partially frozen when we exitted write. */ 320 rcu_read_unlock();
261 update_if_frozen(cgroup, freezer);
262 state = freezer->state;
263 }
264 spin_unlock_irq(&freezer->lock);
265 cgroup_unlock();
266 321
267 seq_puts(m, freezer_state_strs[state]); 322 seq_puts(m, freezer_state_strs(cgroup_freezer(cgroup)->state));
268 seq_putc(m, '\n'); 323 seq_putc(m, '\n');
269 return 0; 324 return 0;
270} 325}
271 326
272static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) 327static void freeze_cgroup(struct freezer *freezer)
273{ 328{
329 struct cgroup *cgroup = freezer->css.cgroup;
274 struct cgroup_iter it; 330 struct cgroup_iter it;
275 struct task_struct *task; 331 struct task_struct *task;
276 unsigned int num_cant_freeze_now = 0;
277 332
278 cgroup_iter_start(cgroup, &it); 333 cgroup_iter_start(cgroup, &it);
279 while ((task = cgroup_iter_next(cgroup, &it))) { 334 while ((task = cgroup_iter_next(cgroup, &it)))
280 if (!freeze_task(task)) 335 freeze_task(task);
281 continue;
282 if (is_task_frozen_enough(task))
283 continue;
284 if (!freezing(task) && !freezer_should_skip(task))
285 num_cant_freeze_now++;
286 }
287 cgroup_iter_end(cgroup, &it); 336 cgroup_iter_end(cgroup, &it);
288
289 return num_cant_freeze_now ? -EBUSY : 0;
290} 337}
291 338
292static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) 339static void unfreeze_cgroup(struct freezer *freezer)
293{ 340{
341 struct cgroup *cgroup = freezer->css.cgroup;
294 struct cgroup_iter it; 342 struct cgroup_iter it;
295 struct task_struct *task; 343 struct task_struct *task;
296 344
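freezer_attach() above ends with an upward walk: when migration knocks the target cgroup from FROZEN back to FREEZING, every ancestor must drop its FROZEN bit too, because no cgroup may report FROZEN while a descendant holds an unfrozen task. A stand-alone sketch of that walk over a toy parent-linked structure (struct node and propagate_frozen_clear are illustrative, not from the patch):

#include <stdbool.h>

enum {
	FREEZING_SELF	= 1 << 1,
	FREEZING_PARENT	= 1 << 2,
	FROZEN		= 1 << 3,
	FREEZING	= FREEZING_SELF | FREEZING_PARENT,
};

struct node {
	unsigned int state;
	struct node *parent;
};

/*
 * A not-yet-frozen task was attached to @n, so @n just lost its FROZEN
 * bit. Clear FROZEN upwards as well, stopping once an ancestor is no
 * longer freezing (mirrors the clear_frozen loop above).
 */
static void propagate_frozen_clear(struct node *n)
{
	bool clear = n->state & FREEZING;

	while (clear && (n = n->parent)) {
		n->state &= ~FROZEN;
		/* keep walking only while this ancestor is itself freezing */
		clear = n->state & FREEZING;
	}
}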
@@ -300,59 +348,111 @@ static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
300 cgroup_iter_end(cgroup, &it); 348 cgroup_iter_end(cgroup, &it);
301} 349}
302 350
303static int freezer_change_state(struct cgroup *cgroup, 351/**
304 enum freezer_state goal_state) 352 * freezer_apply_state - apply state change to a single cgroup_freezer
353 * @freezer: freezer to apply state change to
354 * @freeze: whether to freeze or unfreeze
355 * @state: CGROUP_FREEZING_* flag to set or clear
356 *
357 * Set or clear @state on @cgroup according to @freeze, and perform
358 * freezing or thawing as necessary.
359 */
360static void freezer_apply_state(struct freezer *freezer, bool freeze,
361 unsigned int state)
305{ 362{
306 struct freezer *freezer; 363 /* also synchronizes against task migration, see freezer_attach() */
307 int retval = 0; 364 lockdep_assert_held(&freezer->lock);
308
309 freezer = cgroup_freezer(cgroup);
310 365
311 spin_lock_irq(&freezer->lock); 366 if (!(freezer->state & CGROUP_FREEZER_ONLINE))
367 return;
312 368
313 update_if_frozen(cgroup, freezer); 369 if (freeze) {
314 370 if (!(freezer->state & CGROUP_FREEZING))
315 switch (goal_state) {
316 case CGROUP_THAWED:
317 if (freezer->state != CGROUP_THAWED)
318 atomic_dec(&system_freezing_cnt);
319 freezer->state = CGROUP_THAWED;
320 unfreeze_cgroup(cgroup, freezer);
321 break;
322 case CGROUP_FROZEN:
323 if (freezer->state == CGROUP_THAWED)
324 atomic_inc(&system_freezing_cnt); 371 atomic_inc(&system_freezing_cnt);
325 freezer->state = CGROUP_FREEZING; 372 freezer->state |= state;
326 retval = try_to_freeze_cgroup(cgroup, freezer); 373 freeze_cgroup(freezer);
327 break; 374 } else {
328 default: 375 bool was_freezing = freezer->state & CGROUP_FREEZING;
329 BUG(); 376
377 freezer->state &= ~state;
378
379 if (!(freezer->state & CGROUP_FREEZING)) {
380 if (was_freezing)
381 atomic_dec(&system_freezing_cnt);
382 freezer->state &= ~CGROUP_FROZEN;
383 unfreeze_cgroup(freezer);
384 }
330 } 385 }
386}
331 387
388/**
389 * freezer_change_state - change the freezing state of a cgroup_freezer
390 * @freezer: freezer of interest
391 * @freeze: whether to freeze or thaw
392 *
393 * Freeze or thaw @freezer according to @freeze. The operations are
394 * recursive - all descendants of @freezer will be affected.
395 */
396static void freezer_change_state(struct freezer *freezer, bool freeze)
397{
398 struct cgroup *pos;
399
400 /* update @freezer */
401 spin_lock_irq(&freezer->lock);
402 freezer_apply_state(freezer, freeze, CGROUP_FREEZING_SELF);
332 spin_unlock_irq(&freezer->lock); 403 spin_unlock_irq(&freezer->lock);
333 404
334 return retval; 405 /*
406 * Update all its descendants in pre-order traversal. Each
407 * descendant will try to inherit its parent's FREEZING state as
408 * CGROUP_FREEZING_PARENT.
409 */
410 rcu_read_lock();
411 cgroup_for_each_descendant_pre(pos, freezer->css.cgroup) {
412 struct freezer *pos_f = cgroup_freezer(pos);
413 struct freezer *parent = parent_freezer(pos_f);
414
415 /*
416 * Our update to @parent->state is already visible which is
417 * all we need. No need to lock @parent. For more info on
 418 * synchronization, see freezer_css_online().
419 */
420 spin_lock_irq(&pos_f->lock);
421 freezer_apply_state(pos_f, parent->state & CGROUP_FREEZING,
422 CGROUP_FREEZING_PARENT);
423 spin_unlock_irq(&pos_f->lock);
424 }
425 rcu_read_unlock();
335} 426}
336 427
337static int freezer_write(struct cgroup *cgroup, 428static int freezer_write(struct cgroup *cgroup, struct cftype *cft,
338 struct cftype *cft,
339 const char *buffer) 429 const char *buffer)
340{ 430{
341 int retval; 431 bool freeze;
342 enum freezer_state goal_state;
343 432
344 if (strcmp(buffer, freezer_state_strs[CGROUP_THAWED]) == 0) 433 if (strcmp(buffer, freezer_state_strs(0)) == 0)
345 goal_state = CGROUP_THAWED; 434 freeze = false;
346 else if (strcmp(buffer, freezer_state_strs[CGROUP_FROZEN]) == 0) 435 else if (strcmp(buffer, freezer_state_strs(CGROUP_FROZEN)) == 0)
347 goal_state = CGROUP_FROZEN; 436 freeze = true;
348 else 437 else
349 return -EINVAL; 438 return -EINVAL;
350 439
351 if (!cgroup_lock_live_group(cgroup)) 440 freezer_change_state(cgroup_freezer(cgroup), freeze);
352 return -ENODEV; 441 return 0;
353 retval = freezer_change_state(cgroup, goal_state); 442}
354 cgroup_unlock(); 443
355 return retval; 444static u64 freezer_self_freezing_read(struct cgroup *cgroup, struct cftype *cft)
445{
446 struct freezer *freezer = cgroup_freezer(cgroup);
447
448 return (bool)(freezer->state & CGROUP_FREEZING_SELF);
449}
450
451static u64 freezer_parent_freezing_read(struct cgroup *cgroup, struct cftype *cft)
452{
453 struct freezer *freezer = cgroup_freezer(cgroup);
454
455 return (bool)(freezer->state & CGROUP_FREEZING_PARENT);
356} 456}
357 457
358static struct cftype files[] = { 458static struct cftype files[] = {
@@ -362,23 +462,27 @@ static struct cftype files[] = {
362 .read_seq_string = freezer_read, 462 .read_seq_string = freezer_read,
363 .write_string = freezer_write, 463 .write_string = freezer_write,
364 }, 464 },
465 {
466 .name = "self_freezing",
467 .flags = CFTYPE_NOT_ON_ROOT,
468 .read_u64 = freezer_self_freezing_read,
469 },
470 {
471 .name = "parent_freezing",
472 .flags = CFTYPE_NOT_ON_ROOT,
473 .read_u64 = freezer_parent_freezing_read,
474 },
365 { } /* terminate */ 475 { } /* terminate */
366}; 476};
367 477
368struct cgroup_subsys freezer_subsys = { 478struct cgroup_subsys freezer_subsys = {
369 .name = "freezer", 479 .name = "freezer",
370 .create = freezer_create, 480 .css_alloc = freezer_css_alloc,
371 .destroy = freezer_destroy, 481 .css_online = freezer_css_online,
482 .css_offline = freezer_css_offline,
483 .css_free = freezer_css_free,
372 .subsys_id = freezer_subsys_id, 484 .subsys_id = freezer_subsys_id,
373 .can_attach = freezer_can_attach, 485 .attach = freezer_attach,
374 .fork = freezer_fork, 486 .fork = freezer_fork,
375 .base_cftypes = files, 487 .base_cftypes = files,
376
377 /*
378 * freezer subsys doesn't handle hierarchy at all. Frozen state
379 * should be inherited through the hierarchy - if a parent is
380 * frozen, all its children should be frozen. Fix it and remove
381 * the following.
382 */
383 .broken_hierarchy = true,
384}; 488};
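With the cftype additions above, every non-root freezer cgroup now exposes freezer.self_freezing and freezer.parent_freezing next to freezer.state. A small userspace sketch of how the three files fit together, assuming a v1 freezer hierarchy mounted at the conventional /sys/fs/cgroup/freezer path and an existing child group named demo (both are assumptions for the example, not part of the patch):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

#define GRP "/sys/fs/cgroup/freezer/demo"	/* assumed mount point + group */

static void show(const char *path, const char *label)
{
	char buf[64];
	ssize_t n;
	int fd = open(path, O_RDONLY);

	if (fd < 0)
		return;
	n = read(fd, buf, sizeof(buf) - 1);
	close(fd);
	if (n > 0) {
		buf[n] = '\0';
		printf("%s: %s", label, buf);	/* the files end with '\n' */
	}
}

int main(void)
{
	int fd = open(GRP "/freezer.state", O_WRONLY);

	/* ask the group (and, recursively, its descendants) to freeze */
	if (fd < 0 || write(fd, "FROZEN", 6) != 6)
		perror("write freezer.state");
	if (fd >= 0)
		close(fd);

	show(GRP "/freezer.state", "state");			/* FREEZING or FROZEN */
	show(GRP "/freezer.self_freezing", "self_freezing");	/* 1: frozen by this write */
	show(GRP "/freezer.parent_freezing", "parent_freezing"); /* 1 only if an ancestor froze it */
	return 0;
}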
diff --git a/kernel/compat.c b/kernel/compat.c
index c28a306ae05c..19971d8c7299 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -290,8 +290,8 @@ static inline long put_compat_itimerval(struct compat_itimerval __user *o,
290 __put_user(i->it_value.tv_usec, &o->it_value.tv_usec))); 290 __put_user(i->it_value.tv_usec, &o->it_value.tv_usec)));
291} 291}
292 292
293asmlinkage long compat_sys_getitimer(int which, 293COMPAT_SYSCALL_DEFINE2(getitimer, int, which,
294 struct compat_itimerval __user *it) 294 struct compat_itimerval __user *, it)
295{ 295{
296 struct itimerval kit; 296 struct itimerval kit;
297 int error; 297 int error;
@@ -302,9 +302,9 @@ asmlinkage long compat_sys_getitimer(int which,
302 return error; 302 return error;
303} 303}
304 304
305asmlinkage long compat_sys_setitimer(int which, 305COMPAT_SYSCALL_DEFINE3(setitimer, int, which,
306 struct compat_itimerval __user *in, 306 struct compat_itimerval __user *, in,
307 struct compat_itimerval __user *out) 307 struct compat_itimerval __user *, out)
308{ 308{
309 struct itimerval kin, kout; 309 struct itimerval kin, kout;
310 int error; 310 int error;
@@ -381,9 +381,9 @@ static inline void compat_sig_setmask(sigset_t *blocked, compat_sigset_word set)
381 memcpy(blocked->sig, &set, sizeof(set)); 381 memcpy(blocked->sig, &set, sizeof(set));
382} 382}
383 383
384asmlinkage long compat_sys_sigprocmask(int how, 384COMPAT_SYSCALL_DEFINE3(sigprocmask, int, how,
385 compat_old_sigset_t __user *nset, 385 compat_old_sigset_t __user *, nset,
386 compat_old_sigset_t __user *oset) 386 compat_old_sigset_t __user *, oset)
387{ 387{
388 old_sigset_t old_set, new_set; 388 old_sigset_t old_set, new_set;
389 sigset_t new_blocked; 389 sigset_t new_blocked;
@@ -535,9 +535,11 @@ asmlinkage long compat_sys_getrusage(int who, struct compat_rusage __user *ru)
535 return 0; 535 return 0;
536} 536}
537 537
538asmlinkage long 538COMPAT_SYSCALL_DEFINE4(wait4,
539compat_sys_wait4(compat_pid_t pid, compat_uint_t __user *stat_addr, int options, 539 compat_pid_t, pid,
540 struct compat_rusage __user *ru) 540 compat_uint_t __user *, stat_addr,
541 int, options,
542 struct compat_rusage __user *, ru)
541{ 543{
542 if (!ru) { 544 if (!ru) {
543 return sys_wait4(pid, stat_addr, options, NULL); 545 return sys_wait4(pid, stat_addr, options, NULL);
@@ -564,9 +566,10 @@ compat_sys_wait4(compat_pid_t pid, compat_uint_t __user *stat_addr, int options,
564 } 566 }
565} 567}
566 568
567asmlinkage long compat_sys_waitid(int which, compat_pid_t pid, 569COMPAT_SYSCALL_DEFINE5(waitid,
568 struct compat_siginfo __user *uinfo, int options, 570 int, which, compat_pid_t, pid,
569 struct compat_rusage __user *uru) 571 struct compat_siginfo __user *, uinfo, int, options,
572 struct compat_rusage __user *, uru)
570{ 573{
571 siginfo_t info; 574 siginfo_t info;
572 struct rusage ru; 575 struct rusage ru;
@@ -584,9 +587,13 @@ asmlinkage long compat_sys_waitid(int which, compat_pid_t pid,
584 return ret; 587 return ret;
585 588
586 if (uru) { 589 if (uru) {
587 ret = put_compat_rusage(&ru, uru); 590 /* sys_waitid() overwrites everything in ru */
591 if (COMPAT_USE_64BIT_TIME)
592 ret = copy_to_user(uru, &ru, sizeof(ru));
593 else
594 ret = put_compat_rusage(&ru, uru);
588 if (ret) 595 if (ret)
589 return ret; 596 return -EFAULT;
590 } 597 }
591 598
592 BUG_ON(info.si_code & __SI_MASK); 599 BUG_ON(info.si_code & __SI_MASK);
@@ -964,7 +971,7 @@ long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask,
964} 971}
965 972
966void 973void
967sigset_from_compat (sigset_t *set, compat_sigset_t *compat) 974sigset_from_compat(sigset_t *set, const compat_sigset_t *compat)
968{ 975{
969 switch (_NSIG_WORDS) { 976 switch (_NSIG_WORDS) {
970 case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 ); 977 case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 );
@@ -975,10 +982,20 @@ sigset_from_compat (sigset_t *set, compat_sigset_t *compat)
975} 982}
976EXPORT_SYMBOL_GPL(sigset_from_compat); 983EXPORT_SYMBOL_GPL(sigset_from_compat);
977 984
978asmlinkage long 985void
979compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, 986sigset_to_compat(compat_sigset_t *compat, const sigset_t *set)
980 struct compat_siginfo __user *uinfo, 987{
981 struct compat_timespec __user *uts, compat_size_t sigsetsize) 988 switch (_NSIG_WORDS) {
989 case 4: compat->sig[7] = (set->sig[3] >> 32); compat->sig[6] = set->sig[3];
990 case 3: compat->sig[5] = (set->sig[2] >> 32); compat->sig[4] = set->sig[2];
991 case 2: compat->sig[3] = (set->sig[1] >> 32); compat->sig[2] = set->sig[1];
992 case 1: compat->sig[1] = (set->sig[0] >> 32); compat->sig[0] = set->sig[0];
993 }
994}
995
996COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese,
997 struct compat_siginfo __user *, uinfo,
998 struct compat_timespec __user *, uts, compat_size_t, sigsetsize)
982{ 999{
983 compat_sigset_t s32; 1000 compat_sigset_t s32;
984 sigset_t s; 1001 sigset_t s;
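sigset_to_compat(), added above, is the mirror image of the existing sigset_from_compat(): each 64-bit kernel signal word is split into a low and a high 32-bit compat word, and the switch falls through from the highest _NSIG_WORDS case down to 1. A stand-alone round trip of that packing for the common single-64-bit-word layout (fixed-width stand-ins for the kernel types; all names here are illustrative):

#include <assert.h>
#include <stdint.h>

#define NSIG_WORDS 1			/* e.g. x86-64: one 64-bit signal word */

struct ksigset { uint64_t sig[NSIG_WORDS]; };
struct csigset { uint32_t sig[2 * NSIG_WORDS]; };

static void to_compat(struct csigset *c, const struct ksigset *k)
{
	c->sig[1] = (uint32_t)(k->sig[0] >> 32);	/* high half second */
	c->sig[0] = (uint32_t)k->sig[0];		/* low half first */
}

static void from_compat(struct ksigset *k, const struct csigset *c)
{
	k->sig[0] = c->sig[0] | ((uint64_t)c->sig[1] << 32);
}

int main(void)
{
	struct ksigset k = { { (1ULL << 63) | 0xdeadbeefULL } };
	struct csigset c;
	struct ksigset back;

	to_compat(&c, &k);
	from_compat(&back, &c);
	assert(back.sig[0] == k.sig[0]);	/* the packing is lossless */
	return 0;
}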
@@ -994,7 +1011,7 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
994 sigset_from_compat(&s, &s32); 1011 sigset_from_compat(&s, &s32);
995 1012
996 if (uts) { 1013 if (uts) {
997 if (get_compat_timespec(&t, uts)) 1014 if (compat_get_timespec(&t, uts))
998 return -EFAULT; 1015 return -EFAULT;
999 } 1016 }
1000 1017
@@ -1006,18 +1023,6 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
1006 } 1023 }
1007 1024
1008 return ret; 1025 return ret;
1009
1010}
1011
1012asmlinkage long
1013compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid, compat_pid_t pid, int sig,
1014 struct compat_siginfo __user *uinfo)
1015{
1016 siginfo_t info;
1017
1018 if (copy_siginfo_from_user32(&info, uinfo))
1019 return -EFAULT;
1020 return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);
1021} 1026}
1022 1027
1023#ifdef __ARCH_WANT_COMPAT_SYS_TIME 1028#ifdef __ARCH_WANT_COMPAT_SYS_TIME
@@ -1060,23 +1065,6 @@ asmlinkage long compat_sys_stime(compat_time_t __user *tptr)
1060 1065
1061#endif /* __ARCH_WANT_COMPAT_SYS_TIME */ 1066#endif /* __ARCH_WANT_COMPAT_SYS_TIME */
1062 1067
1063#ifdef __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND
1064asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat_size_t sigsetsize)
1065{
1066 sigset_t newset;
1067 compat_sigset_t newset32;
1068
1069 /* XXX: Don't preclude handling different sized sigset_t's. */
1070 if (sigsetsize != sizeof(sigset_t))
1071 return -EINVAL;
1072
1073 if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t)))
1074 return -EFAULT;
1075 sigset_from_compat(&newset, &newset32);
1076 return sigsuspend(&newset);
1077}
1078#endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */
1079
1080asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp) 1068asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp)
1081{ 1069{
1082 struct timex txc; 1070 struct timex txc;
@@ -1215,6 +1203,22 @@ compat_sys_sysinfo(struct compat_sysinfo __user *info)
1215 return 0; 1203 return 0;
1216} 1204}
1217 1205
1206COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval,
1207 compat_pid_t, pid,
1208 struct compat_timespec __user *, interval)
1209{
1210 struct timespec t;
1211 int ret;
1212 mm_segment_t old_fs = get_fs();
1213
1214 set_fs(KERNEL_DS);
1215 ret = sys_sched_rr_get_interval(pid, (struct timespec __user *)&t);
1216 set_fs(old_fs);
1217 if (put_compat_timespec(&t, interval))
1218 return -EFAULT;
1219 return ret;
1220}
1221
1218/* 1222/*
1219 * Allocate user-space memory for the duration of a single system call, 1223 * Allocate user-space memory for the duration of a single system call,
1220 * in order to marshall parameters inside a compat thunk. 1224 * in order to marshall parameters inside a compat thunk.
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
new file mode 100644
index 000000000000..65349f07b878
--- /dev/null
+++ b/kernel/context_tracking.c
@@ -0,0 +1,145 @@
1/*
2 * Context tracking: Probe on high level context boundaries such as kernel
3 * and userspace. This includes syscalls and exceptions entry/exit.
4 *
5 * This is used by RCU to remove its dependency on the timer tick while a CPU
6 * runs in userspace.
7 *
8 * Started by Frederic Weisbecker:
9 *
10 * Copyright (C) 2012 Red Hat, Inc., Frederic Weisbecker <fweisbec@redhat.com>
11 *
12 * Many thanks to Gilad Ben-Yossef, Paul McKenney, Ingo Molnar, Andrew Morton,
13 * Steven Rostedt, Peter Zijlstra for suggestions and improvements.
14 *
15 */
16
17#include <linux/context_tracking.h>
18#include <linux/kvm_host.h>
19#include <linux/rcupdate.h>
20#include <linux/sched.h>
21#include <linux/hardirq.h>
22#include <linux/export.h>
23
24DEFINE_PER_CPU(struct context_tracking, context_tracking) = {
25#ifdef CONFIG_CONTEXT_TRACKING_FORCE
26 .active = true,
27#endif
28};
29
30/**
31 * user_enter - Inform the context tracking that the CPU is going to
32 * enter userspace mode.
33 *
34 * This function must be called right before we switch from the kernel
35 * to userspace, when it's guaranteed the remaining kernel instructions
36 * to execute won't use any RCU read side critical section because this
37 * function sets RCU in extended quiescent state.
38 */
39void user_enter(void)
40{
41 unsigned long flags;
42
43 /*
 44 * Some contexts may involve an exception occurring in an irq,
45 * leading to that nesting:
46 * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit()
47 * This would mess up the dyntick_nesting count though. And rcu_irq_*()
48 * helpers are enough to protect RCU uses inside the exception. So
49 * just return immediately if we detect we are in an IRQ.
50 */
51 if (in_interrupt())
52 return;
53
54 /* Kernel threads aren't supposed to go to userspace */
55 WARN_ON_ONCE(!current->mm);
56
57 local_irq_save(flags);
58 if (__this_cpu_read(context_tracking.active) &&
59 __this_cpu_read(context_tracking.state) != IN_USER) {
60 /*
61 * At this stage, only low level arch entry code remains and
62 * then we'll run in userspace. We can assume there won't be
63 * any RCU read-side critical section until the next call to
64 * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency
65 * on the tick.
66 */
67 vtime_user_enter(current);
68 rcu_user_enter();
69 __this_cpu_write(context_tracking.state, IN_USER);
70 }
71 local_irq_restore(flags);
72}
73
74
75/**
76 * user_exit - Inform the context tracking that the CPU is
77 * exiting userspace mode and entering the kernel.
78 *
79 * This function must be called after we entered the kernel from userspace
 80 * before any use of RCU read side critical section. This potentially includes
81 * any high level kernel code like syscalls, exceptions, signal handling, etc...
82 *
83 * This call supports re-entrancy. This way it can be called from any exception
84 * handler without needing to know if we came from userspace or not.
85 */
86void user_exit(void)
87{
88 unsigned long flags;
89
90 if (in_interrupt())
91 return;
92
93 local_irq_save(flags);
94 if (__this_cpu_read(context_tracking.state) == IN_USER) {
95 /*
96 * We are going to run code that may use RCU. Inform
97 * RCU core about that (ie: we may need the tick again).
98 */
99 rcu_user_exit();
100 vtime_user_exit(current);
101 __this_cpu_write(context_tracking.state, IN_KERNEL);
102 }
103 local_irq_restore(flags);
104}
105
106void guest_enter(void)
107{
108 if (vtime_accounting_enabled())
109 vtime_guest_enter(current);
110 else
111 __guest_enter();
112}
113EXPORT_SYMBOL_GPL(guest_enter);
114
115void guest_exit(void)
116{
117 if (vtime_accounting_enabled())
118 vtime_guest_exit(current);
119 else
120 __guest_exit();
121}
122EXPORT_SYMBOL_GPL(guest_exit);
123
124
125/**
126 * context_tracking_task_switch - context switch the syscall callbacks
127 * @prev: the task that is being switched out
128 * @next: the task that is being switched in
129 *
130 * The context tracking uses the syscall slow path to implement its user-kernel
131 * boundaries probes on syscalls. This way it doesn't impact the syscall fast
132 * path on CPUs that don't do context tracking.
133 *
134 * But we need to clear the flag on the previous task because it may later
135 * migrate to some CPU that doesn't do the context tracking. As such the TIF
136 * flag may not be desired there.
137 */
138void context_tracking_task_switch(struct task_struct *prev,
139 struct task_struct *next)
140{
141 if (__this_cpu_read(context_tracking.active)) {
142 clear_tsk_thread_flag(prev, TIF_NOHZ);
143 set_tsk_thread_flag(next, TIF_NOHZ);
144 }
145}
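The comments in this new file spell out the contract: user_exit() must run once the kernel is entered from userspace and before any RCU read-side use, user_enter() right before returning to userspace, and context_tracking_task_switch() shuffles TIF_NOHZ so only the syscall slow path pays for the probes. A purely schematic sketch of where an architecture's slow path would place the two calls (demo_syscall_slowpath and do_syscall are invented names; real architectures wire this up in their own entry code):

/* Illustrative only: not actual arch entry code. */
static void demo_syscall_slowpath(unsigned long nr)
{
	user_exit();		/* back in the kernel: RCU may be used again */

	do_syscall(nr);		/* hypothetical dispatch into kernel proper */

	user_enter();		/* about to return: RCU drops the tick dependency */
}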
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 42bd331ee0ab..b5e4ab2d427e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -224,11 +224,13 @@ void clear_tasks_mm_cpumask(int cpu)
224static inline void check_for_tasks(int cpu) 224static inline void check_for_tasks(int cpu)
225{ 225{
226 struct task_struct *p; 226 struct task_struct *p;
227 cputime_t utime, stime;
227 228
228 write_lock_irq(&tasklist_lock); 229 write_lock_irq(&tasklist_lock);
229 for_each_process(p) { 230 for_each_process(p) {
231 task_cputime(p, &utime, &stime);
230 if (task_cpu(p) == cpu && p->state == TASK_RUNNING && 232 if (task_cpu(p) == cpu && p->state == TASK_RUNNING &&
231 (p->utime || p->stime)) 233 (utime || stime))
232 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d " 234 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d "
233 "(state = %ld, flags = %x)\n", 235 "(state = %ld, flags = %x)\n",
234 p->comm, task_pid_nr(p), cpu, 236 p->comm, task_pid_nr(p), cpu,
@@ -254,6 +256,8 @@ static int __ref take_cpu_down(void *_param)
254 return err; 256 return err;
255 257
256 cpu_notify(CPU_DYING | param->mod, param->hcpu); 258 cpu_notify(CPU_DYING | param->mod, param->hcpu);
259 /* Park the stopper thread */
260 kthread_park(current);
257 return 0; 261 return 0;
258} 262}
259 263
@@ -348,11 +352,13 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
348 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; 352 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
349 struct task_struct *idle; 353 struct task_struct *idle;
350 354
351 if (cpu_online(cpu) || !cpu_present(cpu))
352 return -EINVAL;
353
354 cpu_hotplug_begin(); 355 cpu_hotplug_begin();
355 356
357 if (cpu_online(cpu) || !cpu_present(cpu)) {
358 ret = -EINVAL;
359 goto out;
360 }
361
356 idle = idle_thread_get(cpu); 362 idle = idle_thread_get(cpu);
357 if (IS_ERR(idle)) { 363 if (IS_ERR(idle)) {
358 ret = PTR_ERR(idle); 364 ret = PTR_ERR(idle);
@@ -601,6 +607,11 @@ cpu_hotplug_pm_callback(struct notifier_block *nb,
601 607
602static int __init cpu_hotplug_pm_sync_init(void) 608static int __init cpu_hotplug_pm_sync_init(void)
603{ 609{
610 /*
611 * cpu_hotplug_pm_callback has higher priority than x86
612 * bsp_pm_callback which depends on cpu_hotplug_pm_callback
613 * to disable cpu hotplug to avoid cpu hotplug race.
614 */
604 pm_notifier(cpu_hotplug_pm_callback, 0); 615 pm_notifier(cpu_hotplug_pm_callback, 0);
605 return 0; 616 return 0;
606} 617}
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index f33c7153b6d7..4f9dfe43ecbd 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -61,14 +61,6 @@
61#include <linux/cgroup.h> 61#include <linux/cgroup.h>
62 62
63/* 63/*
64 * Workqueue for cpuset related tasks.
65 *
66 * Using kevent workqueue may cause deadlock when memory_migrate
67 * is set. So we create a separate workqueue thread for cpuset.
68 */
69static struct workqueue_struct *cpuset_wq;
70
71/*
72 * Tracks how many cpusets are currently defined in system. 64 * Tracks how many cpusets are currently defined in system.
73 * When there is only one cpuset (the root cpuset) we can 65 * When there is only one cpuset (the root cpuset) we can
74 * short circuit some hooks. 66 * short circuit some hooks.
@@ -95,18 +87,21 @@ struct cpuset {
95 cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ 87 cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */
96 nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ 88 nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */
97 89
98 struct cpuset *parent; /* my parent */
99
100 struct fmeter fmeter; /* memory_pressure filter */ 90 struct fmeter fmeter; /* memory_pressure filter */
101 91
92 /*
93 * Tasks are being attached to this cpuset. Used to prevent
94 * zeroing cpus/mems_allowed between ->can_attach() and ->attach().
95 */
96 int attach_in_progress;
97
102 /* partition number for rebuild_sched_domains() */ 98 /* partition number for rebuild_sched_domains() */
103 int pn; 99 int pn;
104 100
105 /* for custom sched domain */ 101 /* for custom sched domain */
106 int relax_domain_level; 102 int relax_domain_level;
107 103
108 /* used for walking a cpuset hierarchy */ 104 struct work_struct hotplug_work;
109 struct list_head stack_list;
110}; 105};
111 106
112/* Retrieve the cpuset for a cgroup */ 107/* Retrieve the cpuset for a cgroup */
@@ -123,6 +118,15 @@ static inline struct cpuset *task_cs(struct task_struct *task)
123 struct cpuset, css); 118 struct cpuset, css);
124} 119}
125 120
121static inline struct cpuset *parent_cs(const struct cpuset *cs)
122{
123 struct cgroup *pcgrp = cs->css.cgroup->parent;
124
125 if (pcgrp)
126 return cgroup_cs(pcgrp);
127 return NULL;
128}
129
126#ifdef CONFIG_NUMA 130#ifdef CONFIG_NUMA
127static inline bool task_has_mempolicy(struct task_struct *task) 131static inline bool task_has_mempolicy(struct task_struct *task)
128{ 132{
@@ -138,6 +142,7 @@ static inline bool task_has_mempolicy(struct task_struct *task)
138 142
139/* bits in struct cpuset flags field */ 143/* bits in struct cpuset flags field */
140typedef enum { 144typedef enum {
145 CS_ONLINE,
141 CS_CPU_EXCLUSIVE, 146 CS_CPU_EXCLUSIVE,
142 CS_MEM_EXCLUSIVE, 147 CS_MEM_EXCLUSIVE,
143 CS_MEM_HARDWALL, 148 CS_MEM_HARDWALL,
@@ -147,13 +152,12 @@ typedef enum {
147 CS_SPREAD_SLAB, 152 CS_SPREAD_SLAB,
148} cpuset_flagbits_t; 153} cpuset_flagbits_t;
149 154
150/* the type of hotplug event */
151enum hotplug_event {
152 CPUSET_CPU_OFFLINE,
153 CPUSET_MEM_OFFLINE,
154};
155
156/* convenient tests for these bits */ 155/* convenient tests for these bits */
156static inline bool is_cpuset_online(const struct cpuset *cs)
157{
158 return test_bit(CS_ONLINE, &cs->flags);
159}
160
157static inline int is_cpu_exclusive(const struct cpuset *cs) 161static inline int is_cpu_exclusive(const struct cpuset *cs)
158{ 162{
159 return test_bit(CS_CPU_EXCLUSIVE, &cs->flags); 163 return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
@@ -190,27 +194,52 @@ static inline int is_spread_slab(const struct cpuset *cs)
190} 194}
191 195
192static struct cpuset top_cpuset = { 196static struct cpuset top_cpuset = {
193 .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), 197 .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
198 (1 << CS_MEM_EXCLUSIVE)),
194}; 199};
195 200
201/**
202 * cpuset_for_each_child - traverse online children of a cpuset
203 * @child_cs: loop cursor pointing to the current child
204 * @pos_cgrp: used for iteration
205 * @parent_cs: target cpuset to walk children of
206 *
207 * Walk @child_cs through the online children of @parent_cs. Must be used
208 * with RCU read locked.
209 */
210#define cpuset_for_each_child(child_cs, pos_cgrp, parent_cs) \
211 cgroup_for_each_child((pos_cgrp), (parent_cs)->css.cgroup) \
212 if (is_cpuset_online(((child_cs) = cgroup_cs((pos_cgrp)))))
213
214/**
215 * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
216 * @des_cs: loop cursor pointing to the current descendant
217 * @pos_cgrp: used for iteration
 218 * @root_cs: target cpuset to walk descendants of
219 *
220 * Walk @des_cs through the online descendants of @root_cs. Must be used
221 * with RCU read locked. The caller may modify @pos_cgrp by calling
222 * cgroup_rightmost_descendant() to skip subtree.
223 */
224#define cpuset_for_each_descendant_pre(des_cs, pos_cgrp, root_cs) \
225 cgroup_for_each_descendant_pre((pos_cgrp), (root_cs)->css.cgroup) \
226 if (is_cpuset_online(((des_cs) = cgroup_cs((pos_cgrp)))))
227
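Both macros above get their online-only behaviour from the same trick: the generic cgroup iterator is followed by an if () that simultaneously assigns the cpuset cursor and filters out offline entries, so the caller's loop body runs only for online children or descendants. The construction in stand-alone form, over a toy array instead of a cgroup hierarchy (all names illustrative):

#include <stdbool.h>
#include <stdio.h>

struct child {
	const char *name;
	bool online;
};

static struct child children[] = {
	{ "a", true }, { "b", false }, { "c", true },
};

#define NCHILDREN (sizeof(children) / sizeof(children[0]))

/* base iterator: visits every slot */
#define for_each_child(i) \
	for ((i) = 0; (i) < NCHILDREN; (i)++)

/*
 * Filtered iterator: the trailing if() assigns the cursor and skips
 * offline entries, just as cpuset_for_each_child() wraps
 * cgroup_for_each_child() above.
 */
#define for_each_online_child(c, i) \
	for_each_child(i) \
		if (((c) = &children[i])->online)

int main(void)
{
	struct child *c;
	unsigned int i;

	for_each_online_child(c, i)
		printf("%s\n", c->name);	/* prints "a" and "c" */
	return 0;
}

As with the kernel macros, the trailing if() means a bare else after the loop body would bind to the filter, so callers normally brace the body.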
196/* 228/*
197 * There are two global mutexes guarding cpuset structures. The first 229 * There are two global mutexes guarding cpuset structures - cpuset_mutex
198 * is the main control groups cgroup_mutex, accessed via 230 * and callback_mutex. The latter may nest inside the former. We also
199 * cgroup_lock()/cgroup_unlock(). The second is the cpuset-specific 231 * require taking task_lock() when dereferencing a task's cpuset pointer.
200 * callback_mutex, below. They can nest. It is ok to first take 232 * See "The task_lock() exception", at the end of this comment.
201 * cgroup_mutex, then nest callback_mutex. We also require taking 233 *
202 * task_lock() when dereferencing a task's cpuset pointer. See "The 234 * A task must hold both mutexes to modify cpusets. If a task holds
203 * task_lock() exception", at the end of this comment. 235 * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it
204 * 236 * is the only task able to also acquire callback_mutex and be able to
205 * A task must hold both mutexes to modify cpusets. If a task 237 * modify cpusets. It can perform various checks on the cpuset structure
206 * holds cgroup_mutex, then it blocks others wanting that mutex, 238 * first, knowing nothing will change. It can also allocate memory while
207 * ensuring that it is the only task able to also acquire callback_mutex 239 * just holding cpuset_mutex. While it is performing these checks, various
208 * and be able to modify cpusets. It can perform various checks on 240 * callback routines can briefly acquire callback_mutex to query cpusets.
209 * the cpuset structure first, knowing nothing will change. It can 241 * Once it is ready to make the changes, it takes callback_mutex, blocking
210 * also allocate memory while just holding cgroup_mutex. While it is 242 * everyone else.
211 * performing these checks, various callback routines can briefly
212 * acquire callback_mutex to query cpusets. Once it is ready to make
213 * the changes, it takes callback_mutex, blocking everyone else.
214 * 243 *
215 * Calls to the kernel memory allocator can not be made while holding 244 * Calls to the kernel memory allocator can not be made while holding
216 * callback_mutex, as that would risk double tripping on callback_mutex 245 * callback_mutex, as that would risk double tripping on callback_mutex
@@ -232,6 +261,7 @@ static struct cpuset top_cpuset = {
232 * guidelines for accessing subsystem state in kernel/cgroup.c 261 * guidelines for accessing subsystem state in kernel/cgroup.c
233 */ 262 */
234 263
264static DEFINE_MUTEX(cpuset_mutex);
235static DEFINE_MUTEX(callback_mutex); 265static DEFINE_MUTEX(callback_mutex);
236 266
237/* 267/*
@@ -246,6 +276,17 @@ static char cpuset_nodelist[CPUSET_NODELIST_LEN];
246static DEFINE_SPINLOCK(cpuset_buffer_lock); 276static DEFINE_SPINLOCK(cpuset_buffer_lock);
247 277
248/* 278/*
279 * CPU / memory hotplug is handled asynchronously.
280 */
281static struct workqueue_struct *cpuset_propagate_hotplug_wq;
282
283static void cpuset_hotplug_workfn(struct work_struct *work);
284static void cpuset_propagate_hotplug_workfn(struct work_struct *work);
285static void schedule_cpuset_propagate_hotplug(struct cpuset *cs);
286
287static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
288
289/*
249 * This is ugly, but preserves the userspace API for existing cpuset 290 * This is ugly, but preserves the userspace API for existing cpuset
250 * users. If someone tries to mount the "cpuset" filesystem, we 291 * users. If someone tries to mount the "cpuset" filesystem, we
251 * silently switch it to mount "cgroup" instead 292 * silently switch it to mount "cgroup" instead
@@ -289,7 +330,7 @@ static void guarantee_online_cpus(const struct cpuset *cs,
289 struct cpumask *pmask) 330 struct cpumask *pmask)
290{ 331{
291 while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) 332 while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
292 cs = cs->parent; 333 cs = parent_cs(cs);
293 if (cs) 334 if (cs)
294 cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask); 335 cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
295 else 336 else
@@ -302,10 +343,10 @@ static void guarantee_online_cpus(const struct cpuset *cs,
302 * are online, with memory. If none are online with memory, walk 343 * are online, with memory. If none are online with memory, walk
303 * up the cpuset hierarchy until we find one that does have some 344 * up the cpuset hierarchy until we find one that does have some
304 * online mems. If we get all the way to the top and still haven't 345 * online mems. If we get all the way to the top and still haven't
305 * found any online mems, return node_states[N_HIGH_MEMORY]. 346 * found any online mems, return node_states[N_MEMORY].
306 * 347 *
307 * One way or another, we guarantee to return some non-empty subset 348 * One way or another, we guarantee to return some non-empty subset
308 * of node_states[N_HIGH_MEMORY]. 349 * of node_states[N_MEMORY].
309 * 350 *
310 * Call with callback_mutex held. 351 * Call with callback_mutex held.
311 */ 352 */
@@ -313,20 +354,20 @@ static void guarantee_online_cpus(const struct cpuset *cs,
313static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) 354static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
314{ 355{
315 while (cs && !nodes_intersects(cs->mems_allowed, 356 while (cs && !nodes_intersects(cs->mems_allowed,
316 node_states[N_HIGH_MEMORY])) 357 node_states[N_MEMORY]))
317 cs = cs->parent; 358 cs = parent_cs(cs);
318 if (cs) 359 if (cs)
319 nodes_and(*pmask, cs->mems_allowed, 360 nodes_and(*pmask, cs->mems_allowed,
320 node_states[N_HIGH_MEMORY]); 361 node_states[N_MEMORY]);
321 else 362 else
322 *pmask = node_states[N_HIGH_MEMORY]; 363 *pmask = node_states[N_MEMORY];
323 BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY])); 364 BUG_ON(!nodes_intersects(*pmask, node_states[N_MEMORY]));
324} 365}
325 366
326/* 367/*
327 * update task's spread flag if cpuset's page/slab spread flag is set 368 * update task's spread flag if cpuset's page/slab spread flag is set
328 * 369 *
329 * Called with callback_mutex/cgroup_mutex held 370 * Called with callback_mutex/cpuset_mutex held
330 */ 371 */
331static void cpuset_update_task_spread_flag(struct cpuset *cs, 372static void cpuset_update_task_spread_flag(struct cpuset *cs,
332 struct task_struct *tsk) 373 struct task_struct *tsk)
@@ -346,7 +387,7 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs,
346 * 387 *
347 * One cpuset is a subset of another if all its allowed CPUs and 388 * One cpuset is a subset of another if all its allowed CPUs and
348 * Memory Nodes are a subset of the other, and its exclusive flags 389 * Memory Nodes are a subset of the other, and its exclusive flags
349 * are only set if the other's are set. Call holding cgroup_mutex. 390 * are only set if the other's are set. Call holding cpuset_mutex.
350 */ 391 */
351 392
352static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) 393static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
@@ -395,7 +436,7 @@ static void free_trial_cpuset(struct cpuset *trial)
395 * If we replaced the flag and mask values of the current cpuset 436 * If we replaced the flag and mask values of the current cpuset
396 * (cur) with those values in the trial cpuset (trial), would 437 * (cur) with those values in the trial cpuset (trial), would
397 * our various subset and exclusive rules still be valid? Presumes 438 * our various subset and exclusive rules still be valid? Presumes
398 * cgroup_mutex held. 439 * cpuset_mutex held.
399 * 440 *
400 * 'cur' is the address of an actual, in-use cpuset. Operations 441 * 'cur' is the address of an actual, in-use cpuset. Operations
401 * such as list traversal that depend on the actual address of the 442 * such as list traversal that depend on the actual address of the
@@ -412,48 +453,58 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
412{ 453{
413 struct cgroup *cont; 454 struct cgroup *cont;
414 struct cpuset *c, *par; 455 struct cpuset *c, *par;
456 int ret;
457
458 rcu_read_lock();
415 459
416 /* Each of our child cpusets must be a subset of us */ 460 /* Each of our child cpusets must be a subset of us */
417 list_for_each_entry(cont, &cur->css.cgroup->children, sibling) { 461 ret = -EBUSY;
418 if (!is_cpuset_subset(cgroup_cs(cont), trial)) 462 cpuset_for_each_child(c, cont, cur)
419 return -EBUSY; 463 if (!is_cpuset_subset(c, trial))
420 } 464 goto out;
421 465
422 /* Remaining checks don't apply to root cpuset */ 466 /* Remaining checks don't apply to root cpuset */
467 ret = 0;
423 if (cur == &top_cpuset) 468 if (cur == &top_cpuset)
424 return 0; 469 goto out;
425 470
426 par = cur->parent; 471 par = parent_cs(cur);
427 472
428 /* We must be a subset of our parent cpuset */ 473 /* We must be a subset of our parent cpuset */
474 ret = -EACCES;
429 if (!is_cpuset_subset(trial, par)) 475 if (!is_cpuset_subset(trial, par))
430 return -EACCES; 476 goto out;
431 477
432 /* 478 /*
433 * If either I or some sibling (!= me) is exclusive, we can't 479 * If either I or some sibling (!= me) is exclusive, we can't
434 * overlap 480 * overlap
435 */ 481 */
436 list_for_each_entry(cont, &par->css.cgroup->children, sibling) { 482 ret = -EINVAL;
437 c = cgroup_cs(cont); 483 cpuset_for_each_child(c, cont, par) {
438 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && 484 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
439 c != cur && 485 c != cur &&
440 cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) 486 cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
441 return -EINVAL; 487 goto out;
442 if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && 488 if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
443 c != cur && 489 c != cur &&
444 nodes_intersects(trial->mems_allowed, c->mems_allowed)) 490 nodes_intersects(trial->mems_allowed, c->mems_allowed))
445 return -EINVAL; 491 goto out;
446 } 492 }
447 493
448 /* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */ 494 /*
449 if (cgroup_task_count(cur->css.cgroup)) { 495 * Cpusets with tasks - existing or newly being attached - can't
450 if (cpumask_empty(trial->cpus_allowed) || 496 * have empty cpus_allowed or mems_allowed.
451 nodes_empty(trial->mems_allowed)) { 497 */
452 return -ENOSPC; 498 ret = -ENOSPC;
453 } 499 if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress) &&
454 } 500 (cpumask_empty(trial->cpus_allowed) ||
501 nodes_empty(trial->mems_allowed)))
502 goto out;
455 503
456 return 0; 504 ret = 0;
505out:
506 rcu_read_unlock();
507 return ret;
457} 508}
458 509
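
To put validate_change() in context: the mask update paths build a trial copy of the cpuset, validate it against the hierarchy rules checked above, and only then publish the result under callback_mutex. A simplified sketch that omits the task and sched-domain updates done by the real paths (example_update_cpus() is illustrative only):

	static int example_update_cpus(struct cpuset *cs, const struct cpumask *new)
	{
		struct cpuset *trialcs;
		int err;

		lockdep_assert_held(&cpuset_mutex);

		trialcs = alloc_trial_cpuset(cs);
		if (!trialcs)
			return -ENOMEM;

		cpumask_copy(trialcs->cpus_allowed, new);
		err = validate_change(cs, trialcs);
		if (!err) {
			mutex_lock(&callback_mutex);
			cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
			mutex_unlock(&callback_mutex);
		}
		free_trial_cpuset(trialcs);
		return err;
	}
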
459#ifdef CONFIG_SMP 510#ifdef CONFIG_SMP
@@ -474,31 +525,24 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
474 return; 525 return;
475} 526}
476 527
477static void 528static void update_domain_attr_tree(struct sched_domain_attr *dattr,
478update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) 529 struct cpuset *root_cs)
479{ 530{
480 LIST_HEAD(q); 531 struct cpuset *cp;
481 532 struct cgroup *pos_cgrp;
482 list_add(&c->stack_list, &q);
483 while (!list_empty(&q)) {
484 struct cpuset *cp;
485 struct cgroup *cont;
486 struct cpuset *child;
487
488 cp = list_first_entry(&q, struct cpuset, stack_list);
489 list_del(q.next);
490 533
491 if (cpumask_empty(cp->cpus_allowed)) 534 rcu_read_lock();
535 cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) {
536 /* skip the whole subtree if @cp doesn't have any CPU */
537 if (cpumask_empty(cp->cpus_allowed)) {
538 pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
492 continue; 539 continue;
540 }
493 541
494 if (is_sched_load_balance(cp)) 542 if (is_sched_load_balance(cp))
495 update_domain_attr(dattr, cp); 543 update_domain_attr(dattr, cp);
496
497 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
498 child = cgroup_cs(cont);
499 list_add_tail(&child->stack_list, &q);
500 }
501 } 544 }
545 rcu_read_unlock();
502} 546}
503 547
504/* 548/*
@@ -520,7 +564,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
520 * domains when operating in the severe memory shortage situations 564 * domains when operating in the severe memory shortage situations
521 * that could cause allocation failures below. 565 * that could cause allocation failures below.
522 * 566 *
523 * Must be called with cgroup_lock held. 567 * Must be called with cpuset_mutex held.
524 * 568 *
525 * The three key local variables below are: 569 * The three key local variables below are:
526 * q - a linked-list queue of cpuset pointers, used to implement a 570 * q - a linked-list queue of cpuset pointers, used to implement a
@@ -558,7 +602,6 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
558static int generate_sched_domains(cpumask_var_t **domains, 602static int generate_sched_domains(cpumask_var_t **domains,
559 struct sched_domain_attr **attributes) 603 struct sched_domain_attr **attributes)
560{ 604{
561 LIST_HEAD(q); /* queue of cpusets to be scanned */
562 struct cpuset *cp; /* scans q */ 605 struct cpuset *cp; /* scans q */
563 struct cpuset **csa; /* array of all cpuset ptrs */ 606 struct cpuset **csa; /* array of all cpuset ptrs */
564 int csn; /* how many cpuset ptrs in csa so far */ 607 int csn; /* how many cpuset ptrs in csa so far */
@@ -567,6 +610,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
567 struct sched_domain_attr *dattr; /* attributes for custom domains */ 610 struct sched_domain_attr *dattr; /* attributes for custom domains */
568 int ndoms = 0; /* number of sched domains in result */ 611 int ndoms = 0; /* number of sched domains in result */
569 int nslot; /* next empty doms[] struct cpumask slot */ 612 int nslot; /* next empty doms[] struct cpumask slot */
613 struct cgroup *pos_cgrp;
570 614
571 doms = NULL; 615 doms = NULL;
572 dattr = NULL; 616 dattr = NULL;
@@ -594,33 +638,27 @@ static int generate_sched_domains(cpumask_var_t **domains,
594 goto done; 638 goto done;
595 csn = 0; 639 csn = 0;
596 640
597 list_add(&top_cpuset.stack_list, &q); 641 rcu_read_lock();
598 while (!list_empty(&q)) { 642 cpuset_for_each_descendant_pre(cp, pos_cgrp, &top_cpuset) {
599 struct cgroup *cont;
600 struct cpuset *child; /* scans child cpusets of cp */
601
602 cp = list_first_entry(&q, struct cpuset, stack_list);
603 list_del(q.next);
604
605 if (cpumask_empty(cp->cpus_allowed))
606 continue;
607
608 /* 643 /*
609 * All child cpusets contain a subset of the parent's cpus, so 644 * Continue traversing beyond @cp iff @cp has some CPUs and
610 * just skip them, and then we call update_domain_attr_tree() 645 * isn't load balancing. The former is obvious. The
611 * to calc relax_domain_level of the corresponding sched 646 * latter: All child cpusets contain a subset of the
612 * domain. 647 * parent's cpus, so just skip them, and then we call
648 * update_domain_attr_tree() to calc relax_domain_level of
649 * the corresponding sched domain.
613 */ 650 */
614 if (is_sched_load_balance(cp)) { 651 if (!cpumask_empty(cp->cpus_allowed) &&
615 csa[csn++] = cp; 652 !is_sched_load_balance(cp))
616 continue; 653 continue;
617 }
618 654
619 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { 655 if (is_sched_load_balance(cp))
620 child = cgroup_cs(cont); 656 csa[csn++] = cp;
621 list_add_tail(&child->stack_list, &q); 657
622 } 658 /* skip @cp's subtree */
623 } 659 pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
660 }
661 rcu_read_unlock();
624 662
625 for (i = 0; i < csn; i++) 663 for (i = 0; i < csn; i++)
626 csa[i]->pn = i; 664 csa[i]->pn = i;
@@ -725,25 +763,25 @@ done:
725/* 763/*
726 * Rebuild scheduler domains. 764 * Rebuild scheduler domains.
727 * 765 *
728 * Call with neither cgroup_mutex held nor within get_online_cpus(). 766 * If the flag 'sched_load_balance' of any cpuset with non-empty
729 * Takes both cgroup_mutex and get_online_cpus(). 767 * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
768 * which has that flag enabled, or if any cpuset with a non-empty
769 * 'cpus' is removed, then call this routine to rebuild the
770 * scheduler's dynamic sched domains.
730 * 771 *
731 * Cannot be directly called from cpuset code handling changes 772 * Call with cpuset_mutex held. Takes get_online_cpus().
732 * to the cpuset pseudo-filesystem, because it cannot be called
733 * from code that already holds cgroup_mutex.
734 */ 773 */
735static void do_rebuild_sched_domains(struct work_struct *unused) 774static void rebuild_sched_domains_locked(void)
736{ 775{
737 struct sched_domain_attr *attr; 776 struct sched_domain_attr *attr;
738 cpumask_var_t *doms; 777 cpumask_var_t *doms;
739 int ndoms; 778 int ndoms;
740 779
780 lockdep_assert_held(&cpuset_mutex);
741 get_online_cpus(); 781 get_online_cpus();
742 782
743 /* Generate domain masks and attrs */ 783 /* Generate domain masks and attrs */
744 cgroup_lock();
745 ndoms = generate_sched_domains(&doms, &attr); 784 ndoms = generate_sched_domains(&doms, &attr);
746 cgroup_unlock();
747 785
748 /* Have scheduler rebuild the domains */ 786 /* Have scheduler rebuild the domains */
749 partition_sched_domains(ndoms, doms, attr); 787 partition_sched_domains(ndoms, doms, attr);
@@ -751,7 +789,7 @@ static void do_rebuild_sched_domains(struct work_struct *unused)
751 put_online_cpus(); 789 put_online_cpus();
752} 790}
753#else /* !CONFIG_SMP */ 791#else /* !CONFIG_SMP */
754static void do_rebuild_sched_domains(struct work_struct *unused) 792static void rebuild_sched_domains_locked(void)
755{ 793{
756} 794}
757 795
@@ -763,44 +801,11 @@ static int generate_sched_domains(cpumask_var_t **domains,
763} 801}
764#endif /* CONFIG_SMP */ 802#endif /* CONFIG_SMP */
765 803
766static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);
767
768/*
769 * Rebuild scheduler domains, asynchronously via workqueue.
770 *
771 * If the flag 'sched_load_balance' of any cpuset with non-empty
772 * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
773 * which has that flag enabled, or if any cpuset with a non-empty
774 * 'cpus' is removed, then call this routine to rebuild the
775 * scheduler's dynamic sched domains.
776 *
777 * The rebuild_sched_domains() and partition_sched_domains()
778 * routines must nest cgroup_lock() inside get_online_cpus(),
779 * but such cpuset changes as these must nest that locking the
780 * other way, holding cgroup_lock() for much of the code.
781 *
782 * So in order to avoid an ABBA deadlock, the cpuset code handling
783 * these user changes delegates the actual sched domain rebuilding
784 * to a separate workqueue thread, which ends up processing the
785 * above do_rebuild_sched_domains() function.
786 */
787static void async_rebuild_sched_domains(void)
788{
789 queue_work(cpuset_wq, &rebuild_sched_domains_work);
790}
791
792/*
793 * Accomplishes the same scheduler domain rebuild as the above
794 * async_rebuild_sched_domains(), however it directly calls the
795 * rebuild routine synchronously rather than calling it via an
796 * asynchronous work thread.
797 *
798 * This can only be called from code that is not holding
799 * cgroup_mutex (not nested in a cgroup_lock() call.)
800 */
801void rebuild_sched_domains(void) 804void rebuild_sched_domains(void)
802{ 805{
803 do_rebuild_sched_domains(NULL); 806 mutex_lock(&cpuset_mutex);
807 rebuild_sched_domains_locked();
808 mutex_unlock(&cpuset_mutex);
804} 809}
805 810
806/** 811/**
@@ -808,7 +813,7 @@ void rebuild_sched_domains(void)
808 * @tsk: task to test 813 * @tsk: task to test
809 * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner 814 * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
810 * 815 *
811 * Call with cgroup_mutex held. May take callback_mutex during call. 816 * Call with cpuset_mutex held. May take callback_mutex during call.
812 * Called for each task in a cgroup by cgroup_scan_tasks(). 817 * Called for each task in a cgroup by cgroup_scan_tasks().
813 * Return nonzero if this tasks's cpus_allowed mask should be changed (in other 818 * Return nonzero if this tasks's cpus_allowed mask should be changed (in other
814 * words, if its mask is not equal to its cpuset's mask). 819 * words, if its mask is not equal to its cpuset's mask).
@@ -829,7 +834,7 @@ static int cpuset_test_cpumask(struct task_struct *tsk,
829 * cpus_allowed mask needs to be changed. 834 * cpus_allowed mask needs to be changed.
830 * 835 *
831 * We don't need to re-check for the cgroup/cpuset membership, since we're 836 * We don't need to re-check for the cgroup/cpuset membership, since we're
832 * holding cgroup_lock() at this point. 837 * holding cpuset_mutex at this point.
833 */ 838 */
834static void cpuset_change_cpumask(struct task_struct *tsk, 839static void cpuset_change_cpumask(struct task_struct *tsk,
835 struct cgroup_scanner *scan) 840 struct cgroup_scanner *scan)
@@ -842,7 +847,7 @@ static void cpuset_change_cpumask(struct task_struct *tsk,
842 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed 847 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
843 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() 848 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
844 * 849 *
845 * Called with cgroup_mutex held 850 * Called with cpuset_mutex held
846 * 851 *
847 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, 852 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
848 * calling callback functions for each. 853 * calling callback functions for each.
@@ -920,7 +925,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
920 heap_free(&heap); 925 heap_free(&heap);
921 926
922 if (is_load_balanced) 927 if (is_load_balanced)
923 async_rebuild_sched_domains(); 928 rebuild_sched_domains_locked();
924 return 0; 929 return 0;
925} 930}
926 931
@@ -932,7 +937,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
932 * Temporarilly set tasks mems_allowed to target nodes of migration, 937 * Temporarilly set tasks mems_allowed to target nodes of migration,
933 * so that the migration code can allocate pages on these nodes. 938 * so that the migration code can allocate pages on these nodes.
934 * 939 *
935 * Call holding cgroup_mutex, so current's cpuset won't change 940 * Call holding cpuset_mutex, so current's cpuset won't change
936 * during this call, as manage_mutex holds off any cpuset_attach() 941 * during this call, as manage_mutex holds off any cpuset_attach()
937 * calls. Therefore we don't need to take task_lock around the 942 * calls. Therefore we don't need to take task_lock around the
938 * call to guarantee_online_mems(), as we know no one is changing 943 * call to guarantee_online_mems(), as we know no one is changing
@@ -1007,7 +1012,7 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
1007/* 1012/*
1008 * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy 1013 * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy
1009 * of it to cpuset's new mems_allowed, and migrate pages to new nodes if 1014 * of it to cpuset's new mems_allowed, and migrate pages to new nodes if
1010 * memory_migrate flag is set. Called with cgroup_mutex held. 1015 * memory_migrate flag is set. Called with cpuset_mutex held.
1011 */ 1016 */
1012static void cpuset_change_nodemask(struct task_struct *p, 1017static void cpuset_change_nodemask(struct task_struct *p,
1013 struct cgroup_scanner *scan) 1018 struct cgroup_scanner *scan)
@@ -1016,7 +1021,7 @@ static void cpuset_change_nodemask(struct task_struct *p,
1016 struct cpuset *cs; 1021 struct cpuset *cs;
1017 int migrate; 1022 int migrate;
1018 const nodemask_t *oldmem = scan->data; 1023 const nodemask_t *oldmem = scan->data;
1019 static nodemask_t newmems; /* protected by cgroup_mutex */ 1024 static nodemask_t newmems; /* protected by cpuset_mutex */
1020 1025
1021 cs = cgroup_cs(scan->cg); 1026 cs = cgroup_cs(scan->cg);
1022 guarantee_online_mems(cs, &newmems); 1027 guarantee_online_mems(cs, &newmems);
@@ -1043,7 +1048,7 @@ static void *cpuset_being_rebound;
1043 * @oldmem: old mems_allowed of cpuset cs 1048 * @oldmem: old mems_allowed of cpuset cs
1044 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() 1049 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
1045 * 1050 *
1046 * Called with cgroup_mutex held 1051 * Called with cpuset_mutex held
1047 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 1052 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
1048 * if @heap != NULL. 1053 * if @heap != NULL.
1049 */ 1054 */
@@ -1065,7 +1070,7 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
1065 * take while holding tasklist_lock. Forks can happen - the 1070 * take while holding tasklist_lock. Forks can happen - the
1066 * mpol_dup() cpuset_being_rebound check will catch such forks, 1071 * mpol_dup() cpuset_being_rebound check will catch such forks,
1067 * and rebind their vma mempolicies too. Because we still hold 1072 * and rebind their vma mempolicies too. Because we still hold
1068 * the global cgroup_mutex, we know that no other rebind effort 1073 * the global cpuset_mutex, we know that no other rebind effort
1069 * will be contending for the global variable cpuset_being_rebound. 1074 * will be contending for the global variable cpuset_being_rebound.
1070 * It's ok if we rebind the same mm twice; mpol_rebind_mm() 1075 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
1071 * is idempotent. Also migrate pages in each mm to new nodes. 1076 * is idempotent. Also migrate pages in each mm to new nodes.
@@ -1084,7 +1089,7 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
1084 * mempolicies and if the cpuset is marked 'memory_migrate', 1089 * mempolicies and if the cpuset is marked 'memory_migrate',
1085 * migrate the tasks pages to the new memory. 1090 * migrate the tasks pages to the new memory.
1086 * 1091 *
1087 * Call with cgroup_mutex held. May take callback_mutex during call. 1092 * Call with cpuset_mutex held. May take callback_mutex during call.
1088 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, 1093 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
1089 * lock each such tasks mm->mmap_sem, scan its vma's and rebind 1094 * lock each such tasks mm->mmap_sem, scan its vma's and rebind
1090 * their mempolicies to the cpusets new mems_allowed. 1095 * their mempolicies to the cpusets new mems_allowed.
@@ -1100,7 +1105,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1100 return -ENOMEM; 1105 return -ENOMEM;
1101 1106
1102 /* 1107 /*
1103 * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY]; 1108 * top_cpuset.mems_allowed tracks node_stats[N_MEMORY];
1104 * it's read-only 1109 * it's read-only
1105 */ 1110 */
1106 if (cs == &top_cpuset) { 1111 if (cs == &top_cpuset) {
@@ -1122,7 +1127,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1122 goto done; 1127 goto done;
1123 1128
1124 if (!nodes_subset(trialcs->mems_allowed, 1129 if (!nodes_subset(trialcs->mems_allowed,
1125 node_states[N_HIGH_MEMORY])) { 1130 node_states[N_MEMORY])) {
1126 retval = -EINVAL; 1131 retval = -EINVAL;
1127 goto done; 1132 goto done;
1128 } 1133 }
@@ -1168,7 +1173,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
1168 cs->relax_domain_level = val; 1173 cs->relax_domain_level = val;
1169 if (!cpumask_empty(cs->cpus_allowed) && 1174 if (!cpumask_empty(cs->cpus_allowed) &&
1170 is_sched_load_balance(cs)) 1175 is_sched_load_balance(cs))
1171 async_rebuild_sched_domains(); 1176 rebuild_sched_domains_locked();
1172 } 1177 }
1173 1178
1174 return 0; 1179 return 0;
@@ -1182,7 +1187,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
1182 * Called by cgroup_scan_tasks() for each task in a cgroup. 1187 * Called by cgroup_scan_tasks() for each task in a cgroup.
1183 * 1188 *
1184 * We don't need to re-check for the cgroup/cpuset membership, since we're 1189 * We don't need to re-check for the cgroup/cpuset membership, since we're
1185 * holding cgroup_lock() at this point. 1190 * holding cpuset_mutex at this point.
1186 */ 1191 */
1187static void cpuset_change_flag(struct task_struct *tsk, 1192static void cpuset_change_flag(struct task_struct *tsk,
1188 struct cgroup_scanner *scan) 1193 struct cgroup_scanner *scan)
@@ -1195,7 +1200,7 @@ static void cpuset_change_flag(struct task_struct *tsk,
1195 * @cs: the cpuset in which each task's spread flags needs to be changed 1200 * @cs: the cpuset in which each task's spread flags needs to be changed
1196 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() 1201 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
1197 * 1202 *
1198 * Called with cgroup_mutex held 1203 * Called with cpuset_mutex held
1199 * 1204 *
1200 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, 1205 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
1201 * calling callback functions for each. 1206 * calling callback functions for each.
@@ -1220,7 +1225,7 @@ static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap)
1220 * cs: the cpuset to update 1225 * cs: the cpuset to update
1221 * turning_on: whether the flag is being set or cleared 1226 * turning_on: whether the flag is being set or cleared
1222 * 1227 *
1223 * Call with cgroup_mutex held. 1228 * Call with cpuset_mutex held.
1224 */ 1229 */
1225 1230
1226static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, 1231static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
@@ -1260,7 +1265,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1260 mutex_unlock(&callback_mutex); 1265 mutex_unlock(&callback_mutex);
1261 1266
1262 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) 1267 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
1263 async_rebuild_sched_domains(); 1268 rebuild_sched_domains_locked();
1264 1269
1265 if (spread_flag_changed) 1270 if (spread_flag_changed)
1266 update_tasks_flags(cs, &heap); 1271 update_tasks_flags(cs, &heap);
@@ -1368,24 +1373,18 @@ static int fmeter_getrate(struct fmeter *fmp)
1368 return val; 1373 return val;
1369} 1374}
1370 1375
1371/* 1376/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
1372 * Protected by cgroup_lock. The nodemasks must be stored globally because
1373 * dynamically allocating them is not allowed in can_attach, and they must
1374 * persist until attach.
1375 */
1376static cpumask_var_t cpus_attach;
1377static nodemask_t cpuset_attach_nodemask_from;
1378static nodemask_t cpuset_attach_nodemask_to;
1379
1380/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
1381static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) 1377static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1382{ 1378{
1383 struct cpuset *cs = cgroup_cs(cgrp); 1379 struct cpuset *cs = cgroup_cs(cgrp);
1384 struct task_struct *task; 1380 struct task_struct *task;
1385 int ret; 1381 int ret;
1386 1382
1383 mutex_lock(&cpuset_mutex);
1384
1385 ret = -ENOSPC;
1387 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) 1386 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
1388 return -ENOSPC; 1387 goto out_unlock;
1389 1388
1390 cgroup_taskset_for_each(task, cgrp, tset) { 1389 cgroup_taskset_for_each(task, cgrp, tset) {
1391 /* 1390 /*
@@ -1397,25 +1396,45 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1397 * set_cpus_allowed_ptr() on all attached tasks before 1396 * set_cpus_allowed_ptr() on all attached tasks before
1398 * cpus_allowed may be changed. 1397 * cpus_allowed may be changed.
1399 */ 1398 */
1399 ret = -EINVAL;
1400 if (task->flags & PF_THREAD_BOUND) 1400 if (task->flags & PF_THREAD_BOUND)
1401 return -EINVAL; 1401 goto out_unlock;
1402 if ((ret = security_task_setscheduler(task))) 1402 ret = security_task_setscheduler(task);
1403 return ret; 1403 if (ret)
1404 goto out_unlock;
1404 } 1405 }
1405 1406
1406 /* prepare for attach */ 1407 /*
1407 if (cs == &top_cpuset) 1408 * Mark attach is in progress. This makes validate_change() fail
1408 cpumask_copy(cpus_attach, cpu_possible_mask); 1409 * changes which zero cpus/mems_allowed.
1409 else 1410 */
1410 guarantee_online_cpus(cs, cpus_attach); 1411 cs->attach_in_progress++;
1411 1412 ret = 0;
1412 guarantee_online_mems(cs, &cpuset_attach_nodemask_to); 1413out_unlock:
1414 mutex_unlock(&cpuset_mutex);
1415 return ret;
1416}
1413 1417
1414 return 0; 1418static void cpuset_cancel_attach(struct cgroup *cgrp,
1419 struct cgroup_taskset *tset)
1420{
1421 mutex_lock(&cpuset_mutex);
1422 cgroup_cs(cgrp)->attach_in_progress--;
1423 mutex_unlock(&cpuset_mutex);
1415} 1424}
1416 1425
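
The attach_in_progress counter introduced above follows the usual three-step cgroup attach protocol: ->can_attach() increments it, ->cancel_attach() decrements it when an attach is aborted, and ->attach() decrements it once the tasks have actually been moved. While the counter is non-zero, validate_change() refuses to empty cpus_allowed or mems_allowed, just as if the cpuset already had tasks. A hypothetical helper makes the combined condition explicit (not part of this patch):

	static bool cpuset_masks_may_be_emptied(struct cpuset *cs)
	{
		lockdep_assert_held(&cpuset_mutex);
		return !cgroup_task_count(cs->css.cgroup) &&
		       !cs->attach_in_progress;
	}
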
1426/*
1427 * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach()
1428 * but we can't allocate it dynamically there. Define it global and
1429 * allocate from cpuset_init().
1430 */
1431static cpumask_var_t cpus_attach;
1432
1417static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) 1433static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1418{ 1434{
1435 /* static bufs protected by cpuset_mutex */
1436 static nodemask_t cpuset_attach_nodemask_from;
1437 static nodemask_t cpuset_attach_nodemask_to;
1419 struct mm_struct *mm; 1438 struct mm_struct *mm;
1420 struct task_struct *task; 1439 struct task_struct *task;
1421 struct task_struct *leader = cgroup_taskset_first(tset); 1440 struct task_struct *leader = cgroup_taskset_first(tset);
@@ -1423,6 +1442,16 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1423 struct cpuset *cs = cgroup_cs(cgrp); 1442 struct cpuset *cs = cgroup_cs(cgrp);
1424 struct cpuset *oldcs = cgroup_cs(oldcgrp); 1443 struct cpuset *oldcs = cgroup_cs(oldcgrp);
1425 1444
1445 mutex_lock(&cpuset_mutex);
1446
1447 /* prepare for attach */
1448 if (cs == &top_cpuset)
1449 cpumask_copy(cpus_attach, cpu_possible_mask);
1450 else
1451 guarantee_online_cpus(cs, cpus_attach);
1452
1453 guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
1454
1426 cgroup_taskset_for_each(task, cgrp, tset) { 1455 cgroup_taskset_for_each(task, cgrp, tset) {
1427 /* 1456 /*
1428 * can_attach beforehand should guarantee that this doesn't 1457 * can_attach beforehand should guarantee that this doesn't
@@ -1448,6 +1477,18 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1448 &cpuset_attach_nodemask_to); 1477 &cpuset_attach_nodemask_to);
1449 mmput(mm); 1478 mmput(mm);
1450 } 1479 }
1480
1481 cs->attach_in_progress--;
1482
1483 /*
1484 * We may have raced with CPU/memory hotunplug. Trigger hotplug
1485 * propagation if @cs doesn't have any CPU or memory. It will move
1486 * the newly added tasks to the nearest parent which can execute.
1487 */
1488 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
1489 schedule_cpuset_propagate_hotplug(cs);
1490
1491 mutex_unlock(&cpuset_mutex);
1451} 1492}
1452 1493
1453/* The various types of files and directories in a cpuset file system */ 1494/* The various types of files and directories in a cpuset file system */
@@ -1469,12 +1510,13 @@ typedef enum {
1469 1510
1470static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) 1511static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
1471{ 1512{
1472 int retval = 0;
1473 struct cpuset *cs = cgroup_cs(cgrp); 1513 struct cpuset *cs = cgroup_cs(cgrp);
1474 cpuset_filetype_t type = cft->private; 1514 cpuset_filetype_t type = cft->private;
1515 int retval = -ENODEV;
1475 1516
1476 if (!cgroup_lock_live_group(cgrp)) 1517 mutex_lock(&cpuset_mutex);
1477 return -ENODEV; 1518 if (!is_cpuset_online(cs))
1519 goto out_unlock;
1478 1520
1479 switch (type) { 1521 switch (type) {
1480 case FILE_CPU_EXCLUSIVE: 1522 case FILE_CPU_EXCLUSIVE:
@@ -1508,18 +1550,20 @@ static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
1508 retval = -EINVAL; 1550 retval = -EINVAL;
1509 break; 1551 break;
1510 } 1552 }
1511 cgroup_unlock(); 1553out_unlock:
1554 mutex_unlock(&cpuset_mutex);
1512 return retval; 1555 return retval;
1513} 1556}
1514 1557
1515static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) 1558static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
1516{ 1559{
1517 int retval = 0;
1518 struct cpuset *cs = cgroup_cs(cgrp); 1560 struct cpuset *cs = cgroup_cs(cgrp);
1519 cpuset_filetype_t type = cft->private; 1561 cpuset_filetype_t type = cft->private;
1562 int retval = -ENODEV;
1520 1563
1521 if (!cgroup_lock_live_group(cgrp)) 1564 mutex_lock(&cpuset_mutex);
1522 return -ENODEV; 1565 if (!is_cpuset_online(cs))
1566 goto out_unlock;
1523 1567
1524 switch (type) { 1568 switch (type) {
1525 case FILE_SCHED_RELAX_DOMAIN_LEVEL: 1569 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
@@ -1529,7 +1573,8 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
1529 retval = -EINVAL; 1573 retval = -EINVAL;
1530 break; 1574 break;
1531 } 1575 }
1532 cgroup_unlock(); 1576out_unlock:
1577 mutex_unlock(&cpuset_mutex);
1533 return retval; 1578 return retval;
1534} 1579}
1535 1580
@@ -1539,17 +1584,36 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
1539static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, 1584static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
1540 const char *buf) 1585 const char *buf)
1541{ 1586{
1542 int retval = 0;
1543 struct cpuset *cs = cgroup_cs(cgrp); 1587 struct cpuset *cs = cgroup_cs(cgrp);
1544 struct cpuset *trialcs; 1588 struct cpuset *trialcs;
1589 int retval = -ENODEV;
1545 1590
1546 if (!cgroup_lock_live_group(cgrp)) 1591 /*
1547 return -ENODEV; 1592 * CPU or memory hotunplug may leave @cs w/o any execution
1593 * resources, in which case the hotplug code asynchronously updates
1594 * configuration and transfers all tasks to the nearest ancestor
1595 * which can execute.
1596 *
1597 * As writes to "cpus" or "mems" may restore @cs's execution
1598 * resources, wait for the previously scheduled operations before
1599 * proceeding, so that we don't end up repeatedly removing tasks added
1600 * after execution capability is restored.
1601 *
1602 * Flushing cpuset_hotplug_work is enough to synchronize against
1603 * hotplug handling; however, cpuset_attach() may schedule
1604 * propagation work directly. Flush the workqueue too.
1605 */
1606 flush_work(&cpuset_hotplug_work);
1607 flush_workqueue(cpuset_propagate_hotplug_wq);
1608
1609 mutex_lock(&cpuset_mutex);
1610 if (!is_cpuset_online(cs))
1611 goto out_unlock;
1548 1612
1549 trialcs = alloc_trial_cpuset(cs); 1613 trialcs = alloc_trial_cpuset(cs);
1550 if (!trialcs) { 1614 if (!trialcs) {
1551 retval = -ENOMEM; 1615 retval = -ENOMEM;
1552 goto out; 1616 goto out_unlock;
1553 } 1617 }
1554 1618
1555 switch (cft->private) { 1619 switch (cft->private) {
@@ -1565,8 +1629,8 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
1565 } 1629 }
1566 1630
1567 free_trial_cpuset(trialcs); 1631 free_trial_cpuset(trialcs);
1568out: 1632out_unlock:
1569 cgroup_unlock(); 1633 mutex_unlock(&cpuset_mutex);
1570 return retval; 1634 return retval;
1571} 1635}
1572 1636
@@ -1784,57 +1848,18 @@ static struct cftype files[] = {
1784}; 1848};
1785 1849
1786/* 1850/*
1787 * post_clone() is called during cgroup_create() when the 1851 * cpuset_css_alloc - allocate a cpuset css
1788 * clone_children mount argument was specified. The cgroup
1789 * can not yet have any tasks.
1790 *
1791 * Currently we refuse to set up the cgroup - thereby
1792 * refusing the task to be entered, and as a result refusing
1793 * the sys_unshare() or clone() which initiated it - if any
1794 * sibling cpusets have exclusive cpus or mem.
1795 *
1796 * If this becomes a problem for some users who wish to
1797 * allow that scenario, then cpuset_post_clone() could be
1798 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
1799 * (and likewise for mems) to the new cgroup. Called with cgroup_mutex
1800 * held.
1801 */
1802static void cpuset_post_clone(struct cgroup *cgroup)
1803{
1804 struct cgroup *parent, *child;
1805 struct cpuset *cs, *parent_cs;
1806
1807 parent = cgroup->parent;
1808 list_for_each_entry(child, &parent->children, sibling) {
1809 cs = cgroup_cs(child);
1810 if (is_mem_exclusive(cs) || is_cpu_exclusive(cs))
1811 return;
1812 }
1813 cs = cgroup_cs(cgroup);
1814 parent_cs = cgroup_cs(parent);
1815
1816 mutex_lock(&callback_mutex);
1817 cs->mems_allowed = parent_cs->mems_allowed;
1818 cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed);
1819 mutex_unlock(&callback_mutex);
1820 return;
1821}
1822
1823/*
1824 * cpuset_create - create a cpuset
1825 * cont: control group that the new cpuset will be part of 1852 * cont: control group that the new cpuset will be part of
1826 */ 1853 */
1827 1854
1828static struct cgroup_subsys_state *cpuset_create(struct cgroup *cont) 1855static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont)
1829{ 1856{
1830 struct cpuset *cs; 1857 struct cpuset *cs;
1831 struct cpuset *parent;
1832 1858
1833 if (!cont->parent) { 1859 if (!cont->parent)
1834 return &top_cpuset.css; 1860 return &top_cpuset.css;
1835 } 1861
1836 parent = cgroup_cs(cont->parent); 1862 cs = kzalloc(sizeof(*cs), GFP_KERNEL);
1837 cs = kmalloc(sizeof(*cs), GFP_KERNEL);
1838 if (!cs) 1863 if (!cs)
1839 return ERR_PTR(-ENOMEM); 1864 return ERR_PTR(-ENOMEM);
1840 if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) { 1865 if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) {
@@ -1842,47 +1867,108 @@ static struct cgroup_subsys_state *cpuset_create(struct cgroup *cont)
1842 return ERR_PTR(-ENOMEM); 1867 return ERR_PTR(-ENOMEM);
1843 } 1868 }
1844 1869
1845 cs->flags = 0;
1846 if (is_spread_page(parent))
1847 set_bit(CS_SPREAD_PAGE, &cs->flags);
1848 if (is_spread_slab(parent))
1849 set_bit(CS_SPREAD_SLAB, &cs->flags);
1850 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); 1870 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
1851 cpumask_clear(cs->cpus_allowed); 1871 cpumask_clear(cs->cpus_allowed);
1852 nodes_clear(cs->mems_allowed); 1872 nodes_clear(cs->mems_allowed);
1853 fmeter_init(&cs->fmeter); 1873 fmeter_init(&cs->fmeter);
1874 INIT_WORK(&cs->hotplug_work, cpuset_propagate_hotplug_workfn);
1854 cs->relax_domain_level = -1; 1875 cs->relax_domain_level = -1;
1855 1876
1856 cs->parent = parent; 1877 return &cs->css;
1878}
1879
1880static int cpuset_css_online(struct cgroup *cgrp)
1881{
1882 struct cpuset *cs = cgroup_cs(cgrp);
1883 struct cpuset *parent = parent_cs(cs);
1884 struct cpuset *tmp_cs;
1885 struct cgroup *pos_cg;
1886
1887 if (!parent)
1888 return 0;
1889
1890 mutex_lock(&cpuset_mutex);
1891
1892 set_bit(CS_ONLINE, &cs->flags);
1893 if (is_spread_page(parent))
1894 set_bit(CS_SPREAD_PAGE, &cs->flags);
1895 if (is_spread_slab(parent))
1896 set_bit(CS_SPREAD_SLAB, &cs->flags);
1897
1857 number_of_cpusets++; 1898 number_of_cpusets++;
1858 return &cs->css ; 1899
1900 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags))
1901 goto out_unlock;
1902
1903 /*
1904 * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
1905 * set. This flag handling is implemented in cgroup core for
1906 * historical reasons - the flag may be specified during mount.
1907 *
1908 * Currently, if any sibling cpusets have exclusive cpus or mem, we
1909 * refuse to clone the configuration - thereby refusing the task to
1910 * be entered, and as a result refusing the sys_unshare() or
1911 * clone() which initiated it. If this becomes a problem for some
1912 * users who wish to allow that scenario, then this could be
1913 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
1914 * (and likewise for mems) to the new cgroup.
1915 */
1916 rcu_read_lock();
1917 cpuset_for_each_child(tmp_cs, pos_cg, parent) {
1918 if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
1919 rcu_read_unlock();
1920 goto out_unlock;
1921 }
1922 }
1923 rcu_read_unlock();
1924
1925 mutex_lock(&callback_mutex);
1926 cs->mems_allowed = parent->mems_allowed;
1927 cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
1928 mutex_unlock(&callback_mutex);
1929out_unlock:
1930 mutex_unlock(&cpuset_mutex);
1931 return 0;
1932}
1933
1934static void cpuset_css_offline(struct cgroup *cgrp)
1935{
1936 struct cpuset *cs = cgroup_cs(cgrp);
1937
1938 mutex_lock(&cpuset_mutex);
1939
1940 if (is_sched_load_balance(cs))
1941 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
1942
1943 number_of_cpusets--;
1944 clear_bit(CS_ONLINE, &cs->flags);
1945
1946 mutex_unlock(&cpuset_mutex);
1859} 1947}
1860 1948
1861/* 1949/*
1862 * If the cpuset being removed has its flag 'sched_load_balance' 1950 * If the cpuset being removed has its flag 'sched_load_balance'
1863 * enabled, then simulate turning sched_load_balance off, which 1951 * enabled, then simulate turning sched_load_balance off, which
1864 * will call async_rebuild_sched_domains(). 1952 * will call rebuild_sched_domains_locked().
1865 */ 1953 */
1866 1954
1867static void cpuset_destroy(struct cgroup *cont) 1955static void cpuset_css_free(struct cgroup *cont)
1868{ 1956{
1869 struct cpuset *cs = cgroup_cs(cont); 1957 struct cpuset *cs = cgroup_cs(cont);
1870 1958
1871 if (is_sched_load_balance(cs))
1872 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
1873
1874 number_of_cpusets--;
1875 free_cpumask_var(cs->cpus_allowed); 1959 free_cpumask_var(cs->cpus_allowed);
1876 kfree(cs); 1960 kfree(cs);
1877} 1961}
1878 1962
1879struct cgroup_subsys cpuset_subsys = { 1963struct cgroup_subsys cpuset_subsys = {
1880 .name = "cpuset", 1964 .name = "cpuset",
1881 .create = cpuset_create, 1965 .css_alloc = cpuset_css_alloc,
1882 .destroy = cpuset_destroy, 1966 .css_online = cpuset_css_online,
1967 .css_offline = cpuset_css_offline,
1968 .css_free = cpuset_css_free,
1883 .can_attach = cpuset_can_attach, 1969 .can_attach = cpuset_can_attach,
1970 .cancel_attach = cpuset_cancel_attach,
1884 .attach = cpuset_attach, 1971 .attach = cpuset_attach,
1885 .post_clone = cpuset_post_clone,
1886 .subsys_id = cpuset_subsys_id, 1972 .subsys_id = cpuset_subsys_id,
1887 .base_cftypes = files, 1973 .base_cftypes = files,
1888 .early_init = 1, 1974 .early_init = 1,
@@ -1932,7 +2018,9 @@ static void cpuset_do_move_task(struct task_struct *tsk,
1932{ 2018{
1933 struct cgroup *new_cgroup = scan->data; 2019 struct cgroup *new_cgroup = scan->data;
1934 2020
2021 cgroup_lock();
1935 cgroup_attach_task(new_cgroup, tsk); 2022 cgroup_attach_task(new_cgroup, tsk);
2023 cgroup_unlock();
1936} 2024}
1937 2025
1938/** 2026/**
@@ -1940,7 +2028,7 @@ static void cpuset_do_move_task(struct task_struct *tsk,
1940 * @from: cpuset in which the tasks currently reside 2028 * @from: cpuset in which the tasks currently reside
1941 * @to: cpuset to which the tasks will be moved 2029 * @to: cpuset to which the tasks will be moved
1942 * 2030 *
1943 * Called with cgroup_mutex held 2031 * Called with cpuset_mutex held
1944 * callback_mutex must not be held, as cpuset_attach() will take it. 2032 * callback_mutex must not be held, as cpuset_attach() will take it.
1945 * 2033 *
1946 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, 2034 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
@@ -1967,203 +2055,212 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
1967 * removing that CPU or node from all cpusets. If this removes the 2055 * removing that CPU or node from all cpusets. If this removes the
1968 * last CPU or node from a cpuset, then move the tasks in the empty 2056 * last CPU or node from a cpuset, then move the tasks in the empty
1969 * cpuset to its next-highest non-empty parent. 2057 * cpuset to its next-highest non-empty parent.
1970 *
1971 * Called with cgroup_mutex held
1972 * callback_mutex must not be held, as cpuset_attach() will take it.
1973 */ 2058 */
1974static void remove_tasks_in_empty_cpuset(struct cpuset *cs) 2059static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
1975{ 2060{
1976 struct cpuset *parent; 2061 struct cpuset *parent;
1977 2062
1978 /* 2063 /*
1979 * The cgroup's css_sets list is in use if there are tasks
1980 * in the cpuset; the list is empty if there are none;
1981 * the cs->css.refcnt seems always 0.
1982 */
1983 if (list_empty(&cs->css.cgroup->css_sets))
1984 return;
1985
1986 /*
1987 * Find its next-highest non-empty parent, (top cpuset 2064 * Find its next-highest non-empty parent, (top cpuset
1988 * has online cpus, so can't be empty). 2065 * has online cpus, so can't be empty).
1989 */ 2066 */
1990 parent = cs->parent; 2067 parent = parent_cs(cs);
1991 while (cpumask_empty(parent->cpus_allowed) || 2068 while (cpumask_empty(parent->cpus_allowed) ||
1992 nodes_empty(parent->mems_allowed)) 2069 nodes_empty(parent->mems_allowed))
1993 parent = parent->parent; 2070 parent = parent_cs(parent);
1994 2071
1995 move_member_tasks_to_cpuset(cs, parent); 2072 move_member_tasks_to_cpuset(cs, parent);
1996} 2073}
1997 2074
1998/* 2075/**
1999 * Helper function to traverse cpusets. 2076 * cpuset_propagate_hotplug_workfn - propagate CPU/memory hotplug to a cpuset
2000 * It can be used to walk the cpuset tree from top to bottom, completing 2077 * @cs: cpuset of interest
2001 * one layer before dropping down to the next (thus always processing a 2078 *
2002 * node before any of its children). 2079 * Compare @cs's cpu and mem masks against top_cpuset and if some have gone
2080 * offline, update @cs accordingly. If @cs ends up with no CPU or memory,
2081 * all its tasks are moved to the nearest ancestor with both resources.
2003 */ 2082 */
2004static struct cpuset *cpuset_next(struct list_head *queue) 2083static void cpuset_propagate_hotplug_workfn(struct work_struct *work)
2005{ 2084{
2006 struct cpuset *cp; 2085 static cpumask_t off_cpus;
2007 struct cpuset *child; /* scans child cpusets of cp */ 2086 static nodemask_t off_mems, tmp_mems;
2008 struct cgroup *cont; 2087 struct cpuset *cs = container_of(work, struct cpuset, hotplug_work);
2088 bool is_empty;
2009 2089
2010 if (list_empty(queue)) 2090 mutex_lock(&cpuset_mutex);
2011 return NULL; 2091
2092 cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed);
2093 nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed);
2094
2095 /* remove offline cpus from @cs */
2096 if (!cpumask_empty(&off_cpus)) {
2097 mutex_lock(&callback_mutex);
2098 cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus);
2099 mutex_unlock(&callback_mutex);
2100 update_tasks_cpumask(cs, NULL);
2101 }
2012 2102
2013 cp = list_first_entry(queue, struct cpuset, stack_list); 2103 /* remove offline mems from @cs */
2014 list_del(queue->next); 2104 if (!nodes_empty(off_mems)) {
2015 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { 2105 tmp_mems = cs->mems_allowed;
2016 child = cgroup_cs(cont); 2106 mutex_lock(&callback_mutex);
2017 list_add_tail(&child->stack_list, queue); 2107 nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems);
2108 mutex_unlock(&callback_mutex);
2109 update_tasks_nodemask(cs, &tmp_mems, NULL);
2018 } 2110 }
2019 2111
2020 return cp; 2112 is_empty = cpumask_empty(cs->cpus_allowed) ||
2113 nodes_empty(cs->mems_allowed);
2114
2115 mutex_unlock(&cpuset_mutex);
2116
2117 /*
2118 * If @cs became empty, move tasks to the nearest ancestor with
2119 * execution resources. This is full cgroup operation which will
2120 * also call back into cpuset. Should be done outside any lock.
2121 */
2122 if (is_empty)
2123 remove_tasks_in_empty_cpuset(cs);
2124
2125 /* the following may free @cs, should be the last operation */
2126 css_put(&cs->css);
2021} 2127}
2022 2128
2129/**
2130 * schedule_cpuset_propagate_hotplug - schedule hotplug propagation to a cpuset
2131 * @cs: cpuset of interest
2132 *
2133 * Schedule cpuset_propagate_hotplug_workfn() which will update CPU and
2134 * memory masks according to top_cpuset.
2135 */
2136static void schedule_cpuset_propagate_hotplug(struct cpuset *cs)
2137{
2138 /*
2139 * Pin @cs. The refcnt will be released when the work item
2140 * finishes executing.
2141 */
2142 if (!css_tryget(&cs->css))
2143 return;
2144
2145 /*
2146 * Queue @cs->hotplug_work. If already pending, lose the css ref.
2147 * cpuset_propagate_hotplug_wq is ordered and propagation will
2148 * happen in the order this function is called.
2149 */
2150 if (!queue_work(cpuset_propagate_hotplug_wq, &cs->hotplug_work))
2151 css_put(&cs->css);
2152}
2023 2153
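
schedule_cpuset_propagate_hotplug() above is the standard pin-then-queue idiom: grab a css reference before queueing so the cpuset cannot be freed while the work item is pending, drop it again if queue_work() reports the item was already queued, and let the work function release the reference as its final step. Condensed (same calls as in the patch, shown only for illustration):

	if (css_tryget(&cs->css)) {
		if (!queue_work(cpuset_propagate_hotplug_wq, &cs->hotplug_work))
			css_put(&cs->css);	/* already pending, drop the extra ref */
	}
	/* cpuset_propagate_hotplug_workfn() ends with css_put(&cs->css) */
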
2024/* 2154/**
2025 * Walk the specified cpuset subtree upon a hotplug operation (CPU/Memory 2155 * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
2026 * online/offline) and update the cpusets accordingly.
2027 * For regular CPU/Mem hotplug, look for empty cpusets; the tasks of such
2028 * cpuset must be moved to a parent cpuset.
2029 * 2156 *
2030 * Called with cgroup_mutex held. We take callback_mutex to modify 2157 * This function is called after either CPU or memory configuration has
2031 * cpus_allowed and mems_allowed. 2158 * changed and updates cpuset accordingly. The top_cpuset is always
2159 * synchronized to cpu_active_mask and N_MEMORY, which is necessary in
2160 * order to make cpusets transparent (of no effect) on systems that are
2161 * actively using CPU hotplug but making no active use of cpusets.
2032 * 2162 *
2033 * This walk processes the tree from top to bottom, completing one layer 2163 * Non-root cpusets are only affected by offlining. If any CPUs or memory
2034 * before dropping down to the next. It always processes a node before 2164 * nodes have been taken down, schedule_cpuset_propagate_hotplug() is invoked on all
2035 * any of its children. 2165 * descendants.
2036 * 2166 *
2037 * In the case of memory hot-unplug, it will remove nodes from N_HIGH_MEMORY 2167 * Note that CPU offlining during suspend is ignored. We don't modify
2038 * if all present pages from a node are offlined. 2168 * cpusets across suspend/resume cycles at all.
2039 */ 2169 */
2040static void 2170static void cpuset_hotplug_workfn(struct work_struct *work)
2041scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event)
2042{ 2171{
2043 LIST_HEAD(queue); 2172 static cpumask_t new_cpus, tmp_cpus;
2044 struct cpuset *cp; /* scans cpusets being updated */ 2173 static nodemask_t new_mems, tmp_mems;
2045 static nodemask_t oldmems; /* protected by cgroup_mutex */ 2174 bool cpus_updated, mems_updated;
2175 bool cpus_offlined, mems_offlined;
2046 2176
2047 list_add_tail((struct list_head *)&root->stack_list, &queue); 2177 mutex_lock(&cpuset_mutex);
2048 2178
2049 switch (event) { 2179 /* fetch the available cpus/mems and find out which changed how */
2050 case CPUSET_CPU_OFFLINE: 2180 cpumask_copy(&new_cpus, cpu_active_mask);
2051 while ((cp = cpuset_next(&queue)) != NULL) { 2181 new_mems = node_states[N_MEMORY];
2052 2182
2053 /* Continue past cpusets with all cpus online */ 2183 cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus);
2054 if (cpumask_subset(cp->cpus_allowed, cpu_active_mask)) 2184 cpus_offlined = cpumask_andnot(&tmp_cpus, top_cpuset.cpus_allowed,
2055 continue; 2185 &new_cpus);
2056 2186
2057 /* Remove offline cpus from this cpuset. */ 2187 mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems);
2058 mutex_lock(&callback_mutex); 2188 nodes_andnot(tmp_mems, top_cpuset.mems_allowed, new_mems);
2059 cpumask_and(cp->cpus_allowed, cp->cpus_allowed, 2189 mems_offlined = !nodes_empty(tmp_mems);
2060 cpu_active_mask);
2061 mutex_unlock(&callback_mutex);
2062 2190
2063 /* Move tasks from the empty cpuset to a parent */ 2191 /* synchronize cpus_allowed to cpu_active_mask */
2064 if (cpumask_empty(cp->cpus_allowed)) 2192 if (cpus_updated) {
2065 remove_tasks_in_empty_cpuset(cp); 2193 mutex_lock(&callback_mutex);
2066 else 2194 cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
2067 update_tasks_cpumask(cp, NULL); 2195 mutex_unlock(&callback_mutex);
2068 } 2196 /* we don't mess with cpumasks of tasks in top_cpuset */
2069 break; 2197 }
2198
2199 /* synchronize mems_allowed to N_MEMORY */
2200 if (mems_updated) {
2201 tmp_mems = top_cpuset.mems_allowed;
2202 mutex_lock(&callback_mutex);
2203 top_cpuset.mems_allowed = new_mems;
2204 mutex_unlock(&callback_mutex);
2205 update_tasks_nodemask(&top_cpuset, &tmp_mems, NULL);
2206 }
2070 2207
2071 case CPUSET_MEM_OFFLINE: 2208 /* if cpus or mems went down, we need to propagate to descendants */
2072 while ((cp = cpuset_next(&queue)) != NULL) { 2209 if (cpus_offlined || mems_offlined) {
2210 struct cpuset *cs;
2211 struct cgroup *pos_cgrp;
2073 2212
2074 /* Continue past cpusets with all mems online */ 2213 rcu_read_lock();
2075 if (nodes_subset(cp->mems_allowed, 2214 cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset)
2076 node_states[N_HIGH_MEMORY])) 2215 schedule_cpuset_propagate_hotplug(cs);
2077 continue; 2216 rcu_read_unlock();
2217 }
2078 2218
2079 oldmems = cp->mems_allowed; 2219 mutex_unlock(&cpuset_mutex);
2080 2220
2081 /* Remove offline mems from this cpuset. */ 2221 /* wait for propagations to finish */
2082 mutex_lock(&callback_mutex); 2222 flush_workqueue(cpuset_propagate_hotplug_wq);
2083 nodes_and(cp->mems_allowed, cp->mems_allowed,
2084 node_states[N_HIGH_MEMORY]);
2085 mutex_unlock(&callback_mutex);
2086 2223
2087 /* Move tasks from the empty cpuset to a parent */ 2224 /* rebuild sched domains if cpus_allowed has changed */
2088 if (nodes_empty(cp->mems_allowed)) 2225 if (cpus_updated) {
2089 remove_tasks_in_empty_cpuset(cp); 2226 struct sched_domain_attr *attr;
2090 else 2227 cpumask_var_t *doms;
2091 update_tasks_nodemask(cp, &oldmems, NULL); 2228 int ndoms;
2092 } 2229
2230 mutex_lock(&cpuset_mutex);
2231 ndoms = generate_sched_domains(&doms, &attr);
2232 mutex_unlock(&cpuset_mutex);
2233
2234 partition_sched_domains(ndoms, doms, attr);
2093 } 2235 }
2094} 2236}
2095 2237
2096/*
2097 * The top_cpuset tracks what CPUs and Memory Nodes are online,
2098 * period. This is necessary in order to make cpusets transparent
2099 * (of no affect) on systems that are actively using CPU hotplug
2100 * but making no active use of cpusets.
2101 *
2102 * The only exception to this is suspend/resume, where we don't
2103 * modify cpusets at all.
2104 *
2105 * This routine ensures that top_cpuset.cpus_allowed tracks
2106 * cpu_active_mask on each CPU hotplug (cpuhp) event.
2107 *
2108 * Called within get_online_cpus(). Needs to call cgroup_lock()
2109 * before calling generate_sched_domains().
2110 *
2111 * @cpu_online: Indicates whether this is a CPU online event (true) or
2112 * a CPU offline event (false).
2113 */
2114void cpuset_update_active_cpus(bool cpu_online) 2238void cpuset_update_active_cpus(bool cpu_online)
2115{ 2239{
2116 struct sched_domain_attr *attr; 2240 /*
2117 cpumask_var_t *doms; 2241 * We're inside cpu hotplug critical region which usually nests
2118 int ndoms; 2242 * inside cgroup synchronization. Bounce actual hotplug processing
2119 2243 * to a work item to avoid reverse locking order.
2120 cgroup_lock(); 2244 *
2121 mutex_lock(&callback_mutex); 2245 * We still need to do partition_sched_domains() synchronously;
2122 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); 2246 * otherwise, the scheduler will get confused and put tasks on the
2123 mutex_unlock(&callback_mutex); 2247 * dead CPU. Fall back to the default single domain.
2124 2248 * cpuset_hotplug_workfn() will rebuild it as necessary.
2125 if (!cpu_online) 2249 */
2126 scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_CPU_OFFLINE); 2250 partition_sched_domains(1, NULL, NULL);
2127 2251 schedule_work(&cpuset_hotplug_work);
2128 ndoms = generate_sched_domains(&doms, &attr);
2129 cgroup_unlock();
2130
2131 /* Have scheduler rebuild the domains */
2132 partition_sched_domains(ndoms, doms, attr);
2133} 2252}
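
The right-hand column above defers the expensive cpuset hotplug handling to a work item and keeps only a cheap, synchronous partition_sched_domains(1, NULL, NULL) fallback in the hotplug callback itself, so the cgroup/hotplug lock order is never inverted. A minimal userspace sketch of that split, using plain pthreads rather than any kernel API:

#include <pthread.h>
#include <stdio.h>

static pthread_t worker;

static void *hotplug_workfn(void *arg)
{
        /* the heavy part: may take locks the hotplug path cannot take */
        printf("worker: rebuilding the full sched domain set\n");
        return NULL;
}

static void on_cpu_hotplug(void)
{
        /* cheap synchronous fallback so the system stays usable right now */
        printf("hotplug: falling back to a single default domain\n");

        /* bounce the real work to a worker to avoid reverse lock order */
        pthread_create(&worker, NULL, hotplug_workfn, NULL);
}

int main(void)
{
        on_cpu_hotplug();
        pthread_join(worker, NULL);
        return 0;
}
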
2134 2253
2135#ifdef CONFIG_MEMORY_HOTPLUG 2254#ifdef CONFIG_MEMORY_HOTPLUG
2136/* 2255/*
2137 * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY]. 2256 * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
2138 * Call this routine anytime after node_states[N_HIGH_MEMORY] changes. 2257 * Call this routine anytime after node_states[N_MEMORY] changes.
2139 * See cpuset_update_active_cpus() for CPU hotplug handling. 2258 * See cpuset_update_active_cpus() for CPU hotplug handling.
2140 */ 2259 */
2141static int cpuset_track_online_nodes(struct notifier_block *self, 2260static int cpuset_track_online_nodes(struct notifier_block *self,
2142 unsigned long action, void *arg) 2261 unsigned long action, void *arg)
2143{ 2262{
2144 static nodemask_t oldmems; /* protected by cgroup_mutex */ 2263 schedule_work(&cpuset_hotplug_work);
2145
2146 cgroup_lock();
2147 switch (action) {
2148 case MEM_ONLINE:
2149 oldmems = top_cpuset.mems_allowed;
2150 mutex_lock(&callback_mutex);
2151 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2152 mutex_unlock(&callback_mutex);
2153 update_tasks_nodemask(&top_cpuset, &oldmems, NULL);
2154 break;
2155 case MEM_OFFLINE:
2156 /*
2157 * needn't update top_cpuset.mems_allowed explicitly because
2158 * scan_cpusets_upon_hotplug() will update it.
2159 */
2160 scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_MEM_OFFLINE);
2161 break;
2162 default:
2163 break;
2164 }
2165 cgroup_unlock();
2166
2167 return NOTIFY_OK; 2264 return NOTIFY_OK;
2168} 2265}
2169#endif 2266#endif
@@ -2177,12 +2274,13 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
2177void __init cpuset_init_smp(void) 2274void __init cpuset_init_smp(void)
2178{ 2275{
2179 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); 2276 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
2180 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2277 top_cpuset.mems_allowed = node_states[N_MEMORY];
2181 2278
2182 hotplug_memory_notifier(cpuset_track_online_nodes, 10); 2279 hotplug_memory_notifier(cpuset_track_online_nodes, 10);
2183 2280
2184 cpuset_wq = create_singlethread_workqueue("cpuset"); 2281 cpuset_propagate_hotplug_wq =
2185 BUG_ON(!cpuset_wq); 2282 alloc_ordered_workqueue("cpuset_hotplug", 0);
2283 BUG_ON(!cpuset_propagate_hotplug_wq);
2186} 2284}
2187 2285
2188/** 2286/**
@@ -2245,7 +2343,7 @@ void cpuset_init_current_mems_allowed(void)
2245 * 2343 *
2246 * Description: Returns the nodemask_t mems_allowed of the cpuset 2344 * Description: Returns the nodemask_t mems_allowed of the cpuset
2247 * attached to the specified @tsk. Guaranteed to return some non-empty 2345 * attached to the specified @tsk. Guaranteed to return some non-empty
2248 * subset of node_states[N_HIGH_MEMORY], even if this means going outside the 2346 * subset of node_states[N_MEMORY], even if this means going outside the
2249 * tasks cpuset. 2347 * tasks cpuset.
2250 **/ 2348 **/
2251 2349
@@ -2281,8 +2379,8 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
2281 */ 2379 */
2282static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs) 2380static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
2283{ 2381{
2284 while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && cs->parent) 2382 while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs))
2285 cs = cs->parent; 2383 cs = parent_cs(cs);
2286 return cs; 2384 return cs;
2287} 2385}
2288 2386
@@ -2420,17 +2518,6 @@ int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
2420} 2518}
2421 2519
2422/** 2520/**
2423 * cpuset_unlock - release lock on cpuset changes
2424 *
2425 * Undo the lock taken in a previous cpuset_lock() call.
2426 */
2427
2428void cpuset_unlock(void)
2429{
2430 mutex_unlock(&callback_mutex);
2431}
2432
2433/**
2434 * cpuset_mem_spread_node() - On which node to begin search for a file page 2521 * cpuset_mem_spread_node() - On which node to begin search for a file page
2435 * cpuset_slab_spread_node() - On which node to begin search for a slab page 2522 * cpuset_slab_spread_node() - On which node to begin search for a slab page
2436 * 2523 *
@@ -2519,8 +2606,16 @@ void cpuset_print_task_mems_allowed(struct task_struct *tsk)
2519 2606
2520 dentry = task_cs(tsk)->css.cgroup->dentry; 2607 dentry = task_cs(tsk)->css.cgroup->dentry;
2521 spin_lock(&cpuset_buffer_lock); 2608 spin_lock(&cpuset_buffer_lock);
2522 snprintf(cpuset_name, CPUSET_NAME_LEN, 2609
2523 dentry ? (const char *)dentry->d_name.name : "/"); 2610 if (!dentry) {
2611 strcpy(cpuset_name, "/");
2612 } else {
2613 spin_lock(&dentry->d_lock);
2614 strlcpy(cpuset_name, (const char *)dentry->d_name.name,
2615 CPUSET_NAME_LEN);
2616 spin_unlock(&dentry->d_lock);
2617 }
2618
2524 nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN, 2619 nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
2525 tsk->mems_allowed); 2620 tsk->mems_allowed);
2526 printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n", 2621 printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n",
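
The old snprintf() above handed dentry->d_name.name to snprintf() as the format string and read the name without d_lock; the replacement takes d_lock and copies the bytes with strlcpy(). A small userspace illustration of the format-string half of that change, with snprintf "%s" standing in for the kernel's strlcpy():

#include <stdio.h>

int main(void)
{
        const char *dname = "grp-%s-%n";        /* hostile directory name */
        char buf[64];

        /*
         * Old pattern (unsafe): the name itself becomes the format string,
         * so any '%' conversions in it are interpreted:
         *
         *      snprintf(buf, sizeof(buf), dname);
         */

        /* Fixed pattern: treat the name purely as data, bounded copy. */
        snprintf(buf, sizeof(buf), "%s", dname);
        printf("cpuset=%s\n", buf);
        return 0;
}
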
@@ -2568,7 +2663,7 @@ void __cpuset_memory_pressure_bump(void)
2568 * - Used for /proc/<pid>/cpuset. 2663 * - Used for /proc/<pid>/cpuset.
2569 * - No need to task_lock(tsk) on this tsk->cpuset reference, as it 2664 * - No need to task_lock(tsk) on this tsk->cpuset reference, as it
2570 * doesn't really matter if tsk->cpuset changes after we read it, 2665 * doesn't really matter if tsk->cpuset changes after we read it,
2571 * and we take cgroup_mutex, keeping cpuset_attach() from changing it 2666 * and we take cpuset_mutex, keeping cpuset_attach() from changing it
2572 * anyway. 2667 * anyway.
2573 */ 2668 */
2574static int proc_cpuset_show(struct seq_file *m, void *unused_v) 2669static int proc_cpuset_show(struct seq_file *m, void *unused_v)
@@ -2590,16 +2685,15 @@ static int proc_cpuset_show(struct seq_file *m, void *unused_v)
2590 if (!tsk) 2685 if (!tsk)
2591 goto out_free; 2686 goto out_free;
2592 2687
2593 retval = -EINVAL; 2688 rcu_read_lock();
2594 cgroup_lock();
2595 css = task_subsys_state(tsk, cpuset_subsys_id); 2689 css = task_subsys_state(tsk, cpuset_subsys_id);
2596 retval = cgroup_path(css->cgroup, buf, PAGE_SIZE); 2690 retval = cgroup_path(css->cgroup, buf, PAGE_SIZE);
2691 rcu_read_unlock();
2597 if (retval < 0) 2692 if (retval < 0)
2598 goto out_unlock; 2693 goto out_put_task;
2599 seq_puts(m, buf); 2694 seq_puts(m, buf);
2600 seq_putc(m, '\n'); 2695 seq_putc(m, '\n');
2601out_unlock: 2696out_put_task:
2602 cgroup_unlock();
2603 put_task_struct(tsk); 2697 put_task_struct(tsk);
2604out_free: 2698out_free:
2605 kfree(buf); 2699 kfree(buf);
diff --git a/kernel/cred.c b/kernel/cred.c
index 48cea3da6d05..e0573a43c7df 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -30,17 +30,6 @@
30static struct kmem_cache *cred_jar; 30static struct kmem_cache *cred_jar;
31 31
32/* 32/*
33 * The common credentials for the initial task's thread group
34 */
35#ifdef CONFIG_KEYS
36static struct thread_group_cred init_tgcred = {
37 .usage = ATOMIC_INIT(2),
38 .tgid = 0,
39 .lock = __SPIN_LOCK_UNLOCKED(init_cred.tgcred.lock),
40};
41#endif
42
43/*
44 * The initial credentials for the initial task 33 * The initial credentials for the initial task
45 */ 34 */
46struct cred init_cred = { 35struct cred init_cred = {
@@ -65,9 +54,6 @@ struct cred init_cred = {
65 .user = INIT_USER, 54 .user = INIT_USER,
66 .user_ns = &init_user_ns, 55 .user_ns = &init_user_ns,
67 .group_info = &init_groups, 56 .group_info = &init_groups,
68#ifdef CONFIG_KEYS
69 .tgcred = &init_tgcred,
70#endif
71}; 57};
72 58
73static inline void set_cred_subscribers(struct cred *cred, int n) 59static inline void set_cred_subscribers(struct cred *cred, int n)
@@ -96,36 +82,6 @@ static inline void alter_cred_subscribers(const struct cred *_cred, int n)
96} 82}
97 83
98/* 84/*
99 * Dispose of the shared task group credentials
100 */
101#ifdef CONFIG_KEYS
102static void release_tgcred_rcu(struct rcu_head *rcu)
103{
104 struct thread_group_cred *tgcred =
105 container_of(rcu, struct thread_group_cred, rcu);
106
107 BUG_ON(atomic_read(&tgcred->usage) != 0);
108
109 key_put(tgcred->session_keyring);
110 key_put(tgcred->process_keyring);
111 kfree(tgcred);
112}
113#endif
114
115/*
116 * Release a set of thread group credentials.
117 */
118static void release_tgcred(struct cred *cred)
119{
120#ifdef CONFIG_KEYS
121 struct thread_group_cred *tgcred = cred->tgcred;
122
123 if (atomic_dec_and_test(&tgcred->usage))
124 call_rcu(&tgcred->rcu, release_tgcred_rcu);
125#endif
126}
127
128/*
129 * The RCU callback to actually dispose of a set of credentials 85 * The RCU callback to actually dispose of a set of credentials
130 */ 86 */
131static void put_cred_rcu(struct rcu_head *rcu) 87static void put_cred_rcu(struct rcu_head *rcu)
@@ -150,9 +106,10 @@ static void put_cred_rcu(struct rcu_head *rcu)
150#endif 106#endif
151 107
152 security_cred_free(cred); 108 security_cred_free(cred);
109 key_put(cred->session_keyring);
110 key_put(cred->process_keyring);
153 key_put(cred->thread_keyring); 111 key_put(cred->thread_keyring);
154 key_put(cred->request_key_auth); 112 key_put(cred->request_key_auth);
155 release_tgcred(cred);
156 if (cred->group_info) 113 if (cred->group_info)
157 put_group_info(cred->group_info); 114 put_group_info(cred->group_info);
158 free_uid(cred->user); 115 free_uid(cred->user);
@@ -246,15 +203,6 @@ struct cred *cred_alloc_blank(void)
246 if (!new) 203 if (!new)
247 return NULL; 204 return NULL;
248 205
249#ifdef CONFIG_KEYS
250 new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL);
251 if (!new->tgcred) {
252 kmem_cache_free(cred_jar, new);
253 return NULL;
254 }
255 atomic_set(&new->tgcred->usage, 1);
256#endif
257
258 atomic_set(&new->usage, 1); 206 atomic_set(&new->usage, 1);
259#ifdef CONFIG_DEBUG_CREDENTIALS 207#ifdef CONFIG_DEBUG_CREDENTIALS
260 new->magic = CRED_MAGIC; 208 new->magic = CRED_MAGIC;
@@ -308,9 +256,10 @@ struct cred *prepare_creds(void)
308 get_user_ns(new->user_ns); 256 get_user_ns(new->user_ns);
309 257
310#ifdef CONFIG_KEYS 258#ifdef CONFIG_KEYS
259 key_get(new->session_keyring);
260 key_get(new->process_keyring);
311 key_get(new->thread_keyring); 261 key_get(new->thread_keyring);
312 key_get(new->request_key_auth); 262 key_get(new->request_key_auth);
313 atomic_inc(&new->tgcred->usage);
314#endif 263#endif
315 264
316#ifdef CONFIG_SECURITY 265#ifdef CONFIG_SECURITY
@@ -334,39 +283,20 @@ EXPORT_SYMBOL(prepare_creds);
334 */ 283 */
335struct cred *prepare_exec_creds(void) 284struct cred *prepare_exec_creds(void)
336{ 285{
337 struct thread_group_cred *tgcred = NULL;
338 struct cred *new; 286 struct cred *new;
339 287
340#ifdef CONFIG_KEYS
341 tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL);
342 if (!tgcred)
343 return NULL;
344#endif
345
346 new = prepare_creds(); 288 new = prepare_creds();
347 if (!new) { 289 if (!new)
348 kfree(tgcred);
349 return new; 290 return new;
350 }
351 291
352#ifdef CONFIG_KEYS 292#ifdef CONFIG_KEYS
353 /* newly exec'd tasks don't get a thread keyring */ 293 /* newly exec'd tasks don't get a thread keyring */
354 key_put(new->thread_keyring); 294 key_put(new->thread_keyring);
355 new->thread_keyring = NULL; 295 new->thread_keyring = NULL;
356 296
357 /* create a new per-thread-group creds for all this set of threads to
358 * share */
359 memcpy(tgcred, new->tgcred, sizeof(struct thread_group_cred));
360
361 atomic_set(&tgcred->usage, 1);
362 spin_lock_init(&tgcred->lock);
363
364 /* inherit the session keyring; new process keyring */ 297 /* inherit the session keyring; new process keyring */
365 key_get(tgcred->session_keyring); 298 key_put(new->process_keyring);
366 tgcred->process_keyring = NULL; 299 new->process_keyring = NULL;
367
368 release_tgcred(new);
369 new->tgcred = tgcred;
370#endif 300#endif
371 301
372 return new; 302 return new;
@@ -383,9 +313,6 @@ struct cred *prepare_exec_creds(void)
383 */ 313 */
384int copy_creds(struct task_struct *p, unsigned long clone_flags) 314int copy_creds(struct task_struct *p, unsigned long clone_flags)
385{ 315{
386#ifdef CONFIG_KEYS
387 struct thread_group_cred *tgcred;
388#endif
389 struct cred *new; 316 struct cred *new;
390 int ret; 317 int ret;
391 318
@@ -425,22 +352,12 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
425 install_thread_keyring_to_cred(new); 352 install_thread_keyring_to_cred(new);
426 } 353 }
427 354
428 /* we share the process and session keyrings between all the threads in 355 /* The process keyring is only shared between the threads in a process;
429 * a process - this is slightly icky as we violate COW credentials a 356 * anything outside of those threads doesn't inherit.
430 * bit */ 357 */
431 if (!(clone_flags & CLONE_THREAD)) { 358 if (!(clone_flags & CLONE_THREAD)) {
432 tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL); 359 key_put(new->process_keyring);
433 if (!tgcred) { 360 new->process_keyring = NULL;
434 ret = -ENOMEM;
435 goto error_put;
436 }
437 atomic_set(&tgcred->usage, 1);
438 spin_lock_init(&tgcred->lock);
439 tgcred->process_keyring = NULL;
440 tgcred->session_keyring = key_get(new->tgcred->session_keyring);
441
442 release_tgcred(new);
443 new->tgcred = tgcred;
444 } 361 }
445#endif 362#endif
446 363
@@ -455,6 +372,31 @@ error_put:
455 return ret; 372 return ret;
456} 373}
457 374
375static bool cred_cap_issubset(const struct cred *set, const struct cred *subset)
376{
377 const struct user_namespace *set_ns = set->user_ns;
378 const struct user_namespace *subset_ns = subset->user_ns;
379
380 /* If the two credentials are in the same user namespace, see if
381 * the capabilities of subset are a subset of set.
382 */
383 if (set_ns == subset_ns)
384 return cap_issubset(subset->cap_permitted, set->cap_permitted);
385
386 /* The credentials are in different user namespaces;
387 * therefore one is a subset of the other only if set's namespace
388 * is an ancestor of subset's and set->euid owns subset's
389 * namespace or one of its ancestors.
390 */
391 for (;subset_ns != &init_user_ns; subset_ns = subset_ns->parent) {
392 if ((set_ns == subset_ns->parent) &&
393 uid_eq(subset_ns->owner, set->euid))
394 return true;
395 }
396
397 return false;
398}
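
cred_cap_issubset() compares capability masks directly only when both credentials live in the same user namespace; across namespaces, set dominates subset only if set's namespace is the parent of one of subset's ancestor namespaces and set->euid owns that namespace. A toy userspace model of the same rule; every type and name below is illustrative, not kernel API:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct user_ns {
        struct user_ns *parent;
        unsigned int owner_uid;
};

struct toy_cred {
        struct user_ns *user_ns;
        uint64_t cap_permitted;
        unsigned int euid;
};

static struct user_ns init_ns = { .parent = NULL, .owner_uid = 0 };

static bool toy_cap_issubset(uint64_t a, uint64_t b)
{
        return (a & ~b) == 0;           /* every bit of a is also in b */
}

static bool toy_cred_cap_issubset(const struct toy_cred *set,
                                  const struct toy_cred *subset)
{
        const struct user_ns *sns = subset->user_ns;

        if (set->user_ns == subset->user_ns)
                return toy_cap_issubset(subset->cap_permitted,
                                        set->cap_permitted);

        for (; sns != &init_ns; sns = sns->parent)
                if (set->user_ns == sns->parent &&
                    sns->owner_uid == set->euid)
                        return true;

        return false;
}

int main(void)
{
        struct user_ns child = { .parent = &init_ns, .owner_uid = 1000 };
        struct toy_cred parent_cred = { &init_ns, 0x3, 1000 };
        struct toy_cred child_cred  = { &child,  0xff, 1000 };

        /* uid 1000 owns the child namespace, so its creds dominate it even
         * though the child holds more capability bits inside its own ns */
        printf("%d\n", toy_cred_cap_issubset(&parent_cred, &child_cred));
        return 0;
}
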
399
458/** 400/**
459 * commit_creds - Install new credentials upon the current task 401 * commit_creds - Install new credentials upon the current task
460 * @new: The credentials to be assigned 402 * @new: The credentials to be assigned
@@ -493,7 +435,7 @@ int commit_creds(struct cred *new)
493 !gid_eq(old->egid, new->egid) || 435 !gid_eq(old->egid, new->egid) ||
494 !uid_eq(old->fsuid, new->fsuid) || 436 !uid_eq(old->fsuid, new->fsuid) ||
495 !gid_eq(old->fsgid, new->fsgid) || 437 !gid_eq(old->fsgid, new->fsgid) ||
496 !cap_issubset(new->cap_permitted, old->cap_permitted)) { 438 !cred_cap_issubset(old, new)) {
497 if (task->mm) 439 if (task->mm)
498 set_dumpable(task->mm, suid_dumpable); 440 set_dumpable(task->mm, suid_dumpable);
499 task->pdeath_signal = 0; 441 task->pdeath_signal = 0;
@@ -643,9 +585,6 @@ void __init cred_init(void)
643 */ 585 */
644struct cred *prepare_kernel_cred(struct task_struct *daemon) 586struct cred *prepare_kernel_cred(struct task_struct *daemon)
645{ 587{
646#ifdef CONFIG_KEYS
647 struct thread_group_cred *tgcred;
648#endif
649 const struct cred *old; 588 const struct cred *old;
650 struct cred *new; 589 struct cred *new;
651 590
@@ -653,14 +592,6 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
653 if (!new) 592 if (!new)
654 return NULL; 593 return NULL;
655 594
656#ifdef CONFIG_KEYS
657 tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL);
658 if (!tgcred) {
659 kmem_cache_free(cred_jar, new);
660 return NULL;
661 }
662#endif
663
664 kdebug("prepare_kernel_cred() alloc %p", new); 595 kdebug("prepare_kernel_cred() alloc %p", new);
665 596
666 if (daemon) 597 if (daemon)
@@ -678,13 +609,10 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
678 get_group_info(new->group_info); 609 get_group_info(new->group_info);
679 610
680#ifdef CONFIG_KEYS 611#ifdef CONFIG_KEYS
681 atomic_set(&tgcred->usage, 1); 612 new->session_keyring = NULL;
682 spin_lock_init(&tgcred->lock); 613 new->process_keyring = NULL;
683 tgcred->process_keyring = NULL;
684 tgcred->session_keyring = NULL;
685 new->tgcred = tgcred;
686 new->request_key_auth = NULL;
687 new->thread_keyring = NULL; 614 new->thread_keyring = NULL;
615 new->request_key_auth = NULL;
688 new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; 616 new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
689#endif 617#endif
690 618
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 9a61738cefc8..c26278fd4851 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -29,6 +29,7 @@
29 */ 29 */
30#include <linux/pid_namespace.h> 30#include <linux/pid_namespace.h>
31#include <linux/clocksource.h> 31#include <linux/clocksource.h>
32#include <linux/serial_core.h>
32#include <linux/interrupt.h> 33#include <linux/interrupt.h>
33#include <linux/spinlock.h> 34#include <linux/spinlock.h>
34#include <linux/console.h> 35#include <linux/console.h>
diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h
index 3494c28a7e7a..2235967e78b0 100644
--- a/kernel/debug/debug_core.h
+++ b/kernel/debug/debug_core.h
@@ -72,6 +72,8 @@ extern int dbg_kdb_mode;
72#ifdef CONFIG_KGDB_KDB 72#ifdef CONFIG_KGDB_KDB
73extern int kdb_stub(struct kgdb_state *ks); 73extern int kdb_stub(struct kgdb_state *ks);
74extern int kdb_parse(const char *cmdstr); 74extern int kdb_parse(const char *cmdstr);
75extern int kdb_common_init_state(struct kgdb_state *ks);
76extern int kdb_common_deinit_state(void);
75#else /* ! CONFIG_KGDB_KDB */ 77#else /* ! CONFIG_KGDB_KDB */
76static inline int kdb_stub(struct kgdb_state *ks) 78static inline int kdb_stub(struct kgdb_state *ks)
77{ 79{
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index ce615e064482..19d9a578c753 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -31,6 +31,7 @@
31#include <linux/kernel.h> 31#include <linux/kernel.h>
32#include <linux/kgdb.h> 32#include <linux/kgdb.h>
33#include <linux/kdb.h> 33#include <linux/kdb.h>
34#include <linux/serial_core.h>
34#include <linux/reboot.h> 35#include <linux/reboot.h>
35#include <linux/uaccess.h> 36#include <linux/uaccess.h>
36#include <asm/cacheflush.h> 37#include <asm/cacheflush.h>
@@ -782,7 +783,10 @@ static void gdb_cmd_query(struct kgdb_state *ks)
782 len = len / 2; 783 len = len / 2;
783 remcom_out_buffer[len++] = 0; 784 remcom_out_buffer[len++] = 0;
784 785
786 kdb_common_init_state(ks);
785 kdb_parse(remcom_out_buffer); 787 kdb_parse(remcom_out_buffer);
788 kdb_common_deinit_state();
789
786 strcpy(remcom_out_buffer, "OK"); 790 strcpy(remcom_out_buffer, "OK");
787 } 791 }
788 break; 792 break;
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c
index 8418c2f8ec5d..70a504601dc3 100644
--- a/kernel/debug/kdb/kdb_bp.c
+++ b/kernel/debug/kdb/kdb_bp.c
@@ -486,11 +486,9 @@ static int kdb_bc(int argc, const char **argv)
486/* 486/*
487 * kdb_ss 487 * kdb_ss
488 * 488 *
489 * Process the 'ss' (Single Step) and 'ssb' (Single Step to Branch) 489 * Process the 'ss' (Single Step) command.
490 * commands.
491 * 490 *
492 * ss 491 * ss
493 * ssb
494 * 492 *
495 * Parameters: 493 * Parameters:
496 * argc Argument count 494 * argc Argument count
@@ -498,35 +496,23 @@ static int kdb_bc(int argc, const char **argv)
498 * Outputs: 496 * Outputs:
499 * None. 497 * None.
500 * Returns: 498 * Returns:
501 * KDB_CMD_SS[B] for success, a kdb error if failure. 499 * KDB_CMD_SS for success, a kdb error if failure.
502 * Locking: 500 * Locking:
503 * None. 501 * None.
504 * Remarks: 502 * Remarks:
505 * 503 *
506 * Set the arch specific option to trigger a debug trap after the next 504 * Set the arch specific option to trigger a debug trap after the next
507 * instruction. 505 * instruction.
508 *
509 * For 'ssb', set the trace flag in the debug trap handler
510 * after printing the current insn and return directly without
511 * invoking the kdb command processor, until a branch instruction
512 * is encountered.
513 */ 506 */
514 507
515static int kdb_ss(int argc, const char **argv) 508static int kdb_ss(int argc, const char **argv)
516{ 509{
517 int ssb = 0;
518
519 ssb = (strcmp(argv[0], "ssb") == 0);
520 if (argc != 0) 510 if (argc != 0)
521 return KDB_ARGCOUNT; 511 return KDB_ARGCOUNT;
522 /* 512 /*
523 * Set trace flag and go. 513 * Set trace flag and go.
524 */ 514 */
525 KDB_STATE_SET(DOING_SS); 515 KDB_STATE_SET(DOING_SS);
526 if (ssb) {
527 KDB_STATE_SET(DOING_SSB);
528 return KDB_CMD_SSB;
529 }
530 return KDB_CMD_SS; 516 return KDB_CMD_SS;
531} 517}
532 518
@@ -561,8 +547,6 @@ void __init kdb_initbptab(void)
561 547
562 kdb_register_repeat("ss", kdb_ss, "", 548 kdb_register_repeat("ss", kdb_ss, "",
563 "Single Step", 1, KDB_REPEAT_NO_ARGS); 549 "Single Step", 1, KDB_REPEAT_NO_ARGS);
564 kdb_register_repeat("ssb", kdb_ss, "",
565 "Single step to branch/call", 0, KDB_REPEAT_NO_ARGS);
566 /* 550 /*
567 * Architecture dependent initialization. 551 * Architecture dependent initialization.
568 */ 552 */
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
index be7b33b73d30..328d18ef31e4 100644
--- a/kernel/debug/kdb/kdb_debugger.c
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -34,6 +34,22 @@ EXPORT_SYMBOL_GPL(kdb_poll_idx);
34 34
35static struct kgdb_state *kdb_ks; 35static struct kgdb_state *kdb_ks;
36 36
37int kdb_common_init_state(struct kgdb_state *ks)
38{
39 kdb_initial_cpu = atomic_read(&kgdb_active);
40 kdb_current_task = kgdb_info[ks->cpu].task;
41 kdb_current_regs = kgdb_info[ks->cpu].debuggerinfo;
42 return 0;
43}
44
45int kdb_common_deinit_state(void)
46{
47 kdb_initial_cpu = -1;
48 kdb_current_task = NULL;
49 kdb_current_regs = NULL;
50 return 0;
51}
52
37int kdb_stub(struct kgdb_state *ks) 53int kdb_stub(struct kgdb_state *ks)
38{ 54{
39 int error = 0; 55 int error = 0;
@@ -94,13 +110,10 @@ int kdb_stub(struct kgdb_state *ks)
94 } 110 }
95 /* Set initial kdb state variables */ 111 /* Set initial kdb state variables */
96 KDB_STATE_CLEAR(KGDB_TRANS); 112 KDB_STATE_CLEAR(KGDB_TRANS);
97 kdb_initial_cpu = atomic_read(&kgdb_active); 113 kdb_common_init_state(ks);
98 kdb_current_task = kgdb_info[ks->cpu].task;
99 kdb_current_regs = kgdb_info[ks->cpu].debuggerinfo;
100 /* Remove any breakpoints as needed by kdb and clear single step */ 114 /* Remove any breakpoints as needed by kdb and clear single step */
101 kdb_bp_remove(); 115 kdb_bp_remove();
102 KDB_STATE_CLEAR(DOING_SS); 116 KDB_STATE_CLEAR(DOING_SS);
103 KDB_STATE_CLEAR(DOING_SSB);
104 KDB_STATE_SET(PAGER); 117 KDB_STATE_SET(PAGER);
105 /* zero out any offline cpu data */ 118 /* zero out any offline cpu data */
106 for_each_present_cpu(i) { 119 for_each_present_cpu(i) {
@@ -125,9 +138,7 @@ int kdb_stub(struct kgdb_state *ks)
125 * Upon exit from the kdb main loop setup break points and restart 138 * Upon exit from the kdb main loop setup break points and restart
126 * the system based on the requested continue state 139 * the system based on the requested continue state
127 */ 140 */
128 kdb_initial_cpu = -1; 141 kdb_common_deinit_state();
129 kdb_current_task = NULL;
130 kdb_current_regs = NULL;
131 KDB_STATE_CLEAR(PAGER); 142 KDB_STATE_CLEAR(PAGER);
132 kdbnearsym_cleanup(); 143 kdbnearsym_cleanup();
133 if (error == KDB_CMD_KGDB) { 144 if (error == KDB_CMD_KGDB) {
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 4d5f8d5612f3..00eb8f7fbf41 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -124,7 +124,7 @@ static kdbmsg_t kdbmsgs[] = {
124}; 124};
125#undef KDBMSG 125#undef KDBMSG
126 126
127static const int __nkdb_err = sizeof(kdbmsgs) / sizeof(kdbmsg_t); 127static const int __nkdb_err = ARRAY_SIZE(kdbmsgs);
128 128
129 129
130/* 130/*
@@ -175,7 +175,7 @@ static char *__env[] = {
175 (char *)0, 175 (char *)0,
176}; 176};
177 177
178static const int __nenv = (sizeof(__env) / sizeof(char *)); 178static const int __nenv = ARRAY_SIZE(__env);
179 179
180struct task_struct *kdb_curr_task(int cpu) 180struct task_struct *kdb_curr_task(int cpu)
181{ 181{
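
The two ARRAY_SIZE() conversions above replace hand-written sizeof divisions. The idiom in standalone C (the kernel macro additionally refuses non-array arguments, which this plain version does not):

#include <stdio.h>

#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))

static const char *msgs[] = { "NOTFOUND", "ARGCOUNT", "BADWIDTH" };

int main(void)
{
        /* same value as the old sizeof(msgs) / sizeof(char *) spelling,
         * but it keeps working if the element type ever changes */
        printf("%zu messages\n", ARRAY_SIZE(msgs));
        return 0;
}
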
@@ -681,34 +681,50 @@ static int kdb_defcmd(int argc, const char **argv)
681 } 681 }
682 if (argc != 3) 682 if (argc != 3)
683 return KDB_ARGCOUNT; 683 return KDB_ARGCOUNT;
684 defcmd_set = kmalloc((defcmd_set_count + 1) * sizeof(*defcmd_set), 684 if (in_dbg_master()) {
685 GFP_KDB); 685 kdb_printf("Command only available during kdb_init()\n");
686 if (!defcmd_set) {
687 kdb_printf("Could not allocate new defcmd_set entry for %s\n",
688 argv[1]);
689 defcmd_set = save_defcmd_set;
690 return KDB_NOTIMP; 686 return KDB_NOTIMP;
691 } 687 }
688 defcmd_set = kmalloc((defcmd_set_count + 1) * sizeof(*defcmd_set),
689 GFP_KDB);
690 if (!defcmd_set)
691 goto fail_defcmd;
692 memcpy(defcmd_set, save_defcmd_set, 692 memcpy(defcmd_set, save_defcmd_set,
693 defcmd_set_count * sizeof(*defcmd_set)); 693 defcmd_set_count * sizeof(*defcmd_set));
694 kfree(save_defcmd_set);
695 s = defcmd_set + defcmd_set_count; 694 s = defcmd_set + defcmd_set_count;
696 memset(s, 0, sizeof(*s)); 695 memset(s, 0, sizeof(*s));
697 s->usable = 1; 696 s->usable = 1;
698 s->name = kdb_strdup(argv[1], GFP_KDB); 697 s->name = kdb_strdup(argv[1], GFP_KDB);
698 if (!s->name)
699 goto fail_name;
699 s->usage = kdb_strdup(argv[2], GFP_KDB); 700 s->usage = kdb_strdup(argv[2], GFP_KDB);
701 if (!s->usage)
702 goto fail_usage;
700 s->help = kdb_strdup(argv[3], GFP_KDB); 703 s->help = kdb_strdup(argv[3], GFP_KDB);
704 if (!s->help)
705 goto fail_help;
701 if (s->usage[0] == '"') { 706 if (s->usage[0] == '"') {
702 strcpy(s->usage, s->usage+1); 707 strcpy(s->usage, argv[2]+1);
703 s->usage[strlen(s->usage)-1] = '\0'; 708 s->usage[strlen(s->usage)-1] = '\0';
704 } 709 }
705 if (s->help[0] == '"') { 710 if (s->help[0] == '"') {
706 strcpy(s->help, s->help+1); 711 strcpy(s->help, argv[3]+1);
707 s->help[strlen(s->help)-1] = '\0'; 712 s->help[strlen(s->help)-1] = '\0';
708 } 713 }
709 ++defcmd_set_count; 714 ++defcmd_set_count;
710 defcmd_in_progress = 1; 715 defcmd_in_progress = 1;
716 kfree(save_defcmd_set);
711 return 0; 717 return 0;
718fail_help:
719 kfree(s->usage);
720fail_usage:
721 kfree(s->name);
722fail_name:
723 kfree(defcmd_set);
724fail_defcmd:
725 kdb_printf("Could not allocate new defcmd_set entry for %s\n", argv[1]);
726 defcmd_set = save_defcmd_set;
727 return KDB_NOTIMP;
712} 728}
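
The rewritten kdb_defcmd() unwinds allocation failures through a chain of fail_* labels so that each failure frees exactly what was already allocated, in reverse order. The same pattern in a self-contained userspace sketch with hypothetical names:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct cmd {
        char *name;
        char *usage;
        char *help;
};

static struct cmd *cmd_create(const char *name, const char *usage,
                              const char *help)
{
        struct cmd *c = calloc(1, sizeof(*c));

        if (!c)
                goto fail_cmd;
        c->name = strdup(name);
        if (!c->name)
                goto fail_name;
        c->usage = strdup(usage);
        if (!c->usage)
                goto fail_usage;
        c->help = strdup(help);
        if (!c->help)
                goto fail_help;
        return c;

fail_help:
        free(c->usage);
fail_usage:
        free(c->name);
fail_name:
        free(c);
fail_cmd:
        fprintf(stderr, "could not allocate command %s\n", name);
        return NULL;
}

int main(void)
{
        struct cmd *c = cmd_create("dumpall", "\"usage\"", "dump everything");

        if (c) {
                printf("registered %s\n", c->name);
                free(c->help);
                free(c->usage);
                free(c->name);
                free(c);
        }
        return 0;
}
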
713 729
714/* 730/*
@@ -1112,7 +1128,6 @@ void kdb_set_current_task(struct task_struct *p)
1112 * KDB_CMD_GO User typed 'go'. 1128 * KDB_CMD_GO User typed 'go'.
1113 * KDB_CMD_CPU User switched to another cpu. 1129 * KDB_CMD_CPU User switched to another cpu.
1114 * KDB_CMD_SS Single step. 1130 * KDB_CMD_SS Single step.
1115 * KDB_CMD_SSB Single step until branch.
1116 */ 1131 */
1117static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs, 1132static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
1118 kdb_dbtrap_t db_result) 1133 kdb_dbtrap_t db_result)
@@ -1151,14 +1166,6 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
1151 kdb_printf("due to Debug @ " kdb_machreg_fmt "\n", 1166 kdb_printf("due to Debug @ " kdb_machreg_fmt "\n",
1152 instruction_pointer(regs)); 1167 instruction_pointer(regs));
1153 break; 1168 break;
1154 case KDB_DB_SSB:
1155 /*
1156 * In the midst of ssb command. Just return.
1157 */
1158 KDB_DEBUG_STATE("kdb_local 3", reason);
1159 return KDB_CMD_SSB; /* Continue with SSB command */
1160
1161 break;
1162 case KDB_DB_SS: 1169 case KDB_DB_SS:
1163 break; 1170 break;
1164 case KDB_DB_SSBPT: 1171 case KDB_DB_SSBPT:
@@ -1281,7 +1288,6 @@ do_full_getstr:
1281 if (diag == KDB_CMD_GO 1288 if (diag == KDB_CMD_GO
1282 || diag == KDB_CMD_CPU 1289 || diag == KDB_CMD_CPU
1283 || diag == KDB_CMD_SS 1290 || diag == KDB_CMD_SS
1284 || diag == KDB_CMD_SSB
1285 || diag == KDB_CMD_KGDB) 1291 || diag == KDB_CMD_KGDB)
1286 break; 1292 break;
1287 1293
@@ -1368,12 +1374,6 @@ int kdb_main_loop(kdb_reason_t reason, kdb_reason_t reason2, int error,
1368 break; 1374 break;
1369 } 1375 }
1370 1376
1371 if (result == KDB_CMD_SSB) {
1372 KDB_STATE_SET(DOING_SS);
1373 KDB_STATE_SET(DOING_SSB);
1374 break;
1375 }
1376
1377 if (result == KDB_CMD_KGDB) { 1377 if (result == KDB_CMD_KGDB) {
1378 if (!KDB_STATE(DOING_KGDB)) 1378 if (!KDB_STATE(DOING_KGDB))
1379 kdb_printf("Entering please attach debugger " 1379 kdb_printf("Entering please attach debugger "
@@ -1970,6 +1970,8 @@ static int kdb_lsmod(int argc, const char **argv)
1970 1970
1971 kdb_printf("Module Size modstruct Used by\n"); 1971 kdb_printf("Module Size modstruct Used by\n");
1972 list_for_each_entry(mod, kdb_modules, list) { 1972 list_for_each_entry(mod, kdb_modules, list) {
1973 if (mod->state == MODULE_STATE_UNFORMED)
1974 continue;
1973 1975
1974 kdb_printf("%-20s%8u 0x%p ", mod->name, 1976 kdb_printf("%-20s%8u 0x%p ", mod->name,
1975 mod->core_size, (void *)mod); 1977 mod->core_size, (void *)mod);
@@ -2348,69 +2350,6 @@ static int kdb_pid(int argc, const char **argv)
2348 return 0; 2350 return 0;
2349} 2351}
2350 2352
2351/*
2352 * kdb_ll - This function implements the 'll' command which follows a
2353 * linked list and executes an arbitrary command for each
2354 * element.
2355 */
2356static int kdb_ll(int argc, const char **argv)
2357{
2358 int diag = 0;
2359 unsigned long addr;
2360 long offset = 0;
2361 unsigned long va;
2362 unsigned long linkoffset;
2363 int nextarg;
2364 const char *command;
2365
2366 if (argc != 3)
2367 return KDB_ARGCOUNT;
2368
2369 nextarg = 1;
2370 diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL);
2371 if (diag)
2372 return diag;
2373
2374 diag = kdbgetularg(argv[2], &linkoffset);
2375 if (diag)
2376 return diag;
2377
2378 /*
2379 * Using the starting address as
2380 * the first element in the list, and assuming that
2381 * the list ends with a null pointer.
2382 */
2383
2384 va = addr;
2385 command = kdb_strdup(argv[3], GFP_KDB);
2386 if (!command) {
2387 kdb_printf("%s: cannot duplicate command\n", __func__);
2388 return 0;
2389 }
2390 /* Recursive use of kdb_parse, do not use argv after this point */
2391 argv = NULL;
2392
2393 while (va) {
2394 char buf[80];
2395
2396 if (KDB_FLAG(CMD_INTERRUPT))
2397 goto out;
2398
2399 sprintf(buf, "%s " kdb_machreg_fmt "\n", command, va);
2400 diag = kdb_parse(buf);
2401 if (diag)
2402 goto out;
2403
2404 addr = va + linkoffset;
2405 if (kdb_getword(&va, addr, sizeof(va)))
2406 goto out;
2407 }
2408
2409out:
2410 kfree(command);
2411 return diag;
2412}
2413
2414static int kdb_kgdb(int argc, const char **argv) 2353static int kdb_kgdb(int argc, const char **argv)
2415{ 2354{
2416 return KDB_CMD_KGDB; 2355 return KDB_CMD_KGDB;
@@ -2428,11 +2367,15 @@ static int kdb_help(int argc, const char **argv)
2428 kdb_printf("-----------------------------" 2367 kdb_printf("-----------------------------"
2429 "-----------------------------\n"); 2368 "-----------------------------\n");
2430 for_each_kdbcmd(kt, i) { 2369 for_each_kdbcmd(kt, i) {
2431 if (kt->cmd_name) 2370 char *space = "";
2432 kdb_printf("%-15.15s %-20.20s %s\n", kt->cmd_name,
2433 kt->cmd_usage, kt->cmd_help);
2434 if (KDB_FLAG(CMD_INTERRUPT)) 2371 if (KDB_FLAG(CMD_INTERRUPT))
2435 return 0; 2372 return 0;
2373 if (!kt->cmd_name)
2374 continue;
2375 if (strlen(kt->cmd_usage) > 20)
2376 space = "\n ";
2377 kdb_printf("%-15.15s %-20s%s%s\n", kt->cmd_name,
2378 kt->cmd_usage, space, kt->cmd_help);
2436 } 2379 }
2437 return 0; 2380 return 0;
2438} 2381}
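
The reworked kdb_help() loop above pushes any usage string wider than its 20-character column onto an indented continuation line so the help text still lines up. A small userspace demo of that formatting decision, with illustrative command data:

#include <stdio.h>
#include <string.h>

static void print_row(const char *name, const char *usage, const char *help)
{
        const char *space = "";

        if (strlen(usage) > 20)
                space = "\n                                    ";
        printf("%-15.15s %-20s%s%s\n", name, usage, space, help);
}

int main(void)
{
        print_row("ss", "", "Single Step");
        print_row("bta", "[D|R|S|T|C|Z|E|U|I|M|A]",
                  "Backtrace all processes matching state flag");
        return 0;
}
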
@@ -2737,7 +2680,7 @@ int kdb_register_repeat(char *cmd,
2737 (kdb_max_commands - KDB_BASE_CMD_MAX) * sizeof(*new)); 2680 (kdb_max_commands - KDB_BASE_CMD_MAX) * sizeof(*new));
2738 kfree(kdb_commands); 2681 kfree(kdb_commands);
2739 } 2682 }
2740 memset(new + kdb_max_commands, 0, 2683 memset(new + kdb_max_commands - KDB_BASE_CMD_MAX, 0,
2741 kdb_command_extend * sizeof(*new)); 2684 kdb_command_extend * sizeof(*new));
2742 kdb_commands = new; 2685 kdb_commands = new;
2743 kp = kdb_commands + kdb_max_commands - KDB_BASE_CMD_MAX; 2686 kp = kdb_commands + kdb_max_commands - KDB_BASE_CMD_MAX;
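
The memset() fix above zeroes only the freshly added tail of the grown command table, starting right after the copied entries instead of KDB_BASE_CMD_MAX slots beyond them. A simplified userspace version of the grow/copy/zero pattern, without the KDB index offset:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct entry {
        const char *name;
};

int main(void)
{
        size_t old_n = 4, extend = 8;
        struct entry *old = calloc(old_n, sizeof(*old));
        struct entry *new = malloc((old_n + extend) * sizeof(*new));

        if (!old || !new) {
                free(old);
                free(new);
                return 1;
        }

        memcpy(new, old, old_n * sizeof(*new));
        /*
         * Zero exactly the new slots: start at new + old_n; starting any
         * further out would run past what was just allocated.
         */
        memset(new + old_n, 0, extend * sizeof(*new));
        free(old);

        printf("table grown to %zu slots\n", old_n + extend);
        free(new);
        return 0;
}
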
@@ -2841,15 +2784,13 @@ static void __init kdb_inittab(void)
2841 "Stack traceback", 1, KDB_REPEAT_NONE); 2784 "Stack traceback", 1, KDB_REPEAT_NONE);
2842 kdb_register_repeat("btp", kdb_bt, "<pid>", 2785 kdb_register_repeat("btp", kdb_bt, "<pid>",
2843 "Display stack for process <pid>", 0, KDB_REPEAT_NONE); 2786 "Display stack for process <pid>", 0, KDB_REPEAT_NONE);
2844 kdb_register_repeat("bta", kdb_bt, "[DRSTCZEUIMA]", 2787 kdb_register_repeat("bta", kdb_bt, "[D|R|S|T|C|Z|E|U|I|M|A]",
2845 "Display stack all processes", 0, KDB_REPEAT_NONE); 2788 "Backtrace all processes matching state flag", 0, KDB_REPEAT_NONE);
2846 kdb_register_repeat("btc", kdb_bt, "", 2789 kdb_register_repeat("btc", kdb_bt, "",
2847 "Backtrace current process on each cpu", 0, KDB_REPEAT_NONE); 2790 "Backtrace current process on each cpu", 0, KDB_REPEAT_NONE);
2848 kdb_register_repeat("btt", kdb_bt, "<vaddr>", 2791 kdb_register_repeat("btt", kdb_bt, "<vaddr>",
2849 "Backtrace process given its struct task address", 0, 2792 "Backtrace process given its struct task address", 0,
2850 KDB_REPEAT_NONE); 2793 KDB_REPEAT_NONE);
2851 kdb_register_repeat("ll", kdb_ll, "<first-element> <linkoffset> <cmd>",
2852 "Execute cmd for each element in linked list", 0, KDB_REPEAT_NONE);
2853 kdb_register_repeat("env", kdb_env, "", 2794 kdb_register_repeat("env", kdb_env, "",
2854 "Show environment variables", 0, KDB_REPEAT_NONE); 2795 "Show environment variables", 0, KDB_REPEAT_NONE);
2855 kdb_register_repeat("set", kdb_set, "", 2796 kdb_register_repeat("set", kdb_set, "",
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index 392ec6a25844..7afd3c8c41d5 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -19,7 +19,6 @@
19#define KDB_CMD_GO (-1001) 19#define KDB_CMD_GO (-1001)
20#define KDB_CMD_CPU (-1002) 20#define KDB_CMD_CPU (-1002)
21#define KDB_CMD_SS (-1003) 21#define KDB_CMD_SS (-1003)
22#define KDB_CMD_SSB (-1004)
23#define KDB_CMD_KGDB (-1005) 22#define KDB_CMD_KGDB (-1005)
24 23
25/* Internal debug flags */ 24/* Internal debug flags */
@@ -125,8 +124,6 @@ extern int kdb_state;
125 * kdb control */ 124 * kdb control */
126#define KDB_STATE_HOLD_CPU 0x00000010 /* Hold this cpu inside kdb */ 125#define KDB_STATE_HOLD_CPU 0x00000010 /* Hold this cpu inside kdb */
127#define KDB_STATE_DOING_SS 0x00000020 /* Doing ss command */ 126#define KDB_STATE_DOING_SS 0x00000020 /* Doing ss command */
128#define KDB_STATE_DOING_SSB 0x00000040 /* Doing ssb command,
129 * DOING_SS is also set */
130#define KDB_STATE_SSBPT 0x00000080 /* Install breakpoint 127#define KDB_STATE_SSBPT 0x00000080 /* Install breakpoint
131 * after one ss, independent of 128 * after one ss, independent of
132 * DOING_SS */ 129 * DOING_SS */
@@ -191,7 +188,6 @@ extern void kdb_bp_remove(void);
191typedef enum { 188typedef enum {
192 KDB_DB_BPT, /* Breakpoint */ 189 KDB_DB_BPT, /* Breakpoint */
193 KDB_DB_SS, /* Single-step trap */ 190 KDB_DB_SS, /* Single-step trap */
194 KDB_DB_SSB, /* Single step to branch */
195 KDB_DB_SSBPT, /* Single step over breakpoint */ 191 KDB_DB_SSBPT, /* Single step over breakpoint */
196 KDB_DB_NOBPT /* Spurious breakpoint */ 192 KDB_DB_NOBPT /* Spurious breakpoint */
197} kdb_dbtrap_t; 193} kdb_dbtrap_t;
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 418b3f7053aa..d473988c1d0b 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -106,6 +106,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
106 unsigned long long t2, t3; 106 unsigned long long t2, t3;
107 unsigned long flags; 107 unsigned long flags;
108 struct timespec ts; 108 struct timespec ts;
109 cputime_t utime, stime, stimescaled, utimescaled;
109 110
110 /* Though tsk->delays accessed later, early exit avoids 111 /* Though tsk->delays accessed later, early exit avoids
111 * unnecessary returning of other data 112 * unnecessary returning of other data
@@ -114,12 +115,14 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
114 goto done; 115 goto done;
115 116
116 tmp = (s64)d->cpu_run_real_total; 117 tmp = (s64)d->cpu_run_real_total;
117 cputime_to_timespec(tsk->utime + tsk->stime, &ts); 118 task_cputime(tsk, &utime, &stime);
119 cputime_to_timespec(utime + stime, &ts);
118 tmp += timespec_to_ns(&ts); 120 tmp += timespec_to_ns(&ts);
119 d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp; 121 d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp;
120 122
121 tmp = (s64)d->cpu_scaled_run_real_total; 123 tmp = (s64)d->cpu_scaled_run_real_total;
122 cputime_to_timespec(tsk->utimescaled + tsk->stimescaled, &ts); 124 task_cputime_scaled(tsk, &utimescaled, &stimescaled);
125 cputime_to_timespec(utimescaled + stimescaled, &ts);
123 tmp += timespec_to_ns(&ts); 126 tmp += timespec_to_ns(&ts);
124 d->cpu_scaled_run_real_total = 127 d->cpu_scaled_run_real_total =
125 (tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp; 128 (tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp;
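
The delayacct hunk reads CPU times through the task_cputime() / task_cputime_scaled() accessors instead of dereferencing tsk->utime and friends, so the underlying representation can change (for example, folding in time that has not been accounted into the raw fields yet) without touching every reader. A generic sketch of why such accessors help; this is a simplification, not the kernel helpers:

#include <stdio.h>

struct task_times {
        unsigned long long utime;       /* already accounted */
        unsigned long long stime;
        unsigned long long pending;     /* not folded into the fields yet */
};

static void task_cputime_get(const struct task_times *t,
                             unsigned long long *utime,
                             unsigned long long *stime)
{
        /* the accessor can include pending time; raw field reads cannot */
        *utime = t->utime + t->pending;
        *stime = t->stime;
}

int main(void)
{
        struct task_times t = { .utime = 100, .stime = 40, .pending = 7 };
        unsigned long long u, s;

        task_cputime_get(&t, &u, &s);
        printf("utime=%llu stime=%llu total=%llu\n", u, s, u + s);
        return 0;
}
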
diff --git a/kernel/events/core.c b/kernel/events/core.c
index dbccf83c134d..7e0962ed7f8a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -908,6 +908,15 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
908} 908}
909 909
910/* 910/*
911 * Initialize event state based on the perf_event_attr::disabled.
912 */
913static inline void perf_event__state_init(struct perf_event *event)
914{
915 event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
916 PERF_EVENT_STATE_INACTIVE;
917}
918
919/*
911 * Called at perf_event creation and when events are attached/detached from a 920 * Called at perf_event creation and when events are attached/detached from a
912 * group. 921 * group.
913 */ 922 */
@@ -3682,7 +3691,7 @@ unlock:
3682 3691
3683static int perf_fasync(int fd, struct file *filp, int on) 3692static int perf_fasync(int fd, struct file *filp, int on)
3684{ 3693{
3685 struct inode *inode = filp->f_path.dentry->d_inode; 3694 struct inode *inode = file_inode(filp);
3686 struct perf_event *event = filp->private_data; 3695 struct perf_event *event = filp->private_data;
3687 int retval; 3696 int retval;
3688 3697
@@ -4425,12 +4434,15 @@ static void perf_event_task_event(struct perf_task_event *task_event)
4425 if (ctxn < 0) 4434 if (ctxn < 0)
4426 goto next; 4435 goto next;
4427 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); 4436 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
4437 if (ctx)
4438 perf_event_task_ctx(ctx, task_event);
4428 } 4439 }
4429 if (ctx)
4430 perf_event_task_ctx(ctx, task_event);
4431next: 4440next:
4432 put_cpu_ptr(pmu->pmu_cpu_context); 4441 put_cpu_ptr(pmu->pmu_cpu_context);
4433 } 4442 }
4443 if (task_event->task_ctx)
4444 perf_event_task_ctx(task_event->task_ctx, task_event);
4445
4434 rcu_read_unlock(); 4446 rcu_read_unlock();
4435} 4447}
4436 4448
@@ -4725,7 +4737,8 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
4725 } else { 4737 } else {
4726 if (arch_vma_name(mmap_event->vma)) { 4738 if (arch_vma_name(mmap_event->vma)) {
4727 name = strncpy(tmp, arch_vma_name(mmap_event->vma), 4739 name = strncpy(tmp, arch_vma_name(mmap_event->vma),
4728 sizeof(tmp)); 4740 sizeof(tmp) - 1);
4741 tmp[sizeof(tmp) - 1] = '\0';
4729 goto got_name; 4742 goto got_name;
4730 } 4743 }
4731 4744
@@ -5117,7 +5130,6 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
5117{ 5130{
5118 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); 5131 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
5119 struct perf_event *event; 5132 struct perf_event *event;
5120 struct hlist_node *node;
5121 struct hlist_head *head; 5133 struct hlist_head *head;
5122 5134
5123 rcu_read_lock(); 5135 rcu_read_lock();
@@ -5125,7 +5137,7 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
5125 if (!head) 5137 if (!head)
5126 goto end; 5138 goto end;
5127 5139
5128 hlist_for_each_entry_rcu(event, node, head, hlist_entry) { 5140 hlist_for_each_entry_rcu(event, head, hlist_entry) {
5129 if (perf_swevent_match(event, type, event_id, data, regs)) 5141 if (perf_swevent_match(event, type, event_id, data, regs))
5130 perf_swevent_event(event, nr, data, regs); 5142 perf_swevent_event(event, nr, data, regs);
5131 } 5143 }
@@ -5410,7 +5422,6 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
5410{ 5422{
5411 struct perf_sample_data data; 5423 struct perf_sample_data data;
5412 struct perf_event *event; 5424 struct perf_event *event;
5413 struct hlist_node *node;
5414 5425
5415 struct perf_raw_record raw = { 5426 struct perf_raw_record raw = {
5416 .size = entry_size, 5427 .size = entry_size,
@@ -5420,7 +5431,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
5420 perf_sample_data_init(&data, addr, 0); 5431 perf_sample_data_init(&data, addr, 0);
5421 data.raw = &raw; 5432 data.raw = &raw;
5422 5433
5423 hlist_for_each_entry_rcu(event, node, head, hlist_entry) { 5434 hlist_for_each_entry_rcu(event, head, hlist_entry) {
5424 if (perf_tp_event_match(event, &data, regs)) 5435 if (perf_tp_event_match(event, &data, regs))
5425 perf_swevent_event(event, count, &data, regs); 5436 perf_swevent_event(event, count, &data, regs);
5426 } 5437 }
@@ -5640,6 +5651,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event)
5640 event->attr.sample_period = NSEC_PER_SEC / freq; 5651 event->attr.sample_period = NSEC_PER_SEC / freq;
5641 hwc->sample_period = event->attr.sample_period; 5652 hwc->sample_period = event->attr.sample_period;
5642 local64_set(&hwc->period_left, hwc->sample_period); 5653 local64_set(&hwc->period_left, hwc->sample_period);
5654 hwc->last_period = hwc->sample_period;
5643 event->attr.freq = 0; 5655 event->attr.freq = 0;
5644 } 5656 }
5645} 5657}
@@ -5956,13 +5968,9 @@ int perf_pmu_register(struct pmu *pmu, char *name, int type)
5956 pmu->name = name; 5968 pmu->name = name;
5957 5969
5958 if (type < 0) { 5970 if (type < 0) {
5959 int err = idr_pre_get(&pmu_idr, GFP_KERNEL); 5971 type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL);
5960 if (!err) 5972 if (type < 0) {
5961 goto free_pdc; 5973 ret = type;
5962
5963 err = idr_get_new_above(&pmu_idr, pmu, PERF_TYPE_MAX, &type);
5964 if (err) {
5965 ret = err;
5966 goto free_pdc; 5974 goto free_pdc;
5967 } 5975 }
5968 } 5976 }
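
perf_pmu_register() now obtains a dynamic PMU type from a single idr_alloc() call in place of the old idr_pre_get()/idr_get_new_above() two-step. A userspace analogue of the semantics it leans on, handing out the lowest free integer id at or above a starting value (toy code, not the kernel IDR):

#include <stdio.h>

#define MAX_IDS 64

static int used[MAX_IDS];

static int toy_id_alloc(int start)
{
        for (int id = start; id < MAX_IDS; id++) {
                if (!used[id]) {
                        used[id] = 1;
                        return id;
                }
        }
        return -1;      /* stands in for a negative errno */
}

int main(void)
{
        /* dynamic ids begin above the fixed built-in values; 6 is only
         * an example starting point */
        int first = toy_id_alloc(6);
        int second = toy_id_alloc(6);

        printf("allocated %d and %d\n", first, second);
        return 0;
}
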
@@ -5979,6 +5987,7 @@ skip_type:
5979 if (pmu->pmu_cpu_context) 5987 if (pmu->pmu_cpu_context)
5980 goto got_cpu_context; 5988 goto got_cpu_context;
5981 5989
5990 ret = -ENOMEM;
5982 pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context); 5991 pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
5983 if (!pmu->pmu_cpu_context) 5992 if (!pmu->pmu_cpu_context)
5984 goto free_dev; 5993 goto free_dev;
@@ -6155,18 +6164,21 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
6155 6164
6156 event->parent = parent_event; 6165 event->parent = parent_event;
6157 6166
6158 event->ns = get_pid_ns(current->nsproxy->pid_ns); 6167 event->ns = get_pid_ns(task_active_pid_ns(current));
6159 event->id = atomic64_inc_return(&perf_event_id); 6168 event->id = atomic64_inc_return(&perf_event_id);
6160 6169
6161 event->state = PERF_EVENT_STATE_INACTIVE; 6170 event->state = PERF_EVENT_STATE_INACTIVE;
6162 6171
6163 if (task) { 6172 if (task) {
6164 event->attach_state = PERF_ATTACH_TASK; 6173 event->attach_state = PERF_ATTACH_TASK;
6174
6175 if (attr->type == PERF_TYPE_TRACEPOINT)
6176 event->hw.tp_target = task;
6165#ifdef CONFIG_HAVE_HW_BREAKPOINT 6177#ifdef CONFIG_HAVE_HW_BREAKPOINT
6166 /* 6178 /*
6167 * hw_breakpoint is a bit difficult here.. 6179 * hw_breakpoint is a bit difficult here..
6168 */ 6180 */
6169 if (attr->type == PERF_TYPE_BREAKPOINT) 6181 else if (attr->type == PERF_TYPE_BREAKPOINT)
6170 event->hw.bp_target = task; 6182 event->hw.bp_target = task;
6171#endif 6183#endif
6172 } 6184 }
@@ -6179,8 +6191,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
6179 event->overflow_handler = overflow_handler; 6191 event->overflow_handler = overflow_handler;
6180 event->overflow_handler_context = context; 6192 event->overflow_handler_context = context;
6181 6193
6182 if (attr->disabled) 6194 perf_event__state_init(event);
6183 event->state = PERF_EVENT_STATE_OFF;
6184 6195
6185 pmu = NULL; 6196 pmu = NULL;
6186 6197
@@ -6609,9 +6620,17 @@ SYSCALL_DEFINE5(perf_event_open,
6609 6620
6610 mutex_lock(&gctx->mutex); 6621 mutex_lock(&gctx->mutex);
6611 perf_remove_from_context(group_leader); 6622 perf_remove_from_context(group_leader);
6623
6624 /*
6625 * Removing it from the context leaves the event disabled. What
6626 * we want here is the event in its initial startup state, ready
6627 * to be added into the new context.
6628 */
6629 perf_event__state_init(group_leader);
6612 list_for_each_entry(sibling, &group_leader->sibling_list, 6630 list_for_each_entry(sibling, &group_leader->sibling_list,
6613 group_entry) { 6631 group_entry) {
6614 perf_remove_from_context(sibling); 6632 perf_remove_from_context(sibling);
6633 perf_event__state_init(sibling);
6615 put_ctx(gctx); 6634 put_ctx(gctx);
6616 } 6635 }
6617 mutex_unlock(&gctx->mutex); 6636 mutex_unlock(&gctx->mutex);
@@ -7434,7 +7453,7 @@ unlock:
7434device_initcall(perf_event_sysfs_init); 7453device_initcall(perf_event_sysfs_init);
7435 7454
7436#ifdef CONFIG_CGROUP_PERF 7455#ifdef CONFIG_CGROUP_PERF
7437static struct cgroup_subsys_state *perf_cgroup_create(struct cgroup *cont) 7456static struct cgroup_subsys_state *perf_cgroup_css_alloc(struct cgroup *cont)
7438{ 7457{
7439 struct perf_cgroup *jc; 7458 struct perf_cgroup *jc;
7440 7459
@@ -7451,7 +7470,7 @@ static struct cgroup_subsys_state *perf_cgroup_create(struct cgroup *cont)
7451 return &jc->css; 7470 return &jc->css;
7452} 7471}
7453 7472
7454static void perf_cgroup_destroy(struct cgroup *cont) 7473static void perf_cgroup_css_free(struct cgroup *cont)
7455{ 7474{
7456 struct perf_cgroup *jc; 7475 struct perf_cgroup *jc;
7457 jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), 7476 jc = container_of(cgroup_subsys_state(cont, perf_subsys_id),
@@ -7492,8 +7511,8 @@ static void perf_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
7492struct cgroup_subsys perf_subsys = { 7511struct cgroup_subsys perf_subsys = {
7493 .name = "perf_event", 7512 .name = "perf_event",
7494 .subsys_id = perf_subsys_id, 7513 .subsys_id = perf_subsys_id,
7495 .create = perf_cgroup_create, 7514 .css_alloc = perf_cgroup_css_alloc,
7496 .destroy = perf_cgroup_destroy, 7515 .css_free = perf_cgroup_css_free,
7497 .exit = perf_cgroup_exit, 7516 .exit = perf_cgroup_exit,
7498 .attach = perf_cgroup_attach, 7517 .attach = perf_cgroup_attach,
7499 7518
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 9a7b487c6fe2..a64f8aeb5c1f 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -111,14 +111,16 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
111 * Count the number of breakpoints of the same type and same task. 111 * Count the number of breakpoints of the same type and same task.
112 * The given event must be not on the list. 112 * The given event must be not on the list.
113 */ 113 */
114static int task_bp_pinned(struct perf_event *bp, enum bp_type_idx type) 114static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type)
115{ 115{
116 struct task_struct *tsk = bp->hw.bp_target; 116 struct task_struct *tsk = bp->hw.bp_target;
117 struct perf_event *iter; 117 struct perf_event *iter;
118 int count = 0; 118 int count = 0;
119 119
120 list_for_each_entry(iter, &bp_task_head, hw.bp_list) { 120 list_for_each_entry(iter, &bp_task_head, hw.bp_list) {
121 if (iter->hw.bp_target == tsk && find_slot_idx(iter) == type) 121 if (iter->hw.bp_target == tsk &&
122 find_slot_idx(iter) == type &&
123 cpu == iter->cpu)
122 count += hw_breakpoint_weight(iter); 124 count += hw_breakpoint_weight(iter);
123 } 125 }
124 126
@@ -141,7 +143,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
141 if (!tsk) 143 if (!tsk)
142 slots->pinned += max_task_bp_pinned(cpu, type); 144 slots->pinned += max_task_bp_pinned(cpu, type);
143 else 145 else
144 slots->pinned += task_bp_pinned(bp, type); 146 slots->pinned += task_bp_pinned(cpu, bp, type);
145 slots->flexible = per_cpu(nr_bp_flexible[type], cpu); 147 slots->flexible = per_cpu(nr_bp_flexible[type], cpu);
146 148
147 return; 149 return;
@@ -154,7 +156,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
154 if (!tsk) 156 if (!tsk)
155 nr += max_task_bp_pinned(cpu, type); 157 nr += max_task_bp_pinned(cpu, type);
156 else 158 else
157 nr += task_bp_pinned(bp, type); 159 nr += task_bp_pinned(cpu, bp, type);
158 160
159 if (nr > slots->pinned) 161 if (nr > slots->pinned)
160 slots->pinned = nr; 162 slots->pinned = nr;
@@ -188,7 +190,7 @@ static void toggle_bp_task_slot(struct perf_event *bp, int cpu, bool enable,
188 int old_idx = 0; 190 int old_idx = 0;
189 int idx = 0; 191 int idx = 0;
190 192
191 old_count = task_bp_pinned(bp, type); 193 old_count = task_bp_pinned(cpu, bp, type);
192 old_idx = old_count - 1; 194 old_idx = old_count - 1;
193 idx = old_idx + weight; 195 idx = old_idx + weight;
194 196
@@ -674,7 +676,7 @@ int __init init_hw_breakpoint(void)
674 err_alloc: 676 err_alloc:
675 for_each_possible_cpu(err_cpu) { 677 for_each_possible_cpu(err_cpu) {
676 for (i = 0; i < TYPE_MAX; i++) 678 for (i = 0; i < TYPE_MAX; i++)
677 kfree(per_cpu(nr_task_bp_pinned[i], cpu)); 679 kfree(per_cpu(nr_task_bp_pinned[i], err_cpu));
678 if (err_cpu == cpu) 680 if (err_cpu == cpu)
679 break; 681 break;
680 } 682 }
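
The err_alloc fix above frees per-cpu buffers using the cleanup loop's own variable (err_cpu); indexing with the outer cpu freed the same CPU's buffers over and over and leaked the rest. The same shape of partial-failure cleanup in plain C, with hypothetical buffers:

#include <stdio.h>
#include <stdlib.h>

#define NSLOTS 8

int main(void)
{
        void *buf[NSLOTS] = { 0 };
        int i;

        for (i = 0; i < NSLOTS; i++) {
                buf[i] = malloc(128);
                if (!buf[i])
                        break;
        }

        if (i < NSLOTS) {
                int err_i;

                /*
                 * Cleanup must walk its own index; freeing buf[i] here in
                 * a loop would release one slot repeatedly and leak the
                 * others.
                 */
                for (err_i = 0; err_i < i; err_i++)
                        free(buf[err_i]);
                return 1;
        }

        printf("all %d slots allocated\n", NSLOTS);
        for (i = 0; i < NSLOTS; i++)
                free(buf[i]);
        return 0;
}
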
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index d56a64c99a8b..eb675c4d59df 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -16,7 +16,7 @@ struct ring_buffer {
16 int page_order; /* allocation order */ 16 int page_order; /* allocation order */
17#endif 17#endif
18 int nr_pages; /* nr of data pages */ 18 int nr_pages; /* nr of data pages */
19 int writable; /* are we writable */ 19 int overwrite; /* can overwrite itself */
20 20
21 atomic_t poll; /* POLL_ for wakeups */ 21 atomic_t poll; /* POLL_ for wakeups */
22 22
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 23cb34ff3973..97fddb09762b 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -18,12 +18,24 @@
18static bool perf_output_space(struct ring_buffer *rb, unsigned long tail, 18static bool perf_output_space(struct ring_buffer *rb, unsigned long tail,
19 unsigned long offset, unsigned long head) 19 unsigned long offset, unsigned long head)
20{ 20{
21 unsigned long mask; 21 unsigned long sz = perf_data_size(rb);
22 unsigned long mask = sz - 1;
22 23
23 if (!rb->writable) 24 /*
25 * check if user-writable
26 * overwrite : over-write its own tail
27 * !overwrite: buffer possibly drops events.
28 */
29 if (rb->overwrite)
24 return true; 30 return true;
25 31
26 mask = perf_data_size(rb) - 1; 32 /*
33 * verify that payload is not bigger than buffer
34 * otherwise masking logic may fail to detect
35 * the "not enough space" condition
36 */
37 if ((head - offset) > sz)
38 return false;
27 39
28 offset = (offset - tail) & mask; 40 offset = (offset - tail) & mask;
29 head = (head - tail) & mask; 41 head = (head - tail) & mask;
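
The added (head - offset) > sz test matters because the masking right below computes distances modulo the buffer size: once a single request is larger than the whole buffer, the masked distance wraps around and can look harmless. A quick arithmetic illustration with made-up numbers:

#include <stdio.h>

int main(void)
{
        unsigned long size = 4096, mask = size - 1;
        unsigned long offset = 100;
        unsigned long head = offset + size + 50;  /* bigger than the buffer */

        /* after masking, the distance "looks" like only 50 bytes... */
        printf("masked distance: %lu\n", (head - offset) & mask);
        /* ...so the size check has to happen before the masking */
        printf("actual distance: %lu\n", head - offset);
        return 0;
}
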
@@ -212,7 +224,9 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
212 rb->watermark = max_size / 2; 224 rb->watermark = max_size / 2;
213 225
214 if (flags & RING_BUFFER_WRITABLE) 226 if (flags & RING_BUFFER_WRITABLE)
215 rb->writable = 1; 227 rb->overwrite = 0;
228 else
229 rb->overwrite = 1;
216 230
217 atomic_set(&rb->refcount, 1); 231 atomic_set(&rb->refcount, 1);
218 232
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 5cc4e7e42e68..a567c8c7ef31 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -27,12 +27,14 @@
27#include <linux/pagemap.h> /* read_mapping_page */ 27#include <linux/pagemap.h> /* read_mapping_page */
28#include <linux/slab.h> 28#include <linux/slab.h>
29#include <linux/sched.h> 29#include <linux/sched.h>
30#include <linux/export.h>
30#include <linux/rmap.h> /* anon_vma_prepare */ 31#include <linux/rmap.h> /* anon_vma_prepare */
31#include <linux/mmu_notifier.h> /* set_pte_at_notify */ 32#include <linux/mmu_notifier.h> /* set_pte_at_notify */
32#include <linux/swap.h> /* try_to_free_swap */ 33#include <linux/swap.h> /* try_to_free_swap */
33#include <linux/ptrace.h> /* user_enable_single_step */ 34#include <linux/ptrace.h> /* user_enable_single_step */
34#include <linux/kdebug.h> /* notifier mechanism */ 35#include <linux/kdebug.h> /* notifier mechanism */
35#include "../../mm/internal.h" /* munlock_vma_page */ 36#include "../../mm/internal.h" /* munlock_vma_page */
37#include <linux/percpu-rwsem.h>
36 38
37#include <linux/uprobes.h> 39#include <linux/uprobes.h>
38 40
@@ -40,56 +42,31 @@
40#define MAX_UPROBE_XOL_SLOTS UINSNS_PER_PAGE 42#define MAX_UPROBE_XOL_SLOTS UINSNS_PER_PAGE
41 43
42static struct rb_root uprobes_tree = RB_ROOT; 44static struct rb_root uprobes_tree = RB_ROOT;
43
44static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */
45
46#define UPROBES_HASH_SZ 13
47
48/* 45/*
49 * We need separate register/unregister and mmap/munmap lock hashes because 46 * allows us to skip the uprobe_mmap if there are no uprobe events active
50 * of mmap_sem nesting. 47 * at this time. Probably a fine grained per inode count is better?
51 *
52 * uprobe_register() needs to install probes on (potentially) all processes
53 * and thus needs to acquire multiple mmap_sems (consequtively, not
54 * concurrently), whereas uprobe_mmap() is called while holding mmap_sem
55 * for the particular process doing the mmap.
56 *
57 * uprobe_register()->register_for_each_vma() needs to drop/acquire mmap_sem
58 * because of lock order against i_mmap_mutex. This means there's a hole in
59 * the register vma iteration where a mmap() can happen.
60 *
61 * Thus uprobe_register() can race with uprobe_mmap() and we can try and
62 * install a probe where one is already installed.
63 */ 48 */
49#define no_uprobe_events() RB_EMPTY_ROOT(&uprobes_tree)
64 50
65/* serialize (un)register */ 51static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */
66static struct mutex uprobes_mutex[UPROBES_HASH_SZ];
67
68#define uprobes_hash(v) (&uprobes_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
69 52
53#define UPROBES_HASH_SZ 13
70/* serialize uprobe->pending_list */ 54/* serialize uprobe->pending_list */
71static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; 55static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
72#define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ]) 56#define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
73 57
74/* 58static struct percpu_rw_semaphore dup_mmap_sem;
75 * uprobe_events allows us to skip the uprobe_mmap if there are no uprobe
76 * events active at this time. Probably a fine grained per inode count is
77 * better?
78 */
79static atomic_t uprobe_events = ATOMIC_INIT(0);
80 59
81/* Have a copy of original instruction */ 60/* Have a copy of original instruction */
82#define UPROBE_COPY_INSN 0 61#define UPROBE_COPY_INSN 0
83/* Dont run handlers when first register/ last unregister in progress*/
84#define UPROBE_RUN_HANDLER 1
85/* Can skip singlestep */ 62/* Can skip singlestep */
86#define UPROBE_SKIP_SSTEP 2 63#define UPROBE_SKIP_SSTEP 1
87 64
88struct uprobe { 65struct uprobe {
89 struct rb_node rb_node; /* node in the rb tree */ 66 struct rb_node rb_node; /* node in the rb tree */
90 atomic_t ref; 67 atomic_t ref;
68 struct rw_semaphore register_rwsem;
91 struct rw_semaphore consumer_rwsem; 69 struct rw_semaphore consumer_rwsem;
92 struct mutex copy_mutex; /* TODO: kill me and UPROBE_COPY_INSN */
93 struct list_head pending_list; 70 struct list_head pending_list;
94 struct uprobe_consumer *consumers; 71 struct uprobe_consumer *consumers;
95 struct inode *inode; /* Also hold a ref to inode */ 72 struct inode *inode; /* Also hold a ref to inode */
@@ -427,9 +404,6 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe)
427 u = __insert_uprobe(uprobe); 404 u = __insert_uprobe(uprobe);
428 spin_unlock(&uprobes_treelock); 405 spin_unlock(&uprobes_treelock);
429 406
430 /* For now assume that the instruction need not be single-stepped */
431 __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags);
432
433 return u; 407 return u;
434} 408}
435 409
@@ -449,8 +423,10 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
449 423
450 uprobe->inode = igrab(inode); 424 uprobe->inode = igrab(inode);
451 uprobe->offset = offset; 425 uprobe->offset = offset;
426 init_rwsem(&uprobe->register_rwsem);
452 init_rwsem(&uprobe->consumer_rwsem); 427 init_rwsem(&uprobe->consumer_rwsem);
453 mutex_init(&uprobe->copy_mutex); 428 /* For now assume that the instruction need not be single-stepped */
429 __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags);
454 430
455 /* add to uprobes_tree, sorted on inode:offset */ 431 /* add to uprobes_tree, sorted on inode:offset */
456 cur_uprobe = insert_uprobe(uprobe); 432 cur_uprobe = insert_uprobe(uprobe);
@@ -460,38 +436,17 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
460 kfree(uprobe); 436 kfree(uprobe);
461 uprobe = cur_uprobe; 437 uprobe = cur_uprobe;
462 iput(inode); 438 iput(inode);
463 } else {
464 atomic_inc(&uprobe_events);
465 } 439 }
466 440
467 return uprobe; 441 return uprobe;
468} 442}
469 443
470static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) 444static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
471{
472 struct uprobe_consumer *uc;
473
474 if (!test_bit(UPROBE_RUN_HANDLER, &uprobe->flags))
475 return;
476
477 down_read(&uprobe->consumer_rwsem);
478 for (uc = uprobe->consumers; uc; uc = uc->next) {
479 if (!uc->filter || uc->filter(uc, current))
480 uc->handler(uc, regs);
481 }
482 up_read(&uprobe->consumer_rwsem);
483}
484
485/* Returns the previous consumer */
486static struct uprobe_consumer *
487consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
488{ 445{
489 down_write(&uprobe->consumer_rwsem); 446 down_write(&uprobe->consumer_rwsem);
490 uc->next = uprobe->consumers; 447 uc->next = uprobe->consumers;
491 uprobe->consumers = uc; 448 uprobe->consumers = uc;
492 up_write(&uprobe->consumer_rwsem); 449 up_write(&uprobe->consumer_rwsem);
493
494 return uc->next;
495} 450}
496 451
497/* 452/*
@@ -585,7 +540,8 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
585 if (test_bit(UPROBE_COPY_INSN, &uprobe->flags)) 540 if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
586 return ret; 541 return ret;
587 542
588 mutex_lock(&uprobe->copy_mutex); 543 /* TODO: move this into _register, until then we abuse this sem. */
544 down_write(&uprobe->consumer_rwsem);
589 if (test_bit(UPROBE_COPY_INSN, &uprobe->flags)) 545 if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
590 goto out; 546 goto out;
591 547
@@ -609,7 +565,30 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
609 set_bit(UPROBE_COPY_INSN, &uprobe->flags); 565 set_bit(UPROBE_COPY_INSN, &uprobe->flags);
610 566
611 out: 567 out:
612 mutex_unlock(&uprobe->copy_mutex); 568 up_write(&uprobe->consumer_rwsem);
569
570 return ret;
571}
572
573static inline bool consumer_filter(struct uprobe_consumer *uc,
574 enum uprobe_filter_ctx ctx, struct mm_struct *mm)
575{
576 return !uc->filter || uc->filter(uc, ctx, mm);
577}
578
579static bool filter_chain(struct uprobe *uprobe,
580 enum uprobe_filter_ctx ctx, struct mm_struct *mm)
581{
582 struct uprobe_consumer *uc;
583 bool ret = false;
584
585 down_read(&uprobe->consumer_rwsem);
586 for (uc = uprobe->consumers; uc; uc = uc->next) {
587 ret = consumer_filter(uc, ctx, mm);
588 if (ret)
589 break;
590 }
591 up_read(&uprobe->consumer_rwsem);
613 592
614 return ret; 593 return ret;
615} 594}
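As an illustration of the new filter contract used by consumer_filter()/filter_chain() above, here is a minimal consumer sketch written against the signatures visible in this hunk; the demo_* names and the demo_watched_mm variable are invented for the example and are not part of this patch.

static struct mm_struct *demo_watched_mm;	/* assumed to be set up elsewhere */

/* New-style filter: consulted per mm; returning false lets
 * consumer_filter()/filter_chain() skip or strip the breakpoint. */
static bool demo_filter(struct uprobe_consumer *self,
			enum uprobe_filter_ctx ctx, struct mm_struct *mm)
{
	return mm == demo_watched_mm;
}

static int demo_handler(struct uprobe_consumer *self, struct pt_regs *regs)
{
	pr_info("uprobe hit at %lx\n", instruction_pointer(regs));
	return 0;		/* 0: keep the breakpoint installed */
}

static struct uprobe_consumer demo_consumer = {
	.handler = demo_handler,
	.filter  = demo_filter,
};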
@@ -621,16 +600,6 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
621 bool first_uprobe; 600 bool first_uprobe;
622 int ret; 601 int ret;
623 602
624 /*
625 * If probe is being deleted, unregister thread could be done with
626 * the vma-rmap-walk through. Adding a probe now can be fatal since
627 * nobody will be able to cleanup. Also we could be from fork or
628 * mremap path, where the probe might have already been inserted.
629 * Hence behave as if probe already existed.
630 */
631 if (!uprobe->consumers)
632 return 0;
633
634 ret = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr); 603 ret = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr);
635 if (ret) 604 if (ret)
636 return ret; 605 return ret;
@@ -655,14 +624,14 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
655static int 624static int
656remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr) 625remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr)
657{ 626{
658 /* can happen if uprobe_register() fails */
659 if (!test_bit(MMF_HAS_UPROBES, &mm->flags))
660 return 0;
661
662 set_bit(MMF_RECALC_UPROBES, &mm->flags); 627 set_bit(MMF_RECALC_UPROBES, &mm->flags);
663 return set_orig_insn(&uprobe->arch, mm, vaddr); 628 return set_orig_insn(&uprobe->arch, mm, vaddr);
664} 629}
665 630
631static inline bool uprobe_is_active(struct uprobe *uprobe)
632{
633 return !RB_EMPTY_NODE(&uprobe->rb_node);
634}
666/* 635/*
667 * There could be threads that have already hit the breakpoint. They 636 * There could be threads that have already hit the breakpoint. They
668 * will recheck the current insn and restart if find_uprobe() fails. 637 * will recheck the current insn and restart if find_uprobe() fails.
@@ -670,12 +639,15 @@ remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vad
670 */ 639 */
671static void delete_uprobe(struct uprobe *uprobe) 640static void delete_uprobe(struct uprobe *uprobe)
672{ 641{
642 if (WARN_ON(!uprobe_is_active(uprobe)))
643 return;
644
673 spin_lock(&uprobes_treelock); 645 spin_lock(&uprobes_treelock);
674 rb_erase(&uprobe->rb_node, &uprobes_tree); 646 rb_erase(&uprobe->rb_node, &uprobes_tree);
675 spin_unlock(&uprobes_treelock); 647 spin_unlock(&uprobes_treelock);
648 RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */
676 iput(uprobe->inode); 649 iput(uprobe->inode);
677 put_uprobe(uprobe); 650 put_uprobe(uprobe);
678 atomic_dec(&uprobe_events);
679} 651}
680 652
681struct map_info { 653struct map_info {
@@ -761,15 +733,20 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
761 return curr; 733 return curr;
762} 734}
763 735
764static int register_for_each_vma(struct uprobe *uprobe, bool is_register) 736static int
737register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
765{ 738{
739 bool is_register = !!new;
766 struct map_info *info; 740 struct map_info *info;
767 int err = 0; 741 int err = 0;
768 742
743 percpu_down_write(&dup_mmap_sem);
769 info = build_map_info(uprobe->inode->i_mapping, 744 info = build_map_info(uprobe->inode->i_mapping,
770 uprobe->offset, is_register); 745 uprobe->offset, is_register);
771 if (IS_ERR(info)) 746 if (IS_ERR(info)) {
772 return PTR_ERR(info); 747 err = PTR_ERR(info);
748 goto out;
749 }
773 750
774 while (info) { 751 while (info) {
775 struct mm_struct *mm = info->mm; 752 struct mm_struct *mm = info->mm;
@@ -788,10 +765,16 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
788 vaddr_to_offset(vma, info->vaddr) != uprobe->offset) 765 vaddr_to_offset(vma, info->vaddr) != uprobe->offset)
789 goto unlock; 766 goto unlock;
790 767
791 if (is_register) 768 if (is_register) {
792 err = install_breakpoint(uprobe, mm, vma, info->vaddr); 769 /* consult only the "caller", new consumer. */
793 else 770 if (consumer_filter(new,
794 err |= remove_breakpoint(uprobe, mm, info->vaddr); 771 UPROBE_FILTER_REGISTER, mm))
772 err = install_breakpoint(uprobe, mm, vma, info->vaddr);
773 } else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) {
774 if (!filter_chain(uprobe,
775 UPROBE_FILTER_UNREGISTER, mm))
776 err |= remove_breakpoint(uprobe, mm, info->vaddr);
777 }
795 778
796 unlock: 779 unlock:
797 up_write(&mm->mmap_sem); 780 up_write(&mm->mmap_sem);
@@ -799,21 +782,28 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
799 mmput(mm); 782 mmput(mm);
800 info = free_map_info(info); 783 info = free_map_info(info);
801 } 784 }
802 785 out:
786 percpu_up_write(&dup_mmap_sem);
803 return err; 787 return err;
804} 788}
805 789
806static int __uprobe_register(struct uprobe *uprobe) 790static int __uprobe_register(struct uprobe *uprobe, struct uprobe_consumer *uc)
807{ 791{
808 return register_for_each_vma(uprobe, true); 792 consumer_add(uprobe, uc);
793 return register_for_each_vma(uprobe, uc);
809} 794}
810 795
811static void __uprobe_unregister(struct uprobe *uprobe) 796static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc)
812{ 797{
813 if (!register_for_each_vma(uprobe, false)) 798 int err;
814 delete_uprobe(uprobe); 799
800 if (!consumer_del(uprobe, uc)) /* WARN? */
801 return;
815 802
803 err = register_for_each_vma(uprobe, NULL);
816 /* TODO: can't unregister? schedule a worker thread */ 804
805 if (!uprobe->consumers && !err)
806 delete_uprobe(uprobe);
817} 807}
818 808
819/* 809/*
@@ -838,31 +828,59 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *
838 struct uprobe *uprobe; 828 struct uprobe *uprobe;
839 int ret; 829 int ret;
840 830
841 if (!inode || !uc || uc->next) 831 /* Racy, just to catch the obvious mistakes */
842 return -EINVAL;
843
844 if (offset > i_size_read(inode)) 832 if (offset > i_size_read(inode))
845 return -EINVAL; 833 return -EINVAL;
846 834
847 ret = 0; 835 retry:
848 mutex_lock(uprobes_hash(inode));
849 uprobe = alloc_uprobe(inode, offset); 836 uprobe = alloc_uprobe(inode, offset);
850 837 if (!uprobe)
851 if (!uprobe) { 838 return -ENOMEM;
852 ret = -ENOMEM; 839 /*
853 } else if (!consumer_add(uprobe, uc)) { 840 * We can race with uprobe_unregister()->delete_uprobe().
854 ret = __uprobe_register(uprobe); 841 * Check uprobe_is_active() and retry if it is false.
855 if (ret) { 842 */
856 uprobe->consumers = NULL; 843 down_write(&uprobe->register_rwsem);
857 __uprobe_unregister(uprobe); 844 ret = -EAGAIN;
858 } else { 845 if (likely(uprobe_is_active(uprobe))) {
859 set_bit(UPROBE_RUN_HANDLER, &uprobe->flags); 846 ret = __uprobe_register(uprobe, uc);
860 } 847 if (ret)
848 __uprobe_unregister(uprobe, uc);
861 } 849 }
850 up_write(&uprobe->register_rwsem);
851 put_uprobe(uprobe);
862 852
863 mutex_unlock(uprobes_hash(inode)); 853 if (unlikely(ret == -EAGAIN))
864 if (uprobe) 854 goto retry;
865 put_uprobe(uprobe); 855 return ret;
856}
857EXPORT_SYMBOL_GPL(uprobe_register);
858
859/*
860 * uprobe_apply - add or remove the breakpoints for an already registered probe.
861 * @inode: the file in which the probe resides.
862 * @offset: offset from the start of the file.
863 * @uc: consumer which wants to add more or remove some breakpoints
864 * @add: add or remove the breakpoints
865 */
866int uprobe_apply(struct inode *inode, loff_t offset,
867 struct uprobe_consumer *uc, bool add)
868{
869 struct uprobe *uprobe;
870 struct uprobe_consumer *con;
871 int ret = -ENOENT;
872
873 uprobe = find_uprobe(inode, offset);
874 if (!uprobe)
875 return ret;
876
877 down_write(&uprobe->register_rwsem);
878 for (con = uprobe->consumers; con && con != uc ; con = con->next)
879 ;
880 if (con)
881 ret = register_for_each_vma(uprobe, add ? uc : NULL);
882 up_write(&uprobe->register_rwsem);
883 put_uprobe(uprobe);
866 884
867 return ret; 885 return ret;
868} 886}
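A rough usage sketch for the new uprobe_apply() (assuming a consumer such as the hypothetical demo_consumer above has already been registered with uprobe_register()): it toggles that consumer's breakpoints without a full unregister/register cycle.

static int demo_toggle(struct inode *inode, loff_t offset, bool on)
{
	/* on == true consults only demo_consumer's filter;
	 * on == false consults the whole remaining filter chain. */
	return uprobe_apply(inode, offset, &demo_consumer, on);
}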
@@ -877,25 +895,42 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume
877{ 895{
878 struct uprobe *uprobe; 896 struct uprobe *uprobe;
879 897
880 if (!inode || !uc)
881 return;
882
883 uprobe = find_uprobe(inode, offset); 898 uprobe = find_uprobe(inode, offset);
884 if (!uprobe) 899 if (!uprobe)
885 return; 900 return;
886 901
887 mutex_lock(uprobes_hash(inode)); 902 down_write(&uprobe->register_rwsem);
903 __uprobe_unregister(uprobe, uc);
904 up_write(&uprobe->register_rwsem);
905 put_uprobe(uprobe);
906}
907EXPORT_SYMBOL_GPL(uprobe_unregister);
888 908
889 if (consumer_del(uprobe, uc)) { 909static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
890 if (!uprobe->consumers) { 910{
891 __uprobe_unregister(uprobe); 911 struct vm_area_struct *vma;
892 clear_bit(UPROBE_RUN_HANDLER, &uprobe->flags); 912 int err = 0;
893 } 913
914 down_read(&mm->mmap_sem);
915 for (vma = mm->mmap; vma; vma = vma->vm_next) {
916 unsigned long vaddr;
917 loff_t offset;
918
919 if (!valid_vma(vma, false) ||
920 vma->vm_file->f_mapping->host != uprobe->inode)
921 continue;
922
923 offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
924 if (uprobe->offset < offset ||
925 uprobe->offset >= offset + vma->vm_end - vma->vm_start)
926 continue;
927
928 vaddr = offset_to_vaddr(vma, uprobe->offset);
929 err |= remove_breakpoint(uprobe, mm, vaddr);
894 } 930 }
931 up_read(&mm->mmap_sem);
895 932
896 mutex_unlock(uprobes_hash(inode)); 933 return err;
897 if (uprobe)
898 put_uprobe(uprobe);
899} 934}
900 935
901static struct rb_node * 936static struct rb_node *
@@ -972,7 +1007,7 @@ int uprobe_mmap(struct vm_area_struct *vma)
972 struct uprobe *uprobe, *u; 1007 struct uprobe *uprobe, *u;
973 struct inode *inode; 1008 struct inode *inode;
974 1009
975 if (!atomic_read(&uprobe_events) || !valid_vma(vma, true)) 1010 if (no_uprobe_events() || !valid_vma(vma, true))
976 return 0; 1011 return 0;
977 1012
978 inode = vma->vm_file->f_mapping->host; 1013 inode = vma->vm_file->f_mapping->host;
@@ -981,9 +1016,14 @@ int uprobe_mmap(struct vm_area_struct *vma)
981 1016
982 mutex_lock(uprobes_mmap_hash(inode)); 1017 mutex_lock(uprobes_mmap_hash(inode));
983 build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list); 1018 build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list);
984 1019 /*
1020 * We can race with uprobe_unregister(), this uprobe can be already
1021 * removed. But in this case filter_chain() must return false, all
1022 * consumers have gone away.
1023 */
985 list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { 1024 list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
986 if (!fatal_signal_pending(current)) { 1025 if (!fatal_signal_pending(current) &&
1026 filter_chain(uprobe, UPROBE_FILTER_MMAP, vma->vm_mm)) {
987 unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset); 1027 unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset);
988 install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); 1028 install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
989 } 1029 }
@@ -1018,7 +1058,7 @@ vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long e
1018 */ 1058 */
1019void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end) 1059void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end)
1020{ 1060{
1021 if (!atomic_read(&uprobe_events) || !valid_vma(vma, false)) 1061 if (no_uprobe_events() || !valid_vma(vma, false))
1022 return; 1062 return;
1023 1063
1024 if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */ 1064 if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */
@@ -1035,22 +1075,14 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon
1035/* Slot allocation for XOL */ 1075/* Slot allocation for XOL */
1036static int xol_add_vma(struct xol_area *area) 1076static int xol_add_vma(struct xol_area *area)
1037{ 1077{
1038 struct mm_struct *mm; 1078 struct mm_struct *mm = current->mm;
1039 int ret; 1079 int ret = -EALREADY;
1040
1041 area->page = alloc_page(GFP_HIGHUSER);
1042 if (!area->page)
1043 return -ENOMEM;
1044
1045 ret = -EALREADY;
1046 mm = current->mm;
1047 1080
1048 down_write(&mm->mmap_sem); 1081 down_write(&mm->mmap_sem);
1049 if (mm->uprobes_state.xol_area) 1082 if (mm->uprobes_state.xol_area)
1050 goto fail; 1083 goto fail;
1051 1084
1052 ret = -ENOMEM; 1085 ret = -ENOMEM;
1053
1054 /* Try to map as high as possible, this is only a hint. */ 1086 /* Try to map as high as possible, this is only a hint. */
1055 area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0); 1087 area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0);
1056 if (area->vaddr & ~PAGE_MASK) { 1088 if (area->vaddr & ~PAGE_MASK) {
@@ -1066,54 +1098,53 @@ static int xol_add_vma(struct xol_area *area)
1066 smp_wmb(); /* pairs with get_xol_area() */ 1098 smp_wmb(); /* pairs with get_xol_area() */
1067 mm->uprobes_state.xol_area = area; 1099 mm->uprobes_state.xol_area = area;
1068 ret = 0; 1100 ret = 0;
1069 1101 fail:
1070fail:
1071 up_write(&mm->mmap_sem); 1102 up_write(&mm->mmap_sem);
1072 if (ret)
1073 __free_page(area->page);
1074 1103
1075 return ret; 1104 return ret;
1076} 1105}
1077 1106
1078static struct xol_area *get_xol_area(struct mm_struct *mm)
1079{
1080 struct xol_area *area;
1081
1082 area = mm->uprobes_state.xol_area;
1083 smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */
1084
1085 return area;
1086}
1087
1088/* 1107/*
1089 * xol_alloc_area - Allocate process's xol_area. 1108 * get_xol_area - Allocate process's xol_area if necessary.
1090 * This area will be used for storing instructions for execution out of 1109 * This area will be used for storing instructions for execution out of line.
1091 * line.
1092 * 1110 *
1093 * Returns the allocated area or NULL. 1111 * Returns the allocated area or NULL.
1094 */ 1112 */
1095static struct xol_area *xol_alloc_area(void) 1113static struct xol_area *get_xol_area(void)
1096{ 1114{
1115 struct mm_struct *mm = current->mm;
1097 struct xol_area *area; 1116 struct xol_area *area;
1098 1117
1118 area = mm->uprobes_state.xol_area;
1119 if (area)
1120 goto ret;
1121
1099 area = kzalloc(sizeof(*area), GFP_KERNEL); 1122 area = kzalloc(sizeof(*area), GFP_KERNEL);
1100 if (unlikely(!area)) 1123 if (unlikely(!area))
1101 return NULL; 1124 goto out;
1102 1125
1103 area->bitmap = kzalloc(BITS_TO_LONGS(UINSNS_PER_PAGE) * sizeof(long), GFP_KERNEL); 1126 area->bitmap = kzalloc(BITS_TO_LONGS(UINSNS_PER_PAGE) * sizeof(long), GFP_KERNEL);
1104
1105 if (!area->bitmap) 1127 if (!area->bitmap)
1106 goto fail; 1128 goto free_area;
1129
1130 area->page = alloc_page(GFP_HIGHUSER);
1131 if (!area->page)
1132 goto free_bitmap;
1107 1133
1108 init_waitqueue_head(&area->wq); 1134 init_waitqueue_head(&area->wq);
1109 if (!xol_add_vma(area)) 1135 if (!xol_add_vma(area))
1110 return area; 1136 return area;
1111 1137
1112fail: 1138 __free_page(area->page);
1139 free_bitmap:
1113 kfree(area->bitmap); 1140 kfree(area->bitmap);
1141 free_area:
1114 kfree(area); 1142 kfree(area);
1115 1143 out:
1116 return get_xol_area(current->mm); 1144 area = mm->uprobes_state.xol_area;
1145 ret:
1146 smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */
1147 return area;
1117} 1148}
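The smp_wmb()/smp_read_barrier_depends() pairing in get_xol_area()/xol_add_vma() follows the usual initialize-then-publish pattern; a stripped-down sketch of that pattern (the demo_* names stand in for mm->uprobes_state.xol_area and are not part of this patch):

struct demo_area {
	unsigned long *bitmap;
	struct page *page;
};

static struct demo_area *demo_area_ptr;

static void demo_publish(struct demo_area *area)
{
	/* make the fully initialized object visible before the pointer */
	smp_wmb();
	demo_area_ptr = area;
}

static struct demo_area *demo_lookup(void)
{
	struct demo_area *area = demo_area_ptr;

	smp_read_barrier_depends();	/* pairs with smp_wmb() in demo_publish() */
	return area;
}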
1118 1149
1119/* 1150/*
@@ -1131,6 +1162,16 @@ void uprobe_clear_state(struct mm_struct *mm)
1131 kfree(area); 1162 kfree(area);
1132} 1163}
1133 1164
1165void uprobe_start_dup_mmap(void)
1166{
1167 percpu_down_read(&dup_mmap_sem);
1168}
1169
1170void uprobe_end_dup_mmap(void)
1171{
1172 percpu_up_read(&dup_mmap_sem);
1173}
1174
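The intended pairing of the new dup_mmap_sem, sketched with invented demo_* helpers: fork's dup_mmap() brackets its copy with the read side while register_for_each_vma() takes the write side, so breakpoint updates cannot race with an mm being duplicated.

static void demo_fork_side(void)
{
	uprobe_start_dup_mmap();	/* percpu_down_read(&dup_mmap_sem) */
	/* ... duplicate the vmas, uprobe_dup_mmap(), ... */
	uprobe_end_dup_mmap();		/* percpu_up_read(&dup_mmap_sem) */
}

static void demo_register_side(void)
{
	percpu_down_write(&dup_mmap_sem);
	/* ... build_map_info() and install/remove breakpoints ... */
	percpu_up_write(&dup_mmap_sem);
}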
1134void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm) 1175void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm)
1135{ 1176{
1136 newmm->uprobes_state.xol_area = NULL; 1177 newmm->uprobes_state.xol_area = NULL;
@@ -1169,38 +1210,36 @@ static unsigned long xol_take_insn_slot(struct xol_area *area)
1169} 1210}
1170 1211
1171/* 1212/*
1172 * xol_get_insn_slot - If was not allocated a slot, then 1213 * xol_get_insn_slot - allocate a slot for xol.
1173 * allocate a slot.
1174 * Returns the allocated slot address or 0. 1214 * Returns the allocated slot address or 0.
1175 */ 1215 */
1176static unsigned long xol_get_insn_slot(struct uprobe *uprobe, unsigned long slot_addr) 1216static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
1177{ 1217{
1178 struct xol_area *area; 1218 struct xol_area *area;
1179 unsigned long offset; 1219 unsigned long offset;
1220 unsigned long xol_vaddr;
1180 void *vaddr; 1221 void *vaddr;
1181 1222
1182 area = get_xol_area(current->mm); 1223 area = get_xol_area();
1183 if (!area) { 1224 if (!area)
1184 area = xol_alloc_area(); 1225 return 0;
1185 if (!area)
1186 return 0;
1187 }
1188 current->utask->xol_vaddr = xol_take_insn_slot(area);
1189 1226
1190 /* 1227 xol_vaddr = xol_take_insn_slot(area);
1191 * Initialize the slot if xol_vaddr points to valid 1228 if (unlikely(!xol_vaddr))
1192 * instruction slot.
1193 */
1194 if (unlikely(!current->utask->xol_vaddr))
1195 return 0; 1229 return 0;
1196 1230
1197 current->utask->vaddr = slot_addr; 1231 /* Initialize the slot */
1198 offset = current->utask->xol_vaddr & ~PAGE_MASK; 1232 offset = xol_vaddr & ~PAGE_MASK;
1199 vaddr = kmap_atomic(area->page); 1233 vaddr = kmap_atomic(area->page);
1200 memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES); 1234 memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES);
1201 kunmap_atomic(vaddr); 1235 kunmap_atomic(vaddr);
1236 /*
1237 * We probably need flush_icache_user_range() but it needs vma.
1238 * This should work on supported architectures too.
1239 */
1240 flush_dcache_page(area->page);
1202 1241
1203 return current->utask->xol_vaddr; 1242 return xol_vaddr;
1204} 1243}
1205 1244
1206/* 1245/*
@@ -1218,8 +1257,7 @@ static void xol_free_insn_slot(struct task_struct *tsk)
1218 return; 1257 return;
1219 1258
1220 slot_addr = tsk->utask->xol_vaddr; 1259 slot_addr = tsk->utask->xol_vaddr;
1221 1260 if (unlikely(!slot_addr))
1222 if (unlikely(!slot_addr || IS_ERR_VALUE(slot_addr)))
1223 return; 1261 return;
1224 1262
1225 area = tsk->mm->uprobes_state.xol_area; 1263 area = tsk->mm->uprobes_state.xol_area;
@@ -1281,33 +1319,48 @@ void uprobe_copy_process(struct task_struct *t)
1281} 1319}
1282 1320
1283/* 1321/*
1284 * Allocate a uprobe_task object for the task. 1322 * Allocate a uprobe_task object for the task if necessary.
1285 * Called when the thread hits a breakpoint for the first time. 1323 * Called when the thread hits a breakpoint.
1286 * 1324 *
1287 * Returns: 1325 * Returns:
1288 * - pointer to new uprobe_task on success 1326 * - pointer to new uprobe_task on success
1289 * - NULL otherwise 1327 * - NULL otherwise
1290 */ 1328 */
1291static struct uprobe_task *add_utask(void) 1329static struct uprobe_task *get_utask(void)
1292{ 1330{
1293 struct uprobe_task *utask; 1331 if (!current->utask)
1294 1332 current->utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
1295 utask = kzalloc(sizeof *utask, GFP_KERNEL); 1333 return current->utask;
1296 if (unlikely(!utask))
1297 return NULL;
1298
1299 current->utask = utask;
1300 return utask;
1301} 1334}
1302 1335
1303/* Prepare to single-step probed instruction out of line. */ 1336/* Prepare to single-step probed instruction out of line. */
1304static int 1337static int
1305pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long vaddr) 1338pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr)
1306{ 1339{
1307 if (xol_get_insn_slot(uprobe, vaddr) && !arch_uprobe_pre_xol(&uprobe->arch, regs)) 1340 struct uprobe_task *utask;
1308 return 0; 1341 unsigned long xol_vaddr;
1342 int err;
1309 1343
1310 return -EFAULT; 1344 utask = get_utask();
1345 if (!utask)
1346 return -ENOMEM;
1347
1348 xol_vaddr = xol_get_insn_slot(uprobe);
1349 if (!xol_vaddr)
1350 return -ENOMEM;
1351
1352 utask->xol_vaddr = xol_vaddr;
1353 utask->vaddr = bp_vaddr;
1354
1355 err = arch_uprobe_pre_xol(&uprobe->arch, regs);
1356 if (unlikely(err)) {
1357 xol_free_insn_slot(current);
1358 return err;
1359 }
1360
1361 utask->active_uprobe = uprobe;
1362 utask->state = UTASK_SSTEP;
1363 return 0;
1311} 1364}
1312 1365
1313/* 1366/*
@@ -1369,6 +1422,7 @@ static void mmf_recalc_uprobes(struct mm_struct *mm)
1369 * This is not strictly accurate, we can race with 1422 * This is not strictly accurate, we can race with
1370 * uprobe_unregister() and see the already removed 1423 * uprobe_unregister() and see the already removed
1371 * uprobe if delete_uprobe() was not yet called. 1424 * uprobe if delete_uprobe() was not yet called.
1425 * Or this uprobe can be filtered out.
1372 */ 1426 */
1373 if (vma_has_uprobes(vma, vma->vm_start, vma->vm_end)) 1427 if (vma_has_uprobes(vma, vma->vm_start, vma->vm_end))
1374 return; 1428 return;
@@ -1430,14 +1484,25 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
1430 return uprobe; 1484 return uprobe;
1431} 1485}
1432 1486
1433void __weak arch_uprobe_enable_step(struct arch_uprobe *arch) 1487static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
1434{ 1488{
1435 user_enable_single_step(current); 1489 struct uprobe_consumer *uc;
1436} 1490 int remove = UPROBE_HANDLER_REMOVE;
1437 1491
1438void __weak arch_uprobe_disable_step(struct arch_uprobe *arch) 1492 down_read(&uprobe->register_rwsem);
1439{ 1493 for (uc = uprobe->consumers; uc; uc = uc->next) {
1440 user_disable_single_step(current); 1494 int rc = uc->handler(uc, regs);
1495
1496 WARN(rc & ~UPROBE_HANDLER_MASK,
1497 "bad rc=0x%x from %pf()\n", rc, uc->handler);
1498 remove &= rc;
1499 }
1500
1501 if (remove && uprobe->consumers) {
1502 WARN_ON(!uprobe_is_active(uprobe));
1503 unapply_uprobe(uprobe, current->mm);
1504 }
1505 up_read(&uprobe->register_rwsem);
1441} 1506}
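When every consumer returns UPROBE_HANDLER_REMOVE, the reworked handler_chain() above calls unapply_uprobe() and strips the breakpoints from the current mm; a hypothetical one-shot handler built on that behaviour (demo_* name invented for the sketch):

static int demo_oneshot_handler(struct uprobe_consumer *self,
				struct pt_regs *regs)
{
	pr_info("one-shot hit at %lx\n", instruction_pointer(regs));
	/* ask handler_chain() to remove the breakpoints from this mm */
	return UPROBE_HANDLER_REMOVE;
}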
1442 1507
1443/* 1508/*
@@ -1446,7 +1511,6 @@ void __weak arch_uprobe_disable_step(struct arch_uprobe *arch)
1446 */ 1511 */
1447static void handle_swbp(struct pt_regs *regs) 1512static void handle_swbp(struct pt_regs *regs)
1448{ 1513{
1449 struct uprobe_task *utask;
1450 struct uprobe *uprobe; 1514 struct uprobe *uprobe;
1451 unsigned long bp_vaddr; 1515 unsigned long bp_vaddr;
1452 int uninitialized_var(is_swbp); 1516 int uninitialized_var(is_swbp);
@@ -1471,6 +1535,10 @@ static void handle_swbp(struct pt_regs *regs)
1471 } 1535 }
1472 return; 1536 return;
1473 } 1537 }
1538
1539 /* change it in advance for ->handler() and restart */
1540 instruction_pointer_set(regs, bp_vaddr);
1541
1474 /* 1542 /*
1475 * TODO: move copy_insn/etc into _register and remove this hack. 1543 * TODO: move copy_insn/etc into _register and remove this hack.
1476 * After we hit the bp, _unregister + _register can install the 1544 * After we hit the bp, _unregister + _register can install the
@@ -1478,33 +1546,16 @@ static void handle_swbp(struct pt_regs *regs)
1478 */ 1546 */
1479 smp_rmb(); /* pairs with wmb() in install_breakpoint() */ 1547 smp_rmb(); /* pairs with wmb() in install_breakpoint() */
1480 if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags))) 1548 if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags)))
1481 goto restart; 1549 goto out;
1482
1483 utask = current->utask;
1484 if (!utask) {
1485 utask = add_utask();
1486 /* Cannot allocate; re-execute the instruction. */
1487 if (!utask)
1488 goto restart;
1489 }
1490 1550
1491 handler_chain(uprobe, regs); 1551 handler_chain(uprobe, regs);
1492 if (can_skip_sstep(uprobe, regs)) 1552 if (can_skip_sstep(uprobe, regs))
1493 goto out; 1553 goto out;
1494 1554
1495 if (!pre_ssout(uprobe, regs, bp_vaddr)) { 1555 if (!pre_ssout(uprobe, regs, bp_vaddr))
1496 arch_uprobe_enable_step(&uprobe->arch);
1497 utask->active_uprobe = uprobe;
1498 utask->state = UTASK_SSTEP;
1499 return; 1556 return;
1500 }
1501 1557
1502restart: 1558 /* can_skip_sstep() succeeded, or restart if can't singlestep */
1503 /*
1504 * cannot singlestep; cannot skip instruction;
1505 * re-execute the instruction.
1506 */
1507 instruction_pointer_set(regs, bp_vaddr);
1508out: 1559out:
1509 put_uprobe(uprobe); 1560 put_uprobe(uprobe);
1510} 1561}
@@ -1525,7 +1576,6 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
1525 else 1576 else
1526 WARN_ON_ONCE(1); 1577 WARN_ON_ONCE(1);
1527 1578
1528 arch_uprobe_disable_step(&uprobe->arch);
1529 put_uprobe(uprobe); 1579 put_uprobe(uprobe);
1530 utask->active_uprobe = NULL; 1580 utask->active_uprobe = NULL;
1531 utask->state = UTASK_RUNNING; 1581 utask->state = UTASK_RUNNING;
@@ -1599,10 +1649,11 @@ static int __init init_uprobes(void)
1599{ 1649{
1600 int i; 1650 int i;
1601 1651
1602 for (i = 0; i < UPROBES_HASH_SZ; i++) { 1652 for (i = 0; i < UPROBES_HASH_SZ; i++)
1603 mutex_init(&uprobes_mutex[i]);
1604 mutex_init(&uprobes_mmap_mutex[i]); 1653 mutex_init(&uprobes_mmap_mutex[i]);
1605 } 1654
1655 if (percpu_init_rwsem(&dup_mmap_sem))
1656 return -ENOMEM;
1606 1657
1607 return register_die_notifier(&uprobe_exception_nb); 1658 return register_die_notifier(&uprobe_exception_nb);
1608} 1659}
diff --git a/kernel/exit.c b/kernel/exit.c
index 346616c0092c..60bc027c61c3 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -20,6 +20,7 @@
20#include <linux/tsacct_kern.h> 20#include <linux/tsacct_kern.h>
21#include <linux/file.h> 21#include <linux/file.h>
22#include <linux/fdtable.h> 22#include <linux/fdtable.h>
23#include <linux/freezer.h>
23#include <linux/binfmts.h> 24#include <linux/binfmts.h>
24#include <linux/nsproxy.h> 25#include <linux/nsproxy.h>
25#include <linux/pid_namespace.h> 26#include <linux/pid_namespace.h>
@@ -31,7 +32,6 @@
31#include <linux/mempolicy.h> 32#include <linux/mempolicy.h>
32#include <linux/taskstats_kern.h> 33#include <linux/taskstats_kern.h>
33#include <linux/delayacct.h> 34#include <linux/delayacct.h>
34#include <linux/freezer.h>
35#include <linux/cgroup.h> 35#include <linux/cgroup.h>
36#include <linux/syscalls.h> 36#include <linux/syscalls.h>
37#include <linux/signal.h> 37#include <linux/signal.h>
@@ -72,18 +72,6 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
72 list_del_rcu(&p->tasks); 72 list_del_rcu(&p->tasks);
73 list_del_init(&p->sibling); 73 list_del_init(&p->sibling);
74 __this_cpu_dec(process_counts); 74 __this_cpu_dec(process_counts);
75 /*
76 * If we are the last child process in a pid namespace to be
77 * reaped, notify the reaper sleeping zap_pid_ns_processes().
78 */
79 if (IS_ENABLED(CONFIG_PID_NS)) {
80 struct task_struct *parent = p->real_parent;
81
82 if ((task_active_pid_ns(parent)->child_reaper == parent) &&
83 list_empty(&parent->children) &&
84 (parent->flags & PF_EXITING))
85 wake_up_process(parent);
86 }
87 } 75 }
88 list_del_rcu(&p->thread_group); 76 list_del_rcu(&p->thread_group);
89} 77}
@@ -97,6 +85,7 @@ static void __exit_signal(struct task_struct *tsk)
97 bool group_dead = thread_group_leader(tsk); 85 bool group_dead = thread_group_leader(tsk);
98 struct sighand_struct *sighand; 86 struct sighand_struct *sighand;
99 struct tty_struct *uninitialized_var(tty); 87 struct tty_struct *uninitialized_var(tty);
88 cputime_t utime, stime;
100 89
101 sighand = rcu_dereference_check(tsk->sighand, 90 sighand = rcu_dereference_check(tsk->sighand,
102 lockdep_tasklist_lock_is_held()); 91 lockdep_tasklist_lock_is_held());
@@ -135,9 +124,10 @@ static void __exit_signal(struct task_struct *tsk)
135 * We won't ever get here for the group leader, since it 124 * We won't ever get here for the group leader, since it
136 * will have been the last reference on the signal_struct. 125 * will have been the last reference on the signal_struct.
137 */ 126 */
138 sig->utime += tsk->utime; 127 task_cputime(tsk, &utime, &stime);
139 sig->stime += tsk->stime; 128 sig->utime += utime;
140 sig->gtime += tsk->gtime; 129 sig->stime += stime;
130 sig->gtime += task_gtime(tsk);
141 sig->min_flt += tsk->min_flt; 131 sig->min_flt += tsk->min_flt;
142 sig->maj_flt += tsk->maj_flt; 132 sig->maj_flt += tsk->maj_flt;
143 sig->nvcsw += tsk->nvcsw; 133 sig->nvcsw += tsk->nvcsw;
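The accessor-based pattern above generalizes; a hedged sketch of reading a task's times without touching the raw utime/stime fields (demo_report_times is an invented helper, not part of this patch):

static void demo_report_times(struct task_struct *tsk)
{
	cputime_t utime, stime;

	/* correct whether the fields are updated eagerly or, with
	 * CONFIG_VIRT_CPU_ACCOUNTING_GEN, computed on demand */
	task_cputime(tsk, &utime, &stime);
	pr_info("%s[%d]: utime=%llu stime=%llu gtime=%llu\n",
		tsk->comm, task_pid_nr(tsk),
		(unsigned long long)utime, (unsigned long long)stime,
		(unsigned long long)task_gtime(tsk));
}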
@@ -322,43 +312,6 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
322 } 312 }
323} 313}
324 314
325/**
326 * reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd
327 *
328 * If a kernel thread is launched as a result of a system call, or if
329 * it ever exits, it should generally reparent itself to kthreadd so it
330 * isn't in the way of other processes and is correctly cleaned up on exit.
331 *
332 * The various task state such as scheduling policy and priority may have
333 * been inherited from a user process, so we reset them to sane values here.
334 *
335 * NOTE that reparent_to_kthreadd() gives the caller full capabilities.
336 */
337static void reparent_to_kthreadd(void)
338{
339 write_lock_irq(&tasklist_lock);
340
341 ptrace_unlink(current);
342 /* Reparent to init */
343 current->real_parent = current->parent = kthreadd_task;
344 list_move_tail(&current->sibling, &current->real_parent->children);
345
346 /* Set the exit signal to SIGCHLD so we signal init on exit */
347 current->exit_signal = SIGCHLD;
348
349 if (task_nice(current) < 0)
350 set_user_nice(current, 0);
351 /* cpus_allowed? */
352 /* rt_priority? */
353 /* signals? */
354 memcpy(current->signal->rlim, init_task.signal->rlim,
355 sizeof(current->signal->rlim));
356
357 atomic_inc(&init_cred.usage);
358 commit_creds(&init_cred);
359 write_unlock_irq(&tasklist_lock);
360}
361
362void __set_special_pids(struct pid *pid) 315void __set_special_pids(struct pid *pid)
363{ 316{
364 struct task_struct *curr = current->group_leader; 317 struct task_struct *curr = current->group_leader;
@@ -370,13 +323,6 @@ void __set_special_pids(struct pid *pid)
370 change_pid(curr, PIDTYPE_PGID, pid); 323 change_pid(curr, PIDTYPE_PGID, pid);
371} 324}
372 325
373static void set_special_pids(struct pid *pid)
374{
375 write_lock_irq(&tasklist_lock);
376 __set_special_pids(pid);
377 write_unlock_irq(&tasklist_lock);
378}
379
380/* 326/*
381 * Let kernel threads use this to say that they allow a certain signal. 327 * Let kernel threads use this to say that they allow a certain signal.
382 * Must not be used if kthread was cloned with CLONE_SIGHAND. 328 * Must not be used if kthread was cloned with CLONE_SIGHAND.
@@ -416,54 +362,6 @@ int disallow_signal(int sig)
416 362
417EXPORT_SYMBOL(disallow_signal); 363EXPORT_SYMBOL(disallow_signal);
418 364
419/*
420 * Put all the gunge required to become a kernel thread without
421 * attached user resources in one place where it belongs.
422 */
423
424void daemonize(const char *name, ...)
425{
426 va_list args;
427 sigset_t blocked;
428
429 va_start(args, name);
430 vsnprintf(current->comm, sizeof(current->comm), name, args);
431 va_end(args);
432
433 /*
434 * If we were started as result of loading a module, close all of the
435 * user space pages. We don't need them, and if we didn't close them
436 * they would be locked into memory.
437 */
438 exit_mm(current);
439 /*
440 * We don't want to get frozen, in case system-wide hibernation
441 * or suspend transition begins right now.
442 */
443 current->flags |= (PF_NOFREEZE | PF_KTHREAD);
444
445 if (current->nsproxy != &init_nsproxy) {
446 get_nsproxy(&init_nsproxy);
447 switch_task_namespaces(current, &init_nsproxy);
448 }
449 set_special_pids(&init_struct_pid);
450 proc_clear_tty(current);
451
452 /* Block and flush all signals */
453 sigfillset(&blocked);
454 sigprocmask(SIG_BLOCK, &blocked, NULL);
455 flush_signals(current);
456
457 /* Become as one with the init task */
458
459 daemonize_fs_struct();
460 daemonize_descriptors();
461
462 reparent_to_kthreadd();
463}
464
465EXPORT_SYMBOL(daemonize);
466
467#ifdef CONFIG_MM_OWNER 365#ifdef CONFIG_MM_OWNER
468/* 366/*
469 * A task is exiting. If it owned this mm, find a new owner for the mm. 367 * A task is exiting. If it owned this mm, find a new owner for the mm.
@@ -587,7 +485,7 @@ static void exit_mm(struct task_struct * tsk)
587 set_task_state(tsk, TASK_UNINTERRUPTIBLE); 485 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
588 if (!self.task) /* see coredump_finish() */ 486 if (!self.task) /* see coredump_finish() */
589 break; 487 break;
590 schedule(); 488 freezable_schedule();
591 } 489 }
592 __set_task_state(tsk, TASK_RUNNING); 490 __set_task_state(tsk, TASK_RUNNING);
593 down_read(&mm->mmap_sem); 491 down_read(&mm->mmap_sem);
@@ -1186,17 +1084,17 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1186 * as other threads in the parent group can be right 1084 * as other threads in the parent group can be right
1187 * here reaping other children at the same time. 1085 * here reaping other children at the same time.
1188 * 1086 *
1189 * We use thread_group_times() to get times for the thread 1087 * We use thread_group_cputime_adjusted() to get times for the thread
1190 * group, which consolidates times for all threads in the 1088 * group, which consolidates times for all threads in the
1191 * group including the group leader. 1089 * group including the group leader.
1192 */ 1090 */
1193 thread_group_times(p, &tgutime, &tgstime); 1091 thread_group_cputime_adjusted(p, &tgutime, &tgstime);
1194 spin_lock_irq(&p->real_parent->sighand->siglock); 1092 spin_lock_irq(&p->real_parent->sighand->siglock);
1195 psig = p->real_parent->signal; 1093 psig = p->real_parent->signal;
1196 sig = p->signal; 1094 sig = p->signal;
1197 psig->cutime += tgutime + sig->cutime; 1095 psig->cutime += tgutime + sig->cutime;
1198 psig->cstime += tgstime + sig->cstime; 1096 psig->cstime += tgstime + sig->cstime;
1199 psig->cgtime += p->gtime + sig->gtime + sig->cgtime; 1097 psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
1200 psig->cmin_flt += 1098 psig->cmin_flt +=
1201 p->min_flt + sig->min_flt + sig->cmin_flt; 1099 p->min_flt + sig->min_flt + sig->cmin_flt;
1202 psig->cmaj_flt += 1100 psig->cmaj_flt +=
diff --git a/kernel/fork.c b/kernel/fork.c
index 8b20ab7d3aa2..1766d324d5e3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -146,7 +146,7 @@ void __weak arch_release_thread_info(struct thread_info *ti)
146static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, 146static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
147 int node) 147 int node)
148{ 148{
149 struct page *page = alloc_pages_node(node, THREADINFO_GFP, 149 struct page *page = alloc_pages_node(node, THREADINFO_GFP_ACCOUNTED,
150 THREAD_SIZE_ORDER); 150 THREAD_SIZE_ORDER);
151 151
152 return page ? page_address(page) : NULL; 152 return page ? page_address(page) : NULL;
@@ -154,7 +154,7 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
154 154
155static inline void free_thread_info(struct thread_info *ti) 155static inline void free_thread_info(struct thread_info *ti)
156{ 156{
157 free_pages((unsigned long)ti, THREAD_SIZE_ORDER); 157 free_memcg_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER);
158} 158}
159# else 159# else
160static struct kmem_cache *thread_info_cache; 160static struct kmem_cache *thread_info_cache;
@@ -352,6 +352,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
352 unsigned long charge; 352 unsigned long charge;
353 struct mempolicy *pol; 353 struct mempolicy *pol;
354 354
355 uprobe_start_dup_mmap();
355 down_write(&oldmm->mmap_sem); 356 down_write(&oldmm->mmap_sem);
356 flush_cache_dup_mm(oldmm); 357 flush_cache_dup_mm(oldmm);
357 uprobe_dup_mmap(oldmm, mm); 358 uprobe_dup_mmap(oldmm, mm);
@@ -412,7 +413,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
412 tmp->vm_next = tmp->vm_prev = NULL; 413 tmp->vm_next = tmp->vm_prev = NULL;
413 file = tmp->vm_file; 414 file = tmp->vm_file;
414 if (file) { 415 if (file) {
415 struct inode *inode = file->f_path.dentry->d_inode; 416 struct inode *inode = file_inode(file);
416 struct address_space *mapping = file->f_mapping; 417 struct address_space *mapping = file->f_mapping;
417 418
418 get_file(file); 419 get_file(file);
@@ -469,6 +470,7 @@ out:
469 up_write(&mm->mmap_sem); 470 up_write(&mm->mmap_sem);
470 flush_tlb_mm(oldmm); 471 flush_tlb_mm(oldmm);
471 up_write(&oldmm->mmap_sem); 472 up_write(&oldmm->mmap_sem);
473 uprobe_end_dup_mmap();
472 return retval; 474 return retval;
473fail_nomem_anon_vma_fork: 475fail_nomem_anon_vma_fork:
474 mpol_put(pol); 476 mpol_put(pol);
@@ -821,6 +823,9 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
821#ifdef CONFIG_TRANSPARENT_HUGEPAGE 823#ifdef CONFIG_TRANSPARENT_HUGEPAGE
822 mm->pmd_huge_pte = NULL; 824 mm->pmd_huge_pte = NULL;
823#endif 825#endif
826#ifdef CONFIG_NUMA_BALANCING
827 mm->first_nid = NUMA_PTE_SCAN_INIT;
828#endif
824 if (!mm_init(mm, tsk)) 829 if (!mm_init(mm, tsk))
825 goto fail_nomem; 830 goto fail_nomem;
826 831
@@ -1039,8 +1044,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
1039 atomic_set(&sig->live, 1); 1044 atomic_set(&sig->live, 1);
1040 atomic_set(&sig->sigcnt, 1); 1045 atomic_set(&sig->sigcnt, 1);
1041 init_waitqueue_head(&sig->wait_chldexit); 1046 init_waitqueue_head(&sig->wait_chldexit);
1042 if (clone_flags & CLONE_NEWPID)
1043 sig->flags |= SIGNAL_UNKILLABLE;
1044 sig->curr_target = tsk; 1047 sig->curr_target = tsk;
1045 init_sigpending(&sig->shared_pending); 1048 init_sigpending(&sig->shared_pending);
1046 INIT_LIST_HEAD(&sig->posix_timers); 1049 INIT_LIST_HEAD(&sig->posix_timers);
@@ -1127,7 +1130,6 @@ static void posix_cpu_timers_init(struct task_struct *tsk)
1127 */ 1130 */
1128static struct task_struct *copy_process(unsigned long clone_flags, 1131static struct task_struct *copy_process(unsigned long clone_flags,
1129 unsigned long stack_start, 1132 unsigned long stack_start,
1130 struct pt_regs *regs,
1131 unsigned long stack_size, 1133 unsigned long stack_size,
1132 int __user *child_tidptr, 1134 int __user *child_tidptr,
1133 struct pid *pid, 1135 struct pid *pid,
@@ -1135,11 +1137,13 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1135{ 1137{
1136 int retval; 1138 int retval;
1137 struct task_struct *p; 1139 struct task_struct *p;
1138 int cgroup_callbacks_done = 0;
1139 1140
1140 if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) 1141 if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
1141 return ERR_PTR(-EINVAL); 1142 return ERR_PTR(-EINVAL);
1142 1143
1144 if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS))
1145 return ERR_PTR(-EINVAL);
1146
1143 /* 1147 /*
1144 * Thread groups must share signals as well, and detached threads 1148 * Thread groups must share signals as well, and detached threads
1145 * can only be started up within the thread group. 1149 * can only be started up within the thread group.
@@ -1165,6 +1169,14 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1165 current->signal->flags & SIGNAL_UNKILLABLE) 1169 current->signal->flags & SIGNAL_UNKILLABLE)
1166 return ERR_PTR(-EINVAL); 1170 return ERR_PTR(-EINVAL);
1167 1171
1172 /*
1173 * If the new process will be in a different pid namespace
1174 * don't allow the creation of threads.
1175 */
1176 if ((clone_flags & (CLONE_VM|CLONE_NEWPID)) &&
1177 (task_active_pid_ns(current) != current->nsproxy->pid_ns))
1178 return ERR_PTR(-EINVAL);
1179
1168 retval = security_task_create(clone_flags); 1180 retval = security_task_create(clone_flags);
1169 if (retval) 1181 if (retval)
1170 goto fork_out; 1182 goto fork_out;
@@ -1222,8 +1234,14 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1222 p->utime = p->stime = p->gtime = 0; 1234 p->utime = p->stime = p->gtime = 0;
1223 p->utimescaled = p->stimescaled = 0; 1235 p->utimescaled = p->stimescaled = 0;
1224#ifndef CONFIG_VIRT_CPU_ACCOUNTING 1236#ifndef CONFIG_VIRT_CPU_ACCOUNTING
1225 p->prev_utime = p->prev_stime = 0; 1237 p->prev_cputime.utime = p->prev_cputime.stime = 0;
1226#endif 1238#endif
1239#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
1240 seqlock_init(&p->vtime_seqlock);
1241 p->vtime_snap = 0;
1242 p->vtime_snap_whence = VTIME_SLEEPING;
1243#endif
1244
1227#if defined(SPLIT_RSS_COUNTING) 1245#if defined(SPLIT_RSS_COUNTING)
1228 memset(&p->rss_stat, 0, sizeof(p->rss_stat)); 1246 memset(&p->rss_stat, 0, sizeof(p->rss_stat));
1229#endif 1247#endif
@@ -1320,7 +1338,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1320 retval = copy_io(clone_flags, p); 1338 retval = copy_io(clone_flags, p);
1321 if (retval) 1339 if (retval)
1322 goto bad_fork_cleanup_namespaces; 1340 goto bad_fork_cleanup_namespaces;
1323 retval = copy_thread(clone_flags, stack_start, stack_size, p, regs); 1341 retval = copy_thread(clone_flags, stack_start, stack_size, p);
1324 if (retval) 1342 if (retval)
1325 goto bad_fork_cleanup_io; 1343 goto bad_fork_cleanup_io;
1326 1344
@@ -1393,12 +1411,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1393 INIT_LIST_HEAD(&p->thread_group); 1411 INIT_LIST_HEAD(&p->thread_group);
1394 p->task_works = NULL; 1412 p->task_works = NULL;
1395 1413
1396 /* Now that the task is set up, run cgroup callbacks if
1397 * necessary. We need to run them before the task is visible
1398 * on the tasklist. */
1399 cgroup_fork_callbacks(p);
1400 cgroup_callbacks_done = 1;
1401
1402 /* Need tasklist lock for parent etc handling! */ 1414 /* Need tasklist lock for parent etc handling! */
1403 write_lock_irq(&tasklist_lock); 1415 write_lock_irq(&tasklist_lock);
1404 1416
@@ -1441,8 +1453,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1441 ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); 1453 ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
1442 1454
1443 if (thread_group_leader(p)) { 1455 if (thread_group_leader(p)) {
1444 if (is_child_reaper(pid)) 1456 if (is_child_reaper(pid)) {
1445 p->nsproxy->pid_ns->child_reaper = p; 1457 ns_of_pid(pid)->child_reaper = p;
1458 p->signal->flags |= SIGNAL_UNKILLABLE;
1459 }
1446 1460
1447 p->signal->leader_pid = pid; 1461 p->signal->leader_pid = pid;
1448 p->signal->tty = tty_kref_get(current->signal->tty); 1462 p->signal->tty = tty_kref_get(current->signal->tty);
@@ -1476,8 +1490,6 @@ bad_fork_cleanup_io:
1476 if (p->io_context) 1490 if (p->io_context)
1477 exit_io_context(p); 1491 exit_io_context(p);
1478bad_fork_cleanup_namespaces: 1492bad_fork_cleanup_namespaces:
1479 if (unlikely(clone_flags & CLONE_NEWPID))
1480 pid_ns_release_proc(p->nsproxy->pid_ns);
1481 exit_task_namespaces(p); 1493 exit_task_namespaces(p);
1482bad_fork_cleanup_mm: 1494bad_fork_cleanup_mm:
1483 if (p->mm) 1495 if (p->mm)
@@ -1503,7 +1515,7 @@ bad_fork_cleanup_cgroup:
1503#endif 1515#endif
1504 if (clone_flags & CLONE_THREAD) 1516 if (clone_flags & CLONE_THREAD)
1505 threadgroup_change_end(current); 1517 threadgroup_change_end(current);
1506 cgroup_exit(p, cgroup_callbacks_done); 1518 cgroup_exit(p, 0);
1507 delayacct_tsk_free(p); 1519 delayacct_tsk_free(p);
1508 module_put(task_thread_info(p)->exec_domain->module); 1520 module_put(task_thread_info(p)->exec_domain->module);
1509bad_fork_cleanup_count: 1521bad_fork_cleanup_count:
@@ -1515,12 +1527,6 @@ fork_out:
1515 return ERR_PTR(retval); 1527 return ERR_PTR(retval);
1516} 1528}
1517 1529
1518noinline struct pt_regs * __cpuinit __attribute__((weak)) idle_regs(struct pt_regs *regs)
1519{
1520 memset(regs, 0, sizeof(struct pt_regs));
1521 return regs;
1522}
1523
1524static inline void init_idle_pids(struct pid_link *links) 1530static inline void init_idle_pids(struct pid_link *links)
1525{ 1531{
1526 enum pid_type type; 1532 enum pid_type type;
@@ -1534,10 +1540,7 @@ static inline void init_idle_pids(struct pid_link *links)
1534struct task_struct * __cpuinit fork_idle(int cpu) 1540struct task_struct * __cpuinit fork_idle(int cpu)
1535{ 1541{
1536 struct task_struct *task; 1542 struct task_struct *task;
1537 struct pt_regs regs; 1543 task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0);
1538
1539 task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL,
1540 &init_struct_pid, 0);
1541 if (!IS_ERR(task)) { 1544 if (!IS_ERR(task)) {
1542 init_idle_pids(task->pids); 1545 init_idle_pids(task->pids);
1543 init_idle(task, cpu); 1546 init_idle(task, cpu);
@@ -1554,7 +1557,6 @@ struct task_struct * __cpuinit fork_idle(int cpu)
1554 */ 1557 */
1555long do_fork(unsigned long clone_flags, 1558long do_fork(unsigned long clone_flags,
1556 unsigned long stack_start, 1559 unsigned long stack_start,
1557 struct pt_regs *regs,
1558 unsigned long stack_size, 1560 unsigned long stack_size,
1559 int __user *parent_tidptr, 1561 int __user *parent_tidptr,
1560 int __user *child_tidptr) 1562 int __user *child_tidptr)
@@ -1567,15 +1569,9 @@ long do_fork(unsigned long clone_flags,
1567 * Do some preliminary argument and permissions checking before we 1569 * Do some preliminary argument and permissions checking before we
1568 * actually start allocating stuff 1570 * actually start allocating stuff
1569 */ 1571 */
1570 if (clone_flags & CLONE_NEWUSER) { 1572 if (clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) {
1571 if (clone_flags & CLONE_THREAD) 1573 if (clone_flags & (CLONE_THREAD|CLONE_PARENT))
1572 return -EINVAL; 1574 return -EINVAL;
1573 /* hopefully this check will go away when userns support is
1574 * complete
1575 */
1576 if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SETUID) ||
1577 !capable(CAP_SETGID))
1578 return -EPERM;
1579 } 1575 }
1580 1576
1581 /* 1577 /*
@@ -1584,7 +1580,7 @@ long do_fork(unsigned long clone_flags,
1584 * requested, no event is reported; otherwise, report if the event 1580 * requested, no event is reported; otherwise, report if the event
1585 * for the type of forking is enabled. 1581 * for the type of forking is enabled.
1586 */ 1582 */
1587 if (!(clone_flags & CLONE_UNTRACED) && likely(user_mode(regs))) { 1583 if (!(clone_flags & CLONE_UNTRACED)) {
1588 if (clone_flags & CLONE_VFORK) 1584 if (clone_flags & CLONE_VFORK)
1589 trace = PTRACE_EVENT_VFORK; 1585 trace = PTRACE_EVENT_VFORK;
1590 else if ((clone_flags & CSIGNAL) != SIGCHLD) 1586 else if ((clone_flags & CSIGNAL) != SIGCHLD)
@@ -1596,7 +1592,7 @@ long do_fork(unsigned long clone_flags,
1596 trace = 0; 1592 trace = 0;
1597 } 1593 }
1598 1594
1599 p = copy_process(clone_flags, stack_start, regs, stack_size, 1595 p = copy_process(clone_flags, stack_start, stack_size,
1600 child_tidptr, NULL, trace); 1596 child_tidptr, NULL, trace);
1601 /* 1597 /*
1602 * Do this prior waking up the new thread - the thread pointer 1598 * Do this prior waking up the new thread - the thread pointer
@@ -1634,15 +1630,58 @@ long do_fork(unsigned long clone_flags,
1634 return nr; 1630 return nr;
1635} 1631}
1636 1632
1637#ifdef CONFIG_GENERIC_KERNEL_THREAD
1638/* 1633/*
1639 * Create a kernel thread. 1634 * Create a kernel thread.
1640 */ 1635 */
1641pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) 1636pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
1642{ 1637{
1643 return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn, NULL, 1638 return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
1644 (unsigned long)arg, NULL, NULL); 1639 (unsigned long)arg, NULL, NULL);
1645} 1640}
1641
1642#ifdef __ARCH_WANT_SYS_FORK
1643SYSCALL_DEFINE0(fork)
1644{
1645#ifdef CONFIG_MMU
1646 return do_fork(SIGCHLD, 0, 0, NULL, NULL);
1647#else
1648 /* can not support in nommu mode */
1649 return(-EINVAL);
1650#endif
1651}
1652#endif
1653
1654#ifdef __ARCH_WANT_SYS_VFORK
1655SYSCALL_DEFINE0(vfork)
1656{
1657 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
1658 0, NULL, NULL);
1659}
1660#endif
1661
1662#ifdef __ARCH_WANT_SYS_CLONE
1663#ifdef CONFIG_CLONE_BACKWARDS
1664SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
1665 int __user *, parent_tidptr,
1666 int, tls_val,
1667 int __user *, child_tidptr)
1668#elif defined(CONFIG_CLONE_BACKWARDS2)
1669SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,
1670 int __user *, parent_tidptr,
1671 int __user *, child_tidptr,
1672 int, tls_val)
1673#else
1674SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
1675 int __user *, parent_tidptr,
1676 int __user *, child_tidptr,
1677 int, tls_val)
1678#endif
1679{
1680 long ret = do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr);
1681 asmlinkage_protect(5, ret, clone_flags, newsp,
1682 parent_tidptr, child_tidptr, tls_val);
1683 return ret;
1684}
1646#endif 1685#endif
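With the pt_regs argument gone from do_fork(), kernel_thread() is an ordinary call again; a minimal sketch under that assumption (demo_* names invented, and new code would normally go through kthread_run() instead):

static int demo_thread_fn(void *data)
{
	pr_info("demo kernel thread, data=%p\n", data);
	return 0;	/* if the function returns, the thread exits */
}

static pid_t demo_spawn_thread(void)
{
	return kernel_thread(demo_thread_fn, NULL, CLONE_FS | CLONE_FILES);
}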
1647 1686
1648#ifndef ARCH_MIN_MMSTRUCT_ALIGN 1687#ifndef ARCH_MIN_MMSTRUCT_ALIGN
@@ -1694,7 +1733,8 @@ static int check_unshare_flags(unsigned long unshare_flags)
1694{ 1733{
1695 if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| 1734 if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
1696 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| 1735 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
1697 CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET)) 1736 CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
1737 CLONE_NEWUSER|CLONE_NEWPID))
1698 return -EINVAL; 1738 return -EINVAL;
1699 /* 1739 /*
1700 * Not implemented, but pretend it works if there is nothing to 1740 * Not implemented, but pretend it works if there is nothing to
@@ -1761,19 +1801,40 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1761{ 1801{
1762 struct fs_struct *fs, *new_fs = NULL; 1802 struct fs_struct *fs, *new_fs = NULL;
1763 struct files_struct *fd, *new_fd = NULL; 1803 struct files_struct *fd, *new_fd = NULL;
1804 struct cred *new_cred = NULL;
1764 struct nsproxy *new_nsproxy = NULL; 1805 struct nsproxy *new_nsproxy = NULL;
1765 int do_sysvsem = 0; 1806 int do_sysvsem = 0;
1766 int err; 1807 int err;
1767 1808
1768 err = check_unshare_flags(unshare_flags); 1809 /*
1769 if (err) 1810 * If unsharing a user namespace, must also unshare the thread.
1770 goto bad_unshare_out; 1811 */
1771 1812 if (unshare_flags & CLONE_NEWUSER)
1813 unshare_flags |= CLONE_THREAD | CLONE_FS;
1814 /*
1815 * If unsharing a pid namespace, must also unshare the thread.
1816 */
1817 if (unshare_flags & CLONE_NEWPID)
1818 unshare_flags |= CLONE_THREAD;
1819 /*
1820 * If unsharing a thread from a thread group, must also unshare vm.
1821 */
1822 if (unshare_flags & CLONE_THREAD)
1823 unshare_flags |= CLONE_VM;
1824 /*
1825 * If unsharing vm, must also unshare signal handlers.
1826 */
1827 if (unshare_flags & CLONE_VM)
1828 unshare_flags |= CLONE_SIGHAND;
1772 /* 1829 /*
1773 * If unsharing namespace, must also unshare filesystem information. 1830 * If unsharing namespace, must also unshare filesystem information.
1774 */ 1831 */
1775 if (unshare_flags & CLONE_NEWNS) 1832 if (unshare_flags & CLONE_NEWNS)
1776 unshare_flags |= CLONE_FS; 1833 unshare_flags |= CLONE_FS;
1834
1835 err = check_unshare_flags(unshare_flags);
1836 if (err)
1837 goto bad_unshare_out;
1777 /* 1838 /*
1778 * CLONE_NEWIPC must also detach from the undolist: after switching 1839 * CLONE_NEWIPC must also detach from the undolist: after switching
1779 * to a new ipc namespace, the semaphore arrays from the old 1840 * to a new ipc namespace, the semaphore arrays from the old
@@ -1787,11 +1848,15 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1787 err = unshare_fd(unshare_flags, &new_fd); 1848 err = unshare_fd(unshare_flags, &new_fd);
1788 if (err) 1849 if (err)
1789 goto bad_unshare_cleanup_fs; 1850 goto bad_unshare_cleanup_fs;
1790 err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, new_fs); 1851 err = unshare_userns(unshare_flags, &new_cred);
1791 if (err) 1852 if (err)
1792 goto bad_unshare_cleanup_fd; 1853 goto bad_unshare_cleanup_fd;
1854 err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
1855 new_cred, new_fs);
1856 if (err)
1857 goto bad_unshare_cleanup_cred;
1793 1858
1794 if (new_fs || new_fd || do_sysvsem || new_nsproxy) { 1859 if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) {
1795 if (do_sysvsem) { 1860 if (do_sysvsem) {
1796 /* 1861 /*
1797 * CLONE_SYSVSEM is equivalent to sys_exit(). 1862 * CLONE_SYSVSEM is equivalent to sys_exit().
@@ -1799,10 +1864,8 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1799 exit_sem(current); 1864 exit_sem(current);
1800 } 1865 }
1801 1866
1802 if (new_nsproxy) { 1867 if (new_nsproxy)
1803 switch_task_namespaces(current, new_nsproxy); 1868 switch_task_namespaces(current, new_nsproxy);
1804 new_nsproxy = NULL;
1805 }
1806 1869
1807 task_lock(current); 1870 task_lock(current);
1808 1871
@@ -1824,11 +1887,17 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1824 } 1887 }
1825 1888
1826 task_unlock(current); 1889 task_unlock(current);
1827 }
1828 1890
1829 if (new_nsproxy) 1891 if (new_cred) {
1830 put_nsproxy(new_nsproxy); 1892 /* Install the new user namespace */
1893 commit_creds(new_cred);
1894 new_cred = NULL;
1895 }
1896 }
1831 1897
1898bad_unshare_cleanup_cred:
1899 if (new_cred)
1900 put_cred(new_cred);
1832bad_unshare_cleanup_fd: 1901bad_unshare_cleanup_fd:
1833 if (new_fd) 1902 if (new_fd)
1834 put_files_struct(new_fd); 1903 put_files_struct(new_fd);
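A userspace sketch of the unshare() flag implications added in the hunk above: CLONE_NEWUSER now pulls in CLONE_THREAD, CLONE_FS, CLONE_VM and CLONE_SIGHAND automatically, so a single-threaded caller only names the namespace it wants. This is illustrative only and assumes the caller is single-threaded.

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>

int main(void)
{
	if (unshare(CLONE_NEWUSER) != 0) {
		fprintf(stderr, "unshare(CLONE_NEWUSER): %s\n", strerror(errno));
		return 1;
	}
	printf("now running in a new user namespace\n");
	return 0;
}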
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 11f82a4d4eae..c38893b0efba 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -116,17 +116,10 @@ bool freeze_task(struct task_struct *p)
116 return false; 116 return false;
117 } 117 }
118 118
119 if (!(p->flags & PF_KTHREAD)) { 119 if (!(p->flags & PF_KTHREAD))
120 fake_signal_wake_up(p); 120 fake_signal_wake_up(p);
121 /* 121 else
122 * fake_signal_wake_up() goes through p's scheduler
123 * lock and guarantees that TASK_STOPPED/TRACED ->
124 * TASK_RUNNING transition can't race with task state
125 * testing in try_to_freeze_tasks().
126 */
127 } else {
128 wake_up_state(p, TASK_INTERRUPTIBLE); 122 wake_up_state(p, TASK_INTERRUPTIBLE);
129 }
130 123
131 spin_unlock_irqrestore(&freezer_lock, flags); 124 spin_unlock_irqrestore(&freezer_lock, flags);
132 return true; 125 return true;
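
For context, a minimal sketch of the kthread side that the simplified freeze_task() above wakes with wake_up_state(TASK_INTERRUPTIBLE); the thread name and loop are illustrative, the only requirement being a try_to_freeze() call in the sleep loop of a thread marked freezable.

#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/jiffies.h>

static int demo_thread(void *data)
{
	set_freezable();

	while (!kthread_should_stop()) {
		/* Parks here when the freezer marks this thread frozen. */
		try_to_freeze();

		schedule_timeout_interruptible(HZ);
	}
	return 0;
}
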
diff --git a/kernel/futex.c b/kernel/futex.c
index 3717e7b306e0..b26dcfc02c94 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -60,6 +60,7 @@
60#include <linux/pid.h> 60#include <linux/pid.h>
61#include <linux/nsproxy.h> 61#include <linux/nsproxy.h>
62#include <linux/ptrace.h> 62#include <linux/ptrace.h>
63#include <linux/sched/rt.h>
63 64
64#include <asm/futex.h> 65#include <asm/futex.h>
65 66
@@ -222,10 +223,11 @@ static void drop_futex_key_refs(union futex_key *key)
222 * @rw: mapping needs to be read/write (values: VERIFY_READ, 223 * @rw: mapping needs to be read/write (values: VERIFY_READ,
223 * VERIFY_WRITE) 224 * VERIFY_WRITE)
224 * 225 *
225 * Returns a negative error code or 0 226 * Return: a negative error code or 0
227 *
226 * The key words are stored in *key on success. 228 * The key words are stored in *key on success.
227 * 229 *
228 * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode, 230 * For shared mappings, it's (page->index, file_inode(vma->vm_file),
229 * offset_within_page). For private mappings, it's (uaddr, current->mm). 231 * offset_within_page). For private mappings, it's (uaddr, current->mm).
230 * We can usually work out the index without swapping in the page. 232 * We can usually work out the index without swapping in the page.
231 * 233 *
@@ -704,9 +706,9 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
704 * be "current" except in the case of requeue pi. 706 * be "current" except in the case of requeue pi.
705 * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) 707 * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
706 * 708 *
707 * Returns: 709 * Return:
708 * 0 - ready to wait 710 * 0 - ready to wait;
709 * 1 - acquired the lock 711 * 1 - acquired the lock;
710 * <0 - error 712 * <0 - error
711 * 713 *
712 * The hb->lock and futex_key refs shall be held by the caller. 714 * The hb->lock and futex_key refs shall be held by the caller.
@@ -716,7 +718,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
716 struct futex_pi_state **ps, 718 struct futex_pi_state **ps,
717 struct task_struct *task, int set_waiters) 719 struct task_struct *task, int set_waiters)
718{ 720{
719 int lock_taken, ret, ownerdied = 0; 721 int lock_taken, ret, force_take = 0;
720 u32 uval, newval, curval, vpid = task_pid_vnr(task); 722 u32 uval, newval, curval, vpid = task_pid_vnr(task);
721 723
722retry: 724retry:
@@ -755,17 +757,15 @@ retry:
755 newval = curval | FUTEX_WAITERS; 757 newval = curval | FUTEX_WAITERS;
756 758
757 /* 759 /*
758 * There are two cases, where a futex might have no owner (the 760 * Should we force take the futex? See below.
759 * owner TID is 0): OWNER_DIED. We take over the futex in this
760 * case. We also do an unconditional take over, when the owner
761 * of the futex died.
762 *
763 * This is safe as we are protected by the hash bucket lock !
764 */ 761 */
765 if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { 762 if (unlikely(force_take)) {
766 /* Keep the OWNER_DIED bit */ 763 /*
764 * Keep the OWNER_DIED and the WAITERS bit and set the
765 * new TID value.
766 */
767 newval = (curval & ~FUTEX_TID_MASK) | vpid; 767 newval = (curval & ~FUTEX_TID_MASK) | vpid;
768 ownerdied = 0; 768 force_take = 0;
769 lock_taken = 1; 769 lock_taken = 1;
770 } 770 }
771 771
@@ -775,7 +775,7 @@ retry:
775 goto retry; 775 goto retry;
776 776
777 /* 777 /*
778 * We took the lock due to owner died take over. 778 * We took the lock due to forced take over.
779 */ 779 */
780 if (unlikely(lock_taken)) 780 if (unlikely(lock_taken))
781 return 1; 781 return 1;
@@ -790,20 +790,25 @@ retry:
790 switch (ret) { 790 switch (ret) {
791 case -ESRCH: 791 case -ESRCH:
792 /* 792 /*
793 * No owner found for this futex. Check if the 793 * We failed to find an owner for this
794 * OWNER_DIED bit is set to figure out whether 794 * futex. So we have no pi_state to block
795 * this is a robust futex or not. 795 * on. This can happen in two cases:
796 *
797 * 1) The owner died
798 * 2) A stale FUTEX_WAITERS bit
799 *
800 * Re-read the futex value.
796 */ 801 */
797 if (get_futex_value_locked(&curval, uaddr)) 802 if (get_futex_value_locked(&curval, uaddr))
798 return -EFAULT; 803 return -EFAULT;
799 804
800 /* 805 /*
801 * We simply start over in case of a robust 806 * If the owner died or we have a stale
802 * futex. The code above will take the futex 807 * WAITERS bit the owner TID in the user space
803 * and return happy. 808 * futex is 0.
804 */ 809 */
805 if (curval & FUTEX_OWNER_DIED) { 810 if (!(curval & FUTEX_TID_MASK)) {
806 ownerdied = 1; 811 force_take = 1;
807 goto retry; 812 goto retry;
808 } 813 }
809 default: 814 default:
@@ -840,6 +845,9 @@ static void wake_futex(struct futex_q *q)
840{ 845{
841 struct task_struct *p = q->task; 846 struct task_struct *p = q->task;
842 847
848 if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
849 return;
850
843 /* 851 /*
844 * We set q->lock_ptr = NULL _before_ we wake up the task. If 852 * We set q->lock_ptr = NULL _before_ we wake up the task. If
845 * a non-futex wake up happens on another CPU then the task 853 * a non-futex wake up happens on another CPU then the task
@@ -1075,6 +1083,10 @@ retry_private:
1075 1083
1076 plist_for_each_entry_safe(this, next, head, list) { 1084 plist_for_each_entry_safe(this, next, head, list) {
1077 if (match_futex (&this->key, &key1)) { 1085 if (match_futex (&this->key, &key1)) {
1086 if (this->pi_state || this->rt_waiter) {
1087 ret = -EINVAL;
1088 goto out_unlock;
1089 }
1078 wake_futex(this); 1090 wake_futex(this);
1079 if (++ret >= nr_wake) 1091 if (++ret >= nr_wake)
1080 break; 1092 break;
@@ -1087,6 +1099,10 @@ retry_private:
1087 op_ret = 0; 1099 op_ret = 0;
1088 plist_for_each_entry_safe(this, next, head, list) { 1100 plist_for_each_entry_safe(this, next, head, list) {
1089 if (match_futex (&this->key, &key2)) { 1101 if (match_futex (&this->key, &key2)) {
1102 if (this->pi_state || this->rt_waiter) {
1103 ret = -EINVAL;
1104 goto out_unlock;
1105 }
1090 wake_futex(this); 1106 wake_futex(this);
1091 if (++op_ret >= nr_wake2) 1107 if (++op_ret >= nr_wake2)
1092 break; 1108 break;
@@ -1095,6 +1111,7 @@ retry_private:
1095 ret += op_ret; 1111 ret += op_ret;
1096 } 1112 }
1097 1113
1114out_unlock:
1098 double_unlock_hb(hb1, hb2); 1115 double_unlock_hb(hb1, hb2);
1099out_put_keys: 1116out_put_keys:
1100 put_futex_key(&key2); 1117 put_futex_key(&key2);
@@ -1175,9 +1192,9 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
1175 * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit. 1192 * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
1176 * hb1 and hb2 must be held by the caller. 1193 * hb1 and hb2 must be held by the caller.
1177 * 1194 *
1178 * Returns: 1195 * Return:
1179 * 0 - failed to acquire the lock atomicly 1196 * 0 - failed to acquire the lock atomically;
1180 * 1 - acquired the lock 1197 * 1 - acquired the lock;
1181 * <0 - error 1198 * <0 - error
1182 */ 1199 */
1183static int futex_proxy_trylock_atomic(u32 __user *pifutex, 1200static int futex_proxy_trylock_atomic(u32 __user *pifutex,
@@ -1238,8 +1255,8 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1238 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire 1255 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
1239 * uaddr2 atomically on behalf of the top waiter. 1256 * uaddr2 atomically on behalf of the top waiter.
1240 * 1257 *
1241 * Returns: 1258 * Return:
1242 * >=0 - on success, the number of tasks requeued or woken 1259 * >=0 - on success, the number of tasks requeued or woken;
1243 * <0 - on error 1260 * <0 - on error
1244 */ 1261 */
1245static int futex_requeue(u32 __user *uaddr1, unsigned int flags, 1262static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
@@ -1384,9 +1401,13 @@ retry_private:
1384 /* 1401 /*
1385 * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always 1402 * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always
1386 * be paired with each other and no other futex ops. 1403 * be paired with each other and no other futex ops.
1404 *
1405 * We should never be requeueing a futex_q with a pi_state,
1406 * which is awaiting a futex_unlock_pi().
1387 */ 1407 */
1388 if ((requeue_pi && !this->rt_waiter) || 1408 if ((requeue_pi && !this->rt_waiter) ||
1389 (!requeue_pi && this->rt_waiter)) { 1409 (!requeue_pi && this->rt_waiter) ||
1410 this->pi_state) {
1390 ret = -EINVAL; 1411 ret = -EINVAL;
1391 break; 1412 break;
1392 } 1413 }
@@ -1516,8 +1537,8 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
1516 * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must 1537 * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must
1517 * be paired with exactly one earlier call to queue_me(). 1538 * be paired with exactly one earlier call to queue_me().
1518 * 1539 *
1519 * Returns: 1540 * Return:
1520 * 1 - if the futex_q was still queued (and we removed unqueued it) 1541 * 1 - if the futex_q was still queued (and we removed unqueued it);
1521 * 0 - if the futex_q was already removed by the waking thread 1542 * 0 - if the futex_q was already removed by the waking thread
1522 */ 1543 */
1523static int unqueue_me(struct futex_q *q) 1544static int unqueue_me(struct futex_q *q)
@@ -1687,9 +1708,9 @@ static long futex_wait_restart(struct restart_block *restart);
1687 * the pi_state owner as well as handle race conditions that may allow us to 1708 * the pi_state owner as well as handle race conditions that may allow us to
1688 * acquire the lock. Must be called with the hb lock held. 1709 * acquire the lock. Must be called with the hb lock held.
1689 * 1710 *
1690 * Returns: 1711 * Return:
1691 * 1 - success, lock taken 1712 * 1 - success, lock taken;
1692 * 0 - success, lock not taken 1713 * 0 - success, lock not taken;
1693 * <0 - on error (-EFAULT) 1714 * <0 - on error (-EFAULT)
1694 */ 1715 */
1695static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) 1716static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
@@ -1804,8 +1825,8 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
1804 * Return with the hb lock held and a q.key reference on success, and unlocked 1825 * Return with the hb lock held and a q.key reference on success, and unlocked
1805 * with no q.key reference on failure. 1826 * with no q.key reference on failure.
1806 * 1827 *
1807 * Returns: 1828 * Return:
1808 * 0 - uaddr contains val and hb has been locked 1829 * 0 - uaddr contains val and hb has been locked;
1809 * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked 1830 * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
1810 */ 1831 */
1811static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, 1832static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
@@ -2183,9 +2204,9 @@ pi_faulted:
2183 * the wakeup and return the appropriate error code to the caller. Must be 2204 * the wakeup and return the appropriate error code to the caller. Must be
2184 * called with the hb lock held. 2205 * called with the hb lock held.
2185 * 2206 *
2186 * Returns 2207 * Return:
2187 * 0 - no early wakeup detected 2208 * 0 = no early wakeup detected;
2188 * <0 - -ETIMEDOUT or -ERESTARTNOINTR 2209 * <0 = -ETIMEDOUT or -ERESTARTNOINTR
2189 */ 2210 */
2190static inline 2211static inline
2191int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, 2212int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
@@ -2227,7 +2248,6 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2227 * @val: the expected value of uaddr 2248 * @val: the expected value of uaddr
2228 * @abs_time: absolute timeout 2249 * @abs_time: absolute timeout
2229 * @bitset: 32 bit wakeup bitset set by userspace, defaults to all 2250 * @bitset: 32 bit wakeup bitset set by userspace, defaults to all
2230 * @clockrt: whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0)
2231 * @uaddr2: the pi futex we will take prior to returning to user-space 2251 * @uaddr2: the pi futex we will take prior to returning to user-space
2232 * 2252 *
2233 * The caller will wait on uaddr and will be requeued by futex_requeue() to 2253 * The caller will wait on uaddr and will be requeued by futex_requeue() to
@@ -2238,7 +2258,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2238 * there was a need to. 2258 * there was a need to.
2239 * 2259 *
2240 * We call schedule in futex_wait_queue_me() when we enqueue and return there 2260 * We call schedule in futex_wait_queue_me() when we enqueue and return there
2241 * via the following: 2261 * via the following--
2242 * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue() 2262 * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
2243 * 2) wakeup on uaddr2 after a requeue 2263 * 2) wakeup on uaddr2 after a requeue
2244 * 3) signal 2264 * 3) signal
@@ -2256,8 +2276,8 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2256 * 2276 *
2257 * If 4 or 7, we cleanup and return with -ETIMEDOUT. 2277 * If 4 or 7, we cleanup and return with -ETIMEDOUT.
2258 * 2278 *
2259 * Returns: 2279 * Return:
2260 * 0 - On success 2280 * 0 - On success;
2261 * <0 - On error 2281 * <0 - On error
2262 */ 2282 */
2263static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, 2283static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
@@ -2452,8 +2472,6 @@ SYSCALL_DEFINE3(get_robust_list, int, pid,
2452 if (!futex_cmpxchg_enabled) 2472 if (!futex_cmpxchg_enabled)
2453 return -ENOSYS; 2473 return -ENOSYS;
2454 2474
2455 WARN_ONCE(1, "deprecated: get_robust_list will be deleted in 2013.\n");
2456
2457 rcu_read_lock(); 2475 rcu_read_lock();
2458 2476
2459 ret = -ESRCH; 2477 ret = -ESRCH;
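
To illustrate the "force take" rule documented above, here is a hedged userspace-flavoured sketch of the same value transformation the kernel applies when the owner TID field of a PI futex is 0 (owner died, or a stale FUTEX_WAITERS bit was left behind): install our TID while preserving the OWNER_DIED and WAITERS bits. The helper name is made up; in practice the kernel performs this atomically under the hash-bucket lock.

#define _GNU_SOURCE
#include <stdint.h>
#include <stdbool.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/futex.h>

static bool try_take_orphaned_futex(uint32_t *uaddr)
{
	uint32_t tid = (uint32_t)syscall(SYS_gettid);
	uint32_t cur = __atomic_load_n(uaddr, __ATOMIC_RELAXED);

	/* Only eligible when no live owner TID is recorded. */
	if (cur & FUTEX_TID_MASK)
		return false;

	/* Keep OWNER_DIED/WAITERS and set the new TID -- the same
	 * newval = (curval & ~FUTEX_TID_MASK) | vpid step as above. */
	uint32_t newval = (cur & ~FUTEX_TID_MASK) | tid;

	return __atomic_compare_exchange_n(uaddr, &cur, newval, false,
					   __ATOMIC_ACQUIRE, __ATOMIC_RELAXED);
}
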
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 83e368b005fc..f9f44fd4d34d 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -11,6 +11,7 @@
11#include <linux/nsproxy.h> 11#include <linux/nsproxy.h>
12#include <linux/futex.h> 12#include <linux/futex.h>
13#include <linux/ptrace.h> 13#include <linux/ptrace.h>
14#include <linux/syscalls.h>
14 15
15#include <asm/uaccess.h> 16#include <asm/uaccess.h>
16 17
@@ -116,9 +117,9 @@ void compat_exit_robust_list(struct task_struct *curr)
116 } 117 }
117} 118}
118 119
119asmlinkage long 120COMPAT_SYSCALL_DEFINE2(set_robust_list,
120compat_sys_set_robust_list(struct compat_robust_list_head __user *head, 121 struct compat_robust_list_head __user *, head,
121 compat_size_t len) 122 compat_size_t, len)
122{ 123{
123 if (!futex_cmpxchg_enabled) 124 if (!futex_cmpxchg_enabled)
124 return -ENOSYS; 125 return -ENOSYS;
@@ -131,9 +132,9 @@ compat_sys_set_robust_list(struct compat_robust_list_head __user *head,
131 return 0; 132 return 0;
132} 133}
133 134
134asmlinkage long 135COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
135compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, 136 compat_uptr_t __user *, head_ptr,
136 compat_size_t __user *len_ptr) 137 compat_size_t __user *, len_ptr)
137{ 138{
138 struct compat_robust_list_head __user *head; 139 struct compat_robust_list_head __user *head;
139 unsigned long ret; 140 unsigned long ret;
@@ -142,8 +143,6 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
142 if (!futex_cmpxchg_enabled) 143 if (!futex_cmpxchg_enabled)
143 return -ENOSYS; 144 return -ENOSYS;
144 145
145 WARN_ONCE(1, "deprecated: get_robust_list will be deleted in 2013.\n");
146
147 rcu_read_lock(); 146 rcu_read_lock();
148 147
149 ret = -ESRCH; 148 ret = -ESRCH;
@@ -172,9 +171,9 @@ err_unlock:
172 return ret; 171 return ret;
173} 172}
174 173
175asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val, 174COMPAT_SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
176 struct compat_timespec __user *utime, u32 __user *uaddr2, 175 struct compat_timespec __user *, utime, u32 __user *, uaddr2,
177 u32 val3) 176 u32, val3)
178{ 177{
179 struct timespec ts; 178 struct timespec ts;
180 ktime_t t, *tp = NULL; 179 ktime_t t, *tp = NULL;
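
A short sketch of the conversion pattern applied above, using a made-up syscall name ("frobnicate") purely for illustration: COMPAT_SYSCALL_DEFINEn takes alternating parameter types and names and generates the compat entry point, replacing the hand-written "asmlinkage long compat_sys_*()" form.

#include <linux/compat.h>
#include <linux/syscalls.h>

/* Expands to compat_sys_frobnicate() with the proper argument handling,
 * instead of an open-coded asmlinkage prototype. */
COMPAT_SYSCALL_DEFINE2(frobnicate, compat_uptr_t __user *, ptr,
		       compat_size_t, len)
{
	if (!ptr)
		return -EINVAL;
	return 0;
}
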
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index a92028196cc1..d4da55d1fb65 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -35,7 +35,7 @@ config GCOV_KERNEL
35config GCOV_PROFILE_ALL 35config GCOV_PROFILE_ALL
36 bool "Profile entire Kernel" 36 bool "Profile entire Kernel"
37 depends on GCOV_KERNEL 37 depends on GCOV_KERNEL
38 depends on SUPERH || S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE 38 depends on SUPERH || S390 || X86 || PPC || MICROBLAZE
39 default n 39 default n
40 ---help--- 40 ---help---
41 This options activates profiling for the entire kernel. 41 This options activates profiling for the entire kernel.
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 6db7a5ed52b5..14be27feda49 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -44,6 +44,8 @@
44#include <linux/err.h> 44#include <linux/err.h>
45#include <linux/debugobjects.h> 45#include <linux/debugobjects.h>
46#include <linux/sched.h> 46#include <linux/sched.h>
47#include <linux/sched/sysctl.h>
48#include <linux/sched/rt.h>
47#include <linux/timer.h> 49#include <linux/timer.h>
48 50
49#include <asm/uaccess.h> 51#include <asm/uaccess.h>
@@ -61,6 +63,7 @@
61DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = 63DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
62{ 64{
63 65
66 .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
64 .clock_base = 67 .clock_base =
65 { 68 {
66 { 69 {
@@ -640,21 +643,9 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
640 * and expiry check is done in the hrtimer_interrupt or in the softirq. 643 * and expiry check is done in the hrtimer_interrupt or in the softirq.
641 */ 644 */
642static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, 645static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
643 struct hrtimer_clock_base *base, 646 struct hrtimer_clock_base *base)
644 int wakeup)
645{ 647{
646 if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) { 648 return base->cpu_base->hres_active && hrtimer_reprogram(timer, base);
647 if (wakeup) {
648 raw_spin_unlock(&base->cpu_base->lock);
649 raise_softirq_irqoff(HRTIMER_SOFTIRQ);
650 raw_spin_lock(&base->cpu_base->lock);
651 } else
652 __raise_softirq_irqoff(HRTIMER_SOFTIRQ);
653
654 return 1;
655 }
656
657 return 0;
658} 649}
659 650
660static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) 651static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
@@ -735,8 +726,7 @@ static inline int hrtimer_switch_to_hres(void) { return 0; }
735static inline void 726static inline void
736hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { } 727hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { }
737static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, 728static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
738 struct hrtimer_clock_base *base, 729 struct hrtimer_clock_base *base)
739 int wakeup)
740{ 730{
741 return 0; 731 return 0;
742} 732}
@@ -995,8 +985,21 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
995 * 985 *
996 * XXX send_remote_softirq() ? 986 * XXX send_remote_softirq() ?
997 */ 987 */
998 if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases)) 988 if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases)
999 hrtimer_enqueue_reprogram(timer, new_base, wakeup); 989 && hrtimer_enqueue_reprogram(timer, new_base)) {
990 if (wakeup) {
991 /*
992 * We need to drop cpu_base->lock to avoid a
993 * lock ordering issue vs. rq->lock.
994 */
995 raw_spin_unlock(&new_base->cpu_base->lock);
996 raise_softirq_irqoff(HRTIMER_SOFTIRQ);
997 local_irq_restore(flags);
998 return ret;
999 } else {
1000 __raise_softirq_irqoff(HRTIMER_SOFTIRQ);
1001 }
1002 }
1000 1003
1001 unlock_hrtimer_base(timer, &flags); 1004 unlock_hrtimer_base(timer, &flags);
1002 1005
@@ -1640,8 +1643,6 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
1640 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); 1643 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
1641 int i; 1644 int i;
1642 1645
1643 raw_spin_lock_init(&cpu_base->lock);
1644
1645 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { 1646 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1646 cpu_base->clock_base[i].cpu_base = cpu_base; 1647 cpu_base->clock_base[i].cpu_base = cpu_base;
1647 timerqueue_init_head(&cpu_base->clock_base[i].active); 1648 timerqueue_init_head(&cpu_base->clock_base[i].active);
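
The hrtimer change above replaces a runtime raw_spin_lock_init() in the CPU-up path with a static initializer, so the per-CPU lock is valid before any hotplug code runs. A hedged sketch of the same pattern with an illustrative structure name:

#include <linux/percpu.h>
#include <linux/spinlock.h>

struct my_cpu_base {
	raw_spinlock_t lock;
	unsigned long  nr_events;
};

static DEFINE_PER_CPU(struct my_cpu_base, my_cpu_base) = {
	/* Statically initialized, so no init_hrtimers_cpu()-style
	 * raw_spin_lock_init() call is needed at CPU bring-up. */
	.lock = __RAW_SPIN_LOCK_UNLOCKED(my_cpu_base.lock),
};
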
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 57d86d07221e..cbd97ce0b000 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -90,27 +90,41 @@ int irq_set_handler_data(unsigned int irq, void *data)
90EXPORT_SYMBOL(irq_set_handler_data); 90EXPORT_SYMBOL(irq_set_handler_data);
91 91
92/** 92/**
93 * irq_set_msi_desc - set MSI descriptor data for an irq 93 * irq_set_msi_desc_off - set MSI descriptor data for an irq at offset
94 * @irq: Interrupt number 94 * @irq_base: Interrupt number base
95 * @entry: Pointer to MSI descriptor data 95 * @irq_offset: Interrupt number offset
96 * @entry: Pointer to MSI descriptor data
96 * 97 *
97 * Set the MSI descriptor entry for an irq 98 * Set the MSI descriptor entry for an irq at offset
98 */ 99 */
99int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry) 100int irq_set_msi_desc_off(unsigned int irq_base, unsigned int irq_offset,
101 struct msi_desc *entry)
100{ 102{
101 unsigned long flags; 103 unsigned long flags;
102 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); 104 struct irq_desc *desc = irq_get_desc_lock(irq_base + irq_offset, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
103 105
104 if (!desc) 106 if (!desc)
105 return -EINVAL; 107 return -EINVAL;
106 desc->irq_data.msi_desc = entry; 108 desc->irq_data.msi_desc = entry;
107 if (entry) 109 if (entry && !irq_offset)
108 entry->irq = irq; 110 entry->irq = irq_base;
109 irq_put_desc_unlock(desc, flags); 111 irq_put_desc_unlock(desc, flags);
110 return 0; 112 return 0;
111} 113}
112 114
113/** 115/**
116 * irq_set_msi_desc - set MSI descriptor data for an irq
117 * @irq: Interrupt number
118 * @entry: Pointer to MSI descriptor data
119 *
120 * Set the MSI descriptor entry for an irq
121 */
122int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry)
123{
124 return irq_set_msi_desc_off(irq, 0, entry);
125}
126
127/**
114 * irq_set_chip_data - set irq chip data for an irq 128 * irq_set_chip_data - set irq chip data for an irq
115 * @irq: Interrupt number 129 * @irq: Interrupt number
116 * @data: Pointer to chip specific data 130 * @data: Pointer to chip specific data
@@ -272,6 +286,7 @@ void handle_nested_irq(unsigned int irq)
272 286
273 raw_spin_lock_irq(&desc->lock); 287 raw_spin_lock_irq(&desc->lock);
274 288
289 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
275 kstat_incr_irqs_this_cpu(irq, desc); 290 kstat_incr_irqs_this_cpu(irq, desc);
276 291
277 action = desc->action; 292 action = desc->action;
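
A hedged driver-side sketch of the new irq_set_msi_desc_off() helper above: every vector of a multi-MSI block gets the descriptor attached, but only offset 0 records desc->irq = irq_base. The names base_irq, nvec and desc stand in for whatever the driver actually allocated.

#include <linux/irq.h>
#include <linux/msi.h>

static int attach_msi_block(unsigned int base_irq, unsigned int nvec,
			    struct msi_desc *desc)
{
	unsigned int i;
	int ret;

	for (i = 0; i < nvec; i++) {
		/* Only irq_offset == 0 sets desc->irq (see above). */
		ret = irq_set_msi_desc_off(base_irq, i, desc);
		if (ret)
			return ret;
	}
	return 0;
}
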
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 4e69e24d3d7d..96f3a1d9c379 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -177,8 +177,8 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node,
177 irq_base = irq_alloc_descs(first_irq, first_irq, size, 177 irq_base = irq_alloc_descs(first_irq, first_irq, size,
178 of_node_to_nid(of_node)); 178 of_node_to_nid(of_node));
179 if (irq_base < 0) { 179 if (irq_base < 0) {
180 WARN(1, "Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n", 180 pr_info("Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n",
181 first_irq); 181 first_irq);
182 irq_base = first_irq; 182 irq_base = first_irq;
183 } 183 }
184 } else 184 } else
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 4c69326aa773..fa17855ca65a 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -16,6 +16,7 @@
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/sched/rt.h>
19#include <linux/task_work.h> 20#include <linux/task_work.h>
20 21
21#include "internals.h" 22#include "internals.h"
@@ -616,6 +617,22 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
616 return ret; 617 return ret;
617} 618}
618 619
620#ifdef CONFIG_HARDIRQS_SW_RESEND
621int irq_set_parent(int irq, int parent_irq)
622{
623 unsigned long flags;
624 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
625
626 if (!desc)
627 return -EINVAL;
628
629 desc->parent_irq = parent_irq;
630
631 irq_put_desc_unlock(desc, flags);
632 return 0;
633}
634#endif
635
619/* 636/*
620 * Default primary interrupt handler for threaded interrupts. Is 637 * Default primary interrupt handler for threaded interrupts. Is
621 * assigned as primary handler when request_threaded_irq is called 638 * assigned as primary handler when request_threaded_irq is called
@@ -716,6 +733,7 @@ static void
716irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) 733irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
717{ 734{
718 cpumask_var_t mask; 735 cpumask_var_t mask;
736 bool valid = true;
719 737
720 if (!test_and_clear_bit(IRQTF_AFFINITY, &action->thread_flags)) 738 if (!test_and_clear_bit(IRQTF_AFFINITY, &action->thread_flags))
721 return; 739 return;
@@ -730,10 +748,18 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
730 } 748 }
731 749
732 raw_spin_lock_irq(&desc->lock); 750 raw_spin_lock_irq(&desc->lock);
733 cpumask_copy(mask, desc->irq_data.affinity); 751 /*
752 * This code is triggered unconditionally. Check the affinity
753 * mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out.
754 */
755 if (desc->irq_data.affinity)
756 cpumask_copy(mask, desc->irq_data.affinity);
757 else
758 valid = false;
734 raw_spin_unlock_irq(&desc->lock); 759 raw_spin_unlock_irq(&desc->lock);
735 760
736 set_cpus_allowed_ptr(current, mask); 761 if (valid)
762 set_cpus_allowed_ptr(current, mask);
737 free_cpumask_var(mask); 763 free_cpumask_var(mask);
738} 764}
739#else 765#else
@@ -793,7 +819,7 @@ static void irq_thread_dtor(struct callback_head *unused)
793 action = kthread_data(tsk); 819 action = kthread_data(tsk);
794 820
795 pr_err("exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", 821 pr_err("exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n",
796 tsk->comm ? tsk->comm : "", tsk->pid, action->irq); 822 tsk->comm, tsk->pid, action->irq);
797 823
798 824
799 desc = irq_to_desc(action->irq); 825 desc = irq_to_desc(action->irq);
@@ -833,6 +859,8 @@ static int irq_thread(void *data)
833 init_task_work(&on_exit_work, irq_thread_dtor); 859 init_task_work(&on_exit_work, irq_thread_dtor);
834 task_work_add(current, &on_exit_work, false); 860 task_work_add(current, &on_exit_work, false);
835 861
862 irq_thread_check_affinity(desc, action);
863
836 while (!irq_wait_for_interrupt(action)) { 864 while (!irq_wait_for_interrupt(action)) {
837 irqreturn_t action_ret; 865 irqreturn_t action_ret;
838 866
@@ -936,6 +964,16 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
936 */ 964 */
937 get_task_struct(t); 965 get_task_struct(t);
938 new->thread = t; 966 new->thread = t;
967 /*
968 * Tell the thread to set its affinity. This is
969 * important for shared interrupt handlers as we do
970 * not invoke setup_affinity() for the secondary
971 * handlers as everything is already set up. Even for
972 * interrupts marked with IRQF_NO_BALANCE this is
973 * correct as we want the thread to move to the cpu(s)
974 * on which the requesting code placed the interrupt.
975 */
976 set_bit(IRQTF_AFFINITY, &new->thread_flags);
939 } 977 }
940 978
941 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) { 979 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) {
@@ -1487,6 +1525,7 @@ void enable_percpu_irq(unsigned int irq, unsigned int type)
1487out: 1525out:
1488 irq_put_desc_unlock(desc, flags); 1526 irq_put_desc_unlock(desc, flags);
1489} 1527}
1528EXPORT_SYMBOL_GPL(enable_percpu_irq);
1490 1529
1491void disable_percpu_irq(unsigned int irq) 1530void disable_percpu_irq(unsigned int irq)
1492{ 1531{
@@ -1500,6 +1539,7 @@ void disable_percpu_irq(unsigned int irq)
1500 irq_percpu_disable(desc, cpu); 1539 irq_percpu_disable(desc, cpu);
1501 irq_put_desc_unlock(desc, flags); 1540 irq_put_desc_unlock(desc, flags);
1502} 1541}
1542EXPORT_SYMBOL_GPL(disable_percpu_irq);
1503 1543
1504/* 1544/*
1505 * Internal function to unregister a percpu irqaction. 1545 * Internal function to unregister a percpu irqaction.
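
A hedged sketch of how a driver might wire up the new irq_set_parent() hook for a nested-threaded child interrupt, so that the software resend path (see kernel/irq/resend.c below) retriggers the parent line. The IRQ numbers and handler are placeholders, and this assumes CONFIG_HARDIRQS_SW_RESEND is enabled.

#include <linux/interrupt.h>
#include <linux/irq.h>

static irqreturn_t demo_child_thread_fn(int irq, void *dev_id)
{
	/* Handle the nested child interrupt in thread context. */
	return IRQ_HANDLED;
}

static int demo_wire_child_irq(int child_irq, int parent_irq, void *dev)
{
	int ret;

	irq_set_nested_thread(child_irq, true);

	ret = request_threaded_irq(child_irq, NULL, demo_child_thread_fn,
				   IRQF_ONESHOT, "demo-child", dev);
	if (ret)
		return ret;

	/* Ask the core to retrigger the parent on a SW resend. */
	return irq_set_parent(child_irq, parent_irq);
}
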
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 4bd4faa6323a..397db02209ed 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -76,7 +76,7 @@ static int irq_affinity_list_proc_show(struct seq_file *m, void *v)
76static ssize_t write_irq_affinity(int type, struct file *file, 76static ssize_t write_irq_affinity(int type, struct file *file,
77 const char __user *buffer, size_t count, loff_t *pos) 77 const char __user *buffer, size_t count, loff_t *pos)
78{ 78{
79 unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data; 79 unsigned int irq = (int)(long)PDE(file_inode(file))->data;
80 cpumask_var_t new_value; 80 cpumask_var_t new_value;
81 int err; 81 int err;
82 82
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 6454db7b6a4d..9065107f083e 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -74,6 +74,14 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq)
74 if (!desc->irq_data.chip->irq_retrigger || 74 if (!desc->irq_data.chip->irq_retrigger ||
75 !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) { 75 !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) {
76#ifdef CONFIG_HARDIRQS_SW_RESEND 76#ifdef CONFIG_HARDIRQS_SW_RESEND
77 /*
78 * If the interrupt has a parent irq and runs
79 * in the thread context of the parent irq,
80 * retrigger the parent.
81 */
82 if (desc->parent_irq &&
83 irq_settings_is_nested_thread(desc))
84 irq = desc->parent_irq;
77 /* Set it pending and activate the softirq: */ 85 /* Set it pending and activate the softirq: */
78 set_bit(irq, irqs_resend); 86 set_bit(irq, irqs_resend);
79 tasklet_schedule(&resend_tasklet); 87 tasklet_schedule(&resend_tasklet);
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 611cd6003c45..7b5f012bde9d 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -80,13 +80,11 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force)
80 80
81 /* 81 /*
82 * All handlers must agree on IRQF_SHARED, so we test just the 82 * All handlers must agree on IRQF_SHARED, so we test just the
83 * first. Check for action->next as well. 83 * first.
84 */ 84 */
85 action = desc->action; 85 action = desc->action;
86 if (!action || !(action->flags & IRQF_SHARED) || 86 if (!action || !(action->flags & IRQF_SHARED) ||
87 (action->flags & __IRQF_TIMER) || 87 (action->flags & __IRQF_TIMER))
88 (action->handler(irq, action->dev_id) == IRQ_HANDLED) ||
89 !action->next)
90 goto out; 88 goto out;
91 89
92 /* Already running on another processor */ 90 /* Already running on another processor */
@@ -104,6 +102,7 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force)
104 do { 102 do {
105 if (handle_irq_event(desc) == IRQ_HANDLED) 103 if (handle_irq_event(desc) == IRQ_HANDLED)
106 ret = IRQ_HANDLED; 104 ret = IRQ_HANDLED;
105 /* Make sure that there is still a valid action */
107 action = desc->action; 106 action = desc->action;
108 } while ((desc->istate & IRQS_PENDING) && action); 107 } while ((desc->istate & IRQS_PENDING) && action);
109 desc->istate &= ~IRQS_POLL_INPROGRESS; 108 desc->istate &= ~IRQS_POLL_INPROGRESS;
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 1588e3b2871b..55fcce6065cf 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -12,37 +12,36 @@
12#include <linux/percpu.h> 12#include <linux/percpu.h>
13#include <linux/hardirq.h> 13#include <linux/hardirq.h>
14#include <linux/irqflags.h> 14#include <linux/irqflags.h>
15#include <linux/sched.h>
16#include <linux/tick.h>
17#include <linux/cpu.h>
18#include <linux/notifier.h>
15#include <asm/processor.h> 19#include <asm/processor.h>
16 20
17/*
18 * An entry can be in one of four states:
19 *
20 * free NULL, 0 -> {claimed} : free to be used
21 * claimed NULL, 3 -> {pending} : claimed to be enqueued
22 * pending next, 3 -> {busy} : queued, pending callback
23 * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed
24 */
25
26#define IRQ_WORK_PENDING 1UL
27#define IRQ_WORK_BUSY 2UL
28#define IRQ_WORK_FLAGS 3UL
29 21
30static DEFINE_PER_CPU(struct llist_head, irq_work_list); 22static DEFINE_PER_CPU(struct llist_head, irq_work_list);
23static DEFINE_PER_CPU(int, irq_work_raised);
31 24
32/* 25/*
33 * Claim the entry so that no one else will poke at it. 26 * Claim the entry so that no one else will poke at it.
34 */ 27 */
35static bool irq_work_claim(struct irq_work *work) 28static bool irq_work_claim(struct irq_work *work)
36{ 29{
37 unsigned long flags, nflags; 30 unsigned long flags, oflags, nflags;
38 31
32 /*
33 * Start with our best wish as a premise but only trust any
34 * flag value after cmpxchg() result.
35 */
36 flags = work->flags & ~IRQ_WORK_PENDING;
39 for (;;) { 37 for (;;) {
40 flags = work->flags;
41 if (flags & IRQ_WORK_PENDING)
42 return false;
43 nflags = flags | IRQ_WORK_FLAGS; 38 nflags = flags | IRQ_WORK_FLAGS;
44 if (cmpxchg(&work->flags, flags, nflags) == flags) 39 oflags = cmpxchg(&work->flags, flags, nflags);
40 if (oflags == flags)
45 break; 41 break;
42 if (oflags & IRQ_WORK_PENDING)
43 return false;
44 flags = oflags;
46 cpu_relax(); 45 cpu_relax();
47 } 46 }
48 47
@@ -57,57 +56,69 @@ void __weak arch_irq_work_raise(void)
57} 56}
58 57
59/* 58/*
60 * Queue the entry and raise the IPI if needed. 59 * Enqueue the irq_work @entry unless it's already pending
60 * somewhere.
61 *
62 * Can be re-enqueued while the callback is still in progress.
61 */ 63 */
62static void __irq_work_queue(struct irq_work *work) 64void irq_work_queue(struct irq_work *work)
63{ 65{
64 bool empty; 66 /* Only queue if not already pending */
67 if (!irq_work_claim(work))
68 return;
65 69
70 /* Queue the entry and raise the IPI if needed. */
66 preempt_disable(); 71 preempt_disable();
67 72
68 empty = llist_add(&work->llnode, &__get_cpu_var(irq_work_list)); 73 llist_add(&work->llnode, &__get_cpu_var(irq_work_list));
69 /* The list was empty, raise self-interrupt to start processing. */ 74
70 if (empty) 75 /*
71 arch_irq_work_raise(); 76 * If the work is not "lazy" or the tick is stopped, raise the irq
77 * work interrupt (if supported by the arch), otherwise, just wait
78 * for the next tick.
79 */
80 if (!(work->flags & IRQ_WORK_LAZY) || tick_nohz_tick_stopped()) {
81 if (!this_cpu_cmpxchg(irq_work_raised, 0, 1))
82 arch_irq_work_raise();
83 }
72 84
73 preempt_enable(); 85 preempt_enable();
74} 86}
87EXPORT_SYMBOL_GPL(irq_work_queue);
75 88
76/* 89bool irq_work_needs_cpu(void)
77 * Enqueue the irq_work @entry, returns true on success, failure when the
78 * @entry was already enqueued by someone else.
79 *
80 * Can be re-enqueued while the callback is still in progress.
81 */
82bool irq_work_queue(struct irq_work *work)
83{ 90{
84 if (!irq_work_claim(work)) { 91 struct llist_head *this_list;
85 /* 92
86 * Already enqueued, can't do! 93 this_list = &__get_cpu_var(irq_work_list);
87 */ 94 if (llist_empty(this_list))
88 return false; 95 return false;
89 }
90 96
91 __irq_work_queue(work); 97 /* All work should have been flushed before going offline */
98 WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
99
92 return true; 100 return true;
93} 101}
94EXPORT_SYMBOL_GPL(irq_work_queue);
95 102
96/* 103static void __irq_work_run(void)
97 * Run the irq_work entries on this cpu. Requires to be ran from hardirq
98 * context with local IRQs disabled.
99 */
100void irq_work_run(void)
101{ 104{
105 unsigned long flags;
102 struct irq_work *work; 106 struct irq_work *work;
103 struct llist_head *this_list; 107 struct llist_head *this_list;
104 struct llist_node *llnode; 108 struct llist_node *llnode;
105 109
110
111 /*
112 * Reset the "raised" state right before we check the list because
113 * an NMI may enqueue after we find the list empty from the runner.
114 */
115 __this_cpu_write(irq_work_raised, 0);
116 barrier();
117
106 this_list = &__get_cpu_var(irq_work_list); 118 this_list = &__get_cpu_var(irq_work_list);
107 if (llist_empty(this_list)) 119 if (llist_empty(this_list))
108 return; 120 return;
109 121
110 BUG_ON(!in_irq());
111 BUG_ON(!irqs_disabled()); 122 BUG_ON(!irqs_disabled());
112 123
113 llnode = llist_del_all(this_list); 124 llnode = llist_del_all(this_list);
@@ -119,16 +130,31 @@ void irq_work_run(void)
119 /* 130 /*
120 * Clear the PENDING bit, after this point the @work 131 * Clear the PENDING bit, after this point the @work
121 * can be re-used. 132 * can be re-used.
133 * Make it immediately visible so that other CPUs trying
134 * to claim that work don't rely on us to handle their data
135 * while we are in the middle of the func.
122 */ 136 */
123 work->flags = IRQ_WORK_BUSY; 137 flags = work->flags & ~IRQ_WORK_PENDING;
138 xchg(&work->flags, flags);
139
124 work->func(work); 140 work->func(work);
125 /* 141 /*
126 * Clear the BUSY bit and return to the free state if 142 * Clear the BUSY bit and return to the free state if
127 * no-one else claimed it meanwhile. 143 * no-one else claimed it meanwhile.
128 */ 144 */
129 (void)cmpxchg(&work->flags, IRQ_WORK_BUSY, 0); 145 (void)cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY);
130 } 146 }
131} 147}
148
149/*
150 * Run the irq_work entries on this cpu. Requires to be ran from hardirq
151 * context with local IRQs disabled.
152 */
153void irq_work_run(void)
154{
155 BUG_ON(!in_irq());
156 __irq_work_run();
157}
132EXPORT_SYMBOL_GPL(irq_work_run); 158EXPORT_SYMBOL_GPL(irq_work_run);
133 159
134/* 160/*
@@ -143,3 +169,35 @@ void irq_work_sync(struct irq_work *work)
143 cpu_relax(); 169 cpu_relax();
144} 170}
145EXPORT_SYMBOL_GPL(irq_work_sync); 171EXPORT_SYMBOL_GPL(irq_work_sync);
172
173#ifdef CONFIG_HOTPLUG_CPU
174static int irq_work_cpu_notify(struct notifier_block *self,
175 unsigned long action, void *hcpu)
176{
177 long cpu = (long)hcpu;
178
179 switch (action) {
180 case CPU_DYING:
181 /* Called from stop_machine */
182 if (WARN_ON_ONCE(cpu != smp_processor_id()))
183 break;
184 __irq_work_run();
185 break;
186 default:
187 break;
188 }
189 return NOTIFY_OK;
190}
191
192static struct notifier_block cpu_notify;
193
194static __init int irq_work_init_cpu_notifier(void)
195{
196 cpu_notify.notifier_call = irq_work_cpu_notify;
197 cpu_notify.priority = 0;
198 register_cpu_notifier(&cpu_notify);
199 return 0;
200}
201device_initcall(irq_work_init_cpu_notifier);
202
203#endif /* CONFIG_HOTPLUG_CPU */
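
A brief usage sketch for the reworked irq_work API above: irq_work_queue() no longer reports whether the entry was newly enqueued, and an entry flagged IRQ_WORK_LAZY may wait for the next tick instead of raising the self-IPI. The callback and the statically initialized work item are illustrative only.

#include <linux/irq_work.h>
#include <linux/printk.h>

static void demo_irq_work_fn(struct irq_work *work)
{
	/* Runs from the irq_work interrupt, or from the tick if lazy. */
	pr_info("irq_work fired\n");
}

static struct irq_work demo_work = {
	.flags = IRQ_WORK_LAZY,
	.func  = demo_irq_work_fn,
};

/* Safe from NMI/irq context; a still-pending entry cannot be claimed
 * again, so repeated calls are coalesced. */
static void demo_poke(void)
{
	irq_work_queue(&demo_work);
}
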
diff --git a/kernel/kcmp.c b/kernel/kcmp.c
index 30b7b225306c..e30ac0fe61c3 100644
--- a/kernel/kcmp.c
+++ b/kernel/kcmp.c
@@ -4,6 +4,7 @@
4#include <linux/string.h> 4#include <linux/string.h>
5#include <linux/random.h> 5#include <linux/random.h>
6#include <linux/module.h> 6#include <linux/module.h>
7#include <linux/ptrace.h>
7#include <linux/init.h> 8#include <linux/init.h>
8#include <linux/errno.h> 9#include <linux/errno.h>
9#include <linux/cache.h> 10#include <linux/cache.h>
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 5e4bd7864c5d..bddd3d7a74b6 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -54,6 +54,12 @@ struct resource crashk_res = {
54 .end = 0, 54 .end = 0,
55 .flags = IORESOURCE_BUSY | IORESOURCE_MEM 55 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
56}; 56};
57struct resource crashk_low_res = {
58 .name = "Crash kernel low",
59 .start = 0,
60 .end = 0,
61 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
62};
57 63
58int kexec_should_crash(struct task_struct *p) 64int kexec_should_crash(struct task_struct *p)
59{ 65{
@@ -223,6 +229,8 @@ out:
223 229
224} 230}
225 231
232static void kimage_free_page_list(struct list_head *list);
233
226static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, 234static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
227 unsigned long nr_segments, 235 unsigned long nr_segments,
228 struct kexec_segment __user *segments) 236 struct kexec_segment __user *segments)
@@ -236,8 +244,6 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
236 if (result) 244 if (result)
237 goto out; 245 goto out;
238 246
239 *rimage = image;
240
241 /* 247 /*
242 * Find a location for the control code buffer, and add it 248 * Find a location for the control code buffer, and add it
243 * the vector of segments so that it's pages will also be 249 * the vector of segments so that it's pages will also be
@@ -248,22 +254,22 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
248 get_order(KEXEC_CONTROL_PAGE_SIZE)); 254 get_order(KEXEC_CONTROL_PAGE_SIZE));
249 if (!image->control_code_page) { 255 if (!image->control_code_page) {
250 printk(KERN_ERR "Could not allocate control_code_buffer\n"); 256 printk(KERN_ERR "Could not allocate control_code_buffer\n");
251 goto out; 257 goto out_free;
252 } 258 }
253 259
254 image->swap_page = kimage_alloc_control_pages(image, 0); 260 image->swap_page = kimage_alloc_control_pages(image, 0);
255 if (!image->swap_page) { 261 if (!image->swap_page) {
256 printk(KERN_ERR "Could not allocate swap buffer\n"); 262 printk(KERN_ERR "Could not allocate swap buffer\n");
257 goto out; 263 goto out_free;
258 } 264 }
259 265
260 result = 0; 266 *rimage = image;
261 out: 267 return 0;
262 if (result == 0)
263 *rimage = image;
264 else
265 kfree(image);
266 268
269out_free:
270 kimage_free_page_list(&image->control_pages);
271 kfree(image);
272out:
267 return result; 273 return result;
268} 274}
269 275
@@ -310,7 +316,7 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
310 mend = mstart + image->segment[i].memsz - 1; 316 mend = mstart + image->segment[i].memsz - 1;
311 /* Ensure we are within the crash kernel limits */ 317 /* Ensure we are within the crash kernel limits */
312 if ((mstart < crashk_res.start) || (mend > crashk_res.end)) 318 if ((mstart < crashk_res.start) || (mend > crashk_res.end))
313 goto out; 319 goto out_free;
314 } 320 }
315 321
316 /* 322 /*
@@ -323,16 +329,15 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
323 get_order(KEXEC_CONTROL_PAGE_SIZE)); 329 get_order(KEXEC_CONTROL_PAGE_SIZE));
324 if (!image->control_code_page) { 330 if (!image->control_code_page) {
325 printk(KERN_ERR "Could not allocate control_code_buffer\n"); 331 printk(KERN_ERR "Could not allocate control_code_buffer\n");
326 goto out; 332 goto out_free;
327 } 333 }
328 334
329 result = 0; 335 *rimage = image;
330out: 336 return 0;
331 if (result == 0)
332 *rimage = image;
333 else
334 kfree(image);
335 337
338out_free:
339 kfree(image);
340out:
336 return result; 341 return result;
337} 342}
338 343
@@ -497,8 +502,6 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
497 502
498 if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT) 503 if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
499 break; 504 break;
500 if (hole_end > crashk_res.end)
501 break;
502 /* See if I overlap any of the segments */ 505 /* See if I overlap any of the segments */
503 for (i = 0; i < image->nr_segments; i++) { 506 for (i = 0; i < image->nr_segments; i++) {
504 unsigned long mstart, mend; 507 unsigned long mstart, mend;
@@ -1369,10 +1372,11 @@ static int __init parse_crashkernel_simple(char *cmdline,
1369 * That function is the entry point for command line parsing and should be 1372 * That function is the entry point for command line parsing and should be
1370 * called from the arch-specific code. 1373 * called from the arch-specific code.
1371 */ 1374 */
1372int __init parse_crashkernel(char *cmdline, 1375static int __init __parse_crashkernel(char *cmdline,
1373 unsigned long long system_ram, 1376 unsigned long long system_ram,
1374 unsigned long long *crash_size, 1377 unsigned long long *crash_size,
1375 unsigned long long *crash_base) 1378 unsigned long long *crash_base,
1379 const char *name)
1376{ 1380{
1377 char *p = cmdline, *ck_cmdline = NULL; 1381 char *p = cmdline, *ck_cmdline = NULL;
1378 char *first_colon, *first_space; 1382 char *first_colon, *first_space;
@@ -1382,16 +1386,16 @@ int __init parse_crashkernel(char *cmdline,
1382 *crash_base = 0; 1386 *crash_base = 0;
1383 1387
1384 /* find crashkernel and use the last one if there are more */ 1388 /* find crashkernel and use the last one if there are more */
1385 p = strstr(p, "crashkernel="); 1389 p = strstr(p, name);
1386 while (p) { 1390 while (p) {
1387 ck_cmdline = p; 1391 ck_cmdline = p;
1388 p = strstr(p+1, "crashkernel="); 1392 p = strstr(p+1, name);
1389 } 1393 }
1390 1394
1391 if (!ck_cmdline) 1395 if (!ck_cmdline)
1392 return -EINVAL; 1396 return -EINVAL;
1393 1397
1394 ck_cmdline += 12; /* strlen("crashkernel=") */ 1398 ck_cmdline += strlen(name);
1395 1399
1396 /* 1400 /*
1397 * if the commandline contains a ':', then that's the extended 1401 * if the commandline contains a ':', then that's the extended
@@ -1409,6 +1413,23 @@ int __init parse_crashkernel(char *cmdline,
1409 return 0; 1413 return 0;
1410} 1414}
1411 1415
1416int __init parse_crashkernel(char *cmdline,
1417 unsigned long long system_ram,
1418 unsigned long long *crash_size,
1419 unsigned long long *crash_base)
1420{
1421 return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
1422 "crashkernel=");
1423}
1424
1425int __init parse_crashkernel_low(char *cmdline,
1426 unsigned long long system_ram,
1427 unsigned long long *crash_size,
1428 unsigned long long *crash_base)
1429{
1430 return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
1431 "crashkernel_low=");
1432}
1412 1433
1413static void update_vmcoreinfo_note(void) 1434static void update_vmcoreinfo_note(void)
1414{ 1435{
@@ -1490,6 +1511,8 @@ static int __init crash_save_vmcoreinfo_init(void)
1490 VMCOREINFO_OFFSET(page, _count); 1511 VMCOREINFO_OFFSET(page, _count);
1491 VMCOREINFO_OFFSET(page, mapping); 1512 VMCOREINFO_OFFSET(page, mapping);
1492 VMCOREINFO_OFFSET(page, lru); 1513 VMCOREINFO_OFFSET(page, lru);
1514 VMCOREINFO_OFFSET(page, _mapcount);
1515 VMCOREINFO_OFFSET(page, private);
1493 VMCOREINFO_OFFSET(pglist_data, node_zones); 1516 VMCOREINFO_OFFSET(pglist_data, node_zones);
1494 VMCOREINFO_OFFSET(pglist_data, nr_zones); 1517 VMCOREINFO_OFFSET(pglist_data, nr_zones);
1495#ifdef CONFIG_FLAT_NODE_MEM_MAP 1518#ifdef CONFIG_FLAT_NODE_MEM_MAP
@@ -1512,6 +1535,11 @@ static int __init crash_save_vmcoreinfo_init(void)
1512 VMCOREINFO_NUMBER(PG_lru); 1535 VMCOREINFO_NUMBER(PG_lru);
1513 VMCOREINFO_NUMBER(PG_private); 1536 VMCOREINFO_NUMBER(PG_private);
1514 VMCOREINFO_NUMBER(PG_swapcache); 1537 VMCOREINFO_NUMBER(PG_swapcache);
1538 VMCOREINFO_NUMBER(PG_slab);
1539#ifdef CONFIG_MEMORY_FAILURE
1540 VMCOREINFO_NUMBER(PG_hwpoison);
1541#endif
1542 VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
1515 1543
1516 arch_crash_save_vmcoreinfo(); 1544 arch_crash_save_vmcoreinfo();
1517 update_vmcoreinfo_note(); 1545 update_vmcoreinfo_note();
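
A self-contained sketch of the command-line scan factored out above into __parse_crashkernel(): locate the last occurrence of a "crashkernel="-style token and return a pointer just past it. This is a plain C illustration with a made-up helper name, not the kernel function itself.

#include <stddef.h>
#include <string.h>

static const char *find_last_token(const char *cmdline, const char *name)
{
	const char *p = cmdline, *last = NULL;

	/* Same idea as the strstr() loop in __parse_crashkernel():
	 * keep the last match if the token appears more than once. */
	p = strstr(p, name);
	while (p) {
		last = p;
		p = strstr(p + 1, name);
	}

	return last ? last + strlen(name) : NULL;
}

/* Example: with "root=/dev/sda1 crashkernel=256M crashkernel_low=72M",
 * find_last_token(cmdline, "crashkernel_low=") points at "72M". */
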
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
deleted file mode 100644
index 59dcf5b81d24..000000000000
--- a/kernel/kfifo.c
+++ /dev/null
@@ -1,609 +0,0 @@
1/*
2 * A generic kernel FIFO implementation
3 *
4 * Copyright (C) 2009/2010 Stefani Seibold <stefani@seibold.net>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 */
21
22#include <linux/kernel.h>
23#include <linux/export.h>
24#include <linux/slab.h>
25#include <linux/err.h>
26#include <linux/log2.h>
27#include <linux/uaccess.h>
28#include <linux/kfifo.h>
29
30/*
31 * internal helper to calculate the unused elements in a fifo
32 */
33static inline unsigned int kfifo_unused(struct __kfifo *fifo)
34{
35 return (fifo->mask + 1) - (fifo->in - fifo->out);
36}
37
38int __kfifo_alloc(struct __kfifo *fifo, unsigned int size,
39 size_t esize, gfp_t gfp_mask)
40{
41 /*
42 * round down to the next power of 2, since our 'let the indices
43 * wrap' technique works only in this case.
44 */
45 if (!is_power_of_2(size))
46 size = rounddown_pow_of_two(size);
47
48 fifo->in = 0;
49 fifo->out = 0;
50 fifo->esize = esize;
51
52 if (size < 2) {
53 fifo->data = NULL;
54 fifo->mask = 0;
55 return -EINVAL;
56 }
57
58 fifo->data = kmalloc(size * esize, gfp_mask);
59
60 if (!fifo->data) {
61 fifo->mask = 0;
62 return -ENOMEM;
63 }
64 fifo->mask = size - 1;
65
66 return 0;
67}
68EXPORT_SYMBOL(__kfifo_alloc);
69
70void __kfifo_free(struct __kfifo *fifo)
71{
72 kfree(fifo->data);
73 fifo->in = 0;
74 fifo->out = 0;
75 fifo->esize = 0;
76 fifo->data = NULL;
77 fifo->mask = 0;
78}
79EXPORT_SYMBOL(__kfifo_free);
80
81int __kfifo_init(struct __kfifo *fifo, void *buffer,
82 unsigned int size, size_t esize)
83{
84 size /= esize;
85
86 if (!is_power_of_2(size))
87 size = rounddown_pow_of_two(size);
88
89 fifo->in = 0;
90 fifo->out = 0;
91 fifo->esize = esize;
92 fifo->data = buffer;
93
94 if (size < 2) {
95 fifo->mask = 0;
96 return -EINVAL;
97 }
98 fifo->mask = size - 1;
99
100 return 0;
101}
102EXPORT_SYMBOL(__kfifo_init);
103
104static void kfifo_copy_in(struct __kfifo *fifo, const void *src,
105 unsigned int len, unsigned int off)
106{
107 unsigned int size = fifo->mask + 1;
108 unsigned int esize = fifo->esize;
109 unsigned int l;
110
111 off &= fifo->mask;
112 if (esize != 1) {
113 off *= esize;
114 size *= esize;
115 len *= esize;
116 }
117 l = min(len, size - off);
118
119 memcpy(fifo->data + off, src, l);
120 memcpy(fifo->data, src + l, len - l);
121 /*
122 * make sure that the data in the fifo is up to date before
123 * incrementing the fifo->in index counter
124 */
125 smp_wmb();
126}
127
128unsigned int __kfifo_in(struct __kfifo *fifo,
129 const void *buf, unsigned int len)
130{
131 unsigned int l;
132
133 l = kfifo_unused(fifo);
134 if (len > l)
135 len = l;
136
137 kfifo_copy_in(fifo, buf, len, fifo->in);
138 fifo->in += len;
139 return len;
140}
141EXPORT_SYMBOL(__kfifo_in);
142
143static void kfifo_copy_out(struct __kfifo *fifo, void *dst,
144 unsigned int len, unsigned int off)
145{
146 unsigned int size = fifo->mask + 1;
147 unsigned int esize = fifo->esize;
148 unsigned int l;
149
150 off &= fifo->mask;
151 if (esize != 1) {
152 off *= esize;
153 size *= esize;
154 len *= esize;
155 }
156 l = min(len, size - off);
157
158 memcpy(dst, fifo->data + off, l);
159 memcpy(dst + l, fifo->data, len - l);
160 /*
161 * make sure that the data is copied before
162 * incrementing the fifo->out index counter
163 */
164 smp_wmb();
165}
166
167unsigned int __kfifo_out_peek(struct __kfifo *fifo,
168 void *buf, unsigned int len)
169{
170 unsigned int l;
171
172 l = fifo->in - fifo->out;
173 if (len > l)
174 len = l;
175
176 kfifo_copy_out(fifo, buf, len, fifo->out);
177 return len;
178}
179EXPORT_SYMBOL(__kfifo_out_peek);
180
181unsigned int __kfifo_out(struct __kfifo *fifo,
182 void *buf, unsigned int len)
183{
184 len = __kfifo_out_peek(fifo, buf, len);
185 fifo->out += len;
186 return len;
187}
188EXPORT_SYMBOL(__kfifo_out);
189
190static unsigned long kfifo_copy_from_user(struct __kfifo *fifo,
191 const void __user *from, unsigned int len, unsigned int off,
192 unsigned int *copied)
193{
194 unsigned int size = fifo->mask + 1;
195 unsigned int esize = fifo->esize;
196 unsigned int l;
197 unsigned long ret;
198
199 off &= fifo->mask;
200 if (esize != 1) {
201 off *= esize;
202 size *= esize;
203 len *= esize;
204 }
205 l = min(len, size - off);
206
207 ret = copy_from_user(fifo->data + off, from, l);
208 if (unlikely(ret))
209 ret = DIV_ROUND_UP(ret + len - l, esize);
210 else {
211 ret = copy_from_user(fifo->data, from + l, len - l);
212 if (unlikely(ret))
213 ret = DIV_ROUND_UP(ret, esize);
214 }
215 /*
216 * make sure that the data in the fifo is up to date before
217 * incrementing the fifo->in index counter
218 */
219 smp_wmb();
220 *copied = len - ret;
221 /* return the number of elements which are not copied */
222 return ret;
223}
224
225int __kfifo_from_user(struct __kfifo *fifo, const void __user *from,
226 unsigned long len, unsigned int *copied)
227{
228 unsigned int l;
229 unsigned long ret;
230 unsigned int esize = fifo->esize;
231 int err;
232
233 if (esize != 1)
234 len /= esize;
235
236 l = kfifo_unused(fifo);
237 if (len > l)
238 len = l;
239
240 ret = kfifo_copy_from_user(fifo, from, len, fifo->in, copied);
241 if (unlikely(ret)) {
242 len -= ret;
243 err = -EFAULT;
244 } else
245 err = 0;
246 fifo->in += len;
247 return err;
248}
249EXPORT_SYMBOL(__kfifo_from_user);
250
251static unsigned long kfifo_copy_to_user(struct __kfifo *fifo, void __user *to,
252 unsigned int len, unsigned int off, unsigned int *copied)
253{
254 unsigned int l;
255 unsigned long ret;
256 unsigned int size = fifo->mask + 1;
257 unsigned int esize = fifo->esize;
258
259 off &= fifo->mask;
260 if (esize != 1) {
261 off *= esize;
262 size *= esize;
263 len *= esize;
264 }
265 l = min(len, size - off);
266
267 ret = copy_to_user(to, fifo->data + off, l);
268 if (unlikely(ret))
269 ret = DIV_ROUND_UP(ret + len - l, esize);
270 else {
271 ret = copy_to_user(to + l, fifo->data, len - l);
272 if (unlikely(ret))
273 ret = DIV_ROUND_UP(ret, esize);
274 }
275 /*
276 * make sure that the data is copied before
277 * incrementing the fifo->out index counter
278 */
279 smp_wmb();
280 *copied = len - ret;
281 /* return the number of elements which are not copied */
282 return ret;
283}
284
285int __kfifo_to_user(struct __kfifo *fifo, void __user *to,
286 unsigned long len, unsigned int *copied)
287{
288 unsigned int l;
289 unsigned long ret;
290 unsigned int esize = fifo->esize;
291 int err;
292
293 if (esize != 1)
294 len /= esize;
295
296 l = fifo->in - fifo->out;
297 if (len > l)
298 len = l;
299 ret = kfifo_copy_to_user(fifo, to, len, fifo->out, copied);
300 if (unlikely(ret)) {
301 len -= ret;
302 err = -EFAULT;
303 } else
304 err = 0;
305 fifo->out += len;
306 return err;
307}
308EXPORT_SYMBOL(__kfifo_to_user);
309
310static int setup_sgl_buf(struct scatterlist *sgl, void *buf,
311 int nents, unsigned int len)
312{
313 int n;
314 unsigned int l;
315 unsigned int off;
316 struct page *page;
317
318 if (!nents)
319 return 0;
320
321 if (!len)
322 return 0;
323
324 n = 0;
325 page = virt_to_page(buf);
326 off = offset_in_page(buf);
327 l = 0;
328
329 while (len >= l + PAGE_SIZE - off) {
330 struct page *npage;
331
332 l += PAGE_SIZE;
333 buf += PAGE_SIZE;
334 npage = virt_to_page(buf);
335 if (page_to_phys(page) != page_to_phys(npage) - l) {
336 sg_set_page(sgl, page, l - off, off);
337 sgl = sg_next(sgl);
338 if (++n == nents || sgl == NULL)
339 return n;
340 page = npage;
341 len -= l - off;
342 l = off = 0;
343 }
344 }
345 sg_set_page(sgl, page, len, off);
346 return n + 1;
347}
348
349static unsigned int setup_sgl(struct __kfifo *fifo, struct scatterlist *sgl,
350 int nents, unsigned int len, unsigned int off)
351{
352 unsigned int size = fifo->mask + 1;
353 unsigned int esize = fifo->esize;
354 unsigned int l;
355 unsigned int n;
356
357 off &= fifo->mask;
358 if (esize != 1) {
359 off *= esize;
360 size *= esize;
361 len *= esize;
362 }
363 l = min(len, size - off);
364
365 n = setup_sgl_buf(sgl, fifo->data + off, nents, l);
366 n += setup_sgl_buf(sgl + n, fifo->data, nents - n, len - l);
367
368 return n;
369}
370
371unsigned int __kfifo_dma_in_prepare(struct __kfifo *fifo,
372 struct scatterlist *sgl, int nents, unsigned int len)
373{
374 unsigned int l;
375
376 l = kfifo_unused(fifo);
377 if (len > l)
378 len = l;
379
380 return setup_sgl(fifo, sgl, nents, len, fifo->in);
381}
382EXPORT_SYMBOL(__kfifo_dma_in_prepare);
383
384unsigned int __kfifo_dma_out_prepare(struct __kfifo *fifo,
385 struct scatterlist *sgl, int nents, unsigned int len)
386{
387 unsigned int l;
388
389 l = fifo->in - fifo->out;
390 if (len > l)
391 len = l;
392
393 return setup_sgl(fifo, sgl, nents, len, fifo->out);
394}
395EXPORT_SYMBOL(__kfifo_dma_out_prepare);
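
The two DMA prepare helpers above only describe fifo space as a scatterlist; nothing is copied and neither index moves until the matching *_finish call. A minimal sketch of the in-direction sequence, assuming a device "dev", a made-up run_dma_and_wait() helper and trimmed error handling:

#include <linux/kfifo.h>
#include <linux/dma-mapping.h>

static DEFINE_KFIFO(dma_fifo, unsigned char, 4096);	/* hypothetical fifo */

static int fill_fifo_by_dma(struct device *dev, unsigned int len)
{
	struct scatterlist sg[8];
	unsigned int nents;

	/* build an sgl over the free space; no data is moved yet */
	nents = kfifo_dma_in_prepare(&dma_fifo, sg, ARRAY_SIZE(sg), len);
	if (!nents)
		return -ENOSPC;

	dma_map_sg(dev, sg, nents, DMA_FROM_DEVICE);
	run_dma_and_wait(dev, sg, nents);	/* hypothetical transfer helper */
	dma_unmap_sg(dev, sg, nents, DMA_FROM_DEVICE);

	/* only now does the new data become visible to readers */
	kfifo_dma_in_finish(&dma_fifo, len);
	return 0;
}

The out direction is symmetric: kfifo_dma_out_prepare() maps the used region and kfifo_dma_out_finish() advances fifo->out once the transfer is done.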
396
397unsigned int __kfifo_max_r(unsigned int len, size_t recsize)
398{
399 unsigned int max = (1 << (recsize << 3)) - 1;
400
401 if (len > max)
402 return max;
403 return len;
404}
405EXPORT_SYMBOL(__kfifo_max_r);
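
Since recsize is the byte width of the per-record length header, the cap computed above is (1 << (recsize << 3)) - 1: a record fifo with a one-byte header holds records of at most 255 bytes, and a two-byte header raises that to 65535.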
406
407#define __KFIFO_PEEK(data, out, mask) \
408 ((data)[(out) & (mask)])
409/*
 410 * __kfifo_peek_n internal helper function for determining the length of
411 * the next record in the fifo
412 */
413static unsigned int __kfifo_peek_n(struct __kfifo *fifo, size_t recsize)
414{
415 unsigned int l;
416 unsigned int mask = fifo->mask;
417 unsigned char *data = fifo->data;
418
419 l = __KFIFO_PEEK(data, fifo->out, mask);
420
421 if (--recsize)
422 l |= __KFIFO_PEEK(data, fifo->out + 1, mask) << 8;
423
424 return l;
425}
426
427#define __KFIFO_POKE(data, in, mask, val) \
428 ( \
429 (data)[(in) & (mask)] = (unsigned char)(val) \
430 )
431
432/*
 433 * __kfifo_poke_n internal helper function for storing the length of
434 * the record into the fifo
435 */
436static void __kfifo_poke_n(struct __kfifo *fifo, unsigned int n, size_t recsize)
437{
438 unsigned int mask = fifo->mask;
439 unsigned char *data = fifo->data;
440
441 __KFIFO_POKE(data, fifo->in, mask, n);
442
443 if (recsize > 1)
444 __KFIFO_POKE(data, fifo->in + 1, mask, n >> 8);
445}
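
For example, storing a 300-byte record in a fifo with recsize == 2 writes 300 & 0xff == 44 at index fifo->in and 300 >> 8 == 1 at fifo->in + 1; __kfifo_peek_n() later reassembles 44 | (1 << 8) == 300 before the payload is read.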
446
447unsigned int __kfifo_len_r(struct __kfifo *fifo, size_t recsize)
448{
449 return __kfifo_peek_n(fifo, recsize);
450}
451EXPORT_SYMBOL(__kfifo_len_r);
452
453unsigned int __kfifo_in_r(struct __kfifo *fifo, const void *buf,
454 unsigned int len, size_t recsize)
455{
456 if (len + recsize > kfifo_unused(fifo))
457 return 0;
458
459 __kfifo_poke_n(fifo, len, recsize);
460
461 kfifo_copy_in(fifo, buf, len, fifo->in + recsize);
462 fifo->in += len + recsize;
463 return len;
464}
465EXPORT_SYMBOL(__kfifo_in_r);
466
467static unsigned int kfifo_out_copy_r(struct __kfifo *fifo,
468 void *buf, unsigned int len, size_t recsize, unsigned int *n)
469{
470 *n = __kfifo_peek_n(fifo, recsize);
471
472 if (len > *n)
473 len = *n;
474
475 kfifo_copy_out(fifo, buf, len, fifo->out + recsize);
476 return len;
477}
478
479unsigned int __kfifo_out_peek_r(struct __kfifo *fifo, void *buf,
480 unsigned int len, size_t recsize)
481{
482 unsigned int n;
483
484 if (fifo->in == fifo->out)
485 return 0;
486
487 return kfifo_out_copy_r(fifo, buf, len, recsize, &n);
488}
489EXPORT_SYMBOL(__kfifo_out_peek_r);
490
491unsigned int __kfifo_out_r(struct __kfifo *fifo, void *buf,
492 unsigned int len, size_t recsize)
493{
494 unsigned int n;
495
496 if (fifo->in == fifo->out)
497 return 0;
498
499 len = kfifo_out_copy_r(fifo, buf, len, recsize, &n);
500 fifo->out += n + recsize;
501 return len;
502}
503EXPORT_SYMBOL(__kfifo_out_r);
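
These record variants are reached through the same kfifo_in()/kfifo_out() macros once the fifo is declared with a one- or two-byte record header. A minimal sketch in the style of samples/kfifo/record-example.c; the names and sizes are illustrative:

#include <linux/kfifo.h>
#include <linux/printk.h>

typedef STRUCT_KFIFO_REC_1(256) rec_fifo_t;	/* 1-byte length header per record */
static rec_fifo_t rec_fifo;

static void record_demo(void)
{
	char buf[64];
	unsigned int len;

	INIT_KFIFO(rec_fifo);

	/* each call stores one variable-length record (at most 255 bytes here) */
	kfifo_in(&rec_fifo, "hello", 5);
	kfifo_in(&rec_fifo, "record fifo", 11);

	while (!kfifo_is_empty(&rec_fifo)) {
		/* kfifo_out() returns the length of the record it dequeued */
		len = kfifo_out(&rec_fifo, buf, sizeof(buf));
		pr_info("got %u bytes: %.*s\n", len, (int)len, buf);
	}
}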
504
505void __kfifo_skip_r(struct __kfifo *fifo, size_t recsize)
506{
507 unsigned int n;
508
509 n = __kfifo_peek_n(fifo, recsize);
510 fifo->out += n + recsize;
511}
512EXPORT_SYMBOL(__kfifo_skip_r);
513
514int __kfifo_from_user_r(struct __kfifo *fifo, const void __user *from,
515 unsigned long len, unsigned int *copied, size_t recsize)
516{
517 unsigned long ret;
518
519 len = __kfifo_max_r(len, recsize);
520
521 if (len + recsize > kfifo_unused(fifo)) {
522 *copied = 0;
523 return 0;
524 }
525
526 __kfifo_poke_n(fifo, len, recsize);
527
528 ret = kfifo_copy_from_user(fifo, from, len, fifo->in + recsize, copied);
529 if (unlikely(ret)) {
530 *copied = 0;
531 return -EFAULT;
532 }
533 fifo->in += len + recsize;
534 return 0;
535}
536EXPORT_SYMBOL(__kfifo_from_user_r);
537
538int __kfifo_to_user_r(struct __kfifo *fifo, void __user *to,
539 unsigned long len, unsigned int *copied, size_t recsize)
540{
541 unsigned long ret;
542 unsigned int n;
543
544 if (fifo->in == fifo->out) {
545 *copied = 0;
546 return 0;
547 }
548
549 n = __kfifo_peek_n(fifo, recsize);
550 if (len > n)
551 len = n;
552
553 ret = kfifo_copy_to_user(fifo, to, len, fifo->out + recsize, copied);
554 if (unlikely(ret)) {
555 *copied = 0;
556 return -EFAULT;
557 }
558 fifo->out += n + recsize;
559 return 0;
560}
561EXPORT_SYMBOL(__kfifo_to_user_r);
562
563unsigned int __kfifo_dma_in_prepare_r(struct __kfifo *fifo,
564 struct scatterlist *sgl, int nents, unsigned int len, size_t recsize)
565{
566 if (!nents)
567 BUG();
568
569 len = __kfifo_max_r(len, recsize);
570
571 if (len + recsize > kfifo_unused(fifo))
572 return 0;
573
574 return setup_sgl(fifo, sgl, nents, len, fifo->in + recsize);
575}
576EXPORT_SYMBOL(__kfifo_dma_in_prepare_r);
577
578void __kfifo_dma_in_finish_r(struct __kfifo *fifo,
579 unsigned int len, size_t recsize)
580{
581 len = __kfifo_max_r(len, recsize);
582 __kfifo_poke_n(fifo, len, recsize);
583 fifo->in += len + recsize;
584}
585EXPORT_SYMBOL(__kfifo_dma_in_finish_r);
586
587unsigned int __kfifo_dma_out_prepare_r(struct __kfifo *fifo,
588 struct scatterlist *sgl, int nents, unsigned int len, size_t recsize)
589{
590 if (!nents)
591 BUG();
592
593 len = __kfifo_max_r(len, recsize);
594
595 if (len + recsize > fifo->in - fifo->out)
596 return 0;
597
598 return setup_sgl(fifo, sgl, nents, len, fifo->out + recsize);
599}
600EXPORT_SYMBOL(__kfifo_dma_out_prepare_r);
601
602void __kfifo_dma_out_finish_r(struct __kfifo *fifo, size_t recsize)
603{
604 unsigned int len;
605
606 len = __kfifo_peek_n(fifo, recsize);
607 fifo->out += len + recsize;
608}
609EXPORT_SYMBOL(__kfifo_dma_out_finish_r);
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 1c317e386831..56dd34976d7b 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -38,6 +38,7 @@
38#include <linux/suspend.h> 38#include <linux/suspend.h>
39#include <linux/rwsem.h> 39#include <linux/rwsem.h>
40#include <linux/ptrace.h> 40#include <linux/ptrace.h>
41#include <linux/async.h>
41#include <asm/uaccess.h> 42#include <asm/uaccess.h>
42 43
43#include <trace/events/module.h> 44#include <trace/events/module.h>
@@ -130,6 +131,14 @@ int __request_module(bool wait, const char *fmt, ...)
130#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ 131#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */
131 static int kmod_loop_msg; 132 static int kmod_loop_msg;
132 133
134 /*
135 * We don't allow synchronous module loading from async. Module
136 * init may invoke async_synchronize_full() which will end up
137 * waiting for this task which already is waiting for the module
138 * loading to complete, leading to a deadlock.
139 */
140 WARN_ON_ONCE(wait && current_is_async());
141
133 va_start(args, fmt); 142 va_start(args, fmt);
134 ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); 143 ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
135 va_end(args); 144 va_end(args);
@@ -219,9 +228,9 @@ static int ____call_usermodehelper(void *data)
219 228
220 commit_creds(new); 229 commit_creds(new);
221 230
222 retval = kernel_execve(sub_info->path, 231 retval = do_execve(sub_info->path,
223 (const char *const *)sub_info->argv, 232 (const char __user *const __user *)sub_info->argv,
224 (const char *const *)sub_info->envp); 233 (const char __user *const __user *)sub_info->envp);
225 if (!retval) 234 if (!retval)
226 return 0; 235 return 0;
227 236
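
The WARN_ON_ONCE(wait && current_is_async()) added above flags a genuine deadlock pattern: code running from the async machinery must not load a module synchronously, because the module's init can call async_synchronize_full() and end up waiting on its own caller. A hedged sketch of the safe alternative; the callback and module name are made up:

#include <linux/async.h>
#include <linux/kmod.h>

/* Illustrative only: "foo-helper" is a made-up module name. */
static void foo_late_setup(void *data, async_cookie_t cookie)
{
	/*
	 * Safe from async context: queues the usermode helper and returns.
	 * A plain request_module("foo-helper") here would imply wait=true
	 * and trip the new WARN_ON_ONCE() above.
	 */
	request_module_nowait("foo-helper");
}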
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 098f396aa409..e35be53f6613 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -334,11 +334,10 @@ static inline void reset_kprobe_instance(void)
334struct kprobe __kprobes *get_kprobe(void *addr) 334struct kprobe __kprobes *get_kprobe(void *addr)
335{ 335{
336 struct hlist_head *head; 336 struct hlist_head *head;
337 struct hlist_node *node;
338 struct kprobe *p; 337 struct kprobe *p;
339 338
340 head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)]; 339 head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)];
341 hlist_for_each_entry_rcu(p, node, head, hlist) { 340 hlist_for_each_entry_rcu(p, head, hlist) {
342 if (p->addr == addr) 341 if (p->addr == addr)
343 return p; 342 return p;
344 } 343 }
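
Most of the churn in this file is mechanical: the hlist iterators no longer take a separate struct hlist_node cursor, so each caller drops one argument, as the hunks throughout this file show. A small sketch of the new form; struct item and use_item() are stand-ins, not kernel types:

#include <linux/list.h>
#include <linux/rculist.h>

struct item {
	struct hlist_node hlist;
};

static void use_item(struct item *p);	/* illustrative consumer */

static void walk(struct hlist_head *head)
{
	struct item *p;

	/*
	 * The old form needed "struct hlist_node *node" and read
	 * hlist_for_each_entry_rcu(p, node, head, hlist).
	 */
	rcu_read_lock();
	hlist_for_each_entry_rcu(p, head, hlist)
		use_item(p);
	rcu_read_unlock();
}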
@@ -471,7 +470,6 @@ static LIST_HEAD(unoptimizing_list);
471 470
472static void kprobe_optimizer(struct work_struct *work); 471static void kprobe_optimizer(struct work_struct *work);
473static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); 472static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
474static DECLARE_COMPLETION(optimizer_comp);
475#define OPTIMIZE_DELAY 5 473#define OPTIMIZE_DELAY 5
476 474
477/* 475/*
@@ -552,8 +550,7 @@ static __kprobes void do_free_cleaned_kprobes(struct list_head *free_list)
552/* Start optimizer after OPTIMIZE_DELAY passed */ 550/* Start optimizer after OPTIMIZE_DELAY passed */
553static __kprobes void kick_kprobe_optimizer(void) 551static __kprobes void kick_kprobe_optimizer(void)
554{ 552{
555 if (!delayed_work_pending(&optimizing_work)) 553 schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
556 schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
557} 554}
558 555
559/* Kprobe jump optimizer */ 556/* Kprobe jump optimizer */
@@ -592,16 +589,25 @@ static __kprobes void kprobe_optimizer(struct work_struct *work)
592 /* Step 5: Kick optimizer again if needed */ 589 /* Step 5: Kick optimizer again if needed */
593 if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) 590 if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list))
594 kick_kprobe_optimizer(); 591 kick_kprobe_optimizer();
595 else
596 /* Wake up all waiters */
597 complete_all(&optimizer_comp);
598} 592}
599 593
600/* Wait for completing optimization and unoptimization */ 594/* Wait for completing optimization and unoptimization */
601static __kprobes void wait_for_kprobe_optimizer(void) 595static __kprobes void wait_for_kprobe_optimizer(void)
602{ 596{
603 if (delayed_work_pending(&optimizing_work)) 597 mutex_lock(&kprobe_mutex);
604 wait_for_completion(&optimizer_comp); 598
599 while (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) {
600 mutex_unlock(&kprobe_mutex);
601
 602 /* this will also make optimizing_work execute immediately */
603 flush_delayed_work(&optimizing_work);
604 /* @optimizing_work might not have been queued yet, relax */
605 cpu_relax();
606
607 mutex_lock(&kprobe_mutex);
608 }
609
610 mutex_unlock(&kprobe_mutex);
605} 611}
606 612
607/* Optimize kprobe if p is ready to be optimized */ 613/* Optimize kprobe if p is ready to be optimized */
@@ -792,7 +798,6 @@ out:
792static void __kprobes optimize_all_kprobes(void) 798static void __kprobes optimize_all_kprobes(void)
793{ 799{
794 struct hlist_head *head; 800 struct hlist_head *head;
795 struct hlist_node *node;
796 struct kprobe *p; 801 struct kprobe *p;
797 unsigned int i; 802 unsigned int i;
798 803
@@ -803,7 +808,7 @@ static void __kprobes optimize_all_kprobes(void)
803 kprobes_allow_optimization = true; 808 kprobes_allow_optimization = true;
804 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 809 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
805 head = &kprobe_table[i]; 810 head = &kprobe_table[i];
806 hlist_for_each_entry_rcu(p, node, head, hlist) 811 hlist_for_each_entry_rcu(p, head, hlist)
807 if (!kprobe_disabled(p)) 812 if (!kprobe_disabled(p))
808 optimize_kprobe(p); 813 optimize_kprobe(p);
809 } 814 }
@@ -814,7 +819,6 @@ static void __kprobes optimize_all_kprobes(void)
814static void __kprobes unoptimize_all_kprobes(void) 819static void __kprobes unoptimize_all_kprobes(void)
815{ 820{
816 struct hlist_head *head; 821 struct hlist_head *head;
817 struct hlist_node *node;
818 struct kprobe *p; 822 struct kprobe *p;
819 unsigned int i; 823 unsigned int i;
820 824
@@ -825,7 +829,7 @@ static void __kprobes unoptimize_all_kprobes(void)
825 kprobes_allow_optimization = false; 829 kprobes_allow_optimization = false;
826 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 830 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
827 head = &kprobe_table[i]; 831 head = &kprobe_table[i];
828 hlist_for_each_entry_rcu(p, node, head, hlist) { 832 hlist_for_each_entry_rcu(p, head, hlist) {
829 if (!kprobe_disabled(p)) 833 if (!kprobe_disabled(p))
830 unoptimize_kprobe(p, false); 834 unoptimize_kprobe(p, false);
831 } 835 }
@@ -919,7 +923,7 @@ static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
919} 923}
920#endif /* CONFIG_OPTPROBES */ 924#endif /* CONFIG_OPTPROBES */
921 925
922#ifdef KPROBES_CAN_USE_FTRACE 926#ifdef CONFIG_KPROBES_ON_FTRACE
923static struct ftrace_ops kprobe_ftrace_ops __read_mostly = { 927static struct ftrace_ops kprobe_ftrace_ops __read_mostly = {
924 .func = kprobe_ftrace_handler, 928 .func = kprobe_ftrace_handler,
925 .flags = FTRACE_OPS_FL_SAVE_REGS, 929 .flags = FTRACE_OPS_FL_SAVE_REGS,
@@ -964,7 +968,7 @@ static void __kprobes disarm_kprobe_ftrace(struct kprobe *p)
964 (unsigned long)p->addr, 1, 0); 968 (unsigned long)p->addr, 1, 0);
965 WARN(ret < 0, "Failed to disarm kprobe-ftrace at %p (%d)\n", p->addr, ret); 969 WARN(ret < 0, "Failed to disarm kprobe-ftrace at %p (%d)\n", p->addr, ret);
966} 970}
967#else /* !KPROBES_CAN_USE_FTRACE */ 971#else /* !CONFIG_KPROBES_ON_FTRACE */
968#define prepare_kprobe(p) arch_prepare_kprobe(p) 972#define prepare_kprobe(p) arch_prepare_kprobe(p)
969#define arm_kprobe_ftrace(p) do {} while (0) 973#define arm_kprobe_ftrace(p) do {} while (0)
970#define disarm_kprobe_ftrace(p) do {} while (0) 974#define disarm_kprobe_ftrace(p) do {} while (0)
@@ -1141,7 +1145,7 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
1141{ 1145{
1142 struct kretprobe_instance *ri; 1146 struct kretprobe_instance *ri;
1143 struct hlist_head *head, empty_rp; 1147 struct hlist_head *head, empty_rp;
1144 struct hlist_node *node, *tmp; 1148 struct hlist_node *tmp;
1145 unsigned long hash, flags = 0; 1149 unsigned long hash, flags = 0;
1146 1150
1147 if (unlikely(!kprobes_initialized)) 1151 if (unlikely(!kprobes_initialized))
@@ -1152,12 +1156,12 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
1152 hash = hash_ptr(tk, KPROBE_HASH_BITS); 1156 hash = hash_ptr(tk, KPROBE_HASH_BITS);
1153 head = &kretprobe_inst_table[hash]; 1157 head = &kretprobe_inst_table[hash];
1154 kretprobe_table_lock(hash, &flags); 1158 kretprobe_table_lock(hash, &flags);
1155 hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { 1159 hlist_for_each_entry_safe(ri, tmp, head, hlist) {
1156 if (ri->task == tk) 1160 if (ri->task == tk)
1157 recycle_rp_inst(ri, &empty_rp); 1161 recycle_rp_inst(ri, &empty_rp);
1158 } 1162 }
1159 kretprobe_table_unlock(hash, &flags); 1163 kretprobe_table_unlock(hash, &flags);
1160 hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { 1164 hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) {
1161 hlist_del(&ri->hlist); 1165 hlist_del(&ri->hlist);
1162 kfree(ri); 1166 kfree(ri);
1163 } 1167 }
@@ -1166,9 +1170,9 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
1166static inline void free_rp_inst(struct kretprobe *rp) 1170static inline void free_rp_inst(struct kretprobe *rp)
1167{ 1171{
1168 struct kretprobe_instance *ri; 1172 struct kretprobe_instance *ri;
1169 struct hlist_node *pos, *next; 1173 struct hlist_node *next;
1170 1174
1171 hlist_for_each_entry_safe(ri, pos, next, &rp->free_instances, hlist) { 1175 hlist_for_each_entry_safe(ri, next, &rp->free_instances, hlist) {
1172 hlist_del(&ri->hlist); 1176 hlist_del(&ri->hlist);
1173 kfree(ri); 1177 kfree(ri);
1174 } 1178 }
@@ -1178,14 +1182,14 @@ static void __kprobes cleanup_rp_inst(struct kretprobe *rp)
1178{ 1182{
1179 unsigned long flags, hash; 1183 unsigned long flags, hash;
1180 struct kretprobe_instance *ri; 1184 struct kretprobe_instance *ri;
1181 struct hlist_node *pos, *next; 1185 struct hlist_node *next;
1182 struct hlist_head *head; 1186 struct hlist_head *head;
1183 1187
1184 /* No race here */ 1188 /* No race here */
1185 for (hash = 0; hash < KPROBE_TABLE_SIZE; hash++) { 1189 for (hash = 0; hash < KPROBE_TABLE_SIZE; hash++) {
1186 kretprobe_table_lock(hash, &flags); 1190 kretprobe_table_lock(hash, &flags);
1187 head = &kretprobe_inst_table[hash]; 1191 head = &kretprobe_inst_table[hash];
1188 hlist_for_each_entry_safe(ri, pos, next, head, hlist) { 1192 hlist_for_each_entry_safe(ri, next, head, hlist) {
1189 if (ri->rp == rp) 1193 if (ri->rp == rp)
1190 ri->rp = NULL; 1194 ri->rp = NULL;
1191 } 1195 }
@@ -1414,12 +1418,12 @@ static __kprobes int check_kprobe_address_safe(struct kprobe *p,
1414 */ 1418 */
1415 ftrace_addr = ftrace_location((unsigned long)p->addr); 1419 ftrace_addr = ftrace_location((unsigned long)p->addr);
1416 if (ftrace_addr) { 1420 if (ftrace_addr) {
1417#ifdef KPROBES_CAN_USE_FTRACE 1421#ifdef CONFIG_KPROBES_ON_FTRACE
1418 /* Given address is not on the instruction boundary */ 1422 /* Given address is not on the instruction boundary */
1419 if ((unsigned long)p->addr != ftrace_addr) 1423 if ((unsigned long)p->addr != ftrace_addr)
1420 return -EILSEQ; 1424 return -EILSEQ;
1421 p->flags |= KPROBE_FLAG_FTRACE; 1425 p->flags |= KPROBE_FLAG_FTRACE;
1422#else /* !KPROBES_CAN_USE_FTRACE */ 1426#else /* !CONFIG_KPROBES_ON_FTRACE */
1423 return -EINVAL; 1427 return -EINVAL;
1424#endif 1428#endif
1425 } 1429 }
@@ -2021,7 +2025,6 @@ static int __kprobes kprobes_module_callback(struct notifier_block *nb,
2021{ 2025{
2022 struct module *mod = data; 2026 struct module *mod = data;
2023 struct hlist_head *head; 2027 struct hlist_head *head;
2024 struct hlist_node *node;
2025 struct kprobe *p; 2028 struct kprobe *p;
2026 unsigned int i; 2029 unsigned int i;
2027 int checkcore = (val == MODULE_STATE_GOING); 2030 int checkcore = (val == MODULE_STATE_GOING);
@@ -2038,7 +2041,7 @@ static int __kprobes kprobes_module_callback(struct notifier_block *nb,
2038 mutex_lock(&kprobe_mutex); 2041 mutex_lock(&kprobe_mutex);
2039 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 2042 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
2040 head = &kprobe_table[i]; 2043 head = &kprobe_table[i];
2041 hlist_for_each_entry_rcu(p, node, head, hlist) 2044 hlist_for_each_entry_rcu(p, head, hlist)
2042 if (within_module_init((unsigned long)p->addr, mod) || 2045 if (within_module_init((unsigned long)p->addr, mod) ||
2043 (checkcore && 2046 (checkcore &&
2044 within_module_core((unsigned long)p->addr, mod))) { 2047 within_module_core((unsigned long)p->addr, mod))) {
@@ -2185,7 +2188,6 @@ static void __kprobes kprobe_seq_stop(struct seq_file *f, void *v)
2185static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v) 2188static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v)
2186{ 2189{
2187 struct hlist_head *head; 2190 struct hlist_head *head;
2188 struct hlist_node *node;
2189 struct kprobe *p, *kp; 2191 struct kprobe *p, *kp;
2190 const char *sym = NULL; 2192 const char *sym = NULL;
2191 unsigned int i = *(loff_t *) v; 2193 unsigned int i = *(loff_t *) v;
@@ -2194,7 +2196,7 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v)
2194 2196
2195 head = &kprobe_table[i]; 2197 head = &kprobe_table[i];
2196 preempt_disable(); 2198 preempt_disable();
2197 hlist_for_each_entry_rcu(p, node, head, hlist) { 2199 hlist_for_each_entry_rcu(p, head, hlist) {
2198 sym = kallsyms_lookup((unsigned long)p->addr, NULL, 2200 sym = kallsyms_lookup((unsigned long)p->addr, NULL,
2199 &offset, &modname, namebuf); 2201 &offset, &modname, namebuf);
2200 if (kprobe_aggrprobe(p)) { 2202 if (kprobe_aggrprobe(p)) {
@@ -2229,7 +2231,6 @@ static const struct file_operations debugfs_kprobes_operations = {
2229static void __kprobes arm_all_kprobes(void) 2231static void __kprobes arm_all_kprobes(void)
2230{ 2232{
2231 struct hlist_head *head; 2233 struct hlist_head *head;
2232 struct hlist_node *node;
2233 struct kprobe *p; 2234 struct kprobe *p;
2234 unsigned int i; 2235 unsigned int i;
2235 2236
@@ -2242,7 +2243,7 @@ static void __kprobes arm_all_kprobes(void)
2242 /* Arming kprobes doesn't optimize kprobe itself */ 2243 /* Arming kprobes doesn't optimize kprobe itself */
2243 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 2244 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
2244 head = &kprobe_table[i]; 2245 head = &kprobe_table[i];
2245 hlist_for_each_entry_rcu(p, node, head, hlist) 2246 hlist_for_each_entry_rcu(p, head, hlist)
2246 if (!kprobe_disabled(p)) 2247 if (!kprobe_disabled(p))
2247 arm_kprobe(p); 2248 arm_kprobe(p);
2248 } 2249 }
@@ -2258,7 +2259,6 @@ already_enabled:
2258static void __kprobes disarm_all_kprobes(void) 2259static void __kprobes disarm_all_kprobes(void)
2259{ 2260{
2260 struct hlist_head *head; 2261 struct hlist_head *head;
2261 struct hlist_node *node;
2262 struct kprobe *p; 2262 struct kprobe *p;
2263 unsigned int i; 2263 unsigned int i;
2264 2264
@@ -2275,7 +2275,7 @@ static void __kprobes disarm_all_kprobes(void)
2275 2275
2276 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 2276 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
2277 head = &kprobe_table[i]; 2277 head = &kprobe_table[i];
2278 hlist_for_each_entry_rcu(p, node, head, hlist) { 2278 hlist_for_each_entry_rcu(p, head, hlist) {
2279 if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) 2279 if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p))
2280 disarm_kprobe(p, false); 2280 disarm_kprobe(p, false);
2281 } 2281 }
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 4e316e1acf58..6ada93c23a9a 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -26,7 +26,6 @@ static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
26static struct kobj_attribute _name##_attr = \ 26static struct kobj_attribute _name##_attr = \
27 __ATTR(_name, 0644, _name##_show, _name##_store) 27 __ATTR(_name, 0644, _name##_show, _name##_store)
28 28
29#if defined(CONFIG_HOTPLUG)
30/* current uevent sequence number */ 29/* current uevent sequence number */
31static ssize_t uevent_seqnum_show(struct kobject *kobj, 30static ssize_t uevent_seqnum_show(struct kobject *kobj,
32 struct kobj_attribute *attr, char *buf) 31 struct kobj_attribute *attr, char *buf)
@@ -54,7 +53,7 @@ static ssize_t uevent_helper_store(struct kobject *kobj,
54 return count; 53 return count;
55} 54}
56KERNEL_ATTR_RW(uevent_helper); 55KERNEL_ATTR_RW(uevent_helper);
57#endif 56
58 57
59#ifdef CONFIG_PROFILING 58#ifdef CONFIG_PROFILING
60static ssize_t profiling_show(struct kobject *kobj, 59static ssize_t profiling_show(struct kobject *kobj,
@@ -141,6 +140,23 @@ static ssize_t fscaps_show(struct kobject *kobj,
141} 140}
142KERNEL_ATTR_RO(fscaps); 141KERNEL_ATTR_RO(fscaps);
143 142
143int rcu_expedited;
144static ssize_t rcu_expedited_show(struct kobject *kobj,
145 struct kobj_attribute *attr, char *buf)
146{
147 return sprintf(buf, "%d\n", rcu_expedited);
148}
149static ssize_t rcu_expedited_store(struct kobject *kobj,
150 struct kobj_attribute *attr,
151 const char *buf, size_t count)
152{
153 if (kstrtoint(buf, 0, &rcu_expedited))
154 return -EINVAL;
155
156 return count;
157}
158KERNEL_ATTR_RW(rcu_expedited);
159
144/* 160/*
145 * Make /sys/kernel/notes give the raw contents of our kernel .notes section. 161 * Make /sys/kernel/notes give the raw contents of our kernel .notes section.
146 */ 162 */
@@ -169,10 +185,8 @@ EXPORT_SYMBOL_GPL(kernel_kobj);
169 185
170static struct attribute * kernel_attrs[] = { 186static struct attribute * kernel_attrs[] = {
171 &fscaps_attr.attr, 187 &fscaps_attr.attr,
172#if defined(CONFIG_HOTPLUG)
173 &uevent_seqnum_attr.attr, 188 &uevent_seqnum_attr.attr,
174 &uevent_helper_attr.attr, 189 &uevent_helper_attr.attr,
175#endif
176#ifdef CONFIG_PROFILING 190#ifdef CONFIG_PROFILING
177 &profiling_attr.attr, 191 &profiling_attr.attr,
178#endif 192#endif
@@ -182,6 +196,7 @@ static struct attribute * kernel_attrs[] = {
182 &kexec_crash_size_attr.attr, 196 &kexec_crash_size_attr.attr,
183 &vmcoreinfo_attr.attr, 197 &vmcoreinfo_attr.attr,
184#endif 198#endif
199 &rcu_expedited_attr.attr,
185 NULL 200 NULL
186}; 201};
187 202
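
The new rcu_expedited flag surfaces as /sys/kernel/rcu_expedited (write 1 to enable it) and is read by the RCU code updated elsewhere in this series to pick between a normal and an expedited grace period. A hedged sketch of how such a knob is typically consumed, not a copy of the rcutree changes:

extern int rcu_expedited;	/* the toggle defined above */

static void wait_for_grace_period(void)
{
	if (rcu_expedited)
		synchronize_sched_expedited();	/* faster, IPIs the other CPUs */
	else
		synchronize_sched();		/* normal, cheaper grace period */
}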
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 29fb60caecb5..9eb7fed0bbaa 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -124,12 +124,12 @@ void *kthread_data(struct task_struct *task)
124 124
125static void __kthread_parkme(struct kthread *self) 125static void __kthread_parkme(struct kthread *self)
126{ 126{
127 __set_current_state(TASK_INTERRUPTIBLE); 127 __set_current_state(TASK_PARKED);
128 while (test_bit(KTHREAD_SHOULD_PARK, &self->flags)) { 128 while (test_bit(KTHREAD_SHOULD_PARK, &self->flags)) {
129 if (!test_and_set_bit(KTHREAD_IS_PARKED, &self->flags)) 129 if (!test_and_set_bit(KTHREAD_IS_PARKED, &self->flags))
130 complete(&self->parked); 130 complete(&self->parked);
131 schedule(); 131 schedule();
132 __set_current_state(TASK_INTERRUPTIBLE); 132 __set_current_state(TASK_PARKED);
133 } 133 }
134 clear_bit(KTHREAD_IS_PARKED, &self->flags); 134 clear_bit(KTHREAD_IS_PARKED, &self->flags);
135 __set_current_state(TASK_RUNNING); 135 __set_current_state(TASK_RUNNING);
@@ -256,8 +256,13 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
256} 256}
257EXPORT_SYMBOL(kthread_create_on_node); 257EXPORT_SYMBOL(kthread_create_on_node);
258 258
259static void __kthread_bind(struct task_struct *p, unsigned int cpu) 259static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state)
260{ 260{
261 /* Must have done schedule() in kthread() before we set_task_cpu */
262 if (!wait_task_inactive(p, state)) {
263 WARN_ON(1);
264 return;
265 }
261 /* It's safe because the task is inactive. */ 266 /* It's safe because the task is inactive. */
262 do_set_cpus_allowed(p, cpumask_of(cpu)); 267 do_set_cpus_allowed(p, cpumask_of(cpu));
263 p->flags |= PF_THREAD_BOUND; 268 p->flags |= PF_THREAD_BOUND;
@@ -274,12 +279,7 @@ static void __kthread_bind(struct task_struct *p, unsigned int cpu)
274 */ 279 */
275void kthread_bind(struct task_struct *p, unsigned int cpu) 280void kthread_bind(struct task_struct *p, unsigned int cpu)
276{ 281{
277 /* Must have done schedule() in kthread() before we set_task_cpu */ 282 __kthread_bind(p, cpu, TASK_UNINTERRUPTIBLE);
278 if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) {
279 WARN_ON(1);
280 return;
281 }
282 __kthread_bind(p, cpu);
283} 283}
284EXPORT_SYMBOL(kthread_bind); 284EXPORT_SYMBOL(kthread_bind);
285 285
@@ -324,6 +324,22 @@ static struct kthread *task_get_live_kthread(struct task_struct *k)
324 return NULL; 324 return NULL;
325} 325}
326 326
327static void __kthread_unpark(struct task_struct *k, struct kthread *kthread)
328{
329 clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
330 /*
331 * We clear the IS_PARKED bit here as we don't wait
332 * until the task has left the park code. So if we'd
333 * park before that happens we'd see the IS_PARKED bit
334 * which might be about to be cleared.
335 */
336 if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
337 if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
338 __kthread_bind(k, kthread->cpu, TASK_PARKED);
339 wake_up_state(k, TASK_PARKED);
340 }
341}
342
327/** 343/**
328 * kthread_unpark - unpark a thread created by kthread_create(). 344 * kthread_unpark - unpark a thread created by kthread_create().
329 * @k: thread created by kthread_create(). 345 * @k: thread created by kthread_create().
@@ -336,20 +352,8 @@ void kthread_unpark(struct task_struct *k)
336{ 352{
337 struct kthread *kthread = task_get_live_kthread(k); 353 struct kthread *kthread = task_get_live_kthread(k);
338 354
339 if (kthread) { 355 if (kthread)
340 clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); 356 __kthread_unpark(k, kthread);
341 /*
342 * We clear the IS_PARKED bit here as we don't wait
343 * until the task has left the park code. So if we'd
344 * park before that happens we'd see the IS_PARKED bit
345 * which might be about to be cleared.
346 */
347 if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
348 if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
349 __kthread_bind(k, kthread->cpu);
350 wake_up_process(k);
351 }
352 }
353 put_task_struct(k); 357 put_task_struct(k);
354} 358}
355 359
@@ -407,7 +411,7 @@ int kthread_stop(struct task_struct *k)
407 trace_sched_kthread_stop(k); 411 trace_sched_kthread_stop(k);
408 if (kthread) { 412 if (kthread) {
409 set_bit(KTHREAD_SHOULD_STOP, &kthread->flags); 413 set_bit(KTHREAD_SHOULD_STOP, &kthread->flags);
410 clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); 414 __kthread_unpark(k, kthread);
411 wake_up_process(k); 415 wake_up_process(k);
412 wait_for_completion(&kthread->exited); 416 wait_for_completion(&kthread->exited);
413 } 417 }
@@ -428,7 +432,7 @@ int kthreadd(void *unused)
428 set_task_comm(tsk, "kthreadd"); 432 set_task_comm(tsk, "kthreadd");
429 ignore_signals(tsk); 433 ignore_signals(tsk);
430 set_cpus_allowed_ptr(tsk, cpu_all_mask); 434 set_cpus_allowed_ptr(tsk, cpu_all_mask);
431 set_mems_allowed(node_states[N_HIGH_MEMORY]); 435 set_mems_allowed(node_states[N_MEMORY]);
432 436
433 current->flags |= PF_NOFREEZE; 437 current->flags |= PF_NOFREEZE;
434 438
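
The switch to TASK_PARKED and the shared __kthread_unpark() only matter for threads that actually park themselves. A minimal sketch of the thread side of that protocol, roughly the shape the smpboot threads use; do_work() and the sleep interval are assumptions:

#include <linux/kthread.h>
#include <linux/sched.h>

static int parkable_thread_fn(void *data)
{
	while (!kthread_should_stop()) {
		if (kthread_should_park()) {
			/* sleeps in TASK_PARKED until kthread_unpark() */
			kthread_parkme();
			continue;
		}
		do_work(data);			/* hypothetical payload */
		schedule_timeout_interruptible(HZ);
	}
	return 0;
}

On the other side, kthread_park() waits for the parked completion, and kthread_unpark() rebinds a per-cpu thread and wakes it with wake_up_state(k, TASK_PARKED), which is why __kthread_bind() above now takes the expected task state.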
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 7981e5b2350d..8a0efac4f99d 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -3190,9 +3190,14 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3190#endif 3190#endif
3191 if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) { 3191 if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) {
3192 debug_locks_off(); 3192 debug_locks_off();
3193 printk("BUG: MAX_LOCK_DEPTH too low!\n"); 3193 printk("BUG: MAX_LOCK_DEPTH too low, depth: %i max: %lu!\n",
3194 curr->lockdep_depth, MAX_LOCK_DEPTH);
3194 printk("turning off the locking correctness validator.\n"); 3195 printk("turning off the locking correctness validator.\n");
3196
3197 lockdep_print_held_locks(current);
3198 debug_show_all_locks();
3195 dump_stack(); 3199 dump_stack();
3200
3196 return 0; 3201 return 0;
3197 } 3202 }
3198 3203
@@ -3203,7 +3208,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3203} 3208}
3204 3209
3205static int 3210static int
3206print_unlock_inbalance_bug(struct task_struct *curr, struct lockdep_map *lock, 3211print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
3207 unsigned long ip) 3212 unsigned long ip)
3208{ 3213{
3209 if (!debug_locks_off()) 3214 if (!debug_locks_off())
@@ -3246,7 +3251,7 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock,
3246 return 0; 3251 return 0;
3247 3252
3248 if (curr->lockdep_depth <= 0) 3253 if (curr->lockdep_depth <= 0)
3249 return print_unlock_inbalance_bug(curr, lock, ip); 3254 return print_unlock_imbalance_bug(curr, lock, ip);
3250 3255
3251 return 1; 3256 return 1;
3252} 3257}
@@ -3317,7 +3322,7 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
3317 goto found_it; 3322 goto found_it;
3318 prev_hlock = hlock; 3323 prev_hlock = hlock;
3319 } 3324 }
3320 return print_unlock_inbalance_bug(curr, lock, ip); 3325 return print_unlock_imbalance_bug(curr, lock, ip);
3321 3326
3322found_it: 3327found_it:
3323 lockdep_init_map(lock, name, key, 0); 3328 lockdep_init_map(lock, name, key, 0);
@@ -3384,7 +3389,7 @@ lock_release_non_nested(struct task_struct *curr,
3384 goto found_it; 3389 goto found_it;
3385 prev_hlock = hlock; 3390 prev_hlock = hlock;
3386 } 3391 }
3387 return print_unlock_inbalance_bug(curr, lock, ip); 3392 return print_unlock_imbalance_bug(curr, lock, ip);
3388 3393
3389found_it: 3394found_it:
3390 if (hlock->instance == lock) 3395 if (hlock->instance == lock)
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 91c32a0b612c..b2c71c5873e4 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -39,7 +39,7 @@ static void l_stop(struct seq_file *m, void *v)
39 39
40static void print_name(struct seq_file *m, struct lock_class *class) 40static void print_name(struct seq_file *m, struct lock_class *class)
41{ 41{
42 char str[128]; 42 char str[KSYM_NAME_LEN];
43 const char *name = class->name; 43 const char *name = class->name;
44 44
45 if (!name) { 45 if (!name) {
diff --git a/kernel/modsign_certificate.S b/kernel/modsign_certificate.S
new file mode 100644
index 000000000000..246b4c6e6135
--- /dev/null
+++ b/kernel/modsign_certificate.S
@@ -0,0 +1,19 @@
1/* SYMBOL_PREFIX defined on commandline from CONFIG_SYMBOL_PREFIX */
2#ifndef SYMBOL_PREFIX
3#define ASM_SYMBOL(sym) sym
4#else
5#define PASTE2(x,y) x##y
6#define PASTE(x,y) PASTE2(x,y)
7#define ASM_SYMBOL(sym) PASTE(SYMBOL_PREFIX, sym)
8#endif
9
10#define GLOBAL(name) \
11 .globl ASM_SYMBOL(name); \
12 ASM_SYMBOL(name):
13
14 .section ".init.data","aw"
15
16GLOBAL(modsign_certificate_list)
17 .incbin "signing_key.x509"
18 .incbin "extra_certificates"
19GLOBAL(modsign_certificate_list_end)
diff --git a/kernel/modsign_pubkey.c b/kernel/modsign_pubkey.c
index 4646eb2c3820..2b6e69909c39 100644
--- a/kernel/modsign_pubkey.c
+++ b/kernel/modsign_pubkey.c
@@ -20,12 +20,6 @@ struct key *modsign_keyring;
20 20
21extern __initdata const u8 modsign_certificate_list[]; 21extern __initdata const u8 modsign_certificate_list[];
22extern __initdata const u8 modsign_certificate_list_end[]; 22extern __initdata const u8 modsign_certificate_list_end[];
23asm(".section .init.data,\"aw\"\n"
24 "modsign_certificate_list:\n"
25 ".incbin \"signing_key.x509\"\n"
26 ".incbin \"extra_certificates\"\n"
27 "modsign_certificate_list_end:"
28 );
29 23
30/* 24/*
31 * We need to make sure ccache doesn't cache the .o file as it doesn't notice 25 * We need to make sure ccache doesn't cache the .o file as it doesn't notice
@@ -40,18 +34,15 @@ static __init int module_verify_init(void)
40{ 34{
41 pr_notice("Initialise module verification\n"); 35 pr_notice("Initialise module verification\n");
42 36
43 modsign_keyring = key_alloc(&key_type_keyring, ".module_sign", 37 modsign_keyring = keyring_alloc(".module_sign",
44 KUIDT_INIT(0), KGIDT_INIT(0), 38 KUIDT_INIT(0), KGIDT_INIT(0),
45 current_cred(), 39 current_cred(),
46 (KEY_POS_ALL & ~KEY_POS_SETATTR) | 40 ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
47 KEY_USR_VIEW | KEY_USR_READ, 41 KEY_USR_VIEW | KEY_USR_READ),
48 KEY_ALLOC_NOT_IN_QUOTA); 42 KEY_ALLOC_NOT_IN_QUOTA, NULL);
49 if (IS_ERR(modsign_keyring)) 43 if (IS_ERR(modsign_keyring))
50 panic("Can't allocate module signing keyring\n"); 44 panic("Can't allocate module signing keyring\n");
51 45
52 if (key_instantiate_and_link(modsign_keyring, NULL, 0, NULL, NULL) < 0)
53 panic("Can't instantiate module signing keyring\n");
54
55 return 0; 46 return 0;
56} 47}
57 48
diff --git a/kernel/module.c b/kernel/module.c
index 6085f5ef88ea..0925c9a71975 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -21,6 +21,7 @@
21#include <linux/ftrace_event.h> 21#include <linux/ftrace_event.h>
22#include <linux/init.h> 22#include <linux/init.h>
23#include <linux/kallsyms.h> 23#include <linux/kallsyms.h>
24#include <linux/file.h>
24#include <linux/fs.h> 25#include <linux/fs.h>
25#include <linux/sysfs.h> 26#include <linux/sysfs.h>
26#include <linux/kernel.h> 27#include <linux/kernel.h>
@@ -28,6 +29,7 @@
28#include <linux/vmalloc.h> 29#include <linux/vmalloc.h>
29#include <linux/elf.h> 30#include <linux/elf.h>
30#include <linux/proc_fs.h> 31#include <linux/proc_fs.h>
32#include <linux/security.h>
31#include <linux/seq_file.h> 33#include <linux/seq_file.h>
32#include <linux/syscalls.h> 34#include <linux/syscalls.h>
33#include <linux/fcntl.h> 35#include <linux/fcntl.h>
@@ -59,6 +61,7 @@
59#include <linux/pfn.h> 61#include <linux/pfn.h>
60#include <linux/bsearch.h> 62#include <linux/bsearch.h>
61#include <linux/fips.h> 63#include <linux/fips.h>
64#include <uapi/linux/module.h>
62#include "module-internal.h" 65#include "module-internal.h"
63 66
64#define CREATE_TRACE_POINTS 67#define CREATE_TRACE_POINTS
@@ -185,6 +188,7 @@ struct load_info {
185 ongoing or failed initialization etc. */ 188 ongoing or failed initialization etc. */
186static inline int strong_try_module_get(struct module *mod) 189static inline int strong_try_module_get(struct module *mod)
187{ 190{
191 BUG_ON(mod && mod->state == MODULE_STATE_UNFORMED);
188 if (mod && mod->state == MODULE_STATE_COMING) 192 if (mod && mod->state == MODULE_STATE_COMING)
189 return -EBUSY; 193 return -EBUSY;
190 if (try_module_get(mod)) 194 if (try_module_get(mod))
@@ -193,9 +197,10 @@ static inline int strong_try_module_get(struct module *mod)
193 return -ENOENT; 197 return -ENOENT;
194} 198}
195 199
196static inline void add_taint_module(struct module *mod, unsigned flag) 200static inline void add_taint_module(struct module *mod, unsigned flag,
201 enum lockdep_ok lockdep_ok)
197{ 202{
198 add_taint(flag); 203 add_taint(flag, lockdep_ok);
199 mod->taints |= (1U << flag); 204 mod->taints |= (1U << flag);
200} 205}
201 206
@@ -340,6 +345,9 @@ bool each_symbol_section(bool (*fn)(const struct symsearch *arr,
340#endif 345#endif
341 }; 346 };
342 347
348 if (mod->state == MODULE_STATE_UNFORMED)
349 continue;
350
343 if (each_symbol_in_section(arr, ARRAY_SIZE(arr), mod, fn, data)) 351 if (each_symbol_in_section(arr, ARRAY_SIZE(arr), mod, fn, data))
344 return true; 352 return true;
345 } 353 }
@@ -372,9 +380,6 @@ static bool check_symbol(const struct symsearch *syms,
372 printk(KERN_WARNING "Symbol %s is being used " 380 printk(KERN_WARNING "Symbol %s is being used "
373 "by a non-GPL module, which will not " 381 "by a non-GPL module, which will not "
374 "be allowed in the future\n", fsa->name); 382 "be allowed in the future\n", fsa->name);
375 printk(KERN_WARNING "Please see the file "
376 "Documentation/feature-removal-schedule.txt "
377 "in the kernel source tree for more details.\n");
378 } 383 }
379 } 384 }
380 385
@@ -450,16 +455,24 @@ const struct kernel_symbol *find_symbol(const char *name,
450EXPORT_SYMBOL_GPL(find_symbol); 455EXPORT_SYMBOL_GPL(find_symbol);
451 456
452/* Search for module by name: must hold module_mutex. */ 457/* Search for module by name: must hold module_mutex. */
453struct module *find_module(const char *name) 458static struct module *find_module_all(const char *name,
459 bool even_unformed)
454{ 460{
455 struct module *mod; 461 struct module *mod;
456 462
457 list_for_each_entry(mod, &modules, list) { 463 list_for_each_entry(mod, &modules, list) {
464 if (!even_unformed && mod->state == MODULE_STATE_UNFORMED)
465 continue;
458 if (strcmp(mod->name, name) == 0) 466 if (strcmp(mod->name, name) == 0)
459 return mod; 467 return mod;
460 } 468 }
461 return NULL; 469 return NULL;
462} 470}
471
472struct module *find_module(const char *name)
473{
474 return find_module_all(name, false);
475}
463EXPORT_SYMBOL_GPL(find_module); 476EXPORT_SYMBOL_GPL(find_module);
464 477
465#ifdef CONFIG_SMP 478#ifdef CONFIG_SMP
@@ -525,6 +538,8 @@ bool is_module_percpu_address(unsigned long addr)
525 preempt_disable(); 538 preempt_disable();
526 539
527 list_for_each_entry_rcu(mod, &modules, list) { 540 list_for_each_entry_rcu(mod, &modules, list) {
541 if (mod->state == MODULE_STATE_UNFORMED)
542 continue;
528 if (!mod->percpu_size) 543 if (!mod->percpu_size)
529 continue; 544 continue;
530 for_each_possible_cpu(cpu) { 545 for_each_possible_cpu(cpu) {
@@ -713,7 +728,7 @@ static inline int try_force_unload(unsigned int flags)
713{ 728{
714 int ret = (flags & O_TRUNC); 729 int ret = (flags & O_TRUNC);
715 if (ret) 730 if (ret)
716 add_taint(TAINT_FORCED_RMMOD); 731 add_taint(TAINT_FORCED_RMMOD, LOCKDEP_NOW_UNRELIABLE);
717 return ret; 732 return ret;
718} 733}
719#else 734#else
@@ -1048,6 +1063,8 @@ static ssize_t show_initstate(struct module_attribute *mattr,
1048 case MODULE_STATE_GOING: 1063 case MODULE_STATE_GOING:
1049 state = "going"; 1064 state = "going";
1050 break; 1065 break;
1066 default:
1067 BUG();
1051 } 1068 }
1052 return sprintf(buffer, "%s\n", state); 1069 return sprintf(buffer, "%s\n", state);
1053} 1070}
@@ -1122,7 +1139,7 @@ static int try_to_force_load(struct module *mod, const char *reason)
1122 if (!test_taint(TAINT_FORCED_MODULE)) 1139 if (!test_taint(TAINT_FORCED_MODULE))
1123 printk(KERN_WARNING "%s: %s: kernel tainted.\n", 1140 printk(KERN_WARNING "%s: %s: kernel tainted.\n",
1124 mod->name, reason); 1141 mod->name, reason);
1125 add_taint_module(mod, TAINT_FORCED_MODULE); 1142 add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_NOW_UNRELIABLE);
1126 return 0; 1143 return 0;
1127#else 1144#else
1128 return -ENOEXEC; 1145 return -ENOEXEC;
@@ -1786,6 +1803,8 @@ void set_all_modules_text_rw(void)
1786 1803
1787 mutex_lock(&module_mutex); 1804 mutex_lock(&module_mutex);
1788 list_for_each_entry_rcu(mod, &modules, list) { 1805 list_for_each_entry_rcu(mod, &modules, list) {
1806 if (mod->state == MODULE_STATE_UNFORMED)
1807 continue;
1789 if ((mod->module_core) && (mod->core_text_size)) { 1808 if ((mod->module_core) && (mod->core_text_size)) {
1790 set_page_attributes(mod->module_core, 1809 set_page_attributes(mod->module_core,
1791 mod->module_core + mod->core_text_size, 1810 mod->module_core + mod->core_text_size,
@@ -1807,6 +1826,8 @@ void set_all_modules_text_ro(void)
1807 1826
1808 mutex_lock(&module_mutex); 1827 mutex_lock(&module_mutex);
1809 list_for_each_entry_rcu(mod, &modules, list) { 1828 list_for_each_entry_rcu(mod, &modules, list) {
1829 if (mod->state == MODULE_STATE_UNFORMED)
1830 continue;
1810 if ((mod->module_core) && (mod->core_text_size)) { 1831 if ((mod->module_core) && (mod->core_text_size)) {
1811 set_page_attributes(mod->module_core, 1832 set_page_attributes(mod->module_core,
1812 mod->module_core + mod->core_text_size, 1833 mod->module_core + mod->core_text_size,
@@ -2127,7 +2148,8 @@ static void set_license(struct module *mod, const char *license)
2127 if (!test_taint(TAINT_PROPRIETARY_MODULE)) 2148 if (!test_taint(TAINT_PROPRIETARY_MODULE))
2128 printk(KERN_WARNING "%s: module license '%s' taints " 2149 printk(KERN_WARNING "%s: module license '%s' taints "
2129 "kernel.\n", mod->name, license); 2150 "kernel.\n", mod->name, license);
2130 add_taint_module(mod, TAINT_PROPRIETARY_MODULE); 2151 add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
2152 LOCKDEP_NOW_UNRELIABLE);
2131 } 2153 }
2132} 2154}
2133 2155
@@ -2282,7 +2304,7 @@ static void layout_symtab(struct module *mod, struct load_info *info)
2282 Elf_Shdr *symsect = info->sechdrs + info->index.sym; 2304 Elf_Shdr *symsect = info->sechdrs + info->index.sym;
2283 Elf_Shdr *strsect = info->sechdrs + info->index.str; 2305 Elf_Shdr *strsect = info->sechdrs + info->index.str;
2284 const Elf_Sym *src; 2306 const Elf_Sym *src;
2285 unsigned int i, nsrc, ndst, strtab_size; 2307 unsigned int i, nsrc, ndst, strtab_size = 0;
2286 2308
2287 /* Put symbol section at end of init part of module. */ 2309 /* Put symbol section at end of init part of module. */
2288 symsect->sh_flags |= SHF_ALLOC; 2310 symsect->sh_flags |= SHF_ALLOC;
@@ -2294,11 +2316,13 @@ static void layout_symtab(struct module *mod, struct load_info *info)
2294 nsrc = symsect->sh_size / sizeof(*src); 2316 nsrc = symsect->sh_size / sizeof(*src);
2295 2317
2296 /* Compute total space required for the core symbols' strtab. */ 2318 /* Compute total space required for the core symbols' strtab. */
2297 for (ndst = i = strtab_size = 1; i < nsrc; ++i, ++src) 2319 for (ndst = i = 0; i < nsrc; i++) {
2298 if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) { 2320 if (i == 0 ||
2299 strtab_size += strlen(&info->strtab[src->st_name]) + 1; 2321 is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) {
2322 strtab_size += strlen(&info->strtab[src[i].st_name])+1;
2300 ndst++; 2323 ndst++;
2301 } 2324 }
2325 }
2302 2326
2303 /* Append room for core symbols at end of core part. */ 2327 /* Append room for core symbols at end of core part. */
2304 info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); 2328 info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1);
@@ -2332,15 +2356,14 @@ static void add_kallsyms(struct module *mod, const struct load_info *info)
2332 mod->core_symtab = dst = mod->module_core + info->symoffs; 2356 mod->core_symtab = dst = mod->module_core + info->symoffs;
2333 mod->core_strtab = s = mod->module_core + info->stroffs; 2357 mod->core_strtab = s = mod->module_core + info->stroffs;
2334 src = mod->symtab; 2358 src = mod->symtab;
2335 *dst = *src; 2359 for (ndst = i = 0; i < mod->num_symtab; i++) {
2336 *s++ = 0; 2360 if (i == 0 ||
2337 for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) { 2361 is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) {
2338 if (!is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) 2362 dst[ndst] = src[i];
2339 continue; 2363 dst[ndst++].st_name = s - mod->core_strtab;
2340 2364 s += strlcpy(s, &mod->strtab[src[i].st_name],
2341 dst[ndst] = *src; 2365 KSYM_NAME_LEN) + 1;
2342 dst[ndst++].st_name = s - mod->core_strtab; 2366 }
2343 s += strlcpy(s, &mod->strtab[src->st_name], KSYM_NAME_LEN) + 1;
2344 } 2367 }
2345 mod->core_num_syms = ndst; 2368 mod->core_num_syms = ndst;
2346} 2369}
@@ -2373,7 +2396,7 @@ static void dynamic_debug_remove(struct _ddebug *debug)
2373 2396
2374void * __weak module_alloc(unsigned long size) 2397void * __weak module_alloc(unsigned long size)
2375{ 2398{
2376 return size == 0 ? NULL : vmalloc_exec(size); 2399 return vmalloc_exec(size);
2377} 2400}
2378 2401
2379static void *module_alloc_update_bounds(unsigned long size) 2402static void *module_alloc_update_bounds(unsigned long size)
@@ -2420,18 +2443,17 @@ static inline void kmemleak_load_module(const struct module *mod,
2420#endif 2443#endif
2421 2444
2422#ifdef CONFIG_MODULE_SIG 2445#ifdef CONFIG_MODULE_SIG
2423static int module_sig_check(struct load_info *info, 2446static int module_sig_check(struct load_info *info)
2424 const void *mod, unsigned long *_len)
2425{ 2447{
2426 int err = -ENOKEY; 2448 int err = -ENOKEY;
2427 unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1; 2449 const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1;
2428 unsigned long len = *_len; 2450 const void *mod = info->hdr;
2429 2451
2430 if (len > markerlen && 2452 if (info->len > markerlen &&
2431 memcmp(mod + len - markerlen, MODULE_SIG_STRING, markerlen) == 0) { 2453 memcmp(mod + info->len - markerlen, MODULE_SIG_STRING, markerlen) == 0) {
2432 /* We truncate the module to discard the signature */ 2454 /* We truncate the module to discard the signature */
2433 *_len -= markerlen; 2455 info->len -= markerlen;
2434 err = mod_verify_sig(mod, _len); 2456 err = mod_verify_sig(mod, &info->len);
2435 } 2457 }
2436 2458
2437 if (!err) { 2459 if (!err) {
@@ -2449,59 +2471,114 @@ static int module_sig_check(struct load_info *info,
2449 return err; 2471 return err;
2450} 2472}
2451#else /* !CONFIG_MODULE_SIG */ 2473#else /* !CONFIG_MODULE_SIG */
2452static int module_sig_check(struct load_info *info, 2474static int module_sig_check(struct load_info *info)
2453 void *mod, unsigned long *len)
2454{ 2475{
2455 return 0; 2476 return 0;
2456} 2477}
2457#endif /* !CONFIG_MODULE_SIG */ 2478#endif /* !CONFIG_MODULE_SIG */
2458 2479
2459/* Sets info->hdr, info->len and info->sig_ok. */ 2480/* Sanity checks against invalid binaries, wrong arch, weird elf version. */
2460static int copy_and_check(struct load_info *info, 2481static int elf_header_check(struct load_info *info)
2461 const void __user *umod, unsigned long len, 2482{
2462 const char __user *uargs) 2483 if (info->len < sizeof(*(info->hdr)))
2484 return -ENOEXEC;
2485
2486 if (memcmp(info->hdr->e_ident, ELFMAG, SELFMAG) != 0
2487 || info->hdr->e_type != ET_REL
2488 || !elf_check_arch(info->hdr)
2489 || info->hdr->e_shentsize != sizeof(Elf_Shdr))
2490 return -ENOEXEC;
2491
2492 if (info->hdr->e_shoff >= info->len
2493 || (info->hdr->e_shnum * sizeof(Elf_Shdr) >
2494 info->len - info->hdr->e_shoff))
2495 return -ENOEXEC;
2496
2497 return 0;
2498}
2499
2500/* Sets info->hdr and info->len. */
2501static int copy_module_from_user(const void __user *umod, unsigned long len,
2502 struct load_info *info)
2463{ 2503{
2464 int err; 2504 int err;
2465 Elf_Ehdr *hdr;
2466 2505
2467 if (len < sizeof(*hdr)) 2506 info->len = len;
2507 if (info->len < sizeof(*(info->hdr)))
2468 return -ENOEXEC; 2508 return -ENOEXEC;
2469 2509
2510 err = security_kernel_module_from_file(NULL);
2511 if (err)
2512 return err;
2513
2470 /* Suck in entire file: we'll want most of it. */ 2514 /* Suck in entire file: we'll want most of it. */
2471 if ((hdr = vmalloc(len)) == NULL) 2515 info->hdr = vmalloc(info->len);
2516 if (!info->hdr)
2472 return -ENOMEM; 2517 return -ENOMEM;
2473 2518
2474 if (copy_from_user(hdr, umod, len) != 0) { 2519 if (copy_from_user(info->hdr, umod, info->len) != 0) {
2475 err = -EFAULT; 2520 vfree(info->hdr);
2476 goto free_hdr; 2521 return -EFAULT;
2477 } 2522 }
2478 2523
2479 err = module_sig_check(info, hdr, &len); 2524 return 0;
2525}
2526
2527/* Sets info->hdr and info->len. */
2528static int copy_module_from_fd(int fd, struct load_info *info)
2529{
2530 struct file *file;
2531 int err;
2532 struct kstat stat;
2533 loff_t pos;
2534 ssize_t bytes = 0;
2535
2536 file = fget(fd);
2537 if (!file)
2538 return -ENOEXEC;
2539
2540 err = security_kernel_module_from_file(file);
2480 if (err) 2541 if (err)
2481 goto free_hdr; 2542 goto out;
2482 2543
2483 /* Sanity checks against insmoding binaries or wrong arch, 2544 err = vfs_getattr(&file->f_path, &stat);
2484 weird elf version */ 2545 if (err)
2485 if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0 2546 goto out;
2486 || hdr->e_type != ET_REL 2547
2487 || !elf_check_arch(hdr) 2548 if (stat.size > INT_MAX) {
2488 || hdr->e_shentsize != sizeof(Elf_Shdr)) { 2549 err = -EFBIG;
2489 err = -ENOEXEC; 2550 goto out;
2490 goto free_hdr;
2491 } 2551 }
2492 2552
2493 if (hdr->e_shoff >= len || 2553 /* Don't hand 0 to vmalloc, it whines. */
2494 hdr->e_shnum * sizeof(Elf_Shdr) > len - hdr->e_shoff) { 2554 if (stat.size == 0) {
2495 err = -ENOEXEC; 2555 err = -EINVAL;
2496 goto free_hdr; 2556 goto out;
2497 } 2557 }
2498 2558
2499 info->hdr = hdr; 2559 info->hdr = vmalloc(stat.size);
2500 info->len = len; 2560 if (!info->hdr) {
2501 return 0; 2561 err = -ENOMEM;
2562 goto out;
2563 }
2564
2565 pos = 0;
2566 while (pos < stat.size) {
2567 bytes = kernel_read(file, pos, (char *)(info->hdr) + pos,
2568 stat.size - pos);
2569 if (bytes < 0) {
2570 vfree(info->hdr);
2571 err = bytes;
2572 goto out;
2573 }
2574 if (bytes == 0)
2575 break;
2576 pos += bytes;
2577 }
2578 info->len = pos;
2502 2579
2503free_hdr: 2580out:
2504 vfree(hdr); 2581 fput(file);
2505 return err; 2582 return err;
2506} 2583}
2507 2584
@@ -2510,7 +2587,7 @@ static void free_copy(struct load_info *info)
2510 vfree(info->hdr); 2587 vfree(info->hdr);
2511} 2588}
2512 2589
2513static int rewrite_section_headers(struct load_info *info) 2590static int rewrite_section_headers(struct load_info *info, int flags)
2514{ 2591{
2515 unsigned int i; 2592 unsigned int i;
2516 2593
@@ -2538,7 +2615,10 @@ static int rewrite_section_headers(struct load_info *info)
2538 } 2615 }
2539 2616
2540 /* Track but don't keep modinfo and version sections. */ 2617 /* Track but don't keep modinfo and version sections. */
2541 info->index.vers = find_sec(info, "__versions"); 2618 if (flags & MODULE_INIT_IGNORE_MODVERSIONS)
2619 info->index.vers = 0; /* Pretend no __versions section! */
2620 else
2621 info->index.vers = find_sec(info, "__versions");
2542 info->index.info = find_sec(info, ".modinfo"); 2622 info->index.info = find_sec(info, ".modinfo");
2543 info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC; 2623 info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC;
2544 info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC; 2624 info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC;
@@ -2553,7 +2633,7 @@ static int rewrite_section_headers(struct load_info *info)
2553 * Return the temporary module pointer (we'll replace it with the final 2633 * Return the temporary module pointer (we'll replace it with the final
2554 * one when we move the module sections around). 2634 * one when we move the module sections around).
2555 */ 2635 */
2556static struct module *setup_load_info(struct load_info *info) 2636static struct module *setup_load_info(struct load_info *info, int flags)
2557{ 2637{
2558 unsigned int i; 2638 unsigned int i;
2559 int err; 2639 int err;
@@ -2564,7 +2644,7 @@ static struct module *setup_load_info(struct load_info *info)
2564 info->secstrings = (void *)info->hdr 2644 info->secstrings = (void *)info->hdr
2565 + info->sechdrs[info->hdr->e_shstrndx].sh_offset; 2645 + info->sechdrs[info->hdr->e_shstrndx].sh_offset;
2566 2646
2567 err = rewrite_section_headers(info); 2647 err = rewrite_section_headers(info, flags);
2568 if (err) 2648 if (err)
2569 return ERR_PTR(err); 2649 return ERR_PTR(err);
2570 2650
@@ -2602,11 +2682,14 @@ static struct module *setup_load_info(struct load_info *info)
2602 return mod; 2682 return mod;
2603} 2683}
2604 2684
2605static int check_modinfo(struct module *mod, struct load_info *info) 2685static int check_modinfo(struct module *mod, struct load_info *info, int flags)
2606{ 2686{
2607 const char *modmagic = get_modinfo(info, "vermagic"); 2687 const char *modmagic = get_modinfo(info, "vermagic");
2608 int err; 2688 int err;
2609 2689
2690 if (flags & MODULE_INIT_IGNORE_VERMAGIC)
2691 modmagic = NULL;
2692
2610 /* This is allowed: modprobe --force will invalidate it. */ 2693 /* This is allowed: modprobe --force will invalidate it. */
2611 if (!modmagic) { 2694 if (!modmagic) {
2612 err = try_to_force_load(mod, "bad vermagic"); 2695 err = try_to_force_load(mod, "bad vermagic");
@@ -2619,10 +2702,10 @@ static int check_modinfo(struct module *mod, struct load_info *info)
2619 } 2702 }
2620 2703
2621 if (!get_modinfo(info, "intree")) 2704 if (!get_modinfo(info, "intree"))
2622 add_taint_module(mod, TAINT_OOT_MODULE); 2705 add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK);
2623 2706
2624 if (get_modinfo(info, "staging")) { 2707 if (get_modinfo(info, "staging")) {
2625 add_taint_module(mod, TAINT_CRAP); 2708 add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK);
2626 printk(KERN_WARNING "%s: module is from the staging directory," 2709 printk(KERN_WARNING "%s: module is from the staging directory,"
2627 " the quality is unknown, you have been warned.\n", 2710 " the quality is unknown, you have been warned.\n",
2628 mod->name); 2711 mod->name);
@@ -2736,20 +2819,23 @@ static int move_module(struct module *mod, struct load_info *info)
2736 memset(ptr, 0, mod->core_size); 2819 memset(ptr, 0, mod->core_size);
2737 mod->module_core = ptr; 2820 mod->module_core = ptr;
2738 2821
2739 ptr = module_alloc_update_bounds(mod->init_size); 2822 if (mod->init_size) {
2740 /* 2823 ptr = module_alloc_update_bounds(mod->init_size);
2741 * The pointer to this block is stored in the module structure 2824 /*
2742 * which is inside the block. This block doesn't need to be 2825 * The pointer to this block is stored in the module structure
2743 * scanned as it contains data and code that will be freed 2826 * which is inside the block. This block doesn't need to be
2744 * after the module is initialized. 2827 * scanned as it contains data and code that will be freed
2745 */ 2828 * after the module is initialized.
2746 kmemleak_ignore(ptr); 2829 */
2747 if (!ptr && mod->init_size) { 2830 kmemleak_ignore(ptr);
2748 module_free(mod, mod->module_core); 2831 if (!ptr) {
2749 return -ENOMEM; 2832 module_free(mod, mod->module_core);
2750 } 2833 return -ENOMEM;
2751 memset(ptr, 0, mod->init_size); 2834 }
2752 mod->module_init = ptr; 2835 memset(ptr, 0, mod->init_size);
2836 mod->module_init = ptr;
2837 } else
2838 mod->module_init = NULL;
2753 2839
2754 /* Transfer each section which specifies SHF_ALLOC */ 2840 /* Transfer each section which specifies SHF_ALLOC */
2755 pr_debug("final section addresses:\n"); 2841 pr_debug("final section addresses:\n");
@@ -2785,15 +2871,17 @@ static int check_module_license_and_versions(struct module *mod)
2785 * using GPL-only symbols it needs. 2871 * using GPL-only symbols it needs.
2786 */ 2872 */
2787 if (strcmp(mod->name, "ndiswrapper") == 0) 2873 if (strcmp(mod->name, "ndiswrapper") == 0)
2788 add_taint(TAINT_PROPRIETARY_MODULE); 2874 add_taint(TAINT_PROPRIETARY_MODULE, LOCKDEP_NOW_UNRELIABLE);
2789 2875
2790 /* driverloader was caught wrongly pretending to be under GPL */ 2876 /* driverloader was caught wrongly pretending to be under GPL */
2791 if (strcmp(mod->name, "driverloader") == 0) 2877 if (strcmp(mod->name, "driverloader") == 0)
2792 add_taint_module(mod, TAINT_PROPRIETARY_MODULE); 2878 add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
2879 LOCKDEP_NOW_UNRELIABLE);
2793 2880
2794 /* lve claims to be GPL but upstream won't provide source */ 2881 /* lve claims to be GPL but upstream won't provide source */
2795 if (strcmp(mod->name, "lve") == 0) 2882 if (strcmp(mod->name, "lve") == 0)
2796 add_taint_module(mod, TAINT_PROPRIETARY_MODULE); 2883 add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
2884 LOCKDEP_NOW_UNRELIABLE);
2797 2885
2798#ifdef CONFIG_MODVERSIONS 2886#ifdef CONFIG_MODVERSIONS
2799 if ((mod->num_syms && !mod->crcs) 2887 if ((mod->num_syms && !mod->crcs)
@@ -2842,18 +2930,18 @@ int __weak module_frob_arch_sections(Elf_Ehdr *hdr,
2842 return 0; 2930 return 0;
2843} 2931}
2844 2932
2845static struct module *layout_and_allocate(struct load_info *info) 2933static struct module *layout_and_allocate(struct load_info *info, int flags)
2846{ 2934{
2847 /* Module within temporary copy. */ 2935 /* Module within temporary copy. */
2848 struct module *mod; 2936 struct module *mod;
2849 Elf_Shdr *pcpusec; 2937 Elf_Shdr *pcpusec;
2850 int err; 2938 int err;
2851 2939
2852 mod = setup_load_info(info); 2940 mod = setup_load_info(info, flags);
2853 if (IS_ERR(mod)) 2941 if (IS_ERR(mod))
2854 return mod; 2942 return mod;
2855 2943
2856 err = check_modinfo(mod, info); 2944 err = check_modinfo(mod, info, flags);
2857 if (err) 2945 if (err)
2858 return ERR_PTR(err); 2946 return ERR_PTR(err);
2859 2947
@@ -2933,70 +3021,255 @@ static bool finished_loading(const char *name)
2933 bool ret; 3021 bool ret;
2934 3022
2935 mutex_lock(&module_mutex); 3023 mutex_lock(&module_mutex);
2936 mod = find_module(name); 3024 mod = find_module_all(name, true);
2937 ret = !mod || mod->state != MODULE_STATE_COMING; 3025 ret = !mod || mod->state == MODULE_STATE_LIVE
3026 || mod->state == MODULE_STATE_GOING;
2938 mutex_unlock(&module_mutex); 3027 mutex_unlock(&module_mutex);
2939 3028
2940 return ret; 3029 return ret;
2941} 3030}
2942 3031
3032/* Call module constructors. */
3033static void do_mod_ctors(struct module *mod)
3034{
3035#ifdef CONFIG_CONSTRUCTORS
3036 unsigned long i;
3037
3038 for (i = 0; i < mod->num_ctors; i++)
3039 mod->ctors[i]();
3040#endif
3041}
3042
3043/* This is where the real work happens */
3044static int do_init_module(struct module *mod)
3045{
3046 int ret = 0;
3047
3048 /*
3049 * We want to find out whether @mod uses async during init. Clear
3050 * PF_USED_ASYNC. async_schedule*() will set it.
3051 */
3052 current->flags &= ~PF_USED_ASYNC;
3053
3054 blocking_notifier_call_chain(&module_notify_list,
3055 MODULE_STATE_COMING, mod);
3056
3057 /* Set RO and NX regions for core */
3058 set_section_ro_nx(mod->module_core,
3059 mod->core_text_size,
3060 mod->core_ro_size,
3061 mod->core_size);
3062
3063 /* Set RO and NX regions for init */
3064 set_section_ro_nx(mod->module_init,
3065 mod->init_text_size,
3066 mod->init_ro_size,
3067 mod->init_size);
3068
3069 do_mod_ctors(mod);
3070 /* Start the module */
3071 if (mod->init != NULL)
3072 ret = do_one_initcall(mod->init);
3073 if (ret < 0) {
3074 /* Init routine failed: abort. Try to protect us from
3075 buggy refcounters. */
3076 mod->state = MODULE_STATE_GOING;
3077 synchronize_sched();
3078 module_put(mod);
3079 blocking_notifier_call_chain(&module_notify_list,
3080 MODULE_STATE_GOING, mod);
3081 free_module(mod);
3082 wake_up_all(&module_wq);
3083 return ret;
3084 }
3085 if (ret > 0) {
3086 printk(KERN_WARNING
3087"%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n"
3088"%s: loading module anyway...\n",
3089 __func__, mod->name, ret,
3090 __func__);
3091 dump_stack();
3092 }
3093
3094 /* Now it's a first class citizen! */
3095 mod->state = MODULE_STATE_LIVE;
3096 blocking_notifier_call_chain(&module_notify_list,
3097 MODULE_STATE_LIVE, mod);
3098
3099 /*
3100 * We need to finish all async code before the module init sequence
3101 * is done. This has potential to deadlock. For example, a newly
3102 * detected block device can trigger request_module() of the
 3103 * default iosched from an async probing task. Once the userland helper
3104 * reaches here, async_synchronize_full() will wait on the async
3105 * task waiting on request_module() and deadlock.
3106 *
 3107 * This deadlock is avoided by performing async_synchronize_full()
3108 * iff module init queued any async jobs. This isn't a full
3109 * solution as it will deadlock the same if module loading from
3110 * async jobs nests more than once; however, due to the various
3111 * constraints, this hack seems to be the best option for now.
3112 * Please refer to the following thread for details.
3113 *
3114 * http://thread.gmane.org/gmane.linux.kernel/1420814
3115 */
3116 if (current->flags & PF_USED_ASYNC)
3117 async_synchronize_full();
3118
3119 mutex_lock(&module_mutex);
3120 /* Drop initial reference. */
3121 module_put(mod);
3122 trim_init_extable(mod);
3123#ifdef CONFIG_KALLSYMS
3124 mod->num_symtab = mod->core_num_syms;
3125 mod->symtab = mod->core_symtab;
3126 mod->strtab = mod->core_strtab;
3127#endif
3128 unset_module_init_ro_nx(mod);
3129 module_free(mod, mod->module_init);
3130 mod->module_init = NULL;
3131 mod->init_size = 0;
3132 mod->init_ro_size = 0;
3133 mod->init_text_size = 0;
3134 mutex_unlock(&module_mutex);
3135 wake_up_all(&module_wq);
3136
3137 return 0;
3138}
3139
3140static int may_init_module(void)
3141{
3142 if (!capable(CAP_SYS_MODULE) || modules_disabled)
3143 return -EPERM;
3144
3145 return 0;
3146}
3147
3148/*
3149 * We try to place it in the list now to make sure it's unique before
 3150 * we dedicate too many resources; in particular, this avoids exhausting
 3151 * temporary percpu memory.
3152 */
3153static int add_unformed_module(struct module *mod)
3154{
3155 int err;
3156 struct module *old;
3157
3158 mod->state = MODULE_STATE_UNFORMED;
3159
3160again:
3161 mutex_lock(&module_mutex);
3162 if ((old = find_module_all(mod->name, true)) != NULL) {
3163 if (old->state == MODULE_STATE_COMING
3164 || old->state == MODULE_STATE_UNFORMED) {
3165 /* Wait in case it fails to load. */
3166 mutex_unlock(&module_mutex);
3167 err = wait_event_interruptible(module_wq,
3168 finished_loading(mod->name));
3169 if (err)
3170 goto out_unlocked;
3171 goto again;
3172 }
3173 err = -EEXIST;
3174 goto out;
3175 }
3176 list_add_rcu(&mod->list, &modules);
3177 err = 0;
3178
3179out:
3180 mutex_unlock(&module_mutex);
3181out_unlocked:
3182 return err;
3183}
3184
3185static int complete_formation(struct module *mod, struct load_info *info)
3186{
3187 int err;
3188
3189 mutex_lock(&module_mutex);
3190
3191 /* Find duplicate symbols (must be called under lock). */
3192 err = verify_export_symbols(mod);
3193 if (err < 0)
3194 goto out;
3195
3196 /* This relies on module_mutex for list integrity. */
3197 module_bug_finalize(info->hdr, info->sechdrs, mod);
3198
3199 /* Mark state as coming so strong_try_module_get() ignores us,
3200 * but kallsyms etc. can see us. */
3201 mod->state = MODULE_STATE_COMING;
3202
3203out:
3204 mutex_unlock(&module_mutex);
3205 return err;
3206}
3207
2943/* Allocate and load the module: note that size of section 0 is always 3208/* Allocate and load the module: note that size of section 0 is always
2944 zero, and we rely on this for optional sections. */ 3209 zero, and we rely on this for optional sections. */
2945static struct module *load_module(void __user *umod, 3210static int load_module(struct load_info *info, const char __user *uargs,
2946 unsigned long len, 3211 int flags)
2947 const char __user *uargs)
2948{ 3212{
2949 struct load_info info = { NULL, }; 3213 struct module *mod;
2950 struct module *mod, *old;
2951 long err; 3214 long err;
2952 3215
2953 pr_debug("load_module: umod=%p, len=%lu, uargs=%p\n", 3216 err = module_sig_check(info);
2954 umod, len, uargs); 3217 if (err)
3218 goto free_copy;
2955 3219
2956 /* Copy in the blobs from userspace, check they are vaguely sane. */ 3220 err = elf_header_check(info);
2957 err = copy_and_check(&info, umod, len, uargs);
2958 if (err) 3221 if (err)
2959 return ERR_PTR(err); 3222 goto free_copy;
2960 3223
2961 /* Figure out module layout, and allocate all the memory. */ 3224 /* Figure out module layout, and allocate all the memory. */
2962 mod = layout_and_allocate(&info); 3225 mod = layout_and_allocate(info, flags);
2963 if (IS_ERR(mod)) { 3226 if (IS_ERR(mod)) {
2964 err = PTR_ERR(mod); 3227 err = PTR_ERR(mod);
2965 goto free_copy; 3228 goto free_copy;
2966 } 3229 }
2967 3230
3231 /* Reserve our place in the list. */
3232 err = add_unformed_module(mod);
3233 if (err)
3234 goto free_module;
3235
2968#ifdef CONFIG_MODULE_SIG 3236#ifdef CONFIG_MODULE_SIG
2969 mod->sig_ok = info.sig_ok; 3237 mod->sig_ok = info->sig_ok;
2970 if (!mod->sig_ok) 3238 if (!mod->sig_ok) {
2971 add_taint_module(mod, TAINT_FORCED_MODULE); 3239 printk_once(KERN_NOTICE
3240 "%s: module verification failed: signature and/or"
3241 " required key missing - tainting kernel\n",
3242 mod->name);
3243 add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_STILL_OK);
3244 }
2972#endif 3245#endif
2973 3246
2974 /* Now module is in final location, initialize linked lists, etc. */ 3247 /* Now module is in final location, initialize linked lists, etc. */
2975 err = module_unload_init(mod); 3248 err = module_unload_init(mod);
2976 if (err) 3249 if (err)
2977 goto free_module; 3250 goto unlink_mod;
2978 3251
2979 /* Now we've got everything in the final locations, we can 3252 /* Now we've got everything in the final locations, we can
2980 * find optional sections. */ 3253 * find optional sections. */
2981 find_module_sections(mod, &info); 3254 find_module_sections(mod, info);
2982 3255
2983 err = check_module_license_and_versions(mod); 3256 err = check_module_license_and_versions(mod);
2984 if (err) 3257 if (err)
2985 goto free_unload; 3258 goto free_unload;
2986 3259
2987 /* Set up MODINFO_ATTR fields */ 3260 /* Set up MODINFO_ATTR fields */
2988 setup_modinfo(mod, &info); 3261 setup_modinfo(mod, info);
2989 3262
2990 /* Fix up syms, so that st_value is a pointer to location. */ 3263 /* Fix up syms, so that st_value is a pointer to location. */
2991 err = simplify_symbols(mod, &info); 3264 err = simplify_symbols(mod, info);
2992 if (err < 0) 3265 if (err < 0)
2993 goto free_modinfo; 3266 goto free_modinfo;
2994 3267
2995 err = apply_relocations(mod, &info); 3268 err = apply_relocations(mod, info);
2996 if (err < 0) 3269 if (err < 0)
2997 goto free_modinfo; 3270 goto free_modinfo;
2998 3271
2999 err = post_relocation(mod, &info); 3272 err = post_relocation(mod, info);
3000 if (err < 0) 3273 if (err < 0)
3001 goto free_modinfo; 3274 goto free_modinfo;
3002 3275
@@ -3009,72 +3282,39 @@ static struct module *load_module(void __user *umod,
3009 goto free_arch_cleanup; 3282 goto free_arch_cleanup;
3010 } 3283 }
3011 3284
3012 /* Mark state as coming so strong_try_module_get() ignores us. */ 3285 dynamic_debug_setup(info->debug, info->num_debug);
3013 mod->state = MODULE_STATE_COMING;
3014 3286
3015 /* Now sew it into the lists so we can get lockdep and oops 3287 /* Finally it's fully formed, ready to start executing. */
3016 * info during argument parsing. No one should access us, since 3288 err = complete_formation(mod, info);
3017 * strong_try_module_get() will fail. 3289 if (err)
3018 * lockdep/oops can run asynchronous, so use the RCU list insertion 3290 goto ddebug_cleanup;
3019 * function to insert in a way safe to concurrent readers.
3020 * The mutex protects against concurrent writers.
3021 */
3022again:
3023 mutex_lock(&module_mutex);
3024 if ((old = find_module(mod->name)) != NULL) {
3025 if (old->state == MODULE_STATE_COMING) {
3026 /* Wait in case it fails to load. */
3027 mutex_unlock(&module_mutex);
3028 err = wait_event_interruptible(module_wq,
3029 finished_loading(mod->name));
3030 if (err)
3031 goto free_arch_cleanup;
3032 goto again;
3033 }
3034 err = -EEXIST;
3035 goto unlock;
3036 }
3037
3038 /* This has to be done once we're sure module name is unique. */
3039 dynamic_debug_setup(info.debug, info.num_debug);
3040
3041 /* Find duplicate symbols */
3042 err = verify_export_symbols(mod);
3043 if (err < 0)
3044 goto ddebug;
3045
3046 module_bug_finalize(info.hdr, info.sechdrs, mod);
3047 list_add_rcu(&mod->list, &modules);
3048 mutex_unlock(&module_mutex);
3049 3291
3050 /* Module is ready to execute: parsing args may do that. */ 3292 /* Module is ready to execute: parsing args may do that. */
3051 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, 3293 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp,
3052 -32768, 32767, &ddebug_dyndbg_module_param_cb); 3294 -32768, 32767, &ddebug_dyndbg_module_param_cb);
3053 if (err < 0) 3295 if (err < 0)
3054 goto unlink; 3296 goto bug_cleanup;
3055 3297
 3056 /* Link in to sysfs. */ 3298 /* Link in to sysfs. */
3057 err = mod_sysfs_setup(mod, &info, mod->kp, mod->num_kp); 3299 err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp);
3058 if (err < 0) 3300 if (err < 0)
3059 goto unlink; 3301 goto bug_cleanup;
3060 3302
3061 /* Get rid of temporary copy. */ 3303 /* Get rid of temporary copy. */
3062 free_copy(&info); 3304 free_copy(info);
3063 3305
3064 /* Done! */ 3306 /* Done! */
3065 trace_module_load(mod); 3307 trace_module_load(mod);
3066 return mod;
3067 3308
3068 unlink: 3309 return do_init_module(mod);
3310
3311 bug_cleanup:
3312 /* module_bug_cleanup needs module_mutex protection */
3069 mutex_lock(&module_mutex); 3313 mutex_lock(&module_mutex);
3070 /* Unlink carefully: kallsyms could be walking list. */
3071 list_del_rcu(&mod->list);
3072 module_bug_cleanup(mod); 3314 module_bug_cleanup(mod);
3073 wake_up_all(&module_wq);
3074 ddebug:
3075 dynamic_debug_remove(info.debug);
3076 unlock:
3077 mutex_unlock(&module_mutex); 3315 mutex_unlock(&module_mutex);
3316 ddebug_cleanup:
3317 dynamic_debug_remove(info->debug);
3078 synchronize_sched(); 3318 synchronize_sched();
3079 kfree(mod->args); 3319 kfree(mod->args);
3080 free_arch_cleanup: 3320 free_arch_cleanup:
@@ -3083,107 +3323,59 @@ again:
3083 free_modinfo(mod); 3323 free_modinfo(mod);
3084 free_unload: 3324 free_unload:
3085 module_unload_free(mod); 3325 module_unload_free(mod);
3326 unlink_mod:
3327 mutex_lock(&module_mutex);
3328 /* Unlink carefully: kallsyms could be walking list. */
3329 list_del_rcu(&mod->list);
3330 wake_up_all(&module_wq);
3331 mutex_unlock(&module_mutex);
3086 free_module: 3332 free_module:
3087 module_deallocate(mod, &info); 3333 module_deallocate(mod, info);
3088 free_copy: 3334 free_copy:
3089 free_copy(&info); 3335 free_copy(info);
3090 return ERR_PTR(err); 3336 return err;
3091}
3092
3093/* Call module constructors. */
3094static void do_mod_ctors(struct module *mod)
3095{
3096#ifdef CONFIG_CONSTRUCTORS
3097 unsigned long i;
3098
3099 for (i = 0; i < mod->num_ctors; i++)
3100 mod->ctors[i]();
3101#endif
3102} 3337}
3103 3338
3104/* This is where the real work happens */
3105SYSCALL_DEFINE3(init_module, void __user *, umod, 3339SYSCALL_DEFINE3(init_module, void __user *, umod,
3106 unsigned long, len, const char __user *, uargs) 3340 unsigned long, len, const char __user *, uargs)
3107{ 3341{
3108 struct module *mod; 3342 int err;
3109 int ret = 0; 3343 struct load_info info = { };
3110 3344
3111 /* Must have permission */ 3345 err = may_init_module();
3112 if (!capable(CAP_SYS_MODULE) || modules_disabled) 3346 if (err)
3113 return -EPERM; 3347 return err;
3114 3348
3115 /* Do all the hard work */ 3349 pr_debug("init_module: umod=%p, len=%lu, uargs=%p\n",
3116 mod = load_module(umod, len, uargs); 3350 umod, len, uargs);
3117 if (IS_ERR(mod))
3118 return PTR_ERR(mod);
3119 3351
3120 blocking_notifier_call_chain(&module_notify_list, 3352 err = copy_module_from_user(umod, len, &info);
3121 MODULE_STATE_COMING, mod); 3353 if (err)
3354 return err;
3122 3355
3123 /* Set RO and NX regions for core */ 3356 return load_module(&info, uargs, 0);
3124 set_section_ro_nx(mod->module_core, 3357}
3125 mod->core_text_size,
3126 mod->core_ro_size,
3127 mod->core_size);
3128 3358
3129 /* Set RO and NX regions for init */ 3359SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags)
3130 set_section_ro_nx(mod->module_init, 3360{
3131 mod->init_text_size, 3361 int err;
3132 mod->init_ro_size, 3362 struct load_info info = { };
3133 mod->init_size);
3134 3363
3135 do_mod_ctors(mod); 3364 err = may_init_module();
3136 /* Start the module */ 3365 if (err)
3137 if (mod->init != NULL) 3366 return err;
3138 ret = do_one_initcall(mod->init);
3139 if (ret < 0) {
3140 /* Init routine failed: abort. Try to protect us from
3141 buggy refcounters. */
3142 mod->state = MODULE_STATE_GOING;
3143 synchronize_sched();
3144 module_put(mod);
3145 blocking_notifier_call_chain(&module_notify_list,
3146 MODULE_STATE_GOING, mod);
3147 free_module(mod);
3148 wake_up_all(&module_wq);
3149 return ret;
3150 }
3151 if (ret > 0) {
3152 printk(KERN_WARNING
3153"%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n"
3154"%s: loading module anyway...\n",
3155 __func__, mod->name, ret,
3156 __func__);
3157 dump_stack();
3158 }
3159 3367
3160 /* Now it's a first class citizen! */ 3368 pr_debug("finit_module: fd=%d, uargs=%p, flags=%i\n", fd, uargs, flags);
3161 mod->state = MODULE_STATE_LIVE;
3162 blocking_notifier_call_chain(&module_notify_list,
3163 MODULE_STATE_LIVE, mod);
3164 3369
3165 /* We need to finish all async code before the module init sequence is done */ 3370 if (flags & ~(MODULE_INIT_IGNORE_MODVERSIONS
3166 async_synchronize_full(); 3371 |MODULE_INIT_IGNORE_VERMAGIC))
3372 return -EINVAL;
3167 3373
3168 mutex_lock(&module_mutex); 3374 err = copy_module_from_fd(fd, &info);
3169 /* Drop initial reference. */ 3375 if (err)
3170 module_put(mod); 3376 return err;
3171 trim_init_extable(mod);
3172#ifdef CONFIG_KALLSYMS
3173 mod->num_symtab = mod->core_num_syms;
3174 mod->symtab = mod->core_symtab;
3175 mod->strtab = mod->core_strtab;
3176#endif
3177 unset_module_init_ro_nx(mod);
3178 module_free(mod, mod->module_init);
3179 mod->module_init = NULL;
3180 mod->init_size = 0;
3181 mod->init_ro_size = 0;
3182 mod->init_text_size = 0;
3183 mutex_unlock(&module_mutex);
3184 wake_up_all(&module_wq);
3185 3377
3186 return 0; 3378 return load_module(&info, uargs, flags);
3187} 3379}
3188 3380
3189static inline int within(unsigned long addr, void *start, unsigned long size) 3381static inline int within(unsigned long addr, void *start, unsigned long size)
@@ -3259,6 +3451,8 @@ const char *module_address_lookup(unsigned long addr,
3259 3451
3260 preempt_disable(); 3452 preempt_disable();
3261 list_for_each_entry_rcu(mod, &modules, list) { 3453 list_for_each_entry_rcu(mod, &modules, list) {
3454 if (mod->state == MODULE_STATE_UNFORMED)
3455 continue;
3262 if (within_module_init(addr, mod) || 3456 if (within_module_init(addr, mod) ||
3263 within_module_core(addr, mod)) { 3457 within_module_core(addr, mod)) {
3264 if (modname) 3458 if (modname)
@@ -3282,6 +3476,8 @@ int lookup_module_symbol_name(unsigned long addr, char *symname)
3282 3476
3283 preempt_disable(); 3477 preempt_disable();
3284 list_for_each_entry_rcu(mod, &modules, list) { 3478 list_for_each_entry_rcu(mod, &modules, list) {
3479 if (mod->state == MODULE_STATE_UNFORMED)
3480 continue;
3285 if (within_module_init(addr, mod) || 3481 if (within_module_init(addr, mod) ||
3286 within_module_core(addr, mod)) { 3482 within_module_core(addr, mod)) {
3287 const char *sym; 3483 const char *sym;
@@ -3306,6 +3502,8 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size,
3306 3502
3307 preempt_disable(); 3503 preempt_disable();
3308 list_for_each_entry_rcu(mod, &modules, list) { 3504 list_for_each_entry_rcu(mod, &modules, list) {
3505 if (mod->state == MODULE_STATE_UNFORMED)
3506 continue;
3309 if (within_module_init(addr, mod) || 3507 if (within_module_init(addr, mod) ||
3310 within_module_core(addr, mod)) { 3508 within_module_core(addr, mod)) {
3311 const char *sym; 3509 const char *sym;
@@ -3333,6 +3531,8 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
3333 3531
3334 preempt_disable(); 3532 preempt_disable();
3335 list_for_each_entry_rcu(mod, &modules, list) { 3533 list_for_each_entry_rcu(mod, &modules, list) {
3534 if (mod->state == MODULE_STATE_UNFORMED)
3535 continue;
3336 if (symnum < mod->num_symtab) { 3536 if (symnum < mod->num_symtab) {
3337 *value = mod->symtab[symnum].st_value; 3537 *value = mod->symtab[symnum].st_value;
3338 *type = mod->symtab[symnum].st_info; 3538 *type = mod->symtab[symnum].st_info;
@@ -3375,9 +3575,12 @@ unsigned long module_kallsyms_lookup_name(const char *name)
3375 ret = mod_find_symname(mod, colon+1); 3575 ret = mod_find_symname(mod, colon+1);
3376 *colon = ':'; 3576 *colon = ':';
3377 } else { 3577 } else {
3378 list_for_each_entry_rcu(mod, &modules, list) 3578 list_for_each_entry_rcu(mod, &modules, list) {
3579 if (mod->state == MODULE_STATE_UNFORMED)
3580 continue;
3379 if ((ret = mod_find_symname(mod, name)) != 0) 3581 if ((ret = mod_find_symname(mod, name)) != 0)
3380 break; 3582 break;
3583 }
3381 } 3584 }
3382 preempt_enable(); 3585 preempt_enable();
3383 return ret; 3586 return ret;
@@ -3392,6 +3595,8 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
3392 int ret; 3595 int ret;
3393 3596
3394 list_for_each_entry(mod, &modules, list) { 3597 list_for_each_entry(mod, &modules, list) {
3598 if (mod->state == MODULE_STATE_UNFORMED)
3599 continue;
3395 for (i = 0; i < mod->num_symtab; i++) { 3600 for (i = 0; i < mod->num_symtab; i++) {
3396 ret = fn(data, mod->strtab + mod->symtab[i].st_name, 3601 ret = fn(data, mod->strtab + mod->symtab[i].st_name,
3397 mod, mod->symtab[i].st_value); 3602 mod, mod->symtab[i].st_value);
@@ -3407,6 +3612,7 @@ static char *module_flags(struct module *mod, char *buf)
3407{ 3612{
3408 int bx = 0; 3613 int bx = 0;
3409 3614
3615 BUG_ON(mod->state == MODULE_STATE_UNFORMED);
3410 if (mod->taints || 3616 if (mod->taints ||
3411 mod->state == MODULE_STATE_GOING || 3617 mod->state == MODULE_STATE_GOING ||
3412 mod->state == MODULE_STATE_COMING) { 3618 mod->state == MODULE_STATE_COMING) {
@@ -3448,6 +3654,10 @@ static int m_show(struct seq_file *m, void *p)
3448 struct module *mod = list_entry(p, struct module, list); 3654 struct module *mod = list_entry(p, struct module, list);
3449 char buf[8]; 3655 char buf[8];
3450 3656
3657 /* We always ignore unformed modules. */
3658 if (mod->state == MODULE_STATE_UNFORMED)
3659 return 0;
3660
3451 seq_printf(m, "%s %u", 3661 seq_printf(m, "%s %u",
3452 mod->name, mod->init_size + mod->core_size); 3662 mod->name, mod->init_size + mod->core_size);
3453 print_unload_info(m, mod); 3663 print_unload_info(m, mod);
@@ -3508,6 +3718,8 @@ const struct exception_table_entry *search_module_extables(unsigned long addr)
3508 3718
3509 preempt_disable(); 3719 preempt_disable();
3510 list_for_each_entry_rcu(mod, &modules, list) { 3720 list_for_each_entry_rcu(mod, &modules, list) {
3721 if (mod->state == MODULE_STATE_UNFORMED)
3722 continue;
3511 if (mod->num_exentries == 0) 3723 if (mod->num_exentries == 0)
3512 continue; 3724 continue;
3513 3725
@@ -3556,10 +3768,13 @@ struct module *__module_address(unsigned long addr)
3556 if (addr < module_addr_min || addr > module_addr_max) 3768 if (addr < module_addr_min || addr > module_addr_max)
3557 return NULL; 3769 return NULL;
3558 3770
3559 list_for_each_entry_rcu(mod, &modules, list) 3771 list_for_each_entry_rcu(mod, &modules, list) {
3772 if (mod->state == MODULE_STATE_UNFORMED)
3773 continue;
3560 if (within_module_core(addr, mod) 3774 if (within_module_core(addr, mod)
3561 || within_module_init(addr, mod)) 3775 || within_module_init(addr, mod))
3562 return mod; 3776 return mod;
3777 }
3563 return NULL; 3778 return NULL;
3564} 3779}
3565EXPORT_SYMBOL_GPL(__module_address); 3780EXPORT_SYMBOL_GPL(__module_address);
@@ -3612,8 +3827,11 @@ void print_modules(void)
3612 printk(KERN_DEFAULT "Modules linked in:"); 3827 printk(KERN_DEFAULT "Modules linked in:");
3613 /* Most callers should already have preempt disabled, but make sure */ 3828 /* Most callers should already have preempt disabled, but make sure */
3614 preempt_disable(); 3829 preempt_disable();
3615 list_for_each_entry_rcu(mod, &modules, list) 3830 list_for_each_entry_rcu(mod, &modules, list) {
3831 if (mod->state == MODULE_STATE_UNFORMED)
3832 continue;
3616 printk(" %s%s", mod->name, module_flags(mod, buf)); 3833 printk(" %s%s", mod->name, module_flags(mod, buf));
3834 }
3617 preempt_enable(); 3835 preempt_enable();
3618 if (last_unloaded_module[0]) 3836 if (last_unloaded_module[0])
3619 printk(" [last unloaded: %s]", last_unloaded_module); 3837 printk(" [last unloaded: %s]", last_unloaded_module);
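The reworked loader above is what backs the new finit_module(2) system call: it takes an already-open file descriptor, a parameter string and MODULE_INIT_* flags instead of a copied-in image. A minimal userspace sketch of calling it; the module path is hypothetical, and flags stay 0 so the usual vermagic and modversion checks still apply:

/* Hedged sketch: load a module through finit_module(2).
 * Needs CAP_SYS_MODULE, a 3.8+ kernel and headers that define
 * __NR_finit_module; MODULE_INIT_IGNORE_VERMAGIC could be passed
 * in the last argument to relax the vermagic check. */
#include <fcntl.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        const char *path = argc > 1 ? argv[1] : "example.ko";   /* hypothetical */
        int fd = open(path, O_RDONLY | O_CLOEXEC);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        /* Empty string = no module parameters, 0 = no MODULE_INIT_* flags. */
        if (syscall(__NR_finit_module, fd, "", 0) != 0) {
                perror("finit_module");
                close(fd);
                return 1;
        }
        close(fd);
        return 0;
}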
diff --git a/kernel/module_signing.c b/kernel/module_signing.c
index ea1b1df5dbb0..f2970bddc5ea 100644
--- a/kernel/module_signing.c
+++ b/kernel/module_signing.c
@@ -27,13 +27,13 @@
27 * - Information block 27 * - Information block
28 */ 28 */
29struct module_signature { 29struct module_signature {
30 enum pkey_algo algo : 8; /* Public-key crypto algorithm */ 30 u8 algo; /* Public-key crypto algorithm [enum pkey_algo] */
31 enum pkey_hash_algo hash : 8; /* Digest algorithm */ 31 u8 hash; /* Digest algorithm [enum pkey_hash_algo] */
32 enum pkey_id_type id_type : 8; /* Key identifier type */ 32 u8 id_type; /* Key identifier type [enum pkey_id_type] */
33 u8 signer_len; /* Length of signer's name */ 33 u8 signer_len; /* Length of signer's name */
34 u8 key_id_len; /* Length of key identifier */ 34 u8 key_id_len; /* Length of key identifier */
35 u8 __pad[3]; 35 u8 __pad[3];
36 __be32 sig_len; /* Length of signature data */ 36 __be32 sig_len; /* Length of signature data */
37}; 37};
38 38
39/* 39/*
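Replacing the 8-bit enum bitfields with plain u8 pins down the layout of the signature trailer appended to signed modules, which must not depend on how a particular compiler represents enum bitfields. A compile-checked illustration of the idea, using made-up names rather than the kernel's own struct:

/* Illustrative only: fixed-width fields give a guaranteed 12-byte layout. */
#include <stdint.h>
#include <stdio.h>

struct sig_trailer_example {
        uint8_t  algo;          /* public-key algorithm id */
        uint8_t  hash;          /* digest algorithm id */
        uint8_t  id_type;       /* key identifier type */
        uint8_t  signer_len;    /* length of signer's name */
        uint8_t  key_id_len;    /* length of key identifier */
        uint8_t  pad[3];
        uint32_t sig_len;       /* signature length, big-endian on disk */
};

_Static_assert(sizeof(struct sig_trailer_example) == 12,
               "trailer must stay 12 bytes regardless of compiler");

int main(void)
{
        printf("trailer size: %zu bytes\n", sizeof(struct sig_trailer_example));
        return 0;
}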
diff --git a/kernel/mutex.c b/kernel/mutex.c
index a307cc9c9526..52f23011b6e0 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -19,6 +19,7 @@
19 */ 19 */
20#include <linux/mutex.h> 20#include <linux/mutex.h>
21#include <linux/sched.h> 21#include <linux/sched.h>
22#include <linux/sched/rt.h>
22#include <linux/export.h> 23#include <linux/export.h>
23#include <linux/spinlock.h> 24#include <linux/spinlock.h>
24#include <linux/interrupt.h> 25#include <linux/interrupt.h>
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index b576f7f14bc6..afc0456f227a 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -57,7 +57,8 @@ static inline struct nsproxy *create_nsproxy(void)
57 * leave it to the caller to do proper locking and attach it to task. 57 * leave it to the caller to do proper locking and attach it to task.
58 */ 58 */
59static struct nsproxy *create_new_namespaces(unsigned long flags, 59static struct nsproxy *create_new_namespaces(unsigned long flags,
60 struct task_struct *tsk, struct fs_struct *new_fs) 60 struct task_struct *tsk, struct user_namespace *user_ns,
61 struct fs_struct *new_fs)
61{ 62{
62 struct nsproxy *new_nsp; 63 struct nsproxy *new_nsp;
63 int err; 64 int err;
@@ -66,31 +67,31 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
66 if (!new_nsp) 67 if (!new_nsp)
67 return ERR_PTR(-ENOMEM); 68 return ERR_PTR(-ENOMEM);
68 69
69 new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, new_fs); 70 new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, user_ns, new_fs);
70 if (IS_ERR(new_nsp->mnt_ns)) { 71 if (IS_ERR(new_nsp->mnt_ns)) {
71 err = PTR_ERR(new_nsp->mnt_ns); 72 err = PTR_ERR(new_nsp->mnt_ns);
72 goto out_ns; 73 goto out_ns;
73 } 74 }
74 75
75 new_nsp->uts_ns = copy_utsname(flags, tsk); 76 new_nsp->uts_ns = copy_utsname(flags, user_ns, tsk->nsproxy->uts_ns);
76 if (IS_ERR(new_nsp->uts_ns)) { 77 if (IS_ERR(new_nsp->uts_ns)) {
77 err = PTR_ERR(new_nsp->uts_ns); 78 err = PTR_ERR(new_nsp->uts_ns);
78 goto out_uts; 79 goto out_uts;
79 } 80 }
80 81
81 new_nsp->ipc_ns = copy_ipcs(flags, tsk); 82 new_nsp->ipc_ns = copy_ipcs(flags, user_ns, tsk->nsproxy->ipc_ns);
82 if (IS_ERR(new_nsp->ipc_ns)) { 83 if (IS_ERR(new_nsp->ipc_ns)) {
83 err = PTR_ERR(new_nsp->ipc_ns); 84 err = PTR_ERR(new_nsp->ipc_ns);
84 goto out_ipc; 85 goto out_ipc;
85 } 86 }
86 87
87 new_nsp->pid_ns = copy_pid_ns(flags, task_active_pid_ns(tsk)); 88 new_nsp->pid_ns = copy_pid_ns(flags, user_ns, tsk->nsproxy->pid_ns);
88 if (IS_ERR(new_nsp->pid_ns)) { 89 if (IS_ERR(new_nsp->pid_ns)) {
89 err = PTR_ERR(new_nsp->pid_ns); 90 err = PTR_ERR(new_nsp->pid_ns);
90 goto out_pid; 91 goto out_pid;
91 } 92 }
92 93
93 new_nsp->net_ns = copy_net_ns(flags, tsk->nsproxy->net_ns); 94 new_nsp->net_ns = copy_net_ns(flags, user_ns, tsk->nsproxy->net_ns);
94 if (IS_ERR(new_nsp->net_ns)) { 95 if (IS_ERR(new_nsp->net_ns)) {
95 err = PTR_ERR(new_nsp->net_ns); 96 err = PTR_ERR(new_nsp->net_ns);
96 goto out_net; 97 goto out_net;
@@ -122,6 +123,7 @@ out_ns:
122int copy_namespaces(unsigned long flags, struct task_struct *tsk) 123int copy_namespaces(unsigned long flags, struct task_struct *tsk)
123{ 124{
124 struct nsproxy *old_ns = tsk->nsproxy; 125 struct nsproxy *old_ns = tsk->nsproxy;
126 struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns);
125 struct nsproxy *new_ns; 127 struct nsproxy *new_ns;
126 int err = 0; 128 int err = 0;
127 129
@@ -134,7 +136,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
134 CLONE_NEWPID | CLONE_NEWNET))) 136 CLONE_NEWPID | CLONE_NEWNET)))
135 return 0; 137 return 0;
136 138
137 if (!capable(CAP_SYS_ADMIN)) { 139 if (!ns_capable(user_ns, CAP_SYS_ADMIN)) {
138 err = -EPERM; 140 err = -EPERM;
139 goto out; 141 goto out;
140 } 142 }
@@ -151,7 +153,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
151 goto out; 153 goto out;
152 } 154 }
153 155
154 new_ns = create_new_namespaces(flags, tsk, tsk->fs); 156 new_ns = create_new_namespaces(flags, tsk, user_ns, tsk->fs);
155 if (IS_ERR(new_ns)) { 157 if (IS_ERR(new_ns)) {
156 err = PTR_ERR(new_ns); 158 err = PTR_ERR(new_ns);
157 goto out; 159 goto out;
@@ -183,19 +185,21 @@ void free_nsproxy(struct nsproxy *ns)
183 * On success, returns the new nsproxy. 185 * On success, returns the new nsproxy.
184 */ 186 */
185int unshare_nsproxy_namespaces(unsigned long unshare_flags, 187int unshare_nsproxy_namespaces(unsigned long unshare_flags,
186 struct nsproxy **new_nsp, struct fs_struct *new_fs) 188 struct nsproxy **new_nsp, struct cred *new_cred, struct fs_struct *new_fs)
187{ 189{
190 struct user_namespace *user_ns;
188 int err = 0; 191 int err = 0;
189 192
190 if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | 193 if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
191 CLONE_NEWNET))) 194 CLONE_NEWNET | CLONE_NEWPID)))
192 return 0; 195 return 0;
193 196
194 if (!capable(CAP_SYS_ADMIN)) 197 user_ns = new_cred ? new_cred->user_ns : current_user_ns();
198 if (!ns_capable(user_ns, CAP_SYS_ADMIN))
195 return -EPERM; 199 return -EPERM;
196 200
197 *new_nsp = create_new_namespaces(unshare_flags, current, 201 *new_nsp = create_new_namespaces(unshare_flags, current, user_ns,
198 new_fs ? new_fs : current->fs); 202 new_fs ? new_fs : current->fs);
199 if (IS_ERR(*new_nsp)) { 203 if (IS_ERR(*new_nsp)) {
200 err = PTR_ERR(*new_nsp); 204 err = PTR_ERR(*new_nsp);
201 goto out; 205 goto out;
@@ -241,20 +245,17 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
241 struct file *file; 245 struct file *file;
242 int err; 246 int err;
243 247
244 if (!capable(CAP_SYS_ADMIN))
245 return -EPERM;
246
247 file = proc_ns_fget(fd); 248 file = proc_ns_fget(fd);
248 if (IS_ERR(file)) 249 if (IS_ERR(file))
249 return PTR_ERR(file); 250 return PTR_ERR(file);
250 251
251 err = -EINVAL; 252 err = -EINVAL;
252 ei = PROC_I(file->f_dentry->d_inode); 253 ei = PROC_I(file_inode(file));
253 ops = ei->ns_ops; 254 ops = ei->ns_ops;
254 if (nstype && (ops->type != nstype)) 255 if (nstype && (ops->type != nstype))
255 goto out; 256 goto out;
256 257
257 new_nsproxy = create_new_namespaces(0, tsk, tsk->fs); 258 new_nsproxy = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs);
258 if (IS_ERR(new_nsproxy)) { 259 if (IS_ERR(new_nsproxy)) {
259 err = PTR_ERR(new_nsproxy); 260 err = PTR_ERR(new_nsproxy);
260 goto out; 261 goto out;
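With CLONE_NEWPID now accepted by unshare_nsproxy_namespaces() and the capability checks done against the owning user namespace, a new pid namespace can be split off from userspace. A hedged sketch; it needs CAP_SYS_ADMIN, and the caller itself stays in its old pid namespace while the first child forked afterwards becomes pid 1 of the new one:

/* Hedged sketch: unshare the pid namespace, then fork the new init. */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
        if (unshare(CLONE_NEWPID) != 0) {
                perror("unshare");
                return 1;
        }

        pid_t child = fork();
        if (child == 0) {
                /* Inside the new namespace this prints 1. */
                printf("child sees itself as pid %d\n", getpid());
                _exit(0);
        }
        printf("parent sees the child as pid %d\n", child);
        waitpid(child, NULL, 0);
        return 0;
}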
diff --git a/kernel/padata.c b/kernel/padata.c
index 89fe3d1b9efb..072f4ee4eb89 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -171,7 +171,7 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd)
171{ 171{
172 int cpu, num_cpus; 172 int cpu, num_cpus;
173 unsigned int next_nr, next_index; 173 unsigned int next_nr, next_index;
174 struct padata_parallel_queue *queue, *next_queue; 174 struct padata_parallel_queue *next_queue;
175 struct padata_priv *padata; 175 struct padata_priv *padata;
176 struct padata_list *reorder; 176 struct padata_list *reorder;
177 177
@@ -204,8 +204,7 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd)
204 goto out; 204 goto out;
205 } 205 }
206 206
207 queue = per_cpu_ptr(pd->pqueue, smp_processor_id()); 207 if (__this_cpu_read(pd->pqueue->cpu_index) == next_queue->cpu_index) {
208 if (queue->cpu_index == next_queue->cpu_index) {
209 padata = ERR_PTR(-ENODATA); 208 padata = ERR_PTR(-ENODATA);
210 goto out; 209 goto out;
211 } 210 }
diff --git a/kernel/panic.c b/kernel/panic.c
index e1b2822fff97..7c57cc9eee2c 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -259,26 +259,19 @@ unsigned long get_taint(void)
259 return tainted_mask; 259 return tainted_mask;
260} 260}
261 261
262void add_taint(unsigned flag) 262/**
263 * add_taint: add a taint flag if not already set.
264 * @flag: one of the TAINT_* constants.
265 * @lockdep_ok: whether lock debugging is still OK.
266 *
 267 * If something bad has gone wrong, you'll want @lockdep_ok = LOCKDEP_NOW_UNRELIABLE,
 268 * but for some noteworthy-but-not-corrupting cases it can be LOCKDEP_STILL_OK.
269 */
270void add_taint(unsigned flag, enum lockdep_ok lockdep_ok)
263{ 271{
264 /* 272 if (lockdep_ok == LOCKDEP_NOW_UNRELIABLE && __debug_locks_off())
265 * Can't trust the integrity of the kernel anymore. 273 printk(KERN_WARNING
266 * We don't call directly debug_locks_off() because the issue 274 "Disabling lock debugging due to kernel taint\n");
267 * is not necessarily serious enough to set oops_in_progress to 1
268 * Also we want to keep up lockdep for staging/out-of-tree
269 * development and post-warning case.
270 */
271 switch (flag) {
272 case TAINT_CRAP:
273 case TAINT_OOT_MODULE:
274 case TAINT_WARN:
275 case TAINT_FIRMWARE_WORKAROUND:
276 break;
277
278 default:
279 if (__debug_locks_off())
280 printk(KERN_WARNING "Disabling lock debugging due to kernel taint\n");
281 }
282 275
283 set_bit(flag, &tainted_mask); 276 set_bit(flag, &tainted_mask);
284} 277}
@@ -421,7 +414,8 @@ static void warn_slowpath_common(const char *file, int line, void *caller,
421 print_modules(); 414 print_modules();
422 dump_stack(); 415 dump_stack();
423 print_oops_end_marker(); 416 print_oops_end_marker();
424 add_taint(taint); 417 /* Just a warning, don't kill lockdep. */
418 add_taint(taint, LOCKDEP_STILL_OK);
425} 419}
426 420
427void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...) 421void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...)
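The add_taint() rework replaces the callee-side switch on the taint flag with an explicit lockdep_ok argument, so every call site states whether lock debugging can still be trusted. A standalone, deliberately simplified illustration of that calling convention; the names and flag values are illustrative, not the kernel's:

/* Simplified model of the new convention: callers pass intent explicitly. */
#include <stdbool.h>
#include <stdio.h>

enum lockdep_ok_demo { DEMO_LOCKDEP_STILL_OK, DEMO_LOCKDEP_NOW_UNRELIABLE };

static unsigned long tainted_mask;
static bool lock_debugging = true;

static void add_taint_demo(unsigned int flag, enum lockdep_ok_demo ok)
{
        if (ok == DEMO_LOCKDEP_NOW_UNRELIABLE && lock_debugging) {
                lock_debugging = false;
                printf("Disabling lock debugging due to kernel taint\n");
        }
        tainted_mask |= 1UL << flag;
}

int main(void)
{
        add_taint_demo(9, DEMO_LOCKDEP_STILL_OK);          /* a WARN-style taint */
        add_taint_demo(0, DEMO_LOCKDEP_NOW_UNRELIABLE);    /* a proprietary-module-style taint */
        printf("taint mask %#lx, lock debugging %s\n",
               tainted_mask, lock_debugging ? "on" : "off");
        return 0;
}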
diff --git a/kernel/pid.c b/kernel/pid.c
index aebd4f5aaf41..047dc6264638 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -1,8 +1,8 @@
1/* 1/*
2 * Generic pidhash and scalable, time-bounded PID allocator 2 * Generic pidhash and scalable, time-bounded PID allocator
3 * 3 *
4 * (C) 2002-2003 William Irwin, IBM 4 * (C) 2002-2003 Nadia Yvette Chambers, IBM
5 * (C) 2004 William Irwin, Oracle 5 * (C) 2004 Nadia Yvette Chambers, Oracle
6 * (C) 2002-2004 Ingo Molnar, Red Hat 6 * (C) 2002-2004 Ingo Molnar, Red Hat
7 * 7 *
8 * pid-structures are backing objects for tasks sharing a given ID to chain 8 * pid-structures are backing objects for tasks sharing a given ID to chain
@@ -36,6 +36,7 @@
36#include <linux/pid_namespace.h> 36#include <linux/pid_namespace.h>
37#include <linux/init_task.h> 37#include <linux/init_task.h>
38#include <linux/syscalls.h> 38#include <linux/syscalls.h>
39#include <linux/proc_fs.h>
39 40
40#define pid_hashfn(nr, ns) \ 41#define pid_hashfn(nr, ns) \
41 hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) 42 hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
@@ -78,24 +79,11 @@ struct pid_namespace init_pid_ns = {
78 .last_pid = 0, 79 .last_pid = 0,
79 .level = 0, 80 .level = 0,
80 .child_reaper = &init_task, 81 .child_reaper = &init_task,
82 .user_ns = &init_user_ns,
83 .proc_inum = PROC_PID_INIT_INO,
81}; 84};
82EXPORT_SYMBOL_GPL(init_pid_ns); 85EXPORT_SYMBOL_GPL(init_pid_ns);
83 86
84int is_container_init(struct task_struct *tsk)
85{
86 int ret = 0;
87 struct pid *pid;
88
89 rcu_read_lock();
90 pid = task_pid(tsk);
91 if (pid != NULL && pid->numbers[pid->level].nr == 1)
92 ret = 1;
93 rcu_read_unlock();
94
95 return ret;
96}
97EXPORT_SYMBOL(is_container_init);
98
99/* 87/*
100 * Note: disable interrupts while the pidmap_lock is held as an 88 * Note: disable interrupts while the pidmap_lock is held as an
101 * interrupt might come in and do read_lock(&tasklist_lock). 89 * interrupt might come in and do read_lock(&tasklist_lock).
@@ -269,8 +257,23 @@ void free_pid(struct pid *pid)
269 unsigned long flags; 257 unsigned long flags;
270 258
271 spin_lock_irqsave(&pidmap_lock, flags); 259 spin_lock_irqsave(&pidmap_lock, flags);
272 for (i = 0; i <= pid->level; i++) 260 for (i = 0; i <= pid->level; i++) {
273 hlist_del_rcu(&pid->numbers[i].pid_chain); 261 struct upid *upid = pid->numbers + i;
262 struct pid_namespace *ns = upid->ns;
263 hlist_del_rcu(&upid->pid_chain);
264 switch(--ns->nr_hashed) {
265 case 1:
266 /* When all that is left in the pid namespace
267 * is the reaper wake up the reaper. The reaper
268 * may be sleeping in zap_pid_ns_processes().
269 */
270 wake_up_process(ns->child_reaper);
271 break;
272 case 0:
273 schedule_work(&ns->proc_work);
274 break;
275 }
276 }
274 spin_unlock_irqrestore(&pidmap_lock, flags); 277 spin_unlock_irqrestore(&pidmap_lock, flags);
275 278
276 for (i = 0; i <= pid->level; i++) 279 for (i = 0; i <= pid->level; i++)
@@ -292,6 +295,7 @@ struct pid *alloc_pid(struct pid_namespace *ns)
292 goto out; 295 goto out;
293 296
294 tmp = ns; 297 tmp = ns;
298 pid->level = ns->level;
295 for (i = ns->level; i >= 0; i--) { 299 for (i = ns->level; i >= 0; i--) {
296 nr = alloc_pidmap(tmp); 300 nr = alloc_pidmap(tmp);
297 if (nr < 0) 301 if (nr < 0)
@@ -302,22 +306,32 @@ struct pid *alloc_pid(struct pid_namespace *ns)
302 tmp = tmp->parent; 306 tmp = tmp->parent;
303 } 307 }
304 308
309 if (unlikely(is_child_reaper(pid))) {
310 if (pid_ns_prepare_proc(ns))
311 goto out_free;
312 }
313
305 get_pid_ns(ns); 314 get_pid_ns(ns);
306 pid->level = ns->level;
307 atomic_set(&pid->count, 1); 315 atomic_set(&pid->count, 1);
308 for (type = 0; type < PIDTYPE_MAX; ++type) 316 for (type = 0; type < PIDTYPE_MAX; ++type)
309 INIT_HLIST_HEAD(&pid->tasks[type]); 317 INIT_HLIST_HEAD(&pid->tasks[type]);
310 318
311 upid = pid->numbers + ns->level; 319 upid = pid->numbers + ns->level;
312 spin_lock_irq(&pidmap_lock); 320 spin_lock_irq(&pidmap_lock);
313 for ( ; upid >= pid->numbers; --upid) 321 if (!(ns->nr_hashed & PIDNS_HASH_ADDING))
322 goto out_unlock;
323 for ( ; upid >= pid->numbers; --upid) {
314 hlist_add_head_rcu(&upid->pid_chain, 324 hlist_add_head_rcu(&upid->pid_chain,
315 &pid_hash[pid_hashfn(upid->nr, upid->ns)]); 325 &pid_hash[pid_hashfn(upid->nr, upid->ns)]);
326 upid->ns->nr_hashed++;
327 }
316 spin_unlock_irq(&pidmap_lock); 328 spin_unlock_irq(&pidmap_lock);
317 329
318out: 330out:
319 return pid; 331 return pid;
320 332
333out_unlock:
334 spin_unlock_irq(&pidmap_lock);
321out_free: 335out_free:
322 while (++i <= ns->level) 336 while (++i <= ns->level)
323 free_pidmap(pid->numbers + i); 337 free_pidmap(pid->numbers + i);
@@ -327,12 +341,18 @@ out_free:
327 goto out; 341 goto out;
328} 342}
329 343
344void disable_pid_allocation(struct pid_namespace *ns)
345{
346 spin_lock_irq(&pidmap_lock);
347 ns->nr_hashed &= ~PIDNS_HASH_ADDING;
348 spin_unlock_irq(&pidmap_lock);
349}
350
330struct pid *find_pid_ns(int nr, struct pid_namespace *ns) 351struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
331{ 352{
332 struct hlist_node *elem;
333 struct upid *pnr; 353 struct upid *pnr;
334 354
335 hlist_for_each_entry_rcu(pnr, elem, 355 hlist_for_each_entry_rcu(pnr,
336 &pid_hash[pid_hashfn(nr, ns)], pid_chain) 356 &pid_hash[pid_hashfn(nr, ns)], pid_chain)
337 if (pnr->nr == nr && pnr->ns == ns) 357 if (pnr->nr == nr && pnr->ns == ns)
338 return container_of(pnr, struct pid, 358 return container_of(pnr, struct pid,
@@ -344,7 +364,7 @@ EXPORT_SYMBOL_GPL(find_pid_ns);
344 364
345struct pid *find_vpid(int nr) 365struct pid *find_vpid(int nr)
346{ 366{
347 return find_pid_ns(nr, current->nsproxy->pid_ns); 367 return find_pid_ns(nr, task_active_pid_ns(current));
348} 368}
349EXPORT_SYMBOL_GPL(find_vpid); 369EXPORT_SYMBOL_GPL(find_vpid);
350 370
@@ -428,7 +448,7 @@ struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
428 448
429struct task_struct *find_task_by_vpid(pid_t vnr) 449struct task_struct *find_task_by_vpid(pid_t vnr)
430{ 450{
431 return find_task_by_pid_ns(vnr, current->nsproxy->pid_ns); 451 return find_task_by_pid_ns(vnr, task_active_pid_ns(current));
432} 452}
433 453
434struct pid *get_task_pid(struct task_struct *task, enum pid_type type) 454struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
@@ -483,7 +503,7 @@ EXPORT_SYMBOL_GPL(pid_nr_ns);
483 503
484pid_t pid_vnr(struct pid *pid) 504pid_t pid_vnr(struct pid *pid)
485{ 505{
486 return pid_nr_ns(pid, current->nsproxy->pid_ns); 506 return pid_nr_ns(pid, task_active_pid_ns(current));
487} 507}
488EXPORT_SYMBOL_GPL(pid_vnr); 508EXPORT_SYMBOL_GPL(pid_vnr);
489 509
@@ -494,7 +514,7 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
494 514
495 rcu_read_lock(); 515 rcu_read_lock();
496 if (!ns) 516 if (!ns)
497 ns = current->nsproxy->pid_ns; 517 ns = task_active_pid_ns(current);
498 if (likely(pid_alive(task))) { 518 if (likely(pid_alive(task))) {
499 if (type != PIDTYPE_PID) 519 if (type != PIDTYPE_PID)
500 task = task->group_leader; 520 task = task->group_leader;
@@ -558,6 +578,9 @@ void __init pidhash_init(void)
558 578
559void __init pidmap_init(void) 579void __init pidmap_init(void)
560{ 580{
 581 /* Verify that no one has done anything silly */
582 BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_HASH_ADDING);
583
561 /* bump default and minimum pid_max based on number of cpus */ 584 /* bump default and minimum pid_max based on number of cpus */
562 pid_max = min(pid_max_max, max_t(int, pid_max, 585 pid_max = min(pid_max_max, max_t(int, pid_max,
563 PIDS_PER_CPU_DEFAULT * num_possible_cpus())); 586 PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
@@ -569,6 +592,7 @@ void __init pidmap_init(void)
569 /* Reserve PID 0. We never call free_pidmap(0) */ 592 /* Reserve PID 0. We never call free_pidmap(0) */
570 set_bit(0, init_pid_ns.pidmap[0].page); 593 set_bit(0, init_pid_ns.pidmap[0].page);
571 atomic_dec(&init_pid_ns.pidmap[0].nr_free); 594 atomic_dec(&init_pid_ns.pidmap[0].nr_free);
595 init_pid_ns.nr_hashed = PIDNS_HASH_ADDING;
572 596
573 init_pid_ns.pid_cachep = KMEM_CACHE(pid, 597 init_pid_ns.pid_cachep = KMEM_CACHE(pid,
574 SLAB_HWCACHE_ALIGN | SLAB_PANIC); 598 SLAB_HWCACHE_ALIGN | SLAB_PANIC);
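Here nr_hashed does double duty: PIDNS_HASH_ADDING is a high bit folded into the count of hashed pids, and the BUILD_BUG_ON above guarantees the real count can never collide with it, so a single field tested under pidmap_lock answers both "how many pids remain" and "may new pids still be added". A simplified, single-threaded sketch of the trick; names and values are illustrative:

/* Illustrative only: a flag bit embedded in a counter. */
#include <assert.h>
#include <stdio.h>

#define DEMO_ADDING    (1U << 31)   /* analogue of PIDNS_HASH_ADDING */
#define DEMO_MAX_COUNT (1U << 22)   /* analogue of PID_MAX_LIMIT */

_Static_assert(DEMO_MAX_COUNT < DEMO_ADDING, "count must never reach the flag bit");

static unsigned int nr_hashed = DEMO_ADDING;

static int add_one(void)
{
        if (!(nr_hashed & DEMO_ADDING))
                return -1;              /* namespace is shutting down */
        nr_hashed++;
        return 0;
}

static void disable_adding(void)
{
        nr_hashed &= ~DEMO_ADDING;      /* like disable_pid_allocation() */
}

int main(void)
{
        assert(add_one() == 0);
        disable_adding();
        assert(add_one() == -1);
        printf("hashed entries remaining: %u\n", nr_hashed);
        return 0;
}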
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 7b07cc0dfb75..bea15bdf82b0 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -10,6 +10,7 @@
10 10
11#include <linux/pid.h> 11#include <linux/pid.h>
12#include <linux/pid_namespace.h> 12#include <linux/pid_namespace.h>
13#include <linux/user_namespace.h>
13#include <linux/syscalls.h> 14#include <linux/syscalls.h>
14#include <linux/err.h> 15#include <linux/err.h>
15#include <linux/acct.h> 16#include <linux/acct.h>
@@ -71,10 +72,17 @@ err_alloc:
71 return NULL; 72 return NULL;
72} 73}
73 74
75static void proc_cleanup_work(struct work_struct *work)
76{
77 struct pid_namespace *ns = container_of(work, struct pid_namespace, proc_work);
78 pid_ns_release_proc(ns);
79}
80
74/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ 81/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */
75#define MAX_PID_NS_LEVEL 32 82#define MAX_PID_NS_LEVEL 32
76 83
77static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_pid_ns) 84static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns,
85 struct pid_namespace *parent_pid_ns)
78{ 86{
79 struct pid_namespace *ns; 87 struct pid_namespace *ns;
80 unsigned int level = parent_pid_ns->level + 1; 88 unsigned int level = parent_pid_ns->level + 1;
@@ -99,9 +107,16 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p
99 if (ns->pid_cachep == NULL) 107 if (ns->pid_cachep == NULL)
100 goto out_free_map; 108 goto out_free_map;
101 109
110 err = proc_alloc_inum(&ns->proc_inum);
111 if (err)
112 goto out_free_map;
113
102 kref_init(&ns->kref); 114 kref_init(&ns->kref);
103 ns->level = level; 115 ns->level = level;
104 ns->parent = get_pid_ns(parent_pid_ns); 116 ns->parent = get_pid_ns(parent_pid_ns);
117 ns->user_ns = get_user_ns(user_ns);
118 ns->nr_hashed = PIDNS_HASH_ADDING;
119 INIT_WORK(&ns->proc_work, proc_cleanup_work);
105 120
106 set_bit(0, ns->pidmap[0].page); 121 set_bit(0, ns->pidmap[0].page);
107 atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); 122 atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
@@ -109,14 +124,8 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p
109 for (i = 1; i < PIDMAP_ENTRIES; i++) 124 for (i = 1; i < PIDMAP_ENTRIES; i++)
110 atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); 125 atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
111 126
112 err = pid_ns_prepare_proc(ns);
113 if (err)
114 goto out_put_parent_pid_ns;
115
116 return ns; 127 return ns;
117 128
118out_put_parent_pid_ns:
119 put_pid_ns(parent_pid_ns);
120out_free_map: 129out_free_map:
121 kfree(ns->pidmap[0].page); 130 kfree(ns->pidmap[0].page);
122out_free: 131out_free:
@@ -129,18 +138,21 @@ static void destroy_pid_namespace(struct pid_namespace *ns)
129{ 138{
130 int i; 139 int i;
131 140
141 proc_free_inum(ns->proc_inum);
132 for (i = 0; i < PIDMAP_ENTRIES; i++) 142 for (i = 0; i < PIDMAP_ENTRIES; i++)
133 kfree(ns->pidmap[i].page); 143 kfree(ns->pidmap[i].page);
144 put_user_ns(ns->user_ns);
134 kmem_cache_free(pid_ns_cachep, ns); 145 kmem_cache_free(pid_ns_cachep, ns);
135} 146}
136 147
137struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns) 148struct pid_namespace *copy_pid_ns(unsigned long flags,
149 struct user_namespace *user_ns, struct pid_namespace *old_ns)
138{ 150{
139 if (!(flags & CLONE_NEWPID)) 151 if (!(flags & CLONE_NEWPID))
140 return get_pid_ns(old_ns); 152 return get_pid_ns(old_ns);
141 if (flags & (CLONE_THREAD|CLONE_PARENT)) 153 if (task_active_pid_ns(current) != old_ns)
142 return ERR_PTR(-EINVAL); 154 return ERR_PTR(-EINVAL);
143 return create_pid_namespace(old_ns); 155 return create_pid_namespace(user_ns, old_ns);
144} 156}
145 157
146static void free_pid_ns(struct kref *kref) 158static void free_pid_ns(struct kref *kref)
@@ -169,6 +181,10 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
169 int nr; 181 int nr;
170 int rc; 182 int rc;
171 struct task_struct *task, *me = current; 183 struct task_struct *task, *me = current;
184 int init_pids = thread_group_leader(me) ? 1 : 2;
185
186 /* Don't allow any more processes into the pid namespace */
187 disable_pid_allocation(pid_ns);
172 188
173 /* Ignore SIGCHLD causing any terminated children to autoreap */ 189 /* Ignore SIGCHLD causing any terminated children to autoreap */
174 spin_lock_irq(&me->sighand->siglock); 190 spin_lock_irq(&me->sighand->siglock);
@@ -211,22 +227,15 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
211 227
212 /* 228 /*
213 * sys_wait4() above can't reap the TASK_DEAD children. 229 * sys_wait4() above can't reap the TASK_DEAD children.
214 * Make sure they all go away, see __unhash_process(). 230 * Make sure they all go away, see free_pid().
215 */ 231 */
216 for (;;) { 232 for (;;) {
217 bool need_wait = false; 233 set_current_state(TASK_UNINTERRUPTIBLE);
218 234 if (pid_ns->nr_hashed == init_pids)
219 read_lock(&tasklist_lock);
220 if (!list_empty(&current->children)) {
221 __set_current_state(TASK_UNINTERRUPTIBLE);
222 need_wait = true;
223 }
224 read_unlock(&tasklist_lock);
225
226 if (!need_wait)
227 break; 235 break;
228 schedule(); 236 schedule();
229 } 237 }
238 __set_current_state(TASK_RUNNING);
230 239
231 if (pid_ns->reboot) 240 if (pid_ns->reboot)
232 current->signal->group_exit_code = pid_ns->reboot; 241 current->signal->group_exit_code = pid_ns->reboot;
@@ -239,9 +248,10 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
239static int pid_ns_ctl_handler(struct ctl_table *table, int write, 248static int pid_ns_ctl_handler(struct ctl_table *table, int write,
240 void __user *buffer, size_t *lenp, loff_t *ppos) 249 void __user *buffer, size_t *lenp, loff_t *ppos)
241{ 250{
251 struct pid_namespace *pid_ns = task_active_pid_ns(current);
242 struct ctl_table tmp = *table; 252 struct ctl_table tmp = *table;
243 253
244 if (write && !capable(CAP_SYS_ADMIN)) 254 if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN))
245 return -EPERM; 255 return -EPERM;
246 256
247 /* 257 /*
@@ -250,7 +260,7 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
250 * it should synchronize its usage with external means. 260 * it should synchronize its usage with external means.
251 */ 261 */
252 262
253 tmp.data = &current->nsproxy->pid_ns->last_pid; 263 tmp.data = &pid_ns->last_pid;
254 return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); 264 return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
255} 265}
256 266
@@ -299,6 +309,68 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
299 return 0; 309 return 0;
300} 310}
301 311
312static void *pidns_get(struct task_struct *task)
313{
314 struct pid_namespace *ns;
315
316 rcu_read_lock();
317 ns = get_pid_ns(task_active_pid_ns(task));
318 rcu_read_unlock();
319
320 return ns;
321}
322
323static void pidns_put(void *ns)
324{
325 put_pid_ns(ns);
326}
327
328static int pidns_install(struct nsproxy *nsproxy, void *ns)
329{
330 struct pid_namespace *active = task_active_pid_ns(current);
331 struct pid_namespace *ancestor, *new = ns;
332
333 if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) ||
334 !nsown_capable(CAP_SYS_ADMIN))
335 return -EPERM;
336
337 /*
338 * Only allow entering the current active pid namespace
339 * or a child of the current active pid namespace.
340 *
341 * This is required for fork to return a usable pid value and
342 * this maintains the property that processes and their
343 * children can not escape their current pid namespace.
344 */
345 if (new->level < active->level)
346 return -EINVAL;
347
348 ancestor = new;
349 while (ancestor->level > active->level)
350 ancestor = ancestor->parent;
351 if (ancestor != active)
352 return -EINVAL;
353
354 put_pid_ns(nsproxy->pid_ns);
355 nsproxy->pid_ns = get_pid_ns(new);
356 return 0;
357}
358
359static unsigned int pidns_inum(void *ns)
360{
361 struct pid_namespace *pid_ns = ns;
362 return pid_ns->proc_inum;
363}
364
365const struct proc_ns_operations pidns_operations = {
366 .name = "pid",
367 .type = CLONE_NEWPID,
368 .get = pidns_get,
369 .put = pidns_put,
370 .install = pidns_install,
371 .inum = pidns_inum,
372};
373
302static __init int pid_namespaces_init(void) 374static __init int pid_namespaces_init(void)
303{ 375{
304 pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); 376 pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
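The new pidns_operations makes /proc/<pid>/ns/pid usable with setns(2). Because pidns_install() only swaps nsproxy->pid_ns, the caller keeps its own pid; only children forked afterwards are created inside the target namespace. A hedged userspace sketch; the target pid 4321 is hypothetical and both user namespaces must grant CAP_SYS_ADMIN:

/* Hedged sketch: join another task's pid namespace, then fork into it. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/proc/4321/ns/pid", O_RDONLY | O_CLOEXEC);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        if (setns(fd, CLONE_NEWPID) != 0) {
                perror("setns");
                return 1;
        }
        close(fd);

        pid_t child = fork();
        if (child == 0) {
                /* getpid() reports the pid inside the joined namespace. */
                printf("child pid in the target namespace: %d\n", getpid());
                _exit(0);
        }
        waitpid(child, NULL, 0);
        return 0;
}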
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 125cb67daa21..8fd709c9bb58 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -9,6 +9,7 @@
9#include <asm/uaccess.h> 9#include <asm/uaccess.h>
10#include <linux/kernel_stat.h> 10#include <linux/kernel_stat.h>
11#include <trace/events/timer.h> 11#include <trace/events/timer.h>
12#include <linux/random.h>
12 13
13/* 14/*
14 * Called after updating RLIMIT_CPU to run cpu timer and update 15 * Called after updating RLIMIT_CPU to run cpu timer and update
@@ -154,11 +155,19 @@ static void bump_cpu_timer(struct k_itimer *timer,
154 155
155static inline cputime_t prof_ticks(struct task_struct *p) 156static inline cputime_t prof_ticks(struct task_struct *p)
156{ 157{
157 return p->utime + p->stime; 158 cputime_t utime, stime;
159
160 task_cputime(p, &utime, &stime);
161
162 return utime + stime;
158} 163}
159static inline cputime_t virt_ticks(struct task_struct *p) 164static inline cputime_t virt_ticks(struct task_struct *p)
160{ 165{
161 return p->utime; 166 cputime_t utime;
167
168 task_cputime(p, &utime, NULL);
169
170 return utime;
162} 171}
163 172
164static int 173static int
@@ -217,30 +226,6 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
217 return 0; 226 return 0;
218} 227}
219 228
220void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
221{
222 struct signal_struct *sig = tsk->signal;
223 struct task_struct *t;
224
225 times->utime = sig->utime;
226 times->stime = sig->stime;
227 times->sum_exec_runtime = sig->sum_sched_runtime;
228
229 rcu_read_lock();
230 /* make sure we can trust tsk->thread_group list */
231 if (!likely(pid_alive(tsk)))
232 goto out;
233
234 t = tsk;
235 do {
236 times->utime += t->utime;
237 times->stime += t->stime;
238 times->sum_exec_runtime += task_sched_runtime(t);
239 } while_each_thread(tsk, t);
240out:
241 rcu_read_unlock();
242}
243
244static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b) 229static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b)
245{ 230{
246 if (b->utime > a->utime) 231 if (b->utime > a->utime)
@@ -494,16 +479,23 @@ static void cleanup_timers(struct list_head *head,
494 */ 479 */
495void posix_cpu_timers_exit(struct task_struct *tsk) 480void posix_cpu_timers_exit(struct task_struct *tsk)
496{ 481{
482 cputime_t utime, stime;
483
484 add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
485 sizeof(unsigned long long));
486 task_cputime(tsk, &utime, &stime);
497 cleanup_timers(tsk->cpu_timers, 487 cleanup_timers(tsk->cpu_timers,
498 tsk->utime, tsk->stime, tsk->se.sum_exec_runtime); 488 utime, stime, tsk->se.sum_exec_runtime);
499 489
500} 490}
501void posix_cpu_timers_exit_group(struct task_struct *tsk) 491void posix_cpu_timers_exit_group(struct task_struct *tsk)
502{ 492{
503 struct signal_struct *const sig = tsk->signal; 493 struct signal_struct *const sig = tsk->signal;
494 cputime_t utime, stime;
504 495
496 task_cputime(tsk, &utime, &stime);
505 cleanup_timers(tsk->signal->cpu_timers, 497 cleanup_timers(tsk->signal->cpu_timers,
506 tsk->utime + sig->utime, tsk->stime + sig->stime, 498 utime + sig->utime, stime + sig->stime,
507 tsk->se.sum_exec_runtime + sig->sum_sched_runtime); 499 tsk->se.sum_exec_runtime + sig->sum_sched_runtime);
508} 500}
509 501
@@ -1247,11 +1239,14 @@ static inline int task_cputime_expired(const struct task_cputime *sample,
1247static inline int fastpath_timer_check(struct task_struct *tsk) 1239static inline int fastpath_timer_check(struct task_struct *tsk)
1248{ 1240{
1249 struct signal_struct *sig; 1241 struct signal_struct *sig;
1242 cputime_t utime, stime;
1243
1244 task_cputime(tsk, &utime, &stime);
1250 1245
1251 if (!task_cputime_zero(&tsk->cputime_expires)) { 1246 if (!task_cputime_zero(&tsk->cputime_expires)) {
1252 struct task_cputime task_sample = { 1247 struct task_cputime task_sample = {
1253 .utime = tsk->utime, 1248 .utime = utime,
1254 .stime = tsk->stime, 1249 .stime = stime,
1255 .sum_exec_runtime = tsk->se.sum_exec_runtime 1250 .sum_exec_runtime = tsk->se.sum_exec_runtime
1256 }; 1251 };
1257 1252
@@ -1422,8 +1417,10 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
1422 while (!signal_pending(current)) { 1417 while (!signal_pending(current)) {
1423 if (timer.it.cpu.expires.sched == 0) { 1418 if (timer.it.cpu.expires.sched == 0) {
1424 /* 1419 /*
 1425 * Our timer fired and was reset. 1420 * Our timer fired and was reset; the
 1421 * deletion below cannot fail.
1426 */ 1422 */
1423 posix_cpu_timer_del(&timer);
1427 spin_unlock_irq(&timer.it_lock); 1424 spin_unlock_irq(&timer.it_lock);
1428 return 0; 1425 return 0;
1429 } 1426 }
@@ -1441,9 +1438,26 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
1441 * We were interrupted by a signal. 1438 * We were interrupted by a signal.
1442 */ 1439 */
1443 sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp); 1440 sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp);
1444 posix_cpu_timer_set(&timer, 0, &zero_it, it); 1441 error = posix_cpu_timer_set(&timer, 0, &zero_it, it);
1442 if (!error) {
1443 /*
1444 * Timer is now unarmed, deletion can not fail.
1445 */
1446 posix_cpu_timer_del(&timer);
1447 }
1445 spin_unlock_irq(&timer.it_lock); 1448 spin_unlock_irq(&timer.it_lock);
1446 1449
1450 while (error == TIMER_RETRY) {
1451 /*
1452 * We need to handle case when timer was or is in the
1453 * middle of firing. In other cases we already freed
1454 * resources.
1455 */
1456 spin_lock_irq(&timer.it_lock);
1457 error = posix_cpu_timer_del(&timer);
1458 spin_unlock_irq(&timer.it_lock);
1459 }
1460
1447 if ((it->it_value.tv_sec | it->it_value.tv_nsec) == 0) { 1461 if ((it->it_value.tv_sec | it->it_value.tv_nsec) == 0) {
1448 /* 1462 /*
1449 * It actually did fire already. 1463 * It actually did fire already.
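
[Editor's sketch] The do_cpu_nanosleep() hunks above add an explicit cleanup path: posix_cpu_timer_set() can now report TIMER_RETRY when the timer is caught in the middle of firing, and the caller keeps retrying posix_cpu_timer_del() under the timer lock until the deletion sticks. Below is a minimal userspace stand-in for that retry shape; fake_timer_del() and its "busy for two attempts" behaviour are illustrative, not kernel code.

#include <stdio.h>

#define TIMER_RETRY 1			/* stand-in for the kernel's TIMER_RETRY */

/* Pretend the timer is mid-firing for the first two attempts. */
static int fake_timer_del(int *firing)
{
	if (*firing > 0) {
		(*firing)--;
		return TIMER_RETRY;	/* caller must try again */
	}
	return 0;			/* timer gone, resources freed */
}

int main(void)
{
	int firing = 2;
	int error = fake_timer_del(&firing);

	/*
	 * Same shape as the new loop in do_cpu_nanosleep(): in the kernel
	 * each pass re-takes timer.it_lock around posix_cpu_timer_del().
	 */
	while (error == TIMER_RETRY)
		error = fake_timer_del(&firing);

	printf("deleted, error=%d\n", error);
	return 0;
}
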
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 69185ae6b701..6edbb2c55c22 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -552,24 +552,22 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
552 return -EAGAIN; 552 return -EAGAIN;
553 553
554 spin_lock_init(&new_timer->it_lock); 554 spin_lock_init(&new_timer->it_lock);
555 retry: 555
556 if (unlikely(!idr_pre_get(&posix_timers_id, GFP_KERNEL))) { 556 idr_preload(GFP_KERNEL);
557 error = -EAGAIN;
558 goto out;
559 }
560 spin_lock_irq(&idr_lock); 557 spin_lock_irq(&idr_lock);
561 error = idr_get_new(&posix_timers_id, new_timer, &new_timer_id); 558 error = idr_alloc(&posix_timers_id, new_timer, 0, 0, GFP_NOWAIT);
562 spin_unlock_irq(&idr_lock); 559 spin_unlock_irq(&idr_lock);
563 if (error) { 560 idr_preload_end();
564 if (error == -EAGAIN) 561 if (error < 0) {
565 goto retry;
566 /* 562 /*
567 * Weird looking, but we return EAGAIN if the IDR is 563 * Weird looking, but we return EAGAIN if the IDR is
568 * full (proper POSIX return value for this) 564 * full (proper POSIX return value for this)
569 */ 565 */
570 error = -EAGAIN; 566 if (error == -ENOSPC)
567 error = -EAGAIN;
571 goto out; 568 goto out;
572 } 569 }
570 new_timer_id = error;
573 571
574 it_id_set = IT_ID_SET; 572 it_id_set = IT_ID_SET;
575 new_timer->it_id = (timer_t) new_timer_id; 573 new_timer->it_id = (timer_t) new_timer_id;
@@ -639,6 +637,13 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
639{ 637{
640 struct k_itimer *timr; 638 struct k_itimer *timr;
641 639
640 /*
641 * timer_t could be any type >= int and we want to make sure any
642 * @timer_id outside positive int range fails lookup.
643 */
644 if ((unsigned long long)timer_id > INT_MAX)
645 return NULL;
646
642 rcu_read_lock(); 647 rcu_read_lock();
643 timr = idr_find(&posix_timers_id, (int)timer_id); 648 timr = idr_find(&posix_timers_id, (int)timer_id);
644 if (timr) { 649 if (timr) {
@@ -997,7 +1002,7 @@ SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock,
997 1002
998 err = kc->clock_adj(which_clock, &ktx); 1003 err = kc->clock_adj(which_clock, &ktx);
999 1004
1000 if (!err && copy_to_user(utx, &ktx, sizeof(ktx))) 1005 if (err >= 0 && copy_to_user(utx, &ktx, sizeof(ktx)))
1001 return -EFAULT; 1006 return -EFAULT;
1002 1007
1003 return err; 1008 return err;
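
[Editor's sketch] The timer_create() conversion above replaces the idr_pre_get()/idr_get_new() retry dance with the newer preload API: idr_preload() preallocates with GFP_KERNEL while sleeping is still allowed, the actual idr_alloc() then runs under the spinlock with GFP_NOWAIT, and the new id comes back as the non-negative return value. A condensed sketch of that pattern follows; my_idr, my_lock and my_alloc_id are illustrative names, not part of the patch.

#include <linux/idr.h>
#include <linux/spinlock.h>
#include <linux/gfp.h>
#include <linux/errno.h>

static DEFINE_IDR(my_idr);
static DEFINE_SPINLOCK(my_lock);

static int my_alloc_id(void *obj)
{
	int id;

	idr_preload(GFP_KERNEL);		/* may sleep: preallocate here */
	spin_lock_irq(&my_lock);
	/* start = 0, end = 0 means "any available non-negative id" */
	id = idr_alloc(&my_idr, obj, 0, 0, GFP_NOWAIT);
	spin_unlock_irq(&my_lock);
	idr_preload_end();

	/*
	 * idr_alloc() returns the id or a negative errno; timer_create()
	 * maps -ENOSPC to -EAGAIN, the POSIX "table full" error.
	 */
	return id == -ENOSPC ? -EAGAIN : id;
}
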
diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c
index ca304046d9e2..c6422ffeda9a 100644
--- a/kernel/power/autosleep.c
+++ b/kernel/power/autosleep.c
@@ -66,7 +66,7 @@ static DECLARE_WORK(suspend_work, try_to_suspend);
66 66
67void queue_up_suspend_work(void) 67void queue_up_suspend_work(void)
68{ 68{
69 if (!work_pending(&suspend_work) && autosleep_state > PM_SUSPEND_ON) 69 if (autosleep_state > PM_SUSPEND_ON)
70 queue_work(autosleep_wq, &suspend_work); 70 queue_work(autosleep_wq, &suspend_work);
71} 71}
72 72
diff --git a/kernel/power/main.c b/kernel/power/main.c
index f458238109cc..d77663bfedeb 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -59,7 +59,7 @@ static ssize_t pm_async_store(struct kobject *kobj, struct kobj_attribute *attr,
59{ 59{
60 unsigned long val; 60 unsigned long val;
61 61
62 if (strict_strtoul(buf, 10, &val)) 62 if (kstrtoul(buf, 10, &val))
63 return -EINVAL; 63 return -EINVAL;
64 64
65 if (val > 1) 65 if (val > 1)
@@ -313,7 +313,7 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
313static suspend_state_t decode_state(const char *buf, size_t n) 313static suspend_state_t decode_state(const char *buf, size_t n)
314{ 314{
315#ifdef CONFIG_SUSPEND 315#ifdef CONFIG_SUSPEND
316 suspend_state_t state = PM_SUSPEND_STANDBY; 316 suspend_state_t state = PM_SUSPEND_MIN;
317 const char * const *s; 317 const char * const *s;
318#endif 318#endif
319 char *p; 319 char *p;
@@ -553,6 +553,30 @@ power_attr(pm_trace_dev_match);
553 553
554#endif /* CONFIG_PM_TRACE */ 554#endif /* CONFIG_PM_TRACE */
555 555
556#ifdef CONFIG_FREEZER
557static ssize_t pm_freeze_timeout_show(struct kobject *kobj,
558 struct kobj_attribute *attr, char *buf)
559{
560 return sprintf(buf, "%u\n", freeze_timeout_msecs);
561}
562
563static ssize_t pm_freeze_timeout_store(struct kobject *kobj,
564 struct kobj_attribute *attr,
565 const char *buf, size_t n)
566{
567 unsigned long val;
568
569 if (kstrtoul(buf, 10, &val))
570 return -EINVAL;
571
572 freeze_timeout_msecs = val;
573 return n;
574}
575
576power_attr(pm_freeze_timeout);
577
578#endif /* CONFIG_FREEZER*/
579
556static struct attribute * g[] = { 580static struct attribute * g[] = {
557 &state_attr.attr, 581 &state_attr.attr,
558#ifdef CONFIG_PM_TRACE 582#ifdef CONFIG_PM_TRACE
@@ -576,6 +600,9 @@ static struct attribute * g[] = {
576 &pm_print_times_attr.attr, 600 &pm_print_times_attr.attr,
577#endif 601#endif
578#endif 602#endif
603#ifdef CONFIG_FREEZER
604 &pm_freeze_timeout_attr.attr,
605#endif
579 NULL, 606 NULL,
580}; 607};
581 608
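
[Editor's note] pm_freeze_timeout_attr is never declared explicitly because power_attr() generates it from the _show/_store pair by token pasting. The helper lives in kernel/power/power.h, not in this diff; the sketch below is reconstructed from memory (field layout and mode may differ slightly) only to show why the naming convention matters for the new attribute.

#include <linux/kobject.h>
#include <linux/stringify.h>

/* Approximate shape of power_attr() from kernel/power/power.h. */
#define power_attr(_name)					\
static struct kobj_attribute _name##_attr = {			\
	.attr	= {						\
		.name = __stringify(_name),			\
		.mode = 0644,					\
	},							\
	.show	= _name##_show,					\
	.store	= _name##_store,				\
}
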
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 87da817f9e13..98088e0e71e8 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -21,7 +21,7 @@
21/* 21/*
22 * Timeout for stopping processes 22 * Timeout for stopping processes
23 */ 23 */
24#define TIMEOUT (20 * HZ) 24unsigned int __read_mostly freeze_timeout_msecs = 20 * MSEC_PER_SEC;
25 25
26static int try_to_freeze_tasks(bool user_only) 26static int try_to_freeze_tasks(bool user_only)
27{ 27{
@@ -36,7 +36,7 @@ static int try_to_freeze_tasks(bool user_only)
36 36
37 do_gettimeofday(&start); 37 do_gettimeofday(&start);
38 38
39 end_time = jiffies + TIMEOUT; 39 end_time = jiffies + msecs_to_jiffies(freeze_timeout_msecs);
40 40
41 if (!user_only) 41 if (!user_only)
42 freeze_workqueues_begin(); 42 freeze_workqueues_begin();
@@ -48,18 +48,7 @@ static int try_to_freeze_tasks(bool user_only)
48 if (p == current || !freeze_task(p)) 48 if (p == current || !freeze_task(p))
49 continue; 49 continue;
50 50
51 /* 51 if (!freezer_should_skip(p))
52 * Now that we've done set_freeze_flag, don't
53 * perturb a task in TASK_STOPPED or TASK_TRACED.
54 * It is "frozen enough". If the task does wake
55 * up, it will immediately call try_to_freeze.
56 *
57 * Because freeze_task() goes through p's scheduler lock, it's
58 * guaranteed that TASK_STOPPED/TRACED -> TASK_RUNNING
59 * transition can't race with task state testing here.
60 */
61 if (!task_is_stopped_or_traced(p) &&
62 !freezer_should_skip(p))
63 todo++; 52 todo++;
64 } while_each_thread(g, p); 53 } while_each_thread(g, p);
65 read_unlock(&tasklist_lock); 54 read_unlock(&tasklist_lock);
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 846bd42c7ed1..587dddeebf15 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -213,6 +213,69 @@ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node,
213} 213}
214 214
215/** 215/**
216 * pm_qos_flags_remove_req - Remove device PM QoS flags request.
217 * @pqf: Device PM QoS flags set to remove the request from.
218 * @req: Request to remove from the set.
219 */
220static void pm_qos_flags_remove_req(struct pm_qos_flags *pqf,
221 struct pm_qos_flags_request *req)
222{
223 s32 val = 0;
224
225 list_del(&req->node);
226 list_for_each_entry(req, &pqf->list, node)
227 val |= req->flags;
228
229 pqf->effective_flags = val;
230}
231
232/**
233 * pm_qos_update_flags - Update a set of PM QoS flags.
234 * @pqf: Set of flags to update.
235 * @req: Request to add to the set, to modify, or to remove from the set.
236 * @action: Action to take on the set.
237 * @val: Value of the request to add or modify.
238 *
239 * Update the given set of PM QoS flags and call notifiers if the aggregate
240 * value has changed. Returns 1 if the aggregate constraint value has changed,
241 * 0 otherwise.
242 */
243bool pm_qos_update_flags(struct pm_qos_flags *pqf,
244 struct pm_qos_flags_request *req,
245 enum pm_qos_req_action action, s32 val)
246{
247 unsigned long irqflags;
248 s32 prev_value, curr_value;
249
250 spin_lock_irqsave(&pm_qos_lock, irqflags);
251
252 prev_value = list_empty(&pqf->list) ? 0 : pqf->effective_flags;
253
254 switch (action) {
255 case PM_QOS_REMOVE_REQ:
256 pm_qos_flags_remove_req(pqf, req);
257 break;
258 case PM_QOS_UPDATE_REQ:
259 pm_qos_flags_remove_req(pqf, req);
260 case PM_QOS_ADD_REQ:
261 req->flags = val;
262 INIT_LIST_HEAD(&req->node);
263 list_add_tail(&req->node, &pqf->list);
264 pqf->effective_flags |= val;
265 break;
266 default:
267 /* no action */
268 ;
269 }
270
271 curr_value = list_empty(&pqf->list) ? 0 : pqf->effective_flags;
272
273 spin_unlock_irqrestore(&pm_qos_lock, irqflags);
274
275 return prev_value != curr_value;
276}
277
278/**
216 * pm_qos_request - returns current system wide qos expectation 279 * pm_qos_request - returns current system wide qos expectation
217 * @pm_qos_class: identification of which qos value is requested 280 * @pm_qos_class: identification of which qos value is requested
218 * 281 *
@@ -296,8 +359,7 @@ void pm_qos_update_request(struct pm_qos_request *req,
296 return; 359 return;
297 } 360 }
298 361
299 if (delayed_work_pending(&req->work)) 362 cancel_delayed_work_sync(&req->work);
300 cancel_delayed_work_sync(&req->work);
301 363
302 if (new_value != req->node.prio) 364 if (new_value != req->node.prio)
303 pm_qos_update_target( 365 pm_qos_update_target(
@@ -323,8 +385,7 @@ void pm_qos_update_request_timeout(struct pm_qos_request *req, s32 new_value,
323 "%s called for unknown object.", __func__)) 385 "%s called for unknown object.", __func__))
324 return; 386 return;
325 387
326 if (delayed_work_pending(&req->work)) 388 cancel_delayed_work_sync(&req->work);
327 cancel_delayed_work_sync(&req->work);
328 389
329 if (new_value != req->node.prio) 390 if (new_value != req->node.prio)
330 pm_qos_update_target( 391 pm_qos_update_target(
@@ -353,8 +414,7 @@ void pm_qos_remove_request(struct pm_qos_request *req)
353 return; 414 return;
354 } 415 }
355 416
356 if (delayed_work_pending(&req->work)) 417 cancel_delayed_work_sync(&req->work);
357 cancel_delayed_work_sync(&req->work);
358 418
359 pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints, 419 pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints,
360 &req->node, PM_QOS_REMOVE_REQ, 420 &req->node, PM_QOS_REMOVE_REQ,
@@ -500,7 +560,7 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
500 } else { 560 } else {
501 ascii_value[count] = '\0'; 561 ascii_value[count] = '\0';
502 } 562 }
503 ret = strict_strtoul(ascii_value, 16, &ulval); 563 ret = kstrtoul(ascii_value, 16, &ulval);
504 if (ret) { 564 if (ret) {
505 pr_debug("%s, 0x%lx, 0x%x\n", ascii_value, ulval, ret); 565 pr_debug("%s, 0x%lx, 0x%x\n", ascii_value, ulval, ret);
506 return -EINVAL; 566 return -EINVAL;
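
[Editor's sketch] pm_qos_update_flags() above keeps effective_flags equal to the bitwise OR of every outstanding request, which is why removal re-walks the whole list instead of clearing bits: another request may still demand the same flag. A small userspace stand-in, where the array stands in for the pqf->list of requests.

#include <stdio.h>
#include <stdint.h>

/* OR together every remaining request, as pm_qos_flags_remove_req() does. */
static uint32_t effective_flags(const uint32_t *reqs, int n)
{
	uint32_t val = 0;

	for (int i = 0; i < n; i++)
		val |= reqs[i];
	return val;
}

int main(void)
{
	uint32_t reqs[] = { 0x1, 0x4, 0x4 };

	printf("aggregate of three requests: %#x\n",
	       (unsigned int)effective_flags(reqs, 3));
	/*
	 * Drop one of the 0x4 requests: the aggregate is still 0x5,
	 * so simply AND-ing out the removed request's bits would be wrong.
	 */
	printf("after removing one:         %#x\n",
	       (unsigned int)effective_flags(reqs, 2));
	return 0;
}
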
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index c8b7446b27df..d4feda084a3a 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -30,12 +30,38 @@
30#include "power.h" 30#include "power.h"
31 31
32const char *const pm_states[PM_SUSPEND_MAX] = { 32const char *const pm_states[PM_SUSPEND_MAX] = {
33 [PM_SUSPEND_FREEZE] = "freeze",
33 [PM_SUSPEND_STANDBY] = "standby", 34 [PM_SUSPEND_STANDBY] = "standby",
34 [PM_SUSPEND_MEM] = "mem", 35 [PM_SUSPEND_MEM] = "mem",
35}; 36};
36 37
37static const struct platform_suspend_ops *suspend_ops; 38static const struct platform_suspend_ops *suspend_ops;
38 39
40static bool need_suspend_ops(suspend_state_t state)
41{
42 return !!(state > PM_SUSPEND_FREEZE);
43}
44
45static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head);
46static bool suspend_freeze_wake;
47
48static void freeze_begin(void)
49{
50 suspend_freeze_wake = false;
51}
52
53static void freeze_enter(void)
54{
55 wait_event(suspend_freeze_wait_head, suspend_freeze_wake);
56}
57
58void freeze_wake(void)
59{
60 suspend_freeze_wake = true;
61 wake_up(&suspend_freeze_wait_head);
62}
63EXPORT_SYMBOL_GPL(freeze_wake);
64
39/** 65/**
40 * suspend_set_ops - Set the global suspend method table. 66 * suspend_set_ops - Set the global suspend method table.
41 * @ops: Suspend operations to use. 67 * @ops: Suspend operations to use.
@@ -50,8 +76,11 @@ EXPORT_SYMBOL_GPL(suspend_set_ops);
50 76
51bool valid_state(suspend_state_t state) 77bool valid_state(suspend_state_t state)
52{ 78{
79 if (state == PM_SUSPEND_FREEZE)
80 return true;
53 /* 81 /*
54 * All states need lowlevel support and need to be valid to the lowlevel 82 * PM_SUSPEND_STANDBY and PM_SUSPEND_MEMORY states need lowlevel
83 * support and need to be valid to the lowlevel
55 * implementation, no valid callback implies that none are valid. 84 * implementation, no valid callback implies that none are valid.
56 */ 85 */
57 return suspend_ops && suspend_ops->valid && suspend_ops->valid(state); 86 return suspend_ops && suspend_ops->valid && suspend_ops->valid(state);
@@ -89,11 +118,11 @@ static int suspend_test(int level)
89 * hibernation). Run suspend notifiers, allocate the "suspend" console and 118 * hibernation). Run suspend notifiers, allocate the "suspend" console and
90 * freeze processes. 119 * freeze processes.
91 */ 120 */
92static int suspend_prepare(void) 121static int suspend_prepare(suspend_state_t state)
93{ 122{
94 int error; 123 int error;
95 124
96 if (!suspend_ops || !suspend_ops->enter) 125 if (need_suspend_ops(state) && (!suspend_ops || !suspend_ops->enter))
97 return -EPERM; 126 return -EPERM;
98 127
99 pm_prepare_console(); 128 pm_prepare_console();
@@ -137,7 +166,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
137{ 166{
138 int error; 167 int error;
139 168
140 if (suspend_ops->prepare) { 169 if (need_suspend_ops(state) && suspend_ops->prepare) {
141 error = suspend_ops->prepare(); 170 error = suspend_ops->prepare();
142 if (error) 171 if (error)
143 goto Platform_finish; 172 goto Platform_finish;
@@ -149,12 +178,23 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
149 goto Platform_finish; 178 goto Platform_finish;
150 } 179 }
151 180
152 if (suspend_ops->prepare_late) { 181 if (need_suspend_ops(state) && suspend_ops->prepare_late) {
153 error = suspend_ops->prepare_late(); 182 error = suspend_ops->prepare_late();
154 if (error) 183 if (error)
155 goto Platform_wake; 184 goto Platform_wake;
156 } 185 }
157 186
187 /*
188 * PM_SUSPEND_FREEZE equals
189 * frozen processes + suspended devices + idle processors.
190 * Thus we should invoke freeze_enter() soon after
191 * all the devices are suspended.
192 */
193 if (state == PM_SUSPEND_FREEZE) {
194 freeze_enter();
195 goto Platform_wake;
196 }
197
158 if (suspend_test(TEST_PLATFORM)) 198 if (suspend_test(TEST_PLATFORM))
159 goto Platform_wake; 199 goto Platform_wake;
160 200
@@ -182,13 +222,13 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
182 enable_nonboot_cpus(); 222 enable_nonboot_cpus();
183 223
184 Platform_wake: 224 Platform_wake:
185 if (suspend_ops->wake) 225 if (need_suspend_ops(state) && suspend_ops->wake)
186 suspend_ops->wake(); 226 suspend_ops->wake();
187 227
188 dpm_resume_start(PMSG_RESUME); 228 dpm_resume_start(PMSG_RESUME);
189 229
190 Platform_finish: 230 Platform_finish:
191 if (suspend_ops->finish) 231 if (need_suspend_ops(state) && suspend_ops->finish)
192 suspend_ops->finish(); 232 suspend_ops->finish();
193 233
194 return error; 234 return error;
@@ -203,11 +243,11 @@ int suspend_devices_and_enter(suspend_state_t state)
203 int error; 243 int error;
204 bool wakeup = false; 244 bool wakeup = false;
205 245
206 if (!suspend_ops) 246 if (need_suspend_ops(state) && !suspend_ops)
207 return -ENOSYS; 247 return -ENOSYS;
208 248
209 trace_machine_suspend(state); 249 trace_machine_suspend(state);
210 if (suspend_ops->begin) { 250 if (need_suspend_ops(state) && suspend_ops->begin) {
211 error = suspend_ops->begin(state); 251 error = suspend_ops->begin(state);
212 if (error) 252 if (error)
213 goto Close; 253 goto Close;
@@ -226,7 +266,7 @@ int suspend_devices_and_enter(suspend_state_t state)
226 266
227 do { 267 do {
228 error = suspend_enter(state, &wakeup); 268 error = suspend_enter(state, &wakeup);
229 } while (!error && !wakeup 269 } while (!error && !wakeup && need_suspend_ops(state)
230 && suspend_ops->suspend_again && suspend_ops->suspend_again()); 270 && suspend_ops->suspend_again && suspend_ops->suspend_again());
231 271
232 Resume_devices: 272 Resume_devices:
@@ -236,13 +276,13 @@ int suspend_devices_and_enter(suspend_state_t state)
236 ftrace_start(); 276 ftrace_start();
237 resume_console(); 277 resume_console();
238 Close: 278 Close:
239 if (suspend_ops->end) 279 if (need_suspend_ops(state) && suspend_ops->end)
240 suspend_ops->end(); 280 suspend_ops->end();
241 trace_machine_suspend(PWR_EVENT_EXIT); 281 trace_machine_suspend(PWR_EVENT_EXIT);
242 return error; 282 return error;
243 283
244 Recover_platform: 284 Recover_platform:
245 if (suspend_ops->recover) 285 if (need_suspend_ops(state) && suspend_ops->recover)
246 suspend_ops->recover(); 286 suspend_ops->recover();
247 goto Resume_devices; 287 goto Resume_devices;
248} 288}
@@ -278,12 +318,15 @@ static int enter_state(suspend_state_t state)
278 if (!mutex_trylock(&pm_mutex)) 318 if (!mutex_trylock(&pm_mutex))
279 return -EBUSY; 319 return -EBUSY;
280 320
321 if (state == PM_SUSPEND_FREEZE)
322 freeze_begin();
323
281 printk(KERN_INFO "PM: Syncing filesystems ... "); 324 printk(KERN_INFO "PM: Syncing filesystems ... ");
282 sys_sync(); 325 sys_sync();
283 printk("done.\n"); 326 printk("done.\n");
284 327
285 pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); 328 pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
286 error = suspend_prepare(); 329 error = suspend_prepare(state);
287 if (error) 330 if (error)
288 goto Unlock; 331 goto Unlock;
289 332
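
[Editor's sketch] The new "freeze" state above parks the system in freeze_enter()'s wait_event() with processes frozen and devices suspended; freeze_wake() is exported so wakeup sources can break that wait. A hedged sketch of the consumer side, assuming a wakeup-capable driver's interrupt handler; the handler name and the <linux/suspend.h> declaration are assumptions on my part, not part of this diff.

#include <linux/interrupt.h>
#include <linux/suspend.h>	/* freeze_wake(), assumed to be declared here */

static irqreturn_t demo_wakeup_irq(int irq, void *dev_id)
{
	/* ... acknowledge the wakeup event in the hardware ... */

	/*
	 * Unblocks freeze_enter()'s wait_event(), letting suspend_enter()
	 * fall through to Platform_wake and resume devices.
	 */
	freeze_wake();

	return IRQ_HANDLED;
}
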
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index 25596e450ac7..9b2a1d58558d 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -112,7 +112,7 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
112 rtc_set_alarm(rtc, &alm); 112 rtc_set_alarm(rtc, &alm);
113} 113}
114 114
115static int __init has_wakealarm(struct device *dev, void *name_ptr) 115static int __init has_wakealarm(struct device *dev, const void *data)
116{ 116{
117 struct rtc_device *candidate = to_rtc_device(dev); 117 struct rtc_device *candidate = to_rtc_device(dev);
118 118
@@ -121,7 +121,6 @@ static int __init has_wakealarm(struct device *dev, void *name_ptr)
121 if (!device_may_wakeup(candidate->dev.parent)) 121 if (!device_may_wakeup(candidate->dev.parent))
122 return 0; 122 return 0;
123 123
124 *(const char **)name_ptr = dev_name(dev);
125 return 1; 124 return 1;
126} 125}
127 126
@@ -159,8 +158,8 @@ static int __init test_suspend(void)
159 static char warn_no_rtc[] __initdata = 158 static char warn_no_rtc[] __initdata =
160 KERN_WARNING "PM: no wakealarm-capable RTC driver is ready\n"; 159 KERN_WARNING "PM: no wakealarm-capable RTC driver is ready\n";
161 160
162 char *pony = NULL;
163 struct rtc_device *rtc = NULL; 161 struct rtc_device *rtc = NULL;
162 struct device *dev;
164 163
165 /* PM is initialized by now; is that state testable? */ 164 /* PM is initialized by now; is that state testable? */
166 if (test_state == PM_SUSPEND_ON) 165 if (test_state == PM_SUSPEND_ON)
@@ -171,9 +170,9 @@ static int __init test_suspend(void)
171 } 170 }
172 171
173 /* RTCs have initialized by now too ... can we use one? */ 172 /* RTCs have initialized by now too ... can we use one? */
174 class_find_device(rtc_class, NULL, &pony, has_wakealarm); 173 dev = class_find_device(rtc_class, NULL, NULL, has_wakealarm);
175 if (pony) 174 if (dev)
176 rtc = rtc_class_open(pony); 175 rtc = rtc_class_open(dev_name(dev));
177 if (!rtc) { 176 if (!rtc) {
178 printk(warn_no_rtc); 177 printk(warn_no_rtc);
179 goto done; 178 goto done;
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 3c9d764eb0d8..7c33ed200410 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -126,7 +126,7 @@ static int swsusp_extents_insert(unsigned long swap_offset)
126 126
127 /* Figure out where to put the new node */ 127 /* Figure out where to put the new node */
128 while (*new) { 128 while (*new) {
129 ext = container_of(*new, struct swsusp_extent, node); 129 ext = rb_entry(*new, struct swsusp_extent, node);
130 parent = *new; 130 parent = *new;
131 if (swap_offset < ext->start) { 131 if (swap_offset < ext->start) {
132 /* Try to merge */ 132 /* Try to merge */
diff --git a/kernel/printk.c b/kernel/printk.c
index 2d607f4d1797..abbdd9e2ac82 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -42,6 +42,7 @@
42#include <linux/notifier.h> 42#include <linux/notifier.h>
43#include <linux/rculist.h> 43#include <linux/rculist.h>
44#include <linux/poll.h> 44#include <linux/poll.h>
45#include <linux/irq_work.h>
45 46
46#include <asm/uaccess.h> 47#include <asm/uaccess.h>
47 48
@@ -62,8 +63,6 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
62#define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */ 63#define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */
63#define DEFAULT_CONSOLE_LOGLEVEL 7 /* anything MORE serious than KERN_DEBUG */ 64#define DEFAULT_CONSOLE_LOGLEVEL 7 /* anything MORE serious than KERN_DEBUG */
64 65
65DECLARE_WAIT_QUEUE_HEAD(log_wait);
66
67int console_printk[4] = { 66int console_printk[4] = {
68 DEFAULT_CONSOLE_LOGLEVEL, /* console_loglevel */ 67 DEFAULT_CONSOLE_LOGLEVEL, /* console_loglevel */
69 DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */ 68 DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */
@@ -87,6 +86,12 @@ static DEFINE_SEMAPHORE(console_sem);
87struct console *console_drivers; 86struct console *console_drivers;
88EXPORT_SYMBOL_GPL(console_drivers); 87EXPORT_SYMBOL_GPL(console_drivers);
89 88
89#ifdef CONFIG_LOCKDEP
90static struct lockdep_map console_lock_dep_map = {
91 .name = "console_lock"
92};
93#endif
94
90/* 95/*
91 * This is used for debugging the mess that is the VT code by 96 * This is used for debugging the mess that is the VT code by
92 * keeping track if we have the console semaphore held. It's 97 * keeping track if we have the console semaphore held. It's
@@ -217,6 +222,7 @@ struct log {
217static DEFINE_RAW_SPINLOCK(logbuf_lock); 222static DEFINE_RAW_SPINLOCK(logbuf_lock);
218 223
219#ifdef CONFIG_PRINTK 224#ifdef CONFIG_PRINTK
225DECLARE_WAIT_QUEUE_HEAD(log_wait);
220/* the next printk record to read by syslog(READ) or /proc/kmsg */ 226/* the next printk record to read by syslog(READ) or /proc/kmsg */
221static u64 syslog_seq; 227static u64 syslog_seq;
222static u32 syslog_idx; 228static u32 syslog_idx;
@@ -741,6 +747,21 @@ void __init setup_log_buf(int early)
741 free, (free * 100) / __LOG_BUF_LEN); 747 free, (free * 100) / __LOG_BUF_LEN);
742} 748}
743 749
750static bool __read_mostly ignore_loglevel;
751
752static int __init ignore_loglevel_setup(char *str)
753{
754 ignore_loglevel = 1;
755 printk(KERN_INFO "debug: ignoring loglevel setting.\n");
756
757 return 0;
758}
759
760early_param("ignore_loglevel", ignore_loglevel_setup);
761module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR);
762MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to"
763 "print all kernel messages to the console.");
764
744#ifdef CONFIG_BOOT_PRINTK_DELAY 765#ifdef CONFIG_BOOT_PRINTK_DELAY
745 766
746static int boot_delay; /* msecs delay after each printk during bootup */ 767static int boot_delay; /* msecs delay after each printk during bootup */
@@ -764,13 +785,15 @@ static int __init boot_delay_setup(char *str)
764} 785}
765__setup("boot_delay=", boot_delay_setup); 786__setup("boot_delay=", boot_delay_setup);
766 787
767static void boot_delay_msec(void) 788static void boot_delay_msec(int level)
768{ 789{
769 unsigned long long k; 790 unsigned long long k;
770 unsigned long timeout; 791 unsigned long timeout;
771 792
772 if (boot_delay == 0 || system_state != SYSTEM_BOOTING) 793 if ((boot_delay == 0 || system_state != SYSTEM_BOOTING)
794 || (level >= console_loglevel && !ignore_loglevel)) {
773 return; 795 return;
796 }
774 797
775 k = (unsigned long long)loops_per_msec * boot_delay; 798 k = (unsigned long long)loops_per_msec * boot_delay;
776 799
@@ -789,7 +812,7 @@ static void boot_delay_msec(void)
789 } 812 }
790} 813}
791#else 814#else
792static inline void boot_delay_msec(void) 815static inline void boot_delay_msec(int level)
793{ 816{
794} 817}
795#endif 818#endif
@@ -847,10 +870,11 @@ static size_t print_time(u64 ts, char *buf)
847 if (!printk_time) 870 if (!printk_time)
848 return 0; 871 return 0;
849 872
873 rem_nsec = do_div(ts, 1000000000);
874
850 if (!buf) 875 if (!buf)
851 return 15; 876 return snprintf(NULL, 0, "[%5lu.000000] ", (unsigned long)ts);
852 877
853 rem_nsec = do_div(ts, 1000000000);
854 return sprintf(buf, "[%5lu.%06lu] ", 878 return sprintf(buf, "[%5lu.%06lu] ",
855 (unsigned long)ts, rem_nsec / 1000); 879 (unsigned long)ts, rem_nsec / 1000);
856} 880}
@@ -1232,21 +1256,6 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
1232 return do_syslog(type, buf, len, SYSLOG_FROM_CALL); 1256 return do_syslog(type, buf, len, SYSLOG_FROM_CALL);
1233} 1257}
1234 1258
1235static bool __read_mostly ignore_loglevel;
1236
1237static int __init ignore_loglevel_setup(char *str)
1238{
1239 ignore_loglevel = 1;
1240 printk(KERN_INFO "debug: ignoring loglevel setting.\n");
1241
1242 return 0;
1243}
1244
1245early_param("ignore_loglevel", ignore_loglevel_setup);
1246module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR);
1247MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to"
1248 "print all kernel messages to the console.");
1249
1250/* 1259/*
1251 * Call the console drivers, asking them to write out 1260 * Call the console drivers, asking them to write out
1252 * log_buf[start] to log_buf[end - 1]. 1261 * log_buf[start] to log_buf[end - 1].
@@ -1492,7 +1501,7 @@ asmlinkage int vprintk_emit(int facility, int level,
1492 int this_cpu; 1501 int this_cpu;
1493 int printed_len = 0; 1502 int printed_len = 0;
1494 1503
1495 boot_delay_msec(); 1504 boot_delay_msec(level);
1496 printk_delay(); 1505 printk_delay();
1497 1506
1498 /* This stops the holder of console_sem just where we want him */ 1507 /* This stops the holder of console_sem just where we want him */
@@ -1908,12 +1917,14 @@ static int __cpuinit console_cpu_notify(struct notifier_block *self,
1908 */ 1917 */
1909void console_lock(void) 1918void console_lock(void)
1910{ 1919{
1911 BUG_ON(in_interrupt()); 1920 might_sleep();
1921
1912 down(&console_sem); 1922 down(&console_sem);
1913 if (console_suspended) 1923 if (console_suspended)
1914 return; 1924 return;
1915 console_locked = 1; 1925 console_locked = 1;
1916 console_may_schedule = 1; 1926 console_may_schedule = 1;
1927 mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_);
1917} 1928}
1918EXPORT_SYMBOL(console_lock); 1929EXPORT_SYMBOL(console_lock);
1919 1930
@@ -1935,6 +1946,7 @@ int console_trylock(void)
1935 } 1946 }
1936 console_locked = 1; 1947 console_locked = 1;
1937 console_may_schedule = 0; 1948 console_may_schedule = 0;
1949 mutex_acquire(&console_lock_dep_map, 0, 1, _RET_IP_);
1938 return 1; 1950 return 1;
1939} 1951}
1940EXPORT_SYMBOL(console_trylock); 1952EXPORT_SYMBOL(console_trylock);
@@ -1944,43 +1956,6 @@ int is_console_locked(void)
1944 return console_locked; 1956 return console_locked;
1945} 1957}
1946 1958
1947/*
1948 * Delayed printk version, for scheduler-internal messages:
1949 */
1950#define PRINTK_BUF_SIZE 512
1951
1952#define PRINTK_PENDING_WAKEUP 0x01
1953#define PRINTK_PENDING_SCHED 0x02
1954
1955static DEFINE_PER_CPU(int, printk_pending);
1956static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf);
1957
1958void printk_tick(void)
1959{
1960 if (__this_cpu_read(printk_pending)) {
1961 int pending = __this_cpu_xchg(printk_pending, 0);
1962 if (pending & PRINTK_PENDING_SCHED) {
1963 char *buf = __get_cpu_var(printk_sched_buf);
1964 printk(KERN_WARNING "[sched_delayed] %s", buf);
1965 }
1966 if (pending & PRINTK_PENDING_WAKEUP)
1967 wake_up_interruptible(&log_wait);
1968 }
1969}
1970
1971int printk_needs_cpu(int cpu)
1972{
1973 if (cpu_is_offline(cpu))
1974 printk_tick();
1975 return __this_cpu_read(printk_pending);
1976}
1977
1978void wake_up_klogd(void)
1979{
1980 if (waitqueue_active(&log_wait))
1981 this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
1982}
1983
1984static void console_cont_flush(char *text, size_t size) 1959static void console_cont_flush(char *text, size_t size)
1985{ 1960{
1986 unsigned long flags; 1961 unsigned long flags;
@@ -2095,6 +2070,7 @@ skip:
2095 local_irq_restore(flags); 2070 local_irq_restore(flags);
2096 } 2071 }
2097 console_locked = 0; 2072 console_locked = 0;
2073 mutex_release(&console_lock_dep_map, 1, _RET_IP_);
2098 2074
2099 /* Release the exclusive_console once it is used */ 2075 /* Release the exclusive_console once it is used */
2100 if (unlikely(exclusive_console)) 2076 if (unlikely(exclusive_console))
@@ -2442,6 +2418,44 @@ static int __init printk_late_init(void)
2442late_initcall(printk_late_init); 2418late_initcall(printk_late_init);
2443 2419
2444#if defined CONFIG_PRINTK 2420#if defined CONFIG_PRINTK
2421/*
2422 * Delayed printk version, for scheduler-internal messages:
2423 */
2424#define PRINTK_BUF_SIZE 512
2425
2426#define PRINTK_PENDING_WAKEUP 0x01
2427#define PRINTK_PENDING_SCHED 0x02
2428
2429static DEFINE_PER_CPU(int, printk_pending);
2430static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf);
2431
2432static void wake_up_klogd_work_func(struct irq_work *irq_work)
2433{
2434 int pending = __this_cpu_xchg(printk_pending, 0);
2435
2436 if (pending & PRINTK_PENDING_SCHED) {
2437 char *buf = __get_cpu_var(printk_sched_buf);
2438 printk(KERN_WARNING "[sched_delayed] %s", buf);
2439 }
2440
2441 if (pending & PRINTK_PENDING_WAKEUP)
2442 wake_up_interruptible(&log_wait);
2443}
2444
2445static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = {
2446 .func = wake_up_klogd_work_func,
2447 .flags = IRQ_WORK_LAZY,
2448};
2449
2450void wake_up_klogd(void)
2451{
2452 preempt_disable();
2453 if (waitqueue_active(&log_wait)) {
2454 this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
2455 irq_work_queue(&__get_cpu_var(wake_up_klogd_work));
2456 }
2457 preempt_enable();
2458}
2445 2459
2446int printk_sched(const char *fmt, ...) 2460int printk_sched(const char *fmt, ...)
2447{ 2461{
@@ -2458,6 +2472,7 @@ int printk_sched(const char *fmt, ...)
2458 va_end(args); 2472 va_end(args);
2459 2473
2460 __this_cpu_or(printk_pending, PRINTK_PENDING_SCHED); 2474 __this_cpu_or(printk_pending, PRINTK_PENDING_SCHED);
2475 irq_work_queue(&__get_cpu_var(wake_up_klogd_work));
2461 local_irq_restore(flags); 2476 local_irq_restore(flags);
2462 2477
2463 return r; 2478 return r;
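
[Editor's sketch] The printk rework above replaces the old printk_tick()/printk_needs_cpu() polling with lazy irq_work: the deep printk path only sets a per-CPU pending flag and queues an irq_work, and the actual klogd wakeup runs later from a context where waking sleepers is safe. A minimal sketch of that shape with demo_-prefixed names (illustrative, not from the patch):

#include <linux/irq_work.h>
#include <linux/percpu.h>
#include <linux/preempt.h>
#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(demo_wait);

/* Runs at irq_work time, i.e. outside the troublesome caller's context. */
static void demo_work_func(struct irq_work *work)
{
	wake_up_interruptible(&demo_wait);
}

static DEFINE_PER_CPU(struct irq_work, demo_work) = {
	.func	= demo_work_func,
	.flags	= IRQ_WORK_LAZY,	/* defer to the next tick, as above */
};

/* Safe to call from contexts where a direct wake_up() could deadlock. */
static void demo_poke(void)
{
	preempt_disable();
	irq_work_queue(&__get_cpu_var(demo_work));
	preempt_enable();
}
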
diff --git a/kernel/profile.c b/kernel/profile.c
index 76b8e77773ee..dc3384ee874e 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -8,9 +8,10 @@
8 * Scheduler profiling support, Arjan van de Ven and Ingo Molnar, 8 * Scheduler profiling support, Arjan van de Ven and Ingo Molnar,
9 * Red Hat, July 2004 9 * Red Hat, July 2004
10 * Consolidation of architecture support code for profiling, 10 * Consolidation of architecture support code for profiling,
11 * William Irwin, Oracle, July 2004 11 * Nadia Yvette Chambers, Oracle, July 2004
12 * Amortized hit count accounting via per-cpu open-addressed hashtables 12 * Amortized hit count accounting via per-cpu open-addressed hashtables
13 * to resolve timer interrupt livelocks, William Irwin, Oracle, 2004 13 * to resolve timer interrupt livelocks, Nadia Yvette Chambers,
14 * Oracle, 2004
14 */ 15 */
15 16
16#include <linux/export.h> 17#include <linux/export.h>
@@ -36,9 +37,6 @@ struct profile_hit {
36#define NR_PROFILE_HIT (PAGE_SIZE/sizeof(struct profile_hit)) 37#define NR_PROFILE_HIT (PAGE_SIZE/sizeof(struct profile_hit))
37#define NR_PROFILE_GRP (NR_PROFILE_HIT/PROFILE_GRPSZ) 38#define NR_PROFILE_GRP (NR_PROFILE_HIT/PROFILE_GRPSZ)
38 39
39/* Oprofile timer tick hook */
40static int (*timer_hook)(struct pt_regs *) __read_mostly;
41
42static atomic_t *prof_buffer; 40static atomic_t *prof_buffer;
43static unsigned long prof_len, prof_shift; 41static unsigned long prof_len, prof_shift;
44 42
@@ -207,25 +205,6 @@ int profile_event_unregister(enum profile_type type, struct notifier_block *n)
207} 205}
208EXPORT_SYMBOL_GPL(profile_event_unregister); 206EXPORT_SYMBOL_GPL(profile_event_unregister);
209 207
210int register_timer_hook(int (*hook)(struct pt_regs *))
211{
212 if (timer_hook)
213 return -EBUSY;
214 timer_hook = hook;
215 return 0;
216}
217EXPORT_SYMBOL_GPL(register_timer_hook);
218
219void unregister_timer_hook(int (*hook)(struct pt_regs *))
220{
221 WARN_ON(hook != timer_hook);
222 timer_hook = NULL;
223 /* make sure all CPUs see the NULL hook */
224 synchronize_sched(); /* Allow ongoing interrupts to complete. */
225}
226EXPORT_SYMBOL_GPL(unregister_timer_hook);
227
228
229#ifdef CONFIG_SMP 208#ifdef CONFIG_SMP
230/* 209/*
231 * Each cpu has a pair of open-addressed hashtables for pending 210 * Each cpu has a pair of open-addressed hashtables for pending
@@ -256,7 +235,7 @@ EXPORT_SYMBOL_GPL(unregister_timer_hook);
256 * pagetable hash functions, but uses a full hashtable full of finite 235 * pagetable hash functions, but uses a full hashtable full of finite
257 * collision chains, not just pairs of them. 236 * collision chains, not just pairs of them.
258 * 237 *
259 * -- wli 238 * -- nyc
260 */ 239 */
261static void __profile_flip_buffers(void *unused) 240static void __profile_flip_buffers(void *unused)
262{ 241{
@@ -435,8 +414,6 @@ void profile_tick(int type)
435{ 414{
436 struct pt_regs *regs = get_irq_regs(); 415 struct pt_regs *regs = get_irq_regs();
437 416
438 if (type == CPU_PROFILING && timer_hook)
439 timer_hook(regs);
440 if (!user_mode(regs) && prof_cpu_mask != NULL && 417 if (!user_mode(regs) && prof_cpu_mask != NULL &&
441 cpumask_test_cpu(smp_processor_id(), prof_cpu_mask)) 418 cpumask_test_cpu(smp_processor_id(), prof_cpu_mask))
442 profile_hit(type, (void *)profile_pc(regs)); 419 profile_hit(type, (void *)profile_pc(regs));
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 1f5e55dda955..acbd28424d81 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -117,11 +117,45 @@ void __ptrace_unlink(struct task_struct *child)
117 * TASK_KILLABLE sleeps. 117 * TASK_KILLABLE sleeps.
118 */ 118 */
119 if (child->jobctl & JOBCTL_STOP_PENDING || task_is_traced(child)) 119 if (child->jobctl & JOBCTL_STOP_PENDING || task_is_traced(child))
120 signal_wake_up(child, task_is_traced(child)); 120 ptrace_signal_wake_up(child, true);
121 121
122 spin_unlock(&child->sighand->siglock); 122 spin_unlock(&child->sighand->siglock);
123} 123}
124 124
125/* Ensure that nothing can wake it up, even SIGKILL */
126static bool ptrace_freeze_traced(struct task_struct *task)
127{
128 bool ret = false;
129
130 /* Lockless, nobody but us can set this flag */
131 if (task->jobctl & JOBCTL_LISTENING)
132 return ret;
133
134 spin_lock_irq(&task->sighand->siglock);
135 if (task_is_traced(task) && !__fatal_signal_pending(task)) {
136 task->state = __TASK_TRACED;
137 ret = true;
138 }
139 spin_unlock_irq(&task->sighand->siglock);
140
141 return ret;
142}
143
144static void ptrace_unfreeze_traced(struct task_struct *task)
145{
146 if (task->state != __TASK_TRACED)
147 return;
148
149 WARN_ON(!task->ptrace || task->parent != current);
150
151 spin_lock_irq(&task->sighand->siglock);
152 if (__fatal_signal_pending(task))
153 wake_up_state(task, __TASK_TRACED);
154 else
155 task->state = TASK_TRACED;
156 spin_unlock_irq(&task->sighand->siglock);
157}
158
125/** 159/**
126 * ptrace_check_attach - check whether ptracee is ready for ptrace operation 160 * ptrace_check_attach - check whether ptracee is ready for ptrace operation
127 * @child: ptracee to check for 161 * @child: ptracee to check for
@@ -139,7 +173,7 @@ void __ptrace_unlink(struct task_struct *child)
139 * RETURNS: 173 * RETURNS:
140 * 0 on success, -ESRCH if %child is not ready. 174 * 0 on success, -ESRCH if %child is not ready.
141 */ 175 */
142int ptrace_check_attach(struct task_struct *child, bool ignore_state) 176static int ptrace_check_attach(struct task_struct *child, bool ignore_state)
143{ 177{
144 int ret = -ESRCH; 178 int ret = -ESRCH;
145 179
@@ -151,24 +185,29 @@ int ptrace_check_attach(struct task_struct *child, bool ignore_state)
151 * be changed by us so it's not changing right after this. 185 * be changed by us so it's not changing right after this.
152 */ 186 */
153 read_lock(&tasklist_lock); 187 read_lock(&tasklist_lock);
154 if ((child->ptrace & PT_PTRACED) && child->parent == current) { 188 if (child->ptrace && child->parent == current) {
189 WARN_ON(child->state == __TASK_TRACED);
155 /* 190 /*
156 * child->sighand can't be NULL, release_task() 191 * child->sighand can't be NULL, release_task()
157 * does ptrace_unlink() before __exit_signal(). 192 * does ptrace_unlink() before __exit_signal().
158 */ 193 */
159 spin_lock_irq(&child->sighand->siglock); 194 if (ignore_state || ptrace_freeze_traced(child))
160 WARN_ON_ONCE(task_is_stopped(child));
161 if (ignore_state || (task_is_traced(child) &&
162 !(child->jobctl & JOBCTL_LISTENING)))
163 ret = 0; 195 ret = 0;
164 spin_unlock_irq(&child->sighand->siglock);
165 } 196 }
166 read_unlock(&tasklist_lock); 197 read_unlock(&tasklist_lock);
167 198
168 if (!ret && !ignore_state) 199 if (!ret && !ignore_state) {
169 ret = wait_task_inactive(child, TASK_TRACED) ? 0 : -ESRCH; 200 if (!wait_task_inactive(child, __TASK_TRACED)) {
201 /*
202 * This can only happen if may_ptrace_stop() fails and
203 * ptrace_stop() changes ->state back to TASK_RUNNING,
204 * so we should not worry about leaking __TASK_TRACED.
205 */
206 WARN_ON(child->state == __TASK_TRACED);
207 ret = -ESRCH;
208 }
209 }
170 210
171 /* All systems go.. */
172 return ret; 211 return ret;
173} 212}
174 213
@@ -215,8 +254,12 @@ ok:
215 smp_rmb(); 254 smp_rmb();
216 if (task->mm) 255 if (task->mm)
217 dumpable = get_dumpable(task->mm); 256 dumpable = get_dumpable(task->mm);
218 if (!dumpable && !ptrace_has_cap(task_user_ns(task), mode)) 257 rcu_read_lock();
258 if (!dumpable && !ptrace_has_cap(__task_cred(task)->user_ns, mode)) {
259 rcu_read_unlock();
219 return -EPERM; 260 return -EPERM;
261 }
262 rcu_read_unlock();
220 263
221 return security_ptrace_access_check(task, mode); 264 return security_ptrace_access_check(task, mode);
222} 265}
@@ -280,8 +323,10 @@ static int ptrace_attach(struct task_struct *task, long request,
280 323
281 if (seize) 324 if (seize)
282 flags |= PT_SEIZED; 325 flags |= PT_SEIZED;
283 if (ns_capable(task_user_ns(task), CAP_SYS_PTRACE)) 326 rcu_read_lock();
327 if (ns_capable(__task_cred(task)->user_ns, CAP_SYS_PTRACE))
284 flags |= PT_PTRACE_CAP; 328 flags |= PT_PTRACE_CAP;
329 rcu_read_unlock();
285 task->ptrace = flags; 330 task->ptrace = flags;
286 331
287 __ptrace_link(task, current); 332 __ptrace_link(task, current);
@@ -311,7 +356,7 @@ static int ptrace_attach(struct task_struct *task, long request,
311 */ 356 */
312 if (task_is_stopped(task) && 357 if (task_is_stopped(task) &&
313 task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING)) 358 task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING))
314 signal_wake_up(task, 1); 359 signal_wake_up_state(task, __TASK_STOPPED);
315 360
316 spin_unlock(&task->sighand->siglock); 361 spin_unlock(&task->sighand->siglock);
317 362
@@ -457,6 +502,9 @@ void exit_ptrace(struct task_struct *tracer)
457 return; 502 return;
458 503
459 list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) { 504 list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) {
505 if (unlikely(p->ptrace & PT_EXITKILL))
506 send_sig_info(SIGKILL, SEND_SIG_FORCED, p);
507
460 if (__ptrace_detach(tracer, p)) 508 if (__ptrace_detach(tracer, p))
461 list_add(&p->ptrace_entry, &ptrace_dead); 509 list_add(&p->ptrace_entry, &ptrace_dead);
462 } 510 }
@@ -664,6 +712,12 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type,
664 kiov->iov_len, kiov->iov_base); 712 kiov->iov_len, kiov->iov_base);
665} 713}
666 714
715/*
716 * This is declared in linux/regset.h and defined in machine-dependent
717 * code. We put the export here, near the primary machine-neutral use,
718 * to ensure no machine forgets it.
719 */
720EXPORT_SYMBOL_GPL(task_user_regset_view);
667#endif 721#endif
668 722
669int ptrace_request(struct task_struct *child, long request, 723int ptrace_request(struct task_struct *child, long request,
@@ -728,7 +782,7 @@ int ptrace_request(struct task_struct *child, long request,
728 * tracee into STOP. 782 * tracee into STOP.
729 */ 783 */
730 if (likely(task_set_jobctl_pending(child, JOBCTL_TRAP_STOP))) 784 if (likely(task_set_jobctl_pending(child, JOBCTL_TRAP_STOP)))
731 signal_wake_up(child, child->jobctl & JOBCTL_LISTENING); 785 ptrace_signal_wake_up(child, child->jobctl & JOBCTL_LISTENING);
732 786
733 unlock_task_sighand(child, &flags); 787 unlock_task_sighand(child, &flags);
734 ret = 0; 788 ret = 0;
@@ -754,7 +808,7 @@ int ptrace_request(struct task_struct *child, long request,
754 * start of this trap and now. Trigger re-trap. 808 * start of this trap and now. Trigger re-trap.
755 */ 809 */
756 if (child->jobctl & JOBCTL_TRAP_NOTIFY) 810 if (child->jobctl & JOBCTL_TRAP_NOTIFY)
757 signal_wake_up(child, true); 811 ptrace_signal_wake_up(child, true);
758 ret = 0; 812 ret = 0;
759 } 813 }
760 unlock_task_sighand(child, &flags); 814 unlock_task_sighand(child, &flags);
@@ -891,6 +945,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
891 goto out_put_task_struct; 945 goto out_put_task_struct;
892 946
893 ret = arch_ptrace(child, request, addr, data); 947 ret = arch_ptrace(child, request, addr, data);
948 if (ret || request != PTRACE_DETACH)
949 ptrace_unfreeze_traced(child);
894 950
895 out_put_task_struct: 951 out_put_task_struct:
896 put_task_struct(child); 952 put_task_struct(child);
@@ -1030,8 +1086,11 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
1030 1086
1031 ret = ptrace_check_attach(child, request == PTRACE_KILL || 1087 ret = ptrace_check_attach(child, request == PTRACE_KILL ||
1032 request == PTRACE_INTERRUPT); 1088 request == PTRACE_INTERRUPT);
1033 if (!ret) 1089 if (!ret) {
1034 ret = compat_arch_ptrace(child, request, addr, data); 1090 ret = compat_arch_ptrace(child, request, addr, data);
1091 if (ret || request != PTRACE_DETACH)
1092 ptrace_unfreeze_traced(child);
1093 }
1035 1094
1036 out_put_task_struct: 1095 out_put_task_struct:
1037 put_task_struct(child); 1096 put_task_struct(child);
diff --git a/kernel/rcu.h b/kernel/rcu.h
index 8ba99cdc6515..7f8e7590e3e5 100644
--- a/kernel/rcu.h
+++ b/kernel/rcu.h
@@ -109,4 +109,13 @@ static inline bool __rcu_reclaim(char *rn, struct rcu_head *head)
109 } 109 }
110} 110}
111 111
112extern int rcu_expedited;
113
114#ifdef CONFIG_RCU_STALL_COMMON
115
116extern int rcu_cpu_stall_suppress;
117int rcu_jiffies_till_stall_check(void);
118
119#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
120
112#endif /* __LINUX_RCU_H */ 121#endif /* __LINUX_RCU_H */
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 29ca1c6da594..48ab70384a4c 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -46,12 +46,15 @@
46#include <linux/export.h> 46#include <linux/export.h>
47#include <linux/hardirq.h> 47#include <linux/hardirq.h>
48#include <linux/delay.h> 48#include <linux/delay.h>
49#include <linux/module.h>
49 50
50#define CREATE_TRACE_POINTS 51#define CREATE_TRACE_POINTS
51#include <trace/events/rcu.h> 52#include <trace/events/rcu.h>
52 53
53#include "rcu.h" 54#include "rcu.h"
54 55
56module_param(rcu_expedited, int, 0);
57
55#ifdef CONFIG_PREEMPT_RCU 58#ifdef CONFIG_PREEMPT_RCU
56 59
57/* 60/*
@@ -401,11 +404,65 @@ EXPORT_SYMBOL_GPL(rcuhead_debug_descr);
401#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ 404#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
402 405
403#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE) 406#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE)
404void do_trace_rcu_torture_read(char *rcutorturename, struct rcu_head *rhp) 407void do_trace_rcu_torture_read(char *rcutorturename, struct rcu_head *rhp,
408 unsigned long secs,
409 unsigned long c_old, unsigned long c)
405{ 410{
406 trace_rcu_torture_read(rcutorturename, rhp); 411 trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c);
407} 412}
408EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read); 413EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read);
409#else 414#else
410#define do_trace_rcu_torture_read(rcutorturename, rhp) do { } while (0) 415#define do_trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c) \
416 do { } while (0)
417#endif
418
419#ifdef CONFIG_RCU_STALL_COMMON
420
421#ifdef CONFIG_PROVE_RCU
422#define RCU_STALL_DELAY_DELTA (5 * HZ)
423#else
424#define RCU_STALL_DELAY_DELTA 0
411#endif 425#endif
426
427int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
428int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
429
430module_param(rcu_cpu_stall_suppress, int, 0644);
431module_param(rcu_cpu_stall_timeout, int, 0644);
432
433int rcu_jiffies_till_stall_check(void)
434{
435 int till_stall_check = ACCESS_ONCE(rcu_cpu_stall_timeout);
436
437 /*
438 * Limit check must be consistent with the Kconfig limits
439 * for CONFIG_RCU_CPU_STALL_TIMEOUT.
440 */
441 if (till_stall_check < 3) {
442 ACCESS_ONCE(rcu_cpu_stall_timeout) = 3;
443 till_stall_check = 3;
444 } else if (till_stall_check > 300) {
445 ACCESS_ONCE(rcu_cpu_stall_timeout) = 300;
446 till_stall_check = 300;
447 }
448 return till_stall_check * HZ + RCU_STALL_DELAY_DELTA;
449}
450
451static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
452{
453 rcu_cpu_stall_suppress = 1;
454 return NOTIFY_DONE;
455}
456
457static struct notifier_block rcu_panic_block = {
458 .notifier_call = rcu_panic,
459};
460
461static int __init check_cpu_stall_init(void)
462{
463 atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block);
464 return 0;
465}
466early_initcall(check_cpu_stall_init);
467
468#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
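
[Editor's sketch] rcu_jiffies_till_stall_check() above clamps the rcu_cpu_stall_timeout module parameter into the 3..300 second range (consistent with the Kconfig limits, per the comment in the patch), converts to jiffies, and adds slack when PROVE_RCU is enabled. A userspace stand-in with HZ and the slack as plain constants:

#include <stdio.h>

#define HZ			100	/* illustrative; the kernel value is config-dependent */
#define STALL_DELAY_DELTA	0	/* (5 * HZ) when CONFIG_PROVE_RCU=y */

static int stall_check_jiffies(int timeout_s)
{
	/* Mirror the clamp in rcu_jiffies_till_stall_check(). */
	if (timeout_s < 3)
		timeout_s = 3;
	else if (timeout_s > 300)
		timeout_s = 300;
	return timeout_s * HZ + STALL_DELAY_DELTA;
}

int main(void)
{
	printf("%d\n", stall_check_jiffies(1));		/* clamped up to 3s   -> 300   */
	printf("%d\n", stall_check_jiffies(21));	/* in range, 21s      -> 2100  */
	printf("%d\n", stall_check_jiffies(1000));	/* clamped down, 300s -> 30000 */
	return 0;
}
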
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index e4c6a598d6f7..a0714a51b6d7 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -51,10 +51,10 @@ static void __call_rcu(struct rcu_head *head,
51 void (*func)(struct rcu_head *rcu), 51 void (*func)(struct rcu_head *rcu),
52 struct rcu_ctrlblk *rcp); 52 struct rcu_ctrlblk *rcp);
53 53
54#include "rcutiny_plugin.h"
55
56static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; 54static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
57 55
56#include "rcutiny_plugin.h"
57
58/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */ 58/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */
59static void rcu_idle_enter_common(long long newval) 59static void rcu_idle_enter_common(long long newval)
60{ 60{
@@ -193,9 +193,9 @@ EXPORT_SYMBOL(rcu_is_cpu_idle);
193 * interrupts don't count, we must be running at the first interrupt 193 * interrupts don't count, we must be running at the first interrupt
194 * level. 194 * level.
195 */ 195 */
196int rcu_is_cpu_rrupt_from_idle(void) 196static int rcu_is_cpu_rrupt_from_idle(void)
197{ 197{
198 return rcu_dynticks_nesting <= 0; 198 return rcu_dynticks_nesting <= 1;
199} 199}
200 200
201/* 201/*
@@ -205,6 +205,7 @@ int rcu_is_cpu_rrupt_from_idle(void)
205 */ 205 */
206static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) 206static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
207{ 207{
208 reset_cpu_stall_ticks(rcp);
208 if (rcp->rcucblist != NULL && 209 if (rcp->rcucblist != NULL &&
209 rcp->donetail != rcp->curtail) { 210 rcp->donetail != rcp->curtail) {
210 rcp->donetail = rcp->curtail; 211 rcp->donetail = rcp->curtail;
@@ -251,6 +252,7 @@ void rcu_bh_qs(int cpu)
251 */ 252 */
252void rcu_check_callbacks(int cpu, int user) 253void rcu_check_callbacks(int cpu, int user)
253{ 254{
255 check_cpu_stalls();
254 if (user || rcu_is_cpu_rrupt_from_idle()) 256 if (user || rcu_is_cpu_rrupt_from_idle())
255 rcu_sched_qs(cpu); 257 rcu_sched_qs(cpu);
256 else if (!in_softirq()) 258 else if (!in_softirq())
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 3d0190282204..8a233002faeb 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -33,6 +33,9 @@ struct rcu_ctrlblk {
33 struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ 33 struct rcu_head **donetail; /* ->next pointer of last "done" CB. */
34 struct rcu_head **curtail; /* ->next pointer of last CB. */ 34 struct rcu_head **curtail; /* ->next pointer of last CB. */
35 RCU_TRACE(long qlen); /* Number of pending CBs. */ 35 RCU_TRACE(long qlen); /* Number of pending CBs. */
36 RCU_TRACE(unsigned long gp_start); /* Start time for stalls. */
37 RCU_TRACE(unsigned long ticks_this_gp); /* Statistic for stalls. */
38 RCU_TRACE(unsigned long jiffies_stall); /* Jiffies at next stall. */
36 RCU_TRACE(char *name); /* Name of RCU type. */ 39 RCU_TRACE(char *name); /* Name of RCU type. */
37}; 40};
38 41
@@ -54,6 +57,51 @@ int rcu_scheduler_active __read_mostly;
54EXPORT_SYMBOL_GPL(rcu_scheduler_active); 57EXPORT_SYMBOL_GPL(rcu_scheduler_active);
55#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 58#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
56 59
60#ifdef CONFIG_RCU_TRACE
61
62static void check_cpu_stall(struct rcu_ctrlblk *rcp)
63{
64 unsigned long j;
65 unsigned long js;
66
67 if (rcu_cpu_stall_suppress)
68 return;
69 rcp->ticks_this_gp++;
70 j = jiffies;
71 js = rcp->jiffies_stall;
72 if (*rcp->curtail && ULONG_CMP_GE(j, js)) {
73 pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n",
74 rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting,
75 jiffies - rcp->gp_start, rcp->qlen);
76 dump_stack();
77 }
78 if (*rcp->curtail && ULONG_CMP_GE(j, js))
79 rcp->jiffies_stall = jiffies +
80 3 * rcu_jiffies_till_stall_check() + 3;
81 else if (ULONG_CMP_GE(j, js))
82 rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check();
83}
84
85static void check_cpu_stall_preempt(void);
86
87#endif /* #ifdef CONFIG_RCU_TRACE */
88
89static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp)
90{
91#ifdef CONFIG_RCU_TRACE
92 rcp->ticks_this_gp = 0;
93 rcp->gp_start = jiffies;
94 rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check();
95#endif /* #ifdef CONFIG_RCU_TRACE */
96}
97
98static void check_cpu_stalls(void)
99{
100 RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk));
101 RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk));
102 RCU_TRACE(check_cpu_stall_preempt());
103}
104
57#ifdef CONFIG_TINY_PREEMPT_RCU 105#ifdef CONFIG_TINY_PREEMPT_RCU
58 106
59#include <linux/delay.h> 107#include <linux/delay.h>
@@ -448,6 +496,7 @@ static void rcu_preempt_start_gp(void)
448 /* Official start of GP. */ 496 /* Official start of GP. */
449 rcu_preempt_ctrlblk.gpnum++; 497 rcu_preempt_ctrlblk.gpnum++;
450 RCU_TRACE(rcu_preempt_ctrlblk.n_grace_periods++); 498 RCU_TRACE(rcu_preempt_ctrlblk.n_grace_periods++);
499 reset_cpu_stall_ticks(&rcu_preempt_ctrlblk.rcb);
451 500
452 /* Any blocked RCU readers block new GP. */ 501 /* Any blocked RCU readers block new GP. */
453 if (rcu_preempt_blocked_readers_any()) 502 if (rcu_preempt_blocked_readers_any())
@@ -706,7 +755,10 @@ void synchronize_rcu(void)
706 return; 755 return;
707 756
708 /* Once we get past the fastpath checks, same code as rcu_barrier(). */ 757 /* Once we get past the fastpath checks, same code as rcu_barrier(). */
709 rcu_barrier(); 758 if (rcu_expedited)
759 synchronize_rcu_expedited();
760 else
761 rcu_barrier();
710} 762}
711EXPORT_SYMBOL_GPL(synchronize_rcu); 763EXPORT_SYMBOL_GPL(synchronize_rcu);
712 764
@@ -1051,4 +1103,11 @@ MODULE_AUTHOR("Paul E. McKenney");
1051MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation"); 1103MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation");
1052MODULE_LICENSE("GPL"); 1104MODULE_LICENSE("GPL");
1053 1105
1106static void check_cpu_stall_preempt(void)
1107{
1108#ifdef CONFIG_TINY_PREEMPT_RCU
1109 check_cpu_stall(&rcu_preempt_ctrlblk.rcb);
1110#endif /* #ifdef CONFIG_TINY_PREEMPT_RCU */
1111}
1112
1054#endif /* #ifdef CONFIG_RCU_TRACE */ 1113#endif /* #ifdef CONFIG_RCU_TRACE */
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index aaa7b9f3532a..e1f3a8c96724 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -46,6 +46,7 @@
46#include <linux/stat.h> 46#include <linux/stat.h>
47#include <linux/srcu.h> 47#include <linux/srcu.h>
48#include <linux/slab.h> 48#include <linux/slab.h>
49#include <linux/trace_clock.h>
49#include <asm/byteorder.h> 50#include <asm/byteorder.h>
50 51
51MODULE_LICENSE("GPL"); 52MODULE_LICENSE("GPL");
@@ -207,6 +208,20 @@ MODULE_PARM_DESC(rcutorture_runnable, "Start rcutorture at boot");
207#define rcu_can_boost() 0 208#define rcu_can_boost() 0
208#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ 209#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */
209 210
211#ifdef CONFIG_RCU_TRACE
212static u64 notrace rcu_trace_clock_local(void)
213{
214 u64 ts = trace_clock_local();
215 unsigned long __maybe_unused ts_rem = do_div(ts, NSEC_PER_USEC);
216 return ts;
217}
218#else /* #ifdef CONFIG_RCU_TRACE */
219static u64 notrace rcu_trace_clock_local(void)
220{
221 return 0ULL;
222}
223#endif /* #else #ifdef CONFIG_RCU_TRACE */
224
210static unsigned long shutdown_time; /* jiffies to system shutdown. */ 225static unsigned long shutdown_time; /* jiffies to system shutdown. */
211static unsigned long boost_starttime; /* jiffies of next boost test start. */ 226static unsigned long boost_starttime; /* jiffies of next boost test start. */
212DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ 227DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
@@ -339,7 +354,6 @@ rcu_stutter_wait(char *title)
339 354
340struct rcu_torture_ops { 355struct rcu_torture_ops {
341 void (*init)(void); 356 void (*init)(void);
342 void (*cleanup)(void);
343 int (*readlock)(void); 357 int (*readlock)(void);
344 void (*read_delay)(struct rcu_random_state *rrsp); 358 void (*read_delay)(struct rcu_random_state *rrsp);
345 void (*readunlock)(int idx); 359 void (*readunlock)(int idx);
@@ -431,7 +445,6 @@ static void rcu_torture_deferred_free(struct rcu_torture *p)
431 445
432static struct rcu_torture_ops rcu_ops = { 446static struct rcu_torture_ops rcu_ops = {
433 .init = NULL, 447 .init = NULL,
434 .cleanup = NULL,
435 .readlock = rcu_torture_read_lock, 448 .readlock = rcu_torture_read_lock,
436 .read_delay = rcu_read_delay, 449 .read_delay = rcu_read_delay,
437 .readunlock = rcu_torture_read_unlock, 450 .readunlock = rcu_torture_read_unlock,
@@ -475,7 +488,6 @@ static void rcu_sync_torture_init(void)
475 488
476static struct rcu_torture_ops rcu_sync_ops = { 489static struct rcu_torture_ops rcu_sync_ops = {
477 .init = rcu_sync_torture_init, 490 .init = rcu_sync_torture_init,
478 .cleanup = NULL,
479 .readlock = rcu_torture_read_lock, 491 .readlock = rcu_torture_read_lock,
480 .read_delay = rcu_read_delay, 492 .read_delay = rcu_read_delay,
481 .readunlock = rcu_torture_read_unlock, 493 .readunlock = rcu_torture_read_unlock,
@@ -493,7 +505,6 @@ static struct rcu_torture_ops rcu_sync_ops = {
493 505
494static struct rcu_torture_ops rcu_expedited_ops = { 506static struct rcu_torture_ops rcu_expedited_ops = {
495 .init = rcu_sync_torture_init, 507 .init = rcu_sync_torture_init,
496 .cleanup = NULL,
497 .readlock = rcu_torture_read_lock, 508 .readlock = rcu_torture_read_lock,
498 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 509 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
499 .readunlock = rcu_torture_read_unlock, 510 .readunlock = rcu_torture_read_unlock,
@@ -536,7 +547,6 @@ static void rcu_bh_torture_deferred_free(struct rcu_torture *p)
536 547
537static struct rcu_torture_ops rcu_bh_ops = { 548static struct rcu_torture_ops rcu_bh_ops = {
538 .init = NULL, 549 .init = NULL,
539 .cleanup = NULL,
540 .readlock = rcu_bh_torture_read_lock, 550 .readlock = rcu_bh_torture_read_lock,
541 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 551 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
542 .readunlock = rcu_bh_torture_read_unlock, 552 .readunlock = rcu_bh_torture_read_unlock,
@@ -553,7 +563,6 @@ static struct rcu_torture_ops rcu_bh_ops = {
553 563
554static struct rcu_torture_ops rcu_bh_sync_ops = { 564static struct rcu_torture_ops rcu_bh_sync_ops = {
555 .init = rcu_sync_torture_init, 565 .init = rcu_sync_torture_init,
556 .cleanup = NULL,
557 .readlock = rcu_bh_torture_read_lock, 566 .readlock = rcu_bh_torture_read_lock,
558 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 567 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
559 .readunlock = rcu_bh_torture_read_unlock, 568 .readunlock = rcu_bh_torture_read_unlock,
@@ -570,7 +579,6 @@ static struct rcu_torture_ops rcu_bh_sync_ops = {
570 579
571static struct rcu_torture_ops rcu_bh_expedited_ops = { 580static struct rcu_torture_ops rcu_bh_expedited_ops = {
572 .init = rcu_sync_torture_init, 581 .init = rcu_sync_torture_init,
573 .cleanup = NULL,
574 .readlock = rcu_bh_torture_read_lock, 582 .readlock = rcu_bh_torture_read_lock,
575 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 583 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
576 .readunlock = rcu_bh_torture_read_unlock, 584 .readunlock = rcu_bh_torture_read_unlock,
@@ -589,19 +597,7 @@ static struct rcu_torture_ops rcu_bh_expedited_ops = {
589 * Definitions for srcu torture testing. 597 * Definitions for srcu torture testing.
590 */ 598 */
591 599
592static struct srcu_struct srcu_ctl; 600DEFINE_STATIC_SRCU(srcu_ctl);
593
594static void srcu_torture_init(void)
595{
596 init_srcu_struct(&srcu_ctl);
597 rcu_sync_torture_init();
598}
599
600static void srcu_torture_cleanup(void)
601{
602 synchronize_srcu(&srcu_ctl);
603 cleanup_srcu_struct(&srcu_ctl);
604}
605 601
606static int srcu_torture_read_lock(void) __acquires(&srcu_ctl) 602static int srcu_torture_read_lock(void) __acquires(&srcu_ctl)
607{ 603{
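
[Editor's note] Replacing the open-coded srcu_torture_init()/srcu_torture_cleanup() pair with DEFINE_STATIC_SRCU() moves the setup of srcu_ctl to compile time, so the ops tables no longer need a cleanup hook at all. The hedged kernel-style sketch below shows the usual shape of code built around a statically defined SRCU domain; my_srcu, my_cfg, and the helper names are illustrative and not part of this patch.

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/srcu.h>

struct my_cfg {
        int value;
};

DEFINE_STATIC_SRCU(my_srcu);            /* compile-time init, nothing to clean up */
static struct my_cfg __rcu *cur_cfg;

static int my_read_value(void)
{
        struct my_cfg *cfg;
        int idx, val = -1;

        idx = srcu_read_lock(&my_srcu);         /* index must be passed to unlock */
        cfg = srcu_dereference(cur_cfg, &my_srcu);
        if (cfg)
                val = cfg->value;
        srcu_read_unlock(&my_srcu, idx);
        return val;
}

static void my_replace(struct my_cfg *new_cfg)
{
        struct my_cfg *old = rcu_dereference_protected(cur_cfg, 1);

        rcu_assign_pointer(cur_cfg, new_cfg);
        synchronize_srcu(&my_srcu);             /* wait out pre-existing SRCU readers */
        kfree(old);                             /* kfree(NULL) is a no-op */
}
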
@@ -672,8 +668,7 @@ static int srcu_torture_stats(char *page)
672} 668}
673 669
674static struct rcu_torture_ops srcu_ops = { 670static struct rcu_torture_ops srcu_ops = {
675 .init = srcu_torture_init, 671 .init = rcu_sync_torture_init,
676 .cleanup = srcu_torture_cleanup,
677 .readlock = srcu_torture_read_lock, 672 .readlock = srcu_torture_read_lock,
678 .read_delay = srcu_read_delay, 673 .read_delay = srcu_read_delay,
679 .readunlock = srcu_torture_read_unlock, 674 .readunlock = srcu_torture_read_unlock,
@@ -687,8 +682,7 @@ static struct rcu_torture_ops srcu_ops = {
687}; 682};
688 683
689static struct rcu_torture_ops srcu_sync_ops = { 684static struct rcu_torture_ops srcu_sync_ops = {
690 .init = srcu_torture_init, 685 .init = rcu_sync_torture_init,
691 .cleanup = srcu_torture_cleanup,
692 .readlock = srcu_torture_read_lock, 686 .readlock = srcu_torture_read_lock,
693 .read_delay = srcu_read_delay, 687 .read_delay = srcu_read_delay,
694 .readunlock = srcu_torture_read_unlock, 688 .readunlock = srcu_torture_read_unlock,
@@ -712,8 +706,7 @@ static void srcu_torture_read_unlock_raw(int idx) __releases(&srcu_ctl)
712} 706}
713 707
714static struct rcu_torture_ops srcu_raw_ops = { 708static struct rcu_torture_ops srcu_raw_ops = {
715 .init = srcu_torture_init, 709 .init = rcu_sync_torture_init,
716 .cleanup = srcu_torture_cleanup,
717 .readlock = srcu_torture_read_lock_raw, 710 .readlock = srcu_torture_read_lock_raw,
718 .read_delay = srcu_read_delay, 711 .read_delay = srcu_read_delay,
719 .readunlock = srcu_torture_read_unlock_raw, 712 .readunlock = srcu_torture_read_unlock_raw,
@@ -727,8 +720,7 @@ static struct rcu_torture_ops srcu_raw_ops = {
727}; 720};
728 721
729static struct rcu_torture_ops srcu_raw_sync_ops = { 722static struct rcu_torture_ops srcu_raw_sync_ops = {
730 .init = srcu_torture_init, 723 .init = rcu_sync_torture_init,
731 .cleanup = srcu_torture_cleanup,
732 .readlock = srcu_torture_read_lock_raw, 724 .readlock = srcu_torture_read_lock_raw,
733 .read_delay = srcu_read_delay, 725 .read_delay = srcu_read_delay,
734 .readunlock = srcu_torture_read_unlock_raw, 726 .readunlock = srcu_torture_read_unlock_raw,
@@ -747,8 +739,7 @@ static void srcu_torture_synchronize_expedited(void)
747} 739}
748 740
749static struct rcu_torture_ops srcu_expedited_ops = { 741static struct rcu_torture_ops srcu_expedited_ops = {
750 .init = srcu_torture_init, 742 .init = rcu_sync_torture_init,
751 .cleanup = srcu_torture_cleanup,
752 .readlock = srcu_torture_read_lock, 743 .readlock = srcu_torture_read_lock,
753 .read_delay = srcu_read_delay, 744 .read_delay = srcu_read_delay,
754 .readunlock = srcu_torture_read_unlock, 745 .readunlock = srcu_torture_read_unlock,
@@ -783,7 +774,6 @@ static void rcu_sched_torture_deferred_free(struct rcu_torture *p)
783 774
784static struct rcu_torture_ops sched_ops = { 775static struct rcu_torture_ops sched_ops = {
785 .init = rcu_sync_torture_init, 776 .init = rcu_sync_torture_init,
786 .cleanup = NULL,
787 .readlock = sched_torture_read_lock, 777 .readlock = sched_torture_read_lock,
788 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 778 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
789 .readunlock = sched_torture_read_unlock, 779 .readunlock = sched_torture_read_unlock,
@@ -799,7 +789,6 @@ static struct rcu_torture_ops sched_ops = {
799 789
800static struct rcu_torture_ops sched_sync_ops = { 790static struct rcu_torture_ops sched_sync_ops = {
801 .init = rcu_sync_torture_init, 791 .init = rcu_sync_torture_init,
802 .cleanup = NULL,
803 .readlock = sched_torture_read_lock, 792 .readlock = sched_torture_read_lock,
804 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 793 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
805 .readunlock = sched_torture_read_unlock, 794 .readunlock = sched_torture_read_unlock,
@@ -814,7 +803,6 @@ static struct rcu_torture_ops sched_sync_ops = {
814 803
815static struct rcu_torture_ops sched_expedited_ops = { 804static struct rcu_torture_ops sched_expedited_ops = {
816 .init = rcu_sync_torture_init, 805 .init = rcu_sync_torture_init,
817 .cleanup = NULL,
818 .readlock = sched_torture_read_lock, 806 .readlock = sched_torture_read_lock,
819 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 807 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
820 .readunlock = sched_torture_read_unlock, 808 .readunlock = sched_torture_read_unlock,
@@ -872,7 +860,7 @@ static int rcu_torture_boost(void *arg)
872 /* Wait for the next test interval. */ 860 /* Wait for the next test interval. */
873 oldstarttime = boost_starttime; 861 oldstarttime = boost_starttime;
874 while (ULONG_CMP_LT(jiffies, oldstarttime)) { 862 while (ULONG_CMP_LT(jiffies, oldstarttime)) {
875 schedule_timeout_uninterruptible(1); 863 schedule_timeout_interruptible(oldstarttime - jiffies);
876 rcu_stutter_wait("rcu_torture_boost"); 864 rcu_stutter_wait("rcu_torture_boost");
877 if (kthread_should_stop() || 865 if (kthread_should_stop() ||
878 fullstop != FULLSTOP_DONTSTOP) 866 fullstop != FULLSTOP_DONTSTOP)
@@ -1055,7 +1043,6 @@ void rcutorture_trace_dump(void)
1055 return; 1043 return;
1056 if (atomic_xchg(&beenhere, 1) != 0) 1044 if (atomic_xchg(&beenhere, 1) != 0)
1057 return; 1045 return;
1058 do_trace_rcu_torture_read(cur_ops->name, (struct rcu_head *)~0UL);
1059 ftrace_dump(DUMP_ALL); 1046 ftrace_dump(DUMP_ALL);
1060} 1047}
1061 1048
@@ -1069,13 +1056,16 @@ static void rcu_torture_timer(unsigned long unused)
1069{ 1056{
1070 int idx; 1057 int idx;
1071 int completed; 1058 int completed;
1059 int completed_end;
1072 static DEFINE_RCU_RANDOM(rand); 1060 static DEFINE_RCU_RANDOM(rand);
1073 static DEFINE_SPINLOCK(rand_lock); 1061 static DEFINE_SPINLOCK(rand_lock);
1074 struct rcu_torture *p; 1062 struct rcu_torture *p;
1075 int pipe_count; 1063 int pipe_count;
1064 unsigned long long ts;
1076 1065
1077 idx = cur_ops->readlock(); 1066 idx = cur_ops->readlock();
1078 completed = cur_ops->completed(); 1067 completed = cur_ops->completed();
1068 ts = rcu_trace_clock_local();
1079 p = rcu_dereference_check(rcu_torture_current, 1069 p = rcu_dereference_check(rcu_torture_current,
1080 rcu_read_lock_bh_held() || 1070 rcu_read_lock_bh_held() ||
1081 rcu_read_lock_sched_held() || 1071 rcu_read_lock_sched_held() ||
@@ -1085,7 +1075,6 @@ static void rcu_torture_timer(unsigned long unused)
1085 cur_ops->readunlock(idx); 1075 cur_ops->readunlock(idx);
1086 return; 1076 return;
1087 } 1077 }
1088 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);
1089 if (p->rtort_mbtest == 0) 1078 if (p->rtort_mbtest == 0)
1090 atomic_inc(&n_rcu_torture_mberror); 1079 atomic_inc(&n_rcu_torture_mberror);
1091 spin_lock(&rand_lock); 1080 spin_lock(&rand_lock);
@@ -1098,10 +1087,14 @@ static void rcu_torture_timer(unsigned long unused)
1098 /* Should not happen, but... */ 1087 /* Should not happen, but... */
1099 pipe_count = RCU_TORTURE_PIPE_LEN; 1088 pipe_count = RCU_TORTURE_PIPE_LEN;
1100 } 1089 }
1101 if (pipe_count > 1) 1090 completed_end = cur_ops->completed();
1091 if (pipe_count > 1) {
1092 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts,
1093 completed, completed_end);
1102 rcutorture_trace_dump(); 1094 rcutorture_trace_dump();
1095 }
1103 __this_cpu_inc(rcu_torture_count[pipe_count]); 1096 __this_cpu_inc(rcu_torture_count[pipe_count]);
1104 completed = cur_ops->completed() - completed; 1097 completed = completed_end - completed;
1105 if (completed > RCU_TORTURE_PIPE_LEN) { 1098 if (completed > RCU_TORTURE_PIPE_LEN) {
1106 /* Should not happen, but... */ 1099 /* Should not happen, but... */
1107 completed = RCU_TORTURE_PIPE_LEN; 1100 completed = RCU_TORTURE_PIPE_LEN;
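
[Editor's note] The timer and reader paths now sample ->completed exactly once at the end of the read-side critical section (completed_end) and reuse that single snapshot both for the new trace call and for the pipeline-delay computation, instead of calling cur_ops->completed() a second time. A trivial userspace sketch of that bookkeeping, with made-up counter values and rcutorture's pipe length of 10:

#include <stdio.h>

#define RCU_TORTURE_PIPE_LEN 10 /* as in rcutorture.c */

int main(void)
{
        int completed = 42;     /* snapshot taken just after cur_ops->readlock() */
        int completed_end = 45; /* single snapshot taken at the end of the read */
        int delta = completed_end - completed;

        if (delta > RCU_TORTURE_PIPE_LEN)       /* "Should not happen, but..." */
                delta = RCU_TORTURE_PIPE_LEN;
        printf("grace periods elapsed inside the read-side section: %d\n", delta);
        return 0;
}
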
@@ -1121,11 +1114,13 @@ static int
1121rcu_torture_reader(void *arg) 1114rcu_torture_reader(void *arg)
1122{ 1115{
1123 int completed; 1116 int completed;
1117 int completed_end;
1124 int idx; 1118 int idx;
1125 DEFINE_RCU_RANDOM(rand); 1119 DEFINE_RCU_RANDOM(rand);
1126 struct rcu_torture *p; 1120 struct rcu_torture *p;
1127 int pipe_count; 1121 int pipe_count;
1128 struct timer_list t; 1122 struct timer_list t;
1123 unsigned long long ts;
1129 1124
1130 VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); 1125 VERBOSE_PRINTK_STRING("rcu_torture_reader task started");
1131 set_user_nice(current, 19); 1126 set_user_nice(current, 19);
@@ -1139,6 +1134,7 @@ rcu_torture_reader(void *arg)
1139 } 1134 }
1140 idx = cur_ops->readlock(); 1135 idx = cur_ops->readlock();
1141 completed = cur_ops->completed(); 1136 completed = cur_ops->completed();
1137 ts = rcu_trace_clock_local();
1142 p = rcu_dereference_check(rcu_torture_current, 1138 p = rcu_dereference_check(rcu_torture_current,
1143 rcu_read_lock_bh_held() || 1139 rcu_read_lock_bh_held() ||
1144 rcu_read_lock_sched_held() || 1140 rcu_read_lock_sched_held() ||
@@ -1149,7 +1145,6 @@ rcu_torture_reader(void *arg)
1149 schedule_timeout_interruptible(HZ); 1145 schedule_timeout_interruptible(HZ);
1150 continue; 1146 continue;
1151 } 1147 }
1152 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);
1153 if (p->rtort_mbtest == 0) 1148 if (p->rtort_mbtest == 0)
1154 atomic_inc(&n_rcu_torture_mberror); 1149 atomic_inc(&n_rcu_torture_mberror);
1155 cur_ops->read_delay(&rand); 1150 cur_ops->read_delay(&rand);
@@ -1159,10 +1154,14 @@ rcu_torture_reader(void *arg)
1159 /* Should not happen, but... */ 1154 /* Should not happen, but... */
1160 pipe_count = RCU_TORTURE_PIPE_LEN; 1155 pipe_count = RCU_TORTURE_PIPE_LEN;
1161 } 1156 }
1162 if (pipe_count > 1) 1157 completed_end = cur_ops->completed();
1158 if (pipe_count > 1) {
1159 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu,
1160 ts, completed, completed_end);
1163 rcutorture_trace_dump(); 1161 rcutorture_trace_dump();
1162 }
1164 __this_cpu_inc(rcu_torture_count[pipe_count]); 1163 __this_cpu_inc(rcu_torture_count[pipe_count]);
1165 completed = cur_ops->completed() - completed; 1164 completed = completed_end - completed;
1166 if (completed > RCU_TORTURE_PIPE_LEN) { 1165 if (completed > RCU_TORTURE_PIPE_LEN) {
1167 /* Should not happen, but... */ 1166 /* Should not happen, but... */
1168 completed = RCU_TORTURE_PIPE_LEN; 1167 completed = RCU_TORTURE_PIPE_LEN;
@@ -1328,19 +1327,35 @@ static void rcu_torture_shuffle_tasks(void)
1328 set_cpus_allowed_ptr(reader_tasks[i], 1327 set_cpus_allowed_ptr(reader_tasks[i],
1329 shuffle_tmp_mask); 1328 shuffle_tmp_mask);
1330 } 1329 }
1331
1332 if (fakewriter_tasks) { 1330 if (fakewriter_tasks) {
1333 for (i = 0; i < nfakewriters; i++) 1331 for (i = 0; i < nfakewriters; i++)
1334 if (fakewriter_tasks[i]) 1332 if (fakewriter_tasks[i])
1335 set_cpus_allowed_ptr(fakewriter_tasks[i], 1333 set_cpus_allowed_ptr(fakewriter_tasks[i],
1336 shuffle_tmp_mask); 1334 shuffle_tmp_mask);
1337 } 1335 }
1338
1339 if (writer_task) 1336 if (writer_task)
1340 set_cpus_allowed_ptr(writer_task, shuffle_tmp_mask); 1337 set_cpus_allowed_ptr(writer_task, shuffle_tmp_mask);
1341
1342 if (stats_task) 1338 if (stats_task)
1343 set_cpus_allowed_ptr(stats_task, shuffle_tmp_mask); 1339 set_cpus_allowed_ptr(stats_task, shuffle_tmp_mask);
1340 if (stutter_task)
1341 set_cpus_allowed_ptr(stutter_task, shuffle_tmp_mask);
1342 if (fqs_task)
1343 set_cpus_allowed_ptr(fqs_task, shuffle_tmp_mask);
1344 if (shutdown_task)
1345 set_cpus_allowed_ptr(shutdown_task, shuffle_tmp_mask);
1346#ifdef CONFIG_HOTPLUG_CPU
1347 if (onoff_task)
1348 set_cpus_allowed_ptr(onoff_task, shuffle_tmp_mask);
1349#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1350 if (stall_task)
1351 set_cpus_allowed_ptr(stall_task, shuffle_tmp_mask);
1352 if (barrier_cbs_tasks)
1353 for (i = 0; i < n_barrier_cbs; i++)
1354 if (barrier_cbs_tasks[i])
1355 set_cpus_allowed_ptr(barrier_cbs_tasks[i],
1356 shuffle_tmp_mask);
1357 if (barrier_task)
1358 set_cpus_allowed_ptr(barrier_task, shuffle_tmp_mask);
1344 1359
1345 if (rcu_idle_cpu == -1) 1360 if (rcu_idle_cpu == -1)
1346 rcu_idle_cpu = num_online_cpus() - 1; 1361 rcu_idle_cpu = num_online_cpus() - 1;
@@ -1396,12 +1411,16 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag)
1396 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " 1411 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d "
1397 "test_boost=%d/%d test_boost_interval=%d " 1412 "test_boost=%d/%d test_boost_interval=%d "
1398 "test_boost_duration=%d shutdown_secs=%d " 1413 "test_boost_duration=%d shutdown_secs=%d "
1414 "stall_cpu=%d stall_cpu_holdoff=%d "
1415 "n_barrier_cbs=%d "
1399 "onoff_interval=%d onoff_holdoff=%d\n", 1416 "onoff_interval=%d onoff_holdoff=%d\n",
1400 torture_type, tag, nrealreaders, nfakewriters, 1417 torture_type, tag, nrealreaders, nfakewriters,
1401 stat_interval, verbose, test_no_idle_hz, shuffle_interval, 1418 stat_interval, verbose, test_no_idle_hz, shuffle_interval,
1402 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, 1419 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
1403 test_boost, cur_ops->can_boost, 1420 test_boost, cur_ops->can_boost,
1404 test_boost_interval, test_boost_duration, shutdown_secs, 1421 test_boost_interval, test_boost_duration, shutdown_secs,
1422 stall_cpu, stall_cpu_holdoff,
1423 n_barrier_cbs,
1405 onoff_interval, onoff_holdoff); 1424 onoff_interval, onoff_holdoff);
1406} 1425}
1407 1426
@@ -1502,6 +1521,7 @@ rcu_torture_onoff(void *arg)
1502 unsigned long delta; 1521 unsigned long delta;
1503 int maxcpu = -1; 1522 int maxcpu = -1;
1504 DEFINE_RCU_RANDOM(rand); 1523 DEFINE_RCU_RANDOM(rand);
1524 int ret;
1505 unsigned long starttime; 1525 unsigned long starttime;
1506 1526
1507 VERBOSE_PRINTK_STRING("rcu_torture_onoff task started"); 1527 VERBOSE_PRINTK_STRING("rcu_torture_onoff task started");
@@ -1522,7 +1542,13 @@ rcu_torture_onoff(void *arg)
1522 torture_type, cpu); 1542 torture_type, cpu);
1523 starttime = jiffies; 1543 starttime = jiffies;
1524 n_offline_attempts++; 1544 n_offline_attempts++;
1525 if (cpu_down(cpu) == 0) { 1545 ret = cpu_down(cpu);
1546 if (ret) {
1547 if (verbose)
1548 pr_alert("%s" TORTURE_FLAG
1549 "rcu_torture_onoff task: offline %d failed: errno %d\n",
1550 torture_type, cpu, ret);
1551 } else {
1526 if (verbose) 1552 if (verbose)
1527 pr_alert("%s" TORTURE_FLAG 1553 pr_alert("%s" TORTURE_FLAG
1528 "rcu_torture_onoff task: offlined %d\n", 1554 "rcu_torture_onoff task: offlined %d\n",
@@ -1765,7 +1791,7 @@ static int rcu_torture_barrier_init(void)
1765 barrier_cbs_wq = 1791 barrier_cbs_wq =
1766 kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]), 1792 kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]),
1767 GFP_KERNEL); 1793 GFP_KERNEL);
1768 if (barrier_cbs_tasks == NULL || barrier_cbs_wq == 0) 1794 if (barrier_cbs_tasks == NULL || !barrier_cbs_wq)
1769 return -ENOMEM; 1795 return -ENOMEM;
1770 for (i = 0; i < n_barrier_cbs; i++) { 1796 for (i = 0; i < n_barrier_cbs; i++) {
1771 init_waitqueue_head(&barrier_cbs_wq[i]); 1797 init_waitqueue_head(&barrier_cbs_wq[i]);
@@ -1936,8 +1962,6 @@ rcu_torture_cleanup(void)
1936 1962
1937 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ 1963 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
1938 1964
1939 if (cur_ops->cleanup)
1940 cur_ops->cleanup();
1941 if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error) 1965 if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error)
1942 rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); 1966 rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE");
1943 else if (n_online_successes != n_online_attempts || 1967 else if (n_online_successes != n_online_attempts ||
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 74df86bd9204..5b8ad827fd86 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -68,9 +68,9 @@ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
68 .level = { &sname##_state.node[0] }, \ 68 .level = { &sname##_state.node[0] }, \
69 .call = cr, \ 69 .call = cr, \
70 .fqs_state = RCU_GP_IDLE, \ 70 .fqs_state = RCU_GP_IDLE, \
71 .gpnum = -300, \ 71 .gpnum = 0UL - 300UL, \
72 .completed = -300, \ 72 .completed = 0UL - 300UL, \
73 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.onofflock), \ 73 .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \
74 .orphan_nxttail = &sname##_state.orphan_nxtlist, \ 74 .orphan_nxttail = &sname##_state.orphan_nxtlist, \
75 .orphan_donetail = &sname##_state.orphan_donelist, \ 75 .orphan_donetail = &sname##_state.orphan_donelist, \
76 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ 76 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
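
[Editor's note] Writing the initializers as 0UL - 300UL keeps ->gpnum and ->completed firmly in unsigned arithmetic, which is what the wrap-safe ULONG_CMP_*() comparisons used throughout this file rely on; starting 300 steps short of wraparound also makes wrap bugs surface early. A minimal userspace sketch of the idiom, with the macros shown roughly as the RCU headers define them:

#include <assert.h>
#include <limits.h>

#define ULONG_CMP_GE(a, b)      (ULONG_MAX / 2 >= (a) - (b))
#define ULONG_CMP_LT(a, b)      (ULONG_MAX / 2 < (a) - (b))

int main(void)
{
        unsigned long completed = 0UL - 300UL;  /* same idiom as the initializer:
                                                   300 steps short of wraparound */
        unsigned long gpnum = completed + 1;

        assert(ULONG_CMP_GE(gpnum, completed)); /* "at least as recent", despite wrap */
        assert(ULONG_CMP_LT(completed, gpnum)); /* "strictly older", despite wrap */
        return 0;
}
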
@@ -105,7 +105,7 @@ int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
105 * The rcu_scheduler_active variable transitions from zero to one just 105 * The rcu_scheduler_active variable transitions from zero to one just
106 * before the first task is spawned. So when this variable is zero, RCU 106 * before the first task is spawned. So when this variable is zero, RCU
107 * can assume that there is but one task, allowing RCU to (for example) 107 * can assume that there is but one task, allowing RCU to (for example)
108 * optimized synchronize_sched() to a simple barrier(). When this variable 108 * optimize synchronize_sched() to a simple barrier(). When this variable
109 * is one, RCU must actually do all the hard work required to detect real 109 * is one, RCU must actually do all the hard work required to detect real
110 * grace periods. This variable is also used to suppress boot-time false 110 * grace periods. This variable is also used to suppress boot-time false
111 * positives from lockdep-RCU error checking. 111 * positives from lockdep-RCU error checking.
@@ -207,24 +207,15 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch);
207DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { 207DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
208 .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, 208 .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
209 .dynticks = ATOMIC_INIT(1), 209 .dynticks = ATOMIC_INIT(1),
210#if defined(CONFIG_RCU_USER_QS) && !defined(CONFIG_RCU_USER_QS_FORCE)
211 .ignore_user_qs = true,
212#endif
213}; 210};
214 211
215static int blimit = 10; /* Maximum callbacks per rcu_do_batch. */ 212static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */
216static int qhimark = 10000; /* If this many pending, ignore blimit. */ 213static long qhimark = 10000; /* If this many pending, ignore blimit. */
217static int qlowmark = 100; /* Once only this many pending, use blimit. */ 214static long qlowmark = 100; /* Once only this many pending, use blimit. */
218 215
219module_param(blimit, int, 0444); 216module_param(blimit, long, 0444);
220module_param(qhimark, int, 0444); 217module_param(qhimark, long, 0444);
221module_param(qlowmark, int, 0444); 218module_param(qlowmark, long, 0444);
222
223int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
224int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
225
226module_param(rcu_cpu_stall_suppress, int, 0644);
227module_param(rcu_cpu_stall_timeout, int, 0644);
228 219
229static ulong jiffies_till_first_fqs = RCU_JIFFIES_TILL_FORCE_QS; 220static ulong jiffies_till_first_fqs = RCU_JIFFIES_TILL_FORCE_QS;
230static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS; 221static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS;
@@ -303,18 +294,32 @@ EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state);
303static int 294static int
304cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) 295cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
305{ 296{
306 return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]; 297 return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL] &&
298 rdp->nxttail[RCU_DONE_TAIL] != NULL;
307} 299}
308 300
309/* 301/*
310 * Does the current CPU require a yet-as-unscheduled grace period? 302 * Does the current CPU require a not-yet-started grace period?
303 * The caller must have disabled interrupts to prevent races with
304 * normal callback registry.
311 */ 305 */
312static int 306static int
313cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) 307cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
314{ 308{
315 return *rdp->nxttail[RCU_DONE_TAIL + 309 int i;
316 ACCESS_ONCE(rsp->completed) != rdp->completed] && 310
317 !rcu_gp_in_progress(rsp); 311 if (rcu_gp_in_progress(rsp))
312 return 0; /* No, a grace period is already in progress. */
313 if (!rdp->nxttail[RCU_NEXT_TAIL])
314 return 0; /* No, this is a no-CBs (or offline) CPU. */
315 if (*rdp->nxttail[RCU_NEXT_READY_TAIL])
316 return 1; /* Yes, this CPU has newly registered callbacks. */
317 for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++)
318 if (rdp->nxttail[i - 1] != rdp->nxttail[i] &&
319 ULONG_CMP_LT(ACCESS_ONCE(rsp->completed),
320 rdp->nxtcompleted[i]))
321 return 1; /* Yes, CBs for future grace period. */
322 return 0; /* No grace period needed. */
318} 323}
319 324
320/* 325/*
@@ -335,7 +340,7 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
335static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, 340static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
336 bool user) 341 bool user)
337{ 342{
338 trace_rcu_dyntick("Start", oldval, 0); 343 trace_rcu_dyntick("Start", oldval, rdtp->dynticks_nesting);
339 if (!user && !is_idle_task(current)) { 344 if (!user && !is_idle_task(current)) {
340 struct task_struct *idle = idle_task(smp_processor_id()); 345 struct task_struct *idle = idle_task(smp_processor_id());
341 346
@@ -416,29 +421,7 @@ EXPORT_SYMBOL_GPL(rcu_idle_enter);
416 */ 421 */
417void rcu_user_enter(void) 422void rcu_user_enter(void)
418{ 423{
419 unsigned long flags; 424 rcu_eqs_enter(1);
420 struct rcu_dynticks *rdtp;
421
422 /*
423 * Some contexts may involve an exception occuring in an irq,
424 * leading to that nesting:
425 * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit()
426 * This would mess up the dyntick_nesting count though. And rcu_irq_*()
427 * helpers are enough to protect RCU uses inside the exception. So
428 * just return immediately if we detect we are in an IRQ.
429 */
430 if (in_interrupt())
431 return;
432
433 WARN_ON_ONCE(!current->mm);
434
435 local_irq_save(flags);
436 rdtp = &__get_cpu_var(rcu_dynticks);
437 if (!rdtp->ignore_user_qs && !rdtp->in_user) {
438 rdtp->in_user = true;
439 rcu_eqs_enter(true);
440 }
441 local_irq_restore(flags);
442} 425}
443 426
444/** 427/**
@@ -575,27 +558,7 @@ EXPORT_SYMBOL_GPL(rcu_idle_exit);
575 */ 558 */
576void rcu_user_exit(void) 559void rcu_user_exit(void)
577{ 560{
578 unsigned long flags; 561 rcu_eqs_exit(1);
579 struct rcu_dynticks *rdtp;
580
581 /*
582 * Some contexts may involve an exception occuring in an irq,
583 * leading to that nesting:
584 * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit()
585 * This would mess up the dyntick_nesting count though. And rcu_irq_*()
586 * helpers are enough to protect RCU uses inside the exception. So
587 * just return immediately if we detect we are in an IRQ.
588 */
589 if (in_interrupt())
590 return;
591
592 local_irq_save(flags);
593 rdtp = &__get_cpu_var(rcu_dynticks);
594 if (rdtp->in_user) {
595 rdtp->in_user = false;
596 rcu_eqs_exit(true);
597 }
598 local_irq_restore(flags);
599} 562}
600 563
601/** 564/**
@@ -718,21 +681,6 @@ int rcu_is_cpu_idle(void)
718} 681}
719EXPORT_SYMBOL(rcu_is_cpu_idle); 682EXPORT_SYMBOL(rcu_is_cpu_idle);
720 683
721#ifdef CONFIG_RCU_USER_QS
722void rcu_user_hooks_switch(struct task_struct *prev,
723 struct task_struct *next)
724{
725 struct rcu_dynticks *rdtp;
726
727 /* Interrupts are disabled in context switch */
728 rdtp = &__get_cpu_var(rcu_dynticks);
729 if (!rdtp->ignore_user_qs) {
730 clear_tsk_thread_flag(prev, TIF_NOHZ);
731 set_tsk_thread_flag(next, TIF_NOHZ);
732 }
733}
734#endif /* #ifdef CONFIG_RCU_USER_QS */
735
736#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) 684#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)
737 685
738/* 686/*
@@ -783,7 +731,7 @@ EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
783 * interrupt from idle, return true. The caller must have at least 731 * interrupt from idle, return true. The caller must have at least
784 * disabled preemption. 732 * disabled preemption.
785 */ 733 */
786int rcu_is_cpu_rrupt_from_idle(void) 734static int rcu_is_cpu_rrupt_from_idle(void)
787{ 735{
788 return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1; 736 return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1;
789} 737}
@@ -849,28 +797,33 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
849 return 0; 797 return 0;
850} 798}
851 799
852static int jiffies_till_stall_check(void) 800static void record_gp_stall_check_time(struct rcu_state *rsp)
853{ 801{
854 int till_stall_check = ACCESS_ONCE(rcu_cpu_stall_timeout); 802 rsp->gp_start = jiffies;
855 803 rsp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check();
856 /*
857 * Limit check must be consistent with the Kconfig limits
858 * for CONFIG_RCU_CPU_STALL_TIMEOUT.
859 */
860 if (till_stall_check < 3) {
861 ACCESS_ONCE(rcu_cpu_stall_timeout) = 3;
862 till_stall_check = 3;
863 } else if (till_stall_check > 300) {
864 ACCESS_ONCE(rcu_cpu_stall_timeout) = 300;
865 till_stall_check = 300;
866 }
867 return till_stall_check * HZ + RCU_STALL_DELAY_DELTA;
868} 804}
869 805
870static void record_gp_stall_check_time(struct rcu_state *rsp) 806/*
807 * Dump stacks of all tasks running on stalled CPUs. This is a fallback
808 * for architectures that do not implement trigger_all_cpu_backtrace().
809 * The NMI-triggered stack traces are more accurate because they are
810 * printed by the target CPU.
811 */
812static void rcu_dump_cpu_stacks(struct rcu_state *rsp)
871{ 813{
872 rsp->gp_start = jiffies; 814 int cpu;
873 rsp->jiffies_stall = jiffies + jiffies_till_stall_check(); 815 unsigned long flags;
816 struct rcu_node *rnp;
817
818 rcu_for_each_leaf_node(rsp, rnp) {
819 raw_spin_lock_irqsave(&rnp->lock, flags);
820 if (rnp->qsmask != 0) {
821 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
822 if (rnp->qsmask & (1UL << cpu))
823 dump_cpu_task(rnp->grplo + cpu);
824 }
825 raw_spin_unlock_irqrestore(&rnp->lock, flags);
826 }
874} 827}
875 828
876static void print_other_cpu_stall(struct rcu_state *rsp) 829static void print_other_cpu_stall(struct rcu_state *rsp)
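
[Editor's note] The new rcu_dump_cpu_stacks() walks each leaf rcu_node and dumps one stack per CPU whose bit is still set in ->qsmask, i.e. per CPU that has not yet reported a quiescent state for the stalled grace period. The bit-to-CPU mapping is the only subtle part; a standalone sketch (printf standing in for dump_cpu_task(), made-up mask and CPU range):

#include <stdio.h>

int main(void)
{
        unsigned long qsmask = 0x15;    /* illustrative: CPUs at offsets 0, 2, 4 */
        int grplo = 8, grphi = 15;      /* illustrative leaf-node CPU range */
        int cpu;

        for (cpu = 0; cpu <= grphi - grplo; cpu++)
                if (qsmask & (1UL << cpu))
                        printf("would dump stack of CPU %d\n", grplo + cpu);
        return 0;
}
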
@@ -880,6 +833,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
880 unsigned long flags; 833 unsigned long flags;
881 int ndetected = 0; 834 int ndetected = 0;
882 struct rcu_node *rnp = rcu_get_root(rsp); 835 struct rcu_node *rnp = rcu_get_root(rsp);
836 long totqlen = 0;
883 837
884 /* Only let one CPU complain about others per time interval. */ 838 /* Only let one CPU complain about others per time interval. */
885 839
@@ -889,7 +843,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
889 raw_spin_unlock_irqrestore(&rnp->lock, flags); 843 raw_spin_unlock_irqrestore(&rnp->lock, flags);
890 return; 844 return;
891 } 845 }
892 rsp->jiffies_stall = jiffies + 3 * jiffies_till_stall_check() + 3; 846 rsp->jiffies_stall = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
893 raw_spin_unlock_irqrestore(&rnp->lock, flags); 847 raw_spin_unlock_irqrestore(&rnp->lock, flags);
894 848
895 /* 849 /*
@@ -924,12 +878,15 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
924 raw_spin_unlock_irqrestore(&rnp->lock, flags); 878 raw_spin_unlock_irqrestore(&rnp->lock, flags);
925 879
926 print_cpu_stall_info_end(); 880 print_cpu_stall_info_end();
927 printk(KERN_CONT "(detected by %d, t=%ld jiffies)\n", 881 for_each_possible_cpu(cpu)
928 smp_processor_id(), (long)(jiffies - rsp->gp_start)); 882 totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen;
883 pr_cont("(detected by %d, t=%ld jiffies, g=%lu, c=%lu, q=%lu)\n",
884 smp_processor_id(), (long)(jiffies - rsp->gp_start),
885 rsp->gpnum, rsp->completed, totqlen);
929 if (ndetected == 0) 886 if (ndetected == 0)
930 printk(KERN_ERR "INFO: Stall ended before state dump start\n"); 887 printk(KERN_ERR "INFO: Stall ended before state dump start\n");
931 else if (!trigger_all_cpu_backtrace()) 888 else if (!trigger_all_cpu_backtrace())
932 dump_stack(); 889 rcu_dump_cpu_stacks(rsp);
933 890
934 /* Complain about tasks blocking the grace period. */ 891 /* Complain about tasks blocking the grace period. */
935 892
@@ -940,8 +897,10 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
940 897
941static void print_cpu_stall(struct rcu_state *rsp) 898static void print_cpu_stall(struct rcu_state *rsp)
942{ 899{
900 int cpu;
943 unsigned long flags; 901 unsigned long flags;
944 struct rcu_node *rnp = rcu_get_root(rsp); 902 struct rcu_node *rnp = rcu_get_root(rsp);
903 long totqlen = 0;
945 904
946 /* 905 /*
947 * OK, time to rat on ourselves... 906 * OK, time to rat on ourselves...
@@ -952,14 +911,17 @@ static void print_cpu_stall(struct rcu_state *rsp)
952 print_cpu_stall_info_begin(); 911 print_cpu_stall_info_begin();
953 print_cpu_stall_info(rsp, smp_processor_id()); 912 print_cpu_stall_info(rsp, smp_processor_id());
954 print_cpu_stall_info_end(); 913 print_cpu_stall_info_end();
955 printk(KERN_CONT " (t=%lu jiffies)\n", jiffies - rsp->gp_start); 914 for_each_possible_cpu(cpu)
915 totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen;
916 pr_cont(" (t=%lu jiffies g=%lu c=%lu q=%lu)\n",
917 jiffies - rsp->gp_start, rsp->gpnum, rsp->completed, totqlen);
956 if (!trigger_all_cpu_backtrace()) 918 if (!trigger_all_cpu_backtrace())
957 dump_stack(); 919 dump_stack();
958 920
959 raw_spin_lock_irqsave(&rnp->lock, flags); 921 raw_spin_lock_irqsave(&rnp->lock, flags);
960 if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall)) 922 if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall))
961 rsp->jiffies_stall = jiffies + 923 rsp->jiffies_stall = jiffies +
962 3 * jiffies_till_stall_check() + 3; 924 3 * rcu_jiffies_till_stall_check() + 3;
963 raw_spin_unlock_irqrestore(&rnp->lock, flags); 925 raw_spin_unlock_irqrestore(&rnp->lock, flags);
964 926
965 set_need_resched(); /* kick ourselves to get things going. */ 927 set_need_resched(); /* kick ourselves to get things going. */
@@ -990,12 +952,6 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
990 } 952 }
991} 953}
992 954
993static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
994{
995 rcu_cpu_stall_suppress = 1;
996 return NOTIFY_DONE;
997}
998
999/** 955/**
1000 * rcu_cpu_stall_reset - prevent further stall warnings in current grace period 956 * rcu_cpu_stall_reset - prevent further stall warnings in current grace period
1001 * 957 *
@@ -1013,15 +969,6 @@ void rcu_cpu_stall_reset(void)
1013 rsp->jiffies_stall = jiffies + ULONG_MAX / 2; 969 rsp->jiffies_stall = jiffies + ULONG_MAX / 2;
1014} 970}
1015 971
1016static struct notifier_block rcu_panic_block = {
1017 .notifier_call = rcu_panic,
1018};
1019
1020static void __init check_cpu_stall_init(void)
1021{
1022 atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block);
1023}
1024
1025/* 972/*
1026 * Update CPU-local rcu_data state to record the newly noticed grace period. 973 * Update CPU-local rcu_data state to record the newly noticed grace period.
1027 * This is used both when we started the grace period and when we notice 974 * This is used both when we started the grace period and when we notice
@@ -1091,6 +1038,146 @@ static void init_callback_list(struct rcu_data *rdp)
1091 rdp->nxtlist = NULL; 1038 rdp->nxtlist = NULL;
1092 for (i = 0; i < RCU_NEXT_SIZE; i++) 1039 for (i = 0; i < RCU_NEXT_SIZE; i++)
1093 rdp->nxttail[i] = &rdp->nxtlist; 1040 rdp->nxttail[i] = &rdp->nxtlist;
1041 init_nocb_callback_list(rdp);
1042}
1043
1044/*
1045 * Determine the value that ->completed will have at the end of the
1046 * next subsequent grace period. This is used to tag callbacks so that
1047 * a CPU can invoke callbacks in a timely fashion even if that CPU has
1048 * been dyntick-idle for an extended period with callbacks under the
1049 * influence of RCU_FAST_NO_HZ.
1050 *
1051 * The caller must hold rnp->lock with interrupts disabled.
1052 */
1053static unsigned long rcu_cbs_completed(struct rcu_state *rsp,
1054 struct rcu_node *rnp)
1055{
1056 /*
1057 * If RCU is idle, we just wait for the next grace period.
1058 * But we can only be sure that RCU is idle if we are looking
1059 * at the root rcu_node structure -- otherwise, a new grace
1060 * period might have started, but just not yet gotten around
1061 * to initializing the current non-root rcu_node structure.
1062 */
1063 if (rcu_get_root(rsp) == rnp && rnp->gpnum == rnp->completed)
1064 return rnp->completed + 1;
1065
1066 /*
1067 * Otherwise, wait for a possible partial grace period and
1068 * then the subsequent full grace period.
1069 */
1070 return rnp->completed + 2;
1071}
1072
1073/*
1074 * If there is room, assign a ->completed number to any callbacks on
1075 * this CPU that have not already been assigned. Also accelerate any
1076 * callbacks that were previously assigned a ->completed number that has
1077 * since proven to be too conservative, which can happen if callbacks get
1078 * assigned a ->completed number while RCU is idle, but with reference to
1079 * a non-root rcu_node structure. This function is idempotent, so it does
1080 * not hurt to call it repeatedly.
1081 *
1082 * The caller must hold rnp->lock with interrupts disabled.
1083 */
1084static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1085 struct rcu_data *rdp)
1086{
1087 unsigned long c;
1088 int i;
1089
1090 /* If the CPU has no callbacks, nothing to do. */
1091 if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL])
1092 return;
1093
1094 /*
1095 * Starting from the sublist containing the callbacks most
1096 * recently assigned a ->completed number and working down, find the
1097 * first sublist that is not assignable to an upcoming grace period.
1098 * Such a sublist has something in it (first two tests) and has
1099 * a ->completed number assigned that will complete sooner than
1100 * the ->completed number for newly arrived callbacks (last test).
1101 *
1102 * The key point is that any later sublist can be assigned the
1103 * same ->completed number as the newly arrived callbacks, which
1104 * means that the callbacks in any of these later sublist can be
1105 * grouped into a single sublist, whether or not they have already
1106 * been assigned a ->completed number.
1107 */
1108 c = rcu_cbs_completed(rsp, rnp);
1109 for (i = RCU_NEXT_TAIL - 1; i > RCU_DONE_TAIL; i--)
1110 if (rdp->nxttail[i] != rdp->nxttail[i - 1] &&
1111 !ULONG_CMP_GE(rdp->nxtcompleted[i], c))
1112 break;
1113
1114 /*
1115 * If there are no sublist for unassigned callbacks, leave.
1116 * At the same time, advance "i" one sublist, so that "i" will
1117 * index into the sublist where all the remaining callbacks should
1118 * be grouped into.
1119 */
1120 if (++i >= RCU_NEXT_TAIL)
1121 return;
1122
1123 /*
1124 * Assign all subsequent callbacks' ->completed number to the next
1125 * full grace period and group them all in the sublist initially
1126 * indexed by "i".
1127 */
1128 for (; i <= RCU_NEXT_TAIL; i++) {
1129 rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL];
1130 rdp->nxtcompleted[i] = c;
1131 }
1132
1133 /* Trace depending on how much we were able to accelerate. */
1134 if (!*rdp->nxttail[RCU_WAIT_TAIL])
1135 trace_rcu_grace_period(rsp->name, rdp->gpnum, "AccWaitCB");
1136 else
1137 trace_rcu_grace_period(rsp->name, rdp->gpnum, "AccReadyCB");
1138}
1139
1140/*
1141 * Move any callbacks whose grace period has completed to the
1142 * RCU_DONE_TAIL sublist, then compact the remaining sublists and
1143 * assign ->completed numbers to any callbacks in the RCU_NEXT_TAIL
1144 * sublist. This function is idempotent, so it does not hurt to
1145 * invoke it repeatedly. As long as it is not invoked -too- often...
1146 *
1147 * The caller must hold rnp->lock with interrupts disabled.
1148 */
1149static void rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1150 struct rcu_data *rdp)
1151{
1152 int i, j;
1153
1154 /* If the CPU has no callbacks, nothing to do. */
1155 if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL])
1156 return;
1157
1158 /*
1159 * Find all callbacks whose ->completed numbers indicate that they
1160 * are ready to invoke, and put them into the RCU_DONE_TAIL sublist.
1161 */
1162 for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) {
1163 if (ULONG_CMP_LT(rnp->completed, rdp->nxtcompleted[i]))
1164 break;
1165 rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[i];
1166 }
1167 /* Clean up any sublist tail pointers that were misordered above. */
1168 for (j = RCU_WAIT_TAIL; j < i; j++)
1169 rdp->nxttail[j] = rdp->nxttail[RCU_DONE_TAIL];
1170
1171 /* Copy down callbacks to fill in empty sublists. */
1172 for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) {
1173 if (rdp->nxttail[j] == rdp->nxttail[RCU_NEXT_TAIL])
1174 break;
1175 rdp->nxttail[j] = rdp->nxttail[i];
1176 rdp->nxtcompleted[j] = rdp->nxtcompleted[i];
1177 }
1178
1179 /* Classify any remaining callbacks. */
1180 rcu_accelerate_cbs(rsp, rnp, rdp);
1094} 1181}
1095 1182
1096/* 1183/*
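
[Editor's note] rcu_cbs_completed() above answers one question: which ->completed value will certainly cover callbacks queued right now? Only at the root rcu_node, with no grace period in flight, is the very next grace period (completed + 1) guaranteed to suffice; anywhere else the code must allow for a grace period that may already be partially initialized, hence completed + 2. A simplified stand-alone sketch of just that decision (the struct is a stand-in, not the kernel's rcu_node):

#include <stdio.h>

/* Simplified stand-in for the relevant rcu_node fields. */
struct node {
        unsigned long gpnum;
        unsigned long completed;
        int is_root;
};

static unsigned long cbs_completed(const struct node *np)
{
        if (np->is_root && np->gpnum == np->completed)
                return np->completed + 1;       /* RCU idle: the next GP suffices */
        return np->completed + 2;               /* allow for a partial GP, then a full one */
}

int main(void)
{
        struct node idle_root = { .gpnum = 100, .completed = 100, .is_root = 1 };
        struct node busy_leaf = { .gpnum = 101, .completed = 100, .is_root = 0 };

        printf("idle root: tag new callbacks with completed == %lu\n",
               cbs_completed(&idle_root));
        printf("non-root (or busy): tag new callbacks with completed == %lu\n",
               cbs_completed(&busy_leaf));
        return 0;
}
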
@@ -1103,12 +1190,15 @@ static void
1103__rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) 1190__rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
1104{ 1191{
1105 /* Did another grace period end? */ 1192 /* Did another grace period end? */
1106 if (rdp->completed != rnp->completed) { 1193 if (rdp->completed == rnp->completed) {
1194
1195 /* No, so just accelerate recent callbacks. */
1196 rcu_accelerate_cbs(rsp, rnp, rdp);
1197
1198 } else {
1107 1199
1108 /* Advance callbacks. No harm if list empty. */ 1200 /* Advance callbacks. */
1109 rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL]; 1201 rcu_advance_cbs(rsp, rnp, rdp);
1110 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL];
1111 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
1112 1202
1113 /* Remember that we saw this grace-period completion. */ 1203 /* Remember that we saw this grace-period completion. */
1114 rdp->completed = rnp->completed; 1204 rdp->completed = rnp->completed;
@@ -1404,15 +1494,30 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
1404 !cpu_needs_another_gp(rsp, rdp)) { 1494 !cpu_needs_another_gp(rsp, rdp)) {
1405 /* 1495 /*
1406 * Either we have not yet spawned the grace-period 1496 * Either we have not yet spawned the grace-period
1407 * task or this CPU does not need another grace period. 1497 * task, this CPU does not need another grace period,
1498 * or a grace period is already in progress.
1408 * Either way, don't start a new grace period. 1499 * Either way, don't start a new grace period.
1409 */ 1500 */
1410 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1501 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1411 return; 1502 return;
1412 } 1503 }
1413 1504
1505 /*
1506 * Because there is no grace period in progress right now,
1507 * any callbacks we have up to this point will be satisfied
1508 * by the next grace period. So this is a good place to
1509 * assign a grace period number to recently posted callbacks.
1510 */
1511 rcu_accelerate_cbs(rsp, rnp, rdp);
1512
1414 rsp->gp_flags = RCU_GP_FLAG_INIT; 1513 rsp->gp_flags = RCU_GP_FLAG_INIT;
1415 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1514 raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */
1515
1516 /* Ensure that CPU is aware of completion of last grace period. */
1517 rcu_process_gp_end(rsp, rdp);
1518 local_irq_restore(flags);
1519
1520 /* Wake up rcu_gp_kthread() to start the grace period. */
1416 wake_up(&rsp->gp_wq); 1521 wake_up(&rsp->gp_wq);
1417} 1522}
1418 1523
@@ -1528,7 +1633,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
1528 * This GP can't end until cpu checks in, so all of our 1633 * This GP can't end until cpu checks in, so all of our
1529 * callbacks can be processed during the next GP. 1634 * callbacks can be processed during the next GP.
1530 */ 1635 */
1531 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; 1636 rcu_accelerate_cbs(rsp, rnp, rdp);
1532 1637
1533 rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */ 1638 rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */
1534 } 1639 }
@@ -1573,16 +1678,20 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
1573/* 1678/*
1574 * Send the specified CPU's RCU callbacks to the orphanage. The 1679 * Send the specified CPU's RCU callbacks to the orphanage. The
1575 * specified CPU must be offline, and the caller must hold the 1680 * specified CPU must be offline, and the caller must hold the
1576 * ->onofflock. 1681 * ->orphan_lock.
1577 */ 1682 */
1578static void 1683static void
1579rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, 1684rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
1580 struct rcu_node *rnp, struct rcu_data *rdp) 1685 struct rcu_node *rnp, struct rcu_data *rdp)
1581{ 1686{
1687 /* No-CBs CPUs do not have orphanable callbacks. */
1688 if (is_nocb_cpu(rdp->cpu))
1689 return;
1690
1582 /* 1691 /*
1583 * Orphan the callbacks. First adjust the counts. This is safe 1692 * Orphan the callbacks. First adjust the counts. This is safe
1584 * because ->onofflock excludes _rcu_barrier()'s adoption of 1693 * because _rcu_barrier() excludes CPU-hotplug operations, so it
1585 * the callbacks, thus no memory barrier is required. 1694 * cannot be running now. Thus no memory barrier is required.
1586 */ 1695 */
1587 if (rdp->nxtlist != NULL) { 1696 if (rdp->nxtlist != NULL) {
1588 rsp->qlen_lazy += rdp->qlen_lazy; 1697 rsp->qlen_lazy += rdp->qlen_lazy;
@@ -1623,13 +1732,17 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
1623 1732
1624/* 1733/*
1625 * Adopt the RCU callbacks from the specified rcu_state structure's 1734 * Adopt the RCU callbacks from the specified rcu_state structure's
1626 * orphanage. The caller must hold the ->onofflock. 1735 * orphanage. The caller must hold the ->orphan_lock.
1627 */ 1736 */
1628static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) 1737static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
1629{ 1738{
1630 int i; 1739 int i;
1631 struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); 1740 struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
1632 1741
1742 /* No-CBs CPUs are handled specially. */
1743 if (rcu_nocb_adopt_orphan_cbs(rsp, rdp))
1744 return;
1745
1633 /* Do the accounting first. */ 1746 /* Do the accounting first. */
1634 rdp->qlen_lazy += rsp->qlen_lazy; 1747 rdp->qlen_lazy += rsp->qlen_lazy;
1635 rdp->qlen += rsp->qlen; 1748 rdp->qlen += rsp->qlen;
@@ -1702,7 +1815,7 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
1702 1815
1703 /* Exclude any attempts to start a new grace period. */ 1816 /* Exclude any attempts to start a new grace period. */
1704 mutex_lock(&rsp->onoff_mutex); 1817 mutex_lock(&rsp->onoff_mutex);
1705 raw_spin_lock_irqsave(&rsp->onofflock, flags); 1818 raw_spin_lock_irqsave(&rsp->orphan_lock, flags);
1706 1819
1707 /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ 1820 /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */
1708 rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); 1821 rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp);
@@ -1729,10 +1842,10 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
1729 /* 1842 /*
1730 * We still hold the leaf rcu_node structure lock here, and 1843 * We still hold the leaf rcu_node structure lock here, and
1731 * irqs are still disabled. The reason for this subterfuge is 1844 * irqs are still disabled. The reason for this subterfuge is
1732 * because invoking rcu_report_unblock_qs_rnp() with ->onofflock 1845 * because invoking rcu_report_unblock_qs_rnp() with ->orphan_lock
1733 * held leads to deadlock. 1846 * held leads to deadlock.
1734 */ 1847 */
1735 raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ 1848 raw_spin_unlock(&rsp->orphan_lock); /* irqs remain disabled. */
1736 rnp = rdp->mynode; 1849 rnp = rdp->mynode;
1737 if (need_report & RCU_OFL_TASKS_NORM_GP) 1850 if (need_report & RCU_OFL_TASKS_NORM_GP)
1738 rcu_report_unblock_qs_rnp(rnp, flags); 1851 rcu_report_unblock_qs_rnp(rnp, flags);
@@ -1769,9 +1882,10 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1769{ 1882{
1770 unsigned long flags; 1883 unsigned long flags;
1771 struct rcu_head *next, *list, **tail; 1884 struct rcu_head *next, *list, **tail;
1772 int bl, count, count_lazy, i; 1885 long bl, count, count_lazy;
1886 int i;
1773 1887
1774 /* If no callbacks are ready, just return.*/ 1888 /* If no callbacks are ready, just return. */
1775 if (!cpu_has_callbacks_ready_to_invoke(rdp)) { 1889 if (!cpu_has_callbacks_ready_to_invoke(rdp)) {
1776 trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0); 1890 trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0);
1777 trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist), 1891 trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist),
@@ -2000,19 +2114,19 @@ __rcu_process_callbacks(struct rcu_state *rsp)
2000 2114
2001 WARN_ON_ONCE(rdp->beenonline == 0); 2115 WARN_ON_ONCE(rdp->beenonline == 0);
2002 2116
2003 /* 2117 /* Handle the end of a grace period that some other CPU ended. */
2004 * Advance callbacks in response to end of earlier grace
2005 * period that some other CPU ended.
2006 */
2007 rcu_process_gp_end(rsp, rdp); 2118 rcu_process_gp_end(rsp, rdp);
2008 2119
2009 /* Update RCU state based on any recent quiescent states. */ 2120 /* Update RCU state based on any recent quiescent states. */
2010 rcu_check_quiescent_state(rsp, rdp); 2121 rcu_check_quiescent_state(rsp, rdp);
2011 2122
2012 /* Does this CPU require a not-yet-started grace period? */ 2123 /* Does this CPU require a not-yet-started grace period? */
2124 local_irq_save(flags);
2013 if (cpu_needs_another_gp(rsp, rdp)) { 2125 if (cpu_needs_another_gp(rsp, rdp)) {
2014 raw_spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags); 2126 raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */
2015 rcu_start_gp(rsp, flags); /* releases above lock */ 2127 rcu_start_gp(rsp, flags); /* releases above lock */
2128 } else {
2129 local_irq_restore(flags);
2016 } 2130 }
2017 2131
2018 /* If there are callbacks ready, invoke them. */ 2132 /* If there are callbacks ready, invoke them. */
@@ -2107,9 +2221,15 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
2107 } 2221 }
2108} 2222}
2109 2223
2224/*
2225 * Helper function for call_rcu() and friends. The cpu argument will
2226 * normally be -1, indicating "currently running CPU". It may specify
2227 * a CPU only if that CPU is a no-CBs CPU. Currently, only _rcu_barrier()
2228 * is expected to specify a CPU.
2229 */
2110static void 2230static void
2111__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), 2231__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
2112 struct rcu_state *rsp, bool lazy) 2232 struct rcu_state *rsp, int cpu, bool lazy)
2113{ 2233{
2114 unsigned long flags; 2234 unsigned long flags;
2115 struct rcu_data *rdp; 2235 struct rcu_data *rdp;
@@ -2129,9 +2249,14 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
2129 rdp = this_cpu_ptr(rsp->rda); 2249 rdp = this_cpu_ptr(rsp->rda);
2130 2250
2131 /* Add the callback to our list. */ 2251 /* Add the callback to our list. */
2132 if (unlikely(rdp->nxttail[RCU_NEXT_TAIL] == NULL)) { 2252 if (unlikely(rdp->nxttail[RCU_NEXT_TAIL] == NULL) || cpu != -1) {
2253 int offline;
2254
2255 if (cpu != -1)
2256 rdp = per_cpu_ptr(rsp->rda, cpu);
2257 offline = !__call_rcu_nocb(rdp, head, lazy);
2258 WARN_ON_ONCE(offline);
2133 /* _call_rcu() is illegal on offline CPU; leak the callback. */ 2259 /* _call_rcu() is illegal on offline CPU; leak the callback. */
2134 WARN_ON_ONCE(1);
2135 local_irq_restore(flags); 2260 local_irq_restore(flags);
2136 return; 2261 return;
2137 } 2262 }
@@ -2160,7 +2285,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
2160 */ 2285 */
2161void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 2286void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
2162{ 2287{
2163 __call_rcu(head, func, &rcu_sched_state, 0); 2288 __call_rcu(head, func, &rcu_sched_state, -1, 0);
2164} 2289}
2165EXPORT_SYMBOL_GPL(call_rcu_sched); 2290EXPORT_SYMBOL_GPL(call_rcu_sched);
2166 2291
@@ -2169,7 +2294,7 @@ EXPORT_SYMBOL_GPL(call_rcu_sched);
2169 */ 2294 */
2170void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 2295void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
2171{ 2296{
2172 __call_rcu(head, func, &rcu_bh_state, 0); 2297 __call_rcu(head, func, &rcu_bh_state, -1, 0);
2173} 2298}
2174EXPORT_SYMBOL_GPL(call_rcu_bh); 2299EXPORT_SYMBOL_GPL(call_rcu_bh);
2175 2300
@@ -2205,10 +2330,28 @@ static inline int rcu_blocking_is_gp(void)
2205 * rcu_read_lock_sched(). 2330 * rcu_read_lock_sched().
2206 * 2331 *
2207 * This means that all preempt_disable code sequences, including NMI and 2332 * This means that all preempt_disable code sequences, including NMI and
2208 * hardware-interrupt handlers, in progress on entry will have completed 2333 * non-threaded hardware-interrupt handlers, in progress on entry will
2209 * before this primitive returns. However, this does not guarantee that 2334 * have completed before this primitive returns. However, this does not
2210 * softirq handlers will have completed, since in some kernels, these 2335 * guarantee that softirq handlers will have completed, since in some
2211 * handlers can run in process context, and can block. 2336 * kernels, these handlers can run in process context, and can block.
2337 *
2338 * Note that this guarantee implies further memory-ordering guarantees.
2339 * On systems with more than one CPU, when synchronize_sched() returns,
2340 * each CPU is guaranteed to have executed a full memory barrier since the
2341 * end of its last RCU-sched read-side critical section whose beginning
2342 * preceded the call to synchronize_sched(). In addition, each CPU having
2343 * an RCU read-side critical section that extends beyond the return from
2344 * synchronize_sched() is guaranteed to have executed a full memory barrier
2345 * after the beginning of synchronize_sched() and before the beginning of
2346 * that RCU read-side critical section. Note that these guarantees include
2347 * CPUs that are offline, idle, or executing in user mode, as well as CPUs
2348 * that are executing in the kernel.
2349 *
2350 * Furthermore, if CPU A invoked synchronize_sched(), which returned
2351 * to its caller on CPU B, then both CPU A and CPU B are guaranteed
2352 * to have executed a full memory barrier during the execution of
2353 * synchronize_sched() -- even if CPU A and CPU B are the same CPU (but
2354 * again only if the system has more than one CPU).
2212 * 2355 *
2213 * This primitive provides the guarantees made by the (now removed) 2356 * This primitive provides the guarantees made by the (now removed)
2214 * synchronize_kernel() API. In contrast, synchronize_rcu() only 2357 * synchronize_kernel() API. In contrast, synchronize_rcu() only
@@ -2224,7 +2367,10 @@ void synchronize_sched(void)
2224 "Illegal synchronize_sched() in RCU-sched read-side critical section"); 2367 "Illegal synchronize_sched() in RCU-sched read-side critical section");
2225 if (rcu_blocking_is_gp()) 2368 if (rcu_blocking_is_gp())
2226 return; 2369 return;
2227 wait_rcu_gp(call_rcu_sched); 2370 if (rcu_expedited)
2371 synchronize_sched_expedited();
2372 else
2373 wait_rcu_gp(call_rcu_sched);
2228} 2374}
2229EXPORT_SYMBOL_GPL(synchronize_sched); 2375EXPORT_SYMBOL_GPL(synchronize_sched);
2230 2376
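
[Editor's note] The expanded comment spells out what synchronize_sched() promises: every pre-existing preempt-disabled region (including non-threaded interrupt handlers) has finished by the time it returns, with full memory barriers implied on all CPUs involved. The hedged kernel-style sketch below shows the classic update pattern those guarantees exist for; my_entry, my_list, my_lock, and my_remove are illustrative names only.

#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct my_entry {
        struct list_head node;
        int key;
};

static LIST_HEAD(my_list);
static DEFINE_SPINLOCK(my_lock);

static void my_remove(struct my_entry *e)
{
        spin_lock(&my_lock);
        list_del_rcu(&e->node);         /* readers may still be traversing e */
        spin_unlock(&my_lock);

        synchronize_sched();            /* every pre-existing preempt-disabled
                                         * (RCU-sched) reader has now finished */
        kfree(e);                       /* safe: no reader can still see e */
}
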
@@ -2236,6 +2382,9 @@ EXPORT_SYMBOL_GPL(synchronize_sched);
2236 * read-side critical sections have completed. RCU read-side critical 2382 * read-side critical sections have completed. RCU read-side critical
2237 * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(), 2383 * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(),
2238 * and may be nested. 2384 * and may be nested.
2385 *
2386 * See the description of synchronize_sched() for more detailed information
2387 * on memory ordering guarantees.
2239 */ 2388 */
2240void synchronize_rcu_bh(void) 2389void synchronize_rcu_bh(void)
2241{ 2390{
@@ -2245,13 +2394,13 @@ void synchronize_rcu_bh(void)
2245 "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section"); 2394 "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section");
2246 if (rcu_blocking_is_gp()) 2395 if (rcu_blocking_is_gp())
2247 return; 2396 return;
2248 wait_rcu_gp(call_rcu_bh); 2397 if (rcu_expedited)
2398 synchronize_rcu_bh_expedited();
2399 else
2400 wait_rcu_gp(call_rcu_bh);
2249} 2401}
2250EXPORT_SYMBOL_GPL(synchronize_rcu_bh); 2402EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
2251 2403
2252static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0);
2253static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0);
2254
2255static int synchronize_sched_expedited_cpu_stop(void *data) 2404static int synchronize_sched_expedited_cpu_stop(void *data)
2256{ 2405{
2257 /* 2406 /*
@@ -2308,10 +2457,32 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
2308 */ 2457 */
2309void synchronize_sched_expedited(void) 2458void synchronize_sched_expedited(void)
2310{ 2459{
2311 int firstsnap, s, snap, trycount = 0; 2460 long firstsnap, s, snap;
2461 int trycount = 0;
2462 struct rcu_state *rsp = &rcu_sched_state;
2463
2464 /*
2465 * If we are in danger of counter wrap, just do synchronize_sched().
2466 * By allowing sync_sched_expedited_started to advance no more than
2467 * ULONG_MAX/8 ahead of sync_sched_expedited_done, we are ensuring
2468 * that more than 3.5 billion CPUs would be required to force a
2469 * counter wrap on a 32-bit system. Quite a few more CPUs would of
2470 * course be required on a 64-bit system.
2471 */
2472 if (ULONG_CMP_GE((ulong)atomic_long_read(&rsp->expedited_start),
2473 (ulong)atomic_long_read(&rsp->expedited_done) +
2474 ULONG_MAX / 8)) {
2475 synchronize_sched();
2476 atomic_long_inc(&rsp->expedited_wrap);
2477 return;
2478 }
2312 2479
2313 /* Note that atomic_inc_return() implies full memory barrier. */ 2480 /*
2314 firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started); 2481 * Take a ticket. Note that atomic_inc_return() implies a
2482 * full memory barrier.
2483 */
2484 snap = atomic_long_inc_return(&rsp->expedited_start);
2485 firstsnap = snap;
2315 get_online_cpus(); 2486 get_online_cpus();
2316 WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); 2487 WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
2317 2488
@@ -2323,48 +2494,65 @@ void synchronize_sched_expedited(void)
2323 synchronize_sched_expedited_cpu_stop, 2494 synchronize_sched_expedited_cpu_stop,
2324 NULL) == -EAGAIN) { 2495 NULL) == -EAGAIN) {
2325 put_online_cpus(); 2496 put_online_cpus();
2497 atomic_long_inc(&rsp->expedited_tryfail);
2498
2499 /* Check to see if someone else did our work for us. */
2500 s = atomic_long_read(&rsp->expedited_done);
2501 if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
2502 /* ensure test happens before caller kfree */
2503 smp_mb__before_atomic_inc(); /* ^^^ */
2504 atomic_long_inc(&rsp->expedited_workdone1);
2505 return;
2506 }
2326 2507
2327 /* No joy, try again later. Or just synchronize_sched(). */ 2508 /* No joy, try again later. Or just synchronize_sched(). */
2328 if (trycount++ < 10) { 2509 if (trycount++ < 10) {
2329 udelay(trycount * num_online_cpus()); 2510 udelay(trycount * num_online_cpus());
2330 } else { 2511 } else {
2331 synchronize_sched(); 2512 wait_rcu_gp(call_rcu_sched);
2513 atomic_long_inc(&rsp->expedited_normal);
2332 return; 2514 return;
2333 } 2515 }
2334 2516
2335 /* Check to see if someone else did our work for us. */ 2517 /* Recheck to see if someone else did our work for us. */
2336 s = atomic_read(&sync_sched_expedited_done); 2518 s = atomic_long_read(&rsp->expedited_done);
2337 if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) { 2519 if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
2338 smp_mb(); /* ensure test happens before caller kfree */ 2520 /* ensure test happens before caller kfree */
2521 smp_mb__before_atomic_inc(); /* ^^^ */
2522 atomic_long_inc(&rsp->expedited_workdone2);
2339 return; 2523 return;
2340 } 2524 }
2341 2525
2342 /* 2526 /*
2343 * Refetching sync_sched_expedited_started allows later 2527 * Refetching sync_sched_expedited_started allows later
2344 * callers to piggyback on our grace period. We subtract 2528 * callers to piggyback on our grace period. We retry
2345 * 1 to get the same token that the last incrementer got. 2529 * after they started, so our grace period works for them,
2346 * We retry after they started, so our grace period works 2530 * and they started after our first try, so their grace
2347 * for them, and they started after our first try, so their 2531 * period works for us.
2348 * grace period works for us.
2349 */ 2532 */
2350 get_online_cpus(); 2533 get_online_cpus();
2351 snap = atomic_read(&sync_sched_expedited_started); 2534 snap = atomic_long_read(&rsp->expedited_start);
2352 smp_mb(); /* ensure read is before try_stop_cpus(). */ 2535 smp_mb(); /* ensure read is before try_stop_cpus(). */
2353 } 2536 }
2537 atomic_long_inc(&rsp->expedited_stoppedcpus);
2354 2538
2355 /* 2539 /*
2356 * Everyone up to our most recent fetch is covered by our grace 2540 * Everyone up to our most recent fetch is covered by our grace
2357 * period. Update the counter, but only if our work is still 2541 * period. Update the counter, but only if our work is still
2358 * relevant -- which it won't be if someone who started later 2542 * relevant -- which it won't be if someone who started later
2359 * than we did beat us to the punch. 2543 * than we did already did their update.
2360 */ 2544 */
2361 do { 2545 do {
2362 s = atomic_read(&sync_sched_expedited_done); 2546 atomic_long_inc(&rsp->expedited_done_tries);
2363 if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) { 2547 s = atomic_long_read(&rsp->expedited_done);
2364 smp_mb(); /* ensure test happens before caller kfree */ 2548 if (ULONG_CMP_GE((ulong)s, (ulong)snap)) {
2549 /* ensure test happens before caller kfree */
2550 smp_mb__before_atomic_inc(); /* ^^^ */
2551 atomic_long_inc(&rsp->expedited_done_lost);
2365 break; 2552 break;
2366 } 2553 }
2367 } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s); 2554 } while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s);
2555 atomic_long_inc(&rsp->expedited_done_exit);
2368 2556
2369 put_online_cpus(); 2557 put_online_cpus();
2370} 2558}
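The function above is a ticket algorithm: every caller takes a ticket from expedited_start, a successful try_stop_cpus() pass advances expedited_done up to the caller's snapshot, and any caller whose ticket is already covered by expedited_done returns without doing further work, piggybacking on someone else's pass; after a handful of failures the real code falls back to a normal grace period. All comparisons go through ULONG_CMP_GE() so they stay correct across counter wrap. A compressed, standalone sketch of that scheme follows; it is illustrative only, with hypothetical names, a trivial stand-in for the stop-CPUs step, and C11 seq_cst atomics standing in for the kernel's explicit barriers.

/*
 * Illustrative sketch of the expedited-grace-period ticket scheme.
 * Not the kernel code: do_stop_cpus() is a trivial stand-in for
 * try_stop_cpus(), and the counters stand in for rsp->expedited_start
 * and rsp->expedited_done.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <limits.h>

#define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (unsigned long)((a) - (b)))

static atomic_long start_ticket;        /* like rsp->expedited_start */
static atomic_long done_ticket;         /* like rsp->expedited_done  */

static bool do_stop_cpus(void)          /* stand-in: pretend it can fail */
{
        static int calls;
        return ++calls % 3 != 0;
}

void expedited_gp_sketch(void)
{
        long snap = atomic_fetch_add(&start_ticket, 1) + 1;  /* take a ticket */

        while (!do_stop_cpus()) {
                /* Did a concurrent caller's pass already cover our ticket? */
                if (ULONG_CMP_GE(atomic_load(&done_ticket), snap))
                        return;         /* yes: piggyback on their grace period */
                /* No: refetch the latest ticket so that the pass we are about
                 * to retry also covers callers who arrived in the meantime. */
                snap = atomic_load(&start_ticket);
        }

        /* Advance done_ticket to our snapshot, but never move it backwards. */
        for (long s = atomic_load(&done_ticket);
             !ULONG_CMP_GE(s, snap) &&
             !atomic_compare_exchange_weak(&done_ticket, &s, snap);)
                ;       /* a later caller won the race; their value covers ours */
}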
@@ -2558,9 +2746,17 @@ static void _rcu_barrier(struct rcu_state *rsp)
2558 * When that callback is invoked, we will know that all of the 2746 * When that callback is invoked, we will know that all of the
2559 * corresponding CPU's preceding callbacks have been invoked. 2747 * corresponding CPU's preceding callbacks have been invoked.
2560 */ 2748 */
2561 for_each_online_cpu(cpu) { 2749 for_each_possible_cpu(cpu) {
2750 if (!cpu_online(cpu) && !is_nocb_cpu(cpu))
2751 continue;
2562 rdp = per_cpu_ptr(rsp->rda, cpu); 2752 rdp = per_cpu_ptr(rsp->rda, cpu);
2563 if (ACCESS_ONCE(rdp->qlen)) { 2753 if (is_nocb_cpu(cpu)) {
2754 _rcu_barrier_trace(rsp, "OnlineNoCB", cpu,
2755 rsp->n_barrier_done);
2756 atomic_inc(&rsp->barrier_cpu_count);
2757 __call_rcu(&rdp->barrier_head, rcu_barrier_callback,
2758 rsp, cpu, 0);
2759 } else if (ACCESS_ONCE(rdp->qlen)) {
2564 _rcu_barrier_trace(rsp, "OnlineQ", cpu, 2760 _rcu_barrier_trace(rsp, "OnlineQ", cpu,
2565 rsp->n_barrier_done); 2761 rsp->n_barrier_done);
2566 smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); 2762 smp_call_function_single(cpu, rcu_barrier_func, rsp, 1);
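The hunk above extends rcu_barrier()'s scheme to offloaded CPUs: the barrier posts one sentinel callback behind whatever callbacks each CPU already has queued (now including the no-CBs queues), counts the sentinels it posted, and waits until the last one runs. The count starts at one so an early-finishing sentinel cannot declare completion before the initiator has finished posting. A standalone sketch of that counting discipline, with stand-ins for posting the sentinel and for the completion the kernel really waits on:

/*
 * Sketch of the rcu_barrier() counting discipline; not the kernel code.
 * enqueue_sentinel_on() is a stand-in that runs the sentinel immediately;
 * the real code queues rcu_barrier_callback() behind the CPU's callbacks
 * and waits on a struct completion rather than spinning.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <sched.h>

static atomic_int pending;

static void sentinel_done(void)          /* runs after a CPU's earlier callbacks */
{
        atomic_fetch_sub(&pending, 1);
}

static bool cpu_has_callbacks(int cpu)   /* stand-in */
{
        return cpu % 2 == 0;
}

static void enqueue_sentinel_on(int cpu, void (*fn)(void))  /* stand-in */
{
        (void)cpu;
        fn();
}

void barrier_sketch(int nr_cpus)
{
        atomic_store(&pending, 1);       /* initiator's own reference */
        for (int cpu = 0; cpu < nr_cpus; cpu++)
                if (cpu_has_callbacks(cpu)) {
                        atomic_fetch_add(&pending, 1);
                        enqueue_sentinel_on(cpu, sentinel_done);
                }
        atomic_fetch_sub(&pending, 1);   /* drop the initiator's reference */
        while (atomic_load(&pending) != 0)
                sched_yield();           /* kernel: wait_for_completion() */
}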
@@ -2629,11 +2825,9 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
2629 rdp->dynticks = &per_cpu(rcu_dynticks, cpu); 2825 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
2630 WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); 2826 WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE);
2631 WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); 2827 WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
2632#ifdef CONFIG_RCU_USER_QS
2633 WARN_ON_ONCE(rdp->dynticks->in_user);
2634#endif
2635 rdp->cpu = cpu; 2828 rdp->cpu = cpu;
2636 rdp->rsp = rsp; 2829 rdp->rsp = rsp;
2830 rcu_boot_init_nocb_percpu_data(rdp);
2637 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2831 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2638} 2832}
2639 2833
@@ -2715,6 +2909,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
2715 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); 2909 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
2716 struct rcu_node *rnp = rdp->mynode; 2910 struct rcu_node *rnp = rdp->mynode;
2717 struct rcu_state *rsp; 2911 struct rcu_state *rsp;
2912 int ret = NOTIFY_OK;
2718 2913
2719 trace_rcu_utilization("Start CPU hotplug"); 2914 trace_rcu_utilization("Start CPU hotplug");
2720 switch (action) { 2915 switch (action) {
@@ -2728,7 +2923,10 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
2728 rcu_boost_kthread_setaffinity(rnp, -1); 2923 rcu_boost_kthread_setaffinity(rnp, -1);
2729 break; 2924 break;
2730 case CPU_DOWN_PREPARE: 2925 case CPU_DOWN_PREPARE:
2731 rcu_boost_kthread_setaffinity(rnp, cpu); 2926 if (nocb_cpu_expendable(cpu))
2927 rcu_boost_kthread_setaffinity(rnp, cpu);
2928 else
2929 ret = NOTIFY_BAD;
2732 break; 2930 break;
2733 case CPU_DYING: 2931 case CPU_DYING:
2734 case CPU_DYING_FROZEN: 2932 case CPU_DYING_FROZEN:
@@ -2752,7 +2950,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
2752 break; 2950 break;
2753 } 2951 }
2754 trace_rcu_utilization("End CPU hotplug"); 2952 trace_rcu_utilization("End CPU hotplug");
2755 return NOTIFY_OK; 2953 return ret;
2756} 2954}
2757 2955
2758/* 2956/*
@@ -2772,6 +2970,7 @@ static int __init rcu_spawn_gp_kthread(void)
2772 raw_spin_lock_irqsave(&rnp->lock, flags); 2970 raw_spin_lock_irqsave(&rnp->lock, flags);
2773 rsp->gp_kthread = t; 2971 rsp->gp_kthread = t;
2774 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2972 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2973 rcu_spawn_nocb_kthreads(rsp);
2775 } 2974 }
2776 return 0; 2975 return 0;
2777} 2976}
@@ -2842,6 +3041,10 @@ static void __init rcu_init_one(struct rcu_state *rsp,
2842 3041
2843 BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */ 3042 BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */
2844 3043
3044 /* Silence gcc 4.8 warning about array index out of range. */
3045 if (rcu_num_lvls > RCU_NUM_LVLS)
3046 panic("rcu_init_one: rcu_num_lvls overflow");
3047
2845 /* Initialize the level-tracking arrays. */ 3048 /* Initialize the level-tracking arrays. */
2846 3049
2847 for (i = 0; i < rcu_num_lvls; i++) 3050 for (i = 0; i < rcu_num_lvls; i++)
@@ -2967,6 +3170,7 @@ void __init rcu_init(void)
2967 rcu_init_one(&rcu_sched_state, &rcu_sched_data); 3170 rcu_init_one(&rcu_sched_state, &rcu_sched_data);
2968 rcu_init_one(&rcu_bh_state, &rcu_bh_data); 3171 rcu_init_one(&rcu_bh_state, &rcu_bh_data);
2969 __rcu_init_preempt(); 3172 __rcu_init_preempt();
3173 rcu_init_nocb();
2970 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 3174 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
2971 3175
2972 /* 3176 /*
@@ -2977,7 +3181,6 @@ void __init rcu_init(void)
2977 cpu_notifier(rcu_cpu_notify, 0); 3181 cpu_notifier(rcu_cpu_notify, 0);
2978 for_each_online_cpu(cpu) 3182 for_each_online_cpu(cpu)
2979 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); 3183 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
2980 check_cpu_stall_init();
2981} 3184}
2982 3185
2983#include "rcutree_plugin.h" 3186#include "rcutree_plugin.h"
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index a240f032848e..c896b5045d9d 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -102,10 +102,6 @@ struct rcu_dynticks {
102 /* idle-period nonlazy_posted snapshot. */ 102 /* idle-period nonlazy_posted snapshot. */
103 int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ 103 int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */
104#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ 104#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
105#ifdef CONFIG_RCU_USER_QS
106 bool ignore_user_qs; /* Treat userspace as extended QS or not */
107 bool in_user; /* Is the CPU in userland from RCU POV? */
108#endif
109}; 105};
110 106
111/* RCU's kthread states for tracing. */ 107/* RCU's kthread states for tracing. */
@@ -282,11 +278,14 @@ struct rcu_data {
282 */ 278 */
283 struct rcu_head *nxtlist; 279 struct rcu_head *nxtlist;
284 struct rcu_head **nxttail[RCU_NEXT_SIZE]; 280 struct rcu_head **nxttail[RCU_NEXT_SIZE];
281 unsigned long nxtcompleted[RCU_NEXT_SIZE];
282 /* grace periods for sublists. */
285 long qlen_lazy; /* # of lazy queued callbacks */ 283 long qlen_lazy; /* # of lazy queued callbacks */
286 long qlen; /* # of queued callbacks, incl lazy */ 284 long qlen; /* # of queued callbacks, incl lazy */
287 long qlen_last_fqs_check; 285 long qlen_last_fqs_check;
288 /* qlen at last check for QS forcing */ 286 /* qlen at last check for QS forcing */
289 unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ 287 unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */
288 unsigned long n_nocbs_invoked; /* count of no-CBs RCU cbs invoked. */
290 unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */ 289 unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */
291 unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */ 290 unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */
292 unsigned long n_force_qs_snap; 291 unsigned long n_force_qs_snap;
@@ -317,6 +316,18 @@ struct rcu_data {
317 struct rcu_head oom_head; 316 struct rcu_head oom_head;
318#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ 317#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
319 318
319 /* 7) Callback offloading. */
320#ifdef CONFIG_RCU_NOCB_CPU
321 struct rcu_head *nocb_head; /* CBs waiting for kthread. */
322 struct rcu_head **nocb_tail;
323 atomic_long_t nocb_q_count; /* # CBs waiting for kthread */
324 atomic_long_t nocb_q_count_lazy; /* (approximate). */
325 int nocb_p_count; /* # CBs being invoked by kthread */
326 int nocb_p_count_lazy; /* (approximate). */
327 wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */
328 struct task_struct *nocb_kthread;
329#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
330
320 int cpu; 331 int cpu;
321 struct rcu_state *rsp; 332 struct rcu_state *rsp;
322}; 333};
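The nocb_head/nocb_tail pair added above is the usual callback-list shape: a singly linked list of rcu_head structures plus a pointer to the last ->next field (or to the head while the list is empty), so a whole chain can be appended in O(1) and the tail pointer gives the xchg()-based enqueue added in rcutree_plugin.h something atomic to swing. A minimal single-threaded sketch of that shape, with hypothetical names:

/* Head/tail-pointer list like nocb_head/nocb_tail; single-threaded sketch.
 * The kernel swings the tail with xchg() so concurrent enqueuers stay safe. */
#include <stddef.h>

struct cb {
        struct cb *next;
};

struct cb_list {
        struct cb *head;
        struct cb **tail;       /* &head while empty, else &last_element->next */
};

void cb_list_init(struct cb_list *l)
{
        l->head = NULL;
        l->tail = &l->head;
}

/* Append the chain first .. *last_next in O(1), regardless of its length. */
void cb_list_append(struct cb_list *l, struct cb *first, struct cb **last_next)
{
        *l->tail = first;
        l->tail = last_next;
}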
@@ -330,11 +341,6 @@ struct rcu_data {
330 341
331#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ 342#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
332 343
333#ifdef CONFIG_PROVE_RCU
334#define RCU_STALL_DELAY_DELTA (5 * HZ)
335#else
336#define RCU_STALL_DELAY_DELTA 0
337#endif
338#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ 344#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */
339 /* to take at least one */ 345 /* to take at least one */
340 /* scheduling clock irq */ 346 /* scheduling clock irq */
@@ -369,6 +375,12 @@ struct rcu_state {
369 struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ 375 struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */
370 void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ 376 void (*call)(struct rcu_head *head, /* call_rcu() flavor. */
371 void (*func)(struct rcu_head *head)); 377 void (*func)(struct rcu_head *head));
378#ifdef CONFIG_RCU_NOCB_CPU
379 void (*call_remote)(struct rcu_head *head,
380 void (*func)(struct rcu_head *head));
381 /* call_rcu() flavor, but for */
382 /* placing on remote CPU. */
383#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
372 384
373 /* The following fields are guarded by the root rcu_node's lock. */ 385 /* The following fields are guarded by the root rcu_node's lock. */
374 386
@@ -383,9 +395,8 @@ struct rcu_state {
383 395
384 /* End of fields guarded by root rcu_node's lock. */ 396 /* End of fields guarded by root rcu_node's lock. */
385 397
386 raw_spinlock_t onofflock ____cacheline_internodealigned_in_smp; 398 raw_spinlock_t orphan_lock ____cacheline_internodealigned_in_smp;
387 /* exclude on/offline and */ 399 /* Protect following fields. */
388 /* starting new GP. */
389 struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */ 400 struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */
390 /* need a grace period. */ 401 /* need a grace period. */
391 struct rcu_head **orphan_nxttail; /* Tail of above. */ 402 struct rcu_head **orphan_nxttail; /* Tail of above. */
@@ -394,7 +405,7 @@ struct rcu_state {
394 struct rcu_head **orphan_donetail; /* Tail of above. */ 405 struct rcu_head **orphan_donetail; /* Tail of above. */
395 long qlen_lazy; /* Number of lazy callbacks. */ 406 long qlen_lazy; /* Number of lazy callbacks. */
396 long qlen; /* Total number of callbacks. */ 407 long qlen; /* Total number of callbacks. */
397 /* End of fields guarded by onofflock. */ 408 /* End of fields guarded by orphan_lock. */
398 409
399 struct mutex onoff_mutex; /* Coordinate hotplug & GPs. */ 410 struct mutex onoff_mutex; /* Coordinate hotplug & GPs. */
400 411
@@ -405,6 +416,18 @@ struct rcu_state {
405 /* _rcu_barrier(). */ 416 /* _rcu_barrier(). */
406 /* End of fields guarded by barrier_mutex. */ 417 /* End of fields guarded by barrier_mutex. */
407 418
419 atomic_long_t expedited_start; /* Starting ticket. */
420 atomic_long_t expedited_done; /* Done ticket. */
421 atomic_long_t expedited_wrap; /* # near-wrap incidents. */
422 atomic_long_t expedited_tryfail; /* # acquisition failures. */
423 atomic_long_t expedited_workdone1; /* # done by others #1. */
424 atomic_long_t expedited_workdone2; /* # done by others #2. */
425 atomic_long_t expedited_normal; /* # fallbacks to normal. */
426 atomic_long_t expedited_stoppedcpus; /* # successful stop_cpus. */
427 atomic_long_t expedited_done_tries; /* # tries to update _done. */
428 atomic_long_t expedited_done_lost; /* # times beaten to _done. */
429 atomic_long_t expedited_done_exit; /* # times exited _done loop. */
430
408 unsigned long jiffies_force_qs; /* Time at which to invoke */ 431 unsigned long jiffies_force_qs; /* Time at which to invoke */
409 /* force_quiescent_state(). */ 432 /* force_quiescent_state(). */
410 unsigned long n_force_qs; /* Number of calls to */ 433 unsigned long n_force_qs; /* Number of calls to */
@@ -428,6 +451,8 @@ struct rcu_state {
428#define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */ 451#define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */
429 452
430extern struct list_head rcu_struct_flavors; 453extern struct list_head rcu_struct_flavors;
454
455/* Sequence through rcu_state structures for each RCU flavor. */
431#define for_each_rcu_flavor(rsp) \ 456#define for_each_rcu_flavor(rsp) \
432 list_for_each_entry((rsp), &rcu_struct_flavors, flavors) 457 list_for_each_entry((rsp), &rcu_struct_flavors, flavors)
433 458
@@ -504,5 +529,32 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu);
504static void print_cpu_stall_info_end(void); 529static void print_cpu_stall_info_end(void);
505static void zero_cpu_stall_ticks(struct rcu_data *rdp); 530static void zero_cpu_stall_ticks(struct rcu_data *rdp);
506static void increment_cpu_stall_ticks(void); 531static void increment_cpu_stall_ticks(void);
532static bool is_nocb_cpu(int cpu);
533static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
534 bool lazy);
535static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
536 struct rcu_data *rdp);
537static bool nocb_cpu_expendable(int cpu);
538static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
539static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp);
540static void init_nocb_callback_list(struct rcu_data *rdp);
541static void __init rcu_init_nocb(void);
507 542
508#endif /* #ifndef RCU_TREE_NONCORE */ 543#endif /* #ifndef RCU_TREE_NONCORE */
544
545#ifdef CONFIG_RCU_TRACE
546#ifdef CONFIG_RCU_NOCB_CPU
547/* Sum up queue lengths for tracing. */
548static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
549{
550 *ql = atomic_long_read(&rdp->nocb_q_count) + rdp->nocb_p_count;
551 *qll = atomic_long_read(&rdp->nocb_q_count_lazy) + rdp->nocb_p_count_lazy;
552}
553#else /* #ifdef CONFIG_RCU_NOCB_CPU */
554static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
555{
556 *ql = 0;
557 *qll = 0;
558}
559#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
560#endif /* #ifdef CONFIG_RCU_TRACE */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index f92115488187..c1cc7e17ff9d 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -25,6 +25,7 @@
25 */ 25 */
26 26
27#include <linux/delay.h> 27#include <linux/delay.h>
28#include <linux/gfp.h>
28#include <linux/oom.h> 29#include <linux/oom.h>
29#include <linux/smpboot.h> 30#include <linux/smpboot.h>
30 31
@@ -36,6 +37,13 @@
36#define RCU_BOOST_PRIO RCU_KTHREAD_PRIO 37#define RCU_BOOST_PRIO RCU_KTHREAD_PRIO
37#endif 38#endif
38 39
40#ifdef CONFIG_RCU_NOCB_CPU
41static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
42static bool have_rcu_nocb_mask; /* Was rcu_nocb_mask allocated? */
 43static bool __read_mostly rcu_nocb_poll; /* Offload kthreads are to poll. */
44static char __initdata nocb_buf[NR_CPUS * 5];
45#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
46
39/* 47/*
40 * Check the RCU kernel configuration parameters and print informative 48 * Check the RCU kernel configuration parameters and print informative
41 * messages about anything out of the ordinary. If you like #ifdef, you 49 * messages about anything out of the ordinary. If you like #ifdef, you
@@ -76,6 +84,18 @@ static void __init rcu_bootup_announce_oddness(void)
76 printk(KERN_INFO "\tExperimental boot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); 84 printk(KERN_INFO "\tExperimental boot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf);
77 if (nr_cpu_ids != NR_CPUS) 85 if (nr_cpu_ids != NR_CPUS)
78 printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); 86 printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
87#ifdef CONFIG_RCU_NOCB_CPU
88 if (have_rcu_nocb_mask) {
89 if (cpumask_test_cpu(0, rcu_nocb_mask)) {
90 cpumask_clear_cpu(0, rcu_nocb_mask);
91 pr_info("\tCPU 0: illegal no-CBs CPU (cleared).\n");
92 }
93 cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask);
94 pr_info("\tExperimental no-CBs CPUs: %s.\n", nocb_buf);
95 if (rcu_nocb_poll)
96 pr_info("\tExperimental polled no-CBs CPUs.\n");
97 }
98#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
79} 99}
80 100
81#ifdef CONFIG_TREE_PREEMPT_RCU 101#ifdef CONFIG_TREE_PREEMPT_RCU
@@ -642,7 +662,7 @@ static void rcu_preempt_do_callbacks(void)
642 */ 662 */
643void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 663void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
644{ 664{
645 __call_rcu(head, func, &rcu_preempt_state, 0); 665 __call_rcu(head, func, &rcu_preempt_state, -1, 0);
646} 666}
647EXPORT_SYMBOL_GPL(call_rcu); 667EXPORT_SYMBOL_GPL(call_rcu);
648 668
@@ -656,7 +676,7 @@ EXPORT_SYMBOL_GPL(call_rcu);
656void kfree_call_rcu(struct rcu_head *head, 676void kfree_call_rcu(struct rcu_head *head,
657 void (*func)(struct rcu_head *rcu)) 677 void (*func)(struct rcu_head *rcu))
658{ 678{
659 __call_rcu(head, func, &rcu_preempt_state, 1); 679 __call_rcu(head, func, &rcu_preempt_state, -1, 1);
660} 680}
661EXPORT_SYMBOL_GPL(kfree_call_rcu); 681EXPORT_SYMBOL_GPL(kfree_call_rcu);
662 682
@@ -670,6 +690,9 @@ EXPORT_SYMBOL_GPL(kfree_call_rcu);
670 * concurrently with new RCU read-side critical sections that began while 690 * concurrently with new RCU read-side critical sections that began while
671 * synchronize_rcu() was waiting. RCU read-side critical sections are 691 * synchronize_rcu() was waiting. RCU read-side critical sections are
672 * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested. 692 * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested.
693 *
694 * See the description of synchronize_sched() for more detailed information
695 * on memory ordering guarantees.
673 */ 696 */
674void synchronize_rcu(void) 697void synchronize_rcu(void)
675{ 698{
@@ -679,7 +702,10 @@ void synchronize_rcu(void)
679 "Illegal synchronize_rcu() in RCU read-side critical section"); 702 "Illegal synchronize_rcu() in RCU read-side critical section");
680 if (!rcu_scheduler_active) 703 if (!rcu_scheduler_active)
681 return; 704 return;
682 wait_rcu_gp(call_rcu); 705 if (rcu_expedited)
706 synchronize_rcu_expedited();
707 else
708 wait_rcu_gp(call_rcu);
683} 709}
684EXPORT_SYMBOL_GPL(synchronize_rcu); 710EXPORT_SYMBOL_GPL(synchronize_rcu);
685 711
@@ -757,7 +783,8 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
757 * grace period for the specified rcu_node structure. If there are no such 783 * grace period for the specified rcu_node structure. If there are no such
758 * tasks, report it up the rcu_node hierarchy. 784 * tasks, report it up the rcu_node hierarchy.
759 * 785 *
760 * Caller must hold sync_rcu_preempt_exp_mutex and rsp->onofflock. 786 * Caller must hold sync_rcu_preempt_exp_mutex and must exclude
787 * CPU hotplug operations.
761 */ 788 */
762static void 789static void
763sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) 790sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
@@ -831,7 +858,7 @@ void synchronize_rcu_expedited(void)
831 udelay(trycount * num_online_cpus()); 858 udelay(trycount * num_online_cpus());
832 } else { 859 } else {
833 put_online_cpus(); 860 put_online_cpus();
834 synchronize_rcu(); 861 wait_rcu_gp(call_rcu);
835 return; 862 return;
836 } 863 }
837 } 864 }
@@ -875,6 +902,11 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
875 902
876/** 903/**
877 * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. 904 * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
905 *
906 * Note that this primitive does not necessarily wait for an RCU grace period
907 * to complete. For example, if there are no RCU callbacks queued anywhere
908 * in the system, then rcu_barrier() is within its rights to return
909 * immediately, without waiting for anything, much less an RCU grace period.
878 */ 910 */
879void rcu_barrier(void) 911void rcu_barrier(void)
880{ 912{
@@ -1013,7 +1045,7 @@ static void rcu_preempt_check_callbacks(int cpu)
1013void kfree_call_rcu(struct rcu_head *head, 1045void kfree_call_rcu(struct rcu_head *head,
1014 void (*func)(struct rcu_head *rcu)) 1046 void (*func)(struct rcu_head *rcu))
1015{ 1047{
1016 __call_rcu(head, func, &rcu_sched_state, 1); 1048 __call_rcu(head, func, &rcu_sched_state, -1, 1);
1017} 1049}
1018EXPORT_SYMBOL_GPL(kfree_call_rcu); 1050EXPORT_SYMBOL_GPL(kfree_call_rcu);
1019 1051
@@ -2092,3 +2124,381 @@ static void increment_cpu_stall_ticks(void)
2092} 2124}
2093 2125
2094#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */ 2126#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */
2127
2128#ifdef CONFIG_RCU_NOCB_CPU
2129
2130/*
2131 * Offload callback processing from the boot-time-specified set of CPUs
2132 * specified by rcu_nocb_mask. For each CPU in the set, there is a
2133 * kthread created that pulls the callbacks from the corresponding CPU,
2134 * waits for a grace period to elapse, and invokes the callbacks.
2135 * The no-CBs CPUs do a wake_up() on their kthread when they insert
2136 * a callback into any empty list, unless the rcu_nocb_poll boot parameter
2137 * has been specified, in which case each kthread actively polls its
2138 * CPU. (Which isn't so great for energy efficiency, but which does
2139 * reduce RCU's overhead on that CPU.)
2140 *
2141 * This is intended to be used in conjunction with Frederic Weisbecker's
2142 * adaptive-idle work, which would seriously reduce OS jitter on CPUs
2143 * running CPU-bound user-mode computations.
2144 *
2145 * Offloading of callback processing could also in theory be used as
2146 * an energy-efficiency measure because CPUs with no RCU callbacks
2147 * queued are more aggressive about entering dyntick-idle mode.
2148 */
2149
2150
2151/* Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. */
2152static int __init rcu_nocb_setup(char *str)
2153{
2154 alloc_bootmem_cpumask_var(&rcu_nocb_mask);
2155 have_rcu_nocb_mask = true;
2156 cpulist_parse(str, rcu_nocb_mask);
2157 return 1;
2158}
2159__setup("rcu_nocbs=", rcu_nocb_setup);
2160
2161static int __init parse_rcu_nocb_poll(char *arg)
2162{
2163 rcu_nocb_poll = 1;
2164 return 0;
2165}
2166early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
2167
2168/* Is the specified CPU a no-CBs CPU? */
2169static bool is_nocb_cpu(int cpu)
2170{
2171 if (have_rcu_nocb_mask)
2172 return cpumask_test_cpu(cpu, rcu_nocb_mask);
2173 return false;
2174}
2175
2176/*
2177 * Enqueue the specified string of rcu_head structures onto the specified
2178 * CPU's no-CBs lists. The CPU is specified by rdp, the head of the
2179 * string by rhp, and the tail of the string by rhtp. The non-lazy/lazy
2180 * counts are supplied by rhcount and rhcount_lazy.
2181 *
2182 * If warranted, also wake up the kthread servicing this CPU's queues.
2183 */
2184static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
2185 struct rcu_head *rhp,
2186 struct rcu_head **rhtp,
2187 int rhcount, int rhcount_lazy)
2188{
2189 int len;
2190 struct rcu_head **old_rhpp;
2191 struct task_struct *t;
2192
2193 /* Enqueue the callback on the nocb list and update counts. */
2194 old_rhpp = xchg(&rdp->nocb_tail, rhtp);
2195 ACCESS_ONCE(*old_rhpp) = rhp;
2196 atomic_long_add(rhcount, &rdp->nocb_q_count);
2197 atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy);
2198
2199 /* If we are not being polled and there is a kthread, awaken it ... */
2200 t = ACCESS_ONCE(rdp->nocb_kthread);
2201 if (rcu_nocb_poll | !t)
2202 return;
2203 len = atomic_long_read(&rdp->nocb_q_count);
2204 if (old_rhpp == &rdp->nocb_head) {
2205 wake_up(&rdp->nocb_wq); /* ... only if queue was empty ... */
2206 rdp->qlen_last_fqs_check = 0;
2207 } else if (len > rdp->qlen_last_fqs_check + qhimark) {
2208 wake_up_process(t); /* ... or if many callbacks queued. */
2209 rdp->qlen_last_fqs_check = LONG_MAX / 2;
2210 }
2211 return;
2212}
2213
2214/*
2215 * This is a helper for __call_rcu(), which invokes this when the normal
2216 * callback queue is inoperable. If this is not a no-CBs CPU, this
2217 * function returns failure back to __call_rcu(), which can complain
2218 * appropriately.
2219 *
2220 * Otherwise, this function queues the callback where the corresponding
2221 * "rcuo" kthread can find it.
2222 */
2223static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
2224 bool lazy)
2225{
2226
2227 if (!is_nocb_cpu(rdp->cpu))
2228 return 0;
2229 __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy);
2230 return 1;
2231}
2232
2233/*
2234 * Adopt orphaned callbacks on a no-CBs CPU, or return 0 if this is
2235 * not a no-CBs CPU.
2236 */
2237static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
2238 struct rcu_data *rdp)
2239{
2240 long ql = rsp->qlen;
2241 long qll = rsp->qlen_lazy;
2242
2243 /* If this is not a no-CBs CPU, tell the caller to do it the old way. */
2244 if (!is_nocb_cpu(smp_processor_id()))
2245 return 0;
2246 rsp->qlen = 0;
2247 rsp->qlen_lazy = 0;
2248
2249 /* First, enqueue the donelist, if any. This preserves CB ordering. */
2250 if (rsp->orphan_donelist != NULL) {
2251 __call_rcu_nocb_enqueue(rdp, rsp->orphan_donelist,
2252 rsp->orphan_donetail, ql, qll);
2253 ql = qll = 0;
2254 rsp->orphan_donelist = NULL;
2255 rsp->orphan_donetail = &rsp->orphan_donelist;
2256 }
2257 if (rsp->orphan_nxtlist != NULL) {
2258 __call_rcu_nocb_enqueue(rdp, rsp->orphan_nxtlist,
2259 rsp->orphan_nxttail, ql, qll);
2260 ql = qll = 0;
2261 rsp->orphan_nxtlist = NULL;
2262 rsp->orphan_nxttail = &rsp->orphan_nxtlist;
2263 }
2264 return 1;
2265}
2266
2267/*
2268 * There must be at least one non-no-CBs CPU in operation at any given
2269 * time, because no-CBs CPUs are not capable of initiating grace periods
2270 * independently. This function therefore complains if the specified
2271 * CPU is the last non-no-CBs CPU, allowing the CPU-hotplug system to
2272 * avoid offlining the last such CPU. (Recursion is a wonderful thing,
2273 * but you have to have a base case!)
2274 */
2275static bool nocb_cpu_expendable(int cpu)
2276{
2277 cpumask_var_t non_nocb_cpus;
2278 int ret;
2279
2280 /*
2281 * If there are no no-CB CPUs or if this CPU is not a no-CB CPU,
2282 * then offlining this CPU is harmless. Let it happen.
2283 */
2284 if (!have_rcu_nocb_mask || is_nocb_cpu(cpu))
2285 return 1;
2286
2287 /* If no memory, play it safe and keep the CPU around. */
2288 if (!alloc_cpumask_var(&non_nocb_cpus, GFP_NOIO))
2289 return 0;
2290 cpumask_andnot(non_nocb_cpus, cpu_online_mask, rcu_nocb_mask);
2291 cpumask_clear_cpu(cpu, non_nocb_cpus);
2292 ret = !cpumask_empty(non_nocb_cpus);
2293 free_cpumask_var(non_nocb_cpus);
2294 return ret;
2295}
2296
2297/*
2298 * Helper structure for remote registry of RCU callbacks.
2299 * This is needed for when a no-CBs CPU needs to start a grace period.
2300 * If it just invokes call_rcu(), the resulting callback will be queued,
2301 * which can result in deadlock.
2302 */
2303struct rcu_head_remote {
2304 struct rcu_head *rhp;
2305 call_rcu_func_t *crf;
2306 void (*func)(struct rcu_head *rhp);
2307};
2308
2309/*
2310 * Register a callback as specified by the rcu_head_remote struct.
2311 * This function is intended to be invoked via smp_call_function_single().
2312 */
2313static void call_rcu_local(void *arg)
2314{
2315 struct rcu_head_remote *rhrp =
2316 container_of(arg, struct rcu_head_remote, rhp);
2317
2318 rhrp->crf(rhrp->rhp, rhrp->func);
2319}
2320
2321/*
2322 * Set up an rcu_head_remote structure and then invoke call_rcu_local()
2323 * on CPU 0 (which is guaranteed to be a non-no-CBs CPU) via
2324 * smp_call_function_single().
2325 */
2326static void invoke_crf_remote(struct rcu_head *rhp,
2327 void (*func)(struct rcu_head *rhp),
2328 call_rcu_func_t crf)
2329{
2330 struct rcu_head_remote rhr;
2331
2332 rhr.rhp = rhp;
2333 rhr.crf = crf;
2334 rhr.func = func;
2335 smp_call_function_single(0, call_rcu_local, &rhr, 1);
2336}
2337
2338/*
2339 * Helper functions to be passed to wait_rcu_gp(), each of which
2340 * invokes invoke_crf_remote() to register a callback appropriately.
2341 */
2342static void __maybe_unused
2343call_rcu_preempt_remote(struct rcu_head *rhp,
2344 void (*func)(struct rcu_head *rhp))
2345{
2346 invoke_crf_remote(rhp, func, call_rcu);
2347}
2348static void call_rcu_bh_remote(struct rcu_head *rhp,
2349 void (*func)(struct rcu_head *rhp))
2350{
2351 invoke_crf_remote(rhp, func, call_rcu_bh);
2352}
2353static void call_rcu_sched_remote(struct rcu_head *rhp,
2354 void (*func)(struct rcu_head *rhp))
2355{
2356 invoke_crf_remote(rhp, func, call_rcu_sched);
2357}
2358
2359/*
2360 * Per-rcu_data kthread, but only for no-CBs CPUs. Each kthread invokes
2361 * callbacks queued by the corresponding no-CBs CPU.
2362 */
2363static int rcu_nocb_kthread(void *arg)
2364{
2365 int c, cl;
2366 struct rcu_head *list;
2367 struct rcu_head *next;
2368 struct rcu_head **tail;
2369 struct rcu_data *rdp = arg;
2370
2371 /* Each pass through this loop invokes one batch of callbacks */
2372 for (;;) {
2373 /* If not polling, wait for next batch of callbacks. */
2374 if (!rcu_nocb_poll)
2375 wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head);
2376 list = ACCESS_ONCE(rdp->nocb_head);
2377 if (!list) {
2378 schedule_timeout_interruptible(1);
2379 flush_signals(current);
2380 continue;
2381 }
2382
2383 /*
2384 * Extract queued callbacks, update counts, and wait
2385 * for a grace period to elapse.
2386 */
2387 ACCESS_ONCE(rdp->nocb_head) = NULL;
2388 tail = xchg(&rdp->nocb_tail, &rdp->nocb_head);
2389 c = atomic_long_xchg(&rdp->nocb_q_count, 0);
2390 cl = atomic_long_xchg(&rdp->nocb_q_count_lazy, 0);
2391 ACCESS_ONCE(rdp->nocb_p_count) += c;
2392 ACCESS_ONCE(rdp->nocb_p_count_lazy) += cl;
2393 wait_rcu_gp(rdp->rsp->call_remote);
2394
2395 /* Each pass through the following loop invokes a callback. */
2396 trace_rcu_batch_start(rdp->rsp->name, cl, c, -1);
2397 c = cl = 0;
2398 while (list) {
2399 next = list->next;
2400 /* Wait for enqueuing to complete, if needed. */
2401 while (next == NULL && &list->next != tail) {
2402 schedule_timeout_interruptible(1);
2403 next = list->next;
2404 }
2405 debug_rcu_head_unqueue(list);
2406 local_bh_disable();
2407 if (__rcu_reclaim(rdp->rsp->name, list))
2408 cl++;
2409 c++;
2410 local_bh_enable();
2411 list = next;
2412 }
2413 trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1);
2414 ACCESS_ONCE(rdp->nocb_p_count) -= c;
2415 ACCESS_ONCE(rdp->nocb_p_count_lazy) -= cl;
2416 rdp->n_nocbs_invoked += c;
2417 }
2418 return 0;
2419}
2420
2421/* Initialize per-rcu_data variables for no-CBs CPUs. */
2422static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
2423{
2424 rdp->nocb_tail = &rdp->nocb_head;
2425 init_waitqueue_head(&rdp->nocb_wq);
2426}
2427
2428/* Create a kthread for each RCU flavor for each no-CBs CPU. */
2429static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
2430{
2431 int cpu;
2432 struct rcu_data *rdp;
2433 struct task_struct *t;
2434
2435 if (rcu_nocb_mask == NULL)
2436 return;
2437 for_each_cpu(cpu, rcu_nocb_mask) {
2438 rdp = per_cpu_ptr(rsp->rda, cpu);
2439 t = kthread_run(rcu_nocb_kthread, rdp, "rcuo%d", cpu);
2440 BUG_ON(IS_ERR(t));
2441 ACCESS_ONCE(rdp->nocb_kthread) = t;
2442 }
2443}
2444
2445/* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */
2446static void init_nocb_callback_list(struct rcu_data *rdp)
2447{
2448 if (rcu_nocb_mask == NULL ||
2449 !cpumask_test_cpu(rdp->cpu, rcu_nocb_mask))
2450 return;
2451 rdp->nxttail[RCU_NEXT_TAIL] = NULL;
2452}
2453
2454/* Initialize the ->call_remote fields in the rcu_state structures. */
2455static void __init rcu_init_nocb(void)
2456{
2457#ifdef CONFIG_PREEMPT_RCU
2458 rcu_preempt_state.call_remote = call_rcu_preempt_remote;
2459#endif /* #ifdef CONFIG_PREEMPT_RCU */
2460 rcu_bh_state.call_remote = call_rcu_bh_remote;
2461 rcu_sched_state.call_remote = call_rcu_sched_remote;
2462}
2463
2464#else /* #ifdef CONFIG_RCU_NOCB_CPU */
2465
2466static bool is_nocb_cpu(int cpu)
2467{
2468 return false;
2469}
2470
2471static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
2472 bool lazy)
2473{
2474 return 0;
2475}
2476
2477static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
2478 struct rcu_data *rdp)
2479{
2480 return 0;
2481}
2482
2483static bool nocb_cpu_expendable(int cpu)
2484{
2485 return 1;
2486}
2487
2488static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
2489{
2490}
2491
2492static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
2493{
2494}
2495
2496static void init_nocb_callback_list(struct rcu_data *rdp)
2497{
2498}
2499
2500static void __init rcu_init_nocb(void)
2501{
2502}
2503
2504#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
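At the heart of the no-CBs path above is a nearly lock-free hand-off between callers and the per-CPU "rcuo" kthread: __call_rcu_nocb_enqueue() publishes a callback by xchg()ing the tail pointer and then storing into the slot it claimed (waking the kthread only when the list was empty), while rcu_nocb_kthread() detaches the whole list, swings the tail back to the head, waits for a grace period, and walks what it took, spinning briefly on any ->next slot whose enqueuer has claimed it but not yet filled it in. A compressed standalone sketch of that hand-off, with C11 atomics standing in for xchg()/ACCESS_ONCE(); a single consumer is assumed, the names are hypothetical, and the grace-period wait and callback invocation are left as comments:

/* Sketch of the no-CBs enqueue/drain hand-off; not the kernel code. */
#include <stdatomic.h>
#include <stddef.h>

struct cb {
        struct cb *_Atomic next;
};

static struct cb *_Atomic head;
static struct cb *_Atomic *_Atomic tail = &head;

void nocb_enqueue(struct cb *rhp)        /* many producers */
{
        atomic_store(&rhp->next, NULL);
        /* Claim the current tail slot, then publish the element into it. */
        struct cb *_Atomic *old = atomic_exchange(&tail, &rhp->next);
        atomic_store(old, rhp);
        /* Real code: wake the kthread only if old == &head (list was empty). */
}

void nocb_drain(void)                    /* single consumer (the kthread) */
{
        struct cb *list = atomic_load(&head);

        if (!list)
                return;                  /* real code: sleep or poll, then retry */
        atomic_store(&head, NULL);
        struct cb *_Atomic *last = atomic_exchange(&tail, &head);

        /* Real code: wait for a grace period here, then invoke each callback. */
        while (list) {
                struct cb *next = atomic_load(&list->next);

                /* An enqueuer may have claimed this slot but not stored yet. */
                while (!next && &list->next != last)
                        next = atomic_load(&list->next);
                /* invoke(list); */
                list = next;
        }
}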
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 693513bc50e6..0d095dcaa670 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -46,29 +46,58 @@
46#define RCU_TREE_NONCORE 46#define RCU_TREE_NONCORE
47#include "rcutree.h" 47#include "rcutree.h"
48 48
49static int show_rcubarrier(struct seq_file *m, void *unused) 49#define ulong2long(a) (*(long *)(&(a)))
50
51static int r_open(struct inode *inode, struct file *file,
52 const struct seq_operations *op)
50{ 53{
51 struct rcu_state *rsp; 54 int ret = seq_open(file, op);
55 if (!ret) {
56 struct seq_file *m = (struct seq_file *)file->private_data;
57 m->private = inode->i_private;
58 }
59 return ret;
60}
61
62static void *r_start(struct seq_file *m, loff_t *pos)
63{
64 struct rcu_state *rsp = (struct rcu_state *)m->private;
65 *pos = cpumask_next(*pos - 1, cpu_possible_mask);
66 if ((*pos) < nr_cpu_ids)
67 return per_cpu_ptr(rsp->rda, *pos);
68 return NULL;
69}
52 70
53 for_each_rcu_flavor(rsp) 71static void *r_next(struct seq_file *m, void *v, loff_t *pos)
54 seq_printf(m, "%s: bcc: %d nbd: %lu\n", 72{
55 rsp->name, 73 (*pos)++;
56 atomic_read(&rsp->barrier_cpu_count), 74 return r_start(m, pos);
57 rsp->n_barrier_done); 75}
76
77static void r_stop(struct seq_file *m, void *v)
78{
79}
80
81static int show_rcubarrier(struct seq_file *m, void *v)
82{
83 struct rcu_state *rsp = (struct rcu_state *)m->private;
84 seq_printf(m, "bcc: %d nbd: %lu\n",
85 atomic_read(&rsp->barrier_cpu_count),
86 rsp->n_barrier_done);
58 return 0; 87 return 0;
59} 88}
60 89
61static int rcubarrier_open(struct inode *inode, struct file *file) 90static int rcubarrier_open(struct inode *inode, struct file *file)
62{ 91{
63 return single_open(file, show_rcubarrier, NULL); 92 return single_open(file, show_rcubarrier, inode->i_private);
64} 93}
65 94
66static const struct file_operations rcubarrier_fops = { 95static const struct file_operations rcubarrier_fops = {
67 .owner = THIS_MODULE, 96 .owner = THIS_MODULE,
68 .open = rcubarrier_open, 97 .open = rcubarrier_open,
69 .read = seq_read, 98 .read = seq_read,
70 .llseek = seq_lseek, 99 .llseek = no_llseek,
71 .release = single_release, 100 .release = seq_release,
72}; 101};
73 102
74#ifdef CONFIG_RCU_BOOST 103#ifdef CONFIG_RCU_BOOST
@@ -84,12 +113,14 @@ static char convert_kthread_status(unsigned int kthread_status)
84 113
85static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) 114static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
86{ 115{
116 long ql, qll;
117
87 if (!rdp->beenonline) 118 if (!rdp->beenonline)
88 return; 119 return;
89 seq_printf(m, "%3d%cc=%lu g=%lu pq=%d qp=%d", 120 seq_printf(m, "%3d%cc=%ld g=%ld pq=%d qp=%d",
90 rdp->cpu, 121 rdp->cpu,
91 cpu_is_offline(rdp->cpu) ? '!' : ' ', 122 cpu_is_offline(rdp->cpu) ? '!' : ' ',
92 rdp->completed, rdp->gpnum, 123 ulong2long(rdp->completed), ulong2long(rdp->gpnum),
93 rdp->passed_quiesce, rdp->qs_pending); 124 rdp->passed_quiesce, rdp->qs_pending);
94 seq_printf(m, " dt=%d/%llx/%d df=%lu", 125 seq_printf(m, " dt=%d/%llx/%d df=%lu",
95 atomic_read(&rdp->dynticks->dynticks), 126 atomic_read(&rdp->dynticks->dynticks),
@@ -97,8 +128,11 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
97 rdp->dynticks->dynticks_nmi_nesting, 128 rdp->dynticks->dynticks_nmi_nesting,
98 rdp->dynticks_fqs); 129 rdp->dynticks_fqs);
99 seq_printf(m, " of=%lu", rdp->offline_fqs); 130 seq_printf(m, " of=%lu", rdp->offline_fqs);
131 rcu_nocb_q_lengths(rdp, &ql, &qll);
132 qll += rdp->qlen_lazy;
133 ql += rdp->qlen;
100 seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c", 134 seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c",
101 rdp->qlen_lazy, rdp->qlen, 135 qll, ql,
102 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != 136 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
103 rdp->nxttail[RCU_NEXT_TAIL]], 137 rdp->nxttail[RCU_NEXT_TAIL]],
104 ".R"[rdp->nxttail[RCU_WAIT_TAIL] != 138 ".R"[rdp->nxttail[RCU_WAIT_TAIL] !=
@@ -114,101 +148,67 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
114 per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff); 148 per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff);
115#endif /* #ifdef CONFIG_RCU_BOOST */ 149#endif /* #ifdef CONFIG_RCU_BOOST */
116 seq_printf(m, " b=%ld", rdp->blimit); 150 seq_printf(m, " b=%ld", rdp->blimit);
117 seq_printf(m, " ci=%lu co=%lu ca=%lu\n", 151 seq_printf(m, " ci=%lu nci=%lu co=%lu ca=%lu\n",
118 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); 152 rdp->n_cbs_invoked, rdp->n_nocbs_invoked,
153 rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
119} 154}
120 155
121static int show_rcudata(struct seq_file *m, void *unused) 156static int show_rcudata(struct seq_file *m, void *v)
122{ 157{
123 int cpu; 158 print_one_rcu_data(m, (struct rcu_data *)v);
124 struct rcu_state *rsp;
125
126 for_each_rcu_flavor(rsp) {
127 seq_printf(m, "%s:\n", rsp->name);
128 for_each_possible_cpu(cpu)
129 print_one_rcu_data(m, per_cpu_ptr(rsp->rda, cpu));
130 }
131 return 0; 159 return 0;
132} 160}
133 161
162static const struct seq_operations rcudate_op = {
163 .start = r_start,
164 .next = r_next,
165 .stop = r_stop,
166 .show = show_rcudata,
167};
168
134static int rcudata_open(struct inode *inode, struct file *file) 169static int rcudata_open(struct inode *inode, struct file *file)
135{ 170{
136 return single_open(file, show_rcudata, NULL); 171 return r_open(inode, file, &rcudate_op);
137} 172}
138 173
139static const struct file_operations rcudata_fops = { 174static const struct file_operations rcudata_fops = {
140 .owner = THIS_MODULE, 175 .owner = THIS_MODULE,
141 .open = rcudata_open, 176 .open = rcudata_open,
142 .read = seq_read, 177 .read = seq_read,
143 .llseek = seq_lseek, 178 .llseek = no_llseek,
144 .release = single_release, 179 .release = seq_release,
145}; 180};
146 181
147static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) 182static int show_rcuexp(struct seq_file *m, void *v)
148{
149 if (!rdp->beenonline)
150 return;
151 seq_printf(m, "%d,%s,%lu,%lu,%d,%d",
152 rdp->cpu,
153 cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"",
154 rdp->completed, rdp->gpnum,
155 rdp->passed_quiesce, rdp->qs_pending);
156 seq_printf(m, ",%d,%llx,%d,%lu",
157 atomic_read(&rdp->dynticks->dynticks),
158 rdp->dynticks->dynticks_nesting,
159 rdp->dynticks->dynticks_nmi_nesting,
160 rdp->dynticks_fqs);
161 seq_printf(m, ",%lu", rdp->offline_fqs);
162 seq_printf(m, ",%ld,%ld,\"%c%c%c%c\"", rdp->qlen_lazy, rdp->qlen,
163 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
164 rdp->nxttail[RCU_NEXT_TAIL]],
165 ".R"[rdp->nxttail[RCU_WAIT_TAIL] !=
166 rdp->nxttail[RCU_NEXT_READY_TAIL]],
167 ".W"[rdp->nxttail[RCU_DONE_TAIL] !=
168 rdp->nxttail[RCU_WAIT_TAIL]],
169 ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]);
170#ifdef CONFIG_RCU_BOOST
171 seq_printf(m, ",%d,\"%c\"",
172 per_cpu(rcu_cpu_has_work, rdp->cpu),
173 convert_kthread_status(per_cpu(rcu_cpu_kthread_status,
174 rdp->cpu)));
175#endif /* #ifdef CONFIG_RCU_BOOST */
176 seq_printf(m, ",%ld", rdp->blimit);
177 seq_printf(m, ",%lu,%lu,%lu\n",
178 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
179}
180
181static int show_rcudata_csv(struct seq_file *m, void *unused)
182{ 183{
183 int cpu; 184 struct rcu_state *rsp = (struct rcu_state *)m->private;
184 struct rcu_state *rsp; 185
185 186 seq_printf(m, "s=%lu d=%lu w=%lu tf=%lu wd1=%lu wd2=%lu n=%lu sc=%lu dt=%lu dl=%lu dx=%lu\n",
186 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pq\","); 187 atomic_long_read(&rsp->expedited_start),
187 seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); 188 atomic_long_read(&rsp->expedited_done),
188 seq_puts(m, "\"of\",\"qll\",\"ql\",\"qs\""); 189 atomic_long_read(&rsp->expedited_wrap),
189#ifdef CONFIG_RCU_BOOST 190 atomic_long_read(&rsp->expedited_tryfail),
190 seq_puts(m, "\"kt\",\"ktl\""); 191 atomic_long_read(&rsp->expedited_workdone1),
191#endif /* #ifdef CONFIG_RCU_BOOST */ 192 atomic_long_read(&rsp->expedited_workdone2),
192 seq_puts(m, ",\"b\",\"ci\",\"co\",\"ca\"\n"); 193 atomic_long_read(&rsp->expedited_normal),
193 for_each_rcu_flavor(rsp) { 194 atomic_long_read(&rsp->expedited_stoppedcpus),
194 seq_printf(m, "\"%s:\"\n", rsp->name); 195 atomic_long_read(&rsp->expedited_done_tries),
195 for_each_possible_cpu(cpu) 196 atomic_long_read(&rsp->expedited_done_lost),
196 print_one_rcu_data_csv(m, per_cpu_ptr(rsp->rda, cpu)); 197 atomic_long_read(&rsp->expedited_done_exit));
197 }
198 return 0; 198 return 0;
199} 199}
200 200
201static int rcudata_csv_open(struct inode *inode, struct file *file) 201static int rcuexp_open(struct inode *inode, struct file *file)
202{ 202{
203 return single_open(file, show_rcudata_csv, NULL); 203 return single_open(file, show_rcuexp, inode->i_private);
204} 204}
205 205
206static const struct file_operations rcudata_csv_fops = { 206static const struct file_operations rcuexp_fops = {
207 .owner = THIS_MODULE, 207 .owner = THIS_MODULE,
208 .open = rcudata_csv_open, 208 .open = rcuexp_open,
209 .read = seq_read, 209 .read = seq_read,
210 .llseek = seq_lseek, 210 .llseek = no_llseek,
211 .release = single_release, 211 .release = seq_release,
212}; 212};
213 213
214#ifdef CONFIG_RCU_BOOST 214#ifdef CONFIG_RCU_BOOST
@@ -254,27 +254,11 @@ static const struct file_operations rcu_node_boost_fops = {
254 .owner = THIS_MODULE, 254 .owner = THIS_MODULE,
255 .open = rcu_node_boost_open, 255 .open = rcu_node_boost_open,
256 .read = seq_read, 256 .read = seq_read,
257 .llseek = seq_lseek, 257 .llseek = no_llseek,
258 .release = single_release, 258 .release = single_release,
259}; 259};
260 260
261/* 261#endif /* #ifdef CONFIG_RCU_BOOST */
262 * Create the rcuboost debugfs entry. Standard error return.
263 */
264static int rcu_boost_trace_create_file(struct dentry *rcudir)
265{
266 return !debugfs_create_file("rcuboost", 0444, rcudir, NULL,
267 &rcu_node_boost_fops);
268}
269
270#else /* #ifdef CONFIG_RCU_BOOST */
271
272static int rcu_boost_trace_create_file(struct dentry *rcudir)
273{
274 return 0; /* There cannot be an error if we didn't create it! */
275}
276
277#endif /* #else #ifdef CONFIG_RCU_BOOST */
278 262
279static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) 263static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
280{ 264{
@@ -283,8 +267,9 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
283 struct rcu_node *rnp; 267 struct rcu_node *rnp;
284 268
285 gpnum = rsp->gpnum; 269 gpnum = rsp->gpnum;
286 seq_printf(m, "%s: c=%lu g=%lu s=%d jfq=%ld j=%x ", 270 seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x ",
287 rsp->name, rsp->completed, gpnum, rsp->fqs_state, 271 ulong2long(rsp->completed), ulong2long(gpnum),
272 rsp->fqs_state,
288 (long)(rsp->jiffies_force_qs - jiffies), 273 (long)(rsp->jiffies_force_qs - jiffies),
289 (int)(jiffies & 0xffff)); 274 (int)(jiffies & 0xffff));
290 seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", 275 seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n",
@@ -306,26 +291,24 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
306 seq_puts(m, "\n"); 291 seq_puts(m, "\n");
307} 292}
308 293
309static int show_rcuhier(struct seq_file *m, void *unused) 294static int show_rcuhier(struct seq_file *m, void *v)
310{ 295{
311 struct rcu_state *rsp; 296 struct rcu_state *rsp = (struct rcu_state *)m->private;
312 297 print_one_rcu_state(m, rsp);
313 for_each_rcu_flavor(rsp)
314 print_one_rcu_state(m, rsp);
315 return 0; 298 return 0;
316} 299}
317 300
318static int rcuhier_open(struct inode *inode, struct file *file) 301static int rcuhier_open(struct inode *inode, struct file *file)
319{ 302{
320 return single_open(file, show_rcuhier, NULL); 303 return single_open(file, show_rcuhier, inode->i_private);
321} 304}
322 305
323static const struct file_operations rcuhier_fops = { 306static const struct file_operations rcuhier_fops = {
324 .owner = THIS_MODULE, 307 .owner = THIS_MODULE,
325 .open = rcuhier_open, 308 .open = rcuhier_open,
326 .read = seq_read, 309 .read = seq_read,
327 .llseek = seq_lseek, 310 .llseek = no_llseek,
328 .release = single_release, 311 .release = seq_release,
329}; 312};
330 313
331static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) 314static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp)
@@ -338,42 +321,42 @@ static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp)
338 struct rcu_node *rnp = &rsp->node[0]; 321 struct rcu_node *rnp = &rsp->node[0];
339 322
340 raw_spin_lock_irqsave(&rnp->lock, flags); 323 raw_spin_lock_irqsave(&rnp->lock, flags);
341 completed = rsp->completed; 324 completed = ACCESS_ONCE(rsp->completed);
342 gpnum = rsp->gpnum; 325 gpnum = ACCESS_ONCE(rsp->gpnum);
343 if (rsp->completed == rsp->gpnum) 326 if (completed == gpnum)
344 gpage = 0; 327 gpage = 0;
345 else 328 else
346 gpage = jiffies - rsp->gp_start; 329 gpage = jiffies - rsp->gp_start;
347 gpmax = rsp->gp_max; 330 gpmax = rsp->gp_max;
348 raw_spin_unlock_irqrestore(&rnp->lock, flags); 331 raw_spin_unlock_irqrestore(&rnp->lock, flags);
349 seq_printf(m, "%s: completed=%ld gpnum=%lu age=%ld max=%ld\n", 332 seq_printf(m, "completed=%ld gpnum=%ld age=%ld max=%ld\n",
350 rsp->name, completed, gpnum, gpage, gpmax); 333 ulong2long(completed), ulong2long(gpnum), gpage, gpmax);
351} 334}
352 335
353static int show_rcugp(struct seq_file *m, void *unused) 336static int show_rcugp(struct seq_file *m, void *v)
354{ 337{
355 struct rcu_state *rsp; 338 struct rcu_state *rsp = (struct rcu_state *)m->private;
356 339 show_one_rcugp(m, rsp);
357 for_each_rcu_flavor(rsp)
358 show_one_rcugp(m, rsp);
359 return 0; 340 return 0;
360} 341}
361 342
362static int rcugp_open(struct inode *inode, struct file *file) 343static int rcugp_open(struct inode *inode, struct file *file)
363{ 344{
364 return single_open(file, show_rcugp, NULL); 345 return single_open(file, show_rcugp, inode->i_private);
365} 346}
366 347
367static const struct file_operations rcugp_fops = { 348static const struct file_operations rcugp_fops = {
368 .owner = THIS_MODULE, 349 .owner = THIS_MODULE,
369 .open = rcugp_open, 350 .open = rcugp_open,
370 .read = seq_read, 351 .read = seq_read,
371 .llseek = seq_lseek, 352 .llseek = no_llseek,
372 .release = single_release, 353 .release = seq_release,
373}; 354};
374 355
375static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) 356static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
376{ 357{
358 if (!rdp->beenonline)
359 return;
377 seq_printf(m, "%3d%cnp=%ld ", 360 seq_printf(m, "%3d%cnp=%ld ",
378 rdp->cpu, 361 rdp->cpu,
379 cpu_is_offline(rdp->cpu) ? '!' : ' ', 362 cpu_is_offline(rdp->cpu) ? '!' : ' ',
@@ -389,34 +372,30 @@ static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
389 rdp->n_rp_need_nothing); 372 rdp->n_rp_need_nothing);
390} 373}
391 374
392static int show_rcu_pending(struct seq_file *m, void *unused) 375static int show_rcu_pending(struct seq_file *m, void *v)
393{ 376{
394 int cpu; 377 print_one_rcu_pending(m, (struct rcu_data *)v);
395 struct rcu_data *rdp;
396 struct rcu_state *rsp;
397
398 for_each_rcu_flavor(rsp) {
399 seq_printf(m, "%s:\n", rsp->name);
400 for_each_possible_cpu(cpu) {
401 rdp = per_cpu_ptr(rsp->rda, cpu);
402 if (rdp->beenonline)
403 print_one_rcu_pending(m, rdp);
404 }
405 }
406 return 0; 378 return 0;
407} 379}
408 380
381static const struct seq_operations rcu_pending_op = {
382 .start = r_start,
383 .next = r_next,
384 .stop = r_stop,
385 .show = show_rcu_pending,
386};
387
409static int rcu_pending_open(struct inode *inode, struct file *file) 388static int rcu_pending_open(struct inode *inode, struct file *file)
410{ 389{
411 return single_open(file, show_rcu_pending, NULL); 390 return r_open(inode, file, &rcu_pending_op);
412} 391}
413 392
414static const struct file_operations rcu_pending_fops = { 393static const struct file_operations rcu_pending_fops = {
415 .owner = THIS_MODULE, 394 .owner = THIS_MODULE,
416 .open = rcu_pending_open, 395 .open = rcu_pending_open,
417 .read = seq_read, 396 .read = seq_read,
418 .llseek = seq_lseek, 397 .llseek = no_llseek,
419 .release = single_release, 398 .release = seq_release,
420}; 399};
421 400
422static int show_rcutorture(struct seq_file *m, void *unused) 401static int show_rcutorture(struct seq_file *m, void *unused)
@@ -446,43 +425,58 @@ static struct dentry *rcudir;
446 425
447static int __init rcutree_trace_init(void) 426static int __init rcutree_trace_init(void)
448{ 427{
428 struct rcu_state *rsp;
449 struct dentry *retval; 429 struct dentry *retval;
430 struct dentry *rspdir;
450 431
451 rcudir = debugfs_create_dir("rcu", NULL); 432 rcudir = debugfs_create_dir("rcu", NULL);
452 if (!rcudir) 433 if (!rcudir)
453 goto free_out; 434 goto free_out;
454 435
455 retval = debugfs_create_file("rcubarrier", 0444, rcudir, 436 for_each_rcu_flavor(rsp) {
456 NULL, &rcubarrier_fops); 437 rspdir = debugfs_create_dir(rsp->name, rcudir);
457 if (!retval) 438 if (!rspdir)
458 goto free_out; 439 goto free_out;
459 440
460 retval = debugfs_create_file("rcudata", 0444, rcudir, 441 retval = debugfs_create_file("rcudata", 0444,
461 NULL, &rcudata_fops); 442 rspdir, rsp, &rcudata_fops);
462 if (!retval) 443 if (!retval)
463 goto free_out; 444 goto free_out;
464 445
465 retval = debugfs_create_file("rcudata.csv", 0444, rcudir, 446 retval = debugfs_create_file("rcuexp", 0444,
466 NULL, &rcudata_csv_fops); 447 rspdir, rsp, &rcuexp_fops);
467 if (!retval) 448 if (!retval)
468 goto free_out; 449 goto free_out;
469 450
470 if (rcu_boost_trace_create_file(rcudir)) 451 retval = debugfs_create_file("rcu_pending", 0444,
471 goto free_out; 452 rspdir, rsp, &rcu_pending_fops);
453 if (!retval)
454 goto free_out;
455
456 retval = debugfs_create_file("rcubarrier", 0444,
457 rspdir, rsp, &rcubarrier_fops);
458 if (!retval)
459 goto free_out;
472 460
473 retval = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops); 461#ifdef CONFIG_RCU_BOOST
474 if (!retval) 462 if (rsp == &rcu_preempt_state) {
475 goto free_out; 463 retval = debugfs_create_file("rcuboost", 0444,
464 rspdir, NULL, &rcu_node_boost_fops);
465 if (!retval)
466 goto free_out;
467 }
468#endif
476 469
477 retval = debugfs_create_file("rcuhier", 0444, rcudir, 470 retval = debugfs_create_file("rcugp", 0444,
478 NULL, &rcuhier_fops); 471 rspdir, rsp, &rcugp_fops);
479 if (!retval) 472 if (!retval)
480 goto free_out; 473 goto free_out;
481 474
482 retval = debugfs_create_file("rcu_pending", 0444, rcudir, 475 retval = debugfs_create_file("rcuhier", 0444,
483 NULL, &rcu_pending_fops); 476 rspdir, rsp, &rcuhier_fops);
484 if (!retval) 477 if (!retval)
485 goto free_out; 478 goto free_out;
479 }
486 480
487 retval = debugfs_create_file("rcutorture", 0444, rcudir, 481 retval = debugfs_create_file("rcutorture", 0444, rcudir,
488 NULL, &rcutorture_fops); 482 NULL, &rcutorture_fops);
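The rcutree_trace_init() rework above hangs one debugfs directory per RCU flavor off "rcu/" and passes the flavor's rcu_state as the data argument to debugfs_create_file(). That is what lets a single set of file_operations serve every flavor: the pointer comes back as inode->i_private in open() and is stashed in seq_file->private for the show functions. A self-contained sketch of that per-instance debugfs pattern as a hypothetical module (not part of the patch):

/* Hypothetical module demonstrating the inode->i_private / m->private pattern. */
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>

struct flavor {
        const char *name;
        unsigned long counter;
};

static struct flavor flavors[] = { { "alpha", 1 }, { "beta", 2 } };
static struct dentry *topdir;

static int flavor_show(struct seq_file *m, void *v)
{
        struct flavor *f = m->private;          /* came in as inode->i_private */

        seq_printf(m, "%s: %lu\n", f->name, f->counter);
        return 0;
}

static int flavor_open(struct inode *inode, struct file *file)
{
        return single_open(file, flavor_show, inode->i_private);
}

static const struct file_operations flavor_fops = {
        .owner   = THIS_MODULE,
        .open    = flavor_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = single_release,
};

static int __init flavor_init(void)
{
        int i;

        topdir = debugfs_create_dir("flavor_demo", NULL);
        if (!topdir)
                return -ENOMEM;
        for (i = 0; i < ARRAY_SIZE(flavors); i++)
                debugfs_create_file(flavors[i].name, 0444, topdir,
                                    &flavors[i], &flavor_fops);
        return 0;
}

static void __exit flavor_exit(void)
{
        debugfs_remove_recursive(topdir);
}

module_init(flavor_init);
module_exit(flavor_exit);
MODULE_LICENSE("GPL");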
diff --git a/kernel/relay.c b/kernel/relay.c
index e8cd2027abbd..01ab081ac53a 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1139,7 +1139,7 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos,
1139 if (!desc->count) 1139 if (!desc->count)
1140 return 0; 1140 return 0;
1141 1141
1142 mutex_lock(&filp->f_path.dentry->d_inode->i_mutex); 1142 mutex_lock(&file_inode(filp)->i_mutex);
1143 do { 1143 do {
1144 if (!relay_file_read_avail(buf, *ppos)) 1144 if (!relay_file_read_avail(buf, *ppos))
1145 break; 1145 break;
@@ -1159,7 +1159,7 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos,
1159 *ppos = relay_file_read_end_pos(buf, read_start, ret); 1159 *ppos = relay_file_read_end_pos(buf, read_start, ret);
1160 } 1160 }
1161 } while (desc->count && ret); 1161 } while (desc->count && ret);
1162 mutex_unlock(&filp->f_path.dentry->d_inode->i_mutex); 1162 mutex_unlock(&file_inode(filp)->i_mutex);
1163 1163
1164 return desc->written; 1164 return desc->written;
1165} 1165}
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index ad581aa2369a..ff55247e7049 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -86,33 +86,39 @@ int res_counter_charge_nofail(struct res_counter *counter, unsigned long val,
86 return __res_counter_charge(counter, val, limit_fail_at, true); 86 return __res_counter_charge(counter, val, limit_fail_at, true);
87} 87}
88 88
89void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) 89u64 res_counter_uncharge_locked(struct res_counter *counter, unsigned long val)
90{ 90{
91 if (WARN_ON(counter->usage < val)) 91 if (WARN_ON(counter->usage < val))
92 val = counter->usage; 92 val = counter->usage;
93 93
94 counter->usage -= val; 94 counter->usage -= val;
95 return counter->usage;
95} 96}
96 97
97void res_counter_uncharge_until(struct res_counter *counter, 98u64 res_counter_uncharge_until(struct res_counter *counter,
98 struct res_counter *top, 99 struct res_counter *top,
99 unsigned long val) 100 unsigned long val)
100{ 101{
101 unsigned long flags; 102 unsigned long flags;
102 struct res_counter *c; 103 struct res_counter *c;
104 u64 ret = 0;
103 105
104 local_irq_save(flags); 106 local_irq_save(flags);
105 for (c = counter; c != top; c = c->parent) { 107 for (c = counter; c != top; c = c->parent) {
108 u64 r;
106 spin_lock(&c->lock); 109 spin_lock(&c->lock);
107 res_counter_uncharge_locked(c, val); 110 r = res_counter_uncharge_locked(c, val);
111 if (c == counter)
112 ret = r;
108 spin_unlock(&c->lock); 113 spin_unlock(&c->lock);
109 } 114 }
110 local_irq_restore(flags); 115 local_irq_restore(flags);
116 return ret;
111} 117}
112 118
113void res_counter_uncharge(struct res_counter *counter, unsigned long val) 119u64 res_counter_uncharge(struct res_counter *counter, unsigned long val)
114{ 120{
115 res_counter_uncharge_until(counter, NULL, val); 121 return res_counter_uncharge_until(counter, NULL, val);
116} 122}
117 123
118static inline unsigned long long * 124static inline unsigned long long *
@@ -192,25 +198,3 @@ int res_counter_memparse_write_strategy(const char *buf,
192 *res = PAGE_ALIGN(*res); 198 *res = PAGE_ALIGN(*res);
193 return 0; 199 return 0;
194} 200}
195
196int res_counter_write(struct res_counter *counter, int member,
197 const char *buf, write_strategy_fn write_strategy)
198{
199 char *end;
200 unsigned long flags;
201 unsigned long long tmp, *val;
202
203 if (write_strategy) {
204 if (write_strategy(buf, &tmp))
205 return -EINVAL;
206 } else {
207 tmp = simple_strtoull(buf, &end, 10);
208 if (*end != '\0')
209 return -EINVAL;
210 }
211 spin_lock_irqsave(&counter->lock, flags);
212 val = res_counter_member(counter, member);
213 *val = tmp;
214 spin_unlock_irqrestore(&counter->lock, flags);
215 return 0;
216}
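
The res_counter change above makes the uncharge helpers return the counter's new usage, so a caller that uncharges a child level can see in one step whether that level dropped to zero, without re-reading under the lock. A lock-free userspace sketch of the parent walk (clamping stands in for the WARN_ON path; no spinlocks or IRQ handling):

#include <stdio.h>
#include <stdint.h>

struct counter {
        uint64_t usage;
        struct counter *parent;
};

/* Uncharge val from c and every ancestor up to (not including) top, and
 * report the remaining usage of the level the caller started from, which
 * is the piece of information the new u64 return value carries. */
static uint64_t counter_uncharge_until(struct counter *counter,
                                       struct counter *top, uint64_t val)
{
        struct counter *c;
        uint64_t ret = 0;

        for (c = counter; c != top; c = c->parent) {
                uint64_t take = val > c->usage ? c->usage : val;

                c->usage -= take;
                if (c == counter)
                        ret = c->usage;
        }
        return ret;
}

int main(void)
{
        struct counter root  = { .usage = 100, .parent = NULL };
        struct counter child = { .usage =  40, .parent = &root };
        uint64_t left = counter_uncharge_until(&child, NULL, 40);

        printf("child=%llu returned=%llu root=%llu\n",
               (unsigned long long)child.usage, (unsigned long long)left,
               (unsigned long long)root.usage);
        return 0;
}
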
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
index 16502d3a71c8..13b243a323fa 100644
--- a/kernel/rtmutex-debug.c
+++ b/kernel/rtmutex-debug.c
@@ -17,6 +17,7 @@
17 * See rt.c in preempt-rt for proper credits and further information 17 * See rt.c in preempt-rt for proper credits and further information
18 */ 18 */
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/sched/rt.h>
20#include <linux/delay.h> 21#include <linux/delay.h>
21#include <linux/export.h> 22#include <linux/export.h>
22#include <linux/spinlock.h> 23#include <linux/spinlock.h>
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 98ec49475460..7890b10084a7 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -10,6 +10,7 @@
10#include <linux/kthread.h> 10#include <linux/kthread.h>
11#include <linux/export.h> 11#include <linux/export.h>
12#include <linux/sched.h> 12#include <linux/sched.h>
13#include <linux/sched/rt.h>
13#include <linux/spinlock.h> 14#include <linux/spinlock.h>
14#include <linux/timer.h> 15#include <linux/timer.h>
15#include <linux/freezer.h> 16#include <linux/freezer.h>
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index a242e691c993..1e09308bf2a1 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -13,6 +13,7 @@
13#include <linux/spinlock.h> 13#include <linux/spinlock.h>
14#include <linux/export.h> 14#include <linux/export.h>
15#include <linux/sched.h> 15#include <linux/sched.h>
16#include <linux/sched/rt.h>
16#include <linux/timer.h> 17#include <linux/timer.h>
17 18
18#include "rtmutex_common.h" 19#include "rtmutex_common.h"
diff --git a/kernel/rwsem.c b/kernel/rwsem.c
index 6850f53e02d8..b3c6c3fcd847 100644
--- a/kernel/rwsem.c
+++ b/kernel/rwsem.c
@@ -116,6 +116,16 @@ void down_read_nested(struct rw_semaphore *sem, int subclass)
116 116
117EXPORT_SYMBOL(down_read_nested); 117EXPORT_SYMBOL(down_read_nested);
118 118
119void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest)
120{
121 might_sleep();
122 rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_);
123
124 LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
125}
126
127EXPORT_SYMBOL(_down_write_nest_lock);
128
119void down_write_nested(struct rw_semaphore *sem, int subclass) 129void down_write_nested(struct rw_semaphore *sem, int subclass)
120{ 130{
121 might_sleep(); 131 might_sleep();
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index 0984a21076a3..64de5f8b0c9e 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -35,6 +35,7 @@ static inline void autogroup_destroy(struct kref *kref)
35 ag->tg->rt_se = NULL; 35 ag->tg->rt_se = NULL;
36 ag->tg->rt_rq = NULL; 36 ag->tg->rt_rq = NULL;
37#endif 37#endif
38 sched_offline_group(ag->tg);
38 sched_destroy_group(ag->tg); 39 sched_destroy_group(ag->tg);
39} 40}
40 41
@@ -76,6 +77,8 @@ static inline struct autogroup *autogroup_create(void)
76 if (IS_ERR(tg)) 77 if (IS_ERR(tg))
77 goto out_free; 78 goto out_free;
78 79
80 sched_online_group(tg, &root_task_group);
81
79 kref_init(&ag->kref); 82 kref_init(&ag->kref);
80 init_rwsem(&ag->lock); 83 init_rwsem(&ag->lock);
81 ag->id = atomic_inc_return(&autogroup_seq_nr); 84 ag->id = atomic_inc_return(&autogroup_seq_nr);
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index c685e31492df..c3ae1446461c 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -176,10 +176,36 @@ static u64 sched_clock_remote(struct sched_clock_data *scd)
176 u64 this_clock, remote_clock; 176 u64 this_clock, remote_clock;
177 u64 *ptr, old_val, val; 177 u64 *ptr, old_val, val;
178 178
179#if BITS_PER_LONG != 64
180again:
181 /*
182 * Careful here: The local and the remote clock values need to
183 * be read out atomic as we need to compare the values and
184 * then update either the local or the remote side. So the
185 * cmpxchg64 below only protects one readout.
186 *
187 * We must reread via sched_clock_local() in the retry case on
188 * 32bit as an NMI could use sched_clock_local() via the
189 * tracer and hit between the readout of
190 * the low32bit and the high 32bit portion.
191 */
192 this_clock = sched_clock_local(my_scd);
193 /*
194 * We must enforce atomic readout on 32bit, otherwise the
                                 195	 * update on the remote cpu can hit in between the readout of
196 * the low32bit and the high 32bit portion.
197 */
198 remote_clock = cmpxchg64(&scd->clock, 0, 0);
199#else
200 /*
201 * On 64bit the read of [my]scd->clock is atomic versus the
202 * update, so we can avoid the above 32bit dance.
203 */
179 sched_clock_local(my_scd); 204 sched_clock_local(my_scd);
180again: 205again:
181 this_clock = my_scd->clock; 206 this_clock = my_scd->clock;
182 remote_clock = scd->clock; 207 remote_clock = scd->clock;
208#endif
183 209
184 /* 210 /*
185 * Use the opportunity that we have both locks 211 * Use the opportunity that we have both locks
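
The sched/clock.c hunk above avoids a torn 64-bit read on 32-bit builds by "reading" the remote clock with cmpxchg64(&scd->clock, 0, 0): even when the compare fails, the operation hands back the current value as one atomic unit. The same trick in portable C11, as a userspace sketch rather than the kernel primitive:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Read a 64-bit counter without tearing, even where plain 64-bit loads
 * are not single instructions: a failed compare-and-swap still returns
 * one consistent snapshot of the value. */
static uint64_t read_u64_atomic(_Atomic uint64_t *p)
{
        uint64_t expected = 0;

        /* If *p == 0 it is "swapped" with 0 (a no-op); otherwise the CAS
         * fails and writes the current value into 'expected'. */
        atomic_compare_exchange_strong(p, &expected, 0);
        return expected;
}

int main(void)
{
        _Atomic uint64_t clock = 0x0123456789abcdefULL;

        printf("clock = %#llx\n", (unsigned long long)read_u64_atomic(&clock));
        return 0;
}
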
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2d8927fda712..67d04651f44b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -72,6 +72,7 @@
72#include <linux/slab.h> 72#include <linux/slab.h>
73#include <linux/init_task.h> 73#include <linux/init_task.h>
74#include <linux/binfmts.h> 74#include <linux/binfmts.h>
75#include <linux/context_tracking.h>
75 76
76#include <asm/switch_to.h> 77#include <asm/switch_to.h>
77#include <asm/tlb.h> 78#include <asm/tlb.h>
@@ -82,7 +83,7 @@
82#endif 83#endif
83 84
84#include "sched.h" 85#include "sched.h"
85#include "../workqueue_sched.h" 86#include "../workqueue_internal.h"
86#include "../smpboot.h" 87#include "../smpboot.h"
87 88
88#define CREATE_TRACE_POINTS 89#define CREATE_TRACE_POINTS
@@ -192,23 +193,10 @@ static void sched_feat_disable(int i) { };
192static void sched_feat_enable(int i) { }; 193static void sched_feat_enable(int i) { };
193#endif /* HAVE_JUMP_LABEL */ 194#endif /* HAVE_JUMP_LABEL */
194 195
195static ssize_t 196static int sched_feat_set(char *cmp)
196sched_feat_write(struct file *filp, const char __user *ubuf,
197 size_t cnt, loff_t *ppos)
198{ 197{
199 char buf[64];
200 char *cmp;
201 int neg = 0;
202 int i; 198 int i;
203 199 int neg = 0;
204 if (cnt > 63)
205 cnt = 63;
206
207 if (copy_from_user(&buf, ubuf, cnt))
208 return -EFAULT;
209
210 buf[cnt] = 0;
211 cmp = strstrip(buf);
212 200
213 if (strncmp(cmp, "NO_", 3) == 0) { 201 if (strncmp(cmp, "NO_", 3) == 0) {
214 neg = 1; 202 neg = 1;
@@ -228,6 +216,27 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
228 } 216 }
229 } 217 }
230 218
219 return i;
220}
221
222static ssize_t
223sched_feat_write(struct file *filp, const char __user *ubuf,
224 size_t cnt, loff_t *ppos)
225{
226 char buf[64];
227 char *cmp;
228 int i;
229
230 if (cnt > 63)
231 cnt = 63;
232
233 if (copy_from_user(&buf, ubuf, cnt))
234 return -EFAULT;
235
236 buf[cnt] = 0;
237 cmp = strstrip(buf);
238
239 i = sched_feat_set(cmp);
231 if (i == __SCHED_FEAT_NR) 240 if (i == __SCHED_FEAT_NR)
232 return -EINVAL; 241 return -EINVAL;
233 242
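
sched_feat_set() is split out above so that code outside the debugfs write path (the new set_numabalancing_state() later in this diff) can flip scheduler features by name. A userspace sketch of the parser's shape, with a made-up feature table and bitmask standing in for sched_feat_names[] and the feature state: strip an optional NO_ prefix, look the name up, and return the one-past-the-end index when nothing matched.

#include <stdio.h>
#include <string.h>

static const char * const feat_names[] = {
        "GENTLE_FAIR_SLEEPERS", "START_DEBIT", "NUMA",
};
#define NR_FEATS (sizeof(feat_names) / sizeof(feat_names[0]))

static unsigned int feat_mask = 0x3;    /* bit i set <=> feature i enabled */

static size_t feat_set(const char *cmp)
{
        int neg = 0;
        size_t i;

        if (strncmp(cmp, "NO_", 3) == 0) {      /* "NO_FOO" clears FOO */
                neg = 1;
                cmp += 3;
        }
        for (i = 0; i < NR_FEATS; i++) {
                if (strcmp(cmp, feat_names[i]) == 0) {
                        if (neg)
                                feat_mask &= ~(1u << i);
                        else
                                feat_mask |= 1u << i;
                        break;
                }
        }
        return i;       /* == NR_FEATS means "no such feature" */
}

int main(void)
{
        feat_set("NO_START_DEBIT");
        feat_set("NUMA");
        printf("mask=%#x unknown=%zu\n", feat_mask, feat_set("BOGUS"));
        return 0;
}
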
@@ -922,6 +931,13 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
922 rq->skip_clock_update = 1; 931 rq->skip_clock_update = 1;
923} 932}
924 933
934static ATOMIC_NOTIFIER_HEAD(task_migration_notifier);
935
936void register_task_migration_notifier(struct notifier_block *n)
937{
938 atomic_notifier_chain_register(&task_migration_notifier, n);
939}
940
925#ifdef CONFIG_SMP 941#ifdef CONFIG_SMP
926void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 942void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
927{ 943{
@@ -952,8 +968,18 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
952 trace_sched_migrate_task(p, new_cpu); 968 trace_sched_migrate_task(p, new_cpu);
953 969
954 if (task_cpu(p) != new_cpu) { 970 if (task_cpu(p) != new_cpu) {
971 struct task_migration_notifier tmn;
972
973 if (p->sched_class->migrate_task_rq)
974 p->sched_class->migrate_task_rq(p, new_cpu);
955 p->se.nr_migrations++; 975 p->se.nr_migrations++;
956 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); 976 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
977
978 tmn.task = p;
979 tmn.from_cpu = task_cpu(p);
980 tmn.to_cpu = new_cpu;
981
982 atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn);
957 } 983 }
958 984
959 __set_task_cpu(p, new_cpu); 985 __set_task_cpu(p, new_cpu);
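
The task-migration notifier added above is a plain publish/subscribe hook: interested code registers a callback once, and set_task_cpu() fires the chain with the task and the from/to CPUs whenever a task really changes runqueue. A userspace sketch of the shape, with a simple linked list standing in for the RCU-safe atomic notifier chain:

#include <stdio.h>

struct migration_event { int task_id, from_cpu, to_cpu; };

struct notifier {
        int (*call)(struct notifier *self, const struct migration_event *ev);
        struct notifier *next;
};

static struct notifier *chain;  /* the kernel uses an atomic, RCU-walked chain */

static void notifier_register(struct notifier *n)
{
        n->next = chain;
        chain = n;
}

static void notifier_call_chain(const struct migration_event *ev)
{
        for (struct notifier *n = chain; n; n = n->next)
                n->call(n, ev);
}

static int print_migration(struct notifier *self, const struct migration_event *ev)
{
        (void)self;
        printf("task %d: cpu %d -> cpu %d\n", ev->task_id, ev->from_cpu, ev->to_cpu);
        return 0;
}

int main(void)
{
        struct notifier nb = { .call = print_migration };

        notifier_register(&nb);
        notifier_call_chain(&(struct migration_event){ .task_id = 7,
                                                       .from_cpu = 0, .to_cpu = 3 });
        return 0;
}
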
@@ -1106,18 +1132,28 @@ EXPORT_SYMBOL_GPL(kick_process);
1106 */ 1132 */
1107static int select_fallback_rq(int cpu, struct task_struct *p) 1133static int select_fallback_rq(int cpu, struct task_struct *p)
1108{ 1134{
1109 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu)); 1135 int nid = cpu_to_node(cpu);
1136 const struct cpumask *nodemask = NULL;
1110 enum { cpuset, possible, fail } state = cpuset; 1137 enum { cpuset, possible, fail } state = cpuset;
1111 int dest_cpu; 1138 int dest_cpu;
1112 1139
1113 /* Look for allowed, online CPU in same node. */ 1140 /*
1114 for_each_cpu(dest_cpu, nodemask) { 1141 * If the node that the cpu is on has been offlined, cpu_to_node()
1115 if (!cpu_online(dest_cpu)) 1142 * will return -1. There is no cpu on the node, and we should
1116 continue; 1143 * select the cpu on the other node.
1117 if (!cpu_active(dest_cpu)) 1144 */
1118 continue; 1145 if (nid != -1) {
1119 if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) 1146 nodemask = cpumask_of_node(nid);
1120 return dest_cpu; 1147
1148 /* Look for allowed, online CPU in same node. */
1149 for_each_cpu(dest_cpu, nodemask) {
1150 if (!cpu_online(dest_cpu))
1151 continue;
1152 if (!cpu_active(dest_cpu))
1153 continue;
1154 if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
1155 return dest_cpu;
1156 }
1121 } 1157 }
1122 1158
1123 for (;;) { 1159 for (;;) {
@@ -1462,8 +1498,10 @@ static void try_to_wake_up_local(struct task_struct *p)
1462{ 1498{
1463 struct rq *rq = task_rq(p); 1499 struct rq *rq = task_rq(p);
1464 1500
1465 BUG_ON(rq != this_rq()); 1501 if (WARN_ON_ONCE(rq != this_rq()) ||
1466 BUG_ON(p == current); 1502 WARN_ON_ONCE(p == current))
1503 return;
1504
1467 lockdep_assert_held(&rq->lock); 1505 lockdep_assert_held(&rq->lock);
1468 1506
1469 if (!raw_spin_trylock(&p->pi_lock)) { 1507 if (!raw_spin_trylock(&p->pi_lock)) {
@@ -1497,7 +1535,8 @@ out:
1497 */ 1535 */
1498int wake_up_process(struct task_struct *p) 1536int wake_up_process(struct task_struct *p)
1499{ 1537{
1500 return try_to_wake_up(p, TASK_ALL, 0); 1538 WARN_ON(task_is_stopped_or_traced(p));
1539 return try_to_wake_up(p, TASK_NORMAL, 0);
1501} 1540}
1502EXPORT_SYMBOL(wake_up_process); 1541EXPORT_SYMBOL(wake_up_process);
1503 1542
@@ -1524,6 +1563,15 @@ static void __sched_fork(struct task_struct *p)
1524 p->se.vruntime = 0; 1563 p->se.vruntime = 0;
1525 INIT_LIST_HEAD(&p->se.group_node); 1564 INIT_LIST_HEAD(&p->se.group_node);
1526 1565
1566/*
1567 * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
1568 * removed when useful for applications beyond shares distribution (e.g.
1569 * load-balance).
1570 */
1571#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
1572 p->se.avg.runnable_avg_period = 0;
1573 p->se.avg.runnable_avg_sum = 0;
1574#endif
1527#ifdef CONFIG_SCHEDSTATS 1575#ifdef CONFIG_SCHEDSTATS
1528 memset(&p->se.statistics, 0, sizeof(p->se.statistics)); 1576 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
1529#endif 1577#endif
@@ -1533,8 +1581,41 @@ static void __sched_fork(struct task_struct *p)
1533#ifdef CONFIG_PREEMPT_NOTIFIERS 1581#ifdef CONFIG_PREEMPT_NOTIFIERS
1534 INIT_HLIST_HEAD(&p->preempt_notifiers); 1582 INIT_HLIST_HEAD(&p->preempt_notifiers);
1535#endif 1583#endif
1584
1585#ifdef CONFIG_NUMA_BALANCING
1586 if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
1587 p->mm->numa_next_scan = jiffies;
1588 p->mm->numa_next_reset = jiffies;
1589 p->mm->numa_scan_seq = 0;
1590 }
1591
1592 p->node_stamp = 0ULL;
1593 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
1594 p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
1595 p->numa_scan_period = sysctl_numa_balancing_scan_delay;
1596 p->numa_work.next = &p->numa_work;
1597#endif /* CONFIG_NUMA_BALANCING */
1536} 1598}
1537 1599
1600#ifdef CONFIG_NUMA_BALANCING
1601#ifdef CONFIG_SCHED_DEBUG
1602void set_numabalancing_state(bool enabled)
1603{
1604 if (enabled)
1605 sched_feat_set("NUMA");
1606 else
1607 sched_feat_set("NO_NUMA");
1608}
1609#else
1610__read_mostly bool numabalancing_enabled;
1611
1612void set_numabalancing_state(bool enabled)
1613{
1614 numabalancing_enabled = enabled;
1615}
1616#endif /* CONFIG_SCHED_DEBUG */
1617#endif /* CONFIG_NUMA_BALANCING */
1618
1538/* 1619/*
1539 * fork()/clone()-time setup: 1620 * fork()/clone()-time setup:
1540 */ 1621 */
@@ -1673,9 +1754,8 @@ EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
1673static void fire_sched_in_preempt_notifiers(struct task_struct *curr) 1754static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
1674{ 1755{
1675 struct preempt_notifier *notifier; 1756 struct preempt_notifier *notifier;
1676 struct hlist_node *node;
1677 1757
1678 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) 1758 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
1679 notifier->ops->sched_in(notifier, raw_smp_processor_id()); 1759 notifier->ops->sched_in(notifier, raw_smp_processor_id());
1680} 1760}
1681 1761
@@ -1684,9 +1764,8 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
1684 struct task_struct *next) 1764 struct task_struct *next)
1685{ 1765{
1686 struct preempt_notifier *notifier; 1766 struct preempt_notifier *notifier;
1687 struct hlist_node *node;
1688 1767
1689 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) 1768 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
1690 notifier->ops->sched_out(notifier, next); 1769 notifier->ops->sched_out(notifier, next);
1691} 1770}
1692 1771
@@ -1886,8 +1965,8 @@ context_switch(struct rq *rq, struct task_struct *prev,
1886 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 1965 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
1887#endif 1966#endif
1888 1967
1968 context_tracking_task_switch(prev, next);
1889 /* Here we just switch the register state and the stack. */ 1969 /* Here we just switch the register state and the stack. */
1890 rcu_switch(prev, next);
1891 switch_to(prev, next, prev); 1970 switch_to(prev, next, prev);
1892 1971
1893 barrier(); 1972 barrier();
@@ -1900,11 +1979,10 @@ context_switch(struct rq *rq, struct task_struct *prev,
1900} 1979}
1901 1980
1902/* 1981/*
1903 * nr_running, nr_uninterruptible and nr_context_switches: 1982 * nr_running and nr_context_switches:
1904 * 1983 *
1905 * externally visible scheduler statistics: current number of runnable 1984 * externally visible scheduler statistics: current number of runnable
1906 * threads, current number of uninterruptible-sleeping threads, total 1985 * threads, total number of context switches performed since bootup.
1907 * number of context switches performed since bootup.
1908 */ 1986 */
1909unsigned long nr_running(void) 1987unsigned long nr_running(void)
1910{ 1988{
@@ -1916,23 +1994,6 @@ unsigned long nr_running(void)
1916 return sum; 1994 return sum;
1917} 1995}
1918 1996
1919unsigned long nr_uninterruptible(void)
1920{
1921 unsigned long i, sum = 0;
1922
1923 for_each_possible_cpu(i)
1924 sum += cpu_rq(i)->nr_uninterruptible;
1925
1926 /*
1927 * Since we read the counters lockless, it might be slightly
1928 * inaccurate. Do not allow it to go below zero though:
1929 */
1930 if (unlikely((long)sum < 0))
1931 sum = 0;
1932
1933 return sum;
1934}
1935
1936unsigned long long nr_context_switches(void) 1997unsigned long long nr_context_switches(void)
1937{ 1998{
1938 int i; 1999 int i;
@@ -2717,7 +2778,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
2717 if (irqs_disabled()) 2778 if (irqs_disabled())
2718 print_irqtrace_events(prev); 2779 print_irqtrace_events(prev);
2719 dump_stack(); 2780 dump_stack();
2720 add_taint(TAINT_WARN); 2781 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
2721} 2782}
2722 2783
2723/* 2784/*
@@ -2911,7 +2972,7 @@ asmlinkage void __sched schedule(void)
2911} 2972}
2912EXPORT_SYMBOL(schedule); 2973EXPORT_SYMBOL(schedule);
2913 2974
2914#ifdef CONFIG_RCU_USER_QS 2975#ifdef CONFIG_CONTEXT_TRACKING
2915asmlinkage void __sched schedule_user(void) 2976asmlinkage void __sched schedule_user(void)
2916{ 2977{
2917 /* 2978 /*
@@ -2920,9 +2981,9 @@ asmlinkage void __sched schedule_user(void)
2920 * we haven't yet exited the RCU idle mode. Do it here manually until 2981 * we haven't yet exited the RCU idle mode. Do it here manually until
2921 * we find a better solution. 2982 * we find a better solution.
2922 */ 2983 */
2923 rcu_user_exit(); 2984 user_exit();
2924 schedule(); 2985 schedule();
2925 rcu_user_enter(); 2986 user_enter();
2926} 2987}
2927#endif 2988#endif
2928 2989
@@ -3027,7 +3088,7 @@ asmlinkage void __sched preempt_schedule_irq(void)
3027 /* Catch callers which need to be fixed */ 3088 /* Catch callers which need to be fixed */
3028 BUG_ON(ti->preempt_count || !irqs_disabled()); 3089 BUG_ON(ti->preempt_count || !irqs_disabled());
3029 3090
3030 rcu_user_exit(); 3091 user_exit();
3031 do { 3092 do {
3032 add_preempt_count(PREEMPT_ACTIVE); 3093 add_preempt_count(PREEMPT_ACTIVE);
3033 local_irq_enable(); 3094 local_irq_enable();
@@ -3199,7 +3260,8 @@ void complete_all(struct completion *x)
3199EXPORT_SYMBOL(complete_all); 3260EXPORT_SYMBOL(complete_all);
3200 3261
3201static inline long __sched 3262static inline long __sched
3202do_wait_for_common(struct completion *x, long timeout, int state) 3263do_wait_for_common(struct completion *x,
3264 long (*action)(long), long timeout, int state)
3203{ 3265{
3204 if (!x->done) { 3266 if (!x->done) {
3205 DECLARE_WAITQUEUE(wait, current); 3267 DECLARE_WAITQUEUE(wait, current);
@@ -3212,7 +3274,7 @@ do_wait_for_common(struct completion *x, long timeout, int state)
3212 } 3274 }
3213 __set_current_state(state); 3275 __set_current_state(state);
3214 spin_unlock_irq(&x->wait.lock); 3276 spin_unlock_irq(&x->wait.lock);
3215 timeout = schedule_timeout(timeout); 3277 timeout = action(timeout);
3216 spin_lock_irq(&x->wait.lock); 3278 spin_lock_irq(&x->wait.lock);
3217 } while (!x->done && timeout); 3279 } while (!x->done && timeout);
3218 __remove_wait_queue(&x->wait, &wait); 3280 __remove_wait_queue(&x->wait, &wait);
@@ -3223,17 +3285,30 @@ do_wait_for_common(struct completion *x, long timeout, int state)
3223 return timeout ?: 1; 3285 return timeout ?: 1;
3224} 3286}
3225 3287
3226static long __sched 3288static inline long __sched
3227wait_for_common(struct completion *x, long timeout, int state) 3289__wait_for_common(struct completion *x,
3290 long (*action)(long), long timeout, int state)
3228{ 3291{
3229 might_sleep(); 3292 might_sleep();
3230 3293
3231 spin_lock_irq(&x->wait.lock); 3294 spin_lock_irq(&x->wait.lock);
3232 timeout = do_wait_for_common(x, timeout, state); 3295 timeout = do_wait_for_common(x, action, timeout, state);
3233 spin_unlock_irq(&x->wait.lock); 3296 spin_unlock_irq(&x->wait.lock);
3234 return timeout; 3297 return timeout;
3235} 3298}
3236 3299
3300static long __sched
3301wait_for_common(struct completion *x, long timeout, int state)
3302{
3303 return __wait_for_common(x, schedule_timeout, timeout, state);
3304}
3305
3306static long __sched
3307wait_for_common_io(struct completion *x, long timeout, int state)
3308{
3309 return __wait_for_common(x, io_schedule_timeout, timeout, state);
3310}
3311
3237/** 3312/**
3238 * wait_for_completion: - waits for completion of a task 3313 * wait_for_completion: - waits for completion of a task
3239 * @x: holds the state of this particular completion 3314 * @x: holds the state of this particular completion
@@ -3270,6 +3345,39 @@ wait_for_completion_timeout(struct completion *x, unsigned long timeout)
3270EXPORT_SYMBOL(wait_for_completion_timeout); 3345EXPORT_SYMBOL(wait_for_completion_timeout);
3271 3346
3272/** 3347/**
3348 * wait_for_completion_io: - waits for completion of a task
3349 * @x: holds the state of this particular completion
3350 *
3351 * This waits to be signaled for completion of a specific task. It is NOT
3352 * interruptible and there is no timeout. The caller is accounted as waiting
3353 * for IO.
3354 */
3355void __sched wait_for_completion_io(struct completion *x)
3356{
3357 wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
3358}
3359EXPORT_SYMBOL(wait_for_completion_io);
3360
3361/**
3362 * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout)
3363 * @x: holds the state of this particular completion
3364 * @timeout: timeout value in jiffies
3365 *
3366 * This waits for either a completion of a specific task to be signaled or for a
3367 * specified timeout to expire. The timeout is in jiffies. It is not
3368 * interruptible. The caller is accounted as waiting for IO.
3369 *
3370 * The return value is 0 if timed out, and positive (at least 1, or number of
3371 * jiffies left till timeout) if completed.
3372 */
3373unsigned long __sched
3374wait_for_completion_io_timeout(struct completion *x, unsigned long timeout)
3375{
3376 return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE);
3377}
3378EXPORT_SYMBOL(wait_for_completion_io_timeout);
3379
3380/**
3273 * wait_for_completion_interruptible: - waits for completion of a task (w/intr) 3381 * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
3274 * @x: holds the state of this particular completion 3382 * @x: holds the state of this particular completion
3275 * 3383 *
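
The completion rework above threads the sleep primitive through as a function pointer, so the new wait_for_completion_io*() variants reuse the same wait loop with io_schedule_timeout() and get accounted as IO wait. A sketch of the call shape only; the stand-in sleep functions just print their name and decrement the timeout so the demo terminates:

#include <stdio.h>

static long plain_sleep(long timeout) { puts("schedule_timeout()");    return timeout - 1; }
static long io_sleep(long timeout)    { puts("io_schedule_timeout()"); return timeout - 1; }

/* One wait body, parameterized by how to sleep: this is the point of the
 * do_wait_for_common()/__wait_for_common() split above. */
static long wait_common(const int *done, long (*action)(long), long timeout)
{
        while (!*done && timeout > 0)
                timeout = action(timeout);
        return timeout;
}

int main(void)
{
        int done = 0;

        wait_common(&done, plain_sleep, 2);     /* wait_for_completion() path    */
        wait_common(&done, io_sleep, 2);        /* wait_for_completion_io() path */
        return 0;
}
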
@@ -4029,8 +4137,14 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
4029 goto out_free_cpus_allowed; 4137 goto out_free_cpus_allowed;
4030 } 4138 }
4031 retval = -EPERM; 4139 retval = -EPERM;
4032 if (!check_same_owner(p) && !ns_capable(task_user_ns(p), CAP_SYS_NICE)) 4140 if (!check_same_owner(p)) {
4033 goto out_unlock; 4141 rcu_read_lock();
4142 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
4143 rcu_read_unlock();
4144 goto out_unlock;
4145 }
4146 rcu_read_unlock();
4147 }
4034 4148
4035 retval = security_task_setscheduler(p); 4149 retval = security_task_setscheduler(p);
4036 if (retval) 4150 if (retval)
@@ -4289,20 +4403,32 @@ EXPORT_SYMBOL(yield);
4289 * It's the caller's job to ensure that the target task struct 4403 * It's the caller's job to ensure that the target task struct
4290 * can't go away on us before we can do any checks. 4404 * can't go away on us before we can do any checks.
4291 * 4405 *
4292 * Returns true if we indeed boosted the target task. 4406 * Returns:
4407 * true (>0) if we indeed boosted the target task.
4408 * false (0) if we failed to boost the target.
4409 * -ESRCH if there's no task to yield to.
4293 */ 4410 */
4294bool __sched yield_to(struct task_struct *p, bool preempt) 4411bool __sched yield_to(struct task_struct *p, bool preempt)
4295{ 4412{
4296 struct task_struct *curr = current; 4413 struct task_struct *curr = current;
4297 struct rq *rq, *p_rq; 4414 struct rq *rq, *p_rq;
4298 unsigned long flags; 4415 unsigned long flags;
4299 bool yielded = 0; 4416 int yielded = 0;
4300 4417
4301 local_irq_save(flags); 4418 local_irq_save(flags);
4302 rq = this_rq(); 4419 rq = this_rq();
4303 4420
4304again: 4421again:
4305 p_rq = task_rq(p); 4422 p_rq = task_rq(p);
4423 /*
4424 * If we're the only runnable task on the rq and target rq also
4425 * has only one task, there's absolutely no point in yielding.
4426 */
4427 if (rq->nr_running == 1 && p_rq->nr_running == 1) {
4428 yielded = -ESRCH;
4429 goto out_irq;
4430 }
4431
4306 double_rq_lock(rq, p_rq); 4432 double_rq_lock(rq, p_rq);
4307 while (task_rq(p) != p_rq) { 4433 while (task_rq(p) != p_rq) {
4308 double_rq_unlock(rq, p_rq); 4434 double_rq_unlock(rq, p_rq);
@@ -4310,13 +4436,13 @@ again:
4310 } 4436 }
4311 4437
4312 if (!curr->sched_class->yield_to_task) 4438 if (!curr->sched_class->yield_to_task)
4313 goto out; 4439 goto out_unlock;
4314 4440
4315 if (curr->sched_class != p->sched_class) 4441 if (curr->sched_class != p->sched_class)
4316 goto out; 4442 goto out_unlock;
4317 4443
4318 if (task_running(p_rq, p) || p->state) 4444 if (task_running(p_rq, p) || p->state)
4319 goto out; 4445 goto out_unlock;
4320 4446
4321 yielded = curr->sched_class->yield_to_task(rq, p, preempt); 4447 yielded = curr->sched_class->yield_to_task(rq, p, preempt);
4322 if (yielded) { 4448 if (yielded) {
@@ -4329,11 +4455,12 @@ again:
4329 resched_task(p_rq->curr); 4455 resched_task(p_rq->curr);
4330 } 4456 }
4331 4457
4332out: 4458out_unlock:
4333 double_rq_unlock(rq, p_rq); 4459 double_rq_unlock(rq, p_rq);
4460out_irq:
4334 local_irq_restore(flags); 4461 local_irq_restore(flags);
4335 4462
4336 if (yielded) 4463 if (yielded > 0)
4337 schedule(); 4464 schedule();
4338 4465
4339 return yielded; 4466 return yielded;
@@ -4474,6 +4601,7 @@ static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
4474void sched_show_task(struct task_struct *p) 4601void sched_show_task(struct task_struct *p)
4475{ 4602{
4476 unsigned long free = 0; 4603 unsigned long free = 0;
4604 int ppid;
4477 unsigned state; 4605 unsigned state;
4478 4606
4479 state = p->state ? __ffs(p->state) + 1 : 0; 4607 state = p->state ? __ffs(p->state) + 1 : 0;
@@ -4493,8 +4621,11 @@ void sched_show_task(struct task_struct *p)
4493#ifdef CONFIG_DEBUG_STACK_USAGE 4621#ifdef CONFIG_DEBUG_STACK_USAGE
4494 free = stack_not_used(p); 4622 free = stack_not_used(p);
4495#endif 4623#endif
4624 rcu_read_lock();
4625 ppid = task_pid_nr(rcu_dereference(p->real_parent));
4626 rcu_read_unlock();
4496 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, 4627 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
4497 task_pid_nr(p), task_pid_nr(rcu_dereference(p->real_parent)), 4628 task_pid_nr(p), ppid,
4498 (unsigned long)task_thread_info(p)->flags); 4629 (unsigned long)task_thread_info(p)->flags);
4499 4630
4500 show_stack(p, NULL); 4631 show_stack(p, NULL);
@@ -4588,6 +4719,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
4588 */ 4719 */
4589 idle->sched_class = &idle_sched_class; 4720 idle->sched_class = &idle_sched_class;
4590 ftrace_graph_init_idle_task(idle, cpu); 4721 ftrace_graph_init_idle_task(idle, cpu);
4722 vtime_init_idle(idle);
4591#if defined(CONFIG_SMP) 4723#if defined(CONFIG_SMP)
4592 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); 4724 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
4593#endif 4725#endif
@@ -4869,7 +5001,7 @@ static void sd_free_ctl_entry(struct ctl_table **tablep)
4869} 5001}
4870 5002
4871static int min_load_idx = 0; 5003static int min_load_idx = 0;
4872static int max_load_idx = CPU_LOAD_IDX_MAX; 5004static int max_load_idx = CPU_LOAD_IDX_MAX-1;
4873 5005
4874static void 5006static void
4875set_table_entry(struct ctl_table *entry, 5007set_table_entry(struct ctl_table *entry,
@@ -7081,7 +7213,6 @@ static void free_sched_group(struct task_group *tg)
7081struct task_group *sched_create_group(struct task_group *parent) 7213struct task_group *sched_create_group(struct task_group *parent)
7082{ 7214{
7083 struct task_group *tg; 7215 struct task_group *tg;
7084 unsigned long flags;
7085 7216
7086 tg = kzalloc(sizeof(*tg), GFP_KERNEL); 7217 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
7087 if (!tg) 7218 if (!tg)
@@ -7093,6 +7224,17 @@ struct task_group *sched_create_group(struct task_group *parent)
7093 if (!alloc_rt_sched_group(tg, parent)) 7224 if (!alloc_rt_sched_group(tg, parent))
7094 goto err; 7225 goto err;
7095 7226
7227 return tg;
7228
7229err:
7230 free_sched_group(tg);
7231 return ERR_PTR(-ENOMEM);
7232}
7233
7234void sched_online_group(struct task_group *tg, struct task_group *parent)
7235{
7236 unsigned long flags;
7237
7096 spin_lock_irqsave(&task_group_lock, flags); 7238 spin_lock_irqsave(&task_group_lock, flags);
7097 list_add_rcu(&tg->list, &task_groups); 7239 list_add_rcu(&tg->list, &task_groups);
7098 7240
@@ -7102,12 +7244,6 @@ struct task_group *sched_create_group(struct task_group *parent)
7102 INIT_LIST_HEAD(&tg->children); 7244 INIT_LIST_HEAD(&tg->children);
7103 list_add_rcu(&tg->siblings, &parent->children); 7245 list_add_rcu(&tg->siblings, &parent->children);
7104 spin_unlock_irqrestore(&task_group_lock, flags); 7246 spin_unlock_irqrestore(&task_group_lock, flags);
7105
7106 return tg;
7107
7108err:
7109 free_sched_group(tg);
7110 return ERR_PTR(-ENOMEM);
7111} 7247}
7112 7248
7113/* rcu callback to free various structures associated with a task group */ 7249/* rcu callback to free various structures associated with a task group */
@@ -7120,6 +7256,12 @@ static void free_sched_group_rcu(struct rcu_head *rhp)
7120/* Destroy runqueue etc associated with a task group */ 7256/* Destroy runqueue etc associated with a task group */
7121void sched_destroy_group(struct task_group *tg) 7257void sched_destroy_group(struct task_group *tg)
7122{ 7258{
7259 /* wait for possible concurrent references to cfs_rqs complete */
7260 call_rcu(&tg->rcu, free_sched_group_rcu);
7261}
7262
7263void sched_offline_group(struct task_group *tg)
7264{
7123 unsigned long flags; 7265 unsigned long flags;
7124 int i; 7266 int i;
7125 7267
@@ -7131,9 +7273,6 @@ void sched_destroy_group(struct task_group *tg)
7131 list_del_rcu(&tg->list); 7273 list_del_rcu(&tg->list);
7132 list_del_rcu(&tg->siblings); 7274 list_del_rcu(&tg->siblings);
7133 spin_unlock_irqrestore(&task_group_lock, flags); 7275 spin_unlock_irqrestore(&task_group_lock, flags);
7134
7135 /* wait for possible concurrent references to cfs_rqs complete */
7136 call_rcu(&tg->rcu, free_sched_group_rcu);
7137} 7276}
7138 7277
7139/* change task's runqueue when it moves between groups. 7278/* change task's runqueue when it moves between groups.
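
The hunks above split the task-group lifecycle into four steps: sched_create_group() only allocates, sched_online_group() publishes the group on the global lists, sched_offline_group() unlinks it, and sched_destroy_group() defers the free through RCU. That lets the cgroup css_alloc/online/offline/free callbacks (and autogroup) drive each step at the right moment. A userspace outline of the same lifecycle, with a plain list and free() standing in for task_groups and call_rcu():

#include <stdio.h>
#include <stdlib.h>

struct group { const char *name; struct group *next; };

static struct group *group_list;        /* stands in for the task_groups list */

static struct group *group_create(const char *name)     /* sched_create_group()  */
{
        struct group *g = calloc(1, sizeof(*g));

        if (g)
                g->name = name;
        return g;
}

static void group_online(struct group *g)                /* sched_online_group()  */
{
        g->next = group_list;
        group_list = g;
}

static void group_offline(struct group *g)               /* sched_offline_group() */
{
        struct group **p;

        for (p = &group_list; *p && *p != g; p = &(*p)->next)
                ;
        if (*p)
                *p = g->next;
}

static void group_destroy(struct group *g)               /* sched_destroy_group() */
{
        free(g);        /* the kernel defers this through call_rcu() */
}

int main(void)
{
        struct group *g = group_create("demo");

        if (!g)
                return 1;
        group_online(g);
        group_offline(g);
        group_destroy(g);
        puts("lifecycle complete");
        return 0;
}
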
@@ -7429,6 +7568,25 @@ static int sched_rt_global_constraints(void)
7429} 7568}
7430#endif /* CONFIG_RT_GROUP_SCHED */ 7569#endif /* CONFIG_RT_GROUP_SCHED */
7431 7570
7571int sched_rr_handler(struct ctl_table *table, int write,
7572 void __user *buffer, size_t *lenp,
7573 loff_t *ppos)
7574{
7575 int ret;
7576 static DEFINE_MUTEX(mutex);
7577
7578 mutex_lock(&mutex);
7579 ret = proc_dointvec(table, write, buffer, lenp, ppos);
7580 /* make sure that internally we keep jiffies */
7581 /* also, writing zero resets timeslice to default */
7582 if (!ret && write) {
7583 sched_rr_timeslice = sched_rr_timeslice <= 0 ?
7584 RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice);
7585 }
7586 mutex_unlock(&mutex);
7587 return ret;
7588}
7589
7432int sched_rt_handler(struct ctl_table *table, int write, 7590int sched_rt_handler(struct ctl_table *table, int write,
7433 void __user *buffer, size_t *lenp, 7591 void __user *buffer, size_t *lenp,
7434 loff_t *ppos) 7592 loff_t *ppos)
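
sched_rr_handler() above takes the new RR timeslice sysctl in milliseconds, converts it to jiffies for internal use, and treats a write of zero or a negative value as "restore the default". A small sketch of just that fixup, assuming HZ=250 and a 100 ms default purely for illustration:

#include <stdio.h>

#define HZ 250                  /* assumed tick rate for the sketch */
#define DEFAULT_RR_MS 100       /* stands in for RR_TIMESLICE       */

static int msecs_to_jiffies(int ms)
{
        return ms * HZ / 1000;
}

/* Mirror of the post-write fixup: <= 0 means "back to the default",
 * and whatever is kept internally is always in jiffies. */
static int rr_timeslice_fixup(int written_ms)
{
        return written_ms <= 0 ? msecs_to_jiffies(DEFAULT_RR_MS)
                               : msecs_to_jiffies(written_ms);
}

int main(void)
{
        printf("write 30 ms -> %d jiffies\n", rr_timeslice_fixup(30));
        printf("write 0     -> %d jiffies (default restored)\n",
               rr_timeslice_fixup(0));
        return 0;
}
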
@@ -7468,7 +7626,7 @@ static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
7468 struct task_group, css); 7626 struct task_group, css);
7469} 7627}
7470 7628
7471static struct cgroup_subsys_state *cpu_cgroup_create(struct cgroup *cgrp) 7629static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp)
7472{ 7630{
7473 struct task_group *tg, *parent; 7631 struct task_group *tg, *parent;
7474 7632
@@ -7485,13 +7643,33 @@ static struct cgroup_subsys_state *cpu_cgroup_create(struct cgroup *cgrp)
7485 return &tg->css; 7643 return &tg->css;
7486} 7644}
7487 7645
7488static void cpu_cgroup_destroy(struct cgroup *cgrp) 7646static int cpu_cgroup_css_online(struct cgroup *cgrp)
7647{
7648 struct task_group *tg = cgroup_tg(cgrp);
7649 struct task_group *parent;
7650
7651 if (!cgrp->parent)
7652 return 0;
7653
7654 parent = cgroup_tg(cgrp->parent);
7655 sched_online_group(tg, parent);
7656 return 0;
7657}
7658
7659static void cpu_cgroup_css_free(struct cgroup *cgrp)
7489{ 7660{
7490 struct task_group *tg = cgroup_tg(cgrp); 7661 struct task_group *tg = cgroup_tg(cgrp);
7491 7662
7492 sched_destroy_group(tg); 7663 sched_destroy_group(tg);
7493} 7664}
7494 7665
7666static void cpu_cgroup_css_offline(struct cgroup *cgrp)
7667{
7668 struct task_group *tg = cgroup_tg(cgrp);
7669
7670 sched_offline_group(tg);
7671}
7672
7495static int cpu_cgroup_can_attach(struct cgroup *cgrp, 7673static int cpu_cgroup_can_attach(struct cgroup *cgrp,
7496 struct cgroup_taskset *tset) 7674 struct cgroup_taskset *tset)
7497{ 7675{
@@ -7845,8 +8023,10 @@ static struct cftype cpu_files[] = {
7845 8023
7846struct cgroup_subsys cpu_cgroup_subsys = { 8024struct cgroup_subsys cpu_cgroup_subsys = {
7847 .name = "cpu", 8025 .name = "cpu",
7848 .create = cpu_cgroup_create, 8026 .css_alloc = cpu_cgroup_css_alloc,
7849 .destroy = cpu_cgroup_destroy, 8027 .css_free = cpu_cgroup_css_free,
8028 .css_online = cpu_cgroup_css_online,
8029 .css_offline = cpu_cgroup_css_offline,
7850 .can_attach = cpu_cgroup_can_attach, 8030 .can_attach = cpu_cgroup_can_attach,
7851 .attach = cpu_cgroup_attach, 8031 .attach = cpu_cgroup_attach,
7852 .exit = cpu_cgroup_exit, 8032 .exit = cpu_cgroup_exit,
@@ -7869,7 +8049,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {
7869struct cpuacct root_cpuacct; 8049struct cpuacct root_cpuacct;
7870 8050
7871/* create a new cpu accounting group */ 8051/* create a new cpu accounting group */
7872static struct cgroup_subsys_state *cpuacct_create(struct cgroup *cgrp) 8052static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp)
7873{ 8053{
7874 struct cpuacct *ca; 8054 struct cpuacct *ca;
7875 8055
@@ -7899,7 +8079,7 @@ out:
7899} 8079}
7900 8080
7901/* destroy an existing cpu accounting group */ 8081/* destroy an existing cpu accounting group */
7902static void cpuacct_destroy(struct cgroup *cgrp) 8082static void cpuacct_css_free(struct cgroup *cgrp)
7903{ 8083{
7904 struct cpuacct *ca = cgroup_ca(cgrp); 8084 struct cpuacct *ca = cgroup_ca(cgrp);
7905 8085
@@ -8070,9 +8250,15 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime)
8070 8250
8071struct cgroup_subsys cpuacct_subsys = { 8251struct cgroup_subsys cpuacct_subsys = {
8072 .name = "cpuacct", 8252 .name = "cpuacct",
8073 .create = cpuacct_create, 8253 .css_alloc = cpuacct_css_alloc,
8074 .destroy = cpuacct_destroy, 8254 .css_free = cpuacct_css_free,
8075 .subsys_id = cpuacct_subsys_id, 8255 .subsys_id = cpuacct_subsys_id,
8076 .base_cftypes = files, 8256 .base_cftypes = files,
8077}; 8257};
8078#endif /* CONFIG_CGROUP_CPUACCT */ 8258#endif /* CONFIG_CGROUP_CPUACCT */
8259
8260void dump_cpu_task(int cpu)
8261{
8262 pr_info("Task dump for CPU %d:\n", cpu);
8263 sched_show_task(cpu_curr(cpu));
8264}
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 23aa789c53ee..1095e878a46f 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -28,6 +28,8 @@
28 */ 28 */
29 29
30#include <linux/gfp.h> 30#include <linux/gfp.h>
31#include <linux/sched.h>
32#include <linux/sched/rt.h>
31#include "cpupri.h" 33#include "cpupri.h"
32 34
33/* Convert between a 140 based task->prio, and our 102 based cpupri */ 35/* Convert between a 140 based task->prio, and our 102 based cpupri */
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 81b763ba58a6..e93cca92f38b 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -3,6 +3,7 @@
3#include <linux/tsacct_kern.h> 3#include <linux/tsacct_kern.h>
4#include <linux/kernel_stat.h> 4#include <linux/kernel_stat.h>
5#include <linux/static_key.h> 5#include <linux/static_key.h>
6#include <linux/context_tracking.h>
6#include "sched.h" 7#include "sched.h"
7 8
8 9
@@ -43,7 +44,7 @@ DEFINE_PER_CPU(seqcount_t, irq_time_seq);
43 * Called before incrementing preempt_count on {soft,}irq_enter 44 * Called before incrementing preempt_count on {soft,}irq_enter
44 * and before decrementing preempt_count on {soft,}irq_exit. 45 * and before decrementing preempt_count on {soft,}irq_exit.
45 */ 46 */
46void vtime_account(struct task_struct *curr) 47void irqtime_account_irq(struct task_struct *curr)
47{ 48{
48 unsigned long flags; 49 unsigned long flags;
49 s64 delta; 50 s64 delta;
@@ -73,7 +74,7 @@ void vtime_account(struct task_struct *curr)
73 irq_time_write_end(); 74 irq_time_write_end();
74 local_irq_restore(flags); 75 local_irq_restore(flags);
75} 76}
76EXPORT_SYMBOL_GPL(vtime_account); 77EXPORT_SYMBOL_GPL(irqtime_account_irq);
77 78
78static int irqtime_account_hi_update(void) 79static int irqtime_account_hi_update(void)
79{ 80{
@@ -163,7 +164,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
163 task_group_account_field(p, index, (__force u64) cputime); 164 task_group_account_field(p, index, (__force u64) cputime);
164 165
165 /* Account for user time used */ 166 /* Account for user time used */
166 acct_update_integrals(p); 167 acct_account_cputime(p);
167} 168}
168 169
169/* 170/*
@@ -213,7 +214,7 @@ void __account_system_time(struct task_struct *p, cputime_t cputime,
213 task_group_account_field(p, index, (__force u64) cputime); 214 task_group_account_field(p, index, (__force u64) cputime);
214 215
215 /* Account for system time used */ 216 /* Account for system time used */
216 acct_update_integrals(p); 217 acct_account_cputime(p);
217} 218}
218 219
219/* 220/*
@@ -288,7 +289,35 @@ static __always_inline bool steal_account_process_tick(void)
288 return false; 289 return false;
289} 290}
290 291
291#ifndef CONFIG_VIRT_CPU_ACCOUNTING 292/*
293 * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
294 * tasks (sum on group iteration) belonging to @tsk's group.
295 */
296void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
297{
298 struct signal_struct *sig = tsk->signal;
299 cputime_t utime, stime;
300 struct task_struct *t;
301
302 times->utime = sig->utime;
303 times->stime = sig->stime;
304 times->sum_exec_runtime = sig->sum_sched_runtime;
305
306 rcu_read_lock();
307 /* make sure we can trust tsk->thread_group list */
308 if (!likely(pid_alive(tsk)))
309 goto out;
310
311 t = tsk;
312 do {
313 task_cputime(t, &utime, &stime);
314 times->utime += utime;
315 times->stime += stime;
316 times->sum_exec_runtime += task_sched_runtime(t);
317 } while_each_thread(tsk, t);
318out:
319 rcu_read_unlock();
320}
292 321
293#ifdef CONFIG_IRQ_TIME_ACCOUNTING 322#ifdef CONFIG_IRQ_TIME_ACCOUNTING
294/* 323/*
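
thread_group_cputime(), moved here in the hunk above, starts from the totals already folded into the shared signal struct by exited threads and then adds each live thread's current counters. With the RCU protection and the pid_alive() check stripped away, the accounting itself is just a sum (userspace sketch with illustrative types):

#include <stdio.h>
#include <stdint.h>

struct thread  { uint64_t utime, stime; };
struct process {
        uint64_t dead_utime, dead_stime;        /* sig->utime / sig->stime */
        struct thread *threads;                 /* live threads            */
        int nr_threads;
};

static void process_cputime(const struct process *p, uint64_t *ut, uint64_t *st)
{
        *ut = p->dead_utime;
        *st = p->dead_stime;
        for (int i = 0; i < p->nr_threads; i++) {
                *ut += p->threads[i].utime;
                *st += p->threads[i].stime;
        }
}

int main(void)
{
        struct thread t[2] = { { 10, 4 }, { 7, 1 } };
        struct process p = { .dead_utime = 100, .dead_stime = 30,
                             .threads = t, .nr_threads = 2 };
        uint64_t ut, st;

        process_cputime(&p, &ut, &st);
        printf("utime=%llu stime=%llu\n",
               (unsigned long long)ut, (unsigned long long)st);
        return 0;
}
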
@@ -354,11 +383,12 @@ static void irqtime_account_idle_ticks(int ticks)
354 irqtime_account_process_tick(current, 0, rq); 383 irqtime_account_process_tick(current, 0, rq);
355} 384}
356#else /* CONFIG_IRQ_TIME_ACCOUNTING */ 385#else /* CONFIG_IRQ_TIME_ACCOUNTING */
357static void irqtime_account_idle_ticks(int ticks) {} 386static inline void irqtime_account_idle_ticks(int ticks) {}
358static void irqtime_account_process_tick(struct task_struct *p, int user_tick, 387static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
359 struct rq *rq) {} 388 struct rq *rq) {}
360#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ 389#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
361 390
391#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
362/* 392/*
363 * Account a single tick of cpu time. 393 * Account a single tick of cpu time.
364 * @p: the process that the cpu time gets accounted to 394 * @p: the process that the cpu time gets accounted to
@@ -369,6 +399,9 @@ void account_process_tick(struct task_struct *p, int user_tick)
369 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); 399 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
370 struct rq *rq = this_rq(); 400 struct rq *rq = this_rq();
371 401
402 if (vtime_accounting_enabled())
403 return;
404
372 if (sched_clock_irqtime) { 405 if (sched_clock_irqtime) {
373 irqtime_account_process_tick(p, user_tick, rq); 406 irqtime_account_process_tick(p, user_tick, rq);
374 return; 407 return;
@@ -410,20 +443,19 @@ void account_idle_ticks(unsigned long ticks)
410 443
411 account_idle_time(jiffies_to_cputime(ticks)); 444 account_idle_time(jiffies_to_cputime(ticks));
412} 445}
413 446#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
414#endif
415 447
416/* 448/*
417 * Use precise platform statistics if available: 449 * Use precise platform statistics if available:
418 */ 450 */
419#ifdef CONFIG_VIRT_CPU_ACCOUNTING 451#ifdef CONFIG_VIRT_CPU_ACCOUNTING
420void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 452void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
421{ 453{
422 *ut = p->utime; 454 *ut = p->utime;
423 *st = p->stime; 455 *st = p->stime;
424} 456}
425 457
426void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 458void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
427{ 459{
428 struct task_cputime cputime; 460 struct task_cputime cputime;
429 461
@@ -433,6 +465,24 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
433 *st = cputime.stime; 465 *st = cputime.stime;
434} 466}
435 467
468#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
469void vtime_task_switch(struct task_struct *prev)
470{
471 if (!vtime_accounting_enabled())
472 return;
473
474 if (is_idle_task(prev))
475 vtime_account_idle(prev);
476 else
477 vtime_account_system(prev);
478
479#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
480 vtime_account_user(prev);
481#endif
482 arch_vtime_task_switch(prev);
483}
484#endif
485
436/* 486/*
437 * Archs that account the whole time spent in the idle task 487 * Archs that account the whole time spent in the idle task
438 * (outside irq) as idle time can rely on this and just implement 488 * (outside irq) as idle time can rely on this and just implement
@@ -442,33 +492,40 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
442 * vtime_account(). 492 * vtime_account().
443 */ 493 */
444#ifndef __ARCH_HAS_VTIME_ACCOUNT 494#ifndef __ARCH_HAS_VTIME_ACCOUNT
445void vtime_account(struct task_struct *tsk) 495void vtime_account_irq_enter(struct task_struct *tsk)
446{ 496{
447 unsigned long flags; 497 if (!vtime_accounting_enabled())
448 498 return;
449 local_irq_save(flags);
450
451 if (in_interrupt() || !is_idle_task(tsk))
452 vtime_account_system(tsk);
453 else
454 vtime_account_idle(tsk);
455 499
456 local_irq_restore(flags); 500 if (!in_interrupt()) {
501 /*
502 * If we interrupted user, context_tracking_in_user()
503 * is 1 because the context tracking don't hook
504 * on irq entry/exit. This way we know if
505 * we need to flush user time on kernel entry.
506 */
507 if (context_tracking_in_user()) {
508 vtime_account_user(tsk);
509 return;
510 }
511
512 if (is_idle_task(tsk)) {
513 vtime_account_idle(tsk);
514 return;
515 }
516 }
517 vtime_account_system(tsk);
457} 518}
458EXPORT_SYMBOL_GPL(vtime_account); 519EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
459#endif /* __ARCH_HAS_VTIME_ACCOUNT */ 520#endif /* __ARCH_HAS_VTIME_ACCOUNT */
460 521
461#else 522#else /* !CONFIG_VIRT_CPU_ACCOUNTING */
462 523
463#ifndef nsecs_to_cputime 524static cputime_t scale_stime(cputime_t stime, cputime_t rtime, cputime_t total)
464# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
465#endif
466
467static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total)
468{ 525{
469 u64 temp = (__force u64) rtime; 526 u64 temp = (__force u64) rtime;
470 527
471 temp *= (__force u64) utime; 528 temp *= (__force u64) stime;
472 529
473 if (sizeof(cputime_t) == 4) 530 if (sizeof(cputime_t) == 4)
474 temp = div_u64(temp, (__force u32) total); 531 temp = div_u64(temp, (__force u32) total);
@@ -478,53 +535,283 @@ static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total)
478 return (__force cputime_t) temp; 535 return (__force cputime_t) temp;
479} 536}
480 537
481void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 538/*
539 * Adjust tick based cputime random precision against scheduler
540 * runtime accounting.
541 */
542static void cputime_adjust(struct task_cputime *curr,
543 struct cputime *prev,
544 cputime_t *ut, cputime_t *st)
482{ 545{
483 cputime_t rtime, utime = p->utime, total = utime + p->stime; 546 cputime_t rtime, stime, total;
547
548 stime = curr->stime;
549 total = stime + curr->utime;
484 550
485 /* 551 /*
486 * Use CFS's precise accounting: 552 * Tick based cputime accounting depend on random scheduling
553 * timeslices of a task to be interrupted or not by the timer.
554 * Depending on these circumstances, the number of these interrupts
555 * may be over or under-optimistic, matching the real user and system
556 * cputime with a variable precision.
557 *
558 * Fix this by scaling these tick based values against the total
559 * runtime accounted by the CFS scheduler.
487 */ 560 */
488 rtime = nsecs_to_cputime(p->se.sum_exec_runtime); 561 rtime = nsecs_to_cputime(curr->sum_exec_runtime);
489 562
490 if (total) 563 if (total)
491 utime = scale_utime(utime, rtime, total); 564 stime = scale_stime(stime, rtime, total);
492 else 565 else
493 utime = rtime; 566 stime = rtime;
494 567
495 /* 568 /*
496 * Compare with previous values, to keep monotonicity: 569 * If the tick based count grows faster than the scheduler one,
570 * the result of the scaling may go backward.
571 * Let's enforce monotonicity.
497 */ 572 */
498 p->prev_utime = max(p->prev_utime, utime); 573 prev->stime = max(prev->stime, stime);
499 p->prev_stime = max(p->prev_stime, rtime - p->prev_utime); 574 prev->utime = max(prev->utime, rtime - prev->stime);
575
576 *ut = prev->utime;
577 *st = prev->stime;
578}
500 579
501 *ut = p->prev_utime; 580void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
502 *st = p->prev_stime; 581{
582 struct task_cputime cputime = {
583 .sum_exec_runtime = p->se.sum_exec_runtime,
584 };
585
586 task_cputime(p, &cputime.utime, &cputime.stime);
587 cputime_adjust(&cputime, &p->prev_cputime, ut, st);
503} 588}
504 589
505/* 590/*
506 * Must be called with siglock held. 591 * Must be called with siglock held.
507 */ 592 */
508void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 593void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
509{ 594{
510 struct signal_struct *sig = p->signal;
511 struct task_cputime cputime; 595 struct task_cputime cputime;
512 cputime_t rtime, utime, total;
513 596
514 thread_group_cputime(p, &cputime); 597 thread_group_cputime(p, &cputime);
598 cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
599}
600#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */
515 601
516 total = cputime.utime + cputime.stime; 602#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
517 rtime = nsecs_to_cputime(cputime.sum_exec_runtime); 603static unsigned long long vtime_delta(struct task_struct *tsk)
604{
605 unsigned long long clock;
518 606
519 if (total) 607 clock = local_clock();
520 utime = scale_utime(cputime.utime, rtime, total); 608 if (clock < tsk->vtime_snap)
521 else 609 return 0;
522 utime = rtime;
523 610
524 sig->prev_utime = max(sig->prev_utime, utime); 611 return clock - tsk->vtime_snap;
525 sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime); 612}
526 613
527 *ut = sig->prev_utime; 614static cputime_t get_vtime_delta(struct task_struct *tsk)
528 *st = sig->prev_stime; 615{
616 unsigned long long delta = vtime_delta(tsk);
617
618 WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_SLEEPING);
619 tsk->vtime_snap += delta;
620
621 /* CHECKME: always safe to convert nsecs to cputime? */
622 return nsecs_to_cputime(delta);
529} 623}
530#endif 624
625static void __vtime_account_system(struct task_struct *tsk)
626{
627 cputime_t delta_cpu = get_vtime_delta(tsk);
628
629 account_system_time(tsk, irq_count(), delta_cpu, cputime_to_scaled(delta_cpu));
630}
631
632void vtime_account_system(struct task_struct *tsk)
633{
634 if (!vtime_accounting_enabled())
635 return;
636
637 write_seqlock(&tsk->vtime_seqlock);
638 __vtime_account_system(tsk);
639 write_sequnlock(&tsk->vtime_seqlock);
640}
641
642void vtime_account_irq_exit(struct task_struct *tsk)
643{
644 if (!vtime_accounting_enabled())
645 return;
646
647 write_seqlock(&tsk->vtime_seqlock);
648 if (context_tracking_in_user())
649 tsk->vtime_snap_whence = VTIME_USER;
650 __vtime_account_system(tsk);
651 write_sequnlock(&tsk->vtime_seqlock);
652}
653
654void vtime_account_user(struct task_struct *tsk)
655{
656 cputime_t delta_cpu;
657
658 if (!vtime_accounting_enabled())
659 return;
660
661 delta_cpu = get_vtime_delta(tsk);
662
663 write_seqlock(&tsk->vtime_seqlock);
664 tsk->vtime_snap_whence = VTIME_SYS;
665 account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
666 write_sequnlock(&tsk->vtime_seqlock);
667}
668
669void vtime_user_enter(struct task_struct *tsk)
670{
671 if (!vtime_accounting_enabled())
672 return;
673
674 write_seqlock(&tsk->vtime_seqlock);
675 tsk->vtime_snap_whence = VTIME_USER;
676 __vtime_account_system(tsk);
677 write_sequnlock(&tsk->vtime_seqlock);
678}
679
680void vtime_guest_enter(struct task_struct *tsk)
681{
682 write_seqlock(&tsk->vtime_seqlock);
683 __vtime_account_system(tsk);
684 current->flags |= PF_VCPU;
685 write_sequnlock(&tsk->vtime_seqlock);
686}
687
688void vtime_guest_exit(struct task_struct *tsk)
689{
690 write_seqlock(&tsk->vtime_seqlock);
691 __vtime_account_system(tsk);
692 current->flags &= ~PF_VCPU;
693 write_sequnlock(&tsk->vtime_seqlock);
694}
695
696void vtime_account_idle(struct task_struct *tsk)
697{
698 cputime_t delta_cpu = get_vtime_delta(tsk);
699
700 account_idle_time(delta_cpu);
701}
702
703bool vtime_accounting_enabled(void)
704{
705 return context_tracking_active();
706}
707
708void arch_vtime_task_switch(struct task_struct *prev)
709{
710 write_seqlock(&prev->vtime_seqlock);
711 prev->vtime_snap_whence = VTIME_SLEEPING;
712 write_sequnlock(&prev->vtime_seqlock);
713
714 write_seqlock(&current->vtime_seqlock);
715 current->vtime_snap_whence = VTIME_SYS;
716 current->vtime_snap = sched_clock();
717 write_sequnlock(&current->vtime_seqlock);
718}
719
720void vtime_init_idle(struct task_struct *t)
721{
722 unsigned long flags;
723
724 write_seqlock_irqsave(&t->vtime_seqlock, flags);
725 t->vtime_snap_whence = VTIME_SYS;
726 t->vtime_snap = sched_clock();
727 write_sequnlock_irqrestore(&t->vtime_seqlock, flags);
728}
729
730cputime_t task_gtime(struct task_struct *t)
731{
732 unsigned int seq;
733 cputime_t gtime;
734
735 do {
736 seq = read_seqbegin(&t->vtime_seqlock);
737
738 gtime = t->gtime;
739 if (t->flags & PF_VCPU)
740 gtime += vtime_delta(t);
741
742 } while (read_seqretry(&t->vtime_seqlock, seq));
743
744 return gtime;
745}
746
747/*
748 * Fetch cputime raw values from fields of task_struct and
749 * add up the pending nohz execution time since the last
750 * cputime snapshot.
751 */
752static void
753fetch_task_cputime(struct task_struct *t,
754 cputime_t *u_dst, cputime_t *s_dst,
755 cputime_t *u_src, cputime_t *s_src,
756 cputime_t *udelta, cputime_t *sdelta)
757{
758 unsigned int seq;
759 unsigned long long delta;
760
761 do {
762 *udelta = 0;
763 *sdelta = 0;
764
765 seq = read_seqbegin(&t->vtime_seqlock);
766
767 if (u_dst)
768 *u_dst = *u_src;
769 if (s_dst)
770 *s_dst = *s_src;
771
772 /* Task is sleeping, nothing to add */
773 if (t->vtime_snap_whence == VTIME_SLEEPING ||
774 is_idle_task(t))
775 continue;
776
777 delta = vtime_delta(t);
778
779 /*
780 * Task runs either in user or kernel space, add pending nohz time to
781 * the right place.
782 */
783 if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU) {
784 *udelta = delta;
785 } else {
786 if (t->vtime_snap_whence == VTIME_SYS)
787 *sdelta = delta;
788 }
789 } while (read_seqretry(&t->vtime_seqlock, seq));
790}
791
792
793void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime)
794{
795 cputime_t udelta, sdelta;
796
797 fetch_task_cputime(t, utime, stime, &t->utime,
798 &t->stime, &udelta, &sdelta);
799 if (utime)
800 *utime += udelta;
801 if (stime)
802 *stime += sdelta;
803}
804
805void task_cputime_scaled(struct task_struct *t,
806 cputime_t *utimescaled, cputime_t *stimescaled)
807{
808 cputime_t udelta, sdelta;
809
810 fetch_task_cputime(t, utimescaled, stimescaled,
811 &t->utimescaled, &t->stimescaled, &udelta, &sdelta);
812 if (utimescaled)
813 *utimescaled += cputime_to_scaled(udelta);
814 if (stimescaled)
815 *stimescaled += cputime_to_scaled(sdelta);
816}
817#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
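
The cputime_adjust() path introduced above scales the tick-sampled system time against the precise CFS runtime and then clamps both components against the previously reported values, so readers never see utime or stime go backwards. A worked userspace example of that arithmetic (it assumes rtime never falls below the stime reported last time, as the kernel's max() logic does):

#include <stdio.h>
#include <stdint.h>

struct prev_cputime { uint64_t utime, stime; };

/* Scale stime so utime + stime == rtime, then enforce monotonicity the
 * same way cputime_adjust() does with its two max() updates. */
static void cputime_adjust_sketch(uint64_t utime_ticks, uint64_t stime_ticks,
                                  uint64_t rtime, struct prev_cputime *prev,
                                  uint64_t *ut, uint64_t *st)
{
        uint64_t total = utime_ticks + stime_ticks;
        uint64_t stime = total ? stime_ticks * rtime / total : rtime;

        if (stime > prev->stime)
                prev->stime = stime;
        if (rtime - prev->stime > prev->utime)
                prev->utime = rtime - prev->stime;

        *ut = prev->utime;
        *st = prev->stime;
}

int main(void)
{
        struct prev_cputime prev = { 0, 0 };
        uint64_t ut, st;

        /* 3 user ticks and 1 system tick were sampled, but CFS says 1000
         * units actually ran: stime becomes 1000 * 1/4 = 250, utime 750. */
        cputime_adjust_sketch(3, 1, 1000, &prev, &ut, &st);
        printf("utime=%llu stime=%llu\n",
               (unsigned long long)ut, (unsigned long long)st);
        return 0;
}
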
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 6f79596e0ea9..75024a673520 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -61,14 +61,20 @@ static unsigned long nsec_low(unsigned long long nsec)
61static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg) 61static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg)
62{ 62{
63 struct sched_entity *se = tg->se[cpu]; 63 struct sched_entity *se = tg->se[cpu];
64 if (!se)
65 return;
66 64
67#define P(F) \ 65#define P(F) \
68 SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) 66 SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F)
69#define PN(F) \ 67#define PN(F) \
70 SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) 68 SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
71 69
70 if (!se) {
71 struct sched_avg *avg = &cpu_rq(cpu)->avg;
72 P(avg->runnable_avg_sum);
73 P(avg->runnable_avg_period);
74 return;
75 }
76
77
72 PN(se->exec_start); 78 PN(se->exec_start);
73 PN(se->vruntime); 79 PN(se->vruntime);
74 PN(se->sum_exec_runtime); 80 PN(se->sum_exec_runtime);
@@ -85,6 +91,12 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
85 P(se->statistics.wait_count); 91 P(se->statistics.wait_count);
86#endif 92#endif
87 P(se->load.weight); 93 P(se->load.weight);
94#ifdef CONFIG_SMP
95 P(se->avg.runnable_avg_sum);
96 P(se->avg.runnable_avg_period);
97 P(se->avg.load_avg_contrib);
98 P(se->avg.decay_count);
99#endif
88#undef PN 100#undef PN
89#undef P 101#undef P
90} 102}
@@ -98,13 +110,6 @@ static char *task_group_path(struct task_group *tg)
98 if (autogroup_path(tg, group_path, PATH_MAX)) 110 if (autogroup_path(tg, group_path, PATH_MAX))
99 return group_path; 111 return group_path;
100 112
101 /*
102 * May be NULL if the underlying cgroup isn't fully-created yet
103 */
104 if (!tg->css.cgroup) {
105 group_path[0] = '\0';
106 return group_path;
107 }
108 cgroup_path(tg->css.cgroup, group_path, PATH_MAX); 113 cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
109 return group_path; 114 return group_path;
110} 115}
@@ -206,14 +211,18 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
206 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); 211 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
207#ifdef CONFIG_FAIR_GROUP_SCHED 212#ifdef CONFIG_FAIR_GROUP_SCHED
208#ifdef CONFIG_SMP 213#ifdef CONFIG_SMP
209 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_avg", 214 SEQ_printf(m, " .%-30s: %lld\n", "runnable_load_avg",
210 SPLIT_NS(cfs_rq->load_avg)); 215 cfs_rq->runnable_load_avg);
211 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_period", 216 SEQ_printf(m, " .%-30s: %lld\n", "blocked_load_avg",
212 SPLIT_NS(cfs_rq->load_period)); 217 cfs_rq->blocked_load_avg);
213 SEQ_printf(m, " .%-30s: %ld\n", "load_contrib", 218 SEQ_printf(m, " .%-30s: %lld\n", "tg_load_avg",
214 cfs_rq->load_contribution); 219 (unsigned long long)atomic64_read(&cfs_rq->tg->load_avg));
215 SEQ_printf(m, " .%-30s: %d\n", "load_tg", 220 SEQ_printf(m, " .%-30s: %lld\n", "tg_load_contrib",
216 atomic_read(&cfs_rq->tg->load_weight)); 221 cfs_rq->tg_load_contrib);
222 SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib",
223 cfs_rq->tg_runnable_contrib);
224 SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg",
225 atomic_read(&cfs_rq->tg->runnable_avg));
217#endif 226#endif
218 227
219 print_cfs_group_stats(m, cpu, cfs_rq->tg); 228 print_cfs_group_stats(m, cpu, cfs_rq->tg);
@@ -253,11 +262,11 @@ static void print_cpu(struct seq_file *m, int cpu)
253 { 262 {
254 unsigned int freq = cpu_khz ? : 1; 263 unsigned int freq = cpu_khz ? : 1;
255 264
256 SEQ_printf(m, "\ncpu#%d, %u.%03u MHz\n", 265 SEQ_printf(m, "cpu#%d, %u.%03u MHz\n",
257 cpu, freq / 1000, (freq % 1000)); 266 cpu, freq / 1000, (freq % 1000));
258 } 267 }
259#else 268#else
260 SEQ_printf(m, "\ncpu#%d\n", cpu); 269 SEQ_printf(m, "cpu#%d\n", cpu);
261#endif 270#endif
262 271
263#define P(x) \ 272#define P(x) \
@@ -314,6 +323,7 @@ do { \
314 print_rq(m, rq, cpu); 323 print_rq(m, rq, cpu);
315 rcu_read_unlock(); 324 rcu_read_unlock();
316 spin_unlock_irqrestore(&sched_debug_lock, flags); 325 spin_unlock_irqrestore(&sched_debug_lock, flags);
326 SEQ_printf(m, "\n");
317} 327}
318 328
319static const char *sched_tunable_scaling_names[] = { 329static const char *sched_tunable_scaling_names[] = {
@@ -322,11 +332,10 @@ static const char *sched_tunable_scaling_names[] = {
322 "linear" 332 "linear"
323}; 333};
324 334
325static int sched_debug_show(struct seq_file *m, void *v) 335static void sched_debug_header(struct seq_file *m)
326{ 336{
327 u64 ktime, sched_clk, cpu_clk; 337 u64 ktime, sched_clk, cpu_clk;
328 unsigned long flags; 338 unsigned long flags;
329 int cpu;
330 339
331 local_irq_save(flags); 340 local_irq_save(flags);
332 ktime = ktime_to_ns(ktime_get()); 341 ktime = ktime_to_ns(ktime_get());
@@ -368,33 +377,101 @@ static int sched_debug_show(struct seq_file *m, void *v)
368#undef PN 377#undef PN
369#undef P 378#undef P
370 379
371 SEQ_printf(m, " .%-40s: %d (%s)\n", "sysctl_sched_tunable_scaling", 380 SEQ_printf(m, " .%-40s: %d (%s)\n",
381 "sysctl_sched_tunable_scaling",
372 sysctl_sched_tunable_scaling, 382 sysctl_sched_tunable_scaling,
373 sched_tunable_scaling_names[sysctl_sched_tunable_scaling]); 383 sched_tunable_scaling_names[sysctl_sched_tunable_scaling]);
384 SEQ_printf(m, "\n");
385}
374 386
375 for_each_online_cpu(cpu) 387static int sched_debug_show(struct seq_file *m, void *v)
376 print_cpu(m, cpu); 388{
389 int cpu = (unsigned long)(v - 2);
377 390
378 SEQ_printf(m, "\n"); 391 if (cpu != -1)
392 print_cpu(m, cpu);
393 else
394 sched_debug_header(m);
379 395
380 return 0; 396 return 0;
381} 397}
382 398
383void sysrq_sched_debug_show(void) 399void sysrq_sched_debug_show(void)
384{ 400{
385 sched_debug_show(NULL, NULL); 401 int cpu;
402
403 sched_debug_header(NULL);
404 for_each_online_cpu(cpu)
405 print_cpu(NULL, cpu);
406
407}
408
409/*
410 * This iterator needs some explanation.
411 * It returns 1 for the header position.
412 * This means 2 is cpu 0.
413 * In a hotplugged system some cpus, including cpu 0, may be missing so we have
414 * to use cpumask_* to iterate over the cpus.
415 */
416static void *sched_debug_start(struct seq_file *file, loff_t *offset)
417{
418 unsigned long n = *offset;
419
420 if (n == 0)
421 return (void *) 1;
422
423 n--;
424
425 if (n > 0)
426 n = cpumask_next(n - 1, cpu_online_mask);
427 else
428 n = cpumask_first(cpu_online_mask);
429
430 *offset = n + 1;
431
432 if (n < nr_cpu_ids)
433 return (void *)(unsigned long)(n + 2);
434 return NULL;
435}
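To make the iterator's offset arithmetic concrete (a worked example, assuming CPUs 0, 2 and 3 are online and a straight sequential read): the first call sees *offset == 0 and hands back the header token (void *)1; subsequent calls walk the online mask via cpumask_first()/cpumask_next() and return (void *)2, (void *)4 and (void *)5, i.e. cpu + 2 for CPUs 0, 2 and 3; once cpumask_next() runs past nr_cpu_ids the iterator returns NULL and the sequence ends. sched_debug_show() then undoes the encoding with (unsigned long)(v - 2), where -1 means "print the header".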
436
437static void *sched_debug_next(struct seq_file *file, void *data, loff_t *offset)
438{
439 (*offset)++;
440 return sched_debug_start(file, offset);
441}
442
443static void sched_debug_stop(struct seq_file *file, void *data)
444{
445}
446
447static const struct seq_operations sched_debug_sops = {
448 .start = sched_debug_start,
449 .next = sched_debug_next,
450 .stop = sched_debug_stop,
451 .show = sched_debug_show,
452};
453
454static int sched_debug_release(struct inode *inode, struct file *file)
455{
456 seq_release(inode, file);
457
458 return 0;
386} 459}
387 460
388static int sched_debug_open(struct inode *inode, struct file *filp) 461static int sched_debug_open(struct inode *inode, struct file *filp)
389{ 462{
390 return single_open(filp, sched_debug_show, NULL); 463 int ret = 0;
464
465 ret = seq_open(filp, &sched_debug_sops);
466
467 return ret;
391} 468}
392 469
393static const struct file_operations sched_debug_fops = { 470static const struct file_operations sched_debug_fops = {
394 .open = sched_debug_open, 471 .open = sched_debug_open,
395 .read = seq_read, 472 .read = seq_read,
396 .llseek = seq_lseek, 473 .llseek = seq_lseek,
397 .release = single_release, 474 .release = sched_debug_release,
398}; 475};
399 476
400static int __init init_sched_debug_procfs(void) 477static int __init init_sched_debug_procfs(void)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6b800a14b990..7a33e5986fc5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -26,6 +26,9 @@
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/profile.h> 27#include <linux/profile.h>
28#include <linux/interrupt.h> 28#include <linux/interrupt.h>
29#include <linux/mempolicy.h>
30#include <linux/migrate.h>
31#include <linux/task_work.h>
29 32
30#include <trace/events/sched.h> 33#include <trace/events/sched.h>
31 34
@@ -259,6 +262,9 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
259 return grp->my_q; 262 return grp->my_q;
260} 263}
261 264
265static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
266 int force_update);
267
262static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) 268static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
263{ 269{
264 if (!cfs_rq->on_list) { 270 if (!cfs_rq->on_list) {
@@ -278,6 +284,8 @@ static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
278 } 284 }
279 285
280 cfs_rq->on_list = 1; 286 cfs_rq->on_list = 1;
287 /* We should have no load, but we need to update last_decay. */
288 update_cfs_rq_blocked_load(cfs_rq, 0);
281 } 289 }
282} 290}
283 291
@@ -653,9 +661,6 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
653 return calc_delta_fair(sched_slice(cfs_rq, se), se); 661 return calc_delta_fair(sched_slice(cfs_rq, se), se);
654} 662}
655 663
656static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
657static void update_cfs_shares(struct cfs_rq *cfs_rq);
658
659/* 664/*
660 * Update the current task's runtime statistics. Skip current tasks that 665 * Update the current task's runtime statistics. Skip current tasks that
661 * are not in our scheduling class. 666 * are not in our scheduling class.
@@ -675,10 +680,6 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
675 680
676 curr->vruntime += delta_exec_weighted; 681 curr->vruntime += delta_exec_weighted;
677 update_min_vruntime(cfs_rq); 682 update_min_vruntime(cfs_rq);
678
679#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
680 cfs_rq->load_unacc_exec_time += delta_exec;
681#endif
682} 683}
683 684
684static void update_curr(struct cfs_rq *cfs_rq) 685static void update_curr(struct cfs_rq *cfs_rq)
@@ -776,6 +777,230 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
776 * Scheduling class queueing methods: 777 * Scheduling class queueing methods:
777 */ 778 */
778 779
780#ifdef CONFIG_NUMA_BALANCING
781/*
782 * numa task sample period in ms
783 */
784unsigned int sysctl_numa_balancing_scan_period_min = 100;
785unsigned int sysctl_numa_balancing_scan_period_max = 100*50;
786unsigned int sysctl_numa_balancing_scan_period_reset = 100*600;
787
788/* Portion of address space to scan in MB */
789unsigned int sysctl_numa_balancing_scan_size = 256;
790
791/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
792unsigned int sysctl_numa_balancing_scan_delay = 1000;
793
794static void task_numa_placement(struct task_struct *p)
795{
796 int seq;
797
798 if (!p->mm) /* for example, ksmd faulting in a user's mm */
799 return;
800 seq = ACCESS_ONCE(p->mm->numa_scan_seq);
801 if (p->numa_scan_seq == seq)
802 return;
803 p->numa_scan_seq = seq;
804
805 /* FIXME: Scheduling placement policy hints go here */
806}
807
808/*
809 * Got a PROT_NONE fault for a page on @node.
810 */
811void task_numa_fault(int node, int pages, bool migrated)
812{
813 struct task_struct *p = current;
814
815 if (!sched_feat_numa(NUMA))
816 return;
817
818 /* FIXME: Allocate task-specific structure for placement policy here */
819
820 /*
821 * If pages are properly placed (did not migrate) then scan slower.
822 * This is reset periodically in case of phase changes
823 */
824 if (!migrated)
825 p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max,
826 p->numa_scan_period + jiffies_to_msecs(10));
827
828 task_numa_placement(p);
829}
830
831static void reset_ptenuma_scan(struct task_struct *p)
832{
833 ACCESS_ONCE(p->mm->numa_scan_seq)++;
834 p->mm->numa_scan_offset = 0;
835}
836
837/*
838 * The expensive part of numa migration is done from task_work context.
839 * Triggered from task_tick_numa().
840 */
841void task_numa_work(struct callback_head *work)
842{
843 unsigned long migrate, next_scan, now = jiffies;
844 struct task_struct *p = current;
845 struct mm_struct *mm = p->mm;
846 struct vm_area_struct *vma;
847 unsigned long start, end;
848 long pages;
849
850 WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
851
852 work->next = work; /* protect against double add */
853 /*
854 * Who cares about NUMA placement when they're dying.
855 *
856 * NOTE: make sure not to dereference p->mm before this check,
857 * exit_task_work() happens _after_ exit_mm() so we could be called
858 * without p->mm even though we still had it when we enqueued this
859 * work.
860 */
861 if (p->flags & PF_EXITING)
862 return;
863
864 /*
865 * We do not care about task placement until a task runs on a node
866 * other than the first one used by the address space. This is
867 * largely because migrations are driven by what CPU the task
868 * is running on. If it's never scheduled on another node, it'll
869 * not migrate so why bother trapping the fault.
870 */
871 if (mm->first_nid == NUMA_PTE_SCAN_INIT)
872 mm->first_nid = numa_node_id();
873 if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) {
874 /* Are we running on a new node yet? */
875 if (numa_node_id() == mm->first_nid &&
876 !sched_feat_numa(NUMA_FORCE))
877 return;
878
879 mm->first_nid = NUMA_PTE_SCAN_ACTIVE;
880 }
881
882 /*
883 * Reset the scan period if enough time has gone by. Objective is that
884 * scanning will be reduced if pages are properly placed. As tasks
885 * can enter different phases this needs to be re-examined. Lacking
886 * proper tracking of reference behaviour, this blunt hammer is used.
887 */
888 migrate = mm->numa_next_reset;
889 if (time_after(now, migrate)) {
890 p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
891 next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
892 xchg(&mm->numa_next_reset, next_scan);
893 }
894
895 /*
896 * Enforce maximal scan/migration frequency..
897 */
898 migrate = mm->numa_next_scan;
899 if (time_before(now, migrate))
900 return;
901
902 if (p->numa_scan_period == 0)
903 p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
904
905 next_scan = now + msecs_to_jiffies(p->numa_scan_period);
906 if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
907 return;
908
909 /*
910 * Do not set pte_numa if the current running node is rate-limited.
911 * This loses statistics on the fault but if we are unwilling to
912 * migrate to this node, it is less likely we can do useful work
913 */
914 if (migrate_ratelimited(numa_node_id()))
915 return;
916
917 start = mm->numa_scan_offset;
918 pages = sysctl_numa_balancing_scan_size;
919 pages <<= 20 - PAGE_SHIFT; /* MB in pages */
920 if (!pages)
921 return;
922
923 down_read(&mm->mmap_sem);
924 vma = find_vma(mm, start);
925 if (!vma) {
926 reset_ptenuma_scan(p);
927 start = 0;
928 vma = mm->mmap;
929 }
930 for (; vma; vma = vma->vm_next) {
931 if (!vma_migratable(vma))
932 continue;
933
934 /* Skip small VMAs. They are not likely to be of relevance */
935 if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
936 continue;
937
938 do {
939 start = max(start, vma->vm_start);
940 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
941 end = min(end, vma->vm_end);
942 pages -= change_prot_numa(vma, start, end);
943
944 start = end;
945 if (pages <= 0)
946 goto out;
947 } while (end != vma->vm_end);
948 }
949
950out:
951 /*
952 * It is possible to reach the end of the VMA list but the last few VMAs are
953 * not guaranteed to be vma_migratable. If they are not, we would find the
954 * !migratable VMA on the next scan but not reset the scanner to the start
955 * so check it now.
956 */
957 if (vma)
958 mm->numa_scan_offset = start;
959 else
960 reset_ptenuma_scan(p);
961 up_read(&mm->mmap_sem);
962}
963
964/*
965 * Drive the periodic memory faults..
966 */
967void task_tick_numa(struct rq *rq, struct task_struct *curr)
968{
969 struct callback_head *work = &curr->numa_work;
970 u64 period, now;
971
972 /*
973 * We don't care about NUMA placement if we don't have memory.
974 */
975 if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
976 return;
977
978 /*
979 * Using runtime rather than walltime has the dual advantage that
980 * we (mostly) drive the selection from busy threads and that the
981 * task needs to have done some actual work before we bother with
982 * NUMA placement.
983 */
984 now = curr->se.sum_exec_runtime;
985 period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
986
987 if (now - curr->node_stamp > period) {
988 if (!curr->node_stamp)
989 curr->numa_scan_period = sysctl_numa_balancing_scan_period_min;
990 curr->node_stamp = now;
991
992 if (!time_before(jiffies, curr->mm->numa_next_scan)) {
993 init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
994 task_work_add(curr, work, true);
995 }
996 }
997}
998#else
999static void task_tick_numa(struct rq *rq, struct task_struct *curr)
1000{
1001}
1002#endif /* CONFIG_NUMA_BALANCING */
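Some rough numbers for the defaults above (a worked example, assuming 4 KiB base pages): one pass of task_numa_work() covers up to sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT) = 256 << 8 = 65536 PTEs, i.e. 256 MB of address space, via change_prot_numa(). A continuously running task re-arms the scan at most once per numa_scan_period of runtime: the period starts at the 100 ms minimum, grows by jiffies_to_msecs(10) (10 ms at HZ=1000) for every fault on an already well-placed page up to the 5000 ms maximum, and is dropped back to the minimum every 60 s by the numa_next_reset handling.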
1003
779static void 1004static void
780account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) 1005account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
781{ 1006{
@@ -801,72 +1026,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
801} 1026}
802 1027
803#ifdef CONFIG_FAIR_GROUP_SCHED 1028#ifdef CONFIG_FAIR_GROUP_SCHED
804/* we need this in update_cfs_load and load-balance functions below */
805static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
806# ifdef CONFIG_SMP 1029# ifdef CONFIG_SMP
807static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
808 int global_update)
809{
810 struct task_group *tg = cfs_rq->tg;
811 long load_avg;
812
813 load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
814 load_avg -= cfs_rq->load_contribution;
815
816 if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) {
817 atomic_add(load_avg, &tg->load_weight);
818 cfs_rq->load_contribution += load_avg;
819 }
820}
821
822static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
823{
824 u64 period = sysctl_sched_shares_window;
825 u64 now, delta;
826 unsigned long load = cfs_rq->load.weight;
827
828 if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq))
829 return;
830
831 now = rq_of(cfs_rq)->clock_task;
832 delta = now - cfs_rq->load_stamp;
833
834 /* truncate load history at 4 idle periods */
835 if (cfs_rq->load_stamp > cfs_rq->load_last &&
836 now - cfs_rq->load_last > 4 * period) {
837 cfs_rq->load_period = 0;
838 cfs_rq->load_avg = 0;
839 delta = period - 1;
840 }
841
842 cfs_rq->load_stamp = now;
843 cfs_rq->load_unacc_exec_time = 0;
844 cfs_rq->load_period += delta;
845 if (load) {
846 cfs_rq->load_last = now;
847 cfs_rq->load_avg += delta * load;
848 }
849
850 /* consider updating load contribution on each fold or truncate */
851 if (global_update || cfs_rq->load_period > period
852 || !cfs_rq->load_period)
853 update_cfs_rq_load_contribution(cfs_rq, global_update);
854
855 while (cfs_rq->load_period > period) {
856 /*
857 * Inline assembly required to prevent the compiler
858 * optimising this loop into a divmod call.
859 * See __iter_div_u64_rem() for another example of this.
860 */
861 asm("" : "+rm" (cfs_rq->load_period));
862 cfs_rq->load_period /= 2;
863 cfs_rq->load_avg /= 2;
864 }
865
866 if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg)
867 list_del_leaf_cfs_rq(cfs_rq);
868}
869
870static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) 1030static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
871{ 1031{
872 long tg_weight; 1032 long tg_weight;
@@ -876,8 +1036,8 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
876 * to gain a more accurate current total weight. See 1036 * to gain a more accurate current total weight. See
877 * update_cfs_rq_load_contribution(). 1037 * update_cfs_rq_load_contribution().
878 */ 1038 */
879 tg_weight = atomic_read(&tg->load_weight); 1039 tg_weight = atomic64_read(&tg->load_avg);
880 tg_weight -= cfs_rq->load_contribution; 1040 tg_weight -= cfs_rq->tg_load_contrib;
881 tg_weight += cfs_rq->load.weight; 1041 tg_weight += cfs_rq->load.weight;
882 1042
883 return tg_weight; 1043 return tg_weight;
@@ -901,27 +1061,11 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
901 1061
902 return shares; 1062 return shares;
903} 1063}
904
905static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
906{
907 if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
908 update_cfs_load(cfs_rq, 0);
909 update_cfs_shares(cfs_rq);
910 }
911}
912# else /* CONFIG_SMP */ 1064# else /* CONFIG_SMP */
913static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
914{
915}
916
917static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) 1065static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
918{ 1066{
919 return tg->shares; 1067 return tg->shares;
920} 1068}
921
922static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
923{
924}
925# endif /* CONFIG_SMP */ 1069# endif /* CONFIG_SMP */
926static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, 1070static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
927 unsigned long weight) 1071 unsigned long weight)
@@ -939,6 +1083,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
939 account_entity_enqueue(cfs_rq, se); 1083 account_entity_enqueue(cfs_rq, se);
940} 1084}
941 1085
1086static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
1087
942static void update_cfs_shares(struct cfs_rq *cfs_rq) 1088static void update_cfs_shares(struct cfs_rq *cfs_rq)
943{ 1089{
944 struct task_group *tg; 1090 struct task_group *tg;
@@ -958,18 +1104,477 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq)
958 reweight_entity(cfs_rq_of(se), se, shares); 1104 reweight_entity(cfs_rq_of(se), se, shares);
959} 1105}
960#else /* CONFIG_FAIR_GROUP_SCHED */ 1106#else /* CONFIG_FAIR_GROUP_SCHED */
961static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) 1107static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
962{ 1108{
963} 1109}
1110#endif /* CONFIG_FAIR_GROUP_SCHED */
964 1111
965static inline void update_cfs_shares(struct cfs_rq *cfs_rq) 1112/* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */
1113#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
1114/*
1115 * We choose a half-life close to 1 scheduling period.
1116 * Note: The tables below are dependent on this value.
1117 */
1118#define LOAD_AVG_PERIOD 32
1119#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
1120#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */
1121
1122/* Precomputed fixed inverse multiplies for multiplication by y^n */
1123static const u32 runnable_avg_yN_inv[] = {
1124 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
1125 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
1126 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
1127 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
1128 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
1129 0x85aac367, 0x82cd8698,
1130};
1131
1132/*
1133 * Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent
1134 * over-estimates when re-combining.
1135 */
1136static const u32 runnable_avg_yN_sum[] = {
1137 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
1138 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
1139 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
1140};
1141
1142/*
1143 * Approximate:
1144 * val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
1145 */
1146static __always_inline u64 decay_load(u64 val, u64 n)
1147{
1148 unsigned int local_n;
1149
1150 if (!n)
1151 return val;
1152 else if (unlikely(n > LOAD_AVG_PERIOD * 63))
1153 return 0;
1154
1155 /* after bounds checking we can collapse to 32-bit */
1156 local_n = n;
1157
1158 /*
1159 * As y^PERIOD = 1/2, we can combine
1160 * y^n = 1/2^(n/PERIOD) * k^(n%PERIOD)
1161 * With a look-up table which covers k^n (n<PERIOD)
1162 *
1163 * To achieve constant time decay_load.
1164 */
1165 if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
1166 val >>= local_n / LOAD_AVG_PERIOD;
1167 local_n %= LOAD_AVG_PERIOD;
1168 }
1169
1170 val *= runnable_avg_yN_inv[local_n];
1171 /* We don't use SRR here since we always want to round down. */
1172 return val >> 32;
1173}
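As a sanity check on the fixed-point arithmetic, here is a small self-contained userspace sketch (an illustration, not kernel code) that mirrors decay_load() with the same table. It prints 1002, 511 and 255 for decaying a 1024-unit contribution over 1, 32 and 64 periods — roughly 1024*y, 1024/2 and 1024/4, as expected for y^32 = 0.5.

	#include <stdint.h>
	#include <stdio.h>

	#define LOAD_AVG_PERIOD 32

	/* same table as above: entry n is roughly 2^32 * y^n, 0 <= n < 32 */
	static const uint32_t runnable_avg_yN_inv[] = {
		0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
		0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
		0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
		0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
		0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
		0x85aac367, 0x82cd8698,
	};

	/* val * y^n: whole half-lives become shifts, the remainder uses the table */
	static uint64_t decay_load(uint64_t val, uint64_t n)
	{
		if (!n)
			return val;
		if (n > LOAD_AVG_PERIOD * 63)
			return 0;
		if (n >= LOAD_AVG_PERIOD) {
			val >>= n / LOAD_AVG_PERIOD;	/* y^32 = 1/2 per full period */
			n %= LOAD_AVG_PERIOD;
		}
		return (val * runnable_avg_yN_inv[n]) >> 32;
	}

	int main(void)
	{
		printf("%llu %llu %llu\n",
		       (unsigned long long)decay_load(1024, 1),	/* 1002 */
		       (unsigned long long)decay_load(1024, 32),	/* 511  */
		       (unsigned long long)decay_load(1024, 64));	/* 255  */
		return 0;
	}

Only entries 0 and 1 of the table are exercised here, but the full table is kept so the helper matches the kernel version above.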
1174
1175/*
1176 * For updates fully spanning n periods, the contribution to runnable
1177 * average will be: \Sum 1024*y^n
1178 *
1179 * We can compute this reasonably efficiently by combining:
1180 * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}
1181 */
1182static u32 __compute_runnable_contrib(u64 n)
1183{
1184 u32 contrib = 0;
1185
1186 if (likely(n <= LOAD_AVG_PERIOD))
1187 return runnable_avg_yN_sum[n];
1188 else if (unlikely(n >= LOAD_AVG_MAX_N))
1189 return LOAD_AVG_MAX;
1190
1191 /* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
1192 do {
1193 contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
1194 contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
1195
1196 n -= LOAD_AVG_PERIOD;
1197 } while (n > LOAD_AVG_PERIOD);
1198
1199 contrib = decay_load(contrib, n);
1200 return contrib + runnable_avg_yN_sum[n];
1201}
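Worked numbers for the helper above: for n <= LOAD_AVG_PERIOD the answer is a straight table hit, e.g. __compute_runnable_contrib(8) = 7437 ~= \Sum_{k=1..8} 1024*y^k. For n = 40, one full 32-period block (23371) is folded in, decayed by the remaining 8 periods and topped up with runnable_avg_yN_sum[8], landing a little over 27000. Any n >= LOAD_AVG_MAX_N = 345 saturates at LOAD_AVG_MAX = 47742, the precomputed ceiling of the series.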
1202
1203/*
1204 * We can represent the historical contribution to runnable average as the
1205 * coefficients of a geometric series. To do this we sub-divide our runnable
1206 * history into segments of approximately 1ms (1024us); label the segment that
1207 * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
1208 *
1209 * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
1210 * p0 p1 p2
1211 * (now) (~1ms ago) (~2ms ago)
1212 *
1213 * Let u_i denote the fraction of p_i that the entity was runnable.
1214 *
1215 * We then designate the fractions u_i as our co-efficients, yielding the
1216 * following representation of historical load:
1217 * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
1218 *
1219 * We choose y based on the width of a reasonable scheduling period, fixing:
1220 * y^32 = 0.5
1221 *
1222 * This means that the contribution to load ~32ms ago (u_32) will be weighted
1223 * approximately half as much as the contribution to load within the last ms
1224 * (u_0).
1225 *
1226 * When a period "rolls over" and we have new u_0`, multiplying the previous
1227 * sum again by y is sufficient to update:
1228 * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
1229 * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
1230 */
1231static __always_inline int __update_entity_runnable_avg(u64 now,
1232 struct sched_avg *sa,
1233 int runnable)
966{ 1234{
1235 u64 delta, periods;
1236 u32 runnable_contrib;
1237 int delta_w, decayed = 0;
1238
1239 delta = now - sa->last_runnable_update;
1240 /*
1241 * This should only happen when time goes backwards, which it
1242 * unfortunately does during sched clock init when we swap over to TSC.
1243 */
1244 if ((s64)delta < 0) {
1245 sa->last_runnable_update = now;
1246 return 0;
1247 }
1248
1249 /*
1250 * Use 1024ns as the unit of measurement since it's a reasonable
1251 * approximation of 1us and fast to compute.
1252 */
1253 delta >>= 10;
1254 if (!delta)
1255 return 0;
1256 sa->last_runnable_update = now;
1257
1258 /* delta_w is the amount already accumulated against our next period */
1259 delta_w = sa->runnable_avg_period % 1024;
1260 if (delta + delta_w >= 1024) {
1261 /* period roll-over */
1262 decayed = 1;
1263
1264 /*
1265 * Now that we know we're crossing a period boundary, figure
1266 * out how much from delta we need to complete the current
1267 * period and accrue it.
1268 */
1269 delta_w = 1024 - delta_w;
1270 if (runnable)
1271 sa->runnable_avg_sum += delta_w;
1272 sa->runnable_avg_period += delta_w;
1273
1274 delta -= delta_w;
1275
1276 /* Figure out how many additional periods this update spans */
1277 periods = delta / 1024;
1278 delta %= 1024;
1279
1280 sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
1281 periods + 1);
1282 sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
1283 periods + 1);
1284
1285 /* Efficiently calculate \sum (1..n_period) 1024*y^i */
1286 runnable_contrib = __compute_runnable_contrib(periods);
1287 if (runnable)
1288 sa->runnable_avg_sum += runnable_contrib;
1289 sa->runnable_avg_period += runnable_contrib;
1290 }
1291
1292 /* Remainder of delta accrued against u_0` */
1293 if (runnable)
1294 sa->runnable_avg_sum += delta;
1295 sa->runnable_avg_period += delta;
1296
1297 return decayed;
967} 1298}
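A worked pass through the rollover logic (made-up numbers): suppose a runnable entity last updated 3,000,000 ns ago and its open period already holds delta_w = 200 units. delta >>= 10 yields 2929 ~1 us units; 200 + 2929 crosses a period boundary, so 824 units first close out the open period, both sums are decayed by periods + 1 = 3 half-life steps (2929 - 824 = 2105 leaves periods = 2 and a 57-unit remainder), __compute_runnable_contrib(2) = 1982 is credited for the two fully-spanned periods, and the trailing 57 units accrue against the new current period.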
968 1299
969static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq) 1300/* Synchronize an entity's decay with its parenting cfs_rq.*/
1301static inline u64 __synchronize_entity_decay(struct sched_entity *se)
970{ 1302{
1303 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1304 u64 decays = atomic64_read(&cfs_rq->decay_counter);
1305
1306 decays -= se->avg.decay_count;
1307 if (!decays)
1308 return 0;
1309
1310 se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
1311 se->avg.decay_count = 0;
1312
1313 return decays;
971} 1314}
972#endif /* CONFIG_FAIR_GROUP_SCHED */ 1315
1316#ifdef CONFIG_FAIR_GROUP_SCHED
1317static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
1318 int force_update)
1319{
1320 struct task_group *tg = cfs_rq->tg;
1321 s64 tg_contrib;
1322
1323 tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
1324 tg_contrib -= cfs_rq->tg_load_contrib;
1325
1326 if (force_update || abs64(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
1327 atomic64_add(tg_contrib, &tg->load_avg);
1328 cfs_rq->tg_load_contrib += tg_contrib;
1329 }
1330}
1331
1332/*
1333 * Aggregate cfs_rq runnable averages into an equivalent task_group
1334 * representation for computing load contributions.
1335 */
1336static inline void __update_tg_runnable_avg(struct sched_avg *sa,
1337 struct cfs_rq *cfs_rq)
1338{
1339 struct task_group *tg = cfs_rq->tg;
1340 long contrib;
1341
1342 /* The fraction of a cpu used by this cfs_rq */
1343 contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT,
1344 sa->runnable_avg_period + 1);
1345 contrib -= cfs_rq->tg_runnable_contrib;
1346
1347 if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
1348 atomic_add(contrib, &tg->runnable_avg);
1349 cfs_rq->tg_runnable_contrib += contrib;
1350 }
1351}
1352
1353static inline void __update_group_entity_contrib(struct sched_entity *se)
1354{
1355 struct cfs_rq *cfs_rq = group_cfs_rq(se);
1356 struct task_group *tg = cfs_rq->tg;
1357 int runnable_avg;
1358
1359 u64 contrib;
1360
1361 contrib = cfs_rq->tg_load_contrib * tg->shares;
1362 se->avg.load_avg_contrib = div64_u64(contrib,
1363 atomic64_read(&tg->load_avg) + 1);
1364
1365 /*
1366 * For group entities we need to compute a correction term in the case
1367 * that they are consuming <1 cpu so that we would contribute the same
1368 * load as a task of equal weight.
1369 *
1370 * Explicitly co-ordinating this measurement would be expensive, but
1371 * fortunately the sum of each cpu's contribution forms a usable
1372 * lower-bound on the true value.
1373 *
1374 * Consider the aggregate of 2 contributions. Either they are disjoint
1375 * (and the sum represents the true value) or they overlap and we are
1376 * understating by the aggregate of their overlap.
1377 *
1378 * Extending this to N cpus, for a given overlap, the maximum amount we
1379 * understate is then n_i(n_i+1)/2 * w_i where n_i is the number of
1380 * cpus that overlap for this interval and w_i is the interval width.
1381 *
1382 * On a small machine the first term is well-bounded, which bounds the
1383 * total error since w_i is a subset of the period. Whereas on a
1384 * larger machine, while this first term can be larger, a w_i of
1385 * consequential size is guaranteed to see n_i*w_i quickly converge to
1386 * our upper bound of 1-cpu.
1387 */
1388 runnable_avg = atomic_read(&tg->runnable_avg);
1389 if (runnable_avg < NICE_0_LOAD) {
1390 se->avg.load_avg_contrib *= runnable_avg;
1391 se->avg.load_avg_contrib >>= NICE_0_SHIFT;
1392 }
1393}
1394#else
1395static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
1396 int force_update) {}
1397static inline void __update_tg_runnable_avg(struct sched_avg *sa,
1398 struct cfs_rq *cfs_rq) {}
1399static inline void __update_group_entity_contrib(struct sched_entity *se) {}
1400#endif
1401
1402static inline void __update_task_entity_contrib(struct sched_entity *se)
1403{
1404 u32 contrib;
1405
1406 /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
1407 contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
1408 contrib /= (se->avg.runnable_avg_period + 1);
1409 se->avg.load_avg_contrib = scale_load(contrib);
1410}
1411
1412/* Compute the current contribution to load_avg by se, return any delta */
1413static long __update_entity_load_avg_contrib(struct sched_entity *se)
1414{
1415 long old_contrib = se->avg.load_avg_contrib;
1416
1417 if (entity_is_task(se)) {
1418 __update_task_entity_contrib(se);
1419 } else {
1420 __update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
1421 __update_group_entity_contrib(se);
1422 }
1423
1424 return se->avg.load_avg_contrib - old_contrib;
1425}
1426
1427static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
1428 long load_contrib)
1429{
1430 if (likely(load_contrib < cfs_rq->blocked_load_avg))
1431 cfs_rq->blocked_load_avg -= load_contrib;
1432 else
1433 cfs_rq->blocked_load_avg = 0;
1434}
1435
1436static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
1437
1438/* Update a sched_entity's runnable average */
1439static inline void update_entity_load_avg(struct sched_entity *se,
1440 int update_cfs_rq)
1441{
1442 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1443 long contrib_delta;
1444 u64 now;
1445
1446 /*
1447 * For a group entity we need to use their owned cfs_rq_clock_task() in
1448 * case they are the parent of a throttled hierarchy.
1449 */
1450 if (entity_is_task(se))
1451 now = cfs_rq_clock_task(cfs_rq);
1452 else
1453 now = cfs_rq_clock_task(group_cfs_rq(se));
1454
1455 if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq))
1456 return;
1457
1458 contrib_delta = __update_entity_load_avg_contrib(se);
1459
1460 if (!update_cfs_rq)
1461 return;
1462
1463 if (se->on_rq)
1464 cfs_rq->runnable_load_avg += contrib_delta;
1465 else
1466 subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
1467}
1468
1469/*
1470 * Decay the load contributed by all blocked children and account this so that
1471 * their contribution may be appropriately discounted when they wake up.
1472 */
1473static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
1474{
1475 u64 now = cfs_rq_clock_task(cfs_rq) >> 20;
1476 u64 decays;
1477
1478 decays = now - cfs_rq->last_decay;
1479 if (!decays && !force_update)
1480 return;
1481
1482 if (atomic64_read(&cfs_rq->removed_load)) {
1483 u64 removed_load = atomic64_xchg(&cfs_rq->removed_load, 0);
1484 subtract_blocked_load_contrib(cfs_rq, removed_load);
1485 }
1486
1487 if (decays) {
1488 cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
1489 decays);
1490 atomic64_add(decays, &cfs_rq->decay_counter);
1491 cfs_rq->last_decay = now;
1492 }
1493
1494 __update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
1495}
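A note on units: cfs_rq_clock_task() is in nanoseconds, so the ">> 20" above counts blocked-load decay periods of 2^20 ns (~1.05 ms), matching the ~1 ms (1024 us) segments used by __update_entity_runnable_avg(); decay_counter therefore advances by one per such period, and it is this counter that __synchronize_entity_decay() compares against.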
1496
1497static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
1498{
1499 __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable);
1500 __update_tg_runnable_avg(&rq->avg, &rq->cfs);
1501}
1502
1503/* Add the load generated by se into cfs_rq's child load-average */
1504static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
1505 struct sched_entity *se,
1506 int wakeup)
1507{
1508 /*
1509 * We track migrations using entity decay_count <= 0, on a wake-up
1510 * migration we use a negative decay count to track the remote decays
1511 * accumulated while sleeping.
1512 */
1513 if (unlikely(se->avg.decay_count <= 0)) {
1514 se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task;
1515 if (se->avg.decay_count) {
1516 /*
1517 * In a wake-up migration we have to approximate the
1518 * time sleeping. This is because we can't synchronize
1519 * clock_task between the two cpus, and it is not
1520 * guaranteed to be read-safe. Instead, we can
1521 * approximate this using our carried decays, which are
1522 * explicitly atomically readable.
1523 */
1524 se->avg.last_runnable_update -= (-se->avg.decay_count)
1525 << 20;
1526 update_entity_load_avg(se, 0);
1527 /* Indicate that we're now synchronized and on-rq */
1528 se->avg.decay_count = 0;
1529 }
1530 wakeup = 0;
1531 } else {
1532 __synchronize_entity_decay(se);
1533 }
1534
1535 /* migrated tasks did not contribute to our blocked load */
1536 if (wakeup) {
1537 subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
1538 update_entity_load_avg(se, 0);
1539 }
1540
1541 cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
1542 /* we force update consideration on load-balancer moves */
1543 update_cfs_rq_blocked_load(cfs_rq, !wakeup);
1544}
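The "(-se->avg.decay_count) << 20" correction above converts missed ~1 ms decay periods back into nanoseconds (one period = 2^20 ns): a wakee that accumulated, say, 5 remote decays while sleeping has its last_runnable_update pushed back by roughly 5 ms, so the following update_entity_load_avg() call replays an equivalent amount of decay locally without needing the remote cpu's clock_task.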
1545
1546/*
1547 * Remove se's load from this cfs_rq child load-average, if the entity is
1548 * transitioning to a blocked state we track its projected decay using
1549 * blocked_load_avg.
1550 */
1551static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
1552 struct sched_entity *se,
1553 int sleep)
1554{
1555 update_entity_load_avg(se, 1);
1556 /* we force update consideration on load-balancer moves */
1557 update_cfs_rq_blocked_load(cfs_rq, !sleep);
1558
1559 cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
1560 if (sleep) {
1561 cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
1562 se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
1563 } /* migrations, e.g. sleep=0 leave decay_count == 0 */
1564}
1565#else
1566static inline void update_entity_load_avg(struct sched_entity *se,
1567 int update_cfs_rq) {}
1568static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
1569static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
1570 struct sched_entity *se,
1571 int wakeup) {}
1572static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
1573 struct sched_entity *se,
1574 int sleep) {}
1575static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
1576 int force_update) {}
1577#endif
973 1578
974static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) 1579static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
975{ 1580{
@@ -1075,9 +1680,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
1075 } 1680 }
1076 1681
1077 /* ensure we never gain time by being placed backwards. */ 1682 /* ensure we never gain time by being placed backwards. */
1078 vruntime = max_vruntime(se->vruntime, vruntime); 1683 se->vruntime = max_vruntime(se->vruntime, vruntime);
1079
1080 se->vruntime = vruntime;
1081} 1684}
1082 1685
1083static void check_enqueue_throttle(struct cfs_rq *cfs_rq); 1686static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
@@ -1096,7 +1699,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1096 * Update run-time statistics of the 'current'. 1699 * Update run-time statistics of the 'current'.
1097 */ 1700 */
1098 update_curr(cfs_rq); 1701 update_curr(cfs_rq);
1099 update_cfs_load(cfs_rq, 0); 1702 enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
1100 account_entity_enqueue(cfs_rq, se); 1703 account_entity_enqueue(cfs_rq, se);
1101 update_cfs_shares(cfs_rq); 1704 update_cfs_shares(cfs_rq);
1102 1705
@@ -1171,6 +1774,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1171 * Update run-time statistics of the 'current'. 1774 * Update run-time statistics of the 'current'.
1172 */ 1775 */
1173 update_curr(cfs_rq); 1776 update_curr(cfs_rq);
1777 dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
1174 1778
1175 update_stats_dequeue(cfs_rq, se); 1779 update_stats_dequeue(cfs_rq, se);
1176 if (flags & DEQUEUE_SLEEP) { 1780 if (flags & DEQUEUE_SLEEP) {
@@ -1191,7 +1795,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1191 if (se != cfs_rq->curr) 1795 if (se != cfs_rq->curr)
1192 __dequeue_entity(cfs_rq, se); 1796 __dequeue_entity(cfs_rq, se);
1193 se->on_rq = 0; 1797 se->on_rq = 0;
1194 update_cfs_load(cfs_rq, 0);
1195 account_entity_dequeue(cfs_rq, se); 1798 account_entity_dequeue(cfs_rq, se);
1196 1799
1197 /* 1800 /*
@@ -1340,6 +1943,8 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
1340 update_stats_wait_start(cfs_rq, prev); 1943 update_stats_wait_start(cfs_rq, prev);
1341 /* Put 'current' back into the tree. */ 1944 /* Put 'current' back into the tree. */
1342 __enqueue_entity(cfs_rq, prev); 1945 __enqueue_entity(cfs_rq, prev);
1946 /* in !on_rq case, update occurred at dequeue */
1947 update_entity_load_avg(prev, 1);
1343 } 1948 }
1344 cfs_rq->curr = NULL; 1949 cfs_rq->curr = NULL;
1345} 1950}
@@ -1353,9 +1958,10 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
1353 update_curr(cfs_rq); 1958 update_curr(cfs_rq);
1354 1959
1355 /* 1960 /*
1356 * Update share accounting for long-running entities. 1961 * Ensure that runnable average is periodically updated.
1357 */ 1962 */
1358 update_entity_shares_tick(cfs_rq); 1963 update_entity_load_avg(curr, 1);
1964 update_cfs_rq_blocked_load(cfs_rq, 1);
1359 1965
1360#ifdef CONFIG_SCHED_HRTICK 1966#ifdef CONFIG_SCHED_HRTICK
1361 /* 1967 /*
@@ -1448,6 +2054,15 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
1448 return &tg->cfs_bandwidth; 2054 return &tg->cfs_bandwidth;
1449} 2055}
1450 2056
2057/* rq->clock_task normalized against any time this cfs_rq has spent throttled */
2058static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
2059{
2060 if (unlikely(cfs_rq->throttle_count))
2061 return cfs_rq->throttled_clock_task;
2062
2063 return rq_of(cfs_rq)->clock_task - cfs_rq->throttled_clock_task_time;
2064}
2065
1451/* returns 0 on failure to allocate runtime */ 2066/* returns 0 on failure to allocate runtime */
1452static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) 2067static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1453{ 2068{
@@ -1592,14 +2207,9 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
1592 cfs_rq->throttle_count--; 2207 cfs_rq->throttle_count--;
1593#ifdef CONFIG_SMP 2208#ifdef CONFIG_SMP
1594 if (!cfs_rq->throttle_count) { 2209 if (!cfs_rq->throttle_count) {
1595 u64 delta = rq->clock_task - cfs_rq->load_stamp; 2210 /* adjust cfs_rq_clock_task() */
1596 2211 cfs_rq->throttled_clock_task_time += rq->clock_task -
1597 /* leaving throttled state, advance shares averaging windows */ 2212 cfs_rq->throttled_clock_task;
1598 cfs_rq->load_stamp += delta;
1599 cfs_rq->load_last += delta;
1600
1601 /* update entity weight now that we are on_rq again */
1602 update_cfs_shares(cfs_rq);
1603 } 2213 }
1604#endif 2214#endif
1605 2215
@@ -1611,9 +2221,9 @@ static int tg_throttle_down(struct task_group *tg, void *data)
1611 struct rq *rq = data; 2221 struct rq *rq = data;
1612 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; 2222 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
1613 2223
1614 /* group is entering throttled state, record last load */ 2224 /* group is entering throttled state, stop time */
1615 if (!cfs_rq->throttle_count) 2225 if (!cfs_rq->throttle_count)
1616 update_cfs_load(cfs_rq, 0); 2226 cfs_rq->throttled_clock_task = rq->clock_task;
1617 cfs_rq->throttle_count++; 2227 cfs_rq->throttle_count++;
1618 2228
1619 return 0; 2229 return 0;
@@ -1628,7 +2238,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
1628 2238
1629 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; 2239 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
1630 2240
1631 /* account load preceding throttle */ 2241 /* freeze hierarchy runnable averages while throttled */
1632 rcu_read_lock(); 2242 rcu_read_lock();
1633 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); 2243 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
1634 rcu_read_unlock(); 2244 rcu_read_unlock();
@@ -1652,7 +2262,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
1652 rq->nr_running -= task_delta; 2262 rq->nr_running -= task_delta;
1653 2263
1654 cfs_rq->throttled = 1; 2264 cfs_rq->throttled = 1;
1655 cfs_rq->throttled_timestamp = rq->clock; 2265 cfs_rq->throttled_clock = rq->clock;
1656 raw_spin_lock(&cfs_b->lock); 2266 raw_spin_lock(&cfs_b->lock);
1657 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); 2267 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
1658 raw_spin_unlock(&cfs_b->lock); 2268 raw_spin_unlock(&cfs_b->lock);
@@ -1670,10 +2280,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
1670 2280
1671 cfs_rq->throttled = 0; 2281 cfs_rq->throttled = 0;
1672 raw_spin_lock(&cfs_b->lock); 2282 raw_spin_lock(&cfs_b->lock);
1673 cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp; 2283 cfs_b->throttled_time += rq->clock - cfs_rq->throttled_clock;
1674 list_del_rcu(&cfs_rq->throttled_list); 2284 list_del_rcu(&cfs_rq->throttled_list);
1675 raw_spin_unlock(&cfs_b->lock); 2285 raw_spin_unlock(&cfs_b->lock);
1676 cfs_rq->throttled_timestamp = 0;
1677 2286
1678 update_rq_clock(rq); 2287 update_rq_clock(rq);
1679 /* update hierarchical throttle state */ 2288 /* update hierarchical throttle state */
@@ -2052,7 +2661,7 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2052 hrtimer_cancel(&cfs_b->slack_timer); 2661 hrtimer_cancel(&cfs_b->slack_timer);
2053} 2662}
2054 2663
2055static void unthrottle_offline_cfs_rqs(struct rq *rq) 2664static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
2056{ 2665{
2057 struct cfs_rq *cfs_rq; 2666 struct cfs_rq *cfs_rq;
2058 2667
@@ -2073,8 +2682,13 @@ static void unthrottle_offline_cfs_rqs(struct rq *rq)
2073} 2682}
2074 2683
2075#else /* CONFIG_CFS_BANDWIDTH */ 2684#else /* CONFIG_CFS_BANDWIDTH */
2076static __always_inline 2685static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
2077void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) {} 2686{
2687 return rq_of(cfs_rq)->clock_task;
2688}
2689
2690static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
2691 unsigned long delta_exec) {}
2078static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 2692static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2079static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} 2693static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
2080static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 2694static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
@@ -2207,12 +2821,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
2207 if (cfs_rq_throttled(cfs_rq)) 2821 if (cfs_rq_throttled(cfs_rq))
2208 break; 2822 break;
2209 2823
2210 update_cfs_load(cfs_rq, 0);
2211 update_cfs_shares(cfs_rq); 2824 update_cfs_shares(cfs_rq);
2825 update_entity_load_avg(se, 1);
2212 } 2826 }
2213 2827
2214 if (!se) 2828 if (!se) {
2829 update_rq_runnable_avg(rq, rq->nr_running);
2215 inc_nr_running(rq); 2830 inc_nr_running(rq);
2831 }
2216 hrtick_update(rq); 2832 hrtick_update(rq);
2217} 2833}
2218 2834
@@ -2266,12 +2882,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
2266 if (cfs_rq_throttled(cfs_rq)) 2882 if (cfs_rq_throttled(cfs_rq))
2267 break; 2883 break;
2268 2884
2269 update_cfs_load(cfs_rq, 0);
2270 update_cfs_shares(cfs_rq); 2885 update_cfs_shares(cfs_rq);
2886 update_entity_load_avg(se, 1);
2271 } 2887 }
2272 2888
2273 if (!se) 2889 if (!se) {
2274 dec_nr_running(rq); 2890 dec_nr_running(rq);
2891 update_rq_runnable_avg(rq, 1);
2892 }
2275 hrtick_update(rq); 2893 hrtick_update(rq);
2276} 2894}
2277 2895
@@ -2634,25 +3252,18 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
2634 */ 3252 */
2635static int select_idle_sibling(struct task_struct *p, int target) 3253static int select_idle_sibling(struct task_struct *p, int target)
2636{ 3254{
2637 int cpu = smp_processor_id();
2638 int prev_cpu = task_cpu(p);
2639 struct sched_domain *sd; 3255 struct sched_domain *sd;
2640 struct sched_group *sg; 3256 struct sched_group *sg;
2641 int i; 3257 int i = task_cpu(p);
2642 3258
2643 /* 3259 if (idle_cpu(target))
2644 * If the task is going to be woken-up on this cpu and if it is 3260 return target;
2645 * already idle, then it is the right target.
2646 */
2647 if (target == cpu && idle_cpu(cpu))
2648 return cpu;
2649 3261
2650 /* 3262 /*
2651 * If the task is going to be woken-up on the cpu where it previously 3263 * If the previous cpu is cache affine and idle, don't be stupid.
2652 * ran and if it is currently idle, then it the right target.
2653 */ 3264 */
2654 if (target == prev_cpu && idle_cpu(prev_cpu)) 3265 if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
2655 return prev_cpu; 3266 return i;
2656 3267
2657 /* 3268 /*
2658 * Otherwise, iterate the domains and find an eligible idle cpu. 3269 * Otherwise, iterate the domains and find an eligible idle cpu.
@@ -2666,7 +3277,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
2666 goto next; 3277 goto next;
2667 3278
2668 for_each_cpu(i, sched_group_cpus(sg)) { 3279 for_each_cpu(i, sched_group_cpus(sg)) {
2669 if (!idle_cpu(i)) 3280 if (i == target || !idle_cpu(i))
2670 goto next; 3281 goto next;
2671 } 3282 }
2672 3283
@@ -2781,6 +3392,37 @@ unlock:
2781 3392
2782 return new_cpu; 3393 return new_cpu;
2783} 3394}
3395
3396/*
3397 * Load-tracking only depends on SMP; the FAIR_GROUP_SCHED dependency below may
3398 * be removed once load-tracking is useful for applications beyond shares
3399 * distribution (e.g. load-balance).
3400 */
3401#ifdef CONFIG_FAIR_GROUP_SCHED
3402/*
3403 * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
3404 * cfs_rq_of(p) references at time of call are still valid and identify the
3405 * previous cpu. However, the caller only guarantees p->pi_lock is held; no
3406 * other assumptions, including the state of rq->lock, should be made.
3407 */
3408static void
3409migrate_task_rq_fair(struct task_struct *p, int next_cpu)
3410{
3411 struct sched_entity *se = &p->se;
3412 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3413
3414 /*
3415 * Load tracking: accumulate removed load so that it can be processed
3416 * when we next update the owning cfs_rq under rq->lock. Tasks contribute
3417 * to blocked load iff they have a positive decay-count. It can never
3418 * be negative here since on-rq tasks have decay-count == 0.
3419 */
3420 if (se->avg.decay_count) {
3421 se->avg.decay_count = -__synchronize_entity_decay(se);
3422 atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load);
3423 }
3424}
3425#endif
2784#endif /* CONFIG_SMP */ 3426#endif /* CONFIG_SMP */
2785 3427
2786static unsigned long 3428static unsigned long
@@ -2907,7 +3549,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
2907 * Batch and idle tasks do not preempt non-idle tasks (their preemption 3549 * Batch and idle tasks do not preempt non-idle tasks (their preemption
2908 * is driven by the tick): 3550 * is driven by the tick):
2909 */ 3551 */
2910 if (unlikely(p->policy != SCHED_NORMAL)) 3552 if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
2911 return; 3553 return;
2912 3554
2913 find_matching_se(&se, &pse); 3555 find_matching_se(&se, &pse);
@@ -3033,8 +3675,122 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
3033 3675
3034#ifdef CONFIG_SMP 3676#ifdef CONFIG_SMP
3035/************************************************** 3677/**************************************************
3036 * Fair scheduling class load-balancing methods: 3678 * Fair scheduling class load-balancing methods.
3037 */ 3679 *
3680 * BASICS
3681 *
3682 * The purpose of load-balancing is to achieve the same basic fairness the
3683 * per-cpu scheduler provides, namely provide a proportional amount of compute
3684 * time to each task. This is expressed in the following equation:
3685 *
3686 * W_i,n/P_i == W_j,n/P_j for all i,j (1)
3687 *
3688 * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
3689 * W_i,0 is defined as:
3690 *
3691 * W_i,0 = \Sum_j w_i,j (2)
3692 *
3693 * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
3694 * is derived from the nice value as per prio_to_weight[].
3695 *
3696 * The weight average is an exponential decay average of the instantaneous
3697 * weight:
3698 *
3699 * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
3700 *
3701 * P_i is the cpu power (or compute capacity) of cpu i, typically it is the
3702 * fraction of 'recent' time available for SCHED_OTHER task execution. But it
3703 * can also include other factors [XXX].
3704 *
3705 * To achieve this balance we define a measure of imbalance which follows
3706 * directly from (1):
3707 *
3708 * imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j } (4)
3709 *
3710 * We then move tasks around to minimize the imbalance. In the continuous
3711 * function space it is obvious this converges; in the discrete case we get
3712 * a few fun cases generally called infeasible weight scenarios.
3713 *
3714 * [XXX expand on:
3715 * - infeasible weights;
3716 * - local vs global optima in the discrete case. ]
3717 *
3718 *
3719 * SCHED DOMAINS
3720 *
3721 * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
3722 * for all i,j solution, we create a tree of cpus that follows the hardware
3723 * topology where each level pairs two lower groups (or better). This results
3724 * in O(log n) layers. Furthermore we reduce the number of cpus going up the
3725 * tree to only the first of the previous level and we decrease the frequency
3726 * of load-balance at each level inv. proportional to the number of cpus in
3727 * the groups.
3728 *
3729 * This yields:
3730 *
3731 *    log_2 n
3732 *     \Sum  { (1/2^i) * (n/2^i) * 2^i }  =  O(n)                      (5)
3733 *    i = 0
3734 *
3735 *   where, per level i, 1/2^i is the balancing frequency, n/2^i is the
3736 *   number of cpus doing load-balance, and 2^i is the size of each group;
3737 *   the sum runs over all levels of the domain tree.
3738 *
3739 * Coupled with a limit on how many tasks we can migrate every balance pass,
3740 * this makes (5) the runtime complexity of the balancer.
3741 *
3742 * An important property here is that each CPU is still (indirectly) connected
3743 * to every other cpu in at most O(log n) steps:
3744 *
3745 * The adjacency matrix of the resulting graph is given by:
3746 *
3747 * log_2 n
3748 * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6)
3749 * k = 0
3750 *
3751 * And you'll find that:
3752 *
3753 * A^(log_2 n)_i,j != 0 for all i,j (7)
3754 *
3755 * Showing there's indeed a path between every cpu in at most O(log n) steps.
3756 * The task movement gives a factor of O(m), giving a convergence complexity
3757 * of:
3758 *
3759 * O(nm log n), n := nr_cpus, m := nr_tasks (8)
3760 *
3761 *
3762 * WORK CONSERVING
3763 *
3764 * In order to avoid CPUs going idle while there's still work to do, new idle
3765 * balancing is more aggressive and has the newly idle cpu iterate up the domain
3766 * tree itself instead of relying on other CPUs to bring it work.
3767 *
3768 * This adds some complexity to both (5) and (8) but it reduces the total idle
3769 * time.
3770 *
3771 * [XXX more?]
3772 *
3773 *
3774 * CGROUPS
3775 *
3776 * Cgroups make a horror show out of (2), instead of a simple sum we get:
3777 *
3778 *
3779 *   W_i,0 = \Sum_j  \Prod_k  w_k * ( s_k,i / S_k )                     (9)
3780 *
3781 *
3782 * Where
3783 *
3784 * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)
3785 *
3786 * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
3787 *
3788 * The big problem is S_k: it is a global sum needed to compute a local (W_i)
3789 * property.
3790 *
3791 * [XXX write more on how we solve this.. _after_ merging pjt's patches that
3792 * rewrite all of this once again.]
3793 */
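To make (5) concrete: level i balances at frequency 1/2^i with n/2^i cpus, each over a group of 2^i cpus, so it contributes (1/2^i) * (n/2^i) * 2^i = n/2^i units of work. For n = 8 that is 8 + 4 + 2 + 1 = 15 < 2n, which is the O(n) bound claimed above.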
3038 3794
3039static unsigned long __read_mostly max_load_balance_interval = HZ/10; 3795static unsigned long __read_mostly max_load_balance_interval = HZ/10;
3040 3796
@@ -3300,52 +4056,58 @@ next:
3300/* 4056/*
3301 * update tg->load_weight by folding this cpu's load_avg 4057 * update tg->load_weight by folding this cpu's load_avg
3302 */ 4058 */
3303static int update_shares_cpu(struct task_group *tg, int cpu) 4059static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)
3304{ 4060{
3305 struct cfs_rq *cfs_rq; 4061 struct sched_entity *se = tg->se[cpu];
3306 unsigned long flags; 4062 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
3307 struct rq *rq;
3308
3309 if (!tg->se[cpu])
3310 return 0;
3311
3312 rq = cpu_rq(cpu);
3313 cfs_rq = tg->cfs_rq[cpu];
3314
3315 raw_spin_lock_irqsave(&rq->lock, flags);
3316
3317 update_rq_clock(rq);
3318 update_cfs_load(cfs_rq, 1);
3319 4063
3320 /* 4064 /* throttled entities do not contribute to load */
3321 * We need to update shares after updating tg->load_weight in 4065 if (throttled_hierarchy(cfs_rq))
3322 * order to adjust the weight of groups with long running tasks. 4066 return;
3323 */
3324 update_cfs_shares(cfs_rq);
3325 4067
3326 raw_spin_unlock_irqrestore(&rq->lock, flags); 4068 update_cfs_rq_blocked_load(cfs_rq, 1);
3327 4069
3328 return 0; 4070 if (se) {
4071 update_entity_load_avg(se, 1);
4072 /*
4073 * We pivot on our runnable average having decayed to zero for
4074 * list removal. This generally implies that all our children
4075 * have also been removed (modulo rounding error or bandwidth
4076 * control); however, such cases are rare and we can fix these
4077 * at enqueue.
4078 *
4079 * TODO: fix up out-of-order children on enqueue.
4080 */
4081 if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running)
4082 list_del_leaf_cfs_rq(cfs_rq);
4083 } else {
4084 struct rq *rq = rq_of(cfs_rq);
4085 update_rq_runnable_avg(rq, rq->nr_running);
4086 }
3329} 4087}
3330 4088
3331static void update_shares(int cpu) 4089static void update_blocked_averages(int cpu)
3332{ 4090{
3333 struct cfs_rq *cfs_rq;
3334 struct rq *rq = cpu_rq(cpu); 4091 struct rq *rq = cpu_rq(cpu);
4092 struct cfs_rq *cfs_rq;
4093 unsigned long flags;
3335 4094
3336 rcu_read_lock(); 4095 raw_spin_lock_irqsave(&rq->lock, flags);
4096 update_rq_clock(rq);
3337 /* 4097 /*
3338 * Iterates the task_group tree in a bottom up fashion, see 4098 * Iterates the task_group tree in a bottom up fashion, see
3339 * list_add_leaf_cfs_rq() for details. 4099 * list_add_leaf_cfs_rq() for details.
3340 */ 4100 */
3341 for_each_leaf_cfs_rq(rq, cfs_rq) { 4101 for_each_leaf_cfs_rq(rq, cfs_rq) {
3342 /* throttled entities do not contribute to load */ 4102 /*
3343 if (throttled_hierarchy(cfs_rq)) 4103 * Note: We may want to consider periodically releasing
3344 continue; 4104 * rq->lock about these updates so that creating many task
3345 4105 * groups does not result in continually extending hold time.
3346 update_shares_cpu(cfs_rq->tg, cpu); 4106 */
4107 __update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
3347 } 4108 }
3348 rcu_read_unlock(); 4109
4110 raw_spin_unlock_irqrestore(&rq->lock, flags);
3349} 4111}
3350 4112
3351/* 4113/*
@@ -3397,7 +4159,7 @@ static unsigned long task_h_load(struct task_struct *p)
3397 return load; 4159 return load;
3398} 4160}
3399#else 4161#else
3400static inline void update_shares(int cpu) 4162static inline void update_blocked_averages(int cpu)
3401{ 4163{
3402} 4164}
3403 4165
@@ -4457,12 +5219,14 @@ void idle_balance(int this_cpu, struct rq *this_rq)
4457 if (this_rq->avg_idle < sysctl_sched_migration_cost) 5219 if (this_rq->avg_idle < sysctl_sched_migration_cost)
4458 return; 5220 return;
4459 5221
5222 update_rq_runnable_avg(this_rq, 1);
5223
4460 /* 5224 /*
4461 * Drop the rq->lock, but keep IRQ/preempt disabled. 5225 * Drop the rq->lock, but keep IRQ/preempt disabled.
4462 */ 5226 */
4463 raw_spin_unlock(&this_rq->lock); 5227 raw_spin_unlock(&this_rq->lock);
4464 5228
4465 update_shares(this_cpu); 5229 update_blocked_averages(this_cpu);
4466 rcu_read_lock(); 5230 rcu_read_lock();
4467 for_each_domain(this_cpu, sd) { 5231 for_each_domain(this_cpu, sd) {
4468 unsigned long interval; 5232 unsigned long interval;
@@ -4717,7 +5481,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
4717 int update_next_balance = 0; 5481 int update_next_balance = 0;
4718 int need_serialize; 5482 int need_serialize;
4719 5483
4720 update_shares(cpu); 5484 update_blocked_averages(cpu);
4721 5485
4722 rcu_read_lock(); 5486 rcu_read_lock();
4723 for_each_domain(cpu, sd) { 5487 for_each_domain(cpu, sd) {
@@ -4954,6 +5718,11 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
4954 cfs_rq = cfs_rq_of(se); 5718 cfs_rq = cfs_rq_of(se);
4955 entity_tick(cfs_rq, se, queued); 5719 entity_tick(cfs_rq, se, queued);
4956 } 5720 }
5721
5722 if (sched_feat_numa(NUMA))
5723 task_tick_numa(rq, curr);
5724
5725 update_rq_runnable_avg(rq, 1);
4957} 5726}
4958 5727
4959/* 5728/*
@@ -5046,6 +5815,20 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
5046 place_entity(cfs_rq, se, 0); 5815 place_entity(cfs_rq, se, 0);
5047 se->vruntime -= cfs_rq->min_vruntime; 5816 se->vruntime -= cfs_rq->min_vruntime;
5048 } 5817 }
5818
5819#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
5820 /*
5821 * Remove our load from contribution when we leave sched_fair
5822 * and ensure we don't carry in an old decay_count if we
5823 * switch back.
5824 */
5825 if (p->se.avg.decay_count) {
5826 struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
5827 __synchronize_entity_decay(&p->se);
5828 subtract_blocked_load_contrib(cfs_rq,
5829 p->se.avg.load_avg_contrib);
5830 }
5831#endif
5049} 5832}
5050 5833
5051/* 5834/*
@@ -5092,11 +5875,16 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
5092#ifndef CONFIG_64BIT 5875#ifndef CONFIG_64BIT
5093 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; 5876 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
5094#endif 5877#endif
5878#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
5879 atomic64_set(&cfs_rq->decay_counter, 1);
5880 atomic64_set(&cfs_rq->removed_load, 0);
5881#endif
5095} 5882}
5096 5883
5097#ifdef CONFIG_FAIR_GROUP_SCHED 5884#ifdef CONFIG_FAIR_GROUP_SCHED
5098static void task_move_group_fair(struct task_struct *p, int on_rq) 5885static void task_move_group_fair(struct task_struct *p, int on_rq)
5099{ 5886{
5887 struct cfs_rq *cfs_rq;
5100 /* 5888 /*
5101 * If the task was not on the rq at the time of this cgroup movement 5889 * If the task was not on the rq at the time of this cgroup movement
5102 * it must have been asleep, sleeping tasks keep their ->vruntime 5890 * it must have been asleep, sleeping tasks keep their ->vruntime
@@ -5128,8 +5916,19 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
5128 if (!on_rq) 5916 if (!on_rq)
5129 p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; 5917 p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
5130 set_task_rq(p, task_cpu(p)); 5918 set_task_rq(p, task_cpu(p));
5131 if (!on_rq) 5919 if (!on_rq) {
5132 p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime; 5920 cfs_rq = cfs_rq_of(&p->se);
5921 p->se.vruntime += cfs_rq->min_vruntime;
5922#ifdef CONFIG_SMP
5923 /*
5924 * migrate_task_rq_fair() will have removed our previous
5925 * contribution, but we must synchronize for ongoing future
5926 * decay.
5927 */
5928 p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
5929 cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib;
5930#endif
5931 }
5133} 5932}
5134 5933
5135void free_fair_sched_group(struct task_group *tg) 5934void free_fair_sched_group(struct task_group *tg)
@@ -5214,10 +6013,6 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
5214 6013
5215 cfs_rq->tg = tg; 6014 cfs_rq->tg = tg;
5216 cfs_rq->rq = rq; 6015 cfs_rq->rq = rq;
5217#ifdef CONFIG_SMP
5218 /* allow initial update_cfs_load() to truncate */
5219 cfs_rq->load_stamp = 1;
5220#endif
5221 init_cfs_rq_runtime(cfs_rq); 6016 init_cfs_rq_runtime(cfs_rq);
5222 6017
5223 tg->cfs_rq[cpu] = cfs_rq; 6018 tg->cfs_rq[cpu] = cfs_rq;
@@ -5297,7 +6092,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
5297 * idle runqueue: 6092 * idle runqueue:
5298 */ 6093 */
5299 if (rq->cfs.load.weight) 6094 if (rq->cfs.load.weight)
5300 rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se)); 6095 rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
5301 6096
5302 return rr_interval; 6097 return rr_interval;
5303} 6098}
@@ -5319,7 +6114,9 @@ const struct sched_class fair_sched_class = {
5319 6114
5320#ifdef CONFIG_SMP 6115#ifdef CONFIG_SMP
5321 .select_task_rq = select_task_rq_fair, 6116 .select_task_rq = select_task_rq_fair,
5322 6117#ifdef CONFIG_FAIR_GROUP_SCHED
6118 .migrate_task_rq = migrate_task_rq_fair,
6119#endif
5323 .rq_online = rq_online_fair, 6120 .rq_online = rq_online_fair,
5324 .rq_offline = rq_offline_fair, 6121 .rq_offline = rq_offline_fair,
5325 6122
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index eebefcad7027..1ad1d2b5395f 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -32,6 +32,11 @@ SCHED_FEAT(LAST_BUDDY, true)
32SCHED_FEAT(CACHE_HOT_BUDDY, true) 32SCHED_FEAT(CACHE_HOT_BUDDY, true)
33 33
34/* 34/*
35 * Allow wakeup-time preemption of the current task:
36 */
37SCHED_FEAT(WAKEUP_PREEMPTION, true)
38
39/*
35 * Use arch dependent cpu power functions 40 * Use arch dependent cpu power functions
36 */ 41 */
37SCHED_FEAT(ARCH_POWER, true) 42SCHED_FEAT(ARCH_POWER, true)
@@ -61,3 +66,14 @@ SCHED_FEAT(TTWU_QUEUE, true)
61SCHED_FEAT(FORCE_SD_OVERLAP, false) 66SCHED_FEAT(FORCE_SD_OVERLAP, false)
62SCHED_FEAT(RT_RUNTIME_SHARE, true) 67SCHED_FEAT(RT_RUNTIME_SHARE, true)
63SCHED_FEAT(LB_MIN, false) 68SCHED_FEAT(LB_MIN, false)
69
70/*
71 * Apply the automatic NUMA scheduling policy. Enabled automatically
72 * at runtime if running on a NUMA machine. Can be controlled via
73 * numa_balancing=. Allow PTE scanning to be forced on UMA machines
74 * for debugging the core machinery.
75 */
76#ifdef CONFIG_NUMA_BALANCING
77SCHED_FEAT(NUMA, false)
78SCHED_FEAT(NUMA_FORCE, false)
79#endif
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 418feb01344e..127a2c4cf4ab 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -7,6 +7,8 @@
7 7
8#include <linux/slab.h> 8#include <linux/slab.h>
9 9
10int sched_rr_timeslice = RR_TIMESLICE;
11
10static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); 12static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
11 13
12struct rt_bandwidth def_rt_bandwidth; 14struct rt_bandwidth def_rt_bandwidth;
@@ -566,7 +568,7 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
566static int do_balance_runtime(struct rt_rq *rt_rq) 568static int do_balance_runtime(struct rt_rq *rt_rq)
567{ 569{
568 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 570 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
569 struct root_domain *rd = cpu_rq(smp_processor_id())->rd; 571 struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
570 int i, weight, more = 0; 572 int i, weight, more = 0;
571 u64 rt_period; 573 u64 rt_period;
572 574
@@ -925,8 +927,8 @@ static void update_curr_rt(struct rq *rq)
925 return; 927 return;
926 928
927 delta_exec = rq->clock_task - curr->se.exec_start; 929 delta_exec = rq->clock_task - curr->se.exec_start;
928 if (unlikely((s64)delta_exec < 0)) 930 if (unlikely((s64)delta_exec <= 0))
929 delta_exec = 0; 931 return;
930 932
931 schedstat_set(curr->se.statistics.exec_max, 933 schedstat_set(curr->se.statistics.exec_max,
932 max(curr->se.statistics.exec_max, delta_exec)); 934 max(curr->se.statistics.exec_max, delta_exec));
@@ -1427,8 +1429,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1427static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) 1429static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1428{ 1430{
1429 if (!task_running(rq, p) && 1431 if (!task_running(rq, p) &&
1430 (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) && 1432 cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
1431 (p->nr_cpus_allowed > 1))
1432 return 1; 1433 return 1;
1433 return 0; 1434 return 0;
1434} 1435}
@@ -1889,8 +1890,11 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
1889 * we may need to handle the pulling of RT tasks 1890 * we may need to handle the pulling of RT tasks
1890 * now. 1891 * now.
1891 */ 1892 */
1892 if (p->on_rq && !rq->rt.rt_nr_running) 1893 if (!p->on_rq || rq->rt.rt_nr_running)
1893 pull_rt_task(rq); 1894 return;
1895
1896 if (pull_rt_task(rq))
1897 resched_task(rq->curr);
1894} 1898}
1895 1899
1896void init_sched_rt_class(void) 1900void init_sched_rt_class(void)
@@ -1985,7 +1989,11 @@ static void watchdog(struct rq *rq, struct task_struct *p)
1985 if (soft != RLIM_INFINITY) { 1989 if (soft != RLIM_INFINITY) {
1986 unsigned long next; 1990 unsigned long next;
1987 1991
1988 p->rt.timeout++; 1992 if (p->rt.watchdog_stamp != jiffies) {
1993 p->rt.timeout++;
1994 p->rt.watchdog_stamp = jiffies;
1995 }
1996
1989 next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ); 1997 next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
1990 if (p->rt.timeout > next) 1998 if (p->rt.timeout > next)
1991 p->cputime_expires.sched_exp = p->se.sum_exec_runtime; 1999 p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
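For context, watchdog() above is what enforces RLIMIT_RTTIME (microseconds of CPU consumed without blocking) via p->rt.timeout, and the new watchdog_stamp check merely keeps that counter from being bumped more than once in the same jiffy. A hedged user-space sketch of tripping the limit (values arbitrary; SCHED_RR normally requires privileges):

#include <stdio.h>
#include <sched.h>
#include <sys/resource.h>

int main(void)
{
	struct rlimit rl = { .rlim_cur = 500000, .rlim_max = 1000000 }; /* us */
	struct sched_param sp = { .sched_priority = 10 };

	if (setrlimit(RLIMIT_RTTIME, &rl))
		perror("setrlimit");
	if (sched_setscheduler(0, SCHED_RR, &sp))
		perror("sched_setscheduler");

	for (;;)
		; /* busy loop: SIGXCPU after the soft limit, SIGKILL at the hard one */
}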
@@ -2010,7 +2018,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
2010 if (--p->rt.time_slice) 2018 if (--p->rt.time_slice)
2011 return; 2019 return;
2012 2020
2013 p->rt.time_slice = RR_TIMESLICE; 2021 p->rt.time_slice = sched_rr_timeslice;
2014 2022
2015 /* 2023 /*
2016 * Requeue to the end of queue if we (and all of our ancestors) are the 2024 * Requeue to the end of queue if we (and all of our ancestors) are the
@@ -2041,7 +2049,7 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
2041 * Time slice is 0 for SCHED_FIFO tasks 2049 * Time slice is 0 for SCHED_FIFO tasks
2042 */ 2050 */
2043 if (task->policy == SCHED_RR) 2051 if (task->policy == SCHED_RR)
2044 return RR_TIMESLICE; 2052 return sched_rr_timeslice;
2045 else 2053 else
2046 return 0; 2054 return 0;
2047} 2055}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 7a7db09cfabc..cc03cfdf469f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1,5 +1,7 @@
1 1
2#include <linux/sched.h> 2#include <linux/sched.h>
3#include <linux/sched/sysctl.h>
4#include <linux/sched/rt.h>
3#include <linux/mutex.h> 5#include <linux/mutex.h>
4#include <linux/spinlock.h> 6#include <linux/spinlock.h>
5#include <linux/stop_machine.h> 7#include <linux/stop_machine.h>
@@ -112,6 +114,8 @@ struct task_group {
112 unsigned long shares; 114 unsigned long shares;
113 115
114 atomic_t load_weight; 116 atomic_t load_weight;
117 atomic64_t load_avg;
118 atomic_t runnable_avg;
115#endif 119#endif
116 120
117#ifdef CONFIG_RT_GROUP_SCHED 121#ifdef CONFIG_RT_GROUP_SCHED
@@ -222,22 +226,29 @@ struct cfs_rq {
222 unsigned int nr_spread_over; 226 unsigned int nr_spread_over;
223#endif 227#endif
224 228
229#ifdef CONFIG_SMP
230/*
231 * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
232 * removed when useful for applications beyond shares distribution (e.g.
233 * load-balance).
234 */
225#ifdef CONFIG_FAIR_GROUP_SCHED 235#ifdef CONFIG_FAIR_GROUP_SCHED
226 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
227
228 /* 236 /*
229 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in 237 * CFS Load tracking
230 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities 238 * Under CFS, load is tracked on a per-entity basis and aggregated up.
231 * (like users, containers etc.) 239 * This allows for the description of both thread and group usage (in
232 * 240 * the FAIR_GROUP_SCHED case).
233 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
234 * list is used during load balance.
235 */ 241 */
236 int on_list; 242 u64 runnable_load_avg, blocked_load_avg;
237 struct list_head leaf_cfs_rq_list; 243 atomic64_t decay_counter, removed_load;
238 struct task_group *tg; /* group that "owns" this runqueue */ 244 u64 last_decay;
245#endif /* CONFIG_FAIR_GROUP_SCHED */
246/* These always depend on CONFIG_FAIR_GROUP_SCHED */
247#ifdef CONFIG_FAIR_GROUP_SCHED
248 u32 tg_runnable_contrib;
249 u64 tg_load_contrib;
250#endif /* CONFIG_FAIR_GROUP_SCHED */
239 251
240#ifdef CONFIG_SMP
241 /* 252 /*
242 * h_load = weight * f(tg) 253 * h_load = weight * f(tg)
243 * 254 *
@@ -245,26 +256,30 @@ struct cfs_rq {
245 * this group. 256 * this group.
246 */ 257 */
247 unsigned long h_load; 258 unsigned long h_load;
259#endif /* CONFIG_SMP */
260
261#ifdef CONFIG_FAIR_GROUP_SCHED
262 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
248 263
249 /* 264 /*
250 * Maintaining per-cpu shares distribution for group scheduling 265 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
266 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
267 * (like users, containers etc.)
251 * 268 *
252 * load_stamp is the last time we updated the load average 269 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
253 * load_last is the last time we updated the load average and saw load 270 * list is used during load balance.
254 * load_unacc_exec_time is currently unaccounted execution time
255 */ 271 */
256 u64 load_avg; 272 int on_list;
257 u64 load_period; 273 struct list_head leaf_cfs_rq_list;
258 u64 load_stamp, load_last, load_unacc_exec_time; 274 struct task_group *tg; /* group that "owns" this runqueue */
259 275
260 unsigned long load_contribution;
261#endif /* CONFIG_SMP */
262#ifdef CONFIG_CFS_BANDWIDTH 276#ifdef CONFIG_CFS_BANDWIDTH
263 int runtime_enabled; 277 int runtime_enabled;
264 u64 runtime_expires; 278 u64 runtime_expires;
265 s64 runtime_remaining; 279 s64 runtime_remaining;
266 280
267 u64 throttled_timestamp; 281 u64 throttled_clock, throttled_clock_task;
282 u64 throttled_clock_task_time;
268 int throttled, throttle_count; 283 int throttled, throttle_count;
269 struct list_head throttled_list; 284 struct list_head throttled_list;
270#endif /* CONFIG_CFS_BANDWIDTH */ 285#endif /* CONFIG_CFS_BANDWIDTH */
@@ -467,6 +482,8 @@ struct rq {
467#ifdef CONFIG_SMP 482#ifdef CONFIG_SMP
468 struct llist_head wake_list; 483 struct llist_head wake_list;
469#endif 484#endif
485
486 struct sched_avg avg;
470}; 487};
471 488
472static inline int cpu_of(struct rq *rq) 489static inline int cpu_of(struct rq *rq)
@@ -648,6 +665,18 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
648#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) 665#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
649#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ 666#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
650 667
668#ifdef CONFIG_NUMA_BALANCING
669#define sched_feat_numa(x) sched_feat(x)
670#ifdef CONFIG_SCHED_DEBUG
671#define numabalancing_enabled sched_feat_numa(NUMA)
672#else
673extern bool numabalancing_enabled;
674#endif /* CONFIG_SCHED_DEBUG */
675#else
676#define sched_feat_numa(x) (0)
677#define numabalancing_enabled (0)
678#endif /* CONFIG_NUMA_BALANCING */
679
651static inline u64 global_rt_period(void) 680static inline u64 global_rt_period(void)
652{ 681{
653 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; 682 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
@@ -1212,4 +1241,3 @@ static inline u64 irq_time_read(int cpu)
1212} 1241}
1213#endif /* CONFIG_64BIT */ 1242#endif /* CONFIG_64BIT */
1214#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ 1243#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
1215
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index 903ffa9e8872..e036eda1a9c9 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -21,14 +21,17 @@ static int show_schedstat(struct seq_file *seq, void *v)
21 if (mask_str == NULL) 21 if (mask_str == NULL)
22 return -ENOMEM; 22 return -ENOMEM;
23 23
24 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); 24 if (v == (void *)1) {
25 seq_printf(seq, "timestamp %lu\n", jiffies); 25 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
26 for_each_online_cpu(cpu) { 26 seq_printf(seq, "timestamp %lu\n", jiffies);
27 struct rq *rq = cpu_rq(cpu); 27 } else {
28 struct rq *rq;
28#ifdef CONFIG_SMP 29#ifdef CONFIG_SMP
29 struct sched_domain *sd; 30 struct sched_domain *sd;
30 int dcount = 0; 31 int dcount = 0;
31#endif 32#endif
33 cpu = (unsigned long)(v - 2);
34 rq = cpu_rq(cpu);
32 35
33 /* runqueue-specific stats */ 36 /* runqueue-specific stats */
34 seq_printf(seq, 37 seq_printf(seq,
@@ -77,30 +80,66 @@ static int show_schedstat(struct seq_file *seq, void *v)
77 return 0; 80 return 0;
78} 81}
79 82
80static int schedstat_open(struct inode *inode, struct file *file) 83/*
 84 * This iterator needs some explanation.
85 * It returns 1 for the header position.
86 * This means 2 is cpu 0.
87 * In a hotplugged system some cpus, including cpu 0, may be missing so we have
88 * to use cpumask_* to iterate over the cpus.
89 */
90static void *schedstat_start(struct seq_file *file, loff_t *offset)
81{ 91{
82 unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32); 92 unsigned long n = *offset;
83 char *buf = kmalloc(size, GFP_KERNEL);
84 struct seq_file *m;
85 int res;
86 93
87 if (!buf) 94 if (n == 0)
88 return -ENOMEM; 95 return (void *) 1;
89 res = single_open(file, show_schedstat, NULL); 96
90 if (!res) { 97 n--;
91 m = file->private_data; 98
92 m->buf = buf; 99 if (n > 0)
93 m->size = size; 100 n = cpumask_next(n - 1, cpu_online_mask);
94 } else 101 else
95 kfree(buf); 102 n = cpumask_first(cpu_online_mask);
96 return res; 103
104 *offset = n + 1;
105
106 if (n < nr_cpu_ids)
107 return (void *)(unsigned long)(n + 2);
108 return NULL;
109}
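With every CPU online and numbered contiguously, the *offset-to-token mapping implemented above reduces to: offset 0 is the header (token 1) and offset n >= 1 is cpu n-1 (token cpu + 2). A stand-alone toy sketch of just that mapping (hot-unplugged CPUs are ignored; names like token_for are invented for illustration):

#include <stdio.h>

#define NR_TOY_CPUS 4	/* pretend cpus 0..3 are online, none missing */

static long token_for(long offset)
{
	if (offset == 0)
		return 1;			/* header token */
	if (offset - 1 < NR_TOY_CPUS)
		return (offset - 1) + 2;	/* cpu token = cpu + 2 */
	return 0;				/* end of sequence */
}

int main(void)
{
	for (long off = 0, tok; (tok = token_for(off)) != 0; off++) {
		if (tok == 1)
			printf("offset %ld -> header\n", off);
		else
			printf("offset %ld -> cpu %ld\n", off, tok - 2);
	}
	return 0;
}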
110
111static void *schedstat_next(struct seq_file *file, void *data, loff_t *offset)
112{
113 (*offset)++;
114 return schedstat_start(file, offset);
115}
116
117static void schedstat_stop(struct seq_file *file, void *data)
118{
119}
120
121static const struct seq_operations schedstat_sops = {
122 .start = schedstat_start,
123 .next = schedstat_next,
124 .stop = schedstat_stop,
125 .show = show_schedstat,
126};
127
128static int schedstat_open(struct inode *inode, struct file *file)
129{
130 return seq_open(file, &schedstat_sops);
97} 131}
98 132
133static int schedstat_release(struct inode *inode, struct file *file)
134{
135 return 0;
136};
137
99static const struct file_operations proc_schedstat_operations = { 138static const struct file_operations proc_schedstat_operations = {
100 .open = schedstat_open, 139 .open = schedstat_open,
101 .read = seq_read, 140 .read = seq_read,
102 .llseek = seq_lseek, 141 .llseek = seq_lseek,
103 .release = single_release, 142 .release = schedstat_release,
104}; 143};
105 144
106static int __init proc_schedstat_init(void) 145static int __init proc_schedstat_init(void)
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index ee376beedaf9..5af44b593770 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -396,25 +396,29 @@ int __secure_computing(int this_syscall)
396#ifdef CONFIG_SECCOMP_FILTER 396#ifdef CONFIG_SECCOMP_FILTER
397 case SECCOMP_MODE_FILTER: { 397 case SECCOMP_MODE_FILTER: {
398 int data; 398 int data;
399 struct pt_regs *regs = task_pt_regs(current);
399 ret = seccomp_run_filters(this_syscall); 400 ret = seccomp_run_filters(this_syscall);
400 data = ret & SECCOMP_RET_DATA; 401 data = ret & SECCOMP_RET_DATA;
401 ret &= SECCOMP_RET_ACTION; 402 ret &= SECCOMP_RET_ACTION;
402 switch (ret) { 403 switch (ret) {
403 case SECCOMP_RET_ERRNO: 404 case SECCOMP_RET_ERRNO:
404 /* Set the low-order 16-bits as a errno. */ 405 /* Set the low-order 16-bits as a errno. */
405 syscall_set_return_value(current, task_pt_regs(current), 406 syscall_set_return_value(current, regs,
406 -data, 0); 407 -data, 0);
407 goto skip; 408 goto skip;
408 case SECCOMP_RET_TRAP: 409 case SECCOMP_RET_TRAP:
409 /* Show the handler the original registers. */ 410 /* Show the handler the original registers. */
410 syscall_rollback(current, task_pt_regs(current)); 411 syscall_rollback(current, regs);
411 /* Let the filter pass back 16 bits of data. */ 412 /* Let the filter pass back 16 bits of data. */
412 seccomp_send_sigsys(this_syscall, data); 413 seccomp_send_sigsys(this_syscall, data);
413 goto skip; 414 goto skip;
414 case SECCOMP_RET_TRACE: 415 case SECCOMP_RET_TRACE:
415 /* Skip these calls if there is no tracer. */ 416 /* Skip these calls if there is no tracer. */
416 if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) 417 if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
418 syscall_set_return_value(current, regs,
419 -ENOSYS, 0);
417 goto skip; 420 goto skip;
421 }
418 /* Allow the BPF to provide the event message */ 422 /* Allow the BPF to provide the event message */
419 ptrace_event(PTRACE_EVENT_SECCOMP, data); 423 ptrace_event(PTRACE_EVENT_SECCOMP, data);
420 /* 424 /*
@@ -425,6 +429,9 @@ int __secure_computing(int this_syscall)
425 */ 429 */
426 if (fatal_signal_pending(current)) 430 if (fatal_signal_pending(current))
427 break; 431 break;
432 if (syscall_get_nr(current, regs) < 0)
433 goto skip; /* Explicit request to skip. */
434
428 return 0; 435 return 0;
429 case SECCOMP_RET_ALLOW: 436 case SECCOMP_RET_ALLOW:
430 return 0; 437 return 0;
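To make the SECCOMP_RET_ERRNO path above concrete, here is a hedged user-space sketch of a filter that fails getpid() with EPERM; the kernel code masks out the low 16 bits as "data" and returns -data to the caller. Illustrative only: a real filter should also validate seccomp_data.arch.

#include <errno.h>
#include <stddef.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <linux/filter.h>
#include <linux/seccomp.h>

int main(void)
{
	struct sock_filter filter[] = {
		/* A = syscall number */
		BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
			 offsetof(struct seccomp_data, nr)),
		/* getpid? fall through to RET_ERRNO, otherwise jump to RET_ALLOW */
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_getpid, 0, 1),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ERRNO | EPERM),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog prog = {
		.len = sizeof(filter) / sizeof(filter[0]),
		.filter = filter,
	};

	prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog))
		perror("PR_SET_SECCOMP");

	errno = 0;
	printf("getpid() = %ld, errno = %d\n", (long)syscall(__NR_getpid), errno);
	return 0;	/* expect -1 / EPERM from the filtered syscall */
}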
diff --git a/kernel/signal.c b/kernel/signal.c
index 0af8868525d6..dd72567767d9 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -31,6 +31,7 @@
31#include <linux/nsproxy.h> 31#include <linux/nsproxy.h>
32#include <linux/user_namespace.h> 32#include <linux/user_namespace.h>
33#include <linux/uprobes.h> 33#include <linux/uprobes.h>
34#include <linux/compat.h>
34#define CREATE_TRACE_POINTS 35#define CREATE_TRACE_POINTS
35#include <trace/events/signal.h> 36#include <trace/events/signal.h>
36 37
@@ -484,6 +485,9 @@ flush_signal_handlers(struct task_struct *t, int force_default)
484 if (force_default || ka->sa.sa_handler != SIG_IGN) 485 if (force_default || ka->sa.sa_handler != SIG_IGN)
485 ka->sa.sa_handler = SIG_DFL; 486 ka->sa.sa_handler = SIG_DFL;
486 ka->sa.sa_flags = 0; 487 ka->sa.sa_flags = 0;
488#ifdef __ARCH_HAS_SA_RESTORER
489 ka->sa.sa_restorer = NULL;
490#endif
487 sigemptyset(&ka->sa.sa_mask); 491 sigemptyset(&ka->sa.sa_mask);
488 ka++; 492 ka++;
489 } 493 }
@@ -679,23 +683,17 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
679 * No need to set need_resched since signal event passing 683 * No need to set need_resched since signal event passing
680 * goes through ->blocked 684 * goes through ->blocked
681 */ 685 */
682void signal_wake_up(struct task_struct *t, int resume) 686void signal_wake_up_state(struct task_struct *t, unsigned int state)
683{ 687{
684 unsigned int mask;
685
686 set_tsk_thread_flag(t, TIF_SIGPENDING); 688 set_tsk_thread_flag(t, TIF_SIGPENDING);
687
688 /* 689 /*
689 * For SIGKILL, we want to wake it up in the stopped/traced/killable 690 * TASK_WAKEKILL also means wake it up in the stopped/traced/killable
690 * case. We don't check t->state here because there is a race with it 691 * case. We don't check t->state here because there is a race with it
691 * executing another processor and just now entering stopped state. 692 * executing another processor and just now entering stopped state.
692 * By using wake_up_state, we ensure the process will wake up and 693 * By using wake_up_state, we ensure the process will wake up and
693 * handle its death signal. 694 * handle its death signal.
694 */ 695 */
695 mask = TASK_INTERRUPTIBLE; 696 if (!wake_up_state(t, state | TASK_INTERRUPTIBLE))
696 if (resume)
697 mask |= TASK_WAKEKILL;
698 if (!wake_up_state(t, mask))
699 kick_process(t); 697 kick_process(t);
700} 698}
701 699
@@ -843,7 +841,7 @@ static void ptrace_trap_notify(struct task_struct *t)
843 assert_spin_locked(&t->sighand->siglock); 841 assert_spin_locked(&t->sighand->siglock);
844 842
845 task_set_jobctl_pending(t, JOBCTL_TRAP_NOTIFY); 843 task_set_jobctl_pending(t, JOBCTL_TRAP_NOTIFY);
846 signal_wake_up(t, t->jobctl & JOBCTL_LISTENING); 844 ptrace_signal_wake_up(t, t->jobctl & JOBCTL_LISTENING);
847} 845}
848 846
849/* 847/*
@@ -1159,13 +1157,14 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
1159 return __send_signal(sig, info, t, group, from_ancestor_ns); 1157 return __send_signal(sig, info, t, group, from_ancestor_ns);
1160} 1158}
1161 1159
1162static void print_fatal_signal(struct pt_regs *regs, int signr) 1160static void print_fatal_signal(int signr)
1163{ 1161{
1164 printk("%s/%d: potentially unexpected fatal signal %d.\n", 1162 struct pt_regs *regs = signal_pt_regs();
1163 printk(KERN_INFO "%s/%d: potentially unexpected fatal signal %d.\n",
1165 current->comm, task_pid_nr(current), signr); 1164 current->comm, task_pid_nr(current), signr);
1166 1165
1167#if defined(__i386__) && !defined(__arch_um__) 1166#if defined(__i386__) && !defined(__arch_um__)
1168 printk("code at %08lx: ", regs->ip); 1167 printk(KERN_INFO "code at %08lx: ", regs->ip);
1169 { 1168 {
1170 int i; 1169 int i;
1171 for (i = 0; i < 16; i++) { 1170 for (i = 0; i < 16; i++) {
@@ -1173,11 +1172,11 @@ static void print_fatal_signal(struct pt_regs *regs, int signr)
1173 1172
1174 if (get_user(insn, (unsigned char *)(regs->ip + i))) 1173 if (get_user(insn, (unsigned char *)(regs->ip + i)))
1175 break; 1174 break;
1176 printk("%02x ", insn); 1175 printk(KERN_CONT "%02x ", insn);
1177 } 1176 }
1178 } 1177 }
1178 printk(KERN_CONT "\n");
1179#endif 1179#endif
1180 printk("\n");
1181 preempt_disable(); 1180 preempt_disable();
1182 show_regs(regs); 1181 show_regs(regs);
1183 preempt_enable(); 1182 preempt_enable();
@@ -1636,6 +1635,7 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
1636 unsigned long flags; 1635 unsigned long flags;
1637 struct sighand_struct *psig; 1636 struct sighand_struct *psig;
1638 bool autoreap = false; 1637 bool autoreap = false;
1638 cputime_t utime, stime;
1639 1639
1640 BUG_ON(sig == -1); 1640 BUG_ON(sig == -1);
1641 1641
@@ -1673,8 +1673,9 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
1673 task_uid(tsk)); 1673 task_uid(tsk));
1674 rcu_read_unlock(); 1674 rcu_read_unlock();
1675 1675
1676 info.si_utime = cputime_to_clock_t(tsk->utime + tsk->signal->utime); 1676 task_cputime(tsk, &utime, &stime);
1677 info.si_stime = cputime_to_clock_t(tsk->stime + tsk->signal->stime); 1677 info.si_utime = cputime_to_clock_t(utime + tsk->signal->utime);
1678 info.si_stime = cputime_to_clock_t(stime + tsk->signal->stime);
1678 1679
1679 info.si_status = tsk->exit_code & 0x7f; 1680 info.si_status = tsk->exit_code & 0x7f;
1680 if (tsk->exit_code & 0x80) 1681 if (tsk->exit_code & 0x80)
@@ -1738,6 +1739,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
1738 unsigned long flags; 1739 unsigned long flags;
1739 struct task_struct *parent; 1740 struct task_struct *parent;
1740 struct sighand_struct *sighand; 1741 struct sighand_struct *sighand;
1742 cputime_t utime, stime;
1741 1743
1742 if (for_ptracer) { 1744 if (for_ptracer) {
1743 parent = tsk->parent; 1745 parent = tsk->parent;
@@ -1752,12 +1754,13 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
1752 * see comment in do_notify_parent() about the following 4 lines 1754 * see comment in do_notify_parent() about the following 4 lines
1753 */ 1755 */
1754 rcu_read_lock(); 1756 rcu_read_lock();
1755 info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); 1757 info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(parent));
1756 info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk)); 1758 info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk));
1757 rcu_read_unlock(); 1759 rcu_read_unlock();
1758 1760
1759 info.si_utime = cputime_to_clock_t(tsk->utime); 1761 task_cputime(tsk, &utime, &stime);
1760 info.si_stime = cputime_to_clock_t(tsk->stime); 1762 info.si_utime = cputime_to_clock_t(utime);
1763 info.si_stime = cputime_to_clock_t(stime);
1761 1764
1762 info.si_code = why; 1765 info.si_code = why;
1763 switch (why) { 1766 switch (why) {
@@ -1798,6 +1801,10 @@ static inline int may_ptrace_stop(void)
1798 * If SIGKILL was already sent before the caller unlocked 1801 * If SIGKILL was already sent before the caller unlocked
1799 * ->siglock we must see ->core_state != NULL. Otherwise it 1802 * ->siglock we must see ->core_state != NULL. Otherwise it
1800 * is safe to enter schedule(). 1803 * is safe to enter schedule().
1804 *
1805 * This is almost outdated, a task with the pending SIGKILL can't
1806 * block in TASK_TRACED. But PTRACE_EVENT_EXIT can be reported
1807 * after SIGKILL was already dequeued.
1801 */ 1808 */
1802 if (unlikely(current->mm->core_state) && 1809 if (unlikely(current->mm->core_state) &&
1803 unlikely(current->mm == current->parent->mm)) 1810 unlikely(current->mm == current->parent->mm))
@@ -1908,7 +1915,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
1908 preempt_disable(); 1915 preempt_disable();
1909 read_unlock(&tasklist_lock); 1916 read_unlock(&tasklist_lock);
1910 preempt_enable_no_resched(); 1917 preempt_enable_no_resched();
1911 schedule(); 1918 freezable_schedule();
1912 } else { 1919 } else {
1913 /* 1920 /*
1914 * By the time we got the lock, our tracer went away. 1921 * By the time we got the lock, our tracer went away.
@@ -1923,6 +1930,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
1923 if (gstop_done) 1930 if (gstop_done)
1924 do_notify_parent_cldstop(current, false, why); 1931 do_notify_parent_cldstop(current, false, why);
1925 1932
1933 /* tasklist protects us from ptrace_freeze_traced() */
1926 __set_current_state(TASK_RUNNING); 1934 __set_current_state(TASK_RUNNING);
1927 if (clear_code) 1935 if (clear_code)
1928 current->exit_code = 0; 1936 current->exit_code = 0;
@@ -1930,13 +1938,6 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
1930 } 1938 }
1931 1939
1932 /* 1940 /*
1933 * While in TASK_TRACED, we were considered "frozen enough".
1934 * Now that we woke up, it's crucial if we're supposed to be
1935 * frozen that we freeze now before running anything substantial.
1936 */
1937 try_to_freeze();
1938
1939 /*
1940 * We are back. Now reacquire the siglock before touching 1941 * We are back. Now reacquire the siglock before touching
1941 * last_siginfo, so that we are sure to have synchronized with 1942 * last_siginfo, so that we are sure to have synchronized with
1942 * any signal-sending on another CPU that wants to examine it. 1943 * any signal-sending on another CPU that wants to examine it.
@@ -2092,7 +2093,7 @@ static bool do_signal_stop(int signr)
2092 } 2093 }
2093 2094
2094 /* Now we don't run again until woken by SIGCONT or SIGKILL */ 2095 /* Now we don't run again until woken by SIGCONT or SIGKILL */
2095 schedule(); 2096 freezable_schedule();
2096 return true; 2097 return true;
2097 } else { 2098 } else {
2098 /* 2099 /*
@@ -2138,10 +2139,9 @@ static void do_jobctl_trap(void)
2138 } 2139 }
2139} 2140}
2140 2141
2141static int ptrace_signal(int signr, siginfo_t *info, 2142static int ptrace_signal(int signr, siginfo_t *info)
2142 struct pt_regs *regs, void *cookie)
2143{ 2143{
2144 ptrace_signal_deliver(regs, cookie); 2144 ptrace_signal_deliver();
2145 /* 2145 /*
2146 * We do not check sig_kernel_stop(signr) but set this marker 2146 * We do not check sig_kernel_stop(signr) but set this marker
2147 * unconditionally because we do not know whether debugger will 2147 * unconditionally because we do not know whether debugger will
@@ -2200,15 +2200,14 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka,
2200 if (unlikely(uprobe_deny_signal())) 2200 if (unlikely(uprobe_deny_signal()))
2201 return 0; 2201 return 0;
2202 2202
2203relock:
2204 /* 2203 /*
2205 * We'll jump back here after any time we were stopped in TASK_STOPPED. 2204 * Do this once, we can't return to user-mode if freezing() == T.
2206 * While in TASK_STOPPED, we were considered "frozen enough". 2205 * do_signal_stop() and ptrace_stop() do freezable_schedule() and
2207 * Now that we woke up, it's crucial if we're supposed to be 2206 * thus do not need another check after return.
2208 * frozen that we freeze now before running anything substantial.
2209 */ 2207 */
2210 try_to_freeze(); 2208 try_to_freeze();
2211 2209
2210relock:
2212 spin_lock_irq(&sighand->siglock); 2211 spin_lock_irq(&sighand->siglock);
2213 /* 2212 /*
2214 * Every stopped thread goes here after wakeup. Check to see if 2213 * Every stopped thread goes here after wakeup. Check to see if
@@ -2265,8 +2264,7 @@ relock:
2265 break; /* will return 0 */ 2264 break; /* will return 0 */
2266 2265
2267 if (unlikely(current->ptrace) && signr != SIGKILL) { 2266 if (unlikely(current->ptrace) && signr != SIGKILL) {
2268 signr = ptrace_signal(signr, info, 2267 signr = ptrace_signal(signr, info);
2269 regs, cookie);
2270 if (!signr) 2268 if (!signr)
2271 continue; 2269 continue;
2272 } 2270 }
@@ -2351,7 +2349,7 @@ relock:
2351 2349
2352 if (sig_kernel_coredump(signr)) { 2350 if (sig_kernel_coredump(signr)) {
2353 if (print_fatal_signals) 2351 if (print_fatal_signals)
2354 print_fatal_signal(regs, info->si_signo); 2352 print_fatal_signal(info->si_signo);
2355 /* 2353 /*
2356 * If it was able to dump core, this kills all 2354 * If it was able to dump core, this kills all
2357 * other threads in the group and synchronizes with 2355 * other threads in the group and synchronizes with
@@ -2360,7 +2358,7 @@ relock:
2360 * first and our do_group_exit call below will use 2358 * first and our do_group_exit call below will use
2361 * that value and ignore the one we pass it. 2359 * that value and ignore the one we pass it.
2362 */ 2360 */
2363 do_coredump(info, regs); 2361 do_coredump(info);
2364 } 2362 }
2365 2363
2366 /* 2364 /*
@@ -2404,6 +2402,15 @@ void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka,
2404 tracehook_signal_handler(sig, info, ka, regs, stepping); 2402 tracehook_signal_handler(sig, info, ka, regs, stepping);
2405} 2403}
2406 2404
2405void signal_setup_done(int failed, struct ksignal *ksig, int stepping)
2406{
2407 if (failed)
2408 force_sigsegv(ksig->sig, current);
2409 else
2410 signal_delivered(ksig->sig, &ksig->info, &ksig->ka,
2411 signal_pt_regs(), stepping);
2412}
2413
2407/* 2414/*
2408 * It could be that complete_signal() picked us to notify about the 2415 * It could be that complete_signal() picked us to notify about the
2409 * group-wide signal. Other threads should be notified now to take 2416 * group-wide signal. Other threads should be notified now to take
@@ -2536,11 +2543,8 @@ static void __set_task_blocked(struct task_struct *tsk, const sigset_t *newset)
2536 */ 2543 */
2537void set_current_blocked(sigset_t *newset) 2544void set_current_blocked(sigset_t *newset)
2538{ 2545{
2539 struct task_struct *tsk = current;
2540 sigdelsetmask(newset, sigmask(SIGKILL) | sigmask(SIGSTOP)); 2546 sigdelsetmask(newset, sigmask(SIGKILL) | sigmask(SIGSTOP));
2541 spin_lock_irq(&tsk->sighand->siglock); 2547 __set_current_blocked(newset);
2542 __set_task_blocked(tsk, newset);
2543 spin_unlock_irq(&tsk->sighand->siglock);
2544} 2548}
2545 2549
2546void __set_current_blocked(const sigset_t *newset) 2550void __set_current_blocked(const sigset_t *newset)
@@ -2624,41 +2628,96 @@ SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, nset,
2624 return 0; 2628 return 0;
2625} 2629}
2626 2630
2627long do_sigpending(void __user *set, unsigned long sigsetsize) 2631#ifdef CONFIG_COMPAT
2632COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset,
2633 compat_sigset_t __user *, oset, compat_size_t, sigsetsize)
2628{ 2634{
2629 long error = -EINVAL; 2635#ifdef __BIG_ENDIAN
2630 sigset_t pending; 2636 sigset_t old_set = current->blocked;
2637
2638 /* XXX: Don't preclude handling different sized sigset_t's. */
2639 if (sigsetsize != sizeof(sigset_t))
2640 return -EINVAL;
2631 2641
2642 if (nset) {
2643 compat_sigset_t new32;
2644 sigset_t new_set;
2645 int error;
2646 if (copy_from_user(&new32, nset, sizeof(compat_sigset_t)))
2647 return -EFAULT;
2648
2649 sigset_from_compat(&new_set, &new32);
2650 sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP));
2651
2652 error = sigprocmask(how, &new_set, NULL);
2653 if (error)
2654 return error;
2655 }
2656 if (oset) {
2657 compat_sigset_t old32;
2658 sigset_to_compat(&old32, &old_set);
2659 if (copy_to_user(oset, &old32, sizeof(compat_sigset_t)))
2660 return -EFAULT;
2661 }
2662 return 0;
2663#else
2664 return sys_rt_sigprocmask(how, (sigset_t __user *)nset,
2665 (sigset_t __user *)oset, sigsetsize);
2666#endif
2667}
2668#endif
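The reason only __BIG_ENDIAN needs the sigset_{from,to}_compat() dance: viewing a 64-bit signal bitmap as two 32-bit words keeps the same word order on little-endian but swaps word significance on big-endian. A tiny illustration, with the sigset layouts simplified to one unsigned long long:

#include <stdio.h>

int main(void)
{
	union {
		unsigned long long set64;   /* native 64-bit sigset word     */
		unsigned int word32[2];     /* how compat userspace reads it */
	} u = { .set64 = 1ULL << 1 };       /* pretend signal 2 is in the set */

	printf("word32[0] = %#x, word32[1] = %#x\n", u.word32[0], u.word32[1]);
	/*
	 * little-endian: word32[0] = 0x2, word32[1] = 0   -> no conversion needed
	 * big-endian:    word32[0] = 0,   word32[1] = 0x2 -> must repack for compat
	 */
	return 0;
}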
2669
2670static int do_sigpending(void *set, unsigned long sigsetsize)
2671{
2632 if (sigsetsize > sizeof(sigset_t)) 2672 if (sigsetsize > sizeof(sigset_t))
2633 goto out; 2673 return -EINVAL;
2634 2674
2635 spin_lock_irq(&current->sighand->siglock); 2675 spin_lock_irq(&current->sighand->siglock);
2636 sigorsets(&pending, &current->pending.signal, 2676 sigorsets(set, &current->pending.signal,
2637 &current->signal->shared_pending.signal); 2677 &current->signal->shared_pending.signal);
2638 spin_unlock_irq(&current->sighand->siglock); 2678 spin_unlock_irq(&current->sighand->siglock);
2639 2679
2640 /* Outside the lock because only this thread touches it. */ 2680 /* Outside the lock because only this thread touches it. */
2641 sigandsets(&pending, &current->blocked, &pending); 2681 sigandsets(set, &current->blocked, set);
2642 2682 return 0;
2643 error = -EFAULT;
2644 if (!copy_to_user(set, &pending, sigsetsize))
2645 error = 0;
2646
2647out:
2648 return error;
2649} 2683}
2650 2684
2651/** 2685/**
2652 * sys_rt_sigpending - examine a pending signal that has been raised 2686 * sys_rt_sigpending - examine a pending signal that has been raised
2653 * while blocked 2687 * while blocked
2654 * @set: stores pending signals 2688 * @uset: stores pending signals
2655 * @sigsetsize: size of sigset_t type or larger 2689 * @sigsetsize: size of sigset_t type or larger
2656 */ 2690 */
2657SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, set, size_t, sigsetsize) 2691SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize)
2658{ 2692{
2659 return do_sigpending(set, sigsetsize); 2693 sigset_t set;
2694 int err = do_sigpending(&set, sigsetsize);
2695 if (!err && copy_to_user(uset, &set, sigsetsize))
2696 err = -EFAULT;
2697 return err;
2660} 2698}
2661 2699
2700#ifdef CONFIG_COMPAT
2701COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset,
2702 compat_size_t, sigsetsize)
2703{
2704#ifdef __BIG_ENDIAN
2705 sigset_t set;
2706 int err = do_sigpending(&set, sigsetsize);
2707 if (!err) {
2708 compat_sigset_t set32;
2709 sigset_to_compat(&set32, &set);
2710 /* we can get here only if sigsetsize <= sizeof(set) */
2711 if (copy_to_user(uset, &set32, sigsetsize))
2712 err = -EFAULT;
2713 }
2714 return err;
2715#else
2716 return sys_rt_sigpending((sigset_t __user *)uset, sigsetsize);
2717#endif
2718}
2719#endif
2720
2662#ifndef HAVE_ARCH_COPY_SIGINFO_TO_USER 2721#ifndef HAVE_ARCH_COPY_SIGINFO_TO_USER
2663 2722
2664int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from) 2723int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from)
@@ -2935,6 +2994,23 @@ SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig)
2935 return do_tkill(0, pid, sig); 2994 return do_tkill(0, pid, sig);
2936} 2995}
2937 2996
2997static int do_rt_sigqueueinfo(pid_t pid, int sig, siginfo_t *info)
2998{
2999 /* Not even root can pretend to send signals from the kernel.
3000 * Nor can they impersonate a kill()/tgkill(), which adds source info.
3001 */
3002 if ((info->si_code >= 0 || info->si_code == SI_TKILL) &&
3003 (task_pid_vnr(current) != pid)) {
3004 /* We used to allow any < 0 si_code */
3005 WARN_ON_ONCE(info->si_code < 0);
3006 return -EPERM;
3007 }
3008 info->si_signo = sig;
3009
3010 /* POSIX.1b doesn't mention process groups. */
3011 return kill_proc_info(sig, info, pid);
3012}
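For reference, the legitimate user of this path is sigqueue(3), which the C library implements on top of rt_sigqueueinfo() with a negative si_code (SI_QUEUE), so the forgery check above lets it through. A small hedged sketch (printf in a handler is not async-signal-safe and is used here only for brevity):

#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static void handler(int sig, siginfo_t *info, void *ctx)
{
	printf("sig=%d si_code=%d value=%d\n",
	       sig, info->si_code, info->si_value.sival_int);
}

int main(void)
{
	struct sigaction sa = { 0 };
	union sigval val = { .sival_int = 42 };

	sa.sa_sigaction = handler;
	sa.sa_flags = SA_SIGINFO;
	sigemptyset(&sa.sa_mask);
	sigaction(SIGUSR1, &sa, NULL);

	sigqueue(getpid(), SIGUSR1, val);	/* ends up in do_rt_sigqueueinfo() */
	return 0;
}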
3013
2938/** 3014/**
2939 * sys_rt_sigqueueinfo - send signal information to a signal 3015 * sys_rt_sigqueueinfo - send signal information to a signal
2940 * @pid: the PID of the thread 3016 * @pid: the PID of the thread
@@ -2945,25 +3021,26 @@ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig,
2945 siginfo_t __user *, uinfo) 3021 siginfo_t __user *, uinfo)
2946{ 3022{
2947 siginfo_t info; 3023 siginfo_t info;
2948
2949 if (copy_from_user(&info, uinfo, sizeof(siginfo_t))) 3024 if (copy_from_user(&info, uinfo, sizeof(siginfo_t)))
2950 return -EFAULT; 3025 return -EFAULT;
3026 return do_rt_sigqueueinfo(pid, sig, &info);
3027}
2951 3028
2952 /* Not even root can pretend to send signals from the kernel. 3029#ifdef CONFIG_COMPAT
2953 * Nor can they impersonate a kill()/tgkill(), which adds source info. 3030COMPAT_SYSCALL_DEFINE3(rt_sigqueueinfo,
2954 */ 3031 compat_pid_t, pid,
2955 if (info.si_code >= 0 || info.si_code == SI_TKILL) { 3032 int, sig,
2956 /* We used to allow any < 0 si_code */ 3033 struct compat_siginfo __user *, uinfo)
2957 WARN_ON_ONCE(info.si_code < 0); 3034{
2958 return -EPERM; 3035 siginfo_t info;
2959 } 3036 int ret = copy_siginfo_from_user32(&info, uinfo);
2960 info.si_signo = sig; 3037 if (unlikely(ret))
2961 3038 return ret;
2962 /* POSIX.1b doesn't mention process groups. */ 3039 return do_rt_sigqueueinfo(pid, sig, &info);
2963 return kill_proc_info(sig, &info, pid);
2964} 3040}
3041#endif
2965 3042
2966long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info) 3043static int do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
2967{ 3044{
2968 /* This is only valid for single tasks */ 3045 /* This is only valid for single tasks */
2969 if (pid <= 0 || tgid <= 0) 3046 if (pid <= 0 || tgid <= 0)
@@ -2972,7 +3049,8 @@ long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
2972 /* Not even root can pretend to send signals from the kernel. 3049 /* Not even root can pretend to send signals from the kernel.
2973 * Nor can they impersonate a kill()/tgkill(), which adds source info. 3050 * Nor can they impersonate a kill()/tgkill(), which adds source info.
2974 */ 3051 */
2975 if (info->si_code >= 0 || info->si_code == SI_TKILL) { 3052 if (((info->si_code >= 0 || info->si_code == SI_TKILL)) &&
3053 (task_pid_vnr(current) != pid)) {
2976 /* We used to allow any < 0 si_code */ 3054 /* We used to allow any < 0 si_code */
2977 WARN_ON_ONCE(info->si_code < 0); 3055 WARN_ON_ONCE(info->si_code < 0);
2978 return -EPERM; 3056 return -EPERM;
@@ -2993,6 +3071,21 @@ SYSCALL_DEFINE4(rt_tgsigqueueinfo, pid_t, tgid, pid_t, pid, int, sig,
2993 return do_rt_tgsigqueueinfo(tgid, pid, sig, &info); 3071 return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);
2994} 3072}
2995 3073
3074#ifdef CONFIG_COMPAT
3075COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo,
3076 compat_pid_t, tgid,
3077 compat_pid_t, pid,
3078 int, sig,
3079 struct compat_siginfo __user *, uinfo)
3080{
3081 siginfo_t info;
3082
3083 if (copy_siginfo_from_user32(&info, uinfo))
3084 return -EFAULT;
3085 return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);
3086}
3087#endif
3088
2996int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) 3089int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
2997{ 3090{
2998 struct task_struct *t = current; 3091 struct task_struct *t = current;
@@ -3038,7 +3131,7 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
3038 return 0; 3131 return 0;
3039} 3132}
3040 3133
3041int 3134static int
3042do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long sp) 3135do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long sp)
3043{ 3136{
3044 stack_t oss; 3137 stack_t oss;
@@ -3103,6 +3196,76 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
3103out: 3196out:
3104 return error; 3197 return error;
3105} 3198}
3199SYSCALL_DEFINE2(sigaltstack,const stack_t __user *,uss, stack_t __user *,uoss)
3200{
3201 return do_sigaltstack(uss, uoss, current_user_stack_pointer());
3202}
3203
3204int restore_altstack(const stack_t __user *uss)
3205{
3206 int err = do_sigaltstack(uss, NULL, current_user_stack_pointer());
3207 /* squash all but EFAULT for now */
3208 return err == -EFAULT ? err : 0;
3209}
3210
3211int __save_altstack(stack_t __user *uss, unsigned long sp)
3212{
3213 struct task_struct *t = current;
3214 return __put_user((void __user *)t->sas_ss_sp, &uss->ss_sp) |
3215 __put_user(sas_ss_flags(sp), &uss->ss_flags) |
3216 __put_user(t->sas_ss_size, &uss->ss_size);
3217}
3218
3219#ifdef CONFIG_COMPAT
3220COMPAT_SYSCALL_DEFINE2(sigaltstack,
3221 const compat_stack_t __user *, uss_ptr,
3222 compat_stack_t __user *, uoss_ptr)
3223{
3224 stack_t uss, uoss;
3225 int ret;
3226 mm_segment_t seg;
3227
3228 if (uss_ptr) {
3229 compat_stack_t uss32;
3230
3231 memset(&uss, 0, sizeof(stack_t));
3232 if (copy_from_user(&uss32, uss_ptr, sizeof(compat_stack_t)))
3233 return -EFAULT;
3234 uss.ss_sp = compat_ptr(uss32.ss_sp);
3235 uss.ss_flags = uss32.ss_flags;
3236 uss.ss_size = uss32.ss_size;
3237 }
3238 seg = get_fs();
3239 set_fs(KERNEL_DS);
3240 ret = do_sigaltstack((stack_t __force __user *) (uss_ptr ? &uss : NULL),
3241 (stack_t __force __user *) &uoss,
3242 compat_user_stack_pointer());
3243 set_fs(seg);
3244 if (ret >= 0 && uoss_ptr) {
3245 if (!access_ok(VERIFY_WRITE, uoss_ptr, sizeof(compat_stack_t)) ||
3246 __put_user(ptr_to_compat(uoss.ss_sp), &uoss_ptr->ss_sp) ||
3247 __put_user(uoss.ss_flags, &uoss_ptr->ss_flags) ||
3248 __put_user(uoss.ss_size, &uoss_ptr->ss_size))
3249 ret = -EFAULT;
3250 }
3251 return ret;
3252}
3253
3254int compat_restore_altstack(const compat_stack_t __user *uss)
3255{
3256 int err = compat_sys_sigaltstack(uss, NULL);
3257 /* squash all but -EFAULT for now */
3258 return err == -EFAULT ? err : 0;
3259}
3260
3261int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp)
3262{
3263 struct task_struct *t = current;
3264 return __put_user(ptr_to_compat((void __user *)t->sas_ss_sp), &uss->ss_sp) |
3265 __put_user(sas_ss_flags(sp), &uss->ss_flags) |
3266 __put_user(t->sas_ss_size, &uss->ss_size);
3267}
3268#endif
3106 3269
3107#ifdef __ARCH_WANT_SYS_SIGPENDING 3270#ifdef __ARCH_WANT_SYS_SIGPENDING
3108 3271
@@ -3112,7 +3275,7 @@ out:
3112 */ 3275 */
3113SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) 3276SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set)
3114{ 3277{
3115 return do_sigpending(set, sizeof(*set)); 3278 return sys_rt_sigpending((sigset_t __user *)set, sizeof(old_sigset_t));
3116} 3279}
3117 3280
3118#endif 3281#endif
@@ -3139,7 +3302,6 @@ SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset,
3139 if (nset) { 3302 if (nset) {
3140 if (copy_from_user(&new_set, nset, sizeof(*nset))) 3303 if (copy_from_user(&new_set, nset, sizeof(*nset)))
3141 return -EFAULT; 3304 return -EFAULT;
3142 new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP));
3143 3305
3144 new_blocked = current->blocked; 3306 new_blocked = current->blocked;
3145 3307
@@ -3157,7 +3319,7 @@ SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset,
3157 return -EINVAL; 3319 return -EINVAL;
3158 } 3320 }
3159 3321
3160 __set_current_blocked(&new_blocked); 3322 set_current_blocked(&new_blocked);
3161 } 3323 }
3162 3324
3163 if (oset) { 3325 if (oset) {
@@ -3169,7 +3331,7 @@ SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset,
3169} 3331}
3170#endif /* __ARCH_WANT_SYS_SIGPROCMASK */ 3332#endif /* __ARCH_WANT_SYS_SIGPROCMASK */
3171 3333
3172#ifdef __ARCH_WANT_SYS_RT_SIGACTION 3334#ifndef CONFIG_ODD_RT_SIGACTION
3173/** 3335/**
3174 * sys_rt_sigaction - alter an action taken by a process 3336 * sys_rt_sigaction - alter an action taken by a process
3175 * @sig: signal to be sent 3337 * @sig: signal to be sent
@@ -3203,7 +3365,132 @@ SYSCALL_DEFINE4(rt_sigaction, int, sig,
3203out: 3365out:
3204 return ret; 3366 return ret;
3205} 3367}
3206#endif /* __ARCH_WANT_SYS_RT_SIGACTION */ 3368#ifdef CONFIG_COMPAT
3369COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig,
3370 const struct compat_sigaction __user *, act,
3371 struct compat_sigaction __user *, oact,
3372 compat_size_t, sigsetsize)
3373{
3374 struct k_sigaction new_ka, old_ka;
3375 compat_sigset_t mask;
3376#ifdef __ARCH_HAS_SA_RESTORER
3377 compat_uptr_t restorer;
3378#endif
3379 int ret;
3380
3381 /* XXX: Don't preclude handling different sized sigset_t's. */
3382 if (sigsetsize != sizeof(compat_sigset_t))
3383 return -EINVAL;
3384
3385 if (act) {
3386 compat_uptr_t handler;
3387 ret = get_user(handler, &act->sa_handler);
3388 new_ka.sa.sa_handler = compat_ptr(handler);
3389#ifdef __ARCH_HAS_SA_RESTORER
3390 ret |= get_user(restorer, &act->sa_restorer);
3391 new_ka.sa.sa_restorer = compat_ptr(restorer);
3392#endif
3393 ret |= copy_from_user(&mask, &act->sa_mask, sizeof(mask));
3394 ret |= __get_user(new_ka.sa.sa_flags, &act->sa_flags);
3395 if (ret)
3396 return -EFAULT;
3397 sigset_from_compat(&new_ka.sa.sa_mask, &mask);
3398 }
3399
3400 ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
3401 if (!ret && oact) {
3402 sigset_to_compat(&mask, &old_ka.sa.sa_mask);
3403 ret = put_user(ptr_to_compat(old_ka.sa.sa_handler),
3404 &oact->sa_handler);
3405 ret |= copy_to_user(&oact->sa_mask, &mask, sizeof(mask));
3406 ret |= __put_user(old_ka.sa.sa_flags, &oact->sa_flags);
3407#ifdef __ARCH_HAS_SA_RESTORER
3408 ret |= put_user(ptr_to_compat(old_ka.sa.sa_restorer),
3409 &oact->sa_restorer);
3410#endif
3411 }
3412 return ret;
3413}
3414#endif
3415#endif /* !CONFIG_ODD_RT_SIGACTION */
3416
3417#ifdef CONFIG_OLD_SIGACTION
3418SYSCALL_DEFINE3(sigaction, int, sig,
3419 const struct old_sigaction __user *, act,
3420 struct old_sigaction __user *, oact)
3421{
3422 struct k_sigaction new_ka, old_ka;
3423 int ret;
3424
3425 if (act) {
3426 old_sigset_t mask;
3427 if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
3428 __get_user(new_ka.sa.sa_handler, &act->sa_handler) ||
3429 __get_user(new_ka.sa.sa_restorer, &act->sa_restorer) ||
3430 __get_user(new_ka.sa.sa_flags, &act->sa_flags) ||
3431 __get_user(mask, &act->sa_mask))
3432 return -EFAULT;
3433#ifdef __ARCH_HAS_KA_RESTORER
3434 new_ka.ka_restorer = NULL;
3435#endif
3436 siginitset(&new_ka.sa.sa_mask, mask);
3437 }
3438
3439 ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
3440
3441 if (!ret && oact) {
3442 if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
3443 __put_user(old_ka.sa.sa_handler, &oact->sa_handler) ||
3444 __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer) ||
3445 __put_user(old_ka.sa.sa_flags, &oact->sa_flags) ||
3446 __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask))
3447 return -EFAULT;
3448 }
3449
3450 return ret;
3451}
3452#endif
3453#ifdef CONFIG_COMPAT_OLD_SIGACTION
3454COMPAT_SYSCALL_DEFINE3(sigaction, int, sig,
3455 const struct compat_old_sigaction __user *, act,
3456 struct compat_old_sigaction __user *, oact)
3457{
3458 struct k_sigaction new_ka, old_ka;
3459 int ret;
3460 compat_old_sigset_t mask;
3461 compat_uptr_t handler, restorer;
3462
3463 if (act) {
3464 if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
3465 __get_user(handler, &act->sa_handler) ||
3466 __get_user(restorer, &act->sa_restorer) ||
3467 __get_user(new_ka.sa.sa_flags, &act->sa_flags) ||
3468 __get_user(mask, &act->sa_mask))
3469 return -EFAULT;
3470
3471#ifdef __ARCH_HAS_KA_RESTORER
3472 new_ka.ka_restorer = NULL;
3473#endif
3474 new_ka.sa.sa_handler = compat_ptr(handler);
3475 new_ka.sa.sa_restorer = compat_ptr(restorer);
3476 siginitset(&new_ka.sa.sa_mask, mask);
3477 }
3478
3479 ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
3480
3481 if (!ret && oact) {
3482 if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
3483 __put_user(ptr_to_compat(old_ka.sa.sa_handler),
3484 &oact->sa_handler) ||
3485 __put_user(ptr_to_compat(old_ka.sa.sa_restorer),
3486 &oact->sa_restorer) ||
3487 __put_user(old_ka.sa.sa_flags, &oact->sa_flags) ||
3488 __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask))
3489 return -EFAULT;
3490 }
3491 return ret;
3492}
3493#endif
3207 3494
3208#ifdef __ARCH_WANT_SYS_SGETMASK 3495#ifdef __ARCH_WANT_SYS_SGETMASK
3209 3496
@@ -3221,6 +3508,7 @@ SYSCALL_DEFINE1(ssetmask, int, newmask)
3221 int old = current->blocked.sig[0]; 3508 int old = current->blocked.sig[0];
3222 sigset_t newset; 3509 sigset_t newset;
3223 3510
3511 siginitset(&newset, newmask);
3224 set_current_blocked(&newset); 3512 set_current_blocked(&newset);
3225 3513
3226 return old; 3514 return old;
@@ -3270,7 +3558,6 @@ int sigsuspend(sigset_t *set)
3270 return -ERESTARTNOHAND; 3558 return -ERESTARTNOHAND;
3271} 3559}
3272 3560
3273#ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND
3274/** 3561/**
3275 * sys_rt_sigsuspend - replace the signal mask for a value with the 3562 * sys_rt_sigsuspend - replace the signal mask for a value with the
3276 * @unewset value until a signal is received 3563 * @unewset value until a signal is received
@@ -3289,7 +3576,45 @@ SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize)
3289 return -EFAULT; 3576 return -EFAULT;
3290 return sigsuspend(&newset); 3577 return sigsuspend(&newset);
3291} 3578}
3292#endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */ 3579
3580#ifdef CONFIG_COMPAT
3581COMPAT_SYSCALL_DEFINE2(rt_sigsuspend, compat_sigset_t __user *, unewset, compat_size_t, sigsetsize)
3582{
3583#ifdef __BIG_ENDIAN
3584 sigset_t newset;
3585 compat_sigset_t newset32;
3586
3587 /* XXX: Don't preclude handling different sized sigset_t's. */
3588 if (sigsetsize != sizeof(sigset_t))
3589 return -EINVAL;
3590
3591 if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t)))
3592 return -EFAULT;
3593 sigset_from_compat(&newset, &newset32);
3594 return sigsuspend(&newset);
3595#else
3596 /* on little-endian bitmaps don't care about granularity */
3597 return sys_rt_sigsuspend((sigset_t __user *)unewset, sigsetsize);
3598#endif
3599}
3600#endif
3601
3602#ifdef CONFIG_OLD_SIGSUSPEND
3603SYSCALL_DEFINE1(sigsuspend, old_sigset_t, mask)
3604{
3605 sigset_t blocked;
3606 siginitset(&blocked, mask);
3607 return sigsuspend(&blocked);
3608}
3609#endif
3610#ifdef CONFIG_OLD_SIGSUSPEND3
3611SYSCALL_DEFINE3(sigsuspend, int, unused1, int, unused2, old_sigset_t, mask)
3612{
3613 sigset_t blocked;
3614 siginitset(&blocked, mask);
3615 return sigsuspend(&blocked);
3616}
3617#endif
3293 3618
3294__attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma) 3619__attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma)
3295{ 3620{
diff --git a/kernel/smp.c b/kernel/smp.c
index 29dd40a9f2f4..8e451f3ff51b 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -16,23 +16,14 @@
16#include "smpboot.h" 16#include "smpboot.h"
17 17
18#ifdef CONFIG_USE_GENERIC_SMP_HELPERS 18#ifdef CONFIG_USE_GENERIC_SMP_HELPERS
19static struct {
20 struct list_head queue;
21 raw_spinlock_t lock;
22} call_function __cacheline_aligned_in_smp =
23 {
24 .queue = LIST_HEAD_INIT(call_function.queue),
25 .lock = __RAW_SPIN_LOCK_UNLOCKED(call_function.lock),
26 };
27
28enum { 19enum {
29 CSD_FLAG_LOCK = 0x01, 20 CSD_FLAG_LOCK = 0x01,
30}; 21};
31 22
32struct call_function_data { 23struct call_function_data {
33 struct call_single_data csd; 24 struct call_single_data __percpu *csd;
34 atomic_t refs;
35 cpumask_var_t cpumask; 25 cpumask_var_t cpumask;
26 cpumask_var_t cpumask_ipi;
36}; 27};
37 28
38static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data); 29static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data);
@@ -56,6 +47,14 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
56 if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, 47 if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
57 cpu_to_node(cpu))) 48 cpu_to_node(cpu)))
58 return notifier_from_errno(-ENOMEM); 49 return notifier_from_errno(-ENOMEM);
50 if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL,
51 cpu_to_node(cpu)))
52 return notifier_from_errno(-ENOMEM);
53 cfd->csd = alloc_percpu(struct call_single_data);
54 if (!cfd->csd) {
55 free_cpumask_var(cfd->cpumask);
56 return notifier_from_errno(-ENOMEM);
57 }
59 break; 58 break;
60 59
61#ifdef CONFIG_HOTPLUG_CPU 60#ifdef CONFIG_HOTPLUG_CPU
@@ -65,6 +64,8 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
65 case CPU_DEAD: 64 case CPU_DEAD:
66 case CPU_DEAD_FROZEN: 65 case CPU_DEAD_FROZEN:
67 free_cpumask_var(cfd->cpumask); 66 free_cpumask_var(cfd->cpumask);
67 free_cpumask_var(cfd->cpumask_ipi);
68 free_percpu(cfd->csd);
68 break; 69 break;
69#endif 70#endif
70 }; 71 };
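The CPU_UP_PREPARE branch above now sets up three per-CPU resources — cpumask, cpumask_ipi and a per-cpu call_single_data array — and turns an allocation failure into notifier_from_errno(-ENOMEM). A standalone sketch of that allocate-then-unwind shape, releasing whatever was already obtained before reporting failure; cfd_model and cfd_prepare are hypothetical names, and plain calloc()/free() stand in for the cpumask and percpu allocators.

```c
#include <stdlib.h>
#include <stdio.h>

struct cfd_model {
        void *cpumask;
        void *cpumask_ipi;
        void *csd;
};

/* Allocate each resource in turn; on failure free what was already
 * allocated and report an error to the caller. */
static int cfd_prepare(struct cfd_model *cfd, size_t n)
{
        cfd->cpumask = calloc(1, n);
        if (!cfd->cpumask)
                return -1;
        cfd->cpumask_ipi = calloc(1, n);
        if (!cfd->cpumask_ipi) {
                free(cfd->cpumask);
                return -1;
        }
        cfd->csd = calloc(1, n);
        if (!cfd->csd) {
                free(cfd->cpumask_ipi);
                free(cfd->cpumask);
                return -1;
        }
        return 0;
}

int main(void)
{
        struct cfd_model cfd;

        printf("prepare: %s\n", cfd_prepare(&cfd, 64) ? "failed" : "ok");
        return 0;
}
```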
@@ -166,85 +167,6 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait)
166} 167}
167 168
168/* 169/*
169 * Invoked by arch to handle an IPI for call function. Must be called with
170 * interrupts disabled.
171 */
172void generic_smp_call_function_interrupt(void)
173{
174 struct call_function_data *data;
175 int cpu = smp_processor_id();
176
177 /*
178 * Shouldn't receive this interrupt on a cpu that is not yet online.
179 */
180 WARN_ON_ONCE(!cpu_online(cpu));
181
182 /*
183 * Ensure entry is visible on call_function_queue after we have
184 * entered the IPI. See comment in smp_call_function_many.
185 * If we don't have this, then we may miss an entry on the list
186 * and never get another IPI to process it.
187 */
188 smp_mb();
189
190 /*
191 * It's ok to use list_for_each_rcu() here even though we may
192 * delete 'pos', since list_del_rcu() doesn't clear ->next
193 */
194 list_for_each_entry_rcu(data, &call_function.queue, csd.list) {
195 int refs;
196 smp_call_func_t func;
197
198 /*
199 * Since we walk the list without any locks, we might
200 * see an entry that was completed, removed from the
201 * list and is in the process of being reused.
202 *
203 * We must check that the cpu is in the cpumask before
204 * checking the refs, and both must be set before
205 * executing the callback on this cpu.
206 */
207
208 if (!cpumask_test_cpu(cpu, data->cpumask))
209 continue;
210
211 smp_rmb();
212
213 if (atomic_read(&data->refs) == 0)
214 continue;
215
216 func = data->csd.func; /* save for later warn */
217 func(data->csd.info);
218
219 /*
220 * If the cpu mask is not still set then func enabled
221 * interrupts (BUG), and this cpu took another smp call
222 * function interrupt and executed func(info) twice
223 * on this cpu. That nested execution decremented refs.
224 */
225 if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) {
226 WARN(1, "%pf enabled interrupts and double executed\n", func);
227 continue;
228 }
229
230 refs = atomic_dec_return(&data->refs);
231 WARN_ON(refs < 0);
232
233 if (refs)
234 continue;
235
236 WARN_ON(!cpumask_empty(data->cpumask));
237
238 raw_spin_lock(&call_function.lock);
239 list_del_rcu(&data->csd.list);
240 raw_spin_unlock(&call_function.lock);
241
242 csd_unlock(&data->csd);
243 }
244
245}
246
247/*
248 * Invoked by arch to handle an IPI for call function single. Must be 170 * Invoked by arch to handle an IPI for call function single. Must be
249 * called from the arch with interrupts disabled. 171 * called from the arch with interrupts disabled.
250 */ 172 */
@@ -448,8 +370,7 @@ void smp_call_function_many(const struct cpumask *mask,
448 smp_call_func_t func, void *info, bool wait) 370 smp_call_func_t func, void *info, bool wait)
449{ 371{
450 struct call_function_data *data; 372 struct call_function_data *data;
451 unsigned long flags; 373 int cpu, next_cpu, this_cpu = smp_processor_id();
452 int refs, cpu, next_cpu, this_cpu = smp_processor_id();
453 374
454 /* 375 /*
455 * Can deadlock when called with interrupts disabled. 376 * Can deadlock when called with interrupts disabled.
@@ -481,79 +402,46 @@ void smp_call_function_many(const struct cpumask *mask,
481 } 402 }
482 403
483 data = &__get_cpu_var(cfd_data); 404 data = &__get_cpu_var(cfd_data);
484 csd_lock(&data->csd);
485
486 /* This BUG_ON verifies our reuse assertions and can be removed */
487 BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask));
488 405
489 /*
490 * The global call function queue list add and delete are protected
491 * by a lock, but the list is traversed without any lock, relying
492 * on the rcu list add and delete to allow safe concurrent traversal.
493 * We reuse the call function data without waiting for any grace
494 * period after some other cpu removes it from the global queue.
495 * This means a cpu might find our data block as it is being
496 * filled out.
497 *
498 * We hold off the interrupt handler on the other cpu by
499 * ordering our writes to the cpu mask vs our setting of the
500 * refs counter. We assert only the cpu owning the data block
501 * will set a bit in cpumask, and each bit will only be cleared
502 * by the subject cpu. Each cpu must first find its bit is
503 * set and then check that refs is set indicating the element is
504 * ready to be processed, otherwise it must skip the entry.
505 *
506 * On the previous iteration refs was set to 0 by another cpu.
507 * To avoid the use of transitivity, set the counter to 0 here
508 * so the wmb will pair with the rmb in the interrupt handler.
509 */
510 atomic_set(&data->refs, 0); /* convert 3rd to 1st party write */
511
512 data->csd.func = func;
513 data->csd.info = info;
514
515 /* Ensure 0 refs is visible before mask. Also orders func and info */
516 smp_wmb();
517
518 /* We rely on the "and" being processed before the store */
519 cpumask_and(data->cpumask, mask, cpu_online_mask); 406 cpumask_and(data->cpumask, mask, cpu_online_mask);
520 cpumask_clear_cpu(this_cpu, data->cpumask); 407 cpumask_clear_cpu(this_cpu, data->cpumask);
521 refs = cpumask_weight(data->cpumask);
522 408
523 /* Some callers race with other cpus changing the passed mask */ 409 /* Some callers race with other cpus changing the passed mask */
524 if (unlikely(!refs)) { 410 if (unlikely(!cpumask_weight(data->cpumask)))
525 csd_unlock(&data->csd);
526 return; 411 return;
527 }
528 412
529 raw_spin_lock_irqsave(&call_function.lock, flags);
530 /* 413 /*
531 * Place entry at the _HEAD_ of the list, so that any cpu still 414 * After we put an entry into the list, data->cpumask
532 * observing the entry in generic_smp_call_function_interrupt() 415 * may be cleared again when another CPU sends another IPI for
 533 * will not miss any other list entries: 416 * an SMP function call, so data->cpumask will be zero.
534 */ 417 */
535 list_add_rcu(&data->csd.list, &call_function.queue); 418 cpumask_copy(data->cpumask_ipi, data->cpumask);
536 /*
537 * We rely on the wmb() in list_add_rcu to complete our writes
538 * to the cpumask before this write to refs, which indicates
539 * data is on the list and is ready to be processed.
540 */
541 atomic_set(&data->refs, refs);
542 raw_spin_unlock_irqrestore(&call_function.lock, flags);
543 419
544 /* 420 for_each_cpu(cpu, data->cpumask) {
545 * Make the list addition visible before sending the ipi. 421 struct call_single_data *csd = per_cpu_ptr(data->csd, cpu);
546 * (IPIs must obey or appear to obey normal Linux cache 422 struct call_single_queue *dst =
547 * coherency rules -- see comment in generic_exec_single). 423 &per_cpu(call_single_queue, cpu);
548 */ 424 unsigned long flags;
549 smp_mb(); 425
426 csd_lock(csd);
427 csd->func = func;
428 csd->info = info;
429
430 raw_spin_lock_irqsave(&dst->lock, flags);
431 list_add_tail(&csd->list, &dst->list);
432 raw_spin_unlock_irqrestore(&dst->lock, flags);
433 }
550 434
551 /* Send a message to all CPUs in the map */ 435 /* Send a message to all CPUs in the map */
552 arch_send_call_function_ipi_mask(data->cpumask); 436 arch_send_call_function_ipi_mask(data->cpumask_ipi);
553 437
554 /* Optionally wait for the CPUs to complete */ 438 if (wait) {
555 if (wait) 439 for_each_cpu(cpu, data->cpumask) {
556 csd_lock_wait(&data->csd); 440 struct call_single_data *csd =
441 per_cpu_ptr(data->csd, cpu);
442 csd_lock_wait(csd);
443 }
444 }
557} 445}
558EXPORT_SYMBOL(smp_call_function_many); 446EXPORT_SYMBOL(smp_call_function_many);
559 447
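The rework above removes the shared call_function queue and its reference counting: smp_call_function_many() now fills one call_single_data per destination CPU from a per-cpu array and links it onto that CPU's own call_single_queue, so each CPU only ever walks entries addressed to it. A single-threaded user-space model of that fan-out follows; cpu_model, csd_model and the fixed NR_CPUS array are assumptions of the sketch, not the kernel structures, and all locking and IPIs are left out.

```c
#include <stdio.h>

#define NR_CPUS 4

typedef void (*smp_func_t)(void *info);

struct csd_model {                /* one slot per destination CPU */
        smp_func_t func;
        void *info;
        struct csd_model *next;
};

struct cpu_model {                /* models call_single_queue */
        struct csd_model *head;
};

static struct cpu_model cpus[NR_CPUS];
static struct csd_model csd[NR_CPUS];   /* models the sender's per-cpu csd array */

static void say_hello(void *info)
{
        printf("cpu runs func with info=%s\n", (const char *)info);
}

/* Models smp_call_function_many(): fill one csd per destination and
 * queue it on that destination's own list. */
static void call_function_many_model(smp_func_t func, void *info)
{
        for (int cpu = 1; cpu < NR_CPUS; cpu++) {   /* skip "this" cpu 0 */
                csd[cpu].func = func;
                csd[cpu].info = info;
                csd[cpu].next = cpus[cpu].head;
                cpus[cpu].head = &csd[cpu];
        }
}

/* Models each destination draining its own queue after the IPI. */
static void deliver_ipis(void)
{
        for (int cpu = 1; cpu < NR_CPUS; cpu++)
                for (struct csd_model *c = cpus[cpu].head; c; c = c->next)
                        c->func(c->info);
}

int main(void)
{
        call_function_many_model(say_hello, "payload");
        deliver_ipis();
        return 0;
}
```

In the real code each enqueue is protected by the destination queue's lock, and the IPI mask is sent from cpumask_ipi, a snapshot taken before other senders can clear bits in data->cpumask.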
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index d6c5fc054242..02fc5c933673 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -183,9 +183,20 @@ __smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
183 kfree(td); 183 kfree(td);
184 return PTR_ERR(tsk); 184 return PTR_ERR(tsk);
185 } 185 }
186
187 get_task_struct(tsk); 186 get_task_struct(tsk);
188 *per_cpu_ptr(ht->store, cpu) = tsk; 187 *per_cpu_ptr(ht->store, cpu) = tsk;
188 if (ht->create) {
189 /*
190 * Make sure that the task has actually scheduled out
191 * into park position, before calling the create
192 * callback. At least the migration thread callback
193 * requires that the task is off the runqueue.
194 */
195 if (!wait_task_inactive(tsk, TASK_PARKED))
196 WARN_ON(1);
197 else
198 ht->create(cpu);
199 }
189 return 0; 200 return 0;
190} 201}
191 202
@@ -208,6 +219,8 @@ static void smpboot_unpark_thread(struct smp_hotplug_thread *ht, unsigned int cp
208{ 219{
209 struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); 220 struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
210 221
222 if (ht->pre_unpark)
223 ht->pre_unpark(cpu);
211 kthread_unpark(tsk); 224 kthread_unpark(tsk);
212} 225}
213 226
@@ -225,7 +238,7 @@ static void smpboot_park_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
225{ 238{
226 struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); 239 struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
227 240
228 if (tsk) 241 if (tsk && !ht->selfparking)
229 kthread_park(tsk); 242 kthread_park(tsk);
230} 243}
231 244
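__smpboot_create_thread() above now waits for the new per-cpu thread to reach the parked state (wait_task_inactive(tsk, TASK_PARKED)) before invoking the ->create() callback, because callbacks such as the migration thread's need the task off the runqueue. A small pthread sketch of the same ordering — wait for the worker to report "parked", then run the callback; the names and the condition-variable handshake are illustrative, not the kernel mechanism.

```c
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
static int parked;

static void *worker(void *arg)
{
        (void)arg;
        pthread_mutex_lock(&lock);
        parked = 1;                     /* models reaching TASK_PARKED */
        pthread_cond_signal(&cond);
        pthread_mutex_unlock(&lock);
        /* a real worker would now sleep until unparked; omitted here */
        return NULL;
}

static void create_callback(void)
{
        printf("create callback runs only after the worker parked\n");
}

int main(void)
{
        pthread_t tid;

        pthread_create(&tid, NULL, worker, NULL);

        /* models wait_task_inactive(): do not call ->create() early */
        pthread_mutex_lock(&lock);
        while (!parked)
                pthread_cond_wait(&cond, &lock);
        pthread_mutex_unlock(&lock);

        create_callback();
        pthread_join(tid, NULL);
        return 0;
}
```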
diff --git a/kernel/softirq.c b/kernel/softirq.c
index cc96bdc0c2c9..14d7758074aa 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -195,21 +195,21 @@ void local_bh_enable_ip(unsigned long ip)
195EXPORT_SYMBOL(local_bh_enable_ip); 195EXPORT_SYMBOL(local_bh_enable_ip);
196 196
197/* 197/*
198 * We restart softirq processing MAX_SOFTIRQ_RESTART times, 198 * We restart softirq processing for at most 2 ms,
199 * and we fall back to softirqd after that. 199 * and if need_resched() is not set.
200 * 200 *
201 * This number has been established via experimentation. 201 * These limits have been established via experimentation.
 202 * The two things to balance are latency and fairness - 202 * The two things to balance are latency and fairness -
203 * we want to handle softirqs as soon as possible, but they 203 * we want to handle softirqs as soon as possible, but they
204 * should not be able to lock up the box. 204 * should not be able to lock up the box.
205 */ 205 */
206#define MAX_SOFTIRQ_RESTART 10 206#define MAX_SOFTIRQ_TIME msecs_to_jiffies(2)
207 207
208asmlinkage void __do_softirq(void) 208asmlinkage void __do_softirq(void)
209{ 209{
210 struct softirq_action *h; 210 struct softirq_action *h;
211 __u32 pending; 211 __u32 pending;
212 int max_restart = MAX_SOFTIRQ_RESTART; 212 unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
213 int cpu; 213 int cpu;
214 unsigned long old_flags = current->flags; 214 unsigned long old_flags = current->flags;
215 215
@@ -221,7 +221,7 @@ asmlinkage void __do_softirq(void)
221 current->flags &= ~PF_MEMALLOC; 221 current->flags &= ~PF_MEMALLOC;
222 222
223 pending = local_softirq_pending(); 223 pending = local_softirq_pending();
224 vtime_account(current); 224 account_irq_enter_time(current);
225 225
226 __local_bh_disable((unsigned long)__builtin_return_address(0), 226 __local_bh_disable((unsigned long)__builtin_return_address(0),
227 SOFTIRQ_OFFSET); 227 SOFTIRQ_OFFSET);
@@ -264,15 +264,16 @@ restart:
264 local_irq_disable(); 264 local_irq_disable();
265 265
266 pending = local_softirq_pending(); 266 pending = local_softirq_pending();
267 if (pending && --max_restart) 267 if (pending) {
268 goto restart; 268 if (time_before(jiffies, end) && !need_resched())
269 goto restart;
269 270
270 if (pending)
271 wakeup_softirqd(); 271 wakeup_softirqd();
272 }
272 273
273 lockdep_softirq_exit(); 274 lockdep_softirq_exit();
274 275
275 vtime_account(current); 276 account_irq_exit_time(current);
276 __local_bh_enable(SOFTIRQ_OFFSET); 277 __local_bh_enable(SOFTIRQ_OFFSET);
277 tsk_restore_flags(current, old_flags, PF_MEMALLOC); 278 tsk_restore_flags(current, old_flags, PF_MEMALLOC);
278} 279}
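__do_softirq() above switches from a fixed restart count to a wall-clock budget: it keeps looping while work is pending, the 2 ms MAX_SOFTIRQ_TIME deadline has not passed and need_resched() is false, and otherwise wakes ksoftirqd. A user-space model of that loop, with CLOCK_MONOTONIC standing in for jiffies and stub variables for the pending/need_resched state:

```c
#include <stdio.h>
#include <time.h>

static long long now_ns(void)
{
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec;
}

static int pending = 1000000;        /* models local_softirq_pending() */
static int need_resched_flag;        /* models need_resched() */

static void handle_one_batch(void)   /* models one pass over pending softirqs */
{
        for (int i = 0; i < 1000 && pending; i++)
                pending--;
}

int main(void)
{
        const long long budget_ns = 2 * 1000000LL;  /* MAX_SOFTIRQ_TIME = 2 ms */
        long long end = now_ns() + budget_ns;
        int deferred = 0;

restart:
        handle_one_batch();
        if (pending) {
                if (now_ns() < end && !need_resched_flag)
                        goto restart;
                deferred = 1;        /* models wakeup_softirqd() */
        }
        printf("pending left: %d, deferred to softirqd: %d\n",
               pending, deferred);
        return 0;
}
```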
@@ -322,18 +323,10 @@ void irq_enter(void)
322 323
323static inline void invoke_softirq(void) 324static inline void invoke_softirq(void)
324{ 325{
325 if (!force_irqthreads) { 326 if (!force_irqthreads)
326#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
327 __do_softirq(); 327 __do_softirq();
328#else 328 else
329 do_softirq();
330#endif
331 } else {
332 __local_bh_disable((unsigned long)__builtin_return_address(0),
333 SOFTIRQ_OFFSET);
334 wakeup_softirqd(); 329 wakeup_softirqd();
335 __local_bh_enable(SOFTIRQ_OFFSET);
336 }
337} 330}
338 331
339/* 332/*
@@ -341,9 +334,15 @@ static inline void invoke_softirq(void)
341 */ 334 */
342void irq_exit(void) 335void irq_exit(void)
343{ 336{
344 vtime_account(current); 337#ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED
338 local_irq_disable();
339#else
340 WARN_ON_ONCE(!irqs_disabled());
341#endif
342
343 account_irq_exit_time(current);
345 trace_hardirq_exit(); 344 trace_hardirq_exit();
346 sub_preempt_count(IRQ_EXIT_OFFSET); 345 sub_preempt_count(HARDIRQ_OFFSET);
347 if (!in_interrupt() && local_softirq_pending()) 346 if (!in_interrupt() && local_softirq_pending())
348 invoke_softirq(); 347 invoke_softirq();
349 348
@@ -353,7 +352,6 @@ void irq_exit(void)
353 tick_nohz_irq_exit(); 352 tick_nohz_irq_exit();
354#endif 353#endif
355 rcu_irq_exit(); 354 rcu_irq_exit();
356 sched_preempt_enable_no_resched();
357} 355}
358 356
359/* 357/*
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 97c465ebd844..01d5ccb8bfe3 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -16,8 +16,10 @@
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 * 17 *
18 * Copyright (C) IBM Corporation, 2006 18 * Copyright (C) IBM Corporation, 2006
19 * Copyright (C) Fujitsu, 2012
19 * 20 *
20 * Author: Paul McKenney <paulmck@us.ibm.com> 21 * Author: Paul McKenney <paulmck@us.ibm.com>
22 * Lai Jiangshan <laijs@cn.fujitsu.com>
21 * 23 *
22 * For detailed explanation of Read-Copy Update mechanism see - 24 * For detailed explanation of Read-Copy Update mechanism see -
23 * Documentation/RCU/ *.txt 25 * Documentation/RCU/ *.txt
@@ -34,6 +36,10 @@
34#include <linux/delay.h> 36#include <linux/delay.h>
35#include <linux/srcu.h> 37#include <linux/srcu.h>
36 38
39#include <trace/events/rcu.h>
40
41#include "rcu.h"
42
37/* 43/*
38 * Initialize an rcu_batch structure to empty. 44 * Initialize an rcu_batch structure to empty.
39 */ 45 */
@@ -92,9 +98,6 @@ static inline void rcu_batch_move(struct rcu_batch *to, struct rcu_batch *from)
92 } 98 }
93} 99}
94 100
95/* single-thread state-machine */
96static void process_srcu(struct work_struct *work);
97
98static int init_srcu_struct_fields(struct srcu_struct *sp) 101static int init_srcu_struct_fields(struct srcu_struct *sp)
99{ 102{
100 sp->completed = 0; 103 sp->completed = 0;
@@ -279,12 +282,8 @@ static int srcu_readers_active(struct srcu_struct *sp)
279 */ 282 */
280void cleanup_srcu_struct(struct srcu_struct *sp) 283void cleanup_srcu_struct(struct srcu_struct *sp)
281{ 284{
282 int sum; 285 if (WARN_ON(srcu_readers_active(sp)))
283 286 return; /* Leakage unless caller handles error. */
284 sum = srcu_readers_active(sp);
285 WARN_ON(sum); /* Leakage unless caller handles error. */
286 if (sum != 0)
287 return;
288 free_percpu(sp->per_cpu_ref); 287 free_percpu(sp->per_cpu_ref);
289 sp->per_cpu_ref = NULL; 288 sp->per_cpu_ref = NULL;
290} 289}
@@ -299,9 +298,8 @@ int __srcu_read_lock(struct srcu_struct *sp)
299{ 298{
300 int idx; 299 int idx;
301 300
301 idx = ACCESS_ONCE(sp->completed) & 0x1;
302 preempt_disable(); 302 preempt_disable();
303 idx = rcu_dereference_index_check(sp->completed,
304 rcu_read_lock_sched_held()) & 0x1;
305 ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) += 1; 303 ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) += 1;
306 smp_mb(); /* B */ /* Avoid leaking the critical section. */ 304 smp_mb(); /* B */ /* Avoid leaking the critical section. */
307 ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->seq[idx]) += 1; 305 ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->seq[idx]) += 1;
@@ -318,10 +316,8 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock);
318 */ 316 */
319void __srcu_read_unlock(struct srcu_struct *sp, int idx) 317void __srcu_read_unlock(struct srcu_struct *sp, int idx)
320{ 318{
321 preempt_disable();
322 smp_mb(); /* C */ /* Avoid leaking the critical section. */ 319 smp_mb(); /* C */ /* Avoid leaking the critical section. */
323 ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) -= 1; 320 this_cpu_dec(sp->per_cpu_ref->c[idx]);
324 preempt_enable();
325} 321}
326EXPORT_SYMBOL_GPL(__srcu_read_unlock); 322EXPORT_SYMBOL_GPL(__srcu_read_unlock);
327 323
@@ -420,6 +416,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
420 !lock_is_held(&rcu_sched_lock_map), 416 !lock_is_held(&rcu_sched_lock_map),
421 "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section"); 417 "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section");
422 418
419 might_sleep();
423 init_completion(&rcu.completion); 420 init_completion(&rcu.completion);
424 421
425 head->next = NULL; 422 head->next = NULL;
@@ -452,10 +449,12 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
452 * synchronize_srcu - wait for prior SRCU read-side critical-section completion 449 * synchronize_srcu - wait for prior SRCU read-side critical-section completion
453 * @sp: srcu_struct with which to synchronize. 450 * @sp: srcu_struct with which to synchronize.
454 * 451 *
 455 * Flip the completed counter, and wait for the old count to drain to zero. 452 * Wait for the counts of both indexes to drain to zero. To avoid
 456 * As with classic RCU, the updater must use some separate means of 453 * possible starvation of synchronize_srcu(), it first waits for the
 457 * synchronizing concurrent updates. Can block; must be called from 454 * count of index ((->completed & 1) ^ 1) to drain to zero,
 458 * process context. 455 * and then flips ->completed and waits for the count of the other index.
456 *
457 * Can block; must be called from process context.
459 * 458 *
460 * Note that it is illegal to call synchronize_srcu() from the corresponding 459 * Note that it is illegal to call synchronize_srcu() from the corresponding
461 * SRCU read-side critical section; doing so will result in deadlock. 460 * SRCU read-side critical section; doing so will result in deadlock.
@@ -464,7 +463,9 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
464 */ 463 */
465void synchronize_srcu(struct srcu_struct *sp) 464void synchronize_srcu(struct srcu_struct *sp)
466{ 465{
467 __synchronize_srcu(sp, SYNCHRONIZE_SRCU_TRYCOUNT); 466 __synchronize_srcu(sp, rcu_expedited
467 ? SYNCHRONIZE_SRCU_EXP_TRYCOUNT
468 : SYNCHRONIZE_SRCU_TRYCOUNT);
468} 469}
469EXPORT_SYMBOL_GPL(synchronize_srcu); 470EXPORT_SYMBOL_GPL(synchronize_srcu);
470 471
@@ -475,12 +476,11 @@ EXPORT_SYMBOL_GPL(synchronize_srcu);
475 * Wait for an SRCU grace period to elapse, but be more aggressive about 476 * Wait for an SRCU grace period to elapse, but be more aggressive about
476 * spinning rather than blocking when waiting. 477 * spinning rather than blocking when waiting.
477 * 478 *
478 * Note that it is illegal to call this function while holding any lock 479 * Note that it is also illegal to call synchronize_srcu_expedited()
479 * that is acquired by a CPU-hotplug notifier. It is also illegal to call 480 * from the corresponding SRCU read-side critical section;
480 * synchronize_srcu_expedited() from the corresponding SRCU read-side 481 * doing so will result in deadlock. However, it is perfectly legal
481 * critical section; doing so will result in deadlock. However, it is 482 * to call synchronize_srcu_expedited() on one srcu_struct from some
482 * perfectly legal to call synchronize_srcu_expedited() on one srcu_struct 483 * other srcu_struct's read-side critical section, as long as
483 * from some other srcu_struct's read-side critical section, as long as
484 * the resulting graph of srcu_structs is acyclic. 484 * the resulting graph of srcu_structs is acyclic.
485 */ 485 */
486void synchronize_srcu_expedited(struct srcu_struct *sp) 486void synchronize_srcu_expedited(struct srcu_struct *sp)
@@ -637,7 +637,7 @@ static void srcu_reschedule(struct srcu_struct *sp)
637/* 637/*
638 * This is the work-queue function that handles SRCU grace periods. 638 * This is the work-queue function that handles SRCU grace periods.
639 */ 639 */
640static void process_srcu(struct work_struct *work) 640void process_srcu(struct work_struct *work)
641{ 641{
642 struct srcu_struct *sp; 642 struct srcu_struct *sp;
643 643
@@ -648,3 +648,4 @@ static void process_srcu(struct work_struct *work)
648 srcu_invoke_callbacks(sp); 648 srcu_invoke_callbacks(sp);
649 srcu_reschedule(sp); 649 srcu_reschedule(sp);
650} 650}
651EXPORT_SYMBOL_GPL(process_srcu);
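The SRCU changes above keep the read-side fast path tiny: __srcu_read_lock() samples ->completed, uses its low bit to pick one of two counters and increments it, and __srcu_read_unlock() now just does this_cpu_dec() without disabling preemption. A single-threaded model of the two-index counting and the grace-period flip; srcu_model collapses the per-CPU counters into plain longs and omits the seq[] counters and memory barriers.

```c
#include <stdio.h>

struct srcu_model {
        unsigned long completed;  /* grace-period counter; low bit selects index */
        long c[2];                /* readers counted against each index */
};

static int read_lock_model(struct srcu_model *sp)
{
        int idx = sp->completed & 0x1;

        sp->c[idx]++;
        return idx;               /* reader must pass the same idx to unlock */
}

static void read_unlock_model(struct srcu_model *sp, int idx)
{
        sp->c[idx]--;
}

/* Models the flip step of a grace period: new readers start charging the
 * other index, and the updater then waits for the old index to drain. */
static void flip_model(struct srcu_model *sp)
{
        sp->completed++;
}

int main(void)
{
        struct srcu_model sp = { 0, { 0, 0 } };
        int idx;

        idx = read_lock_model(&sp);  /* reader arrives on index 0 */
        flip_model(&sp);             /* updater flips; new readers use index 1 */
        printf("old-index readers still out: %ld\n", sp.c[idx]);
        read_unlock_model(&sp, idx);
        printf("old-index readers after unlock: %ld\n", sp.c[idx]);
        return 0;
}
```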
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 2f194e965715..c09f2955ae30 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -18,7 +18,7 @@
18#include <linux/stop_machine.h> 18#include <linux/stop_machine.h>
19#include <linux/interrupt.h> 19#include <linux/interrupt.h>
20#include <linux/kallsyms.h> 20#include <linux/kallsyms.h>
21 21#include <linux/smpboot.h>
22#include <linux/atomic.h> 22#include <linux/atomic.h>
23 23
24/* 24/*
@@ -37,10 +37,10 @@ struct cpu_stopper {
37 spinlock_t lock; 37 spinlock_t lock;
38 bool enabled; /* is this stopper enabled? */ 38 bool enabled; /* is this stopper enabled? */
39 struct list_head works; /* list of pending works */ 39 struct list_head works; /* list of pending works */
40 struct task_struct *thread; /* stopper thread */
41}; 40};
42 41
43static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); 42static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
43static DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task);
44static bool stop_machine_initialized = false; 44static bool stop_machine_initialized = false;
45 45
46static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) 46static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
@@ -62,16 +62,18 @@ static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed)
62} 62}
63 63
64/* queue @work to @stopper. if offline, @work is completed immediately */ 64/* queue @work to @stopper. if offline, @work is completed immediately */
65static void cpu_stop_queue_work(struct cpu_stopper *stopper, 65static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
66 struct cpu_stop_work *work)
67{ 66{
67 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
68 struct task_struct *p = per_cpu(cpu_stopper_task, cpu);
69
68 unsigned long flags; 70 unsigned long flags;
69 71
70 spin_lock_irqsave(&stopper->lock, flags); 72 spin_lock_irqsave(&stopper->lock, flags);
71 73
72 if (stopper->enabled) { 74 if (stopper->enabled) {
73 list_add_tail(&work->list, &stopper->works); 75 list_add_tail(&work->list, &stopper->works);
74 wake_up_process(stopper->thread); 76 wake_up_process(p);
75 } else 77 } else
76 cpu_stop_signal_done(work->done, false); 78 cpu_stop_signal_done(work->done, false);
77 79
@@ -108,7 +110,7 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
108 struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done }; 110 struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done };
109 111
110 cpu_stop_init_done(&done, 1); 112 cpu_stop_init_done(&done, 1);
111 cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), &work); 113 cpu_stop_queue_work(cpu, &work);
112 wait_for_completion(&done.completion); 114 wait_for_completion(&done.completion);
113 return done.executed ? done.ret : -ENOENT; 115 return done.executed ? done.ret : -ENOENT;
114} 116}
@@ -130,7 +132,7 @@ void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
130 struct cpu_stop_work *work_buf) 132 struct cpu_stop_work *work_buf)
131{ 133{
132 *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, }; 134 *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, };
133 cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), work_buf); 135 cpu_stop_queue_work(cpu, work_buf);
134} 136}
135 137
136/* static data for stop_cpus */ 138/* static data for stop_cpus */
@@ -159,8 +161,7 @@ static void queue_stop_cpus_work(const struct cpumask *cpumask,
159 */ 161 */
160 preempt_disable(); 162 preempt_disable();
161 for_each_cpu(cpu, cpumask) 163 for_each_cpu(cpu, cpumask)
162 cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), 164 cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu));
163 &per_cpu(stop_cpus_work, cpu));
164 preempt_enable(); 165 preempt_enable();
165} 166}
166 167
@@ -244,20 +245,25 @@ int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
244 return ret; 245 return ret;
245} 246}
246 247
247static int cpu_stopper_thread(void *data) 248static int cpu_stop_should_run(unsigned int cpu)
249{
250 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
251 unsigned long flags;
252 int run;
253
254 spin_lock_irqsave(&stopper->lock, flags);
255 run = !list_empty(&stopper->works);
256 spin_unlock_irqrestore(&stopper->lock, flags);
257 return run;
258}
259
260static void cpu_stopper_thread(unsigned int cpu)
248{ 261{
249 struct cpu_stopper *stopper = data; 262 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
250 struct cpu_stop_work *work; 263 struct cpu_stop_work *work;
251 int ret; 264 int ret;
252 265
253repeat: 266repeat:
254 set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */
255
256 if (kthread_should_stop()) {
257 __set_current_state(TASK_RUNNING);
258 return 0;
259 }
260
261 work = NULL; 267 work = NULL;
262 spin_lock_irq(&stopper->lock); 268 spin_lock_irq(&stopper->lock);
263 if (!list_empty(&stopper->works)) { 269 if (!list_empty(&stopper->works)) {
@@ -273,8 +279,6 @@ repeat:
273 struct cpu_stop_done *done = work->done; 279 struct cpu_stop_done *done = work->done;
274 char ksym_buf[KSYM_NAME_LEN] __maybe_unused; 280 char ksym_buf[KSYM_NAME_LEN] __maybe_unused;
275 281
276 __set_current_state(TASK_RUNNING);
277
278 /* cpu stop callbacks are not allowed to sleep */ 282 /* cpu stop callbacks are not allowed to sleep */
279 preempt_disable(); 283 preempt_disable();
280 284
@@ -290,88 +294,55 @@ repeat:
290 ksym_buf), arg); 294 ksym_buf), arg);
291 295
292 cpu_stop_signal_done(done, true); 296 cpu_stop_signal_done(done, true);
293 } else 297 goto repeat;
294 schedule(); 298 }
295
296 goto repeat;
297} 299}
298 300
299extern void sched_set_stop_task(int cpu, struct task_struct *stop); 301extern void sched_set_stop_task(int cpu, struct task_struct *stop);
300 302
301/* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */ 303static void cpu_stop_create(unsigned int cpu)
302static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, 304{
303 unsigned long action, void *hcpu) 305 sched_set_stop_task(cpu, per_cpu(cpu_stopper_task, cpu));
306}
307
308static void cpu_stop_park(unsigned int cpu)
304{ 309{
305 unsigned int cpu = (unsigned long)hcpu;
306 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); 310 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
307 struct task_struct *p; 311 struct cpu_stop_work *work;
308 312 unsigned long flags;
309 switch (action & ~CPU_TASKS_FROZEN) {
310 case CPU_UP_PREPARE:
311 BUG_ON(stopper->thread || stopper->enabled ||
312 !list_empty(&stopper->works));
313 p = kthread_create_on_node(cpu_stopper_thread,
314 stopper,
315 cpu_to_node(cpu),
316 "migration/%d", cpu);
317 if (IS_ERR(p))
318 return notifier_from_errno(PTR_ERR(p));
319 get_task_struct(p);
320 kthread_bind(p, cpu);
321 sched_set_stop_task(cpu, p);
322 stopper->thread = p;
323 break;
324
325 case CPU_ONLINE:
326 /* strictly unnecessary, as first user will wake it */
327 wake_up_process(stopper->thread);
328 /* mark enabled */
329 spin_lock_irq(&stopper->lock);
330 stopper->enabled = true;
331 spin_unlock_irq(&stopper->lock);
332 break;
333
334#ifdef CONFIG_HOTPLUG_CPU
335 case CPU_UP_CANCELED:
336 case CPU_POST_DEAD:
337 {
338 struct cpu_stop_work *work;
339
340 sched_set_stop_task(cpu, NULL);
341 /* kill the stopper */
342 kthread_stop(stopper->thread);
343 /* drain remaining works */
344 spin_lock_irq(&stopper->lock);
345 list_for_each_entry(work, &stopper->works, list)
346 cpu_stop_signal_done(work->done, false);
347 stopper->enabled = false;
348 spin_unlock_irq(&stopper->lock);
349 /* release the stopper */
350 put_task_struct(stopper->thread);
351 stopper->thread = NULL;
352 break;
353 }
354#endif
355 }
356 313
357 return NOTIFY_OK; 314 /* drain remaining works */
315 spin_lock_irqsave(&stopper->lock, flags);
316 list_for_each_entry(work, &stopper->works, list)
317 cpu_stop_signal_done(work->done, false);
318 stopper->enabled = false;
319 spin_unlock_irqrestore(&stopper->lock, flags);
358} 320}
359 321
360/* 322static void cpu_stop_unpark(unsigned int cpu)
361 * Give it a higher priority so that cpu stopper is available to other 323{
362 * cpu notifiers. It currently shares the same priority as sched 324 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
363 * migration_notifier. 325
364 */ 326 spin_lock_irq(&stopper->lock);
365static struct notifier_block __cpuinitdata cpu_stop_cpu_notifier = { 327 stopper->enabled = true;
366 .notifier_call = cpu_stop_cpu_callback, 328 spin_unlock_irq(&stopper->lock);
367 .priority = 10, 329}
330
331static struct smp_hotplug_thread cpu_stop_threads = {
332 .store = &cpu_stopper_task,
333 .thread_should_run = cpu_stop_should_run,
334 .thread_fn = cpu_stopper_thread,
335 .thread_comm = "migration/%u",
336 .create = cpu_stop_create,
337 .setup = cpu_stop_unpark,
338 .park = cpu_stop_park,
339 .pre_unpark = cpu_stop_unpark,
340 .selfparking = true,
368}; 341};
369 342
370static int __init cpu_stop_init(void) 343static int __init cpu_stop_init(void)
371{ 344{
372 void *bcpu = (void *)(long)smp_processor_id();
373 unsigned int cpu; 345 unsigned int cpu;
374 int err;
375 346
376 for_each_possible_cpu(cpu) { 347 for_each_possible_cpu(cpu) {
377 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); 348 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
@@ -380,15 +351,8 @@ static int __init cpu_stop_init(void)
380 INIT_LIST_HEAD(&stopper->works); 351 INIT_LIST_HEAD(&stopper->works);
381 } 352 }
382 353
383 /* start one for the boot cpu */ 354 BUG_ON(smpboot_register_percpu_thread(&cpu_stop_threads));
384 err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE,
385 bcpu);
386 BUG_ON(err != NOTIFY_OK);
387 cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu);
388 register_cpu_notifier(&cpu_stop_cpu_notifier);
389
390 stop_machine_initialized = true; 355 stop_machine_initialized = true;
391
392 return 0; 356 return 0;
393} 357}
394early_initcall(cpu_stop_init); 358early_initcall(cpu_stop_init);
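stop_machine above hands its per-cpu thread to the smpboot infrastructure: cpu_stop_should_run() reports whether the work list is non-empty and cpu_stopper_thread() drains it, which replaces the hand-rolled TASK_INTERRUPTIBLE/kthread_should_stop() loop and the CPU-hotplug notifier. A user-space model of that should_run/thread_fn split, driven by a plain loop in place of the smpboot thread; stopper_model, work_model and the driver loop are assumptions of the sketch.

```c
#include <stdio.h>

struct work_model {
        int (*fn)(void *arg);
        void *arg;
        struct work_model *next;
};

struct stopper_model {
        struct work_model *works;         /* models cpu_stopper.works */
};

static int should_run(struct stopper_model *s)   /* cpu_stop_should_run() */
{
        return s->works != NULL;
}

static void thread_fn(struct stopper_model *s)   /* cpu_stopper_thread() */
{
        while (s->works) {
                struct work_model *w = s->works;

                s->works = w->next;
                printf("work returned %d\n", w->fn(w->arg));
        }
}

static int say(void *arg)
{
        printf("running %s\n", (const char *)arg);
        return 0;
}

int main(void)
{
        struct work_model w2 = { say, "second", NULL };
        struct work_model w1 = { say, "first", &w2 };
        struct stopper_model s = { &w1 };

        /* Models the smpboot loop: run thread_fn only when there is work. */
        while (should_run(&s))
                thread_fn(&s);
        return 0;
}
```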
diff --git a/kernel/sys.c b/kernel/sys.c
index e6e0ece5f6a0..0da73cf73e60 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -47,6 +47,7 @@
47#include <linux/syscalls.h> 47#include <linux/syscalls.h>
48#include <linux/kprobes.h> 48#include <linux/kprobes.h>
49#include <linux/user_namespace.h> 49#include <linux/user_namespace.h>
50#include <linux/binfmts.h>
50 51
51#include <linux/kmsg_dump.h> 52#include <linux/kmsg_dump.h>
52/* Move somewhere else to avoid recompiling? */ 53/* Move somewhere else to avoid recompiling? */
@@ -323,7 +324,6 @@ void kernel_restart_prepare(char *cmd)
323 system_state = SYSTEM_RESTART; 324 system_state = SYSTEM_RESTART;
324 usermodehelper_disable(); 325 usermodehelper_disable();
325 device_shutdown(); 326 device_shutdown();
326 syscore_shutdown();
327} 327}
328 328
329/** 329/**
@@ -369,6 +369,7 @@ void kernel_restart(char *cmd)
369{ 369{
370 kernel_restart_prepare(cmd); 370 kernel_restart_prepare(cmd);
371 disable_nonboot_cpus(); 371 disable_nonboot_cpus();
372 syscore_shutdown();
372 if (!cmd) 373 if (!cmd)
373 printk(KERN_EMERG "Restarting system.\n"); 374 printk(KERN_EMERG "Restarting system.\n");
374 else 375 else
@@ -394,6 +395,7 @@ static void kernel_shutdown_prepare(enum system_states state)
394void kernel_halt(void) 395void kernel_halt(void)
395{ 396{
396 kernel_shutdown_prepare(SYSTEM_HALT); 397 kernel_shutdown_prepare(SYSTEM_HALT);
398 disable_nonboot_cpus();
397 syscore_shutdown(); 399 syscore_shutdown();
398 printk(KERN_EMERG "System halted.\n"); 400 printk(KERN_EMERG "System halted.\n");
399 kmsg_dump(KMSG_DUMP_HALT); 401 kmsg_dump(KMSG_DUMP_HALT);
@@ -433,11 +435,12 @@ static DEFINE_MUTEX(reboot_mutex);
433SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, 435SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
434 void __user *, arg) 436 void __user *, arg)
435{ 437{
438 struct pid_namespace *pid_ns = task_active_pid_ns(current);
436 char buffer[256]; 439 char buffer[256];
437 int ret = 0; 440 int ret = 0;
438 441
439 /* We only trust the superuser with rebooting the system. */ 442 /* We only trust the superuser with rebooting the system. */
440 if (!capable(CAP_SYS_BOOT)) 443 if (!ns_capable(pid_ns->user_ns, CAP_SYS_BOOT))
441 return -EPERM; 444 return -EPERM;
442 445
443 /* For safety, we require "magic" arguments. */ 446 /* For safety, we require "magic" arguments. */
@@ -453,7 +456,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
453 * pid_namespace, the command is handled by reboot_pid_ns() which will 456 * pid_namespace, the command is handled by reboot_pid_ns() which will
454 * call do_exit(). 457 * call do_exit().
455 */ 458 */
456 ret = reboot_pid_ns(task_active_pid_ns(current), cmd); 459 ret = reboot_pid_ns(pid_ns, cmd);
457 if (ret) 460 if (ret)
458 return ret; 461 return ret;
459 462
@@ -1046,7 +1049,7 @@ void do_sys_times(struct tms *tms)
1046 cputime_t tgutime, tgstime, cutime, cstime; 1049 cputime_t tgutime, tgstime, cutime, cstime;
1047 1050
1048 spin_lock_irq(&current->sighand->siglock); 1051 spin_lock_irq(&current->sighand->siglock);
1049 thread_group_times(current, &tgutime, &tgstime); 1052 thread_group_cputime_adjusted(current, &tgutime, &tgstime);
1050 cutime = current->signal->cutime; 1053 cutime = current->signal->cutime;
1051 cstime = current->signal->cstime; 1054 cstime = current->signal->cstime;
1052 spin_unlock_irq(&current->sighand->siglock); 1055 spin_unlock_irq(&current->sighand->siglock);
@@ -1704,7 +1707,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1704 utime = stime = 0; 1707 utime = stime = 0;
1705 1708
1706 if (who == RUSAGE_THREAD) { 1709 if (who == RUSAGE_THREAD) {
1707 task_times(current, &utime, &stime); 1710 task_cputime_adjusted(current, &utime, &stime);
1708 accumulate_thread_rusage(p, r); 1711 accumulate_thread_rusage(p, r);
1709 maxrss = p->signal->maxrss; 1712 maxrss = p->signal->maxrss;
1710 goto out; 1713 goto out;
@@ -1730,7 +1733,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1730 break; 1733 break;
1731 1734
1732 case RUSAGE_SELF: 1735 case RUSAGE_SELF:
1733 thread_group_times(p, &tgutime, &tgstime); 1736 thread_group_cputime_adjusted(p, &tgutime, &tgstime);
1734 utime += tgutime; 1737 utime += tgutime;
1735 stime += tgstime; 1738 stime += tgstime;
1736 r->ru_nvcsw += p->signal->nvcsw; 1739 r->ru_nvcsw += p->signal->nvcsw;
@@ -1792,14 +1795,14 @@ SYSCALL_DEFINE1(umask, int, mask)
1792static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) 1795static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1793{ 1796{
1794 struct fd exe; 1797 struct fd exe;
1795 struct dentry *dentry; 1798 struct inode *inode;
1796 int err; 1799 int err;
1797 1800
1798 exe = fdget(fd); 1801 exe = fdget(fd);
1799 if (!exe.file) 1802 if (!exe.file)
1800 return -EBADF; 1803 return -EBADF;
1801 1804
1802 dentry = exe.file->f_path.dentry; 1805 inode = file_inode(exe.file);
1803 1806
1804 /* 1807 /*
1805 * Because the original mm->exe_file points to executable file, make 1808 * Because the original mm->exe_file points to executable file, make
@@ -1807,11 +1810,11 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1807 * overall picture. 1810 * overall picture.
1808 */ 1811 */
1809 err = -EACCES; 1812 err = -EACCES;
1810 if (!S_ISREG(dentry->d_inode->i_mode) || 1813 if (!S_ISREG(inode->i_mode) ||
1811 exe.file->f_path.mnt->mnt_flags & MNT_NOEXEC) 1814 exe.file->f_path.mnt->mnt_flags & MNT_NOEXEC)
1812 goto exit; 1815 goto exit;
1813 1816
1814 err = inode_permission(dentry->d_inode, MAY_EXEC); 1817 err = inode_permission(inode, MAY_EXEC);
1815 if (err) 1818 if (err)
1816 goto exit; 1819 goto exit;
1817 1820
@@ -2012,160 +2015,159 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2012 2015
2013 error = 0; 2016 error = 0;
2014 switch (option) { 2017 switch (option) {
2015 case PR_SET_PDEATHSIG: 2018 case PR_SET_PDEATHSIG:
2016 if (!valid_signal(arg2)) { 2019 if (!valid_signal(arg2)) {
2017 error = -EINVAL; 2020 error = -EINVAL;
2018 break;
2019 }
2020 me->pdeath_signal = arg2;
2021 break;
2022 case PR_GET_PDEATHSIG:
2023 error = put_user(me->pdeath_signal, (int __user *)arg2);
2024 break;
2025 case PR_GET_DUMPABLE:
2026 error = get_dumpable(me->mm);
2027 break; 2021 break;
2028 case PR_SET_DUMPABLE: 2022 }
2029 if (arg2 < 0 || arg2 > 1) { 2023 me->pdeath_signal = arg2;
2030 error = -EINVAL; 2024 break;
2031 break; 2025 case PR_GET_PDEATHSIG:
2032 } 2026 error = put_user(me->pdeath_signal, (int __user *)arg2);
2033 set_dumpable(me->mm, arg2); 2027 break;
2028 case PR_GET_DUMPABLE:
2029 error = get_dumpable(me->mm);
2030 break;
2031 case PR_SET_DUMPABLE:
2032 if (arg2 != SUID_DUMP_DISABLE && arg2 != SUID_DUMP_USER) {
2033 error = -EINVAL;
2034 break; 2034 break;
2035 }
2036 set_dumpable(me->mm, arg2);
2037 break;
2035 2038
2036 case PR_SET_UNALIGN: 2039 case PR_SET_UNALIGN:
2037 error = SET_UNALIGN_CTL(me, arg2); 2040 error = SET_UNALIGN_CTL(me, arg2);
2038 break; 2041 break;
2039 case PR_GET_UNALIGN: 2042 case PR_GET_UNALIGN:
2040 error = GET_UNALIGN_CTL(me, arg2); 2043 error = GET_UNALIGN_CTL(me, arg2);
2041 break; 2044 break;
2042 case PR_SET_FPEMU: 2045 case PR_SET_FPEMU:
2043 error = SET_FPEMU_CTL(me, arg2); 2046 error = SET_FPEMU_CTL(me, arg2);
2044 break; 2047 break;
2045 case PR_GET_FPEMU: 2048 case PR_GET_FPEMU:
2046 error = GET_FPEMU_CTL(me, arg2); 2049 error = GET_FPEMU_CTL(me, arg2);
2047 break; 2050 break;
2048 case PR_SET_FPEXC: 2051 case PR_SET_FPEXC:
2049 error = SET_FPEXC_CTL(me, arg2); 2052 error = SET_FPEXC_CTL(me, arg2);
2050 break; 2053 break;
2051 case PR_GET_FPEXC: 2054 case PR_GET_FPEXC:
2052 error = GET_FPEXC_CTL(me, arg2); 2055 error = GET_FPEXC_CTL(me, arg2);
2053 break; 2056 break;
2054 case PR_GET_TIMING: 2057 case PR_GET_TIMING:
2055 error = PR_TIMING_STATISTICAL; 2058 error = PR_TIMING_STATISTICAL;
2056 break; 2059 break;
2057 case PR_SET_TIMING: 2060 case PR_SET_TIMING:
2058 if (arg2 != PR_TIMING_STATISTICAL) 2061 if (arg2 != PR_TIMING_STATISTICAL)
2059 error = -EINVAL; 2062 error = -EINVAL;
2060 break; 2063 break;
2061 case PR_SET_NAME: 2064 case PR_SET_NAME:
2062 comm[sizeof(me->comm)-1] = 0; 2065 comm[sizeof(me->comm) - 1] = 0;
2063 if (strncpy_from_user(comm, (char __user *)arg2, 2066 if (strncpy_from_user(comm, (char __user *)arg2,
2064 sizeof(me->comm) - 1) < 0) 2067 sizeof(me->comm) - 1) < 0)
2065 return -EFAULT; 2068 return -EFAULT;
2066 set_task_comm(me, comm); 2069 set_task_comm(me, comm);
2067 proc_comm_connector(me); 2070 proc_comm_connector(me);
2068 break; 2071 break;
2069 case PR_GET_NAME: 2072 case PR_GET_NAME:
2070 get_task_comm(comm, me); 2073 get_task_comm(comm, me);
2071 if (copy_to_user((char __user *)arg2, comm, 2074 if (copy_to_user((char __user *)arg2, comm, sizeof(comm)))
2072 sizeof(comm))) 2075 return -EFAULT;
2073 return -EFAULT; 2076 break;
2074 break; 2077 case PR_GET_ENDIAN:
2075 case PR_GET_ENDIAN: 2078 error = GET_ENDIAN(me, arg2);
2076 error = GET_ENDIAN(me, arg2); 2079 break;
2077 break; 2080 case PR_SET_ENDIAN:
2078 case PR_SET_ENDIAN: 2081 error = SET_ENDIAN(me, arg2);
2079 error = SET_ENDIAN(me, arg2); 2082 break;
2080 break; 2083 case PR_GET_SECCOMP:
2081 case PR_GET_SECCOMP: 2084 error = prctl_get_seccomp();
2082 error = prctl_get_seccomp(); 2085 break;
2083 break; 2086 case PR_SET_SECCOMP:
2084 case PR_SET_SECCOMP: 2087 error = prctl_set_seccomp(arg2, (char __user *)arg3);
2085 error = prctl_set_seccomp(arg2, (char __user *)arg3); 2088 break;
2086 break; 2089 case PR_GET_TSC:
2087 case PR_GET_TSC: 2090 error = GET_TSC_CTL(arg2);
2088 error = GET_TSC_CTL(arg2); 2091 break;
2089 break; 2092 case PR_SET_TSC:
2090 case PR_SET_TSC: 2093 error = SET_TSC_CTL(arg2);
2091 error = SET_TSC_CTL(arg2); 2094 break;
2092 break; 2095 case PR_TASK_PERF_EVENTS_DISABLE:
2093 case PR_TASK_PERF_EVENTS_DISABLE: 2096 error = perf_event_task_disable();
2094 error = perf_event_task_disable(); 2097 break;
2095 break; 2098 case PR_TASK_PERF_EVENTS_ENABLE:
2096 case PR_TASK_PERF_EVENTS_ENABLE: 2099 error = perf_event_task_enable();
2097 error = perf_event_task_enable(); 2100 break;
2098 break; 2101 case PR_GET_TIMERSLACK:
2099 case PR_GET_TIMERSLACK: 2102 error = current->timer_slack_ns;
2100 error = current->timer_slack_ns; 2103 break;
2101 break; 2104 case PR_SET_TIMERSLACK:
2102 case PR_SET_TIMERSLACK: 2105 if (arg2 <= 0)
2103 if (arg2 <= 0) 2106 current->timer_slack_ns =
2104 current->timer_slack_ns =
2105 current->default_timer_slack_ns; 2107 current->default_timer_slack_ns;
2106 else 2108 else
2107 current->timer_slack_ns = arg2; 2109 current->timer_slack_ns = arg2;
2108 break; 2110 break;
2109 case PR_MCE_KILL: 2111 case PR_MCE_KILL:
2110 if (arg4 | arg5) 2112 if (arg4 | arg5)
2111 return -EINVAL; 2113 return -EINVAL;
2112 switch (arg2) { 2114 switch (arg2) {
2113 case PR_MCE_KILL_CLEAR: 2115 case PR_MCE_KILL_CLEAR:
2114 if (arg3 != 0) 2116 if (arg3 != 0)
2115 return -EINVAL;
2116 current->flags &= ~PF_MCE_PROCESS;
2117 break;
2118 case PR_MCE_KILL_SET:
2119 current->flags |= PF_MCE_PROCESS;
2120 if (arg3 == PR_MCE_KILL_EARLY)
2121 current->flags |= PF_MCE_EARLY;
2122 else if (arg3 == PR_MCE_KILL_LATE)
2123 current->flags &= ~PF_MCE_EARLY;
2124 else if (arg3 == PR_MCE_KILL_DEFAULT)
2125 current->flags &=
2126 ~(PF_MCE_EARLY|PF_MCE_PROCESS);
2127 else
2128 return -EINVAL;
2129 break;
2130 default:
2131 return -EINVAL; 2117 return -EINVAL;
2132 } 2118 current->flags &= ~PF_MCE_PROCESS;
2133 break; 2119 break;
2134 case PR_MCE_KILL_GET: 2120 case PR_MCE_KILL_SET:
2135 if (arg2 | arg3 | arg4 | arg5) 2121 current->flags |= PF_MCE_PROCESS;
2136 return -EINVAL; 2122 if (arg3 == PR_MCE_KILL_EARLY)
2137 if (current->flags & PF_MCE_PROCESS) 2123 current->flags |= PF_MCE_EARLY;
2138 error = (current->flags & PF_MCE_EARLY) ? 2124 else if (arg3 == PR_MCE_KILL_LATE)
2139 PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE; 2125 current->flags &= ~PF_MCE_EARLY;
2126 else if (arg3 == PR_MCE_KILL_DEFAULT)
2127 current->flags &=
2128 ~(PF_MCE_EARLY|PF_MCE_PROCESS);
2140 else 2129 else
2141 error = PR_MCE_KILL_DEFAULT;
2142 break;
2143 case PR_SET_MM:
2144 error = prctl_set_mm(arg2, arg3, arg4, arg5);
2145 break;
2146 case PR_GET_TID_ADDRESS:
2147 error = prctl_get_tid_address(me, (int __user **)arg2);
2148 break;
2149 case PR_SET_CHILD_SUBREAPER:
2150 me->signal->is_child_subreaper = !!arg2;
2151 break;
2152 case PR_GET_CHILD_SUBREAPER:
2153 error = put_user(me->signal->is_child_subreaper,
2154 (int __user *) arg2);
2155 break;
2156 case PR_SET_NO_NEW_PRIVS:
2157 if (arg2 != 1 || arg3 || arg4 || arg5)
2158 return -EINVAL; 2130 return -EINVAL;
2159
2160 current->no_new_privs = 1;
2161 break; 2131 break;
2162 case PR_GET_NO_NEW_PRIVS:
2163 if (arg2 || arg3 || arg4 || arg5)
2164 return -EINVAL;
2165 return current->no_new_privs ? 1 : 0;
2166 default: 2132 default:
2167 error = -EINVAL; 2133 return -EINVAL;
2168 break; 2134 }
2135 break;
2136 case PR_MCE_KILL_GET:
2137 if (arg2 | arg3 | arg4 | arg5)
2138 return -EINVAL;
2139 if (current->flags & PF_MCE_PROCESS)
2140 error = (current->flags & PF_MCE_EARLY) ?
2141 PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE;
2142 else
2143 error = PR_MCE_KILL_DEFAULT;
2144 break;
2145 case PR_SET_MM:
2146 error = prctl_set_mm(arg2, arg3, arg4, arg5);
2147 break;
2148 case PR_GET_TID_ADDRESS:
2149 error = prctl_get_tid_address(me, (int __user **)arg2);
2150 break;
2151 case PR_SET_CHILD_SUBREAPER:
2152 me->signal->is_child_subreaper = !!arg2;
2153 break;
2154 case PR_GET_CHILD_SUBREAPER:
2155 error = put_user(me->signal->is_child_subreaper,
2156 (int __user *)arg2);
2157 break;
2158 case PR_SET_NO_NEW_PRIVS:
2159 if (arg2 != 1 || arg3 || arg4 || arg5)
2160 return -EINVAL;
2161
2162 current->no_new_privs = 1;
2163 break;
2164 case PR_GET_NO_NEW_PRIVS:
2165 if (arg2 || arg3 || arg4 || arg5)
2166 return -EINVAL;
2167 return current->no_new_privs ? 1 : 0;
2168 default:
2169 error = -EINVAL;
2170 break;
2169 } 2171 }
2170 return error; 2172 return error;
2171} 2173}
@@ -2184,14 +2186,8 @@ SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep,
2184 2186
2185char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; 2187char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff";
2186 2188
2187static void argv_cleanup(struct subprocess_info *info) 2189static int __orderly_poweroff(bool force)
2188{ 2190{
2189 argv_free(info->argv);
2190}
2191
2192static int __orderly_poweroff(void)
2193{
2194 int argc;
2195 char **argv; 2191 char **argv;
2196 static char *envp[] = { 2192 static char *envp[] = {
2197 "HOME=/", 2193 "HOME=/",
@@ -2200,21 +2196,40 @@ static int __orderly_poweroff(void)
2200 }; 2196 };
2201 int ret; 2197 int ret;
2202 2198
2203 argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc); 2199 argv = argv_split(GFP_KERNEL, poweroff_cmd, NULL);
2204 if (argv == NULL) { 2200 if (argv) {
2201 ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
2202 argv_free(argv);
2203 } else {
2205 printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", 2204 printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n",
2206 __func__, poweroff_cmd); 2205 __func__, poweroff_cmd);
2207 return -ENOMEM; 2206 ret = -ENOMEM;
2208 } 2207 }
2209 2208
2210 ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_WAIT_EXEC, 2209 if (ret && force) {
2211 NULL, argv_cleanup, NULL); 2210 printk(KERN_WARNING "Failed to start orderly shutdown: "
2212 if (ret == -ENOMEM) 2211 "forcing the issue\n");
2213 argv_free(argv); 2212 /*
2213 * I guess this should try to kick off some daemon to sync and
2214 * poweroff asap. Or not even bother syncing if we're doing an
2215 * emergency shutdown?
2216 */
2217 emergency_sync();
2218 kernel_power_off();
2219 }
2214 2220
2215 return ret; 2221 return ret;
2216} 2222}
2217 2223
2224static bool poweroff_force;
2225
2226static void poweroff_work_func(struct work_struct *work)
2227{
2228 __orderly_poweroff(poweroff_force);
2229}
2230
2231static DECLARE_WORK(poweroff_work, poweroff_work_func);
2232
2218/** 2233/**
2219 * orderly_poweroff - Trigger an orderly system poweroff 2234 * orderly_poweroff - Trigger an orderly system poweroff
2220 * @force: force poweroff if command execution fails 2235 * @force: force poweroff if command execution fails
@@ -2224,21 +2239,9 @@ static int __orderly_poweroff(void)
2224 */ 2239 */
2225int orderly_poweroff(bool force) 2240int orderly_poweroff(bool force)
2226{ 2241{
2227 int ret = __orderly_poweroff(); 2242 if (force) /* do not override the pending "true" */
2228 2243 poweroff_force = true;
2229 if (ret && force) { 2244 schedule_work(&poweroff_work);
2230 printk(KERN_WARNING "Failed to start orderly shutdown: " 2245 return 0;
2231 "forcing the issue\n");
2232
2233 /*
2234 * I guess this should try to kick off some daemon to sync and
2235 * poweroff asap. Or not even bother syncing if we're doing an
2236 * emergency shutdown?
2237 */
2238 emergency_sync();
2239 kernel_power_off();
2240 }
2241
2242 return ret;
2243} 2246}
2244EXPORT_SYMBOL_GPL(orderly_poweroff); 2247EXPORT_SYMBOL_GPL(orderly_poweroff);
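orderly_poweroff() above becomes non-blocking: it only latches the force flag and schedules poweroff_work, and the work item performs the usermode-helper call plus, if that fails with force set, the emergency sync and power-off. A pthread sketch of deferring the blocking step to a worker; the thread stands in for the workqueue, and run_poweroff_helper() is a hypothetical placeholder for the real helper invocation.

```c
#include <pthread.h>
#include <stdio.h>

static int poweroff_force;                 /* models the pending "force" flag */

static int run_poweroff_helper(void)       /* models __orderly_poweroff() */
{
        printf("running shutdown helper (force=%d)\n", poweroff_force);
        return 0;                          /* pretend the helper succeeded */
}

static void *poweroff_work_func(void *arg) /* models the work item */
{
        (void)arg;
        run_poweroff_helper();
        return NULL;
}

/* Models orderly_poweroff(): never blocks the caller, only queues work. */
static pthread_t orderly_poweroff_model(int force)
{
        pthread_t worker;

        if (force)                         /* do not override a pending "true" */
                poweroff_force = 1;
        pthread_create(&worker, NULL, poweroff_work_func, NULL);
        return worker;
}

int main(void)
{
        pthread_t w = orderly_poweroff_model(1);

        pthread_join(w, NULL);             /* stands in for the workqueue running it */
        return 0;
}
```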
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index dbff751e4086..395084d4ce16 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -25,6 +25,7 @@ cond_syscall(sys_swapoff);
25cond_syscall(sys_kexec_load); 25cond_syscall(sys_kexec_load);
26cond_syscall(compat_sys_kexec_load); 26cond_syscall(compat_sys_kexec_load);
27cond_syscall(sys_init_module); 27cond_syscall(sys_init_module);
28cond_syscall(sys_finit_module);
28cond_syscall(sys_delete_module); 29cond_syscall(sys_delete_module);
29cond_syscall(sys_socketpair); 30cond_syscall(sys_socketpair);
30cond_syscall(sys_bind); 31cond_syscall(sys_bind);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 26f65eaa01f9..afc1dc60f3f8 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -61,6 +61,7 @@
61#include <linux/kmod.h> 61#include <linux/kmod.h>
62#include <linux/capability.h> 62#include <linux/capability.h>
63#include <linux/binfmts.h> 63#include <linux/binfmts.h>
64#include <linux/sched/sysctl.h>
64 65
65#include <asm/uaccess.h> 66#include <asm/uaccess.h>
66#include <asm/processor.h> 67#include <asm/processor.h>
@@ -104,7 +105,6 @@ extern char core_pattern[];
104extern unsigned int core_pipe_limit; 105extern unsigned int core_pipe_limit;
105#endif 106#endif
106extern int pid_max; 107extern int pid_max;
107extern int min_free_kbytes;
108extern int pid_max_min, pid_max_max; 108extern int pid_max_min, pid_max_max;
109extern int sysctl_drop_caches; 109extern int sysctl_drop_caches;
110extern int percpu_pagelist_fraction; 110extern int percpu_pagelist_fraction;
@@ -157,14 +157,20 @@ extern int sysctl_tsb_ratio;
157 157
158#ifdef __hppa__ 158#ifdef __hppa__
159extern int pwrsw_enabled; 159extern int pwrsw_enabled;
160#endif
161
162#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_ALLOW
160extern int unaligned_enabled; 163extern int unaligned_enabled;
161#endif 164#endif
162 165
163#ifdef CONFIG_IA64 166#ifdef CONFIG_IA64
164extern int no_unaligned_warning;
165extern int unaligned_dump_stack; 167extern int unaligned_dump_stack;
166#endif 168#endif
167 169
170#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN
171extern int no_unaligned_warning;
172#endif
173
168#ifdef CONFIG_PROC_SYSCTL 174#ifdef CONFIG_PROC_SYSCTL
169static int proc_do_cad_pid(struct ctl_table *table, int write, 175static int proc_do_cad_pid(struct ctl_table *table, int write,
170 void __user *buffer, size_t *lenp, loff_t *ppos); 176 void __user *buffer, size_t *lenp, loff_t *ppos);
@@ -256,9 +262,11 @@ static int min_sched_granularity_ns = 100000; /* 100 usecs */
256static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ 262static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */
257static int min_wakeup_granularity_ns; /* 0 usecs */ 263static int min_wakeup_granularity_ns; /* 0 usecs */
258static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ 264static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
265#ifdef CONFIG_SMP
259static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; 266static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
260static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; 267static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
261#endif 268#endif /* CONFIG_SMP */
269#endif /* CONFIG_SCHED_DEBUG */
262 270
263#ifdef CONFIG_COMPACTION 271#ifdef CONFIG_COMPACTION
264static int min_extfrag_threshold; 272static int min_extfrag_threshold;
@@ -301,6 +309,7 @@ static struct ctl_table kern_table[] = {
301 .extra1 = &min_wakeup_granularity_ns, 309 .extra1 = &min_wakeup_granularity_ns,
302 .extra2 = &max_wakeup_granularity_ns, 310 .extra2 = &max_wakeup_granularity_ns,
303 }, 311 },
312#ifdef CONFIG_SMP
304 { 313 {
305 .procname = "sched_tunable_scaling", 314 .procname = "sched_tunable_scaling",
306 .data = &sysctl_sched_tunable_scaling, 315 .data = &sysctl_sched_tunable_scaling,
@@ -347,7 +356,45 @@ static struct ctl_table kern_table[] = {
347 .extra1 = &zero, 356 .extra1 = &zero,
348 .extra2 = &one, 357 .extra2 = &one,
349 }, 358 },
350#endif 359#endif /* CONFIG_SMP */
360#ifdef CONFIG_NUMA_BALANCING
361 {
362 .procname = "numa_balancing_scan_delay_ms",
363 .data = &sysctl_numa_balancing_scan_delay,
364 .maxlen = sizeof(unsigned int),
365 .mode = 0644,
366 .proc_handler = proc_dointvec,
367 },
368 {
369 .procname = "numa_balancing_scan_period_min_ms",
370 .data = &sysctl_numa_balancing_scan_period_min,
371 .maxlen = sizeof(unsigned int),
372 .mode = 0644,
373 .proc_handler = proc_dointvec,
374 },
375 {
376 .procname = "numa_balancing_scan_period_reset",
377 .data = &sysctl_numa_balancing_scan_period_reset,
378 .maxlen = sizeof(unsigned int),
379 .mode = 0644,
380 .proc_handler = proc_dointvec,
381 },
382 {
383 .procname = "numa_balancing_scan_period_max_ms",
384 .data = &sysctl_numa_balancing_scan_period_max,
385 .maxlen = sizeof(unsigned int),
386 .mode = 0644,
387 .proc_handler = proc_dointvec,
388 },
389 {
390 .procname = "numa_balancing_scan_size_mb",
391 .data = &sysctl_numa_balancing_scan_size,
392 .maxlen = sizeof(unsigned int),
393 .mode = 0644,
394 .proc_handler = proc_dointvec,
395 },
396#endif /* CONFIG_NUMA_BALANCING */
397#endif /* CONFIG_SCHED_DEBUG */
351 { 398 {
352 .procname = "sched_rt_period_us", 399 .procname = "sched_rt_period_us",
353 .data = &sysctl_sched_rt_period, 400 .data = &sysctl_sched_rt_period,
@@ -362,6 +409,13 @@ static struct ctl_table kern_table[] = {
362 .mode = 0644, 409 .mode = 0644,
363 .proc_handler = sched_rt_handler, 410 .proc_handler = sched_rt_handler,
364 }, 411 },
412 {
413 .procname = "sched_rr_timeslice_ms",
414 .data = &sched_rr_timeslice,
415 .maxlen = sizeof(int),
416 .mode = 0644,
417 .proc_handler = sched_rr_handler,
418 },
365#ifdef CONFIG_SCHED_AUTOGROUP 419#ifdef CONFIG_SCHED_AUTOGROUP
366 { 420 {
367 .procname = "sched_autogroup_enabled", 421 .procname = "sched_autogroup_enabled",
@@ -504,6 +558,8 @@ static struct ctl_table kern_table[] = {
504 .mode = 0644, 558 .mode = 0644,
505 .proc_handler = proc_dointvec, 559 .proc_handler = proc_dointvec,
506 }, 560 },
561#endif
562#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_ALLOW
507 { 563 {
508 .procname = "unaligned-trap", 564 .procname = "unaligned-trap",
509 .data = &unaligned_enabled, 565 .data = &unaligned_enabled,
@@ -565,7 +621,7 @@ static struct ctl_table kern_table[] = {
565 .extra2 = &one, 621 .extra2 = &one,
566 }, 622 },
567#endif 623#endif
568#ifdef CONFIG_HOTPLUG 624
569 { 625 {
570 .procname = "hotplug", 626 .procname = "hotplug",
571 .data = &uevent_helper, 627 .data = &uevent_helper,
@@ -573,7 +629,7 @@ static struct ctl_table kern_table[] = {
573 .mode = 0644, 629 .mode = 0644,
574 .proc_handler = proc_dostring, 630 .proc_handler = proc_dostring,
575 }, 631 },
576#endif 632
577#ifdef CONFIG_CHR_DEV_SG 633#ifdef CONFIG_CHR_DEV_SG
578 { 634 {
579 .procname = "sg-big-buff", 635 .procname = "sg-big-buff",
@@ -870,7 +926,7 @@ static struct ctl_table kern_table[] = {
870 .proc_handler = proc_doulongvec_minmax, 926 .proc_handler = proc_doulongvec_minmax,
871 }, 927 },
872#endif 928#endif
873#ifdef CONFIG_IA64 929#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN
874 { 930 {
875 .procname = "ignore-unaligned-usertrap", 931 .procname = "ignore-unaligned-usertrap",
876 .data = &no_unaligned_warning, 932 .data = &no_unaligned_warning,
@@ -878,6 +934,8 @@ static struct ctl_table kern_table[] = {
878 .mode = 0644, 934 .mode = 0644,
879 .proc_handler = proc_dointvec, 935 .proc_handler = proc_dointvec,
880 }, 936 },
937#endif
938#ifdef CONFIG_IA64
881 { 939 {
882 .procname = "unaligned-dump-stack", 940 .procname = "unaligned-dump-stack",
883 .data = &unaligned_dump_stack, 941 .data = &unaligned_dump_stack,
@@ -1965,7 +2023,7 @@ static int proc_taint(struct ctl_table *table, int write,
1965 int i; 2023 int i;
1966 for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) { 2024 for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) {
1967 if ((tmptaint >> i) & 1) 2025 if ((tmptaint >> i) & 1)
1968 add_taint(i); 2026 add_taint(i, LOCKDEP_STILL_OK);
1969 } 2027 }
1970 } 2028 }
1971 2029
@@ -2042,7 +2100,7 @@ int proc_dointvec_minmax(struct ctl_table *table, int write,
2042static void validate_coredump_safety(void) 2100static void validate_coredump_safety(void)
2043{ 2101{
2044#ifdef CONFIG_COREDUMP 2102#ifdef CONFIG_COREDUMP
2045 if (suid_dumpable == SUID_DUMPABLE_SAFE && 2103 if (suid_dumpable == SUID_DUMP_ROOT &&
2046 core_pattern[0] != '/' && core_pattern[0] != '|') { 2104 core_pattern[0] != '/' && core_pattern[0] != '|') {
2047 printk(KERN_WARNING "Unsafe core_pattern used with "\ 2105 printk(KERN_WARNING "Unsafe core_pattern used with "\
2048 "suid_dumpable=2. Pipe handler or fully qualified "\ 2106 "suid_dumpable=2. Pipe handler or fully qualified "\
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 65bdcf198d4e..ebf72358e86a 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -387,7 +387,6 @@ static const struct bin_table bin_net_ipv4_table[] = {
387 { CTL_INT, NET_TCP_MODERATE_RCVBUF, "tcp_moderate_rcvbuf" }, 387 { CTL_INT, NET_TCP_MODERATE_RCVBUF, "tcp_moderate_rcvbuf" },
388 { CTL_INT, NET_TCP_TSO_WIN_DIVISOR, "tcp_tso_win_divisor" }, 388 { CTL_INT, NET_TCP_TSO_WIN_DIVISOR, "tcp_tso_win_divisor" },
389 { CTL_STR, NET_TCP_CONG_CONTROL, "tcp_congestion_control" }, 389 { CTL_STR, NET_TCP_CONG_CONTROL, "tcp_congestion_control" },
390 { CTL_INT, NET_TCP_ABC, "tcp_abc" },
391 { CTL_INT, NET_TCP_MTU_PROBING, "tcp_mtu_probing" }, 390 { CTL_INT, NET_TCP_MTU_PROBING, "tcp_mtu_probing" },
392 { CTL_INT, NET_TCP_BASE_MSS, "tcp_base_mss" }, 391 { CTL_INT, NET_TCP_BASE_MSS, "tcp_base_mss" },
393 { CTL_INT, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" }, 392 { CTL_INT, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" },
@@ -971,7 +970,6 @@ out:
971static ssize_t bin_intvec(struct file *file, 970static ssize_t bin_intvec(struct file *file,
972 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) 971 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
973{ 972{
974 mm_segment_t old_fs = get_fs();
975 ssize_t copied = 0; 973 ssize_t copied = 0;
976 char *buffer; 974 char *buffer;
977 ssize_t result; 975 ssize_t result;
@@ -984,13 +982,10 @@ static ssize_t bin_intvec(struct file *file,
984 if (oldval && oldlen) { 982 if (oldval && oldlen) {
985 unsigned __user *vec = oldval; 983 unsigned __user *vec = oldval;
986 size_t length = oldlen / sizeof(*vec); 984 size_t length = oldlen / sizeof(*vec);
987 loff_t pos = 0;
988 char *str, *end; 985 char *str, *end;
989 int i; 986 int i;
990 987
991 set_fs(KERNEL_DS); 988 result = kernel_read(file, 0, buffer, BUFSZ - 1);
992 result = vfs_read(file, buffer, BUFSZ - 1, &pos);
993 set_fs(old_fs);
994 if (result < 0) 989 if (result < 0)
995 goto out_kfree; 990 goto out_kfree;
996 991
@@ -1017,7 +1012,6 @@ static ssize_t bin_intvec(struct file *file,
1017 if (newval && newlen) { 1012 if (newval && newlen) {
1018 unsigned __user *vec = newval; 1013 unsigned __user *vec = newval;
1019 size_t length = newlen / sizeof(*vec); 1014 size_t length = newlen / sizeof(*vec);
1020 loff_t pos = 0;
1021 char *str, *end; 1015 char *str, *end;
1022 int i; 1016 int i;
1023 1017
@@ -1033,9 +1027,7 @@ static ssize_t bin_intvec(struct file *file,
1033 str += snprintf(str, end - str, "%lu\t", value); 1027 str += snprintf(str, end - str, "%lu\t", value);
1034 } 1028 }
1035 1029
1036 set_fs(KERNEL_DS); 1030 result = kernel_write(file, buffer, str - buffer, 0);
1037 result = vfs_write(file, buffer, str - buffer, &pos);
1038 set_fs(old_fs);
1039 if (result < 0) 1031 if (result < 0)
1040 goto out_kfree; 1032 goto out_kfree;
1041 } 1033 }
@@ -1049,7 +1041,6 @@ out:
1049static ssize_t bin_ulongvec(struct file *file, 1041static ssize_t bin_ulongvec(struct file *file,
1050 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) 1042 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1051{ 1043{
1052 mm_segment_t old_fs = get_fs();
1053 ssize_t copied = 0; 1044 ssize_t copied = 0;
1054 char *buffer; 1045 char *buffer;
1055 ssize_t result; 1046 ssize_t result;
@@ -1062,13 +1053,10 @@ static ssize_t bin_ulongvec(struct file *file,
1062 if (oldval && oldlen) { 1053 if (oldval && oldlen) {
1063 unsigned long __user *vec = oldval; 1054 unsigned long __user *vec = oldval;
1064 size_t length = oldlen / sizeof(*vec); 1055 size_t length = oldlen / sizeof(*vec);
1065 loff_t pos = 0;
1066 char *str, *end; 1056 char *str, *end;
1067 int i; 1057 int i;
1068 1058
1069 set_fs(KERNEL_DS); 1059 result = kernel_read(file, 0, buffer, BUFSZ - 1);
1070 result = vfs_read(file, buffer, BUFSZ - 1, &pos);
1071 set_fs(old_fs);
1072 if (result < 0) 1060 if (result < 0)
1073 goto out_kfree; 1061 goto out_kfree;
1074 1062
@@ -1095,7 +1083,6 @@ static ssize_t bin_ulongvec(struct file *file,
1095 if (newval && newlen) { 1083 if (newval && newlen) {
1096 unsigned long __user *vec = newval; 1084 unsigned long __user *vec = newval;
1097 size_t length = newlen / sizeof(*vec); 1085 size_t length = newlen / sizeof(*vec);
1098 loff_t pos = 0;
1099 char *str, *end; 1086 char *str, *end;
1100 int i; 1087 int i;
1101 1088
@@ -1111,9 +1098,7 @@ static ssize_t bin_ulongvec(struct file *file,
1111 str += snprintf(str, end - str, "%lu\t", value); 1098 str += snprintf(str, end - str, "%lu\t", value);
1112 } 1099 }
1113 1100
1114 set_fs(KERNEL_DS); 1101 result = kernel_write(file, buffer, str - buffer, 0);
1115 result = vfs_write(file, buffer, str - buffer, &pos);
1116 set_fs(old_fs);
1117 if (result < 0) 1102 if (result < 0)
1118 goto out_kfree; 1103 goto out_kfree;
1119 } 1104 }
@@ -1127,19 +1112,15 @@ out:
1127static ssize_t bin_uuid(struct file *file, 1112static ssize_t bin_uuid(struct file *file,
1128 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) 1113 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1129{ 1114{
1130 mm_segment_t old_fs = get_fs();
1131 ssize_t result, copied = 0; 1115 ssize_t result, copied = 0;
1132 1116
1133 /* Only supports reads */ 1117 /* Only supports reads */
1134 if (oldval && oldlen) { 1118 if (oldval && oldlen) {
1135 loff_t pos = 0;
1136 char buf[40], *str = buf; 1119 char buf[40], *str = buf;
1137 unsigned char uuid[16]; 1120 unsigned char uuid[16];
1138 int i; 1121 int i;
1139 1122
1140 set_fs(KERNEL_DS); 1123 result = kernel_read(file, 0, buf, sizeof(buf) - 1);
1141 result = vfs_read(file, buf, sizeof(buf) - 1, &pos);
1142 set_fs(old_fs);
1143 if (result < 0) 1124 if (result < 0)
1144 goto out; 1125 goto out;
1145 1126
@@ -1175,18 +1156,14 @@ out:
1175static ssize_t bin_dn_node_address(struct file *file, 1156static ssize_t bin_dn_node_address(struct file *file,
1176 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) 1157 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1177{ 1158{
1178 mm_segment_t old_fs = get_fs();
1179 ssize_t result, copied = 0; 1159 ssize_t result, copied = 0;
1180 1160
1181 if (oldval && oldlen) { 1161 if (oldval && oldlen) {
1182 loff_t pos = 0;
1183 char buf[15], *nodep; 1162 char buf[15], *nodep;
1184 unsigned long area, node; 1163 unsigned long area, node;
1185 __le16 dnaddr; 1164 __le16 dnaddr;
1186 1165
1187 set_fs(KERNEL_DS); 1166 result = kernel_read(file, 0, buf, sizeof(buf) - 1);
1188 result = vfs_read(file, buf, sizeof(buf) - 1, &pos);
1189 set_fs(old_fs);
1190 if (result < 0) 1167 if (result < 0)
1191 goto out; 1168 goto out;
1192 1169
@@ -1194,9 +1171,10 @@ static ssize_t bin_dn_node_address(struct file *file,
1194 1171
1195 /* Convert the decnet address to binary */ 1172 /* Convert the decnet address to binary */
1196 result = -EIO; 1173 result = -EIO;
1197 nodep = strchr(buf, '.') + 1; 1174 nodep = strchr(buf, '.');
1198 if (!nodep) 1175 if (!nodep)
1199 goto out; 1176 goto out;
1177 ++nodep;
1200 1178
1201 area = simple_strtoul(buf, NULL, 10); 1179 area = simple_strtoul(buf, NULL, 10);
1202 node = simple_strtoul(nodep, NULL, 10); 1180 node = simple_strtoul(nodep, NULL, 10);
@@ -1215,7 +1193,6 @@ static ssize_t bin_dn_node_address(struct file *file,
1215 } 1193 }
1216 1194
1217 if (newval && newlen) { 1195 if (newval && newlen) {
1218 loff_t pos = 0;
1219 __le16 dnaddr; 1196 __le16 dnaddr;
1220 char buf[15]; 1197 char buf[15];
1221 int len; 1198 int len;
@@ -1232,9 +1209,7 @@ static ssize_t bin_dn_node_address(struct file *file,
1232 le16_to_cpu(dnaddr) >> 10, 1209 le16_to_cpu(dnaddr) >> 10,
1233 le16_to_cpu(dnaddr) & 0x3ff); 1210 le16_to_cpu(dnaddr) & 0x3ff);
1234 1211
1235 set_fs(KERNEL_DS); 1212 result = kernel_write(file, buf, len, 0);
1236 result = vfs_write(file, buf, len, &pos);
1237 set_fs(old_fs);
1238 if (result < 0) 1213 if (result < 0)
1239 goto out; 1214 goto out;
1240 } 1215 }
@@ -1344,7 +1319,7 @@ static ssize_t binary_sysctl(const int *name, int nlen,
1344 goto out_putname; 1319 goto out_putname;
1345 } 1320 }
1346 1321
1347 mnt = current->nsproxy->pid_ns->proc_mnt; 1322 mnt = task_active_pid_ns(current)->proc_mnt;
1348 file = file_open_root(mnt->mnt_root, mnt, pathname, flags); 1323 file = file_open_root(mnt->mnt_root, mnt, pathname, flags);
1349 result = PTR_ERR(file); 1324 result = PTR_ERR(file);
1350 if (IS_ERR(file)) 1325 if (IS_ERR(file))
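
The sysctl_binary.c hunks all remove the same idiom: widening the address limit with set_fs(KERNEL_DS) so that vfs_read()/vfs_write() accept a kernel buffer. kernel_read()/kernel_write() handle the kernel buffer directly, so each caller shrinks to one line. A side-by-side sketch of the two patterns (the helper names are illustrative):

#include <linux/fs.h>
#include <linux/uaccess.h>

/* Old idiom: pretend the kernel buffer is a user pointer for vfs_read(). */
static ssize_t kbuf_read_old(struct file *file, char *buf, size_t len)
{
	mm_segment_t old_fs = get_fs();
	loff_t pos = 0;
	ssize_t ret;

	set_fs(KERNEL_DS);
	ret = vfs_read(file, (char __user *)buf, len, &pos);
	set_fs(old_fs);
	return ret;
}

/* New idiom: kernel_read() takes a kernel buffer and an offset directly
 * (offset 0 here, as in the conversions above). */
static ssize_t kbuf_read_new(struct file *file, char *buf, size_t len)
{
	return kernel_read(file, 0, buf, len);
}
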
diff --git a/kernel/time.c b/kernel/time.c
index d226c6a3fd28..f8342a41efa6 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -115,6 +115,12 @@ SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv,
115} 115}
116 116
117/* 117/*
118 * Indicates if there is an offset between the system clock and the hardware
119 * clock/persistent clock/rtc.
120 */
121int persistent_clock_is_local;
122
123/*
118 * Adjust the time obtained from the CMOS to be UTC time instead of 124 * Adjust the time obtained from the CMOS to be UTC time instead of
119 * local time. 125 * local time.
120 * 126 *
@@ -135,6 +141,8 @@ static inline void warp_clock(void)
135 struct timespec adjust; 141 struct timespec adjust;
136 142
137 adjust = current_kernel_time(); 143 adjust = current_kernel_time();
144 if (sys_tz.tz_minuteswest != 0)
145 persistent_clock_is_local = 1;
138 adjust.tv_sec += sys_tz.tz_minuteswest * 60; 146 adjust.tv_sec += sys_tz.tz_minuteswest * 60;
139 do_settimeofday(&adjust); 147 do_settimeofday(&adjust);
140} 148}
@@ -232,7 +240,7 @@ EXPORT_SYMBOL(current_fs_time);
232 * Avoid unnecessary multiplications/divisions in the 240 * Avoid unnecessary multiplications/divisions in the
233 * two most common HZ cases: 241 * two most common HZ cases:
234 */ 242 */
235inline unsigned int jiffies_to_msecs(const unsigned long j) 243unsigned int jiffies_to_msecs(const unsigned long j)
236{ 244{
237#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) 245#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
238 return (MSEC_PER_SEC / HZ) * j; 246 return (MSEC_PER_SEC / HZ) * j;
@@ -248,7 +256,7 @@ inline unsigned int jiffies_to_msecs(const unsigned long j)
248} 256}
249EXPORT_SYMBOL(jiffies_to_msecs); 257EXPORT_SYMBOL(jiffies_to_msecs);
250 258
251inline unsigned int jiffies_to_usecs(const unsigned long j) 259unsigned int jiffies_to_usecs(const unsigned long j)
252{ 260{
253#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) 261#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
254 return (USEC_PER_SEC / HZ) * j; 262 return (USEC_PER_SEC / HZ) * j;
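
persistent_clock_is_local, set in warp_clock() the first time userspace installs a non-zero timezone, records that the CMOS/RTC runs in local time rather than UTC. Code reading the hardware clock then adds sys_tz.tz_minuteswest * 60 to obtain UTC, and code writing it back subtracts the same offset (as sync_cmos_clock() does later in this series). A small sketch of both directions, with illustrative helper names:

#include <linux/time.h>

extern int persistent_clock_is_local;		/* added by this patch */

/* RTC value -> UTC: the direction warp_clock() handles at boot. */
static time_t rtc_local_to_utc(time_t rtc_sec)
{
	if (persistent_clock_is_local)
		rtc_sec += sys_tz.tz_minuteswest * 60;
	return rtc_sec;
}

/* UTC -> RTC value: the direction used when syncing the clock back. */
static time_t utc_to_rtc_local(time_t utc_sec)
{
	if (persistent_clock_is_local)
		utc_sec -= sys_tz.tz_minuteswest * 60;
	return utc_sec;
}
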
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 8601f0db1261..24510d84efd7 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -12,6 +12,11 @@ config CLOCKSOURCE_WATCHDOG
12config ARCH_CLOCKSOURCE_DATA 12config ARCH_CLOCKSOURCE_DATA
13 bool 13 bool
14 14
15# Platforms that have a persistent clock

16config ALWAYS_USE_PERSISTENT_CLOCK
17 bool
18 default n
19
15# Timekeeping vsyscall support 20# Timekeeping vsyscall support
16config GENERIC_TIME_VSYSCALL 21config GENERIC_TIME_VSYSCALL
17 bool 22 bool
@@ -38,6 +43,10 @@ config GENERIC_CLOCKEVENTS_BUILD
38 default y 43 default y
39 depends on GENERIC_CLOCKEVENTS 44 depends on GENERIC_CLOCKEVENTS
40 45
46# Architecture can handle broadcast in a driver-agnostic way
47config ARCH_HAS_TICK_BROADCAST
48 bool
49
41# Clockevents broadcasting infrastructure 50# Clockevents broadcasting infrastructure
42config GENERIC_CLOCKEVENTS_BROADCAST 51config GENERIC_CLOCKEVENTS_BROADCAST
43 bool 52 bool
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index e2fd74b8e8c2..ff7d9d2ab504 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,4 +1,4 @@
1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o 1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o
2obj-y += timeconv.o posix-clock.o alarmtimer.o 2obj-y += timeconv.o posix-clock.o alarmtimer.o
3 3
4obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o 4obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 30b6de0d977c..c6d6400ee137 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -339,6 +339,7 @@ void clockevents_config_and_register(struct clock_event_device *dev,
339 clockevents_config(dev, freq); 339 clockevents_config(dev, freq);
340 clockevents_register_device(dev); 340 clockevents_register_device(dev);
341} 341}
342EXPORT_SYMBOL_GPL(clockevents_config_and_register);
342 343
343/** 344/**
344 * clockevents_update_freq - Update frequency and reprogram a clock event device. 345 * clockevents_update_freq - Update frequency and reprogram a clock event device.
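
Exporting clockevents_config_and_register() lets clock-event drivers built as modules use the combined configure-and-register helper instead of open-coding the mult/shift math. A sketch of a driver-side call; the device fields and names are illustrative, not from this patch:

#include <linux/clockchips.h>

static struct clock_event_device example_clkevt = {
	.name		= "example-timer",
	.features	= CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
	.rating		= 300,
	/* .set_mode and .set_next_event are supplied by the real driver */
};

static void example_timer_setup(u32 clock_rate_hz)
{
	/* Arguments: frequency in Hz, then the minimum and maximum
	 * programmable delta in timer cycles; mult/shift are derived
	 * internally before the device is registered. */
	clockevents_config_and_register(&example_clkevt, clock_rate_hz,
					0xf, 0x7fffffff);
}
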
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 6629bf7b5285..7a925ba456fb 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -58,7 +58,7 @@ static cycle_t jiffies_read(struct clocksource *cs)
58 return (cycle_t) jiffies; 58 return (cycle_t) jiffies;
59} 59}
60 60
61struct clocksource clocksource_jiffies = { 61static struct clocksource clocksource_jiffies = {
62 .name = "jiffies", 62 .name = "jiffies",
63 .rating = 1, /* lowest valid rating*/ 63 .rating = 1, /* lowest valid rating*/
64 .read = jiffies_read, 64 .read = jiffies_read,
@@ -67,6 +67,8 @@ struct clocksource clocksource_jiffies = {
67 .shift = JIFFIES_SHIFT, 67 .shift = JIFFIES_SHIFT,
68}; 68};
69 69
70__cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock);
71
70#if (BITS_PER_LONG < 64) 72#if (BITS_PER_LONG < 64)
71u64 get_jiffies_64(void) 73u64 get_jiffies_64(void)
72{ 74{
@@ -74,9 +76,9 @@ u64 get_jiffies_64(void)
74 u64 ret; 76 u64 ret;
75 77
76 do { 78 do {
77 seq = read_seqbegin(&xtime_lock); 79 seq = read_seqbegin(&jiffies_lock);
78 ret = jiffies_64; 80 ret = jiffies_64;
79 } while (read_seqretry(&xtime_lock, seq)); 81 } while (read_seqretry(&jiffies_lock, seq));
80 return ret; 82 return ret;
81} 83}
82EXPORT_SYMBOL(get_jiffies_64); 84EXPORT_SYMBOL(get_jiffies_64);
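
With xtime_lock gone, jiffies_64 is published under its own seqlock. The reader side is shown above in get_jiffies_64(); the writer side must take jiffies_lock for write around do_timer(), which is exactly what xtime_update() does after the timekeeping.c conversion further down. A condensed sketch of that pairing:

#include <linux/seqlock.h>

extern seqlock_t jiffies_lock;			/* defined above */
extern void do_timer(unsigned long ticks);	/* declared in tick-internal.h */

/* Writer: advance jiffies_64 only while holding jiffies_lock for write. */
static void example_advance_jiffies(unsigned long ticks)
{
	write_seqlock(&jiffies_lock);
	do_timer(ticks);
	write_sequnlock(&jiffies_lock);
}
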
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 24174b4d669b..072bb066bb7d 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -15,6 +15,7 @@
15#include <linux/time.h> 15#include <linux/time.h>
16#include <linux/mm.h> 16#include <linux/mm.h>
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/rtc.h>
18 19
19#include "tick-internal.h" 20#include "tick-internal.h"
20 21
@@ -22,7 +23,7 @@
22 * NTP timekeeping variables: 23 * NTP timekeeping variables:
23 */ 24 */
24 25
25DEFINE_SPINLOCK(ntp_lock); 26DEFINE_RAW_SPINLOCK(ntp_lock);
26 27
27 28
28/* USER_HZ period (usecs): */ 29/* USER_HZ period (usecs): */
@@ -347,7 +348,7 @@ void ntp_clear(void)
347{ 348{
348 unsigned long flags; 349 unsigned long flags;
349 350
350 spin_lock_irqsave(&ntp_lock, flags); 351 raw_spin_lock_irqsave(&ntp_lock, flags);
351 352
352 time_adjust = 0; /* stop active adjtime() */ 353 time_adjust = 0; /* stop active adjtime() */
353 time_status |= STA_UNSYNC; 354 time_status |= STA_UNSYNC;
@@ -361,7 +362,7 @@ void ntp_clear(void)
361 362
362 /* Clear PPS state variables */ 363 /* Clear PPS state variables */
363 pps_clear(); 364 pps_clear();
364 spin_unlock_irqrestore(&ntp_lock, flags); 365 raw_spin_unlock_irqrestore(&ntp_lock, flags);
365 366
366} 367}
367 368
@@ -371,9 +372,9 @@ u64 ntp_tick_length(void)
371 unsigned long flags; 372 unsigned long flags;
372 s64 ret; 373 s64 ret;
373 374
374 spin_lock_irqsave(&ntp_lock, flags); 375 raw_spin_lock_irqsave(&ntp_lock, flags);
375 ret = tick_length; 376 ret = tick_length;
376 spin_unlock_irqrestore(&ntp_lock, flags); 377 raw_spin_unlock_irqrestore(&ntp_lock, flags);
377 return ret; 378 return ret;
378} 379}
379 380
@@ -394,7 +395,7 @@ int second_overflow(unsigned long secs)
394 int leap = 0; 395 int leap = 0;
395 unsigned long flags; 396 unsigned long flags;
396 397
397 spin_lock_irqsave(&ntp_lock, flags); 398 raw_spin_lock_irqsave(&ntp_lock, flags);
398 399
399 /* 400 /*
400 * Leap second processing. If in leap-insert state at the end of the 401 * Leap second processing. If in leap-insert state at the end of the
@@ -478,13 +479,12 @@ int second_overflow(unsigned long secs)
478 time_adjust = 0; 479 time_adjust = 0;
479 480
480out: 481out:
481 spin_unlock_irqrestore(&ntp_lock, flags); 482 raw_spin_unlock_irqrestore(&ntp_lock, flags);
482 483
483 return leap; 484 return leap;
484} 485}
485 486
486#ifdef CONFIG_GENERIC_CMOS_UPDATE 487#if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC)
487
488static void sync_cmos_clock(struct work_struct *work); 488static void sync_cmos_clock(struct work_struct *work);
489 489
490static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock); 490static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock);
@@ -510,14 +510,26 @@ static void sync_cmos_clock(struct work_struct *work)
510 } 510 }
511 511
512 getnstimeofday(&now); 512 getnstimeofday(&now);
513 if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) 513 if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) {
514 fail = update_persistent_clock(now); 514 struct timespec adjust = now;
515
516 fail = -ENODEV;
517 if (persistent_clock_is_local)
518 adjust.tv_sec -= (sys_tz.tz_minuteswest * 60);
519#ifdef CONFIG_GENERIC_CMOS_UPDATE
520 fail = update_persistent_clock(adjust);
521#endif
522#ifdef CONFIG_RTC_SYSTOHC
523 if (fail == -ENODEV)
524 fail = rtc_set_ntp_time(adjust);
525#endif
526 }
515 527
516 next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec - (TICK_NSEC / 2); 528 next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec - (TICK_NSEC / 2);
517 if (next.tv_nsec <= 0) 529 if (next.tv_nsec <= 0)
518 next.tv_nsec += NSEC_PER_SEC; 530 next.tv_nsec += NSEC_PER_SEC;
519 531
520 if (!fail) 532 if (!fail || fail == -ENODEV)
521 next.tv_sec = 659; 533 next.tv_sec = 659;
522 else 534 else
523 next.tv_sec = 0; 535 next.tv_sec = 0;
@@ -660,7 +672,7 @@ int do_adjtimex(struct timex *txc)
660 672
661 getnstimeofday(&ts); 673 getnstimeofday(&ts);
662 674
663 spin_lock_irq(&ntp_lock); 675 raw_spin_lock_irq(&ntp_lock);
664 676
665 if (txc->modes & ADJ_ADJTIME) { 677 if (txc->modes & ADJ_ADJTIME) {
666 long save_adjust = time_adjust; 678 long save_adjust = time_adjust;
@@ -702,7 +714,7 @@ int do_adjtimex(struct timex *txc)
702 /* fill PPS status fields */ 714 /* fill PPS status fields */
703 pps_fill_timex(txc); 715 pps_fill_timex(txc);
704 716
705 spin_unlock_irq(&ntp_lock); 717 raw_spin_unlock_irq(&ntp_lock);
706 718
707 txc->time.tv_sec = ts.tv_sec; 719 txc->time.tv_sec = ts.tv_sec;
708 txc->time.tv_usec = ts.tv_nsec; 720 txc->time.tv_usec = ts.tv_nsec;
@@ -900,7 +912,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
900 912
901 pts_norm = pps_normalize_ts(*phase_ts); 913 pts_norm = pps_normalize_ts(*phase_ts);
902 914
903 spin_lock_irqsave(&ntp_lock, flags); 915 raw_spin_lock_irqsave(&ntp_lock, flags);
904 916
905 /* clear the error bits, they will be set again if needed */ 917 /* clear the error bits, they will be set again if needed */
906 time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); 918 time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR);
@@ -913,7 +925,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
913 * just start the frequency interval */ 925 * just start the frequency interval */
914 if (unlikely(pps_fbase.tv_sec == 0)) { 926 if (unlikely(pps_fbase.tv_sec == 0)) {
915 pps_fbase = *raw_ts; 927 pps_fbase = *raw_ts;
916 spin_unlock_irqrestore(&ntp_lock, flags); 928 raw_spin_unlock_irqrestore(&ntp_lock, flags);
917 return; 929 return;
918 } 930 }
919 931
@@ -928,7 +940,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
928 time_status |= STA_PPSJITTER; 940 time_status |= STA_PPSJITTER;
929 /* restart the frequency calibration interval */ 941 /* restart the frequency calibration interval */
930 pps_fbase = *raw_ts; 942 pps_fbase = *raw_ts;
931 spin_unlock_irqrestore(&ntp_lock, flags); 943 raw_spin_unlock_irqrestore(&ntp_lock, flags);
932 pr_err("hardpps: PPSJITTER: bad pulse\n"); 944 pr_err("hardpps: PPSJITTER: bad pulse\n");
933 return; 945 return;
934 } 946 }
@@ -945,7 +957,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
945 957
946 hardpps_update_phase(pts_norm.nsec); 958 hardpps_update_phase(pts_norm.nsec);
947 959
948 spin_unlock_irqrestore(&ntp_lock, flags); 960 raw_spin_unlock_irqrestore(&ntp_lock, flags);
949} 961}
950EXPORT_SYMBOL(hardpps); 962EXPORT_SYMBOL(hardpps);
951 963
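
Two things change in ntp.c: ntp_lock becomes a raw spinlock (it is taken from hard interrupt and timekeeping context, so it must not turn into a sleeping lock on RT), and sync_cmos_clock() grows a fallback so that platforms without a CONFIG_GENERIC_CMOS_UPDATE hook can still persist time through the RTC framework. The fallback chain condenses to roughly the sketch below (timezone adjustment omitted; the helper name is illustrative):

#include <linux/time.h>
#include <linux/rtc.h>
#include <linux/errno.h>

/* Try the architecture CMOS hook first; if no such device exists,
 * hand the timestamp to the RTC core instead. */
static int example_persist_time(struct timespec adjust)
{
	int fail = -ENODEV;

#ifdef CONFIG_GENERIC_CMOS_UPDATE
	fail = update_persistent_clock(adjust);
#endif
#ifdef CONFIG_RTC_SYSTOHC
	if (fail == -ENODEV)
		fail = rtc_set_ntp_time(adjust);
#endif
	/* -ENODEV here means "nothing to sync", which the caller treats
	 * like success when scheduling the next ~11 minute sync slot. */
	return fail;
}
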
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index f113755695e2..7f32fe0e52cd 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -18,6 +18,7 @@
18#include <linux/percpu.h> 18#include <linux/percpu.h>
19#include <linux/profile.h> 19#include <linux/profile.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/smp.h>
21 22
22#include "tick-internal.h" 23#include "tick-internal.h"
23 24
@@ -66,7 +67,8 @@ static void tick_broadcast_start_periodic(struct clock_event_device *bc)
66 */ 67 */
67int tick_check_broadcast_device(struct clock_event_device *dev) 68int tick_check_broadcast_device(struct clock_event_device *dev)
68{ 69{
69 if ((tick_broadcast_device.evtdev && 70 if ((dev->features & CLOCK_EVT_FEAT_DUMMY) ||
71 (tick_broadcast_device.evtdev &&
70 tick_broadcast_device.evtdev->rating >= dev->rating) || 72 tick_broadcast_device.evtdev->rating >= dev->rating) ||
71 (dev->features & CLOCK_EVT_FEAT_C3STOP)) 73 (dev->features & CLOCK_EVT_FEAT_C3STOP))
72 return 0; 74 return 0;
@@ -86,6 +88,22 @@ int tick_is_broadcast_device(struct clock_event_device *dev)
86 return (dev && tick_broadcast_device.evtdev == dev); 88 return (dev && tick_broadcast_device.evtdev == dev);
87} 89}
88 90
91static void err_broadcast(const struct cpumask *mask)
92{
93 pr_crit_once("Failed to broadcast timer tick. Some CPUs may be unresponsive.\n");
94}
95
96static void tick_device_setup_broadcast_func(struct clock_event_device *dev)
97{
98 if (!dev->broadcast)
99 dev->broadcast = tick_broadcast;
100 if (!dev->broadcast) {
101 pr_warn_once("%s depends on broadcast, but no broadcast function available\n",
102 dev->name);
103 dev->broadcast = err_broadcast;
104 }
105}
106
89/* 107/*
90 * Check, if the device is disfunctional and a place holder, which 108 * Check, if the device is disfunctional and a place holder, which
91 * needs to be handled by the broadcast device. 109 * needs to be handled by the broadcast device.
@@ -105,6 +123,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
105 */ 123 */
106 if (!tick_device_is_functional(dev)) { 124 if (!tick_device_is_functional(dev)) {
107 dev->event_handler = tick_handle_periodic; 125 dev->event_handler = tick_handle_periodic;
126 tick_device_setup_broadcast_func(dev);
108 cpumask_set_cpu(cpu, tick_get_broadcast_mask()); 127 cpumask_set_cpu(cpu, tick_get_broadcast_mask());
109 tick_broadcast_start_periodic(tick_broadcast_device.evtdev); 128 tick_broadcast_start_periodic(tick_broadcast_device.evtdev);
110 ret = 1; 129 ret = 1;
@@ -116,15 +135,33 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
116 */ 135 */
117 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) { 136 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) {
118 int cpu = smp_processor_id(); 137 int cpu = smp_processor_id();
119
120 cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); 138 cpumask_clear_cpu(cpu, tick_get_broadcast_mask());
121 tick_broadcast_clear_oneshot(cpu); 139 tick_broadcast_clear_oneshot(cpu);
140 } else {
141 tick_device_setup_broadcast_func(dev);
122 } 142 }
123 } 143 }
124 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); 144 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
125 return ret; 145 return ret;
126} 146}
127 147
148#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
149int tick_receive_broadcast(void)
150{
151 struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
152 struct clock_event_device *evt = td->evtdev;
153
154 if (!evt)
155 return -ENODEV;
156
157 if (!evt->event_handler)
158 return -EINVAL;
159
160 evt->event_handler(evt);
161 return 0;
162}
163#endif
164
128/* 165/*
129 * Broadcast the event to the cpus, which are set in the mask (mangled). 166 * Broadcast the event to the cpus, which are set in the mask (mangled).
130 */ 167 */
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index da6c9ecad4e4..b1600a6973f4 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -63,13 +63,13 @@ int tick_is_oneshot_available(void)
63static void tick_periodic(int cpu) 63static void tick_periodic(int cpu)
64{ 64{
65 if (tick_do_timer_cpu == cpu) { 65 if (tick_do_timer_cpu == cpu) {
66 write_seqlock(&xtime_lock); 66 write_seqlock(&jiffies_lock);
67 67
68 /* Keep track of the next tick event */ 68 /* Keep track of the next tick event */
69 tick_next_period = ktime_add(tick_next_period, tick_period); 69 tick_next_period = ktime_add(tick_next_period, tick_period);
70 70
71 do_timer(1); 71 do_timer(1);
72 write_sequnlock(&xtime_lock); 72 write_sequnlock(&jiffies_lock);
73 } 73 }
74 74
75 update_process_times(user_mode(get_irq_regs())); 75 update_process_times(user_mode(get_irq_regs()));
@@ -130,9 +130,9 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
130 ktime_t next; 130 ktime_t next;
131 131
132 do { 132 do {
133 seq = read_seqbegin(&xtime_lock); 133 seq = read_seqbegin(&jiffies_lock);
134 next = tick_next_period; 134 next = tick_next_period;
135 } while (read_seqretry(&xtime_lock, seq)); 135 } while (read_seqretry(&jiffies_lock, seq));
136 136
137 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); 137 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
138 138
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 4e265b901fed..cf3e59ed6dc0 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -141,4 +141,3 @@ static inline int tick_device_is_functional(struct clock_event_device *dev)
141#endif 141#endif
142 142
143extern void do_timer(unsigned long ticks); 143extern void do_timer(unsigned long ticks);
144extern seqlock_t xtime_lock;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index a40260885265..a19a39952c1b 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -20,6 +20,7 @@
20#include <linux/profile.h> 20#include <linux/profile.h>
21#include <linux/sched.h> 21#include <linux/sched.h>
22#include <linux/module.h> 22#include <linux/module.h>
23#include <linux/irq_work.h>
23 24
24#include <asm/irq_regs.h> 25#include <asm/irq_regs.h>
25 26
@@ -28,10 +29,10 @@
28/* 29/*
29 * Per cpu nohz control structure 30 * Per cpu nohz control structure
30 */ 31 */
31static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); 32DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
32 33
33/* 34/*
34 * The time, when the last jiffy update happened. Protected by xtime_lock. 35 * The time, when the last jiffy update happened. Protected by jiffies_lock.
35 */ 36 */
36static ktime_t last_jiffies_update; 37static ktime_t last_jiffies_update;
37 38
@@ -49,14 +50,14 @@ static void tick_do_update_jiffies64(ktime_t now)
49 ktime_t delta; 50 ktime_t delta;
50 51
51 /* 52 /*
52 * Do a quick check without holding xtime_lock: 53 * Do a quick check without holding jiffies_lock:
53 */ 54 */
54 delta = ktime_sub(now, last_jiffies_update); 55 delta = ktime_sub(now, last_jiffies_update);
55 if (delta.tv64 < tick_period.tv64) 56 if (delta.tv64 < tick_period.tv64)
56 return; 57 return;
57 58
58 /* Reevalute with xtime_lock held */ 59 /* Reevaluate with jiffies_lock held */
59 write_seqlock(&xtime_lock); 60 write_seqlock(&jiffies_lock);
60 61
61 delta = ktime_sub(now, last_jiffies_update); 62 delta = ktime_sub(now, last_jiffies_update);
62 if (delta.tv64 >= tick_period.tv64) { 63 if (delta.tv64 >= tick_period.tv64) {
@@ -79,7 +80,7 @@ static void tick_do_update_jiffies64(ktime_t now)
79 /* Keep the tick_next_period variable up to date */ 80 /* Keep the tick_next_period variable up to date */
80 tick_next_period = ktime_add(last_jiffies_update, tick_period); 81 tick_next_period = ktime_add(last_jiffies_update, tick_period);
81 } 82 }
82 write_sequnlock(&xtime_lock); 83 write_sequnlock(&jiffies_lock);
83} 84}
84 85
85/* 86/*
@@ -89,15 +90,58 @@ static ktime_t tick_init_jiffy_update(void)
89{ 90{
90 ktime_t period; 91 ktime_t period;
91 92
92 write_seqlock(&xtime_lock); 93 write_seqlock(&jiffies_lock);
93 /* Did we start the jiffies update yet ? */ 94 /* Did we start the jiffies update yet ? */
94 if (last_jiffies_update.tv64 == 0) 95 if (last_jiffies_update.tv64 == 0)
95 last_jiffies_update = tick_next_period; 96 last_jiffies_update = tick_next_period;
96 period = last_jiffies_update; 97 period = last_jiffies_update;
97 write_sequnlock(&xtime_lock); 98 write_sequnlock(&jiffies_lock);
98 return period; 99 return period;
99} 100}
100 101
102
103static void tick_sched_do_timer(ktime_t now)
104{
105 int cpu = smp_processor_id();
106
107#ifdef CONFIG_NO_HZ
108 /*
109 * Check if the do_timer duty was dropped. We don't care about
110 * concurrency: This happens only when the cpu in charge went
111 * into a long sleep. If two cpus happen to assign themselves to
112 * this duty, then the jiffies update is still serialized by
113 * jiffies_lock.
114 */
115 if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
116 tick_do_timer_cpu = cpu;
117#endif
118
119 /* Check, if the jiffies need an update */
120 if (tick_do_timer_cpu == cpu)
121 tick_do_update_jiffies64(now);
122}
123
124static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
125{
126#ifdef CONFIG_NO_HZ
127 /*
128 * When we are idle and the tick is stopped, we have to touch
129 * the watchdog as we might not schedule for a really long
130 * time. This happens on completely idle SMP systems while
131 * waiting on the login prompt. We also increment the "start of
132 * idle" jiffy stamp so the idle accounting adjustment we do
133 * when we go busy again does not account for too many ticks.
134 */
135 if (ts->tick_stopped) {
136 touch_softlockup_watchdog();
137 if (is_idle_task(current))
138 ts->idle_jiffies++;
139 }
140#endif
141 update_process_times(user_mode(regs));
142 profile_tick(CPU_PROFILING);
143}
144
101/* 145/*
102 * NOHZ - aka dynamic tick functionality 146 * NOHZ - aka dynamic tick functionality
103 */ 147 */
@@ -282,14 +326,14 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
282 326
283 /* Read jiffies and the time when jiffies were updated last */ 327 /* Read jiffies and the time when jiffies were updated last */
284 do { 328 do {
285 seq = read_seqbegin(&xtime_lock); 329 seq = read_seqbegin(&jiffies_lock);
286 last_update = last_jiffies_update; 330 last_update = last_jiffies_update;
287 last_jiffies = jiffies; 331 last_jiffies = jiffies;
288 time_delta = timekeeping_max_deferment(); 332 time_delta = timekeeping_max_deferment();
289 } while (read_seqretry(&xtime_lock, seq)); 333 } while (read_seqretry(&jiffies_lock, seq));
290 334
291 if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || printk_needs_cpu(cpu) || 335 if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) ||
292 arch_needs_cpu(cpu)) { 336 arch_needs_cpu(cpu) || irq_work_needs_cpu()) {
293 next_jiffies = last_jiffies + 1; 337 next_jiffies = last_jiffies + 1;
294 delta_jiffies = 1; 338 delta_jiffies = 1;
295 } else { 339 } else {
@@ -510,6 +554,7 @@ void tick_nohz_idle_enter(void)
510 554
511 local_irq_enable(); 555 local_irq_enable();
512} 556}
557EXPORT_SYMBOL_GPL(tick_nohz_idle_enter);
513 558
514/** 559/**
515 * tick_nohz_irq_exit - update next tick event from interrupt exit 560 * tick_nohz_irq_exit - update next tick event from interrupt exit
@@ -526,6 +571,8 @@ void tick_nohz_irq_exit(void)
526 if (!ts->inidle) 571 if (!ts->inidle)
527 return; 572 return;
528 573
574 /* Cancel the timer because the CPU has already woken up from the C-states */
575 menu_hrtimer_cancel();
529 __tick_nohz_idle_enter(ts); 576 __tick_nohz_idle_enter(ts);
530} 577}
531 578
@@ -586,8 +633,11 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
586 633
587static void tick_nohz_account_idle_ticks(struct tick_sched *ts) 634static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
588{ 635{
589#ifndef CONFIG_VIRT_CPU_ACCOUNTING 636#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
590 unsigned long ticks; 637 unsigned long ticks;
638
639 if (vtime_accounting_enabled())
640 return;
591 /* 641 /*
592 * We stopped the tick in idle. Update process times would miss the 642 * We stopped the tick in idle. Update process times would miss the
593 * time we slept as update_process_times does only a 1 tick 643 * time we slept as update_process_times does only a 1 tick
@@ -621,6 +671,8 @@ void tick_nohz_idle_exit(void)
621 671
622 ts->inidle = 0; 672 ts->inidle = 0;
623 673
674 /* Cancel the timer because the CPU has already woken up from the C-states */
675 menu_hrtimer_cancel();
624 if (ts->idle_active || ts->tick_stopped) 676 if (ts->idle_active || ts->tick_stopped)
625 now = ktime_get(); 677 now = ktime_get();
626 678
@@ -634,6 +686,7 @@ void tick_nohz_idle_exit(void)
634 686
635 local_irq_enable(); 687 local_irq_enable();
636} 688}
689EXPORT_SYMBOL_GPL(tick_nohz_idle_exit);
637 690
638static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now) 691static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now)
639{ 692{
@@ -648,40 +701,12 @@ static void tick_nohz_handler(struct clock_event_device *dev)
648{ 701{
649 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 702 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
650 struct pt_regs *regs = get_irq_regs(); 703 struct pt_regs *regs = get_irq_regs();
651 int cpu = smp_processor_id();
652 ktime_t now = ktime_get(); 704 ktime_t now = ktime_get();
653 705
654 dev->next_event.tv64 = KTIME_MAX; 706 dev->next_event.tv64 = KTIME_MAX;
655 707
656 /* 708 tick_sched_do_timer(now);
657 * Check if the do_timer duty was dropped. We don't care about 709 tick_sched_handle(ts, regs);
658 * concurrency: This happens only when the cpu in charge went
659 * into a long sleep. If two cpus happen to assign themself to
660 * this duty, then the jiffies update is still serialized by
661 * xtime_lock.
662 */
663 if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
664 tick_do_timer_cpu = cpu;
665
666 /* Check, if the jiffies need an update */
667 if (tick_do_timer_cpu == cpu)
668 tick_do_update_jiffies64(now);
669
670 /*
671 * When we are idle and the tick is stopped, we have to touch
672 * the watchdog as we might not schedule for a really long
673 * time. This happens on complete idle SMP systems while
674 * waiting on the login prompt. We also increment the "start
675 * of idle" jiffy stamp so the idle accounting adjustment we
676 * do when we go busy again does not account too much ticks.
677 */
678 if (ts->tick_stopped) {
679 touch_softlockup_watchdog();
680 ts->idle_jiffies++;
681 }
682
683 update_process_times(user_mode(regs));
684 profile_tick(CPU_PROFILING);
685 710
686 while (tick_nohz_reprogram(ts, now)) { 711 while (tick_nohz_reprogram(ts, now)) {
687 now = ktime_get(); 712 now = ktime_get();
@@ -794,7 +819,7 @@ void tick_check_idle(int cpu)
794#ifdef CONFIG_HIGH_RES_TIMERS 819#ifdef CONFIG_HIGH_RES_TIMERS
795/* 820/*
796 * We rearm the timer until we get disabled by the idle code. 821 * We rearm the timer until we get disabled by the idle code.
797 * Called with interrupts disabled and timer->base->cpu_base->lock held. 822 * Called with interrupts disabled.
798 */ 823 */
799static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) 824static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
800{ 825{
@@ -802,45 +827,15 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
802 container_of(timer, struct tick_sched, sched_timer); 827 container_of(timer, struct tick_sched, sched_timer);
803 struct pt_regs *regs = get_irq_regs(); 828 struct pt_regs *regs = get_irq_regs();
804 ktime_t now = ktime_get(); 829 ktime_t now = ktime_get();
805 int cpu = smp_processor_id();
806
807#ifdef CONFIG_NO_HZ
808 /*
809 * Check if the do_timer duty was dropped. We don't care about
810 * concurrency: This happens only when the cpu in charge went
811 * into a long sleep. If two cpus happen to assign themself to
812 * this duty, then the jiffies update is still serialized by
813 * xtime_lock.
814 */
815 if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
816 tick_do_timer_cpu = cpu;
817#endif
818 830
819 /* Check, if the jiffies need an update */ 831 tick_sched_do_timer(now);
820 if (tick_do_timer_cpu == cpu)
821 tick_do_update_jiffies64(now);
822 832
823 /* 833 /*
824 * Do not call, when we are not in irq context and have 834 * Do not call, when we are not in irq context and have
825 * no valid regs pointer 835 * no valid regs pointer
826 */ 836 */
827 if (regs) { 837 if (regs)
828 /* 838 tick_sched_handle(ts, regs);
829 * When we are idle and the tick is stopped, we have to touch
830 * the watchdog as we might not schedule for a really long
831 * time. This happens on complete idle SMP systems while
832 * waiting on the login prompt. We also increment the "start of
833 * idle" jiffy stamp so the idle accounting adjustment we do
834 * when we go busy again does not account too much ticks.
835 */
836 if (ts->tick_stopped) {
837 touch_softlockup_watchdog();
838 if (is_idle_task(current))
839 ts->idle_jiffies++;
840 }
841 update_process_times(user_mode(regs));
842 profile_tick(CPU_PROFILING);
843 }
844 839
845 hrtimer_forward(timer, now, tick_period); 840 hrtimer_forward(timer, now, tick_period);
846 841
@@ -874,7 +869,7 @@ void tick_setup_sched_timer(void)
874 /* Get the next period (per cpu) */ 869 /* Get the next period (per cpu) */
875 hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); 870 hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
876 871
877 /* Offset the tick to avert xtime_lock contention. */ 872 /* Offset the tick to avert jiffies_lock contention. */
878 if (sched_skew_tick) { 873 if (sched_skew_tick) {
879 u64 offset = ktime_to_ns(tick_period) >> 1; 874 u64 offset = ktime_to_ns(tick_period) >> 1;
880 do_div(offset, num_possible_cpus()); 875 do_div(offset, num_possible_cpus());
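
The tick-sched.c hunks are mostly deduplication: the do_timer takeover and the idle-watchdog bookkeeping that used to be copied into both tick_nohz_handler() (low resolution) and tick_sched_timer() (high resolution) now live in tick_sched_do_timer() and tick_sched_handle(). Condensed, both handlers share the shape below; this is only a sketch referring to the static helpers added above, with reprogramming and the hrtimer return value left out:

/* Common skeleton of the two tick handlers after the refactor. */
static void example_tick_handler_body(struct tick_sched *ts)
{
	struct pt_regs *regs = get_irq_regs();
	ktime_t now = ktime_get();

	tick_sched_do_timer(now);	/* claim do_timer duty, update jiffies */

	/* the high-resolution path only handles the tick with valid regs */
	if (regs)
		tick_sched_handle(ts, regs);
}
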
diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c
deleted file mode 100644
index a9ae369925ce..000000000000
--- a/kernel/time/timecompare.c
+++ /dev/null
@@ -1,193 +0,0 @@
1/*
2 * Copyright (C) 2009 Intel Corporation.
3 * Author: Patrick Ohly <patrick.ohly@intel.com>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19
20#include <linux/timecompare.h>
21#include <linux/module.h>
22#include <linux/slab.h>
23#include <linux/math64.h>
24#include <linux/kernel.h>
25
26/*
27 * fixed point arithmetic scale factor for skew
28 *
29 * Usually one would measure skew in ppb (parts per billion, 1e9), but
30 * using a factor of 2 simplifies the math.
31 */
32#define TIMECOMPARE_SKEW_RESOLUTION (((s64)1)<<30)
33
34ktime_t timecompare_transform(struct timecompare *sync,
35 u64 source_tstamp)
36{
37 u64 nsec;
38
39 nsec = source_tstamp + sync->offset;
40 nsec += (s64)(source_tstamp - sync->last_update) * sync->skew /
41 TIMECOMPARE_SKEW_RESOLUTION;
42
43 return ns_to_ktime(nsec);
44}
45EXPORT_SYMBOL_GPL(timecompare_transform);
46
47int timecompare_offset(struct timecompare *sync,
48 s64 *offset,
49 u64 *source_tstamp)
50{
51 u64 start_source = 0, end_source = 0;
52 struct {
53 s64 offset;
54 s64 duration_target;
55 } buffer[10], sample, *samples;
56 int counter = 0, i;
57 int used;
58 int index;
59 int num_samples = sync->num_samples;
60
61 if (num_samples > ARRAY_SIZE(buffer)) {
62 samples = kmalloc(sizeof(*samples) * num_samples, GFP_ATOMIC);
63 if (!samples) {
64 samples = buffer;
65 num_samples = ARRAY_SIZE(buffer);
66 }
67 } else {
68 samples = buffer;
69 }
70
71 /* run until we have enough valid samples, but do not try forever */
72 i = 0;
73 counter = 0;
74 while (1) {
75 u64 ts;
76 ktime_t start, end;
77
78 start = sync->target();
79 ts = timecounter_read(sync->source);
80 end = sync->target();
81
82 if (!i)
83 start_source = ts;
84
85 /* ignore negative durations */
86 sample.duration_target = ktime_to_ns(ktime_sub(end, start));
87 if (sample.duration_target >= 0) {
88 /*
89 * assume symetric delay to and from source:
90 * average target time corresponds to measured
91 * source time
92 */
93 sample.offset =
94 (ktime_to_ns(end) + ktime_to_ns(start)) / 2 -
95 ts;
96
97 /* simple insertion sort based on duration */
98 index = counter - 1;
99 while (index >= 0) {
100 if (samples[index].duration_target <
101 sample.duration_target)
102 break;
103 samples[index + 1] = samples[index];
104 index--;
105 }
106 samples[index + 1] = sample;
107 counter++;
108 }
109
110 i++;
111 if (counter >= num_samples || i >= 100000) {
112 end_source = ts;
113 break;
114 }
115 }
116
117 *source_tstamp = (end_source + start_source) / 2;
118
119 /* remove outliers by only using 75% of the samples */
120 used = counter * 3 / 4;
121 if (!used)
122 used = counter;
123 if (used) {
124 /* calculate average */
125 s64 off = 0;
126 for (index = 0; index < used; index++)
127 off += samples[index].offset;
128 *offset = div_s64(off, used);
129 }
130
131 if (samples && samples != buffer)
132 kfree(samples);
133
134 return used;
135}
136EXPORT_SYMBOL_GPL(timecompare_offset);
137
138void __timecompare_update(struct timecompare *sync,
139 u64 source_tstamp)
140{
141 s64 offset;
142 u64 average_time;
143
144 if (!timecompare_offset(sync, &offset, &average_time))
145 return;
146
147 if (!sync->last_update) {
148 sync->last_update = average_time;
149 sync->offset = offset;
150 sync->skew = 0;
151 } else {
152 s64 delta_nsec = average_time - sync->last_update;
153
154 /* avoid division by negative or small deltas */
155 if (delta_nsec >= 10000) {
156 s64 delta_offset_nsec = offset - sync->offset;
157 s64 skew; /* delta_offset_nsec *
158 TIMECOMPARE_SKEW_RESOLUTION /
159 delta_nsec */
160 u64 divisor;
161
162 /* div_s64() is limited to 32 bit divisor */
163 skew = delta_offset_nsec * TIMECOMPARE_SKEW_RESOLUTION;
164 divisor = delta_nsec;
165 while (unlikely(divisor >= ((s64)1) << 32)) {
166 /* divide both by 2; beware, right shift
167 of negative value has undefined
168 behavior and can only be used for
169 the positive divisor */
170 skew = div_s64(skew, 2);
171 divisor >>= 1;
172 }
173 skew = div_s64(skew, divisor);
174
175 /*
176 * Calculate new overall skew as 4/16 the
177 * old value and 12/16 the new one. This is
178 * a rather arbitrary tradeoff between
179 * only using the latest measurement (0/16 and
180 * 16/16) and even more weight on past measurements.
181 */
182#define TIMECOMPARE_NEW_SKEW_PER_16 12
183 sync->skew =
184 div_s64((16 - TIMECOMPARE_NEW_SKEW_PER_16) *
185 sync->skew +
186 TIMECOMPARE_NEW_SKEW_PER_16 * skew,
187 16);
188 sync->last_update = average_time;
189 sync->offset = offset;
190 }
191 }
192}
193EXPORT_SYMBOL_GPL(__timecompare_update);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index e424970bb562..9a0bc98fbe1d 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -21,19 +21,17 @@
21#include <linux/time.h> 21#include <linux/time.h>
22#include <linux/tick.h> 22#include <linux/tick.h>
23#include <linux/stop_machine.h> 23#include <linux/stop_machine.h>
24#include <linux/pvclock_gtod.h>
24 25
25 26
26static struct timekeeper timekeeper; 27static struct timekeeper timekeeper;
27 28
28/*
29 * This read-write spinlock protects us from races in SMP while
30 * playing with xtime.
31 */
32__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
33
34/* flag for if timekeeping is suspended */ 29/* flag for if timekeeping is suspended */
35int __read_mostly timekeeping_suspended; 30int __read_mostly timekeeping_suspended;
36 31
32/* Flag for if there is a persistent clock on this platform */
33bool __read_mostly persistent_clock_exist = false;
34
37static inline void tk_normalize_xtime(struct timekeeper *tk) 35static inline void tk_normalize_xtime(struct timekeeper *tk)
38{ 36{
39 while (tk->xtime_nsec >= ((u64)NSEC_PER_SEC << tk->shift)) { 37 while (tk->xtime_nsec >= ((u64)NSEC_PER_SEC << tk->shift)) {
@@ -140,6 +138,20 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
140} 138}
141 139
142/* Timekeeper helper functions. */ 140/* Timekeeper helper functions. */
141
142#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
143u32 (*arch_gettimeoffset)(void);
144
145u32 get_arch_timeoffset(void)
146{
147 if (likely(arch_gettimeoffset))
148 return arch_gettimeoffset();
149 return 0;
150}
151#else
152static inline u32 get_arch_timeoffset(void) { return 0; }
153#endif
154
143static inline s64 timekeeping_get_ns(struct timekeeper *tk) 155static inline s64 timekeeping_get_ns(struct timekeeper *tk)
144{ 156{
145 cycle_t cycle_now, cycle_delta; 157 cycle_t cycle_now, cycle_delta;
@@ -156,8 +168,8 @@ static inline s64 timekeeping_get_ns(struct timekeeper *tk)
156 nsec = cycle_delta * tk->mult + tk->xtime_nsec; 168 nsec = cycle_delta * tk->mult + tk->xtime_nsec;
157 nsec >>= tk->shift; 169 nsec >>= tk->shift;
158 170
159 /* If arch requires, add in gettimeoffset() */ 171 /* If arch requires, add in get_arch_timeoffset() */
160 return nsec + arch_gettimeoffset(); 172 return nsec + get_arch_timeoffset();
161} 173}
162 174
163static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) 175static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
@@ -176,9 +188,57 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
176 /* convert delta to nanoseconds. */ 188 /* convert delta to nanoseconds. */
177 nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); 189 nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
178 190
179 /* If arch requires, add in gettimeoffset() */ 191 /* If arch requires, add in get_arch_timeoffset() */
180 return nsec + arch_gettimeoffset(); 192 return nsec + get_arch_timeoffset();
193}
194
195static RAW_NOTIFIER_HEAD(pvclock_gtod_chain);
196
197static void update_pvclock_gtod(struct timekeeper *tk)
198{
199 raw_notifier_call_chain(&pvclock_gtod_chain, 0, tk);
200}
201
202/**
203 * pvclock_gtod_register_notifier - register a pvclock timedata update listener
204 *
205 * Must hold write on timekeeper.lock
206 */
207int pvclock_gtod_register_notifier(struct notifier_block *nb)
208{
209 struct timekeeper *tk = &timekeeper;
210 unsigned long flags;
211 int ret;
212
213 write_seqlock_irqsave(&tk->lock, flags);
214 ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb);
215 /* update timekeeping data */
216 update_pvclock_gtod(tk);
217 write_sequnlock_irqrestore(&tk->lock, flags);
218
219 return ret;
181} 220}
221EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier);
222
223/**
224 * pvclock_gtod_unregister_notifier - unregister a pvclock
225 * timedata update listener
226 *
227 * Must hold write on timekeeper.lock
228 */
229int pvclock_gtod_unregister_notifier(struct notifier_block *nb)
230{
231 struct timekeeper *tk = &timekeeper;
232 unsigned long flags;
233 int ret;
234
235 write_seqlock_irqsave(&tk->lock, flags);
236 ret = raw_notifier_chain_unregister(&pvclock_gtod_chain, nb);
237 write_sequnlock_irqrestore(&tk->lock, flags);
238
239 return ret;
240}
241EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier);
182 242
183/* must hold write on timekeeper.lock */ 243/* must hold write on timekeeper.lock */
184static void timekeeping_update(struct timekeeper *tk, bool clearntp) 244static void timekeeping_update(struct timekeeper *tk, bool clearntp)
@@ -188,6 +248,7 @@ static void timekeeping_update(struct timekeeper *tk, bool clearntp)
188 ntp_clear(); 248 ntp_clear();
189 } 249 }
190 update_vsyscall(tk); 250 update_vsyscall(tk);
251 update_pvclock_gtod(tk);
191} 252}
192 253
193/** 254/**
@@ -210,8 +271,8 @@ static void timekeeping_forward_now(struct timekeeper *tk)
210 271
211 tk->xtime_nsec += cycle_delta * tk->mult; 272 tk->xtime_nsec += cycle_delta * tk->mult;
212 273
213 /* If arch requires, add in gettimeoffset() */ 274 /* If arch requires, add in get_arch_timeoffset() */
214 tk->xtime_nsec += (u64)arch_gettimeoffset() << tk->shift; 275 tk->xtime_nsec += (u64)get_arch_timeoffset() << tk->shift;
215 276
216 tk_normalize_xtime(tk); 277 tk_normalize_xtime(tk);
217 278
@@ -220,19 +281,18 @@ static void timekeeping_forward_now(struct timekeeper *tk)
220} 281}
221 282
222/** 283/**
223 * getnstimeofday - Returns the time of day in a timespec 284 * __getnstimeofday - Returns the time of day in a timespec.
224 * @ts: pointer to the timespec to be set 285 * @ts: pointer to the timespec to be set
225 * 286 *
226 * Returns the time of day in a timespec. 287 * Updates the time of day in the timespec.
288 * Returns 0 on success, or -ve when suspended (timespec will be undefined).
227 */ 289 */
228void getnstimeofday(struct timespec *ts) 290int __getnstimeofday(struct timespec *ts)
229{ 291{
230 struct timekeeper *tk = &timekeeper; 292 struct timekeeper *tk = &timekeeper;
231 unsigned long seq; 293 unsigned long seq;
232 s64 nsecs = 0; 294 s64 nsecs = 0;
233 295
234 WARN_ON(timekeeping_suspended);
235
236 do { 296 do {
237 seq = read_seqbegin(&tk->lock); 297 seq = read_seqbegin(&tk->lock);
238 298
@@ -243,6 +303,26 @@ void getnstimeofday(struct timespec *ts)
243 303
244 ts->tv_nsec = 0; 304 ts->tv_nsec = 0;
245 timespec_add_ns(ts, nsecs); 305 timespec_add_ns(ts, nsecs);
306
307 /*
308 * Do not bail out early, in case there were callers still using
309 * the value, even in the face of the WARN_ON.
310 */
311 if (unlikely(timekeeping_suspended))
312 return -EAGAIN;
313 return 0;
314}
315EXPORT_SYMBOL(__getnstimeofday);
316
317/**
318 * getnstimeofday - Returns the time of day in a timespec.
319 * @ts: pointer to the timespec to be set
320 *
321 * Returns the time of day in a timespec (WARN if suspended).
322 */
323void getnstimeofday(struct timespec *ts)
324{
325 WARN_ON(__getnstimeofday(ts));
246} 326}
247EXPORT_SYMBOL(getnstimeofday); 327EXPORT_SYMBOL(getnstimeofday);
248 328
@@ -596,12 +676,14 @@ void __init timekeeping_init(void)
596 struct timespec now, boot, tmp; 676 struct timespec now, boot, tmp;
597 677
598 read_persistent_clock(&now); 678 read_persistent_clock(&now);
679
599 if (!timespec_valid_strict(&now)) { 680 if (!timespec_valid_strict(&now)) {
600 pr_warn("WARNING: Persistent clock returned invalid value!\n" 681 pr_warn("WARNING: Persistent clock returned invalid value!\n"
601 " Check your CMOS/BIOS settings.\n"); 682 " Check your CMOS/BIOS settings.\n");
602 now.tv_sec = 0; 683 now.tv_sec = 0;
603 now.tv_nsec = 0; 684 now.tv_nsec = 0;
604 } 685 } else if (now.tv_sec || now.tv_nsec)
686 persistent_clock_exist = true;
605 687
606 read_boot_clock(&boot); 688 read_boot_clock(&boot);
607 if (!timespec_valid_strict(&boot)) { 689 if (!timespec_valid_strict(&boot)) {
@@ -674,11 +756,12 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
674{ 756{
675 struct timekeeper *tk = &timekeeper; 757 struct timekeeper *tk = &timekeeper;
676 unsigned long flags; 758 unsigned long flags;
677 struct timespec ts;
678 759
679 /* Make sure we don't set the clock twice */ 760 /*
680 read_persistent_clock(&ts); 761 * Make sure we don't set the clock twice, as timekeeping_resume()
681 if (!(ts.tv_sec == 0 && ts.tv_nsec == 0)) 762 * already did it
763 */
764 if (has_persistent_clock())
682 return; 765 return;
683 766
684 write_seqlock_irqsave(&tk->lock, flags); 767 write_seqlock_irqsave(&tk->lock, flags);
@@ -1299,9 +1382,7 @@ struct timespec get_monotonic_coarse(void)
1299} 1382}
1300 1383
1301/* 1384/*
1302 * The 64-bit jiffies value is not atomic - you MUST NOT read it 1385 * Must hold jiffies_lock
1303 * without sampling the sequence number in xtime_lock.
1304 * jiffies is defined in the linker script...
1305 */ 1386 */
1306void do_timer(unsigned long ticks) 1387void do_timer(unsigned long ticks)
1307{ 1388{
@@ -1389,7 +1470,7 @@ EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset);
1389 */ 1470 */
1390void xtime_update(unsigned long ticks) 1471void xtime_update(unsigned long ticks)
1391{ 1472{
1392 write_seqlock(&xtime_lock); 1473 write_seqlock(&jiffies_lock);
1393 do_timer(ticks); 1474 do_timer(ticks);
1394 write_sequnlock(&xtime_lock); 1475 write_sequnlock(&jiffies_lock);
1395} 1476}
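
__getnstimeofday() gives callers that may run while timekeeping is suspended a way to detect that condition instead of tripping the WARN_ON in getnstimeofday(). A caller that prefers a soft failure would use it roughly as follows (the surrounding function is illustrative):

#include <linux/time.h>
#include <linux/errno.h>

/* Fill @ts with the current time of day; report -EAGAIN instead of
 * warning if timekeeping is currently suspended. */
static int example_get_wall_time(struct timespec *ts)
{
	if (__getnstimeofday(ts) < 0)
		return -EAGAIN;		/* suspended: *ts is undefined */
	return 0;
}
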
diff --git a/kernel/timeconst.bc b/kernel/timeconst.bc
new file mode 100644
index 000000000000..511bdf2cafda
--- /dev/null
+++ b/kernel/timeconst.bc
@@ -0,0 +1,108 @@
1scale=0
2
3define gcd(a,b) {
4 auto t;
5 while (b) {
6 t = b;
7 b = a % b;
8 a = t;
9 }
10 return a;
11}
12
13/* Division by reciprocal multiplication. */
14define fmul(b,n,d) {
15 return (2^b*n+d-1)/d;
16}
17
18/* Adjustment factor when a ceiling value is used. Use as:
19 (imul * n) + (fmulxx * n + fadjxx) >> xx) */
20define fadj(b,n,d) {
21 auto v;
22 d = d/gcd(n,d);
23 v = 2^b*(d-1)/d;
24 return v;
25}
26
27/* Compute the appropriate mul/adj values as well as a shift count,
28 which brings the mul value into the range 2^b-1 <= x < 2^b. Such
29 a shift value will be correct in the signed integer range and off
30 by at most one in the upper half of the unsigned range. */
31define fmuls(b,n,d) {
32 auto s, m;
33 for (s = 0; 1; s++) {
34 m = fmul(s,n,d);
35 if (m >= 2^(b-1))
36 return s;
37 }
38 return 0;
39}
40
41define timeconst(hz) {
42 print "/* Automatically generated by kernel/timeconst.bc */\n"
43 print "/* Time conversion constants for HZ == ", hz, " */\n"
44 print "\n"
45
46 print "#ifndef KERNEL_TIMECONST_H\n"
47 print "#define KERNEL_TIMECONST_H\n\n"
48
49 print "#include <linux/param.h>\n"
50 print "#include <linux/types.h>\n\n"
51
52 print "#if HZ != ", hz, "\n"
53 print "#error \qkernel/timeconst.h has the wrong HZ value!\q\n"
54 print "#endif\n\n"
55
56 if (hz < 2) {
57 print "#error Totally bogus HZ value!\n"
58 } else {
59 s=fmuls(32,1000,hz)
60 obase=16
61 print "#define HZ_TO_MSEC_MUL32\tU64_C(0x", fmul(s,1000,hz), ")\n"
62 print "#define HZ_TO_MSEC_ADJ32\tU64_C(0x", fadj(s,1000,hz), ")\n"
63 obase=10
64 print "#define HZ_TO_MSEC_SHR32\t", s, "\n"
65
66 s=fmuls(32,hz,1000)
67 obase=16
68 print "#define MSEC_TO_HZ_MUL32\tU64_C(0x", fmul(s,hz,1000), ")\n"
69 print "#define MSEC_TO_HZ_ADJ32\tU64_C(0x", fadj(s,hz,1000), ")\n"
70 obase=10
71 print "#define MSEC_TO_HZ_SHR32\t", s, "\n"
72
73 obase=10
74 cd=gcd(hz,1000)
75 print "#define HZ_TO_MSEC_NUM\t\t", 1000/cd, "\n"
76 print "#define HZ_TO_MSEC_DEN\t\t", hz/cd, "\n"
77 print "#define MSEC_TO_HZ_NUM\t\t", hz/cd, "\n"
78 print "#define MSEC_TO_HZ_DEN\t\t", 1000/cd, "\n"
79 print "\n"
80
81 s=fmuls(32,1000000,hz)
82 obase=16
83 print "#define HZ_TO_USEC_MUL32\tU64_C(0x", fmul(s,1000000,hz), ")\n"
84 print "#define HZ_TO_USEC_ADJ32\tU64_C(0x", fadj(s,1000000,hz), ")\n"
85 obase=10
86 print "#define HZ_TO_USEC_SHR32\t", s, "\n"
87
88 s=fmuls(32,hz,1000000)
89 obase=16
90 print "#define USEC_TO_HZ_MUL32\tU64_C(0x", fmul(s,hz,1000000), ")\n"
91 print "#define USEC_TO_HZ_ADJ32\tU64_C(0x", fadj(s,hz,1000000), ")\n"
92 obase=10
93 print "#define USEC_TO_HZ_SHR32\t", s, "\n"
94
95 obase=10
96 cd=gcd(hz,1000000)
97 print "#define HZ_TO_USEC_NUM\t\t", 1000000/cd, "\n"
98 print "#define HZ_TO_USEC_DEN\t\t", hz/cd, "\n"
99 print "#define USEC_TO_HZ_NUM\t\t", hz/cd, "\n"
100 print "#define USEC_TO_HZ_DEN\t\t", 1000000/cd, "\n"
101 print "\n"
102
103 print "#endif /* KERNEL_TIMECONST_H */\n"
104 }
105 halt
106}
107
108timeconst(hz)
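
The new bc script (and the Perl script it replaces below) emits MUL/ADJ/SHR constants so that jiffies/msec/usec conversions become a multiply and a shift instead of a division. As a rough, self-contained illustration of that arithmetic — userspace C, not part of the patch; the helper names simply mirror the bc functions, and intermediate values stay within 64 bits for the HZ value exercised here — the MSEC_TO_HZ constants generated for HZ == 250 reproduce DIV_ROUND_UP(m * HZ, 1000):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t gcd(uint64_t a, uint64_t b)
{
        while (b) {
                uint64_t t = b;
                b = a % b;
                a = t;
        }
        return a;
}

/* ceil(2^s * n / d), cf. fmul() in timeconst.bc */
static uint64_t fmul(unsigned int s, uint64_t n, uint64_t d)
{
        return ((n << s) + d - 1) / d;
}

/* rounding adjustment for the ceiling case, cf. fadj() in timeconst.bc */
static uint64_t fadj(unsigned int s, uint64_t n, uint64_t d)
{
        d /= gcd(n, d);
        return ((d - 1) << s) / d;
}

/* smallest shift that puts the multiplier at or above 2^(bits-1), cf. fmuls() */
static unsigned int fmuls(unsigned int bits, uint64_t n, uint64_t d)
{
        unsigned int s;

        for (s = 0; ; s++)
                if (fmul(s, n, d) >= (1ULL << (bits - 1)))
                        return s;
}

int main(void)
{
        const uint64_t hz = 250;
        unsigned int shr = fmuls(32, hz, 1000);  /* MSEC_TO_HZ_SHR32 */
        uint64_t mul = fmul(shr, hz, 1000);      /* MSEC_TO_HZ_MUL32 */
        uint64_t adj = fadj(shr, hz, 1000);      /* MSEC_TO_HZ_ADJ32 */
        uint64_t m;

        printf("MSEC_TO_HZ: mul=%#llx adj=%#llx shr=%u\n",
               (unsigned long long)mul, (unsigned long long)adj, shr);

        /* (mul * m + adj) >> shr gives the same result as DIV_ROUND_UP(m * hz, 1000) */
        for (m = 0; m < 1000000; m++)
                assert(((mul * m + adj) >> shr) == (m * hz + 999) / 1000);

        return 0;
}

For HZ == 250 this prints mul=0x80000000, adj=0x180000000, shr=33, matching the canned values in the Perl script being removed below.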
diff --git a/kernel/timeconst.pl b/kernel/timeconst.pl
deleted file mode 100644
index eb51d76e058a..000000000000
--- a/kernel/timeconst.pl
+++ /dev/null
@@ -1,378 +0,0 @@
1#!/usr/bin/perl
2# -----------------------------------------------------------------------
3#
4# Copyright 2007-2008 rPath, Inc. - All Rights Reserved
5#
6# This file is part of the Linux kernel, and is made available under
7# the terms of the GNU General Public License version 2 or (at your
8# option) any later version; incorporated herein by reference.
9#
10# -----------------------------------------------------------------------
11#
12
13#
14# Usage: timeconst.pl HZ > timeconst.h
15#
16
17# Precomputed values for systems without Math::BigInt
18# Generated by:
19# timeconst.pl --can 24 32 48 64 100 122 128 200 250 256 300 512 1000 1024 1200
20%canned_values = (
21 24 => [
22 '0xa6aaaaab','0x2aaaaaa',26,
23 125,3,
24 '0xc49ba5e4','0x1fbe76c8b4',37,
25 3,125,
26 '0xa2c2aaab','0xaaaa',16,
27 125000,3,
28 '0xc9539b89','0x7fffbce4217d',47,
29 3,125000,
30 ], 32 => [
31 '0xfa000000','0x6000000',27,
32 125,4,
33 '0x83126e98','0xfdf3b645a',36,
34 4,125,
35 '0xf4240000','0x0',17,
36 31250,1,
37 '0x8637bd06','0x3fff79c842fa',46,
38 1,31250,
39 ], 48 => [
40 '0xa6aaaaab','0x6aaaaaa',27,
41 125,6,
42 '0xc49ba5e4','0xfdf3b645a',36,
43 6,125,
44 '0xa2c2aaab','0x15555',17,
45 62500,3,
46 '0xc9539b89','0x3fffbce4217d',46,
47 3,62500,
48 ], 64 => [
49 '0xfa000000','0xe000000',28,
50 125,8,
51 '0x83126e98','0x7ef9db22d',35,
52 8,125,
53 '0xf4240000','0x0',18,
54 15625,1,
55 '0x8637bd06','0x1fff79c842fa',45,
56 1,15625,
57 ], 100 => [
58 '0xa0000000','0x0',28,
59 10,1,
60 '0xcccccccd','0x733333333',35,
61 1,10,
62 '0x9c400000','0x0',18,
63 10000,1,
64 '0xd1b71759','0x1fff2e48e8a7',45,
65 1,10000,
66 ], 122 => [
67 '0x8325c53f','0xfbcda3a',28,
68 500,61,
69 '0xf9db22d1','0x7fbe76c8b',35,
70 61,500,
71 '0x8012e2a0','0x3ef36',18,
72 500000,61,
73 '0xffda4053','0x1ffffbce4217',45,
74 61,500000,
75 ], 128 => [
76 '0xfa000000','0x1e000000',29,
77 125,16,
78 '0x83126e98','0x3f7ced916',34,
79 16,125,
80 '0xf4240000','0x40000',19,
81 15625,2,
82 '0x8637bd06','0xfffbce4217d',44,
83 2,15625,
84 ], 200 => [
85 '0xa0000000','0x0',29,
86 5,1,
87 '0xcccccccd','0x333333333',34,
88 1,5,
89 '0x9c400000','0x0',19,
90 5000,1,
91 '0xd1b71759','0xfff2e48e8a7',44,
92 1,5000,
93 ], 250 => [
94 '0x80000000','0x0',29,
95 4,1,
96 '0x80000000','0x180000000',33,
97 1,4,
98 '0xfa000000','0x0',20,
99 4000,1,
100 '0x83126e98','0x7ff7ced9168',43,
101 1,4000,
102 ], 256 => [
103 '0xfa000000','0x3e000000',30,
104 125,32,
105 '0x83126e98','0x1fbe76c8b',33,
106 32,125,
107 '0xf4240000','0xc0000',20,
108 15625,4,
109 '0x8637bd06','0x7ffde7210be',43,
110 4,15625,
111 ], 300 => [
112 '0xd5555556','0x2aaaaaaa',30,
113 10,3,
114 '0x9999999a','0x1cccccccc',33,
115 3,10,
116 '0xd0555556','0xaaaaa',20,
117 10000,3,
118 '0x9d495183','0x7ffcb923a29',43,
119 3,10000,
120 ], 512 => [
121 '0xfa000000','0x7e000000',31,
122 125,64,
123 '0x83126e98','0xfdf3b645',32,
124 64,125,
125 '0xf4240000','0x1c0000',21,
126 15625,8,
127 '0x8637bd06','0x3ffef39085f',42,
128 8,15625,
129 ], 1000 => [
130 '0x80000000','0x0',31,
131 1,1,
132 '0x80000000','0x0',31,
133 1,1,
134 '0xfa000000','0x0',22,
135 1000,1,
136 '0x83126e98','0x1ff7ced9168',41,
137 1,1000,
138 ], 1024 => [
139 '0xfa000000','0xfe000000',32,
140 125,128,
141 '0x83126e98','0x7ef9db22',31,
142 128,125,
143 '0xf4240000','0x3c0000',22,
144 15625,16,
145 '0x8637bd06','0x1fff79c842f',41,
146 16,15625,
147 ], 1200 => [
148 '0xd5555556','0xd5555555',32,
149 5,6,
150 '0x9999999a','0x66666666',31,
151 6,5,
152 '0xd0555556','0x2aaaaa',22,
153 2500,3,
154 '0x9d495183','0x1ffcb923a29',41,
155 3,2500,
156 ]
157);
158
159$has_bigint = eval 'use Math::BigInt qw(bgcd); 1;';
160
161sub bint($)
162{
163 my($x) = @_;
164 return Math::BigInt->new($x);
165}
166
167#
168# Constants for division by reciprocal multiplication.
169# (bits, numerator, denominator)
170#
171sub fmul($$$)
172{
173 my ($b,$n,$d) = @_;
174
175 $n = bint($n);
176 $d = bint($d);
177
178 return scalar (($n << $b)+$d-bint(1))/$d;
179}
180
181sub fadj($$$)
182{
183 my($b,$n,$d) = @_;
184
185 $n = bint($n);
186 $d = bint($d);
187
188 $d = $d/bgcd($n, $d);
189 return scalar (($d-bint(1)) << $b)/$d;
190}
191
192sub fmuls($$$) {
193 my($b,$n,$d) = @_;
194 my($s,$m);
195 my($thres) = bint(1) << ($b-1);
196
197 $n = bint($n);
198 $d = bint($d);
199
200 for ($s = 0; 1; $s++) {
201 $m = fmul($s,$n,$d);
202 return $s if ($m >= $thres);
203 }
204 return 0;
205}
206
207# Generate a hex value if the result fits in 64 bits;
208# otherwise skip.
209sub bignum_hex($) {
210 my($x) = @_;
211 my $s = $x->as_hex();
212
213 return (length($s) > 18) ? undef : $s;
214}
215
216# Provides mul, adj, and shr factors for a specific
217# (bit, time, hz) combination
218sub muladj($$$) {
219 my($b, $t, $hz) = @_;
220 my $s = fmuls($b, $t, $hz);
221 my $m = fmul($s, $t, $hz);
222 my $a = fadj($s, $t, $hz);
223 return (bignum_hex($m), bignum_hex($a), $s);
224}
225
226# Provides numerator, denominator values
227sub numden($$) {
228 my($n, $d) = @_;
229 my $g = bgcd($n, $d);
230 return ($n/$g, $d/$g);
231}
232
233# All values for a specific (time, hz) combo
234sub conversions($$) {
235 my ($t, $hz) = @_;
236 my @val = ();
237
238 # HZ_TO_xx
239 push(@val, muladj(32, $t, $hz));
240 push(@val, numden($t, $hz));
241
242 # xx_TO_HZ
243 push(@val, muladj(32, $hz, $t));
244 push(@val, numden($hz, $t));
245
246 return @val;
247}
248
249sub compute_values($) {
250 my($hz) = @_;
251 my @val = ();
252 my $s, $m, $a, $g;
253
254 if (!$has_bigint) {
255 die "$0: HZ == $hz not canned and ".
256 "Math::BigInt not available\n";
257 }
258
259 # MSEC conversions
260 push(@val, conversions(1000, $hz));
261
262 # USEC conversions
263 push(@val, conversions(1000000, $hz));
264
265 return @val;
266}
267
268sub outputval($$)
269{
270 my($name, $val) = @_;
271 my $csuf;
272
273 if (defined($val)) {
274 if ($name !~ /SHR/) {
275 $val = "U64_C($val)";
276 }
277 printf "#define %-23s %s\n", $name.$csuf, $val.$csuf;
278 }
279}
280
281sub output($@)
282{
283 my($hz, @val) = @_;
284 my $pfx, $bit, $suf, $s, $m, $a;
285
286 print "/* Automatically generated by kernel/timeconst.pl */\n";
287 print "/* Conversion constants for HZ == $hz */\n";
288 print "\n";
289 print "#ifndef KERNEL_TIMECONST_H\n";
290 print "#define KERNEL_TIMECONST_H\n";
291 print "\n";
292
293 print "#include <linux/param.h>\n";
294 print "#include <linux/types.h>\n";
295
296 print "\n";
297 print "#if HZ != $hz\n";
298 print "#error \"kernel/timeconst.h has the wrong HZ value!\"\n";
299 print "#endif\n";
300 print "\n";
301
302 foreach $pfx ('HZ_TO_MSEC','MSEC_TO_HZ',
303 'HZ_TO_USEC','USEC_TO_HZ') {
304 foreach $bit (32) {
305 foreach $suf ('MUL', 'ADJ', 'SHR') {
306 outputval("${pfx}_$suf$bit", shift(@val));
307 }
308 }
309 foreach $suf ('NUM', 'DEN') {
310 outputval("${pfx}_$suf", shift(@val));
311 }
312 }
313
314 print "\n";
315 print "#endif /* KERNEL_TIMECONST_H */\n";
316}
317
318# Pretty-print Perl values
319sub perlvals(@) {
320 my $v;
321 my @l = ();
322
323 foreach $v (@_) {
324 if (!defined($v)) {
325 push(@l, 'undef');
326 } elsif ($v =~ /^0x/) {
327 push(@l, "\'".$v."\'");
328 } else {
329 push(@l, $v.'');
330 }
331 }
332 return join(',', @l);
333}
334
335($hz) = @ARGV;
336
337# Use this to generate the %canned_values structure
338if ($hz eq '--can') {
339 shift(@ARGV);
340 @hzlist = sort {$a <=> $b} (@ARGV);
341
342 print "# Precomputed values for systems without Math::BigInt\n";
343 print "# Generated by:\n";
344 print "# timeconst.pl --can ", join(' ', @hzlist), "\n";
345 print "\%canned_values = (\n";
346 my $pf = "\t";
347 foreach $hz (@hzlist) {
348 my @values = compute_values($hz);
349 print "$pf$hz => [\n";
350 while (scalar(@values)) {
351 my $bit;
352 foreach $bit (32) {
353 my $m = shift(@values);
354 my $a = shift(@values);
355 my $s = shift(@values);
356 print "\t\t", perlvals($m,$a,$s), ",\n";
357 }
358 my $n = shift(@values);
359 my $d = shift(@values);
360 print "\t\t", perlvals($n,$d), ",\n";
361 }
362 print "\t]";
363 $pf = ', ';
364 }
365 print "\n);\n";
366} else {
367 $hz += 0; # Force to number
368 if ($hz < 1) {
369 die "Usage: $0 HZ\n";
370 }
371
372 @val = @{$canned_values{$hz}};
373 if (!defined(@val)) {
374 @val = compute_values($hz);
375 }
376 output($hz, @val);
377}
378exit 0;
diff --git a/kernel/timer.c b/kernel/timer.c
index 367d00858482..dbf7a78a1ef1 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -39,6 +39,7 @@
39#include <linux/kallsyms.h> 39#include <linux/kallsyms.h>
40#include <linux/irq_work.h> 40#include <linux/irq_work.h>
41#include <linux/sched.h> 41#include <linux/sched.h>
42#include <linux/sched/sysctl.h>
42#include <linux/slab.h> 43#include <linux/slab.h>
43 44
44#include <asm/uaccess.h> 45#include <asm/uaccess.h>
@@ -1351,7 +1352,6 @@ void update_process_times(int user_tick)
1351 account_process_tick(p, user_tick); 1352 account_process_tick(p, user_tick);
1352 run_local_timers(); 1353 run_local_timers();
1353 rcu_check_callbacks(cpu, user_tick); 1354 rcu_check_callbacks(cpu, user_tick);
1354 printk_tick();
1355#ifdef CONFIG_IRQ_WORK 1355#ifdef CONFIG_IRQ_WORK
1356 if (in_irq()) 1356 if (in_irq())
1357 irq_work_run(); 1357 irq_work_run();
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 4cea4f41c1d9..fc382d6e2765 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -39,6 +39,9 @@ config HAVE_DYNAMIC_FTRACE
39 help 39 help
40 See Documentation/trace/ftrace-design.txt 40 See Documentation/trace/ftrace-design.txt
41 41
42config HAVE_DYNAMIC_FTRACE_WITH_REGS
43 bool
44
42config HAVE_FTRACE_MCOUNT_RECORD 45config HAVE_FTRACE_MCOUNT_RECORD
43 bool 46 bool
44 help 47 help
@@ -78,21 +81,6 @@ config EVENT_TRACING
78 select CONTEXT_SWITCH_TRACER 81 select CONTEXT_SWITCH_TRACER
79 bool 82 bool
80 83
81config EVENT_POWER_TRACING_DEPRECATED
82 depends on EVENT_TRACING
83 bool "Deprecated power event trace API, to be removed"
84 default y
85 help
86 Provides old power event types:
87 C-state/idle accounting events:
88 power:power_start
89 power:power_end
90 and old cpufreq accounting event:
91 power:power_frequency
92 This is for userspace compatibility
93 and will vanish after 5 kernel iterations,
94 namely 3.1.
95
96config CONTEXT_SWITCH_TRACER 84config CONTEXT_SWITCH_TRACER
97 bool 85 bool
98 86
@@ -119,6 +107,7 @@ config TRACING
119 select BINARY_PRINTF 107 select BINARY_PRINTF
120 select EVENT_TRACING 108 select EVENT_TRACING
121 select TRACE_CLOCK 109 select TRACE_CLOCK
110 select IRQ_WORK
122 111
123config GENERIC_TRACER 112config GENERIC_TRACER
124 bool 113 bool
@@ -249,6 +238,16 @@ config FTRACE_SYSCALLS
249 help 238 help
250 Basic tracer to catch the syscall entry and exit events. 239 Basic tracer to catch the syscall entry and exit events.
251 240
241config TRACER_SNAPSHOT
242 bool "Create a snapshot trace buffer"
243 select TRACER_MAX_TRACE
244 help
245 Allow tracing users to take snapshot of the current buffer using the
246 ftrace interface, e.g.:
247
248 echo 1 > /sys/kernel/debug/tracing/snapshot
249 cat snapshot
250
252config TRACE_BRANCH_PROFILING 251config TRACE_BRANCH_PROFILING
253 bool 252 bool
254 select GENERIC_TRACER 253 select GENERIC_TRACER
@@ -415,23 +414,32 @@ config PROBE_EVENTS
415 def_bool n 414 def_bool n
416 415
417config DYNAMIC_FTRACE 416config DYNAMIC_FTRACE
418 bool "enable/disable ftrace tracepoints dynamically" 417 bool "enable/disable function tracing dynamically"
419 depends on FUNCTION_TRACER 418 depends on FUNCTION_TRACER
420 depends on HAVE_DYNAMIC_FTRACE 419 depends on HAVE_DYNAMIC_FTRACE
421 default y 420 default y
422 help 421 help
423 This option will modify all the calls to ftrace dynamically 422 This option will modify all the calls to function tracing
424 (will patch them out of the binary image and replace them 423 dynamically (will patch them out of the binary image and
425 with a No-Op instruction) as they are called. A table is 424 replace them with a No-Op instruction) on boot up. During
426 created to dynamically enable them again. 425 compile time, a table is made of all the locations that ftrace
426 can function trace, and this table is linked into the kernel
427 image. When this is enabled, functions can be individually
428 enabled, and the functions not enabled will not affect
429 performance of the system.
430
431 See the files in /sys/kernel/debug/tracing:
432 available_filter_functions
433 set_ftrace_filter
434 set_ftrace_notrace
427 435
428 This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but 436 This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but
429 otherwise has native performance as long as no tracing is active. 437 otherwise has native performance as long as no tracing is active.
430 438
431 The changes to the code are done by a kernel thread that 439config DYNAMIC_FTRACE_WITH_REGS
432 wakes up once a second and checks to see if any ftrace calls 440 def_bool y
433 were made. If so, it runs stop_machine (stops all CPUS) 441 depends on DYNAMIC_FTRACE
434 and modifies the code to jump over the call to ftrace. 442 depends on HAVE_DYNAMIC_FTRACE_WITH_REGS
435 443
436config FUNCTION_PROFILER 444config FUNCTION_PROFILER
437 bool "Kernel function profiler" 445 bool "Kernel function profiler"
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index c0bd0308741c..9e5b8c272eec 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -147,7 +147,7 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...)
147 return; 147 return;
148 148
149 local_irq_save(flags); 149 local_irq_save(flags);
150 buf = per_cpu_ptr(bt->msg_data, smp_processor_id()); 150 buf = this_cpu_ptr(bt->msg_data);
151 va_start(args, fmt); 151 va_start(args, fmt);
152 n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args); 152 n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args);
153 va_end(args); 153 va_end(args);
@@ -739,6 +739,12 @@ static void blk_add_trace_rq_complete(void *ignore,
739 struct request_queue *q, 739 struct request_queue *q,
740 struct request *rq) 740 struct request *rq)
741{ 741{
742 struct blk_trace *bt = q->blk_trace;
743
744 /* if control ever passes through here, it's a request based driver */
745 if (unlikely(bt && !bt->rq_based))
746 bt->rq_based = true;
747
742 blk_add_trace_rq(q, rq, BLK_TA_COMPLETE); 748 blk_add_trace_rq(q, rq, BLK_TA_COMPLETE);
743} 749}
744 750
@@ -774,15 +780,30 @@ static void blk_add_trace_bio_bounce(void *ignore,
774 blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0); 780 blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0);
775} 781}
776 782
777static void blk_add_trace_bio_complete(void *ignore, 783static void blk_add_trace_bio_complete(void *ignore, struct bio *bio, int error)
778 struct request_queue *q, struct bio *bio,
779 int error)
780{ 784{
785 struct request_queue *q;
786 struct blk_trace *bt;
787
788 if (!bio->bi_bdev)
789 return;
790
791 q = bdev_get_queue(bio->bi_bdev);
792 bt = q->blk_trace;
793
794 /*
795 * Request based drivers will generate both rq and bio completions.
796 * Ignore bio ones.
797 */
798 if (likely(!bt) || bt->rq_based)
799 return;
800
781 blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error); 801 blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error);
782} 802}
783 803
784static void blk_add_trace_bio_backmerge(void *ignore, 804static void blk_add_trace_bio_backmerge(void *ignore,
785 struct request_queue *q, 805 struct request_queue *q,
806 struct request *rq,
786 struct bio *bio) 807 struct bio *bio)
787{ 808{
788 blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0); 809 blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0);
@@ -790,6 +811,7 @@ static void blk_add_trace_bio_backmerge(void *ignore,
790 811
791static void blk_add_trace_bio_frontmerge(void *ignore, 812static void blk_add_trace_bio_frontmerge(void *ignore,
792 struct request_queue *q, 813 struct request_queue *q,
814 struct request *rq,
793 struct bio *bio) 815 struct bio *bio)
794{ 816{
795 blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0); 817 blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 9dcf15d38380..b3fde6d7b7fc 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -10,7 +10,7 @@
10 * Based on code in the latency_tracer, that is: 10 * Based on code in the latency_tracer, that is:
11 * 11 *
12 * Copyright (C) 2004-2006 Ingo Molnar 12 * Copyright (C) 2004-2006 Ingo Molnar
13 * Copyright (C) 2004 William Lee Irwin III 13 * Copyright (C) 2004 Nadia Yvette Chambers
14 */ 14 */
15 15
16#include <linux/stop_machine.h> 16#include <linux/stop_machine.h>
@@ -66,7 +66,7 @@
66 66
67static struct ftrace_ops ftrace_list_end __read_mostly = { 67static struct ftrace_ops ftrace_list_end __read_mostly = {
68 .func = ftrace_stub, 68 .func = ftrace_stub,
69 .flags = FTRACE_OPS_FL_RECURSION_SAFE, 69 .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB,
70}; 70};
71 71
72/* ftrace_enabled is a method to turn ftrace on or off */ 72/* ftrace_enabled is a method to turn ftrace on or off */
@@ -111,6 +111,26 @@ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip);
111#define ftrace_ops_list_func ((ftrace_func_t)ftrace_ops_no_ops) 111#define ftrace_ops_list_func ((ftrace_func_t)ftrace_ops_no_ops)
112#endif 112#endif
113 113
114/*
115 * Traverse the ftrace_global_list, invoking all entries. The reason that we
116 * can use rcu_dereference_raw() is that elements removed from this list
117 * are simply leaked, so there is no need to interact with a grace-period
118 * mechanism. The rcu_dereference_raw() calls are needed to handle
119 * concurrent insertions into the ftrace_global_list.
120 *
121 * Silly Alpha and silly pointer-speculation compiler optimizations!
122 */
123#define do_for_each_ftrace_op(op, list) \
124 op = rcu_dereference_raw(list); \
125 do
126
127/*
128 * Optimized for just a single item in the list (as that is the normal case).
129 */
130#define while_for_each_ftrace_op(op) \
131 while (likely(op = rcu_dereference_raw((op)->next)) && \
132 unlikely((op) != &ftrace_list_end))
133
114/** 134/**
115 * ftrace_nr_registered_ops - return number of ops registered 135 * ftrace_nr_registered_ops - return number of ops registered
116 * 136 *
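
The two helper macros introduced above hide the open-coded list walk that later hunks in this file remove from the individual callers. Expanded by hand — a sketch of the resulting control flow, not actual preprocessor output — a caller such as the updated __ftrace_ops_list_func() further down ends up with:

        op = rcu_dereference_raw(ftrace_ops_list);
        do {
                if (ftrace_ops_test(op, ip))
                        op->func(ip, parent_ip, op, regs);
        } while (likely(op = rcu_dereference_raw((op)->next)) &&
                 unlikely((op) != &ftrace_list_end));

Because ftrace_list_end is a stub entry whose handler is ftrace_stub (and this patch also tags it FTRACE_OPS_FL_STUB), executing the body once for it when the list is otherwise empty is harmless; the control-ops walk later in this patch additionally checks the STUB flag to skip it outright.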
@@ -132,29 +152,21 @@ int ftrace_nr_registered_ops(void)
132 return cnt; 152 return cnt;
133} 153}
134 154
135/*
136 * Traverse the ftrace_global_list, invoking all entries. The reason that we
137 * can use rcu_dereference_raw() is that elements removed from this list
138 * are simply leaked, so there is no need to interact with a grace-period
139 * mechanism. The rcu_dereference_raw() calls are needed to handle
140 * concurrent insertions into the ftrace_global_list.
141 *
142 * Silly Alpha and silly pointer-speculation compiler optimizations!
143 */
144static void 155static void
145ftrace_global_list_func(unsigned long ip, unsigned long parent_ip, 156ftrace_global_list_func(unsigned long ip, unsigned long parent_ip,
146 struct ftrace_ops *op, struct pt_regs *regs) 157 struct ftrace_ops *op, struct pt_regs *regs)
147{ 158{
148 if (unlikely(trace_recursion_test(TRACE_GLOBAL_BIT))) 159 int bit;
160
161 bit = trace_test_and_set_recursion(TRACE_GLOBAL_START, TRACE_GLOBAL_MAX);
162 if (bit < 0)
149 return; 163 return;
150 164
151 trace_recursion_set(TRACE_GLOBAL_BIT); 165 do_for_each_ftrace_op(op, ftrace_global_list) {
152 op = rcu_dereference_raw(ftrace_global_list); /*see above*/
153 while (op != &ftrace_list_end) {
154 op->func(ip, parent_ip, op, regs); 166 op->func(ip, parent_ip, op, regs);
155 op = rcu_dereference_raw(op->next); /*see above*/ 167 } while_for_each_ftrace_op(op);
156 }; 168
157 trace_recursion_clear(TRACE_GLOBAL_BIT); 169 trace_clear_recursion(bit);
158} 170}
159 171
160static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, 172static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip,
@@ -221,10 +233,24 @@ static void update_global_ops(void)
221 * registered callers. 233 * registered callers.
222 */ 234 */
223 if (ftrace_global_list == &ftrace_list_end || 235 if (ftrace_global_list == &ftrace_list_end ||
224 ftrace_global_list->next == &ftrace_list_end) 236 ftrace_global_list->next == &ftrace_list_end) {
225 func = ftrace_global_list->func; 237 func = ftrace_global_list->func;
226 else 238 /*
239 * As we are calling the function directly.
240 * If it does not have recursion protection,
241 * the function_trace_op needs to be updated
242 * accordingly.
243 */
244 if (ftrace_global_list->flags & FTRACE_OPS_FL_RECURSION_SAFE)
245 global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE;
246 else
247 global_ops.flags &= ~FTRACE_OPS_FL_RECURSION_SAFE;
248 } else {
227 func = ftrace_global_list_func; 249 func = ftrace_global_list_func;
250 /* The list has its own recursion protection. */
251 global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE;
252 }
253
228 254
229 /* If we filter on pids, update to use the pid function */ 255 /* If we filter on pids, update to use the pid function */
230 if (!list_empty(&ftrace_pids)) { 256 if (!list_empty(&ftrace_pids)) {
@@ -337,7 +363,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
337 if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK) 363 if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK)
338 return -EINVAL; 364 return -EINVAL;
339 365
340#ifndef ARCH_SUPPORTS_FTRACE_SAVE_REGS 366#ifndef CONFIG_DYNAMIC_FTRACE_WITH_REGS
341 /* 367 /*
342 * If the ftrace_ops specifies SAVE_REGS, then it only can be used 368 * If the ftrace_ops specifies SAVE_REGS, then it only can be used
343 * if the arch supports it, or SAVE_REGS_IF_SUPPORTED is also set. 369 * if the arch supports it, or SAVE_REGS_IF_SUPPORTED is also set.
@@ -668,7 +694,6 @@ int ftrace_profile_pages_init(struct ftrace_profile_stat *stat)
668 free_page(tmp); 694 free_page(tmp);
669 } 695 }
670 696
671 free_page((unsigned long)stat->pages);
672 stat->pages = NULL; 697 stat->pages = NULL;
673 stat->start = NULL; 698 stat->start = NULL;
674 699
@@ -736,7 +761,6 @@ ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip)
736{ 761{
737 struct ftrace_profile *rec; 762 struct ftrace_profile *rec;
738 struct hlist_head *hhd; 763 struct hlist_head *hhd;
739 struct hlist_node *n;
740 unsigned long key; 764 unsigned long key;
741 765
742 key = hash_long(ip, ftrace_profile_bits); 766 key = hash_long(ip, ftrace_profile_bits);
@@ -745,7 +769,7 @@ ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip)
745 if (hlist_empty(hhd)) 769 if (hlist_empty(hhd))
746 return NULL; 770 return NULL;
747 771
748 hlist_for_each_entry_rcu(rec, n, hhd, node) { 772 hlist_for_each_entry_rcu(rec, hhd, node) {
749 if (rec->ip == ip) 773 if (rec->ip == ip)
750 return rec; 774 return rec;
751 } 775 }
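
This hunk, and the similar hlist_for_each_entry(), _safe() and _rcu() hunks that follow, track the tree-wide hlist iterator change that drops the separate struct hlist_node cursor argument. The cursor can go away because typeof() plus container_of() let the macro walk the raw nodes internally and hand back the typed entry directly; the self-contained userspace sketch below (simplified, no RCU, names invented for illustration) shows the idea.

#include <stdio.h>
#include <stddef.h>

struct hlist_node { struct hlist_node *next; };
struct hlist_head { struct hlist_node *first; };

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

/* new-style iteration: entry pointer only, no separate node cursor */
#define hlist_for_each_entry(pos, head, member)                               \
        for (pos = (head)->first ?                                            \
                container_of((head)->first, typeof(*(pos)), member) : NULL;   \
             pos;                                                             \
             pos = (pos)->member.next ?                                       \
                container_of((pos)->member.next, typeof(*(pos)), member) : NULL)

struct rec {
        unsigned long ip;
        struct hlist_node node;
};

int main(void)
{
        struct rec a = { .ip = 0x1000 }, b = { .ip = 0x2000 };
        struct hlist_head head = { .first = &a.node };
        struct rec *rec;

        a.node.next = &b.node;
        b.node.next = NULL;

        hlist_for_each_entry(rec, &head, node)
                printf("ip = %#lx\n", rec->ip);
        return 0;
}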
@@ -1028,6 +1052,19 @@ static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
1028 1052
1029static struct pid * const ftrace_swapper_pid = &init_struct_pid; 1053static struct pid * const ftrace_swapper_pid = &init_struct_pid;
1030 1054
1055loff_t
1056ftrace_filter_lseek(struct file *file, loff_t offset, int whence)
1057{
1058 loff_t ret;
1059
1060 if (file->f_mode & FMODE_READ)
1061 ret = seq_lseek(file, offset, whence);
1062 else
1063 file->f_pos = ret = 1;
1064
1065 return ret;
1066}
1067
1031#ifdef CONFIG_DYNAMIC_FTRACE 1068#ifdef CONFIG_DYNAMIC_FTRACE
1032 1069
1033#ifndef CONFIG_FTRACE_MCOUNT_RECORD 1070#ifndef CONFIG_FTRACE_MCOUNT_RECORD
@@ -1107,7 +1144,6 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
1107 unsigned long key; 1144 unsigned long key;
1108 struct ftrace_func_entry *entry; 1145 struct ftrace_func_entry *entry;
1109 struct hlist_head *hhd; 1146 struct hlist_head *hhd;
1110 struct hlist_node *n;
1111 1147
1112 if (ftrace_hash_empty(hash)) 1148 if (ftrace_hash_empty(hash))
1113 return NULL; 1149 return NULL;
@@ -1119,7 +1155,7 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
1119 1155
1120 hhd = &hash->buckets[key]; 1156 hhd = &hash->buckets[key];
1121 1157
1122 hlist_for_each_entry_rcu(entry, n, hhd, hlist) { 1158 hlist_for_each_entry_rcu(entry, hhd, hlist) {
1123 if (entry->ip == ip) 1159 if (entry->ip == ip)
1124 return entry; 1160 return entry;
1125 } 1161 }
@@ -1176,7 +1212,7 @@ remove_hash_entry(struct ftrace_hash *hash,
1176static void ftrace_hash_clear(struct ftrace_hash *hash) 1212static void ftrace_hash_clear(struct ftrace_hash *hash)
1177{ 1213{
1178 struct hlist_head *hhd; 1214 struct hlist_head *hhd;
1179 struct hlist_node *tp, *tn; 1215 struct hlist_node *tn;
1180 struct ftrace_func_entry *entry; 1216 struct ftrace_func_entry *entry;
1181 int size = 1 << hash->size_bits; 1217 int size = 1 << hash->size_bits;
1182 int i; 1218 int i;
@@ -1186,7 +1222,7 @@ static void ftrace_hash_clear(struct ftrace_hash *hash)
1186 1222
1187 for (i = 0; i < size; i++) { 1223 for (i = 0; i < size; i++) {
1188 hhd = &hash->buckets[i]; 1224 hhd = &hash->buckets[i];
1189 hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist) 1225 hlist_for_each_entry_safe(entry, tn, hhd, hlist)
1190 free_hash_entry(hash, entry); 1226 free_hash_entry(hash, entry);
1191 } 1227 }
1192 FTRACE_WARN_ON(hash->count); 1228 FTRACE_WARN_ON(hash->count);
@@ -1249,7 +1285,6 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
1249{ 1285{
1250 struct ftrace_func_entry *entry; 1286 struct ftrace_func_entry *entry;
1251 struct ftrace_hash *new_hash; 1287 struct ftrace_hash *new_hash;
1252 struct hlist_node *tp;
1253 int size; 1288 int size;
1254 int ret; 1289 int ret;
1255 int i; 1290 int i;
@@ -1264,7 +1299,7 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
1264 1299
1265 size = 1 << hash->size_bits; 1300 size = 1 << hash->size_bits;
1266 for (i = 0; i < size; i++) { 1301 for (i = 0; i < size; i++) {
1267 hlist_for_each_entry(entry, tp, &hash->buckets[i], hlist) { 1302 hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
1268 ret = add_hash_entry(new_hash, entry->ip); 1303 ret = add_hash_entry(new_hash, entry->ip);
1269 if (ret < 0) 1304 if (ret < 0)
1270 goto free_hash; 1305 goto free_hash;
@@ -1290,7 +1325,7 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
1290 struct ftrace_hash **dst, struct ftrace_hash *src) 1325 struct ftrace_hash **dst, struct ftrace_hash *src)
1291{ 1326{
1292 struct ftrace_func_entry *entry; 1327 struct ftrace_func_entry *entry;
1293 struct hlist_node *tp, *tn; 1328 struct hlist_node *tn;
1294 struct hlist_head *hhd; 1329 struct hlist_head *hhd;
1295 struct ftrace_hash *old_hash; 1330 struct ftrace_hash *old_hash;
1296 struct ftrace_hash *new_hash; 1331 struct ftrace_hash *new_hash;
@@ -1336,7 +1371,7 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
1336 size = 1 << src->size_bits; 1371 size = 1 << src->size_bits;
1337 for (i = 0; i < size; i++) { 1372 for (i = 0; i < size; i++) {
1338 hhd = &src->buckets[i]; 1373 hhd = &src->buckets[i];
1339 hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist) { 1374 hlist_for_each_entry_safe(entry, tn, hhd, hlist) {
1340 if (bits > 0) 1375 if (bits > 0)
1341 key = hash_long(entry->ip, bits); 1376 key = hash_long(entry->ip, bits);
1342 else 1377 else
@@ -2437,7 +2472,7 @@ static void reset_iter_read(struct ftrace_iterator *iter)
2437{ 2472{
2438 iter->pos = 0; 2473 iter->pos = 0;
2439 iter->func_pos = 0; 2474 iter->func_pos = 0;
2440 iter->flags &= ~(FTRACE_ITER_PRINTALL & FTRACE_ITER_HASH); 2475 iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_HASH);
2441} 2476}
2442 2477
2443static void *t_start(struct seq_file *m, loff_t *pos) 2478static void *t_start(struct seq_file *m, loff_t *pos)
@@ -2590,7 +2625,7 @@ static void ftrace_filter_reset(struct ftrace_hash *hash)
2590 * routine, you can use ftrace_filter_write() for the write 2625 * routine, you can use ftrace_filter_write() for the write
2591 * routine if @flag has FTRACE_ITER_FILTER set, or 2626 * routine if @flag has FTRACE_ITER_FILTER set, or
2592 * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set. 2627 * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set.
2593 * ftrace_regex_lseek() should be used as the lseek routine, and 2628 * ftrace_filter_lseek() should be used as the lseek routine, and
2594 * release must call ftrace_regex_release(). 2629 * release must call ftrace_regex_release().
2595 */ 2630 */
2596int 2631int
@@ -2674,19 +2709,6 @@ ftrace_notrace_open(struct inode *inode, struct file *file)
2674 inode, file); 2709 inode, file);
2675} 2710}
2676 2711
2677loff_t
2678ftrace_regex_lseek(struct file *file, loff_t offset, int origin)
2679{
2680 loff_t ret;
2681
2682 if (file->f_mode & FMODE_READ)
2683 ret = seq_lseek(file, offset, origin);
2684 else
2685 file->f_pos = ret = 1;
2686
2687 return ret;
2688}
2689
2690static int ftrace_match(char *str, char *regex, int len, int type) 2712static int ftrace_match(char *str, char *regex, int len, int type)
2691{ 2713{
2692 int matched = 0; 2714 int matched = 0;
@@ -2868,14 +2890,13 @@ static int __init ftrace_mod_cmd_init(void)
2868{ 2890{
2869 return register_ftrace_command(&ftrace_mod_cmd); 2891 return register_ftrace_command(&ftrace_mod_cmd);
2870} 2892}
2871device_initcall(ftrace_mod_cmd_init); 2893core_initcall(ftrace_mod_cmd_init);
2872 2894
2873static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, 2895static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip,
2874 struct ftrace_ops *op, struct pt_regs *pt_regs) 2896 struct ftrace_ops *op, struct pt_regs *pt_regs)
2875{ 2897{
2876 struct ftrace_func_probe *entry; 2898 struct ftrace_func_probe *entry;
2877 struct hlist_head *hhd; 2899 struct hlist_head *hhd;
2878 struct hlist_node *n;
2879 unsigned long key; 2900 unsigned long key;
2880 2901
2881 key = hash_long(ip, FTRACE_HASH_BITS); 2902 key = hash_long(ip, FTRACE_HASH_BITS);
@@ -2891,7 +2912,7 @@ static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip,
2891 * on the hash. rcu_read_lock is too dangerous here. 2912 * on the hash. rcu_read_lock is too dangerous here.
2892 */ 2913 */
2893 preempt_disable_notrace(); 2914 preempt_disable_notrace();
2894 hlist_for_each_entry_rcu(entry, n, hhd, node) { 2915 hlist_for_each_entry_rcu(entry, hhd, node) {
2895 if (entry->ip == ip) 2916 if (entry->ip == ip)
2896 entry->ops->func(ip, parent_ip, &entry->data); 2917 entry->ops->func(ip, parent_ip, &entry->data);
2897 } 2918 }
@@ -3042,7 +3063,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3042 void *data, int flags) 3063 void *data, int flags)
3043{ 3064{
3044 struct ftrace_func_probe *entry; 3065 struct ftrace_func_probe *entry;
3045 struct hlist_node *n, *tmp; 3066 struct hlist_node *tmp;
3046 char str[KSYM_SYMBOL_LEN]; 3067 char str[KSYM_SYMBOL_LEN];
3047 int type = MATCH_FULL; 3068 int type = MATCH_FULL;
3048 int i, len = 0; 3069 int i, len = 0;
@@ -3065,7 +3086,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3065 for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { 3086 for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {
3066 struct hlist_head *hhd = &ftrace_func_hash[i]; 3087 struct hlist_head *hhd = &ftrace_func_hash[i];
3067 3088
3068 hlist_for_each_entry_safe(entry, n, tmp, hhd, node) { 3089 hlist_for_each_entry_safe(entry, tmp, hhd, node) {
3069 3090
3070 /* break up if statements for readability */ 3091 /* break up if statements for readability */
3071 if ((flags & PROBE_TEST_FUNC) && entry->ops != ops) 3092 if ((flags & PROBE_TEST_FUNC) && entry->ops != ops)
@@ -3082,8 +3103,8 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3082 continue; 3103 continue;
3083 } 3104 }
3084 3105
3085 hlist_del(&entry->node); 3106 hlist_del_rcu(&entry->node);
3086 call_rcu(&entry->rcu, ftrace_free_entry_rcu); 3107 call_rcu_sched(&entry->rcu, ftrace_free_entry_rcu);
3087 } 3108 }
3088 } 3109 }
3089 __disable_ftrace_function_probe(); 3110 __disable_ftrace_function_probe();
@@ -3419,14 +3440,14 @@ static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata;
3419 3440
3420static int __init set_ftrace_notrace(char *str) 3441static int __init set_ftrace_notrace(char *str)
3421{ 3442{
3422 strncpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE); 3443 strlcpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE);
3423 return 1; 3444 return 1;
3424} 3445}
3425__setup("ftrace_notrace=", set_ftrace_notrace); 3446__setup("ftrace_notrace=", set_ftrace_notrace);
3426 3447
3427static int __init set_ftrace_filter(char *str) 3448static int __init set_ftrace_filter(char *str)
3428{ 3449{
3429 strncpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE); 3450 strlcpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE);
3430 return 1; 3451 return 1;
3431} 3452}
3432__setup("ftrace_filter=", set_ftrace_filter); 3453__setup("ftrace_filter=", set_ftrace_filter);
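
The switch from strncpy() to strlcpy() in the two __setup() handlers above guards against an unterminated buffer when the boot argument is as long as FTRACE_FILTER_SIZE. A small userspace illustration of the difference follows; the minimal my_strlcpy() here is only a stand-in with the same contract as the kernel helper, not the kernel's implementation.

#include <stdio.h>
#include <string.h>

/* minimal strlcpy: always NUL-terminates, returns the length it tried to make */
static size_t my_strlcpy(char *dst, const char *src, size_t size)
{
        size_t len = strlen(src);

        if (size) {
                size_t n = (len >= size) ? size - 1 : len;

                memcpy(dst, src, n);
                dst[n] = '\0';
        }
        return len;
}

int main(void)
{
        char a[8], b[8];
        const char *long_opt = "function_graph_tracer";  /* longer than 8 */

        strncpy(a, long_opt, sizeof(a));     /* a is NOT NUL-terminated */
        my_strlcpy(b, long_opt, sizeof(b));  /* b is "functio" plus '\0' */

        printf("strlcpy result: \"%s\" (always terminated)\n", b);
        /* printing 'a' with %s here would read past the buffer */
        return 0;
}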
@@ -3549,7 +3570,7 @@ static const struct file_operations ftrace_filter_fops = {
3549 .open = ftrace_filter_open, 3570 .open = ftrace_filter_open,
3550 .read = seq_read, 3571 .read = seq_read,
3551 .write = ftrace_filter_write, 3572 .write = ftrace_filter_write,
3552 .llseek = ftrace_regex_lseek, 3573 .llseek = ftrace_filter_lseek,
3553 .release = ftrace_regex_release, 3574 .release = ftrace_regex_release,
3554}; 3575};
3555 3576
@@ -3557,7 +3578,7 @@ static const struct file_operations ftrace_notrace_fops = {
3557 .open = ftrace_notrace_open, 3578 .open = ftrace_notrace_open,
3558 .read = seq_read, 3579 .read = seq_read,
3559 .write = ftrace_notrace_write, 3580 .write = ftrace_notrace_write,
3560 .llseek = ftrace_regex_lseek, 3581 .llseek = ftrace_filter_lseek,
3561 .release = ftrace_regex_release, 3582 .release = ftrace_regex_release,
3562}; 3583};
3563 3584
@@ -3762,8 +3783,8 @@ static const struct file_operations ftrace_graph_fops = {
3762 .open = ftrace_graph_open, 3783 .open = ftrace_graph_open,
3763 .read = seq_read, 3784 .read = seq_read,
3764 .write = ftrace_graph_write, 3785 .write = ftrace_graph_write,
3786 .llseek = ftrace_filter_lseek,
3765 .release = ftrace_graph_release, 3787 .release = ftrace_graph_release,
3766 .llseek = seq_lseek,
3767}; 3788};
3768#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 3789#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
3769 3790
@@ -3970,35 +3991,49 @@ static void ftrace_init_module(struct module *mod,
3970 ftrace_process_locs(mod, start, end); 3991 ftrace_process_locs(mod, start, end);
3971} 3992}
3972 3993
3973static int ftrace_module_notify(struct notifier_block *self, 3994static int ftrace_module_notify_enter(struct notifier_block *self,
3974 unsigned long val, void *data) 3995 unsigned long val, void *data)
3975{ 3996{
3976 struct module *mod = data; 3997 struct module *mod = data;
3977 3998
3978 switch (val) { 3999 if (val == MODULE_STATE_COMING)
3979 case MODULE_STATE_COMING:
3980 ftrace_init_module(mod, mod->ftrace_callsites, 4000 ftrace_init_module(mod, mod->ftrace_callsites,
3981 mod->ftrace_callsites + 4001 mod->ftrace_callsites +
3982 mod->num_ftrace_callsites); 4002 mod->num_ftrace_callsites);
3983 break; 4003 return 0;
3984 case MODULE_STATE_GOING: 4004}
4005
4006static int ftrace_module_notify_exit(struct notifier_block *self,
4007 unsigned long val, void *data)
4008{
4009 struct module *mod = data;
4010
4011 if (val == MODULE_STATE_GOING)
3985 ftrace_release_mod(mod); 4012 ftrace_release_mod(mod);
3986 break;
3987 }
3988 4013
3989 return 0; 4014 return 0;
3990} 4015}
3991#else 4016#else
3992static int ftrace_module_notify(struct notifier_block *self, 4017static int ftrace_module_notify_enter(struct notifier_block *self,
3993 unsigned long val, void *data) 4018 unsigned long val, void *data)
4019{
4020 return 0;
4021}
4022static int ftrace_module_notify_exit(struct notifier_block *self,
4023 unsigned long val, void *data)
3994{ 4024{
3995 return 0; 4025 return 0;
3996} 4026}
3997#endif /* CONFIG_MODULES */ 4027#endif /* CONFIG_MODULES */
3998 4028
3999struct notifier_block ftrace_module_nb = { 4029struct notifier_block ftrace_module_enter_nb = {
4000 .notifier_call = ftrace_module_notify, 4030 .notifier_call = ftrace_module_notify_enter,
4001 .priority = 0, 4031 .priority = INT_MAX, /* Run before anything that can use kprobes */
4032};
4033
4034struct notifier_block ftrace_module_exit_nb = {
4035 .notifier_call = ftrace_module_notify_exit,
4036 .priority = INT_MIN, /* Run after anything that can remove kprobes */
4002}; 4037};
4003 4038
4004extern unsigned long __start_mcount_loc[]; 4039extern unsigned long __start_mcount_loc[];
@@ -4032,9 +4067,13 @@ void __init ftrace_init(void)
4032 __start_mcount_loc, 4067 __start_mcount_loc,
4033 __stop_mcount_loc); 4068 __stop_mcount_loc);
4034 4069
4035 ret = register_module_notifier(&ftrace_module_nb); 4070 ret = register_module_notifier(&ftrace_module_enter_nb);
4036 if (ret) 4071 if (ret)
4037 pr_warning("Failed to register trace ftrace module notifier\n"); 4072 pr_warning("Failed to register trace ftrace module enter notifier\n");
4073
4074 ret = register_module_notifier(&ftrace_module_exit_nb);
4075 if (ret)
4076 pr_warning("Failed to register trace ftrace module exit notifier\n");
4038 4077
4039 set_ftrace_early_filters(); 4078 set_ftrace_early_filters();
4040 4079
@@ -4055,7 +4094,7 @@ static int __init ftrace_nodyn_init(void)
4055 ftrace_enabled = 1; 4094 ftrace_enabled = 1;
4056 return 0; 4095 return 0;
4057} 4096}
4058device_initcall(ftrace_nodyn_init); 4097core_initcall(ftrace_nodyn_init);
4059 4098
4060static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } 4099static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; }
4061static inline void ftrace_startup_enable(int command) { } 4100static inline void ftrace_startup_enable(int command) { }
@@ -4090,14 +4129,12 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip,
4090 */ 4129 */
4091 preempt_disable_notrace(); 4130 preempt_disable_notrace();
4092 trace_recursion_set(TRACE_CONTROL_BIT); 4131 trace_recursion_set(TRACE_CONTROL_BIT);
4093 op = rcu_dereference_raw(ftrace_control_list); 4132 do_for_each_ftrace_op(op, ftrace_control_list) {
4094 while (op != &ftrace_list_end) { 4133 if (!(op->flags & FTRACE_OPS_FL_STUB) &&
4095 if (!ftrace_function_local_disabled(op) && 4134 !ftrace_function_local_disabled(op) &&
4096 ftrace_ops_test(op, ip)) 4135 ftrace_ops_test(op, ip))
4097 op->func(ip, parent_ip, op, regs); 4136 op->func(ip, parent_ip, op, regs);
4098 4137 } while_for_each_ftrace_op(op);
4099 op = rcu_dereference_raw(op->next);
4100 };
4101 trace_recursion_clear(TRACE_CONTROL_BIT); 4138 trace_recursion_clear(TRACE_CONTROL_BIT);
4102 preempt_enable_notrace(); 4139 preempt_enable_notrace();
4103} 4140}
@@ -4112,27 +4149,26 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
4112 struct ftrace_ops *ignored, struct pt_regs *regs) 4149 struct ftrace_ops *ignored, struct pt_regs *regs)
4113{ 4150{
4114 struct ftrace_ops *op; 4151 struct ftrace_ops *op;
4152 int bit;
4115 4153
4116 if (function_trace_stop) 4154 if (function_trace_stop)
4117 return; 4155 return;
4118 4156
4119 if (unlikely(trace_recursion_test(TRACE_INTERNAL_BIT))) 4157 bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX);
4158 if (bit < 0)
4120 return; 4159 return;
4121 4160
4122 trace_recursion_set(TRACE_INTERNAL_BIT);
4123 /* 4161 /*
4124 * Some of the ops may be dynamically allocated, 4162 * Some of the ops may be dynamically allocated,
4125 * they must be freed after a synchronize_sched(). 4163 * they must be freed after a synchronize_sched().
4126 */ 4164 */
4127 preempt_disable_notrace(); 4165 preempt_disable_notrace();
4128 op = rcu_dereference_raw(ftrace_ops_list); 4166 do_for_each_ftrace_op(op, ftrace_ops_list) {
4129 while (op != &ftrace_list_end) {
4130 if (ftrace_ops_test(op, ip)) 4167 if (ftrace_ops_test(op, ip))
4131 op->func(ip, parent_ip, op, regs); 4168 op->func(ip, parent_ip, op, regs);
4132 op = rcu_dereference_raw(op->next); 4169 } while_for_each_ftrace_op(op);
4133 };
4134 preempt_enable_notrace(); 4170 preempt_enable_notrace();
4135 trace_recursion_clear(TRACE_INTERNAL_BIT); 4171 trace_clear_recursion(bit);
4136} 4172}
4137 4173
4138/* 4174/*
@@ -4143,8 +4179,8 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
4143 * Archs are to support both the regs and ftrace_ops at the same time. 4179 * Archs are to support both the regs and ftrace_ops at the same time.
4144 * If they support ftrace_ops, it is assumed they support regs. 4180 * If they support ftrace_ops, it is assumed they support regs.
4145 * If call backs want to use regs, they must either check for regs 4181 * If call backs want to use regs, they must either check for regs
4146 * being NULL, or ARCH_SUPPORTS_FTRACE_SAVE_REGS. 4182 * being NULL, or CONFIG_DYNAMIC_FTRACE_WITH_REGS.
4147 * Note, ARCH_SUPPORT_SAVE_REGS expects a full regs to be saved. 4183 * Note, CONFIG_DYNAMIC_FTRACE_WITH_REGS expects a full regs to be saved.
4148 * An architecture can pass partial regs with ftrace_ops and still 4184 * An architecture can pass partial regs with ftrace_ops and still
4149 * set the ARCH_SUPPORT_FTARCE_OPS. 4185 * set the ARCH_SUPPORT_FTARCE_OPS.
4150 */ 4186 */
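
The comment above spells out the contract between an ftrace_ops callback and the (possibly NULL) pt_regs argument now that the gate is CONFIG_DYNAMIC_FTRACE_WITH_REGS. A hedged sketch of a callback written against that contract — illustrative module-style code, not part of this patch; my_callback/my_ops are invented names:

static void my_callback(unsigned long ip, unsigned long parent_ip,
                        struct ftrace_ops *op, struct pt_regs *regs)
{
        /* regs is only valid if the arch actually saved them for us */
        if (regs)
                pr_info("hit %ps, pc=%lx\n",
                        (void *)ip, instruction_pointer(regs));
        else
                pr_info("hit %ps (no regs available)\n", (void *)ip);
}

static struct ftrace_ops my_ops = {
        .func  = my_callback,
        /* ask for regs where supported, fall back to NULL regs elsewhere */
        .flags = FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED,
};

/* ... register_ftrace_function(&my_ops) from module init ... */

Requesting plain FTRACE_OPS_FL_SAVE_REGS instead would make registration fail on architectures without the WITH_REGS support, which is exactly the check __register_ftrace_function() performs earlier in this file.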
@@ -4381,7 +4417,7 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf,
4381 if (strlen(tmp) == 0) 4417 if (strlen(tmp) == 0)
4382 return 1; 4418 return 1;
4383 4419
4384 ret = strict_strtol(tmp, 10, &val); 4420 ret = kstrtol(tmp, 10, &val);
4385 if (ret < 0) 4421 if (ret < 0)
4386 return ret; 4422 return ret;
4387 4423
@@ -4403,7 +4439,7 @@ static const struct file_operations ftrace_pid_fops = {
4403 .open = ftrace_pid_open, 4439 .open = ftrace_pid_open,
4404 .write = ftrace_pid_write, 4440 .write = ftrace_pid_write,
4405 .read = seq_read, 4441 .read = seq_read,
4406 .llseek = seq_lseek, 4442 .llseek = ftrace_filter_lseek,
4407 .release = ftrace_pid_release, 4443 .release = ftrace_pid_release,
4408}; 4444};
4409 4445
@@ -4519,12 +4555,8 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
4519 ftrace_startup_sysctl(); 4555 ftrace_startup_sysctl();
4520 4556
4521 /* we are starting ftrace again */ 4557 /* we are starting ftrace again */
4522 if (ftrace_ops_list != &ftrace_list_end) { 4558 if (ftrace_ops_list != &ftrace_list_end)
4523 if (ftrace_ops_list->next == &ftrace_list_end) 4559 update_ftrace_function();
4524 ftrace_trace_function = ftrace_ops_list->func;
4525 else
4526 ftrace_trace_function = ftrace_ops_list_func;
4527 }
4528 4560
4529 } else { 4561 } else {
4530 /* stopping ftrace calls (just send to ftrace_stub) */ 4562 /* stopping ftrace calls (just send to ftrace_stub) */
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
index f55fcf61b223..1c71382b283d 100644
--- a/kernel/trace/power-traces.c
+++ b/kernel/trace/power-traces.c
@@ -13,8 +13,5 @@
13#define CREATE_TRACE_POINTS 13#define CREATE_TRACE_POINTS
14#include <trace/events/power.h> 14#include <trace/events/power.h>
15 15
16#ifdef EVENT_POWER_TRACING_DEPRECATED
17EXPORT_TRACEPOINT_SYMBOL_GPL(power_start);
18#endif
19EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); 16EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle);
20 17
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index b979426d16c6..6989df2ba194 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -3,8 +3,10 @@
3 * 3 *
4 * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com> 4 * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
5 */ 5 */
6#include <linux/ftrace_event.h>
6#include <linux/ring_buffer.h> 7#include <linux/ring_buffer.h>
7#include <linux/trace_clock.h> 8#include <linux/trace_clock.h>
9#include <linux/trace_seq.h>
8#include <linux/spinlock.h> 10#include <linux/spinlock.h>
9#include <linux/debugfs.h> 11#include <linux/debugfs.h>
10#include <linux/uaccess.h> 12#include <linux/uaccess.h>
@@ -21,7 +23,6 @@
21#include <linux/fs.h> 23#include <linux/fs.h>
22 24
23#include <asm/local.h> 25#include <asm/local.h>
24#include "trace.h"
25 26
26static void update_pages_handler(struct work_struct *work); 27static void update_pages_handler(struct work_struct *work);
27 28
@@ -177,7 +178,7 @@ void tracing_off_permanent(void)
177#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) 178#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
178#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */ 179#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */
179 180
180#if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) 181#ifndef CONFIG_HAVE_64BIT_ALIGNED_ACCESS
181# define RB_FORCE_8BYTE_ALIGNMENT 0 182# define RB_FORCE_8BYTE_ALIGNMENT 0
182# define RB_ARCH_ALIGNMENT RB_ALIGNMENT 183# define RB_ARCH_ALIGNMENT RB_ALIGNMENT
183#else 184#else
@@ -185,6 +186,8 @@ void tracing_off_permanent(void)
185# define RB_ARCH_ALIGNMENT 8U 186# define RB_ARCH_ALIGNMENT 8U
186#endif 187#endif
187 188
189#define RB_ALIGN_DATA __aligned(RB_ARCH_ALIGNMENT)
190
188/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ 191/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */
189#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX 192#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX
190 193
@@ -333,7 +336,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
333struct buffer_data_page { 336struct buffer_data_page {
334 u64 time_stamp; /* page time stamp */ 337 u64 time_stamp; /* page time stamp */
335 local_t commit; /* write committed index */ 338 local_t commit; /* write committed index */
336 unsigned char data[]; /* data of buffer page */ 339 unsigned char data[] RB_ALIGN_DATA; /* data of buffer page */
337}; 340};
338 341
339/* 342/*
@@ -460,9 +463,10 @@ struct ring_buffer_per_cpu {
460 unsigned long lost_events; 463 unsigned long lost_events;
461 unsigned long last_overrun; 464 unsigned long last_overrun;
462 local_t entries_bytes; 465 local_t entries_bytes;
463 local_t commit_overrun;
464 local_t overrun;
465 local_t entries; 466 local_t entries;
467 local_t overrun;
468 local_t commit_overrun;
469 local_t dropped_events;
466 local_t committing; 470 local_t committing;
467 local_t commits; 471 local_t commits;
468 unsigned long read; 472 unsigned long read;
@@ -1396,6 +1400,8 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer)
1396 struct list_head *head_page_with_bit; 1400 struct list_head *head_page_with_bit;
1397 1401
1398 head_page = &rb_set_head_page(cpu_buffer)->list; 1402 head_page = &rb_set_head_page(cpu_buffer)->list;
1403 if (!head_page)
1404 break;
1399 prev_page = head_page->prev; 1405 prev_page = head_page->prev;
1400 1406
1401 first_page = pages->next; 1407 first_page = pages->next;
@@ -1820,7 +1826,7 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
1820} 1826}
1821 1827
1822/** 1828/**
1823 * ring_buffer_update_event - update event type and data 1829 * rb_update_event - update event type and data
1824 * @event: the even to update 1830 * @event: the even to update
1825 * @type: the type of event 1831 * @type: the type of event
1826 * @length: the size of the event field in the ring buffer 1832 * @length: the size of the event field in the ring buffer
@@ -2155,8 +2161,10 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
2155 * If we are not in overwrite mode, 2161 * If we are not in overwrite mode,
2156 * this is easy, just stop here. 2162 * this is easy, just stop here.
2157 */ 2163 */
2158 if (!(buffer->flags & RB_FL_OVERWRITE)) 2164 if (!(buffer->flags & RB_FL_OVERWRITE)) {
2165 local_inc(&cpu_buffer->dropped_events);
2159 goto out_reset; 2166 goto out_reset;
2167 }
2160 2168
2161 ret = rb_handle_head_page(cpu_buffer, 2169 ret = rb_handle_head_page(cpu_buffer,
2162 tail_page, 2170 tail_page,
@@ -2427,41 +2435,76 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2427 2435
2428#ifdef CONFIG_TRACING 2436#ifdef CONFIG_TRACING
2429 2437
2430#define TRACE_RECURSIVE_DEPTH 16 2438/*
2439 * The lock and unlock are done within a preempt disable section.
2440 * The current_context per_cpu variable can only be modified
2441 * by the current task between lock and unlock. But it can
2442 * be modified more than once via an interrupt. To pass this
2443 * information from the lock to the unlock without having to
2444 * access the 'in_interrupt()' functions again (which do show
2445 * a bit of overhead in something as critical as function tracing,
2446 * we use a bitmask trick.
2447 *
2448 * bit 0 = NMI context
2449 * bit 1 = IRQ context
2450 * bit 2 = SoftIRQ context
2451 * bit 3 = normal context.
2452 *
2453 * This works because this is the order of contexts that can
2454 * preempt other contexts. A SoftIRQ never preempts an IRQ
2455 * context.
2456 *
2457 * When the context is determined, the corresponding bit is
2458 * checked and set (if it was set, then a recursion of that context
2459 * happened).
2460 *
2461 * On unlock, we need to clear this bit. To do so, just subtract
2462 * 1 from the current_context and AND it to itself.
2463 *
2464 * (binary)
2465 * 101 - 1 = 100
2466 * 101 & 100 = 100 (clearing bit zero)
2467 *
2468 * 1010 - 1 = 1001
2469 * 1010 & 1001 = 1000 (clearing bit 1)
2470 *
2471 * The least significant bit can be cleared this way, and it
2472 * just so happens that it is the same bit corresponding to
2473 * the current context.
2474 */
2475static DEFINE_PER_CPU(unsigned int, current_context);
2431 2476
2432/* Keep this code out of the fast path cache */ 2477static __always_inline int trace_recursive_lock(void)
2433static noinline void trace_recursive_fail(void)
2434{ 2478{
2435 /* Disable all tracing before we do anything else */ 2479 unsigned int val = this_cpu_read(current_context);
2436 tracing_off_permanent(); 2480 int bit;
2437 2481
2438 printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:" 2482 if (in_interrupt()) {
2439 "HC[%lu]:SC[%lu]:NMI[%lu]\n", 2483 if (in_nmi())
2440 trace_recursion_buffer(), 2484 bit = 0;
2441 hardirq_count() >> HARDIRQ_SHIFT, 2485 else if (in_irq())
2442 softirq_count() >> SOFTIRQ_SHIFT, 2486 bit = 1;
2443 in_nmi()); 2487 else
2444 2488 bit = 2;
2445 WARN_ON_ONCE(1); 2489 } else
2446} 2490 bit = 3;
2447
2448static inline int trace_recursive_lock(void)
2449{
2450 trace_recursion_inc();
2451 2491
2452 if (likely(trace_recursion_buffer() < TRACE_RECURSIVE_DEPTH)) 2492 if (unlikely(val & (1 << bit)))
2453 return 0; 2493 return 1;
2454 2494
2455 trace_recursive_fail(); 2495 val |= (1 << bit);
2496 this_cpu_write(current_context, val);
2456 2497
2457 return -1; 2498 return 0;
2458} 2499}
2459 2500
2460static inline void trace_recursive_unlock(void) 2501static __always_inline void trace_recursive_unlock(void)
2461{ 2502{
2462 WARN_ON_ONCE(!trace_recursion_buffer()); 2503 unsigned int val = this_cpu_read(current_context);
2463 2504
2464 trace_recursion_dec(); 2505 val--;
2506 val &= this_cpu_read(current_context);
2507 this_cpu_write(current_context, val);
2465} 2508}
2466 2509
2467#else 2510#else
@@ -2720,8 +2763,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_discard_commit);
2720 * and not the length of the event which would hold the header. 2763 * and not the length of the event which would hold the header.
2721 */ 2764 */
2722int ring_buffer_write(struct ring_buffer *buffer, 2765int ring_buffer_write(struct ring_buffer *buffer,
2723 unsigned long length, 2766 unsigned long length,
2724 void *data) 2767 void *data)
2725{ 2768{
2726 struct ring_buffer_per_cpu *cpu_buffer; 2769 struct ring_buffer_per_cpu *cpu_buffer;
2727 struct ring_buffer_event *event; 2770 struct ring_buffer_event *event;
@@ -2929,12 +2972,12 @@ rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer)
2929 * @buffer: The ring buffer 2972 * @buffer: The ring buffer
2930 * @cpu: The per CPU buffer to read from. 2973 * @cpu: The per CPU buffer to read from.
2931 */ 2974 */
2932unsigned long ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu) 2975u64 ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu)
2933{ 2976{
2934 unsigned long flags; 2977 unsigned long flags;
2935 struct ring_buffer_per_cpu *cpu_buffer; 2978 struct ring_buffer_per_cpu *cpu_buffer;
2936 struct buffer_page *bpage; 2979 struct buffer_page *bpage;
2937 unsigned long ret; 2980 u64 ret = 0;
2938 2981
2939 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 2982 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2940 return 0; 2983 return 0;
@@ -2949,7 +2992,8 @@ unsigned long ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu)
2949 bpage = cpu_buffer->reader_page; 2992 bpage = cpu_buffer->reader_page;
2950 else 2993 else
2951 bpage = rb_set_head_page(cpu_buffer); 2994 bpage = rb_set_head_page(cpu_buffer);
2952 ret = bpage->page->time_stamp; 2995 if (bpage)
2996 ret = bpage->page->time_stamp;
2953 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 2997 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2954 2998
2955 return ret; 2999 return ret;
@@ -2995,7 +3039,8 @@ unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
2995EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu); 3039EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu);
2996 3040
2997/** 3041/**
2998 * ring_buffer_overrun_cpu - get the number of overruns in a cpu_buffer 3042 * ring_buffer_overrun_cpu - get the number of overruns caused by the ring
3043 * buffer wrapping around (only if RB_FL_OVERWRITE is on).
2999 * @buffer: The ring buffer 3044 * @buffer: The ring buffer
3000 * @cpu: The per CPU buffer to get the number of overruns from 3045 * @cpu: The per CPU buffer to get the number of overruns from
3001 */ 3046 */
@@ -3015,7 +3060,9 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
3015EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu); 3060EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu);
3016 3061
3017/** 3062/**
3018 * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits 3063 * ring_buffer_commit_overrun_cpu - get the number of overruns caused by
3064 * commits failing due to the buffer wrapping around while there are uncommitted
3065 * events, such as during an interrupt storm.
3019 * @buffer: The ring buffer 3066 * @buffer: The ring buffer
3020 * @cpu: The per CPU buffer to get the number of overruns from 3067 * @cpu: The per CPU buffer to get the number of overruns from
3021 */ 3068 */
@@ -3036,6 +3083,46 @@ ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu)
3036EXPORT_SYMBOL_GPL(ring_buffer_commit_overrun_cpu); 3083EXPORT_SYMBOL_GPL(ring_buffer_commit_overrun_cpu);
3037 3084
3038/** 3085/**
3086 * ring_buffer_dropped_events_cpu - get the number of dropped events caused by
3087 * the ring buffer filling up (only if RB_FL_OVERWRITE is off).
3088 * @buffer: The ring buffer
3089 * @cpu: The per CPU buffer to get the number of overruns from
3090 */
3091unsigned long
3092ring_buffer_dropped_events_cpu(struct ring_buffer *buffer, int cpu)
3093{
3094 struct ring_buffer_per_cpu *cpu_buffer;
3095 unsigned long ret;
3096
3097 if (!cpumask_test_cpu(cpu, buffer->cpumask))
3098 return 0;
3099
3100 cpu_buffer = buffer->buffers[cpu];
3101 ret = local_read(&cpu_buffer->dropped_events);
3102
3103 return ret;
3104}
3105EXPORT_SYMBOL_GPL(ring_buffer_dropped_events_cpu);
3106
3107/**
3108 * ring_buffer_read_events_cpu - get the number of events successfully read
3109 * @buffer: The ring buffer
3110 * @cpu: The per CPU buffer to get the number of events read
3111 */
3112unsigned long
3113ring_buffer_read_events_cpu(struct ring_buffer *buffer, int cpu)
3114{
3115 struct ring_buffer_per_cpu *cpu_buffer;
3116
3117 if (!cpumask_test_cpu(cpu, buffer->cpumask))
3118 return 0;
3119
3120 cpu_buffer = buffer->buffers[cpu];
3121 return cpu_buffer->read;
3122}
3123EXPORT_SYMBOL_GPL(ring_buffer_read_events_cpu);
3124
3125/**
3039 * ring_buffer_entries - get the number of entries in a buffer 3126 * ring_buffer_entries - get the number of entries in a buffer
3040 * @buffer: The ring buffer 3127 * @buffer: The ring buffer
3041 * 3128 *
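
ring_buffer_dropped_events_cpu() and ring_buffer_read_events_cpu() above add two more per-CPU counters alongside the existing entries/overrun ones, which the per-CPU stats output can then report. A hypothetical consumer (the function name is invented, only exported ring-buffer accessors are used) would gather them like this:

static void report_cpu_stats(struct ring_buffer *buffer, int cpu)
{
        pr_info("cpu%d: entries=%lu overrun=%lu commit_overrun=%lu dropped=%lu read=%lu\n",
                cpu,
                ring_buffer_entries_cpu(buffer, cpu),
                ring_buffer_overrun_cpu(buffer, cpu),
                ring_buffer_commit_overrun_cpu(buffer, cpu),
                ring_buffer_dropped_events_cpu(buffer, cpu),
                ring_buffer_read_events_cpu(buffer, cpu));
}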
@@ -3260,6 +3347,8 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
3260 * Splice the empty reader page into the list around the head. 3347 * Splice the empty reader page into the list around the head.
3261 */ 3348 */
3262 reader = rb_set_head_page(cpu_buffer); 3349 reader = rb_set_head_page(cpu_buffer);
3350 if (!reader)
3351 goto out;
3263 cpu_buffer->reader_page->list.next = rb_list_head(reader->list.next); 3352 cpu_buffer->reader_page->list.next = rb_list_head(reader->list.next);
3264 cpu_buffer->reader_page->list.prev = reader->list.prev; 3353 cpu_buffer->reader_page->list.prev = reader->list.prev;
3265 3354
@@ -3392,7 +3481,7 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
3392 /* check for end of page padding */ 3481 /* check for end of page padding */
3393 if ((iter->head >= rb_page_size(iter->head_page)) && 3482 if ((iter->head >= rb_page_size(iter->head_page)) &&
3394 (iter->head_page != cpu_buffer->commit_page)) 3483 (iter->head_page != cpu_buffer->commit_page))
3395 rb_advance_iter(iter); 3484 rb_inc_iter(iter);
3396} 3485}
3397 3486
3398static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer) 3487static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer)
@@ -3778,12 +3867,17 @@ void
3778ring_buffer_read_finish(struct ring_buffer_iter *iter) 3867ring_buffer_read_finish(struct ring_buffer_iter *iter)
3779{ 3868{
3780 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; 3869 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
3870 unsigned long flags;
3781 3871
3782 /* 3872 /*
3783 * Ring buffer is disabled from recording, here's a good place 3873 * Ring buffer is disabled from recording, here's a good place
3784 * to check the integrity of the ring buffer. 3874 * to check the integrity of the ring buffer.
3875 * Must prevent readers from trying to read, as the check
3876 * clears the HEAD page and readers require it.
3785 */ 3877 */
3878 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3786 rb_check_pages(cpu_buffer); 3879 rb_check_pages(cpu_buffer);
3880 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3787 3881
3788 atomic_dec(&cpu_buffer->record_disabled); 3882 atomic_dec(&cpu_buffer->record_disabled);
3789 atomic_dec(&cpu_buffer->buffer->resize_disabled); 3883 atomic_dec(&cpu_buffer->buffer->resize_disabled);
@@ -3864,9 +3958,10 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
3864 local_set(&cpu_buffer->reader_page->page->commit, 0); 3958 local_set(&cpu_buffer->reader_page->page->commit, 0);
3865 cpu_buffer->reader_page->read = 0; 3959 cpu_buffer->reader_page->read = 0;
3866 3960
3867 local_set(&cpu_buffer->commit_overrun, 0);
3868 local_set(&cpu_buffer->entries_bytes, 0); 3961 local_set(&cpu_buffer->entries_bytes, 0);
3869 local_set(&cpu_buffer->overrun, 0); 3962 local_set(&cpu_buffer->overrun, 0);
3963 local_set(&cpu_buffer->commit_overrun, 0);
3964 local_set(&cpu_buffer->dropped_events, 0);
3870 local_set(&cpu_buffer->entries, 0); 3965 local_set(&cpu_buffer->entries, 0);
3871 local_set(&cpu_buffer->committing, 0); 3966 local_set(&cpu_buffer->committing, 0);
3872 local_set(&cpu_buffer->commits, 0); 3967 local_set(&cpu_buffer->commits, 0);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 31e4f55773f1..66338c4f7f4b 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -9,7 +9,7 @@
9 * 9 *
10 * Based on code from the latency_tracer, that is: 10 * Based on code from the latency_tracer, that is:
11 * Copyright (C) 2004-2006 Ingo Molnar 11 * Copyright (C) 2004-2006 Ingo Molnar
12 * Copyright (C) 2004 William Lee Irwin III 12 * Copyright (C) 2004 Nadia Yvette Chambers
13 */ 13 */
14#include <linux/ring_buffer.h> 14#include <linux/ring_buffer.h>
15#include <generated/utsrelease.h> 15#include <generated/utsrelease.h>
@@ -19,6 +19,7 @@
19#include <linux/seq_file.h> 19#include <linux/seq_file.h>
20#include <linux/notifier.h> 20#include <linux/notifier.h>
21#include <linux/irqflags.h> 21#include <linux/irqflags.h>
22#include <linux/irq_work.h>
22#include <linux/debugfs.h> 23#include <linux/debugfs.h>
23#include <linux/pagemap.h> 24#include <linux/pagemap.h>
24#include <linux/hardirq.h> 25#include <linux/hardirq.h>
@@ -38,6 +39,7 @@
38#include <linux/poll.h> 39#include <linux/poll.h>
39#include <linux/nmi.h> 40#include <linux/nmi.h>
40#include <linux/fs.h> 41#include <linux/fs.h>
42#include <linux/sched/rt.h>
41 43
42#include "trace.h" 44#include "trace.h"
43#include "trace_output.h" 45#include "trace_output.h"
@@ -78,6 +80,21 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set)
78} 80}
79 81
80/* 82/*
83 * To prevent the comm cache from being overwritten when no
84 * tracing is active, only save the comm when a trace event
85 * occurred.
86 */
87static DEFINE_PER_CPU(bool, trace_cmdline_save);
88
89/*
90 * When a reader is waiting for data, then this variable is
91 * set to true.
92 */
93static bool trace_wakeup_needed;
94
95static struct irq_work trace_work_wakeup;
96
97/*
81 * Kill all tracing for good (never come back). 98 * Kill all tracing for good (never come back).
82 * It is initialized to 1 but will turn to zero if the initialization 99 * It is initialized to 1 but will turn to zero if the initialization
83 * of the tracer is successful. But that is the only place that sets 100 * of the tracer is successful. But that is the only place that sets
@@ -115,7 +132,7 @@ static char *default_bootup_tracer;
115 132
116static int __init set_cmdline_ftrace(char *str) 133static int __init set_cmdline_ftrace(char *str)
117{ 134{
118 strncpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); 135 strlcpy(bootup_tracer_buf, str, MAX_TRACER_SIZE);
119 default_bootup_tracer = bootup_tracer_buf; 136 default_bootup_tracer = bootup_tracer_buf;
120 /* We are using ftrace early, expand it */ 137 /* We are using ftrace early, expand it */
121 ring_buffer_expanded = 1; 138 ring_buffer_expanded = 1;
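The strncpy() to strlcpy() change matters because strncpy() leaves the destination without a terminating NUL whenever the source is at least as long as the buffer, whereas strlcpy() always terminates and truncates. A tiny illustration with a hypothetical 8-byte buffer:

    #include <linux/string.h>

    static void copy_example(void)
    {
            char dst[8];

            /* strncpy(): no trailing '\0' here, since the source is longer */
            strncpy(dst, "function_graph", sizeof(dst));

            /* strlcpy(): always terminated, dst ends up as "functio" */
            strlcpy(dst, "function_graph", sizeof(dst));
    }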
@@ -139,6 +156,18 @@ static int __init set_ftrace_dump_on_oops(char *str)
139} 156}
140__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); 157__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
141 158
159
160static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata;
161static char *trace_boot_options __initdata;
162
163static int __init set_trace_boot_options(char *str)
164{
165 strlcpy(trace_boot_options_buf, str, MAX_TRACER_SIZE);
166 trace_boot_options = trace_boot_options_buf;
167 return 0;
168}
169__setup("trace_options=", set_trace_boot_options);
170
142unsigned long long ns2usecs(cycle_t nsec) 171unsigned long long ns2usecs(cycle_t nsec)
143{ 172{
144 nsec += 500; 173 nsec += 500;
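The new trace_options= boot parameter accepts the same strings that can be written to the trace_options debugfs file created later in this file, e.g. booting with trace_options=stacktrace (any entry of trace_options[], optionally prefixed with "no", or a tracer-specific option should work). The run-time equivalent from userspace is a plain write; a minimal sketch, assuming debugfs is mounted at /sys/kernel/debug:

    #include <fcntl.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
            const char *opt = "stacktrace";   /* prefix with "no" to clear it */
            int fd = open("/sys/kernel/debug/tracing/trace_options", O_WRONLY);

            if (fd < 0)
                    return 1;
            if (write(fd, opt, strlen(opt)) < 0) {
                    close(fd);
                    return 1;
            }
            return close(fd) ? 1 : 0;
    }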
@@ -198,20 +227,9 @@ static struct trace_array max_tr;
198 227
199static DEFINE_PER_CPU(struct trace_array_cpu, max_tr_data); 228static DEFINE_PER_CPU(struct trace_array_cpu, max_tr_data);
200 229
201/* tracer_enabled is used to toggle activation of a tracer */
202static int tracer_enabled = 1;
203
204/**
205 * tracing_is_enabled - return tracer_enabled status
206 *
207 * This function is used by other tracers to know the status
208 * of the tracer_enabled flag. Tracers may use this function
209 * to know if it should enable their features when starting
210 * up. See irqsoff tracer for an example (start_irqsoff_tracer).
211 */
212int tracing_is_enabled(void) 230int tracing_is_enabled(void)
213{ 231{
214 return tracer_enabled; 232 return tracing_is_on();
215} 233}
216 234
217/* 235/*
@@ -232,7 +250,7 @@ static unsigned long trace_buf_size = TRACE_BUF_SIZE_DEFAULT;
232static struct tracer *trace_types __read_mostly; 250static struct tracer *trace_types __read_mostly;
233 251
234/* current_trace points to the tracer that is currently active */ 252/* current_trace points to the tracer that is currently active */
235static struct tracer *current_trace __read_mostly; 253static struct tracer *current_trace __read_mostly = &nop_trace;
236 254
237/* 255/*
238 * trace_types_lock is used to protect the trace_types list. 256 * trace_types_lock is used to protect the trace_types list.
@@ -333,12 +351,18 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
333static int trace_stop_count; 351static int trace_stop_count;
334static DEFINE_RAW_SPINLOCK(tracing_start_lock); 352static DEFINE_RAW_SPINLOCK(tracing_start_lock);
335 353
336static void wakeup_work_handler(struct work_struct *work) 354/**
355 * trace_wake_up - wake up tasks waiting for trace input
356 *
357 * Runs as an irq_work callback to wake up any task that is blocked on the
358 * trace_wait queue. This is used with trace_poll for tasks polling the
359 * trace.
360 */
361static void trace_wake_up(struct irq_work *work)
337{ 362{
338 wake_up(&trace_wait); 363 wake_up_all(&trace_wait);
339}
340 364
341static DECLARE_DELAYED_WORK(wakeup_work, wakeup_work_handler); 365}
342 366
343/** 367/**
344 * tracing_on - enable tracing buffers 368 * tracing_on - enable tracing buffers
@@ -393,22 +417,6 @@ int tracing_is_on(void)
393} 417}
394EXPORT_SYMBOL_GPL(tracing_is_on); 418EXPORT_SYMBOL_GPL(tracing_is_on);
395 419
396/**
397 * trace_wake_up - wake up tasks waiting for trace input
398 *
399 * Schedules a delayed work to wake up any task that is blocked on the
400 * trace_wait queue. These is used with trace_poll for tasks polling the
401 * trace.
402 */
403void trace_wake_up(void)
404{
405 const unsigned long delay = msecs_to_jiffies(2);
406
407 if (trace_flags & TRACE_ITER_BLOCK)
408 return;
409 schedule_delayed_work(&wakeup_work, delay);
410}
411
412static int __init set_buf_size(char *str) 420static int __init set_buf_size(char *str)
413{ 421{
414 unsigned long buf_size; 422 unsigned long buf_size;
@@ -431,7 +439,7 @@ static int __init set_tracing_thresh(char *str)
431 439
432 if (!str) 440 if (!str)
433 return 0; 441 return 0;
434 ret = strict_strtoul(str, 0, &threshold); 442 ret = kstrtoul(str, 0, &threshold);
435 if (ret < 0) 443 if (ret < 0)
436 return 0; 444 return 0;
437 tracing_thresh = threshold * 1000; 445 tracing_thresh = threshold * 1000;
@@ -477,10 +485,12 @@ static const char *trace_options[] = {
477static struct { 485static struct {
478 u64 (*func)(void); 486 u64 (*func)(void);
479 const char *name; 487 const char *name;
488 int in_ns; /* is this clock in nanoseconds? */
480} trace_clocks[] = { 489} trace_clocks[] = {
481 { trace_clock_local, "local" }, 490 { trace_clock_local, "local", 1 },
482 { trace_clock_global, "global" }, 491 { trace_clock_global, "global", 1 },
483 { trace_clock_counter, "counter" }, 492 { trace_clock_counter, "counter", 0 },
493 ARCH_TRACE_CLOCKS
484}; 494};
485 495
486int trace_clock_id; 496int trace_clock_id;
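The new in_ns field is what later hunks consult to decide whether a timestamp can be converted with ns2usecs() or has to be printed as a raw counter value (the counter clock above, and any TSC-style clock an architecture appends through ARCH_TRACE_CLOCKS, set it to 0). Condensed from the tracing_stats_read() change further down, the consumer side looks roughly like this (print_ts() is an illustrative wrapper inside trace.c, not a function in the patch):

    /* condensed from the tracing_stats_read() change further down */
    static void print_ts(struct trace_seq *s, u64 ts)
    {
            if (trace_clocks[trace_clock_id].in_ns) {
                    unsigned long usec_rem;

                    ts = ns2usecs(ts);
                    usec_rem = do_div(ts, USEC_PER_SEC);
                    trace_seq_printf(s, "ts: %5llu.%06lu\n", ts, usec_rem);
            } else {
                    /* counter/TSC style clocks: print the raw value */
                    trace_seq_printf(s, "ts: %llu\n", ts);
            }
    }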
@@ -694,18 +704,22 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
694void 704void
695update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) 705update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
696{ 706{
697 struct ring_buffer *buf = tr->buffer; 707 struct ring_buffer *buf;
698 708
699 if (trace_stop_count) 709 if (trace_stop_count)
700 return; 710 return;
701 711
702 WARN_ON_ONCE(!irqs_disabled()); 712 WARN_ON_ONCE(!irqs_disabled());
703 if (!current_trace->use_max_tr) { 713
704 WARN_ON_ONCE(1); 714 if (!current_trace->allocated_snapshot) {
715 /* Only the nop tracer should hit this when disabling */
716 WARN_ON_ONCE(current_trace != &nop_trace);
705 return; 717 return;
706 } 718 }
719
707 arch_spin_lock(&ftrace_max_lock); 720 arch_spin_lock(&ftrace_max_lock);
708 721
722 buf = tr->buffer;
709 tr->buffer = max_tr.buffer; 723 tr->buffer = max_tr.buffer;
710 max_tr.buffer = buf; 724 max_tr.buffer = buf;
711 725
@@ -730,8 +744,9 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
730 return; 744 return;
731 745
732 WARN_ON_ONCE(!irqs_disabled()); 746 WARN_ON_ONCE(!irqs_disabled());
733 if (!current_trace->use_max_tr) { 747 if (!current_trace->allocated_snapshot) {
734 WARN_ON_ONCE(1); 748 /* Only the nop tracer should hit this when disabling */
749 WARN_ON_ONCE(current_trace != &nop_trace);
735 return; 750 return;
736 } 751 }
737 752
@@ -757,6 +772,40 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
757} 772}
758#endif /* CONFIG_TRACER_MAX_TRACE */ 773#endif /* CONFIG_TRACER_MAX_TRACE */
759 774
775static void default_wait_pipe(struct trace_iterator *iter)
776{
777 DEFINE_WAIT(wait);
778
779 prepare_to_wait(&trace_wait, &wait, TASK_INTERRUPTIBLE);
780
781 /*
782 * The events can happen in critical sections where
783 * checking a work queue can cause deadlocks.
784 * After adding a task to the queue, this flag is set
785 * only to notify events to try to wake up the queue
786 * using irq_work.
787 *
788 * We don't clear it even if the buffer is no longer
789 * empty. The flag only causes the next event to run
790 * irq_work to do the work queue wake up. The worst
791 * that can happen if we race with !trace_empty() is that
792 * an event will cause an irq_work to try to wake up
793 * an empty queue.
794 *
795 * There's no reason to protect this flag either, as
796 * the work queue and irq_work logic will do the necessary
797 * synchronization for the wake ups. The only thing
798 * that is necessary is that the wake up happens after
799 * a task has been queued. It's OK for spurious wake ups.
800 */
801 trace_wakeup_needed = true;
802
803 if (trace_empty(iter))
804 schedule();
805
806 finish_wait(&trace_wait, &wait);
807}
808
760/** 809/**
761 * register_tracer - register a tracer with the ftrace system. 810 * register_tracer - register a tracer with the ftrace system.
762 * @type - the plugin for the tracer 811 * @type - the plugin for the tracer
@@ -819,10 +868,13 @@ int register_tracer(struct tracer *type)
819 868
820 current_trace = type; 869 current_trace = type;
821 870
822 /* If we expanded the buffers, make sure the max is expanded too */ 871 if (type->use_max_tr) {
823 if (ring_buffer_expanded && type->use_max_tr) 872 /* If we expanded the buffers, make sure the max is expanded too */
824 ring_buffer_resize(max_tr.buffer, trace_buf_size, 873 if (ring_buffer_expanded)
825 RING_BUFFER_ALL_CPUS); 874 ring_buffer_resize(max_tr.buffer, trace_buf_size,
875 RING_BUFFER_ALL_CPUS);
876 type->allocated_snapshot = true;
877 }
826 878
827 /* the test is responsible for initializing and enabling */ 879 /* the test is responsible for initializing and enabling */
828 pr_info("Testing tracer %s: ", type->name); 880 pr_info("Testing tracer %s: ", type->name);
@@ -838,10 +890,14 @@ int register_tracer(struct tracer *type)
838 /* Only reset on passing, to avoid touching corrupted buffers */ 890 /* Only reset on passing, to avoid touching corrupted buffers */
839 tracing_reset_online_cpus(tr); 891 tracing_reset_online_cpus(tr);
840 892
841 /* Shrink the max buffer again */ 893 if (type->use_max_tr) {
842 if (ring_buffer_expanded && type->use_max_tr) 894 type->allocated_snapshot = false;
843 ring_buffer_resize(max_tr.buffer, 1, 895
844 RING_BUFFER_ALL_CPUS); 896 /* Shrink the max buffer again */
897 if (ring_buffer_expanded)
898 ring_buffer_resize(max_tr.buffer, 1,
899 RING_BUFFER_ALL_CPUS);
900 }
845 901
846 printk(KERN_CONT "PASSED\n"); 902 printk(KERN_CONT "PASSED\n");
847 } 903 }
@@ -875,36 +931,13 @@ int register_tracer(struct tracer *type)
875 return ret; 931 return ret;
876} 932}
877 933
878void unregister_tracer(struct tracer *type)
879{
880 struct tracer **t;
881
882 mutex_lock(&trace_types_lock);
883 for (t = &trace_types; *t; t = &(*t)->next) {
884 if (*t == type)
885 goto found;
886 }
887 pr_info("Tracer %s not registered\n", type->name);
888 goto out;
889
890 found:
891 *t = (*t)->next;
892
893 if (type == current_trace && tracer_enabled) {
894 tracer_enabled = 0;
895 tracing_stop();
896 if (current_trace->stop)
897 current_trace->stop(&global_trace);
898 current_trace = &nop_trace;
899 }
900out:
901 mutex_unlock(&trace_types_lock);
902}
903
904void tracing_reset(struct trace_array *tr, int cpu) 934void tracing_reset(struct trace_array *tr, int cpu)
905{ 935{
906 struct ring_buffer *buffer = tr->buffer; 936 struct ring_buffer *buffer = tr->buffer;
907 937
938 if (!buffer)
939 return;
940
908 ring_buffer_record_disable(buffer); 941 ring_buffer_record_disable(buffer);
909 942
910 /* Make sure all commits have finished */ 943 /* Make sure all commits have finished */
@@ -919,6 +952,9 @@ void tracing_reset_online_cpus(struct trace_array *tr)
919 struct ring_buffer *buffer = tr->buffer; 952 struct ring_buffer *buffer = tr->buffer;
920 int cpu; 953 int cpu;
921 954
955 if (!buffer)
956 return;
957
922 ring_buffer_record_disable(buffer); 958 ring_buffer_record_disable(buffer);
923 959
924 /* Make sure all commits have finished */ 960 /* Make sure all commits have finished */
@@ -1131,10 +1167,14 @@ void trace_find_cmdline(int pid, char comm[])
1131 1167
1132void tracing_record_cmdline(struct task_struct *tsk) 1168void tracing_record_cmdline(struct task_struct *tsk)
1133{ 1169{
1134 if (atomic_read(&trace_record_cmdline_disabled) || !tracer_enabled || 1170 if (atomic_read(&trace_record_cmdline_disabled) || !tracing_is_on())
1135 !tracing_is_on())
1136 return; 1171 return;
1137 1172
1173 if (!__this_cpu_read(trace_cmdline_save))
1174 return;
1175
1176 __this_cpu_write(trace_cmdline_save, false);
1177
1138 trace_save_cmdline(tsk); 1178 trace_save_cmdline(tsk);
1139} 1179}
1140 1180
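The per-CPU trace_cmdline_save flag turns comm recording into a cheap handshake between two hunks of this patch: the commit path (__buffer_unlock_commit(), added below) raises the flag on every event, and tracing_record_cmdline() above only pays for trace_save_cmdline() when it finds the flag raised, then lowers it. Condensed, the two sides of that handshake are (a restatement of code in this patch, not new API):

    /* writer/commit side (__buffer_unlock_commit(), added below), per event: */
    __this_cpu_write(trace_cmdline_save, true);

    /* sched-switch side, in tracing_record_cmdline() above: */
    if (__this_cpu_read(trace_cmdline_save)) {
            __this_cpu_write(trace_cmdline_save, false);
            trace_save_cmdline(tsk);        /* only now pay for the pid->comm save */
    }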
@@ -1146,7 +1186,6 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
1146 1186
1147 entry->preempt_count = pc & 0xff; 1187 entry->preempt_count = pc & 0xff;
1148 entry->pid = (tsk) ? tsk->pid : 0; 1188 entry->pid = (tsk) ? tsk->pid : 0;
1149 entry->padding = 0;
1150 entry->flags = 1189 entry->flags =
1151#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT 1190#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
1152 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | 1191 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
@@ -1178,27 +1217,36 @@ trace_buffer_lock_reserve(struct ring_buffer *buffer,
1178 return event; 1217 return event;
1179} 1218}
1180 1219
1220void
1221__buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event)
1222{
1223 __this_cpu_write(trace_cmdline_save, true);
1224 if (trace_wakeup_needed) {
1225 trace_wakeup_needed = false;
1226 /* irq_work_queue() supplies its own memory barriers */
1227 irq_work_queue(&trace_work_wakeup);
1228 }
1229 ring_buffer_unlock_commit(buffer, event);
1230}
1231
1181static inline void 1232static inline void
1182__trace_buffer_unlock_commit(struct ring_buffer *buffer, 1233__trace_buffer_unlock_commit(struct ring_buffer *buffer,
1183 struct ring_buffer_event *event, 1234 struct ring_buffer_event *event,
1184 unsigned long flags, int pc, 1235 unsigned long flags, int pc)
1185 int wake)
1186{ 1236{
1187 ring_buffer_unlock_commit(buffer, event); 1237 __buffer_unlock_commit(buffer, event);
1188 1238
1189 ftrace_trace_stack(buffer, flags, 6, pc); 1239 ftrace_trace_stack(buffer, flags, 6, pc);
1190 ftrace_trace_userstack(buffer, flags, pc); 1240 ftrace_trace_userstack(buffer, flags, pc);
1191
1192 if (wake)
1193 trace_wake_up();
1194} 1241}
1195 1242
1196void trace_buffer_unlock_commit(struct ring_buffer *buffer, 1243void trace_buffer_unlock_commit(struct ring_buffer *buffer,
1197 struct ring_buffer_event *event, 1244 struct ring_buffer_event *event,
1198 unsigned long flags, int pc) 1245 unsigned long flags, int pc)
1199{ 1246{
1200 __trace_buffer_unlock_commit(buffer, event, flags, pc, 1); 1247 __trace_buffer_unlock_commit(buffer, event, flags, pc);
1201} 1248}
1249EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit);
1202 1250
1203struct ring_buffer_event * 1251struct ring_buffer_event *
1204trace_current_buffer_lock_reserve(struct ring_buffer **current_rb, 1252trace_current_buffer_lock_reserve(struct ring_buffer **current_rb,
@@ -1215,29 +1263,21 @@ void trace_current_buffer_unlock_commit(struct ring_buffer *buffer,
1215 struct ring_buffer_event *event, 1263 struct ring_buffer_event *event,
1216 unsigned long flags, int pc) 1264 unsigned long flags, int pc)
1217{ 1265{
1218 __trace_buffer_unlock_commit(buffer, event, flags, pc, 1); 1266 __trace_buffer_unlock_commit(buffer, event, flags, pc);
1219} 1267}
1220EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit); 1268EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit);
1221 1269
1222void trace_nowake_buffer_unlock_commit(struct ring_buffer *buffer, 1270void trace_buffer_unlock_commit_regs(struct ring_buffer *buffer,
1223 struct ring_buffer_event *event, 1271 struct ring_buffer_event *event,
1224 unsigned long flags, int pc) 1272 unsigned long flags, int pc,
1225{ 1273 struct pt_regs *regs)
1226 __trace_buffer_unlock_commit(buffer, event, flags, pc, 0);
1227}
1228EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit);
1229
1230void trace_nowake_buffer_unlock_commit_regs(struct ring_buffer *buffer,
1231 struct ring_buffer_event *event,
1232 unsigned long flags, int pc,
1233 struct pt_regs *regs)
1234{ 1274{
1235 ring_buffer_unlock_commit(buffer, event); 1275 __buffer_unlock_commit(buffer, event);
1236 1276
1237 ftrace_trace_stack_regs(buffer, flags, 0, pc, regs); 1277 ftrace_trace_stack_regs(buffer, flags, 0, pc, regs);
1238 ftrace_trace_userstack(buffer, flags, pc); 1278 ftrace_trace_userstack(buffer, flags, pc);
1239} 1279}
1240EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit_regs); 1280EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit_regs);
1241 1281
1242void trace_current_buffer_discard_commit(struct ring_buffer *buffer, 1282void trace_current_buffer_discard_commit(struct ring_buffer *buffer,
1243 struct ring_buffer_event *event) 1283 struct ring_buffer_event *event)
@@ -1269,7 +1309,7 @@ trace_function(struct trace_array *tr,
1269 entry->parent_ip = parent_ip; 1309 entry->parent_ip = parent_ip;
1270 1310
1271 if (!filter_check_discard(call, entry, buffer, event)) 1311 if (!filter_check_discard(call, entry, buffer, event))
1272 ring_buffer_unlock_commit(buffer, event); 1312 __buffer_unlock_commit(buffer, event);
1273} 1313}
1274 1314
1275void 1315void
@@ -1313,7 +1353,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
1313 */ 1353 */
1314 preempt_disable_notrace(); 1354 preempt_disable_notrace();
1315 1355
1316 use_stack = ++__get_cpu_var(ftrace_stack_reserve); 1356 use_stack = __this_cpu_inc_return(ftrace_stack_reserve);
1317 /* 1357 /*
1318 * We don't need any atomic variables, just a barrier. 1358 * We don't need any atomic variables, just a barrier.
1319 * If an interrupt comes in, we don't care, because it would 1359 * If an interrupt comes in, we don't care, because it would
@@ -1362,12 +1402,12 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
1362 entry->size = trace.nr_entries; 1402 entry->size = trace.nr_entries;
1363 1403
1364 if (!filter_check_discard(call, entry, buffer, event)) 1404 if (!filter_check_discard(call, entry, buffer, event))
1365 ring_buffer_unlock_commit(buffer, event); 1405 __buffer_unlock_commit(buffer, event);
1366 1406
1367 out: 1407 out:
1368 /* Again, don't let gcc optimize things here */ 1408 /* Again, don't let gcc optimize things here */
1369 barrier(); 1409 barrier();
1370 __get_cpu_var(ftrace_stack_reserve)--; 1410 __this_cpu_dec(ftrace_stack_reserve);
1371 preempt_enable_notrace(); 1411 preempt_enable_notrace();
1372 1412
1373} 1413}
@@ -1458,7 +1498,7 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1458 1498
1459 save_stack_trace_user(&trace); 1499 save_stack_trace_user(&trace);
1460 if (!filter_check_discard(call, entry, buffer, event)) 1500 if (!filter_check_discard(call, entry, buffer, event))
1461 ring_buffer_unlock_commit(buffer, event); 1501 __buffer_unlock_commit(buffer, event);
1462 1502
1463 out_drop_count: 1503 out_drop_count:
1464 __this_cpu_dec(user_stack_count); 1504 __this_cpu_dec(user_stack_count);
@@ -1495,7 +1535,6 @@ static struct trace_buffer_struct *trace_percpu_nmi_buffer;
1495static char *get_trace_buf(void) 1535static char *get_trace_buf(void)
1496{ 1536{
1497 struct trace_buffer_struct *percpu_buffer; 1537 struct trace_buffer_struct *percpu_buffer;
1498 struct trace_buffer_struct *buffer;
1499 1538
1500 /* 1539 /*
1501 * If we have allocated per cpu buffers, then we do not 1540 * If we have allocated per cpu buffers, then we do not
@@ -1513,9 +1552,7 @@ static char *get_trace_buf(void)
1513 if (!percpu_buffer) 1552 if (!percpu_buffer)
1514 return NULL; 1553 return NULL;
1515 1554
1516 buffer = per_cpu_ptr(percpu_buffer, smp_processor_id()); 1555 return this_cpu_ptr(&percpu_buffer->buffer[0]);
1517
1518 return buffer->buffer;
1519} 1556}
1520 1557
1521static int alloc_percpu_trace_buffer(void) 1558static int alloc_percpu_trace_buffer(void)
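The stack-reserve and get_trace_buf() hunks above replace open-coded __get_cpu_var() and per_cpu_ptr(..., smp_processor_id()) arithmetic with the this_cpu_*() accessors. A small sketch of the idiom on hypothetical per-CPU data (my_depth, my_bufs and grab_buf() are illustrative; as in the patch, callers are assumed to run with preemption disabled):

    #include <linux/percpu.h>

    struct my_buf {
            char data[128];
    };

    static DEFINE_PER_CPU(int, my_depth);
    static DEFINE_PER_CPU(struct my_buf, my_bufs);

    static char *grab_buf(void)
    {
            /* bump and read this CPU's nesting counter in one operation */
            if (__this_cpu_inc_return(my_depth) > 1)
                    return NULL;                    /* already in use on this CPU */

            return this_cpu_ptr(&my_bufs)->data;    /* this CPU's copy, no smp_processor_id() */
    }

    static void drop_buf(void)
    {
            __this_cpu_dec(my_depth);
    }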
@@ -1559,10 +1596,10 @@ static int alloc_percpu_trace_buffer(void)
1559 return -ENOMEM; 1596 return -ENOMEM;
1560} 1597}
1561 1598
1599static int buffers_allocated;
1600
1562void trace_printk_init_buffers(void) 1601void trace_printk_init_buffers(void)
1563{ 1602{
1564 static int buffers_allocated;
1565
1566 if (buffers_allocated) 1603 if (buffers_allocated)
1567 return; 1604 return;
1568 1605
@@ -1571,7 +1608,38 @@ void trace_printk_init_buffers(void)
1571 1608
1572 pr_info("ftrace: Allocated trace_printk buffers\n"); 1609 pr_info("ftrace: Allocated trace_printk buffers\n");
1573 1610
1611 /* Expand the buffers to set size */
1612 tracing_update_buffers();
1613
1574 buffers_allocated = 1; 1614 buffers_allocated = 1;
1615
1616 /*
1617 * trace_printk_init_buffers() can be called by modules.
1618 * If that happens, then we need to start cmdline recording
1619 * directly here. If the global_trace.buffer is already
1620 * allocated here, then this was called by module code.
1621 */
1622 if (global_trace.buffer)
1623 tracing_start_cmdline_record();
1624}
1625
1626void trace_printk_start_comm(void)
1627{
1628 /* Start tracing comms if trace printk is set */
1629 if (!buffers_allocated)
1630 return;
1631 tracing_start_cmdline_record();
1632}
1633
1634static void trace_printk_start_stop_comm(int enabled)
1635{
1636 if (!buffers_allocated)
1637 return;
1638
1639 if (enabled)
1640 tracing_start_cmdline_record();
1641 else
1642 tracing_stop_cmdline_record();
1575} 1643}
1576 1644
1577/** 1645/**
@@ -1622,7 +1690,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1622 1690
1623 memcpy(entry->buf, tbuffer, sizeof(u32) * len); 1691 memcpy(entry->buf, tbuffer, sizeof(u32) * len);
1624 if (!filter_check_discard(call, entry, buffer, event)) { 1692 if (!filter_check_discard(call, entry, buffer, event)) {
1625 ring_buffer_unlock_commit(buffer, event); 1693 __buffer_unlock_commit(buffer, event);
1626 ftrace_trace_stack(buffer, flags, 6, pc); 1694 ftrace_trace_stack(buffer, flags, 6, pc);
1627 } 1695 }
1628 1696
@@ -1693,7 +1761,7 @@ int trace_array_vprintk(struct trace_array *tr,
1693 memcpy(&entry->buf, tbuffer, len); 1761 memcpy(&entry->buf, tbuffer, len);
1694 entry->buf[len] = '\0'; 1762 entry->buf[len] = '\0';
1695 if (!filter_check_discard(call, entry, buffer, event)) { 1763 if (!filter_check_discard(call, entry, buffer, event)) {
1696 ring_buffer_unlock_commit(buffer, event); 1764 __buffer_unlock_commit(buffer, event);
1697 ftrace_trace_stack(buffer, flags, 6, pc); 1765 ftrace_trace_stack(buffer, flags, 6, pc);
1698 } 1766 }
1699 out: 1767 out:
@@ -1889,21 +1957,27 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu)
1889static void *s_start(struct seq_file *m, loff_t *pos) 1957static void *s_start(struct seq_file *m, loff_t *pos)
1890{ 1958{
1891 struct trace_iterator *iter = m->private; 1959 struct trace_iterator *iter = m->private;
1892 static struct tracer *old_tracer;
1893 int cpu_file = iter->cpu_file; 1960 int cpu_file = iter->cpu_file;
1894 void *p = NULL; 1961 void *p = NULL;
1895 loff_t l = 0; 1962 loff_t l = 0;
1896 int cpu; 1963 int cpu;
1897 1964
1898 /* copy the tracer to avoid using a global lock all around */ 1965 /*
1966 * copy the tracer to avoid using a global lock all around.
1967 * iter->trace is a copy of current_trace, the pointer to the
1968 * name may be used instead of a strcmp(), as iter->trace->name
1969 * will point to the same string as current_trace->name.
1970 */
1899 mutex_lock(&trace_types_lock); 1971 mutex_lock(&trace_types_lock);
1900 if (unlikely(old_tracer != current_trace && current_trace)) { 1972 if (unlikely(current_trace && iter->trace->name != current_trace->name))
1901 old_tracer = current_trace;
1902 *iter->trace = *current_trace; 1973 *iter->trace = *current_trace;
1903 }
1904 mutex_unlock(&trace_types_lock); 1974 mutex_unlock(&trace_types_lock);
1905 1975
1906 atomic_inc(&trace_record_cmdline_disabled); 1976 if (iter->snapshot && iter->trace->use_max_tr)
1977 return ERR_PTR(-EBUSY);
1978
1979 if (!iter->snapshot)
1980 atomic_inc(&trace_record_cmdline_disabled);
1907 1981
1908 if (*pos != iter->pos) { 1982 if (*pos != iter->pos) {
1909 iter->ent = NULL; 1983 iter->ent = NULL;
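The rewritten s_start() relies on iter->trace being a structure copy of *current_trace: until the tracer is switched, iter->trace->name aliases the very same string as current_trace->name, so a pointer comparison is enough and no strcmp() is needed. The same trick in miniature, on an illustrative struct:

    struct thing {
            const char *name;
    };

    static struct thing current_thing = { .name = "nop" };

    static void refresh(struct thing *snapshot)
    {
            /*
             * snapshot was taken with a structure copy, so snapshot->name
             * aliases current_thing.name; the pointers only diverge once
             * current_thing is replaced by something with a different name.
             */
            if (snapshot->name != current_thing.name)
                    *snapshot = current_thing;
    }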
@@ -1942,7 +2016,11 @@ static void s_stop(struct seq_file *m, void *p)
1942{ 2016{
1943 struct trace_iterator *iter = m->private; 2017 struct trace_iterator *iter = m->private;
1944 2018
1945 atomic_dec(&trace_record_cmdline_disabled); 2019 if (iter->snapshot && iter->trace->use_max_tr)
2020 return;
2021
2022 if (!iter->snapshot)
2023 atomic_dec(&trace_record_cmdline_disabled);
1946 trace_access_unlock(iter->cpu_file); 2024 trace_access_unlock(iter->cpu_file);
1947 trace_event_read_unlock(); 2025 trace_event_read_unlock();
1948} 2026}
@@ -2027,8 +2105,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
2027 unsigned long total; 2105 unsigned long total;
2028 const char *name = "preemption"; 2106 const char *name = "preemption";
2029 2107
2030 if (type) 2108 name = type->name;
2031 name = type->name;
2032 2109
2033 get_total_entries(tr, &total, &entries); 2110 get_total_entries(tr, &total, &entries);
2034 2111
@@ -2327,6 +2404,27 @@ static void test_ftrace_alive(struct seq_file *m)
2327 seq_printf(m, "# MAY BE MISSING FUNCTION EVENTS\n"); 2404 seq_printf(m, "# MAY BE MISSING FUNCTION EVENTS\n");
2328} 2405}
2329 2406
2407#ifdef CONFIG_TRACER_MAX_TRACE
2408static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter)
2409{
2410 if (iter->trace->allocated_snapshot)
2411 seq_printf(m, "#\n# * Snapshot is allocated *\n#\n");
2412 else
2413 seq_printf(m, "#\n# * Snapshot is freed *\n#\n");
2414
2415 seq_printf(m, "# Snapshot commands:\n");
2416 seq_printf(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n");
2417 seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n");
2418 seq_printf(m, "# Takes a snapshot of the main buffer.\n");
2419 seq_printf(m, "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate)\n");
2420 seq_printf(m, "# (Doesn't have to be '2' works with any number that\n");
2421 seq_printf(m, "# is not a '0' or '1')\n");
2422}
2423#else
2424/* Should never be called */
2425static inline void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) { }
2426#endif
2427
2330static int s_show(struct seq_file *m, void *v) 2428static int s_show(struct seq_file *m, void *v)
2331{ 2429{
2332 struct trace_iterator *iter = v; 2430 struct trace_iterator *iter = v;
@@ -2338,7 +2436,9 @@ static int s_show(struct seq_file *m, void *v)
2338 seq_puts(m, "#\n"); 2436 seq_puts(m, "#\n");
2339 test_ftrace_alive(m); 2437 test_ftrace_alive(m);
2340 } 2438 }
2341 if (iter->trace && iter->trace->print_header) 2439 if (iter->snapshot && trace_empty(iter))
2440 print_snapshot_help(m, iter);
2441 else if (iter->trace && iter->trace->print_header)
2342 iter->trace->print_header(m); 2442 iter->trace->print_header(m);
2343 else 2443 else
2344 trace_default_header(m); 2444 trace_default_header(m);
@@ -2377,7 +2477,7 @@ static const struct seq_operations tracer_seq_ops = {
2377}; 2477};
2378 2478
2379static struct trace_iterator * 2479static struct trace_iterator *
2380__tracing_open(struct inode *inode, struct file *file) 2480__tracing_open(struct inode *inode, struct file *file, bool snapshot)
2381{ 2481{
2382 long cpu_file = (long) inode->i_private; 2482 long cpu_file = (long) inode->i_private;
2383 struct trace_iterator *iter; 2483 struct trace_iterator *iter;
@@ -2404,16 +2504,16 @@ __tracing_open(struct inode *inode, struct file *file)
2404 if (!iter->trace) 2504 if (!iter->trace)
2405 goto fail; 2505 goto fail;
2406 2506
2407 if (current_trace) 2507 *iter->trace = *current_trace;
2408 *iter->trace = *current_trace;
2409 2508
2410 if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL)) 2509 if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL))
2411 goto fail; 2510 goto fail;
2412 2511
2413 if (current_trace && current_trace->print_max) 2512 if (current_trace->print_max || snapshot)
2414 iter->tr = &max_tr; 2513 iter->tr = &max_tr;
2415 else 2514 else
2416 iter->tr = &global_trace; 2515 iter->tr = &global_trace;
2516 iter->snapshot = snapshot;
2417 iter->pos = -1; 2517 iter->pos = -1;
2418 mutex_init(&iter->mutex); 2518 mutex_init(&iter->mutex);
2419 iter->cpu_file = cpu_file; 2519 iter->cpu_file = cpu_file;
@@ -2426,8 +2526,13 @@ __tracing_open(struct inode *inode, struct file *file)
2426 if (ring_buffer_overruns(iter->tr->buffer)) 2526 if (ring_buffer_overruns(iter->tr->buffer))
2427 iter->iter_flags |= TRACE_FILE_ANNOTATE; 2527 iter->iter_flags |= TRACE_FILE_ANNOTATE;
2428 2528
2429 /* stop the trace while dumping */ 2529 /* Output in nanoseconds only if we are using a clock in nanoseconds. */
2430 tracing_stop(); 2530 if (trace_clocks[trace_clock_id].in_ns)
2531 iter->iter_flags |= TRACE_FILE_TIME_IN_NS;
2532
2533 /* stop the trace while dumping if we are not opening "snapshot" */
2534 if (!iter->snapshot)
2535 tracing_stop();
2431 2536
2432 if (iter->cpu_file == TRACE_PIPE_ALL_CPU) { 2537 if (iter->cpu_file == TRACE_PIPE_ALL_CPU) {
2433 for_each_tracing_cpu(cpu) { 2538 for_each_tracing_cpu(cpu) {
@@ -2490,8 +2595,9 @@ static int tracing_release(struct inode *inode, struct file *file)
2490 if (iter->trace && iter->trace->close) 2595 if (iter->trace && iter->trace->close)
2491 iter->trace->close(iter); 2596 iter->trace->close(iter);
2492 2597
2493 /* reenable tracing if it was previously enabled */ 2598 if (!iter->snapshot)
2494 tracing_start(); 2599 /* reenable tracing if it was previously enabled */
2600 tracing_start();
2495 mutex_unlock(&trace_types_lock); 2601 mutex_unlock(&trace_types_lock);
2496 2602
2497 mutex_destroy(&iter->mutex); 2603 mutex_destroy(&iter->mutex);
@@ -2519,7 +2625,7 @@ static int tracing_open(struct inode *inode, struct file *file)
2519 } 2625 }
2520 2626
2521 if (file->f_mode & FMODE_READ) { 2627 if (file->f_mode & FMODE_READ) {
2522 iter = __tracing_open(inode, file); 2628 iter = __tracing_open(inode, file, false);
2523 if (IS_ERR(iter)) 2629 if (IS_ERR(iter))
2524 ret = PTR_ERR(iter); 2630 ret = PTR_ERR(iter);
2525 else if (trace_flags & TRACE_ITER_LATENCY_FMT) 2631 else if (trace_flags & TRACE_ITER_LATENCY_FMT)
@@ -2778,11 +2884,25 @@ static int set_tracer_option(struct tracer *trace, char *cmp, int neg)
2778 return -EINVAL; 2884 return -EINVAL;
2779} 2885}
2780 2886
2781static void set_tracer_flags(unsigned int mask, int enabled) 2887/* Some tracers require overwrite to stay enabled */
2888int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set)
2889{
2890 if (tracer->enabled && (mask & TRACE_ITER_OVERWRITE) && !set)
2891 return -1;
2892
2893 return 0;
2894}
2895
2896int set_tracer_flag(unsigned int mask, int enabled)
2782{ 2897{
2783 /* do nothing if flag is already set */ 2898 /* do nothing if flag is already set */
2784 if (!!(trace_flags & mask) == !!enabled) 2899 if (!!(trace_flags & mask) == !!enabled)
2785 return; 2900 return 0;
2901
2902 /* Give the tracer a chance to approve the change */
2903 if (current_trace->flag_changed)
2904 if (current_trace->flag_changed(current_trace, mask, !!enabled))
2905 return -EINVAL;
2786 2906
2787 if (enabled) 2907 if (enabled)
2788 trace_flags |= mask; 2908 trace_flags |= mask;
@@ -2792,49 +2912,69 @@ static void set_tracer_flags(unsigned int mask, int enabled)
2792 if (mask == TRACE_ITER_RECORD_CMD) 2912 if (mask == TRACE_ITER_RECORD_CMD)
2793 trace_event_enable_cmd_record(enabled); 2913 trace_event_enable_cmd_record(enabled);
2794 2914
2795 if (mask == TRACE_ITER_OVERWRITE) 2915 if (mask == TRACE_ITER_OVERWRITE) {
2796 ring_buffer_change_overwrite(global_trace.buffer, enabled); 2916 ring_buffer_change_overwrite(global_trace.buffer, enabled);
2917#ifdef CONFIG_TRACER_MAX_TRACE
2918 ring_buffer_change_overwrite(max_tr.buffer, enabled);
2919#endif
2920 }
2921
2922 if (mask == TRACE_ITER_PRINTK)
2923 trace_printk_start_stop_comm(enabled);
2924
2925 return 0;
2797} 2926}
2798 2927
2799static ssize_t 2928static int trace_set_options(char *option)
2800tracing_trace_options_write(struct file *filp, const char __user *ubuf,
2801 size_t cnt, loff_t *ppos)
2802{ 2929{
2803 char buf[64];
2804 char *cmp; 2930 char *cmp;
2805 int neg = 0; 2931 int neg = 0;
2806 int ret; 2932 int ret = -ENODEV;
2807 int i; 2933 int i;
2808 2934
2809 if (cnt >= sizeof(buf)) 2935 cmp = strstrip(option);
2810 return -EINVAL;
2811
2812 if (copy_from_user(&buf, ubuf, cnt))
2813 return -EFAULT;
2814
2815 buf[cnt] = 0;
2816 cmp = strstrip(buf);
2817 2936
2818 if (strncmp(cmp, "no", 2) == 0) { 2937 if (strncmp(cmp, "no", 2) == 0) {
2819 neg = 1; 2938 neg = 1;
2820 cmp += 2; 2939 cmp += 2;
2821 } 2940 }
2822 2941
2942 mutex_lock(&trace_types_lock);
2943
2823 for (i = 0; trace_options[i]; i++) { 2944 for (i = 0; trace_options[i]; i++) {
2824 if (strcmp(cmp, trace_options[i]) == 0) { 2945 if (strcmp(cmp, trace_options[i]) == 0) {
2825 set_tracer_flags(1 << i, !neg); 2946 ret = set_tracer_flag(1 << i, !neg);
2826 break; 2947 break;
2827 } 2948 }
2828 } 2949 }
2829 2950
2830 /* If no option could be set, test the specific tracer options */ 2951 /* If no option could be set, test the specific tracer options */
2831 if (!trace_options[i]) { 2952 if (!trace_options[i])
2832 mutex_lock(&trace_types_lock);
2833 ret = set_tracer_option(current_trace, cmp, neg); 2953 ret = set_tracer_option(current_trace, cmp, neg);
2834 mutex_unlock(&trace_types_lock); 2954
2835 if (ret) 2955 mutex_unlock(&trace_types_lock);
2836 return ret; 2956
2837 } 2957 return ret;
2958}
2959
2960static ssize_t
2961tracing_trace_options_write(struct file *filp, const char __user *ubuf,
2962 size_t cnt, loff_t *ppos)
2963{
2964 char buf[64];
2965 int ret;
2966
2967 if (cnt >= sizeof(buf))
2968 return -EINVAL;
2969
2970 if (copy_from_user(&buf, ubuf, cnt))
2971 return -EFAULT;
2972
2973 buf[cnt] = 0;
2974
2975 ret = trace_set_options(buf);
2976 if (ret < 0)
2977 return ret;
2838 2978
2839 *ppos += cnt; 2979 *ppos += cnt;
2840 2980
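set_tracer_flag() now lets the current tracer veto an option change through the new flag_changed callback, and trace_keep_overwrite() is the stock veto for tracers that cannot run with TRACE_ITER_OVERWRITE cleared. A tracer would presumably opt in roughly like this (a hypothetical tracer definition; the real users, such as the latency tracers, are wired up outside this hunk):

    static int  my_latency_init(struct trace_array *tr) { return 0; }      /* placeholder */
    static void my_latency_reset(struct trace_array *tr) { }               /* placeholder */

    static struct tracer my_latency_tracer __read_mostly = {
            .name           = "my_latency",
            .init           = my_latency_init,
            .reset          = my_latency_reset,
            .use_max_tr     = true,
            /* veto clearing the overwrite option while this tracer is enabled */
            .flag_changed   = trace_keep_overwrite,
    };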
@@ -2940,56 +3080,6 @@ static const struct file_operations tracing_saved_cmdlines_fops = {
2940}; 3080};
2941 3081
2942static ssize_t 3082static ssize_t
2943tracing_ctrl_read(struct file *filp, char __user *ubuf,
2944 size_t cnt, loff_t *ppos)
2945{
2946 char buf[64];
2947 int r;
2948
2949 r = sprintf(buf, "%u\n", tracer_enabled);
2950 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2951}
2952
2953static ssize_t
2954tracing_ctrl_write(struct file *filp, const char __user *ubuf,
2955 size_t cnt, loff_t *ppos)
2956{
2957 struct trace_array *tr = filp->private_data;
2958 unsigned long val;
2959 int ret;
2960
2961 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
2962 if (ret)
2963 return ret;
2964
2965 val = !!val;
2966
2967 mutex_lock(&trace_types_lock);
2968 if (tracer_enabled ^ val) {
2969
2970 /* Only need to warn if this is used to change the state */
2971 WARN_ONCE(1, "tracing_enabled is deprecated. Use tracing_on");
2972
2973 if (val) {
2974 tracer_enabled = 1;
2975 if (current_trace->start)
2976 current_trace->start(tr);
2977 tracing_start();
2978 } else {
2979 tracer_enabled = 0;
2980 tracing_stop();
2981 if (current_trace->stop)
2982 current_trace->stop(tr);
2983 }
2984 }
2985 mutex_unlock(&trace_types_lock);
2986
2987 *ppos += cnt;
2988
2989 return cnt;
2990}
2991
2992static ssize_t
2993tracing_set_trace_read(struct file *filp, char __user *ubuf, 3083tracing_set_trace_read(struct file *filp, char __user *ubuf,
2994 size_t cnt, loff_t *ppos) 3084 size_t cnt, loff_t *ppos)
2995{ 3085{
@@ -2997,10 +3087,7 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf,
2997 int r; 3087 int r;
2998 3088
2999 mutex_lock(&trace_types_lock); 3089 mutex_lock(&trace_types_lock);
3000 if (current_trace) 3090 r = sprintf(buf, "%s\n", current_trace->name);
3001 r = sprintf(buf, "%s\n", current_trace->name);
3002 else
3003 r = sprintf(buf, "\n");
3004 mutex_unlock(&trace_types_lock); 3091 mutex_unlock(&trace_types_lock);
3005 3092
3006 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 3093 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
@@ -3019,6 +3106,31 @@ static void set_buffer_entries(struct trace_array *tr, unsigned long val)
3019 tr->data[cpu]->entries = val; 3106 tr->data[cpu]->entries = val;
3020} 3107}
3021 3108
3109/* resize @tr's buffer to the size of @size_tr's entries */
3110static int resize_buffer_duplicate_size(struct trace_array *tr,
3111 struct trace_array *size_tr, int cpu_id)
3112{
3113 int cpu, ret = 0;
3114
3115 if (cpu_id == RING_BUFFER_ALL_CPUS) {
3116 for_each_tracing_cpu(cpu) {
3117 ret = ring_buffer_resize(tr->buffer,
3118 size_tr->data[cpu]->entries, cpu);
3119 if (ret < 0)
3120 break;
3121 tr->data[cpu]->entries = size_tr->data[cpu]->entries;
3122 }
3123 } else {
3124 ret = ring_buffer_resize(tr->buffer,
3125 size_tr->data[cpu_id]->entries, cpu_id);
3126 if (ret == 0)
3127 tr->data[cpu_id]->entries =
3128 size_tr->data[cpu_id]->entries;
3129 }
3130
3131 return ret;
3132}
3133
3022static int __tracing_resize_ring_buffer(unsigned long size, int cpu) 3134static int __tracing_resize_ring_buffer(unsigned long size, int cpu)
3023{ 3135{
3024 int ret; 3136 int ret;
@@ -3030,6 +3142,10 @@ static int __tracing_resize_ring_buffer(unsigned long size, int cpu)
3030 */ 3142 */
3031 ring_buffer_expanded = 1; 3143 ring_buffer_expanded = 1;
3032 3144
3145 /* May be called before buffers are initialized */
3146 if (!global_trace.buffer)
3147 return 0;
3148
3033 ret = ring_buffer_resize(global_trace.buffer, size, cpu); 3149 ret = ring_buffer_resize(global_trace.buffer, size, cpu);
3034 if (ret < 0) 3150 if (ret < 0)
3035 return ret; 3151 return ret;
@@ -3039,23 +3155,8 @@ static int __tracing_resize_ring_buffer(unsigned long size, int cpu)
3039 3155
3040 ret = ring_buffer_resize(max_tr.buffer, size, cpu); 3156 ret = ring_buffer_resize(max_tr.buffer, size, cpu);
3041 if (ret < 0) { 3157 if (ret < 0) {
3042 int r = 0; 3158 int r = resize_buffer_duplicate_size(&global_trace,
3043 3159 &global_trace, cpu);
3044 if (cpu == RING_BUFFER_ALL_CPUS) {
3045 int i;
3046 for_each_tracing_cpu(i) {
3047 r = ring_buffer_resize(global_trace.buffer,
3048 global_trace.data[i]->entries,
3049 i);
3050 if (r < 0)
3051 break;
3052 }
3053 } else {
3054 r = ring_buffer_resize(global_trace.buffer,
3055 global_trace.data[cpu]->entries,
3056 cpu);
3057 }
3058
3059 if (r < 0) { 3160 if (r < 0) {
3060 /* 3161 /*
3061 * AARGH! We are left with different 3162 * AARGH! We are left with different
@@ -3152,6 +3253,7 @@ static int tracing_set_tracer(const char *buf)
3152 static struct trace_option_dentry *topts; 3253 static struct trace_option_dentry *topts;
3153 struct trace_array *tr = &global_trace; 3254 struct trace_array *tr = &global_trace;
3154 struct tracer *t; 3255 struct tracer *t;
3256 bool had_max_tr;
3155 int ret = 0; 3257 int ret = 0;
3156 3258
3157 mutex_lock(&trace_types_lock); 3259 mutex_lock(&trace_types_lock);
@@ -3176,9 +3278,24 @@ static int tracing_set_tracer(const char *buf)
3176 goto out; 3278 goto out;
3177 3279
3178 trace_branch_disable(); 3280 trace_branch_disable();
3179 if (current_trace && current_trace->reset) 3281
3282 current_trace->enabled = false;
3283
3284 if (current_trace->reset)
3180 current_trace->reset(tr); 3285 current_trace->reset(tr);
3181 if (current_trace && current_trace->use_max_tr) { 3286
3287 had_max_tr = current_trace->allocated_snapshot;
3288 current_trace = &nop_trace;
3289
3290 if (had_max_tr && !t->use_max_tr) {
3291 /*
3292 * We need to make sure that the update_max_tr sees that
3293 * current_trace changed to nop_trace to keep it from
3294 * swapping the buffers after we resize it.
3295 * The update_max_tr is called with interrupts disabled,
3296 * so a synchronize_sched() is sufficient.
3297 */
3298 synchronize_sched();
3182 /* 3299 /*
3183 * We don't free the ring buffer. instead, resize it because 3300 * We don't free the ring buffer. instead, resize it because
3184 * The max_tr ring buffer has some state (e.g. ring->clock) and 3301 * The max_tr ring buffer has some state (e.g. ring->clock) and
@@ -3186,24 +3303,19 @@ static int tracing_set_tracer(const char *buf)
3186 */ 3303 */
3187 ring_buffer_resize(max_tr.buffer, 1, RING_BUFFER_ALL_CPUS); 3304 ring_buffer_resize(max_tr.buffer, 1, RING_BUFFER_ALL_CPUS);
3188 set_buffer_entries(&max_tr, 1); 3305 set_buffer_entries(&max_tr, 1);
3306 tracing_reset_online_cpus(&max_tr);
3307 current_trace->allocated_snapshot = false;
3189 } 3308 }
3190 destroy_trace_option_files(topts); 3309 destroy_trace_option_files(topts);
3191 3310
3192 current_trace = &nop_trace;
3193
3194 topts = create_trace_option_files(t); 3311 topts = create_trace_option_files(t);
3195 if (t->use_max_tr) { 3312 if (t->use_max_tr && !had_max_tr) {
3196 int cpu;
3197 /* we need to make per cpu buffer sizes equivalent */ 3313 /* we need to make per cpu buffer sizes equivalent */
3198 for_each_tracing_cpu(cpu) { 3314 ret = resize_buffer_duplicate_size(&max_tr, &global_trace,
3199 ret = ring_buffer_resize(max_tr.buffer, 3315 RING_BUFFER_ALL_CPUS);
3200 global_trace.data[cpu]->entries, 3316 if (ret < 0)
3201 cpu); 3317 goto out;
3202 if (ret < 0) 3318 t->allocated_snapshot = true;
3203 goto out;
3204 max_tr.data[cpu]->entries =
3205 global_trace.data[cpu]->entries;
3206 }
3207 } 3319 }
3208 3320
3209 if (t->init) { 3321 if (t->init) {
@@ -3213,6 +3325,7 @@ static int tracing_set_tracer(const char *buf)
3213 } 3325 }
3214 3326
3215 current_trace = t; 3327 current_trace = t;
3328 current_trace->enabled = true;
3216 trace_branch_enable(tr); 3329 trace_branch_enable(tr);
3217 out: 3330 out:
3218 mutex_unlock(&trace_types_lock); 3331 mutex_unlock(&trace_types_lock);
@@ -3311,8 +3424,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
3311 ret = -ENOMEM; 3424 ret = -ENOMEM;
3312 goto fail; 3425 goto fail;
3313 } 3426 }
3314 if (current_trace) 3427 *iter->trace = *current_trace;
3315 *iter->trace = *current_trace;
3316 3428
3317 if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) { 3429 if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) {
3318 ret = -ENOMEM; 3430 ret = -ENOMEM;
@@ -3325,6 +3437,10 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
3325 if (trace_flags & TRACE_ITER_LATENCY_FMT) 3437 if (trace_flags & TRACE_ITER_LATENCY_FMT)
3326 iter->iter_flags |= TRACE_FILE_LAT_FMT; 3438 iter->iter_flags |= TRACE_FILE_LAT_FMT;
3327 3439
3440 /* Output in nanoseconds only if we are using a clock in nanoseconds. */
3441 if (trace_clocks[trace_clock_id].in_ns)
3442 iter->iter_flags |= TRACE_FILE_TIME_IN_NS;
3443
3328 iter->cpu_file = cpu_file; 3444 iter->cpu_file = cpu_file;
3329 iter->tr = &global_trace; 3445 iter->tr = &global_trace;
3330 mutex_init(&iter->mutex); 3446 mutex_init(&iter->mutex);
@@ -3385,19 +3501,6 @@ tracing_poll_pipe(struct file *filp, poll_table *poll_table)
3385 } 3501 }
3386} 3502}
3387 3503
3388
3389void default_wait_pipe(struct trace_iterator *iter)
3390{
3391 DEFINE_WAIT(wait);
3392
3393 prepare_to_wait(&trace_wait, &wait, TASK_INTERRUPTIBLE);
3394
3395 if (trace_empty(iter))
3396 schedule();
3397
3398 finish_wait(&trace_wait, &wait);
3399}
3400
3401/* 3504/*
3402 * This is a make-shift waitqueue. 3505 * This is a make-shift waitqueue.
3403 * A tracer might use this callback on some rare cases: 3506 * A tracer might use this callback on some rare cases:
@@ -3446,7 +3549,7 @@ static int tracing_wait_pipe(struct file *filp)
3446 * 3549 *
3447 * iter->pos will be 0 if we haven't read anything. 3550 * iter->pos will be 0 if we haven't read anything.
3448 */ 3551 */
3449 if (!tracer_enabled && iter->pos) 3552 if (!tracing_is_enabled() && iter->pos)
3450 break; 3553 break;
3451 } 3554 }
3452 3555
@@ -3461,7 +3564,6 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
3461 size_t cnt, loff_t *ppos) 3564 size_t cnt, loff_t *ppos)
3462{ 3565{
3463 struct trace_iterator *iter = filp->private_data; 3566 struct trace_iterator *iter = filp->private_data;
3464 static struct tracer *old_tracer;
3465 ssize_t sret; 3567 ssize_t sret;
3466 3568
3467 /* return any leftover data */ 3569 /* return any leftover data */
@@ -3473,10 +3575,8 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
3473 3575
3474 /* copy the tracer to avoid using a global lock all around */ 3576 /* copy the tracer to avoid using a global lock all around */
3475 mutex_lock(&trace_types_lock); 3577 mutex_lock(&trace_types_lock);
3476 if (unlikely(old_tracer != current_trace && current_trace)) { 3578 if (unlikely(iter->trace->name != current_trace->name))
3477 old_tracer = current_trace;
3478 *iter->trace = *current_trace; 3579 *iter->trace = *current_trace;
3479 }
3480 mutex_unlock(&trace_types_lock); 3580 mutex_unlock(&trace_types_lock);
3481 3581
3482 /* 3582 /*
@@ -3632,7 +3732,6 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3632 .ops = &tracing_pipe_buf_ops, 3732 .ops = &tracing_pipe_buf_ops,
3633 .spd_release = tracing_spd_release_pipe, 3733 .spd_release = tracing_spd_release_pipe,
3634 }; 3734 };
3635 static struct tracer *old_tracer;
3636 ssize_t ret; 3735 ssize_t ret;
3637 size_t rem; 3736 size_t rem;
3638 unsigned int i; 3737 unsigned int i;
@@ -3642,10 +3741,8 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3642 3741
3643 /* copy the tracer to avoid using a global lock all around */ 3742 /* copy the tracer to avoid using a global lock all around */
3644 mutex_lock(&trace_types_lock); 3743 mutex_lock(&trace_types_lock);
3645 if (unlikely(old_tracer != current_trace && current_trace)) { 3744 if (unlikely(iter->trace->name != current_trace->name))
3646 old_tracer = current_trace;
3647 *iter->trace = *current_trace; 3745 *iter->trace = *current_trace;
3648 }
3649 mutex_unlock(&trace_types_lock); 3746 mutex_unlock(&trace_types_lock);
3650 3747
3651 mutex_lock(&iter->mutex); 3748 mutex_lock(&iter->mutex);
@@ -3955,7 +4052,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3955 } else 4052 } else
3956 entry->buf[cnt] = '\0'; 4053 entry->buf[cnt] = '\0';
3957 4054
3958 ring_buffer_unlock_commit(buffer, event); 4055 __buffer_unlock_commit(buffer, event);
3959 4056
3960 written = cnt; 4057 written = cnt;
3961 4058
@@ -4016,6 +4113,13 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
4016 if (max_tr.buffer) 4113 if (max_tr.buffer)
4017 ring_buffer_set_clock(max_tr.buffer, trace_clocks[i].func); 4114 ring_buffer_set_clock(max_tr.buffer, trace_clocks[i].func);
4018 4115
4116 /*
4117 * New clock may not be consistent with the previous clock.
4118 * Reset the buffer so that it doesn't have incomparable timestamps.
4119 */
4120 tracing_reset_online_cpus(&global_trace);
4121 tracing_reset_online_cpus(&max_tr);
4122
4019 mutex_unlock(&trace_types_lock); 4123 mutex_unlock(&trace_types_lock);
4020 4124
4021 *fpos += cnt; 4125 *fpos += cnt;
@@ -4030,6 +4134,85 @@ static int tracing_clock_open(struct inode *inode, struct file *file)
4030 return single_open(file, tracing_clock_show, NULL); 4134 return single_open(file, tracing_clock_show, NULL);
4031} 4135}
4032 4136
4137#ifdef CONFIG_TRACER_SNAPSHOT
4138static int tracing_snapshot_open(struct inode *inode, struct file *file)
4139{
4140 struct trace_iterator *iter;
4141 int ret = 0;
4142
4143 if (file->f_mode & FMODE_READ) {
4144 iter = __tracing_open(inode, file, true);
4145 if (IS_ERR(iter))
4146 ret = PTR_ERR(iter);
4147 }
4148 return ret;
4149}
4150
4151static ssize_t
4152tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
4153 loff_t *ppos)
4154{
4155 unsigned long val;
4156 int ret;
4157
4158 ret = tracing_update_buffers();
4159 if (ret < 0)
4160 return ret;
4161
4162 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
4163 if (ret)
4164 return ret;
4165
4166 mutex_lock(&trace_types_lock);
4167
4168 if (current_trace->use_max_tr) {
4169 ret = -EBUSY;
4170 goto out;
4171 }
4172
4173 switch (val) {
4174 case 0:
4175 if (current_trace->allocated_snapshot) {
4176 /* free spare buffer */
4177 ring_buffer_resize(max_tr.buffer, 1,
4178 RING_BUFFER_ALL_CPUS);
4179 set_buffer_entries(&max_tr, 1);
4180 tracing_reset_online_cpus(&max_tr);
4181 current_trace->allocated_snapshot = false;
4182 }
4183 break;
4184 case 1:
4185 if (!current_trace->allocated_snapshot) {
4186 /* allocate spare buffer */
4187 ret = resize_buffer_duplicate_size(&max_tr,
4188 &global_trace, RING_BUFFER_ALL_CPUS);
4189 if (ret < 0)
4190 break;
4191 current_trace->allocated_snapshot = true;
4192 }
4193
4194 local_irq_disable();
4195 /* Now, we're going to swap */
4196 update_max_tr(&global_trace, current, smp_processor_id());
4197 local_irq_enable();
4198 break;
4199 default:
4200 if (current_trace->allocated_snapshot)
4201 tracing_reset_online_cpus(&max_tr);
4202 break;
4203 }
4204
4205 if (ret >= 0) {
4206 *ppos += cnt;
4207 ret = cnt;
4208 }
4209out:
4210 mutex_unlock(&trace_types_lock);
4211 return ret;
4212}
4213#endif /* CONFIG_TRACER_SNAPSHOT */
4214
4215
4033static const struct file_operations tracing_max_lat_fops = { 4216static const struct file_operations tracing_max_lat_fops = {
4034 .open = tracing_open_generic, 4217 .open = tracing_open_generic,
4035 .read = tracing_max_lat_read, 4218 .read = tracing_max_lat_read,
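Together with snapshot_fops and the debugfs registration added further down, tracing_snapshot_write() backs the documented echo 0/1/2 interface. A minimal userspace sketch that allocates and takes a snapshot, then dumps it, assuming CONFIG_TRACER_SNAPSHOT=y and debugfs mounted at /sys/kernel/debug:

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    #define SNAP "/sys/kernel/debug/tracing/snapshot"

    int main(void)
    {
            char buf[4096];
            ssize_t n;
            int fd;

            /* "1": allocate the spare buffer if necessary and swap it in */
            fd = open(SNAP, O_WRONLY);
            if (fd < 0 || write(fd, "1\n", 2) < 0)
                    return 1;
            close(fd);

            /* read back the frozen copy of the trace */
            fd = open(SNAP, O_RDONLY);
            if (fd < 0)
                    return 1;
            while ((n = read(fd, buf, sizeof(buf))) > 0)
                    fwrite(buf, 1, n, stdout);
            close(fd);
            return 0;
    }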
@@ -4037,13 +4220,6 @@ static const struct file_operations tracing_max_lat_fops = {
4037 .llseek = generic_file_llseek, 4220 .llseek = generic_file_llseek,
4038}; 4221};
4039 4222
4040static const struct file_operations tracing_ctrl_fops = {
4041 .open = tracing_open_generic,
4042 .read = tracing_ctrl_read,
4043 .write = tracing_ctrl_write,
4044 .llseek = generic_file_llseek,
4045};
4046
4047static const struct file_operations set_tracer_fops = { 4223static const struct file_operations set_tracer_fops = {
4048 .open = tracing_open_generic, 4224 .open = tracing_open_generic,
4049 .read = tracing_set_trace_read, 4225 .read = tracing_set_trace_read,
@@ -4093,6 +4269,16 @@ static const struct file_operations trace_clock_fops = {
4093 .write = tracing_clock_write, 4269 .write = tracing_clock_write,
4094}; 4270};
4095 4271
4272#ifdef CONFIG_TRACER_SNAPSHOT
4273static const struct file_operations snapshot_fops = {
4274 .open = tracing_snapshot_open,
4275 .read = seq_read,
4276 .write = tracing_snapshot_write,
4277 .llseek = tracing_seek,
4278 .release = tracing_release,
4279};
4280#endif /* CONFIG_TRACER_SNAPSHOT */
4281
4096struct ftrace_buffer_info { 4282struct ftrace_buffer_info {
4097 struct trace_array *tr; 4283 struct trace_array *tr;
4098 void *spare; 4284 void *spare;
@@ -4260,13 +4446,11 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
4260 return -ENOMEM; 4446 return -ENOMEM;
4261 4447
4262 if (*ppos & (PAGE_SIZE - 1)) { 4448 if (*ppos & (PAGE_SIZE - 1)) {
4263 WARN_ONCE(1, "Ftrace: previous read must page-align\n");
4264 ret = -EINVAL; 4449 ret = -EINVAL;
4265 goto out; 4450 goto out;
4266 } 4451 }
4267 4452
4268 if (len & (PAGE_SIZE - 1)) { 4453 if (len & (PAGE_SIZE - 1)) {
4269 WARN_ONCE(1, "Ftrace: splice_read should page-align\n");
4270 if (len < PAGE_SIZE) { 4454 if (len < PAGE_SIZE) {
4271 ret = -EINVAL; 4455 ret = -EINVAL;
4272 goto out; 4456 goto out;
@@ -4377,13 +4561,30 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
4377 cnt = ring_buffer_bytes_cpu(tr->buffer, cpu); 4561 cnt = ring_buffer_bytes_cpu(tr->buffer, cpu);
4378 trace_seq_printf(s, "bytes: %ld\n", cnt); 4562 trace_seq_printf(s, "bytes: %ld\n", cnt);
4379 4563
4380 t = ns2usecs(ring_buffer_oldest_event_ts(tr->buffer, cpu)); 4564 if (trace_clocks[trace_clock_id].in_ns) {
4381 usec_rem = do_div(t, USEC_PER_SEC); 4565 /* local or global for trace_clock */
4382 trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n", t, usec_rem); 4566 t = ns2usecs(ring_buffer_oldest_event_ts(tr->buffer, cpu));
4567 usec_rem = do_div(t, USEC_PER_SEC);
4568 trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n",
4569 t, usec_rem);
4570
4571 t = ns2usecs(ring_buffer_time_stamp(tr->buffer, cpu));
4572 usec_rem = do_div(t, USEC_PER_SEC);
4573 trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem);
4574 } else {
4575 /* counter or tsc mode for trace_clock */
4576 trace_seq_printf(s, "oldest event ts: %llu\n",
4577 ring_buffer_oldest_event_ts(tr->buffer, cpu));
4578
4579 trace_seq_printf(s, "now ts: %llu\n",
4580 ring_buffer_time_stamp(tr->buffer, cpu));
4581 }
4383 4582
4384 t = ns2usecs(ring_buffer_time_stamp(tr->buffer, cpu)); 4583 cnt = ring_buffer_dropped_events_cpu(tr->buffer, cpu);
4385 usec_rem = do_div(t, USEC_PER_SEC); 4584 trace_seq_printf(s, "dropped events: %ld\n", cnt);
4386 trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem); 4585
4586 cnt = ring_buffer_read_events_cpu(tr->buffer, cpu);
4587 trace_seq_printf(s, "read events: %ld\n", cnt);
4387 4588
4388 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); 4589 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
4389 4590
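The new "dropped events:" and "read events:" lines (and the clock-aware timestamp formatting) appear in the existing per-CPU stats files, so they can be observed without any new interface. A quick userspace check, again assuming the usual debugfs mount point; per_cpu/cpu0/stats stands in for any CPU:

    #include <stdio.h>

    int main(void)
    {
            char line[256];
            FILE *f = fopen("/sys/kernel/debug/tracing/per_cpu/cpu0/stats", "r");

            if (!f)
                    return 1;
            /* prints entries, overrun, commit overrun, bytes, the timestamps,
             * and the new "dropped events:" / "read events:" counters */
            while (fgets(line, sizeof(line), f))
                    fputs(line, stdout);
            fclose(f);
            return 0;
    }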
@@ -4461,7 +4662,7 @@ struct dentry *tracing_init_dentry(void)
4461 4662
4462static struct dentry *d_percpu; 4663static struct dentry *d_percpu;
4463 4664
4464struct dentry *tracing_dentry_percpu(void) 4665static struct dentry *tracing_dentry_percpu(void)
4465{ 4666{
4466 static int once; 4667 static int once;
4467 struct dentry *d_tracer; 4668 struct dentry *d_tracer;
@@ -4611,7 +4812,13 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,
4611 4812
4612 if (val != 0 && val != 1) 4813 if (val != 0 && val != 1)
4613 return -EINVAL; 4814 return -EINVAL;
4614 set_tracer_flags(1 << index, val); 4815
4816 mutex_lock(&trace_types_lock);
4817 ret = set_tracer_flag(1 << index, val);
4818 mutex_unlock(&trace_types_lock);
4819
4820 if (ret < 0)
4821 return ret;
4615 4822
4616 *ppos += cnt; 4823 *ppos += cnt;
4617 4824
@@ -4788,10 +4995,17 @@ rb_simple_write(struct file *filp, const char __user *ubuf,
4788 return ret; 4995 return ret;
4789 4996
4790 if (buffer) { 4997 if (buffer) {
4791 if (val) 4998 mutex_lock(&trace_types_lock);
4999 if (val) {
4792 ring_buffer_record_on(buffer); 5000 ring_buffer_record_on(buffer);
4793 else 5001 if (current_trace->start)
5002 current_trace->start(tr);
5003 } else {
4794 ring_buffer_record_off(buffer); 5004 ring_buffer_record_off(buffer);
5005 if (current_trace->stop)
5006 current_trace->stop(tr);
5007 }
5008 mutex_unlock(&trace_types_lock);
4795 } 5009 }
4796 5010
4797 (*ppos)++; 5011 (*ppos)++;
@@ -4815,9 +5029,6 @@ static __init int tracer_init_debugfs(void)
4815 5029
4816 d_tracer = tracing_init_dentry(); 5030 d_tracer = tracing_init_dentry();
4817 5031
4818 trace_create_file("tracing_enabled", 0644, d_tracer,
4819 &global_trace, &tracing_ctrl_fops);
4820
4821 trace_create_file("trace_options", 0644, d_tracer, 5032 trace_create_file("trace_options", 0644, d_tracer,
4822 NULL, &tracing_iter_fops); 5033 NULL, &tracing_iter_fops);
4823 5034
@@ -4873,6 +5084,11 @@ static __init int tracer_init_debugfs(void)
4873 &ftrace_update_tot_cnt, &tracing_dyn_info_fops); 5084 &ftrace_update_tot_cnt, &tracing_dyn_info_fops);
4874#endif 5085#endif
4875 5086
5087#ifdef CONFIG_TRACER_SNAPSHOT
5088 trace_create_file("snapshot", 0644, d_tracer,
5089 (void *) TRACE_PIPE_ALL_CPU, &snapshot_fops);
5090#endif
5091
4876 create_trace_options_dir(); 5092 create_trace_options_dir();
4877 5093
4878 for_each_tracing_cpu(cpu) 5094 for_each_tracing_cpu(cpu)
@@ -4981,6 +5197,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
4981 if (disable_tracing) 5197 if (disable_tracing)
4982 ftrace_kill(); 5198 ftrace_kill();
4983 5199
5200 /* Simulate the iterator */
4984 trace_init_global_iter(&iter); 5201 trace_init_global_iter(&iter);
4985 5202
4986 for_each_tracing_cpu(cpu) { 5203 for_each_tracing_cpu(cpu) {
@@ -4992,10 +5209,6 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
4992 /* don't look at user memory in panic mode */ 5209 /* don't look at user memory in panic mode */
4993 trace_flags &= ~TRACE_ITER_SYM_USEROBJ; 5210 trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
4994 5211
4995 /* Simulate the iterator */
4996 iter.tr = &global_trace;
4997 iter.trace = current_trace;
4998
4999 switch (oops_dump_mode) { 5212 switch (oops_dump_mode) {
5000 case DUMP_ALL: 5213 case DUMP_ALL:
5001 iter.cpu_file = TRACE_PIPE_ALL_CPU; 5214 iter.cpu_file = TRACE_PIPE_ALL_CPU;
@@ -5089,6 +5302,7 @@ __init static int tracer_alloc_buffers(void)
5089 5302
5090 /* Only allocate trace_printk buffers if a trace_printk exists */ 5303 /* Only allocate trace_printk buffers if a trace_printk exists */
5091 if (__stop___trace_bprintk_fmt != __start___trace_bprintk_fmt) 5304 if (__stop___trace_bprintk_fmt != __start___trace_bprintk_fmt)
5305 /* Must be called before global_trace.buffer is allocated */
5092 trace_printk_init_buffers(); 5306 trace_printk_init_buffers();
5093 5307
5094 /* To save memory, keep the ring buffer size to its minimum */ 5308 /* To save memory, keep the ring buffer size to its minimum */
@@ -5136,9 +5350,10 @@ __init static int tracer_alloc_buffers(void)
5136#endif 5350#endif
5137 5351
5138 trace_init_cmdlines(); 5352 trace_init_cmdlines();
5353 init_irq_work(&trace_work_wakeup, trace_wake_up);
5139 5354
5140 register_tracer(&nop_trace); 5355 register_tracer(&nop_trace);
5141 current_trace = &nop_trace; 5356
5142 /* All seems OK, enable tracing */ 5357 /* All seems OK, enable tracing */
5143 tracing_disabled = 0; 5358 tracing_disabled = 0;
5144 5359
@@ -5147,6 +5362,13 @@ __init static int tracer_alloc_buffers(void)
5147 5362
5148 register_die_notifier(&trace_die_notifier); 5363 register_die_notifier(&trace_die_notifier);
5149 5364
5365 while (trace_boot_options) {
5366 char *option;
5367
5368 option = strsep(&trace_boot_options, ",");
5369 trace_set_options(option);
5370 }
5371
5150 return 0; 5372 return 0;
5151 5373
5152out_free_cpumask: 5374out_free_cpumask:
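For context on the new CONFIG_TRACER_SNAPSHOT file registered above, here is a minimal userspace sketch (not part of the patch) that exercises it. The path is an assumption: it presumes debugfs is mounted at /sys/kernel/debug and that, as the write handler suggests, a nonzero value asks the tracer to snapshot the live ring buffer.

/*
 * Minimal userspace sketch -- not part of the patch. Assumes debugfs is
 * mounted at /sys/kernel/debug and CONFIG_TRACER_SNAPSHOT is enabled.
 */
#include <stdio.h>
#include <stdlib.h>

static const char *snap = "/sys/kernel/debug/tracing/snapshot";

int main(void)
{
	FILE *f;
	char line[256];

	/* Writing a nonzero value asks the tracer to take a snapshot. */
	f = fopen(snap, "w");
	if (!f) {
		perror("open snapshot for write");
		return EXIT_FAILURE;
	}
	fputs("1\n", f);
	fclose(f);

	/* Reading the file dumps the snapshotted events via seq_read(). */
	f = fopen(snap, "r");
	if (!f) {
		perror("open snapshot for read");
		return EXIT_FAILURE;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);

	return EXIT_SUCCESS;
}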
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index c15f528c1af4..2081971367ea 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -283,24 +283,70 @@ struct tracer {
283 enum print_line_t (*print_line)(struct trace_iterator *iter); 283 enum print_line_t (*print_line)(struct trace_iterator *iter);
284 /* If you handled the flag setting, return 0 */ 284 /* If you handled the flag setting, return 0 */
285 int (*set_flag)(u32 old_flags, u32 bit, int set); 285 int (*set_flag)(u32 old_flags, u32 bit, int set);
286 /* Return 0 if OK with change, else return non-zero */
287 int (*flag_changed)(struct tracer *tracer,
288 u32 mask, int set);
286 struct tracer *next; 289 struct tracer *next;
287 struct tracer_flags *flags; 290 struct tracer_flags *flags;
288 int print_max; 291 bool print_max;
289 int use_max_tr; 292 bool use_max_tr;
293 bool allocated_snapshot;
294 bool enabled;
290}; 295};
291 296
292 297
293/* Only current can touch trace_recursion */ 298/* Only current can touch trace_recursion */
294#define trace_recursion_inc() do { (current)->trace_recursion++; } while (0)
295#define trace_recursion_dec() do { (current)->trace_recursion--; } while (0)
296 299
297/* Ring buffer has the 10 LSB bits to count */ 300/*
298#define trace_recursion_buffer() ((current)->trace_recursion & 0x3ff) 301 * For function tracing recursion:
299 302 * The order of these bits is important.
300/* for function tracing recursion */ 303 *
301#define TRACE_INTERNAL_BIT (1<<11) 304 * When function tracing occurs, the following steps are taken:
302#define TRACE_GLOBAL_BIT (1<<12) 305 * If arch does not support a ftrace feature:
303#define TRACE_CONTROL_BIT (1<<13) 306 * call internal function (uses INTERNAL bits) which calls...
307 * If callback is registered to the "global" list, the list
308 * function is called and recursion checks the GLOBAL bits.
309 * then this function calls...
310 * The function callback, which can use the FTRACE bits to
311 * check for recursion.
312 *
313 * Now if the arch does not support a feature, and it calls
314 * the global list function which calls the ftrace callback
315 * all three of these steps will do a recursion protection.
316 * There's no reason to do one if the previous caller already
317 * did. The recursion that we are protecting against will
318 * go through the same steps again.
319 *
320 * To prevent the multiple recursion checks, if a recursion
321 * bit is set that is higher than the MAX bit of the current
322 * check, then we know that the check was made by the previous
323 * caller, and we can skip the current check.
324 */
325enum {
326 TRACE_BUFFER_BIT,
327 TRACE_BUFFER_NMI_BIT,
328 TRACE_BUFFER_IRQ_BIT,
329 TRACE_BUFFER_SIRQ_BIT,
330
331 /* Start of function recursion bits */
332 TRACE_FTRACE_BIT,
333 TRACE_FTRACE_NMI_BIT,
334 TRACE_FTRACE_IRQ_BIT,
335 TRACE_FTRACE_SIRQ_BIT,
336
337 /* GLOBAL_BITs must be greater than FTRACE_BITs */
338 TRACE_GLOBAL_BIT,
339 TRACE_GLOBAL_NMI_BIT,
340 TRACE_GLOBAL_IRQ_BIT,
341 TRACE_GLOBAL_SIRQ_BIT,
342
343 /* INTERNAL_BITs must be greater than GLOBAL_BITs */
344 TRACE_INTERNAL_BIT,
345 TRACE_INTERNAL_NMI_BIT,
346 TRACE_INTERNAL_IRQ_BIT,
347 TRACE_INTERNAL_SIRQ_BIT,
348
349 TRACE_CONTROL_BIT,
304 350
305/* 351/*
306 * Abuse of the trace_recursion. 352 * Abuse of the trace_recursion.
@@ -309,11 +355,77 @@ struct tracer {
309 * was called in irq context but we have irq tracing off. Since this 355 * was called in irq context but we have irq tracing off. Since this
310 * can only be modified by current, we can reuse trace_recursion. 356 * can only be modified by current, we can reuse trace_recursion.
311 */ 357 */
312#define TRACE_IRQ_BIT (1<<13) 358 TRACE_IRQ_BIT,
359};
360
361#define trace_recursion_set(bit) do { (current)->trace_recursion |= (1<<(bit)); } while (0)
362#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(1<<(bit)); } while (0)
363#define trace_recursion_test(bit) ((current)->trace_recursion & (1<<(bit)))
313 364
314#define trace_recursion_set(bit) do { (current)->trace_recursion |= (bit); } while (0) 365#define TRACE_CONTEXT_BITS 4
315#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(bit); } while (0) 366
316#define trace_recursion_test(bit) ((current)->trace_recursion & (bit)) 367#define TRACE_FTRACE_START TRACE_FTRACE_BIT
368#define TRACE_FTRACE_MAX ((1 << (TRACE_FTRACE_START + TRACE_CONTEXT_BITS)) - 1)
369
370#define TRACE_GLOBAL_START TRACE_GLOBAL_BIT
371#define TRACE_GLOBAL_MAX ((1 << (TRACE_GLOBAL_START + TRACE_CONTEXT_BITS)) - 1)
372
373#define TRACE_LIST_START TRACE_INTERNAL_BIT
374#define TRACE_LIST_MAX ((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1)
375
376#define TRACE_CONTEXT_MASK TRACE_LIST_MAX
377
378static __always_inline int trace_get_context_bit(void)
379{
380 int bit;
381
382 if (in_interrupt()) {
383 if (in_nmi())
384 bit = 0;
385
386 else if (in_irq())
387 bit = 1;
388 else
389 bit = 2;
390 } else
391 bit = 3;
392
393 return bit;
394}
395
396static __always_inline int trace_test_and_set_recursion(int start, int max)
397{
398 unsigned int val = current->trace_recursion;
399 int bit;
400
401 /* A previous recursion check was made */
402 if ((val & TRACE_CONTEXT_MASK) > max)
403 return 0;
404
405 bit = trace_get_context_bit() + start;
406 if (unlikely(val & (1 << bit)))
407 return -1;
408
409 val |= 1 << bit;
410 current->trace_recursion = val;
411 barrier();
412
413 return bit;
414}
415
416static __always_inline void trace_clear_recursion(int bit)
417{
418 unsigned int val = current->trace_recursion;
419
420 if (!bit)
421 return;
422
423 bit = 1 << bit;
424 val &= ~bit;
425
426 barrier();
427 current->trace_recursion = val;
428}
317 429
318#define TRACE_PIPE_ALL_CPU -1 430#define TRACE_PIPE_ALL_CPU -1
319 431
@@ -327,7 +439,6 @@ trace_buffer_iter(struct trace_iterator *iter, int cpu)
327 439
328int tracer_init(struct tracer *t, struct trace_array *tr); 440int tracer_init(struct tracer *t, struct trace_array *tr);
329int tracing_is_enabled(void); 441int tracing_is_enabled(void);
330void trace_wake_up(void);
331void tracing_reset(struct trace_array *tr, int cpu); 442void tracing_reset(struct trace_array *tr, int cpu);
332void tracing_reset_online_cpus(struct trace_array *tr); 443void tracing_reset_online_cpus(struct trace_array *tr);
333void tracing_reset_current(int cpu); 444void tracing_reset_current(int cpu);
@@ -349,9 +460,6 @@ trace_buffer_lock_reserve(struct ring_buffer *buffer,
349 unsigned long len, 460 unsigned long len,
350 unsigned long flags, 461 unsigned long flags,
351 int pc); 462 int pc);
352void trace_buffer_unlock_commit(struct ring_buffer *buffer,
353 struct ring_buffer_event *event,
354 unsigned long flags, int pc);
355 463
356struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, 464struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
357 struct trace_array_cpu *data); 465 struct trace_array_cpu *data);
@@ -359,6 +467,9 @@ struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
359struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, 467struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
360 int *ent_cpu, u64 *ent_ts); 468 int *ent_cpu, u64 *ent_ts);
361 469
470void __buffer_unlock_commit(struct ring_buffer *buffer,
471 struct ring_buffer_event *event);
472
362int trace_empty(struct trace_iterator *iter); 473int trace_empty(struct trace_iterator *iter);
363 474
364void *trace_find_next_entry_inc(struct trace_iterator *iter); 475void *trace_find_next_entry_inc(struct trace_iterator *iter);
@@ -367,7 +478,6 @@ void trace_init_global_iter(struct trace_iterator *iter);
367 478
368void tracing_iter_reset(struct trace_iterator *iter, int cpu); 479void tracing_iter_reset(struct trace_iterator *iter, int cpu);
369 480
370void default_wait_pipe(struct trace_iterator *iter);
371void poll_wait_pipe(struct trace_iterator *iter); 481void poll_wait_pipe(struct trace_iterator *iter);
372 482
373void ftrace(struct trace_array *tr, 483void ftrace(struct trace_array *tr,
@@ -407,12 +517,7 @@ void tracing_sched_switch_assign_trace(struct trace_array *tr);
407void tracing_stop_sched_switch_record(void); 517void tracing_stop_sched_switch_record(void);
408void tracing_start_sched_switch_record(void); 518void tracing_start_sched_switch_record(void);
409int register_tracer(struct tracer *type); 519int register_tracer(struct tracer *type);
410void unregister_tracer(struct tracer *type);
411int is_tracing_stopped(void); 520int is_tracing_stopped(void);
412enum trace_file_type {
413 TRACE_FILE_LAT_FMT = 1,
414 TRACE_FILE_ANNOTATE = 2,
415};
416 521
417extern cpumask_var_t __read_mostly tracing_buffer_mask; 522extern cpumask_var_t __read_mostly tracing_buffer_mask;
418 523
@@ -841,6 +946,9 @@ extern const char *__start___trace_bprintk_fmt[];
841extern const char *__stop___trace_bprintk_fmt[]; 946extern const char *__stop___trace_bprintk_fmt[];
842 947
843void trace_printk_init_buffers(void); 948void trace_printk_init_buffers(void);
949void trace_printk_start_comm(void);
950int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set);
951int set_tracer_flag(unsigned int mask, int enabled);
844 952
845#undef FTRACE_ENTRY 953#undef FTRACE_ENTRY
846#define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \ 954#define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \
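The comment block and inline helpers above are the heart of this change. The following standalone C sketch (an illustration only, not kernel code) mimics the per-context recursion guard so the bit layout is easier to follow: the context argument stands in for the in_nmi()/in_irq()/in_serving_softirq() checks, and the "skip the check if a higher layer already claimed a bit" optimization is omitted for brevity.

/*
 * Standalone illustration of the per-context recursion guard: each tracing
 * layer owns one bit per context (NMI, hard irq, soft irq, normal), and a
 * callback bails out if its bit for the current context is already set.
 */
#include <stdio.h>

enum ctx { CTX_NMI, CTX_IRQ, CTX_SIRQ, CTX_NORMAL, CTX_BITS };

/* One recursion word per "task"; the kernel keeps it in task_struct. */
static unsigned int trace_recursion;

/* Returns the claimed bit, or -1 if this context is already inside the tracer. */
static int test_and_set_recursion(int start, enum ctx context)
{
	int bit = start + context;

	if (trace_recursion & (1u << bit))
		return -1;
	trace_recursion |= 1u << bit;
	return bit;
}

static void clear_recursion(int bit)
{
	if (bit >= 0)
		trace_recursion &= ~(1u << bit);
}

static void traced_callback(enum ctx context)
{
	int bit = test_and_set_recursion(0, context);

	if (bit < 0) {
		puts("recursion detected, bailing out");
		return;
	}
	puts("doing the real tracing work");
	/* A nested call from the same context hits the bit we just set. */
	traced_callback(context);
	clear_recursion(bit);
}

int main(void)
{
	traced_callback(CTX_NORMAL);	/* does the work once, then bails out */
	return 0;
}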
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 8d3538b4ea5f..95e96842ed29 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -77,7 +77,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
77 entry->correct = val == expect; 77 entry->correct = val == expect;
78 78
79 if (!filter_check_discard(call, entry, buffer, event)) 79 if (!filter_check_discard(call, entry, buffer, event))
80 ring_buffer_unlock_commit(buffer, event); 80 __buffer_unlock_commit(buffer, event);
81 81
82 out: 82 out:
83 atomic_dec(&tr->data[cpu]->disabled); 83 atomic_dec(&tr->data[cpu]->disabled);
@@ -199,7 +199,7 @@ __init static int init_branch_tracer(void)
199 } 199 }
200 return register_tracer(&branch_trace); 200 return register_tracer(&branch_trace);
201} 201}
202device_initcall(init_branch_tracer); 202core_initcall(init_branch_tracer);
203 203
204#else 204#else
205static inline 205static inline
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 394783531cbb..aa8f5f48dae6 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -21,8 +21,6 @@
21#include <linux/ktime.h> 21#include <linux/ktime.h>
22#include <linux/trace_clock.h> 22#include <linux/trace_clock.h>
23 23
24#include "trace.h"
25
26/* 24/*
27 * trace_clock_local(): the simplest and least coherent tracing clock. 25 * trace_clock_local(): the simplest and least coherent tracing clock.
28 * 26 *
@@ -44,6 +42,7 @@ u64 notrace trace_clock_local(void)
44 42
45 return clock; 43 return clock;
46} 44}
45EXPORT_SYMBOL_GPL(trace_clock_local);
47 46
48/* 47/*
49 * trace_clock(): 'between' trace clock. Not completely serialized, 48 * trace_clock(): 'between' trace clock. Not completely serialized,
@@ -86,7 +85,7 @@ u64 notrace trace_clock_global(void)
86 local_irq_save(flags); 85 local_irq_save(flags);
87 86
88 this_cpu = raw_smp_processor_id(); 87 this_cpu = raw_smp_processor_id();
89 now = cpu_clock(this_cpu); 88 now = sched_clock_cpu(this_cpu);
90 /* 89 /*
91 * If in an NMI context then don't risk lockups and return the 90 * cpu_clock() time:
92 * cpu_clock() time: 91 * cpu_clock() time:
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index d608d09d08c0..57e9b284250c 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -116,7 +116,6 @@ static int trace_define_common_fields(void)
116 __common_field(unsigned char, flags); 116 __common_field(unsigned char, flags);
117 __common_field(unsigned char, preempt_count); 117 __common_field(unsigned char, preempt_count);
118 __common_field(int, pid); 118 __common_field(int, pid);
119 __common_field(int, padding);
120 119
121 return ret; 120 return ret;
122} 121}
@@ -491,19 +490,6 @@ static void t_stop(struct seq_file *m, void *p)
491 mutex_unlock(&event_mutex); 490 mutex_unlock(&event_mutex);
492} 491}
493 492
494static int
495ftrace_event_seq_open(struct inode *inode, struct file *file)
496{
497 const struct seq_operations *seq_ops;
498
499 if ((file->f_mode & FMODE_WRITE) &&
500 (file->f_flags & O_TRUNC))
501 ftrace_clear_events();
502
503 seq_ops = inode->i_private;
504 return seq_open(file, seq_ops);
505}
506
507static ssize_t 493static ssize_t
508event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, 494event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
509 loff_t *ppos) 495 loff_t *ppos)
@@ -980,6 +966,9 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
980 return r; 966 return r;
981} 967}
982 968
969static int ftrace_event_avail_open(struct inode *inode, struct file *file);
970static int ftrace_event_set_open(struct inode *inode, struct file *file);
971
983static const struct seq_operations show_event_seq_ops = { 972static const struct seq_operations show_event_seq_ops = {
984 .start = t_start, 973 .start = t_start,
985 .next = t_next, 974 .next = t_next,
@@ -995,14 +984,14 @@ static const struct seq_operations show_set_event_seq_ops = {
995}; 984};
996 985
997static const struct file_operations ftrace_avail_fops = { 986static const struct file_operations ftrace_avail_fops = {
998 .open = ftrace_event_seq_open, 987 .open = ftrace_event_avail_open,
999 .read = seq_read, 988 .read = seq_read,
1000 .llseek = seq_lseek, 989 .llseek = seq_lseek,
1001 .release = seq_release, 990 .release = seq_release,
1002}; 991};
1003 992
1004static const struct file_operations ftrace_set_event_fops = { 993static const struct file_operations ftrace_set_event_fops = {
1005 .open = ftrace_event_seq_open, 994 .open = ftrace_event_set_open,
1006 .read = seq_read, 995 .read = seq_read,
1007 .write = ftrace_event_write, 996 .write = ftrace_event_write,
1008 .llseek = seq_lseek, 997 .llseek = seq_lseek,
@@ -1078,6 +1067,26 @@ static struct dentry *event_trace_events_dir(void)
1078 return d_events; 1067 return d_events;
1079} 1068}
1080 1069
1070static int
1071ftrace_event_avail_open(struct inode *inode, struct file *file)
1072{
1073 const struct seq_operations *seq_ops = &show_event_seq_ops;
1074
1075 return seq_open(file, seq_ops);
1076}
1077
1078static int
1079ftrace_event_set_open(struct inode *inode, struct file *file)
1080{
1081 const struct seq_operations *seq_ops = &show_set_event_seq_ops;
1082
1083 if ((file->f_mode & FMODE_WRITE) &&
1084 (file->f_flags & O_TRUNC))
1085 ftrace_clear_events();
1086
1087 return seq_open(file, seq_ops);
1088}
1089
1081static struct dentry * 1090static struct dentry *
1082event_subsystem_dir(const char *name, struct dentry *d_events) 1091event_subsystem_dir(const char *name, struct dentry *d_events)
1083{ 1092{
@@ -1489,6 +1498,9 @@ static __init int event_trace_enable(void)
1489 if (ret) 1498 if (ret)
1490 pr_warn("Failed to enable trace event: %s\n", token); 1499 pr_warn("Failed to enable trace event: %s\n", token);
1491 } 1500 }
1501
1502 trace_printk_start_comm();
1503
1492 return 0; 1504 return 0;
1493} 1505}
1494 1506
@@ -1505,15 +1517,13 @@ static __init int event_trace_init(void)
1505 return 0; 1517 return 0;
1506 1518
1507 entry = debugfs_create_file("available_events", 0444, d_tracer, 1519 entry = debugfs_create_file("available_events", 0444, d_tracer,
1508 (void *)&show_event_seq_ops, 1520 NULL, &ftrace_avail_fops);
1509 &ftrace_avail_fops);
1510 if (!entry) 1521 if (!entry)
1511 pr_warning("Could not create debugfs " 1522 pr_warning("Could not create debugfs "
1512 "'available_events' entry\n"); 1523 "'available_events' entry\n");
1513 1524
1514 entry = debugfs_create_file("set_event", 0644, d_tracer, 1525 entry = debugfs_create_file("set_event", 0644, d_tracer,
1515 (void *)&show_set_event_seq_ops, 1526 NULL, &ftrace_set_event_fops);
1516 &ftrace_set_event_fops);
1517 if (!entry) 1527 if (!entry)
1518 pr_warning("Could not create debugfs " 1528 pr_warning("Could not create debugfs "
1519 "'set_event' entry\n"); 1529 "'set_event' entry\n");
@@ -1749,7 +1759,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip,
1749 entry->ip = ip; 1759 entry->ip = ip;
1750 entry->parent_ip = parent_ip; 1760 entry->parent_ip = parent_ip;
1751 1761
1752 trace_nowake_buffer_unlock_commit(buffer, event, flags, pc); 1762 trace_buffer_unlock_commit(buffer, event, flags, pc);
1753 1763
1754 out: 1764 out:
1755 atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); 1765 atomic_dec(&per_cpu(ftrace_test_event_disable, cpu));
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index c154797a7ff7..e5b0ca8b8d4d 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1000,9 +1000,9 @@ static int init_pred(struct filter_parse_state *ps,
1000 } 1000 }
1001 } else { 1001 } else {
1002 if (field->is_signed) 1002 if (field->is_signed)
1003 ret = strict_strtoll(pred->regex.pattern, 0, &val); 1003 ret = kstrtoll(pred->regex.pattern, 0, &val);
1004 else 1004 else
1005 ret = strict_strtoull(pred->regex.pattern, 0, &val); 1005 ret = kstrtoull(pred->regex.pattern, 0, &val);
1006 if (ret) { 1006 if (ret) {
1007 parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0); 1007 parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0);
1008 return -EINVAL; 1008 return -EINVAL;
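The strict_strtoll()/strict_strtoull() calls here (and in the probe and function tracer files below) are switched to the kstrto*() helpers. As a rough userspace analogue of what those helpers enforce -- the whole string must be numeric, with at most a trailing newline -- here is a hedged sketch using strtoll(); the kstrto* functions themselves are kernel-only, so this is an approximation rather than the kernel implementation.

/*
 * Userspace analogue (not kernel code) of kstrtoll()-style strict parsing:
 * reject any trailing junk other than a single newline.
 */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Returns 0 on success and stores the value, or -EINVAL/-ERANGE on error. */
static int parse_ll(const char *s, long long *val)
{
	char *end;

	errno = 0;
	*val = strtoll(s, &end, 0);
	if (errno == ERANGE)
		return -ERANGE;
	if (end == s || (*end != '\0' && strcmp(end, "\n") != 0))
		return -EINVAL;
	return 0;
}

int main(void)
{
	long long v;

	printf("\"42\"   -> %d\n", parse_ll("42", &v));	/* 0: ok */
	printf("\"42x\"  -> %d\n", parse_ll("42x", &v));	/* -EINVAL: junk */
	printf("\"0x10\" -> %d\n", parse_ll("0x10", &v));	/* 0: base 0 accepts hex */
	return 0;
}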
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 507a7a9630bf..601152523326 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -7,7 +7,7 @@
7 * Based on code from the latency_tracer, that is: 7 * Based on code from the latency_tracer, that is:
8 * 8 *
9 * Copyright (C) 2004-2006 Ingo Molnar 9 * Copyright (C) 2004-2006 Ingo Molnar
10 * Copyright (C) 2004 William Lee Irwin III 10 * Copyright (C) 2004 Nadia Yvette Chambers
11 */ 11 */
12#include <linux/ring_buffer.h> 12#include <linux/ring_buffer.h>
13#include <linux/debugfs.h> 13#include <linux/debugfs.h>
@@ -47,34 +47,6 @@ static void function_trace_start(struct trace_array *tr)
47 tracing_reset_online_cpus(tr); 47 tracing_reset_online_cpus(tr);
48} 48}
49 49
50static void
51function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip,
52 struct ftrace_ops *op, struct pt_regs *pt_regs)
53{
54 struct trace_array *tr = func_trace;
55 struct trace_array_cpu *data;
56 unsigned long flags;
57 long disabled;
58 int cpu;
59 int pc;
60
61 if (unlikely(!ftrace_function_enabled))
62 return;
63
64 pc = preempt_count();
65 preempt_disable_notrace();
66 local_save_flags(flags);
67 cpu = raw_smp_processor_id();
68 data = tr->data[cpu];
69 disabled = atomic_inc_return(&data->disabled);
70
71 if (likely(disabled == 1))
72 trace_function(tr, ip, parent_ip, flags, pc);
73
74 atomic_dec(&data->disabled);
75 preempt_enable_notrace();
76}
77
78/* Our option */ 50/* Our option */
79enum { 51enum {
80 TRACE_FUNC_OPT_STACK = 0x1, 52 TRACE_FUNC_OPT_STACK = 0x1,
@@ -85,34 +57,34 @@ static struct tracer_flags func_flags;
85static void 57static void
86function_trace_call(unsigned long ip, unsigned long parent_ip, 58function_trace_call(unsigned long ip, unsigned long parent_ip,
87 struct ftrace_ops *op, struct pt_regs *pt_regs) 59 struct ftrace_ops *op, struct pt_regs *pt_regs)
88
89{ 60{
90 struct trace_array *tr = func_trace; 61 struct trace_array *tr = func_trace;
91 struct trace_array_cpu *data; 62 struct trace_array_cpu *data;
92 unsigned long flags; 63 unsigned long flags;
93 long disabled; 64 int bit;
94 int cpu; 65 int cpu;
95 int pc; 66 int pc;
96 67
97 if (unlikely(!ftrace_function_enabled)) 68 if (unlikely(!ftrace_function_enabled))
98 return; 69 return;
99 70
100 /* 71 pc = preempt_count();
101 * Need to use raw, since this must be called before the 72 preempt_disable_notrace();
102 * recursive protection is performed.
103 */
104 local_irq_save(flags);
105 cpu = raw_smp_processor_id();
106 data = tr->data[cpu];
107 disabled = atomic_inc_return(&data->disabled);
108 73
109 if (likely(disabled == 1)) { 74 bit = trace_test_and_set_recursion(TRACE_FTRACE_START, TRACE_FTRACE_MAX);
110 pc = preempt_count(); 75 if (bit < 0)
76 goto out;
77
78 cpu = smp_processor_id();
79 data = tr->data[cpu];
80 if (!atomic_read(&data->disabled)) {
81 local_save_flags(flags);
111 trace_function(tr, ip, parent_ip, flags, pc); 82 trace_function(tr, ip, parent_ip, flags, pc);
112 } 83 }
84 trace_clear_recursion(bit);
113 85
114 atomic_dec(&data->disabled); 86 out:
115 local_irq_restore(flags); 87 preempt_enable_notrace();
116} 88}
117 89
118static void 90static void
@@ -185,11 +157,6 @@ static void tracing_start_function_trace(void)
185{ 157{
186 ftrace_function_enabled = 0; 158 ftrace_function_enabled = 0;
187 159
188 if (trace_flags & TRACE_ITER_PREEMPTONLY)
189 trace_ops.func = function_trace_call_preempt_only;
190 else
191 trace_ops.func = function_trace_call;
192
193 if (func_flags.val & TRACE_FUNC_OPT_STACK) 160 if (func_flags.val & TRACE_FUNC_OPT_STACK)
194 register_ftrace_function(&trace_stack_ops); 161 register_ftrace_function(&trace_stack_ops);
195 else 162 else
@@ -366,7 +333,7 @@ ftrace_trace_onoff_callback(struct ftrace_hash *hash,
366 * We use the callback data field (which is a pointer) 333 * We use the callback data field (which is a pointer)
367 * as our counter. 334 * as our counter.
368 */ 335 */
369 ret = strict_strtoul(number, 0, (unsigned long *)&count); 336 ret = kstrtoul(number, 0, (unsigned long *)&count);
370 if (ret) 337 if (ret)
371 return ret; 338 return ret;
372 339
@@ -411,5 +378,4 @@ static __init int init_function_trace(void)
411 init_func_cmd_traceon(); 378 init_func_cmd_traceon();
412 return register_tracer(&function_trace); 379 return register_tracer(&function_trace);
413} 380}
414device_initcall(init_function_trace); 381core_initcall(init_function_trace);
415
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 99b4378393d5..39ada66389cc 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -47,6 +47,8 @@ struct fgraph_data {
47#define TRACE_GRAPH_PRINT_ABS_TIME 0x20 47#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
48#define TRACE_GRAPH_PRINT_IRQS 0x40 48#define TRACE_GRAPH_PRINT_IRQS 0x40
49 49
50static unsigned int max_depth;
51
50static struct tracer_opt trace_opts[] = { 52static struct tracer_opt trace_opts[] = {
51 /* Display overruns? (for self-debug purpose) */ 53 /* Display overruns? (for self-debug purpose) */
52 { TRACER_OPT(funcgraph-overrun, TRACE_GRAPH_PRINT_OVERRUN) }, 54 { TRACER_OPT(funcgraph-overrun, TRACE_GRAPH_PRINT_OVERRUN) },
@@ -189,10 +191,16 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
189 191
190 ftrace_pop_return_trace(&trace, &ret, frame_pointer); 192 ftrace_pop_return_trace(&trace, &ret, frame_pointer);
191 trace.rettime = trace_clock_local(); 193 trace.rettime = trace_clock_local();
192 ftrace_graph_return(&trace);
193 barrier(); 194 barrier();
194 current->curr_ret_stack--; 195 current->curr_ret_stack--;
195 196
197 /*
198 * The trace should run after decrementing the ret counter
199 * in case an interrupt were to come in. We don't want to
200 * lose the interrupt if max_depth is set.
201 */
202 ftrace_graph_return(&trace);
203
196 if (unlikely(!ret)) { 204 if (unlikely(!ret)) {
197 ftrace_graph_stop(); 205 ftrace_graph_stop();
198 WARN_ON(1); 206 WARN_ON(1);
@@ -223,7 +231,7 @@ int __trace_graph_entry(struct trace_array *tr,
223 entry = ring_buffer_event_data(event); 231 entry = ring_buffer_event_data(event);
224 entry->graph_ent = *trace; 232 entry->graph_ent = *trace;
225 if (!filter_current_check_discard(buffer, call, entry, event)) 233 if (!filter_current_check_discard(buffer, call, entry, event))
226 ring_buffer_unlock_commit(buffer, event); 234 __buffer_unlock_commit(buffer, event);
227 235
228 return 1; 236 return 1;
229} 237}
@@ -250,8 +258,9 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
250 return 0; 258 return 0;
251 259
252 /* trace it when it is-nested-in or is a function enabled. */ 260 /* trace it when it is-nested-in or is a function enabled. */
253 if (!(trace->depth || ftrace_graph_addr(trace->func)) || 261 if ((!(trace->depth || ftrace_graph_addr(trace->func)) ||
254 ftrace_graph_ignore_irqs()) 262 ftrace_graph_ignore_irqs()) ||
263 (max_depth && trace->depth >= max_depth))
255 return 0; 264 return 0;
256 265
257 local_irq_save(flags); 266 local_irq_save(flags);
@@ -327,7 +336,7 @@ void __trace_graph_return(struct trace_array *tr,
327 entry = ring_buffer_event_data(event); 336 entry = ring_buffer_event_data(event);
328 entry->ret = *trace; 337 entry->ret = *trace;
329 if (!filter_current_check_discard(buffer, call, entry, event)) 338 if (!filter_current_check_discard(buffer, call, entry, event))
330 ring_buffer_unlock_commit(buffer, event); 339 __buffer_unlock_commit(buffer, event);
331} 340}
332 341
333void trace_graph_return(struct ftrace_graph_ret *trace) 342void trace_graph_return(struct ftrace_graph_ret *trace)
@@ -1457,6 +1466,59 @@ static struct tracer graph_trace __read_mostly = {
1457#endif 1466#endif
1458}; 1467};
1459 1468
1469
1470static ssize_t
1471graph_depth_write(struct file *filp, const char __user *ubuf, size_t cnt,
1472 loff_t *ppos)
1473{
1474 unsigned long val;
1475 int ret;
1476
1477 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
1478 if (ret)
1479 return ret;
1480
1481 max_depth = val;
1482
1483 *ppos += cnt;
1484
1485 return cnt;
1486}
1487
1488static ssize_t
1489graph_depth_read(struct file *filp, char __user *ubuf, size_t cnt,
1490 loff_t *ppos)
1491{
1492 char buf[15]; /* More than enough to hold UINT_MAX + "\n"*/
1493 int n;
1494
1495 n = sprintf(buf, "%d\n", max_depth);
1496
1497 return simple_read_from_buffer(ubuf, cnt, ppos, buf, n);
1498}
1499
1500static const struct file_operations graph_depth_fops = {
1501 .open = tracing_open_generic,
1502 .write = graph_depth_write,
1503 .read = graph_depth_read,
1504 .llseek = generic_file_llseek,
1505};
1506
1507static __init int init_graph_debugfs(void)
1508{
1509 struct dentry *d_tracer;
1510
1511 d_tracer = tracing_init_dentry();
1512 if (!d_tracer)
1513 return 0;
1514
1515 trace_create_file("max_graph_depth", 0644, d_tracer,
1516 NULL, &graph_depth_fops);
1517
1518 return 0;
1519}
1520fs_initcall(init_graph_debugfs);
1521
1460static __init int init_graph_trace(void) 1522static __init int init_graph_trace(void)
1461{ 1523{
1462 max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1); 1524 max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1);
@@ -1474,4 +1536,4 @@ static __init int init_graph_trace(void)
1474 return register_tracer(&graph_trace); 1536 return register_tracer(&graph_trace);
1475} 1537}
1476 1538
1477device_initcall(init_graph_trace); 1539core_initcall(init_graph_trace);
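The new max_graph_depth control registered above can be driven from userspace. A minimal sketch follows; the path is an assumption (it presumes debugfs is mounted at /sys/kernel/debug), and per the code above a depth of 0 leaves the graph tracer unlimited.

/*
 * Userspace sketch -- not part of the patch. Limits the function graph
 * tracer to a fixed call depth via the new max_graph_depth file.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/sys/kernel/debug/tracing/max_graph_depth";
	const char *depth = "3\n";	/* trace at most three call levels deep */
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open max_graph_depth");
		return 1;
	}
	if (write(fd, depth, strlen(depth)) < 0)
		perror("write max_graph_depth");
	close(fd);

	/* Writing "0" restores the default "no limit" behaviour. */
	return 0;
}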
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index d98ee8283b29..443b25b43b4f 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -7,7 +7,7 @@
7 * From code in the latency_tracer, that is: 7 * From code in the latency_tracer, that is:
8 * 8 *
9 * Copyright (C) 2004-2006 Ingo Molnar 9 * Copyright (C) 2004-2006 Ingo Molnar
10 * Copyright (C) 2004 William Lee Irwin III 10 * Copyright (C) 2004 Nadia Yvette Chambers
11 */ 11 */
12#include <linux/kallsyms.h> 12#include <linux/kallsyms.h>
13#include <linux/debugfs.h> 13#include <linux/debugfs.h>
@@ -32,7 +32,7 @@ enum {
32 32
33static int trace_type __read_mostly; 33static int trace_type __read_mostly;
34 34
35static int save_lat_flag; 35static int save_flags;
36 36
37static void stop_irqsoff_tracer(struct trace_array *tr, int graph); 37static void stop_irqsoff_tracer(struct trace_array *tr, int graph);
38static int start_irqsoff_tracer(struct trace_array *tr, int graph); 38static int start_irqsoff_tracer(struct trace_array *tr, int graph);
@@ -558,8 +558,11 @@ static void stop_irqsoff_tracer(struct trace_array *tr, int graph)
558 558
559static void __irqsoff_tracer_init(struct trace_array *tr) 559static void __irqsoff_tracer_init(struct trace_array *tr)
560{ 560{
561 save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT; 561 save_flags = trace_flags;
562 trace_flags |= TRACE_ITER_LATENCY_FMT; 562
563 /* non overwrite screws up the latency tracers */
564 set_tracer_flag(TRACE_ITER_OVERWRITE, 1);
565 set_tracer_flag(TRACE_ITER_LATENCY_FMT, 1);
563 566
564 tracing_max_latency = 0; 567 tracing_max_latency = 0;
565 irqsoff_trace = tr; 568 irqsoff_trace = tr;
@@ -573,10 +576,13 @@ static void __irqsoff_tracer_init(struct trace_array *tr)
573 576
574static void irqsoff_tracer_reset(struct trace_array *tr) 577static void irqsoff_tracer_reset(struct trace_array *tr)
575{ 578{
579 int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT;
580 int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE;
581
576 stop_irqsoff_tracer(tr, is_graph()); 582 stop_irqsoff_tracer(tr, is_graph());
577 583
578 if (!save_lat_flag) 584 set_tracer_flag(TRACE_ITER_LATENCY_FMT, lat_flag);
579 trace_flags &= ~TRACE_ITER_LATENCY_FMT; 585 set_tracer_flag(TRACE_ITER_OVERWRITE, overwrite_flag);
580} 586}
581 587
582static void irqsoff_tracer_start(struct trace_array *tr) 588static void irqsoff_tracer_start(struct trace_array *tr)
@@ -604,17 +610,18 @@ static struct tracer irqsoff_tracer __read_mostly =
604 .reset = irqsoff_tracer_reset, 610 .reset = irqsoff_tracer_reset,
605 .start = irqsoff_tracer_start, 611 .start = irqsoff_tracer_start,
606 .stop = irqsoff_tracer_stop, 612 .stop = irqsoff_tracer_stop,
607 .print_max = 1, 613 .print_max = true,
608 .print_header = irqsoff_print_header, 614 .print_header = irqsoff_print_header,
609 .print_line = irqsoff_print_line, 615 .print_line = irqsoff_print_line,
610 .flags = &tracer_flags, 616 .flags = &tracer_flags,
611 .set_flag = irqsoff_set_flag, 617 .set_flag = irqsoff_set_flag,
618 .flag_changed = trace_keep_overwrite,
612#ifdef CONFIG_FTRACE_SELFTEST 619#ifdef CONFIG_FTRACE_SELFTEST
613 .selftest = trace_selftest_startup_irqsoff, 620 .selftest = trace_selftest_startup_irqsoff,
614#endif 621#endif
615 .open = irqsoff_trace_open, 622 .open = irqsoff_trace_open,
616 .close = irqsoff_trace_close, 623 .close = irqsoff_trace_close,
617 .use_max_tr = 1, 624 .use_max_tr = true,
618}; 625};
619# define register_irqsoff(trace) register_tracer(&trace) 626# define register_irqsoff(trace) register_tracer(&trace)
620#else 627#else
@@ -637,17 +644,18 @@ static struct tracer preemptoff_tracer __read_mostly =
637 .reset = irqsoff_tracer_reset, 644 .reset = irqsoff_tracer_reset,
638 .start = irqsoff_tracer_start, 645 .start = irqsoff_tracer_start,
639 .stop = irqsoff_tracer_stop, 646 .stop = irqsoff_tracer_stop,
640 .print_max = 1, 647 .print_max = true,
641 .print_header = irqsoff_print_header, 648 .print_header = irqsoff_print_header,
642 .print_line = irqsoff_print_line, 649 .print_line = irqsoff_print_line,
643 .flags = &tracer_flags, 650 .flags = &tracer_flags,
644 .set_flag = irqsoff_set_flag, 651 .set_flag = irqsoff_set_flag,
652 .flag_changed = trace_keep_overwrite,
645#ifdef CONFIG_FTRACE_SELFTEST 653#ifdef CONFIG_FTRACE_SELFTEST
646 .selftest = trace_selftest_startup_preemptoff, 654 .selftest = trace_selftest_startup_preemptoff,
647#endif 655#endif
648 .open = irqsoff_trace_open, 656 .open = irqsoff_trace_open,
649 .close = irqsoff_trace_close, 657 .close = irqsoff_trace_close,
650 .use_max_tr = 1, 658 .use_max_tr = true,
651}; 659};
652# define register_preemptoff(trace) register_tracer(&trace) 660# define register_preemptoff(trace) register_tracer(&trace)
653#else 661#else
@@ -672,17 +680,18 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
672 .reset = irqsoff_tracer_reset, 680 .reset = irqsoff_tracer_reset,
673 .start = irqsoff_tracer_start, 681 .start = irqsoff_tracer_start,
674 .stop = irqsoff_tracer_stop, 682 .stop = irqsoff_tracer_stop,
675 .print_max = 1, 683 .print_max = true,
676 .print_header = irqsoff_print_header, 684 .print_header = irqsoff_print_header,
677 .print_line = irqsoff_print_line, 685 .print_line = irqsoff_print_line,
678 .flags = &tracer_flags, 686 .flags = &tracer_flags,
679 .set_flag = irqsoff_set_flag, 687 .set_flag = irqsoff_set_flag,
688 .flag_changed = trace_keep_overwrite,
680#ifdef CONFIG_FTRACE_SELFTEST 689#ifdef CONFIG_FTRACE_SELFTEST
681 .selftest = trace_selftest_startup_preemptirqsoff, 690 .selftest = trace_selftest_startup_preemptirqsoff,
682#endif 691#endif
683 .open = irqsoff_trace_open, 692 .open = irqsoff_trace_open,
684 .close = irqsoff_trace_close, 693 .close = irqsoff_trace_close,
685 .use_max_tr = 1, 694 .use_max_tr = true,
686}; 695};
687 696
688# define register_preemptirqsoff(trace) register_tracer(&trace) 697# define register_preemptirqsoff(trace) register_tracer(&trace)
@@ -698,4 +707,4 @@ __init static int init_irqsoff_tracer(void)
698 707
699 return 0; 708 return 0;
700} 709}
701device_initcall(init_irqsoff_tracer); 710core_initcall(init_irqsoff_tracer);
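The irqsoff tracers (and, below, the wakeup tracers) now snapshot trace_flags at init, force the OVERWRITE and LATENCY_FMT bits via set_tracer_flag(), and restore the user's choices on reset. The following standalone sketch shows that save/force/restore pattern with made-up flag names; it is an illustration of the idea, not the kernel's API.

/*
 * Standalone sketch of the flag save/force/restore pattern used by the
 * latency tracers above.
 */
#include <stdio.h>

#define ITER_OVERWRITE   (1u << 0)
#define ITER_LATENCY_FMT (1u << 1)

static unsigned int trace_flags = ITER_OVERWRITE;	/* user had overwrite on */
static unsigned int save_flags;

static void set_flag(unsigned int mask, int enabled)
{
	if (enabled)
		trace_flags |= mask;
	else
		trace_flags &= ~mask;
}

static void tracer_init_flags(void)
{
	save_flags = trace_flags;
	set_flag(ITER_OVERWRITE, 1);	/* non-overwrite breaks latency tracing */
	set_flag(ITER_LATENCY_FMT, 1);
}

static void tracer_reset_flags(void)
{
	/* Put each bit back to whatever the user had before init. */
	set_flag(ITER_LATENCY_FMT, save_flags & ITER_LATENCY_FMT);
	set_flag(ITER_OVERWRITE, save_flags & ITER_OVERWRITE);
}

int main(void)
{
	tracer_init_flags();
	printf("while tracing: %#x\n", trace_flags);
	tracer_reset_flags();
	printf("after reset:   %#x\n", trace_flags);	/* back to the user's value */
	return 0;
}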
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 1a2117043bb1..1865d5f76538 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -444,7 +444,7 @@ static int create_trace_probe(int argc, char **argv)
444 return -EINVAL; 444 return -EINVAL;
445 } 445 }
446 /* an address specified */ 446 /* an address specified */
447 ret = strict_strtoul(&argv[1][0], 0, (unsigned long *)&addr); 447 ret = kstrtoul(&argv[1][0], 0, (unsigned long *)&addr);
448 if (ret) { 448 if (ret) {
449 pr_info("Failed to parse address.\n"); 449 pr_info("Failed to parse address.\n");
450 return ret; 450 return ret;
@@ -751,8 +751,8 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
751 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 751 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
752 752
753 if (!filter_current_check_discard(buffer, call, entry, event)) 753 if (!filter_current_check_discard(buffer, call, entry, event))
754 trace_nowake_buffer_unlock_commit_regs(buffer, event, 754 trace_buffer_unlock_commit_regs(buffer, event,
755 irq_flags, pc, regs); 755 irq_flags, pc, regs);
756} 756}
757 757
758/* Kretprobe handler */ 758/* Kretprobe handler */
@@ -784,8 +784,8 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
784 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 784 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
785 785
786 if (!filter_current_check_discard(buffer, call, entry, event)) 786 if (!filter_current_check_discard(buffer, call, entry, event))
787 trace_nowake_buffer_unlock_commit_regs(buffer, event, 787 trace_buffer_unlock_commit_regs(buffer, event,
788 irq_flags, pc, regs); 788 irq_flags, pc, regs);
789} 789}
790 790
791/* Event entry printers */ 791/* Event entry printers */
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 123b189c732c..697e88d13907 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -610,24 +610,54 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
610 return trace_print_lat_fmt(s, entry); 610 return trace_print_lat_fmt(s, entry);
611} 611}
612 612
613static unsigned long preempt_mark_thresh = 100; 613static unsigned long preempt_mark_thresh_us = 100;
614 614
615static int 615static int
616lat_print_timestamp(struct trace_seq *s, u64 abs_usecs, 616lat_print_timestamp(struct trace_iterator *iter, u64 next_ts)
617 unsigned long rel_usecs)
618{ 617{
619 return trace_seq_printf(s, " %4lldus%c: ", abs_usecs, 618 unsigned long verbose = trace_flags & TRACE_ITER_VERBOSE;
620 rel_usecs > preempt_mark_thresh ? '!' : 619 unsigned long in_ns = iter->iter_flags & TRACE_FILE_TIME_IN_NS;
621 rel_usecs > 1 ? '+' : ' '); 620 unsigned long long abs_ts = iter->ts - iter->tr->time_start;
621 unsigned long long rel_ts = next_ts - iter->ts;
622 struct trace_seq *s = &iter->seq;
623
624 if (in_ns) {
625 abs_ts = ns2usecs(abs_ts);
626 rel_ts = ns2usecs(rel_ts);
627 }
628
629 if (verbose && in_ns) {
630 unsigned long abs_usec = do_div(abs_ts, USEC_PER_MSEC);
631 unsigned long abs_msec = (unsigned long)abs_ts;
632 unsigned long rel_usec = do_div(rel_ts, USEC_PER_MSEC);
633 unsigned long rel_msec = (unsigned long)rel_ts;
634
635 return trace_seq_printf(
636 s, "[%08llx] %ld.%03ldms (+%ld.%03ldms): ",
637 ns2usecs(iter->ts),
638 abs_msec, abs_usec,
639 rel_msec, rel_usec);
640 } else if (verbose && !in_ns) {
641 return trace_seq_printf(
642 s, "[%016llx] %lld (+%lld): ",
643 iter->ts, abs_ts, rel_ts);
644 } else if (!verbose && in_ns) {
645 return trace_seq_printf(
646 s, " %4lldus%c: ",
647 abs_ts,
648 rel_ts > preempt_mark_thresh_us ? '!' :
649 rel_ts > 1 ? '+' : ' ');
650 } else { /* !verbose && !in_ns */
651 return trace_seq_printf(s, " %4lld: ", abs_ts);
652 }
622} 653}
623 654
624int trace_print_context(struct trace_iterator *iter) 655int trace_print_context(struct trace_iterator *iter)
625{ 656{
626 struct trace_seq *s = &iter->seq; 657 struct trace_seq *s = &iter->seq;
627 struct trace_entry *entry = iter->ent; 658 struct trace_entry *entry = iter->ent;
628 unsigned long long t = ns2usecs(iter->ts); 659 unsigned long long t;
629 unsigned long usec_rem = do_div(t, USEC_PER_SEC); 660 unsigned long secs, usec_rem;
630 unsigned long secs = (unsigned long)t;
631 char comm[TASK_COMM_LEN]; 661 char comm[TASK_COMM_LEN];
632 int ret; 662 int ret;
633 663
@@ -644,8 +674,13 @@ int trace_print_context(struct trace_iterator *iter)
644 return 0; 674 return 0;
645 } 675 }
646 676
647 return trace_seq_printf(s, " %5lu.%06lu: ", 677 if (iter->iter_flags & TRACE_FILE_TIME_IN_NS) {
648 secs, usec_rem); 678 t = ns2usecs(iter->ts);
679 usec_rem = do_div(t, USEC_PER_SEC);
680 secs = (unsigned long)t;
681 return trace_seq_printf(s, " %5lu.%06lu: ", secs, usec_rem);
682 } else
683 return trace_seq_printf(s, " %12llu: ", iter->ts);
649} 684}
650 685
651int trace_print_lat_context(struct trace_iterator *iter) 686int trace_print_lat_context(struct trace_iterator *iter)
@@ -659,36 +694,29 @@ int trace_print_lat_context(struct trace_iterator *iter)
659 *next_entry = trace_find_next_entry(iter, NULL, 694 *next_entry = trace_find_next_entry(iter, NULL,
660 &next_ts); 695 &next_ts);
661 unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE); 696 unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE);
662 unsigned long abs_usecs = ns2usecs(iter->ts - iter->tr->time_start);
663 unsigned long rel_usecs;
664 697
665 /* Restore the original ent_size */ 698 /* Restore the original ent_size */
666 iter->ent_size = ent_size; 699 iter->ent_size = ent_size;
667 700
668 if (!next_entry) 701 if (!next_entry)
669 next_ts = iter->ts; 702 next_ts = iter->ts;
670 rel_usecs = ns2usecs(next_ts - iter->ts);
671 703
672 if (verbose) { 704 if (verbose) {
673 char comm[TASK_COMM_LEN]; 705 char comm[TASK_COMM_LEN];
674 706
675 trace_find_cmdline(entry->pid, comm); 707 trace_find_cmdline(entry->pid, comm);
676 708
677 ret = trace_seq_printf(s, "%16s %5d %3d %d %08x %08lx [%08llx]" 709 ret = trace_seq_printf(
678 " %ld.%03ldms (+%ld.%03ldms): ", comm, 710 s, "%16s %5d %3d %d %08x %08lx ",
679 entry->pid, iter->cpu, entry->flags, 711 comm, entry->pid, iter->cpu, entry->flags,
680 entry->preempt_count, iter->idx, 712 entry->preempt_count, iter->idx);
681 ns2usecs(iter->ts),
682 abs_usecs / USEC_PER_MSEC,
683 abs_usecs % USEC_PER_MSEC,
684 rel_usecs / USEC_PER_MSEC,
685 rel_usecs % USEC_PER_MSEC);
686 } else { 713 } else {
687 ret = lat_print_generic(s, entry, iter->cpu); 714 ret = lat_print_generic(s, entry, iter->cpu);
688 if (ret)
689 ret = lat_print_timestamp(s, abs_usecs, rel_usecs);
690 } 715 }
691 716
717 if (ret)
718 ret = lat_print_timestamp(iter, next_ts);
719
692 return ret; 720 return ret;
693} 721}
694 722
@@ -711,12 +739,11 @@ static int task_state_char(unsigned long state)
711struct trace_event *ftrace_find_event(int type) 739struct trace_event *ftrace_find_event(int type)
712{ 740{
713 struct trace_event *event; 741 struct trace_event *event;
714 struct hlist_node *n;
715 unsigned key; 742 unsigned key;
716 743
717 key = type & (EVENT_HASHSIZE - 1); 744 key = type & (EVENT_HASHSIZE - 1);
718 745
719 hlist_for_each_entry(event, n, &event_hash[key], node) { 746 hlist_for_each_entry(event, &event_hash[key], node) {
720 if (event->type == type) 747 if (event->type == type)
721 return event; 748 return event;
722 } 749 }
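To make the new lat_print_timestamp() output paths concrete, here is a small standalone sketch of the formatting it performs for nanosecond clocks: convert to microseconds, split into msec.usec for verbose mode, and mark large relative deltas with '+' or '!'. The 100us threshold mirrors preempt_mark_thresh_us above; everything else is plain C rather than kernel API.

/*
 * Standalone sketch of the latency timestamp formatting: ns -> us, then
 * either a verbose msec.usec split or a terse value with a delta marker.
 */
#include <stdio.h>

#define USEC_PER_MSEC  1000ULL
#define MARK_THRESH_US 100ULL

static void print_rel(unsigned long long rel_ns)
{
	unsigned long long rel_us = rel_ns / 1000;	/* ns -> us */
	unsigned long long rel_msec = rel_us / USEC_PER_MSEC;
	unsigned long long rel_usec = rel_us % USEC_PER_MSEC;
	char mark = rel_us > MARK_THRESH_US ? '!' : rel_us > 1 ? '+' : ' ';

	printf("verbose: +%llu.%03llums   terse: %4lluus%c\n",
	       rel_msec, rel_usec, rel_us, mark);
}

int main(void)
{
	print_rel(1500);	/* 1us   -> no marker */
	print_rel(50000);	/* 50us  -> '+' */
	print_rel(250000);	/* 250us -> '!' (past the 100us threshold) */
	return 0;
}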
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index daa9980153af..412e959709b4 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -441,7 +441,7 @@ static const struct fetch_type *find_fetch_type(const char *type)
441 goto fail; 441 goto fail;
442 442
443 type++; 443 type++;
444 if (strict_strtoul(type, 0, &bs)) 444 if (kstrtoul(type, 0, &bs))
445 goto fail; 445 goto fail;
446 446
447 switch (bs) { 447 switch (bs) {
@@ -501,8 +501,8 @@ int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset)
501 501
502 tmp = strchr(symbol, '+'); 502 tmp = strchr(symbol, '+');
503 if (tmp) { 503 if (tmp) {
504 /* skip sign because strict_strtol doesn't accept '+' */ 504 /* skip sign because kstrtoul doesn't accept '+' */
505 ret = strict_strtoul(tmp + 1, 0, offset); 505 ret = kstrtoul(tmp + 1, 0, offset);
506 if (ret) 506 if (ret)
507 return ret; 507 return ret;
508 508
@@ -533,7 +533,7 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t,
533 else 533 else
534 ret = -EINVAL; 534 ret = -EINVAL;
535 } else if (isdigit(arg[5])) { 535 } else if (isdigit(arg[5])) {
536 ret = strict_strtoul(arg + 5, 10, &param); 536 ret = kstrtoul(arg + 5, 10, &param);
537 if (ret || param > PARAM_MAX_STACK) 537 if (ret || param > PARAM_MAX_STACK)
538 ret = -EINVAL; 538 ret = -EINVAL;
539 else { 539 else {
@@ -579,7 +579,7 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t,
579 579
580 case '@': /* memory or symbol */ 580 case '@': /* memory or symbol */
581 if (isdigit(arg[1])) { 581 if (isdigit(arg[1])) {
582 ret = strict_strtoul(arg + 1, 0, &param); 582 ret = kstrtoul(arg + 1, 0, &param);
583 if (ret) 583 if (ret)
584 break; 584 break;
585 585
@@ -597,14 +597,14 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t,
597 break; 597 break;
598 598
599 case '+': /* deref memory */ 599 case '+': /* deref memory */
600 arg++; /* Skip '+', because strict_strtol() rejects it. */ 600 arg++; /* Skip '+', because kstrtol() rejects it. */
601 case '-': 601 case '-':
602 tmp = strchr(arg, '('); 602 tmp = strchr(arg, '(');
603 if (!tmp) 603 if (!tmp)
604 break; 604 break;
605 605
606 *tmp = '\0'; 606 *tmp = '\0';
607 ret = strict_strtol(arg, 0, &offset); 607 ret = kstrtol(arg, 0, &offset);
608 608
609 if (ret) 609 if (ret)
610 break; 610 break;
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 933708677814..5c7e09d10d74 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -66,7 +66,6 @@
66#define TP_FLAG_TRACE 1 66#define TP_FLAG_TRACE 1
67#define TP_FLAG_PROFILE 2 67#define TP_FLAG_PROFILE 2
68#define TP_FLAG_REGISTERED 4 68#define TP_FLAG_REGISTERED 4
69#define TP_FLAG_UPROBE 8
70 69
71 70
72/* data_rloc: data relative location, compatible with u32 */ 71/* data_rloc: data relative location, compatible with u32 */
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 7e62c0a18456..3374c792ccd8 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -102,9 +102,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
102 entry->next_cpu = task_cpu(wakee); 102 entry->next_cpu = task_cpu(wakee);
103 103
104 if (!filter_check_discard(call, entry, buffer, event)) 104 if (!filter_check_discard(call, entry, buffer, event))
105 ring_buffer_unlock_commit(buffer, event); 105 trace_buffer_unlock_commit(buffer, event, flags, pc);
106 ftrace_trace_stack(tr->buffer, flags, 6, pc);
107 ftrace_trace_userstack(tr->buffer, flags, pc);
108} 106}
109 107
110static void 108static void
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 02170c00c413..fde652c9a511 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -7,7 +7,7 @@
7 * Based on code from the latency_tracer, that is: 7 * Based on code from the latency_tracer, that is:
8 * 8 *
9 * Copyright (C) 2004-2006 Ingo Molnar 9 * Copyright (C) 2004-2006 Ingo Molnar
10 * Copyright (C) 2004 William Lee Irwin III 10 * Copyright (C) 2004 Nadia Yvette Chambers
11 */ 11 */
12#include <linux/module.h> 12#include <linux/module.h>
13#include <linux/fs.h> 13#include <linux/fs.h>
@@ -15,8 +15,8 @@
15#include <linux/kallsyms.h> 15#include <linux/kallsyms.h>
16#include <linux/uaccess.h> 16#include <linux/uaccess.h>
17#include <linux/ftrace.h> 17#include <linux/ftrace.h>
18#include <linux/sched/rt.h>
18#include <trace/events/sched.h> 19#include <trace/events/sched.h>
19
20#include "trace.h" 20#include "trace.h"
21 21
22static struct trace_array *wakeup_trace; 22static struct trace_array *wakeup_trace;
@@ -36,7 +36,7 @@ static void __wakeup_reset(struct trace_array *tr);
36static int wakeup_graph_entry(struct ftrace_graph_ent *trace); 36static int wakeup_graph_entry(struct ftrace_graph_ent *trace);
37static void wakeup_graph_return(struct ftrace_graph_ret *trace); 37static void wakeup_graph_return(struct ftrace_graph_ret *trace);
38 38
39static int save_lat_flag; 39static int save_flags;
40 40
41#define TRACE_DISPLAY_GRAPH 1 41#define TRACE_DISPLAY_GRAPH 1
42 42
@@ -540,8 +540,11 @@ static void stop_wakeup_tracer(struct trace_array *tr)
540 540
541static int __wakeup_tracer_init(struct trace_array *tr) 541static int __wakeup_tracer_init(struct trace_array *tr)
542{ 542{
543 save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT; 543 save_flags = trace_flags;
544 trace_flags |= TRACE_ITER_LATENCY_FMT; 544
545 /* non overwrite screws up the latency tracers */
546 set_tracer_flag(TRACE_ITER_OVERWRITE, 1);
547 set_tracer_flag(TRACE_ITER_LATENCY_FMT, 1);
545 548
546 tracing_max_latency = 0; 549 tracing_max_latency = 0;
547 wakeup_trace = tr; 550 wakeup_trace = tr;
@@ -563,12 +566,15 @@ static int wakeup_rt_tracer_init(struct trace_array *tr)
563 566
564static void wakeup_tracer_reset(struct trace_array *tr) 567static void wakeup_tracer_reset(struct trace_array *tr)
565{ 568{
569 int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT;
570 int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE;
571
566 stop_wakeup_tracer(tr); 572 stop_wakeup_tracer(tr);
567 /* make sure we put back any tasks we are tracing */ 573 /* make sure we put back any tasks we are tracing */
568 wakeup_reset(tr); 574 wakeup_reset(tr);
569 575
570 if (!save_lat_flag) 576 set_tracer_flag(TRACE_ITER_LATENCY_FMT, lat_flag);
571 trace_flags &= ~TRACE_ITER_LATENCY_FMT; 577 set_tracer_flag(TRACE_ITER_OVERWRITE, overwrite_flag);
572} 578}
573 579
574static void wakeup_tracer_start(struct trace_array *tr) 580static void wakeup_tracer_start(struct trace_array *tr)
@@ -589,17 +595,18 @@ static struct tracer wakeup_tracer __read_mostly =
589 .reset = wakeup_tracer_reset, 595 .reset = wakeup_tracer_reset,
590 .start = wakeup_tracer_start, 596 .start = wakeup_tracer_start,
591 .stop = wakeup_tracer_stop, 597 .stop = wakeup_tracer_stop,
592 .print_max = 1, 598 .print_max = true,
593 .print_header = wakeup_print_header, 599 .print_header = wakeup_print_header,
594 .print_line = wakeup_print_line, 600 .print_line = wakeup_print_line,
595 .flags = &tracer_flags, 601 .flags = &tracer_flags,
596 .set_flag = wakeup_set_flag, 602 .set_flag = wakeup_set_flag,
603 .flag_changed = trace_keep_overwrite,
597#ifdef CONFIG_FTRACE_SELFTEST 604#ifdef CONFIG_FTRACE_SELFTEST
598 .selftest = trace_selftest_startup_wakeup, 605 .selftest = trace_selftest_startup_wakeup,
599#endif 606#endif
600 .open = wakeup_trace_open, 607 .open = wakeup_trace_open,
601 .close = wakeup_trace_close, 608 .close = wakeup_trace_close,
602 .use_max_tr = 1, 609 .use_max_tr = true,
603}; 610};
604 611
605static struct tracer wakeup_rt_tracer __read_mostly = 612static struct tracer wakeup_rt_tracer __read_mostly =
@@ -610,17 +617,18 @@ static struct tracer wakeup_rt_tracer __read_mostly =
610 .start = wakeup_tracer_start, 617 .start = wakeup_tracer_start,
611 .stop = wakeup_tracer_stop, 618 .stop = wakeup_tracer_stop,
612 .wait_pipe = poll_wait_pipe, 619 .wait_pipe = poll_wait_pipe,
613 .print_max = 1, 620 .print_max = true,
614 .print_header = wakeup_print_header, 621 .print_header = wakeup_print_header,
615 .print_line = wakeup_print_line, 622 .print_line = wakeup_print_line,
616 .flags = &tracer_flags, 623 .flags = &tracer_flags,
617 .set_flag = wakeup_set_flag, 624 .set_flag = wakeup_set_flag,
625 .flag_changed = trace_keep_overwrite,
618#ifdef CONFIG_FTRACE_SELFTEST 626#ifdef CONFIG_FTRACE_SELFTEST
619 .selftest = trace_selftest_startup_wakeup, 627 .selftest = trace_selftest_startup_wakeup,
620#endif 628#endif
621 .open = wakeup_trace_open, 629 .open = wakeup_trace_open,
622 .close = wakeup_trace_close, 630 .close = wakeup_trace_close,
623 .use_max_tr = 1, 631 .use_max_tr = true,
624}; 632};
625 633
626__init static int init_wakeup_tracer(void) 634__init static int init_wakeup_tracer(void)
@@ -637,4 +645,4 @@ __init static int init_wakeup_tracer(void)
637 645
638 return 0; 646 return 0;
639} 647}
640device_initcall(init_wakeup_tracer); 648core_initcall(init_wakeup_tracer);
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 2c00a691a540..51c819c12c29 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -320,7 +320,6 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
320 int (*func)(void)) 320 int (*func)(void))
321{ 321{
322 int save_ftrace_enabled = ftrace_enabled; 322 int save_ftrace_enabled = ftrace_enabled;
323 int save_tracer_enabled = tracer_enabled;
324 unsigned long count; 323 unsigned long count;
325 char *func_name; 324 char *func_name;
326 int ret; 325 int ret;
@@ -331,7 +330,6 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
331 330
332 /* enable tracing, and record the filter function */ 331 /* enable tracing, and record the filter function */
333 ftrace_enabled = 1; 332 ftrace_enabled = 1;
334 tracer_enabled = 1;
335 333
336 /* passed in by parameter to fool gcc from optimizing */ 334 /* passed in by parameter to fool gcc from optimizing */
337 func(); 335 func();
@@ -395,7 +393,6 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
395 393
396 out: 394 out:
397 ftrace_enabled = save_ftrace_enabled; 395 ftrace_enabled = save_ftrace_enabled;
398 tracer_enabled = save_tracer_enabled;
399 396
400 /* Enable tracing on all functions again */ 397 /* Enable tracing on all functions again */
401 ftrace_set_global_filter(NULL, 0, 1); 398 ftrace_set_global_filter(NULL, 0, 1);
@@ -418,7 +415,8 @@ static void trace_selftest_test_recursion_func(unsigned long ip,
418 * The ftrace infrastructure should provide the recursion 415 * The ftrace infrastructure should provide the recursion
419 * protection. If not, this will crash the kernel! 416 * protection. If not, this will crash the kernel!
420 */ 417 */
421 trace_selftest_recursion_cnt++; 418 if (trace_selftest_recursion_cnt++ > 10)
419 return;
422 DYN_FTRACE_TEST_NAME(); 420 DYN_FTRACE_TEST_NAME();
423} 421}
424 422
@@ -452,11 +450,9 @@ static int
452trace_selftest_function_recursion(void) 450trace_selftest_function_recursion(void)
453{ 451{
454 int save_ftrace_enabled = ftrace_enabled; 452 int save_ftrace_enabled = ftrace_enabled;
455 int save_tracer_enabled = tracer_enabled;
456 char *func_name; 453 char *func_name;
457 int len; 454 int len;
458 int ret; 455 int ret;
459 int cnt;
460 456
461 /* The previous test PASSED */ 457 /* The previous test PASSED */
462 pr_cont("PASSED\n"); 458 pr_cont("PASSED\n");
@@ -465,7 +461,6 @@ trace_selftest_function_recursion(void)
465 461
466 /* enable tracing, and record the filter function */ 462 /* enable tracing, and record the filter function */
467 ftrace_enabled = 1; 463 ftrace_enabled = 1;
468 tracer_enabled = 1;
469 464
470 /* Handle PPC64 '.' name */ 465 /* Handle PPC64 '.' name */
471 func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); 466 func_name = "*" __stringify(DYN_FTRACE_TEST_NAME);
@@ -515,26 +510,16 @@ trace_selftest_function_recursion(void)
515 510
516 unregister_ftrace_function(&test_recsafe_probe); 511 unregister_ftrace_function(&test_recsafe_probe);
517 512
518 /*
519 * If arch supports all ftrace features, and no other task
520 * was on the list, we should be fine.
521 */
522 if (!ftrace_nr_registered_ops() && !FTRACE_FORCE_LIST_FUNC)
523 cnt = 2; /* Should have recursed */
524 else
525 cnt = 1;
526
527 ret = -1; 513 ret = -1;
528 if (trace_selftest_recursion_cnt != cnt) { 514 if (trace_selftest_recursion_cnt != 2) {
529 pr_cont("*callback not called expected %d times (%d)* ", 515 pr_cont("*callback not called expected 2 times (%d)* ",
530 cnt, trace_selftest_recursion_cnt); 516 trace_selftest_recursion_cnt);
531 goto out; 517 goto out;
532 } 518 }
533 519
534 ret = 0; 520 ret = 0;
535out: 521out:
536 ftrace_enabled = save_ftrace_enabled; 522 ftrace_enabled = save_ftrace_enabled;
537 tracer_enabled = save_tracer_enabled;
538 523
539 return ret; 524 return ret;
540} 525}
@@ -569,13 +554,12 @@ static int
569trace_selftest_function_regs(void) 554trace_selftest_function_regs(void)
570{ 555{
571 int save_ftrace_enabled = ftrace_enabled; 556 int save_ftrace_enabled = ftrace_enabled;
572 int save_tracer_enabled = tracer_enabled;
573 char *func_name; 557 char *func_name;
574 int len; 558 int len;
575 int ret; 559 int ret;
576 int supported = 0; 560 int supported = 0;
577 561
578#ifdef ARCH_SUPPORTS_FTRACE_SAVE_REGS 562#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
579 supported = 1; 563 supported = 1;
580#endif 564#endif
581 565
@@ -586,7 +570,6 @@ trace_selftest_function_regs(void)
586 570
587 /* enable tracing, and record the filter function */ 571 /* enable tracing, and record the filter function */
588 ftrace_enabled = 1; 572 ftrace_enabled = 1;
589 tracer_enabled = 1;
590 573
591 /* Handle PPC64 '.' name */ 574 /* Handle PPC64 '.' name */
592 func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); 575 func_name = "*" __stringify(DYN_FTRACE_TEST_NAME);
@@ -648,7 +631,6 @@ trace_selftest_function_regs(void)
648 ret = 0; 631 ret = 0;
649out: 632out:
650 ftrace_enabled = save_ftrace_enabled; 633 ftrace_enabled = save_ftrace_enabled;
651 tracer_enabled = save_tracer_enabled;
652 634
653 return ret; 635 return ret;
654} 636}
@@ -662,7 +644,6 @@ int
662trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) 644trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
663{ 645{
664 int save_ftrace_enabled = ftrace_enabled; 646 int save_ftrace_enabled = ftrace_enabled;
665 int save_tracer_enabled = tracer_enabled;
666 unsigned long count; 647 unsigned long count;
667 int ret; 648 int ret;
668 649
@@ -671,7 +652,6 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
671 652
672 /* start the tracing */ 653 /* start the tracing */
673 ftrace_enabled = 1; 654 ftrace_enabled = 1;
674 tracer_enabled = 1;
675 655
676 ret = tracer_init(trace, tr); 656 ret = tracer_init(trace, tr);
677 if (ret) { 657 if (ret) {
@@ -708,7 +688,6 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
708 ret = trace_selftest_function_regs(); 688 ret = trace_selftest_function_regs();
709 out: 689 out:
710 ftrace_enabled = save_ftrace_enabled; 690 ftrace_enabled = save_ftrace_enabled;
711 tracer_enabled = save_tracer_enabled;
712 691
713 /* kill ftrace totally if we failed */ 692 /* kill ftrace totally if we failed */
714 if (ret) 693 if (ret)
@@ -1106,6 +1085,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
1106 tracing_stop(); 1085 tracing_stop();
1107 /* check both trace buffers */ 1086 /* check both trace buffers */
1108 ret = trace_test_buffer(tr, NULL); 1087 ret = trace_test_buffer(tr, NULL);
1088 printk("ret = %d\n", ret);
1109 if (!ret) 1089 if (!ret)
1110 ret = trace_test_buffer(&max_tr, &count); 1090 ret = trace_test_buffer(&max_tr, &count);
1111 1091
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 0c1b165778e5..83a8b5b7bd35 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -33,7 +33,6 @@ static unsigned long max_stack_size;
33static arch_spinlock_t max_stack_lock = 33static arch_spinlock_t max_stack_lock =
34 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; 34 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
35 35
36static int stack_trace_disabled __read_mostly;
37static DEFINE_PER_CPU(int, trace_active); 36static DEFINE_PER_CPU(int, trace_active);
38static DEFINE_MUTEX(stack_sysctl_mutex); 37static DEFINE_MUTEX(stack_sysctl_mutex);
39 38
@@ -116,9 +115,6 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip,
116{ 115{
117 int cpu; 116 int cpu;
118 117
119 if (unlikely(!ftrace_enabled || stack_trace_disabled))
120 return;
121
122 preempt_disable_notrace(); 118 preempt_disable_notrace();
123 119
124 cpu = raw_smp_processor_id(); 120 cpu = raw_smp_processor_id();
@@ -326,7 +322,7 @@ static const struct file_operations stack_trace_filter_fops = {
326 .open = stack_trace_filter_open, 322 .open = stack_trace_filter_open,
327 .read = seq_read, 323 .read = seq_read,
328 .write = ftrace_filter_write, 324 .write = ftrace_filter_write,
329 .llseek = ftrace_regex_lseek, 325 .llseek = ftrace_filter_lseek,
330 .release = ftrace_regex_release, 326 .release = ftrace_regex_release,
331}; 327};
332 328
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 2485a7d09b11..7a809e321058 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -1,5 +1,6 @@
1#include <trace/syscall.h> 1#include <trace/syscall.h>
2#include <trace/events/syscalls.h> 2#include <trace/events/syscalls.h>
3#include <linux/syscalls.h>
3#include <linux/slab.h> 4#include <linux/slab.h>
4#include <linux/kernel.h> 5#include <linux/kernel.h>
5#include <linux/module.h> /* for MODULE_NAME_LEN via KSYM_SYMBOL_LEN */ 6#include <linux/module.h> /* for MODULE_NAME_LEN via KSYM_SYMBOL_LEN */
@@ -21,9 +22,6 @@ static int syscall_enter_register(struct ftrace_event_call *event,
21static int syscall_exit_register(struct ftrace_event_call *event, 22static int syscall_exit_register(struct ftrace_event_call *event,
22 enum trace_reg type, void *data); 23 enum trace_reg type, void *data);
23 24
24static int syscall_enter_define_fields(struct ftrace_event_call *call);
25static int syscall_exit_define_fields(struct ftrace_event_call *call);
26
27static struct list_head * 25static struct list_head *
28syscall_get_enter_fields(struct ftrace_event_call *call) 26syscall_get_enter_fields(struct ftrace_event_call *call)
29{ 27{
@@ -32,30 +30,6 @@ syscall_get_enter_fields(struct ftrace_event_call *call)
32 return &entry->enter_fields; 30 return &entry->enter_fields;
33} 31}
34 32
35struct trace_event_functions enter_syscall_print_funcs = {
36 .trace = print_syscall_enter,
37};
38
39struct trace_event_functions exit_syscall_print_funcs = {
40 .trace = print_syscall_exit,
41};
42
43struct ftrace_event_class event_class_syscall_enter = {
44 .system = "syscalls",
45 .reg = syscall_enter_register,
46 .define_fields = syscall_enter_define_fields,
47 .get_fields = syscall_get_enter_fields,
48 .raw_init = init_syscall_trace,
49};
50
51struct ftrace_event_class event_class_syscall_exit = {
52 .system = "syscalls",
53 .reg = syscall_exit_register,
54 .define_fields = syscall_exit_define_fields,
55 .fields = LIST_HEAD_INIT(event_class_syscall_exit.fields),
56 .raw_init = init_syscall_trace,
57};
58
59extern struct syscall_metadata *__start_syscalls_metadata[]; 33extern struct syscall_metadata *__start_syscalls_metadata[];
60extern struct syscall_metadata *__stop_syscalls_metadata[]; 34extern struct syscall_metadata *__stop_syscalls_metadata[];
61 35
@@ -74,6 +48,38 @@ static inline bool arch_syscall_match_sym_name(const char *sym, const char *name
74} 48}
75#endif 49#endif
76 50
51#ifdef ARCH_TRACE_IGNORE_COMPAT_SYSCALLS
52/*
53 * Some architectures that allow for 32bit applications
 54 * to run on a 64bit kernel do not map the syscalls for
55 * the 32bit tasks the same as they do for 64bit tasks.
56 *
57 * *cough*x86*cough*
58 *
59 * In such a case, instead of reporting the wrong syscalls,
60 * simply ignore them.
61 *
62 * For an arch to ignore the compat syscalls it needs to
63 * define ARCH_TRACE_IGNORE_COMPAT_SYSCALLS as well as
64 * define the function arch_trace_is_compat_syscall() to let
65 * the tracing system know that it should ignore it.
66 */
67static int
68trace_get_syscall_nr(struct task_struct *task, struct pt_regs *regs)
69{
70 if (unlikely(arch_trace_is_compat_syscall(regs)))
71 return -1;
72
73 return syscall_get_nr(task, regs);
74}
75#else
76static inline int
77trace_get_syscall_nr(struct task_struct *task, struct pt_regs *regs)
78{
79 return syscall_get_nr(task, regs);
80}
81#endif /* ARCH_TRACE_IGNORE_COMPAT_SYSCALLS */
82
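
The comment above also spells out the contract an architecture has to meet: provide ARCH_TRACE_IGNORE_COMPAT_SYSCALLS plus an arch_trace_is_compat_syscall() helper. A minimal sketch of that arch-side hook, assuming an is_compat_task()-style predicate (loosely modeled on x86, not the literal header):

#define ARCH_TRACE_IGNORE_COMPAT_SYSCALLS 1

static inline bool arch_trace_is_compat_syscall(struct pt_regs *regs)
{
	/* assumed helper: true for a 32bit task on a 64bit kernel */
	return is_compat_task();
}

With such a definition in place, trace_get_syscall_nr() returns -1 for compat tasks and the enter/exit hooks further down bail out before touching syscalls_metadata.
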
77static __init struct syscall_metadata * 83static __init struct syscall_metadata *
78find_syscall_meta(unsigned long syscall) 84find_syscall_meta(unsigned long syscall)
79{ 85{
@@ -104,7 +110,7 @@ static struct syscall_metadata *syscall_nr_to_meta(int nr)
104 return syscalls_metadata[nr]; 110 return syscalls_metadata[nr];
105} 111}
106 112
107enum print_line_t 113static enum print_line_t
108print_syscall_enter(struct trace_iterator *iter, int flags, 114print_syscall_enter(struct trace_iterator *iter, int flags,
109 struct trace_event *event) 115 struct trace_event *event)
110{ 116{
@@ -157,7 +163,7 @@ end:
157 return TRACE_TYPE_HANDLED; 163 return TRACE_TYPE_HANDLED;
158} 164}
159 165
160enum print_line_t 166static enum print_line_t
161print_syscall_exit(struct trace_iterator *iter, int flags, 167print_syscall_exit(struct trace_iterator *iter, int flags,
162 struct trace_event *event) 168 struct trace_event *event)
163{ 169{
@@ -297,16 +303,16 @@ static int syscall_exit_define_fields(struct ftrace_event_call *call)
297 return ret; 303 return ret;
298} 304}
299 305
300void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id) 306static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
301{ 307{
302 struct syscall_trace_enter *entry; 308 struct syscall_trace_enter *entry;
303 struct syscall_metadata *sys_data; 309 struct syscall_metadata *sys_data;
304 struct ring_buffer_event *event; 310 struct ring_buffer_event *event;
305 struct ring_buffer *buffer; 311 struct ring_buffer *buffer;
306 int size;
307 int syscall_nr; 312 int syscall_nr;
313 int size;
308 314
309 syscall_nr = syscall_get_nr(current, regs); 315 syscall_nr = trace_get_syscall_nr(current, regs);
310 if (syscall_nr < 0) 316 if (syscall_nr < 0)
311 return; 317 return;
312 if (!test_bit(syscall_nr, enabled_enter_syscalls)) 318 if (!test_bit(syscall_nr, enabled_enter_syscalls))
@@ -332,7 +338,7 @@ void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
332 trace_current_buffer_unlock_commit(buffer, event, 0, 0); 338 trace_current_buffer_unlock_commit(buffer, event, 0, 0);
333} 339}
334 340
335void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret) 341static void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
336{ 342{
337 struct syscall_trace_exit *entry; 343 struct syscall_trace_exit *entry;
338 struct syscall_metadata *sys_data; 344 struct syscall_metadata *sys_data;
@@ -340,7 +346,7 @@ void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
340 struct ring_buffer *buffer; 346 struct ring_buffer *buffer;
341 int syscall_nr; 347 int syscall_nr;
342 348
343 syscall_nr = syscall_get_nr(current, regs); 349 syscall_nr = trace_get_syscall_nr(current, regs);
344 if (syscall_nr < 0) 350 if (syscall_nr < 0)
345 return; 351 return;
346 if (!test_bit(syscall_nr, enabled_exit_syscalls)) 352 if (!test_bit(syscall_nr, enabled_exit_syscalls))
@@ -364,7 +370,7 @@ void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
364 trace_current_buffer_unlock_commit(buffer, event, 0, 0); 370 trace_current_buffer_unlock_commit(buffer, event, 0, 0);
365} 371}
366 372
367int reg_event_syscall_enter(struct ftrace_event_call *call) 373static int reg_event_syscall_enter(struct ftrace_event_call *call)
368{ 374{
369 int ret = 0; 375 int ret = 0;
370 int num; 376 int num;
@@ -383,7 +389,7 @@ int reg_event_syscall_enter(struct ftrace_event_call *call)
383 return ret; 389 return ret;
384} 390}
385 391
386void unreg_event_syscall_enter(struct ftrace_event_call *call) 392static void unreg_event_syscall_enter(struct ftrace_event_call *call)
387{ 393{
388 int num; 394 int num;
389 395
@@ -398,7 +404,7 @@ void unreg_event_syscall_enter(struct ftrace_event_call *call)
398 mutex_unlock(&syscall_trace_lock); 404 mutex_unlock(&syscall_trace_lock);
399} 405}
400 406
401int reg_event_syscall_exit(struct ftrace_event_call *call) 407static int reg_event_syscall_exit(struct ftrace_event_call *call)
402{ 408{
403 int ret = 0; 409 int ret = 0;
404 int num; 410 int num;
@@ -417,7 +423,7 @@ int reg_event_syscall_exit(struct ftrace_event_call *call)
417 return ret; 423 return ret;
418} 424}
419 425
420void unreg_event_syscall_exit(struct ftrace_event_call *call) 426static void unreg_event_syscall_exit(struct ftrace_event_call *call)
421{ 427{
422 int num; 428 int num;
423 429
@@ -432,7 +438,7 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call)
432 mutex_unlock(&syscall_trace_lock); 438 mutex_unlock(&syscall_trace_lock);
433} 439}
434 440
435int init_syscall_trace(struct ftrace_event_call *call) 441static int init_syscall_trace(struct ftrace_event_call *call)
436{ 442{
437 int id; 443 int id;
438 int num; 444 int num;
@@ -457,12 +463,36 @@ int init_syscall_trace(struct ftrace_event_call *call)
457 return id; 463 return id;
458} 464}
459 465
466struct trace_event_functions enter_syscall_print_funcs = {
467 .trace = print_syscall_enter,
468};
469
470struct trace_event_functions exit_syscall_print_funcs = {
471 .trace = print_syscall_exit,
472};
473
474struct ftrace_event_class event_class_syscall_enter = {
475 .system = "syscalls",
476 .reg = syscall_enter_register,
477 .define_fields = syscall_enter_define_fields,
478 .get_fields = syscall_get_enter_fields,
479 .raw_init = init_syscall_trace,
480};
481
482struct ftrace_event_class event_class_syscall_exit = {
483 .system = "syscalls",
484 .reg = syscall_exit_register,
485 .define_fields = syscall_exit_define_fields,
486 .fields = LIST_HEAD_INIT(event_class_syscall_exit.fields),
487 .raw_init = init_syscall_trace,
488};
489
460unsigned long __init __weak arch_syscall_addr(int nr) 490unsigned long __init __weak arch_syscall_addr(int nr)
461{ 491{
462 return (unsigned long)sys_call_table[nr]; 492 return (unsigned long)sys_call_table[nr];
463} 493}
464 494
465int __init init_ftrace_syscalls(void) 495static int __init init_ftrace_syscalls(void)
466{ 496{
467 struct syscall_metadata *meta; 497 struct syscall_metadata *meta;
468 unsigned long addr; 498 unsigned long addr;
@@ -505,7 +535,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
505 int rctx; 535 int rctx;
506 int size; 536 int size;
507 537
508 syscall_nr = syscall_get_nr(current, regs); 538 syscall_nr = trace_get_syscall_nr(current, regs);
509 if (syscall_nr < 0) 539 if (syscall_nr < 0)
510 return; 540 return;
511 if (!test_bit(syscall_nr, enabled_perf_enter_syscalls)) 541 if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
@@ -537,7 +567,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
537 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); 567 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
538} 568}
539 569
540int perf_sysenter_enable(struct ftrace_event_call *call) 570static int perf_sysenter_enable(struct ftrace_event_call *call)
541{ 571{
542 int ret = 0; 572 int ret = 0;
543 int num; 573 int num;
@@ -558,7 +588,7 @@ int perf_sysenter_enable(struct ftrace_event_call *call)
558 return ret; 588 return ret;
559} 589}
560 590
561void perf_sysenter_disable(struct ftrace_event_call *call) 591static void perf_sysenter_disable(struct ftrace_event_call *call)
562{ 592{
563 int num; 593 int num;
564 594
@@ -581,7 +611,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
581 int rctx; 611 int rctx;
582 int size; 612 int size;
583 613
584 syscall_nr = syscall_get_nr(current, regs); 614 syscall_nr = trace_get_syscall_nr(current, regs);
585 if (syscall_nr < 0) 615 if (syscall_nr < 0)
586 return; 616 return;
587 if (!test_bit(syscall_nr, enabled_perf_exit_syscalls)) 617 if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
@@ -615,7 +645,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
615 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); 645 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
616} 646}
617 647
618int perf_sysexit_enable(struct ftrace_event_call *call) 648static int perf_sysexit_enable(struct ftrace_event_call *call)
619{ 649{
620 int ret = 0; 650 int ret = 0;
621 int num; 651 int num;
@@ -636,7 +666,7 @@ int perf_sysexit_enable(struct ftrace_event_call *call)
636 return ret; 666 return ret;
637} 667}
638 668
639void perf_sysexit_disable(struct ftrace_event_call *call) 669static void perf_sysexit_disable(struct ftrace_event_call *call)
640{ 670{
641 int num; 671 int num;
642 672
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 03003cd7dd96..8dad2a92dee9 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -22,25 +22,27 @@
22#include <linux/uaccess.h> 22#include <linux/uaccess.h>
23#include <linux/uprobes.h> 23#include <linux/uprobes.h>
24#include <linux/namei.h> 24#include <linux/namei.h>
25#include <linux/string.h>
25 26
26#include "trace_probe.h" 27#include "trace_probe.h"
27 28
28#define UPROBE_EVENT_SYSTEM "uprobes" 29#define UPROBE_EVENT_SYSTEM "uprobes"
29 30
31struct trace_uprobe_filter {
32 rwlock_t rwlock;
33 int nr_systemwide;
34 struct list_head perf_events;
35};
36
30/* 37/*
31 * uprobe event core functions 38 * uprobe event core functions
32 */ 39 */
33struct trace_uprobe;
34struct uprobe_trace_consumer {
35 struct uprobe_consumer cons;
36 struct trace_uprobe *tu;
37};
38
39struct trace_uprobe { 40struct trace_uprobe {
40 struct list_head list; 41 struct list_head list;
41 struct ftrace_event_class class; 42 struct ftrace_event_class class;
42 struct ftrace_event_call call; 43 struct ftrace_event_call call;
43 struct uprobe_trace_consumer *consumer; 44 struct trace_uprobe_filter filter;
45 struct uprobe_consumer consumer;
44 struct inode *inode; 46 struct inode *inode;
45 char *filename; 47 char *filename;
46 unsigned long offset; 48 unsigned long offset;
@@ -63,6 +65,18 @@ static LIST_HEAD(uprobe_list);
63 65
64static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs); 66static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs);
65 67
68static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter)
69{
70 rwlock_init(&filter->rwlock);
71 filter->nr_systemwide = 0;
72 INIT_LIST_HEAD(&filter->perf_events);
73}
74
75static inline bool uprobe_filter_is_empty(struct trace_uprobe_filter *filter)
76{
77 return !filter->nr_systemwide && list_empty(&filter->perf_events);
78}
79
66/* 80/*
67 * Allocate new trace_uprobe and initialize it (including uprobes). 81 * Allocate new trace_uprobe and initialize it (including uprobes).
68 */ 82 */
@@ -91,6 +105,8 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs)
91 goto error; 105 goto error;
92 106
93 INIT_LIST_HEAD(&tu->list); 107 INIT_LIST_HEAD(&tu->list);
108 tu->consumer.handler = uprobe_dispatcher;
109 init_trace_uprobe_filter(&tu->filter);
94 return tu; 110 return tu;
95 111
96error: 112error:
@@ -189,7 +205,7 @@ static int create_trace_uprobe(int argc, char **argv)
189 if (argv[0][0] == '-') 205 if (argv[0][0] == '-')
190 is_delete = true; 206 is_delete = true;
191 else if (argv[0][0] != 'p') { 207 else if (argv[0][0] != 'p') {
192 pr_info("Probe definition must be started with 'p', 'r' or" " '-'.\n"); 208 pr_info("Probe definition must be started with 'p' or '-'.\n");
193 return -EINVAL; 209 return -EINVAL;
194 } 210 }
195 211
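
For context, create_trace_uprobe() parses what userspace writes into the uprobe_events control file: 'p' registers a probe at PATH:OFFSET, '-' deletes one. A userspace sketch, with the debugfs mount point, binary path and offset all illustrative assumptions:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/kernel/debug/tracing/uprobe_events", "w");

	if (!f)
		return 1;
	fprintf(f, "p:uprobes/bash_probe /bin/bash:0x4245c0\n");	/* add */
	fprintf(f, "-:uprobes/bash_probe\n");				/* delete */
	return fclose(f) ? 1 : 0;
}
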
@@ -252,27 +268,32 @@ static int create_trace_uprobe(int argc, char **argv)
252 if (ret) 268 if (ret)
253 goto fail_address_parse; 269 goto fail_address_parse;
254 270
255 ret = strict_strtoul(arg, 0, &offset); 271 inode = igrab(path.dentry->d_inode);
256 if (ret) 272 path_put(&path);
273
274 if (!inode || !S_ISREG(inode->i_mode)) {
275 ret = -EINVAL;
257 goto fail_address_parse; 276 goto fail_address_parse;
277 }
258 278
259 inode = igrab(path.dentry->d_inode); 279 ret = kstrtoul(arg, 0, &offset);
280 if (ret)
281 goto fail_address_parse;
260 282
261 argc -= 2; 283 argc -= 2;
262 argv += 2; 284 argv += 2;
263 285
264 /* setup a probe */ 286 /* setup a probe */
265 if (!event) { 287 if (!event) {
266 char *tail = strrchr(filename, '/'); 288 char *tail;
267 char *ptr; 289 char *ptr;
268 290
269 ptr = kstrdup((tail ? tail + 1 : filename), GFP_KERNEL); 291 tail = kstrdup(kbasename(filename), GFP_KERNEL);
270 if (!ptr) { 292 if (!tail) {
271 ret = -ENOMEM; 293 ret = -ENOMEM;
272 goto fail_address_parse; 294 goto fail_address_parse;
273 } 295 }
274 296
275 tail = ptr;
276 ptr = strpbrk(tail, ".-_"); 297 ptr = strpbrk(tail, ".-_");
277 if (ptr) 298 if (ptr)
278 *ptr = '\0'; 299 *ptr = '\0';
@@ -356,7 +377,7 @@ fail_address_parse:
356 if (inode) 377 if (inode)
357 iput(inode); 378 iput(inode);
358 379
359 pr_info("Failed to parse address.\n"); 380 pr_info("Failed to parse address or file.\n");
360 381
361 return ret; 382 return ret;
362} 383}
@@ -465,7 +486,7 @@ static const struct file_operations uprobe_profile_ops = {
465}; 486};
466 487
467/* uprobe handler */ 488/* uprobe handler */
468static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) 489static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
469{ 490{
470 struct uprobe_trace_entry_head *entry; 491 struct uprobe_trace_entry_head *entry;
471 struct ring_buffer_event *event; 492 struct ring_buffer_event *event;
@@ -475,8 +496,6 @@ static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
475 unsigned long irq_flags; 496 unsigned long irq_flags;
476 struct ftrace_event_call *call = &tu->call; 497 struct ftrace_event_call *call = &tu->call;
477 498
478 tu->nhit++;
479
480 local_save_flags(irq_flags); 499 local_save_flags(irq_flags);
481 pc = preempt_count(); 500 pc = preempt_count();
482 501
@@ -485,16 +504,18 @@ static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
485 event = trace_current_buffer_lock_reserve(&buffer, call->event.type, 504 event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
486 size, irq_flags, pc); 505 size, irq_flags, pc);
487 if (!event) 506 if (!event)
488 return; 507 return 0;
489 508
490 entry = ring_buffer_event_data(event); 509 entry = ring_buffer_event_data(event);
491 entry->ip = uprobe_get_swbp_addr(task_pt_regs(current)); 510 entry->ip = instruction_pointer(task_pt_regs(current));
492 data = (u8 *)&entry[1]; 511 data = (u8 *)&entry[1];
493 for (i = 0; i < tu->nr_args; i++) 512 for (i = 0; i < tu->nr_args; i++)
494 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); 513 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
495 514
496 if (!filter_current_check_discard(buffer, call, entry, event)) 515 if (!filter_current_check_discard(buffer, call, entry, event))
497 trace_buffer_unlock_commit(buffer, event, irq_flags, pc); 516 trace_buffer_unlock_commit(buffer, event, irq_flags, pc);
517
518 return 0;
498} 519}
499 520
500/* Event entry printers */ 521/* Event entry printers */
@@ -533,42 +554,43 @@ partial:
533 return TRACE_TYPE_PARTIAL_LINE; 554 return TRACE_TYPE_PARTIAL_LINE;
534} 555}
535 556
536static int probe_event_enable(struct trace_uprobe *tu, int flag) 557static inline bool is_trace_uprobe_enabled(struct trace_uprobe *tu)
537{ 558{
538 struct uprobe_trace_consumer *utc; 559 return tu->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE);
539 int ret = 0; 560}
540 561
541 if (!tu->inode || tu->consumer) 562typedef bool (*filter_func_t)(struct uprobe_consumer *self,
542 return -EINTR; 563 enum uprobe_filter_ctx ctx,
564 struct mm_struct *mm);
543 565
544 utc = kzalloc(sizeof(struct uprobe_trace_consumer), GFP_KERNEL); 566static int
545 if (!utc) 567probe_event_enable(struct trace_uprobe *tu, int flag, filter_func_t filter)
568{
569 int ret = 0;
570
571 if (is_trace_uprobe_enabled(tu))
546 return -EINTR; 572 return -EINTR;
547 573
548 utc->cons.handler = uprobe_dispatcher; 574 WARN_ON(!uprobe_filter_is_empty(&tu->filter));
549 utc->cons.filter = NULL;
550 ret = uprobe_register(tu->inode, tu->offset, &utc->cons);
551 if (ret) {
552 kfree(utc);
553 return ret;
554 }
555 575
556 tu->flags |= flag; 576 tu->flags |= flag;
557 utc->tu = tu; 577 tu->consumer.filter = filter;
558 tu->consumer = utc; 578 ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);
579 if (ret)
580 tu->flags &= ~flag;
559 581
560 return 0; 582 return ret;
561} 583}
562 584
563static void probe_event_disable(struct trace_uprobe *tu, int flag) 585static void probe_event_disable(struct trace_uprobe *tu, int flag)
564{ 586{
565 if (!tu->inode || !tu->consumer) 587 if (!is_trace_uprobe_enabled(tu))
566 return; 588 return;
567 589
568 uprobe_unregister(tu->inode, tu->offset, &tu->consumer->cons); 590 WARN_ON(!uprobe_filter_is_empty(&tu->filter));
591
592 uprobe_unregister(tu->inode, tu->offset, &tu->consumer);
569 tu->flags &= ~flag; 593 tu->flags &= ~flag;
570 kfree(tu->consumer);
571 tu->consumer = NULL;
572} 594}
573 595
574static int uprobe_event_define_fields(struct ftrace_event_call *event_call) 596static int uprobe_event_define_fields(struct ftrace_event_call *event_call)
@@ -642,8 +664,96 @@ static int set_print_fmt(struct trace_uprobe *tu)
642} 664}
643 665
644#ifdef CONFIG_PERF_EVENTS 666#ifdef CONFIG_PERF_EVENTS
667static bool
668__uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm)
669{
670 struct perf_event *event;
671
672 if (filter->nr_systemwide)
673 return true;
674
675 list_for_each_entry(event, &filter->perf_events, hw.tp_list) {
676 if (event->hw.tp_target->mm == mm)
677 return true;
678 }
679
680 return false;
681}
682
683static inline bool
684uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event)
685{
686 return __uprobe_perf_filter(&tu->filter, event->hw.tp_target->mm);
687}
688
689static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event)
690{
691 bool done;
692
693 write_lock(&tu->filter.rwlock);
694 if (event->hw.tp_target) {
695 /*
 696 * event->parent != NULL means copy_process(), so we can avoid
697 * uprobe_apply(). current->mm must be probed and we can rely
698 * on dup_mmap() which preserves the already installed bp's.
699 *
700 * attr.enable_on_exec means that exec/mmap will install the
701 * breakpoints we need.
702 */
703 done = tu->filter.nr_systemwide ||
704 event->parent || event->attr.enable_on_exec ||
705 uprobe_filter_event(tu, event);
706 list_add(&event->hw.tp_list, &tu->filter.perf_events);
707 } else {
708 done = tu->filter.nr_systemwide;
709 tu->filter.nr_systemwide++;
710 }
711 write_unlock(&tu->filter.rwlock);
712
713 if (!done)
714 uprobe_apply(tu->inode, tu->offset, &tu->consumer, true);
715
716 return 0;
717}
718
719static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event)
720{
721 bool done;
722
723 write_lock(&tu->filter.rwlock);
724 if (event->hw.tp_target) {
725 list_del(&event->hw.tp_list);
726 done = tu->filter.nr_systemwide ||
727 (event->hw.tp_target->flags & PF_EXITING) ||
728 uprobe_filter_event(tu, event);
729 } else {
730 tu->filter.nr_systemwide--;
731 done = tu->filter.nr_systemwide;
732 }
733 write_unlock(&tu->filter.rwlock);
734
735 if (!done)
736 uprobe_apply(tu->inode, tu->offset, &tu->consumer, false);
737
738 return 0;
739}
740
741static bool uprobe_perf_filter(struct uprobe_consumer *uc,
742 enum uprobe_filter_ctx ctx, struct mm_struct *mm)
743{
744 struct trace_uprobe *tu;
745 int ret;
746
747 tu = container_of(uc, struct trace_uprobe, consumer);
748 read_lock(&tu->filter.rwlock);
749 ret = __uprobe_perf_filter(&tu->filter, mm);
750 read_unlock(&tu->filter.rwlock);
751
752 return ret;
753}
754
645/* uprobe profile handler */ 755/* uprobe profile handler */
646static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) 756static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
647{ 757{
648 struct ftrace_event_call *call = &tu->call; 758 struct ftrace_event_call *call = &tu->call;
649 struct uprobe_trace_entry_head *entry; 759 struct uprobe_trace_entry_head *entry;
@@ -652,11 +762,14 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
652 int size, __size, i; 762 int size, __size, i;
653 int rctx; 763 int rctx;
654 764
765 if (!uprobe_perf_filter(&tu->consumer, 0, current->mm))
766 return UPROBE_HANDLER_REMOVE;
767
655 __size = sizeof(*entry) + tu->size; 768 __size = sizeof(*entry) + tu->size;
656 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 769 size = ALIGN(__size + sizeof(u32), sizeof(u64));
657 size -= sizeof(u32); 770 size -= sizeof(u32);
658 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) 771 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough"))
659 return; 772 return 0;
660 773
661 preempt_disable(); 774 preempt_disable();
662 775
@@ -664,7 +777,7 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
664 if (!entry) 777 if (!entry)
665 goto out; 778 goto out;
666 779
667 entry->ip = uprobe_get_swbp_addr(task_pt_regs(current)); 780 entry->ip = instruction_pointer(task_pt_regs(current));
668 data = (u8 *)&entry[1]; 781 data = (u8 *)&entry[1];
669 for (i = 0; i < tu->nr_args; i++) 782 for (i = 0; i < tu->nr_args; i++)
670 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); 783 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
@@ -674,6 +787,7 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
674 787
675 out: 788 out:
676 preempt_enable(); 789 preempt_enable();
790 return 0;
677} 791}
678#endif /* CONFIG_PERF_EVENTS */ 792#endif /* CONFIG_PERF_EVENTS */
679 793
@@ -684,7 +798,7 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type,
684 798
685 switch (type) { 799 switch (type) {
686 case TRACE_REG_REGISTER: 800 case TRACE_REG_REGISTER:
687 return probe_event_enable(tu, TP_FLAG_TRACE); 801 return probe_event_enable(tu, TP_FLAG_TRACE, NULL);
688 802
689 case TRACE_REG_UNREGISTER: 803 case TRACE_REG_UNREGISTER:
690 probe_event_disable(tu, TP_FLAG_TRACE); 804 probe_event_disable(tu, TP_FLAG_TRACE);
@@ -692,11 +806,18 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type,
692 806
693#ifdef CONFIG_PERF_EVENTS 807#ifdef CONFIG_PERF_EVENTS
694 case TRACE_REG_PERF_REGISTER: 808 case TRACE_REG_PERF_REGISTER:
695 return probe_event_enable(tu, TP_FLAG_PROFILE); 809 return probe_event_enable(tu, TP_FLAG_PROFILE, uprobe_perf_filter);
696 810
697 case TRACE_REG_PERF_UNREGISTER: 811 case TRACE_REG_PERF_UNREGISTER:
698 probe_event_disable(tu, TP_FLAG_PROFILE); 812 probe_event_disable(tu, TP_FLAG_PROFILE);
699 return 0; 813 return 0;
814
815 case TRACE_REG_PERF_OPEN:
816 return uprobe_perf_open(tu, data);
817
818 case TRACE_REG_PERF_CLOSE:
819 return uprobe_perf_close(tu, data);
820
700#endif 821#endif
701 default: 822 default:
702 return 0; 823 return 0;
@@ -706,22 +827,20 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type,
706 827
707static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) 828static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
708{ 829{
709 struct uprobe_trace_consumer *utc;
710 struct trace_uprobe *tu; 830 struct trace_uprobe *tu;
831 int ret = 0;
711 832
712 utc = container_of(con, struct uprobe_trace_consumer, cons); 833 tu = container_of(con, struct trace_uprobe, consumer);
713 tu = utc->tu; 834 tu->nhit++;
714 if (!tu || tu->consumer != utc)
715 return 0;
716 835
717 if (tu->flags & TP_FLAG_TRACE) 836 if (tu->flags & TP_FLAG_TRACE)
718 uprobe_trace_func(tu, regs); 837 ret |= uprobe_trace_func(tu, regs);
719 838
720#ifdef CONFIG_PERF_EVENTS 839#ifdef CONFIG_PERF_EVENTS
721 if (tu->flags & TP_FLAG_PROFILE) 840 if (tu->flags & TP_FLAG_PROFILE)
722 uprobe_perf_func(tu, regs); 841 ret |= uprobe_perf_func(tu, regs);
723#endif 842#endif
724 return 0; 843 return ret;
725} 844}
726 845
727static struct trace_event_functions uprobe_funcs = { 846static struct trace_event_functions uprobe_funcs = {
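
The reworked dispatcher above ORs its handlers' return values, so a consumer can now ask the uprobe core to drop the breakpoint from an uninteresting mm. A hedged sketch of a consumer using that convention (watched_tgid and my_consumer are illustrative, not part of this patch):

static pid_t watched_tgid;	/* illustrative filter state */

static int my_handler(struct uprobe_consumer *uc, struct pt_regs *regs)
{
	if (current->tgid != watched_tgid)
		return UPROBE_HANDLER_REMOVE;	/* unhook from this mm */

	pr_info("probe hit at %lx\n", instruction_pointer(regs));
	return 0;
}

static struct uprobe_consumer my_consumer = {
	.handler = my_handler,
};

Arming it would go through uprobe_register(inode, offset, &my_consumer), which is what probe_event_enable() now does with tu->consumer.
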
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index d96ba22dabfa..0c05a4592047 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -192,12 +192,11 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry,
192static struct tracepoint_entry *get_tracepoint(const char *name) 192static struct tracepoint_entry *get_tracepoint(const char *name)
193{ 193{
194 struct hlist_head *head; 194 struct hlist_head *head;
195 struct hlist_node *node;
196 struct tracepoint_entry *e; 195 struct tracepoint_entry *e;
197 u32 hash = jhash(name, strlen(name), 0); 196 u32 hash = jhash(name, strlen(name), 0);
198 197
199 head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)]; 198 head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)];
200 hlist_for_each_entry(e, node, head, hlist) { 199 hlist_for_each_entry(e, head, hlist) {
201 if (!strcmp(name, e->name)) 200 if (!strcmp(name, e->name))
202 return e; 201 return e;
203 } 202 }
@@ -211,13 +210,12 @@ static struct tracepoint_entry *get_tracepoint(const char *name)
211static struct tracepoint_entry *add_tracepoint(const char *name) 210static struct tracepoint_entry *add_tracepoint(const char *name)
212{ 211{
213 struct hlist_head *head; 212 struct hlist_head *head;
214 struct hlist_node *node;
215 struct tracepoint_entry *e; 213 struct tracepoint_entry *e;
216 size_t name_len = strlen(name) + 1; 214 size_t name_len = strlen(name) + 1;
217 u32 hash = jhash(name, name_len-1, 0); 215 u32 hash = jhash(name, name_len-1, 0);
218 216
219 head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)]; 217 head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)];
220 hlist_for_each_entry(e, node, head, hlist) { 218 hlist_for_each_entry(e, head, hlist) {
221 if (!strcmp(name, e->name)) { 219 if (!strcmp(name, e->name)) {
222 printk(KERN_NOTICE 220 printk(KERN_NOTICE
223 "tracepoint %s busy\n", name); 221 "tracepoint %s busy\n", name);
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 625df0b44690..a1dd9a1b1327 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -32,6 +32,7 @@ void bacct_add_tsk(struct user_namespace *user_ns,
32{ 32{
33 const struct cred *tcred; 33 const struct cred *tcred;
34 struct timespec uptime, ts; 34 struct timespec uptime, ts;
35 cputime_t utime, stime, utimescaled, stimescaled;
35 u64 ac_etime; 36 u64 ac_etime;
36 37
37 BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN); 38 BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN);
@@ -65,10 +66,15 @@ void bacct_add_tsk(struct user_namespace *user_ns,
65 stats->ac_ppid = pid_alive(tsk) ? 66 stats->ac_ppid = pid_alive(tsk) ?
66 task_tgid_nr_ns(rcu_dereference(tsk->real_parent), pid_ns) : 0; 67 task_tgid_nr_ns(rcu_dereference(tsk->real_parent), pid_ns) : 0;
67 rcu_read_unlock(); 68 rcu_read_unlock();
68 stats->ac_utime = cputime_to_usecs(tsk->utime); 69
69 stats->ac_stime = cputime_to_usecs(tsk->stime); 70 task_cputime(tsk, &utime, &stime);
70 stats->ac_utimescaled = cputime_to_usecs(tsk->utimescaled); 71 stats->ac_utime = cputime_to_usecs(utime);
71 stats->ac_stimescaled = cputime_to_usecs(tsk->stimescaled); 72 stats->ac_stime = cputime_to_usecs(stime);
73
74 task_cputime_scaled(tsk, &utimescaled, &stimescaled);
75 stats->ac_utimescaled = cputime_to_usecs(utimescaled);
76 stats->ac_stimescaled = cputime_to_usecs(stimescaled);
77
72 stats->ac_minflt = tsk->min_flt; 78 stats->ac_minflt = tsk->min_flt;
73 stats->ac_majflt = tsk->maj_flt; 79 stats->ac_majflt = tsk->maj_flt;
74 80
@@ -115,11 +121,8 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
115#undef KB 121#undef KB
116#undef MB 122#undef MB
117 123
118/** 124static void __acct_update_integrals(struct task_struct *tsk,
119 * acct_update_integrals - update mm integral fields in task_struct 125 cputime_t utime, cputime_t stime)
120 * @tsk: task_struct for accounting
121 */
122void acct_update_integrals(struct task_struct *tsk)
123{ 126{
124 if (likely(tsk->mm)) { 127 if (likely(tsk->mm)) {
125 cputime_t time, dtime; 128 cputime_t time, dtime;
@@ -128,7 +131,7 @@ void acct_update_integrals(struct task_struct *tsk)
128 u64 delta; 131 u64 delta;
129 132
130 local_irq_save(flags); 133 local_irq_save(flags);
131 time = tsk->stime + tsk->utime; 134 time = stime + utime;
132 dtime = time - tsk->acct_timexpd; 135 dtime = time - tsk->acct_timexpd;
133 jiffies_to_timeval(cputime_to_jiffies(dtime), &value); 136 jiffies_to_timeval(cputime_to_jiffies(dtime), &value);
134 delta = value.tv_sec; 137 delta = value.tv_sec;
@@ -145,6 +148,27 @@ void acct_update_integrals(struct task_struct *tsk)
145} 148}
146 149
147/** 150/**
151 * acct_update_integrals - update mm integral fields in task_struct
152 * @tsk: task_struct for accounting
153 */
154void acct_update_integrals(struct task_struct *tsk)
155{
156 cputime_t utime, stime;
157
158 task_cputime(tsk, &utime, &stime);
159 __acct_update_integrals(tsk, utime, stime);
160}
161
162/**
163 * acct_account_cputime - update mm integral after cputime update
164 * @tsk: task_struct for accounting
165 */
166void acct_account_cputime(struct task_struct *tsk)
167{
168 __acct_update_integrals(tsk, tsk->utime, tsk->stime);
169}
170
171/**
148 * acct_clear_integrals - clear the mm integral fields in task_struct 172 * acct_clear_integrals - clear the mm integral fields in task_struct
149 * @tsk: task_struct whose accounting fields are cleared 173 * @tsk: task_struct whose accounting fields are cleared
150 */ 174 */
diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c
index 1744bb80f1fb..394f70b17162 100644
--- a/kernel/user-return-notifier.c
+++ b/kernel/user-return-notifier.c
@@ -34,11 +34,11 @@ EXPORT_SYMBOL_GPL(user_return_notifier_unregister);
34void fire_user_return_notifiers(void) 34void fire_user_return_notifiers(void)
35{ 35{
36 struct user_return_notifier *urn; 36 struct user_return_notifier *urn;
37 struct hlist_node *tmp1, *tmp2; 37 struct hlist_node *tmp2;
38 struct hlist_head *head; 38 struct hlist_head *head;
39 39
40 head = &get_cpu_var(return_notifier_list); 40 head = &get_cpu_var(return_notifier_list);
41 hlist_for_each_entry_safe(urn, tmp1, tmp2, head, link) 41 hlist_for_each_entry_safe(urn, tmp2, head, link)
42 urn->on_user_return(urn); 42 urn->on_user_return(urn);
43 put_cpu_var(return_notifier_list); 43 put_cpu_var(return_notifier_list);
44} 44}
diff --git a/kernel/user.c b/kernel/user.c
index 750acffbe9ec..8e635a18ab52 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -16,6 +16,7 @@
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/export.h> 17#include <linux/export.h>
18#include <linux/user_namespace.h> 18#include <linux/user_namespace.h>
19#include <linux/proc_fs.h>
19 20
20/* 21/*
21 * userns count is 1 for root user, 1 for init_uts_ns, 22 * userns count is 1 for root user, 1 for init_uts_ns,
@@ -46,11 +47,12 @@ struct user_namespace init_user_ns = {
46 .count = 4294967295U, 47 .count = 4294967295U,
47 }, 48 },
48 }, 49 },
49 .kref = { 50 .count = ATOMIC_INIT(3),
50 .refcount = ATOMIC_INIT(3),
51 },
52 .owner = GLOBAL_ROOT_UID, 51 .owner = GLOBAL_ROOT_UID,
53 .group = GLOBAL_ROOT_GID, 52 .group = GLOBAL_ROOT_GID,
53 .proc_inum = PROC_USER_INIT_INO,
54 .may_mount_sysfs = true,
55 .may_mount_proc = true,
54}; 56};
55EXPORT_SYMBOL_GPL(init_user_ns); 57EXPORT_SYMBOL_GPL(init_user_ns);
56 58
@@ -105,9 +107,8 @@ static void uid_hash_remove(struct user_struct *up)
105static struct user_struct *uid_hash_find(kuid_t uid, struct hlist_head *hashent) 107static struct user_struct *uid_hash_find(kuid_t uid, struct hlist_head *hashent)
106{ 108{
107 struct user_struct *user; 109 struct user_struct *user;
108 struct hlist_node *h;
109 110
110 hlist_for_each_entry(user, h, hashent, uidhash_node) { 111 hlist_for_each_entry(user, hashent, uidhash_node) {
111 if (uid_eq(user->uid, uid)) { 112 if (uid_eq(user->uid, uid)) {
112 atomic_inc(&user->__count); 113 atomic_inc(&user->__count);
113 return user; 114 return user;
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 456a6b9fba34..a54f26f82eb2 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -9,6 +9,7 @@
9#include <linux/nsproxy.h> 9#include <linux/nsproxy.h>
10#include <linux/slab.h> 10#include <linux/slab.h>
11#include <linux/user_namespace.h> 11#include <linux/user_namespace.h>
12#include <linux/proc_fs.h>
12#include <linux/highuid.h> 13#include <linux/highuid.h>
13#include <linux/cred.h> 14#include <linux/cred.h>
14#include <linux/securebits.h> 15#include <linux/securebits.h>
@@ -20,12 +21,31 @@
20#include <linux/uaccess.h> 21#include <linux/uaccess.h>
21#include <linux/ctype.h> 22#include <linux/ctype.h>
22#include <linux/projid.h> 23#include <linux/projid.h>
24#include <linux/fs_struct.h>
23 25
24static struct kmem_cache *user_ns_cachep __read_mostly; 26static struct kmem_cache *user_ns_cachep __read_mostly;
25 27
26static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, 28static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
27 struct uid_gid_map *map); 29 struct uid_gid_map *map);
28 30
31static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
32{
33 /* Start with the same capabilities as init but useless for doing
34 * anything as the capabilities are bound to the new user namespace.
35 */
36 cred->securebits = SECUREBITS_DEFAULT;
37 cred->cap_inheritable = CAP_EMPTY_SET;
38 cred->cap_permitted = CAP_FULL_SET;
39 cred->cap_effective = CAP_FULL_SET;
40 cred->cap_bset = CAP_FULL_SET;
41#ifdef CONFIG_KEYS
42 key_put(cred->request_key_auth);
43 cred->request_key_auth = NULL;
44#endif
45 /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */
46 cred->user_ns = user_ns;
47}
48
29/* 49/*
30 * Create a new user namespace, deriving the creator from the user in the 50 * Create a new user namespace, deriving the creator from the user in the
31 * passed credentials, and replacing that user with the new root user for the 51 * passed credentials, and replacing that user with the new root user for the
@@ -39,6 +59,16 @@ int create_user_ns(struct cred *new)
39 struct user_namespace *ns, *parent_ns = new->user_ns; 59 struct user_namespace *ns, *parent_ns = new->user_ns;
40 kuid_t owner = new->euid; 60 kuid_t owner = new->euid;
41 kgid_t group = new->egid; 61 kgid_t group = new->egid;
62 int ret;
63
64 /*
65 * Verify that we can not violate the policy of which files
66 * may be accessed that is specified by the root directory,
 67 * by verifying that the root directory is at the root of the
68 * mount namespace which allows all files to be accessed.
69 */
70 if (current_chrooted())
71 return -EPERM;
42 72
43 /* The creator needs a mapping in the parent user namespace 73 /* The creator needs a mapping in the parent user namespace
44 * or else we won't be able to reasonably tell userspace who 74 * or else we won't be able to reasonably tell userspace who
@@ -52,40 +82,50 @@ int create_user_ns(struct cred *new)
52 if (!ns) 82 if (!ns)
53 return -ENOMEM; 83 return -ENOMEM;
54 84
55 kref_init(&ns->kref); 85 ret = proc_alloc_inum(&ns->proc_inum);
86 if (ret) {
87 kmem_cache_free(user_ns_cachep, ns);
88 return ret;
89 }
90
91 atomic_set(&ns->count, 1);
92 /* Leave the new->user_ns reference with the new user namespace. */
56 ns->parent = parent_ns; 93 ns->parent = parent_ns;
57 ns->owner = owner; 94 ns->owner = owner;
58 ns->group = group; 95 ns->group = group;
59 96
60 /* Start with the same capabilities as init but useless for doing 97 set_cred_user_ns(new, ns);
61 * anything as the capabilities are bound to the new user namespace.
62 */
63 new->securebits = SECUREBITS_DEFAULT;
64 new->cap_inheritable = CAP_EMPTY_SET;
65 new->cap_permitted = CAP_FULL_SET;
66 new->cap_effective = CAP_FULL_SET;
67 new->cap_bset = CAP_FULL_SET;
68#ifdef CONFIG_KEYS
69 key_put(new->request_key_auth);
70 new->request_key_auth = NULL;
71#endif
72 /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */
73 98
74 /* Leave the new->user_ns reference with the new user namespace. */ 99 update_mnt_policy(ns);
75 /* Leave the reference to our user_ns with the new cred. */
76 new->user_ns = ns;
77 100
78 return 0; 101 return 0;
79} 102}
80 103
81void free_user_ns(struct kref *kref) 104int unshare_userns(unsigned long unshare_flags, struct cred **new_cred)
82{ 105{
83 struct user_namespace *parent, *ns = 106 struct cred *cred;
84 container_of(kref, struct user_namespace, kref); 107
108 if (!(unshare_flags & CLONE_NEWUSER))
109 return 0;
85 110
86 parent = ns->parent; 111 cred = prepare_creds();
87 kmem_cache_free(user_ns_cachep, ns); 112 if (!cred)
88 put_user_ns(parent); 113 return -ENOMEM;
114
115 *new_cred = cred;
116 return create_user_ns(cred);
117}
118
119void free_user_ns(struct user_namespace *ns)
120{
121 struct user_namespace *parent;
122
123 do {
124 parent = ns->parent;
125 proc_free_inum(ns->proc_inum);
126 kmem_cache_free(user_ns_cachep, ns);
127 ns = parent;
128 } while (atomic_dec_and_test(&parent->count));
89} 129}
90EXPORT_SYMBOL(free_user_ns); 130EXPORT_SYMBOL(free_user_ns);
91 131
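
free_user_ns() now tears down the parent chain iteratively instead of recursing. The entry point that feeds it is the put side, sketched here from the expected header helper (details may differ):

static inline void put_user_ns(struct user_namespace *ns)
{
	if (ns && atomic_dec_and_test(&ns->count))
		free_user_ns(ns);	/* walks ns->parent, freeing as it goes */
}
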
@@ -372,7 +412,7 @@ static int uid_m_show(struct seq_file *seq, void *v)
372 struct user_namespace *lower_ns; 412 struct user_namespace *lower_ns;
373 uid_t lower; 413 uid_t lower;
374 414
375 lower_ns = current_user_ns(); 415 lower_ns = seq_user_ns(seq);
376 if ((lower_ns == ns) && lower_ns->parent) 416 if ((lower_ns == ns) && lower_ns->parent)
377 lower_ns = lower_ns->parent; 417 lower_ns = lower_ns->parent;
378 418
@@ -393,7 +433,7 @@ static int gid_m_show(struct seq_file *seq, void *v)
393 struct user_namespace *lower_ns; 433 struct user_namespace *lower_ns;
394 gid_t lower; 434 gid_t lower;
395 435
396 lower_ns = current_user_ns(); 436 lower_ns = seq_user_ns(seq);
397 if ((lower_ns == ns) && lower_ns->parent) 437 if ((lower_ns == ns) && lower_ns->parent)
398 lower_ns = lower_ns->parent; 438 lower_ns = lower_ns->parent;
399 439
@@ -492,6 +532,42 @@ struct seq_operations proc_projid_seq_operations = {
492 .show = projid_m_show, 532 .show = projid_m_show,
493}; 533};
494 534
535static bool mappings_overlap(struct uid_gid_map *new_map, struct uid_gid_extent *extent)
536{
537 u32 upper_first, lower_first, upper_last, lower_last;
538 unsigned idx;
539
540 upper_first = extent->first;
541 lower_first = extent->lower_first;
542 upper_last = upper_first + extent->count - 1;
543 lower_last = lower_first + extent->count - 1;
544
545 for (idx = 0; idx < new_map->nr_extents; idx++) {
546 u32 prev_upper_first, prev_lower_first;
547 u32 prev_upper_last, prev_lower_last;
548 struct uid_gid_extent *prev;
549
550 prev = &new_map->extent[idx];
551
552 prev_upper_first = prev->first;
553 prev_lower_first = prev->lower_first;
554 prev_upper_last = prev_upper_first + prev->count - 1;
555 prev_lower_last = prev_lower_first + prev->count - 1;
556
557 /* Does the upper range intersect a previous extent? */
558 if ((prev_upper_first <= upper_last) &&
559 (prev_upper_last >= upper_first))
560 return true;
561
562 /* Does the lower range intersect a previous extent? */
563 if ((prev_lower_first <= lower_last) &&
564 (prev_lower_last >= lower_first))
565 return true;
566 }
567 return false;
568}
569
570
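
mappings_overlap() is the classic closed-interval intersection test, applied once to the upper (namespace-side) range and once to the lower (parent-side) range of every previously accepted extent. Boiled down:

/* [a_first, a_last] and [b_first, b_last] intersect iff: */
static bool ranges_intersect(u32 a_first, u32 a_last, u32 b_first, u32 b_last)
{
	return a_first <= b_last && a_last >= b_first;
}

/* e.g. 0..999 vs 500..1499 -> true, 0..999 vs 1000..1999 -> false */
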
495static DEFINE_MUTEX(id_map_mutex); 571static DEFINE_MUTEX(id_map_mutex);
496 572
497static ssize_t map_write(struct file *file, const char __user *buf, 573static ssize_t map_write(struct file *file, const char __user *buf,
@@ -504,7 +580,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
504 struct user_namespace *ns = seq->private; 580 struct user_namespace *ns = seq->private;
505 struct uid_gid_map new_map; 581 struct uid_gid_map new_map;
506 unsigned idx; 582 unsigned idx;
507 struct uid_gid_extent *extent, *last = NULL; 583 struct uid_gid_extent *extent = NULL;
508 unsigned long page = 0; 584 unsigned long page = 0;
509 char *kbuf, *pos, *next_line; 585 char *kbuf, *pos, *next_line;
510 ssize_t ret = -EINVAL; 586 ssize_t ret = -EINVAL;
@@ -607,14 +683,11 @@ static ssize_t map_write(struct file *file, const char __user *buf,
607 if ((extent->lower_first + extent->count) <= extent->lower_first) 683 if ((extent->lower_first + extent->count) <= extent->lower_first)
608 goto out; 684 goto out;
609 685
610 /* For now only accept extents that are strictly in order */ 686 /* Do the ranges in extent overlap any previous extents? */
611 if (last && 687 if (mappings_overlap(&new_map, extent))
612 (((last->first + last->count) > extent->first) ||
613 ((last->lower_first + last->count) > extent->lower_first)))
614 goto out; 688 goto out;
615 689
616 new_map.nr_extents++; 690 new_map.nr_extents++;
617 last = extent;
618 691
619 /* Fail if the file contains too many extents */ 692 /* Fail if the file contains too many extents */
620 if ((new_map.nr_extents == UID_GID_MAP_MAX_EXTENTS) && 693 if ((new_map.nr_extents == UID_GID_MAP_MAX_EXTENTS) &&
@@ -669,10 +742,14 @@ ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t siz
669{ 742{
670 struct seq_file *seq = file->private_data; 743 struct seq_file *seq = file->private_data;
671 struct user_namespace *ns = seq->private; 744 struct user_namespace *ns = seq->private;
745 struct user_namespace *seq_ns = seq_user_ns(seq);
672 746
673 if (!ns->parent) 747 if (!ns->parent)
674 return -EPERM; 748 return -EPERM;
675 749
750 if ((seq_ns != ns) && (seq_ns != ns->parent))
751 return -EPERM;
752
676 return map_write(file, buf, size, ppos, CAP_SETUID, 753 return map_write(file, buf, size, ppos, CAP_SETUID,
677 &ns->uid_map, &ns->parent->uid_map); 754 &ns->uid_map, &ns->parent->uid_map);
678} 755}
@@ -681,10 +758,14 @@ ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t siz
681{ 758{
682 struct seq_file *seq = file->private_data; 759 struct seq_file *seq = file->private_data;
683 struct user_namespace *ns = seq->private; 760 struct user_namespace *ns = seq->private;
761 struct user_namespace *seq_ns = seq_user_ns(seq);
684 762
685 if (!ns->parent) 763 if (!ns->parent)
686 return -EPERM; 764 return -EPERM;
687 765
766 if ((seq_ns != ns) && (seq_ns != ns->parent))
767 return -EPERM;
768
688 return map_write(file, buf, size, ppos, CAP_SETGID, 769 return map_write(file, buf, size, ppos, CAP_SETGID,
689 &ns->gid_map, &ns->parent->gid_map); 770 &ns->gid_map, &ns->parent->gid_map);
690} 771}
@@ -709,6 +790,21 @@ ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t
709static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, 790static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
710 struct uid_gid_map *new_map) 791 struct uid_gid_map *new_map)
711{ 792{
793 /* Allow mapping to your own filesystem ids */
794 if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1)) {
795 u32 id = new_map->extent[0].lower_first;
796 if (cap_setid == CAP_SETUID) {
797 kuid_t uid = make_kuid(ns->parent, id);
798 if (uid_eq(uid, current_fsuid()))
799 return true;
800 }
801 else if (cap_setid == CAP_SETGID) {
802 kgid_t gid = make_kgid(ns->parent, id);
803 if (gid_eq(gid, current_fsgid()))
804 return true;
805 }
806 }
807
712 /* Allow anyone to set a mapping that doesn't require privilege */ 808 /* Allow anyone to set a mapping that doesn't require privilege */
713 if (!cap_valid(cap_setid)) 809 if (!cap_valid(cap_setid))
714 return true; 810 return true;
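
The new single-extent rule is what lets a completely unprivileged process set up its own mapping after unshare(CLONE_NEWUSER). A userspace sketch under that assumption (later kernels additionally require handling /proc/self/setgroups before writing gid_map):

#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	char buf[64];
	int fd;

	if (unshare(CLONE_NEWUSER))		/* become root of a new user ns */
		return 1;
	snprintf(buf, sizeof(buf), "0 %u 1\n", (unsigned)getuid());
	fd = open("/proc/self/uid_map", O_WRONLY);
	if (fd < 0 || write(fd, buf, strlen(buf)) < 0)
		return 1;			/* single extent: 0 -> our fsuid */
	close(fd);
	return 0;
}
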
@@ -722,6 +818,68 @@ static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
722 return false; 818 return false;
723} 819}
724 820
821static void *userns_get(struct task_struct *task)
822{
823 struct user_namespace *user_ns;
824
825 rcu_read_lock();
826 user_ns = get_user_ns(__task_cred(task)->user_ns);
827 rcu_read_unlock();
828
829 return user_ns;
830}
831
832static void userns_put(void *ns)
833{
834 put_user_ns(ns);
835}
836
837static int userns_install(struct nsproxy *nsproxy, void *ns)
838{
839 struct user_namespace *user_ns = ns;
840 struct cred *cred;
841
842 /* Don't allow gaining capabilities by reentering
843 * the same user namespace.
844 */
845 if (user_ns == current_user_ns())
846 return -EINVAL;
847
848 /* Threaded processes may not enter a different user namespace */
849 if (atomic_read(&current->mm->mm_users) > 1)
850 return -EINVAL;
851
852 if (current->fs->users != 1)
853 return -EINVAL;
854
855 if (!ns_capable(user_ns, CAP_SYS_ADMIN))
856 return -EPERM;
857
858 cred = prepare_creds();
859 if (!cred)
860 return -ENOMEM;
861
862 put_user_ns(cred->user_ns);
863 set_cred_user_ns(cred, get_user_ns(user_ns));
864
865 return commit_creds(cred);
866}
867
868static unsigned int userns_inum(void *ns)
869{
870 struct user_namespace *user_ns = ns;
871 return user_ns->proc_inum;
872}
873
874const struct proc_ns_operations userns_operations = {
875 .name = "user",
876 .type = CLONE_NEWUSER,
877 .get = userns_get,
878 .put = userns_put,
879 .install = userns_install,
880 .inum = userns_inum,
881};
882
725static __init int user_namespaces_init(void) 883static __init int user_namespaces_init(void)
726{ 884{
727 user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC); 885 user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC);
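
userns_install() spells out the conditions for joining an existing user namespace with setns(2): not the caller's current namespace, single-threaded, an unshared fs_struct, and CAP_SYS_ADMIN in the target. A userspace sketch (the pid is hypothetical and the glibc setns() wrapper is assumed):

#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <unistd.h>

static int join_userns(pid_t pid)
{
	char path[64];
	int fd;

	snprintf(path, sizeof(path), "/proc/%d/ns/user", (int)pid);
	fd = open(path, O_RDONLY);
	if (fd < 0)
		return -1;
	if (setns(fd, CLONE_NEWUSER)) {	/* fails if any rule above is violated */
		close(fd);
		return -1;
	}
	close(fd);
	return 0;
}
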
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 679d97a5d3fd..a47fc5de3113 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -30,20 +30,27 @@ static struct uts_namespace *create_uts_ns(void)
30/* 30/*
31 * Clone a new ns copying an original utsname, setting refcount to 1 31 * Clone a new ns copying an original utsname, setting refcount to 1
32 * @old_ns: namespace to clone 32 * @old_ns: namespace to clone
33 * Return NULL on error (failure to kmalloc), new ns otherwise 33 * Return ERR_PTR(-ENOMEM) on error (failure to kmalloc), new ns otherwise
34 */ 34 */
35static struct uts_namespace *clone_uts_ns(struct task_struct *tsk, 35static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns,
36 struct uts_namespace *old_ns) 36 struct uts_namespace *old_ns)
37{ 37{
38 struct uts_namespace *ns; 38 struct uts_namespace *ns;
39 int err;
39 40
40 ns = create_uts_ns(); 41 ns = create_uts_ns();
41 if (!ns) 42 if (!ns)
42 return ERR_PTR(-ENOMEM); 43 return ERR_PTR(-ENOMEM);
43 44
45 err = proc_alloc_inum(&ns->proc_inum);
46 if (err) {
47 kfree(ns);
48 return ERR_PTR(err);
49 }
50
44 down_read(&uts_sem); 51 down_read(&uts_sem);
45 memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); 52 memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
46 ns->user_ns = get_user_ns(task_cred_xxx(tsk, user_ns)); 53 ns->user_ns = get_user_ns(user_ns);
47 up_read(&uts_sem); 54 up_read(&uts_sem);
48 return ns; 55 return ns;
49} 56}
@@ -55,9 +62,8 @@ static struct uts_namespace *clone_uts_ns(struct task_struct *tsk,
55 * versa. 62 * versa.
56 */ 63 */
57struct uts_namespace *copy_utsname(unsigned long flags, 64struct uts_namespace *copy_utsname(unsigned long flags,
58 struct task_struct *tsk) 65 struct user_namespace *user_ns, struct uts_namespace *old_ns)
59{ 66{
60 struct uts_namespace *old_ns = tsk->nsproxy->uts_ns;
61 struct uts_namespace *new_ns; 67 struct uts_namespace *new_ns;
62 68
63 BUG_ON(!old_ns); 69 BUG_ON(!old_ns);
@@ -66,7 +72,7 @@ struct uts_namespace *copy_utsname(unsigned long flags,
66 if (!(flags & CLONE_NEWUTS)) 72 if (!(flags & CLONE_NEWUTS))
67 return old_ns; 73 return old_ns;
68 74
69 new_ns = clone_uts_ns(tsk, old_ns); 75 new_ns = clone_uts_ns(user_ns, old_ns);
70 76
71 put_uts_ns(old_ns); 77 put_uts_ns(old_ns);
72 return new_ns; 78 return new_ns;
@@ -78,6 +84,7 @@ void free_uts_ns(struct kref *kref)
78 84
79 ns = container_of(kref, struct uts_namespace, kref); 85 ns = container_of(kref, struct uts_namespace, kref);
80 put_user_ns(ns->user_ns); 86 put_user_ns(ns->user_ns);
87 proc_free_inum(ns->proc_inum);
81 kfree(ns); 88 kfree(ns);
82} 89}
83 90
@@ -102,19 +109,32 @@ static void utsns_put(void *ns)
102 put_uts_ns(ns); 109 put_uts_ns(ns);
103} 110}
104 111
105static int utsns_install(struct nsproxy *nsproxy, void *ns) 112static int utsns_install(struct nsproxy *nsproxy, void *new)
106{ 113{
114 struct uts_namespace *ns = new;
115
116 if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) ||
117 !nsown_capable(CAP_SYS_ADMIN))
118 return -EPERM;
119
107 get_uts_ns(ns); 120 get_uts_ns(ns);
108 put_uts_ns(nsproxy->uts_ns); 121 put_uts_ns(nsproxy->uts_ns);
109 nsproxy->uts_ns = ns; 122 nsproxy->uts_ns = ns;
110 return 0; 123 return 0;
111} 124}
112 125
126static unsigned int utsns_inum(void *vp)
127{
128 struct uts_namespace *ns = vp;
129
130 return ns->proc_inum;
131}
132
113const struct proc_ns_operations utsns_operations = { 133const struct proc_ns_operations utsns_operations = {
114 .name = "uts", 134 .name = "uts",
115 .type = CLONE_NEWUTS, 135 .type = CLONE_NEWUTS,
116 .get = utsns_get, 136 .get = utsns_get,
117 .put = utsns_put, 137 .put = utsns_put,
118 .install = utsns_install, 138 .install = utsns_install,
139 .inum = utsns_inum,
119}; 140};
120
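
utsns_install() now refuses a setns() unless the caller has CAP_SYS_ADMIN both over the target namespace's owning user namespace and in its own user namespace, and the new utsns_inum() callback exposes the proc inode number allocated in clone_uts_ns(). The sketch below only illustrates the ops-table shape that proc_ns_operations follows; demo_ns and demo_ops are hypothetical names, not kernel structures.

#include <stdio.h>

struct ns_ops {
        const char *name;
        int  (*install)(void *ns);              /* may fail, e.g. with -EPERM */
        unsigned int (*inum)(void *ns);         /* inode shown in /proc/<pid>/ns */
};

struct demo_ns { unsigned int proc_inum; int privileged; };

static int demo_install(void *p)
{
        struct demo_ns *ns = p;

        return ns->privileged ? 0 : -1;         /* stand-in for the EPERM check */
}

static unsigned int demo_inum(void *p)
{
        return ((struct demo_ns *)p)->proc_inum;
}

static const struct ns_ops demo_ops = {
        .name    = "demo",
        .install = demo_install,
        .inum    = demo_inum,
};

int main(void)
{
        struct demo_ns ns = { .proc_inum = 0xeffffffa, .privileged = 1 };

        printf("%s ns inum=%u install=%d\n",
               demo_ops.name, demo_ops.inum(&ns), demo_ops.install(&ns));
        return 0;
}

The uniform get/put/install/inum interface is what lets the /proc/<pid>/ns/* files and setns(2) treat every namespace type the same way.
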
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index 63da38c2d820..4f69f9a5e221 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -15,6 +15,8 @@
15#include <linux/sysctl.h> 15#include <linux/sysctl.h>
16#include <linux/wait.h> 16#include <linux/wait.h>
17 17
18#ifdef CONFIG_PROC_SYSCTL
19
18static void *get_uts(ctl_table *table, int write) 20static void *get_uts(ctl_table *table, int write)
19{ 21{
20 char *which = table->data; 22 char *which = table->data;
@@ -38,7 +40,6 @@ static void put_uts(ctl_table *table, int write, void *which)
38 up_write(&uts_sem); 40 up_write(&uts_sem);
39} 41}
40 42
41#ifdef CONFIG_PROC_SYSCTL
42/* 43/*
43 * Special case of dostring for the UTS structure. This has locks 44 * Special case of dostring for the UTS structure. This has locks
44 * to observe. Should this be in kernel/sys.c ???? 45 * to observe. Should this be in kernel/sys.c ????
diff --git a/kernel/wait.c b/kernel/wait.c
index 7fdd9eaca2c3..6698e0c04ead 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * Generic waiting primitives. 2 * Generic waiting primitives.
3 * 3 *
4 * (C) 2004 William Irwin, Oracle 4 * (C) 2004 Nadia Yvette Chambers, Oracle
5 */ 5 */
6#include <linux/init.h> 6#include <linux/init.h>
7#include <linux/export.h> 7#include <linux/export.h>
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 9d4c8d5a1f53..4a944676358e 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -23,6 +23,7 @@
23#include <linux/module.h> 23#include <linux/module.h>
24#include <linux/sysctl.h> 24#include <linux/sysctl.h>
25#include <linux/smpboot.h> 25#include <linux/smpboot.h>
26#include <linux/sched/rt.h>
26 27
27#include <asm/irq_regs.h> 28#include <asm/irq_regs.h>
28#include <linux/kvm_para.h> 29#include <linux/kvm_para.h>
@@ -31,6 +32,7 @@
31int watchdog_enabled = 1; 32int watchdog_enabled = 1;
32int __read_mostly watchdog_thresh = 10; 33int __read_mostly watchdog_thresh = 10;
33static int __read_mostly watchdog_disabled; 34static int __read_mostly watchdog_disabled;
35static u64 __read_mostly sample_period;
34 36
35static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); 37static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
36static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); 38static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
@@ -111,12 +113,12 @@ static int get_softlockup_thresh(void)
111 * resolution, and we don't need to waste time with a big divide when 113 * resolution, and we don't need to waste time with a big divide when
112 * 2^30ns == 1.074s. 114 * 2^30ns == 1.074s.
113 */ 115 */
114static unsigned long get_timestamp(int this_cpu) 116static unsigned long get_timestamp(void)
115{ 117{
116 return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ 118 return local_clock() >> 30LL; /* 2^30 ~= 10^9 */
117} 119}
118 120
119static unsigned long get_sample_period(void) 121static void set_sample_period(void)
120{ 122{
121 /* 123 /*
122 * convert watchdog_thresh from seconds to ns 124 * convert watchdog_thresh from seconds to ns
@@ -125,15 +127,13 @@ static unsigned long get_sample_period(void)
125 * and hard thresholds) to increment before the 127 * and hard thresholds) to increment before the
126 * hardlockup detector generates a warning 128 * hardlockup detector generates a warning
127 */ 129 */
128 return get_softlockup_thresh() * (NSEC_PER_SEC / 5); 130 sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5);
129} 131}
130 132
131/* Commands for resetting the watchdog */ 133/* Commands for resetting the watchdog */
132static void __touch_watchdog(void) 134static void __touch_watchdog(void)
133{ 135{
134 int this_cpu = smp_processor_id(); 136 __this_cpu_write(watchdog_touch_ts, get_timestamp());
135
136 __this_cpu_write(watchdog_touch_ts, get_timestamp(this_cpu));
137} 137}
138 138
139void touch_softlockup_watchdog(void) 139void touch_softlockup_watchdog(void)
@@ -194,7 +194,7 @@ static int is_hardlockup(void)
194 194
195static int is_softlockup(unsigned long touch_ts) 195static int is_softlockup(unsigned long touch_ts)
196{ 196{
197 unsigned long now = get_timestamp(smp_processor_id()); 197 unsigned long now = get_timestamp();
198 198
199 /* Warn about unreasonable delays: */ 199 /* Warn about unreasonable delays: */
200 if (time_after(now, touch_ts + get_softlockup_thresh())) 200 if (time_after(now, touch_ts + get_softlockup_thresh()))
@@ -275,7 +275,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
275 wake_up_process(__this_cpu_read(softlockup_watchdog)); 275 wake_up_process(__this_cpu_read(softlockup_watchdog));
276 276
277 /* .. and repeat */ 277 /* .. and repeat */
278 hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period())); 278 hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));
279 279
280 if (touch_ts == 0) { 280 if (touch_ts == 0) {
281 if (unlikely(__this_cpu_read(softlockup_touch_sync))) { 281 if (unlikely(__this_cpu_read(softlockup_touch_sync))) {
@@ -343,6 +343,10 @@ static void watchdog_enable(unsigned int cpu)
343{ 343{
344 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); 344 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
345 345
346 /* kick off the timer for the hardlockup detector */
347 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
348 hrtimer->function = watchdog_timer_fn;
349
346 if (!watchdog_enabled) { 350 if (!watchdog_enabled) {
347 kthread_park(current); 351 kthread_park(current);
348 return; 352 return;
@@ -351,12 +355,8 @@ static void watchdog_enable(unsigned int cpu)
351 /* Enable the perf event */ 355 /* Enable the perf event */
352 watchdog_nmi_enable(cpu); 356 watchdog_nmi_enable(cpu);
353 357
354 /* kick off the timer for the hardlockup detector */
355 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
356 hrtimer->function = watchdog_timer_fn;
357
358 /* done here because hrtimer_start can only pin to smp_processor_id() */ 358 /* done here because hrtimer_start can only pin to smp_processor_id() */
359 hrtimer_start(hrtimer, ns_to_ktime(get_sample_period()), 359 hrtimer_start(hrtimer, ns_to_ktime(sample_period),
360 HRTIMER_MODE_REL_PINNED); 360 HRTIMER_MODE_REL_PINNED);
361 361
362 /* initialize timestamp */ 362 /* initialize timestamp */
@@ -383,7 +383,7 @@ static int watchdog_should_run(unsigned int cpu)
383/* 383/*
384 * The watchdog thread function - touches the timestamp. 384 * The watchdog thread function - touches the timestamp.
385 * 385 *
386 * It only runs once every get_sample_period() seconds (4 seconds by 386 * It only runs once every sample_period seconds (4 seconds by
387 * default) to reset the softlockup timestamp. If this gets delayed 387 * default) to reset the softlockup timestamp. If this gets delayed
388 * for more than 2*watchdog_thresh seconds then the debug-printout 388 * for more than 2*watchdog_thresh seconds then the debug-printout
389 * triggers in watchdog_timer_fn(). 389 * triggers in watchdog_timer_fn().
@@ -516,6 +516,7 @@ int proc_dowatchdog(struct ctl_table *table, int write,
516 if (ret || !write) 516 if (ret || !write)
517 return ret; 517 return ret;
518 518
519 set_sample_period();
519 if (watchdog_enabled && watchdog_thresh) 520 if (watchdog_enabled && watchdog_thresh)
520 watchdog_enable_all_cpus(); 521 watchdog_enable_all_cpus();
521 else 522 else
@@ -537,6 +538,7 @@ static struct smp_hotplug_thread watchdog_threads = {
537 538
538void __init lockup_detector_init(void) 539void __init lockup_detector_init(void)
539{ 540{
541 set_sample_period();
540 if (smpboot_register_percpu_thread(&watchdog_threads)) { 542 if (smpboot_register_percpu_thread(&watchdog_threads)) {
541 pr_err("Failed to create watchdog threads, disabled\n"); 543 pr_err("Failed to create watchdog threads, disabled\n");
542 watchdog_disabled = -ENODEV; 544 watchdog_disabled = -ENODEV;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 042d221d33cc..b48cd597145d 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -41,32 +41,31 @@
41#include <linux/debug_locks.h> 41#include <linux/debug_locks.h>
42#include <linux/lockdep.h> 42#include <linux/lockdep.h>
43#include <linux/idr.h> 43#include <linux/idr.h>
44#include <linux/hashtable.h>
44 45
45#include "workqueue_sched.h" 46#include "workqueue_internal.h"
46 47
47enum { 48enum {
48 /* 49 /*
49 * global_cwq flags 50 * worker_pool flags
50 * 51 *
51 * A bound gcwq is either associated or disassociated with its CPU. 52 * A bound pool is either associated or disassociated with its CPU.
52 * While associated (!DISASSOCIATED), all workers are bound to the 53 * While associated (!DISASSOCIATED), all workers are bound to the
53 * CPU and none has %WORKER_UNBOUND set and concurrency management 54 * CPU and none has %WORKER_UNBOUND set and concurrency management
54 * is in effect. 55 * is in effect.
55 * 56 *
56 * While DISASSOCIATED, the cpu may be offline and all workers have 57 * While DISASSOCIATED, the cpu may be offline and all workers have
57 * %WORKER_UNBOUND set and concurrency management disabled, and may 58 * %WORKER_UNBOUND set and concurrency management disabled, and may
58 * be executing on any CPU. The gcwq behaves as an unbound one. 59 * be executing on any CPU. The pool behaves as an unbound one.
59 * 60 *
60 * Note that DISASSOCIATED can be flipped only while holding 61 * Note that DISASSOCIATED can be flipped only while holding
61 * assoc_mutex of all pools on the gcwq to avoid changing binding 62 * assoc_mutex to avoid changing binding state while
62 * state while create_worker() is in progress. 63 * create_worker() is in progress.
63 */ 64 */
64 GCWQ_DISASSOCIATED = 1 << 0, /* cpu can't serve workers */
65 GCWQ_FREEZING = 1 << 1, /* freeze in progress */
66
67 /* pool flags */
68 POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ 65 POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */
69 POOL_MANAGING_WORKERS = 1 << 1, /* managing workers */ 66 POOL_MANAGING_WORKERS = 1 << 1, /* managing workers */
67 POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */
68 POOL_FREEZING = 1 << 3, /* freeze in progress */
70 69
71 /* worker flags */ 70 /* worker flags */
72 WORKER_STARTED = 1 << 0, /* started */ 71 WORKER_STARTED = 1 << 0, /* started */
@@ -79,11 +78,9 @@ enum {
79 WORKER_NOT_RUNNING = WORKER_PREP | WORKER_UNBOUND | 78 WORKER_NOT_RUNNING = WORKER_PREP | WORKER_UNBOUND |
80 WORKER_CPU_INTENSIVE, 79 WORKER_CPU_INTENSIVE,
81 80
82 NR_WORKER_POOLS = 2, /* # worker pools per gcwq */ 81 NR_STD_WORKER_POOLS = 2, /* # standard pools per cpu */
83 82
84 BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */ 83 BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */
85 BUSY_WORKER_HASH_SIZE = 1 << BUSY_WORKER_HASH_ORDER,
86 BUSY_WORKER_HASH_MASK = BUSY_WORKER_HASH_SIZE - 1,
87 84
88 MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */ 85 MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */
89 IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */ 86 IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */
@@ -111,48 +108,24 @@ enum {
111 * P: Preemption protected. Disabling preemption is enough and should 108 * P: Preemption protected. Disabling preemption is enough and should
112 * only be modified and accessed from the local cpu. 109 * only be modified and accessed from the local cpu.
113 * 110 *
114 * L: gcwq->lock protected. Access with gcwq->lock held. 111 * L: pool->lock protected. Access with pool->lock held.
115 * 112 *
116 * X: During normal operation, modification requires gcwq->lock and 113 * X: During normal operation, modification requires pool->lock and should
117 * should be done only from local cpu. Either disabling preemption 114 * be done only from local cpu. Either disabling preemption on local
118 * on local cpu or grabbing gcwq->lock is enough for read access. 115 * cpu or grabbing pool->lock is enough for read access. If
119 * If GCWQ_DISASSOCIATED is set, it's identical to L. 116 * POOL_DISASSOCIATED is set, it's identical to L.
120 * 117 *
121 * F: wq->flush_mutex protected. 118 * F: wq->flush_mutex protected.
122 * 119 *
123 * W: workqueue_lock protected. 120 * W: workqueue_lock protected.
124 */ 121 */
125 122
126struct global_cwq; 123/* struct worker is defined in workqueue_internal.h */
127struct worker_pool;
128
129/*
130 * The poor guys doing the actual heavy lifting. All on-duty workers
131 * are either serving the manager role, on idle list or on busy hash.
132 */
133struct worker {
134 /* on idle list while idle, on busy hash table while busy */
135 union {
136 struct list_head entry; /* L: while idle */
137 struct hlist_node hentry; /* L: while busy */
138 };
139
140 struct work_struct *current_work; /* L: work being processed */
141 struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */
142 struct list_head scheduled; /* L: scheduled works */
143 struct task_struct *task; /* I: worker task */
144 struct worker_pool *pool; /* I: the associated pool */
145 /* 64 bytes boundary on 64bit, 32 on 32bit */
146 unsigned long last_active; /* L: last active timestamp */
147 unsigned int flags; /* X: flags */
148 int id; /* I: worker id */
149
150 /* for rebinding worker to CPU */
151 struct work_struct rebind_work; /* L: for busy worker */
152};
153 124
154struct worker_pool { 125struct worker_pool {
155 struct global_cwq *gcwq; /* I: the owning gcwq */ 126 spinlock_t lock; /* the pool lock */
127 unsigned int cpu; /* I: the associated cpu */
128 int id; /* I: pool ID */
156 unsigned int flags; /* X: flags */ 129 unsigned int flags; /* X: flags */
157 130
158 struct list_head worklist; /* L: list of pending works */ 131 struct list_head worklist; /* L: list of pending works */
@@ -165,34 +138,28 @@ struct worker_pool {
165 struct timer_list idle_timer; /* L: worker idle timeout */ 138 struct timer_list idle_timer; /* L: worker idle timeout */
166 struct timer_list mayday_timer; /* L: SOS timer for workers */ 139 struct timer_list mayday_timer; /* L: SOS timer for workers */
167 140
168 struct mutex assoc_mutex; /* protect GCWQ_DISASSOCIATED */ 141 /* workers are chained either in busy_hash or idle_list */
169 struct ida worker_ida; /* L: for worker IDs */ 142 DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER);
170};
171
172/*
173 * Global per-cpu workqueue. There's one and only one for each cpu
174 * and all works are queued and processed here regardless of their
175 * target workqueues.
176 */
177struct global_cwq {
178 spinlock_t lock; /* the gcwq lock */
179 unsigned int cpu; /* I: the associated cpu */
180 unsigned int flags; /* L: GCWQ_* flags */
181
182 /* workers are chained either in busy_hash or pool idle_list */
183 struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE];
184 /* L: hash of busy workers */ 143 /* L: hash of busy workers */
185 144
186 struct worker_pool pools[NR_WORKER_POOLS]; 145 struct mutex assoc_mutex; /* protect POOL_DISASSOCIATED */
187 /* normal and highpri pools */ 146 struct ida worker_ida; /* L: for worker IDs */
147
148 /*
149 * The current concurrency level. As it's likely to be accessed
150 * from other CPUs during try_to_wake_up(), put it in a separate
151 * cacheline.
152 */
153 atomic_t nr_running ____cacheline_aligned_in_smp;
188} ____cacheline_aligned_in_smp; 154} ____cacheline_aligned_in_smp;
189 155
190/* 156/*
191 * The per-CPU workqueue. The lower WORK_STRUCT_FLAG_BITS of 157 * The per-pool workqueue. While queued, the lower WORK_STRUCT_FLAG_BITS
192 * work_struct->data are used for flags and thus cwqs need to be 158 * of work_struct->data are used for flags and the remaining high bits
193 * aligned at two's power of the number of flag bits. 159 * point to the pwq; thus, pwqs need to be aligned at two's power of the
160 * number of flag bits.
194 */ 161 */
195struct cpu_workqueue_struct { 162struct pool_workqueue {
196 struct worker_pool *pool; /* I: the associated pool */ 163 struct worker_pool *pool; /* I: the associated pool */
197 struct workqueue_struct *wq; /* I: the owning workqueue */ 164 struct workqueue_struct *wq; /* I: the owning workqueue */
198 int work_color; /* L: current color */ 165 int work_color; /* L: current color */
@@ -241,16 +208,16 @@ typedef unsigned long mayday_mask_t;
241struct workqueue_struct { 208struct workqueue_struct {
242 unsigned int flags; /* W: WQ_* flags */ 209 unsigned int flags; /* W: WQ_* flags */
243 union { 210 union {
244 struct cpu_workqueue_struct __percpu *pcpu; 211 struct pool_workqueue __percpu *pcpu;
245 struct cpu_workqueue_struct *single; 212 struct pool_workqueue *single;
246 unsigned long v; 213 unsigned long v;
247 } cpu_wq; /* I: cwq's */ 214 } pool_wq; /* I: pwq's */
248 struct list_head list; /* W: list of all workqueues */ 215 struct list_head list; /* W: list of all workqueues */
249 216
250 struct mutex flush_mutex; /* protects wq flushing */ 217 struct mutex flush_mutex; /* protects wq flushing */
251 int work_color; /* F: current work color */ 218 int work_color; /* F: current work color */
252 int flush_color; /* F: current flush color */ 219 int flush_color; /* F: current flush color */
253 atomic_t nr_cwqs_to_flush; /* flush in progress */ 220 atomic_t nr_pwqs_to_flush; /* flush in progress */
254 struct wq_flusher *first_flusher; /* F: first flusher */ 221 struct wq_flusher *first_flusher; /* F: first flusher */
255 struct list_head flusher_queue; /* F: flush waiters */ 222 struct list_head flusher_queue; /* F: flush waiters */
256 struct list_head flusher_overflow; /* F: flush overflow list */ 223 struct list_head flusher_overflow; /* F: flush overflow list */
@@ -259,7 +226,7 @@ struct workqueue_struct {
259 struct worker *rescuer; /* I: rescue worker */ 226 struct worker *rescuer; /* I: rescue worker */
260 227
261 int nr_drainers; /* W: drain in progress */ 228 int nr_drainers; /* W: drain in progress */
262 int saved_max_active; /* W: saved cwq max_active */ 229 int saved_max_active; /* W: saved pwq max_active */
263#ifdef CONFIG_LOCKDEP 230#ifdef CONFIG_LOCKDEP
264 struct lockdep_map lockdep_map; 231 struct lockdep_map lockdep_map;
265#endif 232#endif
@@ -280,16 +247,15 @@ EXPORT_SYMBOL_GPL(system_freezable_wq);
280#define CREATE_TRACE_POINTS 247#define CREATE_TRACE_POINTS
281#include <trace/events/workqueue.h> 248#include <trace/events/workqueue.h>
282 249
283#define for_each_worker_pool(pool, gcwq) \ 250#define for_each_std_worker_pool(pool, cpu) \
284 for ((pool) = &(gcwq)->pools[0]; \ 251 for ((pool) = &std_worker_pools(cpu)[0]; \
285 (pool) < &(gcwq)->pools[NR_WORKER_POOLS]; (pool)++) 252 (pool) < &std_worker_pools(cpu)[NR_STD_WORKER_POOLS]; (pool)++)
286 253
287#define for_each_busy_worker(worker, i, pos, gcwq) \ 254#define for_each_busy_worker(worker, i, pool) \
288 for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \ 255 hash_for_each(pool->busy_hash, i, worker, hentry)
289 hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry)
290 256
291static inline int __next_gcwq_cpu(int cpu, const struct cpumask *mask, 257static inline int __next_wq_cpu(int cpu, const struct cpumask *mask,
292 unsigned int sw) 258 unsigned int sw)
293{ 259{
294 if (cpu < nr_cpu_ids) { 260 if (cpu < nr_cpu_ids) {
295 if (sw & 1) { 261 if (sw & 1) {
@@ -300,42 +266,42 @@ static inline int __next_gcwq_cpu(int cpu, const struct cpumask *mask,
300 if (sw & 2) 266 if (sw & 2)
301 return WORK_CPU_UNBOUND; 267 return WORK_CPU_UNBOUND;
302 } 268 }
303 return WORK_CPU_NONE; 269 return WORK_CPU_END;
304} 270}
305 271
306static inline int __next_wq_cpu(int cpu, const struct cpumask *mask, 272static inline int __next_pwq_cpu(int cpu, const struct cpumask *mask,
307 struct workqueue_struct *wq) 273 struct workqueue_struct *wq)
308{ 274{
309 return __next_gcwq_cpu(cpu, mask, !(wq->flags & WQ_UNBOUND) ? 1 : 2); 275 return __next_wq_cpu(cpu, mask, !(wq->flags & WQ_UNBOUND) ? 1 : 2);
310} 276}
311 277
312/* 278/*
313 * CPU iterators 279 * CPU iterators
314 * 280 *
315 * An extra gcwq is defined for an invalid cpu number 281 * An extra cpu number is defined using an invalid cpu number
316 * (WORK_CPU_UNBOUND) to host workqueues which are not bound to any 282 * (WORK_CPU_UNBOUND) to host workqueues which are not bound to any
317 * specific CPU. The following iterators are similar to 283 * specific CPU. The following iterators are similar to for_each_*_cpu()
318 * for_each_*_cpu() iterators but also considers the unbound gcwq. 284 * iterators but also considers the unbound CPU.
319 * 285 *
320 * for_each_gcwq_cpu() : possible CPUs + WORK_CPU_UNBOUND 286 * for_each_wq_cpu() : possible CPUs + WORK_CPU_UNBOUND
321 * for_each_online_gcwq_cpu() : online CPUs + WORK_CPU_UNBOUND 287 * for_each_online_wq_cpu() : online CPUs + WORK_CPU_UNBOUND
322 * for_each_cwq_cpu() : possible CPUs for bound workqueues, 288 * for_each_pwq_cpu() : possible CPUs for bound workqueues,
323 * WORK_CPU_UNBOUND for unbound workqueues 289 * WORK_CPU_UNBOUND for unbound workqueues
324 */ 290 */
325#define for_each_gcwq_cpu(cpu) \ 291#define for_each_wq_cpu(cpu) \
326 for ((cpu) = __next_gcwq_cpu(-1, cpu_possible_mask, 3); \ 292 for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, 3); \
327 (cpu) < WORK_CPU_NONE; \ 293 (cpu) < WORK_CPU_END; \
328 (cpu) = __next_gcwq_cpu((cpu), cpu_possible_mask, 3)) 294 (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, 3))
329 295
330#define for_each_online_gcwq_cpu(cpu) \ 296#define for_each_online_wq_cpu(cpu) \
331 for ((cpu) = __next_gcwq_cpu(-1, cpu_online_mask, 3); \ 297 for ((cpu) = __next_wq_cpu(-1, cpu_online_mask, 3); \
332 (cpu) < WORK_CPU_NONE; \ 298 (cpu) < WORK_CPU_END; \
333 (cpu) = __next_gcwq_cpu((cpu), cpu_online_mask, 3)) 299 (cpu) = __next_wq_cpu((cpu), cpu_online_mask, 3))
334 300
335#define for_each_cwq_cpu(cpu, wq) \ 301#define for_each_pwq_cpu(cpu, wq) \
336 for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, (wq)); \ 302 for ((cpu) = __next_pwq_cpu(-1, cpu_possible_mask, (wq)); \
337 (cpu) < WORK_CPU_NONE; \ 303 (cpu) < WORK_CPU_END; \
338 (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, (wq))) 304 (cpu) = __next_pwq_cpu((cpu), cpu_possible_mask, (wq)))
339 305
340#ifdef CONFIG_DEBUG_OBJECTS_WORK 306#ifdef CONFIG_DEBUG_OBJECTS_WORK
341 307
@@ -459,57 +425,70 @@ static LIST_HEAD(workqueues);
459static bool workqueue_freezing; /* W: have wqs started freezing? */ 425static bool workqueue_freezing; /* W: have wqs started freezing? */
460 426
461/* 427/*
462 * The almighty global cpu workqueues. nr_running is the only field 428 * The CPU and unbound standard worker pools. The unbound ones have
463 * which is expected to be used frequently by other cpus via 429 * POOL_DISASSOCIATED set, and their workers have WORKER_UNBOUND set.
464 * try_to_wake_up(). Put it in a separate cacheline.
465 */ 430 */
466static DEFINE_PER_CPU(struct global_cwq, global_cwq); 431static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
467static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, pool_nr_running[NR_WORKER_POOLS]); 432 cpu_std_worker_pools);
433static struct worker_pool unbound_std_worker_pools[NR_STD_WORKER_POOLS];
468 434
469/* 435/* idr of all pools */
470 * Global cpu workqueue and nr_running counter for unbound gcwq. The 436static DEFINE_MUTEX(worker_pool_idr_mutex);
471 * gcwq is always online, has GCWQ_DISASSOCIATED set, and all its 437static DEFINE_IDR(worker_pool_idr);
472 * workers have WORKER_UNBOUND set.
473 */
474static struct global_cwq unbound_global_cwq;
475static atomic_t unbound_pool_nr_running[NR_WORKER_POOLS] = {
476 [0 ... NR_WORKER_POOLS - 1] = ATOMIC_INIT(0), /* always 0 */
477};
478 438
479static int worker_thread(void *__worker); 439static int worker_thread(void *__worker);
480 440
481static int worker_pool_pri(struct worker_pool *pool) 441static struct worker_pool *std_worker_pools(int cpu)
442{
443 if (cpu != WORK_CPU_UNBOUND)
444 return per_cpu(cpu_std_worker_pools, cpu);
445 else
446 return unbound_std_worker_pools;
447}
448
449static int std_worker_pool_pri(struct worker_pool *pool)
482{ 450{
483 return pool - pool->gcwq->pools; 451 return pool - std_worker_pools(pool->cpu);
484} 452}
485 453
486static struct global_cwq *get_gcwq(unsigned int cpu) 454/* allocate ID and assign it to @pool */
455static int worker_pool_assign_id(struct worker_pool *pool)
487{ 456{
488 if (cpu != WORK_CPU_UNBOUND) 457 int ret;
489 return &per_cpu(global_cwq, cpu); 458
490 else 459 mutex_lock(&worker_pool_idr_mutex);
491 return &unbound_global_cwq; 460 ret = idr_alloc(&worker_pool_idr, pool, 0, 0, GFP_KERNEL);
461 if (ret >= 0)
462 pool->id = ret;
463 mutex_unlock(&worker_pool_idr_mutex);
464
465 return ret < 0 ? ret : 0;
492} 466}
493 467
494static atomic_t *get_pool_nr_running(struct worker_pool *pool) 468/*
469 * Lookup worker_pool by id. The idr currently is built during boot and
470 * never modified. Don't worry about locking for now.
471 */
472static struct worker_pool *worker_pool_by_id(int pool_id)
495{ 473{
496 int cpu = pool->gcwq->cpu; 474 return idr_find(&worker_pool_idr, pool_id);
497 int idx = worker_pool_pri(pool); 475}
498 476
499 if (cpu != WORK_CPU_UNBOUND) 477static struct worker_pool *get_std_worker_pool(int cpu, bool highpri)
500 return &per_cpu(pool_nr_running, cpu)[idx]; 478{
501 else 479 struct worker_pool *pools = std_worker_pools(cpu);
502 return &unbound_pool_nr_running[idx]; 480
481 return &pools[highpri];
503} 482}
504 483
505static struct cpu_workqueue_struct *get_cwq(unsigned int cpu, 484static struct pool_workqueue *get_pwq(unsigned int cpu,
506 struct workqueue_struct *wq) 485 struct workqueue_struct *wq)
507{ 486{
508 if (!(wq->flags & WQ_UNBOUND)) { 487 if (!(wq->flags & WQ_UNBOUND)) {
509 if (likely(cpu < nr_cpu_ids)) 488 if (likely(cpu < nr_cpu_ids))
510 return per_cpu_ptr(wq->cpu_wq.pcpu, cpu); 489 return per_cpu_ptr(wq->pool_wq.pcpu, cpu);
511 } else if (likely(cpu == WORK_CPU_UNBOUND)) 490 } else if (likely(cpu == WORK_CPU_UNBOUND))
512 return wq->cpu_wq.single; 491 return wq->pool_wq.single;
513 return NULL; 492 return NULL;
514} 493}
515 494
@@ -530,19 +509,19 @@ static int work_next_color(int color)
530} 509}
531 510
532/* 511/*
533 * While queued, %WORK_STRUCT_CWQ is set and non flag bits of a work's data 512 * While queued, %WORK_STRUCT_PWQ is set and non flag bits of a work's data
534 * contain the pointer to the queued cwq. Once execution starts, the flag 513 * contain the pointer to the queued pwq. Once execution starts, the flag
535 * is cleared and the high bits contain OFFQ flags and CPU number. 514 * is cleared and the high bits contain OFFQ flags and pool ID.
536 * 515 *
537 * set_work_cwq(), set_work_cpu_and_clear_pending(), mark_work_canceling() 516 * set_work_pwq(), set_work_pool_and_clear_pending(), mark_work_canceling()
538 * and clear_work_data() can be used to set the cwq, cpu or clear 517 * and clear_work_data() can be used to set the pwq, pool or clear
539 * work->data. These functions should only be called while the work is 518 * work->data. These functions should only be called while the work is
540 * owned - ie. while the PENDING bit is set. 519 * owned - ie. while the PENDING bit is set.
541 * 520 *
542 * get_work_[g]cwq() can be used to obtain the gcwq or cwq corresponding to 521 * get_work_pool() and get_work_pwq() can be used to obtain the pool or pwq
543 * a work. gcwq is available once the work has been queued anywhere after 522 * corresponding to a work. Pool is available once the work has been
544 * initialization until it is sync canceled. cwq is available only while 523 * queued anywhere after initialization until it is sync canceled. pwq is
545 * the work item is queued. 524 * available only while the work item is queued.
546 * 525 *
547 * %WORK_OFFQ_CANCELING is used to mark a work item which is being 526 * %WORK_OFFQ_CANCELING is used to mark a work item which is being
548 * canceled. While being canceled, a work item may have its PENDING set 527 * canceled. While being canceled, a work item may have its PENDING set
@@ -556,16 +535,22 @@ static inline void set_work_data(struct work_struct *work, unsigned long data,
556 atomic_long_set(&work->data, data | flags | work_static(work)); 535 atomic_long_set(&work->data, data | flags | work_static(work));
557} 536}
558 537
559static void set_work_cwq(struct work_struct *work, 538static void set_work_pwq(struct work_struct *work, struct pool_workqueue *pwq,
560 struct cpu_workqueue_struct *cwq,
561 unsigned long extra_flags) 539 unsigned long extra_flags)
562{ 540{
563 set_work_data(work, (unsigned long)cwq, 541 set_work_data(work, (unsigned long)pwq,
564 WORK_STRUCT_PENDING | WORK_STRUCT_CWQ | extra_flags); 542 WORK_STRUCT_PENDING | WORK_STRUCT_PWQ | extra_flags);
565} 543}
566 544
567static void set_work_cpu_and_clear_pending(struct work_struct *work, 545static void set_work_pool_and_keep_pending(struct work_struct *work,
568 unsigned int cpu) 546 int pool_id)
547{
548 set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT,
549 WORK_STRUCT_PENDING);
550}
551
552static void set_work_pool_and_clear_pending(struct work_struct *work,
553 int pool_id)
569{ 554{
570 /* 555 /*
571 * The following wmb is paired with the implied mb in 556 * The following wmb is paired with the implied mb in
@@ -574,67 +559,92 @@ static void set_work_cpu_and_clear_pending(struct work_struct *work,
574 * owner. 559 * owner.
575 */ 560 */
576 smp_wmb(); 561 smp_wmb();
577 set_work_data(work, (unsigned long)cpu << WORK_OFFQ_CPU_SHIFT, 0); 562 set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT, 0);
578} 563}
579 564
580static void clear_work_data(struct work_struct *work) 565static void clear_work_data(struct work_struct *work)
581{ 566{
582 smp_wmb(); /* see set_work_cpu_and_clear_pending() */ 567 smp_wmb(); /* see set_work_pool_and_clear_pending() */
583 set_work_data(work, WORK_STRUCT_NO_CPU, 0); 568 set_work_data(work, WORK_STRUCT_NO_POOL, 0);
584} 569}
585 570
586static struct cpu_workqueue_struct *get_work_cwq(struct work_struct *work) 571static struct pool_workqueue *get_work_pwq(struct work_struct *work)
587{ 572{
588 unsigned long data = atomic_long_read(&work->data); 573 unsigned long data = atomic_long_read(&work->data);
589 574
590 if (data & WORK_STRUCT_CWQ) 575 if (data & WORK_STRUCT_PWQ)
591 return (void *)(data & WORK_STRUCT_WQ_DATA_MASK); 576 return (void *)(data & WORK_STRUCT_WQ_DATA_MASK);
592 else 577 else
593 return NULL; 578 return NULL;
594} 579}
595 580
596static struct global_cwq *get_work_gcwq(struct work_struct *work) 581/**
582 * get_work_pool - return the worker_pool a given work was associated with
583 * @work: the work item of interest
584 *
585 * Return the worker_pool @work was last associated with. %NULL if none.
586 */
587static struct worker_pool *get_work_pool(struct work_struct *work)
597{ 588{
598 unsigned long data = atomic_long_read(&work->data); 589 unsigned long data = atomic_long_read(&work->data);
599 unsigned int cpu; 590 struct worker_pool *pool;
591 int pool_id;
600 592
601 if (data & WORK_STRUCT_CWQ) 593 if (data & WORK_STRUCT_PWQ)
602 return ((struct cpu_workqueue_struct *) 594 return ((struct pool_workqueue *)
603 (data & WORK_STRUCT_WQ_DATA_MASK))->pool->gcwq; 595 (data & WORK_STRUCT_WQ_DATA_MASK))->pool;
604 596
605 cpu = data >> WORK_OFFQ_CPU_SHIFT; 597 pool_id = data >> WORK_OFFQ_POOL_SHIFT;
606 if (cpu == WORK_CPU_NONE) 598 if (pool_id == WORK_OFFQ_POOL_NONE)
607 return NULL; 599 return NULL;
608 600
609 BUG_ON(cpu >= nr_cpu_ids && cpu != WORK_CPU_UNBOUND); 601 pool = worker_pool_by_id(pool_id);
610 return get_gcwq(cpu); 602 WARN_ON_ONCE(!pool);
603 return pool;
604}
605
606/**
607 * get_work_pool_id - return the worker pool ID a given work is associated with
608 * @work: the work item of interest
609 *
610 * Return the worker_pool ID @work was last associated with.
611 * %WORK_OFFQ_POOL_NONE if none.
612 */
613static int get_work_pool_id(struct work_struct *work)
614{
615 unsigned long data = atomic_long_read(&work->data);
616
617 if (data & WORK_STRUCT_PWQ)
618 return ((struct pool_workqueue *)
619 (data & WORK_STRUCT_WQ_DATA_MASK))->pool->id;
620
621 return data >> WORK_OFFQ_POOL_SHIFT;
611} 622}
612 623
613static void mark_work_canceling(struct work_struct *work) 624static void mark_work_canceling(struct work_struct *work)
614{ 625{
615 struct global_cwq *gcwq = get_work_gcwq(work); 626 unsigned long pool_id = get_work_pool_id(work);
616 unsigned long cpu = gcwq ? gcwq->cpu : WORK_CPU_NONE;
617 627
618 set_work_data(work, (cpu << WORK_OFFQ_CPU_SHIFT) | WORK_OFFQ_CANCELING, 628 pool_id <<= WORK_OFFQ_POOL_SHIFT;
619 WORK_STRUCT_PENDING); 629 set_work_data(work, pool_id | WORK_OFFQ_CANCELING, WORK_STRUCT_PENDING);
620} 630}
621 631
622static bool work_is_canceling(struct work_struct *work) 632static bool work_is_canceling(struct work_struct *work)
623{ 633{
624 unsigned long data = atomic_long_read(&work->data); 634 unsigned long data = atomic_long_read(&work->data);
625 635
626 return !(data & WORK_STRUCT_CWQ) && (data & WORK_OFFQ_CANCELING); 636 return !(data & WORK_STRUCT_PWQ) && (data & WORK_OFFQ_CANCELING);
627} 637}
628 638
629/* 639/*
630 * Policy functions. These define the policies on how the global worker 640 * Policy functions. These define the policies on how the global worker
631 * pools are managed. Unless noted otherwise, these functions assume that 641 * pools are managed. Unless noted otherwise, these functions assume that
632 * they're being called with gcwq->lock held. 642 * they're being called with pool->lock held.
633 */ 643 */
634 644
635static bool __need_more_worker(struct worker_pool *pool) 645static bool __need_more_worker(struct worker_pool *pool)
636{ 646{
637 return !atomic_read(get_pool_nr_running(pool)); 647 return !atomic_read(&pool->nr_running);
638} 648}
639 649
640/* 650/*
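
With the gcwq gone, an off-queue work item can no longer name its last location by CPU number, so the high bits of work->data now carry either an aligned pool_workqueue pointer (while queued, %WORK_STRUCT_PWQ set) or the last pool's ID shifted by WORK_OFFQ_POOL_SHIFT. A compact userspace sketch of that tagging scheme follows; the bit positions, shift, mask and the GCC-style aligned attribute are illustrative, not the exact values from include/linux/workqueue.h.

#include <stdio.h>

#define WORK_STRUCT_PENDING      (1UL << 0)
#define WORK_STRUCT_PWQ          (1UL << 2)
#define WORK_OFFQ_POOL_SHIFT     5
#define WORK_STRUCT_WQ_DATA_MASK (~0x1fUL)      /* pwqs are 32-byte aligned */

struct pool_workqueue { int pool_id; } __attribute__((aligned(32)));

static unsigned long set_work_pwq(struct pool_workqueue *pwq)
{
        return (unsigned long)pwq | WORK_STRUCT_PENDING | WORK_STRUCT_PWQ;
}

static unsigned long set_work_pool_and_clear_pending(int pool_id)
{
        return (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT;
}

static int get_work_pool_id(unsigned long data)
{
        if (data & WORK_STRUCT_PWQ)
                return ((struct pool_workqueue *)
                        (data & WORK_STRUCT_WQ_DATA_MASK))->pool_id;
        return data >> WORK_OFFQ_POOL_SHIFT;
}

int main(void)
{
        static struct pool_workqueue pwq = { .pool_id = 7 };

        unsigned long queued = set_work_pwq(&pwq);
        unsigned long idle   = set_work_pool_and_clear_pending(7);

        printf("queued -> pool %d, off-queue -> pool %d\n",
               get_work_pool_id(queued), get_work_pool_id(idle));
        return 0;
}

Either way, get_work_pool_id() recovers the same pool ID, which is what lets mark_work_canceling() and try_to_grab_pending() work on items regardless of whether they are currently queued.
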
@@ -642,7 +652,7 @@ static bool __need_more_worker(struct worker_pool *pool)
642 * running workers. 652 * running workers.
643 * 653 *
644 * Note that, because unbound workers never contribute to nr_running, this 654 * Note that, because unbound workers never contribute to nr_running, this
645 * function will always return %true for unbound gcwq as long as the 655 * function will always return %true for unbound pools as long as the
646 * worklist isn't empty. 656 * worklist isn't empty.
647 */ 657 */
648static bool need_more_worker(struct worker_pool *pool) 658static bool need_more_worker(struct worker_pool *pool)
@@ -659,9 +669,8 @@ static bool may_start_working(struct worker_pool *pool)
659/* Do I need to keep working? Called from currently running workers. */ 669/* Do I need to keep working? Called from currently running workers. */
660static bool keep_working(struct worker_pool *pool) 670static bool keep_working(struct worker_pool *pool)
661{ 671{
662 atomic_t *nr_running = get_pool_nr_running(pool); 672 return !list_empty(&pool->worklist) &&
663 673 atomic_read(&pool->nr_running) <= 1;
664 return !list_empty(&pool->worklist) && atomic_read(nr_running) <= 1;
665} 674}
666 675
667/* Do we need a new worker? Called from manager. */ 676/* Do we need a new worker? Called from manager. */
@@ -714,7 +723,7 @@ static struct worker *first_worker(struct worker_pool *pool)
714 * Wake up the first idle worker of @pool. 723 * Wake up the first idle worker of @pool.
715 * 724 *
716 * CONTEXT: 725 * CONTEXT:
717 * spin_lock_irq(gcwq->lock). 726 * spin_lock_irq(pool->lock).
718 */ 727 */
719static void wake_up_worker(struct worker_pool *pool) 728static void wake_up_worker(struct worker_pool *pool)
720{ 729{
@@ -739,8 +748,10 @@ void wq_worker_waking_up(struct task_struct *task, unsigned int cpu)
739{ 748{
740 struct worker *worker = kthread_data(task); 749 struct worker *worker = kthread_data(task);
741 750
742 if (!(worker->flags & WORKER_NOT_RUNNING)) 751 if (!(worker->flags & WORKER_NOT_RUNNING)) {
743 atomic_inc(get_pool_nr_running(worker->pool)); 752 WARN_ON_ONCE(worker->pool->cpu != cpu);
753 atomic_inc(&worker->pool->nr_running);
754 }
744} 755}
745 756
746/** 757/**
@@ -762,12 +773,18 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
762 unsigned int cpu) 773 unsigned int cpu)
763{ 774{
764 struct worker *worker = kthread_data(task), *to_wakeup = NULL; 775 struct worker *worker = kthread_data(task), *to_wakeup = NULL;
765 struct worker_pool *pool = worker->pool; 776 struct worker_pool *pool;
766 atomic_t *nr_running = get_pool_nr_running(pool);
767 777
778 /*
779 * Rescuers, which may not have all the fields set up like normal
780 * workers, also reach here, let's not access anything before
781 * checking NOT_RUNNING.
782 */
768 if (worker->flags & WORKER_NOT_RUNNING) 783 if (worker->flags & WORKER_NOT_RUNNING)
769 return NULL; 784 return NULL;
770 785
786 pool = worker->pool;
787
771 /* this can only happen on the local cpu */ 788 /* this can only happen on the local cpu */
772 BUG_ON(cpu != raw_smp_processor_id()); 789 BUG_ON(cpu != raw_smp_processor_id());
773 790
@@ -779,10 +796,11 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
779 * NOT_RUNNING is clear. This means that we're bound to and 796 * NOT_RUNNING is clear. This means that we're bound to and
780 * running on the local cpu w/ rq lock held and preemption 797 * running on the local cpu w/ rq lock held and preemption
781 * disabled, which in turn means that none else could be 798 * disabled, which in turn means that none else could be
782 * manipulating idle_list, so dereferencing idle_list without gcwq 799 * manipulating idle_list, so dereferencing idle_list without pool
783 * lock is safe. 800 * lock is safe.
784 */ 801 */
785 if (atomic_dec_and_test(nr_running) && !list_empty(&pool->worklist)) 802 if (atomic_dec_and_test(&pool->nr_running) &&
803 !list_empty(&pool->worklist))
786 to_wakeup = first_worker(pool); 804 to_wakeup = first_worker(pool);
787 return to_wakeup ? to_wakeup->task : NULL; 805 return to_wakeup ? to_wakeup->task : NULL;
788} 806}
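
wq_worker_sleeping() is one half of the concurrency-management protocol: nr_running now lives inside the pool, and when the last running worker blocks while work is still pending, the first idle worker is woken. A rough userspace model of that rule, using C11 atomics as a stand-in for the kernel's atomic_t and plain integers for the lists:

#include <stdio.h>
#include <stdbool.h>
#include <stdatomic.h>

struct worker_pool {
        atomic_int nr_running;          /* runnable, non-NOT_RUNNING workers */
        int        worklist_len;        /* stand-in for !list_empty(&worklist) */
        int        idle_workers;
};

/* mirrors the decision in wq_worker_sleeping(): wake someone iff we were
 * the last running worker and work is still pending */
static bool sleeping_worker_needs_wakeup(struct worker_pool *pool)
{
        bool was_last = atomic_fetch_sub(&pool->nr_running, 1) == 1;

        return was_last && pool->worklist_len > 0 && pool->idle_workers > 0;
}

int main(void)
{
        struct worker_pool pool = { .worklist_len = 3, .idle_workers = 2 };

        atomic_store(&pool.nr_running, 1);
        printf("wake another worker: %s\n",
               sleeping_worker_needs_wakeup(&pool) ? "yes" : "no");
        return 0;
}

The early NOT_RUNNING check added in the hunk above matters because rescuer threads reach this hook without a fully initialized worker->pool, so nothing beyond the flags may be touched before that test.
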
@@ -798,7 +816,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
798 * woken up. 816 * woken up.
799 * 817 *
800 * CONTEXT: 818 * CONTEXT:
801 * spin_lock_irq(gcwq->lock) 819 * spin_lock_irq(pool->lock)
802 */ 820 */
803static inline void worker_set_flags(struct worker *worker, unsigned int flags, 821static inline void worker_set_flags(struct worker *worker, unsigned int flags,
804 bool wakeup) 822 bool wakeup)
@@ -814,14 +832,12 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags,
814 */ 832 */
815 if ((flags & WORKER_NOT_RUNNING) && 833 if ((flags & WORKER_NOT_RUNNING) &&
816 !(worker->flags & WORKER_NOT_RUNNING)) { 834 !(worker->flags & WORKER_NOT_RUNNING)) {
817 atomic_t *nr_running = get_pool_nr_running(pool);
818
819 if (wakeup) { 835 if (wakeup) {
820 if (atomic_dec_and_test(nr_running) && 836 if (atomic_dec_and_test(&pool->nr_running) &&
821 !list_empty(&pool->worklist)) 837 !list_empty(&pool->worklist))
822 wake_up_worker(pool); 838 wake_up_worker(pool);
823 } else 839 } else
824 atomic_dec(nr_running); 840 atomic_dec(&pool->nr_running);
825 } 841 }
826 842
827 worker->flags |= flags; 843 worker->flags |= flags;
@@ -835,7 +851,7 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags,
835 * Clear @flags in @worker->flags and adjust nr_running accordingly. 851 * Clear @flags in @worker->flags and adjust nr_running accordingly.
836 * 852 *
837 * CONTEXT: 853 * CONTEXT:
838 * spin_lock_irq(gcwq->lock) 854 * spin_lock_irq(pool->lock)
839 */ 855 */
840static inline void worker_clr_flags(struct worker *worker, unsigned int flags) 856static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
841{ 857{
@@ -853,87 +869,55 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
853 */ 869 */
854 if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) 870 if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
855 if (!(worker->flags & WORKER_NOT_RUNNING)) 871 if (!(worker->flags & WORKER_NOT_RUNNING))
856 atomic_inc(get_pool_nr_running(pool)); 872 atomic_inc(&pool->nr_running);
857} 873}
858 874
859/** 875/**
860 * busy_worker_head - return the busy hash head for a work 876 * find_worker_executing_work - find worker which is executing a work
861 * @gcwq: gcwq of interest 877 * @pool: pool of interest
862 * @work: work to be hashed
863 *
864 * Return hash head of @gcwq for @work.
865 *
866 * CONTEXT:
867 * spin_lock_irq(gcwq->lock).
868 *
869 * RETURNS:
870 * Pointer to the hash head.
871 */
872static struct hlist_head *busy_worker_head(struct global_cwq *gcwq,
873 struct work_struct *work)
874{
875 const int base_shift = ilog2(sizeof(struct work_struct));
876 unsigned long v = (unsigned long)work;
877
878 /* simple shift and fold hash, do we need something better? */
879 v >>= base_shift;
880 v += v >> BUSY_WORKER_HASH_ORDER;
881 v &= BUSY_WORKER_HASH_MASK;
882
883 return &gcwq->busy_hash[v];
884}
885
886/**
887 * __find_worker_executing_work - find worker which is executing a work
888 * @gcwq: gcwq of interest
889 * @bwh: hash head as returned by busy_worker_head()
890 * @work: work to find worker for 878 * @work: work to find worker for
891 * 879 *
892 * Find a worker which is executing @work on @gcwq. @bwh should be 880 * Find a worker which is executing @work on @pool by searching
893 * the hash head obtained by calling busy_worker_head() with the same 881 * @pool->busy_hash which is keyed by the address of @work. For a worker
894 * work. 882 * to match, its current execution should match the address of @work and
883 * its work function. This is to avoid unwanted dependency between
884 * unrelated work executions through a work item being recycled while still
885 * being executed.
886 *
887 * This is a bit tricky. A work item may be freed once its execution
888 * starts and nothing prevents the freed area from being recycled for
889 * another work item. If the same work item address ends up being reused
890 * before the original execution finishes, workqueue will identify the
891 * recycled work item as currently executing and make it wait until the
892 * current execution finishes, introducing an unwanted dependency.
893 *
894 * This function checks the work item address, work function and workqueue
895 * to avoid false positives. Note that this isn't complete as one may
896 * construct a work function which can introduce dependency onto itself
897 * through a recycled work item. Well, if somebody wants to shoot oneself
898 * in the foot that badly, there's only so much we can do, and if such
899 * deadlock actually occurs, it should be easy to locate the culprit work
900 * function.
895 * 901 *
896 * CONTEXT: 902 * CONTEXT:
897 * spin_lock_irq(gcwq->lock). 903 * spin_lock_irq(pool->lock).
898 * 904 *
899 * RETURNS: 905 * RETURNS:
900 * Pointer to worker which is executing @work if found, NULL 906 * Pointer to worker which is executing @work if found, NULL
901 * otherwise. 907 * otherwise.
902 */ 908 */
903static struct worker *__find_worker_executing_work(struct global_cwq *gcwq, 909static struct worker *find_worker_executing_work(struct worker_pool *pool,
904 struct hlist_head *bwh, 910 struct work_struct *work)
905 struct work_struct *work)
906{ 911{
907 struct worker *worker; 912 struct worker *worker;
908 struct hlist_node *tmp;
909 913
910 hlist_for_each_entry(worker, tmp, bwh, hentry) 914 hash_for_each_possible(pool->busy_hash, worker, hentry,
911 if (worker->current_work == work) 915 (unsigned long)work)
916 if (worker->current_work == work &&
917 worker->current_func == work->func)
912 return worker; 918 return worker;
913 return NULL;
914}
915 919
916/** 920 return NULL;
917 * find_worker_executing_work - find worker which is executing a work
918 * @gcwq: gcwq of interest
919 * @work: work to find worker for
920 *
921 * Find a worker which is executing @work on @gcwq. This function is
922 * identical to __find_worker_executing_work() except that this
923 * function calculates @bwh itself.
924 *
925 * CONTEXT:
926 * spin_lock_irq(gcwq->lock).
927 *
928 * RETURNS:
929 * Pointer to worker which is executing @work if found, NULL
930 * otherwise.
931 */
932static struct worker *find_worker_executing_work(struct global_cwq *gcwq,
933 struct work_struct *work)
934{
935 return __find_worker_executing_work(gcwq, busy_worker_head(gcwq, work),
936 work);
937} 921}
938 922
939/** 923/**
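
find_worker_executing_work() now searches a pool-local hash keyed by the work item's address and, new in this series, also compares the worker's current_func, so a freed-and-recycled work_struct at the same address is not misidentified as the one still running. The sketch below models that lookup with a tiny chained hash table; hash_ptr() and the structures are simplified stand-ins, not the kernel's hashtable.h API.

#include <stdio.h>
#include <stdint.h>

#define HASH_BITS 6
#define HASH_SIZE (1 << HASH_BITS)

typedef void (*work_func_t)(void *);

struct worker {
        void          *current_work;
        work_func_t    current_func;
        struct worker *next;                    /* hash chain */
};

static struct worker *busy_hash[HASH_SIZE];

static unsigned int hash_ptr(const void *p)
{
        return ((uintptr_t)p >> 4) & (HASH_SIZE - 1);   /* toy hash */
}

static struct worker *find_worker_executing_work(void *work, work_func_t fn)
{
        for (struct worker *w = busy_hash[hash_ptr(work)]; w; w = w->next)
                if (w->current_work == work && w->current_func == fn)
                        return w;
        return NULL;
}

static void fn_a(void *arg) { (void)arg; }
static void fn_b(void *arg) { (void)arg; }

int main(void)
{
        static int work;                        /* stands in for a work_struct */
        static struct worker w = { .current_work = &work, .current_func = fn_a };

        busy_hash[hash_ptr(&work)] = &w;

        printf("same func: %p, recycled item: %p\n",
               (void *)find_worker_executing_work(&work, fn_a),
               (void *)find_worker_executing_work(&work, fn_b));
        return 0;
}

In the example, the "recycled" item reuses the address but carries a different function, so the second lookup correctly returns NULL and no false execution dependency is created.
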
@@ -951,7 +935,7 @@ static struct worker *find_worker_executing_work(struct global_cwq *gcwq,
951 * nested inside outer list_for_each_entry_safe(). 935 * nested inside outer list_for_each_entry_safe().
952 * 936 *
953 * CONTEXT: 937 * CONTEXT:
954 * spin_lock_irq(gcwq->lock). 938 * spin_lock_irq(pool->lock).
955 */ 939 */
956static void move_linked_works(struct work_struct *work, struct list_head *head, 940static void move_linked_works(struct work_struct *work, struct list_head *head,
957 struct work_struct **nextp) 941 struct work_struct **nextp)
@@ -977,67 +961,67 @@ static void move_linked_works(struct work_struct *work, struct list_head *head,
977 *nextp = n; 961 *nextp = n;
978} 962}
979 963
980static void cwq_activate_delayed_work(struct work_struct *work) 964static void pwq_activate_delayed_work(struct work_struct *work)
981{ 965{
982 struct cpu_workqueue_struct *cwq = get_work_cwq(work); 966 struct pool_workqueue *pwq = get_work_pwq(work);
983 967
984 trace_workqueue_activate_work(work); 968 trace_workqueue_activate_work(work);
985 move_linked_works(work, &cwq->pool->worklist, NULL); 969 move_linked_works(work, &pwq->pool->worklist, NULL);
986 __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); 970 __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
987 cwq->nr_active++; 971 pwq->nr_active++;
988} 972}
989 973
990static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq) 974static void pwq_activate_first_delayed(struct pool_workqueue *pwq)
991{ 975{
992 struct work_struct *work = list_first_entry(&cwq->delayed_works, 976 struct work_struct *work = list_first_entry(&pwq->delayed_works,
993 struct work_struct, entry); 977 struct work_struct, entry);
994 978
995 cwq_activate_delayed_work(work); 979 pwq_activate_delayed_work(work);
996} 980}
997 981
998/** 982/**
999 * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight 983 * pwq_dec_nr_in_flight - decrement pwq's nr_in_flight
1000 * @cwq: cwq of interest 984 * @pwq: pwq of interest
1001 * @color: color of work which left the queue 985 * @color: color of work which left the queue
1002 * 986 *
1003 * A work either has completed or is removed from pending queue, 987 * A work either has completed or is removed from pending queue,
1004 * decrement nr_in_flight of its cwq and handle workqueue flushing. 988 * decrement nr_in_flight of its pwq and handle workqueue flushing.
1005 * 989 *
1006 * CONTEXT: 990 * CONTEXT:
1007 * spin_lock_irq(gcwq->lock). 991 * spin_lock_irq(pool->lock).
1008 */ 992 */
1009static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color) 993static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color)
1010{ 994{
1011 /* ignore uncolored works */ 995 /* ignore uncolored works */
1012 if (color == WORK_NO_COLOR) 996 if (color == WORK_NO_COLOR)
1013 return; 997 return;
1014 998
1015 cwq->nr_in_flight[color]--; 999 pwq->nr_in_flight[color]--;
1016 1000
1017 cwq->nr_active--; 1001 pwq->nr_active--;
1018 if (!list_empty(&cwq->delayed_works)) { 1002 if (!list_empty(&pwq->delayed_works)) {
1019 /* one down, submit a delayed one */ 1003 /* one down, submit a delayed one */
1020 if (cwq->nr_active < cwq->max_active) 1004 if (pwq->nr_active < pwq->max_active)
1021 cwq_activate_first_delayed(cwq); 1005 pwq_activate_first_delayed(pwq);
1022 } 1006 }
1023 1007
1024 /* is flush in progress and are we at the flushing tip? */ 1008 /* is flush in progress and are we at the flushing tip? */
1025 if (likely(cwq->flush_color != color)) 1009 if (likely(pwq->flush_color != color))
1026 return; 1010 return;
1027 1011
1028 /* are there still in-flight works? */ 1012 /* are there still in-flight works? */
1029 if (cwq->nr_in_flight[color]) 1013 if (pwq->nr_in_flight[color])
1030 return; 1014 return;
1031 1015
1032 /* this cwq is done, clear flush_color */ 1016 /* this pwq is done, clear flush_color */
1033 cwq->flush_color = -1; 1017 pwq->flush_color = -1;
1034 1018
1035 /* 1019 /*
1036 * If this was the last cwq, wake up the first flusher. It 1020 * If this was the last pwq, wake up the first flusher. It
1037 * will handle the rest. 1021 * will handle the rest.
1038 */ 1022 */
1039 if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush)) 1023 if (atomic_dec_and_test(&pwq->wq->nr_pwqs_to_flush))
1040 complete(&cwq->wq->first_flusher->done); 1024 complete(&pwq->wq->first_flusher->done);
1041} 1025}
1042 1026
1043/** 1027/**
@@ -1068,7 +1052,8 @@ static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color)
1068static int try_to_grab_pending(struct work_struct *work, bool is_dwork, 1052static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
1069 unsigned long *flags) 1053 unsigned long *flags)
1070{ 1054{
1071 struct global_cwq *gcwq; 1055 struct worker_pool *pool;
1056 struct pool_workqueue *pwq;
1072 1057
1073 local_irq_save(*flags); 1058 local_irq_save(*flags);
1074 1059
@@ -1093,41 +1078,43 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
1093 * The queueing is in progress, or it is already queued. Try to 1078 * The queueing is in progress, or it is already queued. Try to
1094 * steal it from ->worklist without clearing WORK_STRUCT_PENDING. 1079 * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
1095 */ 1080 */
1096 gcwq = get_work_gcwq(work); 1081 pool = get_work_pool(work);
1097 if (!gcwq) 1082 if (!pool)
1098 goto fail; 1083 goto fail;
1099 1084
1100 spin_lock(&gcwq->lock); 1085 spin_lock(&pool->lock);
1101 if (!list_empty(&work->entry)) { 1086 /*
1087 * work->data is guaranteed to point to pwq only while the work
1088 * item is queued on pwq->wq, and both updating work->data to point
1089 * to pwq on queueing and to pool on dequeueing are done under
1090 * pwq->pool->lock. This in turn guarantees that, if work->data
1091 * points to pwq which is associated with a locked pool, the work
1092 * item is currently queued on that pool.
1093 */
1094 pwq = get_work_pwq(work);
1095 if (pwq && pwq->pool == pool) {
1096 debug_work_deactivate(work);
1097
1102 /* 1098 /*
1103 * This work is queued, but perhaps we locked the wrong gcwq. 1099 * A delayed work item cannot be grabbed directly because
1104 * In that case we must see the new value after rmb(), see 1100 * it might have linked NO_COLOR work items which, if left
1105 * insert_work()->wmb(). 1101 * on the delayed_list, will confuse pwq->nr_active
1102 * management later on and cause stall. Make sure the work
1103 * item is activated before grabbing.
1106 */ 1104 */
1107 smp_rmb(); 1105 if (*work_data_bits(work) & WORK_STRUCT_DELAYED)
1108 if (gcwq == get_work_gcwq(work)) { 1106 pwq_activate_delayed_work(work);
1109 debug_work_deactivate(work);
1110 1107
1111 /* 1108 list_del_init(&work->entry);
1112 * A delayed work item cannot be grabbed directly 1109 pwq_dec_nr_in_flight(get_work_pwq(work), get_work_color(work));
1113 * because it might have linked NO_COLOR work items
1114 * which, if left on the delayed_list, will confuse
1115 * cwq->nr_active management later on and cause
1116 * stall. Make sure the work item is activated
1117 * before grabbing.
1118 */
1119 if (*work_data_bits(work) & WORK_STRUCT_DELAYED)
1120 cwq_activate_delayed_work(work);
1121 1110
1122 list_del_init(&work->entry); 1111 /* work->data points to pwq iff queued, point to pool */
1123 cwq_dec_nr_in_flight(get_work_cwq(work), 1112 set_work_pool_and_keep_pending(work, pool->id);
1124 get_work_color(work));
1125 1113
1126 spin_unlock(&gcwq->lock); 1114 spin_unlock(&pool->lock);
1127 return 1; 1115 return 1;
1128 }
1129 } 1116 }
1130 spin_unlock(&gcwq->lock); 1117 spin_unlock(&pool->lock);
1131fail: 1118fail:
1132 local_irq_restore(*flags); 1119 local_irq_restore(*flags);
1133 if (work_is_canceling(work)) 1120 if (work_is_canceling(work))
@@ -1137,33 +1124,25 @@ fail:
1137} 1124}
1138 1125
1139/** 1126/**
1140 * insert_work - insert a work into gcwq 1127 * insert_work - insert a work into a pool
1141 * @cwq: cwq @work belongs to 1128 * @pwq: pwq @work belongs to
1142 * @work: work to insert 1129 * @work: work to insert
1143 * @head: insertion point 1130 * @head: insertion point
1144 * @extra_flags: extra WORK_STRUCT_* flags to set 1131 * @extra_flags: extra WORK_STRUCT_* flags to set
1145 * 1132 *
1146 * Insert @work which belongs to @cwq into @gcwq after @head. 1133 * Insert @work which belongs to @pwq after @head. @extra_flags is or'd to
1147 * @extra_flags is or'd to work_struct flags. 1134 * work_struct flags.
1148 * 1135 *
1149 * CONTEXT: 1136 * CONTEXT:
1150 * spin_lock_irq(gcwq->lock). 1137 * spin_lock_irq(pool->lock).
1151 */ 1138 */
1152static void insert_work(struct cpu_workqueue_struct *cwq, 1139static void insert_work(struct pool_workqueue *pwq, struct work_struct *work,
1153 struct work_struct *work, struct list_head *head, 1140 struct list_head *head, unsigned int extra_flags)
1154 unsigned int extra_flags)
1155{ 1141{
1156 struct worker_pool *pool = cwq->pool; 1142 struct worker_pool *pool = pwq->pool;
1157 1143
1158 /* we own @work, set data and link */ 1144 /* we own @work, set data and link */
1159 set_work_cwq(work, cwq, extra_flags); 1145 set_work_pwq(work, pwq, extra_flags);
1160
1161 /*
1162 * Ensure that we get the right work->data if we see the
1163 * result of list_add() below, see try_to_grab_pending().
1164 */
1165 smp_wmb();
1166
1167 list_add_tail(&work->entry, head); 1146 list_add_tail(&work->entry, head);
1168 1147
1169 /* 1148 /*
@@ -1179,41 +1158,24 @@ static void insert_work(struct cpu_workqueue_struct *cwq,
1179 1158
1180/* 1159/*
1181 * Test whether @work is being queued from another work executing on the 1160 * Test whether @work is being queued from another work executing on the
1182 * same workqueue. This is rather expensive and should only be used from 1161 * same workqueue.
1183 * cold paths.
1184 */ 1162 */
1185static bool is_chained_work(struct workqueue_struct *wq) 1163static bool is_chained_work(struct workqueue_struct *wq)
1186{ 1164{
1187 unsigned long flags; 1165 struct worker *worker;
1188 unsigned int cpu;
1189
1190 for_each_gcwq_cpu(cpu) {
1191 struct global_cwq *gcwq = get_gcwq(cpu);
1192 struct worker *worker;
1193 struct hlist_node *pos;
1194 int i;
1195 1166
1196 spin_lock_irqsave(&gcwq->lock, flags); 1167 worker = current_wq_worker();
1197 for_each_busy_worker(worker, i, pos, gcwq) { 1168 /*
 1198 if (worker->task != current)	 1169 * Return %true iff I'm a worker executing a work item on @wq. If
1199 continue; 1170 * I'm @worker, it's safe to dereference it without locking.
1200 spin_unlock_irqrestore(&gcwq->lock, flags); 1171 */
1201 /* 1172 return worker && worker->current_pwq->wq == wq;
1202 * I'm @worker, no locking necessary. See if @work
1203 * is headed to the same workqueue.
1204 */
1205 return worker->current_cwq->wq == wq;
1206 }
1207 spin_unlock_irqrestore(&gcwq->lock, flags);
1208 }
1209 return false;
1210} 1173}
1211 1174
1212static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, 1175static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
1213 struct work_struct *work) 1176 struct work_struct *work)
1214{ 1177{
1215 struct global_cwq *gcwq; 1178 struct pool_workqueue *pwq;
1216 struct cpu_workqueue_struct *cwq;
1217 struct list_head *worklist; 1179 struct list_head *worklist;
1218 unsigned int work_flags; 1180 unsigned int work_flags;
1219 unsigned int req_cpu = cpu; 1181 unsigned int req_cpu = cpu;
@@ -1233,9 +1195,9 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
1233 WARN_ON_ONCE(!is_chained_work(wq))) 1195 WARN_ON_ONCE(!is_chained_work(wq)))
1234 return; 1196 return;
1235 1197
1236 /* determine gcwq to use */ 1198 /* determine the pwq to use */
1237 if (!(wq->flags & WQ_UNBOUND)) { 1199 if (!(wq->flags & WQ_UNBOUND)) {
1238 struct global_cwq *last_gcwq; 1200 struct worker_pool *last_pool;
1239 1201
1240 if (cpu == WORK_CPU_UNBOUND) 1202 if (cpu == WORK_CPU_UNBOUND)
1241 cpu = raw_smp_processor_id(); 1203 cpu = raw_smp_processor_id();
@@ -1246,55 +1208,54 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
1246 * work needs to be queued on that cpu to guarantee 1208 * work needs to be queued on that cpu to guarantee
1247 * non-reentrancy. 1209 * non-reentrancy.
1248 */ 1210 */
1249 gcwq = get_gcwq(cpu); 1211 pwq = get_pwq(cpu, wq);
1250 last_gcwq = get_work_gcwq(work); 1212 last_pool = get_work_pool(work);
1251 1213
1252 if (last_gcwq && last_gcwq != gcwq) { 1214 if (last_pool && last_pool != pwq->pool) {
1253 struct worker *worker; 1215 struct worker *worker;
1254 1216
1255 spin_lock(&last_gcwq->lock); 1217 spin_lock(&last_pool->lock);
1256 1218
1257 worker = find_worker_executing_work(last_gcwq, work); 1219 worker = find_worker_executing_work(last_pool, work);
1258 1220
1259 if (worker && worker->current_cwq->wq == wq) 1221 if (worker && worker->current_pwq->wq == wq) {
1260 gcwq = last_gcwq; 1222 pwq = get_pwq(last_pool->cpu, wq);
1261 else { 1223 } else {
1262 /* meh... not running there, queue here */ 1224 /* meh... not running there, queue here */
1263 spin_unlock(&last_gcwq->lock); 1225 spin_unlock(&last_pool->lock);
1264 spin_lock(&gcwq->lock); 1226 spin_lock(&pwq->pool->lock);
1265 } 1227 }
1266 } else { 1228 } else {
1267 spin_lock(&gcwq->lock); 1229 spin_lock(&pwq->pool->lock);
1268 } 1230 }
1269 } else { 1231 } else {
1270 gcwq = get_gcwq(WORK_CPU_UNBOUND); 1232 pwq = get_pwq(WORK_CPU_UNBOUND, wq);
1271 spin_lock(&gcwq->lock); 1233 spin_lock(&pwq->pool->lock);
1272 } 1234 }
1273 1235
1274 /* gcwq determined, get cwq and queue */ 1236 /* pwq determined, queue */
1275 cwq = get_cwq(gcwq->cpu, wq); 1237 trace_workqueue_queue_work(req_cpu, pwq, work);
1276 trace_workqueue_queue_work(req_cpu, cwq, work);
1277 1238
1278 if (WARN_ON(!list_empty(&work->entry))) { 1239 if (WARN_ON(!list_empty(&work->entry))) {
1279 spin_unlock(&gcwq->lock); 1240 spin_unlock(&pwq->pool->lock);
1280 return; 1241 return;
1281 } 1242 }
1282 1243
1283 cwq->nr_in_flight[cwq->work_color]++; 1244 pwq->nr_in_flight[pwq->work_color]++;
1284 work_flags = work_color_to_flags(cwq->work_color); 1245 work_flags = work_color_to_flags(pwq->work_color);
1285 1246
1286 if (likely(cwq->nr_active < cwq->max_active)) { 1247 if (likely(pwq->nr_active < pwq->max_active)) {
1287 trace_workqueue_activate_work(work); 1248 trace_workqueue_activate_work(work);
1288 cwq->nr_active++; 1249 pwq->nr_active++;
1289 worklist = &cwq->pool->worklist; 1250 worklist = &pwq->pool->worklist;
1290 } else { 1251 } else {
1291 work_flags |= WORK_STRUCT_DELAYED; 1252 work_flags |= WORK_STRUCT_DELAYED;
1292 worklist = &cwq->delayed_works; 1253 worklist = &pwq->delayed_works;
1293 } 1254 }
1294 1255
1295 insert_work(cwq, work, worklist, work_flags); 1256 insert_work(pwq, work, worklist, work_flags);
1296 1257
1297 spin_unlock(&gcwq->lock); 1258 spin_unlock(&pwq->pool->lock);
1298} 1259}
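
The tail of __queue_work() above throttles each pwq: while nr_active < max_active a new item goes straight onto the pool's worklist, otherwise it is parked on pwq->delayed_works with WORK_STRUCT_DELAYED set, to be activated when an active slot frees up. The sketch below is a single-threaded toy model of just that gate; the names and fixed-size arrays are invented stand-ins for the kernel's lists.

#include <stdio.h>

#define TOY_MAX_ACTIVE 2

struct toy_pwq {
	int nr_active;
	int max_active;
	int active[16], nr_queued_active;	/* stand-in for pool->worklist */
	int delayed[16], nr_delayed;		/* stand-in for pwq->delayed_works */
};

static void toy_queue(struct toy_pwq *pwq, int work_id)
{
	if (pwq->nr_active < pwq->max_active) {
		pwq->nr_active++;
		pwq->active[pwq->nr_queued_active++] = work_id;
		printf("work %d -> active list\n", work_id);
	} else {
		pwq->delayed[pwq->nr_delayed++] = work_id;
		printf("work %d -> delayed list (max_active reached)\n", work_id);
	}
}

/* called when a work item finishes: activate one delayed item, if any */
static void toy_work_done(struct toy_pwq *pwq)
{
	pwq->nr_active--;
	if (pwq->nr_delayed) {
		int work_id = pwq->delayed[--pwq->nr_delayed];

		pwq->nr_active++;
		pwq->active[pwq->nr_queued_active++] = work_id;
		printf("work %d activated from delayed list\n", work_id);
	}
}

int main(void)
{
	struct toy_pwq pwq = { .max_active = TOY_MAX_ACTIVE };

	for (int i = 1; i <= 4; i++)
		toy_queue(&pwq, i);
	toy_work_done(&pwq);	/* finishing one active work pulls in a delayed one */
	return 0;
}
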
1299 1260
1300/** 1261/**
@@ -1345,51 +1306,37 @@ EXPORT_SYMBOL_GPL(queue_work);
1345void delayed_work_timer_fn(unsigned long __data) 1306void delayed_work_timer_fn(unsigned long __data)
1346{ 1307{
1347 struct delayed_work *dwork = (struct delayed_work *)__data; 1308 struct delayed_work *dwork = (struct delayed_work *)__data;
1348 struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work);
1349 1309
1350 /* should have been called from irqsafe timer with irq already off */ 1310 /* should have been called from irqsafe timer with irq already off */
1351 __queue_work(dwork->cpu, cwq->wq, &dwork->work); 1311 __queue_work(dwork->cpu, dwork->wq, &dwork->work);
1352} 1312}
1353EXPORT_SYMBOL_GPL(delayed_work_timer_fn); 1313EXPORT_SYMBOL(delayed_work_timer_fn);
1354 1314
1355static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, 1315static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
1356 struct delayed_work *dwork, unsigned long delay) 1316 struct delayed_work *dwork, unsigned long delay)
1357{ 1317{
1358 struct timer_list *timer = &dwork->timer; 1318 struct timer_list *timer = &dwork->timer;
1359 struct work_struct *work = &dwork->work; 1319 struct work_struct *work = &dwork->work;
1360 unsigned int lcpu;
1361 1320
1362 WARN_ON_ONCE(timer->function != delayed_work_timer_fn || 1321 WARN_ON_ONCE(timer->function != delayed_work_timer_fn ||
1363 timer->data != (unsigned long)dwork); 1322 timer->data != (unsigned long)dwork);
1364 BUG_ON(timer_pending(timer)); 1323 WARN_ON_ONCE(timer_pending(timer));
1365 BUG_ON(!list_empty(&work->entry)); 1324 WARN_ON_ONCE(!list_empty(&work->entry));
1366
1367 timer_stats_timer_set_start_info(&dwork->timer);
1368 1325
1369 /* 1326 /*
1370 * This stores cwq for the moment, for the timer_fn. Note that the 1327 * If @delay is 0, queue @dwork->work immediately. This is for
1371 * work's gcwq is preserved to allow reentrance detection for 1328 * both optimization and correctness. The earliest @timer can
1372 * delayed works. 1329 * expire is on the closest next tick and delayed_work users depend
1330 * on that there's no such delay when @delay is 0.
1373 */ 1331 */
1374 if (!(wq->flags & WQ_UNBOUND)) { 1332 if (!delay) {
1375 struct global_cwq *gcwq = get_work_gcwq(work); 1333 __queue_work(cpu, wq, &dwork->work);
1376 1334 return;
1377 /*
1378 * If we cannot get the last gcwq from @work directly,
1379 * select the last CPU such that it avoids unnecessarily
1380 * triggering non-reentrancy check in __queue_work().
1381 */
1382 lcpu = cpu;
1383 if (gcwq)
1384 lcpu = gcwq->cpu;
1385 if (lcpu == WORK_CPU_UNBOUND)
1386 lcpu = raw_smp_processor_id();
1387 } else {
1388 lcpu = WORK_CPU_UNBOUND;
1389 } 1335 }
1390 1336
1391 set_work_cwq(work, get_cwq(lcpu, wq), 0); 1337 timer_stats_timer_set_start_info(&dwork->timer);
1392 1338
1339 dwork->wq = wq;
1393 dwork->cpu = cpu; 1340 dwork->cpu = cpu;
1394 timer->expires = jiffies + delay; 1341 timer->expires = jiffies + delay;
1395 1342
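
The replacement comment explains why __queue_delayed_work() now special-cases @delay == 0: a timer cannot expire before the next tick, so only bypassing the timer really gives "queue now" semantics. That decision, modelled in a few lines of plain C (queue_now() and arm_timer() are invented stand-ins, not kernel functions):

#include <stdio.h>

typedef unsigned long ticks_t;

static ticks_t jiffies = 1000;		/* pretend current tick counter */

static void queue_now(int work_id)
{
	printf("work %d queued immediately\n", work_id);
}

static void arm_timer(int work_id, ticks_t expires)
{
	printf("work %d timer armed for tick %lu\n", work_id, expires);
}

static void queue_delayed(int work_id, ticks_t delay)
{
	/*
	 * A timer fires no earlier than the next tick, so delay == 0 must
	 * bypass the timer entirely to really mean "no delay".
	 */
	if (!delay) {
		queue_now(work_id);
		return;
	}
	arm_timer(work_id, jiffies + delay);
}

int main(void)
{
	queue_delayed(1, 0);
	queue_delayed(2, 5);
	return 0;
}
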
@@ -1417,9 +1364,6 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
1417 bool ret = false; 1364 bool ret = false;
1418 unsigned long flags; 1365 unsigned long flags;
1419 1366
1420 if (!delay)
1421 return queue_work_on(cpu, wq, &dwork->work);
1422
1423 /* read the comment in __queue_work() */ 1367 /* read the comment in __queue_work() */
1424 local_irq_save(flags); 1368 local_irq_save(flags);
1425 1369
@@ -1509,12 +1453,11 @@ EXPORT_SYMBOL_GPL(mod_delayed_work);
1509 * necessary. 1453 * necessary.
1510 * 1454 *
1511 * LOCKING: 1455 * LOCKING:
1512 * spin_lock_irq(gcwq->lock). 1456 * spin_lock_irq(pool->lock).
1513 */ 1457 */
1514static void worker_enter_idle(struct worker *worker) 1458static void worker_enter_idle(struct worker *worker)
1515{ 1459{
1516 struct worker_pool *pool = worker->pool; 1460 struct worker_pool *pool = worker->pool;
1517 struct global_cwq *gcwq = pool->gcwq;
1518 1461
1519 BUG_ON(worker->flags & WORKER_IDLE); 1462 BUG_ON(worker->flags & WORKER_IDLE);
1520 BUG_ON(!list_empty(&worker->entry) && 1463 BUG_ON(!list_empty(&worker->entry) &&
@@ -1532,14 +1475,14 @@ static void worker_enter_idle(struct worker *worker)
1532 mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT); 1475 mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
1533 1476
1534 /* 1477 /*
1535 * Sanity check nr_running. Because gcwq_unbind_fn() releases 1478 * Sanity check nr_running. Because wq_unbind_fn() releases
1536 * gcwq->lock between setting %WORKER_UNBOUND and zapping 1479 * pool->lock between setting %WORKER_UNBOUND and zapping
1537 * nr_running, the warning may trigger spuriously. Check iff 1480 * nr_running, the warning may trigger spuriously. Check iff
1538 * unbind is not in progress. 1481 * unbind is not in progress.
1539 */ 1482 */
1540 WARN_ON_ONCE(!(gcwq->flags & GCWQ_DISASSOCIATED) && 1483 WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) &&
1541 pool->nr_workers == pool->nr_idle && 1484 pool->nr_workers == pool->nr_idle &&
1542 atomic_read(get_pool_nr_running(pool))); 1485 atomic_read(&pool->nr_running));
1543} 1486}
1544 1487
1545/** 1488/**
@@ -1549,7 +1492,7 @@ static void worker_enter_idle(struct worker *worker)
1549 * @worker is leaving idle state. Update stats. 1492 * @worker is leaving idle state. Update stats.
1550 * 1493 *
1551 * LOCKING: 1494 * LOCKING:
1552 * spin_lock_irq(gcwq->lock). 1495 * spin_lock_irq(pool->lock).
1553 */ 1496 */
1554static void worker_leave_idle(struct worker *worker) 1497static void worker_leave_idle(struct worker *worker)
1555{ 1498{
@@ -1562,7 +1505,7 @@ static void worker_leave_idle(struct worker *worker)
1562} 1505}
1563 1506
1564/** 1507/**
1565 * worker_maybe_bind_and_lock - bind worker to its cpu if possible and lock gcwq 1508 * worker_maybe_bind_and_lock - bind worker to its cpu if possible and lock pool
1566 * @worker: self 1509 * @worker: self
1567 * 1510 *
1568 * Works which are scheduled while the cpu is online must at least be 1511 * Works which are scheduled while the cpu is online must at least be
@@ -1574,27 +1517,27 @@ static void worker_leave_idle(struct worker *worker)
1574 * themselves to the target cpu and may race with cpu going down or 1517 * themselves to the target cpu and may race with cpu going down or
1575 * coming online. kthread_bind() can't be used because it may put the 1518 * coming online. kthread_bind() can't be used because it may put the
1576 * worker to already dead cpu and set_cpus_allowed_ptr() can't be used 1519 * worker to already dead cpu and set_cpus_allowed_ptr() can't be used
1577 * verbatim as it's best effort and blocking and gcwq may be 1520 * verbatim as it's best effort and blocking and pool may be
1578 * [dis]associated in the meantime. 1521 * [dis]associated in the meantime.
1579 * 1522 *
1580 * This function tries set_cpus_allowed() and locks gcwq and verifies the 1523 * This function tries set_cpus_allowed() and locks pool and verifies the
1581 * binding against %GCWQ_DISASSOCIATED which is set during 1524 * binding against %POOL_DISASSOCIATED which is set during
1582 * %CPU_DOWN_PREPARE and cleared during %CPU_ONLINE, so if the worker 1525 * %CPU_DOWN_PREPARE and cleared during %CPU_ONLINE, so if the worker
1583 * enters idle state or fetches works without dropping lock, it can 1526 * enters idle state or fetches works without dropping lock, it can
1584 * guarantee the scheduling requirement described in the first paragraph. 1527 * guarantee the scheduling requirement described in the first paragraph.
1585 * 1528 *
1586 * CONTEXT: 1529 * CONTEXT:
1587 * Might sleep. Called without any lock but returns with gcwq->lock 1530 * Might sleep. Called without any lock but returns with pool->lock
1588 * held. 1531 * held.
1589 * 1532 *
1590 * RETURNS: 1533 * RETURNS:
1591 * %true if the associated gcwq is online (@worker is successfully 1534 * %true if the associated pool is online (@worker is successfully
1592 * bound), %false if offline. 1535 * bound), %false if offline.
1593 */ 1536 */
1594static bool worker_maybe_bind_and_lock(struct worker *worker) 1537static bool worker_maybe_bind_and_lock(struct worker *worker)
1595__acquires(&gcwq->lock) 1538__acquires(&pool->lock)
1596{ 1539{
1597 struct global_cwq *gcwq = worker->pool->gcwq; 1540 struct worker_pool *pool = worker->pool;
1598 struct task_struct *task = worker->task; 1541 struct task_struct *task = worker->task;
1599 1542
1600 while (true) { 1543 while (true) {
@@ -1602,19 +1545,19 @@ __acquires(&gcwq->lock)
1602 * The following call may fail, succeed or succeed 1545 * The following call may fail, succeed or succeed
1603 * without actually migrating the task to the cpu if 1546 * without actually migrating the task to the cpu if
1604 * it races with cpu hotunplug operation. Verify 1547 * it races with cpu hotunplug operation. Verify
1605 * against GCWQ_DISASSOCIATED. 1548 * against POOL_DISASSOCIATED.
1606 */ 1549 */
1607 if (!(gcwq->flags & GCWQ_DISASSOCIATED)) 1550 if (!(pool->flags & POOL_DISASSOCIATED))
1608 set_cpus_allowed_ptr(task, get_cpu_mask(gcwq->cpu)); 1551 set_cpus_allowed_ptr(task, get_cpu_mask(pool->cpu));
1609 1552
1610 spin_lock_irq(&gcwq->lock); 1553 spin_lock_irq(&pool->lock);
1611 if (gcwq->flags & GCWQ_DISASSOCIATED) 1554 if (pool->flags & POOL_DISASSOCIATED)
1612 return false; 1555 return false;
1613 if (task_cpu(task) == gcwq->cpu && 1556 if (task_cpu(task) == pool->cpu &&
1614 cpumask_equal(&current->cpus_allowed, 1557 cpumask_equal(&current->cpus_allowed,
1615 get_cpu_mask(gcwq->cpu))) 1558 get_cpu_mask(pool->cpu)))
1616 return true; 1559 return true;
1617 spin_unlock_irq(&gcwq->lock); 1560 spin_unlock_irq(&pool->lock);
1618 1561
1619 /* 1562 /*
1620 * We've raced with CPU hot[un]plug. Give it a breather 1563 * We've raced with CPU hot[un]plug. Give it a breather
@@ -1633,15 +1576,13 @@ __acquires(&gcwq->lock)
1633 */ 1576 */
1634static void idle_worker_rebind(struct worker *worker) 1577static void idle_worker_rebind(struct worker *worker)
1635{ 1578{
1636 struct global_cwq *gcwq = worker->pool->gcwq;
1637
1638 /* CPU may go down again inbetween, clear UNBOUND only on success */ 1579 /* CPU may go down again inbetween, clear UNBOUND only on success */
1639 if (worker_maybe_bind_and_lock(worker)) 1580 if (worker_maybe_bind_and_lock(worker))
1640 worker_clr_flags(worker, WORKER_UNBOUND); 1581 worker_clr_flags(worker, WORKER_UNBOUND);
1641 1582
1642 /* rebind complete, become available again */ 1583 /* rebind complete, become available again */
1643 list_add(&worker->entry, &worker->pool->idle_list); 1584 list_add(&worker->entry, &worker->pool->idle_list);
1644 spin_unlock_irq(&gcwq->lock); 1585 spin_unlock_irq(&worker->pool->lock);
1645} 1586}
1646 1587
1647/* 1588/*
@@ -1653,19 +1594,18 @@ static void idle_worker_rebind(struct worker *worker)
1653static void busy_worker_rebind_fn(struct work_struct *work) 1594static void busy_worker_rebind_fn(struct work_struct *work)
1654{ 1595{
1655 struct worker *worker = container_of(work, struct worker, rebind_work); 1596 struct worker *worker = container_of(work, struct worker, rebind_work);
1656 struct global_cwq *gcwq = worker->pool->gcwq;
1657 1597
1658 if (worker_maybe_bind_and_lock(worker)) 1598 if (worker_maybe_bind_and_lock(worker))
1659 worker_clr_flags(worker, WORKER_UNBOUND); 1599 worker_clr_flags(worker, WORKER_UNBOUND);
1660 1600
1661 spin_unlock_irq(&gcwq->lock); 1601 spin_unlock_irq(&worker->pool->lock);
1662} 1602}
1663 1603
1664/** 1604/**
1665 * rebind_workers - rebind all workers of a gcwq to the associated CPU 1605 * rebind_workers - rebind all workers of a pool to the associated CPU
1666 * @gcwq: gcwq of interest 1606 * @pool: pool of interest
1667 * 1607 *
1668 * @gcwq->cpu is coming online. Rebind all workers to the CPU. Rebinding 1608 * @pool->cpu is coming online. Rebind all workers to the CPU. Rebinding
1669 * is different for idle and busy ones. 1609 * is different for idle and busy ones.
1670 * 1610 *
1671 * Idle ones will be removed from the idle_list and woken up. They will 1611 * Idle ones will be removed from the idle_list and woken up. They will
@@ -1683,38 +1623,31 @@ static void busy_worker_rebind_fn(struct work_struct *work)
1683 * including the manager will not appear on @idle_list until rebind is 1623 * including the manager will not appear on @idle_list until rebind is
1684 * complete, making local wake-ups safe. 1624 * complete, making local wake-ups safe.
1685 */ 1625 */
1686static void rebind_workers(struct global_cwq *gcwq) 1626static void rebind_workers(struct worker_pool *pool)
1687{ 1627{
1688 struct worker_pool *pool;
1689 struct worker *worker, *n; 1628 struct worker *worker, *n;
1690 struct hlist_node *pos;
1691 int i; 1629 int i;
1692 1630
1693 lockdep_assert_held(&gcwq->lock); 1631 lockdep_assert_held(&pool->assoc_mutex);
1694 1632 lockdep_assert_held(&pool->lock);
1695 for_each_worker_pool(pool, gcwq)
1696 lockdep_assert_held(&pool->assoc_mutex);
1697 1633
1698 /* dequeue and kick idle ones */ 1634 /* dequeue and kick idle ones */
1699 for_each_worker_pool(pool, gcwq) { 1635 list_for_each_entry_safe(worker, n, &pool->idle_list, entry) {
1700 list_for_each_entry_safe(worker, n, &pool->idle_list, entry) { 1636 /*
1701 /* 1637 * idle workers should be off @pool->idle_list until rebind
1702 * idle workers should be off @pool->idle_list 1638 * is complete to avoid receiving premature local wake-ups.
1703 * until rebind is complete to avoid receiving 1639 */
1704 * premature local wake-ups. 1640 list_del_init(&worker->entry);
1705 */
1706 list_del_init(&worker->entry);
1707 1641
1708 /* 1642 /*
1709 * worker_thread() will see the above dequeuing 1643 * worker_thread() will see the above dequeuing and call
1710 * and call idle_worker_rebind(). 1644 * idle_worker_rebind().
1711 */ 1645 */
1712 wake_up_process(worker->task); 1646 wake_up_process(worker->task);
1713 }
1714 } 1647 }
1715 1648
1716 /* rebind busy workers */ 1649 /* rebind busy workers */
1717 for_each_busy_worker(worker, i, pos, gcwq) { 1650 for_each_busy_worker(worker, i, pool) {
1718 struct work_struct *rebind_work = &worker->rebind_work; 1651 struct work_struct *rebind_work = &worker->rebind_work;
1719 struct workqueue_struct *wq; 1652 struct workqueue_struct *wq;
1720 1653
@@ -1726,16 +1659,16 @@ static void rebind_workers(struct global_cwq *gcwq)
1726 1659
1727 /* 1660 /*
1728 * wq doesn't really matter but let's keep @worker->pool 1661 * wq doesn't really matter but let's keep @worker->pool
1729 * and @cwq->pool consistent for sanity. 1662 * and @pwq->pool consistent for sanity.
1730 */ 1663 */
1731 if (worker_pool_pri(worker->pool)) 1664 if (std_worker_pool_pri(worker->pool))
1732 wq = system_highpri_wq; 1665 wq = system_highpri_wq;
1733 else 1666 else
1734 wq = system_wq; 1667 wq = system_wq;
1735 1668
1736 insert_work(get_cwq(gcwq->cpu, wq), rebind_work, 1669 insert_work(get_pwq(pool->cpu, wq), rebind_work,
1737 worker->scheduled.next, 1670 worker->scheduled.next,
1738 work_color_to_flags(WORK_NO_COLOR)); 1671 work_color_to_flags(WORK_NO_COLOR));
1739 } 1672 }
1740} 1673}
1741 1674
@@ -1770,19 +1703,18 @@ static struct worker *alloc_worker(void)
1770 */ 1703 */
1771static struct worker *create_worker(struct worker_pool *pool) 1704static struct worker *create_worker(struct worker_pool *pool)
1772{ 1705{
1773 struct global_cwq *gcwq = pool->gcwq; 1706 const char *pri = std_worker_pool_pri(pool) ? "H" : "";
1774 const char *pri = worker_pool_pri(pool) ? "H" : "";
1775 struct worker *worker = NULL; 1707 struct worker *worker = NULL;
1776 int id = -1; 1708 int id = -1;
1777 1709
1778 spin_lock_irq(&gcwq->lock); 1710 spin_lock_irq(&pool->lock);
1779 while (ida_get_new(&pool->worker_ida, &id)) { 1711 while (ida_get_new(&pool->worker_ida, &id)) {
1780 spin_unlock_irq(&gcwq->lock); 1712 spin_unlock_irq(&pool->lock);
1781 if (!ida_pre_get(&pool->worker_ida, GFP_KERNEL)) 1713 if (!ida_pre_get(&pool->worker_ida, GFP_KERNEL))
1782 goto fail; 1714 goto fail;
1783 spin_lock_irq(&gcwq->lock); 1715 spin_lock_irq(&pool->lock);
1784 } 1716 }
1785 spin_unlock_irq(&gcwq->lock); 1717 spin_unlock_irq(&pool->lock);
1786 1718
1787 worker = alloc_worker(); 1719 worker = alloc_worker();
1788 if (!worker) 1720 if (!worker)
@@ -1791,30 +1723,30 @@ static struct worker *create_worker(struct worker_pool *pool)
1791 worker->pool = pool; 1723 worker->pool = pool;
1792 worker->id = id; 1724 worker->id = id;
1793 1725
1794 if (gcwq->cpu != WORK_CPU_UNBOUND) 1726 if (pool->cpu != WORK_CPU_UNBOUND)
1795 worker->task = kthread_create_on_node(worker_thread, 1727 worker->task = kthread_create_on_node(worker_thread,
1796 worker, cpu_to_node(gcwq->cpu), 1728 worker, cpu_to_node(pool->cpu),
1797 "kworker/%u:%d%s", gcwq->cpu, id, pri); 1729 "kworker/%u:%d%s", pool->cpu, id, pri);
1798 else 1730 else
1799 worker->task = kthread_create(worker_thread, worker, 1731 worker->task = kthread_create(worker_thread, worker,
1800 "kworker/u:%d%s", id, pri); 1732 "kworker/u:%d%s", id, pri);
1801 if (IS_ERR(worker->task)) 1733 if (IS_ERR(worker->task))
1802 goto fail; 1734 goto fail;
1803 1735
1804 if (worker_pool_pri(pool)) 1736 if (std_worker_pool_pri(pool))
1805 set_user_nice(worker->task, HIGHPRI_NICE_LEVEL); 1737 set_user_nice(worker->task, HIGHPRI_NICE_LEVEL);
1806 1738
1807 /* 1739 /*
1808 * Determine CPU binding of the new worker depending on 1740 * Determine CPU binding of the new worker depending on
1809 * %GCWQ_DISASSOCIATED. The caller is responsible for ensuring the 1741 * %POOL_DISASSOCIATED. The caller is responsible for ensuring the
1810 * flag remains stable across this function. See the comments 1742 * flag remains stable across this function. See the comments
1811 * above the flag definition for details. 1743 * above the flag definition for details.
1812 * 1744 *
1813 * As an unbound worker may later become a regular one if CPU comes 1745 * As an unbound worker may later become a regular one if CPU comes
1814 * online, make sure every worker has %PF_THREAD_BOUND set. 1746 * online, make sure every worker has %PF_THREAD_BOUND set.
1815 */ 1747 */
1816 if (!(gcwq->flags & GCWQ_DISASSOCIATED)) { 1748 if (!(pool->flags & POOL_DISASSOCIATED)) {
1817 kthread_bind(worker->task, gcwq->cpu); 1749 kthread_bind(worker->task, pool->cpu);
1818 } else { 1750 } else {
1819 worker->task->flags |= PF_THREAD_BOUND; 1751 worker->task->flags |= PF_THREAD_BOUND;
1820 worker->flags |= WORKER_UNBOUND; 1752 worker->flags |= WORKER_UNBOUND;
@@ -1823,9 +1755,9 @@ static struct worker *create_worker(struct worker_pool *pool)
1823 return worker; 1755 return worker;
1824fail: 1756fail:
1825 if (id >= 0) { 1757 if (id >= 0) {
1826 spin_lock_irq(&gcwq->lock); 1758 spin_lock_irq(&pool->lock);
1827 ida_remove(&pool->worker_ida, id); 1759 ida_remove(&pool->worker_ida, id);
1828 spin_unlock_irq(&gcwq->lock); 1760 spin_unlock_irq(&pool->lock);
1829 } 1761 }
1830 kfree(worker); 1762 kfree(worker);
1831 return NULL; 1763 return NULL;
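
create_worker() shows the preload-outside-the-lock pattern around ida_get_new()/ida_pre_get(): pool->lock is a spinlock and cannot be held across a GFP_KERNEL allocation, so when ID allocation fails the lock is dropped, storage is reserved, and the allocation is retried under the lock. Below is a rough userspace analogue with a pthread mutex and an invented toy_ida; it is not the kernel's ida API, just the same shape.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct toy_ida {
	int *slots;		/* slots[i] != 0 means id i is in use */
	int capacity;
};

/* cheap step, safe under the lock: succeeds only if capacity suffices */
static bool toy_ida_get_new(struct toy_ida *ida, int *id)
{
	for (int i = 0; i < ida->capacity; i++) {
		if (!ida->slots[i]) {
			ida->slots[i] = 1;
			*id = i;
			return true;
		}
	}
	return false;	/* caller must grow the pool outside the lock */
}

static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;
static struct toy_ida worker_ida;

static int alloc_worker_id(void)
{
	int id;

	pthread_mutex_lock(&pool_lock);
	while (!toy_ida_get_new(&worker_ida, &id)) {
		int newcap = worker_ida.capacity ? worker_ida.capacity * 2 : 4;
		int *spare;

		/* drop the lock before the possibly-sleeping allocation */
		pthread_mutex_unlock(&pool_lock);
		spare = calloc(newcap, sizeof(*spare));
		if (!spare)
			return -1;
		pthread_mutex_lock(&pool_lock);

		/* install the spare storage under the lock, then retry */
		if (newcap > worker_ida.capacity) {
			for (int i = 0; i < worker_ida.capacity; i++)
				spare[i] = worker_ida.slots[i];
			free(worker_ida.slots);
			worker_ida.slots = spare;
			worker_ida.capacity = newcap;
		} else {
			free(spare);	/* someone else grew it meanwhile */
		}
	}
	pthread_mutex_unlock(&pool_lock);
	return id;
}

int main(void)
{
	for (int i = 0; i < 5; i++)
		printf("worker id %d\n", alloc_worker_id());
	free(worker_ida.slots);
	return 0;
}
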
@@ -1835,10 +1767,10 @@ fail:
1835 * start_worker - start a newly created worker 1767 * start_worker - start a newly created worker
1836 * @worker: worker to start 1768 * @worker: worker to start
1837 * 1769 *
1838 * Make the gcwq aware of @worker and start it. 1770 * Make the pool aware of @worker and start it.
1839 * 1771 *
1840 * CONTEXT: 1772 * CONTEXT:
1841 * spin_lock_irq(gcwq->lock). 1773 * spin_lock_irq(pool->lock).
1842 */ 1774 */
1843static void start_worker(struct worker *worker) 1775static void start_worker(struct worker *worker)
1844{ 1776{
@@ -1852,15 +1784,14 @@ static void start_worker(struct worker *worker)
1852 * destroy_worker - destroy a workqueue worker 1784 * destroy_worker - destroy a workqueue worker
1853 * @worker: worker to be destroyed 1785 * @worker: worker to be destroyed
1854 * 1786 *
1855 * Destroy @worker and adjust @gcwq stats accordingly. 1787 * Destroy @worker and adjust @pool stats accordingly.
1856 * 1788 *
1857 * CONTEXT: 1789 * CONTEXT:
1858 * spin_lock_irq(gcwq->lock) which is released and regrabbed. 1790 * spin_lock_irq(pool->lock) which is released and regrabbed.
1859 */ 1791 */
1860static void destroy_worker(struct worker *worker) 1792static void destroy_worker(struct worker *worker)
1861{ 1793{
1862 struct worker_pool *pool = worker->pool; 1794 struct worker_pool *pool = worker->pool;
1863 struct global_cwq *gcwq = pool->gcwq;
1864 int id = worker->id; 1795 int id = worker->id;
1865 1796
1866 /* sanity check frenzy */ 1797 /* sanity check frenzy */
@@ -1875,21 +1806,20 @@ static void destroy_worker(struct worker *worker)
1875 list_del_init(&worker->entry); 1806 list_del_init(&worker->entry);
1876 worker->flags |= WORKER_DIE; 1807 worker->flags |= WORKER_DIE;
1877 1808
1878 spin_unlock_irq(&gcwq->lock); 1809 spin_unlock_irq(&pool->lock);
1879 1810
1880 kthread_stop(worker->task); 1811 kthread_stop(worker->task);
1881 kfree(worker); 1812 kfree(worker);
1882 1813
1883 spin_lock_irq(&gcwq->lock); 1814 spin_lock_irq(&pool->lock);
1884 ida_remove(&pool->worker_ida, id); 1815 ida_remove(&pool->worker_ida, id);
1885} 1816}
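
destroy_worker() drops pool->lock around kthread_stop() because stopping the worker sleeps until the worker thread exits, and the exiting worker needs pool->lock itself; joining with the lock held would deadlock. The same shape in plain pthreads, with all names invented, is sketched here.

#define _DEFAULT_SOURCE
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;
static bool worker_should_die;

static void *worker_fn(void *arg)
{
	(void)arg;
	for (;;) {
		/* the worker needs pool_lock to notice the DIE flag */
		pthread_mutex_lock(&pool_lock);
		bool die = worker_should_die;
		pthread_mutex_unlock(&pool_lock);
		if (die)
			break;
		usleep(1000);
	}
	printf("worker exiting\n");
	return NULL;
}

static void destroy_worker(pthread_t worker)
{
	pthread_mutex_lock(&pool_lock);
	worker_should_die = true;	/* analogous to setting WORKER_DIE */

	/*
	 * Drop the lock before joining: the worker must take pool_lock to
	 * see the flag and exit, so joining with the lock held would
	 * deadlock.  This mirrors dropping pool->lock around kthread_stop().
	 */
	pthread_mutex_unlock(&pool_lock);
	pthread_join(worker, NULL);

	pthread_mutex_lock(&pool_lock);
	/* ...release per-worker resources (worker id etc.) under the lock... */
	pthread_mutex_unlock(&pool_lock);
}

int main(void)
{
	pthread_t worker;

	pthread_create(&worker, NULL, worker_fn, NULL);
	usleep(10000);
	destroy_worker(worker);
	return 0;
}
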
1886 1817
1887static void idle_worker_timeout(unsigned long __pool) 1818static void idle_worker_timeout(unsigned long __pool)
1888{ 1819{
1889 struct worker_pool *pool = (void *)__pool; 1820 struct worker_pool *pool = (void *)__pool;
1890 struct global_cwq *gcwq = pool->gcwq;
1891 1821
1892 spin_lock_irq(&gcwq->lock); 1822 spin_lock_irq(&pool->lock);
1893 1823
1894 if (too_many_workers(pool)) { 1824 if (too_many_workers(pool)) {
1895 struct worker *worker; 1825 struct worker *worker;
@@ -1908,20 +1838,20 @@ static void idle_worker_timeout(unsigned long __pool)
1908 } 1838 }
1909 } 1839 }
1910 1840
1911 spin_unlock_irq(&gcwq->lock); 1841 spin_unlock_irq(&pool->lock);
1912} 1842}
1913 1843
1914static bool send_mayday(struct work_struct *work) 1844static bool send_mayday(struct work_struct *work)
1915{ 1845{
1916 struct cpu_workqueue_struct *cwq = get_work_cwq(work); 1846 struct pool_workqueue *pwq = get_work_pwq(work);
1917 struct workqueue_struct *wq = cwq->wq; 1847 struct workqueue_struct *wq = pwq->wq;
1918 unsigned int cpu; 1848 unsigned int cpu;
1919 1849
1920 if (!(wq->flags & WQ_RESCUER)) 1850 if (!(wq->flags & WQ_RESCUER))
1921 return false; 1851 return false;
1922 1852
1923 /* mayday mayday mayday */ 1853 /* mayday mayday mayday */
1924 cpu = cwq->pool->gcwq->cpu; 1854 cpu = pwq->pool->cpu;
1925 /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */ 1855 /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */
1926 if (cpu == WORK_CPU_UNBOUND) 1856 if (cpu == WORK_CPU_UNBOUND)
1927 cpu = 0; 1857 cpu = 0;
@@ -1930,13 +1860,12 @@ static bool send_mayday(struct work_struct *work)
1930 return true; 1860 return true;
1931} 1861}
1932 1862
1933static void gcwq_mayday_timeout(unsigned long __pool) 1863static void pool_mayday_timeout(unsigned long __pool)
1934{ 1864{
1935 struct worker_pool *pool = (void *)__pool; 1865 struct worker_pool *pool = (void *)__pool;
1936 struct global_cwq *gcwq = pool->gcwq;
1937 struct work_struct *work; 1866 struct work_struct *work;
1938 1867
1939 spin_lock_irq(&gcwq->lock); 1868 spin_lock_irq(&pool->lock);
1940 1869
1941 if (need_to_create_worker(pool)) { 1870 if (need_to_create_worker(pool)) {
1942 /* 1871 /*
@@ -1949,7 +1878,7 @@ static void gcwq_mayday_timeout(unsigned long __pool)
1949 send_mayday(work); 1878 send_mayday(work);
1950 } 1879 }
1951 1880
1952 spin_unlock_irq(&gcwq->lock); 1881 spin_unlock_irq(&pool->lock);
1953 1882
1954 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL); 1883 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL);
1955} 1884}
@@ -1968,24 +1897,22 @@ static void gcwq_mayday_timeout(unsigned long __pool)
1968 * may_start_working() true. 1897 * may_start_working() true.
1969 * 1898 *
1970 * LOCKING: 1899 * LOCKING:
1971 * spin_lock_irq(gcwq->lock) which may be released and regrabbed 1900 * spin_lock_irq(pool->lock) which may be released and regrabbed
1972 * multiple times. Does GFP_KERNEL allocations. Called only from 1901 * multiple times. Does GFP_KERNEL allocations. Called only from
1973 * manager. 1902 * manager.
1974 * 1903 *
1975 * RETURNS: 1904 * RETURNS:
1976 * false if no action was taken and gcwq->lock stayed locked, true 1905 * false if no action was taken and pool->lock stayed locked, true
1977 * otherwise. 1906 * otherwise.
1978 */ 1907 */
1979static bool maybe_create_worker(struct worker_pool *pool) 1908static bool maybe_create_worker(struct worker_pool *pool)
1980__releases(&gcwq->lock) 1909__releases(&pool->lock)
1981__acquires(&gcwq->lock) 1910__acquires(&pool->lock)
1982{ 1911{
1983 struct global_cwq *gcwq = pool->gcwq;
1984
1985 if (!need_to_create_worker(pool)) 1912 if (!need_to_create_worker(pool))
1986 return false; 1913 return false;
1987restart: 1914restart:
1988 spin_unlock_irq(&gcwq->lock); 1915 spin_unlock_irq(&pool->lock);
1989 1916
1990 /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */ 1917 /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */
1991 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); 1918 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
@@ -1996,7 +1923,7 @@ restart:
1996 worker = create_worker(pool); 1923 worker = create_worker(pool);
1997 if (worker) { 1924 if (worker) {
1998 del_timer_sync(&pool->mayday_timer); 1925 del_timer_sync(&pool->mayday_timer);
1999 spin_lock_irq(&gcwq->lock); 1926 spin_lock_irq(&pool->lock);
2000 start_worker(worker); 1927 start_worker(worker);
2001 BUG_ON(need_to_create_worker(pool)); 1928 BUG_ON(need_to_create_worker(pool));
2002 return true; 1929 return true;
@@ -2013,7 +1940,7 @@ restart:
2013 } 1940 }
2014 1941
2015 del_timer_sync(&pool->mayday_timer); 1942 del_timer_sync(&pool->mayday_timer);
2016 spin_lock_irq(&gcwq->lock); 1943 spin_lock_irq(&pool->lock);
2017 if (need_to_create_worker(pool)) 1944 if (need_to_create_worker(pool))
2018 goto restart; 1945 goto restart;
2019 return true; 1946 return true;
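
maybe_create_worker() keeps retrying worker creation while a mayday timer is armed: if creation doesn't succeed within MAYDAY_INITIAL_TIMEOUT the rescuers are summoned, but the loop never gives up while a worker is still needed. The userspace approximation below polls a deadline instead of using a timer; create_worker() and send_mayday() here are fakes, and the constants are invented.

#define _POSIX_C_SOURCE 200809L
#include <stdbool.h>
#include <stdio.h>
#include <time.h>

#define CREATE_COOLDOWN_MS	100	/* stand-in for CREATE_COOLDOWN */
#define MAYDAY_TIMEOUT_MS	300	/* stand-in for MAYDAY_INITIAL_TIMEOUT */

static int attempts;

/* pretend worker creation that fails a few times (e.g. memory pressure) */
static bool create_worker(void)
{
	return ++attempts >= 5;
}

static void send_mayday(void)
{
	printf("mayday: asking rescuers for help\n");
}

static long now_ms(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec * 1000L + ts.tv_nsec / 1000000L;
}

static void sleep_ms(long ms)
{
	struct timespec ts = { .tv_sec = ms / 1000,
			       .tv_nsec = (ms % 1000) * 1000000L };

	nanosleep(&ts, NULL);
}

/* keep retrying; past the deadline, summon help but don't give up */
static void maybe_create_worker(void)
{
	long deadline = now_ms() + MAYDAY_TIMEOUT_MS;
	bool mayday_sent = false;

	while (!create_worker()) {
		if (!mayday_sent && now_ms() >= deadline) {
			send_mayday();	/* the kernel does this from a timer */
			mayday_sent = true;
		}
		sleep_ms(CREATE_COOLDOWN_MS);
	}
	printf("worker created after %d attempts\n", attempts);
}

int main(void)
{
	maybe_create_worker();
	return 0;
}
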
@@ -2027,11 +1954,11 @@ restart:
2027 * IDLE_WORKER_TIMEOUT. 1954 * IDLE_WORKER_TIMEOUT.
2028 * 1955 *
2029 * LOCKING: 1956 * LOCKING:
2030 * spin_lock_irq(gcwq->lock) which may be released and regrabbed 1957 * spin_lock_irq(pool->lock) which may be released and regrabbed
2031 * multiple times. Called only from manager. 1958 * multiple times. Called only from manager.
2032 * 1959 *
2033 * RETURNS: 1960 * RETURNS:
2034 * false if no action was taken and gcwq->lock stayed locked, true 1961 * false if no action was taken and pool->lock stayed locked, true
2035 * otherwise. 1962 * otherwise.
2036 */ 1963 */
2037static bool maybe_destroy_workers(struct worker_pool *pool) 1964static bool maybe_destroy_workers(struct worker_pool *pool)
@@ -2061,21 +1988,21 @@ static bool maybe_destroy_workers(struct worker_pool *pool)
2061 * manage_workers - manage worker pool 1988 * manage_workers - manage worker pool
2062 * @worker: self 1989 * @worker: self
2063 * 1990 *
2064 * Assume the manager role and manage gcwq worker pool @worker belongs 1991 * Assume the manager role and manage the worker pool @worker belongs
2065 * to. At any given time, there can be only zero or one manager per 1992 * to. At any given time, there can be only zero or one manager per
2066 * gcwq. The exclusion is handled automatically by this function. 1993 * pool. The exclusion is handled automatically by this function.
2067 * 1994 *
2068 * The caller can safely start processing works on false return. On 1995 * The caller can safely start processing works on false return. On
2069 * true return, it's guaranteed that need_to_create_worker() is false 1996 * true return, it's guaranteed that need_to_create_worker() is false
2070 * and may_start_working() is true. 1997 * and may_start_working() is true.
2071 * 1998 *
2072 * CONTEXT: 1999 * CONTEXT:
2073 * spin_lock_irq(gcwq->lock) which may be released and regrabbed 2000 * spin_lock_irq(pool->lock) which may be released and regrabbed
2074 * multiple times. Does GFP_KERNEL allocations. 2001 * multiple times. Does GFP_KERNEL allocations.
2075 * 2002 *
2076 * RETURNS: 2003 * RETURNS:
 2077 * false if no action was taken and gcwq->lock stayed locked, true if 2004 * false if no action was taken and pool->lock stayed locked, true if
 2078 * some action was taken. 2005 * some action was taken.
2079 */ 2006 */
2080static bool manage_workers(struct worker *worker) 2007static bool manage_workers(struct worker *worker)
2081{ 2008{
@@ -2097,20 +2024,20 @@ static bool manage_workers(struct worker *worker)
2097 * manager against CPU hotplug. 2024 * manager against CPU hotplug.
2098 * 2025 *
2099 * assoc_mutex would always be free unless CPU hotplug is in 2026 * assoc_mutex would always be free unless CPU hotplug is in
2100 * progress. trylock first without dropping @gcwq->lock. 2027 * progress. trylock first without dropping @pool->lock.
2101 */ 2028 */
2102 if (unlikely(!mutex_trylock(&pool->assoc_mutex))) { 2029 if (unlikely(!mutex_trylock(&pool->assoc_mutex))) {
2103 spin_unlock_irq(&pool->gcwq->lock); 2030 spin_unlock_irq(&pool->lock);
2104 mutex_lock(&pool->assoc_mutex); 2031 mutex_lock(&pool->assoc_mutex);
2105 /* 2032 /*
2106 * CPU hotplug could have happened while we were waiting 2033 * CPU hotplug could have happened while we were waiting
2107 * for assoc_mutex. Hotplug itself can't handle us 2034 * for assoc_mutex. Hotplug itself can't handle us
2108 * because manager isn't either on idle or busy list, and 2035 * because manager isn't either on idle or busy list, and
2109 * @gcwq's state and ours could have deviated. 2036 * @pool's state and ours could have deviated.
2110 * 2037 *
2111 * As hotplug is now excluded via assoc_mutex, we can 2038 * As hotplug is now excluded via assoc_mutex, we can
2112 * simply try to bind. It will succeed or fail depending 2039 * simply try to bind. It will succeed or fail depending
2113 * on @gcwq's current state. Try it and adjust 2040 * on @pool's current state. Try it and adjust
2114 * %WORKER_UNBOUND accordingly. 2041 * %WORKER_UNBOUND accordingly.
2115 */ 2042 */
2116 if (worker_maybe_bind_and_lock(worker)) 2043 if (worker_maybe_bind_and_lock(worker))
@@ -2147,18 +2074,15 @@ static bool manage_workers(struct worker *worker)
2147 * call this function to process a work. 2074 * call this function to process a work.
2148 * 2075 *
2149 * CONTEXT: 2076 * CONTEXT:
2150 * spin_lock_irq(gcwq->lock) which is released and regrabbed. 2077 * spin_lock_irq(pool->lock) which is released and regrabbed.
2151 */ 2078 */
2152static void process_one_work(struct worker *worker, struct work_struct *work) 2079static void process_one_work(struct worker *worker, struct work_struct *work)
2153__releases(&gcwq->lock) 2080__releases(&pool->lock)
2154__acquires(&gcwq->lock) 2081__acquires(&pool->lock)
2155{ 2082{
2156 struct cpu_workqueue_struct *cwq = get_work_cwq(work); 2083 struct pool_workqueue *pwq = get_work_pwq(work);
2157 struct worker_pool *pool = worker->pool; 2084 struct worker_pool *pool = worker->pool;
2158 struct global_cwq *gcwq = pool->gcwq; 2085 bool cpu_intensive = pwq->wq->flags & WQ_CPU_INTENSIVE;
2159 struct hlist_head *bwh = busy_worker_head(gcwq, work);
2160 bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE;
2161 work_func_t f = work->func;
2162 int work_color; 2086 int work_color;
2163 struct worker *collision; 2087 struct worker *collision;
2164#ifdef CONFIG_LOCKDEP 2088#ifdef CONFIG_LOCKDEP
@@ -2176,11 +2100,11 @@ __acquires(&gcwq->lock)
2176 /* 2100 /*
2177 * Ensure we're on the correct CPU. DISASSOCIATED test is 2101 * Ensure we're on the correct CPU. DISASSOCIATED test is
2178 * necessary to avoid spurious warnings from rescuers servicing the 2102 * necessary to avoid spurious warnings from rescuers servicing the
2179 * unbound or a disassociated gcwq. 2103 * unbound or a disassociated pool.
2180 */ 2104 */
2181 WARN_ON_ONCE(!(worker->flags & WORKER_UNBOUND) && 2105 WARN_ON_ONCE(!(worker->flags & WORKER_UNBOUND) &&
2182 !(gcwq->flags & GCWQ_DISASSOCIATED) && 2106 !(pool->flags & POOL_DISASSOCIATED) &&
2183 raw_smp_processor_id() != gcwq->cpu); 2107 raw_smp_processor_id() != pool->cpu);
2184 2108
2185 /* 2109 /*
2186 * A single work shouldn't be executed concurrently by 2110 * A single work shouldn't be executed concurrently by
@@ -2188,7 +2112,7 @@ __acquires(&gcwq->lock)
2188 * already processing the work. If so, defer the work to the 2112 * already processing the work. If so, defer the work to the
2189 * currently executing one. 2113 * currently executing one.
2190 */ 2114 */
2191 collision = __find_worker_executing_work(gcwq, bwh, work); 2115 collision = find_worker_executing_work(pool, work);
2192 if (unlikely(collision)) { 2116 if (unlikely(collision)) {
2193 move_linked_works(work, &collision->scheduled, NULL); 2117 move_linked_works(work, &collision->scheduled, NULL);
2194 return; 2118 return;
@@ -2196,9 +2120,10 @@ __acquires(&gcwq->lock)
2196 2120
2197 /* claim and dequeue */ 2121 /* claim and dequeue */
2198 debug_work_deactivate(work); 2122 debug_work_deactivate(work);
2199 hlist_add_head(&worker->hentry, bwh); 2123 hash_add(pool->busy_hash, &worker->hentry, (unsigned long)work);
2200 worker->current_work = work; 2124 worker->current_work = work;
2201 worker->current_cwq = cwq; 2125 worker->current_func = work->func;
2126 worker->current_pwq = pwq;
2202 work_color = get_work_color(work); 2127 work_color = get_work_color(work);
2203 2128
2204 list_del_init(&work->entry); 2129 list_del_init(&work->entry);
@@ -2211,53 +2136,55 @@ __acquires(&gcwq->lock)
2211 worker_set_flags(worker, WORKER_CPU_INTENSIVE, true); 2136 worker_set_flags(worker, WORKER_CPU_INTENSIVE, true);
2212 2137
2213 /* 2138 /*
2214 * Unbound gcwq isn't concurrency managed and work items should be 2139 * Unbound pool isn't concurrency managed and work items should be
2215 * executed ASAP. Wake up another worker if necessary. 2140 * executed ASAP. Wake up another worker if necessary.
2216 */ 2141 */
2217 if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool)) 2142 if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool))
2218 wake_up_worker(pool); 2143 wake_up_worker(pool);
2219 2144
2220 /* 2145 /*
2221 * Record the last CPU and clear PENDING which should be the last 2146 * Record the last pool and clear PENDING which should be the last
2222 * update to @work. Also, do this inside @gcwq->lock so that 2147 * update to @work. Also, do this inside @pool->lock so that
2223 * PENDING and queued state changes happen together while IRQ is 2148 * PENDING and queued state changes happen together while IRQ is
2224 * disabled. 2149 * disabled.
2225 */ 2150 */
2226 set_work_cpu_and_clear_pending(work, gcwq->cpu); 2151 set_work_pool_and_clear_pending(work, pool->id);
2227 2152
2228 spin_unlock_irq(&gcwq->lock); 2153 spin_unlock_irq(&pool->lock);
2229 2154
2230 lock_map_acquire_read(&cwq->wq->lockdep_map); 2155 lock_map_acquire_read(&pwq->wq->lockdep_map);
2231 lock_map_acquire(&lockdep_map); 2156 lock_map_acquire(&lockdep_map);
2232 trace_workqueue_execute_start(work); 2157 trace_workqueue_execute_start(work);
2233 f(work); 2158 worker->current_func(work);
2234 /* 2159 /*
2235 * While we must be careful to not use "work" after this, the trace 2160 * While we must be careful to not use "work" after this, the trace
2236 * point will only record its address. 2161 * point will only record its address.
2237 */ 2162 */
2238 trace_workqueue_execute_end(work); 2163 trace_workqueue_execute_end(work);
2239 lock_map_release(&lockdep_map); 2164 lock_map_release(&lockdep_map);
2240 lock_map_release(&cwq->wq->lockdep_map); 2165 lock_map_release(&pwq->wq->lockdep_map);
2241 2166
2242 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { 2167 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
2243 pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n" 2168 pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n"
2244 " last function: %pf\n", 2169 " last function: %pf\n",
2245 current->comm, preempt_count(), task_pid_nr(current), f); 2170 current->comm, preempt_count(), task_pid_nr(current),
2171 worker->current_func);
2246 debug_show_held_locks(current); 2172 debug_show_held_locks(current);
2247 dump_stack(); 2173 dump_stack();
2248 } 2174 }
2249 2175
2250 spin_lock_irq(&gcwq->lock); 2176 spin_lock_irq(&pool->lock);
2251 2177
2252 /* clear cpu intensive status */ 2178 /* clear cpu intensive status */
2253 if (unlikely(cpu_intensive)) 2179 if (unlikely(cpu_intensive))
2254 worker_clr_flags(worker, WORKER_CPU_INTENSIVE); 2180 worker_clr_flags(worker, WORKER_CPU_INTENSIVE);
2255 2181
2256 /* we're done with it, release */ 2182 /* we're done with it, release */
2257 hlist_del_init(&worker->hentry); 2183 hash_del(&worker->hentry);
2258 worker->current_work = NULL; 2184 worker->current_work = NULL;
2259 worker->current_cwq = NULL; 2185 worker->current_func = NULL;
2260 cwq_dec_nr_in_flight(cwq, work_color); 2186 worker->current_pwq = NULL;
2187 pwq_dec_nr_in_flight(pwq, work_color);
2261} 2188}
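
process_one_work() now tracks busy workers in pool->busy_hash keyed by the work item's address (hash_add(..., (unsigned long)work)), so find_worker_executing_work() can spot a work item that is already running and defer it to that worker instead of executing it twice. A small userspace model of that lookup, using an invented chained hash rather than the kernel hashtable API:

#include <stdint.h>
#include <stdio.h>

#define BUSY_HASH_BITS 4
#define BUSY_HASH_SIZE (1u << BUSY_HASH_BITS)

struct toy_work {
	const char *name;
	void (*func)(struct toy_work *work);
};

struct toy_worker {
	const char *name;
	struct toy_work *current_work;
	struct toy_worker *hash_next;	/* chains workers within one bucket */
};

static struct toy_worker *busy_hash[BUSY_HASH_SIZE];

static unsigned int hash_work(struct toy_work *work)
{
	/* keyed by the work item's address, like the kernel's busy_hash */
	return ((uintptr_t)work / sizeof(void *)) & (BUSY_HASH_SIZE - 1);
}

static struct toy_worker *find_worker_executing_work(struct toy_work *work)
{
	for (struct toy_worker *w = busy_hash[hash_work(work)]; w; w = w->hash_next)
		if (w->current_work == work)
			return w;
	return NULL;
}

static void mark_busy(struct toy_worker *worker, struct toy_work *work)
{
	unsigned int b = hash_work(work);

	worker->current_work = work;
	worker->hash_next = busy_hash[b];
	busy_hash[b] = worker;
}

static void hello(struct toy_work *work)
{
	printf("executing %s\n", work->name);
}

int main(void)
{
	struct toy_work work = { .name = "demo", .func = hello };
	struct toy_worker w1 = { .name = "worker1" };
	struct toy_worker w2 = { .name = "worker2" };
	struct toy_worker *collision;

	/* worker1 claims the work and runs it */
	mark_busy(&w1, &work);
	work.func(&work);

	/* worker2 sees the collision and would defer to worker1 instead */
	collision = find_worker_executing_work(&work);
	if (collision && collision != &w2)
		printf("%s defers: %s already executes this work\n",
		       w2.name, collision->name);
	return 0;
}
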
2262 2189
2263/** 2190/**
@@ -2269,7 +2196,7 @@ __acquires(&gcwq->lock)
2269 * fetches a work from the top and executes it. 2196 * fetches a work from the top and executes it.
2270 * 2197 *
2271 * CONTEXT: 2198 * CONTEXT:
2272 * spin_lock_irq(gcwq->lock) which may be released and regrabbed 2199 * spin_lock_irq(pool->lock) which may be released and regrabbed
2273 * multiple times. 2200 * multiple times.
2274 */ 2201 */
2275static void process_scheduled_works(struct worker *worker) 2202static void process_scheduled_works(struct worker *worker)
@@ -2285,8 +2212,8 @@ static void process_scheduled_works(struct worker *worker)
2285 * worker_thread - the worker thread function 2212 * worker_thread - the worker thread function
2286 * @__worker: self 2213 * @__worker: self
2287 * 2214 *
2288 * The gcwq worker thread function. There's a single dynamic pool of 2215 * The worker thread function. There are NR_CPU_WORKER_POOLS dynamic pools
2289 * these per each cpu. These workers process all works regardless of 2216 * of these per each cpu. These workers process all works regardless of
2290 * their specific target workqueue. The only exception is works which 2217 * their specific target workqueue. The only exception is works which
2291 * belong to workqueues with a rescuer which will be explained in 2218 * belong to workqueues with a rescuer which will be explained in
2292 * rescuer_thread(). 2219 * rescuer_thread().
@@ -2295,16 +2222,15 @@ static int worker_thread(void *__worker)
2295{ 2222{
2296 struct worker *worker = __worker; 2223 struct worker *worker = __worker;
2297 struct worker_pool *pool = worker->pool; 2224 struct worker_pool *pool = worker->pool;
2298 struct global_cwq *gcwq = pool->gcwq;
2299 2225
2300 /* tell the scheduler that this is a workqueue worker */ 2226 /* tell the scheduler that this is a workqueue worker */
2301 worker->task->flags |= PF_WQ_WORKER; 2227 worker->task->flags |= PF_WQ_WORKER;
2302woke_up: 2228woke_up:
2303 spin_lock_irq(&gcwq->lock); 2229 spin_lock_irq(&pool->lock);
2304 2230
2305 /* we are off idle list if destruction or rebind is requested */ 2231 /* we are off idle list if destruction or rebind is requested */
2306 if (unlikely(list_empty(&worker->entry))) { 2232 if (unlikely(list_empty(&worker->entry))) {
2307 spin_unlock_irq(&gcwq->lock); 2233 spin_unlock_irq(&pool->lock);
2308 2234
2309 /* if DIE is set, destruction is requested */ 2235 /* if DIE is set, destruction is requested */
2310 if (worker->flags & WORKER_DIE) { 2236 if (worker->flags & WORKER_DIE) {
@@ -2363,52 +2289,61 @@ sleep:
2363 goto recheck; 2289 goto recheck;
2364 2290
2365 /* 2291 /*
2366 * gcwq->lock is held and there's no work to process and no 2292 * pool->lock is held and there's no work to process and no need to
2367 * need to manage, sleep. Workers are woken up only while 2293 * manage, sleep. Workers are woken up only while holding
2368 * holding gcwq->lock or from local cpu, so setting the 2294 * pool->lock or from local cpu, so setting the current state
2369 * current state before releasing gcwq->lock is enough to 2295 * before releasing pool->lock is enough to prevent losing any
2370 * prevent losing any event. 2296 * event.
2371 */ 2297 */
2372 worker_enter_idle(worker); 2298 worker_enter_idle(worker);
2373 __set_current_state(TASK_INTERRUPTIBLE); 2299 __set_current_state(TASK_INTERRUPTIBLE);
2374 spin_unlock_irq(&gcwq->lock); 2300 spin_unlock_irq(&pool->lock);
2375 schedule(); 2301 schedule();
2376 goto woke_up; 2302 goto woke_up;
2377} 2303}
2378 2304
2379/** 2305/**
2380 * rescuer_thread - the rescuer thread function 2306 * rescuer_thread - the rescuer thread function
2381 * @__wq: the associated workqueue 2307 * @__rescuer: self
2382 * 2308 *
2383 * Workqueue rescuer thread function. There's one rescuer for each 2309 * Workqueue rescuer thread function. There's one rescuer for each
2384 * workqueue which has WQ_RESCUER set. 2310 * workqueue which has WQ_RESCUER set.
2385 * 2311 *
2386 * Regular work processing on a gcwq may block trying to create a new 2312 * Regular work processing on a pool may block trying to create a new
2387 * worker which uses GFP_KERNEL allocation which has slight chance of 2313 * worker which uses GFP_KERNEL allocation which has slight chance of
2388 * developing into deadlock if some works currently on the same queue 2314 * developing into deadlock if some works currently on the same queue
2389 * need to be processed to satisfy the GFP_KERNEL allocation. This is 2315 * need to be processed to satisfy the GFP_KERNEL allocation. This is
2390 * the problem rescuer solves. 2316 * the problem rescuer solves.
2391 * 2317 *
2392 * When such condition is possible, the gcwq summons rescuers of all 2318 * When such condition is possible, the pool summons rescuers of all
2393 * workqueues which have works queued on the gcwq and let them process 2319 * workqueues which have works queued on the pool and let them process
2394 * those works so that forward progress can be guaranteed. 2320 * those works so that forward progress can be guaranteed.
2395 * 2321 *
2396 * This should happen rarely. 2322 * This should happen rarely.
2397 */ 2323 */
2398static int rescuer_thread(void *__wq) 2324static int rescuer_thread(void *__rescuer)
2399{ 2325{
2400 struct workqueue_struct *wq = __wq; 2326 struct worker *rescuer = __rescuer;
2401 struct worker *rescuer = wq->rescuer; 2327 struct workqueue_struct *wq = rescuer->rescue_wq;
2402 struct list_head *scheduled = &rescuer->scheduled; 2328 struct list_head *scheduled = &rescuer->scheduled;
2403 bool is_unbound = wq->flags & WQ_UNBOUND; 2329 bool is_unbound = wq->flags & WQ_UNBOUND;
2404 unsigned int cpu; 2330 unsigned int cpu;
2405 2331
2406 set_user_nice(current, RESCUER_NICE_LEVEL); 2332 set_user_nice(current, RESCUER_NICE_LEVEL);
2333
2334 /*
2335 * Mark rescuer as worker too. As WORKER_PREP is never cleared, it
2336 * doesn't participate in concurrency management.
2337 */
2338 rescuer->task->flags |= PF_WQ_WORKER;
2407repeat: 2339repeat:
2408 set_current_state(TASK_INTERRUPTIBLE); 2340 set_current_state(TASK_INTERRUPTIBLE);
2409 2341
2410 if (kthread_should_stop()) 2342 if (kthread_should_stop()) {
2343 __set_current_state(TASK_RUNNING);
2344 rescuer->task->flags &= ~PF_WQ_WORKER;
2411 return 0; 2345 return 0;
2346 }
2412 2347
2413 /* 2348 /*
2414 * See whether any cpu is asking for help. Unbounded 2349 * See whether any cpu is asking for help. Unbounded
@@ -2416,9 +2351,8 @@ repeat:
2416 */ 2351 */
2417 for_each_mayday_cpu(cpu, wq->mayday_mask) { 2352 for_each_mayday_cpu(cpu, wq->mayday_mask) {
2418 unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu; 2353 unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu;
2419 struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq); 2354 struct pool_workqueue *pwq = get_pwq(tcpu, wq);
2420 struct worker_pool *pool = cwq->pool; 2355 struct worker_pool *pool = pwq->pool;
2421 struct global_cwq *gcwq = pool->gcwq;
2422 struct work_struct *work, *n; 2356 struct work_struct *work, *n;
2423 2357
2424 __set_current_state(TASK_RUNNING); 2358 __set_current_state(TASK_RUNNING);
@@ -2434,22 +2368,24 @@ repeat:
2434 */ 2368 */
2435 BUG_ON(!list_empty(&rescuer->scheduled)); 2369 BUG_ON(!list_empty(&rescuer->scheduled));
2436 list_for_each_entry_safe(work, n, &pool->worklist, entry) 2370 list_for_each_entry_safe(work, n, &pool->worklist, entry)
2437 if (get_work_cwq(work) == cwq) 2371 if (get_work_pwq(work) == pwq)
2438 move_linked_works(work, scheduled, &n); 2372 move_linked_works(work, scheduled, &n);
2439 2373
2440 process_scheduled_works(rescuer); 2374 process_scheduled_works(rescuer);
2441 2375
2442 /* 2376 /*
2443 * Leave this gcwq. If keep_working() is %true, notify a 2377 * Leave this pool. If keep_working() is %true, notify a
2444 * regular worker; otherwise, we end up with 0 concurrency 2378 * regular worker; otherwise, we end up with 0 concurrency
2445 * and stalling the execution. 2379 * and stalling the execution.
2446 */ 2380 */
2447 if (keep_working(pool)) 2381 if (keep_working(pool))
2448 wake_up_worker(pool); 2382 wake_up_worker(pool);
2449 2383
2450 spin_unlock_irq(&gcwq->lock); 2384 spin_unlock_irq(&pool->lock);
2451 } 2385 }
2452 2386
2387 /* rescuers should never participate in concurrency management */
2388 WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING));
2453 schedule(); 2389 schedule();
2454 goto repeat; 2390 goto repeat;
2455} 2391}
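
rescuer_thread() walks the distressed pool's worklist and claims only the items whose pwq belongs to its own workqueue (get_work_pwq(work) == pwq), moving them onto the rescuer's private scheduled list before processing them. The list surgery is modelled below with a plain singly linked list; this is a sketch, not the kernel's list_head machinery.

#include <stdio.h>

struct toy_work {
	int id;
	int wq;			/* which workqueue this work belongs to */
	struct toy_work *next;
};

/* unlink every work belonging to @wq from *src and push it onto *dst */
static void move_matching_works(struct toy_work **src, struct toy_work **dst, int wq)
{
	struct toy_work **pp = src;

	while (*pp) {
		struct toy_work *work = *pp;

		if (work->wq == wq) {
			*pp = work->next;	/* unlink from the pool worklist */
			work->next = *dst;	/* push onto the rescuer's list */
			*dst = work;
		} else {
			pp = &work->next;
		}
	}
}

int main(void)
{
	struct toy_work w3 = { .id = 3, .wq = 1 };
	struct toy_work w2 = { .id = 2, .wq = 2, .next = &w3 };
	struct toy_work w1 = { .id = 1, .wq = 1, .next = &w2 };
	struct toy_work *worklist = &w1;	/* distressed pool's worklist */
	struct toy_work *scheduled = NULL;	/* rescuer's private list */

	move_matching_works(&worklist, &scheduled, 1);

	printf("rescuer took:");
	for (struct toy_work *w = scheduled; w; w = w->next)
		printf(" %d", w->id);
	printf("\nleft behind:");
	for (struct toy_work *w = worklist; w; w = w->next)
		printf(" %d", w->id);
	printf("\n");
	return 0;
}
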
@@ -2467,7 +2403,7 @@ static void wq_barrier_func(struct work_struct *work)
2467 2403
2468/** 2404/**
2469 * insert_wq_barrier - insert a barrier work 2405 * insert_wq_barrier - insert a barrier work
2470 * @cwq: cwq to insert barrier into 2406 * @pwq: pwq to insert barrier into
2471 * @barr: wq_barrier to insert 2407 * @barr: wq_barrier to insert
2472 * @target: target work to attach @barr to 2408 * @target: target work to attach @barr to
2473 * @worker: worker currently executing @target, NULL if @target is not executing 2409 * @worker: worker currently executing @target, NULL if @target is not executing
@@ -2484,12 +2420,12 @@ static void wq_barrier_func(struct work_struct *work)
2484 * after a work with LINKED flag set. 2420 * after a work with LINKED flag set.
2485 * 2421 *
2486 * Note that when @worker is non-NULL, @target may be modified 2422 * Note that when @worker is non-NULL, @target may be modified
2487 * underneath us, so we can't reliably determine cwq from @target. 2423 * underneath us, so we can't reliably determine pwq from @target.
2488 * 2424 *
2489 * CONTEXT: 2425 * CONTEXT:
2490 * spin_lock_irq(gcwq->lock). 2426 * spin_lock_irq(pool->lock).
2491 */ 2427 */
2492static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, 2428static void insert_wq_barrier(struct pool_workqueue *pwq,
2493 struct wq_barrier *barr, 2429 struct wq_barrier *barr,
2494 struct work_struct *target, struct worker *worker) 2430 struct work_struct *target, struct worker *worker)
2495{ 2431{
@@ -2497,7 +2433,7 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
2497 unsigned int linked = 0; 2433 unsigned int linked = 0;
2498 2434
2499 /* 2435 /*
2500 * debugobject calls are safe here even with gcwq->lock locked 2436 * debugobject calls are safe here even with pool->lock locked
2501 * as we know for sure that this will not trigger any of the 2437 * as we know for sure that this will not trigger any of the
2502 * checks and call back into the fixup functions where we 2438 * checks and call back into the fixup functions where we
2503 * might deadlock. 2439 * might deadlock.
@@ -2522,23 +2458,23 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
2522 } 2458 }
2523 2459
2524 debug_work_activate(&barr->work); 2460 debug_work_activate(&barr->work);
2525 insert_work(cwq, &barr->work, head, 2461 insert_work(pwq, &barr->work, head,
2526 work_color_to_flags(WORK_NO_COLOR) | linked); 2462 work_color_to_flags(WORK_NO_COLOR) | linked);
2527} 2463}
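
insert_wq_barrier() queues a special barrier work directly behind the target (or at the head of the executing worker's scheduled list); the barrier's function only completes a completion, and the flusher sleeps on that completion, which is what makes flush_work() wait exactly until the target has run. A compact pthread model of the barrier-plus-completion idea follows; toy_completion is invented and stands in for the kernel's struct completion.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* a minimal "completion": done flag + condition variable */
struct toy_completion {
	pthread_mutex_t lock;
	pthread_cond_t cond;
	bool done;
};

struct toy_barrier {
	struct toy_completion done;
};

static void complete(struct toy_completion *c)
{
	pthread_mutex_lock(&c->lock);
	c->done = true;
	pthread_cond_signal(&c->cond);
	pthread_mutex_unlock(&c->lock);
}

static void wait_for_completion(struct toy_completion *c)
{
	pthread_mutex_lock(&c->lock);
	while (!c->done)
		pthread_cond_wait(&c->cond, &c->lock);
	pthread_mutex_unlock(&c->lock);
}

/* the barrier work's function: just flag completion, like wq_barrier_func() */
static void barrier_func(struct toy_barrier *barr)
{
	complete(&barr->done);
}

/* stands in for the worker thread draining its scheduled list */
static void *worker_fn(void *arg)
{
	struct toy_barrier *barr = arg;

	printf("worker: running the flushed work\n");
	printf("worker: running the barrier queued right after it\n");
	barrier_func(barr);
	return NULL;
}

int main(void)
{
	struct toy_barrier barr = {
		.done = { .lock = PTHREAD_MUTEX_INITIALIZER,
			  .cond = PTHREAD_COND_INITIALIZER,
			  .done = false },
	};
	pthread_t worker;

	pthread_create(&worker, NULL, worker_fn, &barr);
	wait_for_completion(&barr.done);	/* flush_work() waits here */
	printf("flusher: barrier completed, target work has finished\n");
	pthread_join(worker, NULL);
	return 0;
}
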
2528 2464
2529/** 2465/**
2530 * flush_workqueue_prep_cwqs - prepare cwqs for workqueue flushing 2466 * flush_workqueue_prep_pwqs - prepare pwqs for workqueue flushing
2531 * @wq: workqueue being flushed 2467 * @wq: workqueue being flushed
2532 * @flush_color: new flush color, < 0 for no-op 2468 * @flush_color: new flush color, < 0 for no-op
2533 * @work_color: new work color, < 0 for no-op 2469 * @work_color: new work color, < 0 for no-op
2534 * 2470 *
2535 * Prepare cwqs for workqueue flushing. 2471 * Prepare pwqs for workqueue flushing.
2536 * 2472 *
2537 * If @flush_color is non-negative, flush_color on all cwqs should be 2473 * If @flush_color is non-negative, flush_color on all pwqs should be
2538 * -1. If no cwq has in-flight commands at the specified color, all 2474 * -1. If no pwq has in-flight commands at the specified color, all
2539 * cwq->flush_color's stay at -1 and %false is returned. If any cwq 2475 * pwq->flush_color's stay at -1 and %false is returned. If any pwq
2540 * has in flight commands, its cwq->flush_color is set to 2476 * has in flight commands, its pwq->flush_color is set to
2541 * @flush_color, @wq->nr_cwqs_to_flush is updated accordingly, cwq 2477 * @flush_color, @wq->nr_pwqs_to_flush is updated accordingly, pwq
2542 * wakeup logic is armed and %true is returned. 2478 * wakeup logic is armed and %true is returned.
2543 * 2479 *
2544 * The caller should have initialized @wq->first_flusher prior to 2480 * The caller should have initialized @wq->first_flusher prior to
@@ -2546,7 +2482,7 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
2546 * @flush_color is negative, no flush color update is done and %false 2482 * @flush_color is negative, no flush color update is done and %false
2547 * is returned. 2483 * is returned.
2548 * 2484 *
2549 * If @work_color is non-negative, all cwqs should have the same 2485 * If @work_color is non-negative, all pwqs should have the same
2550 * work_color which is previous to @work_color and all will be 2486 * work_color which is previous to @work_color and all will be
2551 * advanced to @work_color. 2487 * advanced to @work_color.
2552 * 2488 *
@@ -2557,42 +2493,42 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
2557 * %true if @flush_color >= 0 and there's something to flush. %false 2493 * %true if @flush_color >= 0 and there's something to flush. %false
2558 * otherwise. 2494 * otherwise.
2559 */ 2495 */
2560static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq, 2496static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq,
2561 int flush_color, int work_color) 2497 int flush_color, int work_color)
2562{ 2498{
2563 bool wait = false; 2499 bool wait = false;
2564 unsigned int cpu; 2500 unsigned int cpu;
2565 2501
2566 if (flush_color >= 0) { 2502 if (flush_color >= 0) {
2567 BUG_ON(atomic_read(&wq->nr_cwqs_to_flush)); 2503 BUG_ON(atomic_read(&wq->nr_pwqs_to_flush));
2568 atomic_set(&wq->nr_cwqs_to_flush, 1); 2504 atomic_set(&wq->nr_pwqs_to_flush, 1);
2569 } 2505 }
2570 2506
2571 for_each_cwq_cpu(cpu, wq) { 2507 for_each_pwq_cpu(cpu, wq) {
2572 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 2508 struct pool_workqueue *pwq = get_pwq(cpu, wq);
2573 struct global_cwq *gcwq = cwq->pool->gcwq; 2509 struct worker_pool *pool = pwq->pool;
2574 2510
2575 spin_lock_irq(&gcwq->lock); 2511 spin_lock_irq(&pool->lock);
2576 2512
2577 if (flush_color >= 0) { 2513 if (flush_color >= 0) {
2578 BUG_ON(cwq->flush_color != -1); 2514 BUG_ON(pwq->flush_color != -1);
2579 2515
2580 if (cwq->nr_in_flight[flush_color]) { 2516 if (pwq->nr_in_flight[flush_color]) {
2581 cwq->flush_color = flush_color; 2517 pwq->flush_color = flush_color;
2582 atomic_inc(&wq->nr_cwqs_to_flush); 2518 atomic_inc(&wq->nr_pwqs_to_flush);
2583 wait = true; 2519 wait = true;
2584 } 2520 }
2585 } 2521 }
2586 2522
2587 if (work_color >= 0) { 2523 if (work_color >= 0) {
2588 BUG_ON(work_color != work_next_color(cwq->work_color)); 2524 BUG_ON(work_color != work_next_color(pwq->work_color));
2589 cwq->work_color = work_color; 2525 pwq->work_color = work_color;
2590 } 2526 }
2591 2527
2592 spin_unlock_irq(&gcwq->lock); 2528 spin_unlock_irq(&pool->lock);
2593 } 2529 }
2594 2530
2595 if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_cwqs_to_flush)) 2531 if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_pwqs_to_flush))
2596 complete(&wq->first_flusher->done); 2532 complete(&wq->first_flusher->done);
2597 2533
2598 return wait; 2534 return wait;
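
flush_workqueue_prep_pwqs() uses a biased counter: nr_pwqs_to_flush starts at 1, every pwq with in-flight work of the flush color adds a reference, and the initial bias is dropped at the end, so the first flusher's completion fires only after the last busy pwq drains. The same start-at-one pattern in C11 atomics, as a sketch rather than the kernel code, with invented data:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_PWQS 4

static atomic_int nr_pwqs_to_flush;
static int in_flight[NR_PWQS] = { 0, 3, 0, 1 };	/* works still running per pwq */

static void complete_first_flusher(void)
{
	printf("flush color complete, waking the first flusher\n");
}

/* called by a pwq when its last in-flight work of this color retires */
static void pwq_flush_done(int pwq)
{
	printf("pwq %d drained\n", pwq);
	if (atomic_fetch_sub(&nr_pwqs_to_flush, 1) == 1)
		complete_first_flusher();
}

static bool prep_flush(void)
{
	bool wait = false;

	/*
	 * Start at 1: the extra reference ("bias") keeps the counter from
	 * hitting zero while we are still walking the pwqs below.
	 */
	atomic_store(&nr_pwqs_to_flush, 1);

	for (int pwq = 0; pwq < NR_PWQS; pwq++) {
		if (in_flight[pwq]) {
			atomic_fetch_add(&nr_pwqs_to_flush, 1);
			wait = true;
		}
	}

	/* drop the bias; if nothing was in flight this completes immediately */
	if (atomic_fetch_sub(&nr_pwqs_to_flush, 1) == 1)
		complete_first_flusher();

	return wait;
}

int main(void)
{
	if (prep_flush()) {
		/* later, as each busy pwq retires its colored works... */
		pwq_flush_done(1);
		pwq_flush_done(3);
	}
	return 0;
}
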
@@ -2643,7 +2579,7 @@ void flush_workqueue(struct workqueue_struct *wq)
2643 2579
2644 wq->first_flusher = &this_flusher; 2580 wq->first_flusher = &this_flusher;
2645 2581
2646 if (!flush_workqueue_prep_cwqs(wq, wq->flush_color, 2582 if (!flush_workqueue_prep_pwqs(wq, wq->flush_color,
2647 wq->work_color)) { 2583 wq->work_color)) {
2648 /* nothing to flush, done */ 2584 /* nothing to flush, done */
2649 wq->flush_color = next_color; 2585 wq->flush_color = next_color;
@@ -2654,7 +2590,7 @@ void flush_workqueue(struct workqueue_struct *wq)
2654 /* wait in queue */ 2590 /* wait in queue */
2655 BUG_ON(wq->flush_color == this_flusher.flush_color); 2591 BUG_ON(wq->flush_color == this_flusher.flush_color);
2656 list_add_tail(&this_flusher.list, &wq->flusher_queue); 2592 list_add_tail(&this_flusher.list, &wq->flusher_queue);
2657 flush_workqueue_prep_cwqs(wq, -1, wq->work_color); 2593 flush_workqueue_prep_pwqs(wq, -1, wq->work_color);
2658 } 2594 }
2659 } else { 2595 } else {
2660 /* 2596 /*
@@ -2721,7 +2657,7 @@ void flush_workqueue(struct workqueue_struct *wq)
2721 2657
2722 list_splice_tail_init(&wq->flusher_overflow, 2658 list_splice_tail_init(&wq->flusher_overflow,
2723 &wq->flusher_queue); 2659 &wq->flusher_queue);
2724 flush_workqueue_prep_cwqs(wq, -1, wq->work_color); 2660 flush_workqueue_prep_pwqs(wq, -1, wq->work_color);
2725 } 2661 }
2726 2662
2727 if (list_empty(&wq->flusher_queue)) { 2663 if (list_empty(&wq->flusher_queue)) {
@@ -2731,7 +2667,7 @@ void flush_workqueue(struct workqueue_struct *wq)
2731 2667
2732 /* 2668 /*
2733 * Need to flush more colors. Make the next flusher 2669 * Need to flush more colors. Make the next flusher
2734 * the new first flusher and arm cwqs. 2670 * the new first flusher and arm pwqs.
2735 */ 2671 */
2736 BUG_ON(wq->flush_color == wq->work_color); 2672 BUG_ON(wq->flush_color == wq->work_color);
2737 BUG_ON(wq->flush_color != next->flush_color); 2673 BUG_ON(wq->flush_color != next->flush_color);
@@ -2739,7 +2675,7 @@ void flush_workqueue(struct workqueue_struct *wq)
2739 list_del_init(&next->list); 2675 list_del_init(&next->list);
2740 wq->first_flusher = next; 2676 wq->first_flusher = next;
2741 2677
2742 if (flush_workqueue_prep_cwqs(wq, wq->flush_color, -1)) 2678 if (flush_workqueue_prep_pwqs(wq, wq->flush_color, -1))
2743 break; 2679 break;
2744 2680
2745 /* 2681 /*
@@ -2782,13 +2718,13 @@ void drain_workqueue(struct workqueue_struct *wq)
2782reflush: 2718reflush:
2783 flush_workqueue(wq); 2719 flush_workqueue(wq);
2784 2720
2785 for_each_cwq_cpu(cpu, wq) { 2721 for_each_pwq_cpu(cpu, wq) {
2786 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 2722 struct pool_workqueue *pwq = get_pwq(cpu, wq);
2787 bool drained; 2723 bool drained;
2788 2724
2789 spin_lock_irq(&cwq->pool->gcwq->lock); 2725 spin_lock_irq(&pwq->pool->lock);
2790 drained = !cwq->nr_active && list_empty(&cwq->delayed_works); 2726 drained = !pwq->nr_active && list_empty(&pwq->delayed_works);
2791 spin_unlock_irq(&cwq->pool->gcwq->lock); 2727 spin_unlock_irq(&pwq->pool->lock);
2792 2728
2793 if (drained) 2729 if (drained)
2794 continue; 2730 continue;
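
[Editor's note, not part of the patch] The loop above is what lets drain_workqueue() keep reflushing until no pool_workqueue has active or delayed work left, which matters when work items requeue themselves. A hedged teardown sketch (hypothetical names; destroy_workqueue() performs a drain of its own, the explicit call is shown only to illustrate the API):

        #include <linux/workqueue.h>

        static void my_shutdown(struct workqueue_struct *my_wq)
        {
                /* self-requeueing works are flushed repeatedly until they stop */
                drain_workqueue(my_wq);
                destroy_workqueue(my_wq);
        }
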
@@ -2810,34 +2746,29 @@ EXPORT_SYMBOL_GPL(drain_workqueue);
2810static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr) 2746static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
2811{ 2747{
2812 struct worker *worker = NULL; 2748 struct worker *worker = NULL;
2813 struct global_cwq *gcwq; 2749 struct worker_pool *pool;
2814 struct cpu_workqueue_struct *cwq; 2750 struct pool_workqueue *pwq;
2815 2751
2816 might_sleep(); 2752 might_sleep();
2817 gcwq = get_work_gcwq(work); 2753 pool = get_work_pool(work);
2818 if (!gcwq) 2754 if (!pool)
2819 return false; 2755 return false;
2820 2756
2821 spin_lock_irq(&gcwq->lock); 2757 spin_lock_irq(&pool->lock);
2822 if (!list_empty(&work->entry)) { 2758 /* see the comment in try_to_grab_pending() with the same code */
2823 /* 2759 pwq = get_work_pwq(work);
2824 * See the comment near try_to_grab_pending()->smp_rmb(). 2760 if (pwq) {
2825 * If it was re-queued to a different gcwq under us, we 2761 if (unlikely(pwq->pool != pool))
2826 * are not going to wait.
2827 */
2828 smp_rmb();
2829 cwq = get_work_cwq(work);
2830 if (unlikely(!cwq || gcwq != cwq->pool->gcwq))
2831 goto already_gone; 2762 goto already_gone;
2832 } else { 2763 } else {
2833 worker = find_worker_executing_work(gcwq, work); 2764 worker = find_worker_executing_work(pool, work);
2834 if (!worker) 2765 if (!worker)
2835 goto already_gone; 2766 goto already_gone;
2836 cwq = worker->current_cwq; 2767 pwq = worker->current_pwq;
2837 } 2768 }
2838 2769
2839 insert_wq_barrier(cwq, barr, work, worker); 2770 insert_wq_barrier(pwq, barr, work, worker);
2840 spin_unlock_irq(&gcwq->lock); 2771 spin_unlock_irq(&pool->lock);
2841 2772
2842 /* 2773 /*
2843 * If @max_active is 1 or rescuer is in use, flushing another work 2774 * If @max_active is 1 or rescuer is in use, flushing another work
@@ -2845,15 +2776,15 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
2845 * flusher is not running on the same workqueue by verifying write 2776 * flusher is not running on the same workqueue by verifying write
2846 * access. 2777 * access.
2847 */ 2778 */
2848 if (cwq->wq->saved_max_active == 1 || cwq->wq->flags & WQ_RESCUER) 2779 if (pwq->wq->saved_max_active == 1 || pwq->wq->flags & WQ_RESCUER)
2849 lock_map_acquire(&cwq->wq->lockdep_map); 2780 lock_map_acquire(&pwq->wq->lockdep_map);
2850 else 2781 else
2851 lock_map_acquire_read(&cwq->wq->lockdep_map); 2782 lock_map_acquire_read(&pwq->wq->lockdep_map);
2852 lock_map_release(&cwq->wq->lockdep_map); 2783 lock_map_release(&pwq->wq->lockdep_map);
2853 2784
2854 return true; 2785 return true;
2855already_gone: 2786already_gone:
2856 spin_unlock_irq(&gcwq->lock); 2787 spin_unlock_irq(&pool->lock);
2857 return false; 2788 return false;
2858} 2789}
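
[Editor's note, not part of the patch] start_flush_work() is the backend of flush_work(); from a caller's point of view flush_work() simply blocks until that one item is done, whichever pool it ran on. Sketch with an assumed caller:

        #include <linux/workqueue.h>
        #include <linux/printk.h>

        static void my_wait_for(struct work_struct *work)
        {
                if (!flush_work(work))
                        pr_debug("work was idle, nothing to wait for\n");
        }
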
2859 2790
@@ -2949,8 +2880,7 @@ bool flush_delayed_work(struct delayed_work *dwork)
2949{ 2880{
2950 local_irq_disable(); 2881 local_irq_disable();
2951 if (del_timer_sync(&dwork->timer)) 2882 if (del_timer_sync(&dwork->timer))
2952 __queue_work(dwork->cpu, 2883 __queue_work(dwork->cpu, dwork->wq, &dwork->work);
2953 get_work_cwq(&dwork->work)->wq, &dwork->work);
2954 local_irq_enable(); 2884 local_irq_enable();
2955 return flush_work(&dwork->work); 2885 return flush_work(&dwork->work);
2956} 2886}
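
[Editor's note, not part of the patch] With dwork->wq recorded at queueing time, flush_delayed_work() no longer has to derive the workqueue from the work's pwq. Caller-side usage is unchanged; a sketch assuming a delayed_work already set up with INIT_DELAYED_WORK():

        #include <linux/workqueue.h>

        static void my_expedite(struct delayed_work *dwork)
        {
                queue_delayed_work(system_wq, dwork, 10 * HZ);
                /* ... later: cancel the timer, run the work now and wait for it */
                flush_delayed_work(dwork);
        }
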
@@ -2980,7 +2910,8 @@ bool cancel_delayed_work(struct delayed_work *dwork)
2980 if (unlikely(ret < 0)) 2910 if (unlikely(ret < 0))
2981 return false; 2911 return false;
2982 2912
2983 set_work_cpu_and_clear_pending(&dwork->work, work_cpu(&dwork->work)); 2913 set_work_pool_and_clear_pending(&dwork->work,
2914 get_work_pool_id(&dwork->work));
2984 local_irq_restore(flags); 2915 local_irq_restore(flags);
2985 return ret; 2916 return ret;
2986} 2917}
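
[Editor's note, not part of the patch] cancel_delayed_work() is non-blocking; the change above merely records the last pool ID instead of a CPU while clearing PENDING. A typical rearm pattern, sketched with assumed names (mod_delayed_work() expresses the same thing in one call):

        static void my_rearm(struct delayed_work *dwork, unsigned long delay)
        {
                cancel_delayed_work(dwork);     /* does not wait for a running instance */
                queue_delayed_work(system_wq, dwork, delay);
        }
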
@@ -3159,46 +3090,46 @@ int keventd_up(void)
3159 return system_wq != NULL; 3090 return system_wq != NULL;
3160} 3091}
3161 3092
3162static int alloc_cwqs(struct workqueue_struct *wq) 3093static int alloc_pwqs(struct workqueue_struct *wq)
3163{ 3094{
3164 /* 3095 /*
3165 * cwqs are forced aligned according to WORK_STRUCT_FLAG_BITS. 3096 * pwqs are forced aligned according to WORK_STRUCT_FLAG_BITS.
3166 * Make sure that the alignment isn't lower than that of 3097 * Make sure that the alignment isn't lower than that of
3167 * unsigned long long. 3098 * unsigned long long.
3168 */ 3099 */
3169 const size_t size = sizeof(struct cpu_workqueue_struct); 3100 const size_t size = sizeof(struct pool_workqueue);
3170 const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS, 3101 const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS,
3171 __alignof__(unsigned long long)); 3102 __alignof__(unsigned long long));
3172 3103
3173 if (!(wq->flags & WQ_UNBOUND)) 3104 if (!(wq->flags & WQ_UNBOUND))
3174 wq->cpu_wq.pcpu = __alloc_percpu(size, align); 3105 wq->pool_wq.pcpu = __alloc_percpu(size, align);
3175 else { 3106 else {
3176 void *ptr; 3107 void *ptr;
3177 3108
3178 /* 3109 /*
3179 * Allocate enough room to align cwq and put an extra 3110 * Allocate enough room to align pwq and put an extra
3180 * pointer at the end pointing back to the originally 3111 * pointer at the end pointing back to the originally
3181 * allocated pointer which will be used for free. 3112 * allocated pointer which will be used for free.
3182 */ 3113 */
3183 ptr = kzalloc(size + align + sizeof(void *), GFP_KERNEL); 3114 ptr = kzalloc(size + align + sizeof(void *), GFP_KERNEL);
3184 if (ptr) { 3115 if (ptr) {
3185 wq->cpu_wq.single = PTR_ALIGN(ptr, align); 3116 wq->pool_wq.single = PTR_ALIGN(ptr, align);
3186 *(void **)(wq->cpu_wq.single + 1) = ptr; 3117 *(void **)(wq->pool_wq.single + 1) = ptr;
3187 } 3118 }
3188 } 3119 }
3189 3120
3190 /* just in case, make sure it's actually aligned */ 3121 /* just in case, make sure it's actually aligned */
3191 BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align)); 3122 BUG_ON(!IS_ALIGNED(wq->pool_wq.v, align));
3192 return wq->cpu_wq.v ? 0 : -ENOMEM; 3123 return wq->pool_wq.v ? 0 : -ENOMEM;
3193} 3124}
3194 3125
3195static void free_cwqs(struct workqueue_struct *wq) 3126static void free_pwqs(struct workqueue_struct *wq)
3196{ 3127{
3197 if (!(wq->flags & WQ_UNBOUND)) 3128 if (!(wq->flags & WQ_UNBOUND))
3198 free_percpu(wq->cpu_wq.pcpu); 3129 free_percpu(wq->pool_wq.pcpu);
3199 else if (wq->cpu_wq.single) { 3130 else if (wq->pool_wq.single) {
3200 /* the pointer to free is stored right after the cwq */ 3131 /* the pointer to free is stored right after the pwq */
3201 kfree(*(void **)(wq->cpu_wq.single + 1)); 3132 kfree(*(void **)(wq->pool_wq.single + 1));
3202 } 3133 }
3203} 3134}
3204 3135
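
[Editor's note, not part of the patch] In the unbound case alloc_pwqs() over-allocates, aligns the object with PTR_ALIGN() and stashes the original pointer right behind it so free_pwqs() can find it. The same trick in isolation, as an illustrative helper that is not kernel code:

        #include <linux/slab.h>
        #include <linux/kernel.h>

        /* illustrative only: return a size-byte object aligned to align */
        static void *my_alloc_aligned(size_t size, size_t align)
        {
                void *raw = kzalloc(size + align + sizeof(void *), GFP_KERNEL);
                void *obj;

                if (!raw)
                        return NULL;
                obj = PTR_ALIGN(raw, align);
                *(void **)((char *)obj + size) = raw;   /* remember what to kfree() */
                return obj;
        }

        static void my_free_aligned(void *obj, size_t size)
        {
                if (obj)
                        kfree(*(void **)((char *)obj + size));
        }
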
@@ -3252,27 +3183,25 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
3252 wq->flags = flags; 3183 wq->flags = flags;
3253 wq->saved_max_active = max_active; 3184 wq->saved_max_active = max_active;
3254 mutex_init(&wq->flush_mutex); 3185 mutex_init(&wq->flush_mutex);
3255 atomic_set(&wq->nr_cwqs_to_flush, 0); 3186 atomic_set(&wq->nr_pwqs_to_flush, 0);
3256 INIT_LIST_HEAD(&wq->flusher_queue); 3187 INIT_LIST_HEAD(&wq->flusher_queue);
3257 INIT_LIST_HEAD(&wq->flusher_overflow); 3188 INIT_LIST_HEAD(&wq->flusher_overflow);
3258 3189
3259 lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); 3190 lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
3260 INIT_LIST_HEAD(&wq->list); 3191 INIT_LIST_HEAD(&wq->list);
3261 3192
3262 if (alloc_cwqs(wq) < 0) 3193 if (alloc_pwqs(wq) < 0)
3263 goto err; 3194 goto err;
3264 3195
3265 for_each_cwq_cpu(cpu, wq) { 3196 for_each_pwq_cpu(cpu, wq) {
3266 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 3197 struct pool_workqueue *pwq = get_pwq(cpu, wq);
3267 struct global_cwq *gcwq = get_gcwq(cpu); 3198
3268 int pool_idx = (bool)(flags & WQ_HIGHPRI); 3199 BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK);
3269 3200 pwq->pool = get_std_worker_pool(cpu, flags & WQ_HIGHPRI);
3270 BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK); 3201 pwq->wq = wq;
3271 cwq->pool = &gcwq->pools[pool_idx]; 3202 pwq->flush_color = -1;
3272 cwq->wq = wq; 3203 pwq->max_active = max_active;
3273 cwq->flush_color = -1; 3204 INIT_LIST_HEAD(&pwq->delayed_works);
3274 cwq->max_active = max_active;
3275 INIT_LIST_HEAD(&cwq->delayed_works);
3276 } 3205 }
3277 3206
3278 if (flags & WQ_RESCUER) { 3207 if (flags & WQ_RESCUER) {
@@ -3285,7 +3214,8 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
3285 if (!rescuer) 3214 if (!rescuer)
3286 goto err; 3215 goto err;
3287 3216
3288 rescuer->task = kthread_create(rescuer_thread, wq, "%s", 3217 rescuer->rescue_wq = wq;
3218 rescuer->task = kthread_create(rescuer_thread, rescuer, "%s",
3289 wq->name); 3219 wq->name);
3290 if (IS_ERR(rescuer->task)) 3220 if (IS_ERR(rescuer->task))
3291 goto err; 3221 goto err;
@@ -3302,8 +3232,8 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
3302 spin_lock(&workqueue_lock); 3232 spin_lock(&workqueue_lock);
3303 3233
3304 if (workqueue_freezing && wq->flags & WQ_FREEZABLE) 3234 if (workqueue_freezing && wq->flags & WQ_FREEZABLE)
3305 for_each_cwq_cpu(cpu, wq) 3235 for_each_pwq_cpu(cpu, wq)
3306 get_cwq(cpu, wq)->max_active = 0; 3236 get_pwq(cpu, wq)->max_active = 0;
3307 3237
3308 list_add(&wq->list, &workqueues); 3238 list_add(&wq->list, &workqueues);
3309 3239
@@ -3312,7 +3242,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
3312 return wq; 3242 return wq;
3313err: 3243err:
3314 if (wq) { 3244 if (wq) {
3315 free_cwqs(wq); 3245 free_pwqs(wq);
3316 free_mayday_mask(wq->mayday_mask); 3246 free_mayday_mask(wq->mayday_mask);
3317 kfree(wq->rescuer); 3247 kfree(wq->rescuer);
3318 kfree(wq); 3248 kfree(wq);
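
[Editor's note, not part of the patch] The error path above releases whatever __alloc_workqueue_key() managed to set up. On the caller side the usual pattern is to check the returned pointer and pair the allocation with destroy_workqueue(); a sketch with made-up names:

        #include <linux/workqueue.h>
        #include <linux/errno.h>

        static struct workqueue_struct *my_wq;

        static int my_init(void)
        {
                /* freezable, per-CPU, at most one work item in flight per CPU */
                my_wq = alloc_workqueue("my_wq", WQ_FREEZABLE, 1);
                if (!my_wq)
                        return -ENOMEM;
                return 0;
        }

        static void my_exit(void)
        {
                destroy_workqueue(my_wq);
        }
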
@@ -3343,14 +3273,14 @@ void destroy_workqueue(struct workqueue_struct *wq)
3343 spin_unlock(&workqueue_lock); 3273 spin_unlock(&workqueue_lock);
3344 3274
3345 /* sanity check */ 3275 /* sanity check */
3346 for_each_cwq_cpu(cpu, wq) { 3276 for_each_pwq_cpu(cpu, wq) {
3347 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 3277 struct pool_workqueue *pwq = get_pwq(cpu, wq);
3348 int i; 3278 int i;
3349 3279
3350 for (i = 0; i < WORK_NR_COLORS; i++) 3280 for (i = 0; i < WORK_NR_COLORS; i++)
3351 BUG_ON(cwq->nr_in_flight[i]); 3281 BUG_ON(pwq->nr_in_flight[i]);
3352 BUG_ON(cwq->nr_active); 3282 BUG_ON(pwq->nr_active);
3353 BUG_ON(!list_empty(&cwq->delayed_works)); 3283 BUG_ON(!list_empty(&pwq->delayed_works));
3354 } 3284 }
3355 3285
3356 if (wq->flags & WQ_RESCUER) { 3286 if (wq->flags & WQ_RESCUER) {
@@ -3359,29 +3289,29 @@ void destroy_workqueue(struct workqueue_struct *wq)
3359 kfree(wq->rescuer); 3289 kfree(wq->rescuer);
3360 } 3290 }
3361 3291
3362 free_cwqs(wq); 3292 free_pwqs(wq);
3363 kfree(wq); 3293 kfree(wq);
3364} 3294}
3365EXPORT_SYMBOL_GPL(destroy_workqueue); 3295EXPORT_SYMBOL_GPL(destroy_workqueue);
3366 3296
3367/** 3297/**
3368 * cwq_set_max_active - adjust max_active of a cwq 3298 * pwq_set_max_active - adjust max_active of a pwq
3369 * @cwq: target cpu_workqueue_struct 3299 * @pwq: target pool_workqueue
3370 * @max_active: new max_active value. 3300 * @max_active: new max_active value.
3371 * 3301 *
3372 * Set @cwq->max_active to @max_active and activate delayed works if 3302 * Set @pwq->max_active to @max_active and activate delayed works if
3373 * increased. 3303 * increased.
3374 * 3304 *
3375 * CONTEXT: 3305 * CONTEXT:
3376 * spin_lock_irq(gcwq->lock). 3306 * spin_lock_irq(pool->lock).
3377 */ 3307 */
3378static void cwq_set_max_active(struct cpu_workqueue_struct *cwq, int max_active) 3308static void pwq_set_max_active(struct pool_workqueue *pwq, int max_active)
3379{ 3309{
3380 cwq->max_active = max_active; 3310 pwq->max_active = max_active;
3381 3311
3382 while (!list_empty(&cwq->delayed_works) && 3312 while (!list_empty(&pwq->delayed_works) &&
3383 cwq->nr_active < cwq->max_active) 3313 pwq->nr_active < pwq->max_active)
3384 cwq_activate_first_delayed(cwq); 3314 pwq_activate_first_delayed(pwq);
3385} 3315}
3386 3316
3387/** 3317/**
@@ -3404,16 +3334,17 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
3404 3334
3405 wq->saved_max_active = max_active; 3335 wq->saved_max_active = max_active;
3406 3336
3407 for_each_cwq_cpu(cpu, wq) { 3337 for_each_pwq_cpu(cpu, wq) {
3408 struct global_cwq *gcwq = get_gcwq(cpu); 3338 struct pool_workqueue *pwq = get_pwq(cpu, wq);
3339 struct worker_pool *pool = pwq->pool;
3409 3340
3410 spin_lock_irq(&gcwq->lock); 3341 spin_lock_irq(&pool->lock);
3411 3342
3412 if (!(wq->flags & WQ_FREEZABLE) || 3343 if (!(wq->flags & WQ_FREEZABLE) ||
3413 !(gcwq->flags & GCWQ_FREEZING)) 3344 !(pool->flags & POOL_FREEZING))
3414 cwq_set_max_active(get_cwq(gcwq->cpu, wq), max_active); 3345 pwq_set_max_active(pwq, max_active);
3415 3346
3416 spin_unlock_irq(&gcwq->lock); 3347 spin_unlock_irq(&pool->lock);
3417 } 3348 }
3418 3349
3419 spin_unlock(&workqueue_lock); 3350 spin_unlock(&workqueue_lock);
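
[Editor's note, not part of the patch] workqueue_set_max_active() may be called at runtime; raising the limit immediately activates delayed works through pwq_set_max_active() above. A hypothetical tuning hook (the knob wiring is assumed, not in the patch):

        /* e.g. wired to a module parameter or debugfs attribute */
        static void my_set_limit(struct workqueue_struct *my_wq, int limit)
        {
                /* the core clamps the value to a sane range internally */
                workqueue_set_max_active(my_wq, limit);
        }
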
@@ -3434,57 +3365,38 @@ EXPORT_SYMBOL_GPL(workqueue_set_max_active);
3434 */ 3365 */
3435bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq) 3366bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq)
3436{ 3367{
3437 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 3368 struct pool_workqueue *pwq = get_pwq(cpu, wq);
3438 3369
3439 return !list_empty(&cwq->delayed_works); 3370 return !list_empty(&pwq->delayed_works);
3440} 3371}
3441EXPORT_SYMBOL_GPL(workqueue_congested); 3372EXPORT_SYMBOL_GPL(workqueue_congested);
3442 3373
3443/** 3374/**
3444 * work_cpu - return the last known associated cpu for @work
3445 * @work: the work of interest
3446 *
3447 * RETURNS:
3448 * CPU number if @work was ever queued. WORK_CPU_NONE otherwise.
3449 */
3450unsigned int work_cpu(struct work_struct *work)
3451{
3452 struct global_cwq *gcwq = get_work_gcwq(work);
3453
3454 return gcwq ? gcwq->cpu : WORK_CPU_NONE;
3455}
3456EXPORT_SYMBOL_GPL(work_cpu);
3457
3458/**
3459 * work_busy - test whether a work is currently pending or running 3375 * work_busy - test whether a work is currently pending or running
3460 * @work: the work to be tested 3376 * @work: the work to be tested
3461 * 3377 *
3462 * Test whether @work is currently pending or running. There is no 3378 * Test whether @work is currently pending or running. There is no
3463 * synchronization around this function and the test result is 3379 * synchronization around this function and the test result is
3464 * unreliable and only useful as advisory hints or for debugging. 3380 * unreliable and only useful as advisory hints or for debugging.
3465 * Especially for reentrant wqs, the pending state might hide the
3466 * running state.
3467 * 3381 *
3468 * RETURNS: 3382 * RETURNS:
3469 * OR'd bitmask of WORK_BUSY_* bits. 3383 * OR'd bitmask of WORK_BUSY_* bits.
3470 */ 3384 */
3471unsigned int work_busy(struct work_struct *work) 3385unsigned int work_busy(struct work_struct *work)
3472{ 3386{
3473 struct global_cwq *gcwq = get_work_gcwq(work); 3387 struct worker_pool *pool = get_work_pool(work);
3474 unsigned long flags; 3388 unsigned long flags;
3475 unsigned int ret = 0; 3389 unsigned int ret = 0;
3476 3390
3477 if (!gcwq)
3478 return false;
3479
3480 spin_lock_irqsave(&gcwq->lock, flags);
3481
3482 if (work_pending(work)) 3391 if (work_pending(work))
3483 ret |= WORK_BUSY_PENDING; 3392 ret |= WORK_BUSY_PENDING;
3484 if (find_worker_executing_work(gcwq, work))
3485 ret |= WORK_BUSY_RUNNING;
3486 3393
3487 spin_unlock_irqrestore(&gcwq->lock, flags); 3394 if (pool) {
3395 spin_lock_irqsave(&pool->lock, flags);
3396 if (find_worker_executing_work(pool, work))
3397 ret |= WORK_BUSY_RUNNING;
3398 spin_unlock_irqrestore(&pool->lock, flags);
3399 }
3488 3400
3489 return ret; 3401 return ret;
3490} 3402}
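
[Editor's note, not part of the patch] As the comment says, work_busy() is advisory only; a debugging sketch that just reports the returned bits (hypothetical caller):

        #include <linux/workqueue.h>
        #include <linux/printk.h>

        static void my_report(struct work_struct *work)
        {
                unsigned int busy = work_busy(work);    /* snapshot, may be stale */

                pr_info("work %p:%s%s\n", work,
                        (busy & WORK_BUSY_PENDING) ? " pending" : "",
                        (busy & WORK_BUSY_RUNNING) ? " running" : "");
        }
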
@@ -3494,86 +3406,75 @@ EXPORT_SYMBOL_GPL(work_busy);
3494 * CPU hotplug. 3406 * CPU hotplug.
3495 * 3407 *
3496 * There are two challenges in supporting CPU hotplug. Firstly, there 3408 * There are two challenges in supporting CPU hotplug. Firstly, there
3497 * are a lot of assumptions on strong associations among work, cwq and 3409 * are a lot of assumptions on strong associations among work, pwq and
3498 * gcwq which make migrating pending and scheduled works very 3410 * pool which make migrating pending and scheduled works very
3499 * difficult to implement without impacting hot paths. Secondly, 3411 * difficult to implement without impacting hot paths. Secondly,
3500 * gcwqs serve mix of short, long and very long running works making 3412 * worker pools serve mix of short, long and very long running works making
3501 * blocked draining impractical. 3413 * blocked draining impractical.
3502 * 3414 *
3503 * This is solved by allowing a gcwq to be disassociated from the CPU 3415 * This is solved by allowing the pools to be disassociated from the CPU
3504 * running as an unbound one and allowing it to be reattached later if the 3416 * running as an unbound one and allowing it to be reattached later if the
3505 * cpu comes back online. 3417 * cpu comes back online.
3506 */ 3418 */
3507 3419
3508/* claim manager positions of all pools */ 3420static void wq_unbind_fn(struct work_struct *work)
3509static void gcwq_claim_assoc_and_lock(struct global_cwq *gcwq)
3510{
3511 struct worker_pool *pool;
3512
3513 for_each_worker_pool(pool, gcwq)
3514 mutex_lock_nested(&pool->assoc_mutex, pool - gcwq->pools);
3515 spin_lock_irq(&gcwq->lock);
3516}
3517
3518/* release manager positions */
3519static void gcwq_release_assoc_and_unlock(struct global_cwq *gcwq)
3520{
3521 struct worker_pool *pool;
3522
3523 spin_unlock_irq(&gcwq->lock);
3524 for_each_worker_pool(pool, gcwq)
3525 mutex_unlock(&pool->assoc_mutex);
3526}
3527
3528static void gcwq_unbind_fn(struct work_struct *work)
3529{ 3421{
3530 struct global_cwq *gcwq = get_gcwq(smp_processor_id()); 3422 int cpu = smp_processor_id();
3531 struct worker_pool *pool; 3423 struct worker_pool *pool;
3532 struct worker *worker; 3424 struct worker *worker;
3533 struct hlist_node *pos;
3534 int i; 3425 int i;
3535 3426
3536 BUG_ON(gcwq->cpu != smp_processor_id()); 3427 for_each_std_worker_pool(pool, cpu) {
3428 BUG_ON(cpu != smp_processor_id());
3537 3429
3538 gcwq_claim_assoc_and_lock(gcwq); 3430 mutex_lock(&pool->assoc_mutex);
3431 spin_lock_irq(&pool->lock);
3539 3432
3540 /* 3433 /*
3541 * We've claimed all manager positions. Make all workers unbound 3434 * We've claimed all manager positions. Make all workers
3542 * and set DISASSOCIATED. Before this, all workers except for the 3435 * unbound and set DISASSOCIATED. Before this, all workers
3543 * ones which are still executing works from before the last CPU 3436 * except for the ones which are still executing works from
3544 * down must be on the cpu. After this, they may become diasporas. 3437 * before the last CPU down must be on the cpu. After
3545 */ 3438 * this, they may become diasporas.
3546 for_each_worker_pool(pool, gcwq) 3439 */
3547 list_for_each_entry(worker, &pool->idle_list, entry) 3440 list_for_each_entry(worker, &pool->idle_list, entry)
3548 worker->flags |= WORKER_UNBOUND; 3441 worker->flags |= WORKER_UNBOUND;
3549 3442
3550 for_each_busy_worker(worker, i, pos, gcwq) 3443 for_each_busy_worker(worker, i, pool)
3551 worker->flags |= WORKER_UNBOUND; 3444 worker->flags |= WORKER_UNBOUND;
3552 3445
3553 gcwq->flags |= GCWQ_DISASSOCIATED; 3446 pool->flags |= POOL_DISASSOCIATED;
3554 3447
3555 gcwq_release_assoc_and_unlock(gcwq); 3448 spin_unlock_irq(&pool->lock);
3449 mutex_unlock(&pool->assoc_mutex);
3556 3450
3557 /* 3451 /*
3558 * Call schedule() so that we cross rq->lock and thus can guarantee 3452 * Call schedule() so that we cross rq->lock and thus can
3559 * sched callbacks see the %WORKER_UNBOUND flag. This is necessary 3453 * guarantee sched callbacks see the %WORKER_UNBOUND flag.
3560 * as scheduler callbacks may be invoked from other cpus. 3454 * This is necessary as scheduler callbacks may be invoked
3561 */ 3455 * from other cpus.
3562 schedule(); 3456 */
3457 schedule();
3563 3458
3564 /* 3459 /*
3565 * Sched callbacks are disabled now. Zap nr_running. After this, 3460 * Sched callbacks are disabled now. Zap nr_running.
3566 * nr_running stays zero and need_more_worker() and keep_working() 3461 * After this, nr_running stays zero and need_more_worker()
3567 * are always true as long as the worklist is not empty. @gcwq now 3462 * and keep_working() are always true as long as the
3568 * behaves as unbound (in terms of concurrency management) gcwq 3463 * worklist is not empty. This pool now behaves as an
3569 * which is served by workers tied to the CPU. 3464 * unbound (in terms of concurrency management) pool which
3570 * 3465 * are served by workers tied to the pool.
3571 * On return from this function, the current worker would trigger 3466 */
3572 * unbound chain execution of pending work items if other workers 3467 atomic_set(&pool->nr_running, 0);
3573 * didn't already. 3468
3574 */ 3469 /*
3575 for_each_worker_pool(pool, gcwq) 3470 * With concurrency management just turned off, a busy
3576 atomic_set(get_pool_nr_running(pool), 0); 3471 * worker blocking could lead to lengthy stalls. Kick off
3472 * unbound chain execution of currently pending work items.
3473 */
3474 spin_lock_irq(&pool->lock);
3475 wake_up_worker(pool);
3476 spin_unlock_irq(&pool->lock);
3477 }
3577} 3478}
3578 3479
3579/* 3480/*
@@ -3585,12 +3486,11 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb,
3585 void *hcpu) 3486 void *hcpu)
3586{ 3487{
3587 unsigned int cpu = (unsigned long)hcpu; 3488 unsigned int cpu = (unsigned long)hcpu;
3588 struct global_cwq *gcwq = get_gcwq(cpu);
3589 struct worker_pool *pool; 3489 struct worker_pool *pool;
3590 3490
3591 switch (action & ~CPU_TASKS_FROZEN) { 3491 switch (action & ~CPU_TASKS_FROZEN) {
3592 case CPU_UP_PREPARE: 3492 case CPU_UP_PREPARE:
3593 for_each_worker_pool(pool, gcwq) { 3493 for_each_std_worker_pool(pool, cpu) {
3594 struct worker *worker; 3494 struct worker *worker;
3595 3495
3596 if (pool->nr_workers) 3496 if (pool->nr_workers)
@@ -3600,18 +3500,24 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb,
3600 if (!worker) 3500 if (!worker)
3601 return NOTIFY_BAD; 3501 return NOTIFY_BAD;
3602 3502
3603 spin_lock_irq(&gcwq->lock); 3503 spin_lock_irq(&pool->lock);
3604 start_worker(worker); 3504 start_worker(worker);
3605 spin_unlock_irq(&gcwq->lock); 3505 spin_unlock_irq(&pool->lock);
3606 } 3506 }
3607 break; 3507 break;
3608 3508
3609 case CPU_DOWN_FAILED: 3509 case CPU_DOWN_FAILED:
3610 case CPU_ONLINE: 3510 case CPU_ONLINE:
3611 gcwq_claim_assoc_and_lock(gcwq); 3511 for_each_std_worker_pool(pool, cpu) {
3612 gcwq->flags &= ~GCWQ_DISASSOCIATED; 3512 mutex_lock(&pool->assoc_mutex);
3613 rebind_workers(gcwq); 3513 spin_lock_irq(&pool->lock);
3614 gcwq_release_assoc_and_unlock(gcwq); 3514
3515 pool->flags &= ~POOL_DISASSOCIATED;
3516 rebind_workers(pool);
3517
3518 spin_unlock_irq(&pool->lock);
3519 mutex_unlock(&pool->assoc_mutex);
3520 }
3615 break; 3521 break;
3616 } 3522 }
3617 return NOTIFY_OK; 3523 return NOTIFY_OK;
@@ -3631,7 +3537,7 @@ static int __cpuinit workqueue_cpu_down_callback(struct notifier_block *nfb,
3631 switch (action & ~CPU_TASKS_FROZEN) { 3537 switch (action & ~CPU_TASKS_FROZEN) {
3632 case CPU_DOWN_PREPARE: 3538 case CPU_DOWN_PREPARE:
3633 /* unbinding should happen on the local CPU */ 3539 /* unbinding should happen on the local CPU */
3634 INIT_WORK_ONSTACK(&unbind_work, gcwq_unbind_fn); 3540 INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn);
3635 queue_work_on(cpu, system_highpri_wq, &unbind_work); 3541 queue_work_on(cpu, system_highpri_wq, &unbind_work);
3636 flush_work(&unbind_work); 3542 flush_work(&unbind_work);
3637 break; 3543 break;
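
[Editor's note, not part of the patch] The DOWN_PREPARE path runs wq_unbind_fn() on the dying CPU by queueing an on-stack work item there and flushing it; work_on_cpu() wraps the same pattern for general use. A stripped-down sketch of the technique (function names assumed):

        #include <linux/workqueue.h>

        static void my_percpu_fn(struct work_struct *work)
        {
                /* executes on the CPU the work was queued on */
        }

        static void my_run_on(int cpu)
        {
                struct work_struct w;

                INIT_WORK_ONSTACK(&w, my_percpu_fn);
                queue_work_on(cpu, system_highpri_wq, &w);
                flush_work(&w);
        }
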
@@ -3684,10 +3590,10 @@ EXPORT_SYMBOL_GPL(work_on_cpu);
3684 * 3590 *
3685 * Start freezing workqueues. After this function returns, all freezable 3591 * Start freezing workqueues. After this function returns, all freezable
3686 * workqueues will queue new works to their frozen_works list instead of 3592 * workqueues will queue new works to their frozen_works list instead of
3687 * gcwq->worklist. 3593 * pool->worklist.
3688 * 3594 *
3689 * CONTEXT: 3595 * CONTEXT:
3690 * Grabs and releases workqueue_lock and gcwq->lock's. 3596 * Grabs and releases workqueue_lock and pool->lock's.
3691 */ 3597 */
3692void freeze_workqueues_begin(void) 3598void freeze_workqueues_begin(void)
3693{ 3599{
@@ -3698,23 +3604,26 @@ void freeze_workqueues_begin(void)
3698 BUG_ON(workqueue_freezing); 3604 BUG_ON(workqueue_freezing);
3699 workqueue_freezing = true; 3605 workqueue_freezing = true;
3700 3606
3701 for_each_gcwq_cpu(cpu) { 3607 for_each_wq_cpu(cpu) {
3702 struct global_cwq *gcwq = get_gcwq(cpu); 3608 struct worker_pool *pool;
3703 struct workqueue_struct *wq; 3609 struct workqueue_struct *wq;
3704 3610
3705 spin_lock_irq(&gcwq->lock); 3611 for_each_std_worker_pool(pool, cpu) {
3612 spin_lock_irq(&pool->lock);
3706 3613
3707 BUG_ON(gcwq->flags & GCWQ_FREEZING); 3614 WARN_ON_ONCE(pool->flags & POOL_FREEZING);
3708 gcwq->flags |= GCWQ_FREEZING; 3615 pool->flags |= POOL_FREEZING;
3709 3616
3710 list_for_each_entry(wq, &workqueues, list) { 3617 list_for_each_entry(wq, &workqueues, list) {
3711 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 3618 struct pool_workqueue *pwq = get_pwq(cpu, wq);
3712 3619
3713 if (cwq && wq->flags & WQ_FREEZABLE) 3620 if (pwq && pwq->pool == pool &&
3714 cwq->max_active = 0; 3621 (wq->flags & WQ_FREEZABLE))
3715 } 3622 pwq->max_active = 0;
3623 }
3716 3624
3717 spin_unlock_irq(&gcwq->lock); 3625 spin_unlock_irq(&pool->lock);
3626 }
3718 } 3627 }
3719 3628
3720 spin_unlock(&workqueue_lock); 3629 spin_unlock(&workqueue_lock);
@@ -3742,20 +3651,20 @@ bool freeze_workqueues_busy(void)
3742 3651
3743 BUG_ON(!workqueue_freezing); 3652 BUG_ON(!workqueue_freezing);
3744 3653
3745 for_each_gcwq_cpu(cpu) { 3654 for_each_wq_cpu(cpu) {
3746 struct workqueue_struct *wq; 3655 struct workqueue_struct *wq;
3747 /* 3656 /*
3748 * nr_active is monotonically decreasing. It's safe 3657 * nr_active is monotonically decreasing. It's safe
3749 * to peek without lock. 3658 * to peek without lock.
3750 */ 3659 */
3751 list_for_each_entry(wq, &workqueues, list) { 3660 list_for_each_entry(wq, &workqueues, list) {
3752 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 3661 struct pool_workqueue *pwq = get_pwq(cpu, wq);
3753 3662
3754 if (!cwq || !(wq->flags & WQ_FREEZABLE)) 3663 if (!pwq || !(wq->flags & WQ_FREEZABLE))
3755 continue; 3664 continue;
3756 3665
3757 BUG_ON(cwq->nr_active < 0); 3666 BUG_ON(pwq->nr_active < 0);
3758 if (cwq->nr_active) { 3667 if (pwq->nr_active) {
3759 busy = true; 3668 busy = true;
3760 goto out_unlock; 3669 goto out_unlock;
3761 } 3670 }
@@ -3770,10 +3679,10 @@ out_unlock:
3770 * thaw_workqueues - thaw workqueues 3679 * thaw_workqueues - thaw workqueues
3771 * 3680 *
3772 * Thaw workqueues. Normal queueing is restored and all collected 3681 * Thaw workqueues. Normal queueing is restored and all collected
3773 * frozen works are transferred to their respective gcwq worklists. 3682 * frozen works are transferred to their respective pool worklists.
3774 * 3683 *
3775 * CONTEXT: 3684 * CONTEXT:
3776 * Grabs and releases workqueue_lock and gcwq->lock's. 3685 * Grabs and releases workqueue_lock and pool->lock's.
3777 */ 3686 */
3778void thaw_workqueues(void) 3687void thaw_workqueues(void)
3779{ 3688{
@@ -3784,30 +3693,31 @@ void thaw_workqueues(void)
3784 if (!workqueue_freezing) 3693 if (!workqueue_freezing)
3785 goto out_unlock; 3694 goto out_unlock;
3786 3695
3787 for_each_gcwq_cpu(cpu) { 3696 for_each_wq_cpu(cpu) {
3788 struct global_cwq *gcwq = get_gcwq(cpu);
3789 struct worker_pool *pool; 3697 struct worker_pool *pool;
3790 struct workqueue_struct *wq; 3698 struct workqueue_struct *wq;
3791 3699
3792 spin_lock_irq(&gcwq->lock); 3700 for_each_std_worker_pool(pool, cpu) {
3701 spin_lock_irq(&pool->lock);
3793 3702
3794 BUG_ON(!(gcwq->flags & GCWQ_FREEZING)); 3703 WARN_ON_ONCE(!(pool->flags & POOL_FREEZING));
3795 gcwq->flags &= ~GCWQ_FREEZING; 3704 pool->flags &= ~POOL_FREEZING;
3796 3705
3797 list_for_each_entry(wq, &workqueues, list) { 3706 list_for_each_entry(wq, &workqueues, list) {
3798 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 3707 struct pool_workqueue *pwq = get_pwq(cpu, wq);
3799 3708
3800 if (!cwq || !(wq->flags & WQ_FREEZABLE)) 3709 if (!pwq || pwq->pool != pool ||
3801 continue; 3710 !(wq->flags & WQ_FREEZABLE))
3711 continue;
3802 3712
3803 /* restore max_active and repopulate worklist */ 3713 /* restore max_active and repopulate worklist */
3804 cwq_set_max_active(cwq, wq->saved_max_active); 3714 pwq_set_max_active(pwq, wq->saved_max_active);
3805 } 3715 }
3806 3716
3807 for_each_worker_pool(pool, gcwq)
3808 wake_up_worker(pool); 3717 wake_up_worker(pool);
3809 3718
3810 spin_unlock_irq(&gcwq->lock); 3719 spin_unlock_irq(&pool->lock);
3720 }
3811 } 3721 }
3812 3722
3813 workqueue_freezing = false; 3723 workqueue_freezing = false;
@@ -3819,60 +3729,56 @@ out_unlock:
3819static int __init init_workqueues(void) 3729static int __init init_workqueues(void)
3820{ 3730{
3821 unsigned int cpu; 3731 unsigned int cpu;
3822 int i;
3823 3732
3824 /* make sure we have enough bits for OFFQ CPU number */ 3733 /* make sure we have enough bits for OFFQ pool ID */
3825 BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_CPU_SHIFT)) < 3734 BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT)) <
3826 WORK_CPU_LAST); 3735 WORK_CPU_END * NR_STD_WORKER_POOLS);
3827 3736
3828 cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP); 3737 cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);
3829 hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN); 3738 hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN);
3830 3739
3831 /* initialize gcwqs */ 3740 /* initialize CPU pools */
3832 for_each_gcwq_cpu(cpu) { 3741 for_each_wq_cpu(cpu) {
3833 struct global_cwq *gcwq = get_gcwq(cpu);
3834 struct worker_pool *pool; 3742 struct worker_pool *pool;
3835 3743
3836 spin_lock_init(&gcwq->lock); 3744 for_each_std_worker_pool(pool, cpu) {
3837 gcwq->cpu = cpu; 3745 spin_lock_init(&pool->lock);
3838 gcwq->flags |= GCWQ_DISASSOCIATED; 3746 pool->cpu = cpu;
3839 3747 pool->flags |= POOL_DISASSOCIATED;
3840 for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++)
3841 INIT_HLIST_HEAD(&gcwq->busy_hash[i]);
3842
3843 for_each_worker_pool(pool, gcwq) {
3844 pool->gcwq = gcwq;
3845 INIT_LIST_HEAD(&pool->worklist); 3748 INIT_LIST_HEAD(&pool->worklist);
3846 INIT_LIST_HEAD(&pool->idle_list); 3749 INIT_LIST_HEAD(&pool->idle_list);
3750 hash_init(pool->busy_hash);
3847 3751
3848 init_timer_deferrable(&pool->idle_timer); 3752 init_timer_deferrable(&pool->idle_timer);
3849 pool->idle_timer.function = idle_worker_timeout; 3753 pool->idle_timer.function = idle_worker_timeout;
3850 pool->idle_timer.data = (unsigned long)pool; 3754 pool->idle_timer.data = (unsigned long)pool;
3851 3755
3852 setup_timer(&pool->mayday_timer, gcwq_mayday_timeout, 3756 setup_timer(&pool->mayday_timer, pool_mayday_timeout,
3853 (unsigned long)pool); 3757 (unsigned long)pool);
3854 3758
3855 mutex_init(&pool->assoc_mutex); 3759 mutex_init(&pool->assoc_mutex);
3856 ida_init(&pool->worker_ida); 3760 ida_init(&pool->worker_ida);
3761
3762 /* alloc pool ID */
3763 BUG_ON(worker_pool_assign_id(pool));
3857 } 3764 }
3858 } 3765 }
3859 3766
3860 /* create the initial worker */ 3767 /* create the initial worker */
3861 for_each_online_gcwq_cpu(cpu) { 3768 for_each_online_wq_cpu(cpu) {
3862 struct global_cwq *gcwq = get_gcwq(cpu);
3863 struct worker_pool *pool; 3769 struct worker_pool *pool;
3864 3770
3865 if (cpu != WORK_CPU_UNBOUND) 3771 for_each_std_worker_pool(pool, cpu) {
3866 gcwq->flags &= ~GCWQ_DISASSOCIATED;
3867
3868 for_each_worker_pool(pool, gcwq) {
3869 struct worker *worker; 3772 struct worker *worker;
3870 3773
3774 if (cpu != WORK_CPU_UNBOUND)
3775 pool->flags &= ~POOL_DISASSOCIATED;
3776
3871 worker = create_worker(pool); 3777 worker = create_worker(pool);
3872 BUG_ON(!worker); 3778 BUG_ON(!worker);
3873 spin_lock_irq(&gcwq->lock); 3779 spin_lock_irq(&pool->lock);
3874 start_worker(worker); 3780 start_worker(worker);
3875 spin_unlock_irq(&gcwq->lock); 3781 spin_unlock_irq(&pool->lock);
3876 } 3782 }
3877 } 3783 }
3878 3784
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
new file mode 100644
index 000000000000..07650264ec15
--- /dev/null
+++ b/kernel/workqueue_internal.h
@@ -0,0 +1,65 @@
1/*
2 * kernel/workqueue_internal.h
3 *
4 * Workqueue internal header file. Only to be included by workqueue and
5 * core kernel subsystems.
6 */
7#ifndef _KERNEL_WORKQUEUE_INTERNAL_H
8#define _KERNEL_WORKQUEUE_INTERNAL_H
9
10#include <linux/workqueue.h>
11#include <linux/kthread.h>
12
13struct worker_pool;
14
15/*
16 * The poor guys doing the actual heavy lifting. All on-duty workers are
17 * either serving the manager role, on idle list or on busy hash. For
18 * details on the locking annotation (L, I, X...), refer to workqueue.c.
19 *
20 * Only to be used in workqueue and async.
21 */
22struct worker {
23 /* on idle list while idle, on busy hash table while busy */
24 union {
25 struct list_head entry; /* L: while idle */
26 struct hlist_node hentry; /* L: while busy */
27 };
28
29 struct work_struct *current_work; /* L: work being processed */
30 work_func_t current_func; /* L: current_work's fn */
31 struct pool_workqueue *current_pwq; /* L: current_work's pwq */
32 struct list_head scheduled; /* L: scheduled works */
33 struct task_struct *task; /* I: worker task */
34 struct worker_pool *pool; /* I: the associated pool */
35 /* 64 bytes boundary on 64bit, 32 on 32bit */
36 unsigned long last_active; /* L: last active timestamp */
37 unsigned int flags; /* X: flags */
38 int id; /* I: worker id */
39
40 /* for rebinding worker to CPU */
41 struct work_struct rebind_work; /* L: for busy worker */
42
43 /* used only by rescuers to point to the target workqueue */
44 struct workqueue_struct *rescue_wq; /* I: the workqueue to rescue */
45};
46
47/**
48 * current_wq_worker - return struct worker if %current is a workqueue worker
49 */
50static inline struct worker *current_wq_worker(void)
51{
52 if (current->flags & PF_WQ_WORKER)
53 return kthread_data(current);
54 return NULL;
55}
56
57/*
58 * Scheduler hooks for concurrency managed workqueue. Only to be used from
59 * sched.c and workqueue.c.
60 */
61void wq_worker_waking_up(struct task_struct *task, unsigned int cpu);
62struct task_struct *wq_worker_sleeping(struct task_struct *task,
63 unsigned int cpu);
64
65#endif /* _KERNEL_WORKQUEUE_INTERNAL_H */
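
[Editor's note, not part of the patch] The struct worker comment notes that a worker sits either on its pool's idle list or in the busy hash, never both, which is why entry and hentry can share a union. An illustrative sketch of the same membership scheme using the generic list and hashtable helpers (assumed names, not code from this patch):

        #include <linux/list.h>
        #include <linux/hashtable.h>

        struct my_item {
                union {                         /* member of exactly one structure at a time */
                        struct list_head entry;         /* while idle */
                        struct hlist_node hentry;       /* while busy */
                };
                unsigned long key;
        };

        static LIST_HEAD(my_idle);
        static DEFINE_HASHTABLE(my_busy, 6);

        static void my_mark_busy(struct my_item *it)
        {
                list_del(&it->entry);
                hash_add(my_busy, &it->hentry, it->key);
        }

        static void my_mark_idle(struct my_item *it)
        {
                hash_del(&it->hentry);
                list_add(&it->entry, &my_idle);
        }
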
diff --git a/kernel/workqueue_sched.h b/kernel/workqueue_sched.h
deleted file mode 100644
index 2d10fc98dc79..000000000000
--- a/kernel/workqueue_sched.h
+++ /dev/null
@@ -1,9 +0,0 @@
1/*
2 * kernel/workqueue_sched.h
3 *
4 * Scheduler hooks for concurrency managed workqueue. Only to be
5 * included from sched.c and workqueue.c.
6 */
7void wq_worker_waking_up(struct task_struct *task, unsigned int cpu);
8struct task_struct *wq_worker_sleeping(struct task_struct *task,
9 unsigned int cpu);