author    Thomas Gleixner <tglx@linutronix.de>    2011-01-27 06:29:13 -0500
committer Thomas Gleixner <tglx@linutronix.de>    2011-01-27 06:29:37 -0500
commit    f97b12cce6dea51880a6a89d4607c29c70a6a841 (patch)
tree      1f05f6d39975bd213e7506e8a73ae0a59188c75e /kernel
parent    ccaa8d657117bb1876d471bd91579d774106778d (diff)
parent    1bae4ce27c9c90344f23c65ea6966c50ffeae2f5 (diff)
Merge commit 'v2.6.38-rc2' into core/locking
Reason: Update to mainline before adding the locking cleanup

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 5
-rw-r--r--  kernel/audit.c | 2
-rw-r--r--  kernel/cgroup.c | 55
-rw-r--r--  kernel/cpu.c | 29
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 23
-rw-r--r--  kernel/exit.c | 25
-rw-r--r--  kernel/fork.c | 46
-rw-r--r--  kernel/freezer.c | 9
-rw-r--r--  kernel/futex.c | 300
-rw-r--r--  kernel/futex_compat.c | 3
-rw-r--r--  kernel/hrtimer.c | 87
-rw-r--r--  kernel/hw_breakpoint.c | 5
-rw-r--r--  kernel/irq/Kconfig | 3
-rw-r--r--  kernel/irq/handle.c | 111
-rw-r--r--  kernel/irq/irqdesc.c | 40
-rw-r--r--  kernel/irq/manage.c | 4
-rw-r--r--  kernel/irq/proc.c | 2
-rw-r--r--  kernel/irq_work.c | 22
-rw-r--r--  kernel/kexec.c | 2
-rw-r--r--  kernel/kprobes.c | 573
-rw-r--r--  kernel/kthread.c | 13
-rw-r--r--  kernel/latencytop.c | 40
-rw-r--r--  kernel/lockdep.c | 18
-rw-r--r--  kernel/module.c | 183
-rw-r--r--  kernel/mutex.c | 2
-rw-r--r--  kernel/panic.c | 1
-rw-r--r--  kernel/perf_event.c | 844
-rw-r--r--  kernel/pm_qos_params.c | 4
-rw-r--r--  kernel/posix-cpu-timers.c | 12
-rw-r--r--  kernel/posix-timers.c | 10
-rw-r--r--  kernel/power/Kconfig | 9
-rw-r--r--  kernel/power/Makefile | 6
-rw-r--r--  kernel/power/hibernate.c | 33
-rw-r--r--  kernel/power/nvs.c | 136
-rw-r--r--  kernel/power/process.c | 8
-rw-r--r--  kernel/power/suspend.c | 14
-rw-r--r--  kernel/power/swap.c | 62
-rw-r--r--  kernel/power/user.c | 4
-rw-r--r--  kernel/printk.c | 75
-rw-r--r--  kernel/range.c | 2
-rw-r--r--  kernel/rcutiny.c | 106
-rw-r--r--  kernel/rcutiny_plugin.h | 433
-rw-r--r--  kernel/rcutorture.c | 270
-rw-r--r--  kernel/rcutree.c | 160
-rw-r--r--  kernel/rcutree.h | 61
-rw-r--r--  kernel/rcutree_plugin.h | 135
-rw-r--r--  kernel/rcutree_trace.c | 12
-rw-r--r--  kernel/resource.c | 104
-rw-r--r--  kernel/sched.c | 1033
-rw-r--r--  kernel/sched_autogroup.c | 270
-rw-r--r--  kernel/sched_autogroup.h | 36
-rw-r--r--  kernel/sched_clock.c | 2
-rw-r--r--  kernel/sched_debug.c | 123
-rw-r--r--  kernel/sched_fair.c | 401
-rw-r--r--  kernel/sched_features.h | 2
-rw-r--r--  kernel/sched_rt.c | 24
-rw-r--r--  kernel/sched_stoptask.c | 4
-rw-r--r--  kernel/smp.c | 75
-rw-r--r--  kernel/softirq.c | 65
-rw-r--r--  kernel/srcu.c | 19
-rw-r--r--  kernel/sys.c | 10
-rw-r--r--  kernel/sysctl.c | 93
-rw-r--r--  kernel/sysctl_binary.c | 3
-rw-r--r--  kernel/taskstats.c | 62
-rw-r--r--  kernel/time.c | 4
-rw-r--r--  kernel/time/clocksource.c | 11
-rw-r--r--  kernel/time/ntp.c | 425
-rw-r--r--  kernel/time/tick-common.c | 2
-rw-r--r--  kernel/time/tick-oneshot.c | 4
-rw-r--r--  kernel/time/timecompare.c | 5
-rw-r--r--  kernel/time/timekeeping.c | 56
-rw-r--r--  kernel/time/timer_list.c | 8
-rw-r--r--  kernel/timer.c | 58
-rw-r--r--  kernel/trace/Kconfig | 17
-rw-r--r--  kernel/trace/Makefile | 2
-rw-r--r--  kernel/trace/blktrace.c | 41
-rw-r--r--  kernel/trace/power-traces.c | 5
-rw-r--r--  kernel/trace/ring_buffer.c | 9
-rw-r--r--  kernel/trace/trace.c | 30
-rw-r--r--  kernel/trace/trace_entries.h | 2
-rw-r--r--  kernel/trace/trace_event_perf.c | 31
-rw-r--r--  kernel/trace/trace_events.c | 6
-rw-r--r--  kernel/trace/trace_export.c | 14
-rw-r--r--  kernel/trace/trace_irqsoff.c | 8
-rw-r--r--  kernel/trace/trace_selftest.c | 2
-rw-r--r--  kernel/trace/trace_syscalls.c | 33
-rw-r--r--  kernel/user.c | 1
-rw-r--r--  kernel/user_namespace.c | 15
-rw-r--r--  kernel/watchdog.c | 50
-rw-r--r--  kernel/workqueue.c | 87
90 files changed, 4867 insertions, 2384 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 0b5ff083fa22..353d3fe8ba33 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -43,7 +43,7 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
43obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o 43obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
44obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o 44obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
45obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o 45obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
46obj-$(CONFIG_USE_GENERIC_SMP_HELPERS) += smp.o 46obj-$(CONFIG_SMP) += smp.o
47ifneq ($(CONFIG_SMP),y) 47ifneq ($(CONFIG_SMP),y)
48obj-y += up.o 48obj-y += up.o
49endif 49endif
@@ -100,6 +100,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace/
100obj-$(CONFIG_TRACING) += trace/ 100obj-$(CONFIG_TRACING) += trace/
101obj-$(CONFIG_X86_DS) += trace/ 101obj-$(CONFIG_X86_DS) += trace/
102obj-$(CONFIG_RING_BUFFER) += trace/ 102obj-$(CONFIG_RING_BUFFER) += trace/
103obj-$(CONFIG_TRACEPOINTS) += trace/
103obj-$(CONFIG_SMP) += sched_cpupri.o 104obj-$(CONFIG_SMP) += sched_cpupri.o
104obj-$(CONFIG_IRQ_WORK) += irq_work.o 105obj-$(CONFIG_IRQ_WORK) += irq_work.o
105obj-$(CONFIG_PERF_EVENTS) += perf_event.o 106obj-$(CONFIG_PERF_EVENTS) += perf_event.o
@@ -121,7 +122,7 @@ $(obj)/configs.o: $(obj)/config_data.h
121# config_data.h contains the same information as ikconfig.h but gzipped. 122# config_data.h contains the same information as ikconfig.h but gzipped.
122# Info from config_data can be extracted from /proc/config* 123# Info from config_data can be extracted from /proc/config*
123targets += config_data.gz 124targets += config_data.gz
124$(obj)/config_data.gz: .config FORCE 125$(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
125 $(call if_changed,gzip) 126 $(call if_changed,gzip)
126 127
127quiet_cmd_ikconfiggz = IKCFG $@ 128quiet_cmd_ikconfiggz = IKCFG $@
diff --git a/kernel/audit.c b/kernel/audit.c
index 77770a034d59..e4956244ae50 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -400,7 +400,7 @@ static void kauditd_send_skb(struct sk_buff *skb)
400 if (err < 0) { 400 if (err < 0) {
401 BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */ 401 BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */
402 printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); 402 printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid);
403 audit_log_lost("auditd dissapeared\n"); 403 audit_log_lost("auditd disappeared\n");
404 audit_pid = 0; 404 audit_pid = 0;
405 /* we might get lucky and get this in the next auditd */ 405 /* we might get lucky and get this in the next auditd */
406 audit_hold_skb(skb); 406 audit_hold_skb(skb);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 66a416b42c18..b24d7027b83c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -764,6 +764,7 @@ EXPORT_SYMBOL_GPL(cgroup_unlock);
764 */ 764 */
765 765
766static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode); 766static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode);
767static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *);
767static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); 768static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
768static int cgroup_populate_dir(struct cgroup *cgrp); 769static int cgroup_populate_dir(struct cgroup *cgrp);
769static const struct inode_operations cgroup_dir_inode_operations; 770static const struct inode_operations cgroup_dir_inode_operations;
@@ -860,6 +861,11 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
860 iput(inode); 861 iput(inode);
861} 862}
862 863
864static int cgroup_delete(const struct dentry *d)
865{
866 return 1;
867}
868
863static void remove_dir(struct dentry *d) 869static void remove_dir(struct dentry *d)
864{ 870{
865 struct dentry *parent = dget(d->d_parent); 871 struct dentry *parent = dget(d->d_parent);
@@ -874,25 +880,29 @@ static void cgroup_clear_directory(struct dentry *dentry)
874 struct list_head *node; 880 struct list_head *node;
875 881
876 BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); 882 BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
877 spin_lock(&dcache_lock); 883 spin_lock(&dentry->d_lock);
878 node = dentry->d_subdirs.next; 884 node = dentry->d_subdirs.next;
879 while (node != &dentry->d_subdirs) { 885 while (node != &dentry->d_subdirs) {
880 struct dentry *d = list_entry(node, struct dentry, d_u.d_child); 886 struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
887
888 spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
881 list_del_init(node); 889 list_del_init(node);
882 if (d->d_inode) { 890 if (d->d_inode) {
883 /* This should never be called on a cgroup 891 /* This should never be called on a cgroup
884 * directory with child cgroups */ 892 * directory with child cgroups */
885 BUG_ON(d->d_inode->i_mode & S_IFDIR); 893 BUG_ON(d->d_inode->i_mode & S_IFDIR);
886 d = dget_locked(d); 894 dget_dlock(d);
887 spin_unlock(&dcache_lock); 895 spin_unlock(&d->d_lock);
896 spin_unlock(&dentry->d_lock);
888 d_delete(d); 897 d_delete(d);
889 simple_unlink(dentry->d_inode, d); 898 simple_unlink(dentry->d_inode, d);
890 dput(d); 899 dput(d);
891 spin_lock(&dcache_lock); 900 spin_lock(&dentry->d_lock);
892 } 901 } else
902 spin_unlock(&d->d_lock);
893 node = dentry->d_subdirs.next; 903 node = dentry->d_subdirs.next;
894 } 904 }
895 spin_unlock(&dcache_lock); 905 spin_unlock(&dentry->d_lock);
896} 906}
897 907
898/* 908/*
@@ -900,11 +910,16 @@ static void cgroup_clear_directory(struct dentry *dentry)
900 */ 910 */
901static void cgroup_d_remove_dir(struct dentry *dentry) 911static void cgroup_d_remove_dir(struct dentry *dentry)
902{ 912{
913 struct dentry *parent;
914
903 cgroup_clear_directory(dentry); 915 cgroup_clear_directory(dentry);
904 916
905 spin_lock(&dcache_lock); 917 parent = dentry->d_parent;
918 spin_lock(&parent->d_lock);
919 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
906 list_del_init(&dentry->d_u.d_child); 920 list_del_init(&dentry->d_u.d_child);
907 spin_unlock(&dcache_lock); 921 spin_unlock(&dentry->d_lock);
922 spin_unlock(&parent->d_lock);
908 remove_dir(dentry); 923 remove_dir(dentry);
909} 924}
910 925
@@ -1440,6 +1455,11 @@ static int cgroup_set_super(struct super_block *sb, void *data)
1440 1455
1441static int cgroup_get_rootdir(struct super_block *sb) 1456static int cgroup_get_rootdir(struct super_block *sb)
1442{ 1457{
1458 static const struct dentry_operations cgroup_dops = {
1459 .d_iput = cgroup_diput,
1460 .d_delete = cgroup_delete,
1461 };
1462
1443 struct inode *inode = 1463 struct inode *inode =
1444 cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb); 1464 cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb);
1445 struct dentry *dentry; 1465 struct dentry *dentry;
@@ -1457,6 +1477,8 @@ static int cgroup_get_rootdir(struct super_block *sb)
1457 return -ENOMEM; 1477 return -ENOMEM;
1458 } 1478 }
1459 sb->s_root = dentry; 1479 sb->s_root = dentry;
1480 /* for everything else we want ->d_op set */
1481 sb->s_d_op = &cgroup_dops;
1460 return 0; 1482 return 0;
1461} 1483}
1462 1484
@@ -2180,12 +2202,20 @@ static const struct file_operations cgroup_file_operations = {
2180}; 2202};
2181 2203
2182static const struct inode_operations cgroup_dir_inode_operations = { 2204static const struct inode_operations cgroup_dir_inode_operations = {
2183 .lookup = simple_lookup, 2205 .lookup = cgroup_lookup,
2184 .mkdir = cgroup_mkdir, 2206 .mkdir = cgroup_mkdir,
2185 .rmdir = cgroup_rmdir, 2207 .rmdir = cgroup_rmdir,
2186 .rename = cgroup_rename, 2208 .rename = cgroup_rename,
2187}; 2209};
2188 2210
2211static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
2212{
2213 if (dentry->d_name.len > NAME_MAX)
2214 return ERR_PTR(-ENAMETOOLONG);
2215 d_add(dentry, NULL);
2216 return NULL;
2217}
2218
2189/* 2219/*
2190 * Check if a file is a control file 2220 * Check if a file is a control file
2191 */ 2221 */
@@ -2199,10 +2229,6 @@ static inline struct cftype *__file_cft(struct file *file)
2199static int cgroup_create_file(struct dentry *dentry, mode_t mode, 2229static int cgroup_create_file(struct dentry *dentry, mode_t mode,
2200 struct super_block *sb) 2230 struct super_block *sb)
2201{ 2231{
2202 static const struct dentry_operations cgroup_dops = {
2203 .d_iput = cgroup_diput,
2204 };
2205
2206 struct inode *inode; 2232 struct inode *inode;
2207 2233
2208 if (!dentry) 2234 if (!dentry)
@@ -2228,7 +2254,6 @@ static int cgroup_create_file(struct dentry *dentry, mode_t mode,
2228 inode->i_size = 0; 2254 inode->i_size = 0;
2229 inode->i_fop = &cgroup_file_operations; 2255 inode->i_fop = &cgroup_file_operations;
2230 } 2256 }
2231 dentry->d_op = &cgroup_dops;
2232 d_instantiate(dentry, inode); 2257 d_instantiate(dentry, inode);
2233 dget(dentry); /* Extra count - pin the dentry in core */ 2258 dget(dentry); /* Extra count - pin the dentry in core */
2234 return 0; 2259 return 0;
@@ -3638,9 +3663,7 @@ again:
3638 list_del(&cgrp->sibling); 3663 list_del(&cgrp->sibling);
3639 cgroup_unlock_hierarchy(cgrp->root); 3664 cgroup_unlock_hierarchy(cgrp->root);
3640 3665
3641 spin_lock(&cgrp->dentry->d_lock);
3642 d = dget(cgrp->dentry); 3666 d = dget(cgrp->dentry);
3643 spin_unlock(&d->d_lock);
3644 3667
3645 cgroup_d_remove_dir(d); 3668 cgroup_d_remove_dir(d);
3646 dput(d); 3669 dput(d);
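The cgroup hunks above track two VFS transitions picked up by this merge: the global dcache_lock is replaced by per-dentry d_lock (with spin_lock_nested() on the child while the parent's lock is held), and the per-dentry d_op assignment in cgroup_create_file() moves to a single sb->s_d_op set once at mount time. A minimal sketch of the sb->s_d_op pattern, using hypothetical names unrelated to cgroup and assuming a simple fill_super-style setup:

	#include <linux/dcache.h>
	#include <linux/fs.h>

	/* sketch: d_op inherited by every dentry allocated on this superblock */
	static int example_d_delete(const struct dentry *dentry)
	{
		return 1;	/* do not keep unused dentries cached */
	}

	static const struct dentry_operations example_dops = {
		.d_delete = example_d_delete,
	};

	static int example_fill_super(struct super_block *sb, void *data, int silent)
	{
		sb->s_d_op = &example_dops;	/* d_alloc() copies this into each new dentry */
		/* ... allocate the root inode and sb->s_root as usual ... */
		return 0;
	}

With sb->s_d_op published once, the create/lookup paths no longer need to set dentry->d_op by hand, which is exactly why the assignment in cgroup_create_file() is deleted above.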
diff --git a/kernel/cpu.c b/kernel/cpu.c
index f6e726f18491..156cc5556140 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -189,7 +189,6 @@ static inline void check_for_tasks(int cpu)
189} 189}
190 190
191struct take_cpu_down_param { 191struct take_cpu_down_param {
192 struct task_struct *caller;
193 unsigned long mod; 192 unsigned long mod;
194 void *hcpu; 193 void *hcpu;
195}; 194};
@@ -198,7 +197,6 @@ struct take_cpu_down_param {
198static int __ref take_cpu_down(void *_param) 197static int __ref take_cpu_down(void *_param)
199{ 198{
200 struct take_cpu_down_param *param = _param; 199 struct take_cpu_down_param *param = _param;
201 unsigned int cpu = (unsigned long)param->hcpu;
202 int err; 200 int err;
203 201
204 /* Ensure this CPU doesn't handle any more interrupts. */ 202 /* Ensure this CPU doesn't handle any more interrupts. */
@@ -208,11 +206,6 @@ static int __ref take_cpu_down(void *_param)
208 206
209 cpu_notify(CPU_DYING | param->mod, param->hcpu); 207 cpu_notify(CPU_DYING | param->mod, param->hcpu);
210 208
211 if (task_cpu(param->caller) == cpu)
212 move_task_off_dead_cpu(cpu, param->caller);
213 /* Force idle task to run as soon as we yield: it should
214 immediately notice cpu is offline and die quickly. */
215 sched_idle_next();
216 return 0; 209 return 0;
217} 210}
218 211
@@ -223,7 +216,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
223 void *hcpu = (void *)(long)cpu; 216 void *hcpu = (void *)(long)cpu;
224 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; 217 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
225 struct take_cpu_down_param tcd_param = { 218 struct take_cpu_down_param tcd_param = {
226 .caller = current,
227 .mod = mod, 219 .mod = mod,
228 .hcpu = hcpu, 220 .hcpu = hcpu,
229 }; 221 };
@@ -253,9 +245,15 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
253 } 245 }
254 BUG_ON(cpu_online(cpu)); 246 BUG_ON(cpu_online(cpu));
255 247
256 /* Wait for it to sleep (leaving idle task). */ 248 /*
249 * The migration_call() CPU_DYING callback will have removed all
250 * runnable tasks from the cpu, there's only the idle task left now
251 * that the migration thread is done doing the stop_machine thing.
252 *
253 * Wait for the stop thread to go away.
254 */
257 while (!idle_cpu(cpu)) 255 while (!idle_cpu(cpu))
258 yield(); 256 cpu_relax();
259 257
260 /* This actually kills the CPU. */ 258 /* This actually kills the CPU. */
261 __cpu_die(cpu); 259 __cpu_die(cpu);
@@ -386,6 +384,14 @@ out:
386#ifdef CONFIG_PM_SLEEP_SMP 384#ifdef CONFIG_PM_SLEEP_SMP
387static cpumask_var_t frozen_cpus; 385static cpumask_var_t frozen_cpus;
388 386
387void __weak arch_disable_nonboot_cpus_begin(void)
388{
389}
390
391void __weak arch_disable_nonboot_cpus_end(void)
392{
393}
394
389int disable_nonboot_cpus(void) 395int disable_nonboot_cpus(void)
390{ 396{
391 int cpu, first_cpu, error = 0; 397 int cpu, first_cpu, error = 0;
@@ -397,6 +403,7 @@ int disable_nonboot_cpus(void)
397 * with the userspace trying to use the CPU hotplug at the same time 403 * with the userspace trying to use the CPU hotplug at the same time
398 */ 404 */
399 cpumask_clear(frozen_cpus); 405 cpumask_clear(frozen_cpus);
406 arch_disable_nonboot_cpus_begin();
400 407
401 printk("Disabling non-boot CPUs ...\n"); 408 printk("Disabling non-boot CPUs ...\n");
402 for_each_online_cpu(cpu) { 409 for_each_online_cpu(cpu) {
@@ -412,6 +419,8 @@ int disable_nonboot_cpus(void)
412 } 419 }
413 } 420 }
414 421
422 arch_disable_nonboot_cpus_end();
423
415 if (!error) { 424 if (!error) {
416 BUG_ON(num_online_cpus() > 1); 425 BUG_ON(num_online_cpus() > 1);
417 /* Make sure the CPUs won't be enabled by someone else */ 426 /* Make sure the CPUs won't be enabled by someone else */
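disable_nonboot_cpus() now brackets its hot-unplug loop with a pair of __weak hooks. The empty definitions above are the generic fallback; an architecture overrides them simply by providing strong functions with the same names, which the linker prefers. A hedged sketch of such an override (the bodies are purely illustrative, not taken from any real architecture):

	/* somewhere in arch code: strong definitions override the __weak
	 * stubs in kernel/cpu.c at link time */
	void arch_disable_nonboot_cpus_begin(void)
	{
		/* e.g. put shared hardware state into a mode that stays safe
		 * while secondary CPUs are taken offline */
	}

	void arch_disable_nonboot_cpus_end(void)
	{
		/* runs after every non-boot CPU has gone offline */
	}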
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 37755d621924..bd3e8e29caa3 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -82,7 +82,7 @@ static kdbtab_t kdb_base_commands[50];
82#define for_each_kdbcmd(cmd, num) \ 82#define for_each_kdbcmd(cmd, num) \
83 for ((cmd) = kdb_base_commands, (num) = 0; \ 83 for ((cmd) = kdb_base_commands, (num) = 0; \
84 num < kdb_max_commands; \ 84 num < kdb_max_commands; \
85 num == KDB_BASE_CMD_MAX ? cmd = kdb_commands : cmd++, num++) 85 num++, num == KDB_BASE_CMD_MAX ? cmd = kdb_commands : cmd++)
86 86
87typedef struct _kdbmsg { 87typedef struct _kdbmsg {
88 int km_diag; /* kdb diagnostic */ 88 int km_diag; /* kdb diagnostic */
@@ -646,7 +646,7 @@ static int kdb_defcmd2(const char *cmdstr, const char *argv0)
646 } 646 }
647 if (!s->usable) 647 if (!s->usable)
648 return KDB_NOTIMP; 648 return KDB_NOTIMP;
649 s->command = kmalloc((s->count + 1) * sizeof(*(s->command)), GFP_KDB); 649 s->command = kzalloc((s->count + 1) * sizeof(*(s->command)), GFP_KDB);
650 if (!s->command) { 650 if (!s->command) {
651 kdb_printf("Could not allocate new kdb_defcmd table for %s\n", 651 kdb_printf("Could not allocate new kdb_defcmd table for %s\n",
652 cmdstr); 652 cmdstr);
@@ -2361,7 +2361,7 @@ static int kdb_pid(int argc, const char **argv)
2361 */ 2361 */
2362static int kdb_ll(int argc, const char **argv) 2362static int kdb_ll(int argc, const char **argv)
2363{ 2363{
2364 int diag; 2364 int diag = 0;
2365 unsigned long addr; 2365 unsigned long addr;
2366 long offset = 0; 2366 long offset = 0;
2367 unsigned long va; 2367 unsigned long va;
@@ -2400,20 +2400,21 @@ static int kdb_ll(int argc, const char **argv)
2400 char buf[80]; 2400 char buf[80];
2401 2401
2402 if (KDB_FLAG(CMD_INTERRUPT)) 2402 if (KDB_FLAG(CMD_INTERRUPT))
2403 return 0; 2403 goto out;
2404 2404
2405 sprintf(buf, "%s " kdb_machreg_fmt "\n", command, va); 2405 sprintf(buf, "%s " kdb_machreg_fmt "\n", command, va);
2406 diag = kdb_parse(buf); 2406 diag = kdb_parse(buf);
2407 if (diag) 2407 if (diag)
2408 return diag; 2408 goto out;
2409 2409
2410 addr = va + linkoffset; 2410 addr = va + linkoffset;
2411 if (kdb_getword(&va, addr, sizeof(va))) 2411 if (kdb_getword(&va, addr, sizeof(va)))
2412 return 0; 2412 goto out;
2413 } 2413 }
2414 kfree(command);
2415 2414
2416 return 0; 2415out:
2416 kfree(command);
2417 return diag;
2417} 2418}
2418 2419
2419static int kdb_kgdb(int argc, const char **argv) 2420static int kdb_kgdb(int argc, const char **argv)
@@ -2739,13 +2740,13 @@ int kdb_register_repeat(char *cmd,
2739 } 2740 }
2740 if (kdb_commands) { 2741 if (kdb_commands) {
2741 memcpy(new, kdb_commands, 2742 memcpy(new, kdb_commands,
2742 kdb_max_commands * sizeof(*new)); 2743 (kdb_max_commands - KDB_BASE_CMD_MAX) * sizeof(*new));
2743 kfree(kdb_commands); 2744 kfree(kdb_commands);
2744 } 2745 }
2745 memset(new + kdb_max_commands, 0, 2746 memset(new + kdb_max_commands, 0,
2746 kdb_command_extend * sizeof(*new)); 2747 kdb_command_extend * sizeof(*new));
2747 kdb_commands = new; 2748 kdb_commands = new;
2748 kp = kdb_commands + kdb_max_commands; 2749 kp = kdb_commands + kdb_max_commands - KDB_BASE_CMD_MAX;
2749 kdb_max_commands += kdb_command_extend; 2750 kdb_max_commands += kdb_command_extend;
2750 } 2751 }
2751 2752
@@ -2913,7 +2914,7 @@ static void __init kdb_cmd_init(void)
2913 } 2914 }
2914} 2915}
2915 2916
2916/* Intialize kdb_printf, breakpoint tables and kdb state */ 2917/* Initialize kdb_printf, breakpoint tables and kdb state */
2917void __init kdb_init(int lvl) 2918void __init kdb_init(int lvl)
2918{ 2919{
2919 static int kdb_init_lvl = KDB_NOT_INITIALIZED; 2920 static int kdb_init_lvl = KDB_NOT_INITIALIZED;
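Two of the kdb fixes above are easy to misread. The for_each_kdbcmd() change is an evaluation-order fix: the old update expression tested num == KDB_BASE_CMD_MAX before incrementing num, so the cursor stepped one slot past kdb_base_commands[] before hopping to the kmalloc'ed kdb_commands[] table; incrementing first makes the hop happen exactly when the fixed table is exhausted. The kdb_register_repeat() change reflects that kdb_commands[] only stores entries beyond KDB_BASE_CMD_MAX, so both the memcpy length and the insertion cursor must subtract that offset. A small self-contained sketch of the corrected walk, with hypothetical table names and sizes:

	#include <stddef.h>

	#define BASE_MAX 4			/* stands in for KDB_BASE_CMD_MAX */

	struct cmd { const char *name; };

	static struct cmd base_cmds[BASE_MAX];	/* fixed table */
	static struct cmd *extra_cmds;		/* kmalloc'ed extension, indices >= BASE_MAX */
	static int max_cmds = BASE_MAX;

	static void walk(void (*visit)(struct cmd *))
	{
		struct cmd *cmd;
		int num;

		/* bump num first, then decide whether to hop tables: the
		 * iteration with num == BASE_MAX already reads extra_cmds[0],
		 * and base_cmds[] is never overrun. */
		for (cmd = base_cmds, num = 0; num < max_cmds;
		     num++, num == BASE_MAX ? cmd = extra_cmds : cmd++)
			visit(cmd);
	}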
diff --git a/kernel/exit.c b/kernel/exit.c
index 21aa7b3001fb..f9a45ebcc7b1 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -69,7 +69,7 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
69 69
70 list_del_rcu(&p->tasks); 70 list_del_rcu(&p->tasks);
71 list_del_init(&p->sibling); 71 list_del_init(&p->sibling);
72 __get_cpu_var(process_counts)--; 72 __this_cpu_dec(process_counts);
73 } 73 }
74 list_del_rcu(&p->thread_group); 74 list_del_rcu(&p->thread_group);
75} 75}
@@ -914,6 +914,15 @@ NORET_TYPE void do_exit(long code)
914 if (unlikely(!tsk->pid)) 914 if (unlikely(!tsk->pid))
915 panic("Attempted to kill the idle task!"); 915 panic("Attempted to kill the idle task!");
916 916
917 /*
918 * If do_exit is called because this processes oopsed, it's possible
919 * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before
920 * continuing. Amongst other possible reasons, this is to prevent
921 * mm_release()->clear_child_tid() from writing to a user-controlled
922 * kernel address.
923 */
924 set_fs(USER_DS);
925
917 tracehook_report_exit(&code); 926 tracehook_report_exit(&code);
918 927
919 validate_creds_for_do_exit(tsk); 928 validate_creds_for_do_exit(tsk);
@@ -985,6 +994,15 @@ NORET_TYPE void do_exit(long code)
985 exit_fs(tsk); 994 exit_fs(tsk);
986 check_stack_usage(); 995 check_stack_usage();
987 exit_thread(); 996 exit_thread();
997
998 /*
999 * Flush inherited counters to the parent - before the parent
1000 * gets woken up by child-exit notifications.
1001 *
1002 * because of cgroup mode, must be called before cgroup_exit()
1003 */
1004 perf_event_exit_task(tsk);
1005
988 cgroup_exit(tsk, 1); 1006 cgroup_exit(tsk, 1);
989 1007
990 if (group_dead) 1008 if (group_dead)
@@ -998,11 +1016,6 @@ NORET_TYPE void do_exit(long code)
998 * FIXME: do that only when needed, using sched_exit tracepoint 1016 * FIXME: do that only when needed, using sched_exit tracepoint
999 */ 1017 */
1000 flush_ptrace_hw_breakpoint(tsk); 1018 flush_ptrace_hw_breakpoint(tsk);
1001 /*
1002 * Flush inherited counters to the parent - before the parent
1003 * gets woken up by child-exit notifications.
1004 */
1005 perf_event_exit_task(tsk);
1006 1019
1007 exit_notify(tsk, group_dead); 1020 exit_notify(tsk, group_dead);
1008#ifdef CONFIG_NUMA 1021#ifdef CONFIG_NUMA
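Besides reordering perf_event_exit_task() ahead of cgroup_exit() and hardening the oops path with set_fs(USER_DS), the exit.c hunk converts the process_counts update from __get_cpu_var() to __this_cpu_dec() (fork.c below gains the matching __this_cpu_inc()). Both forms touch the current CPU's instance of a per-cpu variable and both still rely on the caller keeping the task pinned to that CPU; the __this_cpu_*() form lets the architecture fold the per-cpu addressing into a single instruction instead of first materialising this CPU's address. A sketch with a hypothetical counter:

	#include <linux/percpu.h>

	DEFINE_PER_CPU(unsigned long, example_counts);

	static void count_down_old_style(void)
	{
		__get_cpu_var(example_counts)--;	/* compute this CPU's address, then decrement */
	}

	static void count_down_new_style(void)
	{
		__this_cpu_dec(example_counts);		/* single per-cpu-addressed decrement where possible */
	}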
diff --git a/kernel/fork.c b/kernel/fork.c
index 3b159c5991b7..25e429152ddc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -66,6 +66,7 @@
66#include <linux/posix-timers.h> 66#include <linux/posix-timers.h>
67#include <linux/user-return-notifier.h> 67#include <linux/user-return-notifier.h>
68#include <linux/oom.h> 68#include <linux/oom.h>
69#include <linux/khugepaged.h>
69 70
70#include <asm/pgtable.h> 71#include <asm/pgtable.h>
71#include <asm/pgalloc.h> 72#include <asm/pgalloc.h>
@@ -169,6 +170,7 @@ EXPORT_SYMBOL(free_task);
169static inline void free_signal_struct(struct signal_struct *sig) 170static inline void free_signal_struct(struct signal_struct *sig)
170{ 171{
171 taskstats_tgid_free(sig); 172 taskstats_tgid_free(sig);
173 sched_autogroup_exit(sig);
172 kmem_cache_free(signal_cachep, sig); 174 kmem_cache_free(signal_cachep, sig);
173} 175}
174 176
@@ -273,6 +275,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
273 275
274 setup_thread_stack(tsk, orig); 276 setup_thread_stack(tsk, orig);
275 clear_user_return_notifier(tsk); 277 clear_user_return_notifier(tsk);
278 clear_tsk_need_resched(tsk);
276 stackend = end_of_stack(tsk); 279 stackend = end_of_stack(tsk);
277 *stackend = STACK_END_MAGIC; /* for overflow detection */ 280 *stackend = STACK_END_MAGIC; /* for overflow detection */
278 281
@@ -328,6 +331,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
328 retval = ksm_fork(mm, oldmm); 331 retval = ksm_fork(mm, oldmm);
329 if (retval) 332 if (retval)
330 goto out; 333 goto out;
334 retval = khugepaged_fork(mm, oldmm);
335 if (retval)
336 goto out;
331 337
332 prev = NULL; 338 prev = NULL;
333 for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { 339 for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
@@ -527,6 +533,9 @@ void __mmdrop(struct mm_struct *mm)
527 mm_free_pgd(mm); 533 mm_free_pgd(mm);
528 destroy_context(mm); 534 destroy_context(mm);
529 mmu_notifier_mm_destroy(mm); 535 mmu_notifier_mm_destroy(mm);
536#ifdef CONFIG_TRANSPARENT_HUGEPAGE
537 VM_BUG_ON(mm->pmd_huge_pte);
538#endif
530 free_mm(mm); 539 free_mm(mm);
531} 540}
532EXPORT_SYMBOL_GPL(__mmdrop); 541EXPORT_SYMBOL_GPL(__mmdrop);
@@ -541,6 +550,7 @@ void mmput(struct mm_struct *mm)
541 if (atomic_dec_and_test(&mm->mm_users)) { 550 if (atomic_dec_and_test(&mm->mm_users)) {
542 exit_aio(mm); 551 exit_aio(mm);
543 ksm_exit(mm); 552 ksm_exit(mm);
553 khugepaged_exit(mm); /* must run before exit_mmap */
544 exit_mmap(mm); 554 exit_mmap(mm);
545 set_mm_exe_file(mm, NULL); 555 set_mm_exe_file(mm, NULL);
546 if (!list_empty(&mm->mmlist)) { 556 if (!list_empty(&mm->mmlist)) {
@@ -667,6 +677,10 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
667 mm->token_priority = 0; 677 mm->token_priority = 0;
668 mm->last_interval = 0; 678 mm->last_interval = 0;
669 679
680#ifdef CONFIG_TRANSPARENT_HUGEPAGE
681 mm->pmd_huge_pte = NULL;
682#endif
683
670 if (!mm_init(mm, tsk)) 684 if (!mm_init(mm, tsk))
671 goto fail_nomem; 685 goto fail_nomem;
672 686
@@ -904,9 +918,11 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
904 posix_cpu_timers_init_group(sig); 918 posix_cpu_timers_init_group(sig);
905 919
906 tty_audit_fork(sig); 920 tty_audit_fork(sig);
921 sched_autogroup_fork(sig);
907 922
908 sig->oom_adj = current->signal->oom_adj; 923 sig->oom_adj = current->signal->oom_adj;
909 sig->oom_score_adj = current->signal->oom_score_adj; 924 sig->oom_score_adj = current->signal->oom_score_adj;
925 sig->oom_score_adj_min = current->signal->oom_score_adj_min;
910 926
911 mutex_init(&sig->cred_guard_mutex); 927 mutex_init(&sig->cred_guard_mutex);
912 928
@@ -1282,7 +1298,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1282 attach_pid(p, PIDTYPE_SID, task_session(current)); 1298 attach_pid(p, PIDTYPE_SID, task_session(current));
1283 list_add_tail(&p->sibling, &p->real_parent->children); 1299 list_add_tail(&p->sibling, &p->real_parent->children);
1284 list_add_tail_rcu(&p->tasks, &init_task.tasks); 1300 list_add_tail_rcu(&p->tasks, &init_task.tasks);
1285 __get_cpu_var(process_counts)++; 1301 __this_cpu_inc(process_counts);
1286 } 1302 }
1287 attach_pid(p, PIDTYPE_PID, pid); 1303 attach_pid(p, PIDTYPE_PID, pid);
1288 nr_threads++; 1304 nr_threads++;
@@ -1407,23 +1423,6 @@ long do_fork(unsigned long clone_flags,
1407 } 1423 }
1408 1424
1409 /* 1425 /*
1410 * We hope to recycle these flags after 2.6.26
1411 */
1412 if (unlikely(clone_flags & CLONE_STOPPED)) {
1413 static int __read_mostly count = 100;
1414
1415 if (count > 0 && printk_ratelimit()) {
1416 char comm[TASK_COMM_LEN];
1417
1418 count--;
1419 printk(KERN_INFO "fork(): process `%s' used deprecated "
1420 "clone flags 0x%lx\n",
1421 get_task_comm(comm, current),
1422 clone_flags & CLONE_STOPPED);
1423 }
1424 }
1425
1426 /*
1427 * When called from kernel_thread, don't do user tracing stuff. 1426 * When called from kernel_thread, don't do user tracing stuff.
1428 */ 1427 */
1429 if (likely(user_mode(regs))) 1428 if (likely(user_mode(regs)))
@@ -1461,16 +1460,7 @@ long do_fork(unsigned long clone_flags,
1461 */ 1460 */
1462 p->flags &= ~PF_STARTING; 1461 p->flags &= ~PF_STARTING;
1463 1462
1464 if (unlikely(clone_flags & CLONE_STOPPED)) { 1463 wake_up_new_task(p, clone_flags);
1465 /*
1466 * We'll start up with an immediate SIGSTOP.
1467 */
1468 sigaddset(&p->pending.signal, SIGSTOP);
1469 set_tsk_thread_flag(p, TIF_SIGPENDING);
1470 __set_task_state(p, TASK_STOPPED);
1471 } else {
1472 wake_up_new_task(p, clone_flags);
1473 }
1474 1464
1475 tracehook_report_clone_complete(trace, regs, 1465 tracehook_report_clone_complete(trace, regs,
1476 clone_flags, nr, p); 1466 clone_flags, nr, p);
diff --git a/kernel/freezer.c b/kernel/freezer.c
index bd1d42b17cb2..66ecd2ead215 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -104,8 +104,13 @@ bool freeze_task(struct task_struct *p, bool sig_only)
104 } 104 }
105 105
106 if (should_send_signal(p)) { 106 if (should_send_signal(p)) {
107 if (!signal_pending(p)) 107 fake_signal_wake_up(p);
108 fake_signal_wake_up(p); 108 /*
109 * fake_signal_wake_up() goes through p's scheduler
110 * lock and guarantees that TASK_STOPPED/TRACED ->
111 * TASK_RUNNING transition can't race with task state
112 * testing in try_to_freeze_tasks().
113 */
109 } else if (sig_only) { 114 } else if (sig_only) {
110 return false; 115 return false;
111 } else { 116 } else {
diff --git a/kernel/futex.c b/kernel/futex.c
index 6c683b37f2ce..b766d28accd6 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -69,6 +69,14 @@ int __read_mostly futex_cmpxchg_enabled;
69#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) 69#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
70 70
71/* 71/*
72 * Futex flags used to encode options to functions and preserve them across
73 * restarts.
74 */
75#define FLAGS_SHARED 0x01
76#define FLAGS_CLOCKRT 0x02
77#define FLAGS_HAS_TIMEOUT 0x04
78
79/*
72 * Priority Inheritance state: 80 * Priority Inheritance state:
73 */ 81 */
74struct futex_pi_state { 82struct futex_pi_state {
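The FLAGS_SHARED/FLAGS_CLOCKRT/FLAGS_HAS_TIMEOUT bits previously lived next to the restart-block code (a later hunk deletes that copy); hoisting them here lets every futex_*() function take one unsigned int flags argument instead of separate fshared/clockrt ints. The caller-side encoding is outside this excerpt, but it would look roughly like the following, using the opcode bits from linux/futex.h:

	/* sketch: build the internal flags word from the futex(2) op argument */
	int cmd = op & FUTEX_CMD_MASK;
	unsigned int flags = 0;

	if (!(op & FUTEX_PRIVATE_FLAG))
		flags |= FLAGS_SHARED;		/* key may refer to a shared mapping */

	if (op & FUTEX_CLOCK_REALTIME)
		flags |= FLAGS_CLOCKRT;		/* absolute timeouts measured on CLOCK_REALTIME */

	/* cmd then selects futex_wait(), futex_wake(), futex_requeue(), ... */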
@@ -123,6 +131,12 @@ struct futex_q {
123 u32 bitset; 131 u32 bitset;
124}; 132};
125 133
134static const struct futex_q futex_q_init = {
135 /* list gets initialized in queue_me()*/
136 .key = FUTEX_KEY_INIT,
137 .bitset = FUTEX_BITSET_MATCH_ANY
138};
139
126/* 140/*
127 * Hash buckets are shared by all the futex_keys that hash to the same 141 * Hash buckets are shared by all the futex_keys that hash to the same
128 * location. Each key may have multiple futex_q structures, one for each task 142 * location. Each key may have multiple futex_q structures, one for each task
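futex_q_init gives on-stack futex_q instances one known starting state: in a designated initializer, every member that is not named (pi_state, rt_waiter, requeue_pi_key, ...) is implicitly zeroed, which is why later hunks can drop the explicit q.pi_state = NULL style assignments in futex_wait(), futex_lock_pi() and futex_wait_requeue_pi(). A tiny user-space illustration of that C rule, with made-up field names:

	#include <assert.h>
	#include <stddef.h>

	struct q {
		void		*key;
		unsigned int	 bitset;
		void		*pi_state;	/* deliberately not named below */
	};

	static const struct q q_init = {
		.key	= NULL,
		.bitset	= ~0u,			/* stands in for FUTEX_BITSET_MATCH_ANY */
	};

	int main(void)
	{
		struct q q = q_init;		/* one struct copy... */

		assert(q.pi_state == NULL);	/* ...and the unnamed member came out zero */
		return 0;
	}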
@@ -219,7 +233,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
219{ 233{
220 unsigned long address = (unsigned long)uaddr; 234 unsigned long address = (unsigned long)uaddr;
221 struct mm_struct *mm = current->mm; 235 struct mm_struct *mm = current->mm;
222 struct page *page; 236 struct page *page, *page_head;
223 int err; 237 int err;
224 238
225 /* 239 /*
@@ -251,11 +265,46 @@ again:
251 if (err < 0) 265 if (err < 0)
252 return err; 266 return err;
253 267
254 page = compound_head(page); 268#ifdef CONFIG_TRANSPARENT_HUGEPAGE
255 lock_page(page); 269 page_head = page;
256 if (!page->mapping) { 270 if (unlikely(PageTail(page))) {
257 unlock_page(page);
258 put_page(page); 271 put_page(page);
272 /* serialize against __split_huge_page_splitting() */
273 local_irq_disable();
274 if (likely(__get_user_pages_fast(address, 1, 1, &page) == 1)) {
275 page_head = compound_head(page);
276 /*
277 * page_head is valid pointer but we must pin
278 * it before taking the PG_lock and/or
279 * PG_compound_lock. The moment we re-enable
280 * irqs __split_huge_page_splitting() can
281 * return and the head page can be freed from
282 * under us. We can't take the PG_lock and/or
283 * PG_compound_lock on a page that could be
284 * freed from under us.
285 */
286 if (page != page_head) {
287 get_page(page_head);
288 put_page(page);
289 }
290 local_irq_enable();
291 } else {
292 local_irq_enable();
293 goto again;
294 }
295 }
296#else
297 page_head = compound_head(page);
298 if (page != page_head) {
299 get_page(page_head);
300 put_page(page);
301 }
302#endif
303
304 lock_page(page_head);
305 if (!page_head->mapping) {
306 unlock_page(page_head);
307 put_page(page_head);
259 goto again; 308 goto again;
260 } 309 }
261 310
@@ -266,25 +315,24 @@ again:
266 * it's a read-only handle, it's expected that futexes attach to 315 * it's a read-only handle, it's expected that futexes attach to
267 * the object not the particular process. 316 * the object not the particular process.
268 */ 317 */
269 if (PageAnon(page)) { 318 if (PageAnon(page_head)) {
270 key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ 319 key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
271 key->private.mm = mm; 320 key->private.mm = mm;
272 key->private.address = address; 321 key->private.address = address;
273 } else { 322 } else {
274 key->both.offset |= FUT_OFF_INODE; /* inode-based key */ 323 key->both.offset |= FUT_OFF_INODE; /* inode-based key */
275 key->shared.inode = page->mapping->host; 324 key->shared.inode = page_head->mapping->host;
276 key->shared.pgoff = page->index; 325 key->shared.pgoff = page_head->index;
277 } 326 }
278 327
279 get_futex_key_refs(key); 328 get_futex_key_refs(key);
280 329
281 unlock_page(page); 330 unlock_page(page_head);
282 put_page(page); 331 put_page(page_head);
283 return 0; 332 return 0;
284} 333}
285 334
286static inline 335static inline void put_futex_key(union futex_key *key)
287void put_futex_key(int fshared, union futex_key *key)
288{ 336{
289 drop_futex_key_refs(key); 337 drop_futex_key_refs(key);
290} 338}
@@ -778,10 +826,9 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
778 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); 826 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
779 827
780 /* 828 /*
781 * This happens when we have stolen the lock and the original 829 * It is possible that the next waiter (the one that brought
782 * pending owner did not enqueue itself back on the rt_mutex. 830 * this owner to the kernel) timed out and is no longer
783 * Thats not a tragedy. We know that way, that a lock waiter 831 * waiting on the lock.
784 * is on the fly. We make the futex_q waiter the pending owner.
785 */ 832 */
786 if (!new_owner) 833 if (!new_owner)
787 new_owner = this->task; 834 new_owner = this->task;
@@ -870,7 +917,8 @@ double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
870/* 917/*
871 * Wake up waiters matching bitset queued on this futex (uaddr). 918 * Wake up waiters matching bitset queued on this futex (uaddr).
872 */ 919 */
873static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) 920static int
921futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
874{ 922{
875 struct futex_hash_bucket *hb; 923 struct futex_hash_bucket *hb;
876 struct futex_q *this, *next; 924 struct futex_q *this, *next;
@@ -881,7 +929,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
881 if (!bitset) 929 if (!bitset)
882 return -EINVAL; 930 return -EINVAL;
883 931
884 ret = get_futex_key(uaddr, fshared, &key); 932 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key);
885 if (unlikely(ret != 0)) 933 if (unlikely(ret != 0))
886 goto out; 934 goto out;
887 935
@@ -907,7 +955,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
907 } 955 }
908 956
909 spin_unlock(&hb->lock); 957 spin_unlock(&hb->lock);
910 put_futex_key(fshared, &key); 958 put_futex_key(&key);
911out: 959out:
912 return ret; 960 return ret;
913} 961}
@@ -917,7 +965,7 @@ out:
917 * to this virtual address: 965 * to this virtual address:
918 */ 966 */
919static int 967static int
920futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, 968futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
921 int nr_wake, int nr_wake2, int op) 969 int nr_wake, int nr_wake2, int op)
922{ 970{
923 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; 971 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
@@ -927,10 +975,10 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
927 int ret, op_ret; 975 int ret, op_ret;
928 976
929retry: 977retry:
930 ret = get_futex_key(uaddr1, fshared, &key1); 978 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1);
931 if (unlikely(ret != 0)) 979 if (unlikely(ret != 0))
932 goto out; 980 goto out;
933 ret = get_futex_key(uaddr2, fshared, &key2); 981 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2);
934 if (unlikely(ret != 0)) 982 if (unlikely(ret != 0))
935 goto out_put_key1; 983 goto out_put_key1;
936 984
@@ -962,11 +1010,11 @@ retry_private:
962 if (ret) 1010 if (ret)
963 goto out_put_keys; 1011 goto out_put_keys;
964 1012
965 if (!fshared) 1013 if (!(flags & FLAGS_SHARED))
966 goto retry_private; 1014 goto retry_private;
967 1015
968 put_futex_key(fshared, &key2); 1016 put_futex_key(&key2);
969 put_futex_key(fshared, &key1); 1017 put_futex_key(&key1);
970 goto retry; 1018 goto retry;
971 } 1019 }
972 1020
@@ -996,9 +1044,9 @@ retry_private:
996 1044
997 double_unlock_hb(hb1, hb2); 1045 double_unlock_hb(hb1, hb2);
998out_put_keys: 1046out_put_keys:
999 put_futex_key(fshared, &key2); 1047 put_futex_key(&key2);
1000out_put_key1: 1048out_put_key1:
1001 put_futex_key(fshared, &key1); 1049 put_futex_key(&key1);
1002out: 1050out:
1003 return ret; 1051 return ret;
1004} 1052}
@@ -1133,13 +1181,13 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1133/** 1181/**
1134 * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 1182 * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
1135 * @uaddr1: source futex user address 1183 * @uaddr1: source futex user address
1136 * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED 1184 * @flags: futex flags (FLAGS_SHARED, etc.)
1137 * @uaddr2: target futex user address 1185 * @uaddr2: target futex user address
1138 * @nr_wake: number of waiters to wake (must be 1 for requeue_pi) 1186 * @nr_wake: number of waiters to wake (must be 1 for requeue_pi)
1139 * @nr_requeue: number of waiters to requeue (0-INT_MAX) 1187 * @nr_requeue: number of waiters to requeue (0-INT_MAX)
1140 * @cmpval: @uaddr1 expected value (or %NULL) 1188 * @cmpval: @uaddr1 expected value (or %NULL)
1141 * @requeue_pi: if we are attempting to requeue from a non-pi futex to a 1189 * @requeue_pi: if we are attempting to requeue from a non-pi futex to a
1142 * pi futex (pi to pi requeue is not supported) 1190 * pi futex (pi to pi requeue is not supported)
1143 * 1191 *
1144 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire 1192 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
1145 * uaddr2 atomically on behalf of the top waiter. 1193 * uaddr2 atomically on behalf of the top waiter.
@@ -1148,9 +1196,9 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1148 * >=0 - on success, the number of tasks requeued or woken 1196 * >=0 - on success, the number of tasks requeued or woken
1149 * <0 - on error 1197 * <0 - on error
1150 */ 1198 */
1151static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, 1199static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
1152 int nr_wake, int nr_requeue, u32 *cmpval, 1200 u32 __user *uaddr2, int nr_wake, int nr_requeue,
1153 int requeue_pi) 1201 u32 *cmpval, int requeue_pi)
1154{ 1202{
1155 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; 1203 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
1156 int drop_count = 0, task_count = 0, ret; 1204 int drop_count = 0, task_count = 0, ret;
@@ -1191,10 +1239,10 @@ retry:
1191 pi_state = NULL; 1239 pi_state = NULL;
1192 } 1240 }
1193 1241
1194 ret = get_futex_key(uaddr1, fshared, &key1); 1242 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1);
1195 if (unlikely(ret != 0)) 1243 if (unlikely(ret != 0))
1196 goto out; 1244 goto out;
1197 ret = get_futex_key(uaddr2, fshared, &key2); 1245 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2);
1198 if (unlikely(ret != 0)) 1246 if (unlikely(ret != 0))
1199 goto out_put_key1; 1247 goto out_put_key1;
1200 1248
@@ -1216,11 +1264,11 @@ retry_private:
1216 if (ret) 1264 if (ret)
1217 goto out_put_keys; 1265 goto out_put_keys;
1218 1266
1219 if (!fshared) 1267 if (!(flags & FLAGS_SHARED))
1220 goto retry_private; 1268 goto retry_private;
1221 1269
1222 put_futex_key(fshared, &key2); 1270 put_futex_key(&key2);
1223 put_futex_key(fshared, &key1); 1271 put_futex_key(&key1);
1224 goto retry; 1272 goto retry;
1225 } 1273 }
1226 if (curval != *cmpval) { 1274 if (curval != *cmpval) {
@@ -1260,8 +1308,8 @@ retry_private:
1260 break; 1308 break;
1261 case -EFAULT: 1309 case -EFAULT:
1262 double_unlock_hb(hb1, hb2); 1310 double_unlock_hb(hb1, hb2);
1263 put_futex_key(fshared, &key2); 1311 put_futex_key(&key2);
1264 put_futex_key(fshared, &key1); 1312 put_futex_key(&key1);
1265 ret = fault_in_user_writeable(uaddr2); 1313 ret = fault_in_user_writeable(uaddr2);
1266 if (!ret) 1314 if (!ret)
1267 goto retry; 1315 goto retry;
@@ -1269,8 +1317,8 @@ retry_private:
1269 case -EAGAIN: 1317 case -EAGAIN:
1270 /* The owner was exiting, try again. */ 1318 /* The owner was exiting, try again. */
1271 double_unlock_hb(hb1, hb2); 1319 double_unlock_hb(hb1, hb2);
1272 put_futex_key(fshared, &key2); 1320 put_futex_key(&key2);
1273 put_futex_key(fshared, &key1); 1321 put_futex_key(&key1);
1274 cond_resched(); 1322 cond_resched();
1275 goto retry; 1323 goto retry;
1276 default: 1324 default:
@@ -1352,9 +1400,9 @@ out_unlock:
1352 drop_futex_key_refs(&key1); 1400 drop_futex_key_refs(&key1);
1353 1401
1354out_put_keys: 1402out_put_keys:
1355 put_futex_key(fshared, &key2); 1403 put_futex_key(&key2);
1356out_put_key1: 1404out_put_key1:
1357 put_futex_key(fshared, &key1); 1405 put_futex_key(&key1);
1358out: 1406out:
1359 if (pi_state != NULL) 1407 if (pi_state != NULL)
1360 free_pi_state(pi_state); 1408 free_pi_state(pi_state);
@@ -1494,7 +1542,7 @@ static void unqueue_me_pi(struct futex_q *q)
1494 * private futexes. 1542 * private futexes.
1495 */ 1543 */
1496static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, 1544static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1497 struct task_struct *newowner, int fshared) 1545 struct task_struct *newowner)
1498{ 1546{
1499 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; 1547 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
1500 struct futex_pi_state *pi_state = q->pi_state; 1548 struct futex_pi_state *pi_state = q->pi_state;
@@ -1587,20 +1635,11 @@ handle_fault:
1587 goto retry; 1635 goto retry;
1588} 1636}
1589 1637
1590/*
1591 * In case we must use restart_block to restart a futex_wait,
1592 * we encode in the 'flags' shared capability
1593 */
1594#define FLAGS_SHARED 0x01
1595#define FLAGS_CLOCKRT 0x02
1596#define FLAGS_HAS_TIMEOUT 0x04
1597
1598static long futex_wait_restart(struct restart_block *restart); 1638static long futex_wait_restart(struct restart_block *restart);
1599 1639
1600/** 1640/**
1601 * fixup_owner() - Post lock pi_state and corner case management 1641 * fixup_owner() - Post lock pi_state and corner case management
1602 * @uaddr: user address of the futex 1642 * @uaddr: user address of the futex
1603 * @fshared: whether the futex is shared (1) or not (0)
1604 * @q: futex_q (contains pi_state and access to the rt_mutex) 1643 * @q: futex_q (contains pi_state and access to the rt_mutex)
1605 * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0) 1644 * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0)
1606 * 1645 *
@@ -1613,8 +1652,7 @@ static long futex_wait_restart(struct restart_block *restart);
1613 * 0 - success, lock not taken 1652 * 0 - success, lock not taken
1614 * <0 - on error (-EFAULT) 1653 * <0 - on error (-EFAULT)
1615 */ 1654 */
1616static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q, 1655static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
1617 int locked)
1618{ 1656{
1619 struct task_struct *owner; 1657 struct task_struct *owner;
1620 int ret = 0; 1658 int ret = 0;
@@ -1625,7 +1663,7 @@ static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q,
1625 * did a lock-steal - fix up the PI-state in that case: 1663 * did a lock-steal - fix up the PI-state in that case:
1626 */ 1664 */
1627 if (q->pi_state->owner != current) 1665 if (q->pi_state->owner != current)
1628 ret = fixup_pi_state_owner(uaddr, q, current, fshared); 1666 ret = fixup_pi_state_owner(uaddr, q, current);
1629 goto out; 1667 goto out;
1630 } 1668 }
1631 1669
@@ -1652,7 +1690,7 @@ static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q,
1652 * lock. Fix the state up. 1690 * lock. Fix the state up.
1653 */ 1691 */
1654 owner = rt_mutex_owner(&q->pi_state->pi_mutex); 1692 owner = rt_mutex_owner(&q->pi_state->pi_mutex);
1655 ret = fixup_pi_state_owner(uaddr, q, owner, fshared); 1693 ret = fixup_pi_state_owner(uaddr, q, owner);
1656 goto out; 1694 goto out;
1657 } 1695 }
1658 1696
@@ -1715,7 +1753,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
1715 * futex_wait_setup() - Prepare to wait on a futex 1753 * futex_wait_setup() - Prepare to wait on a futex
1716 * @uaddr: the futex userspace address 1754 * @uaddr: the futex userspace address
1717 * @val: the expected value 1755 * @val: the expected value
1718 * @fshared: whether the futex is shared (1) or not (0) 1756 * @flags: futex flags (FLAGS_SHARED, etc.)
1719 * @q: the associated futex_q 1757 * @q: the associated futex_q
1720 * @hb: storage for hash_bucket pointer to be returned to caller 1758 * @hb: storage for hash_bucket pointer to be returned to caller
1721 * 1759 *
@@ -1728,7 +1766,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
1728 * 0 - uaddr contains val and hb has been locked 1766 * 0 - uaddr contains val and hb has been locked
1729 * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlcoked 1767 * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlcoked
1730 */ 1768 */
1731static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared, 1769static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
1732 struct futex_q *q, struct futex_hash_bucket **hb) 1770 struct futex_q *q, struct futex_hash_bucket **hb)
1733{ 1771{
1734 u32 uval; 1772 u32 uval;
@@ -1752,8 +1790,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared,
1752 * rare, but normal. 1790 * rare, but normal.
1753 */ 1791 */
1754retry: 1792retry:
1755 q->key = FUTEX_KEY_INIT; 1793 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key);
1756 ret = get_futex_key(uaddr, fshared, &q->key);
1757 if (unlikely(ret != 0)) 1794 if (unlikely(ret != 0))
1758 return ret; 1795 return ret;
1759 1796
@@ -1769,10 +1806,10 @@ retry_private:
1769 if (ret) 1806 if (ret)
1770 goto out; 1807 goto out;
1771 1808
1772 if (!fshared) 1809 if (!(flags & FLAGS_SHARED))
1773 goto retry_private; 1810 goto retry_private;
1774 1811
1775 put_futex_key(fshared, &q->key); 1812 put_futex_key(&q->key);
1776 goto retry; 1813 goto retry;
1777 } 1814 }
1778 1815
@@ -1783,32 +1820,29 @@ retry_private:
1783 1820
1784out: 1821out:
1785 if (ret) 1822 if (ret)
1786 put_futex_key(fshared, &q->key); 1823 put_futex_key(&q->key);
1787 return ret; 1824 return ret;
1788} 1825}
1789 1826
1790static int futex_wait(u32 __user *uaddr, int fshared, 1827static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
1791 u32 val, ktime_t *abs_time, u32 bitset, int clockrt) 1828 ktime_t *abs_time, u32 bitset)
1792{ 1829{
1793 struct hrtimer_sleeper timeout, *to = NULL; 1830 struct hrtimer_sleeper timeout, *to = NULL;
1794 struct restart_block *restart; 1831 struct restart_block *restart;
1795 struct futex_hash_bucket *hb; 1832 struct futex_hash_bucket *hb;
1796 struct futex_q q; 1833 struct futex_q q = futex_q_init;
1797 int ret; 1834 int ret;
1798 1835
1799 if (!bitset) 1836 if (!bitset)
1800 return -EINVAL; 1837 return -EINVAL;
1801
1802 q.pi_state = NULL;
1803 q.bitset = bitset; 1838 q.bitset = bitset;
1804 q.rt_waiter = NULL;
1805 q.requeue_pi_key = NULL;
1806 1839
1807 if (abs_time) { 1840 if (abs_time) {
1808 to = &timeout; 1841 to = &timeout;
1809 1842
1810 hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME : 1843 hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
1811 CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 1844 CLOCK_REALTIME : CLOCK_MONOTONIC,
1845 HRTIMER_MODE_ABS);
1812 hrtimer_init_sleeper(to, current); 1846 hrtimer_init_sleeper(to, current);
1813 hrtimer_set_expires_range_ns(&to->timer, *abs_time, 1847 hrtimer_set_expires_range_ns(&to->timer, *abs_time,
1814 current->timer_slack_ns); 1848 current->timer_slack_ns);
@@ -1819,7 +1853,7 @@ retry:
1819 * Prepare to wait on uaddr. On success, holds hb lock and increments 1853 * Prepare to wait on uaddr. On success, holds hb lock and increments
1820 * q.key refs. 1854 * q.key refs.
1821 */ 1855 */
1822 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); 1856 ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
1823 if (ret) 1857 if (ret)
1824 goto out; 1858 goto out;
1825 1859
@@ -1852,12 +1886,7 @@ retry:
1852 restart->futex.val = val; 1886 restart->futex.val = val;
1853 restart->futex.time = abs_time->tv64; 1887 restart->futex.time = abs_time->tv64;
1854 restart->futex.bitset = bitset; 1888 restart->futex.bitset = bitset;
1855 restart->futex.flags = FLAGS_HAS_TIMEOUT; 1889 restart->futex.flags = flags;
1856
1857 if (fshared)
1858 restart->futex.flags |= FLAGS_SHARED;
1859 if (clockrt)
1860 restart->futex.flags |= FLAGS_CLOCKRT;
1861 1890
1862 ret = -ERESTART_RESTARTBLOCK; 1891 ret = -ERESTART_RESTARTBLOCK;
1863 1892
@@ -1873,7 +1902,6 @@ out:
1873static long futex_wait_restart(struct restart_block *restart) 1902static long futex_wait_restart(struct restart_block *restart)
1874{ 1903{
1875 u32 __user *uaddr = restart->futex.uaddr; 1904 u32 __user *uaddr = restart->futex.uaddr;
1876 int fshared = 0;
1877 ktime_t t, *tp = NULL; 1905 ktime_t t, *tp = NULL;
1878 1906
1879 if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { 1907 if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
@@ -1881,11 +1909,9 @@ static long futex_wait_restart(struct restart_block *restart)
1881 tp = &t; 1909 tp = &t;
1882 } 1910 }
1883 restart->fn = do_no_restart_syscall; 1911 restart->fn = do_no_restart_syscall;
1884 if (restart->futex.flags & FLAGS_SHARED) 1912
1885 fshared = 1; 1913 return (long)futex_wait(uaddr, restart->futex.flags,
1886 return (long)futex_wait(uaddr, fshared, restart->futex.val, tp, 1914 restart->futex.val, tp, restart->futex.bitset);
1887 restart->futex.bitset,
1888 restart->futex.flags & FLAGS_CLOCKRT);
1889} 1915}
1890 1916
1891 1917
@@ -1895,12 +1921,12 @@ static long futex_wait_restart(struct restart_block *restart)
1895 * if there are waiters then it will block, it does PI, etc. (Due to 1921 * if there are waiters then it will block, it does PI, etc. (Due to
1896 * races the kernel might see a 0 value of the futex too.) 1922 * races the kernel might see a 0 value of the futex too.)
1897 */ 1923 */
1898static int futex_lock_pi(u32 __user *uaddr, int fshared, 1924static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect,
1899 int detect, ktime_t *time, int trylock) 1925 ktime_t *time, int trylock)
1900{ 1926{
1901 struct hrtimer_sleeper timeout, *to = NULL; 1927 struct hrtimer_sleeper timeout, *to = NULL;
1902 struct futex_hash_bucket *hb; 1928 struct futex_hash_bucket *hb;
1903 struct futex_q q; 1929 struct futex_q q = futex_q_init;
1904 int res, ret; 1930 int res, ret;
1905 1931
1906 if (refill_pi_state_cache()) 1932 if (refill_pi_state_cache())
@@ -1914,12 +1940,8 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1914 hrtimer_set_expires(&to->timer, *time); 1940 hrtimer_set_expires(&to->timer, *time);
1915 } 1941 }
1916 1942
1917 q.pi_state = NULL;
1918 q.rt_waiter = NULL;
1919 q.requeue_pi_key = NULL;
1920retry: 1943retry:
1921 q.key = FUTEX_KEY_INIT; 1944 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key);
1922 ret = get_futex_key(uaddr, fshared, &q.key);
1923 if (unlikely(ret != 0)) 1945 if (unlikely(ret != 0))
1924 goto out; 1946 goto out;
1925 1947
@@ -1941,7 +1963,7 @@ retry_private:
1941 * exit to complete. 1963 * exit to complete.
1942 */ 1964 */
1943 queue_unlock(&q, hb); 1965 queue_unlock(&q, hb);
1944 put_futex_key(fshared, &q.key); 1966 put_futex_key(&q.key);
1945 cond_resched(); 1967 cond_resched();
1946 goto retry; 1968 goto retry;
1947 default: 1969 default:
@@ -1971,7 +1993,7 @@ retry_private:
1971 * Fixup the pi_state owner and possibly acquire the lock if we 1993 * Fixup the pi_state owner and possibly acquire the lock if we
1972 * haven't already. 1994 * haven't already.
1973 */ 1995 */
1974 res = fixup_owner(uaddr, fshared, &q, !ret); 1996 res = fixup_owner(uaddr, &q, !ret);
1975 /* 1997 /*
1976 * If fixup_owner() returned an error, proprogate that. If it acquired 1998 * If fixup_owner() returned an error, proprogate that. If it acquired
1977 * the lock, clear our -ETIMEDOUT or -EINTR. 1999 * the lock, clear our -ETIMEDOUT or -EINTR.
@@ -1995,7 +2017,7 @@ out_unlock_put_key:
1995 queue_unlock(&q, hb); 2017 queue_unlock(&q, hb);
1996 2018
1997out_put_key: 2019out_put_key:
1998 put_futex_key(fshared, &q.key); 2020 put_futex_key(&q.key);
1999out: 2021out:
2000 if (to) 2022 if (to)
2001 destroy_hrtimer_on_stack(&to->timer); 2023 destroy_hrtimer_on_stack(&to->timer);
@@ -2008,10 +2030,10 @@ uaddr_faulted:
2008 if (ret) 2030 if (ret)
2009 goto out_put_key; 2031 goto out_put_key;
2010 2032
2011 if (!fshared) 2033 if (!(flags & FLAGS_SHARED))
2012 goto retry_private; 2034 goto retry_private;
2013 2035
2014 put_futex_key(fshared, &q.key); 2036 put_futex_key(&q.key);
2015 goto retry; 2037 goto retry;
2016} 2038}
2017 2039
@@ -2020,7 +2042,7 @@ uaddr_faulted:
2020 * This is the in-kernel slowpath: we look up the PI state (if any), 2042 * This is the in-kernel slowpath: we look up the PI state (if any),
2021 * and do the rt-mutex unlock. 2043 * and do the rt-mutex unlock.
2022 */ 2044 */
2023static int futex_unlock_pi(u32 __user *uaddr, int fshared) 2045static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
2024{ 2046{
2025 struct futex_hash_bucket *hb; 2047 struct futex_hash_bucket *hb;
2026 struct futex_q *this, *next; 2048 struct futex_q *this, *next;
@@ -2038,7 +2060,7 @@ retry:
2038 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) 2060 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
2039 return -EPERM; 2061 return -EPERM;
2040 2062
2041 ret = get_futex_key(uaddr, fshared, &key); 2063 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key);
2042 if (unlikely(ret != 0)) 2064 if (unlikely(ret != 0))
2043 goto out; 2065 goto out;
2044 2066
@@ -2093,14 +2115,14 @@ retry:
2093 2115
2094out_unlock: 2116out_unlock:
2095 spin_unlock(&hb->lock); 2117 spin_unlock(&hb->lock);
2096 put_futex_key(fshared, &key); 2118 put_futex_key(&key);
2097 2119
2098out: 2120out:
2099 return ret; 2121 return ret;
2100 2122
2101pi_faulted: 2123pi_faulted:
2102 spin_unlock(&hb->lock); 2124 spin_unlock(&hb->lock);
2103 put_futex_key(fshared, &key); 2125 put_futex_key(&key);
2104 2126
2105 ret = fault_in_user_writeable(uaddr); 2127 ret = fault_in_user_writeable(uaddr);
2106 if (!ret) 2128 if (!ret)
@@ -2160,7 +2182,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2160/** 2182/**
2161 * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 2183 * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
2162 * @uaddr: the futex we initially wait on (non-pi) 2184 * @uaddr: the futex we initially wait on (non-pi)
2163 * @fshared: whether the futexes are shared (1) or not (0). They must be 2185 * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be
2164 * the same type, no requeueing from private to shared, etc. 2186 * the same type, no requeueing from private to shared, etc.
2165 * @val: the expected value of uaddr 2187 * @val: the expected value of uaddr
2166 * @abs_time: absolute timeout 2188 * @abs_time: absolute timeout
@@ -2198,16 +2220,16 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2198 * 0 - On success 2220 * 0 - On success
2199 * <0 - On error 2221 * <0 - On error
2200 */ 2222 */
2201static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, 2223static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2202 u32 val, ktime_t *abs_time, u32 bitset, 2224 u32 val, ktime_t *abs_time, u32 bitset,
2203 int clockrt, u32 __user *uaddr2) 2225 u32 __user *uaddr2)
2204{ 2226{
2205 struct hrtimer_sleeper timeout, *to = NULL; 2227 struct hrtimer_sleeper timeout, *to = NULL;
2206 struct rt_mutex_waiter rt_waiter; 2228 struct rt_mutex_waiter rt_waiter;
2207 struct rt_mutex *pi_mutex = NULL; 2229 struct rt_mutex *pi_mutex = NULL;
2208 struct futex_hash_bucket *hb; 2230 struct futex_hash_bucket *hb;
2209 union futex_key key2; 2231 union futex_key key2 = FUTEX_KEY_INIT;
2210 struct futex_q q; 2232 struct futex_q q = futex_q_init;
2211 int res, ret; 2233 int res, ret;
2212 2234
2213 if (!bitset) 2235 if (!bitset)
@@ -2215,8 +2237,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2215 2237
2216 if (abs_time) { 2238 if (abs_time) {
2217 to = &timeout; 2239 to = &timeout;
2218 hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME : 2240 hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
2219 CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 2241 CLOCK_REALTIME : CLOCK_MONOTONIC,
2242 HRTIMER_MODE_ABS);
2220 hrtimer_init_sleeper(to, current); 2243 hrtimer_init_sleeper(to, current);
2221 hrtimer_set_expires_range_ns(&to->timer, *abs_time, 2244 hrtimer_set_expires_range_ns(&to->timer, *abs_time,
2222 current->timer_slack_ns); 2245 current->timer_slack_ns);
@@ -2229,12 +2252,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2229 debug_rt_mutex_init_waiter(&rt_waiter); 2252 debug_rt_mutex_init_waiter(&rt_waiter);
2230 rt_waiter.task = NULL; 2253 rt_waiter.task = NULL;
2231 2254
2232 key2 = FUTEX_KEY_INIT; 2255 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2);
2233 ret = get_futex_key(uaddr2, fshared, &key2);
2234 if (unlikely(ret != 0)) 2256 if (unlikely(ret != 0))
2235 goto out; 2257 goto out;
2236 2258
2237 q.pi_state = NULL;
2238 q.bitset = bitset; 2259 q.bitset = bitset;
2239 q.rt_waiter = &rt_waiter; 2260 q.rt_waiter = &rt_waiter;
2240 q.requeue_pi_key = &key2; 2261 q.requeue_pi_key = &key2;
@@ -2243,7 +2264,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2243 * Prepare to wait on uaddr. On success, increments q.key (key1) ref 2264 * Prepare to wait on uaddr. On success, increments q.key (key1) ref
2244 * count. 2265 * count.
2245 */ 2266 */
2246 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); 2267 ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
2247 if (ret) 2268 if (ret)
2248 goto out_key2; 2269 goto out_key2;
2249 2270
@@ -2273,8 +2294,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2273 */ 2294 */
2274 if (q.pi_state && (q.pi_state->owner != current)) { 2295 if (q.pi_state && (q.pi_state->owner != current)) {
2275 spin_lock(q.lock_ptr); 2296 spin_lock(q.lock_ptr);
2276 ret = fixup_pi_state_owner(uaddr2, &q, current, 2297 ret = fixup_pi_state_owner(uaddr2, &q, current);
2277 fshared);
2278 spin_unlock(q.lock_ptr); 2298 spin_unlock(q.lock_ptr);
2279 } 2299 }
2280 } else { 2300 } else {
@@ -2293,7 +2313,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2293 * Fixup the pi_state owner and possibly acquire the lock if we 2313 * Fixup the pi_state owner and possibly acquire the lock if we
2294 * haven't already. 2314 * haven't already.
2295 */ 2315 */
2296 res = fixup_owner(uaddr2, fshared, &q, !ret); 2316 res = fixup_owner(uaddr2, &q, !ret);
2297 /* 2317 /*
2298 * If fixup_owner() returned an error, proprogate that. If it 2318 * If fixup_owner() returned an error, proprogate that. If it
2299 * acquired the lock, clear -ETIMEDOUT or -EINTR. 2319 * acquired the lock, clear -ETIMEDOUT or -EINTR.
@@ -2324,9 +2344,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2324 } 2344 }
2325 2345
2326out_put_keys: 2346out_put_keys:
2327 put_futex_key(fshared, &q.key); 2347 put_futex_key(&q.key);
2328out_key2: 2348out_key2:
2329 put_futex_key(fshared, &key2); 2349 put_futex_key(&key2);
2330 2350
2331out: 2351out:
2332 if (to) { 2352 if (to) {
@@ -2489,7 +2509,8 @@ void exit_robust_list(struct task_struct *curr)
2489{ 2509{
2490 struct robust_list_head __user *head = curr->robust_list; 2510 struct robust_list_head __user *head = curr->robust_list;
2491 struct robust_list __user *entry, *next_entry, *pending; 2511 struct robust_list __user *entry, *next_entry, *pending;
2492 unsigned int limit = ROBUST_LIST_LIMIT, pi, next_pi, pip; 2512 unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
2513 unsigned int uninitialized_var(next_pi);
2493 unsigned long futex_offset; 2514 unsigned long futex_offset;
2494 int rc; 2515 int rc;
2495 2516
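
The hunk above replaces the plain next_pi declaration with the uninitialized_var() annotation so gcc stops emitting a false-positive "may be used uninitialized" warning in exit_robust_list(). A minimal sketch of how the annotation works, assuming the macro definition from the compiler headers of that era (roughly x = x); demo() and its variables are made up for illustration:

/* Approximate definition from the kernel compiler headers of that era (assumption). */
#define uninitialized_var(x) x = x

static int demo(int flag)
{
        int plain;                        /* gcc may warn: used uninitialized */
        int uninitialized_var(quiet);     /* same flow, warning suppressed    */

        if (flag) {
                plain = 1;
                quiet = 2;
        }
        return flag ? plain + quiet : 0;  /* only read when initialized */
}
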
@@ -2550,58 +2571,57 @@ void exit_robust_list(struct task_struct *curr)
2550long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, 2571long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
2551 u32 __user *uaddr2, u32 val2, u32 val3) 2572 u32 __user *uaddr2, u32 val2, u32 val3)
2552{ 2573{
2553 int clockrt, ret = -ENOSYS; 2574 int ret = -ENOSYS, cmd = op & FUTEX_CMD_MASK;
2554 int cmd = op & FUTEX_CMD_MASK; 2575 unsigned int flags = 0;
2555 int fshared = 0;
2556 2576
2557 if (!(op & FUTEX_PRIVATE_FLAG)) 2577 if (!(op & FUTEX_PRIVATE_FLAG))
2558 fshared = 1; 2578 flags |= FLAGS_SHARED;
2559 2579
2560 clockrt = op & FUTEX_CLOCK_REALTIME; 2580 if (op & FUTEX_CLOCK_REALTIME) {
2561 if (clockrt && cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI) 2581 flags |= FLAGS_CLOCKRT;
2562 return -ENOSYS; 2582 if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
2583 return -ENOSYS;
2584 }
2563 2585
2564 switch (cmd) { 2586 switch (cmd) {
2565 case FUTEX_WAIT: 2587 case FUTEX_WAIT:
2566 val3 = FUTEX_BITSET_MATCH_ANY; 2588 val3 = FUTEX_BITSET_MATCH_ANY;
2567 case FUTEX_WAIT_BITSET: 2589 case FUTEX_WAIT_BITSET:
2568 ret = futex_wait(uaddr, fshared, val, timeout, val3, clockrt); 2590 ret = futex_wait(uaddr, flags, val, timeout, val3);
2569 break; 2591 break;
2570 case FUTEX_WAKE: 2592 case FUTEX_WAKE:
2571 val3 = FUTEX_BITSET_MATCH_ANY; 2593 val3 = FUTEX_BITSET_MATCH_ANY;
2572 case FUTEX_WAKE_BITSET: 2594 case FUTEX_WAKE_BITSET:
2573 ret = futex_wake(uaddr, fshared, val, val3); 2595 ret = futex_wake(uaddr, flags, val, val3);
2574 break; 2596 break;
2575 case FUTEX_REQUEUE: 2597 case FUTEX_REQUEUE:
2576 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 0); 2598 ret = futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
2577 break; 2599 break;
2578 case FUTEX_CMP_REQUEUE: 2600 case FUTEX_CMP_REQUEUE:
2579 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, 2601 ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
2580 0);
2581 break; 2602 break;
2582 case FUTEX_WAKE_OP: 2603 case FUTEX_WAKE_OP:
2583 ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3); 2604 ret = futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
2584 break; 2605 break;
2585 case FUTEX_LOCK_PI: 2606 case FUTEX_LOCK_PI:
2586 if (futex_cmpxchg_enabled) 2607 if (futex_cmpxchg_enabled)
2587 ret = futex_lock_pi(uaddr, fshared, val, timeout, 0); 2608 ret = futex_lock_pi(uaddr, flags, val, timeout, 0);
2588 break; 2609 break;
2589 case FUTEX_UNLOCK_PI: 2610 case FUTEX_UNLOCK_PI:
2590 if (futex_cmpxchg_enabled) 2611 if (futex_cmpxchg_enabled)
2591 ret = futex_unlock_pi(uaddr, fshared); 2612 ret = futex_unlock_pi(uaddr, flags);
2592 break; 2613 break;
2593 case FUTEX_TRYLOCK_PI: 2614 case FUTEX_TRYLOCK_PI:
2594 if (futex_cmpxchg_enabled) 2615 if (futex_cmpxchg_enabled)
2595 ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1); 2616 ret = futex_lock_pi(uaddr, flags, 0, timeout, 1);
2596 break; 2617 break;
2597 case FUTEX_WAIT_REQUEUE_PI: 2618 case FUTEX_WAIT_REQUEUE_PI:
2598 val3 = FUTEX_BITSET_MATCH_ANY; 2619 val3 = FUTEX_BITSET_MATCH_ANY;
2599 ret = futex_wait_requeue_pi(uaddr, fshared, val, timeout, val3, 2620 ret = futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
2600 clockrt, uaddr2); 2621 uaddr2);
2601 break; 2622 break;
2602 case FUTEX_CMP_REQUEUE_PI: 2623 case FUTEX_CMP_REQUEUE_PI:
2603 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, 2624 ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
2604 1);
2605 break; 2625 break;
2606 default: 2626 default:
2607 ret = -ENOSYS; 2627 ret = -ENOSYS;
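
With this hunk do_futex() stops passing separate fshared and clockrt integers and derives a single flags word from the op argument; FLAGS_SHARED and FLAGS_CLOCKRT are the futex.c-internal bits used throughout the hunks above. A small sketch of the same decode, compilable in userspace; the FLAGS_* values are assumptions mirroring kernel/futex.c and decode_futex_op() is an illustrative name:

#include <errno.h>
#include <linux/futex.h>   /* FUTEX_PRIVATE_FLAG, FUTEX_CLOCK_REALTIME, FUTEX_CMD_MASK */

#define FLAGS_SHARED  0x01   /* assumed to match kernel/futex.c */
#define FLAGS_CLOCKRT 0x02

/* Returns the command, or -ENOSYS when CLOCK_REALTIME is used with the wrong command. */
static int decode_futex_op(int op, unsigned int *flags)
{
        int cmd = op & FUTEX_CMD_MASK;

        *flags = 0;
        if (!(op & FUTEX_PRIVATE_FLAG))
                *flags |= FLAGS_SHARED;

        if (op & FUTEX_CLOCK_REALTIME) {
                *flags |= FLAGS_CLOCKRT;
                if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
                        return -ENOSYS;
        }
        return cmd;
}
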
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 06da4dfc339b..a7934ac75e5b 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -49,7 +49,8 @@ void compat_exit_robust_list(struct task_struct *curr)
49{ 49{
50 struct compat_robust_list_head __user *head = curr->compat_robust_list; 50 struct compat_robust_list_head __user *head = curr->compat_robust_list;
51 struct robust_list __user *entry, *next_entry, *pending; 51 struct robust_list __user *entry, *next_entry, *pending;
52 unsigned int limit = ROBUST_LIST_LIMIT, pi, next_pi, pip; 52 unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
53 unsigned int uninitialized_var(next_pi);
53 compat_uptr_t uentry, next_uentry, upending; 54 compat_uptr_t uentry, next_uentry, upending;
54 compat_long_t futex_offset; 55 compat_long_t futex_offset;
55 int rc; 56 int rc;
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 72206cf5c6cf..0c8d7c048615 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -497,7 +497,7 @@ static inline int hrtimer_is_hres_enabled(void)
497 */ 497 */
498static inline int hrtimer_hres_active(void) 498static inline int hrtimer_hres_active(void)
499{ 499{
500 return __get_cpu_var(hrtimer_bases).hres_active; 500 return __this_cpu_read(hrtimer_bases.hres_active);
501} 501}
502 502
503/* 503/*
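
The one-line change above is part of a wider conversion in this merge (also visible below in irq_work.c and kprobes.c): __get_cpu_var() computes the address of this CPU's instance and then dereferences it, while __this_cpu_read() lets the architecture access the slot in place (a single segment-prefixed load on x86). A sketch on an illustrative per-CPU variable:

#include <linux/percpu.h>

static DEFINE_PER_CPU(int, demo_flag);

static int demo_old_style(void)
{
        return __get_cpu_var(demo_flag);    /* take the address of this CPU's copy, then load */
}

static int demo_new_style(void)
{
        return __this_cpu_read(demo_flag);  /* read the per-CPU slot directly */
}
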
@@ -516,10 +516,13 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
516 516
517 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { 517 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
518 struct hrtimer *timer; 518 struct hrtimer *timer;
519 struct timerqueue_node *next;
519 520
520 if (!base->first) 521 next = timerqueue_getnext(&base->active);
522 if (!next)
521 continue; 523 continue;
522 timer = rb_entry(base->first, struct hrtimer, node); 524 timer = container_of(next, struct hrtimer, node);
525
523 expires = ktime_sub(hrtimer_get_expires(timer), base->offset); 526 expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
524 /* 527 /*
525 * clock_was_set() has changed base->offset so the 528 * clock_was_set() has changed base->offset so the
@@ -840,48 +843,17 @@ EXPORT_SYMBOL_GPL(hrtimer_forward);
840static int enqueue_hrtimer(struct hrtimer *timer, 843static int enqueue_hrtimer(struct hrtimer *timer,
841 struct hrtimer_clock_base *base) 844 struct hrtimer_clock_base *base)
842{ 845{
843 struct rb_node **link = &base->active.rb_node;
844 struct rb_node *parent = NULL;
845 struct hrtimer *entry;
846 int leftmost = 1;
847
848 debug_activate(timer); 846 debug_activate(timer);
849 847
850 /* 848 timerqueue_add(&base->active, &timer->node);
851 * Find the right place in the rbtree:
852 */
853 while (*link) {
854 parent = *link;
855 entry = rb_entry(parent, struct hrtimer, node);
856 /*
857 * We dont care about collisions. Nodes with
858 * the same expiry time stay together.
859 */
860 if (hrtimer_get_expires_tv64(timer) <
861 hrtimer_get_expires_tv64(entry)) {
862 link = &(*link)->rb_left;
863 } else {
864 link = &(*link)->rb_right;
865 leftmost = 0;
866 }
867 }
868
869 /*
870 * Insert the timer to the rbtree and check whether it
871 * replaces the first pending timer
872 */
873 if (leftmost)
874 base->first = &timer->node;
875 849
876 rb_link_node(&timer->node, parent, link);
877 rb_insert_color(&timer->node, &base->active);
878 /* 850 /*
879 * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the 851 * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the
880 * state of a possibly running callback. 852 * state of a possibly running callback.
881 */ 853 */
882 timer->state |= HRTIMER_STATE_ENQUEUED; 854 timer->state |= HRTIMER_STATE_ENQUEUED;
883 855
884 return leftmost; 856 return (&timer->node == base->active.next);
885} 857}
886 858
887/* 859/*
@@ -901,12 +873,7 @@ static void __remove_hrtimer(struct hrtimer *timer,
901 if (!(timer->state & HRTIMER_STATE_ENQUEUED)) 873 if (!(timer->state & HRTIMER_STATE_ENQUEUED))
902 goto out; 874 goto out;
903 875
904 /* 876 if (&timer->node == timerqueue_getnext(&base->active)) {
905 * Remove the timer from the rbtree and replace the first
906 * entry pointer if necessary.
907 */
908 if (base->first == &timer->node) {
909 base->first = rb_next(&timer->node);
910#ifdef CONFIG_HIGH_RES_TIMERS 877#ifdef CONFIG_HIGH_RES_TIMERS
911 /* Reprogram the clock event device. if enabled */ 878 /* Reprogram the clock event device. if enabled */
912 if (reprogram && hrtimer_hres_active()) { 879 if (reprogram && hrtimer_hres_active()) {
@@ -919,7 +886,7 @@ static void __remove_hrtimer(struct hrtimer *timer,
919 } 886 }
920#endif 887#endif
921 } 888 }
922 rb_erase(&timer->node, &base->active); 889 timerqueue_del(&base->active, &timer->node);
923out: 890out:
924 timer->state = newstate; 891 timer->state = newstate;
925} 892}
@@ -1128,11 +1095,13 @@ ktime_t hrtimer_get_next_event(void)
1128 if (!hrtimer_hres_active()) { 1095 if (!hrtimer_hres_active()) {
1129 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { 1096 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
1130 struct hrtimer *timer; 1097 struct hrtimer *timer;
1098 struct timerqueue_node *next;
1131 1099
1132 if (!base->first) 1100 next = timerqueue_getnext(&base->active);
1101 if (!next)
1133 continue; 1102 continue;
1134 1103
1135 timer = rb_entry(base->first, struct hrtimer, node); 1104 timer = container_of(next, struct hrtimer, node);
1136 delta.tv64 = hrtimer_get_expires_tv64(timer); 1105 delta.tv64 = hrtimer_get_expires_tv64(timer);
1137 delta = ktime_sub(delta, base->get_time()); 1106 delta = ktime_sub(delta, base->get_time());
1138 if (delta.tv64 < mindelta.tv64) 1107 if (delta.tv64 < mindelta.tv64)
@@ -1162,6 +1131,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1162 1131
1163 timer->base = &cpu_base->clock_base[clock_id]; 1132 timer->base = &cpu_base->clock_base[clock_id];
1164 hrtimer_init_timer_hres(timer); 1133 hrtimer_init_timer_hres(timer);
1134 timerqueue_init(&timer->node);
1165 1135
1166#ifdef CONFIG_TIMER_STATS 1136#ifdef CONFIG_TIMER_STATS
1167 timer->start_site = NULL; 1137 timer->start_site = NULL;
@@ -1278,14 +1248,14 @@ retry:
1278 1248
1279 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { 1249 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1280 ktime_t basenow; 1250 ktime_t basenow;
1281 struct rb_node *node; 1251 struct timerqueue_node *node;
1282 1252
1283 basenow = ktime_add(now, base->offset); 1253 basenow = ktime_add(now, base->offset);
1284 1254
1285 while ((node = base->first)) { 1255 while ((node = timerqueue_getnext(&base->active))) {
1286 struct hrtimer *timer; 1256 struct hrtimer *timer;
1287 1257
1288 timer = rb_entry(node, struct hrtimer, node); 1258 timer = container_of(node, struct hrtimer, node);
1289 1259
1290 /* 1260 /*
1291 * The immediate goal for using the softexpires is 1261 * The immediate goal for using the softexpires is
@@ -1441,7 +1411,7 @@ void hrtimer_run_pending(void)
1441 */ 1411 */
1442void hrtimer_run_queues(void) 1412void hrtimer_run_queues(void)
1443{ 1413{
1444 struct rb_node *node; 1414 struct timerqueue_node *node;
1445 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); 1415 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1446 struct hrtimer_clock_base *base; 1416 struct hrtimer_clock_base *base;
1447 int index, gettime = 1; 1417 int index, gettime = 1;
@@ -1451,8 +1421,7 @@ void hrtimer_run_queues(void)
1451 1421
1452 for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) { 1422 for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) {
1453 base = &cpu_base->clock_base[index]; 1423 base = &cpu_base->clock_base[index];
1454 1424 if (!timerqueue_getnext(&base->active))
1455 if (!base->first)
1456 continue; 1425 continue;
1457 1426
1458 if (gettime) { 1427 if (gettime) {
@@ -1462,10 +1431,10 @@ void hrtimer_run_queues(void)
1462 1431
1463 raw_spin_lock(&cpu_base->lock); 1432 raw_spin_lock(&cpu_base->lock);
1464 1433
1465 while ((node = base->first)) { 1434 while ((node = timerqueue_getnext(&base->active))) {
1466 struct hrtimer *timer; 1435 struct hrtimer *timer;
1467 1436
1468 timer = rb_entry(node, struct hrtimer, node); 1437 timer = container_of(node, struct hrtimer, node);
1469 if (base->softirq_time.tv64 <= 1438 if (base->softirq_time.tv64 <=
1470 hrtimer_get_expires_tv64(timer)) 1439 hrtimer_get_expires_tv64(timer))
1471 break; 1440 break;
@@ -1630,8 +1599,10 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
1630 1599
1631 raw_spin_lock_init(&cpu_base->lock); 1600 raw_spin_lock_init(&cpu_base->lock);
1632 1601
1633 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) 1602 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1634 cpu_base->clock_base[i].cpu_base = cpu_base; 1603 cpu_base->clock_base[i].cpu_base = cpu_base;
1604 timerqueue_init_head(&cpu_base->clock_base[i].active);
1605 }
1635 1606
1636 hrtimer_init_hres(cpu_base); 1607 hrtimer_init_hres(cpu_base);
1637} 1608}
@@ -1642,10 +1613,10 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
1642 struct hrtimer_clock_base *new_base) 1613 struct hrtimer_clock_base *new_base)
1643{ 1614{
1644 struct hrtimer *timer; 1615 struct hrtimer *timer;
1645 struct rb_node *node; 1616 struct timerqueue_node *node;
1646 1617
1647 while ((node = rb_first(&old_base->active))) { 1618 while ((node = timerqueue_getnext(&old_base->active))) {
1648 timer = rb_entry(node, struct hrtimer, node); 1619 timer = container_of(node, struct hrtimer, node);
1649 BUG_ON(hrtimer_callback_running(timer)); 1620 BUG_ON(hrtimer_callback_running(timer));
1650 debug_deactivate(timer); 1621 debug_deactivate(timer);
1651 1622
@@ -1774,7 +1745,7 @@ schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta,
1774 } 1745 }
1775 1746
1776 /* 1747 /*
1777 * A NULL parameter means "inifinte" 1748 * A NULL parameter means "infinite"
1778 */ 1749 */
1779 if (!expires) { 1750 if (!expires) {
1780 schedule(); 1751 schedule();
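
The hrtimer changes above drop the open-coded rbtree bookkeeping (base->first plus rb_link_node()/rb_insert_color()) in favour of the new timerqueue helpers: timerqueue_add()/timerqueue_del() keep the tree sorted by node->expires and cache the leftmost (earliest) node, so callers just ask timerqueue_getnext() and recover the embedding structure with container_of(). A minimal sketch of that pattern; struct my_timer and the demo_* functions are illustrative, not kernel code:

#include <linux/kernel.h>
#include <linux/ktime.h>
#include <linux/timerqueue.h>

struct my_timer {
        struct timerqueue_node node;    /* holds the rb_node and the expiry time */
        void (*fn)(struct my_timer *);
};

static struct timerqueue_head demo_queue;

static void demo_queue_setup(void)
{
        timerqueue_init_head(&demo_queue);
}

static void demo_timer_add(struct my_timer *t, ktime_t expires)
{
        timerqueue_init(&t->node);
        t->node.expires = expires;
        timerqueue_add(&demo_queue, &t->node);   /* insert and update the cached leftmost */
}

static struct my_timer *demo_timer_peek(void)
{
        struct timerqueue_node *next = timerqueue_getnext(&demo_queue);

        return next ? container_of(next, struct my_timer, node) : NULL;
}

static void demo_timer_del(struct my_timer *t)
{
        timerqueue_del(&demo_queue, &t->node);   /* also refreshes the cached leftmost */
}
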
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index 2c9120f0afca..086adf25a55e 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -620,7 +620,7 @@ static struct pmu perf_breakpoint = {
620 .read = hw_breakpoint_pmu_read, 620 .read = hw_breakpoint_pmu_read,
621}; 621};
622 622
623static int __init init_hw_breakpoint(void) 623int __init init_hw_breakpoint(void)
624{ 624{
625 unsigned int **task_bp_pinned; 625 unsigned int **task_bp_pinned;
626 int cpu, err_cpu; 626 int cpu, err_cpu;
@@ -641,7 +641,7 @@ static int __init init_hw_breakpoint(void)
641 641
642 constraints_initialized = 1; 642 constraints_initialized = 1;
643 643
644 perf_pmu_register(&perf_breakpoint); 644 perf_pmu_register(&perf_breakpoint, "breakpoint", PERF_TYPE_BREAKPOINT);
645 645
646 return register_die_notifier(&hw_breakpoint_exceptions_nb); 646 return register_die_notifier(&hw_breakpoint_exceptions_nb);
647 647
@@ -655,6 +655,5 @@ static int __init init_hw_breakpoint(void)
655 655
656 return -ENOMEM; 656 return -ENOMEM;
657} 657}
658core_initcall(init_hw_breakpoint);
659 658
660 659
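
Two things change above: init_hw_breakpoint() loses its core_initcall and its static so the perf core can call it directly during its own init (the caller sits outside this hunk), and perf_pmu_register() now takes a name and a type id. A sketch of the new registration call shape; demo_pmu is illustrative, and a real PMU must fill in .event_init/.add/.del/.start/.stop/.read as the breakpoint PMU does:

#include <linux/init.h>
#include <linux/perf_event.h>

static struct pmu demo_pmu = {
        .task_ctx_nr = perf_sw_context,
        /* .event_init, .add, .del, .start, .stop, .read must be set for real use */
};

static int __init demo_pmu_init(void)
{
        /* A negative type asks the core to allocate a dynamic PERF_TYPE_* id. */
        return perf_pmu_register(&demo_pmu, "demo", -1);
}
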
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 31d766bf5d2e..8e42fec7686d 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -9,9 +9,6 @@ menu "IRQ subsystem"
9config GENERIC_HARDIRQS 9config GENERIC_HARDIRQS
10 def_bool y 10 def_bool y
11 11
12config GENERIC_HARDIRQS_NO__DO_IRQ
13 def_bool y
14
15# Select this to disable the deprecated stuff 12# Select this to disable the deprecated stuff
16config GENERIC_HARDIRQS_NO_DEPRECATED 13config GENERIC_HARDIRQS_NO_DEPRECATED
17 def_bool n 14 def_bool n
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index e2347eb63306..3540a7190122 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -118,114 +118,3 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
118 118
119 return retval; 119 return retval;
120} 120}
121
122#ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ
123
124#ifdef CONFIG_ENABLE_WARN_DEPRECATED
125# warning __do_IRQ is deprecated. Please convert to proper flow handlers
126#endif
127
128/**
129 * __do_IRQ - original all in one highlevel IRQ handler
130 * @irq: the interrupt number
131 *
132 * __do_IRQ handles all normal device IRQ's (the special
133 * SMP cross-CPU interrupts have their own specific
134 * handlers).
135 *
136 * This is the original x86 implementation which is used for every
137 * interrupt type.
138 */
139unsigned int __do_IRQ(unsigned int irq)
140{
141 struct irq_desc *desc = irq_to_desc(irq);
142 struct irqaction *action;
143 unsigned int status;
144
145 kstat_incr_irqs_this_cpu(irq, desc);
146
147 if (CHECK_IRQ_PER_CPU(desc->status)) {
148 irqreturn_t action_ret;
149
150 /*
151 * No locking required for CPU-local interrupts:
152 */
153 if (desc->irq_data.chip->ack)
154 desc->irq_data.chip->ack(irq);
155 if (likely(!(desc->status & IRQ_DISABLED))) {
156 action_ret = handle_IRQ_event(irq, desc->action);
157 if (!noirqdebug)
158 note_interrupt(irq, desc, action_ret);
159 }
160 desc->irq_data.chip->end(irq);
161 return 1;
162 }
163
164 raw_spin_lock(&desc->lock);
165 if (desc->irq_data.chip->ack)
166 desc->irq_data.chip->ack(irq);
167 /*
168 * REPLAY is when Linux resends an IRQ that was dropped earlier
169 * WAITING is used by probe to mark irqs that are being tested
170 */
171 status = desc->status & ~(IRQ_REPLAY | IRQ_WAITING);
172 status |= IRQ_PENDING; /* we _want_ to handle it */
173
174 /*
175 * If the IRQ is disabled for whatever reason, we cannot
176 * use the action we have.
177 */
178 action = NULL;
179 if (likely(!(status & (IRQ_DISABLED | IRQ_INPROGRESS)))) {
180 action = desc->action;
181 status &= ~IRQ_PENDING; /* we commit to handling */
182 status |= IRQ_INPROGRESS; /* we are handling it */
183 }
184 desc->status = status;
185
186 /*
187 * If there is no IRQ handler or it was disabled, exit early.
188 * Since we set PENDING, if another processor is handling
189 * a different instance of this same irq, the other processor
190 * will take care of it.
191 */
192 if (unlikely(!action))
193 goto out;
194
195 /*
196 * Edge triggered interrupts need to remember
197 * pending events.
198 * This applies to any hw interrupts that allow a second
199 * instance of the same irq to arrive while we are in do_IRQ
200 * or in the handler. But the code here only handles the _second_
201 * instance of the irq, not the third or fourth. So it is mostly
202 * useful for irq hardware that does not mask cleanly in an
203 * SMP environment.
204 */
205 for (;;) {
206 irqreturn_t action_ret;
207
208 raw_spin_unlock(&desc->lock);
209
210 action_ret = handle_IRQ_event(irq, action);
211 if (!noirqdebug)
212 note_interrupt(irq, desc, action_ret);
213
214 raw_spin_lock(&desc->lock);
215 if (likely(!(desc->status & IRQ_PENDING)))
216 break;
217 desc->status &= ~IRQ_PENDING;
218 }
219 desc->status &= ~IRQ_INPROGRESS;
220
221out:
222 /*
223 * The ->end() handler has to deal with interrupts which got
224 * disabled while the handler was running.
225 */
226 desc->irq_data.chip->end(irq);
227 raw_spin_unlock(&desc->lock);
228
229 return 1;
230}
231#endif
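
This removes the legacy __do_IRQ() path entirely, together with the GENERIC_HARDIRQS_NO__DO_IRQ option above: every architecture is now expected to use per-descriptor flow handlers. A rough sketch of that model with an illustrative stub chip; note the setter was renamed to irq_set_chip_and_handler() shortly after this release, so treat the exact helper name as version-dependent:

#include <linux/irq.h>
#include <linux/interrupt.h>

static struct irq_chip demo_chip = {
        .name = "demo",
        /* a real chip supplies .irq_mask/.irq_unmask/.irq_ack for the controller hardware */
};

static void demo_setup_irq(unsigned int irq)
{
        /* Pick the flow handler matching the line's trigger semantics. */
        set_irq_chip_and_handler(irq, &demo_chip, handle_level_irq);
}

/* The arch's interrupt entry code hands the decoded irq number to the core. */
static void demo_arch_entry(unsigned int irq)
{
        generic_handle_irq(irq);        /* runs the per-descriptor flow handler */
}
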
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 9988d03797f5..282f20230e67 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -72,6 +72,8 @@ static inline int desc_node(struct irq_desc *desc) { return 0; }
72 72
73static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) 73static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node)
74{ 74{
75 int cpu;
76
75 desc->irq_data.irq = irq; 77 desc->irq_data.irq = irq;
76 desc->irq_data.chip = &no_irq_chip; 78 desc->irq_data.chip = &no_irq_chip;
77 desc->irq_data.chip_data = NULL; 79 desc->irq_data.chip_data = NULL;
@@ -83,7 +85,8 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node)
83 desc->irq_count = 0; 85 desc->irq_count = 0;
84 desc->irqs_unhandled = 0; 86 desc->irqs_unhandled = 0;
85 desc->name = NULL; 87 desc->name = NULL;
86 memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs))); 88 for_each_possible_cpu(cpu)
89 *per_cpu_ptr(desc->kstat_irqs, cpu) = 0;
87 desc_smp_init(desc, node); 90 desc_smp_init(desc, node);
88} 91}
89 92
@@ -133,8 +136,7 @@ static struct irq_desc *alloc_desc(int irq, int node)
133 if (!desc) 136 if (!desc)
134 return NULL; 137 return NULL;
135 /* allocate based on nr_cpu_ids */ 138 /* allocate based on nr_cpu_ids */
136 desc->kstat_irqs = kzalloc_node(nr_cpu_ids * sizeof(*desc->kstat_irqs), 139 desc->kstat_irqs = alloc_percpu(unsigned int);
137 gfp, node);
138 if (!desc->kstat_irqs) 140 if (!desc->kstat_irqs)
139 goto err_desc; 141 goto err_desc;
140 142
@@ -149,7 +151,7 @@ static struct irq_desc *alloc_desc(int irq, int node)
149 return desc; 151 return desc;
150 152
151err_kstat: 153err_kstat:
152 kfree(desc->kstat_irqs); 154 free_percpu(desc->kstat_irqs);
153err_desc: 155err_desc:
154 kfree(desc); 156 kfree(desc);
155 return NULL; 157 return NULL;
@@ -166,7 +168,7 @@ static void free_desc(unsigned int irq)
166 mutex_unlock(&sparse_irq_lock); 168 mutex_unlock(&sparse_irq_lock);
167 169
168 free_masks(desc); 170 free_masks(desc);
169 kfree(desc->kstat_irqs); 171 free_percpu(desc->kstat_irqs);
170 kfree(desc); 172 kfree(desc);
171} 173}
172 174
@@ -234,7 +236,6 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
234 } 236 }
235}; 237};
236 238
237static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS];
238int __init early_irq_init(void) 239int __init early_irq_init(void)
239{ 240{
240 int count, i, node = first_online_node; 241 int count, i, node = first_online_node;
@@ -250,7 +251,8 @@ int __init early_irq_init(void)
250 for (i = 0; i < count; i++) { 251 for (i = 0; i < count; i++) {
251 desc[i].irq_data.irq = i; 252 desc[i].irq_data.irq = i;
252 desc[i].irq_data.chip = &no_irq_chip; 253 desc[i].irq_data.chip = &no_irq_chip;
253 desc[i].kstat_irqs = kstat_irqs_all[i]; 254 /* TODO : do this allocation on-demand ... */
255 desc[i].kstat_irqs = alloc_percpu(unsigned int);
254 alloc_masks(desc + i, GFP_KERNEL, node); 256 alloc_masks(desc + i, GFP_KERNEL, node);
255 desc_smp_init(desc + i, node); 257 desc_smp_init(desc + i, node);
256 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); 258 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
@@ -275,6 +277,22 @@ static void free_desc(unsigned int irq)
275 277
276static inline int alloc_descs(unsigned int start, unsigned int cnt, int node) 278static inline int alloc_descs(unsigned int start, unsigned int cnt, int node)
277{ 279{
280#if defined(CONFIG_KSTAT_IRQS_ONDEMAND)
281 struct irq_desc *desc;
282 unsigned int i;
283
284 for (i = 0; i < cnt; i++) {
285 desc = irq_to_desc(start + i);
286 if (desc && !desc->kstat_irqs) {
287 unsigned int __percpu *stats = alloc_percpu(unsigned int);
288
289 if (!stats)
290 return -1;
291 if (cmpxchg(&desc->kstat_irqs, NULL, stats) != NULL)
292 free_percpu(stats);
293 }
294 }
295#endif
278 return start; 296 return start;
279} 297}
280#endif /* !CONFIG_SPARSE_IRQ */ 298#endif /* !CONFIG_SPARSE_IRQ */
@@ -391,7 +409,9 @@ void dynamic_irq_cleanup(unsigned int irq)
391unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) 409unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
392{ 410{
393 struct irq_desc *desc = irq_to_desc(irq); 411 struct irq_desc *desc = irq_to_desc(irq);
394 return desc ? desc->kstat_irqs[cpu] : 0; 412
413 return desc && desc->kstat_irqs ?
414 *per_cpu_ptr(desc->kstat_irqs, cpu) : 0;
395} 415}
396 416
397#ifdef CONFIG_GENERIC_HARDIRQS 417#ifdef CONFIG_GENERIC_HARDIRQS
@@ -401,10 +421,10 @@ unsigned int kstat_irqs(unsigned int irq)
401 int cpu; 421 int cpu;
402 int sum = 0; 422 int sum = 0;
403 423
404 if (!desc) 424 if (!desc || !desc->kstat_irqs)
405 return 0; 425 return 0;
406 for_each_possible_cpu(cpu) 426 for_each_possible_cpu(cpu)
407 sum += desc->kstat_irqs[cpu]; 427 sum += *per_cpu_ptr(desc->kstat_irqs, cpu);
408 return sum; 428 return sum;
409} 429}
410#endif /* CONFIG_GENERIC_HARDIRQS */ 430#endif /* CONFIG_GENERIC_HARDIRQS */
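
The irqdesc changes above convert desc->kstat_irqs from a flat nr_cpu_ids array (and the static NR_IRQS x NR_CPUS table) into a dynamically allocated per-CPU counter: alloc_percpu() on creation, __this_cpu_inc() on the hot path, per_cpu_ptr() plus for_each_possible_cpu() when summing, free_percpu() on teardown. A self-contained sketch of that pattern with illustrative names:

#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/percpu.h>

static unsigned int __percpu *demo_stat;

static int demo_stat_init(void)
{
        demo_stat = alloc_percpu(unsigned int);
        return demo_stat ? 0 : -ENOMEM;
}

/* Hot path: bump this CPU's slot without touching other CPUs' cache lines. */
static void demo_stat_inc(void)
{
        __this_cpu_inc(*demo_stat);
}

/* Slow path (e.g. /proc output): fold every possible CPU into one total. */
static unsigned int demo_stat_sum(void)
{
        unsigned int cpu, sum = 0;

        for_each_possible_cpu(cpu)
                sum += *per_cpu_ptr(demo_stat, cpu);
        return sum;
}

static void demo_stat_exit(void)
{
        free_percpu(demo_stat);
}
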
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 5f92acc5f952..0caa59f747dd 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -577,7 +577,9 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { }
577 */ 577 */
578static int irq_thread(void *data) 578static int irq_thread(void *data)
579{ 579{
580 struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, }; 580 static const struct sched_param param = {
581 .sched_priority = MAX_USER_RT_PRIO/2,
582 };
581 struct irqaction *action = data; 583 struct irqaction *action = data;
582 struct irq_desc *desc = irq_to_desc(action->irq); 584 struct irq_desc *desc = irq_to_desc(action->irq);
583 int wake, oneshot = desc->status & IRQ_ONESHOT; 585 int wake, oneshot = desc->status & IRQ_ONESHOT;
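
The change above turns irq_thread()'s scheduling parameter into a static const object, so it lives in rodata instead of being rebuilt on each thread's stack. The same idea in isolation; demo_make_rt() is illustrative, and the sched_setscheduler() call that irq_thread() goes on to make sits outside this hunk:

#include <linux/sched.h>

static const struct sched_param demo_param = {
        .sched_priority = MAX_USER_RT_PRIO / 2,
};

static int demo_make_rt(struct task_struct *task)
{
        return sched_setscheduler(task, SCHED_FIFO, &demo_param);
}
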
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 01b1d3a88983..6c8a2a9f8a7b 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -214,7 +214,7 @@ static int irq_spurious_proc_show(struct seq_file *m, void *v)
214 214
215static int irq_spurious_proc_open(struct inode *inode, struct file *file) 215static int irq_spurious_proc_open(struct inode *inode, struct file *file)
216{ 216{
217 return single_open(file, irq_spurious_proc_show, NULL); 217 return single_open(file, irq_spurious_proc_show, PDE(inode)->data);
218} 218}
219 219
220static const struct file_operations irq_spurious_proc_fops = { 220static const struct file_operations irq_spurious_proc_fops = {
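
The fix above passes the per-IRQ data pointer stored at proc_create_data() time into single_open(), instead of NULL, so the show routine knows which interrupt it is printing. The seq_file plumbing it relies on, with illustrative names:

#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static int demo_proc_show(struct seq_file *m, void *v)
{
        long irq = (long)m->private;    /* whatever was handed to single_open() */

        seq_printf(m, "irq %ld\n", irq);
        return 0;
}

static int demo_proc_open(struct inode *inode, struct file *file)
{
        /* PDE(inode)->data was set by the matching proc_create_data() call. */
        return single_open(file, demo_proc_show, PDE(inode)->data);
}
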
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index f16763ff8481..c58fa7da8aef 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -77,21 +77,21 @@ void __weak arch_irq_work_raise(void)
77 */ 77 */
78static void __irq_work_queue(struct irq_work *entry) 78static void __irq_work_queue(struct irq_work *entry)
79{ 79{
80 struct irq_work **head, *next; 80 struct irq_work *next;
81 81
82 head = &get_cpu_var(irq_work_list); 82 preempt_disable();
83 83
84 do { 84 do {
85 next = *head; 85 next = __this_cpu_read(irq_work_list);
86 /* Can assign non-atomic because we keep the flags set. */ 86 /* Can assign non-atomic because we keep the flags set. */
87 entry->next = next_flags(next, IRQ_WORK_FLAGS); 87 entry->next = next_flags(next, IRQ_WORK_FLAGS);
88 } while (cmpxchg(head, next, entry) != next); 88 } while (this_cpu_cmpxchg(irq_work_list, next, entry) != next);
89 89
90 /* The list was empty, raise self-interrupt to start processing. */ 90 /* The list was empty, raise self-interrupt to start processing. */
91 if (!irq_work_next(entry)) 91 if (!irq_work_next(entry))
92 arch_irq_work_raise(); 92 arch_irq_work_raise();
93 93
94 put_cpu_var(irq_work_list); 94 preempt_enable();
95} 95}
96 96
97/* 97/*
@@ -120,16 +120,16 @@ EXPORT_SYMBOL_GPL(irq_work_queue);
120 */ 120 */
121void irq_work_run(void) 121void irq_work_run(void)
122{ 122{
123 struct irq_work *list, **head; 123 struct irq_work *list;
124 124
125 head = &__get_cpu_var(irq_work_list); 125 if (this_cpu_read(irq_work_list) == NULL)
126 if (*head == NULL)
127 return; 126 return;
128 127
129 BUG_ON(!in_irq()); 128 BUG_ON(!in_irq());
130 BUG_ON(!irqs_disabled()); 129 BUG_ON(!irqs_disabled());
131 130
132 list = xchg(head, NULL); 131 list = this_cpu_xchg(irq_work_list, NULL);
132
133 while (list != NULL) { 133 while (list != NULL) {
134 struct irq_work *entry = list; 134 struct irq_work *entry = list;
135 135
@@ -145,7 +145,9 @@ void irq_work_run(void)
145 * Clear the BUSY bit and return to the free state if 145 * Clear the BUSY bit and return to the free state if
146 * no-one else claimed it meanwhile. 146 * no-one else claimed it meanwhile.
147 */ 147 */
148 cmpxchg(&entry->next, next_flags(NULL, IRQ_WORK_BUSY), NULL); 148 (void)cmpxchg(&entry->next,
149 next_flags(NULL, IRQ_WORK_BUSY),
150 NULL);
149 } 151 }
150} 152}
151EXPORT_SYMBOL_GPL(irq_work_run); 153EXPORT_SYMBOL_GPL(irq_work_run);
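
irq_work now manipulates its per-CPU list head with this_cpu_read()/this_cpu_cmpxchg()/this_cpu_xchg() under preempt_disable() rather than taking the variable's address via get_cpu_var(). The core pattern, on an illustrative per-CPU stack (this is not the kernel's irq_work list, and it omits the flag bits packed into the pointers):

#include <linux/percpu.h>
#include <linux/preempt.h>

struct demo_work {
        struct demo_work *next;
};

static DEFINE_PER_CPU(struct demo_work *, demo_list);

/* Push onto this CPU's list; the cmpxchg guards against local interrupt handlers. */
static void demo_push(struct demo_work *w)
{
        struct demo_work *head;

        preempt_disable();
        do {
                head = __this_cpu_read(demo_list);
                w->next = head;
        } while (this_cpu_cmpxchg(demo_list, head, w) != head);
        preempt_enable();
}

/* Detach the whole list in one shot, as irq_work_run() does with this_cpu_xchg(). */
static struct demo_work *demo_pop_all(void)
{
        return this_cpu_xchg(demo_list, NULL);
}
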
diff --git a/kernel/kexec.c b/kernel/kexec.c
index b55045bc7563..ec19b92c7ebd 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -163,7 +163,7 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
163 * just verifies it is an address we can use. 163 * just verifies it is an address we can use.
164 * 164 *
165 * Since the kernel does everything in page size chunks ensure 165 * Since the kernel does everything in page size chunks ensure
166 * the destination addreses are page aligned. Too many 166 * the destination addresses are page aligned. Too many
167 * special cases crop of when we don't do this. The most 167 * special cases crop of when we don't do this. The most
168 * insidious is getting overlapping destination addresses 168 * insidious is getting overlapping destination addresses
169 * simply because addresses are changed to page size 169 * simply because addresses are changed to page size
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 9737a76e106f..77981813a1e7 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -317,12 +317,12 @@ void __kprobes free_optinsn_slot(kprobe_opcode_t * slot, int dirty)
317/* We have preemption disabled.. so it is safe to use __ versions */ 317/* We have preemption disabled.. so it is safe to use __ versions */
318static inline void set_kprobe_instance(struct kprobe *kp) 318static inline void set_kprobe_instance(struct kprobe *kp)
319{ 319{
320 __get_cpu_var(kprobe_instance) = kp; 320 __this_cpu_write(kprobe_instance, kp);
321} 321}
322 322
323static inline void reset_kprobe_instance(void) 323static inline void reset_kprobe_instance(void)
324{ 324{
325 __get_cpu_var(kprobe_instance) = NULL; 325 __this_cpu_write(kprobe_instance, NULL);
326} 326}
327 327
328/* 328/*
@@ -354,13 +354,20 @@ static inline int kprobe_aggrprobe(struct kprobe *p)
354 return p->pre_handler == aggr_pre_handler; 354 return p->pre_handler == aggr_pre_handler;
355} 355}
356 356
357/* Return true(!0) if the kprobe is unused */
358static inline int kprobe_unused(struct kprobe *p)
359{
360 return kprobe_aggrprobe(p) && kprobe_disabled(p) &&
361 list_empty(&p->list);
362}
363
357/* 364/*
358 * Keep all fields in the kprobe consistent 365 * Keep all fields in the kprobe consistent
359 */ 366 */
360static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p) 367static inline void copy_kprobe(struct kprobe *ap, struct kprobe *p)
361{ 368{
362 memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t)); 369 memcpy(&p->opcode, &ap->opcode, sizeof(kprobe_opcode_t));
363 memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn)); 370 memcpy(&p->ainsn, &ap->ainsn, sizeof(struct arch_specific_insn));
364} 371}
365 372
366#ifdef CONFIG_OPTPROBES 373#ifdef CONFIG_OPTPROBES
@@ -384,6 +391,17 @@ void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs)
384 } 391 }
385} 392}
386 393
394/* Free optimized instructions and optimized_kprobe */
395static __kprobes void free_aggr_kprobe(struct kprobe *p)
396{
397 struct optimized_kprobe *op;
398
399 op = container_of(p, struct optimized_kprobe, kp);
400 arch_remove_optimized_kprobe(op);
401 arch_remove_kprobe(p);
402 kfree(op);
403}
404
387/* Return true(!0) if the kprobe is ready for optimization. */ 405/* Return true(!0) if the kprobe is ready for optimization. */
388static inline int kprobe_optready(struct kprobe *p) 406static inline int kprobe_optready(struct kprobe *p)
389{ 407{
@@ -397,6 +415,33 @@ static inline int kprobe_optready(struct kprobe *p)
397 return 0; 415 return 0;
398} 416}
399 417
418/* Return true(!0) if the kprobe is disarmed. Note: p must be on hash list */
419static inline int kprobe_disarmed(struct kprobe *p)
420{
421 struct optimized_kprobe *op;
422
423 /* If kprobe is not aggr/opt probe, just return kprobe is disabled */
424 if (!kprobe_aggrprobe(p))
425 return kprobe_disabled(p);
426
427 op = container_of(p, struct optimized_kprobe, kp);
428
429 return kprobe_disabled(p) && list_empty(&op->list);
430}
431
432/* Return true(!0) if the probe is queued on (un)optimizing lists */
433static int __kprobes kprobe_queued(struct kprobe *p)
434{
435 struct optimized_kprobe *op;
436
437 if (kprobe_aggrprobe(p)) {
438 op = container_of(p, struct optimized_kprobe, kp);
439 if (!list_empty(&op->list))
440 return 1;
441 }
442 return 0;
443}
444
400/* 445/*
401 * Return an optimized kprobe whose optimizing code replaces 446 * Return an optimized kprobe whose optimizing code replaces
402 * instructions including addr (exclude breakpoint). 447 * instructions including addr (exclude breakpoint).
@@ -422,30 +467,23 @@ static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr)
422 467
423/* Optimization staging list, protected by kprobe_mutex */ 468/* Optimization staging list, protected by kprobe_mutex */
424static LIST_HEAD(optimizing_list); 469static LIST_HEAD(optimizing_list);
470static LIST_HEAD(unoptimizing_list);
425 471
426static void kprobe_optimizer(struct work_struct *work); 472static void kprobe_optimizer(struct work_struct *work);
427static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); 473static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
474static DECLARE_COMPLETION(optimizer_comp);
428#define OPTIMIZE_DELAY 5 475#define OPTIMIZE_DELAY 5
429 476
430/* Kprobe jump optimizer */ 477/*
431static __kprobes void kprobe_optimizer(struct work_struct *work) 478 * Optimize (replace a breakpoint with a jump) kprobes listed on
479 * optimizing_list.
480 */
481static __kprobes void do_optimize_kprobes(void)
432{ 482{
433 struct optimized_kprobe *op, *tmp; 483 /* Optimization never be done when disarmed */
434 484 if (kprobes_all_disarmed || !kprobes_allow_optimization ||
435 /* Lock modules while optimizing kprobes */ 485 list_empty(&optimizing_list))
436 mutex_lock(&module_mutex); 486 return;
437 mutex_lock(&kprobe_mutex);
438 if (kprobes_all_disarmed || !kprobes_allow_optimization)
439 goto end;
440
441 /*
442 * Wait for quiesence period to ensure all running interrupts
443 * are done. Because optprobe may modify multiple instructions
444 * there is a chance that Nth instruction is interrupted. In that
445 * case, running interrupt can return to 2nd-Nth byte of jump
446 * instruction. This wait is for avoiding it.
447 */
448 synchronize_sched();
449 487
450 /* 488 /*
451 * The optimization/unoptimization refers online_cpus via 489 * The optimization/unoptimization refers online_cpus via
@@ -459,17 +497,111 @@ static __kprobes void kprobe_optimizer(struct work_struct *work)
459 */ 497 */
460 get_online_cpus(); 498 get_online_cpus();
461 mutex_lock(&text_mutex); 499 mutex_lock(&text_mutex);
462 list_for_each_entry_safe(op, tmp, &optimizing_list, list) { 500 arch_optimize_kprobes(&optimizing_list);
463 WARN_ON(kprobe_disabled(&op->kp)); 501 mutex_unlock(&text_mutex);
464 if (arch_optimize_kprobe(op) < 0) 502 put_online_cpus();
465 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; 503}
466 list_del_init(&op->list); 504
505/*
506 * Unoptimize (replace a jump with a breakpoint and remove the breakpoint
507 * if need) kprobes listed on unoptimizing_list.
508 */
509static __kprobes void do_unoptimize_kprobes(struct list_head *free_list)
510{
511 struct optimized_kprobe *op, *tmp;
512
513 /* Unoptimization must be done anytime */
514 if (list_empty(&unoptimizing_list))
515 return;
516
517 /* Ditto to do_optimize_kprobes */
518 get_online_cpus();
519 mutex_lock(&text_mutex);
520 arch_unoptimize_kprobes(&unoptimizing_list, free_list);
521 /* Loop free_list for disarming */
522 list_for_each_entry_safe(op, tmp, free_list, list) {
523 /* Disarm probes if marked disabled */
524 if (kprobe_disabled(&op->kp))
525 arch_disarm_kprobe(&op->kp);
526 if (kprobe_unused(&op->kp)) {
527 /*
528 * Remove unused probes from hash list. After waiting
529 * for synchronization, these probes are reclaimed.
530 * (reclaiming is done by do_free_cleaned_kprobes.)
531 */
532 hlist_del_rcu(&op->kp.hlist);
533 } else
534 list_del_init(&op->list);
467 } 535 }
468 mutex_unlock(&text_mutex); 536 mutex_unlock(&text_mutex);
469 put_online_cpus(); 537 put_online_cpus();
470end: 538}
539
540/* Reclaim all kprobes on the free_list */
541static __kprobes void do_free_cleaned_kprobes(struct list_head *free_list)
542{
543 struct optimized_kprobe *op, *tmp;
544
545 list_for_each_entry_safe(op, tmp, free_list, list) {
546 BUG_ON(!kprobe_unused(&op->kp));
547 list_del_init(&op->list);
548 free_aggr_kprobe(&op->kp);
549 }
550}
551
552/* Start optimizer after OPTIMIZE_DELAY passed */
553static __kprobes void kick_kprobe_optimizer(void)
554{
555 if (!delayed_work_pending(&optimizing_work))
556 schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
557}
558
559/* Kprobe jump optimizer */
560static __kprobes void kprobe_optimizer(struct work_struct *work)
561{
562 LIST_HEAD(free_list);
563
564 /* Lock modules while optimizing kprobes */
565 mutex_lock(&module_mutex);
566 mutex_lock(&kprobe_mutex);
567
568 /*
569 * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed)
570 * kprobes before waiting for quiesence period.
571 */
572 do_unoptimize_kprobes(&free_list);
573
574 /*
575 * Step 2: Wait for quiesence period to ensure all running interrupts
576 * are done. Because optprobe may modify multiple instructions
577 * there is a chance that Nth instruction is interrupted. In that
578 * case, running interrupt can return to 2nd-Nth byte of jump
579 * instruction. This wait is for avoiding it.
580 */
581 synchronize_sched();
582
583 /* Step 3: Optimize kprobes after quiesence period */
584 do_optimize_kprobes();
585
586 /* Step 4: Free cleaned kprobes after quiesence period */
587 do_free_cleaned_kprobes(&free_list);
588
471 mutex_unlock(&kprobe_mutex); 589 mutex_unlock(&kprobe_mutex);
472 mutex_unlock(&module_mutex); 590 mutex_unlock(&module_mutex);
591
592 /* Step 5: Kick optimizer again if needed */
593 if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list))
594 kick_kprobe_optimizer();
595 else
596 /* Wake up all waiters */
597 complete_all(&optimizer_comp);
598}
599
600/* Wait for completing optimization and unoptimization */
601static __kprobes void wait_for_kprobe_optimizer(void)
602{
603 if (delayed_work_pending(&optimizing_work))
604 wait_for_completion(&optimizer_comp);
473} 605}
474 606
475/* Optimize kprobe if p is ready to be optimized */ 607/* Optimize kprobe if p is ready to be optimized */
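
The rework above splits the optimizer into staged helpers: unoptimizing requests collect on unoptimizing_list, the delayed work first undoes those (gathering dead probes on a local free_list), waits one synchronize_sched() grace period so no CPU can still be executing inside a patched jump, then optimizes the queued probes, frees the collected ones, and either re-kicks itself or wakes waiters through a completion. A generic sketch of that batch-plus-grace-period scheme with made-up names (the real code drives two lists plus text_mutex and get_online_cpus(), omitted here):

#include <linux/completion.h>
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct demo_req {
        struct list_head list;
};

static LIST_HEAD(demo_pending);
static DEFINE_MUTEX(demo_mutex);
static DECLARE_COMPLETION(demo_done);
static void demo_worker(struct work_struct *work);
static DECLARE_DELAYED_WORK(demo_work, demo_worker);
#define DEMO_DELAY 5

static void demo_kick(void)
{
        if (!delayed_work_pending(&demo_work))
                schedule_delayed_work(&demo_work, DEMO_DELAY);
}

static void demo_queue(struct demo_req *req)
{
        mutex_lock(&demo_mutex);
        list_add(&req->list, &demo_pending);
        demo_kick();
        mutex_unlock(&demo_mutex);
}

static void demo_worker(struct work_struct *work)
{
        struct demo_req *req, *tmp;
        LIST_HEAD(batch);

        mutex_lock(&demo_mutex);
        list_splice_init(&demo_pending, &batch);
        mutex_unlock(&demo_mutex);

        synchronize_sched();            /* let every in-flight user drain first */

        list_for_each_entry_safe(req, tmp, &batch, list) {
                list_del(&req->list);
                kfree(req);             /* "process" the request */
        }

        mutex_lock(&demo_mutex);
        if (!list_empty(&demo_pending))
                demo_kick();            /* more work arrived meanwhile: run again */
        else
                complete_all(&demo_done);
        mutex_unlock(&demo_mutex);
}

/* Same waiting scheme as the hunk above: block only while a batch is still pending. */
static void demo_wait(void)
{
        if (delayed_work_pending(&demo_work))
                wait_for_completion(&demo_done);
}
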
@@ -495,42 +627,99 @@ static __kprobes void optimize_kprobe(struct kprobe *p)
495 /* Check if it is already optimized. */ 627 /* Check if it is already optimized. */
496 if (op->kp.flags & KPROBE_FLAG_OPTIMIZED) 628 if (op->kp.flags & KPROBE_FLAG_OPTIMIZED)
497 return; 629 return;
498
499 op->kp.flags |= KPROBE_FLAG_OPTIMIZED; 630 op->kp.flags |= KPROBE_FLAG_OPTIMIZED;
500 list_add(&op->list, &optimizing_list); 631
501 if (!delayed_work_pending(&optimizing_work)) 632 if (!list_empty(&op->list))
502 schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY); 633 /* This is under unoptimizing. Just dequeue the probe */
634 list_del_init(&op->list);
635 else {
636 list_add(&op->list, &optimizing_list);
637 kick_kprobe_optimizer();
638 }
639}
640
641/* Short cut to direct unoptimizing */
642static __kprobes void force_unoptimize_kprobe(struct optimized_kprobe *op)
643{
644 get_online_cpus();
645 arch_unoptimize_kprobe(op);
646 put_online_cpus();
647 if (kprobe_disabled(&op->kp))
648 arch_disarm_kprobe(&op->kp);
503} 649}
504 650
505/* Unoptimize a kprobe if p is optimized */ 651/* Unoptimize a kprobe if p is optimized */
506static __kprobes void unoptimize_kprobe(struct kprobe *p) 652static __kprobes void unoptimize_kprobe(struct kprobe *p, bool force)
507{ 653{
508 struct optimized_kprobe *op; 654 struct optimized_kprobe *op;
509 655
510 if ((p->flags & KPROBE_FLAG_OPTIMIZED) && kprobe_aggrprobe(p)) { 656 if (!kprobe_aggrprobe(p) || kprobe_disarmed(p))
511 op = container_of(p, struct optimized_kprobe, kp); 657 return; /* This is not an optprobe nor optimized */
512 if (!list_empty(&op->list)) 658
513 /* Dequeue from the optimization queue */ 659 op = container_of(p, struct optimized_kprobe, kp);
660 if (!kprobe_optimized(p)) {
661 /* Unoptimized or unoptimizing case */
662 if (force && !list_empty(&op->list)) {
663 /*
664 * Only if this is unoptimizing kprobe and forced,
665 * forcibly unoptimize it. (No need to unoptimize
666 * unoptimized kprobe again :)
667 */
514 list_del_init(&op->list); 668 list_del_init(&op->list);
515 else 669 force_unoptimize_kprobe(op);
516 /* Replace jump with break */ 670 }
517 arch_unoptimize_kprobe(op); 671 return;
518 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; 672 }
673
674 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
675 if (!list_empty(&op->list)) {
676 /* Dequeue from the optimization queue */
677 list_del_init(&op->list);
678 return;
679 }
680 /* Optimized kprobe case */
681 if (force)
682 /* Forcibly update the code: this is a special case */
683 force_unoptimize_kprobe(op);
684 else {
685 list_add(&op->list, &unoptimizing_list);
686 kick_kprobe_optimizer();
519 } 687 }
520} 688}
521 689
690/* Cancel unoptimizing for reusing */
691static void reuse_unused_kprobe(struct kprobe *ap)
692{
693 struct optimized_kprobe *op;
694
695 BUG_ON(!kprobe_unused(ap));
696 /*
697 * Unused kprobe MUST be on the way of delayed unoptimizing (means
698 * there is still a relative jump) and disabled.
699 */
700 op = container_of(ap, struct optimized_kprobe, kp);
701 if (unlikely(list_empty(&op->list)))
702 printk(KERN_WARNING "Warning: found a stray unused "
703 "aggrprobe@%p\n", ap->addr);
704 /* Enable the probe again */
705 ap->flags &= ~KPROBE_FLAG_DISABLED;
706 /* Optimize it again (remove from op->list) */
707 BUG_ON(!kprobe_optready(ap));
708 optimize_kprobe(ap);
709}
710
522/* Remove optimized instructions */ 711/* Remove optimized instructions */
523static void __kprobes kill_optimized_kprobe(struct kprobe *p) 712static void __kprobes kill_optimized_kprobe(struct kprobe *p)
524{ 713{
525 struct optimized_kprobe *op; 714 struct optimized_kprobe *op;
526 715
527 op = container_of(p, struct optimized_kprobe, kp); 716 op = container_of(p, struct optimized_kprobe, kp);
528 if (!list_empty(&op->list)) { 717 if (!list_empty(&op->list))
529 /* Dequeue from the optimization queue */ 718 /* Dequeue from the (un)optimization queue */
530 list_del_init(&op->list); 719 list_del_init(&op->list);
531 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; 720
532 } 721 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
533 /* Don't unoptimize, because the target code will be freed. */ 722 /* Don't touch the code, because it is already freed. */
534 arch_remove_optimized_kprobe(op); 723 arch_remove_optimized_kprobe(op);
535} 724}
536 725
@@ -543,16 +732,6 @@ static __kprobes void prepare_optimized_kprobe(struct kprobe *p)
543 arch_prepare_optimized_kprobe(op); 732 arch_prepare_optimized_kprobe(op);
544} 733}
545 734
546/* Free optimized instructions and optimized_kprobe */
547static __kprobes void free_aggr_kprobe(struct kprobe *p)
548{
549 struct optimized_kprobe *op;
550
551 op = container_of(p, struct optimized_kprobe, kp);
552 arch_remove_optimized_kprobe(op);
553 kfree(op);
554}
555
556/* Allocate new optimized_kprobe and try to prepare optimized instructions */ 735/* Allocate new optimized_kprobe and try to prepare optimized instructions */
557static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) 736static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
558{ 737{
@@ -587,7 +766,8 @@ static __kprobes void try_to_optimize_kprobe(struct kprobe *p)
587 op = container_of(ap, struct optimized_kprobe, kp); 766 op = container_of(ap, struct optimized_kprobe, kp);
588 if (!arch_prepared_optinsn(&op->optinsn)) { 767 if (!arch_prepared_optinsn(&op->optinsn)) {
589 /* If failed to setup optimizing, fallback to kprobe */ 768 /* If failed to setup optimizing, fallback to kprobe */
590 free_aggr_kprobe(ap); 769 arch_remove_optimized_kprobe(op);
770 kfree(op);
591 return; 771 return;
592 } 772 }
593 773
@@ -631,21 +811,16 @@ static void __kprobes unoptimize_all_kprobes(void)
631 return; 811 return;
632 812
633 kprobes_allow_optimization = false; 813 kprobes_allow_optimization = false;
634 printk(KERN_INFO "Kprobes globally unoptimized\n");
635 get_online_cpus(); /* For avoiding text_mutex deadlock */
636 mutex_lock(&text_mutex);
637 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 814 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
638 head = &kprobe_table[i]; 815 head = &kprobe_table[i];
639 hlist_for_each_entry_rcu(p, node, head, hlist) { 816 hlist_for_each_entry_rcu(p, node, head, hlist) {
640 if (!kprobe_disabled(p)) 817 if (!kprobe_disabled(p))
641 unoptimize_kprobe(p); 818 unoptimize_kprobe(p, false);
642 } 819 }
643 } 820 }
644 821 /* Wait for unoptimizing completion */
645 mutex_unlock(&text_mutex); 822 wait_for_kprobe_optimizer();
646 put_online_cpus(); 823 printk(KERN_INFO "Kprobes globally unoptimized\n");
647 /* Allow all currently running kprobes to complete */
648 synchronize_sched();
649} 824}
650 825
651int sysctl_kprobes_optimization; 826int sysctl_kprobes_optimization;
@@ -669,44 +844,60 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
669} 844}
670#endif /* CONFIG_SYSCTL */ 845#endif /* CONFIG_SYSCTL */
671 846
847/* Put a breakpoint for a probe. Must be called with text_mutex locked */
672static void __kprobes __arm_kprobe(struct kprobe *p) 848static void __kprobes __arm_kprobe(struct kprobe *p)
673{ 849{
674 struct kprobe *old_p; 850 struct kprobe *_p;
675 851
676 /* Check collision with other optimized kprobes */ 852 /* Check collision with other optimized kprobes */
677 old_p = get_optimized_kprobe((unsigned long)p->addr); 853 _p = get_optimized_kprobe((unsigned long)p->addr);
678 if (unlikely(old_p)) 854 if (unlikely(_p))
679 unoptimize_kprobe(old_p); /* Fallback to unoptimized kprobe */ 855 /* Fallback to unoptimized kprobe */
856 unoptimize_kprobe(_p, true);
680 857
681 arch_arm_kprobe(p); 858 arch_arm_kprobe(p);
682 optimize_kprobe(p); /* Try to optimize (add kprobe to a list) */ 859 optimize_kprobe(p); /* Try to optimize (add kprobe to a list) */
683} 860}
684 861
685static void __kprobes __disarm_kprobe(struct kprobe *p) 862/* Remove the breakpoint of a probe. Must be called with text_mutex locked */
863static void __kprobes __disarm_kprobe(struct kprobe *p, bool reopt)
686{ 864{
687 struct kprobe *old_p; 865 struct kprobe *_p;
688 866
689 unoptimize_kprobe(p); /* Try to unoptimize */ 867 unoptimize_kprobe(p, false); /* Try to unoptimize */
690 arch_disarm_kprobe(p);
691 868
692 /* If another kprobe was blocked, optimize it. */ 869 if (!kprobe_queued(p)) {
693 old_p = get_optimized_kprobe((unsigned long)p->addr); 870 arch_disarm_kprobe(p);
694 if (unlikely(old_p)) 871 /* If another kprobe was blocked, optimize it. */
695 optimize_kprobe(old_p); 872 _p = get_optimized_kprobe((unsigned long)p->addr);
873 if (unlikely(_p) && reopt)
874 optimize_kprobe(_p);
875 }
876 /* TODO: reoptimize others after unoptimized this probe */
696} 877}
697 878
698#else /* !CONFIG_OPTPROBES */ 879#else /* !CONFIG_OPTPROBES */
699 880
700#define optimize_kprobe(p) do {} while (0) 881#define optimize_kprobe(p) do {} while (0)
701#define unoptimize_kprobe(p) do {} while (0) 882#define unoptimize_kprobe(p, f) do {} while (0)
702#define kill_optimized_kprobe(p) do {} while (0) 883#define kill_optimized_kprobe(p) do {} while (0)
703#define prepare_optimized_kprobe(p) do {} while (0) 884#define prepare_optimized_kprobe(p) do {} while (0)
704#define try_to_optimize_kprobe(p) do {} while (0) 885#define try_to_optimize_kprobe(p) do {} while (0)
705#define __arm_kprobe(p) arch_arm_kprobe(p) 886#define __arm_kprobe(p) arch_arm_kprobe(p)
706#define __disarm_kprobe(p) arch_disarm_kprobe(p) 887#define __disarm_kprobe(p, o) arch_disarm_kprobe(p)
888#define kprobe_disarmed(p) kprobe_disabled(p)
889#define wait_for_kprobe_optimizer() do {} while (0)
890
891/* There should be no unused kprobes can be reused without optimization */
892static void reuse_unused_kprobe(struct kprobe *ap)
893{
894 printk(KERN_ERR "Error: There should be no unused kprobe here.\n");
895 BUG_ON(kprobe_unused(ap));
896}
707 897
708static __kprobes void free_aggr_kprobe(struct kprobe *p) 898static __kprobes void free_aggr_kprobe(struct kprobe *p)
709{ 899{
900 arch_remove_kprobe(p);
710 kfree(p); 901 kfree(p);
711} 902}
712 903
@@ -732,11 +923,10 @@ static void __kprobes arm_kprobe(struct kprobe *kp)
732/* Disarm a kprobe with text_mutex */ 923/* Disarm a kprobe with text_mutex */
733static void __kprobes disarm_kprobe(struct kprobe *kp) 924static void __kprobes disarm_kprobe(struct kprobe *kp)
734{ 925{
735 get_online_cpus(); /* For avoiding text_mutex deadlock */ 926 /* Ditto */
736 mutex_lock(&text_mutex); 927 mutex_lock(&text_mutex);
737 __disarm_kprobe(kp); 928 __disarm_kprobe(kp, true);
738 mutex_unlock(&text_mutex); 929 mutex_unlock(&text_mutex);
739 put_online_cpus();
740} 930}
741 931
742/* 932/*
@@ -775,7 +965,7 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
775static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, 965static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
776 int trapnr) 966 int trapnr)
777{ 967{
778 struct kprobe *cur = __get_cpu_var(kprobe_instance); 968 struct kprobe *cur = __this_cpu_read(kprobe_instance);
779 969
780 /* 970 /*
781 * if we faulted "during" the execution of a user specified 971 * if we faulted "during" the execution of a user specified
@@ -790,7 +980,7 @@ static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
790 980
791static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs) 981static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
792{ 982{
793 struct kprobe *cur = __get_cpu_var(kprobe_instance); 983 struct kprobe *cur = __this_cpu_read(kprobe_instance);
794 int ret = 0; 984 int ret = 0;
795 985
796 if (cur && cur->break_handler) { 986 if (cur && cur->break_handler) {
@@ -942,7 +1132,7 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
942 BUG_ON(kprobe_gone(ap) || kprobe_gone(p)); 1132 BUG_ON(kprobe_gone(ap) || kprobe_gone(p));
943 1133
944 if (p->break_handler || p->post_handler) 1134 if (p->break_handler || p->post_handler)
945 unoptimize_kprobe(ap); /* Fall back to normal kprobe */ 1135 unoptimize_kprobe(ap, true); /* Fall back to normal kprobe */
946 1136
947 if (p->break_handler) { 1137 if (p->break_handler) {
948 if (ap->break_handler) 1138 if (ap->break_handler)
@@ -993,19 +1183,21 @@ static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
993 * This is the second or subsequent kprobe at the address - handle 1183 * This is the second or subsequent kprobe at the address - handle
994 * the intricacies 1184 * the intricacies
995 */ 1185 */
996static int __kprobes register_aggr_kprobe(struct kprobe *old_p, 1186static int __kprobes register_aggr_kprobe(struct kprobe *orig_p,
997 struct kprobe *p) 1187 struct kprobe *p)
998{ 1188{
999 int ret = 0; 1189 int ret = 0;
1000 struct kprobe *ap = old_p; 1190 struct kprobe *ap = orig_p;
1001 1191
1002 if (!kprobe_aggrprobe(old_p)) { 1192 if (!kprobe_aggrprobe(orig_p)) {
1003 /* If old_p is not an aggr_kprobe, create new aggr_kprobe. */ 1193 /* If orig_p is not an aggr_kprobe, create new aggr_kprobe. */
1004 ap = alloc_aggr_kprobe(old_p); 1194 ap = alloc_aggr_kprobe(orig_p);
1005 if (!ap) 1195 if (!ap)
1006 return -ENOMEM; 1196 return -ENOMEM;
1007 init_aggr_kprobe(ap, old_p); 1197 init_aggr_kprobe(ap, orig_p);
1008 } 1198 } else if (kprobe_unused(ap))
1199 /* This probe is going to die. Rescue it */
1200 reuse_unused_kprobe(ap);
1009 1201
1010 if (kprobe_gone(ap)) { 1202 if (kprobe_gone(ap)) {
1011 /* 1203 /*
@@ -1039,23 +1231,6 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
1039 return add_new_kprobe(ap, p); 1231 return add_new_kprobe(ap, p);
1040} 1232}
1041 1233
1042/* Try to disable aggr_kprobe, and return 1 if succeeded.*/
1043static int __kprobes try_to_disable_aggr_kprobe(struct kprobe *p)
1044{
1045 struct kprobe *kp;
1046
1047 list_for_each_entry_rcu(kp, &p->list, list) {
1048 if (!kprobe_disabled(kp))
1049 /*
1050 * There is an active probe on the list.
1051 * We can't disable aggr_kprobe.
1052 */
1053 return 0;
1054 }
1055 p->flags |= KPROBE_FLAG_DISABLED;
1056 return 1;
1057}
1058
1059static int __kprobes in_kprobes_functions(unsigned long addr) 1234static int __kprobes in_kprobes_functions(unsigned long addr)
1060{ 1235{
1061 struct kprobe_blackpoint *kb; 1236 struct kprobe_blackpoint *kb;
@@ -1098,34 +1273,33 @@ static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p)
1098/* Check passed kprobe is valid and return kprobe in kprobe_table. */ 1273/* Check passed kprobe is valid and return kprobe in kprobe_table. */
1099static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p) 1274static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p)
1100{ 1275{
1101 struct kprobe *old_p, *list_p; 1276 struct kprobe *ap, *list_p;
1102 1277
1103 old_p = get_kprobe(p->addr); 1278 ap = get_kprobe(p->addr);
1104 if (unlikely(!old_p)) 1279 if (unlikely(!ap))
1105 return NULL; 1280 return NULL;
1106 1281
1107 if (p != old_p) { 1282 if (p != ap) {
1108 list_for_each_entry_rcu(list_p, &old_p->list, list) 1283 list_for_each_entry_rcu(list_p, &ap->list, list)
1109 if (list_p == p) 1284 if (list_p == p)
1110 /* kprobe p is a valid probe */ 1285 /* kprobe p is a valid probe */
1111 goto valid; 1286 goto valid;
1112 return NULL; 1287 return NULL;
1113 } 1288 }
1114valid: 1289valid:
1115 return old_p; 1290 return ap;
1116} 1291}
1117 1292
1118/* Return error if the kprobe is being re-registered */ 1293/* Return error if the kprobe is being re-registered */
1119static inline int check_kprobe_rereg(struct kprobe *p) 1294static inline int check_kprobe_rereg(struct kprobe *p)
1120{ 1295{
1121 int ret = 0; 1296 int ret = 0;
1122 struct kprobe *old_p;
1123 1297
1124 mutex_lock(&kprobe_mutex); 1298 mutex_lock(&kprobe_mutex);
1125 old_p = __get_valid_kprobe(p); 1299 if (__get_valid_kprobe(p))
1126 if (old_p)
1127 ret = -EINVAL; 1300 ret = -EINVAL;
1128 mutex_unlock(&kprobe_mutex); 1301 mutex_unlock(&kprobe_mutex);
1302
1129 return ret; 1303 return ret;
1130} 1304}
1131 1305
@@ -1229,67 +1403,121 @@ fail_with_jump_label:
1229} 1403}
1230EXPORT_SYMBOL_GPL(register_kprobe); 1404EXPORT_SYMBOL_GPL(register_kprobe);
1231 1405
1406/* Check if all probes on the aggrprobe are disabled */
1407static int __kprobes aggr_kprobe_disabled(struct kprobe *ap)
1408{
1409 struct kprobe *kp;
1410
1411 list_for_each_entry_rcu(kp, &ap->list, list)
1412 if (!kprobe_disabled(kp))
1413 /*
1414 * There is an active probe on the list.
1415 * We can't disable this ap.
1416 */
1417 return 0;
1418
1419 return 1;
1420}
1421
 1422/* Disable one kprobe: the caller must hold kprobe_mutex */
1423static struct kprobe *__kprobes __disable_kprobe(struct kprobe *p)
1424{
1425 struct kprobe *orig_p;
1426
 1427	/* Get the original kprobe to return */
1428 orig_p = __get_valid_kprobe(p);
1429 if (unlikely(orig_p == NULL))
1430 return NULL;
1431
1432 if (!kprobe_disabled(p)) {
1433 /* Disable probe if it is a child probe */
1434 if (p != orig_p)
1435 p->flags |= KPROBE_FLAG_DISABLED;
1436
1437 /* Try to disarm and disable this/parent probe */
1438 if (p == orig_p || aggr_kprobe_disabled(orig_p)) {
1439 disarm_kprobe(orig_p);
1440 orig_p->flags |= KPROBE_FLAG_DISABLED;
1441 }
1442 }
1443
1444 return orig_p;
1445}
1446
1232/* 1447/*
1233 * Unregister a kprobe without a scheduler synchronization. 1448 * Unregister a kprobe without a scheduler synchronization.
1234 */ 1449 */
1235static int __kprobes __unregister_kprobe_top(struct kprobe *p) 1450static int __kprobes __unregister_kprobe_top(struct kprobe *p)
1236{ 1451{
1237 struct kprobe *old_p, *list_p; 1452 struct kprobe *ap, *list_p;
1238 1453
1239 old_p = __get_valid_kprobe(p); 1454 /* Disable kprobe. This will disarm it if needed. */
1240 if (old_p == NULL) 1455 ap = __disable_kprobe(p);
1456 if (ap == NULL)
1241 return -EINVAL; 1457 return -EINVAL;
1242 1458
1243 if (old_p == p || 1459 if (ap == p)
1244 (kprobe_aggrprobe(old_p) &&
1245 list_is_singular(&old_p->list))) {
1246 /* 1460 /*
 1247	 * Only probe on the hash list. Disarm only if kprobes are 1461	 * This probe is an independent (and non-optimized) kprobe
1248 * enabled and not gone - otherwise, the breakpoint would 1462 * (not an aggrprobe). Remove from the hash list.
1249 * already have been removed. We save on flushing icache.
1250 */ 1463 */
1251 if (!kprobes_all_disarmed && !kprobe_disabled(old_p)) 1464 goto disarmed;
1252 disarm_kprobe(old_p); 1465
1253 hlist_del_rcu(&old_p->hlist); 1466 /* Following process expects this probe is an aggrprobe */
1254 } else { 1467 WARN_ON(!kprobe_aggrprobe(ap));
1468
1469 if (list_is_singular(&ap->list) && kprobe_disarmed(ap))
1470 /*
 1471		 * !disarmed can happen if the probe is under delayed
 1472		 * unoptimizing.
1473 */
1474 goto disarmed;
1475 else {
 1476		/* If the probe being disabled has special handlers, update the aggrprobe */
1255 if (p->break_handler && !kprobe_gone(p)) 1477 if (p->break_handler && !kprobe_gone(p))
1256 old_p->break_handler = NULL; 1478 ap->break_handler = NULL;
1257 if (p->post_handler && !kprobe_gone(p)) { 1479 if (p->post_handler && !kprobe_gone(p)) {
1258 list_for_each_entry_rcu(list_p, &old_p->list, list) { 1480 list_for_each_entry_rcu(list_p, &ap->list, list) {
1259 if ((list_p != p) && (list_p->post_handler)) 1481 if ((list_p != p) && (list_p->post_handler))
1260 goto noclean; 1482 goto noclean;
1261 } 1483 }
1262 old_p->post_handler = NULL; 1484 ap->post_handler = NULL;
1263 } 1485 }
1264noclean: 1486noclean:
1487 /*
1488 * Remove from the aggrprobe: this path will do nothing in
1489 * __unregister_kprobe_bottom().
1490 */
1265 list_del_rcu(&p->list); 1491 list_del_rcu(&p->list);
1266 if (!kprobe_disabled(old_p)) { 1492 if (!kprobe_disabled(ap) && !kprobes_all_disarmed)
1267 try_to_disable_aggr_kprobe(old_p); 1493 /*
1268 if (!kprobes_all_disarmed) { 1494 * Try to optimize this probe again, because post
1269 if (kprobe_disabled(old_p)) 1495 * handler may have been changed.
1270 disarm_kprobe(old_p); 1496 */
1271 else 1497 optimize_kprobe(ap);
1272 /* Try to optimize this probe again */
1273 optimize_kprobe(old_p);
1274 }
1275 }
1276 } 1498 }
1277 return 0; 1499 return 0;
1500
1501disarmed:
1502 BUG_ON(!kprobe_disarmed(ap));
1503 hlist_del_rcu(&ap->hlist);
1504 return 0;
1278} 1505}
1279 1506
1280static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) 1507static void __kprobes __unregister_kprobe_bottom(struct kprobe *p)
1281{ 1508{
1282 struct kprobe *old_p; 1509 struct kprobe *ap;
1283 1510
1284 if (list_empty(&p->list)) 1511 if (list_empty(&p->list))
1512 /* This is an independent kprobe */
1285 arch_remove_kprobe(p); 1513 arch_remove_kprobe(p);
1286 else if (list_is_singular(&p->list)) { 1514 else if (list_is_singular(&p->list)) {
1287 /* "p" is the last child of an aggr_kprobe */ 1515 /* This is the last child of an aggrprobe */
1288 old_p = list_entry(p->list.next, struct kprobe, list); 1516 ap = list_entry(p->list.next, struct kprobe, list);
1289 list_del(&p->list); 1517 list_del(&p->list);
1290 arch_remove_kprobe(old_p); 1518 free_aggr_kprobe(ap);
1291 free_aggr_kprobe(old_p);
1292 } 1519 }
1520 /* Otherwise, do nothing. */
1293} 1521}
1294 1522
1295int __kprobes register_kprobes(struct kprobe **kps, int num) 1523int __kprobes register_kprobes(struct kprobe **kps, int num)
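The __disable_kprobe()/aggr_kprobe_disabled() rework above is what now backs the exported disable_kprobe()/enable_kprobe() interface. As a rough usage sketch only (the probed symbol "do_fork", the module name, and the messages are illustrative assumptions, not part of this patch), a client module of that API might look like:

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/kprobes.h>

/* Pre-handler: runs just before the probed instruction. */
static int handler_pre(struct kprobe *p, struct pt_regs *regs)
{
	pr_info("kprobe hit at %p\n", p->addr);
	return 0;
}

static struct kprobe kp = {
	.symbol_name = "do_fork",	/* hypothetical probe target */
	.pre_handler = handler_pre,
};

static int __init kp_demo_init(void)
{
	int ret = register_kprobe(&kp);
	if (ret < 0)
		return ret;

	/*
	 * Temporarily disable the probe; __disable_kprobe() disarms it
	 * (or the whole aggrprobe once no sibling handler is left enabled).
	 */
	disable_kprobe(&kp);

	/* ... and arm it again later. */
	enable_kprobe(&kp);
	return 0;
}

static void __exit kp_demo_exit(void)
{
	unregister_kprobe(&kp);
}

module_init(kp_demo_init);
module_exit(kp_demo_exit);
MODULE_LICENSE("GPL");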
@@ -1607,29 +1835,13 @@ static void __kprobes kill_kprobe(struct kprobe *p)
1607int __kprobes disable_kprobe(struct kprobe *kp) 1835int __kprobes disable_kprobe(struct kprobe *kp)
1608{ 1836{
1609 int ret = 0; 1837 int ret = 0;
1610 struct kprobe *p;
1611 1838
1612 mutex_lock(&kprobe_mutex); 1839 mutex_lock(&kprobe_mutex);
1613 1840
1614 /* Check whether specified probe is valid. */ 1841 /* Disable this kprobe */
1615 p = __get_valid_kprobe(kp); 1842 if (__disable_kprobe(kp) == NULL)
1616 if (unlikely(p == NULL)) {
1617 ret = -EINVAL; 1843 ret = -EINVAL;
1618 goto out;
1619 }
1620 1844
1621 /* If the probe is already disabled (or gone), just return */
1622 if (kprobe_disabled(kp))
1623 goto out;
1624
1625 kp->flags |= KPROBE_FLAG_DISABLED;
1626 if (p != kp)
1627 /* When kp != p, p is always enabled. */
1628 try_to_disable_aggr_kprobe(p);
1629
1630 if (!kprobes_all_disarmed && kprobe_disabled(p))
1631 disarm_kprobe(p);
1632out:
1633 mutex_unlock(&kprobe_mutex); 1845 mutex_unlock(&kprobe_mutex);
1634 return ret; 1846 return ret;
1635} 1847}
@@ -1927,36 +2139,27 @@ static void __kprobes disarm_all_kprobes(void)
1927 mutex_lock(&kprobe_mutex); 2139 mutex_lock(&kprobe_mutex);
1928 2140
1929 /* If kprobes are already disarmed, just return */ 2141 /* If kprobes are already disarmed, just return */
1930 if (kprobes_all_disarmed) 2142 if (kprobes_all_disarmed) {
1931 goto already_disabled; 2143 mutex_unlock(&kprobe_mutex);
2144 return;
2145 }
1932 2146
1933 kprobes_all_disarmed = true; 2147 kprobes_all_disarmed = true;
1934 printk(KERN_INFO "Kprobes globally disabled\n"); 2148 printk(KERN_INFO "Kprobes globally disabled\n");
1935 2149
1936 /*
1937 * Here we call get_online_cpus() for avoiding text_mutex deadlock,
1938 * because disarming may also unoptimize kprobes.
1939 */
1940 get_online_cpus();
1941 mutex_lock(&text_mutex); 2150 mutex_lock(&text_mutex);
1942 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 2151 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
1943 head = &kprobe_table[i]; 2152 head = &kprobe_table[i];
1944 hlist_for_each_entry_rcu(p, node, head, hlist) { 2153 hlist_for_each_entry_rcu(p, node, head, hlist) {
1945 if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) 2154 if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p))
1946 __disarm_kprobe(p); 2155 __disarm_kprobe(p, false);
1947 } 2156 }
1948 } 2157 }
1949
1950 mutex_unlock(&text_mutex); 2158 mutex_unlock(&text_mutex);
1951 put_online_cpus();
1952 mutex_unlock(&kprobe_mutex); 2159 mutex_unlock(&kprobe_mutex);
1953 /* Allow all currently running kprobes to complete */
1954 synchronize_sched();
1955 return;
1956 2160
1957already_disabled: 2161 /* Wait for the optimizer to finish disarming all kprobes */
1958 mutex_unlock(&kprobe_mutex); 2162 wait_for_kprobe_optimizer();
1959 return;
1960} 2163}
1961 2164
1962/* 2165/*
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 2dc3786349d1..c55afba990a3 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -148,7 +148,7 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
148 wait_for_completion(&create.done); 148 wait_for_completion(&create.done);
149 149
150 if (!IS_ERR(create.result)) { 150 if (!IS_ERR(create.result)) {
151 struct sched_param param = { .sched_priority = 0 }; 151 static const struct sched_param param = { .sched_priority = 0 };
152 va_list args; 152 va_list args;
153 153
154 va_start(args, namefmt); 154 va_start(args, namefmt);
@@ -265,6 +265,17 @@ int kthreadd(void *unused)
265 return 0; 265 return 0;
266} 266}
267 267
268void __init_kthread_worker(struct kthread_worker *worker,
269 const char *name,
270 struct lock_class_key *key)
271{
272 spin_lock_init(&worker->lock);
273 lockdep_set_class_and_name(&worker->lock, key, name);
274 INIT_LIST_HEAD(&worker->work_list);
275 worker->task = NULL;
276}
277EXPORT_SYMBOL_GPL(__init_kthread_worker);
278
268/** 279/**
269 * kthread_worker_fn - kthread function to process kthread_worker 280 * kthread_worker_fn - kthread function to process kthread_worker
270 * @worker_ptr: pointer to initialized kthread_worker 281 * @worker_ptr: pointer to initialized kthread_worker
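The new __init_kthread_worker() exists so the init_kthread_worker() wrapper in <linux/kthread.h> can hand lockdep a per-caller lock class for the worker's spinlock. A minimal sketch of how the kthread_worker API of this era is typically wired up follows; the thread name, the work body, and the module boilerplate are illustrative assumptions, not part of this patch.

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/kthread.h>

static struct kthread_worker demo_worker;
static struct kthread_work demo_work;
static struct task_struct *demo_thread;

/* The work callback runs in the dedicated worker thread's context. */
static void demo_work_fn(struct kthread_work *work)
{
	pr_info("kthread_work executed\n");
}

static int __init demo_init(void)
{
	init_kthread_worker(&demo_worker);	/* lockdep class set via __init_kthread_worker() */
	init_kthread_work(&demo_work, demo_work_fn);

	/* Dedicated thread that pumps the worker's work_list. */
	demo_thread = kthread_run(kthread_worker_fn, &demo_worker, "demo-worker");
	if (IS_ERR(demo_thread))
		return PTR_ERR(demo_thread);

	queue_kthread_work(&demo_worker, &demo_work);
	flush_kthread_worker(&demo_worker);	/* wait until the queued work has run */
	return 0;
}

static void __exit demo_exit(void)
{
	flush_kthread_worker(&demo_worker);
	kthread_stop(demo_thread);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");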
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 877fb306d415..ee74b35e528d 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -194,14 +194,7 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
194 194
195 account_global_scheduler_latency(tsk, &lat); 195 account_global_scheduler_latency(tsk, &lat);
196 196
197 /* 197 for (i = 0; i < tsk->latency_record_count; i++) {
198 * short term hack; if we're > 32 we stop; future we recycle:
199 */
200 tsk->latency_record_count++;
201 if (tsk->latency_record_count >= LT_SAVECOUNT)
202 goto out_unlock;
203
204 for (i = 0; i < LT_SAVECOUNT; i++) {
205 struct latency_record *mylat; 198 struct latency_record *mylat;
206 int same = 1; 199 int same = 1;
207 200
@@ -227,8 +220,14 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
227 } 220 }
228 } 221 }
229 222
223 /*
224 * short term hack; if we're > 32 we stop; future we recycle:
225 */
226 if (tsk->latency_record_count >= LT_SAVECOUNT)
227 goto out_unlock;
228
 230	/* Allocate a new one: */ 229	/* Allocate a new one: */
231 i = tsk->latency_record_count; 230 i = tsk->latency_record_count++;
232 memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record)); 231 memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record));
233 232
234out_unlock: 233out_unlock:
@@ -242,24 +241,19 @@ static int lstats_show(struct seq_file *m, void *v)
242 seq_puts(m, "Latency Top version : v0.1\n"); 241 seq_puts(m, "Latency Top version : v0.1\n");
243 242
244 for (i = 0; i < MAXLR; i++) { 243 for (i = 0; i < MAXLR; i++) {
245 if (latency_record[i].backtrace[0]) { 244 struct latency_record *lr = &latency_record[i];
245
246 if (lr->backtrace[0]) {
246 int q; 247 int q;
247 seq_printf(m, "%i %lu %lu ", 248 seq_printf(m, "%i %lu %lu",
248 latency_record[i].count, 249 lr->count, lr->time, lr->max);
249 latency_record[i].time,
250 latency_record[i].max);
251 for (q = 0; q < LT_BACKTRACEDEPTH; q++) { 250 for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
252 char sym[KSYM_SYMBOL_LEN]; 251 unsigned long bt = lr->backtrace[q];
253 char *c; 252 if (!bt)
254 if (!latency_record[i].backtrace[q])
255 break; 253 break;
256 if (latency_record[i].backtrace[q] == ULONG_MAX) 254 if (bt == ULONG_MAX)
257 break; 255 break;
258 sprint_symbol(sym, latency_record[i].backtrace[q]); 256 seq_printf(m, " %ps", (void *)bt);
259 c = strchr(sym, '+');
260 if (c)
261 *c = 0;
262 seq_printf(m, "%s ", sym);
263 } 257 }
264 seq_printf(m, "\n"); 258 seq_printf(m, "\n");
265 } 259 }
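The rewritten loop above leans on the %ps vsnprintf extension (bare symbol name; %pS would add the +offset/size suffix) instead of sprint_symbol() plus manual '+' trimming. A tiny illustrative helper, not part of the patch and with a made-up function name:

#include <linux/kernel.h>
#include <linux/seq_file.h>

/*
 * Print one backtrace entry as a symbol name. %ps resolves the text
 * address to its symbol, which is exactly what the old sprint_symbol()
 * + strchr('+') dance did by hand.
 */
static void print_backtrace_entry(struct seq_file *m, unsigned long addr)
{
	seq_printf(m, " %ps", (void *)addr);
}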
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 42ba65dff7d9..0d2058da80f5 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2292,22 +2292,6 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark)
2292} 2292}
2293 2293
2294/* 2294/*
2295 * Debugging helper: via this flag we know that we are in
2296 * 'early bootup code', and will warn about any invalid irqs-on event:
2297 */
2298static int early_boot_irqs_enabled;
2299
2300void early_boot_irqs_off(void)
2301{
2302 early_boot_irqs_enabled = 0;
2303}
2304
2305void early_boot_irqs_on(void)
2306{
2307 early_boot_irqs_enabled = 1;
2308}
2309
2310/*
2311 * Hardirqs will be enabled: 2295 * Hardirqs will be enabled:
2312 */ 2296 */
2313void trace_hardirqs_on_caller(unsigned long ip) 2297void trace_hardirqs_on_caller(unsigned long ip)
@@ -2319,7 +2303,7 @@ void trace_hardirqs_on_caller(unsigned long ip)
2319 if (unlikely(!debug_locks || current->lockdep_recursion)) 2303 if (unlikely(!debug_locks || current->lockdep_recursion))
2320 return; 2304 return;
2321 2305
2322 if (DEBUG_LOCKS_WARN_ON(unlikely(!early_boot_irqs_enabled))) 2306 if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled)))
2323 return; 2307 return;
2324 2308
2325 if (unlikely(curr->hardirqs_enabled)) { 2309 if (unlikely(curr->hardirqs_enabled)) {
diff --git a/kernel/module.c b/kernel/module.c
index 437a74a7524a..34e00b708fad 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -56,6 +56,7 @@
56#include <linux/percpu.h> 56#include <linux/percpu.h>
57#include <linux/kmemleak.h> 57#include <linux/kmemleak.h>
58#include <linux/jump_label.h> 58#include <linux/jump_label.h>
59#include <linux/pfn.h>
59 60
60#define CREATE_TRACE_POINTS 61#define CREATE_TRACE_POINTS
61#include <trace/events/module.h> 62#include <trace/events/module.h>
@@ -70,6 +71,26 @@
70#define ARCH_SHF_SMALL 0 71#define ARCH_SHF_SMALL 0
71#endif 72#endif
72 73
74/*
75 * Modules' sections will be aligned on page boundaries
76 * to ensure complete separation of code and data, but
77 * only when CONFIG_DEBUG_SET_MODULE_RONX=y
78 */
79#ifdef CONFIG_DEBUG_SET_MODULE_RONX
80# define debug_align(X) ALIGN(X, PAGE_SIZE)
81#else
82# define debug_align(X) (X)
83#endif
84
85/*
 86 * Given BASE and SIZE, this macro calculates the number of pages the
 87 * memory region occupies
88 */
89#define MOD_NUMBER_OF_PAGES(BASE, SIZE) (((SIZE) > 0) ? \
90 (PFN_DOWN((unsigned long)(BASE) + (SIZE) - 1) - \
91 PFN_DOWN((unsigned long)BASE) + 1) \
92 : (0UL))
93
73/* If this is set, the section belongs in the init part of the module */ 94/* If this is set, the section belongs in the init part of the module */
74#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) 95#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1))
75 96
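A quick way to sanity-check MOD_NUMBER_OF_PAGES() is to mirror it in user space. The snippet below assumes 4 KiB pages, and the addresses and sizes are made up purely for illustration; none of it is part of the patch:

#include <stdio.h>

#define PAGE_SHIFT	12			/* assume 4 KiB pages */
#define PFN_DOWN(x)	((x) >> PAGE_SHIFT)

/* Mirror of MOD_NUMBER_OF_PAGES(): pages touched by [BASE, BASE + SIZE) */
#define MOD_NUMBER_OF_PAGES(BASE, SIZE) (((SIZE) > 0) ?		\
	(PFN_DOWN((unsigned long)(BASE) + (SIZE) - 1) -		\
	 PFN_DOWN((unsigned long)(BASE)) + 1)			\
	: (0UL))

int main(void)
{
	/* Starts 0x800 into a page and spans 0x1000 bytes: crosses one
	 * page boundary, so it touches two pages. */
	printf("%lu\n", MOD_NUMBER_OF_PAGES(0x10000800UL, 0x1000UL));	/* 2 */

	/* Fits entirely inside one page. */
	printf("%lu\n", MOD_NUMBER_OF_PAGES(0x10000000UL, 0x200UL));	/* 1 */

	/* Empty region touches no pages. */
	printf("%lu\n", MOD_NUMBER_OF_PAGES(0x10000000UL, 0UL));	/* 0 */
	return 0;
}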
@@ -1542,6 +1563,115 @@ static int __unlink_module(void *_mod)
1542 return 0; 1563 return 0;
1543} 1564}
1544 1565
1566#ifdef CONFIG_DEBUG_SET_MODULE_RONX
1567/*
1568 * LKM RO/NX protection: protect module's text/ro-data
1569 * from modification and any data from execution.
1570 */
1571void set_page_attributes(void *start, void *end, int (*set)(unsigned long start, int num_pages))
1572{
1573 unsigned long begin_pfn = PFN_DOWN((unsigned long)start);
1574 unsigned long end_pfn = PFN_DOWN((unsigned long)end);
1575
1576 if (end_pfn > begin_pfn)
1577 set(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn);
1578}
1579
1580static void set_section_ro_nx(void *base,
1581 unsigned long text_size,
1582 unsigned long ro_size,
1583 unsigned long total_size)
1584{
1585 /* begin and end PFNs of the current subsection */
1586 unsigned long begin_pfn;
1587 unsigned long end_pfn;
1588
1589 /*
1590 * Set RO for module text and RO-data:
1591 * - Always protect first page.
1592 * - Do not protect last partial page.
1593 */
1594 if (ro_size > 0)
1595 set_page_attributes(base, base + ro_size, set_memory_ro);
1596
1597 /*
1598 * Set NX permissions for module data:
1599 * - Do not protect first partial page.
1600 * - Always protect last page.
1601 */
1602 if (total_size > text_size) {
1603 begin_pfn = PFN_UP((unsigned long)base + text_size);
1604 end_pfn = PFN_UP((unsigned long)base + total_size);
1605 if (end_pfn > begin_pfn)
1606 set_memory_nx(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn);
1607 }
1608}
1609
1610/* Setting memory back to RW+NX before releasing it */
1611void unset_section_ro_nx(struct module *mod, void *module_region)
1612{
1613 unsigned long total_pages;
1614
1615 if (mod->module_core == module_region) {
1616 /* Set core as NX+RW */
1617 total_pages = MOD_NUMBER_OF_PAGES(mod->module_core, mod->core_size);
1618 set_memory_nx((unsigned long)mod->module_core, total_pages);
1619 set_memory_rw((unsigned long)mod->module_core, total_pages);
1620
1621 } else if (mod->module_init == module_region) {
1622 /* Set init as NX+RW */
1623 total_pages = MOD_NUMBER_OF_PAGES(mod->module_init, mod->init_size);
1624 set_memory_nx((unsigned long)mod->module_init, total_pages);
1625 set_memory_rw((unsigned long)mod->module_init, total_pages);
1626 }
1627}
1628
1629/* Iterate through all modules and set each module's text as RW */
1630void set_all_modules_text_rw()
1631{
1632 struct module *mod;
1633
1634 mutex_lock(&module_mutex);
1635 list_for_each_entry_rcu(mod, &modules, list) {
1636 if ((mod->module_core) && (mod->core_text_size)) {
1637 set_page_attributes(mod->module_core,
1638 mod->module_core + mod->core_text_size,
1639 set_memory_rw);
1640 }
1641 if ((mod->module_init) && (mod->init_text_size)) {
1642 set_page_attributes(mod->module_init,
1643 mod->module_init + mod->init_text_size,
1644 set_memory_rw);
1645 }
1646 }
1647 mutex_unlock(&module_mutex);
1648}
1649
1650/* Iterate through all modules and set each module's text as RO */
1651void set_all_modules_text_ro()
1652{
1653 struct module *mod;
1654
1655 mutex_lock(&module_mutex);
1656 list_for_each_entry_rcu(mod, &modules, list) {
1657 if ((mod->module_core) && (mod->core_text_size)) {
1658 set_page_attributes(mod->module_core,
1659 mod->module_core + mod->core_text_size,
1660 set_memory_ro);
1661 }
1662 if ((mod->module_init) && (mod->init_text_size)) {
1663 set_page_attributes(mod->module_init,
1664 mod->module_init + mod->init_text_size,
1665 set_memory_ro);
1666 }
1667 }
1668 mutex_unlock(&module_mutex);
1669}
1670#else
1671static inline void set_section_ro_nx(void *base, unsigned long text_size, unsigned long ro_size, unsigned long total_size) { }
1672static inline void unset_section_ro_nx(struct module *mod, void *module_region) { }
1673#endif
1674
1545/* Free a module, remove from lists, etc. */ 1675/* Free a module, remove from lists, etc. */
1546static void free_module(struct module *mod) 1676static void free_module(struct module *mod)
1547{ 1677{
@@ -1566,6 +1696,7 @@ static void free_module(struct module *mod)
1566 destroy_params(mod->kp, mod->num_kp); 1696 destroy_params(mod->kp, mod->num_kp);
1567 1697
1568 /* This may be NULL, but that's OK */ 1698 /* This may be NULL, but that's OK */
1699 unset_section_ro_nx(mod, mod->module_init);
1569 module_free(mod, mod->module_init); 1700 module_free(mod, mod->module_init);
1570 kfree(mod->args); 1701 kfree(mod->args);
1571 percpu_modfree(mod); 1702 percpu_modfree(mod);
@@ -1574,6 +1705,7 @@ static void free_module(struct module *mod)
1574 lockdep_free_key_range(mod->module_core, mod->core_size); 1705 lockdep_free_key_range(mod->module_core, mod->core_size);
1575 1706
1576 /* Finally, free the core (containing the module structure) */ 1707 /* Finally, free the core (containing the module structure) */
1708 unset_section_ro_nx(mod, mod->module_core);
1577 module_free(mod, mod->module_core); 1709 module_free(mod, mod->module_core);
1578 1710
1579#ifdef CONFIG_MPU 1711#ifdef CONFIG_MPU
@@ -1777,8 +1909,19 @@ static void layout_sections(struct module *mod, struct load_info *info)
1777 s->sh_entsize = get_offset(mod, &mod->core_size, s, i); 1909 s->sh_entsize = get_offset(mod, &mod->core_size, s, i);
1778 DEBUGP("\t%s\n", name); 1910 DEBUGP("\t%s\n", name);
1779 } 1911 }
1780 if (m == 0) 1912 switch (m) {
1913 case 0: /* executable */
1914 mod->core_size = debug_align(mod->core_size);
1781 mod->core_text_size = mod->core_size; 1915 mod->core_text_size = mod->core_size;
1916 break;
1917 case 1: /* RO: text and ro-data */
1918 mod->core_size = debug_align(mod->core_size);
1919 mod->core_ro_size = mod->core_size;
1920 break;
1921 case 3: /* whole core */
1922 mod->core_size = debug_align(mod->core_size);
1923 break;
1924 }
1782 } 1925 }
1783 1926
1784 DEBUGP("Init section allocation order:\n"); 1927 DEBUGP("Init section allocation order:\n");
@@ -1796,8 +1939,19 @@ static void layout_sections(struct module *mod, struct load_info *info)
1796 | INIT_OFFSET_MASK); 1939 | INIT_OFFSET_MASK);
1797 DEBUGP("\t%s\n", sname); 1940 DEBUGP("\t%s\n", sname);
1798 } 1941 }
1799 if (m == 0) 1942 switch (m) {
1943 case 0: /* executable */
1944 mod->init_size = debug_align(mod->init_size);
1800 mod->init_text_size = mod->init_size; 1945 mod->init_text_size = mod->init_size;
1946 break;
1947 case 1: /* RO: text and ro-data */
1948 mod->init_size = debug_align(mod->init_size);
1949 mod->init_ro_size = mod->init_size;
1950 break;
1951 case 3: /* whole init */
1952 mod->init_size = debug_align(mod->init_size);
1953 break;
1954 }
1801 } 1955 }
1802} 1956}
1803 1957
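With CONFIG_DEBUG_SET_MODULE_RONX=y, the switches above page-align the running core/init size at each group boundary, so text, RO data, and the remaining data each start on their own page. A rough user-space mirror of that bookkeeping for the core layout, using made-up section sizes and an assumed 4 KiB page (group 2, the "small" sections, is skipped here for brevity):

#include <stdio.h>

#define PAGE_SIZE	4096UL				/* assumed */
#define ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))
#define debug_align(x)	ALIGN((x), PAGE_SIZE)		/* RONX-enabled case */

int main(void)
{
	unsigned long core_size = 0;
	unsigned long core_text_size, core_ro_size;

	core_size += 0x1234;			/* group 0: executable sections */
	core_size = debug_align(core_size);
	core_text_size = core_size;		/* 0x2000: text ends page-aligned */

	core_size += 0x80;			/* group 1: read-only data */
	core_size = debug_align(core_size);
	core_ro_size = core_size;		/* 0x3000: RO region page-aligned */

	core_size += 0x400;			/* group 3: remaining (RW) data */
	core_size = debug_align(core_size);	/* 0x4000: whole core */

	printf("text=%#lx ro=%#lx core=%#lx\n",
	       core_text_size, core_ro_size, core_size);
	return 0;
}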
@@ -2326,6 +2480,18 @@ static void find_module_sections(struct module *mod, struct load_info *info)
2326 kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) * 2480 kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) *
2327 mod->num_trace_events, GFP_KERNEL); 2481 mod->num_trace_events, GFP_KERNEL);
2328#endif 2482#endif
2483#ifdef CONFIG_TRACING
2484 mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt",
2485 sizeof(*mod->trace_bprintk_fmt_start),
2486 &mod->num_trace_bprintk_fmt);
2487 /*
2488 * This section contains pointers to allocated objects in the trace
2489 * code and not scanning it leads to false positives.
2490 */
2491 kmemleak_scan_area(mod->trace_bprintk_fmt_start,
2492 sizeof(*mod->trace_bprintk_fmt_start) *
2493 mod->num_trace_bprintk_fmt, GFP_KERNEL);
2494#endif
2329#ifdef CONFIG_FTRACE_MCOUNT_RECORD 2495#ifdef CONFIG_FTRACE_MCOUNT_RECORD
2330 /* sechdrs[0].sh_size is always zero */ 2496 /* sechdrs[0].sh_size is always zero */
2331 mod->ftrace_callsites = section_objs(info, "__mcount_loc", 2497 mod->ftrace_callsites = section_objs(info, "__mcount_loc",
@@ -2710,6 +2876,18 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2710 blocking_notifier_call_chain(&module_notify_list, 2876 blocking_notifier_call_chain(&module_notify_list,
2711 MODULE_STATE_COMING, mod); 2877 MODULE_STATE_COMING, mod);
2712 2878
2879 /* Set RO and NX regions for core */
2880 set_section_ro_nx(mod->module_core,
2881 mod->core_text_size,
2882 mod->core_ro_size,
2883 mod->core_size);
2884
2885 /* Set RO and NX regions for init */
2886 set_section_ro_nx(mod->module_init,
2887 mod->init_text_size,
2888 mod->init_ro_size,
2889 mod->init_size);
2890
2713 do_mod_ctors(mod); 2891 do_mod_ctors(mod);
2714 /* Start the module */ 2892 /* Start the module */
2715 if (mod->init != NULL) 2893 if (mod->init != NULL)
@@ -2753,6 +2931,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2753 mod->symtab = mod->core_symtab; 2931 mod->symtab = mod->core_symtab;
2754 mod->strtab = mod->core_strtab; 2932 mod->strtab = mod->core_strtab;
2755#endif 2933#endif
2934 unset_section_ro_nx(mod, mod->module_init);
2756 module_free(mod, mod->module_init); 2935 module_free(mod, mod->module_init);
2757 mod->module_init = NULL; 2936 mod->module_init = NULL;
2758 mod->init_size = 0; 2937 mod->init_size = 0;
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 200407c1502f..a5889fb28ecf 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -199,7 +199,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
199 * memory barriers as we'll eventually observe the right 199 * memory barriers as we'll eventually observe the right
200 * values at the cost of a few extra spins. 200 * values at the cost of a few extra spins.
201 */ 201 */
202 cpu_relax(); 202 arch_mutex_cpu_relax();
203 } 203 }
204#endif 204#endif
205 spin_lock_mutex(&lock->wait_lock, flags); 205 spin_lock_mutex(&lock->wait_lock, flags);
diff --git a/kernel/panic.c b/kernel/panic.c
index 4c13b1a88ebb..991bb87a1704 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -34,6 +34,7 @@ static int pause_on_oops_flag;
34static DEFINE_SPINLOCK(pause_on_oops_lock); 34static DEFINE_SPINLOCK(pause_on_oops_lock);
35 35
36int panic_timeout; 36int panic_timeout;
37EXPORT_SYMBOL_GPL(panic_timeout);
37 38
38ATOMIC_NOTIFIER_HEAD(panic_notifier_list); 39ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
39 40
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 517d827f4982..84522c796987 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -13,6 +13,7 @@
13#include <linux/mm.h> 13#include <linux/mm.h>
14#include <linux/cpu.h> 14#include <linux/cpu.h>
15#include <linux/smp.h> 15#include <linux/smp.h>
16#include <linux/idr.h>
16#include <linux/file.h> 17#include <linux/file.h>
17#include <linux/poll.h> 18#include <linux/poll.h>
18#include <linux/slab.h> 19#include <linux/slab.h>
@@ -21,7 +22,9 @@
21#include <linux/dcache.h> 22#include <linux/dcache.h>
22#include <linux/percpu.h> 23#include <linux/percpu.h>
23#include <linux/ptrace.h> 24#include <linux/ptrace.h>
25#include <linux/reboot.h>
24#include <linux/vmstat.h> 26#include <linux/vmstat.h>
27#include <linux/device.h>
25#include <linux/vmalloc.h> 28#include <linux/vmalloc.h>
26#include <linux/hardirq.h> 29#include <linux/hardirq.h>
27#include <linux/rculist.h> 30#include <linux/rculist.h>
@@ -31,9 +34,16 @@
31#include <linux/kernel_stat.h> 34#include <linux/kernel_stat.h>
32#include <linux/perf_event.h> 35#include <linux/perf_event.h>
33#include <linux/ftrace_event.h> 36#include <linux/ftrace_event.h>
37#include <linux/hw_breakpoint.h>
34 38
35#include <asm/irq_regs.h> 39#include <asm/irq_regs.h>
36 40
41enum event_type_t {
42 EVENT_FLEXIBLE = 0x1,
43 EVENT_PINNED = 0x2,
44 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
45};
46
37atomic_t perf_task_events __read_mostly; 47atomic_t perf_task_events __read_mostly;
38static atomic_t nr_mmap_events __read_mostly; 48static atomic_t nr_mmap_events __read_mostly;
39static atomic_t nr_comm_events __read_mostly; 49static atomic_t nr_comm_events __read_mostly;
@@ -61,6 +71,12 @@ int sysctl_perf_event_sample_rate __read_mostly = 100000;
61 71
62static atomic64_t perf_event_id; 72static atomic64_t perf_event_id;
63 73
74static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
75 enum event_type_t event_type);
76
77static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
78 enum event_type_t event_type);
79
64void __weak perf_event_print_debug(void) { } 80void __weak perf_event_print_debug(void) { }
65 81
66extern __weak const char *perf_pmu_name(void) 82extern __weak const char *perf_pmu_name(void)
@@ -68,6 +84,11 @@ extern __weak const char *perf_pmu_name(void)
68 return "pmu"; 84 return "pmu";
69} 85}
70 86
87static inline u64 perf_clock(void)
88{
89 return local_clock();
90}
91
71void perf_pmu_disable(struct pmu *pmu) 92void perf_pmu_disable(struct pmu *pmu)
72{ 93{
73 int *count = this_cpu_ptr(pmu->pmu_disable_count); 94 int *count = this_cpu_ptr(pmu->pmu_disable_count);
@@ -132,6 +153,28 @@ static void unclone_ctx(struct perf_event_context *ctx)
132 } 153 }
133} 154}
134 155
156static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
157{
158 /*
159 * only top level events have the pid namespace they were created in
160 */
161 if (event->parent)
162 event = event->parent;
163
164 return task_tgid_nr_ns(p, event->ns);
165}
166
167static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
168{
169 /*
170 * only top level events have the pid namespace they were created in
171 */
172 if (event->parent)
173 event = event->parent;
174
175 return task_pid_nr_ns(p, event->ns);
176}
177
135/* 178/*
136 * If we inherit events we want to return the parent event id 179 * If we inherit events we want to return the parent event id
137 * to userspace. 180 * to userspace.
@@ -214,11 +257,6 @@ static void perf_unpin_context(struct perf_event_context *ctx)
214 put_ctx(ctx); 257 put_ctx(ctx);
215} 258}
216 259
217static inline u64 perf_clock(void)
218{
219 return local_clock();
220}
221
222/* 260/*
223 * Update the record of the current time in a context. 261 * Update the record of the current time in a context.
224 */ 262 */
@@ -230,6 +268,12 @@ static void update_context_time(struct perf_event_context *ctx)
230 ctx->timestamp = now; 268 ctx->timestamp = now;
231} 269}
232 270
271static u64 perf_event_time(struct perf_event *event)
272{
273 struct perf_event_context *ctx = event->ctx;
274 return ctx ? ctx->time : 0;
275}
276
233/* 277/*
234 * Update the total_time_enabled and total_time_running fields for a event. 278 * Update the total_time_enabled and total_time_running fields for a event.
235 */ 279 */
@@ -243,7 +287,7 @@ static void update_event_times(struct perf_event *event)
243 return; 287 return;
244 288
245 if (ctx->is_active) 289 if (ctx->is_active)
246 run_end = ctx->time; 290 run_end = perf_event_time(event);
247 else 291 else
248 run_end = event->tstamp_stopped; 292 run_end = event->tstamp_stopped;
249 293
@@ -252,7 +296,7 @@ static void update_event_times(struct perf_event *event)
252 if (event->state == PERF_EVENT_STATE_INACTIVE) 296 if (event->state == PERF_EVENT_STATE_INACTIVE)
253 run_end = event->tstamp_stopped; 297 run_end = event->tstamp_stopped;
254 else 298 else
255 run_end = ctx->time; 299 run_end = perf_event_time(event);
256 300
257 event->total_time_running = run_end - event->tstamp_running; 301 event->total_time_running = run_end - event->tstamp_running;
258} 302}
@@ -311,9 +355,84 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
311 ctx->nr_stat++; 355 ctx->nr_stat++;
312} 356}
313 357
358/*
359 * Called at perf_event creation and when events are attached/detached from a
360 * group.
361 */
362static void perf_event__read_size(struct perf_event *event)
363{
364 int entry = sizeof(u64); /* value */
365 int size = 0;
366 int nr = 1;
367
368 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
369 size += sizeof(u64);
370
371 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
372 size += sizeof(u64);
373
374 if (event->attr.read_format & PERF_FORMAT_ID)
375 entry += sizeof(u64);
376
377 if (event->attr.read_format & PERF_FORMAT_GROUP) {
378 nr += event->group_leader->nr_siblings;
379 size += sizeof(u64);
380 }
381
382 size += entry * nr;
383 event->read_size = size;
384}
385
386static void perf_event__header_size(struct perf_event *event)
387{
388 struct perf_sample_data *data;
389 u64 sample_type = event->attr.sample_type;
390 u16 size = 0;
391
392 perf_event__read_size(event);
393
394 if (sample_type & PERF_SAMPLE_IP)
395 size += sizeof(data->ip);
396
397 if (sample_type & PERF_SAMPLE_ADDR)
398 size += sizeof(data->addr);
399
400 if (sample_type & PERF_SAMPLE_PERIOD)
401 size += sizeof(data->period);
402
403 if (sample_type & PERF_SAMPLE_READ)
404 size += event->read_size;
405
406 event->header_size = size;
407}
408
409static void perf_event__id_header_size(struct perf_event *event)
410{
411 struct perf_sample_data *data;
412 u64 sample_type = event->attr.sample_type;
413 u16 size = 0;
414
415 if (sample_type & PERF_SAMPLE_TID)
416 size += sizeof(data->tid_entry);
417
418 if (sample_type & PERF_SAMPLE_TIME)
419 size += sizeof(data->time);
420
421 if (sample_type & PERF_SAMPLE_ID)
422 size += sizeof(data->id);
423
424 if (sample_type & PERF_SAMPLE_STREAM_ID)
425 size += sizeof(data->stream_id);
426
427 if (sample_type & PERF_SAMPLE_CPU)
428 size += sizeof(data->cpu_entry);
429
430 event->id_header_size = size;
431}
432
314static void perf_group_attach(struct perf_event *event) 433static void perf_group_attach(struct perf_event *event)
315{ 434{
316 struct perf_event *group_leader = event->group_leader; 435 struct perf_event *group_leader = event->group_leader, *pos;
317 436
318 /* 437 /*
319 * We can have double attach due to group movement in perf_event_open. 438 * We can have double attach due to group movement in perf_event_open.
@@ -332,6 +451,11 @@ static void perf_group_attach(struct perf_event *event)
332 451
333 list_add_tail(&event->group_entry, &group_leader->sibling_list); 452 list_add_tail(&event->group_entry, &group_leader->sibling_list);
334 group_leader->nr_siblings++; 453 group_leader->nr_siblings++;
454
455 perf_event__header_size(group_leader);
456
457 list_for_each_entry(pos, &group_leader->sibling_list, group_entry)
458 perf_event__header_size(pos);
335} 459}
336 460
337/* 461/*
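The cached event->read_size computed above replaces the old on-demand perf_event_read_size(). A user-space mirror of the same arithmetic, with assumed read_format bits and sibling count (none of this is kernel code; the bit names are local stand-ins):

#include <stdio.h>
#include <stdint.h>

#define FMT_TOTAL_TIME_ENABLED	(1U << 0)
#define FMT_TOTAL_TIME_RUNNING	(1U << 1)
#define FMT_ID			(1U << 2)
#define FMT_GROUP		(1U << 3)

/* Mirrors perf_event__read_size() from the hunk above. */
static int read_size(uint64_t read_format, int nr_siblings)
{
	int entry = sizeof(uint64_t);	/* value */
	int size = 0;
	int nr = 1;

	if (read_format & FMT_TOTAL_TIME_ENABLED)
		size += sizeof(uint64_t);
	if (read_format & FMT_TOTAL_TIME_RUNNING)
		size += sizeof(uint64_t);
	if (read_format & FMT_ID)
		entry += sizeof(uint64_t);
	if (read_format & FMT_GROUP) {
		nr += nr_siblings;
		size += sizeof(uint64_t);	/* the 'nr' field itself */
	}
	return size + entry * nr;
}

int main(void)
{
	/* Leader with two siblings, times + id + group: 8+8+8 + 16*3 = 72. */
	printf("%d\n", read_size(FMT_TOTAL_TIME_ENABLED | FMT_TOTAL_TIME_RUNNING |
				 FMT_ID | FMT_GROUP, 2));

	/* Plain counter, no extra format bits: a single 8-byte value. */
	printf("%d\n", read_size(0, 0));
	return 0;
}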
@@ -390,7 +514,7 @@ static void perf_group_detach(struct perf_event *event)
390 if (event->group_leader != event) { 514 if (event->group_leader != event) {
391 list_del_init(&event->group_entry); 515 list_del_init(&event->group_entry);
392 event->group_leader->nr_siblings--; 516 event->group_leader->nr_siblings--;
393 return; 517 goto out;
394 } 518 }
395 519
396 if (!list_empty(&event->group_entry)) 520 if (!list_empty(&event->group_entry))
@@ -409,6 +533,12 @@ static void perf_group_detach(struct perf_event *event)
409 /* Inherit group flags from the previous leader */ 533 /* Inherit group flags from the previous leader */
410 sibling->group_flags = event->group_flags; 534 sibling->group_flags = event->group_flags;
411 } 535 }
536
537out:
538 perf_event__header_size(event->group_leader);
539
540 list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry)
541 perf_event__header_size(tmp);
412} 542}
413 543
414static inline int 544static inline int
@@ -422,6 +552,7 @@ event_sched_out(struct perf_event *event,
422 struct perf_cpu_context *cpuctx, 552 struct perf_cpu_context *cpuctx,
423 struct perf_event_context *ctx) 553 struct perf_event_context *ctx)
424{ 554{
555 u64 tstamp = perf_event_time(event);
425 u64 delta; 556 u64 delta;
426 /* 557 /*
427 * An event which could not be activated because of 558 * An event which could not be activated because of
@@ -433,7 +564,7 @@ event_sched_out(struct perf_event *event,
433 && !event_filter_match(event)) { 564 && !event_filter_match(event)) {
434 delta = ctx->time - event->tstamp_stopped; 565 delta = ctx->time - event->tstamp_stopped;
435 event->tstamp_running += delta; 566 event->tstamp_running += delta;
436 event->tstamp_stopped = ctx->time; 567 event->tstamp_stopped = tstamp;
437 } 568 }
438 569
439 if (event->state != PERF_EVENT_STATE_ACTIVE) 570 if (event->state != PERF_EVENT_STATE_ACTIVE)
@@ -444,7 +575,7 @@ event_sched_out(struct perf_event *event,
444 event->pending_disable = 0; 575 event->pending_disable = 0;
445 event->state = PERF_EVENT_STATE_OFF; 576 event->state = PERF_EVENT_STATE_OFF;
446 } 577 }
447 event->tstamp_stopped = ctx->time; 578 event->tstamp_stopped = tstamp;
448 event->pmu->del(event, 0); 579 event->pmu->del(event, 0);
449 event->oncpu = -1; 580 event->oncpu = -1;
450 581
@@ -656,6 +787,8 @@ event_sched_in(struct perf_event *event,
656 struct perf_cpu_context *cpuctx, 787 struct perf_cpu_context *cpuctx,
657 struct perf_event_context *ctx) 788 struct perf_event_context *ctx)
658{ 789{
790 u64 tstamp = perf_event_time(event);
791
659 if (event->state <= PERF_EVENT_STATE_OFF) 792 if (event->state <= PERF_EVENT_STATE_OFF)
660 return 0; 793 return 0;
661 794
@@ -672,7 +805,9 @@ event_sched_in(struct perf_event *event,
672 return -EAGAIN; 805 return -EAGAIN;
673 } 806 }
674 807
675 event->tstamp_running += ctx->time - event->tstamp_stopped; 808 event->tstamp_running += tstamp - event->tstamp_stopped;
809
810 event->shadow_ctx_time = tstamp - ctx->timestamp;
676 811
677 if (!is_software_event(event)) 812 if (!is_software_event(event))
678 cpuctx->active_oncpu++; 813 cpuctx->active_oncpu++;
@@ -784,11 +919,13 @@ static int group_can_go_on(struct perf_event *event,
784static void add_event_to_ctx(struct perf_event *event, 919static void add_event_to_ctx(struct perf_event *event,
785 struct perf_event_context *ctx) 920 struct perf_event_context *ctx)
786{ 921{
922 u64 tstamp = perf_event_time(event);
923
787 list_add_event(event, ctx); 924 list_add_event(event, ctx);
788 perf_group_attach(event); 925 perf_group_attach(event);
789 event->tstamp_enabled = ctx->time; 926 event->tstamp_enabled = tstamp;
790 event->tstamp_running = ctx->time; 927 event->tstamp_running = tstamp;
791 event->tstamp_stopped = ctx->time; 928 event->tstamp_stopped = tstamp;
792} 929}
793 930
794/* 931/*
@@ -823,7 +960,7 @@ static void __perf_install_in_context(void *info)
823 960
824 add_event_to_ctx(event, ctx); 961 add_event_to_ctx(event, ctx);
825 962
826 if (event->cpu != -1 && event->cpu != smp_processor_id()) 963 if (!event_filter_match(event))
827 goto unlock; 964 goto unlock;
828 965
829 /* 966 /*
@@ -928,14 +1065,13 @@ static void __perf_event_mark_enabled(struct perf_event *event,
928 struct perf_event_context *ctx) 1065 struct perf_event_context *ctx)
929{ 1066{
930 struct perf_event *sub; 1067 struct perf_event *sub;
1068 u64 tstamp = perf_event_time(event);
931 1069
932 event->state = PERF_EVENT_STATE_INACTIVE; 1070 event->state = PERF_EVENT_STATE_INACTIVE;
933 event->tstamp_enabled = ctx->time - event->total_time_enabled; 1071 event->tstamp_enabled = tstamp - event->total_time_enabled;
934 list_for_each_entry(sub, &event->sibling_list, group_entry) { 1072 list_for_each_entry(sub, &event->sibling_list, group_entry) {
935 if (sub->state >= PERF_EVENT_STATE_INACTIVE) { 1073 if (sub->state >= PERF_EVENT_STATE_INACTIVE)
936 sub->tstamp_enabled = 1074 sub->tstamp_enabled = tstamp - sub->total_time_enabled;
937 ctx->time - sub->total_time_enabled;
938 }
939 } 1075 }
940} 1076}
941 1077
@@ -968,7 +1104,7 @@ static void __perf_event_enable(void *info)
968 goto unlock; 1104 goto unlock;
969 __perf_event_mark_enabled(event, ctx); 1105 __perf_event_mark_enabled(event, ctx);
970 1106
971 if (event->cpu != -1 && event->cpu != smp_processor_id()) 1107 if (!event_filter_match(event))
972 goto unlock; 1108 goto unlock;
973 1109
974 /* 1110 /*
@@ -1070,7 +1206,7 @@ static int perf_event_refresh(struct perf_event *event, int refresh)
1070 /* 1206 /*
1071 * not supported on inherited events 1207 * not supported on inherited events
1072 */ 1208 */
1073 if (event->attr.inherit) 1209 if (event->attr.inherit || !is_sampling_event(event))
1074 return -EINVAL; 1210 return -EINVAL;
1075 1211
1076 atomic_add(refresh, &event->event_limit); 1212 atomic_add(refresh, &event->event_limit);
@@ -1079,12 +1215,6 @@ static int perf_event_refresh(struct perf_event *event, int refresh)
1079 return 0; 1215 return 0;
1080} 1216}
1081 1217
1082enum event_type_t {
1083 EVENT_FLEXIBLE = 0x1,
1084 EVENT_PINNED = 0x2,
1085 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
1086};
1087
1088static void ctx_sched_out(struct perf_event_context *ctx, 1218static void ctx_sched_out(struct perf_event_context *ctx,
1089 struct perf_cpu_context *cpuctx, 1219 struct perf_cpu_context *cpuctx,
1090 enum event_type_t event_type) 1220 enum event_type_t event_type)
@@ -1284,8 +1414,6 @@ void __perf_event_task_sched_out(struct task_struct *task,
1284{ 1414{
1285 int ctxn; 1415 int ctxn;
1286 1416
1287 perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);
1288
1289 for_each_task_context_nr(ctxn) 1417 for_each_task_context_nr(ctxn)
1290 perf_event_context_sched_out(task, ctxn, next); 1418 perf_event_context_sched_out(task, ctxn, next);
1291} 1419}
@@ -1323,7 +1451,7 @@ ctx_pinned_sched_in(struct perf_event_context *ctx,
1323 list_for_each_entry(event, &ctx->pinned_groups, group_entry) { 1451 list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
1324 if (event->state <= PERF_EVENT_STATE_OFF) 1452 if (event->state <= PERF_EVENT_STATE_OFF)
1325 continue; 1453 continue;
1326 if (event->cpu != -1 && event->cpu != smp_processor_id()) 1454 if (!event_filter_match(event))
1327 continue; 1455 continue;
1328 1456
1329 if (group_can_go_on(event, cpuctx, 1)) 1457 if (group_can_go_on(event, cpuctx, 1))
@@ -1355,7 +1483,7 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
1355 * Listen to the 'cpu' scheduling filter constraint 1483 * Listen to the 'cpu' scheduling filter constraint
1356 * of events: 1484 * of events:
1357 */ 1485 */
1358 if (event->cpu != -1 && event->cpu != smp_processor_id()) 1486 if (!event_filter_match(event))
1359 continue; 1487 continue;
1360 1488
1361 if (group_can_go_on(event, cpuctx, can_add_hw)) { 1489 if (group_can_go_on(event, cpuctx, can_add_hw)) {
@@ -1582,7 +1710,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
1582 if (event->state != PERF_EVENT_STATE_ACTIVE) 1710 if (event->state != PERF_EVENT_STATE_ACTIVE)
1583 continue; 1711 continue;
1584 1712
1585 if (event->cpu != -1 && event->cpu != smp_processor_id()) 1713 if (!event_filter_match(event))
1586 continue; 1714 continue;
1587 1715
1588 hwc = &event->hw; 1716 hwc = &event->hw;
@@ -1619,8 +1747,12 @@ static void rotate_ctx(struct perf_event_context *ctx)
1619{ 1747{
1620 raw_spin_lock(&ctx->lock); 1748 raw_spin_lock(&ctx->lock);
1621 1749
1622 /* Rotate the first entry last of non-pinned groups */ 1750 /*
1623 list_rotate_left(&ctx->flexible_groups); 1751 * Rotate the first entry last of non-pinned groups. Rotation might be
1752 * disabled by the inheritance code.
1753 */
1754 if (!ctx->rotate_disable)
1755 list_rotate_left(&ctx->flexible_groups);
1624 1756
1625 raw_spin_unlock(&ctx->lock); 1757 raw_spin_unlock(&ctx->lock);
1626} 1758}
@@ -2096,14 +2228,11 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
2096 unsigned long flags; 2228 unsigned long flags;
2097 int ctxn, err; 2229 int ctxn, err;
2098 2230
2099 if (!task && cpu != -1) { 2231 if (!task) {
2100 /* Must be root to operate on a CPU event: */ 2232 /* Must be root to operate on a CPU event: */
2101 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) 2233 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
2102 return ERR_PTR(-EACCES); 2234 return ERR_PTR(-EACCES);
2103 2235
2104 if (cpu < 0 || cpu >= nr_cpumask_bits)
2105 return ERR_PTR(-EINVAL);
2106
2107 /* 2236 /*
2108 * We could be clever and allow to attach a event to an 2237 * We could be clever and allow to attach a event to an
2109 * offline CPU and activate it when the CPU comes up, but 2238 * offline CPU and activate it when the CPU comes up, but
@@ -2232,11 +2361,6 @@ int perf_event_release_kernel(struct perf_event *event)
2232 raw_spin_unlock_irq(&ctx->lock); 2361 raw_spin_unlock_irq(&ctx->lock);
2233 mutex_unlock(&ctx->mutex); 2362 mutex_unlock(&ctx->mutex);
2234 2363
2235 mutex_lock(&event->owner->perf_event_mutex);
2236 list_del_init(&event->owner_entry);
2237 mutex_unlock(&event->owner->perf_event_mutex);
2238 put_task_struct(event->owner);
2239
2240 free_event(event); 2364 free_event(event);
2241 2365
2242 return 0; 2366 return 0;
@@ -2249,35 +2373,44 @@ EXPORT_SYMBOL_GPL(perf_event_release_kernel);
2249static int perf_release(struct inode *inode, struct file *file) 2373static int perf_release(struct inode *inode, struct file *file)
2250{ 2374{
2251 struct perf_event *event = file->private_data; 2375 struct perf_event *event = file->private_data;
2376 struct task_struct *owner;
2252 2377
2253 file->private_data = NULL; 2378 file->private_data = NULL;
2254 2379
2255 return perf_event_release_kernel(event); 2380 rcu_read_lock();
2256} 2381 owner = ACCESS_ONCE(event->owner);
2257 2382 /*
2258static int perf_event_read_size(struct perf_event *event) 2383 * Matches the smp_wmb() in perf_event_exit_task(). If we observe
2259{ 2384 * !owner it means the list deletion is complete and we can indeed
2260 int entry = sizeof(u64); /* value */ 2385 * free this event, otherwise we need to serialize on
2261 int size = 0; 2386 * owner->perf_event_mutex.
2262 int nr = 1; 2387 */
2263 2388 smp_read_barrier_depends();
2264 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) 2389 if (owner) {
2265 size += sizeof(u64); 2390 /*
2266 2391 * Since delayed_put_task_struct() also drops the last
2267 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) 2392 * task reference we can safely take a new reference
2268 size += sizeof(u64); 2393 * while holding the rcu_read_lock().
2269 2394 */
2270 if (event->attr.read_format & PERF_FORMAT_ID) 2395 get_task_struct(owner);
2271 entry += sizeof(u64);
2272
2273 if (event->attr.read_format & PERF_FORMAT_GROUP) {
2274 nr += event->group_leader->nr_siblings;
2275 size += sizeof(u64);
2276 } 2396 }
2397 rcu_read_unlock();
2277 2398
2278 size += entry * nr; 2399 if (owner) {
2400 mutex_lock(&owner->perf_event_mutex);
2401 /*
2402 * We have to re-check the event->owner field, if it is cleared
2403 * we raced with perf_event_exit_task(), acquiring the mutex
2404 * ensured they're done, and we can proceed with freeing the
2405 * event.
2406 */
2407 if (event->owner)
2408 list_del_init(&event->owner_entry);
2409 mutex_unlock(&owner->perf_event_mutex);
2410 put_task_struct(owner);
2411 }
2279 2412
2280 return size; 2413 return perf_event_release_kernel(event);
2281} 2414}
2282 2415
2283u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) 2416u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
@@ -2394,7 +2527,7 @@ perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
2394 if (event->state == PERF_EVENT_STATE_ERROR) 2527 if (event->state == PERF_EVENT_STATE_ERROR)
2395 return 0; 2528 return 0;
2396 2529
2397 if (count < perf_event_read_size(event)) 2530 if (count < event->read_size)
2398 return -ENOSPC; 2531 return -ENOSPC;
2399 2532
2400 WARN_ON_ONCE(event->ctx->parent_ctx); 2533 WARN_ON_ONCE(event->ctx->parent_ctx);
@@ -2480,7 +2613,7 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
2480 int ret = 0; 2613 int ret = 0;
2481 u64 value; 2614 u64 value;
2482 2615
2483 if (!event->attr.sample_period) 2616 if (!is_sampling_event(event))
2484 return -EINVAL; 2617 return -EINVAL;
2485 2618
2486 if (copy_from_user(&value, arg, sizeof(value))) 2619 if (copy_from_user(&value, arg, sizeof(value)))
@@ -3271,6 +3404,73 @@ __always_inline void perf_output_copy(struct perf_output_handle *handle,
3271 } while (len); 3404 } while (len);
3272} 3405}
3273 3406
3407static void __perf_event_header__init_id(struct perf_event_header *header,
3408 struct perf_sample_data *data,
3409 struct perf_event *event)
3410{
3411 u64 sample_type = event->attr.sample_type;
3412
3413 data->type = sample_type;
3414 header->size += event->id_header_size;
3415
3416 if (sample_type & PERF_SAMPLE_TID) {
3417 /* namespace issues */
3418 data->tid_entry.pid = perf_event_pid(event, current);
3419 data->tid_entry.tid = perf_event_tid(event, current);
3420 }
3421
3422 if (sample_type & PERF_SAMPLE_TIME)
3423 data->time = perf_clock();
3424
3425 if (sample_type & PERF_SAMPLE_ID)
3426 data->id = primary_event_id(event);
3427
3428 if (sample_type & PERF_SAMPLE_STREAM_ID)
3429 data->stream_id = event->id;
3430
3431 if (sample_type & PERF_SAMPLE_CPU) {
3432 data->cpu_entry.cpu = raw_smp_processor_id();
3433 data->cpu_entry.reserved = 0;
3434 }
3435}
3436
3437static void perf_event_header__init_id(struct perf_event_header *header,
3438 struct perf_sample_data *data,
3439 struct perf_event *event)
3440{
3441 if (event->attr.sample_id_all)
3442 __perf_event_header__init_id(header, data, event);
3443}
3444
3445static void __perf_event__output_id_sample(struct perf_output_handle *handle,
3446 struct perf_sample_data *data)
3447{
3448 u64 sample_type = data->type;
3449
3450 if (sample_type & PERF_SAMPLE_TID)
3451 perf_output_put(handle, data->tid_entry);
3452
3453 if (sample_type & PERF_SAMPLE_TIME)
3454 perf_output_put(handle, data->time);
3455
3456 if (sample_type & PERF_SAMPLE_ID)
3457 perf_output_put(handle, data->id);
3458
3459 if (sample_type & PERF_SAMPLE_STREAM_ID)
3460 perf_output_put(handle, data->stream_id);
3461
3462 if (sample_type & PERF_SAMPLE_CPU)
3463 perf_output_put(handle, data->cpu_entry);
3464}
3465
3466static void perf_event__output_id_sample(struct perf_event *event,
3467 struct perf_output_handle *handle,
3468 struct perf_sample_data *sample)
3469{
3470 if (event->attr.sample_id_all)
3471 __perf_event__output_id_sample(handle, sample);
3472}
3473
3274int perf_output_begin(struct perf_output_handle *handle, 3474int perf_output_begin(struct perf_output_handle *handle,
3275 struct perf_event *event, unsigned int size, 3475 struct perf_event *event, unsigned int size,
3276 int nmi, int sample) 3476 int nmi, int sample)
@@ -3278,6 +3478,7 @@ int perf_output_begin(struct perf_output_handle *handle,
3278 struct perf_buffer *buffer; 3478 struct perf_buffer *buffer;
3279 unsigned long tail, offset, head; 3479 unsigned long tail, offset, head;
3280 int have_lost; 3480 int have_lost;
3481 struct perf_sample_data sample_data;
3281 struct { 3482 struct {
3282 struct perf_event_header header; 3483 struct perf_event_header header;
3283 u64 id; 3484 u64 id;
@@ -3304,8 +3505,12 @@ int perf_output_begin(struct perf_output_handle *handle,
3304 goto out; 3505 goto out;
3305 3506
3306 have_lost = local_read(&buffer->lost); 3507 have_lost = local_read(&buffer->lost);
3307 if (have_lost) 3508 if (have_lost) {
3308 size += sizeof(lost_event); 3509 lost_event.header.size = sizeof(lost_event);
3510 perf_event_header__init_id(&lost_event.header, &sample_data,
3511 event);
3512 size += lost_event.header.size;
3513 }
3309 3514
3310 perf_output_get_handle(handle); 3515 perf_output_get_handle(handle);
3311 3516
@@ -3336,11 +3541,11 @@ int perf_output_begin(struct perf_output_handle *handle,
3336 if (have_lost) { 3541 if (have_lost) {
3337 lost_event.header.type = PERF_RECORD_LOST; 3542 lost_event.header.type = PERF_RECORD_LOST;
3338 lost_event.header.misc = 0; 3543 lost_event.header.misc = 0;
3339 lost_event.header.size = sizeof(lost_event);
3340 lost_event.id = event->id; 3544 lost_event.id = event->id;
3341 lost_event.lost = local_xchg(&buffer->lost, 0); 3545 lost_event.lost = local_xchg(&buffer->lost, 0);
3342 3546
3343 perf_output_put(handle, lost_event); 3547 perf_output_put(handle, lost_event);
3548 perf_event__output_id_sample(event, handle, &sample_data);
3344 } 3549 }
3345 3550
3346 return 0; 3551 return 0;
@@ -3373,30 +3578,9 @@ void perf_output_end(struct perf_output_handle *handle)
3373 rcu_read_unlock(); 3578 rcu_read_unlock();
3374} 3579}
3375 3580
3376static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
3377{
3378 /*
3379 * only top level events have the pid namespace they were created in
3380 */
3381 if (event->parent)
3382 event = event->parent;
3383
3384 return task_tgid_nr_ns(p, event->ns);
3385}
3386
3387static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
3388{
3389 /*
3390 * only top level events have the pid namespace they were created in
3391 */
3392 if (event->parent)
3393 event = event->parent;
3394
3395 return task_pid_nr_ns(p, event->ns);
3396}
3397
3398static void perf_output_read_one(struct perf_output_handle *handle, 3581static void perf_output_read_one(struct perf_output_handle *handle,
3399 struct perf_event *event) 3582 struct perf_event *event,
3583 u64 enabled, u64 running)
3400{ 3584{
3401 u64 read_format = event->attr.read_format; 3585 u64 read_format = event->attr.read_format;
3402 u64 values[4]; 3586 u64 values[4];
@@ -3404,11 +3588,11 @@ static void perf_output_read_one(struct perf_output_handle *handle,
3404 3588
3405 values[n++] = perf_event_count(event); 3589 values[n++] = perf_event_count(event);
3406 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { 3590 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
3407 values[n++] = event->total_time_enabled + 3591 values[n++] = enabled +
3408 atomic64_read(&event->child_total_time_enabled); 3592 atomic64_read(&event->child_total_time_enabled);
3409 } 3593 }
3410 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { 3594 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
3411 values[n++] = event->total_time_running + 3595 values[n++] = running +
3412 atomic64_read(&event->child_total_time_running); 3596 atomic64_read(&event->child_total_time_running);
3413 } 3597 }
3414 if (read_format & PERF_FORMAT_ID) 3598 if (read_format & PERF_FORMAT_ID)
@@ -3421,7 +3605,8 @@ static void perf_output_read_one(struct perf_output_handle *handle,
3421 * XXX PERF_FORMAT_GROUP vs inherited events seems difficult. 3605 * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
3422 */ 3606 */
3423static void perf_output_read_group(struct perf_output_handle *handle, 3607static void perf_output_read_group(struct perf_output_handle *handle,
3424 struct perf_event *event) 3608 struct perf_event *event,
3609 u64 enabled, u64 running)
3425{ 3610{
3426 struct perf_event *leader = event->group_leader, *sub; 3611 struct perf_event *leader = event->group_leader, *sub;
3427 u64 read_format = event->attr.read_format; 3612 u64 read_format = event->attr.read_format;
@@ -3431,10 +3616,10 @@ static void perf_output_read_group(struct perf_output_handle *handle,
3431 values[n++] = 1 + leader->nr_siblings; 3616 values[n++] = 1 + leader->nr_siblings;
3432 3617
3433 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) 3618 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
3434 values[n++] = leader->total_time_enabled; 3619 values[n++] = enabled;
3435 3620
3436 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) 3621 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
3437 values[n++] = leader->total_time_running; 3622 values[n++] = running;
3438 3623
3439 if (leader != event) 3624 if (leader != event)
3440 leader->pmu->read(leader); 3625 leader->pmu->read(leader);
@@ -3459,13 +3644,35 @@ static void perf_output_read_group(struct perf_output_handle *handle,
3459 } 3644 }
3460} 3645}
3461 3646
3647#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
3648 PERF_FORMAT_TOTAL_TIME_RUNNING)
3649
3462static void perf_output_read(struct perf_output_handle *handle, 3650static void perf_output_read(struct perf_output_handle *handle,
3463 struct perf_event *event) 3651 struct perf_event *event)
3464{ 3652{
3653 u64 enabled = 0, running = 0, now, ctx_time;
3654 u64 read_format = event->attr.read_format;
3655
3656 /*
3657 * compute total_time_enabled, total_time_running
3658 * based on snapshot values taken when the event
3659 * was last scheduled in.
3660 *
 3661	 * we cannot simply call update_context_time()
 3662	 * because of locking issues, as we are called in
 3663	 * NMI context
3664 */
3665 if (read_format & PERF_FORMAT_TOTAL_TIMES) {
3666 now = perf_clock();
3667 ctx_time = event->shadow_ctx_time + now;
3668 enabled = ctx_time - event->tstamp_enabled;
3669 running = ctx_time - event->tstamp_running;
3670 }
3671
3465 if (event->attr.read_format & PERF_FORMAT_GROUP) 3672 if (event->attr.read_format & PERF_FORMAT_GROUP)
3466 perf_output_read_group(handle, event); 3673 perf_output_read_group(handle, event, enabled, running);
3467 else 3674 else
3468 perf_output_read_one(handle, event); 3675 perf_output_read_one(handle, event, enabled, running);
3469} 3676}
3470 3677
3471void perf_output_sample(struct perf_output_handle *handle, 3678void perf_output_sample(struct perf_output_handle *handle,
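perf_output_read() above reconstructs enabled/running from event->shadow_ctx_time because update_context_time() cannot be called from NMI context. A small numeric mirror of that reconstruction, with entirely made-up timestamps (illustration only, not kernel code):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Hypothetical nanosecond values, all invented for illustration. */
	uint64_t ctx_timestamp	 = 1000;  /* ctx->timestamp (raw clock) at sched-in */
	uint64_t ctx_time_in	 = 1300;  /* ctx->time (accumulated) at sched-in    */
	uint64_t tstamp_enabled	 = 200;
	uint64_t tstamp_running	 = 1300;
	uint64_t now		 = 5000;  /* perf_clock() inside the NMI            */

	/* event->shadow_ctx_time is recorded in event_sched_in(). */
	uint64_t shadow_ctx_time = ctx_time_in - ctx_timestamp;	/* 300 */

	/* perf_output_read(): approximate ctx->time without taking ctx->lock. */
	uint64_t ctx_time = shadow_ctx_time + now;		/* 5300 */
	uint64_t enabled  = ctx_time - tstamp_enabled;		/* 5100 */
	uint64_t running  = ctx_time - tstamp_running;		/* 4000 */

	printf("enabled=%llu running=%llu\n",
	       (unsigned long long)enabled, (unsigned long long)running);
	return 0;
}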
@@ -3545,61 +3752,16 @@ void perf_prepare_sample(struct perf_event_header *header,
3545{ 3752{
3546 u64 sample_type = event->attr.sample_type; 3753 u64 sample_type = event->attr.sample_type;
3547 3754
3548 data->type = sample_type;
3549
3550 header->type = PERF_RECORD_SAMPLE; 3755 header->type = PERF_RECORD_SAMPLE;
3551 header->size = sizeof(*header); 3756 header->size = sizeof(*header) + event->header_size;
3552 3757
3553 header->misc = 0; 3758 header->misc = 0;
3554 header->misc |= perf_misc_flags(regs); 3759 header->misc |= perf_misc_flags(regs);
3555 3760
3556 if (sample_type & PERF_SAMPLE_IP) { 3761 __perf_event_header__init_id(header, data, event);
3557 data->ip = perf_instruction_pointer(regs);
3558 3762
3559 header->size += sizeof(data->ip); 3763 if (sample_type & PERF_SAMPLE_IP)
3560 } 3764 data->ip = perf_instruction_pointer(regs);
3561
3562 if (sample_type & PERF_SAMPLE_TID) {
3563 /* namespace issues */
3564 data->tid_entry.pid = perf_event_pid(event, current);
3565 data->tid_entry.tid = perf_event_tid(event, current);
3566
3567 header->size += sizeof(data->tid_entry);
3568 }
3569
3570 if (sample_type & PERF_SAMPLE_TIME) {
3571 data->time = perf_clock();
3572
3573 header->size += sizeof(data->time);
3574 }
3575
3576 if (sample_type & PERF_SAMPLE_ADDR)
3577 header->size += sizeof(data->addr);
3578
3579 if (sample_type & PERF_SAMPLE_ID) {
3580 data->id = primary_event_id(event);
3581
3582 header->size += sizeof(data->id);
3583 }
3584
3585 if (sample_type & PERF_SAMPLE_STREAM_ID) {
3586 data->stream_id = event->id;
3587
3588 header->size += sizeof(data->stream_id);
3589 }
3590
3591 if (sample_type & PERF_SAMPLE_CPU) {
3592 data->cpu_entry.cpu = raw_smp_processor_id();
3593 data->cpu_entry.reserved = 0;
3594
3595 header->size += sizeof(data->cpu_entry);
3596 }
3597
3598 if (sample_type & PERF_SAMPLE_PERIOD)
3599 header->size += sizeof(data->period);
3600
3601 if (sample_type & PERF_SAMPLE_READ)
3602 header->size += perf_event_read_size(event);
3603 3765
3604 if (sample_type & PERF_SAMPLE_CALLCHAIN) { 3766 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
3605 int size = 1; 3767 int size = 1;
@@ -3664,23 +3826,26 @@ perf_event_read_event(struct perf_event *event,
3664 struct task_struct *task) 3826 struct task_struct *task)
3665{ 3827{
3666 struct perf_output_handle handle; 3828 struct perf_output_handle handle;
3829 struct perf_sample_data sample;
3667 struct perf_read_event read_event = { 3830 struct perf_read_event read_event = {
3668 .header = { 3831 .header = {
3669 .type = PERF_RECORD_READ, 3832 .type = PERF_RECORD_READ,
3670 .misc = 0, 3833 .misc = 0,
3671 .size = sizeof(read_event) + perf_event_read_size(event), 3834 .size = sizeof(read_event) + event->read_size,
3672 }, 3835 },
3673 .pid = perf_event_pid(event, task), 3836 .pid = perf_event_pid(event, task),
3674 .tid = perf_event_tid(event, task), 3837 .tid = perf_event_tid(event, task),
3675 }; 3838 };
3676 int ret; 3839 int ret;
3677 3840
3841 perf_event_header__init_id(&read_event.header, &sample, event);
3678 ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0); 3842 ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0);
3679 if (ret) 3843 if (ret)
3680 return; 3844 return;
3681 3845
3682 perf_output_put(&handle, read_event); 3846 perf_output_put(&handle, read_event);
3683 perf_output_read(&handle, event); 3847 perf_output_read(&handle, event);
3848 perf_event__output_id_sample(event, &handle, &sample);
3684 3849
3685 perf_output_end(&handle); 3850 perf_output_end(&handle);
3686} 3851}
@@ -3710,14 +3875,16 @@ static void perf_event_task_output(struct perf_event *event,
3710 struct perf_task_event *task_event) 3875 struct perf_task_event *task_event)
3711{ 3876{
3712 struct perf_output_handle handle; 3877 struct perf_output_handle handle;
3878 struct perf_sample_data sample;
3713 struct task_struct *task = task_event->task; 3879 struct task_struct *task = task_event->task;
3714 int size, ret; 3880 int ret, size = task_event->event_id.header.size;
3715 3881
3716 size = task_event->event_id.header.size; 3882 perf_event_header__init_id(&task_event->event_id.header, &sample, event);
3717 ret = perf_output_begin(&handle, event, size, 0, 0);
3718 3883
3884 ret = perf_output_begin(&handle, event,
3885 task_event->event_id.header.size, 0, 0);
3719 if (ret) 3886 if (ret)
3720 return; 3887 goto out;
3721 3888
3722 task_event->event_id.pid = perf_event_pid(event, task); 3889 task_event->event_id.pid = perf_event_pid(event, task);
3723 task_event->event_id.ppid = perf_event_pid(event, current); 3890 task_event->event_id.ppid = perf_event_pid(event, current);
@@ -3727,7 +3894,11 @@ static void perf_event_task_output(struct perf_event *event,
3727 3894
3728 perf_output_put(&handle, task_event->event_id); 3895 perf_output_put(&handle, task_event->event_id);
3729 3896
3897 perf_event__output_id_sample(event, &handle, &sample);
3898
3730 perf_output_end(&handle); 3899 perf_output_end(&handle);
3900out:
3901 task_event->event_id.header.size = size;
3731} 3902}
3732 3903
3733static int perf_event_task_match(struct perf_event *event) 3904static int perf_event_task_match(struct perf_event *event)
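
Several of the following hunks repeat one shape for side-band records (READ, TASK, COMM, MMAP, THROTTLE): grow the header for an optional sample-id trailer, emit the record, append the id sample, then restore the cached header size. A hedged reconstruction of that shape as it might sit inside kernel/perf_event.c; the wrapper name and the rec_len parameter are invented, and hdr is assumed to be the header embedded at the start of rec:

static void output_sideband_record(struct perf_event *event,
				   struct perf_event_header *hdr,
				   const void *rec, unsigned int rec_len)
{
	struct perf_output_handle handle;
	struct perf_sample_data sample;
	u16 saved_size = hdr->size;

	/* May grow hdr->size to make room for the sample-id trailer. */
	perf_event_header__init_id(hdr, &sample, event);

	if (perf_output_begin(&handle, event, hdr->size, 0, 0))
		goto out;

	perf_output_copy(&handle, rec, rec_len);	/* fixed payload */
	perf_event__output_id_sample(event, &handle, &sample);
	perf_output_end(&handle);
out:
	hdr->size = saved_size;	/* undo the growth for the next consumer */
}
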
@@ -3735,7 +3906,7 @@ static int perf_event_task_match(struct perf_event *event)
3735 if (event->state < PERF_EVENT_STATE_INACTIVE) 3906 if (event->state < PERF_EVENT_STATE_INACTIVE)
3736 return 0; 3907 return 0;
3737 3908
3738 if (event->cpu != -1 && event->cpu != smp_processor_id()) 3909 if (!event_filter_match(event))
3739 return 0; 3910 return 0;
3740 3911
3741 if (event->attr.comm || event->attr.mmap || 3912 if (event->attr.comm || event->attr.mmap ||
@@ -3766,6 +3937,8 @@ static void perf_event_task_event(struct perf_task_event *task_event)
3766 rcu_read_lock(); 3937 rcu_read_lock();
3767 list_for_each_entry_rcu(pmu, &pmus, entry) { 3938 list_for_each_entry_rcu(pmu, &pmus, entry) {
3768 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); 3939 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
3940 if (cpuctx->active_pmu != pmu)
3941 goto next;
3769 perf_event_task_ctx(&cpuctx->ctx, task_event); 3942 perf_event_task_ctx(&cpuctx->ctx, task_event);
3770 3943
3771 ctx = task_event->task_ctx; 3944 ctx = task_event->task_ctx;
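
The added active_pmu test above (and the matching ones in the comm and mmap paths below) guards against double delivery: when several software PMUs share one per-CPU context, only the context's designated active_pmu should walk it. A toy sketch of that dedup, with hypothetical types:

struct toy_cpu_ctx;

struct toy_pmu {
	struct toy_cpu_ctx *cpuctx;	/* may be shared with other pmus */
};

struct toy_cpu_ctx {
	struct toy_pmu *active_pmu;	/* the one owner allowed to act */
};

static void visit_unique_contexts(struct toy_pmu *pmu, int nr,
				  void (*fn)(struct toy_cpu_ctx *))
{
	int i;

	for (i = 0; i < nr; i++) {
		struct toy_cpu_ctx *ctx = pmu[i].cpuctx;

		if (ctx->active_pmu != &pmu[i])
			continue;	/* shared context, handled elsewhere */
		fn(ctx);
	}
}
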
@@ -3840,11 +4013,16 @@ static void perf_event_comm_output(struct perf_event *event,
3840 struct perf_comm_event *comm_event) 4013 struct perf_comm_event *comm_event)
3841{ 4014{
3842 struct perf_output_handle handle; 4015 struct perf_output_handle handle;
4016 struct perf_sample_data sample;
3843 int size = comm_event->event_id.header.size; 4017 int size = comm_event->event_id.header.size;
3844 int ret = perf_output_begin(&handle, event, size, 0, 0); 4018 int ret;
4019
4020 perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
4021 ret = perf_output_begin(&handle, event,
4022 comm_event->event_id.header.size, 0, 0);
3845 4023
3846 if (ret) 4024 if (ret)
3847 return; 4025 goto out;
3848 4026
3849 comm_event->event_id.pid = perf_event_pid(event, comm_event->task); 4027 comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
3850 comm_event->event_id.tid = perf_event_tid(event, comm_event->task); 4028 comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
@@ -3852,7 +4030,12 @@ static void perf_event_comm_output(struct perf_event *event,
3852 perf_output_put(&handle, comm_event->event_id); 4030 perf_output_put(&handle, comm_event->event_id);
3853 perf_output_copy(&handle, comm_event->comm, 4031 perf_output_copy(&handle, comm_event->comm,
3854 comm_event->comm_size); 4032 comm_event->comm_size);
4033
4034 perf_event__output_id_sample(event, &handle, &sample);
4035
3855 perf_output_end(&handle); 4036 perf_output_end(&handle);
4037out:
4038 comm_event->event_id.header.size = size;
3856} 4039}
3857 4040
3858static int perf_event_comm_match(struct perf_event *event) 4041static int perf_event_comm_match(struct perf_event *event)
@@ -3860,7 +4043,7 @@ static int perf_event_comm_match(struct perf_event *event)
3860 if (event->state < PERF_EVENT_STATE_INACTIVE) 4043 if (event->state < PERF_EVENT_STATE_INACTIVE)
3861 return 0; 4044 return 0;
3862 4045
3863 if (event->cpu != -1 && event->cpu != smp_processor_id()) 4046 if (!event_filter_match(event))
3864 return 0; 4047 return 0;
3865 4048
3866 if (event->attr.comm) 4049 if (event->attr.comm)
@@ -3897,10 +4080,11 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
3897 comm_event->comm_size = size; 4080 comm_event->comm_size = size;
3898 4081
3899 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; 4082 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
3900
3901 rcu_read_lock(); 4083 rcu_read_lock();
3902 list_for_each_entry_rcu(pmu, &pmus, entry) { 4084 list_for_each_entry_rcu(pmu, &pmus, entry) {
3903 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); 4085 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
4086 if (cpuctx->active_pmu != pmu)
4087 goto next;
3904 perf_event_comm_ctx(&cpuctx->ctx, comm_event); 4088 perf_event_comm_ctx(&cpuctx->ctx, comm_event);
3905 4089
3906 ctxn = pmu->task_ctx_nr; 4090 ctxn = pmu->task_ctx_nr;
@@ -3976,11 +4160,15 @@ static void perf_event_mmap_output(struct perf_event *event,
3976 struct perf_mmap_event *mmap_event) 4160 struct perf_mmap_event *mmap_event)
3977{ 4161{
3978 struct perf_output_handle handle; 4162 struct perf_output_handle handle;
4163 struct perf_sample_data sample;
3979 int size = mmap_event->event_id.header.size; 4164 int size = mmap_event->event_id.header.size;
3980 int ret = perf_output_begin(&handle, event, size, 0, 0); 4165 int ret;
3981 4166
4167 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
4168 ret = perf_output_begin(&handle, event,
4169 mmap_event->event_id.header.size, 0, 0);
3982 if (ret) 4170 if (ret)
3983 return; 4171 goto out;
3984 4172
3985 mmap_event->event_id.pid = perf_event_pid(event, current); 4173 mmap_event->event_id.pid = perf_event_pid(event, current);
3986 mmap_event->event_id.tid = perf_event_tid(event, current); 4174 mmap_event->event_id.tid = perf_event_tid(event, current);
@@ -3988,7 +4176,12 @@ static void perf_event_mmap_output(struct perf_event *event,
3988 perf_output_put(&handle, mmap_event->event_id); 4176 perf_output_put(&handle, mmap_event->event_id);
3989 perf_output_copy(&handle, mmap_event->file_name, 4177 perf_output_copy(&handle, mmap_event->file_name,
3990 mmap_event->file_size); 4178 mmap_event->file_size);
4179
4180 perf_event__output_id_sample(event, &handle, &sample);
4181
3991 perf_output_end(&handle); 4182 perf_output_end(&handle);
4183out:
4184 mmap_event->event_id.header.size = size;
3992} 4185}
3993 4186
3994static int perf_event_mmap_match(struct perf_event *event, 4187static int perf_event_mmap_match(struct perf_event *event,
@@ -3998,7 +4191,7 @@ static int perf_event_mmap_match(struct perf_event *event,
3998 if (event->state < PERF_EVENT_STATE_INACTIVE) 4191 if (event->state < PERF_EVENT_STATE_INACTIVE)
3999 return 0; 4192 return 0;
4000 4193
4001 if (event->cpu != -1 && event->cpu != smp_processor_id()) 4194 if (!event_filter_match(event))
4002 return 0; 4195 return 0;
4003 4196
4004 if ((!executable && event->attr.mmap_data) || 4197 if ((!executable && event->attr.mmap_data) ||
@@ -4086,6 +4279,8 @@ got_name:
4086 rcu_read_lock(); 4279 rcu_read_lock();
4087 list_for_each_entry_rcu(pmu, &pmus, entry) { 4280 list_for_each_entry_rcu(pmu, &pmus, entry) {
4088 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); 4281 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
4282 if (cpuctx->active_pmu != pmu)
4283 goto next;
4089 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, 4284 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event,
4090 vma->vm_flags & VM_EXEC); 4285 vma->vm_flags & VM_EXEC);
4091 4286
@@ -4141,6 +4336,7 @@ void perf_event_mmap(struct vm_area_struct *vma)
4141static void perf_log_throttle(struct perf_event *event, int enable) 4336static void perf_log_throttle(struct perf_event *event, int enable)
4142{ 4337{
4143 struct perf_output_handle handle; 4338 struct perf_output_handle handle;
4339 struct perf_sample_data sample;
4144 int ret; 4340 int ret;
4145 4341
4146 struct { 4342 struct {
@@ -4162,11 +4358,15 @@ static void perf_log_throttle(struct perf_event *event, int enable)
4162 if (enable) 4358 if (enable)
4163 throttle_event.header.type = PERF_RECORD_UNTHROTTLE; 4359 throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
4164 4360
4165 ret = perf_output_begin(&handle, event, sizeof(throttle_event), 1, 0); 4361 perf_event_header__init_id(&throttle_event.header, &sample, event);
4362
4363 ret = perf_output_begin(&handle, event,
4364 throttle_event.header.size, 1, 0);
4166 if (ret) 4365 if (ret)
4167 return; 4366 return;
4168 4367
4169 perf_output_put(&handle, throttle_event); 4368 perf_output_put(&handle, throttle_event);
4369 perf_event__output_id_sample(event, &handle, &sample);
4170 perf_output_end(&handle); 4370 perf_output_end(&handle);
4171} 4371}
4172 4372
@@ -4182,6 +4382,13 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
4182 struct hw_perf_event *hwc = &event->hw; 4382 struct hw_perf_event *hwc = &event->hw;
4183 int ret = 0; 4383 int ret = 0;
4184 4384
4385 /*
4386 * Non-sampling counters might still use the PMI to fold short
4387 * hardware counters, ignore those.
4388 */
4389 if (unlikely(!is_sampling_event(event)))
4390 return 0;
4391
4185 if (!throttle) { 4392 if (!throttle) {
4186 hwc->interrupts++; 4393 hwc->interrupts++;
4187 } else { 4394 } else {
@@ -4327,7 +4534,7 @@ static void perf_swevent_event(struct perf_event *event, u64 nr,
4327 if (!regs) 4534 if (!regs)
4328 return; 4535 return;
4329 4536
4330 if (!hwc->sample_period) 4537 if (!is_sampling_event(event))
4331 return; 4538 return;
4332 4539
4333 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) 4540 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
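
The raw hwc->sample_period checks above are being replaced by is_sampling_event(); presumably the helper (defined in the perf headers of this release) just tests whether the event was configured with a non-zero sample period, so the call sites read as intent rather than as a field poke. Roughly:

#include <linux/perf_event.h>

/* Assumed shape of the helper; the real definition lives in the header. */
static inline bool is_sampling_event_assumed(struct perf_event *event)
{
	return event->attr.sample_period != 0;
}
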
@@ -4454,7 +4661,7 @@ int perf_swevent_get_recursion_context(void)
4454} 4661}
4455EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); 4662EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
4456 4663
4457void inline perf_swevent_put_recursion_context(int rctx) 4664inline void perf_swevent_put_recursion_context(int rctx)
4458{ 4665{
4459 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); 4666 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
4460 4667
@@ -4490,7 +4697,7 @@ static int perf_swevent_add(struct perf_event *event, int flags)
4490 struct hw_perf_event *hwc = &event->hw; 4697 struct hw_perf_event *hwc = &event->hw;
4491 struct hlist_head *head; 4698 struct hlist_head *head;
4492 4699
4493 if (hwc->sample_period) { 4700 if (is_sampling_event(event)) {
4494 hwc->last_period = hwc->sample_period; 4701 hwc->last_period = hwc->sample_period;
4495 perf_swevent_set_period(event); 4702 perf_swevent_set_period(event);
4496 } 4703 }
@@ -4655,7 +4862,7 @@ static int perf_swevent_init(struct perf_event *event)
4655 break; 4862 break;
4656 } 4863 }
4657 4864
4658 if (event_id > PERF_COUNT_SW_MAX) 4865 if (event_id >= PERF_COUNT_SW_MAX)
4659 return -ENOENT; 4866 return -ENOENT;
4660 4867
4661 if (!event->parent) { 4868 if (!event->parent) {
@@ -4747,15 +4954,6 @@ static int perf_tp_event_init(struct perf_event *event)
4747 if (event->attr.type != PERF_TYPE_TRACEPOINT) 4954 if (event->attr.type != PERF_TYPE_TRACEPOINT)
4748 return -ENOENT; 4955 return -ENOENT;
4749 4956
4750 /*
4751 * Raw tracepoint data is a severe data leak, only allow root to
4752 * have these.
4753 */
4754 if ((event->attr.sample_type & PERF_SAMPLE_RAW) &&
4755 perf_paranoid_tracepoint_raw() &&
4756 !capable(CAP_SYS_ADMIN))
4757 return -EPERM;
4758
4759 err = perf_trace_init(event); 4957 err = perf_trace_init(event);
4760 if (err) 4958 if (err)
4761 return err; 4959 return err;
@@ -4778,7 +4976,7 @@ static struct pmu perf_tracepoint = {
4778 4976
4779static inline void perf_tp_register(void) 4977static inline void perf_tp_register(void)
4780{ 4978{
4781 perf_pmu_register(&perf_tracepoint); 4979 perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
4782} 4980}
4783 4981
4784static int perf_event_set_filter(struct perf_event *event, void __user *arg) 4982static int perf_event_set_filter(struct perf_event *event, void __user *arg)
@@ -4868,31 +5066,33 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
4868static void perf_swevent_start_hrtimer(struct perf_event *event) 5066static void perf_swevent_start_hrtimer(struct perf_event *event)
4869{ 5067{
4870 struct hw_perf_event *hwc = &event->hw; 5068 struct hw_perf_event *hwc = &event->hw;
5069 s64 period;
5070
5071 if (!is_sampling_event(event))
5072 return;
4871 5073
4872 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 5074 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4873 hwc->hrtimer.function = perf_swevent_hrtimer; 5075 hwc->hrtimer.function = perf_swevent_hrtimer;
4874 if (hwc->sample_period) {
4875 s64 period = local64_read(&hwc->period_left);
4876 5076
4877 if (period) { 5077 period = local64_read(&hwc->period_left);
4878 if (period < 0) 5078 if (period) {
4879 period = 10000; 5079 if (period < 0)
5080 period = 10000;
4880 5081
4881 local64_set(&hwc->period_left, 0); 5082 local64_set(&hwc->period_left, 0);
4882 } else { 5083 } else {
4883 period = max_t(u64, 10000, hwc->sample_period); 5084 period = max_t(u64, 10000, hwc->sample_period);
4884 } 5085 }
4885 __hrtimer_start_range_ns(&hwc->hrtimer, 5086 __hrtimer_start_range_ns(&hwc->hrtimer,
4886 ns_to_ktime(period), 0, 5087 ns_to_ktime(period), 0,
4887 HRTIMER_MODE_REL_PINNED, 0); 5088 HRTIMER_MODE_REL_PINNED, 0);
4888 }
4889} 5089}
4890 5090
4891static void perf_swevent_cancel_hrtimer(struct perf_event *event) 5091static void perf_swevent_cancel_hrtimer(struct perf_event *event)
4892{ 5092{
4893 struct hw_perf_event *hwc = &event->hw; 5093 struct hw_perf_event *hwc = &event->hw;
4894 5094
4895 if (hwc->sample_period) { 5095 if (is_sampling_event(event)) {
4896 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); 5096 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
4897 local64_set(&hwc->period_left, ktime_to_ns(remaining)); 5097 local64_set(&hwc->period_left, ktime_to_ns(remaining));
4898 5098
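
The restructured perf_swevent_start_hrtimer() above bails out early for non-sampling events and then picks a timer period: resume any leftover period (clamping bogus negative values), otherwise fall back to the configured sample period, never below 10 microseconds so the timer cannot effectively spin. A side-effect-free sketch of just the selection (the real code also clears period_left); the helper name is made up:

#include <linux/kernel.h>	/* max_t */
#include <linux/types.h>

static s64 pick_hrtimer_period(s64 period_left, u64 sample_period)
{
	if (period_left) {
		if (period_left < 0)		/* stale/bogus leftover */
			period_left = 10000;	/* 10 us floor */
		return period_left;
	}
	return max_t(u64, 10000, sample_period);
}
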
@@ -5087,25 +5287,94 @@ static void *find_pmu_context(int ctxn)
5087 return NULL; 5287 return NULL;
5088} 5288}
5089 5289
5090static void free_pmu_context(void * __percpu cpu_context) 5290static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
5091{ 5291{
5092 struct pmu *pmu; 5292 int cpu;
5293
5294 for_each_possible_cpu(cpu) {
5295 struct perf_cpu_context *cpuctx;
5296
5297 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
5298
5299 if (cpuctx->active_pmu == old_pmu)
5300 cpuctx->active_pmu = pmu;
5301 }
5302}
5303
5304static void free_pmu_context(struct pmu *pmu)
5305{
5306 struct pmu *i;
5093 5307
5094 mutex_lock(&pmus_lock); 5308 mutex_lock(&pmus_lock);
5095 /* 5309 /*
5096 * Like a real lame refcount. 5310 * Like a real lame refcount.
5097 */ 5311 */
5098 list_for_each_entry(pmu, &pmus, entry) { 5312 list_for_each_entry(i, &pmus, entry) {
5099 if (pmu->pmu_cpu_context == cpu_context) 5313 if (i->pmu_cpu_context == pmu->pmu_cpu_context) {
5314 update_pmu_context(i, pmu);
5100 goto out; 5315 goto out;
5316 }
5101 } 5317 }
5102 5318
5103 free_percpu(cpu_context); 5319 free_percpu(pmu->pmu_cpu_context);
5104out: 5320out:
5105 mutex_unlock(&pmus_lock); 5321 mutex_unlock(&pmus_lock);
5106} 5322}
5323static struct idr pmu_idr;
5324
5325static ssize_t
5326type_show(struct device *dev, struct device_attribute *attr, char *page)
5327{
5328 struct pmu *pmu = dev_get_drvdata(dev);
5329
5330 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
5331}
5332
5333static struct device_attribute pmu_dev_attrs[] = {
5334 __ATTR_RO(type),
5335 __ATTR_NULL,
5336};
5337
5338static int pmu_bus_running;
5339static struct bus_type pmu_bus = {
5340 .name = "event_source",
5341 .dev_attrs = pmu_dev_attrs,
5342};
5343
5344static void pmu_dev_release(struct device *dev)
5345{
5346 kfree(dev);
5347}
5348
5349static int pmu_dev_alloc(struct pmu *pmu)
5350{
5351 int ret = -ENOMEM;
5352
5353 pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
5354 if (!pmu->dev)
5355 goto out;
5356
5357 device_initialize(pmu->dev);
5358 ret = dev_set_name(pmu->dev, "%s", pmu->name);
5359 if (ret)
5360 goto free_dev;
5361
5362 dev_set_drvdata(pmu->dev, pmu);
5363 pmu->dev->bus = &pmu_bus;
5364 pmu->dev->release = pmu_dev_release;
5365 ret = device_add(pmu->dev);
5366 if (ret)
5367 goto free_dev;
5368
5369out:
5370 return ret;
5371
5372free_dev:
5373 put_device(pmu->dev);
5374 goto out;
5375}
5107 5376
5108int perf_pmu_register(struct pmu *pmu) 5377int perf_pmu_register(struct pmu *pmu, char *name, int type)
5109{ 5378{
5110 int cpu, ret; 5379 int cpu, ret;
5111 5380
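
pmu_dev_alloc() above follows the usual pattern for a dynamically allocated struct device: allocate, device_initialize(), name it, attach bus/drvdata/release, device_add(), and on any failure drop the reference with put_device() so the release callback frees the allocation. A trimmed, generic sketch under those assumptions; all names here are illustrative:

#include <linux/device.h>
#include <linux/slab.h>

static void toy_dev_release(struct device *dev)
{
	kfree(dev);			/* we kzalloc'ed it below */
}

static struct device *toy_dev_create(struct bus_type *bus, const char *name,
				     void *drvdata)
{
	struct device *dev;

	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	if (!dev)
		return NULL;

	device_initialize(dev);		/* refcount = 1, nothing in sysfs yet */
	dev->bus = bus;
	dev->release = toy_dev_release;
	dev_set_drvdata(dev, drvdata);

	if (dev_set_name(dev, "%s", name) || device_add(dev)) {
		put_device(dev);	/* drops the ref; ->release frees */
		return NULL;
	}

	return dev;
}
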
@@ -5115,13 +5384,38 @@ int perf_pmu_register(struct pmu *pmu)
5115 if (!pmu->pmu_disable_count) 5384 if (!pmu->pmu_disable_count)
5116 goto unlock; 5385 goto unlock;
5117 5386
5387 pmu->type = -1;
5388 if (!name)
5389 goto skip_type;
5390 pmu->name = name;
5391
5392 if (type < 0) {
5393 int err = idr_pre_get(&pmu_idr, GFP_KERNEL);
5394 if (!err)
5395 goto free_pdc;
5396
5397 err = idr_get_new_above(&pmu_idr, pmu, PERF_TYPE_MAX, &type);
5398 if (err) {
5399 ret = err;
5400 goto free_pdc;
5401 }
5402 }
5403 pmu->type = type;
5404
5405 if (pmu_bus_running) {
5406 ret = pmu_dev_alloc(pmu);
5407 if (ret)
5408 goto free_idr;
5409 }
5410
5411skip_type:
5118 pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr); 5412 pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
5119 if (pmu->pmu_cpu_context) 5413 if (pmu->pmu_cpu_context)
5120 goto got_cpu_context; 5414 goto got_cpu_context;
5121 5415
5122 pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context); 5416 pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
5123 if (!pmu->pmu_cpu_context) 5417 if (!pmu->pmu_cpu_context)
5124 goto free_pdc; 5418 goto free_dev;
5125 5419
5126 for_each_possible_cpu(cpu) { 5420 for_each_possible_cpu(cpu) {
5127 struct perf_cpu_context *cpuctx; 5421 struct perf_cpu_context *cpuctx;
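
The dynamic-type branch above uses the two-step IDR API of this era: idr_pre_get() preallocates under GFP_KERNEL, then idr_get_new_above() hands out an id at or above the floor (here PERF_TYPE_MAX, so dynamic PMU types never collide with the fixed ones) and stores the pointer for later idr_find(). A minimal sketch of the same pattern with an invented idr and helper:

#include <linux/idr.h>
#include <linux/gfp.h>
#include <linux/errno.h>

static DEFINE_IDR(toy_idr);

/* Returns an id >= floor on success, or a negative errno. */
static int toy_idr_alloc(void *ptr, int floor)
{
	int id, err;

	if (!idr_pre_get(&toy_idr, GFP_KERNEL))	/* preallocate nodes */
		return -ENOMEM;

	err = idr_get_new_above(&toy_idr, ptr, floor, &id);
	if (err)
		return err;

	return id;	/* look up later with idr_find(&toy_idr, id) */
}
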
@@ -5132,6 +5426,7 @@ int perf_pmu_register(struct pmu *pmu)
5132 cpuctx->ctx.pmu = pmu; 5426 cpuctx->ctx.pmu = pmu;
5133 cpuctx->jiffies_interval = 1; 5427 cpuctx->jiffies_interval = 1;
5134 INIT_LIST_HEAD(&cpuctx->rotation_list); 5428 INIT_LIST_HEAD(&cpuctx->rotation_list);
5429 cpuctx->active_pmu = pmu;
5135 } 5430 }
5136 5431
5137got_cpu_context: 5432got_cpu_context:
@@ -5164,6 +5459,14 @@ unlock:
5164 5459
5165 return ret; 5460 return ret;
5166 5461
5462free_dev:
5463 device_del(pmu->dev);
5464 put_device(pmu->dev);
5465
5466free_idr:
5467 if (pmu->type >= PERF_TYPE_MAX)
5468 idr_remove(&pmu_idr, pmu->type);
5469
5167free_pdc: 5470free_pdc:
5168 free_percpu(pmu->pmu_disable_count); 5471 free_percpu(pmu->pmu_disable_count);
5169 goto unlock; 5472 goto unlock;
@@ -5183,7 +5486,11 @@ void perf_pmu_unregister(struct pmu *pmu)
5183 synchronize_rcu(); 5486 synchronize_rcu();
5184 5487
5185 free_percpu(pmu->pmu_disable_count); 5488 free_percpu(pmu->pmu_disable_count);
5186 free_pmu_context(pmu->pmu_cpu_context); 5489 if (pmu->type >= PERF_TYPE_MAX)
5490 idr_remove(&pmu_idr, pmu->type);
5491 device_del(pmu->dev);
5492 put_device(pmu->dev);
5493 free_pmu_context(pmu);
5187} 5494}
5188 5495
5189struct pmu *perf_init_event(struct perf_event *event) 5496struct pmu *perf_init_event(struct perf_event *event)
@@ -5192,6 +5499,13 @@ struct pmu *perf_init_event(struct perf_event *event)
5192 int idx; 5499 int idx;
5193 5500
5194 idx = srcu_read_lock(&pmus_srcu); 5501 idx = srcu_read_lock(&pmus_srcu);
5502
5503 rcu_read_lock();
5504 pmu = idr_find(&pmu_idr, event->attr.type);
5505 rcu_read_unlock();
5506 if (pmu)
5507 goto unlock;
5508
5195 list_for_each_entry_rcu(pmu, &pmus, entry) { 5509 list_for_each_entry_rcu(pmu, &pmus, entry) {
5196 int ret = pmu->event_init(event); 5510 int ret = pmu->event_init(event);
5197 if (!ret) 5511 if (!ret)
@@ -5224,6 +5538,11 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
5224 struct hw_perf_event *hwc; 5538 struct hw_perf_event *hwc;
5225 long err; 5539 long err;
5226 5540
5541 if ((unsigned)cpu >= nr_cpu_ids) {
5542 if (!task || cpu != -1)
5543 return ERR_PTR(-EINVAL);
5544 }
5545
5227 event = kzalloc(sizeof(*event), GFP_KERNEL); 5546 event = kzalloc(sizeof(*event), GFP_KERNEL);
5228 if (!event) 5547 if (!event)
5229 return ERR_PTR(-ENOMEM); 5548 return ERR_PTR(-ENOMEM);
@@ -5272,7 +5591,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
5272 5591
5273 if (!overflow_handler && parent_event) 5592 if (!overflow_handler && parent_event)
5274 overflow_handler = parent_event->overflow_handler; 5593 overflow_handler = parent_event->overflow_handler;
5275 5594
5276 event->overflow_handler = overflow_handler; 5595 event->overflow_handler = overflow_handler;
5277 5596
5278 if (attr->disabled) 5597 if (attr->disabled)
@@ -5651,12 +5970,18 @@ SYSCALL_DEFINE5(perf_event_open,
5651 mutex_unlock(&ctx->mutex); 5970 mutex_unlock(&ctx->mutex);
5652 5971
5653 event->owner = current; 5972 event->owner = current;
5654 get_task_struct(current); 5973
5655 mutex_lock(&current->perf_event_mutex); 5974 mutex_lock(&current->perf_event_mutex);
5656 list_add_tail(&event->owner_entry, &current->perf_event_list); 5975 list_add_tail(&event->owner_entry, &current->perf_event_list);
5657 mutex_unlock(&current->perf_event_mutex); 5976 mutex_unlock(&current->perf_event_mutex);
5658 5977
5659 /* 5978 /*
5979 * Precalculate sample_data sizes
5980 */
5981 perf_event__header_size(event);
5982 perf_event__id_header_size(event);
5983
5984 /*
5660 * Drop the reference on the group_event after placing the 5985 * Drop the reference on the group_event after placing the
5661 * new event on the sibling_list. This ensures destruction 5986 * new event on the sibling_list. This ensures destruction
5662 * of the group leader will find the pointer to itself in 5987 * of the group leader will find the pointer to itself in
@@ -5719,12 +6044,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
5719 ++ctx->generation; 6044 ++ctx->generation;
5720 mutex_unlock(&ctx->mutex); 6045 mutex_unlock(&ctx->mutex);
5721 6046
5722 event->owner = current;
5723 get_task_struct(current);
5724 mutex_lock(&current->perf_event_mutex);
5725 list_add_tail(&event->owner_entry, &current->perf_event_list);
5726 mutex_unlock(&current->perf_event_mutex);
5727
5728 return event; 6047 return event;
5729 6048
5730err_free: 6049err_free:
@@ -5875,8 +6194,24 @@ again:
5875 */ 6194 */
5876void perf_event_exit_task(struct task_struct *child) 6195void perf_event_exit_task(struct task_struct *child)
5877{ 6196{
6197 struct perf_event *event, *tmp;
5878 int ctxn; 6198 int ctxn;
5879 6199
6200 mutex_lock(&child->perf_event_mutex);
6201 list_for_each_entry_safe(event, tmp, &child->perf_event_list,
6202 owner_entry) {
6203 list_del_init(&event->owner_entry);
6204
6205 /*
6206 * Ensure the list deletion is visible before we clear
 6207 * the owner; this closes a race against perf_release(), where
6208 * we need to serialize on the owner->perf_event_mutex.
6209 */
6210 smp_wmb();
6211 event->owner = NULL;
6212 }
6213 mutex_unlock(&child->perf_event_mutex);
6214
5880 for_each_task_context_nr(ctxn) 6215 for_each_task_context_nr(ctxn)
5881 perf_event_exit_task_context(child, ctxn); 6216 perf_event_exit_task_context(child, ctxn);
5882} 6217}
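
The new loop in perf_event_exit_task() above detaches every event from the exiting owner and relies on a write barrier so that anyone who still observes event->owner also observes the list removal. A reduced sketch of that publication order, with hypothetical types (smp_wmb() comes from the arch headers pulled in via sched.h):

#include <linux/list.h>
#include <linux/sched.h>

struct toy_event {
	struct list_head owner_entry;
	struct task_struct *owner;
};

static void toy_disown(struct toy_event *event)
{
	list_del_init(&event->owner_entry);

	/*
	 * Make the list removal visible before clearing ->owner, so a
	 * racing reader that still sees a non-NULL owner will also see
	 * the entry already unlinked (mirrors the comment above).
	 */
	smp_wmb();
	event->owner = NULL;
}
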
@@ -5999,6 +6334,12 @@ inherit_event(struct perf_event *parent_event,
5999 child_event->overflow_handler = parent_event->overflow_handler; 6334 child_event->overflow_handler = parent_event->overflow_handler;
6000 6335
6001 /* 6336 /*
6337 * Precalculate sample_data sizes
6338 */
6339 perf_event__header_size(child_event);
6340 perf_event__id_header_size(child_event);
6341
6342 /*
6002 * Link it up in the child's context: 6343 * Link it up in the child's context:
6003 */ 6344 */
6004 raw_spin_lock_irqsave(&child_ctx->lock, flags); 6345 raw_spin_lock_irqsave(&child_ctx->lock, flags);
@@ -6096,6 +6437,7 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
6096 struct perf_event *event; 6437 struct perf_event *event;
6097 struct task_struct *parent = current; 6438 struct task_struct *parent = current;
6098 int inherited_all = 1; 6439 int inherited_all = 1;
6440 unsigned long flags;
6099 int ret = 0; 6441 int ret = 0;
6100 6442
6101 child->perf_event_ctxp[ctxn] = NULL; 6443 child->perf_event_ctxp[ctxn] = NULL;
@@ -6136,6 +6478,15 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
6136 break; 6478 break;
6137 } 6479 }
6138 6480
6481 /*
6482 * We can't hold ctx->lock when iterating the ->flexible_group list due
6483 * to allocations, but we need to prevent rotation because
6484 * rotate_ctx() will change the list from interrupt context.
6485 */
6486 raw_spin_lock_irqsave(&parent_ctx->lock, flags);
6487 parent_ctx->rotate_disable = 1;
6488 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
6489
6139 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { 6490 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
6140 ret = inherit_task_group(event, parent, parent_ctx, 6491 ret = inherit_task_group(event, parent, parent_ctx,
6141 child, ctxn, &inherited_all); 6492 child, ctxn, &inherited_all);
@@ -6143,18 +6494,20 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
6143 break; 6494 break;
6144 } 6495 }
6145 6496
6497 raw_spin_lock_irqsave(&parent_ctx->lock, flags);
6498 parent_ctx->rotate_disable = 0;
6499
6146 child_ctx = child->perf_event_ctxp[ctxn]; 6500 child_ctx = child->perf_event_ctxp[ctxn];
6147 6501
6148 if (child_ctx && inherited_all) { 6502 if (child_ctx && inherited_all) {
6149 /* 6503 /*
6150 * Mark the child context as a clone of the parent 6504 * Mark the child context as a clone of the parent
6151 * context, or of whatever the parent is a clone of. 6505 * context, or of whatever the parent is a clone of.
6152 * Note that if the parent is a clone, it could get 6506 *
6153 * uncloned at any point, but that doesn't matter 6507 * Note that if the parent is a clone, the holding of
 6154 * because the list of events and the generation 6508 * parent_ctx->lock prevents it from being uncloned.
6155 * count can't have changed since we took the mutex.
6156 */ 6509 */
6157 cloned_ctx = rcu_dereference(parent_ctx->parent_ctx); 6510 cloned_ctx = parent_ctx->parent_ctx;
6158 if (cloned_ctx) { 6511 if (cloned_ctx) {
6159 child_ctx->parent_ctx = cloned_ctx; 6512 child_ctx->parent_ctx = cloned_ctx;
6160 child_ctx->parent_gen = parent_ctx->parent_gen; 6513 child_ctx->parent_gen = parent_ctx->parent_gen;
@@ -6165,6 +6518,7 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
6165 get_ctx(child_ctx->parent_ctx); 6518 get_ctx(child_ctx->parent_ctx);
6166 } 6519 }
6167 6520
6521 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
6168 mutex_unlock(&parent_ctx->mutex); 6522 mutex_unlock(&parent_ctx->mutex);
6169 6523
6170 perf_unpin_context(parent_ctx); 6524 perf_unpin_context(parent_ctx);
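
The rotate_disable handling added across the hunks above follows one pattern: take ctx->lock just long enough to set a flag that stops the timer-interrupt rotation path, walk the list without the lock (the copy may sleep and allocate), then clear the flag under the lock again. A minimal sketch of that pattern with hypothetical types:

#include <linux/spinlock.h>
#include <linux/list.h>

struct ex_ctx {
	raw_spinlock_t lock;
	int rotate_disable;
	struct list_head flexible_groups;
};

static void copy_flexible_groups(struct ex_ctx *ctx,
				 void (*inherit_one)(struct list_head *))
{
	struct list_head *pos;
	unsigned long flags;

	raw_spin_lock_irqsave(&ctx->lock, flags);
	ctx->rotate_disable = 1;	/* rotation must leave the list alone */
	raw_spin_unlock_irqrestore(&ctx->lock, flags);

	list_for_each(pos, &ctx->flexible_groups)
		inherit_one(pos);	/* may allocate; must not hold ctx->lock */

	raw_spin_lock_irqsave(&ctx->lock, flags);
	ctx->rotate_disable = 0;
	raw_spin_unlock_irqrestore(&ctx->lock, flags);
}
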
@@ -6215,7 +6569,7 @@ static void __cpuinit perf_event_init_cpu(int cpu)
6215 mutex_unlock(&swhash->hlist_mutex); 6569 mutex_unlock(&swhash->hlist_mutex);
6216} 6570}
6217 6571
6218#ifdef CONFIG_HOTPLUG_CPU 6572#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC
6219static void perf_pmu_rotate_stop(struct pmu *pmu) 6573static void perf_pmu_rotate_stop(struct pmu *pmu)
6220{ 6574{
6221 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); 6575 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
@@ -6269,6 +6623,26 @@ static void perf_event_exit_cpu(int cpu)
6269static inline void perf_event_exit_cpu(int cpu) { } 6623static inline void perf_event_exit_cpu(int cpu) { }
6270#endif 6624#endif
6271 6625
6626static int
6627perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
6628{
6629 int cpu;
6630
6631 for_each_online_cpu(cpu)
6632 perf_event_exit_cpu(cpu);
6633
6634 return NOTIFY_OK;
6635}
6636
6637/*
6638 * Run the perf reboot notifier at the very last possible moment so that
6639 * the generic watchdog code runs as long as possible.
6640 */
6641static struct notifier_block perf_reboot_notifier = {
6642 .notifier_call = perf_reboot,
6643 .priority = INT_MIN,
6644};
6645
6272static int __cpuinit 6646static int __cpuinit
6273perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) 6647perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
6274{ 6648{
@@ -6295,11 +6669,47 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
6295 6669
6296void __init perf_event_init(void) 6670void __init perf_event_init(void)
6297{ 6671{
6672 int ret;
6673
6674 idr_init(&pmu_idr);
6675
6298 perf_event_init_all_cpus(); 6676 perf_event_init_all_cpus();
6299 init_srcu_struct(&pmus_srcu); 6677 init_srcu_struct(&pmus_srcu);
6300 perf_pmu_register(&perf_swevent); 6678 perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
6301 perf_pmu_register(&perf_cpu_clock); 6679 perf_pmu_register(&perf_cpu_clock, NULL, -1);
6302 perf_pmu_register(&perf_task_clock); 6680 perf_pmu_register(&perf_task_clock, NULL, -1);
6303 perf_tp_register(); 6681 perf_tp_register();
6304 perf_cpu_notifier(perf_cpu_notify); 6682 perf_cpu_notifier(perf_cpu_notify);
6683 register_reboot_notifier(&perf_reboot_notifier);
6684
6685 ret = init_hw_breakpoint();
6686 WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
6687}
6688
6689static int __init perf_event_sysfs_init(void)
6690{
6691 struct pmu *pmu;
6692 int ret;
6693
6694 mutex_lock(&pmus_lock);
6695
6696 ret = bus_register(&pmu_bus);
6697 if (ret)
6698 goto unlock;
6699
6700 list_for_each_entry(pmu, &pmus, entry) {
6701 if (!pmu->name || pmu->type < 0)
6702 continue;
6703
6704 ret = pmu_dev_alloc(pmu);
6705 WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
6706 }
6707 pmu_bus_running = 1;
6708 ret = 0;
6709
6710unlock:
6711 mutex_unlock(&pmus_lock);
6712
6713 return ret;
6305} 6714}
6715device_initcall(perf_event_sysfs_init);
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index c7a8f453919e..aeaa7f846821 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -121,10 +121,10 @@ static inline int pm_qos_get_value(struct pm_qos_object *o)
121 121
122 switch (o->type) { 122 switch (o->type) {
123 case PM_QOS_MIN: 123 case PM_QOS_MIN:
124 return plist_last(&o->requests)->prio; 124 return plist_first(&o->requests)->prio;
125 125
126 case PM_QOS_MAX: 126 case PM_QOS_MAX:
127 return plist_first(&o->requests)->prio; 127 return plist_last(&o->requests)->prio;
128 128
129 default: 129 default:
130 /* runtime check for not using enum */ 130 /* runtime check for not using enum */
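
The swap above restores the intended plist semantics: a plist keeps its nodes sorted by ascending ->prio, so the smallest value sits at plist_first() and the largest at plist_last(); PM_QOS_MIN therefore wants the former and PM_QOS_MAX the latter. A small sketch under that assumption, with an invented helper name:

#include <linux/plist.h>
#include <linux/types.h>

/* Assumes the list is non-empty, as the caller above already checks. */
static int toy_qos_value(struct plist_head *requests, bool want_min)
{
	return want_min ? plist_first(requests)->prio	/* smallest prio */
			: plist_last(requests)->prio;	/* largest prio */
}
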
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 6842eeba5879..05bb7173850e 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -37,13 +37,13 @@ static int check_clock(const clockid_t which_clock)
37 if (pid == 0) 37 if (pid == 0)
38 return 0; 38 return 0;
39 39
40 read_lock(&tasklist_lock); 40 rcu_read_lock();
41 p = find_task_by_vpid(pid); 41 p = find_task_by_vpid(pid);
42 if (!p || !(CPUCLOCK_PERTHREAD(which_clock) ? 42 if (!p || !(CPUCLOCK_PERTHREAD(which_clock) ?
43 same_thread_group(p, current) : thread_group_leader(p))) { 43 same_thread_group(p, current) : has_group_leader_pid(p))) {
44 error = -EINVAL; 44 error = -EINVAL;
45 } 45 }
46 read_unlock(&tasklist_lock); 46 rcu_read_unlock();
47 47
48 return error; 48 return error;
49} 49}
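
check_clock() above drops the tasklist_lock read lock in favour of RCU: find_task_by_vpid() is safe under rcu_read_lock() as long as the task is only inspected inside the read-side section and no reference is kept past rcu_read_unlock(). A minimal sketch of that usage, with an invented helper:

#include <linux/sched.h>
#include <linux/rcupdate.h>
#include <linux/errno.h>

/* Returns 0 if a task with this vpid currently exists, -EINVAL otherwise. */
static int toy_vpid_exists(pid_t pid)
{
	struct task_struct *p;
	int ret = -EINVAL;

	rcu_read_lock();
	p = find_task_by_vpid(pid);
	if (p)
		ret = 0;	/* only inspected under RCU; no get_task_struct() */
	rcu_read_unlock();

	return ret;
}
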
@@ -390,7 +390,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
390 390
391 INIT_LIST_HEAD(&new_timer->it.cpu.entry); 391 INIT_LIST_HEAD(&new_timer->it.cpu.entry);
392 392
393 read_lock(&tasklist_lock); 393 rcu_read_lock();
394 if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) { 394 if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) {
395 if (pid == 0) { 395 if (pid == 0) {
396 p = current; 396 p = current;
@@ -404,7 +404,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
404 p = current->group_leader; 404 p = current->group_leader;
405 } else { 405 } else {
406 p = find_task_by_vpid(pid); 406 p = find_task_by_vpid(pid);
407 if (p && !thread_group_leader(p)) 407 if (p && !has_group_leader_pid(p))
408 p = NULL; 408 p = NULL;
409 } 409 }
410 } 410 }
@@ -414,7 +414,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
414 } else { 414 } else {
415 ret = -EINVAL; 415 ret = -EINVAL;
416 } 416 }
417 read_unlock(&tasklist_lock); 417 rcu_read_unlock();
418 418
419 return ret; 419 return ret;
420} 420}
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 9ca4973f736d..93bd2eb2bc53 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -145,7 +145,13 @@ static int common_timer_del(struct k_itimer *timer);
145 145
146static enum hrtimer_restart posix_timer_fn(struct hrtimer *data); 146static enum hrtimer_restart posix_timer_fn(struct hrtimer *data);
147 147
148static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags); 148static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags);
149
150#define lock_timer(tid, flags) \
151({ struct k_itimer *__timr; \
152 __cond_lock(&__timr->it_lock, __timr = __lock_timer(tid, flags)); \
153 __timr; \
154})
149 155
150static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) 156static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
151{ 157{
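
The lock_timer() wrapper above exists for sparse: __cond_lock(x, c) evaluates to plain c in normal builds, but under __CHECKER__ it tells sparse that lock x is acquired exactly when c is non-zero, so a "look up and lock" helper can be checked even though the lock is taken conditionally. A stripped-down sketch of the same trick on a hypothetical object:

#include <linux/compiler.h>
#include <linux/spinlock.h>

struct toy_obj {
	spinlock_t lock;
	/* ... */
};

/* The annotated worker; returns the object with its lock held, or NULL. */
struct toy_obj *__toy_lookup_locked(int id, unsigned long *flags);

/*
 * Callers use the macro; __cond_lock() ties "lock held" to a non-NULL
 * return value, matching how __lock_timer()/lock_timer() pair up above.
 */
#define toy_lookup_locked(id, flags)					\
({									\
	struct toy_obj *__o;						\
	__cond_lock(&__o->lock, __o = __toy_lookup_locked(id, flags));	\
	__o;								\
})
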
@@ -619,7 +625,7 @@ out:
619 * the find to the timer lock. To avoid a deadlock, the timer id MUST 625 * the find to the timer lock. To avoid a deadlock, the timer id MUST
620 * be released without holding the timer lock. 626 * be released without holding the timer lock.
621 */ 627 */
622static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags) 628static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
623{ 629{
624 struct k_itimer *timr; 630 struct k_itimer *timr;
625 /* 631 /*
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 29bff6117abc..265729966ece 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -100,13 +100,9 @@ config PM_SLEEP_ADVANCED_DEBUG
100 depends on PM_ADVANCED_DEBUG 100 depends on PM_ADVANCED_DEBUG
101 default n 101 default n
102 102
103config SUSPEND_NVS
104 bool
105
106config SUSPEND 103config SUSPEND
107 bool "Suspend to RAM and standby" 104 bool "Suspend to RAM and standby"
108 depends on PM && ARCH_SUSPEND_POSSIBLE 105 depends on PM && ARCH_SUSPEND_POSSIBLE
109 select SUSPEND_NVS if HAS_IOMEM
110 default y 106 default y
111 ---help--- 107 ---help---
112 Allow the system to enter sleep states in which main memory is 108 Allow the system to enter sleep states in which main memory is
@@ -140,7 +136,6 @@ config HIBERNATION
140 depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE 136 depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE
141 select LZO_COMPRESS 137 select LZO_COMPRESS
142 select LZO_DECOMPRESS 138 select LZO_DECOMPRESS
143 select SUSPEND_NVS if HAS_IOMEM
144 ---help--- 139 ---help---
145 Enable the suspend to disk (STD) functionality, which is usually 140 Enable the suspend to disk (STD) functionality, which is usually
146 called "hibernation" in user interfaces. STD checkpoints the 141 called "hibernation" in user interfaces. STD checkpoints the
@@ -246,9 +241,13 @@ config PM_OPS
246 depends on PM_SLEEP || PM_RUNTIME 241 depends on PM_SLEEP || PM_RUNTIME
247 default y 242 default y
248 243
244config ARCH_HAS_OPP
245 bool
246
249config PM_OPP 247config PM_OPP
250 bool "Operating Performance Point (OPP) Layer library" 248 bool "Operating Performance Point (OPP) Layer library"
251 depends on PM 249 depends on PM
250 depends on ARCH_HAS_OPP
252 ---help--- 251 ---help---
253 SOCs have a standard set of tuples consisting of frequency and 252 SOCs have a standard set of tuples consisting of frequency and
254 voltage pairs that the device will support per voltage domain. This 253 voltage pairs that the device will support per voltage domain. This
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index f9063c6b185d..c350e18b53e3 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -1,7 +1,4 @@
1 1ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG
2ifeq ($(CONFIG_PM_DEBUG),y)
3EXTRA_CFLAGS += -DDEBUG
4endif
5 2
6obj-$(CONFIG_PM) += main.o 3obj-$(CONFIG_PM) += main.o
7obj-$(CONFIG_PM_SLEEP) += console.o 4obj-$(CONFIG_PM_SLEEP) += console.o
@@ -10,6 +7,5 @@ obj-$(CONFIG_SUSPEND) += suspend.o
10obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o 7obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o
11obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ 8obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \
12 block_io.o 9 block_io.o
13obj-$(CONFIG_SUSPEND_NVS) += nvs.o
14 10
15obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o 11obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 657272e91d0a..1832bd264219 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -51,18 +51,18 @@ enum {
51 51
52static int hibernation_mode = HIBERNATION_SHUTDOWN; 52static int hibernation_mode = HIBERNATION_SHUTDOWN;
53 53
54static struct platform_hibernation_ops *hibernation_ops; 54static const struct platform_hibernation_ops *hibernation_ops;
55 55
56/** 56/**
57 * hibernation_set_ops - set the global hibernate operations 57 * hibernation_set_ops - set the global hibernate operations
58 * @ops: the hibernation operations to use in subsequent hibernation transitions 58 * @ops: the hibernation operations to use in subsequent hibernation transitions
59 */ 59 */
60 60
61void hibernation_set_ops(struct platform_hibernation_ops *ops) 61void hibernation_set_ops(const struct platform_hibernation_ops *ops)
62{ 62{
63 if (ops && !(ops->begin && ops->end && ops->pre_snapshot 63 if (ops && !(ops->begin && ops->end && ops->pre_snapshot
64 && ops->prepare && ops->finish && ops->enter && ops->pre_restore 64 && ops->prepare && ops->finish && ops->enter && ops->pre_restore
65 && ops->restore_cleanup)) { 65 && ops->restore_cleanup && ops->leave)) {
66 WARN_ON(1); 66 WARN_ON(1);
67 return; 67 return;
68 } 68 }
@@ -278,7 +278,7 @@ static int create_image(int platform_mode)
278 goto Enable_irqs; 278 goto Enable_irqs;
279 } 279 }
280 280
281 if (hibernation_test(TEST_CORE) || !pm_check_wakeup_events()) 281 if (hibernation_test(TEST_CORE) || pm_wakeup_pending())
282 goto Power_up; 282 goto Power_up;
283 283
284 in_suspend = 1; 284 in_suspend = 1;
@@ -327,7 +327,6 @@ static int create_image(int platform_mode)
327int hibernation_snapshot(int platform_mode) 327int hibernation_snapshot(int platform_mode)
328{ 328{
329 int error; 329 int error;
330 gfp_t saved_mask;
331 330
332 error = platform_begin(platform_mode); 331 error = platform_begin(platform_mode);
333 if (error) 332 if (error)
@@ -339,7 +338,7 @@ int hibernation_snapshot(int platform_mode)
339 goto Close; 338 goto Close;
340 339
341 suspend_console(); 340 suspend_console();
342 saved_mask = clear_gfp_allowed_mask(GFP_IOFS); 341 pm_restrict_gfp_mask();
343 error = dpm_suspend_start(PMSG_FREEZE); 342 error = dpm_suspend_start(PMSG_FREEZE);
344 if (error) 343 if (error)
345 goto Recover_platform; 344 goto Recover_platform;
@@ -348,7 +347,10 @@ int hibernation_snapshot(int platform_mode)
348 goto Recover_platform; 347 goto Recover_platform;
349 348
350 error = create_image(platform_mode); 349 error = create_image(platform_mode);
351 /* Control returns here after successful restore */ 350 /*
351 * Control returns here (1) after the image has been created or the
352 * image creation has failed and (2) after a successful restore.
353 */
352 354
353 Resume_devices: 355 Resume_devices:
354 /* We may need to release the preallocated image pages here. */ 356 /* We may need to release the preallocated image pages here. */
@@ -357,7 +359,10 @@ int hibernation_snapshot(int platform_mode)
357 359
358 dpm_resume_end(in_suspend ? 360 dpm_resume_end(in_suspend ?
359 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); 361 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
360 set_gfp_allowed_mask(saved_mask); 362
363 if (error || !in_suspend)
364 pm_restore_gfp_mask();
365
361 resume_console(); 366 resume_console();
362 Close: 367 Close:
363 platform_end(platform_mode); 368 platform_end(platform_mode);
@@ -452,17 +457,16 @@ static int resume_target_kernel(bool platform_mode)
452int hibernation_restore(int platform_mode) 457int hibernation_restore(int platform_mode)
453{ 458{
454 int error; 459 int error;
455 gfp_t saved_mask;
456 460
457 pm_prepare_console(); 461 pm_prepare_console();
458 suspend_console(); 462 suspend_console();
459 saved_mask = clear_gfp_allowed_mask(GFP_IOFS); 463 pm_restrict_gfp_mask();
460 error = dpm_suspend_start(PMSG_QUIESCE); 464 error = dpm_suspend_start(PMSG_QUIESCE);
461 if (!error) { 465 if (!error) {
462 error = resume_target_kernel(platform_mode); 466 error = resume_target_kernel(platform_mode);
463 dpm_resume_end(PMSG_RECOVER); 467 dpm_resume_end(PMSG_RECOVER);
464 } 468 }
465 set_gfp_allowed_mask(saved_mask); 469 pm_restore_gfp_mask();
466 resume_console(); 470 resume_console();
467 pm_restore_console(); 471 pm_restore_console();
468 return error; 472 return error;
@@ -476,7 +480,6 @@ int hibernation_restore(int platform_mode)
476int hibernation_platform_enter(void) 480int hibernation_platform_enter(void)
477{ 481{
478 int error; 482 int error;
479 gfp_t saved_mask;
480 483
481 if (!hibernation_ops) 484 if (!hibernation_ops)
482 return -ENOSYS; 485 return -ENOSYS;
@@ -492,7 +495,6 @@ int hibernation_platform_enter(void)
492 495
493 entering_platform_hibernation = true; 496 entering_platform_hibernation = true;
494 suspend_console(); 497 suspend_console();
495 saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
496 error = dpm_suspend_start(PMSG_HIBERNATE); 498 error = dpm_suspend_start(PMSG_HIBERNATE);
497 if (error) { 499 if (error) {
498 if (hibernation_ops->recover) 500 if (hibernation_ops->recover)
@@ -514,7 +516,7 @@ int hibernation_platform_enter(void)
514 516
515 local_irq_disable(); 517 local_irq_disable();
516 sysdev_suspend(PMSG_HIBERNATE); 518 sysdev_suspend(PMSG_HIBERNATE);
517 if (!pm_check_wakeup_events()) { 519 if (pm_wakeup_pending()) {
518 error = -EAGAIN; 520 error = -EAGAIN;
519 goto Power_up; 521 goto Power_up;
520 } 522 }
@@ -536,7 +538,6 @@ int hibernation_platform_enter(void)
536 Resume_devices: 538 Resume_devices:
537 entering_platform_hibernation = false; 539 entering_platform_hibernation = false;
538 dpm_resume_end(PMSG_RESTORE); 540 dpm_resume_end(PMSG_RESTORE);
539 set_gfp_allowed_mask(saved_mask);
540 resume_console(); 541 resume_console();
541 542
542 Close: 543 Close:
@@ -646,6 +647,8 @@ int hibernate(void)
646 swsusp_free(); 647 swsusp_free();
647 if (!error) 648 if (!error)
648 power_down(); 649 power_down();
650 in_suspend = 0;
651 pm_restore_gfp_mask();
649 } else { 652 } else {
650 pr_debug("PM: Image restored successfully.\n"); 653 pr_debug("PM: Image restored successfully.\n");
651 } 654 }
diff --git a/kernel/power/nvs.c b/kernel/power/nvs.c
deleted file mode 100644
index 1836db60bbb6..000000000000
--- a/kernel/power/nvs.c
+++ /dev/null
@@ -1,136 +0,0 @@
1/*
2 * linux/kernel/power/hibernate_nvs.c - Routines for handling NVS memory
3 *
4 * Copyright (C) 2008,2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
5 *
6 * This file is released under the GPLv2.
7 */
8
9#include <linux/io.h>
10#include <linux/kernel.h>
11#include <linux/list.h>
12#include <linux/mm.h>
13#include <linux/slab.h>
14#include <linux/suspend.h>
15
16/*
17 * Platforms, like ACPI, may want us to save some memory used by them during
18 * suspend and to restore the contents of this memory during the subsequent
19 * resume. The code below implements a mechanism allowing us to do that.
20 */
21
22struct nvs_page {
23 unsigned long phys_start;
24 unsigned int size;
25 void *kaddr;
26 void *data;
27 struct list_head node;
28};
29
30static LIST_HEAD(nvs_list);
31
32/**
33 * suspend_nvs_register - register platform NVS memory region to save
34 * @start - physical address of the region
35 * @size - size of the region
36 *
37 * The NVS region need not be page-aligned (both ends) and we arrange
38 * things so that the data from page-aligned addresses in this region will
39 * be copied into separate RAM pages.
40 */
41int suspend_nvs_register(unsigned long start, unsigned long size)
42{
43 struct nvs_page *entry, *next;
44
45 while (size > 0) {
46 unsigned int nr_bytes;
47
48 entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL);
49 if (!entry)
50 goto Error;
51
52 list_add_tail(&entry->node, &nvs_list);
53 entry->phys_start = start;
54 nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK);
55 entry->size = (size < nr_bytes) ? size : nr_bytes;
56
57 start += entry->size;
58 size -= entry->size;
59 }
60 return 0;
61
62 Error:
63 list_for_each_entry_safe(entry, next, &nvs_list, node) {
64 list_del(&entry->node);
65 kfree(entry);
66 }
67 return -ENOMEM;
68}
69
70/**
71 * suspend_nvs_free - free data pages allocated for saving NVS regions
72 */
73void suspend_nvs_free(void)
74{
75 struct nvs_page *entry;
76
77 list_for_each_entry(entry, &nvs_list, node)
78 if (entry->data) {
79 free_page((unsigned long)entry->data);
80 entry->data = NULL;
81 if (entry->kaddr) {
82 iounmap(entry->kaddr);
83 entry->kaddr = NULL;
84 }
85 }
86}
87
88/**
89 * suspend_nvs_alloc - allocate memory necessary for saving NVS regions
90 */
91int suspend_nvs_alloc(void)
92{
93 struct nvs_page *entry;
94
95 list_for_each_entry(entry, &nvs_list, node) {
96 entry->data = (void *)__get_free_page(GFP_KERNEL);
97 if (!entry->data) {
98 suspend_nvs_free();
99 return -ENOMEM;
100 }
101 }
102 return 0;
103}
104
105/**
106 * suspend_nvs_save - save NVS memory regions
107 */
108void suspend_nvs_save(void)
109{
110 struct nvs_page *entry;
111
112 printk(KERN_INFO "PM: Saving platform NVS memory\n");
113
114 list_for_each_entry(entry, &nvs_list, node)
115 if (entry->data) {
116 entry->kaddr = ioremap(entry->phys_start, entry->size);
117 memcpy(entry->data, entry->kaddr, entry->size);
118 }
119}
120
121/**
122 * suspend_nvs_restore - restore NVS memory regions
123 *
124 * This function is going to be called with interrupts disabled, so it
125 * cannot iounmap the virtual addresses used to access the NVS region.
126 */
127void suspend_nvs_restore(void)
128{
129 struct nvs_page *entry;
130
131 printk(KERN_INFO "PM: Restoring platform NVS memory\n");
132
133 list_for_each_entry(entry, &nvs_list, node)
134 if (entry->data)
135 memcpy(entry->kaddr, entry->data, entry->size);
136}
diff --git a/kernel/power/process.c b/kernel/power/process.c
index e50b4c1b2a0f..d6d2a10320e0 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -64,6 +64,12 @@ static int try_to_freeze_tasks(bool sig_only)
64 * perturb a task in TASK_STOPPED or TASK_TRACED. 64 * perturb a task in TASK_STOPPED or TASK_TRACED.
65 * It is "frozen enough". If the task does wake 65 * It is "frozen enough". If the task does wake
66 * up, it will immediately call try_to_freeze. 66 * up, it will immediately call try_to_freeze.
67 *
68 * Because freeze_task() goes through p's
 69 * scheduler lock after setting TIF_FREEZE, it is
 70 * guaranteed that either we see TASK_RUNNING, or that
71 * try_to_stop() after schedule() in ptrace/signal
72 * stop sees TIF_FREEZE.
67 */ 73 */
68 if (!task_is_stopped_or_traced(p) && 74 if (!task_is_stopped_or_traced(p) &&
69 !freezer_should_skip(p)) 75 !freezer_should_skip(p))
@@ -79,7 +85,7 @@ static int try_to_freeze_tasks(bool sig_only)
79 if (!todo || time_after(jiffies, end_time)) 85 if (!todo || time_after(jiffies, end_time))
80 break; 86 break;
81 87
82 if (!pm_check_wakeup_events()) { 88 if (pm_wakeup_pending()) {
83 wakeup = true; 89 wakeup = true;
84 break; 90 break;
85 } 91 }
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 7335952ee473..de6f86bfa303 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -22,6 +22,7 @@
22#include <linux/mm.h> 22#include <linux/mm.h>
23#include <linux/slab.h> 23#include <linux/slab.h>
24#include <linux/suspend.h> 24#include <linux/suspend.h>
25#include <trace/events/power.h>
25 26
26#include "power.h" 27#include "power.h"
27 28
@@ -30,13 +31,13 @@ const char *const pm_states[PM_SUSPEND_MAX] = {
30 [PM_SUSPEND_MEM] = "mem", 31 [PM_SUSPEND_MEM] = "mem",
31}; 32};
32 33
33static struct platform_suspend_ops *suspend_ops; 34static const struct platform_suspend_ops *suspend_ops;
34 35
35/** 36/**
36 * suspend_set_ops - Set the global suspend method table. 37 * suspend_set_ops - Set the global suspend method table.
37 * @ops: Pointer to ops structure. 38 * @ops: Pointer to ops structure.
38 */ 39 */
39void suspend_set_ops(struct platform_suspend_ops *ops) 40void suspend_set_ops(const struct platform_suspend_ops *ops)
40{ 41{
41 mutex_lock(&pm_mutex); 42 mutex_lock(&pm_mutex);
42 suspend_ops = ops; 43 suspend_ops = ops;
@@ -163,7 +164,7 @@ static int suspend_enter(suspend_state_t state)
163 164
164 error = sysdev_suspend(PMSG_SUSPEND); 165 error = sysdev_suspend(PMSG_SUSPEND);
165 if (!error) { 166 if (!error) {
166 if (!suspend_test(TEST_CORE) && pm_check_wakeup_events()) { 167 if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) {
167 error = suspend_ops->enter(state); 168 error = suspend_ops->enter(state);
168 events_check_enabled = false; 169 events_check_enabled = false;
169 } 170 }
@@ -197,18 +198,18 @@ static int suspend_enter(suspend_state_t state)
197int suspend_devices_and_enter(suspend_state_t state) 198int suspend_devices_and_enter(suspend_state_t state)
198{ 199{
199 int error; 200 int error;
200 gfp_t saved_mask;
201 201
202 if (!suspend_ops) 202 if (!suspend_ops)
203 return -ENOSYS; 203 return -ENOSYS;
204 204
205 trace_machine_suspend(state);
205 if (suspend_ops->begin) { 206 if (suspend_ops->begin) {
206 error = suspend_ops->begin(state); 207 error = suspend_ops->begin(state);
207 if (error) 208 if (error)
208 goto Close; 209 goto Close;
209 } 210 }
210 suspend_console(); 211 suspend_console();
211 saved_mask = clear_gfp_allowed_mask(GFP_IOFS); 212 pm_restrict_gfp_mask();
212 suspend_test_start(); 213 suspend_test_start();
213 error = dpm_suspend_start(PMSG_SUSPEND); 214 error = dpm_suspend_start(PMSG_SUSPEND);
214 if (error) { 215 if (error) {
@@ -225,11 +226,12 @@ int suspend_devices_and_enter(suspend_state_t state)
225 suspend_test_start(); 226 suspend_test_start();
226 dpm_resume_end(PMSG_RESUME); 227 dpm_resume_end(PMSG_RESUME);
227 suspend_test_finish("resume devices"); 228 suspend_test_finish("resume devices");
228 set_gfp_allowed_mask(saved_mask); 229 pm_restore_gfp_mask();
229 resume_console(); 230 resume_console();
230 Close: 231 Close:
231 if (suspend_ops->end) 232 if (suspend_ops->end)
232 suspend_ops->end(); 233 suspend_ops->end();
234 trace_machine_suspend(PWR_EVENT_EXIT);
233 return error; 235 return error;
234 236
235 Recover_platform: 237 Recover_platform:
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index a0e4a86ccf94..7c97c3a0eee3 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -6,6 +6,7 @@
6 * 6 *
7 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz> 7 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz>
8 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> 8 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
9 * Copyright (C) 2010 Bojan Smojver <bojan@rexursive.com>
9 * 10 *
10 * This file is released under the GPLv2. 11 * This file is released under the GPLv2.
11 * 12 *
@@ -29,7 +30,7 @@
29 30
30#include "power.h" 31#include "power.h"
31 32
32#define HIBERNATE_SIG "LINHIB0001" 33#define HIBERNATE_SIG "S1SUSPEND"
33 34
34/* 35/*
35 * The swap map is a data structure used for keeping track of each page 36 * The swap map is a data structure used for keeping track of each page
@@ -223,7 +224,7 @@ static int swsusp_swap_check(void)
223 return res; 224 return res;
224 225
225 root_swap = res; 226 root_swap = res;
226 res = blkdev_get(hib_resume_bdev, FMODE_WRITE); 227 res = blkdev_get(hib_resume_bdev, FMODE_WRITE, NULL);
227 if (res) 228 if (res)
228 return res; 229 return res;
229 230
@@ -753,30 +754,43 @@ static int load_image_lzo(struct swap_map_handle *handle,
753{ 754{
754 unsigned int m; 755 unsigned int m;
755 int error = 0; 756 int error = 0;
757 struct bio *bio;
756 struct timeval start; 758 struct timeval start;
757 struct timeval stop; 759 struct timeval stop;
758 unsigned nr_pages; 760 unsigned nr_pages;
759 size_t off, unc_len, cmp_len; 761 size_t i, off, unc_len, cmp_len;
760 unsigned char *unc, *cmp, *page; 762 unsigned char *unc, *cmp, *page[LZO_CMP_PAGES];
761 763
762 page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); 764 for (i = 0; i < LZO_CMP_PAGES; i++) {
763 if (!page) { 765 page[i] = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
764 printk(KERN_ERR "PM: Failed to allocate LZO page\n"); 766 if (!page[i]) {
765 return -ENOMEM; 767 printk(KERN_ERR "PM: Failed to allocate LZO page\n");
768
769 while (i)
770 free_page((unsigned long)page[--i]);
771
772 return -ENOMEM;
773 }
766 } 774 }
767 775
768 unc = vmalloc(LZO_UNC_SIZE); 776 unc = vmalloc(LZO_UNC_SIZE);
769 if (!unc) { 777 if (!unc) {
770 printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n"); 778 printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n");
771 free_page((unsigned long)page); 779
780 for (i = 0; i < LZO_CMP_PAGES; i++)
781 free_page((unsigned long)page[i]);
782
772 return -ENOMEM; 783 return -ENOMEM;
773 } 784 }
774 785
775 cmp = vmalloc(LZO_CMP_SIZE); 786 cmp = vmalloc(LZO_CMP_SIZE);
776 if (!cmp) { 787 if (!cmp) {
777 printk(KERN_ERR "PM: Failed to allocate LZO compressed\n"); 788 printk(KERN_ERR "PM: Failed to allocate LZO compressed\n");
789
778 vfree(unc); 790 vfree(unc);
779 free_page((unsigned long)page); 791 for (i = 0; i < LZO_CMP_PAGES; i++)
792 free_page((unsigned long)page[i]);
793
780 return -ENOMEM; 794 return -ENOMEM;
781 } 795 }
782 796
@@ -787,6 +801,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
787 if (!m) 801 if (!m)
788 m = 1; 802 m = 1;
789 nr_pages = 0; 803 nr_pages = 0;
804 bio = NULL;
790 do_gettimeofday(&start); 805 do_gettimeofday(&start);
791 806
792 error = snapshot_write_next(snapshot); 807 error = snapshot_write_next(snapshot);
@@ -794,11 +809,11 @@ static int load_image_lzo(struct swap_map_handle *handle,
794 goto out_finish; 809 goto out_finish;
795 810
796 for (;;) { 811 for (;;) {
797 error = swap_read_page(handle, page, NULL); /* sync */ 812 error = swap_read_page(handle, page[0], NULL); /* sync */
798 if (error) 813 if (error)
799 break; 814 break;
800 815
801 cmp_len = *(size_t *)page; 816 cmp_len = *(size_t *)page[0];
802 if (unlikely(!cmp_len || 817 if (unlikely(!cmp_len ||
803 cmp_len > lzo1x_worst_compress(LZO_UNC_SIZE))) { 818 cmp_len > lzo1x_worst_compress(LZO_UNC_SIZE))) {
804 printk(KERN_ERR "PM: Invalid LZO compressed length\n"); 819 printk(KERN_ERR "PM: Invalid LZO compressed length\n");
@@ -806,13 +821,20 @@ static int load_image_lzo(struct swap_map_handle *handle,
806 break; 821 break;
807 } 822 }
808 823
809 memcpy(cmp, page, PAGE_SIZE); 824 for (off = PAGE_SIZE, i = 1;
810 for (off = PAGE_SIZE; off < LZO_HEADER + cmp_len; off += PAGE_SIZE) { 825 off < LZO_HEADER + cmp_len; off += PAGE_SIZE, i++) {
811 error = swap_read_page(handle, page, NULL); /* sync */ 826 error = swap_read_page(handle, page[i], &bio);
812 if (error) 827 if (error)
813 goto out_finish; 828 goto out_finish;
829 }
814 830
815 memcpy(cmp + off, page, PAGE_SIZE); 831 error = hib_wait_on_bio_chain(&bio); /* need all data now */
832 if (error)
833 goto out_finish;
834
835 for (off = 0, i = 0;
836 off < LZO_HEADER + cmp_len; off += PAGE_SIZE, i++) {
837 memcpy(cmp + off, page[i], PAGE_SIZE);
816 } 838 }
817 839
818 unc_len = LZO_UNC_SIZE; 840 unc_len = LZO_UNC_SIZE;
@@ -857,7 +879,8 @@ out_finish:
857 879
858 vfree(cmp); 880 vfree(cmp);
859 vfree(unc); 881 vfree(unc);
860 free_page((unsigned long)page); 882 for (i = 0; i < LZO_CMP_PAGES; i++)
883 free_page((unsigned long)page[i]);
861 884
862 return error; 885 return error;
863} 886}
@@ -865,7 +888,7 @@ out_finish:
865/** 888/**
866 * swsusp_read - read the hibernation image. 889 * swsusp_read - read the hibernation image.
867 * @flags_p: flags passed by the "frozen" kernel in the image header should 890 * @flags_p: flags passed by the "frozen" kernel in the image header should
868 * be written into this memeory location 891 * be written into this memory location
869 */ 892 */
870 893
871int swsusp_read(unsigned int *flags_p) 894int swsusp_read(unsigned int *flags_p)
@@ -907,7 +930,8 @@ int swsusp_check(void)
907{ 930{
908 int error; 931 int error;
909 932
910 hib_resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); 933 hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device,
934 FMODE_READ, NULL);
911 if (!IS_ERR(hib_resume_bdev)) { 935 if (!IS_ERR(hib_resume_bdev)) {
912 set_blocksize(hib_resume_bdev, PAGE_SIZE); 936 set_blocksize(hib_resume_bdev, PAGE_SIZE);
913 clear_page(swsusp_header); 937 clear_page(swsusp_header);
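load_image_lzo() previously read the compressed stream one synchronous page at a time through a single bounce page; the version above allocates an array of LZO_CMP_PAGES pages, issues the reads for a whole compressed block asynchronously by chaining them on one struct bio pointer, and waits once with hib_wait_on_bio_chain() before decompressing. The allocation side relies on the usual partial-unwind idiom; here is a stand-alone sketch of just that idiom, with an illustrative page count and helper names.

#include <linux/gfp.h>
#include <linux/errno.h>
#include <linux/types.h>

#define NR_BOUNCE_PAGES 32      /* illustrative; the patch uses LZO_CMP_PAGES */

static unsigned long bounce[NR_BOUNCE_PAGES];

static int alloc_bounce_pages(void)
{
        size_t i;

        for (i = 0; i < NR_BOUNCE_PAGES; i++) {
                bounce[i] = __get_free_page(__GFP_WAIT | __GFP_HIGH);
                if (!bounce[i])
                        goto unwind;
        }
        return 0;

unwind:
        /* Free only what was successfully allocated, in reverse order. */
        while (i)
                free_page(bounce[--i]);
        return -ENOMEM;
}

static void free_bounce_pages(void)
{
        size_t i;

        for (i = 0; i < NR_BOUNCE_PAGES; i++)
                free_page(bounce[i]);
}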
diff --git a/kernel/power/user.c b/kernel/power/user.c
index e819e17877ca..c36c3b9e8a84 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -137,7 +137,7 @@ static int snapshot_release(struct inode *inode, struct file *filp)
137 free_all_swap_pages(data->swap); 137 free_all_swap_pages(data->swap);
138 if (data->frozen) 138 if (data->frozen)
139 thaw_processes(); 139 thaw_processes();
140 pm_notifier_call_chain(data->mode == O_WRONLY ? 140 pm_notifier_call_chain(data->mode == O_RDONLY ?
141 PM_POST_HIBERNATION : PM_POST_RESTORE); 141 PM_POST_HIBERNATION : PM_POST_RESTORE);
142 atomic_inc(&snapshot_device_available); 142 atomic_inc(&snapshot_device_available);
143 143
@@ -263,6 +263,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
263 case SNAPSHOT_UNFREEZE: 263 case SNAPSHOT_UNFREEZE:
264 if (!data->frozen || data->ready) 264 if (!data->frozen || data->ready)
265 break; 265 break;
266 pm_restore_gfp_mask();
266 thaw_processes(); 267 thaw_processes();
267 usermodehelper_enable(); 268 usermodehelper_enable();
268 data->frozen = 0; 269 data->frozen = 0;
@@ -275,6 +276,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
275 error = -EPERM; 276 error = -EPERM;
276 break; 277 break;
277 } 278 }
279 pm_restore_gfp_mask();
278 error = hibernation_snapshot(data->platform_support); 280 error = hibernation_snapshot(data->platform_support);
279 if (!error) 281 if (!error)
280 error = put_user(in_suspend, (int __user *)arg); 282 error = put_user(in_suspend, (int __user *)arg);
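The snapshot_release() fix matters because the snapshot device is opened read-only by the tool that saves a hibernation image and write-only by the one that feeds an image back for restore, so keying on O_RDONLY makes readers report PM_POST_HIBERNATION and writers PM_POST_RESTORE as intended. Drivers observe these events through the PM notifier chain; the following is a minimal sketch of a consumer, with illustrative names.

#include <linux/suspend.h>
#include <linux/notifier.h>
#include <linux/kernel.h>

static int example_pm_notify(struct notifier_block *nb,
                             unsigned long event, void *unused)
{
        switch (event) {
        case PM_POST_HIBERNATION:       /* image saved, machine kept running */
                pr_info("hibernation image written\n");
                break;
        case PM_POST_RESTORE:           /* restore attempt finished */
                pr_info("restore path completed\n");
                break;
        }
        return NOTIFY_DONE;
}

static struct notifier_block example_pm_nb = {
        .notifier_call = example_pm_notify,
};

/* register_pm_notifier(&example_pm_nb) from module init,
 * unregister_pm_notifier(&example_pm_nb) from module exit. */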
diff --git a/kernel/printk.c b/kernel/printk.c
index b2ebaee8c377..53d9a9ec88e6 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -39,16 +39,11 @@
39#include <linux/syslog.h> 39#include <linux/syslog.h>
40#include <linux/cpu.h> 40#include <linux/cpu.h>
41#include <linux/notifier.h> 41#include <linux/notifier.h>
42#include <linux/rculist.h>
42 43
43#include <asm/uaccess.h> 44#include <asm/uaccess.h>
44 45
45/* 46/*
46 * for_each_console() allows you to iterate on each console
47 */
48#define for_each_console(con) \
49 for (con = console_drivers; con != NULL; con = con->next)
50
51/*
52 * Architectures can override it: 47 * Architectures can override it:
53 */ 48 */
54void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) 49void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
@@ -261,6 +256,12 @@ static inline void boot_delay_msec(void)
261} 256}
262#endif 257#endif
263 258
259#ifdef CONFIG_SECURITY_DMESG_RESTRICT
260int dmesg_restrict = 1;
261#else
262int dmesg_restrict;
263#endif
264
264int do_syslog(int type, char __user *buf, int len, bool from_file) 265int do_syslog(int type, char __user *buf, int len, bool from_file)
265{ 266{
266 unsigned i, j, limit, count; 267 unsigned i, j, limit, count;
@@ -268,7 +269,20 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
268 char c; 269 char c;
269 int error = 0; 270 int error = 0;
270 271
271 error = security_syslog(type, from_file); 272 /*
273 * If this is from /proc/kmsg we only do the capabilities checks
274 * at open time.
275 */
276 if (type == SYSLOG_ACTION_OPEN || !from_file) {
277 if (dmesg_restrict && !capable(CAP_SYSLOG))
278 goto warn; /* switch to return -EPERM after 2.6.39 */
279 if ((type != SYSLOG_ACTION_READ_ALL &&
280 type != SYSLOG_ACTION_SIZE_BUFFER) &&
281 !capable(CAP_SYSLOG))
282 goto warn; /* switch to return -EPERM after 2.6.39 */
283 }
284
285 error = security_syslog(type);
272 if (error) 286 if (error)
273 return error; 287 return error;
274 288
@@ -409,6 +423,12 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
409 } 423 }
410out: 424out:
411 return error; 425 return error;
426warn:
427 /* remove after 2.6.39 */
428 if (capable(CAP_SYS_ADMIN))
429 WARN_ONCE(1, "Attempt to access syslog with CAP_SYS_ADMIN "
430 "but no CAP_SYSLOG (deprecated and denied).\n");
431 return -EPERM;
412} 432}
413 433
414SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) 434SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
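The do_syslog() changes add the dmesg_restrict knob (its default set by CONFIG_SECURITY_DMESG_RESTRICT) and move the privilege test to the new CAP_SYSLOG capability, warning once during the transition when a caller only holds CAP_SYS_ADMIN. Read-only actions (READ_ALL, SIZE_BUFFER) remain unprivileged unless dmesg_restrict is set, and /proc/kmsg readers are only checked when the file is opened. A sketch of the same policy pulled out into a helper follows; the helper is hypothetical, not the kernel's code.

#include <linux/capability.h>
#include <linux/syslog.h>
#include <linux/errno.h>
#include <linux/types.h>

extern int dmesg_restrict;      /* the flag defined in the hunk above */

static int example_syslog_permission(int type, bool from_file)
{
        /* /proc/kmsg reads were already checked when the file was opened. */
        if (type != SYSLOG_ACTION_OPEN && from_file)
                return 0;

        if (dmesg_restrict && !capable(CAP_SYSLOG))
                return -EPERM;

        /* Anything beyond plain reading of the buffer needs CAP_SYSLOG. */
        if (type != SYSLOG_ACTION_READ_ALL &&
            type != SYSLOG_ACTION_SIZE_BUFFER &&
            !capable(CAP_SYSLOG))
                return -EPERM;

        return 0;
}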
@@ -1055,21 +1075,23 @@ static DEFINE_PER_CPU(int, printk_pending);
1055 1075
1056void printk_tick(void) 1076void printk_tick(void)
1057{ 1077{
1058 if (__get_cpu_var(printk_pending)) { 1078 if (__this_cpu_read(printk_pending)) {
1059 __get_cpu_var(printk_pending) = 0; 1079 __this_cpu_write(printk_pending, 0);
1060 wake_up_interruptible(&log_wait); 1080 wake_up_interruptible(&log_wait);
1061 } 1081 }
1062} 1082}
1063 1083
1064int printk_needs_cpu(int cpu) 1084int printk_needs_cpu(int cpu)
1065{ 1085{
1066 return per_cpu(printk_pending, cpu); 1086 if (cpu_is_offline(cpu))
1087 printk_tick();
1088 return __this_cpu_read(printk_pending);
1067} 1089}
1068 1090
1069void wake_up_klogd(void) 1091void wake_up_klogd(void)
1070{ 1092{
1071 if (waitqueue_active(&log_wait)) 1093 if (waitqueue_active(&log_wait))
1072 __raw_get_cpu_var(printk_pending) = 1; 1094 this_cpu_write(printk_pending, 1);
1073} 1095}
1074 1096
1075/** 1097/**
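printk_tick() and wake_up_klogd() move from the address-based __get_cpu_var()/__raw_get_cpu_var() accessors to the __this_cpu_read()/__this_cpu_write()/this_cpu_write() operations, which architectures can implement as a single instruction without first computing the per-CPU address; printk_needs_cpu() additionally drains a pending wakeup when asked about a CPU that is going offline. A minimal sketch of the accessor pattern, with illustrative names:

#include <linux/percpu.h>
#include <linux/types.h>

static DEFINE_PER_CPU(int, example_pending);

/* Caller is expected to be pinned to its CPU (irq context or preempt off). */
static void example_mark_pending(void)
{
        this_cpu_write(example_pending, 1);
}

static bool example_test_and_clear_pending(void)
{
        if (__this_cpu_read(example_pending)) {
                __this_cpu_write(example_pending, 0);
                return true;
        }
        return false;
}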
@@ -1338,6 +1360,7 @@ void register_console(struct console *newcon)
1338 spin_unlock_irqrestore(&logbuf_lock, flags); 1360 spin_unlock_irqrestore(&logbuf_lock, flags);
1339 } 1361 }
1340 release_console_sem(); 1362 release_console_sem();
1363 console_sysfs_notify();
1341 1364
1342 /* 1365 /*
1343 * By unregistering the bootconsoles after we enable the real console 1366 * By unregistering the bootconsoles after we enable the real console
@@ -1396,6 +1419,7 @@ int unregister_console(struct console *console)
1396 console_drivers->flags |= CON_CONSDEV; 1419 console_drivers->flags |= CON_CONSDEV;
1397 1420
1398 release_console_sem(); 1421 release_console_sem();
1422 console_sysfs_notify();
1399 return res; 1423 return res;
1400} 1424}
1401EXPORT_SYMBOL(unregister_console); 1425EXPORT_SYMBOL(unregister_console);
@@ -1479,7 +1503,7 @@ int kmsg_dump_register(struct kmsg_dumper *dumper)
1479 /* Don't allow registering multiple times */ 1503 /* Don't allow registering multiple times */
1480 if (!dumper->registered) { 1504 if (!dumper->registered) {
1481 dumper->registered = 1; 1505 dumper->registered = 1;
1482 list_add_tail(&dumper->list, &dump_list); 1506 list_add_tail_rcu(&dumper->list, &dump_list);
1483 err = 0; 1507 err = 0;
1484 } 1508 }
1485 spin_unlock_irqrestore(&dump_list_lock, flags); 1509 spin_unlock_irqrestore(&dump_list_lock, flags);
@@ -1503,29 +1527,16 @@ int kmsg_dump_unregister(struct kmsg_dumper *dumper)
1503 spin_lock_irqsave(&dump_list_lock, flags); 1527 spin_lock_irqsave(&dump_list_lock, flags);
1504 if (dumper->registered) { 1528 if (dumper->registered) {
1505 dumper->registered = 0; 1529 dumper->registered = 0;
1506 list_del(&dumper->list); 1530 list_del_rcu(&dumper->list);
1507 err = 0; 1531 err = 0;
1508 } 1532 }
1509 spin_unlock_irqrestore(&dump_list_lock, flags); 1533 spin_unlock_irqrestore(&dump_list_lock, flags);
1534 synchronize_rcu();
1510 1535
1511 return err; 1536 return err;
1512} 1537}
1513EXPORT_SYMBOL_GPL(kmsg_dump_unregister); 1538EXPORT_SYMBOL_GPL(kmsg_dump_unregister);
1514 1539
1515static const char * const kmsg_reasons[] = {
1516 [KMSG_DUMP_OOPS] = "oops",
1517 [KMSG_DUMP_PANIC] = "panic",
1518 [KMSG_DUMP_KEXEC] = "kexec",
1519};
1520
1521static const char *kmsg_to_str(enum kmsg_dump_reason reason)
1522{
1523 if (reason >= ARRAY_SIZE(kmsg_reasons) || reason < 0)
1524 return "unknown";
1525
1526 return kmsg_reasons[reason];
1527}
1528
1529/** 1540/**
1530 * kmsg_dump - dump kernel log to kernel message dumpers. 1541 * kmsg_dump - dump kernel log to kernel message dumpers.
1531 * @reason: the reason (oops, panic etc) for dumping 1542 * @reason: the reason (oops, panic etc) for dumping
@@ -1564,13 +1575,9 @@ void kmsg_dump(enum kmsg_dump_reason reason)
1564 l2 = chars; 1575 l2 = chars;
1565 } 1576 }
1566 1577
1567 if (!spin_trylock_irqsave(&dump_list_lock, flags)) { 1578 rcu_read_lock();
1568 printk(KERN_ERR "dump_kmsg: dump list lock is held during %s, skipping dump\n", 1579 list_for_each_entry_rcu(dumper, &dump_list, list)
1569 kmsg_to_str(reason));
1570 return;
1571 }
1572 list_for_each_entry(dumper, &dump_list, list)
1573 dumper->dump(dumper, reason, s1, l1, s2, l2); 1580 dumper->dump(dumper, reason, s1, l1, s2, l2);
1574 spin_unlock_irqrestore(&dump_list_lock, flags); 1581 rcu_read_unlock();
1575} 1582}
1576#endif 1583#endif
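The kmsg_dump changes convert the dumper list to RCU so kmsg_dump() can walk it from oops and panic context without trying, and possibly failing, to take dump_list_lock: writers still serialize on the spinlock and use list_add_tail_rcu()/list_del_rcu(), followed by synchronize_rcu() before the dumper may be reused, while the reader runs under rcu_read_lock(). Here is a condensed sketch of that pattern on a hypothetical callback list, assuming the writers run in process context.

#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/spinlock.h>

struct example_hook {
        struct list_head list;
        void (*fn)(void);
};

static LIST_HEAD(example_hooks);
static DEFINE_SPINLOCK(example_hooks_lock);

static void example_hook_add(struct example_hook *h)
{
        spin_lock(&example_hooks_lock);
        list_add_tail_rcu(&h->list, &example_hooks);
        spin_unlock(&example_hooks_lock);
}

static void example_hook_del(struct example_hook *h)
{
        spin_lock(&example_hooks_lock);
        list_del_rcu(&h->list);
        spin_unlock(&example_hooks_lock);
        synchronize_rcu();      /* no reader can still see 'h' after this */
}

static void example_run_hooks(void)
{
        struct example_hook *h;

        rcu_read_lock();
        list_for_each_entry_rcu(h, &example_hooks, list)
                h->fn();
        rcu_read_unlock();
}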
diff --git a/kernel/range.c b/kernel/range.c
index 471b66acabb5..37fa9b99ad58 100644
--- a/kernel/range.c
+++ b/kernel/range.c
@@ -119,7 +119,7 @@ static int cmp_range(const void *x1, const void *x2)
119 119
120int clean_sort_range(struct range *range, int az) 120int clean_sort_range(struct range *range, int az)
121{ 121{
122 int i, j, k = az - 1, nr_range = 0; 122 int i, j, k = az - 1, nr_range = az;
123 123
124 for (i = 0; i < k; i++) { 124 for (i = 0; i < k; i++) {
125 if (range[i].end) 125 if (range[i].end)
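The one-line change to clean_sort_range() matters because the function derives its return value by scanning for the first empty slot after compaction; when the array is completely full there is no empty slot, the scan never assigns nr_range, and an initial value of 0 made a full array look empty. Initializing to az makes the no-hole case return the full length. A stand-alone user-space illustration of that counting step, with a simplified struct:

#include <stdio.h>

struct range { unsigned long long start, end; };        /* simplified */

/* Count entries up to the first hole; 'init' mimics nr_range's initial value. */
static int count_used(struct range *r, int az, int init)
{
        int i, nr = init;

        for (i = 0; i < az; i++) {
                if (!r[i].end) {        /* empty slot ends the used region */
                        nr = i;
                        break;
                }
        }
        return nr;
}

int main(void)
{
        struct range full[2] = { { 0, 4096 }, { 8192, 16384 } };

        printf("old init (0):  %d entries\n", count_used(full, 2, 0));  /* 0: wrong */
        printf("new init (az): %d entries\n", count_used(full, 2, 2));  /* 2: right */
        return 0;
}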
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index d806735342ac..0c343b9a46d5 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -36,31 +36,16 @@
36#include <linux/time.h> 36#include <linux/time.h>
37#include <linux/cpu.h> 37#include <linux/cpu.h>
38 38
39/* Global control variables for rcupdate callback mechanism. */ 39/* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */
40struct rcu_ctrlblk { 40static struct task_struct *rcu_kthread_task;
41 struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */ 41static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq);
42 struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ 42static unsigned long have_rcu_kthread_work;
43 struct rcu_head **curtail; /* ->next pointer of last CB. */ 43static void invoke_rcu_kthread(void);
44};
45
46/* Definition for rcupdate control block. */
47static struct rcu_ctrlblk rcu_sched_ctrlblk = {
48 .donetail = &rcu_sched_ctrlblk.rcucblist,
49 .curtail = &rcu_sched_ctrlblk.rcucblist,
50};
51
52static struct rcu_ctrlblk rcu_bh_ctrlblk = {
53 .donetail = &rcu_bh_ctrlblk.rcucblist,
54 .curtail = &rcu_bh_ctrlblk.rcucblist,
55};
56
57#ifdef CONFIG_DEBUG_LOCK_ALLOC
58int rcu_scheduler_active __read_mostly;
59EXPORT_SYMBOL_GPL(rcu_scheduler_active);
60#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
61 44
62/* Forward declarations for rcutiny_plugin.h. */ 45/* Forward declarations for rcutiny_plugin.h. */
63static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); 46struct rcu_ctrlblk;
47static void rcu_process_callbacks(struct rcu_ctrlblk *rcp);
48static int rcu_kthread(void *arg);
64static void __call_rcu(struct rcu_head *head, 49static void __call_rcu(struct rcu_head *head,
65 void (*func)(struct rcu_head *rcu), 50 void (*func)(struct rcu_head *rcu),
66 struct rcu_ctrlblk *rcp); 51 struct rcu_ctrlblk *rcp);
@@ -123,7 +108,7 @@ void rcu_sched_qs(int cpu)
123{ 108{
124 if (rcu_qsctr_help(&rcu_sched_ctrlblk) + 109 if (rcu_qsctr_help(&rcu_sched_ctrlblk) +
125 rcu_qsctr_help(&rcu_bh_ctrlblk)) 110 rcu_qsctr_help(&rcu_bh_ctrlblk))
126 raise_softirq(RCU_SOFTIRQ); 111 invoke_rcu_kthread();
127} 112}
128 113
129/* 114/*
@@ -132,7 +117,7 @@ void rcu_sched_qs(int cpu)
132void rcu_bh_qs(int cpu) 117void rcu_bh_qs(int cpu)
133{ 118{
134 if (rcu_qsctr_help(&rcu_bh_ctrlblk)) 119 if (rcu_qsctr_help(&rcu_bh_ctrlblk))
135 raise_softirq(RCU_SOFTIRQ); 120 invoke_rcu_kthread();
136} 121}
137 122
138/* 123/*
@@ -152,13 +137,14 @@ void rcu_check_callbacks(int cpu, int user)
152} 137}
153 138
154/* 139/*
 155 * Helper function for rcu_process_callbacks() that operates on the 140 * Invoke the RCU callbacks on the specified rcu_ctrlblk structure
156 * specified rcu_ctrlkblk structure. 141 * whose grace period has elapsed.
157 */ 142 */
158static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) 143static void rcu_process_callbacks(struct rcu_ctrlblk *rcp)
159{ 144{
160 struct rcu_head *next, *list; 145 struct rcu_head *next, *list;
161 unsigned long flags; 146 unsigned long flags;
147 RCU_TRACE(int cb_count = 0);
162 148
163 /* If no RCU callbacks ready to invoke, just return. */ 149 /* If no RCU callbacks ready to invoke, just return. */
164 if (&rcp->rcucblist == rcp->donetail) 150 if (&rcp->rcucblist == rcp->donetail)
@@ -180,19 +166,59 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
180 next = list->next; 166 next = list->next;
181 prefetch(next); 167 prefetch(next);
182 debug_rcu_head_unqueue(list); 168 debug_rcu_head_unqueue(list);
169 local_bh_disable();
183 list->func(list); 170 list->func(list);
171 local_bh_enable();
184 list = next; 172 list = next;
173 RCU_TRACE(cb_count++);
185 } 174 }
175 RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count));
186} 176}
187 177
188/* 178/*
189 * Invoke any callbacks whose grace period has completed. 179 * This kthread invokes RCU callbacks whose grace periods have
180 * elapsed. It is awakened as needed, and takes the place of the
181 * RCU_SOFTIRQ that was used previously for this purpose.
182 * This is a kthread, but it is never stopped, at least not until
183 * the system goes down.
190 */ 184 */
191static void rcu_process_callbacks(struct softirq_action *unused) 185static int rcu_kthread(void *arg)
192{ 186{
193 __rcu_process_callbacks(&rcu_sched_ctrlblk); 187 unsigned long work;
194 __rcu_process_callbacks(&rcu_bh_ctrlblk); 188 unsigned long morework;
195 rcu_preempt_process_callbacks(); 189 unsigned long flags;
190
191 for (;;) {
192 wait_event_interruptible(rcu_kthread_wq,
193 have_rcu_kthread_work != 0);
194 morework = rcu_boost();
195 local_irq_save(flags);
196 work = have_rcu_kthread_work;
197 have_rcu_kthread_work = morework;
198 local_irq_restore(flags);
199 if (work) {
200 rcu_process_callbacks(&rcu_sched_ctrlblk);
201 rcu_process_callbacks(&rcu_bh_ctrlblk);
202 rcu_preempt_process_callbacks();
203 }
204 schedule_timeout_interruptible(1); /* Leave CPU for others. */
205 }
206
207 return 0; /* Not reached, but needed to shut gcc up. */
208}
209
210/*
211 * Wake up rcu_kthread() to process callbacks now eligible for invocation
212 * or to boost readers.
213 */
214static void invoke_rcu_kthread(void)
215{
216 unsigned long flags;
217
218 local_irq_save(flags);
219 have_rcu_kthread_work = 1;
220 wake_up(&rcu_kthread_wq);
221 local_irq_restore(flags);
196} 222}
197 223
198/* 224/*
@@ -230,6 +256,7 @@ static void __call_rcu(struct rcu_head *head,
230 local_irq_save(flags); 256 local_irq_save(flags);
231 *rcp->curtail = head; 257 *rcp->curtail = head;
232 rcp->curtail = &head->next; 258 rcp->curtail = &head->next;
259 RCU_TRACE(rcp->qlen++);
233 local_irq_restore(flags); 260 local_irq_restore(flags);
234} 261}
235 262
@@ -282,7 +309,16 @@ void rcu_barrier_sched(void)
282} 309}
283EXPORT_SYMBOL_GPL(rcu_barrier_sched); 310EXPORT_SYMBOL_GPL(rcu_barrier_sched);
284 311
285void __init rcu_init(void) 312/*
313 * Spawn the kthread that invokes RCU callbacks.
314 */
315static int __init rcu_spawn_kthreads(void)
286{ 316{
287 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 317 struct sched_param sp;
318
319 rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread");
320 sp.sched_priority = RCU_BOOST_PRIO;
321 sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp);
322 return 0;
288} 323}
324early_initcall(rcu_spawn_kthreads);
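The rcutiny.c rework retires RCU_SOFTIRQ for the TINY_RCU flavors and moves callback invocation into a dedicated kthread: invoke_rcu_kthread() sets a work flag with interrupts disabled and wakes a waitqueue, while rcu_kthread() sleeps on that flag, invokes the callback lists with bottom halves disabled around each callback, and yields for a jiffy between passes; rcu_spawn_kthreads() starts it at SCHED_FIFO priority from an early_initcall. A stripped-down sketch of the wake-flag-and-kthread pattern, with illustrative names:

#include <linux/kthread.h>
#include <linux/wait.h>
#include <linux/sched.h>
#include <linux/irqflags.h>

static DECLARE_WAIT_QUEUE_HEAD(example_wq);
static unsigned long example_have_work;

/* Callable from irq context: record work and wake the worker. */
static void example_kick(void)
{
        unsigned long flags;

        local_irq_save(flags);
        example_have_work = 1;
        wake_up(&example_wq);
        local_irq_restore(flags);
}

static int example_worker(void *unused)
{
        for (;;) {
                wait_event_interruptible(example_wq, example_have_work != 0);
                local_irq_disable();
                example_have_work = 0;
                local_irq_enable();
                /* ... process the queued work here ... */
                schedule_timeout_interruptible(1);      /* leave the CPU for others */
        }
        return 0;
}

/* Spawn with: kthread_run(example_worker, NULL, "example_worker"); */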
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 6ceca4f745ff..015abaea962a 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -22,6 +22,40 @@
22 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> 22 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
23 */ 23 */
24 24
25#include <linux/kthread.h>
26#include <linux/debugfs.h>
27#include <linux/seq_file.h>
28
29#ifdef CONFIG_RCU_TRACE
30#define RCU_TRACE(stmt) stmt
31#else /* #ifdef CONFIG_RCU_TRACE */
32#define RCU_TRACE(stmt)
33#endif /* #else #ifdef CONFIG_RCU_TRACE */
34
35/* Global control variables for rcupdate callback mechanism. */
36struct rcu_ctrlblk {
37 struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */
38 struct rcu_head **donetail; /* ->next pointer of last "done" CB. */
39 struct rcu_head **curtail; /* ->next pointer of last CB. */
40 RCU_TRACE(long qlen); /* Number of pending CBs. */
41};
42
43/* Definition for rcupdate control block. */
44static struct rcu_ctrlblk rcu_sched_ctrlblk = {
45 .donetail = &rcu_sched_ctrlblk.rcucblist,
46 .curtail = &rcu_sched_ctrlblk.rcucblist,
47};
48
49static struct rcu_ctrlblk rcu_bh_ctrlblk = {
50 .donetail = &rcu_bh_ctrlblk.rcucblist,
51 .curtail = &rcu_bh_ctrlblk.rcucblist,
52};
53
54#ifdef CONFIG_DEBUG_LOCK_ALLOC
55int rcu_scheduler_active __read_mostly;
56EXPORT_SYMBOL_GPL(rcu_scheduler_active);
57#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
58
25#ifdef CONFIG_TINY_PREEMPT_RCU 59#ifdef CONFIG_TINY_PREEMPT_RCU
26 60
27#include <linux/delay.h> 61#include <linux/delay.h>
@@ -46,17 +80,45 @@ struct rcu_preempt_ctrlblk {
46 struct list_head *gp_tasks; 80 struct list_head *gp_tasks;
47 /* Pointer to the first task blocking the */ 81 /* Pointer to the first task blocking the */
48 /* current grace period, or NULL if there */ 82 /* current grace period, or NULL if there */
49 /* is not such task. */ 83 /* is no such task. */
50 struct list_head *exp_tasks; 84 struct list_head *exp_tasks;
51 /* Pointer to first task blocking the */ 85 /* Pointer to first task blocking the */
52 /* current expedited grace period, or NULL */ 86 /* current expedited grace period, or NULL */
53 /* if there is no such task. If there */ 87 /* if there is no such task. If there */
54 /* is no current expedited grace period, */ 88 /* is no current expedited grace period, */
55 /* then there cannot be any such task. */ 89 /* then there cannot be any such task. */
90#ifdef CONFIG_RCU_BOOST
91 struct list_head *boost_tasks;
92 /* Pointer to first task that needs to be */
93 /* priority-boosted, or NULL if no priority */
94 /* boosting is needed. If there is no */
95 /* current or expedited grace period, there */
96 /* can be no such task. */
97#endif /* #ifdef CONFIG_RCU_BOOST */
56 u8 gpnum; /* Current grace period. */ 98 u8 gpnum; /* Current grace period. */
57 u8 gpcpu; /* Last grace period blocked by the CPU. */ 99 u8 gpcpu; /* Last grace period blocked by the CPU. */
58 u8 completed; /* Last grace period completed. */ 100 u8 completed; /* Last grace period completed. */
59 /* If all three are equal, RCU is idle. */ 101 /* If all three are equal, RCU is idle. */
102#ifdef CONFIG_RCU_BOOST
103 s8 boosted_this_gp; /* Has boosting already happened? */
104 unsigned long boost_time; /* When to start boosting (jiffies) */
105#endif /* #ifdef CONFIG_RCU_BOOST */
106#ifdef CONFIG_RCU_TRACE
107 unsigned long n_grace_periods;
108#ifdef CONFIG_RCU_BOOST
109 unsigned long n_tasks_boosted;
110 unsigned long n_exp_boosts;
111 unsigned long n_normal_boosts;
112 unsigned long n_normal_balk_blkd_tasks;
113 unsigned long n_normal_balk_gp_tasks;
114 unsigned long n_normal_balk_boost_tasks;
115 unsigned long n_normal_balk_boosted;
116 unsigned long n_normal_balk_notyet;
117 unsigned long n_normal_balk_nos;
118 unsigned long n_exp_balk_blkd_tasks;
119 unsigned long n_exp_balk_nos;
120#endif /* #ifdef CONFIG_RCU_BOOST */
121#endif /* #ifdef CONFIG_RCU_TRACE */
60}; 122};
61 123
62static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = { 124static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = {
@@ -122,6 +184,210 @@ static int rcu_preempt_gp_in_progress(void)
122} 184}
123 185
124/* 186/*
 187 * Advance a ->blkd_tasks-list pointer to the next entry, returning
 188 * NULL instead if at the end of the list.
189 */
190static struct list_head *rcu_next_node_entry(struct task_struct *t)
191{
192 struct list_head *np;
193
194 np = t->rcu_node_entry.next;
195 if (np == &rcu_preempt_ctrlblk.blkd_tasks)
196 np = NULL;
197 return np;
198}
199
200#ifdef CONFIG_RCU_TRACE
201
202#ifdef CONFIG_RCU_BOOST
203static void rcu_initiate_boost_trace(void);
204static void rcu_initiate_exp_boost_trace(void);
205#endif /* #ifdef CONFIG_RCU_BOOST */
206
207/*
 208 * Dump additional statistics for TINY_PREEMPT_RCU.
209 */
210static void show_tiny_preempt_stats(struct seq_file *m)
211{
212 seq_printf(m, "rcu_preempt: qlen=%ld gp=%lu g%u/p%u/c%u tasks=%c%c%c\n",
213 rcu_preempt_ctrlblk.rcb.qlen,
214 rcu_preempt_ctrlblk.n_grace_periods,
215 rcu_preempt_ctrlblk.gpnum,
216 rcu_preempt_ctrlblk.gpcpu,
217 rcu_preempt_ctrlblk.completed,
218 "T."[list_empty(&rcu_preempt_ctrlblk.blkd_tasks)],
219 "N."[!rcu_preempt_ctrlblk.gp_tasks],
220 "E."[!rcu_preempt_ctrlblk.exp_tasks]);
221#ifdef CONFIG_RCU_BOOST
222 seq_printf(m, " ttb=%c btg=",
223 "B."[!rcu_preempt_ctrlblk.boost_tasks]);
224 switch (rcu_preempt_ctrlblk.boosted_this_gp) {
225 case -1:
226 seq_puts(m, "exp");
227 break;
228 case 0:
229 seq_puts(m, "no");
230 break;
231 case 1:
232 seq_puts(m, "begun");
233 break;
234 case 2:
235 seq_puts(m, "done");
236 break;
237 default:
238 seq_printf(m, "?%d?", rcu_preempt_ctrlblk.boosted_this_gp);
239 }
240 seq_printf(m, " ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n",
241 rcu_preempt_ctrlblk.n_tasks_boosted,
242 rcu_preempt_ctrlblk.n_exp_boosts,
243 rcu_preempt_ctrlblk.n_normal_boosts,
244 (int)(jiffies & 0xffff),
245 (int)(rcu_preempt_ctrlblk.boost_time & 0xffff));
246 seq_printf(m, " %s: nt=%lu gt=%lu bt=%lu b=%lu ny=%lu nos=%lu\n",
247 "normal balk",
248 rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks,
249 rcu_preempt_ctrlblk.n_normal_balk_gp_tasks,
250 rcu_preempt_ctrlblk.n_normal_balk_boost_tasks,
251 rcu_preempt_ctrlblk.n_normal_balk_boosted,
252 rcu_preempt_ctrlblk.n_normal_balk_notyet,
253 rcu_preempt_ctrlblk.n_normal_balk_nos);
254 seq_printf(m, " exp balk: bt=%lu nos=%lu\n",
255 rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks,
256 rcu_preempt_ctrlblk.n_exp_balk_nos);
257#endif /* #ifdef CONFIG_RCU_BOOST */
258}
259
260#endif /* #ifdef CONFIG_RCU_TRACE */
261
262#ifdef CONFIG_RCU_BOOST
263
264#include "rtmutex_common.h"
265
266/*
267 * Carry out RCU priority boosting on the task indicated by ->boost_tasks,
268 * and advance ->boost_tasks to the next task in the ->blkd_tasks list.
269 */
270static int rcu_boost(void)
271{
272 unsigned long flags;
273 struct rt_mutex mtx;
274 struct list_head *np;
275 struct task_struct *t;
276
277 if (rcu_preempt_ctrlblk.boost_tasks == NULL)
278 return 0; /* Nothing to boost. */
279 raw_local_irq_save(flags);
280 rcu_preempt_ctrlblk.boosted_this_gp++;
281 t = container_of(rcu_preempt_ctrlblk.boost_tasks, struct task_struct,
282 rcu_node_entry);
283 np = rcu_next_node_entry(t);
284 rt_mutex_init_proxy_locked(&mtx, t);
285 t->rcu_boost_mutex = &mtx;
286 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED;
287 raw_local_irq_restore(flags);
288 rt_mutex_lock(&mtx);
289 RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++);
290 rcu_preempt_ctrlblk.boosted_this_gp++;
291 rt_mutex_unlock(&mtx);
292 return rcu_preempt_ctrlblk.boost_tasks != NULL;
293}
294
295/*
296 * Check to see if it is now time to start boosting RCU readers blocking
297 * the current grace period, and, if so, tell the rcu_kthread_task to
298 * start boosting them. If there is an expedited boost in progress,
299 * we wait for it to complete.
300 *
301 * If there are no blocked readers blocking the current grace period,
302 * return 0 to let the caller know, otherwise return 1. Note that this
303 * return value is independent of whether or not boosting was done.
304 */
305static int rcu_initiate_boost(void)
306{
307 if (!rcu_preempt_blocked_readers_cgp()) {
308 RCU_TRACE(rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks++);
309 return 0;
310 }
311 if (rcu_preempt_ctrlblk.gp_tasks != NULL &&
312 rcu_preempt_ctrlblk.boost_tasks == NULL &&
313 rcu_preempt_ctrlblk.boosted_this_gp == 0 &&
314 ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) {
315 rcu_preempt_ctrlblk.boost_tasks = rcu_preempt_ctrlblk.gp_tasks;
316 invoke_rcu_kthread();
317 RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++);
318 } else
319 RCU_TRACE(rcu_initiate_boost_trace());
320 return 1;
321}
322
323/*
324 * Initiate boosting for an expedited grace period.
325 */
326static void rcu_initiate_expedited_boost(void)
327{
328 unsigned long flags;
329
330 raw_local_irq_save(flags);
331 if (!list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) {
332 rcu_preempt_ctrlblk.boost_tasks =
333 rcu_preempt_ctrlblk.blkd_tasks.next;
334 rcu_preempt_ctrlblk.boosted_this_gp = -1;
335 invoke_rcu_kthread();
336 RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++);
337 } else
338 RCU_TRACE(rcu_initiate_exp_boost_trace());
339 raw_local_irq_restore(flags);
340}
341
342#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000);
343
344/*
345 * Do priority-boost accounting for the start of a new grace period.
346 */
347static void rcu_preempt_boost_start_gp(void)
348{
349 rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
350 if (rcu_preempt_ctrlblk.boosted_this_gp > 0)
351 rcu_preempt_ctrlblk.boosted_this_gp = 0;
352}
353
354#else /* #ifdef CONFIG_RCU_BOOST */
355
356/*
357 * If there is no RCU priority boosting, we don't boost.
358 */
359static int rcu_boost(void)
360{
361 return 0;
362}
363
364/*
365 * If there is no RCU priority boosting, we don't initiate boosting,
366 * but we do indicate whether there are blocked readers blocking the
367 * current grace period.
368 */
369static int rcu_initiate_boost(void)
370{
371 return rcu_preempt_blocked_readers_cgp();
372}
373
374/*
375 * If there is no RCU priority boosting, we don't initiate expedited boosting.
376 */
377static void rcu_initiate_expedited_boost(void)
378{
379}
380
381/*
382 * If there is no RCU priority boosting, nothing to do at grace-period start.
383 */
384static void rcu_preempt_boost_start_gp(void)
385{
386}
387
388#endif /* else #ifdef CONFIG_RCU_BOOST */
389
390/*
125 * Record a preemptible-RCU quiescent state for the specified CPU. Note 391 * Record a preemptible-RCU quiescent state for the specified CPU. Note
126 * that this just means that the task currently running on the CPU is 392 * that this just means that the task currently running on the CPU is
127 * in a quiescent state. There might be any number of tasks blocked 393 * in a quiescent state. There might be any number of tasks blocked
@@ -148,11 +414,14 @@ static void rcu_preempt_cpu_qs(void)
148 rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum; 414 rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum;
149 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; 415 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
150 416
417 /* If there is no GP then there is nothing more to do. */
418 if (!rcu_preempt_gp_in_progress())
419 return;
151 /* 420 /*
152 * If there is no GP, or if blocked readers are still blocking GP, 421 * Check up on boosting. If there are no readers blocking the
153 * then there is nothing more to do. 422 * current grace period, leave.
154 */ 423 */
155 if (!rcu_preempt_gp_in_progress() || rcu_preempt_blocked_readers_cgp()) 424 if (rcu_initiate_boost())
156 return; 425 return;
157 426
158 /* Advance callbacks. */ 427 /* Advance callbacks. */
@@ -164,9 +433,9 @@ static void rcu_preempt_cpu_qs(void)
164 if (!rcu_preempt_blocked_readers_any()) 433 if (!rcu_preempt_blocked_readers_any())
165 rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail; 434 rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail;
166 435
167 /* If there are done callbacks, make RCU_SOFTIRQ process them. */ 436 /* If there are done callbacks, cause them to be invoked. */
168 if (*rcu_preempt_ctrlblk.rcb.donetail != NULL) 437 if (*rcu_preempt_ctrlblk.rcb.donetail != NULL)
169 raise_softirq(RCU_SOFTIRQ); 438 invoke_rcu_kthread();
170} 439}
171 440
172/* 441/*
@@ -178,12 +447,16 @@ static void rcu_preempt_start_gp(void)
178 447
179 /* Official start of GP. */ 448 /* Official start of GP. */
180 rcu_preempt_ctrlblk.gpnum++; 449 rcu_preempt_ctrlblk.gpnum++;
450 RCU_TRACE(rcu_preempt_ctrlblk.n_grace_periods++);
181 451
182 /* Any blocked RCU readers block new GP. */ 452 /* Any blocked RCU readers block new GP. */
183 if (rcu_preempt_blocked_readers_any()) 453 if (rcu_preempt_blocked_readers_any())
184 rcu_preempt_ctrlblk.gp_tasks = 454 rcu_preempt_ctrlblk.gp_tasks =
185 rcu_preempt_ctrlblk.blkd_tasks.next; 455 rcu_preempt_ctrlblk.blkd_tasks.next;
186 456
457 /* Set up for RCU priority boosting. */
458 rcu_preempt_boost_start_gp();
459
187 /* If there is no running reader, CPU is done with GP. */ 460 /* If there is no running reader, CPU is done with GP. */
188 if (!rcu_preempt_running_reader()) 461 if (!rcu_preempt_running_reader())
189 rcu_preempt_cpu_qs(); 462 rcu_preempt_cpu_qs();
@@ -304,14 +577,16 @@ static void rcu_read_unlock_special(struct task_struct *t)
304 */ 577 */
305 empty = !rcu_preempt_blocked_readers_cgp(); 578 empty = !rcu_preempt_blocked_readers_cgp();
306 empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL; 579 empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL;
307 np = t->rcu_node_entry.next; 580 np = rcu_next_node_entry(t);
308 if (np == &rcu_preempt_ctrlblk.blkd_tasks)
309 np = NULL;
310 list_del(&t->rcu_node_entry); 581 list_del(&t->rcu_node_entry);
311 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks) 582 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks)
312 rcu_preempt_ctrlblk.gp_tasks = np; 583 rcu_preempt_ctrlblk.gp_tasks = np;
313 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks) 584 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks)
314 rcu_preempt_ctrlblk.exp_tasks = np; 585 rcu_preempt_ctrlblk.exp_tasks = np;
586#ifdef CONFIG_RCU_BOOST
587 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks)
588 rcu_preempt_ctrlblk.boost_tasks = np;
589#endif /* #ifdef CONFIG_RCU_BOOST */
315 INIT_LIST_HEAD(&t->rcu_node_entry); 590 INIT_LIST_HEAD(&t->rcu_node_entry);
316 591
317 /* 592 /*
@@ -331,6 +606,14 @@ static void rcu_read_unlock_special(struct task_struct *t)
331 if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL) 606 if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL)
332 rcu_report_exp_done(); 607 rcu_report_exp_done();
333 } 608 }
609#ifdef CONFIG_RCU_BOOST
610 /* Unboost self if was boosted. */
611 if (special & RCU_READ_UNLOCK_BOOSTED) {
612 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BOOSTED;
613 rt_mutex_unlock(t->rcu_boost_mutex);
614 t->rcu_boost_mutex = NULL;
615 }
616#endif /* #ifdef CONFIG_RCU_BOOST */
334 local_irq_restore(flags); 617 local_irq_restore(flags);
335} 618}
336 619
@@ -374,7 +657,7 @@ static void rcu_preempt_check_callbacks(void)
374 rcu_preempt_cpu_qs(); 657 rcu_preempt_cpu_qs();
375 if (&rcu_preempt_ctrlblk.rcb.rcucblist != 658 if (&rcu_preempt_ctrlblk.rcb.rcucblist !=
376 rcu_preempt_ctrlblk.rcb.donetail) 659 rcu_preempt_ctrlblk.rcb.donetail)
377 raise_softirq(RCU_SOFTIRQ); 660 invoke_rcu_kthread();
378 if (rcu_preempt_gp_in_progress() && 661 if (rcu_preempt_gp_in_progress() &&
379 rcu_cpu_blocking_cur_gp() && 662 rcu_cpu_blocking_cur_gp() &&
380 rcu_preempt_running_reader()) 663 rcu_preempt_running_reader())
@@ -383,7 +666,7 @@ static void rcu_preempt_check_callbacks(void)
383 666
384/* 667/*
385 * TINY_PREEMPT_RCU has an extra callback-list tail pointer to 668 * TINY_PREEMPT_RCU has an extra callback-list tail pointer to
386 * update, so this is invoked from __rcu_process_callbacks() to 669 * update, so this is invoked from rcu_process_callbacks() to
387 * handle that case. Of course, it is invoked for all flavors of 670 * handle that case. Of course, it is invoked for all flavors of
388 * RCU, but RCU callbacks can appear only on one of the lists, and 671 * RCU, but RCU callbacks can appear only on one of the lists, and
389 * neither ->nexttail nor ->donetail can possibly be NULL, so there 672 * neither ->nexttail nor ->donetail can possibly be NULL, so there
@@ -400,7 +683,7 @@ static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp)
400 */ 683 */
401static void rcu_preempt_process_callbacks(void) 684static void rcu_preempt_process_callbacks(void)
402{ 685{
403 __rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb); 686 rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb);
404} 687}
405 688
406/* 689/*
@@ -417,6 +700,7 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
417 local_irq_save(flags); 700 local_irq_save(flags);
418 *rcu_preempt_ctrlblk.nexttail = head; 701 *rcu_preempt_ctrlblk.nexttail = head;
419 rcu_preempt_ctrlblk.nexttail = &head->next; 702 rcu_preempt_ctrlblk.nexttail = &head->next;
703 RCU_TRACE(rcu_preempt_ctrlblk.rcb.qlen++);
420 rcu_preempt_start_gp(); /* checks to see if GP needed. */ 704 rcu_preempt_start_gp(); /* checks to see if GP needed. */
421 local_irq_restore(flags); 705 local_irq_restore(flags);
422} 706}
@@ -532,6 +816,7 @@ void synchronize_rcu_expedited(void)
532 816
533 /* Wait for tail of ->blkd_tasks list to drain. */ 817 /* Wait for tail of ->blkd_tasks list to drain. */
534 if (rcu_preempted_readers_exp()) 818 if (rcu_preempted_readers_exp())
819 rcu_initiate_expedited_boost();
535 wait_event(sync_rcu_preempt_exp_wq, 820 wait_event(sync_rcu_preempt_exp_wq,
536 !rcu_preempted_readers_exp()); 821 !rcu_preempted_readers_exp());
537 822
@@ -572,6 +857,27 @@ void exit_rcu(void)
572 857
573#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ 858#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */
574 859
860#ifdef CONFIG_RCU_TRACE
861
862/*
863 * Because preemptible RCU does not exist, it is not necessary to
864 * dump out its statistics.
865 */
866static void show_tiny_preempt_stats(struct seq_file *m)
867{
868}
869
870#endif /* #ifdef CONFIG_RCU_TRACE */
871
872/*
873 * Because preemptible RCU does not exist, it is never necessary to
874 * boost preempted RCU readers.
875 */
876static int rcu_boost(void)
877{
878 return 0;
879}
880
575/* 881/*
576 * Because preemptible RCU does not exist, it never has any callbacks 882 * Because preemptible RCU does not exist, it never has any callbacks
577 * to check. 883 * to check.
@@ -599,17 +905,116 @@ static void rcu_preempt_process_callbacks(void)
599#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */ 905#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */
600 906
601#ifdef CONFIG_DEBUG_LOCK_ALLOC 907#ifdef CONFIG_DEBUG_LOCK_ALLOC
602
603#include <linux/kernel_stat.h> 908#include <linux/kernel_stat.h>
604 909
605/* 910/*
606 * During boot, we forgive RCU lockdep issues. After this function is 911 * During boot, we forgive RCU lockdep issues. After this function is
607 * invoked, we start taking RCU lockdep issues seriously. 912 * invoked, we start taking RCU lockdep issues seriously.
608 */ 913 */
609void rcu_scheduler_starting(void) 914void __init rcu_scheduler_starting(void)
610{ 915{
611 WARN_ON(nr_context_switches() > 0); 916 WARN_ON(nr_context_switches() > 0);
612 rcu_scheduler_active = 1; 917 rcu_scheduler_active = 1;
613} 918}
614 919
615#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 920#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
921
922#ifdef CONFIG_RCU_BOOST
923#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
924#else /* #ifdef CONFIG_RCU_BOOST */
925#define RCU_BOOST_PRIO 1
926#endif /* #else #ifdef CONFIG_RCU_BOOST */
927
928#ifdef CONFIG_RCU_TRACE
929
930#ifdef CONFIG_RCU_BOOST
931
932static void rcu_initiate_boost_trace(void)
933{
934 if (rcu_preempt_ctrlblk.gp_tasks == NULL)
935 rcu_preempt_ctrlblk.n_normal_balk_gp_tasks++;
936 else if (rcu_preempt_ctrlblk.boost_tasks != NULL)
937 rcu_preempt_ctrlblk.n_normal_balk_boost_tasks++;
938 else if (rcu_preempt_ctrlblk.boosted_this_gp != 0)
939 rcu_preempt_ctrlblk.n_normal_balk_boosted++;
940 else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))
941 rcu_preempt_ctrlblk.n_normal_balk_notyet++;
942 else
943 rcu_preempt_ctrlblk.n_normal_balk_nos++;
944}
945
946static void rcu_initiate_exp_boost_trace(void)
947{
948 if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks))
949 rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks++;
950 else
951 rcu_preempt_ctrlblk.n_exp_balk_nos++;
952}
953
954#endif /* #ifdef CONFIG_RCU_BOOST */
955
956static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n)
957{
958 unsigned long flags;
959
960 raw_local_irq_save(flags);
961 rcp->qlen -= n;
962 raw_local_irq_restore(flags);
963}
964
965/*
966 * Dump statistics for TINY_RCU, such as they are.
967 */
968static int show_tiny_stats(struct seq_file *m, void *unused)
969{
970 show_tiny_preempt_stats(m);
971 seq_printf(m, "rcu_sched: qlen: %ld\n", rcu_sched_ctrlblk.qlen);
972 seq_printf(m, "rcu_bh: qlen: %ld\n", rcu_bh_ctrlblk.qlen);
973 return 0;
974}
975
976static int show_tiny_stats_open(struct inode *inode, struct file *file)
977{
978 return single_open(file, show_tiny_stats, NULL);
979}
980
981static const struct file_operations show_tiny_stats_fops = {
982 .owner = THIS_MODULE,
983 .open = show_tiny_stats_open,
984 .read = seq_read,
985 .llseek = seq_lseek,
986 .release = single_release,
987};
988
989static struct dentry *rcudir;
990
991static int __init rcutiny_trace_init(void)
992{
993 struct dentry *retval;
994
995 rcudir = debugfs_create_dir("rcu", NULL);
996 if (!rcudir)
997 goto free_out;
998 retval = debugfs_create_file("rcudata", 0444, rcudir,
999 NULL, &show_tiny_stats_fops);
1000 if (!retval)
1001 goto free_out;
1002 return 0;
1003free_out:
1004 debugfs_remove_recursive(rcudir);
1005 return 1;
1006}
1007
1008static void __exit rcutiny_trace_cleanup(void)
1009{
1010 debugfs_remove_recursive(rcudir);
1011}
1012
1013module_init(rcutiny_trace_init);
1014module_exit(rcutiny_trace_cleanup);
1015
1016MODULE_AUTHOR("Paul E. McKenney");
1017MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation");
1018MODULE_LICENSE("GPL");
1019
1020#endif /* #ifdef CONFIG_RCU_TRACE */
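The CONFIG_RCU_TRACE block added at the end of rcutiny_plugin.h exposes the new qlen and boost counters through a debugfs file built on the standard single_open()/seq_file pairing, and tears the directory down with debugfs_remove_recursive(). The same boilerplate reduced to its skeleton, with illustrative file and symbol names:

#include <linux/module.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/fs.h>
#include <linux/errno.h>

static struct dentry *example_dir;

static int example_show(struct seq_file *m, void *unused)
{
        seq_printf(m, "example: nothing to report\n");
        return 0;
}

static int example_open(struct inode *inode, struct file *file)
{
        return single_open(file, example_show, NULL);
}

static const struct file_operations example_fops = {
        .owner          = THIS_MODULE,
        .open           = example_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = single_release,
};

static int __init example_init(void)
{
        example_dir = debugfs_create_dir("example", NULL);
        if (!example_dir)
                return -ENOMEM;
        if (!debugfs_create_file("stats", 0444, example_dir, NULL,
                                 &example_fops)) {
                debugfs_remove_recursive(example_dir);
                return -ENOMEM;
        }
        return 0;
}

static void __exit example_exit(void)
{
        debugfs_remove_recursive(example_dir);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");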
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 9d8e8fb2515f..89613f97ff26 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -47,6 +47,7 @@
47#include <linux/srcu.h> 47#include <linux/srcu.h>
48#include <linux/slab.h> 48#include <linux/slab.h>
49#include <asm/byteorder.h> 49#include <asm/byteorder.h>
50#include <linux/sched.h>
50 51
51MODULE_LICENSE("GPL"); 52MODULE_LICENSE("GPL");
52MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " 53MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and "
@@ -64,6 +65,9 @@ static int irqreader = 1; /* RCU readers from irq (timers). */
64static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */ 65static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */
65static int fqs_holdoff = 0; /* Hold time within burst (us). */ 66static int fqs_holdoff = 0; /* Hold time within burst (us). */
66static int fqs_stutter = 3; /* Wait time between bursts (s). */ 67static int fqs_stutter = 3; /* Wait time between bursts (s). */
68static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */
69static int test_boost_interval = 7; /* Interval between boost tests, seconds. */
70static int test_boost_duration = 4; /* Duration of each boost test, seconds. */
67static char *torture_type = "rcu"; /* What RCU implementation to torture. */ 71static char *torture_type = "rcu"; /* What RCU implementation to torture. */
68 72
69module_param(nreaders, int, 0444); 73module_param(nreaders, int, 0444);
@@ -88,6 +92,12 @@ module_param(fqs_holdoff, int, 0444);
88MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); 92MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)");
89module_param(fqs_stutter, int, 0444); 93module_param(fqs_stutter, int, 0444);
90MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); 94MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
95module_param(test_boost, int, 0444);
96MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
97module_param(test_boost_interval, int, 0444);
98MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds.");
99module_param(test_boost_duration, int, 0444);
100MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds.");
91module_param(torture_type, charp, 0444); 101module_param(torture_type, charp, 0444);
92MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); 102MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)");
93 103
@@ -109,6 +119,7 @@ static struct task_struct *stats_task;
109static struct task_struct *shuffler_task; 119static struct task_struct *shuffler_task;
110static struct task_struct *stutter_task; 120static struct task_struct *stutter_task;
111static struct task_struct *fqs_task; 121static struct task_struct *fqs_task;
122static struct task_struct *boost_tasks[NR_CPUS];
112 123
113#define RCU_TORTURE_PIPE_LEN 10 124#define RCU_TORTURE_PIPE_LEN 10
114 125
@@ -134,6 +145,12 @@ static atomic_t n_rcu_torture_alloc_fail;
134static atomic_t n_rcu_torture_free; 145static atomic_t n_rcu_torture_free;
135static atomic_t n_rcu_torture_mberror; 146static atomic_t n_rcu_torture_mberror;
136static atomic_t n_rcu_torture_error; 147static atomic_t n_rcu_torture_error;
148static long n_rcu_torture_boost_ktrerror;
149static long n_rcu_torture_boost_rterror;
150static long n_rcu_torture_boost_allocerror;
151static long n_rcu_torture_boost_afferror;
152static long n_rcu_torture_boost_failure;
153static long n_rcu_torture_boosts;
137static long n_rcu_torture_timers; 154static long n_rcu_torture_timers;
138static struct list_head rcu_torture_removed; 155static struct list_head rcu_torture_removed;
139static cpumask_var_t shuffle_tmp_mask; 156static cpumask_var_t shuffle_tmp_mask;
@@ -147,6 +164,16 @@ static int stutter_pause_test;
147#endif 164#endif
148int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; 165int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
149 166
167#ifdef CONFIG_RCU_BOOST
168#define rcu_can_boost() 1
169#else /* #ifdef CONFIG_RCU_BOOST */
170#define rcu_can_boost() 0
171#endif /* #else #ifdef CONFIG_RCU_BOOST */
172
173static unsigned long boost_starttime; /* jiffies of next boost test start. */
174DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
175 /* and boost task create/destroy. */
176
150/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ 177/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */
151 178
152#define FULLSTOP_DONTSTOP 0 /* Normal operation. */ 179#define FULLSTOP_DONTSTOP 0 /* Normal operation. */
@@ -277,6 +304,7 @@ struct rcu_torture_ops {
277 void (*fqs)(void); 304 void (*fqs)(void);
278 int (*stats)(char *page); 305 int (*stats)(char *page);
279 int irq_capable; 306 int irq_capable;
307 int can_boost;
280 char *name; 308 char *name;
281}; 309};
282 310
@@ -366,6 +394,7 @@ static struct rcu_torture_ops rcu_ops = {
366 .fqs = rcu_force_quiescent_state, 394 .fqs = rcu_force_quiescent_state,
367 .stats = NULL, 395 .stats = NULL,
368 .irq_capable = 1, 396 .irq_capable = 1,
397 .can_boost = rcu_can_boost(),
369 .name = "rcu" 398 .name = "rcu"
370}; 399};
371 400
@@ -408,6 +437,7 @@ static struct rcu_torture_ops rcu_sync_ops = {
408 .fqs = rcu_force_quiescent_state, 437 .fqs = rcu_force_quiescent_state,
409 .stats = NULL, 438 .stats = NULL,
410 .irq_capable = 1, 439 .irq_capable = 1,
440 .can_boost = rcu_can_boost(),
411 .name = "rcu_sync" 441 .name = "rcu_sync"
412}; 442};
413 443
@@ -424,6 +454,7 @@ static struct rcu_torture_ops rcu_expedited_ops = {
424 .fqs = rcu_force_quiescent_state, 454 .fqs = rcu_force_quiescent_state,
425 .stats = NULL, 455 .stats = NULL,
426 .irq_capable = 1, 456 .irq_capable = 1,
457 .can_boost = rcu_can_boost(),
427 .name = "rcu_expedited" 458 .name = "rcu_expedited"
428}; 459};
429 460
@@ -684,6 +715,110 @@ static struct rcu_torture_ops sched_expedited_ops = {
684}; 715};
685 716
686/* 717/*
718 * RCU torture priority-boost testing. Runs one real-time thread per
719 * CPU for moderate bursts, repeatedly registering RCU callbacks and
720 * spinning waiting for them to be invoked. If a given callback takes
721 * too long to be invoked, we assume that priority inversion has occurred.
722 */
723
724struct rcu_boost_inflight {
725 struct rcu_head rcu;
726 int inflight;
727};
728
729static void rcu_torture_boost_cb(struct rcu_head *head)
730{
731 struct rcu_boost_inflight *rbip =
732 container_of(head, struct rcu_boost_inflight, rcu);
733
734 smp_mb(); /* Ensure RCU-core accesses precede clearing ->inflight */
735 rbip->inflight = 0;
736}
737
738static int rcu_torture_boost(void *arg)
739{
740 unsigned long call_rcu_time;
741 unsigned long endtime;
742 unsigned long oldstarttime;
743 struct rcu_boost_inflight rbi = { .inflight = 0 };
744 struct sched_param sp;
745
746 VERBOSE_PRINTK_STRING("rcu_torture_boost started");
747
748 /* Set real-time priority. */
749 sp.sched_priority = 1;
750 if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) {
751 VERBOSE_PRINTK_STRING("rcu_torture_boost RT prio failed!");
752 n_rcu_torture_boost_rterror++;
753 }
754
755 /* Each pass through the following loop does one boost-test cycle. */
756 do {
757 /* Wait for the next test interval. */
758 oldstarttime = boost_starttime;
759 while (jiffies - oldstarttime > ULONG_MAX / 2) {
760 schedule_timeout_uninterruptible(1);
761 rcu_stutter_wait("rcu_torture_boost");
762 if (kthread_should_stop() ||
763 fullstop != FULLSTOP_DONTSTOP)
764 goto checkwait;
765 }
766
767 /* Do one boost-test interval. */
768 endtime = oldstarttime + test_boost_duration * HZ;
769 call_rcu_time = jiffies;
770 while (jiffies - endtime > ULONG_MAX / 2) {
771 /* If we don't have a callback in flight, post one. */
772 if (!rbi.inflight) {
773 smp_mb(); /* RCU core before ->inflight = 1. */
774 rbi.inflight = 1;
775 call_rcu(&rbi.rcu, rcu_torture_boost_cb);
776 if (jiffies - call_rcu_time >
777 test_boost_duration * HZ - HZ / 2) {
778 VERBOSE_PRINTK_STRING("rcu_torture_boost boosting failed");
779 n_rcu_torture_boost_failure++;
780 }
781 call_rcu_time = jiffies;
782 }
783 cond_resched();
784 rcu_stutter_wait("rcu_torture_boost");
785 if (kthread_should_stop() ||
786 fullstop != FULLSTOP_DONTSTOP)
787 goto checkwait;
788 }
789
790 /*
791 * Set the start time of the next test interval.
792 * Yes, this is vulnerable to long delays, but such
793 * delays simply cause a false negative for the next
794 * interval. Besides, we are running at RT priority,
795 * so delays should be relatively rare.
796 */
797 while (oldstarttime == boost_starttime) {
798 if (mutex_trylock(&boost_mutex)) {
799 boost_starttime = jiffies +
800 test_boost_interval * HZ;
801 n_rcu_torture_boosts++;
802 mutex_unlock(&boost_mutex);
803 break;
804 }
805 schedule_timeout_uninterruptible(1);
806 }
807
808 /* Go do the stutter. */
809checkwait: rcu_stutter_wait("rcu_torture_boost");
810 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
811
812 /* Clean up and exit. */
813 VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping");
814 rcutorture_shutdown_absorb("rcu_torture_boost");
815 while (!kthread_should_stop() || rbi.inflight)
816 schedule_timeout_uninterruptible(1);
817 smp_mb(); /* order accesses to ->inflight before stack-frame death. */
818 return 0;
819}
820
821/*
687 * RCU torture force-quiescent-state kthread. Repeatedly induces 822 * RCU torture force-quiescent-state kthread. Repeatedly induces
688 * bursts of calls to force_quiescent_state(), increasing the probability 823 * bursts of calls to force_quiescent_state(), increasing the probability
689 * of occurrence of some important types of race conditions. 824 * of occurrence of some important types of race conditions.
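A note for readers of rcu_torture_boost() above: the loops compare times with "jiffies - x > ULONG_MAX / 2", an open-coded, wrap-safe way of asking whether jiffies is still before x. The conventional spelling uses the time_before()/time_after() macros from <linux/jiffies.h>; a sketch of the equivalent wait, not the module's code:

#include <linux/jiffies.h>
#include <linux/sched.h>

/* Wait until 'when' (in jiffies), wrap-safe; mirrors the loops above. */
static void example_wait_until(unsigned long when)
{
        while (time_before(jiffies, when))
                schedule_timeout_uninterruptible(1);
}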
@@ -933,7 +1068,8 @@ rcu_torture_printk(char *page)
933 cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); 1068 cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG);
934 cnt += sprintf(&page[cnt], 1069 cnt += sprintf(&page[cnt],
935 "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " 1070 "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d "
936 "rtmbe: %d nt: %ld", 1071 "rtmbe: %d rtbke: %ld rtbre: %ld rtbae: %ld rtbafe: %ld "
1072 "rtbf: %ld rtb: %ld nt: %ld",
937 rcu_torture_current, 1073 rcu_torture_current,
938 rcu_torture_current_version, 1074 rcu_torture_current_version,
939 list_empty(&rcu_torture_freelist), 1075 list_empty(&rcu_torture_freelist),
@@ -941,8 +1077,19 @@ rcu_torture_printk(char *page)
941 atomic_read(&n_rcu_torture_alloc_fail), 1077 atomic_read(&n_rcu_torture_alloc_fail),
942 atomic_read(&n_rcu_torture_free), 1078 atomic_read(&n_rcu_torture_free),
943 atomic_read(&n_rcu_torture_mberror), 1079 atomic_read(&n_rcu_torture_mberror),
1080 n_rcu_torture_boost_ktrerror,
1081 n_rcu_torture_boost_rterror,
1082 n_rcu_torture_boost_allocerror,
1083 n_rcu_torture_boost_afferror,
1084 n_rcu_torture_boost_failure,
1085 n_rcu_torture_boosts,
944 n_rcu_torture_timers); 1086 n_rcu_torture_timers);
945 if (atomic_read(&n_rcu_torture_mberror) != 0) 1087 if (atomic_read(&n_rcu_torture_mberror) != 0 ||
1088 n_rcu_torture_boost_ktrerror != 0 ||
1089 n_rcu_torture_boost_rterror != 0 ||
1090 n_rcu_torture_boost_allocerror != 0 ||
1091 n_rcu_torture_boost_afferror != 0 ||
1092 n_rcu_torture_boost_failure != 0)
946 cnt += sprintf(&page[cnt], " !!!"); 1093 cnt += sprintf(&page[cnt], " !!!");
947 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); 1094 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
948 if (i > 1) { 1095 if (i > 1) {
@@ -1094,22 +1241,91 @@ rcu_torture_stutter(void *arg)
1094} 1241}
1095 1242
1096static inline void 1243static inline void
1097rcu_torture_print_module_parms(char *tag) 1244rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag)
1098{ 1245{
1099 printk(KERN_ALERT "%s" TORTURE_FLAG 1246 printk(KERN_ALERT "%s" TORTURE_FLAG
1100 "--- %s: nreaders=%d nfakewriters=%d " 1247 "--- %s: nreaders=%d nfakewriters=%d "
1101 "stat_interval=%d verbose=%d test_no_idle_hz=%d " 1248 "stat_interval=%d verbose=%d test_no_idle_hz=%d "
1102 "shuffle_interval=%d stutter=%d irqreader=%d " 1249 "shuffle_interval=%d stutter=%d irqreader=%d "
1103 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d\n", 1250 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d "
1251 "test_boost=%d/%d test_boost_interval=%d "
1252 "test_boost_duration=%d\n",
1104 torture_type, tag, nrealreaders, nfakewriters, 1253 torture_type, tag, nrealreaders, nfakewriters,
1105 stat_interval, verbose, test_no_idle_hz, shuffle_interval, 1254 stat_interval, verbose, test_no_idle_hz, shuffle_interval,
1106 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter); 1255 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
1256 test_boost, cur_ops->can_boost,
1257 test_boost_interval, test_boost_duration);
1107} 1258}
1108 1259
1109static struct notifier_block rcutorture_nb = { 1260static struct notifier_block rcutorture_shutdown_nb = {
1110 .notifier_call = rcutorture_shutdown_notify, 1261 .notifier_call = rcutorture_shutdown_notify,
1111}; 1262};
1112 1263
1264static void rcutorture_booster_cleanup(int cpu)
1265{
1266 struct task_struct *t;
1267
1268 if (boost_tasks[cpu] == NULL)
1269 return;
1270 mutex_lock(&boost_mutex);
1271 VERBOSE_PRINTK_STRING("Stopping rcu_torture_boost task");
1272 t = boost_tasks[cpu];
1273 boost_tasks[cpu] = NULL;
1274 mutex_unlock(&boost_mutex);
1275
1276 /* This must be outside of the mutex, otherwise deadlock! */
1277 kthread_stop(t);
1278}
1279
1280static int rcutorture_booster_init(int cpu)
1281{
1282 int retval;
1283
1284 if (boost_tasks[cpu] != NULL)
1285 return 0; /* Already created, nothing more to do. */
1286
1287 /* Don't allow time recalculation while creating a new task. */
1288 mutex_lock(&boost_mutex);
1289 VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task");
1290 boost_tasks[cpu] = kthread_create(rcu_torture_boost, NULL,
1291 "rcu_torture_boost");
1292 if (IS_ERR(boost_tasks[cpu])) {
1293 retval = PTR_ERR(boost_tasks[cpu]);
1294 VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed");
1295 n_rcu_torture_boost_ktrerror++;
1296 boost_tasks[cpu] = NULL;
1297 mutex_unlock(&boost_mutex);
1298 return retval;
1299 }
1300 kthread_bind(boost_tasks[cpu], cpu);
1301 wake_up_process(boost_tasks[cpu]);
1302 mutex_unlock(&boost_mutex);
1303 return 0;
1304}
1305
1306static int rcutorture_cpu_notify(struct notifier_block *self,
1307 unsigned long action, void *hcpu)
1308{
1309 long cpu = (long)hcpu;
1310
1311 switch (action) {
1312 case CPU_ONLINE:
1313 case CPU_DOWN_FAILED:
1314 (void)rcutorture_booster_init(cpu);
1315 break;
1316 case CPU_DOWN_PREPARE:
1317 rcutorture_booster_cleanup(cpu);
1318 break;
1319 default:
1320 break;
1321 }
1322 return NOTIFY_OK;
1323}
1324
1325static struct notifier_block rcutorture_cpu_nb = {
1326 .notifier_call = rcutorture_cpu_notify,
1327};
1328
1113static void 1329static void
1114rcu_torture_cleanup(void) 1330rcu_torture_cleanup(void)
1115{ 1331{
@@ -1127,7 +1343,7 @@ rcu_torture_cleanup(void)
1127 } 1343 }
1128 fullstop = FULLSTOP_RMMOD; 1344 fullstop = FULLSTOP_RMMOD;
1129 mutex_unlock(&fullstop_mutex); 1345 mutex_unlock(&fullstop_mutex);
1130 unregister_reboot_notifier(&rcutorture_nb); 1346 unregister_reboot_notifier(&rcutorture_shutdown_nb);
1131 if (stutter_task) { 1347 if (stutter_task) {
1132 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); 1348 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task");
1133 kthread_stop(stutter_task); 1349 kthread_stop(stutter_task);
@@ -1184,6 +1400,12 @@ rcu_torture_cleanup(void)
1184 kthread_stop(fqs_task); 1400 kthread_stop(fqs_task);
1185 } 1401 }
1186 fqs_task = NULL; 1402 fqs_task = NULL;
1403 if ((test_boost == 1 && cur_ops->can_boost) ||
1404 test_boost == 2) {
1405 unregister_cpu_notifier(&rcutorture_cpu_nb);
1406 for_each_possible_cpu(i)
1407 rcutorture_booster_cleanup(i);
1408 }
1187 1409
1188 /* Wait for all RCU callbacks to fire. */ 1410 /* Wait for all RCU callbacks to fire. */
1189 1411
@@ -1195,9 +1417,9 @@ rcu_torture_cleanup(void)
1195 if (cur_ops->cleanup) 1417 if (cur_ops->cleanup)
1196 cur_ops->cleanup(); 1418 cur_ops->cleanup();
1197 if (atomic_read(&n_rcu_torture_error)) 1419 if (atomic_read(&n_rcu_torture_error))
1198 rcu_torture_print_module_parms("End of test: FAILURE"); 1420 rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE");
1199 else 1421 else
1200 rcu_torture_print_module_parms("End of test: SUCCESS"); 1422 rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS");
1201} 1423}
1202 1424
1203static int __init 1425static int __init
@@ -1242,7 +1464,7 @@ rcu_torture_init(void)
1242 nrealreaders = nreaders; 1464 nrealreaders = nreaders;
1243 else 1465 else
1244 nrealreaders = 2 * num_online_cpus(); 1466 nrealreaders = 2 * num_online_cpus();
1245 rcu_torture_print_module_parms("Start of test"); 1467 rcu_torture_print_module_parms(cur_ops, "Start of test");
1246 fullstop = FULLSTOP_DONTSTOP; 1468 fullstop = FULLSTOP_DONTSTOP;
1247 1469
1248 /* Set up the freelist. */ 1470 /* Set up the freelist. */
@@ -1263,6 +1485,12 @@ rcu_torture_init(void)
1263 atomic_set(&n_rcu_torture_free, 0); 1485 atomic_set(&n_rcu_torture_free, 0);
1264 atomic_set(&n_rcu_torture_mberror, 0); 1486 atomic_set(&n_rcu_torture_mberror, 0);
1265 atomic_set(&n_rcu_torture_error, 0); 1487 atomic_set(&n_rcu_torture_error, 0);
1488 n_rcu_torture_boost_ktrerror = 0;
1489 n_rcu_torture_boost_rterror = 0;
1490 n_rcu_torture_boost_allocerror = 0;
1491 n_rcu_torture_boost_afferror = 0;
1492 n_rcu_torture_boost_failure = 0;
1493 n_rcu_torture_boosts = 0;
1266 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) 1494 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
1267 atomic_set(&rcu_torture_wcount[i], 0); 1495 atomic_set(&rcu_torture_wcount[i], 0);
1268 for_each_possible_cpu(cpu) { 1496 for_each_possible_cpu(cpu) {
@@ -1376,7 +1604,27 @@ rcu_torture_init(void)
1376 goto unwind; 1604 goto unwind;
1377 } 1605 }
1378 } 1606 }
1379 register_reboot_notifier(&rcutorture_nb); 1607 if (test_boost_interval < 1)
1608 test_boost_interval = 1;
1609 if (test_boost_duration < 2)
1610 test_boost_duration = 2;
1611 if ((test_boost == 1 && cur_ops->can_boost) ||
1612 test_boost == 2) {
1613 int retval;
1614
1615 boost_starttime = jiffies + test_boost_interval * HZ;
1616 register_cpu_notifier(&rcutorture_cpu_nb);
1617 for_each_possible_cpu(i) {
1618 if (cpu_is_offline(i))
1619 continue; /* Heuristic: CPU can go offline. */
1620 retval = rcutorture_booster_init(i);
1621 if (retval < 0) {
1622 firsterr = retval;
1623 goto unwind;
1624 }
1625 }
1626 }
1627 register_reboot_notifier(&rcutorture_shutdown_nb);
1380 mutex_unlock(&fullstop_mutex); 1628 mutex_unlock(&fullstop_mutex);
1381 return 0; 1629 return 0;
1382 1630
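
Editor's note: the boost hunks above only wire up per-CPU kthread creation, CPU-hotplug handling, and teardown; the thread body itself, rcu_torture_boost(), is outside this excerpt. A purely hypothetical sketch of what such a boost test does -- run a SCHED_FIFO thread that can starve preempted RCU readers unless priority boosting rescues them -- might look like the following (names and structure are illustrative, not the patch's code):

/* Hypothetical illustration only -- not the rcu_torture_boost() added by this patch. */
static int example_rcu_boost_thread(void *arg)
{
	struct sched_param sp = { .sched_priority = 1 };
	unsigned long deadline;

	sched_setscheduler(current, SCHED_FIFO, &sp);	/* become an RT CPU hog */
	while (!kthread_should_stop()) {
		deadline = jiffies + test_boost_duration * HZ;
		/*
		 * Post callbacks and spin at RT priority; if they have not
		 * completed by "deadline", preempted readers were starved
		 * and a boost failure would be recorded.
		 */
		while (time_before(jiffies, deadline) && !kthread_should_stop())
			cpu_relax();
		schedule_timeout_interruptible(test_boost_interval * HZ);
	}
	return 0;
}
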
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index ccdc04c47981..dd4aea806f8e 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -67,9 +67,6 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
67 .gpnum = -300, \ 67 .gpnum = -300, \
68 .completed = -300, \ 68 .completed = -300, \
69 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \ 69 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \
70 .orphan_cbs_list = NULL, \
71 .orphan_cbs_tail = &structname.orphan_cbs_list, \
72 .orphan_qlen = 0, \
73 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \ 70 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \
74 .n_force_qs = 0, \ 71 .n_force_qs = 0, \
75 .n_force_qs_ngp = 0, \ 72 .n_force_qs_ngp = 0, \
@@ -367,8 +364,8 @@ void rcu_irq_exit(void)
367 WARN_ON_ONCE(rdtp->dynticks & 0x1); 364 WARN_ON_ONCE(rdtp->dynticks & 0x1);
368 365
369 /* If the interrupt queued a callback, get out of dyntick mode. */ 366 /* If the interrupt queued a callback, get out of dyntick mode. */
370 if (__get_cpu_var(rcu_sched_data).nxtlist || 367 if (__this_cpu_read(rcu_sched_data.nxtlist) ||
371 __get_cpu_var(rcu_bh_data).nxtlist) 368 __this_cpu_read(rcu_bh_data.nxtlist))
372 set_need_resched(); 369 set_need_resched();
373} 370}
374 371
@@ -620,9 +617,17 @@ static void __init check_cpu_stall_init(void)
620static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) 617static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
621{ 618{
622 if (rdp->gpnum != rnp->gpnum) { 619 if (rdp->gpnum != rnp->gpnum) {
623 rdp->qs_pending = 1; 620 /*
624 rdp->passed_quiesc = 0; 621 * If the current grace period is waiting for this CPU,
622 * set up to detect a quiescent state, otherwise don't
623 * go looking for one.
624 */
625 rdp->gpnum = rnp->gpnum; 625 rdp->gpnum = rnp->gpnum;
626 if (rnp->qsmask & rdp->grpmask) {
627 rdp->qs_pending = 1;
628 rdp->passed_quiesc = 0;
629 } else
630 rdp->qs_pending = 0;
626 } 631 }
627} 632}
628 633
@@ -681,6 +686,24 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat
681 686
682 /* Remember that we saw this grace-period completion. */ 687 /* Remember that we saw this grace-period completion. */
683 rdp->completed = rnp->completed; 688 rdp->completed = rnp->completed;
689
690 /*
691 * If we were in an extended quiescent state, we may have
 692	 * missed some grace periods that other CPUs handled on
693 * our behalf. Catch up with this state to avoid noting
694 * spurious new grace periods. If another grace period
695 * has started, then rnp->gpnum will have advanced, so
696 * we will detect this later on.
697 */
698 if (ULONG_CMP_LT(rdp->gpnum, rdp->completed))
699 rdp->gpnum = rdp->completed;
700
701 /*
702 * If RCU does not need a quiescent state from this CPU,
703 * then make sure that this CPU doesn't go looking for one.
704 */
705 if ((rnp->qsmask & rdp->grpmask) == 0)
706 rdp->qs_pending = 0;
684 } 707 }
685} 708}
686 709
@@ -984,53 +1007,31 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
984#ifdef CONFIG_HOTPLUG_CPU 1007#ifdef CONFIG_HOTPLUG_CPU
985 1008
986/* 1009/*
 987 * Move a dying CPU's RCU callbacks to the ->orphan_cbs_list for the 1010 * Move a dying CPU's RCU callbacks to an online CPU's callback list.
988 * specified flavor of RCU. The callbacks will be adopted by the next 1011 * Synchronization is not required because this function executes
989 * _rcu_barrier() invocation or by the CPU_DEAD notifier, whichever 1012 * in stop_machine() context.
990 * comes first. Because this is invoked from the CPU_DYING notifier,
991 * irqs are already disabled.
992 */ 1013 */
993static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) 1014static void rcu_send_cbs_to_online(struct rcu_state *rsp)
994{ 1015{
995 int i; 1016 int i;
1017 /* current DYING CPU is cleared in the cpu_online_mask */
1018 int receive_cpu = cpumask_any(cpu_online_mask);
996 struct rcu_data *rdp = this_cpu_ptr(rsp->rda); 1019 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
1020 struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu);
997 1021
998 if (rdp->nxtlist == NULL) 1022 if (rdp->nxtlist == NULL)
999 return; /* irqs disabled, so comparison is stable. */ 1023 return; /* irqs disabled, so comparison is stable. */
1000 raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ 1024
1001 *rsp->orphan_cbs_tail = rdp->nxtlist; 1025 *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist;
1002 rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL]; 1026 receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
1027 receive_rdp->qlen += rdp->qlen;
1028 receive_rdp->n_cbs_adopted += rdp->qlen;
1029 rdp->n_cbs_orphaned += rdp->qlen;
1030
1003 rdp->nxtlist = NULL; 1031 rdp->nxtlist = NULL;
1004 for (i = 0; i < RCU_NEXT_SIZE; i++) 1032 for (i = 0; i < RCU_NEXT_SIZE; i++)
1005 rdp->nxttail[i] = &rdp->nxtlist; 1033 rdp->nxttail[i] = &rdp->nxtlist;
1006 rsp->orphan_qlen += rdp->qlen;
1007 rdp->n_cbs_orphaned += rdp->qlen;
1008 rdp->qlen = 0; 1034 rdp->qlen = 0;
1009 raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
1010}
1011
1012/*
1013 * Adopt previously orphaned RCU callbacks.
1014 */
1015static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
1016{
1017 unsigned long flags;
1018 struct rcu_data *rdp;
1019
1020 raw_spin_lock_irqsave(&rsp->onofflock, flags);
1021 rdp = this_cpu_ptr(rsp->rda);
1022 if (rsp->orphan_cbs_list == NULL) {
1023 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
1024 return;
1025 }
1026 *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list;
1027 rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail;
1028 rdp->qlen += rsp->orphan_qlen;
1029 rdp->n_cbs_adopted += rsp->orphan_qlen;
1030 rsp->orphan_cbs_list = NULL;
1031 rsp->orphan_cbs_tail = &rsp->orphan_cbs_list;
1032 rsp->orphan_qlen = 0;
1033 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
1034} 1035}
1035 1036
1036/* 1037/*
@@ -1081,8 +1082,6 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
1081 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1082 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1082 if (need_report & RCU_OFL_TASKS_EXP_GP) 1083 if (need_report & RCU_OFL_TASKS_EXP_GP)
1083 rcu_report_exp_rnp(rsp, rnp); 1084 rcu_report_exp_rnp(rsp, rnp);
1084
1085 rcu_adopt_orphan_cbs(rsp);
1086} 1085}
1087 1086
1088/* 1087/*
@@ -1100,11 +1099,7 @@ static void rcu_offline_cpu(int cpu)
1100 1099
1101#else /* #ifdef CONFIG_HOTPLUG_CPU */ 1100#else /* #ifdef CONFIG_HOTPLUG_CPU */
1102 1101
1103static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) 1102static void rcu_send_cbs_to_online(struct rcu_state *rsp)
1104{
1105}
1106
1107static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
1108{ 1103{
1109} 1104}
1110 1105
@@ -1440,22 +1435,11 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1440 */ 1435 */
1441 local_irq_save(flags); 1436 local_irq_save(flags);
1442 rdp = this_cpu_ptr(rsp->rda); 1437 rdp = this_cpu_ptr(rsp->rda);
1443 rcu_process_gp_end(rsp, rdp);
1444 check_for_new_grace_period(rsp, rdp);
1445 1438
1446 /* Add the callback to our list. */ 1439 /* Add the callback to our list. */
1447 *rdp->nxttail[RCU_NEXT_TAIL] = head; 1440 *rdp->nxttail[RCU_NEXT_TAIL] = head;
1448 rdp->nxttail[RCU_NEXT_TAIL] = &head->next; 1441 rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
1449 1442
1450 /* Start a new grace period if one not already started. */
1451 if (!rcu_gp_in_progress(rsp)) {
1452 unsigned long nestflag;
1453 struct rcu_node *rnp_root = rcu_get_root(rsp);
1454
1455 raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
1456 rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */
1457 }
1458
1459 /* 1443 /*
1460 * Force the grace period if too many callbacks or too long waiting. 1444 * Force the grace period if too many callbacks or too long waiting.
1461 * Enforce hysteresis, and don't invoke force_quiescent_state() 1445 * Enforce hysteresis, and don't invoke force_quiescent_state()
@@ -1464,12 +1448,27 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1464 * is the only one waiting for a grace period to complete. 1448 * is the only one waiting for a grace period to complete.
1465 */ 1449 */
1466 if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { 1450 if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
1467 rdp->blimit = LONG_MAX; 1451
1468 if (rsp->n_force_qs == rdp->n_force_qs_snap && 1452 /* Are we ignoring a completed grace period? */
1469 *rdp->nxttail[RCU_DONE_TAIL] != head) 1453 rcu_process_gp_end(rsp, rdp);
1470 force_quiescent_state(rsp, 0); 1454 check_for_new_grace_period(rsp, rdp);
1471 rdp->n_force_qs_snap = rsp->n_force_qs; 1455
1472 rdp->qlen_last_fqs_check = rdp->qlen; 1456 /* Start a new grace period if one not already started. */
1457 if (!rcu_gp_in_progress(rsp)) {
1458 unsigned long nestflag;
1459 struct rcu_node *rnp_root = rcu_get_root(rsp);
1460
1461 raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
1462 rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock */
1463 } else {
1464 /* Give the grace period a kick. */
1465 rdp->blimit = LONG_MAX;
1466 if (rsp->n_force_qs == rdp->n_force_qs_snap &&
1467 *rdp->nxttail[RCU_DONE_TAIL] != head)
1468 force_quiescent_state(rsp, 0);
1469 rdp->n_force_qs_snap = rsp->n_force_qs;
1470 rdp->qlen_last_fqs_check = rdp->qlen;
1471 }
1473 } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) 1472 } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
1474 force_quiescent_state(rsp, 1); 1473 force_quiescent_state(rsp, 1);
1475 local_irq_restore(flags); 1474 local_irq_restore(flags);
@@ -1699,13 +1698,12 @@ static void _rcu_barrier(struct rcu_state *rsp,
1699 * decrement rcu_barrier_cpu_count -- otherwise the first CPU 1698 * decrement rcu_barrier_cpu_count -- otherwise the first CPU
1700 * might complete its grace period before all of the other CPUs 1699 * might complete its grace period before all of the other CPUs
1701 * did their increment, causing this function to return too 1700 * did their increment, causing this function to return too
1702 * early. 1701 * early. Note that on_each_cpu() disables irqs, which prevents
1702 * any CPUs from coming online or going offline until each online
1703 * CPU has queued its RCU-barrier callback.
1703 */ 1704 */
1704 atomic_set(&rcu_barrier_cpu_count, 1); 1705 atomic_set(&rcu_barrier_cpu_count, 1);
1705 preempt_disable(); /* stop CPU_DYING from filling orphan_cbs_list */
1706 rcu_adopt_orphan_cbs(rsp);
1707 on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1); 1706 on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1);
1708 preempt_enable(); /* CPU_DYING can again fill orphan_cbs_list */
1709 if (atomic_dec_and_test(&rcu_barrier_cpu_count)) 1707 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
1710 complete(&rcu_barrier_completion); 1708 complete(&rcu_barrier_completion);
1711 wait_for_completion(&rcu_barrier_completion); 1709 wait_for_completion(&rcu_barrier_completion);
@@ -1831,18 +1829,13 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1831 case CPU_DYING: 1829 case CPU_DYING:
1832 case CPU_DYING_FROZEN: 1830 case CPU_DYING_FROZEN:
1833 /* 1831 /*
1834 * preempt_disable() in _rcu_barrier() prevents stop_machine(), 1832 * The whole machine is "stopped" except this CPU, so we can
1835 * so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);" 1833 * touch any data without introducing corruption. We send the
1836 * returns, all online cpus have queued rcu_barrier_func(). 1834 * dying CPU's callbacks to an arbitrarily chosen online CPU.
1837 * The dying CPU clears its cpu_online_mask bit and
1838 * moves all of its RCU callbacks to ->orphan_cbs_list
1839 * in the context of stop_machine(), so subsequent calls
1840 * to _rcu_barrier() will adopt these callbacks and only
1841 * then queue rcu_barrier_func() on all remaining CPUs.
1842 */ 1835 */
1843 rcu_send_cbs_to_orphanage(&rcu_bh_state); 1836 rcu_send_cbs_to_online(&rcu_bh_state);
1844 rcu_send_cbs_to_orphanage(&rcu_sched_state); 1837 rcu_send_cbs_to_online(&rcu_sched_state);
1845 rcu_preempt_send_cbs_to_orphanage(); 1838 rcu_preempt_send_cbs_to_online();
1846 break; 1839 break;
1847 case CPU_DEAD: 1840 case CPU_DEAD:
1848 case CPU_DEAD_FROZEN: 1841 case CPU_DEAD_FROZEN:
@@ -1880,8 +1873,9 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
1880{ 1873{
1881 int i; 1874 int i;
1882 1875
1883 for (i = NUM_RCU_LVLS - 1; i >= 0; i--) 1876 for (i = NUM_RCU_LVLS - 1; i > 0; i--)
1884 rsp->levelspread[i] = CONFIG_RCU_FANOUT; 1877 rsp->levelspread[i] = CONFIG_RCU_FANOUT;
1878 rsp->levelspread[0] = RCU_FANOUT_LEAF;
1885} 1879}
1886#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ 1880#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
1887static void __init rcu_init_levelspread(struct rcu_state *rsp) 1881static void __init rcu_init_levelspread(struct rcu_state *rsp)
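
Editor's note: the new rcu_send_cbs_to_online() above splices the dying CPU's singly linked callback list straight onto a surviving CPU's list using the head/tail-pointer idiom (*tail = donor_head; tail = donor_tail). The user-space sketch below shows just that splice; the struct names are made up, and the kernel's per-segment nxttail[] array is collapsed into a single tail pointer.

#include <stdio.h>

struct cb {				/* stand-in for struct rcu_head */
	struct cb *next;
	int id;
};

struct cpu_cbs {			/* stand-in for struct rcu_data */
	struct cb *head;
	struct cb **tail;		/* points at the terminating NULL link */
	long qlen;
};

static void cpu_cbs_init(struct cpu_cbs *c)
{
	c->head = NULL;
	c->tail = &c->head;
	c->qlen = 0;
}

static void enqueue(struct cpu_cbs *c, struct cb *cb)
{
	cb->next = NULL;
	*c->tail = cb;			/* append at the tail ... */
	c->tail = &cb->next;		/* ... and advance the tail pointer */
	c->qlen++;
}

/* Splice all of "dying"'s callbacks onto "online", like rcu_send_cbs_to_online(). */
static void send_cbs_to_online(struct cpu_cbs *dying, struct cpu_cbs *online)
{
	if (!dying->head)
		return;
	*online->tail = dying->head;
	online->tail = dying->tail;
	online->qlen += dying->qlen;
	cpu_cbs_init(dying);		/* dying CPU ends up with an empty list */
}

int main(void)
{
	struct cpu_cbs a, b;
	struct cb cbs[5];
	int i;

	cpu_cbs_init(&a);
	cpu_cbs_init(&b);
	for (i = 0; i < 5; i++) {
		cbs[i].id = i;
		enqueue(i < 3 ? &a : &b, &cbs[i]);
	}
	send_cbs_to_online(&a, &b);
	for (struct cb *p = b.head; p; p = p->next)
		printf("cb %d\n", p->id);	/* prints 3 4 0 1 2 */
	return 0;
}

Because only pointers move, the splice is O(1) no matter how many callbacks the dying CPU had queued.
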
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 91d4170c5c13..e8f057e44e3e 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -31,46 +31,51 @@
31/* 31/*
32 * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT. 32 * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT.
33 * In theory, it should be possible to add more levels straightforwardly. 33 * In theory, it should be possible to add more levels straightforwardly.
34 * In practice, this has not been tested, so there is probably some 34 * In practice, this did work well going from three levels to four.
35 * bug somewhere. 35 * Of course, your mileage may vary.
36 */ 36 */
37#define MAX_RCU_LVLS 4 37#define MAX_RCU_LVLS 4
38#define RCU_FANOUT (CONFIG_RCU_FANOUT) 38#if CONFIG_RCU_FANOUT > 16
39#define RCU_FANOUT_SQ (RCU_FANOUT * RCU_FANOUT) 39#define RCU_FANOUT_LEAF 16
40#define RCU_FANOUT_CUBE (RCU_FANOUT_SQ * RCU_FANOUT) 40#else /* #if CONFIG_RCU_FANOUT > 16 */
41#define RCU_FANOUT_FOURTH (RCU_FANOUT_CUBE * RCU_FANOUT) 41#define RCU_FANOUT_LEAF (CONFIG_RCU_FANOUT)
42 42#endif /* #else #if CONFIG_RCU_FANOUT > 16 */
43#if NR_CPUS <= RCU_FANOUT 43#define RCU_FANOUT_1 (RCU_FANOUT_LEAF)
44#define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT)
45#define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT)
46#define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT)
47
48#if NR_CPUS <= RCU_FANOUT_1
44# define NUM_RCU_LVLS 1 49# define NUM_RCU_LVLS 1
45# define NUM_RCU_LVL_0 1 50# define NUM_RCU_LVL_0 1
46# define NUM_RCU_LVL_1 (NR_CPUS) 51# define NUM_RCU_LVL_1 (NR_CPUS)
47# define NUM_RCU_LVL_2 0 52# define NUM_RCU_LVL_2 0
48# define NUM_RCU_LVL_3 0 53# define NUM_RCU_LVL_3 0
49# define NUM_RCU_LVL_4 0 54# define NUM_RCU_LVL_4 0
50#elif NR_CPUS <= RCU_FANOUT_SQ 55#elif NR_CPUS <= RCU_FANOUT_2
51# define NUM_RCU_LVLS 2 56# define NUM_RCU_LVLS 2
52# define NUM_RCU_LVL_0 1 57# define NUM_RCU_LVL_0 1
53# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) 58# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
54# define NUM_RCU_LVL_2 (NR_CPUS) 59# define NUM_RCU_LVL_2 (NR_CPUS)
55# define NUM_RCU_LVL_3 0 60# define NUM_RCU_LVL_3 0
56# define NUM_RCU_LVL_4 0 61# define NUM_RCU_LVL_4 0
57#elif NR_CPUS <= RCU_FANOUT_CUBE 62#elif NR_CPUS <= RCU_FANOUT_3
58# define NUM_RCU_LVLS 3 63# define NUM_RCU_LVLS 3
59# define NUM_RCU_LVL_0 1 64# define NUM_RCU_LVL_0 1
60# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ) 65# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
61# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) 66# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
62# define NUM_RCU_LVL_3 NR_CPUS 67# define NUM_RCU_LVL_3 (NR_CPUS)
63# define NUM_RCU_LVL_4 0 68# define NUM_RCU_LVL_4 0
64#elif NR_CPUS <= RCU_FANOUT_FOURTH 69#elif NR_CPUS <= RCU_FANOUT_4
65# define NUM_RCU_LVLS 4 70# define NUM_RCU_LVLS 4
66# define NUM_RCU_LVL_0 1 71# define NUM_RCU_LVL_0 1
67# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_CUBE) 72# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3)
68# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ) 73# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
69# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) 74# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
70# define NUM_RCU_LVL_4 NR_CPUS 75# define NUM_RCU_LVL_4 (NR_CPUS)
71#else 76#else
72# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" 77# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
73#endif /* #if (NR_CPUS) <= RCU_FANOUT */ 78#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */
74 79
75#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4) 80#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4)
76#define NUM_RCU_NODES (RCU_SUM - NR_CPUS) 81#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
@@ -203,8 +208,8 @@ struct rcu_data {
203 long qlen_last_fqs_check; 208 long qlen_last_fqs_check;
204 /* qlen at last check for QS forcing */ 209 /* qlen at last check for QS forcing */
205 unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ 210 unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */
206 unsigned long n_cbs_orphaned; /* RCU cbs sent to orphanage. */ 211 unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */
207 unsigned long n_cbs_adopted; /* RCU cbs adopted from orphanage. */ 212 unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */
208 unsigned long n_force_qs_snap; 213 unsigned long n_force_qs_snap;
209 /* did other CPU force QS recently? */ 214 /* did other CPU force QS recently? */
210 long blimit; /* Upper limit on a processed batch */ 215 long blimit; /* Upper limit on a processed batch */
@@ -309,15 +314,7 @@ struct rcu_state {
309 /* End of fields guarded by root rcu_node's lock. */ 314 /* End of fields guarded by root rcu_node's lock. */
310 315
311 raw_spinlock_t onofflock; /* exclude on/offline and */ 316 raw_spinlock_t onofflock; /* exclude on/offline and */
312 /* starting new GP. Also */ 317 /* starting new GP. */
313 /* protects the following */
314 /* orphan_cbs fields. */
315 struct rcu_head *orphan_cbs_list; /* list of rcu_head structs */
316 /* orphaned by all CPUs in */
317 /* a given leaf rcu_node */
318 /* going offline. */
319 struct rcu_head **orphan_cbs_tail; /* And tail pointer. */
320 long orphan_qlen; /* Number of orphaned cbs. */
321 raw_spinlock_t fqslock; /* Only one task forcing */ 318 raw_spinlock_t fqslock; /* Only one task forcing */
322 /* quiescent states. */ 319 /* quiescent states. */
323 unsigned long jiffies_force_qs; /* Time at which to invoke */ 320 unsigned long jiffies_force_qs; /* Time at which to invoke */
@@ -390,7 +387,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp);
390static int rcu_preempt_pending(int cpu); 387static int rcu_preempt_pending(int cpu);
391static int rcu_preempt_needs_cpu(int cpu); 388static int rcu_preempt_needs_cpu(int cpu);
392static void __cpuinit rcu_preempt_init_percpu_data(int cpu); 389static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
393static void rcu_preempt_send_cbs_to_orphanage(void); 390static void rcu_preempt_send_cbs_to_online(void);
394static void __init __rcu_init_preempt(void); 391static void __init __rcu_init_preempt(void);
395static void rcu_needs_cpu_flush(void); 392static void rcu_needs_cpu_flush(void);
396 393
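
Editor's note: the new RCU_FANOUT_LEAF / RCU_FANOUT_n macros above just precompute how many rcu_node structures each level of the tree needs for a given NR_CPUS. A small user-space program that mirrors the same arithmetic (with NR_CPUS and the fanout as ordinary variables rather than Kconfig values) makes the shape easy to check; note that the trailing NR_CPUS entries in the header describe per-CPU rcu_data structures, which NUM_RCU_NODES subtracts back out, so only rcu_node levels are counted here.

#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	int nr_cpus = 4096;			/* stand-in for NR_CPUS */
	int fanout = 64;			/* stand-in for CONFIG_RCU_FANOUT */
	int leaf = fanout > 16 ? 16 : fanout;	/* RCU_FANOUT_LEAF cap */
	long f1 = leaf;				/* RCU_FANOUT_1 */
	long f2 = f1 * fanout;			/* RCU_FANOUT_2 */
	long f3 = f2 * fanout;			/* RCU_FANOUT_3 */
	long f4 = f3 * fanout;			/* RCU_FANOUT_4 */

	if (nr_cpus <= f1)
		printf("1 level:  1 leaf node\n");
	else if (nr_cpus <= f2)
		printf("2 levels: 1 root, %ld leaves\n",
		       DIV_ROUND_UP((long)nr_cpus, f1));
	else if (nr_cpus <= f3)
		printf("3 levels: 1 root, %ld internal, %ld leaves\n",
		       DIV_ROUND_UP((long)nr_cpus, f2),
		       DIV_ROUND_UP((long)nr_cpus, f1));
	else if (nr_cpus <= f4)
		printf("4 levels: 1, %ld, %ld, %ld\n",
		       DIV_ROUND_UP((long)nr_cpus, f3),
		       DIV_ROUND_UP((long)nr_cpus, f2),
		       DIV_ROUND_UP((long)nr_cpus, f1));
	else
		printf("CONFIG_RCU_FANOUT insufficient for NR_CPUS\n");
	return 0;
}

With NR_CPUS=4096 and CONFIG_RCU_FANOUT=64 this prints "3 levels: 1 root, 4 internal, 256 leaves", matching the capped 16-wide leaf level the patch introduces.
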
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 71a4147473f9..a3638710dc67 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -25,6 +25,7 @@
25 */ 25 */
26 26
27#include <linux/delay.h> 27#include <linux/delay.h>
28#include <linux/stop_machine.h>
28 29
29/* 30/*
30 * Check the RCU kernel configuration parameters and print informative 31 * Check the RCU kernel configuration parameters and print informative
@@ -773,11 +774,11 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
773} 774}
774 775
775/* 776/*
 776 * Move preemptable RCU's callbacks to ->orphan_cbs_list. 777 * Move preemptable RCU's callbacks from the dying CPU to another online CPU.
777 */ 778 */
778static void rcu_preempt_send_cbs_to_orphanage(void) 779static void rcu_preempt_send_cbs_to_online(void)
779{ 780{
780 rcu_send_cbs_to_orphanage(&rcu_preempt_state); 781 rcu_send_cbs_to_online(&rcu_preempt_state);
781} 782}
782 783
783/* 784/*
@@ -1001,7 +1002,7 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
1001/* 1002/*
1002 * Because there is no preemptable RCU, there are no callbacks to move. 1003 * Because there is no preemptable RCU, there are no callbacks to move.
1003 */ 1004 */
1004static void rcu_preempt_send_cbs_to_orphanage(void) 1005static void rcu_preempt_send_cbs_to_online(void)
1005{ 1006{
1006} 1007}
1007 1008
@@ -1014,6 +1015,132 @@ static void __init __rcu_init_preempt(void)
1014 1015
1015#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ 1016#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
1016 1017
1018#ifndef CONFIG_SMP
1019
1020void synchronize_sched_expedited(void)
1021{
1022 cond_resched();
1023}
1024EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
1025
1026#else /* #ifndef CONFIG_SMP */
1027
1028static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0);
1029static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0);
1030
1031static int synchronize_sched_expedited_cpu_stop(void *data)
1032{
1033 /*
1034 * There must be a full memory barrier on each affected CPU
1035 * between the time that try_stop_cpus() is called and the
1036 * time that it returns.
1037 *
1038 * In the current initial implementation of cpu_stop, the
1039 * above condition is already met when the control reaches
1040 * this point and the following smp_mb() is not strictly
1041 * necessary. Do smp_mb() anyway for documentation and
1042 * robustness against future implementation changes.
1043 */
1044 smp_mb(); /* See above comment block. */
1045 return 0;
1046}
1047
1048/*
1049 * Wait for an rcu-sched grace period to elapse, but use "big hammer"
1050 * approach to force grace period to end quickly. This consumes
1051 * significant time on all CPUs, and is thus not recommended for
1052 * any sort of common-case code.
1053 *
1054 * Note that it is illegal to call this function while holding any
1055 * lock that is acquired by a CPU-hotplug notifier. Failing to
1056 * observe this restriction will result in deadlock.
1057 *
1058 * This implementation can be thought of as an application of ticket
1059 * locking to RCU, with sync_sched_expedited_started and
1060 * sync_sched_expedited_done taking on the roles of the halves
1061 * of the ticket-lock word. Each task atomically increments
1062 * sync_sched_expedited_started upon entry, snapshotting the old value,
1063 * then attempts to stop all the CPUs. If this succeeds, then each
1064 * CPU will have executed a context switch, resulting in an RCU-sched
1065 * grace period. We are then done, so we use atomic_cmpxchg() to
1066 * update sync_sched_expedited_done to match our snapshot -- but
1067 * only if someone else has not already advanced past our snapshot.
1068 *
1069 * On the other hand, if try_stop_cpus() fails, we check the value
1070 * of sync_sched_expedited_done. If it has advanced past our
1071 * initial snapshot, then someone else must have forced a grace period
1072 * some time after we took our snapshot. In this case, our work is
1073 * done for us, and we can simply return. Otherwise, we try again,
1074 * but keep our initial snapshot for purposes of checking for someone
1075 * doing our work for us.
1076 *
1077 * If we fail too many times in a row, we fall back to synchronize_sched().
1078 */
1079void synchronize_sched_expedited(void)
1080{
1081 int firstsnap, s, snap, trycount = 0;
1082
1083 /* Note that atomic_inc_return() implies full memory barrier. */
1084 firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started);
1085 get_online_cpus();
1086
1087 /*
1088 * Each pass through the following loop attempts to force a
1089 * context switch on each CPU.
1090 */
1091 while (try_stop_cpus(cpu_online_mask,
1092 synchronize_sched_expedited_cpu_stop,
1093 NULL) == -EAGAIN) {
1094 put_online_cpus();
1095
1096 /* No joy, try again later. Or just synchronize_sched(). */
1097 if (trycount++ < 10)
1098 udelay(trycount * num_online_cpus());
1099 else {
1100 synchronize_sched();
1101 return;
1102 }
1103
1104 /* Check to see if someone else did our work for us. */
1105 s = atomic_read(&sync_sched_expedited_done);
1106 if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) {
1107 smp_mb(); /* ensure test happens before caller kfree */
1108 return;
1109 }
1110
1111 /*
1112 * Refetching sync_sched_expedited_started allows later
1113 * callers to piggyback on our grace period. We subtract
1114 * 1 to get the same token that the last incrementer got.
1115 * We retry after they started, so our grace period works
1116 * for them, and they started after our first try, so their
1117 * grace period works for us.
1118 */
1119 get_online_cpus();
1120 snap = atomic_read(&sync_sched_expedited_started) - 1;
1121 smp_mb(); /* ensure read is before try_stop_cpus(). */
1122 }
1123
1124 /*
1125 * Everyone up to our most recent fetch is covered by our grace
1126 * period. Update the counter, but only if our work is still
1127 * relevant -- which it won't be if someone who started later
1128 * than we did beat us to the punch.
1129 */
1130 do {
1131 s = atomic_read(&sync_sched_expedited_done);
1132 if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) {
1133 smp_mb(); /* ensure test happens before caller kfree */
1134 break;
1135 }
1136 } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s);
1137
1138 put_online_cpus();
1139}
1140EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
1141
1142#endif /* #else #ifndef CONFIG_SMP */
1143
1017#if !defined(CONFIG_RCU_FAST_NO_HZ) 1144#if !defined(CONFIG_RCU_FAST_NO_HZ)
1018 1145
1019/* 1146/*
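
Editor's note: the block comment added above describes synchronize_sched_expedited() as a ticket-lock variant built from the started/done counter pair. The user-space model below exercises only that counter protocol, with threads standing in for concurrent callers; it drops try_stop_cpus(), the retry/udelay backoff, and the synchronize_sched() fallback, and uses GCC __sync builtins instead of kernel atomics.

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static int started;	/* like sync_sched_expedited_started */
static int done;	/* like sync_sched_expedited_done */

static void do_expensive_grace_period(void)
{
	usleep(1000);	/* stands in for try_stop_cpus() forcing a grace period */
}

static void expedited(void)
{
	int snap, s;

	/* Take a ticket: everything started before this point is covered by us. */
	snap = __sync_add_and_fetch(&started, 1);

	/* Has somebody already completed a pass that covers our ticket? */
	s = __sync_fetch_and_add(&done, 0);
	if (s - snap >= 0)
		return;			/* their grace period serves us too */

	do_expensive_grace_period();

	/* Publish completion, but never move "done" backwards. */
	do {
		s = __sync_fetch_and_add(&done, 0);
		if (s - snap >= 0)
			break;		/* someone newer already advanced it */
	} while (!__sync_bool_compare_and_swap(&done, s, snap));
}

static void *worker(void *arg)
{
	for (int i = 0; i < 100; i++)
		expedited();
	return NULL;
}

int main(void)
{
	pthread_t t[4];

	for (int i = 0; i < 4; i++)
		pthread_create(&t[i], NULL, worker, NULL);
	for (int i = 0; i < 4; i++)
		pthread_join(t[i], NULL);
	printf("started=%d done=%d\n", started, done);
	return 0;
}

The signed difference "s - snap >= 0" plays the role of the patch's UINT_CMP_GE(), keeping the comparison correct even when the counters wrap.
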
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index d15430b9d122..c8e97853b970 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -166,13 +166,13 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
166 166
167 gpnum = rsp->gpnum; 167 gpnum = rsp->gpnum;
168 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " 168 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x "
169 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n", 169 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n",
170 rsp->completed, gpnum, rsp->signaled, 170 rsp->completed, gpnum, rsp->signaled,
171 (long)(rsp->jiffies_force_qs - jiffies), 171 (long)(rsp->jiffies_force_qs - jiffies),
172 (int)(jiffies & 0xffff), 172 (int)(jiffies & 0xffff),
173 rsp->n_force_qs, rsp->n_force_qs_ngp, 173 rsp->n_force_qs, rsp->n_force_qs_ngp,
174 rsp->n_force_qs - rsp->n_force_qs_ngp, 174 rsp->n_force_qs - rsp->n_force_qs_ngp,
175 rsp->n_force_qs_lh, rsp->orphan_qlen); 175 rsp->n_force_qs_lh);
176 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { 176 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) {
177 if (rnp->level != level) { 177 if (rnp->level != level) {
178 seq_puts(m, "\n"); 178 seq_puts(m, "\n");
@@ -300,7 +300,7 @@ static const struct file_operations rcu_pending_fops = {
300 300
301static struct dentry *rcudir; 301static struct dentry *rcudir;
302 302
303static int __init rcuclassic_trace_init(void) 303static int __init rcutree_trace_init(void)
304{ 304{
305 struct dentry *retval; 305 struct dentry *retval;
306 306
@@ -337,14 +337,14 @@ free_out:
337 return 1; 337 return 1;
338} 338}
339 339
340static void __exit rcuclassic_trace_cleanup(void) 340static void __exit rcutree_trace_cleanup(void)
341{ 341{
342 debugfs_remove_recursive(rcudir); 342 debugfs_remove_recursive(rcudir);
343} 343}
344 344
345 345
346module_init(rcuclassic_trace_init); 346module_init(rcutree_trace_init);
347module_exit(rcuclassic_trace_cleanup); 347module_exit(rcutree_trace_cleanup);
348 348
349MODULE_AUTHOR("Paul E. McKenney"); 349MODULE_AUTHOR("Paul E. McKenney");
350MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation"); 350MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation");
diff --git a/kernel/resource.c b/kernel/resource.c
index 9fad33efd0db..798e2fae2a06 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -40,23 +40,6 @@ EXPORT_SYMBOL(iomem_resource);
40 40
41static DEFINE_RWLOCK(resource_lock); 41static DEFINE_RWLOCK(resource_lock);
42 42
43/*
44 * By default, we allocate free space bottom-up. The architecture can request
45 * top-down by clearing this flag. The user can override the architecture's
46 * choice with the "resource_alloc_from_bottom" kernel boot option, but that
47 * should only be a debugging tool.
48 */
49int resource_alloc_from_bottom = 1;
50
51static __init int setup_alloc_from_bottom(char *s)
52{
53 printk(KERN_INFO
54 "resource: allocating from bottom-up; please report a bug\n");
55 resource_alloc_from_bottom = 1;
56 return 0;
57}
58early_param("resource_alloc_from_bottom", setup_alloc_from_bottom);
59
60static void *r_next(struct seq_file *m, void *v, loff_t *pos) 43static void *r_next(struct seq_file *m, void *v, loff_t *pos)
61{ 44{
62 struct resource *p = v; 45 struct resource *p = v;
@@ -374,6 +357,10 @@ int __weak page_is_ram(unsigned long pfn)
374 return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1; 357 return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1;
375} 358}
376 359
360void __weak arch_remove_reservations(struct resource *avail)
361{
362}
363
377static resource_size_t simple_align_resource(void *data, 364static resource_size_t simple_align_resource(void *data,
378 const struct resource *avail, 365 const struct resource *avail,
379 resource_size_t size, 366 resource_size_t size,
@@ -397,74 +384,7 @@ static bool resource_contains(struct resource *res1, struct resource *res2)
397} 384}
398 385
399/* 386/*
400 * Find the resource before "child" in the sibling list of "root" children.
401 */
402static struct resource *find_sibling_prev(struct resource *root, struct resource *child)
403{
404 struct resource *this;
405
406 for (this = root->child; this; this = this->sibling)
407 if (this->sibling == child)
408 return this;
409
410 return NULL;
411}
412
413/*
414 * Find empty slot in the resource tree given range and alignment. 387 * Find empty slot in the resource tree given range and alignment.
415 * This version allocates from the end of the root resource first.
416 */
417static int find_resource_from_top(struct resource *root, struct resource *new,
418 resource_size_t size, resource_size_t min,
419 resource_size_t max, resource_size_t align,
420 resource_size_t (*alignf)(void *,
421 const struct resource *,
422 resource_size_t,
423 resource_size_t),
424 void *alignf_data)
425{
426 struct resource *this;
427 struct resource tmp, avail, alloc;
428
429 tmp.start = root->end;
430 tmp.end = root->end;
431
432 this = find_sibling_prev(root, NULL);
433 for (;;) {
434 if (this) {
435 if (this->end < root->end)
436 tmp.start = this->end + 1;
437 } else
438 tmp.start = root->start;
439
440 resource_clip(&tmp, min, max);
441
442 /* Check for overflow after ALIGN() */
443 avail = *new;
444 avail.start = ALIGN(tmp.start, align);
445 avail.end = tmp.end;
446 if (avail.start >= tmp.start) {
447 alloc.start = alignf(alignf_data, &avail, size, align);
448 alloc.end = alloc.start + size - 1;
449 if (resource_contains(&avail, &alloc)) {
450 new->start = alloc.start;
451 new->end = alloc.end;
452 return 0;
453 }
454 }
455
456 if (!this || this->start == root->start)
457 break;
458
459 tmp.end = this->start - 1;
460 this = find_sibling_prev(root, this);
461 }
462 return -EBUSY;
463}
464
465/*
466 * Find empty slot in the resource tree given range and alignment.
467 * This version allocates from the beginning of the root resource first.
468 */ 388 */
469static int find_resource(struct resource *root, struct resource *new, 389static int find_resource(struct resource *root, struct resource *new,
470 resource_size_t size, resource_size_t min, 390 resource_size_t size, resource_size_t min,
@@ -478,23 +398,24 @@ static int find_resource(struct resource *root, struct resource *new,
478 struct resource *this = root->child; 398 struct resource *this = root->child;
479 struct resource tmp = *new, avail, alloc; 399 struct resource tmp = *new, avail, alloc;
480 400
401 tmp.flags = new->flags;
481 tmp.start = root->start; 402 tmp.start = root->start;
482 /* 403 /*
483 * Skip past an allocated resource that starts at 0, since the 404 * Skip past an allocated resource that starts at 0, since the assignment
484 * assignment of this->start - 1 to tmp->end below would cause an 405 * of this->start - 1 to tmp->end below would cause an underflow.
485 * underflow.
486 */ 406 */
487 if (this && this->start == 0) { 407 if (this && this->start == 0) {
488 tmp.start = this->end + 1; 408 tmp.start = this->end + 1;
489 this = this->sibling; 409 this = this->sibling;
490 } 410 }
491 for (;;) { 411 for(;;) {
492 if (this) 412 if (this)
493 tmp.end = this->start - 1; 413 tmp.end = this->start - 1;
494 else 414 else
495 tmp.end = root->end; 415 tmp.end = root->end;
496 416
497 resource_clip(&tmp, min, max); 417 resource_clip(&tmp, min, max);
418 arch_remove_reservations(&tmp);
498 419
499 /* Check for overflow after ALIGN() */ 420 /* Check for overflow after ALIGN() */
500 avail = *new; 421 avail = *new;
@@ -509,10 +430,8 @@ static int find_resource(struct resource *root, struct resource *new,
509 return 0; 430 return 0;
510 } 431 }
511 } 432 }
512
513 if (!this) 433 if (!this)
514 break; 434 break;
515
516 tmp.start = this->end + 1; 435 tmp.start = this->end + 1;
517 this = this->sibling; 436 this = this->sibling;
518 } 437 }
@@ -545,10 +464,7 @@ int allocate_resource(struct resource *root, struct resource *new,
545 alignf = simple_align_resource; 464 alignf = simple_align_resource;
546 465
547 write_lock(&resource_lock); 466 write_lock(&resource_lock);
548 if (resource_alloc_from_bottom) 467 err = find_resource(root, new, size, min, max, align, alignf, alignf_data);
549 err = find_resource(root, new, size, min, max, align, alignf, alignf_data);
550 else
551 err = find_resource_from_top(root, new, size, min, max, align, alignf, alignf_data);
552 if (err >= 0 && __request_resource(root, new)) 468 if (err >= 0 && __request_resource(root, new))
553 err = -EBUSY; 469 err = -EBUSY;
554 write_unlock(&resource_lock); 470 write_unlock(&resource_lock);
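
Editor's note: with find_resource_from_top() and the resource_alloc_from_bottom switch gone, allocation is again one bottom-up walk: take the gap after the previous child, stop before the next one, align, and use the first hole that fits. The sketch below reproduces that loop over a sorted sibling list with made-up types; it omits the min/max clipping, the alignf callback, the new arch_remove_reservations() hook, and the special case for a child starting at address 0.

#include <stdio.h>

struct res {
	unsigned long start, end;	/* inclusive range, like struct resource */
	struct res *sibling;		/* next child, sorted and non-overlapping */
	struct res *child;		/* first child */
};

#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((a) - 1))

/* First-fit bottom-up hole search, in the spirit of find_resource(). */
static int find_hole(struct res *root, unsigned long size, unsigned long align,
		     unsigned long *out_start)
{
	struct res *this = root->child;
	unsigned long start = root->start, end;

	for (;;) {
		end = this ? this->start - 1 : root->end;	/* gap ends before next child */
		start = ALIGN_UP(start, align);
		if (start <= end && end - start + 1 >= size) {
			*out_start = start;
			return 0;
		}
		if (!this)
			return -1;				/* no gap fits: -EBUSY */
		start = this->end + 1;				/* try after this child */
		this = this->sibling;
	}
}

int main(void)
{
	struct res c2 = { 0x5000, 0x5fff, NULL, NULL };
	struct res c1 = { 0x1000, 0x2fff, &c2, NULL };
	struct res root = { 0x0000, 0xffff, NULL, &c1 };
	unsigned long start;

	if (!find_hole(&root, 0x1000, 0x1000, &start))
		printf("hole at 0x%lx\n", start);	/* expect 0x0 */
	if (!find_hole(&root, 0x3000, 0x1000, &start))
		printf("hole at 0x%lx\n", start);	/* expect 0x6000 */
	return 0;
}
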
diff --git a/kernel/sched.c b/kernel/sched.c
index aa14a56f9d03..18d38e4ec7ba 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -75,9 +75,11 @@
75 75
76#include <asm/tlb.h> 76#include <asm/tlb.h>
77#include <asm/irq_regs.h> 77#include <asm/irq_regs.h>
78#include <asm/mutex.h>
78 79
79#include "sched_cpupri.h" 80#include "sched_cpupri.h"
80#include "workqueue_sched.h" 81#include "workqueue_sched.h"
82#include "sched_autogroup.h"
81 83
82#define CREATE_TRACE_POINTS 84#define CREATE_TRACE_POINTS
83#include <trace/events/sched.h> 85#include <trace/events/sched.h>
@@ -253,6 +255,8 @@ struct task_group {
253 /* runqueue "owned" by this group on each cpu */ 255 /* runqueue "owned" by this group on each cpu */
254 struct cfs_rq **cfs_rq; 256 struct cfs_rq **cfs_rq;
255 unsigned long shares; 257 unsigned long shares;
258
259 atomic_t load_weight;
256#endif 260#endif
257 261
258#ifdef CONFIG_RT_GROUP_SCHED 262#ifdef CONFIG_RT_GROUP_SCHED
@@ -268,25 +272,18 @@ struct task_group {
268 struct task_group *parent; 272 struct task_group *parent;
269 struct list_head siblings; 273 struct list_head siblings;
270 struct list_head children; 274 struct list_head children;
271};
272 275
273#define root_task_group init_task_group 276#ifdef CONFIG_SCHED_AUTOGROUP
277 struct autogroup *autogroup;
278#endif
279};
274 280
275/* task_group_lock serializes add/remove of task groups and also changes to 281/* task_group_lock serializes the addition/removal of task groups */
276 * a task group's cpu shares.
277 */
278static DEFINE_SPINLOCK(task_group_lock); 282static DEFINE_SPINLOCK(task_group_lock);
279 283
280#ifdef CONFIG_FAIR_GROUP_SCHED 284#ifdef CONFIG_FAIR_GROUP_SCHED
281 285
282#ifdef CONFIG_SMP 286# define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
283static int root_task_group_empty(void)
284{
285 return list_empty(&root_task_group.children);
286}
287#endif
288
289# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
290 287
291/* 288/*
292 * A weight of 0 or 1 can cause arithmetics problems. 289 * A weight of 0 or 1 can cause arithmetics problems.
@@ -299,13 +296,13 @@ static int root_task_group_empty(void)
299#define MIN_SHARES 2 296#define MIN_SHARES 2
300#define MAX_SHARES (1UL << 18) 297#define MAX_SHARES (1UL << 18)
301 298
302static int init_task_group_load = INIT_TASK_GROUP_LOAD; 299static int root_task_group_load = ROOT_TASK_GROUP_LOAD;
303#endif 300#endif
304 301
305/* Default task group. 302/* Default task group.
306 * Every task in the system belongs to this group at bootup. 303 * Every task in the system belongs to this group at bootup.
307 */ 304 */
308struct task_group init_task_group; 305struct task_group root_task_group;
309 306
310#endif /* CONFIG_CGROUP_SCHED */ 307#endif /* CONFIG_CGROUP_SCHED */
311 308
@@ -342,6 +339,7 @@ struct cfs_rq {
342 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This 339 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
343 * list is used during load balance. 340 * list is used during load balance.
344 */ 341 */
342 int on_list;
345 struct list_head leaf_cfs_rq_list; 343 struct list_head leaf_cfs_rq_list;
346 struct task_group *tg; /* group that "owns" this runqueue */ 344 struct task_group *tg; /* group that "owns" this runqueue */
347 345
@@ -360,14 +358,17 @@ struct cfs_rq {
360 unsigned long h_load; 358 unsigned long h_load;
361 359
362 /* 360 /*
363 * this cpu's part of tg->shares 361 * Maintaining per-cpu shares distribution for group scheduling
362 *
363 * load_stamp is the last time we updated the load average
364 * load_last is the last time we updated the load average and saw load
365 * load_unacc_exec_time is currently unaccounted execution time
364 */ 366 */
365 unsigned long shares; 367 u64 load_avg;
368 u64 load_period;
369 u64 load_stamp, load_last, load_unacc_exec_time;
366 370
367 /* 371 unsigned long load_contribution;
368 * load.weight at the time we set shares
369 */
370 unsigned long rq_weight;
371#endif 372#endif
372#endif 373#endif
373}; 374};
@@ -552,26 +553,13 @@ struct rq {
552 /* try_to_wake_up() stats */ 553 /* try_to_wake_up() stats */
553 unsigned int ttwu_count; 554 unsigned int ttwu_count;
554 unsigned int ttwu_local; 555 unsigned int ttwu_local;
555
556 /* BKL stats */
557 unsigned int bkl_count;
558#endif 556#endif
559}; 557};
560 558
561static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 559static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
562 560
563static inline
564void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
565{
566 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
567 561
568 /* 562static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
569 * A queue event has occurred, and we're going to schedule. In
570 * this case, we can save a useless back to back clock update.
571 */
572 if (test_tsk_need_resched(p))
573 rq->skip_clock_update = 1;
574}
575 563
576static inline int cpu_of(struct rq *rq) 564static inline int cpu_of(struct rq *rq)
577{ 565{
@@ -615,11 +603,17 @@ static inline int cpu_of(struct rq *rq)
615 */ 603 */
616static inline struct task_group *task_group(struct task_struct *p) 604static inline struct task_group *task_group(struct task_struct *p)
617{ 605{
606 struct task_group *tg;
618 struct cgroup_subsys_state *css; 607 struct cgroup_subsys_state *css;
619 608
609 if (p->flags & PF_EXITING)
610 return &root_task_group;
611
620 css = task_subsys_state_check(p, cpu_cgroup_subsys_id, 612 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
621 lockdep_is_held(&task_rq(p)->lock)); 613 lockdep_is_held(&task_rq(p)->lock));
622 return container_of(css, struct task_group, css); 614 tg = container_of(css, struct task_group, css);
615
616 return autogroup_task_group(p, tg);
623} 617}
624 618
625/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ 619/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
@@ -646,22 +640,18 @@ static inline struct task_group *task_group(struct task_struct *p)
646 640
647#endif /* CONFIG_CGROUP_SCHED */ 641#endif /* CONFIG_CGROUP_SCHED */
648 642
649static u64 irq_time_cpu(int cpu); 643static void update_rq_clock_task(struct rq *rq, s64 delta);
650static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time);
651 644
652inline void update_rq_clock(struct rq *rq) 645static void update_rq_clock(struct rq *rq)
653{ 646{
654 if (!rq->skip_clock_update) { 647 s64 delta;
655 int cpu = cpu_of(rq);
656 u64 irq_time;
657 648
658 rq->clock = sched_clock_cpu(cpu); 649 if (rq->skip_clock_update)
659 irq_time = irq_time_cpu(cpu); 650 return;
660 if (rq->clock - irq_time > rq->clock_task)
661 rq->clock_task = rq->clock - irq_time;
662 651
663 sched_irq_time_avg_update(rq, irq_time); 652 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
664 } 653 rq->clock += delta;
654 update_rq_clock_task(rq, delta);
665} 655}
666 656
667/* 657/*
@@ -751,7 +741,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
751 buf[cnt] = 0; 741 buf[cnt] = 0;
752 cmp = strstrip(buf); 742 cmp = strstrip(buf);
753 743
754 if (strncmp(buf, "NO_", 3) == 0) { 744 if (strncmp(cmp, "NO_", 3) == 0) {
755 neg = 1; 745 neg = 1;
756 cmp += 3; 746 cmp += 3;
757 } 747 }
@@ -807,20 +797,6 @@ late_initcall(sched_init_debug);
807const_debug unsigned int sysctl_sched_nr_migrate = 32; 797const_debug unsigned int sysctl_sched_nr_migrate = 32;
808 798
809/* 799/*
810 * ratelimit for updating the group shares.
811 * default: 0.25ms
812 */
813unsigned int sysctl_sched_shares_ratelimit = 250000;
814unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;
815
816/*
817 * Inject some fuzzyness into changing the per-cpu group shares
818 * this avoids remote rq-locks at the expense of fairness.
819 * default: 4
820 */
821unsigned int sysctl_sched_shares_thresh = 4;
822
823/*
824 * period over which we average the RT time consumption, measured 800 * period over which we average the RT time consumption, measured
825 * in ms. 801 * in ms.
826 * 802 *
@@ -1369,6 +1345,12 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1369 lw->inv_weight = 0; 1345 lw->inv_weight = 0;
1370} 1346}
1371 1347
1348static inline void update_load_set(struct load_weight *lw, unsigned long w)
1349{
1350 lw->weight = w;
1351 lw->inv_weight = 0;
1352}
1353
1372/* 1354/*
1373 * To aid in avoiding the subversion of "niceness" due to uneven distribution 1355 * To aid in avoiding the subversion of "niceness" due to uneven distribution
1374 * of tasks with abnormal "nice" values across CPUs the contribution that 1356 * of tasks with abnormal "nice" values across CPUs the contribution that
@@ -1557,101 +1539,6 @@ static unsigned long cpu_avg_load_per_task(int cpu)
1557 1539
1558#ifdef CONFIG_FAIR_GROUP_SCHED 1540#ifdef CONFIG_FAIR_GROUP_SCHED
1559 1541
1560static __read_mostly unsigned long __percpu *update_shares_data;
1561
1562static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1563
1564/*
1565 * Calculate and set the cpu's group shares.
1566 */
1567static void update_group_shares_cpu(struct task_group *tg, int cpu,
1568 unsigned long sd_shares,
1569 unsigned long sd_rq_weight,
1570 unsigned long *usd_rq_weight)
1571{
1572 unsigned long shares, rq_weight;
1573 int boost = 0;
1574
1575 rq_weight = usd_rq_weight[cpu];
1576 if (!rq_weight) {
1577 boost = 1;
1578 rq_weight = NICE_0_LOAD;
1579 }
1580
1581 /*
1582 * \Sum_j shares_j * rq_weight_i
1583 * shares_i = -----------------------------
1584 * \Sum_j rq_weight_j
1585 */
1586 shares = (sd_shares * rq_weight) / sd_rq_weight;
1587 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
1588
1589 if (abs(shares - tg->se[cpu]->load.weight) >
1590 sysctl_sched_shares_thresh) {
1591 struct rq *rq = cpu_rq(cpu);
1592 unsigned long flags;
1593
1594 raw_spin_lock_irqsave(&rq->lock, flags);
1595 tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
1596 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1597 __set_se_shares(tg->se[cpu], shares);
1598 raw_spin_unlock_irqrestore(&rq->lock, flags);
1599 }
1600}
1601
1602/*
1603 * Re-compute the task group their per cpu shares over the given domain.
1604 * This needs to be done in a bottom-up fashion because the rq weight of a
1605 * parent group depends on the shares of its child groups.
1606 */
1607static int tg_shares_up(struct task_group *tg, void *data)
1608{
1609 unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
1610 unsigned long *usd_rq_weight;
1611 struct sched_domain *sd = data;
1612 unsigned long flags;
1613 int i;
1614
1615 if (!tg->se[0])
1616 return 0;
1617
1618 local_irq_save(flags);
1619 usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());
1620
1621 for_each_cpu(i, sched_domain_span(sd)) {
1622 weight = tg->cfs_rq[i]->load.weight;
1623 usd_rq_weight[i] = weight;
1624
1625 rq_weight += weight;
1626 /*
1627 * If there are currently no tasks on the cpu pretend there
1628 * is one of average load so that when a new task gets to
1629 * run here it will not get delayed by group starvation.
1630 */
1631 if (!weight)
1632 weight = NICE_0_LOAD;
1633
1634 sum_weight += weight;
1635 shares += tg->cfs_rq[i]->shares;
1636 }
1637
1638 if (!rq_weight)
1639 rq_weight = sum_weight;
1640
1641 if ((!shares && rq_weight) || shares > tg->shares)
1642 shares = tg->shares;
1643
1644 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
1645 shares = tg->shares;
1646
1647 for_each_cpu(i, sched_domain_span(sd))
1648 update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);
1649
1650 local_irq_restore(flags);
1651
1652 return 0;
1653}
1654
1655/* 1542/*
1656 * Compute the cpu's hierarchical load factor for each task group. 1543 * Compute the cpu's hierarchical load factor for each task group.
1657 * This needs to be done in a top-down fashion because the load of a child 1544 * This needs to be done in a top-down fashion because the load of a child
@@ -1666,7 +1553,7 @@ static int tg_load_down(struct task_group *tg, void *data)
1666 load = cpu_rq(cpu)->load.weight; 1553 load = cpu_rq(cpu)->load.weight;
1667 } else { 1554 } else {
1668 load = tg->parent->cfs_rq[cpu]->h_load; 1555 load = tg->parent->cfs_rq[cpu]->h_load;
1669 load *= tg->cfs_rq[cpu]->shares; 1556 load *= tg->se[cpu]->load.weight;
1670 load /= tg->parent->cfs_rq[cpu]->load.weight + 1; 1557 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
1671 } 1558 }
1672 1559
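
Editor's note, a rough worked example for the tg_load_down() change above, with invented numbers: if the parent group's h_load on a CPU is 2048, this group's sched_entity weight there is 512, and the parent cfs_rq's total load.weight is 1024, then h_load = 2048 * 512 / (1024 + 1) ≈ 1023, i.e. roughly half of the parent's hierarchical load is now attributed through the entity's own weight rather than through the removed cfs_rq->shares value.
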
@@ -1675,34 +1562,11 @@ static int tg_load_down(struct task_group *tg, void *data)
1675 return 0; 1562 return 0;
1676} 1563}
1677 1564
1678static void update_shares(struct sched_domain *sd)
1679{
1680 s64 elapsed;
1681 u64 now;
1682
1683 if (root_task_group_empty())
1684 return;
1685
1686 now = local_clock();
1687 elapsed = now - sd->last_update;
1688
1689 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1690 sd->last_update = now;
1691 walk_tg_tree(tg_nop, tg_shares_up, sd);
1692 }
1693}
1694
1695static void update_h_load(long cpu) 1565static void update_h_load(long cpu)
1696{ 1566{
1697 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); 1567 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1698} 1568}
1699 1569
1700#else
1701
1702static inline void update_shares(struct sched_domain *sd)
1703{
1704}
1705
1706#endif 1570#endif
1707 1571
1708#ifdef CONFIG_PREEMPT 1572#ifdef CONFIG_PREEMPT
@@ -1824,15 +1688,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1824 1688
1825#endif 1689#endif
1826 1690
1827#ifdef CONFIG_FAIR_GROUP_SCHED
1828static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1829{
1830#ifdef CONFIG_SMP
1831 cfs_rq->shares = shares;
1832#endif
1833}
1834#endif
1835
1836static void calc_load_account_idle(struct rq *this_rq); 1691static void calc_load_account_idle(struct rq *this_rq);
1837static void update_sysctl(void); 1692static void update_sysctl(void);
1838static int get_update_sysctl_factor(void); 1693static int get_update_sysctl_factor(void);
@@ -1934,10 +1789,9 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1934 * They are read and saved off onto struct rq in update_rq_clock(). 1789 * They are read and saved off onto struct rq in update_rq_clock().
1935 * This may result in another CPU reading this CPU's irq time and can 1790 * This may result in another CPU reading this CPU's irq time and can
1936 * race with irq/account_system_vtime on this CPU. We would either get old 1791 * race with irq/account_system_vtime on this CPU. We would either get old
1937 * or new value (or semi updated value on 32 bit) with a side effect of 1792 * or new value with a side effect of accounting a slice of irq time to wrong
1938 * accounting a slice of irq time to wrong task when irq is in progress 1793 * task when irq is in progress while we read rq->clock. That is a worthy
1939 * while we read rq->clock. That is a worthy compromise in place of having 1794 * compromise in place of having locks on each irq in account_system_time.
1940 * locks on each irq in account_system_time.
1941 */ 1795 */
1942static DEFINE_PER_CPU(u64, cpu_hardirq_time); 1796static DEFINE_PER_CPU(u64, cpu_hardirq_time);
1943static DEFINE_PER_CPU(u64, cpu_softirq_time); 1797static DEFINE_PER_CPU(u64, cpu_softirq_time);
@@ -1955,19 +1809,58 @@ void disable_sched_clock_irqtime(void)
1955 sched_clock_irqtime = 0; 1809 sched_clock_irqtime = 0;
1956} 1810}
1957 1811
1958static u64 irq_time_cpu(int cpu) 1812#ifndef CONFIG_64BIT
1813static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
1814
1815static inline void irq_time_write_begin(void)
1959{ 1816{
1960 if (!sched_clock_irqtime) 1817 __this_cpu_inc(irq_time_seq.sequence);
1961 return 0; 1818 smp_wmb();
1819}
1820
1821static inline void irq_time_write_end(void)
1822{
1823 smp_wmb();
1824 __this_cpu_inc(irq_time_seq.sequence);
1825}
1826
1827static inline u64 irq_time_read(int cpu)
1828{
1829 u64 irq_time;
1830 unsigned seq;
1831
1832 do {
1833 seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
1834 irq_time = per_cpu(cpu_softirq_time, cpu) +
1835 per_cpu(cpu_hardirq_time, cpu);
1836 } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
1962 1837
1838 return irq_time;
1839}
1840#else /* CONFIG_64BIT */
1841static inline void irq_time_write_begin(void)
1842{
1843}
1844
1845static inline void irq_time_write_end(void)
1846{
1847}
1848
1849static inline u64 irq_time_read(int cpu)
1850{
1963 return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); 1851 return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
1964} 1852}
1853#endif /* CONFIG_64BIT */
1965 1854
1855/*
1856 * Called before incrementing preempt_count on {soft,}irq_enter
1857 * and before decrementing preempt_count on {soft,}irq_exit.
1858 */
1966void account_system_vtime(struct task_struct *curr) 1859void account_system_vtime(struct task_struct *curr)
1967{ 1860{
1968 unsigned long flags; 1861 unsigned long flags;
1862 s64 delta;
1969 int cpu; 1863 int cpu;
1970 u64 now, delta;
1971 1864
1972 if (!sched_clock_irqtime) 1865 if (!sched_clock_irqtime)
1973 return; 1866 return;
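
Editor's note: on 32-bit builds the new irq_time_read() guards the two per-cpu u64 counters with a seqcount so readers never observe a torn 64-bit update. The user-space sketch below shows the same begin/retry pattern for a single 64-bit value, using volatile plus GCC barriers in place of the kernel's seqcount_t and smp_wmb()/smp_rmb(); it is a model of the idea, not the kernel API.

#include <pthread.h>
#include <stdio.h>
#include <stdint.h>

static volatile unsigned int seq;	/* even = stable, odd = write in progress */
static uint64_t irq_time;		/* value a 32-bit CPU cannot update atomically */

static void write_irq_time(uint64_t delta)
{
	__sync_fetch_and_add(&seq, 1);		/* begin: make the count odd */
	__sync_synchronize();			/* like smp_wmb() */
	irq_time += delta;			/* possibly torn on 32-bit */
	__sync_synchronize();			/* like smp_wmb() */
	__sync_fetch_and_add(&seq, 1);		/* end: make the count even again */
}

static uint64_t read_irq_time(void)
{
	unsigned int s;
	uint64_t v;

	do {
		while ((s = seq) & 1)		/* writer active: wait */
			;
		__sync_synchronize();		/* like smp_rmb() */
		v = irq_time;
		__sync_synchronize();		/* like smp_rmb() */
	} while (seq != s);			/* retry if a write slipped in */
	return v;
}

static void *writer(void *arg)
{
	for (int i = 0; i < 1000000; i++)
		write_irq_time(1);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, writer, NULL);
	for (int i = 0; i < 10; i++)
		printf("snapshot: %llu\n", (unsigned long long)read_irq_time());
	pthread_join(t, NULL);
	printf("final: %llu\n", (unsigned long long)read_irq_time());
	return 0;
}
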
@@ -1975,9 +1868,10 @@ void account_system_vtime(struct task_struct *curr)
1975 local_irq_save(flags); 1868 local_irq_save(flags);
1976 1869
1977 cpu = smp_processor_id(); 1870 cpu = smp_processor_id();
1978 now = sched_clock_cpu(cpu); 1871 delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
1979 delta = now - per_cpu(irq_start_time, cpu); 1872 __this_cpu_add(irq_start_time, delta);
1980 per_cpu(irq_start_time, cpu) = now; 1873
1874 irq_time_write_begin();
1981 /* 1875 /*
1982 * We do not account for softirq time from ksoftirqd here. 1876 * We do not account for softirq time from ksoftirqd here.
1983 * We want to continue accounting softirq time to ksoftirqd thread 1877 * We want to continue accounting softirq time to ksoftirqd thread
@@ -1985,37 +1879,60 @@ void account_system_vtime(struct task_struct *curr)
1985 * that do not consume any time, but still want to run. 1879 * that do not consume any time, but still want to run.
1986 */ 1880 */
1987 if (hardirq_count()) 1881 if (hardirq_count())
1988 per_cpu(cpu_hardirq_time, cpu) += delta; 1882 __this_cpu_add(cpu_hardirq_time, delta);
1989 else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD)) 1883 else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
1990 per_cpu(cpu_softirq_time, cpu) += delta; 1884 __this_cpu_add(cpu_softirq_time, delta);
1991 1885
1886 irq_time_write_end();
1992 local_irq_restore(flags); 1887 local_irq_restore(flags);
1993} 1888}
1994EXPORT_SYMBOL_GPL(account_system_vtime); 1889EXPORT_SYMBOL_GPL(account_system_vtime);
1995 1890
1996static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) 1891static void update_rq_clock_task(struct rq *rq, s64 delta)
1997{ 1892{
1998 if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) { 1893 s64 irq_delta;
1999 u64 delta_irq = curr_irq_time - rq->prev_irq_time; 1894
2000 rq->prev_irq_time = curr_irq_time; 1895 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
2001 sched_rt_avg_update(rq, delta_irq); 1896
2002 } 1897 /*
1898 * Since irq_time is only updated on {soft,}irq_exit, we might run into
1899 * this case when a previous update_rq_clock() happened inside a
1900 * {soft,}irq region.
1901 *
1902 * When this happens, we stop ->clock_task and only update the
1903 * prev_irq_time stamp to account for the part that fit, so that a next
1904 * update will consume the rest. This ensures ->clock_task is
1905 * monotonic.
1906 *
1907 * It does however cause some slight misattribution of {soft,}irq
1908 * time; a more accurate solution would be to update the irq_time using
1909 * the current rq->clock timestamp, except that would require using
1910 * atomic ops.
1911 */
1912 if (irq_delta > delta)
1913 irq_delta = delta;
1914
1915 rq->prev_irq_time += irq_delta;
1916 delta -= irq_delta;
1917 rq->clock_task += delta;
1918
1919 if (irq_delta && sched_feat(NONIRQ_POWER))
1920 sched_rt_avg_update(rq, irq_delta);
2003} 1921}
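The clamp described in the comment above is easier to see with numbers. A compressed sketch under assumed values; demo_clock_task_step() is illustrative, not kernel code:

#include <linux/types.h>

/*
 * Example: rq->clock advanced by delta = 2000 ns, but 3000 ns of irq time
 * accrued since prev_irq_time was last sampled (the previous update ran
 * inside an irq region).  irq_delta is clamped to 2000, clock_task does
 * not advance, and the leftover 1000 ns is subtracted from the next
 * update's delta, which keeps clock_task monotonic.
 */
static void demo_clock_task_step(u64 *clock_task, u64 *prev_irq_time,
				 u64 irq_time_now, s64 delta)
{
	s64 irq_delta = irq_time_now - *prev_irq_time;

	if (irq_delta > delta)
		irq_delta = delta;

	*prev_irq_time += irq_delta;
	*clock_task += delta - irq_delta;
}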
2004 1922
2005#else 1923#else /* CONFIG_IRQ_TIME_ACCOUNTING */
2006 1924
2007static u64 irq_time_cpu(int cpu) 1925static void update_rq_clock_task(struct rq *rq, s64 delta)
2008{ 1926{
2009 return 0; 1927 rq->clock_task += delta;
2010} 1928}
2011 1929
2012static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { } 1930#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
2013
2014#endif
2015 1931
2016#include "sched_idletask.c" 1932#include "sched_idletask.c"
2017#include "sched_fair.c" 1933#include "sched_fair.c"
2018#include "sched_rt.c" 1934#include "sched_rt.c"
1935#include "sched_autogroup.c"
2019#include "sched_stoptask.c" 1936#include "sched_stoptask.c"
2020#ifdef CONFIG_SCHED_DEBUG 1937#ifdef CONFIG_SCHED_DEBUG
2021# include "sched_debug.c" 1938# include "sched_debug.c"
@@ -2118,6 +2035,31 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
2118 p->sched_class->prio_changed(rq, p, oldprio, running); 2035 p->sched_class->prio_changed(rq, p, oldprio, running);
2119} 2036}
2120 2037
2038static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2039{
2040 const struct sched_class *class;
2041
2042 if (p->sched_class == rq->curr->sched_class) {
2043 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
2044 } else {
2045 for_each_class(class) {
2046 if (class == rq->curr->sched_class)
2047 break;
2048 if (class == p->sched_class) {
2049 resched_task(rq->curr);
2050 break;
2051 }
2052 }
2053 }
2054
2055 /*
2056 * A queue event has occurred, and we're going to schedule. In
2057 * this case, we can save a useless back to back clock update.
2058 */
2059 if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr))
2060 rq->skip_clock_update = 1;
2061}
2062
2121#ifdef CONFIG_SMP 2063#ifdef CONFIG_SMP
2122/* 2064/*
2123 * Is this task likely cache-hot: 2065 * Is this task likely cache-hot:
@@ -2183,10 +2125,8 @@ static int migration_cpu_stop(void *data);
2183 * The task's runqueue lock must be held. 2125 * The task's runqueue lock must be held.
2184 * Returns true if you have to wait for migration thread. 2126 * Returns true if you have to wait for migration thread.
2185 */ 2127 */
2186static bool migrate_task(struct task_struct *p, int dest_cpu) 2128static bool migrate_task(struct task_struct *p, struct rq *rq)
2187{ 2129{
2188 struct rq *rq = task_rq(p);
2189
2190 /* 2130 /*
2191 * If the task is not on a runqueue (and not running), then 2131 * If the task is not on a runqueue (and not running), then
2192 * the next wake-up will properly place the task. 2132 * the next wake-up will properly place the task.
@@ -2366,18 +2306,15 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2366 return dest_cpu; 2306 return dest_cpu;
2367 2307
2368 /* No more Mr. Nice Guy. */ 2308 /* No more Mr. Nice Guy. */
2369 if (unlikely(dest_cpu >= nr_cpu_ids)) { 2309 dest_cpu = cpuset_cpus_allowed_fallback(p);
2370 dest_cpu = cpuset_cpus_allowed_fallback(p); 2310 /*
2371 /* 2311 * Don't tell them about moving exiting tasks or
2372 * Don't tell them about moving exiting tasks or 2312 * kernel threads (both mm NULL), since they never
2373 * kernel threads (both mm NULL), since they never 2313 * leave kernel.
2374 * leave kernel. 2314 */
2375 */ 2315 if (p->mm && printk_ratelimit()) {
2376 if (p->mm && printk_ratelimit()) { 2316 printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n",
2377 printk(KERN_INFO "process %d (%s) no " 2317 task_pid_nr(p), p->comm, cpu);
2378 "longer affine to cpu%d\n",
2379 task_pid_nr(p), p->comm, cpu);
2380 }
2381 } 2318 }
2382 2319
2383 return dest_cpu; 2320 return dest_cpu;
@@ -2568,7 +2505,7 @@ out:
2568 * try_to_wake_up_local - try to wake up a local task with rq lock held 2505 * try_to_wake_up_local - try to wake up a local task with rq lock held
2569 * @p: the thread to be awakened 2506 * @p: the thread to be awakened
2570 * 2507 *
2571 * Put @p on the run-queue if it's not alredy there. The caller must 2508 * Put @p on the run-queue if it's not already there. The caller must
2572 * ensure that this_rq() is locked, @p is bound to this_rq() and not 2509 * ensure that this_rq() is locked, @p is bound to this_rq() and not
2573 * the current task. this_rq() stays locked over invocation. 2510 * the current task. this_rq() stays locked over invocation.
2574 */ 2511 */
@@ -2713,7 +2650,9 @@ void sched_fork(struct task_struct *p, int clone_flags)
2713 /* Want to start with kernel preemption disabled. */ 2650 /* Want to start with kernel preemption disabled. */
2714 task_thread_info(p)->preempt_count = 1; 2651 task_thread_info(p)->preempt_count = 1;
2715#endif 2652#endif
2653#ifdef CONFIG_SMP
2716 plist_node_init(&p->pushable_tasks, MAX_PRIO); 2654 plist_node_init(&p->pushable_tasks, MAX_PRIO);
2655#endif
2717 2656
2718 put_cpu(); 2657 put_cpu();
2719} 2658}
@@ -3104,6 +3043,15 @@ static long calc_load_fold_active(struct rq *this_rq)
3104 return delta; 3043 return delta;
3105} 3044}
3106 3045
3046static unsigned long
3047calc_load(unsigned long load, unsigned long exp, unsigned long active)
3048{
3049 load *= exp;
3050 load += active * (FIXED_1 - exp);
3051 load += 1UL << (FSHIFT - 1);
3052 return load >> FSHIFT;
3053}
3054
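To make the fixed-point update concrete, here is one worked step, assuming the usual sched.h constants FSHIFT = 11, FIXED_1 = 2048 and EXP_1 = 1884 (1/exp(5 s / 1 min)):

/*
 * Worked example (sketch): one calc_load() step for the 1-minute average,
 * starting from 0 with two runnable tasks, so active = 2 * FIXED_1 = 4096:
 *
 *	load  = 0 * 1884		= 0
 *	load += 4096 * (2048 - 1884)	= 671744
 *	load += 1 << 10			= 672768	(round to nearest)
 *	load >>= 11			= 328		-> printed as 0.16
 *
 * One 5-second LOAD_FREQ period therefore moves the average about 8%
 * (1 - 1884/2048) of the way toward the instantaneous task count.
 */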
3107#ifdef CONFIG_NO_HZ 3055#ifdef CONFIG_NO_HZ
3108/* 3056/*
3109 * For NO_HZ we delay the active fold to the next LOAD_FREQ update. 3057 * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
@@ -3133,6 +3081,128 @@ static long calc_load_fold_idle(void)
3133 3081
3134 return delta; 3082 return delta;
3135} 3083}
3084
3085/**
3086 * fixed_power_int - compute: x^n, in O(log n) time
3087 *
3088 * @x: base of the power
3089 * @frac_bits: fractional bits of @x
3090 * @n: power to raise @x to.
3091 *
3092 * By exploiting the relation between the definition of the natural power
3093 * function: x^n := x*x*...*x (x multiplied by itself for n times), and
3094 * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
3095 * (where: n_i \elem {0, 1}, the binary vector representing n),
3096 * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
3097 * of course trivially computable in O(log_2 n), the length of our binary
3098 * vector.
3099 */
3100static unsigned long
3101fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
3102{
3103 unsigned long result = 1UL << frac_bits;
3104
3105 if (n) for (;;) {
3106 if (n & 1) {
3107 result *= x;
3108 result += 1UL << (frac_bits - 1);
3109 result >>= frac_bits;
3110 }
3111 n >>= 1;
3112 if (!n)
3113 break;
3114 x *= x;
3115 x += 1UL << (frac_bits - 1);
3116 x >>= frac_bits;
3117 }
3118
3119 return result;
3120}
3121
3122/*
3123 * a1 = a0 * e + a * (1 - e)
3124 *
3125 * a2 = a1 * e + a * (1 - e)
3126 * = (a0 * e + a * (1 - e)) * e + a * (1 - e)
3127 * = a0 * e^2 + a * (1 - e) * (1 + e)
3128 *
3129 * a3 = a2 * e + a * (1 - e)
3130 * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
3131 * = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
3132 *
3133 * ...
3134 *
 3135 * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^(n-1)) [1]
3136 * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
3137 * = a0 * e^n + a * (1 - e^n)
3138 *
3139 * [1] application of the geometric series:
3140 *
3141 * n 1 - x^(n+1)
3142 * S_n := \Sum x^i = -------------
3143 * i=0 1 - x
3144 */
3145static unsigned long
3146calc_load_n(unsigned long load, unsigned long exp,
3147 unsigned long active, unsigned int n)
3148{
3149
3150 return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
3151}
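What calc_load_n() buys over calling calc_load() in a loop is aging the average over n missed periods at once, by raising the decay factor to the n-th power in fixed point. A sketch with assumed constants (EXP_1 = 1884, FSHIFT = 11):

/*
 * Sketch: if nothing was runnable for n = 3 missed LOAD_FREQ periods,
 * the 1-minute average decays as
 *
 *	avenrun' = calc_load_n(avenrun, EXP_1, 0, 3)
 *		 = calc_load(avenrun, fixed_power_int(1884, 11, 3), 0)
 *		 ~ avenrun * (1884/2048)^3 ~ 0.78 * avenrun
 *
 * fixed_power_int() squares-and-multiplies, so even a very long tickless
 * idle period costs O(log n) multiplies rather than one pass per period.
 */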
3152
3153/*
3154 * NO_HZ can leave us missing all per-cpu ticks calling
3155 * calc_load_account_active(), but since an idle CPU folds its delta into
 3156 * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
3157 * in the pending idle delta if our idle period crossed a load cycle boundary.
3158 *
3159 * Once we've updated the global active value, we need to apply the exponential
3160 * weights adjusted to the number of cycles missed.
3161 */
3162static void calc_global_nohz(unsigned long ticks)
3163{
3164 long delta, active, n;
3165
3166 if (time_before(jiffies, calc_load_update))
3167 return;
3168
3169 /*
3170 * If we crossed a calc_load_update boundary, make sure to fold
3171 * any pending idle changes, the respective CPUs might have
3172 * missed the tick driven calc_load_account_active() update
3173 * due to NO_HZ.
3174 */
3175 delta = calc_load_fold_idle();
3176 if (delta)
3177 atomic_long_add(delta, &calc_load_tasks);
3178
3179 /*
3180 * If we were idle for multiple load cycles, apply them.
3181 */
3182 if (ticks >= LOAD_FREQ) {
3183 n = ticks / LOAD_FREQ;
3184
3185 active = atomic_long_read(&calc_load_tasks);
3186 active = active > 0 ? active * FIXED_1 : 0;
3187
3188 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
3189 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
3190 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
3191
3192 calc_load_update += n * LOAD_FREQ;
3193 }
3194
3195 /*
 3196 * It's possible the remainder of the above division also crosses
 3197 * a LOAD_FREQ period; the regular check in calc_global_load(),
 3198 * which comes after this, will take care of that.
3199 *
3200 * Consider us being 11 ticks before a cycle completion, and us
3201 * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will
3202 * age us 4 cycles, and the test in calc_global_load() will
3203 * pick up the final one.
3204 */
3205}
3136#else 3206#else
3137static void calc_load_account_idle(struct rq *this_rq) 3207static void calc_load_account_idle(struct rq *this_rq)
3138{ 3208{
@@ -3142,6 +3212,10 @@ static inline long calc_load_fold_idle(void)
3142{ 3212{
3143 return 0; 3213 return 0;
3144} 3214}
3215
3216static void calc_global_nohz(unsigned long ticks)
3217{
3218}
3145#endif 3219#endif
3146 3220
3147/** 3221/**
@@ -3159,24 +3233,17 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
3159 loads[2] = (avenrun[2] + offset) << shift; 3233 loads[2] = (avenrun[2] + offset) << shift;
3160} 3234}
3161 3235
3162static unsigned long
3163calc_load(unsigned long load, unsigned long exp, unsigned long active)
3164{
3165 load *= exp;
3166 load += active * (FIXED_1 - exp);
3167 return load >> FSHIFT;
3168}
3169
3170/* 3236/*
3171 * calc_load - update the avenrun load estimates 10 ticks after the 3237 * calc_load - update the avenrun load estimates 10 ticks after the
3172 * CPUs have updated calc_load_tasks. 3238 * CPUs have updated calc_load_tasks.
3173 */ 3239 */
3174void calc_global_load(void) 3240void calc_global_load(unsigned long ticks)
3175{ 3241{
3176 unsigned long upd = calc_load_update + 10;
3177 long active; 3242 long active;
3178 3243
3179 if (time_before(jiffies, upd)) 3244 calc_global_nohz(ticks);
3245
3246 if (time_before(jiffies, calc_load_update + 10))
3180 return; 3247 return;
3181 3248
3182 active = atomic_long_read(&calc_load_tasks); 3249 active = atomic_long_read(&calc_load_tasks);
@@ -3349,7 +3416,7 @@ void sched_exec(void)
3349 * select_task_rq() can race against ->cpus_allowed 3416 * select_task_rq() can race against ->cpus_allowed
3350 */ 3417 */
3351 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) && 3418 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
3352 likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) { 3419 likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) {
3353 struct migration_arg arg = { p, dest_cpu }; 3420 struct migration_arg arg = { p, dest_cpu };
3354 3421
3355 task_rq_unlock(rq, &flags); 3422 task_rq_unlock(rq, &flags);
@@ -3820,7 +3887,7 @@ static inline void schedule_debug(struct task_struct *prev)
3820 schedstat_inc(this_rq(), sched_count); 3887 schedstat_inc(this_rq(), sched_count);
3821#ifdef CONFIG_SCHEDSTATS 3888#ifdef CONFIG_SCHEDSTATS
3822 if (unlikely(prev->lock_depth >= 0)) { 3889 if (unlikely(prev->lock_depth >= 0)) {
3823 schedstat_inc(this_rq(), bkl_count); 3890 schedstat_inc(this_rq(), rq_sched_info.bkl_count);
3824 schedstat_inc(prev, sched_info.bkl_count); 3891 schedstat_inc(prev, sched_info.bkl_count);
3825 } 3892 }
3826#endif 3893#endif
@@ -3830,7 +3897,6 @@ static void put_prev_task(struct rq *rq, struct task_struct *prev)
3830{ 3897{
3831 if (prev->se.on_rq) 3898 if (prev->se.on_rq)
3832 update_rq_clock(rq); 3899 update_rq_clock(rq);
3833 rq->skip_clock_update = 0;
3834 prev->sched_class->put_prev_task(rq, prev); 3900 prev->sched_class->put_prev_task(rq, prev);
3835} 3901}
3836 3902
@@ -3888,7 +3954,6 @@ need_resched_nonpreemptible:
3888 hrtick_clear(rq); 3954 hrtick_clear(rq);
3889 3955
3890 raw_spin_lock_irq(&rq->lock); 3956 raw_spin_lock_irq(&rq->lock);
3891 clear_tsk_need_resched(prev);
3892 3957
3893 switch_count = &prev->nivcsw; 3958 switch_count = &prev->nivcsw;
3894 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 3959 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
@@ -3920,6 +3985,8 @@ need_resched_nonpreemptible:
3920 3985
3921 put_prev_task(rq, prev); 3986 put_prev_task(rq, prev);
3922 next = pick_next_task(rq); 3987 next = pick_next_task(rq);
3988 clear_tsk_need_resched(prev);
3989 rq->skip_clock_update = 0;
3923 3990
3924 if (likely(prev != next)) { 3991 if (likely(prev != next)) {
3925 sched_info_switch(prev, next); 3992 sched_info_switch(prev, next);
@@ -4014,7 +4081,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
4014 if (task_thread_info(rq->curr) != owner || need_resched()) 4081 if (task_thread_info(rq->curr) != owner || need_resched())
4015 return 0; 4082 return 0;
4016 4083
4017 cpu_relax(); 4084 arch_mutex_cpu_relax();
4018 } 4085 }
4019 4086
4020 return 1; 4087 return 1;
@@ -4326,7 +4393,7 @@ EXPORT_SYMBOL(wait_for_completion_interruptible);
4326 * This waits for either a completion of a specific task to be signaled or for a 4393 * This waits for either a completion of a specific task to be signaled or for a
4327 * specified timeout to expire. It is interruptible. The timeout is in jiffies. 4394 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
4328 */ 4395 */
4329unsigned long __sched 4396long __sched
4330wait_for_completion_interruptible_timeout(struct completion *x, 4397wait_for_completion_interruptible_timeout(struct completion *x,
4331 unsigned long timeout) 4398 unsigned long timeout)
4332{ 4399{
@@ -4359,7 +4426,7 @@ EXPORT_SYMBOL(wait_for_completion_killable);
4359 * signaled or for a specified timeout to expire. It can be 4426 * signaled or for a specified timeout to expire. It can be
4360 * interrupted by a kill signal. The timeout is in jiffies. 4427 * interrupted by a kill signal. The timeout is in jiffies.
4361 */ 4428 */
4362unsigned long __sched 4429long __sched
4363wait_for_completion_killable_timeout(struct completion *x, 4430wait_for_completion_killable_timeout(struct completion *x,
4364 unsigned long timeout) 4431 unsigned long timeout)
4365{ 4432{
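The switch from unsigned long to long matters to callers: the interruptible and killable variants can return -ERESTARTSYS, which an unsigned return type silently turns into a huge remaining-time value. A hedged usage sketch (the demo_wait() caller is illustrative, not kernel code):

#include <linux/completion.h>
#include <linux/errno.h>
#include <linux/jiffies.h>

static int demo_wait(struct completion *done)
{
	long ret = wait_for_completion_interruptible_timeout(done,
						msecs_to_jiffies(100));

	if (ret < 0)		/* interrupted by a signal: -ERESTARTSYS */
		return ret;
	if (ret == 0)		/* 100 ms elapsed without the completion */
		return -ETIMEDOUT;
	return 0;		/* ret > 0: jiffies left when it completed */
}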
@@ -4701,7 +4768,7 @@ static bool check_same_owner(struct task_struct *p)
4701} 4768}
4702 4769
4703static int __sched_setscheduler(struct task_struct *p, int policy, 4770static int __sched_setscheduler(struct task_struct *p, int policy,
4704 struct sched_param *param, bool user) 4771 const struct sched_param *param, bool user)
4705{ 4772{
4706 int retval, oldprio, oldpolicy = -1, on_rq, running; 4773 int retval, oldprio, oldpolicy = -1, on_rq, running;
4707 unsigned long flags; 4774 unsigned long flags;
@@ -4804,7 +4871,8 @@ recheck:
4804 * assigned. 4871 * assigned.
4805 */ 4872 */
4806 if (rt_bandwidth_enabled() && rt_policy(policy) && 4873 if (rt_bandwidth_enabled() && rt_policy(policy) &&
4807 task_group(p)->rt_bandwidth.rt_runtime == 0) { 4874 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
4875 !task_group_is_autogroup(task_group(p))) {
4808 __task_rq_unlock(rq); 4876 __task_rq_unlock(rq);
4809 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 4877 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4810 return -EPERM; 4878 return -EPERM;
@@ -4856,7 +4924,7 @@ recheck:
4856 * NOTE that the task may be already dead. 4924 * NOTE that the task may be already dead.
4857 */ 4925 */
4858int sched_setscheduler(struct task_struct *p, int policy, 4926int sched_setscheduler(struct task_struct *p, int policy,
4859 struct sched_param *param) 4927 const struct sched_param *param)
4860{ 4928{
4861 return __sched_setscheduler(p, policy, param, true); 4929 return __sched_setscheduler(p, policy, param, true);
4862} 4930}
@@ -4874,7 +4942,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
4874 * but our caller might not have that capability. 4942 * but our caller might not have that capability.
4875 */ 4943 */
4876int sched_setscheduler_nocheck(struct task_struct *p, int policy, 4944int sched_setscheduler_nocheck(struct task_struct *p, int policy,
4877 struct sched_param *param) 4945 const struct sched_param *param)
4878{ 4946{
4879 return __sched_setscheduler(p, policy, param, false); 4947 return __sched_setscheduler(p, policy, param, false);
4880} 4948}
@@ -5390,7 +5458,7 @@ void sched_show_task(struct task_struct *p)
5390 unsigned state; 5458 unsigned state;
5391 5459
5392 state = p->state ? __ffs(p->state) + 1 : 0; 5460 state = p->state ? __ffs(p->state) + 1 : 0;
5393 printk(KERN_INFO "%-13.13s %c", p->comm, 5461 printk(KERN_INFO "%-15.15s %c", p->comm,
5394 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); 5462 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
5395#if BITS_PER_LONG == 32 5463#if BITS_PER_LONG == 32
5396 if (state == TASK_RUNNING) 5464 if (state == TASK_RUNNING)
@@ -5554,7 +5622,6 @@ static void update_sysctl(void)
5554 SET_SYSCTL(sched_min_granularity); 5622 SET_SYSCTL(sched_min_granularity);
5555 SET_SYSCTL(sched_latency); 5623 SET_SYSCTL(sched_latency);
5556 SET_SYSCTL(sched_wakeup_granularity); 5624 SET_SYSCTL(sched_wakeup_granularity);
5557 SET_SYSCTL(sched_shares_ratelimit);
5558#undef SET_SYSCTL 5625#undef SET_SYSCTL
5559} 5626}
5560 5627
@@ -5630,7 +5697,7 @@ again:
5630 goto out; 5697 goto out;
5631 5698
5632 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); 5699 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
5633 if (migrate_task(p, dest_cpu)) { 5700 if (migrate_task(p, rq)) {
5634 struct migration_arg arg = { p, dest_cpu }; 5701 struct migration_arg arg = { p, dest_cpu };
5635 /* Need help from migration thread: drop lock and wait. */ 5702 /* Need help from migration thread: drop lock and wait. */
5636 task_rq_unlock(rq, &flags); 5703 task_rq_unlock(rq, &flags);
@@ -5712,29 +5779,20 @@ static int migration_cpu_stop(void *data)
5712} 5779}
5713 5780
5714#ifdef CONFIG_HOTPLUG_CPU 5781#ifdef CONFIG_HOTPLUG_CPU
5782
5715/* 5783/*
5716 * Figure out where task on dead CPU should go, use force if necessary. 5784 * Ensures that the idle task is using init_mm right before its cpu goes
5785 * offline.
5717 */ 5786 */
5718void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 5787void idle_task_exit(void)
5719{ 5788{
5720 struct rq *rq = cpu_rq(dead_cpu); 5789 struct mm_struct *mm = current->active_mm;
5721 int needs_cpu, uninitialized_var(dest_cpu);
5722 unsigned long flags;
5723 5790
5724 local_irq_save(flags); 5791 BUG_ON(cpu_online(smp_processor_id()));
5725 5792
5726 raw_spin_lock(&rq->lock); 5793 if (mm != &init_mm)
5727 needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING); 5794 switch_mm(mm, &init_mm, current);
5728 if (needs_cpu) 5795 mmdrop(mm);
5729 dest_cpu = select_fallback_rq(dead_cpu, p);
5730 raw_spin_unlock(&rq->lock);
5731 /*
5732 * It can only fail if we race with set_cpus_allowed(),
5733 * in the racer should migrate the task anyway.
5734 */
5735 if (needs_cpu)
5736 __migrate_task(p, dead_cpu, dest_cpu);
5737 local_irq_restore(flags);
5738} 5796}
5739 5797
5740/* 5798/*
@@ -5747,128 +5805,69 @@ void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
5747static void migrate_nr_uninterruptible(struct rq *rq_src) 5805static void migrate_nr_uninterruptible(struct rq *rq_src)
5748{ 5806{
5749 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); 5807 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
5750 unsigned long flags;
5751 5808
5752 local_irq_save(flags);
5753 double_rq_lock(rq_src, rq_dest);
5754 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; 5809 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
5755 rq_src->nr_uninterruptible = 0; 5810 rq_src->nr_uninterruptible = 0;
5756 double_rq_unlock(rq_src, rq_dest);
5757 local_irq_restore(flags);
5758}
5759
5760/* Run through task list and migrate tasks from the dead cpu. */
5761static void migrate_live_tasks(int src_cpu)
5762{
5763 struct task_struct *p, *t;
5764
5765 read_lock(&tasklist_lock);
5766
5767 do_each_thread(t, p) {
5768 if (p == current)
5769 continue;
5770
5771 if (task_cpu(p) == src_cpu)
5772 move_task_off_dead_cpu(src_cpu, p);
5773 } while_each_thread(t, p);
5774
5775 read_unlock(&tasklist_lock);
5776} 5811}
5777 5812
5778/* 5813/*
5779 * Schedules idle task to be the next runnable task on current CPU. 5814 * remove the tasks which were accounted by rq from calc_load_tasks.
5780 * It does so by boosting its priority to highest possible.
5781 * Used by CPU offline code.
5782 */ 5815 */
5783void sched_idle_next(void) 5816static void calc_global_load_remove(struct rq *rq)
5784{ 5817{
5785 int this_cpu = smp_processor_id(); 5818 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
5786 struct rq *rq = cpu_rq(this_cpu); 5819 rq->calc_load_active = 0;
5787 struct task_struct *p = rq->idle;
5788 unsigned long flags;
5789
5790 /* cpu has to be offline */
5791 BUG_ON(cpu_online(this_cpu));
5792
5793 /*
5794 * Strictly not necessary since rest of the CPUs are stopped by now
5795 * and interrupts disabled on the current cpu.
5796 */
5797 raw_spin_lock_irqsave(&rq->lock, flags);
5798
5799 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5800
5801 activate_task(rq, p, 0);
5802
5803 raw_spin_unlock_irqrestore(&rq->lock, flags);
5804} 5820}
5805 5821
5806/* 5822/*
5807 * Ensures that the idle task is using init_mm right before its cpu goes 5823 * Migrate all tasks from the rq, sleeping tasks will be migrated by
5808 * offline. 5824 * try_to_wake_up()->select_task_rq().
5825 *
 5826 * Called with rq->lock held even though we're in stop_machine() and
 5827 * there's no concurrency possible; we hold the required locks anyway
5828 * because of lock validation efforts.
5809 */ 5829 */
5810void idle_task_exit(void) 5830static void migrate_tasks(unsigned int dead_cpu)
5811{
5812 struct mm_struct *mm = current->active_mm;
5813
5814 BUG_ON(cpu_online(smp_processor_id()));
5815
5816 if (mm != &init_mm)
5817 switch_mm(mm, &init_mm, current);
5818 mmdrop(mm);
5819}
5820
5821/* called under rq->lock with disabled interrupts */
5822static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
5823{ 5831{
5824 struct rq *rq = cpu_rq(dead_cpu); 5832 struct rq *rq = cpu_rq(dead_cpu);
5825 5833 struct task_struct *next, *stop = rq->stop;
5826 /* Must be exiting, otherwise would be on tasklist. */ 5834 int dest_cpu;
5827 BUG_ON(!p->exit_state);
5828
5829 /* Cannot have done final schedule yet: would have vanished. */
5830 BUG_ON(p->state == TASK_DEAD);
5831
5832 get_task_struct(p);
5833 5835
5834 /* 5836 /*
5835 * Drop lock around migration; if someone else moves it, 5837 * Fudge the rq selection such that the below task selection loop
5836 * that's OK. No task can be added to this CPU, so iteration is 5838 * doesn't get stuck on the currently eligible stop task.
5837 * fine. 5839 *
5840 * We're currently inside stop_machine() and the rq is either stuck
 5841 * in the stop_machine_cpu_stop() loop, or we're executing this code;
 5842 * either way, we should never end up calling schedule() until we're
5843 * done here.
5838 */ 5844 */
5839 raw_spin_unlock_irq(&rq->lock); 5845 rq->stop = NULL;
5840 move_task_off_dead_cpu(dead_cpu, p);
5841 raw_spin_lock_irq(&rq->lock);
5842
5843 put_task_struct(p);
5844}
5845
5846/* release_task() removes task from tasklist, so we won't find dead tasks. */
5847static void migrate_dead_tasks(unsigned int dead_cpu)
5848{
5849 struct rq *rq = cpu_rq(dead_cpu);
5850 struct task_struct *next;
5851 5846
5852 for ( ; ; ) { 5847 for ( ; ; ) {
5853 if (!rq->nr_running) 5848 /*
 5849 * There's this thread running; bail when that's the only
5850 * remaining thread.
5851 */
5852 if (rq->nr_running == 1)
5854 break; 5853 break;
5854
5855 next = pick_next_task(rq); 5855 next = pick_next_task(rq);
5856 if (!next) 5856 BUG_ON(!next);
5857 break;
5858 next->sched_class->put_prev_task(rq, next); 5857 next->sched_class->put_prev_task(rq, next);
5859 migrate_dead(dead_cpu, next);
5860 5858
5859 /* Find suitable destination for @next, with force if needed. */
5860 dest_cpu = select_fallback_rq(dead_cpu, next);
5861 raw_spin_unlock(&rq->lock);
5862
5863 __migrate_task(next, dead_cpu, dest_cpu);
5864
5865 raw_spin_lock(&rq->lock);
5861 } 5866 }
5862}
5863 5867
5864/* 5868 rq->stop = stop;
5865 * remove the tasks which were accounted by rq from calc_load_tasks.
5866 */
5867static void calc_global_load_remove(struct rq *rq)
5868{
5869 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
5870 rq->calc_load_active = 0;
5871} 5869}
5870
5872#endif /* CONFIG_HOTPLUG_CPU */ 5871#endif /* CONFIG_HOTPLUG_CPU */
5873 5872
5874#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) 5873#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -6078,15 +6077,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6078 unsigned long flags; 6077 unsigned long flags;
6079 struct rq *rq = cpu_rq(cpu); 6078 struct rq *rq = cpu_rq(cpu);
6080 6079
6081 switch (action) { 6080 switch (action & ~CPU_TASKS_FROZEN) {
6082 6081
6083 case CPU_UP_PREPARE: 6082 case CPU_UP_PREPARE:
6084 case CPU_UP_PREPARE_FROZEN:
6085 rq->calc_load_update = calc_load_update; 6083 rq->calc_load_update = calc_load_update;
6086 break; 6084 break;
6087 6085
6088 case CPU_ONLINE: 6086 case CPU_ONLINE:
6089 case CPU_ONLINE_FROZEN:
6090 /* Update our root-domain */ 6087 /* Update our root-domain */
6091 raw_spin_lock_irqsave(&rq->lock, flags); 6088 raw_spin_lock_irqsave(&rq->lock, flags);
6092 if (rq->rd) { 6089 if (rq->rd) {
@@ -6098,30 +6095,19 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6098 break; 6095 break;
6099 6096
6100#ifdef CONFIG_HOTPLUG_CPU 6097#ifdef CONFIG_HOTPLUG_CPU
6101 case CPU_DEAD:
6102 case CPU_DEAD_FROZEN:
6103 migrate_live_tasks(cpu);
6104 /* Idle task back to normal (off runqueue, low prio) */
6105 raw_spin_lock_irq(&rq->lock);
6106 deactivate_task(rq, rq->idle, 0);
6107 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
6108 rq->idle->sched_class = &idle_sched_class;
6109 migrate_dead_tasks(cpu);
6110 raw_spin_unlock_irq(&rq->lock);
6111 migrate_nr_uninterruptible(rq);
6112 BUG_ON(rq->nr_running != 0);
6113 calc_global_load_remove(rq);
6114 break;
6115
6116 case CPU_DYING: 6098 case CPU_DYING:
6117 case CPU_DYING_FROZEN:
6118 /* Update our root-domain */ 6099 /* Update our root-domain */
6119 raw_spin_lock_irqsave(&rq->lock, flags); 6100 raw_spin_lock_irqsave(&rq->lock, flags);
6120 if (rq->rd) { 6101 if (rq->rd) {
6121 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 6102 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
6122 set_rq_offline(rq); 6103 set_rq_offline(rq);
6123 } 6104 }
6105 migrate_tasks(cpu);
6106 BUG_ON(rq->nr_running != 1); /* the migration thread */
6124 raw_spin_unlock_irqrestore(&rq->lock, flags); 6107 raw_spin_unlock_irqrestore(&rq->lock, flags);
6108
6109 migrate_nr_uninterruptible(rq);
6110 calc_global_load_remove(rq);
6125 break; 6111 break;
6126#endif 6112#endif
6127 } 6113 }
@@ -6960,6 +6946,8 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6960 if (cpu != group_first_cpu(sd->groups)) 6946 if (cpu != group_first_cpu(sd->groups))
6961 return; 6947 return;
6962 6948
6949 sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups));
6950
6963 child = sd->child; 6951 child = sd->child;
6964 6952
6965 sd->groups->cpu_power = 0; 6953 sd->groups->cpu_power = 0;
@@ -7850,18 +7838,16 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
7850 7838
7851#ifdef CONFIG_FAIR_GROUP_SCHED 7839#ifdef CONFIG_FAIR_GROUP_SCHED
7852static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, 7840static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
7853 struct sched_entity *se, int cpu, int add, 7841 struct sched_entity *se, int cpu,
7854 struct sched_entity *parent) 7842 struct sched_entity *parent)
7855{ 7843{
7856 struct rq *rq = cpu_rq(cpu); 7844 struct rq *rq = cpu_rq(cpu);
7857 tg->cfs_rq[cpu] = cfs_rq; 7845 tg->cfs_rq[cpu] = cfs_rq;
7858 init_cfs_rq(cfs_rq, rq); 7846 init_cfs_rq(cfs_rq, rq);
7859 cfs_rq->tg = tg; 7847 cfs_rq->tg = tg;
7860 if (add)
7861 list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
7862 7848
7863 tg->se[cpu] = se; 7849 tg->se[cpu] = se;
7864 /* se could be NULL for init_task_group */ 7850 /* se could be NULL for root_task_group */
7865 if (!se) 7851 if (!se)
7866 return; 7852 return;
7867 7853
@@ -7871,15 +7857,14 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
7871 se->cfs_rq = parent->my_q; 7857 se->cfs_rq = parent->my_q;
7872 7858
7873 se->my_q = cfs_rq; 7859 se->my_q = cfs_rq;
7874 se->load.weight = tg->shares; 7860 update_load_set(&se->load, 0);
7875 se->load.inv_weight = 0;
7876 se->parent = parent; 7861 se->parent = parent;
7877} 7862}
7878#endif 7863#endif
7879 7864
7880#ifdef CONFIG_RT_GROUP_SCHED 7865#ifdef CONFIG_RT_GROUP_SCHED
7881static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, 7866static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
7882 struct sched_rt_entity *rt_se, int cpu, int add, 7867 struct sched_rt_entity *rt_se, int cpu,
7883 struct sched_rt_entity *parent) 7868 struct sched_rt_entity *parent)
7884{ 7869{
7885 struct rq *rq = cpu_rq(cpu); 7870 struct rq *rq = cpu_rq(cpu);
@@ -7888,8 +7873,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
7888 init_rt_rq(rt_rq, rq); 7873 init_rt_rq(rt_rq, rq);
7889 rt_rq->tg = tg; 7874 rt_rq->tg = tg;
7890 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; 7875 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
7891 if (add)
7892 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
7893 7876
7894 tg->rt_se[cpu] = rt_se; 7877 tg->rt_se[cpu] = rt_se;
7895 if (!rt_se) 7878 if (!rt_se)
@@ -7924,18 +7907,18 @@ void __init sched_init(void)
7924 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); 7907 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
7925 7908
7926#ifdef CONFIG_FAIR_GROUP_SCHED 7909#ifdef CONFIG_FAIR_GROUP_SCHED
7927 init_task_group.se = (struct sched_entity **)ptr; 7910 root_task_group.se = (struct sched_entity **)ptr;
7928 ptr += nr_cpu_ids * sizeof(void **); 7911 ptr += nr_cpu_ids * sizeof(void **);
7929 7912
7930 init_task_group.cfs_rq = (struct cfs_rq **)ptr; 7913 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
7931 ptr += nr_cpu_ids * sizeof(void **); 7914 ptr += nr_cpu_ids * sizeof(void **);
7932 7915
7933#endif /* CONFIG_FAIR_GROUP_SCHED */ 7916#endif /* CONFIG_FAIR_GROUP_SCHED */
7934#ifdef CONFIG_RT_GROUP_SCHED 7917#ifdef CONFIG_RT_GROUP_SCHED
7935 init_task_group.rt_se = (struct sched_rt_entity **)ptr; 7918 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
7936 ptr += nr_cpu_ids * sizeof(void **); 7919 ptr += nr_cpu_ids * sizeof(void **);
7937 7920
7938 init_task_group.rt_rq = (struct rt_rq **)ptr; 7921 root_task_group.rt_rq = (struct rt_rq **)ptr;
7939 ptr += nr_cpu_ids * sizeof(void **); 7922 ptr += nr_cpu_ids * sizeof(void **);
7940 7923
7941#endif /* CONFIG_RT_GROUP_SCHED */ 7924#endif /* CONFIG_RT_GROUP_SCHED */
@@ -7955,20 +7938,16 @@ void __init sched_init(void)
7955 global_rt_period(), global_rt_runtime()); 7938 global_rt_period(), global_rt_runtime());
7956 7939
7957#ifdef CONFIG_RT_GROUP_SCHED 7940#ifdef CONFIG_RT_GROUP_SCHED
7958 init_rt_bandwidth(&init_task_group.rt_bandwidth, 7941 init_rt_bandwidth(&root_task_group.rt_bandwidth,
7959 global_rt_period(), global_rt_runtime()); 7942 global_rt_period(), global_rt_runtime());
7960#endif /* CONFIG_RT_GROUP_SCHED */ 7943#endif /* CONFIG_RT_GROUP_SCHED */
7961 7944
7962#ifdef CONFIG_CGROUP_SCHED 7945#ifdef CONFIG_CGROUP_SCHED
7963 list_add(&init_task_group.list, &task_groups); 7946 list_add(&root_task_group.list, &task_groups);
7964 INIT_LIST_HEAD(&init_task_group.children); 7947 INIT_LIST_HEAD(&root_task_group.children);
7965 7948 autogroup_init(&init_task);
7966#endif /* CONFIG_CGROUP_SCHED */ 7949#endif /* CONFIG_CGROUP_SCHED */
7967 7950
7968#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
7969 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
7970 __alignof__(unsigned long));
7971#endif
7972 for_each_possible_cpu(i) { 7951 for_each_possible_cpu(i) {
7973 struct rq *rq; 7952 struct rq *rq;
7974 7953
@@ -7980,38 +7959,34 @@ void __init sched_init(void)
7980 init_cfs_rq(&rq->cfs, rq); 7959 init_cfs_rq(&rq->cfs, rq);
7981 init_rt_rq(&rq->rt, rq); 7960 init_rt_rq(&rq->rt, rq);
7982#ifdef CONFIG_FAIR_GROUP_SCHED 7961#ifdef CONFIG_FAIR_GROUP_SCHED
7983 init_task_group.shares = init_task_group_load; 7962 root_task_group.shares = root_task_group_load;
7984 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 7963 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
7985#ifdef CONFIG_CGROUP_SCHED
7986 /* 7964 /*
7987 * How much cpu bandwidth does init_task_group get? 7965 * How much cpu bandwidth does root_task_group get?
7988 * 7966 *
7989 * In case of task-groups formed thr' the cgroup filesystem, it 7967 * In case of task-groups formed thr' the cgroup filesystem, it
7990 * gets 100% of the cpu resources in the system. This overall 7968 * gets 100% of the cpu resources in the system. This overall
7991 * system cpu resource is divided among the tasks of 7969 * system cpu resource is divided among the tasks of
7992 * init_task_group and its child task-groups in a fair manner, 7970 * root_task_group and its child task-groups in a fair manner,
7993 * based on each entity's (task or task-group's) weight 7971 * based on each entity's (task or task-group's) weight
7994 * (se->load.weight). 7972 * (se->load.weight).
7995 * 7973 *
7996 * In other words, if init_task_group has 10 tasks of weight 7974 * In other words, if root_task_group has 10 tasks of weight
 7997 * 1024 and two child groups A0 and A1 (of weight 1024 each), 7975 * 1024 and two child groups A0 and A1 (of weight 1024 each),
7998 * then A0's share of the cpu resource is: 7976 * then A0's share of the cpu resource is:
7999 * 7977 *
8000 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% 7978 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
8001 * 7979 *
8002 * We achieve this by letting init_task_group's tasks sit 7980 * We achieve this by letting root_task_group's tasks sit
8003 * directly in rq->cfs (i.e init_task_group->se[] = NULL). 7981 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
8004 */ 7982 */
8005 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); 7983 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
8006#endif
8007#endif /* CONFIG_FAIR_GROUP_SCHED */ 7984#endif /* CONFIG_FAIR_GROUP_SCHED */
8008 7985
8009 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; 7986 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
8010#ifdef CONFIG_RT_GROUP_SCHED 7987#ifdef CONFIG_RT_GROUP_SCHED
8011 INIT_LIST_HEAD(&rq->leaf_rt_rq_list); 7988 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
8012#ifdef CONFIG_CGROUP_SCHED 7989 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
8013 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
8014#endif
8015#endif 7990#endif
8016 7991
8017 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 7992 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
@@ -8091,8 +8066,6 @@ void __init sched_init(void)
8091 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 8066 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
8092#endif /* SMP */ 8067#endif /* SMP */
8093 8068
8094 perf_event_init();
8095
8096 scheduler_running = 1; 8069 scheduler_running = 1;
8097} 8070}
8098 8071
@@ -8286,7 +8259,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8286 if (!se) 8259 if (!se)
8287 goto err_free_rq; 8260 goto err_free_rq;
8288 8261
8289 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); 8262 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
8290 } 8263 }
8291 8264
8292 return 1; 8265 return 1;
@@ -8297,15 +8270,21 @@ err:
8297 return 0; 8270 return 0;
8298} 8271}
8299 8272
8300static inline void register_fair_sched_group(struct task_group *tg, int cpu)
8301{
8302 list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list,
8303 &cpu_rq(cpu)->leaf_cfs_rq_list);
8304}
8305
8306static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8273static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8307{ 8274{
8308 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); 8275 struct rq *rq = cpu_rq(cpu);
8276 unsigned long flags;
8277
8278 /*
8279 * Only empty task groups can be destroyed; so we can speculatively
8280 * check on_list without danger of it being re-added.
8281 */
8282 if (!tg->cfs_rq[cpu]->on_list)
8283 return;
8284
8285 raw_spin_lock_irqsave(&rq->lock, flags);
8286 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
8287 raw_spin_unlock_irqrestore(&rq->lock, flags);
8309} 8288}
 8310#else /* !CONFIG_FAIR_GROUP_SCHED */ 8289#else /* !CONFIG_FAIR_GROUP_SCHED */
8311static inline void free_fair_sched_group(struct task_group *tg) 8290static inline void free_fair_sched_group(struct task_group *tg)
@@ -8318,10 +8297,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8318 return 1; 8297 return 1;
8319} 8298}
8320 8299
8321static inline void register_fair_sched_group(struct task_group *tg, int cpu)
8322{
8323}
8324
8325static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8300static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8326{ 8301{
8327} 8302}
@@ -8376,7 +8351,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8376 if (!rt_se) 8351 if (!rt_se)
8377 goto err_free_rq; 8352 goto err_free_rq;
8378 8353
8379 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); 8354 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
8380 } 8355 }
8381 8356
8382 return 1; 8357 return 1;
@@ -8386,17 +8361,6 @@ err_free_rq:
8386err: 8361err:
8387 return 0; 8362 return 0;
8388} 8363}
8389
8390static inline void register_rt_sched_group(struct task_group *tg, int cpu)
8391{
8392 list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list,
8393 &cpu_rq(cpu)->leaf_rt_rq_list);
8394}
8395
8396static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8397{
8398 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
8399}
8400#else /* !CONFIG_RT_GROUP_SCHED */ 8364#else /* !CONFIG_RT_GROUP_SCHED */
8401static inline void free_rt_sched_group(struct task_group *tg) 8365static inline void free_rt_sched_group(struct task_group *tg)
8402{ 8366{
@@ -8407,14 +8371,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8407{ 8371{
8408 return 1; 8372 return 1;
8409} 8373}
8410
8411static inline void register_rt_sched_group(struct task_group *tg, int cpu)
8412{
8413}
8414
8415static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8416{
8417}
8418#endif /* CONFIG_RT_GROUP_SCHED */ 8374#endif /* CONFIG_RT_GROUP_SCHED */
8419 8375
8420#ifdef CONFIG_CGROUP_SCHED 8376#ifdef CONFIG_CGROUP_SCHED
@@ -8422,6 +8378,7 @@ static void free_sched_group(struct task_group *tg)
8422{ 8378{
8423 free_fair_sched_group(tg); 8379 free_fair_sched_group(tg);
8424 free_rt_sched_group(tg); 8380 free_rt_sched_group(tg);
8381 autogroup_free(tg);
8425 kfree(tg); 8382 kfree(tg);
8426} 8383}
8427 8384
@@ -8430,7 +8387,6 @@ struct task_group *sched_create_group(struct task_group *parent)
8430{ 8387{
8431 struct task_group *tg; 8388 struct task_group *tg;
8432 unsigned long flags; 8389 unsigned long flags;
8433 int i;
8434 8390
8435 tg = kzalloc(sizeof(*tg), GFP_KERNEL); 8391 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
8436 if (!tg) 8392 if (!tg)
@@ -8443,10 +8399,6 @@ struct task_group *sched_create_group(struct task_group *parent)
8443 goto err; 8399 goto err;
8444 8400
8445 spin_lock_irqsave(&task_group_lock, flags); 8401 spin_lock_irqsave(&task_group_lock, flags);
8446 for_each_possible_cpu(i) {
8447 register_fair_sched_group(tg, i);
8448 register_rt_sched_group(tg, i);
8449 }
8450 list_add_rcu(&tg->list, &task_groups); 8402 list_add_rcu(&tg->list, &task_groups);
8451 8403
8452 WARN_ON(!parent); /* root should already exist */ 8404 WARN_ON(!parent); /* root should already exist */
@@ -8476,11 +8428,11 @@ void sched_destroy_group(struct task_group *tg)
8476 unsigned long flags; 8428 unsigned long flags;
8477 int i; 8429 int i;
8478 8430
8479 spin_lock_irqsave(&task_group_lock, flags); 8431 /* end participation in shares distribution */
8480 for_each_possible_cpu(i) { 8432 for_each_possible_cpu(i)
8481 unregister_fair_sched_group(tg, i); 8433 unregister_fair_sched_group(tg, i);
8482 unregister_rt_sched_group(tg, i); 8434
8483 } 8435 spin_lock_irqsave(&task_group_lock, flags);
8484 list_del_rcu(&tg->list); 8436 list_del_rcu(&tg->list);
8485 list_del_rcu(&tg->siblings); 8437 list_del_rcu(&tg->siblings);
8486 spin_unlock_irqrestore(&task_group_lock, flags); 8438 spin_unlock_irqrestore(&task_group_lock, flags);
@@ -8527,33 +8479,6 @@ void sched_move_task(struct task_struct *tsk)
8527#endif /* CONFIG_CGROUP_SCHED */ 8479#endif /* CONFIG_CGROUP_SCHED */
8528 8480
8529#ifdef CONFIG_FAIR_GROUP_SCHED 8481#ifdef CONFIG_FAIR_GROUP_SCHED
8530static void __set_se_shares(struct sched_entity *se, unsigned long shares)
8531{
8532 struct cfs_rq *cfs_rq = se->cfs_rq;
8533 int on_rq;
8534
8535 on_rq = se->on_rq;
8536 if (on_rq)
8537 dequeue_entity(cfs_rq, se, 0);
8538
8539 se->load.weight = shares;
8540 se->load.inv_weight = 0;
8541
8542 if (on_rq)
8543 enqueue_entity(cfs_rq, se, 0);
8544}
8545
8546static void set_se_shares(struct sched_entity *se, unsigned long shares)
8547{
8548 struct cfs_rq *cfs_rq = se->cfs_rq;
8549 struct rq *rq = cfs_rq->rq;
8550 unsigned long flags;
8551
8552 raw_spin_lock_irqsave(&rq->lock, flags);
8553 __set_se_shares(se, shares);
8554 raw_spin_unlock_irqrestore(&rq->lock, flags);
8555}
8556
8557static DEFINE_MUTEX(shares_mutex); 8482static DEFINE_MUTEX(shares_mutex);
8558 8483
8559int sched_group_set_shares(struct task_group *tg, unsigned long shares) 8484int sched_group_set_shares(struct task_group *tg, unsigned long shares)
@@ -8576,37 +8501,19 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8576 if (tg->shares == shares) 8501 if (tg->shares == shares)
8577 goto done; 8502 goto done;
8578 8503
8579 spin_lock_irqsave(&task_group_lock, flags);
8580 for_each_possible_cpu(i)
8581 unregister_fair_sched_group(tg, i);
8582 list_del_rcu(&tg->siblings);
8583 spin_unlock_irqrestore(&task_group_lock, flags);
8584
8585 /* wait for any ongoing reference to this group to finish */
8586 synchronize_sched();
8587
8588 /*
8589 * Now we are free to modify the group's share on each cpu
8590 * w/o tripping rebalance_share or load_balance_fair.
8591 */
8592 tg->shares = shares; 8504 tg->shares = shares;
8593 for_each_possible_cpu(i) { 8505 for_each_possible_cpu(i) {
8594 /* 8506 struct rq *rq = cpu_rq(i);
8595 * force a rebalance 8507 struct sched_entity *se;
8596 */ 8508
8597 cfs_rq_set_shares(tg->cfs_rq[i], 0); 8509 se = tg->se[i];
8598 set_se_shares(tg->se[i], shares); 8510 /* Propagate contribution to hierarchy */
8511 raw_spin_lock_irqsave(&rq->lock, flags);
8512 for_each_sched_entity(se)
8513 update_cfs_shares(group_cfs_rq(se), 0);
8514 raw_spin_unlock_irqrestore(&rq->lock, flags);
8599 } 8515 }
8600 8516
8601 /*
8602 * Enable load balance activity on this group, by inserting it back on
8603 * each cpu's rq->leaf_cfs_rq_list.
8604 */
8605 spin_lock_irqsave(&task_group_lock, flags);
8606 for_each_possible_cpu(i)
8607 register_fair_sched_group(tg, i);
8608 list_add_rcu(&tg->siblings, &tg->parent->children);
8609 spin_unlock_irqrestore(&task_group_lock, flags);
8610done: 8517done:
8611 mutex_unlock(&shares_mutex); 8518 mutex_unlock(&shares_mutex);
8612 return 0; 8519 return 0;
@@ -8905,7 +8812,7 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
8905 8812
8906 if (!cgrp->parent) { 8813 if (!cgrp->parent) {
8907 /* This is early initialization for the top cgroup */ 8814 /* This is early initialization for the top cgroup */
8908 return &init_task_group.css; 8815 return &root_task_group.css;
8909 } 8816 }
8910 8817
8911 parent = cgroup_tg(cgrp->parent); 8818 parent = cgroup_tg(cgrp->parent);
@@ -8976,6 +8883,20 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
8976 } 8883 }
8977} 8884}
8978 8885
8886static void
8887cpu_cgroup_exit(struct cgroup_subsys *ss, struct task_struct *task)
8888{
8889 /*
8890 * cgroup_exit() is called in the copy_process() failure path.
 8891	 * Ignore this case since the task hasn't run yet; this avoids
 8892	 * trying to poke a half-freed task state from generic code.
8893 */
8894 if (!(task->flags & PF_EXITING))
8895 return;
8896
8897 sched_move_task(task);
8898}
8899
8979#ifdef CONFIG_FAIR_GROUP_SCHED 8900#ifdef CONFIG_FAIR_GROUP_SCHED
8980static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, 8901static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
8981 u64 shareval) 8902 u64 shareval)
@@ -9048,6 +8969,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {
9048 .destroy = cpu_cgroup_destroy, 8969 .destroy = cpu_cgroup_destroy,
9049 .can_attach = cpu_cgroup_can_attach, 8970 .can_attach = cpu_cgroup_can_attach,
9050 .attach = cpu_cgroup_attach, 8971 .attach = cpu_cgroup_attach,
8972 .exit = cpu_cgroup_exit,
9051 .populate = cpu_cgroup_populate, 8973 .populate = cpu_cgroup_populate,
9052 .subsys_id = cpu_cgroup_subsys_id, 8974 .subsys_id = cpu_cgroup_subsys_id,
9053 .early_init = 1, 8975 .early_init = 1,
@@ -9332,72 +9254,3 @@ struct cgroup_subsys cpuacct_subsys = {
9332}; 9254};
9333#endif /* CONFIG_CGROUP_CPUACCT */ 9255#endif /* CONFIG_CGROUP_CPUACCT */
9334 9256
9335#ifndef CONFIG_SMP
9336
9337void synchronize_sched_expedited(void)
9338{
9339 barrier();
9340}
9341EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
9342
9343#else /* #ifndef CONFIG_SMP */
9344
9345static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);
9346
9347static int synchronize_sched_expedited_cpu_stop(void *data)
9348{
9349 /*
9350 * There must be a full memory barrier on each affected CPU
9351 * between the time that try_stop_cpus() is called and the
9352 * time that it returns.
9353 *
9354 * In the current initial implementation of cpu_stop, the
9355 * above condition is already met when the control reaches
9356 * this point and the following smp_mb() is not strictly
9357 * necessary. Do smp_mb() anyway for documentation and
9358 * robustness against future implementation changes.
9359 */
9360 smp_mb(); /* See above comment block. */
9361 return 0;
9362}
9363
9364/*
9365 * Wait for an rcu-sched grace period to elapse, but use "big hammer"
9366 * approach to force grace period to end quickly. This consumes
9367 * significant time on all CPUs, and is thus not recommended for
9368 * any sort of common-case code.
9369 *
9370 * Note that it is illegal to call this function while holding any
9371 * lock that is acquired by a CPU-hotplug notifier. Failing to
9372 * observe this restriction will result in deadlock.
9373 */
9374void synchronize_sched_expedited(void)
9375{
9376 int snap, trycount = 0;
9377
9378 smp_mb(); /* ensure prior mod happens before capturing snap. */
9379 snap = atomic_read(&synchronize_sched_expedited_count) + 1;
9380 get_online_cpus();
9381 while (try_stop_cpus(cpu_online_mask,
9382 synchronize_sched_expedited_cpu_stop,
9383 NULL) == -EAGAIN) {
9384 put_online_cpus();
9385 if (trycount++ < 10)
9386 udelay(trycount * num_online_cpus());
9387 else {
9388 synchronize_sched();
9389 return;
9390 }
9391 if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) {
9392 smp_mb(); /* ensure test happens before caller kfree */
9393 return;
9394 }
9395 get_online_cpus();
9396 }
9397 atomic_inc(&synchronize_sched_expedited_count);
9398 smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */
9399 put_online_cpus();
9400}
9401EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
9402
9403#endif /* #else #ifndef CONFIG_SMP */
diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c
new file mode 100644
index 000000000000..9fb656283157
--- /dev/null
+++ b/kernel/sched_autogroup.c
@@ -0,0 +1,270 @@
1#ifdef CONFIG_SCHED_AUTOGROUP
2
3#include <linux/proc_fs.h>
4#include <linux/seq_file.h>
5#include <linux/kallsyms.h>
6#include <linux/utsname.h>
7
8unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
9static struct autogroup autogroup_default;
10static atomic_t autogroup_seq_nr;
11
12static void __init autogroup_init(struct task_struct *init_task)
13{
14 autogroup_default.tg = &root_task_group;
15 root_task_group.autogroup = &autogroup_default;
16 kref_init(&autogroup_default.kref);
17 init_rwsem(&autogroup_default.lock);
18 init_task->signal->autogroup = &autogroup_default;
19}
20
21static inline void autogroup_free(struct task_group *tg)
22{
23 kfree(tg->autogroup);
24}
25
26static inline void autogroup_destroy(struct kref *kref)
27{
28 struct autogroup *ag = container_of(kref, struct autogroup, kref);
29
30#ifdef CONFIG_RT_GROUP_SCHED
31 /* We've redirected RT tasks to the root task group... */
32 ag->tg->rt_se = NULL;
33 ag->tg->rt_rq = NULL;
34#endif
35 sched_destroy_group(ag->tg);
36}
37
38static inline void autogroup_kref_put(struct autogroup *ag)
39{
40 kref_put(&ag->kref, autogroup_destroy);
41}
42
43static inline struct autogroup *autogroup_kref_get(struct autogroup *ag)
44{
45 kref_get(&ag->kref);
46 return ag;
47}
48
49static inline struct autogroup *autogroup_task_get(struct task_struct *p)
50{
51 struct autogroup *ag;
52 unsigned long flags;
53
54 if (!lock_task_sighand(p, &flags))
55 return autogroup_kref_get(&autogroup_default);
56
57 ag = autogroup_kref_get(p->signal->autogroup);
58 unlock_task_sighand(p, &flags);
59
60 return ag;
61}
62
63#ifdef CONFIG_RT_GROUP_SCHED
64static void free_rt_sched_group(struct task_group *tg);
65#endif
66
67static inline struct autogroup *autogroup_create(void)
68{
69 struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL);
70 struct task_group *tg;
71
72 if (!ag)
73 goto out_fail;
74
75 tg = sched_create_group(&root_task_group);
76
77 if (IS_ERR(tg))
78 goto out_free;
79
80 kref_init(&ag->kref);
81 init_rwsem(&ag->lock);
82 ag->id = atomic_inc_return(&autogroup_seq_nr);
83 ag->tg = tg;
84#ifdef CONFIG_RT_GROUP_SCHED
85 /*
86 * Autogroup RT tasks are redirected to the root task group
87 * so we don't have to move tasks around upon policy change,
88 * or flail around trying to allocate bandwidth on the fly.
89 * A bandwidth exception in __sched_setscheduler() allows
90 * the policy change to proceed. Thereafter, task_group()
91 * returns &root_task_group, so zero bandwidth is required.
92 */
93 free_rt_sched_group(tg);
94 tg->rt_se = root_task_group.rt_se;
95 tg->rt_rq = root_task_group.rt_rq;
96#endif
97 tg->autogroup = ag;
98
99 return ag;
100
101out_free:
102 kfree(ag);
103out_fail:
104 if (printk_ratelimit()) {
105 printk(KERN_WARNING "autogroup_create: %s failure.\n",
106 ag ? "sched_create_group()" : "kmalloc()");
107 }
108
109 return autogroup_kref_get(&autogroup_default);
110}
111
112static inline bool
113task_wants_autogroup(struct task_struct *p, struct task_group *tg)
114{
115 if (tg != &root_task_group)
116 return false;
117
118 if (p->sched_class != &fair_sched_class)
119 return false;
120
121 /*
122 * We can only assume the task group can't go away on us if
123 * autogroup_move_group() can see us on ->thread_group list.
124 */
125 if (p->flags & PF_EXITING)
126 return false;
127
128 return true;
129}
130
131static inline bool task_group_is_autogroup(struct task_group *tg)
132{
133 return tg != &root_task_group && tg->autogroup;
134}
135
136static inline struct task_group *
137autogroup_task_group(struct task_struct *p, struct task_group *tg)
138{
139 int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
140
141 if (enabled && task_wants_autogroup(p, tg))
142 return p->signal->autogroup->tg;
143
144 return tg;
145}
146
147static void
148autogroup_move_group(struct task_struct *p, struct autogroup *ag)
149{
150 struct autogroup *prev;
151 struct task_struct *t;
152 unsigned long flags;
153
154 BUG_ON(!lock_task_sighand(p, &flags));
155
156 prev = p->signal->autogroup;
157 if (prev == ag) {
158 unlock_task_sighand(p, &flags);
159 return;
160 }
161
162 p->signal->autogroup = autogroup_kref_get(ag);
163
164 t = p;
165 do {
166 sched_move_task(t);
167 } while_each_thread(p, t);
168
169 unlock_task_sighand(p, &flags);
170 autogroup_kref_put(prev);
171}
172
 173/* Allocates with GFP_KERNEL; cannot be called under any spinlock */
174void sched_autogroup_create_attach(struct task_struct *p)
175{
176 struct autogroup *ag = autogroup_create();
177
178 autogroup_move_group(p, ag);
 179	/* drop extra reference added by autogroup_create() */
180 autogroup_kref_put(ag);
181}
182EXPORT_SYMBOL(sched_autogroup_create_attach);
183
184/* Cannot be called under siglock. Currently has no users */
185void sched_autogroup_detach(struct task_struct *p)
186{
187 autogroup_move_group(p, &autogroup_default);
188}
189EXPORT_SYMBOL(sched_autogroup_detach);
190
191void sched_autogroup_fork(struct signal_struct *sig)
192{
193 sig->autogroup = autogroup_task_get(current);
194}
195
196void sched_autogroup_exit(struct signal_struct *sig)
197{
198 autogroup_kref_put(sig->autogroup);
199}
200
201static int __init setup_autogroup(char *str)
202{
203 sysctl_sched_autogroup_enabled = 0;
204
205 return 1;
206}
207
208__setup("noautogroup", setup_autogroup);
209
210#ifdef CONFIG_PROC_FS
211
212int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice)
213{
214 static unsigned long next = INITIAL_JIFFIES;
215 struct autogroup *ag;
216 int err;
217
218 if (*nice < -20 || *nice > 19)
219 return -EINVAL;
220
221 err = security_task_setnice(current, *nice);
222 if (err)
223 return err;
224
225 if (*nice < 0 && !can_nice(current, *nice))
226 return -EPERM;
227
228 /* this is a heavy operation taking global locks.. */
229 if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next))
230 return -EAGAIN;
231
232 next = HZ / 10 + jiffies;
233 ag = autogroup_task_get(p);
234
235 down_write(&ag->lock);
236 err = sched_group_set_shares(ag->tg, prio_to_weight[*nice + 20]);
237 if (!err)
238 ag->nice = *nice;
239 up_write(&ag->lock);
240
241 autogroup_kref_put(ag);
242
243 return err;
244}
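Roughly speaking, a write such as echo 10 > /proc/<pid>/autogroup lands here and becomes sched_group_set_shares(ag->tg, prio_to_weight[30]). A sketch of that mapping, assuming the prio_to_weight[] table defined earlier in sched.c (sample entries: nice -10 -> 9548, 0 -> 1024, 10 -> 110, 19 -> 15); demo_autogroup_shares() is illustrative only:

#include <linux/errno.h>

static int demo_autogroup_shares(int nice)
{
	if (nice < -20 || nice > 19)
		return -EINVAL;

	return prio_to_weight[nice + 20];	/* becomes ag->tg->shares */
}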
245
246void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m)
247{
248 struct autogroup *ag = autogroup_task_get(p);
249
250 down_read(&ag->lock);
251 seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice);
252 up_read(&ag->lock);
253
254 autogroup_kref_put(ag);
255}
256#endif /* CONFIG_PROC_FS */
257
258#ifdef CONFIG_SCHED_DEBUG
259static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
260{
261 int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
262
263 if (!enabled || !tg->autogroup)
264 return 0;
265
266 return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
267}
268#endif /* CONFIG_SCHED_DEBUG */
269
270#endif /* CONFIG_SCHED_AUTOGROUP */
diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h
new file mode 100644
index 000000000000..7b859ffe5dad
--- /dev/null
+++ b/kernel/sched_autogroup.h
@@ -0,0 +1,36 @@
1#ifdef CONFIG_SCHED_AUTOGROUP
2
3struct autogroup {
4 struct kref kref;
5 struct task_group *tg;
6 struct rw_semaphore lock;
7 unsigned long id;
8 int nice;
9};
10
11static inline struct task_group *
12autogroup_task_group(struct task_struct *p, struct task_group *tg);
13
14#else /* !CONFIG_SCHED_AUTOGROUP */
15
16static inline void autogroup_init(struct task_struct *init_task) { }
17static inline void autogroup_free(struct task_group *tg) { }
18static inline bool task_group_is_autogroup(struct task_group *tg)
19{
20 return 0;
21}
22
23static inline struct task_group *
24autogroup_task_group(struct task_struct *p, struct task_group *tg)
25{
26 return tg;
27}
28
29#ifdef CONFIG_SCHED_DEBUG
30static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
31{
32 return 0;
33}
34#endif
35
36#endif /* CONFIG_SCHED_AUTOGROUP */
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 52f1a149bfb1..9d8af0b3fb64 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -79,7 +79,7 @@ unsigned long long __attribute__((weak)) sched_clock(void)
79} 79}
80EXPORT_SYMBOL_GPL(sched_clock); 80EXPORT_SYMBOL_GPL(sched_clock);
81 81
82static __read_mostly int sched_clock_running; 82__read_mostly int sched_clock_running;
83 83
84#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK 84#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
85__read_mostly int sched_clock_stable; 85__read_mostly int sched_clock_stable;
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 2e1b0d17dd9b..eb6cb8edd075 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -16,6 +16,8 @@
16#include <linux/kallsyms.h> 16#include <linux/kallsyms.h>
17#include <linux/utsname.h> 17#include <linux/utsname.h>
18 18
19static DEFINE_SPINLOCK(sched_debug_lock);
20
19/* 21/*
20 * This allows printing both to /proc/sched_debug and 22 * This allows printing both to /proc/sched_debug and
21 * to the console 23 * to the console
@@ -54,8 +56,7 @@ static unsigned long nsec_low(unsigned long long nsec)
54#define SPLIT_NS(x) nsec_high(x), nsec_low(x) 56#define SPLIT_NS(x) nsec_high(x), nsec_low(x)
55 57
56#ifdef CONFIG_FAIR_GROUP_SCHED 58#ifdef CONFIG_FAIR_GROUP_SCHED
57static void print_cfs_group_stats(struct seq_file *m, int cpu, 59static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg)
58 struct task_group *tg)
59{ 60{
60 struct sched_entity *se = tg->se[cpu]; 61 struct sched_entity *se = tg->se[cpu];
61 if (!se) 62 if (!se)
@@ -87,6 +88,26 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu,
87} 88}
88#endif 89#endif
89 90
91#ifdef CONFIG_CGROUP_SCHED
92static char group_path[PATH_MAX];
93
94static char *task_group_path(struct task_group *tg)
95{
96 if (autogroup_path(tg, group_path, PATH_MAX))
97 return group_path;
98
99 /*
100 * May be NULL if the underlying cgroup isn't fully-created yet
101 */
102 if (!tg->css.cgroup) {
103 group_path[0] = '\0';
104 return group_path;
105 }
106 cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
107 return group_path;
108}
109#endif
110
90static void 111static void
91print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) 112print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
92{ 113{
@@ -109,17 +130,10 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
109 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", 130 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
110 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); 131 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
111#endif 132#endif
112
113#ifdef CONFIG_CGROUP_SCHED 133#ifdef CONFIG_CGROUP_SCHED
114 { 134 SEQ_printf(m, " %s", task_group_path(task_group(p)));
115 char path[64];
116
117 rcu_read_lock();
118 cgroup_path(task_group(p)->css.cgroup, path, sizeof(path));
119 rcu_read_unlock();
120 SEQ_printf(m, " %s", path);
121 }
122#endif 135#endif
136
123 SEQ_printf(m, "\n"); 137 SEQ_printf(m, "\n");
124} 138}
125 139
@@ -147,19 +161,6 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
147 read_unlock_irqrestore(&tasklist_lock, flags); 161 read_unlock_irqrestore(&tasklist_lock, flags);
148} 162}
149 163
150#if defined(CONFIG_CGROUP_SCHED) && \
151 (defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED))
152static void task_group_path(struct task_group *tg, char *buf, int buflen)
153{
154 /* may be NULL if the underlying cgroup isn't fully-created yet */
155 if (!tg->css.cgroup) {
156 buf[0] = '\0';
157 return;
158 }
159 cgroup_path(tg->css.cgroup, buf, buflen);
160}
161#endif
162
163void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) 164void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
164{ 165{
165 s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, 166 s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
@@ -168,13 +169,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
168 struct sched_entity *last; 169 struct sched_entity *last;
169 unsigned long flags; 170 unsigned long flags;
170 171
171#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED) 172#ifdef CONFIG_FAIR_GROUP_SCHED
172 char path[128]; 173 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg));
173 struct task_group *tg = cfs_rq->tg;
174
175 task_group_path(tg, path, sizeof(path));
176
177 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
178#else 174#else
179 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); 175 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
180#endif 176#endif
@@ -202,33 +198,34 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
202 spread0 = min_vruntime - rq0_min_vruntime; 198 spread0 = min_vruntime - rq0_min_vruntime;
203 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0", 199 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0",
204 SPLIT_NS(spread0)); 200 SPLIT_NS(spread0));
205 SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
206 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
207
208 SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", 201 SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
209 cfs_rq->nr_spread_over); 202 cfs_rq->nr_spread_over);
203 SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
204 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
210#ifdef CONFIG_FAIR_GROUP_SCHED 205#ifdef CONFIG_FAIR_GROUP_SCHED
211#ifdef CONFIG_SMP 206#ifdef CONFIG_SMP
212 SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares); 207 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_avg",
208 SPLIT_NS(cfs_rq->load_avg));
209 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_period",
210 SPLIT_NS(cfs_rq->load_period));
211 SEQ_printf(m, " .%-30s: %ld\n", "load_contrib",
212 cfs_rq->load_contribution);
213 SEQ_printf(m, " .%-30s: %d\n", "load_tg",
214 atomic_read(&cfs_rq->tg->load_weight));
213#endif 215#endif
216
214 print_cfs_group_stats(m, cpu, cfs_rq->tg); 217 print_cfs_group_stats(m, cpu, cfs_rq->tg);
215#endif 218#endif
216} 219}
217 220
218void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) 221void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
219{ 222{
220#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED) 223#ifdef CONFIG_RT_GROUP_SCHED
221 char path[128]; 224 SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg));
222 struct task_group *tg = rt_rq->tg;
223
224 task_group_path(tg, path, sizeof(path));
225
226 SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path);
227#else 225#else
228 SEQ_printf(m, "\nrt_rq[%d]:\n", cpu); 226 SEQ_printf(m, "\nrt_rq[%d]:\n", cpu);
229#endif 227#endif
230 228
231
232#define P(x) \ 229#define P(x) \
233 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x)) 230 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x))
234#define PN(x) \ 231#define PN(x) \
@@ -243,9 +240,12 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
243#undef P 240#undef P
244} 241}
245 242
243extern __read_mostly int sched_clock_running;
244
246static void print_cpu(struct seq_file *m, int cpu) 245static void print_cpu(struct seq_file *m, int cpu)
247{ 246{
248 struct rq *rq = cpu_rq(cpu); 247 struct rq *rq = cpu_rq(cpu);
248 unsigned long flags;
249 249
250#ifdef CONFIG_X86 250#ifdef CONFIG_X86
251 { 251 {
@@ -296,14 +296,20 @@ static void print_cpu(struct seq_file *m, int cpu)
296 P(ttwu_count); 296 P(ttwu_count);
297 P(ttwu_local); 297 P(ttwu_local);
298 298
299 P(bkl_count); 299 SEQ_printf(m, " .%-30s: %d\n", "bkl_count",
300 rq->rq_sched_info.bkl_count);
300 301
301#undef P 302#undef P
303#undef P64
302#endif 304#endif
305 spin_lock_irqsave(&sched_debug_lock, flags);
303 print_cfs_stats(m, cpu); 306 print_cfs_stats(m, cpu);
304 print_rt_stats(m, cpu); 307 print_rt_stats(m, cpu);
305 308
309 rcu_read_lock();
306 print_rq(m, rq, cpu); 310 print_rq(m, rq, cpu);
311 rcu_read_unlock();
312 spin_unlock_irqrestore(&sched_debug_lock, flags);
307} 313}
308 314
309static const char *sched_tunable_scaling_names[] = { 315static const char *sched_tunable_scaling_names[] = {
@@ -314,21 +320,42 @@ static const char *sched_tunable_scaling_names[] = {
314 320
315static int sched_debug_show(struct seq_file *m, void *v) 321static int sched_debug_show(struct seq_file *m, void *v)
316{ 322{
317 u64 now = ktime_to_ns(ktime_get()); 323 u64 ktime, sched_clk, cpu_clk;
324 unsigned long flags;
318 int cpu; 325 int cpu;
319 326
320 SEQ_printf(m, "Sched Debug Version: v0.09, %s %.*s\n", 327 local_irq_save(flags);
328 ktime = ktime_to_ns(ktime_get());
329 sched_clk = sched_clock();
330 cpu_clk = local_clock();
331 local_irq_restore(flags);
332
333 SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n",
321 init_utsname()->release, 334 init_utsname()->release,
322 (int)strcspn(init_utsname()->version, " "), 335 (int)strcspn(init_utsname()->version, " "),
323 init_utsname()->version); 336 init_utsname()->version);
324 337
325 SEQ_printf(m, "now at %Lu.%06ld msecs\n", SPLIT_NS(now)); 338#define P(x) \
339 SEQ_printf(m, "%-40s: %Ld\n", #x, (long long)(x))
340#define PN(x) \
341 SEQ_printf(m, "%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
342 PN(ktime);
343 PN(sched_clk);
344 PN(cpu_clk);
345 P(jiffies);
346#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
347 P(sched_clock_stable);
348#endif
349#undef PN
350#undef P
351
352 SEQ_printf(m, "\n");
353 SEQ_printf(m, "sysctl_sched\n");
326 354
327#define P(x) \ 355#define P(x) \
328 SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x)) 356 SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x))
329#define PN(x) \ 357#define PN(x) \
330 SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) 358 SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
331 P(jiffies);
332 PN(sysctl_sched_latency); 359 PN(sysctl_sched_latency);
333 PN(sysctl_sched_min_granularity); 360 PN(sysctl_sched_min_granularity);
334 PN(sysctl_sched_wakeup_granularity); 361 PN(sysctl_sched_wakeup_granularity);
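
The new P()/PN() macros in sched_debug_show() print the sampled ktime, sched_clock() and local_clock() values, with SPLIT_NS() splitting a nanosecond count into the two fields consumed by the "%Ld.%06ld" format. A small standalone sketch of that split, assuming the usual divide-by-1e6 definition of nsec_high()/nsec_low(); the helper names below are placeholders, not the kernel's.

#include <stdio.h>

/* Illustrative split of a nanosecond count into the "high.remainder"
 * pair printed with "%lld.%06lld". */
static long long ns_high(long long ns) { return ns / 1000000; }
static long long ns_low(long long ns)  { return ns % 1000000; }

int main(void)
{
        long long ktime_ns = 7123456789LL;      /* example clock reading */

        printf("ktime: %lld.%06lld\n", ns_high(ktime_ns), ns_low(ktime_ns));
        return 0;
}
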
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index f4f6a8326dd0..77e9166d7bbf 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -89,6 +89,13 @@ unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
89 89
90const_debug unsigned int sysctl_sched_migration_cost = 500000UL; 90const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
91 91
92/*
93 * The exponential sliding window over which load is averaged for shares
94 * distribution.
95 * (default: 10msec)
96 */
97unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
98
92static const struct sched_class fair_sched_class; 99static const struct sched_class fair_sched_class;
93 100
94/************************************************************** 101/**************************************************************
@@ -143,6 +150,36 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
143 return cfs_rq->tg->cfs_rq[this_cpu]; 150 return cfs_rq->tg->cfs_rq[this_cpu];
144} 151}
145 152
153static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
154{
155 if (!cfs_rq->on_list) {
156 /*
157 * Ensure we either appear before our parent (if already
158 * enqueued) or force our parent to appear after us when it is
159 * enqueued. The fact that we always enqueue bottom-up
160 * reduces this to two cases.
161 */
162 if (cfs_rq->tg->parent &&
163 cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
164 list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
165 &rq_of(cfs_rq)->leaf_cfs_rq_list);
166 } else {
167 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
168 &rq_of(cfs_rq)->leaf_cfs_rq_list);
169 }
170
171 cfs_rq->on_list = 1;
172 }
173}
174
175static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
176{
177 if (cfs_rq->on_list) {
178 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
179 cfs_rq->on_list = 0;
180 }
181}
182
146/* Iterate through all leaf cfs_rq's on a runqueue */ 183/* Iterate through all leaf cfs_rq's on a runqueue */
147#define for_each_leaf_cfs_rq(rq, cfs_rq) \ 184#define for_each_leaf_cfs_rq(rq, cfs_rq) \
148 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) 185 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
@@ -246,6 +283,14 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
246 return &cpu_rq(this_cpu)->cfs; 283 return &cpu_rq(this_cpu)->cfs;
247} 284}
248 285
286static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
287{
288}
289
290static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
291{
292}
293
249#define for_each_leaf_cfs_rq(rq, cfs_rq) \ 294#define for_each_leaf_cfs_rq(rq, cfs_rq) \
250 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) 295 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
251 296
@@ -417,7 +462,6 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
417 WRT_SYSCTL(sched_min_granularity); 462 WRT_SYSCTL(sched_min_granularity);
418 WRT_SYSCTL(sched_latency); 463 WRT_SYSCTL(sched_latency);
419 WRT_SYSCTL(sched_wakeup_granularity); 464 WRT_SYSCTL(sched_wakeup_granularity);
420 WRT_SYSCTL(sched_shares_ratelimit);
421#undef WRT_SYSCTL 465#undef WRT_SYSCTL
422 466
423 return 0; 467 return 0;
@@ -495,6 +539,9 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
495 return calc_delta_fair(sched_slice(cfs_rq, se), se); 539 return calc_delta_fair(sched_slice(cfs_rq, se), se);
496} 540}
497 541
542static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
543static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta);
544
498/* 545/*
499 * Update the current task's runtime statistics. Skip current tasks that 546 * Update the current task's runtime statistics. Skip current tasks that
500 * are not in our scheduling class. 547 * are not in our scheduling class.
@@ -514,6 +561,10 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
514 561
515 curr->vruntime += delta_exec_weighted; 562 curr->vruntime += delta_exec_weighted;
516 update_min_vruntime(cfs_rq); 563 update_min_vruntime(cfs_rq);
564
565#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
566 cfs_rq->load_unacc_exec_time += delta_exec;
567#endif
517} 568}
518 569
519static void update_curr(struct cfs_rq *cfs_rq) 570static void update_curr(struct cfs_rq *cfs_rq)
@@ -633,7 +684,6 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
633 list_add(&se->group_node, &cfs_rq->tasks); 684 list_add(&se->group_node, &cfs_rq->tasks);
634 } 685 }
635 cfs_rq->nr_running++; 686 cfs_rq->nr_running++;
636 se->on_rq = 1;
637} 687}
638 688
639static void 689static void
@@ -647,9 +697,140 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
647 list_del_init(&se->group_node); 697 list_del_init(&se->group_node);
648 } 698 }
649 cfs_rq->nr_running--; 699 cfs_rq->nr_running--;
650 se->on_rq = 0;
651} 700}
652 701
702#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
703static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
704 int global_update)
705{
706 struct task_group *tg = cfs_rq->tg;
707 long load_avg;
708
709 load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
710 load_avg -= cfs_rq->load_contribution;
711
712 if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) {
713 atomic_add(load_avg, &tg->load_weight);
714 cfs_rq->load_contribution += load_avg;
715 }
716}
717
718static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
719{
720 u64 period = sysctl_sched_shares_window;
721 u64 now, delta;
722 unsigned long load = cfs_rq->load.weight;
723
724 if (!cfs_rq)
725 return;
726
727 now = rq_of(cfs_rq)->clock;
728 delta = now - cfs_rq->load_stamp;
729
730 /* truncate load history at 4 idle periods */
731 if (cfs_rq->load_stamp > cfs_rq->load_last &&
732 now - cfs_rq->load_last > 4 * period) {
733 cfs_rq->load_period = 0;
734 cfs_rq->load_avg = 0;
735 }
736
737 cfs_rq->load_stamp = now;
738 cfs_rq->load_unacc_exec_time = 0;
739 cfs_rq->load_period += delta;
740 if (load) {
741 cfs_rq->load_last = now;
742 cfs_rq->load_avg += delta * load;
743 }
744
745 /* consider updating load contribution on each fold or truncate */
746 if (global_update || cfs_rq->load_period > period
747 || !cfs_rq->load_period)
748 update_cfs_rq_load_contribution(cfs_rq, global_update);
749
750 while (cfs_rq->load_period > period) {
751 /*
752 * Inline assembly required to prevent the compiler
753 * optimising this loop into a divmod call.
754 * See __iter_div_u64_rem() for another example of this.
755 */
756 asm("" : "+rm" (cfs_rq->load_period));
757 cfs_rq->load_period /= 2;
758 cfs_rq->load_avg /= 2;
759 }
760
761 if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg)
762 list_del_leaf_cfs_rq(cfs_rq);
763}
764
765static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
766 unsigned long weight)
767{
768 if (se->on_rq) {
769 /* commit outstanding execution time */
770 if (cfs_rq->curr == se)
771 update_curr(cfs_rq);
772 account_entity_dequeue(cfs_rq, se);
773 }
774
775 update_load_set(&se->load, weight);
776
777 if (se->on_rq)
778 account_entity_enqueue(cfs_rq, se);
779}
780
781static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
782{
783 struct task_group *tg;
784 struct sched_entity *se;
785 long load_weight, load, shares;
786
787 if (!cfs_rq)
788 return;
789
790 tg = cfs_rq->tg;
791 se = tg->se[cpu_of(rq_of(cfs_rq))];
792 if (!se)
793 return;
794
795 load = cfs_rq->load.weight + weight_delta;
796
797 load_weight = atomic_read(&tg->load_weight);
798 load_weight -= cfs_rq->load_contribution;
799 load_weight += load;
800
801 shares = (tg->shares * load);
802 if (load_weight)
803 shares /= load_weight;
804
805 if (shares < MIN_SHARES)
806 shares = MIN_SHARES;
807 if (shares > tg->shares)
808 shares = tg->shares;
809
810 reweight_entity(cfs_rq_of(se), se, shares);
811}
812
813static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
814{
815 if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
816 update_cfs_load(cfs_rq, 0);
817 update_cfs_shares(cfs_rq, 0);
818 }
819}
820#else /* CONFIG_FAIR_GROUP_SCHED */
821static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
822{
823}
824
825static inline void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
826{
827}
828
829static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
830{
831}
832#endif /* CONFIG_FAIR_GROUP_SCHED */
833
653static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) 834static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
654{ 835{
655#ifdef CONFIG_SCHEDSTATS 836#ifdef CONFIG_SCHEDSTATS
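
update_cfs_load() above averages queue load over sysctl_sched_shares_window by accumulating weight*delta into load_avg, then halving both load_period and load_avg whenever the accumulated period exceeds the window, so older history decays geometrically; update_cfs_rq_load_contribution() folds the resulting load_avg/(load_period+1) ratio into tg->load_weight. The toy model below reproduces that fold in userspace; struct toy_cfs, account() and the constants are illustrative only.

#include <stdio.h>
#include <stdint.h>

#define WINDOW_NS 10000000ULL   /* 10 ms, mirroring sysctl_sched_shares_window */

struct toy_cfs {
        uint64_t load_avg;
        uint64_t load_period;
};

/* Accumulate weight*time, then halve both period and average whenever the
 * period exceeds the window, so old history decays geometrically. */
static void account(struct toy_cfs *c, uint64_t delta_ns, unsigned long weight)
{
        c->load_period += delta_ns;
        c->load_avg    += delta_ns * weight;

        while (c->load_period > WINDOW_NS) {
                c->load_period /= 2;
                c->load_avg    /= 2;
        }
}

int main(void)
{
        struct toy_cfs c = { 0, 0 };

        for (int i = 0; i < 8; i++) {
                account(&c, 3000000, 1024);     /* 3 ms slices at weight 1024 */
                printf("period=%llu avg=%llu avg/(period+1)=%llu\n",
                       (unsigned long long)c.load_period,
                       (unsigned long long)c.load_avg,
                       (unsigned long long)(c.load_avg / (c.load_period + 1)));
        }
        return 0;
}
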
@@ -771,6 +952,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
771 * Update run-time statistics of the 'current'. 952 * Update run-time statistics of the 'current'.
772 */ 953 */
773 update_curr(cfs_rq); 954 update_curr(cfs_rq);
955 update_cfs_load(cfs_rq, 0);
956 update_cfs_shares(cfs_rq, se->load.weight);
774 account_entity_enqueue(cfs_rq, se); 957 account_entity_enqueue(cfs_rq, se);
775 958
776 if (flags & ENQUEUE_WAKEUP) { 959 if (flags & ENQUEUE_WAKEUP) {
@@ -782,6 +965,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
782 check_spread(cfs_rq, se); 965 check_spread(cfs_rq, se);
783 if (se != cfs_rq->curr) 966 if (se != cfs_rq->curr)
784 __enqueue_entity(cfs_rq, se); 967 __enqueue_entity(cfs_rq, se);
968 se->on_rq = 1;
969
970 if (cfs_rq->nr_running == 1)
971 list_add_leaf_cfs_rq(cfs_rq);
785} 972}
786 973
787static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) 974static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -825,8 +1012,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
825 1012
826 if (se != cfs_rq->curr) 1013 if (se != cfs_rq->curr)
827 __dequeue_entity(cfs_rq, se); 1014 __dequeue_entity(cfs_rq, se);
1015 se->on_rq = 0;
1016 update_cfs_load(cfs_rq, 0);
828 account_entity_dequeue(cfs_rq, se); 1017 account_entity_dequeue(cfs_rq, se);
829 update_min_vruntime(cfs_rq); 1018 update_min_vruntime(cfs_rq);
1019 update_cfs_shares(cfs_rq, 0);
830 1020
831 /* 1021 /*
832 * Normalize the entity after updating the min_vruntime because the 1022 * Normalize the entity after updating the min_vruntime because the
@@ -872,6 +1062,9 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
872 struct sched_entity *se = __pick_next_entity(cfs_rq); 1062 struct sched_entity *se = __pick_next_entity(cfs_rq);
873 s64 delta = curr->vruntime - se->vruntime; 1063 s64 delta = curr->vruntime - se->vruntime;
874 1064
1065 if (delta < 0)
1066 return;
1067
875 if (delta > ideal_runtime) 1068 if (delta > ideal_runtime)
876 resched_task(rq_of(cfs_rq)->curr); 1069 resched_task(rq_of(cfs_rq)->curr);
877 } 1070 }
@@ -955,6 +1148,11 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
955 */ 1148 */
956 update_curr(cfs_rq); 1149 update_curr(cfs_rq);
957 1150
1151 /*
1152 * Update share accounting for long-running entities.
1153 */
1154 update_entity_shares_tick(cfs_rq);
1155
958#ifdef CONFIG_SCHED_HRTICK 1156#ifdef CONFIG_SCHED_HRTICK
959 /* 1157 /*
960 * queued ticks are scheduled to match the slice, so don't bother 1158 * queued ticks are scheduled to match the slice, so don't bother
@@ -1055,6 +1253,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1055 flags = ENQUEUE_WAKEUP; 1253 flags = ENQUEUE_WAKEUP;
1056 } 1254 }
1057 1255
1256 for_each_sched_entity(se) {
1257 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1258
1259 update_cfs_load(cfs_rq, 0);
1260 update_cfs_shares(cfs_rq, 0);
1261 }
1262
1058 hrtick_update(rq); 1263 hrtick_update(rq);
1059} 1264}
1060 1265
@@ -1071,12 +1276,20 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1071 for_each_sched_entity(se) { 1276 for_each_sched_entity(se) {
1072 cfs_rq = cfs_rq_of(se); 1277 cfs_rq = cfs_rq_of(se);
1073 dequeue_entity(cfs_rq, se, flags); 1278 dequeue_entity(cfs_rq, se, flags);
1279
1074 /* Don't dequeue parent if it has other entities besides us */ 1280 /* Don't dequeue parent if it has other entities besides us */
1075 if (cfs_rq->load.weight) 1281 if (cfs_rq->load.weight)
1076 break; 1282 break;
1077 flags |= DEQUEUE_SLEEP; 1283 flags |= DEQUEUE_SLEEP;
1078 } 1284 }
1079 1285
1286 for_each_sched_entity(se) {
1287 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1288
1289 update_cfs_load(cfs_rq, 0);
1290 update_cfs_shares(cfs_rq, 0);
1291 }
1292
1080 hrtick_update(rq); 1293 hrtick_update(rq);
1081} 1294}
1082 1295
@@ -1143,67 +1356,36 @@ static void task_waking_fair(struct rq *rq, struct task_struct *p)
1143 * Adding load to a group doesn't make a group heavier, but can cause movement 1356 * Adding load to a group doesn't make a group heavier, but can cause movement
1144 * of group shares between cpus. Assuming the shares were perfectly aligned one 1357 * of group shares between cpus. Assuming the shares were perfectly aligned one
1145 * can calculate the shift in shares. 1358 * can calculate the shift in shares.
1146 *
1147 * The problem is that perfectly aligning the shares is rather expensive, hence
1148 * we try to avoid doing that too often - see update_shares(), which ratelimits
1149 * this change.
1150 *
1151 * We compensate this by not only taking the current delta into account, but
1152 * also considering the delta between when the shares were last adjusted and
1153 * now.
1154 *
1155 * We still saw a performance dip, some tracing taught us that between
1156 * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased
1157 * significantly. Therefore try to bias the error in direction of failing
1158 * the affine wakeup.
1159 *
1160 */ 1359 */
1161static long effective_load(struct task_group *tg, int cpu, 1360static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
1162 long wl, long wg)
1163{ 1361{
1164 struct sched_entity *se = tg->se[cpu]; 1362 struct sched_entity *se = tg->se[cpu];
1165 1363
1166 if (!tg->parent) 1364 if (!tg->parent)
1167 return wl; 1365 return wl;
1168 1366
1169 /*
1170 * By not taking the decrease of shares on the other cpu into
1171 * account our error leans towards reducing the affine wakeups.
1172 */
1173 if (!wl && sched_feat(ASYM_EFF_LOAD))
1174 return wl;
1175
1176 for_each_sched_entity(se) { 1367 for_each_sched_entity(se) {
1177 long S, rw, s, a, b; 1368 long lw, w;
1178 long more_w;
1179 1369
1180 /* 1370 tg = se->my_q->tg;
1181 * Instead of using this increment, also add the difference 1371 w = se->my_q->load.weight;
1182 * between when the shares were last updated and now.
1183 */
1184 more_w = se->my_q->load.weight - se->my_q->rq_weight;
1185 wl += more_w;
1186 wg += more_w;
1187
1188 S = se->my_q->tg->shares;
1189 s = se->my_q->shares;
1190 rw = se->my_q->rq_weight;
1191 1372
1192 a = S*(rw + wl); 1373 /* use this cpu's instantaneous contribution */
1193 b = S*rw + s*wg; 1374 lw = atomic_read(&tg->load_weight);
1375 lw -= se->my_q->load_contribution;
1376 lw += w + wg;
1194 1377
1195 wl = s*(a-b); 1378 wl += w;
1196 1379
1197 if (likely(b)) 1380 if (lw > 0 && wl < lw)
1198 wl /= b; 1381 wl = (wl * tg->shares) / lw;
1382 else
1383 wl = tg->shares;
1199 1384
1200 /* 1385 /* zero point is MIN_SHARES */
1201 * Assume the group is already running and will 1386 if (wl < MIN_SHARES)
1202 * thus already be accounted for in the weight. 1387 wl = MIN_SHARES;
1203 * 1388 wl -= se->load.weight;
1204 * That is, moving shares between CPUs does not
1205 * alter the group weight.
1206 */
1207 wg = 0; 1389 wg = 0;
1208 } 1390 }
1209 1391
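
The rewritten effective_load() above drops the old shares-delta algebra in favour of the instantaneous contribution: at each level it recomputes the group entity's weight as wl * tg->shares / lw, clamps it to at least MIN_SHARES, and passes the difference from the current entity weight up the hierarchy. Below is a rough single-level model of that calculation with made-up numbers; the MIN_SHARES floor is assumed to be 2 and everything else is illustrative.

#include <stdio.h>

#define MIN_SHARES 2    /* assumed floor, as in the scheduler's clamp */

/* Given a group's total shares, the group load seen across CPUs (lw, already
 * including the weight being added) and this CPU's queue weight (wl, likewise
 * including the addition), compute the weight this CPU's group entity gets. */
static long group_entity_weight(long tg_shares, long lw, long wl)
{
        long w;

        if (lw > 0 && wl < lw)
                w = (wl * tg_shares) / lw;
        else
                w = tg_shares;

        if (w < MIN_SHARES)
                w = MIN_SHARES;
        return w;
}

int main(void)
{
        long shares = 1024;
        long old_cpu_load = 2048, group_load = 4096, task_load = 1024;

        long before = group_entity_weight(shares, group_load, old_cpu_load);
        long after  = group_entity_weight(shares, group_load + task_load,
                                          old_cpu_load + task_load);

        printf("entity weight before: %ld, after adding task: %ld, delta: %ld\n",
               before, after, after - before);
        return 0;
}
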
@@ -1508,23 +1690,6 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
1508 sd = tmp; 1690 sd = tmp;
1509 } 1691 }
1510 1692
1511#ifdef CONFIG_FAIR_GROUP_SCHED
1512 if (sched_feat(LB_SHARES_UPDATE)) {
1513 /*
1514 * Pick the largest domain to update shares over
1515 */
1516 tmp = sd;
1517 if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight))
1518 tmp = affine_sd;
1519
1520 if (tmp) {
1521 raw_spin_unlock(&rq->lock);
1522 update_shares(tmp);
1523 raw_spin_lock(&rq->lock);
1524 }
1525 }
1526#endif
1527
1528 if (affine_sd) { 1693 if (affine_sd) {
1529 if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) 1694 if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
1530 return select_idle_sibling(p, cpu); 1695 return select_idle_sibling(p, cpu);
@@ -1654,12 +1819,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1654 struct cfs_rq *cfs_rq = task_cfs_rq(curr); 1819 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1655 int scale = cfs_rq->nr_running >= sched_nr_latency; 1820 int scale = cfs_rq->nr_running >= sched_nr_latency;
1656 1821
1657 if (unlikely(rt_prio(p->prio)))
1658 goto preempt;
1659
1660 if (unlikely(p->sched_class != &fair_sched_class))
1661 return;
1662
1663 if (unlikely(se == pse)) 1822 if (unlikely(se == pse))
1664 return; 1823 return;
1665 1824
@@ -1764,10 +1923,6 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
1764 set_task_cpu(p, this_cpu); 1923 set_task_cpu(p, this_cpu);
1765 activate_task(this_rq, p, 0); 1924 activate_task(this_rq, p, 0);
1766 check_preempt_curr(this_rq, p, 0); 1925 check_preempt_curr(this_rq, p, 0);
1767
1768 /* re-arm NEWIDLE balancing when moving tasks */
1769 src_rq->avg_idle = this_rq->avg_idle = 2*sysctl_sched_migration_cost;
1770 this_rq->idle_stamp = 0;
1771} 1926}
1772 1927
1773/* 1928/*
@@ -1919,6 +2074,48 @@ out:
1919} 2074}
1920 2075
1921#ifdef CONFIG_FAIR_GROUP_SCHED 2076#ifdef CONFIG_FAIR_GROUP_SCHED
2077/*
2078 * update tg->load_weight by folding this cpu's load_avg
2079 */
2080static int update_shares_cpu(struct task_group *tg, int cpu)
2081{
2082 struct cfs_rq *cfs_rq;
2083 unsigned long flags;
2084 struct rq *rq;
2085
2086 if (!tg->se[cpu])
2087 return 0;
2088
2089 rq = cpu_rq(cpu);
2090 cfs_rq = tg->cfs_rq[cpu];
2091
2092 raw_spin_lock_irqsave(&rq->lock, flags);
2093
2094 update_rq_clock(rq);
2095 update_cfs_load(cfs_rq, 1);
2096
2097 /*
2098 * We need to update shares after updating tg->load_weight in
2099 * order to adjust the weight of groups with long running tasks.
2100 */
2101 update_cfs_shares(cfs_rq, 0);
2102
2103 raw_spin_unlock_irqrestore(&rq->lock, flags);
2104
2105 return 0;
2106}
2107
2108static void update_shares(int cpu)
2109{
2110 struct cfs_rq *cfs_rq;
2111 struct rq *rq = cpu_rq(cpu);
2112
2113 rcu_read_lock();
2114 for_each_leaf_cfs_rq(rq, cfs_rq)
2115 update_shares_cpu(cfs_rq->tg, cpu);
2116 rcu_read_unlock();
2117}
2118
1922static unsigned long 2119static unsigned long
1923load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 2120load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1924 unsigned long max_load_move, 2121 unsigned long max_load_move,
@@ -1966,6 +2163,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1966 return max_load_move - rem_load_move; 2163 return max_load_move - rem_load_move;
1967} 2164}
1968#else 2165#else
2166static inline void update_shares(int cpu)
2167{
2168}
2169
1969static unsigned long 2170static unsigned long
1970load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 2171load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1971 unsigned long max_load_move, 2172 unsigned long max_load_move,
@@ -2035,13 +2236,16 @@ struct sd_lb_stats {
2035 unsigned long this_load_per_task; 2236 unsigned long this_load_per_task;
2036 unsigned long this_nr_running; 2237 unsigned long this_nr_running;
2037 unsigned long this_has_capacity; 2238 unsigned long this_has_capacity;
2239 unsigned int this_idle_cpus;
2038 2240
2039 /* Statistics of the busiest group */ 2241 /* Statistics of the busiest group */
2242 unsigned int busiest_idle_cpus;
2040 unsigned long max_load; 2243 unsigned long max_load;
2041 unsigned long busiest_load_per_task; 2244 unsigned long busiest_load_per_task;
2042 unsigned long busiest_nr_running; 2245 unsigned long busiest_nr_running;
2043 unsigned long busiest_group_capacity; 2246 unsigned long busiest_group_capacity;
2044 unsigned long busiest_has_capacity; 2247 unsigned long busiest_has_capacity;
2248 unsigned int busiest_group_weight;
2045 2249
2046 int group_imb; /* Is there imbalance in this sd */ 2250 int group_imb; /* Is there imbalance in this sd */
2047#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 2251#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -2063,6 +2267,8 @@ struct sg_lb_stats {
2063 unsigned long sum_nr_running; /* Nr tasks running in the group */ 2267 unsigned long sum_nr_running; /* Nr tasks running in the group */
2064 unsigned long sum_weighted_load; /* Weighted load of group's tasks */ 2268 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
2065 unsigned long group_capacity; 2269 unsigned long group_capacity;
2270 unsigned long idle_cpus;
2271 unsigned long group_weight;
2066 int group_imb; /* Is there an imbalance in the group ? */ 2272 int group_imb; /* Is there an imbalance in the group ? */
2067 int group_has_capacity; /* Is there extra capacity in the group? */ 2273 int group_has_capacity; /* Is there extra capacity in the group? */
2068}; 2274};
@@ -2431,7 +2637,8 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2431 sgs->group_load += load; 2637 sgs->group_load += load;
2432 sgs->sum_nr_running += rq->nr_running; 2638 sgs->sum_nr_running += rq->nr_running;
2433 sgs->sum_weighted_load += weighted_cpuload(i); 2639 sgs->sum_weighted_load += weighted_cpuload(i);
2434 2640 if (idle_cpu(i))
2641 sgs->idle_cpus++;
2435 } 2642 }
2436 2643
2437 /* 2644 /*
@@ -2469,6 +2676,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2469 sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); 2676 sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
2470 if (!sgs->group_capacity) 2677 if (!sgs->group_capacity)
2471 sgs->group_capacity = fix_small_capacity(sd, group); 2678 sgs->group_capacity = fix_small_capacity(sd, group);
2679 sgs->group_weight = group->group_weight;
2472 2680
2473 if (sgs->group_capacity > sgs->sum_nr_running) 2681 if (sgs->group_capacity > sgs->sum_nr_running)
2474 sgs->group_has_capacity = 1; 2682 sgs->group_has_capacity = 1;
@@ -2576,13 +2784,16 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2576 sds->this_nr_running = sgs.sum_nr_running; 2784 sds->this_nr_running = sgs.sum_nr_running;
2577 sds->this_load_per_task = sgs.sum_weighted_load; 2785 sds->this_load_per_task = sgs.sum_weighted_load;
2578 sds->this_has_capacity = sgs.group_has_capacity; 2786 sds->this_has_capacity = sgs.group_has_capacity;
2787 sds->this_idle_cpus = sgs.idle_cpus;
2579 } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) { 2788 } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
2580 sds->max_load = sgs.avg_load; 2789 sds->max_load = sgs.avg_load;
2581 sds->busiest = sg; 2790 sds->busiest = sg;
2582 sds->busiest_nr_running = sgs.sum_nr_running; 2791 sds->busiest_nr_running = sgs.sum_nr_running;
2792 sds->busiest_idle_cpus = sgs.idle_cpus;
2583 sds->busiest_group_capacity = sgs.group_capacity; 2793 sds->busiest_group_capacity = sgs.group_capacity;
2584 sds->busiest_load_per_task = sgs.sum_weighted_load; 2794 sds->busiest_load_per_task = sgs.sum_weighted_load;
2585 sds->busiest_has_capacity = sgs.group_has_capacity; 2795 sds->busiest_has_capacity = sgs.group_has_capacity;
2796 sds->busiest_group_weight = sgs.group_weight;
2586 sds->group_imb = sgs.group_imb; 2797 sds->group_imb = sgs.group_imb;
2587 } 2798 }
2588 2799
@@ -2860,8 +3071,26 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2860 if (sds.this_load >= sds.avg_load) 3071 if (sds.this_load >= sds.avg_load)
2861 goto out_balanced; 3072 goto out_balanced;
2862 3073
2863 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) 3074 /*
2864 goto out_balanced; 3075 * In the CPU_NEWLY_IDLE case, use imbalance_pct to be conservative.
3076 * And to check for busy balance use !idle_cpu instead of
3077 * CPU_NOT_IDLE. This is because HT siblings will use CPU_NOT_IDLE
3078 * even when they are idle.
3079 */
3080 if (idle == CPU_NEWLY_IDLE || !idle_cpu(this_cpu)) {
3081 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
3082 goto out_balanced;
3083 } else {
3084 /*
 3085 * This cpu is idle. If the busiest group doesn't
 3086 * have more tasks than the number of available cpu's and
 3087 * there is no imbalance between this and the busiest group
 3088 * wrt idle cpu's, it is balanced.
3089 */
3090 if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
3091 sds.busiest_nr_running <= sds.busiest_group_weight)
3092 goto out_balanced;
3093 }
2865 3094
2866force_balance: 3095force_balance:
2867 /* Looks like there is an imbalance. Compute it */ 3096 /* Looks like there is an imbalance. Compute it */
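
The new branch in find_busiest_group() treats the domain as balanced for an idle CPU when this group has at most one more idle CPU than the busiest group and the busiest group is not running more tasks than it has CPUs. A sketch of just that predicate, keeping the field names shown in the diff but nothing else:

#include <stdbool.h>
#include <stdio.h>

/* Field names follow the sd_lb_stats additions; the struct itself is a
 * cut-down illustration, not the kernel's. */
struct lb_stats {
        unsigned int this_idle_cpus;
        unsigned int busiest_idle_cpus;
        unsigned long busiest_nr_running;
        unsigned int busiest_group_weight;
};

static bool idle_cpu_sees_balance(const struct lb_stats *s)
{
        return s->this_idle_cpus <= s->busiest_idle_cpus + 1 &&
               s->busiest_nr_running <= s->busiest_group_weight;
}

int main(void)
{
        struct lb_stats s = {
                .this_idle_cpus = 2, .busiest_idle_cpus = 1,
                .busiest_nr_running = 3, .busiest_group_weight = 4,
        };

        printf("treat as balanced: %d\n", idle_cpu_sees_balance(&s));
        return 0;
}
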
@@ -3014,7 +3243,6 @@ static int load_balance(int this_cpu, struct rq *this_rq,
3014 schedstat_inc(sd, lb_count[idle]); 3243 schedstat_inc(sd, lb_count[idle]);
3015 3244
3016redo: 3245redo:
3017 update_shares(sd);
3018 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, 3246 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
3019 cpus, balance); 3247 cpus, balance);
3020 3248
@@ -3156,8 +3384,6 @@ out_one_pinned:
3156 else 3384 else
3157 ld_moved = 0; 3385 ld_moved = 0;
3158out: 3386out:
3159 if (ld_moved)
3160 update_shares(sd);
3161 return ld_moved; 3387 return ld_moved;
3162} 3388}
3163 3389
@@ -3181,6 +3407,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3181 */ 3407 */
3182 raw_spin_unlock(&this_rq->lock); 3408 raw_spin_unlock(&this_rq->lock);
3183 3409
3410 update_shares(this_cpu);
3184 for_each_domain(this_cpu, sd) { 3411 for_each_domain(this_cpu, sd) {
3185 unsigned long interval; 3412 unsigned long interval;
3186 int balance = 1; 3413 int balance = 1;
@@ -3197,8 +3424,10 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3197 interval = msecs_to_jiffies(sd->balance_interval); 3424 interval = msecs_to_jiffies(sd->balance_interval);
3198 if (time_after(next_balance, sd->last_balance + interval)) 3425 if (time_after(next_balance, sd->last_balance + interval))
3199 next_balance = sd->last_balance + interval; 3426 next_balance = sd->last_balance + interval;
3200 if (pulled_task) 3427 if (pulled_task) {
3428 this_rq->idle_stamp = 0;
3201 break; 3429 break;
3430 }
3202 } 3431 }
3203 3432
3204 raw_spin_lock(&this_rq->lock); 3433 raw_spin_lock(&this_rq->lock);
@@ -3549,6 +3778,8 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3549 int update_next_balance = 0; 3778 int update_next_balance = 0;
3550 int need_serialize; 3779 int need_serialize;
3551 3780
3781 update_shares(cpu);
3782
3552 for_each_domain(cpu, sd) { 3783 for_each_domain(cpu, sd) {
3553 if (!(sd->flags & SD_LOAD_BALANCE)) 3784 if (!(sd->flags & SD_LOAD_BALANCE))
3554 continue; 3785 continue;
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 185f920ec1a2..68e69acc29b9 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -52,8 +52,6 @@ SCHED_FEAT(ARCH_POWER, 0)
52SCHED_FEAT(HRTICK, 0) 52SCHED_FEAT(HRTICK, 0)
53SCHED_FEAT(DOUBLE_TICK, 0) 53SCHED_FEAT(DOUBLE_TICK, 0)
54SCHED_FEAT(LB_BIAS, 1) 54SCHED_FEAT(LB_BIAS, 1)
55SCHED_FEAT(LB_SHARES_UPDATE, 1)
56SCHED_FEAT(ASYM_EFF_LOAD, 1)
57 55
58/* 56/*
59 * Spin-wait on mutex acquisition when the mutex owner is running on 57 * Spin-wait on mutex acquisition when the mutex owner is running on
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index bea7d79f7e9c..c914ec747ca6 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -183,6 +183,17 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
183 return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period); 183 return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
184} 184}
185 185
186static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
187{
188 list_add_rcu(&rt_rq->leaf_rt_rq_list,
189 &rq_of_rt_rq(rt_rq)->leaf_rt_rq_list);
190}
191
192static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
193{
194 list_del_rcu(&rt_rq->leaf_rt_rq_list);
195}
196
186#define for_each_leaf_rt_rq(rt_rq, rq) \ 197#define for_each_leaf_rt_rq(rt_rq, rq) \
187 list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) 198 list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
188 199
@@ -276,6 +287,14 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
276 return ktime_to_ns(def_rt_bandwidth.rt_period); 287 return ktime_to_ns(def_rt_bandwidth.rt_period);
277} 288}
278 289
290static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
291{
292}
293
294static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
295{
296}
297
279#define for_each_leaf_rt_rq(rt_rq, rq) \ 298#define for_each_leaf_rt_rq(rt_rq, rq) \
280 for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL) 299 for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
281 300
@@ -825,6 +844,9 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
825 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) 844 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
826 return; 845 return;
827 846
847 if (!rt_rq->rt_nr_running)
848 list_add_leaf_rt_rq(rt_rq);
849
828 if (head) 850 if (head)
829 list_add(&rt_se->run_list, queue); 851 list_add(&rt_se->run_list, queue);
830 else 852 else
@@ -844,6 +866,8 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
844 __clear_bit(rt_se_prio(rt_se), array->bitmap); 866 __clear_bit(rt_se_prio(rt_se), array->bitmap);
845 867
846 dec_rt_tasks(rt_se, rt_rq); 868 dec_rt_tasks(rt_se, rt_rq);
869 if (!rt_rq->rt_nr_running)
870 list_del_leaf_rt_rq(rt_rq);
847} 871}
848 872
849/* 873/*
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c
index 45bddc0c1048..2bf6b47058c1 100644
--- a/kernel/sched_stoptask.c
+++ b/kernel/sched_stoptask.c
@@ -19,14 +19,14 @@ select_task_rq_stop(struct rq *rq, struct task_struct *p,
19static void 19static void
20check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags) 20check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
21{ 21{
22 resched_task(rq->curr); /* we preempt everything */ 22 /* we're never preempted */
23} 23}
24 24
25static struct task_struct *pick_next_task_stop(struct rq *rq) 25static struct task_struct *pick_next_task_stop(struct rq *rq)
26{ 26{
27 struct task_struct *stop = rq->stop; 27 struct task_struct *stop = rq->stop;
28 28
29 if (stop && stop->state == TASK_RUNNING) 29 if (stop && stop->se.on_rq)
30 return stop; 30 return stop;
31 31
32 return NULL; 32 return NULL;
diff --git a/kernel/smp.c b/kernel/smp.c
index 12ed8b013e2d..9910744f0856 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -13,6 +13,7 @@
13#include <linux/smp.h> 13#include <linux/smp.h>
14#include <linux/cpu.h> 14#include <linux/cpu.h>
15 15
16#ifdef CONFIG_USE_GENERIC_SMP_HELPERS
16static struct { 17static struct {
17 struct list_head queue; 18 struct list_head queue;
18 raw_spinlock_t lock; 19 raw_spinlock_t lock;
@@ -193,23 +194,52 @@ void generic_smp_call_function_interrupt(void)
193 */ 194 */
194 list_for_each_entry_rcu(data, &call_function.queue, csd.list) { 195 list_for_each_entry_rcu(data, &call_function.queue, csd.list) {
195 int refs; 196 int refs;
197 void (*func) (void *info);
196 198
197 if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) 199 /*
200 * Since we walk the list without any locks, we might
201 * see an entry that was completed, removed from the
202 * list and is in the process of being reused.
203 *
204 * We must check that the cpu is in the cpumask before
205 * checking the refs, and both must be set before
206 * executing the callback on this cpu.
207 */
208
209 if (!cpumask_test_cpu(cpu, data->cpumask))
210 continue;
211
212 smp_rmb();
213
214 if (atomic_read(&data->refs) == 0)
198 continue; 215 continue;
199 216
217 func = data->csd.func; /* for later warn */
200 data->csd.func(data->csd.info); 218 data->csd.func(data->csd.info);
201 219
220 /*
221 * If the cpu mask is not still set then it enabled interrupts,
222 * we took another smp interrupt, and executed the function
223 * twice on this cpu. In theory that copy decremented refs.
224 */
225 if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) {
226 WARN(1, "%pS enabled interrupts and double executed\n",
227 func);
228 continue;
229 }
230
202 refs = atomic_dec_return(&data->refs); 231 refs = atomic_dec_return(&data->refs);
203 WARN_ON(refs < 0); 232 WARN_ON(refs < 0);
204 if (!refs) {
205 raw_spin_lock(&call_function.lock);
206 list_del_rcu(&data->csd.list);
207 raw_spin_unlock(&call_function.lock);
208 }
209 233
210 if (refs) 234 if (refs)
211 continue; 235 continue;
212 236
237 WARN_ON(!cpumask_empty(data->cpumask));
238
239 raw_spin_lock(&call_function.lock);
240 list_del_rcu(&data->csd.list);
241 raw_spin_unlock(&call_function.lock);
242
213 csd_unlock(&data->csd); 243 csd_unlock(&data->csd);
214 } 244 }
215 245
@@ -429,7 +459,7 @@ void smp_call_function_many(const struct cpumask *mask,
429 * can't happen. 459 * can't happen.
430 */ 460 */
431 WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() 461 WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
432 && !oops_in_progress); 462 && !oops_in_progress && !early_boot_irqs_disabled);
433 463
434 /* So, what's a CPU they want? Ignoring this one. */ 464 /* So, what's a CPU they want? Ignoring this one. */
435 cpu = cpumask_first_and(mask, cpu_online_mask); 465 cpu = cpumask_first_and(mask, cpu_online_mask);
@@ -453,11 +483,21 @@ void smp_call_function_many(const struct cpumask *mask,
453 483
454 data = &__get_cpu_var(cfd_data); 484 data = &__get_cpu_var(cfd_data);
455 csd_lock(&data->csd); 485 csd_lock(&data->csd);
486 BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask));
456 487
457 data->csd.func = func; 488 data->csd.func = func;
458 data->csd.info = info; 489 data->csd.info = info;
459 cpumask_and(data->cpumask, mask, cpu_online_mask); 490 cpumask_and(data->cpumask, mask, cpu_online_mask);
460 cpumask_clear_cpu(this_cpu, data->cpumask); 491 cpumask_clear_cpu(this_cpu, data->cpumask);
492
493 /*
 494 * To ensure the interrupt handler gets a complete view
 495 * we order the cpumask and refs writes and order the reads
 496 * of them in the interrupt handler. In addition we may
497 * only clear our own cpu bit from the mask.
498 */
499 smp_wmb();
500
461 atomic_set(&data->refs, cpumask_weight(data->cpumask)); 501 atomic_set(&data->refs, cpumask_weight(data->cpumask));
462 502
463 raw_spin_lock_irqsave(&call_function.lock, flags); 503 raw_spin_lock_irqsave(&call_function.lock, flags);
@@ -529,3 +569,24 @@ void ipi_call_unlock_irq(void)
529{ 569{
530 raw_spin_unlock_irq(&call_function.lock); 570 raw_spin_unlock_irq(&call_function.lock);
531} 571}
572#endif /* USE_GENERIC_SMP_HELPERS */
573
574/*
575 * Call a function on all processors. May be used during early boot while
576 * early_boot_irqs_disabled is set. Use local_irq_save/restore() instead
577 * of local_irq_disable/enable().
578 */
579int on_each_cpu(void (*func) (void *info), void *info, int wait)
580{
581 unsigned long flags;
582 int ret = 0;
583
584 preempt_disable();
585 ret = smp_call_function(func, info, wait);
586 local_irq_save(flags);
587 func(info);
588 local_irq_restore(flags);
589 preempt_enable();
590 return ret;
591}
592EXPORT_SYMBOL(on_each_cpu);
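
The smp.c changes add an smp_wmb() between writing the cpumask and setting refs in smp_call_function_many(), paired with an smp_rmb() between the cpumask and refs checks in the interrupt handler. The userspace sketch below shows the general write-then-publish idiom that such barrier pairs implement, using C11 release/acquire atomics; it is a loose analogue of the pairing, not the kernel's exact protocol (build with -pthread).

#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>
#include <stdio.h>

static unsigned long mask;      /* payload, written before publication */
static atomic_int refs;         /* zero until the payload is ready     */

static void *receiver(void *arg)
{
        (void)arg;
        while (atomic_load_explicit(&refs, memory_order_acquire) == 0)
                sched_yield();
        /* The acquire load pairs with the sender's release store, so the
         * earlier write to 'mask' is guaranteed to be visible here. */
        printf("receiver sees mask=0x%lx\n", mask);
        return NULL;
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, receiver, NULL);

        mask = 0xf0;                                            /* write payload */
        atomic_store_explicit(&refs, 1, memory_order_release);  /* then publish  */

        pthread_join(t, NULL);
        return 0;
}
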
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 18f4be0d5fe0..68eb5efec388 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -70,7 +70,7 @@ char *softirq_to_name[NR_SOFTIRQS] = {
70static void wakeup_softirqd(void) 70static void wakeup_softirqd(void)
71{ 71{
72 /* Interrupts are disabled: no need to stop preemption */ 72 /* Interrupts are disabled: no need to stop preemption */
73 struct task_struct *tsk = __get_cpu_var(ksoftirqd); 73 struct task_struct *tsk = __this_cpu_read(ksoftirqd);
74 74
75 if (tsk && tsk->state != TASK_RUNNING) 75 if (tsk && tsk->state != TASK_RUNNING)
76 wake_up_process(tsk); 76 wake_up_process(tsk);
@@ -388,8 +388,8 @@ void __tasklet_schedule(struct tasklet_struct *t)
388 388
389 local_irq_save(flags); 389 local_irq_save(flags);
390 t->next = NULL; 390 t->next = NULL;
391 *__get_cpu_var(tasklet_vec).tail = t; 391 *__this_cpu_read(tasklet_vec.tail) = t;
392 __get_cpu_var(tasklet_vec).tail = &(t->next); 392 __this_cpu_write(tasklet_vec.tail, &(t->next));
393 raise_softirq_irqoff(TASKLET_SOFTIRQ); 393 raise_softirq_irqoff(TASKLET_SOFTIRQ);
394 local_irq_restore(flags); 394 local_irq_restore(flags);
395} 395}
@@ -402,8 +402,8 @@ void __tasklet_hi_schedule(struct tasklet_struct *t)
402 402
403 local_irq_save(flags); 403 local_irq_save(flags);
404 t->next = NULL; 404 t->next = NULL;
405 *__get_cpu_var(tasklet_hi_vec).tail = t; 405 *__this_cpu_read(tasklet_hi_vec.tail) = t;
406 __get_cpu_var(tasklet_hi_vec).tail = &(t->next); 406 __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
407 raise_softirq_irqoff(HI_SOFTIRQ); 407 raise_softirq_irqoff(HI_SOFTIRQ);
408 local_irq_restore(flags); 408 local_irq_restore(flags);
409} 409}
@@ -414,8 +414,8 @@ void __tasklet_hi_schedule_first(struct tasklet_struct *t)
414{ 414{
415 BUG_ON(!irqs_disabled()); 415 BUG_ON(!irqs_disabled());
416 416
417 t->next = __get_cpu_var(tasklet_hi_vec).head; 417 t->next = __this_cpu_read(tasklet_hi_vec.head);
418 __get_cpu_var(tasklet_hi_vec).head = t; 418 __this_cpu_write(tasklet_hi_vec.head, t);
419 __raise_softirq_irqoff(HI_SOFTIRQ); 419 __raise_softirq_irqoff(HI_SOFTIRQ);
420} 420}
421 421
@@ -426,9 +426,9 @@ static void tasklet_action(struct softirq_action *a)
426 struct tasklet_struct *list; 426 struct tasklet_struct *list;
427 427
428 local_irq_disable(); 428 local_irq_disable();
429 list = __get_cpu_var(tasklet_vec).head; 429 list = __this_cpu_read(tasklet_vec.head);
430 __get_cpu_var(tasklet_vec).head = NULL; 430 __this_cpu_write(tasklet_vec.head, NULL);
431 __get_cpu_var(tasklet_vec).tail = &__get_cpu_var(tasklet_vec).head; 431 __this_cpu_write(tasklet_vec.tail, &__get_cpu_var(tasklet_vec).head);
432 local_irq_enable(); 432 local_irq_enable();
433 433
434 while (list) { 434 while (list) {
@@ -449,8 +449,8 @@ static void tasklet_action(struct softirq_action *a)
449 449
450 local_irq_disable(); 450 local_irq_disable();
451 t->next = NULL; 451 t->next = NULL;
452 *__get_cpu_var(tasklet_vec).tail = t; 452 *__this_cpu_read(tasklet_vec.tail) = t;
453 __get_cpu_var(tasklet_vec).tail = &(t->next); 453 __this_cpu_write(tasklet_vec.tail, &(t->next));
454 __raise_softirq_irqoff(TASKLET_SOFTIRQ); 454 __raise_softirq_irqoff(TASKLET_SOFTIRQ);
455 local_irq_enable(); 455 local_irq_enable();
456 } 456 }
@@ -461,9 +461,9 @@ static void tasklet_hi_action(struct softirq_action *a)
461 struct tasklet_struct *list; 461 struct tasklet_struct *list;
462 462
463 local_irq_disable(); 463 local_irq_disable();
464 list = __get_cpu_var(tasklet_hi_vec).head; 464 list = __this_cpu_read(tasklet_hi_vec.head);
465 __get_cpu_var(tasklet_hi_vec).head = NULL; 465 __this_cpu_write(tasklet_hi_vec.head, NULL);
466 __get_cpu_var(tasklet_hi_vec).tail = &__get_cpu_var(tasklet_hi_vec).head; 466 __this_cpu_write(tasklet_hi_vec.tail, &__get_cpu_var(tasklet_hi_vec).head);
467 local_irq_enable(); 467 local_irq_enable();
468 468
469 while (list) { 469 while (list) {
@@ -484,8 +484,8 @@ static void tasklet_hi_action(struct softirq_action *a)
484 484
485 local_irq_disable(); 485 local_irq_disable();
486 t->next = NULL; 486 t->next = NULL;
487 *__get_cpu_var(tasklet_hi_vec).tail = t; 487 *__this_cpu_read(tasklet_hi_vec.tail) = t;
488 __get_cpu_var(tasklet_hi_vec).tail = &(t->next); 488 __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
489 __raise_softirq_irqoff(HI_SOFTIRQ); 489 __raise_softirq_irqoff(HI_SOFTIRQ);
490 local_irq_enable(); 490 local_irq_enable();
491 } 491 }
@@ -802,16 +802,16 @@ static void takeover_tasklets(unsigned int cpu)
802 802
803 /* Find end, append list for that CPU. */ 803 /* Find end, append list for that CPU. */
804 if (&per_cpu(tasklet_vec, cpu).head != per_cpu(tasklet_vec, cpu).tail) { 804 if (&per_cpu(tasklet_vec, cpu).head != per_cpu(tasklet_vec, cpu).tail) {
805 *(__get_cpu_var(tasklet_vec).tail) = per_cpu(tasklet_vec, cpu).head; 805 *__this_cpu_read(tasklet_vec.tail) = per_cpu(tasklet_vec, cpu).head;
806 __get_cpu_var(tasklet_vec).tail = per_cpu(tasklet_vec, cpu).tail; 806 this_cpu_write(tasklet_vec.tail, per_cpu(tasklet_vec, cpu).tail);
807 per_cpu(tasklet_vec, cpu).head = NULL; 807 per_cpu(tasklet_vec, cpu).head = NULL;
808 per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head; 808 per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head;
809 } 809 }
810 raise_softirq_irqoff(TASKLET_SOFTIRQ); 810 raise_softirq_irqoff(TASKLET_SOFTIRQ);
811 811
812 if (&per_cpu(tasklet_hi_vec, cpu).head != per_cpu(tasklet_hi_vec, cpu).tail) { 812 if (&per_cpu(tasklet_hi_vec, cpu).head != per_cpu(tasklet_hi_vec, cpu).tail) {
813 *__get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).head; 813 *__this_cpu_read(tasklet_hi_vec.tail) = per_cpu(tasklet_hi_vec, cpu).head;
814 __get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).tail; 814 __this_cpu_write(tasklet_hi_vec.tail, per_cpu(tasklet_hi_vec, cpu).tail);
815 per_cpu(tasklet_hi_vec, cpu).head = NULL; 815 per_cpu(tasklet_hi_vec, cpu).head = NULL;
816 per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head; 816 per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head;
817 } 817 }
@@ -853,7 +853,9 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
853 cpumask_any(cpu_online_mask)); 853 cpumask_any(cpu_online_mask));
854 case CPU_DEAD: 854 case CPU_DEAD:
855 case CPU_DEAD_FROZEN: { 855 case CPU_DEAD_FROZEN: {
856 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 856 static const struct sched_param param = {
857 .sched_priority = MAX_RT_PRIO-1
858 };
857 859
858 p = per_cpu(ksoftirqd, hotcpu); 860 p = per_cpu(ksoftirqd, hotcpu);
859 per_cpu(ksoftirqd, hotcpu) = NULL; 861 per_cpu(ksoftirqd, hotcpu) = NULL;
@@ -883,25 +885,6 @@ static __init int spawn_ksoftirqd(void)
883} 885}
884early_initcall(spawn_ksoftirqd); 886early_initcall(spawn_ksoftirqd);
885 887
886#ifdef CONFIG_SMP
887/*
888 * Call a function on all processors
889 */
890int on_each_cpu(void (*func) (void *info), void *info, int wait)
891{
892 int ret = 0;
893
894 preempt_disable();
895 ret = smp_call_function(func, info, wait);
896 local_irq_disable();
897 func(info);
898 local_irq_enable();
899 preempt_enable();
900 return ret;
901}
902EXPORT_SYMBOL(on_each_cpu);
903#endif
904
905/* 888/*
906 * [ These __weak aliases are kept in a separate compilation unit, so that 889 * [ These __weak aliases are kept in a separate compilation unit, so that
907 * GCC does not inline them incorrectly. ] 890 * GCC does not inline them incorrectly. ]
diff --git a/kernel/srcu.c b/kernel/srcu.c
index c71e07500536..73ce23feaea9 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -31,6 +31,7 @@
31#include <linux/rcupdate.h> 31#include <linux/rcupdate.h>
32#include <linux/sched.h> 32#include <linux/sched.h>
33#include <linux/smp.h> 33#include <linux/smp.h>
34#include <linux/delay.h>
34#include <linux/srcu.h> 35#include <linux/srcu.h>
35 36
36static int init_srcu_struct_fields(struct srcu_struct *sp) 37static int init_srcu_struct_fields(struct srcu_struct *sp)
@@ -155,6 +156,16 @@ void __srcu_read_unlock(struct srcu_struct *sp, int idx)
155EXPORT_SYMBOL_GPL(__srcu_read_unlock); 156EXPORT_SYMBOL_GPL(__srcu_read_unlock);
156 157
157/* 158/*
159 * We use an adaptive strategy for synchronize_srcu() and especially for
160 * synchronize_srcu_expedited(). We spin for a fixed time period
161 * (defined below) to allow SRCU readers to exit their read-side critical
162 * sections. If there are still some readers after 10 microseconds,
163 * we repeatedly block for 1-millisecond time periods. This approach
164 * has done well in testing, so there is no need for a config parameter.
165 */
166#define SYNCHRONIZE_SRCU_READER_DELAY 10
167
168/*
158 * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). 169 * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
159 */ 170 */
160static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) 171static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
@@ -203,9 +214,15 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
203 * all srcu_read_lock() calls using the old counters have completed. 214 * all srcu_read_lock() calls using the old counters have completed.
204 * Their corresponding critical sections might well be still 215 * Their corresponding critical sections might well be still
205 * executing, but the srcu_read_lock() primitives themselves 216 * executing, but the srcu_read_lock() primitives themselves
206 * will have finished executing. 217 * will have finished executing. We initially give readers
218 * an arbitrarily chosen 10 microseconds to get out of their
219 * SRCU read-side critical sections, then loop waiting 1/HZ
220 * seconds per iteration. The 10-microsecond value has done
221 * very well in testing.
207 */ 222 */
208 223
224 if (srcu_readers_active_idx(sp, idx))
225 udelay(SYNCHRONIZE_SRCU_READER_DELAY);
209 while (srcu_readers_active_idx(sp, idx)) 226 while (srcu_readers_active_idx(sp, idx))
210 schedule_timeout_interruptible(1); 227 schedule_timeout_interruptible(1);
211 228
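
The comment added to __synchronize_srcu() describes an adaptive wait: readers first get a short busy delay (10 microseconds) to leave their read-side critical sections, and only then does the updater fall back to sleeping a jiffy at a time. A userspace sketch of that shape, where readers_active() is a purely illustrative stand-in for srcu_readers_active_idx():

#include <stdbool.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

#define READER_DELAY_US 10      /* mirrors SYNCHRONIZE_SRCU_READER_DELAY */

static int pending = 3;

/* Pretend one reader finishes each time we look. */
static bool readers_active(void)
{
        return pending-- > 0;
}

static void wait_for_readers(void)
{
        struct timespec one_ms = { .tv_sec = 0, .tv_nsec = 1000 * 1000 };

        if (readers_active())
                usleep(READER_DELAY_US);        /* cheap first chance to finish */

        while (readers_active())
                nanosleep(&one_ms, NULL);       /* then block in longer steps   */
}

int main(void)
{
        wait_for_readers();
        printf("all readers done\n");
        return 0;
}
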
diff --git a/kernel/sys.c b/kernel/sys.c
index 7f5a0cd296a9..31b71a276b40 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -43,6 +43,8 @@
43#include <linux/kprobes.h> 43#include <linux/kprobes.h>
44#include <linux/user_namespace.h> 44#include <linux/user_namespace.h>
45 45
46#include <linux/kmsg_dump.h>
47
46#include <asm/uaccess.h> 48#include <asm/uaccess.h>
47#include <asm/io.h> 49#include <asm/io.h>
48#include <asm/unistd.h> 50#include <asm/unistd.h>
@@ -285,6 +287,7 @@ out_unlock:
285 */ 287 */
286void emergency_restart(void) 288void emergency_restart(void)
287{ 289{
290 kmsg_dump(KMSG_DUMP_EMERG);
288 machine_emergency_restart(); 291 machine_emergency_restart();
289} 292}
290EXPORT_SYMBOL_GPL(emergency_restart); 293EXPORT_SYMBOL_GPL(emergency_restart);
@@ -312,6 +315,7 @@ void kernel_restart(char *cmd)
312 printk(KERN_EMERG "Restarting system.\n"); 315 printk(KERN_EMERG "Restarting system.\n");
313 else 316 else
314 printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd); 317 printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd);
318 kmsg_dump(KMSG_DUMP_RESTART);
315 machine_restart(cmd); 319 machine_restart(cmd);
316} 320}
317EXPORT_SYMBOL_GPL(kernel_restart); 321EXPORT_SYMBOL_GPL(kernel_restart);
@@ -333,6 +337,7 @@ void kernel_halt(void)
333 kernel_shutdown_prepare(SYSTEM_HALT); 337 kernel_shutdown_prepare(SYSTEM_HALT);
334 sysdev_shutdown(); 338 sysdev_shutdown();
335 printk(KERN_EMERG "System halted.\n"); 339 printk(KERN_EMERG "System halted.\n");
340 kmsg_dump(KMSG_DUMP_HALT);
336 machine_halt(); 341 machine_halt();
337} 342}
338 343
@@ -351,6 +356,7 @@ void kernel_power_off(void)
351 disable_nonboot_cpus(); 356 disable_nonboot_cpus();
352 sysdev_shutdown(); 357 sysdev_shutdown();
353 printk(KERN_EMERG "Power down.\n"); 358 printk(KERN_EMERG "Power down.\n");
359 kmsg_dump(KMSG_DUMP_POWEROFF);
354 machine_power_off(); 360 machine_power_off();
355} 361}
356EXPORT_SYMBOL_GPL(kernel_power_off); 362EXPORT_SYMBOL_GPL(kernel_power_off);
@@ -1080,8 +1086,10 @@ SYSCALL_DEFINE0(setsid)
1080 err = session; 1086 err = session;
1081out: 1087out:
1082 write_unlock_irq(&tasklist_lock); 1088 write_unlock_irq(&tasklist_lock);
1083 if (err > 0) 1089 if (err > 0) {
1084 proc_sid_connector(group_leader); 1090 proc_sid_connector(group_leader);
1091 sched_autogroup_create_attach(group_leader);
1092 }
1085 return err; 1093 return err;
1086} 1094}
1087 1095
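
Editor's note: the kmsg_dump(KMSG_DUMP_EMERG/RESTART/HALT/POWEROFF) calls added above give registered dumpers one last look at the log buffer before the machine goes away. A minimal consumer sketch follows, assuming the two-segment dump() callback used by kernels of this generation; my_dumper and my_dump are invented names and the actual persistence step is left as a comment.

#include <linux/module.h>
#include <linux/kmsg_dump.h>

/* Called with the final chunks of the log buffer on shutdown paths. */
static void my_dump(struct kmsg_dumper *dumper, enum kmsg_dump_reason reason,
		    const char *s1, unsigned long l1,
		    const char *s2, unsigned long l2)
{
	if (reason != KMSG_DUMP_RESTART && reason != KMSG_DUMP_HALT &&
	    reason != KMSG_DUMP_POWEROFF)
		return;
	/* Persist s1[0..l1) and s2[0..l2) to flash, an MTD oops area, etc. */
}

static struct kmsg_dumper my_dumper = {
	.dump = my_dump,
};

static int __init my_init(void)
{
	return kmsg_dump_register(&my_dumper);
}

static void __exit my_exit(void)
{
	kmsg_dump_unregister(&my_dumper);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");
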
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c33a1edb799f..bc86bb32e126 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -24,6 +24,7 @@
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/sysctl.h> 25#include <linux/sysctl.h>
26#include <linux/signal.h> 26#include <linux/signal.h>
27#include <linux/printk.h>
27#include <linux/proc_fs.h> 28#include <linux/proc_fs.h>
28#include <linux/security.h> 29#include <linux/security.h>
29#include <linux/ctype.h> 30#include <linux/ctype.h>
@@ -245,10 +246,6 @@ static struct ctl_table root_table[] = {
245 .mode = 0555, 246 .mode = 0555,
246 .child = dev_table, 247 .child = dev_table,
247 }, 248 },
248/*
249 * NOTE: do not add new entries to this table unless you have read
250 * Documentation/sysctl/ctl_unnumbered.txt
251 */
252 { } 249 { }
253}; 250};
254 251
@@ -259,8 +256,6 @@ static int min_wakeup_granularity_ns; /* 0 usecs */
259static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ 256static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
260static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; 257static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
261static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; 258static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
262static int min_sched_shares_ratelimit = 100000; /* 100 usec */
263static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */
264#endif 259#endif
265 260
266#ifdef CONFIG_COMPACTION 261#ifdef CONFIG_COMPACTION
@@ -305,15 +300,6 @@ static struct ctl_table kern_table[] = {
305 .extra2 = &max_wakeup_granularity_ns, 300 .extra2 = &max_wakeup_granularity_ns,
306 }, 301 },
307 { 302 {
308 .procname = "sched_shares_ratelimit",
309 .data = &sysctl_sched_shares_ratelimit,
310 .maxlen = sizeof(unsigned int),
311 .mode = 0644,
312 .proc_handler = sched_proc_update_handler,
313 .extra1 = &min_sched_shares_ratelimit,
314 .extra2 = &max_sched_shares_ratelimit,
315 },
316 {
317 .procname = "sched_tunable_scaling", 303 .procname = "sched_tunable_scaling",
318 .data = &sysctl_sched_tunable_scaling, 304 .data = &sysctl_sched_tunable_scaling,
319 .maxlen = sizeof(enum sched_tunable_scaling), 305 .maxlen = sizeof(enum sched_tunable_scaling),
@@ -323,14 +309,6 @@ static struct ctl_table kern_table[] = {
323 .extra2 = &max_sched_tunable_scaling, 309 .extra2 = &max_sched_tunable_scaling,
324 }, 310 },
325 { 311 {
326 .procname = "sched_shares_thresh",
327 .data = &sysctl_sched_shares_thresh,
328 .maxlen = sizeof(unsigned int),
329 .mode = 0644,
330 .proc_handler = proc_dointvec_minmax,
331 .extra1 = &zero,
332 },
333 {
334 .procname = "sched_migration_cost", 312 .procname = "sched_migration_cost",
335 .data = &sysctl_sched_migration_cost, 313 .data = &sysctl_sched_migration_cost,
336 .maxlen = sizeof(unsigned int), 314 .maxlen = sizeof(unsigned int),
@@ -352,6 +330,13 @@ static struct ctl_table kern_table[] = {
352 .proc_handler = proc_dointvec, 330 .proc_handler = proc_dointvec,
353 }, 331 },
354 { 332 {
333 .procname = "sched_shares_window",
334 .data = &sysctl_sched_shares_window,
335 .maxlen = sizeof(unsigned int),
336 .mode = 0644,
337 .proc_handler = proc_dointvec,
338 },
339 {
355 .procname = "timer_migration", 340 .procname = "timer_migration",
356 .data = &sysctl_timer_migration, 341 .data = &sysctl_timer_migration,
357 .maxlen = sizeof(unsigned int), 342 .maxlen = sizeof(unsigned int),
@@ -382,6 +367,17 @@ static struct ctl_table kern_table[] = {
382 .mode = 0644, 367 .mode = 0644,
383 .proc_handler = proc_dointvec, 368 .proc_handler = proc_dointvec,
384 }, 369 },
370#ifdef CONFIG_SCHED_AUTOGROUP
371 {
372 .procname = "sched_autogroup_enabled",
373 .data = &sysctl_sched_autogroup_enabled,
374 .maxlen = sizeof(unsigned int),
375 .mode = 0644,
376 .proc_handler = proc_dointvec,
377 .extra1 = &zero,
378 .extra2 = &one,
379 },
380#endif
385#ifdef CONFIG_PROVE_LOCKING 381#ifdef CONFIG_PROVE_LOCKING
386 { 382 {
387 .procname = "prove_locking", 383 .procname = "prove_locking",
@@ -702,6 +698,24 @@ static struct ctl_table kern_table[] = {
702 .extra1 = &zero, 698 .extra1 = &zero,
703 .extra2 = &ten_thousand, 699 .extra2 = &ten_thousand,
704 }, 700 },
701 {
702 .procname = "dmesg_restrict",
703 .data = &dmesg_restrict,
704 .maxlen = sizeof(int),
705 .mode = 0644,
706 .proc_handler = proc_dointvec_minmax,
707 .extra1 = &zero,
708 .extra2 = &one,
709 },
710 {
711 .procname = "kptr_restrict",
712 .data = &kptr_restrict,
713 .maxlen = sizeof(int),
714 .mode = 0644,
715 .proc_handler = proc_dointvec_minmax,
716 .extra1 = &zero,
717 .extra2 = &two,
718 },
705#endif 719#endif
706 { 720 {
707 .procname = "ngroups_max", 721 .procname = "ngroups_max",
@@ -736,21 +750,21 @@ static struct ctl_table kern_table[] = {
736 .extra1 = &zero, 750 .extra1 = &zero,
737 .extra2 = &one, 751 .extra2 = &one,
738 }, 752 },
739#endif
740#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) && !defined(CONFIG_LOCKUP_DETECTOR)
741 { 753 {
742 .procname = "unknown_nmi_panic", 754 .procname = "nmi_watchdog",
743 .data = &unknown_nmi_panic, 755 .data = &watchdog_enabled,
744 .maxlen = sizeof (int), 756 .maxlen = sizeof (int),
745 .mode = 0644, 757 .mode = 0644,
746 .proc_handler = proc_dointvec, 758 .proc_handler = proc_dowatchdog_enabled,
747 }, 759 },
760#endif
761#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
748 { 762 {
749 .procname = "nmi_watchdog", 763 .procname = "unknown_nmi_panic",
750 .data = &nmi_watchdog_enabled, 764 .data = &unknown_nmi_panic,
751 .maxlen = sizeof (int), 765 .maxlen = sizeof (int),
752 .mode = 0644, 766 .mode = 0644,
753 .proc_handler = proc_nmi_enabled, 767 .proc_handler = proc_dointvec,
754 }, 768 },
755#endif 769#endif
756#if defined(CONFIG_X86) 770#if defined(CONFIG_X86)
@@ -954,10 +968,6 @@ static struct ctl_table kern_table[] = {
954 .proc_handler = proc_dointvec, 968 .proc_handler = proc_dointvec,
955 }, 969 },
956#endif 970#endif
957/*
958 * NOTE: do not add new entries to this table unless you have read
959 * Documentation/sysctl/ctl_unnumbered.txt
960 */
961 { } 971 { }
962}; 972};
963 973
@@ -1318,11 +1328,6 @@ static struct ctl_table vm_table[] = {
1318 .extra2 = &one, 1328 .extra2 = &one,
1319 }, 1329 },
1320#endif 1330#endif
1321
1322/*
1323 * NOTE: do not add new entries to this table unless you have read
1324 * Documentation/sysctl/ctl_unnumbered.txt
1325 */
1326 { } 1331 { }
1327}; 1332};
1328 1333
@@ -1478,10 +1483,6 @@ static struct ctl_table fs_table[] = {
1478 .proc_handler = &pipe_proc_fn, 1483 .proc_handler = &pipe_proc_fn,
1479 .extra1 = &pipe_min_size, 1484 .extra1 = &pipe_min_size,
1480 }, 1485 },
1481/*
1482 * NOTE: do not add new entries to this table unless you have read
1483 * Documentation/sysctl/ctl_unnumbered.txt
1484 */
1485 { } 1486 { }
1486}; 1487};
1487 1488
@@ -2891,7 +2892,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
2891 } 2892 }
2892} 2893}
2893 2894
2894#else /* CONFIG_PROC_FS */ 2895#else /* CONFIG_PROC_SYSCTL */
2895 2896
2896int proc_dostring(struct ctl_table *table, int write, 2897int proc_dostring(struct ctl_table *table, int write,
2897 void __user *buffer, size_t *lenp, loff_t *ppos) 2898 void __user *buffer, size_t *lenp, loff_t *ppos)
@@ -2943,7 +2944,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
2943} 2944}
2944 2945
2945 2946
2946#endif /* CONFIG_PROC_FS */ 2947#endif /* CONFIG_PROC_SYSCTL */
2947 2948
2948/* 2949/*
2949 * No sense putting this after each symbol definition, twice, 2950 * No sense putting this after each symbol definition, twice,
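
Editor's note: the new dmesg_restrict, kptr_restrict and sched_autogroup_enabled entries above all follow the same pattern: proc_dointvec_minmax with extra1/extra2 bounds, so out-of-range writes from /proc/sys are refused. A hedged sketch of the same pattern for an out-of-tree boolean knob under /proc/sys/kernel; my_knob and the module scaffolding are invented for the example, while register_sysctl_table() and the .child chaining match the interface this tree still uses.

#include <linux/module.h>
#include <linux/sysctl.h>

static int my_knob;
static int zero;
static int one = 1;

static struct ctl_table my_table[] = {
	{
		.procname	= "my_knob",
		.data		= &my_knob,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &zero,	/* writes below 0 are refused */
		.extra2		= &one,		/* writes above 1 are refused */
	},
	{ }
};

static struct ctl_table my_root[] = {
	{
		.procname	= "kernel",
		.mode		= 0555,
		.child		= my_table,
	},
	{ }
};

static struct ctl_table_header *my_header;

static int __init my_init(void)
{
	my_header = register_sysctl_table(my_root);
	return my_header ? 0 : -ENOMEM;
}

static void __exit my_exit(void)
{
	unregister_sysctl_table(my_header);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");

A write such as "echo 2 > /proc/sys/kernel/my_knob" is then rejected, just as kptr_restrict refuses values above 2.
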
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 1357c5786064..b875bedf7c9a 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -136,7 +136,6 @@ static const struct bin_table bin_kern_table[] = {
136 { CTL_INT, KERN_IA64_UNALIGNED, "ignore-unaligned-usertrap" }, 136 { CTL_INT, KERN_IA64_UNALIGNED, "ignore-unaligned-usertrap" },
137 { CTL_INT, KERN_COMPAT_LOG, "compat-log" }, 137 { CTL_INT, KERN_COMPAT_LOG, "compat-log" },
138 { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" }, 138 { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" },
139 { CTL_INT, KERN_NMI_WATCHDOG, "nmi_watchdog" },
140 { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" }, 139 { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" },
141 {} 140 {}
142}; 141};
@@ -1193,7 +1192,7 @@ static ssize_t bin_dn_node_address(struct file *file,
1193 1192
1194 buf[result] = '\0'; 1193 buf[result] = '\0';
1195 1194
1196 /* Convert the decnet addresss to binary */ 1195 /* Convert the decnet address to binary */
1197 result = -EIO; 1196 result = -EIO;
1198 nodep = strchr(buf, '.') + 1; 1197 nodep = strchr(buf, '.') + 1;
1199 if (!nodep) 1198 if (!nodep)
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index c8231fb15708..3971c6b9d58d 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -89,8 +89,7 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
89 return -ENOMEM; 89 return -ENOMEM;
90 90
91 if (!info) { 91 if (!info) {
92 int seq = get_cpu_var(taskstats_seqnum)++; 92 int seq = this_cpu_inc_return(taskstats_seqnum) - 1;
93 put_cpu_var(taskstats_seqnum);
94 93
95 reply = genlmsg_put(skb, 0, seq, &family, 0, cmd); 94 reply = genlmsg_put(skb, 0, seq, &family, 0, cmd);
96 } else 95 } else
@@ -349,25 +348,47 @@ static int parse(struct nlattr *na, struct cpumask *mask)
349 return ret; 348 return ret;
350} 349}
351 350
351#if defined(CONFIG_64BIT) && !defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
352#define TASKSTATS_NEEDS_PADDING 1
353#endif
354
352static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid) 355static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
353{ 356{
354 struct nlattr *na, *ret; 357 struct nlattr *na, *ret;
355 int aggr; 358 int aggr;
356 359
357 /* If we don't pad, we end up with alignment on a 4 byte boundary.
358 * This causes lots of runtime warnings on systems requiring 8 byte
359 * alignment */
360 u32 pids[2] = { pid, 0 };
361 int pid_size = ALIGN(sizeof(pid), sizeof(long));
362
363 aggr = (type == TASKSTATS_TYPE_PID) 360 aggr = (type == TASKSTATS_TYPE_PID)
364 ? TASKSTATS_TYPE_AGGR_PID 361 ? TASKSTATS_TYPE_AGGR_PID
365 : TASKSTATS_TYPE_AGGR_TGID; 362 : TASKSTATS_TYPE_AGGR_TGID;
366 363
364 /*
365 * The taskstats structure is internally aligned on 8 byte
 366 * boundaries but the layout of the aggregate reply, with
 367 * two NLA headers and the pid (each 4 bytes), actually
 368 * forces the entire structure to be unaligned. This causes
369 * the kernel to issue unaligned access warnings on some
370 * architectures like ia64. Unfortunately, some software out there
371 * doesn't properly unroll the NLA packet and assumes that the start
372 * of the taskstats structure will always be 20 bytes from the start
373 * of the netlink payload. Aligning the start of the taskstats
374 * structure breaks this software, which we don't want. So, for now
375 * the alignment only happens on architectures that require it
376 * and those users will have to update to fixed versions of those
377 * packages. Space is reserved in the packet only when needed.
378 * This ifdef should be removed in several years e.g. 2012 once
379 * we can be confident that fixed versions are installed on most
380 * systems. We add the padding before the aggregate since the
381 * aggregate is already a defined type.
382 */
383#ifdef TASKSTATS_NEEDS_PADDING
384 if (nla_put(skb, TASKSTATS_TYPE_NULL, 0, NULL) < 0)
385 goto err;
386#endif
367 na = nla_nest_start(skb, aggr); 387 na = nla_nest_start(skb, aggr);
368 if (!na) 388 if (!na)
369 goto err; 389 goto err;
370 if (nla_put(skb, type, pid_size, pids) < 0) 390
391 if (nla_put(skb, type, sizeof(pid), &pid) < 0)
371 goto err; 392 goto err;
372 ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats)); 393 ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats));
373 if (!ret) 394 if (!ret)
@@ -456,6 +477,18 @@ out:
456 return rc; 477 return rc;
457} 478}
458 479
480static size_t taskstats_packet_size(void)
481{
482 size_t size;
483
484 size = nla_total_size(sizeof(u32)) +
485 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
486#ifdef TASKSTATS_NEEDS_PADDING
487 size += nla_total_size(0); /* Padding for alignment */
488#endif
489 return size;
490}
491
459static int cmd_attr_pid(struct genl_info *info) 492static int cmd_attr_pid(struct genl_info *info)
460{ 493{
461 struct taskstats *stats; 494 struct taskstats *stats;
@@ -464,8 +497,7 @@ static int cmd_attr_pid(struct genl_info *info)
464 u32 pid; 497 u32 pid;
465 int rc; 498 int rc;
466 499
467 size = nla_total_size(sizeof(u32)) + 500 size = taskstats_packet_size();
468 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
469 501
470 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); 502 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size);
471 if (rc < 0) 503 if (rc < 0)
@@ -494,8 +526,7 @@ static int cmd_attr_tgid(struct genl_info *info)
494 u32 tgid; 526 u32 tgid;
495 int rc; 527 int rc;
496 528
497 size = nla_total_size(sizeof(u32)) + 529 size = taskstats_packet_size();
498 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
499 530
500 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); 531 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size);
501 if (rc < 0) 532 if (rc < 0)
@@ -570,8 +601,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
570 /* 601 /*
571 * Size includes space for nested attributes 602 * Size includes space for nested attributes
572 */ 603 */
573 size = nla_total_size(sizeof(u32)) + 604 size = taskstats_packet_size();
574 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
575 605
576 is_thread_group = !!taskstats_tgid_alloc(tsk); 606 is_thread_group = !!taskstats_tgid_alloc(tsk);
577 if (is_thread_group) { 607 if (is_thread_group) {
@@ -581,7 +611,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
581 fill_tgid_exit(tsk); 611 fill_tgid_exit(tsk);
582 } 612 }
583 613
584 listeners = &__raw_get_cpu_var(listener_array); 614 listeners = __this_cpu_ptr(&listener_array);
585 if (list_empty(&listeners->list)) 615 if (list_empty(&listeners->list))
586 return; 616 return;
587 617
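
Editor's note: the alignment comment above has a userspace consequence: a TASKSTATS_CMD_NEW reply may now begin with a zero-length TASKSTATS_TYPE_NULL pad attribute, so parsers must walk the netlink attribute headers rather than assume the stats sit 20 bytes into the payload. The sketch below shows such a walk over the genetlink payload; find_taskstats and the calling convention are illustrative and error handling is minimal.

#include <stddef.h>
#include <string.h>
#include <linux/netlink.h>
#include <linux/genetlink.h>
#include <linux/taskstats.h>

/*
 * Walk the attributes of a TASKSTATS_CMD_NEW payload (the bytes after
 * the genlmsghdr), descend into the AGGR_PID/AGGR_TGID nest, and copy
 * out the taskstats block, skipping any padding attribute on the way.
 */
static int find_taskstats(const void *payload, int len, struct taskstats *out)
{
	const struct nlattr *na = payload;

	while (len >= NLA_HDRLEN && na->nla_len >= NLA_HDRLEN &&
	       na->nla_len <= len) {
		int type = na->nla_type & NLA_TYPE_MASK;

		if (type == TASKSTATS_TYPE_AGGR_PID ||
		    type == TASKSTATS_TYPE_AGGR_TGID)
			/* Recurse into the nested PID/STATS attributes. */
			return find_taskstats((const char *)na + NLA_HDRLEN,
					      na->nla_len - NLA_HDRLEN, out);

		if (type == TASKSTATS_TYPE_STATS) {
			memcpy(out, (const char *)na + NLA_HDRLEN, sizeof(*out));
			return 0;
		}
		/* TASKSTATS_TYPE_NULL padding and PID/TGID are skipped here. */
		len -= NLA_ALIGN(na->nla_len);
		na = (const struct nlattr *)((const char *)na +
					     NLA_ALIGN(na->nla_len));
	}
	return -1;
}

Because the walk respects nla_len and NLA_ALIGN(), the extra padding attribute is skipped transparently; only code that hard-codes the 20-byte offset mentioned in the comment breaks.
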
diff --git a/kernel/time.c b/kernel/time.c
index ba9b338d1835..32174359576f 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -238,7 +238,7 @@ EXPORT_SYMBOL(current_fs_time);
238 * Avoid unnecessary multiplications/divisions in the 238 * Avoid unnecessary multiplications/divisions in the
239 * two most common HZ cases: 239 * two most common HZ cases:
240 */ 240 */
241unsigned int inline jiffies_to_msecs(const unsigned long j) 241inline unsigned int jiffies_to_msecs(const unsigned long j)
242{ 242{
243#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) 243#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
244 return (MSEC_PER_SEC / HZ) * j; 244 return (MSEC_PER_SEC / HZ) * j;
@@ -254,7 +254,7 @@ unsigned int inline jiffies_to_msecs(const unsigned long j)
254} 254}
255EXPORT_SYMBOL(jiffies_to_msecs); 255EXPORT_SYMBOL(jiffies_to_msecs);
256 256
257unsigned int inline jiffies_to_usecs(const unsigned long j) 257inline unsigned int jiffies_to_usecs(const unsigned long j)
258{ 258{
259#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) 259#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
260 return (USEC_PER_SEC / HZ) * j; 260 return (USEC_PER_SEC / HZ) * j;
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index c18d7efa1b4b..6519cf62d9cd 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -113,7 +113,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time);
113 * @shift: pointer to shift variable 113 * @shift: pointer to shift variable
114 * @from: frequency to convert from 114 * @from: frequency to convert from
115 * @to: frequency to convert to 115 * @to: frequency to convert to
116 * @minsec: guaranteed runtime conversion range in seconds 116 * @maxsec: guaranteed runtime conversion range in seconds
117 * 117 *
118 * The function evaluates the shift/mult pair for the scaled math 118 * The function evaluates the shift/mult pair for the scaled math
119 * operations of clocksources and clockevents. 119 * operations of clocksources and clockevents.
@@ -122,7 +122,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time);
122 * NSEC_PER_SEC == 1GHz and @from is the counter frequency. For clock 122 * NSEC_PER_SEC == 1GHz and @from is the counter frequency. For clock
123 * event @to is the counter frequency and @from is NSEC_PER_SEC. 123 * event @to is the counter frequency and @from is NSEC_PER_SEC.
124 * 124 *
125 * The @minsec conversion range argument controls the time frame in 125 * The @maxsec conversion range argument controls the time frame in
126 * seconds which must be covered by the runtime conversion with the 126 * seconds which must be covered by the runtime conversion with the
127 * calculated mult and shift factors. This guarantees that no 64bit 127 * calculated mult and shift factors. This guarantees that no 64bit
128 * overflow happens when the input value of the conversion is 128 * overflow happens when the input value of the conversion is
@@ -131,7 +131,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time);
131 * factors. 131 * factors.
132 */ 132 */
133void 133void
134clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec) 134clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec)
135{ 135{
136 u64 tmp; 136 u64 tmp;
137 u32 sft, sftacc= 32; 137 u32 sft, sftacc= 32;
@@ -140,7 +140,7 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec)
140 * Calculate the shift factor which is limiting the conversion 140 * Calculate the shift factor which is limiting the conversion
141 * range: 141 * range:
142 */ 142 */
143 tmp = ((u64)minsec * from) >> 32; 143 tmp = ((u64)maxsec * from) >> 32;
144 while (tmp) { 144 while (tmp) {
145 tmp >>=1; 145 tmp >>=1;
146 sftacc--; 146 sftacc--;
@@ -152,6 +152,7 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec)
152 */ 152 */
153 for (sft = 32; sft > 0; sft--) { 153 for (sft = 32; sft > 0; sft--) {
154 tmp = (u64) to << sft; 154 tmp = (u64) to << sft;
155 tmp += from / 2;
155 do_div(tmp, from); 156 do_div(tmp, from);
156 if ((tmp >> sftacc) == 0) 157 if ((tmp >> sftacc) == 0)
157 break; 158 break;
@@ -678,7 +679,7 @@ EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale);
678int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) 679int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
679{ 680{
680 681
681 /* Intialize mult/shift and max_idle_ns */ 682 /* Initialize mult/shift and max_idle_ns */
682 __clocksource_updatefreq_scale(cs, scale, freq); 683 __clocksource_updatefreq_scale(cs, scale, freq);
683 684
684 /* Add clocksource to the clcoksource list */ 685 /* Add clocksource to the clcoksource list */
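
Editor's note: the one-line change above (tmp += from / 2) rounds the mult computation to nearest instead of truncating, tightening the cycles-to-nanoseconds conversion ns = (cycles * mult) >> shift. A standalone sketch of the same calculation for a 19.2 MHz counter; the frequency and the calc_mult_shift name are just examples.

#include <stdint.h>
#include <stdio.h>

/* Mirror of clocks_calc_mult_shift(): find mult/shift so that
 * (cycles * mult) >> shift converts @from ticks/s into @to ticks/s
 * without 64-bit overflow for intervals up to @maxsec seconds. */
static void calc_mult_shift(uint32_t *mult, uint32_t *shift,
			    uint32_t from, uint32_t to, uint32_t maxsec)
{
	uint64_t tmp;
	uint32_t sft, sftacc = 32;

	tmp = ((uint64_t)maxsec * from) >> 32;
	while (tmp) {
		tmp >>= 1;
		sftacc--;
	}

	for (sft = 32; sft > 0; sft--) {
		tmp = (uint64_t)to << sft;
		tmp += from / 2;		/* the rounding added above */
		tmp /= from;
		if ((tmp >> sftacc) == 0)
			break;
	}
	*mult = (uint32_t)tmp;
	*shift = sft;
}

int main(void)
{
	uint32_t mult, shift;

	calc_mult_shift(&mult, &shift, 19200000, 1000000000, 600);
	/* One second worth of 19.2 MHz cycles should print as ~1e9 ns. */
	printf("mult=%u shift=%u 1s=%llu ns\n", mult, shift,
	       (unsigned long long)(((uint64_t)19200000 * mult) >> shift));
	return 0;
}
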
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index d2321891538f..5c00242fa921 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -14,6 +14,7 @@
14#include <linux/timex.h> 14#include <linux/timex.h>
15#include <linux/time.h> 15#include <linux/time.h>
16#include <linux/mm.h> 16#include <linux/mm.h>
17#include <linux/module.h>
17 18
18/* 19/*
19 * NTP timekeeping variables: 20 * NTP timekeeping variables:
@@ -74,6 +75,162 @@ static long time_adjust;
74/* constant (boot-param configurable) NTP tick adjustment (upscaled) */ 75/* constant (boot-param configurable) NTP tick adjustment (upscaled) */
75static s64 ntp_tick_adj; 76static s64 ntp_tick_adj;
76 77
78#ifdef CONFIG_NTP_PPS
79
80/*
81 * The following variables are used when a pulse-per-second (PPS) signal
82 * is available. They establish the engineering parameters of the clock
83 * discipline loop when controlled by the PPS signal.
84 */
85#define PPS_VALID 10 /* PPS signal watchdog max (s) */
86#define PPS_POPCORN 4 /* popcorn spike threshold (shift) */
87#define PPS_INTMIN 2 /* min freq interval (s) (shift) */
88#define PPS_INTMAX 8 /* max freq interval (s) (shift) */
89#define PPS_INTCOUNT 4 /* number of consecutive good intervals to
90 increase pps_shift or consecutive bad
91 intervals to decrease it */
92#define PPS_MAXWANDER 100000 /* max PPS freq wander (ns/s) */
93
94static int pps_valid; /* signal watchdog counter */
95static long pps_tf[3]; /* phase median filter */
96static long pps_jitter; /* current jitter (ns) */
97static struct timespec pps_fbase; /* beginning of the last freq interval */
98static int pps_shift; /* current interval duration (s) (shift) */
99static int pps_intcnt; /* interval counter */
100static s64 pps_freq; /* frequency offset (scaled ns/s) */
101static long pps_stabil; /* current stability (scaled ns/s) */
102
103/*
104 * PPS signal quality monitors
105 */
106static long pps_calcnt; /* calibration intervals */
107static long pps_jitcnt; /* jitter limit exceeded */
108static long pps_stbcnt; /* stability limit exceeded */
109static long pps_errcnt; /* calibration errors */
110
111
112/* PPS kernel consumer compensates the whole phase error immediately.
113 * Otherwise, reduce the offset by a fixed factor times the time constant.
114 */
115static inline s64 ntp_offset_chunk(s64 offset)
116{
117 if (time_status & STA_PPSTIME && time_status & STA_PPSSIGNAL)
118 return offset;
119 else
120 return shift_right(offset, SHIFT_PLL + time_constant);
121}
122
123static inline void pps_reset_freq_interval(void)
124{
125 /* the PPS calibration interval may end
126 surprisingly early */
127 pps_shift = PPS_INTMIN;
128 pps_intcnt = 0;
129}
130
131/**
132 * pps_clear - Clears the PPS state variables
133 *
134 * Must be called while holding a write on the xtime_lock
135 */
136static inline void pps_clear(void)
137{
138 pps_reset_freq_interval();
139 pps_tf[0] = 0;
140 pps_tf[1] = 0;
141 pps_tf[2] = 0;
142 pps_fbase.tv_sec = pps_fbase.tv_nsec = 0;
143 pps_freq = 0;
144}
145
146/* Decrease pps_valid to indicate that another second has passed since
147 * the last PPS signal. When it reaches 0, indicate that PPS signal is
148 * missing.
149 *
150 * Must be called while holding a write on the xtime_lock
151 */
152static inline void pps_dec_valid(void)
153{
154 if (pps_valid > 0)
155 pps_valid--;
156 else {
157 time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER |
158 STA_PPSWANDER | STA_PPSERROR);
159 pps_clear();
160 }
161}
162
163static inline void pps_set_freq(s64 freq)
164{
165 pps_freq = freq;
166}
167
168static inline int is_error_status(int status)
169{
170 return (time_status & (STA_UNSYNC|STA_CLOCKERR))
171 /* PPS signal lost when either PPS time or
172 * PPS frequency synchronization requested
173 */
174 || ((time_status & (STA_PPSFREQ|STA_PPSTIME))
175 && !(time_status & STA_PPSSIGNAL))
176 /* PPS jitter exceeded when
177 * PPS time synchronization requested */
178 || ((time_status & (STA_PPSTIME|STA_PPSJITTER))
179 == (STA_PPSTIME|STA_PPSJITTER))
180 /* PPS wander exceeded or calibration error when
181 * PPS frequency synchronization requested
182 */
183 || ((time_status & STA_PPSFREQ)
184 && (time_status & (STA_PPSWANDER|STA_PPSERROR)));
185}
186
187static inline void pps_fill_timex(struct timex *txc)
188{
189 txc->ppsfreq = shift_right((pps_freq >> PPM_SCALE_INV_SHIFT) *
190 PPM_SCALE_INV, NTP_SCALE_SHIFT);
191 txc->jitter = pps_jitter;
192 if (!(time_status & STA_NANO))
193 txc->jitter /= NSEC_PER_USEC;
194 txc->shift = pps_shift;
195 txc->stabil = pps_stabil;
196 txc->jitcnt = pps_jitcnt;
197 txc->calcnt = pps_calcnt;
198 txc->errcnt = pps_errcnt;
199 txc->stbcnt = pps_stbcnt;
200}
201
202#else /* !CONFIG_NTP_PPS */
203
204static inline s64 ntp_offset_chunk(s64 offset)
205{
206 return shift_right(offset, SHIFT_PLL + time_constant);
207}
208
209static inline void pps_reset_freq_interval(void) {}
210static inline void pps_clear(void) {}
211static inline void pps_dec_valid(void) {}
212static inline void pps_set_freq(s64 freq) {}
213
214static inline int is_error_status(int status)
215{
216 return status & (STA_UNSYNC|STA_CLOCKERR);
217}
218
219static inline void pps_fill_timex(struct timex *txc)
220{
221 /* PPS is not implemented, so these are zero */
222 txc->ppsfreq = 0;
223 txc->jitter = 0;
224 txc->shift = 0;
225 txc->stabil = 0;
226 txc->jitcnt = 0;
227 txc->calcnt = 0;
228 txc->errcnt = 0;
229 txc->stbcnt = 0;
230}
231
232#endif /* CONFIG_NTP_PPS */
233
77/* 234/*
78 * NTP methods: 235 * NTP methods:
79 */ 236 */
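
Editor's note: everything the PPS block above accumulates is exported through adjtimex(2); pps_fill_timex() (and its !CONFIG_NTP_PPS stub) fills the timex fields that used to be hard-wired to zero. A small read-only query using glibc's adjtimex() wrapper:

#include <stdio.h>
#include <sys/timex.h>

int main(void)
{
	struct timex tx = { .modes = 0 };	/* modes = 0: read-only query */
	int state = adjtimex(&tx);

	if (state == -1) {
		perror("adjtimex");
		return 1;
	}
	printf("clock state      : %d%s\n", state,
	       state == TIME_ERROR ? " (TIME_ERROR)" : "");
	printf("PPS signal       : %s\n",
	       (tx.status & STA_PPSSIGNAL) ? "present" : "absent");
	printf("PPS jitter       : %ld\n", tx.jitter);
	printf("PPS stability    : %ld\n", tx.stabil);
	printf("calibration intvl: 2^%d s\n", tx.shift);
	printf("calcnt/jitcnt/stbcnt/errcnt: %ld/%ld/%ld/%ld\n",
	       tx.calcnt, tx.jitcnt, tx.stbcnt, tx.errcnt);
	return 0;
}
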
@@ -185,6 +342,9 @@ void ntp_clear(void)
185 342
186 tick_length = tick_length_base; 343 tick_length = tick_length_base;
187 time_offset = 0; 344 time_offset = 0;
345
346 /* Clear PPS state variables */
347 pps_clear();
188} 348}
189 349
190/* 350/*
@@ -250,16 +410,16 @@ void second_overflow(void)
250 time_status |= STA_UNSYNC; 410 time_status |= STA_UNSYNC;
251 } 411 }
252 412
253 /* 413 /* Compute the phase adjustment for the next second */
254 * Compute the phase adjustment for the next second. The offset is
255 * reduced by a fixed factor times the time constant.
256 */
257 tick_length = tick_length_base; 414 tick_length = tick_length_base;
258 415
259 delta = shift_right(time_offset, SHIFT_PLL + time_constant); 416 delta = ntp_offset_chunk(time_offset);
260 time_offset -= delta; 417 time_offset -= delta;
261 tick_length += delta; 418 tick_length += delta;
262 419
420 /* Check PPS signal */
421 pps_dec_valid();
422
263 if (!time_adjust) 423 if (!time_adjust)
264 return; 424 return;
265 425
@@ -369,6 +529,8 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts)
369 if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) { 529 if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) {
370 time_state = TIME_OK; 530 time_state = TIME_OK;
371 time_status = STA_UNSYNC; 531 time_status = STA_UNSYNC;
532 /* restart PPS frequency calibration */
533 pps_reset_freq_interval();
372 } 534 }
373 535
374 /* 536 /*
@@ -418,6 +580,8 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts
418 time_freq = txc->freq * PPM_SCALE; 580 time_freq = txc->freq * PPM_SCALE;
419 time_freq = min(time_freq, MAXFREQ_SCALED); 581 time_freq = min(time_freq, MAXFREQ_SCALED);
420 time_freq = max(time_freq, -MAXFREQ_SCALED); 582 time_freq = max(time_freq, -MAXFREQ_SCALED);
583 /* update pps_freq */
584 pps_set_freq(time_freq);
421 } 585 }
422 586
423 if (txc->modes & ADJ_MAXERROR) 587 if (txc->modes & ADJ_MAXERROR)
@@ -508,7 +672,8 @@ int do_adjtimex(struct timex *txc)
508 } 672 }
509 673
510 result = time_state; /* mostly `TIME_OK' */ 674 result = time_state; /* mostly `TIME_OK' */
511 if (time_status & (STA_UNSYNC|STA_CLOCKERR)) 675 /* check for errors */
676 if (is_error_status(time_status))
512 result = TIME_ERROR; 677 result = TIME_ERROR;
513 678
514 txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) * 679 txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) *
@@ -522,15 +687,8 @@ int do_adjtimex(struct timex *txc)
522 txc->tick = tick_usec; 687 txc->tick = tick_usec;
523 txc->tai = time_tai; 688 txc->tai = time_tai;
524 689
525 /* PPS is not implemented, so these are zero */ 690 /* fill PPS status fields */
526 txc->ppsfreq = 0; 691 pps_fill_timex(txc);
527 txc->jitter = 0;
528 txc->shift = 0;
529 txc->stabil = 0;
530 txc->jitcnt = 0;
531 txc->calcnt = 0;
532 txc->errcnt = 0;
533 txc->stbcnt = 0;
534 692
535 write_sequnlock_irq(&xtime_lock); 693 write_sequnlock_irq(&xtime_lock);
536 694
@@ -544,6 +702,243 @@ int do_adjtimex(struct timex *txc)
544 return result; 702 return result;
545} 703}
546 704
705#ifdef CONFIG_NTP_PPS
706
707/* actually struct pps_normtime is good old struct timespec, but it is
708 * semantically different (and it is the reason why it was invented):
709 * pps_normtime.nsec has a range of ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ]
710 * while timespec.tv_nsec has a range of [0, NSEC_PER_SEC) */
711struct pps_normtime {
712 __kernel_time_t sec; /* seconds */
713 long nsec; /* nanoseconds */
714};
715
716/* normalize the timestamp so that nsec is in the
717 ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] interval */
718static inline struct pps_normtime pps_normalize_ts(struct timespec ts)
719{
720 struct pps_normtime norm = {
721 .sec = ts.tv_sec,
722 .nsec = ts.tv_nsec
723 };
724
725 if (norm.nsec > (NSEC_PER_SEC >> 1)) {
726 norm.nsec -= NSEC_PER_SEC;
727 norm.sec++;
728 }
729
730 return norm;
731}
732
733/* get current phase correction and jitter */
734static inline long pps_phase_filter_get(long *jitter)
735{
736 *jitter = pps_tf[0] - pps_tf[1];
737 if (*jitter < 0)
738 *jitter = -*jitter;
739
740 /* TODO: test various filters */
741 return pps_tf[0];
742}
743
744/* add the sample to the phase filter */
745static inline void pps_phase_filter_add(long err)
746{
747 pps_tf[2] = pps_tf[1];
748 pps_tf[1] = pps_tf[0];
749 pps_tf[0] = err;
750}
751
752/* decrease frequency calibration interval length.
753 * It is halved after four consecutive unstable intervals.
754 */
755static inline void pps_dec_freq_interval(void)
756{
757 if (--pps_intcnt <= -PPS_INTCOUNT) {
758 pps_intcnt = -PPS_INTCOUNT;
759 if (pps_shift > PPS_INTMIN) {
760 pps_shift--;
761 pps_intcnt = 0;
762 }
763 }
764}
765
766/* increase frequency calibration interval length.
767 * It is doubled after four consecutive stable intervals.
768 */
769static inline void pps_inc_freq_interval(void)
770{
771 if (++pps_intcnt >= PPS_INTCOUNT) {
772 pps_intcnt = PPS_INTCOUNT;
773 if (pps_shift < PPS_INTMAX) {
774 pps_shift++;
775 pps_intcnt = 0;
776 }
777 }
778}
779
780/* update clock frequency based on MONOTONIC_RAW clock PPS signal
781 * timestamps
782 *
783 * At the end of the calibration interval the difference between the
784 * first and last MONOTONIC_RAW clock timestamps divided by the length
785 * of the interval becomes the frequency update. If the interval was
786 * too long, the data are discarded.
787 * Returns the difference between old and new frequency values.
788 */
789static long hardpps_update_freq(struct pps_normtime freq_norm)
790{
791 long delta, delta_mod;
792 s64 ftemp;
793
794 /* check if the frequency interval was too long */
795 if (freq_norm.sec > (2 << pps_shift)) {
796 time_status |= STA_PPSERROR;
797 pps_errcnt++;
798 pps_dec_freq_interval();
799 pr_err("hardpps: PPSERROR: interval too long - %ld s\n",
800 freq_norm.sec);
801 return 0;
802 }
803
804 /* here the raw frequency offset and wander (stability) is
805 * calculated. If the wander is less than the wander threshold
806 * the interval is increased; otherwise it is decreased.
807 */
808 ftemp = div_s64(((s64)(-freq_norm.nsec)) << NTP_SCALE_SHIFT,
809 freq_norm.sec);
810 delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT);
811 pps_freq = ftemp;
812 if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) {
813 pr_warning("hardpps: PPSWANDER: change=%ld\n", delta);
814 time_status |= STA_PPSWANDER;
815 pps_stbcnt++;
816 pps_dec_freq_interval();
817 } else { /* good sample */
818 pps_inc_freq_interval();
819 }
820
821 /* the stability metric is calculated as the average of recent
822 * frequency changes, but is used only for performance
823 * monitoring
824 */
825 delta_mod = delta;
826 if (delta_mod < 0)
827 delta_mod = -delta_mod;
828 pps_stabil += (div_s64(((s64)delta_mod) <<
829 (NTP_SCALE_SHIFT - SHIFT_USEC),
830 NSEC_PER_USEC) - pps_stabil) >> PPS_INTMIN;
831
832 /* if enabled, the system clock frequency is updated */
833 if ((time_status & STA_PPSFREQ) != 0 &&
834 (time_status & STA_FREQHOLD) == 0) {
835 time_freq = pps_freq;
836 ntp_update_frequency();
837 }
838
839 return delta;
840}
841
842/* correct REALTIME clock phase error against PPS signal */
843static void hardpps_update_phase(long error)
844{
845 long correction = -error;
846 long jitter;
847
848 /* add the sample to the median filter */
849 pps_phase_filter_add(correction);
850 correction = pps_phase_filter_get(&jitter);
851
852 /* Nominal jitter is due to PPS signal noise. If it exceeds the
853 * threshold, the sample is discarded; otherwise, if so enabled,
854 * the time offset is updated.
855 */
856 if (jitter > (pps_jitter << PPS_POPCORN)) {
857 pr_warning("hardpps: PPSJITTER: jitter=%ld, limit=%ld\n",
858 jitter, (pps_jitter << PPS_POPCORN));
859 time_status |= STA_PPSJITTER;
860 pps_jitcnt++;
861 } else if (time_status & STA_PPSTIME) {
862 /* correct the time using the phase offset */
863 time_offset = div_s64(((s64)correction) << NTP_SCALE_SHIFT,
864 NTP_INTERVAL_FREQ);
865 /* cancel running adjtime() */
866 time_adjust = 0;
867 }
868 /* update jitter */
869 pps_jitter += (jitter - pps_jitter) >> PPS_INTMIN;
870}
871
872/*
873 * hardpps() - discipline CPU clock oscillator to external PPS signal
874 *
875 * This routine is called at each PPS signal arrival in order to
876 * discipline the CPU clock oscillator to the PPS signal. It takes two
877 * parameters: REALTIME and MONOTONIC_RAW clock timestamps. The former
878 * is used to correct clock phase error and the latter is used to
879 * correct the frequency.
880 *
881 * This code is based on David Mills's reference nanokernel
882 * implementation. It was mostly rewritten but keeps the same idea.
883 */
884void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
885{
886 struct pps_normtime pts_norm, freq_norm;
887 unsigned long flags;
888
889 pts_norm = pps_normalize_ts(*phase_ts);
890
891 write_seqlock_irqsave(&xtime_lock, flags);
892
893 /* clear the error bits, they will be set again if needed */
894 time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR);
895
896 /* indicate signal presence */
897 time_status |= STA_PPSSIGNAL;
898 pps_valid = PPS_VALID;
899
900 /* when called for the first time,
901 * just start the frequency interval */
902 if (unlikely(pps_fbase.tv_sec == 0)) {
903 pps_fbase = *raw_ts;
904 write_sequnlock_irqrestore(&xtime_lock, flags);
905 return;
906 }
907
908 /* ok, now we have a base for frequency calculation */
909 freq_norm = pps_normalize_ts(timespec_sub(*raw_ts, pps_fbase));
910
911 /* check that the signal is in the range
912 * [1s - MAXFREQ us, 1s + MAXFREQ us], otherwise reject it */
913 if ((freq_norm.sec == 0) ||
914 (freq_norm.nsec > MAXFREQ * freq_norm.sec) ||
915 (freq_norm.nsec < -MAXFREQ * freq_norm.sec)) {
916 time_status |= STA_PPSJITTER;
917 /* restart the frequency calibration interval */
918 pps_fbase = *raw_ts;
919 write_sequnlock_irqrestore(&xtime_lock, flags);
920 pr_err("hardpps: PPSJITTER: bad pulse\n");
921 return;
922 }
923
924 /* signal is ok */
925
926 /* check if the current frequency interval is finished */
927 if (freq_norm.sec >= (1 << pps_shift)) {
928 pps_calcnt++;
929 /* restart the frequency calibration interval */
930 pps_fbase = *raw_ts;
931 hardpps_update_freq(freq_norm);
932 }
933
934 hardpps_update_phase(pts_norm.nsec);
935
936 write_sequnlock_irqrestore(&xtime_lock, flags);
937}
938EXPORT_SYMBOL(hardpps);
939
940#endif /* CONFIG_NTP_PPS */
941
547static int __init ntp_tick_adj_setup(char *str) 942static int __init ntp_tick_adj_setup(char *str)
548{ 943{
549 ntp_tick_adj = simple_strtol(str, NULL, 0); 944 ntp_tick_adj = simple_strtol(str, NULL, 0);
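
Editor's note: the pps_normtime representation introduced above is what keeps the frequency math simple. After normalization, nsec is the signed error of the pulse against a whole second, so a calibration interval measured as 4.9999995 s becomes {sec = 5, nsec = -500} and the raw frequency offset is just -nsec/sec. A self-contained illustration; the names mirror the patch but this is not kernel code.

#include <stdio.h>
#include <time.h>

#define NSEC_PER_SEC 1000000000L

struct pps_normtime { time_t sec; long nsec; };

/* Shift tv_nsec from [0, 1s) into (-0.5s, 0.5s], carrying into sec. */
static struct pps_normtime pps_normalize_ts(struct timespec ts)
{
	struct pps_normtime norm = { ts.tv_sec, ts.tv_nsec };

	if (norm.nsec > (NSEC_PER_SEC >> 1)) {
		norm.nsec -= NSEC_PER_SEC;
		norm.sec++;
	}
	return norm;
}

int main(void)
{
	/* Raw length of a nominal 5 s calibration interval, 500 ns short. */
	struct timespec raw = { 4, 999999500 };
	struct pps_normtime n = pps_normalize_ts(raw);

	/* The clock counted 500 ns too few over 5 s, so it runs slow;
	 * the correction works out to +100 ns/s (+0.1 ppm). */
	printf("normalized: %ld s %+ld ns, freq correction %+ld ns/s\n",
	       (long)n.sec, n.nsec, -n.nsec / (long)n.sec);
	return 0;
}
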
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index b6b898d2eeef..051bc80a0c43 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -49,7 +49,7 @@ struct tick_device *tick_get_device(int cpu)
49 */ 49 */
50int tick_is_oneshot_available(void) 50int tick_is_oneshot_available(void)
51{ 51{
52 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; 52 struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
53 53
54 return dev && (dev->features & CLOCK_EVT_FEAT_ONESHOT); 54 return dev && (dev->features & CLOCK_EVT_FEAT_ONESHOT);
55} 55}
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index aada0e52680a..5cbc101f908b 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -95,7 +95,7 @@ int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires,
95 */ 95 */
96int tick_program_event(ktime_t expires, int force) 96int tick_program_event(ktime_t expires, int force)
97{ 97{
98 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; 98 struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
99 99
100 return tick_dev_program_event(dev, expires, force); 100 return tick_dev_program_event(dev, expires, force);
101} 101}
@@ -167,7 +167,7 @@ int tick_oneshot_mode_active(void)
167 int ret; 167 int ret;
168 168
169 local_irq_save(flags); 169 local_irq_save(flags);
170 ret = __get_cpu_var(tick_cpu_device).mode == TICKDEV_MODE_ONESHOT; 170 ret = __this_cpu_read(tick_cpu_device.mode) == TICKDEV_MODE_ONESHOT;
171 local_irq_restore(flags); 171 local_irq_restore(flags);
172 172
173 return ret; 173 return ret;
diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c
index ac38fbb176cc..a9ae369925ce 100644
--- a/kernel/time/timecompare.c
+++ b/kernel/time/timecompare.c
@@ -21,6 +21,7 @@
21#include <linux/module.h> 21#include <linux/module.h>
22#include <linux/slab.h> 22#include <linux/slab.h>
23#include <linux/math64.h> 23#include <linux/math64.h>
24#include <linux/kernel.h>
24 25
25/* 26/*
26 * fixed point arithmetic scale factor for skew 27 * fixed point arithmetic scale factor for skew
@@ -57,11 +58,11 @@ int timecompare_offset(struct timecompare *sync,
57 int index; 58 int index;
58 int num_samples = sync->num_samples; 59 int num_samples = sync->num_samples;
59 60
60 if (num_samples > sizeof(buffer)/sizeof(buffer[0])) { 61 if (num_samples > ARRAY_SIZE(buffer)) {
61 samples = kmalloc(sizeof(*samples) * num_samples, GFP_ATOMIC); 62 samples = kmalloc(sizeof(*samples) * num_samples, GFP_ATOMIC);
62 if (!samples) { 63 if (!samples) {
63 samples = buffer; 64 samples = buffer;
64 num_samples = sizeof(buffer)/sizeof(buffer[0]); 65 num_samples = ARRAY_SIZE(buffer);
65 } 66 }
66 } else { 67 } else {
67 samples = buffer; 68 samples = buffer;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 49010d822f72..d27c7562902c 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -32,6 +32,8 @@ struct timekeeper {
32 cycle_t cycle_interval; 32 cycle_t cycle_interval;
33 /* Number of clock shifted nano seconds in one NTP interval. */ 33 /* Number of clock shifted nano seconds in one NTP interval. */
34 u64 xtime_interval; 34 u64 xtime_interval;
35 /* shifted nano seconds left over when rounding cycle_interval */
36 s64 xtime_remainder;
35 /* Raw nano seconds accumulated per NTP interval. */ 37 /* Raw nano seconds accumulated per NTP interval. */
36 u32 raw_interval; 38 u32 raw_interval;
37 39
@@ -47,7 +49,7 @@ struct timekeeper {
47 u32 mult; 49 u32 mult;
48}; 50};
49 51
50struct timekeeper timekeeper; 52static struct timekeeper timekeeper;
51 53
52/** 54/**
53 * timekeeper_setup_internals - Set up internals to use clocksource clock. 55 * timekeeper_setup_internals - Set up internals to use clocksource clock.
@@ -62,7 +64,7 @@ struct timekeeper timekeeper;
62static void timekeeper_setup_internals(struct clocksource *clock) 64static void timekeeper_setup_internals(struct clocksource *clock)
63{ 65{
64 cycle_t interval; 66 cycle_t interval;
65 u64 tmp; 67 u64 tmp, ntpinterval;
66 68
67 timekeeper.clock = clock; 69 timekeeper.clock = clock;
68 clock->cycle_last = clock->read(clock); 70 clock->cycle_last = clock->read(clock);
@@ -70,6 +72,7 @@ static void timekeeper_setup_internals(struct clocksource *clock)
70 /* Do the ns -> cycle conversion first, using original mult */ 72 /* Do the ns -> cycle conversion first, using original mult */
71 tmp = NTP_INTERVAL_LENGTH; 73 tmp = NTP_INTERVAL_LENGTH;
72 tmp <<= clock->shift; 74 tmp <<= clock->shift;
75 ntpinterval = tmp;
73 tmp += clock->mult/2; 76 tmp += clock->mult/2;
74 do_div(tmp, clock->mult); 77 do_div(tmp, clock->mult);
75 if (tmp == 0) 78 if (tmp == 0)
@@ -80,6 +83,7 @@ static void timekeeper_setup_internals(struct clocksource *clock)
80 83
81 /* Go back from cycles -> shifted ns */ 84 /* Go back from cycles -> shifted ns */
82 timekeeper.xtime_interval = (u64) interval * clock->mult; 85 timekeeper.xtime_interval = (u64) interval * clock->mult;
86 timekeeper.xtime_remainder = ntpinterval - timekeeper.xtime_interval;
83 timekeeper.raw_interval = 87 timekeeper.raw_interval =
84 ((u64) interval * clock->mult) >> clock->shift; 88 ((u64) interval * clock->mult) >> clock->shift;
85 89
@@ -160,7 +164,7 @@ static struct timespec total_sleep_time;
160/* 164/*
161 * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. 165 * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock.
162 */ 166 */
163struct timespec raw_time; 167static struct timespec raw_time;
164 168
165/* flag for if timekeeping is suspended */ 169/* flag for if timekeeping is suspended */
166int __read_mostly timekeeping_suspended; 170int __read_mostly timekeeping_suspended;
@@ -284,6 +288,49 @@ void ktime_get_ts(struct timespec *ts)
284} 288}
285EXPORT_SYMBOL_GPL(ktime_get_ts); 289EXPORT_SYMBOL_GPL(ktime_get_ts);
286 290
291#ifdef CONFIG_NTP_PPS
292
293/**
294 * getnstime_raw_and_real - get day and raw monotonic time in timespec format
295 * @ts_raw: pointer to the timespec to be set to raw monotonic time
296 * @ts_real: pointer to the timespec to be set to the time of day
297 *
298 * This function reads both the time of day and raw monotonic time at the
299 * same time atomically and stores the resulting timestamps in timespec
300 * format.
301 */
302void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
303{
304 unsigned long seq;
305 s64 nsecs_raw, nsecs_real;
306
307 WARN_ON_ONCE(timekeeping_suspended);
308
309 do {
310 u32 arch_offset;
311
312 seq = read_seqbegin(&xtime_lock);
313
314 *ts_raw = raw_time;
315 *ts_real = xtime;
316
317 nsecs_raw = timekeeping_get_ns_raw();
318 nsecs_real = timekeeping_get_ns();
319
320 /* If arch requires, add in gettimeoffset() */
321 arch_offset = arch_gettimeoffset();
322 nsecs_raw += arch_offset;
323 nsecs_real += arch_offset;
324
325 } while (read_seqretry(&xtime_lock, seq));
326
327 timespec_add_ns(ts_raw, nsecs_raw);
328 timespec_add_ns(ts_real, nsecs_real);
329}
330EXPORT_SYMBOL(getnstime_raw_and_real);
331
332#endif /* CONFIG_NTP_PPS */
333
287/** 334/**
288 * do_gettimeofday - Returns the time of day in a timeval 335 * do_gettimeofday - Returns the time of day in a timeval
289 * @tv: pointer to the timeval to be set 336 * @tv: pointer to the timeval to be set
@@ -719,7 +766,8 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
719 766
720 /* Accumulate error between NTP and clock interval */ 767 /* Accumulate error between NTP and clock interval */
721 timekeeper.ntp_error += tick_length << shift; 768 timekeeper.ntp_error += tick_length << shift;
722 timekeeper.ntp_error -= timekeeper.xtime_interval << 769 timekeeper.ntp_error -=
770 (timekeeper.xtime_interval + timekeeper.xtime_remainder) <<
723 (timekeeper.ntp_error_shift + shift); 771 (timekeeper.ntp_error_shift + shift);
724 772
725 return offset; 773 return offset;
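
Editor's note: getnstime_raw_and_real() above is a textbook seqlock reader: both timestamps are sampled inside one read_seqbegin()/read_seqretry() window, so they are guaranteed to describe the same timekeeper update. Below is a C11 userspace analogue of that reader side; struct seq_clock and its fields are invented for the sketch, and the kernel version additionally folds in arch_gettimeoffset() as shown above.

#include <stdatomic.h>
#include <time.h>

struct seq_clock {
	atomic_uint seq;	/* odd while the writer is updating */
	struct timespec raw;	/* protected data */
	struct timespec real;
};

/* Reader: retry until a whole, consistent snapshot has been observed. */
static void read_both(struct seq_clock *c,
		      struct timespec *raw, struct timespec *real)
{
	unsigned int seq;

	for (;;) {
		seq = atomic_load_explicit(&c->seq, memory_order_acquire);
		if (seq & 1)
			continue;	/* writer in progress, try again */
		*raw  = c->raw;
		*real = c->real;
		atomic_thread_fence(memory_order_acquire);
		if (atomic_load_explicit(&c->seq, memory_order_relaxed) == seq)
			break;		/* no writer interleaved: snapshot ok */
	}
}
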
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index ab8f5e33fa92..32a19f9397fc 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -79,26 +79,26 @@ print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base,
79{ 79{
80 struct hrtimer *timer, tmp; 80 struct hrtimer *timer, tmp;
81 unsigned long next = 0, i; 81 unsigned long next = 0, i;
82 struct rb_node *curr; 82 struct timerqueue_node *curr;
83 unsigned long flags; 83 unsigned long flags;
84 84
85next_one: 85next_one:
86 i = 0; 86 i = 0;
87 raw_spin_lock_irqsave(&base->cpu_base->lock, flags); 87 raw_spin_lock_irqsave(&base->cpu_base->lock, flags);
88 88
89 curr = base->first; 89 curr = timerqueue_getnext(&base->active);
90 /* 90 /*
91 * Crude but we have to do this O(N*N) thing, because 91 * Crude but we have to do this O(N*N) thing, because
92 * we have to unlock the base when printing: 92 * we have to unlock the base when printing:
93 */ 93 */
94 while (curr && i < next) { 94 while (curr && i < next) {
95 curr = rb_next(curr); 95 curr = timerqueue_iterate_next(curr);
96 i++; 96 i++;
97 } 97 }
98 98
99 if (curr) { 99 if (curr) {
100 100
101 timer = rb_entry(curr, struct hrtimer, node); 101 timer = container_of(curr, struct hrtimer, node);
102 tmp = *timer; 102 tmp = *timer;
103 raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags); 103 raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags);
104 104
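
Editor's note: the timer_list.c hunk swaps the rb_node walk for the new timerqueue API; the only non-obvious piece is container_of(), which recovers the hrtimer from its embedded timerqueue_node. A tiny userspace illustration of the same pointer arithmetic, with a simplified container_of and made-up struct names:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct node { struct node *next; };

struct item {
	int value;
	struct node node;	/* embedded, like hrtimer.node */
};

int main(void)
{
	struct item it = { .value = 42 };
	struct node *n = &it.node;	/* what an iterator hands back */

	/* Subtracting the member offset recovers the containing object. */
	struct item *owner = container_of(n, struct item, node);
	printf("value = %d\n", owner->value);
	return 0;
}
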
diff --git a/kernel/timer.c b/kernel/timer.c
index 68a9ae7679b7..43ca9936f2d0 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -88,18 +88,6 @@ struct tvec_base boot_tvec_bases;
88EXPORT_SYMBOL(boot_tvec_bases); 88EXPORT_SYMBOL(boot_tvec_bases);
89static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; 89static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;
90 90
91/*
92 * Note that all tvec_bases are 2 byte aligned and lower bit of
93 * base in timer_list is guaranteed to be zero. Use the LSB to
94 * indicate whether the timer is deferrable.
95 *
96 * A deferrable timer will work normally when the system is busy, but
97 * will not cause a CPU to come out of idle just to service it; instead,
98 * the timer will be serviced when the CPU eventually wakes up with a
99 * subsequent non-deferrable timer.
100 */
101#define TBASE_DEFERRABLE_FLAG (0x1)
102
103/* Functions below help us manage 'deferrable' flag */ 91/* Functions below help us manage 'deferrable' flag */
104static inline unsigned int tbase_get_deferrable(struct tvec_base *base) 92static inline unsigned int tbase_get_deferrable(struct tvec_base *base)
105{ 93{
@@ -113,8 +101,7 @@ static inline struct tvec_base *tbase_get_base(struct tvec_base *base)
113 101
114static inline void timer_set_deferrable(struct timer_list *timer) 102static inline void timer_set_deferrable(struct timer_list *timer)
115{ 103{
116 timer->base = ((struct tvec_base *)((unsigned long)(timer->base) | 104 timer->base = TBASE_MAKE_DEFERRED(timer->base);
117 TBASE_DEFERRABLE_FLAG));
118} 105}
119 106
120static inline void 107static inline void
@@ -343,15 +330,6 @@ void set_timer_slack(struct timer_list *timer, int slack_hz)
343} 330}
344EXPORT_SYMBOL_GPL(set_timer_slack); 331EXPORT_SYMBOL_GPL(set_timer_slack);
345 332
346
347static inline void set_running_timer(struct tvec_base *base,
348 struct timer_list *timer)
349{
350#ifdef CONFIG_SMP
351 base->running_timer = timer;
352#endif
353}
354
355static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) 333static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
356{ 334{
357 unsigned long expires = timer->expires; 335 unsigned long expires = timer->expires;
@@ -936,15 +914,12 @@ int del_timer(struct timer_list *timer)
936} 914}
937EXPORT_SYMBOL(del_timer); 915EXPORT_SYMBOL(del_timer);
938 916
939#ifdef CONFIG_SMP
940/** 917/**
941 * try_to_del_timer_sync - Try to deactivate a timer 918 * try_to_del_timer_sync - Try to deactivate a timer
942 * @timer: timer do del 919 * @timer: timer do del
943 * 920 *
944 * This function tries to deactivate a timer. Upon successful (ret >= 0) 921 * This function tries to deactivate a timer. Upon successful (ret >= 0)
945 * exit the timer is not queued and the handler is not running on any CPU. 922 * exit the timer is not queued and the handler is not running on any CPU.
946 *
947 * It must not be called from interrupt contexts.
948 */ 923 */
949int try_to_del_timer_sync(struct timer_list *timer) 924int try_to_del_timer_sync(struct timer_list *timer)
950{ 925{
@@ -973,6 +948,7 @@ out:
973} 948}
974EXPORT_SYMBOL(try_to_del_timer_sync); 949EXPORT_SYMBOL(try_to_del_timer_sync);
975 950
951#ifdef CONFIG_SMP
976/** 952/**
977 * del_timer_sync - deactivate a timer and wait for the handler to finish. 953 * del_timer_sync - deactivate a timer and wait for the handler to finish.
978 * @timer: the timer to be deactivated 954 * @timer: the timer to be deactivated
@@ -983,7 +959,7 @@ EXPORT_SYMBOL(try_to_del_timer_sync);
983 * 959 *
984 * Synchronization rules: Callers must prevent restarting of the timer, 960 * Synchronization rules: Callers must prevent restarting of the timer,
985 * otherwise this function is meaningless. It must not be called from 961 * otherwise this function is meaningless. It must not be called from
986 * interrupt contexts. The caller must not hold locks which would prevent 962 * hardirq contexts. The caller must not hold locks which would prevent
987 * completion of the timer's handler. The timer's handler must not call 963 * completion of the timer's handler. The timer's handler must not call
988 * add_timer_on(). Upon exit the timer is not queued and the handler is 964 * add_timer_on(). Upon exit the timer is not queued and the handler is
989 * not running on any CPU. 965 * not running on any CPU.
@@ -993,14 +969,16 @@ EXPORT_SYMBOL(try_to_del_timer_sync);
993int del_timer_sync(struct timer_list *timer) 969int del_timer_sync(struct timer_list *timer)
994{ 970{
995#ifdef CONFIG_LOCKDEP 971#ifdef CONFIG_LOCKDEP
996 unsigned long flags; 972 local_bh_disable();
997
998 local_irq_save(flags);
999 lock_map_acquire(&timer->lockdep_map); 973 lock_map_acquire(&timer->lockdep_map);
1000 lock_map_release(&timer->lockdep_map); 974 lock_map_release(&timer->lockdep_map);
1001 local_irq_restore(flags); 975 local_bh_enable();
1002#endif 976#endif
1003 977 /*
978 * don't use it in hardirq context, because it
979 * could lead to deadlock.
980 */
981 WARN_ON(in_irq());
1004 for (;;) { 982 for (;;) {
1005 int ret = try_to_del_timer_sync(timer); 983 int ret = try_to_del_timer_sync(timer);
1006 if (ret >= 0) 984 if (ret >= 0)
@@ -1111,7 +1089,7 @@ static inline void __run_timers(struct tvec_base *base)
1111 1089
1112 timer_stats_account_timer(timer); 1090 timer_stats_account_timer(timer);
1113 1091
1114 set_running_timer(base, timer); 1092 base->running_timer = timer;
1115 detach_timer(timer, 1); 1093 detach_timer(timer, 1);
1116 1094
1117 spin_unlock_irq(&base->lock); 1095 spin_unlock_irq(&base->lock);
@@ -1119,7 +1097,7 @@ static inline void __run_timers(struct tvec_base *base)
1119 spin_lock_irq(&base->lock); 1097 spin_lock_irq(&base->lock);
1120 } 1098 }
1121 } 1099 }
1122 set_running_timer(base, NULL); 1100 base->running_timer = NULL;
1123 spin_unlock_irq(&base->lock); 1101 spin_unlock_irq(&base->lock);
1124} 1102}
1125 1103
@@ -1249,9 +1227,15 @@ static unsigned long cmp_next_hrtimer_event(unsigned long now,
1249 */ 1227 */
1250unsigned long get_next_timer_interrupt(unsigned long now) 1228unsigned long get_next_timer_interrupt(unsigned long now)
1251{ 1229{
1252 struct tvec_base *base = __get_cpu_var(tvec_bases); 1230 struct tvec_base *base = __this_cpu_read(tvec_bases);
1253 unsigned long expires; 1231 unsigned long expires;
1254 1232
1233 /*
1234 * Pretend that there is no timer pending if the cpu is offline.
1235 * Possible pending timers will be migrated later to an active cpu.
1236 */
1237 if (cpu_is_offline(smp_processor_id()))
1238 return now + NEXT_TIMER_MAX_DELTA;
1255 spin_lock(&base->lock); 1239 spin_lock(&base->lock);
1256 if (time_before_eq(base->next_timer, base->timer_jiffies)) 1240 if (time_before_eq(base->next_timer, base->timer_jiffies))
1257 base->next_timer = __next_timer_interrupt(base); 1241 base->next_timer = __next_timer_interrupt(base);
@@ -1292,7 +1276,7 @@ void update_process_times(int user_tick)
1292 */ 1276 */
1293static void run_timer_softirq(struct softirq_action *h) 1277static void run_timer_softirq(struct softirq_action *h)
1294{ 1278{
1295 struct tvec_base *base = __get_cpu_var(tvec_bases); 1279 struct tvec_base *base = __this_cpu_read(tvec_bases);
1296 1280
1297 hrtimer_run_pending(); 1281 hrtimer_run_pending();
1298 1282
@@ -1319,7 +1303,7 @@ void do_timer(unsigned long ticks)
1319{ 1303{
1320 jiffies_64 += ticks; 1304 jiffies_64 += ticks;
1321 update_wall_time(); 1305 update_wall_time();
1322 calc_global_load(); 1306 calc_global_load(ticks);
1323} 1307}
1324 1308
1325#ifdef __ARCH_WANT_SYS_ALARM 1309#ifdef __ARCH_WANT_SYS_ALARM
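
Editor's note: the reworked del_timer_sync() above drops the pretence of hardirq safety; the lockdep annotation now runs with only BH disabled and WARN_ON(in_irq()) documents that calling it from hard interrupt context can deadlock, since the handler may be running on another CPU while spinning on a lock the caller holds. A sketch of the intended teardown pattern for a self-rearming timer, assuming the setup_timer()/unsigned-long callback API of this era; my_dev and its fields are illustrative.

#include <linux/module.h>
#include <linux/timer.h>
#include <linux/jiffies.h>

struct my_dev {
	struct timer_list poll_timer;
	bool shutting_down;
};

static void my_poll(unsigned long data)
{
	struct my_dev *dev = (struct my_dev *)data;

	/* ... periodic work ... */

	/* Re-arm only while the device is live, so teardown can win. */
	if (!dev->shutting_down)
		mod_timer(&dev->poll_timer, jiffies + HZ);
}

static void my_start(struct my_dev *dev)
{
	dev->shutting_down = false;
	setup_timer(&dev->poll_timer, my_poll, (unsigned long)dev);
	mod_timer(&dev->poll_timer, jiffies + HZ);
}

static void my_stop(struct my_dev *dev)
{
	/* Process context, and no locks the handler takes: both required. */
	dev->shutting_down = true;
	del_timer_sync(&dev->poll_timer);
}
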
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index e04b8bcdef88..14674dce77a6 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -69,6 +69,21 @@ config EVENT_TRACING
69 select CONTEXT_SWITCH_TRACER 69 select CONTEXT_SWITCH_TRACER
70 bool 70 bool
71 71
72config EVENT_POWER_TRACING_DEPRECATED
73 depends on EVENT_TRACING
74 bool "Deprecated power event trace API, to be removed"
75 default y
76 help
77 Provides old power event types:
78 C-state/idle accounting events:
79 power:power_start
80 power:power_end
81 and old cpufreq accounting event:
82 power:power_frequency
83 This is for userspace compatibility
84 and will vanish after 5 kernel iterations,
85 namely 2.6.41.
86
72config CONTEXT_SWITCH_TRACER 87config CONTEXT_SWITCH_TRACER
73 bool 88 bool
74 89
@@ -126,7 +141,7 @@ if FTRACE
126config FUNCTION_TRACER 141config FUNCTION_TRACER
127 bool "Kernel Function Tracer" 142 bool "Kernel Function Tracer"
128 depends on HAVE_FUNCTION_TRACER 143 depends on HAVE_FUNCTION_TRACER
129 select FRAME_POINTER if (!ARM_UNWIND) 144 select FRAME_POINTER if !ARM_UNWIND && !S390
130 select KALLSYMS 145 select KALLSYMS
131 select GENERIC_TRACER 146 select GENERIC_TRACER
132 select CONTEXT_SWITCH_TRACER 147 select CONTEXT_SWITCH_TRACER
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 53f338190b26..761c510a06c5 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -52,7 +52,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
52endif 52endif
53obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o 53obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
54obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o 54obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
55obj-$(CONFIG_EVENT_TRACING) += power-traces.o 55obj-$(CONFIG_TRACEPOINTS) += power-traces.o
56ifeq ($(CONFIG_TRACING),y) 56ifeq ($(CONFIG_TRACING),y)
57obj-$(CONFIG_KGDB_KDB) += trace_kdb.o 57obj-$(CONFIG_KGDB_KDB) += trace_kdb.o
58endif 58endif
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index bc251ed66724..153562d0b93c 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -168,7 +168,6 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
168static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), 168static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
169 BLK_TC_ACT(BLK_TC_WRITE) }; 169 BLK_TC_ACT(BLK_TC_WRITE) };
170 170
171#define BLK_TC_HARDBARRIER BLK_TC_BARRIER
172#define BLK_TC_RAHEAD BLK_TC_AHEAD 171#define BLK_TC_RAHEAD BLK_TC_AHEAD
173 172
174/* The ilog2() calls fall out because they're constant */ 173/* The ilog2() calls fall out because they're constant */
@@ -196,7 +195,6 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
196 return; 195 return;
197 196
198 what |= ddir_act[rw & WRITE]; 197 what |= ddir_act[rw & WRITE];
199 what |= MASK_TC_BIT(rw, HARDBARRIER);
200 what |= MASK_TC_BIT(rw, SYNC); 198 what |= MASK_TC_BIT(rw, SYNC);
201 what |= MASK_TC_BIT(rw, RAHEAD); 199 what |= MASK_TC_BIT(rw, RAHEAD);
202 what |= MASK_TC_BIT(rw, META); 200 what |= MASK_TC_BIT(rw, META);
@@ -760,53 +758,58 @@ static void blk_add_trace_rq_complete(void *ignore,
760 * @q: queue the io is for 758 * @q: queue the io is for
761 * @bio: the source bio 759 * @bio: the source bio
762 * @what: the action 760 * @what: the action
761 * @error: error, if any
763 * 762 *
764 * Description: 763 * Description:
765 * Records an action against a bio. Will log the bio offset + size. 764 * Records an action against a bio. Will log the bio offset + size.
766 * 765 *
767 **/ 766 **/
768static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, 767static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
769 u32 what) 768 u32 what, int error)
770{ 769{
771 struct blk_trace *bt = q->blk_trace; 770 struct blk_trace *bt = q->blk_trace;
772 771
773 if (likely(!bt)) 772 if (likely(!bt))
774 return; 773 return;
775 774
775 if (!error && !bio_flagged(bio, BIO_UPTODATE))
776 error = EIO;
777
776 __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, 778 __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what,
777 !bio_flagged(bio, BIO_UPTODATE), 0, NULL); 779 error, 0, NULL);
778} 780}
779 781
780static void blk_add_trace_bio_bounce(void *ignore, 782static void blk_add_trace_bio_bounce(void *ignore,
781 struct request_queue *q, struct bio *bio) 783 struct request_queue *q, struct bio *bio)
782{ 784{
783 blk_add_trace_bio(q, bio, BLK_TA_BOUNCE); 785 blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0);
784} 786}
785 787
786static void blk_add_trace_bio_complete(void *ignore, 788static void blk_add_trace_bio_complete(void *ignore,
787 struct request_queue *q, struct bio *bio) 789 struct request_queue *q, struct bio *bio,
790 int error)
788{ 791{
789 blk_add_trace_bio(q, bio, BLK_TA_COMPLETE); 792 blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error);
790} 793}
791 794
792static void blk_add_trace_bio_backmerge(void *ignore, 795static void blk_add_trace_bio_backmerge(void *ignore,
793 struct request_queue *q, 796 struct request_queue *q,
794 struct bio *bio) 797 struct bio *bio)
795{ 798{
796 blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE); 799 blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0);
797} 800}
798 801
799static void blk_add_trace_bio_frontmerge(void *ignore, 802static void blk_add_trace_bio_frontmerge(void *ignore,
800 struct request_queue *q, 803 struct request_queue *q,
801 struct bio *bio) 804 struct bio *bio)
802{ 805{
803 blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE); 806 blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0);
804} 807}
805 808
806static void blk_add_trace_bio_queue(void *ignore, 809static void blk_add_trace_bio_queue(void *ignore,
807 struct request_queue *q, struct bio *bio) 810 struct request_queue *q, struct bio *bio)
808{ 811{
809 blk_add_trace_bio(q, bio, BLK_TA_QUEUE); 812 blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0);
810} 813}
811 814
812static void blk_add_trace_getrq(void *ignore, 815static void blk_add_trace_getrq(void *ignore,
@@ -814,7 +817,7 @@ static void blk_add_trace_getrq(void *ignore,
814 struct bio *bio, int rw) 817 struct bio *bio, int rw)
815{ 818{
816 if (bio) 819 if (bio)
817 blk_add_trace_bio(q, bio, BLK_TA_GETRQ); 820 blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0);
818 else { 821 else {
819 struct blk_trace *bt = q->blk_trace; 822 struct blk_trace *bt = q->blk_trace;
820 823
@@ -829,7 +832,7 @@ static void blk_add_trace_sleeprq(void *ignore,
829 struct bio *bio, int rw) 832 struct bio *bio, int rw)
830{ 833{
831 if (bio) 834 if (bio)
832 blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ); 835 blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0);
833 else { 836 else {
834 struct blk_trace *bt = q->blk_trace; 837 struct blk_trace *bt = q->blk_trace;
835 838
@@ -889,7 +892,7 @@ static void blk_add_trace_split(void *ignore,
889} 892}
890 893
891/** 894/**
892 * blk_add_trace_remap - Add a trace for a remap operation 895 * blk_add_trace_bio_remap - Add a trace for a bio-remap operation
893 * @ignore: trace callback data parameter (not used) 896 * @ignore: trace callback data parameter (not used)
894 * @q: queue the io is for 897 * @q: queue the io is for
895 * @bio: the source bio 898 * @bio: the source bio
@@ -901,9 +904,9 @@ static void blk_add_trace_split(void *ignore,
901 * it spans a stripe (or similar). Add a trace for that action. 904 * it spans a stripe (or similar). Add a trace for that action.
902 * 905 *
903 **/ 906 **/
904static void blk_add_trace_remap(void *ignore, 907static void blk_add_trace_bio_remap(void *ignore,
905 struct request_queue *q, struct bio *bio, 908 struct request_queue *q, struct bio *bio,
906 dev_t dev, sector_t from) 909 dev_t dev, sector_t from)
907{ 910{
908 struct blk_trace *bt = q->blk_trace; 911 struct blk_trace *bt = q->blk_trace;
909 struct blk_io_trace_remap r; 912 struct blk_io_trace_remap r;
@@ -1018,7 +1021,7 @@ static void blk_register_tracepoints(void)
1018 WARN_ON(ret); 1021 WARN_ON(ret);
1019 ret = register_trace_block_split(blk_add_trace_split, NULL); 1022 ret = register_trace_block_split(blk_add_trace_split, NULL);
1020 WARN_ON(ret); 1023 WARN_ON(ret);
1021 ret = register_trace_block_remap(blk_add_trace_remap, NULL); 1024 ret = register_trace_block_bio_remap(blk_add_trace_bio_remap, NULL);
1022 WARN_ON(ret); 1025 WARN_ON(ret);
1023 ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); 1026 ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
1024 WARN_ON(ret); 1027 WARN_ON(ret);
@@ -1027,7 +1030,7 @@ static void blk_register_tracepoints(void)
1027static void blk_unregister_tracepoints(void) 1030static void blk_unregister_tracepoints(void)
1028{ 1031{
1029 unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); 1032 unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
1030 unregister_trace_block_remap(blk_add_trace_remap, NULL); 1033 unregister_trace_block_bio_remap(blk_add_trace_bio_remap, NULL);
1031 unregister_trace_block_split(blk_add_trace_split, NULL); 1034 unregister_trace_block_split(blk_add_trace_split, NULL);
1032 unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL); 1035 unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL);
1033 unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL); 1036 unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL);
@@ -1807,8 +1810,6 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
1807 1810
1808 if (rw & REQ_RAHEAD) 1811 if (rw & REQ_RAHEAD)
1809 rwbs[i++] = 'A'; 1812 rwbs[i++] = 'A';
1810 if (rw & REQ_HARDBARRIER)
1811 rwbs[i++] = 'B';
1812 if (rw & REQ_SYNC) 1813 if (rw & REQ_SYNC)
1813 rwbs[i++] = 'S'; 1814 rwbs[i++] = 'S';
1814 if (rw & REQ_META) 1815 if (rw & REQ_META)
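
The blktrace.c hunks above drop the HARDBARRIER bit from both the trace-action mask and the rwbs string builder, and teach blk_add_trace_bio() to carry an explicit error that falls back to EIO when BIO_UPTODATE is clear. As a side illustration of the rwbs encoding these hooks feed, here is a minimal userspace sketch of a flag-to-letter builder in the style of blk_fill_rwbs(); the RW_* bit values are invented for the example and are not the kernel's REQ_* flags:

#include <stdio.h>

/* Illustrative flag bits -- not the kernel's REQ_* values. */
#define RW_WRITE  (1u << 0)
#define RW_RAHEAD (1u << 1)
#define RW_SYNC   (1u << 2)
#define RW_META   (1u << 3)

/* Encode a request-flag word into a short action string, blk_fill_rwbs style. */
static void fill_rwbs(char *rwbs, unsigned int rw, int bytes)
{
    int i = 0;

    rwbs[i++] = (rw & RW_WRITE) ? 'W' : (bytes ? 'R' : 'N');
    if (rw & RW_RAHEAD)
        rwbs[i++] = 'A';
    if (rw & RW_SYNC)
        rwbs[i++] = 'S';
    if (rw & RW_META)
        rwbs[i++] = 'M';
    rwbs[i] = '\0';
}

int main(void)
{
    char buf[8];

    fill_rwbs(buf, RW_WRITE | RW_SYNC, 4096);
    printf("%s\n", buf);          /* prints "WS" -- no 'B' barrier letter anymore */
    return 0;
}
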
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
index a22582a06161..f55fcf61b223 100644
--- a/kernel/trace/power-traces.c
+++ b/kernel/trace/power-traces.c
@@ -13,5 +13,8 @@
13#define CREATE_TRACE_POINTS 13#define CREATE_TRACE_POINTS
14#include <trace/events/power.h> 14#include <trace/events/power.h>
15 15
16EXPORT_TRACEPOINT_SYMBOL_GPL(power_frequency); 16#ifdef EVENT_POWER_TRACING_DEPRECATED
17EXPORT_TRACEPOINT_SYMBOL_GPL(power_start);
18#endif
19EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle);
17 20
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 9ed509a015d8..bd1c35a4fbcc 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -3853,6 +3853,13 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3853 3853
3854 /* Need to copy one event at a time */ 3854 /* Need to copy one event at a time */
3855 do { 3855 do {
3856 /* We need the size of one event, because
3857 * rb_advance_reader only advances by one event,
3858 * whereas rb_event_ts_length may include the size of
3859 * one or two events.
3860 * We have already ensured there's enough space if this
3861 * is a time extend. */
3862 size = rb_event_length(event);
3856 memcpy(bpage->data + pos, rpage->data + rpos, size); 3863 memcpy(bpage->data + pos, rpage->data + rpos, size);
3857 3864
3858 len -= size; 3865 len -= size;
@@ -3867,7 +3874,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3867 event = rb_reader_event(cpu_buffer); 3874 event = rb_reader_event(cpu_buffer);
3868 /* Always keep the time extend and data together */ 3875 /* Always keep the time extend and data together */
3869 size = rb_event_ts_length(event); 3876 size = rb_event_ts_length(event);
3870 } while (len > size); 3877 } while (len >= size);
3871 3878
3872 /* update bpage */ 3879 /* update bpage */
3873 local_set(&bpage->commit, pos); 3880 local_set(&bpage->commit, pos);
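
The new comment and the relaxed len >= size condition in ring_buffer_read_page() hinge on the difference between rb_event_length() (exactly one event) and rb_event_ts_length() (a time extend plus its data). A rough, runnable userspace model of that copy loop follows; the toy record format and the one_len()/pair_len() helpers are inventions that only mirror the one-event versus maybe-two-events distinction:

#include <stdio.h>
#include <string.h>

/* Toy record format: [len][payload...]; a leading 0xFF marks a "time extend"
 * header (marker + one delta byte) that is always followed by a data record. */
static const unsigned char src[] = {
    3, 'a', 'b', 'c',
    0xFF, 5,                  /* time-extend header: marker + delta byte */
    4, 'd', 'e', 'f', 'g',
    2, 'h', 'i',
};

static size_t one_len(const unsigned char *p)   /* rb_event_length analogue */
{
    return (p[0] == 0xFF) ? 2 : (size_t)(1 + p[0]);
}

static size_t pair_len(const unsigned char *p)  /* rb_event_ts_length analogue */
{
    return (p[0] == 0xFF) ? 2 + one_len(p + 2) : one_len(p);
}

int main(void)
{
    unsigned char dst[sizeof(src)];
    size_t rpos = 0, pos = 0;
    size_t len = sizeof(src);
    size_t size = 0;

    do {
        size_t one = one_len(src + rpos);   /* copy exactly one record */
        memcpy(dst + pos, src + rpos, one);
        len  -= one;
        rpos += one;
        pos  += one;
        if (rpos >= sizeof(src))
            break;
        /* the remaining-space check spans a possible extend + data pair */
        size = pair_len(src + rpos);
    } while (len >= size);

    printf("copied %zu of %zu bytes\n", pos, sizeof(src));
    return 0;
}
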
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 82d9b8106cd0..dc53ecb80589 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -17,7 +17,6 @@
17#include <linux/writeback.h> 17#include <linux/writeback.h>
18#include <linux/kallsyms.h> 18#include <linux/kallsyms.h>
19#include <linux/seq_file.h> 19#include <linux/seq_file.h>
20#include <linux/smp_lock.h>
21#include <linux/notifier.h> 20#include <linux/notifier.h>
22#include <linux/irqflags.h> 21#include <linux/irqflags.h>
23#include <linux/debugfs.h> 22#include <linux/debugfs.h>
@@ -1284,6 +1283,8 @@ void trace_dump_stack(void)
1284 __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count()); 1283 __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count());
1285} 1284}
1286 1285
1286static DEFINE_PER_CPU(int, user_stack_count);
1287
1287void 1288void
1288ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) 1289ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1289{ 1290{
@@ -1302,10 +1303,20 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1302 if (unlikely(in_nmi())) 1303 if (unlikely(in_nmi()))
1303 return; 1304 return;
1304 1305
1306 /*
1307 * prevent recursion, since the user stack tracing may
1308 * trigger other kernel events.
1309 */
1310 preempt_disable();
1311 if (__this_cpu_read(user_stack_count))
1312 goto out;
1313
1314 __this_cpu_inc(user_stack_count);
1315
1305 event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, 1316 event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,
1306 sizeof(*entry), flags, pc); 1317 sizeof(*entry), flags, pc);
1307 if (!event) 1318 if (!event)
1308 return; 1319 goto out_drop_count;
1309 entry = ring_buffer_event_data(event); 1320 entry = ring_buffer_event_data(event);
1310 1321
1311 entry->tgid = current->tgid; 1322 entry->tgid = current->tgid;
@@ -1319,6 +1330,11 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1319 save_stack_trace_user(&trace); 1330 save_stack_trace_user(&trace);
1320 if (!filter_check_discard(call, entry, buffer, event)) 1331 if (!filter_check_discard(call, entry, buffer, event))
1321 ring_buffer_unlock_commit(buffer, event); 1332 ring_buffer_unlock_commit(buffer, event);
1333
1334 out_drop_count:
1335 __this_cpu_dec(user_stack_count);
1336 out:
1337 preempt_enable();
1322} 1338}
1323 1339
1324#ifdef UNUSED 1340#ifdef UNUSED
@@ -2320,11 +2336,19 @@ tracing_write_stub(struct file *filp, const char __user *ubuf,
2320 return count; 2336 return count;
2321} 2337}
2322 2338
2339static loff_t tracing_seek(struct file *file, loff_t offset, int origin)
2340{
2341 if (file->f_mode & FMODE_READ)
2342 return seq_lseek(file, offset, origin);
2343 else
2344 return 0;
2345}
2346
2323static const struct file_operations tracing_fops = { 2347static const struct file_operations tracing_fops = {
2324 .open = tracing_open, 2348 .open = tracing_open,
2325 .read = seq_read, 2349 .read = seq_read,
2326 .write = tracing_write_stub, 2350 .write = tracing_write_stub,
2327 .llseek = seq_lseek, 2351 .llseek = tracing_seek,
2328 .release = tracing_release, 2352 .release = tracing_release,
2329}; 2353};
2330 2354
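
The ftrace_trace_userstack() change guards against recursion with a per-CPU user_stack_count bracketed by preempt_disable()/preempt_enable(), because saving a user stack can itself generate further trace events. A minimal userspace sketch of the same guard shape, using a thread-local counter where the kernel uses a per-CPU variable; record_event() and capture_user_stack() are made-up names:

#include <stdio.h>

static _Thread_local int user_stack_count;   /* stands in for the per-CPU counter */

static void record_event(const char *what);

/* Something the event path may call back into, re-entering record_event(). */
static void capture_user_stack(void)
{
    record_event("nested-while-capturing");  /* would recurse without the guard */
}

static void record_event(const char *what)
{
    if (user_stack_count)                    /* already inside: bail out */
        return;
    user_stack_count++;

    printf("event: %s\n", what);
    capture_user_stack();

    user_stack_count--;
}

int main(void)
{
    record_event("top-level");               /* prints exactly one event */
    return 0;
}
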
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index e3dfecaf13e6..6cf223764be8 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -53,7 +53,7 @@
53 */ 53 */
54 54
55/* 55/*
56 * Function trace entry - function address and parent function addres: 56 * Function trace entry - function address and parent function address:
57 */ 57 */
58FTRACE_ENTRY(function, ftrace_entry, 58FTRACE_ENTRY(function, ftrace_entry,
59 59
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 39c059ca670e..19a359d5e6d5 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -21,17 +21,46 @@ typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)])
21/* Count the events in use (per event id, not per instance) */ 21/* Count the events in use (per event id, not per instance) */
22static int total_ref_count; 22static int total_ref_count;
23 23
24static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
25 struct perf_event *p_event)
26{
27 /* No tracing, just counting, so no obvious leak */
28 if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW))
29 return 0;
30
31 /* Some events are ok to be traced by non-root users... */
32 if (p_event->attach_state == PERF_ATTACH_TASK) {
33 if (tp_event->flags & TRACE_EVENT_FL_CAP_ANY)
34 return 0;
35 }
36
37 /*
38 * ...otherwise raw tracepoint data can be a severe data leak,
39 * only allow root to have these.
40 */
41 if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
42 return -EPERM;
43
44 return 0;
45}
46
24static int perf_trace_event_init(struct ftrace_event_call *tp_event, 47static int perf_trace_event_init(struct ftrace_event_call *tp_event,
25 struct perf_event *p_event) 48 struct perf_event *p_event)
26{ 49{
27 struct hlist_head __percpu *list; 50 struct hlist_head __percpu *list;
28 int ret = -ENOMEM; 51 int ret;
29 int cpu; 52 int cpu;
30 53
54 ret = perf_trace_event_perm(tp_event, p_event);
55 if (ret)
56 return ret;
57
31 p_event->tp_event = tp_event; 58 p_event->tp_event = tp_event;
32 if (tp_event->perf_refcount++ > 0) 59 if (tp_event->perf_refcount++ > 0)
33 return 0; 60 return 0;
34 61
62 ret = -ENOMEM;
63
35 list = alloc_percpu(struct hlist_head); 64 list = alloc_percpu(struct hlist_head);
36 if (!list) 65 if (!list)
37 goto fail; 66 goto fail;
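
perf_trace_event_perm() above boils down to a small decision table: no raw samples means nothing sensitive to leak, per-task events on a TRACE_EVENT_FL_CAP_ANY tracepoint are allowed, and anything else needs CAP_SYS_ADMIN while tracepoint raw access is restricted. A standalone restatement of that logic, with invented struct and field names and plain booleans standing in for the kernel predicates:

#include <stdio.h>
#include <errno.h>
#include <stdbool.h>

struct perm_query {
    bool wants_raw;     /* PERF_SAMPLE_RAW requested */
    bool per_task;      /* attach_state == PERF_ATTACH_TASK */
    bool cap_any;       /* tracepoint flagged TRACE_EVENT_FL_CAP_ANY */
    bool paranoid;      /* perf_paranoid_tracepoint_raw() */
    bool is_admin;      /* capable(CAP_SYS_ADMIN) */
};

static int tracepoint_perm(const struct perm_query *q)
{
    if (!q->wants_raw)                 /* just counting: no data to leak */
        return 0;
    if (q->per_task && q->cap_any)     /* explicitly allowed tracepoint */
        return 0;
    if (q->paranoid && !q->is_admin)   /* raw payloads are root-only */
        return -EPERM;
    return 0;
}

int main(void)
{
    struct perm_query q = { .wants_raw = true, .paranoid = true };

    printf("unprivileged raw access -> %d\n", tracepoint_perm(&q));  /* -EPERM */
    q.is_admin = true;
    printf("admin raw access        -> %d\n", tracepoint_perm(&q));  /* 0 */
    return 0;
}
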
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 0725eeab1937..35fde09b81de 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -27,6 +27,12 @@
27 27
28DEFINE_MUTEX(event_mutex); 28DEFINE_MUTEX(event_mutex);
29 29
30DEFINE_MUTEX(event_storage_mutex);
31EXPORT_SYMBOL_GPL(event_storage_mutex);
32
33char event_storage[EVENT_STORAGE_SIZE];
34EXPORT_SYMBOL_GPL(event_storage);
35
30LIST_HEAD(ftrace_events); 36LIST_HEAD(ftrace_events);
31LIST_HEAD(ftrace_common_fields); 37LIST_HEAD(ftrace_common_fields);
32 38
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 4ba44deaac25..4b74d71705c0 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -83,13 +83,19 @@ static void __always_unused ____ftrace_check_##name(void) \
83 83
84#undef __array 84#undef __array
85#define __array(type, item, len) \ 85#define __array(type, item, len) \
86 BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ 86 do { \
87 ret = trace_define_field(event_call, #type "[" #len "]", #item, \ 87 BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \
88 mutex_lock(&event_storage_mutex); \
89 snprintf(event_storage, sizeof(event_storage), \
90 "%s[%d]", #type, len); \
91 ret = trace_define_field(event_call, event_storage, #item, \
88 offsetof(typeof(field), item), \ 92 offsetof(typeof(field), item), \
89 sizeof(field.item), \ 93 sizeof(field.item), \
90 is_signed_type(type), FILTER_OTHER); \ 94 is_signed_type(type), FILTER_OTHER); \
91 if (ret) \ 95 mutex_unlock(&event_storage_mutex); \
92 return ret; 96 if (ret) \
97 return ret; \
98 } while (0);
93 99
94#undef __array_desc 100#undef __array_desc
95#define __array_desc(type, container, item, len) \ 101#define __array_desc(type, container, item, len) \
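
The rewritten __array() macro formats its "type[len]" string into the shared event_storage buffer while holding event_storage_mutex (both defined in the trace_events.c hunk above). A userspace sketch of that one-scratch-buffer-under-a-mutex pattern using pthreads; define_array_field() and print_field() are illustrative names, not kernel APIs:

#include <pthread.h>
#include <stdio.h>

#define EVENT_STORAGE_SIZE 128

static char event_storage[EVENT_STORAGE_SIZE];
static pthread_mutex_t event_storage_mutex = PTHREAD_MUTEX_INITIALIZER;

/* Format "type[len]" into the shared buffer and hand it to a consumer while
 * still holding the lock, so no other definer can scribble over it. */
static int define_array_field(const char *type, int len,
                              int (*define)(const char *type_str))
{
    int ret;

    pthread_mutex_lock(&event_storage_mutex);
    snprintf(event_storage, sizeof(event_storage), "%s[%d]", type, len);
    ret = define(event_storage);
    pthread_mutex_unlock(&event_storage_mutex);
    return ret;
}

static int print_field(const char *type_str)
{
    printf("field type: %s\n", type_str);
    return 0;
}

int main(void)
{
    return define_array_field("char", 16, print_field);   /* "char[16]" */
}
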
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 5cf8c602b880..92b6e1e12d98 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -453,14 +453,6 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1)
453 * Stubs: 453 * Stubs:
454 */ 454 */
455 455
456void early_boot_irqs_off(void)
457{
458}
459
460void early_boot_irqs_on(void)
461{
462}
463
464void trace_softirqs_on(unsigned long ip) 456void trace_softirqs_on(unsigned long ip)
465{ 457{
466} 458}
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 155a415b3209..659732eba07c 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -558,7 +558,7 @@ trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr)
558static int trace_wakeup_test_thread(void *data) 558static int trace_wakeup_test_thread(void *data)
559{ 559{
560 /* Make this a RT thread, doesn't need to be too high */ 560 /* Make this a RT thread, doesn't need to be too high */
561 struct sched_param param = { .sched_priority = 5 }; 561 static const struct sched_param param = { .sched_priority = 5 };
562 struct completion *x = data; 562 struct completion *x = data;
563 563
564 sched_setscheduler(current, SCHED_FIFO, &param); 564 sched_setscheduler(current, SCHED_FIFO, &param);
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index bac752f0cfb5..b706529b4fc7 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -23,9 +23,6 @@ static int syscall_exit_register(struct ftrace_event_call *event,
23static int syscall_enter_define_fields(struct ftrace_event_call *call); 23static int syscall_enter_define_fields(struct ftrace_event_call *call);
24static int syscall_exit_define_fields(struct ftrace_event_call *call); 24static int syscall_exit_define_fields(struct ftrace_event_call *call);
25 25
26/* All syscall exit events have the same fields */
27static LIST_HEAD(syscall_exit_fields);
28
29static struct list_head * 26static struct list_head *
30syscall_get_enter_fields(struct ftrace_event_call *call) 27syscall_get_enter_fields(struct ftrace_event_call *call)
31{ 28{
@@ -34,34 +31,28 @@ syscall_get_enter_fields(struct ftrace_event_call *call)
34 return &entry->enter_fields; 31 return &entry->enter_fields;
35} 32}
36 33
37static struct list_head *
38syscall_get_exit_fields(struct ftrace_event_call *call)
39{
40 return &syscall_exit_fields;
41}
42
43struct trace_event_functions enter_syscall_print_funcs = { 34struct trace_event_functions enter_syscall_print_funcs = {
44 .trace = print_syscall_enter, 35 .trace = print_syscall_enter,
45}; 36};
46 37
47struct trace_event_functions exit_syscall_print_funcs = { 38struct trace_event_functions exit_syscall_print_funcs = {
48 .trace = print_syscall_exit, 39 .trace = print_syscall_exit,
49}; 40};
50 41
51struct ftrace_event_class event_class_syscall_enter = { 42struct ftrace_event_class event_class_syscall_enter = {
52 .system = "syscalls", 43 .system = "syscalls",
53 .reg = syscall_enter_register, 44 .reg = syscall_enter_register,
54 .define_fields = syscall_enter_define_fields, 45 .define_fields = syscall_enter_define_fields,
55 .get_fields = syscall_get_enter_fields, 46 .get_fields = syscall_get_enter_fields,
56 .raw_init = init_syscall_trace, 47 .raw_init = init_syscall_trace,
57}; 48};
58 49
59struct ftrace_event_class event_class_syscall_exit = { 50struct ftrace_event_class event_class_syscall_exit = {
60 .system = "syscalls", 51 .system = "syscalls",
61 .reg = syscall_exit_register, 52 .reg = syscall_exit_register,
62 .define_fields = syscall_exit_define_fields, 53 .define_fields = syscall_exit_define_fields,
63 .get_fields = syscall_get_exit_fields, 54 .fields = LIST_HEAD_INIT(event_class_syscall_exit.fields),
64 .raw_init = init_syscall_trace, 55 .raw_init = init_syscall_trace,
65}; 56};
66 57
67extern unsigned long __start_syscalls_metadata[]; 58extern unsigned long __start_syscalls_metadata[];
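
The syscalls hunk removes the shared syscall_exit_fields list and its get_fields() helper in favour of statically initializing the class's own embedded .fields list head with LIST_HEAD_INIT(). A tiny self-contained model of such a self-referential static initializer; the list_head here is a stripped-down analogue of the kernel type:

#include <stdio.h>

struct list_head {
    struct list_head *next, *prev;
};

#define LIST_HEAD_INIT(name) { &(name), &(name) }

struct event_class {
    const char      *system;
    struct list_head fields;
};

/* The embedded list head is wired to itself at compile time, so the class
 * starts with a valid, empty field list without any runtime init call. */
static struct event_class event_class_syscall_exit = {
    .system = "syscalls",
    .fields = LIST_HEAD_INIT(event_class_syscall_exit.fields),
};

static int list_empty(const struct list_head *head)
{
    return head->next == head;
}

int main(void)
{
    printf("%s fields empty: %d\n",
           event_class_syscall_exit.system,
           list_empty(&event_class_syscall_exit.fields));
    return 0;
}
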
diff --git a/kernel/user.c b/kernel/user.c
index 2c7d8d5914b1..5c598ca781df 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -158,6 +158,7 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
158 spin_lock_irq(&uidhash_lock); 158 spin_lock_irq(&uidhash_lock);
159 up = uid_hash_find(uid, hashent); 159 up = uid_hash_find(uid, hashent);
160 if (up) { 160 if (up) {
161 put_user_ns(ns);
161 key_put(new->uid_keyring); 162 key_put(new->uid_keyring);
162 key_put(new->session_keyring); 163 key_put(new->session_keyring);
163 kmem_cache_free(uid_cachep, new); 164 kmem_cache_free(uid_cachep, new);
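
The single added put_user_ns() covers the race path in alloc_uid(): the new user_struct pinned the namespace before the hash was re-checked under uidhash_lock, so when the lookup finds an existing entry the unused copy must drop every reference it took. A compact userspace model of that allocate-optimistically, re-check-under-the-lock, release-the-loser pattern, with toy types and a much simplified lookup:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct ns { int refs; };                 /* toy namespace with a refcount */

static void ns_get(struct ns *ns) { ns->refs++; }
static void ns_put(struct ns *ns) { ns->refs--; }

struct user { struct ns *ns; int uid; };

static pthread_mutex_t uidhash_lock = PTHREAD_MUTEX_INITIALIZER;
static struct user *uid_table[16];       /* stand-in for the uid hash */

static struct user *alloc_uid(struct ns *ns, int uid)
{
    /* Optimistic allocation outside the lock; pins the namespace. */
    struct user *new = malloc(sizeof(*new));
    struct user *up;

    new->uid = uid;
    new->ns  = ns;
    ns_get(ns);

    pthread_mutex_lock(&uidhash_lock);
    up = uid_table[uid % 16];
    if (up) {
        /* Lost the race: drop everything the unused copy was holding. */
        ns_put(ns);                      /* the reference the fix above releases */
        free(new);
    } else {
        uid_table[uid % 16] = up = new;
    }
    pthread_mutex_unlock(&uidhash_lock);
    return up;
}

int main(void)
{
    struct ns root_ns = { .refs = 1 };

    alloc_uid(&root_ns, 42);
    alloc_uid(&root_ns, 42);                 /* duplicate: must not leak a ref */
    printf("ns refs: %d\n", root_ns.refs);   /* 2: the base ref + one user */
    return 0;
}
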
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 25915832291a..9da289c34f22 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -12,6 +12,8 @@
12#include <linux/highuid.h> 12#include <linux/highuid.h>
13#include <linux/cred.h> 13#include <linux/cred.h>
14 14
15static struct kmem_cache *user_ns_cachep __read_mostly;
16
15/* 17/*
16 * Create a new user namespace, deriving the creator from the user in the 18 * Create a new user namespace, deriving the creator from the user in the
17 * passed credentials, and replacing that user with the new root user for the 19 * passed credentials, and replacing that user with the new root user for the
@@ -26,7 +28,7 @@ int create_user_ns(struct cred *new)
26 struct user_struct *root_user; 28 struct user_struct *root_user;
27 int n; 29 int n;
28 30
29 ns = kmalloc(sizeof(struct user_namespace), GFP_KERNEL); 31 ns = kmem_cache_alloc(user_ns_cachep, GFP_KERNEL);
30 if (!ns) 32 if (!ns)
31 return -ENOMEM; 33 return -ENOMEM;
32 34
@@ -38,7 +40,7 @@ int create_user_ns(struct cred *new)
38 /* Alloc new root user. */ 40 /* Alloc new root user. */
39 root_user = alloc_uid(ns, 0); 41 root_user = alloc_uid(ns, 0);
40 if (!root_user) { 42 if (!root_user) {
41 kfree(ns); 43 kmem_cache_free(user_ns_cachep, ns);
42 return -ENOMEM; 44 return -ENOMEM;
43 } 45 }
44 46
@@ -71,7 +73,7 @@ static void free_user_ns_work(struct work_struct *work)
71 struct user_namespace *ns = 73 struct user_namespace *ns =
72 container_of(work, struct user_namespace, destroyer); 74 container_of(work, struct user_namespace, destroyer);
73 free_uid(ns->creator); 75 free_uid(ns->creator);
74 kfree(ns); 76 kmem_cache_free(user_ns_cachep, ns);
75} 77}
76 78
77void free_user_ns(struct kref *kref) 79void free_user_ns(struct kref *kref)
@@ -126,3 +128,10 @@ gid_t user_ns_map_gid(struct user_namespace *to, const struct cred *cred, gid_t
126 /* No useful relationship so no mapping */ 128 /* No useful relationship so no mapping */
127 return overflowgid; 129 return overflowgid;
128} 130}
131
132static __init int user_namespaces_init(void)
133{
134 user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC);
135 return 0;
136}
137module_init(user_namespaces_init);
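
user_namespace.c moves from kmalloc()/kfree() to a dedicated kmem_cache set up once from an initcall. The slab allocator has no direct userspace counterpart, so the sketch below only models the create-once, alloc, free-back-to-the-cache shape with a trivial free-list cache; obj_cache and its helpers are inventions:

#include <stdio.h>
#include <stdlib.h>

struct user_namespace { int dummy[8]; };   /* stand-in payload */

/* A toy fixed-size object cache: freed objects go on a free list for reuse. */
struct obj_cache {
    size_t  size;
    void   *free_list;
};

static struct obj_cache *cache_create(size_t size)
{
    struct obj_cache *c = malloc(sizeof(*c));

    c->size = size < sizeof(void *) ? sizeof(void *) : size;
    c->free_list = NULL;
    return c;
}

static void *cache_alloc(struct obj_cache *c)
{
    if (c->free_list) {
        void *obj = c->free_list;
        c->free_list = *(void **)obj;      /* pop a recycled object */
        return obj;
    }
    return malloc(c->size);
}

static void cache_free(struct obj_cache *c, void *obj)
{
    *(void **)obj = c->free_list;          /* push back for reuse */
    c->free_list = obj;
}

static struct obj_cache *user_ns_cachep;   /* set up once, like the initcall */

int main(void)
{
    user_ns_cachep = cache_create(sizeof(struct user_namespace));

    struct user_namespace *ns = cache_alloc(user_ns_cachep);
    cache_free(user_ns_cachep, ns);
    printf("reused: %d\n", cache_alloc(user_ns_cachep) == (void *)ns);  /* 1 */
    return 0;
}
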
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 6e3c41a4024c..d7ebdf4cea98 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -57,6 +57,8 @@ static int __init hardlockup_panic_setup(char *str)
57{ 57{
58 if (!strncmp(str, "panic", 5)) 58 if (!strncmp(str, "panic", 5))
59 hardlockup_panic = 1; 59 hardlockup_panic = 1;
60 else if (!strncmp(str, "0", 1))
61 no_watchdog = 1;
60 return 1; 62 return 1;
61} 63}
62__setup("nmi_watchdog=", hardlockup_panic_setup); 64__setup("nmi_watchdog=", hardlockup_panic_setup);
@@ -116,12 +118,12 @@ static void __touch_watchdog(void)
116{ 118{
117 int this_cpu = smp_processor_id(); 119 int this_cpu = smp_processor_id();
118 120
119 __get_cpu_var(watchdog_touch_ts) = get_timestamp(this_cpu); 121 __this_cpu_write(watchdog_touch_ts, get_timestamp(this_cpu));
120} 122}
121 123
122void touch_softlockup_watchdog(void) 124void touch_softlockup_watchdog(void)
123{ 125{
124 __raw_get_cpu_var(watchdog_touch_ts) = 0; 126 __this_cpu_write(watchdog_touch_ts, 0);
125} 127}
126EXPORT_SYMBOL(touch_softlockup_watchdog); 128EXPORT_SYMBOL(touch_softlockup_watchdog);
127 129
@@ -165,12 +167,12 @@ void touch_softlockup_watchdog_sync(void)
165/* watchdog detector functions */ 167/* watchdog detector functions */
166static int is_hardlockup(void) 168static int is_hardlockup(void)
167{ 169{
168 unsigned long hrint = __get_cpu_var(hrtimer_interrupts); 170 unsigned long hrint = __this_cpu_read(hrtimer_interrupts);
169 171
170 if (__get_cpu_var(hrtimer_interrupts_saved) == hrint) 172 if (__this_cpu_read(hrtimer_interrupts_saved) == hrint)
171 return 1; 173 return 1;
172 174
173 __get_cpu_var(hrtimer_interrupts_saved) = hrint; 175 __this_cpu_write(hrtimer_interrupts_saved, hrint);
174 return 0; 176 return 0;
175} 177}
176#endif 178#endif
@@ -203,8 +205,8 @@ static void watchdog_overflow_callback(struct perf_event *event, int nmi,
203 /* Ensure the watchdog never gets throttled */ 205 /* Ensure the watchdog never gets throttled */
204 event->hw.interrupts = 0; 206 event->hw.interrupts = 0;
205 207
206 if (__get_cpu_var(watchdog_nmi_touch) == true) { 208 if (__this_cpu_read(watchdog_nmi_touch) == true) {
207 __get_cpu_var(watchdog_nmi_touch) = false; 209 __this_cpu_write(watchdog_nmi_touch, false);
208 return; 210 return;
209 } 211 }
210 212
@@ -218,7 +220,7 @@ static void watchdog_overflow_callback(struct perf_event *event, int nmi,
218 int this_cpu = smp_processor_id(); 220 int this_cpu = smp_processor_id();
219 221
220 /* only print hardlockups once */ 222 /* only print hardlockups once */
221 if (__get_cpu_var(hard_watchdog_warn) == true) 223 if (__this_cpu_read(hard_watchdog_warn) == true)
222 return; 224 return;
223 225
224 if (hardlockup_panic) 226 if (hardlockup_panic)
@@ -226,16 +228,16 @@ static void watchdog_overflow_callback(struct perf_event *event, int nmi,
226 else 228 else
227 WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu); 229 WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu);
228 230
229 __get_cpu_var(hard_watchdog_warn) = true; 231 __this_cpu_write(hard_watchdog_warn, true);
230 return; 232 return;
231 } 233 }
232 234
233 __get_cpu_var(hard_watchdog_warn) = false; 235 __this_cpu_write(hard_watchdog_warn, false);
234 return; 236 return;
235} 237}
236static void watchdog_interrupt_count(void) 238static void watchdog_interrupt_count(void)
237{ 239{
238 __get_cpu_var(hrtimer_interrupts)++; 240 __this_cpu_inc(hrtimer_interrupts);
239} 241}
240#else 242#else
241static inline void watchdog_interrupt_count(void) { return; } 243static inline void watchdog_interrupt_count(void) { return; }
@@ -244,7 +246,7 @@ static inline void watchdog_interrupt_count(void) { return; }
244/* watchdog kicker functions */ 246/* watchdog kicker functions */
245static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) 247static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
246{ 248{
247 unsigned long touch_ts = __get_cpu_var(watchdog_touch_ts); 249 unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts);
248 struct pt_regs *regs = get_irq_regs(); 250 struct pt_regs *regs = get_irq_regs();
249 int duration; 251 int duration;
250 252
@@ -252,18 +254,18 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
252 watchdog_interrupt_count(); 254 watchdog_interrupt_count();
253 255
254 /* kick the softlockup detector */ 256 /* kick the softlockup detector */
255 wake_up_process(__get_cpu_var(softlockup_watchdog)); 257 wake_up_process(__this_cpu_read(softlockup_watchdog));
256 258
257 /* .. and repeat */ 259 /* .. and repeat */
258 hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period())); 260 hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period()));
259 261
260 if (touch_ts == 0) { 262 if (touch_ts == 0) {
261 if (unlikely(__get_cpu_var(softlockup_touch_sync))) { 263 if (unlikely(__this_cpu_read(softlockup_touch_sync))) {
262 /* 264 /*
263 * If the time stamp was touched atomically 265 * If the time stamp was touched atomically
264 * make sure the scheduler tick is up to date. 266 * make sure the scheduler tick is up to date.
265 */ 267 */
266 __get_cpu_var(softlockup_touch_sync) = false; 268 __this_cpu_write(softlockup_touch_sync, false);
267 sched_clock_tick(); 269 sched_clock_tick();
268 } 270 }
269 __touch_watchdog(); 271 __touch_watchdog();
@@ -279,7 +281,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
279 duration = is_softlockup(touch_ts); 281 duration = is_softlockup(touch_ts);
280 if (unlikely(duration)) { 282 if (unlikely(duration)) {
281 /* only warn once */ 283 /* only warn once */
282 if (__get_cpu_var(soft_watchdog_warn) == true) 284 if (__this_cpu_read(soft_watchdog_warn) == true)
283 return HRTIMER_RESTART; 285 return HRTIMER_RESTART;
284 286
285 printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", 287 printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
@@ -294,9 +296,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
294 296
295 if (softlockup_panic) 297 if (softlockup_panic)
296 panic("softlockup: hung tasks"); 298 panic("softlockup: hung tasks");
297 __get_cpu_var(soft_watchdog_warn) = true; 299 __this_cpu_write(soft_watchdog_warn, true);
298 } else 300 } else
299 __get_cpu_var(soft_watchdog_warn) = false; 301 __this_cpu_write(soft_watchdog_warn, false);
300 302
301 return HRTIMER_RESTART; 303 return HRTIMER_RESTART;
302} 304}
@@ -307,7 +309,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
307 */ 309 */
308static int watchdog(void *unused) 310static int watchdog(void *unused)
309{ 311{
310 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 312 static struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
311 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); 313 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
312 314
313 sched_setscheduler(current, SCHED_FIFO, &param); 315 sched_setscheduler(current, SCHED_FIFO, &param);
@@ -364,7 +366,8 @@ static int watchdog_nmi_enable(int cpu)
364 goto out_save; 366 goto out_save;
365 } 367 }
366 368
367 printk(KERN_ERR "NMI watchdog failed to create perf event on cpu%i: %p\n", cpu, event); 369 printk(KERN_ERR "NMI watchdog disabled for cpu%i: unable to create perf event: %ld\n",
370 cpu, PTR_ERR(event));
368 return PTR_ERR(event); 371 return PTR_ERR(event);
369 372
370 /* success path */ 373 /* success path */
@@ -547,13 +550,13 @@ static struct notifier_block __cpuinitdata cpu_nfb = {
547 .notifier_call = cpu_callback 550 .notifier_call = cpu_callback
548}; 551};
549 552
550static int __init spawn_watchdog_task(void) 553void __init lockup_detector_init(void)
551{ 554{
552 void *cpu = (void *)(long)smp_processor_id(); 555 void *cpu = (void *)(long)smp_processor_id();
553 int err; 556 int err;
554 557
555 if (no_watchdog) 558 if (no_watchdog)
556 return 0; 559 return;
557 560
558 err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); 561 err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
559 WARN_ON(notifier_to_errno(err)); 562 WARN_ON(notifier_to_errno(err));
@@ -561,6 +564,5 @@ static int __init spawn_watchdog_task(void)
561 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); 564 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
562 register_cpu_notifier(&cpu_nfb); 565 register_cpu_notifier(&cpu_nfb);
563 566
564 return 0; 567 return;
565} 568}
566early_initcall(spawn_watchdog_task);
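
The watchdog conversions replace __get_cpu_var() expressions with __this_cpu_read()/__this_cpu_write()/__this_cpu_inc(), each naming a single access to the current CPU's copy of the variable. As an analogy only, the sketch below uses thread-local storage as the "per-CPU" area and trivial macros of the same shape; the is_hardlockup() logic mirrors the hunk above:

#include <pthread.h>
#include <stdio.h>

/* One copy per thread stands in for one copy per CPU. */
static _Thread_local unsigned long hrtimer_interrupts;
static _Thread_local unsigned long hrtimer_interrupts_saved;

#define this_cpu_read(var)        (var)
#define this_cpu_write(var, val)  ((var) = (val))
#define this_cpu_inc(var)         ((var)++)

static int is_hardlockup(void)
{
    unsigned long hrint = this_cpu_read(hrtimer_interrupts);

    if (this_cpu_read(hrtimer_interrupts_saved) == hrint)
        return 1;                        /* timer has not fired since last check */

    this_cpu_write(hrtimer_interrupts_saved, hrint);
    return 0;
}

static void *cpu_thread(void *unused)
{
    (void)unused;
    this_cpu_inc(hrtimer_interrupts);    /* pretend the hrtimer fired once */
    printf("first check:  %d\n", is_hardlockup());   /* 0: progress seen */
    printf("second check: %d\n", is_hardlockup());   /* 1: no new interrupts */
    return NULL;
}

int main(void)
{
    pthread_t t;

    pthread_create(&t, NULL, cpu_thread, NULL);
    pthread_join(t, NULL);
    return 0;
}
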
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 90db1bd1a978..11869faa6819 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -661,7 +661,7 @@ void wq_worker_waking_up(struct task_struct *task, unsigned int cpu)
661{ 661{
662 struct worker *worker = kthread_data(task); 662 struct worker *worker = kthread_data(task);
663 663
664 if (likely(!(worker->flags & WORKER_NOT_RUNNING))) 664 if (!(worker->flags & WORKER_NOT_RUNNING))
665 atomic_inc(get_gcwq_nr_running(cpu)); 665 atomic_inc(get_gcwq_nr_running(cpu));
666} 666}
667 667
@@ -687,7 +687,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
687 struct global_cwq *gcwq = get_gcwq(cpu); 687 struct global_cwq *gcwq = get_gcwq(cpu);
688 atomic_t *nr_running = get_gcwq_nr_running(cpu); 688 atomic_t *nr_running = get_gcwq_nr_running(cpu);
689 689
690 if (unlikely(worker->flags & WORKER_NOT_RUNNING)) 690 if (worker->flags & WORKER_NOT_RUNNING)
691 return NULL; 691 return NULL;
692 692
693 /* this can only happen on the local cpu */ 693 /* this can only happen on the local cpu */
@@ -768,7 +768,11 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
768 768
769 worker->flags &= ~flags; 769 worker->flags &= ~flags;
770 770
771 /* if transitioning out of NOT_RUNNING, increment nr_running */ 771 /*
772 * If transitioning out of NOT_RUNNING, increment nr_running. Note
773 * that the nested NOT_RUNNING is not a noop. NOT_RUNNING is mask
774 * of multiple flags, not a single flag.
775 */
772 if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) 776 if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
773 if (!(worker->flags & WORKER_NOT_RUNNING)) 777 if (!(worker->flags & WORKER_NOT_RUNNING))
774 atomic_inc(get_gcwq_nr_running(gcwq->cpu)); 778 atomic_inc(get_gcwq_nr_running(gcwq->cpu));
@@ -932,6 +936,38 @@ static void insert_work(struct cpu_workqueue_struct *cwq,
932 wake_up_worker(gcwq); 936 wake_up_worker(gcwq);
933} 937}
934 938
939/*
940 * Test whether @work is being queued from another work executing on the
941 * same workqueue. This is rather expensive and should only be used from
942 * cold paths.
943 */
944static bool is_chained_work(struct workqueue_struct *wq)
945{
946 unsigned long flags;
947 unsigned int cpu;
948
949 for_each_gcwq_cpu(cpu) {
950 struct global_cwq *gcwq = get_gcwq(cpu);
951 struct worker *worker;
952 struct hlist_node *pos;
953 int i;
954
955 spin_lock_irqsave(&gcwq->lock, flags);
956 for_each_busy_worker(worker, i, pos, gcwq) {
957 if (worker->task != current)
958 continue;
959 spin_unlock_irqrestore(&gcwq->lock, flags);
960 /*
961 * I'm @worker, no locking necessary. See if @work
962 * is headed to the same workqueue.
963 */
964 return worker->current_cwq->wq == wq;
965 }
966 spin_unlock_irqrestore(&gcwq->lock, flags);
967 }
968 return false;
969}
970
935static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, 971static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
936 struct work_struct *work) 972 struct work_struct *work)
937{ 973{
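
is_chained_work() walks the busy workers looking for one whose task is current and, if the caller is such a worker, reports whether it is executing an item of the same workqueue; that is what lets a dying workqueue accept work only from work already running on it. A reduced userspace model with one worker thread and a global busy-worker table (all structure and function names here are mine):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct workqueue { const char *name; };

struct worker {
    pthread_t         task;
    struct workqueue *current_wq;       /* what this worker is running now */
    bool              busy;
};

static pthread_mutex_t gcwq_lock = PTHREAD_MUTEX_INITIALIZER;
static struct worker workers[4];        /* stand-in for the busy-worker hash */

/* Is the caller itself a worker currently executing an item of @wq? */
static bool is_chained_work(struct workqueue *wq)
{
    bool chained = false;

    pthread_mutex_lock(&gcwq_lock);
    for (int i = 0; i < 4; i++) {
        if (!workers[i].busy || !pthread_equal(workers[i].task, pthread_self()))
            continue;
        chained = (workers[i].current_wq == wq);
        break;
    }
    pthread_mutex_unlock(&gcwq_lock);
    return chained;
}

static struct workqueue wq = { "events" };

static void *worker_fn(void *unused)
{
    (void)unused;
    /* Mark ourselves busy on @wq, as a real worker would before running work. */
    pthread_mutex_lock(&gcwq_lock);
    workers[0] = (struct worker){ pthread_self(), &wq, true };
    pthread_mutex_unlock(&gcwq_lock);

    printf("from a worker of wq: %d\n", is_chained_work(&wq));       /* 1 */
    return NULL;
}

int main(void)
{
    pthread_t t;

    printf("from outside any worker: %d\n", is_chained_work(&wq));   /* 0 */
    pthread_create(&t, NULL, worker_fn, NULL);
    pthread_join(t, NULL);
    return 0;
}
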
@@ -943,7 +979,9 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
943 979
944 debug_work_activate(work); 980 debug_work_activate(work);
945 981
946 if (WARN_ON_ONCE(wq->flags & WQ_DYING)) 982 /* if dying, only works from the same workqueue are allowed */
983 if (unlikely(wq->flags & WQ_DYING) &&
984 WARN_ON_ONCE(!is_chained_work(wq)))
947 return; 985 return;
948 986
949 /* determine gcwq to use */ 987 /* determine gcwq to use */
@@ -1806,7 +1844,7 @@ __acquires(&gcwq->lock)
1806 spin_unlock_irq(&gcwq->lock); 1844 spin_unlock_irq(&gcwq->lock);
1807 1845
1808 work_clear_pending(work); 1846 work_clear_pending(work);
1809 lock_map_acquire(&cwq->wq->lockdep_map); 1847 lock_map_acquire_read(&cwq->wq->lockdep_map);
1810 lock_map_acquire(&lockdep_map); 1848 lock_map_acquire(&lockdep_map);
1811 trace_workqueue_execute_start(work); 1849 trace_workqueue_execute_start(work);
1812 f(work); 1850 f(work);
@@ -2350,8 +2388,18 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
2350 insert_wq_barrier(cwq, barr, work, worker); 2388 insert_wq_barrier(cwq, barr, work, worker);
2351 spin_unlock_irq(&gcwq->lock); 2389 spin_unlock_irq(&gcwq->lock);
2352 2390
2353 lock_map_acquire(&cwq->wq->lockdep_map); 2391 /*
2392 * If @max_active is 1 or rescuer is in use, flushing another work
2393 * item on the same workqueue may lead to deadlock. Make sure the
2394 * flusher is not running on the same workqueue by verifying write
2395 * access.
2396 */
2397 if (cwq->wq->saved_max_active == 1 || cwq->wq->flags & WQ_RESCUER)
2398 lock_map_acquire(&cwq->wq->lockdep_map);
2399 else
2400 lock_map_acquire_read(&cwq->wq->lockdep_map);
2354 lock_map_release(&cwq->wq->lockdep_map); 2401 lock_map_release(&cwq->wq->lockdep_map);
2402
2355 return true; 2403 return true;
2356already_gone: 2404already_gone:
2357 spin_unlock_irq(&gcwq->lock); 2405 spin_unlock_irq(&gcwq->lock);
@@ -2936,11 +2984,35 @@ EXPORT_SYMBOL_GPL(__alloc_workqueue_key);
2936 */ 2984 */
2937void destroy_workqueue(struct workqueue_struct *wq) 2985void destroy_workqueue(struct workqueue_struct *wq)
2938{ 2986{
2987 unsigned int flush_cnt = 0;
2939 unsigned int cpu; 2988 unsigned int cpu;
2940 2989
2990 /*
2991 * Mark @wq dying and drain all pending works. Once WQ_DYING is
2992 * set, only chain queueing is allowed. IOW, only currently
2993 * pending or running work items on @wq can queue further work
2994 * items on it. @wq is flushed repeatedly until it becomes empty.
 2995 * The number of flushes is determined by the depth of chaining and
2996 * should be relatively short. Whine if it takes too long.
2997 */
2941 wq->flags |= WQ_DYING; 2998 wq->flags |= WQ_DYING;
2999reflush:
2942 flush_workqueue(wq); 3000 flush_workqueue(wq);
2943 3001
3002 for_each_cwq_cpu(cpu, wq) {
3003 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3004
3005 if (!cwq->nr_active && list_empty(&cwq->delayed_works))
3006 continue;
3007
3008 if (++flush_cnt == 10 ||
3009 (flush_cnt % 100 == 0 && flush_cnt <= 1000))
3010 printk(KERN_WARNING "workqueue %s: flush on "
3011 "destruction isn't complete after %u tries\n",
3012 wq->name, flush_cnt);
3013 goto reflush;
3014 }
3015
2944 /* 3016 /*
2945 * wq list is used to freeze wq, remove from list after 3017 * wq list is used to freeze wq, remove from list after
2946 * flushing is complete in case freeze races us. 3018 * flushing is complete in case freeze races us.
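
destroy_workqueue() now flushes in a loop: because a dying workqueue still accepts chained work, one flush may leave newly queued items behind, so it re-flushes until every per-CPU queue is empty and warns after suspiciously many passes. A userspace cartoon of that drain loop, where the chain depth is faked by a simple countdown:

#include <stdio.h>

/* Pretend each flush pass can reveal more chained work; model that with a
 * counter of how many extra passes the chain needs. */
static int pending_chain = 3;

static int queue_is_empty(void)
{
    return pending_chain == 0;
}

static void flush_queue(void)
{
    if (pending_chain > 0)
        pending_chain--;        /* one level of chained work drained */
}

static void destroy_queue(const char *name)
{
    unsigned int flush_cnt = 0;

reflush:
    flush_queue();
    if (!queue_is_empty()) {
        if (++flush_cnt == 10 ||
            (flush_cnt % 100 == 0 && flush_cnt <= 1000))
            printf("workqueue %s: flush on destruction isn't complete after %u tries\n",
                   name, flush_cnt);
        goto reflush;
    }
    printf("workqueue %s: drained after %u extra flushes\n", name, flush_cnt);
}

int main(void)
{
    destroy_queue("events");
    return 0;
}
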
@@ -3692,7 +3764,8 @@ static int __init init_workqueues(void)
3692 system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0); 3764 system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0);
3693 system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, 3765 system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
3694 WQ_UNBOUND_MAX_ACTIVE); 3766 WQ_UNBOUND_MAX_ACTIVE);
3695 BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq); 3767 BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq ||
3768 !system_unbound_wq);
3696 return 0; 3769 return 0;
3697} 3770}
3698early_initcall(init_workqueues); 3771early_initcall(init_workqueues);