path: root/kernel
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 5
-rw-r--r--  kernel/audit.c | 2
-rw-r--r--  kernel/cgroup.c | 55
-rw-r--r--  kernel/cpu.c | 29
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 2
-rw-r--r--  kernel/exit.c | 16
-rw-r--r--  kernel/fork.c | 45
-rw-r--r--  kernel/freezer.c | 9
-rw-r--r--  kernel/futex.c | 290
-rw-r--r--  kernel/hrtimer.c | 87
-rw-r--r--  kernel/hw_breakpoint.c | 2
-rw-r--r--  kernel/irq/irqdesc.c | 40
-rw-r--r--  kernel/irq/manage.c | 4
-rw-r--r--  kernel/irq_work.c | 18
-rw-r--r--  kernel/kexec.c | 2
-rw-r--r--  kernel/kprobes.c | 573
-rw-r--r--  kernel/kthread.c | 2
-rw-r--r--  kernel/latencytop.c | 23
-rw-r--r--  kernel/lockdep_proc.c | 16
-rw-r--r--  kernel/module.c | 171
-rw-r--r--  kernel/mutex.c | 2
-rw-r--r--  kernel/panic.c | 1
-rw-r--r--  kernel/perf_event.c | 657
-rw-r--r--  kernel/posix-timers.c | 10
-rw-r--r--  kernel/power/Kconfig | 5
-rw-r--r--  kernel/power/Makefile | 6
-rw-r--r--  kernel/power/hibernate.c | 11
-rw-r--r--  kernel/power/nvs.c | 136
-rw-r--r--  kernel/power/process.c | 8
-rw-r--r--  kernel/power/suspend.c | 9
-rw-r--r--  kernel/power/swap.c | 7
-rw-r--r--  kernel/printk.c | 60
-rw-r--r--  kernel/rcutiny.c | 105
-rw-r--r--  kernel/rcutiny_plugin.h | 433
-rw-r--r--  kernel/rcutorture.c | 270
-rw-r--r--  kernel/rcutree.c | 160
-rw-r--r--  kernel/rcutree.h | 61
-rw-r--r--  kernel/rcutree_plugin.h | 135
-rw-r--r--  kernel/rcutree_trace.c | 12
-rw-r--r--  kernel/sched.c | 683
-rw-r--r--  kernel/sched_autogroup.c | 238
-rw-r--r--  kernel/sched_autogroup.h | 32
-rw-r--r--  kernel/sched_clock.c | 2
-rw-r--r--  kernel/sched_debug.c | 91
-rw-r--r--  kernel/sched_fair.c | 322
-rw-r--r--  kernel/sched_features.h | 2
-rw-r--r--  kernel/sched_rt.c | 24
-rw-r--r--  kernel/smp.c | 19
-rw-r--r--  kernel/softirq.c | 65
-rw-r--r--  kernel/srcu.c | 8
-rw-r--r--  kernel/sys.c | 10
-rw-r--r--  kernel/sysctl.c | 84
-rw-r--r--  kernel/sysctl_binary.c | 3
-rw-r--r--  kernel/taskstats.c | 7
-rw-r--r--  kernel/time.c | 4
-rw-r--r--  kernel/time/clocksource.c | 3
-rw-r--r--  kernel/time/ntp.c | 425
-rw-r--r--  kernel/time/tick-common.c | 2
-rw-r--r--  kernel/time/tick-oneshot.c | 4
-rw-r--r--  kernel/time/timecompare.c | 5
-rw-r--r--  kernel/time/timekeeping.c | 52
-rw-r--r--  kernel/time/timer_list.c | 8
-rw-r--r--  kernel/timer.c | 50
-rw-r--r--  kernel/trace/Kconfig | 15
-rw-r--r--  kernel/trace/Makefile | 2
-rw-r--r--  kernel/trace/blktrace.c | 37
-rw-r--r--  kernel/trace/power-traces.c | 5
-rw-r--r--  kernel/trace/trace.c | 6
-rw-r--r--  kernel/trace/trace_entries.h | 2
-rw-r--r--  kernel/trace/trace_event_perf.c | 31
-rw-r--r--  kernel/trace/trace_events.c | 6
-rw-r--r--  kernel/trace/trace_export.c | 14
-rw-r--r--  kernel/trace/trace_selftest.c | 2
-rw-r--r--  kernel/user_namespace.c | 15
-rw-r--r--  kernel/watchdog.c | 47
-rw-r--r--  kernel/workqueue.c | 60
76 files changed, 3957 insertions, 1907 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 0b5ff083fa22..353d3fe8ba33 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -43,7 +43,7 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
 obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
 obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
-obj-$(CONFIG_USE_GENERIC_SMP_HELPERS) += smp.o
+obj-$(CONFIG_SMP) += smp.o
 ifneq ($(CONFIG_SMP),y)
 obj-y += up.o
 endif
@@ -100,6 +100,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace/
 obj-$(CONFIG_TRACING) += trace/
 obj-$(CONFIG_X86_DS) += trace/
 obj-$(CONFIG_RING_BUFFER) += trace/
+obj-$(CONFIG_TRACEPOINTS) += trace/
 obj-$(CONFIG_SMP) += sched_cpupri.o
 obj-$(CONFIG_IRQ_WORK) += irq_work.o
 obj-$(CONFIG_PERF_EVENTS) += perf_event.o
@@ -121,7 +122,7 @@ $(obj)/configs.o: $(obj)/config_data.h
 # config_data.h contains the same information as ikconfig.h but gzipped.
 # Info from config_data can be extracted from /proc/config*
 targets += config_data.gz
-$(obj)/config_data.gz: .config FORCE
+$(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
 	$(call if_changed,gzip)
 
 quiet_cmd_ikconfiggz = IKCFG $@
diff --git a/kernel/audit.c b/kernel/audit.c
index 77770a034d59..e4956244ae50 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -400,7 +400,7 @@ static void kauditd_send_skb(struct sk_buff *skb)
 	if (err < 0) {
 		BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */
 		printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid);
-		audit_log_lost("auditd dissapeared\n");
+		audit_log_lost("auditd disappeared\n");
 		audit_pid = 0;
 		/* we might get lucky and get this in the next auditd */
 		audit_hold_skb(skb);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 66a416b42c18..b24d7027b83c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -764,6 +764,7 @@ EXPORT_SYMBOL_GPL(cgroup_unlock);
  */
 
 static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode);
+static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *);
 static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
 static int cgroup_populate_dir(struct cgroup *cgrp);
 static const struct inode_operations cgroup_dir_inode_operations;
@@ -860,6 +861,11 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
 	iput(inode);
 }
 
+static int cgroup_delete(const struct dentry *d)
+{
+	return 1;
+}
+
 static void remove_dir(struct dentry *d)
 {
 	struct dentry *parent = dget(d->d_parent);
@@ -874,25 +880,29 @@ static void cgroup_clear_directory(struct dentry *dentry)
 	struct list_head *node;
 
 	BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
-	spin_lock(&dcache_lock);
+	spin_lock(&dentry->d_lock);
 	node = dentry->d_subdirs.next;
 	while (node != &dentry->d_subdirs) {
 		struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
+
+		spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
 		list_del_init(node);
 		if (d->d_inode) {
 			/* This should never be called on a cgroup
 			 * directory with child cgroups */
 			BUG_ON(d->d_inode->i_mode & S_IFDIR);
-			d = dget_locked(d);
-			spin_unlock(&dcache_lock);
+			dget_dlock(d);
+			spin_unlock(&d->d_lock);
+			spin_unlock(&dentry->d_lock);
 			d_delete(d);
 			simple_unlink(dentry->d_inode, d);
 			dput(d);
-			spin_lock(&dcache_lock);
-		}
+			spin_lock(&dentry->d_lock);
+		} else
+			spin_unlock(&d->d_lock);
 		node = dentry->d_subdirs.next;
 	}
-	spin_unlock(&dcache_lock);
+	spin_unlock(&dentry->d_lock);
 }
 
 /*
@@ -900,11 +910,16 @@ static void cgroup_clear_directory(struct dentry *dentry)
  */
 static void cgroup_d_remove_dir(struct dentry *dentry)
 {
+	struct dentry *parent;
+
 	cgroup_clear_directory(dentry);
 
-	spin_lock(&dcache_lock);
+	parent = dentry->d_parent;
+	spin_lock(&parent->d_lock);
+	spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
 	list_del_init(&dentry->d_u.d_child);
-	spin_unlock(&dcache_lock);
+	spin_unlock(&dentry->d_lock);
+	spin_unlock(&parent->d_lock);
 	remove_dir(dentry);
 }
 
@@ -1440,6 +1455,11 @@ static int cgroup_set_super(struct super_block *sb, void *data)
 
 static int cgroup_get_rootdir(struct super_block *sb)
 {
+	static const struct dentry_operations cgroup_dops = {
+		.d_iput = cgroup_diput,
+		.d_delete = cgroup_delete,
+	};
+
 	struct inode *inode =
 		cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb);
 	struct dentry *dentry;
@@ -1457,6 +1477,8 @@ static int cgroup_get_rootdir(struct super_block *sb)
 		return -ENOMEM;
 	}
 	sb->s_root = dentry;
+	/* for everything else we want ->d_op set */
+	sb->s_d_op = &cgroup_dops;
 	return 0;
 }
 
@@ -2180,12 +2202,20 @@ static const struct file_operations cgroup_file_operations = {
 };
 
 static const struct inode_operations cgroup_dir_inode_operations = {
-	.lookup = simple_lookup,
+	.lookup = cgroup_lookup,
 	.mkdir = cgroup_mkdir,
 	.rmdir = cgroup_rmdir,
 	.rename = cgroup_rename,
 };
 
+static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
+{
+	if (dentry->d_name.len > NAME_MAX)
+		return ERR_PTR(-ENAMETOOLONG);
+	d_add(dentry, NULL);
+	return NULL;
+}
+
 /*
  * Check if a file is a control file
 */
@@ -2199,10 +2229,6 @@ static inline struct cftype *__file_cft(struct file *file)
 static int cgroup_create_file(struct dentry *dentry, mode_t mode,
 				struct super_block *sb)
 {
-	static const struct dentry_operations cgroup_dops = {
-		.d_iput = cgroup_diput,
-	};
-
 	struct inode *inode;
 
 	if (!dentry)
@@ -2228,7 +2254,6 @@ static int cgroup_create_file(struct dentry *dentry, mode_t mode,
 		inode->i_size = 0;
 		inode->i_fop = &cgroup_file_operations;
 	}
-	dentry->d_op = &cgroup_dops;
 	d_instantiate(dentry, inode);
 	dget(dentry);	/* Extra count - pin the dentry in core */
 	return 0;
@@ -3638,9 +3663,7 @@ again:
 	list_del(&cgrp->sibling);
 	cgroup_unlock_hierarchy(cgrp->root);
 
-	spin_lock(&cgrp->dentry->d_lock);
 	d = dget(cgrp->dentry);
-	spin_unlock(&d->d_lock);
 
 	cgroup_d_remove_dir(d);
 	dput(d);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index f6e726f18491..156cc5556140 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -189,7 +189,6 @@ static inline void check_for_tasks(int cpu)
 }
 
 struct take_cpu_down_param {
-	struct task_struct *caller;
 	unsigned long mod;
 	void *hcpu;
 };
@@ -198,7 +197,6 @@ struct take_cpu_down_param {
 static int __ref take_cpu_down(void *_param)
 {
 	struct take_cpu_down_param *param = _param;
-	unsigned int cpu = (unsigned long)param->hcpu;
 	int err;
 
 	/* Ensure this CPU doesn't handle any more interrupts. */
@@ -208,11 +206,6 @@ static int __ref take_cpu_down(void *_param)
 
 	cpu_notify(CPU_DYING | param->mod, param->hcpu);
 
-	if (task_cpu(param->caller) == cpu)
-		move_task_off_dead_cpu(cpu, param->caller);
-	/* Force idle task to run as soon as we yield: it should
-	   immediately notice cpu is offline and die quickly. */
-	sched_idle_next();
 	return 0;
 }
 
@@ -223,7 +216,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 	void *hcpu = (void *)(long)cpu;
 	unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
 	struct take_cpu_down_param tcd_param = {
-		.caller = current,
 		.mod = mod,
 		.hcpu = hcpu,
 	};
@@ -253,9 +245,15 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 	}
 	BUG_ON(cpu_online(cpu));
 
-	/* Wait for it to sleep (leaving idle task). */
+	/*
+	 * The migration_call() CPU_DYING callback will have removed all
+	 * runnable tasks from the cpu, there's only the idle task left now
+	 * that the migration thread is done doing the stop_machine thing.
+	 *
+	 * Wait for the stop thread to go away.
+	 */
 	while (!idle_cpu(cpu))
-		yield();
+		cpu_relax();
 
 	/* This actually kills the CPU. */
 	__cpu_die(cpu);
@@ -386,6 +384,14 @@ out:
 #ifdef CONFIG_PM_SLEEP_SMP
 static cpumask_var_t frozen_cpus;
 
+void __weak arch_disable_nonboot_cpus_begin(void)
+{
+}
+
+void __weak arch_disable_nonboot_cpus_end(void)
+{
+}
+
 int disable_nonboot_cpus(void)
 {
 	int cpu, first_cpu, error = 0;
@@ -397,6 +403,7 @@ int disable_nonboot_cpus(void)
 	 * with the userspace trying to use the CPU hotplug at the same time
 	 */
 	cpumask_clear(frozen_cpus);
+	arch_disable_nonboot_cpus_begin();
 
 	printk("Disabling non-boot CPUs ...\n");
 	for_each_online_cpu(cpu) {
@@ -412,6 +419,8 @@ int disable_nonboot_cpus(void)
 		}
 	}
 
+	arch_disable_nonboot_cpus_end();
+
 	if (!error) {
 		BUG_ON(num_online_cpus() > 1);
 		/* Make sure the CPUs won't be enabled by someone else */
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index a6e729766821..bd3e8e29caa3 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -2914,7 +2914,7 @@ static void __init kdb_cmd_init(void)
 	}
 }
 
-/* Intialize kdb_printf, breakpoint tables and kdb state */
+/* Initialize kdb_printf, breakpoint tables and kdb state */
 void __init kdb_init(int lvl)
 {
 	static int kdb_init_lvl = KDB_NOT_INITIALIZED;
diff --git a/kernel/exit.c b/kernel/exit.c
index 676149a4ac5f..f9a45ebcc7b1 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -69,7 +69,7 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
 
 		list_del_rcu(&p->tasks);
 		list_del_init(&p->sibling);
-		__get_cpu_var(process_counts)--;
+		__this_cpu_dec(process_counts);
 	}
 	list_del_rcu(&p->thread_group);
 }
@@ -994,6 +994,15 @@ NORET_TYPE void do_exit(long code)
 	exit_fs(tsk);
 	check_stack_usage();
 	exit_thread();
+
+	/*
+	 * Flush inherited counters to the parent - before the parent
+	 * gets woken up by child-exit notifications.
+	 *
+	 * because of cgroup mode, must be called before cgroup_exit()
+	 */
+	perf_event_exit_task(tsk);
+
 	cgroup_exit(tsk, 1);
 
 	if (group_dead)
@@ -1007,11 +1016,6 @@ NORET_TYPE void do_exit(long code)
 	 * FIXME: do that only when needed, using sched_exit tracepoint
 	 */
 	flush_ptrace_hw_breakpoint(tsk);
-	/*
-	 * Flush inherited counters to the parent - before the parent
-	 * gets woken up by child-exit notifications.
-	 */
-	perf_event_exit_task(tsk);
 
 	exit_notify(tsk, group_dead);
 #ifdef CONFIG_NUMA
diff --git a/kernel/fork.c b/kernel/fork.c
index 5447dc7defa9..25e429152ddc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -66,6 +66,7 @@
 #include <linux/posix-timers.h>
 #include <linux/user-return-notifier.h>
 #include <linux/oom.h>
+#include <linux/khugepaged.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -169,6 +170,7 @@ EXPORT_SYMBOL(free_task);
 static inline void free_signal_struct(struct signal_struct *sig)
 {
 	taskstats_tgid_free(sig);
+	sched_autogroup_exit(sig);
 	kmem_cache_free(signal_cachep, sig);
 }
 
@@ -329,6 +331,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 	retval = ksm_fork(mm, oldmm);
 	if (retval)
 		goto out;
+	retval = khugepaged_fork(mm, oldmm);
+	if (retval)
+		goto out;
 
 	prev = NULL;
 	for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
@@ -528,6 +533,9 @@ void __mmdrop(struct mm_struct *mm)
 	mm_free_pgd(mm);
 	destroy_context(mm);
 	mmu_notifier_mm_destroy(mm);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	VM_BUG_ON(mm->pmd_huge_pte);
+#endif
 	free_mm(mm);
 }
 EXPORT_SYMBOL_GPL(__mmdrop);
@@ -542,6 +550,7 @@ void mmput(struct mm_struct *mm)
 	if (atomic_dec_and_test(&mm->mm_users)) {
 		exit_aio(mm);
 		ksm_exit(mm);
+		khugepaged_exit(mm); /* must run before exit_mmap */
 		exit_mmap(mm);
 		set_mm_exe_file(mm, NULL);
 		if (!list_empty(&mm->mmlist)) {
@@ -668,6 +677,10 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
 	mm->token_priority = 0;
 	mm->last_interval = 0;
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	mm->pmd_huge_pte = NULL;
+#endif
+
 	if (!mm_init(mm, tsk))
 		goto fail_nomem;
 
@@ -905,9 +918,11 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	posix_cpu_timers_init_group(sig);
 
 	tty_audit_fork(sig);
+	sched_autogroup_fork(sig);
 
 	sig->oom_adj = current->signal->oom_adj;
 	sig->oom_score_adj = current->signal->oom_score_adj;
+	sig->oom_score_adj_min = current->signal->oom_score_adj_min;
 
 	mutex_init(&sig->cred_guard_mutex);
 
@@ -1283,7 +1298,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 			attach_pid(p, PIDTYPE_SID, task_session(current));
 			list_add_tail(&p->sibling, &p->real_parent->children);
 			list_add_tail_rcu(&p->tasks, &init_task.tasks);
-			__get_cpu_var(process_counts)++;
+			__this_cpu_inc(process_counts);
 		}
 		attach_pid(p, PIDTYPE_PID, pid);
 		nr_threads++;
@@ -1408,23 +1423,6 @@ long do_fork(unsigned long clone_flags,
 	}
 
 	/*
-	 * We hope to recycle these flags after 2.6.26
-	 */
-	if (unlikely(clone_flags & CLONE_STOPPED)) {
-		static int __read_mostly count = 100;
-
-		if (count > 0 && printk_ratelimit()) {
-			char comm[TASK_COMM_LEN];
-
-			count--;
-			printk(KERN_INFO "fork(): process `%s' used deprecated "
-				"clone flags 0x%lx\n",
-				get_task_comm(comm, current),
-				clone_flags & CLONE_STOPPED);
-		}
-	}
-
-	/*
 	 * When called from kernel_thread, don't do user tracing stuff.
 	 */
 	if (likely(user_mode(regs)))
@@ -1462,16 +1460,7 @@ long do_fork(unsigned long clone_flags,
 	 */
 	p->flags &= ~PF_STARTING;
 
-	if (unlikely(clone_flags & CLONE_STOPPED)) {
-		/*
-		 * We'll start up with an immediate SIGSTOP.
-		 */
-		sigaddset(&p->pending.signal, SIGSTOP);
-		set_tsk_thread_flag(p, TIF_SIGPENDING);
-		__set_task_state(p, TASK_STOPPED);
-	} else {
-		wake_up_new_task(p, clone_flags);
-	}
+	wake_up_new_task(p, clone_flags);
 
 	tracehook_report_clone_complete(trace, regs,
 					clone_flags, nr, p);
diff --git a/kernel/freezer.c b/kernel/freezer.c
index bd1d42b17cb2..66ecd2ead215 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -104,8 +104,13 @@ bool freeze_task(struct task_struct *p, bool sig_only)
 	}
 
 	if (should_send_signal(p)) {
-		if (!signal_pending(p))
-			fake_signal_wake_up(p);
+		fake_signal_wake_up(p);
+		/*
+		 * fake_signal_wake_up() goes through p's scheduler
+		 * lock and guarantees that TASK_STOPPED/TRACED ->
+		 * TASK_RUNNING transition can't race with task state
+		 * testing in try_to_freeze_tasks().
+		 */
 	} else if (sig_only) {
 		return false;
 	} else {
diff --git a/kernel/futex.c b/kernel/futex.c
index 40a8777a27d0..52075633373f 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -69,6 +69,14 @@ int __read_mostly futex_cmpxchg_enabled;
 #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
 
 /*
+ * Futex flags used to encode options to functions and preserve them across
+ * restarts.
+ */
+#define FLAGS_SHARED		0x01
+#define FLAGS_CLOCKRT		0x02
+#define FLAGS_HAS_TIMEOUT	0x04
+
+/*
  * Priority Inheritance state:
  */
 struct futex_pi_state {
@@ -123,6 +131,12 @@ struct futex_q {
 	u32 bitset;
 };
 
+static const struct futex_q futex_q_init = {
+	/* list gets initialized in queue_me()*/
+	.key = FUTEX_KEY_INIT,
+	.bitset = FUTEX_BITSET_MATCH_ANY
+};
+
 /*
  * Hash buckets are shared by all the futex_keys that hash to the same
  * location. Each key may have multiple futex_q structures, one for each task
@@ -219,7 +233,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
 {
 	unsigned long address = (unsigned long)uaddr;
 	struct mm_struct *mm = current->mm;
-	struct page *page;
+	struct page *page, *page_head;
 	int err;
 
 	/*
@@ -251,11 +265,46 @@ again:
 	if (err < 0)
 		return err;
 
-	page = compound_head(page);
-	lock_page(page);
-	if (!page->mapping) {
-		unlock_page(page);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	page_head = page;
+	if (unlikely(PageTail(page))) {
 		put_page(page);
+		/* serialize against __split_huge_page_splitting() */
+		local_irq_disable();
+		if (likely(__get_user_pages_fast(address, 1, 1, &page) == 1)) {
+			page_head = compound_head(page);
+			/*
+			 * page_head is valid pointer but we must pin
+			 * it before taking the PG_lock and/or
+			 * PG_compound_lock. The moment we re-enable
+			 * irqs __split_huge_page_splitting() can
+			 * return and the head page can be freed from
+			 * under us. We can't take the PG_lock and/or
+			 * PG_compound_lock on a page that could be
+			 * freed from under us.
+			 */
+			if (page != page_head) {
+				get_page(page_head);
+				put_page(page);
+			}
+			local_irq_enable();
+		} else {
+			local_irq_enable();
+			goto again;
+		}
+	}
+#else
+	page_head = compound_head(page);
+	if (page != page_head) {
+		get_page(page_head);
+		put_page(page);
+	}
+#endif
+
+	lock_page(page_head);
+	if (!page_head->mapping) {
+		unlock_page(page_head);
+		put_page(page_head);
 		goto again;
 	}
 
@@ -266,25 +315,24 @@ again:
 	 * it's a read-only handle, it's expected that futexes attach to
 	 * the object not the particular process.
 	 */
-	if (PageAnon(page)) {
+	if (PageAnon(page_head)) {
 		key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
 		key->private.mm = mm;
 		key->private.address = address;
 	} else {
 		key->both.offset |= FUT_OFF_INODE; /* inode-based key */
-		key->shared.inode = page->mapping->host;
-		key->shared.pgoff = page->index;
+		key->shared.inode = page_head->mapping->host;
+		key->shared.pgoff = page_head->index;
 	}
 
 	get_futex_key_refs(key);
 
-	unlock_page(page);
-	put_page(page);
+	unlock_page(page_head);
+	put_page(page_head);
 	return 0;
 }
 
-static inline
-void put_futex_key(int fshared, union futex_key *key)
+static inline void put_futex_key(union futex_key *key)
 {
 	drop_futex_key_refs(key);
 }
@@ -870,7 +918,8 @@ double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
 /*
  * Wake up waiters matching bitset queued on this futex (uaddr).
 */
-static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
+static int
+futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
 {
 	struct futex_hash_bucket *hb;
 	struct futex_q *this, *next;
@@ -881,7 +930,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
 	if (!bitset)
 		return -EINVAL;
 
-	ret = get_futex_key(uaddr, fshared, &key);
+	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key);
 	if (unlikely(ret != 0))
 		goto out;
 
@@ -907,7 +956,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
 	}
 
 	spin_unlock(&hb->lock);
-	put_futex_key(fshared, &key);
+	put_futex_key(&key);
 out:
 	return ret;
 }
@@ -917,7 +966,7 @@ out:
 * to this virtual address:
 */
 static int
-futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
+futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
 	      int nr_wake, int nr_wake2, int op)
 {
 	union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
@@ -927,10 +976,10 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
 	int ret, op_ret;
 
 retry:
-	ret = get_futex_key(uaddr1, fshared, &key1);
+	ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1);
 	if (unlikely(ret != 0))
 		goto out;
-	ret = get_futex_key(uaddr2, fshared, &key2);
+	ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2);
 	if (unlikely(ret != 0))
 		goto out_put_key1;
 
@@ -962,11 +1011,11 @@ retry_private:
 		if (ret)
 			goto out_put_keys;
 
-		if (!fshared)
+		if (!(flags & FLAGS_SHARED))
 			goto retry_private;
 
-		put_futex_key(fshared, &key2);
-		put_futex_key(fshared, &key1);
+		put_futex_key(&key2);
+		put_futex_key(&key1);
 		goto retry;
 	}
 
@@ -996,9 +1045,9 @@ retry_private:
 
 	double_unlock_hb(hb1, hb2);
 out_put_keys:
-	put_futex_key(fshared, &key2);
+	put_futex_key(&key2);
 out_put_key1:
-	put_futex_key(fshared, &key1);
+	put_futex_key(&key1);
 out:
 	return ret;
 }
@@ -1133,13 +1182,13 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
 /**
 * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
 * @uaddr1:	source futex user address
- * @fshared:	0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
+ * @flags:	futex flags (FLAGS_SHARED, etc.)
 * @uaddr2:	target futex user address
 * @nr_wake:	number of waiters to wake (must be 1 for requeue_pi)
 * @nr_requeue:	number of waiters to requeue (0-INT_MAX)
 * @cmpval:	@uaddr1 expected value (or %NULL)
 * @requeue_pi:	if we are attempting to requeue from a non-pi futex to a
 *		pi futex (pi to pi requeue is not supported)
 *
 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
 * uaddr2 atomically on behalf of the top waiter.
@@ -1148,9 +1197,9 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
 * >=0 - on success, the number of tasks requeued or woken
 * <0 - on error
 */
-static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
-			 int nr_wake, int nr_requeue, u32 *cmpval,
-			 int requeue_pi)
+static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
+			 u32 __user *uaddr2, int nr_wake, int nr_requeue,
+			 u32 *cmpval, int requeue_pi)
 {
 	union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
 	int drop_count = 0, task_count = 0, ret;
@@ -1191,10 +1240,10 @@ retry:
 		pi_state = NULL;
 	}
 
-	ret = get_futex_key(uaddr1, fshared, &key1);
+	ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1);
 	if (unlikely(ret != 0))
 		goto out;
-	ret = get_futex_key(uaddr2, fshared, &key2);
+	ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2);
 	if (unlikely(ret != 0))
 		goto out_put_key1;
 
@@ -1216,11 +1265,11 @@ retry_private:
 		if (ret)
 			goto out_put_keys;
 
-		if (!fshared)
+		if (!(flags & FLAGS_SHARED))
 			goto retry_private;
 
-		put_futex_key(fshared, &key2);
-		put_futex_key(fshared, &key1);
+		put_futex_key(&key2);
+		put_futex_key(&key1);
 		goto retry;
 	}
 	if (curval != *cmpval) {
@@ -1260,8 +1309,8 @@ retry_private:
 			break;
 		case -EFAULT:
 			double_unlock_hb(hb1, hb2);
-			put_futex_key(fshared, &key2);
-			put_futex_key(fshared, &key1);
+			put_futex_key(&key2);
+			put_futex_key(&key1);
 			ret = fault_in_user_writeable(uaddr2);
 			if (!ret)
 				goto retry;
@@ -1269,8 +1318,8 @@ retry_private:
 		case -EAGAIN:
 			/* The owner was exiting, try again. */
 			double_unlock_hb(hb1, hb2);
-			put_futex_key(fshared, &key2);
-			put_futex_key(fshared, &key1);
+			put_futex_key(&key2);
+			put_futex_key(&key1);
 			cond_resched();
 			goto retry;
 		default:
@@ -1352,9 +1401,9 @@ out_unlock:
 	drop_futex_key_refs(&key1);
 
 out_put_keys:
-	put_futex_key(fshared, &key2);
+	put_futex_key(&key2);
 out_put_key1:
-	put_futex_key(fshared, &key1);
+	put_futex_key(&key1);
 out:
 	if (pi_state != NULL)
 		free_pi_state(pi_state);
@@ -1494,7 +1543,7 @@ static void unqueue_me_pi(struct futex_q *q)
 * private futexes.
 */
 static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
-				struct task_struct *newowner, int fshared)
+				struct task_struct *newowner)
 {
 	u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
 	struct futex_pi_state *pi_state = q->pi_state;
@@ -1587,20 +1636,11 @@ handle_fault:
 	goto retry;
 }
 
-/*
- * In case we must use restart_block to restart a futex_wait,
- * we encode in the 'flags' shared capability
- */
-#define FLAGS_SHARED		0x01
-#define FLAGS_CLOCKRT		0x02
-#define FLAGS_HAS_TIMEOUT	0x04
-
 static long futex_wait_restart(struct restart_block *restart);
 
 /**
 * fixup_owner() - Post lock pi_state and corner case management
 * @uaddr:	user address of the futex
- * @fshared:	whether the futex is shared (1) or not (0)
 * @q:		futex_q (contains pi_state and access to the rt_mutex)
 * @locked:	if the attempt to take the rt_mutex succeeded (1) or not (0)
 *
@@ -1613,8 +1653,7 @@ static long futex_wait_restart(struct restart_block *restart);
 * 0 - success, lock not taken
 * <0 - on error (-EFAULT)
 */
-static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q,
-		       int locked)
+static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
 {
 	struct task_struct *owner;
 	int ret = 0;
@@ -1625,7 +1664,7 @@ static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q,
 	 * did a lock-steal - fix up the PI-state in that case:
 	 */
 	if (q->pi_state->owner != current)
-		ret = fixup_pi_state_owner(uaddr, q, current, fshared);
+		ret = fixup_pi_state_owner(uaddr, q, current);
 		goto out;
 	}
 
@@ -1652,7 +1691,7 @@ static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q,
 		 * lock. Fix the state up.
 		 */
 		owner = rt_mutex_owner(&q->pi_state->pi_mutex);
-		ret = fixup_pi_state_owner(uaddr, q, owner, fshared);
+		ret = fixup_pi_state_owner(uaddr, q, owner);
 		goto out;
 	}
 
@@ -1715,7 +1754,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
 * futex_wait_setup() - Prepare to wait on a futex
 * @uaddr:	the futex userspace address
 * @val:	the expected value
- * @fshared:	whether the futex is shared (1) or not (0)
+ * @flags:	futex flags (FLAGS_SHARED, etc.)
 * @q:		the associated futex_q
 * @hb:		storage for hash_bucket pointer to be returned to caller
 *
@@ -1728,7 +1767,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
 * 0 - uaddr contains val and hb has been locked
 * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlcoked
 */
-static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared,
+static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
 			    struct futex_q *q, struct futex_hash_bucket **hb)
 {
 	u32 uval;
@@ -1752,8 +1791,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared,
 	 * rare, but normal.
 	 */
retry:
-	q->key = FUTEX_KEY_INIT;
-	ret = get_futex_key(uaddr, fshared, &q->key);
+	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key);
 	if (unlikely(ret != 0))
 		return ret;
 
@@ -1769,10 +1807,10 @@ retry_private:
 	if (ret)
 		goto out;
 
-	if (!fshared)
+	if (!(flags & FLAGS_SHARED))
 		goto retry_private;
 
-	put_futex_key(fshared, &q->key);
+	put_futex_key(&q->key);
 	goto retry;
 	}
 
@@ -1783,32 +1821,29 @@ retry_private:
 
 out:
 	if (ret)
-		put_futex_key(fshared, &q->key);
+		put_futex_key(&q->key);
 	return ret;
 }
 
-static int futex_wait(u32 __user *uaddr, int fshared,
-		      u32 val, ktime_t *abs_time, u32 bitset, int clockrt)
+static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
+		      ktime_t *abs_time, u32 bitset)
 {
 	struct hrtimer_sleeper timeout, *to = NULL;
 	struct restart_block *restart;
 	struct futex_hash_bucket *hb;
-	struct futex_q q;
+	struct futex_q q = futex_q_init;
 	int ret;
 
 	if (!bitset)
 		return -EINVAL;
-
-	q.pi_state = NULL;
 	q.bitset = bitset;
-	q.rt_waiter = NULL;
-	q.requeue_pi_key = NULL;
 
 	if (abs_time) {
 		to = &timeout;
 
-		hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME :
-				      CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+		hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
+				      CLOCK_REALTIME : CLOCK_MONOTONIC,
+				      HRTIMER_MODE_ABS);
 		hrtimer_init_sleeper(to, current);
 		hrtimer_set_expires_range_ns(&to->timer, *abs_time,
 					     current->timer_slack_ns);
@@ -1819,7 +1854,7 @@ retry:
 	 * Prepare to wait on uaddr. On success, holds hb lock and increments
 	 * q.key refs.
 	 */
-	ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
+	ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
 	if (ret)
 		goto out;
 
@@ -1852,12 +1887,7 @@ retry:
 	restart->futex.val = val;
 	restart->futex.time = abs_time->tv64;
 	restart->futex.bitset = bitset;
-	restart->futex.flags = FLAGS_HAS_TIMEOUT;
-
-	if (fshared)
-		restart->futex.flags |= FLAGS_SHARED;
-	if (clockrt)
-		restart->futex.flags |= FLAGS_CLOCKRT;
+	restart->futex.flags = flags;
 
 	ret = -ERESTART_RESTARTBLOCK;
 
@@ -1873,7 +1903,6 @@ out:
 static long futex_wait_restart(struct restart_block *restart)
 {
 	u32 __user *uaddr = restart->futex.uaddr;
-	int fshared = 0;
 	ktime_t t, *tp = NULL;
 
 	if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
@@ -1881,11 +1910,9 @@ static long futex_wait_restart(struct restart_block *restart)
 		tp = &t;
 	}
 	restart->fn = do_no_restart_syscall;
-	if (restart->futex.flags & FLAGS_SHARED)
-		fshared = 1;
-	return (long)futex_wait(uaddr, fshared, restart->futex.val, tp,
-				restart->futex.bitset,
-				restart->futex.flags & FLAGS_CLOCKRT);
+
+	return (long)futex_wait(uaddr, restart->futex.flags,
+				restart->futex.val, tp, restart->futex.bitset);
 }
 
 
@@ -1895,12 +1922,12 @@ static long futex_wait_restart(struct restart_block *restart)
 * if there are waiters then it will block, it does PI, etc. (Due to
 * races the kernel might see a 0 value of the futex too.)
 */
-static int futex_lock_pi(u32 __user *uaddr, int fshared,
-			 int detect, ktime_t *time, int trylock)
+static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect,
+			 ktime_t *time, int trylock)
 {
 	struct hrtimer_sleeper timeout, *to = NULL;
 	struct futex_hash_bucket *hb;
-	struct futex_q q;
+	struct futex_q q = futex_q_init;
 	int res, ret;
 
 	if (refill_pi_state_cache())
@@ -1914,12 +1941,8 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
 		hrtimer_set_expires(&to->timer, *time);
 	}
 
-	q.pi_state = NULL;
-	q.rt_waiter = NULL;
-	q.requeue_pi_key = NULL;
retry:
-	q.key = FUTEX_KEY_INIT;
-	ret = get_futex_key(uaddr, fshared, &q.key);
+	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key);
 	if (unlikely(ret != 0))
 		goto out;
 
@@ -1941,7 +1964,7 @@ retry_private:
 			 * exit to complete.
 			 */
 			queue_unlock(&q, hb);
-			put_futex_key(fshared, &q.key);
+			put_futex_key(&q.key);
 			cond_resched();
 			goto retry;
 		default:
@@ -1971,7 +1994,7 @@ retry_private:
 	 * Fixup the pi_state owner and possibly acquire the lock if we
 	 * haven't already.
 	 */
-	res = fixup_owner(uaddr, fshared, &q, !ret);
+	res = fixup_owner(uaddr, &q, !ret);
 	/*
 	 * If fixup_owner() returned an error, proprogate that. If it acquired
 	 * the lock, clear our -ETIMEDOUT or -EINTR.
@@ -1995,7 +2018,7 @@ out_unlock_put_key:
 	queue_unlock(&q, hb);
 
 out_put_key:
-	put_futex_key(fshared, &q.key);
+	put_futex_key(&q.key);
 out:
 	if (to)
 		destroy_hrtimer_on_stack(&to->timer);
@@ -2008,10 +2031,10 @@ uaddr_faulted:
 	if (ret)
 		goto out_put_key;
 
-	if (!fshared)
+	if (!(flags & FLAGS_SHARED))
 		goto retry_private;
 
-	put_futex_key(fshared, &q.key);
+	put_futex_key(&q.key);
 	goto retry;
 }
 
@@ -2020,7 +2043,7 @@ uaddr_faulted:
 * This is the in-kernel slowpath: we look up the PI state (if any),
 * and do the rt-mutex unlock.
 */
-static int futex_unlock_pi(u32 __user *uaddr, int fshared)
+static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
 {
 	struct futex_hash_bucket *hb;
 	struct futex_q *this, *next;
@@ -2038,7 +2061,7 @@ retry:
 	if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
 		return -EPERM;
 
-	ret = get_futex_key(uaddr, fshared, &key);
+	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key);
 	if (unlikely(ret != 0))
 		goto out;
 
@@ -2093,14 +2116,14 @@ retry:
 
 out_unlock:
 	spin_unlock(&hb->lock);
-	put_futex_key(fshared, &key);
+	put_futex_key(&key);
 
 out:
 	return ret;
 
pi_faulted:
 	spin_unlock(&hb->lock);
-	put_futex_key(fshared, &key);
+	put_futex_key(&key);
 
 	ret = fault_in_user_writeable(uaddr);
 	if (!ret)
@@ -2160,7 +2183,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2160/** 2183/**
2161 * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 2184 * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
2162 * @uaddr: the futex we initially wait on (non-pi) 2185 * @uaddr: the futex we initially wait on (non-pi)
2163 * @fshared: whether the futexes are shared (1) or not (0). They must be 2186 * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be
2164 * the same type, no requeueing from private to shared, etc. 2187 * the same type, no requeueing from private to shared, etc.
2165 * @val: the expected value of uaddr 2188 * @val: the expected value of uaddr
2166 * @abs_time: absolute timeout 2189 * @abs_time: absolute timeout
@@ -2198,16 +2221,16 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2198 * 0 - On success 2221 * 0 - On success
2199 * <0 - On error 2222 * <0 - On error
2200 */ 2223 */
2201static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, 2224static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2202 u32 val, ktime_t *abs_time, u32 bitset, 2225 u32 val, ktime_t *abs_time, u32 bitset,
2203 int clockrt, u32 __user *uaddr2) 2226 u32 __user *uaddr2)
2204{ 2227{
2205 struct hrtimer_sleeper timeout, *to = NULL; 2228 struct hrtimer_sleeper timeout, *to = NULL;
2206 struct rt_mutex_waiter rt_waiter; 2229 struct rt_mutex_waiter rt_waiter;
2207 struct rt_mutex *pi_mutex = NULL; 2230 struct rt_mutex *pi_mutex = NULL;
2208 struct futex_hash_bucket *hb; 2231 struct futex_hash_bucket *hb;
2209 union futex_key key2; 2232 union futex_key key2 = FUTEX_KEY_INIT;
2210 struct futex_q q; 2233 struct futex_q q = futex_q_init;
2211 int res, ret; 2234 int res, ret;
2212 2235
2213 if (!bitset) 2236 if (!bitset)
@@ -2215,8 +2238,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2215 2238
2216 if (abs_time) { 2239 if (abs_time) {
2217 to = &timeout; 2240 to = &timeout;
2218 hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME : 2241 hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
2219 CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 2242 CLOCK_REALTIME : CLOCK_MONOTONIC,
2243 HRTIMER_MODE_ABS);
2220 hrtimer_init_sleeper(to, current); 2244 hrtimer_init_sleeper(to, current);
2221 hrtimer_set_expires_range_ns(&to->timer, *abs_time, 2245 hrtimer_set_expires_range_ns(&to->timer, *abs_time,
2222 current->timer_slack_ns); 2246 current->timer_slack_ns);
@@ -2229,12 +2253,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2229 debug_rt_mutex_init_waiter(&rt_waiter); 2253 debug_rt_mutex_init_waiter(&rt_waiter);
2230 rt_waiter.task = NULL; 2254 rt_waiter.task = NULL;
2231 2255
2232 key2 = FUTEX_KEY_INIT; 2256 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2);
2233 ret = get_futex_key(uaddr2, fshared, &key2);
2234 if (unlikely(ret != 0)) 2257 if (unlikely(ret != 0))
2235 goto out; 2258 goto out;
2236 2259
2237 q.pi_state = NULL;
2238 q.bitset = bitset; 2260 q.bitset = bitset;
2239 q.rt_waiter = &rt_waiter; 2261 q.rt_waiter = &rt_waiter;
2240 q.requeue_pi_key = &key2; 2262 q.requeue_pi_key = &key2;
@@ -2243,7 +2265,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2243 * Prepare to wait on uaddr. On success, increments q.key (key1) ref 2265 * Prepare to wait on uaddr. On success, increments q.key (key1) ref
2244 * count. 2266 * count.
2245 */ 2267 */
2246 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); 2268 ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
2247 if (ret) 2269 if (ret)
2248 goto out_key2; 2270 goto out_key2;
2249 2271
@@ -2273,8 +2295,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2273 */ 2295 */
2274 if (q.pi_state && (q.pi_state->owner != current)) { 2296 if (q.pi_state && (q.pi_state->owner != current)) {
2275 spin_lock(q.lock_ptr); 2297 spin_lock(q.lock_ptr);
2276 ret = fixup_pi_state_owner(uaddr2, &q, current, 2298 ret = fixup_pi_state_owner(uaddr2, &q, current);
2277 fshared);
2278 spin_unlock(q.lock_ptr); 2299 spin_unlock(q.lock_ptr);
2279 } 2300 }
2280 } else { 2301 } else {
@@ -2293,7 +2314,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2293 * Fixup the pi_state owner and possibly acquire the lock if we 2314 * Fixup the pi_state owner and possibly acquire the lock if we
2294 * haven't already. 2315 * haven't already.
2295 */ 2316 */
2296 res = fixup_owner(uaddr2, fshared, &q, !ret); 2317 res = fixup_owner(uaddr2, &q, !ret);
2297 /* 2318 /*
2298 * If fixup_owner() returned an error, proprogate that. If it 2319 * If fixup_owner() returned an error, proprogate that. If it
2299 * acquired the lock, clear -ETIMEDOUT or -EINTR. 2320 * acquired the lock, clear -ETIMEDOUT or -EINTR.
@@ -2324,9 +2345,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2324 } 2345 }
2325 2346
2326out_put_keys: 2347out_put_keys:
2327 put_futex_key(fshared, &q.key); 2348 put_futex_key(&q.key);
2328out_key2: 2349out_key2:
2329 put_futex_key(fshared, &key2); 2350 put_futex_key(&key2);
2330 2351
2331out: 2352out:
2332 if (to) { 2353 if (to) {
@@ -2551,58 +2572,57 @@ void exit_robust_list(struct task_struct *curr)
2551long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, 2572long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
2552 u32 __user *uaddr2, u32 val2, u32 val3) 2573 u32 __user *uaddr2, u32 val2, u32 val3)
2553{ 2574{
2554 int clockrt, ret = -ENOSYS; 2575 int ret = -ENOSYS, cmd = op & FUTEX_CMD_MASK;
2555 int cmd = op & FUTEX_CMD_MASK; 2576 unsigned int flags = 0;
2556 int fshared = 0;
2557 2577
2558 if (!(op & FUTEX_PRIVATE_FLAG)) 2578 if (!(op & FUTEX_PRIVATE_FLAG))
2559 fshared = 1; 2579 flags |= FLAGS_SHARED;
2560 2580
2561 clockrt = op & FUTEX_CLOCK_REALTIME; 2581 if (op & FUTEX_CLOCK_REALTIME) {
2562 if (clockrt && cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI) 2582 flags |= FLAGS_CLOCKRT;
2563 return -ENOSYS; 2583 if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
2584 return -ENOSYS;
2585 }
2564 2586
2565 switch (cmd) { 2587 switch (cmd) {
2566 case FUTEX_WAIT: 2588 case FUTEX_WAIT:
2567 val3 = FUTEX_BITSET_MATCH_ANY; 2589 val3 = FUTEX_BITSET_MATCH_ANY;
2568 case FUTEX_WAIT_BITSET: 2590 case FUTEX_WAIT_BITSET:
2569 ret = futex_wait(uaddr, fshared, val, timeout, val3, clockrt); 2591 ret = futex_wait(uaddr, flags, val, timeout, val3);
2570 break; 2592 break;
2571 case FUTEX_WAKE: 2593 case FUTEX_WAKE:
2572 val3 = FUTEX_BITSET_MATCH_ANY; 2594 val3 = FUTEX_BITSET_MATCH_ANY;
2573 case FUTEX_WAKE_BITSET: 2595 case FUTEX_WAKE_BITSET:
2574 ret = futex_wake(uaddr, fshared, val, val3); 2596 ret = futex_wake(uaddr, flags, val, val3);
2575 break; 2597 break;
2576 case FUTEX_REQUEUE: 2598 case FUTEX_REQUEUE:
2577 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 0); 2599 ret = futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
2578 break; 2600 break;
2579 case FUTEX_CMP_REQUEUE: 2601 case FUTEX_CMP_REQUEUE:
2580 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, 2602 ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
2581 0);
2582 break; 2603 break;
2583 case FUTEX_WAKE_OP: 2604 case FUTEX_WAKE_OP:
2584 ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3); 2605 ret = futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
2585 break; 2606 break;
2586 case FUTEX_LOCK_PI: 2607 case FUTEX_LOCK_PI:
2587 if (futex_cmpxchg_enabled) 2608 if (futex_cmpxchg_enabled)
2588 ret = futex_lock_pi(uaddr, fshared, val, timeout, 0); 2609 ret = futex_lock_pi(uaddr, flags, val, timeout, 0);
2589 break; 2610 break;
2590 case FUTEX_UNLOCK_PI: 2611 case FUTEX_UNLOCK_PI:
2591 if (futex_cmpxchg_enabled) 2612 if (futex_cmpxchg_enabled)
2592 ret = futex_unlock_pi(uaddr, fshared); 2613 ret = futex_unlock_pi(uaddr, flags);
2593 break; 2614 break;
2594 case FUTEX_TRYLOCK_PI: 2615 case FUTEX_TRYLOCK_PI:
2595 if (futex_cmpxchg_enabled) 2616 if (futex_cmpxchg_enabled)
2596 ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1); 2617 ret = futex_lock_pi(uaddr, flags, 0, timeout, 1);
2597 break; 2618 break;
2598 case FUTEX_WAIT_REQUEUE_PI: 2619 case FUTEX_WAIT_REQUEUE_PI:
2599 val3 = FUTEX_BITSET_MATCH_ANY; 2620 val3 = FUTEX_BITSET_MATCH_ANY;
2600 ret = futex_wait_requeue_pi(uaddr, fshared, val, timeout, val3, 2621 ret = futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
2601 clockrt, uaddr2); 2622 uaddr2);
2602 break; 2623 break;
2603 case FUTEX_CMP_REQUEUE_PI: 2624 case FUTEX_CMP_REQUEUE_PI:
2604 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, 2625 ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
2605 1);
2606 break; 2626 break;
2607 default: 2627 default:
2608 ret = -ENOSYS; 2628 ret = -ENOSYS;
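The do_futex() hunk above folds the old fshared and clockrt ints into a single flags word that callees test with FLAGS_SHARED and FLAGS_CLOCKRT. Below is a minimal userspace sketch of the same encode-once, test-bits-later pattern; op_to_flags() and the flag values are local to the sketch, not the kernel's internal definitions, while the two op modifier bits mirror the futex uapi.

    #include <stdio.h>

    /* Illustrative flag bits -- local to this sketch, not the kernel's. */
    #define FLAGS_SHARED    0x01u
    #define FLAGS_CLOCKRT   0x02u

    /* Op modifier bits as defined in the futex uapi. */
    #define FUTEX_PRIVATE_FLAG   128
    #define FUTEX_CLOCK_REALTIME 256

    static unsigned int op_to_flags(int op)
    {
            unsigned int flags = 0;

            if (!(op & FUTEX_PRIVATE_FLAG))
                    flags |= FLAGS_SHARED;
            if (op & FUTEX_CLOCK_REALTIME)
                    flags |= FLAGS_CLOCKRT;
            return flags;
    }

    int main(void)
    {
            unsigned int flags = op_to_flags(FUTEX_CLOCK_REALTIME);

            printf("shared=%d clockrt=%d\n",
                   !!(flags & FLAGS_SHARED), !!(flags & FLAGS_CLOCKRT));
            return 0;   /* prints shared=1 clockrt=1 */
    }

Passing one flags argument down the call chain is what lets futex_wait(), futex_wake() and the requeue paths drop their separate fshared/clockrt parameters in the hunks above.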
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 72206cf5c6cf..0c8d7c048615 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -497,7 +497,7 @@ static inline int hrtimer_is_hres_enabled(void)
497 */ 497 */
498static inline int hrtimer_hres_active(void) 498static inline int hrtimer_hres_active(void)
499{ 499{
500 return __get_cpu_var(hrtimer_bases).hres_active; 500 return __this_cpu_read(hrtimer_bases.hres_active);
501} 501}
502 502
503/* 503/*
@@ -516,10 +516,13 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
516 516
517 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { 517 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
518 struct hrtimer *timer; 518 struct hrtimer *timer;
519 struct timerqueue_node *next;
519 520
520 if (!base->first) 521 next = timerqueue_getnext(&base->active);
522 if (!next)
521 continue; 523 continue;
522 timer = rb_entry(base->first, struct hrtimer, node); 524 timer = container_of(next, struct hrtimer, node);
525
523 expires = ktime_sub(hrtimer_get_expires(timer), base->offset); 526 expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
524 /* 527 /*
525 * clock_was_set() has changed base->offset so the 528 * clock_was_set() has changed base->offset so the
@@ -840,48 +843,17 @@ EXPORT_SYMBOL_GPL(hrtimer_forward);
840static int enqueue_hrtimer(struct hrtimer *timer, 843static int enqueue_hrtimer(struct hrtimer *timer,
841 struct hrtimer_clock_base *base) 844 struct hrtimer_clock_base *base)
842{ 845{
843 struct rb_node **link = &base->active.rb_node;
844 struct rb_node *parent = NULL;
845 struct hrtimer *entry;
846 int leftmost = 1;
847
848 debug_activate(timer); 846 debug_activate(timer);
849 847
850 /* 848 timerqueue_add(&base->active, &timer->node);
851 * Find the right place in the rbtree:
852 */
853 while (*link) {
854 parent = *link;
855 entry = rb_entry(parent, struct hrtimer, node);
856 /*
857 * We dont care about collisions. Nodes with
858 * the same expiry time stay together.
859 */
860 if (hrtimer_get_expires_tv64(timer) <
861 hrtimer_get_expires_tv64(entry)) {
862 link = &(*link)->rb_left;
863 } else {
864 link = &(*link)->rb_right;
865 leftmost = 0;
866 }
867 }
868
869 /*
870 * Insert the timer to the rbtree and check whether it
871 * replaces the first pending timer
872 */
873 if (leftmost)
874 base->first = &timer->node;
875 849
876 rb_link_node(&timer->node, parent, link);
877 rb_insert_color(&timer->node, &base->active);
878 /* 850 /*
879 * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the 851 * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the
880 * state of a possibly running callback. 852 * state of a possibly running callback.
881 */ 853 */
882 timer->state |= HRTIMER_STATE_ENQUEUED; 854 timer->state |= HRTIMER_STATE_ENQUEUED;
883 855
884 return leftmost; 856 return (&timer->node == base->active.next);
885} 857}
886 858
887/* 859/*
@@ -901,12 +873,7 @@ static void __remove_hrtimer(struct hrtimer *timer,
901 if (!(timer->state & HRTIMER_STATE_ENQUEUED)) 873 if (!(timer->state & HRTIMER_STATE_ENQUEUED))
902 goto out; 874 goto out;
903 875
904 /* 876 if (&timer->node == timerqueue_getnext(&base->active)) {
905 * Remove the timer from the rbtree and replace the first
906 * entry pointer if necessary.
907 */
908 if (base->first == &timer->node) {
909 base->first = rb_next(&timer->node);
910#ifdef CONFIG_HIGH_RES_TIMERS 877#ifdef CONFIG_HIGH_RES_TIMERS
911 /* Reprogram the clock event device. if enabled */ 878 /* Reprogram the clock event device. if enabled */
912 if (reprogram && hrtimer_hres_active()) { 879 if (reprogram && hrtimer_hres_active()) {
@@ -919,7 +886,7 @@ static void __remove_hrtimer(struct hrtimer *timer,
919 } 886 }
920#endif 887#endif
921 } 888 }
922 rb_erase(&timer->node, &base->active); 889 timerqueue_del(&base->active, &timer->node);
923out: 890out:
924 timer->state = newstate; 891 timer->state = newstate;
925} 892}
@@ -1128,11 +1095,13 @@ ktime_t hrtimer_get_next_event(void)
1128 if (!hrtimer_hres_active()) { 1095 if (!hrtimer_hres_active()) {
1129 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { 1096 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
1130 struct hrtimer *timer; 1097 struct hrtimer *timer;
1098 struct timerqueue_node *next;
1131 1099
1132 if (!base->first) 1100 next = timerqueue_getnext(&base->active);
1101 if (!next)
1133 continue; 1102 continue;
1134 1103
1135 timer = rb_entry(base->first, struct hrtimer, node); 1104 timer = container_of(next, struct hrtimer, node);
1136 delta.tv64 = hrtimer_get_expires_tv64(timer); 1105 delta.tv64 = hrtimer_get_expires_tv64(timer);
1137 delta = ktime_sub(delta, base->get_time()); 1106 delta = ktime_sub(delta, base->get_time());
1138 if (delta.tv64 < mindelta.tv64) 1107 if (delta.tv64 < mindelta.tv64)
@@ -1162,6 +1131,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1162 1131
1163 timer->base = &cpu_base->clock_base[clock_id]; 1132 timer->base = &cpu_base->clock_base[clock_id];
1164 hrtimer_init_timer_hres(timer); 1133 hrtimer_init_timer_hres(timer);
1134 timerqueue_init(&timer->node);
1165 1135
1166#ifdef CONFIG_TIMER_STATS 1136#ifdef CONFIG_TIMER_STATS
1167 timer->start_site = NULL; 1137 timer->start_site = NULL;
@@ -1278,14 +1248,14 @@ retry:
1278 1248
1279 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { 1249 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1280 ktime_t basenow; 1250 ktime_t basenow;
1281 struct rb_node *node; 1251 struct timerqueue_node *node;
1282 1252
1283 basenow = ktime_add(now, base->offset); 1253 basenow = ktime_add(now, base->offset);
1284 1254
1285 while ((node = base->first)) { 1255 while ((node = timerqueue_getnext(&base->active))) {
1286 struct hrtimer *timer; 1256 struct hrtimer *timer;
1287 1257
1288 timer = rb_entry(node, struct hrtimer, node); 1258 timer = container_of(node, struct hrtimer, node);
1289 1259
1290 /* 1260 /*
1291 * The immediate goal for using the softexpires is 1261 * The immediate goal for using the softexpires is
@@ -1441,7 +1411,7 @@ void hrtimer_run_pending(void)
1441 */ 1411 */
1442void hrtimer_run_queues(void) 1412void hrtimer_run_queues(void)
1443{ 1413{
1444 struct rb_node *node; 1414 struct timerqueue_node *node;
1445 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); 1415 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1446 struct hrtimer_clock_base *base; 1416 struct hrtimer_clock_base *base;
1447 int index, gettime = 1; 1417 int index, gettime = 1;
@@ -1451,8 +1421,7 @@ void hrtimer_run_queues(void)
1451 1421
1452 for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) { 1422 for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) {
1453 base = &cpu_base->clock_base[index]; 1423 base = &cpu_base->clock_base[index];
1454 1424 if (!timerqueue_getnext(&base->active))
1455 if (!base->first)
1456 continue; 1425 continue;
1457 1426
1458 if (gettime) { 1427 if (gettime) {
@@ -1462,10 +1431,10 @@ void hrtimer_run_queues(void)
1462 1431
1463 raw_spin_lock(&cpu_base->lock); 1432 raw_spin_lock(&cpu_base->lock);
1464 1433
1465 while ((node = base->first)) { 1434 while ((node = timerqueue_getnext(&base->active))) {
1466 struct hrtimer *timer; 1435 struct hrtimer *timer;
1467 1436
1468 timer = rb_entry(node, struct hrtimer, node); 1437 timer = container_of(node, struct hrtimer, node);
1469 if (base->softirq_time.tv64 <= 1438 if (base->softirq_time.tv64 <=
1470 hrtimer_get_expires_tv64(timer)) 1439 hrtimer_get_expires_tv64(timer))
1471 break; 1440 break;
@@ -1630,8 +1599,10 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
1630 1599
1631 raw_spin_lock_init(&cpu_base->lock); 1600 raw_spin_lock_init(&cpu_base->lock);
1632 1601
1633 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) 1602 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1634 cpu_base->clock_base[i].cpu_base = cpu_base; 1603 cpu_base->clock_base[i].cpu_base = cpu_base;
1604 timerqueue_init_head(&cpu_base->clock_base[i].active);
1605 }
1635 1606
1636 hrtimer_init_hres(cpu_base); 1607 hrtimer_init_hres(cpu_base);
1637} 1608}
@@ -1642,10 +1613,10 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
1642 struct hrtimer_clock_base *new_base) 1613 struct hrtimer_clock_base *new_base)
1643{ 1614{
1644 struct hrtimer *timer; 1615 struct hrtimer *timer;
1645 struct rb_node *node; 1616 struct timerqueue_node *node;
1646 1617
1647 while ((node = rb_first(&old_base->active))) { 1618 while ((node = timerqueue_getnext(&old_base->active))) {
1648 timer = rb_entry(node, struct hrtimer, node); 1619 timer = container_of(node, struct hrtimer, node);
1649 BUG_ON(hrtimer_callback_running(timer)); 1620 BUG_ON(hrtimer_callback_running(timer));
1650 debug_deactivate(timer); 1621 debug_deactivate(timer);
1651 1622
@@ -1774,7 +1745,7 @@ schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta,
1774 } 1745 }
1775 1746
1776 /* 1747 /*
1777 * A NULL parameter means "inifinte" 1748 * A NULL parameter means "infinite"
1778 */ 1749 */
1779 if (!expires) { 1750 if (!expires) {
1780 schedule(); 1751 schedule();
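The hrtimer conversion above replaces the open-coded rbtree bookkeeping and the cached base->first pointer with the shared timerqueue helpers (timerqueue_add/timerqueue_del/timerqueue_getnext), which keep the earliest-expiring node available in O(1). Here is a rough userspace analogue of that interface using a sorted singly linked list instead of an rbtree; the tq_* names and types are invented for the sketch.

    #include <stdio.h>
    #include <stddef.h>

    struct tq_node { unsigned long long expires; struct tq_node *next; };
    struct tq_head { struct tq_node *first; };   /* always the earliest node */

    static void tq_add(struct tq_head *h, struct tq_node *n)
    {
            struct tq_node **link = &h->first;

            /* Keep the list sorted by expiry; equal expiries stay together. */
            while (*link && (*link)->expires <= n->expires)
                    link = &(*link)->next;
            n->next = *link;
            *link = n;
    }

    static struct tq_node *tq_getnext(struct tq_head *h)
    {
            return h->first;                     /* like timerqueue_getnext() */
    }

    static void tq_del(struct tq_head *h, struct tq_node *n)
    {
            struct tq_node **link = &h->first;

            while (*link && *link != n)
                    link = &(*link)->next;
            if (*link)
                    *link = n->next;
    }

    int main(void)
    {
            struct tq_head head = { NULL };
            struct tq_node a = { 30, NULL }, b = { 10, NULL }, c = { 20, NULL };

            tq_add(&head, &a); tq_add(&head, &b); tq_add(&head, &c);
            printf("next: %llu\n", tq_getnext(&head)->expires);   /* 10 */
            tq_del(&head, &b);
            printf("next: %llu\n", tq_getnext(&head)->expires);   /* 20 */
            return 0;
    }

The real timerqueue still uses an rbtree for O(log n) insertion, but the hrtimer callers only ever need the cached head, which is why hrtimer_force_reprogram(), hrtimer_get_next_event() and hrtimer_run_queues() now simply call timerqueue_getnext().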
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index e5325825aeb6..086adf25a55e 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -641,7 +641,7 @@ int __init init_hw_breakpoint(void)
641 641
642 constraints_initialized = 1; 642 constraints_initialized = 1;
643 643
644 perf_pmu_register(&perf_breakpoint); 644 perf_pmu_register(&perf_breakpoint, "breakpoint", PERF_TYPE_BREAKPOINT);
645 645
646 return register_die_notifier(&hw_breakpoint_exceptions_nb); 646 return register_die_notifier(&hw_breakpoint_exceptions_nb);
647 647
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 9988d03797f5..282f20230e67 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -72,6 +72,8 @@ static inline int desc_node(struct irq_desc *desc) { return 0; }
72 72
73static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) 73static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node)
74{ 74{
75 int cpu;
76
75 desc->irq_data.irq = irq; 77 desc->irq_data.irq = irq;
76 desc->irq_data.chip = &no_irq_chip; 78 desc->irq_data.chip = &no_irq_chip;
77 desc->irq_data.chip_data = NULL; 79 desc->irq_data.chip_data = NULL;
@@ -83,7 +85,8 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node)
83 desc->irq_count = 0; 85 desc->irq_count = 0;
84 desc->irqs_unhandled = 0; 86 desc->irqs_unhandled = 0;
85 desc->name = NULL; 87 desc->name = NULL;
86 memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs))); 88 for_each_possible_cpu(cpu)
89 *per_cpu_ptr(desc->kstat_irqs, cpu) = 0;
87 desc_smp_init(desc, node); 90 desc_smp_init(desc, node);
88} 91}
89 92
@@ -133,8 +136,7 @@ static struct irq_desc *alloc_desc(int irq, int node)
133 if (!desc) 136 if (!desc)
134 return NULL; 137 return NULL;
135 /* allocate based on nr_cpu_ids */ 138 /* allocate based on nr_cpu_ids */
136 desc->kstat_irqs = kzalloc_node(nr_cpu_ids * sizeof(*desc->kstat_irqs), 139 desc->kstat_irqs = alloc_percpu(unsigned int);
137 gfp, node);
138 if (!desc->kstat_irqs) 140 if (!desc->kstat_irqs)
139 goto err_desc; 141 goto err_desc;
140 142
@@ -149,7 +151,7 @@ static struct irq_desc *alloc_desc(int irq, int node)
149 return desc; 151 return desc;
150 152
151err_kstat: 153err_kstat:
152 kfree(desc->kstat_irqs); 154 free_percpu(desc->kstat_irqs);
153err_desc: 155err_desc:
154 kfree(desc); 156 kfree(desc);
155 return NULL; 157 return NULL;
@@ -166,7 +168,7 @@ static void free_desc(unsigned int irq)
166 mutex_unlock(&sparse_irq_lock); 168 mutex_unlock(&sparse_irq_lock);
167 169
168 free_masks(desc); 170 free_masks(desc);
169 kfree(desc->kstat_irqs); 171 free_percpu(desc->kstat_irqs);
170 kfree(desc); 172 kfree(desc);
171} 173}
172 174
@@ -234,7 +236,6 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
234 } 236 }
235}; 237};
236 238
237static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS];
238int __init early_irq_init(void) 239int __init early_irq_init(void)
239{ 240{
240 int count, i, node = first_online_node; 241 int count, i, node = first_online_node;
@@ -250,7 +251,8 @@ int __init early_irq_init(void)
250 for (i = 0; i < count; i++) { 251 for (i = 0; i < count; i++) {
251 desc[i].irq_data.irq = i; 252 desc[i].irq_data.irq = i;
252 desc[i].irq_data.chip = &no_irq_chip; 253 desc[i].irq_data.chip = &no_irq_chip;
253 desc[i].kstat_irqs = kstat_irqs_all[i]; 254 /* TODO : do this allocation on-demand ... */
255 desc[i].kstat_irqs = alloc_percpu(unsigned int);
254 alloc_masks(desc + i, GFP_KERNEL, node); 256 alloc_masks(desc + i, GFP_KERNEL, node);
255 desc_smp_init(desc + i, node); 257 desc_smp_init(desc + i, node);
256 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); 258 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
@@ -275,6 +277,22 @@ static void free_desc(unsigned int irq)
275 277
276static inline int alloc_descs(unsigned int start, unsigned int cnt, int node) 278static inline int alloc_descs(unsigned int start, unsigned int cnt, int node)
277{ 279{
280#if defined(CONFIG_KSTAT_IRQS_ONDEMAND)
281 struct irq_desc *desc;
282 unsigned int i;
283
284 for (i = 0; i < cnt; i++) {
285 desc = irq_to_desc(start + i);
286 if (desc && !desc->kstat_irqs) {
287 unsigned int __percpu *stats = alloc_percpu(unsigned int);
288
289 if (!stats)
290 return -1;
291 if (cmpxchg(&desc->kstat_irqs, NULL, stats) != NULL)
292 free_percpu(stats);
293 }
294 }
295#endif
278 return start; 296 return start;
279} 297}
280#endif /* !CONFIG_SPARSE_IRQ */ 298#endif /* !CONFIG_SPARSE_IRQ */
@@ -391,7 +409,9 @@ void dynamic_irq_cleanup(unsigned int irq)
391unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) 409unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
392{ 410{
393 struct irq_desc *desc = irq_to_desc(irq); 411 struct irq_desc *desc = irq_to_desc(irq);
394 return desc ? desc->kstat_irqs[cpu] : 0; 412
413 return desc && desc->kstat_irqs ?
414 *per_cpu_ptr(desc->kstat_irqs, cpu) : 0;
395} 415}
396 416
397#ifdef CONFIG_GENERIC_HARDIRQS 417#ifdef CONFIG_GENERIC_HARDIRQS
@@ -401,10 +421,10 @@ unsigned int kstat_irqs(unsigned int irq)
401 int cpu; 421 int cpu;
402 int sum = 0; 422 int sum = 0;
403 423
404 if (!desc) 424 if (!desc || !desc->kstat_irqs)
405 return 0; 425 return 0;
406 for_each_possible_cpu(cpu) 426 for_each_possible_cpu(cpu)
407 sum += desc->kstat_irqs[cpu]; 427 sum += *per_cpu_ptr(desc->kstat_irqs, cpu);
408 return sum; 428 return sum;
409} 429}
410#endif /* CONFIG_GENERIC_HARDIRQS */ 430#endif /* CONFIG_GENERIC_HARDIRQS */
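The irqdesc change above moves kstat_irqs from a flat nr_cpu_ids-sized array per descriptor to alloc_percpu(unsigned int): each CPU increments its own slot and kstat_irqs() folds the slots on read. The small sketch below shows that counting pattern in userspace, with a plain heap array standing in for per-CPU storage; NR_CPUS and the helper names are made up for the example.

    #include <stdio.h>
    #include <stdlib.h>

    #define NR_CPUS 4   /* illustrative CPU count */

    /* Stand-in for alloc_percpu(unsigned int): one counter slot per CPU. */
    static unsigned int *alloc_percpu_uint(void)
    {
            return calloc(NR_CPUS, sizeof(unsigned int));
    }

    /* Each CPU only ever touches its own slot, so no lock is needed. */
    static void count_irq(unsigned int *stats, int cpu)
    {
            stats[cpu]++;
    }

    /* Analogue of kstat_irqs(): fold the per-CPU slots into one total. */
    static unsigned int sum_irqs(const unsigned int *stats)
    {
            unsigned int sum = 0;

            for (int cpu = 0; cpu < NR_CPUS; cpu++)
                    sum += stats[cpu];
            return sum;
    }

    int main(void)
    {
            unsigned int *stats = alloc_percpu_uint();

            if (!stats)
                    return 1;
            count_irq(stats, 0);
            count_irq(stats, 2);
            count_irq(stats, 2);
            printf("irq total: %u\n", sum_irqs(stats));   /* 3 */
            free(stats);
            return 0;
    }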
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 5f92acc5f952..0caa59f747dd 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -577,7 +577,9 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { }
577 */ 577 */
578static int irq_thread(void *data) 578static int irq_thread(void *data)
579{ 579{
580 struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, }; 580 static const struct sched_param param = {
581 .sched_priority = MAX_USER_RT_PRIO/2,
582 };
581 struct irqaction *action = data; 583 struct irqaction *action = data;
582 struct irq_desc *desc = irq_to_desc(action->irq); 584 struct irq_desc *desc = irq_to_desc(action->irq);
583 int wake, oneshot = desc->status & IRQ_ONESHOT; 585 int wake, oneshot = desc->status & IRQ_ONESHOT;
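Making the sched_param initializer static const here (and in the kthread.c hunk further down) moves it off the thread function's stack and into read-only data, built once at compile time instead of on every call. A tiny userspace illustration of the difference follows; it uses the POSIX struct sched_param only so the sketch compiles, and performs no actual scheduling.

    #include <stdio.h>
    #include <sched.h>

    /* One read-only copy in .rodata, shared by every call. */
    static void use_static_const(void)
    {
            static const struct sched_param param = { .sched_priority = 50 };

            printf("static const copy at %p, prio %d\n",
                   (const void *)&param, param.sched_priority);
    }

    /* A fresh on-stack copy, re-initialised on every call. */
    static void use_stack_local(void)
    {
            struct sched_param param = { .sched_priority = 50 };

            printf("stack copy at        %p, prio %d\n",
                   (void *)&param, param.sched_priority);
    }

    int main(void)
    {
            use_static_const();
            use_stack_local();
            use_stack_local();   /* same value, rebuilt on the stack each time */
            return 0;
    }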
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 90f881904bb1..c58fa7da8aef 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -77,21 +77,21 @@ void __weak arch_irq_work_raise(void)
77 */ 77 */
78static void __irq_work_queue(struct irq_work *entry) 78static void __irq_work_queue(struct irq_work *entry)
79{ 79{
80 struct irq_work **head, *next; 80 struct irq_work *next;
81 81
82 head = &get_cpu_var(irq_work_list); 82 preempt_disable();
83 83
84 do { 84 do {
85 next = *head; 85 next = __this_cpu_read(irq_work_list);
86 /* Can assign non-atomic because we keep the flags set. */ 86 /* Can assign non-atomic because we keep the flags set. */
87 entry->next = next_flags(next, IRQ_WORK_FLAGS); 87 entry->next = next_flags(next, IRQ_WORK_FLAGS);
88 } while (cmpxchg(head, next, entry) != next); 88 } while (this_cpu_cmpxchg(irq_work_list, next, entry) != next);
89 89
90 /* The list was empty, raise self-interrupt to start processing. */ 90 /* The list was empty, raise self-interrupt to start processing. */
91 if (!irq_work_next(entry)) 91 if (!irq_work_next(entry))
92 arch_irq_work_raise(); 92 arch_irq_work_raise();
93 93
94 put_cpu_var(irq_work_list); 94 preempt_enable();
95} 95}
96 96
97/* 97/*
@@ -120,16 +120,16 @@ EXPORT_SYMBOL_GPL(irq_work_queue);
120 */ 120 */
121void irq_work_run(void) 121void irq_work_run(void)
122{ 122{
123 struct irq_work *list, **head; 123 struct irq_work *list;
124 124
125 head = &__get_cpu_var(irq_work_list); 125 if (this_cpu_read(irq_work_list) == NULL)
126 if (*head == NULL)
127 return; 126 return;
128 127
129 BUG_ON(!in_irq()); 128 BUG_ON(!in_irq());
130 BUG_ON(!irqs_disabled()); 129 BUG_ON(!irqs_disabled());
131 130
132 list = xchg(head, NULL); 131 list = this_cpu_xchg(irq_work_list, NULL);
132
133 while (list != NULL) { 133 while (list != NULL) {
134 struct irq_work *entry = list; 134 struct irq_work *entry = list;
135 135
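__irq_work_queue() now builds on this_cpu_cmpxchg()/this_cpu_xchg() instead of taking the address of the per-CPU list head, but underneath it is still a compare-and-swap push onto a singly linked list and an exchange-to-NULL drain. Below is a userspace sketch of that lock-free pattern using C11 atomics; the work/work_queue/work_drain names are invented, and a single global head stands in for the per-CPU variable.

    #include <stdio.h>
    #include <stdatomic.h>

    struct work { int id; struct work *next; };

    /* One global head; in the kernel this is the per-CPU irq_work_list. */
    static _Atomic(struct work *) work_list;

    /* cmpxchg-style push: retry until the head we read is still the head. */
    static void work_queue(struct work *entry)
    {
            struct work *head = atomic_load(&work_list);

            do {
                    entry->next = head;   /* 'head' is refreshed on CAS failure */
            } while (!atomic_compare_exchange_weak(&work_list, &head, entry));
    }

    /* xchg-style drain: grab the whole list at once, as irq_work_run() does. */
    static struct work *work_drain(void)
    {
            return atomic_exchange(&work_list, NULL);
    }

    int main(void)
    {
            struct work a = { 1, NULL }, b = { 2, NULL };

            work_queue(&a);
            work_queue(&b);
            for (struct work *w = work_drain(); w; w = w->next)
                    printf("running work %d\n", w->id);   /* 2 then 1: LIFO push */
            return 0;
    }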
diff --git a/kernel/kexec.c b/kernel/kexec.c
index b55045bc7563..ec19b92c7ebd 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -163,7 +163,7 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
163 * just verifies it is an address we can use. 163 * just verifies it is an address we can use.
164 * 164 *
165 * Since the kernel does everything in page size chunks ensure 165 * Since the kernel does everything in page size chunks ensure
166 * the destination addreses are page aligned. Too many 166 * the destination addresses are page aligned. Too many
167 * special cases crop up when we don't do this. The most 167 * special cases crop up when we don't do this. The most
168 * insidious is getting overlapping destination addresses 168 * insidious is getting overlapping destination addresses
169 * simply because addresses are changed to page size 169 * simply because addresses are changed to page size
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 9737a76e106f..77981813a1e7 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -317,12 +317,12 @@ void __kprobes free_optinsn_slot(kprobe_opcode_t * slot, int dirty)
317/* We have preemption disabled.. so it is safe to use __ versions */ 317/* We have preemption disabled.. so it is safe to use __ versions */
318static inline void set_kprobe_instance(struct kprobe *kp) 318static inline void set_kprobe_instance(struct kprobe *kp)
319{ 319{
320 __get_cpu_var(kprobe_instance) = kp; 320 __this_cpu_write(kprobe_instance, kp);
321} 321}
322 322
323static inline void reset_kprobe_instance(void) 323static inline void reset_kprobe_instance(void)
324{ 324{
325 __get_cpu_var(kprobe_instance) = NULL; 325 __this_cpu_write(kprobe_instance, NULL);
326} 326}
327 327
328/* 328/*
@@ -354,13 +354,20 @@ static inline int kprobe_aggrprobe(struct kprobe *p)
354 return p->pre_handler == aggr_pre_handler; 354 return p->pre_handler == aggr_pre_handler;
355} 355}
356 356
357/* Return true(!0) if the kprobe is unused */
358static inline int kprobe_unused(struct kprobe *p)
359{
360 return kprobe_aggrprobe(p) && kprobe_disabled(p) &&
361 list_empty(&p->list);
362}
363
357/* 364/*
358 * Keep all fields in the kprobe consistent 365 * Keep all fields in the kprobe consistent
359 */ 366 */
360static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p) 367static inline void copy_kprobe(struct kprobe *ap, struct kprobe *p)
361{ 368{
362 memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t)); 369 memcpy(&p->opcode, &ap->opcode, sizeof(kprobe_opcode_t));
363 memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn)); 370 memcpy(&p->ainsn, &ap->ainsn, sizeof(struct arch_specific_insn));
364} 371}
365 372
366#ifdef CONFIG_OPTPROBES 373#ifdef CONFIG_OPTPROBES
@@ -384,6 +391,17 @@ void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs)
384 } 391 }
385} 392}
386 393
394/* Free optimized instructions and optimized_kprobe */
395static __kprobes void free_aggr_kprobe(struct kprobe *p)
396{
397 struct optimized_kprobe *op;
398
399 op = container_of(p, struct optimized_kprobe, kp);
400 arch_remove_optimized_kprobe(op);
401 arch_remove_kprobe(p);
402 kfree(op);
403}
404
387/* Return true(!0) if the kprobe is ready for optimization. */ 405/* Return true(!0) if the kprobe is ready for optimization. */
388static inline int kprobe_optready(struct kprobe *p) 406static inline int kprobe_optready(struct kprobe *p)
389{ 407{
@@ -397,6 +415,33 @@ static inline int kprobe_optready(struct kprobe *p)
397 return 0; 415 return 0;
398} 416}
399 417
418/* Return true(!0) if the kprobe is disarmed. Note: p must be on hash list */
419static inline int kprobe_disarmed(struct kprobe *p)
420{
421 struct optimized_kprobe *op;
422
423 /* If kprobe is not aggr/opt probe, just return kprobe is disabled */
424 if (!kprobe_aggrprobe(p))
425 return kprobe_disabled(p);
426
427 op = container_of(p, struct optimized_kprobe, kp);
428
429 return kprobe_disabled(p) && list_empty(&op->list);
430}
431
432/* Return true(!0) if the probe is queued on (un)optimizing lists */
433static int __kprobes kprobe_queued(struct kprobe *p)
434{
435 struct optimized_kprobe *op;
436
437 if (kprobe_aggrprobe(p)) {
438 op = container_of(p, struct optimized_kprobe, kp);
439 if (!list_empty(&op->list))
440 return 1;
441 }
442 return 0;
443}
444
400/* 445/*
401 * Return an optimized kprobe whose optimizing code replaces 446 * Return an optimized kprobe whose optimizing code replaces
402 * instructions including addr (exclude breakpoint). 447 * instructions including addr (exclude breakpoint).
@@ -422,30 +467,23 @@ static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr)
422 467
423/* Optimization staging list, protected by kprobe_mutex */ 468/* Optimization staging list, protected by kprobe_mutex */
424static LIST_HEAD(optimizing_list); 469static LIST_HEAD(optimizing_list);
470static LIST_HEAD(unoptimizing_list);
425 471
426static void kprobe_optimizer(struct work_struct *work); 472static void kprobe_optimizer(struct work_struct *work);
427static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); 473static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
474static DECLARE_COMPLETION(optimizer_comp);
428#define OPTIMIZE_DELAY 5 475#define OPTIMIZE_DELAY 5
429 476
430/* Kprobe jump optimizer */ 477/*
431static __kprobes void kprobe_optimizer(struct work_struct *work) 478 * Optimize (replace a breakpoint with a jump) kprobes listed on
479 * optimizing_list.
480 */
481static __kprobes void do_optimize_kprobes(void)
432{ 482{
 433 struct optimized_kprobe *op, *tmp; 483 /* Optimization is never done when disarmed */
434 484 if (kprobes_all_disarmed || !kprobes_allow_optimization ||
435 /* Lock modules while optimizing kprobes */ 485 list_empty(&optimizing_list))
436 mutex_lock(&module_mutex); 486 return;
437 mutex_lock(&kprobe_mutex);
438 if (kprobes_all_disarmed || !kprobes_allow_optimization)
439 goto end;
440
441 /*
 442 * Wait for quiescence period to ensure all running interrupts
443 * are done. Because optprobe may modify multiple instructions
444 * there is a chance that Nth instruction is interrupted. In that
445 * case, running interrupt can return to 2nd-Nth byte of jump
446 * instruction. This wait is for avoiding it.
447 */
448 synchronize_sched();
449 487
450 /* 488 /*
451 * The optimization/unoptimization refers online_cpus via 489 * The optimization/unoptimization refers online_cpus via
@@ -459,17 +497,111 @@ static __kprobes void kprobe_optimizer(struct work_struct *work)
459 */ 497 */
460 get_online_cpus(); 498 get_online_cpus();
461 mutex_lock(&text_mutex); 499 mutex_lock(&text_mutex);
462 list_for_each_entry_safe(op, tmp, &optimizing_list, list) { 500 arch_optimize_kprobes(&optimizing_list);
463 WARN_ON(kprobe_disabled(&op->kp)); 501 mutex_unlock(&text_mutex);
464 if (arch_optimize_kprobe(op) < 0) 502 put_online_cpus();
465 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; 503}
466 list_del_init(&op->list); 504
505/*
506 * Unoptimize (replace a jump with a breakpoint and remove the breakpoint
507 * if need) kprobes listed on unoptimizing_list.
508 */
509static __kprobes void do_unoptimize_kprobes(struct list_head *free_list)
510{
511 struct optimized_kprobe *op, *tmp;
512
513 /* Unoptimization must be done anytime */
514 if (list_empty(&unoptimizing_list))
515 return;
516
517 /* Ditto to do_optimize_kprobes */
518 get_online_cpus();
519 mutex_lock(&text_mutex);
520 arch_unoptimize_kprobes(&unoptimizing_list, free_list);
521 /* Loop free_list for disarming */
522 list_for_each_entry_safe(op, tmp, free_list, list) {
523 /* Disarm probes if marked disabled */
524 if (kprobe_disabled(&op->kp))
525 arch_disarm_kprobe(&op->kp);
526 if (kprobe_unused(&op->kp)) {
527 /*
528 * Remove unused probes from hash list. After waiting
529 * for synchronization, these probes are reclaimed.
530 * (reclaiming is done by do_free_cleaned_kprobes.)
531 */
532 hlist_del_rcu(&op->kp.hlist);
533 } else
534 list_del_init(&op->list);
467 } 535 }
468 mutex_unlock(&text_mutex); 536 mutex_unlock(&text_mutex);
469 put_online_cpus(); 537 put_online_cpus();
470end: 538}
539
540/* Reclaim all kprobes on the free_list */
541static __kprobes void do_free_cleaned_kprobes(struct list_head *free_list)
542{
543 struct optimized_kprobe *op, *tmp;
544
545 list_for_each_entry_safe(op, tmp, free_list, list) {
546 BUG_ON(!kprobe_unused(&op->kp));
547 list_del_init(&op->list);
548 free_aggr_kprobe(&op->kp);
549 }
550}
551
552/* Start optimizer after OPTIMIZE_DELAY passed */
553static __kprobes void kick_kprobe_optimizer(void)
554{
555 if (!delayed_work_pending(&optimizing_work))
556 schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
557}
558
559/* Kprobe jump optimizer */
560static __kprobes void kprobe_optimizer(struct work_struct *work)
561{
562 LIST_HEAD(free_list);
563
564 /* Lock modules while optimizing kprobes */
565 mutex_lock(&module_mutex);
566 mutex_lock(&kprobe_mutex);
567
568 /*
569 * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed)
 570 * kprobes before waiting for quiescence period.
571 */
572 do_unoptimize_kprobes(&free_list);
573
574 /*
 575 * Step 2: Wait for quiescence period to ensure all running interrupts
576 * are done. Because optprobe may modify multiple instructions
577 * there is a chance that Nth instruction is interrupted. In that
578 * case, running interrupt can return to 2nd-Nth byte of jump
579 * instruction. This wait is for avoiding it.
580 */
581 synchronize_sched();
582
 583 /* Step 3: Optimize kprobes after quiescence period */
584 do_optimize_kprobes();
585
 586 /* Step 4: Free cleaned kprobes after quiescence period */
587 do_free_cleaned_kprobes(&free_list);
588
471 mutex_unlock(&kprobe_mutex); 589 mutex_unlock(&kprobe_mutex);
472 mutex_unlock(&module_mutex); 590 mutex_unlock(&module_mutex);
591
592 /* Step 5: Kick optimizer again if needed */
593 if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list))
594 kick_kprobe_optimizer();
595 else
596 /* Wake up all waiters */
597 complete_all(&optimizer_comp);
598}
599
600/* Wait for completing optimization and unoptimization */
601static __kprobes void wait_for_kprobe_optimizer(void)
602{
603 if (delayed_work_pending(&optimizing_work))
604 wait_for_completion(&optimizer_comp);
473} 605}
474 606
475/* Optimize kprobe if p is ready to be optimized */ 607/* Optimize kprobe if p is ready to be optimized */
@@ -495,42 +627,99 @@ static __kprobes void optimize_kprobe(struct kprobe *p)
495 /* Check if it is already optimized. */ 627 /* Check if it is already optimized. */
496 if (op->kp.flags & KPROBE_FLAG_OPTIMIZED) 628 if (op->kp.flags & KPROBE_FLAG_OPTIMIZED)
497 return; 629 return;
498
499 op->kp.flags |= KPROBE_FLAG_OPTIMIZED; 630 op->kp.flags |= KPROBE_FLAG_OPTIMIZED;
500 list_add(&op->list, &optimizing_list); 631
501 if (!delayed_work_pending(&optimizing_work)) 632 if (!list_empty(&op->list))
502 schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY); 633 /* This is under unoptimizing. Just dequeue the probe */
634 list_del_init(&op->list);
635 else {
636 list_add(&op->list, &optimizing_list);
637 kick_kprobe_optimizer();
638 }
639}
640
641/* Short cut to direct unoptimizing */
642static __kprobes void force_unoptimize_kprobe(struct optimized_kprobe *op)
643{
644 get_online_cpus();
645 arch_unoptimize_kprobe(op);
646 put_online_cpus();
647 if (kprobe_disabled(&op->kp))
648 arch_disarm_kprobe(&op->kp);
503} 649}
504 650
505/* Unoptimize a kprobe if p is optimized */ 651/* Unoptimize a kprobe if p is optimized */
506static __kprobes void unoptimize_kprobe(struct kprobe *p) 652static __kprobes void unoptimize_kprobe(struct kprobe *p, bool force)
507{ 653{
508 struct optimized_kprobe *op; 654 struct optimized_kprobe *op;
509 655
510 if ((p->flags & KPROBE_FLAG_OPTIMIZED) && kprobe_aggrprobe(p)) { 656 if (!kprobe_aggrprobe(p) || kprobe_disarmed(p))
511 op = container_of(p, struct optimized_kprobe, kp); 657 return; /* This is not an optprobe nor optimized */
512 if (!list_empty(&op->list)) 658
513 /* Dequeue from the optimization queue */ 659 op = container_of(p, struct optimized_kprobe, kp);
660 if (!kprobe_optimized(p)) {
661 /* Unoptimized or unoptimizing case */
662 if (force && !list_empty(&op->list)) {
663 /*
664 * Only if this is unoptimizing kprobe and forced,
665 * forcibly unoptimize it. (No need to unoptimize
666 * unoptimized kprobe again :)
667 */
514 list_del_init(&op->list); 668 list_del_init(&op->list);
515 else 669 force_unoptimize_kprobe(op);
516 /* Replace jump with break */ 670 }
517 arch_unoptimize_kprobe(op); 671 return;
518 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; 672 }
673
674 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
675 if (!list_empty(&op->list)) {
676 /* Dequeue from the optimization queue */
677 list_del_init(&op->list);
678 return;
679 }
680 /* Optimized kprobe case */
681 if (force)
682 /* Forcibly update the code: this is a special case */
683 force_unoptimize_kprobe(op);
684 else {
685 list_add(&op->list, &unoptimizing_list);
686 kick_kprobe_optimizer();
519 } 687 }
520} 688}
521 689
690/* Cancel unoptimizing for reusing */
691static void reuse_unused_kprobe(struct kprobe *ap)
692{
693 struct optimized_kprobe *op;
694
695 BUG_ON(!kprobe_unused(ap));
696 /*
697 * Unused kprobe MUST be on the way of delayed unoptimizing (means
698 * there is still a relative jump) and disabled.
699 */
700 op = container_of(ap, struct optimized_kprobe, kp);
701 if (unlikely(list_empty(&op->list)))
702 printk(KERN_WARNING "Warning: found a stray unused "
703 "aggrprobe@%p\n", ap->addr);
704 /* Enable the probe again */
705 ap->flags &= ~KPROBE_FLAG_DISABLED;
706 /* Optimize it again (remove from op->list) */
707 BUG_ON(!kprobe_optready(ap));
708 optimize_kprobe(ap);
709}
710
522/* Remove optimized instructions */ 711/* Remove optimized instructions */
523static void __kprobes kill_optimized_kprobe(struct kprobe *p) 712static void __kprobes kill_optimized_kprobe(struct kprobe *p)
524{ 713{
525 struct optimized_kprobe *op; 714 struct optimized_kprobe *op;
526 715
527 op = container_of(p, struct optimized_kprobe, kp); 716 op = container_of(p, struct optimized_kprobe, kp);
528 if (!list_empty(&op->list)) { 717 if (!list_empty(&op->list))
529 /* Dequeue from the optimization queue */ 718 /* Dequeue from the (un)optimization queue */
530 list_del_init(&op->list); 719 list_del_init(&op->list);
531 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; 720
532 } 721 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
533 /* Don't unoptimize, because the target code will be freed. */ 722 /* Don't touch the code, because it is already freed. */
534 arch_remove_optimized_kprobe(op); 723 arch_remove_optimized_kprobe(op);
535} 724}
536 725
@@ -543,16 +732,6 @@ static __kprobes void prepare_optimized_kprobe(struct kprobe *p)
543 arch_prepare_optimized_kprobe(op); 732 arch_prepare_optimized_kprobe(op);
544} 733}
545 734
546/* Free optimized instructions and optimized_kprobe */
547static __kprobes void free_aggr_kprobe(struct kprobe *p)
548{
549 struct optimized_kprobe *op;
550
551 op = container_of(p, struct optimized_kprobe, kp);
552 arch_remove_optimized_kprobe(op);
553 kfree(op);
554}
555
556/* Allocate new optimized_kprobe and try to prepare optimized instructions */ 735/* Allocate new optimized_kprobe and try to prepare optimized instructions */
557static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) 736static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
558{ 737{
@@ -587,7 +766,8 @@ static __kprobes void try_to_optimize_kprobe(struct kprobe *p)
587 op = container_of(ap, struct optimized_kprobe, kp); 766 op = container_of(ap, struct optimized_kprobe, kp);
588 if (!arch_prepared_optinsn(&op->optinsn)) { 767 if (!arch_prepared_optinsn(&op->optinsn)) {
589 /* If failed to setup optimizing, fallback to kprobe */ 768 /* If failed to setup optimizing, fallback to kprobe */
590 free_aggr_kprobe(ap); 769 arch_remove_optimized_kprobe(op);
770 kfree(op);
591 return; 771 return;
592 } 772 }
593 773
@@ -631,21 +811,16 @@ static void __kprobes unoptimize_all_kprobes(void)
631 return; 811 return;
632 812
633 kprobes_allow_optimization = false; 813 kprobes_allow_optimization = false;
634 printk(KERN_INFO "Kprobes globally unoptimized\n");
635 get_online_cpus(); /* For avoiding text_mutex deadlock */
636 mutex_lock(&text_mutex);
637 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 814 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
638 head = &kprobe_table[i]; 815 head = &kprobe_table[i];
639 hlist_for_each_entry_rcu(p, node, head, hlist) { 816 hlist_for_each_entry_rcu(p, node, head, hlist) {
640 if (!kprobe_disabled(p)) 817 if (!kprobe_disabled(p))
641 unoptimize_kprobe(p); 818 unoptimize_kprobe(p, false);
642 } 819 }
643 } 820 }
644 821 /* Wait for unoptimizing completion */
645 mutex_unlock(&text_mutex); 822 wait_for_kprobe_optimizer();
646 put_online_cpus(); 823 printk(KERN_INFO "Kprobes globally unoptimized\n");
647 /* Allow all currently running kprobes to complete */
648 synchronize_sched();
649} 824}
650 825
651int sysctl_kprobes_optimization; 826int sysctl_kprobes_optimization;
@@ -669,44 +844,60 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
669} 844}
670#endif /* CONFIG_SYSCTL */ 845#endif /* CONFIG_SYSCTL */
671 846
847/* Put a breakpoint for a probe. Must be called with text_mutex locked */
672static void __kprobes __arm_kprobe(struct kprobe *p) 848static void __kprobes __arm_kprobe(struct kprobe *p)
673{ 849{
674 struct kprobe *old_p; 850 struct kprobe *_p;
675 851
676 /* Check collision with other optimized kprobes */ 852 /* Check collision with other optimized kprobes */
677 old_p = get_optimized_kprobe((unsigned long)p->addr); 853 _p = get_optimized_kprobe((unsigned long)p->addr);
678 if (unlikely(old_p)) 854 if (unlikely(_p))
679 unoptimize_kprobe(old_p); /* Fallback to unoptimized kprobe */ 855 /* Fallback to unoptimized kprobe */
856 unoptimize_kprobe(_p, true);
680 857
681 arch_arm_kprobe(p); 858 arch_arm_kprobe(p);
682 optimize_kprobe(p); /* Try to optimize (add kprobe to a list) */ 859 optimize_kprobe(p); /* Try to optimize (add kprobe to a list) */
683} 860}
684 861
685static void __kprobes __disarm_kprobe(struct kprobe *p) 862/* Remove the breakpoint of a probe. Must be called with text_mutex locked */
863static void __kprobes __disarm_kprobe(struct kprobe *p, bool reopt)
686{ 864{
687 struct kprobe *old_p; 865 struct kprobe *_p;
688 866
689 unoptimize_kprobe(p); /* Try to unoptimize */ 867 unoptimize_kprobe(p, false); /* Try to unoptimize */
690 arch_disarm_kprobe(p);
691 868
692 /* If another kprobe was blocked, optimize it. */ 869 if (!kprobe_queued(p)) {
693 old_p = get_optimized_kprobe((unsigned long)p->addr); 870 arch_disarm_kprobe(p);
694 if (unlikely(old_p)) 871 /* If another kprobe was blocked, optimize it. */
695 optimize_kprobe(old_p); 872 _p = get_optimized_kprobe((unsigned long)p->addr);
873 if (unlikely(_p) && reopt)
874 optimize_kprobe(_p);
875 }
876 /* TODO: reoptimize others after unoptimized this probe */
696} 877}
697 878
698#else /* !CONFIG_OPTPROBES */ 879#else /* !CONFIG_OPTPROBES */
699 880
700#define optimize_kprobe(p) do {} while (0) 881#define optimize_kprobe(p) do {} while (0)
701#define unoptimize_kprobe(p) do {} while (0) 882#define unoptimize_kprobe(p, f) do {} while (0)
702#define kill_optimized_kprobe(p) do {} while (0) 883#define kill_optimized_kprobe(p) do {} while (0)
703#define prepare_optimized_kprobe(p) do {} while (0) 884#define prepare_optimized_kprobe(p) do {} while (0)
704#define try_to_optimize_kprobe(p) do {} while (0) 885#define try_to_optimize_kprobe(p) do {} while (0)
705#define __arm_kprobe(p) arch_arm_kprobe(p) 886#define __arm_kprobe(p) arch_arm_kprobe(p)
706#define __disarm_kprobe(p) arch_disarm_kprobe(p) 887#define __disarm_kprobe(p, o) arch_disarm_kprobe(p)
888#define kprobe_disarmed(p) kprobe_disabled(p)
889#define wait_for_kprobe_optimizer() do {} while (0)
890
891/* There should be no unused kprobes that can be reused without optimization */
892static void reuse_unused_kprobe(struct kprobe *ap)
893{
894 printk(KERN_ERR "Error: There should be no unused kprobe here.\n");
895 BUG_ON(kprobe_unused(ap));
896}
707 897
708static __kprobes void free_aggr_kprobe(struct kprobe *p) 898static __kprobes void free_aggr_kprobe(struct kprobe *p)
709{ 899{
900 arch_remove_kprobe(p);
710 kfree(p); 901 kfree(p);
711} 902}
712 903
@@ -732,11 +923,10 @@ static void __kprobes arm_kprobe(struct kprobe *kp)
732/* Disarm a kprobe with text_mutex */ 923/* Disarm a kprobe with text_mutex */
733static void __kprobes disarm_kprobe(struct kprobe *kp) 924static void __kprobes disarm_kprobe(struct kprobe *kp)
734{ 925{
735 get_online_cpus(); /* For avoiding text_mutex deadlock */ 926 /* Ditto */
736 mutex_lock(&text_mutex); 927 mutex_lock(&text_mutex);
737 __disarm_kprobe(kp); 928 __disarm_kprobe(kp, true);
738 mutex_unlock(&text_mutex); 929 mutex_unlock(&text_mutex);
739 put_online_cpus();
740} 930}
741 931
742/* 932/*
@@ -775,7 +965,7 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
775static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, 965static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
776 int trapnr) 966 int trapnr)
777{ 967{
778 struct kprobe *cur = __get_cpu_var(kprobe_instance); 968 struct kprobe *cur = __this_cpu_read(kprobe_instance);
779 969
780 /* 970 /*
781 * if we faulted "during" the execution of a user specified 971 * if we faulted "during" the execution of a user specified
@@ -790,7 +980,7 @@ static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
790 980
791static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs) 981static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
792{ 982{
793 struct kprobe *cur = __get_cpu_var(kprobe_instance); 983 struct kprobe *cur = __this_cpu_read(kprobe_instance);
794 int ret = 0; 984 int ret = 0;
795 985
796 if (cur && cur->break_handler) { 986 if (cur && cur->break_handler) {
@@ -942,7 +1132,7 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
942 BUG_ON(kprobe_gone(ap) || kprobe_gone(p)); 1132 BUG_ON(kprobe_gone(ap) || kprobe_gone(p));
943 1133
944 if (p->break_handler || p->post_handler) 1134 if (p->break_handler || p->post_handler)
945 unoptimize_kprobe(ap); /* Fall back to normal kprobe */ 1135 unoptimize_kprobe(ap, true); /* Fall back to normal kprobe */
946 1136
947 if (p->break_handler) { 1137 if (p->break_handler) {
948 if (ap->break_handler) 1138 if (ap->break_handler)
@@ -993,19 +1183,21 @@ static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
993 * This is the second or subsequent kprobe at the address - handle 1183 * This is the second or subsequent kprobe at the address - handle
994 * the intricacies 1184 * the intricacies
995 */ 1185 */
996static int __kprobes register_aggr_kprobe(struct kprobe *old_p, 1186static int __kprobes register_aggr_kprobe(struct kprobe *orig_p,
997 struct kprobe *p) 1187 struct kprobe *p)
998{ 1188{
999 int ret = 0; 1189 int ret = 0;
1000 struct kprobe *ap = old_p; 1190 struct kprobe *ap = orig_p;
1001 1191
1002 if (!kprobe_aggrprobe(old_p)) { 1192 if (!kprobe_aggrprobe(orig_p)) {
1003 /* If old_p is not an aggr_kprobe, create new aggr_kprobe. */ 1193 /* If orig_p is not an aggr_kprobe, create new aggr_kprobe. */
1004 ap = alloc_aggr_kprobe(old_p); 1194 ap = alloc_aggr_kprobe(orig_p);
1005 if (!ap) 1195 if (!ap)
1006 return -ENOMEM; 1196 return -ENOMEM;
1007 init_aggr_kprobe(ap, old_p); 1197 init_aggr_kprobe(ap, orig_p);
1008 } 1198 } else if (kprobe_unused(ap))
1199 /* This probe is going to die. Rescue it */
1200 reuse_unused_kprobe(ap);
1009 1201
1010 if (kprobe_gone(ap)) { 1202 if (kprobe_gone(ap)) {
1011 /* 1203 /*
@@ -1039,23 +1231,6 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
1039 return add_new_kprobe(ap, p); 1231 return add_new_kprobe(ap, p);
1040} 1232}
1041 1233
1042/* Try to disable aggr_kprobe, and return 1 if succeeded.*/
1043static int __kprobes try_to_disable_aggr_kprobe(struct kprobe *p)
1044{
1045 struct kprobe *kp;
1046
1047 list_for_each_entry_rcu(kp, &p->list, list) {
1048 if (!kprobe_disabled(kp))
1049 /*
1050 * There is an active probe on the list.
1051 * We can't disable aggr_kprobe.
1052 */
1053 return 0;
1054 }
1055 p->flags |= KPROBE_FLAG_DISABLED;
1056 return 1;
1057}
1058
1059static int __kprobes in_kprobes_functions(unsigned long addr) 1234static int __kprobes in_kprobes_functions(unsigned long addr)
1060{ 1235{
1061 struct kprobe_blackpoint *kb; 1236 struct kprobe_blackpoint *kb;
@@ -1098,34 +1273,33 @@ static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p)
1098/* Check passed kprobe is valid and return kprobe in kprobe_table. */ 1273/* Check passed kprobe is valid and return kprobe in kprobe_table. */
1099static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p) 1274static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p)
1100{ 1275{
1101 struct kprobe *old_p, *list_p; 1276 struct kprobe *ap, *list_p;
1102 1277
1103 old_p = get_kprobe(p->addr); 1278 ap = get_kprobe(p->addr);
1104 if (unlikely(!old_p)) 1279 if (unlikely(!ap))
1105 return NULL; 1280 return NULL;
1106 1281
1107 if (p != old_p) { 1282 if (p != ap) {
1108 list_for_each_entry_rcu(list_p, &old_p->list, list) 1283 list_for_each_entry_rcu(list_p, &ap->list, list)
1109 if (list_p == p) 1284 if (list_p == p)
1110 /* kprobe p is a valid probe */ 1285 /* kprobe p is a valid probe */
1111 goto valid; 1286 goto valid;
1112 return NULL; 1287 return NULL;
1113 } 1288 }
1114valid: 1289valid:
1115 return old_p; 1290 return ap;
1116} 1291}
1117 1292
1118/* Return error if the kprobe is being re-registered */ 1293/* Return error if the kprobe is being re-registered */
1119static inline int check_kprobe_rereg(struct kprobe *p) 1294static inline int check_kprobe_rereg(struct kprobe *p)
1120{ 1295{
1121 int ret = 0; 1296 int ret = 0;
1122 struct kprobe *old_p;
1123 1297
1124 mutex_lock(&kprobe_mutex); 1298 mutex_lock(&kprobe_mutex);
1125 old_p = __get_valid_kprobe(p); 1299 if (__get_valid_kprobe(p))
1126 if (old_p)
1127 ret = -EINVAL; 1300 ret = -EINVAL;
1128 mutex_unlock(&kprobe_mutex); 1301 mutex_unlock(&kprobe_mutex);
1302
1129 return ret; 1303 return ret;
1130} 1304}
1131 1305
@@ -1229,67 +1403,121 @@ fail_with_jump_label:
1229} 1403}
1230EXPORT_SYMBOL_GPL(register_kprobe); 1404EXPORT_SYMBOL_GPL(register_kprobe);
1231 1405
1406/* Check if all probes on the aggrprobe are disabled */
1407static int __kprobes aggr_kprobe_disabled(struct kprobe *ap)
1408{
1409 struct kprobe *kp;
1410
1411 list_for_each_entry_rcu(kp, &ap->list, list)
1412 if (!kprobe_disabled(kp))
1413 /*
1414 * There is an active probe on the list.
1415 * We can't disable this ap.
1416 */
1417 return 0;
1418
1419 return 1;
1420}
1421
1422/* Disable one kprobe: Make sure called under kprobe_mutex is locked */
1423static struct kprobe *__kprobes __disable_kprobe(struct kprobe *p)
1424{
1425 struct kprobe *orig_p;
1426
1427 /* Get an original kprobe for return */
1428 orig_p = __get_valid_kprobe(p);
1429 if (unlikely(orig_p == NULL))
1430 return NULL;
1431
1432 if (!kprobe_disabled(p)) {
1433 /* Disable probe if it is a child probe */
1434 if (p != orig_p)
1435 p->flags |= KPROBE_FLAG_DISABLED;
1436
1437 /* Try to disarm and disable this/parent probe */
1438 if (p == orig_p || aggr_kprobe_disabled(orig_p)) {
1439 disarm_kprobe(orig_p);
1440 orig_p->flags |= KPROBE_FLAG_DISABLED;
1441 }
1442 }
1443
1444 return orig_p;
1445}
1446
1232/* 1447/*
1233 * Unregister a kprobe without a scheduler synchronization. 1448 * Unregister a kprobe without a scheduler synchronization.
1234 */ 1449 */
1235static int __kprobes __unregister_kprobe_top(struct kprobe *p) 1450static int __kprobes __unregister_kprobe_top(struct kprobe *p)
1236{ 1451{
1237 struct kprobe *old_p, *list_p; 1452 struct kprobe *ap, *list_p;
1238 1453
1239 old_p = __get_valid_kprobe(p); 1454 /* Disable kprobe. This will disarm it if needed. */
1240 if (old_p == NULL) 1455 ap = __disable_kprobe(p);
1456 if (ap == NULL)
1241 return -EINVAL; 1457 return -EINVAL;
1242 1458
1243 if (old_p == p || 1459 if (ap == p)
1244 (kprobe_aggrprobe(old_p) &&
1245 list_is_singular(&old_p->list))) {
1246 /* 1460 /*
1247 * Only probe on the hash list. Disarm only if kprobes are 1461 * This probe is an independent(and non-optimized) kprobe
1248 * enabled and not gone - otherwise, the breakpoint would 1462 * (not an aggrprobe). Remove from the hash list.
1249 * already have been removed. We save on flushing icache.
1250 */ 1463 */
1251 if (!kprobes_all_disarmed && !kprobe_disabled(old_p)) 1464 goto disarmed;
1252 disarm_kprobe(old_p); 1465
1253 hlist_del_rcu(&old_p->hlist); 1466 /* Following process expects this probe is an aggrprobe */
1254 } else { 1467 WARN_ON(!kprobe_aggrprobe(ap));
1468
1469 if (list_is_singular(&ap->list) && kprobe_disarmed(ap))
1470 /*
 1471 * !disarmed could happen if the probe is under delayed
1472 * unoptimizing.
1473 */
1474 goto disarmed;
1475 else {
1476 /* If disabling probe has special handlers, update aggrprobe */
1255 if (p->break_handler && !kprobe_gone(p)) 1477 if (p->break_handler && !kprobe_gone(p))
1256 old_p->break_handler = NULL; 1478 ap->break_handler = NULL;
1257 if (p->post_handler && !kprobe_gone(p)) { 1479 if (p->post_handler && !kprobe_gone(p)) {
1258 list_for_each_entry_rcu(list_p, &old_p->list, list) { 1480 list_for_each_entry_rcu(list_p, &ap->list, list) {
1259 if ((list_p != p) && (list_p->post_handler)) 1481 if ((list_p != p) && (list_p->post_handler))
1260 goto noclean; 1482 goto noclean;
1261 } 1483 }
1262 old_p->post_handler = NULL; 1484 ap->post_handler = NULL;
1263 } 1485 }
1264noclean: 1486noclean:
1487 /*
1488 * Remove from the aggrprobe: this path will do nothing in
1489 * __unregister_kprobe_bottom().
1490 */
1265 list_del_rcu(&p->list); 1491 list_del_rcu(&p->list);
1266 if (!kprobe_disabled(old_p)) { 1492 if (!kprobe_disabled(ap) && !kprobes_all_disarmed)
1267 try_to_disable_aggr_kprobe(old_p); 1493 /*
1268 if (!kprobes_all_disarmed) { 1494 * Try to optimize this probe again, because post
1269 if (kprobe_disabled(old_p)) 1495 * handler may have been changed.
1270 disarm_kprobe(old_p); 1496 */
1271 else 1497 optimize_kprobe(ap);
1272 /* Try to optimize this probe again */
1273 optimize_kprobe(old_p);
1274 }
1275 }
1276 } 1498 }
1277 return 0; 1499 return 0;
1500
1501disarmed:
1502 BUG_ON(!kprobe_disarmed(ap));
1503 hlist_del_rcu(&ap->hlist);
1504 return 0;
1278} 1505}
1279 1506
1280static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) 1507static void __kprobes __unregister_kprobe_bottom(struct kprobe *p)
1281{ 1508{
1282 struct kprobe *old_p; 1509 struct kprobe *ap;
1283 1510
1284 if (list_empty(&p->list)) 1511 if (list_empty(&p->list))
1512 /* This is an independent kprobe */
1285 arch_remove_kprobe(p); 1513 arch_remove_kprobe(p);
1286 else if (list_is_singular(&p->list)) { 1514 else if (list_is_singular(&p->list)) {
1287 /* "p" is the last child of an aggr_kprobe */ 1515 /* This is the last child of an aggrprobe */
1288 old_p = list_entry(p->list.next, struct kprobe, list); 1516 ap = list_entry(p->list.next, struct kprobe, list);
1289 list_del(&p->list); 1517 list_del(&p->list);
1290 arch_remove_kprobe(old_p); 1518 free_aggr_kprobe(ap);
1291 free_aggr_kprobe(old_p);
1292 } 1519 }
1520 /* Otherwise, do nothing. */
1293} 1521}
1294 1522
1295int __kprobes register_kprobes(struct kprobe **kps, int num) 1523int __kprobes register_kprobes(struct kprobe **kps, int num)
@@ -1607,29 +1835,13 @@ static void __kprobes kill_kprobe(struct kprobe *p)
1607int __kprobes disable_kprobe(struct kprobe *kp) 1835int __kprobes disable_kprobe(struct kprobe *kp)
1608{ 1836{
1609 int ret = 0; 1837 int ret = 0;
1610 struct kprobe *p;
1611 1838
1612 mutex_lock(&kprobe_mutex); 1839 mutex_lock(&kprobe_mutex);
1613 1840
1614 /* Check whether specified probe is valid. */ 1841 /* Disable this kprobe */
1615 p = __get_valid_kprobe(kp); 1842 if (__disable_kprobe(kp) == NULL)
1616 if (unlikely(p == NULL)) {
1617 ret = -EINVAL; 1843 ret = -EINVAL;
1618 goto out;
1619 }
1620 1844
1621 /* If the probe is already disabled (or gone), just return */
1622 if (kprobe_disabled(kp))
1623 goto out;
1624
1625 kp->flags |= KPROBE_FLAG_DISABLED;
1626 if (p != kp)
1627 /* When kp != p, p is always enabled. */
1628 try_to_disable_aggr_kprobe(p);
1629
1630 if (!kprobes_all_disarmed && kprobe_disabled(p))
1631 disarm_kprobe(p);
1632out:
1633 mutex_unlock(&kprobe_mutex); 1845 mutex_unlock(&kprobe_mutex);
1634 return ret; 1846 return ret;
1635} 1847}
@@ -1927,36 +2139,27 @@ static void __kprobes disarm_all_kprobes(void)
1927 mutex_lock(&kprobe_mutex); 2139 mutex_lock(&kprobe_mutex);
1928 2140
1929 /* If kprobes are already disarmed, just return */ 2141 /* If kprobes are already disarmed, just return */
1930 if (kprobes_all_disarmed) 2142 if (kprobes_all_disarmed) {
1931 goto already_disabled; 2143 mutex_unlock(&kprobe_mutex);
2144 return;
2145 }
1932 2146
1933 kprobes_all_disarmed = true; 2147 kprobes_all_disarmed = true;
1934 printk(KERN_INFO "Kprobes globally disabled\n"); 2148 printk(KERN_INFO "Kprobes globally disabled\n");
1935 2149
1936 /*
1937 * Here we call get_online_cpus() for avoiding text_mutex deadlock,
1938 * because disarming may also unoptimize kprobes.
1939 */
1940 get_online_cpus();
1941 mutex_lock(&text_mutex); 2150 mutex_lock(&text_mutex);
1942 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 2151 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
1943 head = &kprobe_table[i]; 2152 head = &kprobe_table[i];
1944 hlist_for_each_entry_rcu(p, node, head, hlist) { 2153 hlist_for_each_entry_rcu(p, node, head, hlist) {
1945 if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) 2154 if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p))
1946 __disarm_kprobe(p); 2155 __disarm_kprobe(p, false);
1947 } 2156 }
1948 } 2157 }
1949
1950 mutex_unlock(&text_mutex); 2158 mutex_unlock(&text_mutex);
1951 put_online_cpus();
1952 mutex_unlock(&kprobe_mutex); 2159 mutex_unlock(&kprobe_mutex);
1953 /* Allow all currently running kprobes to complete */
1954 synchronize_sched();
1955 return;
1956 2160
1957already_disabled: 2161 /* Wait for disarming all kprobes by optimizer */
1958 mutex_unlock(&kprobe_mutex); 2162 wait_for_kprobe_optimizer();
1959 return;
1960} 2163}
1961 2164
1962/* 2165/*
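The reworked kprobe_optimizer() above runs in fixed steps: drain unoptimizing_list, wait one quiescence period, drain optimizing_list, free reclaimed probes, then re-kick itself if more work was queued meanwhile. The sequential sketch below models only that control flow in userspace, with integers standing in for probes, arrays for the two lists, and a flag for the delayed work; the freeing step and all locking are omitted, and every name is illustrative.

    #include <stdio.h>

    #define MAXQ 16

    /* Toy stand-ins for optimizing_list / unoptimizing_list. */
    static int opt_q[MAXQ], unopt_q[MAXQ];
    static int n_opt, n_unopt, work_pending;

    static void kick_optimizer(void)
    {
            work_pending = 1;          /* schedule_delayed_work() analogue */
    }

    static void queue_optimize(int p)   { opt_q[n_opt++] = p;     kick_optimizer(); }
    static void queue_unoptimize(int p) { unopt_q[n_unopt++] = p; kick_optimizer(); }

    /* One worker pass: unoptimize first, quiesce, then optimize, then re-kick. */
    static void optimizer_pass(void)
    {
            work_pending = 0;

            for (int i = 0; i < n_unopt; i++)          /* step 1 */
                    printf("unoptimize probe %d\n", unopt_q[i]);
            n_unopt = 0;

            printf("-- quiescence period --\n");       /* step 2 */

            for (int i = 0; i < n_opt; i++)            /* step 3 */
                    printf("optimize probe %d\n", opt_q[i]);
            n_opt = 0;

            /* step 5: re-kick if new work arrived meanwhile; always false in
             * this sequential sketch, but the real worker races with callers */
            if (n_opt || n_unopt)
                    kick_optimizer();
    }

    int main(void)
    {
            queue_unoptimize(1);
            queue_optimize(2);
            queue_optimize(3);
            if (work_pending)
                    optimizer_pass();
            return 0;
    }

Batching the quiescence wait inside the worker is what lets unoptimize_all_kprobes() and disarm_all_kprobes() simply call wait_for_kprobe_optimizer() in the hunks above instead of issuing their own synchronize_sched() per pass.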
diff --git a/kernel/kthread.c b/kernel/kthread.c
index ca61bbdd44b2..c55afba990a3 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -148,7 +148,7 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
148 wait_for_completion(&create.done); 148 wait_for_completion(&create.done);
149 149
150 if (!IS_ERR(create.result)) { 150 if (!IS_ERR(create.result)) {
151 struct sched_param param = { .sched_priority = 0 }; 151 static const struct sched_param param = { .sched_priority = 0 };
152 va_list args; 152 va_list args;
153 153
154 va_start(args, namefmt); 154 va_start(args, namefmt);
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 17110a4a4fc2..ee74b35e528d 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -241,24 +241,19 @@ static int lstats_show(struct seq_file *m, void *v)
241 seq_puts(m, "Latency Top version : v0.1\n"); 241 seq_puts(m, "Latency Top version : v0.1\n");
242 242
243 for (i = 0; i < MAXLR; i++) { 243 for (i = 0; i < MAXLR; i++) {
244 if (latency_record[i].backtrace[0]) { 244 struct latency_record *lr = &latency_record[i];
245
246 if (lr->backtrace[0]) {
245 int q; 247 int q;
246 seq_printf(m, "%i %lu %lu ", 248 seq_printf(m, "%i %lu %lu",
247 latency_record[i].count, 249 lr->count, lr->time, lr->max);
248 latency_record[i].time,
249 latency_record[i].max);
250 for (q = 0; q < LT_BACKTRACEDEPTH; q++) { 250 for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
251 char sym[KSYM_SYMBOL_LEN]; 251 unsigned long bt = lr->backtrace[q];
252 char *c; 252 if (!bt)
253 if (!latency_record[i].backtrace[q])
254 break; 253 break;
255 if (latency_record[i].backtrace[q] == ULONG_MAX) 254 if (bt == ULONG_MAX)
256 break; 255 break;
257 sprint_symbol(sym, latency_record[i].backtrace[q]); 256 seq_printf(m, " %ps", (void *)bt);
258 c = strchr(sym, '+');
259 if (c)
260 *c = 0;
261 seq_printf(m, "%s ", sym);
262 } 257 }
263 seq_printf(m, "\n"); 258 seq_printf(m, "\n");
264 } 259 }
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 59b76c8ce9d7..1969d2fc4b36 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -494,7 +494,6 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
494 namelen += 2; 494 namelen += 2;
495 495
496 for (i = 0; i < LOCKSTAT_POINTS; i++) { 496 for (i = 0; i < LOCKSTAT_POINTS; i++) {
497 char sym[KSYM_SYMBOL_LEN];
498 char ip[32]; 497 char ip[32];
499 498
500 if (class->contention_point[i] == 0) 499 if (class->contention_point[i] == 0)
@@ -503,15 +502,13 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
503 if (!i) 502 if (!i)
504 seq_line(m, '-', 40-namelen, namelen); 503 seq_line(m, '-', 40-namelen, namelen);
505 504
506 sprint_symbol(sym, class->contention_point[i]);
507 snprintf(ip, sizeof(ip), "[<%p>]", 505 snprintf(ip, sizeof(ip), "[<%p>]",
508 (void *)class->contention_point[i]); 506 (void *)class->contention_point[i]);
509 seq_printf(m, "%40s %14lu %29s %s\n", name, 507 seq_printf(m, "%40s %14lu %29s %pS\n",
510 stats->contention_point[i], 508 name, stats->contention_point[i],
511 ip, sym); 509 ip, (void *)class->contention_point[i]);
512 } 510 }
513 for (i = 0; i < LOCKSTAT_POINTS; i++) { 511 for (i = 0; i < LOCKSTAT_POINTS; i++) {
514 char sym[KSYM_SYMBOL_LEN];
515 char ip[32]; 512 char ip[32];
516 513
517 if (class->contending_point[i] == 0) 514 if (class->contending_point[i] == 0)
@@ -520,12 +517,11 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
520 if (!i) 517 if (!i)
521 seq_line(m, '-', 40-namelen, namelen); 518 seq_line(m, '-', 40-namelen, namelen);
522 519
523 sprint_symbol(sym, class->contending_point[i]);
524 snprintf(ip, sizeof(ip), "[<%p>]", 520 snprintf(ip, sizeof(ip), "[<%p>]",
525 (void *)class->contending_point[i]); 521 (void *)class->contending_point[i]);
526 seq_printf(m, "%40s %14lu %29s %s\n", name, 522 seq_printf(m, "%40s %14lu %29s %pS\n",
527 stats->contending_point[i], 523 name, stats->contending_point[i],
528 ip, sym); 524 ip, (void *)class->contending_point[i]);
529 } 525 }
530 if (i) { 526 if (i) {
531 seq_puts(m, "\n"); 527 seq_puts(m, "\n");
diff --git a/kernel/module.c b/kernel/module.c
index d190664f25ff..34e00b708fad 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -56,6 +56,7 @@
56#include <linux/percpu.h> 56#include <linux/percpu.h>
57#include <linux/kmemleak.h> 57#include <linux/kmemleak.h>
58#include <linux/jump_label.h> 58#include <linux/jump_label.h>
59#include <linux/pfn.h>
59 60
60#define CREATE_TRACE_POINTS 61#define CREATE_TRACE_POINTS
61#include <trace/events/module.h> 62#include <trace/events/module.h>
@@ -70,6 +71,26 @@
70#define ARCH_SHF_SMALL 0 71#define ARCH_SHF_SMALL 0
71#endif 72#endif
72 73
74/*
75 * Modules' sections will be aligned on page boundaries
76 * to ensure complete separation of code and data, but
77 * only when CONFIG_DEBUG_SET_MODULE_RONX=y
78 */
79#ifdef CONFIG_DEBUG_SET_MODULE_RONX
80# define debug_align(X) ALIGN(X, PAGE_SIZE)
81#else
82# define debug_align(X) (X)
83#endif
84
85/*
 86 * Given BASE and SIZE, this macro calculates the number of pages the
 87 * memory region occupies
88 */
89#define MOD_NUMBER_OF_PAGES(BASE, SIZE) (((SIZE) > 0) ? \
90 (PFN_DOWN((unsigned long)(BASE) + (SIZE) - 1) - \
91 PFN_DOWN((unsigned long)BASE) + 1) \
92 : (0UL))
93
73/* If this is set, the section belongs in the init part of the module */ 94/* If this is set, the section belongs in the init part of the module */
74#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) 95#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1))
75 96
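A worked sketch of the two helpers added above, assuming 4 KiB pages (PAGE_SHIFT == 12); the base address and size are made-up values, not taken from the patch:

	void *base = (void *)0xffffc0000100UL;	/* hypothetical module base, 0x100 into a page */
	unsigned long size = 0x2000;		/* 8 KiB of module memory */

	/* Spans the pages at offsets 0x0000, 0x1000 and 0x2000 -> 3 pages. */
	unsigned long npages = MOD_NUMBER_OF_PAGES(base, size);	/* == 3 */

	/* Page-align a section offset only when RO/NX protection is built in. */
	unsigned long off = debug_align(0x2100);	/* 0x3000 with RONX=y, 0x2100 otherwise */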
@@ -1542,6 +1563,115 @@ static int __unlink_module(void *_mod)
1542 return 0; 1563 return 0;
1543} 1564}
1544 1565
1566#ifdef CONFIG_DEBUG_SET_MODULE_RONX
1567/*
1568 * LKM RO/NX protection: protect module's text/ro-data
1569 * from modification and any data from execution.
1570 */
1571void set_page_attributes(void *start, void *end, int (*set)(unsigned long start, int num_pages))
1572{
1573 unsigned long begin_pfn = PFN_DOWN((unsigned long)start);
1574 unsigned long end_pfn = PFN_DOWN((unsigned long)end);
1575
1576 if (end_pfn > begin_pfn)
1577 set(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn);
1578}
1579
1580static void set_section_ro_nx(void *base,
1581 unsigned long text_size,
1582 unsigned long ro_size,
1583 unsigned long total_size)
1584{
1585 /* begin and end PFNs of the current subsection */
1586 unsigned long begin_pfn;
1587 unsigned long end_pfn;
1588
1589 /*
1590 * Set RO for module text and RO-data:
1591 * - Always protect first page.
1592 * - Do not protect last partial page.
1593 */
1594 if (ro_size > 0)
1595 set_page_attributes(base, base + ro_size, set_memory_ro);
1596
1597 /*
1598 * Set NX permissions for module data:
1599 * - Do not protect first partial page.
1600 * - Always protect last page.
1601 */
1602 if (total_size > text_size) {
1603 begin_pfn = PFN_UP((unsigned long)base + text_size);
1604 end_pfn = PFN_UP((unsigned long)base + total_size);
1605 if (end_pfn > begin_pfn)
1606 set_memory_nx(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn);
1607 }
1608}
1609
1610/* Setting memory back to RW+NX before releasing it */
1611void unset_section_ro_nx(struct module *mod, void *module_region)
1612{
1613 unsigned long total_pages;
1614
1615 if (mod->module_core == module_region) {
1616 /* Set core as NX+RW */
1617 total_pages = MOD_NUMBER_OF_PAGES(mod->module_core, mod->core_size);
1618 set_memory_nx((unsigned long)mod->module_core, total_pages);
1619 set_memory_rw((unsigned long)mod->module_core, total_pages);
1620
1621 } else if (mod->module_init == module_region) {
1622 /* Set init as NX+RW */
1623 total_pages = MOD_NUMBER_OF_PAGES(mod->module_init, mod->init_size);
1624 set_memory_nx((unsigned long)mod->module_init, total_pages);
1625 set_memory_rw((unsigned long)mod->module_init, total_pages);
1626 }
1627}
1628
1629/* Iterate through all modules and set each module's text as RW */
 1630void set_all_modules_text_rw(void)
1631{
1632 struct module *mod;
1633
1634 mutex_lock(&module_mutex);
1635 list_for_each_entry_rcu(mod, &modules, list) {
1636 if ((mod->module_core) && (mod->core_text_size)) {
1637 set_page_attributes(mod->module_core,
1638 mod->module_core + mod->core_text_size,
1639 set_memory_rw);
1640 }
1641 if ((mod->module_init) && (mod->init_text_size)) {
1642 set_page_attributes(mod->module_init,
1643 mod->module_init + mod->init_text_size,
1644 set_memory_rw);
1645 }
1646 }
1647 mutex_unlock(&module_mutex);
1648}
1649
1650/* Iterate through all modules and set each module's text as RO */
 1651void set_all_modules_text_ro(void)
1652{
1653 struct module *mod;
1654
1655 mutex_lock(&module_mutex);
1656 list_for_each_entry_rcu(mod, &modules, list) {
1657 if ((mod->module_core) && (mod->core_text_size)) {
1658 set_page_attributes(mod->module_core,
1659 mod->module_core + mod->core_text_size,
1660 set_memory_ro);
1661 }
1662 if ((mod->module_init) && (mod->init_text_size)) {
1663 set_page_attributes(mod->module_init,
1664 mod->module_init + mod->init_text_size,
1665 set_memory_ro);
1666 }
1667 }
1668 mutex_unlock(&module_mutex);
1669}
1670#else
1671static inline void set_section_ro_nx(void *base, unsigned long text_size, unsigned long ro_size, unsigned long total_size) { }
1672static inline void unset_section_ro_nx(struct module *mod, void *module_region) { }
1673#endif
1674
1545/* Free a module, remove from lists, etc. */ 1675/* Free a module, remove from lists, etc. */
1546static void free_module(struct module *mod) 1676static void free_module(struct module *mod)
1547{ 1677{
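The asymmetric rounding in set_section_ro_nx() is what implements the "do not protect the partial page" comments: RO rounds both ends down, NX rounds its start up. A small sketch with assumed 4 KiB pages and a page-aligned base (addresses are illustrative only):

	unsigned long base = 0xffffffffa0000000UL;	/* assumed page-aligned module_core */
	unsigned long text_end = base + 0x1800;		/* text/ro-data ends mid-page       */

	/* RO: PFN_DOWN on both ends, so only the first full page becomes RO;
	 * the trailing partial page (which may also hold data) stays RW. */
	unsigned long ro_pages = PFN_DOWN(text_end) - PFN_DOWN(base);	/* == 1 */

	/* NX: PFN_UP on the start, so the partial page still holding text
	 * stays executable; NX begins on the next page boundary. */
	unsigned long nx_start = PFN_UP(text_end) << PAGE_SHIFT;	/* base + 0x2000 */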
@@ -1566,6 +1696,7 @@ static void free_module(struct module *mod)
1566 destroy_params(mod->kp, mod->num_kp); 1696 destroy_params(mod->kp, mod->num_kp);
1567 1697
1568 /* This may be NULL, but that's OK */ 1698 /* This may be NULL, but that's OK */
1699 unset_section_ro_nx(mod, mod->module_init);
1569 module_free(mod, mod->module_init); 1700 module_free(mod, mod->module_init);
1570 kfree(mod->args); 1701 kfree(mod->args);
1571 percpu_modfree(mod); 1702 percpu_modfree(mod);
@@ -1574,6 +1705,7 @@ static void free_module(struct module *mod)
1574 lockdep_free_key_range(mod->module_core, mod->core_size); 1705 lockdep_free_key_range(mod->module_core, mod->core_size);
1575 1706
1576 /* Finally, free the core (containing the module structure) */ 1707 /* Finally, free the core (containing the module structure) */
1708 unset_section_ro_nx(mod, mod->module_core);
1577 module_free(mod, mod->module_core); 1709 module_free(mod, mod->module_core);
1578 1710
1579#ifdef CONFIG_MPU 1711#ifdef CONFIG_MPU
@@ -1777,8 +1909,19 @@ static void layout_sections(struct module *mod, struct load_info *info)
1777 s->sh_entsize = get_offset(mod, &mod->core_size, s, i); 1909 s->sh_entsize = get_offset(mod, &mod->core_size, s, i);
1778 DEBUGP("\t%s\n", name); 1910 DEBUGP("\t%s\n", name);
1779 } 1911 }
1780 if (m == 0) 1912 switch (m) {
1913 case 0: /* executable */
1914 mod->core_size = debug_align(mod->core_size);
1781 mod->core_text_size = mod->core_size; 1915 mod->core_text_size = mod->core_size;
1916 break;
1917 case 1: /* RO: text and ro-data */
1918 mod->core_size = debug_align(mod->core_size);
1919 mod->core_ro_size = mod->core_size;
1920 break;
1921 case 3: /* whole core */
1922 mod->core_size = debug_align(mod->core_size);
1923 break;
1924 }
1782 } 1925 }
1783 1926
1784 DEBUGP("Init section allocation order:\n"); 1927 DEBUGP("Init section allocation order:\n");
@@ -1796,8 +1939,19 @@ static void layout_sections(struct module *mod, struct load_info *info)
1796 | INIT_OFFSET_MASK); 1939 | INIT_OFFSET_MASK);
1797 DEBUGP("\t%s\n", sname); 1940 DEBUGP("\t%s\n", sname);
1798 } 1941 }
1799 if (m == 0) 1942 switch (m) {
1943 case 0: /* executable */
1944 mod->init_size = debug_align(mod->init_size);
1800 mod->init_text_size = mod->init_size; 1945 mod->init_text_size = mod->init_size;
1946 break;
1947 case 1: /* RO: text and ro-data */
1948 mod->init_size = debug_align(mod->init_size);
1949 mod->init_ro_size = mod->init_size;
1950 break;
1951 case 3: /* whole init */
1952 mod->init_size = debug_align(mod->init_size);
1953 break;
1954 }
1801 } 1955 }
1802} 1956}
1803 1957
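The switch (m) cases above follow the order of the masks[] table that drives layout_sections(); for reference, the expected ordering is roughly the following (reproduced from memory, so treat the exact flag combinations as an assumption):

	static const unsigned long masks[][2] = {
		{ SHF_EXECINSTR | SHF_ALLOC, ARCH_SHF_SMALL },	/* m == 0: executable text   */
		{ SHF_ALLOC, SHF_WRITE | ARCH_SHF_SMALL },	/* m == 1: read-only data    */
		{ SHF_WRITE | SHF_ALLOC, ARCH_SHF_SMALL },	/* m == 2: writable data     */
		{ ARCH_SHF_SMALL | SHF_ALLOC, 0 }		/* m == 3: arch-small (last) */
	};

Case m == 2 is deliberately absent from the switch: no protection boundary falls between writable data and what follows it, so only the text end, the RO end and the overall section size need page alignment.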
@@ -2722,6 +2876,18 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2722 blocking_notifier_call_chain(&module_notify_list, 2876 blocking_notifier_call_chain(&module_notify_list,
2723 MODULE_STATE_COMING, mod); 2877 MODULE_STATE_COMING, mod);
2724 2878
2879 /* Set RO and NX regions for core */
2880 set_section_ro_nx(mod->module_core,
2881 mod->core_text_size,
2882 mod->core_ro_size,
2883 mod->core_size);
2884
2885 /* Set RO and NX regions for init */
2886 set_section_ro_nx(mod->module_init,
2887 mod->init_text_size,
2888 mod->init_ro_size,
2889 mod->init_size);
2890
2725 do_mod_ctors(mod); 2891 do_mod_ctors(mod);
2726 /* Start the module */ 2892 /* Start the module */
2727 if (mod->init != NULL) 2893 if (mod->init != NULL)
@@ -2765,6 +2931,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2765 mod->symtab = mod->core_symtab; 2931 mod->symtab = mod->core_symtab;
2766 mod->strtab = mod->core_strtab; 2932 mod->strtab = mod->core_strtab;
2767#endif 2933#endif
2934 unset_section_ro_nx(mod, mod->module_init);
2768 module_free(mod, mod->module_init); 2935 module_free(mod, mod->module_init);
2769 mod->module_init = NULL; 2936 mod->module_init = NULL;
2770 mod->init_size = 0; 2937 mod->init_size = 0;
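set_all_modules_text_rw()/set_all_modules_text_ro() exist for runtime code patchers that must temporarily write into module text; dynamic ftrace on x86 is the expected consumer (an assumption here, not stated in the hunk). A hedged sketch of the intended call pattern:

	set_all_modules_text_rw();
	/* ... rewrite instructions inside module .text ... */
	set_all_modules_text_ro();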
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 200407c1502f..a5889fb28ecf 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -199,7 +199,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
199 * memory barriers as we'll eventually observe the right 199 * memory barriers as we'll eventually observe the right
200 * values at the cost of a few extra spins. 200 * values at the cost of a few extra spins.
201 */ 201 */
202 cpu_relax(); 202 arch_mutex_cpu_relax();
203 } 203 }
204#endif 204#endif
205 spin_lock_mutex(&lock->wait_lock, flags); 205 spin_lock_mutex(&lock->wait_lock, flags);
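arch_mutex_cpu_relax() lets an architecture plug in a cheaper spin hint for the adaptive-spin loop (s390 being the motivating case); everywhere else it is expected to fall back to plain cpu_relax(). A sketch of the assumed fallback definition in <linux/mutex.h>:

#ifndef CONFIG_HAVE_ARCH_MUTEX_CPU_RELAX
#define arch_mutex_cpu_relax()	cpu_relax()
#endif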
diff --git a/kernel/panic.c b/kernel/panic.c
index 4c13b1a88ebb..991bb87a1704 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -34,6 +34,7 @@ static int pause_on_oops_flag;
34static DEFINE_SPINLOCK(pause_on_oops_lock); 34static DEFINE_SPINLOCK(pause_on_oops_lock);
35 35
36int panic_timeout; 36int panic_timeout;
37EXPORT_SYMBOL_GPL(panic_timeout);
37 38
38ATOMIC_NOTIFIER_HEAD(panic_notifier_list); 39ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
39 40
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 2870feee81dd..05ebe841270b 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -13,6 +13,7 @@
13#include <linux/mm.h> 13#include <linux/mm.h>
14#include <linux/cpu.h> 14#include <linux/cpu.h>
15#include <linux/smp.h> 15#include <linux/smp.h>
16#include <linux/idr.h>
16#include <linux/file.h> 17#include <linux/file.h>
17#include <linux/poll.h> 18#include <linux/poll.h>
18#include <linux/slab.h> 19#include <linux/slab.h>
@@ -21,7 +22,9 @@
21#include <linux/dcache.h> 22#include <linux/dcache.h>
22#include <linux/percpu.h> 23#include <linux/percpu.h>
23#include <linux/ptrace.h> 24#include <linux/ptrace.h>
25#include <linux/reboot.h>
24#include <linux/vmstat.h> 26#include <linux/vmstat.h>
27#include <linux/device.h>
25#include <linux/vmalloc.h> 28#include <linux/vmalloc.h>
26#include <linux/hardirq.h> 29#include <linux/hardirq.h>
27#include <linux/rculist.h> 30#include <linux/rculist.h>
@@ -35,6 +38,12 @@
35 38
36#include <asm/irq_regs.h> 39#include <asm/irq_regs.h>
37 40
41enum event_type_t {
42 EVENT_FLEXIBLE = 0x1,
43 EVENT_PINNED = 0x2,
44 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
45};
46
38atomic_t perf_task_events __read_mostly; 47atomic_t perf_task_events __read_mostly;
39static atomic_t nr_mmap_events __read_mostly; 48static atomic_t nr_mmap_events __read_mostly;
40static atomic_t nr_comm_events __read_mostly; 49static atomic_t nr_comm_events __read_mostly;
@@ -62,6 +71,12 @@ int sysctl_perf_event_sample_rate __read_mostly = 100000;
62 71
63static atomic64_t perf_event_id; 72static atomic64_t perf_event_id;
64 73
74static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
75 enum event_type_t event_type);
76
77static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
78 enum event_type_t event_type);
79
65void __weak perf_event_print_debug(void) { } 80void __weak perf_event_print_debug(void) { }
66 81
67extern __weak const char *perf_pmu_name(void) 82extern __weak const char *perf_pmu_name(void)
@@ -69,6 +84,11 @@ extern __weak const char *perf_pmu_name(void)
69 return "pmu"; 84 return "pmu";
70} 85}
71 86
87static inline u64 perf_clock(void)
88{
89 return local_clock();
90}
91
72void perf_pmu_disable(struct pmu *pmu) 92void perf_pmu_disable(struct pmu *pmu)
73{ 93{
74 int *count = this_cpu_ptr(pmu->pmu_disable_count); 94 int *count = this_cpu_ptr(pmu->pmu_disable_count);
@@ -133,6 +153,28 @@ static void unclone_ctx(struct perf_event_context *ctx)
133 } 153 }
134} 154}
135 155
156static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
157{
158 /*
159 * only top level events have the pid namespace they were created in
160 */
161 if (event->parent)
162 event = event->parent;
163
164 return task_tgid_nr_ns(p, event->ns);
165}
166
167static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
168{
169 /*
170 * only top level events have the pid namespace they were created in
171 */
172 if (event->parent)
173 event = event->parent;
174
175 return task_pid_nr_ns(p, event->ns);
176}
177
136/* 178/*
137 * If we inherit events we want to return the parent event id 179 * If we inherit events we want to return the parent event id
138 * to userspace. 180 * to userspace.
@@ -215,11 +257,6 @@ static void perf_unpin_context(struct perf_event_context *ctx)
215 put_ctx(ctx); 257 put_ctx(ctx);
216} 258}
217 259
218static inline u64 perf_clock(void)
219{
220 return local_clock();
221}
222
223/* 260/*
224 * Update the record of the current time in a context. 261 * Update the record of the current time in a context.
225 */ 262 */
@@ -231,6 +268,12 @@ static void update_context_time(struct perf_event_context *ctx)
231 ctx->timestamp = now; 268 ctx->timestamp = now;
232} 269}
233 270
271static u64 perf_event_time(struct perf_event *event)
272{
273 struct perf_event_context *ctx = event->ctx;
274 return ctx ? ctx->time : 0;
275}
276
234/* 277/*
235 * Update the total_time_enabled and total_time_running fields for a event. 278 * Update the total_time_enabled and total_time_running fields for a event.
236 */ 279 */
@@ -244,7 +287,7 @@ static void update_event_times(struct perf_event *event)
244 return; 287 return;
245 288
246 if (ctx->is_active) 289 if (ctx->is_active)
247 run_end = ctx->time; 290 run_end = perf_event_time(event);
248 else 291 else
249 run_end = event->tstamp_stopped; 292 run_end = event->tstamp_stopped;
250 293
@@ -253,7 +296,7 @@ static void update_event_times(struct perf_event *event)
253 if (event->state == PERF_EVENT_STATE_INACTIVE) 296 if (event->state == PERF_EVENT_STATE_INACTIVE)
254 run_end = event->tstamp_stopped; 297 run_end = event->tstamp_stopped;
255 else 298 else
256 run_end = ctx->time; 299 run_end = perf_event_time(event);
257 300
258 event->total_time_running = run_end - event->tstamp_running; 301 event->total_time_running = run_end - event->tstamp_running;
259} 302}
@@ -312,9 +355,84 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
312 ctx->nr_stat++; 355 ctx->nr_stat++;
313} 356}
314 357
358/*
359 * Called at perf_event creation and when events are attached/detached from a
360 * group.
361 */
362static void perf_event__read_size(struct perf_event *event)
363{
364 int entry = sizeof(u64); /* value */
365 int size = 0;
366 int nr = 1;
367
368 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
369 size += sizeof(u64);
370
371 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
372 size += sizeof(u64);
373
374 if (event->attr.read_format & PERF_FORMAT_ID)
375 entry += sizeof(u64);
376
377 if (event->attr.read_format & PERF_FORMAT_GROUP) {
378 nr += event->group_leader->nr_siblings;
379 size += sizeof(u64);
380 }
381
382 size += entry * nr;
383 event->read_size = size;
384}
385
386static void perf_event__header_size(struct perf_event *event)
387{
388 struct perf_sample_data *data;
389 u64 sample_type = event->attr.sample_type;
390 u16 size = 0;
391
392 perf_event__read_size(event);
393
394 if (sample_type & PERF_SAMPLE_IP)
395 size += sizeof(data->ip);
396
397 if (sample_type & PERF_SAMPLE_ADDR)
398 size += sizeof(data->addr);
399
400 if (sample_type & PERF_SAMPLE_PERIOD)
401 size += sizeof(data->period);
402
403 if (sample_type & PERF_SAMPLE_READ)
404 size += event->read_size;
405
406 event->header_size = size;
407}
408
409static void perf_event__id_header_size(struct perf_event *event)
410{
411 struct perf_sample_data *data;
412 u64 sample_type = event->attr.sample_type;
413 u16 size = 0;
414
415 if (sample_type & PERF_SAMPLE_TID)
416 size += sizeof(data->tid_entry);
417
418 if (sample_type & PERF_SAMPLE_TIME)
419 size += sizeof(data->time);
420
421 if (sample_type & PERF_SAMPLE_ID)
422 size += sizeof(data->id);
423
424 if (sample_type & PERF_SAMPLE_STREAM_ID)
425 size += sizeof(data->stream_id);
426
427 if (sample_type & PERF_SAMPLE_CPU)
428 size += sizeof(data->cpu_entry);
429
430 event->id_header_size = size;
431}
432
315static void perf_group_attach(struct perf_event *event) 433static void perf_group_attach(struct perf_event *event)
316{ 434{
317 struct perf_event *group_leader = event->group_leader; 435 struct perf_event *group_leader = event->group_leader, *pos;
318 436
319 /* 437 /*
320 * We can have double attach due to group movement in perf_event_open. 438 * We can have double attach due to group movement in perf_event_open.
@@ -333,6 +451,11 @@ static void perf_group_attach(struct perf_event *event)
333 451
334 list_add_tail(&event->group_entry, &group_leader->sibling_list); 452 list_add_tail(&event->group_entry, &group_leader->sibling_list);
335 group_leader->nr_siblings++; 453 group_leader->nr_siblings++;
454
455 perf_event__header_size(group_leader);
456
457 list_for_each_entry(pos, &group_leader->sibling_list, group_entry)
458 perf_event__header_size(pos);
336} 459}
337 460
338/* 461/*
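A worked example of perf_event__read_size() above, for a hypothetical attr with read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_ID and no PERF_FORMAT_GROUP (so nr stays 1):

	/*
	 *   entry = 8 (value) + 8 (id)    = 16
	 *   size  = 8 (time_enabled)      =  8
	 *   nr    = 1
	 *   event->read_size = 8 + 16 * 1 = 24 bytes
	 */
	struct read_example {		/* what a 24-byte read(2) returns */
		__u64 value;
		__u64 time_enabled;	/* PERF_FORMAT_TOTAL_TIME_ENABLED */
		__u64 id;		/* PERF_FORMAT_ID */
	};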
@@ -391,7 +514,7 @@ static void perf_group_detach(struct perf_event *event)
391 if (event->group_leader != event) { 514 if (event->group_leader != event) {
392 list_del_init(&event->group_entry); 515 list_del_init(&event->group_entry);
393 event->group_leader->nr_siblings--; 516 event->group_leader->nr_siblings--;
394 return; 517 goto out;
395 } 518 }
396 519
397 if (!list_empty(&event->group_entry)) 520 if (!list_empty(&event->group_entry))
@@ -410,6 +533,12 @@ static void perf_group_detach(struct perf_event *event)
410 /* Inherit group flags from the previous leader */ 533 /* Inherit group flags from the previous leader */
411 sibling->group_flags = event->group_flags; 534 sibling->group_flags = event->group_flags;
412 } 535 }
536
537out:
538 perf_event__header_size(event->group_leader);
539
540 list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry)
541 perf_event__header_size(tmp);
413} 542}
414 543
415static inline int 544static inline int
@@ -423,6 +552,7 @@ event_sched_out(struct perf_event *event,
423 struct perf_cpu_context *cpuctx, 552 struct perf_cpu_context *cpuctx,
424 struct perf_event_context *ctx) 553 struct perf_event_context *ctx)
425{ 554{
555 u64 tstamp = perf_event_time(event);
426 u64 delta; 556 u64 delta;
427 /* 557 /*
428 * An event which could not be activated because of 558 * An event which could not be activated because of
@@ -434,7 +564,7 @@ event_sched_out(struct perf_event *event,
434 && !event_filter_match(event)) { 564 && !event_filter_match(event)) {
435 delta = ctx->time - event->tstamp_stopped; 565 delta = ctx->time - event->tstamp_stopped;
436 event->tstamp_running += delta; 566 event->tstamp_running += delta;
437 event->tstamp_stopped = ctx->time; 567 event->tstamp_stopped = tstamp;
438 } 568 }
439 569
440 if (event->state != PERF_EVENT_STATE_ACTIVE) 570 if (event->state != PERF_EVENT_STATE_ACTIVE)
@@ -445,7 +575,7 @@ event_sched_out(struct perf_event *event,
445 event->pending_disable = 0; 575 event->pending_disable = 0;
446 event->state = PERF_EVENT_STATE_OFF; 576 event->state = PERF_EVENT_STATE_OFF;
447 } 577 }
448 event->tstamp_stopped = ctx->time; 578 event->tstamp_stopped = tstamp;
449 event->pmu->del(event, 0); 579 event->pmu->del(event, 0);
450 event->oncpu = -1; 580 event->oncpu = -1;
451 581
@@ -657,6 +787,8 @@ event_sched_in(struct perf_event *event,
657 struct perf_cpu_context *cpuctx, 787 struct perf_cpu_context *cpuctx,
658 struct perf_event_context *ctx) 788 struct perf_event_context *ctx)
659{ 789{
790 u64 tstamp = perf_event_time(event);
791
660 if (event->state <= PERF_EVENT_STATE_OFF) 792 if (event->state <= PERF_EVENT_STATE_OFF)
661 return 0; 793 return 0;
662 794
@@ -673,9 +805,9 @@ event_sched_in(struct perf_event *event,
673 return -EAGAIN; 805 return -EAGAIN;
674 } 806 }
675 807
676 event->tstamp_running += ctx->time - event->tstamp_stopped; 808 event->tstamp_running += tstamp - event->tstamp_stopped;
677 809
678 event->shadow_ctx_time = ctx->time - ctx->timestamp; 810 event->shadow_ctx_time = tstamp - ctx->timestamp;
679 811
680 if (!is_software_event(event)) 812 if (!is_software_event(event))
681 cpuctx->active_oncpu++; 813 cpuctx->active_oncpu++;
@@ -787,11 +919,13 @@ static int group_can_go_on(struct perf_event *event,
787static void add_event_to_ctx(struct perf_event *event, 919static void add_event_to_ctx(struct perf_event *event,
788 struct perf_event_context *ctx) 920 struct perf_event_context *ctx)
789{ 921{
922 u64 tstamp = perf_event_time(event);
923
790 list_add_event(event, ctx); 924 list_add_event(event, ctx);
791 perf_group_attach(event); 925 perf_group_attach(event);
792 event->tstamp_enabled = ctx->time; 926 event->tstamp_enabled = tstamp;
793 event->tstamp_running = ctx->time; 927 event->tstamp_running = tstamp;
794 event->tstamp_stopped = ctx->time; 928 event->tstamp_stopped = tstamp;
795} 929}
796 930
797/* 931/*
@@ -826,7 +960,7 @@ static void __perf_install_in_context(void *info)
826 960
827 add_event_to_ctx(event, ctx); 961 add_event_to_ctx(event, ctx);
828 962
829 if (event->cpu != -1 && event->cpu != smp_processor_id()) 963 if (!event_filter_match(event))
830 goto unlock; 964 goto unlock;
831 965
832 /* 966 /*
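The event_filter_match() helper that replaces the open-coded cpu test here and in later hunks is, at this point in the series, expected to be nothing more than the same check inverted (sketch, exact body assumed):

	static inline int event_filter_match(struct perf_event *event)
	{
		return event->cpu == -1 || event->cpu == smp_processor_id();
	}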
@@ -931,14 +1065,13 @@ static void __perf_event_mark_enabled(struct perf_event *event,
931 struct perf_event_context *ctx) 1065 struct perf_event_context *ctx)
932{ 1066{
933 struct perf_event *sub; 1067 struct perf_event *sub;
1068 u64 tstamp = perf_event_time(event);
934 1069
935 event->state = PERF_EVENT_STATE_INACTIVE; 1070 event->state = PERF_EVENT_STATE_INACTIVE;
936 event->tstamp_enabled = ctx->time - event->total_time_enabled; 1071 event->tstamp_enabled = tstamp - event->total_time_enabled;
937 list_for_each_entry(sub, &event->sibling_list, group_entry) { 1072 list_for_each_entry(sub, &event->sibling_list, group_entry) {
938 if (sub->state >= PERF_EVENT_STATE_INACTIVE) { 1073 if (sub->state >= PERF_EVENT_STATE_INACTIVE)
939 sub->tstamp_enabled = 1074 sub->tstamp_enabled = tstamp - sub->total_time_enabled;
940 ctx->time - sub->total_time_enabled;
941 }
942 } 1075 }
943} 1076}
944 1077
@@ -971,7 +1104,7 @@ static void __perf_event_enable(void *info)
971 goto unlock; 1104 goto unlock;
972 __perf_event_mark_enabled(event, ctx); 1105 __perf_event_mark_enabled(event, ctx);
973 1106
974 if (event->cpu != -1 && event->cpu != smp_processor_id()) 1107 if (!event_filter_match(event))
975 goto unlock; 1108 goto unlock;
976 1109
977 /* 1110 /*
@@ -1073,7 +1206,7 @@ static int perf_event_refresh(struct perf_event *event, int refresh)
1073 /* 1206 /*
1074 * not supported on inherited events 1207 * not supported on inherited events
1075 */ 1208 */
1076 if (event->attr.inherit) 1209 if (event->attr.inherit || !is_sampling_event(event))
1077 return -EINVAL; 1210 return -EINVAL;
1078 1211
1079 atomic_add(refresh, &event->event_limit); 1212 atomic_add(refresh, &event->event_limit);
@@ -1082,12 +1215,6 @@ static int perf_event_refresh(struct perf_event *event, int refresh)
1082 return 0; 1215 return 0;
1083} 1216}
1084 1217
1085enum event_type_t {
1086 EVENT_FLEXIBLE = 0x1,
1087 EVENT_PINNED = 0x2,
1088 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
1089};
1090
1091static void ctx_sched_out(struct perf_event_context *ctx, 1218static void ctx_sched_out(struct perf_event_context *ctx,
1092 struct perf_cpu_context *cpuctx, 1219 struct perf_cpu_context *cpuctx,
1093 enum event_type_t event_type) 1220 enum event_type_t event_type)
@@ -1324,7 +1451,7 @@ ctx_pinned_sched_in(struct perf_event_context *ctx,
1324 list_for_each_entry(event, &ctx->pinned_groups, group_entry) { 1451 list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
1325 if (event->state <= PERF_EVENT_STATE_OFF) 1452 if (event->state <= PERF_EVENT_STATE_OFF)
1326 continue; 1453 continue;
1327 if (event->cpu != -1 && event->cpu != smp_processor_id()) 1454 if (!event_filter_match(event))
1328 continue; 1455 continue;
1329 1456
1330 if (group_can_go_on(event, cpuctx, 1)) 1457 if (group_can_go_on(event, cpuctx, 1))
@@ -1356,7 +1483,7 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
1356 * Listen to the 'cpu' scheduling filter constraint 1483 * Listen to the 'cpu' scheduling filter constraint
1357 * of events: 1484 * of events:
1358 */ 1485 */
1359 if (event->cpu != -1 && event->cpu != smp_processor_id()) 1486 if (!event_filter_match(event))
1360 continue; 1487 continue;
1361 1488
1362 if (group_can_go_on(event, cpuctx, can_add_hw)) { 1489 if (group_can_go_on(event, cpuctx, can_add_hw)) {
@@ -1583,7 +1710,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
1583 if (event->state != PERF_EVENT_STATE_ACTIVE) 1710 if (event->state != PERF_EVENT_STATE_ACTIVE)
1584 continue; 1711 continue;
1585 1712
1586 if (event->cpu != -1 && event->cpu != smp_processor_id()) 1713 if (!event_filter_match(event))
1587 continue; 1714 continue;
1588 1715
1589 hwc = &event->hw; 1716 hwc = &event->hw;
@@ -2289,31 +2416,6 @@ static int perf_release(struct inode *inode, struct file *file)
2289 return perf_event_release_kernel(event); 2416 return perf_event_release_kernel(event);
2290} 2417}
2291 2418
2292static int perf_event_read_size(struct perf_event *event)
2293{
2294 int entry = sizeof(u64); /* value */
2295 int size = 0;
2296 int nr = 1;
2297
2298 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
2299 size += sizeof(u64);
2300
2301 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
2302 size += sizeof(u64);
2303
2304 if (event->attr.read_format & PERF_FORMAT_ID)
2305 entry += sizeof(u64);
2306
2307 if (event->attr.read_format & PERF_FORMAT_GROUP) {
2308 nr += event->group_leader->nr_siblings;
2309 size += sizeof(u64);
2310 }
2311
2312 size += entry * nr;
2313
2314 return size;
2315}
2316
2317u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) 2419u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
2318{ 2420{
2319 struct perf_event *child; 2421 struct perf_event *child;
@@ -2428,7 +2530,7 @@ perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
2428 if (event->state == PERF_EVENT_STATE_ERROR) 2530 if (event->state == PERF_EVENT_STATE_ERROR)
2429 return 0; 2531 return 0;
2430 2532
2431 if (count < perf_event_read_size(event)) 2533 if (count < event->read_size)
2432 return -ENOSPC; 2534 return -ENOSPC;
2433 2535
2434 WARN_ON_ONCE(event->ctx->parent_ctx); 2536 WARN_ON_ONCE(event->ctx->parent_ctx);
@@ -2514,7 +2616,7 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
2514 int ret = 0; 2616 int ret = 0;
2515 u64 value; 2617 u64 value;
2516 2618
2517 if (!event->attr.sample_period) 2619 if (!is_sampling_event(event))
2518 return -EINVAL; 2620 return -EINVAL;
2519 2621
2520 if (copy_from_user(&value, arg, sizeof(value))) 2622 if (copy_from_user(&value, arg, sizeof(value)))
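is_sampling_event() now guards the period ioctl above as well as several paths below; it is assumed to be the obvious predicate on the requested sample period (sketch):

	static inline int is_sampling_event(struct perf_event *event)
	{
		return event->attr.sample_period != 0;
	}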
@@ -3305,6 +3407,73 @@ __always_inline void perf_output_copy(struct perf_output_handle *handle,
3305 } while (len); 3407 } while (len);
3306} 3408}
3307 3409
3410static void __perf_event_header__init_id(struct perf_event_header *header,
3411 struct perf_sample_data *data,
3412 struct perf_event *event)
3413{
3414 u64 sample_type = event->attr.sample_type;
3415
3416 data->type = sample_type;
3417 header->size += event->id_header_size;
3418
3419 if (sample_type & PERF_SAMPLE_TID) {
3420 /* namespace issues */
3421 data->tid_entry.pid = perf_event_pid(event, current);
3422 data->tid_entry.tid = perf_event_tid(event, current);
3423 }
3424
3425 if (sample_type & PERF_SAMPLE_TIME)
3426 data->time = perf_clock();
3427
3428 if (sample_type & PERF_SAMPLE_ID)
3429 data->id = primary_event_id(event);
3430
3431 if (sample_type & PERF_SAMPLE_STREAM_ID)
3432 data->stream_id = event->id;
3433
3434 if (sample_type & PERF_SAMPLE_CPU) {
3435 data->cpu_entry.cpu = raw_smp_processor_id();
3436 data->cpu_entry.reserved = 0;
3437 }
3438}
3439
3440static void perf_event_header__init_id(struct perf_event_header *header,
3441 struct perf_sample_data *data,
3442 struct perf_event *event)
3443{
3444 if (event->attr.sample_id_all)
3445 __perf_event_header__init_id(header, data, event);
3446}
3447
3448static void __perf_event__output_id_sample(struct perf_output_handle *handle,
3449 struct perf_sample_data *data)
3450{
3451 u64 sample_type = data->type;
3452
3453 if (sample_type & PERF_SAMPLE_TID)
3454 perf_output_put(handle, data->tid_entry);
3455
3456 if (sample_type & PERF_SAMPLE_TIME)
3457 perf_output_put(handle, data->time);
3458
3459 if (sample_type & PERF_SAMPLE_ID)
3460 perf_output_put(handle, data->id);
3461
3462 if (sample_type & PERF_SAMPLE_STREAM_ID)
3463 perf_output_put(handle, data->stream_id);
3464
3465 if (sample_type & PERF_SAMPLE_CPU)
3466 perf_output_put(handle, data->cpu_entry);
3467}
3468
3469static void perf_event__output_id_sample(struct perf_event *event,
3470 struct perf_output_handle *handle,
3471 struct perf_sample_data *sample)
3472{
3473 if (event->attr.sample_id_all)
3474 __perf_event__output_id_sample(handle, sample);
3475}
3476
3308int perf_output_begin(struct perf_output_handle *handle, 3477int perf_output_begin(struct perf_output_handle *handle,
3309 struct perf_event *event, unsigned int size, 3478 struct perf_event *event, unsigned int size,
3310 int nmi, int sample) 3479 int nmi, int sample)
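With attr.sample_id_all set, every non-SAMPLE record is followed by an identification trailer built from the checks above. For a hypothetical event with sample_type = PERF_SAMPLE_TID | PERF_SAMPLE_TIME | PERF_SAMPLE_CPU, the trailer laid down by __perf_event__output_id_sample() would look like this (24 bytes, matching id_header_size):

	struct sample_id_example {
		__u32 pid, tid;		/* PERF_SAMPLE_TID  */
		__u64 time;		/* PERF_SAMPLE_TIME */
		__u32 cpu, res;		/* PERF_SAMPLE_CPU  */
	};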
@@ -3312,6 +3481,7 @@ int perf_output_begin(struct perf_output_handle *handle,
3312 struct perf_buffer *buffer; 3481 struct perf_buffer *buffer;
3313 unsigned long tail, offset, head; 3482 unsigned long tail, offset, head;
3314 int have_lost; 3483 int have_lost;
3484 struct perf_sample_data sample_data;
3315 struct { 3485 struct {
3316 struct perf_event_header header; 3486 struct perf_event_header header;
3317 u64 id; 3487 u64 id;
@@ -3338,8 +3508,12 @@ int perf_output_begin(struct perf_output_handle *handle,
3338 goto out; 3508 goto out;
3339 3509
3340 have_lost = local_read(&buffer->lost); 3510 have_lost = local_read(&buffer->lost);
3341 if (have_lost) 3511 if (have_lost) {
3342 size += sizeof(lost_event); 3512 lost_event.header.size = sizeof(lost_event);
3513 perf_event_header__init_id(&lost_event.header, &sample_data,
3514 event);
3515 size += lost_event.header.size;
3516 }
3343 3517
3344 perf_output_get_handle(handle); 3518 perf_output_get_handle(handle);
3345 3519
@@ -3370,11 +3544,11 @@ int perf_output_begin(struct perf_output_handle *handle,
3370 if (have_lost) { 3544 if (have_lost) {
3371 lost_event.header.type = PERF_RECORD_LOST; 3545 lost_event.header.type = PERF_RECORD_LOST;
3372 lost_event.header.misc = 0; 3546 lost_event.header.misc = 0;
3373 lost_event.header.size = sizeof(lost_event);
3374 lost_event.id = event->id; 3547 lost_event.id = event->id;
3375 lost_event.lost = local_xchg(&buffer->lost, 0); 3548 lost_event.lost = local_xchg(&buffer->lost, 0);
3376 3549
3377 perf_output_put(handle, lost_event); 3550 perf_output_put(handle, lost_event);
3551 perf_event__output_id_sample(event, handle, &sample_data);
3378 } 3552 }
3379 3553
3380 return 0; 3554 return 0;
@@ -3407,28 +3581,6 @@ void perf_output_end(struct perf_output_handle *handle)
3407 rcu_read_unlock(); 3581 rcu_read_unlock();
3408} 3582}
3409 3583
3410static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
3411{
3412 /*
3413 * only top level events have the pid namespace they were created in
3414 */
3415 if (event->parent)
3416 event = event->parent;
3417
3418 return task_tgid_nr_ns(p, event->ns);
3419}
3420
3421static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
3422{
3423 /*
3424 * only top level events have the pid namespace they were created in
3425 */
3426 if (event->parent)
3427 event = event->parent;
3428
3429 return task_pid_nr_ns(p, event->ns);
3430}
3431
3432static void perf_output_read_one(struct perf_output_handle *handle, 3584static void perf_output_read_one(struct perf_output_handle *handle,
3433 struct perf_event *event, 3585 struct perf_event *event,
3434 u64 enabled, u64 running) 3586 u64 enabled, u64 running)
@@ -3603,61 +3755,16 @@ void perf_prepare_sample(struct perf_event_header *header,
3603{ 3755{
3604 u64 sample_type = event->attr.sample_type; 3756 u64 sample_type = event->attr.sample_type;
3605 3757
3606 data->type = sample_type;
3607
3608 header->type = PERF_RECORD_SAMPLE; 3758 header->type = PERF_RECORD_SAMPLE;
3609 header->size = sizeof(*header); 3759 header->size = sizeof(*header) + event->header_size;
3610 3760
3611 header->misc = 0; 3761 header->misc = 0;
3612 header->misc |= perf_misc_flags(regs); 3762 header->misc |= perf_misc_flags(regs);
3613 3763
3614 if (sample_type & PERF_SAMPLE_IP) { 3764 __perf_event_header__init_id(header, data, event);
3615 data->ip = perf_instruction_pointer(regs);
3616
3617 header->size += sizeof(data->ip);
3618 }
3619
3620 if (sample_type & PERF_SAMPLE_TID) {
3621 /* namespace issues */
3622 data->tid_entry.pid = perf_event_pid(event, current);
3623 data->tid_entry.tid = perf_event_tid(event, current);
3624
3625 header->size += sizeof(data->tid_entry);
3626 }
3627
3628 if (sample_type & PERF_SAMPLE_TIME) {
3629 data->time = perf_clock();
3630
3631 header->size += sizeof(data->time);
3632 }
3633
3634 if (sample_type & PERF_SAMPLE_ADDR)
3635 header->size += sizeof(data->addr);
3636
3637 if (sample_type & PERF_SAMPLE_ID) {
3638 data->id = primary_event_id(event);
3639
3640 header->size += sizeof(data->id);
3641 }
3642
3643 if (sample_type & PERF_SAMPLE_STREAM_ID) {
3644 data->stream_id = event->id;
3645
3646 header->size += sizeof(data->stream_id);
3647 }
3648
3649 if (sample_type & PERF_SAMPLE_CPU) {
3650 data->cpu_entry.cpu = raw_smp_processor_id();
3651 data->cpu_entry.reserved = 0;
3652
3653 header->size += sizeof(data->cpu_entry);
3654 }
3655 3765
3656 if (sample_type & PERF_SAMPLE_PERIOD) 3766 if (sample_type & PERF_SAMPLE_IP)
3657 header->size += sizeof(data->period); 3767 data->ip = perf_instruction_pointer(regs);
3658
3659 if (sample_type & PERF_SAMPLE_READ)
3660 header->size += perf_event_read_size(event);
3661 3768
3662 if (sample_type & PERF_SAMPLE_CALLCHAIN) { 3769 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
3663 int size = 1; 3770 int size = 1;
@@ -3722,23 +3829,26 @@ perf_event_read_event(struct perf_event *event,
3722 struct task_struct *task) 3829 struct task_struct *task)
3723{ 3830{
3724 struct perf_output_handle handle; 3831 struct perf_output_handle handle;
3832 struct perf_sample_data sample;
3725 struct perf_read_event read_event = { 3833 struct perf_read_event read_event = {
3726 .header = { 3834 .header = {
3727 .type = PERF_RECORD_READ, 3835 .type = PERF_RECORD_READ,
3728 .misc = 0, 3836 .misc = 0,
3729 .size = sizeof(read_event) + perf_event_read_size(event), 3837 .size = sizeof(read_event) + event->read_size,
3730 }, 3838 },
3731 .pid = perf_event_pid(event, task), 3839 .pid = perf_event_pid(event, task),
3732 .tid = perf_event_tid(event, task), 3840 .tid = perf_event_tid(event, task),
3733 }; 3841 };
3734 int ret; 3842 int ret;
3735 3843
3844 perf_event_header__init_id(&read_event.header, &sample, event);
3736 ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0); 3845 ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0);
3737 if (ret) 3846 if (ret)
3738 return; 3847 return;
3739 3848
3740 perf_output_put(&handle, read_event); 3849 perf_output_put(&handle, read_event);
3741 perf_output_read(&handle, event); 3850 perf_output_read(&handle, event);
3851 perf_event__output_id_sample(event, &handle, &sample);
3742 3852
3743 perf_output_end(&handle); 3853 perf_output_end(&handle);
3744} 3854}
@@ -3768,14 +3878,16 @@ static void perf_event_task_output(struct perf_event *event,
3768 struct perf_task_event *task_event) 3878 struct perf_task_event *task_event)
3769{ 3879{
3770 struct perf_output_handle handle; 3880 struct perf_output_handle handle;
3881 struct perf_sample_data sample;
3771 struct task_struct *task = task_event->task; 3882 struct task_struct *task = task_event->task;
3772 int size, ret; 3883 int ret, size = task_event->event_id.header.size;
3773 3884
3774 size = task_event->event_id.header.size; 3885 perf_event_header__init_id(&task_event->event_id.header, &sample, event);
3775 ret = perf_output_begin(&handle, event, size, 0, 0);
3776 3886
3887 ret = perf_output_begin(&handle, event,
3888 task_event->event_id.header.size, 0, 0);
3777 if (ret) 3889 if (ret)
3778 return; 3890 goto out;
3779 3891
3780 task_event->event_id.pid = perf_event_pid(event, task); 3892 task_event->event_id.pid = perf_event_pid(event, task);
3781 task_event->event_id.ppid = perf_event_pid(event, current); 3893 task_event->event_id.ppid = perf_event_pid(event, current);
@@ -3785,7 +3897,11 @@ static void perf_event_task_output(struct perf_event *event,
3785 3897
3786 perf_output_put(&handle, task_event->event_id); 3898 perf_output_put(&handle, task_event->event_id);
3787 3899
3900 perf_event__output_id_sample(event, &handle, &sample);
3901
3788 perf_output_end(&handle); 3902 perf_output_end(&handle);
3903out:
3904 task_event->event_id.header.size = size;
3789} 3905}
3790 3906
3791static int perf_event_task_match(struct perf_event *event) 3907static int perf_event_task_match(struct perf_event *event)
@@ -3793,7 +3909,7 @@ static int perf_event_task_match(struct perf_event *event)
3793 if (event->state < PERF_EVENT_STATE_INACTIVE) 3909 if (event->state < PERF_EVENT_STATE_INACTIVE)
3794 return 0; 3910 return 0;
3795 3911
3796 if (event->cpu != -1 && event->cpu != smp_processor_id()) 3912 if (!event_filter_match(event))
3797 return 0; 3913 return 0;
3798 3914
3799 if (event->attr.comm || event->attr.mmap || 3915 if (event->attr.comm || event->attr.mmap ||
@@ -3900,11 +4016,16 @@ static void perf_event_comm_output(struct perf_event *event,
3900 struct perf_comm_event *comm_event) 4016 struct perf_comm_event *comm_event)
3901{ 4017{
3902 struct perf_output_handle handle; 4018 struct perf_output_handle handle;
4019 struct perf_sample_data sample;
3903 int size = comm_event->event_id.header.size; 4020 int size = comm_event->event_id.header.size;
3904 int ret = perf_output_begin(&handle, event, size, 0, 0); 4021 int ret;
4022
4023 perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
4024 ret = perf_output_begin(&handle, event,
4025 comm_event->event_id.header.size, 0, 0);
3905 4026
3906 if (ret) 4027 if (ret)
3907 return; 4028 goto out;
3908 4029
3909 comm_event->event_id.pid = perf_event_pid(event, comm_event->task); 4030 comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
3910 comm_event->event_id.tid = perf_event_tid(event, comm_event->task); 4031 comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
@@ -3912,7 +4033,12 @@ static void perf_event_comm_output(struct perf_event *event,
3912 perf_output_put(&handle, comm_event->event_id); 4033 perf_output_put(&handle, comm_event->event_id);
3913 perf_output_copy(&handle, comm_event->comm, 4034 perf_output_copy(&handle, comm_event->comm,
3914 comm_event->comm_size); 4035 comm_event->comm_size);
4036
4037 perf_event__output_id_sample(event, &handle, &sample);
4038
3915 perf_output_end(&handle); 4039 perf_output_end(&handle);
4040out:
4041 comm_event->event_id.header.size = size;
3916} 4042}
3917 4043
3918static int perf_event_comm_match(struct perf_event *event) 4044static int perf_event_comm_match(struct perf_event *event)
@@ -3920,7 +4046,7 @@ static int perf_event_comm_match(struct perf_event *event)
3920 if (event->state < PERF_EVENT_STATE_INACTIVE) 4046 if (event->state < PERF_EVENT_STATE_INACTIVE)
3921 return 0; 4047 return 0;
3922 4048
3923 if (event->cpu != -1 && event->cpu != smp_processor_id()) 4049 if (!event_filter_match(event))
3924 return 0; 4050 return 0;
3925 4051
3926 if (event->attr.comm) 4052 if (event->attr.comm)
@@ -3957,7 +4083,6 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
3957 comm_event->comm_size = size; 4083 comm_event->comm_size = size;
3958 4084
3959 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; 4085 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
3960
3961 rcu_read_lock(); 4086 rcu_read_lock();
3962 list_for_each_entry_rcu(pmu, &pmus, entry) { 4087 list_for_each_entry_rcu(pmu, &pmus, entry) {
3963 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); 4088 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
@@ -4038,11 +4163,15 @@ static void perf_event_mmap_output(struct perf_event *event,
4038 struct perf_mmap_event *mmap_event) 4163 struct perf_mmap_event *mmap_event)
4039{ 4164{
4040 struct perf_output_handle handle; 4165 struct perf_output_handle handle;
4166 struct perf_sample_data sample;
4041 int size = mmap_event->event_id.header.size; 4167 int size = mmap_event->event_id.header.size;
4042 int ret = perf_output_begin(&handle, event, size, 0, 0); 4168 int ret;
4043 4169
4170 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
4171 ret = perf_output_begin(&handle, event,
4172 mmap_event->event_id.header.size, 0, 0);
4044 if (ret) 4173 if (ret)
4045 return; 4174 goto out;
4046 4175
4047 mmap_event->event_id.pid = perf_event_pid(event, current); 4176 mmap_event->event_id.pid = perf_event_pid(event, current);
4048 mmap_event->event_id.tid = perf_event_tid(event, current); 4177 mmap_event->event_id.tid = perf_event_tid(event, current);
@@ -4050,7 +4179,12 @@ static void perf_event_mmap_output(struct perf_event *event,
4050 perf_output_put(&handle, mmap_event->event_id); 4179 perf_output_put(&handle, mmap_event->event_id);
4051 perf_output_copy(&handle, mmap_event->file_name, 4180 perf_output_copy(&handle, mmap_event->file_name,
4052 mmap_event->file_size); 4181 mmap_event->file_size);
4182
4183 perf_event__output_id_sample(event, &handle, &sample);
4184
4053 perf_output_end(&handle); 4185 perf_output_end(&handle);
4186out:
4187 mmap_event->event_id.header.size = size;
4054} 4188}
4055 4189
4056static int perf_event_mmap_match(struct perf_event *event, 4190static int perf_event_mmap_match(struct perf_event *event,
@@ -4060,7 +4194,7 @@ static int perf_event_mmap_match(struct perf_event *event,
4060 if (event->state < PERF_EVENT_STATE_INACTIVE) 4194 if (event->state < PERF_EVENT_STATE_INACTIVE)
4061 return 0; 4195 return 0;
4062 4196
4063 if (event->cpu != -1 && event->cpu != smp_processor_id()) 4197 if (!event_filter_match(event))
4064 return 0; 4198 return 0;
4065 4199
4066 if ((!executable && event->attr.mmap_data) || 4200 if ((!executable && event->attr.mmap_data) ||
@@ -4205,6 +4339,7 @@ void perf_event_mmap(struct vm_area_struct *vma)
4205static void perf_log_throttle(struct perf_event *event, int enable) 4339static void perf_log_throttle(struct perf_event *event, int enable)
4206{ 4340{
4207 struct perf_output_handle handle; 4341 struct perf_output_handle handle;
4342 struct perf_sample_data sample;
4208 int ret; 4343 int ret;
4209 4344
4210 struct { 4345 struct {
@@ -4226,11 +4361,15 @@ static void perf_log_throttle(struct perf_event *event, int enable)
4226 if (enable) 4361 if (enable)
4227 throttle_event.header.type = PERF_RECORD_UNTHROTTLE; 4362 throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
4228 4363
4229 ret = perf_output_begin(&handle, event, sizeof(throttle_event), 1, 0); 4364 perf_event_header__init_id(&throttle_event.header, &sample, event);
4365
4366 ret = perf_output_begin(&handle, event,
4367 throttle_event.header.size, 1, 0);
4230 if (ret) 4368 if (ret)
4231 return; 4369 return;
4232 4370
4233 perf_output_put(&handle, throttle_event); 4371 perf_output_put(&handle, throttle_event);
4372 perf_event__output_id_sample(event, &handle, &sample);
4234 perf_output_end(&handle); 4373 perf_output_end(&handle);
4235} 4374}
4236 4375
@@ -4246,6 +4385,13 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
4246 struct hw_perf_event *hwc = &event->hw; 4385 struct hw_perf_event *hwc = &event->hw;
4247 int ret = 0; 4386 int ret = 0;
4248 4387
4388 /*
4389 * Non-sampling counters might still use the PMI to fold short
4390 * hardware counters, ignore those.
4391 */
4392 if (unlikely(!is_sampling_event(event)))
4393 return 0;
4394
4249 if (!throttle) { 4395 if (!throttle) {
4250 hwc->interrupts++; 4396 hwc->interrupts++;
4251 } else { 4397 } else {
@@ -4391,7 +4537,7 @@ static void perf_swevent_event(struct perf_event *event, u64 nr,
4391 if (!regs) 4537 if (!regs)
4392 return; 4538 return;
4393 4539
4394 if (!hwc->sample_period) 4540 if (!is_sampling_event(event))
4395 return; 4541 return;
4396 4542
4397 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) 4543 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
@@ -4518,7 +4664,7 @@ int perf_swevent_get_recursion_context(void)
4518} 4664}
4519EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); 4665EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
4520 4666
4521void inline perf_swevent_put_recursion_context(int rctx) 4667inline void perf_swevent_put_recursion_context(int rctx)
4522{ 4668{
4523 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); 4669 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
4524 4670
@@ -4554,7 +4700,7 @@ static int perf_swevent_add(struct perf_event *event, int flags)
4554 struct hw_perf_event *hwc = &event->hw; 4700 struct hw_perf_event *hwc = &event->hw;
4555 struct hlist_head *head; 4701 struct hlist_head *head;
4556 4702
4557 if (hwc->sample_period) { 4703 if (is_sampling_event(event)) {
4558 hwc->last_period = hwc->sample_period; 4704 hwc->last_period = hwc->sample_period;
4559 perf_swevent_set_period(event); 4705 perf_swevent_set_period(event);
4560 } 4706 }
@@ -4811,15 +4957,6 @@ static int perf_tp_event_init(struct perf_event *event)
4811 if (event->attr.type != PERF_TYPE_TRACEPOINT) 4957 if (event->attr.type != PERF_TYPE_TRACEPOINT)
4812 return -ENOENT; 4958 return -ENOENT;
4813 4959
4814 /*
4815 * Raw tracepoint data is a severe data leak, only allow root to
4816 * have these.
4817 */
4818 if ((event->attr.sample_type & PERF_SAMPLE_RAW) &&
4819 perf_paranoid_tracepoint_raw() &&
4820 !capable(CAP_SYS_ADMIN))
4821 return -EPERM;
4822
4823 err = perf_trace_init(event); 4960 err = perf_trace_init(event);
4824 if (err) 4961 if (err)
4825 return err; 4962 return err;
@@ -4842,7 +4979,7 @@ static struct pmu perf_tracepoint = {
4842 4979
4843static inline void perf_tp_register(void) 4980static inline void perf_tp_register(void)
4844{ 4981{
4845 perf_pmu_register(&perf_tracepoint); 4982 perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
4846} 4983}
4847 4984
4848static int perf_event_set_filter(struct perf_event *event, void __user *arg) 4985static int perf_event_set_filter(struct perf_event *event, void __user *arg)
@@ -4932,31 +5069,33 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
4932static void perf_swevent_start_hrtimer(struct perf_event *event) 5069static void perf_swevent_start_hrtimer(struct perf_event *event)
4933{ 5070{
4934 struct hw_perf_event *hwc = &event->hw; 5071 struct hw_perf_event *hwc = &event->hw;
5072 s64 period;
5073
5074 if (!is_sampling_event(event))
5075 return;
4935 5076
4936 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 5077 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4937 hwc->hrtimer.function = perf_swevent_hrtimer; 5078 hwc->hrtimer.function = perf_swevent_hrtimer;
4938 if (hwc->sample_period) {
4939 s64 period = local64_read(&hwc->period_left);
4940 5079
4941 if (period) { 5080 period = local64_read(&hwc->period_left);
4942 if (period < 0) 5081 if (period) {
4943 period = 10000; 5082 if (period < 0)
5083 period = 10000;
4944 5084
4945 local64_set(&hwc->period_left, 0); 5085 local64_set(&hwc->period_left, 0);
4946 } else { 5086 } else {
4947 period = max_t(u64, 10000, hwc->sample_period); 5087 period = max_t(u64, 10000, hwc->sample_period);
4948 } 5088 }
4949 __hrtimer_start_range_ns(&hwc->hrtimer, 5089 __hrtimer_start_range_ns(&hwc->hrtimer,
4950 ns_to_ktime(period), 0, 5090 ns_to_ktime(period), 0,
4951 HRTIMER_MODE_REL_PINNED, 0); 5091 HRTIMER_MODE_REL_PINNED, 0);
4952 }
4953} 5092}
4954 5093
4955static void perf_swevent_cancel_hrtimer(struct perf_event *event) 5094static void perf_swevent_cancel_hrtimer(struct perf_event *event)
4956{ 5095{
4957 struct hw_perf_event *hwc = &event->hw; 5096 struct hw_perf_event *hwc = &event->hw;
4958 5097
4959 if (hwc->sample_period) { 5098 if (is_sampling_event(event)) {
4960 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); 5099 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
4961 local64_set(&hwc->period_left, ktime_to_ns(remaining)); 5100 local64_set(&hwc->period_left, ktime_to_ns(remaining));
4962 5101
@@ -5184,8 +5323,61 @@ static void free_pmu_context(struct pmu *pmu)
5184out: 5323out:
5185 mutex_unlock(&pmus_lock); 5324 mutex_unlock(&pmus_lock);
5186} 5325}
5326static struct idr pmu_idr;
5327
5328static ssize_t
5329type_show(struct device *dev, struct device_attribute *attr, char *page)
5330{
5331 struct pmu *pmu = dev_get_drvdata(dev);
5332
5333 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
5334}
5335
5336static struct device_attribute pmu_dev_attrs[] = {
5337 __ATTR_RO(type),
5338 __ATTR_NULL,
5339};
5340
5341static int pmu_bus_running;
5342static struct bus_type pmu_bus = {
5343 .name = "event_source",
5344 .dev_attrs = pmu_dev_attrs,
5345};
5346
5347static void pmu_dev_release(struct device *dev)
5348{
5349 kfree(dev);
5350}
5351
5352static int pmu_dev_alloc(struct pmu *pmu)
5353{
5354 int ret = -ENOMEM;
5355
5356 pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
5357 if (!pmu->dev)
5358 goto out;
5359
5360 device_initialize(pmu->dev);
5361 ret = dev_set_name(pmu->dev, "%s", pmu->name);
5362 if (ret)
5363 goto free_dev;
5364
5365 dev_set_drvdata(pmu->dev, pmu);
5366 pmu->dev->bus = &pmu_bus;
5367 pmu->dev->release = pmu_dev_release;
5368 ret = device_add(pmu->dev);
5369 if (ret)
5370 goto free_dev;
5371
5372out:
5373 return ret;
5374
5375free_dev:
5376 put_device(pmu->dev);
5377 goto out;
5378}
5187 5379
5188int perf_pmu_register(struct pmu *pmu) 5380int perf_pmu_register(struct pmu *pmu, char *name, int type)
5189{ 5381{
5190 int cpu, ret; 5382 int cpu, ret;
5191 5383
@@ -5195,13 +5387,38 @@ int perf_pmu_register(struct pmu *pmu)
5195 if (!pmu->pmu_disable_count) 5387 if (!pmu->pmu_disable_count)
5196 goto unlock; 5388 goto unlock;
5197 5389
5390 pmu->type = -1;
5391 if (!name)
5392 goto skip_type;
5393 pmu->name = name;
5394
5395 if (type < 0) {
5396 int err = idr_pre_get(&pmu_idr, GFP_KERNEL);
5397 if (!err)
5398 goto free_pdc;
5399
5400 err = idr_get_new_above(&pmu_idr, pmu, PERF_TYPE_MAX, &type);
5401 if (err) {
5402 ret = err;
5403 goto free_pdc;
5404 }
5405 }
5406 pmu->type = type;
5407
5408 if (pmu_bus_running) {
5409 ret = pmu_dev_alloc(pmu);
5410 if (ret)
5411 goto free_idr;
5412 }
5413
5414skip_type:
5198 pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr); 5415 pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
5199 if (pmu->pmu_cpu_context) 5416 if (pmu->pmu_cpu_context)
5200 goto got_cpu_context; 5417 goto got_cpu_context;
5201 5418
5202 pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context); 5419 pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
5203 if (!pmu->pmu_cpu_context) 5420 if (!pmu->pmu_cpu_context)
5204 goto free_pdc; 5421 goto free_dev;
5205 5422
5206 for_each_possible_cpu(cpu) { 5423 for_each_possible_cpu(cpu) {
5207 struct perf_cpu_context *cpuctx; 5424 struct perf_cpu_context *cpuctx;
@@ -5245,6 +5462,14 @@ unlock:
5245 5462
5246 return ret; 5463 return ret;
5247 5464
5465free_dev:
5466 device_del(pmu->dev);
5467 put_device(pmu->dev);
5468
5469free_idr:
5470 if (pmu->type >= PERF_TYPE_MAX)
5471 idr_remove(&pmu_idr, pmu->type);
5472
5248free_pdc: 5473free_pdc:
5249 free_percpu(pmu->pmu_disable_count); 5474 free_percpu(pmu->pmu_disable_count);
5250 goto unlock; 5475 goto unlock;
@@ -5264,6 +5489,10 @@ void perf_pmu_unregister(struct pmu *pmu)
5264 synchronize_rcu(); 5489 synchronize_rcu();
5265 5490
5266 free_percpu(pmu->pmu_disable_count); 5491 free_percpu(pmu->pmu_disable_count);
5492 if (pmu->type >= PERF_TYPE_MAX)
5493 idr_remove(&pmu_idr, pmu->type);
5494 device_del(pmu->dev);
5495 put_device(pmu->dev);
5267 free_pmu_context(pmu); 5496 free_pmu_context(pmu);
5268} 5497}
5269 5498
@@ -5273,6 +5502,13 @@ struct pmu *perf_init_event(struct perf_event *event)
5273 int idx; 5502 int idx;
5274 5503
5275 idx = srcu_read_lock(&pmus_srcu); 5504 idx = srcu_read_lock(&pmus_srcu);
5505
5506 rcu_read_lock();
5507 pmu = idr_find(&pmu_idr, event->attr.type);
5508 rcu_read_unlock();
5509 if (pmu)
5510 goto unlock;
5511
5276 list_for_each_entry_rcu(pmu, &pmus, entry) { 5512 list_for_each_entry_rcu(pmu, &pmus, entry) {
5277 int ret = pmu->event_init(event); 5513 int ret = pmu->event_init(event);
5278 if (!ret) 5514 if (!ret)
@@ -5738,6 +5974,12 @@ SYSCALL_DEFINE5(perf_event_open,
5738 mutex_unlock(&current->perf_event_mutex); 5974 mutex_unlock(&current->perf_event_mutex);
5739 5975
5740 /* 5976 /*
5977 * Precalculate sample_data sizes
5978 */
5979 perf_event__header_size(event);
5980 perf_event__id_header_size(event);
5981
5982 /*
5741 * Drop the reference on the group_event after placing the 5983 * Drop the reference on the group_event after placing the
5742 * new event on the sibling_list. This ensures destruction 5984 * new event on the sibling_list. This ensures destruction
5743 * of the group leader will find the pointer to itself in 5985 * of the group leader will find the pointer to itself in
@@ -6090,6 +6332,12 @@ inherit_event(struct perf_event *parent_event,
6090 child_event->overflow_handler = parent_event->overflow_handler; 6332 child_event->overflow_handler = parent_event->overflow_handler;
6091 6333
6092 /* 6334 /*
6335 * Precalculate sample_data sizes
6336 */
6337 perf_event__header_size(child_event);
6338 perf_event__id_header_size(child_event);
6339
6340 /*
6093 * Link it up in the child's context: 6341 * Link it up in the child's context:
6094 */ 6342 */
6095 raw_spin_lock_irqsave(&child_ctx->lock, flags); 6343 raw_spin_lock_irqsave(&child_ctx->lock, flags);
@@ -6320,7 +6568,7 @@ static void __cpuinit perf_event_init_cpu(int cpu)
6320 mutex_unlock(&swhash->hlist_mutex); 6568 mutex_unlock(&swhash->hlist_mutex);
6321} 6569}
6322 6570
6323#ifdef CONFIG_HOTPLUG_CPU 6571#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC
6324static void perf_pmu_rotate_stop(struct pmu *pmu) 6572static void perf_pmu_rotate_stop(struct pmu *pmu)
6325{ 6573{
6326 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); 6574 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
@@ -6374,6 +6622,26 @@ static void perf_event_exit_cpu(int cpu)
6374static inline void perf_event_exit_cpu(int cpu) { } 6622static inline void perf_event_exit_cpu(int cpu) { }
6375#endif 6623#endif
6376 6624
6625static int
6626perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
6627{
6628 int cpu;
6629
6630 for_each_online_cpu(cpu)
6631 perf_event_exit_cpu(cpu);
6632
6633 return NOTIFY_OK;
6634}
6635
6636/*
6637 * Run the perf reboot notifier at the very last possible moment so that
6638 * the generic watchdog code runs as long as possible.
6639 */
6640static struct notifier_block perf_reboot_notifier = {
6641 .notifier_call = perf_reboot,
6642 .priority = INT_MIN,
6643};
6644
6377static int __cpuinit 6645static int __cpuinit
6378perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) 6646perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
6379{ 6647{
@@ -6402,14 +6670,45 @@ void __init perf_event_init(void)
6402{ 6670{
6403 int ret; 6671 int ret;
6404 6672
6673 idr_init(&pmu_idr);
6674
6405 perf_event_init_all_cpus(); 6675 perf_event_init_all_cpus();
6406 init_srcu_struct(&pmus_srcu); 6676 init_srcu_struct(&pmus_srcu);
6407 perf_pmu_register(&perf_swevent); 6677 perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
6408 perf_pmu_register(&perf_cpu_clock); 6678 perf_pmu_register(&perf_cpu_clock, NULL, -1);
6409 perf_pmu_register(&perf_task_clock); 6679 perf_pmu_register(&perf_task_clock, NULL, -1);
6410 perf_tp_register(); 6680 perf_tp_register();
6411 perf_cpu_notifier(perf_cpu_notify); 6681 perf_cpu_notifier(perf_cpu_notify);
6682 register_reboot_notifier(&perf_reboot_notifier);
6412 6683
6413 ret = init_hw_breakpoint(); 6684 ret = init_hw_breakpoint();
6414 WARN(ret, "hw_breakpoint initialization failed with: %d", ret); 6685 WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
6415} 6686}
6687
6688static int __init perf_event_sysfs_init(void)
6689{
6690 struct pmu *pmu;
6691 int ret;
6692
6693 mutex_lock(&pmus_lock);
6694
6695 ret = bus_register(&pmu_bus);
6696 if (ret)
6697 goto unlock;
6698
6699 list_for_each_entry(pmu, &pmus, entry) {
6700 if (!pmu->name || pmu->type < 0)
6701 continue;
6702
6703 ret = pmu_dev_alloc(pmu);
6704 WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
6705 }
6706 pmu_bus_running = 1;
6707 ret = 0;
6708
6709unlock:
6710 mutex_unlock(&pmus_lock);
6711
6712 return ret;
6713}
6714device_initcall(perf_event_sysfs_init);
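
The perf_event.c changes above, besides the pmu_idr fast path and the sysfs bus registration, hook perf into the reboot path with a notifier whose priority is INT_MIN so it runs as late as possible. A minimal sketch of that reboot-notifier pattern, using made-up names and only the generic notifier API (this is not the perf code itself):

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/reboot.h>

/* Runs on the reboot/halt path; val is SYS_RESTART, SYS_HALT, etc. */
static int example_reboot(struct notifier_block *nb, unsigned long val, void *v)
{
	pr_info("example: quiescing before reboot (event %lu)\n", val);
	return NOTIFY_OK;
}

/* INT_MIN priority sorts this after every higher-priority notifier. */
static struct notifier_block example_reboot_nb = {
	.notifier_call	= example_reboot,
	.priority	= INT_MIN,
};

static int __init example_init(void)
{
	return register_reboot_notifier(&example_reboot_nb);
}

static void __exit example_exit(void)
{
	unregister_reboot_notifier(&example_reboot_nb);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");
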
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 9ca4973f736d..93bd2eb2bc53 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -145,7 +145,13 @@ static int common_timer_del(struct k_itimer *timer);
145 145
146static enum hrtimer_restart posix_timer_fn(struct hrtimer *data); 146static enum hrtimer_restart posix_timer_fn(struct hrtimer *data);
147 147
148static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags); 148static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags);
149
150#define lock_timer(tid, flags) \
151({ struct k_itimer *__timr; \
152 __cond_lock(&__timr->it_lock, __timr = __lock_timer(tid, flags)); \
153 __timr; \
154})
149 155
150static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) 156static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
151{ 157{
@@ -619,7 +625,7 @@ out:
619 * the find to the timer lock. To avoid a dead lock, the timer id MUST 625 * the find to the timer lock. To avoid a dead lock, the timer id MUST
620 * be release with out holding the timer lock. 626 * be release with out holding the timer lock.
621 */ 627 */
622static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags) 628static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
623{ 629{
624 struct k_itimer *timr; 630 struct k_itimer *timr;
625 /* 631 /*
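
The only functional content in the posix-timers.c hunk is sparse annotation: lock_timer() becomes a macro wrapping __lock_timer() in __cond_lock(), which tells the checker that the timer lock is held exactly when the returned pointer is non-NULL. A rough sketch of the same idiom applied to a hypothetical lookup helper (the struct and names below are illustrative, not kernel code):

#include <linux/compiler.h>
#include <linux/spinlock.h>

struct item {
	spinlock_t lock;
	int id;
};

static struct item the_item = {
	.lock	= __SPIN_LOCK_UNLOCKED(the_item.lock),
	.id	= 1,
};

/* Returns the matching item with ->lock held, or NULL. */
static struct item *__find_and_lock(int id)
{
	if (id != the_item.id)
		return NULL;
	spin_lock(&the_item.lock);
	return &the_item;
}

/*
 * Under sparse (__CHECKER__), __cond_lock(x, c) expands to roughly
 * ((c) ? ({ __acquire(x); 1; }) : 0), so the checker treats the lock as
 * taken only on the success path; without sparse it is simply (c).
 */
#define find_and_lock(id)					\
({								\
	struct item *__it;					\
	__cond_lock(&__it->lock, __it = __find_and_lock(id));	\
	__it;							\
})
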
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index a5aff3ebad38..265729966ece 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -100,13 +100,9 @@ config PM_SLEEP_ADVANCED_DEBUG
100 depends on PM_ADVANCED_DEBUG 100 depends on PM_ADVANCED_DEBUG
101 default n 101 default n
102 102
103config SUSPEND_NVS
104 bool
105
106config SUSPEND 103config SUSPEND
107 bool "Suspend to RAM and standby" 104 bool "Suspend to RAM and standby"
108 depends on PM && ARCH_SUSPEND_POSSIBLE 105 depends on PM && ARCH_SUSPEND_POSSIBLE
109 select SUSPEND_NVS if HAS_IOMEM
110 default y 106 default y
111 ---help--- 107 ---help---
112 Allow the system to enter sleep states in which main memory is 108 Allow the system to enter sleep states in which main memory is
@@ -140,7 +136,6 @@ config HIBERNATION
140 depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE 136 depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE
141 select LZO_COMPRESS 137 select LZO_COMPRESS
142 select LZO_DECOMPRESS 138 select LZO_DECOMPRESS
143 select SUSPEND_NVS if HAS_IOMEM
144 ---help--- 139 ---help---
145 Enable the suspend to disk (STD) functionality, which is usually 140 Enable the suspend to disk (STD) functionality, which is usually
146 called "hibernation" in user interfaces. STD checkpoints the 141 called "hibernation" in user interfaces. STD checkpoints the
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index f9063c6b185d..c350e18b53e3 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -1,7 +1,4 @@
1 1ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG
2ifeq ($(CONFIG_PM_DEBUG),y)
3EXTRA_CFLAGS += -DDEBUG
4endif
5 2
6obj-$(CONFIG_PM) += main.o 3obj-$(CONFIG_PM) += main.o
7obj-$(CONFIG_PM_SLEEP) += console.o 4obj-$(CONFIG_PM_SLEEP) += console.o
@@ -10,6 +7,5 @@ obj-$(CONFIG_SUSPEND) += suspend.o
10obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o 7obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o
11obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ 8obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \
12 block_io.o 9 block_io.o
13obj-$(CONFIG_SUSPEND_NVS) += nvs.o
14 10
15obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o 11obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 048d0b514831..1832bd264219 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -51,18 +51,18 @@ enum {
51 51
52static int hibernation_mode = HIBERNATION_SHUTDOWN; 52static int hibernation_mode = HIBERNATION_SHUTDOWN;
53 53
54static struct platform_hibernation_ops *hibernation_ops; 54static const struct platform_hibernation_ops *hibernation_ops;
55 55
56/** 56/**
57 * hibernation_set_ops - set the global hibernate operations 57 * hibernation_set_ops - set the global hibernate operations
58 * @ops: the hibernation operations to use in subsequent hibernation transitions 58 * @ops: the hibernation operations to use in subsequent hibernation transitions
59 */ 59 */
60 60
61void hibernation_set_ops(struct platform_hibernation_ops *ops) 61void hibernation_set_ops(const struct platform_hibernation_ops *ops)
62{ 62{
63 if (ops && !(ops->begin && ops->end && ops->pre_snapshot 63 if (ops && !(ops->begin && ops->end && ops->pre_snapshot
64 && ops->prepare && ops->finish && ops->enter && ops->pre_restore 64 && ops->prepare && ops->finish && ops->enter && ops->pre_restore
65 && ops->restore_cleanup)) { 65 && ops->restore_cleanup && ops->leave)) {
66 WARN_ON(1); 66 WARN_ON(1);
67 return; 67 return;
68 } 68 }
@@ -278,7 +278,7 @@ static int create_image(int platform_mode)
278 goto Enable_irqs; 278 goto Enable_irqs;
279 } 279 }
280 280
281 if (hibernation_test(TEST_CORE) || !pm_check_wakeup_events()) 281 if (hibernation_test(TEST_CORE) || pm_wakeup_pending())
282 goto Power_up; 282 goto Power_up;
283 283
284 in_suspend = 1; 284 in_suspend = 1;
@@ -516,7 +516,7 @@ int hibernation_platform_enter(void)
516 516
517 local_irq_disable(); 517 local_irq_disable();
518 sysdev_suspend(PMSG_HIBERNATE); 518 sysdev_suspend(PMSG_HIBERNATE);
519 if (!pm_check_wakeup_events()) { 519 if (pm_wakeup_pending()) {
520 error = -EAGAIN; 520 error = -EAGAIN;
521 goto Power_up; 521 goto Power_up;
522 } 522 }
@@ -647,6 +647,7 @@ int hibernate(void)
647 swsusp_free(); 647 swsusp_free();
648 if (!error) 648 if (!error)
649 power_down(); 649 power_down();
650 in_suspend = 0;
650 pm_restore_gfp_mask(); 651 pm_restore_gfp_mask();
651 } else { 652 } else {
652 pr_debug("PM: Image restored successfully.\n"); 653 pr_debug("PM: Image restored successfully.\n");
diff --git a/kernel/power/nvs.c b/kernel/power/nvs.c
deleted file mode 100644
index 1836db60bbb6..000000000000
--- a/kernel/power/nvs.c
+++ /dev/null
@@ -1,136 +0,0 @@
1/*
2 * linux/kernel/power/hibernate_nvs.c - Routines for handling NVS memory
3 *
4 * Copyright (C) 2008,2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
5 *
6 * This file is released under the GPLv2.
7 */
8
9#include <linux/io.h>
10#include <linux/kernel.h>
11#include <linux/list.h>
12#include <linux/mm.h>
13#include <linux/slab.h>
14#include <linux/suspend.h>
15
16/*
17 * Platforms, like ACPI, may want us to save some memory used by them during
18 * suspend and to restore the contents of this memory during the subsequent
19 * resume. The code below implements a mechanism allowing us to do that.
20 */
21
22struct nvs_page {
23 unsigned long phys_start;
24 unsigned int size;
25 void *kaddr;
26 void *data;
27 struct list_head node;
28};
29
30static LIST_HEAD(nvs_list);
31
32/**
33 * suspend_nvs_register - register platform NVS memory region to save
34 * @start - physical address of the region
35 * @size - size of the region
36 *
37 * The NVS region need not be page-aligned (both ends) and we arrange
38 * things so that the data from page-aligned addresses in this region will
39 * be copied into separate RAM pages.
40 */
41int suspend_nvs_register(unsigned long start, unsigned long size)
42{
43 struct nvs_page *entry, *next;
44
45 while (size > 0) {
46 unsigned int nr_bytes;
47
48 entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL);
49 if (!entry)
50 goto Error;
51
52 list_add_tail(&entry->node, &nvs_list);
53 entry->phys_start = start;
54 nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK);
55 entry->size = (size < nr_bytes) ? size : nr_bytes;
56
57 start += entry->size;
58 size -= entry->size;
59 }
60 return 0;
61
62 Error:
63 list_for_each_entry_safe(entry, next, &nvs_list, node) {
64 list_del(&entry->node);
65 kfree(entry);
66 }
67 return -ENOMEM;
68}
69
70/**
71 * suspend_nvs_free - free data pages allocated for saving NVS regions
72 */
73void suspend_nvs_free(void)
74{
75 struct nvs_page *entry;
76
77 list_for_each_entry(entry, &nvs_list, node)
78 if (entry->data) {
79 free_page((unsigned long)entry->data);
80 entry->data = NULL;
81 if (entry->kaddr) {
82 iounmap(entry->kaddr);
83 entry->kaddr = NULL;
84 }
85 }
86}
87
88/**
89 * suspend_nvs_alloc - allocate memory necessary for saving NVS regions
90 */
91int suspend_nvs_alloc(void)
92{
93 struct nvs_page *entry;
94
95 list_for_each_entry(entry, &nvs_list, node) {
96 entry->data = (void *)__get_free_page(GFP_KERNEL);
97 if (!entry->data) {
98 suspend_nvs_free();
99 return -ENOMEM;
100 }
101 }
102 return 0;
103}
104
105/**
106 * suspend_nvs_save - save NVS memory regions
107 */
108void suspend_nvs_save(void)
109{
110 struct nvs_page *entry;
111
112 printk(KERN_INFO "PM: Saving platform NVS memory\n");
113
114 list_for_each_entry(entry, &nvs_list, node)
115 if (entry->data) {
116 entry->kaddr = ioremap(entry->phys_start, entry->size);
117 memcpy(entry->data, entry->kaddr, entry->size);
118 }
119}
120
121/**
122 * suspend_nvs_restore - restore NVS memory regions
123 *
124 * This function is going to be called with interrupts disabled, so it
125 * cannot iounmap the virtual addresses used to access the NVS region.
126 */
127void suspend_nvs_restore(void)
128{
129 struct nvs_page *entry;
130
131 printk(KERN_INFO "PM: Restoring platform NVS memory\n");
132
133 list_for_each_entry(entry, &nvs_list, node)
134 if (entry->data)
135 memcpy(entry->kaddr, entry->data, entry->size);
136}
diff --git a/kernel/power/process.c b/kernel/power/process.c
index e50b4c1b2a0f..d6d2a10320e0 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -64,6 +64,12 @@ static int try_to_freeze_tasks(bool sig_only)
64 * perturb a task in TASK_STOPPED or TASK_TRACED. 64 * perturb a task in TASK_STOPPED or TASK_TRACED.
65 * It is "frozen enough". If the task does wake 65 * It is "frozen enough". If the task does wake
66 * up, it will immediately call try_to_freeze. 66 * up, it will immediately call try_to_freeze.
67 *
68 * Because freeze_task() goes through p's
69 * scheduler lock after setting TIF_FREEZE, it's
70 * guaranteed that either we see TASK_RUNNING or
71 * try_to_stop() after schedule() in ptrace/signal
72 * stop sees TIF_FREEZE.
67 */ 73 */
68 if (!task_is_stopped_or_traced(p) && 74 if (!task_is_stopped_or_traced(p) &&
69 !freezer_should_skip(p)) 75 !freezer_should_skip(p))
@@ -79,7 +85,7 @@ static int try_to_freeze_tasks(bool sig_only)
79 if (!todo || time_after(jiffies, end_time)) 85 if (!todo || time_after(jiffies, end_time))
80 break; 86 break;
81 87
82 if (!pm_check_wakeup_events()) { 88 if (pm_wakeup_pending()) {
83 wakeup = true; 89 wakeup = true;
84 break; 90 break;
85 } 91 }
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index ecf770509d0d..de6f86bfa303 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -22,6 +22,7 @@
22#include <linux/mm.h> 22#include <linux/mm.h>
23#include <linux/slab.h> 23#include <linux/slab.h>
24#include <linux/suspend.h> 24#include <linux/suspend.h>
25#include <trace/events/power.h>
25 26
26#include "power.h" 27#include "power.h"
27 28
@@ -30,13 +31,13 @@ const char *const pm_states[PM_SUSPEND_MAX] = {
30 [PM_SUSPEND_MEM] = "mem", 31 [PM_SUSPEND_MEM] = "mem",
31}; 32};
32 33
33static struct platform_suspend_ops *suspend_ops; 34static const struct platform_suspend_ops *suspend_ops;
34 35
35/** 36/**
36 * suspend_set_ops - Set the global suspend method table. 37 * suspend_set_ops - Set the global suspend method table.
37 * @ops: Pointer to ops structure. 38 * @ops: Pointer to ops structure.
38 */ 39 */
39void suspend_set_ops(struct platform_suspend_ops *ops) 40void suspend_set_ops(const struct platform_suspend_ops *ops)
40{ 41{
41 mutex_lock(&pm_mutex); 42 mutex_lock(&pm_mutex);
42 suspend_ops = ops; 43 suspend_ops = ops;
@@ -163,7 +164,7 @@ static int suspend_enter(suspend_state_t state)
163 164
164 error = sysdev_suspend(PMSG_SUSPEND); 165 error = sysdev_suspend(PMSG_SUSPEND);
165 if (!error) { 166 if (!error) {
166 if (!suspend_test(TEST_CORE) && pm_check_wakeup_events()) { 167 if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) {
167 error = suspend_ops->enter(state); 168 error = suspend_ops->enter(state);
168 events_check_enabled = false; 169 events_check_enabled = false;
169 } 170 }
@@ -201,6 +202,7 @@ int suspend_devices_and_enter(suspend_state_t state)
201 if (!suspend_ops) 202 if (!suspend_ops)
202 return -ENOSYS; 203 return -ENOSYS;
203 204
205 trace_machine_suspend(state);
204 if (suspend_ops->begin) { 206 if (suspend_ops->begin) {
205 error = suspend_ops->begin(state); 207 error = suspend_ops->begin(state);
206 if (error) 208 if (error)
@@ -229,6 +231,7 @@ int suspend_devices_and_enter(suspend_state_t state)
229 Close: 231 Close:
230 if (suspend_ops->end) 232 if (suspend_ops->end)
231 suspend_ops->end(); 233 suspend_ops->end();
234 trace_machine_suspend(PWR_EVENT_EXIT);
232 return error; 235 return error;
233 236
234 Recover_platform: 237 Recover_platform:
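
Besides the new machine_suspend trace points, suspend.c now takes a const ops table, so platform code can place its callbacks in read-only data. A minimal registration sketch against the new const prototype; the two callbacks shown are illustrative and assume the usual .valid/.enter members of struct platform_suspend_ops:

#include <linux/init.h>
#include <linux/suspend.h>

static int example_suspend_enter(suspend_state_t state)
{
	/* program the platform to actually enter 'state' here */
	return 0;
}

/* const matches the new suspend_set_ops(const struct platform_suspend_ops *). */
static const struct platform_suspend_ops example_suspend_ops = {
	.valid	= suspend_valid_only_mem,
	.enter	= example_suspend_enter,
};

static int __init example_pm_init(void)
{
	suspend_set_ops(&example_suspend_ops);
	return 0;
}
late_initcall(example_pm_init);
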
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 8c7e4832b9be..7c97c3a0eee3 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -224,7 +224,7 @@ static int swsusp_swap_check(void)
224 return res; 224 return res;
225 225
226 root_swap = res; 226 root_swap = res;
227 res = blkdev_get(hib_resume_bdev, FMODE_WRITE); 227 res = blkdev_get(hib_resume_bdev, FMODE_WRITE, NULL);
228 if (res) 228 if (res)
229 return res; 229 return res;
230 230
@@ -888,7 +888,7 @@ out_finish:
888/** 888/**
889 * swsusp_read - read the hibernation image. 889 * swsusp_read - read the hibernation image.
890 * @flags_p: flags passed by the "frozen" kernel in the image header should 890 * @flags_p: flags passed by the "frozen" kernel in the image header should
891 * be written into this memeory location 891 * be written into this memory location
892 */ 892 */
893 893
894int swsusp_read(unsigned int *flags_p) 894int swsusp_read(unsigned int *flags_p)
@@ -930,7 +930,8 @@ int swsusp_check(void)
930{ 930{
931 int error; 931 int error;
932 932
933 hib_resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); 933 hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device,
934 FMODE_READ, NULL);
934 if (!IS_ERR(hib_resume_bdev)) { 935 if (!IS_ERR(hib_resume_bdev)) {
935 set_blocksize(hib_resume_bdev, PAGE_SIZE); 936 set_blocksize(hib_resume_bdev, PAGE_SIZE);
936 clear_page(swsusp_header); 937 clear_page(swsusp_header);
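
The swap.c hunks track the block-layer interface change visible above: blkdev_get() and the by-dev_t open now take an explicit holder argument, and open_by_devnum() is gone. A sketch of opening and releasing a resume-style device under the new calls (error handling trimmed, names invented):

#include <linux/blkdev.h>
#include <linux/err.h>
#include <linux/fs.h>

static struct block_device *example_bdev;

static int example_open_by_devt(dev_t devt)
{
	/* The third argument is the holder cookie used for exclusive claims. */
	example_bdev = blkdev_get_by_dev(devt, FMODE_READ, NULL);
	if (IS_ERR(example_bdev))
		return PTR_ERR(example_bdev);

	set_blocksize(example_bdev, PAGE_SIZE);
	return 0;
}

static void example_close(void)
{
	blkdev_put(example_bdev, FMODE_READ);
}
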
diff --git a/kernel/printk.c b/kernel/printk.c
index a23315dc4498..53d9a9ec88e6 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -39,16 +39,11 @@
39#include <linux/syslog.h> 39#include <linux/syslog.h>
40#include <linux/cpu.h> 40#include <linux/cpu.h>
41#include <linux/notifier.h> 41#include <linux/notifier.h>
42#include <linux/rculist.h>
42 43
43#include <asm/uaccess.h> 44#include <asm/uaccess.h>
44 45
45/* 46/*
46 * for_each_console() allows you to iterate on each console
47 */
48#define for_each_console(con) \
49 for (con = console_drivers; con != NULL; con = con->next)
50
51/*
52 * Architectures can override it: 47 * Architectures can override it:
53 */ 48 */
54void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) 49void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
@@ -279,12 +274,12 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
279 * at open time. 274 * at open time.
280 */ 275 */
281 if (type == SYSLOG_ACTION_OPEN || !from_file) { 276 if (type == SYSLOG_ACTION_OPEN || !from_file) {
282 if (dmesg_restrict && !capable(CAP_SYS_ADMIN)) 277 if (dmesg_restrict && !capable(CAP_SYSLOG))
283 return -EPERM; 278 goto warn; /* switch to return -EPERM after 2.6.39 */
284 if ((type != SYSLOG_ACTION_READ_ALL && 279 if ((type != SYSLOG_ACTION_READ_ALL &&
285 type != SYSLOG_ACTION_SIZE_BUFFER) && 280 type != SYSLOG_ACTION_SIZE_BUFFER) &&
286 !capable(CAP_SYS_ADMIN)) 281 !capable(CAP_SYSLOG))
287 return -EPERM; 282 goto warn; /* switch to return -EPERM after 2.6.39 */
288 } 283 }
289 284
290 error = security_syslog(type); 285 error = security_syslog(type);
@@ -428,6 +423,12 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
428 } 423 }
429out: 424out:
430 return error; 425 return error;
426warn:
427 /* remove after 2.6.39 */
428 if (capable(CAP_SYS_ADMIN))
429 WARN_ONCE(1, "Attempt to access syslog with CAP_SYS_ADMIN "
430 "but no CAP_SYSLOG (deprecated and denied).\n");
431 return -EPERM;
431} 432}
432 433
433SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) 434SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
@@ -1074,17 +1075,17 @@ static DEFINE_PER_CPU(int, printk_pending);
1074 1075
1075void printk_tick(void) 1076void printk_tick(void)
1076{ 1077{
1077 if (__get_cpu_var(printk_pending)) { 1078 if (__this_cpu_read(printk_pending)) {
1078 __get_cpu_var(printk_pending) = 0; 1079 __this_cpu_write(printk_pending, 0);
1079 wake_up_interruptible(&log_wait); 1080 wake_up_interruptible(&log_wait);
1080 } 1081 }
1081} 1082}
1082 1083
1083int printk_needs_cpu(int cpu) 1084int printk_needs_cpu(int cpu)
1084{ 1085{
1085 if (unlikely(cpu_is_offline(cpu))) 1086 if (cpu_is_offline(cpu))
1086 printk_tick(); 1087 printk_tick();
1087 return per_cpu(printk_pending, cpu); 1088 return __this_cpu_read(printk_pending);
1088} 1089}
1089 1090
1090void wake_up_klogd(void) 1091void wake_up_klogd(void)
@@ -1359,6 +1360,7 @@ void register_console(struct console *newcon)
1359 spin_unlock_irqrestore(&logbuf_lock, flags); 1360 spin_unlock_irqrestore(&logbuf_lock, flags);
1360 } 1361 }
1361 release_console_sem(); 1362 release_console_sem();
1363 console_sysfs_notify();
1362 1364
1363 /* 1365 /*
1364 * By unregistering the bootconsoles after we enable the real console 1366 * By unregistering the bootconsoles after we enable the real console
@@ -1417,6 +1419,7 @@ int unregister_console(struct console *console)
1417 console_drivers->flags |= CON_CONSDEV; 1419 console_drivers->flags |= CON_CONSDEV;
1418 1420
1419 release_console_sem(); 1421 release_console_sem();
1422 console_sysfs_notify();
1420 return res; 1423 return res;
1421} 1424}
1422EXPORT_SYMBOL(unregister_console); 1425EXPORT_SYMBOL(unregister_console);
@@ -1500,7 +1503,7 @@ int kmsg_dump_register(struct kmsg_dumper *dumper)
1500 /* Don't allow registering multiple times */ 1503 /* Don't allow registering multiple times */
1501 if (!dumper->registered) { 1504 if (!dumper->registered) {
1502 dumper->registered = 1; 1505 dumper->registered = 1;
1503 list_add_tail(&dumper->list, &dump_list); 1506 list_add_tail_rcu(&dumper->list, &dump_list);
1504 err = 0; 1507 err = 0;
1505 } 1508 }
1506 spin_unlock_irqrestore(&dump_list_lock, flags); 1509 spin_unlock_irqrestore(&dump_list_lock, flags);
@@ -1524,29 +1527,16 @@ int kmsg_dump_unregister(struct kmsg_dumper *dumper)
1524 spin_lock_irqsave(&dump_list_lock, flags); 1527 spin_lock_irqsave(&dump_list_lock, flags);
1525 if (dumper->registered) { 1528 if (dumper->registered) {
1526 dumper->registered = 0; 1529 dumper->registered = 0;
1527 list_del(&dumper->list); 1530 list_del_rcu(&dumper->list);
1528 err = 0; 1531 err = 0;
1529 } 1532 }
1530 spin_unlock_irqrestore(&dump_list_lock, flags); 1533 spin_unlock_irqrestore(&dump_list_lock, flags);
1534 synchronize_rcu();
1531 1535
1532 return err; 1536 return err;
1533} 1537}
1534EXPORT_SYMBOL_GPL(kmsg_dump_unregister); 1538EXPORT_SYMBOL_GPL(kmsg_dump_unregister);
1535 1539
1536static const char * const kmsg_reasons[] = {
1537 [KMSG_DUMP_OOPS] = "oops",
1538 [KMSG_DUMP_PANIC] = "panic",
1539 [KMSG_DUMP_KEXEC] = "kexec",
1540};
1541
1542static const char *kmsg_to_str(enum kmsg_dump_reason reason)
1543{
1544 if (reason >= ARRAY_SIZE(kmsg_reasons) || reason < 0)
1545 return "unknown";
1546
1547 return kmsg_reasons[reason];
1548}
1549
1550/** 1540/**
1551 * kmsg_dump - dump kernel log to kernel message dumpers. 1541 * kmsg_dump - dump kernel log to kernel message dumpers.
1552 * @reason: the reason (oops, panic etc) for dumping 1542 * @reason: the reason (oops, panic etc) for dumping
@@ -1585,13 +1575,9 @@ void kmsg_dump(enum kmsg_dump_reason reason)
1585 l2 = chars; 1575 l2 = chars;
1586 } 1576 }
1587 1577
1588 if (!spin_trylock_irqsave(&dump_list_lock, flags)) { 1578 rcu_read_lock();
1589 printk(KERN_ERR "dump_kmsg: dump list lock is held during %s, skipping dump\n", 1579 list_for_each_entry_rcu(dumper, &dump_list, list)
1590 kmsg_to_str(reason));
1591 return;
1592 }
1593 list_for_each_entry(dumper, &dump_list, list)
1594 dumper->dump(dumper, reason, s1, l1, s2, l2); 1580 dumper->dump(dumper, reason, s1, l1, s2, l2);
1595 spin_unlock_irqrestore(&dump_list_lock, flags); 1581 rcu_read_unlock();
1596} 1582}
1597#endif 1583#endif
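
The kmsg_dump() rework above is the standard RCU-protected list conversion: writers keep taking dump_list_lock but use the _rcu list helpers and synchronize_rcu() on removal, while the dump path itself only needs rcu_read_lock(), so an oops can no longer be skipped just because someone happens to hold the spinlock. A generic sketch of that pattern with invented names (not the printk code):

#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/spinlock.h>

struct handler {
	struct list_head list;
	void (*fn)(void *data);
};

static LIST_HEAD(handler_list);
static DEFINE_SPINLOCK(handler_lock);	/* serializes writers only */

static void handler_register(struct handler *h)
{
	spin_lock(&handler_lock);
	list_add_tail_rcu(&h->list, &handler_list);
	spin_unlock(&handler_lock);
}

static void handler_unregister(struct handler *h)
{
	spin_lock(&handler_lock);
	list_del_rcu(&h->list);
	spin_unlock(&handler_lock);
	synchronize_rcu();	/* wait for readers still walking the old list */
}

/* Readers never touch handler_lock, so this is safe even in dire contexts. */
static void handler_run_all(void *data)
{
	struct handler *h;

	rcu_read_lock();
	list_for_each_entry_rcu(h, &handler_list, list)
		h->fn(data);
	rcu_read_unlock();
}
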
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index d806735342ac..034493724749 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -36,31 +36,16 @@
36#include <linux/time.h> 36#include <linux/time.h>
37#include <linux/cpu.h> 37#include <linux/cpu.h>
38 38
39/* Global control variables for rcupdate callback mechanism. */ 39/* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */
40struct rcu_ctrlblk { 40static struct task_struct *rcu_kthread_task;
41 struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */ 41static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq);
42 struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ 42static unsigned long have_rcu_kthread_work;
43 struct rcu_head **curtail; /* ->next pointer of last CB. */ 43static void invoke_rcu_kthread(void);
44};
45
46/* Definition for rcupdate control block. */
47static struct rcu_ctrlblk rcu_sched_ctrlblk = {
48 .donetail = &rcu_sched_ctrlblk.rcucblist,
49 .curtail = &rcu_sched_ctrlblk.rcucblist,
50};
51
52static struct rcu_ctrlblk rcu_bh_ctrlblk = {
53 .donetail = &rcu_bh_ctrlblk.rcucblist,
54 .curtail = &rcu_bh_ctrlblk.rcucblist,
55};
56
57#ifdef CONFIG_DEBUG_LOCK_ALLOC
58int rcu_scheduler_active __read_mostly;
59EXPORT_SYMBOL_GPL(rcu_scheduler_active);
60#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
61 44
62/* Forward declarations for rcutiny_plugin.h. */ 45/* Forward declarations for rcutiny_plugin.h. */
63static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); 46struct rcu_ctrlblk;
47static void rcu_process_callbacks(struct rcu_ctrlblk *rcp);
48static int rcu_kthread(void *arg);
64static void __call_rcu(struct rcu_head *head, 49static void __call_rcu(struct rcu_head *head,
65 void (*func)(struct rcu_head *rcu), 50 void (*func)(struct rcu_head *rcu),
66 struct rcu_ctrlblk *rcp); 51 struct rcu_ctrlblk *rcp);
@@ -123,7 +108,7 @@ void rcu_sched_qs(int cpu)
123{ 108{
124 if (rcu_qsctr_help(&rcu_sched_ctrlblk) + 109 if (rcu_qsctr_help(&rcu_sched_ctrlblk) +
125 rcu_qsctr_help(&rcu_bh_ctrlblk)) 110 rcu_qsctr_help(&rcu_bh_ctrlblk))
126 raise_softirq(RCU_SOFTIRQ); 111 invoke_rcu_kthread();
127} 112}
128 113
129/* 114/*
@@ -132,7 +117,7 @@ void rcu_sched_qs(int cpu)
132void rcu_bh_qs(int cpu) 117void rcu_bh_qs(int cpu)
133{ 118{
134 if (rcu_qsctr_help(&rcu_bh_ctrlblk)) 119 if (rcu_qsctr_help(&rcu_bh_ctrlblk))
135 raise_softirq(RCU_SOFTIRQ); 120 invoke_rcu_kthread();
136} 121}
137 122
138/* 123/*
@@ -152,13 +137,14 @@ void rcu_check_callbacks(int cpu, int user)
152} 137}
153 138
154/* 139/*
155 * Helper function for rcu_process_callbacks() that operates on the 140 * Invoke the RCU callbacks on the specified rcu_ctrlkblk structure
156 * specified rcu_ctrlkblk structure. 141 * whose grace period has elapsed.
157 */ 142 */
158static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) 143static void rcu_process_callbacks(struct rcu_ctrlblk *rcp)
159{ 144{
160 struct rcu_head *next, *list; 145 struct rcu_head *next, *list;
161 unsigned long flags; 146 unsigned long flags;
147 RCU_TRACE(int cb_count = 0);
162 148
163 /* If no RCU callbacks ready to invoke, just return. */ 149 /* If no RCU callbacks ready to invoke, just return. */
164 if (&rcp->rcucblist == rcp->donetail) 150 if (&rcp->rcucblist == rcp->donetail)
@@ -180,19 +166,58 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
180 next = list->next; 166 next = list->next;
181 prefetch(next); 167 prefetch(next);
182 debug_rcu_head_unqueue(list); 168 debug_rcu_head_unqueue(list);
169 local_bh_disable();
183 list->func(list); 170 list->func(list);
171 local_bh_enable();
184 list = next; 172 list = next;
173 RCU_TRACE(cb_count++);
185 } 174 }
175 RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count));
186} 176}
187 177
188/* 178/*
189 * Invoke any callbacks whose grace period has completed. 179 * This kthread invokes RCU callbacks whose grace periods have
180 * elapsed. It is awakened as needed, and takes the place of the
181 * RCU_SOFTIRQ that was used previously for this purpose.
182 * This is a kthread, but it is never stopped, at least not until
183 * the system goes down.
190 */ 184 */
191static void rcu_process_callbacks(struct softirq_action *unused) 185static int rcu_kthread(void *arg)
192{ 186{
193 __rcu_process_callbacks(&rcu_sched_ctrlblk); 187 unsigned long work;
194 __rcu_process_callbacks(&rcu_bh_ctrlblk); 188 unsigned long morework;
195 rcu_preempt_process_callbacks(); 189 unsigned long flags;
190
191 for (;;) {
192 wait_event(rcu_kthread_wq, have_rcu_kthread_work != 0);
193 morework = rcu_boost();
194 local_irq_save(flags);
195 work = have_rcu_kthread_work;
196 have_rcu_kthread_work = morework;
197 local_irq_restore(flags);
198 if (work) {
199 rcu_process_callbacks(&rcu_sched_ctrlblk);
200 rcu_process_callbacks(&rcu_bh_ctrlblk);
201 rcu_preempt_process_callbacks();
202 }
203 schedule_timeout_interruptible(1); /* Leave CPU for others. */
204 }
205
206 return 0; /* Not reached, but needed to shut gcc up. */
207}
208
209/*
210 * Wake up rcu_kthread() to process callbacks now eligible for invocation
211 * or to boost readers.
212 */
213static void invoke_rcu_kthread(void)
214{
215 unsigned long flags;
216
217 local_irq_save(flags);
218 have_rcu_kthread_work = 1;
219 wake_up(&rcu_kthread_wq);
220 local_irq_restore(flags);
196} 221}
197 222
198/* 223/*
@@ -230,6 +255,7 @@ static void __call_rcu(struct rcu_head *head,
230 local_irq_save(flags); 255 local_irq_save(flags);
231 *rcp->curtail = head; 256 *rcp->curtail = head;
232 rcp->curtail = &head->next; 257 rcp->curtail = &head->next;
258 RCU_TRACE(rcp->qlen++);
233 local_irq_restore(flags); 259 local_irq_restore(flags);
234} 260}
235 261
@@ -282,7 +308,16 @@ void rcu_barrier_sched(void)
282} 308}
283EXPORT_SYMBOL_GPL(rcu_barrier_sched); 309EXPORT_SYMBOL_GPL(rcu_barrier_sched);
284 310
285void __init rcu_init(void) 311/*
312 * Spawn the kthread that invokes RCU callbacks.
313 */
314static int __init rcu_spawn_kthreads(void)
286{ 315{
287 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 316 struct sched_param sp;
317
318 rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread");
319 sp.sched_priority = RCU_BOOST_PRIO;
320 sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp);
321 return 0;
288} 322}
323early_initcall(rcu_spawn_kthreads);
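
The rcutiny.c rewrite replaces RCU_SOFTIRQ with a dedicated kthread: a work flag plus a wait queue, a wake-up helper that is safe with interrupts disabled, and an early_initcall that spawns the thread at SCHED_FIFO. A stripped-down sketch of that handoff with illustrative names (the RCU-specific bookkeeping is omitted):

#include <linux/err.h>
#include <linux/init.h>
#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/wait.h>

static struct task_struct *worker_task;
static DECLARE_WAIT_QUEUE_HEAD(worker_wq);
static unsigned long worker_has_work;

static int worker_fn(void *unused)
{
	unsigned long flags;

	for (;;) {
		wait_event(worker_wq, worker_has_work != 0);
		local_irq_save(flags);
		worker_has_work = 0;
		local_irq_restore(flags);
		/* ... invoke whatever was queued here ... */
		schedule_timeout_interruptible(1);	/* leave the CPU to others */
	}
	return 0;	/* not reached */
}

/* Callable from hard-irq context, like invoke_rcu_kthread() above. */
static void kick_worker(void)
{
	unsigned long flags;

	local_irq_save(flags);
	worker_has_work = 1;
	wake_up(&worker_wq);
	local_irq_restore(flags);
}

static int __init spawn_worker(void)
{
	struct sched_param sp = { .sched_priority = 1 };

	worker_task = kthread_run(worker_fn, NULL, "example_worker");
	if (!IS_ERR(worker_task))
		sched_setscheduler_nocheck(worker_task, SCHED_FIFO, &sp);
	return 0;
}
early_initcall(spawn_worker);
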
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 6ceca4f745ff..015abaea962a 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -22,6 +22,40 @@
22 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> 22 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
23 */ 23 */
24 24
25#include <linux/kthread.h>
26#include <linux/debugfs.h>
27#include <linux/seq_file.h>
28
29#ifdef CONFIG_RCU_TRACE
30#define RCU_TRACE(stmt) stmt
31#else /* #ifdef CONFIG_RCU_TRACE */
32#define RCU_TRACE(stmt)
33#endif /* #else #ifdef CONFIG_RCU_TRACE */
34
35/* Global control variables for rcupdate callback mechanism. */
36struct rcu_ctrlblk {
37 struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */
38 struct rcu_head **donetail; /* ->next pointer of last "done" CB. */
39 struct rcu_head **curtail; /* ->next pointer of last CB. */
40 RCU_TRACE(long qlen); /* Number of pending CBs. */
41};
42
43/* Definition for rcupdate control block. */
44static struct rcu_ctrlblk rcu_sched_ctrlblk = {
45 .donetail = &rcu_sched_ctrlblk.rcucblist,
46 .curtail = &rcu_sched_ctrlblk.rcucblist,
47};
48
49static struct rcu_ctrlblk rcu_bh_ctrlblk = {
50 .donetail = &rcu_bh_ctrlblk.rcucblist,
51 .curtail = &rcu_bh_ctrlblk.rcucblist,
52};
53
54#ifdef CONFIG_DEBUG_LOCK_ALLOC
55int rcu_scheduler_active __read_mostly;
56EXPORT_SYMBOL_GPL(rcu_scheduler_active);
57#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
58
25#ifdef CONFIG_TINY_PREEMPT_RCU 59#ifdef CONFIG_TINY_PREEMPT_RCU
26 60
27#include <linux/delay.h> 61#include <linux/delay.h>
@@ -46,17 +80,45 @@ struct rcu_preempt_ctrlblk {
46 struct list_head *gp_tasks; 80 struct list_head *gp_tasks;
47 /* Pointer to the first task blocking the */ 81 /* Pointer to the first task blocking the */
48 /* current grace period, or NULL if there */ 82 /* current grace period, or NULL if there */
49 /* is not such task. */ 83 /* is no such task. */
50 struct list_head *exp_tasks; 84 struct list_head *exp_tasks;
51 /* Pointer to first task blocking the */ 85 /* Pointer to first task blocking the */
52 /* current expedited grace period, or NULL */ 86 /* current expedited grace period, or NULL */
53 /* if there is no such task. If there */ 87 /* if there is no such task. If there */
54 /* is no current expedited grace period, */ 88 /* is no current expedited grace period, */
55 /* then there cannot be any such task. */ 89 /* then there cannot be any such task. */
90#ifdef CONFIG_RCU_BOOST
91 struct list_head *boost_tasks;
92 /* Pointer to first task that needs to be */
93 /* priority-boosted, or NULL if no priority */
94 /* boosting is needed. If there is no */
95 /* current or expedited grace period, there */
96 /* can be no such task. */
97#endif /* #ifdef CONFIG_RCU_BOOST */
56 u8 gpnum; /* Current grace period. */ 98 u8 gpnum; /* Current grace period. */
57 u8 gpcpu; /* Last grace period blocked by the CPU. */ 99 u8 gpcpu; /* Last grace period blocked by the CPU. */
58 u8 completed; /* Last grace period completed. */ 100 u8 completed; /* Last grace period completed. */
59 /* If all three are equal, RCU is idle. */ 101 /* If all three are equal, RCU is idle. */
102#ifdef CONFIG_RCU_BOOST
103 s8 boosted_this_gp; /* Has boosting already happened? */
104 unsigned long boost_time; /* When to start boosting (jiffies) */
105#endif /* #ifdef CONFIG_RCU_BOOST */
106#ifdef CONFIG_RCU_TRACE
107 unsigned long n_grace_periods;
108#ifdef CONFIG_RCU_BOOST
109 unsigned long n_tasks_boosted;
110 unsigned long n_exp_boosts;
111 unsigned long n_normal_boosts;
112 unsigned long n_normal_balk_blkd_tasks;
113 unsigned long n_normal_balk_gp_tasks;
114 unsigned long n_normal_balk_boost_tasks;
115 unsigned long n_normal_balk_boosted;
116 unsigned long n_normal_balk_notyet;
117 unsigned long n_normal_balk_nos;
118 unsigned long n_exp_balk_blkd_tasks;
119 unsigned long n_exp_balk_nos;
120#endif /* #ifdef CONFIG_RCU_BOOST */
121#endif /* #ifdef CONFIG_RCU_TRACE */
60}; 122};
61 123
62static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = { 124static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = {
@@ -122,6 +184,210 @@ static int rcu_preempt_gp_in_progress(void)
122} 184}
123 185
124/* 186/*
187 * Advance a ->blkd_tasks-list pointer to the next entry, instead
188 * returning NULL if at the end of the list.
189 */
190static struct list_head *rcu_next_node_entry(struct task_struct *t)
191{
192 struct list_head *np;
193
194 np = t->rcu_node_entry.next;
195 if (np == &rcu_preempt_ctrlblk.blkd_tasks)
196 np = NULL;
197 return np;
198}
199
200#ifdef CONFIG_RCU_TRACE
201
202#ifdef CONFIG_RCU_BOOST
203static void rcu_initiate_boost_trace(void);
204static void rcu_initiate_exp_boost_trace(void);
205#endif /* #ifdef CONFIG_RCU_BOOST */
206
207/*
208 * Dump additional statistice for TINY_PREEMPT_RCU.
209 */
210static void show_tiny_preempt_stats(struct seq_file *m)
211{
212 seq_printf(m, "rcu_preempt: qlen=%ld gp=%lu g%u/p%u/c%u tasks=%c%c%c\n",
213 rcu_preempt_ctrlblk.rcb.qlen,
214 rcu_preempt_ctrlblk.n_grace_periods,
215 rcu_preempt_ctrlblk.gpnum,
216 rcu_preempt_ctrlblk.gpcpu,
217 rcu_preempt_ctrlblk.completed,
218 "T."[list_empty(&rcu_preempt_ctrlblk.blkd_tasks)],
219 "N."[!rcu_preempt_ctrlblk.gp_tasks],
220 "E."[!rcu_preempt_ctrlblk.exp_tasks]);
221#ifdef CONFIG_RCU_BOOST
222 seq_printf(m, " ttb=%c btg=",
223 "B."[!rcu_preempt_ctrlblk.boost_tasks]);
224 switch (rcu_preempt_ctrlblk.boosted_this_gp) {
225 case -1:
226 seq_puts(m, "exp");
227 break;
228 case 0:
229 seq_puts(m, "no");
230 break;
231 case 1:
232 seq_puts(m, "begun");
233 break;
234 case 2:
235 seq_puts(m, "done");
236 break;
237 default:
238 seq_printf(m, "?%d?", rcu_preempt_ctrlblk.boosted_this_gp);
239 }
240 seq_printf(m, " ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n",
241 rcu_preempt_ctrlblk.n_tasks_boosted,
242 rcu_preempt_ctrlblk.n_exp_boosts,
243 rcu_preempt_ctrlblk.n_normal_boosts,
244 (int)(jiffies & 0xffff),
245 (int)(rcu_preempt_ctrlblk.boost_time & 0xffff));
246 seq_printf(m, " %s: nt=%lu gt=%lu bt=%lu b=%lu ny=%lu nos=%lu\n",
247 "normal balk",
248 rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks,
249 rcu_preempt_ctrlblk.n_normal_balk_gp_tasks,
250 rcu_preempt_ctrlblk.n_normal_balk_boost_tasks,
251 rcu_preempt_ctrlblk.n_normal_balk_boosted,
252 rcu_preempt_ctrlblk.n_normal_balk_notyet,
253 rcu_preempt_ctrlblk.n_normal_balk_nos);
254 seq_printf(m, " exp balk: bt=%lu nos=%lu\n",
255 rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks,
256 rcu_preempt_ctrlblk.n_exp_balk_nos);
257#endif /* #ifdef CONFIG_RCU_BOOST */
258}
259
260#endif /* #ifdef CONFIG_RCU_TRACE */
261
262#ifdef CONFIG_RCU_BOOST
263
264#include "rtmutex_common.h"
265
266/*
267 * Carry out RCU priority boosting on the task indicated by ->boost_tasks,
268 * and advance ->boost_tasks to the next task in the ->blkd_tasks list.
269 */
270static int rcu_boost(void)
271{
272 unsigned long flags;
273 struct rt_mutex mtx;
274 struct list_head *np;
275 struct task_struct *t;
276
277 if (rcu_preempt_ctrlblk.boost_tasks == NULL)
278 return 0; /* Nothing to boost. */
279 raw_local_irq_save(flags);
280 rcu_preempt_ctrlblk.boosted_this_gp++;
281 t = container_of(rcu_preempt_ctrlblk.boost_tasks, struct task_struct,
282 rcu_node_entry);
283 np = rcu_next_node_entry(t);
284 rt_mutex_init_proxy_locked(&mtx, t);
285 t->rcu_boost_mutex = &mtx;
286 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED;
287 raw_local_irq_restore(flags);
288 rt_mutex_lock(&mtx);
289 RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++);
290 rcu_preempt_ctrlblk.boosted_this_gp++;
291 rt_mutex_unlock(&mtx);
292 return rcu_preempt_ctrlblk.boost_tasks != NULL;
293}
294
295/*
296 * Check to see if it is now time to start boosting RCU readers blocking
297 * the current grace period, and, if so, tell the rcu_kthread_task to
298 * start boosting them. If there is an expedited boost in progress,
299 * we wait for it to complete.
300 *
301 * If there are no blocked readers blocking the current grace period,
302 * return 0 to let the caller know, otherwise return 1. Note that this
303 * return value is independent of whether or not boosting was done.
304 */
305static int rcu_initiate_boost(void)
306{
307 if (!rcu_preempt_blocked_readers_cgp()) {
308 RCU_TRACE(rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks++);
309 return 0;
310 }
311 if (rcu_preempt_ctrlblk.gp_tasks != NULL &&
312 rcu_preempt_ctrlblk.boost_tasks == NULL &&
313 rcu_preempt_ctrlblk.boosted_this_gp == 0 &&
314 ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) {
315 rcu_preempt_ctrlblk.boost_tasks = rcu_preempt_ctrlblk.gp_tasks;
316 invoke_rcu_kthread();
317 RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++);
318 } else
319 RCU_TRACE(rcu_initiate_boost_trace());
320 return 1;
321}
322
323/*
324 * Initiate boosting for an expedited grace period.
325 */
326static void rcu_initiate_expedited_boost(void)
327{
328 unsigned long flags;
329
330 raw_local_irq_save(flags);
331 if (!list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) {
332 rcu_preempt_ctrlblk.boost_tasks =
333 rcu_preempt_ctrlblk.blkd_tasks.next;
334 rcu_preempt_ctrlblk.boosted_this_gp = -1;
335 invoke_rcu_kthread();
336 RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++);
337 } else
338 RCU_TRACE(rcu_initiate_exp_boost_trace());
339 raw_local_irq_restore(flags);
340}
341
342#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000);
343
344/*
345 * Do priority-boost accounting for the start of a new grace period.
346 */
347static void rcu_preempt_boost_start_gp(void)
348{
349 rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
350 if (rcu_preempt_ctrlblk.boosted_this_gp > 0)
351 rcu_preempt_ctrlblk.boosted_this_gp = 0;
352}
353
354#else /* #ifdef CONFIG_RCU_BOOST */
355
356/*
357 * If there is no RCU priority boosting, we don't boost.
358 */
359static int rcu_boost(void)
360{
361 return 0;
362}
363
364/*
365 * If there is no RCU priority boosting, we don't initiate boosting,
366 * but we do indicate whether there are blocked readers blocking the
367 * current grace period.
368 */
369static int rcu_initiate_boost(void)
370{
371 return rcu_preempt_blocked_readers_cgp();
372}
373
374/*
375 * If there is no RCU priority boosting, we don't initiate expedited boosting.
376 */
377static void rcu_initiate_expedited_boost(void)
378{
379}
380
381/*
382 * If there is no RCU priority boosting, nothing to do at grace-period start.
383 */
384static void rcu_preempt_boost_start_gp(void)
385{
386}
387
388#endif /* else #ifdef CONFIG_RCU_BOOST */
389
390/*
125 * Record a preemptible-RCU quiescent state for the specified CPU. Note 391 * Record a preemptible-RCU quiescent state for the specified CPU. Note
126 * that this just means that the task currently running on the CPU is 392 * that this just means that the task currently running on the CPU is
127 * in a quiescent state. There might be any number of tasks blocked 393 * in a quiescent state. There might be any number of tasks blocked
@@ -148,11 +414,14 @@ static void rcu_preempt_cpu_qs(void)
148 rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum; 414 rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum;
149 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; 415 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
150 416
417 /* If there is no GP then there is nothing more to do. */
418 if (!rcu_preempt_gp_in_progress())
419 return;
151 /* 420 /*
152 * If there is no GP, or if blocked readers are still blocking GP, 421 * Check up on boosting. If there are no readers blocking the
153 * then there is nothing more to do. 422 * current grace period, leave.
154 */ 423 */
155 if (!rcu_preempt_gp_in_progress() || rcu_preempt_blocked_readers_cgp()) 424 if (rcu_initiate_boost())
156 return; 425 return;
157 426
158 /* Advance callbacks. */ 427 /* Advance callbacks. */
@@ -164,9 +433,9 @@ static void rcu_preempt_cpu_qs(void)
164 if (!rcu_preempt_blocked_readers_any()) 433 if (!rcu_preempt_blocked_readers_any())
165 rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail; 434 rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail;
166 435
167 /* If there are done callbacks, make RCU_SOFTIRQ process them. */ 436 /* If there are done callbacks, cause them to be invoked. */
168 if (*rcu_preempt_ctrlblk.rcb.donetail != NULL) 437 if (*rcu_preempt_ctrlblk.rcb.donetail != NULL)
169 raise_softirq(RCU_SOFTIRQ); 438 invoke_rcu_kthread();
170} 439}
171 440
172/* 441/*
@@ -178,12 +447,16 @@ static void rcu_preempt_start_gp(void)
178 447
179 /* Official start of GP. */ 448 /* Official start of GP. */
180 rcu_preempt_ctrlblk.gpnum++; 449 rcu_preempt_ctrlblk.gpnum++;
450 RCU_TRACE(rcu_preempt_ctrlblk.n_grace_periods++);
181 451
182 /* Any blocked RCU readers block new GP. */ 452 /* Any blocked RCU readers block new GP. */
183 if (rcu_preempt_blocked_readers_any()) 453 if (rcu_preempt_blocked_readers_any())
184 rcu_preempt_ctrlblk.gp_tasks = 454 rcu_preempt_ctrlblk.gp_tasks =
185 rcu_preempt_ctrlblk.blkd_tasks.next; 455 rcu_preempt_ctrlblk.blkd_tasks.next;
186 456
457 /* Set up for RCU priority boosting. */
458 rcu_preempt_boost_start_gp();
459
187 /* If there is no running reader, CPU is done with GP. */ 460 /* If there is no running reader, CPU is done with GP. */
188 if (!rcu_preempt_running_reader()) 461 if (!rcu_preempt_running_reader())
189 rcu_preempt_cpu_qs(); 462 rcu_preempt_cpu_qs();
@@ -304,14 +577,16 @@ static void rcu_read_unlock_special(struct task_struct *t)
304 */ 577 */
305 empty = !rcu_preempt_blocked_readers_cgp(); 578 empty = !rcu_preempt_blocked_readers_cgp();
306 empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL; 579 empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL;
307 np = t->rcu_node_entry.next; 580 np = rcu_next_node_entry(t);
308 if (np == &rcu_preempt_ctrlblk.blkd_tasks)
309 np = NULL;
310 list_del(&t->rcu_node_entry); 581 list_del(&t->rcu_node_entry);
311 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks) 582 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks)
312 rcu_preempt_ctrlblk.gp_tasks = np; 583 rcu_preempt_ctrlblk.gp_tasks = np;
313 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks) 584 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks)
314 rcu_preempt_ctrlblk.exp_tasks = np; 585 rcu_preempt_ctrlblk.exp_tasks = np;
586#ifdef CONFIG_RCU_BOOST
587 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks)
588 rcu_preempt_ctrlblk.boost_tasks = np;
589#endif /* #ifdef CONFIG_RCU_BOOST */
315 INIT_LIST_HEAD(&t->rcu_node_entry); 590 INIT_LIST_HEAD(&t->rcu_node_entry);
316 591
317 /* 592 /*
@@ -331,6 +606,14 @@ static void rcu_read_unlock_special(struct task_struct *t)
331 if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL) 606 if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL)
332 rcu_report_exp_done(); 607 rcu_report_exp_done();
333 } 608 }
609#ifdef CONFIG_RCU_BOOST
610 /* Unboost self if was boosted. */
611 if (special & RCU_READ_UNLOCK_BOOSTED) {
612 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BOOSTED;
613 rt_mutex_unlock(t->rcu_boost_mutex);
614 t->rcu_boost_mutex = NULL;
615 }
616#endif /* #ifdef CONFIG_RCU_BOOST */
334 local_irq_restore(flags); 617 local_irq_restore(flags);
335} 618}
336 619
@@ -374,7 +657,7 @@ static void rcu_preempt_check_callbacks(void)
374 rcu_preempt_cpu_qs(); 657 rcu_preempt_cpu_qs();
375 if (&rcu_preempt_ctrlblk.rcb.rcucblist != 658 if (&rcu_preempt_ctrlblk.rcb.rcucblist !=
376 rcu_preempt_ctrlblk.rcb.donetail) 659 rcu_preempt_ctrlblk.rcb.donetail)
377 raise_softirq(RCU_SOFTIRQ); 660 invoke_rcu_kthread();
378 if (rcu_preempt_gp_in_progress() && 661 if (rcu_preempt_gp_in_progress() &&
379 rcu_cpu_blocking_cur_gp() && 662 rcu_cpu_blocking_cur_gp() &&
380 rcu_preempt_running_reader()) 663 rcu_preempt_running_reader())
@@ -383,7 +666,7 @@ static void rcu_preempt_check_callbacks(void)
383 666
384/* 667/*
385 * TINY_PREEMPT_RCU has an extra callback-list tail pointer to 668 * TINY_PREEMPT_RCU has an extra callback-list tail pointer to
386 * update, so this is invoked from __rcu_process_callbacks() to 669 * update, so this is invoked from rcu_process_callbacks() to
387 * handle that case. Of course, it is invoked for all flavors of 670 * handle that case. Of course, it is invoked for all flavors of
388 * RCU, but RCU callbacks can appear only on one of the lists, and 671 * RCU, but RCU callbacks can appear only on one of the lists, and
389 * neither ->nexttail nor ->donetail can possibly be NULL, so there 672 * neither ->nexttail nor ->donetail can possibly be NULL, so there
@@ -400,7 +683,7 @@ static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp)
400 */ 683 */
401static void rcu_preempt_process_callbacks(void) 684static void rcu_preempt_process_callbacks(void)
402{ 685{
403 __rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb); 686 rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb);
404} 687}
405 688
406/* 689/*
@@ -417,6 +700,7 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
417 local_irq_save(flags); 700 local_irq_save(flags);
418 *rcu_preempt_ctrlblk.nexttail = head; 701 *rcu_preempt_ctrlblk.nexttail = head;
419 rcu_preempt_ctrlblk.nexttail = &head->next; 702 rcu_preempt_ctrlblk.nexttail = &head->next;
703 RCU_TRACE(rcu_preempt_ctrlblk.rcb.qlen++);
420 rcu_preempt_start_gp(); /* checks to see if GP needed. */ 704 rcu_preempt_start_gp(); /* checks to see if GP needed. */
421 local_irq_restore(flags); 705 local_irq_restore(flags);
422} 706}
@@ -532,6 +816,7 @@ void synchronize_rcu_expedited(void)
532 816
533 /* Wait for tail of ->blkd_tasks list to drain. */ 817 /* Wait for tail of ->blkd_tasks list to drain. */
534 if (rcu_preempted_readers_exp()) 818 if (rcu_preempted_readers_exp())
819 rcu_initiate_expedited_boost();
535 wait_event(sync_rcu_preempt_exp_wq, 820 wait_event(sync_rcu_preempt_exp_wq,
536 !rcu_preempted_readers_exp()); 821 !rcu_preempted_readers_exp());
537 822
@@ -572,6 +857,27 @@ void exit_rcu(void)
572 857
573#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ 858#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */
574 859
860#ifdef CONFIG_RCU_TRACE
861
862/*
863 * Because preemptible RCU does not exist, it is not necessary to
864 * dump out its statistics.
865 */
866static void show_tiny_preempt_stats(struct seq_file *m)
867{
868}
869
870#endif /* #ifdef CONFIG_RCU_TRACE */
871
872/*
873 * Because preemptible RCU does not exist, it is never necessary to
874 * boost preempted RCU readers.
875 */
876static int rcu_boost(void)
877{
878 return 0;
879}
880
575/* 881/*
576 * Because preemptible RCU does not exist, it never has any callbacks 882 * Because preemptible RCU does not exist, it never has any callbacks
577 * to check. 883 * to check.
@@ -599,17 +905,116 @@ static void rcu_preempt_process_callbacks(void)
599#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */ 905#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */
600 906
601#ifdef CONFIG_DEBUG_LOCK_ALLOC 907#ifdef CONFIG_DEBUG_LOCK_ALLOC
602
603#include <linux/kernel_stat.h> 908#include <linux/kernel_stat.h>
604 909
605/* 910/*
606 * During boot, we forgive RCU lockdep issues. After this function is 911 * During boot, we forgive RCU lockdep issues. After this function is
607 * invoked, we start taking RCU lockdep issues seriously. 912 * invoked, we start taking RCU lockdep issues seriously.
608 */ 913 */
609void rcu_scheduler_starting(void) 914void __init rcu_scheduler_starting(void)
610{ 915{
611 WARN_ON(nr_context_switches() > 0); 916 WARN_ON(nr_context_switches() > 0);
612 rcu_scheduler_active = 1; 917 rcu_scheduler_active = 1;
613} 918}
614 919
615#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 920#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
921
922#ifdef CONFIG_RCU_BOOST
923#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
924#else /* #ifdef CONFIG_RCU_BOOST */
925#define RCU_BOOST_PRIO 1
926#endif /* #else #ifdef CONFIG_RCU_BOOST */
927
928#ifdef CONFIG_RCU_TRACE
929
930#ifdef CONFIG_RCU_BOOST
931
932static void rcu_initiate_boost_trace(void)
933{
934 if (rcu_preempt_ctrlblk.gp_tasks == NULL)
935 rcu_preempt_ctrlblk.n_normal_balk_gp_tasks++;
936 else if (rcu_preempt_ctrlblk.boost_tasks != NULL)
937 rcu_preempt_ctrlblk.n_normal_balk_boost_tasks++;
938 else if (rcu_preempt_ctrlblk.boosted_this_gp != 0)
939 rcu_preempt_ctrlblk.n_normal_balk_boosted++;
940 else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))
941 rcu_preempt_ctrlblk.n_normal_balk_notyet++;
942 else
943 rcu_preempt_ctrlblk.n_normal_balk_nos++;
944}
945
946static void rcu_initiate_exp_boost_trace(void)
947{
948 if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks))
949 rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks++;
950 else
951 rcu_preempt_ctrlblk.n_exp_balk_nos++;
952}
953
954#endif /* #ifdef CONFIG_RCU_BOOST */
955
956static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n)
957{
958 unsigned long flags;
959
960 raw_local_irq_save(flags);
961 rcp->qlen -= n;
962 raw_local_irq_restore(flags);
963}
964
965/*
966 * Dump statistics for TINY_RCU, such as they are.
967 */
968static int show_tiny_stats(struct seq_file *m, void *unused)
969{
970 show_tiny_preempt_stats(m);
971 seq_printf(m, "rcu_sched: qlen: %ld\n", rcu_sched_ctrlblk.qlen);
972 seq_printf(m, "rcu_bh: qlen: %ld\n", rcu_bh_ctrlblk.qlen);
973 return 0;
974}
975
976static int show_tiny_stats_open(struct inode *inode, struct file *file)
977{
978 return single_open(file, show_tiny_stats, NULL);
979}
980
981static const struct file_operations show_tiny_stats_fops = {
982 .owner = THIS_MODULE,
983 .open = show_tiny_stats_open,
984 .read = seq_read,
985 .llseek = seq_lseek,
986 .release = single_release,
987};
988
989static struct dentry *rcudir;
990
991static int __init rcutiny_trace_init(void)
992{
993 struct dentry *retval;
994
995 rcudir = debugfs_create_dir("rcu", NULL);
996 if (!rcudir)
997 goto free_out;
998 retval = debugfs_create_file("rcudata", 0444, rcudir,
999 NULL, &show_tiny_stats_fops);
1000 if (!retval)
1001 goto free_out;
1002 return 0;
1003free_out:
1004 debugfs_remove_recursive(rcudir);
1005 return 1;
1006}
1007
1008static void __exit rcutiny_trace_cleanup(void)
1009{
1010 debugfs_remove_recursive(rcudir);
1011}
1012
1013module_init(rcutiny_trace_init);
1014module_exit(rcutiny_trace_cleanup);
1015
1016MODULE_AUTHOR("Paul E. McKenney");
1017MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation");
1018MODULE_LICENSE("GPL");
1019
1020#endif /* #ifdef CONFIG_RCU_TRACE */
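
The CONFIG_RCU_TRACE block above is the usual debugfs plus seq_file plumbing: single_open() binds a show routine to a file and debugfs_create_file() publishes it under a directory. A minimal self-contained version of that plumbing with made-up names:

#include <linux/debugfs.h>
#include <linux/fs.h>
#include <linux/module.h>
#include <linux/seq_file.h>

static struct dentry *example_dir;
static unsigned long example_counter;

static int example_show(struct seq_file *m, void *unused)
{
	seq_printf(m, "counter: %lu\n", example_counter);
	return 0;
}

static int example_open(struct inode *inode, struct file *file)
{
	return single_open(file, example_show, NULL);
}

static const struct file_operations example_fops = {
	.owner		= THIS_MODULE,
	.open		= example_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};

static int __init example_trace_init(void)
{
	example_dir = debugfs_create_dir("example", NULL);
	if (!example_dir)
		return -ENOMEM;
	if (!debugfs_create_file("stats", 0444, example_dir, NULL,
				 &example_fops)) {
		debugfs_remove_recursive(example_dir);
		return -ENOMEM;
	}
	return 0;
}

static void __exit example_trace_exit(void)
{
	debugfs_remove_recursive(example_dir);
}

module_init(example_trace_init);
module_exit(example_trace_exit);
MODULE_LICENSE("GPL");
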
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 9d8e8fb2515f..89613f97ff26 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -47,6 +47,7 @@
47#include <linux/srcu.h> 47#include <linux/srcu.h>
48#include <linux/slab.h> 48#include <linux/slab.h>
49#include <asm/byteorder.h> 49#include <asm/byteorder.h>
50#include <linux/sched.h>
50 51
51MODULE_LICENSE("GPL"); 52MODULE_LICENSE("GPL");
52MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " 53MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and "
@@ -64,6 +65,9 @@ static int irqreader = 1; /* RCU readers from irq (timers). */
64static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */ 65static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */
65static int fqs_holdoff = 0; /* Hold time within burst (us). */ 66static int fqs_holdoff = 0; /* Hold time within burst (us). */
66static int fqs_stutter = 3; /* Wait time between bursts (s). */ 67static int fqs_stutter = 3; /* Wait time between bursts (s). */
68static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */
69static int test_boost_interval = 7; /* Interval between boost tests, seconds. */
70static int test_boost_duration = 4; /* Duration of each boost test, seconds. */
67static char *torture_type = "rcu"; /* What RCU implementation to torture. */ 71static char *torture_type = "rcu"; /* What RCU implementation to torture. */
68 72
69module_param(nreaders, int, 0444); 73module_param(nreaders, int, 0444);
@@ -88,6 +92,12 @@ module_param(fqs_holdoff, int, 0444);
88MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); 92MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)");
89module_param(fqs_stutter, int, 0444); 93module_param(fqs_stutter, int, 0444);
90MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); 94MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
95module_param(test_boost, int, 0444);
96MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
97module_param(test_boost_interval, int, 0444);
98MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds.");
99module_param(test_boost_duration, int, 0444);
100MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds.");
91module_param(torture_type, charp, 0444); 101module_param(torture_type, charp, 0444);
92MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); 102MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)");
93 103
@@ -109,6 +119,7 @@ static struct task_struct *stats_task;
109static struct task_struct *shuffler_task; 119static struct task_struct *shuffler_task;
110static struct task_struct *stutter_task; 120static struct task_struct *stutter_task;
111static struct task_struct *fqs_task; 121static struct task_struct *fqs_task;
122static struct task_struct *boost_tasks[NR_CPUS];
112 123
113#define RCU_TORTURE_PIPE_LEN 10 124#define RCU_TORTURE_PIPE_LEN 10
114 125
@@ -134,6 +145,12 @@ static atomic_t n_rcu_torture_alloc_fail;
134static atomic_t n_rcu_torture_free; 145static atomic_t n_rcu_torture_free;
135static atomic_t n_rcu_torture_mberror; 146static atomic_t n_rcu_torture_mberror;
136static atomic_t n_rcu_torture_error; 147static atomic_t n_rcu_torture_error;
148static long n_rcu_torture_boost_ktrerror;
149static long n_rcu_torture_boost_rterror;
150static long n_rcu_torture_boost_allocerror;
151static long n_rcu_torture_boost_afferror;
152static long n_rcu_torture_boost_failure;
153static long n_rcu_torture_boosts;
137static long n_rcu_torture_timers; 154static long n_rcu_torture_timers;
138static struct list_head rcu_torture_removed; 155static struct list_head rcu_torture_removed;
139static cpumask_var_t shuffle_tmp_mask; 156static cpumask_var_t shuffle_tmp_mask;
@@ -147,6 +164,16 @@ static int stutter_pause_test;
147#endif 164#endif
148int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; 165int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
149 166
167#ifdef CONFIG_RCU_BOOST
168#define rcu_can_boost() 1
169#else /* #ifdef CONFIG_RCU_BOOST */
170#define rcu_can_boost() 0
171#endif /* #else #ifdef CONFIG_RCU_BOOST */
172
173static unsigned long boost_starttime; /* jiffies of next boost test start. */
174DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
175 /* and boost task create/destroy. */
176
150/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ 177/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */
151 178
152#define FULLSTOP_DONTSTOP 0 /* Normal operation. */ 179#define FULLSTOP_DONTSTOP 0 /* Normal operation. */
@@ -277,6 +304,7 @@ struct rcu_torture_ops {
277 void (*fqs)(void); 304 void (*fqs)(void);
278 int (*stats)(char *page); 305 int (*stats)(char *page);
279 int irq_capable; 306 int irq_capable;
307 int can_boost;
280 char *name; 308 char *name;
281}; 309};
282 310
@@ -366,6 +394,7 @@ static struct rcu_torture_ops rcu_ops = {
366 .fqs = rcu_force_quiescent_state, 394 .fqs = rcu_force_quiescent_state,
367 .stats = NULL, 395 .stats = NULL,
368 .irq_capable = 1, 396 .irq_capable = 1,
397 .can_boost = rcu_can_boost(),
369 .name = "rcu" 398 .name = "rcu"
370}; 399};
371 400
@@ -408,6 +437,7 @@ static struct rcu_torture_ops rcu_sync_ops = {
408 .fqs = rcu_force_quiescent_state, 437 .fqs = rcu_force_quiescent_state,
409 .stats = NULL, 438 .stats = NULL,
410 .irq_capable = 1, 439 .irq_capable = 1,
440 .can_boost = rcu_can_boost(),
411 .name = "rcu_sync" 441 .name = "rcu_sync"
412}; 442};
413 443
@@ -424,6 +454,7 @@ static struct rcu_torture_ops rcu_expedited_ops = {
424 .fqs = rcu_force_quiescent_state, 454 .fqs = rcu_force_quiescent_state,
425 .stats = NULL, 455 .stats = NULL,
426 .irq_capable = 1, 456 .irq_capable = 1,
457 .can_boost = rcu_can_boost(),
427 .name = "rcu_expedited" 458 .name = "rcu_expedited"
428}; 459};
429 460
@@ -684,6 +715,110 @@ static struct rcu_torture_ops sched_expedited_ops = {
684}; 715};
685 716
686/* 717/*
718 * RCU torture priority-boost testing. Runs one real-time thread per
719 * CPU for moderate bursts, repeatedly registering RCU callbacks and
720 * spinning waiting for them to be invoked. If a given callback takes
721 * too long to be invoked, we assume that priority inversion has occurred.
722 */
723
724struct rcu_boost_inflight {
725 struct rcu_head rcu;
726 int inflight;
727};
728
729static void rcu_torture_boost_cb(struct rcu_head *head)
730{
731 struct rcu_boost_inflight *rbip =
732 container_of(head, struct rcu_boost_inflight, rcu);
733
734 smp_mb(); /* Ensure RCU-core accesses precede clearing ->inflight */
735 rbip->inflight = 0;
736}
737
738static int rcu_torture_boost(void *arg)
739{
740 unsigned long call_rcu_time;
741 unsigned long endtime;
742 unsigned long oldstarttime;
743 struct rcu_boost_inflight rbi = { .inflight = 0 };
744 struct sched_param sp;
745
746 VERBOSE_PRINTK_STRING("rcu_torture_boost started");
747
748 /* Set real-time priority. */
749 sp.sched_priority = 1;
750 if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) {
751 VERBOSE_PRINTK_STRING("rcu_torture_boost RT prio failed!");
752 n_rcu_torture_boost_rterror++;
753 }
754
755 /* Each pass through the following loop does one boost-test cycle. */
756 do {
757 /* Wait for the next test interval. */
758 oldstarttime = boost_starttime;
759 while (jiffies - oldstarttime > ULONG_MAX / 2) {
760 schedule_timeout_uninterruptible(1);
761 rcu_stutter_wait("rcu_torture_boost");
762 if (kthread_should_stop() ||
763 fullstop != FULLSTOP_DONTSTOP)
764 goto checkwait;
765 }
766
767 /* Do one boost-test interval. */
768 endtime = oldstarttime + test_boost_duration * HZ;
769 call_rcu_time = jiffies;
770 while (jiffies - endtime > ULONG_MAX / 2) {
771 /* If we don't have a callback in flight, post one. */
772 if (!rbi.inflight) {
773 smp_mb(); /* RCU core before ->inflight = 1. */
774 rbi.inflight = 1;
775 call_rcu(&rbi.rcu, rcu_torture_boost_cb);
776 if (jiffies - call_rcu_time >
777 test_boost_duration * HZ - HZ / 2) {
778 VERBOSE_PRINTK_STRING("rcu_torture_boost boosting failed");
779 n_rcu_torture_boost_failure++;
780 }
781 call_rcu_time = jiffies;
782 }
783 cond_resched();
784 rcu_stutter_wait("rcu_torture_boost");
785 if (kthread_should_stop() ||
786 fullstop != FULLSTOP_DONTSTOP)
787 goto checkwait;
788 }
789
790 /*
791 * Set the start time of the next test interval.
792 * Yes, this is vulnerable to long delays, but such
793 * delays simply cause a false negative for the next
794 * interval. Besides, we are running at RT priority,
795 * so delays should be relatively rare.
796 */
797 while (oldstarttime == boost_starttime) {
798 if (mutex_trylock(&boost_mutex)) {
799 boost_starttime = jiffies +
800 test_boost_interval * HZ;
801 n_rcu_torture_boosts++;
802 mutex_unlock(&boost_mutex);
803 break;
804 }
805 schedule_timeout_uninterruptible(1);
806 }
807
808 /* Go do the stutter. */
809checkwait: rcu_stutter_wait("rcu_torture_boost");
810 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
811
812 /* Clean up and exit. */
813 VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping");
814 rcutorture_shutdown_absorb("rcu_torture_boost");
815 while (!kthread_should_stop() || rbi.inflight)
816 schedule_timeout_uninterruptible(1);
817 smp_mb(); /* order accesses to ->inflight before stack-frame death. */
818 return 0;
819}
820
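The two wait loops above compare times as "jiffies - X > ULONG_MAX / 2" rather than "jiffies < X": with unsigned arithmetic the subtraction wraps to a huge value exactly while jiffies has not yet reached X, so the test stays correct across a counter wrap (the kernel's time_before() helper expresses the same idea). A minimal stand-alone sketch of the idiom, with illustrative names not taken from the patch:

#include <limits.h>

/* True while "now" has not yet reached "deadline", even across a wrap. */
static inline int still_before(unsigned long now, unsigned long deadline)
{
	return now - deadline > ULONG_MAX / 2;
}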
821/*
687 * RCU torture force-quiescent-state kthread. Repeatedly induces 822 * RCU torture force-quiescent-state kthread. Repeatedly induces
688 * bursts of calls to force_quiescent_state(), increasing the probability 823 * bursts of calls to force_quiescent_state(), increasing the probability
689 * of occurrence of some important types of race conditions. 824 * of occurrence of some important types of race conditions.
@@ -933,7 +1068,8 @@ rcu_torture_printk(char *page)
933 cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); 1068 cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG);
934 cnt += sprintf(&page[cnt], 1069 cnt += sprintf(&page[cnt],
935 "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " 1070 "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d "
936 "rtmbe: %d nt: %ld", 1071 "rtmbe: %d rtbke: %ld rtbre: %ld rtbae: %ld rtbafe: %ld "
1072 "rtbf: %ld rtb: %ld nt: %ld",
937 rcu_torture_current, 1073 rcu_torture_current,
938 rcu_torture_current_version, 1074 rcu_torture_current_version,
939 list_empty(&rcu_torture_freelist), 1075 list_empty(&rcu_torture_freelist),
@@ -941,8 +1077,19 @@ rcu_torture_printk(char *page)
941 atomic_read(&n_rcu_torture_alloc_fail), 1077 atomic_read(&n_rcu_torture_alloc_fail),
942 atomic_read(&n_rcu_torture_free), 1078 atomic_read(&n_rcu_torture_free),
943 atomic_read(&n_rcu_torture_mberror), 1079 atomic_read(&n_rcu_torture_mberror),
1080 n_rcu_torture_boost_ktrerror,
1081 n_rcu_torture_boost_rterror,
1082 n_rcu_torture_boost_allocerror,
1083 n_rcu_torture_boost_afferror,
1084 n_rcu_torture_boost_failure,
1085 n_rcu_torture_boosts,
944 n_rcu_torture_timers); 1086 n_rcu_torture_timers);
945 if (atomic_read(&n_rcu_torture_mberror) != 0) 1087 if (atomic_read(&n_rcu_torture_mberror) != 0 ||
1088 n_rcu_torture_boost_ktrerror != 0 ||
1089 n_rcu_torture_boost_rterror != 0 ||
1090 n_rcu_torture_boost_allocerror != 0 ||
1091 n_rcu_torture_boost_afferror != 0 ||
1092 n_rcu_torture_boost_failure != 0)
946 cnt += sprintf(&page[cnt], " !!!"); 1093 cnt += sprintf(&page[cnt], " !!!");
947 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); 1094 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
948 if (i > 1) { 1095 if (i > 1) {
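For reference, the new fields in the stats line pair up positionally with the new counters: rtbke with n_rcu_torture_boost_ktrerror (boost-kthread creation failures), rtbre with n_rcu_torture_boost_rterror (failures to reach real-time priority), rtbae with n_rcu_torture_boost_allocerror and rtbafe with n_rcu_torture_boost_afferror (presumably allocation and CPU-affinity problems while setting up boosting), rtbf with n_rcu_torture_boost_failure (callbacks that took too long, i.e. suspected priority inversion), and rtb with n_rcu_torture_boosts (boost-test intervals run). Any nonzero error counter now also triggers the "!!!" marker.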
@@ -1094,22 +1241,91 @@ rcu_torture_stutter(void *arg)
1094} 1241}
1095 1242
1096static inline void 1243static inline void
1097rcu_torture_print_module_parms(char *tag) 1244rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag)
1098{ 1245{
1099 printk(KERN_ALERT "%s" TORTURE_FLAG 1246 printk(KERN_ALERT "%s" TORTURE_FLAG
1100 "--- %s: nreaders=%d nfakewriters=%d " 1247 "--- %s: nreaders=%d nfakewriters=%d "
1101 "stat_interval=%d verbose=%d test_no_idle_hz=%d " 1248 "stat_interval=%d verbose=%d test_no_idle_hz=%d "
1102 "shuffle_interval=%d stutter=%d irqreader=%d " 1249 "shuffle_interval=%d stutter=%d irqreader=%d "
1103 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d\n", 1250 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d "
1251 "test_boost=%d/%d test_boost_interval=%d "
1252 "test_boost_duration=%d\n",
1104 torture_type, tag, nrealreaders, nfakewriters, 1253 torture_type, tag, nrealreaders, nfakewriters,
1105 stat_interval, verbose, test_no_idle_hz, shuffle_interval, 1254 stat_interval, verbose, test_no_idle_hz, shuffle_interval,
1106 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter); 1255 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
1256 test_boost, cur_ops->can_boost,
1257 test_boost_interval, test_boost_duration);
1107} 1258}
1108 1259
1109static struct notifier_block rcutorture_nb = { 1260static struct notifier_block rcutorture_shutdown_nb = {
1110 .notifier_call = rcutorture_shutdown_notify, 1261 .notifier_call = rcutorture_shutdown_notify,
1111}; 1262};
1112 1263
1264static void rcutorture_booster_cleanup(int cpu)
1265{
1266 struct task_struct *t;
1267
1268 if (boost_tasks[cpu] == NULL)
1269 return;
1270 mutex_lock(&boost_mutex);
1271 VERBOSE_PRINTK_STRING("Stopping rcu_torture_boost task");
1272 t = boost_tasks[cpu];
1273 boost_tasks[cpu] = NULL;
1274 mutex_unlock(&boost_mutex);
1275
1276 /* This must be outside of the mutex, otherwise deadlock! */
1277 kthread_stop(t);
1278}
1279
1280static int rcutorture_booster_init(int cpu)
1281{
1282 int retval;
1283
1284 if (boost_tasks[cpu] != NULL)
1285 return 0; /* Already created, nothing more to do. */
1286
1287 /* Don't allow time recalculation while creating a new task. */
1288 mutex_lock(&boost_mutex);
1289 VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task");
1290 boost_tasks[cpu] = kthread_create(rcu_torture_boost, NULL,
1291 "rcu_torture_boost");
1292 if (IS_ERR(boost_tasks[cpu])) {
1293 retval = PTR_ERR(boost_tasks[cpu]);
1294 VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed");
1295 n_rcu_torture_boost_ktrerror++;
1296 boost_tasks[cpu] = NULL;
1297 mutex_unlock(&boost_mutex);
1298 return retval;
1299 }
1300 kthread_bind(boost_tasks[cpu], cpu);
1301 wake_up_process(boost_tasks[cpu]);
1302 mutex_unlock(&boost_mutex);
1303 return 0;
1304}
1305
1306static int rcutorture_cpu_notify(struct notifier_block *self,
1307 unsigned long action, void *hcpu)
1308{
1309 long cpu = (long)hcpu;
1310
1311 switch (action) {
1312 case CPU_ONLINE:
1313 case CPU_DOWN_FAILED:
1314 (void)rcutorture_booster_init(cpu);
1315 break;
1316 case CPU_DOWN_PREPARE:
1317 rcutorture_booster_cleanup(cpu);
1318 break;
1319 default:
1320 break;
1321 }
1322 return NOTIFY_OK;
1323}
1324
1325static struct notifier_block rcutorture_cpu_nb = {
1326 .notifier_call = rcutorture_cpu_notify,
1327};
1328
1113static void 1329static void
1114rcu_torture_cleanup(void) 1330rcu_torture_cleanup(void)
1115{ 1331{
@@ -1127,7 +1343,7 @@ rcu_torture_cleanup(void)
1127 } 1343 }
1128 fullstop = FULLSTOP_RMMOD; 1344 fullstop = FULLSTOP_RMMOD;
1129 mutex_unlock(&fullstop_mutex); 1345 mutex_unlock(&fullstop_mutex);
1130 unregister_reboot_notifier(&rcutorture_nb); 1346 unregister_reboot_notifier(&rcutorture_shutdown_nb);
1131 if (stutter_task) { 1347 if (stutter_task) {
1132 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); 1348 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task");
1133 kthread_stop(stutter_task); 1349 kthread_stop(stutter_task);
@@ -1184,6 +1400,12 @@ rcu_torture_cleanup(void)
1184 kthread_stop(fqs_task); 1400 kthread_stop(fqs_task);
1185 } 1401 }
1186 fqs_task = NULL; 1402 fqs_task = NULL;
1403 if ((test_boost == 1 && cur_ops->can_boost) ||
1404 test_boost == 2) {
1405 unregister_cpu_notifier(&rcutorture_cpu_nb);
1406 for_each_possible_cpu(i)
1407 rcutorture_booster_cleanup(i);
1408 }
1187 1409
1188 /* Wait for all RCU callbacks to fire. */ 1410 /* Wait for all RCU callbacks to fire. */
1189 1411
@@ -1195,9 +1417,9 @@ rcu_torture_cleanup(void)
1195 if (cur_ops->cleanup) 1417 if (cur_ops->cleanup)
1196 cur_ops->cleanup(); 1418 cur_ops->cleanup();
1197 if (atomic_read(&n_rcu_torture_error)) 1419 if (atomic_read(&n_rcu_torture_error))
1198 rcu_torture_print_module_parms("End of test: FAILURE"); 1420 rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE");
1199 else 1421 else
1200 rcu_torture_print_module_parms("End of test: SUCCESS"); 1422 rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS");
1201} 1423}
1202 1424
1203static int __init 1425static int __init
@@ -1242,7 +1464,7 @@ rcu_torture_init(void)
1242 nrealreaders = nreaders; 1464 nrealreaders = nreaders;
1243 else 1465 else
1244 nrealreaders = 2 * num_online_cpus(); 1466 nrealreaders = 2 * num_online_cpus();
1245 rcu_torture_print_module_parms("Start of test"); 1467 rcu_torture_print_module_parms(cur_ops, "Start of test");
1246 fullstop = FULLSTOP_DONTSTOP; 1468 fullstop = FULLSTOP_DONTSTOP;
1247 1469
1248 /* Set up the freelist. */ 1470 /* Set up the freelist. */
@@ -1263,6 +1485,12 @@ rcu_torture_init(void)
1263 atomic_set(&n_rcu_torture_free, 0); 1485 atomic_set(&n_rcu_torture_free, 0);
1264 atomic_set(&n_rcu_torture_mberror, 0); 1486 atomic_set(&n_rcu_torture_mberror, 0);
1265 atomic_set(&n_rcu_torture_error, 0); 1487 atomic_set(&n_rcu_torture_error, 0);
1488 n_rcu_torture_boost_ktrerror = 0;
1489 n_rcu_torture_boost_rterror = 0;
1490 n_rcu_torture_boost_allocerror = 0;
1491 n_rcu_torture_boost_afferror = 0;
1492 n_rcu_torture_boost_failure = 0;
1493 n_rcu_torture_boosts = 0;
1266 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) 1494 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
1267 atomic_set(&rcu_torture_wcount[i], 0); 1495 atomic_set(&rcu_torture_wcount[i], 0);
1268 for_each_possible_cpu(cpu) { 1496 for_each_possible_cpu(cpu) {
@@ -1376,7 +1604,27 @@ rcu_torture_init(void)
1376 goto unwind; 1604 goto unwind;
1377 } 1605 }
1378 } 1606 }
1379 register_reboot_notifier(&rcutorture_nb); 1607 if (test_boost_interval < 1)
1608 test_boost_interval = 1;
1609 if (test_boost_duration < 2)
1610 test_boost_duration = 2;
1611 if ((test_boost == 1 && cur_ops->can_boost) ||
1612 test_boost == 2) {
1613 int retval;
1614
1615 boost_starttime = jiffies + test_boost_interval * HZ;
1616 register_cpu_notifier(&rcutorture_cpu_nb);
1617 for_each_possible_cpu(i) {
1618 if (cpu_is_offline(i))
1619 continue; /* Heuristic: CPU can go offline. */
1620 retval = rcutorture_booster_init(i);
1621 if (retval < 0) {
1622 firsterr = retval;
1623 goto unwind;
1624 }
1625 }
1626 }
1627 register_reboot_notifier(&rcutorture_shutdown_nb);
1380 mutex_unlock(&fullstop_mutex); 1628 mutex_unlock(&fullstop_mutex);
1381 return 0; 1629 return 0;
1382 1630
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index ccdc04c47981..dd4aea806f8e 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -67,9 +67,6 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
67 .gpnum = -300, \ 67 .gpnum = -300, \
68 .completed = -300, \ 68 .completed = -300, \
69 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \ 69 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \
70 .orphan_cbs_list = NULL, \
71 .orphan_cbs_tail = &structname.orphan_cbs_list, \
72 .orphan_qlen = 0, \
73 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \ 70 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \
74 .n_force_qs = 0, \ 71 .n_force_qs = 0, \
75 .n_force_qs_ngp = 0, \ 72 .n_force_qs_ngp = 0, \
@@ -367,8 +364,8 @@ void rcu_irq_exit(void)
367 WARN_ON_ONCE(rdtp->dynticks & 0x1); 364 WARN_ON_ONCE(rdtp->dynticks & 0x1);
368 365
369 /* If the interrupt queued a callback, get out of dyntick mode. */ 366 /* If the interrupt queued a callback, get out of dyntick mode. */
370 if (__get_cpu_var(rcu_sched_data).nxtlist || 367 if (__this_cpu_read(rcu_sched_data.nxtlist) ||
371 __get_cpu_var(rcu_bh_data).nxtlist) 368 __this_cpu_read(rcu_bh_data.nxtlist))
372 set_need_resched(); 369 set_need_resched();
373} 370}
374 371
@@ -620,9 +617,17 @@ static void __init check_cpu_stall_init(void)
620static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) 617static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
621{ 618{
622 if (rdp->gpnum != rnp->gpnum) { 619 if (rdp->gpnum != rnp->gpnum) {
623 rdp->qs_pending = 1; 620 /*
624 rdp->passed_quiesc = 0; 621 * If the current grace period is waiting for this CPU,
622 * set up to detect a quiescent state, otherwise don't
623 * go looking for one.
624 */
625 rdp->gpnum = rnp->gpnum; 625 rdp->gpnum = rnp->gpnum;
626 if (rnp->qsmask & rdp->grpmask) {
627 rdp->qs_pending = 1;
628 rdp->passed_quiesc = 0;
629 } else
630 rdp->qs_pending = 0;
626 } 631 }
627} 632}
628 633
@@ -681,6 +686,24 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat
681 686
682 /* Remember that we saw this grace-period completion. */ 687 /* Remember that we saw this grace-period completion. */
683 rdp->completed = rnp->completed; 688 rdp->completed = rnp->completed;
689
690 /*
691 * If we were in an extended quiescent state, we may have
 692 * missed some grace periods that other CPUs handled on
693 * our behalf. Catch up with this state to avoid noting
694 * spurious new grace periods. If another grace period
695 * has started, then rnp->gpnum will have advanced, so
696 * we will detect this later on.
697 */
698 if (ULONG_CMP_LT(rdp->gpnum, rdp->completed))
699 rdp->gpnum = rdp->completed;
700
701 /*
702 * If RCU does not need a quiescent state from this CPU,
703 * then make sure that this CPU doesn't go looking for one.
704 */
705 if ((rnp->qsmask & rdp->grpmask) == 0)
706 rdp->qs_pending = 0;
684 } 707 }
685} 708}
686 709
@@ -984,53 +1007,31 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
984#ifdef CONFIG_HOTPLUG_CPU 1007#ifdef CONFIG_HOTPLUG_CPU
985 1008
986/* 1009/*
987 * Move a dying CPU's RCU callbacks to the ->orphan_cbs_list for the 1010 * Move a dying CPU's RCU callbacks to an online CPU's callback list.
988 * specified flavor of RCU. The callbacks will be adopted by the next 1011 * Synchronization is not required because this function executes
989 * _rcu_barrier() invocation or by the CPU_DEAD notifier, whichever 1012 * in stop_machine() context.
990 * comes first. Because this is invoked from the CPU_DYING notifier,
991 * irqs are already disabled.
992 */ 1013 */
993static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) 1014static void rcu_send_cbs_to_online(struct rcu_state *rsp)
994{ 1015{
995 int i; 1016 int i;
1017 /* current DYING CPU is cleared in the cpu_online_mask */
1018 int receive_cpu = cpumask_any(cpu_online_mask);
996 struct rcu_data *rdp = this_cpu_ptr(rsp->rda); 1019 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
1020 struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu);
997 1021
998 if (rdp->nxtlist == NULL) 1022 if (rdp->nxtlist == NULL)
999 return; /* irqs disabled, so comparison is stable. */ 1023 return; /* irqs disabled, so comparison is stable. */
1000 raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ 1024
1001 *rsp->orphan_cbs_tail = rdp->nxtlist; 1025 *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist;
1002 rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL]; 1026 receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
1027 receive_rdp->qlen += rdp->qlen;
1028 receive_rdp->n_cbs_adopted += rdp->qlen;
1029 rdp->n_cbs_orphaned += rdp->qlen;
1030
1003 rdp->nxtlist = NULL; 1031 rdp->nxtlist = NULL;
1004 for (i = 0; i < RCU_NEXT_SIZE; i++) 1032 for (i = 0; i < RCU_NEXT_SIZE; i++)
1005 rdp->nxttail[i] = &rdp->nxtlist; 1033 rdp->nxttail[i] = &rdp->nxtlist;
1006 rsp->orphan_qlen += rdp->qlen;
1007 rdp->n_cbs_orphaned += rdp->qlen;
1008 rdp->qlen = 0; 1034 rdp->qlen = 0;
1009 raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
1010}
1011
1012/*
1013 * Adopt previously orphaned RCU callbacks.
1014 */
1015static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
1016{
1017 unsigned long flags;
1018 struct rcu_data *rdp;
1019
1020 raw_spin_lock_irqsave(&rsp->onofflock, flags);
1021 rdp = this_cpu_ptr(rsp->rda);
1022 if (rsp->orphan_cbs_list == NULL) {
1023 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
1024 return;
1025 }
1026 *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list;
1027 rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail;
1028 rdp->qlen += rsp->orphan_qlen;
1029 rdp->n_cbs_adopted += rsp->orphan_qlen;
1030 rsp->orphan_cbs_list = NULL;
1031 rsp->orphan_cbs_tail = &rsp->orphan_cbs_list;
1032 rsp->orphan_qlen = 0;
1033 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
1034} 1035}
1035 1036
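rcu_send_cbs_to_online() moves the whole queue with two pointer assignments: it hooks the dying CPU's list onto the receiver's NULL-terminating tail pointer and then adopts the dying CPU's tail, so the splice is O(1) no matter how many callbacks are pending. A self-contained sketch of that tail-pointer splice, with generic types and assumed names rather than the patch's rcu_data fields:

#include <stddef.h>

struct cb {
	struct cb *next;
};

struct cblist {
	struct cb *head;
	struct cb **tail;	/* points at the pointer that holds the terminating NULL */
};

static void cblist_splice(struct cblist *dst, struct cblist *src)
{
	if (src->head == NULL)
		return;
	*dst->tail = src->head;	/* hook the source list after the destination's last node */
	dst->tail = src->tail;	/* the destination's tail is now the source's old tail */
	src->head = NULL;	/* leave the source empty and self-consistent */
	src->tail = &src->head;
}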
1036/* 1037/*
@@ -1081,8 +1082,6 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
1081 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1082 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1082 if (need_report & RCU_OFL_TASKS_EXP_GP) 1083 if (need_report & RCU_OFL_TASKS_EXP_GP)
1083 rcu_report_exp_rnp(rsp, rnp); 1084 rcu_report_exp_rnp(rsp, rnp);
1084
1085 rcu_adopt_orphan_cbs(rsp);
1086} 1085}
1087 1086
1088/* 1087/*
@@ -1100,11 +1099,7 @@ static void rcu_offline_cpu(int cpu)
1100 1099
1101#else /* #ifdef CONFIG_HOTPLUG_CPU */ 1100#else /* #ifdef CONFIG_HOTPLUG_CPU */
1102 1101
1103static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) 1102static void rcu_send_cbs_to_online(struct rcu_state *rsp)
1104{
1105}
1106
1107static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
1108{ 1103{
1109} 1104}
1110 1105
@@ -1440,22 +1435,11 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1440 */ 1435 */
1441 local_irq_save(flags); 1436 local_irq_save(flags);
1442 rdp = this_cpu_ptr(rsp->rda); 1437 rdp = this_cpu_ptr(rsp->rda);
1443 rcu_process_gp_end(rsp, rdp);
1444 check_for_new_grace_period(rsp, rdp);
1445 1438
1446 /* Add the callback to our list. */ 1439 /* Add the callback to our list. */
1447 *rdp->nxttail[RCU_NEXT_TAIL] = head; 1440 *rdp->nxttail[RCU_NEXT_TAIL] = head;
1448 rdp->nxttail[RCU_NEXT_TAIL] = &head->next; 1441 rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
1449 1442
1450 /* Start a new grace period if one not already started. */
1451 if (!rcu_gp_in_progress(rsp)) {
1452 unsigned long nestflag;
1453 struct rcu_node *rnp_root = rcu_get_root(rsp);
1454
1455 raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
1456 rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */
1457 }
1458
1459 /* 1443 /*
1460 * Force the grace period if too many callbacks or too long waiting. 1444 * Force the grace period if too many callbacks or too long waiting.
1461 * Enforce hysteresis, and don't invoke force_quiescent_state() 1445 * Enforce hysteresis, and don't invoke force_quiescent_state()
@@ -1464,12 +1448,27 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1464 * is the only one waiting for a grace period to complete. 1448 * is the only one waiting for a grace period to complete.
1465 */ 1449 */
1466 if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { 1450 if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
1467 rdp->blimit = LONG_MAX; 1451
1468 if (rsp->n_force_qs == rdp->n_force_qs_snap && 1452 /* Are we ignoring a completed grace period? */
1469 *rdp->nxttail[RCU_DONE_TAIL] != head) 1453 rcu_process_gp_end(rsp, rdp);
1470 force_quiescent_state(rsp, 0); 1454 check_for_new_grace_period(rsp, rdp);
1471 rdp->n_force_qs_snap = rsp->n_force_qs; 1455
1472 rdp->qlen_last_fqs_check = rdp->qlen; 1456 /* Start a new grace period if one not already started. */
1457 if (!rcu_gp_in_progress(rsp)) {
1458 unsigned long nestflag;
1459 struct rcu_node *rnp_root = rcu_get_root(rsp);
1460
1461 raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
 1462 rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock */
1463 } else {
1464 /* Give the grace period a kick. */
1465 rdp->blimit = LONG_MAX;
1466 if (rsp->n_force_qs == rdp->n_force_qs_snap &&
1467 *rdp->nxttail[RCU_DONE_TAIL] != head)
1468 force_quiescent_state(rsp, 0);
1469 rdp->n_force_qs_snap = rsp->n_force_qs;
1470 rdp->qlen_last_fqs_check = rdp->qlen;
1471 }
1473 } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) 1472 } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
1474 force_quiescent_state(rsp, 1); 1473 force_quiescent_state(rsp, 1);
1475 local_irq_restore(flags); 1474 local_irq_restore(flags);
@@ -1699,13 +1698,12 @@ static void _rcu_barrier(struct rcu_state *rsp,
1699 * decrement rcu_barrier_cpu_count -- otherwise the first CPU 1698 * decrement rcu_barrier_cpu_count -- otherwise the first CPU
1700 * might complete its grace period before all of the other CPUs 1699 * might complete its grace period before all of the other CPUs
1701 * did their increment, causing this function to return too 1700 * did their increment, causing this function to return too
1702 * early. 1701 * early. Note that on_each_cpu() disables irqs, which prevents
1702 * any CPUs from coming online or going offline until each online
1703 * CPU has queued its RCU-barrier callback.
1703 */ 1704 */
1704 atomic_set(&rcu_barrier_cpu_count, 1); 1705 atomic_set(&rcu_barrier_cpu_count, 1);
1705 preempt_disable(); /* stop CPU_DYING from filling orphan_cbs_list */
1706 rcu_adopt_orphan_cbs(rsp);
1707 on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1); 1706 on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1);
1708 preempt_enable(); /* CPU_DYING can again fill orphan_cbs_list */
1709 if (atomic_dec_and_test(&rcu_barrier_cpu_count)) 1707 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
1710 complete(&rcu_barrier_completion); 1708 complete(&rcu_barrier_completion);
1711 wait_for_completion(&rcu_barrier_completion); 1709 wait_for_completion(&rcu_barrier_completion);
@@ -1831,18 +1829,13 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1831 case CPU_DYING: 1829 case CPU_DYING:
1832 case CPU_DYING_FROZEN: 1830 case CPU_DYING_FROZEN:
1833 /* 1831 /*
1834 * preempt_disable() in _rcu_barrier() prevents stop_machine(), 1832 * The whole machine is "stopped" except this CPU, so we can
1835 * so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);" 1833 * touch any data without introducing corruption. We send the
1836 * returns, all online cpus have queued rcu_barrier_func(). 1834 * dying CPU's callbacks to an arbitrarily chosen online CPU.
1837 * The dying CPU clears its cpu_online_mask bit and
1838 * moves all of its RCU callbacks to ->orphan_cbs_list
1839 * in the context of stop_machine(), so subsequent calls
1840 * to _rcu_barrier() will adopt these callbacks and only
1841 * then queue rcu_barrier_func() on all remaining CPUs.
1842 */ 1835 */
1843 rcu_send_cbs_to_orphanage(&rcu_bh_state); 1836 rcu_send_cbs_to_online(&rcu_bh_state);
1844 rcu_send_cbs_to_orphanage(&rcu_sched_state); 1837 rcu_send_cbs_to_online(&rcu_sched_state);
1845 rcu_preempt_send_cbs_to_orphanage(); 1838 rcu_preempt_send_cbs_to_online();
1846 break; 1839 break;
1847 case CPU_DEAD: 1840 case CPU_DEAD:
1848 case CPU_DEAD_FROZEN: 1841 case CPU_DEAD_FROZEN:
@@ -1880,8 +1873,9 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
1880{ 1873{
1881 int i; 1874 int i;
1882 1875
1883 for (i = NUM_RCU_LVLS - 1; i >= 0; i--) 1876 for (i = NUM_RCU_LVLS - 1; i > 0; i--)
1884 rsp->levelspread[i] = CONFIG_RCU_FANOUT; 1877 rsp->levelspread[i] = CONFIG_RCU_FANOUT;
1878 rsp->levelspread[0] = RCU_FANOUT_LEAF;
1885} 1879}
1886#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ 1880#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
1887static void __init rcu_init_levelspread(struct rcu_state *rsp) 1881static void __init rcu_init_levelspread(struct rcu_state *rsp)
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 91d4170c5c13..e8f057e44e3e 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -31,46 +31,51 @@
31/* 31/*
32 * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT. 32 * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT.
33 * In theory, it should be possible to add more levels straightforwardly. 33 * In theory, it should be possible to add more levels straightforwardly.
34 * In practice, this has not been tested, so there is probably some 34 * In practice, this did work well going from three levels to four.
35 * bug somewhere. 35 * Of course, your mileage may vary.
36 */ 36 */
37#define MAX_RCU_LVLS 4 37#define MAX_RCU_LVLS 4
38#define RCU_FANOUT (CONFIG_RCU_FANOUT) 38#if CONFIG_RCU_FANOUT > 16
39#define RCU_FANOUT_SQ (RCU_FANOUT * RCU_FANOUT) 39#define RCU_FANOUT_LEAF 16
40#define RCU_FANOUT_CUBE (RCU_FANOUT_SQ * RCU_FANOUT) 40#else /* #if CONFIG_RCU_FANOUT > 16 */
41#define RCU_FANOUT_FOURTH (RCU_FANOUT_CUBE * RCU_FANOUT) 41#define RCU_FANOUT_LEAF (CONFIG_RCU_FANOUT)
42 42#endif /* #else #if CONFIG_RCU_FANOUT > 16 */
43#if NR_CPUS <= RCU_FANOUT 43#define RCU_FANOUT_1 (RCU_FANOUT_LEAF)
44#define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT)
45#define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT)
46#define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT)
47
48#if NR_CPUS <= RCU_FANOUT_1
44# define NUM_RCU_LVLS 1 49# define NUM_RCU_LVLS 1
45# define NUM_RCU_LVL_0 1 50# define NUM_RCU_LVL_0 1
46# define NUM_RCU_LVL_1 (NR_CPUS) 51# define NUM_RCU_LVL_1 (NR_CPUS)
47# define NUM_RCU_LVL_2 0 52# define NUM_RCU_LVL_2 0
48# define NUM_RCU_LVL_3 0 53# define NUM_RCU_LVL_3 0
49# define NUM_RCU_LVL_4 0 54# define NUM_RCU_LVL_4 0
50#elif NR_CPUS <= RCU_FANOUT_SQ 55#elif NR_CPUS <= RCU_FANOUT_2
51# define NUM_RCU_LVLS 2 56# define NUM_RCU_LVLS 2
52# define NUM_RCU_LVL_0 1 57# define NUM_RCU_LVL_0 1
53# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) 58# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
54# define NUM_RCU_LVL_2 (NR_CPUS) 59# define NUM_RCU_LVL_2 (NR_CPUS)
55# define NUM_RCU_LVL_3 0 60# define NUM_RCU_LVL_3 0
56# define NUM_RCU_LVL_4 0 61# define NUM_RCU_LVL_4 0
57#elif NR_CPUS <= RCU_FANOUT_CUBE 62#elif NR_CPUS <= RCU_FANOUT_3
58# define NUM_RCU_LVLS 3 63# define NUM_RCU_LVLS 3
59# define NUM_RCU_LVL_0 1 64# define NUM_RCU_LVL_0 1
60# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ) 65# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
61# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) 66# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
62# define NUM_RCU_LVL_3 NR_CPUS 67# define NUM_RCU_LVL_3 (NR_CPUS)
63# define NUM_RCU_LVL_4 0 68# define NUM_RCU_LVL_4 0
64#elif NR_CPUS <= RCU_FANOUT_FOURTH 69#elif NR_CPUS <= RCU_FANOUT_4
65# define NUM_RCU_LVLS 4 70# define NUM_RCU_LVLS 4
66# define NUM_RCU_LVL_0 1 71# define NUM_RCU_LVL_0 1
67# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_CUBE) 72# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3)
68# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ) 73# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
69# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) 74# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
70# define NUM_RCU_LVL_4 NR_CPUS 75# define NUM_RCU_LVL_4 (NR_CPUS)
71#else 76#else
72# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" 77# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
73#endif /* #if (NR_CPUS) <= RCU_FANOUT */ 78#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */
74 79
75#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4) 80#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4)
76#define NUM_RCU_NODES (RCU_SUM - NR_CPUS) 81#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
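A worked example of the new macros, for an assumed configuration that is not part of the patch: with CONFIG_RCU_FANOUT=64 and NR_CPUS=4096, RCU_FANOUT_LEAF is capped at 16, so RCU_FANOUT_1=16, RCU_FANOUT_2=1024 and RCU_FANOUT_3=65536. NR_CPUS then falls in the three-level case: NUM_RCU_LVL_0=1, NUM_RCU_LVL_1=DIV_ROUND_UP(4096,1024)=4, NUM_RCU_LVL_2=DIV_ROUND_UP(4096,16)=256, NUM_RCU_LVL_3=4096, giving RCU_SUM=4357 and NUM_RCU_NODES=261 rcu_node structures.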
@@ -203,8 +208,8 @@ struct rcu_data {
203 long qlen_last_fqs_check; 208 long qlen_last_fqs_check;
204 /* qlen at last check for QS forcing */ 209 /* qlen at last check for QS forcing */
205 unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ 210 unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */
206 unsigned long n_cbs_orphaned; /* RCU cbs sent to orphanage. */ 211 unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */
207 unsigned long n_cbs_adopted; /* RCU cbs adopted from orphanage. */ 212 unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */
208 unsigned long n_force_qs_snap; 213 unsigned long n_force_qs_snap;
209 /* did other CPU force QS recently? */ 214 /* did other CPU force QS recently? */
210 long blimit; /* Upper limit on a processed batch */ 215 long blimit; /* Upper limit on a processed batch */
@@ -309,15 +314,7 @@ struct rcu_state {
309 /* End of fields guarded by root rcu_node's lock. */ 314 /* End of fields guarded by root rcu_node's lock. */
310 315
311 raw_spinlock_t onofflock; /* exclude on/offline and */ 316 raw_spinlock_t onofflock; /* exclude on/offline and */
312 /* starting new GP. Also */ 317 /* starting new GP. */
313 /* protects the following */
314 /* orphan_cbs fields. */
315 struct rcu_head *orphan_cbs_list; /* list of rcu_head structs */
316 /* orphaned by all CPUs in */
317 /* a given leaf rcu_node */
318 /* going offline. */
319 struct rcu_head **orphan_cbs_tail; /* And tail pointer. */
320 long orphan_qlen; /* Number of orphaned cbs. */
321 raw_spinlock_t fqslock; /* Only one task forcing */ 318 raw_spinlock_t fqslock; /* Only one task forcing */
322 /* quiescent states. */ 319 /* quiescent states. */
323 unsigned long jiffies_force_qs; /* Time at which to invoke */ 320 unsigned long jiffies_force_qs; /* Time at which to invoke */
@@ -390,7 +387,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp);
390static int rcu_preempt_pending(int cpu); 387static int rcu_preempt_pending(int cpu);
391static int rcu_preempt_needs_cpu(int cpu); 388static int rcu_preempt_needs_cpu(int cpu);
392static void __cpuinit rcu_preempt_init_percpu_data(int cpu); 389static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
393static void rcu_preempt_send_cbs_to_orphanage(void); 390static void rcu_preempt_send_cbs_to_online(void);
394static void __init __rcu_init_preempt(void); 391static void __init __rcu_init_preempt(void);
395static void rcu_needs_cpu_flush(void); 392static void rcu_needs_cpu_flush(void);
396 393
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 71a4147473f9..a3638710dc67 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -25,6 +25,7 @@
25 */ 25 */
26 26
27#include <linux/delay.h> 27#include <linux/delay.h>
28#include <linux/stop_machine.h>
28 29
29/* 30/*
30 * Check the RCU kernel configuration parameters and print informative 31 * Check the RCU kernel configuration parameters and print informative
@@ -773,11 +774,11 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
773} 774}
774 775
775/* 776/*
776 * Move preemptable RCU's callbacks to ->orphan_cbs_list. 777 * Move preemptable RCU's callbacks from the dying CPU to another online CPU.
777 */ 778 */
778static void rcu_preempt_send_cbs_to_orphanage(void) 779static void rcu_preempt_send_cbs_to_online(void)
779{ 780{
780 rcu_send_cbs_to_orphanage(&rcu_preempt_state); 781 rcu_send_cbs_to_online(&rcu_preempt_state);
781} 782}
782 783
783/* 784/*
@@ -1001,7 +1002,7 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
1001/* 1002/*
1002 * Because there is no preemptable RCU, there are no callbacks to move. 1003 * Because there is no preemptable RCU, there are no callbacks to move.
1003 */ 1004 */
1004static void rcu_preempt_send_cbs_to_orphanage(void) 1005static void rcu_preempt_send_cbs_to_online(void)
1005{ 1006{
1006} 1007}
1007 1008
@@ -1014,6 +1015,132 @@ static void __init __rcu_init_preempt(void)
1014 1015
1015#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ 1016#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
1016 1017
1018#ifndef CONFIG_SMP
1019
1020void synchronize_sched_expedited(void)
1021{
1022 cond_resched();
1023}
1024EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
1025
1026#else /* #ifndef CONFIG_SMP */
1027
1028static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0);
1029static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0);
1030
1031static int synchronize_sched_expedited_cpu_stop(void *data)
1032{
1033 /*
1034 * There must be a full memory barrier on each affected CPU
1035 * between the time that try_stop_cpus() is called and the
1036 * time that it returns.
1037 *
1038 * In the current initial implementation of cpu_stop, the
1039 * above condition is already met when the control reaches
1040 * this point and the following smp_mb() is not strictly
1041 * necessary. Do smp_mb() anyway for documentation and
1042 * robustness against future implementation changes.
1043 */
1044 smp_mb(); /* See above comment block. */
1045 return 0;
1046}
1047
1048/*
 1049 * Wait for an rcu-sched grace period to elapse, but use a "big hammer"
 1050 * approach to force the grace period to end quickly. This consumes
1051 * significant time on all CPUs, and is thus not recommended for
1052 * any sort of common-case code.
1053 *
1054 * Note that it is illegal to call this function while holding any
1055 * lock that is acquired by a CPU-hotplug notifier. Failing to
1056 * observe this restriction will result in deadlock.
1057 *
1058 * This implementation can be thought of as an application of ticket
1059 * locking to RCU, with sync_sched_expedited_started and
1060 * sync_sched_expedited_done taking on the roles of the halves
1061 * of the ticket-lock word. Each task atomically increments
1062 * sync_sched_expedited_started upon entry, snapshotting the old value,
1063 * then attempts to stop all the CPUs. If this succeeds, then each
1064 * CPU will have executed a context switch, resulting in an RCU-sched
1065 * grace period. We are then done, so we use atomic_cmpxchg() to
1066 * update sync_sched_expedited_done to match our snapshot -- but
1067 * only if someone else has not already advanced past our snapshot.
1068 *
1069 * On the other hand, if try_stop_cpus() fails, we check the value
1070 * of sync_sched_expedited_done. If it has advanced past our
1071 * initial snapshot, then someone else must have forced a grace period
1072 * some time after we took our snapshot. In this case, our work is
1073 * done for us, and we can simply return. Otherwise, we try again,
1074 * but keep our initial snapshot for purposes of checking for someone
1075 * doing our work for us.
1076 *
1077 * If we fail too many times in a row, we fall back to synchronize_sched().
1078 */
1079void synchronize_sched_expedited(void)
1080{
1081 int firstsnap, s, snap, trycount = 0;
1082
1083 /* Note that atomic_inc_return() implies full memory barrier. */
1084 firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started);
1085 get_online_cpus();
1086
1087 /*
1088 * Each pass through the following loop attempts to force a
1089 * context switch on each CPU.
1090 */
1091 while (try_stop_cpus(cpu_online_mask,
1092 synchronize_sched_expedited_cpu_stop,
1093 NULL) == -EAGAIN) {
1094 put_online_cpus();
1095
1096 /* No joy, try again later. Or just synchronize_sched(). */
1097 if (trycount++ < 10)
1098 udelay(trycount * num_online_cpus());
1099 else {
1100 synchronize_sched();
1101 return;
1102 }
1103
1104 /* Check to see if someone else did our work for us. */
1105 s = atomic_read(&sync_sched_expedited_done);
1106 if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) {
1107 smp_mb(); /* ensure test happens before caller kfree */
1108 return;
1109 }
1110
1111 /*
1112 * Refetching sync_sched_expedited_started allows later
1113 * callers to piggyback on our grace period. We subtract
1114 * 1 to get the same token that the last incrementer got.
1115 * We retry after they started, so our grace period works
1116 * for them, and they started after our first try, so their
1117 * grace period works for us.
1118 */
1119 get_online_cpus();
1120 snap = atomic_read(&sync_sched_expedited_started) - 1;
1121 smp_mb(); /* ensure read is before try_stop_cpus(). */
1122 }
1123
1124 /*
1125 * Everyone up to our most recent fetch is covered by our grace
1126 * period. Update the counter, but only if our work is still
1127 * relevant -- which it won't be if someone who started later
1128 * than we did beat us to the punch.
1129 */
1130 do {
1131 s = atomic_read(&sync_sched_expedited_done);
1132 if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) {
1133 smp_mb(); /* ensure test happens before caller kfree */
1134 break;
1135 }
1136 } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s);
1137
1138 put_online_cpus();
1139}
1140EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
1141
1142#endif /* #else #ifndef CONFIG_SMP */
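The ticket scheme described above boils down to two counters and a compare-and-swap. The following compilable user-space sketch shows just that shape, using C11 atomics, assumed names, plain signed comparisons in place of the wraparound-safe UINT_CMP_GE(), and a stub standing in for try_stop_cpus(); the real function additionally rechecks the started counter and falls back to synchronize_sched() after repeated failures:

#include <stdatomic.h>

static atomic_int tickets_started;	/* role of sync_sched_expedited_started */
static atomic_int tickets_done;		/* role of sync_sched_expedited_done */

/* Stub for the expensive stop-all-CPUs step; assume it may fail and be retried. */
static int try_expensive_flush(void)
{
	return 1;
}

void expedited_flush(void)
{
	int snap = atomic_fetch_add(&tickets_started, 1) + 1;	/* take a ticket */
	int s;

	while (!try_expensive_flush()) {
		/* Someone completed a flush after we took our ticket: it covers us. */
		s = atomic_load(&tickets_done);
		if (s >= snap)
			return;
	}

	/* Record our flush, unless a later caller already advanced "done" past us. */
	s = atomic_load(&tickets_done);
	while (s < snap &&
	       !atomic_compare_exchange_weak(&tickets_done, &s, snap))
		;	/* on failure, s is reloaded with the current value */
}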
1143
1017#if !defined(CONFIG_RCU_FAST_NO_HZ) 1144#if !defined(CONFIG_RCU_FAST_NO_HZ)
1018 1145
1019/* 1146/*
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index d15430b9d122..c8e97853b970 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -166,13 +166,13 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
166 166
167 gpnum = rsp->gpnum; 167 gpnum = rsp->gpnum;
168 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " 168 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x "
169 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n", 169 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n",
170 rsp->completed, gpnum, rsp->signaled, 170 rsp->completed, gpnum, rsp->signaled,
171 (long)(rsp->jiffies_force_qs - jiffies), 171 (long)(rsp->jiffies_force_qs - jiffies),
172 (int)(jiffies & 0xffff), 172 (int)(jiffies & 0xffff),
173 rsp->n_force_qs, rsp->n_force_qs_ngp, 173 rsp->n_force_qs, rsp->n_force_qs_ngp,
174 rsp->n_force_qs - rsp->n_force_qs_ngp, 174 rsp->n_force_qs - rsp->n_force_qs_ngp,
175 rsp->n_force_qs_lh, rsp->orphan_qlen); 175 rsp->n_force_qs_lh);
176 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { 176 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) {
177 if (rnp->level != level) { 177 if (rnp->level != level) {
178 seq_puts(m, "\n"); 178 seq_puts(m, "\n");
@@ -300,7 +300,7 @@ static const struct file_operations rcu_pending_fops = {
300 300
301static struct dentry *rcudir; 301static struct dentry *rcudir;
302 302
303static int __init rcuclassic_trace_init(void) 303static int __init rcutree_trace_init(void)
304{ 304{
305 struct dentry *retval; 305 struct dentry *retval;
306 306
@@ -337,14 +337,14 @@ free_out:
337 return 1; 337 return 1;
338} 338}
339 339
340static void __exit rcuclassic_trace_cleanup(void) 340static void __exit rcutree_trace_cleanup(void)
341{ 341{
342 debugfs_remove_recursive(rcudir); 342 debugfs_remove_recursive(rcudir);
343} 343}
344 344
345 345
346module_init(rcuclassic_trace_init); 346module_init(rcutree_trace_init);
347module_exit(rcuclassic_trace_cleanup); 347module_exit(rcutree_trace_cleanup);
348 348
349MODULE_AUTHOR("Paul E. McKenney"); 349MODULE_AUTHOR("Paul E. McKenney");
350MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation"); 350MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation");
diff --git a/kernel/sched.c b/kernel/sched.c
index 297d1a0eedb0..ea3e5eff3878 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -75,9 +75,11 @@
75 75
76#include <asm/tlb.h> 76#include <asm/tlb.h>
77#include <asm/irq_regs.h> 77#include <asm/irq_regs.h>
78#include <asm/mutex.h>
78 79
79#include "sched_cpupri.h" 80#include "sched_cpupri.h"
80#include "workqueue_sched.h" 81#include "workqueue_sched.h"
82#include "sched_autogroup.h"
81 83
82#define CREATE_TRACE_POINTS 84#define CREATE_TRACE_POINTS
83#include <trace/events/sched.h> 85#include <trace/events/sched.h>
@@ -253,6 +255,8 @@ struct task_group {
253 /* runqueue "owned" by this group on each cpu */ 255 /* runqueue "owned" by this group on each cpu */
254 struct cfs_rq **cfs_rq; 256 struct cfs_rq **cfs_rq;
255 unsigned long shares; 257 unsigned long shares;
258
259 atomic_t load_weight;
256#endif 260#endif
257 261
258#ifdef CONFIG_RT_GROUP_SCHED 262#ifdef CONFIG_RT_GROUP_SCHED
@@ -268,25 +272,18 @@ struct task_group {
268 struct task_group *parent; 272 struct task_group *parent;
269 struct list_head siblings; 273 struct list_head siblings;
270 struct list_head children; 274 struct list_head children;
271};
272 275
273#define root_task_group init_task_group 276#ifdef CONFIG_SCHED_AUTOGROUP
277 struct autogroup *autogroup;
278#endif
279};
274 280
275/* task_group_lock serializes add/remove of task groups and also changes to 281/* task_group_lock serializes the addition/removal of task groups */
276 * a task group's cpu shares.
277 */
278static DEFINE_SPINLOCK(task_group_lock); 282static DEFINE_SPINLOCK(task_group_lock);
279 283
280#ifdef CONFIG_FAIR_GROUP_SCHED 284#ifdef CONFIG_FAIR_GROUP_SCHED
281 285
282#ifdef CONFIG_SMP 286# define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
283static int root_task_group_empty(void)
284{
285 return list_empty(&root_task_group.children);
286}
287#endif
288
289# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
290 287
291/* 288/*
292 * A weight of 0 or 1 can cause arithmetics problems. 289 * A weight of 0 or 1 can cause arithmetics problems.
@@ -299,13 +296,13 @@ static int root_task_group_empty(void)
299#define MIN_SHARES 2 296#define MIN_SHARES 2
300#define MAX_SHARES (1UL << 18) 297#define MAX_SHARES (1UL << 18)
301 298
302static int init_task_group_load = INIT_TASK_GROUP_LOAD; 299static int root_task_group_load = ROOT_TASK_GROUP_LOAD;
303#endif 300#endif
304 301
305/* Default task group. 302/* Default task group.
306 * Every task in the system belongs to this group at bootup. 303 * Every task in the system belongs to this group at bootup.
307 */ 304 */
308struct task_group init_task_group; 305struct task_group root_task_group;
309 306
310#endif /* CONFIG_CGROUP_SCHED */ 307#endif /* CONFIG_CGROUP_SCHED */
311 308
@@ -342,6 +339,7 @@ struct cfs_rq {
342 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This 339 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
343 * list is used during load balance. 340 * list is used during load balance.
344 */ 341 */
342 int on_list;
345 struct list_head leaf_cfs_rq_list; 343 struct list_head leaf_cfs_rq_list;
346 struct task_group *tg; /* group that "owns" this runqueue */ 344 struct task_group *tg; /* group that "owns" this runqueue */
347 345
@@ -360,14 +358,17 @@ struct cfs_rq {
360 unsigned long h_load; 358 unsigned long h_load;
361 359
362 /* 360 /*
363 * this cpu's part of tg->shares 361 * Maintaining per-cpu shares distribution for group scheduling
362 *
363 * load_stamp is the last time we updated the load average
364 * load_last is the last time we updated the load average and saw load
365 * load_unacc_exec_time is currently unaccounted execution time
364 */ 366 */
365 unsigned long shares; 367 u64 load_avg;
368 u64 load_period;
369 u64 load_stamp, load_last, load_unacc_exec_time;
366 370
367 /* 371 unsigned long load_contribution;
368 * load.weight at the time we set shares
369 */
370 unsigned long rq_weight;
371#endif 372#endif
372#endif 373#endif
373}; 374};
@@ -605,11 +606,14 @@ static inline int cpu_of(struct rq *rq)
605 */ 606 */
606static inline struct task_group *task_group(struct task_struct *p) 607static inline struct task_group *task_group(struct task_struct *p)
607{ 608{
609 struct task_group *tg;
608 struct cgroup_subsys_state *css; 610 struct cgroup_subsys_state *css;
609 611
610 css = task_subsys_state_check(p, cpu_cgroup_subsys_id, 612 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
611 lockdep_is_held(&task_rq(p)->lock)); 613 lockdep_is_held(&task_rq(p)->lock));
612 return container_of(css, struct task_group, css); 614 tg = container_of(css, struct task_group, css);
615
616 return autogroup_task_group(p, tg);
613} 617}
614 618
615/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ 619/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
@@ -737,7 +741,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
737 buf[cnt] = 0; 741 buf[cnt] = 0;
738 cmp = strstrip(buf); 742 cmp = strstrip(buf);
739 743
740 if (strncmp(buf, "NO_", 3) == 0) { 744 if (strncmp(cmp, "NO_", 3) == 0) {
741 neg = 1; 745 neg = 1;
742 cmp += 3; 746 cmp += 3;
743 } 747 }
@@ -793,20 +797,6 @@ late_initcall(sched_init_debug);
793const_debug unsigned int sysctl_sched_nr_migrate = 32; 797const_debug unsigned int sysctl_sched_nr_migrate = 32;
794 798
795/* 799/*
796 * ratelimit for updating the group shares.
797 * default: 0.25ms
798 */
799unsigned int sysctl_sched_shares_ratelimit = 250000;
800unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;
801
802/*
803 * Inject some fuzzyness into changing the per-cpu group shares
804 * this avoids remote rq-locks at the expense of fairness.
805 * default: 4
806 */
807unsigned int sysctl_sched_shares_thresh = 4;
808
809/*
810 * period over which we average the RT time consumption, measured 800 * period over which we average the RT time consumption, measured
811 * in ms. 801 * in ms.
812 * 802 *
@@ -1355,6 +1345,12 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1355 lw->inv_weight = 0; 1345 lw->inv_weight = 0;
1356} 1346}
1357 1347
1348static inline void update_load_set(struct load_weight *lw, unsigned long w)
1349{
1350 lw->weight = w;
1351 lw->inv_weight = 0;
1352}
1353
1358/* 1354/*
1359 * To aid in avoiding the subversion of "niceness" due to uneven distribution 1355 * To aid in avoiding the subversion of "niceness" due to uneven distribution
1360 * of tasks with abnormal "nice" values across CPUs the contribution that 1356 * of tasks with abnormal "nice" values across CPUs the contribution that
@@ -1543,101 +1539,6 @@ static unsigned long cpu_avg_load_per_task(int cpu)
1543 1539
1544#ifdef CONFIG_FAIR_GROUP_SCHED 1540#ifdef CONFIG_FAIR_GROUP_SCHED
1545 1541
1546static __read_mostly unsigned long __percpu *update_shares_data;
1547
1548static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1549
1550/*
1551 * Calculate and set the cpu's group shares.
1552 */
1553static void update_group_shares_cpu(struct task_group *tg, int cpu,
1554 unsigned long sd_shares,
1555 unsigned long sd_rq_weight,
1556 unsigned long *usd_rq_weight)
1557{
1558 unsigned long shares, rq_weight;
1559 int boost = 0;
1560
1561 rq_weight = usd_rq_weight[cpu];
1562 if (!rq_weight) {
1563 boost = 1;
1564 rq_weight = NICE_0_LOAD;
1565 }
1566
1567 /*
1568 * \Sum_j shares_j * rq_weight_i
1569 * shares_i = -----------------------------
1570 * \Sum_j rq_weight_j
1571 */
1572 shares = (sd_shares * rq_weight) / sd_rq_weight;
1573 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
1574
1575 if (abs(shares - tg->se[cpu]->load.weight) >
1576 sysctl_sched_shares_thresh) {
1577 struct rq *rq = cpu_rq(cpu);
1578 unsigned long flags;
1579
1580 raw_spin_lock_irqsave(&rq->lock, flags);
1581 tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
1582 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1583 __set_se_shares(tg->se[cpu], shares);
1584 raw_spin_unlock_irqrestore(&rq->lock, flags);
1585 }
1586}
1587
1588/*
1589 * Re-compute the task group their per cpu shares over the given domain.
1590 * This needs to be done in a bottom-up fashion because the rq weight of a
1591 * parent group depends on the shares of its child groups.
1592 */
1593static int tg_shares_up(struct task_group *tg, void *data)
1594{
1595 unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
1596 unsigned long *usd_rq_weight;
1597 struct sched_domain *sd = data;
1598 unsigned long flags;
1599 int i;
1600
1601 if (!tg->se[0])
1602 return 0;
1603
1604 local_irq_save(flags);
1605 usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());
1606
1607 for_each_cpu(i, sched_domain_span(sd)) {
1608 weight = tg->cfs_rq[i]->load.weight;
1609 usd_rq_weight[i] = weight;
1610
1611 rq_weight += weight;
1612 /*
1613 * If there are currently no tasks on the cpu pretend there
1614 * is one of average load so that when a new task gets to
1615 * run here it will not get delayed by group starvation.
1616 */
1617 if (!weight)
1618 weight = NICE_0_LOAD;
1619
1620 sum_weight += weight;
1621 shares += tg->cfs_rq[i]->shares;
1622 }
1623
1624 if (!rq_weight)
1625 rq_weight = sum_weight;
1626
1627 if ((!shares && rq_weight) || shares > tg->shares)
1628 shares = tg->shares;
1629
1630 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
1631 shares = tg->shares;
1632
1633 for_each_cpu(i, sched_domain_span(sd))
1634 update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);
1635
1636 local_irq_restore(flags);
1637
1638 return 0;
1639}
1640
1641/* 1542/*
1642 * Compute the cpu's hierarchical load factor for each task group. 1543 * Compute the cpu's hierarchical load factor for each task group.
1643 * This needs to be done in a top-down fashion because the load of a child 1544 * This needs to be done in a top-down fashion because the load of a child
@@ -1652,7 +1553,7 @@ static int tg_load_down(struct task_group *tg, void *data)
1652 load = cpu_rq(cpu)->load.weight; 1553 load = cpu_rq(cpu)->load.weight;
1653 } else { 1554 } else {
1654 load = tg->parent->cfs_rq[cpu]->h_load; 1555 load = tg->parent->cfs_rq[cpu]->h_load;
1655 load *= tg->cfs_rq[cpu]->shares; 1556 load *= tg->se[cpu]->load.weight;
1656 load /= tg->parent->cfs_rq[cpu]->load.weight + 1; 1557 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
1657 } 1558 }
1658 1559
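With the per-cpu shares machinery removed, tg_load_down() now scales the parent's hierarchical load by the group entity's actual weight on that CPU. As a purely illustrative calculation with assumed numbers: if the parent's h_load is 2048 and this group's entity contributes 1024 of the parent runqueue's 4096 total weight, the group's h_load becomes 2048 * 1024 / (4096 + 1) ≈ 511, i.e. roughly the quarter of the parent's load that matches its quarter share of the weight.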
@@ -1661,34 +1562,11 @@ static int tg_load_down(struct task_group *tg, void *data)
1661 return 0; 1562 return 0;
1662} 1563}
1663 1564
1664static void update_shares(struct sched_domain *sd)
1665{
1666 s64 elapsed;
1667 u64 now;
1668
1669 if (root_task_group_empty())
1670 return;
1671
1672 now = local_clock();
1673 elapsed = now - sd->last_update;
1674
1675 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1676 sd->last_update = now;
1677 walk_tg_tree(tg_nop, tg_shares_up, sd);
1678 }
1679}
1680
1681static void update_h_load(long cpu) 1565static void update_h_load(long cpu)
1682{ 1566{
1683 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); 1567 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1684} 1568}
1685 1569
1686#else
1687
1688static inline void update_shares(struct sched_domain *sd)
1689{
1690}
1691
1692#endif 1570#endif
1693 1571
1694#ifdef CONFIG_PREEMPT 1572#ifdef CONFIG_PREEMPT
@@ -1810,15 +1688,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1810 1688
1811#endif 1689#endif
1812 1690
1813#ifdef CONFIG_FAIR_GROUP_SCHED
1814static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1815{
1816#ifdef CONFIG_SMP
1817 cfs_rq->shares = shares;
1818#endif
1819}
1820#endif
1821
1822static void calc_load_account_idle(struct rq *this_rq); 1691static void calc_load_account_idle(struct rq *this_rq);
1823static void update_sysctl(void); 1692static void update_sysctl(void);
1824static int get_update_sysctl_factor(void); 1693static int get_update_sysctl_factor(void);
@@ -2063,6 +1932,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
2063#include "sched_idletask.c" 1932#include "sched_idletask.c"
2064#include "sched_fair.c" 1933#include "sched_fair.c"
2065#include "sched_rt.c" 1934#include "sched_rt.c"
1935#include "sched_autogroup.c"
2066#include "sched_stoptask.c" 1936#include "sched_stoptask.c"
2067#ifdef CONFIG_SCHED_DEBUG 1937#ifdef CONFIG_SCHED_DEBUG
2068# include "sched_debug.c" 1938# include "sched_debug.c"
@@ -2255,10 +2125,8 @@ static int migration_cpu_stop(void *data);
2255 * The task's runqueue lock must be held. 2125 * The task's runqueue lock must be held.
2256 * Returns true if you have to wait for migration thread. 2126 * Returns true if you have to wait for migration thread.
2257 */ 2127 */
2258static bool migrate_task(struct task_struct *p, int dest_cpu) 2128static bool migrate_task(struct task_struct *p, struct rq *rq)
2259{ 2129{
2260 struct rq *rq = task_rq(p);
2261
2262 /* 2130 /*
2263 * If the task is not on a runqueue (and not running), then 2131 * If the task is not on a runqueue (and not running), then
2264 * the next wake-up will properly place the task. 2132 * the next wake-up will properly place the task.
@@ -2438,18 +2306,15 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2438 return dest_cpu; 2306 return dest_cpu;
2439 2307
2440 /* No more Mr. Nice Guy. */ 2308 /* No more Mr. Nice Guy. */
2441 if (unlikely(dest_cpu >= nr_cpu_ids)) { 2309 dest_cpu = cpuset_cpus_allowed_fallback(p);
2442 dest_cpu = cpuset_cpus_allowed_fallback(p); 2310 /*
2443 /* 2311 * Don't tell them about moving exiting tasks or
2444 * Don't tell them about moving exiting tasks or 2312 * kernel threads (both mm NULL), since they never
2445 * kernel threads (both mm NULL), since they never 2313 * leave kernel.
2446 * leave kernel. 2314 */
2447 */ 2315 if (p->mm && printk_ratelimit()) {
2448 if (p->mm && printk_ratelimit()) { 2316 printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n",
2449 printk(KERN_INFO "process %d (%s) no " 2317 task_pid_nr(p), p->comm, cpu);
2450 "longer affine to cpu%d\n",
2451 task_pid_nr(p), p->comm, cpu);
2452 }
2453 } 2318 }
2454 2319
2455 return dest_cpu; 2320 return dest_cpu;
@@ -2640,7 +2505,7 @@ out:
2640 * try_to_wake_up_local - try to wake up a local task with rq lock held 2505 * try_to_wake_up_local - try to wake up a local task with rq lock held
2641 * @p: the thread to be awakened 2506 * @p: the thread to be awakened
2642 * 2507 *
2643 * Put @p on the run-queue if it's not alredy there. The caller must 2508 * Put @p on the run-queue if it's not already there. The caller must
2644 * ensure that this_rq() is locked, @p is bound to this_rq() and not 2509 * ensure that this_rq() is locked, @p is bound to this_rq() and not
2645 * the current task. this_rq() stays locked over invocation. 2510 * the current task. this_rq() stays locked over invocation.
2646 */ 2511 */
@@ -2785,7 +2650,9 @@ void sched_fork(struct task_struct *p, int clone_flags)
2785 /* Want to start with kernel preemption disabled. */ 2650 /* Want to start with kernel preemption disabled. */
2786 task_thread_info(p)->preempt_count = 1; 2651 task_thread_info(p)->preempt_count = 1;
2787#endif 2652#endif
2653#ifdef CONFIG_SMP
2788 plist_node_init(&p->pushable_tasks, MAX_PRIO); 2654 plist_node_init(&p->pushable_tasks, MAX_PRIO);
2655#endif
2789 2656
2790 put_cpu(); 2657 put_cpu();
2791} 2658}
@@ -3549,7 +3416,7 @@ void sched_exec(void)
3549 * select_task_rq() can race against ->cpus_allowed 3416 * select_task_rq() can race against ->cpus_allowed
3550 */ 3417 */
3551 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) && 3418 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
3552 likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) { 3419 likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) {
3553 struct migration_arg arg = { p, dest_cpu }; 3420 struct migration_arg arg = { p, dest_cpu };
3554 3421
3555 task_rq_unlock(rq, &flags); 3422 task_rq_unlock(rq, &flags);
@@ -4214,7 +4081,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
4214 if (task_thread_info(rq->curr) != owner || need_resched()) 4081 if (task_thread_info(rq->curr) != owner || need_resched())
4215 return 0; 4082 return 0;
4216 4083
4217 cpu_relax(); 4084 arch_mutex_cpu_relax();
4218 } 4085 }
4219 4086
4220 return 1; 4087 return 1;
@@ -4526,7 +4393,7 @@ EXPORT_SYMBOL(wait_for_completion_interruptible);
4526 * This waits for either a completion of a specific task to be signaled or for a 4393 * This waits for either a completion of a specific task to be signaled or for a
4527 * specified timeout to expire. It is interruptible. The timeout is in jiffies. 4394 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
4528 */ 4395 */
4529unsigned long __sched 4396long __sched
4530wait_for_completion_interruptible_timeout(struct completion *x, 4397wait_for_completion_interruptible_timeout(struct completion *x,
4531 unsigned long timeout) 4398 unsigned long timeout)
4532{ 4399{
@@ -4559,7 +4426,7 @@ EXPORT_SYMBOL(wait_for_completion_killable);
4559 * signaled or for a specified timeout to expire. It can be 4426 * signaled or for a specified timeout to expire. It can be
4560 * interrupted by a kill signal. The timeout is in jiffies. 4427 * interrupted by a kill signal. The timeout is in jiffies.
4561 */ 4428 */
4562unsigned long __sched 4429long __sched
4563wait_for_completion_killable_timeout(struct completion *x, 4430wait_for_completion_killable_timeout(struct completion *x,
4564 unsigned long timeout) 4431 unsigned long timeout)
4565{ 4432{
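
The two timeout variants above change their return type from unsigned long to long because they can also return -ERESTARTSYS; with an unsigned type that error is indistinguishable from a very large remaining-jiffies count. A user-space sketch of the three outcomes a caller must tell apart (ERESTARTSYS below is a local stand-in, not taken from kernel headers):

#include <stdio.h>

#define ERESTARTSYS 512		/* local stand-in for the errno value */

/* stand-in for wait_for_completion_interruptible_timeout() */
static long fake_wait(int interrupted, long remaining)
{
	if (interrupted)
		return -ERESTARTSYS;	/* lost if the return type were unsigned */
	return remaining;		/* 0 == timed out, > 0 == jiffies left */
}

int main(void)
{
	long ret = fake_wait(1, 100);

	if (ret < 0)
		printf("interrupted: %ld\n", ret);
	else if (ret == 0)
		printf("timed out\n");
	else
		printf("completed with %ld jiffies left\n", ret);
	return 0;
}
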
@@ -4901,7 +4768,7 @@ static bool check_same_owner(struct task_struct *p)
4901} 4768}
4902 4769
4903static int __sched_setscheduler(struct task_struct *p, int policy, 4770static int __sched_setscheduler(struct task_struct *p, int policy,
4904 struct sched_param *param, bool user) 4771 const struct sched_param *param, bool user)
4905{ 4772{
4906 int retval, oldprio, oldpolicy = -1, on_rq, running; 4773 int retval, oldprio, oldpolicy = -1, on_rq, running;
4907 unsigned long flags; 4774 unsigned long flags;
@@ -5056,7 +4923,7 @@ recheck:
5056 * NOTE that the task may be already dead. 4923 * NOTE that the task may be already dead.
5057 */ 4924 */
5058int sched_setscheduler(struct task_struct *p, int policy, 4925int sched_setscheduler(struct task_struct *p, int policy,
5059 struct sched_param *param) 4926 const struct sched_param *param)
5060{ 4927{
5061 return __sched_setscheduler(p, policy, param, true); 4928 return __sched_setscheduler(p, policy, param, true);
5062} 4929}
@@ -5074,7 +4941,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
5074 * but our caller might not have that capability. 4941 * but our caller might not have that capability.
5075 */ 4942 */
5076int sched_setscheduler_nocheck(struct task_struct *p, int policy, 4943int sched_setscheduler_nocheck(struct task_struct *p, int policy,
5077 struct sched_param *param) 4944 const struct sched_param *param)
5078{ 4945{
5079 return __sched_setscheduler(p, policy, param, false); 4946 return __sched_setscheduler(p, policy, param, false);
5080} 4947}
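
The three hunks above only constify the sched_param argument; the setters never write through it. For comparison, the user-space wrapper for the sched_setscheduler() system call carries the same const signature, so a caller can hand it a read-only parameter block:

#define _GNU_SOURCE
#include <stdio.h>
#include <sched.h>

int main(void)
{
	const struct sched_param param = { .sched_priority = 0 };

	/* reset the calling process to SCHED_OTHER; needs no privilege */
	if (sched_setscheduler(0, SCHED_OTHER, &param) != 0)
		perror("sched_setscheduler");
	return 0;
}
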
@@ -5590,7 +5457,7 @@ void sched_show_task(struct task_struct *p)
5590 unsigned state; 5457 unsigned state;
5591 5458
5592 state = p->state ? __ffs(p->state) + 1 : 0; 5459 state = p->state ? __ffs(p->state) + 1 : 0;
5593 printk(KERN_INFO "%-13.13s %c", p->comm, 5460 printk(KERN_INFO "%-15.15s %c", p->comm,
5594 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); 5461 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
5595#if BITS_PER_LONG == 32 5462#if BITS_PER_LONG == 32
5596 if (state == TASK_RUNNING) 5463 if (state == TASK_RUNNING)
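
The field-width change in sched_show_task() reflects that task->comm holds up to 15 characters plus the terminating NUL, so the old %-13.13s format truncated longer names. A quick user-space comparison of the two formats:

#include <stdio.h>

int main(void)
{
	char comm[16] = "kworker/u16:12";	/* 16 bytes, 15 visible chars max */

	printf("[%-13.13s]\n", comm);		/* old field: name truncated */
	printf("[%-15.15s]\n", comm);		/* new field: full name shown */
	return 0;
}
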
@@ -5754,7 +5621,6 @@ static void update_sysctl(void)
5754 SET_SYSCTL(sched_min_granularity); 5621 SET_SYSCTL(sched_min_granularity);
5755 SET_SYSCTL(sched_latency); 5622 SET_SYSCTL(sched_latency);
5756 SET_SYSCTL(sched_wakeup_granularity); 5623 SET_SYSCTL(sched_wakeup_granularity);
5757 SET_SYSCTL(sched_shares_ratelimit);
5758#undef SET_SYSCTL 5624#undef SET_SYSCTL
5759} 5625}
5760 5626
@@ -5830,7 +5696,7 @@ again:
5830 goto out; 5696 goto out;
5831 5697
5832 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); 5698 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
5833 if (migrate_task(p, dest_cpu)) { 5699 if (migrate_task(p, rq)) {
5834 struct migration_arg arg = { p, dest_cpu }; 5700 struct migration_arg arg = { p, dest_cpu };
5835 /* Need help from migration thread: drop lock and wait. */ 5701 /* Need help from migration thread: drop lock and wait. */
5836 task_rq_unlock(rq, &flags); 5702 task_rq_unlock(rq, &flags);
@@ -5912,29 +5778,20 @@ static int migration_cpu_stop(void *data)
5912} 5778}
5913 5779
5914#ifdef CONFIG_HOTPLUG_CPU 5780#ifdef CONFIG_HOTPLUG_CPU
5781
5915/* 5782/*
5916 * Figure out where task on dead CPU should go, use force if necessary. 5783 * Ensures that the idle task is using init_mm right before its cpu goes
5784 * offline.
5917 */ 5785 */
5918void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 5786void idle_task_exit(void)
5919{ 5787{
5920 struct rq *rq = cpu_rq(dead_cpu); 5788 struct mm_struct *mm = current->active_mm;
5921 int needs_cpu, uninitialized_var(dest_cpu);
5922 unsigned long flags;
5923 5789
5924 local_irq_save(flags); 5790 BUG_ON(cpu_online(smp_processor_id()));
5925 5791
5926 raw_spin_lock(&rq->lock); 5792 if (mm != &init_mm)
5927 needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING); 5793 switch_mm(mm, &init_mm, current);
5928 if (needs_cpu) 5794 mmdrop(mm);
5929 dest_cpu = select_fallback_rq(dead_cpu, p);
5930 raw_spin_unlock(&rq->lock);
5931 /*
5932 * It can only fail if we race with set_cpus_allowed(),
5933 * in the racer should migrate the task anyway.
5934 */
5935 if (needs_cpu)
5936 __migrate_task(p, dead_cpu, dest_cpu);
5937 local_irq_restore(flags);
5938} 5795}
5939 5796
5940/* 5797/*
@@ -5947,128 +5804,69 @@ void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
5947static void migrate_nr_uninterruptible(struct rq *rq_src) 5804static void migrate_nr_uninterruptible(struct rq *rq_src)
5948{ 5805{
5949 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); 5806 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
5950 unsigned long flags;
5951 5807
5952 local_irq_save(flags);
5953 double_rq_lock(rq_src, rq_dest);
5954 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; 5808 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
5955 rq_src->nr_uninterruptible = 0; 5809 rq_src->nr_uninterruptible = 0;
5956 double_rq_unlock(rq_src, rq_dest);
5957 local_irq_restore(flags);
5958}
5959
5960/* Run through task list and migrate tasks from the dead cpu. */
5961static void migrate_live_tasks(int src_cpu)
5962{
5963 struct task_struct *p, *t;
5964
5965 read_lock(&tasklist_lock);
5966
5967 do_each_thread(t, p) {
5968 if (p == current)
5969 continue;
5970
5971 if (task_cpu(p) == src_cpu)
5972 move_task_off_dead_cpu(src_cpu, p);
5973 } while_each_thread(t, p);
5974
5975 read_unlock(&tasklist_lock);
5976} 5810}
5977 5811
5978/* 5812/*
5979 * Schedules idle task to be the next runnable task on current CPU. 5813 * remove the tasks which were accounted by rq from calc_load_tasks.
5980 * It does so by boosting its priority to highest possible.
5981 * Used by CPU offline code.
5982 */ 5814 */
5983void sched_idle_next(void) 5815static void calc_global_load_remove(struct rq *rq)
5984{ 5816{
5985 int this_cpu = smp_processor_id(); 5817 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
5986 struct rq *rq = cpu_rq(this_cpu); 5818 rq->calc_load_active = 0;
5987 struct task_struct *p = rq->idle;
5988 unsigned long flags;
5989
5990 /* cpu has to be offline */
5991 BUG_ON(cpu_online(this_cpu));
5992
5993 /*
5994 * Strictly not necessary since rest of the CPUs are stopped by now
5995 * and interrupts disabled on the current cpu.
5996 */
5997 raw_spin_lock_irqsave(&rq->lock, flags);
5998
5999 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
6000
6001 activate_task(rq, p, 0);
6002
6003 raw_spin_unlock_irqrestore(&rq->lock, flags);
6004} 5819}
6005 5820
6006/* 5821/*
6007 * Ensures that the idle task is using init_mm right before its cpu goes 5822 * Migrate all tasks from the rq, sleeping tasks will be migrated by
6008 * offline. 5823 * try_to_wake_up()->select_task_rq().
5824 *
 5825 * Called with rq->lock held even though we're in stop_machine() and
5826 * there's no concurrency possible, we hold the required locks anyway
5827 * because of lock validation efforts.
6009 */ 5828 */
6010void idle_task_exit(void) 5829static void migrate_tasks(unsigned int dead_cpu)
6011{
6012 struct mm_struct *mm = current->active_mm;
6013
6014 BUG_ON(cpu_online(smp_processor_id()));
6015
6016 if (mm != &init_mm)
6017 switch_mm(mm, &init_mm, current);
6018 mmdrop(mm);
6019}
6020
6021/* called under rq->lock with disabled interrupts */
6022static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
6023{ 5830{
6024 struct rq *rq = cpu_rq(dead_cpu); 5831 struct rq *rq = cpu_rq(dead_cpu);
6025 5832 struct task_struct *next, *stop = rq->stop;
6026 /* Must be exiting, otherwise would be on tasklist. */ 5833 int dest_cpu;
6027 BUG_ON(!p->exit_state);
6028
6029 /* Cannot have done final schedule yet: would have vanished. */
6030 BUG_ON(p->state == TASK_DEAD);
6031
6032 get_task_struct(p);
6033 5834
6034 /* 5835 /*
6035 * Drop lock around migration; if someone else moves it, 5836 * Fudge the rq selection such that the below task selection loop
6036 * that's OK. No task can be added to this CPU, so iteration is 5837 * doesn't get stuck on the currently eligible stop task.
6037 * fine. 5838 *
5839 * We're currently inside stop_machine() and the rq is either stuck
5840 * in the stop_machine_cpu_stop() loop, or we're executing this code,
5841 * either way we should never end up calling schedule() until we're
5842 * done here.
6038 */ 5843 */
6039 raw_spin_unlock_irq(&rq->lock); 5844 rq->stop = NULL;
6040 move_task_off_dead_cpu(dead_cpu, p);
6041 raw_spin_lock_irq(&rq->lock);
6042
6043 put_task_struct(p);
6044}
6045
6046/* release_task() removes task from tasklist, so we won't find dead tasks. */
6047static void migrate_dead_tasks(unsigned int dead_cpu)
6048{
6049 struct rq *rq = cpu_rq(dead_cpu);
6050 struct task_struct *next;
6051 5845
6052 for ( ; ; ) { 5846 for ( ; ; ) {
6053 if (!rq->nr_running) 5847 /*
5848 * There's this thread running, bail when that's the only
5849 * remaining thread.
5850 */
5851 if (rq->nr_running == 1)
6054 break; 5852 break;
5853
6055 next = pick_next_task(rq); 5854 next = pick_next_task(rq);
6056 if (!next) 5855 BUG_ON(!next);
6057 break;
6058 next->sched_class->put_prev_task(rq, next); 5856 next->sched_class->put_prev_task(rq, next);
6059 migrate_dead(dead_cpu, next);
6060 5857
5858 /* Find suitable destination for @next, with force if needed. */
5859 dest_cpu = select_fallback_rq(dead_cpu, next);
5860 raw_spin_unlock(&rq->lock);
5861
5862 __migrate_task(next, dead_cpu, dest_cpu);
5863
5864 raw_spin_lock(&rq->lock);
6061 } 5865 }
6062}
6063 5866
6064/* 5867 rq->stop = stop;
6065 * remove the tasks which were accounted by rq from calc_load_tasks.
6066 */
6067static void calc_global_load_remove(struct rq *rq)
6068{
6069 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
6070 rq->calc_load_active = 0;
6071} 5868}
5869
6072#endif /* CONFIG_HOTPLUG_CPU */ 5870#endif /* CONFIG_HOTPLUG_CPU */
6073 5871
6074#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) 5872#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
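
migrate_tasks() above replaces the old migrate_live_tasks()/migrate_dead_tasks() pair: with rq->stop temporarily cleared, it keeps asking pick_next_task() for work and pushes every task to a fallback CPU until only the task running this code is left. A toy user-space model of that loop, using plain arrays instead of runqueues and no locking:

#include <stdio.h>

#define NR_TASKS 5

struct fake_rq {
	int tasks[NR_TASKS];
	int nr_running;
};

static void migrate_all(struct fake_rq *dead, struct fake_rq *dest)
{
	/* bail when the only remaining task is the one running this code */
	while (dead->nr_running > 1) {
		int next = dead->tasks[dead->nr_running - 1];	/* "pick_next_task" */
		dead->nr_running--;
		dest->tasks[dest->nr_running++] = next;		/* "__migrate_task" */
	}
}

int main(void)
{
	struct fake_rq dead = { { 100, 101, 102, 103, 104 }, 5 };
	struct fake_rq dest = { { 0 }, 0 };

	migrate_all(&dead, &dest);
	printf("dead cpu keeps %d task, destination now runs %d\n",
	       dead.nr_running, dest.nr_running);
	return 0;
}
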
@@ -6278,15 +6076,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6278 unsigned long flags; 6076 unsigned long flags;
6279 struct rq *rq = cpu_rq(cpu); 6077 struct rq *rq = cpu_rq(cpu);
6280 6078
6281 switch (action) { 6079 switch (action & ~CPU_TASKS_FROZEN) {
6282 6080
6283 case CPU_UP_PREPARE: 6081 case CPU_UP_PREPARE:
6284 case CPU_UP_PREPARE_FROZEN:
6285 rq->calc_load_update = calc_load_update; 6082 rq->calc_load_update = calc_load_update;
6286 break; 6083 break;
6287 6084
6288 case CPU_ONLINE: 6085 case CPU_ONLINE:
6289 case CPU_ONLINE_FROZEN:
6290 /* Update our root-domain */ 6086 /* Update our root-domain */
6291 raw_spin_lock_irqsave(&rq->lock, flags); 6087 raw_spin_lock_irqsave(&rq->lock, flags);
6292 if (rq->rd) { 6088 if (rq->rd) {
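
Masking the notifier action with ~CPU_TASKS_FROZEN is what lets the *_FROZEN case labels disappear in this and the following hunk: one label now handles both the normal and the suspend/resume variant of each event. A user-space sketch; the constants are local copies that mimic the kernel's encoding:

#include <stdio.h>

#define CPU_ONLINE		0x0002
#define CPU_UP_PREPARE		0x0003
#define CPU_TASKS_FROZEN	0x0010

static const char *decode(unsigned long action)
{
	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_UP_PREPARE:	return "up-prepare";
	case CPU_ONLINE:	return "online";
	default:		return "other";
	}
}

int main(void)
{
	printf("%s\n", decode(CPU_ONLINE));			/* online */
	printf("%s\n", decode(CPU_ONLINE | CPU_TASKS_FROZEN));	/* online */
	return 0;
}
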
@@ -6298,30 +6094,19 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6298 break; 6094 break;
6299 6095
6300#ifdef CONFIG_HOTPLUG_CPU 6096#ifdef CONFIG_HOTPLUG_CPU
6301 case CPU_DEAD:
6302 case CPU_DEAD_FROZEN:
6303 migrate_live_tasks(cpu);
6304 /* Idle task back to normal (off runqueue, low prio) */
6305 raw_spin_lock_irq(&rq->lock);
6306 deactivate_task(rq, rq->idle, 0);
6307 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
6308 rq->idle->sched_class = &idle_sched_class;
6309 migrate_dead_tasks(cpu);
6310 raw_spin_unlock_irq(&rq->lock);
6311 migrate_nr_uninterruptible(rq);
6312 BUG_ON(rq->nr_running != 0);
6313 calc_global_load_remove(rq);
6314 break;
6315
6316 case CPU_DYING: 6097 case CPU_DYING:
6317 case CPU_DYING_FROZEN:
6318 /* Update our root-domain */ 6098 /* Update our root-domain */
6319 raw_spin_lock_irqsave(&rq->lock, flags); 6099 raw_spin_lock_irqsave(&rq->lock, flags);
6320 if (rq->rd) { 6100 if (rq->rd) {
6321 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 6101 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
6322 set_rq_offline(rq); 6102 set_rq_offline(rq);
6323 } 6103 }
6104 migrate_tasks(cpu);
6105 BUG_ON(rq->nr_running != 1); /* the migration thread */
6324 raw_spin_unlock_irqrestore(&rq->lock, flags); 6106 raw_spin_unlock_irqrestore(&rq->lock, flags);
6107
6108 migrate_nr_uninterruptible(rq);
6109 calc_global_load_remove(rq);
6325 break; 6110 break;
6326#endif 6111#endif
6327 } 6112 }
@@ -8052,18 +7837,16 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
8052 7837
8053#ifdef CONFIG_FAIR_GROUP_SCHED 7838#ifdef CONFIG_FAIR_GROUP_SCHED
8054static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, 7839static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
8055 struct sched_entity *se, int cpu, int add, 7840 struct sched_entity *se, int cpu,
8056 struct sched_entity *parent) 7841 struct sched_entity *parent)
8057{ 7842{
8058 struct rq *rq = cpu_rq(cpu); 7843 struct rq *rq = cpu_rq(cpu);
8059 tg->cfs_rq[cpu] = cfs_rq; 7844 tg->cfs_rq[cpu] = cfs_rq;
8060 init_cfs_rq(cfs_rq, rq); 7845 init_cfs_rq(cfs_rq, rq);
8061 cfs_rq->tg = tg; 7846 cfs_rq->tg = tg;
8062 if (add)
8063 list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
8064 7847
8065 tg->se[cpu] = se; 7848 tg->se[cpu] = se;
8066 /* se could be NULL for init_task_group */ 7849 /* se could be NULL for root_task_group */
8067 if (!se) 7850 if (!se)
8068 return; 7851 return;
8069 7852
@@ -8073,15 +7856,14 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
8073 se->cfs_rq = parent->my_q; 7856 se->cfs_rq = parent->my_q;
8074 7857
8075 se->my_q = cfs_rq; 7858 se->my_q = cfs_rq;
8076 se->load.weight = tg->shares; 7859 update_load_set(&se->load, 0);
8077 se->load.inv_weight = 0;
8078 se->parent = parent; 7860 se->parent = parent;
8079} 7861}
8080#endif 7862#endif
8081 7863
8082#ifdef CONFIG_RT_GROUP_SCHED 7864#ifdef CONFIG_RT_GROUP_SCHED
8083static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, 7865static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
8084 struct sched_rt_entity *rt_se, int cpu, int add, 7866 struct sched_rt_entity *rt_se, int cpu,
8085 struct sched_rt_entity *parent) 7867 struct sched_rt_entity *parent)
8086{ 7868{
8087 struct rq *rq = cpu_rq(cpu); 7869 struct rq *rq = cpu_rq(cpu);
@@ -8090,8 +7872,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
8090 init_rt_rq(rt_rq, rq); 7872 init_rt_rq(rt_rq, rq);
8091 rt_rq->tg = tg; 7873 rt_rq->tg = tg;
8092 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; 7874 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
8093 if (add)
8094 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
8095 7875
8096 tg->rt_se[cpu] = rt_se; 7876 tg->rt_se[cpu] = rt_se;
8097 if (!rt_se) 7877 if (!rt_se)
@@ -8126,18 +7906,18 @@ void __init sched_init(void)
8126 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); 7906 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
8127 7907
8128#ifdef CONFIG_FAIR_GROUP_SCHED 7908#ifdef CONFIG_FAIR_GROUP_SCHED
8129 init_task_group.se = (struct sched_entity **)ptr; 7909 root_task_group.se = (struct sched_entity **)ptr;
8130 ptr += nr_cpu_ids * sizeof(void **); 7910 ptr += nr_cpu_ids * sizeof(void **);
8131 7911
8132 init_task_group.cfs_rq = (struct cfs_rq **)ptr; 7912 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
8133 ptr += nr_cpu_ids * sizeof(void **); 7913 ptr += nr_cpu_ids * sizeof(void **);
8134 7914
8135#endif /* CONFIG_FAIR_GROUP_SCHED */ 7915#endif /* CONFIG_FAIR_GROUP_SCHED */
8136#ifdef CONFIG_RT_GROUP_SCHED 7916#ifdef CONFIG_RT_GROUP_SCHED
8137 init_task_group.rt_se = (struct sched_rt_entity **)ptr; 7917 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
8138 ptr += nr_cpu_ids * sizeof(void **); 7918 ptr += nr_cpu_ids * sizeof(void **);
8139 7919
8140 init_task_group.rt_rq = (struct rt_rq **)ptr; 7920 root_task_group.rt_rq = (struct rt_rq **)ptr;
8141 ptr += nr_cpu_ids * sizeof(void **); 7921 ptr += nr_cpu_ids * sizeof(void **);
8142 7922
8143#endif /* CONFIG_RT_GROUP_SCHED */ 7923#endif /* CONFIG_RT_GROUP_SCHED */
@@ -8157,20 +7937,16 @@ void __init sched_init(void)
8157 global_rt_period(), global_rt_runtime()); 7937 global_rt_period(), global_rt_runtime());
8158 7938
8159#ifdef CONFIG_RT_GROUP_SCHED 7939#ifdef CONFIG_RT_GROUP_SCHED
8160 init_rt_bandwidth(&init_task_group.rt_bandwidth, 7940 init_rt_bandwidth(&root_task_group.rt_bandwidth,
8161 global_rt_period(), global_rt_runtime()); 7941 global_rt_period(), global_rt_runtime());
8162#endif /* CONFIG_RT_GROUP_SCHED */ 7942#endif /* CONFIG_RT_GROUP_SCHED */
8163 7943
8164#ifdef CONFIG_CGROUP_SCHED 7944#ifdef CONFIG_CGROUP_SCHED
8165 list_add(&init_task_group.list, &task_groups); 7945 list_add(&root_task_group.list, &task_groups);
8166 INIT_LIST_HEAD(&init_task_group.children); 7946 INIT_LIST_HEAD(&root_task_group.children);
8167 7947 autogroup_init(&init_task);
8168#endif /* CONFIG_CGROUP_SCHED */ 7948#endif /* CONFIG_CGROUP_SCHED */
8169 7949
8170#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
8171 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
8172 __alignof__(unsigned long));
8173#endif
8174 for_each_possible_cpu(i) { 7950 for_each_possible_cpu(i) {
8175 struct rq *rq; 7951 struct rq *rq;
8176 7952
@@ -8182,38 +7958,34 @@ void __init sched_init(void)
8182 init_cfs_rq(&rq->cfs, rq); 7958 init_cfs_rq(&rq->cfs, rq);
8183 init_rt_rq(&rq->rt, rq); 7959 init_rt_rq(&rq->rt, rq);
8184#ifdef CONFIG_FAIR_GROUP_SCHED 7960#ifdef CONFIG_FAIR_GROUP_SCHED
8185 init_task_group.shares = init_task_group_load; 7961 root_task_group.shares = root_task_group_load;
8186 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 7962 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
8187#ifdef CONFIG_CGROUP_SCHED
8188 /* 7963 /*
8189 * How much cpu bandwidth does init_task_group get? 7964 * How much cpu bandwidth does root_task_group get?
8190 * 7965 *
8191 * In case of task-groups formed thr' the cgroup filesystem, it 7966 * In case of task-groups formed thr' the cgroup filesystem, it
8192 * gets 100% of the cpu resources in the system. This overall 7967 * gets 100% of the cpu resources in the system. This overall
8193 * system cpu resource is divided among the tasks of 7968 * system cpu resource is divided among the tasks of
8194 * init_task_group and its child task-groups in a fair manner, 7969 * root_task_group and its child task-groups in a fair manner,
8195 * based on each entity's (task or task-group's) weight 7970 * based on each entity's (task or task-group's) weight
8196 * (se->load.weight). 7971 * (se->load.weight).
8197 * 7972 *
8198 * In other words, if init_task_group has 10 tasks of weight 7973 * In other words, if root_task_group has 10 tasks of weight
8199 * 1024) and two child groups A0 and A1 (of weight 1024 each), 7974 * 1024) and two child groups A0 and A1 (of weight 1024 each),
8200 * then A0's share of the cpu resource is: 7975 * then A0's share of the cpu resource is:
8201 * 7976 *
8202 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% 7977 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
8203 * 7978 *
8204 * We achieve this by letting init_task_group's tasks sit 7979 * We achieve this by letting root_task_group's tasks sit
8205 * directly in rq->cfs (i.e init_task_group->se[] = NULL). 7980 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
8206 */ 7981 */
8207 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); 7982 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
8208#endif
8209#endif /* CONFIG_FAIR_GROUP_SCHED */ 7983#endif /* CONFIG_FAIR_GROUP_SCHED */
8210 7984
8211 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; 7985 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
8212#ifdef CONFIG_RT_GROUP_SCHED 7986#ifdef CONFIG_RT_GROUP_SCHED
8213 INIT_LIST_HEAD(&rq->leaf_rt_rq_list); 7987 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
8214#ifdef CONFIG_CGROUP_SCHED 7988 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
8215 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
8216#endif
8217#endif 7989#endif
8218 7990
8219 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 7991 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
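
As a sanity check on the figure quoted in the comment above, here is the same arithmetic in a standalone program: ten weight-1024 tasks in root_task_group plus two child groups A0 and A1 of weight 1024 each.

#include <stdio.h>

int main(void)
{
	double tasks = 10 * 1024.0;		/* ten weight-1024 tasks in the root group */
	double a0 = 1024.0, a1 = 1024.0;	/* two child groups, weight 1024 each */

	printf("A0's bandwidth = %.2f%%\n", 100.0 * a0 / (tasks + a0 + a1));
	return 0;
}
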
@@ -8293,8 +8065,6 @@ void __init sched_init(void)
8293 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 8065 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
8294#endif /* SMP */ 8066#endif /* SMP */
8295 8067
8296 perf_event_init();
8297
8298 scheduler_running = 1; 8068 scheduler_running = 1;
8299} 8069}
8300 8070
@@ -8488,7 +8258,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8488 if (!se) 8258 if (!se)
8489 goto err_free_rq; 8259 goto err_free_rq;
8490 8260
8491 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); 8261 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
8492 } 8262 }
8493 8263
8494 return 1; 8264 return 1;
@@ -8499,15 +8269,21 @@ err:
8499 return 0; 8269 return 0;
8500} 8270}
8501 8271
8502static inline void register_fair_sched_group(struct task_group *tg, int cpu)
8503{
8504 list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list,
8505 &cpu_rq(cpu)->leaf_cfs_rq_list);
8506}
8507
8508static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8272static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8509{ 8273{
8510 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); 8274 struct rq *rq = cpu_rq(cpu);
8275 unsigned long flags;
8276
8277 /*
8278 * Only empty task groups can be destroyed; so we can speculatively
8279 * check on_list without danger of it being re-added.
8280 */
8281 if (!tg->cfs_rq[cpu]->on_list)
8282 return;
8283
8284 raw_spin_lock_irqsave(&rq->lock, flags);
8285 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
8286 raw_spin_unlock_irqrestore(&rq->lock, flags);
8511} 8287}
8512#else /* !CONFG_FAIR_GROUP_SCHED */ 8288#else /* !CONFG_FAIR_GROUP_SCHED */
8513static inline void free_fair_sched_group(struct task_group *tg) 8289static inline void free_fair_sched_group(struct task_group *tg)
@@ -8520,10 +8296,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8520 return 1; 8296 return 1;
8521} 8297}
8522 8298
8523static inline void register_fair_sched_group(struct task_group *tg, int cpu)
8524{
8525}
8526
8527static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8299static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8528{ 8300{
8529} 8301}
@@ -8578,7 +8350,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8578 if (!rt_se) 8350 if (!rt_se)
8579 goto err_free_rq; 8351 goto err_free_rq;
8580 8352
8581 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); 8353 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
8582 } 8354 }
8583 8355
8584 return 1; 8356 return 1;
@@ -8588,17 +8360,6 @@ err_free_rq:
8588err: 8360err:
8589 return 0; 8361 return 0;
8590} 8362}
8591
8592static inline void register_rt_sched_group(struct task_group *tg, int cpu)
8593{
8594 list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list,
8595 &cpu_rq(cpu)->leaf_rt_rq_list);
8596}
8597
8598static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8599{
8600 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
8601}
8602#else /* !CONFIG_RT_GROUP_SCHED */ 8363#else /* !CONFIG_RT_GROUP_SCHED */
8603static inline void free_rt_sched_group(struct task_group *tg) 8364static inline void free_rt_sched_group(struct task_group *tg)
8604{ 8365{
@@ -8609,14 +8370,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8609{ 8370{
8610 return 1; 8371 return 1;
8611} 8372}
8612
8613static inline void register_rt_sched_group(struct task_group *tg, int cpu)
8614{
8615}
8616
8617static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8618{
8619}
8620#endif /* CONFIG_RT_GROUP_SCHED */ 8373#endif /* CONFIG_RT_GROUP_SCHED */
8621 8374
8622#ifdef CONFIG_CGROUP_SCHED 8375#ifdef CONFIG_CGROUP_SCHED
@@ -8624,6 +8377,7 @@ static void free_sched_group(struct task_group *tg)
8624{ 8377{
8625 free_fair_sched_group(tg); 8378 free_fair_sched_group(tg);
8626 free_rt_sched_group(tg); 8379 free_rt_sched_group(tg);
8380 autogroup_free(tg);
8627 kfree(tg); 8381 kfree(tg);
8628} 8382}
8629 8383
@@ -8632,7 +8386,6 @@ struct task_group *sched_create_group(struct task_group *parent)
8632{ 8386{
8633 struct task_group *tg; 8387 struct task_group *tg;
8634 unsigned long flags; 8388 unsigned long flags;
8635 int i;
8636 8389
8637 tg = kzalloc(sizeof(*tg), GFP_KERNEL); 8390 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
8638 if (!tg) 8391 if (!tg)
@@ -8645,10 +8398,6 @@ struct task_group *sched_create_group(struct task_group *parent)
8645 goto err; 8398 goto err;
8646 8399
8647 spin_lock_irqsave(&task_group_lock, flags); 8400 spin_lock_irqsave(&task_group_lock, flags);
8648 for_each_possible_cpu(i) {
8649 register_fair_sched_group(tg, i);
8650 register_rt_sched_group(tg, i);
8651 }
8652 list_add_rcu(&tg->list, &task_groups); 8401 list_add_rcu(&tg->list, &task_groups);
8653 8402
8654 WARN_ON(!parent); /* root should already exist */ 8403 WARN_ON(!parent); /* root should already exist */
@@ -8678,11 +8427,11 @@ void sched_destroy_group(struct task_group *tg)
8678 unsigned long flags; 8427 unsigned long flags;
8679 int i; 8428 int i;
8680 8429
8681 spin_lock_irqsave(&task_group_lock, flags); 8430 /* end participation in shares distribution */
8682 for_each_possible_cpu(i) { 8431 for_each_possible_cpu(i)
8683 unregister_fair_sched_group(tg, i); 8432 unregister_fair_sched_group(tg, i);
8684 unregister_rt_sched_group(tg, i); 8433
8685 } 8434 spin_lock_irqsave(&task_group_lock, flags);
8686 list_del_rcu(&tg->list); 8435 list_del_rcu(&tg->list);
8687 list_del_rcu(&tg->siblings); 8436 list_del_rcu(&tg->siblings);
8688 spin_unlock_irqrestore(&task_group_lock, flags); 8437 spin_unlock_irqrestore(&task_group_lock, flags);
@@ -8729,33 +8478,6 @@ void sched_move_task(struct task_struct *tsk)
8729#endif /* CONFIG_CGROUP_SCHED */ 8478#endif /* CONFIG_CGROUP_SCHED */
8730 8479
8731#ifdef CONFIG_FAIR_GROUP_SCHED 8480#ifdef CONFIG_FAIR_GROUP_SCHED
8732static void __set_se_shares(struct sched_entity *se, unsigned long shares)
8733{
8734 struct cfs_rq *cfs_rq = se->cfs_rq;
8735 int on_rq;
8736
8737 on_rq = se->on_rq;
8738 if (on_rq)
8739 dequeue_entity(cfs_rq, se, 0);
8740
8741 se->load.weight = shares;
8742 se->load.inv_weight = 0;
8743
8744 if (on_rq)
8745 enqueue_entity(cfs_rq, se, 0);
8746}
8747
8748static void set_se_shares(struct sched_entity *se, unsigned long shares)
8749{
8750 struct cfs_rq *cfs_rq = se->cfs_rq;
8751 struct rq *rq = cfs_rq->rq;
8752 unsigned long flags;
8753
8754 raw_spin_lock_irqsave(&rq->lock, flags);
8755 __set_se_shares(se, shares);
8756 raw_spin_unlock_irqrestore(&rq->lock, flags);
8757}
8758
8759static DEFINE_MUTEX(shares_mutex); 8481static DEFINE_MUTEX(shares_mutex);
8760 8482
8761int sched_group_set_shares(struct task_group *tg, unsigned long shares) 8483int sched_group_set_shares(struct task_group *tg, unsigned long shares)
@@ -8778,37 +8500,19 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8778 if (tg->shares == shares) 8500 if (tg->shares == shares)
8779 goto done; 8501 goto done;
8780 8502
8781 spin_lock_irqsave(&task_group_lock, flags);
8782 for_each_possible_cpu(i)
8783 unregister_fair_sched_group(tg, i);
8784 list_del_rcu(&tg->siblings);
8785 spin_unlock_irqrestore(&task_group_lock, flags);
8786
8787 /* wait for any ongoing reference to this group to finish */
8788 synchronize_sched();
8789
8790 /*
8791 * Now we are free to modify the group's share on each cpu
8792 * w/o tripping rebalance_share or load_balance_fair.
8793 */
8794 tg->shares = shares; 8503 tg->shares = shares;
8795 for_each_possible_cpu(i) { 8504 for_each_possible_cpu(i) {
8796 /* 8505 struct rq *rq = cpu_rq(i);
8797 * force a rebalance 8506 struct sched_entity *se;
8798 */ 8507
8799 cfs_rq_set_shares(tg->cfs_rq[i], 0); 8508 se = tg->se[i];
8800 set_se_shares(tg->se[i], shares); 8509 /* Propagate contribution to hierarchy */
8510 raw_spin_lock_irqsave(&rq->lock, flags);
8511 for_each_sched_entity(se)
8512 update_cfs_shares(group_cfs_rq(se), 0);
8513 raw_spin_unlock_irqrestore(&rq->lock, flags);
8801 } 8514 }
8802 8515
8803 /*
8804 * Enable load balance activity on this group, by inserting it back on
8805 * each cpu's rq->leaf_cfs_rq_list.
8806 */
8807 spin_lock_irqsave(&task_group_lock, flags);
8808 for_each_possible_cpu(i)
8809 register_fair_sched_group(tg, i);
8810 list_add_rcu(&tg->siblings, &tg->parent->children);
8811 spin_unlock_irqrestore(&task_group_lock, flags);
8812done: 8516done:
8813 mutex_unlock(&shares_mutex); 8517 mutex_unlock(&shares_mutex);
8814 return 0; 8518 return 0;
@@ -9107,7 +8811,7 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
9107 8811
9108 if (!cgrp->parent) { 8812 if (!cgrp->parent) {
9109 /* This is early initialization for the top cgroup */ 8813 /* This is early initialization for the top cgroup */
9110 return &init_task_group.css; 8814 return &root_task_group.css;
9111 } 8815 }
9112 8816
9113 parent = cgroup_tg(cgrp->parent); 8817 parent = cgroup_tg(cgrp->parent);
@@ -9534,72 +9238,3 @@ struct cgroup_subsys cpuacct_subsys = {
9534}; 9238};
9535#endif /* CONFIG_CGROUP_CPUACCT */ 9239#endif /* CONFIG_CGROUP_CPUACCT */
9536 9240
9537#ifndef CONFIG_SMP
9538
9539void synchronize_sched_expedited(void)
9540{
9541 barrier();
9542}
9543EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
9544
9545#else /* #ifndef CONFIG_SMP */
9546
9547static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);
9548
9549static int synchronize_sched_expedited_cpu_stop(void *data)
9550{
9551 /*
9552 * There must be a full memory barrier on each affected CPU
9553 * between the time that try_stop_cpus() is called and the
9554 * time that it returns.
9555 *
9556 * In the current initial implementation of cpu_stop, the
9557 * above condition is already met when the control reaches
9558 * this point and the following smp_mb() is not strictly
9559 * necessary. Do smp_mb() anyway for documentation and
9560 * robustness against future implementation changes.
9561 */
9562 smp_mb(); /* See above comment block. */
9563 return 0;
9564}
9565
9566/*
9567 * Wait for an rcu-sched grace period to elapse, but use "big hammer"
9568 * approach to force grace period to end quickly. This consumes
9569 * significant time on all CPUs, and is thus not recommended for
9570 * any sort of common-case code.
9571 *
9572 * Note that it is illegal to call this function while holding any
9573 * lock that is acquired by a CPU-hotplug notifier. Failing to
9574 * observe this restriction will result in deadlock.
9575 */
9576void synchronize_sched_expedited(void)
9577{
9578 int snap, trycount = 0;
9579
9580 smp_mb(); /* ensure prior mod happens before capturing snap. */
9581 snap = atomic_read(&synchronize_sched_expedited_count) + 1;
9582 get_online_cpus();
9583 while (try_stop_cpus(cpu_online_mask,
9584 synchronize_sched_expedited_cpu_stop,
9585 NULL) == -EAGAIN) {
9586 put_online_cpus();
9587 if (trycount++ < 10)
9588 udelay(trycount * num_online_cpus());
9589 else {
9590 synchronize_sched();
9591 return;
9592 }
9593 if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) {
9594 smp_mb(); /* ensure test happens before caller kfree */
9595 return;
9596 }
9597 get_online_cpus();
9598 }
9599 atomic_inc(&synchronize_sched_expedited_count);
9600 smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */
9601 put_online_cpus();
9602}
9603EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
9604
9605#endif /* #else #ifndef CONFIG_SMP */
diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c
new file mode 100644
index 000000000000..32a723b8f84c
--- /dev/null
+++ b/kernel/sched_autogroup.c
@@ -0,0 +1,238 @@
1#ifdef CONFIG_SCHED_AUTOGROUP
2
3#include <linux/proc_fs.h>
4#include <linux/seq_file.h>
5#include <linux/kallsyms.h>
6#include <linux/utsname.h>
7
8unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
9static struct autogroup autogroup_default;
10static atomic_t autogroup_seq_nr;
11
12static void __init autogroup_init(struct task_struct *init_task)
13{
14 autogroup_default.tg = &root_task_group;
15 root_task_group.autogroup = &autogroup_default;
16 kref_init(&autogroup_default.kref);
17 init_rwsem(&autogroup_default.lock);
18 init_task->signal->autogroup = &autogroup_default;
19}
20
21static inline void autogroup_free(struct task_group *tg)
22{
23 kfree(tg->autogroup);
24}
25
26static inline void autogroup_destroy(struct kref *kref)
27{
28 struct autogroup *ag = container_of(kref, struct autogroup, kref);
29
30 sched_destroy_group(ag->tg);
31}
32
33static inline void autogroup_kref_put(struct autogroup *ag)
34{
35 kref_put(&ag->kref, autogroup_destroy);
36}
37
38static inline struct autogroup *autogroup_kref_get(struct autogroup *ag)
39{
40 kref_get(&ag->kref);
41 return ag;
42}
43
44static inline struct autogroup *autogroup_task_get(struct task_struct *p)
45{
46 struct autogroup *ag;
47 unsigned long flags;
48
49 if (!lock_task_sighand(p, &flags))
50 return autogroup_kref_get(&autogroup_default);
51
52 ag = autogroup_kref_get(p->signal->autogroup);
53 unlock_task_sighand(p, &flags);
54
55 return ag;
56}
57
58static inline struct autogroup *autogroup_create(void)
59{
60 struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL);
61 struct task_group *tg;
62
63 if (!ag)
64 goto out_fail;
65
66 tg = sched_create_group(&root_task_group);
67
68 if (IS_ERR(tg))
69 goto out_free;
70
71 kref_init(&ag->kref);
72 init_rwsem(&ag->lock);
73 ag->id = atomic_inc_return(&autogroup_seq_nr);
74 ag->tg = tg;
75 tg->autogroup = ag;
76
77 return ag;
78
79out_free:
80 kfree(ag);
81out_fail:
82 if (printk_ratelimit()) {
83 printk(KERN_WARNING "autogroup_create: %s failure.\n",
84 ag ? "sched_create_group()" : "kmalloc()");
85 }
86
87 return autogroup_kref_get(&autogroup_default);
88}
89
90static inline bool
91task_wants_autogroup(struct task_struct *p, struct task_group *tg)
92{
93 if (tg != &root_task_group)
94 return false;
95
96 if (p->sched_class != &fair_sched_class)
97 return false;
98
99 /*
100 * We can only assume the task group can't go away on us if
101 * autogroup_move_group() can see us on ->thread_group list.
102 */
103 if (p->flags & PF_EXITING)
104 return false;
105
106 return true;
107}
108
109static inline struct task_group *
110autogroup_task_group(struct task_struct *p, struct task_group *tg)
111{
112 int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
113
114 if (enabled && task_wants_autogroup(p, tg))
115 return p->signal->autogroup->tg;
116
117 return tg;
118}
119
120static void
121autogroup_move_group(struct task_struct *p, struct autogroup *ag)
122{
123 struct autogroup *prev;
124 struct task_struct *t;
125 unsigned long flags;
126
127 BUG_ON(!lock_task_sighand(p, &flags));
128
129 prev = p->signal->autogroup;
130 if (prev == ag) {
131 unlock_task_sighand(p, &flags);
132 return;
133 }
134
135 p->signal->autogroup = autogroup_kref_get(ag);
136
137 t = p;
138 do {
139 sched_move_task(t);
140 } while_each_thread(p, t);
141
142 unlock_task_sighand(p, &flags);
143 autogroup_kref_put(prev);
144}
145
146/* Allocates GFP_KERNEL, cannot be called under any spinlock */
147void sched_autogroup_create_attach(struct task_struct *p)
148{
149 struct autogroup *ag = autogroup_create();
150
151 autogroup_move_group(p, ag);
 152 /* drop extra reference added by autogroup_create() */
153 autogroup_kref_put(ag);
154}
155EXPORT_SYMBOL(sched_autogroup_create_attach);
156
157/* Cannot be called under siglock. Currently has no users */
158void sched_autogroup_detach(struct task_struct *p)
159{
160 autogroup_move_group(p, &autogroup_default);
161}
162EXPORT_SYMBOL(sched_autogroup_detach);
163
164void sched_autogroup_fork(struct signal_struct *sig)
165{
166 sig->autogroup = autogroup_task_get(current);
167}
168
169void sched_autogroup_exit(struct signal_struct *sig)
170{
171 autogroup_kref_put(sig->autogroup);
172}
173
174static int __init setup_autogroup(char *str)
175{
176 sysctl_sched_autogroup_enabled = 0;
177
178 return 1;
179}
180
181__setup("noautogroup", setup_autogroup);
182
183#ifdef CONFIG_PROC_FS
184
185int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice)
186{
187 static unsigned long next = INITIAL_JIFFIES;
188 struct autogroup *ag;
189 int err;
190
191 if (*nice < -20 || *nice > 19)
192 return -EINVAL;
193
194 err = security_task_setnice(current, *nice);
195 if (err)
196 return err;
197
198 if (*nice < 0 && !can_nice(current, *nice))
199 return -EPERM;
200
201 /* this is a heavy operation taking global locks.. */
202 if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next))
203 return -EAGAIN;
204
205 next = HZ / 10 + jiffies;
206 ag = autogroup_task_get(p);
207
208 down_write(&ag->lock);
209 err = sched_group_set_shares(ag->tg, prio_to_weight[*nice + 20]);
210 if (!err)
211 ag->nice = *nice;
212 up_write(&ag->lock);
213
214 autogroup_kref_put(ag);
215
216 return err;
217}
218
219void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m)
220{
221 struct autogroup *ag = autogroup_task_get(p);
222
223 down_read(&ag->lock);
224 seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice);
225 up_read(&ag->lock);
226
227 autogroup_kref_put(ag);
228}
229#endif /* CONFIG_PROC_FS */
230
231#ifdef CONFIG_SCHED_DEBUG
232static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
233{
234 return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
235}
236#endif /* CONFIG_SCHED_DEBUG */
237
238#endif /* CONFIG_SCHED_AUTOGROUP */
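
proc_sched_autogroup_set_nice() maps the requested nice level onto group shares through prio_to_weight[nice + 20], so a reniced autogroup competes like a single task of that nice level. A user-space sketch using a small slice of that table around nice 0; the values are copied for illustration and should be checked against the scheduler's table:

#include <stdio.h>

/* nice -2 .. +2 slice of the scheduler's prio_to_weight[] table */
static const int weight_sample[] = { 1586, 1277, 1024, 820, 655 };

int main(void)
{
	int nice;

	for (nice = -2; nice <= 2; nice++)
		printf("autogroup nice %+d -> %d shares\n",
		       nice, weight_sample[nice + 2]);
	return 0;
}
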
diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h
new file mode 100644
index 000000000000..5358e241cb20
--- /dev/null
+++ b/kernel/sched_autogroup.h
@@ -0,0 +1,32 @@
1#ifdef CONFIG_SCHED_AUTOGROUP
2
3struct autogroup {
4 struct kref kref;
5 struct task_group *tg;
6 struct rw_semaphore lock;
7 unsigned long id;
8 int nice;
9};
10
11static inline struct task_group *
12autogroup_task_group(struct task_struct *p, struct task_group *tg);
13
14#else /* !CONFIG_SCHED_AUTOGROUP */
15
16static inline void autogroup_init(struct task_struct *init_task) { }
17static inline void autogroup_free(struct task_group *tg) { }
18
19static inline struct task_group *
20autogroup_task_group(struct task_struct *p, struct task_group *tg)
21{
22 return tg;
23}
24
25#ifdef CONFIG_SCHED_DEBUG
26static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
27{
28 return 0;
29}
30#endif
31
32#endif /* CONFIG_SCHED_AUTOGROUP */
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 52f1a149bfb1..9d8af0b3fb64 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -79,7 +79,7 @@ unsigned long long __attribute__((weak)) sched_clock(void)
79} 79}
80EXPORT_SYMBOL_GPL(sched_clock); 80EXPORT_SYMBOL_GPL(sched_clock);
81 81
82static __read_mostly int sched_clock_running; 82__read_mostly int sched_clock_running;
83 83
84#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK 84#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
85__read_mostly int sched_clock_stable; 85__read_mostly int sched_clock_stable;
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 2e1b0d17dd9b..1dfae3d014b5 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -54,8 +54,7 @@ static unsigned long nsec_low(unsigned long long nsec)
54#define SPLIT_NS(x) nsec_high(x), nsec_low(x) 54#define SPLIT_NS(x) nsec_high(x), nsec_low(x)
55 55
56#ifdef CONFIG_FAIR_GROUP_SCHED 56#ifdef CONFIG_FAIR_GROUP_SCHED
57static void print_cfs_group_stats(struct seq_file *m, int cpu, 57static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg)
58 struct task_group *tg)
59{ 58{
60 struct sched_entity *se = tg->se[cpu]; 59 struct sched_entity *se = tg->se[cpu];
61 if (!se) 60 if (!se)
@@ -110,16 +109,6 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
110 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); 109 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
111#endif 110#endif
112 111
113#ifdef CONFIG_CGROUP_SCHED
114 {
115 char path[64];
116
117 rcu_read_lock();
118 cgroup_path(task_group(p)->css.cgroup, path, sizeof(path));
119 rcu_read_unlock();
120 SEQ_printf(m, " %s", path);
121 }
122#endif
123 SEQ_printf(m, "\n"); 112 SEQ_printf(m, "\n");
124} 113}
125 114
@@ -147,19 +136,6 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
147 read_unlock_irqrestore(&tasklist_lock, flags); 136 read_unlock_irqrestore(&tasklist_lock, flags);
148} 137}
149 138
150#if defined(CONFIG_CGROUP_SCHED) && \
151 (defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED))
152static void task_group_path(struct task_group *tg, char *buf, int buflen)
153{
154 /* may be NULL if the underlying cgroup isn't fully-created yet */
155 if (!tg->css.cgroup) {
156 buf[0] = '\0';
157 return;
158 }
159 cgroup_path(tg->css.cgroup, buf, buflen);
160}
161#endif
162
163void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) 139void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
164{ 140{
165 s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, 141 s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
@@ -168,16 +144,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
168 struct sched_entity *last; 144 struct sched_entity *last;
169 unsigned long flags; 145 unsigned long flags;
170 146
171#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
172 char path[128];
173 struct task_group *tg = cfs_rq->tg;
174
175 task_group_path(tg, path, sizeof(path));
176
177 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
178#else
179 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); 147 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
180#endif
181 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", 148 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
182 SPLIT_NS(cfs_rq->exec_clock)); 149 SPLIT_NS(cfs_rq->exec_clock));
183 150
@@ -202,32 +169,29 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
202 spread0 = min_vruntime - rq0_min_vruntime; 169 spread0 = min_vruntime - rq0_min_vruntime;
203 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0", 170 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0",
204 SPLIT_NS(spread0)); 171 SPLIT_NS(spread0));
205 SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
206 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
207
208 SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", 172 SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
209 cfs_rq->nr_spread_over); 173 cfs_rq->nr_spread_over);
174 SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
175 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
210#ifdef CONFIG_FAIR_GROUP_SCHED 176#ifdef CONFIG_FAIR_GROUP_SCHED
211#ifdef CONFIG_SMP 177#ifdef CONFIG_SMP
212 SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares); 178 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_avg",
179 SPLIT_NS(cfs_rq->load_avg));
180 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_period",
181 SPLIT_NS(cfs_rq->load_period));
182 SEQ_printf(m, " .%-30s: %ld\n", "load_contrib",
183 cfs_rq->load_contribution);
184 SEQ_printf(m, " .%-30s: %d\n", "load_tg",
185 atomic_read(&cfs_rq->tg->load_weight));
213#endif 186#endif
187
214 print_cfs_group_stats(m, cpu, cfs_rq->tg); 188 print_cfs_group_stats(m, cpu, cfs_rq->tg);
215#endif 189#endif
216} 190}
217 191
218void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) 192void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
219{ 193{
220#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED)
221 char path[128];
222 struct task_group *tg = rt_rq->tg;
223
224 task_group_path(tg, path, sizeof(path));
225
226 SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path);
227#else
228 SEQ_printf(m, "\nrt_rq[%d]:\n", cpu); 194 SEQ_printf(m, "\nrt_rq[%d]:\n", cpu);
229#endif
230
231 195
232#define P(x) \ 196#define P(x) \
233 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x)) 197 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x))
@@ -243,6 +207,8 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
243#undef P 207#undef P
244} 208}
245 209
210extern __read_mostly int sched_clock_running;
211
246static void print_cpu(struct seq_file *m, int cpu) 212static void print_cpu(struct seq_file *m, int cpu)
247{ 213{
248 struct rq *rq = cpu_rq(cpu); 214 struct rq *rq = cpu_rq(cpu);
@@ -314,21 +280,42 @@ static const char *sched_tunable_scaling_names[] = {
314 280
315static int sched_debug_show(struct seq_file *m, void *v) 281static int sched_debug_show(struct seq_file *m, void *v)
316{ 282{
317 u64 now = ktime_to_ns(ktime_get()); 283 u64 ktime, sched_clk, cpu_clk;
284 unsigned long flags;
318 int cpu; 285 int cpu;
319 286
320 SEQ_printf(m, "Sched Debug Version: v0.09, %s %.*s\n", 287 local_irq_save(flags);
288 ktime = ktime_to_ns(ktime_get());
289 sched_clk = sched_clock();
290 cpu_clk = local_clock();
291 local_irq_restore(flags);
292
293 SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n",
321 init_utsname()->release, 294 init_utsname()->release,
322 (int)strcspn(init_utsname()->version, " "), 295 (int)strcspn(init_utsname()->version, " "),
323 init_utsname()->version); 296 init_utsname()->version);
324 297
325 SEQ_printf(m, "now at %Lu.%06ld msecs\n", SPLIT_NS(now)); 298#define P(x) \
299 SEQ_printf(m, "%-40s: %Ld\n", #x, (long long)(x))
300#define PN(x) \
301 SEQ_printf(m, "%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
302 PN(ktime);
303 PN(sched_clk);
304 PN(cpu_clk);
305 P(jiffies);
306#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
307 P(sched_clock_stable);
308#endif
309#undef PN
310#undef P
311
312 SEQ_printf(m, "\n");
313 SEQ_printf(m, "sysctl_sched\n");
326 314
327#define P(x) \ 315#define P(x) \
328 SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x)) 316 SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x))
329#define PN(x) \ 317#define PN(x) \
330 SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) 318 SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
331 P(jiffies);
332 PN(sysctl_sched_latency); 319 PN(sysctl_sched_latency);
333 PN(sysctl_sched_min_granularity); 320 PN(sysctl_sched_min_granularity);
334 PN(sysctl_sched_wakeup_granularity); 321 PN(sysctl_sched_wakeup_granularity);
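
The new header block in sched_debug_show() leans on the usual stringification trick: P() and PN() print an expression's own name next to its value, and PN() splits a nanosecond count into a millisecond.microsecond pair via SPLIT_NS(). A cut-down user-space version of the same macros:

#include <stdio.h>

#define SPLIT_NS(x)	((x) / 1000000ULL), (unsigned long)((x) % 1000000ULL)
#define P(x)	printf("%-40s: %lld\n", #x, (long long)(x))
#define PN(x)	printf("%-40s: %lld.%06lu\n", #x, (long long)SPLIT_NS(x))

int main(void)
{
	unsigned long long ktime = 123456789ULL;	/* nanoseconds */
	unsigned long jiffies = 10000UL;

	PN(ktime);	/* prints the name "ktime", then 123.456789 */
	P(jiffies);	/* prints the name "jiffies", then 10000 */
	return 0;
}
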
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 00ebd7686676..c62ebae65cf0 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -89,6 +89,13 @@ unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
89 89
90const_debug unsigned int sysctl_sched_migration_cost = 500000UL; 90const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
91 91
92/*
93 * The exponential sliding window over which load is averaged for shares
94 * distribution.
95 * (default: 10msec)
96 */
97unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
98
92static const struct sched_class fair_sched_class; 99static const struct sched_class fair_sched_class;
93 100
94/************************************************************** 101/**************************************************************
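
sysctl_sched_shares_window is the length of the averaging window; update_cfs_load(), added further down in this file, folds the accumulated period by repeated halving once it exceeds that window, so old load decays geometrically instead of being averaged uniformly. A user-space sketch of the folding with made-up numbers:

#include <stdio.h>

int main(void)
{
	unsigned long long window      = 10000000ULL;	/* 10ms window, in ns */
	unsigned long long load_period = 45000000ULL;	/* 45ms accumulated   */
	unsigned long long load_avg    = 4500000000ULL;	/* sum of load*delta  */

	/* same folding as the while loop in update_cfs_load() */
	while (load_period > window) {
		load_period /= 2;
		load_avg /= 2;
	}
	printf("period=%llu ns, avg=%llu\n", load_period, load_avg);
	return 0;
}
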
@@ -143,6 +150,36 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
143 return cfs_rq->tg->cfs_rq[this_cpu]; 150 return cfs_rq->tg->cfs_rq[this_cpu];
144} 151}
145 152
153static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
154{
155 if (!cfs_rq->on_list) {
156 /*
157 * Ensure we either appear before our parent (if already
158 * enqueued) or force our parent to appear after us when it is
159 * enqueued. The fact that we always enqueue bottom-up
160 * reduces this to two cases.
161 */
162 if (cfs_rq->tg->parent &&
163 cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
164 list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
165 &rq_of(cfs_rq)->leaf_cfs_rq_list);
166 } else {
167 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
168 &rq_of(cfs_rq)->leaf_cfs_rq_list);
169 }
170
171 cfs_rq->on_list = 1;
172 }
173}
174
175static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
176{
177 if (cfs_rq->on_list) {
178 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
179 cfs_rq->on_list = 0;
180 }
181}
182
146/* Iterate thr' all leaf cfs_rq's on a runqueue */ 183/* Iterate thr' all leaf cfs_rq's on a runqueue */
147#define for_each_leaf_cfs_rq(rq, cfs_rq) \ 184#define for_each_leaf_cfs_rq(rq, cfs_rq) \
148 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) 185 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
@@ -246,6 +283,14 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
246 return &cpu_rq(this_cpu)->cfs; 283 return &cpu_rq(this_cpu)->cfs;
247} 284}
248 285
286static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
287{
288}
289
290static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
291{
292}
293
249#define for_each_leaf_cfs_rq(rq, cfs_rq) \ 294#define for_each_leaf_cfs_rq(rq, cfs_rq) \
250 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) 295 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
251 296
@@ -417,7 +462,6 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
417 WRT_SYSCTL(sched_min_granularity); 462 WRT_SYSCTL(sched_min_granularity);
418 WRT_SYSCTL(sched_latency); 463 WRT_SYSCTL(sched_latency);
419 WRT_SYSCTL(sched_wakeup_granularity); 464 WRT_SYSCTL(sched_wakeup_granularity);
420 WRT_SYSCTL(sched_shares_ratelimit);
421#undef WRT_SYSCTL 465#undef WRT_SYSCTL
422 466
423 return 0; 467 return 0;
@@ -495,6 +539,9 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
495 return calc_delta_fair(sched_slice(cfs_rq, se), se); 539 return calc_delta_fair(sched_slice(cfs_rq, se), se);
496} 540}
497 541
542static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
543static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta);
544
498/* 545/*
499 * Update the current task's runtime statistics. Skip current tasks that 546 * Update the current task's runtime statistics. Skip current tasks that
500 * are not in our scheduling class. 547 * are not in our scheduling class.
@@ -514,6 +561,10 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
514 561
515 curr->vruntime += delta_exec_weighted; 562 curr->vruntime += delta_exec_weighted;
516 update_min_vruntime(cfs_rq); 563 update_min_vruntime(cfs_rq);
564
565#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
566 cfs_rq->load_unacc_exec_time += delta_exec;
567#endif
517} 568}
518 569
519static void update_curr(struct cfs_rq *cfs_rq) 570static void update_curr(struct cfs_rq *cfs_rq)
@@ -633,7 +684,6 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
633 list_add(&se->group_node, &cfs_rq->tasks); 684 list_add(&se->group_node, &cfs_rq->tasks);
634 } 685 }
635 cfs_rq->nr_running++; 686 cfs_rq->nr_running++;
636 se->on_rq = 1;
637} 687}
638 688
639static void 689static void
@@ -647,9 +697,140 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
647 list_del_init(&se->group_node); 697 list_del_init(&se->group_node);
648 } 698 }
649 cfs_rq->nr_running--; 699 cfs_rq->nr_running--;
650 se->on_rq = 0;
651} 700}
652 701
702#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
703static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
704 int global_update)
705{
706 struct task_group *tg = cfs_rq->tg;
707 long load_avg;
708
709 load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
710 load_avg -= cfs_rq->load_contribution;
711
712 if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) {
713 atomic_add(load_avg, &tg->load_weight);
714 cfs_rq->load_contribution += load_avg;
715 }
716}
717
718static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
719{
720 u64 period = sysctl_sched_shares_window;
721 u64 now, delta;
722 unsigned long load = cfs_rq->load.weight;
723
724 if (!cfs_rq)
725 return;
726
727 now = rq_of(cfs_rq)->clock;
728 delta = now - cfs_rq->load_stamp;
729
730 /* truncate load history at 4 idle periods */
731 if (cfs_rq->load_stamp > cfs_rq->load_last &&
732 now - cfs_rq->load_last > 4 * period) {
733 cfs_rq->load_period = 0;
734 cfs_rq->load_avg = 0;
735 }
736
737 cfs_rq->load_stamp = now;
738 cfs_rq->load_unacc_exec_time = 0;
739 cfs_rq->load_period += delta;
740 if (load) {
741 cfs_rq->load_last = now;
742 cfs_rq->load_avg += delta * load;
743 }
744
745 /* consider updating load contribution on each fold or truncate */
746 if (global_update || cfs_rq->load_period > period
747 || !cfs_rq->load_period)
748 update_cfs_rq_load_contribution(cfs_rq, global_update);
749
750 while (cfs_rq->load_period > period) {
751 /*
752 * Inline assembly required to prevent the compiler
753 * optimising this loop into a divmod call.
754 * See __iter_div_u64_rem() for another example of this.
755 */
756 asm("" : "+rm" (cfs_rq->load_period));
757 cfs_rq->load_period /= 2;
758 cfs_rq->load_avg /= 2;
759 }
760
761 if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg)
762 list_del_leaf_cfs_rq(cfs_rq);
763}
764
765static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
766 unsigned long weight)
767{
768 if (se->on_rq) {
769 /* commit outstanding execution time */
770 if (cfs_rq->curr == se)
771 update_curr(cfs_rq);
772 account_entity_dequeue(cfs_rq, se);
773 }
774
775 update_load_set(&se->load, weight);
776
777 if (se->on_rq)
778 account_entity_enqueue(cfs_rq, se);
779}
780
781static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
782{
783 struct task_group *tg;
784 struct sched_entity *se;
785 long load_weight, load, shares;
786
787 if (!cfs_rq)
788 return;
789
790 tg = cfs_rq->tg;
791 se = tg->se[cpu_of(rq_of(cfs_rq))];
792 if (!se)
793 return;
794
795 load = cfs_rq->load.weight + weight_delta;
796
797 load_weight = atomic_read(&tg->load_weight);
798 load_weight -= cfs_rq->load_contribution;
799 load_weight += load;
800
801 shares = (tg->shares * load);
802 if (load_weight)
803 shares /= load_weight;
804
805 if (shares < MIN_SHARES)
806 shares = MIN_SHARES;
807 if (shares > tg->shares)
808 shares = tg->shares;
809
810 reweight_entity(cfs_rq_of(se), se, shares);
811}
812
813static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
814{
815 if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
816 update_cfs_load(cfs_rq, 0);
817 update_cfs_shares(cfs_rq, 0);
818 }
819}
820#else /* CONFIG_FAIR_GROUP_SCHED */
821static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
822{
823}
824
825static inline void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
826{
827}
828
829static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
830{
831}
832#endif /* CONFIG_FAIR_GROUP_SCHED */
833
653static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) 834static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
654{ 835{
655#ifdef CONFIG_SCHEDSTATS 836#ifdef CONFIG_SCHEDSTATS
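
At its core, update_cfs_shares() above performs a proportional split: this cpu's group entity receives tg->shares scaled by the cpu's contribution to the group-wide load, clamped to the range [MIN_SHARES, tg->shares]. A user-space sketch of that arithmetic with invented numbers (MIN_SHARES is a local stand-in here):

#include <stdio.h>

#define MIN_SHARES 2

static long calc_shares(long tg_shares, long tg_load_weight,
			long cfs_load, long cfs_contrib)
{
	/* replace this cpu's stale contribution with its current load */
	long load_weight = tg_load_weight - cfs_contrib + cfs_load;
	long shares = tg_shares * cfs_load;

	if (load_weight)
		shares /= load_weight;

	if (shares < MIN_SHARES)
		shares = MIN_SHARES;
	if (shares > tg_shares)
		shares = tg_shares;
	return shares;
}

int main(void)
{
	/* group owns 1024 shares; this cpu carries 512 of 2048 tracked load */
	printf("per-cpu shares = %ld\n", calc_shares(1024, 2048, 512, 512));
	return 0;
}
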
@@ -771,6 +952,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
771 * Update run-time statistics of the 'current'. 952 * Update run-time statistics of the 'current'.
772 */ 953 */
773 update_curr(cfs_rq); 954 update_curr(cfs_rq);
955 update_cfs_load(cfs_rq, 0);
956 update_cfs_shares(cfs_rq, se->load.weight);
774 account_entity_enqueue(cfs_rq, se); 957 account_entity_enqueue(cfs_rq, se);
775 958
776 if (flags & ENQUEUE_WAKEUP) { 959 if (flags & ENQUEUE_WAKEUP) {
@@ -782,6 +965,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
782 check_spread(cfs_rq, se); 965 check_spread(cfs_rq, se);
783 if (se != cfs_rq->curr) 966 if (se != cfs_rq->curr)
784 __enqueue_entity(cfs_rq, se); 967 __enqueue_entity(cfs_rq, se);
968 se->on_rq = 1;
969
970 if (cfs_rq->nr_running == 1)
971 list_add_leaf_cfs_rq(cfs_rq);
785} 972}
786 973
787static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) 974static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -825,8 +1012,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
825 1012
826 if (se != cfs_rq->curr) 1013 if (se != cfs_rq->curr)
827 __dequeue_entity(cfs_rq, se); 1014 __dequeue_entity(cfs_rq, se);
1015 se->on_rq = 0;
1016 update_cfs_load(cfs_rq, 0);
828 account_entity_dequeue(cfs_rq, se); 1017 account_entity_dequeue(cfs_rq, se);
829 update_min_vruntime(cfs_rq); 1018 update_min_vruntime(cfs_rq);
1019 update_cfs_shares(cfs_rq, 0);
830 1020
831 /* 1021 /*
832 * Normalize the entity after updating the min_vruntime because the 1022 * Normalize the entity after updating the min_vruntime because the
@@ -955,6 +1145,11 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
955 */ 1145 */
956 update_curr(cfs_rq); 1146 update_curr(cfs_rq);
957 1147
1148 /*
1149 * Update share accounting for long-running entities.
1150 */
1151 update_entity_shares_tick(cfs_rq);
1152
958#ifdef CONFIG_SCHED_HRTICK 1153#ifdef CONFIG_SCHED_HRTICK
959 /* 1154 /*
960 * queued ticks are scheduled to match the slice, so don't bother 1155 * queued ticks are scheduled to match the slice, so don't bother
@@ -1055,6 +1250,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1055 flags = ENQUEUE_WAKEUP; 1250 flags = ENQUEUE_WAKEUP;
1056 } 1251 }
1057 1252
1253 for_each_sched_entity(se) {
1254 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1255
1256 update_cfs_load(cfs_rq, 0);
1257 update_cfs_shares(cfs_rq, 0);
1258 }
1259
1058 hrtick_update(rq); 1260 hrtick_update(rq);
1059} 1261}
1060 1262
@@ -1071,12 +1273,20 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1071 for_each_sched_entity(se) { 1273 for_each_sched_entity(se) {
1072 cfs_rq = cfs_rq_of(se); 1274 cfs_rq = cfs_rq_of(se);
1073 dequeue_entity(cfs_rq, se, flags); 1275 dequeue_entity(cfs_rq, se, flags);
1276
1074 /* Don't dequeue parent if it has other entities besides us */ 1277 /* Don't dequeue parent if it has other entities besides us */
1075 if (cfs_rq->load.weight) 1278 if (cfs_rq->load.weight)
1076 break; 1279 break;
1077 flags |= DEQUEUE_SLEEP; 1280 flags |= DEQUEUE_SLEEP;
1078 } 1281 }
1079 1282
1283 for_each_sched_entity(se) {
1284 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1285
1286 update_cfs_load(cfs_rq, 0);
1287 update_cfs_shares(cfs_rq, 0);
1288 }
1289
1080 hrtick_update(rq); 1290 hrtick_update(rq);
1081} 1291}
1082 1292
@@ -1143,51 +1353,20 @@ static void task_waking_fair(struct rq *rq, struct task_struct *p)
1143 * Adding load to a group doesn't make a group heavier, but can cause movement 1353 * Adding load to a group doesn't make a group heavier, but can cause movement
1144 * of group shares between cpus. Assuming the shares were perfectly aligned one 1354 * of group shares between cpus. Assuming the shares were perfectly aligned one
1145 * can calculate the shift in shares. 1355 * can calculate the shift in shares.
1146 *
1147 * The problem is that perfectly aligning the shares is rather expensive, hence
1148 * we try to avoid doing that too often - see update_shares(), which ratelimits
1149 * this change.
1150 *
1151 * We compensate this by not only taking the current delta into account, but
1152 * also considering the delta between when the shares were last adjusted and
1153 * now.
1154 *
1155 * We still saw a performance dip, some tracing learned us that between
1156 * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased
1157 * significantly. Therefore try to bias the error in direction of failing
1158 * the affine wakeup.
1159 *
1160 */ 1356 */
1161static long effective_load(struct task_group *tg, int cpu, 1357static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
1162 long wl, long wg)
1163{ 1358{
1164 struct sched_entity *se = tg->se[cpu]; 1359 struct sched_entity *se = tg->se[cpu];
1165 1360
1166 if (!tg->parent) 1361 if (!tg->parent)
1167 return wl; 1362 return wl;
1168 1363
1169 /*
1170 * By not taking the decrease of shares on the other cpu into
1171 * account our error leans towards reducing the affine wakeups.
1172 */
1173 if (!wl && sched_feat(ASYM_EFF_LOAD))
1174 return wl;
1175
1176 for_each_sched_entity(se) { 1364 for_each_sched_entity(se) {
1177 long S, rw, s, a, b; 1365 long S, rw, s, a, b;
1178 long more_w;
1179
1180 /*
1181 * Instead of using this increment, also add the difference
1182 * between when the shares were last updated and now.
1183 */
1184 more_w = se->my_q->load.weight - se->my_q->rq_weight;
1185 wl += more_w;
1186 wg += more_w;
1187 1366
1188 S = se->my_q->tg->shares; 1367 S = se->my_q->tg->shares;
1189 s = se->my_q->shares; 1368 s = se->load.weight;
1190 rw = se->my_q->rq_weight; 1369 rw = se->my_q->load.weight;
1191 1370
1192 a = S*(rw + wl); 1371 a = S*(rw + wl);
1193 b = S*rw + s*wg; 1372 b = S*rw + s*wg;
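
The comment above can be made concrete: assuming perfectly aligned shares, a cpu carrying rw of the group's total weight RW holds S*rw/RW of its S shares, and adding wl locally plus wg group-wide shifts that to S*(rw+wl)/(RW+wg). The sketch below only evaluates that before/after proportion with arbitrary numbers; effective_load() folds the same idea into the a and b terms rather than computing it this way:

#include <stdio.h>

int main(void)
{
	long S = 1024;			/* group shares */
	long RW = 3072, rw = 1024;	/* group-wide / this-cpu weight */
	long wl = 1024, wg = 1024;	/* weight added here / group-wide */

	long before = S * rw / RW;
	long after  = S * (rw + wl) / (RW + wg);

	printf("share before=%ld after=%ld shift=%ld\n",
	       before, after, after - before);
	return 0;
}
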
@@ -1508,23 +1687,6 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
1508 sd = tmp; 1687 sd = tmp;
1509 } 1688 }
1510 1689
1511#ifdef CONFIG_FAIR_GROUP_SCHED
1512 if (sched_feat(LB_SHARES_UPDATE)) {
1513 /*
1514 * Pick the largest domain to update shares over
1515 */
1516 tmp = sd;
1517 if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight))
1518 tmp = affine_sd;
1519
1520 if (tmp) {
1521 raw_spin_unlock(&rq->lock);
1522 update_shares(tmp);
1523 raw_spin_lock(&rq->lock);
1524 }
1525 }
1526#endif
1527
1528 if (affine_sd) { 1690 if (affine_sd) {
1529 if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) 1691 if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
1530 return select_idle_sibling(p, cpu); 1692 return select_idle_sibling(p, cpu);
@@ -1909,6 +2071,48 @@ out:
1909} 2071}
1910 2072
1911#ifdef CONFIG_FAIR_GROUP_SCHED 2073#ifdef CONFIG_FAIR_GROUP_SCHED
2074/*
2075 * update tg->load_weight by folding this cpu's load_avg
2076 */
2077static int update_shares_cpu(struct task_group *tg, int cpu)
2078{
2079 struct cfs_rq *cfs_rq;
2080 unsigned long flags;
2081 struct rq *rq;
2082
2083 if (!tg->se[cpu])
2084 return 0;
2085
2086 rq = cpu_rq(cpu);
2087 cfs_rq = tg->cfs_rq[cpu];
2088
2089 raw_spin_lock_irqsave(&rq->lock, flags);
2090
2091 update_rq_clock(rq);
2092 update_cfs_load(cfs_rq, 1);
2093
2094 /*
2095 * We need to update shares after updating tg->load_weight in
2096 * order to adjust the weight of groups with long running tasks.
2097 */
2098 update_cfs_shares(cfs_rq, 0);
2099
2100 raw_spin_unlock_irqrestore(&rq->lock, flags);
2101
2102 return 0;
2103}
2104
2105static void update_shares(int cpu)
2106{
2107 struct cfs_rq *cfs_rq;
2108 struct rq *rq = cpu_rq(cpu);
2109
2110 rcu_read_lock();
2111 for_each_leaf_cfs_rq(rq, cfs_rq)
2112 update_shares_cpu(cfs_rq->tg, cpu);
2113 rcu_read_unlock();
2114}
2115
1912static unsigned long 2116static unsigned long
1913load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 2117load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1914 unsigned long max_load_move, 2118 unsigned long max_load_move,
@@ -1956,6 +2160,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1956 return max_load_move - rem_load_move; 2160 return max_load_move - rem_load_move;
1957} 2161}
1958#else 2162#else
2163static inline void update_shares(int cpu)
2164{
2165}
2166
1959static unsigned long 2167static unsigned long
1960load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 2168load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1961 unsigned long max_load_move, 2169 unsigned long max_load_move,
@@ -3032,7 +3240,6 @@ static int load_balance(int this_cpu, struct rq *this_rq,
3032 schedstat_inc(sd, lb_count[idle]); 3240 schedstat_inc(sd, lb_count[idle]);
3033 3241
3034redo: 3242redo:
3035 update_shares(sd);
3036 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, 3243 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
3037 cpus, balance); 3244 cpus, balance);
3038 3245
@@ -3174,8 +3381,6 @@ out_one_pinned:
3174 else 3381 else
3175 ld_moved = 0; 3382 ld_moved = 0;
3176out: 3383out:
3177 if (ld_moved)
3178 update_shares(sd);
3179 return ld_moved; 3384 return ld_moved;
3180} 3385}
3181 3386
@@ -3199,6 +3404,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3199 */ 3404 */
3200 raw_spin_unlock(&this_rq->lock); 3405 raw_spin_unlock(&this_rq->lock);
3201 3406
3407 update_shares(this_cpu);
3202 for_each_domain(this_cpu, sd) { 3408 for_each_domain(this_cpu, sd) {
3203 unsigned long interval; 3409 unsigned long interval;
3204 int balance = 1; 3410 int balance = 1;
@@ -3569,6 +3775,8 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3569 int update_next_balance = 0; 3775 int update_next_balance = 0;
3570 int need_serialize; 3776 int need_serialize;
3571 3777
3778 update_shares(cpu);
3779
3572 for_each_domain(cpu, sd) { 3780 for_each_domain(cpu, sd) {
3573 if (!(sd->flags & SD_LOAD_BALANCE)) 3781 if (!(sd->flags & SD_LOAD_BALANCE))
3574 continue; 3782 continue;
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 185f920ec1a2..68e69acc29b9 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -52,8 +52,6 @@ SCHED_FEAT(ARCH_POWER, 0)
52SCHED_FEAT(HRTICK, 0) 52SCHED_FEAT(HRTICK, 0)
53SCHED_FEAT(DOUBLE_TICK, 0) 53SCHED_FEAT(DOUBLE_TICK, 0)
54SCHED_FEAT(LB_BIAS, 1) 54SCHED_FEAT(LB_BIAS, 1)
55SCHED_FEAT(LB_SHARES_UPDATE, 1)
56SCHED_FEAT(ASYM_EFF_LOAD, 1)
57 55
58/* 56/*
59 * Spin-wait on mutex acquisition when the mutex owner is running on 57 * Spin-wait on mutex acquisition when the mutex owner is running on
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index bea7d79f7e9c..c914ec747ca6 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -183,6 +183,17 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
183 return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period); 183 return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
184} 184}
185 185
186static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
187{
188 list_add_rcu(&rt_rq->leaf_rt_rq_list,
189 &rq_of_rt_rq(rt_rq)->leaf_rt_rq_list);
190}
191
192static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
193{
194 list_del_rcu(&rt_rq->leaf_rt_rq_list);
195}
196
186#define for_each_leaf_rt_rq(rt_rq, rq) \ 197#define for_each_leaf_rt_rq(rt_rq, rq) \
187 list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) 198 list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
188 199
@@ -276,6 +287,14 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
276 return ktime_to_ns(def_rt_bandwidth.rt_period); 287 return ktime_to_ns(def_rt_bandwidth.rt_period);
277} 288}
278 289
290static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
291{
292}
293
294static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
295{
296}
297
279#define for_each_leaf_rt_rq(rt_rq, rq) \ 298#define for_each_leaf_rt_rq(rt_rq, rq) \
280 for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL) 299 for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
281 300
@@ -825,6 +844,9 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
825 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) 844 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
826 return; 845 return;
827 846
847 if (!rt_rq->rt_nr_running)
848 list_add_leaf_rt_rq(rt_rq);
849
828 if (head) 850 if (head)
829 list_add(&rt_se->run_list, queue); 851 list_add(&rt_se->run_list, queue);
830 else 852 else
@@ -844,6 +866,8 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
844 __clear_bit(rt_se_prio(rt_se), array->bitmap); 866 __clear_bit(rt_se_prio(rt_se), array->bitmap);
845 867
846 dec_rt_tasks(rt_se, rt_rq); 868 dec_rt_tasks(rt_se, rt_rq);
869 if (!rt_rq->rt_nr_running)
870 list_del_leaf_rt_rq(rt_rq);
847} 871}
848 872
849/* 873/*
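
The rt changes mirror what the fair class now does with its leaf cfs_rq list: a group queue is linked into the runqueue's leaf list when its first task arrives and unlinked when it goes empty, so RCU walkers only visit groups that actually have runnable tasks. A kernel-context sketch of the pattern; the my_rq/my_group types are invented and this is not buildable on its own:

#include <linux/rculist.h>
#include <linux/rcupdate.h>

struct my_group {
	struct list_head	leaf_list;
	unsigned int		nr_running;
};

struct my_rq {
	struct list_head	leaf_groups;
};

static void my_enqueue(struct my_rq *rq, struct my_group *grp)
{
	if (!grp->nr_running)		/* first runnable task in the group */
		list_add_rcu(&grp->leaf_list, &rq->leaf_groups);
	grp->nr_running++;
}

static void my_dequeue(struct my_rq *rq, struct my_group *grp)
{
	grp->nr_running--;
	if (!grp->nr_running)		/* group just went empty */
		list_del_rcu(&grp->leaf_list);
}

static unsigned int my_count_busy_groups(struct my_rq *rq)
{
	struct my_group *grp;
	unsigned int n = 0;

	rcu_read_lock();
	list_for_each_entry_rcu(grp, &rq->leaf_groups, leaf_list)
		n++;
	rcu_read_unlock();
	return n;
}
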
diff --git a/kernel/smp.c b/kernel/smp.c
index 12ed8b013e2d..4ec30e069987 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -13,6 +13,7 @@
13#include <linux/smp.h> 13#include <linux/smp.h>
14#include <linux/cpu.h> 14#include <linux/cpu.h>
15 15
16#ifdef CONFIG_USE_GENERIC_SMP_HELPERS
16static struct { 17static struct {
17 struct list_head queue; 18 struct list_head queue;
18 raw_spinlock_t lock; 19 raw_spinlock_t lock;
@@ -529,3 +530,21 @@ void ipi_call_unlock_irq(void)
529{ 530{
530 raw_spin_unlock_irq(&call_function.lock); 531 raw_spin_unlock_irq(&call_function.lock);
531} 532}
533#endif /* CONFIG_USE_GENERIC_SMP_HELPERS */
534
535/*
536 * Call a function on all processors
537 */
538int on_each_cpu(void (*func) (void *info), void *info, int wait)
539{
540 int ret = 0;
541
542 preempt_disable();
543 ret = smp_call_function(func, info, wait);
544 local_irq_disable();
545 func(info);
546 local_irq_enable();
547 preempt_enable();
548 return ret;
549}
550EXPORT_SYMBOL(on_each_cpu);
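
With this move, on_each_cpu() is built for every CONFIG_SMP kernel rather than only when the generic SMP helpers are in use. A kernel-module sketch of typical usage; the counter, callback and module are invented for illustration. The callback runs in IPI context on remote cpus and with interrupts disabled locally, so it must be short and must not sleep:

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/smp.h>

static DEFINE_PER_CPU(unsigned long, poke_count);

static void poke(void *info)
{
	__this_cpu_inc(poke_count);
	pr_info("poked cpu %d\n", smp_processor_id());
}

static int __init poke_init(void)
{
	/* wait=1: return only after every cpu has run poke() */
	return on_each_cpu(poke, NULL, 1);
}

static void __exit poke_exit(void)
{
}

module_init(poke_init);
module_exit(poke_exit);
MODULE_LICENSE("GPL");
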
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 18f4be0d5fe0..68eb5efec388 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -70,7 +70,7 @@ char *softirq_to_name[NR_SOFTIRQS] = {
70static void wakeup_softirqd(void) 70static void wakeup_softirqd(void)
71{ 71{
72 /* Interrupts are disabled: no need to stop preemption */ 72 /* Interrupts are disabled: no need to stop preemption */
73 struct task_struct *tsk = __get_cpu_var(ksoftirqd); 73 struct task_struct *tsk = __this_cpu_read(ksoftirqd);
74 74
75 if (tsk && tsk->state != TASK_RUNNING) 75 if (tsk && tsk->state != TASK_RUNNING)
76 wake_up_process(tsk); 76 wake_up_process(tsk);
@@ -388,8 +388,8 @@ void __tasklet_schedule(struct tasklet_struct *t)
388 388
389 local_irq_save(flags); 389 local_irq_save(flags);
390 t->next = NULL; 390 t->next = NULL;
391 *__get_cpu_var(tasklet_vec).tail = t; 391 *__this_cpu_read(tasklet_vec.tail) = t;
392 __get_cpu_var(tasklet_vec).tail = &(t->next); 392 __this_cpu_write(tasklet_vec.tail, &(t->next));
393 raise_softirq_irqoff(TASKLET_SOFTIRQ); 393 raise_softirq_irqoff(TASKLET_SOFTIRQ);
394 local_irq_restore(flags); 394 local_irq_restore(flags);
395} 395}
@@ -402,8 +402,8 @@ void __tasklet_hi_schedule(struct tasklet_struct *t)
402 402
403 local_irq_save(flags); 403 local_irq_save(flags);
404 t->next = NULL; 404 t->next = NULL;
405 *__get_cpu_var(tasklet_hi_vec).tail = t; 405 *__this_cpu_read(tasklet_hi_vec.tail) = t;
406 __get_cpu_var(tasklet_hi_vec).tail = &(t->next); 406 __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
407 raise_softirq_irqoff(HI_SOFTIRQ); 407 raise_softirq_irqoff(HI_SOFTIRQ);
408 local_irq_restore(flags); 408 local_irq_restore(flags);
409} 409}
@@ -414,8 +414,8 @@ void __tasklet_hi_schedule_first(struct tasklet_struct *t)
414{ 414{
415 BUG_ON(!irqs_disabled()); 415 BUG_ON(!irqs_disabled());
416 416
417 t->next = __get_cpu_var(tasklet_hi_vec).head; 417 t->next = __this_cpu_read(tasklet_hi_vec.head);
418 __get_cpu_var(tasklet_hi_vec).head = t; 418 __this_cpu_write(tasklet_hi_vec.head, t);
419 __raise_softirq_irqoff(HI_SOFTIRQ); 419 __raise_softirq_irqoff(HI_SOFTIRQ);
420} 420}
421 421
@@ -426,9 +426,9 @@ static void tasklet_action(struct softirq_action *a)
426 struct tasklet_struct *list; 426 struct tasklet_struct *list;
427 427
428 local_irq_disable(); 428 local_irq_disable();
429 list = __get_cpu_var(tasklet_vec).head; 429 list = __this_cpu_read(tasklet_vec.head);
430 __get_cpu_var(tasklet_vec).head = NULL; 430 __this_cpu_write(tasklet_vec.head, NULL);
431 __get_cpu_var(tasklet_vec).tail = &__get_cpu_var(tasklet_vec).head; 431 __this_cpu_write(tasklet_vec.tail, &__get_cpu_var(tasklet_vec).head);
432 local_irq_enable(); 432 local_irq_enable();
433 433
434 while (list) { 434 while (list) {
@@ -449,8 +449,8 @@ static void tasklet_action(struct softirq_action *a)
449 449
450 local_irq_disable(); 450 local_irq_disable();
451 t->next = NULL; 451 t->next = NULL;
452 *__get_cpu_var(tasklet_vec).tail = t; 452 *__this_cpu_read(tasklet_vec.tail) = t;
453 __get_cpu_var(tasklet_vec).tail = &(t->next); 453 __this_cpu_write(tasklet_vec.tail, &(t->next));
454 __raise_softirq_irqoff(TASKLET_SOFTIRQ); 454 __raise_softirq_irqoff(TASKLET_SOFTIRQ);
455 local_irq_enable(); 455 local_irq_enable();
456 } 456 }
@@ -461,9 +461,9 @@ static void tasklet_hi_action(struct softirq_action *a)
461 struct tasklet_struct *list; 461 struct tasklet_struct *list;
462 462
463 local_irq_disable(); 463 local_irq_disable();
464 list = __get_cpu_var(tasklet_hi_vec).head; 464 list = __this_cpu_read(tasklet_hi_vec.head);
465 __get_cpu_var(tasklet_hi_vec).head = NULL; 465 __this_cpu_write(tasklet_hi_vec.head, NULL);
466 __get_cpu_var(tasklet_hi_vec).tail = &__get_cpu_var(tasklet_hi_vec).head; 466 __this_cpu_write(tasklet_hi_vec.tail, &__get_cpu_var(tasklet_hi_vec).head);
467 local_irq_enable(); 467 local_irq_enable();
468 468
469 while (list) { 469 while (list) {
@@ -484,8 +484,8 @@ static void tasklet_hi_action(struct softirq_action *a)
484 484
485 local_irq_disable(); 485 local_irq_disable();
486 t->next = NULL; 486 t->next = NULL;
487 *__get_cpu_var(tasklet_hi_vec).tail = t; 487 *__this_cpu_read(tasklet_hi_vec.tail) = t;
488 __get_cpu_var(tasklet_hi_vec).tail = &(t->next); 488 __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
489 __raise_softirq_irqoff(HI_SOFTIRQ); 489 __raise_softirq_irqoff(HI_SOFTIRQ);
490 local_irq_enable(); 490 local_irq_enable();
491 } 491 }
@@ -802,16 +802,16 @@ static void takeover_tasklets(unsigned int cpu)
802 802
803 /* Find end, append list for that CPU. */ 803 /* Find end, append list for that CPU. */
804 if (&per_cpu(tasklet_vec, cpu).head != per_cpu(tasklet_vec, cpu).tail) { 804 if (&per_cpu(tasklet_vec, cpu).head != per_cpu(tasklet_vec, cpu).tail) {
805 *(__get_cpu_var(tasklet_vec).tail) = per_cpu(tasklet_vec, cpu).head; 805 *__this_cpu_read(tasklet_vec.tail) = per_cpu(tasklet_vec, cpu).head;
806 __get_cpu_var(tasklet_vec).tail = per_cpu(tasklet_vec, cpu).tail; 806 this_cpu_write(tasklet_vec.tail, per_cpu(tasklet_vec, cpu).tail);
807 per_cpu(tasklet_vec, cpu).head = NULL; 807 per_cpu(tasklet_vec, cpu).head = NULL;
808 per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head; 808 per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head;
809 } 809 }
810 raise_softirq_irqoff(TASKLET_SOFTIRQ); 810 raise_softirq_irqoff(TASKLET_SOFTIRQ);
811 811
812 if (&per_cpu(tasklet_hi_vec, cpu).head != per_cpu(tasklet_hi_vec, cpu).tail) { 812 if (&per_cpu(tasklet_hi_vec, cpu).head != per_cpu(tasklet_hi_vec, cpu).tail) {
813 *__get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).head; 813 *__this_cpu_read(tasklet_hi_vec.tail) = per_cpu(tasklet_hi_vec, cpu).head;
814 __get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).tail; 814 __this_cpu_write(tasklet_hi_vec.tail, per_cpu(tasklet_hi_vec, cpu).tail);
815 per_cpu(tasklet_hi_vec, cpu).head = NULL; 815 per_cpu(tasklet_hi_vec, cpu).head = NULL;
816 per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head; 816 per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head;
817 } 817 }
@@ -853,7 +853,9 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
853 cpumask_any(cpu_online_mask)); 853 cpumask_any(cpu_online_mask));
854 case CPU_DEAD: 854 case CPU_DEAD:
855 case CPU_DEAD_FROZEN: { 855 case CPU_DEAD_FROZEN: {
856 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 856 static const struct sched_param param = {
857 .sched_priority = MAX_RT_PRIO-1
858 };
857 859
858 p = per_cpu(ksoftirqd, hotcpu); 860 p = per_cpu(ksoftirqd, hotcpu);
859 per_cpu(ksoftirqd, hotcpu) = NULL; 861 per_cpu(ksoftirqd, hotcpu) = NULL;
@@ -883,25 +885,6 @@ static __init int spawn_ksoftirqd(void)
883} 885}
884early_initcall(spawn_ksoftirqd); 886early_initcall(spawn_ksoftirqd);
885 887
886#ifdef CONFIG_SMP
887/*
888 * Call a function on all processors
889 */
890int on_each_cpu(void (*func) (void *info), void *info, int wait)
891{
892 int ret = 0;
893
894 preempt_disable();
895 ret = smp_call_function(func, info, wait);
896 local_irq_disable();
897 func(info);
898 local_irq_enable();
899 preempt_enable();
900 return ret;
901}
902EXPORT_SYMBOL(on_each_cpu);
903#endif
904
905/* 888/*
906 * [ These __weak aliases are kept in a separate compilation unit, so that 889 * [ These __weak aliases are kept in a separate compilation unit, so that
907 * GCC does not inline them incorrectly. ] 890 * GCC does not inline them incorrectly. ]
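
The softirq hunks above swap the address-based __get_cpu_var() accessors for this_cpu operations, which read or write a field of the current cpu's instance directly (a single segment-relative instruction on x86) instead of first materialising a per-cpu pointer. A kernel-context sketch of the two styles side by side; my_vec and my_item are invented, and the caller is assumed to run with interrupts disabled, as tasklet_action() does:

#include <linux/percpu.h>

struct my_item;

struct my_vec {
	struct my_item	*head;
	struct my_item	**tail;
};

static DEFINE_PER_CPU(struct my_vec, my_vec);

static struct my_item *my_grab_list(void)
{
	struct my_item *list;

	/* old style: compute this cpu's address, then dereference it */
	list = __get_cpu_var(my_vec).head;

	/* new style: access the current cpu's field in one operation */
	list = __this_cpu_read(my_vec.head);
	__this_cpu_write(my_vec.head, NULL);
	__this_cpu_write(my_vec.tail, &__get_cpu_var(my_vec).head);

	return list;
}
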
diff --git a/kernel/srcu.c b/kernel/srcu.c
index c71e07500536..98d8c1e80edb 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -31,6 +31,7 @@
31#include <linux/rcupdate.h> 31#include <linux/rcupdate.h>
32#include <linux/sched.h> 32#include <linux/sched.h>
33#include <linux/smp.h> 33#include <linux/smp.h>
34#include <linux/delay.h>
34#include <linux/srcu.h> 35#include <linux/srcu.h>
35 36
36static int init_srcu_struct_fields(struct srcu_struct *sp) 37static int init_srcu_struct_fields(struct srcu_struct *sp)
@@ -203,9 +204,14 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
203 * all srcu_read_lock() calls using the old counters have completed. 204 * all srcu_read_lock() calls using the old counters have completed.
204 * Their corresponding critical sections might well be still 205 * Their corresponding critical sections might well be still
205 * executing, but the srcu_read_lock() primitives themselves 206 * executing, but the srcu_read_lock() primitives themselves
206 * will have finished executing. 207 * will have finished executing. We initially give readers
208 * an arbitrarily chosen 10 microseconds to get out of their
209 * SRCU read-side critical sections, then loop waiting 1/HZ
210 * seconds per iteration.
207 */ 211 */
208 212
213 if (srcu_readers_active_idx(sp, idx))
214 udelay(CONFIG_SRCU_SYNCHRONIZE_DELAY);
209 while (srcu_readers_active_idx(sp, idx)) 215 while (srcu_readers_active_idx(sp, idx))
210 schedule_timeout_interruptible(1); 216 schedule_timeout_interruptible(1);
211 217
diff --git a/kernel/sys.c b/kernel/sys.c
index 7f5a0cd296a9..31b71a276b40 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -43,6 +43,8 @@
43#include <linux/kprobes.h> 43#include <linux/kprobes.h>
44#include <linux/user_namespace.h> 44#include <linux/user_namespace.h>
45 45
46#include <linux/kmsg_dump.h>
47
46#include <asm/uaccess.h> 48#include <asm/uaccess.h>
47#include <asm/io.h> 49#include <asm/io.h>
48#include <asm/unistd.h> 50#include <asm/unistd.h>
@@ -285,6 +287,7 @@ out_unlock:
285 */ 287 */
286void emergency_restart(void) 288void emergency_restart(void)
287{ 289{
290 kmsg_dump(KMSG_DUMP_EMERG);
288 machine_emergency_restart(); 291 machine_emergency_restart();
289} 292}
290EXPORT_SYMBOL_GPL(emergency_restart); 293EXPORT_SYMBOL_GPL(emergency_restart);
@@ -312,6 +315,7 @@ void kernel_restart(char *cmd)
312 printk(KERN_EMERG "Restarting system.\n"); 315 printk(KERN_EMERG "Restarting system.\n");
313 else 316 else
314 printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd); 317 printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd);
318 kmsg_dump(KMSG_DUMP_RESTART);
315 machine_restart(cmd); 319 machine_restart(cmd);
316} 320}
317EXPORT_SYMBOL_GPL(kernel_restart); 321EXPORT_SYMBOL_GPL(kernel_restart);
@@ -333,6 +337,7 @@ void kernel_halt(void)
333 kernel_shutdown_prepare(SYSTEM_HALT); 337 kernel_shutdown_prepare(SYSTEM_HALT);
334 sysdev_shutdown(); 338 sysdev_shutdown();
335 printk(KERN_EMERG "System halted.\n"); 339 printk(KERN_EMERG "System halted.\n");
340 kmsg_dump(KMSG_DUMP_HALT);
336 machine_halt(); 341 machine_halt();
337} 342}
338 343
@@ -351,6 +356,7 @@ void kernel_power_off(void)
351 disable_nonboot_cpus(); 356 disable_nonboot_cpus();
352 sysdev_shutdown(); 357 sysdev_shutdown();
353 printk(KERN_EMERG "Power down.\n"); 358 printk(KERN_EMERG "Power down.\n");
359 kmsg_dump(KMSG_DUMP_POWEROFF);
354 machine_power_off(); 360 machine_power_off();
355} 361}
356EXPORT_SYMBOL_GPL(kernel_power_off); 362EXPORT_SYMBOL_GPL(kernel_power_off);
@@ -1080,8 +1086,10 @@ SYSCALL_DEFINE0(setsid)
1080 err = session; 1086 err = session;
1081out: 1087out:
1082 write_unlock_irq(&tasklist_lock); 1088 write_unlock_irq(&tasklist_lock);
1083 if (err > 0) 1089 if (err > 0) {
1084 proc_sid_connector(group_leader); 1090 proc_sid_connector(group_leader);
1091 sched_autogroup_create_attach(group_leader);
1092 }
1085 return err; 1093 return err;
1086} 1094}
1087 1095
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 5abfa1518554..bc86bb32e126 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -24,6 +24,7 @@
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/sysctl.h> 25#include <linux/sysctl.h>
26#include <linux/signal.h> 26#include <linux/signal.h>
27#include <linux/printk.h>
27#include <linux/proc_fs.h> 28#include <linux/proc_fs.h>
28#include <linux/security.h> 29#include <linux/security.h>
29#include <linux/ctype.h> 30#include <linux/ctype.h>
@@ -245,10 +246,6 @@ static struct ctl_table root_table[] = {
245 .mode = 0555, 246 .mode = 0555,
246 .child = dev_table, 247 .child = dev_table,
247 }, 248 },
248/*
249 * NOTE: do not add new entries to this table unless you have read
250 * Documentation/sysctl/ctl_unnumbered.txt
251 */
252 { } 249 { }
253}; 250};
254 251
@@ -259,8 +256,6 @@ static int min_wakeup_granularity_ns; /* 0 usecs */
259static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ 256static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
260static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; 257static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
261static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; 258static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
262static int min_sched_shares_ratelimit = 100000; /* 100 usec */
263static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */
264#endif 259#endif
265 260
266#ifdef CONFIG_COMPACTION 261#ifdef CONFIG_COMPACTION
@@ -305,15 +300,6 @@ static struct ctl_table kern_table[] = {
305 .extra2 = &max_wakeup_granularity_ns, 300 .extra2 = &max_wakeup_granularity_ns,
306 }, 301 },
307 { 302 {
308 .procname = "sched_shares_ratelimit",
309 .data = &sysctl_sched_shares_ratelimit,
310 .maxlen = sizeof(unsigned int),
311 .mode = 0644,
312 .proc_handler = sched_proc_update_handler,
313 .extra1 = &min_sched_shares_ratelimit,
314 .extra2 = &max_sched_shares_ratelimit,
315 },
316 {
317 .procname = "sched_tunable_scaling", 303 .procname = "sched_tunable_scaling",
318 .data = &sysctl_sched_tunable_scaling, 304 .data = &sysctl_sched_tunable_scaling,
319 .maxlen = sizeof(enum sched_tunable_scaling), 305 .maxlen = sizeof(enum sched_tunable_scaling),
@@ -323,14 +309,6 @@ static struct ctl_table kern_table[] = {
323 .extra2 = &max_sched_tunable_scaling, 309 .extra2 = &max_sched_tunable_scaling,
324 }, 310 },
325 { 311 {
326 .procname = "sched_shares_thresh",
327 .data = &sysctl_sched_shares_thresh,
328 .maxlen = sizeof(unsigned int),
329 .mode = 0644,
330 .proc_handler = proc_dointvec_minmax,
331 .extra1 = &zero,
332 },
333 {
334 .procname = "sched_migration_cost", 312 .procname = "sched_migration_cost",
335 .data = &sysctl_sched_migration_cost, 313 .data = &sysctl_sched_migration_cost,
336 .maxlen = sizeof(unsigned int), 314 .maxlen = sizeof(unsigned int),
@@ -352,6 +330,13 @@ static struct ctl_table kern_table[] = {
352 .proc_handler = proc_dointvec, 330 .proc_handler = proc_dointvec,
353 }, 331 },
354 { 332 {
333 .procname = "sched_shares_window",
334 .data = &sysctl_sched_shares_window,
335 .maxlen = sizeof(unsigned int),
336 .mode = 0644,
337 .proc_handler = proc_dointvec,
338 },
339 {
355 .procname = "timer_migration", 340 .procname = "timer_migration",
356 .data = &sysctl_timer_migration, 341 .data = &sysctl_timer_migration,
357 .maxlen = sizeof(unsigned int), 342 .maxlen = sizeof(unsigned int),
@@ -382,6 +367,17 @@ static struct ctl_table kern_table[] = {
382 .mode = 0644, 367 .mode = 0644,
383 .proc_handler = proc_dointvec, 368 .proc_handler = proc_dointvec,
384 }, 369 },
370#ifdef CONFIG_SCHED_AUTOGROUP
371 {
372 .procname = "sched_autogroup_enabled",
373 .data = &sysctl_sched_autogroup_enabled,
374 .maxlen = sizeof(unsigned int),
375 .mode = 0644,
376 .proc_handler = proc_dointvec,
377 .extra1 = &zero,
378 .extra2 = &one,
379 },
380#endif
385#ifdef CONFIG_PROVE_LOCKING 381#ifdef CONFIG_PROVE_LOCKING
386 { 382 {
387 .procname = "prove_locking", 383 .procname = "prove_locking",
@@ -711,6 +707,15 @@ static struct ctl_table kern_table[] = {
711 .extra1 = &zero, 707 .extra1 = &zero,
712 .extra2 = &one, 708 .extra2 = &one,
713 }, 709 },
710 {
711 .procname = "kptr_restrict",
712 .data = &kptr_restrict,
713 .maxlen = sizeof(int),
714 .mode = 0644,
715 .proc_handler = proc_dointvec_minmax,
716 .extra1 = &zero,
717 .extra2 = &two,
718 },
714#endif 719#endif
715 { 720 {
716 .procname = "ngroups_max", 721 .procname = "ngroups_max",
@@ -745,21 +750,21 @@ static struct ctl_table kern_table[] = {
745 .extra1 = &zero, 750 .extra1 = &zero,
746 .extra2 = &one, 751 .extra2 = &one,
747 }, 752 },
748#endif
749#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) && !defined(CONFIG_LOCKUP_DETECTOR)
750 { 753 {
751 .procname = "unknown_nmi_panic", 754 .procname = "nmi_watchdog",
752 .data = &unknown_nmi_panic, 755 .data = &watchdog_enabled,
753 .maxlen = sizeof (int), 756 .maxlen = sizeof (int),
754 .mode = 0644, 757 .mode = 0644,
755 .proc_handler = proc_dointvec, 758 .proc_handler = proc_dowatchdog_enabled,
756 }, 759 },
760#endif
761#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
757 { 762 {
758 .procname = "nmi_watchdog", 763 .procname = "unknown_nmi_panic",
759 .data = &nmi_watchdog_enabled, 764 .data = &unknown_nmi_panic,
760 .maxlen = sizeof (int), 765 .maxlen = sizeof (int),
761 .mode = 0644, 766 .mode = 0644,
762 .proc_handler = proc_nmi_enabled, 767 .proc_handler = proc_dointvec,
763 }, 768 },
764#endif 769#endif
765#if defined(CONFIG_X86) 770#if defined(CONFIG_X86)
@@ -963,10 +968,6 @@ static struct ctl_table kern_table[] = {
963 .proc_handler = proc_dointvec, 968 .proc_handler = proc_dointvec,
964 }, 969 },
965#endif 970#endif
966/*
967 * NOTE: do not add new entries to this table unless you have read
968 * Documentation/sysctl/ctl_unnumbered.txt
969 */
970 { } 971 { }
971}; 972};
972 973
@@ -1327,11 +1328,6 @@ static struct ctl_table vm_table[] = {
1327 .extra2 = &one, 1328 .extra2 = &one,
1328 }, 1329 },
1329#endif 1330#endif
1330
1331/*
1332 * NOTE: do not add new entries to this table unless you have read
1333 * Documentation/sysctl/ctl_unnumbered.txt
1334 */
1335 { } 1331 { }
1336}; 1332};
1337 1333
@@ -1487,10 +1483,6 @@ static struct ctl_table fs_table[] = {
1487 .proc_handler = &pipe_proc_fn, 1483 .proc_handler = &pipe_proc_fn,
1488 .extra1 = &pipe_min_size, 1484 .extra1 = &pipe_min_size,
1489 }, 1485 },
1490/*
1491 * NOTE: do not add new entries to this table unless you have read
1492 * Documentation/sysctl/ctl_unnumbered.txt
1493 */
1494 { } 1486 { }
1495}; 1487};
1496 1488
@@ -2900,7 +2892,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
2900 } 2892 }
2901} 2893}
2902 2894
2903#else /* CONFIG_PROC_FS */ 2895#else /* CONFIG_PROC_SYSCTL */
2904 2896
2905int proc_dostring(struct ctl_table *table, int write, 2897int proc_dostring(struct ctl_table *table, int write,
2906 void __user *buffer, size_t *lenp, loff_t *ppos) 2898 void __user *buffer, size_t *lenp, loff_t *ppos)
@@ -2952,7 +2944,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
2952} 2944}
2953 2945
2954 2946
2955#endif /* CONFIG_PROC_FS */ 2947#endif /* CONFIG_PROC_SYSCTL */
2956 2948
2957/* 2949/*
2958 * No sense putting this after each symbol definition, twice, 2950 * No sense putting this after each symbol definition, twice,
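
The sysctl entries added above (sched_shares_window, sched_autogroup_enabled, kptr_restrict) all follow the same ctl_table shape: a data pointer, a length, a mode and a proc handler, with optional extra1/extra2 bounds for proc_dointvec_minmax. A sketch of a module registering one such tunable of its own; the names, bounds and my_subsys directory are invented:

#include <linux/module.h>
#include <linux/sysctl.h>

static int my_tunable = 10;
static int my_min;		/* 0 */
static int my_max = 100;

static struct ctl_table my_vars[] = {
	{
		.procname	= "my_tunable",
		.data		= &my_tunable,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &my_min,
		.extra2		= &my_max,
	},
	{ }
};

static struct ctl_table my_dir[] = {
	{
		.procname	= "my_subsys",
		.mode		= 0555,
		.child		= my_vars,
	},
	{ }
};

static struct ctl_table_header *my_header;

static int __init my_sysctl_init(void)
{
	/* appears as /proc/sys/my_subsys/my_tunable */
	my_header = register_sysctl_table(my_dir);
	return my_header ? 0 : -ENOMEM;
}

static void __exit my_sysctl_exit(void)
{
	unregister_sysctl_table(my_header);
}

module_init(my_sysctl_init);
module_exit(my_sysctl_exit);
MODULE_LICENSE("GPL");
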
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 1357c5786064..b875bedf7c9a 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -136,7 +136,6 @@ static const struct bin_table bin_kern_table[] = {
136 { CTL_INT, KERN_IA64_UNALIGNED, "ignore-unaligned-usertrap" }, 136 { CTL_INT, KERN_IA64_UNALIGNED, "ignore-unaligned-usertrap" },
137 { CTL_INT, KERN_COMPAT_LOG, "compat-log" }, 137 { CTL_INT, KERN_COMPAT_LOG, "compat-log" },
138 { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" }, 138 { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" },
139 { CTL_INT, KERN_NMI_WATCHDOG, "nmi_watchdog" },
140 { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" }, 139 { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" },
141 {} 140 {}
142}; 141};
@@ -1193,7 +1192,7 @@ static ssize_t bin_dn_node_address(struct file *file,
1193 1192
1194 buf[result] = '\0'; 1193 buf[result] = '\0';
1195 1194
1196 /* Convert the decnet addresss to binary */ 1195 /* Convert the decnet address to binary */
1197 result = -EIO; 1196 result = -EIO;
1198 nodep = strchr(buf, '.') + 1; 1197 nodep = strchr(buf, '.') + 1;
1199 if (!nodep) 1198 if (!nodep)
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 3308fd7f1b52..3971c6b9d58d 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -89,8 +89,7 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
89 return -ENOMEM; 89 return -ENOMEM;
90 90
91 if (!info) { 91 if (!info) {
92 int seq = get_cpu_var(taskstats_seqnum)++; 92 int seq = this_cpu_inc_return(taskstats_seqnum) - 1;
93 put_cpu_var(taskstats_seqnum);
94 93
95 reply = genlmsg_put(skb, 0, seq, &family, 0, cmd); 94 reply = genlmsg_put(skb, 0, seq, &family, 0, cmd);
96 } else 95 } else
@@ -349,7 +348,7 @@ static int parse(struct nlattr *na, struct cpumask *mask)
349 return ret; 348 return ret;
350} 349}
351 350
352#ifdef CONFIG_IA64 351#if defined(CONFIG_64BIT) && !defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
353#define TASKSTATS_NEEDS_PADDING 1 352#define TASKSTATS_NEEDS_PADDING 1
354#endif 353#endif
355 354
@@ -612,7 +611,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
612 fill_tgid_exit(tsk); 611 fill_tgid_exit(tsk);
613 } 612 }
614 613
615 listeners = &__raw_get_cpu_var(listener_array); 614 listeners = __this_cpu_ptr(&listener_array);
616 if (list_empty(&listeners->list)) 615 if (list_empty(&listeners->list))
617 return; 616 return;
618 617
diff --git a/kernel/time.c b/kernel/time.c
index ba9b338d1835..32174359576f 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -238,7 +238,7 @@ EXPORT_SYMBOL(current_fs_time);
238 * Avoid unnecessary multiplications/divisions in the 238 * Avoid unnecessary multiplications/divisions in the
239 * two most common HZ cases: 239 * two most common HZ cases:
240 */ 240 */
241unsigned int inline jiffies_to_msecs(const unsigned long j) 241inline unsigned int jiffies_to_msecs(const unsigned long j)
242{ 242{
243#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) 243#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
244 return (MSEC_PER_SEC / HZ) * j; 244 return (MSEC_PER_SEC / HZ) * j;
@@ -254,7 +254,7 @@ unsigned int inline jiffies_to_msecs(const unsigned long j)
254} 254}
255EXPORT_SYMBOL(jiffies_to_msecs); 255EXPORT_SYMBOL(jiffies_to_msecs);
256 256
257unsigned int inline jiffies_to_usecs(const unsigned long j) 257inline unsigned int jiffies_to_usecs(const unsigned long j)
258{ 258{
259#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) 259#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
260 return (USEC_PER_SEC / HZ) * j; 260 return (USEC_PER_SEC / HZ) * j;
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index c18d7efa1b4b..c50a034de30f 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -152,6 +152,7 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec)
152 */ 152 */
153 for (sft = 32; sft > 0; sft--) { 153 for (sft = 32; sft > 0; sft--) {
154 tmp = (u64) to << sft; 154 tmp = (u64) to << sft;
155 tmp += from / 2;
155 do_div(tmp, from); 156 do_div(tmp, from);
156 if ((tmp >> sftacc) == 0) 157 if ((tmp >> sftacc) == 0)
157 break; 158 break;
@@ -678,7 +679,7 @@ EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale);
678int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) 679int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
679{ 680{
680 681
681 /* Intialize mult/shift and max_idle_ns */ 682 /* Initialize mult/shift and max_idle_ns */
682 __clocksource_updatefreq_scale(cs, scale, freq); 683 __clocksource_updatefreq_scale(cs, scale, freq);
683 684
684 /* Add clocksource to the clcoksource list */ 685 /* Add clocksource to the clcoksource list */
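
The one-line change above (tmp += from / 2) makes the candidate mult round to nearest rather than truncate, which roughly halves the worst-case error of the resulting cycles-to-nanoseconds conversion. A runnable sketch of that computation with arbitrary example frequencies:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t from = 24000000;	/* e.g. a 24 MHz counter */
	uint32_t to = 1000000000;	/* nanoseconds */
	uint32_t shift = 24;

	uint64_t trunc = ((uint64_t)to << shift) / from;
	uint64_t round = (((uint64_t)to << shift) + from / 2) / from;

	/* 699050666 vs 699050667 for these inputs */
	printf("mult truncated=%llu rounded=%llu\n",
	       (unsigned long long)trunc, (unsigned long long)round);
	return 0;
}
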
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index d2321891538f..5c00242fa921 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -14,6 +14,7 @@
14#include <linux/timex.h> 14#include <linux/timex.h>
15#include <linux/time.h> 15#include <linux/time.h>
16#include <linux/mm.h> 16#include <linux/mm.h>
17#include <linux/module.h>
17 18
18/* 19/*
19 * NTP timekeeping variables: 20 * NTP timekeeping variables:
@@ -74,6 +75,162 @@ static long time_adjust;
74/* constant (boot-param configurable) NTP tick adjustment (upscaled) */ 75/* constant (boot-param configurable) NTP tick adjustment (upscaled) */
75static s64 ntp_tick_adj; 76static s64 ntp_tick_adj;
76 77
78#ifdef CONFIG_NTP_PPS
79
80/*
81 * The following variables are used when a pulse-per-second (PPS) signal
82 * is available. They establish the engineering parameters of the clock
83 * discipline loop when controlled by the PPS signal.
84 */
85#define PPS_VALID 10 /* PPS signal watchdog max (s) */
86#define PPS_POPCORN 4 /* popcorn spike threshold (shift) */
87#define PPS_INTMIN 2 /* min freq interval (s) (shift) */
88#define PPS_INTMAX 8 /* max freq interval (s) (shift) */
89#define PPS_INTCOUNT 4 /* number of consecutive good intervals to
90 increase pps_shift or consecutive bad
91 intervals to decrease it */
92#define PPS_MAXWANDER 100000 /* max PPS freq wander (ns/s) */
93
94static int pps_valid; /* signal watchdog counter */
95static long pps_tf[3]; /* phase median filter */
96static long pps_jitter; /* current jitter (ns) */
97static struct timespec pps_fbase; /* beginning of the last freq interval */
98static int pps_shift; /* current interval duration (s) (shift) */
99static int pps_intcnt; /* interval counter */
100static s64 pps_freq; /* frequency offset (scaled ns/s) */
101static long pps_stabil; /* current stability (scaled ns/s) */
102
103/*
104 * PPS signal quality monitors
105 */
106static long pps_calcnt; /* calibration intervals */
107static long pps_jitcnt; /* jitter limit exceeded */
108static long pps_stbcnt; /* stability limit exceeded */
109static long pps_errcnt; /* calibration errors */
110
111
112/* PPS kernel consumer compensates the whole phase error immediately.
113 * Otherwise, reduce the offset by a fixed factor times the time constant.
114 */
115static inline s64 ntp_offset_chunk(s64 offset)
116{
117 if (time_status & STA_PPSTIME && time_status & STA_PPSSIGNAL)
118 return offset;
119 else
120 return shift_right(offset, SHIFT_PLL + time_constant);
121}
122
123static inline void pps_reset_freq_interval(void)
124{
125 /* the PPS calibration interval may end
126 surprisingly early */
127 pps_shift = PPS_INTMIN;
128 pps_intcnt = 0;
129}
130
131/**
132 * pps_clear - Clears the PPS state variables
133 *
134 * Must be called while holding a write on the xtime_lock
135 */
136static inline void pps_clear(void)
137{
138 pps_reset_freq_interval();
139 pps_tf[0] = 0;
140 pps_tf[1] = 0;
141 pps_tf[2] = 0;
142 pps_fbase.tv_sec = pps_fbase.tv_nsec = 0;
143 pps_freq = 0;
144}
145
146/* Decrease pps_valid to indicate that another second has passed since
147 * the last PPS signal. When it reaches 0, indicate that PPS signal is
148 * missing.
149 *
150 * Must be called while holding a write on the xtime_lock
151 */
152static inline void pps_dec_valid(void)
153{
154 if (pps_valid > 0)
155 pps_valid--;
156 else {
157 time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER |
158 STA_PPSWANDER | STA_PPSERROR);
159 pps_clear();
160 }
161}
162
163static inline void pps_set_freq(s64 freq)
164{
165 pps_freq = freq;
166}
167
168static inline int is_error_status(int status)
169{
170 return (time_status & (STA_UNSYNC|STA_CLOCKERR))
171 /* PPS signal lost when either PPS time or
172 * PPS frequency synchronization requested
173 */
174 || ((time_status & (STA_PPSFREQ|STA_PPSTIME))
175 && !(time_status & STA_PPSSIGNAL))
176 /* PPS jitter exceeded when
177 * PPS time synchronization requested */
178 || ((time_status & (STA_PPSTIME|STA_PPSJITTER))
179 == (STA_PPSTIME|STA_PPSJITTER))
180 /* PPS wander exceeded or calibration error when
181 * PPS frequency synchronization requested
182 */
183 || ((time_status & STA_PPSFREQ)
184 && (time_status & (STA_PPSWANDER|STA_PPSERROR)));
185}
186
187static inline void pps_fill_timex(struct timex *txc)
188{
189 txc->ppsfreq = shift_right((pps_freq >> PPM_SCALE_INV_SHIFT) *
190 PPM_SCALE_INV, NTP_SCALE_SHIFT);
191 txc->jitter = pps_jitter;
192 if (!(time_status & STA_NANO))
193 txc->jitter /= NSEC_PER_USEC;
194 txc->shift = pps_shift;
195 txc->stabil = pps_stabil;
196 txc->jitcnt = pps_jitcnt;
197 txc->calcnt = pps_calcnt;
198 txc->errcnt = pps_errcnt;
199 txc->stbcnt = pps_stbcnt;
200}
201
202#else /* !CONFIG_NTP_PPS */
203
204static inline s64 ntp_offset_chunk(s64 offset)
205{
206 return shift_right(offset, SHIFT_PLL + time_constant);
207}
208
209static inline void pps_reset_freq_interval(void) {}
210static inline void pps_clear(void) {}
211static inline void pps_dec_valid(void) {}
212static inline void pps_set_freq(s64 freq) {}
213
214static inline int is_error_status(int status)
215{
216 return status & (STA_UNSYNC|STA_CLOCKERR);
217}
218
219static inline void pps_fill_timex(struct timex *txc)
220{
221 /* PPS is not implemented, so these are zero */
222 txc->ppsfreq = 0;
223 txc->jitter = 0;
224 txc->shift = 0;
225 txc->stabil = 0;
226 txc->jitcnt = 0;
227 txc->calcnt = 0;
228 txc->errcnt = 0;
229 txc->stbcnt = 0;
230}
231
232#endif /* CONFIG_NTP_PPS */
233
77/* 234/*
78 * NTP methods: 235 * NTP methods:
79 */ 236 */
@@ -185,6 +342,9 @@ void ntp_clear(void)
185 342
186 tick_length = tick_length_base; 343 tick_length = tick_length_base;
187 time_offset = 0; 344 time_offset = 0;
345
346 /* Clear PPS state variables */
347 pps_clear();
188} 348}
189 349
190/* 350/*
@@ -250,16 +410,16 @@ void second_overflow(void)
250 time_status |= STA_UNSYNC; 410 time_status |= STA_UNSYNC;
251 } 411 }
252 412
253 /* 413 /* Compute the phase adjustment for the next second */
254 * Compute the phase adjustment for the next second. The offset is
255 * reduced by a fixed factor times the time constant.
256 */
257 tick_length = tick_length_base; 414 tick_length = tick_length_base;
258 415
259 delta = shift_right(time_offset, SHIFT_PLL + time_constant); 416 delta = ntp_offset_chunk(time_offset);
260 time_offset -= delta; 417 time_offset -= delta;
261 tick_length += delta; 418 tick_length += delta;
262 419
420 /* Check PPS signal */
421 pps_dec_valid();
422
263 if (!time_adjust) 423 if (!time_adjust)
264 return; 424 return;
265 425
@@ -369,6 +529,8 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts)
369 if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) { 529 if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) {
370 time_state = TIME_OK; 530 time_state = TIME_OK;
371 time_status = STA_UNSYNC; 531 time_status = STA_UNSYNC;
532 /* restart PPS frequency calibration */
533 pps_reset_freq_interval();
372 } 534 }
373 535
374 /* 536 /*
@@ -418,6 +580,8 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts
418 time_freq = txc->freq * PPM_SCALE; 580 time_freq = txc->freq * PPM_SCALE;
419 time_freq = min(time_freq, MAXFREQ_SCALED); 581 time_freq = min(time_freq, MAXFREQ_SCALED);
420 time_freq = max(time_freq, -MAXFREQ_SCALED); 582 time_freq = max(time_freq, -MAXFREQ_SCALED);
583 /* update pps_freq */
584 pps_set_freq(time_freq);
421 } 585 }
422 586
423 if (txc->modes & ADJ_MAXERROR) 587 if (txc->modes & ADJ_MAXERROR)
@@ -508,7 +672,8 @@ int do_adjtimex(struct timex *txc)
508 } 672 }
509 673
510 result = time_state; /* mostly `TIME_OK' */ 674 result = time_state; /* mostly `TIME_OK' */
511 if (time_status & (STA_UNSYNC|STA_CLOCKERR)) 675 /* check for errors */
676 if (is_error_status(time_status))
512 result = TIME_ERROR; 677 result = TIME_ERROR;
513 678
514 txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) * 679 txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) *
@@ -522,15 +687,8 @@ int do_adjtimex(struct timex *txc)
522 txc->tick = tick_usec; 687 txc->tick = tick_usec;
523 txc->tai = time_tai; 688 txc->tai = time_tai;
524 689
525 /* PPS is not implemented, so these are zero */ 690 /* fill PPS status fields */
526 txc->ppsfreq = 0; 691 pps_fill_timex(txc);
527 txc->jitter = 0;
528 txc->shift = 0;
529 txc->stabil = 0;
530 txc->jitcnt = 0;
531 txc->calcnt = 0;
532 txc->errcnt = 0;
533 txc->stbcnt = 0;
534 692
535 write_sequnlock_irq(&xtime_lock); 693 write_sequnlock_irq(&xtime_lock);
536 694
@@ -544,6 +702,243 @@ int do_adjtimex(struct timex *txc)
544 return result; 702 return result;
545} 703}
546 704
705#ifdef CONFIG_NTP_PPS
706
707/* actually struct pps_normtime is good old struct timespec, but it is
708 * semantically different (and it is the reason why it was invented):
709 * pps_normtime.nsec has a range of ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ]
710 * while timespec.tv_nsec has a range of [0, NSEC_PER_SEC) */
711struct pps_normtime {
712 __kernel_time_t sec; /* seconds */
713 long nsec; /* nanoseconds */
714};
715
716/* normalize the timestamp so that nsec is in the
717 ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] interval */
718static inline struct pps_normtime pps_normalize_ts(struct timespec ts)
719{
720 struct pps_normtime norm = {
721 .sec = ts.tv_sec,
722 .nsec = ts.tv_nsec
723 };
724
725 if (norm.nsec > (NSEC_PER_SEC >> 1)) {
726 norm.nsec -= NSEC_PER_SEC;
727 norm.sec++;
728 }
729
730 return norm;
731}
732
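
A quick worked example of the normalization just defined: a pulse timestamped at 0.999999200 s is reported as 1 s minus 800 ns, so a pulse just short of a second boundary counts as "early for the next second" rather than "very late for this one". A runnable sketch mirroring the kernel helper, but taking plain numbers instead of a timespec:

#include <stdio.h>

#define NSEC_PER_SEC 1000000000L

struct pps_normtime {
	long long	sec;
	long		nsec;
};

static struct pps_normtime normalize(long long tv_sec, long tv_nsec)
{
	struct pps_normtime norm = { .sec = tv_sec, .nsec = tv_nsec };

	if (norm.nsec > (NSEC_PER_SEC >> 1)) {
		norm.nsec -= NSEC_PER_SEC;
		norm.sec++;
	}
	return norm;
}

int main(void)
{
	struct pps_normtime n = normalize(0, 999999200);

	printf("sec=%lld nsec=%ld\n", n.sec, n.nsec);	/* sec=1 nsec=-800 */
	return 0;
}
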
733/* get current phase correction and jitter */
734static inline long pps_phase_filter_get(long *jitter)
735{
736 *jitter = pps_tf[0] - pps_tf[1];
737 if (*jitter < 0)
738 *jitter = -*jitter;
739
740 /* TODO: test various filters */
741 return pps_tf[0];
742}
743
744/* add the sample to the phase filter */
745static inline void pps_phase_filter_add(long err)
746{
747 pps_tf[2] = pps_tf[1];
748 pps_tf[1] = pps_tf[0];
749 pps_tf[0] = err;
750}
751
752/* decrease frequency calibration interval length.
753 * It is halved after four consecutive unstable intervals.
754 */
755static inline void pps_dec_freq_interval(void)
756{
757 if (--pps_intcnt <= -PPS_INTCOUNT) {
758 pps_intcnt = -PPS_INTCOUNT;
759 if (pps_shift > PPS_INTMIN) {
760 pps_shift--;
761 pps_intcnt = 0;
762 }
763 }
764}
765
766/* increase frequency calibration interval length.
767 * It is doubled after four consecutive stable intervals.
768 */
769static inline void pps_inc_freq_interval(void)
770{
771 if (++pps_intcnt >= PPS_INTCOUNT) {
772 pps_intcnt = PPS_INTCOUNT;
773 if (pps_shift < PPS_INTMAX) {
774 pps_shift++;
775 pps_intcnt = 0;
776 }
777 }
778}
779
780/* update clock frequency based on MONOTONIC_RAW clock PPS signal
781 * timestamps
782 *
783 * At the end of the calibration interval the difference between the
784 * first and last MONOTONIC_RAW clock timestamps divided by the length
785 * of the interval becomes the frequency update. If the interval was
786 * too long, the data are discarded.
787 * Returns the difference between old and new frequency values.
788 */
789static long hardpps_update_freq(struct pps_normtime freq_norm)
790{
791 long delta, delta_mod;
792 s64 ftemp;
793
794 /* check if the frequency interval was too long */
795 if (freq_norm.sec > (2 << pps_shift)) {
796 time_status |= STA_PPSERROR;
797 pps_errcnt++;
798 pps_dec_freq_interval();
799 pr_err("hardpps: PPSERROR: interval too long - %ld s\n",
800 freq_norm.sec);
801 return 0;
802 }
803
804 /* here the raw frequency offset and wander (stability) is
805 * calculated. If the wander is less than the wander threshold
806 * the interval is increased; otherwise it is decreased.
807 */
808 ftemp = div_s64(((s64)(-freq_norm.nsec)) << NTP_SCALE_SHIFT,
809 freq_norm.sec);
810 delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT);
811 pps_freq = ftemp;
812 if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) {
813 pr_warning("hardpps: PPSWANDER: change=%ld\n", delta);
814 time_status |= STA_PPSWANDER;
815 pps_stbcnt++;
816 pps_dec_freq_interval();
817 } else { /* good sample */
818 pps_inc_freq_interval();
819 }
820
821 /* the stability metric is calculated as the average of recent
822 * frequency changes, but is used only for performance
823 * monitoring
824 */
825 delta_mod = delta;
826 if (delta_mod < 0)
827 delta_mod = -delta_mod;
828 pps_stabil += (div_s64(((s64)delta_mod) <<
829 (NTP_SCALE_SHIFT - SHIFT_USEC),
830 NSEC_PER_USEC) - pps_stabil) >> PPS_INTMIN;
831
832 /* if enabled, the system clock frequency is updated */
833 if ((time_status & STA_PPSFREQ) != 0 &&
834 (time_status & STA_FREQHOLD) == 0) {
835 time_freq = pps_freq;
836 ntp_update_frequency();
837 }
838
839 return delta;
840}
841
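
The frequency update above boils down to: take the raw clock's drift across the calibration interval, negate it, divide by the interval length, and express the result in the kernel's scaled ns/s units. A runnable sketch with invented numbers (NTP_SCALE_SHIFT is 32 in this kernel):

#include <stdint.h>
#include <stdio.h>

#define SCALE_SHIFT 32

int main(void)
{
	long freq_sec = 16;	/* calibration interval: pps_shift == 4 */
	long freq_nsec = -800;	/* raw clock measured 800 ns less than the
				   16 s between pulses, i.e. it runs slow */

	int64_t ftemp = ((int64_t)(-freq_nsec) << SCALE_SHIFT) / freq_sec;

	/* a +50 ns/s (about 50 ppb) frequency correction */
	printf("scaled freq offset = %lld (= %lld ns/s)\n",
	       (long long)ftemp, (long long)(ftemp >> SCALE_SHIFT));
	return 0;
}
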
842/* correct REALTIME clock phase error against PPS signal */
843static void hardpps_update_phase(long error)
844{
845 long correction = -error;
846 long jitter;
847
848 /* add the sample to the median filter */
849 pps_phase_filter_add(correction);
850 correction = pps_phase_filter_get(&jitter);
851
852 /* Nominal jitter is due to PPS signal noise. If it exceeds the
853 * threshold, the sample is discarded; otherwise, if so enabled,
854 * the time offset is updated.
855 */
856 if (jitter > (pps_jitter << PPS_POPCORN)) {
857 pr_warning("hardpps: PPSJITTER: jitter=%ld, limit=%ld\n",
858 jitter, (pps_jitter << PPS_POPCORN));
859 time_status |= STA_PPSJITTER;
860 pps_jitcnt++;
861 } else if (time_status & STA_PPSTIME) {
862 /* correct the time using the phase offset */
863 time_offset = div_s64(((s64)correction) << NTP_SCALE_SHIFT,
864 NTP_INTERVAL_FREQ);
865 /* cancel running adjtime() */
866 time_adjust = 0;
867 }
868 /* update jitter */
869 pps_jitter += (jitter - pps_jitter) >> PPS_INTMIN;
870}
871
872/*
873 * hardpps() - discipline CPU clock oscillator to external PPS signal
874 *
875 * This routine is called at each PPS signal arrival in order to
876 * discipline the CPU clock oscillator to the PPS signal. It takes two
877 * parameters: REALTIME and MONOTONIC_RAW clock timestamps. The former
878 * is used to correct clock phase error and the latter is used to
879 * correct the frequency.
880 *
881 * This code is based on David Mills's reference nanokernel
882 * implementation. It was mostly rewritten but keeps the same idea.
883 */
884void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
885{
886 struct pps_normtime pts_norm, freq_norm;
887 unsigned long flags;
888
889 pts_norm = pps_normalize_ts(*phase_ts);
890
891 write_seqlock_irqsave(&xtime_lock, flags);
892
893 /* clear the error bits, they will be set again if needed */
894 time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR);
895
896 /* indicate signal presence */
897 time_status |= STA_PPSSIGNAL;
898 pps_valid = PPS_VALID;
899
900 /* when called for the first time,
901 * just start the frequency interval */
902 if (unlikely(pps_fbase.tv_sec == 0)) {
903 pps_fbase = *raw_ts;
904 write_sequnlock_irqrestore(&xtime_lock, flags);
905 return;
906 }
907
908 /* ok, now we have a base for frequency calculation */
909 freq_norm = pps_normalize_ts(timespec_sub(*raw_ts, pps_fbase));
910
911 /* check that the signal is in the range
912 * [1s - MAXFREQ us, 1s + MAXFREQ us], otherwise reject it */
913 if ((freq_norm.sec == 0) ||
914 (freq_norm.nsec > MAXFREQ * freq_norm.sec) ||
915 (freq_norm.nsec < -MAXFREQ * freq_norm.sec)) {
916 time_status |= STA_PPSJITTER;
917 /* restart the frequency calibration interval */
918 pps_fbase = *raw_ts;
919 write_sequnlock_irqrestore(&xtime_lock, flags);
920 pr_err("hardpps: PPSJITTER: bad pulse\n");
921 return;
922 }
923
924 /* signal is ok */
925
926 /* check if the current frequency interval is finished */
927 if (freq_norm.sec >= (1 << pps_shift)) {
928 pps_calcnt++;
929 /* restart the frequency calibration interval */
930 pps_fbase = *raw_ts;
931 hardpps_update_freq(freq_norm);
932 }
933
934 hardpps_update_phase(pts_norm.nsec);
935
936 write_sequnlock_irqrestore(&xtime_lock, flags);
937}
938EXPORT_SYMBOL(hardpps);
939
940#endif /* CONFIG_NTP_PPS */
941
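
For context, hardpps() is meant to be fed by a PPS consumer: on every pulse the caller snapshots REALTIME and MONOTONIC_RAW together (using the helper added to timekeeping.c later in this patch) and hands both timestamps in. A kernel-context sketch of such a caller; my_pps_pulse() is invented, the prototypes are written out here instead of being pulled from a header, and this is not buildable on its own:

#include <linux/time.h>

/* added by this series; declared here only to keep the sketch short */
extern void getnstime_raw_and_real(struct timespec *ts_raw,
				   struct timespec *ts_real);
extern void hardpps(const struct timespec *phase_ts,
		    const struct timespec *raw_ts);

static void my_pps_pulse(void)
{
	struct timespec ts_raw, ts_real;

	/* both clocks sampled under the same seqlock read, so they match */
	getnstime_raw_and_real(&ts_raw, &ts_real);

	/* phase error from REALTIME, frequency from MONOTONIC_RAW */
	hardpps(&ts_real, &ts_raw);
}
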
547static int __init ntp_tick_adj_setup(char *str) 942static int __init ntp_tick_adj_setup(char *str)
548{ 943{
549 ntp_tick_adj = simple_strtol(str, NULL, 0); 944 ntp_tick_adj = simple_strtol(str, NULL, 0);
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index b6b898d2eeef..051bc80a0c43 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -49,7 +49,7 @@ struct tick_device *tick_get_device(int cpu)
49 */ 49 */
50int tick_is_oneshot_available(void) 50int tick_is_oneshot_available(void)
51{ 51{
52 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; 52 struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
53 53
54 return dev && (dev->features & CLOCK_EVT_FEAT_ONESHOT); 54 return dev && (dev->features & CLOCK_EVT_FEAT_ONESHOT);
55} 55}
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index aada0e52680a..5cbc101f908b 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -95,7 +95,7 @@ int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires,
95 */ 95 */
96int tick_program_event(ktime_t expires, int force) 96int tick_program_event(ktime_t expires, int force)
97{ 97{
98 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; 98 struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
99 99
100 return tick_dev_program_event(dev, expires, force); 100 return tick_dev_program_event(dev, expires, force);
101} 101}
@@ -167,7 +167,7 @@ int tick_oneshot_mode_active(void)
167 int ret; 167 int ret;
168 168
169 local_irq_save(flags); 169 local_irq_save(flags);
170 ret = __get_cpu_var(tick_cpu_device).mode == TICKDEV_MODE_ONESHOT; 170 ret = __this_cpu_read(tick_cpu_device.mode) == TICKDEV_MODE_ONESHOT;
171 local_irq_restore(flags); 171 local_irq_restore(flags);
172 172
173 return ret; 173 return ret;
diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c
index ac38fbb176cc..a9ae369925ce 100644
--- a/kernel/time/timecompare.c
+++ b/kernel/time/timecompare.c
@@ -21,6 +21,7 @@
21#include <linux/module.h> 21#include <linux/module.h>
22#include <linux/slab.h> 22#include <linux/slab.h>
23#include <linux/math64.h> 23#include <linux/math64.h>
24#include <linux/kernel.h>
24 25
25/* 26/*
26 * fixed point arithmetic scale factor for skew 27 * fixed point arithmetic scale factor for skew
@@ -57,11 +58,11 @@ int timecompare_offset(struct timecompare *sync,
57 int index; 58 int index;
58 int num_samples = sync->num_samples; 59 int num_samples = sync->num_samples;
59 60
60 if (num_samples > sizeof(buffer)/sizeof(buffer[0])) { 61 if (num_samples > ARRAY_SIZE(buffer)) {
61 samples = kmalloc(sizeof(*samples) * num_samples, GFP_ATOMIC); 62 samples = kmalloc(sizeof(*samples) * num_samples, GFP_ATOMIC);
62 if (!samples) { 63 if (!samples) {
63 samples = buffer; 64 samples = buffer;
64 num_samples = sizeof(buffer)/sizeof(buffer[0]); 65 num_samples = ARRAY_SIZE(buffer);
65 } 66 }
66 } else { 67 } else {
67 samples = buffer; 68 samples = buffer;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 49010d822f72..5536aaf3ba36 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -32,6 +32,8 @@ struct timekeeper {
32 cycle_t cycle_interval; 32 cycle_t cycle_interval;
33 /* Number of clock shifted nano seconds in one NTP interval. */ 33 /* Number of clock shifted nano seconds in one NTP interval. */
34 u64 xtime_interval; 34 u64 xtime_interval;
35 /* shifted nano seconds left over when rounding cycle_interval */
36 s64 xtime_remainder;
35 /* Raw nano seconds accumulated per NTP interval. */ 37 /* Raw nano seconds accumulated per NTP interval. */
36 u32 raw_interval; 38 u32 raw_interval;
37 39
@@ -62,7 +64,7 @@ struct timekeeper timekeeper;
62static void timekeeper_setup_internals(struct clocksource *clock) 64static void timekeeper_setup_internals(struct clocksource *clock)
63{ 65{
64 cycle_t interval; 66 cycle_t interval;
65 u64 tmp; 67 u64 tmp, ntpinterval;
66 68
67 timekeeper.clock = clock; 69 timekeeper.clock = clock;
68 clock->cycle_last = clock->read(clock); 70 clock->cycle_last = clock->read(clock);
@@ -70,6 +72,7 @@ static void timekeeper_setup_internals(struct clocksource *clock)
70 /* Do the ns -> cycle conversion first, using original mult */ 72 /* Do the ns -> cycle conversion first, using original mult */
71 tmp = NTP_INTERVAL_LENGTH; 73 tmp = NTP_INTERVAL_LENGTH;
72 tmp <<= clock->shift; 74 tmp <<= clock->shift;
75 ntpinterval = tmp;
73 tmp += clock->mult/2; 76 tmp += clock->mult/2;
74 do_div(tmp, clock->mult); 77 do_div(tmp, clock->mult);
75 if (tmp == 0) 78 if (tmp == 0)
@@ -80,6 +83,7 @@ static void timekeeper_setup_internals(struct clocksource *clock)
80 83
81 /* Go back from cycles -> shifted ns */ 84 /* Go back from cycles -> shifted ns */
82 timekeeper.xtime_interval = (u64) interval * clock->mult; 85 timekeeper.xtime_interval = (u64) interval * clock->mult;
86 timekeeper.xtime_remainder = ntpinterval - timekeeper.xtime_interval;
83 timekeeper.raw_interval = 87 timekeeper.raw_interval =
84 ((u64) interval * clock->mult) >> clock->shift; 88 ((u64) interval * clock->mult) >> clock->shift;
85 89
@@ -284,6 +288,49 @@ void ktime_get_ts(struct timespec *ts)
284} 288}
285EXPORT_SYMBOL_GPL(ktime_get_ts); 289EXPORT_SYMBOL_GPL(ktime_get_ts);
286 290
291#ifdef CONFIG_NTP_PPS
292
293/**
294 * getnstime_raw_and_real - get day and raw monotonic time in timespec format
295 * @ts_raw: pointer to the timespec to be set to raw monotonic time
296 * @ts_real: pointer to the timespec to be set to the time of day
297 *
298 * This function reads both the time of day and raw monotonic time at the
299 * same time atomically and stores the resulting timestamps in timespec
300 * format.
301 */
302void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
303{
304 unsigned long seq;
305 s64 nsecs_raw, nsecs_real;
306
307 WARN_ON_ONCE(timekeeping_suspended);
308
309 do {
310 u32 arch_offset;
311
312 seq = read_seqbegin(&xtime_lock);
313
314 *ts_raw = raw_time;
315 *ts_real = xtime;
316
317 nsecs_raw = timekeeping_get_ns_raw();
318 nsecs_real = timekeeping_get_ns();
319
320 /* If arch requires, add in gettimeoffset() */
321 arch_offset = arch_gettimeoffset();
322 nsecs_raw += arch_offset;
323 nsecs_real += arch_offset;
324
325 } while (read_seqretry(&xtime_lock, seq));
326
327 timespec_add_ns(ts_raw, nsecs_raw);
328 timespec_add_ns(ts_real, nsecs_real);
329}
330EXPORT_SYMBOL(getnstime_raw_and_real);
331
332#endif /* CONFIG_NTP_PPS */
333
287/** 334/**
288 * do_gettimeofday - Returns the time of day in a timeval 335 * do_gettimeofday - Returns the time of day in a timeval
289 * @tv: pointer to the timeval to be set 336 * @tv: pointer to the timeval to be set
@@ -719,7 +766,8 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
719 766
720 /* Accumulate error between NTP and clock interval */ 767 /* Accumulate error between NTP and clock interval */
721 timekeeper.ntp_error += tick_length << shift; 768 timekeeper.ntp_error += tick_length << shift;
722 timekeeper.ntp_error -= timekeeper.xtime_interval << 769 timekeeper.ntp_error -=
770 (timekeeper.xtime_interval + timekeeper.xtime_remainder) <<
723 (timekeeper.ntp_error_shift + shift); 771 (timekeeper.ntp_error_shift + shift);
724 772
725 return offset; 773 return offset;
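
The xtime_remainder bookkeeping added above exists because cycle_interval is a rounded, whole-cycle quantity: converting NTP_INTERVAL_LENGTH into cycles and back into shifted nanoseconds does not land exactly on the original value, and previously the truncated part simply vanished, so the clock accumulated a small steady error against NTP. Folding the remainder into ntp_error in logarithmic_accumulation() lets the error accounting see the full NTP interval again. A toy calculation with made-up clocksource numbers (illustrative only, not values from the patch):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            /* hypothetical clocksource: shift = 8, mult = 3, 1000 ns NTP
             * interval, chosen only so the rounding loss is visible */
            uint64_t ntp_len_ns = 1000, shift = 8, mult = 3;

            uint64_t ntpinterval = ntp_len_ns << shift;          /* 256000 shifted ns */
            uint64_t cycles = (ntpinterval + mult / 2) / mult;   /* 85333 whole cycles */
            uint64_t xtime_interval = cycles * mult;             /* 255999 shifted ns */
            int64_t  remainder = (int64_t)ntpinterval - (int64_t)xtime_interval;

            /* 1 shifted ns per interval would otherwise be lost every tick */
            printf("xtime_remainder = %lld shifted ns\n", (long long)remainder);
            return 0;
    }
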
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index ab8f5e33fa92..32a19f9397fc 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -79,26 +79,26 @@ print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base,
79{ 79{
80 struct hrtimer *timer, tmp; 80 struct hrtimer *timer, tmp;
81 unsigned long next = 0, i; 81 unsigned long next = 0, i;
82 struct rb_node *curr; 82 struct timerqueue_node *curr;
83 unsigned long flags; 83 unsigned long flags;
84 84
85next_one: 85next_one:
86 i = 0; 86 i = 0;
87 raw_spin_lock_irqsave(&base->cpu_base->lock, flags); 87 raw_spin_lock_irqsave(&base->cpu_base->lock, flags);
88 88
89 curr = base->first; 89 curr = timerqueue_getnext(&base->active);
90 /* 90 /*
91 * Crude but we have to do this O(N*N) thing, because 91 * Crude but we have to do this O(N*N) thing, because
92 * we have to unlock the base when printing: 92 * we have to unlock the base when printing:
93 */ 93 */
94 while (curr && i < next) { 94 while (curr && i < next) {
95 curr = rb_next(curr); 95 curr = timerqueue_iterate_next(curr);
96 i++; 96 i++;
97 } 97 }
98 98
99 if (curr) { 99 if (curr) {
100 100
101 timer = rb_entry(curr, struct hrtimer, node); 101 timer = container_of(curr, struct hrtimer, node);
102 tmp = *timer; 102 tmp = *timer;
103 raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags); 103 raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags);
104 104
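
The timer_list.c hunk tracks the hrtimer conversion from an open-coded rbtree (base->first, rb_next(), rb_entry()) to the shared timerqueue helpers. A sketch of the resulting iteration pattern, not from the patch (walk_active_timers() is a hypothetical caller; real code must hold base->cpu_base->lock while walking):

    #include <linux/hrtimer.h>
    #include <linux/kernel.h>
    #include <linux/timerqueue.h>

    /* Illustrative only: same calls as the hunk above, hypothetical caller. */
    static void walk_active_timers(struct hrtimer_clock_base *base)
    {
            struct timerqueue_node *node;

            for (node = timerqueue_getnext(&base->active);
                 node;
                 node = timerqueue_iterate_next(node)) {
                    struct hrtimer *timer = container_of(node, struct hrtimer, node);

                    pr_info("active hrtimer %p\n", timer);
            }
    }
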
diff --git a/kernel/timer.c b/kernel/timer.c
index 353b9227c2ec..43ca9936f2d0 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -88,18 +88,6 @@ struct tvec_base boot_tvec_bases;
88EXPORT_SYMBOL(boot_tvec_bases); 88EXPORT_SYMBOL(boot_tvec_bases);
89static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; 89static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;
90 90
91/*
92 * Note that all tvec_bases are 2 byte aligned and lower bit of
93 * base in timer_list is guaranteed to be zero. Use the LSB to
94 * indicate whether the timer is deferrable.
95 *
96 * A deferrable timer will work normally when the system is busy, but
97 * will not cause a CPU to come out of idle just to service it; instead,
98 * the timer will be serviced when the CPU eventually wakes up with a
99 * subsequent non-deferrable timer.
100 */
101#define TBASE_DEFERRABLE_FLAG (0x1)
102
103/* Functions below help us manage 'deferrable' flag */ 91/* Functions below help us manage 'deferrable' flag */
104static inline unsigned int tbase_get_deferrable(struct tvec_base *base) 92static inline unsigned int tbase_get_deferrable(struct tvec_base *base)
105{ 93{
@@ -113,8 +101,7 @@ static inline struct tvec_base *tbase_get_base(struct tvec_base *base)
113 101
114static inline void timer_set_deferrable(struct timer_list *timer) 102static inline void timer_set_deferrable(struct timer_list *timer)
115{ 103{
116 timer->base = ((struct tvec_base *)((unsigned long)(timer->base) | 104 timer->base = TBASE_MAKE_DEFERRED(timer->base);
117 TBASE_DEFERRABLE_FLAG));
118} 105}
119 106
120static inline void 107static inline void
@@ -343,15 +330,6 @@ void set_timer_slack(struct timer_list *timer, int slack_hz)
343} 330}
344EXPORT_SYMBOL_GPL(set_timer_slack); 331EXPORT_SYMBOL_GPL(set_timer_slack);
345 332
346
347static inline void set_running_timer(struct tvec_base *base,
348 struct timer_list *timer)
349{
350#ifdef CONFIG_SMP
351 base->running_timer = timer;
352#endif
353}
354
355static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) 333static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
356{ 334{
357 unsigned long expires = timer->expires; 335 unsigned long expires = timer->expires;
@@ -936,15 +914,12 @@ int del_timer(struct timer_list *timer)
936} 914}
937EXPORT_SYMBOL(del_timer); 915EXPORT_SYMBOL(del_timer);
938 916
939#ifdef CONFIG_SMP
940/** 917/**
941 * try_to_del_timer_sync - Try to deactivate a timer 918 * try_to_del_timer_sync - Try to deactivate a timer
942 * @timer: timer do del 919 * @timer: timer do del
943 * 920 *
944 * This function tries to deactivate a timer. Upon successful (ret >= 0) 921 * This function tries to deactivate a timer. Upon successful (ret >= 0)
945 * exit the timer is not queued and the handler is not running on any CPU. 922 * exit the timer is not queued and the handler is not running on any CPU.
946 *
947 * It must not be called from interrupt contexts.
948 */ 923 */
949int try_to_del_timer_sync(struct timer_list *timer) 924int try_to_del_timer_sync(struct timer_list *timer)
950{ 925{
@@ -973,6 +948,7 @@ out:
973} 948}
974EXPORT_SYMBOL(try_to_del_timer_sync); 949EXPORT_SYMBOL(try_to_del_timer_sync);
975 950
951#ifdef CONFIG_SMP
976/** 952/**
977 * del_timer_sync - deactivate a timer and wait for the handler to finish. 953 * del_timer_sync - deactivate a timer and wait for the handler to finish.
978 * @timer: the timer to be deactivated 954 * @timer: the timer to be deactivated
@@ -983,7 +959,7 @@ EXPORT_SYMBOL(try_to_del_timer_sync);
983 * 959 *
984 * Synchronization rules: Callers must prevent restarting of the timer, 960 * Synchronization rules: Callers must prevent restarting of the timer,
985 * otherwise this function is meaningless. It must not be called from 961 * otherwise this function is meaningless. It must not be called from
986 * interrupt contexts. The caller must not hold locks which would prevent 962 * hardirq contexts. The caller must not hold locks which would prevent
987 * completion of the timer's handler. The timer's handler must not call 963 * completion of the timer's handler. The timer's handler must not call
988 * add_timer_on(). Upon exit the timer is not queued and the handler is 964 * add_timer_on(). Upon exit the timer is not queued and the handler is
989 * not running on any CPU. 965 * not running on any CPU.
@@ -993,14 +969,16 @@ EXPORT_SYMBOL(try_to_del_timer_sync);
993int del_timer_sync(struct timer_list *timer) 969int del_timer_sync(struct timer_list *timer)
994{ 970{
995#ifdef CONFIG_LOCKDEP 971#ifdef CONFIG_LOCKDEP
996 unsigned long flags; 972 local_bh_disable();
997
998 local_irq_save(flags);
999 lock_map_acquire(&timer->lockdep_map); 973 lock_map_acquire(&timer->lockdep_map);
1000 lock_map_release(&timer->lockdep_map); 974 lock_map_release(&timer->lockdep_map);
1001 local_irq_restore(flags); 975 local_bh_enable();
1002#endif 976#endif
1003 977 /*
978 * don't use it in hardirq context, because it
979 * could lead to deadlock.
980 */
981 WARN_ON(in_irq());
1004 for (;;) { 982 for (;;) {
1005 int ret = try_to_del_timer_sync(timer); 983 int ret = try_to_del_timer_sync(timer);
1006 if (ret >= 0) 984 if (ret >= 0)
@@ -1111,7 +1089,7 @@ static inline void __run_timers(struct tvec_base *base)
1111 1089
1112 timer_stats_account_timer(timer); 1090 timer_stats_account_timer(timer);
1113 1091
1114 set_running_timer(base, timer); 1092 base->running_timer = timer;
1115 detach_timer(timer, 1); 1093 detach_timer(timer, 1);
1116 1094
1117 spin_unlock_irq(&base->lock); 1095 spin_unlock_irq(&base->lock);
@@ -1119,7 +1097,7 @@ static inline void __run_timers(struct tvec_base *base)
1119 spin_lock_irq(&base->lock); 1097 spin_lock_irq(&base->lock);
1120 } 1098 }
1121 } 1099 }
1122 set_running_timer(base, NULL); 1100 base->running_timer = NULL;
1123 spin_unlock_irq(&base->lock); 1101 spin_unlock_irq(&base->lock);
1124} 1102}
1125 1103
@@ -1249,7 +1227,7 @@ static unsigned long cmp_next_hrtimer_event(unsigned long now,
1249 */ 1227 */
1250unsigned long get_next_timer_interrupt(unsigned long now) 1228unsigned long get_next_timer_interrupt(unsigned long now)
1251{ 1229{
1252 struct tvec_base *base = __get_cpu_var(tvec_bases); 1230 struct tvec_base *base = __this_cpu_read(tvec_bases);
1253 unsigned long expires; 1231 unsigned long expires;
1254 1232
1255 /* 1233 /*
@@ -1298,7 +1276,7 @@ void update_process_times(int user_tick)
1298 */ 1276 */
1299static void run_timer_softirq(struct softirq_action *h) 1277static void run_timer_softirq(struct softirq_action *h)
1300{ 1278{
1301 struct tvec_base *base = __get_cpu_var(tvec_bases); 1279 struct tvec_base *base = __this_cpu_read(tvec_bases);
1302 1280
1303 hrtimer_run_pending(); 1281 hrtimer_run_pending();
1304 1282
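
The timer.c hunk drops the long comment describing the deferrable-timer trick and the open-coded OR in timer_set_deferrable(), relying instead on the TBASE_MAKE_DEFERRED() helper introduced elsewhere in this series. The underlying idiom is unchanged: tvec_base pointers are at least 2-byte aligned, so bit 0 of timer->base can carry the "deferrable" flag. A standalone userspace sketch of that tagged-pointer idiom (all names hypothetical):

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Any struct containing an int is at least 4-byte aligned, so bit 0 of a
     * pointer to it is always zero and is free to carry a flag. */
    struct base { int dummy; };

    #define FLAG_DEFERRABLE 0x1UL

    static struct base *make_deferred(struct base *p)
    {
            return (struct base *)((uintptr_t)p | FLAG_DEFERRABLE);
    }

    static int is_deferred(const struct base *p)
    {
            return (uintptr_t)p & FLAG_DEFERRABLE;
    }

    static struct base *real_base(struct base *p)
    {
            return (struct base *)((uintptr_t)p & ~FLAG_DEFERRABLE);
    }

    int main(void)
    {
            static struct base b;
            struct base *tagged = make_deferred(&b);

            assert(is_deferred(tagged));
            assert(real_base(tagged) == &b);
            printf("flag rides in bit 0, object pointer recoverable\n");
            return 0;
    }
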
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index ea37e2ff4164..14674dce77a6 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -69,6 +69,21 @@ config EVENT_TRACING
69 select CONTEXT_SWITCH_TRACER 69 select CONTEXT_SWITCH_TRACER
70 bool 70 bool
71 71
72config EVENT_POWER_TRACING_DEPRECATED
73 depends on EVENT_TRACING
74 bool "Deprecated power event trace API, to be removed"
75 default y
76 help
77 Provides old power event types:
78 C-state/idle accounting events:
79 power:power_start
80 power:power_end
81 and old cpufreq accounting event:
82 power:power_frequency
83 This is for userspace compatibility
84 and will vanish after 5 kernel iterations,
85 namely 2.6.41.
86
72config CONTEXT_SWITCH_TRACER 87config CONTEXT_SWITCH_TRACER
73 bool 88 bool
74 89
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 53f338190b26..761c510a06c5 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -52,7 +52,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
52endif 52endif
53obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o 53obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
54obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o 54obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
55obj-$(CONFIG_EVENT_TRACING) += power-traces.o 55obj-$(CONFIG_TRACEPOINTS) += power-traces.o
56ifeq ($(CONFIG_TRACING),y) 56ifeq ($(CONFIG_TRACING),y)
57obj-$(CONFIG_KGDB_KDB) += trace_kdb.o 57obj-$(CONFIG_KGDB_KDB) += trace_kdb.o
58endif 58endif
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 7b8ec0281548..153562d0b93c 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -758,53 +758,58 @@ static void blk_add_trace_rq_complete(void *ignore,
758 * @q: queue the io is for 758 * @q: queue the io is for
759 * @bio: the source bio 759 * @bio: the source bio
760 * @what: the action 760 * @what: the action
761 * @error: error, if any
761 * 762 *
762 * Description: 763 * Description:
763 * Records an action against a bio. Will log the bio offset + size. 764 * Records an action against a bio. Will log the bio offset + size.
764 * 765 *
765 **/ 766 **/
766static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, 767static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
767 u32 what) 768 u32 what, int error)
768{ 769{
769 struct blk_trace *bt = q->blk_trace; 770 struct blk_trace *bt = q->blk_trace;
770 771
771 if (likely(!bt)) 772 if (likely(!bt))
772 return; 773 return;
773 774
775 if (!error && !bio_flagged(bio, BIO_UPTODATE))
776 error = EIO;
777
774 __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, 778 __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what,
775 !bio_flagged(bio, BIO_UPTODATE), 0, NULL); 779 error, 0, NULL);
776} 780}
777 781
778static void blk_add_trace_bio_bounce(void *ignore, 782static void blk_add_trace_bio_bounce(void *ignore,
779 struct request_queue *q, struct bio *bio) 783 struct request_queue *q, struct bio *bio)
780{ 784{
781 blk_add_trace_bio(q, bio, BLK_TA_BOUNCE); 785 blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0);
782} 786}
783 787
784static void blk_add_trace_bio_complete(void *ignore, 788static void blk_add_trace_bio_complete(void *ignore,
785 struct request_queue *q, struct bio *bio) 789 struct request_queue *q, struct bio *bio,
790 int error)
786{ 791{
787 blk_add_trace_bio(q, bio, BLK_TA_COMPLETE); 792 blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error);
788} 793}
789 794
790static void blk_add_trace_bio_backmerge(void *ignore, 795static void blk_add_trace_bio_backmerge(void *ignore,
791 struct request_queue *q, 796 struct request_queue *q,
792 struct bio *bio) 797 struct bio *bio)
793{ 798{
794 blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE); 799 blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0);
795} 800}
796 801
797static void blk_add_trace_bio_frontmerge(void *ignore, 802static void blk_add_trace_bio_frontmerge(void *ignore,
798 struct request_queue *q, 803 struct request_queue *q,
799 struct bio *bio) 804 struct bio *bio)
800{ 805{
801 blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE); 806 blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0);
802} 807}
803 808
804static void blk_add_trace_bio_queue(void *ignore, 809static void blk_add_trace_bio_queue(void *ignore,
805 struct request_queue *q, struct bio *bio) 810 struct request_queue *q, struct bio *bio)
806{ 811{
807 blk_add_trace_bio(q, bio, BLK_TA_QUEUE); 812 blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0);
808} 813}
809 814
810static void blk_add_trace_getrq(void *ignore, 815static void blk_add_trace_getrq(void *ignore,
@@ -812,7 +817,7 @@ static void blk_add_trace_getrq(void *ignore,
812 struct bio *bio, int rw) 817 struct bio *bio, int rw)
813{ 818{
814 if (bio) 819 if (bio)
815 blk_add_trace_bio(q, bio, BLK_TA_GETRQ); 820 blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0);
816 else { 821 else {
817 struct blk_trace *bt = q->blk_trace; 822 struct blk_trace *bt = q->blk_trace;
818 823
@@ -827,7 +832,7 @@ static void blk_add_trace_sleeprq(void *ignore,
827 struct bio *bio, int rw) 832 struct bio *bio, int rw)
828{ 833{
829 if (bio) 834 if (bio)
830 blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ); 835 blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0);
831 else { 836 else {
832 struct blk_trace *bt = q->blk_trace; 837 struct blk_trace *bt = q->blk_trace;
833 838
@@ -887,7 +892,7 @@ static void blk_add_trace_split(void *ignore,
887} 892}
888 893
889/** 894/**
890 * blk_add_trace_remap - Add a trace for a remap operation 895 * blk_add_trace_bio_remap - Add a trace for a bio-remap operation
891 * @ignore: trace callback data parameter (not used) 896 * @ignore: trace callback data parameter (not used)
892 * @q: queue the io is for 897 * @q: queue the io is for
893 * @bio: the source bio 898 * @bio: the source bio
@@ -899,9 +904,9 @@ static void blk_add_trace_split(void *ignore,
899 * it spans a stripe (or similar). Add a trace for that action. 904 * it spans a stripe (or similar). Add a trace for that action.
900 * 905 *
901 **/ 906 **/
902static void blk_add_trace_remap(void *ignore, 907static void blk_add_trace_bio_remap(void *ignore,
903 struct request_queue *q, struct bio *bio, 908 struct request_queue *q, struct bio *bio,
904 dev_t dev, sector_t from) 909 dev_t dev, sector_t from)
905{ 910{
906 struct blk_trace *bt = q->blk_trace; 911 struct blk_trace *bt = q->blk_trace;
907 struct blk_io_trace_remap r; 912 struct blk_io_trace_remap r;
@@ -1016,7 +1021,7 @@ static void blk_register_tracepoints(void)
1016 WARN_ON(ret); 1021 WARN_ON(ret);
1017 ret = register_trace_block_split(blk_add_trace_split, NULL); 1022 ret = register_trace_block_split(blk_add_trace_split, NULL);
1018 WARN_ON(ret); 1023 WARN_ON(ret);
1019 ret = register_trace_block_remap(blk_add_trace_remap, NULL); 1024 ret = register_trace_block_bio_remap(blk_add_trace_bio_remap, NULL);
1020 WARN_ON(ret); 1025 WARN_ON(ret);
1021 ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); 1026 ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
1022 WARN_ON(ret); 1027 WARN_ON(ret);
@@ -1025,7 +1030,7 @@ static void blk_register_tracepoints(void)
1025static void blk_unregister_tracepoints(void) 1030static void blk_unregister_tracepoints(void)
1026{ 1031{
1027 unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); 1032 unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
1028 unregister_trace_block_remap(blk_add_trace_remap, NULL); 1033 unregister_trace_block_bio_remap(blk_add_trace_bio_remap, NULL);
1029 unregister_trace_block_split(blk_add_trace_split, NULL); 1034 unregister_trace_block_split(blk_add_trace_split, NULL);
1030 unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL); 1035 unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL);
1031 unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL); 1036 unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL);
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
index a22582a06161..f55fcf61b223 100644
--- a/kernel/trace/power-traces.c
+++ b/kernel/trace/power-traces.c
@@ -13,5 +13,8 @@
13#define CREATE_TRACE_POINTS 13#define CREATE_TRACE_POINTS
14#include <trace/events/power.h> 14#include <trace/events/power.h>
15 15
16EXPORT_TRACEPOINT_SYMBOL_GPL(power_frequency); 16#ifdef EVENT_POWER_TRACING_DEPRECATED
17EXPORT_TRACEPOINT_SYMBOL_GPL(power_start);
18#endif
19EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle);
17 20
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index f8cf959bad45..dc53ecb80589 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1313,12 +1313,10 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1313 1313
1314 __this_cpu_inc(user_stack_count); 1314 __this_cpu_inc(user_stack_count);
1315 1315
1316
1317
1318 event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, 1316 event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,
1319 sizeof(*entry), flags, pc); 1317 sizeof(*entry), flags, pc);
1320 if (!event) 1318 if (!event)
1321 return; 1319 goto out_drop_count;
1322 entry = ring_buffer_event_data(event); 1320 entry = ring_buffer_event_data(event);
1323 1321
1324 entry->tgid = current->tgid; 1322 entry->tgid = current->tgid;
@@ -1333,8 +1331,8 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1333 if (!filter_check_discard(call, entry, buffer, event)) 1331 if (!filter_check_discard(call, entry, buffer, event))
1334 ring_buffer_unlock_commit(buffer, event); 1332 ring_buffer_unlock_commit(buffer, event);
1335 1333
1334 out_drop_count:
1336 __this_cpu_dec(user_stack_count); 1335 __this_cpu_dec(user_stack_count);
1337
1338 out: 1336 out:
1339 preempt_enable(); 1337 preempt_enable();
1340} 1338}
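
The ftrace_trace_userstack() fix above is purely about error-path balance: user_stack_count is incremented before reserving the ring-buffer event, and the old bare return on reservation failure leaked that increment; routing the failure through the new out_drop_count label restores symmetry. The shape of the fix as a tiny runnable sketch (do_work() and usage_count are hypothetical):

    #include <stdio.h>

    static int usage_count;

    /* Illustrative only: the goto-unwind shape the hunk above restores, so the
     * counter bumped on entry is always dropped again, even on the early-exit
     * path (the old code leaked the increment when reservation failed). */
    static int do_work(int fail_reservation)
    {
            int ret = -1;

            usage_count++;

            if (fail_reservation)
                    goto out_drop_count;    /* was a bare "return" before the fix */

            ret = 0;                        /* the real work would happen here */

    out_drop_count:
            usage_count--;
            return ret;
    }

    int main(void)
    {
            do_work(1);
            do_work(0);
            printf("usage_count = %d\n", usage_count);      /* 0: balanced */
            return 0;
    }
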
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index e3dfecaf13e6..6cf223764be8 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -53,7 +53,7 @@
53 */ 53 */
54 54
55/* 55/*
56 * Function trace entry - function address and parent function addres: 56 * Function trace entry - function address and parent function address:
57 */ 57 */
58FTRACE_ENTRY(function, ftrace_entry, 58FTRACE_ENTRY(function, ftrace_entry,
59 59
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 39c059ca670e..19a359d5e6d5 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -21,17 +21,46 @@ typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)])
21/* Count the events in use (per event id, not per instance) */ 21/* Count the events in use (per event id, not per instance) */
22static int total_ref_count; 22static int total_ref_count;
23 23
24static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
25 struct perf_event *p_event)
26{
27 /* No tracing, just counting, so no obvious leak */
28 if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW))
29 return 0;
30
31 /* Some events are ok to be traced by non-root users... */
32 if (p_event->attach_state == PERF_ATTACH_TASK) {
33 if (tp_event->flags & TRACE_EVENT_FL_CAP_ANY)
34 return 0;
35 }
36
37 /*
38 * ...otherwise raw tracepoint data can be a severe data leak,
39 * only allow root to have these.
40 */
41 if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
42 return -EPERM;
43
44 return 0;
45}
46
24static int perf_trace_event_init(struct ftrace_event_call *tp_event, 47static int perf_trace_event_init(struct ftrace_event_call *tp_event,
25 struct perf_event *p_event) 48 struct perf_event *p_event)
26{ 49{
27 struct hlist_head __percpu *list; 50 struct hlist_head __percpu *list;
28 int ret = -ENOMEM; 51 int ret;
29 int cpu; 52 int cpu;
30 53
54 ret = perf_trace_event_perm(tp_event, p_event);
55 if (ret)
56 return ret;
57
31 p_event->tp_event = tp_event; 58 p_event->tp_event = tp_event;
32 if (tp_event->perf_refcount++ > 0) 59 if (tp_event->perf_refcount++ > 0)
33 return 0; 60 return 0;
34 61
62 ret = -ENOMEM;
63
35 list = alloc_percpu(struct hlist_head); 64 list = alloc_percpu(struct hlist_head);
36 if (!list) 65 if (!list)
37 goto fail; 66 goto fail;
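
The new perf_trace_event_perm() gate above enforces that raw tracepoint payloads (PERF_SAMPLE_RAW) stay privileged: counting-only events pass, per-task events explicitly flagged TRACE_EVENT_FL_CAP_ANY pass, everything else needs CAP_SYS_ADMIN unless perf_event_paranoid is relaxed. A rough userspace sketch of what trips the check (the config value is a placeholder; real code would read the tracepoint id from debugfs, e.g. .../tracing/events/<subsys>/<event>/id):

    #include <linux/perf_event.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <string.h>
    #include <stdio.h>

    int main(void)
    {
            struct perf_event_attr attr;

            memset(&attr, 0, sizeof(attr));
            attr.type = PERF_TYPE_TRACEPOINT;
            attr.size = sizeof(attr);
            attr.config = 1;                    /* placeholder tracepoint id */
            attr.sample_type = PERF_SAMPLE_RAW; /* requests raw payloads */

            /* perf_event_open(attr, pid, cpu, group_fd, flags) */
            long fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
            if (fd < 0)
                    perror("perf_event_open"); /* EPERM for unprivileged raw tracing;
                                                  ENOENT if the placeholder id is unknown */
            return 0;
    }
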
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 0725eeab1937..35fde09b81de 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -27,6 +27,12 @@
27 27
28DEFINE_MUTEX(event_mutex); 28DEFINE_MUTEX(event_mutex);
29 29
30DEFINE_MUTEX(event_storage_mutex);
31EXPORT_SYMBOL_GPL(event_storage_mutex);
32
33char event_storage[EVENT_STORAGE_SIZE];
34EXPORT_SYMBOL_GPL(event_storage);
35
30LIST_HEAD(ftrace_events); 36LIST_HEAD(ftrace_events);
31LIST_HEAD(ftrace_common_fields); 37LIST_HEAD(ftrace_common_fields);
32 38
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 4ba44deaac25..4b74d71705c0 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -83,13 +83,19 @@ static void __always_unused ____ftrace_check_##name(void) \
83 83
84#undef __array 84#undef __array
85#define __array(type, item, len) \ 85#define __array(type, item, len) \
86 BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ 86 do { \
87 ret = trace_define_field(event_call, #type "[" #len "]", #item, \ 87 BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \
88 mutex_lock(&event_storage_mutex); \
89 snprintf(event_storage, sizeof(event_storage), \
90 "%s[%d]", #type, len); \
91 ret = trace_define_field(event_call, event_storage, #item, \
88 offsetof(typeof(field), item), \ 92 offsetof(typeof(field), item), \
89 sizeof(field.item), \ 93 sizeof(field.item), \
90 is_signed_type(type), FILTER_OTHER); \ 94 is_signed_type(type), FILTER_OTHER); \
91 if (ret) \ 95 mutex_unlock(&event_storage_mutex); \
92 return ret; 96 if (ret) \
97 return ret; \
98 } while (0);
93 99
94#undef __array_desc 100#undef __array_desc
95#define __array_desc(type, container, item, len) \ 101#define __array_desc(type, container, item, len) \
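
Two things change in the __array() macro above: the type string is now built at runtime in the shared event_storage buffer (hence the new mutex), and the multi-statement body is wrapped in do { } while (0) so the macro expands safely inside unbraced if statements. The wrapper half of that, as a standalone userspace sketch (macro and variable names hypothetical):

    #include <stdio.h>

    /* Without the do/while wrapper, only puts() would sit under the if below
     * and count++ would run unconditionally. */
    #define LOG_AND_COUNT(msg)      do { puts(msg); count++; } while (0)

    int main(void)
    {
            int count = 0;
            int verbose = 0;

            if (verbose)
                    LOG_AND_COUNT("hello");     /* expands to one statement */

            printf("count = %d\n", count);      /* 0, as intended */
            return 0;
    }
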
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 155a415b3209..659732eba07c 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -558,7 +558,7 @@ trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr)
558static int trace_wakeup_test_thread(void *data) 558static int trace_wakeup_test_thread(void *data)
559{ 559{
560 /* Make this a RT thread, doesn't need to be too high */ 560 /* Make this a RT thread, doesn't need to be too high */
561 struct sched_param param = { .sched_priority = 5 }; 561 static const struct sched_param param = { .sched_priority = 5 };
562 struct completion *x = data; 562 struct completion *x = data;
563 563
564 sched_setscheduler(current, SCHED_FIFO, &param); 564 sched_setscheduler(current, SCHED_FIFO, &param);
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 25915832291a..9da289c34f22 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -12,6 +12,8 @@
12#include <linux/highuid.h> 12#include <linux/highuid.h>
13#include <linux/cred.h> 13#include <linux/cred.h>
14 14
15static struct kmem_cache *user_ns_cachep __read_mostly;
16
15/* 17/*
16 * Create a new user namespace, deriving the creator from the user in the 18 * Create a new user namespace, deriving the creator from the user in the
17 * passed credentials, and replacing that user with the new root user for the 19 * passed credentials, and replacing that user with the new root user for the
@@ -26,7 +28,7 @@ int create_user_ns(struct cred *new)
26 struct user_struct *root_user; 28 struct user_struct *root_user;
27 int n; 29 int n;
28 30
29 ns = kmalloc(sizeof(struct user_namespace), GFP_KERNEL); 31 ns = kmem_cache_alloc(user_ns_cachep, GFP_KERNEL);
30 if (!ns) 32 if (!ns)
31 return -ENOMEM; 33 return -ENOMEM;
32 34
@@ -38,7 +40,7 @@ int create_user_ns(struct cred *new)
38 /* Alloc new root user. */ 40 /* Alloc new root user. */
39 root_user = alloc_uid(ns, 0); 41 root_user = alloc_uid(ns, 0);
40 if (!root_user) { 42 if (!root_user) {
41 kfree(ns); 43 kmem_cache_free(user_ns_cachep, ns);
42 return -ENOMEM; 44 return -ENOMEM;
43 } 45 }
44 46
@@ -71,7 +73,7 @@ static void free_user_ns_work(struct work_struct *work)
71 struct user_namespace *ns = 73 struct user_namespace *ns =
72 container_of(work, struct user_namespace, destroyer); 74 container_of(work, struct user_namespace, destroyer);
73 free_uid(ns->creator); 75 free_uid(ns->creator);
74 kfree(ns); 76 kmem_cache_free(user_ns_cachep, ns);
75} 77}
76 78
77void free_user_ns(struct kref *kref) 79void free_user_ns(struct kref *kref)
@@ -126,3 +128,10 @@ gid_t user_ns_map_gid(struct user_namespace *to, const struct cred *cred, gid_t
126 /* No useful relationship so no mapping */ 128 /* No useful relationship so no mapping */
127 return overflowgid; 129 return overflowgid;
128} 130}
131
132static __init int user_namespaces_init(void)
133{
134 user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC);
135 return 0;
136}
137module_init(user_namespaces_init);
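
The user_namespace.c change moves the namespace object from plain kmalloc()/kfree() onto a dedicated slab cache created once at init time with KMEM_CACHE(), so the objects get exact sizing and appear under their own name in slab statistics. The pattern in isolation, as a kernel-style sketch (struct my_obj and my_obj_cachep are hypothetical, not part of the patch):

    #include <linux/init.h>
    #include <linux/module.h>
    #include <linux/slab.h>

    struct my_obj {
            int value;
    };

    static struct kmem_cache *my_obj_cachep __read_mostly;

    static __init int my_cache_init(void)
    {
            struct my_obj *obj;

            /* KMEM_CACHE() derives name, size and alignment from the type;
             * SLAB_PANIC makes boot fail loudly if the cache can't be created. */
            my_obj_cachep = KMEM_CACHE(my_obj, SLAB_PANIC);

            obj = kmem_cache_alloc(my_obj_cachep, GFP_KERNEL);
            if (obj)
                    kmem_cache_free(my_obj_cachep, obj);
            return 0;
    }
    module_init(my_cache_init);
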
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 5b082156cd21..d7ebdf4cea98 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -57,6 +57,8 @@ static int __init hardlockup_panic_setup(char *str)
57{ 57{
58 if (!strncmp(str, "panic", 5)) 58 if (!strncmp(str, "panic", 5))
59 hardlockup_panic = 1; 59 hardlockup_panic = 1;
60 else if (!strncmp(str, "0", 1))
61 no_watchdog = 1;
60 return 1; 62 return 1;
61} 63}
62__setup("nmi_watchdog=", hardlockup_panic_setup); 64__setup("nmi_watchdog=", hardlockup_panic_setup);
@@ -116,12 +118,12 @@ static void __touch_watchdog(void)
116{ 118{
117 int this_cpu = smp_processor_id(); 119 int this_cpu = smp_processor_id();
118 120
119 __get_cpu_var(watchdog_touch_ts) = get_timestamp(this_cpu); 121 __this_cpu_write(watchdog_touch_ts, get_timestamp(this_cpu));
120} 122}
121 123
122void touch_softlockup_watchdog(void) 124void touch_softlockup_watchdog(void)
123{ 125{
124 __raw_get_cpu_var(watchdog_touch_ts) = 0; 126 __this_cpu_write(watchdog_touch_ts, 0);
125} 127}
126EXPORT_SYMBOL(touch_softlockup_watchdog); 128EXPORT_SYMBOL(touch_softlockup_watchdog);
127 129
@@ -165,12 +167,12 @@ void touch_softlockup_watchdog_sync(void)
165/* watchdog detector functions */ 167/* watchdog detector functions */
166static int is_hardlockup(void) 168static int is_hardlockup(void)
167{ 169{
168 unsigned long hrint = __get_cpu_var(hrtimer_interrupts); 170 unsigned long hrint = __this_cpu_read(hrtimer_interrupts);
169 171
170 if (__get_cpu_var(hrtimer_interrupts_saved) == hrint) 172 if (__this_cpu_read(hrtimer_interrupts_saved) == hrint)
171 return 1; 173 return 1;
172 174
173 __get_cpu_var(hrtimer_interrupts_saved) = hrint; 175 __this_cpu_write(hrtimer_interrupts_saved, hrint);
174 return 0; 176 return 0;
175} 177}
176#endif 178#endif
@@ -203,8 +205,8 @@ static void watchdog_overflow_callback(struct perf_event *event, int nmi,
203 /* Ensure the watchdog never gets throttled */ 205 /* Ensure the watchdog never gets throttled */
204 event->hw.interrupts = 0; 206 event->hw.interrupts = 0;
205 207
206 if (__get_cpu_var(watchdog_nmi_touch) == true) { 208 if (__this_cpu_read(watchdog_nmi_touch) == true) {
207 __get_cpu_var(watchdog_nmi_touch) = false; 209 __this_cpu_write(watchdog_nmi_touch, false);
208 return; 210 return;
209 } 211 }
210 212
@@ -218,7 +220,7 @@ static void watchdog_overflow_callback(struct perf_event *event, int nmi,
218 int this_cpu = smp_processor_id(); 220 int this_cpu = smp_processor_id();
219 221
220 /* only print hardlockups once */ 222 /* only print hardlockups once */
221 if (__get_cpu_var(hard_watchdog_warn) == true) 223 if (__this_cpu_read(hard_watchdog_warn) == true)
222 return; 224 return;
223 225
224 if (hardlockup_panic) 226 if (hardlockup_panic)
@@ -226,16 +228,16 @@ static void watchdog_overflow_callback(struct perf_event *event, int nmi,
226 else 228 else
227 WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu); 229 WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu);
228 230
229 __get_cpu_var(hard_watchdog_warn) = true; 231 __this_cpu_write(hard_watchdog_warn, true);
230 return; 232 return;
231 } 233 }
232 234
233 __get_cpu_var(hard_watchdog_warn) = false; 235 __this_cpu_write(hard_watchdog_warn, false);
234 return; 236 return;
235} 237}
236static void watchdog_interrupt_count(void) 238static void watchdog_interrupt_count(void)
237{ 239{
238 __get_cpu_var(hrtimer_interrupts)++; 240 __this_cpu_inc(hrtimer_interrupts);
239} 241}
240#else 242#else
241static inline void watchdog_interrupt_count(void) { return; } 243static inline void watchdog_interrupt_count(void) { return; }
@@ -244,7 +246,7 @@ static inline void watchdog_interrupt_count(void) { return; }
244/* watchdog kicker functions */ 246/* watchdog kicker functions */
245static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) 247static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
246{ 248{
247 unsigned long touch_ts = __get_cpu_var(watchdog_touch_ts); 249 unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts);
248 struct pt_regs *regs = get_irq_regs(); 250 struct pt_regs *regs = get_irq_regs();
249 int duration; 251 int duration;
250 252
@@ -252,18 +254,18 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
252 watchdog_interrupt_count(); 254 watchdog_interrupt_count();
253 255
254 /* kick the softlockup detector */ 256 /* kick the softlockup detector */
255 wake_up_process(__get_cpu_var(softlockup_watchdog)); 257 wake_up_process(__this_cpu_read(softlockup_watchdog));
256 258
257 /* .. and repeat */ 259 /* .. and repeat */
258 hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period())); 260 hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period()));
259 261
260 if (touch_ts == 0) { 262 if (touch_ts == 0) {
261 if (unlikely(__get_cpu_var(softlockup_touch_sync))) { 263 if (unlikely(__this_cpu_read(softlockup_touch_sync))) {
262 /* 264 /*
263 * If the time stamp was touched atomically 265 * If the time stamp was touched atomically
264 * make sure the scheduler tick is up to date. 266 * make sure the scheduler tick is up to date.
265 */ 267 */
266 __get_cpu_var(softlockup_touch_sync) = false; 268 __this_cpu_write(softlockup_touch_sync, false);
267 sched_clock_tick(); 269 sched_clock_tick();
268 } 270 }
269 __touch_watchdog(); 271 __touch_watchdog();
@@ -279,7 +281,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
279 duration = is_softlockup(touch_ts); 281 duration = is_softlockup(touch_ts);
280 if (unlikely(duration)) { 282 if (unlikely(duration)) {
281 /* only warn once */ 283 /* only warn once */
282 if (__get_cpu_var(soft_watchdog_warn) == true) 284 if (__this_cpu_read(soft_watchdog_warn) == true)
283 return HRTIMER_RESTART; 285 return HRTIMER_RESTART;
284 286
285 printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", 287 printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
@@ -294,9 +296,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
294 296
295 if (softlockup_panic) 297 if (softlockup_panic)
296 panic("softlockup: hung tasks"); 298 panic("softlockup: hung tasks");
297 __get_cpu_var(soft_watchdog_warn) = true; 299 __this_cpu_write(soft_watchdog_warn, true);
298 } else 300 } else
299 __get_cpu_var(soft_watchdog_warn) = false; 301 __this_cpu_write(soft_watchdog_warn, false);
300 302
301 return HRTIMER_RESTART; 303 return HRTIMER_RESTART;
302} 304}
@@ -307,7 +309,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
307 */ 309 */
308static int watchdog(void *unused) 310static int watchdog(void *unused)
309{ 311{
310 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 312 static struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
311 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); 313 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
312 314
313 sched_setscheduler(current, SCHED_FIFO, &param); 315 sched_setscheduler(current, SCHED_FIFO, &param);
@@ -548,13 +550,13 @@ static struct notifier_block __cpuinitdata cpu_nfb = {
548 .notifier_call = cpu_callback 550 .notifier_call = cpu_callback
549}; 551};
550 552
551static int __init spawn_watchdog_task(void) 553void __init lockup_detector_init(void)
552{ 554{
553 void *cpu = (void *)(long)smp_processor_id(); 555 void *cpu = (void *)(long)smp_processor_id();
554 int err; 556 int err;
555 557
556 if (no_watchdog) 558 if (no_watchdog)
557 return 0; 559 return;
558 560
559 err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); 561 err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
560 WARN_ON(notifier_to_errno(err)); 562 WARN_ON(notifier_to_errno(err));
@@ -562,6 +564,5 @@ static int __init spawn_watchdog_task(void)
562 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); 564 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
563 register_cpu_notifier(&cpu_nfb); 565 register_cpu_notifier(&cpu_nfb);
564 566
565 return 0; 567 return;
566} 568}
567early_initcall(spawn_watchdog_task);
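
Beyond the per-cpu accessor conversion, the watchdog.c hunk teaches the nmi_watchdog= parser a second value and replaces the early_initcall with lockup_detector_init(), which is now expected to be called explicitly by init code rather than running as an initcall. The command-line values handled by hardlockup_panic_setup() above:

    nmi_watchdog=panic    panic the machine when a hard lockup is detected
    nmi_watchdog=0        set no_watchdog, so lockup_detector_init() returns early

Both strings are matched by the strncmp() calls shown in the hunk.
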
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index e785b0f2aea5..8ee6ec82f88a 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -932,6 +932,38 @@ static void insert_work(struct cpu_workqueue_struct *cwq,
932 wake_up_worker(gcwq); 932 wake_up_worker(gcwq);
933} 933}
934 934
935/*
936 * Test whether @work is being queued from another work executing on the
937 * same workqueue. This is rather expensive and should only be used from
938 * cold paths.
939 */
940static bool is_chained_work(struct workqueue_struct *wq)
941{
942 unsigned long flags;
943 unsigned int cpu;
944
945 for_each_gcwq_cpu(cpu) {
946 struct global_cwq *gcwq = get_gcwq(cpu);
947 struct worker *worker;
948 struct hlist_node *pos;
949 int i;
950
951 spin_lock_irqsave(&gcwq->lock, flags);
952 for_each_busy_worker(worker, i, pos, gcwq) {
953 if (worker->task != current)
954 continue;
955 spin_unlock_irqrestore(&gcwq->lock, flags);
956 /*
957 * I'm @worker, no locking necessary. See if @work
958 * is headed to the same workqueue.
959 */
960 return worker->current_cwq->wq == wq;
961 }
962 spin_unlock_irqrestore(&gcwq->lock, flags);
963 }
964 return false;
965}
966
935static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, 967static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
936 struct work_struct *work) 968 struct work_struct *work)
937{ 969{
@@ -943,7 +975,9 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
943 975
944 debug_work_activate(work); 976 debug_work_activate(work);
945 977
946 if (WARN_ON_ONCE(wq->flags & WQ_DYING)) 978 /* if dying, only works from the same workqueue are allowed */
979 if (unlikely(wq->flags & WQ_DYING) &&
980 WARN_ON_ONCE(!is_chained_work(wq)))
947 return; 981 return;
948 982
949 /* determine gcwq to use */ 983 /* determine gcwq to use */
@@ -2936,11 +2970,35 @@ EXPORT_SYMBOL_GPL(__alloc_workqueue_key);
2936 */ 2970 */
2937void destroy_workqueue(struct workqueue_struct *wq) 2971void destroy_workqueue(struct workqueue_struct *wq)
2938{ 2972{
2973 unsigned int flush_cnt = 0;
2939 unsigned int cpu; 2974 unsigned int cpu;
2940 2975
2976 /*
2977 * Mark @wq dying and drain all pending works. Once WQ_DYING is
2978 * set, only chain queueing is allowed. IOW, only currently
2979 * pending or running work items on @wq can queue further work
2980 * items on it. @wq is flushed repeatedly until it becomes empty.
2981 * The number of flushing is detemined by the depth of chaining and
2982 * should be relatively short. Whine if it takes too long.
2983 */
2941 wq->flags |= WQ_DYING; 2984 wq->flags |= WQ_DYING;
2985reflush:
2942 flush_workqueue(wq); 2986 flush_workqueue(wq);
2943 2987
2988 for_each_cwq_cpu(cpu, wq) {
2989 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
2990
2991 if (!cwq->nr_active && list_empty(&cwq->delayed_works))
2992 continue;
2993
2994 if (++flush_cnt == 10 ||
2995 (flush_cnt % 100 == 0 && flush_cnt <= 1000))
2996 printk(KERN_WARNING "workqueue %s: flush on "
2997 "destruction isn't complete after %u tries\n",
2998 wq->name, flush_cnt);
2999 goto reflush;
3000 }
3001
2944 /* 3002 /*
2945 * wq list is used to freeze wq, remove from list after 3003 * wq list is used to freeze wq, remove from list after
2946 * flushing is complete in case freeze races us. 3004 * flushing is complete in case freeze races us.