Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 5
-rw-r--r--  kernel/audit.c | 2
-rw-r--r--  kernel/capability.c | 2
-rw-r--r--  kernel/cgroup.c | 55
-rw-r--r--  kernel/cpu.c | 29
-rw-r--r--  kernel/cred.c | 16
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 2
-rw-r--r--  kernel/exit.c | 16
-rw-r--r--  kernel/fork.c | 45
-rw-r--r--  kernel/freezer.c | 9
-rw-r--r--  kernel/futex.c | 297
-rw-r--r--  kernel/hrtimer.c | 87
-rw-r--r--  kernel/hw_breakpoint.c | 2
-rw-r--r--  kernel/irq/Kconfig | 3
-rw-r--r--  kernel/irq/handle.c | 111
-rw-r--r--  kernel/irq/irqdesc.c | 40
-rw-r--r--  kernel/irq/manage.c | 4
-rw-r--r--  kernel/irq/migration.c | 14
-rw-r--r--  kernel/irq_work.c | 18
-rw-r--r--  kernel/kexec.c | 2
-rw-r--r--  kernel/kprobes.c | 573
-rw-r--r--  kernel/kthread.c | 2
-rw-r--r--  kernel/latencytop.c | 23
-rw-r--r--  kernel/lockdep.c | 18
-rw-r--r--  kernel/lockdep_proc.c | 16
-rw-r--r--  kernel/module.c | 187
-rw-r--r--  kernel/mutex.c | 2
-rw-r--r--  kernel/panic.c | 1
-rw-r--r--  kernel/params.c | 65
-rw-r--r--  kernel/perf_event.c | 736
-rw-r--r--  kernel/posix-timers.c | 10
-rw-r--r--  kernel/power/Kconfig | 5
-rw-r--r--  kernel/power/Makefile | 6
-rw-r--r--  kernel/power/hibernate.c | 11
-rw-r--r--  kernel/power/main.c | 2
-rw-r--r--  kernel/power/nvs.c | 136
-rw-r--r--  kernel/power/process.c | 14
-rw-r--r--  kernel/power/snapshot.c | 7
-rw-r--r--  kernel/power/suspend.c | 9
-rw-r--r--  kernel/power/swap.c | 7
-rw-r--r--  kernel/printk.c | 194
-rw-r--r--  kernel/ptrace.c | 2
-rw-r--r--  kernel/rcutiny.c | 106
-rw-r--r--  kernel/rcutiny_plugin.h | 433
-rw-r--r--  kernel/rcutorture.c | 270
-rw-r--r--  kernel/rcutree.c | 160
-rw-r--r--  kernel/rcutree.h | 61
-rw-r--r--  kernel/rcutree_plugin.h | 135
-rw-r--r--  kernel/rcutree_trace.c | 12
-rw-r--r--  kernel/sched.c | 709
-rw-r--r--  kernel/sched_autogroup.c | 270
-rw-r--r--  kernel/sched_autogroup.h | 36
-rw-r--r--  kernel/sched_clock.c | 2
-rw-r--r--  kernel/sched_debug.c | 123
-rw-r--r--  kernel/sched_fair.c | 384
-rw-r--r--  kernel/sched_features.h | 2
-rw-r--r--  kernel/sched_rt.c | 26
-rw-r--r--  kernel/smp.c | 75
-rw-r--r--  kernel/softirq.c | 65
-rw-r--r--  kernel/srcu.c | 19
-rw-r--r--  kernel/sys.c | 13
-rw-r--r--  kernel/sysctl.c | 87
-rw-r--r--  kernel/sysctl_binary.c | 3
-rw-r--r--  kernel/taskstats.c | 7
-rw-r--r--  kernel/time.c | 4
-rw-r--r--  kernel/time/clocksource.c | 11
-rw-r--r--  kernel/time/ntp.c | 425
-rw-r--r--  kernel/time/tick-common.c | 2
-rw-r--r--  kernel/time/tick-oneshot.c | 4
-rw-r--r--  kernel/time/tick-sched.c | 7
-rw-r--r--  kernel/time/timecompare.c | 5
-rw-r--r--  kernel/time/timekeeping.c | 56
-rw-r--r--  kernel/time/timer_list.c | 12
-rw-r--r--  kernel/timer.c | 42
-rw-r--r--  kernel/trace/Kconfig | 15
-rw-r--r--  kernel/trace/Makefile | 2
-rw-r--r--  kernel/trace/blktrace.c | 44
-rw-r--r--  kernel/trace/power-traces.c | 5
-rw-r--r--  kernel/trace/trace.c | 6
-rw-r--r--  kernel/trace/trace_entries.h | 2
-rw-r--r--  kernel/trace/trace_event_perf.c | 31
-rw-r--r--  kernel/trace/trace_events.c | 18
-rw-r--r--  kernel/trace/trace_export.c | 20
-rw-r--r--  kernel/trace/trace_irqsoff.c | 8
-rw-r--r--  kernel/trace/trace_selftest.c | 2
-rw-r--r--  kernel/trace/trace_syscalls.c | 52
-rw-r--r--  kernel/tracepoint.c | 31
-rw-r--r--  kernel/user_namespace.c | 15
-rw-r--r--  kernel/watchdog.c | 96
-rw-r--r--  kernel/workqueue.c | 117
90 files changed, 4508 insertions, 2307 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 0b5ff083fa22..353d3fe8ba33 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -43,7 +43,7 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
 obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
 obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
-obj-$(CONFIG_USE_GENERIC_SMP_HELPERS) += smp.o
+obj-$(CONFIG_SMP) += smp.o
 ifneq ($(CONFIG_SMP),y)
 obj-y += up.o
 endif
@@ -100,6 +100,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace/
 obj-$(CONFIG_TRACING) += trace/
 obj-$(CONFIG_X86_DS) += trace/
 obj-$(CONFIG_RING_BUFFER) += trace/
+obj-$(CONFIG_TRACEPOINTS) += trace/
 obj-$(CONFIG_SMP) += sched_cpupri.o
 obj-$(CONFIG_IRQ_WORK) += irq_work.o
 obj-$(CONFIG_PERF_EVENTS) += perf_event.o
@@ -121,7 +122,7 @@ $(obj)/configs.o: $(obj)/config_data.h
 # config_data.h contains the same information as ikconfig.h but gzipped.
 # Info from config_data can be extracted from /proc/config*
 targets += config_data.gz
-$(obj)/config_data.gz: .config FORCE
+$(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
 	$(call if_changed,gzip)
 
 quiet_cmd_ikconfiggz = IKCFG $@
diff --git a/kernel/audit.c b/kernel/audit.c
index 77770a034d59..e4956244ae50 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -400,7 +400,7 @@ static void kauditd_send_skb(struct sk_buff *skb)
 	if (err < 0) {
 		BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */
 		printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid);
-		audit_log_lost("auditd dissapeared\n");
+		audit_log_lost("auditd disappeared\n");
 		audit_pid = 0;
 		/* we might get lucky and get this in the next auditd */
 		audit_hold_skb(skb);
diff --git a/kernel/capability.c b/kernel/capability.c
index 2f05303715a5..9e9385f132c8 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -306,7 +306,7 @@ int capable(int cap)
 		BUG();
 	}
 
-	if (security_capable(cap) == 0) {
+	if (security_capable(current_cred(), cap) == 0) {
 		current->flags |= PF_SUPERPRIV;
 		return 1;
 	}
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 66a416b42c18..b24d7027b83c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -764,6 +764,7 @@ EXPORT_SYMBOL_GPL(cgroup_unlock);
  */
 
 static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode);
+static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *);
 static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
 static int cgroup_populate_dir(struct cgroup *cgrp);
 static const struct inode_operations cgroup_dir_inode_operations;
@@ -860,6 +861,11 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
 	iput(inode);
 }
 
+static int cgroup_delete(const struct dentry *d)
+{
+	return 1;
+}
+
 static void remove_dir(struct dentry *d)
 {
 	struct dentry *parent = dget(d->d_parent);
@@ -874,25 +880,29 @@ static void cgroup_clear_directory(struct dentry *dentry)
 	struct list_head *node;
 
 	BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
-	spin_lock(&dcache_lock);
+	spin_lock(&dentry->d_lock);
 	node = dentry->d_subdirs.next;
 	while (node != &dentry->d_subdirs) {
 		struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
+
+		spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
 		list_del_init(node);
 		if (d->d_inode) {
 			/* This should never be called on a cgroup
 			 * directory with child cgroups */
 			BUG_ON(d->d_inode->i_mode & S_IFDIR);
-			d = dget_locked(d);
-			spin_unlock(&dcache_lock);
+			dget_dlock(d);
+			spin_unlock(&d->d_lock);
+			spin_unlock(&dentry->d_lock);
 			d_delete(d);
 			simple_unlink(dentry->d_inode, d);
 			dput(d);
-			spin_lock(&dcache_lock);
-		}
+			spin_lock(&dentry->d_lock);
+		} else
+			spin_unlock(&d->d_lock);
 		node = dentry->d_subdirs.next;
 	}
-	spin_unlock(&dcache_lock);
+	spin_unlock(&dentry->d_lock);
 }
 
 /*
@@ -900,11 +910,16 @@ static void cgroup_clear_directory(struct dentry *dentry)
  */
 static void cgroup_d_remove_dir(struct dentry *dentry)
 {
+	struct dentry *parent;
+
 	cgroup_clear_directory(dentry);
 
-	spin_lock(&dcache_lock);
+	parent = dentry->d_parent;
+	spin_lock(&parent->d_lock);
+	spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
 	list_del_init(&dentry->d_u.d_child);
-	spin_unlock(&dcache_lock);
+	spin_unlock(&dentry->d_lock);
+	spin_unlock(&parent->d_lock);
 	remove_dir(dentry);
 }
 
@@ -1440,6 +1455,11 @@ static int cgroup_set_super(struct super_block *sb, void *data)
 
 static int cgroup_get_rootdir(struct super_block *sb)
 {
+	static const struct dentry_operations cgroup_dops = {
+		.d_iput = cgroup_diput,
+		.d_delete = cgroup_delete,
+	};
+
 	struct inode *inode =
 		cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb);
 	struct dentry *dentry;
@@ -1457,6 +1477,8 @@ static int cgroup_get_rootdir(struct super_block *sb)
 		return -ENOMEM;
 	}
 	sb->s_root = dentry;
+	/* for everything else we want ->d_op set */
+	sb->s_d_op = &cgroup_dops;
 	return 0;
 }
 
@@ -2180,12 +2202,20 @@ static const struct file_operations cgroup_file_operations = {
 };
 
 static const struct inode_operations cgroup_dir_inode_operations = {
-	.lookup = simple_lookup,
+	.lookup = cgroup_lookup,
 	.mkdir = cgroup_mkdir,
 	.rmdir = cgroup_rmdir,
 	.rename = cgroup_rename,
 };
 
+static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
+{
+	if (dentry->d_name.len > NAME_MAX)
+		return ERR_PTR(-ENAMETOOLONG);
+	d_add(dentry, NULL);
+	return NULL;
+}
+
 /*
  * Check if a file is a control file
  */
@@ -2199,10 +2229,6 @@ static inline struct cftype *__file_cft(struct file *file)
 static int cgroup_create_file(struct dentry *dentry, mode_t mode,
 			       struct super_block *sb)
 {
-	static const struct dentry_operations cgroup_dops = {
-		.d_iput = cgroup_diput,
-	};
-
 	struct inode *inode;
 
 	if (!dentry)
@@ -2228,7 +2254,6 @@ static int cgroup_create_file(struct dentry *dentry, mode_t mode,
 		inode->i_size = 0;
 		inode->i_fop = &cgroup_file_operations;
 	}
-	dentry->d_op = &cgroup_dops;
 	d_instantiate(dentry, inode);
 	dget(dentry);	/* Extra count - pin the dentry in core */
 	return 0;
@@ -3638,9 +3663,7 @@ again:
 	list_del(&cgrp->sibling);
 	cgroup_unlock_hierarchy(cgrp->root);
 
-	spin_lock(&cgrp->dentry->d_lock);
 	d = dget(cgrp->dentry);
-	spin_unlock(&d->d_lock);
 
 	cgroup_d_remove_dir(d);
 	dput(d);
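Note on the locking pattern in the cgroup hunks above: with dcache_lock gone, the code takes the parent dentry's d_lock before the child's and annotates the child acquisition with DENTRY_D_LOCK_NESTED so lockdep accepts two locks of the same class being held. A minimal sketch of that ordering, using a hypothetical helper name (only d_lock and DENTRY_D_LOCK_NESTED come from the patch):

static void detach_child_locked(struct dentry *parent, struct dentry *child)
{
	/* parent first, then child with the nesting annotation */
	spin_lock(&parent->d_lock);
	spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
	list_del_init(&child->d_u.d_child);	/* unhang child from its parent */
	spin_unlock(&child->d_lock);
	spin_unlock(&parent->d_lock);
}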
diff --git a/kernel/cpu.c b/kernel/cpu.c
index f6e726f18491..156cc5556140 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -189,7 +189,6 @@ static inline void check_for_tasks(int cpu)
 }
 
 struct take_cpu_down_param {
-	struct task_struct *caller;
 	unsigned long mod;
 	void *hcpu;
 };
@@ -198,7 +197,6 @@ static int __ref take_cpu_down(void *_param)
 static int __ref take_cpu_down(void *_param)
 {
 	struct take_cpu_down_param *param = _param;
-	unsigned int cpu = (unsigned long)param->hcpu;
 	int err;
 
 	/* Ensure this CPU doesn't handle any more interrupts. */
@@ -208,11 +206,6 @@ static int __ref take_cpu_down(void *_param)
 
 	cpu_notify(CPU_DYING | param->mod, param->hcpu);
 
-	if (task_cpu(param->caller) == cpu)
-		move_task_off_dead_cpu(cpu, param->caller);
-	/* Force idle task to run as soon as we yield: it should
-	   immediately notice cpu is offline and die quickly. */
-	sched_idle_next();
 	return 0;
 }
 
@@ -223,7 +216,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 	void *hcpu = (void *)(long)cpu;
 	unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
 	struct take_cpu_down_param tcd_param = {
-		.caller = current,
 		.mod = mod,
 		.hcpu = hcpu,
 	};
@@ -253,9 +245,15 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 	}
 	BUG_ON(cpu_online(cpu));
 
-	/* Wait for it to sleep (leaving idle task). */
+	/*
+	 * The migration_call() CPU_DYING callback will have removed all
+	 * runnable tasks from the cpu, there's only the idle task left now
+	 * that the migration thread is done doing the stop_machine thing.
+	 *
+	 * Wait for the stop thread to go away.
+	 */
 	while (!idle_cpu(cpu))
-		yield();
+		cpu_relax();
 
 	/* This actually kills the CPU. */
 	__cpu_die(cpu);
@@ -386,6 +384,14 @@ out:
 #ifdef CONFIG_PM_SLEEP_SMP
 static cpumask_var_t frozen_cpus;
 
+void __weak arch_disable_nonboot_cpus_begin(void)
+{
+}
+
+void __weak arch_disable_nonboot_cpus_end(void)
+{
+}
+
 int disable_nonboot_cpus(void)
 {
 	int cpu, first_cpu, error = 0;
@@ -397,6 +403,7 @@ int disable_nonboot_cpus(void)
 	 * with the userspace trying to use the CPU hotplug at the same time
 	 */
 	cpumask_clear(frozen_cpus);
+	arch_disable_nonboot_cpus_begin();
 
 	printk("Disabling non-boot CPUs ...\n");
 	for_each_online_cpu(cpu) {
@@ -412,6 +419,8 @@ int disable_nonboot_cpus(void)
 		}
 	}
 
+	arch_disable_nonboot_cpus_end();
+
 	if (!error) {
 		BUG_ON(num_online_cpus() > 1);
 		/* Make sure the CPUs won't be enabled by someone else */
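Note on the new __weak hooks above: an architecture can bracket the CPU-offlining loop simply by providing non-weak definitions of the same functions; the linker then prefers them over the empty defaults added to kernel/cpu.c. A hedged sketch of such an override (the bodies are placeholders, only the prototypes come from the patch):

void arch_disable_nonboot_cpus_begin(void)
{
	/* arch-specific preparation before non-boot CPUs are taken down */
}

void arch_disable_nonboot_cpus_end(void)
{
	/* arch-specific cleanup once only the boot CPU remains online */
}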
diff --git a/kernel/cred.c b/kernel/cred.c
index 6a1aa004e376..3a9d6dd53a6c 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -252,13 +252,13 @@ struct cred *cred_alloc_blank(void)
 #endif
 
 	atomic_set(&new->usage, 1);
+#ifdef CONFIG_DEBUG_CREDENTIALS
+	new->magic = CRED_MAGIC;
+#endif
 
 	if (security_cred_alloc_blank(new, GFP_KERNEL) < 0)
 		goto error;
 
-#ifdef CONFIG_DEBUG_CREDENTIALS
-	new->magic = CRED_MAGIC;
-#endif
 	return new;
 
 error:
@@ -657,6 +657,8 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
 	validate_creds(old);
 
 	*new = *old;
+	atomic_set(&new->usage, 1);
+	set_cred_subscribers(new, 0);
 	get_uid(new->user);
 	get_group_info(new->group_info);
 
@@ -674,8 +676,6 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
 	if (security_prepare_creds(new, old, GFP_KERNEL) < 0)
 		goto error;
 
-	atomic_set(&new->usage, 1);
-	set_cred_subscribers(new, 0);
 	put_cred(old);
 	validate_creds(new);
 	return new;
@@ -748,7 +748,11 @@ bool creds_are_invalid(const struct cred *cred)
 	if (cred->magic != CRED_MAGIC)
 		return true;
 #ifdef CONFIG_SECURITY_SELINUX
-	if (selinux_is_enabled()) {
+	/*
+	 * cred->security == NULL if security_cred_alloc_blank() or
+	 * security_prepare_creds() returned an error.
+	 */
+	if (selinux_is_enabled() && cred->security) {
 		if ((unsigned long) cred->security < PAGE_SIZE)
 			return true;
 		if ((*(u32 *)cred->security & 0xffffff00) ==
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index a6e729766821..bd3e8e29caa3 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -2914,7 +2914,7 @@ static void __init kdb_cmd_init(void)
 	}
 }
 
-/* Intialize kdb_printf, breakpoint tables and kdb state */
+/* Initialize kdb_printf, breakpoint tables and kdb state */
 void __init kdb_init(int lvl)
 {
 	static int kdb_init_lvl = KDB_NOT_INITIALIZED;
diff --git a/kernel/exit.c b/kernel/exit.c
index 676149a4ac5f..f9a45ebcc7b1 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -69,7 +69,7 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
 
 		list_del_rcu(&p->tasks);
 		list_del_init(&p->sibling);
-		__get_cpu_var(process_counts)--;
+		__this_cpu_dec(process_counts);
 	}
 	list_del_rcu(&p->thread_group);
 }
@@ -994,6 +994,15 @@ NORET_TYPE void do_exit(long code)
 	exit_fs(tsk);
 	check_stack_usage();
 	exit_thread();
+
+	/*
+	 * Flush inherited counters to the parent - before the parent
+	 * gets woken up by child-exit notifications.
+	 *
+	 * because of cgroup mode, must be called before cgroup_exit()
+	 */
+	perf_event_exit_task(tsk);
+
 	cgroup_exit(tsk, 1);
 
 	if (group_dead)
@@ -1007,11 +1016,6 @@ NORET_TYPE void do_exit(long code)
 	 * FIXME: do that only when needed, using sched_exit tracepoint
 	 */
 	flush_ptrace_hw_breakpoint(tsk);
-	/*
-	 * Flush inherited counters to the parent - before the parent
-	 * gets woken up by child-exit notifications.
-	 */
-	perf_event_exit_task(tsk);
 
 	exit_notify(tsk, group_dead);
 #ifdef CONFIG_NUMA
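Note on the per-cpu conversion above (the matching increment is converted in kernel/fork.c below): __this_cpu_dec() updates the same DEFINE_PER_CPU counter as the old __get_cpu_var() form, but lets the architecture emit a single per-cpu read-modify-write instead of computing the per-cpu address first. A small sketch, with a hypothetical wrapper name:

DEFINE_PER_CPU(unsigned long, process_counts) = 0;

static inline void process_counts_dec(void)
{
	__this_cpu_dec(process_counts);		/* new form used by the patch */
	/* __get_cpu_var(process_counts)--;	   old form it replaces */
}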
diff --git a/kernel/fork.c b/kernel/fork.c
index 5447dc7defa9..25e429152ddc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -66,6 +66,7 @@
 #include <linux/posix-timers.h>
 #include <linux/user-return-notifier.h>
 #include <linux/oom.h>
+#include <linux/khugepaged.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -169,6 +170,7 @@ EXPORT_SYMBOL(free_task);
 static inline void free_signal_struct(struct signal_struct *sig)
 {
 	taskstats_tgid_free(sig);
+	sched_autogroup_exit(sig);
 	kmem_cache_free(signal_cachep, sig);
 }
 
@@ -329,6 +331,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 	retval = ksm_fork(mm, oldmm);
 	if (retval)
 		goto out;
+	retval = khugepaged_fork(mm, oldmm);
+	if (retval)
+		goto out;
 
 	prev = NULL;
 	for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
@@ -528,6 +533,9 @@ void __mmdrop(struct mm_struct *mm)
 	mm_free_pgd(mm);
 	destroy_context(mm);
 	mmu_notifier_mm_destroy(mm);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	VM_BUG_ON(mm->pmd_huge_pte);
+#endif
 	free_mm(mm);
 }
 EXPORT_SYMBOL_GPL(__mmdrop);
@@ -542,6 +550,7 @@ void mmput(struct mm_struct *mm)
 	if (atomic_dec_and_test(&mm->mm_users)) {
 		exit_aio(mm);
 		ksm_exit(mm);
+		khugepaged_exit(mm); /* must run before exit_mmap */
 		exit_mmap(mm);
 		set_mm_exe_file(mm, NULL);
 		if (!list_empty(&mm->mmlist)) {
@@ -668,6 +677,10 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
 	mm->token_priority = 0;
 	mm->last_interval = 0;
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	mm->pmd_huge_pte = NULL;
+#endif
+
 	if (!mm_init(mm, tsk))
 		goto fail_nomem;
 
@@ -905,9 +918,11 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	posix_cpu_timers_init_group(sig);
 
 	tty_audit_fork(sig);
+	sched_autogroup_fork(sig);
 
 	sig->oom_adj = current->signal->oom_adj;
 	sig->oom_score_adj = current->signal->oom_score_adj;
+	sig->oom_score_adj_min = current->signal->oom_score_adj_min;
 
 	mutex_init(&sig->cred_guard_mutex);
 
@@ -1283,7 +1298,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 			attach_pid(p, PIDTYPE_SID, task_session(current));
 			list_add_tail(&p->sibling, &p->real_parent->children);
 			list_add_tail_rcu(&p->tasks, &init_task.tasks);
-			__get_cpu_var(process_counts)++;
+			__this_cpu_inc(process_counts);
 		}
 		attach_pid(p, PIDTYPE_PID, pid);
 		nr_threads++;
@@ -1408,23 +1423,6 @@ long do_fork(unsigned long clone_flags,
 	}
 
 	/*
-	 * We hope to recycle these flags after 2.6.26
-	 */
-	if (unlikely(clone_flags & CLONE_STOPPED)) {
-		static int __read_mostly count = 100;
-
-		if (count > 0 && printk_ratelimit()) {
-			char comm[TASK_COMM_LEN];
-
-			count--;
-			printk(KERN_INFO "fork(): process `%s' used deprecated "
-					"clone flags 0x%lx\n",
-				get_task_comm(comm, current),
-				clone_flags & CLONE_STOPPED);
-		}
-	}
-
-	/*
 	 * When called from kernel_thread, don't do user tracing stuff.
 	 */
 	if (likely(user_mode(regs)))
@@ -1462,16 +1460,7 @@ long do_fork(unsigned long clone_flags,
 		 */
 		p->flags &= ~PF_STARTING;
 
-		if (unlikely(clone_flags & CLONE_STOPPED)) {
-			/*
-			 * We'll start up with an immediate SIGSTOP.
-			 */
-			sigaddset(&p->pending.signal, SIGSTOP);
-			set_tsk_thread_flag(p, TIF_SIGPENDING);
-			__set_task_state(p, TASK_STOPPED);
-		} else {
-			wake_up_new_task(p, clone_flags);
-		}
+		wake_up_new_task(p, clone_flags);
 
 		tracehook_report_clone_complete(trace, regs,
 						clone_flags, nr, p);
diff --git a/kernel/freezer.c b/kernel/freezer.c
index bd1d42b17cb2..66ecd2ead215 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -104,8 +104,13 @@ bool freeze_task(struct task_struct *p, bool sig_only)
 	}
 
 	if (should_send_signal(p)) {
-		if (!signal_pending(p))
-			fake_signal_wake_up(p);
+		fake_signal_wake_up(p);
+		/*
+		 * fake_signal_wake_up() goes through p's scheduler
+		 * lock and guarantees that TASK_STOPPED/TRACED ->
+		 * TASK_RUNNING transition can't race with task state
+		 * testing in try_to_freeze_tasks().
+		 */
 	} else if (sig_only) {
 		return false;
 	} else {
diff --git a/kernel/futex.c b/kernel/futex.c
index 40a8777a27d0..b766d28accd6 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -69,6 +69,14 @@ int __read_mostly futex_cmpxchg_enabled;
 #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
 
 /*
+ * Futex flags used to encode options to functions and preserve them across
+ * restarts.
+ */
+#define FLAGS_SHARED		0x01
+#define FLAGS_CLOCKRT		0x02
+#define FLAGS_HAS_TIMEOUT	0x04
+
+/*
  * Priority Inheritance state:
  */
 struct futex_pi_state {
@@ -123,6 +131,12 @@ struct futex_q {
 	u32 bitset;
 };
 
+static const struct futex_q futex_q_init = {
+	/* list gets initialized in queue_me()*/
+	.key = FUTEX_KEY_INIT,
+	.bitset = FUTEX_BITSET_MATCH_ANY
+};
+
 /*
  * Hash buckets are shared by all the futex_keys that hash to the same
 * location. Each key may have multiple futex_q structures, one for each task
@@ -219,7 +233,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
 {
 	unsigned long address = (unsigned long)uaddr;
 	struct mm_struct *mm = current->mm;
-	struct page *page;
+	struct page *page, *page_head;
 	int err;
 
 	/*
@@ -251,11 +265,46 @@ again:
 	if (err < 0)
 		return err;
 
-	page = compound_head(page);
-	lock_page(page);
-	if (!page->mapping) {
-		unlock_page(page);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	page_head = page;
+	if (unlikely(PageTail(page))) {
 		put_page(page);
+		/* serialize against __split_huge_page_splitting() */
+		local_irq_disable();
+		if (likely(__get_user_pages_fast(address, 1, 1, &page) == 1)) {
+			page_head = compound_head(page);
+			/*
+			 * page_head is valid pointer but we must pin
+			 * it before taking the PG_lock and/or
+			 * PG_compound_lock. The moment we re-enable
+			 * irqs __split_huge_page_splitting() can
+			 * return and the head page can be freed from
+			 * under us. We can't take the PG_lock and/or
+			 * PG_compound_lock on a page that could be
+			 * freed from under us.
+			 */
+			if (page != page_head) {
+				get_page(page_head);
+				put_page(page);
+			}
+			local_irq_enable();
+		} else {
+			local_irq_enable();
+			goto again;
+		}
+	}
+#else
+	page_head = compound_head(page);
+	if (page != page_head) {
+		get_page(page_head);
+		put_page(page);
+	}
+#endif
+
+	lock_page(page_head);
+	if (!page_head->mapping) {
+		unlock_page(page_head);
+		put_page(page_head);
 		goto again;
 	}
 
@@ -266,25 +315,24 @@ again:
 	 * it's a read-only handle, it's expected that futexes attach to
 	 * the object not the particular process.
 	 */
-	if (PageAnon(page)) {
+	if (PageAnon(page_head)) {
 		key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
 		key->private.mm = mm;
 		key->private.address = address;
 	} else {
 		key->both.offset |= FUT_OFF_INODE; /* inode-based key */
-		key->shared.inode = page->mapping->host;
-		key->shared.pgoff = page->index;
+		key->shared.inode = page_head->mapping->host;
+		key->shared.pgoff = page_head->index;
 	}
 
 	get_futex_key_refs(key);
 
-	unlock_page(page);
-	put_page(page);
+	unlock_page(page_head);
+	put_page(page_head);
 	return 0;
 }
 
-static inline
-void put_futex_key(int fshared, union futex_key *key)
+static inline void put_futex_key(union futex_key *key)
 {
 	drop_futex_key_refs(key);
 }
@@ -778,10 +826,9 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
 	new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
 
 	/*
-	 * This happens when we have stolen the lock and the original
-	 * pending owner did not enqueue itself back on the rt_mutex.
-	 * Thats not a tragedy. We know that way, that a lock waiter
-	 * is on the fly. We make the futex_q waiter the pending owner.
+	 * It is possible that the next waiter (the one that brought
+	 * this owner to the kernel) timed out and is no longer
+	 * waiting on the lock.
 	 */
 	if (!new_owner)
 		new_owner = this->task;
@@ -870,7 +917,8 @@ double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
 /*
 * Wake up waiters matching bitset queued on this futex (uaddr).
 */
-static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
+static int
+futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
 {
 	struct futex_hash_bucket *hb;
 	struct futex_q *this, *next;
@@ -881,7 +929,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
 	if (!bitset)
 		return -EINVAL;
 
-	ret = get_futex_key(uaddr, fshared, &key);
+	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key);
 	if (unlikely(ret != 0))
 		goto out;
 
@@ -907,7 +955,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
 	}
 
 	spin_unlock(&hb->lock);
-	put_futex_key(fshared, &key);
+	put_futex_key(&key);
 out:
 	return ret;
 }
@@ -917,7 +965,7 @@ out:
 * to this virtual address:
 */
 static int
-futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
+futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
 	      int nr_wake, int nr_wake2, int op)
 {
 	union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
@@ -927,10 +975,10 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
 	int ret, op_ret;
 
 retry:
-	ret = get_futex_key(uaddr1, fshared, &key1);
+	ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1);
 	if (unlikely(ret != 0))
 		goto out;
-	ret = get_futex_key(uaddr2, fshared, &key2);
+	ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2);
 	if (unlikely(ret != 0))
 		goto out_put_key1;
 
@@ -962,11 +1010,11 @@ retry_private:
 		if (ret)
 			goto out_put_keys;
 
-		if (!fshared)
+		if (!(flags & FLAGS_SHARED))
 			goto retry_private;
 
-		put_futex_key(fshared, &key2);
-		put_futex_key(fshared, &key1);
+		put_futex_key(&key2);
+		put_futex_key(&key1);
 		goto retry;
 	}
 
@@ -996,9 +1044,9 @@ retry_private:
 
 	double_unlock_hb(hb1, hb2);
 out_put_keys:
-	put_futex_key(fshared, &key2);
+	put_futex_key(&key2);
 out_put_key1:
-	put_futex_key(fshared, &key1);
+	put_futex_key(&key1);
 out:
 	return ret;
 }
@@ -1133,13 +1181,13 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
 /**
 * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
 * @uaddr1:	source futex user address
- * @fshared:	0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
+ * @flags:	futex flags (FLAGS_SHARED, etc.)
 * @uaddr2:	target futex user address
 * @nr_wake:	number of waiters to wake (must be 1 for requeue_pi)
 * @nr_requeue:	number of waiters to requeue (0-INT_MAX)
 * @cmpval:	@uaddr1 expected value (or %NULL)
 * @requeue_pi:	if we are attempting to requeue from a non-pi futex to a
 *		pi futex (pi to pi requeue is not supported)
 *
 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
 * uaddr2 atomically on behalf of the top waiter.
@@ -1148,9 +1196,9 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
 * >=0 - on success, the number of tasks requeued or woken
 * <0 - on error
 */
-static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
-			 int nr_wake, int nr_requeue, u32 *cmpval,
-			 int requeue_pi)
+static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
+			 u32 __user *uaddr2, int nr_wake, int nr_requeue,
+			 u32 *cmpval, int requeue_pi)
 {
 	union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
 	int drop_count = 0, task_count = 0, ret;
@@ -1191,10 +1239,10 @@ retry:
 		pi_state = NULL;
 	}
 
-	ret = get_futex_key(uaddr1, fshared, &key1);
+	ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1);
 	if (unlikely(ret != 0))
 		goto out;
-	ret = get_futex_key(uaddr2, fshared, &key2);
+	ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2);
 	if (unlikely(ret != 0))
 		goto out_put_key1;
 
@@ -1216,11 +1264,11 @@ retry_private:
 		if (ret)
 			goto out_put_keys;
 
-		if (!fshared)
+		if (!(flags & FLAGS_SHARED))
 			goto retry_private;
 
-		put_futex_key(fshared, &key2);
-		put_futex_key(fshared, &key1);
+		put_futex_key(&key2);
+		put_futex_key(&key1);
 		goto retry;
 	}
 	if (curval != *cmpval) {
@@ -1260,8 +1308,8 @@ retry_private:
 			break;
 		case -EFAULT:
 			double_unlock_hb(hb1, hb2);
-			put_futex_key(fshared, &key2);
-			put_futex_key(fshared, &key1);
+			put_futex_key(&key2);
+			put_futex_key(&key1);
 			ret = fault_in_user_writeable(uaddr2);
 			if (!ret)
 				goto retry;
@@ -1269,8 +1317,8 @@ retry_private:
 		case -EAGAIN:
 			/* The owner was exiting, try again. */
 			double_unlock_hb(hb1, hb2);
-			put_futex_key(fshared, &key2);
-			put_futex_key(fshared, &key1);
+			put_futex_key(&key2);
+			put_futex_key(&key1);
 			cond_resched();
 			goto retry;
 		default:
@@ -1352,9 +1400,9 @@ out_unlock:
 	drop_futex_key_refs(&key1);
 
 out_put_keys:
-	put_futex_key(fshared, &key2);
+	put_futex_key(&key2);
 out_put_key1:
-	put_futex_key(fshared, &key1);
+	put_futex_key(&key1);
 out:
 	if (pi_state != NULL)
 		free_pi_state(pi_state);
@@ -1494,7 +1542,7 @@ static void unqueue_me_pi(struct futex_q *q)
 * private futexes.
 */
 static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
-				struct task_struct *newowner, int fshared)
+				struct task_struct *newowner)
 {
 	u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
 	struct futex_pi_state *pi_state = q->pi_state;
@@ -1587,20 +1635,11 @@ handle_fault:
 	goto retry;
 }
 
-/*
- * In case we must use restart_block to restart a futex_wait,
- * we encode in the 'flags' shared capability
- */
-#define FLAGS_SHARED		0x01
-#define FLAGS_CLOCKRT		0x02
-#define FLAGS_HAS_TIMEOUT	0x04
-
 static long futex_wait_restart(struct restart_block *restart);
 
 /**
 * fixup_owner() - Post lock pi_state and corner case management
 * @uaddr:	user address of the futex
- * @fshared:	whether the futex is shared (1) or not (0)
 * @q:		futex_q (contains pi_state and access to the rt_mutex)
 * @locked:	if the attempt to take the rt_mutex succeeded (1) or not (0)
 *
@@ -1613,8 +1652,7 @@ static long futex_wait_restart(struct restart_block *restart);
 *  0 - success, lock not taken
 * <0 - on error (-EFAULT)
 */
-static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q,
-		       int locked)
+static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
 {
 	struct task_struct *owner;
 	int ret = 0;
@@ -1625,7 +1663,7 @@ static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q,
 		 * did a lock-steal - fix up the PI-state in that case:
 		 */
 		if (q->pi_state->owner != current)
-			ret = fixup_pi_state_owner(uaddr, q, current, fshared);
+			ret = fixup_pi_state_owner(uaddr, q, current);
 		goto out;
 	}
 
@@ -1652,7 +1690,7 @@ static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q,
 		 * lock. Fix the state up.
 		 */
 		owner = rt_mutex_owner(&q->pi_state->pi_mutex);
-		ret = fixup_pi_state_owner(uaddr, q, owner, fshared);
+		ret = fixup_pi_state_owner(uaddr, q, owner);
 		goto out;
 	}
 
@@ -1715,7 +1753,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
 * futex_wait_setup() - Prepare to wait on a futex
 * @uaddr:	the futex userspace address
 * @val:	the expected value
- * @fshared:	whether the futex is shared (1) or not (0)
+ * @flags:	futex flags (FLAGS_SHARED, etc.)
 * @q:		the associated futex_q
 * @hb:		storage for hash_bucket pointer to be returned to caller
 *
@@ -1728,7 +1766,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
 *  0 - uaddr contains val and hb has been locked
 * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlcoked
 */
-static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared,
+static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
 			   struct futex_q *q, struct futex_hash_bucket **hb)
 {
 	u32 uval;
@@ -1752,8 +1790,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared,
 	 * rare, but normal.
 	 */
 retry:
-	q->key = FUTEX_KEY_INIT;
-	ret = get_futex_key(uaddr, fshared, &q->key);
+	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key);
 	if (unlikely(ret != 0))
 		return ret;
 
@@ -1769,10 +1806,10 @@ retry_private:
 		if (ret)
 			goto out;
 
-		if (!fshared)
+		if (!(flags & FLAGS_SHARED))
 			goto retry_private;
 
-		put_futex_key(fshared, &q->key);
+		put_futex_key(&q->key);
 		goto retry;
 	}
 
@@ -1783,32 +1820,29 @@ retry_private:
 
 out:
 	if (ret)
-		put_futex_key(fshared, &q->key);
+		put_futex_key(&q->key);
 	return ret;
 }
 
-static int futex_wait(u32 __user *uaddr, int fshared,
-		      u32 val, ktime_t *abs_time, u32 bitset, int clockrt)
+static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
+		      ktime_t *abs_time, u32 bitset)
 {
 	struct hrtimer_sleeper timeout, *to = NULL;
 	struct restart_block *restart;
 	struct futex_hash_bucket *hb;
-	struct futex_q q;
+	struct futex_q q = futex_q_init;
 	int ret;
 
 	if (!bitset)
 		return -EINVAL;
-
-	q.pi_state = NULL;
 	q.bitset = bitset;
-	q.rt_waiter = NULL;
-	q.requeue_pi_key = NULL;
 
 	if (abs_time) {
 		to = &timeout;
 
-		hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME :
-				      CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+		hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
+				      CLOCK_REALTIME : CLOCK_MONOTONIC,
+				      HRTIMER_MODE_ABS);
 		hrtimer_init_sleeper(to, current);
 		hrtimer_set_expires_range_ns(&to->timer, *abs_time,
 					     current->timer_slack_ns);
@@ -1819,7 +1853,7 @@ retry:
 	 * Prepare to wait on uaddr. On success, holds hb lock and increments
 	 * q.key refs.
 	 */
-	ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
+	ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
 	if (ret)
 		goto out;
 
@@ -1852,12 +1886,7 @@ retry:
 	restart->futex.val = val;
 	restart->futex.time = abs_time->tv64;
 	restart->futex.bitset = bitset;
-	restart->futex.flags = FLAGS_HAS_TIMEOUT;
-
-	if (fshared)
-		restart->futex.flags |= FLAGS_SHARED;
-	if (clockrt)
-		restart->futex.flags |= FLAGS_CLOCKRT;
+	restart->futex.flags = flags;
 
 	ret = -ERESTART_RESTARTBLOCK;
 
@@ -1873,7 +1902,6 @@ out:
 static long futex_wait_restart(struct restart_block *restart)
 {
 	u32 __user *uaddr = restart->futex.uaddr;
-	int fshared = 0;
 	ktime_t t, *tp = NULL;
 
 	if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
@@ -1881,11 +1909,9 @@ static long futex_wait_restart(struct restart_block *restart)
 		tp = &t;
 	}
 	restart->fn = do_no_restart_syscall;
-	if (restart->futex.flags & FLAGS_SHARED)
-		fshared = 1;
-	return (long)futex_wait(uaddr, fshared, restart->futex.val, tp,
-				restart->futex.bitset,
-				restart->futex.flags & FLAGS_CLOCKRT);
+
+	return (long)futex_wait(uaddr, restart->futex.flags,
+				restart->futex.val, tp, restart->futex.bitset);
 }
 
 
@@ -1895,12 +1921,12 @@ static long futex_wait_restart(struct restart_block *restart)
 * if there are waiters then it will block, it does PI, etc. (Due to
 * races the kernel might see a 0 value of the futex too.)
 */
-static int futex_lock_pi(u32 __user *uaddr, int fshared,
-			 int detect, ktime_t *time, int trylock)
+static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect,
+			 ktime_t *time, int trylock)
 {
 	struct hrtimer_sleeper timeout, *to = NULL;
 	struct futex_hash_bucket *hb;
-	struct futex_q q;
+	struct futex_q q = futex_q_init;
 	int res, ret;
 
 	if (refill_pi_state_cache())
@@ -1914,12 +1940,8 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
 		hrtimer_set_expires(&to->timer, *time);
 	}
 
-	q.pi_state = NULL;
-	q.rt_waiter = NULL;
-	q.requeue_pi_key = NULL;
 retry:
-	q.key = FUTEX_KEY_INIT;
-	ret = get_futex_key(uaddr, fshared, &q.key);
+	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key);
 	if (unlikely(ret != 0))
 		goto out;
 
@@ -1941,7 +1963,7 @@ retry_private:
 		 * exit to complete.
 		 */
 		queue_unlock(&q, hb);
-		put_futex_key(fshared, &q.key);
+		put_futex_key(&q.key);
 		cond_resched();
 		goto retry;
 	default:
@@ -1971,7 +1993,7 @@ retry_private:
 	 * Fixup the pi_state owner and possibly acquire the lock if we
 	 * haven't already.
 	 */
-	res = fixup_owner(uaddr, fshared, &q, !ret);
+	res = fixup_owner(uaddr, &q, !ret);
 	/*
 	 * If fixup_owner() returned an error, proprogate that. If it acquired
 	 * the lock, clear our -ETIMEDOUT or -EINTR.
@@ -1995,7 +2017,7 @@ out_unlock_put_key:
 	queue_unlock(&q, hb);
 
 out_put_key:
-	put_futex_key(fshared, &q.key);
+	put_futex_key(&q.key);
 out:
 	if (to)
 		destroy_hrtimer_on_stack(&to->timer);
@@ -2008,10 +2030,10 @@ uaddr_faulted:
 	if (ret)
 		goto out_put_key;
 
-	if (!fshared)
+	if (!(flags & FLAGS_SHARED))
 		goto retry_private;
 
-	put_futex_key(fshared, &q.key);
+	put_futex_key(&q.key);
 	goto retry;
 }
 
@@ -2020,7 +2042,7 @@ uaddr_faulted:
 * This is the in-kernel slowpath: we look up the PI state (if any),
 * and do the rt-mutex unlock.
 */
-static int futex_unlock_pi(u32 __user *uaddr, int fshared)
+static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
 {
 	struct futex_hash_bucket *hb;
 	struct futex_q *this, *next;
@@ -2038,7 +2060,7 @@ retry:
 	if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
 		return -EPERM;
 
-	ret = get_futex_key(uaddr, fshared, &key);
+	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key);
 	if (unlikely(ret != 0))
 		goto out;
 
@@ -2093,14 +2115,14 @@ retry:
 
 out_unlock:
 	spin_unlock(&hb->lock);
-	put_futex_key(fshared, &key);
+	put_futex_key(&key);
 
 out:
 	return ret;
 
 pi_faulted:
 	spin_unlock(&hb->lock);
-	put_futex_key(fshared, &key);
+	put_futex_key(&key);
 
 	ret = fault_in_user_writeable(uaddr);
 	if (!ret)
@@ -2160,7 +2182,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
 /**
 * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
 * @uaddr:	the futex we initially wait on (non-pi)
- * @fshared:	whether the futexes are shared (1) or not (0). They must be
+ * @flags:	futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be
 *		the same type, no requeueing from private to shared, etc.
 * @val:	the expected value of uaddr
 * @abs_time:	absolute timeout
@@ -2198,16 +2220,16 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
 *  0 - On success
 * <0 - On error
 */
-static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
+static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
 				 u32 val, ktime_t *abs_time, u32 bitset,
-				 int clockrt, u32 __user *uaddr2)
+				 u32 __user *uaddr2)
 {
 	struct hrtimer_sleeper timeout, *to = NULL;
 	struct rt_mutex_waiter rt_waiter;
 	struct rt_mutex *pi_mutex = NULL;
 	struct futex_hash_bucket *hb;
-	union futex_key key2;
-	struct futex_q q;
+	union futex_key key2 = FUTEX_KEY_INIT;
+	struct futex_q q = futex_q_init;
 	int res, ret;
 
 	if (!bitset)
@@ -2215,8 +2237,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2215 2237
2216 if (abs_time) { 2238 if (abs_time) {
2217 to = &timeout; 2239 to = &timeout;
2218 hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME : 2240 hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
2219 CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 2241 CLOCK_REALTIME : CLOCK_MONOTONIC,
2242 HRTIMER_MODE_ABS);
2220 hrtimer_init_sleeper(to, current); 2243 hrtimer_init_sleeper(to, current);
2221 hrtimer_set_expires_range_ns(&to->timer, *abs_time, 2244 hrtimer_set_expires_range_ns(&to->timer, *abs_time,
2222 current->timer_slack_ns); 2245 current->timer_slack_ns);
@@ -2229,12 +2252,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2229 debug_rt_mutex_init_waiter(&rt_waiter); 2252 debug_rt_mutex_init_waiter(&rt_waiter);
2230 rt_waiter.task = NULL; 2253 rt_waiter.task = NULL;
2231 2254
2232 key2 = FUTEX_KEY_INIT; 2255 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2);
2233 ret = get_futex_key(uaddr2, fshared, &key2);
2234 if (unlikely(ret != 0)) 2256 if (unlikely(ret != 0))
2235 goto out; 2257 goto out;
2236 2258
2237 q.pi_state = NULL;
2238 q.bitset = bitset; 2259 q.bitset = bitset;
2239 q.rt_waiter = &rt_waiter; 2260 q.rt_waiter = &rt_waiter;
2240 q.requeue_pi_key = &key2; 2261 q.requeue_pi_key = &key2;
@@ -2243,7 +2264,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2243 * Prepare to wait on uaddr. On success, increments q.key (key1) ref 2264 * Prepare to wait on uaddr. On success, increments q.key (key1) ref
2244 * count. 2265 * count.
2245 */ 2266 */
2246 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); 2267 ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
2247 if (ret) 2268 if (ret)
2248 goto out_key2; 2269 goto out_key2;
2249 2270
@@ -2273,8 +2294,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2273 */ 2294 */
2274 if (q.pi_state && (q.pi_state->owner != current)) { 2295 if (q.pi_state && (q.pi_state->owner != current)) {
2275 spin_lock(q.lock_ptr); 2296 spin_lock(q.lock_ptr);
2276 ret = fixup_pi_state_owner(uaddr2, &q, current, 2297 ret = fixup_pi_state_owner(uaddr2, &q, current);
2277 fshared);
2278 spin_unlock(q.lock_ptr); 2298 spin_unlock(q.lock_ptr);
2279 } 2299 }
2280 } else { 2300 } else {
@@ -2293,7 +2313,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2293 * Fixup the pi_state owner and possibly acquire the lock if we 2313 * Fixup the pi_state owner and possibly acquire the lock if we
2294 * haven't already. 2314 * haven't already.
2295 */ 2315 */
2296 res = fixup_owner(uaddr2, fshared, &q, !ret); 2316 res = fixup_owner(uaddr2, &q, !ret);
2297 /* 2317 /*
2298 * If fixup_owner() returned an error, propagate that. If it 2318 * If fixup_owner() returned an error, propagate that. If it
2299 * acquired the lock, clear -ETIMEDOUT or -EINTR. 2319 * acquired the lock, clear -ETIMEDOUT or -EINTR.
@@ -2324,9 +2344,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2324 } 2344 }
2325 2345
2326out_put_keys: 2346out_put_keys:
2327 put_futex_key(fshared, &q.key); 2347 put_futex_key(&q.key);
2328out_key2: 2348out_key2:
2329 put_futex_key(fshared, &key2); 2349 put_futex_key(&key2);
2330 2350
2331out: 2351out:
2332 if (to) { 2352 if (to) {
@@ -2551,58 +2571,57 @@ void exit_robust_list(struct task_struct *curr)
2551long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, 2571long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
2552 u32 __user *uaddr2, u32 val2, u32 val3) 2572 u32 __user *uaddr2, u32 val2, u32 val3)
2553{ 2573{
2554 int clockrt, ret = -ENOSYS; 2574 int ret = -ENOSYS, cmd = op & FUTEX_CMD_MASK;
2555 int cmd = op & FUTEX_CMD_MASK; 2575 unsigned int flags = 0;
2556 int fshared = 0;
2557 2576
2558 if (!(op & FUTEX_PRIVATE_FLAG)) 2577 if (!(op & FUTEX_PRIVATE_FLAG))
2559 fshared = 1; 2578 flags |= FLAGS_SHARED;
2560 2579
2561 clockrt = op & FUTEX_CLOCK_REALTIME; 2580 if (op & FUTEX_CLOCK_REALTIME) {
2562 if (clockrt && cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI) 2581 flags |= FLAGS_CLOCKRT;
2563 return -ENOSYS; 2582 if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
2583 return -ENOSYS;
2584 }
2564 2585
2565 switch (cmd) { 2586 switch (cmd) {
2566 case FUTEX_WAIT: 2587 case FUTEX_WAIT:
2567 val3 = FUTEX_BITSET_MATCH_ANY; 2588 val3 = FUTEX_BITSET_MATCH_ANY;
2568 case FUTEX_WAIT_BITSET: 2589 case FUTEX_WAIT_BITSET:
2569 ret = futex_wait(uaddr, fshared, val, timeout, val3, clockrt); 2590 ret = futex_wait(uaddr, flags, val, timeout, val3);
2570 break; 2591 break;
2571 case FUTEX_WAKE: 2592 case FUTEX_WAKE:
2572 val3 = FUTEX_BITSET_MATCH_ANY; 2593 val3 = FUTEX_BITSET_MATCH_ANY;
2573 case FUTEX_WAKE_BITSET: 2594 case FUTEX_WAKE_BITSET:
2574 ret = futex_wake(uaddr, fshared, val, val3); 2595 ret = futex_wake(uaddr, flags, val, val3);
2575 break; 2596 break;
2576 case FUTEX_REQUEUE: 2597 case FUTEX_REQUEUE:
2577 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 0); 2598 ret = futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
2578 break; 2599 break;
2579 case FUTEX_CMP_REQUEUE: 2600 case FUTEX_CMP_REQUEUE:
2580 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, 2601 ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
2581 0);
2582 break; 2602 break;
2583 case FUTEX_WAKE_OP: 2603 case FUTEX_WAKE_OP:
2584 ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3); 2604 ret = futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
2585 break; 2605 break;
2586 case FUTEX_LOCK_PI: 2606 case FUTEX_LOCK_PI:
2587 if (futex_cmpxchg_enabled) 2607 if (futex_cmpxchg_enabled)
2588 ret = futex_lock_pi(uaddr, fshared, val, timeout, 0); 2608 ret = futex_lock_pi(uaddr, flags, val, timeout, 0);
2589 break; 2609 break;
2590 case FUTEX_UNLOCK_PI: 2610 case FUTEX_UNLOCK_PI:
2591 if (futex_cmpxchg_enabled) 2611 if (futex_cmpxchg_enabled)
2592 ret = futex_unlock_pi(uaddr, fshared); 2612 ret = futex_unlock_pi(uaddr, flags);
2593 break; 2613 break;
2594 case FUTEX_TRYLOCK_PI: 2614 case FUTEX_TRYLOCK_PI:
2595 if (futex_cmpxchg_enabled) 2615 if (futex_cmpxchg_enabled)
2596 ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1); 2616 ret = futex_lock_pi(uaddr, flags, 0, timeout, 1);
2597 break; 2617 break;
2598 case FUTEX_WAIT_REQUEUE_PI: 2618 case FUTEX_WAIT_REQUEUE_PI:
2599 val3 = FUTEX_BITSET_MATCH_ANY; 2619 val3 = FUTEX_BITSET_MATCH_ANY;
2600 ret = futex_wait_requeue_pi(uaddr, fshared, val, timeout, val3, 2620 ret = futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
2601 clockrt, uaddr2); 2621 uaddr2);
2602 break; 2622 break;
2603 case FUTEX_CMP_REQUEUE_PI: 2623 case FUTEX_CMP_REQUEUE_PI:
2604 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, 2624 ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
2605 1);
2606 break; 2625 break;
2607 default: 2626 default:
2608 ret = -ENOSYS; 2627 ret = -ENOSYS;
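The do_futex() hunk above folds the old fshared and clockrt ints into a single flags word that is threaded through every futex operation. A minimal user-space sketch of that decoding step follows; the FUTEX_* and FLAGS_* constants are written out here for illustration rather than pulled from the kernel headers, so treat the exact values as assumptions.

#include <stdio.h>

/* illustrative constants; the authoritative values live in the futex headers */
#define FUTEX_PRIVATE_FLAG   128
#define FUTEX_CLOCK_REALTIME 256
#define FUTEX_CMD_MASK       (~(FUTEX_PRIVATE_FLAG | FUTEX_CLOCK_REALTIME))

#define FLAGS_SHARED  0x01
#define FLAGS_CLOCKRT 0x02

/* split an op word into a command and a flags bitmask, as do_futex() now does */
static unsigned int decode_futex_op(int op, int *cmd)
{
    unsigned int flags = 0;

    *cmd = op & FUTEX_CMD_MASK;
    if (!(op & FUTEX_PRIVATE_FLAG))
        flags |= FLAGS_SHARED;      /* shared unless explicitly private */
    if (op & FUTEX_CLOCK_REALTIME)
        flags |= FLAGS_CLOCKRT;     /* absolute timeout on CLOCK_REALTIME */
    return flags;
}

int main(void)
{
    int cmd;
    unsigned int flags = decode_futex_op(9 | FUTEX_CLOCK_REALTIME, &cmd);

    printf("cmd=%d shared=%d clockrt=%d\n", cmd,
           !!(flags & FLAGS_SHARED), !!(flags & FLAGS_CLOCKRT));
    return 0;
}

Passing one unsigned int instead of two separate ints is what lets the later hunks drop parameters from futex_wait(), futex_requeue() and friends.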
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 72206cf5c6cf..0c8d7c048615 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -497,7 +497,7 @@ static inline int hrtimer_is_hres_enabled(void)
497 */ 497 */
498static inline int hrtimer_hres_active(void) 498static inline int hrtimer_hres_active(void)
499{ 499{
500 return __get_cpu_var(hrtimer_bases).hres_active; 500 return __this_cpu_read(hrtimer_bases.hres_active);
501} 501}
502 502
503/* 503/*
@@ -516,10 +516,13 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
516 516
517 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { 517 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
518 struct hrtimer *timer; 518 struct hrtimer *timer;
519 struct timerqueue_node *next;
519 520
520 if (!base->first) 521 next = timerqueue_getnext(&base->active);
522 if (!next)
521 continue; 523 continue;
522 timer = rb_entry(base->first, struct hrtimer, node); 524 timer = container_of(next, struct hrtimer, node);
525
523 expires = ktime_sub(hrtimer_get_expires(timer), base->offset); 526 expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
524 /* 527 /*
525 * clock_was_set() has changed base->offset so the 528 * clock_was_set() has changed base->offset so the
@@ -840,48 +843,17 @@ EXPORT_SYMBOL_GPL(hrtimer_forward);
840static int enqueue_hrtimer(struct hrtimer *timer, 843static int enqueue_hrtimer(struct hrtimer *timer,
841 struct hrtimer_clock_base *base) 844 struct hrtimer_clock_base *base)
842{ 845{
843 struct rb_node **link = &base->active.rb_node;
844 struct rb_node *parent = NULL;
845 struct hrtimer *entry;
846 int leftmost = 1;
847
848 debug_activate(timer); 846 debug_activate(timer);
849 847
850 /* 848 timerqueue_add(&base->active, &timer->node);
851 * Find the right place in the rbtree:
852 */
853 while (*link) {
854 parent = *link;
855 entry = rb_entry(parent, struct hrtimer, node);
856 /*
857 * We dont care about collisions. Nodes with
858 * the same expiry time stay together.
859 */
860 if (hrtimer_get_expires_tv64(timer) <
861 hrtimer_get_expires_tv64(entry)) {
862 link = &(*link)->rb_left;
863 } else {
864 link = &(*link)->rb_right;
865 leftmost = 0;
866 }
867 }
868
869 /*
870 * Insert the timer to the rbtree and check whether it
871 * replaces the first pending timer
872 */
873 if (leftmost)
874 base->first = &timer->node;
875 849
876 rb_link_node(&timer->node, parent, link);
877 rb_insert_color(&timer->node, &base->active);
878 /* 850 /*
879 * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the 851 * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the
880 * state of a possibly running callback. 852 * state of a possibly running callback.
881 */ 853 */
882 timer->state |= HRTIMER_STATE_ENQUEUED; 854 timer->state |= HRTIMER_STATE_ENQUEUED;
883 855
884 return leftmost; 856 return (&timer->node == base->active.next);
885} 857}
886 858
887/* 859/*
@@ -901,12 +873,7 @@ static void __remove_hrtimer(struct hrtimer *timer,
901 if (!(timer->state & HRTIMER_STATE_ENQUEUED)) 873 if (!(timer->state & HRTIMER_STATE_ENQUEUED))
902 goto out; 874 goto out;
903 875
904 /* 876 if (&timer->node == timerqueue_getnext(&base->active)) {
905 * Remove the timer from the rbtree and replace the first
906 * entry pointer if necessary.
907 */
908 if (base->first == &timer->node) {
909 base->first = rb_next(&timer->node);
910#ifdef CONFIG_HIGH_RES_TIMERS 877#ifdef CONFIG_HIGH_RES_TIMERS
911 /* Reprogram the clock event device. if enabled */ 878 /* Reprogram the clock event device. if enabled */
912 if (reprogram && hrtimer_hres_active()) { 879 if (reprogram && hrtimer_hres_active()) {
@@ -919,7 +886,7 @@ static void __remove_hrtimer(struct hrtimer *timer,
919 } 886 }
920#endif 887#endif
921 } 888 }
922 rb_erase(&timer->node, &base->active); 889 timerqueue_del(&base->active, &timer->node);
923out: 890out:
924 timer->state = newstate; 891 timer->state = newstate;
925} 892}
@@ -1128,11 +1095,13 @@ ktime_t hrtimer_get_next_event(void)
1128 if (!hrtimer_hres_active()) { 1095 if (!hrtimer_hres_active()) {
1129 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { 1096 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
1130 struct hrtimer *timer; 1097 struct hrtimer *timer;
1098 struct timerqueue_node *next;
1131 1099
1132 if (!base->first) 1100 next = timerqueue_getnext(&base->active);
1101 if (!next)
1133 continue; 1102 continue;
1134 1103
1135 timer = rb_entry(base->first, struct hrtimer, node); 1104 timer = container_of(next, struct hrtimer, node);
1136 delta.tv64 = hrtimer_get_expires_tv64(timer); 1105 delta.tv64 = hrtimer_get_expires_tv64(timer);
1137 delta = ktime_sub(delta, base->get_time()); 1106 delta = ktime_sub(delta, base->get_time());
1138 if (delta.tv64 < mindelta.tv64) 1107 if (delta.tv64 < mindelta.tv64)
@@ -1162,6 +1131,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1162 1131
1163 timer->base = &cpu_base->clock_base[clock_id]; 1132 timer->base = &cpu_base->clock_base[clock_id];
1164 hrtimer_init_timer_hres(timer); 1133 hrtimer_init_timer_hres(timer);
1134 timerqueue_init(&timer->node);
1165 1135
1166#ifdef CONFIG_TIMER_STATS 1136#ifdef CONFIG_TIMER_STATS
1167 timer->start_site = NULL; 1137 timer->start_site = NULL;
@@ -1278,14 +1248,14 @@ retry:
1278 1248
1279 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { 1249 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1280 ktime_t basenow; 1250 ktime_t basenow;
1281 struct rb_node *node; 1251 struct timerqueue_node *node;
1282 1252
1283 basenow = ktime_add(now, base->offset); 1253 basenow = ktime_add(now, base->offset);
1284 1254
1285 while ((node = base->first)) { 1255 while ((node = timerqueue_getnext(&base->active))) {
1286 struct hrtimer *timer; 1256 struct hrtimer *timer;
1287 1257
1288 timer = rb_entry(node, struct hrtimer, node); 1258 timer = container_of(node, struct hrtimer, node);
1289 1259
1290 /* 1260 /*
1291 * The immediate goal for using the softexpires is 1261 * The immediate goal for using the softexpires is
@@ -1441,7 +1411,7 @@ void hrtimer_run_pending(void)
1441 */ 1411 */
1442void hrtimer_run_queues(void) 1412void hrtimer_run_queues(void)
1443{ 1413{
1444 struct rb_node *node; 1414 struct timerqueue_node *node;
1445 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); 1415 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1446 struct hrtimer_clock_base *base; 1416 struct hrtimer_clock_base *base;
1447 int index, gettime = 1; 1417 int index, gettime = 1;
@@ -1451,8 +1421,7 @@ void hrtimer_run_queues(void)
1451 1421
1452 for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) { 1422 for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) {
1453 base = &cpu_base->clock_base[index]; 1423 base = &cpu_base->clock_base[index];
1454 1424 if (!timerqueue_getnext(&base->active))
1455 if (!base->first)
1456 continue; 1425 continue;
1457 1426
1458 if (gettime) { 1427 if (gettime) {
@@ -1462,10 +1431,10 @@ void hrtimer_run_queues(void)
1462 1431
1463 raw_spin_lock(&cpu_base->lock); 1432 raw_spin_lock(&cpu_base->lock);
1464 1433
1465 while ((node = base->first)) { 1434 while ((node = timerqueue_getnext(&base->active))) {
1466 struct hrtimer *timer; 1435 struct hrtimer *timer;
1467 1436
1468 timer = rb_entry(node, struct hrtimer, node); 1437 timer = container_of(node, struct hrtimer, node);
1469 if (base->softirq_time.tv64 <= 1438 if (base->softirq_time.tv64 <=
1470 hrtimer_get_expires_tv64(timer)) 1439 hrtimer_get_expires_tv64(timer))
1471 break; 1440 break;
@@ -1630,8 +1599,10 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
1630 1599
1631 raw_spin_lock_init(&cpu_base->lock); 1600 raw_spin_lock_init(&cpu_base->lock);
1632 1601
1633 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) 1602 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1634 cpu_base->clock_base[i].cpu_base = cpu_base; 1603 cpu_base->clock_base[i].cpu_base = cpu_base;
1604 timerqueue_init_head(&cpu_base->clock_base[i].active);
1605 }
1635 1606
1636 hrtimer_init_hres(cpu_base); 1607 hrtimer_init_hres(cpu_base);
1637} 1608}
@@ -1642,10 +1613,10 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
1642 struct hrtimer_clock_base *new_base) 1613 struct hrtimer_clock_base *new_base)
1643{ 1614{
1644 struct hrtimer *timer; 1615 struct hrtimer *timer;
1645 struct rb_node *node; 1616 struct timerqueue_node *node;
1646 1617
1647 while ((node = rb_first(&old_base->active))) { 1618 while ((node = timerqueue_getnext(&old_base->active))) {
1648 timer = rb_entry(node, struct hrtimer, node); 1619 timer = container_of(node, struct hrtimer, node);
1649 BUG_ON(hrtimer_callback_running(timer)); 1620 BUG_ON(hrtimer_callback_running(timer));
1650 debug_deactivate(timer); 1621 debug_deactivate(timer);
1651 1622
@@ -1774,7 +1745,7 @@ schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta,
1774 } 1745 }
1775 1746
1776 /* 1747 /*
1777 * A NULL parameter means "inifinte" 1748 * A NULL parameter means "infinite"
1778 */ 1749 */
1779 if (!expires) { 1750 if (!expires) {
1780 schedule(); 1751 schedule();
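The hrtimer hunks replace the open-coded rbtree plus cached base->first pointer with the shared timerqueue helpers (timerqueue_add(), timerqueue_getnext(), timerqueue_del()). The sketch below is a rough user-space analogue of the idea, using a plain sorted list instead of an rbtree to stay short; the tq_* names are invented for the example. The point is that the queue always knows its earliest-expiring entry, so callers never walk the structure themselves.

#include <stdint.h>
#include <stdio.h>

struct tq_node {                       /* toy stand-in for struct timerqueue_node */
    uint64_t expires;
    struct tq_node *next;
};

struct tq_head {                       /* toy stand-in for struct timerqueue_head */
    struct tq_node *first;             /* earliest-expiring node, kept current */
};

static void tq_add(struct tq_head *h, struct tq_node *n)
{
    struct tq_node **p = &h->first;

    while (*p && (*p)->expires <= n->expires)  /* sorted insert, stable for ties */
        p = &(*p)->next;
    n->next = *p;
    *p = n;
}

static void tq_del(struct tq_head *h, struct tq_node *n)
{
    struct tq_node **p = &h->first;

    while (*p && *p != n)
        p = &(*p)->next;
    if (*p)
        *p = n->next;
}

static struct tq_node *tq_getnext(struct tq_head *h)
{
    return h->first;                   /* O(1) peek, like timerqueue_getnext() */
}

int main(void)
{
    struct tq_head head = { 0 };
    struct tq_node a = { 30, 0 }, b = { 10, 0 }, c = { 20, 0 };

    tq_add(&head, &a);
    tq_add(&head, &b);
    tq_add(&head, &c);
    printf("next: %llu\n", (unsigned long long)tq_getnext(&head)->expires); /* 10 */
    tq_del(&head, &b);
    printf("next: %llu\n", (unsigned long long)tq_getnext(&head)->expires); /* 20 */
    return 0;
}

The real timerqueue keeps an rbtree with a cached leftmost node, so insertion stays logarithmic; the simplification here is only in the container, not in the interface.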
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index e5325825aeb6..086adf25a55e 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -641,7 +641,7 @@ int __init init_hw_breakpoint(void)
641 641
642 constraints_initialized = 1; 642 constraints_initialized = 1;
643 643
644 perf_pmu_register(&perf_breakpoint); 644 perf_pmu_register(&perf_breakpoint, "breakpoint", PERF_TYPE_BREAKPOINT);
645 645
646 return register_die_notifier(&hw_breakpoint_exceptions_nb); 646 return register_die_notifier(&hw_breakpoint_exceptions_nb);
647 647
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 31d766bf5d2e..8e42fec7686d 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -9,9 +9,6 @@ menu "IRQ subsystem"
9config GENERIC_HARDIRQS 9config GENERIC_HARDIRQS
10 def_bool y 10 def_bool y
11 11
12config GENERIC_HARDIRQS_NO__DO_IRQ
13 def_bool y
14
15# Select this to disable the deprecated stuff 12# Select this to disable the deprecated stuff
16config GENERIC_HARDIRQS_NO_DEPRECATED 13config GENERIC_HARDIRQS_NO_DEPRECATED
17 def_bool n 14 def_bool n
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index e2347eb63306..3540a7190122 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -118,114 +118,3 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
118 118
119 return retval; 119 return retval;
120} 120}
121
122#ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ
123
124#ifdef CONFIG_ENABLE_WARN_DEPRECATED
125# warning __do_IRQ is deprecated. Please convert to proper flow handlers
126#endif
127
128/**
129 * __do_IRQ - original all in one highlevel IRQ handler
130 * @irq: the interrupt number
131 *
132 * __do_IRQ handles all normal device IRQ's (the special
133 * SMP cross-CPU interrupts have their own specific
134 * handlers).
135 *
136 * This is the original x86 implementation which is used for every
137 * interrupt type.
138 */
139unsigned int __do_IRQ(unsigned int irq)
140{
141 struct irq_desc *desc = irq_to_desc(irq);
142 struct irqaction *action;
143 unsigned int status;
144
145 kstat_incr_irqs_this_cpu(irq, desc);
146
147 if (CHECK_IRQ_PER_CPU(desc->status)) {
148 irqreturn_t action_ret;
149
150 /*
151 * No locking required for CPU-local interrupts:
152 */
153 if (desc->irq_data.chip->ack)
154 desc->irq_data.chip->ack(irq);
155 if (likely(!(desc->status & IRQ_DISABLED))) {
156 action_ret = handle_IRQ_event(irq, desc->action);
157 if (!noirqdebug)
158 note_interrupt(irq, desc, action_ret);
159 }
160 desc->irq_data.chip->end(irq);
161 return 1;
162 }
163
164 raw_spin_lock(&desc->lock);
165 if (desc->irq_data.chip->ack)
166 desc->irq_data.chip->ack(irq);
167 /*
168 * REPLAY is when Linux resends an IRQ that was dropped earlier
169 * WAITING is used by probe to mark irqs that are being tested
170 */
171 status = desc->status & ~(IRQ_REPLAY | IRQ_WAITING);
172 status |= IRQ_PENDING; /* we _want_ to handle it */
173
174 /*
175 * If the IRQ is disabled for whatever reason, we cannot
176 * use the action we have.
177 */
178 action = NULL;
179 if (likely(!(status & (IRQ_DISABLED | IRQ_INPROGRESS)))) {
180 action = desc->action;
181 status &= ~IRQ_PENDING; /* we commit to handling */
182 status |= IRQ_INPROGRESS; /* we are handling it */
183 }
184 desc->status = status;
185
186 /*
187 * If there is no IRQ handler or it was disabled, exit early.
188 * Since we set PENDING, if another processor is handling
189 * a different instance of this same irq, the other processor
190 * will take care of it.
191 */
192 if (unlikely(!action))
193 goto out;
194
195 /*
196 * Edge triggered interrupts need to remember
197 * pending events.
198 * This applies to any hw interrupts that allow a second
199 * instance of the same irq to arrive while we are in do_IRQ
200 * or in the handler. But the code here only handles the _second_
201 * instance of the irq, not the third or fourth. So it is mostly
202 * useful for irq hardware that does not mask cleanly in an
203 * SMP environment.
204 */
205 for (;;) {
206 irqreturn_t action_ret;
207
208 raw_spin_unlock(&desc->lock);
209
210 action_ret = handle_IRQ_event(irq, action);
211 if (!noirqdebug)
212 note_interrupt(irq, desc, action_ret);
213
214 raw_spin_lock(&desc->lock);
215 if (likely(!(desc->status & IRQ_PENDING)))
216 break;
217 desc->status &= ~IRQ_PENDING;
218 }
219 desc->status &= ~IRQ_INPROGRESS;
220
221out:
222 /*
223 * The ->end() handler has to deal with interrupts which got
224 * disabled while the handler was running.
225 */
226 desc->irq_data.chip->end(irq);
227 raw_spin_unlock(&desc->lock);
228
229 return 1;
230}
231#endif
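The __do_IRQ() code deleted above is the classic pending/in-progress hand-off: a CPU that finds the handler already running only sets IRQ_PENDING and leaves, while the CPU that owns IRQ_INPROGRESS keeps re-running the handler until PENDING stays clear. A small user-space sketch of that control flow follows, with a pthread mutex standing in for desc->lock and handle_event() as a placeholder handler; it illustrates the pattern, not the removed kernel code itself.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static bool pending, in_progress;

static void handle_event(void)
{
    printf("handling one batch of events\n");
}

/* called on every (simulated) interrupt arrival, possibly from several threads */
static void irq_entry(void)
{
    pthread_mutex_lock(&lock);
    pending = true;                    /* we _want_ this handled */
    if (in_progress) {                 /* another thread owns the handler */
        pthread_mutex_unlock(&lock);
        return;                        /* it will notice pending and re-run */
    }
    in_progress = true;
    do {
        pending = false;               /* commit to handling this round */
        pthread_mutex_unlock(&lock);

        handle_event();                /* run the handler without the lock */

        pthread_mutex_lock(&lock);
    } while (pending);                 /* a new arrival slipped in: go again */
    in_progress = false;
    pthread_mutex_unlock(&lock);
}

int main(void)
{
    irq_entry();
    return 0;
}

Build with -pthread. The deprecation warning in the removed code already pointed users at the proper flow handlers; with this hunk the fallback path is gone entirely.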
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 9988d03797f5..282f20230e67 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -72,6 +72,8 @@ static inline int desc_node(struct irq_desc *desc) { return 0; }
72 72
73static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) 73static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node)
74{ 74{
75 int cpu;
76
75 desc->irq_data.irq = irq; 77 desc->irq_data.irq = irq;
76 desc->irq_data.chip = &no_irq_chip; 78 desc->irq_data.chip = &no_irq_chip;
77 desc->irq_data.chip_data = NULL; 79 desc->irq_data.chip_data = NULL;
@@ -83,7 +85,8 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node)
83 desc->irq_count = 0; 85 desc->irq_count = 0;
84 desc->irqs_unhandled = 0; 86 desc->irqs_unhandled = 0;
85 desc->name = NULL; 87 desc->name = NULL;
86 memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs))); 88 for_each_possible_cpu(cpu)
89 *per_cpu_ptr(desc->kstat_irqs, cpu) = 0;
87 desc_smp_init(desc, node); 90 desc_smp_init(desc, node);
88} 91}
89 92
@@ -133,8 +136,7 @@ static struct irq_desc *alloc_desc(int irq, int node)
133 if (!desc) 136 if (!desc)
134 return NULL; 137 return NULL;
135 /* allocate based on nr_cpu_ids */ 138 /* allocate based on nr_cpu_ids */
136 desc->kstat_irqs = kzalloc_node(nr_cpu_ids * sizeof(*desc->kstat_irqs), 139 desc->kstat_irqs = alloc_percpu(unsigned int);
137 gfp, node);
138 if (!desc->kstat_irqs) 140 if (!desc->kstat_irqs)
139 goto err_desc; 141 goto err_desc;
140 142
@@ -149,7 +151,7 @@ static struct irq_desc *alloc_desc(int irq, int node)
149 return desc; 151 return desc;
150 152
151err_kstat: 153err_kstat:
152 kfree(desc->kstat_irqs); 154 free_percpu(desc->kstat_irqs);
153err_desc: 155err_desc:
154 kfree(desc); 156 kfree(desc);
155 return NULL; 157 return NULL;
@@ -166,7 +168,7 @@ static void free_desc(unsigned int irq)
166 mutex_unlock(&sparse_irq_lock); 168 mutex_unlock(&sparse_irq_lock);
167 169
168 free_masks(desc); 170 free_masks(desc);
169 kfree(desc->kstat_irqs); 171 free_percpu(desc->kstat_irqs);
170 kfree(desc); 172 kfree(desc);
171} 173}
172 174
@@ -234,7 +236,6 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
234 } 236 }
235}; 237};
236 238
237static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS];
238int __init early_irq_init(void) 239int __init early_irq_init(void)
239{ 240{
240 int count, i, node = first_online_node; 241 int count, i, node = first_online_node;
@@ -250,7 +251,8 @@ int __init early_irq_init(void)
250 for (i = 0; i < count; i++) { 251 for (i = 0; i < count; i++) {
251 desc[i].irq_data.irq = i; 252 desc[i].irq_data.irq = i;
252 desc[i].irq_data.chip = &no_irq_chip; 253 desc[i].irq_data.chip = &no_irq_chip;
253 desc[i].kstat_irqs = kstat_irqs_all[i]; 254 /* TODO : do this allocation on-demand ... */
255 desc[i].kstat_irqs = alloc_percpu(unsigned int);
254 alloc_masks(desc + i, GFP_KERNEL, node); 256 alloc_masks(desc + i, GFP_KERNEL, node);
255 desc_smp_init(desc + i, node); 257 desc_smp_init(desc + i, node);
256 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); 258 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
@@ -275,6 +277,22 @@ static void free_desc(unsigned int irq)
275 277
276static inline int alloc_descs(unsigned int start, unsigned int cnt, int node) 278static inline int alloc_descs(unsigned int start, unsigned int cnt, int node)
277{ 279{
280#if defined(CONFIG_KSTAT_IRQS_ONDEMAND)
281 struct irq_desc *desc;
282 unsigned int i;
283
284 for (i = 0; i < cnt; i++) {
285 desc = irq_to_desc(start + i);
286 if (desc && !desc->kstat_irqs) {
287 unsigned int __percpu *stats = alloc_percpu(unsigned int);
288
289 if (!stats)
290 return -1;
291 if (cmpxchg(&desc->kstat_irqs, NULL, stats) != NULL)
292 free_percpu(stats);
293 }
294 }
295#endif
278 return start; 296 return start;
279} 297}
280#endif /* !CONFIG_SPARSE_IRQ */ 298#endif /* !CONFIG_SPARSE_IRQ */
@@ -391,7 +409,9 @@ void dynamic_irq_cleanup(unsigned int irq)
391unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) 409unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
392{ 410{
393 struct irq_desc *desc = irq_to_desc(irq); 411 struct irq_desc *desc = irq_to_desc(irq);
394 return desc ? desc->kstat_irqs[cpu] : 0; 412
413 return desc && desc->kstat_irqs ?
414 *per_cpu_ptr(desc->kstat_irqs, cpu) : 0;
395} 415}
396 416
397#ifdef CONFIG_GENERIC_HARDIRQS 417#ifdef CONFIG_GENERIC_HARDIRQS
@@ -401,10 +421,10 @@ unsigned int kstat_irqs(unsigned int irq)
401 int cpu; 421 int cpu;
402 int sum = 0; 422 int sum = 0;
403 423
404 if (!desc) 424 if (!desc || !desc->kstat_irqs)
405 return 0; 425 return 0;
406 for_each_possible_cpu(cpu) 426 for_each_possible_cpu(cpu)
407 sum += desc->kstat_irqs[cpu]; 427 sum += *per_cpu_ptr(desc->kstat_irqs, cpu);
408 return sum; 428 return sum;
409} 429}
410#endif /* CONFIG_GENERIC_HARDIRQS */ 430#endif /* CONFIG_GENERIC_HARDIRQS */
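These irqdesc.c hunks move kstat_irqs from an nr_cpu_ids-sized array to a real per-CPU allocation (alloc_percpu()/per_cpu_ptr()), and the readers now both tolerate descriptors whose counters were never allocated and sum the per-CPU slots on demand. A minimal user-space analogue of that read path, with a calloc'ed array standing in for the per-CPU allocator and NR_CPUS as an assumed constant:

#include <stdio.h>
#include <stdlib.h>

#define NR_CPUS 4                      /* stand-in for the possible-CPU count */

struct desc_like {
    unsigned int *kstat_irqs;          /* one counter per CPU, may be NULL */
};

static unsigned int kstat_irqs_sum(const struct desc_like *desc)
{
    unsigned int sum = 0;
    int cpu;

    if (!desc || !desc->kstat_irqs)    /* counters are allocated on demand */
        return 0;
    for (cpu = 0; cpu < NR_CPUS; cpu++)
        sum += desc->kstat_irqs[cpu];  /* like the per_cpu_ptr() loop above */
    return sum;
}

int main(void)
{
    struct desc_like desc = { calloc(NR_CPUS, sizeof(unsigned int)) };

    desc.kstat_irqs[0] = 3;
    desc.kstat_irqs[2] = 5;
    printf("total interrupts: %u\n", kstat_irqs_sum(&desc));   /* 8 */
    free(desc.kstat_irqs);
    return 0;
}

The per-CPU layout keeps each counter on its owning CPU's cache lines; the cost is that any total has to be computed by walking the possible CPUs, exactly as kstat_irqs() does after this change.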
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 5f92acc5f952..0caa59f747dd 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -577,7 +577,9 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { }
577 */ 577 */
578static int irq_thread(void *data) 578static int irq_thread(void *data)
579{ 579{
580 struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, }; 580 static const struct sched_param param = {
581 .sched_priority = MAX_USER_RT_PRIO/2,
582 };
581 struct irqaction *action = data; 583 struct irqaction *action = data;
582 struct irq_desc *desc = irq_to_desc(action->irq); 584 struct irq_desc *desc = irq_to_desc(action->irq);
583 int wake, oneshot = desc->status & IRQ_ONESHOT; 585 int wake, oneshot = desc->status & IRQ_ONESHOT;
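The manage.c hunk only changes how the sched_param initializer is stored: as a static const object it is built once at compile time and lives in read-only data, instead of being re-created on every irq thread's stack. A tiny illustration of the same idiom outside the kernel (the structure name and priority value here are made up for the example):

#include <stdio.h>

struct sched_param_like {
    int sched_priority;
};

/* one compile-time instance in .rodata, shared by every caller */
static const struct sched_param_like irq_thread_param = {
    .sched_priority = 50,
};

int main(void)
{
    printf("irq thread priority: %d\n", irq_thread_param.sched_priority);
    return 0;
}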
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 1d2541940480..441fd629ff04 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -56,6 +56,7 @@ void move_masked_irq(int irq)
56void move_native_irq(int irq) 56void move_native_irq(int irq)
57{ 57{
58 struct irq_desc *desc = irq_to_desc(irq); 58 struct irq_desc *desc = irq_to_desc(irq);
59 bool masked;
59 60
60 if (likely(!(desc->status & IRQ_MOVE_PENDING))) 61 if (likely(!(desc->status & IRQ_MOVE_PENDING)))
61 return; 62 return;
@@ -63,8 +64,15 @@ void move_native_irq(int irq)
63 if (unlikely(desc->status & IRQ_DISABLED)) 64 if (unlikely(desc->status & IRQ_DISABLED))
64 return; 65 return;
65 66
66 desc->irq_data.chip->irq_mask(&desc->irq_data); 67 /*
68 * Be careful vs. already masked interrupts. If this is a
69 * threaded interrupt with ONESHOT set, we can end up with an
70 * interrupt storm.
71 */
72 masked = desc->status & IRQ_MASKED;
73 if (!masked)
74 desc->irq_data.chip->irq_mask(&desc->irq_data);
67 move_masked_irq(irq); 75 move_masked_irq(irq);
68 desc->irq_data.chip->irq_unmask(&desc->irq_data); 76 if (!masked)
77 desc->irq_data.chip->irq_unmask(&desc->irq_data);
69} 78}
70
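The move_native_irq() change is about not double-masking: if the interrupt is already masked (for example by a ONESHOT threaded handler, as the new comment says), the migration path must neither mask it again nor unmask it afterwards, or an interrupt storm can result. A compact sketch of that "only undo what you did yourself" rule, with a toy struct in place of struct irq_desc:

#include <stdbool.h>
#include <stdio.h>

struct fake_irq {
    bool masked;
    bool move_pending;
};

static void mask_irq(struct fake_irq *irq)    { irq->masked = true; }
static void unmask_irq(struct fake_irq *irq)  { irq->masked = false; }
static void move_masked(struct fake_irq *irq) { irq->move_pending = false; }

static void move_irq(struct fake_irq *irq)
{
    bool was_masked = irq->masked;     /* remember who masked it */

    if (!irq->move_pending)
        return;
    if (!was_masked)                   /* mask only if nobody else already did */
        mask_irq(irq);
    move_masked(irq);
    if (!was_masked)                   /* and unmask only what we masked */
        unmask_irq(irq);
}

int main(void)
{
    struct fake_irq irq = { .masked = true, .move_pending = true };

    move_irq(&irq);
    printf("still masked after move: %d\n", irq.masked);   /* 1 */
    return 0;
}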
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 90f881904bb1..c58fa7da8aef 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -77,21 +77,21 @@ void __weak arch_irq_work_raise(void)
77 */ 77 */
78static void __irq_work_queue(struct irq_work *entry) 78static void __irq_work_queue(struct irq_work *entry)
79{ 79{
80 struct irq_work **head, *next; 80 struct irq_work *next;
81 81
82 head = &get_cpu_var(irq_work_list); 82 preempt_disable();
83 83
84 do { 84 do {
85 next = *head; 85 next = __this_cpu_read(irq_work_list);
86 /* Can assign non-atomic because we keep the flags set. */ 86 /* Can assign non-atomic because we keep the flags set. */
87 entry->next = next_flags(next, IRQ_WORK_FLAGS); 87 entry->next = next_flags(next, IRQ_WORK_FLAGS);
88 } while (cmpxchg(head, next, entry) != next); 88 } while (this_cpu_cmpxchg(irq_work_list, next, entry) != next);
89 89
90 /* The list was empty, raise self-interrupt to start processing. */ 90 /* The list was empty, raise self-interrupt to start processing. */
91 if (!irq_work_next(entry)) 91 if (!irq_work_next(entry))
92 arch_irq_work_raise(); 92 arch_irq_work_raise();
93 93
94 put_cpu_var(irq_work_list); 94 preempt_enable();
95} 95}
96 96
97/* 97/*
@@ -120,16 +120,16 @@ EXPORT_SYMBOL_GPL(irq_work_queue);
120 */ 120 */
121void irq_work_run(void) 121void irq_work_run(void)
122{ 122{
123 struct irq_work *list, **head; 123 struct irq_work *list;
124 124
125 head = &__get_cpu_var(irq_work_list); 125 if (this_cpu_read(irq_work_list) == NULL)
126 if (*head == NULL)
127 return; 126 return;
128 127
129 BUG_ON(!in_irq()); 128 BUG_ON(!in_irq());
130 BUG_ON(!irqs_disabled()); 129 BUG_ON(!irqs_disabled());
131 130
132 list = xchg(head, NULL); 131 list = this_cpu_xchg(irq_work_list, NULL);
132
133 while (list != NULL) { 133 while (list != NULL) {
134 struct irq_work *entry = list; 134 struct irq_work *entry = list;
135 135
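Both __irq_work_queue() and irq_work_run() above implement a lock-free singly-linked stack: producers push with a compare-and-swap loop, and the consumer detaches the whole list with one atomic exchange before walking it privately. The diff additionally converts the per-CPU accesses to this_cpu_cmpxchg()/this_cpu_xchg(). A user-space sketch of the same structure using C11 atomics, with a single global list instead of a per-CPU one and without the flag bits the kernel packs into the next pointer:

#include <stdatomic.h>
#include <stdio.h>

struct work {
    void (*func)(struct work *);
    struct work *next;
};

static _Atomic(struct work *) work_list;

static void work_queue(struct work *w)
{
    struct work *head = atomic_load(&work_list);

    do {
        w->next = head;                /* link in front of the current head */
    } while (!atomic_compare_exchange_weak(&work_list, &head, w));
}

static void work_run(void)
{
    struct work *w = atomic_exchange(&work_list, NULL);   /* detach everything */

    while (w) {
        struct work *next = w->next;

        w->func(w);
        w = next;
    }
}

static void say_hi(struct work *w)
{
    (void)w;
    printf("deferred work ran\n");
}

int main(void)
{
    struct work w = { .func = say_hi };

    work_queue(&w);
    work_run();
    return 0;
}

In the kernel the list is per CPU and the push side runs with preemption disabled, which is why this_cpu_cmpxchg() suffices; a user-space version has to use a shared atomic instead.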
diff --git a/kernel/kexec.c b/kernel/kexec.c
index b55045bc7563..ec19b92c7ebd 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -163,7 +163,7 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
163 * just verifies it is an address we can use. 163 * just verifies it is an address we can use.
164 * 164 *
165 * Since the kernel does everything in page size chunks ensure 165 * Since the kernel does everything in page size chunks ensure
166 * the destination addreses are page aligned. Too many 166 * the destination addresses are page aligned. Too many
167 * special cases crop of when we don't do this. The most 167 * special cases crop of when we don't do this. The most
168 * insidious is getting overlapping destination addresses 168 * insidious is getting overlapping destination addresses
169 * simply because addresses are changed to page size 169 * simply because addresses are changed to page size
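The kexec.c hunk is only a spelling fix in the comment about destination addresses having to be page aligned. For reference, that alignment test is a single mask against the page size; the sketch below assumes 4 KiB pages, whereas portable code would query the size at runtime:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096UL               /* assumed; real code should not hard-code this */

static bool page_aligned(uint64_t addr)
{
    return (addr & (PAGE_SIZE - 1)) == 0;
}

int main(void)
{
    printf("%d %d\n", page_aligned(0x10000), page_aligned(0x10004));   /* 1 0 */
    return 0;
}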
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 9737a76e106f..77981813a1e7 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -317,12 +317,12 @@ void __kprobes free_optinsn_slot(kprobe_opcode_t * slot, int dirty)
317/* We have preemption disabled.. so it is safe to use __ versions */ 317/* We have preemption disabled.. so it is safe to use __ versions */
318static inline void set_kprobe_instance(struct kprobe *kp) 318static inline void set_kprobe_instance(struct kprobe *kp)
319{ 319{
320 __get_cpu_var(kprobe_instance) = kp; 320 __this_cpu_write(kprobe_instance, kp);
321} 321}
322 322
323static inline void reset_kprobe_instance(void) 323static inline void reset_kprobe_instance(void)
324{ 324{
325 __get_cpu_var(kprobe_instance) = NULL; 325 __this_cpu_write(kprobe_instance, NULL);
326} 326}
327 327
328/* 328/*
@@ -354,13 +354,20 @@ static inline int kprobe_aggrprobe(struct kprobe *p)
354 return p->pre_handler == aggr_pre_handler; 354 return p->pre_handler == aggr_pre_handler;
355} 355}
356 356
357/* Return true(!0) if the kprobe is unused */
358static inline int kprobe_unused(struct kprobe *p)
359{
360 return kprobe_aggrprobe(p) && kprobe_disabled(p) &&
361 list_empty(&p->list);
362}
363
357/* 364/*
358 * Keep all fields in the kprobe consistent 365 * Keep all fields in the kprobe consistent
359 */ 366 */
360static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p) 367static inline void copy_kprobe(struct kprobe *ap, struct kprobe *p)
361{ 368{
362 memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t)); 369 memcpy(&p->opcode, &ap->opcode, sizeof(kprobe_opcode_t));
363 memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn)); 370 memcpy(&p->ainsn, &ap->ainsn, sizeof(struct arch_specific_insn));
364} 371}
365 372
366#ifdef CONFIG_OPTPROBES 373#ifdef CONFIG_OPTPROBES
@@ -384,6 +391,17 @@ void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs)
384 } 391 }
385} 392}
386 393
394/* Free optimized instructions and optimized_kprobe */
395static __kprobes void free_aggr_kprobe(struct kprobe *p)
396{
397 struct optimized_kprobe *op;
398
399 op = container_of(p, struct optimized_kprobe, kp);
400 arch_remove_optimized_kprobe(op);
401 arch_remove_kprobe(p);
402 kfree(op);
403}
404
387/* Return true(!0) if the kprobe is ready for optimization. */ 405/* Return true(!0) if the kprobe is ready for optimization. */
388static inline int kprobe_optready(struct kprobe *p) 406static inline int kprobe_optready(struct kprobe *p)
389{ 407{
@@ -397,6 +415,33 @@ static inline int kprobe_optready(struct kprobe *p)
397 return 0; 415 return 0;
398} 416}
399 417
418/* Return true(!0) if the kprobe is disarmed. Note: p must be on hash list */
419static inline int kprobe_disarmed(struct kprobe *p)
420{
421 struct optimized_kprobe *op;
422
423 /* If kprobe is not aggr/opt probe, just return kprobe is disabled */
424 if (!kprobe_aggrprobe(p))
425 return kprobe_disabled(p);
426
427 op = container_of(p, struct optimized_kprobe, kp);
428
429 return kprobe_disabled(p) && list_empty(&op->list);
430}
431
432/* Return true(!0) if the probe is queued on (un)optimizing lists */
433static int __kprobes kprobe_queued(struct kprobe *p)
434{
435 struct optimized_kprobe *op;
436
437 if (kprobe_aggrprobe(p)) {
438 op = container_of(p, struct optimized_kprobe, kp);
439 if (!list_empty(&op->list))
440 return 1;
441 }
442 return 0;
443}
444
400/* 445/*
401 * Return an optimized kprobe whose optimizing code replaces 446 * Return an optimized kprobe whose optimizing code replaces
402 * instructions including addr (exclude breakpoint). 447 * instructions including addr (exclude breakpoint).
@@ -422,30 +467,23 @@ static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr)
422 467
423/* Optimization staging list, protected by kprobe_mutex */ 468/* Optimization staging list, protected by kprobe_mutex */
424static LIST_HEAD(optimizing_list); 469static LIST_HEAD(optimizing_list);
470static LIST_HEAD(unoptimizing_list);
425 471
426static void kprobe_optimizer(struct work_struct *work); 472static void kprobe_optimizer(struct work_struct *work);
427static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); 473static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
474static DECLARE_COMPLETION(optimizer_comp);
428#define OPTIMIZE_DELAY 5 475#define OPTIMIZE_DELAY 5
429 476
430/* Kprobe jump optimizer */ 477/*
431static __kprobes void kprobe_optimizer(struct work_struct *work) 478 * Optimize (replace a breakpoint with a jump) kprobes listed on
479 * optimizing_list.
480 */
481static __kprobes void do_optimize_kprobes(void)
432{ 482{
433 struct optimized_kprobe *op, *tmp; 483 /* Optimization is never done when disarmed */
434 484 if (kprobes_all_disarmed || !kprobes_allow_optimization ||
435 /* Lock modules while optimizing kprobes */ 485 list_empty(&optimizing_list))
436 mutex_lock(&module_mutex); 486 return;
437 mutex_lock(&kprobe_mutex);
438 if (kprobes_all_disarmed || !kprobes_allow_optimization)
439 goto end;
440
441 /*
442 * Wait for quiesence period to ensure all running interrupts
443 * are done. Because optprobe may modify multiple instructions
444 * there is a chance that Nth instruction is interrupted. In that
445 * case, running interrupt can return to 2nd-Nth byte of jump
446 * instruction. This wait is for avoiding it.
447 */
448 synchronize_sched();
449 487
450 /* 488 /*
451 * The optimization/unoptimization refers online_cpus via 489 * The optimization/unoptimization refers online_cpus via
@@ -459,17 +497,111 @@ static __kprobes void kprobe_optimizer(struct work_struct *work)
459 */ 497 */
460 get_online_cpus(); 498 get_online_cpus();
461 mutex_lock(&text_mutex); 499 mutex_lock(&text_mutex);
462 list_for_each_entry_safe(op, tmp, &optimizing_list, list) { 500 arch_optimize_kprobes(&optimizing_list);
463 WARN_ON(kprobe_disabled(&op->kp)); 501 mutex_unlock(&text_mutex);
464 if (arch_optimize_kprobe(op) < 0) 502 put_online_cpus();
465 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; 503}
466 list_del_init(&op->list); 504
505/*
506 * Unoptimize (replace a jump with a breakpoint and remove the breakpoint
507 * if need) kprobes listed on unoptimizing_list.
508 */
509static __kprobes void do_unoptimize_kprobes(struct list_head *free_list)
510{
511 struct optimized_kprobe *op, *tmp;
512
513 /* Unoptimization must be done anytime */
514 if (list_empty(&unoptimizing_list))
515 return;
516
517 /* Ditto to do_optimize_kprobes */
518 get_online_cpus();
519 mutex_lock(&text_mutex);
520 arch_unoptimize_kprobes(&unoptimizing_list, free_list);
521 /* Loop free_list for disarming */
522 list_for_each_entry_safe(op, tmp, free_list, list) {
523 /* Disarm probes if marked disabled */
524 if (kprobe_disabled(&op->kp))
525 arch_disarm_kprobe(&op->kp);
526 if (kprobe_unused(&op->kp)) {
527 /*
528 * Remove unused probes from hash list. After waiting
529 * for synchronization, these probes are reclaimed.
530 * (reclaiming is done by do_free_cleaned_kprobes.)
531 */
532 hlist_del_rcu(&op->kp.hlist);
533 } else
534 list_del_init(&op->list);
467 } 535 }
468 mutex_unlock(&text_mutex); 536 mutex_unlock(&text_mutex);
469 put_online_cpus(); 537 put_online_cpus();
470end: 538}
539
540/* Reclaim all kprobes on the free_list */
541static __kprobes void do_free_cleaned_kprobes(struct list_head *free_list)
542{
543 struct optimized_kprobe *op, *tmp;
544
545 list_for_each_entry_safe(op, tmp, free_list, list) {
546 BUG_ON(!kprobe_unused(&op->kp));
547 list_del_init(&op->list);
548 free_aggr_kprobe(&op->kp);
549 }
550}
551
552/* Start optimizer after OPTIMIZE_DELAY passed */
553static __kprobes void kick_kprobe_optimizer(void)
554{
555 if (!delayed_work_pending(&optimizing_work))
556 schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
557}
558
559/* Kprobe jump optimizer */
560static __kprobes void kprobe_optimizer(struct work_struct *work)
561{
562 LIST_HEAD(free_list);
563
564 /* Lock modules while optimizing kprobes */
565 mutex_lock(&module_mutex);
566 mutex_lock(&kprobe_mutex);
567
568 /*
569 * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed)
570 * kprobes before waiting for quiesence period.
571 */
572 do_unoptimize_kprobes(&free_list);
573
574 /*
575 * Step 2: Wait for quiescence period to ensure all running interrupts
576 * are done. Because optprobe may modify multiple instructions
577 * there is a chance that Nth instruction is interrupted. In that
578 * case, running interrupt can return to 2nd-Nth byte of jump
579 * instruction. This wait is for avoiding it.
580 */
581 synchronize_sched();
582
583 /* Step 3: Optimize kprobes after quiescence period */
584 do_optimize_kprobes();
585
586 /* Step 4: Free cleaned kprobes after quiescence period */
587 do_free_cleaned_kprobes(&free_list);
588
471 mutex_unlock(&kprobe_mutex); 589 mutex_unlock(&kprobe_mutex);
472 mutex_unlock(&module_mutex); 590 mutex_unlock(&module_mutex);
591
592 /* Step 5: Kick optimizer again if needed */
593 if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list))
594 kick_kprobe_optimizer();
595 else
596 /* Wake up all waiters */
597 complete_all(&optimizer_comp);
598}
599
600/* Wait for completing optimization and unoptimization */
601static __kprobes void wait_for_kprobe_optimizer(void)
602{
603 if (delayed_work_pending(&optimizing_work))
604 wait_for_completion(&optimizer_comp);
473} 605}
474 606
475/* Optimize kprobe if p is ready to be optimized */ 607/* Optimize kprobe if p is ready to be optimized */
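The rewritten kprobe_optimizer() above batches its work: step 1 drains the unoptimizing list, step 2 pays for a single synchronize_sched() quiescence wait, step 3 drains the optimizing list, step 4 frees the probes collected for reclaim, and step 5 re-kicks the delayed work if more requests arrived in the meantime. The sketch below reduces that shape to plain arrays and printf stubs; it is a single-threaded illustration of the batching idea, not of the real text-patching or RCU machinery.

#include <stdio.h>

#define MAX_ITEMS 16

/* pending queues, filled by callers between optimizer passes */
static int optimize_queue[MAX_ITEMS], n_optimize;
static int unoptimize_queue[MAX_ITEMS], n_unoptimize;

static void wait_for_quiescence(void)
{
    /* stands in for synchronize_sched(): one wait covers the whole batch */
    printf("waiting for in-flight handlers to finish\n");
}

static void optimizer_pass(void)
{
    int i;

    for (i = 0; i < n_unoptimize; i++)         /* step 1: undo queued jumps */
        printf("unoptimize probe %d\n", unoptimize_queue[i]);
    n_unoptimize = 0;

    wait_for_quiescence();                     /* step 2: one wait per batch */

    for (i = 0; i < n_optimize; i++)           /* step 3: install queued jumps */
        printf("optimize probe %d\n", optimize_queue[i]);
    n_optimize = 0;

    /* steps 4 and 5 in the kernel: free retired probes, re-kick if new work */
}

int main(void)
{
    unoptimize_queue[n_unoptimize++] = 7;
    optimize_queue[n_optimize++] = 3;
    optimizer_pass();
    if (n_optimize || n_unoptimize)            /* nothing new arrived here */
        optimizer_pass();
    return 0;
}

Batching matters because synchronize_sched() is expensive; collecting requests and paying for one quiescence wait per pass is what makes delayed unoptimization practical.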
@@ -495,42 +627,99 @@ static __kprobes void optimize_kprobe(struct kprobe *p)
495 /* Check if it is already optimized. */ 627 /* Check if it is already optimized. */
496 if (op->kp.flags & KPROBE_FLAG_OPTIMIZED) 628 if (op->kp.flags & KPROBE_FLAG_OPTIMIZED)
497 return; 629 return;
498
499 op->kp.flags |= KPROBE_FLAG_OPTIMIZED; 630 op->kp.flags |= KPROBE_FLAG_OPTIMIZED;
500 list_add(&op->list, &optimizing_list); 631
501 if (!delayed_work_pending(&optimizing_work)) 632 if (!list_empty(&op->list))
502 schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY); 633 /* This is under unoptimizing. Just dequeue the probe */
634 list_del_init(&op->list);
635 else {
636 list_add(&op->list, &optimizing_list);
637 kick_kprobe_optimizer();
638 }
639}
640
641/* Short cut to direct unoptimizing */
642static __kprobes void force_unoptimize_kprobe(struct optimized_kprobe *op)
643{
644 get_online_cpus();
645 arch_unoptimize_kprobe(op);
646 put_online_cpus();
647 if (kprobe_disabled(&op->kp))
648 arch_disarm_kprobe(&op->kp);
503} 649}
504 650
505/* Unoptimize a kprobe if p is optimized */ 651/* Unoptimize a kprobe if p is optimized */
506static __kprobes void unoptimize_kprobe(struct kprobe *p) 652static __kprobes void unoptimize_kprobe(struct kprobe *p, bool force)
507{ 653{
508 struct optimized_kprobe *op; 654 struct optimized_kprobe *op;
509 655
510 if ((p->flags & KPROBE_FLAG_OPTIMIZED) && kprobe_aggrprobe(p)) { 656 if (!kprobe_aggrprobe(p) || kprobe_disarmed(p))
511 op = container_of(p, struct optimized_kprobe, kp); 657 return; /* This is not an optprobe nor optimized */
512 if (!list_empty(&op->list)) 658
513 /* Dequeue from the optimization queue */ 659 op = container_of(p, struct optimized_kprobe, kp);
660 if (!kprobe_optimized(p)) {
661 /* Unoptimized or unoptimizing case */
662 if (force && !list_empty(&op->list)) {
663 /*
664 * Only if this is unoptimizing kprobe and forced,
665 * forcibly unoptimize it. (No need to unoptimize
666 * unoptimized kprobe again :)
667 */
514 list_del_init(&op->list); 668 list_del_init(&op->list);
515 else 669 force_unoptimize_kprobe(op);
516 /* Replace jump with break */ 670 }
517 arch_unoptimize_kprobe(op); 671 return;
518 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; 672 }
673
674 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
675 if (!list_empty(&op->list)) {
676 /* Dequeue from the optimization queue */
677 list_del_init(&op->list);
678 return;
679 }
680 /* Optimized kprobe case */
681 if (force)
682 /* Forcibly update the code: this is a special case */
683 force_unoptimize_kprobe(op);
684 else {
685 list_add(&op->list, &unoptimizing_list);
686 kick_kprobe_optimizer();
519 } 687 }
520} 688}
521 689
690/* Cancel unoptimizing for reusing */
691static void reuse_unused_kprobe(struct kprobe *ap)
692{
693 struct optimized_kprobe *op;
694
695 BUG_ON(!kprobe_unused(ap));
696 /*
697 * Unused kprobe MUST be on the way of delayed unoptimizing (means
698 * there is still a relative jump) and disabled.
699 */
700 op = container_of(ap, struct optimized_kprobe, kp);
701 if (unlikely(list_empty(&op->list)))
702 printk(KERN_WARNING "Warning: found a stray unused "
703 "aggrprobe@%p\n", ap->addr);
704 /* Enable the probe again */
705 ap->flags &= ~KPROBE_FLAG_DISABLED;
706 /* Optimize it again (remove from op->list) */
707 BUG_ON(!kprobe_optready(ap));
708 optimize_kprobe(ap);
709}
710
522/* Remove optimized instructions */ 711/* Remove optimized instructions */
523static void __kprobes kill_optimized_kprobe(struct kprobe *p) 712static void __kprobes kill_optimized_kprobe(struct kprobe *p)
524{ 713{
525 struct optimized_kprobe *op; 714 struct optimized_kprobe *op;
526 715
527 op = container_of(p, struct optimized_kprobe, kp); 716 op = container_of(p, struct optimized_kprobe, kp);
528 if (!list_empty(&op->list)) { 717 if (!list_empty(&op->list))
529 /* Dequeue from the optimization queue */ 718 /* Dequeue from the (un)optimization queue */
530 list_del_init(&op->list); 719 list_del_init(&op->list);
531 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; 720
532 } 721 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
533 /* Don't unoptimize, because the target code will be freed. */ 722 /* Don't touch the code, because it is already freed. */
534 arch_remove_optimized_kprobe(op); 723 arch_remove_optimized_kprobe(op);
535} 724}
536 725
@@ -543,16 +732,6 @@ static __kprobes void prepare_optimized_kprobe(struct kprobe *p)
543 arch_prepare_optimized_kprobe(op); 732 arch_prepare_optimized_kprobe(op);
544} 733}
545 734
546/* Free optimized instructions and optimized_kprobe */
547static __kprobes void free_aggr_kprobe(struct kprobe *p)
548{
549 struct optimized_kprobe *op;
550
551 op = container_of(p, struct optimized_kprobe, kp);
552 arch_remove_optimized_kprobe(op);
553 kfree(op);
554}
555
556/* Allocate new optimized_kprobe and try to prepare optimized instructions */ 735/* Allocate new optimized_kprobe and try to prepare optimized instructions */
557static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) 736static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
558{ 737{
@@ -587,7 +766,8 @@ static __kprobes void try_to_optimize_kprobe(struct kprobe *p)
587 op = container_of(ap, struct optimized_kprobe, kp); 766 op = container_of(ap, struct optimized_kprobe, kp);
588 if (!arch_prepared_optinsn(&op->optinsn)) { 767 if (!arch_prepared_optinsn(&op->optinsn)) {
589 /* If failed to setup optimizing, fallback to kprobe */ 768 /* If failed to setup optimizing, fallback to kprobe */
590 free_aggr_kprobe(ap); 769 arch_remove_optimized_kprobe(op);
770 kfree(op);
591 return; 771 return;
592 } 772 }
593 773
@@ -631,21 +811,16 @@ static void __kprobes unoptimize_all_kprobes(void)
631 return; 811 return;
632 812
633 kprobes_allow_optimization = false; 813 kprobes_allow_optimization = false;
634 printk(KERN_INFO "Kprobes globally unoptimized\n");
635 get_online_cpus(); /* For avoiding text_mutex deadlock */
636 mutex_lock(&text_mutex);
637 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 814 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
638 head = &kprobe_table[i]; 815 head = &kprobe_table[i];
639 hlist_for_each_entry_rcu(p, node, head, hlist) { 816 hlist_for_each_entry_rcu(p, node, head, hlist) {
640 if (!kprobe_disabled(p)) 817 if (!kprobe_disabled(p))
641 unoptimize_kprobe(p); 818 unoptimize_kprobe(p, false);
642 } 819 }
643 } 820 }
644 821 /* Wait for unoptimizing completion */
645 mutex_unlock(&text_mutex); 822 wait_for_kprobe_optimizer();
646 put_online_cpus(); 823 printk(KERN_INFO "Kprobes globally unoptimized\n");
647 /* Allow all currently running kprobes to complete */
648 synchronize_sched();
649} 824}
650 825
651int sysctl_kprobes_optimization; 826int sysctl_kprobes_optimization;
@@ -669,44 +844,60 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
669} 844}
670#endif /* CONFIG_SYSCTL */ 845#endif /* CONFIG_SYSCTL */
671 846
847/* Put a breakpoint for a probe. Must be called with text_mutex locked */
672static void __kprobes __arm_kprobe(struct kprobe *p) 848static void __kprobes __arm_kprobe(struct kprobe *p)
673{ 849{
674 struct kprobe *old_p; 850 struct kprobe *_p;
675 851
676 /* Check collision with other optimized kprobes */ 852 /* Check collision with other optimized kprobes */
677 old_p = get_optimized_kprobe((unsigned long)p->addr); 853 _p = get_optimized_kprobe((unsigned long)p->addr);
678 if (unlikely(old_p)) 854 if (unlikely(_p))
679 unoptimize_kprobe(old_p); /* Fallback to unoptimized kprobe */ 855 /* Fallback to unoptimized kprobe */
856 unoptimize_kprobe(_p, true);
680 857
681 arch_arm_kprobe(p); 858 arch_arm_kprobe(p);
682 optimize_kprobe(p); /* Try to optimize (add kprobe to a list) */ 859 optimize_kprobe(p); /* Try to optimize (add kprobe to a list) */
683} 860}
684 861
685static void __kprobes __disarm_kprobe(struct kprobe *p) 862/* Remove the breakpoint of a probe. Must be called with text_mutex locked */
863static void __kprobes __disarm_kprobe(struct kprobe *p, bool reopt)
686{ 864{
687 struct kprobe *old_p; 865 struct kprobe *_p;
688 866
689 unoptimize_kprobe(p); /* Try to unoptimize */ 867 unoptimize_kprobe(p, false); /* Try to unoptimize */
690 arch_disarm_kprobe(p);
691 868
692 /* If another kprobe was blocked, optimize it. */ 869 if (!kprobe_queued(p)) {
693 old_p = get_optimized_kprobe((unsigned long)p->addr); 870 arch_disarm_kprobe(p);
694 if (unlikely(old_p)) 871 /* If another kprobe was blocked, optimize it. */
695 optimize_kprobe(old_p); 872 _p = get_optimized_kprobe((unsigned long)p->addr);
873 if (unlikely(_p) && reopt)
874 optimize_kprobe(_p);
875 }
876 /* TODO: reoptimize others after unoptimized this probe */
696} 877}
697 878
698#else /* !CONFIG_OPTPROBES */ 879#else /* !CONFIG_OPTPROBES */
699 880
700#define optimize_kprobe(p) do {} while (0) 881#define optimize_kprobe(p) do {} while (0)
701#define unoptimize_kprobe(p) do {} while (0) 882#define unoptimize_kprobe(p, f) do {} while (0)
702#define kill_optimized_kprobe(p) do {} while (0) 883#define kill_optimized_kprobe(p) do {} while (0)
703#define prepare_optimized_kprobe(p) do {} while (0) 884#define prepare_optimized_kprobe(p) do {} while (0)
704#define try_to_optimize_kprobe(p) do {} while (0) 885#define try_to_optimize_kprobe(p) do {} while (0)
705#define __arm_kprobe(p) arch_arm_kprobe(p) 886#define __arm_kprobe(p) arch_arm_kprobe(p)
706#define __disarm_kprobe(p) arch_disarm_kprobe(p) 887#define __disarm_kprobe(p, o) arch_disarm_kprobe(p)
888#define kprobe_disarmed(p) kprobe_disabled(p)
889#define wait_for_kprobe_optimizer() do {} while (0)
890
891/* There should be no unused kprobes that can be reused without optimization */
892static void reuse_unused_kprobe(struct kprobe *ap)
893{
894 printk(KERN_ERR "Error: There should be no unused kprobe here.\n");
895 BUG_ON(kprobe_unused(ap));
896}
707 897
708static __kprobes void free_aggr_kprobe(struct kprobe *p) 898static __kprobes void free_aggr_kprobe(struct kprobe *p)
709{ 899{
900 arch_remove_kprobe(p);
710 kfree(p); 901 kfree(p);
711} 902}
712 903
@@ -732,11 +923,10 @@ static void __kprobes arm_kprobe(struct kprobe *kp)
732/* Disarm a kprobe with text_mutex */ 923/* Disarm a kprobe with text_mutex */
733static void __kprobes disarm_kprobe(struct kprobe *kp) 924static void __kprobes disarm_kprobe(struct kprobe *kp)
734{ 925{
735 get_online_cpus(); /* For avoiding text_mutex deadlock */ 926 /* Ditto */
736 mutex_lock(&text_mutex); 927 mutex_lock(&text_mutex);
737 __disarm_kprobe(kp); 928 __disarm_kprobe(kp, true);
738 mutex_unlock(&text_mutex); 929 mutex_unlock(&text_mutex);
739 put_online_cpus();
740} 930}
741 931
742/* 932/*
@@ -775,7 +965,7 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
775static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, 965static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
776 int trapnr) 966 int trapnr)
777{ 967{
778 struct kprobe *cur = __get_cpu_var(kprobe_instance); 968 struct kprobe *cur = __this_cpu_read(kprobe_instance);
779 969
780 /* 970 /*
781 * if we faulted "during" the execution of a user specified 971 * if we faulted "during" the execution of a user specified
@@ -790,7 +980,7 @@ static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
790 980
791static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs) 981static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
792{ 982{
793 struct kprobe *cur = __get_cpu_var(kprobe_instance); 983 struct kprobe *cur = __this_cpu_read(kprobe_instance);
794 int ret = 0; 984 int ret = 0;
795 985
796 if (cur && cur->break_handler) { 986 if (cur && cur->break_handler) {
@@ -942,7 +1132,7 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
942 BUG_ON(kprobe_gone(ap) || kprobe_gone(p)); 1132 BUG_ON(kprobe_gone(ap) || kprobe_gone(p));
943 1133
944 if (p->break_handler || p->post_handler) 1134 if (p->break_handler || p->post_handler)
945 unoptimize_kprobe(ap); /* Fall back to normal kprobe */ 1135 unoptimize_kprobe(ap, true); /* Fall back to normal kprobe */
946 1136
947 if (p->break_handler) { 1137 if (p->break_handler) {
948 if (ap->break_handler) 1138 if (ap->break_handler)
@@ -993,19 +1183,21 @@ static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
993 * This is the second or subsequent kprobe at the address - handle 1183 * This is the second or subsequent kprobe at the address - handle
994 * the intricacies 1184 * the intricacies
995 */ 1185 */
996static int __kprobes register_aggr_kprobe(struct kprobe *old_p, 1186static int __kprobes register_aggr_kprobe(struct kprobe *orig_p,
997 struct kprobe *p) 1187 struct kprobe *p)
998{ 1188{
999 int ret = 0; 1189 int ret = 0;
1000 struct kprobe *ap = old_p; 1190 struct kprobe *ap = orig_p;
1001 1191
1002 if (!kprobe_aggrprobe(old_p)) { 1192 if (!kprobe_aggrprobe(orig_p)) {
1003 /* If old_p is not an aggr_kprobe, create new aggr_kprobe. */ 1193 /* If orig_p is not an aggr_kprobe, create new aggr_kprobe. */
1004 ap = alloc_aggr_kprobe(old_p); 1194 ap = alloc_aggr_kprobe(orig_p);
1005 if (!ap) 1195 if (!ap)
1006 return -ENOMEM; 1196 return -ENOMEM;
1007 init_aggr_kprobe(ap, old_p); 1197 init_aggr_kprobe(ap, orig_p);
1008 } 1198 } else if (kprobe_unused(ap))
1199 /* This probe is going to die. Rescue it */
1200 reuse_unused_kprobe(ap);
1009 1201
1010 if (kprobe_gone(ap)) { 1202 if (kprobe_gone(ap)) {
1011 /* 1203 /*
@@ -1039,23 +1231,6 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
1039 return add_new_kprobe(ap, p); 1231 return add_new_kprobe(ap, p);
1040} 1232}
1041 1233
1042/* Try to disable aggr_kprobe, and return 1 if succeeded.*/
1043static int __kprobes try_to_disable_aggr_kprobe(struct kprobe *p)
1044{
1045 struct kprobe *kp;
1046
1047 list_for_each_entry_rcu(kp, &p->list, list) {
1048 if (!kprobe_disabled(kp))
1049 /*
1050 * There is an active probe on the list.
1051 * We can't disable aggr_kprobe.
1052 */
1053 return 0;
1054 }
1055 p->flags |= KPROBE_FLAG_DISABLED;
1056 return 1;
1057}
1058
1059static int __kprobes in_kprobes_functions(unsigned long addr) 1234static int __kprobes in_kprobes_functions(unsigned long addr)
1060{ 1235{
1061 struct kprobe_blackpoint *kb; 1236 struct kprobe_blackpoint *kb;
@@ -1098,34 +1273,33 @@ static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p)
1098/* Check passed kprobe is valid and return kprobe in kprobe_table. */ 1273/* Check passed kprobe is valid and return kprobe in kprobe_table. */
1099static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p) 1274static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p)
1100{ 1275{
1101 struct kprobe *old_p, *list_p; 1276 struct kprobe *ap, *list_p;
1102 1277
1103 old_p = get_kprobe(p->addr); 1278 ap = get_kprobe(p->addr);
1104 if (unlikely(!old_p)) 1279 if (unlikely(!ap))
1105 return NULL; 1280 return NULL;
1106 1281
1107 if (p != old_p) { 1282 if (p != ap) {
1108 list_for_each_entry_rcu(list_p, &old_p->list, list) 1283 list_for_each_entry_rcu(list_p, &ap->list, list)
1109 if (list_p == p) 1284 if (list_p == p)
1110 /* kprobe p is a valid probe */ 1285 /* kprobe p is a valid probe */
1111 goto valid; 1286 goto valid;
1112 return NULL; 1287 return NULL;
1113 } 1288 }
1114valid: 1289valid:
1115 return old_p; 1290 return ap;
1116} 1291}
1117 1292
1118/* Return error if the kprobe is being re-registered */ 1293/* Return error if the kprobe is being re-registered */
1119static inline int check_kprobe_rereg(struct kprobe *p) 1294static inline int check_kprobe_rereg(struct kprobe *p)
1120{ 1295{
1121 int ret = 0; 1296 int ret = 0;
1122 struct kprobe *old_p;
1123 1297
1124 mutex_lock(&kprobe_mutex); 1298 mutex_lock(&kprobe_mutex);
1125 old_p = __get_valid_kprobe(p); 1299 if (__get_valid_kprobe(p))
1126 if (old_p)
1127 ret = -EINVAL; 1300 ret = -EINVAL;
1128 mutex_unlock(&kprobe_mutex); 1301 mutex_unlock(&kprobe_mutex);
1302
1129 return ret; 1303 return ret;
1130} 1304}
1131 1305
@@ -1229,67 +1403,121 @@ fail_with_jump_label:
1229} 1403}
1230EXPORT_SYMBOL_GPL(register_kprobe); 1404EXPORT_SYMBOL_GPL(register_kprobe);
1231 1405
1406/* Check if all probes on the aggrprobe are disabled */
1407static int __kprobes aggr_kprobe_disabled(struct kprobe *ap)
1408{
1409 struct kprobe *kp;
1410
1411 list_for_each_entry_rcu(kp, &ap->list, list)
1412 if (!kprobe_disabled(kp))
1413 /*
1414 * There is an active probe on the list.
1415 * We can't disable this ap.
1416 */
1417 return 0;
1418
1419 return 1;
1420}
1421
1422/* Disable one kprobe: Make sure called under kprobe_mutex is locked */
1423static struct kprobe *__kprobes __disable_kprobe(struct kprobe *p)
1424{
1425 struct kprobe *orig_p;
1426
1427 /* Get an original kprobe for return */
1428 orig_p = __get_valid_kprobe(p);
1429 if (unlikely(orig_p == NULL))
1430 return NULL;
1431
1432 if (!kprobe_disabled(p)) {
1433 /* Disable probe if it is a child probe */
1434 if (p != orig_p)
1435 p->flags |= KPROBE_FLAG_DISABLED;
1436
1437 /* Try to disarm and disable this/parent probe */
1438 if (p == orig_p || aggr_kprobe_disabled(orig_p)) {
1439 disarm_kprobe(orig_p);
1440 orig_p->flags |= KPROBE_FLAG_DISABLED;
1441 }
1442 }
1443
1444 return orig_p;
1445}
1446
1232/* 1447/*
1233 * Unregister a kprobe without a scheduler synchronization. 1448 * Unregister a kprobe without a scheduler synchronization.
1234 */ 1449 */
1235static int __kprobes __unregister_kprobe_top(struct kprobe *p) 1450static int __kprobes __unregister_kprobe_top(struct kprobe *p)
1236{ 1451{
1237 struct kprobe *old_p, *list_p; 1452 struct kprobe *ap, *list_p;
1238 1453
1239 old_p = __get_valid_kprobe(p); 1454 /* Disable kprobe. This will disarm it if needed. */
1240 if (old_p == NULL) 1455 ap = __disable_kprobe(p);
1456 if (ap == NULL)
1241 return -EINVAL; 1457 return -EINVAL;
1242 1458
1243 if (old_p == p || 1459 if (ap == p)
1244 (kprobe_aggrprobe(old_p) &&
1245 list_is_singular(&old_p->list))) {
1246 /* 1460 /*
1247 * Only probe on the hash list. Disarm only if kprobes are 1461 * This probe is an independent(and non-optimized) kprobe
1248 * enabled and not gone - otherwise, the breakpoint would 1462 * (not an aggrprobe). Remove from the hash list.
1249 * already have been removed. We save on flushing icache.
1250 */ 1463 */
1251 if (!kprobes_all_disarmed && !kprobe_disabled(old_p)) 1464 goto disarmed;
1252 disarm_kprobe(old_p); 1465
1253 hlist_del_rcu(&old_p->hlist); 1466 /* Following process expects this probe is an aggrprobe */
1254 } else { 1467 WARN_ON(!kprobe_aggrprobe(ap));
1468
1469 if (list_is_singular(&ap->list) && kprobe_disarmed(ap))
1470 /*
1471 * !disarmed can happen if the probe is under delayed
1472 * unoptimizing.
1473 */
1474 goto disarmed;
1475 else {
1476 /* If disabling probe has special handlers, update aggrprobe */
1255 if (p->break_handler && !kprobe_gone(p)) 1477 if (p->break_handler && !kprobe_gone(p))
1256 old_p->break_handler = NULL; 1478 ap->break_handler = NULL;
1257 if (p->post_handler && !kprobe_gone(p)) { 1479 if (p->post_handler && !kprobe_gone(p)) {
1258 list_for_each_entry_rcu(list_p, &old_p->list, list) { 1480 list_for_each_entry_rcu(list_p, &ap->list, list) {
1259 if ((list_p != p) && (list_p->post_handler)) 1481 if ((list_p != p) && (list_p->post_handler))
1260 goto noclean; 1482 goto noclean;
1261 } 1483 }
1262 old_p->post_handler = NULL; 1484 ap->post_handler = NULL;
1263 } 1485 }
1264noclean: 1486noclean:
1487 /*
1488 * Remove from the aggrprobe: this path will do nothing in
1489 * __unregister_kprobe_bottom().
1490 */
1265 list_del_rcu(&p->list); 1491 list_del_rcu(&p->list);
1266 if (!kprobe_disabled(old_p)) { 1492 if (!kprobe_disabled(ap) && !kprobes_all_disarmed)
1267 try_to_disable_aggr_kprobe(old_p); 1493 /*
1268 if (!kprobes_all_disarmed) { 1494 * Try to optimize this probe again, because post
1269 if (kprobe_disabled(old_p)) 1495 * handler may have been changed.
1270 disarm_kprobe(old_p); 1496 */
1271 else 1497 optimize_kprobe(ap);
1272 /* Try to optimize this probe again */
1273 optimize_kprobe(old_p);
1274 }
1275 }
1276 } 1498 }
1277 return 0; 1499 return 0;
1500
1501disarmed:
1502 BUG_ON(!kprobe_disarmed(ap));
1503 hlist_del_rcu(&ap->hlist);
1504 return 0;
1278} 1505}
1279 1506
1280static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) 1507static void __kprobes __unregister_kprobe_bottom(struct kprobe *p)
1281{ 1508{
1282 struct kprobe *old_p; 1509 struct kprobe *ap;
1283 1510
1284 if (list_empty(&p->list)) 1511 if (list_empty(&p->list))
1512 /* This is an independent kprobe */
1285 arch_remove_kprobe(p); 1513 arch_remove_kprobe(p);
1286 else if (list_is_singular(&p->list)) { 1514 else if (list_is_singular(&p->list)) {
1287 /* "p" is the last child of an aggr_kprobe */ 1515 /* This is the last child of an aggrprobe */
1288 old_p = list_entry(p->list.next, struct kprobe, list); 1516 ap = list_entry(p->list.next, struct kprobe, list);
1289 list_del(&p->list); 1517 list_del(&p->list);
1290 arch_remove_kprobe(old_p); 1518 free_aggr_kprobe(ap);
1291 free_aggr_kprobe(old_p);
1292 } 1519 }
1520 /* Otherwise, do nothing. */
1293} 1521}
1294 1522
1295int __kprobes register_kprobes(struct kprobe **kps, int num) 1523int __kprobes register_kprobes(struct kprobe **kps, int num)
@@ -1607,29 +1835,13 @@ static void __kprobes kill_kprobe(struct kprobe *p)
1607int __kprobes disable_kprobe(struct kprobe *kp) 1835int __kprobes disable_kprobe(struct kprobe *kp)
1608{ 1836{
1609 int ret = 0; 1837 int ret = 0;
1610 struct kprobe *p;
1611 1838
1612 mutex_lock(&kprobe_mutex); 1839 mutex_lock(&kprobe_mutex);
1613 1840
1614 /* Check whether specified probe is valid. */ 1841 /* Disable this kprobe */
1615 p = __get_valid_kprobe(kp); 1842 if (__disable_kprobe(kp) == NULL)
1616 if (unlikely(p == NULL)) {
1617 ret = -EINVAL; 1843 ret = -EINVAL;
1618 goto out;
1619 }
1620 1844
1621 /* If the probe is already disabled (or gone), just return */
1622 if (kprobe_disabled(kp))
1623 goto out;
1624
1625 kp->flags |= KPROBE_FLAG_DISABLED;
1626 if (p != kp)
1627 /* When kp != p, p is always enabled. */
1628 try_to_disable_aggr_kprobe(p);
1629
1630 if (!kprobes_all_disarmed && kprobe_disabled(p))
1631 disarm_kprobe(p);
1632out:
1633 mutex_unlock(&kprobe_mutex); 1845 mutex_unlock(&kprobe_mutex);
1634 return ret; 1846 return ret;
1635} 1847}
@@ -1927,36 +2139,27 @@ static void __kprobes disarm_all_kprobes(void)
1927 mutex_lock(&kprobe_mutex); 2139 mutex_lock(&kprobe_mutex);
1928 2140
1929 /* If kprobes are already disarmed, just return */ 2141 /* If kprobes are already disarmed, just return */
1930 if (kprobes_all_disarmed) 2142 if (kprobes_all_disarmed) {
1931 goto already_disabled; 2143 mutex_unlock(&kprobe_mutex);
2144 return;
2145 }
1932 2146
1933 kprobes_all_disarmed = true; 2147 kprobes_all_disarmed = true;
1934 printk(KERN_INFO "Kprobes globally disabled\n"); 2148 printk(KERN_INFO "Kprobes globally disabled\n");
1935 2149
1936 /*
1937 * Here we call get_online_cpus() for avoiding text_mutex deadlock,
1938 * because disarming may also unoptimize kprobes.
1939 */
1940 get_online_cpus();
1941 mutex_lock(&text_mutex); 2150 mutex_lock(&text_mutex);
1942 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 2151 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
1943 head = &kprobe_table[i]; 2152 head = &kprobe_table[i];
1944 hlist_for_each_entry_rcu(p, node, head, hlist) { 2153 hlist_for_each_entry_rcu(p, node, head, hlist) {
1945 if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) 2154 if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p))
1946 __disarm_kprobe(p); 2155 __disarm_kprobe(p, false);
1947 } 2156 }
1948 } 2157 }
1949
1950 mutex_unlock(&text_mutex); 2158 mutex_unlock(&text_mutex);
1951 put_online_cpus();
1952 mutex_unlock(&kprobe_mutex); 2159 mutex_unlock(&kprobe_mutex);
1953 /* Allow all currently running kprobes to complete */
1954 synchronize_sched();
1955 return;
1956 2160
 1957already_disabled: 2161 /* Wait for the optimizer to finish disarming all kprobes */
1958 mutex_unlock(&kprobe_mutex); 2162 wait_for_kprobe_optimizer();
1959 return;
1960} 2163}
1961 2164
1962/* 2165/*
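
The hunks above rework how a single kprobe is disabled: __disable_kprobe() now does the disarm-when-all-children-are-disabled bookkeeping in one place, and __unregister_kprobe_top() reuses it before deciding whether the probe can be taken off the hash list. A minimal sketch of a module driving that path through the public API follows; the probed symbol name and the handler body are illustrative assumptions, not part of this patch.

/* Hypothetical demo module: register, disable, re-enable and unregister
 * one kprobe, exercising the __disable_kprobe()/__unregister_kprobe_top()
 * paths changed above. */
#include <linux/module.h>
#include <linux/kprobes.h>

static int demo_pre(struct kprobe *p, struct pt_regs *regs)
{
	pr_info("kprobe hit at %s\n", p->symbol_name);
	return 0;
}

static struct kprobe demo_kp = {
	.symbol_name	= "do_fork",	/* assumed example symbol */
	.pre_handler	= demo_pre,
};

static int __init kpdemo_init(void)
{
	int ret = register_kprobe(&demo_kp);

	if (ret)
		return ret;
	disable_kprobe(&demo_kp);	/* stays registered, gets disarmed */
	enable_kprobe(&demo_kp);	/* armed (and possibly optimized) again */
	return 0;
}

static void __exit kpdemo_exit(void)
{
	unregister_kprobe(&demo_kp);	/* __unregister_kprobe_top/_bottom */
}

module_init(kpdemo_init);
module_exit(kpdemo_exit);
MODULE_LICENSE("GPL");
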
diff --git a/kernel/kthread.c b/kernel/kthread.c
index ca61bbdd44b2..c55afba990a3 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -148,7 +148,7 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
148 wait_for_completion(&create.done); 148 wait_for_completion(&create.done);
149 149
150 if (!IS_ERR(create.result)) { 150 if (!IS_ERR(create.result)) {
151 struct sched_param param = { .sched_priority = 0 }; 151 static const struct sched_param param = { .sched_priority = 0 };
152 va_list args; 152 va_list args;
153 153
154 va_start(args, namefmt); 154 va_start(args, namefmt);
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 17110a4a4fc2..ee74b35e528d 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -241,24 +241,19 @@ static int lstats_show(struct seq_file *m, void *v)
241 seq_puts(m, "Latency Top version : v0.1\n"); 241 seq_puts(m, "Latency Top version : v0.1\n");
242 242
243 for (i = 0; i < MAXLR; i++) { 243 for (i = 0; i < MAXLR; i++) {
244 if (latency_record[i].backtrace[0]) { 244 struct latency_record *lr = &latency_record[i];
245
246 if (lr->backtrace[0]) {
245 int q; 247 int q;
246 seq_printf(m, "%i %lu %lu ", 248 seq_printf(m, "%i %lu %lu",
247 latency_record[i].count, 249 lr->count, lr->time, lr->max);
248 latency_record[i].time,
249 latency_record[i].max);
250 for (q = 0; q < LT_BACKTRACEDEPTH; q++) { 250 for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
251 char sym[KSYM_SYMBOL_LEN]; 251 unsigned long bt = lr->backtrace[q];
252 char *c; 252 if (!bt)
253 if (!latency_record[i].backtrace[q])
254 break; 253 break;
255 if (latency_record[i].backtrace[q] == ULONG_MAX) 254 if (bt == ULONG_MAX)
256 break; 255 break;
257 sprint_symbol(sym, latency_record[i].backtrace[q]); 256 seq_printf(m, " %ps", (void *)bt);
258 c = strchr(sym, '+');
259 if (c)
260 *c = 0;
261 seq_printf(m, "%s ", sym);
262 } 257 }
263 seq_printf(m, "\n"); 258 seq_printf(m, "\n");
264 } 259 }
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 42ba65dff7d9..0d2058da80f5 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2292,22 +2292,6 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark)
2292} 2292}
2293 2293
2294/* 2294/*
2295 * Debugging helper: via this flag we know that we are in
2296 * 'early bootup code', and will warn about any invalid irqs-on event:
2297 */
2298static int early_boot_irqs_enabled;
2299
2300void early_boot_irqs_off(void)
2301{
2302 early_boot_irqs_enabled = 0;
2303}
2304
2305void early_boot_irqs_on(void)
2306{
2307 early_boot_irqs_enabled = 1;
2308}
2309
2310/*
2311 * Hardirqs will be enabled: 2295 * Hardirqs will be enabled:
2312 */ 2296 */
2313void trace_hardirqs_on_caller(unsigned long ip) 2297void trace_hardirqs_on_caller(unsigned long ip)
@@ -2319,7 +2303,7 @@ void trace_hardirqs_on_caller(unsigned long ip)
2319 if (unlikely(!debug_locks || current->lockdep_recursion)) 2303 if (unlikely(!debug_locks || current->lockdep_recursion))
2320 return; 2304 return;
2321 2305
2322 if (DEBUG_LOCKS_WARN_ON(unlikely(!early_boot_irqs_enabled))) 2306 if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled)))
2323 return; 2307 return;
2324 2308
2325 if (unlikely(curr->hardirqs_enabled)) { 2309 if (unlikely(curr->hardirqs_enabled)) {
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 59b76c8ce9d7..1969d2fc4b36 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -494,7 +494,6 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
494 namelen += 2; 494 namelen += 2;
495 495
496 for (i = 0; i < LOCKSTAT_POINTS; i++) { 496 for (i = 0; i < LOCKSTAT_POINTS; i++) {
497 char sym[KSYM_SYMBOL_LEN];
498 char ip[32]; 497 char ip[32];
499 498
500 if (class->contention_point[i] == 0) 499 if (class->contention_point[i] == 0)
@@ -503,15 +502,13 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
503 if (!i) 502 if (!i)
504 seq_line(m, '-', 40-namelen, namelen); 503 seq_line(m, '-', 40-namelen, namelen);
505 504
506 sprint_symbol(sym, class->contention_point[i]);
507 snprintf(ip, sizeof(ip), "[<%p>]", 505 snprintf(ip, sizeof(ip), "[<%p>]",
508 (void *)class->contention_point[i]); 506 (void *)class->contention_point[i]);
509 seq_printf(m, "%40s %14lu %29s %s\n", name, 507 seq_printf(m, "%40s %14lu %29s %pS\n",
510 stats->contention_point[i], 508 name, stats->contention_point[i],
511 ip, sym); 509 ip, (void *)class->contention_point[i]);
512 } 510 }
513 for (i = 0; i < LOCKSTAT_POINTS; i++) { 511 for (i = 0; i < LOCKSTAT_POINTS; i++) {
514 char sym[KSYM_SYMBOL_LEN];
515 char ip[32]; 512 char ip[32];
516 513
517 if (class->contending_point[i] == 0) 514 if (class->contending_point[i] == 0)
@@ -520,12 +517,11 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
520 if (!i) 517 if (!i)
521 seq_line(m, '-', 40-namelen, namelen); 518 seq_line(m, '-', 40-namelen, namelen);
522 519
523 sprint_symbol(sym, class->contending_point[i]);
524 snprintf(ip, sizeof(ip), "[<%p>]", 520 snprintf(ip, sizeof(ip), "[<%p>]",
525 (void *)class->contending_point[i]); 521 (void *)class->contending_point[i]);
526 seq_printf(m, "%40s %14lu %29s %s\n", name, 522 seq_printf(m, "%40s %14lu %29s %pS\n",
527 stats->contending_point[i], 523 name, stats->contending_point[i],
528 ip, sym); 524 ip, (void *)class->contending_point[i]);
529 } 525 }
530 if (i) { 526 if (i) {
531 seq_puts(m, "\n"); 527 seq_puts(m, "\n");
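
Both conversions above drop the sprint_symbol() call and its KSYM_SYMBOL_LEN scratch buffer in favour of the vsprintf pointer extensions: %ps prints the bare symbol name (what latencytop's strchr(sym, '+') trick used to produce), while %pS prints symbol+offset/size. A minimal sketch, assuming a throwaway demo module:

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/sched.h>

static int __init symfmt_demo_init(void)
{
	/* No sprint_symbol() buffer needed any more. */
	pr_info("name only:   %ps\n", (void *)schedule);  /* "schedule"           */
	pr_info("with offset: %pS\n", (void *)schedule);  /* "schedule+0x0/0x..." */
	return 0;
}

static void __exit symfmt_demo_exit(void)
{
}

module_init(symfmt_demo_init);
module_exit(symfmt_demo_exit);
MODULE_LICENSE("GPL");
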
diff --git a/kernel/module.c b/kernel/module.c
index d190664f25ff..efa290ea94bf 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -56,6 +56,7 @@
56#include <linux/percpu.h> 56#include <linux/percpu.h>
57#include <linux/kmemleak.h> 57#include <linux/kmemleak.h>
58#include <linux/jump_label.h> 58#include <linux/jump_label.h>
59#include <linux/pfn.h>
59 60
60#define CREATE_TRACE_POINTS 61#define CREATE_TRACE_POINTS
61#include <trace/events/module.h> 62#include <trace/events/module.h>
@@ -70,6 +71,26 @@
70#define ARCH_SHF_SMALL 0 71#define ARCH_SHF_SMALL 0
71#endif 72#endif
72 73
74/*
75 * Modules' sections will be aligned on page boundaries
76 * to ensure complete separation of code and data, but
77 * only when CONFIG_DEBUG_SET_MODULE_RONX=y
78 */
79#ifdef CONFIG_DEBUG_SET_MODULE_RONX
80# define debug_align(X) ALIGN(X, PAGE_SIZE)
81#else
82# define debug_align(X) (X)
83#endif
84
85/*
86 * Given BASE and SIZE this macro calculates the number of pages the
 87 * memory region occupies
88 */
89#define MOD_NUMBER_OF_PAGES(BASE, SIZE) (((SIZE) > 0) ? \
90 (PFN_DOWN((unsigned long)(BASE) + (SIZE) - 1) - \
91 PFN_DOWN((unsigned long)BASE) + 1) \
92 : (0UL))
93
73/* If this is set, the section belongs in the init part of the module */ 94/* If this is set, the section belongs in the init part of the module */
74#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) 95#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1))
75 96
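
A user-space sketch of the MOD_NUMBER_OF_PAGES() arithmetic just introduced, assuming 4 KiB pages; it only illustrates why both the SIZE - 1 and the trailing + 1 are needed (a region that straddles a page boundary must count every page it touches).

/* Stand-alone illustration, not kernel code: same page counting as
 * MOD_NUMBER_OF_PAGES, with PAGE_SHIFT assumed to be 12. */
#include <stdio.h>

#define PAGE_SHIFT	12
#define PFN_DOWN(x)	((x) >> PAGE_SHIFT)

#define MOD_NUMBER_OF_PAGES(BASE, SIZE) (((SIZE) > 0) ? \
	(PFN_DOWN((unsigned long)(BASE) + (SIZE) - 1) -  \
	 PFN_DOWN((unsigned long)(BASE)) + 1) : 0UL)

int main(void)
{
	/* 5000 bytes starting 100 bytes into a page touch two pages. */
	unsigned long base = 0x100000UL + 100;

	printf("%lu\n", MOD_NUMBER_OF_PAGES(base, 5000));	/* 2 */
	printf("%lu\n", MOD_NUMBER_OF_PAGES(base, 0));		/* 0 */
	return 0;
}
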
@@ -1542,6 +1563,115 @@ static int __unlink_module(void *_mod)
1542 return 0; 1563 return 0;
1543} 1564}
1544 1565
1566#ifdef CONFIG_DEBUG_SET_MODULE_RONX
1567/*
1568 * LKM RO/NX protection: protect module's text/ro-data
1569 * from modification and any data from execution.
1570 */
1571void set_page_attributes(void *start, void *end, int (*set)(unsigned long start, int num_pages))
1572{
1573 unsigned long begin_pfn = PFN_DOWN((unsigned long)start);
1574 unsigned long end_pfn = PFN_DOWN((unsigned long)end);
1575
1576 if (end_pfn > begin_pfn)
1577 set(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn);
1578}
1579
1580static void set_section_ro_nx(void *base,
1581 unsigned long text_size,
1582 unsigned long ro_size,
1583 unsigned long total_size)
1584{
1585 /* begin and end PFNs of the current subsection */
1586 unsigned long begin_pfn;
1587 unsigned long end_pfn;
1588
1589 /*
1590 * Set RO for module text and RO-data:
1591 * - Always protect first page.
1592 * - Do not protect last partial page.
1593 */
1594 if (ro_size > 0)
1595 set_page_attributes(base, base + ro_size, set_memory_ro);
1596
1597 /*
1598 * Set NX permissions for module data:
1599 * - Do not protect first partial page.
1600 * - Always protect last page.
1601 */
1602 if (total_size > text_size) {
1603 begin_pfn = PFN_UP((unsigned long)base + text_size);
1604 end_pfn = PFN_UP((unsigned long)base + total_size);
1605 if (end_pfn > begin_pfn)
1606 set_memory_nx(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn);
1607 }
1608}
1609
1610/* Setting memory back to RW+NX before releasing it */
1611void unset_section_ro_nx(struct module *mod, void *module_region)
1612{
1613 unsigned long total_pages;
1614
1615 if (mod->module_core == module_region) {
1616 /* Set core as NX+RW */
1617 total_pages = MOD_NUMBER_OF_PAGES(mod->module_core, mod->core_size);
1618 set_memory_nx((unsigned long)mod->module_core, total_pages);
1619 set_memory_rw((unsigned long)mod->module_core, total_pages);
1620
1621 } else if (mod->module_init == module_region) {
1622 /* Set init as NX+RW */
1623 total_pages = MOD_NUMBER_OF_PAGES(mod->module_init, mod->init_size);
1624 set_memory_nx((unsigned long)mod->module_init, total_pages);
1625 set_memory_rw((unsigned long)mod->module_init, total_pages);
1626 }
1627}
1628
1629/* Iterate through all modules and set each module's text as RW */
1630void set_all_modules_text_rw()
1631{
1632 struct module *mod;
1633
1634 mutex_lock(&module_mutex);
1635 list_for_each_entry_rcu(mod, &modules, list) {
1636 if ((mod->module_core) && (mod->core_text_size)) {
1637 set_page_attributes(mod->module_core,
1638 mod->module_core + mod->core_text_size,
1639 set_memory_rw);
1640 }
1641 if ((mod->module_init) && (mod->init_text_size)) {
1642 set_page_attributes(mod->module_init,
1643 mod->module_init + mod->init_text_size,
1644 set_memory_rw);
1645 }
1646 }
1647 mutex_unlock(&module_mutex);
1648}
1649
1650/* Iterate through all modules and set each module's text as RO */
1651void set_all_modules_text_ro()
1652{
1653 struct module *mod;
1654
1655 mutex_lock(&module_mutex);
1656 list_for_each_entry_rcu(mod, &modules, list) {
1657 if ((mod->module_core) && (mod->core_text_size)) {
1658 set_page_attributes(mod->module_core,
1659 mod->module_core + mod->core_text_size,
1660 set_memory_ro);
1661 }
1662 if ((mod->module_init) && (mod->init_text_size)) {
1663 set_page_attributes(mod->module_init,
1664 mod->module_init + mod->init_text_size,
1665 set_memory_ro);
1666 }
1667 }
1668 mutex_unlock(&module_mutex);
1669}
1670#else
1671static inline void set_section_ro_nx(void *base, unsigned long text_size, unsigned long ro_size, unsigned long total_size) { }
1672static inline void unset_section_ro_nx(struct module *mod, void *module_region) { }
1673#endif
1674
1545/* Free a module, remove from lists, etc. */ 1675/* Free a module, remove from lists, etc. */
1546static void free_module(struct module *mod) 1676static void free_module(struct module *mod)
1547{ 1677{
@@ -1566,6 +1696,7 @@ static void free_module(struct module *mod)
1566 destroy_params(mod->kp, mod->num_kp); 1696 destroy_params(mod->kp, mod->num_kp);
1567 1697
1568 /* This may be NULL, but that's OK */ 1698 /* This may be NULL, but that's OK */
1699 unset_section_ro_nx(mod, mod->module_init);
1569 module_free(mod, mod->module_init); 1700 module_free(mod, mod->module_init);
1570 kfree(mod->args); 1701 kfree(mod->args);
1571 percpu_modfree(mod); 1702 percpu_modfree(mod);
@@ -1574,6 +1705,7 @@ static void free_module(struct module *mod)
1574 lockdep_free_key_range(mod->module_core, mod->core_size); 1705 lockdep_free_key_range(mod->module_core, mod->core_size);
1575 1706
1576 /* Finally, free the core (containing the module structure) */ 1707 /* Finally, free the core (containing the module structure) */
1708 unset_section_ro_nx(mod, mod->module_core);
1577 module_free(mod, mod->module_core); 1709 module_free(mod, mod->module_core);
1578 1710
1579#ifdef CONFIG_MPU 1711#ifdef CONFIG_MPU
@@ -1777,8 +1909,19 @@ static void layout_sections(struct module *mod, struct load_info *info)
1777 s->sh_entsize = get_offset(mod, &mod->core_size, s, i); 1909 s->sh_entsize = get_offset(mod, &mod->core_size, s, i);
1778 DEBUGP("\t%s\n", name); 1910 DEBUGP("\t%s\n", name);
1779 } 1911 }
1780 if (m == 0) 1912 switch (m) {
1913 case 0: /* executable */
1914 mod->core_size = debug_align(mod->core_size);
1781 mod->core_text_size = mod->core_size; 1915 mod->core_text_size = mod->core_size;
1916 break;
1917 case 1: /* RO: text and ro-data */
1918 mod->core_size = debug_align(mod->core_size);
1919 mod->core_ro_size = mod->core_size;
1920 break;
1921 case 3: /* whole core */
1922 mod->core_size = debug_align(mod->core_size);
1923 break;
1924 }
1782 } 1925 }
1783 1926
1784 DEBUGP("Init section allocation order:\n"); 1927 DEBUGP("Init section allocation order:\n");
@@ -1796,8 +1939,19 @@ static void layout_sections(struct module *mod, struct load_info *info)
1796 | INIT_OFFSET_MASK); 1939 | INIT_OFFSET_MASK);
1797 DEBUGP("\t%s\n", sname); 1940 DEBUGP("\t%s\n", sname);
1798 } 1941 }
1799 if (m == 0) 1942 switch (m) {
1943 case 0: /* executable */
1944 mod->init_size = debug_align(mod->init_size);
1800 mod->init_text_size = mod->init_size; 1945 mod->init_text_size = mod->init_size;
1946 break;
1947 case 1: /* RO: text and ro-data */
1948 mod->init_size = debug_align(mod->init_size);
1949 mod->init_ro_size = mod->init_size;
1950 break;
1951 case 3: /* whole init */
1952 mod->init_size = debug_align(mod->init_size);
1953 break;
1954 }
1801 } 1955 }
1802} 1956}
1803 1957
@@ -2306,9 +2460,9 @@ static void find_module_sections(struct module *mod, struct load_info *info)
2306#endif 2460#endif
2307 2461
2308#ifdef CONFIG_TRACEPOINTS 2462#ifdef CONFIG_TRACEPOINTS
2309 mod->tracepoints = section_objs(info, "__tracepoints", 2463 mod->tracepoints_ptrs = section_objs(info, "__tracepoints_ptrs",
2310 sizeof(*mod->tracepoints), 2464 sizeof(*mod->tracepoints_ptrs),
2311 &mod->num_tracepoints); 2465 &mod->num_tracepoints);
2312#endif 2466#endif
2313#ifdef HAVE_JUMP_LABEL 2467#ifdef HAVE_JUMP_LABEL
2314 mod->jump_entries = section_objs(info, "__jump_table", 2468 mod->jump_entries = section_objs(info, "__jump_table",
@@ -2722,6 +2876,18 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2722 blocking_notifier_call_chain(&module_notify_list, 2876 blocking_notifier_call_chain(&module_notify_list,
2723 MODULE_STATE_COMING, mod); 2877 MODULE_STATE_COMING, mod);
2724 2878
2879 /* Set RO and NX regions for core */
2880 set_section_ro_nx(mod->module_core,
2881 mod->core_text_size,
2882 mod->core_ro_size,
2883 mod->core_size);
2884
2885 /* Set RO and NX regions for init */
2886 set_section_ro_nx(mod->module_init,
2887 mod->init_text_size,
2888 mod->init_ro_size,
2889 mod->init_size);
2890
2725 do_mod_ctors(mod); 2891 do_mod_ctors(mod);
2726 /* Start the module */ 2892 /* Start the module */
2727 if (mod->init != NULL) 2893 if (mod->init != NULL)
@@ -2765,6 +2931,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2765 mod->symtab = mod->core_symtab; 2931 mod->symtab = mod->core_symtab;
2766 mod->strtab = mod->core_strtab; 2932 mod->strtab = mod->core_strtab;
2767#endif 2933#endif
2934 unset_section_ro_nx(mod, mod->module_init);
2768 module_free(mod, mod->module_init); 2935 module_free(mod, mod->module_init);
2769 mod->module_init = NULL; 2936 mod->module_init = NULL;
2770 mod->init_size = 0; 2937 mod->init_size = 0;
@@ -3226,7 +3393,7 @@ void module_layout(struct module *mod,
3226 struct modversion_info *ver, 3393 struct modversion_info *ver,
3227 struct kernel_param *kp, 3394 struct kernel_param *kp,
3228 struct kernel_symbol *ks, 3395 struct kernel_symbol *ks,
3229 struct tracepoint *tp) 3396 struct tracepoint * const *tp)
3230{ 3397{
3231} 3398}
3232EXPORT_SYMBOL(module_layout); 3399EXPORT_SYMBOL(module_layout);
@@ -3240,8 +3407,8 @@ void module_update_tracepoints(void)
3240 mutex_lock(&module_mutex); 3407 mutex_lock(&module_mutex);
3241 list_for_each_entry(mod, &modules, list) 3408 list_for_each_entry(mod, &modules, list)
3242 if (!mod->taints) 3409 if (!mod->taints)
3243 tracepoint_update_probe_range(mod->tracepoints, 3410 tracepoint_update_probe_range(mod->tracepoints_ptrs,
3244 mod->tracepoints + mod->num_tracepoints); 3411 mod->tracepoints_ptrs + mod->num_tracepoints);
3245 mutex_unlock(&module_mutex); 3412 mutex_unlock(&module_mutex);
3246} 3413}
3247 3414
@@ -3265,8 +3432,8 @@ int module_get_iter_tracepoints(struct tracepoint_iter *iter)
3265 else if (iter_mod > iter->module) 3432 else if (iter_mod > iter->module)
3266 iter->tracepoint = NULL; 3433 iter->tracepoint = NULL;
3267 found = tracepoint_get_iter_range(&iter->tracepoint, 3434 found = tracepoint_get_iter_range(&iter->tracepoint,
3268 iter_mod->tracepoints, 3435 iter_mod->tracepoints_ptrs,
3269 iter_mod->tracepoints 3436 iter_mod->tracepoints_ptrs
3270 + iter_mod->num_tracepoints); 3437 + iter_mod->num_tracepoints);
3271 if (found) { 3438 if (found) {
3272 iter->module = iter_mod; 3439 iter->module = iter_mod;
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 200407c1502f..a5889fb28ecf 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -199,7 +199,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
199 * memory barriers as we'll eventually observe the right 199 * memory barriers as we'll eventually observe the right
200 * values at the cost of a few extra spins. 200 * values at the cost of a few extra spins.
201 */ 201 */
202 cpu_relax(); 202 arch_mutex_cpu_relax();
203 } 203 }
204#endif 204#endif
205 spin_lock_mutex(&lock->wait_lock, flags); 205 spin_lock_mutex(&lock->wait_lock, flags);
diff --git a/kernel/panic.c b/kernel/panic.c
index 4c13b1a88ebb..991bb87a1704 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -34,6 +34,7 @@ static int pause_on_oops_flag;
34static DEFINE_SPINLOCK(pause_on_oops_lock); 34static DEFINE_SPINLOCK(pause_on_oops_lock);
35 35
36int panic_timeout; 36int panic_timeout;
37EXPORT_SYMBOL_GPL(panic_timeout);
37 38
38ATOMIC_NOTIFIER_HEAD(panic_notifier_list); 39ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
39 40
diff --git a/kernel/params.c b/kernel/params.c
index 08107d181758..0da1411222b9 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -719,9 +719,7 @@ void destroy_params(const struct kernel_param *params, unsigned num)
719 params[i].ops->free(params[i].arg); 719 params[i].ops->free(params[i].arg);
720} 720}
721 721
722static void __init kernel_add_sysfs_param(const char *name, 722static struct module_kobject * __init locate_module_kobject(const char *name)
723 struct kernel_param *kparam,
724 unsigned int name_skip)
725{ 723{
726 struct module_kobject *mk; 724 struct module_kobject *mk;
727 struct kobject *kobj; 725 struct kobject *kobj;
@@ -729,10 +727,7 @@ static void __init kernel_add_sysfs_param(const char *name,
729 727
730 kobj = kset_find_obj(module_kset, name); 728 kobj = kset_find_obj(module_kset, name);
731 if (kobj) { 729 if (kobj) {
732 /* We already have one. Remove params so we can add more. */
733 mk = to_module_kobject(kobj); 730 mk = to_module_kobject(kobj);
734 /* We need to remove it before adding parameters. */
735 sysfs_remove_group(&mk->kobj, &mk->mp->grp);
736 } else { 731 } else {
737 mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL); 732 mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL);
738 BUG_ON(!mk); 733 BUG_ON(!mk);
@@ -743,15 +738,36 @@ static void __init kernel_add_sysfs_param(const char *name,
743 "%s", name); 738 "%s", name);
744 if (err) { 739 if (err) {
745 kobject_put(&mk->kobj); 740 kobject_put(&mk->kobj);
746 printk(KERN_ERR "Module '%s' failed add to sysfs, " 741 printk(KERN_ERR
747 "error number %d\n", name, err); 742 "Module '%s' failed add to sysfs, error number %d\n",
748 printk(KERN_ERR "The system will be unstable now.\n"); 743 name, err);
749 return; 744 printk(KERN_ERR
745 "The system will be unstable now.\n");
746 return NULL;
750 } 747 }
751 /* So that exit path is even. */ 748
749 /* So that we hold reference in both cases. */
752 kobject_get(&mk->kobj); 750 kobject_get(&mk->kobj);
753 } 751 }
754 752
753 return mk;
754}
755
756static void __init kernel_add_sysfs_param(const char *name,
757 struct kernel_param *kparam,
758 unsigned int name_skip)
759{
760 struct module_kobject *mk;
761 int err;
762
763 mk = locate_module_kobject(name);
764 if (!mk)
765 return;
766
767 /* We need to remove old parameters before adding more. */
768 if (mk->mp)
769 sysfs_remove_group(&mk->kobj, &mk->mp->grp);
770
755 /* These should not fail at boot. */ 771 /* These should not fail at boot. */
756 err = add_sysfs_param(mk, kparam, kparam->name + name_skip); 772 err = add_sysfs_param(mk, kparam, kparam->name + name_skip);
757 BUG_ON(err); 773 BUG_ON(err);
@@ -796,6 +812,32 @@ static void __init param_sysfs_builtin(void)
796 } 812 }
797} 813}
798 814
815ssize_t __modver_version_show(struct module_attribute *mattr,
816 struct module *mod, char *buf)
817{
818 struct module_version_attribute *vattr =
819 container_of(mattr, struct module_version_attribute, mattr);
820
821 return sprintf(buf, "%s\n", vattr->version);
822}
823
824extern struct module_version_attribute __start___modver[], __stop___modver[];
825
826static void __init version_sysfs_builtin(void)
827{
828 const struct module_version_attribute *vattr;
829 struct module_kobject *mk;
830 int err;
831
832 for (vattr = __start___modver; vattr < __stop___modver; vattr++) {
833 mk = locate_module_kobject(vattr->module_name);
834 if (mk) {
835 err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr);
836 kobject_uevent(&mk->kobj, KOBJ_ADD);
837 kobject_put(&mk->kobj);
838 }
839 }
840}
799 841
800/* module-related sysfs stuff */ 842/* module-related sysfs stuff */
801 843
@@ -875,6 +917,7 @@ static int __init param_sysfs_init(void)
875 } 917 }
876 module_sysfs_initialized = 1; 918 module_sysfs_initialized = 1;
877 919
920 version_sysfs_builtin();
878 param_sysfs_builtin(); 921 param_sysfs_builtin();
879 922
880 return 0; 923 return 0;
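
The new locate_module_kobject()/version_sysfs_builtin() pair lets MODULE_VERSION() strings of built-in code show up in sysfs, where previously only loadable modules got a kobject of their own. A minimal sketch with an illustrative driver name and version string (the resulting attribute is expected under /sys/module/<modname>/version):

#include <linux/module.h>
#include <linux/init.h>

/* Hypothetical driver: when built in, the version string lands in the
 * __modver section and is picked up at boot by version_sysfs_builtin(). */
MODULE_VERSION("1.2");
MODULE_LICENSE("GPL");

static int __init verdemo_init(void)
{
	return 0;
}
module_init(verdemo_init);
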
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 2870feee81dd..999835b6112b 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -13,6 +13,7 @@
13#include <linux/mm.h> 13#include <linux/mm.h>
14#include <linux/cpu.h> 14#include <linux/cpu.h>
15#include <linux/smp.h> 15#include <linux/smp.h>
16#include <linux/idr.h>
16#include <linux/file.h> 17#include <linux/file.h>
17#include <linux/poll.h> 18#include <linux/poll.h>
18#include <linux/slab.h> 19#include <linux/slab.h>
@@ -21,7 +22,9 @@
21#include <linux/dcache.h> 22#include <linux/dcache.h>
22#include <linux/percpu.h> 23#include <linux/percpu.h>
23#include <linux/ptrace.h> 24#include <linux/ptrace.h>
25#include <linux/reboot.h>
24#include <linux/vmstat.h> 26#include <linux/vmstat.h>
27#include <linux/device.h>
25#include <linux/vmalloc.h> 28#include <linux/vmalloc.h>
26#include <linux/hardirq.h> 29#include <linux/hardirq.h>
27#include <linux/rculist.h> 30#include <linux/rculist.h>
@@ -35,6 +38,12 @@
35 38
36#include <asm/irq_regs.h> 39#include <asm/irq_regs.h>
37 40
41enum event_type_t {
42 EVENT_FLEXIBLE = 0x1,
43 EVENT_PINNED = 0x2,
44 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
45};
46
38atomic_t perf_task_events __read_mostly; 47atomic_t perf_task_events __read_mostly;
39static atomic_t nr_mmap_events __read_mostly; 48static atomic_t nr_mmap_events __read_mostly;
40static atomic_t nr_comm_events __read_mostly; 49static atomic_t nr_comm_events __read_mostly;
@@ -62,6 +71,12 @@ int sysctl_perf_event_sample_rate __read_mostly = 100000;
62 71
63static atomic64_t perf_event_id; 72static atomic64_t perf_event_id;
64 73
74static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
75 enum event_type_t event_type);
76
77static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
78 enum event_type_t event_type);
79
65void __weak perf_event_print_debug(void) { } 80void __weak perf_event_print_debug(void) { }
66 81
67extern __weak const char *perf_pmu_name(void) 82extern __weak const char *perf_pmu_name(void)
@@ -69,6 +84,11 @@ extern __weak const char *perf_pmu_name(void)
69 return "pmu"; 84 return "pmu";
70} 85}
71 86
87static inline u64 perf_clock(void)
88{
89 return local_clock();
90}
91
72void perf_pmu_disable(struct pmu *pmu) 92void perf_pmu_disable(struct pmu *pmu)
73{ 93{
74 int *count = this_cpu_ptr(pmu->pmu_disable_count); 94 int *count = this_cpu_ptr(pmu->pmu_disable_count);
@@ -133,6 +153,28 @@ static void unclone_ctx(struct perf_event_context *ctx)
133 } 153 }
134} 154}
135 155
156static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
157{
158 /*
159 * only top level events have the pid namespace they were created in
160 */
161 if (event->parent)
162 event = event->parent;
163
164 return task_tgid_nr_ns(p, event->ns);
165}
166
167static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
168{
169 /*
170 * only top level events have the pid namespace they were created in
171 */
172 if (event->parent)
173 event = event->parent;
174
175 return task_pid_nr_ns(p, event->ns);
176}
177
136/* 178/*
137 * If we inherit events we want to return the parent event id 179 * If we inherit events we want to return the parent event id
138 * to userspace. 180 * to userspace.
@@ -215,11 +257,6 @@ static void perf_unpin_context(struct perf_event_context *ctx)
215 put_ctx(ctx); 257 put_ctx(ctx);
216} 258}
217 259
218static inline u64 perf_clock(void)
219{
220 return local_clock();
221}
222
223/* 260/*
224 * Update the record of the current time in a context. 261 * Update the record of the current time in a context.
225 */ 262 */
@@ -231,6 +268,12 @@ static void update_context_time(struct perf_event_context *ctx)
231 ctx->timestamp = now; 268 ctx->timestamp = now;
232} 269}
233 270
271static u64 perf_event_time(struct perf_event *event)
272{
273 struct perf_event_context *ctx = event->ctx;
274 return ctx ? ctx->time : 0;
275}
276
234/* 277/*
235 * Update the total_time_enabled and total_time_running fields for a event. 278 * Update the total_time_enabled and total_time_running fields for a event.
236 */ 279 */
@@ -244,7 +287,7 @@ static void update_event_times(struct perf_event *event)
244 return; 287 return;
245 288
246 if (ctx->is_active) 289 if (ctx->is_active)
247 run_end = ctx->time; 290 run_end = perf_event_time(event);
248 else 291 else
249 run_end = event->tstamp_stopped; 292 run_end = event->tstamp_stopped;
250 293
@@ -253,7 +296,7 @@ static void update_event_times(struct perf_event *event)
253 if (event->state == PERF_EVENT_STATE_INACTIVE) 296 if (event->state == PERF_EVENT_STATE_INACTIVE)
254 run_end = event->tstamp_stopped; 297 run_end = event->tstamp_stopped;
255 else 298 else
256 run_end = ctx->time; 299 run_end = perf_event_time(event);
257 300
258 event->total_time_running = run_end - event->tstamp_running; 301 event->total_time_running = run_end - event->tstamp_running;
259} 302}
@@ -312,9 +355,84 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
312 ctx->nr_stat++; 355 ctx->nr_stat++;
313} 356}
314 357
358/*
359 * Called at perf_event creation and when events are attached/detached from a
360 * group.
361 */
362static void perf_event__read_size(struct perf_event *event)
363{
364 int entry = sizeof(u64); /* value */
365 int size = 0;
366 int nr = 1;
367
368 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
369 size += sizeof(u64);
370
371 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
372 size += sizeof(u64);
373
374 if (event->attr.read_format & PERF_FORMAT_ID)
375 entry += sizeof(u64);
376
377 if (event->attr.read_format & PERF_FORMAT_GROUP) {
378 nr += event->group_leader->nr_siblings;
379 size += sizeof(u64);
380 }
381
382 size += entry * nr;
383 event->read_size = size;
384}
385
386static void perf_event__header_size(struct perf_event *event)
387{
388 struct perf_sample_data *data;
389 u64 sample_type = event->attr.sample_type;
390 u16 size = 0;
391
392 perf_event__read_size(event);
393
394 if (sample_type & PERF_SAMPLE_IP)
395 size += sizeof(data->ip);
396
397 if (sample_type & PERF_SAMPLE_ADDR)
398 size += sizeof(data->addr);
399
400 if (sample_type & PERF_SAMPLE_PERIOD)
401 size += sizeof(data->period);
402
403 if (sample_type & PERF_SAMPLE_READ)
404 size += event->read_size;
405
406 event->header_size = size;
407}
408
409static void perf_event__id_header_size(struct perf_event *event)
410{
411 struct perf_sample_data *data;
412 u64 sample_type = event->attr.sample_type;
413 u16 size = 0;
414
415 if (sample_type & PERF_SAMPLE_TID)
416 size += sizeof(data->tid_entry);
417
418 if (sample_type & PERF_SAMPLE_TIME)
419 size += sizeof(data->time);
420
421 if (sample_type & PERF_SAMPLE_ID)
422 size += sizeof(data->id);
423
424 if (sample_type & PERF_SAMPLE_STREAM_ID)
425 size += sizeof(data->stream_id);
426
427 if (sample_type & PERF_SAMPLE_CPU)
428 size += sizeof(data->cpu_entry);
429
430 event->id_header_size = size;
431}
432
315static void perf_group_attach(struct perf_event *event) 433static void perf_group_attach(struct perf_event *event)
316{ 434{
317 struct perf_event *group_leader = event->group_leader; 435 struct perf_event *group_leader = event->group_leader, *pos;
318 436
319 /* 437 /*
320 * We can have double attach due to group movement in perf_event_open. 438 * We can have double attach due to group movement in perf_event_open.
@@ -333,6 +451,11 @@ static void perf_group_attach(struct perf_event *event)
333 451
334 list_add_tail(&event->group_entry, &group_leader->sibling_list); 452 list_add_tail(&event->group_entry, &group_leader->sibling_list);
335 group_leader->nr_siblings++; 453 group_leader->nr_siblings++;
454
455 perf_event__header_size(group_leader);
456
457 list_for_each_entry(pos, &group_leader->sibling_list, group_entry)
458 perf_event__header_size(pos);
336} 459}
337 460
338/* 461/*
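
perf_event__read_size() precomputes what the old perf_event_read_size() helper (removed further down in this patch) recomputed on every read(), and perf_event__header_size() caches the fixed part of the sample header. A stand-alone sketch of the read-size accounting for read_format = TOTAL_TIME_ENABLED | ID | GROUP with a leader that has two siblings:

/* User-space illustration of the perf_event__read_size() arithmetic;
 * the flag combination chosen here is just an example. */
#include <stdio.h>

int main(void)
{
	int entry = 8;		/* u64 value, always present        */
	int size  = 0;
	int nr    = 1;		/* the leader itself                */

	size  += 8;		/* PERF_FORMAT_TOTAL_TIME_ENABLED   */
	entry += 8;		/* PERF_FORMAT_ID: one id per entry */
	nr    += 2;		/* PERF_FORMAT_GROUP: two siblings  */
	size  += 8;		/*   ...plus the leading "nr" field */

	size += entry * nr;	/* 16 bytes * 3 entries             */
	printf("read_size = %d bytes\n", size);	/* 64 */
	return 0;
}
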
@@ -391,7 +514,7 @@ static void perf_group_detach(struct perf_event *event)
391 if (event->group_leader != event) { 514 if (event->group_leader != event) {
392 list_del_init(&event->group_entry); 515 list_del_init(&event->group_entry);
393 event->group_leader->nr_siblings--; 516 event->group_leader->nr_siblings--;
394 return; 517 goto out;
395 } 518 }
396 519
397 if (!list_empty(&event->group_entry)) 520 if (!list_empty(&event->group_entry))
@@ -410,6 +533,12 @@ static void perf_group_detach(struct perf_event *event)
410 /* Inherit group flags from the previous leader */ 533 /* Inherit group flags from the previous leader */
411 sibling->group_flags = event->group_flags; 534 sibling->group_flags = event->group_flags;
412 } 535 }
536
537out:
538 perf_event__header_size(event->group_leader);
539
540 list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry)
541 perf_event__header_size(tmp);
413} 542}
414 543
415static inline int 544static inline int
@@ -423,6 +552,7 @@ event_sched_out(struct perf_event *event,
423 struct perf_cpu_context *cpuctx, 552 struct perf_cpu_context *cpuctx,
424 struct perf_event_context *ctx) 553 struct perf_event_context *ctx)
425{ 554{
555 u64 tstamp = perf_event_time(event);
426 u64 delta; 556 u64 delta;
427 /* 557 /*
428 * An event which could not be activated because of 558 * An event which could not be activated because of
@@ -434,7 +564,7 @@ event_sched_out(struct perf_event *event,
434 && !event_filter_match(event)) { 564 && !event_filter_match(event)) {
435 delta = ctx->time - event->tstamp_stopped; 565 delta = ctx->time - event->tstamp_stopped;
436 event->tstamp_running += delta; 566 event->tstamp_running += delta;
437 event->tstamp_stopped = ctx->time; 567 event->tstamp_stopped = tstamp;
438 } 568 }
439 569
440 if (event->state != PERF_EVENT_STATE_ACTIVE) 570 if (event->state != PERF_EVENT_STATE_ACTIVE)
@@ -445,7 +575,7 @@ event_sched_out(struct perf_event *event,
445 event->pending_disable = 0; 575 event->pending_disable = 0;
446 event->state = PERF_EVENT_STATE_OFF; 576 event->state = PERF_EVENT_STATE_OFF;
447 } 577 }
448 event->tstamp_stopped = ctx->time; 578 event->tstamp_stopped = tstamp;
449 event->pmu->del(event, 0); 579 event->pmu->del(event, 0);
450 event->oncpu = -1; 580 event->oncpu = -1;
451 581
@@ -657,6 +787,8 @@ event_sched_in(struct perf_event *event,
657 struct perf_cpu_context *cpuctx, 787 struct perf_cpu_context *cpuctx,
658 struct perf_event_context *ctx) 788 struct perf_event_context *ctx)
659{ 789{
790 u64 tstamp = perf_event_time(event);
791
660 if (event->state <= PERF_EVENT_STATE_OFF) 792 if (event->state <= PERF_EVENT_STATE_OFF)
661 return 0; 793 return 0;
662 794
@@ -673,9 +805,9 @@ event_sched_in(struct perf_event *event,
673 return -EAGAIN; 805 return -EAGAIN;
674 } 806 }
675 807
676 event->tstamp_running += ctx->time - event->tstamp_stopped; 808 event->tstamp_running += tstamp - event->tstamp_stopped;
677 809
678 event->shadow_ctx_time = ctx->time - ctx->timestamp; 810 event->shadow_ctx_time = tstamp - ctx->timestamp;
679 811
680 if (!is_software_event(event)) 812 if (!is_software_event(event))
681 cpuctx->active_oncpu++; 813 cpuctx->active_oncpu++;
@@ -787,11 +919,13 @@ static int group_can_go_on(struct perf_event *event,
787static void add_event_to_ctx(struct perf_event *event, 919static void add_event_to_ctx(struct perf_event *event,
788 struct perf_event_context *ctx) 920 struct perf_event_context *ctx)
789{ 921{
922 u64 tstamp = perf_event_time(event);
923
790 list_add_event(event, ctx); 924 list_add_event(event, ctx);
791 perf_group_attach(event); 925 perf_group_attach(event);
792 event->tstamp_enabled = ctx->time; 926 event->tstamp_enabled = tstamp;
793 event->tstamp_running = ctx->time; 927 event->tstamp_running = tstamp;
794 event->tstamp_stopped = ctx->time; 928 event->tstamp_stopped = tstamp;
795} 929}
796 930
797/* 931/*
@@ -826,7 +960,7 @@ static void __perf_install_in_context(void *info)
826 960
827 add_event_to_ctx(event, ctx); 961 add_event_to_ctx(event, ctx);
828 962
829 if (event->cpu != -1 && event->cpu != smp_processor_id()) 963 if (!event_filter_match(event))
830 goto unlock; 964 goto unlock;
831 965
832 /* 966 /*
@@ -931,14 +1065,13 @@ static void __perf_event_mark_enabled(struct perf_event *event,
931 struct perf_event_context *ctx) 1065 struct perf_event_context *ctx)
932{ 1066{
933 struct perf_event *sub; 1067 struct perf_event *sub;
1068 u64 tstamp = perf_event_time(event);
934 1069
935 event->state = PERF_EVENT_STATE_INACTIVE; 1070 event->state = PERF_EVENT_STATE_INACTIVE;
936 event->tstamp_enabled = ctx->time - event->total_time_enabled; 1071 event->tstamp_enabled = tstamp - event->total_time_enabled;
937 list_for_each_entry(sub, &event->sibling_list, group_entry) { 1072 list_for_each_entry(sub, &event->sibling_list, group_entry) {
938 if (sub->state >= PERF_EVENT_STATE_INACTIVE) { 1073 if (sub->state >= PERF_EVENT_STATE_INACTIVE)
939 sub->tstamp_enabled = 1074 sub->tstamp_enabled = tstamp - sub->total_time_enabled;
940 ctx->time - sub->total_time_enabled;
941 }
942 } 1075 }
943} 1076}
944 1077
@@ -971,7 +1104,7 @@ static void __perf_event_enable(void *info)
971 goto unlock; 1104 goto unlock;
972 __perf_event_mark_enabled(event, ctx); 1105 __perf_event_mark_enabled(event, ctx);
973 1106
974 if (event->cpu != -1 && event->cpu != smp_processor_id()) 1107 if (!event_filter_match(event))
975 goto unlock; 1108 goto unlock;
976 1109
977 /* 1110 /*
@@ -1073,7 +1206,7 @@ static int perf_event_refresh(struct perf_event *event, int refresh)
1073 /* 1206 /*
1074 * not supported on inherited events 1207 * not supported on inherited events
1075 */ 1208 */
1076 if (event->attr.inherit) 1209 if (event->attr.inherit || !is_sampling_event(event))
1077 return -EINVAL; 1210 return -EINVAL;
1078 1211
1079 atomic_add(refresh, &event->event_limit); 1212 atomic_add(refresh, &event->event_limit);
@@ -1082,12 +1215,6 @@ static int perf_event_refresh(struct perf_event *event, int refresh)
1082 return 0; 1215 return 0;
1083} 1216}
1084 1217
1085enum event_type_t {
1086 EVENT_FLEXIBLE = 0x1,
1087 EVENT_PINNED = 0x2,
1088 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
1089};
1090
1091static void ctx_sched_out(struct perf_event_context *ctx, 1218static void ctx_sched_out(struct perf_event_context *ctx,
1092 struct perf_cpu_context *cpuctx, 1219 struct perf_cpu_context *cpuctx,
1093 enum event_type_t event_type) 1220 enum event_type_t event_type)
@@ -1324,7 +1451,7 @@ ctx_pinned_sched_in(struct perf_event_context *ctx,
1324 list_for_each_entry(event, &ctx->pinned_groups, group_entry) { 1451 list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
1325 if (event->state <= PERF_EVENT_STATE_OFF) 1452 if (event->state <= PERF_EVENT_STATE_OFF)
1326 continue; 1453 continue;
1327 if (event->cpu != -1 && event->cpu != smp_processor_id()) 1454 if (!event_filter_match(event))
1328 continue; 1455 continue;
1329 1456
1330 if (group_can_go_on(event, cpuctx, 1)) 1457 if (group_can_go_on(event, cpuctx, 1))
@@ -1356,7 +1483,7 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
1356 * Listen to the 'cpu' scheduling filter constraint 1483 * Listen to the 'cpu' scheduling filter constraint
1357 * of events: 1484 * of events:
1358 */ 1485 */
1359 if (event->cpu != -1 && event->cpu != smp_processor_id()) 1486 if (!event_filter_match(event))
1360 continue; 1487 continue;
1361 1488
1362 if (group_can_go_on(event, cpuctx, can_add_hw)) { 1489 if (group_can_go_on(event, cpuctx, can_add_hw)) {
@@ -1583,7 +1710,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
1583 if (event->state != PERF_EVENT_STATE_ACTIVE) 1710 if (event->state != PERF_EVENT_STATE_ACTIVE)
1584 continue; 1711 continue;
1585 1712
1586 if (event->cpu != -1 && event->cpu != smp_processor_id()) 1713 if (!event_filter_match(event))
1587 continue; 1714 continue;
1588 1715
1589 hwc = &event->hw; 1716 hwc = &event->hw;
@@ -1774,11 +1901,12 @@ static void __perf_event_read(void *info)
1774 return; 1901 return;
1775 1902
1776 raw_spin_lock(&ctx->lock); 1903 raw_spin_lock(&ctx->lock);
1777 update_context_time(ctx); 1904 if (ctx->is_active)
1905 update_context_time(ctx);
1778 update_event_times(event); 1906 update_event_times(event);
1907 if (event->state == PERF_EVENT_STATE_ACTIVE)
1908 event->pmu->read(event);
1779 raw_spin_unlock(&ctx->lock); 1909 raw_spin_unlock(&ctx->lock);
1780
1781 event->pmu->read(event);
1782} 1910}
1783 1911
1784static inline u64 perf_event_count(struct perf_event *event) 1912static inline u64 perf_event_count(struct perf_event *event)
@@ -1872,8 +2000,7 @@ static int alloc_callchain_buffers(void)
1872 * accessed from NMI. Use a temporary manual per cpu allocation 2000 * accessed from NMI. Use a temporary manual per cpu allocation
1873 * until that gets sorted out. 2001 * until that gets sorted out.
1874 */ 2002 */
1875 size = sizeof(*entries) + sizeof(struct perf_callchain_entry *) * 2003 size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]);
1876 num_possible_cpus();
1877 2004
1878 entries = kzalloc(size, GFP_KERNEL); 2005 entries = kzalloc(size, GFP_KERNEL);
1879 if (!entries) 2006 if (!entries)
@@ -2074,13 +2201,6 @@ find_lively_task_by_vpid(pid_t vpid)
2074 if (!task) 2201 if (!task)
2075 return ERR_PTR(-ESRCH); 2202 return ERR_PTR(-ESRCH);
2076 2203
2077 /*
2078 * Can't attach events to a dying task.
2079 */
2080 err = -ESRCH;
2081 if (task->flags & PF_EXITING)
2082 goto errout;
2083
2084 /* Reuse ptrace permission checks for now. */ 2204 /* Reuse ptrace permission checks for now. */
2085 err = -EACCES; 2205 err = -EACCES;
2086 if (!ptrace_may_access(task, PTRACE_MODE_READ)) 2206 if (!ptrace_may_access(task, PTRACE_MODE_READ))
@@ -2101,14 +2221,11 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
2101 unsigned long flags; 2221 unsigned long flags;
2102 int ctxn, err; 2222 int ctxn, err;
2103 2223
2104 if (!task && cpu != -1) { 2224 if (!task) {
2105 /* Must be root to operate on a CPU event: */ 2225 /* Must be root to operate on a CPU event: */
2106 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) 2226 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
2107 return ERR_PTR(-EACCES); 2227 return ERR_PTR(-EACCES);
2108 2228
2109 if (cpu < 0 || cpu >= nr_cpumask_bits)
2110 return ERR_PTR(-EINVAL);
2111
2112 /* 2229 /*
2113 * We could be clever and allow to attach a event to an 2230 * We could be clever and allow to attach a event to an
2114 * offline CPU and activate it when the CPU comes up, but 2231 * offline CPU and activate it when the CPU comes up, but
@@ -2144,14 +2261,27 @@ retry:
2144 2261
2145 get_ctx(ctx); 2262 get_ctx(ctx);
2146 2263
2147 if (cmpxchg(&task->perf_event_ctxp[ctxn], NULL, ctx)) { 2264 err = 0;
2148 /* 2265 mutex_lock(&task->perf_event_mutex);
2149 * We raced with some other task; use 2266 /*
2150 * the context they set. 2267 * If it has already passed perf_event_exit_task().
2151 */ 2268 * we must see PF_EXITING, it takes this mutex too.
2269 */
2270 if (task->flags & PF_EXITING)
2271 err = -ESRCH;
2272 else if (task->perf_event_ctxp[ctxn])
2273 err = -EAGAIN;
2274 else
2275 rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
2276 mutex_unlock(&task->perf_event_mutex);
2277
2278 if (unlikely(err)) {
2152 put_task_struct(task); 2279 put_task_struct(task);
2153 kfree(ctx); 2280 kfree(ctx);
2154 goto retry; 2281
2282 if (err == -EAGAIN)
2283 goto retry;
2284 goto errout;
2155 } 2285 }
2156 } 2286 }
2157 2287
@@ -2289,31 +2419,6 @@ static int perf_release(struct inode *inode, struct file *file)
2289 return perf_event_release_kernel(event); 2419 return perf_event_release_kernel(event);
2290} 2420}
2291 2421
2292static int perf_event_read_size(struct perf_event *event)
2293{
2294 int entry = sizeof(u64); /* value */
2295 int size = 0;
2296 int nr = 1;
2297
2298 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
2299 size += sizeof(u64);
2300
2301 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
2302 size += sizeof(u64);
2303
2304 if (event->attr.read_format & PERF_FORMAT_ID)
2305 entry += sizeof(u64);
2306
2307 if (event->attr.read_format & PERF_FORMAT_GROUP) {
2308 nr += event->group_leader->nr_siblings;
2309 size += sizeof(u64);
2310 }
2311
2312 size += entry * nr;
2313
2314 return size;
2315}
2316
2317u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) 2422u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
2318{ 2423{
2319 struct perf_event *child; 2424 struct perf_event *child;
@@ -2428,7 +2533,7 @@ perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
2428 if (event->state == PERF_EVENT_STATE_ERROR) 2533 if (event->state == PERF_EVENT_STATE_ERROR)
2429 return 0; 2534 return 0;
2430 2535
2431 if (count < perf_event_read_size(event)) 2536 if (count < event->read_size)
2432 return -ENOSPC; 2537 return -ENOSPC;
2433 2538
2434 WARN_ON_ONCE(event->ctx->parent_ctx); 2539 WARN_ON_ONCE(event->ctx->parent_ctx);
@@ -2514,7 +2619,7 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
2514 int ret = 0; 2619 int ret = 0;
2515 u64 value; 2620 u64 value;
2516 2621
2517 if (!event->attr.sample_period) 2622 if (!is_sampling_event(event))
2518 return -EINVAL; 2623 return -EINVAL;
2519 2624
2520 if (copy_from_user(&value, arg, sizeof(value))) 2625 if (copy_from_user(&value, arg, sizeof(value)))
@@ -3305,6 +3410,73 @@ __always_inline void perf_output_copy(struct perf_output_handle *handle,
3305 } while (len); 3410 } while (len);
3306} 3411}
3307 3412
3413static void __perf_event_header__init_id(struct perf_event_header *header,
3414 struct perf_sample_data *data,
3415 struct perf_event *event)
3416{
3417 u64 sample_type = event->attr.sample_type;
3418
3419 data->type = sample_type;
3420 header->size += event->id_header_size;
3421
3422 if (sample_type & PERF_SAMPLE_TID) {
3423 /* namespace issues */
3424 data->tid_entry.pid = perf_event_pid(event, current);
3425 data->tid_entry.tid = perf_event_tid(event, current);
3426 }
3427
3428 if (sample_type & PERF_SAMPLE_TIME)
3429 data->time = perf_clock();
3430
3431 if (sample_type & PERF_SAMPLE_ID)
3432 data->id = primary_event_id(event);
3433
3434 if (sample_type & PERF_SAMPLE_STREAM_ID)
3435 data->stream_id = event->id;
3436
3437 if (sample_type & PERF_SAMPLE_CPU) {
3438 data->cpu_entry.cpu = raw_smp_processor_id();
3439 data->cpu_entry.reserved = 0;
3440 }
3441}
3442
3443static void perf_event_header__init_id(struct perf_event_header *header,
3444 struct perf_sample_data *data,
3445 struct perf_event *event)
3446{
3447 if (event->attr.sample_id_all)
3448 __perf_event_header__init_id(header, data, event);
3449}
3450
3451static void __perf_event__output_id_sample(struct perf_output_handle *handle,
3452 struct perf_sample_data *data)
3453{
3454 u64 sample_type = data->type;
3455
3456 if (sample_type & PERF_SAMPLE_TID)
3457 perf_output_put(handle, data->tid_entry);
3458
3459 if (sample_type & PERF_SAMPLE_TIME)
3460 perf_output_put(handle, data->time);
3461
3462 if (sample_type & PERF_SAMPLE_ID)
3463 perf_output_put(handle, data->id);
3464
3465 if (sample_type & PERF_SAMPLE_STREAM_ID)
3466 perf_output_put(handle, data->stream_id);
3467
3468 if (sample_type & PERF_SAMPLE_CPU)
3469 perf_output_put(handle, data->cpu_entry);
3470}
3471
3472static void perf_event__output_id_sample(struct perf_event *event,
3473 struct perf_output_handle *handle,
3474 struct perf_sample_data *sample)
3475{
3476 if (event->attr.sample_id_all)
3477 __perf_event__output_id_sample(handle, sample);
3478}
3479
3308int perf_output_begin(struct perf_output_handle *handle, 3480int perf_output_begin(struct perf_output_handle *handle,
3309 struct perf_event *event, unsigned int size, 3481 struct perf_event *event, unsigned int size,
3310 int nmi, int sample) 3482 int nmi, int sample)
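
perf_event__id_header_size() (added earlier in this patch) and __perf_event_header__init_id() above size and fill the optional identifier trailer that attr.sample_id_all appends to non-sample records such as COMM, MMAP and LOST. A stand-alone sketch of that accounting for sample_type = TID | TIME | CPU:

/* User-space illustration of the id_header_size bookkeeping; the
 * sample_type combination is only an example. */
#include <stdio.h>

int main(void)
{
	int size = 0;

	size += 8;	/* PERF_SAMPLE_TID:  { u32 pid, tid }      */
	size += 8;	/* PERF_SAMPLE_TIME: u64 time              */
	size += 8;	/* PERF_SAMPLE_CPU:  { u32 cpu, reserved } */

	printf("id_header_size = %d bytes\n", size);	/* 24 */
	return 0;
}
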
@@ -3312,6 +3484,7 @@ int perf_output_begin(struct perf_output_handle *handle,
3312 struct perf_buffer *buffer; 3484 struct perf_buffer *buffer;
3313 unsigned long tail, offset, head; 3485 unsigned long tail, offset, head;
3314 int have_lost; 3486 int have_lost;
3487 struct perf_sample_data sample_data;
3315 struct { 3488 struct {
3316 struct perf_event_header header; 3489 struct perf_event_header header;
3317 u64 id; 3490 u64 id;
@@ -3338,8 +3511,12 @@ int perf_output_begin(struct perf_output_handle *handle,
3338 goto out; 3511 goto out;
3339 3512
3340 have_lost = local_read(&buffer->lost); 3513 have_lost = local_read(&buffer->lost);
3341 if (have_lost) 3514 if (have_lost) {
3342 size += sizeof(lost_event); 3515 lost_event.header.size = sizeof(lost_event);
3516 perf_event_header__init_id(&lost_event.header, &sample_data,
3517 event);
3518 size += lost_event.header.size;
3519 }
3343 3520
3344 perf_output_get_handle(handle); 3521 perf_output_get_handle(handle);
3345 3522
@@ -3370,11 +3547,11 @@ int perf_output_begin(struct perf_output_handle *handle,
3370 if (have_lost) { 3547 if (have_lost) {
3371 lost_event.header.type = PERF_RECORD_LOST; 3548 lost_event.header.type = PERF_RECORD_LOST;
3372 lost_event.header.misc = 0; 3549 lost_event.header.misc = 0;
3373 lost_event.header.size = sizeof(lost_event);
3374 lost_event.id = event->id; 3550 lost_event.id = event->id;
3375 lost_event.lost = local_xchg(&buffer->lost, 0); 3551 lost_event.lost = local_xchg(&buffer->lost, 0);
3376 3552
3377 perf_output_put(handle, lost_event); 3553 perf_output_put(handle, lost_event);
3554 perf_event__output_id_sample(event, handle, &sample_data);
3378 } 3555 }
3379 3556
3380 return 0; 3557 return 0;
@@ -3407,28 +3584,6 @@ void perf_output_end(struct perf_output_handle *handle)
3407 rcu_read_unlock(); 3584 rcu_read_unlock();
3408} 3585}
3409 3586
3410static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
3411{
3412 /*
3413 * only top level events have the pid namespace they were created in
3414 */
3415 if (event->parent)
3416 event = event->parent;
3417
3418 return task_tgid_nr_ns(p, event->ns);
3419}
3420
3421static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
3422{
3423 /*
3424 * only top level events have the pid namespace they were created in
3425 */
3426 if (event->parent)
3427 event = event->parent;
3428
3429 return task_pid_nr_ns(p, event->ns);
3430}
3431
3432static void perf_output_read_one(struct perf_output_handle *handle, 3587static void perf_output_read_one(struct perf_output_handle *handle,
3433 struct perf_event *event, 3588 struct perf_event *event,
3434 u64 enabled, u64 running) 3589 u64 enabled, u64 running)
@@ -3603,61 +3758,16 @@ void perf_prepare_sample(struct perf_event_header *header,
3603{ 3758{
3604 u64 sample_type = event->attr.sample_type; 3759 u64 sample_type = event->attr.sample_type;
3605 3760
3606 data->type = sample_type;
3607
3608 header->type = PERF_RECORD_SAMPLE; 3761 header->type = PERF_RECORD_SAMPLE;
3609 header->size = sizeof(*header); 3762 header->size = sizeof(*header) + event->header_size;
3610 3763
3611 header->misc = 0; 3764 header->misc = 0;
3612 header->misc |= perf_misc_flags(regs); 3765 header->misc |= perf_misc_flags(regs);
3613 3766
3614 if (sample_type & PERF_SAMPLE_IP) { 3767 __perf_event_header__init_id(header, data, event);
3615 data->ip = perf_instruction_pointer(regs);
3616
3617 header->size += sizeof(data->ip);
3618 }
3619
3620 if (sample_type & PERF_SAMPLE_TID) {
3621 /* namespace issues */
3622 data->tid_entry.pid = perf_event_pid(event, current);
3623 data->tid_entry.tid = perf_event_tid(event, current);
3624
3625 header->size += sizeof(data->tid_entry);
3626 }
3627
3628 if (sample_type & PERF_SAMPLE_TIME) {
3629 data->time = perf_clock();
3630
3631 header->size += sizeof(data->time);
3632 }
3633
3634 if (sample_type & PERF_SAMPLE_ADDR)
3635 header->size += sizeof(data->addr);
3636
3637 if (sample_type & PERF_SAMPLE_ID) {
3638 data->id = primary_event_id(event);
3639
3640 header->size += sizeof(data->id);
3641 }
3642
3643 if (sample_type & PERF_SAMPLE_STREAM_ID) {
3644 data->stream_id = event->id;
3645
3646 header->size += sizeof(data->stream_id);
3647 }
3648
3649 if (sample_type & PERF_SAMPLE_CPU) {
3650 data->cpu_entry.cpu = raw_smp_processor_id();
3651 data->cpu_entry.reserved = 0;
3652
3653 header->size += sizeof(data->cpu_entry);
3654 }
3655
3656 if (sample_type & PERF_SAMPLE_PERIOD)
3657 header->size += sizeof(data->period);
3658 3768
3659 if (sample_type & PERF_SAMPLE_READ) 3769 if (sample_type & PERF_SAMPLE_IP)
3660 header->size += perf_event_read_size(event); 3770 data->ip = perf_instruction_pointer(regs);
3661 3771
3662 if (sample_type & PERF_SAMPLE_CALLCHAIN) { 3772 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
3663 int size = 1; 3773 int size = 1;
@@ -3722,23 +3832,26 @@ perf_event_read_event(struct perf_event *event,
3722 struct task_struct *task) 3832 struct task_struct *task)
3723{ 3833{
3724 struct perf_output_handle handle; 3834 struct perf_output_handle handle;
3835 struct perf_sample_data sample;
3725 struct perf_read_event read_event = { 3836 struct perf_read_event read_event = {
3726 .header = { 3837 .header = {
3727 .type = PERF_RECORD_READ, 3838 .type = PERF_RECORD_READ,
3728 .misc = 0, 3839 .misc = 0,
3729 .size = sizeof(read_event) + perf_event_read_size(event), 3840 .size = sizeof(read_event) + event->read_size,
3730 }, 3841 },
3731 .pid = perf_event_pid(event, task), 3842 .pid = perf_event_pid(event, task),
3732 .tid = perf_event_tid(event, task), 3843 .tid = perf_event_tid(event, task),
3733 }; 3844 };
3734 int ret; 3845 int ret;
3735 3846
3847 perf_event_header__init_id(&read_event.header, &sample, event);
3736 ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0); 3848 ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0);
3737 if (ret) 3849 if (ret)
3738 return; 3850 return;
3739 3851
3740 perf_output_put(&handle, read_event); 3852 perf_output_put(&handle, read_event);
3741 perf_output_read(&handle, event); 3853 perf_output_read(&handle, event);
3854 perf_event__output_id_sample(event, &handle, &sample);
3742 3855
3743 perf_output_end(&handle); 3856 perf_output_end(&handle);
3744} 3857}
@@ -3768,14 +3881,16 @@ static void perf_event_task_output(struct perf_event *event,
3768 struct perf_task_event *task_event) 3881 struct perf_task_event *task_event)
3769{ 3882{
3770 struct perf_output_handle handle; 3883 struct perf_output_handle handle;
3884 struct perf_sample_data sample;
3771 struct task_struct *task = task_event->task; 3885 struct task_struct *task = task_event->task;
3772 int size, ret; 3886 int ret, size = task_event->event_id.header.size;
3773 3887
3774 size = task_event->event_id.header.size; 3888 perf_event_header__init_id(&task_event->event_id.header, &sample, event);
3775 ret = perf_output_begin(&handle, event, size, 0, 0);
3776 3889
3890 ret = perf_output_begin(&handle, event,
3891 task_event->event_id.header.size, 0, 0);
3777 if (ret) 3892 if (ret)
3778 return; 3893 goto out;
3779 3894
3780 task_event->event_id.pid = perf_event_pid(event, task); 3895 task_event->event_id.pid = perf_event_pid(event, task);
3781 task_event->event_id.ppid = perf_event_pid(event, current); 3896 task_event->event_id.ppid = perf_event_pid(event, current);
@@ -3785,7 +3900,11 @@ static void perf_event_task_output(struct perf_event *event,
3785 3900
3786 perf_output_put(&handle, task_event->event_id); 3901 perf_output_put(&handle, task_event->event_id);
3787 3902
3903 perf_event__output_id_sample(event, &handle, &sample);
3904
3788 perf_output_end(&handle); 3905 perf_output_end(&handle);
3906out:
3907 task_event->event_id.header.size = size;
3789} 3908}
3790 3909
3791static int perf_event_task_match(struct perf_event *event) 3910static int perf_event_task_match(struct perf_event *event)
@@ -3793,7 +3912,7 @@ static int perf_event_task_match(struct perf_event *event)
3793 if (event->state < PERF_EVENT_STATE_INACTIVE) 3912 if (event->state < PERF_EVENT_STATE_INACTIVE)
3794 return 0; 3913 return 0;
3795 3914
3796 if (event->cpu != -1 && event->cpu != smp_processor_id()) 3915 if (!event_filter_match(event))
3797 return 0; 3916 return 0;
3798 3917
3799 if (event->attr.comm || event->attr.mmap || 3918 if (event->attr.comm || event->attr.mmap ||
@@ -3900,11 +4019,16 @@ static void perf_event_comm_output(struct perf_event *event,
3900 struct perf_comm_event *comm_event) 4019 struct perf_comm_event *comm_event)
3901{ 4020{
3902 struct perf_output_handle handle; 4021 struct perf_output_handle handle;
4022 struct perf_sample_data sample;
3903 int size = comm_event->event_id.header.size; 4023 int size = comm_event->event_id.header.size;
3904 int ret = perf_output_begin(&handle, event, size, 0, 0); 4024 int ret;
4025
4026 perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
4027 ret = perf_output_begin(&handle, event,
4028 comm_event->event_id.header.size, 0, 0);
3905 4029
3906 if (ret) 4030 if (ret)
3907 return; 4031 goto out;
3908 4032
3909 comm_event->event_id.pid = perf_event_pid(event, comm_event->task); 4033 comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
3910 comm_event->event_id.tid = perf_event_tid(event, comm_event->task); 4034 comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
@@ -3912,7 +4036,12 @@ static void perf_event_comm_output(struct perf_event *event,
3912 perf_output_put(&handle, comm_event->event_id); 4036 perf_output_put(&handle, comm_event->event_id);
3913 perf_output_copy(&handle, comm_event->comm, 4037 perf_output_copy(&handle, comm_event->comm,
3914 comm_event->comm_size); 4038 comm_event->comm_size);
4039
4040 perf_event__output_id_sample(event, &handle, &sample);
4041
3915 perf_output_end(&handle); 4042 perf_output_end(&handle);
4043out:
4044 comm_event->event_id.header.size = size;
3916} 4045}
3917 4046
3918static int perf_event_comm_match(struct perf_event *event) 4047static int perf_event_comm_match(struct perf_event *event)
@@ -3920,7 +4049,7 @@ static int perf_event_comm_match(struct perf_event *event)
3920 if (event->state < PERF_EVENT_STATE_INACTIVE) 4049 if (event->state < PERF_EVENT_STATE_INACTIVE)
3921 return 0; 4050 return 0;
3922 4051
3923 if (event->cpu != -1 && event->cpu != smp_processor_id()) 4052 if (!event_filter_match(event))
3924 return 0; 4053 return 0;
3925 4054
3926 if (event->attr.comm) 4055 if (event->attr.comm)
@@ -3957,7 +4086,6 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
3957 comm_event->comm_size = size; 4086 comm_event->comm_size = size;
3958 4087
3959 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; 4088 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
3960
3961 rcu_read_lock(); 4089 rcu_read_lock();
3962 list_for_each_entry_rcu(pmu, &pmus, entry) { 4090 list_for_each_entry_rcu(pmu, &pmus, entry) {
3963 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); 4091 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
@@ -4038,11 +4166,15 @@ static void perf_event_mmap_output(struct perf_event *event,
4038 struct perf_mmap_event *mmap_event) 4166 struct perf_mmap_event *mmap_event)
4039{ 4167{
4040 struct perf_output_handle handle; 4168 struct perf_output_handle handle;
4169 struct perf_sample_data sample;
4041 int size = mmap_event->event_id.header.size; 4170 int size = mmap_event->event_id.header.size;
4042 int ret = perf_output_begin(&handle, event, size, 0, 0); 4171 int ret;
4043 4172
4173 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
4174 ret = perf_output_begin(&handle, event,
4175 mmap_event->event_id.header.size, 0, 0);
4044 if (ret) 4176 if (ret)
4045 return; 4177 goto out;
4046 4178
4047 mmap_event->event_id.pid = perf_event_pid(event, current); 4179 mmap_event->event_id.pid = perf_event_pid(event, current);
4048 mmap_event->event_id.tid = perf_event_tid(event, current); 4180 mmap_event->event_id.tid = perf_event_tid(event, current);
@@ -4050,7 +4182,12 @@ static void perf_event_mmap_output(struct perf_event *event,
4050 perf_output_put(&handle, mmap_event->event_id); 4182 perf_output_put(&handle, mmap_event->event_id);
4051 perf_output_copy(&handle, mmap_event->file_name, 4183 perf_output_copy(&handle, mmap_event->file_name,
4052 mmap_event->file_size); 4184 mmap_event->file_size);
4185
4186 perf_event__output_id_sample(event, &handle, &sample);
4187
4053 perf_output_end(&handle); 4188 perf_output_end(&handle);
4189out:
4190 mmap_event->event_id.header.size = size;
4054} 4191}
4055 4192
4056static int perf_event_mmap_match(struct perf_event *event, 4193static int perf_event_mmap_match(struct perf_event *event,
@@ -4060,7 +4197,7 @@ static int perf_event_mmap_match(struct perf_event *event,
4060 if (event->state < PERF_EVENT_STATE_INACTIVE) 4197 if (event->state < PERF_EVENT_STATE_INACTIVE)
4061 return 0; 4198 return 0;
4062 4199
4063 if (event->cpu != -1 && event->cpu != smp_processor_id()) 4200 if (!event_filter_match(event))
4064 return 0; 4201 return 0;
4065 4202
4066 if ((!executable && event->attr.mmap_data) || 4203 if ((!executable && event->attr.mmap_data) ||
@@ -4205,6 +4342,7 @@ void perf_event_mmap(struct vm_area_struct *vma)
4205static void perf_log_throttle(struct perf_event *event, int enable) 4342static void perf_log_throttle(struct perf_event *event, int enable)
4206{ 4343{
4207 struct perf_output_handle handle; 4344 struct perf_output_handle handle;
4345 struct perf_sample_data sample;
4208 int ret; 4346 int ret;
4209 4347
4210 struct { 4348 struct {
@@ -4226,11 +4364,15 @@ static void perf_log_throttle(struct perf_event *event, int enable)
4226 if (enable) 4364 if (enable)
4227 throttle_event.header.type = PERF_RECORD_UNTHROTTLE; 4365 throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
4228 4366
4229 ret = perf_output_begin(&handle, event, sizeof(throttle_event), 1, 0); 4367 perf_event_header__init_id(&throttle_event.header, &sample, event);
4368
4369 ret = perf_output_begin(&handle, event,
4370 throttle_event.header.size, 1, 0);
4230 if (ret) 4371 if (ret)
4231 return; 4372 return;
4232 4373
4233 perf_output_put(&handle, throttle_event); 4374 perf_output_put(&handle, throttle_event);
4375 perf_event__output_id_sample(event, &handle, &sample);
4234 perf_output_end(&handle); 4376 perf_output_end(&handle);
4235} 4377}
4236 4378
@@ -4246,6 +4388,13 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
4246 struct hw_perf_event *hwc = &event->hw; 4388 struct hw_perf_event *hwc = &event->hw;
4247 int ret = 0; 4389 int ret = 0;
4248 4390
4391 /*
4392 * Non-sampling counters might still use the PMI to fold short
4393 * hardware counters; ignore those.
4394 */
4395 if (unlikely(!is_sampling_event(event)))
4396 return 0;
4397
4249 if (!throttle) { 4398 if (!throttle) {
4250 hwc->interrupts++; 4399 hwc->interrupts++;
4251 } else { 4400 } else {
@@ -4391,7 +4540,7 @@ static void perf_swevent_event(struct perf_event *event, u64 nr,
4391 if (!regs) 4540 if (!regs)
4392 return; 4541 return;
4393 4542
4394 if (!hwc->sample_period) 4543 if (!is_sampling_event(event))
4395 return; 4544 return;
4396 4545
4397 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) 4546 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
@@ -4518,7 +4667,7 @@ int perf_swevent_get_recursion_context(void)
4518} 4667}
4519EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); 4668EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
4520 4669
4521void inline perf_swevent_put_recursion_context(int rctx) 4670inline void perf_swevent_put_recursion_context(int rctx)
4522{ 4671{
4523 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); 4672 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
4524 4673
@@ -4554,7 +4703,7 @@ static int perf_swevent_add(struct perf_event *event, int flags)
4554 struct hw_perf_event *hwc = &event->hw; 4703 struct hw_perf_event *hwc = &event->hw;
4555 struct hlist_head *head; 4704 struct hlist_head *head;
4556 4705
4557 if (hwc->sample_period) { 4706 if (is_sampling_event(event)) {
4558 hwc->last_period = hwc->sample_period; 4707 hwc->last_period = hwc->sample_period;
4559 perf_swevent_set_period(event); 4708 perf_swevent_set_period(event);
4560 } 4709 }
@@ -4811,15 +4960,6 @@ static int perf_tp_event_init(struct perf_event *event)
4811 if (event->attr.type != PERF_TYPE_TRACEPOINT) 4960 if (event->attr.type != PERF_TYPE_TRACEPOINT)
4812 return -ENOENT; 4961 return -ENOENT;
4813 4962
4814 /*
4815 * Raw tracepoint data is a severe data leak, only allow root to
4816 * have these.
4817 */
4818 if ((event->attr.sample_type & PERF_SAMPLE_RAW) &&
4819 perf_paranoid_tracepoint_raw() &&
4820 !capable(CAP_SYS_ADMIN))
4821 return -EPERM;
4822
4823 err = perf_trace_init(event); 4963 err = perf_trace_init(event);
4824 if (err) 4964 if (err)
4825 return err; 4965 return err;
@@ -4842,7 +4982,7 @@ static struct pmu perf_tracepoint = {
4842 4982
4843static inline void perf_tp_register(void) 4983static inline void perf_tp_register(void)
4844{ 4984{
4845 perf_pmu_register(&perf_tracepoint); 4985 perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
4846} 4986}
4847 4987
4848static int perf_event_set_filter(struct perf_event *event, void __user *arg) 4988static int perf_event_set_filter(struct perf_event *event, void __user *arg)
@@ -4932,31 +5072,33 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
4932static void perf_swevent_start_hrtimer(struct perf_event *event) 5072static void perf_swevent_start_hrtimer(struct perf_event *event)
4933{ 5073{
4934 struct hw_perf_event *hwc = &event->hw; 5074 struct hw_perf_event *hwc = &event->hw;
5075 s64 period;
5076
5077 if (!is_sampling_event(event))
5078 return;
4935 5079
4936 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 5080 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4937 hwc->hrtimer.function = perf_swevent_hrtimer; 5081 hwc->hrtimer.function = perf_swevent_hrtimer;
4938 if (hwc->sample_period) {
4939 s64 period = local64_read(&hwc->period_left);
4940 5082
4941 if (period) { 5083 period = local64_read(&hwc->period_left);
4942 if (period < 0) 5084 if (period) {
4943 period = 10000; 5085 if (period < 0)
5086 period = 10000;
4944 5087
4945 local64_set(&hwc->period_left, 0); 5088 local64_set(&hwc->period_left, 0);
4946 } else { 5089 } else {
4947 period = max_t(u64, 10000, hwc->sample_period); 5090 period = max_t(u64, 10000, hwc->sample_period);
4948 } 5091 }
4949 __hrtimer_start_range_ns(&hwc->hrtimer, 5092 __hrtimer_start_range_ns(&hwc->hrtimer,
4950 ns_to_ktime(period), 0, 5093 ns_to_ktime(period), 0,
4951 HRTIMER_MODE_REL_PINNED, 0); 5094 HRTIMER_MODE_REL_PINNED, 0);
4952 }
4953} 5095}
4954 5096
4955static void perf_swevent_cancel_hrtimer(struct perf_event *event) 5097static void perf_swevent_cancel_hrtimer(struct perf_event *event)
4956{ 5098{
4957 struct hw_perf_event *hwc = &event->hw; 5099 struct hw_perf_event *hwc = &event->hw;
4958 5100
4959 if (hwc->sample_period) { 5101 if (is_sampling_event(event)) {
4960 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); 5102 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
4961 local64_set(&hwc->period_left, ktime_to_ns(remaining)); 5103 local64_set(&hwc->period_left, ktime_to_ns(remaining));
4962 5104
@@ -5184,8 +5326,63 @@ static void free_pmu_context(struct pmu *pmu)
5184out: 5326out:
5185 mutex_unlock(&pmus_lock); 5327 mutex_unlock(&pmus_lock);
5186} 5328}
5329static struct idr pmu_idr;
5330
5331static ssize_t
5332type_show(struct device *dev, struct device_attribute *attr, char *page)
5333{
5334 struct pmu *pmu = dev_get_drvdata(dev);
5335
5336 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
5337}
5338
5339static struct device_attribute pmu_dev_attrs[] = {
5340 __ATTR_RO(type),
5341 __ATTR_NULL,
5342};
5343
5344static int pmu_bus_running;
5345static struct bus_type pmu_bus = {
5346 .name = "event_source",
5347 .dev_attrs = pmu_dev_attrs,
5348};
5349
5350static void pmu_dev_release(struct device *dev)
5351{
5352 kfree(dev);
5353}
5187 5354
5188int perf_pmu_register(struct pmu *pmu) 5355static int pmu_dev_alloc(struct pmu *pmu)
5356{
5357 int ret = -ENOMEM;
5358
5359 pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
5360 if (!pmu->dev)
5361 goto out;
5362
5363 device_initialize(pmu->dev);
5364 ret = dev_set_name(pmu->dev, "%s", pmu->name);
5365 if (ret)
5366 goto free_dev;
5367
5368 dev_set_drvdata(pmu->dev, pmu);
5369 pmu->dev->bus = &pmu_bus;
5370 pmu->dev->release = pmu_dev_release;
5371 ret = device_add(pmu->dev);
5372 if (ret)
5373 goto free_dev;
5374
5375out:
5376 return ret;
5377
5378free_dev:
5379 put_device(pmu->dev);
5380 goto out;
5381}
5382
5383static struct lock_class_key cpuctx_mutex;
5384
5385int perf_pmu_register(struct pmu *pmu, char *name, int type)
5189{ 5386{
5190 int cpu, ret; 5387 int cpu, ret;
5191 5388
@@ -5195,19 +5392,45 @@ int perf_pmu_register(struct pmu *pmu)
5195 if (!pmu->pmu_disable_count) 5392 if (!pmu->pmu_disable_count)
5196 goto unlock; 5393 goto unlock;
5197 5394
5395 pmu->type = -1;
5396 if (!name)
5397 goto skip_type;
5398 pmu->name = name;
5399
5400 if (type < 0) {
5401 int err = idr_pre_get(&pmu_idr, GFP_KERNEL);
5402 if (!err)
5403 goto free_pdc;
5404
5405 err = idr_get_new_above(&pmu_idr, pmu, PERF_TYPE_MAX, &type);
5406 if (err) {
5407 ret = err;
5408 goto free_pdc;
5409 }
5410 }
5411 pmu->type = type;
5412
5413 if (pmu_bus_running) {
5414 ret = pmu_dev_alloc(pmu);
5415 if (ret)
5416 goto free_idr;
5417 }
5418
5419skip_type:
5198 pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr); 5420 pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
5199 if (pmu->pmu_cpu_context) 5421 if (pmu->pmu_cpu_context)
5200 goto got_cpu_context; 5422 goto got_cpu_context;
5201 5423
5202 pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context); 5424 pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
5203 if (!pmu->pmu_cpu_context) 5425 if (!pmu->pmu_cpu_context)
5204 goto free_pdc; 5426 goto free_dev;
5205 5427
5206 for_each_possible_cpu(cpu) { 5428 for_each_possible_cpu(cpu) {
5207 struct perf_cpu_context *cpuctx; 5429 struct perf_cpu_context *cpuctx;
5208 5430
5209 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); 5431 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
5210 __perf_event_init_context(&cpuctx->ctx); 5432 __perf_event_init_context(&cpuctx->ctx);
5433 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
5211 cpuctx->ctx.type = cpu_context; 5434 cpuctx->ctx.type = cpu_context;
5212 cpuctx->ctx.pmu = pmu; 5435 cpuctx->ctx.pmu = pmu;
5213 cpuctx->jiffies_interval = 1; 5436 cpuctx->jiffies_interval = 1;
@@ -5245,6 +5468,14 @@ unlock:
5245 5468
5246 return ret; 5469 return ret;
5247 5470
5471free_dev:
5472 device_del(pmu->dev);
5473 put_device(pmu->dev);
5474
5475free_idr:
5476 if (pmu->type >= PERF_TYPE_MAX)
5477 idr_remove(&pmu_idr, pmu->type);
5478
5248free_pdc: 5479free_pdc:
5249 free_percpu(pmu->pmu_disable_count); 5480 free_percpu(pmu->pmu_disable_count);
5250 goto unlock; 5481 goto unlock;
@@ -5264,6 +5495,10 @@ void perf_pmu_unregister(struct pmu *pmu)
5264 synchronize_rcu(); 5495 synchronize_rcu();
5265 5496
5266 free_percpu(pmu->pmu_disable_count); 5497 free_percpu(pmu->pmu_disable_count);
5498 if (pmu->type >= PERF_TYPE_MAX)
5499 idr_remove(&pmu_idr, pmu->type);
5500 device_del(pmu->dev);
5501 put_device(pmu->dev);
5267 free_pmu_context(pmu); 5502 free_pmu_context(pmu);
5268} 5503}
5269 5504
@@ -5273,6 +5508,13 @@ struct pmu *perf_init_event(struct perf_event *event)
5273 int idx; 5508 int idx;
5274 5509
5275 idx = srcu_read_lock(&pmus_srcu); 5510 idx = srcu_read_lock(&pmus_srcu);
5511
5512 rcu_read_lock();
5513 pmu = idr_find(&pmu_idr, event->attr.type);
5514 rcu_read_unlock();
5515 if (pmu)
5516 goto unlock;
5517
5276 list_for_each_entry_rcu(pmu, &pmus, entry) { 5518 list_for_each_entry_rcu(pmu, &pmus, entry) {
5277 int ret = pmu->event_init(event); 5519 int ret = pmu->event_init(event);
5278 if (!ret) 5520 if (!ret)
@@ -5305,6 +5547,11 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
5305 struct hw_perf_event *hwc; 5547 struct hw_perf_event *hwc;
5306 long err; 5548 long err;
5307 5549
5550 if ((unsigned)cpu >= nr_cpu_ids) {
5551 if (!task || cpu != -1)
5552 return ERR_PTR(-EINVAL);
5553 }
5554
5308 event = kzalloc(sizeof(*event), GFP_KERNEL); 5555 event = kzalloc(sizeof(*event), GFP_KERNEL);
5309 if (!event) 5556 if (!event)
5310 return ERR_PTR(-ENOMEM); 5557 return ERR_PTR(-ENOMEM);
@@ -5353,7 +5600,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
5353 5600
5354 if (!overflow_handler && parent_event) 5601 if (!overflow_handler && parent_event)
5355 overflow_handler = parent_event->overflow_handler; 5602 overflow_handler = parent_event->overflow_handler;
5356 5603
5357 event->overflow_handler = overflow_handler; 5604 event->overflow_handler = overflow_handler;
5358 5605
5359 if (attr->disabled) 5606 if (attr->disabled)
@@ -5738,6 +5985,12 @@ SYSCALL_DEFINE5(perf_event_open,
5738 mutex_unlock(&current->perf_event_mutex); 5985 mutex_unlock(&current->perf_event_mutex);
5739 5986
5740 /* 5987 /*
5988 * Precalculate sample_data sizes
5989 */
5990 perf_event__header_size(event);
5991 perf_event__id_header_size(event);
5992
5993 /*
5741 * Drop the reference on the group_event after placing the 5994 * Drop the reference on the group_event after placing the
5742 * new event on the sibling_list. This ensures destruction 5995 * new event on the sibling_list. This ensures destruction
5743 * of the group leader will find the pointer to itself in 5996 * of the group leader will find the pointer to itself in
@@ -5883,7 +6136,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
5883 * scheduled, so we are now safe from rescheduling changing 6136 * scheduled, so we are now safe from rescheduling changing
5884 * our context. 6137 * our context.
5885 */ 6138 */
5886 child_ctx = child->perf_event_ctxp[ctxn]; 6139 child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]);
5887 task_ctx_sched_out(child_ctx, EVENT_ALL); 6140 task_ctx_sched_out(child_ctx, EVENT_ALL);
5888 6141
5889 /* 6142 /*
@@ -6090,6 +6343,12 @@ inherit_event(struct perf_event *parent_event,
6090 child_event->overflow_handler = parent_event->overflow_handler; 6343 child_event->overflow_handler = parent_event->overflow_handler;
6091 6344
6092 /* 6345 /*
6346 * Precalculate sample_data sizes
6347 */
6348 perf_event__header_size(child_event);
6349 perf_event__id_header_size(child_event);
6350
6351 /*
6093 * Link it up in the child's context: 6352 * Link it up in the child's context:
6094 */ 6353 */
6095 raw_spin_lock_irqsave(&child_ctx->lock, flags); 6354 raw_spin_lock_irqsave(&child_ctx->lock, flags);
@@ -6190,11 +6449,6 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
6190 unsigned long flags; 6449 unsigned long flags;
6191 int ret = 0; 6450 int ret = 0;
6192 6451
6193 child->perf_event_ctxp[ctxn] = NULL;
6194
6195 mutex_init(&child->perf_event_mutex);
6196 INIT_LIST_HEAD(&child->perf_event_list);
6197
6198 if (likely(!parent->perf_event_ctxp[ctxn])) 6452 if (likely(!parent->perf_event_ctxp[ctxn]))
6199 return 0; 6453 return 0;
6200 6454
@@ -6246,7 +6500,6 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
6246 6500
6247 raw_spin_lock_irqsave(&parent_ctx->lock, flags); 6501 raw_spin_lock_irqsave(&parent_ctx->lock, flags);
6248 parent_ctx->rotate_disable = 0; 6502 parent_ctx->rotate_disable = 0;
6249 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
6250 6503
6251 child_ctx = child->perf_event_ctxp[ctxn]; 6504 child_ctx = child->perf_event_ctxp[ctxn];
6252 6505
@@ -6254,12 +6507,11 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
6254 /* 6507 /*
6255 * Mark the child context as a clone of the parent 6508 * Mark the child context as a clone of the parent
6256 * context, or of whatever the parent is a clone of. 6509 * context, or of whatever the parent is a clone of.
6257 * Note that if the parent is a clone, it could get 6510 *
6258 * uncloned at any point, but that doesn't matter 6511 * Note that if the parent is a clone, the holding of
6259 * because the list of events and the generation 6512 * parent_ctx->lock prevents it from being uncloned.
6260 * count can't have changed since we took the mutex.
6261 */ 6513 */
6262 cloned_ctx = rcu_dereference(parent_ctx->parent_ctx); 6514 cloned_ctx = parent_ctx->parent_ctx;
6263 if (cloned_ctx) { 6515 if (cloned_ctx) {
6264 child_ctx->parent_ctx = cloned_ctx; 6516 child_ctx->parent_ctx = cloned_ctx;
6265 child_ctx->parent_gen = parent_ctx->parent_gen; 6517 child_ctx->parent_gen = parent_ctx->parent_gen;
@@ -6270,6 +6522,7 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
6270 get_ctx(child_ctx->parent_ctx); 6522 get_ctx(child_ctx->parent_ctx);
6271 } 6523 }
6272 6524
6525 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
6273 mutex_unlock(&parent_ctx->mutex); 6526 mutex_unlock(&parent_ctx->mutex);
6274 6527
6275 perf_unpin_context(parent_ctx); 6528 perf_unpin_context(parent_ctx);
@@ -6284,6 +6537,10 @@ int perf_event_init_task(struct task_struct *child)
6284{ 6537{
6285 int ctxn, ret; 6538 int ctxn, ret;
6286 6539
6540 memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
6541 mutex_init(&child->perf_event_mutex);
6542 INIT_LIST_HEAD(&child->perf_event_list);
6543
6287 for_each_task_context_nr(ctxn) { 6544 for_each_task_context_nr(ctxn) {
6288 ret = perf_event_init_context(child, ctxn); 6545 ret = perf_event_init_context(child, ctxn);
6289 if (ret) 6546 if (ret)
@@ -6320,7 +6577,7 @@ static void __cpuinit perf_event_init_cpu(int cpu)
6320 mutex_unlock(&swhash->hlist_mutex); 6577 mutex_unlock(&swhash->hlist_mutex);
6321} 6578}
6322 6579
6323#ifdef CONFIG_HOTPLUG_CPU 6580#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC
6324static void perf_pmu_rotate_stop(struct pmu *pmu) 6581static void perf_pmu_rotate_stop(struct pmu *pmu)
6325{ 6582{
6326 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); 6583 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
@@ -6374,6 +6631,26 @@ static void perf_event_exit_cpu(int cpu)
6374static inline void perf_event_exit_cpu(int cpu) { } 6631static inline void perf_event_exit_cpu(int cpu) { }
6375#endif 6632#endif
6376 6633
6634static int
6635perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
6636{
6637 int cpu;
6638
6639 for_each_online_cpu(cpu)
6640 perf_event_exit_cpu(cpu);
6641
6642 return NOTIFY_OK;
6643}
6644
6645/*
6646 * Run the perf reboot notifier at the very last possible moment so that
6647 * the generic watchdog code runs as long as possible.
6648 */
6649static struct notifier_block perf_reboot_notifier = {
6650 .notifier_call = perf_reboot,
6651 .priority = INT_MIN,
6652};
6653
6377static int __cpuinit 6654static int __cpuinit
6378perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) 6655perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
6379{ 6656{
@@ -6402,14 +6679,45 @@ void __init perf_event_init(void)
6402{ 6679{
6403 int ret; 6680 int ret;
6404 6681
6682 idr_init(&pmu_idr);
6683
6405 perf_event_init_all_cpus(); 6684 perf_event_init_all_cpus();
6406 init_srcu_struct(&pmus_srcu); 6685 init_srcu_struct(&pmus_srcu);
6407 perf_pmu_register(&perf_swevent); 6686 perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
6408 perf_pmu_register(&perf_cpu_clock); 6687 perf_pmu_register(&perf_cpu_clock, NULL, -1);
6409 perf_pmu_register(&perf_task_clock); 6688 perf_pmu_register(&perf_task_clock, NULL, -1);
6410 perf_tp_register(); 6689 perf_tp_register();
6411 perf_cpu_notifier(perf_cpu_notify); 6690 perf_cpu_notifier(perf_cpu_notify);
6691 register_reboot_notifier(&perf_reboot_notifier);
6412 6692
6413 ret = init_hw_breakpoint(); 6693 ret = init_hw_breakpoint();
6414 WARN(ret, "hw_breakpoint initialization failed with: %d", ret); 6694 WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
6415} 6695}
6696
6697static int __init perf_event_sysfs_init(void)
6698{
6699 struct pmu *pmu;
6700 int ret;
6701
6702 mutex_lock(&pmus_lock);
6703
6704 ret = bus_register(&pmu_bus);
6705 if (ret)
6706 goto unlock;
6707
6708 list_for_each_entry(pmu, &pmus, entry) {
6709 if (!pmu->name || pmu->type < 0)
6710 continue;
6711
6712 ret = pmu_dev_alloc(pmu);
6713 WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
6714 }
6715 pmu_bus_running = 1;
6716 ret = 0;
6717
6718unlock:
6719 mutex_unlock(&pmus_lock);
6720
6721 return ret;
6722}
6723device_initcall(perf_event_sysfs_init);
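
The perf_event.c changes above hand each named PMU a type id (fixed, or dynamically allocated from pmu_idr when type < 0) and register it on the new "event_source" bus, whose per-PMU device exposes that id through a sysfs "type" attribute. Below is a minimal user-space sketch of how such a PMU is meant to be discovered and opened; the PMU name "cpu", the config value 0, and the one-second counting window are illustrative assumptions, not part of the patch.

/*
 * Stand-alone user-space sketch (not part of the patch): look up a PMU's
 * dynamic type id in sysfs and open a counter on it with perf_event_open().
 * The PMU name "cpu" and config value 0 are placeholders.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static long sys_perf_event_open(struct perf_event_attr *attr, pid_t pid,
				int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_event_attr attr;
	long long count;
	int type, fd;
	/* With the usual sysfs bus layout, each named PMU shows up here. */
	FILE *f = fopen("/sys/bus/event_source/devices/cpu/type", "r");

	if (!f || fscanf(f, "%d", &type) != 1) {
		perror("read PMU type");
		return 1;
	}
	fclose(f);

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = type;	/* id handed out by perf_pmu_register() */
	attr.config = 0;	/* placeholder event for this PMU */

	fd = sys_perf_event_open(&attr, 0, -1, -1, 0);	/* count in this task */
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}
	sleep(1);
	if (read(fd, &count, sizeof(count)) == (ssize_t)sizeof(count))
		printf("count: %lld\n", count);
	close(fd);
	return 0;
}

The same lookup works for PMUs registered with a fixed type; for instance the "software" PMU registered above simply reports PERF_TYPE_SOFTWARE from its type attribute.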
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 9ca4973f736d..93bd2eb2bc53 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -145,7 +145,13 @@ static int common_timer_del(struct k_itimer *timer);
145 145
146static enum hrtimer_restart posix_timer_fn(struct hrtimer *data); 146static enum hrtimer_restart posix_timer_fn(struct hrtimer *data);
147 147
148static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags); 148static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags);
149
150#define lock_timer(tid, flags) \
151({ struct k_itimer *__timr; \
152 __cond_lock(&__timr->it_lock, __timr = __lock_timer(tid, flags)); \
153 __timr; \
154})
149 155
150static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) 156static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
151{ 157{
@@ -619,7 +625,7 @@ out:
619 * the find to the timer lock. To avoid a deadlock, the timer id MUST 625 * the find to the timer lock. To avoid a deadlock, the timer id MUST
620 * be released without holding the timer lock. 626 * be released without holding the timer lock.
621 */ 627 */
622static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags) 628static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
623{ 629{
624 struct k_itimer *timr; 630 struct k_itimer *timr;
625 /* 631 /*
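
The lock_timer() rework above exists so that sparse can see that __lock_timer() returns with the timer's it_lock held only on success. Below is a stand-alone sketch of the same __cond_lock() idiom that compiles outside the kernel; the stub macros are modelled on the ones in <linux/compiler.h>, and the mutex and helper names are invented for illustration.

/*
 * Stand-alone sketch of the conditional-lock annotation idiom behind the new
 * lock_timer() wrapper.  Outside a sparse run the stubs compile away.
 */
#include <stdio.h>
#include <pthread.h>

#ifdef __CHECKER__
# define __acquire(x)		__context__(x, 1)
# define __cond_lock(x, c)	((c) ? ({ __acquire(x); 1; }) : 0)
#else
# define __acquire(x)		(void)0
# define __cond_lock(x, c)	(c)
#endif

static pthread_mutex_t resource_lock = PTHREAD_MUTEX_INITIALIZER;
static int resource_valid = 1;

/* Returns nonzero with resource_lock held, zero otherwise (like __lock_timer()). */
static int __try_get_resource(void)
{
	pthread_mutex_lock(&resource_lock);
	if (resource_valid)
		return 1;
	pthread_mutex_unlock(&resource_lock);
	return 0;
}

/* The wrapper lets a static checker see the lock is taken only on success. */
#define try_get_resource() \
	__cond_lock(&resource_lock, __try_get_resource())

int main(void)
{
	if (try_get_resource()) {
		printf("resource locked; do work, then unlock\n");
		pthread_mutex_unlock(&resource_lock);
	}
	return 0;
}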
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index a5aff3ebad38..265729966ece 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -100,13 +100,9 @@ config PM_SLEEP_ADVANCED_DEBUG
100 depends on PM_ADVANCED_DEBUG 100 depends on PM_ADVANCED_DEBUG
101 default n 101 default n
102 102
103config SUSPEND_NVS
104 bool
105
106config SUSPEND 103config SUSPEND
107 bool "Suspend to RAM and standby" 104 bool "Suspend to RAM and standby"
108 depends on PM && ARCH_SUSPEND_POSSIBLE 105 depends on PM && ARCH_SUSPEND_POSSIBLE
109 select SUSPEND_NVS if HAS_IOMEM
110 default y 106 default y
111 ---help--- 107 ---help---
112 Allow the system to enter sleep states in which main memory is 108 Allow the system to enter sleep states in which main memory is
@@ -140,7 +136,6 @@ config HIBERNATION
140 depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE 136 depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE
141 select LZO_COMPRESS 137 select LZO_COMPRESS
142 select LZO_DECOMPRESS 138 select LZO_DECOMPRESS
143 select SUSPEND_NVS if HAS_IOMEM
144 ---help--- 139 ---help---
145 Enable the suspend to disk (STD) functionality, which is usually 140 Enable the suspend to disk (STD) functionality, which is usually
146 called "hibernation" in user interfaces. STD checkpoints the 141 called "hibernation" in user interfaces. STD checkpoints the
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index f9063c6b185d..c350e18b53e3 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -1,7 +1,4 @@
1 1ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG
2ifeq ($(CONFIG_PM_DEBUG),y)
3EXTRA_CFLAGS += -DDEBUG
4endif
5 2
6obj-$(CONFIG_PM) += main.o 3obj-$(CONFIG_PM) += main.o
7obj-$(CONFIG_PM_SLEEP) += console.o 4obj-$(CONFIG_PM_SLEEP) += console.o
@@ -10,6 +7,5 @@ obj-$(CONFIG_SUSPEND) += suspend.o
10obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o 7obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o
11obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ 8obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \
12 block_io.o 9 block_io.o
13obj-$(CONFIG_SUSPEND_NVS) += nvs.o
14 10
15obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o 11obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 048d0b514831..1832bd264219 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -51,18 +51,18 @@ enum {
51 51
52static int hibernation_mode = HIBERNATION_SHUTDOWN; 52static int hibernation_mode = HIBERNATION_SHUTDOWN;
53 53
54static struct platform_hibernation_ops *hibernation_ops; 54static const struct platform_hibernation_ops *hibernation_ops;
55 55
56/** 56/**
57 * hibernation_set_ops - set the global hibernate operations 57 * hibernation_set_ops - set the global hibernate operations
58 * @ops: the hibernation operations to use in subsequent hibernation transitions 58 * @ops: the hibernation operations to use in subsequent hibernation transitions
59 */ 59 */
60 60
61void hibernation_set_ops(struct platform_hibernation_ops *ops) 61void hibernation_set_ops(const struct platform_hibernation_ops *ops)
62{ 62{
63 if (ops && !(ops->begin && ops->end && ops->pre_snapshot 63 if (ops && !(ops->begin && ops->end && ops->pre_snapshot
64 && ops->prepare && ops->finish && ops->enter && ops->pre_restore 64 && ops->prepare && ops->finish && ops->enter && ops->pre_restore
65 && ops->restore_cleanup)) { 65 && ops->restore_cleanup && ops->leave)) {
66 WARN_ON(1); 66 WARN_ON(1);
67 return; 67 return;
68 } 68 }
@@ -278,7 +278,7 @@ static int create_image(int platform_mode)
278 goto Enable_irqs; 278 goto Enable_irqs;
279 } 279 }
280 280
281 if (hibernation_test(TEST_CORE) || !pm_check_wakeup_events()) 281 if (hibernation_test(TEST_CORE) || pm_wakeup_pending())
282 goto Power_up; 282 goto Power_up;
283 283
284 in_suspend = 1; 284 in_suspend = 1;
@@ -516,7 +516,7 @@ int hibernation_platform_enter(void)
516 516
517 local_irq_disable(); 517 local_irq_disable();
518 sysdev_suspend(PMSG_HIBERNATE); 518 sysdev_suspend(PMSG_HIBERNATE);
519 if (!pm_check_wakeup_events()) { 519 if (pm_wakeup_pending()) {
520 error = -EAGAIN; 520 error = -EAGAIN;
521 goto Power_up; 521 goto Power_up;
522 } 522 }
@@ -647,6 +647,7 @@ int hibernate(void)
647 swsusp_free(); 647 swsusp_free();
648 if (!error) 648 if (!error)
649 power_down(); 649 power_down();
650 in_suspend = 0;
650 pm_restore_gfp_mask(); 651 pm_restore_gfp_mask();
651 } else { 652 } else {
652 pr_debug("PM: Image restored successfully.\n"); 653 pr_debug("PM: Image restored successfully.\n");
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 7b5db6a8561e..701853042c28 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -326,7 +326,7 @@ EXPORT_SYMBOL_GPL(pm_wq);
326 326
327static int __init pm_start_workqueue(void) 327static int __init pm_start_workqueue(void)
328{ 328{
329 pm_wq = alloc_workqueue("pm", WQ_FREEZEABLE, 0); 329 pm_wq = alloc_workqueue("pm", WQ_FREEZABLE, 0);
330 330
331 return pm_wq ? 0 : -ENOMEM; 331 return pm_wq ? 0 : -ENOMEM;
332} 332}
diff --git a/kernel/power/nvs.c b/kernel/power/nvs.c
deleted file mode 100644
index 1836db60bbb6..000000000000
--- a/kernel/power/nvs.c
+++ /dev/null
@@ -1,136 +0,0 @@
1/*
2 * linux/kernel/power/hibernate_nvs.c - Routines for handling NVS memory
3 *
4 * Copyright (C) 2008,2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
5 *
6 * This file is released under the GPLv2.
7 */
8
9#include <linux/io.h>
10#include <linux/kernel.h>
11#include <linux/list.h>
12#include <linux/mm.h>
13#include <linux/slab.h>
14#include <linux/suspend.h>
15
16/*
17 * Platforms, like ACPI, may want us to save some memory used by them during
18 * suspend and to restore the contents of this memory during the subsequent
19 * resume. The code below implements a mechanism allowing us to do that.
20 */
21
22struct nvs_page {
23 unsigned long phys_start;
24 unsigned int size;
25 void *kaddr;
26 void *data;
27 struct list_head node;
28};
29
30static LIST_HEAD(nvs_list);
31
32/**
33 * suspend_nvs_register - register platform NVS memory region to save
34 * @start - physical address of the region
35 * @size - size of the region
36 *
37 * The NVS region need not be page-aligned (both ends) and we arrange
38 * things so that the data from page-aligned addresses in this region will
39 * be copied into separate RAM pages.
40 */
41int suspend_nvs_register(unsigned long start, unsigned long size)
42{
43 struct nvs_page *entry, *next;
44
45 while (size > 0) {
46 unsigned int nr_bytes;
47
48 entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL);
49 if (!entry)
50 goto Error;
51
52 list_add_tail(&entry->node, &nvs_list);
53 entry->phys_start = start;
54 nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK);
55 entry->size = (size < nr_bytes) ? size : nr_bytes;
56
57 start += entry->size;
58 size -= entry->size;
59 }
60 return 0;
61
62 Error:
63 list_for_each_entry_safe(entry, next, &nvs_list, node) {
64 list_del(&entry->node);
65 kfree(entry);
66 }
67 return -ENOMEM;
68}
69
70/**
71 * suspend_nvs_free - free data pages allocated for saving NVS regions
72 */
73void suspend_nvs_free(void)
74{
75 struct nvs_page *entry;
76
77 list_for_each_entry(entry, &nvs_list, node)
78 if (entry->data) {
79 free_page((unsigned long)entry->data);
80 entry->data = NULL;
81 if (entry->kaddr) {
82 iounmap(entry->kaddr);
83 entry->kaddr = NULL;
84 }
85 }
86}
87
88/**
89 * suspend_nvs_alloc - allocate memory necessary for saving NVS regions
90 */
91int suspend_nvs_alloc(void)
92{
93 struct nvs_page *entry;
94
95 list_for_each_entry(entry, &nvs_list, node) {
96 entry->data = (void *)__get_free_page(GFP_KERNEL);
97 if (!entry->data) {
98 suspend_nvs_free();
99 return -ENOMEM;
100 }
101 }
102 return 0;
103}
104
105/**
106 * suspend_nvs_save - save NVS memory regions
107 */
108void suspend_nvs_save(void)
109{
110 struct nvs_page *entry;
111
112 printk(KERN_INFO "PM: Saving platform NVS memory\n");
113
114 list_for_each_entry(entry, &nvs_list, node)
115 if (entry->data) {
116 entry->kaddr = ioremap(entry->phys_start, entry->size);
117 memcpy(entry->data, entry->kaddr, entry->size);
118 }
119}
120
121/**
122 * suspend_nvs_restore - restore NVS memory regions
123 *
124 * This function is going to be called with interrupts disabled, so it
125 * cannot iounmap the virtual addresses used to access the NVS region.
126 */
127void suspend_nvs_restore(void)
128{
129 struct nvs_page *entry;
130
131 printk(KERN_INFO "PM: Restoring platform NVS memory\n");
132
133 list_for_each_entry(entry, &nvs_list, node)
134 if (entry->data)
135 memcpy(entry->kaddr, entry->data, entry->size);
136}
diff --git a/kernel/power/process.c b/kernel/power/process.c
index e50b4c1b2a0f..0cf3a27a6c9d 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -22,7 +22,7 @@
22 */ 22 */
23#define TIMEOUT (20 * HZ) 23#define TIMEOUT (20 * HZ)
24 24
25static inline int freezeable(struct task_struct * p) 25static inline int freezable(struct task_struct * p)
26{ 26{
27 if ((p == current) || 27 if ((p == current) ||
28 (p->flags & PF_NOFREEZE) || 28 (p->flags & PF_NOFREEZE) ||
@@ -53,7 +53,7 @@ static int try_to_freeze_tasks(bool sig_only)
53 todo = 0; 53 todo = 0;
54 read_lock(&tasklist_lock); 54 read_lock(&tasklist_lock);
55 do_each_thread(g, p) { 55 do_each_thread(g, p) {
56 if (frozen(p) || !freezeable(p)) 56 if (frozen(p) || !freezable(p))
57 continue; 57 continue;
58 58
59 if (!freeze_task(p, sig_only)) 59 if (!freeze_task(p, sig_only))
@@ -64,6 +64,12 @@ static int try_to_freeze_tasks(bool sig_only)
64 * perturb a task in TASK_STOPPED or TASK_TRACED. 64 * perturb a task in TASK_STOPPED or TASK_TRACED.
65 * It is "frozen enough". If the task does wake 65 * It is "frozen enough". If the task does wake
66 * up, it will immediately call try_to_freeze. 66 * up, it will immediately call try_to_freeze.
67 *
68 * Because freeze_task() goes through p's
69 * scheduler lock after setting TIF_FREEZE, it's
70 * guaranteed that either we see TASK_RUNNING or
71 * try_to_stop() after schedule() in ptrace/signal
72 * stop sees TIF_FREEZE.
67 */ 73 */
68 if (!task_is_stopped_or_traced(p) && 74 if (!task_is_stopped_or_traced(p) &&
69 !freezer_should_skip(p)) 75 !freezer_should_skip(p))
@@ -79,7 +85,7 @@ static int try_to_freeze_tasks(bool sig_only)
79 if (!todo || time_after(jiffies, end_time)) 85 if (!todo || time_after(jiffies, end_time))
80 break; 86 break;
81 87
82 if (!pm_check_wakeup_events()) { 88 if (pm_wakeup_pending()) {
83 wakeup = true; 89 wakeup = true;
84 break; 90 break;
85 } 91 }
@@ -161,7 +167,7 @@ static void thaw_tasks(bool nosig_only)
161 167
162 read_lock(&tasklist_lock); 168 read_lock(&tasklist_lock);
163 do_each_thread(g, p) { 169 do_each_thread(g, p) {
164 if (!freezeable(p)) 170 if (!freezable(p))
165 continue; 171 continue;
166 172
167 if (nosig_only && should_send_signal(p)) 173 if (nosig_only && should_send_signal(p))
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 0dac75ea4456..64db648ff911 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1519,11 +1519,8 @@ static int
1519swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, 1519swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
1520 unsigned int nr_pages, unsigned int nr_highmem) 1520 unsigned int nr_pages, unsigned int nr_highmem)
1521{ 1521{
1522 int error = 0;
1523
1524 if (nr_highmem > 0) { 1522 if (nr_highmem > 0) {
1525 error = get_highmem_buffer(PG_ANY); 1523 if (get_highmem_buffer(PG_ANY))
1526 if (error)
1527 goto err_out; 1524 goto err_out;
1528 if (nr_highmem > alloc_highmem) { 1525 if (nr_highmem > alloc_highmem) {
1529 nr_highmem -= alloc_highmem; 1526 nr_highmem -= alloc_highmem;
@@ -1546,7 +1543,7 @@ swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
1546 1543
1547 err_out: 1544 err_out:
1548 swsusp_free(); 1545 swsusp_free();
1549 return error; 1546 return -ENOMEM;
1550} 1547}
1551 1548
1552asmlinkage int swsusp_save(void) 1549asmlinkage int swsusp_save(void)
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index ecf770509d0d..de6f86bfa303 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -22,6 +22,7 @@
22#include <linux/mm.h> 22#include <linux/mm.h>
23#include <linux/slab.h> 23#include <linux/slab.h>
24#include <linux/suspend.h> 24#include <linux/suspend.h>
25#include <trace/events/power.h>
25 26
26#include "power.h" 27#include "power.h"
27 28
@@ -30,13 +31,13 @@ const char *const pm_states[PM_SUSPEND_MAX] = {
30 [PM_SUSPEND_MEM] = "mem", 31 [PM_SUSPEND_MEM] = "mem",
31}; 32};
32 33
33static struct platform_suspend_ops *suspend_ops; 34static const struct platform_suspend_ops *suspend_ops;
34 35
35/** 36/**
36 * suspend_set_ops - Set the global suspend method table. 37 * suspend_set_ops - Set the global suspend method table.
37 * @ops: Pointer to ops structure. 38 * @ops: Pointer to ops structure.
38 */ 39 */
39void suspend_set_ops(struct platform_suspend_ops *ops) 40void suspend_set_ops(const struct platform_suspend_ops *ops)
40{ 41{
41 mutex_lock(&pm_mutex); 42 mutex_lock(&pm_mutex);
42 suspend_ops = ops; 43 suspend_ops = ops;
@@ -163,7 +164,7 @@ static int suspend_enter(suspend_state_t state)
163 164
164 error = sysdev_suspend(PMSG_SUSPEND); 165 error = sysdev_suspend(PMSG_SUSPEND);
165 if (!error) { 166 if (!error) {
166 if (!suspend_test(TEST_CORE) && pm_check_wakeup_events()) { 167 if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) {
167 error = suspend_ops->enter(state); 168 error = suspend_ops->enter(state);
168 events_check_enabled = false; 169 events_check_enabled = false;
169 } 170 }
@@ -201,6 +202,7 @@ int suspend_devices_and_enter(suspend_state_t state)
201 if (!suspend_ops) 202 if (!suspend_ops)
202 return -ENOSYS; 203 return -ENOSYS;
203 204
205 trace_machine_suspend(state);
204 if (suspend_ops->begin) { 206 if (suspend_ops->begin) {
205 error = suspend_ops->begin(state); 207 error = suspend_ops->begin(state);
206 if (error) 208 if (error)
@@ -229,6 +231,7 @@ int suspend_devices_and_enter(suspend_state_t state)
229 Close: 231 Close:
230 if (suspend_ops->end) 232 if (suspend_ops->end)
231 suspend_ops->end(); 233 suspend_ops->end();
234 trace_machine_suspend(PWR_EVENT_EXIT);
232 return error; 235 return error;
233 236
234 Recover_platform: 237 Recover_platform:
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 8c7e4832b9be..7c97c3a0eee3 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -224,7 +224,7 @@ static int swsusp_swap_check(void)
224 return res; 224 return res;
225 225
226 root_swap = res; 226 root_swap = res;
227 res = blkdev_get(hib_resume_bdev, FMODE_WRITE); 227 res = blkdev_get(hib_resume_bdev, FMODE_WRITE, NULL);
228 if (res) 228 if (res)
229 return res; 229 return res;
230 230
@@ -888,7 +888,7 @@ out_finish:
888/** 888/**
889 * swsusp_read - read the hibernation image. 889 * swsusp_read - read the hibernation image.
890 * @flags_p: flags passed by the "frozen" kernel in the image header should 890 * @flags_p: flags passed by the "frozen" kernel in the image header should
891 * be written into this memeory location 891 * be written into this memory location
892 */ 892 */
893 893
894int swsusp_read(unsigned int *flags_p) 894int swsusp_read(unsigned int *flags_p)
@@ -930,7 +930,8 @@ int swsusp_check(void)
930{ 930{
931 int error; 931 int error;
932 932
933 hib_resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); 933 hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device,
934 FMODE_READ, NULL);
934 if (!IS_ERR(hib_resume_bdev)) { 935 if (!IS_ERR(hib_resume_bdev)) {
935 set_blocksize(hib_resume_bdev, PAGE_SIZE); 936 set_blocksize(hib_resume_bdev, PAGE_SIZE);
936 clear_page(swsusp_header); 937 clear_page(swsusp_header);
diff --git a/kernel/printk.c b/kernel/printk.c
index a23315dc4498..36231525e22f 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -39,16 +39,11 @@
39#include <linux/syslog.h> 39#include <linux/syslog.h>
40#include <linux/cpu.h> 40#include <linux/cpu.h>
41#include <linux/notifier.h> 41#include <linux/notifier.h>
42#include <linux/rculist.h>
42 43
43#include <asm/uaccess.h> 44#include <asm/uaccess.h>
44 45
45/* 46/*
46 * for_each_console() allows you to iterate on each console
47 */
48#define for_each_console(con) \
49 for (con = console_drivers; con != NULL; con = con->next)
50
51/*
52 * Architectures can override it: 47 * Architectures can override it:
53 */ 48 */
54void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) 49void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
@@ -102,7 +97,7 @@ static int console_locked, console_suspended;
102/* 97/*
103 * logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars 98 * logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars
104 * It is also used in interesting ways to provide interlocking in 99 * It is also used in interesting ways to provide interlocking in
105 * release_console_sem(). 100 * console_unlock().
106 */ 101 */
107static DEFINE_SPINLOCK(logbuf_lock); 102static DEFINE_SPINLOCK(logbuf_lock);
108 103
@@ -267,25 +262,47 @@ int dmesg_restrict = 1;
267int dmesg_restrict; 262int dmesg_restrict;
268#endif 263#endif
269 264
265static int syslog_action_restricted(int type)
266{
267 if (dmesg_restrict)
268 return 1;
269 /* Unless restricted, we allow "read all" and "get buffer size" for everybody */
270 return type != SYSLOG_ACTION_READ_ALL && type != SYSLOG_ACTION_SIZE_BUFFER;
271}
272
273static int check_syslog_permissions(int type, bool from_file)
274{
275 /*
276 * If this is from /proc/kmsg and we've already opened it, then we've
277 * already done the capabilities checks at open time.
278 */
279 if (from_file && type != SYSLOG_ACTION_OPEN)
280 return 0;
281
282 if (syslog_action_restricted(type)) {
283 if (capable(CAP_SYSLOG))
284 return 0;
285 /* For historical reasons, accept CAP_SYS_ADMIN too, with a warning */
286 if (capable(CAP_SYS_ADMIN)) {
287 WARN_ONCE(1, "Attempt to access syslog with CAP_SYS_ADMIN "
288 "but no CAP_SYSLOG (deprecated).\n");
289 return 0;
290 }
291 return -EPERM;
292 }
293 return 0;
294}
295
270int do_syslog(int type, char __user *buf, int len, bool from_file) 296int do_syslog(int type, char __user *buf, int len, bool from_file)
271{ 297{
272 unsigned i, j, limit, count; 298 unsigned i, j, limit, count;
273 int do_clear = 0; 299 int do_clear = 0;
274 char c; 300 char c;
275 int error = 0; 301 int error;
276 302
277 /* 303 error = check_syslog_permissions(type, from_file);
278 * If this is from /proc/kmsg we only do the capabilities checks 304 if (error)
279 * at open time. 305 goto out;
280 */
281 if (type == SYSLOG_ACTION_OPEN || !from_file) {
282 if (dmesg_restrict && !capable(CAP_SYS_ADMIN))
283 return -EPERM;
284 if ((type != SYSLOG_ACTION_READ_ALL &&
285 type != SYSLOG_ACTION_SIZE_BUFFER) &&
286 !capable(CAP_SYS_ADMIN))
287 return -EPERM;
288 }
289 306
290 error = security_syslog(type); 307 error = security_syslog(type);
291 if (error) 308 if (error)
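
The printk.c permission rework above splits policy (syslog_action_restricted()) from the capability test (check_syslog_permissions()) and prefers CAP_SYSLOG over CAP_SYS_ADMIN. Below is a small user-space sketch of the two actions that stay unprivileged unless dmesg_restrict is set; it assumes the glibc klogctl() wrapper and keeps error handling minimal.

/*
 * User-space sketch of what the reworked check gates: SYSLOG_ACTION_SIZE_BUFFER
 * (10) and SYSLOG_ACTION_READ_ALL (3) remain open to everybody unless
 * dmesg_restrict is set, in which case CAP_SYSLOG (or, with a one-time
 * warning, CAP_SYS_ADMIN) is required and the calls fail with EPERM.
 */
#include <stdio.h>
#include <stdlib.h>
#include <sys/klog.h>

int main(void)
{
	char *buf;
	int n, len = klogctl(10, NULL, 0);	/* SYSLOG_ACTION_SIZE_BUFFER */

	if (len < 0) {
		perror("klogctl(SIZE_BUFFER)");
		return 1;
	}
	buf = malloc(len);
	if (!buf)
		return 1;
	n = klogctl(3, buf, len);		/* SYSLOG_ACTION_READ_ALL */
	if (n < 0) {
		perror("klogctl(READ_ALL)");
		free(buf);
		return 1;
	}
	fwrite(buf, 1, n, stdout);
	free(buf);
	return 0;
}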
@@ -500,7 +517,7 @@ static void _call_console_drivers(unsigned start,
500/* 517/*
501 * Call the console drivers, asking them to write out 518 * Call the console drivers, asking them to write out
502 * log_buf[start] to log_buf[end - 1]. 519 * log_buf[start] to log_buf[end - 1].
503 * The console_sem must be held. 520 * The console_lock must be held.
504 */ 521 */
505static void call_console_drivers(unsigned start, unsigned end) 522static void call_console_drivers(unsigned start, unsigned end)
506{ 523{
@@ -603,11 +620,11 @@ static int have_callable_console(void)
603 * 620 *
604 * This is printk(). It can be called from any context. We want it to work. 621 * This is printk(). It can be called from any context. We want it to work.
605 * 622 *
606 * We try to grab the console_sem. If we succeed, it's easy - we log the output and 623 * We try to grab the console_lock. If we succeed, it's easy - we log the output and
607 * call the console drivers. If we fail to get the semaphore we place the output 624 * call the console drivers. If we fail to get the semaphore we place the output
608 * into the log buffer and return. The current holder of the console_sem will 625 * into the log buffer and return. The current holder of the console_sem will
609 * notice the new output in release_console_sem() and will send it to the 626 * notice the new output in console_unlock() and will send it to the
610 * consoles before releasing the semaphore. 627 * consoles before releasing the lock.
611 * 628 *
612 * One effect of this deferred printing is that code which calls printk() and 629 * One effect of this deferred printing is that code which calls printk() and
613 * then changes console_loglevel may break. This is because console_loglevel 630 * then changes console_loglevel may break. This is because console_loglevel
@@ -658,19 +675,19 @@ static inline int can_use_console(unsigned int cpu)
658/* 675/*
659 * Try to get console ownership to actually show the kernel 676 * Try to get console ownership to actually show the kernel
660 * messages from a 'printk'. Return true (and with the 677 * messages from a 'printk'. Return true (and with the
661 * console_semaphore held, and 'console_locked' set) if it 678 * console_lock held, and 'console_locked' set) if it
662 * is successful, false otherwise. 679 * is successful, false otherwise.
663 * 680 *
664 * This gets called with the 'logbuf_lock' spinlock held and 681 * This gets called with the 'logbuf_lock' spinlock held and
665 * interrupts disabled. It should return with 'logbuf_lock' 682 * interrupts disabled. It should return with 'logbuf_lock'
666 * released but interrupts still disabled. 683 * released but interrupts still disabled.
667 */ 684 */
668static int acquire_console_semaphore_for_printk(unsigned int cpu) 685static int console_trylock_for_printk(unsigned int cpu)
669 __releases(&logbuf_lock) 686 __releases(&logbuf_lock)
670{ 687{
671 int retval = 0; 688 int retval = 0;
672 689
673 if (!try_acquire_console_sem()) { 690 if (console_trylock()) {
674 retval = 1; 691 retval = 1;
675 692
676 /* 693 /*
@@ -826,12 +843,12 @@ asmlinkage int vprintk(const char *fmt, va_list args)
826 * actual magic (print out buffers, wake up klogd, 843 * actual magic (print out buffers, wake up klogd,
827 * etc). 844 * etc).
828 * 845 *
829 * The acquire_console_semaphore_for_printk() function 846 * The console_trylock_for_printk() function
830 * will release 'logbuf_lock' regardless of whether it 847 * will release 'logbuf_lock' regardless of whether it
831 * actually gets the semaphore or not. 848 * actually gets the semaphore or not.
832 */ 849 */
833 if (acquire_console_semaphore_for_printk(this_cpu)) 850 if (console_trylock_for_printk(this_cpu))
834 release_console_sem(); 851 console_unlock();
835 852
836 lockdep_on(); 853 lockdep_on();
837out_restore_irqs: 854out_restore_irqs:
@@ -992,7 +1009,7 @@ void suspend_console(void)
992 if (!console_suspend_enabled) 1009 if (!console_suspend_enabled)
993 return; 1010 return;
994 printk("Suspending console(s) (use no_console_suspend to debug)\n"); 1011 printk("Suspending console(s) (use no_console_suspend to debug)\n");
995 acquire_console_sem(); 1012 console_lock();
996 console_suspended = 1; 1013 console_suspended = 1;
997 up(&console_sem); 1014 up(&console_sem);
998} 1015}
@@ -1003,7 +1020,7 @@ void resume_console(void)
1003 return; 1020 return;
1004 down(&console_sem); 1021 down(&console_sem);
1005 console_suspended = 0; 1022 console_suspended = 0;
1006 release_console_sem(); 1023 console_unlock();
1007} 1024}
1008 1025
1009/** 1026/**
@@ -1026,21 +1043,21 @@ static int __cpuinit console_cpu_notify(struct notifier_block *self,
1026 case CPU_DYING: 1043 case CPU_DYING:
1027 case CPU_DOWN_FAILED: 1044 case CPU_DOWN_FAILED:
1028 case CPU_UP_CANCELED: 1045 case CPU_UP_CANCELED:
1029 acquire_console_sem(); 1046 console_lock();
1030 release_console_sem(); 1047 console_unlock();
1031 } 1048 }
1032 return NOTIFY_OK; 1049 return NOTIFY_OK;
1033} 1050}
1034 1051
1035/** 1052/**
1036 * acquire_console_sem - lock the console system for exclusive use. 1053 * console_lock - lock the console system for exclusive use.
1037 * 1054 *
1038 * Acquires a semaphore which guarantees that the caller has 1055 * Acquires a lock which guarantees that the caller has
1039 * exclusive access to the console system and the console_drivers list. 1056 * exclusive access to the console system and the console_drivers list.
1040 * 1057 *
1041 * Can sleep, returns nothing. 1058 * Can sleep, returns nothing.
1042 */ 1059 */
1043void acquire_console_sem(void) 1060void console_lock(void)
1044{ 1061{
1045 BUG_ON(in_interrupt()); 1062 BUG_ON(in_interrupt());
1046 down(&console_sem); 1063 down(&console_sem);
@@ -1049,21 +1066,29 @@ void acquire_console_sem(void)
1049 console_locked = 1; 1066 console_locked = 1;
1050 console_may_schedule = 1; 1067 console_may_schedule = 1;
1051} 1068}
1052EXPORT_SYMBOL(acquire_console_sem); 1069EXPORT_SYMBOL(console_lock);
1053 1070
1054int try_acquire_console_sem(void) 1071/**
1072 * console_trylock - try to lock the console system for exclusive use.
1073 *
1074 * Tries to acquire a lock which guarantees that the caller has
1075 * exclusive access to the console system and the console_drivers list.
1076 *
1077 * Returns 1 on success, and 0 on failure to acquire the lock.
1078 */
1079int console_trylock(void)
1055{ 1080{
1056 if (down_trylock(&console_sem)) 1081 if (down_trylock(&console_sem))
1057 return -1; 1082 return 0;
1058 if (console_suspended) { 1083 if (console_suspended) {
1059 up(&console_sem); 1084 up(&console_sem);
1060 return -1; 1085 return 0;
1061 } 1086 }
1062 console_locked = 1; 1087 console_locked = 1;
1063 console_may_schedule = 0; 1088 console_may_schedule = 0;
1064 return 0; 1089 return 1;
1065} 1090}
1066EXPORT_SYMBOL(try_acquire_console_sem); 1091EXPORT_SYMBOL(console_trylock);
1067 1092
1068int is_console_locked(void) 1093int is_console_locked(void)
1069{ 1094{
@@ -1074,17 +1099,17 @@ static DEFINE_PER_CPU(int, printk_pending);
1074 1099
1075void printk_tick(void) 1100void printk_tick(void)
1076{ 1101{
1077 if (__get_cpu_var(printk_pending)) { 1102 if (__this_cpu_read(printk_pending)) {
1078 __get_cpu_var(printk_pending) = 0; 1103 __this_cpu_write(printk_pending, 0);
1079 wake_up_interruptible(&log_wait); 1104 wake_up_interruptible(&log_wait);
1080 } 1105 }
1081} 1106}
1082 1107
1083int printk_needs_cpu(int cpu) 1108int printk_needs_cpu(int cpu)
1084{ 1109{
1085 if (unlikely(cpu_is_offline(cpu))) 1110 if (cpu_is_offline(cpu))
1086 printk_tick(); 1111 printk_tick();
1087 return per_cpu(printk_pending, cpu); 1112 return __this_cpu_read(printk_pending);
1088} 1113}
1089 1114
1090void wake_up_klogd(void) 1115void wake_up_klogd(void)
@@ -1094,20 +1119,20 @@ void wake_up_klogd(void)
1094} 1119}
1095 1120
1096/** 1121/**
1097 * release_console_sem - unlock the console system 1122 * console_unlock - unlock the console system
1098 * 1123 *
1099 * Releases the semaphore which the caller holds on the console system 1124 * Releases the console_lock which the caller holds on the console system
1100 * and the console driver list. 1125 * and the console driver list.
1101 * 1126 *
1102 * While the semaphore was held, console output may have been buffered 1127 * While the console_lock was held, console output may have been buffered
1103 * by printk(). If this is the case, release_console_sem() emits 1128 * by printk(). If this is the case, console_unlock() emits
1104 * the output prior to releasing the semaphore. 1129 * the output prior to releasing the lock.
1105 * 1130 *
1106 * If there is output waiting for klogd, we wake it up. 1131 * If there is output waiting for klogd, we wake it up.
1107 * 1132 *
1108 * release_console_sem() may be called from any context. 1133 * console_unlock() may be called from any context.
1109 */ 1134 */
1110void release_console_sem(void) 1135void console_unlock(void)
1111{ 1136{
1112 unsigned long flags; 1137 unsigned long flags;
1113 unsigned _con_start, _log_end; 1138 unsigned _con_start, _log_end;
@@ -1140,7 +1165,7 @@ void release_console_sem(void)
1140 if (wake_klogd) 1165 if (wake_klogd)
1141 wake_up_klogd(); 1166 wake_up_klogd();
1142} 1167}
1143EXPORT_SYMBOL(release_console_sem); 1168EXPORT_SYMBOL(console_unlock);
1144 1169
1145/** 1170/**
1146 * console_conditional_schedule - yield the CPU if required 1171 * console_conditional_schedule - yield the CPU if required
@@ -1149,7 +1174,7 @@ EXPORT_SYMBOL(release_console_sem);
1149 * if this CPU should yield the CPU to another task, do 1174 * if this CPU should yield the CPU to another task, do
1150 * so here. 1175 * so here.
1151 * 1176 *
1152 * Must be called within acquire_console_sem(). 1177 * Must be called within console_lock().
1153 */ 1178 */
1154void __sched console_conditional_schedule(void) 1179void __sched console_conditional_schedule(void)
1155{ 1180{
@@ -1170,14 +1195,14 @@ void console_unblank(void)
1170 if (down_trylock(&console_sem) != 0) 1195 if (down_trylock(&console_sem) != 0)
1171 return; 1196 return;
1172 } else 1197 } else
1173 acquire_console_sem(); 1198 console_lock();
1174 1199
1175 console_locked = 1; 1200 console_locked = 1;
1176 console_may_schedule = 0; 1201 console_may_schedule = 0;
1177 for_each_console(c) 1202 for_each_console(c)
1178 if ((c->flags & CON_ENABLED) && c->unblank) 1203 if ((c->flags & CON_ENABLED) && c->unblank)
1179 c->unblank(); 1204 c->unblank();
1180 release_console_sem(); 1205 console_unlock();
1181} 1206}
1182 1207
1183/* 1208/*
@@ -1188,7 +1213,7 @@ struct tty_driver *console_device(int *index)
1188 struct console *c; 1213 struct console *c;
1189 struct tty_driver *driver = NULL; 1214 struct tty_driver *driver = NULL;
1190 1215
1191 acquire_console_sem(); 1216 console_lock();
1192 for_each_console(c) { 1217 for_each_console(c) {
1193 if (!c->device) 1218 if (!c->device)
1194 continue; 1219 continue;
@@ -1196,7 +1221,7 @@ struct tty_driver *console_device(int *index)
1196 if (driver) 1221 if (driver)
1197 break; 1222 break;
1198 } 1223 }
1199 release_console_sem(); 1224 console_unlock();
1200 return driver; 1225 return driver;
1201} 1226}
1202 1227
@@ -1207,17 +1232,17 @@ struct tty_driver *console_device(int *index)
1207 */ 1232 */
1208void console_stop(struct console *console) 1233void console_stop(struct console *console)
1209{ 1234{
1210 acquire_console_sem(); 1235 console_lock();
1211 console->flags &= ~CON_ENABLED; 1236 console->flags &= ~CON_ENABLED;
1212 release_console_sem(); 1237 console_unlock();
1213} 1238}
1214EXPORT_SYMBOL(console_stop); 1239EXPORT_SYMBOL(console_stop);
1215 1240
1216void console_start(struct console *console) 1241void console_start(struct console *console)
1217{ 1242{
1218 acquire_console_sem(); 1243 console_lock();
1219 console->flags |= CON_ENABLED; 1244 console->flags |= CON_ENABLED;
1220 release_console_sem(); 1245 console_unlock();
1221} 1246}
1222EXPORT_SYMBOL(console_start); 1247EXPORT_SYMBOL(console_start);
1223 1248
@@ -1339,7 +1364,7 @@ void register_console(struct console *newcon)
1339 * Put this console in the list - keep the 1364 * Put this console in the list - keep the
1340 * preferred driver at the head of the list. 1365 * preferred driver at the head of the list.
1341 */ 1366 */
1342 acquire_console_sem(); 1367 console_lock();
1343 if ((newcon->flags & CON_CONSDEV) || console_drivers == NULL) { 1368 if ((newcon->flags & CON_CONSDEV) || console_drivers == NULL) {
1344 newcon->next = console_drivers; 1369 newcon->next = console_drivers;
1345 console_drivers = newcon; 1370 console_drivers = newcon;
@@ -1351,14 +1376,15 @@ void register_console(struct console *newcon)
1351 } 1376 }
1352 if (newcon->flags & CON_PRINTBUFFER) { 1377 if (newcon->flags & CON_PRINTBUFFER) {
1353 /* 1378 /*
1354 * release_console_sem() will print out the buffered messages 1379 * console_unlock() will print out the buffered messages
1355 * for us. 1380 * for us.
1356 */ 1381 */
1357 spin_lock_irqsave(&logbuf_lock, flags); 1382 spin_lock_irqsave(&logbuf_lock, flags);
1358 con_start = log_start; 1383 con_start = log_start;
1359 spin_unlock_irqrestore(&logbuf_lock, flags); 1384 spin_unlock_irqrestore(&logbuf_lock, flags);
1360 } 1385 }
1361 release_console_sem(); 1386 console_unlock();
1387 console_sysfs_notify();
1362 1388
1363 /* 1389 /*
1364 * By unregistering the bootconsoles after we enable the real console 1390 * By unregistering the bootconsoles after we enable the real console
@@ -1394,7 +1420,7 @@ int unregister_console(struct console *console)
1394 return braille_unregister_console(console); 1420 return braille_unregister_console(console);
1395#endif 1421#endif
1396 1422
1397 acquire_console_sem(); 1423 console_lock();
1398 if (console_drivers == console) { 1424 if (console_drivers == console) {
1399 console_drivers=console->next; 1425 console_drivers=console->next;
1400 res = 0; 1426 res = 0;
@@ -1416,7 +1442,8 @@ int unregister_console(struct console *console)
1416 if (console_drivers != NULL && console->flags & CON_CONSDEV) 1442 if (console_drivers != NULL && console->flags & CON_CONSDEV)
1417 console_drivers->flags |= CON_CONSDEV; 1443 console_drivers->flags |= CON_CONSDEV;
1418 1444
1419 release_console_sem(); 1445 console_unlock();
1446 console_sysfs_notify();
1420 return res; 1447 return res;
1421} 1448}
1422EXPORT_SYMBOL(unregister_console); 1449EXPORT_SYMBOL(unregister_console);
@@ -1500,7 +1527,7 @@ int kmsg_dump_register(struct kmsg_dumper *dumper)
1500 /* Don't allow registering multiple times */ 1527 /* Don't allow registering multiple times */
1501 if (!dumper->registered) { 1528 if (!dumper->registered) {
1502 dumper->registered = 1; 1529 dumper->registered = 1;
1503 list_add_tail(&dumper->list, &dump_list); 1530 list_add_tail_rcu(&dumper->list, &dump_list);
1504 err = 0; 1531 err = 0;
1505 } 1532 }
1506 spin_unlock_irqrestore(&dump_list_lock, flags); 1533 spin_unlock_irqrestore(&dump_list_lock, flags);
@@ -1524,29 +1551,16 @@ int kmsg_dump_unregister(struct kmsg_dumper *dumper)
1524 spin_lock_irqsave(&dump_list_lock, flags); 1551 spin_lock_irqsave(&dump_list_lock, flags);
1525 if (dumper->registered) { 1552 if (dumper->registered) {
1526 dumper->registered = 0; 1553 dumper->registered = 0;
1527 list_del(&dumper->list); 1554 list_del_rcu(&dumper->list);
1528 err = 0; 1555 err = 0;
1529 } 1556 }
1530 spin_unlock_irqrestore(&dump_list_lock, flags); 1557 spin_unlock_irqrestore(&dump_list_lock, flags);
1558 synchronize_rcu();
1531 1559
1532 return err; 1560 return err;
1533} 1561}
1534EXPORT_SYMBOL_GPL(kmsg_dump_unregister); 1562EXPORT_SYMBOL_GPL(kmsg_dump_unregister);
1535 1563
1536static const char * const kmsg_reasons[] = {
1537 [KMSG_DUMP_OOPS] = "oops",
1538 [KMSG_DUMP_PANIC] = "panic",
1539 [KMSG_DUMP_KEXEC] = "kexec",
1540};
1541
1542static const char *kmsg_to_str(enum kmsg_dump_reason reason)
1543{
1544 if (reason >= ARRAY_SIZE(kmsg_reasons) || reason < 0)
1545 return "unknown";
1546
1547 return kmsg_reasons[reason];
1548}
1549
1550/** 1564/**
1551 * kmsg_dump - dump kernel log to kernel message dumpers. 1565 * kmsg_dump - dump kernel log to kernel message dumpers.
1552 * @reason: the reason (oops, panic etc) for dumping 1566 * @reason: the reason (oops, panic etc) for dumping
@@ -1585,13 +1599,9 @@ void kmsg_dump(enum kmsg_dump_reason reason)
1585 l2 = chars; 1599 l2 = chars;
1586 } 1600 }
1587 1601
1588 if (!spin_trylock_irqsave(&dump_list_lock, flags)) { 1602 rcu_read_lock();
1589 printk(KERN_ERR "dump_kmsg: dump list lock is held during %s, skipping dump\n", 1603 list_for_each_entry_rcu(dumper, &dump_list, list)
1590 kmsg_to_str(reason));
1591 return;
1592 }
1593 list_for_each_entry(dumper, &dump_list, list)
1594 dumper->dump(dumper, reason, s1, l1, s2, l2); 1604 dumper->dump(dumper, reason, s1, l1, s2, l2);
1595 spin_unlock_irqrestore(&dump_list_lock, flags); 1605 rcu_read_unlock();
1596} 1606}
1597#endif 1607#endif
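For orientation on the rename above: a minimal sketch of how a caller uses the new console_lock()/console_unlock() names, assuming a tree that already carries this series. The helper name example_suspend_consoles() is illustrative; the locking semantics are unchanged from acquire_console_sem()/release_console_sem(), and console_unlock() still emits any printk() output buffered while the lock was held.

#include <linux/console.h>

/* Illustrative only: disable every registered console, mirroring console_stop(). */
static void example_suspend_consoles(void)
{
        struct console *con;

        console_lock();                 /* formerly acquire_console_sem() */
        for_each_console(con)           /* console_drivers list is stable while the lock is held */
                con->flags &= ~CON_ENABLED;
        console_unlock();               /* formerly release_console_sem(); emits buffered output */
}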
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 99bbaa3e5b0d..1708b1e2972d 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -313,7 +313,7 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
313 child->exit_code = data; 313 child->exit_code = data;
314 dead = __ptrace_detach(current, child); 314 dead = __ptrace_detach(current, child);
315 if (!child->exit_state) 315 if (!child->exit_state)
316 wake_up_process(child); 316 wake_up_state(child, TASK_TRACED | TASK_STOPPED);
317 } 317 }
318 write_unlock_irq(&tasklist_lock); 318 write_unlock_irq(&tasklist_lock);
319 319
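A hedged note on the single-line ptrace change above: wake_up_process() wakes a task out of any ordinary TASK_INTERRUPTIBLE/TASK_UNINTERRUPTIBLE sleep, so it could disturb a tracee that happens to be sleeping somewhere unrelated, whereas wake_up_state() wakes the task only if it is currently in one of the states passed in. The helper name below is illustrative.

#include <linux/sched.h>

/* Illustrative only: resume a detached tracee without perturbing unrelated sleeps. */
static void example_resume_tracee(struct task_struct *child)
{
        /*
         * Wake the child only if it is parked in TASK_TRACED or TASK_STOPPED;
         * a child blocked in an ordinary interruptible sleep is left alone,
         * unlike with wake_up_process().
         */
        wake_up_state(child, TASK_TRACED | TASK_STOPPED);
}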
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index d806735342ac..0c343b9a46d5 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -36,31 +36,16 @@
36#include <linux/time.h> 36#include <linux/time.h>
37#include <linux/cpu.h> 37#include <linux/cpu.h>
38 38
39/* Global control variables for rcupdate callback mechanism. */ 39/* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */
40struct rcu_ctrlblk { 40static struct task_struct *rcu_kthread_task;
41 struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */ 41static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq);
42 struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ 42static unsigned long have_rcu_kthread_work;
43 struct rcu_head **curtail; /* ->next pointer of last CB. */ 43static void invoke_rcu_kthread(void);
44};
45
46/* Definition for rcupdate control block. */
47static struct rcu_ctrlblk rcu_sched_ctrlblk = {
48 .donetail = &rcu_sched_ctrlblk.rcucblist,
49 .curtail = &rcu_sched_ctrlblk.rcucblist,
50};
51
52static struct rcu_ctrlblk rcu_bh_ctrlblk = {
53 .donetail = &rcu_bh_ctrlblk.rcucblist,
54 .curtail = &rcu_bh_ctrlblk.rcucblist,
55};
56
57#ifdef CONFIG_DEBUG_LOCK_ALLOC
58int rcu_scheduler_active __read_mostly;
59EXPORT_SYMBOL_GPL(rcu_scheduler_active);
60#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
61 44
62/* Forward declarations for rcutiny_plugin.h. */ 45/* Forward declarations for rcutiny_plugin.h. */
63static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); 46struct rcu_ctrlblk;
47static void rcu_process_callbacks(struct rcu_ctrlblk *rcp);
48static int rcu_kthread(void *arg);
64static void __call_rcu(struct rcu_head *head, 49static void __call_rcu(struct rcu_head *head,
65 void (*func)(struct rcu_head *rcu), 50 void (*func)(struct rcu_head *rcu),
66 struct rcu_ctrlblk *rcp); 51 struct rcu_ctrlblk *rcp);
@@ -123,7 +108,7 @@ void rcu_sched_qs(int cpu)
123{ 108{
124 if (rcu_qsctr_help(&rcu_sched_ctrlblk) + 109 if (rcu_qsctr_help(&rcu_sched_ctrlblk) +
125 rcu_qsctr_help(&rcu_bh_ctrlblk)) 110 rcu_qsctr_help(&rcu_bh_ctrlblk))
126 raise_softirq(RCU_SOFTIRQ); 111 invoke_rcu_kthread();
127} 112}
128 113
129/* 114/*
@@ -132,7 +117,7 @@ void rcu_sched_qs(int cpu)
132void rcu_bh_qs(int cpu) 117void rcu_bh_qs(int cpu)
133{ 118{
134 if (rcu_qsctr_help(&rcu_bh_ctrlblk)) 119 if (rcu_qsctr_help(&rcu_bh_ctrlblk))
135 raise_softirq(RCU_SOFTIRQ); 120 invoke_rcu_kthread();
136} 121}
137 122
138/* 123/*
@@ -152,13 +137,14 @@ void rcu_check_callbacks(int cpu, int user)
152} 137}
153 138
154/* 139/*
155 * Helper function for rcu_process_callbacks() that operates on the 140 * Invoke the RCU callbacks on the specified rcu_ctrlblk structure
156 * specified rcu_ctrlkblk structure. 141 * whose grace period has elapsed.
157 */ 142 */
158static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) 143static void rcu_process_callbacks(struct rcu_ctrlblk *rcp)
159{ 144{
160 struct rcu_head *next, *list; 145 struct rcu_head *next, *list;
161 unsigned long flags; 146 unsigned long flags;
147 RCU_TRACE(int cb_count = 0);
162 148
163 /* If no RCU callbacks ready to invoke, just return. */ 149 /* If no RCU callbacks ready to invoke, just return. */
164 if (&rcp->rcucblist == rcp->donetail) 150 if (&rcp->rcucblist == rcp->donetail)
@@ -180,19 +166,59 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
180 next = list->next; 166 next = list->next;
181 prefetch(next); 167 prefetch(next);
182 debug_rcu_head_unqueue(list); 168 debug_rcu_head_unqueue(list);
169 local_bh_disable();
183 list->func(list); 170 list->func(list);
171 local_bh_enable();
184 list = next; 172 list = next;
173 RCU_TRACE(cb_count++);
185 } 174 }
175 RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count));
186} 176}
187 177
188/* 178/*
189 * Invoke any callbacks whose grace period has completed. 179 * This kthread invokes RCU callbacks whose grace periods have
180 * elapsed. It is awakened as needed, and takes the place of the
181 * RCU_SOFTIRQ that was used previously for this purpose.
182 * This is a kthread, but it is never stopped, at least not until
183 * the system goes down.
190 */ 184 */
191static void rcu_process_callbacks(struct softirq_action *unused) 185static int rcu_kthread(void *arg)
192{ 186{
193 __rcu_process_callbacks(&rcu_sched_ctrlblk); 187 unsigned long work;
194 __rcu_process_callbacks(&rcu_bh_ctrlblk); 188 unsigned long morework;
195 rcu_preempt_process_callbacks(); 189 unsigned long flags;
190
191 for (;;) {
192 wait_event_interruptible(rcu_kthread_wq,
193 have_rcu_kthread_work != 0);
194 morework = rcu_boost();
195 local_irq_save(flags);
196 work = have_rcu_kthread_work;
197 have_rcu_kthread_work = morework;
198 local_irq_restore(flags);
199 if (work) {
200 rcu_process_callbacks(&rcu_sched_ctrlblk);
201 rcu_process_callbacks(&rcu_bh_ctrlblk);
202 rcu_preempt_process_callbacks();
203 }
204 schedule_timeout_interruptible(1); /* Leave CPU for others. */
205 }
206
207 return 0; /* Not reached, but needed to shut gcc up. */
208}
209
210/*
211 * Wake up rcu_kthread() to process callbacks now eligible for invocation
212 * or to boost readers.
213 */
214static void invoke_rcu_kthread(void)
215{
216 unsigned long flags;
217
218 local_irq_save(flags);
219 have_rcu_kthread_work = 1;
220 wake_up(&rcu_kthread_wq);
221 local_irq_restore(flags);
196} 222}
197 223
198/* 224/*
@@ -230,6 +256,7 @@ static void __call_rcu(struct rcu_head *head,
230 local_irq_save(flags); 256 local_irq_save(flags);
231 *rcp->curtail = head; 257 *rcp->curtail = head;
232 rcp->curtail = &head->next; 258 rcp->curtail = &head->next;
259 RCU_TRACE(rcp->qlen++);
233 local_irq_restore(flags); 260 local_irq_restore(flags);
234} 261}
235 262
@@ -282,7 +309,16 @@ void rcu_barrier_sched(void)
282} 309}
283EXPORT_SYMBOL_GPL(rcu_barrier_sched); 310EXPORT_SYMBOL_GPL(rcu_barrier_sched);
284 311
285void __init rcu_init(void) 312/*
313 * Spawn the kthread that invokes RCU callbacks.
314 */
315static int __init rcu_spawn_kthreads(void)
286{ 316{
287 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 317 struct sched_param sp;
318
319 rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread");
320 sp.sched_priority = RCU_BOOST_PRIO;
321 sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp);
322 return 0;
288} 323}
324early_initcall(rcu_spawn_kthreads);
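A hedged, self-contained sketch of the kthread-plus-waitqueue pattern that replaces RCU_SOFTIRQ in the hunk above. All my_* names are made up for illustration; the real rcu_kthread() additionally hands the work flag over under local_irq_save(), as shown above.

#include <linux/err.h>
#include <linux/init.h>
#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(my_wq);
static unsigned long my_work_pending;
static struct task_struct *my_worker_task;

/* Worker loop: sleep until poked, then drain whatever work was queued. */
static int my_worker(void *unused)
{
        for (;;) {
                wait_event_interruptible(my_wq, my_work_pending != 0);
                my_work_pending = 0;    /* the real code protects this handoff with local_irq_save() */
                /* ... invoke the queued callbacks here ... */
                schedule_timeout_interruptible(1);      /* leave the CPU for other tasks */
        }
        return 0;       /* not reached */
}

/* Poke the worker from the code that queues work, like invoke_rcu_kthread(). */
static void my_poke_worker(void)
{
        my_work_pending = 1;
        wake_up(&my_wq);
}

/* Spawn the worker early and give it RT priority, as rcu_spawn_kthreads() does. */
static int __init my_worker_init(void)
{
        struct sched_param sp = { .sched_priority = 1 };

        my_worker_task = kthread_run(my_worker, NULL, "my_worker");
        if (IS_ERR(my_worker_task))
                return PTR_ERR(my_worker_task);
        sched_setscheduler_nocheck(my_worker_task, SCHED_FIFO, &sp);
        return 0;
}
early_initcall(my_worker_init);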
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 6ceca4f745ff..015abaea962a 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -22,6 +22,40 @@
22 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> 22 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
23 */ 23 */
24 24
25#include <linux/kthread.h>
26#include <linux/debugfs.h>
27#include <linux/seq_file.h>
28
29#ifdef CONFIG_RCU_TRACE
30#define RCU_TRACE(stmt) stmt
31#else /* #ifdef CONFIG_RCU_TRACE */
32#define RCU_TRACE(stmt)
33#endif /* #else #ifdef CONFIG_RCU_TRACE */
34
35/* Global control variables for rcupdate callback mechanism. */
36struct rcu_ctrlblk {
37 struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */
38 struct rcu_head **donetail; /* ->next pointer of last "done" CB. */
39 struct rcu_head **curtail; /* ->next pointer of last CB. */
40 RCU_TRACE(long qlen); /* Number of pending CBs. */
41};
42
43/* Definition for rcupdate control block. */
44static struct rcu_ctrlblk rcu_sched_ctrlblk = {
45 .donetail = &rcu_sched_ctrlblk.rcucblist,
46 .curtail = &rcu_sched_ctrlblk.rcucblist,
47};
48
49static struct rcu_ctrlblk rcu_bh_ctrlblk = {
50 .donetail = &rcu_bh_ctrlblk.rcucblist,
51 .curtail = &rcu_bh_ctrlblk.rcucblist,
52};
53
54#ifdef CONFIG_DEBUG_LOCK_ALLOC
55int rcu_scheduler_active __read_mostly;
56EXPORT_SYMBOL_GPL(rcu_scheduler_active);
57#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
58
25#ifdef CONFIG_TINY_PREEMPT_RCU 59#ifdef CONFIG_TINY_PREEMPT_RCU
26 60
27#include <linux/delay.h> 61#include <linux/delay.h>
@@ -46,17 +80,45 @@ struct rcu_preempt_ctrlblk {
46 struct list_head *gp_tasks; 80 struct list_head *gp_tasks;
47 /* Pointer to the first task blocking the */ 81 /* Pointer to the first task blocking the */
48 /* current grace period, or NULL if there */ 82 /* current grace period, or NULL if there */
49 /* is not such task. */ 83 /* is no such task. */
50 struct list_head *exp_tasks; 84 struct list_head *exp_tasks;
51 /* Pointer to first task blocking the */ 85 /* Pointer to first task blocking the */
52 /* current expedited grace period, or NULL */ 86 /* current expedited grace period, or NULL */
53 /* if there is no such task. If there */ 87 /* if there is no such task. If there */
54 /* is no current expedited grace period, */ 88 /* is no current expedited grace period, */
55 /* then there cannot be any such task. */ 89 /* then there cannot be any such task. */
90#ifdef CONFIG_RCU_BOOST
91 struct list_head *boost_tasks;
92 /* Pointer to first task that needs to be */
93 /* priority-boosted, or NULL if no priority */
94 /* boosting is needed. If there is no */
95 /* current or expedited grace period, there */
96 /* can be no such task. */
97#endif /* #ifdef CONFIG_RCU_BOOST */
56 u8 gpnum; /* Current grace period. */ 98 u8 gpnum; /* Current grace period. */
57 u8 gpcpu; /* Last grace period blocked by the CPU. */ 99 u8 gpcpu; /* Last grace period blocked by the CPU. */
58 u8 completed; /* Last grace period completed. */ 100 u8 completed; /* Last grace period completed. */
59 /* If all three are equal, RCU is idle. */ 101 /* If all three are equal, RCU is idle. */
102#ifdef CONFIG_RCU_BOOST
103 s8 boosted_this_gp; /* Has boosting already happened? */
104 unsigned long boost_time; /* When to start boosting (jiffies) */
105#endif /* #ifdef CONFIG_RCU_BOOST */
106#ifdef CONFIG_RCU_TRACE
107 unsigned long n_grace_periods;
108#ifdef CONFIG_RCU_BOOST
109 unsigned long n_tasks_boosted;
110 unsigned long n_exp_boosts;
111 unsigned long n_normal_boosts;
112 unsigned long n_normal_balk_blkd_tasks;
113 unsigned long n_normal_balk_gp_tasks;
114 unsigned long n_normal_balk_boost_tasks;
115 unsigned long n_normal_balk_boosted;
116 unsigned long n_normal_balk_notyet;
117 unsigned long n_normal_balk_nos;
118 unsigned long n_exp_balk_blkd_tasks;
119 unsigned long n_exp_balk_nos;
120#endif /* #ifdef CONFIG_RCU_BOOST */
121#endif /* #ifdef CONFIG_RCU_TRACE */
60}; 122};
61 123
62static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = { 124static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = {
@@ -122,6 +184,210 @@ static int rcu_preempt_gp_in_progress(void)
122} 184}
123 185
124/* 186/*
 187 * Advance a ->blkd_tasks-list pointer to the next entry, returning
 188 * NULL instead if the pointer is at the end of the list.
189 */
190static struct list_head *rcu_next_node_entry(struct task_struct *t)
191{
192 struct list_head *np;
193
194 np = t->rcu_node_entry.next;
195 if (np == &rcu_preempt_ctrlblk.blkd_tasks)
196 np = NULL;
197 return np;
198}
199
200#ifdef CONFIG_RCU_TRACE
201
202#ifdef CONFIG_RCU_BOOST
203static void rcu_initiate_boost_trace(void);
204static void rcu_initiate_exp_boost_trace(void);
205#endif /* #ifdef CONFIG_RCU_BOOST */
206
207/*
 208 * Dump additional statistics for TINY_PREEMPT_RCU.
209 */
210static void show_tiny_preempt_stats(struct seq_file *m)
211{
212 seq_printf(m, "rcu_preempt: qlen=%ld gp=%lu g%u/p%u/c%u tasks=%c%c%c\n",
213 rcu_preempt_ctrlblk.rcb.qlen,
214 rcu_preempt_ctrlblk.n_grace_periods,
215 rcu_preempt_ctrlblk.gpnum,
216 rcu_preempt_ctrlblk.gpcpu,
217 rcu_preempt_ctrlblk.completed,
218 "T."[list_empty(&rcu_preempt_ctrlblk.blkd_tasks)],
219 "N."[!rcu_preempt_ctrlblk.gp_tasks],
220 "E."[!rcu_preempt_ctrlblk.exp_tasks]);
221#ifdef CONFIG_RCU_BOOST
222 seq_printf(m, " ttb=%c btg=",
223 "B."[!rcu_preempt_ctrlblk.boost_tasks]);
224 switch (rcu_preempt_ctrlblk.boosted_this_gp) {
225 case -1:
226 seq_puts(m, "exp");
227 break;
228 case 0:
229 seq_puts(m, "no");
230 break;
231 case 1:
232 seq_puts(m, "begun");
233 break;
234 case 2:
235 seq_puts(m, "done");
236 break;
237 default:
238 seq_printf(m, "?%d?", rcu_preempt_ctrlblk.boosted_this_gp);
239 }
240 seq_printf(m, " ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n",
241 rcu_preempt_ctrlblk.n_tasks_boosted,
242 rcu_preempt_ctrlblk.n_exp_boosts,
243 rcu_preempt_ctrlblk.n_normal_boosts,
244 (int)(jiffies & 0xffff),
245 (int)(rcu_preempt_ctrlblk.boost_time & 0xffff));
246 seq_printf(m, " %s: nt=%lu gt=%lu bt=%lu b=%lu ny=%lu nos=%lu\n",
247 "normal balk",
248 rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks,
249 rcu_preempt_ctrlblk.n_normal_balk_gp_tasks,
250 rcu_preempt_ctrlblk.n_normal_balk_boost_tasks,
251 rcu_preempt_ctrlblk.n_normal_balk_boosted,
252 rcu_preempt_ctrlblk.n_normal_balk_notyet,
253 rcu_preempt_ctrlblk.n_normal_balk_nos);
254 seq_printf(m, " exp balk: bt=%lu nos=%lu\n",
255 rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks,
256 rcu_preempt_ctrlblk.n_exp_balk_nos);
257#endif /* #ifdef CONFIG_RCU_BOOST */
258}
259
260#endif /* #ifdef CONFIG_RCU_TRACE */
261
262#ifdef CONFIG_RCU_BOOST
263
264#include "rtmutex_common.h"
265
266/*
267 * Carry out RCU priority boosting on the task indicated by ->boost_tasks,
268 * and advance ->boost_tasks to the next task in the ->blkd_tasks list.
269 */
270static int rcu_boost(void)
271{
272 unsigned long flags;
273 struct rt_mutex mtx;
274 struct list_head *np;
275 struct task_struct *t;
276
277 if (rcu_preempt_ctrlblk.boost_tasks == NULL)
278 return 0; /* Nothing to boost. */
279 raw_local_irq_save(flags);
280 rcu_preempt_ctrlblk.boosted_this_gp++;
281 t = container_of(rcu_preempt_ctrlblk.boost_tasks, struct task_struct,
282 rcu_node_entry);
283 np = rcu_next_node_entry(t);
284 rt_mutex_init_proxy_locked(&mtx, t);
285 t->rcu_boost_mutex = &mtx;
286 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED;
287 raw_local_irq_restore(flags);
288 rt_mutex_lock(&mtx);
289 RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++);
290 rcu_preempt_ctrlblk.boosted_this_gp++;
291 rt_mutex_unlock(&mtx);
292 return rcu_preempt_ctrlblk.boost_tasks != NULL;
293}
294
295/*
296 * Check to see if it is now time to start boosting RCU readers blocking
297 * the current grace period, and, if so, tell the rcu_kthread_task to
298 * start boosting them. If there is an expedited boost in progress,
299 * we wait for it to complete.
300 *
301 * If there are no blocked readers blocking the current grace period,
302 * return 0 to let the caller know, otherwise return 1. Note that this
303 * return value is independent of whether or not boosting was done.
304 */
305static int rcu_initiate_boost(void)
306{
307 if (!rcu_preempt_blocked_readers_cgp()) {
308 RCU_TRACE(rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks++);
309 return 0;
310 }
311 if (rcu_preempt_ctrlblk.gp_tasks != NULL &&
312 rcu_preempt_ctrlblk.boost_tasks == NULL &&
313 rcu_preempt_ctrlblk.boosted_this_gp == 0 &&
314 ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) {
315 rcu_preempt_ctrlblk.boost_tasks = rcu_preempt_ctrlblk.gp_tasks;
316 invoke_rcu_kthread();
317 RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++);
318 } else
319 RCU_TRACE(rcu_initiate_boost_trace());
320 return 1;
321}
322
323/*
324 * Initiate boosting for an expedited grace period.
325 */
326static void rcu_initiate_expedited_boost(void)
327{
328 unsigned long flags;
329
330 raw_local_irq_save(flags);
331 if (!list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) {
332 rcu_preempt_ctrlblk.boost_tasks =
333 rcu_preempt_ctrlblk.blkd_tasks.next;
334 rcu_preempt_ctrlblk.boosted_this_gp = -1;
335 invoke_rcu_kthread();
336 RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++);
337 } else
338 RCU_TRACE(rcu_initiate_exp_boost_trace());
339 raw_local_irq_restore(flags);
340}
341
 342#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
343
344/*
345 * Do priority-boost accounting for the start of a new grace period.
346 */
347static void rcu_preempt_boost_start_gp(void)
348{
349 rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
350 if (rcu_preempt_ctrlblk.boosted_this_gp > 0)
351 rcu_preempt_ctrlblk.boosted_this_gp = 0;
352}
353
354#else /* #ifdef CONFIG_RCU_BOOST */
355
356/*
357 * If there is no RCU priority boosting, we don't boost.
358 */
359static int rcu_boost(void)
360{
361 return 0;
362}
363
364/*
365 * If there is no RCU priority boosting, we don't initiate boosting,
366 * but we do indicate whether there are blocked readers blocking the
367 * current grace period.
368 */
369static int rcu_initiate_boost(void)
370{
371 return rcu_preempt_blocked_readers_cgp();
372}
373
374/*
375 * If there is no RCU priority boosting, we don't initiate expedited boosting.
376 */
377static void rcu_initiate_expedited_boost(void)
378{
379}
380
381/*
382 * If there is no RCU priority boosting, nothing to do at grace-period start.
383 */
384static void rcu_preempt_boost_start_gp(void)
385{
386}
387
388#endif /* else #ifdef CONFIG_RCU_BOOST */
389
390/*
125 * Record a preemptible-RCU quiescent state for the specified CPU. Note 391 * Record a preemptible-RCU quiescent state for the specified CPU. Note
126 * that this just means that the task currently running on the CPU is 392 * that this just means that the task currently running on the CPU is
127 * in a quiescent state. There might be any number of tasks blocked 393 * in a quiescent state. There might be any number of tasks blocked
@@ -148,11 +414,14 @@ static void rcu_preempt_cpu_qs(void)
148 rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum; 414 rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum;
149 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; 415 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
150 416
417 /* If there is no GP then there is nothing more to do. */
418 if (!rcu_preempt_gp_in_progress())
419 return;
151 /* 420 /*
 152 * If there is no GP, or if blocked readers are still blocking GP, 421 * Check up on boosting. If there are readers blocking the
153 * then there is nothing more to do. 422 * current grace period, leave.
154 */ 423 */
155 if (!rcu_preempt_gp_in_progress() || rcu_preempt_blocked_readers_cgp()) 424 if (rcu_initiate_boost())
156 return; 425 return;
157 426
158 /* Advance callbacks. */ 427 /* Advance callbacks. */
@@ -164,9 +433,9 @@ static void rcu_preempt_cpu_qs(void)
164 if (!rcu_preempt_blocked_readers_any()) 433 if (!rcu_preempt_blocked_readers_any())
165 rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail; 434 rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail;
166 435
167 /* If there are done callbacks, make RCU_SOFTIRQ process them. */ 436 /* If there are done callbacks, cause them to be invoked. */
168 if (*rcu_preempt_ctrlblk.rcb.donetail != NULL) 437 if (*rcu_preempt_ctrlblk.rcb.donetail != NULL)
169 raise_softirq(RCU_SOFTIRQ); 438 invoke_rcu_kthread();
170} 439}
171 440
172/* 441/*
@@ -178,12 +447,16 @@ static void rcu_preempt_start_gp(void)
178 447
179 /* Official start of GP. */ 448 /* Official start of GP. */
180 rcu_preempt_ctrlblk.gpnum++; 449 rcu_preempt_ctrlblk.gpnum++;
450 RCU_TRACE(rcu_preempt_ctrlblk.n_grace_periods++);
181 451
182 /* Any blocked RCU readers block new GP. */ 452 /* Any blocked RCU readers block new GP. */
183 if (rcu_preempt_blocked_readers_any()) 453 if (rcu_preempt_blocked_readers_any())
184 rcu_preempt_ctrlblk.gp_tasks = 454 rcu_preempt_ctrlblk.gp_tasks =
185 rcu_preempt_ctrlblk.blkd_tasks.next; 455 rcu_preempt_ctrlblk.blkd_tasks.next;
186 456
457 /* Set up for RCU priority boosting. */
458 rcu_preempt_boost_start_gp();
459
187 /* If there is no running reader, CPU is done with GP. */ 460 /* If there is no running reader, CPU is done with GP. */
188 if (!rcu_preempt_running_reader()) 461 if (!rcu_preempt_running_reader())
189 rcu_preempt_cpu_qs(); 462 rcu_preempt_cpu_qs();
@@ -304,14 +577,16 @@ static void rcu_read_unlock_special(struct task_struct *t)
304 */ 577 */
305 empty = !rcu_preempt_blocked_readers_cgp(); 578 empty = !rcu_preempt_blocked_readers_cgp();
306 empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL; 579 empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL;
307 np = t->rcu_node_entry.next; 580 np = rcu_next_node_entry(t);
308 if (np == &rcu_preempt_ctrlblk.blkd_tasks)
309 np = NULL;
310 list_del(&t->rcu_node_entry); 581 list_del(&t->rcu_node_entry);
311 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks) 582 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks)
312 rcu_preempt_ctrlblk.gp_tasks = np; 583 rcu_preempt_ctrlblk.gp_tasks = np;
313 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks) 584 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks)
314 rcu_preempt_ctrlblk.exp_tasks = np; 585 rcu_preempt_ctrlblk.exp_tasks = np;
586#ifdef CONFIG_RCU_BOOST
587 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks)
588 rcu_preempt_ctrlblk.boost_tasks = np;
589#endif /* #ifdef CONFIG_RCU_BOOST */
315 INIT_LIST_HEAD(&t->rcu_node_entry); 590 INIT_LIST_HEAD(&t->rcu_node_entry);
316 591
317 /* 592 /*
@@ -331,6 +606,14 @@ static void rcu_read_unlock_special(struct task_struct *t)
331 if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL) 606 if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL)
332 rcu_report_exp_done(); 607 rcu_report_exp_done();
333 } 608 }
609#ifdef CONFIG_RCU_BOOST
610 /* Unboost self if was boosted. */
611 if (special & RCU_READ_UNLOCK_BOOSTED) {
612 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BOOSTED;
613 rt_mutex_unlock(t->rcu_boost_mutex);
614 t->rcu_boost_mutex = NULL;
615 }
616#endif /* #ifdef CONFIG_RCU_BOOST */
334 local_irq_restore(flags); 617 local_irq_restore(flags);
335} 618}
336 619
@@ -374,7 +657,7 @@ static void rcu_preempt_check_callbacks(void)
374 rcu_preempt_cpu_qs(); 657 rcu_preempt_cpu_qs();
375 if (&rcu_preempt_ctrlblk.rcb.rcucblist != 658 if (&rcu_preempt_ctrlblk.rcb.rcucblist !=
376 rcu_preempt_ctrlblk.rcb.donetail) 659 rcu_preempt_ctrlblk.rcb.donetail)
377 raise_softirq(RCU_SOFTIRQ); 660 invoke_rcu_kthread();
378 if (rcu_preempt_gp_in_progress() && 661 if (rcu_preempt_gp_in_progress() &&
379 rcu_cpu_blocking_cur_gp() && 662 rcu_cpu_blocking_cur_gp() &&
380 rcu_preempt_running_reader()) 663 rcu_preempt_running_reader())
@@ -383,7 +666,7 @@ static void rcu_preempt_check_callbacks(void)
383 666
384/* 667/*
385 * TINY_PREEMPT_RCU has an extra callback-list tail pointer to 668 * TINY_PREEMPT_RCU has an extra callback-list tail pointer to
386 * update, so this is invoked from __rcu_process_callbacks() to 669 * update, so this is invoked from rcu_process_callbacks() to
387 * handle that case. Of course, it is invoked for all flavors of 670 * handle that case. Of course, it is invoked for all flavors of
388 * RCU, but RCU callbacks can appear only on one of the lists, and 671 * RCU, but RCU callbacks can appear only on one of the lists, and
389 * neither ->nexttail nor ->donetail can possibly be NULL, so there 672 * neither ->nexttail nor ->donetail can possibly be NULL, so there
@@ -400,7 +683,7 @@ static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp)
400 */ 683 */
401static void rcu_preempt_process_callbacks(void) 684static void rcu_preempt_process_callbacks(void)
402{ 685{
403 __rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb); 686 rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb);
404} 687}
405 688
406/* 689/*
@@ -417,6 +700,7 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
417 local_irq_save(flags); 700 local_irq_save(flags);
418 *rcu_preempt_ctrlblk.nexttail = head; 701 *rcu_preempt_ctrlblk.nexttail = head;
419 rcu_preempt_ctrlblk.nexttail = &head->next; 702 rcu_preempt_ctrlblk.nexttail = &head->next;
703 RCU_TRACE(rcu_preempt_ctrlblk.rcb.qlen++);
420 rcu_preempt_start_gp(); /* checks to see if GP needed. */ 704 rcu_preempt_start_gp(); /* checks to see if GP needed. */
421 local_irq_restore(flags); 705 local_irq_restore(flags);
422} 706}
@@ -532,6 +816,7 @@ void synchronize_rcu_expedited(void)
532 816
533 /* Wait for tail of ->blkd_tasks list to drain. */ 817 /* Wait for tail of ->blkd_tasks list to drain. */
534 if (rcu_preempted_readers_exp()) 818 if (rcu_preempted_readers_exp())
819 rcu_initiate_expedited_boost();
535 wait_event(sync_rcu_preempt_exp_wq, 820 wait_event(sync_rcu_preempt_exp_wq,
536 !rcu_preempted_readers_exp()); 821 !rcu_preempted_readers_exp());
537 822
@@ -572,6 +857,27 @@ void exit_rcu(void)
572 857
573#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ 858#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */
574 859
860#ifdef CONFIG_RCU_TRACE
861
862/*
863 * Because preemptible RCU does not exist, it is not necessary to
864 * dump out its statistics.
865 */
866static void show_tiny_preempt_stats(struct seq_file *m)
867{
868}
869
870#endif /* #ifdef CONFIG_RCU_TRACE */
871
872/*
873 * Because preemptible RCU does not exist, it is never necessary to
874 * boost preempted RCU readers.
875 */
876static int rcu_boost(void)
877{
878 return 0;
879}
880
575/* 881/*
576 * Because preemptible RCU does not exist, it never has any callbacks 882 * Because preemptible RCU does not exist, it never has any callbacks
577 * to check. 883 * to check.
@@ -599,17 +905,116 @@ static void rcu_preempt_process_callbacks(void)
599#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */ 905#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */
600 906
601#ifdef CONFIG_DEBUG_LOCK_ALLOC 907#ifdef CONFIG_DEBUG_LOCK_ALLOC
602
603#include <linux/kernel_stat.h> 908#include <linux/kernel_stat.h>
604 909
605/* 910/*
606 * During boot, we forgive RCU lockdep issues. After this function is 911 * During boot, we forgive RCU lockdep issues. After this function is
607 * invoked, we start taking RCU lockdep issues seriously. 912 * invoked, we start taking RCU lockdep issues seriously.
608 */ 913 */
609void rcu_scheduler_starting(void) 914void __init rcu_scheduler_starting(void)
610{ 915{
611 WARN_ON(nr_context_switches() > 0); 916 WARN_ON(nr_context_switches() > 0);
612 rcu_scheduler_active = 1; 917 rcu_scheduler_active = 1;
613} 918}
614 919
615#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 920#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
921
922#ifdef CONFIG_RCU_BOOST
923#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
924#else /* #ifdef CONFIG_RCU_BOOST */
925#define RCU_BOOST_PRIO 1
926#endif /* #else #ifdef CONFIG_RCU_BOOST */
927
928#ifdef CONFIG_RCU_TRACE
929
930#ifdef CONFIG_RCU_BOOST
931
932static void rcu_initiate_boost_trace(void)
933{
934 if (rcu_preempt_ctrlblk.gp_tasks == NULL)
935 rcu_preempt_ctrlblk.n_normal_balk_gp_tasks++;
936 else if (rcu_preempt_ctrlblk.boost_tasks != NULL)
937 rcu_preempt_ctrlblk.n_normal_balk_boost_tasks++;
938 else if (rcu_preempt_ctrlblk.boosted_this_gp != 0)
939 rcu_preempt_ctrlblk.n_normal_balk_boosted++;
940 else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))
941 rcu_preempt_ctrlblk.n_normal_balk_notyet++;
942 else
943 rcu_preempt_ctrlblk.n_normal_balk_nos++;
944}
945
946static void rcu_initiate_exp_boost_trace(void)
947{
948 if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks))
949 rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks++;
950 else
951 rcu_preempt_ctrlblk.n_exp_balk_nos++;
952}
953
954#endif /* #ifdef CONFIG_RCU_BOOST */
955
956static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n)
957{
958 unsigned long flags;
959
960 raw_local_irq_save(flags);
961 rcp->qlen -= n;
962 raw_local_irq_restore(flags);
963}
964
965/*
966 * Dump statistics for TINY_RCU, such as they are.
967 */
968static int show_tiny_stats(struct seq_file *m, void *unused)
969{
970 show_tiny_preempt_stats(m);
971 seq_printf(m, "rcu_sched: qlen: %ld\n", rcu_sched_ctrlblk.qlen);
972 seq_printf(m, "rcu_bh: qlen: %ld\n", rcu_bh_ctrlblk.qlen);
973 return 0;
974}
975
976static int show_tiny_stats_open(struct inode *inode, struct file *file)
977{
978 return single_open(file, show_tiny_stats, NULL);
979}
980
981static const struct file_operations show_tiny_stats_fops = {
982 .owner = THIS_MODULE,
983 .open = show_tiny_stats_open,
984 .read = seq_read,
985 .llseek = seq_lseek,
986 .release = single_release,
987};
988
989static struct dentry *rcudir;
990
991static int __init rcutiny_trace_init(void)
992{
993 struct dentry *retval;
994
995 rcudir = debugfs_create_dir("rcu", NULL);
996 if (!rcudir)
997 goto free_out;
998 retval = debugfs_create_file("rcudata", 0444, rcudir,
999 NULL, &show_tiny_stats_fops);
1000 if (!retval)
1001 goto free_out;
1002 return 0;
1003free_out:
1004 debugfs_remove_recursive(rcudir);
1005 return 1;
1006}
1007
1008static void __exit rcutiny_trace_cleanup(void)
1009{
1010 debugfs_remove_recursive(rcudir);
1011}
1012
1013module_init(rcutiny_trace_init);
1014module_exit(rcutiny_trace_cleanup);
1015
1016MODULE_AUTHOR("Paul E. McKenney");
1017MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation");
1018MODULE_LICENSE("GPL");
1019
1020#endif /* #ifdef CONFIG_RCU_TRACE */
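For readers unfamiliar with the debugfs/seq_file boilerplate the new rcutiny trace code relies on, here is a hedged stand-alone sketch of the same pattern using made-up example_* names. Assuming debugfs is mounted in its usual place, the file added by this patch would then be read as /sys/kernel/debug/rcu/rcudata.

#include <linux/debugfs.h>
#include <linux/fs.h>
#include <linux/module.h>
#include <linux/seq_file.h>

static struct dentry *example_dir;

/* seq_file "show" callback: emit one snapshot of the statistics. */
static int example_show(struct seq_file *m, void *unused)
{
        seq_printf(m, "example: qlen=%d\n", 0);
        return 0;
}

static int example_open(struct inode *inode, struct file *file)
{
        return single_open(file, example_show, NULL);
}

static const struct file_operations example_fops = {
        .owner   = THIS_MODULE,
        .open    = example_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = single_release,
};

static int __init example_init(void)
{
        example_dir = debugfs_create_dir("example", NULL);
        if (!example_dir)
                return -ENOMEM;
        if (!debugfs_create_file("stats", 0444, example_dir, NULL, &example_fops)) {
                debugfs_remove_recursive(example_dir);
                return -ENOMEM;
        }
        return 0;
}

static void __exit example_exit(void)
{
        debugfs_remove_recursive(example_dir);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");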
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 9d8e8fb2515f..89613f97ff26 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -47,6 +47,7 @@
47#include <linux/srcu.h> 47#include <linux/srcu.h>
48#include <linux/slab.h> 48#include <linux/slab.h>
49#include <asm/byteorder.h> 49#include <asm/byteorder.h>
50#include <linux/sched.h>
50 51
51MODULE_LICENSE("GPL"); 52MODULE_LICENSE("GPL");
52MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " 53MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and "
@@ -64,6 +65,9 @@ static int irqreader = 1; /* RCU readers from irq (timers). */
64static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */ 65static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */
65static int fqs_holdoff = 0; /* Hold time within burst (us). */ 66static int fqs_holdoff = 0; /* Hold time within burst (us). */
66static int fqs_stutter = 3; /* Wait time between bursts (s). */ 67static int fqs_stutter = 3; /* Wait time between bursts (s). */
68static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */
69static int test_boost_interval = 7; /* Interval between boost tests, seconds. */
70static int test_boost_duration = 4; /* Duration of each boost test, seconds. */
67static char *torture_type = "rcu"; /* What RCU implementation to torture. */ 71static char *torture_type = "rcu"; /* What RCU implementation to torture. */
68 72
69module_param(nreaders, int, 0444); 73module_param(nreaders, int, 0444);
@@ -88,6 +92,12 @@ module_param(fqs_holdoff, int, 0444);
88MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); 92MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)");
89module_param(fqs_stutter, int, 0444); 93module_param(fqs_stutter, int, 0444);
90MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); 94MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
95module_param(test_boost, int, 0444);
96MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
97module_param(test_boost_interval, int, 0444);
98MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds.");
99module_param(test_boost_duration, int, 0444);
100MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds.");
91module_param(torture_type, charp, 0444); 101module_param(torture_type, charp, 0444);
92MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); 102MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)");
93 103
@@ -109,6 +119,7 @@ static struct task_struct *stats_task;
109static struct task_struct *shuffler_task; 119static struct task_struct *shuffler_task;
110static struct task_struct *stutter_task; 120static struct task_struct *stutter_task;
111static struct task_struct *fqs_task; 121static struct task_struct *fqs_task;
122static struct task_struct *boost_tasks[NR_CPUS];
112 123
113#define RCU_TORTURE_PIPE_LEN 10 124#define RCU_TORTURE_PIPE_LEN 10
114 125
@@ -134,6 +145,12 @@ static atomic_t n_rcu_torture_alloc_fail;
134static atomic_t n_rcu_torture_free; 145static atomic_t n_rcu_torture_free;
135static atomic_t n_rcu_torture_mberror; 146static atomic_t n_rcu_torture_mberror;
136static atomic_t n_rcu_torture_error; 147static atomic_t n_rcu_torture_error;
148static long n_rcu_torture_boost_ktrerror;
149static long n_rcu_torture_boost_rterror;
150static long n_rcu_torture_boost_allocerror;
151static long n_rcu_torture_boost_afferror;
152static long n_rcu_torture_boost_failure;
153static long n_rcu_torture_boosts;
137static long n_rcu_torture_timers; 154static long n_rcu_torture_timers;
138static struct list_head rcu_torture_removed; 155static struct list_head rcu_torture_removed;
139static cpumask_var_t shuffle_tmp_mask; 156static cpumask_var_t shuffle_tmp_mask;
@@ -147,6 +164,16 @@ static int stutter_pause_test;
147#endif 164#endif
148int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; 165int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
149 166
167#ifdef CONFIG_RCU_BOOST
168#define rcu_can_boost() 1
169#else /* #ifdef CONFIG_RCU_BOOST */
170#define rcu_can_boost() 0
171#endif /* #else #ifdef CONFIG_RCU_BOOST */
172
173static unsigned long boost_starttime; /* jiffies of next boost test start. */
174DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
175 /* and boost task create/destroy. */
176
150/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ 177/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */
151 178
152#define FULLSTOP_DONTSTOP 0 /* Normal operation. */ 179#define FULLSTOP_DONTSTOP 0 /* Normal operation. */
@@ -277,6 +304,7 @@ struct rcu_torture_ops {
277 void (*fqs)(void); 304 void (*fqs)(void);
278 int (*stats)(char *page); 305 int (*stats)(char *page);
279 int irq_capable; 306 int irq_capable;
307 int can_boost;
280 char *name; 308 char *name;
281}; 309};
282 310
@@ -366,6 +394,7 @@ static struct rcu_torture_ops rcu_ops = {
366 .fqs = rcu_force_quiescent_state, 394 .fqs = rcu_force_quiescent_state,
367 .stats = NULL, 395 .stats = NULL,
368 .irq_capable = 1, 396 .irq_capable = 1,
397 .can_boost = rcu_can_boost(),
369 .name = "rcu" 398 .name = "rcu"
370}; 399};
371 400
@@ -408,6 +437,7 @@ static struct rcu_torture_ops rcu_sync_ops = {
408 .fqs = rcu_force_quiescent_state, 437 .fqs = rcu_force_quiescent_state,
409 .stats = NULL, 438 .stats = NULL,
410 .irq_capable = 1, 439 .irq_capable = 1,
440 .can_boost = rcu_can_boost(),
411 .name = "rcu_sync" 441 .name = "rcu_sync"
412}; 442};
413 443
@@ -424,6 +454,7 @@ static struct rcu_torture_ops rcu_expedited_ops = {
424 .fqs = rcu_force_quiescent_state, 454 .fqs = rcu_force_quiescent_state,
425 .stats = NULL, 455 .stats = NULL,
426 .irq_capable = 1, 456 .irq_capable = 1,
457 .can_boost = rcu_can_boost(),
427 .name = "rcu_expedited" 458 .name = "rcu_expedited"
428}; 459};
429 460
@@ -684,6 +715,110 @@ static struct rcu_torture_ops sched_expedited_ops = {
684}; 715};
685 716
686/* 717/*
718 * RCU torture priority-boost testing. Runs one real-time thread per
719 * CPU for moderate bursts, repeatedly registering RCU callbacks and
720 * spinning waiting for them to be invoked. If a given callback takes
721 * too long to be invoked, we assume that priority inversion has occurred.
722 */
723
724struct rcu_boost_inflight {
725 struct rcu_head rcu;
726 int inflight;
727};
728
729static void rcu_torture_boost_cb(struct rcu_head *head)
730{
731 struct rcu_boost_inflight *rbip =
732 container_of(head, struct rcu_boost_inflight, rcu);
733
734 smp_mb(); /* Ensure RCU-core accesses precede clearing ->inflight */
735 rbip->inflight = 0;
736}
737
738static int rcu_torture_boost(void *arg)
739{
740 unsigned long call_rcu_time;
741 unsigned long endtime;
742 unsigned long oldstarttime;
743 struct rcu_boost_inflight rbi = { .inflight = 0 };
744 struct sched_param sp;
745
746 VERBOSE_PRINTK_STRING("rcu_torture_boost started");
747
748 /* Set real-time priority. */
749 sp.sched_priority = 1;
750 if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) {
751 VERBOSE_PRINTK_STRING("rcu_torture_boost RT prio failed!");
752 n_rcu_torture_boost_rterror++;
753 }
754
755 /* Each pass through the following loop does one boost-test cycle. */
756 do {
757 /* Wait for the next test interval. */
758 oldstarttime = boost_starttime;
759 while (jiffies - oldstarttime > ULONG_MAX / 2) {
760 schedule_timeout_uninterruptible(1);
761 rcu_stutter_wait("rcu_torture_boost");
762 if (kthread_should_stop() ||
763 fullstop != FULLSTOP_DONTSTOP)
764 goto checkwait;
765 }
766
767 /* Do one boost-test interval. */
768 endtime = oldstarttime + test_boost_duration * HZ;
769 call_rcu_time = jiffies;
770 while (jiffies - endtime > ULONG_MAX / 2) {
771 /* If we don't have a callback in flight, post one. */
772 if (!rbi.inflight) {
773 smp_mb(); /* RCU core before ->inflight = 1. */
774 rbi.inflight = 1;
775 call_rcu(&rbi.rcu, rcu_torture_boost_cb);
776 if (jiffies - call_rcu_time >
777 test_boost_duration * HZ - HZ / 2) {
778 VERBOSE_PRINTK_STRING("rcu_torture_boost boosting failed");
779 n_rcu_torture_boost_failure++;
780 }
781 call_rcu_time = jiffies;
782 }
783 cond_resched();
784 rcu_stutter_wait("rcu_torture_boost");
785 if (kthread_should_stop() ||
786 fullstop != FULLSTOP_DONTSTOP)
787 goto checkwait;
788 }
789
790 /*
791 * Set the start time of the next test interval.
792 * Yes, this is vulnerable to long delays, but such
793 * delays simply cause a false negative for the next
794 * interval. Besides, we are running at RT priority,
795 * so delays should be relatively rare.
796 */
797 while (oldstarttime == boost_starttime) {
798 if (mutex_trylock(&boost_mutex)) {
799 boost_starttime = jiffies +
800 test_boost_interval * HZ;
801 n_rcu_torture_boosts++;
802 mutex_unlock(&boost_mutex);
803 break;
804 }
805 schedule_timeout_uninterruptible(1);
806 }
807
808 /* Go do the stutter. */
809checkwait: rcu_stutter_wait("rcu_torture_boost");
810 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
811
812 /* Clean up and exit. */
813 VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping");
814 rcutorture_shutdown_absorb("rcu_torture_boost");
815 while (!kthread_should_stop() || rbi.inflight)
816 schedule_timeout_uninterruptible(1);
817 smp_mb(); /* order accesses to ->inflight before stack-frame death. */
818 return 0;
819}
820
821/*
687 * RCU torture force-quiescent-state kthread. Repeatedly induces 822 * RCU torture force-quiescent-state kthread. Repeatedly induces
688 * bursts of calls to force_quiescent_state(), increasing the probability 823 * bursts of calls to force_quiescent_state(), increasing the probability
689 * of occurrence of some important types of race conditions. 824 * of occurrence of some important types of race conditions.
@@ -933,7 +1068,8 @@ rcu_torture_printk(char *page)
933 cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); 1068 cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG);
934 cnt += sprintf(&page[cnt], 1069 cnt += sprintf(&page[cnt],
935 "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " 1070 "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d "
936 "rtmbe: %d nt: %ld", 1071 "rtmbe: %d rtbke: %ld rtbre: %ld rtbae: %ld rtbafe: %ld "
1072 "rtbf: %ld rtb: %ld nt: %ld",
937 rcu_torture_current, 1073 rcu_torture_current,
938 rcu_torture_current_version, 1074 rcu_torture_current_version,
939 list_empty(&rcu_torture_freelist), 1075 list_empty(&rcu_torture_freelist),
@@ -941,8 +1077,19 @@ rcu_torture_printk(char *page)
941 atomic_read(&n_rcu_torture_alloc_fail), 1077 atomic_read(&n_rcu_torture_alloc_fail),
942 atomic_read(&n_rcu_torture_free), 1078 atomic_read(&n_rcu_torture_free),
943 atomic_read(&n_rcu_torture_mberror), 1079 atomic_read(&n_rcu_torture_mberror),
1080 n_rcu_torture_boost_ktrerror,
1081 n_rcu_torture_boost_rterror,
1082 n_rcu_torture_boost_allocerror,
1083 n_rcu_torture_boost_afferror,
1084 n_rcu_torture_boost_failure,
1085 n_rcu_torture_boosts,
944 n_rcu_torture_timers); 1086 n_rcu_torture_timers);
945 if (atomic_read(&n_rcu_torture_mberror) != 0) 1087 if (atomic_read(&n_rcu_torture_mberror) != 0 ||
1088 n_rcu_torture_boost_ktrerror != 0 ||
1089 n_rcu_torture_boost_rterror != 0 ||
1090 n_rcu_torture_boost_allocerror != 0 ||
1091 n_rcu_torture_boost_afferror != 0 ||
1092 n_rcu_torture_boost_failure != 0)
946 cnt += sprintf(&page[cnt], " !!!"); 1093 cnt += sprintf(&page[cnt], " !!!");
947 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); 1094 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
948 if (i > 1) { 1095 if (i > 1) {
@@ -1094,22 +1241,91 @@ rcu_torture_stutter(void *arg)
1094} 1241}
1095 1242
1096static inline void 1243static inline void
1097rcu_torture_print_module_parms(char *tag) 1244rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag)
1098{ 1245{
1099 printk(KERN_ALERT "%s" TORTURE_FLAG 1246 printk(KERN_ALERT "%s" TORTURE_FLAG
1100 "--- %s: nreaders=%d nfakewriters=%d " 1247 "--- %s: nreaders=%d nfakewriters=%d "
1101 "stat_interval=%d verbose=%d test_no_idle_hz=%d " 1248 "stat_interval=%d verbose=%d test_no_idle_hz=%d "
1102 "shuffle_interval=%d stutter=%d irqreader=%d " 1249 "shuffle_interval=%d stutter=%d irqreader=%d "
1103 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d\n", 1250 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d "
1251 "test_boost=%d/%d test_boost_interval=%d "
1252 "test_boost_duration=%d\n",
1104 torture_type, tag, nrealreaders, nfakewriters, 1253 torture_type, tag, nrealreaders, nfakewriters,
1105 stat_interval, verbose, test_no_idle_hz, shuffle_interval, 1254 stat_interval, verbose, test_no_idle_hz, shuffle_interval,
1106 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter); 1255 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
1256 test_boost, cur_ops->can_boost,
1257 test_boost_interval, test_boost_duration);
1107} 1258}
1108 1259
1109static struct notifier_block rcutorture_nb = { 1260static struct notifier_block rcutorture_shutdown_nb = {
1110 .notifier_call = rcutorture_shutdown_notify, 1261 .notifier_call = rcutorture_shutdown_notify,
1111}; 1262};
1112 1263
1264static void rcutorture_booster_cleanup(int cpu)
1265{
1266 struct task_struct *t;
1267
1268 if (boost_tasks[cpu] == NULL)
1269 return;
1270 mutex_lock(&boost_mutex);
1271 VERBOSE_PRINTK_STRING("Stopping rcu_torture_boost task");
1272 t = boost_tasks[cpu];
1273 boost_tasks[cpu] = NULL;
1274 mutex_unlock(&boost_mutex);
1275
1276 /* This must be outside of the mutex, otherwise deadlock! */
1277 kthread_stop(t);
1278}
1279
1280static int rcutorture_booster_init(int cpu)
1281{
1282 int retval;
1283
1284 if (boost_tasks[cpu] != NULL)
1285 return 0; /* Already created, nothing more to do. */
1286
1287 /* Don't allow time recalculation while creating a new task. */
1288 mutex_lock(&boost_mutex);
1289 VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task");
1290 boost_tasks[cpu] = kthread_create(rcu_torture_boost, NULL,
1291 "rcu_torture_boost");
1292 if (IS_ERR(boost_tasks[cpu])) {
1293 retval = PTR_ERR(boost_tasks[cpu]);
1294 VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed");
1295 n_rcu_torture_boost_ktrerror++;
1296 boost_tasks[cpu] = NULL;
1297 mutex_unlock(&boost_mutex);
1298 return retval;
1299 }
1300 kthread_bind(boost_tasks[cpu], cpu);
1301 wake_up_process(boost_tasks[cpu]);
1302 mutex_unlock(&boost_mutex);
1303 return 0;
1304}
1305
1306static int rcutorture_cpu_notify(struct notifier_block *self,
1307 unsigned long action, void *hcpu)
1308{
1309 long cpu = (long)hcpu;
1310
1311 switch (action) {
1312 case CPU_ONLINE:
1313 case CPU_DOWN_FAILED:
1314 (void)rcutorture_booster_init(cpu);
1315 break;
1316 case CPU_DOWN_PREPARE:
1317 rcutorture_booster_cleanup(cpu);
1318 break;
1319 default:
1320 break;
1321 }
1322 return NOTIFY_OK;
1323}
1324
1325static struct notifier_block rcutorture_cpu_nb = {
1326 .notifier_call = rcutorture_cpu_notify,
1327};
1328
1113static void 1329static void
1114rcu_torture_cleanup(void) 1330rcu_torture_cleanup(void)
1115{ 1331{
@@ -1127,7 +1343,7 @@ rcu_torture_cleanup(void)
1127 } 1343 }
1128 fullstop = FULLSTOP_RMMOD; 1344 fullstop = FULLSTOP_RMMOD;
1129 mutex_unlock(&fullstop_mutex); 1345 mutex_unlock(&fullstop_mutex);
1130 unregister_reboot_notifier(&rcutorture_nb); 1346 unregister_reboot_notifier(&rcutorture_shutdown_nb);
1131 if (stutter_task) { 1347 if (stutter_task) {
1132 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); 1348 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task");
1133 kthread_stop(stutter_task); 1349 kthread_stop(stutter_task);
@@ -1184,6 +1400,12 @@ rcu_torture_cleanup(void)
1184 kthread_stop(fqs_task); 1400 kthread_stop(fqs_task);
1185 } 1401 }
1186 fqs_task = NULL; 1402 fqs_task = NULL;
1403 if ((test_boost == 1 && cur_ops->can_boost) ||
1404 test_boost == 2) {
1405 unregister_cpu_notifier(&rcutorture_cpu_nb);
1406 for_each_possible_cpu(i)
1407 rcutorture_booster_cleanup(i);
1408 }
1187 1409
1188 /* Wait for all RCU callbacks to fire. */ 1410 /* Wait for all RCU callbacks to fire. */
1189 1411
@@ -1195,9 +1417,9 @@ rcu_torture_cleanup(void)
1195 if (cur_ops->cleanup) 1417 if (cur_ops->cleanup)
1196 cur_ops->cleanup(); 1418 cur_ops->cleanup();
1197 if (atomic_read(&n_rcu_torture_error)) 1419 if (atomic_read(&n_rcu_torture_error))
1198 rcu_torture_print_module_parms("End of test: FAILURE"); 1420 rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE");
1199 else 1421 else
1200 rcu_torture_print_module_parms("End of test: SUCCESS"); 1422 rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS");
1201} 1423}
1202 1424
1203static int __init 1425static int __init
@@ -1242,7 +1464,7 @@ rcu_torture_init(void)
1242 nrealreaders = nreaders; 1464 nrealreaders = nreaders;
1243 else 1465 else
1244 nrealreaders = 2 * num_online_cpus(); 1466 nrealreaders = 2 * num_online_cpus();
1245 rcu_torture_print_module_parms("Start of test"); 1467 rcu_torture_print_module_parms(cur_ops, "Start of test");
1246 fullstop = FULLSTOP_DONTSTOP; 1468 fullstop = FULLSTOP_DONTSTOP;
1247 1469
1248 /* Set up the freelist. */ 1470 /* Set up the freelist. */
@@ -1263,6 +1485,12 @@ rcu_torture_init(void)
1263 atomic_set(&n_rcu_torture_free, 0); 1485 atomic_set(&n_rcu_torture_free, 0);
1264 atomic_set(&n_rcu_torture_mberror, 0); 1486 atomic_set(&n_rcu_torture_mberror, 0);
1265 atomic_set(&n_rcu_torture_error, 0); 1487 atomic_set(&n_rcu_torture_error, 0);
1488 n_rcu_torture_boost_ktrerror = 0;
1489 n_rcu_torture_boost_rterror = 0;
1490 n_rcu_torture_boost_allocerror = 0;
1491 n_rcu_torture_boost_afferror = 0;
1492 n_rcu_torture_boost_failure = 0;
1493 n_rcu_torture_boosts = 0;
1266 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) 1494 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
1267 atomic_set(&rcu_torture_wcount[i], 0); 1495 atomic_set(&rcu_torture_wcount[i], 0);
1268 for_each_possible_cpu(cpu) { 1496 for_each_possible_cpu(cpu) {
@@ -1376,7 +1604,27 @@ rcu_torture_init(void)
1376 goto unwind; 1604 goto unwind;
1377 } 1605 }
1378 } 1606 }
1379 register_reboot_notifier(&rcutorture_nb); 1607 if (test_boost_interval < 1)
1608 test_boost_interval = 1;
1609 if (test_boost_duration < 2)
1610 test_boost_duration = 2;
1611 if ((test_boost == 1 && cur_ops->can_boost) ||
1612 test_boost == 2) {
1613 int retval;
1614
1615 boost_starttime = jiffies + test_boost_interval * HZ;
1616 register_cpu_notifier(&rcutorture_cpu_nb);
1617 for_each_possible_cpu(i) {
1618 if (cpu_is_offline(i))
1619 continue; /* Heuristic: CPU can go offline. */
1620 retval = rcutorture_booster_init(i);
1621 if (retval < 0) {
1622 firsterr = retval;
1623 goto unwind;
1624 }
1625 }
1626 }
1627 register_reboot_notifier(&rcutorture_shutdown_nb);
1380 mutex_unlock(&fullstop_mutex); 1628 mutex_unlock(&fullstop_mutex);
1381 return 0; 1629 return 0;
1382 1630
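A hedged distillation of the measurement trick rcu_torture_boost() uses above: keep at most one callback in flight, and treat an overly long gap between posting it and its invocation as evidence that priority boosting failed. The example_* names and the return convention are illustrative, not part of the patch.

#include <linux/jiffies.h>
#include <linux/kernel.h>
#include <linux/rcupdate.h>

struct example_inflight {
        struct rcu_head rcu;
        int inflight;
};

/* RCU callback: runs after a grace period and marks the slot free again. */
static void example_cb(struct rcu_head *head)
{
        struct example_inflight *p = container_of(head, struct example_inflight, rcu);

        smp_mb();       /* order prior RCU-core accesses before clearing ->inflight */
        p->inflight = 0;
}

/*
 * Post a fresh callback if the previous one has completed and return 0 (the
 * caller should then record jiffies as the new post time); otherwise return
 * how long the outstanding callback has been waiting, so the caller can flag
 * a stall much as the boost-test loop above does.
 */
static unsigned long example_kick(struct example_inflight *p, unsigned long posted_at)
{
        if (!p->inflight) {
                smp_mb();       /* order prior accesses before setting ->inflight */
                p->inflight = 1;
                call_rcu(&p->rcu, example_cb);
                return 0;
        }
        return jiffies - posted_at;
}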
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index ccdc04c47981..dd4aea806f8e 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -67,9 +67,6 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
67 .gpnum = -300, \ 67 .gpnum = -300, \
68 .completed = -300, \ 68 .completed = -300, \
69 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \ 69 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \
70 .orphan_cbs_list = NULL, \
71 .orphan_cbs_tail = &structname.orphan_cbs_list, \
72 .orphan_qlen = 0, \
73 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \ 70 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \
74 .n_force_qs = 0, \ 71 .n_force_qs = 0, \
75 .n_force_qs_ngp = 0, \ 72 .n_force_qs_ngp = 0, \
@@ -367,8 +364,8 @@ void rcu_irq_exit(void)
367 WARN_ON_ONCE(rdtp->dynticks & 0x1); 364 WARN_ON_ONCE(rdtp->dynticks & 0x1);
368 365
369 /* If the interrupt queued a callback, get out of dyntick mode. */ 366 /* If the interrupt queued a callback, get out of dyntick mode. */
370 if (__get_cpu_var(rcu_sched_data).nxtlist || 367 if (__this_cpu_read(rcu_sched_data.nxtlist) ||
371 __get_cpu_var(rcu_bh_data).nxtlist) 368 __this_cpu_read(rcu_bh_data.nxtlist))
372 set_need_resched(); 369 set_need_resched();
373} 370}
374 371
@@ -620,9 +617,17 @@ static void __init check_cpu_stall_init(void)
620static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) 617static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
621{ 618{
622 if (rdp->gpnum != rnp->gpnum) { 619 if (rdp->gpnum != rnp->gpnum) {
623 rdp->qs_pending = 1; 620 /*
624 rdp->passed_quiesc = 0; 621 * If the current grace period is waiting for this CPU,
622 * set up to detect a quiescent state, otherwise don't
623 * go looking for one.
624 */
625 rdp->gpnum = rnp->gpnum; 625 rdp->gpnum = rnp->gpnum;
626 if (rnp->qsmask & rdp->grpmask) {
627 rdp->qs_pending = 1;
628 rdp->passed_quiesc = 0;
629 } else
630 rdp->qs_pending = 0;
626 } 631 }
627} 632}
628 633
@@ -681,6 +686,24 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat
681 686
682 /* Remember that we saw this grace-period completion. */ 687 /* Remember that we saw this grace-period completion. */
683 rdp->completed = rnp->completed; 688 rdp->completed = rnp->completed;
689
690 /*
691 * If we were in an extended quiescent state, we may have
 692 * missed some grace periods that other CPUs handled on
693 * our behalf. Catch up with this state to avoid noting
694 * spurious new grace periods. If another grace period
695 * has started, then rnp->gpnum will have advanced, so
696 * we will detect this later on.
697 */
698 if (ULONG_CMP_LT(rdp->gpnum, rdp->completed))
699 rdp->gpnum = rdp->completed;
700
701 /*
702 * If RCU does not need a quiescent state from this CPU,
703 * then make sure that this CPU doesn't go looking for one.
704 */
705 if ((rnp->qsmask & rdp->grpmask) == 0)
706 rdp->qs_pending = 0;
684 } 707 }
685} 708}
686 709
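The catch-up test above leans on ULONG_CMP_LT(), a wraparound-safe ordering check for free-running grace-period counters. A common way to express the same idea is to look at which half of the unsigned range the difference lands in; the stand-alone demo below is an illustrative analogue, not the kernel's exact macro:

#include <assert.h>
#include <limits.h>
#include <stdio.h>

/*
 * Wraparound-safe "a is before b" test for free-running unsigned counters,
 * in the spirit of ULONG_CMP_LT() above: the difference lands in the upper
 * half of the unsigned range exactly when a is (modularly) behind b.
 */
#define CMP_LT(a, b)    ((unsigned long)((a) - (b)) > ULONG_MAX / 2)

int main(void)
{
        unsigned long completed = 5;
        unsigned long stale = ULONG_MAX - 2;    /* 8 grace periods behind 5, across the wrap */

        assert(CMP_LT(3UL, 5UL));               /* ordinary case */
        assert(!CMP_LT(5UL, 3UL));
        assert(CMP_LT(stale, completed));       /* still detected as "behind" */
        assert(!(stale < completed));           /* a plain < gets this one wrong */
        printf("wraparound-safe comparison ok\n");
        return 0;
}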
@@ -984,53 +1007,31 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
984#ifdef CONFIG_HOTPLUG_CPU 1007#ifdef CONFIG_HOTPLUG_CPU
985 1008
986/* 1009/*
987 * Move a dying CPU's RCU callbacks to the ->orphan_cbs_list for the 1010 * Move a dying CPU's RCU callbacks to online CPU's callback list.
988 * specified flavor of RCU. The callbacks will be adopted by the next 1011 * Synchronization is not required because this function executes
989 * _rcu_barrier() invocation or by the CPU_DEAD notifier, whichever 1012 * in stop_machine() context.
990 * comes first. Because this is invoked from the CPU_DYING notifier,
991 * irqs are already disabled.
992 */ 1013 */
993static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) 1014static void rcu_send_cbs_to_online(struct rcu_state *rsp)
994{ 1015{
995 int i; 1016 int i;
1017 /* The dying CPU has already been cleared from cpu_online_mask. */
1018 int receive_cpu = cpumask_any(cpu_online_mask);
996 struct rcu_data *rdp = this_cpu_ptr(rsp->rda); 1019 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
1020 struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu);
997 1021
998 if (rdp->nxtlist == NULL) 1022 if (rdp->nxtlist == NULL)
999 return; /* irqs disabled, so comparison is stable. */ 1023 return; /* irqs disabled, so comparison is stable. */
1000 raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ 1024
1001 *rsp->orphan_cbs_tail = rdp->nxtlist; 1025 *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist;
1002 rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL]; 1026 receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
1027 receive_rdp->qlen += rdp->qlen;
1028 receive_rdp->n_cbs_adopted += rdp->qlen;
1029 rdp->n_cbs_orphaned += rdp->qlen;
1030
1003 rdp->nxtlist = NULL; 1031 rdp->nxtlist = NULL;
1004 for (i = 0; i < RCU_NEXT_SIZE; i++) 1032 for (i = 0; i < RCU_NEXT_SIZE; i++)
1005 rdp->nxttail[i] = &rdp->nxtlist; 1033 rdp->nxttail[i] = &rdp->nxtlist;
1006 rsp->orphan_qlen += rdp->qlen;
1007 rdp->n_cbs_orphaned += rdp->qlen;
1008 rdp->qlen = 0; 1034 rdp->qlen = 0;
1009 raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
1010}
1011
1012/*
1013 * Adopt previously orphaned RCU callbacks.
1014 */
1015static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
1016{
1017 unsigned long flags;
1018 struct rcu_data *rdp;
1019
1020 raw_spin_lock_irqsave(&rsp->onofflock, flags);
1021 rdp = this_cpu_ptr(rsp->rda);
1022 if (rsp->orphan_cbs_list == NULL) {
1023 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
1024 return;
1025 }
1026 *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list;
1027 rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail;
1028 rdp->qlen += rsp->orphan_qlen;
1029 rdp->n_cbs_adopted += rsp->orphan_qlen;
1030 rsp->orphan_cbs_list = NULL;
1031 rsp->orphan_cbs_tail = &rsp->orphan_cbs_list;
1032 rsp->orphan_qlen = 0;
1033 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
1034} 1035}
1035 1036
1036/* 1037/*
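rcu_send_cbs_to_online() can splice the dying CPU's entire callback list onto the receiver in O(1) because each list is kept as a head pointer plus a pointer to the final ->next slot. The self-contained sketch below reproduces that head/tail-pointer splice; struct cblist and its helpers are invented names, not kernel types:

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

struct cb {
        struct cb *next;
        int id;
};

/*
 * A callback list kept the way rcu_data keeps nxtlist/nxttail: a head
 * pointer plus a pointer to the last ->next slot, so a whole list can
 * be appended in constant time.
 */
struct cblist {
        struct cb *head;
        struct cb **tail;
};

static void cblist_init(struct cblist *l)
{
        l->head = NULL;
        l->tail = &l->head;
}

static void cblist_enqueue(struct cblist *l, struct cb *cb)
{
        cb->next = NULL;
        *l->tail = cb;
        l->tail = &cb->next;
}

/* Move every callback from @from onto the end of @to, then empty @from. */
static void cblist_splice(struct cblist *to, struct cblist *from)
{
        if (!from->head)
                return;
        *to->tail = from->head;
        to->tail = from->tail;
        cblist_init(from);
}

int main(void)
{
        struct cblist dying, online;
        struct cb a = { .id = 1 }, b = { .id = 2 }, c = { .id = 3 };

        cblist_init(&dying);
        cblist_init(&online);
        cblist_enqueue(&online, &a);
        cblist_enqueue(&dying, &b);
        cblist_enqueue(&dying, &c);

        cblist_splice(&online, &dying);
        assert(online.head == &a && a.next == &b && b.next == &c && c.next == NULL);
        assert(dying.head == NULL && dying.tail == &dying.head);
        printf("spliced: %d %d %d\n", a.id, b.id, c.id);
        return 0;
}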
@@ -1081,8 +1082,6 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
1081 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1082 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1082 if (need_report & RCU_OFL_TASKS_EXP_GP) 1083 if (need_report & RCU_OFL_TASKS_EXP_GP)
1083 rcu_report_exp_rnp(rsp, rnp); 1084 rcu_report_exp_rnp(rsp, rnp);
1084
1085 rcu_adopt_orphan_cbs(rsp);
1086} 1085}
1087 1086
1088/* 1087/*
@@ -1100,11 +1099,7 @@ static void rcu_offline_cpu(int cpu)
1100 1099
1101#else /* #ifdef CONFIG_HOTPLUG_CPU */ 1100#else /* #ifdef CONFIG_HOTPLUG_CPU */
1102 1101
1103static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) 1102static void rcu_send_cbs_to_online(struct rcu_state *rsp)
1104{
1105}
1106
1107static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
1108{ 1103{
1109} 1104}
1110 1105
@@ -1440,22 +1435,11 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1440 */ 1435 */
1441 local_irq_save(flags); 1436 local_irq_save(flags);
1442 rdp = this_cpu_ptr(rsp->rda); 1437 rdp = this_cpu_ptr(rsp->rda);
1443 rcu_process_gp_end(rsp, rdp);
1444 check_for_new_grace_period(rsp, rdp);
1445 1438
1446 /* Add the callback to our list. */ 1439 /* Add the callback to our list. */
1447 *rdp->nxttail[RCU_NEXT_TAIL] = head; 1440 *rdp->nxttail[RCU_NEXT_TAIL] = head;
1448 rdp->nxttail[RCU_NEXT_TAIL] = &head->next; 1441 rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
1449 1442
1450 /* Start a new grace period if one not already started. */
1451 if (!rcu_gp_in_progress(rsp)) {
1452 unsigned long nestflag;
1453 struct rcu_node *rnp_root = rcu_get_root(rsp);
1454
1455 raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
1456 rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */
1457 }
1458
1459 /* 1443 /*
1460 * Force the grace period if too many callbacks or too long waiting. 1444 * Force the grace period if too many callbacks or too long waiting.
1461 * Enforce hysteresis, and don't invoke force_quiescent_state() 1445 * Enforce hysteresis, and don't invoke force_quiescent_state()
@@ -1464,12 +1448,27 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1464 * is the only one waiting for a grace period to complete. 1448 * is the only one waiting for a grace period to complete.
1465 */ 1449 */
1466 if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { 1450 if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
1467 rdp->blimit = LONG_MAX; 1451
1468 if (rsp->n_force_qs == rdp->n_force_qs_snap && 1452 /* Are we ignoring a completed grace period? */
1469 *rdp->nxttail[RCU_DONE_TAIL] != head) 1453 rcu_process_gp_end(rsp, rdp);
1470 force_quiescent_state(rsp, 0); 1454 check_for_new_grace_period(rsp, rdp);
1471 rdp->n_force_qs_snap = rsp->n_force_qs; 1455
1472 rdp->qlen_last_fqs_check = rdp->qlen; 1456 /* Start a new grace period if one not already started. */
1457 if (!rcu_gp_in_progress(rsp)) {
1458 unsigned long nestflag;
1459 struct rcu_node *rnp_root = rcu_get_root(rsp);
1460
1461 raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
1462 rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */
1463 } else {
1464 /* Give the grace period a kick. */
1465 rdp->blimit = LONG_MAX;
1466 if (rsp->n_force_qs == rdp->n_force_qs_snap &&
1467 *rdp->nxttail[RCU_DONE_TAIL] != head)
1468 force_quiescent_state(rsp, 0);
1469 rdp->n_force_qs_snap = rsp->n_force_qs;
1470 rdp->qlen_last_fqs_check = rdp->qlen;
1471 }
1473 } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) 1472 } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
1474 force_quiescent_state(rsp, 1); 1473 force_quiescent_state(rsp, 1);
1475 local_irq_restore(flags); 1474 local_irq_restore(flags);
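After this change __call_rcu() only does the expensive grace-period bookkeeping once the callback count has grown by qhimark since the last check, which is the hysteresis the comment mentions. The toy below shows just that counting pattern (dequeues and the time-based trigger are omitted, and QHIMARK is a placeholder value):

#include <stdio.h>

#define QHIMARK 10              /* stand-in for the kernel's qhimark */

static unsigned long qlen, qlen_last_check;
static unsigned int slow_path_checks;

/* Called once per enqueued item; only rarely falls into the slow path. */
static void enqueue_one(void)
{
        if (++qlen > qlen_last_check + QHIMARK) {
                slow_path_checks++;             /* e.g. start/force a grace period */
                qlen_last_check = qlen;         /* re-arm the threshold */
        }
}

int main(void)
{
        int i;

        for (i = 0; i < 100; i++)
                enqueue_one();
        printf("100 enqueues, %u slow-path checks\n", slow_path_checks);
        return 0;
}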
@@ -1699,13 +1698,12 @@ static void _rcu_barrier(struct rcu_state *rsp,
1699 * decrement rcu_barrier_cpu_count -- otherwise the first CPU 1698 * decrement rcu_barrier_cpu_count -- otherwise the first CPU
1700 * might complete its grace period before all of the other CPUs 1699 * might complete its grace period before all of the other CPUs
1701 * did their increment, causing this function to return too 1700 * did their increment, causing this function to return too
1702 * early. 1701 * early. Note that on_each_cpu() disables irqs, which prevents
1702 * any CPUs from coming online or going offline until each online
1703 * CPU has queued its RCU-barrier callback.
1703 */ 1704 */
1704 atomic_set(&rcu_barrier_cpu_count, 1); 1705 atomic_set(&rcu_barrier_cpu_count, 1);
1705 preempt_disable(); /* stop CPU_DYING from filling orphan_cbs_list */
1706 rcu_adopt_orphan_cbs(rsp);
1707 on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1); 1706 on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1);
1708 preempt_enable(); /* CPU_DYING can again fill orphan_cbs_list */
1709 if (atomic_dec_and_test(&rcu_barrier_cpu_count)) 1707 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
1710 complete(&rcu_barrier_completion); 1708 complete(&rcu_barrier_completion);
1711 wait_for_completion(&rcu_barrier_completion); 1709 wait_for_completion(&rcu_barrier_completion);
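_rcu_barrier() counts outstanding per-CPU callbacks with an atomic that starts at 1, so the count cannot reach zero until every CPU has enqueued its callback and the initiator has dropped its own reference. The pthread/stdatomic program below mimics that counting protocol in user space; it is an analogue, not kernel code:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define NWORKERS 4

static atomic_int pending;
static pthread_mutex_t done_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t done_cond = PTHREAD_COND_INITIALIZER;
static int done;

/* Each "callback" drops one reference; the last one signals completion. */
static void barrier_callback(void)
{
        if (atomic_fetch_sub(&pending, 1) == 1) {
                pthread_mutex_lock(&done_lock);
                done = 1;
                pthread_cond_signal(&done_cond);
                pthread_mutex_unlock(&done_lock);
        }
}

static void *worker(void *arg)
{
        (void)arg;
        /* ... pretend to run previously queued callbacks ... */
        barrier_callback();
        return NULL;
}

int main(void)
{
        pthread_t tid[NWORKERS];
        int i;

        /*
         * Start at 1 so the initiator holds a reference until every worker
         * has been handed its callback, the same trick _rcu_barrier() uses
         * to avoid completing before all CPUs have enqueued theirs.
         */
        atomic_store(&pending, 1);
        for (i = 0; i < NWORKERS; i++) {
                atomic_fetch_add(&pending, 1);
                pthread_create(&tid[i], NULL, worker, NULL);
        }
        barrier_callback();                     /* drop the initiator's reference */

        pthread_mutex_lock(&done_lock);
        while (!done)
                pthread_cond_wait(&done_cond, &done_lock);
        pthread_mutex_unlock(&done_lock);

        for (i = 0; i < NWORKERS; i++)
                pthread_join(tid[i], NULL);
        printf("all %d callbacks completed\n", NWORKERS);
        return 0;
}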
@@ -1831,18 +1829,13 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1831 case CPU_DYING: 1829 case CPU_DYING:
1832 case CPU_DYING_FROZEN: 1830 case CPU_DYING_FROZEN:
1833 /* 1831 /*
1834 * preempt_disable() in _rcu_barrier() prevents stop_machine(), 1832 * The whole machine is "stopped" except this CPU, so we can
1835 * so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);" 1833 * touch any data without introducing corruption. We send the
1836 * returns, all online cpus have queued rcu_barrier_func(). 1834 * dying CPU's callbacks to an arbitrarily chosen online CPU.
1837 * The dying CPU clears its cpu_online_mask bit and
1838 * moves all of its RCU callbacks to ->orphan_cbs_list
1839 * in the context of stop_machine(), so subsequent calls
1840 * to _rcu_barrier() will adopt these callbacks and only
1841 * then queue rcu_barrier_func() on all remaining CPUs.
1842 */ 1835 */
1843 rcu_send_cbs_to_orphanage(&rcu_bh_state); 1836 rcu_send_cbs_to_online(&rcu_bh_state);
1844 rcu_send_cbs_to_orphanage(&rcu_sched_state); 1837 rcu_send_cbs_to_online(&rcu_sched_state);
1845 rcu_preempt_send_cbs_to_orphanage(); 1838 rcu_preempt_send_cbs_to_online();
1846 break; 1839 break;
1847 case CPU_DEAD: 1840 case CPU_DEAD:
1848 case CPU_DEAD_FROZEN: 1841 case CPU_DEAD_FROZEN:
@@ -1880,8 +1873,9 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
1880{ 1873{
1881 int i; 1874 int i;
1882 1875
1883 for (i = NUM_RCU_LVLS - 1; i >= 0; i--) 1876 for (i = NUM_RCU_LVLS - 1; i > 0; i--)
1884 rsp->levelspread[i] = CONFIG_RCU_FANOUT; 1877 rsp->levelspread[i] = CONFIG_RCU_FANOUT;
1878 rsp->levelspread[0] = RCU_FANOUT_LEAF;
1885} 1879}
1886#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ 1880#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
1887static void __init rcu_init_levelspread(struct rcu_state *rsp) 1881static void __init rcu_init_levelspread(struct rcu_state *rsp)
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 91d4170c5c13..e8f057e44e3e 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -31,46 +31,51 @@
31/* 31/*
32 * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT. 32 * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT.
33 * In theory, it should be possible to add more levels straightforwardly. 33 * In theory, it should be possible to add more levels straightforwardly.
34 * In practice, this has not been tested, so there is probably some 34 * In practice, this did work well going from three levels to four.
35 * bug somewhere. 35 * Of course, your mileage may vary.
36 */ 36 */
37#define MAX_RCU_LVLS 4 37#define MAX_RCU_LVLS 4
38#define RCU_FANOUT (CONFIG_RCU_FANOUT) 38#if CONFIG_RCU_FANOUT > 16
39#define RCU_FANOUT_SQ (RCU_FANOUT * RCU_FANOUT) 39#define RCU_FANOUT_LEAF 16
40#define RCU_FANOUT_CUBE (RCU_FANOUT_SQ * RCU_FANOUT) 40#else /* #if CONFIG_RCU_FANOUT > 16 */
41#define RCU_FANOUT_FOURTH (RCU_FANOUT_CUBE * RCU_FANOUT) 41#define RCU_FANOUT_LEAF (CONFIG_RCU_FANOUT)
42 42#endif /* #else #if CONFIG_RCU_FANOUT > 16 */
43#if NR_CPUS <= RCU_FANOUT 43#define RCU_FANOUT_1 (RCU_FANOUT_LEAF)
44#define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT)
45#define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT)
46#define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT)
47
48#if NR_CPUS <= RCU_FANOUT_1
44# define NUM_RCU_LVLS 1 49# define NUM_RCU_LVLS 1
45# define NUM_RCU_LVL_0 1 50# define NUM_RCU_LVL_0 1
46# define NUM_RCU_LVL_1 (NR_CPUS) 51# define NUM_RCU_LVL_1 (NR_CPUS)
47# define NUM_RCU_LVL_2 0 52# define NUM_RCU_LVL_2 0
48# define NUM_RCU_LVL_3 0 53# define NUM_RCU_LVL_3 0
49# define NUM_RCU_LVL_4 0 54# define NUM_RCU_LVL_4 0
50#elif NR_CPUS <= RCU_FANOUT_SQ 55#elif NR_CPUS <= RCU_FANOUT_2
51# define NUM_RCU_LVLS 2 56# define NUM_RCU_LVLS 2
52# define NUM_RCU_LVL_0 1 57# define NUM_RCU_LVL_0 1
53# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) 58# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
54# define NUM_RCU_LVL_2 (NR_CPUS) 59# define NUM_RCU_LVL_2 (NR_CPUS)
55# define NUM_RCU_LVL_3 0 60# define NUM_RCU_LVL_3 0
56# define NUM_RCU_LVL_4 0 61# define NUM_RCU_LVL_4 0
57#elif NR_CPUS <= RCU_FANOUT_CUBE 62#elif NR_CPUS <= RCU_FANOUT_3
58# define NUM_RCU_LVLS 3 63# define NUM_RCU_LVLS 3
59# define NUM_RCU_LVL_0 1 64# define NUM_RCU_LVL_0 1
60# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ) 65# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
61# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) 66# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
62# define NUM_RCU_LVL_3 NR_CPUS 67# define NUM_RCU_LVL_3 (NR_CPUS)
63# define NUM_RCU_LVL_4 0 68# define NUM_RCU_LVL_4 0
64#elif NR_CPUS <= RCU_FANOUT_FOURTH 69#elif NR_CPUS <= RCU_FANOUT_4
65# define NUM_RCU_LVLS 4 70# define NUM_RCU_LVLS 4
66# define NUM_RCU_LVL_0 1 71# define NUM_RCU_LVL_0 1
67# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_CUBE) 72# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3)
68# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ) 73# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
69# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) 74# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
70# define NUM_RCU_LVL_4 NR_CPUS 75# define NUM_RCU_LVL_4 (NR_CPUS)
71#else 76#else
72# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" 77# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
73#endif /* #if (NR_CPUS) <= RCU_FANOUT */ 78#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */
74 79
75#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4) 80#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4)
76#define NUM_RCU_NODES (RCU_SUM - NR_CPUS) 81#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
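A worked example of the new fanout geometry, using the hypothetical values CONFIG_RCU_FANOUT=64 and NR_CPUS=4096: the leaf fanout is capped at 16, so RCU_FANOUT_1=16 and RCU_FANOUT_2=1024; NR_CPUS fits under RCU_FANOUT_3, giving three levels with 1 root node, 4 interior nodes, and 256 leaf rcu_node structures, i.e. NUM_RCU_NODES = 261. The short program below simply redoes that arithmetic:

#include <stdio.h>

/* Hypothetical configuration, just to make the geometry concrete. */
#define FANOUT          64              /* CONFIG_RCU_FANOUT */
#define FANOUT_LEAF     16              /* capped as above when FANOUT > 16 */
#define NR_CPUS_EX      4096

#define DIV_ROUND_UP(n, d)      (((n) + (d) - 1) / (d))

int main(void)
{
        long f1 = FANOUT_LEAF;                          /* RCU_FANOUT_1 */
        long f2 = f1 * FANOUT;                          /* RCU_FANOUT_2 = 1024 */
        long lvl0 = 1;                                  /* root node */
        long lvl1 = DIV_ROUND_UP(NR_CPUS_EX, f2);       /* 4 interior nodes */
        long lvl2 = DIV_ROUND_UP(NR_CPUS_EX, f1);       /* 256 leaf rcu_node's */

        /* NUM_RCU_NODES excludes the per-CPU level, as RCU_SUM - NR_CPUS does. */
        printf("levels=3 nodes=%ld (1 + %ld + %ld)\n",
               lvl0 + lvl1 + lvl2, lvl1, lvl2);
        return 0;
}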
@@ -203,8 +208,8 @@ struct rcu_data {
203 long qlen_last_fqs_check; 208 long qlen_last_fqs_check;
204 /* qlen at last check for QS forcing */ 209 /* qlen at last check for QS forcing */
205 unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ 210 unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */
206 unsigned long n_cbs_orphaned; /* RCU cbs sent to orphanage. */ 211 unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */
207 unsigned long n_cbs_adopted; /* RCU cbs adopted from orphanage. */ 212 unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */
208 unsigned long n_force_qs_snap; 213 unsigned long n_force_qs_snap;
209 /* did other CPU force QS recently? */ 214 /* did other CPU force QS recently? */
210 long blimit; /* Upper limit on a processed batch */ 215 long blimit; /* Upper limit on a processed batch */
@@ -309,15 +314,7 @@ struct rcu_state {
309 /* End of fields guarded by root rcu_node's lock. */ 314 /* End of fields guarded by root rcu_node's lock. */
310 315
311 raw_spinlock_t onofflock; /* exclude on/offline and */ 316 raw_spinlock_t onofflock; /* exclude on/offline and */
312 /* starting new GP. Also */ 317 /* starting new GP. */
313 /* protects the following */
314 /* orphan_cbs fields. */
315 struct rcu_head *orphan_cbs_list; /* list of rcu_head structs */
316 /* orphaned by all CPUs in */
317 /* a given leaf rcu_node */
318 /* going offline. */
319 struct rcu_head **orphan_cbs_tail; /* And tail pointer. */
320 long orphan_qlen; /* Number of orphaned cbs. */
321 raw_spinlock_t fqslock; /* Only one task forcing */ 318 raw_spinlock_t fqslock; /* Only one task forcing */
322 /* quiescent states. */ 319 /* quiescent states. */
323 unsigned long jiffies_force_qs; /* Time at which to invoke */ 320 unsigned long jiffies_force_qs; /* Time at which to invoke */
@@ -390,7 +387,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp);
390static int rcu_preempt_pending(int cpu); 387static int rcu_preempt_pending(int cpu);
391static int rcu_preempt_needs_cpu(int cpu); 388static int rcu_preempt_needs_cpu(int cpu);
392static void __cpuinit rcu_preempt_init_percpu_data(int cpu); 389static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
393static void rcu_preempt_send_cbs_to_orphanage(void); 390static void rcu_preempt_send_cbs_to_online(void);
394static void __init __rcu_init_preempt(void); 391static void __init __rcu_init_preempt(void);
395static void rcu_needs_cpu_flush(void); 392static void rcu_needs_cpu_flush(void);
396 393
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 71a4147473f9..a3638710dc67 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -25,6 +25,7 @@
25 */ 25 */
26 26
27#include <linux/delay.h> 27#include <linux/delay.h>
28#include <linux/stop_machine.h>
28 29
29/* 30/*
30 * Check the RCU kernel configuration parameters and print informative 31 * Check the RCU kernel configuration parameters and print informative
@@ -773,11 +774,11 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
773} 774}
774 775
775/* 776/*
776 * Move preemptable RCU's callbacks to ->orphan_cbs_list. 777 * Move preemptable RCU's callbacks from dying CPU to other online CPU.
777 */ 778 */
778static void rcu_preempt_send_cbs_to_orphanage(void) 779static void rcu_preempt_send_cbs_to_online(void)
779{ 780{
780 rcu_send_cbs_to_orphanage(&rcu_preempt_state); 781 rcu_send_cbs_to_online(&rcu_preempt_state);
781} 782}
782 783
783/* 784/*
@@ -1001,7 +1002,7 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
1001/* 1002/*
1002 * Because there is no preemptable RCU, there are no callbacks to move. 1003 * Because there is no preemptable RCU, there are no callbacks to move.
1003 */ 1004 */
1004static void rcu_preempt_send_cbs_to_orphanage(void) 1005static void rcu_preempt_send_cbs_to_online(void)
1005{ 1006{
1006} 1007}
1007 1008
@@ -1014,6 +1015,132 @@ static void __init __rcu_init_preempt(void)
1014 1015
1015#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ 1016#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
1016 1017
1018#ifndef CONFIG_SMP
1019
1020void synchronize_sched_expedited(void)
1021{
1022 cond_resched();
1023}
1024EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
1025
1026#else /* #ifndef CONFIG_SMP */
1027
1028static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0);
1029static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0);
1030
1031static int synchronize_sched_expedited_cpu_stop(void *data)
1032{
1033 /*
1034 * There must be a full memory barrier on each affected CPU
1035 * between the time that try_stop_cpus() is called and the
1036 * time that it returns.
1037 *
1038 * In the current initial implementation of cpu_stop, the
1039 * above condition is already met when the control reaches
1040 * this point and the following smp_mb() is not strictly
1041 * necessary. Do smp_mb() anyway for documentation and
1042 * robustness against future implementation changes.
1043 */
1044 smp_mb(); /* See above comment block. */
1045 return 0;
1046}
1047
1048/*
1049 * Wait for an rcu-sched grace period to elapse, but use "big hammer"
1050 * approach to force grace period to end quickly. This consumes
1051 * significant time on all CPUs, and is thus not recommended for
1052 * any sort of common-case code.
1053 *
1054 * Note that it is illegal to call this function while holding any
1055 * lock that is acquired by a CPU-hotplug notifier. Failing to
1056 * observe this restriction will result in deadlock.
1057 *
1058 * This implementation can be thought of as an application of ticket
1059 * locking to RCU, with sync_sched_expedited_started and
1060 * sync_sched_expedited_done taking on the roles of the halves
1061 * of the ticket-lock word. Each task atomically increments
1062 * sync_sched_expedited_started upon entry, snapshotting the old value,
1063 * then attempts to stop all the CPUs. If this succeeds, then each
1064 * CPU will have executed a context switch, resulting in an RCU-sched
1065 * grace period. We are then done, so we use atomic_cmpxchg() to
1066 * update sync_sched_expedited_done to match our snapshot -- but
1067 * only if someone else has not already advanced past our snapshot.
1068 *
1069 * On the other hand, if try_stop_cpus() fails, we check the value
1070 * of sync_sched_expedited_done. If it has advanced past our
1071 * initial snapshot, then someone else must have forced a grace period
1072 * some time after we took our snapshot. In this case, our work is
1073 * done for us, and we can simply return. Otherwise, we try again,
1074 * but keep our initial snapshot for purposes of checking for someone
1075 * doing our work for us.
1076 *
1077 * If we fail too many times in a row, we fall back to synchronize_sched().
1078 */
1079void synchronize_sched_expedited(void)
1080{
1081 int firstsnap, s, snap, trycount = 0;
1082
1083 /* Note that atomic_inc_return() implies full memory barrier. */
1084 firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started);
1085 get_online_cpus();
1086
1087 /*
1088 * Each pass through the following loop attempts to force a
1089 * context switch on each CPU.
1090 */
1091 while (try_stop_cpus(cpu_online_mask,
1092 synchronize_sched_expedited_cpu_stop,
1093 NULL) == -EAGAIN) {
1094 put_online_cpus();
1095
1096 /* No joy, try again later. Or just synchronize_sched(). */
1097 if (trycount++ < 10)
1098 udelay(trycount * num_online_cpus());
1099 else {
1100 synchronize_sched();
1101 return;
1102 }
1103
1104 /* Check to see if someone else did our work for us. */
1105 s = atomic_read(&sync_sched_expedited_done);
1106 if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) {
1107 smp_mb(); /* ensure test happens before caller kfree */
1108 return;
1109 }
1110
1111 /*
1112 * Refetching sync_sched_expedited_started allows later
1113 * callers to piggyback on our grace period. We subtract
1114 * 1 to get the same token that the last incrementer got.
1115 * We retry after they started, so our grace period works
1116 * for them, and they started after our first try, so their
1117 * grace period works for us.
1118 */
1119 get_online_cpus();
1120 snap = atomic_read(&sync_sched_expedited_started) - 1;
1121 smp_mb(); /* ensure read is before try_stop_cpus(). */
1122 }
1123
1124 /*
1125 * Everyone up to our most recent fetch is covered by our grace
1126 * period. Update the counter, but only if our work is still
1127 * relevant -- which it won't be if someone who started later
1128 * than we did beat us to the punch.
1129 */
1130 do {
1131 s = atomic_read(&sync_sched_expedited_done);
1132 if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) {
1133 smp_mb(); /* ensure test happens before caller kfree */
1134 break;
1135 }
1136 } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s);
1137
1138 put_online_cpus();
1139}
1140EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
1141
1142#endif /* #else #ifndef CONFIG_SMP */
1143
1017#if !defined(CONFIG_RCU_FAST_NO_HZ) 1144#if !defined(CONFIG_RCU_FAST_NO_HZ)
1018 1145
1019/* 1146/*
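The ticket-lock analogy in the comment boils down to two counters: one incremented when an expedited request starts, and one advanced (via cmpxchg, never backward) once a grace period is known to have completed on behalf of a given ticket. The simplified C11-atomics sketch below shows only that snapshot-and-publish protocol; try_stop_cpus(), the retry/backoff path, and wraparound handling are deliberately left out:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Ticket-style bookkeeping in the spirit of the started/done pair above. */
static atomic_long gp_started;
static atomic_long gp_done;

static bool stop_everything(void)       /* stand-in for try_stop_cpus() */
{
        return true;
}

static void expedited(void)
{
        long snap = atomic_fetch_add(&gp_started, 1) + 1;       /* take a ticket */

        while (!stop_everything()) {
                /* Someone else may have forced a grace period covering us. */
                if (atomic_load(&gp_done) >= snap)
                        return;                 /* work already done for us */
        }

        /* Publish our grace period, but never move gp_done backward. */
        long s = atomic_load(&gp_done);
        while (s < snap && !atomic_compare_exchange_weak(&gp_done, &s, snap))
                ;
}

int main(void)
{
        expedited();
        expedited();
        printf("started=%ld done=%ld\n",
               atomic_load(&gp_started), atomic_load(&gp_done));
        return 0;
}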
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index d15430b9d122..c8e97853b970 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -166,13 +166,13 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
166 166
167 gpnum = rsp->gpnum; 167 gpnum = rsp->gpnum;
168 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " 168 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x "
169 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n", 169 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n",
170 rsp->completed, gpnum, rsp->signaled, 170 rsp->completed, gpnum, rsp->signaled,
171 (long)(rsp->jiffies_force_qs - jiffies), 171 (long)(rsp->jiffies_force_qs - jiffies),
172 (int)(jiffies & 0xffff), 172 (int)(jiffies & 0xffff),
173 rsp->n_force_qs, rsp->n_force_qs_ngp, 173 rsp->n_force_qs, rsp->n_force_qs_ngp,
174 rsp->n_force_qs - rsp->n_force_qs_ngp, 174 rsp->n_force_qs - rsp->n_force_qs_ngp,
175 rsp->n_force_qs_lh, rsp->orphan_qlen); 175 rsp->n_force_qs_lh);
176 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { 176 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) {
177 if (rnp->level != level) { 177 if (rnp->level != level) {
178 seq_puts(m, "\n"); 178 seq_puts(m, "\n");
@@ -300,7 +300,7 @@ static const struct file_operations rcu_pending_fops = {
300 300
301static struct dentry *rcudir; 301static struct dentry *rcudir;
302 302
303static int __init rcuclassic_trace_init(void) 303static int __init rcutree_trace_init(void)
304{ 304{
305 struct dentry *retval; 305 struct dentry *retval;
306 306
@@ -337,14 +337,14 @@ free_out:
337 return 1; 337 return 1;
338} 338}
339 339
340static void __exit rcuclassic_trace_cleanup(void) 340static void __exit rcutree_trace_cleanup(void)
341{ 341{
342 debugfs_remove_recursive(rcudir); 342 debugfs_remove_recursive(rcudir);
343} 343}
344 344
345 345
346module_init(rcuclassic_trace_init); 346module_init(rcutree_trace_init);
347module_exit(rcuclassic_trace_cleanup); 347module_exit(rcutree_trace_cleanup);
348 348
349MODULE_AUTHOR("Paul E. McKenney"); 349MODULE_AUTHOR("Paul E. McKenney");
350MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation"); 350MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation");
diff --git a/kernel/sched.c b/kernel/sched.c
index 297d1a0eedb0..18d38e4ec7ba 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -75,9 +75,11 @@
75 75
76#include <asm/tlb.h> 76#include <asm/tlb.h>
77#include <asm/irq_regs.h> 77#include <asm/irq_regs.h>
78#include <asm/mutex.h>
78 79
79#include "sched_cpupri.h" 80#include "sched_cpupri.h"
80#include "workqueue_sched.h" 81#include "workqueue_sched.h"
82#include "sched_autogroup.h"
81 83
82#define CREATE_TRACE_POINTS 84#define CREATE_TRACE_POINTS
83#include <trace/events/sched.h> 85#include <trace/events/sched.h>
@@ -253,6 +255,8 @@ struct task_group {
253 /* runqueue "owned" by this group on each cpu */ 255 /* runqueue "owned" by this group on each cpu */
254 struct cfs_rq **cfs_rq; 256 struct cfs_rq **cfs_rq;
255 unsigned long shares; 257 unsigned long shares;
258
259 atomic_t load_weight;
256#endif 260#endif
257 261
258#ifdef CONFIG_RT_GROUP_SCHED 262#ifdef CONFIG_RT_GROUP_SCHED
@@ -268,25 +272,18 @@ struct task_group {
268 struct task_group *parent; 272 struct task_group *parent;
269 struct list_head siblings; 273 struct list_head siblings;
270 struct list_head children; 274 struct list_head children;
271};
272 275
273#define root_task_group init_task_group 276#ifdef CONFIG_SCHED_AUTOGROUP
277 struct autogroup *autogroup;
278#endif
279};
274 280
275/* task_group_lock serializes add/remove of task groups and also changes to 281/* task_group_lock serializes the addition/removal of task groups */
276 * a task group's cpu shares.
277 */
278static DEFINE_SPINLOCK(task_group_lock); 282static DEFINE_SPINLOCK(task_group_lock);
279 283
280#ifdef CONFIG_FAIR_GROUP_SCHED 284#ifdef CONFIG_FAIR_GROUP_SCHED
281 285
282#ifdef CONFIG_SMP 286# define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
283static int root_task_group_empty(void)
284{
285 return list_empty(&root_task_group.children);
286}
287#endif
288
289# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
290 287
291/* 288/*
292 * A weight of 0 or 1 can cause arithmetic problems. 289
@@ -299,13 +296,13 @@ static int root_task_group_empty(void)
299#define MIN_SHARES 2 296#define MIN_SHARES 2
300#define MAX_SHARES (1UL << 18) 297#define MAX_SHARES (1UL << 18)
301 298
302static int init_task_group_load = INIT_TASK_GROUP_LOAD; 299static int root_task_group_load = ROOT_TASK_GROUP_LOAD;
303#endif 300#endif
304 301
305/* Default task group. 302/* Default task group.
306 * Every task in the system belongs to this group at bootup. 303
307 */ 304 */
308struct task_group init_task_group; 305struct task_group root_task_group;
309 306
310#endif /* CONFIG_CGROUP_SCHED */ 307#endif /* CONFIG_CGROUP_SCHED */
311 308
@@ -342,6 +339,7 @@ struct cfs_rq {
342 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This 339 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
343 * list is used during load balance. 340 * list is used during load balance.
344 */ 341 */
342 int on_list;
345 struct list_head leaf_cfs_rq_list; 343 struct list_head leaf_cfs_rq_list;
346 struct task_group *tg; /* group that "owns" this runqueue */ 344 struct task_group *tg; /* group that "owns" this runqueue */
347 345
@@ -360,14 +358,17 @@ struct cfs_rq {
360 unsigned long h_load; 358 unsigned long h_load;
361 359
362 /* 360 /*
363 * this cpu's part of tg->shares 361 * Maintaining per-cpu shares distribution for group scheduling
362 *
363 * load_stamp is the last time we updated the load average
364 * load_last is the last time we updated the load average and saw load
365 * load_unacc_exec_time is currently unaccounted execution time
364 */ 366 */
365 unsigned long shares; 367 u64 load_avg;
368 u64 load_period;
369 u64 load_stamp, load_last, load_unacc_exec_time;
366 370
367 /* 371 unsigned long load_contribution;
368 * load.weight at the time we set shares
369 */
370 unsigned long rq_weight;
371#endif 372#endif
372#endif 373#endif
373}; 374};
@@ -552,9 +553,6 @@ struct rq {
552 /* try_to_wake_up() stats */ 553 /* try_to_wake_up() stats */
553 unsigned int ttwu_count; 554 unsigned int ttwu_count;
554 unsigned int ttwu_local; 555 unsigned int ttwu_local;
555
556 /* BKL stats */
557 unsigned int bkl_count;
558#endif 556#endif
559}; 557};
560 558
@@ -605,11 +603,17 @@ static inline int cpu_of(struct rq *rq)
605 */ 603 */
606static inline struct task_group *task_group(struct task_struct *p) 604static inline struct task_group *task_group(struct task_struct *p)
607{ 605{
606 struct task_group *tg;
608 struct cgroup_subsys_state *css; 607 struct cgroup_subsys_state *css;
609 608
609 if (p->flags & PF_EXITING)
610 return &root_task_group;
611
610 css = task_subsys_state_check(p, cpu_cgroup_subsys_id, 612 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
611 lockdep_is_held(&task_rq(p)->lock)); 613 lockdep_is_held(&task_rq(p)->lock));
612 return container_of(css, struct task_group, css); 614 tg = container_of(css, struct task_group, css);
615
616 return autogroup_task_group(p, tg);
613} 617}
614 618
615/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ 619/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
@@ -737,7 +741,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
737 buf[cnt] = 0; 741 buf[cnt] = 0;
738 cmp = strstrip(buf); 742 cmp = strstrip(buf);
739 743
740 if (strncmp(buf, "NO_", 3) == 0) { 744 if (strncmp(cmp, "NO_", 3) == 0) {
741 neg = 1; 745 neg = 1;
742 cmp += 3; 746 cmp += 3;
743 } 747 }
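The one-character fix above (buf becomes cmp) matters because strstrip() returns a pointer past any leading whitespace, so testing the original buffer misses a "NO_" prefix written with leading spaces. The stand-alone demo below reproduces the failure; strstrip_demo() and the feature name are illustrative stand-ins, not the kernel implementations:

#include <assert.h>
#include <ctype.h>
#include <stdio.h>
#include <string.h>

/*
 * Minimal stand-in for the kernel's strstrip(): trim trailing whitespace
 * in place and return a pointer past the leading whitespace.
 */
static char *strstrip_demo(char *s)
{
        size_t len = strlen(s);

        while (len && isspace((unsigned char)s[len - 1]))
                s[--len] = '\0';
        while (*s && isspace((unsigned char)*s))
                s++;
        return s;
}

int main(void)
{
        char buf[] = "  NO_FAIR_SLEEPERS\n";
        char *cmp = strstrip_demo(buf);

        /* The old code tested the unstripped buffer and missed the prefix. */
        assert(strncmp(buf, "NO_", 3) != 0);
        /* Testing the stripped pointer, as the fix does, works. */
        assert(strncmp(cmp, "NO_", 3) == 0);
        printf("feature to clear: %s\n", cmp + 3);
        return 0;
}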
@@ -793,20 +797,6 @@ late_initcall(sched_init_debug);
793const_debug unsigned int sysctl_sched_nr_migrate = 32; 797const_debug unsigned int sysctl_sched_nr_migrate = 32;
794 798
795/* 799/*
796 * ratelimit for updating the group shares.
797 * default: 0.25ms
798 */
799unsigned int sysctl_sched_shares_ratelimit = 250000;
800unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;
801
802/*
803 * Inject some fuzzyness into changing the per-cpu group shares
804 * this avoids remote rq-locks at the expense of fairness.
805 * default: 4
806 */
807unsigned int sysctl_sched_shares_thresh = 4;
808
809/*
810 * period over which we average the RT time consumption, measured 800 * period over which we average the RT time consumption, measured
811 * in ms. 801 * in ms.
812 * 802 *
@@ -1355,6 +1345,12 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1355 lw->inv_weight = 0; 1345 lw->inv_weight = 0;
1356} 1346}
1357 1347
1348static inline void update_load_set(struct load_weight *lw, unsigned long w)
1349{
1350 lw->weight = w;
1351 lw->inv_weight = 0;
1352}
1353
1358/* 1354/*
1359 * To aid in avoiding the subversion of "niceness" due to uneven distribution 1355 * To aid in avoiding the subversion of "niceness" due to uneven distribution
1360 * of tasks with abnormal "nice" values across CPUs the contribution that 1356 * of tasks with abnormal "nice" values across CPUs the contribution that
@@ -1543,101 +1539,6 @@ static unsigned long cpu_avg_load_per_task(int cpu)
1543 1539
1544#ifdef CONFIG_FAIR_GROUP_SCHED 1540#ifdef CONFIG_FAIR_GROUP_SCHED
1545 1541
1546static __read_mostly unsigned long __percpu *update_shares_data;
1547
1548static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1549
1550/*
1551 * Calculate and set the cpu's group shares.
1552 */
1553static void update_group_shares_cpu(struct task_group *tg, int cpu,
1554 unsigned long sd_shares,
1555 unsigned long sd_rq_weight,
1556 unsigned long *usd_rq_weight)
1557{
1558 unsigned long shares, rq_weight;
1559 int boost = 0;
1560
1561 rq_weight = usd_rq_weight[cpu];
1562 if (!rq_weight) {
1563 boost = 1;
1564 rq_weight = NICE_0_LOAD;
1565 }
1566
1567 /*
1568 * \Sum_j shares_j * rq_weight_i
1569 * shares_i = -----------------------------
1570 * \Sum_j rq_weight_j
1571 */
1572 shares = (sd_shares * rq_weight) / sd_rq_weight;
1573 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
1574
1575 if (abs(shares - tg->se[cpu]->load.weight) >
1576 sysctl_sched_shares_thresh) {
1577 struct rq *rq = cpu_rq(cpu);
1578 unsigned long flags;
1579
1580 raw_spin_lock_irqsave(&rq->lock, flags);
1581 tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
1582 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1583 __set_se_shares(tg->se[cpu], shares);
1584 raw_spin_unlock_irqrestore(&rq->lock, flags);
1585 }
1586}
1587
1588/*
1589 * Re-compute the task group their per cpu shares over the given domain.
1590 * This needs to be done in a bottom-up fashion because the rq weight of a
1591 * parent group depends on the shares of its child groups.
1592 */
1593static int tg_shares_up(struct task_group *tg, void *data)
1594{
1595 unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
1596 unsigned long *usd_rq_weight;
1597 struct sched_domain *sd = data;
1598 unsigned long flags;
1599 int i;
1600
1601 if (!tg->se[0])
1602 return 0;
1603
1604 local_irq_save(flags);
1605 usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());
1606
1607 for_each_cpu(i, sched_domain_span(sd)) {
1608 weight = tg->cfs_rq[i]->load.weight;
1609 usd_rq_weight[i] = weight;
1610
1611 rq_weight += weight;
1612 /*
1613 * If there are currently no tasks on the cpu pretend there
1614 * is one of average load so that when a new task gets to
1615 * run here it will not get delayed by group starvation.
1616 */
1617 if (!weight)
1618 weight = NICE_0_LOAD;
1619
1620 sum_weight += weight;
1621 shares += tg->cfs_rq[i]->shares;
1622 }
1623
1624 if (!rq_weight)
1625 rq_weight = sum_weight;
1626
1627 if ((!shares && rq_weight) || shares > tg->shares)
1628 shares = tg->shares;
1629
1630 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
1631 shares = tg->shares;
1632
1633 for_each_cpu(i, sched_domain_span(sd))
1634 update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);
1635
1636 local_irq_restore(flags);
1637
1638 return 0;
1639}
1640
1641/* 1542/*
1642 * Compute the cpu's hierarchical load factor for each task group. 1543 * Compute the cpu's hierarchical load factor for each task group.
1643 * This needs to be done in a top-down fashion because the load of a child 1544 * This needs to be done in a top-down fashion because the load of a child
@@ -1652,7 +1553,7 @@ static int tg_load_down(struct task_group *tg, void *data)
1652 load = cpu_rq(cpu)->load.weight; 1553 load = cpu_rq(cpu)->load.weight;
1653 } else { 1554 } else {
1654 load = tg->parent->cfs_rq[cpu]->h_load; 1555 load = tg->parent->cfs_rq[cpu]->h_load;
1655 load *= tg->cfs_rq[cpu]->shares; 1556 load *= tg->se[cpu]->load.weight;
1656 load /= tg->parent->cfs_rq[cpu]->load.weight + 1; 1557 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
1657 } 1558 }
1658 1559
@@ -1661,34 +1562,11 @@ static int tg_load_down(struct task_group *tg, void *data)
1661 return 0; 1562 return 0;
1662} 1563}
1663 1564
1664static void update_shares(struct sched_domain *sd)
1665{
1666 s64 elapsed;
1667 u64 now;
1668
1669 if (root_task_group_empty())
1670 return;
1671
1672 now = local_clock();
1673 elapsed = now - sd->last_update;
1674
1675 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1676 sd->last_update = now;
1677 walk_tg_tree(tg_nop, tg_shares_up, sd);
1678 }
1679}
1680
1681static void update_h_load(long cpu) 1565static void update_h_load(long cpu)
1682{ 1566{
1683 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); 1567 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1684} 1568}
1685 1569
1686#else
1687
1688static inline void update_shares(struct sched_domain *sd)
1689{
1690}
1691
1692#endif 1570#endif
1693 1571
1694#ifdef CONFIG_PREEMPT 1572#ifdef CONFIG_PREEMPT
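The deleted tg_shares_up()/update_group_shares_cpu() path recomputed each CPU's slice of a group's shares as shares_i = tg_shares * rq_weight_i / sum_j rq_weight_j, clamped to [MIN_SHARES, MAX_SHARES]; the replacement code tracks load_avg/load_contribution incrementally instead. The small program below restates only the removed formula, with made-up input numbers:

#include <stdio.h>

#define MIN_SHARES      2UL
#define MAX_SHARES      (1UL << 18)

static unsigned long clamp_ul(unsigned long v, unsigned long lo, unsigned long hi)
{
        return v < lo ? lo : (v > hi ? hi : v);
}

/*
 * The per-cpu share formula from the code removed above:
 * shares_i = sd_shares * rq_weight_i / sd_rq_weight, clamped.
 */
static unsigned long group_shares_cpu(unsigned long sd_shares,
                                      unsigned long rq_weight_cpu,
                                      unsigned long sd_rq_weight)
{
        unsigned long shares = sd_shares * rq_weight_cpu / sd_rq_weight;

        return clamp_ul(shares, MIN_SHARES, MAX_SHARES);
}

int main(void)
{
        /* A group with 1024 shares whose load is split 3:1 across two CPUs. */
        printf("cpu0=%lu cpu1=%lu\n",
               group_shares_cpu(1024, 3072, 4096),
               group_shares_cpu(1024, 1024, 4096));
        return 0;
}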
@@ -1810,15 +1688,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1810 1688
1811#endif 1689#endif
1812 1690
1813#ifdef CONFIG_FAIR_GROUP_SCHED
1814static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1815{
1816#ifdef CONFIG_SMP
1817 cfs_rq->shares = shares;
1818#endif
1819}
1820#endif
1821
1822static void calc_load_account_idle(struct rq *this_rq); 1691static void calc_load_account_idle(struct rq *this_rq);
1823static void update_sysctl(void); 1692static void update_sysctl(void);
1824static int get_update_sysctl_factor(void); 1693static int get_update_sysctl_factor(void);
@@ -2063,6 +1932,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
2063#include "sched_idletask.c" 1932#include "sched_idletask.c"
2064#include "sched_fair.c" 1933#include "sched_fair.c"
2065#include "sched_rt.c" 1934#include "sched_rt.c"
1935#include "sched_autogroup.c"
2066#include "sched_stoptask.c" 1936#include "sched_stoptask.c"
2067#ifdef CONFIG_SCHED_DEBUG 1937#ifdef CONFIG_SCHED_DEBUG
2068# include "sched_debug.c" 1938# include "sched_debug.c"
@@ -2255,10 +2125,8 @@ static int migration_cpu_stop(void *data);
2255 * The task's runqueue lock must be held. 2125 * The task's runqueue lock must be held.
2256 * Returns true if you have to wait for migration thread. 2126 * Returns true if you have to wait for migration thread.
2257 */ 2127 */
2258static bool migrate_task(struct task_struct *p, int dest_cpu) 2128static bool migrate_task(struct task_struct *p, struct rq *rq)
2259{ 2129{
2260 struct rq *rq = task_rq(p);
2261
2262 /* 2130 /*
2263 * If the task is not on a runqueue (and not running), then 2131 * If the task is not on a runqueue (and not running), then
2264 * the next wake-up will properly place the task. 2132 * the next wake-up will properly place the task.
@@ -2438,18 +2306,15 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2438 return dest_cpu; 2306 return dest_cpu;
2439 2307
2440 /* No more Mr. Nice Guy. */ 2308 /* No more Mr. Nice Guy. */
2441 if (unlikely(dest_cpu >= nr_cpu_ids)) { 2309 dest_cpu = cpuset_cpus_allowed_fallback(p);
2442 dest_cpu = cpuset_cpus_allowed_fallback(p); 2310 /*
2443 /* 2311 * Don't tell them about moving exiting tasks or
2444 * Don't tell them about moving exiting tasks or 2312 * kernel threads (both mm NULL), since they never
2445 * kernel threads (both mm NULL), since they never 2313 * leave kernel.
2446 * leave kernel. 2314 */
2447 */ 2315 if (p->mm && printk_ratelimit()) {
2448 if (p->mm && printk_ratelimit()) { 2316 printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n",
2449 printk(KERN_INFO "process %d (%s) no " 2317 task_pid_nr(p), p->comm, cpu);
2450 "longer affine to cpu%d\n",
2451 task_pid_nr(p), p->comm, cpu);
2452 }
2453 } 2318 }
2454 2319
2455 return dest_cpu; 2320 return dest_cpu;
@@ -2640,7 +2505,7 @@ out:
2640 * try_to_wake_up_local - try to wake up a local task with rq lock held 2505 * try_to_wake_up_local - try to wake up a local task with rq lock held
2641 * @p: the thread to be awakened 2506 * @p: the thread to be awakened
2642 * 2507 *
2643 * Put @p on the run-queue if it's not alredy there. The caller must 2508 * Put @p on the run-queue if it's not already there. The caller must
2644 * ensure that this_rq() is locked, @p is bound to this_rq() and not 2509 * ensure that this_rq() is locked, @p is bound to this_rq() and not
2645 * the current task. this_rq() stays locked over invocation. 2510 * the current task. this_rq() stays locked over invocation.
2646 */ 2511 */
@@ -2785,7 +2650,9 @@ void sched_fork(struct task_struct *p, int clone_flags)
2785 /* Want to start with kernel preemption disabled. */ 2650 /* Want to start with kernel preemption disabled. */
2786 task_thread_info(p)->preempt_count = 1; 2651 task_thread_info(p)->preempt_count = 1;
2787#endif 2652#endif
2653#ifdef CONFIG_SMP
2788 plist_node_init(&p->pushable_tasks, MAX_PRIO); 2654 plist_node_init(&p->pushable_tasks, MAX_PRIO);
2655#endif
2789 2656
2790 put_cpu(); 2657 put_cpu();
2791} 2658}
@@ -3549,7 +3416,7 @@ void sched_exec(void)
3549 * select_task_rq() can race against ->cpus_allowed 3416 * select_task_rq() can race against ->cpus_allowed
3550 */ 3417 */
3551 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) && 3418 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
3552 likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) { 3419 likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) {
3553 struct migration_arg arg = { p, dest_cpu }; 3420 struct migration_arg arg = { p, dest_cpu };
3554 3421
3555 task_rq_unlock(rq, &flags); 3422 task_rq_unlock(rq, &flags);
@@ -4020,7 +3887,7 @@ static inline void schedule_debug(struct task_struct *prev)
4020 schedstat_inc(this_rq(), sched_count); 3887 schedstat_inc(this_rq(), sched_count);
4021#ifdef CONFIG_SCHEDSTATS 3888#ifdef CONFIG_SCHEDSTATS
4022 if (unlikely(prev->lock_depth >= 0)) { 3889 if (unlikely(prev->lock_depth >= 0)) {
4023 schedstat_inc(this_rq(), bkl_count); 3890 schedstat_inc(this_rq(), rq_sched_info.bkl_count);
4024 schedstat_inc(prev, sched_info.bkl_count); 3891 schedstat_inc(prev, sched_info.bkl_count);
4025 } 3892 }
4026#endif 3893#endif
@@ -4214,7 +4081,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
4214 if (task_thread_info(rq->curr) != owner || need_resched()) 4081 if (task_thread_info(rq->curr) != owner || need_resched())
4215 return 0; 4082 return 0;
4216 4083
4217 cpu_relax(); 4084 arch_mutex_cpu_relax();
4218 } 4085 }
4219 4086
4220 return 1; 4087 return 1;
@@ -4526,7 +4393,7 @@ EXPORT_SYMBOL(wait_for_completion_interruptible);
4526 * This waits for either a completion of a specific task to be signaled or for a 4393 * This waits for either a completion of a specific task to be signaled or for a
4527 * specified timeout to expire. It is interruptible. The timeout is in jiffies. 4394 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
4528 */ 4395 */
4529unsigned long __sched 4396long __sched
4530wait_for_completion_interruptible_timeout(struct completion *x, 4397wait_for_completion_interruptible_timeout(struct completion *x,
4531 unsigned long timeout) 4398 unsigned long timeout)
4532{ 4399{
@@ -4559,7 +4426,7 @@ EXPORT_SYMBOL(wait_for_completion_killable);
4559 * signaled or for a specified timeout to expire. It can be 4426 * signaled or for a specified timeout to expire. It can be
4560 * interrupted by a kill signal. The timeout is in jiffies. 4427 * interrupted by a kill signal. The timeout is in jiffies.
4561 */ 4428 */
4562unsigned long __sched 4429long __sched
4563wait_for_completion_killable_timeout(struct completion *x, 4430wait_for_completion_killable_timeout(struct completion *x,
4564 unsigned long timeout) 4431 unsigned long timeout)
4565{ 4432{
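These timeout variants switch from unsigned long to long because they can return a negative error (such as -ERESTARTSYS) as well as 0 or the remaining jiffies, and a negative value stored in an unsigned return can never test as less than zero. The user-space demo below shows the difference; the -512 constant merely stands in for -ERESTARTSYS:

#include <stdio.h>

/* -512 stands in for -ERESTARTSYS. */
static unsigned long old_style(void) { return (unsigned long)-512; }
static long new_style(void) { return -512; }

int main(void)
{
        unsigned long old_ret = old_style();
        long new_ret = new_style();

        /*
         * With an unsigned return, "old_ret < 0" can never be true, so an
         * interrupted wait comes back looking like a huge jiffies count.
         */
        printf("old-style return: %lu\n", old_ret);

        if (new_ret < 0)
                printf("new-style return: error %ld\n", new_ret);
        else if (new_ret == 0)
                printf("new-style return: timed out\n");
        else
                printf("new-style return: %ld jiffies left\n", new_ret);
        return 0;
}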
@@ -4901,7 +4768,7 @@ static bool check_same_owner(struct task_struct *p)
4901} 4768}
4902 4769
4903static int __sched_setscheduler(struct task_struct *p, int policy, 4770static int __sched_setscheduler(struct task_struct *p, int policy,
4904 struct sched_param *param, bool user) 4771 const struct sched_param *param, bool user)
4905{ 4772{
4906 int retval, oldprio, oldpolicy = -1, on_rq, running; 4773 int retval, oldprio, oldpolicy = -1, on_rq, running;
4907 unsigned long flags; 4774 unsigned long flags;
@@ -5004,7 +4871,8 @@ recheck:
5004 * assigned. 4871 * assigned.
5005 */ 4872 */
5006 if (rt_bandwidth_enabled() && rt_policy(policy) && 4873 if (rt_bandwidth_enabled() && rt_policy(policy) &&
5007 task_group(p)->rt_bandwidth.rt_runtime == 0) { 4874 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
4875 !task_group_is_autogroup(task_group(p))) {
5008 __task_rq_unlock(rq); 4876 __task_rq_unlock(rq);
5009 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 4877 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5010 return -EPERM; 4878 return -EPERM;
@@ -5056,7 +4924,7 @@ recheck:
5056 * NOTE that the task may be already dead. 4924 * NOTE that the task may be already dead.
5057 */ 4925 */
5058int sched_setscheduler(struct task_struct *p, int policy, 4926int sched_setscheduler(struct task_struct *p, int policy,
5059 struct sched_param *param) 4927 const struct sched_param *param)
5060{ 4928{
5061 return __sched_setscheduler(p, policy, param, true); 4929 return __sched_setscheduler(p, policy, param, true);
5062} 4930}
@@ -5074,7 +4942,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
5074 * but our caller might not have that capability. 4942 * but our caller might not have that capability.
5075 */ 4943 */
5076int sched_setscheduler_nocheck(struct task_struct *p, int policy, 4944int sched_setscheduler_nocheck(struct task_struct *p, int policy,
5077 struct sched_param *param) 4945 const struct sched_param *param)
5078{ 4946{
5079 return __sched_setscheduler(p, policy, param, false); 4947 return __sched_setscheduler(p, policy, param, false);
5080} 4948}
@@ -5590,7 +5458,7 @@ void sched_show_task(struct task_struct *p)
5590 unsigned state; 5458 unsigned state;
5591 5459
5592 state = p->state ? __ffs(p->state) + 1 : 0; 5460 state = p->state ? __ffs(p->state) + 1 : 0;
5593 printk(KERN_INFO "%-13.13s %c", p->comm, 5461 printk(KERN_INFO "%-15.15s %c", p->comm,
5594 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); 5462 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
5595#if BITS_PER_LONG == 32 5463#if BITS_PER_LONG == 32
5596 if (state == TASK_RUNNING) 5464 if (state == TASK_RUNNING)
@@ -5754,7 +5622,6 @@ static void update_sysctl(void)
5754 SET_SYSCTL(sched_min_granularity); 5622 SET_SYSCTL(sched_min_granularity);
5755 SET_SYSCTL(sched_latency); 5623 SET_SYSCTL(sched_latency);
5756 SET_SYSCTL(sched_wakeup_granularity); 5624 SET_SYSCTL(sched_wakeup_granularity);
5757 SET_SYSCTL(sched_shares_ratelimit);
5758#undef SET_SYSCTL 5625#undef SET_SYSCTL
5759} 5626}
5760 5627
@@ -5830,7 +5697,7 @@ again:
5830 goto out; 5697 goto out;
5831 5698
5832 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); 5699 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
5833 if (migrate_task(p, dest_cpu)) { 5700 if (migrate_task(p, rq)) {
5834 struct migration_arg arg = { p, dest_cpu }; 5701 struct migration_arg arg = { p, dest_cpu };
5835 /* Need help from migration thread: drop lock and wait. */ 5702 /* Need help from migration thread: drop lock and wait. */
5836 task_rq_unlock(rq, &flags); 5703 task_rq_unlock(rq, &flags);
@@ -5912,29 +5779,20 @@ static int migration_cpu_stop(void *data)
5912} 5779}
5913 5780
5914#ifdef CONFIG_HOTPLUG_CPU 5781#ifdef CONFIG_HOTPLUG_CPU
5782
5915/* 5783/*
5916 * Figure out where task on dead CPU should go, use force if necessary. 5784 * Ensures that the idle task is using init_mm right before its cpu goes
5785 * offline.
5917 */ 5786 */
5918void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 5787void idle_task_exit(void)
5919{ 5788{
5920 struct rq *rq = cpu_rq(dead_cpu); 5789 struct mm_struct *mm = current->active_mm;
5921 int needs_cpu, uninitialized_var(dest_cpu);
5922 unsigned long flags;
5923 5790
5924 local_irq_save(flags); 5791 BUG_ON(cpu_online(smp_processor_id()));
5925 5792
5926 raw_spin_lock(&rq->lock); 5793 if (mm != &init_mm)
5927 needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING); 5794 switch_mm(mm, &init_mm, current);
5928 if (needs_cpu) 5795 mmdrop(mm);
5929 dest_cpu = select_fallback_rq(dead_cpu, p);
5930 raw_spin_unlock(&rq->lock);
5931 /*
5932 * It can only fail if we race with set_cpus_allowed(),
5933 * in the racer should migrate the task anyway.
5934 */
5935 if (needs_cpu)
5936 __migrate_task(p, dead_cpu, dest_cpu);
5937 local_irq_restore(flags);
5938} 5796}
5939 5797
5940/* 5798/*
@@ -5947,128 +5805,69 @@ void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
5947static void migrate_nr_uninterruptible(struct rq *rq_src) 5805static void migrate_nr_uninterruptible(struct rq *rq_src)
5948{ 5806{
5949 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); 5807 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
5950 unsigned long flags;
5951 5808
5952 local_irq_save(flags);
5953 double_rq_lock(rq_src, rq_dest);
5954 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; 5809 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
5955 rq_src->nr_uninterruptible = 0; 5810 rq_src->nr_uninterruptible = 0;
5956 double_rq_unlock(rq_src, rq_dest);
5957 local_irq_restore(flags);
5958}
5959
5960/* Run through task list and migrate tasks from the dead cpu. */
5961static void migrate_live_tasks(int src_cpu)
5962{
5963 struct task_struct *p, *t;
5964
5965 read_lock(&tasklist_lock);
5966
5967 do_each_thread(t, p) {
5968 if (p == current)
5969 continue;
5970
5971 if (task_cpu(p) == src_cpu)
5972 move_task_off_dead_cpu(src_cpu, p);
5973 } while_each_thread(t, p);
5974
5975 read_unlock(&tasklist_lock);
5976} 5811}
5977 5812
5978/* 5813/*
5979 * Schedules idle task to be the next runnable task on current CPU. 5814 * remove the tasks which were accounted by rq from calc_load_tasks.
5980 * It does so by boosting its priority to highest possible.
5981 * Used by CPU offline code.
5982 */ 5815 */
5983void sched_idle_next(void) 5816static void calc_global_load_remove(struct rq *rq)
5984{ 5817{
5985 int this_cpu = smp_processor_id(); 5818 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
5986 struct rq *rq = cpu_rq(this_cpu); 5819 rq->calc_load_active = 0;
5987 struct task_struct *p = rq->idle;
5988 unsigned long flags;
5989
5990 /* cpu has to be offline */
5991 BUG_ON(cpu_online(this_cpu));
5992
5993 /*
5994 * Strictly not necessary since rest of the CPUs are stopped by now
5995 * and interrupts disabled on the current cpu.
5996 */
5997 raw_spin_lock_irqsave(&rq->lock, flags);
5998
5999 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
6000
6001 activate_task(rq, p, 0);
6002
6003 raw_spin_unlock_irqrestore(&rq->lock, flags);
6004} 5820}
6005 5821
6006/* 5822/*
6007 * Ensures that the idle task is using init_mm right before its cpu goes 5823 * Migrate all tasks from the rq, sleeping tasks will be migrated by
6008 * offline. 5824 * try_to_wake_up()->select_task_rq().
5825 *
5826 * Called with rq->lock held even though we're in stop_machine() and
5827 * there's no concurrency possible; we hold the required locks anyway
5828 * because of lock validation efforts.
6009 */ 5829 */
6010void idle_task_exit(void) 5830static void migrate_tasks(unsigned int dead_cpu)
6011{
6012 struct mm_struct *mm = current->active_mm;
6013
6014 BUG_ON(cpu_online(smp_processor_id()));
6015
6016 if (mm != &init_mm)
6017 switch_mm(mm, &init_mm, current);
6018 mmdrop(mm);
6019}
6020
6021/* called under rq->lock with disabled interrupts */
6022static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
6023{ 5831{
6024 struct rq *rq = cpu_rq(dead_cpu); 5832 struct rq *rq = cpu_rq(dead_cpu);
6025 5833 struct task_struct *next, *stop = rq->stop;
6026 /* Must be exiting, otherwise would be on tasklist. */ 5834 int dest_cpu;
6027 BUG_ON(!p->exit_state);
6028
6029 /* Cannot have done final schedule yet: would have vanished. */
6030 BUG_ON(p->state == TASK_DEAD);
6031
6032 get_task_struct(p);
6033 5835
6034 /* 5836 /*
6035 * Drop lock around migration; if someone else moves it, 5837 * Fudge the rq selection such that the below task selection loop
6036 * that's OK. No task can be added to this CPU, so iteration is 5838 * doesn't get stuck on the currently eligible stop task.
6037 * fine. 5839 *
5840 * We're currently inside stop_machine() and the rq is either stuck
5841 * in the stop_machine_cpu_stop() loop, or we're executing this code,
5842 * either way we should never end up calling schedule() until we're
5843 * done here.
6038 */ 5844 */
6039 raw_spin_unlock_irq(&rq->lock); 5845 rq->stop = NULL;
6040 move_task_off_dead_cpu(dead_cpu, p);
6041 raw_spin_lock_irq(&rq->lock);
6042
6043 put_task_struct(p);
6044}
6045
6046/* release_task() removes task from tasklist, so we won't find dead tasks. */
6047static void migrate_dead_tasks(unsigned int dead_cpu)
6048{
6049 struct rq *rq = cpu_rq(dead_cpu);
6050 struct task_struct *next;
6051 5846
6052 for ( ; ; ) { 5847 for ( ; ; ) {
6053 if (!rq->nr_running) 5848 /*
5849 * There's this thread running, bail when that's the only
5850 * remaining thread.
5851 */
5852 if (rq->nr_running == 1)
6054 break; 5853 break;
5854
6055 next = pick_next_task(rq); 5855 next = pick_next_task(rq);
6056 if (!next) 5856 BUG_ON(!next);
6057 break;
6058 next->sched_class->put_prev_task(rq, next); 5857 next->sched_class->put_prev_task(rq, next);
6059 migrate_dead(dead_cpu, next);
6060 5858
5859 /* Find suitable destination for @next, with force if needed. */
5860 dest_cpu = select_fallback_rq(dead_cpu, next);
5861 raw_spin_unlock(&rq->lock);
5862
5863 __migrate_task(next, dead_cpu, dest_cpu);
5864
5865 raw_spin_lock(&rq->lock);
6061 } 5866 }
6062}
6063 5867
6064/* 5868 rq->stop = stop;
6065 * remove the tasks which were accounted by rq from calc_load_tasks.
6066 */
6067static void calc_global_load_remove(struct rq *rq)
6068{
6069 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
6070 rq->calc_load_active = 0;
6071} 5869}
5870
6072#endif /* CONFIG_HOTPLUG_CPU */ 5871#endif /* CONFIG_HOTPLUG_CPU */
6073 5872
6074#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) 5873#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -6278,15 +6077,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6278 unsigned long flags; 6077 unsigned long flags;
6279 struct rq *rq = cpu_rq(cpu); 6078 struct rq *rq = cpu_rq(cpu);
6280 6079
6281 switch (action) { 6080 switch (action & ~CPU_TASKS_FROZEN) {
6282 6081
6283 case CPU_UP_PREPARE: 6082 case CPU_UP_PREPARE:
6284 case CPU_UP_PREPARE_FROZEN:
6285 rq->calc_load_update = calc_load_update; 6083 rq->calc_load_update = calc_load_update;
6286 break; 6084 break;
6287 6085
6288 case CPU_ONLINE: 6086 case CPU_ONLINE:
6289 case CPU_ONLINE_FROZEN:
6290 /* Update our root-domain */ 6087 /* Update our root-domain */
6291 raw_spin_lock_irqsave(&rq->lock, flags); 6088 raw_spin_lock_irqsave(&rq->lock, flags);
6292 if (rq->rd) { 6089 if (rq->rd) {
@@ -6298,30 +6095,19 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6298 break; 6095 break;
6299 6096
6300#ifdef CONFIG_HOTPLUG_CPU 6097#ifdef CONFIG_HOTPLUG_CPU
6301 case CPU_DEAD:
6302 case CPU_DEAD_FROZEN:
6303 migrate_live_tasks(cpu);
6304 /* Idle task back to normal (off runqueue, low prio) */
6305 raw_spin_lock_irq(&rq->lock);
6306 deactivate_task(rq, rq->idle, 0);
6307 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
6308 rq->idle->sched_class = &idle_sched_class;
6309 migrate_dead_tasks(cpu);
6310 raw_spin_unlock_irq(&rq->lock);
6311 migrate_nr_uninterruptible(rq);
6312 BUG_ON(rq->nr_running != 0);
6313 calc_global_load_remove(rq);
6314 break;
6315
6316 case CPU_DYING: 6098 case CPU_DYING:
6317 case CPU_DYING_FROZEN:
6318 /* Update our root-domain */ 6099 /* Update our root-domain */
6319 raw_spin_lock_irqsave(&rq->lock, flags); 6100 raw_spin_lock_irqsave(&rq->lock, flags);
6320 if (rq->rd) { 6101 if (rq->rd) {
6321 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 6102 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
6322 set_rq_offline(rq); 6103 set_rq_offline(rq);
6323 } 6104 }
6105 migrate_tasks(cpu);
6106 BUG_ON(rq->nr_running != 1); /* the migration thread */
6324 raw_spin_unlock_irqrestore(&rq->lock, flags); 6107 raw_spin_unlock_irqrestore(&rq->lock, flags);
6108
6109 migrate_nr_uninterruptible(rq);
6110 calc_global_load_remove(rq);
6325 break; 6111 break;
6326#endif 6112#endif
6327 } 6113 }
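Masking off CPU_TASKS_FROZEN lets one case label handle both the normal and the suspend/resume (_FROZEN) variant of each hotplug event, which is why the explicit *_FROZEN cases disappear above. The small decoder below shows the trick; the numeric values are stand-ins for those in include/linux/cpu.h:

#include <stdio.h>

#define CPU_UP_PREPARE          0x0003
#define CPU_ONLINE              0x0002
#define CPU_DYING               0x0008
#define CPU_TASKS_FROZEN        0x0010

static const char *decode(unsigned long action)
{
        switch (action & ~CPU_TASKS_FROZEN) {   /* one case covers both variants */
        case CPU_UP_PREPARE:    return "up-prepare";
        case CPU_ONLINE:        return "online";
        case CPU_DYING:         return "dying";
        default:                return "ignored";
        }
}

int main(void)
{
        printf("%s\n", decode(CPU_ONLINE));
        printf("%s\n", decode(CPU_ONLINE | CPU_TASKS_FROZEN));  /* same path */
        return 0;
}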
@@ -8052,18 +7838,16 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
8052 7838
8053#ifdef CONFIG_FAIR_GROUP_SCHED 7839#ifdef CONFIG_FAIR_GROUP_SCHED
8054static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, 7840static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
8055 struct sched_entity *se, int cpu, int add, 7841 struct sched_entity *se, int cpu,
8056 struct sched_entity *parent) 7842 struct sched_entity *parent)
8057{ 7843{
8058 struct rq *rq = cpu_rq(cpu); 7844 struct rq *rq = cpu_rq(cpu);
8059 tg->cfs_rq[cpu] = cfs_rq; 7845 tg->cfs_rq[cpu] = cfs_rq;
8060 init_cfs_rq(cfs_rq, rq); 7846 init_cfs_rq(cfs_rq, rq);
8061 cfs_rq->tg = tg; 7847 cfs_rq->tg = tg;
8062 if (add)
8063 list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
8064 7848
8065 tg->se[cpu] = se; 7849 tg->se[cpu] = se;
8066 /* se could be NULL for init_task_group */ 7850 /* se could be NULL for root_task_group */
8067 if (!se) 7851 if (!se)
8068 return; 7852 return;
8069 7853
@@ -8073,15 +7857,14 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
8073 se->cfs_rq = parent->my_q; 7857 se->cfs_rq = parent->my_q;
8074 7858
8075 se->my_q = cfs_rq; 7859 se->my_q = cfs_rq;
8076 se->load.weight = tg->shares; 7860 update_load_set(&se->load, 0);
8077 se->load.inv_weight = 0;
8078 se->parent = parent; 7861 se->parent = parent;
8079} 7862}
8080#endif 7863#endif
8081 7864
8082#ifdef CONFIG_RT_GROUP_SCHED 7865#ifdef CONFIG_RT_GROUP_SCHED
8083static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, 7866static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
8084 struct sched_rt_entity *rt_se, int cpu, int add, 7867 struct sched_rt_entity *rt_se, int cpu,
8085 struct sched_rt_entity *parent) 7868 struct sched_rt_entity *parent)
8086{ 7869{
8087 struct rq *rq = cpu_rq(cpu); 7870 struct rq *rq = cpu_rq(cpu);
@@ -8090,8 +7873,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
8090 init_rt_rq(rt_rq, rq); 7873 init_rt_rq(rt_rq, rq);
8091 rt_rq->tg = tg; 7874 rt_rq->tg = tg;
8092 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; 7875 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
8093 if (add)
8094 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
8095 7876
8096 tg->rt_se[cpu] = rt_se; 7877 tg->rt_se[cpu] = rt_se;
8097 if (!rt_se) 7878 if (!rt_se)
@@ -8126,18 +7907,18 @@ void __init sched_init(void)
8126 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); 7907 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
8127 7908
8128#ifdef CONFIG_FAIR_GROUP_SCHED 7909#ifdef CONFIG_FAIR_GROUP_SCHED
8129 init_task_group.se = (struct sched_entity **)ptr; 7910 root_task_group.se = (struct sched_entity **)ptr;
8130 ptr += nr_cpu_ids * sizeof(void **); 7911 ptr += nr_cpu_ids * sizeof(void **);
8131 7912
8132 init_task_group.cfs_rq = (struct cfs_rq **)ptr; 7913 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
8133 ptr += nr_cpu_ids * sizeof(void **); 7914 ptr += nr_cpu_ids * sizeof(void **);
8134 7915
8135#endif /* CONFIG_FAIR_GROUP_SCHED */ 7916#endif /* CONFIG_FAIR_GROUP_SCHED */
8136#ifdef CONFIG_RT_GROUP_SCHED 7917#ifdef CONFIG_RT_GROUP_SCHED
8137 init_task_group.rt_se = (struct sched_rt_entity **)ptr; 7918 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
8138 ptr += nr_cpu_ids * sizeof(void **); 7919 ptr += nr_cpu_ids * sizeof(void **);
8139 7920
8140 init_task_group.rt_rq = (struct rt_rq **)ptr; 7921 root_task_group.rt_rq = (struct rt_rq **)ptr;
8141 ptr += nr_cpu_ids * sizeof(void **); 7922 ptr += nr_cpu_ids * sizeof(void **);
8142 7923
8143#endif /* CONFIG_RT_GROUP_SCHED */ 7924#endif /* CONFIG_RT_GROUP_SCHED */
@@ -8157,20 +7938,16 @@ void __init sched_init(void)
8157 global_rt_period(), global_rt_runtime()); 7938 global_rt_period(), global_rt_runtime());
8158 7939
8159#ifdef CONFIG_RT_GROUP_SCHED 7940#ifdef CONFIG_RT_GROUP_SCHED
8160 init_rt_bandwidth(&init_task_group.rt_bandwidth, 7941 init_rt_bandwidth(&root_task_group.rt_bandwidth,
8161 global_rt_period(), global_rt_runtime()); 7942 global_rt_period(), global_rt_runtime());
8162#endif /* CONFIG_RT_GROUP_SCHED */ 7943#endif /* CONFIG_RT_GROUP_SCHED */
8163 7944
8164#ifdef CONFIG_CGROUP_SCHED 7945#ifdef CONFIG_CGROUP_SCHED
8165 list_add(&init_task_group.list, &task_groups); 7946 list_add(&root_task_group.list, &task_groups);
8166 INIT_LIST_HEAD(&init_task_group.children); 7947 INIT_LIST_HEAD(&root_task_group.children);
8167 7948 autogroup_init(&init_task);
8168#endif /* CONFIG_CGROUP_SCHED */ 7949#endif /* CONFIG_CGROUP_SCHED */
8169 7950
8170#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
8171 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
8172 __alignof__(unsigned long));
8173#endif
8174 for_each_possible_cpu(i) { 7951 for_each_possible_cpu(i) {
8175 struct rq *rq; 7952 struct rq *rq;
8176 7953
@@ -8182,38 +7959,34 @@ void __init sched_init(void)
8182 init_cfs_rq(&rq->cfs, rq); 7959 init_cfs_rq(&rq->cfs, rq);
8183 init_rt_rq(&rq->rt, rq); 7960 init_rt_rq(&rq->rt, rq);
8184#ifdef CONFIG_FAIR_GROUP_SCHED 7961#ifdef CONFIG_FAIR_GROUP_SCHED
8185 init_task_group.shares = init_task_group_load; 7962 root_task_group.shares = root_task_group_load;
8186 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 7963 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
8187#ifdef CONFIG_CGROUP_SCHED
8188 /* 7964 /*
8189 * How much cpu bandwidth does init_task_group get? 7965 * How much cpu bandwidth does root_task_group get?
8190 * 7966 *
8191 * In case of task-groups formed thr' the cgroup filesystem, it 7967 * In case of task-groups formed thr' the cgroup filesystem, it
8192 * gets 100% of the cpu resources in the system. This overall 7968 * gets 100% of the cpu resources in the system. This overall
8193 * system cpu resource is divided among the tasks of 7969 * system cpu resource is divided among the tasks of
8194 * init_task_group and its child task-groups in a fair manner, 7970 * root_task_group and its child task-groups in a fair manner,
8195 * based on each entity's (task or task-group's) weight 7971 * based on each entity's (task or task-group's) weight
8196 * (se->load.weight). 7972 * (se->load.weight).
8197 * 7973 *
8198 * In other words, if init_task_group has 10 tasks of weight 7974 * In other words, if root_task_group has 10 tasks of weight
8199 * 1024) and two child groups A0 and A1 (of weight 1024 each), 7975 * 1024) and two child groups A0 and A1 (of weight 1024 each),
8200 * then A0's share of the cpu resource is: 7976 * then A0's share of the cpu resource is:
8201 * 7977 *
8202 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% 7978 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
8203 * 7979 *
8204 * We achieve this by letting init_task_group's tasks sit 7980 * We achieve this by letting root_task_group's tasks sit
8205 * directly in rq->cfs (i.e init_task_group->se[] = NULL). 7981 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
8206 */ 7982 */
8207 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); 7983 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
8208#endif
8209#endif /* CONFIG_FAIR_GROUP_SCHED */ 7984#endif /* CONFIG_FAIR_GROUP_SCHED */
8210 7985
8211 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; 7986 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
8212#ifdef CONFIG_RT_GROUP_SCHED 7987#ifdef CONFIG_RT_GROUP_SCHED
8213 INIT_LIST_HEAD(&rq->leaf_rt_rq_list); 7988 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
8214#ifdef CONFIG_CGROUP_SCHED 7989 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
8215 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
8216#endif
8217#endif 7990#endif
8218 7991
8219 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 7992 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
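The comment kept above works a concrete bandwidth example for root_task_group: with 10 weight-1024 tasks plus two child groups A0 and A1 of weight 1024 each, A0 gets 1024 / (10*1024 + 1024 + 1024) of the CPU. A two-line check of that arithmetic:

/* Worked check of the comment's example: A0's share of CPU time when the
 * root group runs 10 weight-1024 tasks plus groups A0 and A1 (1024 each). */
#include <stdio.h>

int main(void)
{
	double a0 = 1024.0, total = 10 * 1024.0 + 1024.0 + 1024.0;

	printf("A0 bandwidth = %.2f%%\n", 100.0 * a0 / total);	/* 8.33% */
	return 0;
}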
@@ -8293,8 +8066,6 @@ void __init sched_init(void)
8293 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 8066 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
8294#endif /* SMP */ 8067#endif /* SMP */
8295 8068
8296 perf_event_init();
8297
8298 scheduler_running = 1; 8069 scheduler_running = 1;
8299} 8070}
8300 8071
@@ -8488,7 +8259,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8488 if (!se) 8259 if (!se)
8489 goto err_free_rq; 8260 goto err_free_rq;
8490 8261
8491 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); 8262 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
8492 } 8263 }
8493 8264
8494 return 1; 8265 return 1;
@@ -8499,15 +8270,21 @@ err:
8499 return 0; 8270 return 0;
8500} 8271}
8501 8272
8502static inline void register_fair_sched_group(struct task_group *tg, int cpu)
8503{
8504 list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list,
8505 &cpu_rq(cpu)->leaf_cfs_rq_list);
8506}
8507
8508static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8273static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8509{ 8274{
8510 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); 8275 struct rq *rq = cpu_rq(cpu);
8276 unsigned long flags;
8277
8278 /*
8279 * Only empty task groups can be destroyed; so we can speculatively
8280 * check on_list without danger of it being re-added.
8281 */
8282 if (!tg->cfs_rq[cpu]->on_list)
8283 return;
8284
8285 raw_spin_lock_irqsave(&rq->lock, flags);
8286 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
8287 raw_spin_unlock_irqrestore(&rq->lock, flags);
8511} 8288}
8512#else /* !CONFIG_FAIR_GROUP_SCHED */ 8289
8513static inline void free_fair_sched_group(struct task_group *tg) 8290static inline void free_fair_sched_group(struct task_group *tg)
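unregister_fair_sched_group() above checks ->on_list before bothering with rq->lock; as its comment notes, this is only safe because nothing can re-add an empty, about-to-be-destroyed group. A hedged userspace sketch of that check-then-lock shortcut, using generic invented names rather than the kernel's list primitives:

/* Sketch of the "speculatively check, then lock only if needed" shortcut.
 * Safe only when the flag cannot be set again once teardown has started. */
#include <pthread.h>
#include <stdbool.h>

struct node {
	bool on_list;
	struct node *prev, *next;
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

static void unregister_node(struct node *n)
{
	if (!n->on_list)		/* unlocked fast path */
		return;

	pthread_mutex_lock(&list_lock);
	n->prev->next = n->next;	/* list_del() equivalent */
	n->next->prev = n->prev;
	n->on_list = false;
	pthread_mutex_unlock(&list_lock);
}

int main(void)
{
	struct node head = { .on_list = true }, n = { .on_list = true };

	head.next = &n; head.prev = &n;
	n.next = &head; n.prev = &head;

	unregister_node(&n);	/* takes the lock, unlinks */
	unregister_node(&n);	/* second call returns on the fast path */
	return 0;
}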
@@ -8520,10 +8297,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8520 return 1; 8297 return 1;
8521} 8298}
8522 8299
8523static inline void register_fair_sched_group(struct task_group *tg, int cpu)
8524{
8525}
8526
8527static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8300static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8528{ 8301{
8529} 8302}
@@ -8578,7 +8351,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8578 if (!rt_se) 8351 if (!rt_se)
8579 goto err_free_rq; 8352 goto err_free_rq;
8580 8353
8581 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); 8354 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
8582 } 8355 }
8583 8356
8584 return 1; 8357 return 1;
@@ -8588,17 +8361,6 @@ err_free_rq:
8588err: 8361err:
8589 return 0; 8362 return 0;
8590} 8363}
8591
8592static inline void register_rt_sched_group(struct task_group *tg, int cpu)
8593{
8594 list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list,
8595 &cpu_rq(cpu)->leaf_rt_rq_list);
8596}
8597
8598static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8599{
8600 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
8601}
8602#else /* !CONFIG_RT_GROUP_SCHED */ 8364#else /* !CONFIG_RT_GROUP_SCHED */
8603static inline void free_rt_sched_group(struct task_group *tg) 8365static inline void free_rt_sched_group(struct task_group *tg)
8604{ 8366{
@@ -8609,14 +8371,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8609{ 8371{
8610 return 1; 8372 return 1;
8611} 8373}
8612
8613static inline void register_rt_sched_group(struct task_group *tg, int cpu)
8614{
8615}
8616
8617static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8618{
8619}
8620#endif /* CONFIG_RT_GROUP_SCHED */ 8374#endif /* CONFIG_RT_GROUP_SCHED */
8621 8375
8622#ifdef CONFIG_CGROUP_SCHED 8376#ifdef CONFIG_CGROUP_SCHED
@@ -8624,6 +8378,7 @@ static void free_sched_group(struct task_group *tg)
8624{ 8378{
8625 free_fair_sched_group(tg); 8379 free_fair_sched_group(tg);
8626 free_rt_sched_group(tg); 8380 free_rt_sched_group(tg);
8381 autogroup_free(tg);
8627 kfree(tg); 8382 kfree(tg);
8628} 8383}
8629 8384
@@ -8632,7 +8387,6 @@ struct task_group *sched_create_group(struct task_group *parent)
8632{ 8387{
8633 struct task_group *tg; 8388 struct task_group *tg;
8634 unsigned long flags; 8389 unsigned long flags;
8635 int i;
8636 8390
8637 tg = kzalloc(sizeof(*tg), GFP_KERNEL); 8391 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
8638 if (!tg) 8392 if (!tg)
@@ -8645,10 +8399,6 @@ struct task_group *sched_create_group(struct task_group *parent)
8645 goto err; 8399 goto err;
8646 8400
8647 spin_lock_irqsave(&task_group_lock, flags); 8401 spin_lock_irqsave(&task_group_lock, flags);
8648 for_each_possible_cpu(i) {
8649 register_fair_sched_group(tg, i);
8650 register_rt_sched_group(tg, i);
8651 }
8652 list_add_rcu(&tg->list, &task_groups); 8402 list_add_rcu(&tg->list, &task_groups);
8653 8403
8654 WARN_ON(!parent); /* root should already exist */ 8404 WARN_ON(!parent); /* root should already exist */
@@ -8678,11 +8428,11 @@ void sched_destroy_group(struct task_group *tg)
8678 unsigned long flags; 8428 unsigned long flags;
8679 int i; 8429 int i;
8680 8430
8681 spin_lock_irqsave(&task_group_lock, flags); 8431 /* end participation in shares distribution */
8682 for_each_possible_cpu(i) { 8432 for_each_possible_cpu(i)
8683 unregister_fair_sched_group(tg, i); 8433 unregister_fair_sched_group(tg, i);
8684 unregister_rt_sched_group(tg, i); 8434
8685 } 8435 spin_lock_irqsave(&task_group_lock, flags);
8686 list_del_rcu(&tg->list); 8436 list_del_rcu(&tg->list);
8687 list_del_rcu(&tg->siblings); 8437 list_del_rcu(&tg->siblings);
8688 spin_unlock_irqrestore(&task_group_lock, flags); 8438 spin_unlock_irqrestore(&task_group_lock, flags);
@@ -8729,33 +8479,6 @@ void sched_move_task(struct task_struct *tsk)
8729#endif /* CONFIG_CGROUP_SCHED */ 8479#endif /* CONFIG_CGROUP_SCHED */
8730 8480
8731#ifdef CONFIG_FAIR_GROUP_SCHED 8481#ifdef CONFIG_FAIR_GROUP_SCHED
8732static void __set_se_shares(struct sched_entity *se, unsigned long shares)
8733{
8734 struct cfs_rq *cfs_rq = se->cfs_rq;
8735 int on_rq;
8736
8737 on_rq = se->on_rq;
8738 if (on_rq)
8739 dequeue_entity(cfs_rq, se, 0);
8740
8741 se->load.weight = shares;
8742 se->load.inv_weight = 0;
8743
8744 if (on_rq)
8745 enqueue_entity(cfs_rq, se, 0);
8746}
8747
8748static void set_se_shares(struct sched_entity *se, unsigned long shares)
8749{
8750 struct cfs_rq *cfs_rq = se->cfs_rq;
8751 struct rq *rq = cfs_rq->rq;
8752 unsigned long flags;
8753
8754 raw_spin_lock_irqsave(&rq->lock, flags);
8755 __set_se_shares(se, shares);
8756 raw_spin_unlock_irqrestore(&rq->lock, flags);
8757}
8758
8759static DEFINE_MUTEX(shares_mutex); 8482static DEFINE_MUTEX(shares_mutex);
8760 8483
8761int sched_group_set_shares(struct task_group *tg, unsigned long shares) 8484int sched_group_set_shares(struct task_group *tg, unsigned long shares)
@@ -8778,37 +8501,19 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8778 if (tg->shares == shares) 8501 if (tg->shares == shares)
8779 goto done; 8502 goto done;
8780 8503
8781 spin_lock_irqsave(&task_group_lock, flags);
8782 for_each_possible_cpu(i)
8783 unregister_fair_sched_group(tg, i);
8784 list_del_rcu(&tg->siblings);
8785 spin_unlock_irqrestore(&task_group_lock, flags);
8786
8787 /* wait for any ongoing reference to this group to finish */
8788 synchronize_sched();
8789
8790 /*
8791 * Now we are free to modify the group's share on each cpu
8792 * w/o tripping rebalance_share or load_balance_fair.
8793 */
8794 tg->shares = shares; 8504 tg->shares = shares;
8795 for_each_possible_cpu(i) { 8505 for_each_possible_cpu(i) {
8796 /* 8506 struct rq *rq = cpu_rq(i);
8797 * force a rebalance 8507 struct sched_entity *se;
8798 */ 8508
8799 cfs_rq_set_shares(tg->cfs_rq[i], 0); 8509 se = tg->se[i];
8800 set_se_shares(tg->se[i], shares); 8510 /* Propagate contribution to hierarchy */
8511 raw_spin_lock_irqsave(&rq->lock, flags);
8512 for_each_sched_entity(se)
8513 update_cfs_shares(group_cfs_rq(se), 0);
8514 raw_spin_unlock_irqrestore(&rq->lock, flags);
8801 } 8515 }
8802 8516
8803 /*
8804 * Enable load balance activity on this group, by inserting it back on
8805 * each cpu's rq->leaf_cfs_rq_list.
8806 */
8807 spin_lock_irqsave(&task_group_lock, flags);
8808 for_each_possible_cpu(i)
8809 register_fair_sched_group(tg, i);
8810 list_add_rcu(&tg->siblings, &tg->parent->children);
8811 spin_unlock_irqrestore(&task_group_lock, flags);
8812done: 8517done:
8813 mutex_unlock(&shares_mutex); 8518 mutex_unlock(&shares_mutex);
8814 return 0; 8519 return 0;
@@ -9107,7 +8812,7 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
9107 8812
9108 if (!cgrp->parent) { 8813 if (!cgrp->parent) {
9109 /* This is early initialization for the top cgroup */ 8814 /* This is early initialization for the top cgroup */
9110 return &init_task_group.css; 8815 return &root_task_group.css;
9111 } 8816 }
9112 8817
9113 parent = cgroup_tg(cgrp->parent); 8818 parent = cgroup_tg(cgrp->parent);
@@ -9178,6 +8883,20 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
9178 } 8883 }
9179} 8884}
9180 8885
8886static void
8887cpu_cgroup_exit(struct cgroup_subsys *ss, struct task_struct *task)
8888{
8889 /*
8890 * cgroup_exit() is called in the copy_process() failure path.
8891 * Ignore this case since the task hasn't ran yet, this avoids
8892 * trying to poke a half freed task state from generic code.
8893 */
8894 if (!(task->flags & PF_EXITING))
8895 return;
8896
8897 sched_move_task(task);
8898}
8899
9181#ifdef CONFIG_FAIR_GROUP_SCHED 8900#ifdef CONFIG_FAIR_GROUP_SCHED
9182static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, 8901static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
9183 u64 shareval) 8902 u64 shareval)
@@ -9250,6 +8969,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {
9250 .destroy = cpu_cgroup_destroy, 8969 .destroy = cpu_cgroup_destroy,
9251 .can_attach = cpu_cgroup_can_attach, 8970 .can_attach = cpu_cgroup_can_attach,
9252 .attach = cpu_cgroup_attach, 8971 .attach = cpu_cgroup_attach,
8972 .exit = cpu_cgroup_exit,
9253 .populate = cpu_cgroup_populate, 8973 .populate = cpu_cgroup_populate,
9254 .subsys_id = cpu_cgroup_subsys_id, 8974 .subsys_id = cpu_cgroup_subsys_id,
9255 .early_init = 1, 8975 .early_init = 1,
@@ -9534,72 +9254,3 @@ struct cgroup_subsys cpuacct_subsys = {
9534}; 9254};
9535#endif /* CONFIG_CGROUP_CPUACCT */ 9255#endif /* CONFIG_CGROUP_CPUACCT */
9536 9256
9537#ifndef CONFIG_SMP
9538
9539void synchronize_sched_expedited(void)
9540{
9541 barrier();
9542}
9543EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
9544
9545#else /* #ifndef CONFIG_SMP */
9546
9547static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);
9548
9549static int synchronize_sched_expedited_cpu_stop(void *data)
9550{
9551 /*
9552 * There must be a full memory barrier on each affected CPU
9553 * between the time that try_stop_cpus() is called and the
9554 * time that it returns.
9555 *
9556 * In the current initial implementation of cpu_stop, the
9557 * above condition is already met when the control reaches
9558 * this point and the following smp_mb() is not strictly
9559 * necessary. Do smp_mb() anyway for documentation and
9560 * robustness against future implementation changes.
9561 */
9562 smp_mb(); /* See above comment block. */
9563 return 0;
9564}
9565
9566/*
9567 * Wait for an rcu-sched grace period to elapse, but use "big hammer"
9568 * approach to force grace period to end quickly. This consumes
9569 * significant time on all CPUs, and is thus not recommended for
9570 * any sort of common-case code.
9571 *
9572 * Note that it is illegal to call this function while holding any
9573 * lock that is acquired by a CPU-hotplug notifier. Failing to
9574 * observe this restriction will result in deadlock.
9575 */
9576void synchronize_sched_expedited(void)
9577{
9578 int snap, trycount = 0;
9579
9580 smp_mb(); /* ensure prior mod happens before capturing snap. */
9581 snap = atomic_read(&synchronize_sched_expedited_count) + 1;
9582 get_online_cpus();
9583 while (try_stop_cpus(cpu_online_mask,
9584 synchronize_sched_expedited_cpu_stop,
9585 NULL) == -EAGAIN) {
9586 put_online_cpus();
9587 if (trycount++ < 10)
9588 udelay(trycount * num_online_cpus());
9589 else {
9590 synchronize_sched();
9591 return;
9592 }
9593 if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) {
9594 smp_mb(); /* ensure test happens before caller kfree */
9595 return;
9596 }
9597 get_online_cpus();
9598 }
9599 atomic_inc(&synchronize_sched_expedited_count);
9600 smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */
9601 put_online_cpus();
9602}
9603EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
9604
9605#endif /* #else #ifndef CONFIG_SMP */
diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c
new file mode 100644
index 000000000000..9fb656283157
--- /dev/null
+++ b/kernel/sched_autogroup.c
@@ -0,0 +1,270 @@
1#ifdef CONFIG_SCHED_AUTOGROUP
2
3#include <linux/proc_fs.h>
4#include <linux/seq_file.h>
5#include <linux/kallsyms.h>
6#include <linux/utsname.h>
7
8unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
9static struct autogroup autogroup_default;
10static atomic_t autogroup_seq_nr;
11
12static void __init autogroup_init(struct task_struct *init_task)
13{
14 autogroup_default.tg = &root_task_group;
15 root_task_group.autogroup = &autogroup_default;
16 kref_init(&autogroup_default.kref);
17 init_rwsem(&autogroup_default.lock);
18 init_task->signal->autogroup = &autogroup_default;
19}
20
21static inline void autogroup_free(struct task_group *tg)
22{
23 kfree(tg->autogroup);
24}
25
26static inline void autogroup_destroy(struct kref *kref)
27{
28 struct autogroup *ag = container_of(kref, struct autogroup, kref);
29
30#ifdef CONFIG_RT_GROUP_SCHED
31 /* We've redirected RT tasks to the root task group... */
32 ag->tg->rt_se = NULL;
33 ag->tg->rt_rq = NULL;
34#endif
35 sched_destroy_group(ag->tg);
36}
37
38static inline void autogroup_kref_put(struct autogroup *ag)
39{
40 kref_put(&ag->kref, autogroup_destroy);
41}
42
43static inline struct autogroup *autogroup_kref_get(struct autogroup *ag)
44{
45 kref_get(&ag->kref);
46 return ag;
47}
48
49static inline struct autogroup *autogroup_task_get(struct task_struct *p)
50{
51 struct autogroup *ag;
52 unsigned long flags;
53
54 if (!lock_task_sighand(p, &flags))
55 return autogroup_kref_get(&autogroup_default);
56
57 ag = autogroup_kref_get(p->signal->autogroup);
58 unlock_task_sighand(p, &flags);
59
60 return ag;
61}
62
63#ifdef CONFIG_RT_GROUP_SCHED
64static void free_rt_sched_group(struct task_group *tg);
65#endif
66
67static inline struct autogroup *autogroup_create(void)
68{
69 struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL);
70 struct task_group *tg;
71
72 if (!ag)
73 goto out_fail;
74
75 tg = sched_create_group(&root_task_group);
76
77 if (IS_ERR(tg))
78 goto out_free;
79
80 kref_init(&ag->kref);
81 init_rwsem(&ag->lock);
82 ag->id = atomic_inc_return(&autogroup_seq_nr);
83 ag->tg = tg;
84#ifdef CONFIG_RT_GROUP_SCHED
85 /*
86 * Autogroup RT tasks are redirected to the root task group
87 * so we don't have to move tasks around upon policy change,
88 * or flail around trying to allocate bandwidth on the fly.
89 * A bandwidth exception in __sched_setscheduler() allows
90 * the policy change to proceed. Thereafter, task_group()
91 * returns &root_task_group, so zero bandwidth is required.
92 */
93 free_rt_sched_group(tg);
94 tg->rt_se = root_task_group.rt_se;
95 tg->rt_rq = root_task_group.rt_rq;
96#endif
97 tg->autogroup = ag;
98
99 return ag;
100
101out_free:
102 kfree(ag);
103out_fail:
104 if (printk_ratelimit()) {
105 printk(KERN_WARNING "autogroup_create: %s failure.\n",
106 ag ? "sched_create_group()" : "kmalloc()");
107 }
108
109 return autogroup_kref_get(&autogroup_default);
110}
111
112static inline bool
113task_wants_autogroup(struct task_struct *p, struct task_group *tg)
114{
115 if (tg != &root_task_group)
116 return false;
117
118 if (p->sched_class != &fair_sched_class)
119 return false;
120
121 /*
122 * We can only assume the task group can't go away on us if
123 * autogroup_move_group() can see us on ->thread_group list.
124 */
125 if (p->flags & PF_EXITING)
126 return false;
127
128 return true;
129}
130
131static inline bool task_group_is_autogroup(struct task_group *tg)
132{
133 return tg != &root_task_group && tg->autogroup;
134}
135
136static inline struct task_group *
137autogroup_task_group(struct task_struct *p, struct task_group *tg)
138{
139 int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
140
141 if (enabled && task_wants_autogroup(p, tg))
142 return p->signal->autogroup->tg;
143
144 return tg;
145}
146
147static void
148autogroup_move_group(struct task_struct *p, struct autogroup *ag)
149{
150 struct autogroup *prev;
151 struct task_struct *t;
152 unsigned long flags;
153
154 BUG_ON(!lock_task_sighand(p, &flags));
155
156 prev = p->signal->autogroup;
157 if (prev == ag) {
158 unlock_task_sighand(p, &flags);
159 return;
160 }
161
162 p->signal->autogroup = autogroup_kref_get(ag);
163
164 t = p;
165 do {
166 sched_move_task(t);
167 } while_each_thread(p, t);
168
169 unlock_task_sighand(p, &flags);
170 autogroup_kref_put(prev);
171}
172
173/* Allocates GFP_KERNEL, cannot be called under any spinlock */
174void sched_autogroup_create_attach(struct task_struct *p)
175{
176 struct autogroup *ag = autogroup_create();
177
178 autogroup_move_group(p, ag);
179 /* drop extra reference added by autogroup_create() */
180 autogroup_kref_put(ag);
181}
182EXPORT_SYMBOL(sched_autogroup_create_attach);
183
184/* Cannot be called under siglock. Currently has no users */
185void sched_autogroup_detach(struct task_struct *p)
186{
187 autogroup_move_group(p, &autogroup_default);
188}
189EXPORT_SYMBOL(sched_autogroup_detach);
190
191void sched_autogroup_fork(struct signal_struct *sig)
192{
193 sig->autogroup = autogroup_task_get(current);
194}
195
196void sched_autogroup_exit(struct signal_struct *sig)
197{
198 autogroup_kref_put(sig->autogroup);
199}
200
201static int __init setup_autogroup(char *str)
202{
203 sysctl_sched_autogroup_enabled = 0;
204
205 return 1;
206}
207
208__setup("noautogroup", setup_autogroup);
209
210#ifdef CONFIG_PROC_FS
211
212int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice)
213{
214 static unsigned long next = INITIAL_JIFFIES;
215 struct autogroup *ag;
216 int err;
217
218 if (*nice < -20 || *nice > 19)
219 return -EINVAL;
220
221 err = security_task_setnice(current, *nice);
222 if (err)
223 return err;
224
225 if (*nice < 0 && !can_nice(current, *nice))
226 return -EPERM;
227
228 /* this is a heavy operation taking global locks.. */
229 if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next))
230 return -EAGAIN;
231
232 next = HZ / 10 + jiffies;
233 ag = autogroup_task_get(p);
234
235 down_write(&ag->lock);
236 err = sched_group_set_shares(ag->tg, prio_to_weight[*nice + 20]);
237 if (!err)
238 ag->nice = *nice;
239 up_write(&ag->lock);
240
241 autogroup_kref_put(ag);
242
243 return err;
244}
245
246void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m)
247{
248 struct autogroup *ag = autogroup_task_get(p);
249
250 down_read(&ag->lock);
251 seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice);
252 up_read(&ag->lock);
253
254 autogroup_kref_put(ag);
255}
256#endif /* CONFIG_PROC_FS */
257
258#ifdef CONFIG_SCHED_DEBUG
259static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
260{
261 int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
262
263 if (!enabled || !tg->autogroup)
264 return 0;
265
266 return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
267}
268#endif /* CONFIG_SCHED_DEBUG */
269
270#endif /* CONFIG_SCHED_AUTOGROUP */
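proc_sched_autogroup_set_nice() above maps the requested nice value onto group shares through prio_to_weight[nice + 20], so renicing an autogroup behaves like renicing one task of that weight. A sketch of the mapping; only three commonly cited entries of the scheduler's 40-entry weight table are reproduced here, the rest are elided:

/* Sketch: autogroup nice -> shares via the scheduler's weight table.
 * Only three sample table entries are quoted; see prio_to_weight[] in
 * kernel/sched.c for the full table. */
#include <stdio.h>

static unsigned long weight_of_nice(int nice)
{
	switch (nice) {
	case -20: return 88761;	/* highest weight */
	case   0: return 1024;	/* NICE_0_LOAD */
	case  19: return 15;	/* lowest weight */
	default:  return 1024;	/* elided entries: see prio_to_weight[] */
	}
}

int main(void)
{
	const int samples[] = { -20, 0, 19 };
	unsigned int i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("nice %3d -> shares %lu\n", samples[i],
		       weight_of_nice(samples[i]));
	return 0;
}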
diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h
new file mode 100644
index 000000000000..7b859ffe5dad
--- /dev/null
+++ b/kernel/sched_autogroup.h
@@ -0,0 +1,36 @@
1#ifdef CONFIG_SCHED_AUTOGROUP
2
3struct autogroup {
4 struct kref kref;
5 struct task_group *tg;
6 struct rw_semaphore lock;
7 unsigned long id;
8 int nice;
9};
10
11static inline struct task_group *
12autogroup_task_group(struct task_struct *p, struct task_group *tg);
13
14#else /* !CONFIG_SCHED_AUTOGROUP */
15
16static inline void autogroup_init(struct task_struct *init_task) { }
17static inline void autogroup_free(struct task_group *tg) { }
18static inline bool task_group_is_autogroup(struct task_group *tg)
19{
20 return 0;
21}
22
23static inline struct task_group *
24autogroup_task_group(struct task_struct *p, struct task_group *tg)
25{
26 return tg;
27}
28
29#ifdef CONFIG_SCHED_DEBUG
30static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
31{
32 return 0;
33}
34#endif
35
36#endif /* CONFIG_SCHED_AUTOGROUP */
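The header above keeps every caller unconditional: with CONFIG_SCHED_AUTOGROUP disabled, autogroup_init(), autogroup_free(), task_group_is_autogroup() and autogroup_task_group() collapse into empty or identity inlines that the compiler throws away. A generic sketch of the same compile-time stub pattern, with an invented feature name:

/* Sketch of the config-stub pattern used by sched_autogroup.h: call sites
 * stay unconditional, the disabled variant compiles to nothing. */
#include <stdio.h>

/* #define CONFIG_FEATURE_X */

#ifdef CONFIG_FEATURE_X
static void feature_x_init(void) { puts("feature X enabled"); }
static int  feature_x_adjust(int v) { return v * 2; }
#else
static inline void feature_x_init(void) { }		 /* no-op stub */
static inline int  feature_x_adjust(int v) { return v; } /* identity stub */
#endif

int main(void)
{
	feature_x_init();			/* always safe to call */
	printf("%d\n", feature_x_adjust(21));	/* 21 with the stub, 42 with X */
	return 0;
}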
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 52f1a149bfb1..9d8af0b3fb64 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -79,7 +79,7 @@ unsigned long long __attribute__((weak)) sched_clock(void)
79} 79}
80EXPORT_SYMBOL_GPL(sched_clock); 80EXPORT_SYMBOL_GPL(sched_clock);
81 81
82static __read_mostly int sched_clock_running; 82__read_mostly int sched_clock_running;
83 83
84#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK 84#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
85__read_mostly int sched_clock_stable; 85__read_mostly int sched_clock_stable;
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 2e1b0d17dd9b..eb6cb8edd075 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -16,6 +16,8 @@
16#include <linux/kallsyms.h> 16#include <linux/kallsyms.h>
17#include <linux/utsname.h> 17#include <linux/utsname.h>
18 18
19static DEFINE_SPINLOCK(sched_debug_lock);
20
19/* 21/*
20 * This allows printing both to /proc/sched_debug and 22 * This allows printing both to /proc/sched_debug and
21 * to the console 23 * to the console
@@ -54,8 +56,7 @@ static unsigned long nsec_low(unsigned long long nsec)
54#define SPLIT_NS(x) nsec_high(x), nsec_low(x) 56#define SPLIT_NS(x) nsec_high(x), nsec_low(x)
55 57
56#ifdef CONFIG_FAIR_GROUP_SCHED 58#ifdef CONFIG_FAIR_GROUP_SCHED
57static void print_cfs_group_stats(struct seq_file *m, int cpu, 59static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg)
58 struct task_group *tg)
59{ 60{
60 struct sched_entity *se = tg->se[cpu]; 61 struct sched_entity *se = tg->se[cpu];
61 if (!se) 62 if (!se)
@@ -87,6 +88,26 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu,
87} 88}
88#endif 89#endif
89 90
91#ifdef CONFIG_CGROUP_SCHED
92static char group_path[PATH_MAX];
93
94static char *task_group_path(struct task_group *tg)
95{
96 if (autogroup_path(tg, group_path, PATH_MAX))
97 return group_path;
98
99 /*
100 * May be NULL if the underlying cgroup isn't fully-created yet
101 */
102 if (!tg->css.cgroup) {
103 group_path[0] = '\0';
104 return group_path;
105 }
106 cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
107 return group_path;
108}
109#endif
110
90static void 111static void
91print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) 112print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
92{ 113{
@@ -109,17 +130,10 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
109 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", 130 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
110 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); 131 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
111#endif 132#endif
112
113#ifdef CONFIG_CGROUP_SCHED 133#ifdef CONFIG_CGROUP_SCHED
114 { 134 SEQ_printf(m, " %s", task_group_path(task_group(p)));
115 char path[64];
116
117 rcu_read_lock();
118 cgroup_path(task_group(p)->css.cgroup, path, sizeof(path));
119 rcu_read_unlock();
120 SEQ_printf(m, " %s", path);
121 }
122#endif 135#endif
136
123 SEQ_printf(m, "\n"); 137 SEQ_printf(m, "\n");
124} 138}
125 139
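The new task_group_path() formats into a single static group_path[] buffer, which is presumably why print_cpu() later in this patch wraps its printing in the new sched_debug_lock. A minimal userspace sketch of the same "one static scratch buffer, callers serialized by a lock" arrangement; the names are invented:

/* Sketch: one shared static scratch buffer, made safe by serializing the
 * callers, as sched_debug does with sched_debug_lock. */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t debug_lock = PTHREAD_MUTEX_INITIALIZER;
static char group_path[256];		/* shared scratch buffer */

static const char *format_path(const char *name)
{
	snprintf(group_path, sizeof(group_path), "/autogroup-%s", name);
	return group_path;		/* valid only while debug_lock is held */
}

static void print_group(const char *name)
{
	pthread_mutex_lock(&debug_lock);
	printf("group: %s\n", format_path(name));
	pthread_mutex_unlock(&debug_lock);
}

int main(void)
{
	print_group("42");
	print_group("43");
	return 0;
}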
@@ -147,19 +161,6 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
147 read_unlock_irqrestore(&tasklist_lock, flags); 161 read_unlock_irqrestore(&tasklist_lock, flags);
148} 162}
149 163
150#if defined(CONFIG_CGROUP_SCHED) && \
151 (defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED))
152static void task_group_path(struct task_group *tg, char *buf, int buflen)
153{
154 /* may be NULL if the underlying cgroup isn't fully-created yet */
155 if (!tg->css.cgroup) {
156 buf[0] = '\0';
157 return;
158 }
159 cgroup_path(tg->css.cgroup, buf, buflen);
160}
161#endif
162
163void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) 164void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
164{ 165{
165 s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, 166 s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
@@ -168,13 +169,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
168 struct sched_entity *last; 169 struct sched_entity *last;
169 unsigned long flags; 170 unsigned long flags;
170 171
171#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED) 172#ifdef CONFIG_FAIR_GROUP_SCHED
172 char path[128]; 173 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg));
173 struct task_group *tg = cfs_rq->tg;
174
175 task_group_path(tg, path, sizeof(path));
176
177 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
178#else 174#else
179 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); 175 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
180#endif 176#endif
@@ -202,33 +198,34 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
202 spread0 = min_vruntime - rq0_min_vruntime; 198 spread0 = min_vruntime - rq0_min_vruntime;
203 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0", 199 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0",
204 SPLIT_NS(spread0)); 200 SPLIT_NS(spread0));
205 SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
206 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
207
208 SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", 201 SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
209 cfs_rq->nr_spread_over); 202 cfs_rq->nr_spread_over);
203 SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
204 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
210#ifdef CONFIG_FAIR_GROUP_SCHED 205#ifdef CONFIG_FAIR_GROUP_SCHED
211#ifdef CONFIG_SMP 206#ifdef CONFIG_SMP
212 SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares); 207 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_avg",
208 SPLIT_NS(cfs_rq->load_avg));
209 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_period",
210 SPLIT_NS(cfs_rq->load_period));
211 SEQ_printf(m, " .%-30s: %ld\n", "load_contrib",
212 cfs_rq->load_contribution);
213 SEQ_printf(m, " .%-30s: %d\n", "load_tg",
214 atomic_read(&cfs_rq->tg->load_weight));
213#endif 215#endif
216
214 print_cfs_group_stats(m, cpu, cfs_rq->tg); 217 print_cfs_group_stats(m, cpu, cfs_rq->tg);
215#endif 218#endif
216} 219}
217 220
218void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) 221void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
219{ 222{
220#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED) 223#ifdef CONFIG_RT_GROUP_SCHED
221 char path[128]; 224 SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg));
222 struct task_group *tg = rt_rq->tg;
223
224 task_group_path(tg, path, sizeof(path));
225
226 SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path);
227#else 225#else
228 SEQ_printf(m, "\nrt_rq[%d]:\n", cpu); 226 SEQ_printf(m, "\nrt_rq[%d]:\n", cpu);
229#endif 227#endif
230 228
231
232#define P(x) \ 229#define P(x) \
233 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x)) 230 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x))
234#define PN(x) \ 231#define PN(x) \
@@ -243,9 +240,12 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
243#undef P 240#undef P
244} 241}
245 242
243extern __read_mostly int sched_clock_running;
244
246static void print_cpu(struct seq_file *m, int cpu) 245static void print_cpu(struct seq_file *m, int cpu)
247{ 246{
248 struct rq *rq = cpu_rq(cpu); 247 struct rq *rq = cpu_rq(cpu);
248 unsigned long flags;
249 249
250#ifdef CONFIG_X86 250#ifdef CONFIG_X86
251 { 251 {
@@ -296,14 +296,20 @@ static void print_cpu(struct seq_file *m, int cpu)
296 P(ttwu_count); 296 P(ttwu_count);
297 P(ttwu_local); 297 P(ttwu_local);
298 298
299 P(bkl_count); 299 SEQ_printf(m, " .%-30s: %d\n", "bkl_count",
300 rq->rq_sched_info.bkl_count);
300 301
301#undef P 302#undef P
303#undef P64
302#endif 304#endif
305 spin_lock_irqsave(&sched_debug_lock, flags);
303 print_cfs_stats(m, cpu); 306 print_cfs_stats(m, cpu);
304 print_rt_stats(m, cpu); 307 print_rt_stats(m, cpu);
305 308
309 rcu_read_lock();
306 print_rq(m, rq, cpu); 310 print_rq(m, rq, cpu);
311 rcu_read_unlock();
312 spin_unlock_irqrestore(&sched_debug_lock, flags);
307} 313}
308 314
309static const char *sched_tunable_scaling_names[] = { 315static const char *sched_tunable_scaling_names[] = {
@@ -314,21 +320,42 @@ static const char *sched_tunable_scaling_names[] = {
314 320
315static int sched_debug_show(struct seq_file *m, void *v) 321static int sched_debug_show(struct seq_file *m, void *v)
316{ 322{
317 u64 now = ktime_to_ns(ktime_get()); 323 u64 ktime, sched_clk, cpu_clk;
324 unsigned long flags;
318 int cpu; 325 int cpu;
319 326
320 SEQ_printf(m, "Sched Debug Version: v0.09, %s %.*s\n", 327 local_irq_save(flags);
328 ktime = ktime_to_ns(ktime_get());
329 sched_clk = sched_clock();
330 cpu_clk = local_clock();
331 local_irq_restore(flags);
332
333 SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n",
321 init_utsname()->release, 334 init_utsname()->release,
322 (int)strcspn(init_utsname()->version, " "), 335 (int)strcspn(init_utsname()->version, " "),
323 init_utsname()->version); 336 init_utsname()->version);
324 337
325 SEQ_printf(m, "now at %Lu.%06ld msecs\n", SPLIT_NS(now)); 338#define P(x) \
339 SEQ_printf(m, "%-40s: %Ld\n", #x, (long long)(x))
340#define PN(x) \
341 SEQ_printf(m, "%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
342 PN(ktime);
343 PN(sched_clk);
344 PN(cpu_clk);
345 P(jiffies);
346#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
347 P(sched_clock_stable);
348#endif
349#undef PN
350#undef P
351
352 SEQ_printf(m, "\n");
353 SEQ_printf(m, "sysctl_sched\n");
326 354
327#define P(x) \ 355#define P(x) \
328 SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x)) 356 SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x))
329#define PN(x) \ 357#define PN(x) \
330 SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) 358 SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
331 P(jiffies);
332 PN(sysctl_sched_latency); 359 PN(sysctl_sched_latency);
333 PN(sysctl_sched_min_granularity); 360 PN(sysctl_sched_min_granularity);
334 PN(sysctl_sched_wakeup_granularity); 361 PN(sysctl_sched_wakeup_granularity);
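The reworked sched_debug_show() above prints each clock and sysctl through P()/PN() macros that stringify the expression (#x), so the printed label can never drift from the value being printed. A standalone sketch of the technique, with PN() splitting nanoseconds into milliseconds the way SPLIT_NS() does:

/* Sketch of the P()/PN() stringify trick used by sched_debug_show(). */
#include <stdio.h>

#define NSEC_PER_MSEC	1000000LL

#define P(x) \
	printf("%-40s: %lld\n", #x, (long long)(x))
#define PN(x) \
	printf("%-40s: %lld.%06lld\n", #x, \
	       (long long)((x) / NSEC_PER_MSEC), \
	       (long long)((x) % NSEC_PER_MSEC))

int main(void)
{
	long long ktime = 123456789012LL;	/* pretend ns timestamp */
	long long jiffies = 4294937296LL;

	PN(ktime);	/* prints "ktime ...: 123456.789012" */
	P(jiffies);
	return 0;
}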
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 00ebd7686676..0c26e2df450e 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -89,6 +89,13 @@ unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
89 89
90const_debug unsigned int sysctl_sched_migration_cost = 500000UL; 90const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
91 91
92/*
93 * The exponential sliding window over which load is averaged for shares
94 * distribution.
95 * (default: 10msec)
96 */
97unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
98
92static const struct sched_class fair_sched_class; 99static const struct sched_class fair_sched_class;
93 100
94/************************************************************** 101/**************************************************************
@@ -143,6 +150,36 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
143 return cfs_rq->tg->cfs_rq[this_cpu]; 150 return cfs_rq->tg->cfs_rq[this_cpu];
144} 151}
145 152
153static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
154{
155 if (!cfs_rq->on_list) {
156 /*
157 * Ensure we either appear before our parent (if already
158 * enqueued) or force our parent to appear after us when it is
159 * enqueued. The fact that we always enqueue bottom-up
160 * reduces this to two cases.
161 */
162 if (cfs_rq->tg->parent &&
163 cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
164 list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
165 &rq_of(cfs_rq)->leaf_cfs_rq_list);
166 } else {
167 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
168 &rq_of(cfs_rq)->leaf_cfs_rq_list);
169 }
170
171 cfs_rq->on_list = 1;
172 }
173}
174
175static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
176{
177 if (cfs_rq->on_list) {
178 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
179 cfs_rq->on_list = 0;
180 }
181}
182
146/* Iterate thr' all leaf cfs_rq's on a runqueue */ 183/* Iterate thr' all leaf cfs_rq's on a runqueue */
147#define for_each_leaf_cfs_rq(rq, cfs_rq) \ 184#define for_each_leaf_cfs_rq(rq, cfs_rq) \
148 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) 185 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
@@ -246,6 +283,14 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
246 return &cpu_rq(this_cpu)->cfs; 283 return &cpu_rq(this_cpu)->cfs;
247} 284}
248 285
286static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
287{
288}
289
290static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
291{
292}
293
249#define for_each_leaf_cfs_rq(rq, cfs_rq) \ 294#define for_each_leaf_cfs_rq(rq, cfs_rq) \
250 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) 295 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
251 296
@@ -417,7 +462,6 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
417 WRT_SYSCTL(sched_min_granularity); 462 WRT_SYSCTL(sched_min_granularity);
418 WRT_SYSCTL(sched_latency); 463 WRT_SYSCTL(sched_latency);
419 WRT_SYSCTL(sched_wakeup_granularity); 464 WRT_SYSCTL(sched_wakeup_granularity);
420 WRT_SYSCTL(sched_shares_ratelimit);
421#undef WRT_SYSCTL 465#undef WRT_SYSCTL
422 466
423 return 0; 467 return 0;
@@ -495,6 +539,9 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
495 return calc_delta_fair(sched_slice(cfs_rq, se), se); 539 return calc_delta_fair(sched_slice(cfs_rq, se), se);
496} 540}
497 541
542static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
543static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta);
544
498/* 545/*
499 * Update the current task's runtime statistics. Skip current tasks that 546 * Update the current task's runtime statistics. Skip current tasks that
500 * are not in our scheduling class. 547 * are not in our scheduling class.
@@ -514,6 +561,10 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
514 561
515 curr->vruntime += delta_exec_weighted; 562 curr->vruntime += delta_exec_weighted;
516 update_min_vruntime(cfs_rq); 563 update_min_vruntime(cfs_rq);
564
565#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
566 cfs_rq->load_unacc_exec_time += delta_exec;
567#endif
517} 568}
518 569
519static void update_curr(struct cfs_rq *cfs_rq) 570static void update_curr(struct cfs_rq *cfs_rq)
@@ -633,7 +684,6 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
633 list_add(&se->group_node, &cfs_rq->tasks); 684 list_add(&se->group_node, &cfs_rq->tasks);
634 } 685 }
635 cfs_rq->nr_running++; 686 cfs_rq->nr_running++;
636 se->on_rq = 1;
637} 687}
638 688
639static void 689static void
@@ -647,9 +697,165 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
647 list_del_init(&se->group_node); 697 list_del_init(&se->group_node);
648 } 698 }
649 cfs_rq->nr_running--; 699 cfs_rq->nr_running--;
650 se->on_rq = 0;
651} 700}
652 701
702#ifdef CONFIG_FAIR_GROUP_SCHED
703# ifdef CONFIG_SMP
704static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
705 int global_update)
706{
707 struct task_group *tg = cfs_rq->tg;
708 long load_avg;
709
710 load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
711 load_avg -= cfs_rq->load_contribution;
712
713 if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) {
714 atomic_add(load_avg, &tg->load_weight);
715 cfs_rq->load_contribution += load_avg;
716 }
717}
718
719static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
720{
721 u64 period = sysctl_sched_shares_window;
722 u64 now, delta;
723 unsigned long load = cfs_rq->load.weight;
724
725 if (cfs_rq->tg == &root_task_group)
726 return;
727
728 now = rq_of(cfs_rq)->clock_task;
729 delta = now - cfs_rq->load_stamp;
730
731 /* truncate load history at 4 idle periods */
732 if (cfs_rq->load_stamp > cfs_rq->load_last &&
733 now - cfs_rq->load_last > 4 * period) {
734 cfs_rq->load_period = 0;
735 cfs_rq->load_avg = 0;
736 }
737
738 cfs_rq->load_stamp = now;
739 cfs_rq->load_unacc_exec_time = 0;
740 cfs_rq->load_period += delta;
741 if (load) {
742 cfs_rq->load_last = now;
743 cfs_rq->load_avg += delta * load;
744 }
745
746 /* consider updating load contribution on each fold or truncate */
747 if (global_update || cfs_rq->load_period > period
748 || !cfs_rq->load_period)
749 update_cfs_rq_load_contribution(cfs_rq, global_update);
750
751 while (cfs_rq->load_period > period) {
752 /*
753 * Inline assembly required to prevent the compiler
754 * optimising this loop into a divmod call.
755 * See __iter_div_u64_rem() for another example of this.
756 */
757 asm("" : "+rm" (cfs_rq->load_period));
758 cfs_rq->load_period /= 2;
759 cfs_rq->load_avg /= 2;
760 }
761
762 if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg)
763 list_del_leaf_cfs_rq(cfs_rq);
764}
765
766static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg,
767 long weight_delta)
768{
769 long load_weight, load, shares;
770
771 load = cfs_rq->load.weight + weight_delta;
772
773 load_weight = atomic_read(&tg->load_weight);
774 load_weight -= cfs_rq->load_contribution;
775 load_weight += load;
776
777 shares = (tg->shares * load);
778 if (load_weight)
779 shares /= load_weight;
780
781 if (shares < MIN_SHARES)
782 shares = MIN_SHARES;
783 if (shares > tg->shares)
784 shares = tg->shares;
785
786 return shares;
787}
788
789static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
790{
791 if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
792 update_cfs_load(cfs_rq, 0);
793 update_cfs_shares(cfs_rq, 0);
794 }
795}
796# else /* CONFIG_SMP */
797static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
798{
799}
800
801static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg,
802 long weight_delta)
803{
804 return tg->shares;
805}
806
807static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
808{
809}
810# endif /* CONFIG_SMP */
811static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
812 unsigned long weight)
813{
814 if (se->on_rq) {
815 /* commit outstanding execution time */
816 if (cfs_rq->curr == se)
817 update_curr(cfs_rq);
818 account_entity_dequeue(cfs_rq, se);
819 }
820
821 update_load_set(&se->load, weight);
822
823 if (se->on_rq)
824 account_entity_enqueue(cfs_rq, se);
825}
826
827static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
828{
829 struct task_group *tg;
830 struct sched_entity *se;
831 long shares;
832
833 tg = cfs_rq->tg;
834 se = tg->se[cpu_of(rq_of(cfs_rq))];
835 if (!se)
836 return;
837#ifndef CONFIG_SMP
838 if (likely(se->load.weight == tg->shares))
839 return;
840#endif
841 shares = calc_cfs_shares(cfs_rq, tg, weight_delta);
842
843 reweight_entity(cfs_rq_of(se), se, shares);
844}
845#else /* CONFIG_FAIR_GROUP_SCHED */
846static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
847{
848}
849
850static inline void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
851{
852}
853
854static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
855{
856}
857#endif /* CONFIG_FAIR_GROUP_SCHED */
858
653static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) 859static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
654{ 860{
655#ifdef CONFIG_SCHEDSTATS 861#ifdef CONFIG_SCHEDSTATS
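Two pieces of arithmetic above check out easily in isolation: update_cfs_load() folds its running average by halving load_period and load_avg together once the accumulated period exceeds the shares window, and calc_cfs_shares() scales tg->shares by this runqueue's portion of the group load, clamped to [MIN_SHARES, tg->shares]. A simplified sketch of both, assuming MIN_SHARES is 2 as in this era's kernel/sched.c and ignoring the load_contribution bookkeeping:

/* Sketch: (1) fold the load-average window by halving past the shares
 * window, (2) scale group shares by the local load fraction, clamped. */
#include <stdio.h>

#define MIN_SHARES	2
#define SHARES_WINDOW	10000000ULL		/* 10ms in ns, the default */

static void fold_window(unsigned long long *load_period,
			unsigned long long *load_avg)
{
	while (*load_period > SHARES_WINDOW) {
		*load_period /= 2;		/* decay both halves together */
		*load_avg /= 2;
	}
}

static long calc_shares(long tg_shares, long cfs_load, long total_load)
{
	long shares = tg_shares;

	if (total_load)
		shares = tg_shares * cfs_load / total_load;
	if (shares < MIN_SHARES)
		shares = MIN_SHARES;
	if (shares > tg_shares)
		shares = tg_shares;
	return shares;
}

int main(void)
{
	unsigned long long period = 45000000ULL, avg = 8ULL << 20;

	fold_window(&period, &avg);
	printf("period=%llu avg=%llu\n", period, avg);

	/* 1024-share group, this cpu holds 1/4 of the group's load */
	printf("shares=%ld\n", calc_shares(1024, 512, 2048));	/* 256 */
	return 0;
}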
@@ -771,6 +977,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
771 * Update run-time statistics of the 'current'. 977 * Update run-time statistics of the 'current'.
772 */ 978 */
773 update_curr(cfs_rq); 979 update_curr(cfs_rq);
980 update_cfs_load(cfs_rq, 0);
981 update_cfs_shares(cfs_rq, se->load.weight);
774 account_entity_enqueue(cfs_rq, se); 982 account_entity_enqueue(cfs_rq, se);
775 983
776 if (flags & ENQUEUE_WAKEUP) { 984 if (flags & ENQUEUE_WAKEUP) {
@@ -782,6 +990,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
782 check_spread(cfs_rq, se); 990 check_spread(cfs_rq, se);
783 if (se != cfs_rq->curr) 991 if (se != cfs_rq->curr)
784 __enqueue_entity(cfs_rq, se); 992 __enqueue_entity(cfs_rq, se);
993 se->on_rq = 1;
994
995 if (cfs_rq->nr_running == 1)
996 list_add_leaf_cfs_rq(cfs_rq);
785} 997}
786 998
787static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) 999static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -825,8 +1037,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
825 1037
826 if (se != cfs_rq->curr) 1038 if (se != cfs_rq->curr)
827 __dequeue_entity(cfs_rq, se); 1039 __dequeue_entity(cfs_rq, se);
1040 se->on_rq = 0;
1041 update_cfs_load(cfs_rq, 0);
828 account_entity_dequeue(cfs_rq, se); 1042 account_entity_dequeue(cfs_rq, se);
829 update_min_vruntime(cfs_rq); 1043 update_min_vruntime(cfs_rq);
1044 update_cfs_shares(cfs_rq, 0);
830 1045
831 /* 1046 /*
832 * Normalize the entity after updating the min_vruntime because the 1047 * Normalize the entity after updating the min_vruntime because the
@@ -872,6 +1087,9 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
872 struct sched_entity *se = __pick_next_entity(cfs_rq); 1087 struct sched_entity *se = __pick_next_entity(cfs_rq);
873 s64 delta = curr->vruntime - se->vruntime; 1088 s64 delta = curr->vruntime - se->vruntime;
874 1089
1090 if (delta < 0)
1091 return;
1092
875 if (delta > ideal_runtime) 1093 if (delta > ideal_runtime)
876 resched_task(rq_of(cfs_rq)->curr); 1094 resched_task(rq_of(cfs_rq)->curr);
877 } 1095 }
@@ -955,6 +1173,11 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
955 */ 1173 */
956 update_curr(cfs_rq); 1174 update_curr(cfs_rq);
957 1175
1176 /*
1177 * Update share accounting for long-running entities.
1178 */
1179 update_entity_shares_tick(cfs_rq);
1180
958#ifdef CONFIG_SCHED_HRTICK 1181#ifdef CONFIG_SCHED_HRTICK
959 /* 1182 /*
960 * queued ticks are scheduled to match the slice, so don't bother 1183 * queued ticks are scheduled to match the slice, so don't bother
@@ -1055,6 +1278,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1055 flags = ENQUEUE_WAKEUP; 1278 flags = ENQUEUE_WAKEUP;
1056 } 1279 }
1057 1280
1281 for_each_sched_entity(se) {
1282 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1283
1284 update_cfs_load(cfs_rq, 0);
1285 update_cfs_shares(cfs_rq, 0);
1286 }
1287
1058 hrtick_update(rq); 1288 hrtick_update(rq);
1059} 1289}
1060 1290
@@ -1071,12 +1301,20 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1071 for_each_sched_entity(se) { 1301 for_each_sched_entity(se) {
1072 cfs_rq = cfs_rq_of(se); 1302 cfs_rq = cfs_rq_of(se);
1073 dequeue_entity(cfs_rq, se, flags); 1303 dequeue_entity(cfs_rq, se, flags);
1304
1074 /* Don't dequeue parent if it has other entities besides us */ 1305 /* Don't dequeue parent if it has other entities besides us */
1075 if (cfs_rq->load.weight) 1306 if (cfs_rq->load.weight)
1076 break; 1307 break;
1077 flags |= DEQUEUE_SLEEP; 1308 flags |= DEQUEUE_SLEEP;
1078 } 1309 }
1079 1310
1311 for_each_sched_entity(se) {
1312 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1313
1314 update_cfs_load(cfs_rq, 0);
1315 update_cfs_shares(cfs_rq, 0);
1316 }
1317
1080 hrtick_update(rq); 1318 hrtick_update(rq);
1081} 1319}
1082 1320
@@ -1143,67 +1381,36 @@ static void task_waking_fair(struct rq *rq, struct task_struct *p)
1143 * Adding load to a group doesn't make a group heavier, but can cause movement 1381 * Adding load to a group doesn't make a group heavier, but can cause movement
1144 * of group shares between cpus. Assuming the shares were perfectly aligned one 1382 * of group shares between cpus. Assuming the shares were perfectly aligned one
1145 * can calculate the shift in shares. 1383 * can calculate the shift in shares.
1146 *
1147 * The problem is that perfectly aligning the shares is rather expensive, hence
1148 * we try to avoid doing that too often - see update_shares(), which ratelimits
1149 * this change.
1150 *
1151 * We compensate this by not only taking the current delta into account, but
1152 * also considering the delta between when the shares were last adjusted and
1153 * now.
1154 *
1155 * We still saw a performance dip, some tracing learned us that between
1156 * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased
1157 * significantly. Therefore try to bias the error in direction of failing
1158 * the affine wakeup.
1159 *
1160 */ 1384 */
1161static long effective_load(struct task_group *tg, int cpu, 1385static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
1162 long wl, long wg)
1163{ 1386{
1164 struct sched_entity *se = tg->se[cpu]; 1387 struct sched_entity *se = tg->se[cpu];
1165 1388
1166 if (!tg->parent) 1389 if (!tg->parent)
1167 return wl; 1390 return wl;
1168 1391
1169 /*
1170 * By not taking the decrease of shares on the other cpu into
1171 * account our error leans towards reducing the affine wakeups.
1172 */
1173 if (!wl && sched_feat(ASYM_EFF_LOAD))
1174 return wl;
1175
1176 for_each_sched_entity(se) { 1392 for_each_sched_entity(se) {
1177 long S, rw, s, a, b; 1393 long lw, w;
1178 long more_w;
1179
1180 /*
1181 * Instead of using this increment, also add the difference
1182 * between when the shares were last updated and now.
1183 */
1184 more_w = se->my_q->load.weight - se->my_q->rq_weight;
1185 wl += more_w;
1186 wg += more_w;
1187 1394
1188 S = se->my_q->tg->shares; 1395 tg = se->my_q->tg;
1189 s = se->my_q->shares; 1396 w = se->my_q->load.weight;
1190 rw = se->my_q->rq_weight;
1191 1397
1192 a = S*(rw + wl); 1398 /* use this cpu's instantaneous contribution */
1193 b = S*rw + s*wg; 1399 lw = atomic_read(&tg->load_weight);
1400 lw -= se->my_q->load_contribution;
1401 lw += w + wg;
1194 1402
1195 wl = s*(a-b); 1403 wl += w;
1196 1404
1197 if (likely(b)) 1405 if (lw > 0 && wl < lw)
1198 wl /= b; 1406 wl = (wl * tg->shares) / lw;
1407 else
1408 wl = tg->shares;
1199 1409
1200 /* 1410 /* zero point is MIN_SHARES */
1201 * Assume the group is already running and will 1411 if (wl < MIN_SHARES)
1202 * thus already be accounted for in the weight. 1412 wl = MIN_SHARES;
1203 * 1413 wl -= se->load.weight;
1204 * That is, moving shares between CPUs, does not
1205 * alter the group weight.
1206 */
1207 wg = 0; 1414 wg = 0;
1208 } 1415 }
1209 1416
@@ -1222,7 +1429,7 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
1222 1429
1223static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) 1430static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1224{ 1431{
1225 unsigned long this_load, load; 1432 s64 this_load, load;
1226 int idx, this_cpu, prev_cpu; 1433 int idx, this_cpu, prev_cpu;
1227 unsigned long tl_per_task; 1434 unsigned long tl_per_task;
1228 struct task_group *tg; 1435 struct task_group *tg;
@@ -1261,8 +1468,8 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1261 * Otherwise check if either cpus are near enough in load to allow this 1468 * Otherwise check if either cpus are near enough in load to allow this
1262 * task to be woken on this_cpu. 1469 * task to be woken on this_cpu.
1263 */ 1470 */
1264 if (this_load) { 1471 if (this_load > 0) {
1265 unsigned long this_eff_load, prev_eff_load; 1472 s64 this_eff_load, prev_eff_load;
1266 1473
1267 this_eff_load = 100; 1474 this_eff_load = 100;
1268 this_eff_load *= power_of(prev_cpu); 1475 this_eff_load *= power_of(prev_cpu);
@@ -1508,23 +1715,6 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
1508 sd = tmp; 1715 sd = tmp;
1509 } 1716 }
1510 1717
1511#ifdef CONFIG_FAIR_GROUP_SCHED
1512 if (sched_feat(LB_SHARES_UPDATE)) {
1513 /*
1514 * Pick the largest domain to update shares over
1515 */
1516 tmp = sd;
1517 if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight))
1518 tmp = affine_sd;
1519
1520 if (tmp) {
1521 raw_spin_unlock(&rq->lock);
1522 update_shares(tmp);
1523 raw_spin_lock(&rq->lock);
1524 }
1525 }
1526#endif
1527
1528 if (affine_sd) { 1718 if (affine_sd) {
1529 if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) 1719 if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
1530 return select_idle_sibling(p, cpu); 1720 return select_idle_sibling(p, cpu);
@@ -1909,6 +2099,48 @@ out:
1909} 2099}
1910 2100
1911#ifdef CONFIG_FAIR_GROUP_SCHED 2101#ifdef CONFIG_FAIR_GROUP_SCHED
2102/*
2103 * update tg->load_weight by folding this cpu's load_avg
2104 */
2105static int update_shares_cpu(struct task_group *tg, int cpu)
2106{
2107 struct cfs_rq *cfs_rq;
2108 unsigned long flags;
2109 struct rq *rq;
2110
2111 if (!tg->se[cpu])
2112 return 0;
2113
2114 rq = cpu_rq(cpu);
2115 cfs_rq = tg->cfs_rq[cpu];
2116
2117 raw_spin_lock_irqsave(&rq->lock, flags);
2118
2119 update_rq_clock(rq);
2120 update_cfs_load(cfs_rq, 1);
2121
2122 /*
2123 * We need to update shares after updating tg->load_weight in
2124 * order to adjust the weight of groups with long running tasks.
2125 */
2126 update_cfs_shares(cfs_rq, 0);
2127
2128 raw_spin_unlock_irqrestore(&rq->lock, flags);
2129
2130 return 0;
2131}
2132
2133static void update_shares(int cpu)
2134{
2135 struct cfs_rq *cfs_rq;
2136 struct rq *rq = cpu_rq(cpu);
2137
2138 rcu_read_lock();
2139 for_each_leaf_cfs_rq(rq, cfs_rq)
2140 update_shares_cpu(cfs_rq->tg, cpu);
2141 rcu_read_unlock();
2142}
2143
1912static unsigned long 2144static unsigned long
1913load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 2145load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1914 unsigned long max_load_move, 2146 unsigned long max_load_move,
@@ -1956,6 +2188,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1956 return max_load_move - rem_load_move; 2188 return max_load_move - rem_load_move;
1957} 2189}
1958#else 2190#else
2191static inline void update_shares(int cpu)
2192{
2193}
2194
1959static unsigned long 2195static unsigned long
1960load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 2196load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1961 unsigned long max_load_move, 2197 unsigned long max_load_move,
@@ -3032,7 +3268,6 @@ static int load_balance(int this_cpu, struct rq *this_rq,
3032 schedstat_inc(sd, lb_count[idle]); 3268 schedstat_inc(sd, lb_count[idle]);
3033 3269
3034redo: 3270redo:
3035 update_shares(sd);
3036 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, 3271 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
3037 cpus, balance); 3272 cpus, balance);
3038 3273
@@ -3174,8 +3409,6 @@ out_one_pinned:
3174 else 3409 else
3175 ld_moved = 0; 3410 ld_moved = 0;
3176out: 3411out:
3177 if (ld_moved)
3178 update_shares(sd);
3179 return ld_moved; 3412 return ld_moved;
3180} 3413}
3181 3414
@@ -3199,6 +3432,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3199 */ 3432 */
3200 raw_spin_unlock(&this_rq->lock); 3433 raw_spin_unlock(&this_rq->lock);
3201 3434
3435 update_shares(this_cpu);
3202 for_each_domain(this_cpu, sd) { 3436 for_each_domain(this_cpu, sd) {
3203 unsigned long interval; 3437 unsigned long interval;
3204 int balance = 1; 3438 int balance = 1;
@@ -3569,6 +3803,8 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3569 int update_next_balance = 0; 3803 int update_next_balance = 0;
3570 int need_serialize; 3804 int need_serialize;
3571 3805
3806 update_shares(cpu);
3807
3572 for_each_domain(cpu, sd) { 3808 for_each_domain(cpu, sd) {
3573 if (!(sd->flags & SD_LOAD_BALANCE)) 3809 if (!(sd->flags & SD_LOAD_BALANCE))
3574 continue; 3810 continue;
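
The reworked effective_load() above rescales the weight delta against the group's instantaneous load (tg->load_weight minus this cpu's stale contribution, plus the incoming weight) instead of the old S*(rw+wl)/(S*rw+s*wg) estimate. A minimal user-space sketch of one level of that arithmetic follows; the function, its parameters and the numbers in main() are invented for illustration and are not kernel interfaces.

/*
 * Standalone sketch of the scaling step the new effective_load() performs
 * for one sched_entity level; all values are illustrative, not kernel data.
 */
#include <stdio.h>

#define MIN_SHARES 2L

/*
 * wl:        weight we are about to add on this cpu
 * wg:        weight being removed elsewhere (0 after the first level)
 * w:         this cpu's cfs_rq weight
 * contrib:   this cpu's contribution already folded into tg_load
 * tg_load:   group-wide load sum (atomic_read(&tg->load_weight) in the kernel)
 * tg_shares: the group's configured shares
 * se_weight: the entity's current weight, subtracted as the zero point
 */
static long effective_delta(long wl, long wg, long w, long contrib,
			    long tg_load, long tg_shares, long se_weight)
{
	long lw = tg_load - contrib + w + wg;	/* instantaneous group load */

	wl += w;
	if (lw > 0 && wl < lw)
		wl = (wl * tg_shares) / lw;	/* rescale into share units */
	else
		wl = tg_shares;

	if (wl < MIN_SHARES)			/* zero point is MIN_SHARES */
		wl = MIN_SHARES;

	return wl - se_weight;
}

int main(void)
{
	/* adding 1024 of weight to a cpu holding 2048 of a 4096-wide group */
	printf("delta = %ld\n",
	       effective_delta(1024, 0, 2048, 2048, 4096, 1024, 512));
	return 0;
}
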
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 185f920ec1a2..68e69acc29b9 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -52,8 +52,6 @@ SCHED_FEAT(ARCH_POWER, 0)
52SCHED_FEAT(HRTICK, 0) 52SCHED_FEAT(HRTICK, 0)
53SCHED_FEAT(DOUBLE_TICK, 0) 53SCHED_FEAT(DOUBLE_TICK, 0)
54SCHED_FEAT(LB_BIAS, 1) 54SCHED_FEAT(LB_BIAS, 1)
55SCHED_FEAT(LB_SHARES_UPDATE, 1)
56SCHED_FEAT(ASYM_EFF_LOAD, 1)
57 55
58/* 56/*
59 * Spin-wait on mutex acquisition when the mutex owner is running on 57 * Spin-wait on mutex acquisition when the mutex owner is running on
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index bea7d79f7e9c..ad6267714c84 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -183,6 +183,17 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
183 return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period); 183 return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
184} 184}
185 185
186static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
187{
188 list_add_rcu(&rt_rq->leaf_rt_rq_list,
189 &rq_of_rt_rq(rt_rq)->leaf_rt_rq_list);
190}
191
192static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
193{
194 list_del_rcu(&rt_rq->leaf_rt_rq_list);
195}
196
186#define for_each_leaf_rt_rq(rt_rq, rq) \ 197#define for_each_leaf_rt_rq(rt_rq, rq) \
187 list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) 198 list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
188 199
@@ -276,6 +287,14 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
276 return ktime_to_ns(def_rt_bandwidth.rt_period); 287 return ktime_to_ns(def_rt_bandwidth.rt_period);
277} 288}
278 289
290static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
291{
292}
293
294static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
295{
296}
297
279#define for_each_leaf_rt_rq(rt_rq, rq) \ 298#define for_each_leaf_rt_rq(rt_rq, rq) \
280 for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL) 299 for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
281 300
@@ -606,7 +625,7 @@ static void update_curr_rt(struct rq *rq)
606 struct rt_rq *rt_rq = rt_rq_of_se(rt_se); 625 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
607 u64 delta_exec; 626 u64 delta_exec;
608 627
609 if (!task_has_rt_policy(curr)) 628 if (curr->sched_class != &rt_sched_class)
610 return; 629 return;
611 630
612 delta_exec = rq->clock_task - curr->se.exec_start; 631 delta_exec = rq->clock_task - curr->se.exec_start;
@@ -825,6 +844,9 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
825 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) 844 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
826 return; 845 return;
827 846
847 if (!rt_rq->rt_nr_running)
848 list_add_leaf_rt_rq(rt_rq);
849
828 if (head) 850 if (head)
829 list_add(&rt_se->run_list, queue); 851 list_add(&rt_se->run_list, queue);
830 else 852 else
@@ -844,6 +866,8 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
844 __clear_bit(rt_se_prio(rt_se), array->bitmap); 866 __clear_bit(rt_se_prio(rt_se), array->bitmap);
845 867
846 dec_rt_tasks(rt_se, rt_rq); 868 dec_rt_tasks(rt_se, rt_rq);
869 if (!rt_rq->rt_nr_running)
870 list_del_leaf_rt_rq(rt_rq);
847} 871}
848 872
849/* 873/*
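
The enqueue/dequeue hunks above keep an rt_rq on the per-runqueue leaf list only while it actually has tasks queued: it is added on the transition to non-empty and removed on the transition back to empty. A toy user-space sketch of that invariant, with stand-in types rather than the kernel's list machinery:

/*
 * "On the leaf list only while non-empty": the first enqueue adds the queue,
 * the last dequeue removes it. Types and names are stand-ins for this sketch.
 */
#include <stdio.h>

struct rt_rq_stub {
	int nr_running;
	int on_leaf_list;
};

static void enqueue(struct rt_rq_stub *rq)
{
	if (!rq->nr_running)		/* becoming non-empty: add to leaf list */
		rq->on_leaf_list = 1;
	rq->nr_running++;
}

static void dequeue(struct rt_rq_stub *rq)
{
	rq->nr_running--;
	if (!rq->nr_running)		/* became empty: drop from leaf list */
		rq->on_leaf_list = 0;
}

int main(void)
{
	struct rt_rq_stub rq = { 0, 0 };

	enqueue(&rq);
	enqueue(&rq);
	dequeue(&rq);
	printf("running=%d on_list=%d\n", rq.nr_running, rq.on_leaf_list);
	dequeue(&rq);
	printf("running=%d on_list=%d\n", rq.nr_running, rq.on_leaf_list);
	return 0;
}
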
diff --git a/kernel/smp.c b/kernel/smp.c
index 12ed8b013e2d..9910744f0856 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -13,6 +13,7 @@
13#include <linux/smp.h> 13#include <linux/smp.h>
14#include <linux/cpu.h> 14#include <linux/cpu.h>
15 15
16#ifdef CONFIG_USE_GENERIC_SMP_HELPERS
16static struct { 17static struct {
17 struct list_head queue; 18 struct list_head queue;
18 raw_spinlock_t lock; 19 raw_spinlock_t lock;
@@ -193,23 +194,52 @@ void generic_smp_call_function_interrupt(void)
193 */ 194 */
194 list_for_each_entry_rcu(data, &call_function.queue, csd.list) { 195 list_for_each_entry_rcu(data, &call_function.queue, csd.list) {
195 int refs; 196 int refs;
197 void (*func) (void *info);
196 198
197 if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) 199 /*
200 * Since we walk the list without any locks, we might
201 * see an entry that was completed, removed from the
202 * list and is in the process of being reused.
203 *
204 * We must check that the cpu is in the cpumask before
205 * checking the refs, and both must be set before
206 * executing the callback on this cpu.
207 */
208
209 if (!cpumask_test_cpu(cpu, data->cpumask))
210 continue;
211
212 smp_rmb();
213
214 if (atomic_read(&data->refs) == 0)
198 continue; 215 continue;
199 216
217 func = data->csd.func; /* for later warn */
200 data->csd.func(data->csd.info); 218 data->csd.func(data->csd.info);
201 219
220 /*
 221 * If the cpu mask is no longer set, the called function enabled interrupts,
222 * we took another smp interrupt, and executed the function
223 * twice on this cpu. In theory that copy decremented refs.
224 */
225 if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) {
226 WARN(1, "%pS enabled interrupts and double executed\n",
227 func);
228 continue;
229 }
230
202 refs = atomic_dec_return(&data->refs); 231 refs = atomic_dec_return(&data->refs);
203 WARN_ON(refs < 0); 232 WARN_ON(refs < 0);
204 if (!refs) {
205 raw_spin_lock(&call_function.lock);
206 list_del_rcu(&data->csd.list);
207 raw_spin_unlock(&call_function.lock);
208 }
209 233
210 if (refs) 234 if (refs)
211 continue; 235 continue;
212 236
237 WARN_ON(!cpumask_empty(data->cpumask));
238
239 raw_spin_lock(&call_function.lock);
240 list_del_rcu(&data->csd.list);
241 raw_spin_unlock(&call_function.lock);
242
213 csd_unlock(&data->csd); 243 csd_unlock(&data->csd);
214 } 244 }
215 245
@@ -429,7 +459,7 @@ void smp_call_function_many(const struct cpumask *mask,
429 * can't happen. 459 * can't happen.
430 */ 460 */
431 WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() 461 WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
432 && !oops_in_progress); 462 && !oops_in_progress && !early_boot_irqs_disabled);
433 463
434 /* So, what's a CPU they want? Ignoring this one. */ 464 /* So, what's a CPU they want? Ignoring this one. */
435 cpu = cpumask_first_and(mask, cpu_online_mask); 465 cpu = cpumask_first_and(mask, cpu_online_mask);
@@ -453,11 +483,21 @@ void smp_call_function_many(const struct cpumask *mask,
453 483
454 data = &__get_cpu_var(cfd_data); 484 data = &__get_cpu_var(cfd_data);
455 csd_lock(&data->csd); 485 csd_lock(&data->csd);
486 BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask));
456 487
457 data->csd.func = func; 488 data->csd.func = func;
458 data->csd.info = info; 489 data->csd.info = info;
459 cpumask_and(data->cpumask, mask, cpu_online_mask); 490 cpumask_and(data->cpumask, mask, cpu_online_mask);
460 cpumask_clear_cpu(this_cpu, data->cpumask); 491 cpumask_clear_cpu(this_cpu, data->cpumask);
492
493 /*
 494 * To ensure the interrupt handler gets a complete view
495 * we order the cpumask and refs writes and order the read
496 * of them in the interrupt handler. In addition we may
497 * only clear our own cpu bit from the mask.
498 */
499 smp_wmb();
500
461 atomic_set(&data->refs, cpumask_weight(data->cpumask)); 501 atomic_set(&data->refs, cpumask_weight(data->cpumask));
462 502
463 raw_spin_lock_irqsave(&call_function.lock, flags); 503 raw_spin_lock_irqsave(&call_function.lock, flags);
@@ -529,3 +569,24 @@ void ipi_call_unlock_irq(void)
529{ 569{
530 raw_spin_unlock_irq(&call_function.lock); 570 raw_spin_unlock_irq(&call_function.lock);
531} 571}
572#endif /* USE_GENERIC_SMP_HELPERS */
573
574/*
575 * Call a function on all processors. May be used during early boot while
576 * early_boot_irqs_disabled is set. Use local_irq_save/restore() instead
577 * of local_irq_disable/enable().
578 */
579int on_each_cpu(void (*func) (void *info), void *info, int wait)
580{
581 unsigned long flags;
582 int ret = 0;
583
584 preempt_disable();
585 ret = smp_call_function(func, info, wait);
586 local_irq_save(flags);
587 func(info);
588 local_irq_restore(flags);
589 preempt_enable();
590 return ret;
591}
592EXPORT_SYMBOL(on_each_cpu);
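
The generic_smp_call_function_interrupt() and smp_call_function_many() changes above rely on ordered publication: the sender fills data->cpumask, issues smp_wmb(), then sets data->refs, while the handler tests the cpumask, issues smp_rmb(), then reads refs. Below is a rough, single-threaded user-space analogue of that pairing using C11 fences; every name in it is invented for the sketch and it only demonstrates the structure, not the concurrency.

/*
 * Analogue of the cpumask/refs ordering: publish the mask, release fence,
 * then publish refs; the consumer checks the mask, acquire fence, then reads
 * refs before running the callback. Names and types are invented.
 */
#include <stdatomic.h>
#include <stdio.h>

static atomic_int mask_bit;	/* stands in for data->cpumask */
static atomic_int refs;		/* stands in for data->refs */

static void sender(void)
{
	atomic_store_explicit(&mask_bit, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_release);	/* like smp_wmb() */
	atomic_store_explicit(&refs, 1, memory_order_relaxed);
}

static int handler(void)
{
	if (!atomic_load_explicit(&mask_bit, memory_order_relaxed))
		return 0;				/* not for us */
	atomic_thread_fence(memory_order_acquire);	/* like smp_rmb() */
	if (atomic_load_explicit(&refs, memory_order_relaxed) == 0)
		return 0;				/* entry being reused */
	return 1;					/* safe to run callback */
}

int main(void)
{
	sender();
	printf("run callback: %d\n", handler());
	return 0;
}
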
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 18f4be0d5fe0..68eb5efec388 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -70,7 +70,7 @@ char *softirq_to_name[NR_SOFTIRQS] = {
70static void wakeup_softirqd(void) 70static void wakeup_softirqd(void)
71{ 71{
72 /* Interrupts are disabled: no need to stop preemption */ 72 /* Interrupts are disabled: no need to stop preemption */
73 struct task_struct *tsk = __get_cpu_var(ksoftirqd); 73 struct task_struct *tsk = __this_cpu_read(ksoftirqd);
74 74
75 if (tsk && tsk->state != TASK_RUNNING) 75 if (tsk && tsk->state != TASK_RUNNING)
76 wake_up_process(tsk); 76 wake_up_process(tsk);
@@ -388,8 +388,8 @@ void __tasklet_schedule(struct tasklet_struct *t)
388 388
389 local_irq_save(flags); 389 local_irq_save(flags);
390 t->next = NULL; 390 t->next = NULL;
391 *__get_cpu_var(tasklet_vec).tail = t; 391 *__this_cpu_read(tasklet_vec.tail) = t;
392 __get_cpu_var(tasklet_vec).tail = &(t->next); 392 __this_cpu_write(tasklet_vec.tail, &(t->next));
393 raise_softirq_irqoff(TASKLET_SOFTIRQ); 393 raise_softirq_irqoff(TASKLET_SOFTIRQ);
394 local_irq_restore(flags); 394 local_irq_restore(flags);
395} 395}
@@ -402,8 +402,8 @@ void __tasklet_hi_schedule(struct tasklet_struct *t)
402 402
403 local_irq_save(flags); 403 local_irq_save(flags);
404 t->next = NULL; 404 t->next = NULL;
405 *__get_cpu_var(tasklet_hi_vec).tail = t; 405 *__this_cpu_read(tasklet_hi_vec.tail) = t;
406 __get_cpu_var(tasklet_hi_vec).tail = &(t->next); 406 __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
407 raise_softirq_irqoff(HI_SOFTIRQ); 407 raise_softirq_irqoff(HI_SOFTIRQ);
408 local_irq_restore(flags); 408 local_irq_restore(flags);
409} 409}
@@ -414,8 +414,8 @@ void __tasklet_hi_schedule_first(struct tasklet_struct *t)
414{ 414{
415 BUG_ON(!irqs_disabled()); 415 BUG_ON(!irqs_disabled());
416 416
417 t->next = __get_cpu_var(tasklet_hi_vec).head; 417 t->next = __this_cpu_read(tasklet_hi_vec.head);
418 __get_cpu_var(tasklet_hi_vec).head = t; 418 __this_cpu_write(tasklet_hi_vec.head, t);
419 __raise_softirq_irqoff(HI_SOFTIRQ); 419 __raise_softirq_irqoff(HI_SOFTIRQ);
420} 420}
421 421
@@ -426,9 +426,9 @@ static void tasklet_action(struct softirq_action *a)
426 struct tasklet_struct *list; 426 struct tasklet_struct *list;
427 427
428 local_irq_disable(); 428 local_irq_disable();
429 list = __get_cpu_var(tasklet_vec).head; 429 list = __this_cpu_read(tasklet_vec.head);
430 __get_cpu_var(tasklet_vec).head = NULL; 430 __this_cpu_write(tasklet_vec.head, NULL);
431 __get_cpu_var(tasklet_vec).tail = &__get_cpu_var(tasklet_vec).head; 431 __this_cpu_write(tasklet_vec.tail, &__get_cpu_var(tasklet_vec).head);
432 local_irq_enable(); 432 local_irq_enable();
433 433
434 while (list) { 434 while (list) {
@@ -449,8 +449,8 @@ static void tasklet_action(struct softirq_action *a)
449 449
450 local_irq_disable(); 450 local_irq_disable();
451 t->next = NULL; 451 t->next = NULL;
452 *__get_cpu_var(tasklet_vec).tail = t; 452 *__this_cpu_read(tasklet_vec.tail) = t;
453 __get_cpu_var(tasklet_vec).tail = &(t->next); 453 __this_cpu_write(tasklet_vec.tail, &(t->next));
454 __raise_softirq_irqoff(TASKLET_SOFTIRQ); 454 __raise_softirq_irqoff(TASKLET_SOFTIRQ);
455 local_irq_enable(); 455 local_irq_enable();
456 } 456 }
@@ -461,9 +461,9 @@ static void tasklet_hi_action(struct softirq_action *a)
461 struct tasklet_struct *list; 461 struct tasklet_struct *list;
462 462
463 local_irq_disable(); 463 local_irq_disable();
464 list = __get_cpu_var(tasklet_hi_vec).head; 464 list = __this_cpu_read(tasklet_hi_vec.head);
465 __get_cpu_var(tasklet_hi_vec).head = NULL; 465 __this_cpu_write(tasklet_hi_vec.head, NULL);
466 __get_cpu_var(tasklet_hi_vec).tail = &__get_cpu_var(tasklet_hi_vec).head; 466 __this_cpu_write(tasklet_hi_vec.tail, &__get_cpu_var(tasklet_hi_vec).head);
467 local_irq_enable(); 467 local_irq_enable();
468 468
469 while (list) { 469 while (list) {
@@ -484,8 +484,8 @@ static void tasklet_hi_action(struct softirq_action *a)
484 484
485 local_irq_disable(); 485 local_irq_disable();
486 t->next = NULL; 486 t->next = NULL;
487 *__get_cpu_var(tasklet_hi_vec).tail = t; 487 *__this_cpu_read(tasklet_hi_vec.tail) = t;
488 __get_cpu_var(tasklet_hi_vec).tail = &(t->next); 488 __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
489 __raise_softirq_irqoff(HI_SOFTIRQ); 489 __raise_softirq_irqoff(HI_SOFTIRQ);
490 local_irq_enable(); 490 local_irq_enable();
491 } 491 }
@@ -802,16 +802,16 @@ static void takeover_tasklets(unsigned int cpu)
802 802
803 /* Find end, append list for that CPU. */ 803 /* Find end, append list for that CPU. */
804 if (&per_cpu(tasklet_vec, cpu).head != per_cpu(tasklet_vec, cpu).tail) { 804 if (&per_cpu(tasklet_vec, cpu).head != per_cpu(tasklet_vec, cpu).tail) {
805 *(__get_cpu_var(tasklet_vec).tail) = per_cpu(tasklet_vec, cpu).head; 805 *__this_cpu_read(tasklet_vec.tail) = per_cpu(tasklet_vec, cpu).head;
806 __get_cpu_var(tasklet_vec).tail = per_cpu(tasklet_vec, cpu).tail; 806 this_cpu_write(tasklet_vec.tail, per_cpu(tasklet_vec, cpu).tail);
807 per_cpu(tasklet_vec, cpu).head = NULL; 807 per_cpu(tasklet_vec, cpu).head = NULL;
808 per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head; 808 per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head;
809 } 809 }
810 raise_softirq_irqoff(TASKLET_SOFTIRQ); 810 raise_softirq_irqoff(TASKLET_SOFTIRQ);
811 811
812 if (&per_cpu(tasklet_hi_vec, cpu).head != per_cpu(tasklet_hi_vec, cpu).tail) { 812 if (&per_cpu(tasklet_hi_vec, cpu).head != per_cpu(tasklet_hi_vec, cpu).tail) {
813 *__get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).head; 813 *__this_cpu_read(tasklet_hi_vec.tail) = per_cpu(tasklet_hi_vec, cpu).head;
814 __get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).tail; 814 __this_cpu_write(tasklet_hi_vec.tail, per_cpu(tasklet_hi_vec, cpu).tail);
815 per_cpu(tasklet_hi_vec, cpu).head = NULL; 815 per_cpu(tasklet_hi_vec, cpu).head = NULL;
816 per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head; 816 per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head;
817 } 817 }
@@ -853,7 +853,9 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
853 cpumask_any(cpu_online_mask)); 853 cpumask_any(cpu_online_mask));
854 case CPU_DEAD: 854 case CPU_DEAD:
855 case CPU_DEAD_FROZEN: { 855 case CPU_DEAD_FROZEN: {
856 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 856 static const struct sched_param param = {
857 .sched_priority = MAX_RT_PRIO-1
858 };
857 859
858 p = per_cpu(ksoftirqd, hotcpu); 860 p = per_cpu(ksoftirqd, hotcpu);
859 per_cpu(ksoftirqd, hotcpu) = NULL; 861 per_cpu(ksoftirqd, hotcpu) = NULL;
@@ -883,25 +885,6 @@ static __init int spawn_ksoftirqd(void)
883} 885}
884early_initcall(spawn_ksoftirqd); 886early_initcall(spawn_ksoftirqd);
885 887
886#ifdef CONFIG_SMP
887/*
888 * Call a function on all processors
889 */
890int on_each_cpu(void (*func) (void *info), void *info, int wait)
891{
892 int ret = 0;
893
894 preempt_disable();
895 ret = smp_call_function(func, info, wait);
896 local_irq_disable();
897 func(info);
898 local_irq_enable();
899 preempt_enable();
900 return ret;
901}
902EXPORT_SYMBOL(on_each_cpu);
903#endif
904
905/* 888/*
906 * [ These __weak aliases are kept in a separate compilation unit, so that 889 * [ These __weak aliases are kept in a separate compilation unit, so that
907 * GCC does not inline them incorrectly. ] 890 * GCC does not inline them incorrectly. ]
diff --git a/kernel/srcu.c b/kernel/srcu.c
index c71e07500536..73ce23feaea9 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -31,6 +31,7 @@
31#include <linux/rcupdate.h> 31#include <linux/rcupdate.h>
32#include <linux/sched.h> 32#include <linux/sched.h>
33#include <linux/smp.h> 33#include <linux/smp.h>
34#include <linux/delay.h>
34#include <linux/srcu.h> 35#include <linux/srcu.h>
35 36
36static int init_srcu_struct_fields(struct srcu_struct *sp) 37static int init_srcu_struct_fields(struct srcu_struct *sp)
@@ -155,6 +156,16 @@ void __srcu_read_unlock(struct srcu_struct *sp, int idx)
155EXPORT_SYMBOL_GPL(__srcu_read_unlock); 156EXPORT_SYMBOL_GPL(__srcu_read_unlock);
156 157
157/* 158/*
159 * We use an adaptive strategy for synchronize_srcu() and especially for
160 * synchronize_srcu_expedited(). We spin for a fixed time period
161 * (defined below) to allow SRCU readers to exit their read-side critical
162 * sections. If there are still some readers after 10 microseconds,
163 * we repeatedly block for 1-millisecond time periods. This approach
164 * has done well in testing, so there is no need for a config parameter.
165 */
166#define SYNCHRONIZE_SRCU_READER_DELAY 10
167
168/*
158 * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). 169 * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
159 */ 170 */
160static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) 171static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
@@ -203,9 +214,15 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
203 * all srcu_read_lock() calls using the old counters have completed. 214 * all srcu_read_lock() calls using the old counters have completed.
204 * Their corresponding critical sections might well be still 215 * Their corresponding critical sections might well be still
205 * executing, but the srcu_read_lock() primitives themselves 216 * executing, but the srcu_read_lock() primitives themselves
206 * will have finished executing. 217 * will have finished executing. We initially give readers
218 * an arbitrarily chosen 10 microseconds to get out of their
219 * SRCU read-side critical sections, then loop waiting 1/HZ
220 * seconds per iteration. The 10-microsecond value has done
221 * very well in testing.
207 */ 222 */
208 223
224 if (srcu_readers_active_idx(sp, idx))
225 udelay(SYNCHRONIZE_SRCU_READER_DELAY);
209 while (srcu_readers_active_idx(sp, idx)) 226 while (srcu_readers_active_idx(sp, idx))
210 schedule_timeout_interruptible(1); 227 schedule_timeout_interruptible(1);
211 228
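
The __synchronize_srcu() hunk above waits adaptively: a single short delay gives readers a chance to leave their critical sections before the loop falls back to sleeping a jiffy at a time. A user-space sketch of that spin-then-sleep shape; readers_active() and both constants are placeholders, not kernel APIs, and the kernel busy-waits with udelay() where this sketch simply sleeps.

/*
 * Same shape as the SYNCHRONIZE_SRCU_READER_DELAY change: one short wait for
 * the common fast path, then a coarser polling loop for the slow path.
 */
#include <stdatomic.h>
#include <unistd.h>

#define READER_DELAY_US	10	/* analogous to SYNCHRONIZE_SRCU_READER_DELAY */
#define RETRY_SLEEP_US	1000	/* stands in for one jiffy at HZ=1000 */

static atomic_int active_readers;	/* stands in for srcu_readers_active_idx() */

static int readers_active(void)
{
	return atomic_load(&active_readers) != 0;
}

static void wait_for_readers(void)
{
	if (readers_active())
		usleep(READER_DELAY_US);	/* brief head start for readers */
	while (readers_active())
		usleep(RETRY_SLEEP_US);		/* then poll at a coarser interval */
}

int main(void)
{
	wait_for_readers();	/* no readers registered here: returns at once */
	return 0;
}
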
diff --git a/kernel/sys.c b/kernel/sys.c
index 7f5a0cd296a9..18da702ec813 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -43,6 +43,8 @@
43#include <linux/kprobes.h> 43#include <linux/kprobes.h>
44#include <linux/user_namespace.h> 44#include <linux/user_namespace.h>
45 45
46#include <linux/kmsg_dump.h>
47
46#include <asm/uaccess.h> 48#include <asm/uaccess.h>
47#include <asm/io.h> 49#include <asm/io.h>
48#include <asm/unistd.h> 50#include <asm/unistd.h>
@@ -285,6 +287,7 @@ out_unlock:
285 */ 287 */
286void emergency_restart(void) 288void emergency_restart(void)
287{ 289{
290 kmsg_dump(KMSG_DUMP_EMERG);
288 machine_emergency_restart(); 291 machine_emergency_restart();
289} 292}
290EXPORT_SYMBOL_GPL(emergency_restart); 293EXPORT_SYMBOL_GPL(emergency_restart);
@@ -312,6 +315,7 @@ void kernel_restart(char *cmd)
312 printk(KERN_EMERG "Restarting system.\n"); 315 printk(KERN_EMERG "Restarting system.\n");
313 else 316 else
314 printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd); 317 printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd);
318 kmsg_dump(KMSG_DUMP_RESTART);
315 machine_restart(cmd); 319 machine_restart(cmd);
316} 320}
317EXPORT_SYMBOL_GPL(kernel_restart); 321EXPORT_SYMBOL_GPL(kernel_restart);
@@ -333,6 +337,7 @@ void kernel_halt(void)
333 kernel_shutdown_prepare(SYSTEM_HALT); 337 kernel_shutdown_prepare(SYSTEM_HALT);
334 sysdev_shutdown(); 338 sysdev_shutdown();
335 printk(KERN_EMERG "System halted.\n"); 339 printk(KERN_EMERG "System halted.\n");
340 kmsg_dump(KMSG_DUMP_HALT);
336 machine_halt(); 341 machine_halt();
337} 342}
338 343
@@ -351,6 +356,7 @@ void kernel_power_off(void)
351 disable_nonboot_cpus(); 356 disable_nonboot_cpus();
352 sysdev_shutdown(); 357 sysdev_shutdown();
353 printk(KERN_EMERG "Power down.\n"); 358 printk(KERN_EMERG "Power down.\n");
359 kmsg_dump(KMSG_DUMP_POWEROFF);
354 machine_power_off(); 360 machine_power_off();
355} 361}
356EXPORT_SYMBOL_GPL(kernel_power_off); 362EXPORT_SYMBOL_GPL(kernel_power_off);
@@ -1080,8 +1086,10 @@ SYSCALL_DEFINE0(setsid)
1080 err = session; 1086 err = session;
1081out: 1087out:
1082 write_unlock_irq(&tasklist_lock); 1088 write_unlock_irq(&tasklist_lock);
1083 if (err > 0) 1089 if (err > 0) {
1084 proc_sid_connector(group_leader); 1090 proc_sid_connector(group_leader);
1091 sched_autogroup_create_attach(group_leader);
1092 }
1085 return err; 1093 return err;
1086} 1094}
1087 1095
@@ -1377,7 +1385,8 @@ static int check_prlimit_permission(struct task_struct *task)
1377 const struct cred *cred = current_cred(), *tcred; 1385 const struct cred *cred = current_cred(), *tcred;
1378 1386
1379 tcred = __task_cred(task); 1387 tcred = __task_cred(task);
1380 if ((cred->uid != tcred->euid || 1388 if (current != task &&
1389 (cred->uid != tcred->euid ||
1381 cred->uid != tcred->suid || 1390 cred->uid != tcred->suid ||
1382 cred->uid != tcred->uid || 1391 cred->uid != tcred->uid ||
1383 cred->gid != tcred->egid || 1392 cred->gid != tcred->egid ||
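
check_prlimit_permission() above now short-circuits when a task operates on itself and only falls through to the credential comparison for cross-task access; the rest of that comparison and any capability override lie outside the hunk shown. A simplified stand-alone sketch of the shape of the check, with stand-in structs and a boolean result instead of 0/-EPERM:

/*
 * Sketch of the relaxed prlimit permission rule: self access is always
 * allowed, cross-task access requires matching credentials (capability
 * override elided). Structs are simplified stand-ins, not kernel types.
 */
#include <stdbool.h>
#include <stdio.h>

struct cred_stub { unsigned uid, euid, suid, gid, egid, sgid; };
struct task_stub { int pid; struct cred_stub cred; };

static bool prlimit_allowed(const struct task_stub *current,
			    const struct task_stub *target)
{
	const struct cred_stub *c = &current->cred, *t = &target->cred;

	if (current == target)		/* self access is always fine */
		return true;

	return c->uid == t->euid && c->uid == t->suid && c->uid == t->uid &&
	       c->gid == t->egid && c->gid == t->sgid && c->gid == t->gid;
}

int main(void)
{
	struct task_stub self  = { 1, { 1000, 1000, 1000, 100, 100, 100 } };
	struct task_stub other = { 2, { 0, 0, 0, 0, 0, 0 } };

	printf("self: %d, other: %d\n",
	       prlimit_allowed(&self, &self), prlimit_allowed(&self, &other));
	return 0;
}
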
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 5abfa1518554..0f1bd83db985 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -24,6 +24,7 @@
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/sysctl.h> 25#include <linux/sysctl.h>
26#include <linux/signal.h> 26#include <linux/signal.h>
27#include <linux/printk.h>
27#include <linux/proc_fs.h> 28#include <linux/proc_fs.h>
28#include <linux/security.h> 29#include <linux/security.h>
29#include <linux/ctype.h> 30#include <linux/ctype.h>
@@ -169,7 +170,8 @@ static int proc_taint(struct ctl_table *table, int write,
169#endif 170#endif
170 171
171#ifdef CONFIG_MAGIC_SYSRQ 172#ifdef CONFIG_MAGIC_SYSRQ
172static int __sysrq_enabled; /* Note: sysrq code ises it's own private copy */ 173/* Note: sysrq code uses its own private copy */
174static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE;
173 175
174static int sysrq_sysctl_handler(ctl_table *table, int write, 176static int sysrq_sysctl_handler(ctl_table *table, int write,
175 void __user *buffer, size_t *lenp, 177 void __user *buffer, size_t *lenp,
@@ -245,10 +247,6 @@ static struct ctl_table root_table[] = {
245 .mode = 0555, 247 .mode = 0555,
246 .child = dev_table, 248 .child = dev_table,
247 }, 249 },
248/*
249 * NOTE: do not add new entries to this table unless you have read
250 * Documentation/sysctl/ctl_unnumbered.txt
251 */
252 { } 250 { }
253}; 251};
254 252
@@ -259,8 +257,6 @@ static int min_wakeup_granularity_ns; /* 0 usecs */
259static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ 257static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
260static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; 258static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
261static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; 259static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
262static int min_sched_shares_ratelimit = 100000; /* 100 usec */
263static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */
264#endif 260#endif
265 261
266#ifdef CONFIG_COMPACTION 262#ifdef CONFIG_COMPACTION
@@ -305,15 +301,6 @@ static struct ctl_table kern_table[] = {
305 .extra2 = &max_wakeup_granularity_ns, 301 .extra2 = &max_wakeup_granularity_ns,
306 }, 302 },
307 { 303 {
308 .procname = "sched_shares_ratelimit",
309 .data = &sysctl_sched_shares_ratelimit,
310 .maxlen = sizeof(unsigned int),
311 .mode = 0644,
312 .proc_handler = sched_proc_update_handler,
313 .extra1 = &min_sched_shares_ratelimit,
314 .extra2 = &max_sched_shares_ratelimit,
315 },
316 {
317 .procname = "sched_tunable_scaling", 304 .procname = "sched_tunable_scaling",
318 .data = &sysctl_sched_tunable_scaling, 305 .data = &sysctl_sched_tunable_scaling,
319 .maxlen = sizeof(enum sched_tunable_scaling), 306 .maxlen = sizeof(enum sched_tunable_scaling),
@@ -323,14 +310,6 @@ static struct ctl_table kern_table[] = {
323 .extra2 = &max_sched_tunable_scaling, 310 .extra2 = &max_sched_tunable_scaling,
324 }, 311 },
325 { 312 {
326 .procname = "sched_shares_thresh",
327 .data = &sysctl_sched_shares_thresh,
328 .maxlen = sizeof(unsigned int),
329 .mode = 0644,
330 .proc_handler = proc_dointvec_minmax,
331 .extra1 = &zero,
332 },
333 {
334 .procname = "sched_migration_cost", 313 .procname = "sched_migration_cost",
335 .data = &sysctl_sched_migration_cost, 314 .data = &sysctl_sched_migration_cost,
336 .maxlen = sizeof(unsigned int), 315 .maxlen = sizeof(unsigned int),
@@ -352,6 +331,13 @@ static struct ctl_table kern_table[] = {
352 .proc_handler = proc_dointvec, 331 .proc_handler = proc_dointvec,
353 }, 332 },
354 { 333 {
334 .procname = "sched_shares_window",
335 .data = &sysctl_sched_shares_window,
336 .maxlen = sizeof(unsigned int),
337 .mode = 0644,
338 .proc_handler = proc_dointvec,
339 },
340 {
355 .procname = "timer_migration", 341 .procname = "timer_migration",
356 .data = &sysctl_timer_migration, 342 .data = &sysctl_timer_migration,
357 .maxlen = sizeof(unsigned int), 343 .maxlen = sizeof(unsigned int),
@@ -382,6 +368,17 @@ static struct ctl_table kern_table[] = {
382 .mode = 0644, 368 .mode = 0644,
383 .proc_handler = proc_dointvec, 369 .proc_handler = proc_dointvec,
384 }, 370 },
371#ifdef CONFIG_SCHED_AUTOGROUP
372 {
373 .procname = "sched_autogroup_enabled",
374 .data = &sysctl_sched_autogroup_enabled,
375 .maxlen = sizeof(unsigned int),
376 .mode = 0644,
377 .proc_handler = proc_dointvec,
378 .extra1 = &zero,
379 .extra2 = &one,
380 },
381#endif
385#ifdef CONFIG_PROVE_LOCKING 382#ifdef CONFIG_PROVE_LOCKING
386 { 383 {
387 .procname = "prove_locking", 384 .procname = "prove_locking",
@@ -711,6 +708,15 @@ static struct ctl_table kern_table[] = {
711 .extra1 = &zero, 708 .extra1 = &zero,
712 .extra2 = &one, 709 .extra2 = &one,
713 }, 710 },
711 {
712 .procname = "kptr_restrict",
713 .data = &kptr_restrict,
714 .maxlen = sizeof(int),
715 .mode = 0644,
716 .proc_handler = proc_dointvec_minmax,
717 .extra1 = &zero,
718 .extra2 = &two,
719 },
714#endif 720#endif
715 { 721 {
716 .procname = "ngroups_max", 722 .procname = "ngroups_max",
@@ -745,21 +751,21 @@ static struct ctl_table kern_table[] = {
745 .extra1 = &zero, 751 .extra1 = &zero,
746 .extra2 = &one, 752 .extra2 = &one,
747 }, 753 },
748#endif
749#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) && !defined(CONFIG_LOCKUP_DETECTOR)
750 { 754 {
751 .procname = "unknown_nmi_panic", 755 .procname = "nmi_watchdog",
752 .data = &unknown_nmi_panic, 756 .data = &watchdog_enabled,
753 .maxlen = sizeof (int), 757 .maxlen = sizeof (int),
754 .mode = 0644, 758 .mode = 0644,
755 .proc_handler = proc_dointvec, 759 .proc_handler = proc_dowatchdog_enabled,
756 }, 760 },
761#endif
762#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
757 { 763 {
758 .procname = "nmi_watchdog", 764 .procname = "unknown_nmi_panic",
759 .data = &nmi_watchdog_enabled, 765 .data = &unknown_nmi_panic,
760 .maxlen = sizeof (int), 766 .maxlen = sizeof (int),
761 .mode = 0644, 767 .mode = 0644,
762 .proc_handler = proc_nmi_enabled, 768 .proc_handler = proc_dointvec,
763 }, 769 },
764#endif 770#endif
765#if defined(CONFIG_X86) 771#if defined(CONFIG_X86)
@@ -963,10 +969,6 @@ static struct ctl_table kern_table[] = {
963 .proc_handler = proc_dointvec, 969 .proc_handler = proc_dointvec,
964 }, 970 },
965#endif 971#endif
966/*
967 * NOTE: do not add new entries to this table unless you have read
968 * Documentation/sysctl/ctl_unnumbered.txt
969 */
970 { } 972 { }
971}; 973};
972 974
@@ -1327,11 +1329,6 @@ static struct ctl_table vm_table[] = {
1327 .extra2 = &one, 1329 .extra2 = &one,
1328 }, 1330 },
1329#endif 1331#endif
1330
1331/*
1332 * NOTE: do not add new entries to this table unless you have read
1333 * Documentation/sysctl/ctl_unnumbered.txt
1334 */
1335 { } 1332 { }
1336}; 1333};
1337 1334
@@ -1487,10 +1484,6 @@ static struct ctl_table fs_table[] = {
1487 .proc_handler = &pipe_proc_fn, 1484 .proc_handler = &pipe_proc_fn,
1488 .extra1 = &pipe_min_size, 1485 .extra1 = &pipe_min_size,
1489 }, 1486 },
1490/*
1491 * NOTE: do not add new entries to this table unless you have read
1492 * Documentation/sysctl/ctl_unnumbered.txt
1493 */
1494 { } 1487 { }
1495}; 1488};
1496 1489
@@ -2900,7 +2893,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
2900 } 2893 }
2901} 2894}
2902 2895
2903#else /* CONFIG_PROC_FS */ 2896#else /* CONFIG_PROC_SYSCTL */
2904 2897
2905int proc_dostring(struct ctl_table *table, int write, 2898int proc_dostring(struct ctl_table *table, int write,
2906 void __user *buffer, size_t *lenp, loff_t *ppos) 2899 void __user *buffer, size_t *lenp, loff_t *ppos)
@@ -2952,7 +2945,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
2952} 2945}
2953 2946
2954 2947
2955#endif /* CONFIG_PROC_FS */ 2948#endif /* CONFIG_PROC_SYSCTL */
2956 2949
2957/* 2950/*
2958 * No sense putting this after each symbol definition, twice, 2951 * No sense putting this after each symbol definition, twice,
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 1357c5786064..b875bedf7c9a 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -136,7 +136,6 @@ static const struct bin_table bin_kern_table[] = {
136 { CTL_INT, KERN_IA64_UNALIGNED, "ignore-unaligned-usertrap" }, 136 { CTL_INT, KERN_IA64_UNALIGNED, "ignore-unaligned-usertrap" },
137 { CTL_INT, KERN_COMPAT_LOG, "compat-log" }, 137 { CTL_INT, KERN_COMPAT_LOG, "compat-log" },
138 { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" }, 138 { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" },
139 { CTL_INT, KERN_NMI_WATCHDOG, "nmi_watchdog" },
140 { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" }, 139 { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" },
141 {} 140 {}
142}; 141};
@@ -1193,7 +1192,7 @@ static ssize_t bin_dn_node_address(struct file *file,
1193 1192
1194 buf[result] = '\0'; 1193 buf[result] = '\0';
1195 1194
1196 /* Convert the decnet addresss to binary */ 1195 /* Convert the decnet address to binary */
1197 result = -EIO; 1196 result = -EIO;
1198 nodep = strchr(buf, '.') + 1; 1197 nodep = strchr(buf, '.') + 1;
1199 if (!nodep) 1198 if (!nodep)
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 3308fd7f1b52..3971c6b9d58d 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -89,8 +89,7 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
89 return -ENOMEM; 89 return -ENOMEM;
90 90
91 if (!info) { 91 if (!info) {
92 int seq = get_cpu_var(taskstats_seqnum)++; 92 int seq = this_cpu_inc_return(taskstats_seqnum) - 1;
93 put_cpu_var(taskstats_seqnum);
94 93
95 reply = genlmsg_put(skb, 0, seq, &family, 0, cmd); 94 reply = genlmsg_put(skb, 0, seq, &family, 0, cmd);
96 } else 95 } else
@@ -349,7 +348,7 @@ static int parse(struct nlattr *na, struct cpumask *mask)
349 return ret; 348 return ret;
350} 349}
351 350
352#ifdef CONFIG_IA64 351#if defined(CONFIG_64BIT) && !defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
353#define TASKSTATS_NEEDS_PADDING 1 352#define TASKSTATS_NEEDS_PADDING 1
354#endif 353#endif
355 354
@@ -612,7 +611,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
612 fill_tgid_exit(tsk); 611 fill_tgid_exit(tsk);
613 } 612 }
614 613
615 listeners = &__raw_get_cpu_var(listener_array); 614 listeners = __this_cpu_ptr(&listener_array);
616 if (list_empty(&listeners->list)) 615 if (list_empty(&listeners->list))
617 return; 616 return;
618 617
diff --git a/kernel/time.c b/kernel/time.c
index ba9b338d1835..32174359576f 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -238,7 +238,7 @@ EXPORT_SYMBOL(current_fs_time);
238 * Avoid unnecessary multiplications/divisions in the 238 * Avoid unnecessary multiplications/divisions in the
239 * two most common HZ cases: 239 * two most common HZ cases:
240 */ 240 */
241unsigned int inline jiffies_to_msecs(const unsigned long j) 241inline unsigned int jiffies_to_msecs(const unsigned long j)
242{ 242{
243#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) 243#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
244 return (MSEC_PER_SEC / HZ) * j; 244 return (MSEC_PER_SEC / HZ) * j;
@@ -254,7 +254,7 @@ unsigned int inline jiffies_to_msecs(const unsigned long j)
254} 254}
255EXPORT_SYMBOL(jiffies_to_msecs); 255EXPORT_SYMBOL(jiffies_to_msecs);
256 256
257unsigned int inline jiffies_to_usecs(const unsigned long j) 257inline unsigned int jiffies_to_usecs(const unsigned long j)
258{ 258{
259#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) 259#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
260 return (USEC_PER_SEC / HZ) * j; 260 return (USEC_PER_SEC / HZ) * j;
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index c18d7efa1b4b..6519cf62d9cd 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -113,7 +113,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time);
113 * @shift: pointer to shift variable 113 * @shift: pointer to shift variable
114 * @from: frequency to convert from 114 * @from: frequency to convert from
115 * @to: frequency to convert to 115 * @to: frequency to convert to
116 * @minsec: guaranteed runtime conversion range in seconds 116 * @maxsec: guaranteed runtime conversion range in seconds
117 * 117 *
118 * The function evaluates the shift/mult pair for the scaled math 118 * The function evaluates the shift/mult pair for the scaled math
119 * operations of clocksources and clockevents. 119 * operations of clocksources and clockevents.
@@ -122,7 +122,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time);
122 * NSEC_PER_SEC == 1GHz and @from is the counter frequency. For clock 122 * NSEC_PER_SEC == 1GHz and @from is the counter frequency. For clock
123 * event @to is the counter frequency and @from is NSEC_PER_SEC. 123 * event @to is the counter frequency and @from is NSEC_PER_SEC.
124 * 124 *
125 * The @minsec conversion range argument controls the time frame in 125 * The @maxsec conversion range argument controls the time frame in
126 * seconds which must be covered by the runtime conversion with the 126 * seconds which must be covered by the runtime conversion with the
127 * calculated mult and shift factors. This guarantees that no 64bit 127 * calculated mult and shift factors. This guarantees that no 64bit
128 * overflow happens when the input value of the conversion is 128 * overflow happens when the input value of the conversion is
@@ -131,7 +131,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time);
131 * factors. 131 * factors.
132 */ 132 */
133void 133void
134clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec) 134clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec)
135{ 135{
136 u64 tmp; 136 u64 tmp;
137 u32 sft, sftacc= 32; 137 u32 sft, sftacc= 32;
@@ -140,7 +140,7 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec)
140 * Calculate the shift factor which is limiting the conversion 140 * Calculate the shift factor which is limiting the conversion
141 * range: 141 * range:
142 */ 142 */
143 tmp = ((u64)minsec * from) >> 32; 143 tmp = ((u64)maxsec * from) >> 32;
144 while (tmp) { 144 while (tmp) {
145 tmp >>=1; 145 tmp >>=1;
146 sftacc--; 146 sftacc--;
@@ -152,6 +152,7 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec)
152 */ 152 */
153 for (sft = 32; sft > 0; sft--) { 153 for (sft = 32; sft > 0; sft--) {
154 tmp = (u64) to << sft; 154 tmp = (u64) to << sft;
155 tmp += from / 2;
155 do_div(tmp, from); 156 do_div(tmp, from);
156 if ((tmp >> sftacc) == 0) 157 if ((tmp >> sftacc) == 0)
157 break; 158 break;
@@ -678,7 +679,7 @@ EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale);
678int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) 679int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
679{ 680{
680 681
681 /* Intialize mult/shift and max_idle_ns */ 682 /* Initialize mult/shift and max_idle_ns */
682 __clocksource_updatefreq_scale(cs, scale, freq); 683 __clocksource_updatefreq_scale(cs, scale, freq);
683 684
 684 /* Add clocksource to the clocksource list */ 685
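
The clocksource hunk above adds rounding to the mult/shift search (tmp += from / 2) and renames the range parameter to @maxsec. Below is a user-space copy of that algorithm, runnable as-is, with do_div() replaced by plain 64-bit division; the 1 MHz counter / 600 s range in main() is an arbitrary example.

/*
 * clocks_calc_mult_shift() algorithm after this patch, in user space.
 * Conversion is then ns = (cycles * mult) >> shift.
 */
#include <stdint.h>
#include <stdio.h>

static void calc_mult_shift(uint32_t *mult, uint32_t *shift,
			    uint32_t from, uint32_t to, uint32_t maxsec)
{
	uint64_t tmp;
	uint32_t sft, sftacc = 32;

	/* shift factor limiting the conversion range to maxsec seconds */
	tmp = ((uint64_t)maxsec * from) >> 32;
	while (tmp) {
		tmp >>= 1;
		sftacc--;
	}

	/* find the best-accuracy mult/shift pair that still fits */
	for (sft = 32; sft > 0; sft--) {
		tmp = (uint64_t)to << sft;
		tmp += from / 2;		/* the rounding added above */
		tmp /= from;
		if ((tmp >> sftacc) == 0)
			break;
	}
	*mult = (uint32_t)tmp;
	*shift = sft;
}

int main(void)
{
	uint32_t mult, shift;
	uint64_t cycles = 12345;

	/* 1 MHz counter to nanoseconds, valid for at least 600 seconds */
	calc_mult_shift(&mult, &shift, 1000000, 1000000000, 600);
	printf("mult=%u shift=%u -> %llu cycles = %llu ns\n", mult, shift,
	       (unsigned long long)cycles,
	       (unsigned long long)((cycles * mult) >> shift));
	return 0;
}
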
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index d2321891538f..5c00242fa921 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -14,6 +14,7 @@
14#include <linux/timex.h> 14#include <linux/timex.h>
15#include <linux/time.h> 15#include <linux/time.h>
16#include <linux/mm.h> 16#include <linux/mm.h>
17#include <linux/module.h>
17 18
18/* 19/*
19 * NTP timekeeping variables: 20 * NTP timekeeping variables:
@@ -74,6 +75,162 @@ static long time_adjust;
74/* constant (boot-param configurable) NTP tick adjustment (upscaled) */ 75/* constant (boot-param configurable) NTP tick adjustment (upscaled) */
75static s64 ntp_tick_adj; 76static s64 ntp_tick_adj;
76 77
78#ifdef CONFIG_NTP_PPS
79
80/*
81 * The following variables are used when a pulse-per-second (PPS) signal
82 * is available. They establish the engineering parameters of the clock
83 * discipline loop when controlled by the PPS signal.
84 */
85#define PPS_VALID 10 /* PPS signal watchdog max (s) */
86#define PPS_POPCORN 4 /* popcorn spike threshold (shift) */
87#define PPS_INTMIN 2 /* min freq interval (s) (shift) */
88#define PPS_INTMAX 8 /* max freq interval (s) (shift) */
89#define PPS_INTCOUNT 4 /* number of consecutive good intervals to
90 increase pps_shift or consecutive bad
91 intervals to decrease it */
92#define PPS_MAXWANDER 100000 /* max PPS freq wander (ns/s) */
93
94static int pps_valid; /* signal watchdog counter */
95static long pps_tf[3]; /* phase median filter */
96static long pps_jitter; /* current jitter (ns) */
97static struct timespec pps_fbase; /* beginning of the last freq interval */
98static int pps_shift; /* current interval duration (s) (shift) */
99static int pps_intcnt; /* interval counter */
100static s64 pps_freq; /* frequency offset (scaled ns/s) */
101static long pps_stabil; /* current stability (scaled ns/s) */
102
103/*
104 * PPS signal quality monitors
105 */
106static long pps_calcnt; /* calibration intervals */
107static long pps_jitcnt; /* jitter limit exceeded */
108static long pps_stbcnt; /* stability limit exceeded */
109static long pps_errcnt; /* calibration errors */
110
111
112/* PPS kernel consumer compensates the whole phase error immediately.
113 * Otherwise, reduce the offset by a fixed factor times the time constant.
114 */
115static inline s64 ntp_offset_chunk(s64 offset)
116{
117 if (time_status & STA_PPSTIME && time_status & STA_PPSSIGNAL)
118 return offset;
119 else
120 return shift_right(offset, SHIFT_PLL + time_constant);
121}
122
123static inline void pps_reset_freq_interval(void)
124{
125 /* the PPS calibration interval may end
126 surprisingly early */
127 pps_shift = PPS_INTMIN;
128 pps_intcnt = 0;
129}
130
131/**
132 * pps_clear - Clears the PPS state variables
133 *
134 * Must be called while holding a write on the xtime_lock
135 */
136static inline void pps_clear(void)
137{
138 pps_reset_freq_interval();
139 pps_tf[0] = 0;
140 pps_tf[1] = 0;
141 pps_tf[2] = 0;
142 pps_fbase.tv_sec = pps_fbase.tv_nsec = 0;
143 pps_freq = 0;
144}
145
146/* Decrease pps_valid to indicate that another second has passed since
147 * the last PPS signal. When it reaches 0, indicate that PPS signal is
148 * missing.
149 *
150 * Must be called while holding a write on the xtime_lock
151 */
152static inline void pps_dec_valid(void)
153{
154 if (pps_valid > 0)
155 pps_valid--;
156 else {
157 time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER |
158 STA_PPSWANDER | STA_PPSERROR);
159 pps_clear();
160 }
161}
162
163static inline void pps_set_freq(s64 freq)
164{
165 pps_freq = freq;
166}
167
168static inline int is_error_status(int status)
169{
170 return (time_status & (STA_UNSYNC|STA_CLOCKERR))
171 /* PPS signal lost when either PPS time or
172 * PPS frequency synchronization requested
173 */
174 || ((time_status & (STA_PPSFREQ|STA_PPSTIME))
175 && !(time_status & STA_PPSSIGNAL))
176 /* PPS jitter exceeded when
177 * PPS time synchronization requested */
178 || ((time_status & (STA_PPSTIME|STA_PPSJITTER))
179 == (STA_PPSTIME|STA_PPSJITTER))
180 /* PPS wander exceeded or calibration error when
181 * PPS frequency synchronization requested
182 */
183 || ((time_status & STA_PPSFREQ)
184 && (time_status & (STA_PPSWANDER|STA_PPSERROR)));
185}
186
187static inline void pps_fill_timex(struct timex *txc)
188{
189 txc->ppsfreq = shift_right((pps_freq >> PPM_SCALE_INV_SHIFT) *
190 PPM_SCALE_INV, NTP_SCALE_SHIFT);
191 txc->jitter = pps_jitter;
192 if (!(time_status & STA_NANO))
193 txc->jitter /= NSEC_PER_USEC;
194 txc->shift = pps_shift;
195 txc->stabil = pps_stabil;
196 txc->jitcnt = pps_jitcnt;
197 txc->calcnt = pps_calcnt;
198 txc->errcnt = pps_errcnt;
199 txc->stbcnt = pps_stbcnt;
200}
201
202#else /* !CONFIG_NTP_PPS */
203
204static inline s64 ntp_offset_chunk(s64 offset)
205{
206 return shift_right(offset, SHIFT_PLL + time_constant);
207}
208
209static inline void pps_reset_freq_interval(void) {}
210static inline void pps_clear(void) {}
211static inline void pps_dec_valid(void) {}
212static inline void pps_set_freq(s64 freq) {}
213
214static inline int is_error_status(int status)
215{
216 return status & (STA_UNSYNC|STA_CLOCKERR);
217}
218
219static inline void pps_fill_timex(struct timex *txc)
220{
221 /* PPS is not implemented, so these are zero */
222 txc->ppsfreq = 0;
223 txc->jitter = 0;
224 txc->shift = 0;
225 txc->stabil = 0;
226 txc->jitcnt = 0;
227 txc->calcnt = 0;
228 txc->errcnt = 0;
229 txc->stbcnt = 0;
230}
231
232#endif /* CONFIG_NTP_PPS */
233
77/* 234/*
78 * NTP methods: 235 * NTP methods:
79 */ 236 */
@@ -185,6 +342,9 @@ void ntp_clear(void)
185 342
186 tick_length = tick_length_base; 343 tick_length = tick_length_base;
187 time_offset = 0; 344 time_offset = 0;
345
346 /* Clear PPS state variables */
347 pps_clear();
188} 348}
189 349
190/* 350/*
@@ -250,16 +410,16 @@ void second_overflow(void)
250 time_status |= STA_UNSYNC; 410 time_status |= STA_UNSYNC;
251 } 411 }
252 412
253 /* 413 /* Compute the phase adjustment for the next second */
254 * Compute the phase adjustment for the next second. The offset is
255 * reduced by a fixed factor times the time constant.
256 */
257 tick_length = tick_length_base; 414 tick_length = tick_length_base;
258 415
259 delta = shift_right(time_offset, SHIFT_PLL + time_constant); 416 delta = ntp_offset_chunk(time_offset);
260 time_offset -= delta; 417 time_offset -= delta;
261 tick_length += delta; 418 tick_length += delta;
262 419
420 /* Check PPS signal */
421 pps_dec_valid();
422
263 if (!time_adjust) 423 if (!time_adjust)
264 return; 424 return;
265 425
@@ -369,6 +529,8 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts)
369 if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) { 529 if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) {
370 time_state = TIME_OK; 530 time_state = TIME_OK;
371 time_status = STA_UNSYNC; 531 time_status = STA_UNSYNC;
532 /* restart PPS frequency calibration */
533 pps_reset_freq_interval();
372 } 534 }
373 535
374 /* 536 /*
@@ -418,6 +580,8 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts
418 time_freq = txc->freq * PPM_SCALE; 580 time_freq = txc->freq * PPM_SCALE;
419 time_freq = min(time_freq, MAXFREQ_SCALED); 581 time_freq = min(time_freq, MAXFREQ_SCALED);
420 time_freq = max(time_freq, -MAXFREQ_SCALED); 582 time_freq = max(time_freq, -MAXFREQ_SCALED);
583 /* update pps_freq */
584 pps_set_freq(time_freq);
421 } 585 }
422 586
423 if (txc->modes & ADJ_MAXERROR) 587 if (txc->modes & ADJ_MAXERROR)
@@ -508,7 +672,8 @@ int do_adjtimex(struct timex *txc)
508 } 672 }
509 673
510 result = time_state; /* mostly `TIME_OK' */ 674 result = time_state; /* mostly `TIME_OK' */
511 if (time_status & (STA_UNSYNC|STA_CLOCKERR)) 675 /* check for errors */
676 if (is_error_status(time_status))
512 result = TIME_ERROR; 677 result = TIME_ERROR;
513 678
514 txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) * 679 txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) *
@@ -522,15 +687,8 @@ int do_adjtimex(struct timex *txc)
522 txc->tick = tick_usec; 687 txc->tick = tick_usec;
523 txc->tai = time_tai; 688 txc->tai = time_tai;
524 689
525 /* PPS is not implemented, so these are zero */ 690 /* fill PPS status fields */
526 txc->ppsfreq = 0; 691 pps_fill_timex(txc);
527 txc->jitter = 0;
528 txc->shift = 0;
529 txc->stabil = 0;
530 txc->jitcnt = 0;
531 txc->calcnt = 0;
532 txc->errcnt = 0;
533 txc->stbcnt = 0;
534 692
535 write_sequnlock_irq(&xtime_lock); 693 write_sequnlock_irq(&xtime_lock);
536 694
@@ -544,6 +702,243 @@ int do_adjtimex(struct timex *txc)
544 return result; 702 return result;
545} 703}
546 704
705#ifdef CONFIG_NTP_PPS
706
707/* actually struct pps_normtime is good old struct timespec, but it is
708 * semantically different (and it is the reason why it was invented):
709 * pps_normtime.nsec has a range of ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ]
710 * while timespec.tv_nsec has a range of [0, NSEC_PER_SEC) */
711struct pps_normtime {
712 __kernel_time_t sec; /* seconds */
713 long nsec; /* nanoseconds */
714};
715
716/* normalize the timestamp so that nsec is in the
717 ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] interval */
718static inline struct pps_normtime pps_normalize_ts(struct timespec ts)
719{
720 struct pps_normtime norm = {
721 .sec = ts.tv_sec,
722 .nsec = ts.tv_nsec
723 };
724
725 if (norm.nsec > (NSEC_PER_SEC >> 1)) {
726 norm.nsec -= NSEC_PER_SEC;
727 norm.sec++;
728 }
729
730 return norm;
731}
732
733/* get current phase correction and jitter */
734static inline long pps_phase_filter_get(long *jitter)
735{
736 *jitter = pps_tf[0] - pps_tf[1];
737 if (*jitter < 0)
738 *jitter = -*jitter;
739
740 /* TODO: test various filters */
741 return pps_tf[0];
742}
743
744/* add the sample to the phase filter */
745static inline void pps_phase_filter_add(long err)
746{
747 pps_tf[2] = pps_tf[1];
748 pps_tf[1] = pps_tf[0];
749 pps_tf[0] = err;
750}
751
752/* decrease frequency calibration interval length.
753 * It is halved after four consecutive unstable intervals.
754 */
755static inline void pps_dec_freq_interval(void)
756{
757 if (--pps_intcnt <= -PPS_INTCOUNT) {
758 pps_intcnt = -PPS_INTCOUNT;
759 if (pps_shift > PPS_INTMIN) {
760 pps_shift--;
761 pps_intcnt = 0;
762 }
763 }
764}
765
766/* increase frequency calibration interval length.
767 * It is doubled after four consecutive stable intervals.
768 */
769static inline void pps_inc_freq_interval(void)
770{
771 if (++pps_intcnt >= PPS_INTCOUNT) {
772 pps_intcnt = PPS_INTCOUNT;
773 if (pps_shift < PPS_INTMAX) {
774 pps_shift++;
775 pps_intcnt = 0;
776 }
777 }
778}
779
780/* update clock frequency based on MONOTONIC_RAW clock PPS signal
781 * timestamps
782 *
783 * At the end of the calibration interval the difference between the
784 * first and last MONOTONIC_RAW clock timestamps divided by the length
785 * of the interval becomes the frequency update. If the interval was
786 * too long, the data are discarded.
787 * Returns the difference between old and new frequency values.
788 */
789static long hardpps_update_freq(struct pps_normtime freq_norm)
790{
791 long delta, delta_mod;
792 s64 ftemp;
793
794 /* check if the frequency interval was too long */
795 if (freq_norm.sec > (2 << pps_shift)) {
796 time_status |= STA_PPSERROR;
797 pps_errcnt++;
798 pps_dec_freq_interval();
799 pr_err("hardpps: PPSERROR: interval too long - %ld s\n",
800 freq_norm.sec);
801 return 0;
802 }
803
 804 /* here the raw frequency offset and wander (stability) are
 805 * calculated. If the wander is less than the wander threshold,
806 * the interval is increased; otherwise it is decreased.
807 */
808 ftemp = div_s64(((s64)(-freq_norm.nsec)) << NTP_SCALE_SHIFT,
809 freq_norm.sec);
810 delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT);
811 pps_freq = ftemp;
812 if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) {
813 pr_warning("hardpps: PPSWANDER: change=%ld\n", delta);
814 time_status |= STA_PPSWANDER;
815 pps_stbcnt++;
816 pps_dec_freq_interval();
817 } else { /* good sample */
818 pps_inc_freq_interval();
819 }
820
821 /* the stability metric is calculated as the average of recent
822 * frequency changes, but is used only for performance
823 * monitoring
824 */
825 delta_mod = delta;
826 if (delta_mod < 0)
827 delta_mod = -delta_mod;
828 pps_stabil += (div_s64(((s64)delta_mod) <<
829 (NTP_SCALE_SHIFT - SHIFT_USEC),
830 NSEC_PER_USEC) - pps_stabil) >> PPS_INTMIN;
831
832 /* if enabled, the system clock frequency is updated */
833 if ((time_status & STA_PPSFREQ) != 0 &&
834 (time_status & STA_FREQHOLD) == 0) {
835 time_freq = pps_freq;
836 ntp_update_frequency();
837 }
838
839 return delta;
840}
841
842/* correct REALTIME clock phase error against PPS signal */
843static void hardpps_update_phase(long error)
844{
845 long correction = -error;
846 long jitter;
847
848 /* add the sample to the median filter */
849 pps_phase_filter_add(correction);
850 correction = pps_phase_filter_get(&jitter);
851
852 /* Nominal jitter is due to PPS signal noise. If it exceeds the
853 * threshold, the sample is discarded; otherwise, if so enabled,
854 * the time offset is updated.
855 */
856 if (jitter > (pps_jitter << PPS_POPCORN)) {
857 pr_warning("hardpps: PPSJITTER: jitter=%ld, limit=%ld\n",
858 jitter, (pps_jitter << PPS_POPCORN));
859 time_status |= STA_PPSJITTER;
860 pps_jitcnt++;
861 } else if (time_status & STA_PPSTIME) {
862 /* correct the time using the phase offset */
863 time_offset = div_s64(((s64)correction) << NTP_SCALE_SHIFT,
864 NTP_INTERVAL_FREQ);
865 /* cancel running adjtime() */
866 time_adjust = 0;
867 }
868 /* update jitter */
869 pps_jitter += (jitter - pps_jitter) >> PPS_INTMIN;
870}
871
872/*
873 * hardpps() - discipline CPU clock oscillator to external PPS signal
874 *
875 * This routine is called at each PPS signal arrival in order to
876 * discipline the CPU clock oscillator to the PPS signal. It takes two
877 * parameters: REALTIME and MONOTONIC_RAW clock timestamps. The former
878 * is used to correct clock phase error and the latter is used to
879 * correct the frequency.
880 *
881 * This code is based on David Mills's reference nanokernel
882 * implementation. It was mostly rewritten but keeps the same idea.
883 */
884void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
885{
886 struct pps_normtime pts_norm, freq_norm;
887 unsigned long flags;
888
889 pts_norm = pps_normalize_ts(*phase_ts);
890
891 write_seqlock_irqsave(&xtime_lock, flags);
892
893 /* clear the error bits, they will be set again if needed */
894 time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR);
895
896 /* indicate signal presence */
897 time_status |= STA_PPSSIGNAL;
898 pps_valid = PPS_VALID;
899
900 /* when called for the first time,
901 * just start the frequency interval */
902 if (unlikely(pps_fbase.tv_sec == 0)) {
903 pps_fbase = *raw_ts;
904 write_sequnlock_irqrestore(&xtime_lock, flags);
905 return;
906 }
907
908 /* ok, now we have a base for frequency calculation */
909 freq_norm = pps_normalize_ts(timespec_sub(*raw_ts, pps_fbase));
910
911 /* check that the signal is in the range
912 * [1s - MAXFREQ us, 1s + MAXFREQ us], otherwise reject it */
913 if ((freq_norm.sec == 0) ||
914 (freq_norm.nsec > MAXFREQ * freq_norm.sec) ||
915 (freq_norm.nsec < -MAXFREQ * freq_norm.sec)) {
916 time_status |= STA_PPSJITTER;
917 /* restart the frequency calibration interval */
918 pps_fbase = *raw_ts;
919 write_sequnlock_irqrestore(&xtime_lock, flags);
920 pr_err("hardpps: PPSJITTER: bad pulse\n");
921 return;
922 }
923
924 /* signal is ok */
925
926 /* check if the current frequency interval is finished */
927 if (freq_norm.sec >= (1 << pps_shift)) {
928 pps_calcnt++;
929 /* restart the frequency calibration interval */
930 pps_fbase = *raw_ts;
931 hardpps_update_freq(freq_norm);
932 }
933
934 hardpps_update_phase(pts_norm.nsec);
935
936 write_sequnlock_irqrestore(&xtime_lock, flags);
937}
938EXPORT_SYMBOL(hardpps);
939
940#endif /* CONFIG_NTP_PPS */
941
547static int __init ntp_tick_adj_setup(char *str) 942static int __init ntp_tick_adj_setup(char *str)
548{ 943{
549 ntp_tick_adj = simple_strtol(str, NULL, 0); 944 ntp_tick_adj = simple_strtol(str, NULL, 0);
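
hardpps() above normalizes every timestamp before filtering, so the nanosecond part is centred on zero rather than always positive; that is what lets a pulse arriving slightly early show up as a small negative phase error. A quick user-space check of pps_normalize_ts()'s mapping, with standard types in place of the kernel's:

/*
 * pps_normalize_ts() as added above: tv_nsec in [0, NSEC_PER_SEC) becomes
 * nsec in (-NSEC_PER_SEC/2, NSEC_PER_SEC/2], carrying into the seconds field.
 */
#include <stdio.h>
#include <time.h>

#define NSEC_PER_SEC 1000000000L

struct pps_normtime {
	time_t sec;
	long nsec;
};

static struct pps_normtime pps_normalize_ts(struct timespec ts)
{
	struct pps_normtime norm = {
		.sec = ts.tv_sec,
		.nsec = ts.tv_nsec
	};

	if (norm.nsec > (NSEC_PER_SEC >> 1)) {	/* fold the upper half back */
		norm.nsec -= NSEC_PER_SEC;
		norm.sec++;
	}
	return norm;
}

int main(void)
{
	/* a pulse 1 ms early becomes sec+1 with a small negative nsec */
	struct timespec ts = { .tv_sec = 10, .tv_nsec = 999000000 };
	struct pps_normtime n = pps_normalize_ts(ts);

	printf("%ld.%09ld -> %ld s %+ld ns\n",
	       (long)ts.tv_sec, ts.tv_nsec, (long)n.sec, n.nsec);
	return 0;
}
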
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index b6b898d2eeef..051bc80a0c43 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -49,7 +49,7 @@ struct tick_device *tick_get_device(int cpu)
49 */ 49 */
50int tick_is_oneshot_available(void) 50int tick_is_oneshot_available(void)
51{ 51{
52 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; 52 struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
53 53
54 return dev && (dev->features & CLOCK_EVT_FEAT_ONESHOT); 54 return dev && (dev->features & CLOCK_EVT_FEAT_ONESHOT);
55} 55}
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index aada0e52680a..5cbc101f908b 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -95,7 +95,7 @@ int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires,
95 */ 95 */
96int tick_program_event(ktime_t expires, int force) 96int tick_program_event(ktime_t expires, int force)
97{ 97{
98 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; 98 struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
99 99
100 return tick_dev_program_event(dev, expires, force); 100 return tick_dev_program_event(dev, expires, force);
101} 101}
@@ -167,7 +167,7 @@ int tick_oneshot_mode_active(void)
167 int ret; 167 int ret;
168 168
169 local_irq_save(flags); 169 local_irq_save(flags);
170 ret = __get_cpu_var(tick_cpu_device).mode == TICKDEV_MODE_ONESHOT; 170 ret = __this_cpu_read(tick_cpu_device.mode) == TICKDEV_MODE_ONESHOT;
171 local_irq_restore(flags); 171 local_irq_restore(flags);
172 172
173 return ret; 173 return ret;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 3e216e01bbd1..c55ea2433471 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -642,8 +642,7 @@ static void tick_nohz_switch_to_nohz(void)
642 } 642 }
643 local_irq_enable(); 643 local_irq_enable();
644 644
645 printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", 645 printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", smp_processor_id());
646 smp_processor_id());
647} 646}
648 647
649/* 648/*
@@ -795,8 +794,10 @@ void tick_setup_sched_timer(void)
795 } 794 }
796 795
797#ifdef CONFIG_NO_HZ 796#ifdef CONFIG_NO_HZ
798 if (tick_nohz_enabled) 797 if (tick_nohz_enabled) {
799 ts->nohz_mode = NOHZ_MODE_HIGHRES; 798 ts->nohz_mode = NOHZ_MODE_HIGHRES;
799 printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", smp_processor_id());
800 }
800#endif 801#endif
801} 802}
802#endif /* HIGH_RES_TIMERS */ 803#endif /* HIGH_RES_TIMERS */
diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c
index ac38fbb176cc..a9ae369925ce 100644
--- a/kernel/time/timecompare.c
+++ b/kernel/time/timecompare.c
@@ -21,6 +21,7 @@
21#include <linux/module.h> 21#include <linux/module.h>
22#include <linux/slab.h> 22#include <linux/slab.h>
23#include <linux/math64.h> 23#include <linux/math64.h>
24#include <linux/kernel.h>
24 25
25/* 26/*
26 * fixed point arithmetic scale factor for skew 27 * fixed point arithmetic scale factor for skew
@@ -57,11 +58,11 @@ int timecompare_offset(struct timecompare *sync,
57 int index; 58 int index;
58 int num_samples = sync->num_samples; 59 int num_samples = sync->num_samples;
59 60
60 if (num_samples > sizeof(buffer)/sizeof(buffer[0])) { 61 if (num_samples > ARRAY_SIZE(buffer)) {
61 samples = kmalloc(sizeof(*samples) * num_samples, GFP_ATOMIC); 62 samples = kmalloc(sizeof(*samples) * num_samples, GFP_ATOMIC);
62 if (!samples) { 63 if (!samples) {
63 samples = buffer; 64 samples = buffer;
64 num_samples = sizeof(buffer)/sizeof(buffer[0]); 65 num_samples = ARRAY_SIZE(buffer);
65 } 66 }
66 } else { 67 } else {
67 samples = buffer; 68 samples = buffer;
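The timecompare.c change above is a pure readability conversion to the ARRAY_SIZE() idiom. A standalone user-space sketch of the same idiom (the buffer here is illustrative; the kernel's macro additionally enforces at compile time that its argument is a real array):

#include <stdio.h>

#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))

int main(void)
{
        long buffer[8];

        /* open-coded form replaced by the patch */
        printf("%zu\n", sizeof(buffer) / sizeof(buffer[0]));    /* 8 */
        /* named form: same value, harder to get wrong when the element type changes */
        printf("%zu\n", ARRAY_SIZE(buffer));                    /* 8 */
        return 0;
}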
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 49010d822f72..d27c7562902c 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -32,6 +32,8 @@ struct timekeeper {
32 cycle_t cycle_interval; 32 cycle_t cycle_interval;
33 /* Number of clock shifted nano seconds in one NTP interval. */ 33 /* Number of clock shifted nano seconds in one NTP interval. */
34 u64 xtime_interval; 34 u64 xtime_interval;
35 /* shifted nano seconds left over when rounding cycle_interval */
36 s64 xtime_remainder;
35 /* Raw nano seconds accumulated per NTP interval. */ 37 /* Raw nano seconds accumulated per NTP interval. */
36 u32 raw_interval; 38 u32 raw_interval;
37 39
@@ -47,7 +49,7 @@ struct timekeeper {
47 u32 mult; 49 u32 mult;
48}; 50};
49 51
50struct timekeeper timekeeper; 52static struct timekeeper timekeeper;
51 53
52/** 54/**
53 * timekeeper_setup_internals - Set up internals to use clocksource clock. 55 * timekeeper_setup_internals - Set up internals to use clocksource clock.
@@ -62,7 +64,7 @@ struct timekeeper timekeeper;
62static void timekeeper_setup_internals(struct clocksource *clock) 64static void timekeeper_setup_internals(struct clocksource *clock)
63{ 65{
64 cycle_t interval; 66 cycle_t interval;
65 u64 tmp; 67 u64 tmp, ntpinterval;
66 68
67 timekeeper.clock = clock; 69 timekeeper.clock = clock;
68 clock->cycle_last = clock->read(clock); 70 clock->cycle_last = clock->read(clock);
@@ -70,6 +72,7 @@ static void timekeeper_setup_internals(struct clocksource *clock)
70 /* Do the ns -> cycle conversion first, using original mult */ 72 /* Do the ns -> cycle conversion first, using original mult */
71 tmp = NTP_INTERVAL_LENGTH; 73 tmp = NTP_INTERVAL_LENGTH;
72 tmp <<= clock->shift; 74 tmp <<= clock->shift;
75 ntpinterval = tmp;
73 tmp += clock->mult/2; 76 tmp += clock->mult/2;
74 do_div(tmp, clock->mult); 77 do_div(tmp, clock->mult);
75 if (tmp == 0) 78 if (tmp == 0)
@@ -80,6 +83,7 @@ static void timekeeper_setup_internals(struct clocksource *clock)
80 83
81 /* Go back from cycles -> shifted ns */ 84 /* Go back from cycles -> shifted ns */
82 timekeeper.xtime_interval = (u64) interval * clock->mult; 85 timekeeper.xtime_interval = (u64) interval * clock->mult;
86 timekeeper.xtime_remainder = ntpinterval - timekeeper.xtime_interval;
83 timekeeper.raw_interval = 87 timekeeper.raw_interval =
84 ((u64) interval * clock->mult) >> clock->shift; 88 ((u64) interval * clock->mult) >> clock->shift;
85 89
@@ -160,7 +164,7 @@ static struct timespec total_sleep_time;
160/* 164/*
161 * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. 165 * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock.
162 */ 166 */
163struct timespec raw_time; 167static struct timespec raw_time;
164 168
165/* flag for if timekeeping is suspended */ 169/* flag for if timekeeping is suspended */
166int __read_mostly timekeeping_suspended; 170int __read_mostly timekeeping_suspended;
@@ -284,6 +288,49 @@ void ktime_get_ts(struct timespec *ts)
284} 288}
285EXPORT_SYMBOL_GPL(ktime_get_ts); 289EXPORT_SYMBOL_GPL(ktime_get_ts);
286 290
291#ifdef CONFIG_NTP_PPS
292
293/**
294 * getnstime_raw_and_real - get day and raw monotonic time in timespec format
295 * @ts_raw: pointer to the timespec to be set to raw monotonic time
296 * @ts_real: pointer to the timespec to be set to the time of day
297 *
298 * This function reads both the time of day and raw monotonic time at the
299 * same time atomically and stores the resulting timestamps in timespec
300 * format.
301 */
302void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
303{
304 unsigned long seq;
305 s64 nsecs_raw, nsecs_real;
306
307 WARN_ON_ONCE(timekeeping_suspended);
308
309 do {
310 u32 arch_offset;
311
312 seq = read_seqbegin(&xtime_lock);
313
314 *ts_raw = raw_time;
315 *ts_real = xtime;
316
317 nsecs_raw = timekeeping_get_ns_raw();
318 nsecs_real = timekeeping_get_ns();
319
320 /* If arch requires, add in gettimeoffset() */
321 arch_offset = arch_gettimeoffset();
322 nsecs_raw += arch_offset;
323 nsecs_real += arch_offset;
324
325 } while (read_seqretry(&xtime_lock, seq));
326
327 timespec_add_ns(ts_raw, nsecs_raw);
328 timespec_add_ns(ts_real, nsecs_real);
329}
330EXPORT_SYMBOL(getnstime_raw_and_real);
331
332#endif /* CONFIG_NTP_PPS */
333
287/** 334/**
288 * do_gettimeofday - Returns the time of day in a timeval 335 * do_gettimeofday - Returns the time of day in a timeval
289 * @tv: pointer to the timeval to be set 336 * @tv: pointer to the timeval to be set
@@ -719,7 +766,8 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
719 766
720 /* Accumulate error between NTP and clock interval */ 767 /* Accumulate error between NTP and clock interval */
721 timekeeper.ntp_error += tick_length << shift; 768 timekeeper.ntp_error += tick_length << shift;
722 timekeeper.ntp_error -= timekeeper.xtime_interval << 769 timekeeper.ntp_error -=
770 (timekeeper.xtime_interval + timekeeper.xtime_remainder) <<
723 (timekeeper.ntp_error_shift + shift); 771 (timekeeper.ntp_error_shift + shift);
724 772
725 return offset; 773 return offset;
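To see why the new xtime_remainder term matters in the accumulation above, here is a back-of-the-envelope user-space sketch: rounding the NTP interval to a whole number of clocksource cycles loses a few shifted nanoseconds per interval, and the error accounting has to put them back. The shift, mult and interval length below are made-up illustrative numbers, not taken from a real clocksource.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        const uint64_t ntp_interval_ns = 1000000;   /* assume a 1 ms NTP interval */
        const uint32_t shift = 20, mult = 4194967;  /* hypothetical clocksource parameters */

        uint64_t ntpinterval = ntp_interval_ns << shift;      /* shifted ns we want per interval */
        uint64_t cycles = (ntpinterval + mult / 2) / mult;    /* rounded to whole cycles */
        uint64_t xtime_interval = cycles * mult;              /* shifted ns we actually accumulate */
        int64_t xtime_remainder = (int64_t)(ntpinterval - xtime_interval);

        /* this is the per-interval slack that ntp_error would otherwise absorb as drift */
        printf("remainder per interval: %lld shifted ns\n",
               (long long)xtime_remainder);
        return 0;
}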
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index ab8f5e33fa92..3258455549f4 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -41,7 +41,7 @@ static void print_name_offset(struct seq_file *m, void *sym)
41 char symname[KSYM_NAME_LEN]; 41 char symname[KSYM_NAME_LEN];
42 42
43 if (lookup_symbol_name((unsigned long)sym, symname) < 0) 43 if (lookup_symbol_name((unsigned long)sym, symname) < 0)
44 SEQ_printf(m, "<%p>", sym); 44 SEQ_printf(m, "<%pK>", sym);
45 else 45 else
46 SEQ_printf(m, "%s", symname); 46 SEQ_printf(m, "%s", symname);
47} 47}
@@ -79,26 +79,26 @@ print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base,
79{ 79{
80 struct hrtimer *timer, tmp; 80 struct hrtimer *timer, tmp;
81 unsigned long next = 0, i; 81 unsigned long next = 0, i;
82 struct rb_node *curr; 82 struct timerqueue_node *curr;
83 unsigned long flags; 83 unsigned long flags;
84 84
85next_one: 85next_one:
86 i = 0; 86 i = 0;
87 raw_spin_lock_irqsave(&base->cpu_base->lock, flags); 87 raw_spin_lock_irqsave(&base->cpu_base->lock, flags);
88 88
89 curr = base->first; 89 curr = timerqueue_getnext(&base->active);
90 /* 90 /*
91 * Crude but we have to do this O(N*N) thing, because 91 * Crude but we have to do this O(N*N) thing, because
92 * we have to unlock the base when printing: 92 * we have to unlock the base when printing:
93 */ 93 */
94 while (curr && i < next) { 94 while (curr && i < next) {
95 curr = rb_next(curr); 95 curr = timerqueue_iterate_next(curr);
96 i++; 96 i++;
97 } 97 }
98 98
99 if (curr) { 99 if (curr) {
100 100
101 timer = rb_entry(curr, struct hrtimer, node); 101 timer = container_of(curr, struct hrtimer, node);
102 tmp = *timer; 102 tmp = *timer;
103 raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags); 103 raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags);
104 104
@@ -112,7 +112,7 @@ next_one:
112static void 112static void
113print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) 113print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
114{ 114{
115 SEQ_printf(m, " .base: %p\n", base); 115 SEQ_printf(m, " .base: %pK\n", base);
116 SEQ_printf(m, " .index: %d\n", 116 SEQ_printf(m, " .index: %d\n",
117 base->index); 117 base->index);
118 SEQ_printf(m, " .resolution: %Lu nsecs\n", 118 SEQ_printf(m, " .resolution: %Lu nsecs\n",
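The timer_list.c hunk swaps rb_entry() on a raw rb_node for container_of() on the embedded timerqueue_node. A self-contained user-space sketch of that recovery pattern, using stand-in types rather than the real hrtimer/timerqueue structures:

#include <stdio.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct node { struct node *next; };              /* stand-in for timerqueue_node */
struct my_timer { int id; struct node node; };   /* stand-in for hrtimer */

int main(void)
{
        struct my_timer t = { .id = 42 };
        struct node *n = &t.node;                /* what an iterator hands back */
        struct my_timer *back = container_of(n, struct my_timer, node);

        printf("recovered id = %d\n", back->id); /* 42 */
        return 0;
}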
diff --git a/kernel/timer.c b/kernel/timer.c
index 353b9227c2ec..d6459923d245 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -88,18 +88,6 @@ struct tvec_base boot_tvec_bases;
88EXPORT_SYMBOL(boot_tvec_bases); 88EXPORT_SYMBOL(boot_tvec_bases);
89static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; 89static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;
90 90
91/*
92 * Note that all tvec_bases are 2 byte aligned and lower bit of
93 * base in timer_list is guaranteed to be zero. Use the LSB to
94 * indicate whether the timer is deferrable.
95 *
96 * A deferrable timer will work normally when the system is busy, but
97 * will not cause a CPU to come out of idle just to service it; instead,
98 * the timer will be serviced when the CPU eventually wakes up with a
99 * subsequent non-deferrable timer.
100 */
101#define TBASE_DEFERRABLE_FLAG (0x1)
102
103/* Functions below help us manage 'deferrable' flag */ 91/* Functions below help us manage 'deferrable' flag */
104static inline unsigned int tbase_get_deferrable(struct tvec_base *base) 92static inline unsigned int tbase_get_deferrable(struct tvec_base *base)
105{ 93{
@@ -113,8 +101,7 @@ static inline struct tvec_base *tbase_get_base(struct tvec_base *base)
113 101
114static inline void timer_set_deferrable(struct timer_list *timer) 102static inline void timer_set_deferrable(struct timer_list *timer)
115{ 103{
116 timer->base = ((struct tvec_base *)((unsigned long)(timer->base) | 104 timer->base = TBASE_MAKE_DEFERRED(timer->base);
117 TBASE_DEFERRABLE_FLAG));
118} 105}
119 106
120static inline void 107static inline void
@@ -343,15 +330,6 @@ void set_timer_slack(struct timer_list *timer, int slack_hz)
343} 330}
344EXPORT_SYMBOL_GPL(set_timer_slack); 331EXPORT_SYMBOL_GPL(set_timer_slack);
345 332
346
347static inline void set_running_timer(struct tvec_base *base,
348 struct timer_list *timer)
349{
350#ifdef CONFIG_SMP
351 base->running_timer = timer;
352#endif
353}
354
355static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) 333static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
356{ 334{
357 unsigned long expires = timer->expires; 335 unsigned long expires = timer->expires;
@@ -936,15 +914,12 @@ int del_timer(struct timer_list *timer)
936} 914}
937EXPORT_SYMBOL(del_timer); 915EXPORT_SYMBOL(del_timer);
938 916
939#ifdef CONFIG_SMP
940/** 917/**
941 * try_to_del_timer_sync - Try to deactivate a timer 918 * try_to_del_timer_sync - Try to deactivate a timer
942 * @timer: timer to del 919 * @timer: timer to del

943 * 920 *
944 * This function tries to deactivate a timer. Upon successful (ret >= 0) 921 * This function tries to deactivate a timer. Upon successful (ret >= 0)
945 * exit the timer is not queued and the handler is not running on any CPU. 922 * exit the timer is not queued and the handler is not running on any CPU.
946 *
947 * It must not be called from interrupt contexts.
948 */ 923 */
949int try_to_del_timer_sync(struct timer_list *timer) 924int try_to_del_timer_sync(struct timer_list *timer)
950{ 925{
@@ -973,6 +948,7 @@ out:
973} 948}
974EXPORT_SYMBOL(try_to_del_timer_sync); 949EXPORT_SYMBOL(try_to_del_timer_sync);
975 950
951#ifdef CONFIG_SMP
976/** 952/**
977 * del_timer_sync - deactivate a timer and wait for the handler to finish. 953 * del_timer_sync - deactivate a timer and wait for the handler to finish.
978 * @timer: the timer to be deactivated 954 * @timer: the timer to be deactivated
@@ -1000,7 +976,11 @@ int del_timer_sync(struct timer_list *timer)
1000 lock_map_release(&timer->lockdep_map); 976 lock_map_release(&timer->lockdep_map);
1001 local_irq_restore(flags); 977 local_irq_restore(flags);
1002#endif 978#endif
1003 979 /*
980 * don't use it in hardirq context, because it
981 * could lead to deadlock.
982 */
983 WARN_ON(in_irq());
1004 for (;;) { 984 for (;;) {
1005 int ret = try_to_del_timer_sync(timer); 985 int ret = try_to_del_timer_sync(timer);
1006 if (ret >= 0) 986 if (ret >= 0)
@@ -1111,7 +1091,7 @@ static inline void __run_timers(struct tvec_base *base)
1111 1091
1112 timer_stats_account_timer(timer); 1092 timer_stats_account_timer(timer);
1113 1093
1114 set_running_timer(base, timer); 1094 base->running_timer = timer;
1115 detach_timer(timer, 1); 1095 detach_timer(timer, 1);
1116 1096
1117 spin_unlock_irq(&base->lock); 1097 spin_unlock_irq(&base->lock);
@@ -1119,7 +1099,7 @@ static inline void __run_timers(struct tvec_base *base)
1119 spin_lock_irq(&base->lock); 1099 spin_lock_irq(&base->lock);
1120 } 1100 }
1121 } 1101 }
1122 set_running_timer(base, NULL); 1102 base->running_timer = NULL;
1123 spin_unlock_irq(&base->lock); 1103 spin_unlock_irq(&base->lock);
1124} 1104}
1125 1105
@@ -1249,7 +1229,7 @@ static unsigned long cmp_next_hrtimer_event(unsigned long now,
1249 */ 1229 */
1250unsigned long get_next_timer_interrupt(unsigned long now) 1230unsigned long get_next_timer_interrupt(unsigned long now)
1251{ 1231{
1252 struct tvec_base *base = __get_cpu_var(tvec_bases); 1232 struct tvec_base *base = __this_cpu_read(tvec_bases);
1253 unsigned long expires; 1233 unsigned long expires;
1254 1234
1255 /* 1235 /*
@@ -1298,7 +1278,7 @@ void update_process_times(int user_tick)
1298 */ 1278 */
1299static void run_timer_softirq(struct softirq_action *h) 1279static void run_timer_softirq(struct softirq_action *h)
1300{ 1280{
1301 struct tvec_base *base = __get_cpu_var(tvec_bases); 1281 struct tvec_base *base = __this_cpu_read(tvec_bases);
1302 1282
1303 hrtimer_run_pending(); 1283 hrtimer_run_pending();
1304 1284
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index ea37e2ff4164..14674dce77a6 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -69,6 +69,21 @@ config EVENT_TRACING
69 select CONTEXT_SWITCH_TRACER 69 select CONTEXT_SWITCH_TRACER
70 bool 70 bool
71 71
72config EVENT_POWER_TRACING_DEPRECATED
73 depends on EVENT_TRACING
74 bool "Deprecated power event trace API, to be removed"
75 default y
76 help
77 Provides old power event types:
78 C-state/idle accounting events:
79 power:power_start
80 power:power_end
81 and old cpufreq accounting event:
82 power:power_frequency
83 This is for userspace compatibility
84 and will vanish after 5 kernel iterations,
85 namely 2.6.41.
86
72config CONTEXT_SWITCH_TRACER 87config CONTEXT_SWITCH_TRACER
73 bool 88 bool
74 89
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 53f338190b26..761c510a06c5 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -52,7 +52,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
52endif 52endif
53obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o 53obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
54obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o 54obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
55obj-$(CONFIG_EVENT_TRACING) += power-traces.o 55obj-$(CONFIG_TRACEPOINTS) += power-traces.o
56ifeq ($(CONFIG_TRACING),y) 56ifeq ($(CONFIG_TRACING),y)
57obj-$(CONFIG_KGDB_KDB) += trace_kdb.o 57obj-$(CONFIG_KGDB_KDB) += trace_kdb.o
58endif 58endif
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 7b8ec0281548..d95721f33702 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -138,6 +138,13 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...)
138 !blk_tracer_enabled)) 138 !blk_tracer_enabled))
139 return; 139 return;
140 140
141 /*
142 * If the BLK_TC_NOTIFY action mask isn't set, don't send any note
143 * message to the trace.
144 */
145 if (!(bt->act_mask & BLK_TC_NOTIFY))
146 return;
147
141 local_irq_save(flags); 148 local_irq_save(flags);
142 buf = per_cpu_ptr(bt->msg_data, smp_processor_id()); 149 buf = per_cpu_ptr(bt->msg_data, smp_processor_id());
143 va_start(args, fmt); 150 va_start(args, fmt);
@@ -758,53 +765,58 @@ static void blk_add_trace_rq_complete(void *ignore,
758 * @q: queue the io is for 765 * @q: queue the io is for
759 * @bio: the source bio 766 * @bio: the source bio
760 * @what: the action 767 * @what: the action
768 * @error: error, if any
761 * 769 *
762 * Description: 770 * Description:
763 * Records an action against a bio. Will log the bio offset + size. 771 * Records an action against a bio. Will log the bio offset + size.
764 * 772 *
765 **/ 773 **/
766static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, 774static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
767 u32 what) 775 u32 what, int error)
768{ 776{
769 struct blk_trace *bt = q->blk_trace; 777 struct blk_trace *bt = q->blk_trace;
770 778
771 if (likely(!bt)) 779 if (likely(!bt))
772 return; 780 return;
773 781
782 if (!error && !bio_flagged(bio, BIO_UPTODATE))
783 error = EIO;
784
774 __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, 785 __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what,
775 !bio_flagged(bio, BIO_UPTODATE), 0, NULL); 786 error, 0, NULL);
776} 787}
777 788
778static void blk_add_trace_bio_bounce(void *ignore, 789static void blk_add_trace_bio_bounce(void *ignore,
779 struct request_queue *q, struct bio *bio) 790 struct request_queue *q, struct bio *bio)
780{ 791{
781 blk_add_trace_bio(q, bio, BLK_TA_BOUNCE); 792 blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0);
782} 793}
783 794
784static void blk_add_trace_bio_complete(void *ignore, 795static void blk_add_trace_bio_complete(void *ignore,
785 struct request_queue *q, struct bio *bio) 796 struct request_queue *q, struct bio *bio,
797 int error)
786{ 798{
787 blk_add_trace_bio(q, bio, BLK_TA_COMPLETE); 799 blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error);
788} 800}
789 801
790static void blk_add_trace_bio_backmerge(void *ignore, 802static void blk_add_trace_bio_backmerge(void *ignore,
791 struct request_queue *q, 803 struct request_queue *q,
792 struct bio *bio) 804 struct bio *bio)
793{ 805{
794 blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE); 806 blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0);
795} 807}
796 808
797static void blk_add_trace_bio_frontmerge(void *ignore, 809static void blk_add_trace_bio_frontmerge(void *ignore,
798 struct request_queue *q, 810 struct request_queue *q,
799 struct bio *bio) 811 struct bio *bio)
800{ 812{
801 blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE); 813 blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0);
802} 814}
803 815
804static void blk_add_trace_bio_queue(void *ignore, 816static void blk_add_trace_bio_queue(void *ignore,
805 struct request_queue *q, struct bio *bio) 817 struct request_queue *q, struct bio *bio)
806{ 818{
807 blk_add_trace_bio(q, bio, BLK_TA_QUEUE); 819 blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0);
808} 820}
809 821
810static void blk_add_trace_getrq(void *ignore, 822static void blk_add_trace_getrq(void *ignore,
@@ -812,7 +824,7 @@ static void blk_add_trace_getrq(void *ignore,
812 struct bio *bio, int rw) 824 struct bio *bio, int rw)
813{ 825{
814 if (bio) 826 if (bio)
815 blk_add_trace_bio(q, bio, BLK_TA_GETRQ); 827 blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0);
816 else { 828 else {
817 struct blk_trace *bt = q->blk_trace; 829 struct blk_trace *bt = q->blk_trace;
818 830
@@ -827,7 +839,7 @@ static void blk_add_trace_sleeprq(void *ignore,
827 struct bio *bio, int rw) 839 struct bio *bio, int rw)
828{ 840{
829 if (bio) 841 if (bio)
830 blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ); 842 blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0);
831 else { 843 else {
832 struct blk_trace *bt = q->blk_trace; 844 struct blk_trace *bt = q->blk_trace;
833 845
@@ -887,7 +899,7 @@ static void blk_add_trace_split(void *ignore,
887} 899}
888 900
889/** 901/**
890 * blk_add_trace_remap - Add a trace for a remap operation 902 * blk_add_trace_bio_remap - Add a trace for a bio-remap operation
891 * @ignore: trace callback data parameter (not used) 903 * @ignore: trace callback data parameter (not used)
892 * @q: queue the io is for 904 * @q: queue the io is for
893 * @bio: the source bio 905 * @bio: the source bio
@@ -899,9 +911,9 @@ static void blk_add_trace_split(void *ignore,
899 * it spans a stripe (or similar). Add a trace for that action. 911 * it spans a stripe (or similar). Add a trace for that action.
900 * 912 *
901 **/ 913 **/
902static void blk_add_trace_remap(void *ignore, 914static void blk_add_trace_bio_remap(void *ignore,
903 struct request_queue *q, struct bio *bio, 915 struct request_queue *q, struct bio *bio,
904 dev_t dev, sector_t from) 916 dev_t dev, sector_t from)
905{ 917{
906 struct blk_trace *bt = q->blk_trace; 918 struct blk_trace *bt = q->blk_trace;
907 struct blk_io_trace_remap r; 919 struct blk_io_trace_remap r;
@@ -1016,7 +1028,7 @@ static void blk_register_tracepoints(void)
1016 WARN_ON(ret); 1028 WARN_ON(ret);
1017 ret = register_trace_block_split(blk_add_trace_split, NULL); 1029 ret = register_trace_block_split(blk_add_trace_split, NULL);
1018 WARN_ON(ret); 1030 WARN_ON(ret);
1019 ret = register_trace_block_remap(blk_add_trace_remap, NULL); 1031 ret = register_trace_block_bio_remap(blk_add_trace_bio_remap, NULL);
1020 WARN_ON(ret); 1032 WARN_ON(ret);
1021 ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); 1033 ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
1022 WARN_ON(ret); 1034 WARN_ON(ret);
@@ -1025,7 +1037,7 @@ static void blk_register_tracepoints(void)
1025static void blk_unregister_tracepoints(void) 1037static void blk_unregister_tracepoints(void)
1026{ 1038{
1027 unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); 1039 unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
1028 unregister_trace_block_remap(blk_add_trace_remap, NULL); 1040 unregister_trace_block_bio_remap(blk_add_trace_bio_remap, NULL);
1029 unregister_trace_block_split(blk_add_trace_split, NULL); 1041 unregister_trace_block_split(blk_add_trace_split, NULL);
1030 unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL); 1042 unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL);
1031 unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL); 1043 unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL);
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
index a22582a06161..f55fcf61b223 100644
--- a/kernel/trace/power-traces.c
+++ b/kernel/trace/power-traces.c
@@ -13,5 +13,8 @@
13#define CREATE_TRACE_POINTS 13#define CREATE_TRACE_POINTS
14#include <trace/events/power.h> 14#include <trace/events/power.h>
15 15
16EXPORT_TRACEPOINT_SYMBOL_GPL(power_frequency); 16#ifdef EVENT_POWER_TRACING_DEPRECATED
17EXPORT_TRACEPOINT_SYMBOL_GPL(power_start);
18#endif
19EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle);
17 20
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index f8cf959bad45..dc53ecb80589 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1313,12 +1313,10 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1313 1313
1314 __this_cpu_inc(user_stack_count); 1314 __this_cpu_inc(user_stack_count);
1315 1315
1316
1317
1318 event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, 1316 event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,
1319 sizeof(*entry), flags, pc); 1317 sizeof(*entry), flags, pc);
1320 if (!event) 1318 if (!event)
1321 return; 1319 goto out_drop_count;
1322 entry = ring_buffer_event_data(event); 1320 entry = ring_buffer_event_data(event);
1323 1321
1324 entry->tgid = current->tgid; 1322 entry->tgid = current->tgid;
@@ -1333,8 +1331,8 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1333 if (!filter_check_discard(call, entry, buffer, event)) 1331 if (!filter_check_discard(call, entry, buffer, event))
1334 ring_buffer_unlock_commit(buffer, event); 1332 ring_buffer_unlock_commit(buffer, event);
1335 1333
1334 out_drop_count:
1336 __this_cpu_dec(user_stack_count); 1335 __this_cpu_dec(user_stack_count);
1337
1338 out: 1336 out:
1339 preempt_enable(); 1337 preempt_enable();
1340} 1338}
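The trace.c fix above restores the balanced-exit pattern: once the per-CPU recursion counter is incremented, every exit path must decrement it, so the early return becomes a goto to a cleanup label. A tiny user-space sketch of that pattern (the guard variable here is illustrative only):

#include <stdio.h>

static int guard;

static int do_work(int fail_early)
{
        int ret = -1;

        guard++;                      /* enter the guarded region */

        if (fail_early)
                goto out_drop_guard;  /* was a bare "return", which leaked the guard */

        ret = 0;                      /* real work would happen here */

out_drop_guard:
        guard--;                      /* always rebalanced */
        return ret;
}

int main(void)
{
        do_work(1);
        do_work(0);
        printf("guard after both calls: %d\n", guard);   /* 0 */
        return 0;
}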
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index e3dfecaf13e6..6cf223764be8 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -53,7 +53,7 @@
53 */ 53 */
54 54
55/* 55/*
56 * Function trace entry - function address and parent function addres: 56 * Function trace entry - function address and parent function address:
57 */ 57 */
58FTRACE_ENTRY(function, ftrace_entry, 58FTRACE_ENTRY(function, ftrace_entry,
59 59
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 39c059ca670e..19a359d5e6d5 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -21,17 +21,46 @@ typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)])
21/* Count the events in use (per event id, not per instance) */ 21/* Count the events in use (per event id, not per instance) */
22static int total_ref_count; 22static int total_ref_count;
23 23
24static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
25 struct perf_event *p_event)
26{
27 /* No tracing, just counting, so no obvious leak */
28 if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW))
29 return 0;
30
31 /* Some events are ok to be traced by non-root users... */
32 if (p_event->attach_state == PERF_ATTACH_TASK) {
33 if (tp_event->flags & TRACE_EVENT_FL_CAP_ANY)
34 return 0;
35 }
36
37 /*
38 * ...otherwise raw tracepoint data can be a severe data leak,
39 * only allow root to have these.
40 */
41 if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
42 return -EPERM;
43
44 return 0;
45}
46
24static int perf_trace_event_init(struct ftrace_event_call *tp_event, 47static int perf_trace_event_init(struct ftrace_event_call *tp_event,
25 struct perf_event *p_event) 48 struct perf_event *p_event)
26{ 49{
27 struct hlist_head __percpu *list; 50 struct hlist_head __percpu *list;
28 int ret = -ENOMEM; 51 int ret;
29 int cpu; 52 int cpu;
30 53
54 ret = perf_trace_event_perm(tp_event, p_event);
55 if (ret)
56 return ret;
57
31 p_event->tp_event = tp_event; 58 p_event->tp_event = tp_event;
32 if (tp_event->perf_refcount++ > 0) 59 if (tp_event->perf_refcount++ > 0)
33 return 0; 60 return 0;
34 61
62 ret = -ENOMEM;
63
35 list = alloc_percpu(struct hlist_head); 64 list = alloc_percpu(struct hlist_head);
36 if (!list) 65 if (!list)
37 goto fail; 66 goto fail;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 0725eeab1937..5f499e0438a4 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -27,6 +27,12 @@
27 27
28DEFINE_MUTEX(event_mutex); 28DEFINE_MUTEX(event_mutex);
29 29
30DEFINE_MUTEX(event_storage_mutex);
31EXPORT_SYMBOL_GPL(event_storage_mutex);
32
33char event_storage[EVENT_STORAGE_SIZE];
34EXPORT_SYMBOL_GPL(event_storage);
35
30LIST_HEAD(ftrace_events); 36LIST_HEAD(ftrace_events);
31LIST_HEAD(ftrace_common_fields); 37LIST_HEAD(ftrace_common_fields);
32 38
@@ -1278,7 +1284,7 @@ trace_create_file_ops(struct module *mod)
1278static void trace_module_add_events(struct module *mod) 1284static void trace_module_add_events(struct module *mod)
1279{ 1285{
1280 struct ftrace_module_file_ops *file_ops = NULL; 1286 struct ftrace_module_file_ops *file_ops = NULL;
1281 struct ftrace_event_call *call, *start, *end; 1287 struct ftrace_event_call **call, **start, **end;
1282 1288
1283 start = mod->trace_events; 1289 start = mod->trace_events;
1284 end = mod->trace_events + mod->num_trace_events; 1290 end = mod->trace_events + mod->num_trace_events;
@@ -1291,7 +1297,7 @@ static void trace_module_add_events(struct module *mod)
1291 return; 1297 return;
1292 1298
1293 for_each_event(call, start, end) { 1299 for_each_event(call, start, end) {
1294 __trace_add_event_call(call, mod, 1300 __trace_add_event_call(*call, mod,
1295 &file_ops->id, &file_ops->enable, 1301 &file_ops->id, &file_ops->enable,
1296 &file_ops->filter, &file_ops->format); 1302 &file_ops->filter, &file_ops->format);
1297 } 1303 }
@@ -1361,8 +1367,8 @@ static struct notifier_block trace_module_nb = {
1361 .priority = 0, 1367 .priority = 0,
1362}; 1368};
1363 1369
1364extern struct ftrace_event_call __start_ftrace_events[]; 1370extern struct ftrace_event_call *__start_ftrace_events[];
1365extern struct ftrace_event_call __stop_ftrace_events[]; 1371extern struct ftrace_event_call *__stop_ftrace_events[];
1366 1372
1367static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata; 1373static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata;
1368 1374
@@ -1378,7 +1384,7 @@ __setup("trace_event=", setup_trace_event);
1378 1384
1379static __init int event_trace_init(void) 1385static __init int event_trace_init(void)
1380{ 1386{
1381 struct ftrace_event_call *call; 1387 struct ftrace_event_call **call;
1382 struct dentry *d_tracer; 1388 struct dentry *d_tracer;
1383 struct dentry *entry; 1389 struct dentry *entry;
1384 struct dentry *d_events; 1390 struct dentry *d_events;
@@ -1424,7 +1430,7 @@ static __init int event_trace_init(void)
1424 pr_warning("tracing: Failed to allocate common fields"); 1430 pr_warning("tracing: Failed to allocate common fields");
1425 1431
1426 for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { 1432 for_each_event(call, __start_ftrace_events, __stop_ftrace_events) {
1427 __trace_add_event_call(call, NULL, &ftrace_event_id_fops, 1433 __trace_add_event_call(*call, NULL, &ftrace_event_id_fops,
1428 &ftrace_enable_fops, 1434 &ftrace_enable_fops,
1429 &ftrace_event_filter_fops, 1435 &ftrace_event_filter_fops,
1430 &ftrace_event_format_fops); 1436 &ftrace_event_format_fops);
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 4ba44deaac25..bbeec31e0ae3 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -83,13 +83,19 @@ static void __always_unused ____ftrace_check_##name(void) \
83 83
84#undef __array 84#undef __array
85#define __array(type, item, len) \ 85#define __array(type, item, len) \
86 BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ 86 do { \
87 ret = trace_define_field(event_call, #type "[" #len "]", #item, \ 87 BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \
88 mutex_lock(&event_storage_mutex); \
89 snprintf(event_storage, sizeof(event_storage), \
90 "%s[%d]", #type, len); \
91 ret = trace_define_field(event_call, event_storage, #item, \
88 offsetof(typeof(field), item), \ 92 offsetof(typeof(field), item), \
89 sizeof(field.item), \ 93 sizeof(field.item), \
90 is_signed_type(type), FILTER_OTHER); \ 94 is_signed_type(type), FILTER_OTHER); \
91 if (ret) \ 95 mutex_unlock(&event_storage_mutex); \
92 return ret; 96 if (ret) \
97 return ret; \
98 } while (0);
93 99
94#undef __array_desc 100#undef __array_desc
95#define __array_desc(type, container, item, len) \ 101#define __array_desc(type, container, item, len) \
@@ -155,13 +161,13 @@ struct ftrace_event_class event_class_ftrace_##call = { \
155 .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ 161 .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\
156}; \ 162}; \
157 \ 163 \
158struct ftrace_event_call __used \ 164struct ftrace_event_call __used event_##call = { \
159__attribute__((__aligned__(4))) \
160__attribute__((section("_ftrace_events"))) event_##call = { \
161 .name = #call, \ 165 .name = #call, \
162 .event.type = etype, \ 166 .event.type = etype, \
163 .class = &event_class_ftrace_##call, \ 167 .class = &event_class_ftrace_##call, \
164 .print_fmt = print, \ 168 .print_fmt = print, \
165}; \ 169}; \
170struct ftrace_event_call __used \
171__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call;
166 172
167#include "trace_entries.h" 173#include "trace_entries.h"
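Two things happen in the __array() change above: the macro body now takes event_storage_mutex around a shared scratch buffer, and it gains a do { ... } while (0) wrapper so the multi-statement body stays safe at every call site. A user-space sketch of why the wrapper matters (the macro names below are made up):

#include <stdio.h>

#define LOG_TWICE_BAD(msg)  printf("%s\n", msg); printf("%s\n", msg)

#define LOG_TWICE_OK(msg)   do { printf("%s\n", msg); printf("%s\n", msg); } while (0)

int main(void)
{
        int quiet = 1;

        if (!quiet)
                LOG_TWICE_OK("hello");   /* both statements guarded by the if */
        if (!quiet)
                LOG_TWICE_BAD("oops");   /* second printf escapes the if and runs anyway */
        return 0;
}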
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 5cf8c602b880..92b6e1e12d98 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -453,14 +453,6 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1)
453 * Stubs: 453 * Stubs:
454 */ 454 */
455 455
456void early_boot_irqs_off(void)
457{
458}
459
460void early_boot_irqs_on(void)
461{
462}
463
464void trace_softirqs_on(unsigned long ip) 456void trace_softirqs_on(unsigned long ip)
465{ 457{
466} 458}
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 155a415b3209..659732eba07c 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -558,7 +558,7 @@ trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr)
558static int trace_wakeup_test_thread(void *data) 558static int trace_wakeup_test_thread(void *data)
559{ 559{
560 /* Make this a RT thread, doesn't need to be too high */ 560 /* Make this a RT thread, doesn't need to be too high */
561 struct sched_param param = { .sched_priority = 5 }; 561 static const struct sched_param param = { .sched_priority = 5 };
562 struct completion *x = data; 562 struct completion *x = data;
563 563
564 sched_setscheduler(current, SCHED_FIFO, &param); 564 sched_setscheduler(current, SCHED_FIFO, &param);
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index bac752f0cfb5..5c9fe08d2093 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -23,9 +23,6 @@ static int syscall_exit_register(struct ftrace_event_call *event,
23static int syscall_enter_define_fields(struct ftrace_event_call *call); 23static int syscall_enter_define_fields(struct ftrace_event_call *call);
24static int syscall_exit_define_fields(struct ftrace_event_call *call); 24static int syscall_exit_define_fields(struct ftrace_event_call *call);
25 25
26/* All syscall exit events have the same fields */
27static LIST_HEAD(syscall_exit_fields);
28
29static struct list_head * 26static struct list_head *
30syscall_get_enter_fields(struct ftrace_event_call *call) 27syscall_get_enter_fields(struct ftrace_event_call *call)
31{ 28{
@@ -34,50 +31,45 @@ syscall_get_enter_fields(struct ftrace_event_call *call)
34 return &entry->enter_fields; 31 return &entry->enter_fields;
35} 32}
36 33
37static struct list_head *
38syscall_get_exit_fields(struct ftrace_event_call *call)
39{
40 return &syscall_exit_fields;
41}
42
43struct trace_event_functions enter_syscall_print_funcs = { 34struct trace_event_functions enter_syscall_print_funcs = {
44 .trace = print_syscall_enter, 35 .trace = print_syscall_enter,
45}; 36};
46 37
47struct trace_event_functions exit_syscall_print_funcs = { 38struct trace_event_functions exit_syscall_print_funcs = {
48 .trace = print_syscall_exit, 39 .trace = print_syscall_exit,
49}; 40};
50 41
51struct ftrace_event_class event_class_syscall_enter = { 42struct ftrace_event_class event_class_syscall_enter = {
52 .system = "syscalls", 43 .system = "syscalls",
53 .reg = syscall_enter_register, 44 .reg = syscall_enter_register,
54 .define_fields = syscall_enter_define_fields, 45 .define_fields = syscall_enter_define_fields,
55 .get_fields = syscall_get_enter_fields, 46 .get_fields = syscall_get_enter_fields,
56 .raw_init = init_syscall_trace, 47 .raw_init = init_syscall_trace,
57}; 48};
58 49
59struct ftrace_event_class event_class_syscall_exit = { 50struct ftrace_event_class event_class_syscall_exit = {
60 .system = "syscalls", 51 .system = "syscalls",
61 .reg = syscall_exit_register, 52 .reg = syscall_exit_register,
62 .define_fields = syscall_exit_define_fields, 53 .define_fields = syscall_exit_define_fields,
63 .get_fields = syscall_get_exit_fields, 54 .fields = LIST_HEAD_INIT(event_class_syscall_exit.fields),
64 .raw_init = init_syscall_trace, 55 .raw_init = init_syscall_trace,
65}; 56};
66 57
67extern unsigned long __start_syscalls_metadata[]; 58extern struct syscall_metadata *__start_syscalls_metadata[];
68extern unsigned long __stop_syscalls_metadata[]; 59extern struct syscall_metadata *__stop_syscalls_metadata[];
69 60
70static struct syscall_metadata **syscalls_metadata; 61static struct syscall_metadata **syscalls_metadata;
71 62
72static struct syscall_metadata *find_syscall_meta(unsigned long syscall) 63static __init struct syscall_metadata *
64find_syscall_meta(unsigned long syscall)
73{ 65{
74 struct syscall_metadata *start; 66 struct syscall_metadata **start;
75 struct syscall_metadata *stop; 67 struct syscall_metadata **stop;
76 char str[KSYM_SYMBOL_LEN]; 68 char str[KSYM_SYMBOL_LEN];
77 69
78 70
79 start = (struct syscall_metadata *)__start_syscalls_metadata; 71 start = __start_syscalls_metadata;
80 stop = (struct syscall_metadata *)__stop_syscalls_metadata; 72 stop = __stop_syscalls_metadata;
81 kallsyms_lookup(syscall, NULL, NULL, NULL, str); 73 kallsyms_lookup(syscall, NULL, NULL, NULL, str);
82 74
83 for ( ; start < stop; start++) { 75 for ( ; start < stop; start++) {
@@ -87,8 +79,8 @@ static struct syscall_metadata *find_syscall_meta(unsigned long syscall)
87 * with "SyS" instead of "sys", leading to an unwanted 79 * with "SyS" instead of "sys", leading to an unwanted
88 * mismatch. 80 * mismatch.
89 */ 81 */
90 if (start->name && !strcmp(start->name + 3, str + 3)) 82 if ((*start)->name && !strcmp((*start)->name + 3, str + 3))
91 return start; 83 return *start;
92 } 84 }
93 return NULL; 85 return NULL;
94} 86}
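The matching logic kept by find_syscall_meta() above compares names past their first three characters so that "sys_" and "SyS_" spellings resolve to the same entry. A standalone user-space illustration; the helper name below is hypothetical, and the real code obtains the symbol string from kallsyms_lookup() rather than a literal:

#include <stdio.h>
#include <string.h>

static int syscall_name_match(const char *meta_name, const char *sym_name)
{
        /* skip the "sys"/"SyS" prefix on both sides before comparing */
        return meta_name && !strcmp(meta_name + 3, sym_name + 3);
}

int main(void)
{
        printf("%d\n", syscall_name_match("sys_read", "SyS_read"));  /* 1 */
        printf("%d\n", syscall_name_match("sys_read", "sys_write")); /* 0 */
        return 0;
}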
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index e95ee7f31d43..68187af4889e 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -27,8 +27,8 @@
27#include <linux/sched.h> 27#include <linux/sched.h>
28#include <linux/jump_label.h> 28#include <linux/jump_label.h>
29 29
30extern struct tracepoint __start___tracepoints[]; 30extern struct tracepoint * const __start___tracepoints_ptrs[];
31extern struct tracepoint __stop___tracepoints[]; 31extern struct tracepoint * const __stop___tracepoints_ptrs[];
32 32
33/* Set to 1 to enable tracepoint debug output */ 33/* Set to 1 to enable tracepoint debug output */
34static const int tracepoint_debug; 34static const int tracepoint_debug;
@@ -298,10 +298,10 @@ static void disable_tracepoint(struct tracepoint *elem)
298 * 298 *
299 * Updates the probe callback corresponding to a range of tracepoints. 299 * Updates the probe callback corresponding to a range of tracepoints.
300 */ 300 */
301void 301void tracepoint_update_probe_range(struct tracepoint * const *begin,
302tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end) 302 struct tracepoint * const *end)
303{ 303{
304 struct tracepoint *iter; 304 struct tracepoint * const *iter;
305 struct tracepoint_entry *mark_entry; 305 struct tracepoint_entry *mark_entry;
306 306
307 if (!begin) 307 if (!begin)
@@ -309,12 +309,12 @@ tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end)
309 309
310 mutex_lock(&tracepoints_mutex); 310 mutex_lock(&tracepoints_mutex);
311 for (iter = begin; iter < end; iter++) { 311 for (iter = begin; iter < end; iter++) {
312 mark_entry = get_tracepoint(iter->name); 312 mark_entry = get_tracepoint((*iter)->name);
313 if (mark_entry) { 313 if (mark_entry) {
314 set_tracepoint(&mark_entry, iter, 314 set_tracepoint(&mark_entry, *iter,
315 !!mark_entry->refcount); 315 !!mark_entry->refcount);
316 } else { 316 } else {
317 disable_tracepoint(iter); 317 disable_tracepoint(*iter);
318 } 318 }
319 } 319 }
320 mutex_unlock(&tracepoints_mutex); 320 mutex_unlock(&tracepoints_mutex);
@@ -326,8 +326,8 @@ tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end)
326static void tracepoint_update_probes(void) 326static void tracepoint_update_probes(void)
327{ 327{
328 /* Core kernel tracepoints */ 328 /* Core kernel tracepoints */
329 tracepoint_update_probe_range(__start___tracepoints, 329 tracepoint_update_probe_range(__start___tracepoints_ptrs,
330 __stop___tracepoints); 330 __stop___tracepoints_ptrs);
331 /* tracepoints in modules. */ 331 /* tracepoints in modules. */
332 module_update_tracepoints(); 332 module_update_tracepoints();
333} 333}
@@ -514,8 +514,8 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_update_all);
514 * Will return the first tracepoint in the range if the input tracepoint is 514 * Will return the first tracepoint in the range if the input tracepoint is
515 * NULL. 515 * NULL.
516 */ 516 */
517int tracepoint_get_iter_range(struct tracepoint **tracepoint, 517int tracepoint_get_iter_range(struct tracepoint * const **tracepoint,
518 struct tracepoint *begin, struct tracepoint *end) 518 struct tracepoint * const *begin, struct tracepoint * const *end)
519{ 519{
520 if (!*tracepoint && begin != end) { 520 if (!*tracepoint && begin != end) {
521 *tracepoint = begin; 521 *tracepoint = begin;
@@ -534,7 +534,8 @@ static void tracepoint_get_iter(struct tracepoint_iter *iter)
534 /* Core kernel tracepoints */ 534 /* Core kernel tracepoints */
535 if (!iter->module) { 535 if (!iter->module) {
536 found = tracepoint_get_iter_range(&iter->tracepoint, 536 found = tracepoint_get_iter_range(&iter->tracepoint,
537 __start___tracepoints, __stop___tracepoints); 537 __start___tracepoints_ptrs,
538 __stop___tracepoints_ptrs);
538 if (found) 539 if (found)
539 goto end; 540 goto end;
540 } 541 }
@@ -585,8 +586,8 @@ int tracepoint_module_notify(struct notifier_block *self,
585 switch (val) { 586 switch (val) {
586 case MODULE_STATE_COMING: 587 case MODULE_STATE_COMING:
587 case MODULE_STATE_GOING: 588 case MODULE_STATE_GOING:
588 tracepoint_update_probe_range(mod->tracepoints, 589 tracepoint_update_probe_range(mod->tracepoints_ptrs,
589 mod->tracepoints + mod->num_tracepoints); 590 mod->tracepoints_ptrs + mod->num_tracepoints);
590 break; 591 break;
591 } 592 }
592 return 0; 593 return 0;
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 25915832291a..9da289c34f22 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -12,6 +12,8 @@
12#include <linux/highuid.h> 12#include <linux/highuid.h>
13#include <linux/cred.h> 13#include <linux/cred.h>
14 14
15static struct kmem_cache *user_ns_cachep __read_mostly;
16
15/* 17/*
16 * Create a new user namespace, deriving the creator from the user in the 18 * Create a new user namespace, deriving the creator from the user in the
17 * passed credentials, and replacing that user with the new root user for the 19 * passed credentials, and replacing that user with the new root user for the
@@ -26,7 +28,7 @@ int create_user_ns(struct cred *new)
26 struct user_struct *root_user; 28 struct user_struct *root_user;
27 int n; 29 int n;
28 30
29 ns = kmalloc(sizeof(struct user_namespace), GFP_KERNEL); 31 ns = kmem_cache_alloc(user_ns_cachep, GFP_KERNEL);
30 if (!ns) 32 if (!ns)
31 return -ENOMEM; 33 return -ENOMEM;
32 34
@@ -38,7 +40,7 @@ int create_user_ns(struct cred *new)
38 /* Alloc new root user. */ 40 /* Alloc new root user. */
39 root_user = alloc_uid(ns, 0); 41 root_user = alloc_uid(ns, 0);
40 if (!root_user) { 42 if (!root_user) {
41 kfree(ns); 43 kmem_cache_free(user_ns_cachep, ns);
42 return -ENOMEM; 44 return -ENOMEM;
43 } 45 }
44 46
@@ -71,7 +73,7 @@ static void free_user_ns_work(struct work_struct *work)
71 struct user_namespace *ns = 73 struct user_namespace *ns =
72 container_of(work, struct user_namespace, destroyer); 74 container_of(work, struct user_namespace, destroyer);
73 free_uid(ns->creator); 75 free_uid(ns->creator);
74 kfree(ns); 76 kmem_cache_free(user_ns_cachep, ns);
75} 77}
76 78
77void free_user_ns(struct kref *kref) 79void free_user_ns(struct kref *kref)
@@ -126,3 +128,10 @@ gid_t user_ns_map_gid(struct user_namespace *to, const struct cred *cred, gid_t
126 /* No useful relationship so no mapping */ 128 /* No useful relationship so no mapping */
127 return overflowgid; 129 return overflowgid;
128} 130}
131
132static __init int user_namespaces_init(void)
133{
134 user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC);
135 return 0;
136}
137module_init(user_namespaces_init);
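The user_namespace.c conversion above is the standard dedicated-slab pattern: create a kmem_cache once at init, then allocate and free fixed-size objects from it instead of using kmalloc()/kfree(). A kernel-style sketch of the same pattern for a hypothetical "foo" object, not this patch's code:

#include <linux/module.h>
#include <linux/init.h>
#include <linux/slab.h>

struct foo {
        int refcount;
        char name[16];
};

static struct kmem_cache *foo_cachep;

static int __init foo_cache_init(void)
{
        struct foo *f;

        foo_cachep = KMEM_CACHE(foo, SLAB_PANIC);       /* SLAB_PANIC: no NULL check needed */

        f = kmem_cache_alloc(foo_cachep, GFP_KERNEL);   /* replaces kmalloc(sizeof(*f), ...) */
        if (f)
                kmem_cache_free(foo_cachep, f);         /* replaces kfree(f) */
        return 0;
}
module_init(foo_cache_init);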
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 5b082156cd21..18bb15776c57 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -27,7 +27,7 @@
27#include <asm/irq_regs.h> 27#include <asm/irq_regs.h>
28#include <linux/perf_event.h> 28#include <linux/perf_event.h>
29 29
30int watchdog_enabled; 30int watchdog_enabled = 1;
31int __read_mostly softlockup_thresh = 60; 31int __read_mostly softlockup_thresh = 60;
32 32
33static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); 33static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
@@ -43,9 +43,6 @@ static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
43static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); 43static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
44#endif 44#endif
45 45
46static int no_watchdog;
47
48
49/* boot commands */ 46/* boot commands */
50/* 47/*
51 * Should we panic when a soft-lockup or hard-lockup occurs: 48 * Should we panic when a soft-lockup or hard-lockup occurs:
@@ -57,6 +54,8 @@ static int __init hardlockup_panic_setup(char *str)
57{ 54{
58 if (!strncmp(str, "panic", 5)) 55 if (!strncmp(str, "panic", 5))
59 hardlockup_panic = 1; 56 hardlockup_panic = 1;
57 else if (!strncmp(str, "0", 1))
58 watchdog_enabled = 0;
60 return 1; 59 return 1;
61} 60}
62__setup("nmi_watchdog=", hardlockup_panic_setup); 61__setup("nmi_watchdog=", hardlockup_panic_setup);
@@ -75,7 +74,7 @@ __setup("softlockup_panic=", softlockup_panic_setup);
75 74
76static int __init nowatchdog_setup(char *str) 75static int __init nowatchdog_setup(char *str)
77{ 76{
78 no_watchdog = 1; 77 watchdog_enabled = 0;
79 return 1; 78 return 1;
80} 79}
81__setup("nowatchdog", nowatchdog_setup); 80__setup("nowatchdog", nowatchdog_setup);
@@ -83,7 +82,7 @@ __setup("nowatchdog", nowatchdog_setup);
83/* deprecated */ 82/* deprecated */
84static int __init nosoftlockup_setup(char *str) 83static int __init nosoftlockup_setup(char *str)
85{ 84{
86 no_watchdog = 1; 85 watchdog_enabled = 0;
87 return 1; 86 return 1;
88} 87}
89__setup("nosoftlockup", nosoftlockup_setup); 88__setup("nosoftlockup", nosoftlockup_setup);
@@ -116,12 +115,12 @@ static void __touch_watchdog(void)
116{ 115{
117 int this_cpu = smp_processor_id(); 116 int this_cpu = smp_processor_id();
118 117
119 __get_cpu_var(watchdog_touch_ts) = get_timestamp(this_cpu); 118 __this_cpu_write(watchdog_touch_ts, get_timestamp(this_cpu));
120} 119}
121 120
122void touch_softlockup_watchdog(void) 121void touch_softlockup_watchdog(void)
123{ 122{
124 __raw_get_cpu_var(watchdog_touch_ts) = 0; 123 __this_cpu_write(watchdog_touch_ts, 0);
125} 124}
126EXPORT_SYMBOL(touch_softlockup_watchdog); 125EXPORT_SYMBOL(touch_softlockup_watchdog);
127 126
@@ -165,12 +164,12 @@ void touch_softlockup_watchdog_sync(void)
165/* watchdog detector functions */ 164/* watchdog detector functions */
166static int is_hardlockup(void) 165static int is_hardlockup(void)
167{ 166{
168 unsigned long hrint = __get_cpu_var(hrtimer_interrupts); 167 unsigned long hrint = __this_cpu_read(hrtimer_interrupts);
169 168
170 if (__get_cpu_var(hrtimer_interrupts_saved) == hrint) 169 if (__this_cpu_read(hrtimer_interrupts_saved) == hrint)
171 return 1; 170 return 1;
172 171
173 __get_cpu_var(hrtimer_interrupts_saved) = hrint; 172 __this_cpu_write(hrtimer_interrupts_saved, hrint);
174 return 0; 173 return 0;
175} 174}
176#endif 175#endif
@@ -203,8 +202,8 @@ static void watchdog_overflow_callback(struct perf_event *event, int nmi,
203 /* Ensure the watchdog never gets throttled */ 202 /* Ensure the watchdog never gets throttled */
204 event->hw.interrupts = 0; 203 event->hw.interrupts = 0;
205 204
206 if (__get_cpu_var(watchdog_nmi_touch) == true) { 205 if (__this_cpu_read(watchdog_nmi_touch) == true) {
207 __get_cpu_var(watchdog_nmi_touch) = false; 206 __this_cpu_write(watchdog_nmi_touch, false);
208 return; 207 return;
209 } 208 }
210 209
@@ -218,7 +217,7 @@ static void watchdog_overflow_callback(struct perf_event *event, int nmi,
218 int this_cpu = smp_processor_id(); 217 int this_cpu = smp_processor_id();
219 218
220 /* only print hardlockups once */ 219 /* only print hardlockups once */
221 if (__get_cpu_var(hard_watchdog_warn) == true) 220 if (__this_cpu_read(hard_watchdog_warn) == true)
222 return; 221 return;
223 222
224 if (hardlockup_panic) 223 if (hardlockup_panic)
@@ -226,16 +225,16 @@ static void watchdog_overflow_callback(struct perf_event *event, int nmi,
226 else 225 else
227 WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu); 226 WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu);
228 227
229 __get_cpu_var(hard_watchdog_warn) = true; 228 __this_cpu_write(hard_watchdog_warn, true);
230 return; 229 return;
231 } 230 }
232 231
233 __get_cpu_var(hard_watchdog_warn) = false; 232 __this_cpu_write(hard_watchdog_warn, false);
234 return; 233 return;
235} 234}
236static void watchdog_interrupt_count(void) 235static void watchdog_interrupt_count(void)
237{ 236{
238 __get_cpu_var(hrtimer_interrupts)++; 237 __this_cpu_inc(hrtimer_interrupts);
239} 238}
240#else 239#else
241static inline void watchdog_interrupt_count(void) { return; } 240static inline void watchdog_interrupt_count(void) { return; }
@@ -244,7 +243,7 @@ static inline void watchdog_interrupt_count(void) { return; }
244/* watchdog kicker functions */ 243/* watchdog kicker functions */
245static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) 244static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
246{ 245{
247 unsigned long touch_ts = __get_cpu_var(watchdog_touch_ts); 246 unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts);
248 struct pt_regs *regs = get_irq_regs(); 247 struct pt_regs *regs = get_irq_regs();
249 int duration; 248 int duration;
250 249
@@ -252,18 +251,18 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
252 watchdog_interrupt_count(); 251 watchdog_interrupt_count();
253 252
254 /* kick the softlockup detector */ 253 /* kick the softlockup detector */
255 wake_up_process(__get_cpu_var(softlockup_watchdog)); 254 wake_up_process(__this_cpu_read(softlockup_watchdog));
256 255
257 /* .. and repeat */ 256 /* .. and repeat */
258 hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period())); 257 hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period()));
259 258
260 if (touch_ts == 0) { 259 if (touch_ts == 0) {
261 if (unlikely(__get_cpu_var(softlockup_touch_sync))) { 260 if (unlikely(__this_cpu_read(softlockup_touch_sync))) {
262 /* 261 /*
263 * If the time stamp was touched atomically 262 * If the time stamp was touched atomically
264 * make sure the scheduler tick is up to date. 263 * make sure the scheduler tick is up to date.
265 */ 264 */
266 __get_cpu_var(softlockup_touch_sync) = false; 265 __this_cpu_write(softlockup_touch_sync, false);
267 sched_clock_tick(); 266 sched_clock_tick();
268 } 267 }
269 __touch_watchdog(); 268 __touch_watchdog();
@@ -279,7 +278,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
279 duration = is_softlockup(touch_ts); 278 duration = is_softlockup(touch_ts);
280 if (unlikely(duration)) { 279 if (unlikely(duration)) {
281 /* only warn once */ 280 /* only warn once */
282 if (__get_cpu_var(soft_watchdog_warn) == true) 281 if (__this_cpu_read(soft_watchdog_warn) == true)
283 return HRTIMER_RESTART; 282 return HRTIMER_RESTART;
284 283
285 printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", 284 printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
@@ -294,9 +293,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
294 293
295 if (softlockup_panic) 294 if (softlockup_panic)
296 panic("softlockup: hung tasks"); 295 panic("softlockup: hung tasks");
297 __get_cpu_var(soft_watchdog_warn) = true; 296 __this_cpu_write(soft_watchdog_warn, true);
298 } else 297 } else
299 __get_cpu_var(soft_watchdog_warn) = false; 298 __this_cpu_write(soft_watchdog_warn, false);
300 299
301 return HRTIMER_RESTART; 300 return HRTIMER_RESTART;
302} 301}
@@ -307,7 +306,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
307 */ 306 */
308static int watchdog(void *unused) 307static int watchdog(void *unused)
309{ 308{
310 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 309 static struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
311 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); 310 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
312 311
313 sched_setscheduler(current, SCHED_FIFO, &param); 312 sched_setscheduler(current, SCHED_FIFO, &param);
@@ -364,8 +363,14 @@ static int watchdog_nmi_enable(int cpu)
364 goto out_save; 363 goto out_save;
365 } 364 }
366 365
367 printk(KERN_ERR "NMI watchdog disabled for cpu%i: unable to create perf event: %ld\n", 366
368 cpu, PTR_ERR(event)); 367 /* vary the KERN level based on the returned errno */
368 if (PTR_ERR(event) == -EOPNOTSUPP)
369 printk(KERN_INFO "NMI watchdog disabled (cpu%i): not supported (no LAPIC?)\n", cpu);
370 else if (PTR_ERR(event) == -ENOENT)
371 printk(KERN_WARNING "NMI watchdog disabled (cpu%i): hardware events not enabled\n", cpu);
372 else
373 printk(KERN_ERR "NMI watchdog disabled (cpu%i): unable to create perf event: %ld\n", cpu, PTR_ERR(event));
369 return PTR_ERR(event); 374 return PTR_ERR(event);
370 375
371 /* success path */ 376 /* success path */
@@ -430,9 +435,6 @@ static int watchdog_enable(int cpu)
430 wake_up_process(p); 435 wake_up_process(p);
431 } 436 }
432 437
433 /* if any cpu succeeds, watchdog is considered enabled for the system */
434 watchdog_enabled = 1;
435
436 return 0; 438 return 0;
437} 439}
438 440
@@ -460,12 +462,16 @@ static void watchdog_disable(int cpu)
460static void watchdog_enable_all_cpus(void) 462static void watchdog_enable_all_cpus(void)
461{ 463{
462 int cpu; 464 int cpu;
463 int result = 0; 465
466 watchdog_enabled = 0;
464 467
465 for_each_online_cpu(cpu) 468 for_each_online_cpu(cpu)
466 result += watchdog_enable(cpu); 469 if (!watchdog_enable(cpu))
470 /* if any cpu succeeds, watchdog is considered
471 enabled for the system */
472 watchdog_enabled = 1;
467 473
468 if (result) 474 if (!watchdog_enabled)
469 printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n"); 475 printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n");
470 476
471} 477}
@@ -474,9 +480,6 @@ static void watchdog_disable_all_cpus(void)
474{ 480{
475 int cpu; 481 int cpu;
476 482
477 if (no_watchdog)
478 return;
479
480 for_each_online_cpu(cpu) 483 for_each_online_cpu(cpu)
481 watchdog_disable(cpu); 484 watchdog_disable(cpu);
482 485
@@ -496,10 +499,12 @@ int proc_dowatchdog_enabled(struct ctl_table *table, int write,
496{ 499{
497 proc_dointvec(table, write, buffer, length, ppos); 500 proc_dointvec(table, write, buffer, length, ppos);
498 501
499 if (watchdog_enabled) 502 if (write) {
500 watchdog_enable_all_cpus(); 503 if (watchdog_enabled)
501 else 504 watchdog_enable_all_cpus();
502 watchdog_disable_all_cpus(); 505 else
506 watchdog_disable_all_cpus();
507 }
503 return 0; 508 return 0;
504} 509}
505 510
@@ -528,7 +533,8 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
528 break; 533 break;
529 case CPU_ONLINE: 534 case CPU_ONLINE:
530 case CPU_ONLINE_FROZEN: 535 case CPU_ONLINE_FROZEN:
531 err = watchdog_enable(hotcpu); 536 if (watchdog_enabled)
537 err = watchdog_enable(hotcpu);
532 break; 538 break;
533#ifdef CONFIG_HOTPLUG_CPU 539#ifdef CONFIG_HOTPLUG_CPU
534 case CPU_UP_CANCELED: 540 case CPU_UP_CANCELED:
@@ -548,20 +554,16 @@ static struct notifier_block __cpuinitdata cpu_nfb = {
548 .notifier_call = cpu_callback 554 .notifier_call = cpu_callback
549}; 555};
550 556
551static int __init spawn_watchdog_task(void) 557void __init lockup_detector_init(void)
552{ 558{
553 void *cpu = (void *)(long)smp_processor_id(); 559 void *cpu = (void *)(long)smp_processor_id();
554 int err; 560 int err;
555 561
556 if (no_watchdog)
557 return 0;
558
559 err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); 562 err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
560 WARN_ON(notifier_to_errno(err)); 563 WARN_ON(notifier_to_errno(err));
561 564
562 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); 565 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
563 register_cpu_notifier(&cpu_nfb); 566 register_cpu_notifier(&cpu_nfb);
564 567
565 return 0; 568 return;
566} 569}
567early_initcall(spawn_watchdog_task);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index e785b0f2aea5..ee6578b578ad 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -79,7 +79,9 @@ enum {
79 MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */ 79 MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */
80 IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */ 80 IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */
81 81
82 MAYDAY_INITIAL_TIMEOUT = HZ / 100, /* call for help after 10ms */ 82 MAYDAY_INITIAL_TIMEOUT = HZ / 100 >= 2 ? HZ / 100 : 2,
83 /* call for help after 10ms
84 (min two ticks) */
83 MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */ 85 MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */
84 CREATE_COOLDOWN = HZ, /* time to breathe after fail */ 86 CREATE_COOLDOWN = HZ, /* time to breathe after fail */

85 TRUSTEE_COOLDOWN = HZ / 10, /* for trustee draining */ 87 TRUSTEE_COOLDOWN = HZ / 10, /* for trustee draining */
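The new MAYDAY_INITIAL_TIMEOUT expression is simply max(HZ / 100, 2) spelled out: on configurations where HZ / 100 rounds down to 0 or 1 jiffies, a timer armed for that many ticks can expire almost immediately (the current tick may already be nearly over), so the patch guarantees at least two ticks. A small stand-alone check of the arithmetic, assuming a few common HZ values:

#include <stdio.h>

/* same expression as the patch, with HZ passed in for illustration */
static int mayday_initial_timeout(int hz)
{
	return hz / 100 >= 2 ? hz / 100 : 2;
}

int main(void)
{
	int hz_values[] = { 100, 250, 300, 1000 };
	unsigned int i;

	for (i = 0; i < sizeof(hz_values) / sizeof(hz_values[0]); i++)
		/* 100 -> 2 (was 1), 250 -> 2, 300 -> 3, 1000 -> 10 jiffies */
		printf("HZ=%4d -> %d jiffies\n", hz_values[i],
		       mayday_initial_timeout(hz_values[i]));
	return 0;
}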
@@ -768,7 +770,11 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
768 770
769 worker->flags &= ~flags; 771 worker->flags &= ~flags;
770 772
771 /* if transitioning out of NOT_RUNNING, increment nr_running */ 773 /*
774 * If transitioning out of NOT_RUNNING, increment nr_running. Note
775 * that the nested NOT_RUNNING is not a noop. NOT_RUNNING is mask
776 * of multiple flags, not a single flag.
777 */
772 if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) 778 if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
773 if (!(worker->flags & WORKER_NOT_RUNNING)) 779 if (!(worker->flags & WORKER_NOT_RUNNING))
774 atomic_inc(get_gcwq_nr_running(gcwq->cpu)); 780 atomic_inc(get_gcwq_nr_running(gcwq->cpu));
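The expanded comment deserves a concrete illustration: because WORKER_NOT_RUNNING is the OR of several flag bits, clearing one of them while another is still set must not bump nr_running, which is exactly what the double test above encodes. A stand-alone sketch with made-up flag values (the real WORKER_* bits live in kernel/workqueue.c):

#include <stdio.h>

/* hypothetical stand-ins for the real worker flag bits */
#define W_ROGUE		(1 << 0)
#define W_REBIND	(1 << 1)
#define W_NOT_RUNNING	(W_ROGUE | W_REBIND)	/* a mask, not one flag */

int main(void)
{
	unsigned int flags = W_ROGUE | W_REBIND;	/* nested NOT_RUNNING */
	unsigned int oflags = flags;

	flags &= ~W_ROGUE;				/* clear only one bit */

	/* we left NOT_RUNNING only if no mask bit remains set */
	if ((W_ROGUE & W_NOT_RUNNING) && (oflags & W_NOT_RUNNING)) {
		if (!(flags & W_NOT_RUNNING))
			printf("would increment nr_running\n");
		else
			printf("still NOT_RUNNING, no increment\n");
	}
	return 0;
}

With both bits set initially, clearing W_ROGUE alone leaves W_REBIND in place, so the inner test keeps nr_running untouched; only clearing the last bit of the mask counts as leaving NOT_RUNNING.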
@@ -932,6 +938,38 @@ static void insert_work(struct cpu_workqueue_struct *cwq,
932 wake_up_worker(gcwq); 938 wake_up_worker(gcwq);
933} 939}
934 940
941/*
942 * Test whether @work is being queued from another work executing on the
943 * same workqueue. This is rather expensive and should only be used from
944 * cold paths.
945 */
946static bool is_chained_work(struct workqueue_struct *wq)
947{
948 unsigned long flags;
949 unsigned int cpu;
950
951 for_each_gcwq_cpu(cpu) {
952 struct global_cwq *gcwq = get_gcwq(cpu);
953 struct worker *worker;
954 struct hlist_node *pos;
955 int i;
956
957 spin_lock_irqsave(&gcwq->lock, flags);
958 for_each_busy_worker(worker, i, pos, gcwq) {
959 if (worker->task != current)
960 continue;
961 spin_unlock_irqrestore(&gcwq->lock, flags);
962 /*
963 * I'm @worker, no locking necessary. See if @work
964 * is headed to the same workqueue.
965 */
966 return worker->current_cwq->wq == wq;
967 }
968 spin_unlock_irqrestore(&gcwq->lock, flags);
969 }
970 return false;
971}
972
935static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, 973static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
936 struct work_struct *work) 974 struct work_struct *work)
937{ 975{
@@ -943,7 +981,9 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
943 981
944 debug_work_activate(work); 982 debug_work_activate(work);
945 983
946 if (WARN_ON_ONCE(wq->flags & WQ_DYING)) 984 /* if dying, only works from the same workqueue are allowed */
985 if (unlikely(wq->flags & WQ_DYING) &&
986 WARN_ON_ONCE(!is_chained_work(wq)))
947 return; 987 return;
948 988
949 /* determine gcwq to use */ 989 /* determine gcwq to use */
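The WQ_DYING relaxation above exists for work items that queue further work on their own workqueue: once destroy_workqueue() has marked the queue dying, only such chained queueing is still allowed. A hedged sketch of the pattern using the ordinary workqueue API (chain_wq, first_fn and second_fn are invented names, not part of the patch):

#include <linux/errno.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/workqueue.h>

static struct workqueue_struct *chain_wq;
static struct work_struct first_work, second_work;

static void second_fn(struct work_struct *work)
{
	/* final step of the chain, nothing more to queue */
}

static void first_fn(struct work_struct *work)
{
	/*
	 * Queueing from inside a work item running on chain_wq is
	 * "chained" work: is_chained_work() sees that current is a
	 * worker of this same workqueue, so the queue_work() below
	 * stays legal even after destroy_workqueue() set WQ_DYING.
	 */
	queue_work(chain_wq, &second_work);
}

static int __init chain_init(void)
{
	chain_wq = alloc_workqueue("chain_wq", 0, 0);
	if (!chain_wq)
		return -ENOMEM;
	INIT_WORK(&first_work, first_fn);
	INIT_WORK(&second_work, second_fn);
	queue_work(chain_wq, &first_work);
	return 0;
}

static void __exit chain_exit(void)
{
	/* sets WQ_DYING and flushes; chained queueing above remains OK */
	destroy_workqueue(chain_wq);
}

module_init(chain_init);
module_exit(chain_exit);
MODULE_LICENSE("GPL");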
@@ -1806,7 +1846,7 @@ __acquires(&gcwq->lock)
1806 spin_unlock_irq(&gcwq->lock); 1846 spin_unlock_irq(&gcwq->lock);
1807 1847
1808 work_clear_pending(work); 1848 work_clear_pending(work);
1809 lock_map_acquire(&cwq->wq->lockdep_map); 1849 lock_map_acquire_read(&cwq->wq->lockdep_map);
1810 lock_map_acquire(&lockdep_map); 1850 lock_map_acquire(&lockdep_map);
1811 trace_workqueue_execute_start(work); 1851 trace_workqueue_execute_start(work);
1812 f(work); 1852 f(work);
@@ -2009,6 +2049,15 @@ repeat:
2009 move_linked_works(work, scheduled, &n); 2049 move_linked_works(work, scheduled, &n);
2010 2050
2011 process_scheduled_works(rescuer); 2051 process_scheduled_works(rescuer);
2052
2053 /*
2054 * Leave this gcwq. If keep_working() is %true, notify a
2055 * regular worker; otherwise, we end up with 0 concurrency
2056 * and stalling the execution.
2057 */
2058 if (keep_working(gcwq))
2059 wake_up_worker(gcwq);
2060
2012 spin_unlock_irq(&gcwq->lock); 2061 spin_unlock_irq(&gcwq->lock);
2013 } 2062 }
2014 2063
@@ -2350,8 +2399,18 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
2350 insert_wq_barrier(cwq, barr, work, worker); 2399 insert_wq_barrier(cwq, barr, work, worker);
2351 spin_unlock_irq(&gcwq->lock); 2400 spin_unlock_irq(&gcwq->lock);
2352 2401
2353 lock_map_acquire(&cwq->wq->lockdep_map); 2402 /*
2403 * If @max_active is 1 or rescuer is in use, flushing another work
2404 * item on the same workqueue may lead to deadlock. Make sure the
2405 * flusher is not running on the same workqueue by verifying write
2406 * access.
2407 */
2408 if (cwq->wq->saved_max_active == 1 || cwq->wq->flags & WQ_RESCUER)
2409 lock_map_acquire(&cwq->wq->lockdep_map);
2410 else
2411 lock_map_acquire_read(&cwq->wq->lockdep_map);
2354 lock_map_release(&cwq->wq->lockdep_map); 2412 lock_map_release(&cwq->wq->lockdep_map);
2413
2355 return true; 2414 return true;
2356already_gone: 2415already_gone:
2357 spin_unlock_irq(&gcwq->lock); 2416 spin_unlock_irq(&gcwq->lock);
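The read-vs-write lockdep annotation above targets a specific deadlock: on a workqueue with max_active == 1 (or one relying on its rescuer), a work item that flushes another item queued on the same workqueue waits forever, since the flushed item cannot start until the flusher finishes. A hedged sketch of the offending pattern the write-acquire is meant to flag (names are invented; create_singlethread_workqueue() yields a max_active == 1 queue):

#include <linux/workqueue.h>

static struct workqueue_struct *st_wq;	/* single-threaded: max_active == 1 */
static struct work_struct producer, consumer;

static void consumer_fn(struct work_struct *work) { }

static void producer_fn(struct work_struct *work)
{
	queue_work(st_wq, &consumer);
	/*
	 * producer_fn waits for consumer, but consumer cannot start
	 * until producer_fn returns (only one slot is active). With
	 * the hunk above, lockdep sees the workqueue's lockdep_map
	 * already read-held by this worker when start_flush_work()
	 * write-acquires it, and reports the deadlock instead of
	 * letting the queue hang silently.
	 */
	flush_work(&consumer);
}

/* setup fragment, e.g. from module init: */
/*
 *	st_wq = create_singlethread_workqueue("st_wq");
 *	INIT_WORK(&producer, producer_fn);
 *	INIT_WORK(&consumer, consumer_fn);
 *	queue_work(st_wq, &producer);
 */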
@@ -2908,7 +2967,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name,
2908 */ 2967 */
2909 spin_lock(&workqueue_lock); 2968 spin_lock(&workqueue_lock);
2910 2969
2911 if (workqueue_freezing && wq->flags & WQ_FREEZEABLE) 2970 if (workqueue_freezing && wq->flags & WQ_FREEZABLE)
2912 for_each_cwq_cpu(cpu, wq) 2971 for_each_cwq_cpu(cpu, wq)
2913 get_cwq(cpu, wq)->max_active = 0; 2972 get_cwq(cpu, wq)->max_active = 0;
2914 2973
@@ -2936,11 +2995,35 @@ EXPORT_SYMBOL_GPL(__alloc_workqueue_key);
2936 */ 2995 */
2937void destroy_workqueue(struct workqueue_struct *wq) 2996void destroy_workqueue(struct workqueue_struct *wq)
2938{ 2997{
2998 unsigned int flush_cnt = 0;
2939 unsigned int cpu; 2999 unsigned int cpu;
2940 3000
3001 /*
3002 * Mark @wq dying and drain all pending works. Once WQ_DYING is
3003 * set, only chain queueing is allowed. IOW, only currently
3004 * pending or running work items on @wq can queue further work
3005 * items on it. @wq is flushed repeatedly until it becomes empty.
3006 * The number of flushes is determined by the depth of chaining and
3007 * should be relatively short. Whine if it takes too long.
3008 */
2941 wq->flags |= WQ_DYING; 3009 wq->flags |= WQ_DYING;
3010reflush:
2942 flush_workqueue(wq); 3011 flush_workqueue(wq);
2943 3012
3013 for_each_cwq_cpu(cpu, wq) {
3014 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3015
3016 if (!cwq->nr_active && list_empty(&cwq->delayed_works))
3017 continue;
3018
3019 if (++flush_cnt == 10 ||
3020 (flush_cnt % 100 == 0 && flush_cnt <= 1000))
3021 printk(KERN_WARNING "workqueue %s: flush on "
3022 "destruction isn't complete after %u tries\n",
3023 wq->name, flush_cnt);
3024 goto reflush;
3025 }
3026
2944 /* 3027 /*
2945 * wq list is used to freeze wq, remove from list after 3028 * wq list is used to freeze wq, remove from list after
2946 * flushing is complete in case freeze races us. 3029 * flushing is complete in case freeze races us.
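The reflush loop above converges because each pass drains one more link of any chain: a chain of N work items, each queueing the next on the same dying workqueue, is emptied after roughly N flushes, and the warning only fires when that count grows suspiciously large (at 10, then every 100 up to 1000). A small sketch of such a bounded chain, with made-up names, to make the "depth of chaining" wording concrete:

#include <linux/workqueue.h>

static struct workqueue_struct *step_wq;
static struct work_struct step_work;
static int steps_left = 3;	/* chain depth; roughly 3 flushes on destroy */

static void step_fn(struct work_struct *work)
{
	/* requeue ourselves until the chain is exhausted */
	if (--steps_left > 0)
		queue_work(step_wq, &step_work);
}

Each flush_workqueue() pass in destroy_workqueue() runs the currently queued instance, which may queue at most one more, so the loop terminates once the chain depth is exhausted rather than spinning forever.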
@@ -2996,7 +3079,7 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
2996 3079
2997 spin_lock_irq(&gcwq->lock); 3080 spin_lock_irq(&gcwq->lock);
2998 3081
2999 if (!(wq->flags & WQ_FREEZEABLE) || 3082 if (!(wq->flags & WQ_FREEZABLE) ||
3000 !(gcwq->flags & GCWQ_FREEZING)) 3083 !(gcwq->flags & GCWQ_FREEZING))
3001 get_cwq(gcwq->cpu, wq)->max_active = max_active; 3084 get_cwq(gcwq->cpu, wq)->max_active = max_active;
3002 3085
@@ -3246,7 +3329,7 @@ static int __cpuinit trustee_thread(void *__gcwq)
3246 * want to get it over with ASAP - spam rescuers, wake up as 3329 * want to get it over with ASAP - spam rescuers, wake up as
3247 * many idlers as necessary and create new ones till the 3330 * many idlers as necessary and create new ones till the
3248 * worklist is empty. Note that if the gcwq is frozen, there 3331 * worklist is empty. Note that if the gcwq is frozen, there
3249 * may be frozen works in freezeable cwqs. Don't declare 3332 * may be frozen works in freezable cwqs. Don't declare
3250 * completion while frozen. 3333 * completion while frozen.
3251 */ 3334 */
3252 while (gcwq->nr_workers != gcwq->nr_idle || 3335 while (gcwq->nr_workers != gcwq->nr_idle ||
@@ -3504,9 +3587,9 @@ EXPORT_SYMBOL_GPL(work_on_cpu);
3504/** 3587/**
3505 * freeze_workqueues_begin - begin freezing workqueues 3588 * freeze_workqueues_begin - begin freezing workqueues
3506 * 3589 *
3507 * Start freezing workqueues. After this function returns, all 3590 * Start freezing workqueues. After this function returns, all freezable
3508 * freezeable workqueues will queue new works to their frozen_works 3591 * workqueues will queue new works to their frozen_works list instead of
3509 * list instead of gcwq->worklist. 3592 * gcwq->worklist.
3510 * 3593 *
3511 * CONTEXT: 3594 * CONTEXT:
3512 * Grabs and releases workqueue_lock and gcwq->lock's. 3595 * Grabs and releases workqueue_lock and gcwq->lock's.
@@ -3532,7 +3615,7 @@ void freeze_workqueues_begin(void)
3532 list_for_each_entry(wq, &workqueues, list) { 3615 list_for_each_entry(wq, &workqueues, list) {
3533 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 3616 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3534 3617
3535 if (cwq && wq->flags & WQ_FREEZEABLE) 3618 if (cwq && wq->flags & WQ_FREEZABLE)
3536 cwq->max_active = 0; 3619 cwq->max_active = 0;
3537 } 3620 }
3538 3621
@@ -3543,7 +3626,7 @@ void freeze_workqueues_begin(void)
3543} 3626}
3544 3627
3545/** 3628/**
3546 * freeze_workqueues_busy - are freezeable workqueues still busy? 3629 * freeze_workqueues_busy - are freezable workqueues still busy?
3547 * 3630 *
3548 * Check whether freezing is complete. This function must be called 3631 * Check whether freezing is complete. This function must be called
3549 * between freeze_workqueues_begin() and thaw_workqueues(). 3632 * between freeze_workqueues_begin() and thaw_workqueues().
@@ -3552,8 +3635,8 @@ void freeze_workqueues_begin(void)
3552 * Grabs and releases workqueue_lock. 3635 * Grabs and releases workqueue_lock.
3553 * 3636 *
3554 * RETURNS: 3637 * RETURNS:
3555 * %true if some freezeable workqueues are still busy. %false if 3638 * %true if some freezable workqueues are still busy. %false if freezing
3556 * freezing is complete. 3639 * is complete.
3557 */ 3640 */
3558bool freeze_workqueues_busy(void) 3641bool freeze_workqueues_busy(void)
3559{ 3642{
@@ -3573,7 +3656,7 @@ bool freeze_workqueues_busy(void)
3573 list_for_each_entry(wq, &workqueues, list) { 3656 list_for_each_entry(wq, &workqueues, list) {
3574 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 3657 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3575 3658
3576 if (!cwq || !(wq->flags & WQ_FREEZEABLE)) 3659 if (!cwq || !(wq->flags & WQ_FREEZABLE))
3577 continue; 3660 continue;
3578 3661
3579 BUG_ON(cwq->nr_active < 0); 3662 BUG_ON(cwq->nr_active < 0);
@@ -3618,7 +3701,7 @@ void thaw_workqueues(void)
3618 list_for_each_entry(wq, &workqueues, list) { 3701 list_for_each_entry(wq, &workqueues, list) {
3619 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 3702 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3620 3703
3621 if (!cwq || !(wq->flags & WQ_FREEZEABLE)) 3704 if (!cwq || !(wq->flags & WQ_FREEZABLE))
3622 continue; 3705 continue;
3623 3706
3624 /* restore max_active and repopulate worklist */ 3707 /* restore max_active and repopulate worklist */