Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 5
-rw-r--r--  kernel/audit.c | 2
-rw-r--r--  kernel/audit_watch.c | 85
-rw-r--r--  kernel/capability.c | 2
-rw-r--r--  kernel/cgroup.c | 109
-rw-r--r--  kernel/compat.c | 136
-rw-r--r--  kernel/cpuset.c | 7
-rw-r--r--  kernel/cred.c | 18
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 2
-rw-r--r--  kernel/exit.c | 16
-rw-r--r--  kernel/fork.c | 50
-rw-r--r--  kernel/freezer.c | 9
-rw-r--r--  kernel/futex.c | 209
-rw-r--r--  kernel/hrtimer.c | 94
-rw-r--r--  kernel/irq/Kconfig | 42
-rw-r--r--  kernel/irq/autoprobe.c | 54
-rw-r--r--  kernel/irq/chip.c | 483
-rw-r--r--  kernel/irq/compat.h | 72
-rw-r--r--  kernel/irq/debug.h | 40
-rw-r--r--  kernel/irq/handle.c | 237
-rw-r--r--  kernel/irq/internals.h | 173
-rw-r--r--  kernel/irq/irqdesc.c | 85
-rw-r--r--  kernel/irq/manage.c | 606
-rw-r--r--  kernel/irq/migration.c | 44
-rw-r--r--  kernel/irq/pm.c | 30
-rw-r--r--  kernel/irq/proc.c | 70
-rw-r--r--  kernel/irq/resend.c | 19
-rw-r--r--  kernel/irq/settings.h | 138
-rw-r--r--  kernel/irq/spurious.c | 163
-rw-r--r--  kernel/irq_work.c | 18
-rw-r--r--  kernel/kexec.c | 2
-rw-r--r--  kernel/kprobes.c | 8
-rw-r--r--  kernel/kthread.c | 2
-rw-r--r--  kernel/latencytop.c | 23
-rw-r--r--  kernel/lockdep.c | 18
-rw-r--r--  kernel/module.c | 16
-rw-r--r--  kernel/panic.c | 1
-rw-r--r--  kernel/params.c | 65
-rw-r--r--  kernel/perf_event.c | 1174
-rw-r--r--  kernel/posix-cpu-timers.c | 110
-rw-r--r--  kernel/posix-timers.c | 342
-rw-r--r--  kernel/power/Kconfig | 5
-rw-r--r--  kernel/power/Makefile | 6
-rw-r--r--  kernel/power/hibernate.c | 11
-rw-r--r--  kernel/power/main.c | 2
-rw-r--r--  kernel/power/nvs.c | 136
-rw-r--r--  kernel/power/process.c | 14
-rw-r--r--  kernel/power/snapshot.c | 7
-rw-r--r--  kernel/power/suspend.c | 6
-rw-r--r--  kernel/power/swap.c | 7
-rw-r--r--  kernel/printk.c | 186
-rw-r--r--  kernel/ptrace.c | 8
-rw-r--r--  kernel/rcutiny.c | 3
-rw-r--r--  kernel/rcutree.c | 4
-rw-r--r--  kernel/rtmutex-debug.c | 1
-rw-r--r--  kernel/rtmutex-tester.c | 40
-rw-r--r--  kernel/rtmutex.c | 318
-rw-r--r--  kernel/rtmutex_common.h | 16
-rw-r--r--  kernel/sched.c | 404
-rw-r--r--  kernel/sched_autogroup.c | 45
-rw-r--r--  kernel/sched_autogroup.h | 9
-rw-r--r--  kernel/sched_debug.c | 44
-rw-r--r--  kernel/sched_fair.c | 507
-rw-r--r--  kernel/sched_idletask.c | 26
-rw-r--r--  kernel/sched_rt.c | 35
-rw-r--r--  kernel/sched_stoptask.c | 7
-rw-r--r--  kernel/smp.c | 75
-rw-r--r--  kernel/softirq.c | 87
-rw-r--r--  kernel/srcu.c | 15
-rw-r--r--  kernel/sys.c | 9
-rw-r--r--  kernel/sys_ni.c | 5
-rw-r--r--  kernel/sysctl.c | 60
-rw-r--r--  kernel/sysctl_binary.c | 21
-rw-r--r--  kernel/taskstats.c | 7
-rw-r--r--  kernel/time.c | 39
-rw-r--r--  kernel/time/Makefile | 3
-rw-r--r--  kernel/time/clockevents.c | 1
-rw-r--r--  kernel/time/clocksource.c | 11
-rw-r--r--  kernel/time/jiffies.c | 20
-rw-r--r--  kernel/time/ntp.c | 438
-rw-r--r--  kernel/time/posix-clock.c | 451
-rw-r--r--  kernel/time/tick-broadcast.c | 11
-rw-r--r--  kernel/time/tick-common.c | 9
-rw-r--r--  kernel/time/tick-internal.h | 12
-rw-r--r--  kernel/time/tick-oneshot.c | 5
-rw-r--r--  kernel/time/tick-sched.c | 8
-rw-r--r--  kernel/time/timekeeping.c | 188
-rw-r--r--  kernel/time/timer_list.c | 4
-rw-r--r--  kernel/timer.c | 50
-rw-r--r--  kernel/trace/Makefile | 2
-rw-r--r--  kernel/trace/blktrace.c | 60
-rw-r--r--  kernel/trace/ftrace.c | 52
-rw-r--r--  kernel/trace/ring_buffer.c | 24
-rw-r--r--  kernel/trace/trace.c | 44
-rw-r--r--  kernel/trace/trace.h | 41
-rw-r--r--  kernel/trace/trace_entries.h | 8
-rw-r--r--  kernel/trace/trace_events.c | 14
-rw-r--r--  kernel/trace/trace_events_filter.c | 885
-rw-r--r--  kernel/trace/trace_export.c | 6
-rw-r--r--  kernel/trace/trace_irqsoff.c | 8
-rw-r--r--  kernel/trace/trace_kprobe.c | 111
-rw-r--r--  kernel/trace/trace_output.c | 36
-rw-r--r--  kernel/trace/trace_sched_switch.c | 48
-rw-r--r--  kernel/trace/trace_selftest.c | 2
-rw-r--r--  kernel/trace/trace_syscalls.c | 92
-rw-r--r--  kernel/tracepoint.c | 31
-rw-r--r--  kernel/user_namespace.c | 15
-rw-r--r--  kernel/watchdog.c | 89
-rw-r--r--  kernel/workqueue.c | 123
109 files changed, 6720 insertions, 3126 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 0b5ff083fa22..353d3fe8ba33 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -43,7 +43,7 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
43obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o 43obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
44obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o 44obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
45obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o 45obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
46obj-$(CONFIG_USE_GENERIC_SMP_HELPERS) += smp.o 46obj-$(CONFIG_SMP) += smp.o
47ifneq ($(CONFIG_SMP),y) 47ifneq ($(CONFIG_SMP),y)
48obj-y += up.o 48obj-y += up.o
49endif 49endif
@@ -100,6 +100,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace/
100obj-$(CONFIG_TRACING) += trace/ 100obj-$(CONFIG_TRACING) += trace/
101obj-$(CONFIG_X86_DS) += trace/ 101obj-$(CONFIG_X86_DS) += trace/
102obj-$(CONFIG_RING_BUFFER) += trace/ 102obj-$(CONFIG_RING_BUFFER) += trace/
103obj-$(CONFIG_TRACEPOINTS) += trace/
103obj-$(CONFIG_SMP) += sched_cpupri.o 104obj-$(CONFIG_SMP) += sched_cpupri.o
104obj-$(CONFIG_IRQ_WORK) += irq_work.o 105obj-$(CONFIG_IRQ_WORK) += irq_work.o
105obj-$(CONFIG_PERF_EVENTS) += perf_event.o 106obj-$(CONFIG_PERF_EVENTS) += perf_event.o
@@ -121,7 +122,7 @@ $(obj)/configs.o: $(obj)/config_data.h
121# config_data.h contains the same information as ikconfig.h but gzipped. 122# config_data.h contains the same information as ikconfig.h but gzipped.
122# Info from config_data can be extracted from /proc/config* 123# Info from config_data can be extracted from /proc/config*
123targets += config_data.gz 124targets += config_data.gz
124$(obj)/config_data.gz: .config FORCE 125$(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
125 $(call if_changed,gzip) 126 $(call if_changed,gzip)
126 127
127quiet_cmd_ikconfiggz = IKCFG $@ 128quiet_cmd_ikconfiggz = IKCFG $@
diff --git a/kernel/audit.c b/kernel/audit.c
index 77770a034d59..e4956244ae50 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -400,7 +400,7 @@ static void kauditd_send_skb(struct sk_buff *skb)
400 if (err < 0) { 400 if (err < 0) {
401 BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */ 401 BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */
402 printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); 402 printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid);
403 audit_log_lost("auditd dissapeared\n"); 403 audit_log_lost("auditd disappeared\n");
404 audit_pid = 0; 404 audit_pid = 0;
405 /* we might get lucky and get this in the next auditd */ 405 /* we might get lucky and get this in the next auditd */
406 audit_hold_skb(skb); 406 audit_hold_skb(skb);
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index d2e3c7866460..e683869365d9 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -144,9 +144,9 @@ int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev)
144} 144}
145 145
146/* Initialize a parent watch entry. */ 146/* Initialize a parent watch entry. */
147static struct audit_parent *audit_init_parent(struct nameidata *ndp) 147static struct audit_parent *audit_init_parent(struct path *path)
148{ 148{
149 struct inode *inode = ndp->path.dentry->d_inode; 149 struct inode *inode = path->dentry->d_inode;
150 struct audit_parent *parent; 150 struct audit_parent *parent;
151 int ret; 151 int ret;
152 152
@@ -353,53 +353,40 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
353} 353}
354 354
355/* Get path information necessary for adding watches. */ 355/* Get path information necessary for adding watches. */
356static int audit_get_nd(char *path, struct nameidata **ndp, struct nameidata **ndw) 356static int audit_get_nd(struct audit_watch *watch, struct path *parent)
357{ 357{
358 struct nameidata *ndparent, *ndwatch; 358 struct nameidata nd;
359 struct dentry *d;
359 int err; 360 int err;
360 361
361 ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL); 362 err = kern_path_parent(watch->path, &nd);
362 if (unlikely(!ndparent)) 363 if (err)
363 return -ENOMEM; 364 return err;
364 365
365 ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL); 366 if (nd.last_type != LAST_NORM) {
366 if (unlikely(!ndwatch)) { 367 path_put(&nd.path);
367 kfree(ndparent); 368 return -EINVAL;
368 return -ENOMEM;
369 } 369 }
370 370
371 err = path_lookup(path, LOOKUP_PARENT, ndparent); 371 mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
372 if (err) { 372 d = lookup_one_len(nd.last.name, nd.path.dentry, nd.last.len);
373 kfree(ndparent); 373 if (IS_ERR(d)) {
374 kfree(ndwatch); 374 mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
375 return err; 375 path_put(&nd.path);
376 return PTR_ERR(d);
376 } 377 }
377 378 if (d->d_inode) {
378 err = path_lookup(path, 0, ndwatch); 379 /* update watch filter fields */
379 if (err) { 380 watch->dev = d->d_inode->i_sb->s_dev;
380 kfree(ndwatch); 381 watch->ino = d->d_inode->i_ino;
381 ndwatch = NULL;
382 } 382 }
383 mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
383 384
384 *ndp = ndparent; 385 *parent = nd.path;
385 *ndw = ndwatch; 386 dput(d);
386
387 return 0; 387 return 0;
388} 388}
389 389
390/* Release resources used for watch path information. */
391static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw)
392{
393 if (ndp) {
394 path_put(&ndp->path);
395 kfree(ndp);
396 }
397 if (ndw) {
398 path_put(&ndw->path);
399 kfree(ndw);
400 }
401}
402
403/* Associate the given rule with an existing parent. 390/* Associate the given rule with an existing parent.
404 * Caller must hold audit_filter_mutex. */ 391 * Caller must hold audit_filter_mutex. */
405static void audit_add_to_parent(struct audit_krule *krule, 392static void audit_add_to_parent(struct audit_krule *krule,
@@ -440,31 +427,24 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list)
440{ 427{
441 struct audit_watch *watch = krule->watch; 428 struct audit_watch *watch = krule->watch;
442 struct audit_parent *parent; 429 struct audit_parent *parent;
443 struct nameidata *ndp = NULL, *ndw = NULL; 430 struct path parent_path;
444 int h, ret = 0; 431 int h, ret = 0;
445 432
446 mutex_unlock(&audit_filter_mutex); 433 mutex_unlock(&audit_filter_mutex);
447 434
448 /* Avoid calling path_lookup under audit_filter_mutex. */ 435 /* Avoid calling path_lookup under audit_filter_mutex. */
449 ret = audit_get_nd(watch->path, &ndp, &ndw); 436 ret = audit_get_nd(watch, &parent_path);
450 if (ret) {
451 /* caller expects mutex locked */
452 mutex_lock(&audit_filter_mutex);
453 goto error;
454 }
455 437
438 /* caller expects mutex locked */
456 mutex_lock(&audit_filter_mutex); 439 mutex_lock(&audit_filter_mutex);
457 440
458 /* update watch filter fields */ 441 if (ret)
459 if (ndw) { 442 return ret;
460 watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev;
461 watch->ino = ndw->path.dentry->d_inode->i_ino;
462 }
463 443
464 /* either find an old parent or attach a new one */ 444 /* either find an old parent or attach a new one */
465 parent = audit_find_parent(ndp->path.dentry->d_inode); 445 parent = audit_find_parent(parent_path.dentry->d_inode);
466 if (!parent) { 446 if (!parent) {
467 parent = audit_init_parent(ndp); 447 parent = audit_init_parent(&parent_path);
468 if (IS_ERR(parent)) { 448 if (IS_ERR(parent)) {
469 ret = PTR_ERR(parent); 449 ret = PTR_ERR(parent);
470 goto error; 450 goto error;
@@ -479,9 +459,8 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list)
479 h = audit_hash_ino((u32)watch->ino); 459 h = audit_hash_ino((u32)watch->ino);
480 *list = &audit_inode_hash[h]; 460 *list = &audit_inode_hash[h];
481error: 461error:
482 audit_put_nd(ndp, ndw); /* NULL args OK */ 462 path_put(&parent_path);
483 return ret; 463 return ret;
484
485} 464}
486 465
487void audit_remove_watch_rule(struct audit_krule *krule) 466void audit_remove_watch_rule(struct audit_krule *krule)
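For reference, the new audit_get_nd() reassembled from the hunk above reads roughly as follows: it replaces the two kmalloc'd nameidata lookups with a single kern_path_parent() plus a lookup_one_len() under the parent's i_mutex, and only fills in the watch's dev/ino if the watched object already exists. This is a best-effort reconstruction from the diff, not the authoritative source.

static int audit_get_nd(struct audit_watch *watch, struct path *parent)
{
        struct nameidata nd;
        struct dentry *d;
        int err;

        /* resolve the parent directory; the last component stays in nd.last */
        err = kern_path_parent(watch->path, &nd);
        if (err)
                return err;

        if (nd.last_type != LAST_NORM) {        /* reject ".", ".." and trailing "/" */
                path_put(&nd.path);
                return -EINVAL;
        }

        mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
        d = lookup_one_len(nd.last.name, nd.path.dentry, nd.last.len);
        if (IS_ERR(d)) {
                mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
                path_put(&nd.path);
                return PTR_ERR(d);
        }
        if (d->d_inode) {
                /* the watched object exists: record dev/ino for rule matching */
                watch->dev = d->d_inode->i_sb->s_dev;
                watch->ino = d->d_inode->i_ino;
        }
        mutex_unlock(&nd.path.dentry->d_inode->i_mutex);

        *parent = nd.path;      /* caller releases this with path_put() */
        dput(d);
        return 0;
}

The caller-side change follows from this: audit_add_watch() now keeps a struct path on its own stack and drops it with a single path_put(), instead of freeing two heap-allocated nameidata structures.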
diff --git a/kernel/capability.c b/kernel/capability.c
index 2f05303715a5..9e9385f132c8 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -306,7 +306,7 @@ int capable(int cap)
306 BUG(); 306 BUG();
307 } 307 }
308 308
309 if (security_capable(cap) == 0) { 309 if (security_capable(current_cred(), cap) == 0) {
310 current->flags |= PF_SUPERPRIV; 310 current->flags |= PF_SUPERPRIV;
311 return 1; 311 return 1;
312 } 312 }
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 66a416b42c18..95362d15128c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -764,6 +764,7 @@ EXPORT_SYMBOL_GPL(cgroup_unlock);
764 */ 764 */
765 765
766static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode); 766static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode);
767static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *);
767static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); 768static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
768static int cgroup_populate_dir(struct cgroup *cgrp); 769static int cgroup_populate_dir(struct cgroup *cgrp);
769static const struct inode_operations cgroup_dir_inode_operations; 770static const struct inode_operations cgroup_dir_inode_operations;
@@ -860,6 +861,11 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
860 iput(inode); 861 iput(inode);
861} 862}
862 863
864static int cgroup_delete(const struct dentry *d)
865{
866 return 1;
867}
868
863static void remove_dir(struct dentry *d) 869static void remove_dir(struct dentry *d)
864{ 870{
865 struct dentry *parent = dget(d->d_parent); 871 struct dentry *parent = dget(d->d_parent);
@@ -874,25 +880,29 @@ static void cgroup_clear_directory(struct dentry *dentry)
874 struct list_head *node; 880 struct list_head *node;
875 881
876 BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); 882 BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
877 spin_lock(&dcache_lock); 883 spin_lock(&dentry->d_lock);
878 node = dentry->d_subdirs.next; 884 node = dentry->d_subdirs.next;
879 while (node != &dentry->d_subdirs) { 885 while (node != &dentry->d_subdirs) {
880 struct dentry *d = list_entry(node, struct dentry, d_u.d_child); 886 struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
887
888 spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
881 list_del_init(node); 889 list_del_init(node);
882 if (d->d_inode) { 890 if (d->d_inode) {
883 /* This should never be called on a cgroup 891 /* This should never be called on a cgroup
884 * directory with child cgroups */ 892 * directory with child cgroups */
885 BUG_ON(d->d_inode->i_mode & S_IFDIR); 893 BUG_ON(d->d_inode->i_mode & S_IFDIR);
886 d = dget_locked(d); 894 dget_dlock(d);
887 spin_unlock(&dcache_lock); 895 spin_unlock(&d->d_lock);
896 spin_unlock(&dentry->d_lock);
888 d_delete(d); 897 d_delete(d);
889 simple_unlink(dentry->d_inode, d); 898 simple_unlink(dentry->d_inode, d);
890 dput(d); 899 dput(d);
891 spin_lock(&dcache_lock); 900 spin_lock(&dentry->d_lock);
892 } 901 } else
902 spin_unlock(&d->d_lock);
893 node = dentry->d_subdirs.next; 903 node = dentry->d_subdirs.next;
894 } 904 }
895 spin_unlock(&dcache_lock); 905 spin_unlock(&dentry->d_lock);
896} 906}
897 907
898/* 908/*
@@ -900,11 +910,16 @@ static void cgroup_clear_directory(struct dentry *dentry)
900 */ 910 */
901static void cgroup_d_remove_dir(struct dentry *dentry) 911static void cgroup_d_remove_dir(struct dentry *dentry)
902{ 912{
913 struct dentry *parent;
914
903 cgroup_clear_directory(dentry); 915 cgroup_clear_directory(dentry);
904 916
905 spin_lock(&dcache_lock); 917 parent = dentry->d_parent;
918 spin_lock(&parent->d_lock);
919 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
906 list_del_init(&dentry->d_u.d_child); 920 list_del_init(&dentry->d_u.d_child);
907 spin_unlock(&dcache_lock); 921 spin_unlock(&dentry->d_lock);
922 spin_unlock(&parent->d_lock);
908 remove_dir(dentry); 923 remove_dir(dentry);
909} 924}
910 925
@@ -1440,6 +1455,11 @@ static int cgroup_set_super(struct super_block *sb, void *data)
1440 1455
1441static int cgroup_get_rootdir(struct super_block *sb) 1456static int cgroup_get_rootdir(struct super_block *sb)
1442{ 1457{
1458 static const struct dentry_operations cgroup_dops = {
1459 .d_iput = cgroup_diput,
1460 .d_delete = cgroup_delete,
1461 };
1462
1443 struct inode *inode = 1463 struct inode *inode =
1444 cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb); 1464 cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb);
1445 struct dentry *dentry; 1465 struct dentry *dentry;
@@ -1457,6 +1477,8 @@ static int cgroup_get_rootdir(struct super_block *sb)
1457 return -ENOMEM; 1477 return -ENOMEM;
1458 } 1478 }
1459 sb->s_root = dentry; 1479 sb->s_root = dentry;
1480 /* for everything else we want ->d_op set */
1481 sb->s_d_op = &cgroup_dops;
1460 return 0; 1482 return 0;
1461} 1483}
1462 1484
@@ -2180,12 +2202,20 @@ static const struct file_operations cgroup_file_operations = {
2180}; 2202};
2181 2203
2182static const struct inode_operations cgroup_dir_inode_operations = { 2204static const struct inode_operations cgroup_dir_inode_operations = {
2183 .lookup = simple_lookup, 2205 .lookup = cgroup_lookup,
2184 .mkdir = cgroup_mkdir, 2206 .mkdir = cgroup_mkdir,
2185 .rmdir = cgroup_rmdir, 2207 .rmdir = cgroup_rmdir,
2186 .rename = cgroup_rename, 2208 .rename = cgroup_rename,
2187}; 2209};
2188 2210
2211static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
2212{
2213 if (dentry->d_name.len > NAME_MAX)
2214 return ERR_PTR(-ENAMETOOLONG);
2215 d_add(dentry, NULL);
2216 return NULL;
2217}
2218
2189/* 2219/*
2190 * Check if a file is a control file 2220 * Check if a file is a control file
2191 */ 2221 */
@@ -2199,10 +2229,6 @@ static inline struct cftype *__file_cft(struct file *file)
2199static int cgroup_create_file(struct dentry *dentry, mode_t mode, 2229static int cgroup_create_file(struct dentry *dentry, mode_t mode,
2200 struct super_block *sb) 2230 struct super_block *sb)
2201{ 2231{
2202 static const struct dentry_operations cgroup_dops = {
2203 .d_iput = cgroup_diput,
2204 };
2205
2206 struct inode *inode; 2232 struct inode *inode;
2207 2233
2208 if (!dentry) 2234 if (!dentry)
@@ -2228,7 +2254,6 @@ static int cgroup_create_file(struct dentry *dentry, mode_t mode,
2228 inode->i_size = 0; 2254 inode->i_size = 0;
2229 inode->i_fop = &cgroup_file_operations; 2255 inode->i_fop = &cgroup_file_operations;
2230 } 2256 }
2231 dentry->d_op = &cgroup_dops;
2232 d_instantiate(dentry, inode); 2257 d_instantiate(dentry, inode);
2233 dget(dentry); /* Extra count - pin the dentry in core */ 2258 dget(dentry); /* Extra count - pin the dentry in core */
2234 return 0; 2259 return 0;
@@ -3638,9 +3663,7 @@ again:
3638 list_del(&cgrp->sibling); 3663 list_del(&cgrp->sibling);
3639 cgroup_unlock_hierarchy(cgrp->root); 3664 cgroup_unlock_hierarchy(cgrp->root);
3640 3665
3641 spin_lock(&cgrp->dentry->d_lock);
3642 d = dget(cgrp->dentry); 3666 d = dget(cgrp->dentry);
3643 spin_unlock(&d->d_lock);
3644 3667
3645 cgroup_d_remove_dir(d); 3668 cgroup_d_remove_dir(d);
3646 dput(d); 3669 dput(d);
@@ -4207,20 +4230,8 @@ void cgroup_post_fork(struct task_struct *child)
4207 */ 4230 */
4208void cgroup_exit(struct task_struct *tsk, int run_callbacks) 4231void cgroup_exit(struct task_struct *tsk, int run_callbacks)
4209{ 4232{
4210 int i;
4211 struct css_set *cg; 4233 struct css_set *cg;
4212 4234 int i;
4213 if (run_callbacks && need_forkexit_callback) {
4214 /*
4215 * modular subsystems can't use callbacks, so no need to lock
4216 * the subsys array
4217 */
4218 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4219 struct cgroup_subsys *ss = subsys[i];
4220 if (ss->exit)
4221 ss->exit(ss, tsk);
4222 }
4223 }
4224 4235
4225 /* 4236 /*
4226 * Unlink from the css_set task list if necessary. 4237 * Unlink from the css_set task list if necessary.
@@ -4238,7 +4249,24 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
4238 task_lock(tsk); 4249 task_lock(tsk);
4239 cg = tsk->cgroups; 4250 cg = tsk->cgroups;
4240 tsk->cgroups = &init_css_set; 4251 tsk->cgroups = &init_css_set;
4252
4253 if (run_callbacks && need_forkexit_callback) {
4254 /*
4255 * modular subsystems can't use callbacks, so no need to lock
4256 * the subsys array
4257 */
4258 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4259 struct cgroup_subsys *ss = subsys[i];
4260 if (ss->exit) {
4261 struct cgroup *old_cgrp =
4262 rcu_dereference_raw(cg->subsys[i])->cgroup;
4263 struct cgroup *cgrp = task_cgroup(tsk, i);
4264 ss->exit(ss, cgrp, old_cgrp, tsk);
4265 }
4266 }
4267 }
4241 task_unlock(tsk); 4268 task_unlock(tsk);
4269
4242 if (cg) 4270 if (cg)
4243 put_css_set_taskexit(cg); 4271 put_css_set_taskexit(cg);
4244} 4272}
@@ -4790,6 +4818,29 @@ css_get_next(struct cgroup_subsys *ss, int id,
4790 return ret; 4818 return ret;
4791} 4819}
4792 4820
4821/*
4822 * get corresponding css from file open on cgroupfs directory
4823 */
4824struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
4825{
4826 struct cgroup *cgrp;
4827 struct inode *inode;
4828 struct cgroup_subsys_state *css;
4829
4830 inode = f->f_dentry->d_inode;
4831 /* check in cgroup filesystem dir */
4832 if (inode->i_op != &cgroup_dir_inode_operations)
4833 return ERR_PTR(-EBADF);
4834
4835 if (id < 0 || id >= CGROUP_SUBSYS_COUNT)
4836 return ERR_PTR(-EINVAL);
4837
4838 /* get cgroup */
4839 cgrp = __d_cgrp(f->f_dentry);
4840 css = cgrp->subsys[id];
4841 return css ? css : ERR_PTR(-ENOENT);
4842}
4843
4793#ifdef CONFIG_CGROUP_DEBUG 4844#ifdef CONFIG_CGROUP_DEBUG
4794static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, 4845static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
4795 struct cgroup *cont) 4846 struct cgroup *cont)
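Taken together, the cgroup dentry changes above drop the global dcache_lock in favour of per-dentry d_lock and install the dentry_operations once per superblock rather than on each dentry. A minimal sketch of that second pattern, reusing names from the hunks; cgroup_setup_root() is a hypothetical wrapper for illustration, not code from the patch:

static int cgroup_delete(const struct dentry *d)
{
        return 1;                       /* never keep unused cgroup dentries cached */
}

static const struct dentry_operations cgroup_dops = {
        .d_iput   = cgroup_diput,       /* existing iput hook in cgroup.c */
        .d_delete = cgroup_delete,
};

/* hypothetical wrapper: what cgroup_get_rootdir() now arranges */
static void cgroup_setup_root(struct super_block *sb, struct dentry *root)
{
        sb->s_root = root;
        /*
         * Every dentry later allocated on this superblock picks up
         * cgroup_dops automatically, so cgroup_create_file() no longer
         * has to assign dentry->d_op by hand.
         */
        sb->s_d_op = &cgroup_dops;
}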
diff --git a/kernel/compat.c b/kernel/compat.c
index c9e2ec0b34a8..38b1d2c1cbe8 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -52,6 +52,64 @@ static int compat_put_timeval(struct compat_timeval __user *o,
52 put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0; 52 put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0;
53} 53}
54 54
55static int compat_get_timex(struct timex *txc, struct compat_timex __user *utp)
56{
57 memset(txc, 0, sizeof(struct timex));
58
59 if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) ||
60 __get_user(txc->modes, &utp->modes) ||
61 __get_user(txc->offset, &utp->offset) ||
62 __get_user(txc->freq, &utp->freq) ||
63 __get_user(txc->maxerror, &utp->maxerror) ||
64 __get_user(txc->esterror, &utp->esterror) ||
65 __get_user(txc->status, &utp->status) ||
66 __get_user(txc->constant, &utp->constant) ||
67 __get_user(txc->precision, &utp->precision) ||
68 __get_user(txc->tolerance, &utp->tolerance) ||
69 __get_user(txc->time.tv_sec, &utp->time.tv_sec) ||
70 __get_user(txc->time.tv_usec, &utp->time.tv_usec) ||
71 __get_user(txc->tick, &utp->tick) ||
72 __get_user(txc->ppsfreq, &utp->ppsfreq) ||
73 __get_user(txc->jitter, &utp->jitter) ||
74 __get_user(txc->shift, &utp->shift) ||
75 __get_user(txc->stabil, &utp->stabil) ||
76 __get_user(txc->jitcnt, &utp->jitcnt) ||
77 __get_user(txc->calcnt, &utp->calcnt) ||
78 __get_user(txc->errcnt, &utp->errcnt) ||
79 __get_user(txc->stbcnt, &utp->stbcnt))
80 return -EFAULT;
81
82 return 0;
83}
84
85static int compat_put_timex(struct compat_timex __user *utp, struct timex *txc)
86{
87 if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) ||
88 __put_user(txc->modes, &utp->modes) ||
89 __put_user(txc->offset, &utp->offset) ||
90 __put_user(txc->freq, &utp->freq) ||
91 __put_user(txc->maxerror, &utp->maxerror) ||
92 __put_user(txc->esterror, &utp->esterror) ||
93 __put_user(txc->status, &utp->status) ||
94 __put_user(txc->constant, &utp->constant) ||
95 __put_user(txc->precision, &utp->precision) ||
96 __put_user(txc->tolerance, &utp->tolerance) ||
97 __put_user(txc->time.tv_sec, &utp->time.tv_sec) ||
98 __put_user(txc->time.tv_usec, &utp->time.tv_usec) ||
99 __put_user(txc->tick, &utp->tick) ||
100 __put_user(txc->ppsfreq, &utp->ppsfreq) ||
101 __put_user(txc->jitter, &utp->jitter) ||
102 __put_user(txc->shift, &utp->shift) ||
103 __put_user(txc->stabil, &utp->stabil) ||
104 __put_user(txc->jitcnt, &utp->jitcnt) ||
105 __put_user(txc->calcnt, &utp->calcnt) ||
106 __put_user(txc->errcnt, &utp->errcnt) ||
107 __put_user(txc->stbcnt, &utp->stbcnt) ||
108 __put_user(txc->tai, &utp->tai))
109 return -EFAULT;
110 return 0;
111}
112
55asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv, 113asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv,
56 struct timezone __user *tz) 114 struct timezone __user *tz)
57{ 115{
@@ -617,6 +675,29 @@ long compat_sys_clock_gettime(clockid_t which_clock,
617 return err; 675 return err;
618} 676}
619 677
678long compat_sys_clock_adjtime(clockid_t which_clock,
679 struct compat_timex __user *utp)
680{
681 struct timex txc;
682 mm_segment_t oldfs;
683 int err, ret;
684
685 err = compat_get_timex(&txc, utp);
686 if (err)
687 return err;
688
689 oldfs = get_fs();
690 set_fs(KERNEL_DS);
691 ret = sys_clock_adjtime(which_clock, (struct timex __user *) &txc);
692 set_fs(oldfs);
693
694 err = compat_put_timex(utp, &txc);
695 if (err)
696 return err;
697
698 return ret;
699}
700
620long compat_sys_clock_getres(clockid_t which_clock, 701long compat_sys_clock_getres(clockid_t which_clock,
621 struct compat_timespec __user *tp) 702 struct compat_timespec __user *tp)
622{ 703{
@@ -951,58 +1032,17 @@ asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat
951asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp) 1032asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp)
952{ 1033{
953 struct timex txc; 1034 struct timex txc;
954 int ret; 1035 int err, ret;
955
956 memset(&txc, 0, sizeof(struct timex));
957 1036
958 if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) || 1037 err = compat_get_timex(&txc, utp);
959 __get_user(txc.modes, &utp->modes) || 1038 if (err)
960 __get_user(txc.offset, &utp->offset) || 1039 return err;
961 __get_user(txc.freq, &utp->freq) ||
962 __get_user(txc.maxerror, &utp->maxerror) ||
963 __get_user(txc.esterror, &utp->esterror) ||
964 __get_user(txc.status, &utp->status) ||
965 __get_user(txc.constant, &utp->constant) ||
966 __get_user(txc.precision, &utp->precision) ||
967 __get_user(txc.tolerance, &utp->tolerance) ||
968 __get_user(txc.time.tv_sec, &utp->time.tv_sec) ||
969 __get_user(txc.time.tv_usec, &utp->time.tv_usec) ||
970 __get_user(txc.tick, &utp->tick) ||
971 __get_user(txc.ppsfreq, &utp->ppsfreq) ||
972 __get_user(txc.jitter, &utp->jitter) ||
973 __get_user(txc.shift, &utp->shift) ||
974 __get_user(txc.stabil, &utp->stabil) ||
975 __get_user(txc.jitcnt, &utp->jitcnt) ||
976 __get_user(txc.calcnt, &utp->calcnt) ||
977 __get_user(txc.errcnt, &utp->errcnt) ||
978 __get_user(txc.stbcnt, &utp->stbcnt))
979 return -EFAULT;
980 1040
981 ret = do_adjtimex(&txc); 1041 ret = do_adjtimex(&txc);
982 1042
983 if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) || 1043 err = compat_put_timex(utp, &txc);
984 __put_user(txc.modes, &utp->modes) || 1044 if (err)
985 __put_user(txc.offset, &utp->offset) || 1045 return err;
986 __put_user(txc.freq, &utp->freq) ||
987 __put_user(txc.maxerror, &utp->maxerror) ||
988 __put_user(txc.esterror, &utp->esterror) ||
989 __put_user(txc.status, &utp->status) ||
990 __put_user(txc.constant, &utp->constant) ||
991 __put_user(txc.precision, &utp->precision) ||
992 __put_user(txc.tolerance, &utp->tolerance) ||
993 __put_user(txc.time.tv_sec, &utp->time.tv_sec) ||
994 __put_user(txc.time.tv_usec, &utp->time.tv_usec) ||
995 __put_user(txc.tick, &utp->tick) ||
996 __put_user(txc.ppsfreq, &utp->ppsfreq) ||
997 __put_user(txc.jitter, &utp->jitter) ||
998 __put_user(txc.shift, &utp->shift) ||
999 __put_user(txc.stabil, &utp->stabil) ||
1000 __put_user(txc.jitcnt, &utp->jitcnt) ||
1001 __put_user(txc.calcnt, &utp->calcnt) ||
1002 __put_user(txc.errcnt, &utp->errcnt) ||
1003 __put_user(txc.stbcnt, &utp->stbcnt) ||
1004 __put_user(txc.tai, &utp->tai))
1005 ret = -EFAULT;
1006 1046
1007 return ret; 1047 return ret;
1008} 1048}
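With the copy-in/copy-out code factored into compat_get_timex() and compat_put_timex(), the existing adjtimex wrapper shrinks to the shape below, and the new compat_sys_clock_adjtime() follows the same pattern around sys_clock_adjtime() under KERNEL_DS. Reassembled from the hunks; treat it as a sketch rather than the verbatim result.

asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp)
{
        struct timex txc;
        int err, ret;

        err = compat_get_timex(&txc, utp);      /* widen the 32-bit timex */
        if (err)
                return err;

        ret = do_adjtimex(&txc);

        err = compat_put_timex(utp, &txc);      /* copy results back, incl. tai */
        if (err)
                return err;

        return ret;
}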
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 4349935c2ad8..e92e98189032 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1575,8 +1575,10 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
1575 return -ENODEV; 1575 return -ENODEV;
1576 1576
1577 trialcs = alloc_trial_cpuset(cs); 1577 trialcs = alloc_trial_cpuset(cs);
1578 if (!trialcs) 1578 if (!trialcs) {
1579 return -ENOMEM; 1579 retval = -ENOMEM;
1580 goto out;
1581 }
1580 1582
1581 switch (cft->private) { 1583 switch (cft->private) {
1582 case FILE_CPULIST: 1584 case FILE_CPULIST:
@@ -1591,6 +1593,7 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
1591 } 1593 }
1592 1594
1593 free_trial_cpuset(trialcs); 1595 free_trial_cpuset(trialcs);
1596out:
1594 cgroup_unlock(); 1597 cgroup_unlock();
1595 return retval; 1598 return retval;
1596} 1599}
diff --git a/kernel/cred.c b/kernel/cred.c
index 6a1aa004e376..2343c132c5a7 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -35,7 +35,7 @@ static struct kmem_cache *cred_jar;
35static struct thread_group_cred init_tgcred = { 35static struct thread_group_cred init_tgcred = {
36 .usage = ATOMIC_INIT(2), 36 .usage = ATOMIC_INIT(2),
37 .tgid = 0, 37 .tgid = 0,
38 .lock = SPIN_LOCK_UNLOCKED, 38 .lock = __SPIN_LOCK_UNLOCKED(init_cred.tgcred.lock),
39}; 39};
40#endif 40#endif
41 41
@@ -252,13 +252,13 @@ struct cred *cred_alloc_blank(void)
252#endif 252#endif
253 253
254 atomic_set(&new->usage, 1); 254 atomic_set(&new->usage, 1);
255#ifdef CONFIG_DEBUG_CREDENTIALS
256 new->magic = CRED_MAGIC;
257#endif
255 258
256 if (security_cred_alloc_blank(new, GFP_KERNEL) < 0) 259 if (security_cred_alloc_blank(new, GFP_KERNEL) < 0)
257 goto error; 260 goto error;
258 261
259#ifdef CONFIG_DEBUG_CREDENTIALS
260 new->magic = CRED_MAGIC;
261#endif
262 return new; 262 return new;
263 263
264error: 264error:
@@ -657,6 +657,8 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
657 validate_creds(old); 657 validate_creds(old);
658 658
659 *new = *old; 659 *new = *old;
660 atomic_set(&new->usage, 1);
661 set_cred_subscribers(new, 0);
660 get_uid(new->user); 662 get_uid(new->user);
661 get_group_info(new->group_info); 663 get_group_info(new->group_info);
662 664
@@ -674,8 +676,6 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
674 if (security_prepare_creds(new, old, GFP_KERNEL) < 0) 676 if (security_prepare_creds(new, old, GFP_KERNEL) < 0)
675 goto error; 677 goto error;
676 678
677 atomic_set(&new->usage, 1);
678 set_cred_subscribers(new, 0);
679 put_cred(old); 679 put_cred(old);
680 validate_creds(new); 680 validate_creds(new);
681 return new; 681 return new;
@@ -748,7 +748,11 @@ bool creds_are_invalid(const struct cred *cred)
748 if (cred->magic != CRED_MAGIC) 748 if (cred->magic != CRED_MAGIC)
749 return true; 749 return true;
750#ifdef CONFIG_SECURITY_SELINUX 750#ifdef CONFIG_SECURITY_SELINUX
751 if (selinux_is_enabled()) { 751 /*
752 * cred->security == NULL if security_cred_alloc_blank() or
753 * security_prepare_creds() returned an error.
754 */
755 if (selinux_is_enabled() && cred->security) {
752 if ((unsigned long) cred->security < PAGE_SIZE) 756 if ((unsigned long) cred->security < PAGE_SIZE)
753 return true; 757 return true;
754 if ((*(u32 *)cred->security & 0xffffff00) == 758 if ((*(u32 *)cred->security & 0xffffff00) ==
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index a6e729766821..bd3e8e29caa3 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -2914,7 +2914,7 @@ static void __init kdb_cmd_init(void)
2914 } 2914 }
2915} 2915}
2916 2916
2917/* Intialize kdb_printf, breakpoint tables and kdb state */ 2917/* Initialize kdb_printf, breakpoint tables and kdb state */
2918void __init kdb_init(int lvl) 2918void __init kdb_init(int lvl)
2919{ 2919{
2920 static int kdb_init_lvl = KDB_NOT_INITIALIZED; 2920 static int kdb_init_lvl = KDB_NOT_INITIALIZED;
diff --git a/kernel/exit.c b/kernel/exit.c
index 676149a4ac5f..f9a45ebcc7b1 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -69,7 +69,7 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
69 69
70 list_del_rcu(&p->tasks); 70 list_del_rcu(&p->tasks);
71 list_del_init(&p->sibling); 71 list_del_init(&p->sibling);
72 __get_cpu_var(process_counts)--; 72 __this_cpu_dec(process_counts);
73 } 73 }
74 list_del_rcu(&p->thread_group); 74 list_del_rcu(&p->thread_group);
75} 75}
@@ -994,6 +994,15 @@ NORET_TYPE void do_exit(long code)
994 exit_fs(tsk); 994 exit_fs(tsk);
995 check_stack_usage(); 995 check_stack_usage();
996 exit_thread(); 996 exit_thread();
997
998 /*
999 * Flush inherited counters to the parent - before the parent
1000 * gets woken up by child-exit notifications.
1001 *
1002 * because of cgroup mode, must be called before cgroup_exit()
1003 */
1004 perf_event_exit_task(tsk);
1005
997 cgroup_exit(tsk, 1); 1006 cgroup_exit(tsk, 1);
998 1007
999 if (group_dead) 1008 if (group_dead)
@@ -1007,11 +1016,6 @@ NORET_TYPE void do_exit(long code)
1007 * FIXME: do that only when needed, using sched_exit tracepoint 1016 * FIXME: do that only when needed, using sched_exit tracepoint
1008 */ 1017 */
1009 flush_ptrace_hw_breakpoint(tsk); 1018 flush_ptrace_hw_breakpoint(tsk);
1010 /*
1011 * Flush inherited counters to the parent - before the parent
1012 * gets woken up by child-exit notifications.
1013 */
1014 perf_event_exit_task(tsk);
1015 1019
1016 exit_notify(tsk, group_dead); 1020 exit_notify(tsk, group_dead);
1017#ifdef CONFIG_NUMA 1021#ifdef CONFIG_NUMA
diff --git a/kernel/fork.c b/kernel/fork.c
index 7d164e25b0f0..25e429152ddc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -66,6 +66,7 @@
66#include <linux/posix-timers.h> 66#include <linux/posix-timers.h>
67#include <linux/user-return-notifier.h> 67#include <linux/user-return-notifier.h>
68#include <linux/oom.h> 68#include <linux/oom.h>
69#include <linux/khugepaged.h>
69 70
70#include <asm/pgtable.h> 71#include <asm/pgtable.h>
71#include <asm/pgalloc.h> 72#include <asm/pgalloc.h>
@@ -169,15 +170,14 @@ EXPORT_SYMBOL(free_task);
169static inline void free_signal_struct(struct signal_struct *sig) 170static inline void free_signal_struct(struct signal_struct *sig)
170{ 171{
171 taskstats_tgid_free(sig); 172 taskstats_tgid_free(sig);
173 sched_autogroup_exit(sig);
172 kmem_cache_free(signal_cachep, sig); 174 kmem_cache_free(signal_cachep, sig);
173} 175}
174 176
175static inline void put_signal_struct(struct signal_struct *sig) 177static inline void put_signal_struct(struct signal_struct *sig)
176{ 178{
177 if (atomic_dec_and_test(&sig->sigcnt)) { 179 if (atomic_dec_and_test(&sig->sigcnt))
178 sched_autogroup_exit(sig);
179 free_signal_struct(sig); 180 free_signal_struct(sig);
180 }
181} 181}
182 182
183void __put_task_struct(struct task_struct *tsk) 183void __put_task_struct(struct task_struct *tsk)
@@ -331,6 +331,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
331 retval = ksm_fork(mm, oldmm); 331 retval = ksm_fork(mm, oldmm);
332 if (retval) 332 if (retval)
333 goto out; 333 goto out;
334 retval = khugepaged_fork(mm, oldmm);
335 if (retval)
336 goto out;
334 337
335 prev = NULL; 338 prev = NULL;
336 for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { 339 for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
@@ -530,6 +533,9 @@ void __mmdrop(struct mm_struct *mm)
530 mm_free_pgd(mm); 533 mm_free_pgd(mm);
531 destroy_context(mm); 534 destroy_context(mm);
532 mmu_notifier_mm_destroy(mm); 535 mmu_notifier_mm_destroy(mm);
536#ifdef CONFIG_TRANSPARENT_HUGEPAGE
537 VM_BUG_ON(mm->pmd_huge_pte);
538#endif
533 free_mm(mm); 539 free_mm(mm);
534} 540}
535EXPORT_SYMBOL_GPL(__mmdrop); 541EXPORT_SYMBOL_GPL(__mmdrop);
@@ -544,6 +550,7 @@ void mmput(struct mm_struct *mm)
544 if (atomic_dec_and_test(&mm->mm_users)) { 550 if (atomic_dec_and_test(&mm->mm_users)) {
545 exit_aio(mm); 551 exit_aio(mm);
546 ksm_exit(mm); 552 ksm_exit(mm);
553 khugepaged_exit(mm); /* must run before exit_mmap */
547 exit_mmap(mm); 554 exit_mmap(mm);
548 set_mm_exe_file(mm, NULL); 555 set_mm_exe_file(mm, NULL);
549 if (!list_empty(&mm->mmlist)) { 556 if (!list_empty(&mm->mmlist)) {
@@ -670,6 +677,10 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
670 mm->token_priority = 0; 677 mm->token_priority = 0;
671 mm->last_interval = 0; 678 mm->last_interval = 0;
672 679
680#ifdef CONFIG_TRANSPARENT_HUGEPAGE
681 mm->pmd_huge_pte = NULL;
682#endif
683
673 if (!mm_init(mm, tsk)) 684 if (!mm_init(mm, tsk))
674 goto fail_nomem; 685 goto fail_nomem;
675 686
@@ -911,6 +922,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
911 922
912 sig->oom_adj = current->signal->oom_adj; 923 sig->oom_adj = current->signal->oom_adj;
913 sig->oom_score_adj = current->signal->oom_score_adj; 924 sig->oom_score_adj = current->signal->oom_score_adj;
925 sig->oom_score_adj_min = current->signal->oom_score_adj_min;
914 926
915 mutex_init(&sig->cred_guard_mutex); 927 mutex_init(&sig->cred_guard_mutex);
916 928
@@ -1286,7 +1298,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1286 attach_pid(p, PIDTYPE_SID, task_session(current)); 1298 attach_pid(p, PIDTYPE_SID, task_session(current));
1287 list_add_tail(&p->sibling, &p->real_parent->children); 1299 list_add_tail(&p->sibling, &p->real_parent->children);
1288 list_add_tail_rcu(&p->tasks, &init_task.tasks); 1300 list_add_tail_rcu(&p->tasks, &init_task.tasks);
1289 __get_cpu_var(process_counts)++; 1301 __this_cpu_inc(process_counts);
1290 } 1302 }
1291 attach_pid(p, PIDTYPE_PID, pid); 1303 attach_pid(p, PIDTYPE_PID, pid);
1292 nr_threads++; 1304 nr_threads++;
@@ -1318,7 +1330,7 @@ bad_fork_cleanup_mm:
1318 } 1330 }
1319bad_fork_cleanup_signal: 1331bad_fork_cleanup_signal:
1320 if (!(clone_flags & CLONE_THREAD)) 1332 if (!(clone_flags & CLONE_THREAD))
1321 put_signal_struct(p->signal); 1333 free_signal_struct(p->signal);
1322bad_fork_cleanup_sighand: 1334bad_fork_cleanup_sighand:
1323 __cleanup_sighand(p->sighand); 1335 __cleanup_sighand(p->sighand);
1324bad_fork_cleanup_fs: 1336bad_fork_cleanup_fs:
@@ -1411,23 +1423,6 @@ long do_fork(unsigned long clone_flags,
1411 } 1423 }
1412 1424
1413 /* 1425 /*
1414 * We hope to recycle these flags after 2.6.26
1415 */
1416 if (unlikely(clone_flags & CLONE_STOPPED)) {
1417 static int __read_mostly count = 100;
1418
1419 if (count > 0 && printk_ratelimit()) {
1420 char comm[TASK_COMM_LEN];
1421
1422 count--;
1423 printk(KERN_INFO "fork(): process `%s' used deprecated "
1424 "clone flags 0x%lx\n",
1425 get_task_comm(comm, current),
1426 clone_flags & CLONE_STOPPED);
1427 }
1428 }
1429
1430 /*
1431 * When called from kernel_thread, don't do user tracing stuff. 1426 * When called from kernel_thread, don't do user tracing stuff.
1432 */ 1427 */
1433 if (likely(user_mode(regs))) 1428 if (likely(user_mode(regs)))
@@ -1465,16 +1460,7 @@ long do_fork(unsigned long clone_flags,
1465 */ 1460 */
1466 p->flags &= ~PF_STARTING; 1461 p->flags &= ~PF_STARTING;
1467 1462
1468 if (unlikely(clone_flags & CLONE_STOPPED)) { 1463 wake_up_new_task(p, clone_flags);
1469 /*
1470 * We'll start up with an immediate SIGSTOP.
1471 */
1472 sigaddset(&p->pending.signal, SIGSTOP);
1473 set_tsk_thread_flag(p, TIF_SIGPENDING);
1474 __set_task_state(p, TASK_STOPPED);
1475 } else {
1476 wake_up_new_task(p, clone_flags);
1477 }
1478 1464
1479 tracehook_report_clone_complete(trace, regs, 1465 tracehook_report_clone_complete(trace, regs,
1480 clone_flags, nr, p); 1466 clone_flags, nr, p);
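Two things stand out in the fork.c diff besides the CLONE_STOPPED removal: the per-CPU counter updates switch from __get_cpu_var(process_counts)++ to __this_cpu_inc(process_counts), matching the exit.c hunk, and transparent hugepage state gains explicit khugepaged/pmd_huge_pte init and teardown. A small standalone sketch of the per-CPU accessor idiom, reusing the counter name from the hunks purely for illustration:

#include <linux/percpu.h>

/* fork.c already defines this counter; repeated here only to keep the
 * sketch self-contained. */
DEFINE_PER_CPU(unsigned long, process_counts) = 0;

static void count_new_task(void)
{
        /*
         * was: __get_cpu_var(process_counts)++;
         * __this_cpu_inc() lets the architecture do the whole
         * read-modify-write on the local CPU's copy in one step
         * (a single %gs-relative increment on x86) instead of
         * computing the per-CPU address first.
         */
        __this_cpu_inc(process_counts);
}

static void count_exiting_task(void)
{
        __this_cpu_dec(process_counts); /* mirrors the exit.c hunk */
}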
diff --git a/kernel/freezer.c b/kernel/freezer.c
index bd1d42b17cb2..66ecd2ead215 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -104,8 +104,13 @@ bool freeze_task(struct task_struct *p, bool sig_only)
104 } 104 }
105 105
106 if (should_send_signal(p)) { 106 if (should_send_signal(p)) {
107 if (!signal_pending(p)) 107 fake_signal_wake_up(p);
108 fake_signal_wake_up(p); 108 /*
109 * fake_signal_wake_up() goes through p's scheduler
110 * lock and guarantees that TASK_STOPPED/TRACED ->
111 * TASK_RUNNING transition can't race with task state
112 * testing in try_to_freeze_tasks().
113 */
109 } else if (sig_only) { 114 } else if (sig_only) {
110 return false; 115 return false;
111 } else { 116 } else {
diff --git a/kernel/futex.c b/kernel/futex.c
index 3019b92e6917..bda415715382 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -233,7 +233,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
233{ 233{
234 unsigned long address = (unsigned long)uaddr; 234 unsigned long address = (unsigned long)uaddr;
235 struct mm_struct *mm = current->mm; 235 struct mm_struct *mm = current->mm;
236 struct page *page; 236 struct page *page, *page_head;
237 int err; 237 int err;
238 238
239 /* 239 /*
@@ -265,11 +265,46 @@ again:
265 if (err < 0) 265 if (err < 0)
266 return err; 266 return err;
267 267
268 page = compound_head(page); 268#ifdef CONFIG_TRANSPARENT_HUGEPAGE
269 lock_page(page); 269 page_head = page;
270 if (!page->mapping) { 270 if (unlikely(PageTail(page))) {
271 unlock_page(page);
272 put_page(page); 271 put_page(page);
272 /* serialize against __split_huge_page_splitting() */
273 local_irq_disable();
274 if (likely(__get_user_pages_fast(address, 1, 1, &page) == 1)) {
275 page_head = compound_head(page);
276 /*
277 * page_head is valid pointer but we must pin
278 * it before taking the PG_lock and/or
279 * PG_compound_lock. The moment we re-enable
280 * irqs __split_huge_page_splitting() can
281 * return and the head page can be freed from
282 * under us. We can't take the PG_lock and/or
283 * PG_compound_lock on a page that could be
284 * freed from under us.
285 */
286 if (page != page_head) {
287 get_page(page_head);
288 put_page(page);
289 }
290 local_irq_enable();
291 } else {
292 local_irq_enable();
293 goto again;
294 }
295 }
296#else
297 page_head = compound_head(page);
298 if (page != page_head) {
299 get_page(page_head);
300 put_page(page);
301 }
302#endif
303
304 lock_page(page_head);
305 if (!page_head->mapping) {
306 unlock_page(page_head);
307 put_page(page_head);
273 goto again; 308 goto again;
274 } 309 }
275 310
@@ -280,20 +315,20 @@ again:
280 * it's a read-only handle, it's expected that futexes attach to 315 * it's a read-only handle, it's expected that futexes attach to
281 * the object not the particular process. 316 * the object not the particular process.
282 */ 317 */
283 if (PageAnon(page)) { 318 if (PageAnon(page_head)) {
284 key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ 319 key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
285 key->private.mm = mm; 320 key->private.mm = mm;
286 key->private.address = address; 321 key->private.address = address;
287 } else { 322 } else {
288 key->both.offset |= FUT_OFF_INODE; /* inode-based key */ 323 key->both.offset |= FUT_OFF_INODE; /* inode-based key */
289 key->shared.inode = page->mapping->host; 324 key->shared.inode = page_head->mapping->host;
290 key->shared.pgoff = page->index; 325 key->shared.pgoff = page_head->index;
291 } 326 }
292 327
293 get_futex_key_refs(key); 328 get_futex_key_refs(key);
294 329
295 unlock_page(page); 330 unlock_page(page_head);
296 put_page(page); 331 put_page(page_head);
297 return 0; 332 return 0;
298} 333}
299 334
@@ -346,15 +381,16 @@ static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb,
346 return NULL; 381 return NULL;
347} 382}
348 383
349static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval) 384static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr,
385 u32 uval, u32 newval)
350{ 386{
351 u32 curval; 387 int ret;
352 388
353 pagefault_disable(); 389 pagefault_disable();
354 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); 390 ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
355 pagefault_enable(); 391 pagefault_enable();
356 392
357 return curval; 393 return ret;
358} 394}
359 395
360static int get_futex_value_locked(u32 *dest, u32 __user *from) 396static int get_futex_value_locked(u32 *dest, u32 __user *from)
@@ -639,7 +675,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
639 struct task_struct *task, int set_waiters) 675 struct task_struct *task, int set_waiters)
640{ 676{
641 int lock_taken, ret, ownerdied = 0; 677 int lock_taken, ret, ownerdied = 0;
642 u32 uval, newval, curval; 678 u32 uval, newval, curval, vpid = task_pid_vnr(task);
643 679
644retry: 680retry:
645 ret = lock_taken = 0; 681 ret = lock_taken = 0;
@@ -649,19 +685,17 @@ retry:
649 * (by doing a 0 -> TID atomic cmpxchg), while holding all 685 * (by doing a 0 -> TID atomic cmpxchg), while holding all
650 * the locks. It will most likely not succeed. 686 * the locks. It will most likely not succeed.
651 */ 687 */
652 newval = task_pid_vnr(task); 688 newval = vpid;
653 if (set_waiters) 689 if (set_waiters)
654 newval |= FUTEX_WAITERS; 690 newval |= FUTEX_WAITERS;
655 691
656 curval = cmpxchg_futex_value_locked(uaddr, 0, newval); 692 if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, 0, newval)))
657
658 if (unlikely(curval == -EFAULT))
659 return -EFAULT; 693 return -EFAULT;
660 694
661 /* 695 /*
662 * Detect deadlocks. 696 * Detect deadlocks.
663 */ 697 */
664 if ((unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(task)))) 698 if ((unlikely((curval & FUTEX_TID_MASK) == vpid)))
665 return -EDEADLK; 699 return -EDEADLK;
666 700
667 /* 701 /*
@@ -688,14 +722,12 @@ retry:
688 */ 722 */
689 if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { 723 if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
690 /* Keep the OWNER_DIED bit */ 724 /* Keep the OWNER_DIED bit */
691 newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(task); 725 newval = (curval & ~FUTEX_TID_MASK) | vpid;
692 ownerdied = 0; 726 ownerdied = 0;
693 lock_taken = 1; 727 lock_taken = 1;
694 } 728 }
695 729
696 curval = cmpxchg_futex_value_locked(uaddr, uval, newval); 730 if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
697
698 if (unlikely(curval == -EFAULT))
699 return -EFAULT; 731 return -EFAULT;
700 if (unlikely(curval != uval)) 732 if (unlikely(curval != uval))
701 goto retry; 733 goto retry;
@@ -740,6 +772,24 @@ retry:
740 return ret; 772 return ret;
741} 773}
742 774
775/**
776 * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket
777 * @q: The futex_q to unqueue
778 *
779 * The q->lock_ptr must not be NULL and must be held by the caller.
780 */
781static void __unqueue_futex(struct futex_q *q)
782{
783 struct futex_hash_bucket *hb;
784
785 if (WARN_ON(!q->lock_ptr || !spin_is_locked(q->lock_ptr)
786 || plist_node_empty(&q->list)))
787 return;
788
789 hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
790 plist_del(&q->list, &hb->chain);
791}
792
743/* 793/*
744 * The hash bucket lock must be held when this is called. 794 * The hash bucket lock must be held when this is called.
745 * Afterwards, the futex_q must not be accessed. 795 * Afterwards, the futex_q must not be accessed.
@@ -757,7 +807,7 @@ static void wake_futex(struct futex_q *q)
757 */ 807 */
758 get_task_struct(p); 808 get_task_struct(p);
759 809
760 plist_del(&q->list, &q->list.plist); 810 __unqueue_futex(q);
761 /* 811 /*
762 * The waiting task can free the futex_q as soon as 812 * The waiting task can free the futex_q as soon as
763 * q->lock_ptr = NULL is written, without taking any locks. A 813 * q->lock_ptr = NULL is written, without taking any locks. A
@@ -791,10 +841,9 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
791 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); 841 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
792 842
793 /* 843 /*
794 * This happens when we have stolen the lock and the original 844 * It is possible that the next waiter (the one that brought
795 * pending owner did not enqueue itself back on the rt_mutex. 845 * this owner to the kernel) timed out and is no longer
796 * Thats not a tragedy. We know that way, that a lock waiter 846 * waiting on the lock.
797 * is on the fly. We make the futex_q waiter the pending owner.
798 */ 847 */
799 if (!new_owner) 848 if (!new_owner)
800 new_owner = this->task; 849 new_owner = this->task;
@@ -809,9 +858,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
809 858
810 newval = FUTEX_WAITERS | task_pid_vnr(new_owner); 859 newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
811 860
812 curval = cmpxchg_futex_value_locked(uaddr, uval, newval); 861 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
813
814 if (curval == -EFAULT)
815 ret = -EFAULT; 862 ret = -EFAULT;
816 else if (curval != uval) 863 else if (curval != uval)
817 ret = -EINVAL; 864 ret = -EINVAL;
@@ -846,10 +893,8 @@ static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
846 * There is no waiter, so we unlock the futex. The owner died 893 * There is no waiter, so we unlock the futex. The owner died
847 * bit has not to be preserved here. We are the owner: 894 * bit has not to be preserved here. We are the owner:
848 */ 895 */
849 oldval = cmpxchg_futex_value_locked(uaddr, uval, 0); 896 if (cmpxchg_futex_value_locked(&oldval, uaddr, uval, 0))
850 897 return -EFAULT;
851 if (oldval == -EFAULT)
852 return oldval;
853 if (oldval != uval) 898 if (oldval != uval)
854 return -EAGAIN; 899 return -EAGAIN;
855 900
@@ -1037,9 +1082,6 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
1037 plist_del(&q->list, &hb1->chain); 1082 plist_del(&q->list, &hb1->chain);
1038 plist_add(&q->list, &hb2->chain); 1083 plist_add(&q->list, &hb2->chain);
1039 q->lock_ptr = &hb2->lock; 1084 q->lock_ptr = &hb2->lock;
1040#ifdef CONFIG_DEBUG_PI_LIST
1041 q->list.plist.spinlock = &hb2->lock;
1042#endif
1043 } 1085 }
1044 get_futex_key_refs(key2); 1086 get_futex_key_refs(key2);
1045 q->key = *key2; 1087 q->key = *key2;
@@ -1066,16 +1108,12 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
1066 get_futex_key_refs(key); 1108 get_futex_key_refs(key);
1067 q->key = *key; 1109 q->key = *key;
1068 1110
1069 WARN_ON(plist_node_empty(&q->list)); 1111 __unqueue_futex(q);
1070 plist_del(&q->list, &q->list.plist);
1071 1112
1072 WARN_ON(!q->rt_waiter); 1113 WARN_ON(!q->rt_waiter);
1073 q->rt_waiter = NULL; 1114 q->rt_waiter = NULL;
1074 1115
1075 q->lock_ptr = &hb->lock; 1116 q->lock_ptr = &hb->lock;
1076#ifdef CONFIG_DEBUG_PI_LIST
1077 q->list.plist.spinlock = &hb->lock;
1078#endif
1079 1117
1080 wake_up_state(q->task, TASK_NORMAL); 1118 wake_up_state(q->task, TASK_NORMAL);
1081} 1119}
@@ -1423,9 +1461,6 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
1423 prio = min(current->normal_prio, MAX_RT_PRIO); 1461 prio = min(current->normal_prio, MAX_RT_PRIO);
1424 1462
1425 plist_node_init(&q->list, prio); 1463 plist_node_init(&q->list, prio);
1426#ifdef CONFIG_DEBUG_PI_LIST
1427 q->list.plist.spinlock = &hb->lock;
1428#endif
1429 plist_add(&q->list, &hb->chain); 1464 plist_add(&q->list, &hb->chain);
1430 q->task = current; 1465 q->task = current;
1431 spin_unlock(&hb->lock); 1466 spin_unlock(&hb->lock);
@@ -1470,8 +1505,7 @@ retry:
1470 spin_unlock(lock_ptr); 1505 spin_unlock(lock_ptr);
1471 goto retry; 1506 goto retry;
1472 } 1507 }
1473 WARN_ON(plist_node_empty(&q->list)); 1508 __unqueue_futex(q);
1474 plist_del(&q->list, &q->list.plist);
1475 1509
1476 BUG_ON(q->pi_state); 1510 BUG_ON(q->pi_state);
1477 1511
@@ -1491,8 +1525,7 @@ retry:
1491static void unqueue_me_pi(struct futex_q *q) 1525static void unqueue_me_pi(struct futex_q *q)
1492 __releases(q->lock_ptr) 1526 __releases(q->lock_ptr)
1493{ 1527{
1494 WARN_ON(plist_node_empty(&q->list)); 1528 __unqueue_futex(q);
1495 plist_del(&q->list, &q->list.plist);
1496 1529
1497 BUG_ON(!q->pi_state); 1530 BUG_ON(!q->pi_state);
1498 free_pi_state(q->pi_state); 1531 free_pi_state(q->pi_state);
@@ -1522,10 +1555,10 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1522 1555
1523 /* 1556 /*
1524 * We are here either because we stole the rtmutex from the 1557 * We are here either because we stole the rtmutex from the
1525 * pending owner or we are the pending owner which failed to 1558 * previous highest priority waiter or we are the highest priority
1526 * get the rtmutex. We have to replace the pending owner TID 1559 * waiter but failed to get the rtmutex the first time.
1527 * in the user space variable. This must be atomic as we have 1560 * We have to replace the newowner TID in the user space variable.
1528 * to preserve the owner died bit here. 1561 * This must be atomic as we have to preserve the owner died bit here.
1529 * 1562 *
1530 * Note: We write the user space value _before_ changing the pi_state 1563 * Note: We write the user space value _before_ changing the pi_state
1531 * because we can fault here. Imagine swapped out pages or a fork 1564 * because we can fault here. Imagine swapped out pages or a fork
@@ -1544,9 +1577,7 @@ retry:
1544 while (1) { 1577 while (1) {
1545 newval = (uval & FUTEX_OWNER_DIED) | newtid; 1578 newval = (uval & FUTEX_OWNER_DIED) | newtid;
1546 1579
1547 curval = cmpxchg_futex_value_locked(uaddr, uval, newval); 1580 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
1548
1549 if (curval == -EFAULT)
1550 goto handle_fault; 1581 goto handle_fault;
1551 if (curval == uval) 1582 if (curval == uval)
1552 break; 1583 break;
@@ -1574,8 +1605,8 @@ retry:
1574 1605
1575 /* 1606 /*
1576 * To handle the page fault we need to drop the hash bucket 1607 * To handle the page fault we need to drop the hash bucket
1577 * lock here. That gives the other task (either the pending 1608 * lock here. That gives the other task (either the highest priority
1578 * owner itself or the task which stole the rtmutex) the 1609 * waiter itself or the task which stole the rtmutex) the
1579 * chance to try the fixup of the pi_state. So once we are 1610 * chance to try the fixup of the pi_state. So once we are
1580 * back from handling the fault we need to check the pi_state 1611 * back from handling the fault we need to check the pi_state
1581 * after reacquiring the hash bucket lock and before trying to 1612 * after reacquiring the hash bucket lock and before trying to
@@ -1651,18 +1682,20 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
1651 /* 1682 /*
1652 * pi_state is incorrect, some other task did a lock steal and 1683 * pi_state is incorrect, some other task did a lock steal and
1653 * we returned due to timeout or signal without taking the 1684 * we returned due to timeout or signal without taking the
1654 * rt_mutex. Too late. We can access the rt_mutex_owner without 1685 * rt_mutex. Too late.
1655 * locking, as the other task is now blocked on the hash bucket
1656 * lock. Fix the state up.
1657 */ 1686 */
1687 raw_spin_lock(&q->pi_state->pi_mutex.wait_lock);
1658 owner = rt_mutex_owner(&q->pi_state->pi_mutex); 1688 owner = rt_mutex_owner(&q->pi_state->pi_mutex);
1689 if (!owner)
1690 owner = rt_mutex_next_owner(&q->pi_state->pi_mutex);
1691 raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock);
1659 ret = fixup_pi_state_owner(uaddr, q, owner); 1692 ret = fixup_pi_state_owner(uaddr, q, owner);
1660 goto out; 1693 goto out;
1661 } 1694 }
1662 1695
1663 /* 1696 /*
1664 * Paranoia check. If we did not take the lock, then we should not be 1697 * Paranoia check. If we did not take the lock, then we should not be
1665 * the owner, nor the pending owner, of the rt_mutex. 1698 * the owner of the rt_mutex.
1666 */ 1699 */
1667 if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) 1700 if (rt_mutex_owner(&q->pi_state->pi_mutex) == current)
1668 printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p " 1701 printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
@@ -1747,13 +1780,14 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
1747 * 1780 *
1748 * The basic logical guarantee of a futex is that it blocks ONLY 1781 * The basic logical guarantee of a futex is that it blocks ONLY
1749 * if cond(var) is known to be true at the time of blocking, for 1782 * if cond(var) is known to be true at the time of blocking, for
1750 * any cond. If we queued after testing *uaddr, that would open 1783 * any cond. If we locked the hash-bucket after testing *uaddr, that
1751 * a race condition where we could block indefinitely with 1784 * would open a race condition where we could block indefinitely with
1752 * cond(var) false, which would violate the guarantee. 1785 * cond(var) false, which would violate the guarantee.
1753 * 1786 *
1754 * A consequence is that futex_wait() can return zero and absorb 1787 * On the other hand, we insert q and release the hash-bucket only
1755 * a wakeup when *uaddr != val on entry to the syscall. This is 1788 * after testing *uaddr. This guarantees that futex_wait() will NOT
1756 * rare, but normal. 1789 * absorb a wakeup if *uaddr does not match the desired values
1790 * while the syscall executes.
1757 */ 1791 */
1758retry: 1792retry:
1759 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key); 1793 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key);
@@ -2012,9 +2046,9 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
2012{ 2046{
2013 struct futex_hash_bucket *hb; 2047 struct futex_hash_bucket *hb;
2014 struct futex_q *this, *next; 2048 struct futex_q *this, *next;
2015 u32 uval;
2016 struct plist_head *head; 2049 struct plist_head *head;
2017 union futex_key key = FUTEX_KEY_INIT; 2050 union futex_key key = FUTEX_KEY_INIT;
2051 u32 uval, vpid = task_pid_vnr(current);
2018 int ret; 2052 int ret;
2019 2053
2020retry: 2054retry:
@@ -2023,7 +2057,7 @@ retry:
2023 /* 2057 /*
2024 * We release only a lock we actually own: 2058 * We release only a lock we actually own:
2025 */ 2059 */
2026 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) 2060 if ((uval & FUTEX_TID_MASK) != vpid)
2027 return -EPERM; 2061 return -EPERM;
2028 2062
2029 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key); 2063 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key);
@@ -2038,17 +2072,14 @@ retry:
2038 * again. If it succeeds then we can return without waking 2072 * again. If it succeeds then we can return without waking
2039 * anyone else up: 2073 * anyone else up:
2040 */ 2074 */
2041 if (!(uval & FUTEX_OWNER_DIED)) 2075 if (!(uval & FUTEX_OWNER_DIED) &&
2042 uval = cmpxchg_futex_value_locked(uaddr, task_pid_vnr(current), 0); 2076 cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0))
2043
2044
2045 if (unlikely(uval == -EFAULT))
2046 goto pi_faulted; 2077 goto pi_faulted;
2047 /* 2078 /*
2048 * Rare case: we managed to release the lock atomically, 2079 * Rare case: we managed to release the lock atomically,
2049 * no need to wake anyone else up: 2080 * no need to wake anyone else up:
2050 */ 2081 */
2051 if (unlikely(uval == task_pid_vnr(current))) 2082 if (unlikely(uval == vpid))
2052 goto out_unlock; 2083 goto out_unlock;
2053 2084
2054 /* 2085 /*
@@ -2133,7 +2164,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2133 * We were woken prior to requeue by a timeout or a signal. 2164 * We were woken prior to requeue by a timeout or a signal.
2134 * Unqueue the futex_q and determine which it was. 2165 * Unqueue the futex_q and determine which it was.
2135 */ 2166 */
2136 plist_del(&q->list, &q->list.plist); 2167 plist_del(&q->list, &hb->chain);
2137 2168
2138 /* Handle spurious wakeups gracefully */ 2169 /* Handle spurious wakeups gracefully */
2139 ret = -EWOULDBLOCK; 2170 ret = -EWOULDBLOCK;
@@ -2429,11 +2460,20 @@ retry:
2429 * userspace. 2460 * userspace.
2430 */ 2461 */
2431 mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; 2462 mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
2432 nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval); 2463 /*
2433 2464 * We are not holding a lock here, but we want to have
2434 if (nval == -EFAULT) 2465 * the pagefault_disable/enable() protection because
2435 return -1; 2466 * we want to handle the fault gracefully. If the
2436 2467 * access fails we try to fault in the futex with R/W
2468 * verification via get_user_pages. get_user() above
2469 * does not guarantee R/W access. If that fails we
2470 * give up and leave the futex locked.
2471 */
2472 if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) {
2473 if (fault_in_user_writeable(uaddr))
2474 return -1;
2475 goto retry;
2476 }
2437 if (nval != uval) 2477 if (nval != uval)
2438 goto retry; 2478 goto retry;
2439 2479
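
The hunk above converts handle_futex_death() to the fault-reporting cmpxchg helper and retries after faulting the futex word in writable, instead of bailing out on the first -EFAULT. The value it tries to store keeps only the waiters bit and marks the owner dead; the sketch below shows that bit layout using the UAPI constants from <linux/futex.h> and is purely illustrative.

/* Sketch of the value written by the robust-list cleanup above: the dead
 * owner's TID is cleared, FUTEX_OWNER_DIED is set and FUTEX_WAITERS is
 * preserved so any waiters are still woken. Constants come from the UAPI
 * header; the TID below is made up. */
#include <linux/futex.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t uval = 1234 | FUTEX_WAITERS;	/* held by TID 1234, waiters queued */
	uint32_t mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;

	printf("old owner tid: %u\n", uval & FUTEX_TID_MASK);
	printf("new word: %#x (waiters kept, owner marked dead)\n", mval);
	return 0;
}
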
@@ -2644,8 +2684,7 @@ static int __init futex_init(void)
2644 * implementation, the non-functional ones will return 2684 * implementation, the non-functional ones will return
2645 * -ENOSYS. 2685 * -ENOSYS.
2646 */ 2686 */
2647 curval = cmpxchg_futex_value_locked(NULL, 0, 0); 2687 if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
2648 if (curval == -EFAULT)
2649 futex_cmpxchg_enabled = 1; 2688 futex_cmpxchg_enabled = 1;
2650 2689
2651 for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { 2690 for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
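
Across this file the diff moves cmpxchg_futex_value_locked() to a new calling convention: the return value is now a status code and the old futex word comes back through a pointer, which is what futex_init() relies on above (a working implementation faults on the NULL probe and reports -EFAULT, a stub reports -ENOSYS). Below is a hedged stand-alone sketch of the same convention; try_cmpxchg_u32() is a made-up stand-in, not the kernel's arch code.

/* Hedged stand-alone sketch of the new calling convention: status code as
 * return value, old word via pointer. try_cmpxchg_u32() is a stand-in; it
 * reports -EFAULT for a NULL address instead of actually faulting, which
 * is how a functional arch implementation answers the boot-time probe. */
#include <errno.h>
#include <stdint.h>
#include <stdio.h>

static int try_cmpxchg_u32(uint32_t *curval, uint32_t *uaddr,
			   uint32_t oldval, uint32_t newval)
{
	if (!uaddr)
		return -EFAULT;
	*curval = __sync_val_compare_and_swap(uaddr, oldval, newval);
	return 0;
}

int main(void)
{
	uint32_t word = 42, curval = 0;

	if (try_cmpxchg_u32(&curval, NULL, 0, 0) == -EFAULT)
		printf("cmpxchg probe faulted: feature available\n");

	if (!try_cmpxchg_u32(&curval, &word, 42, 7))
		printf("old value %u, word is now %u\n", curval, word);
	return 0;
}
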
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index f2429fc3438c..9017478c5d4c 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -53,11 +53,10 @@
53/* 53/*
54 * The timer bases: 54 * The timer bases:
55 * 55 *
56 * Note: If we want to add new timer bases, we have to skip the two 56 * There are more clockids then hrtimer bases. Thus, we index
57 * clock ids captured by the cpu-timers. We do this by holding empty 57 * into the timer bases by the hrtimer_base_type enum. When trying
58 * entries rather than doing math adjustment of the clock ids. 58 * to reach a base using a clockid, hrtimer_clockid_to_base()
59 * This ensures that we capture erroneous accesses to these clock ids 59 * is used to convert from clockid to the proper hrtimer_base_type.
60 * rather than moving them into the range of valid clock id's.
61 */ 60 */
62DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = 61DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
63{ 62{
@@ -74,30 +73,39 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
74 .get_time = &ktime_get, 73 .get_time = &ktime_get,
75 .resolution = KTIME_LOW_RES, 74 .resolution = KTIME_LOW_RES,
76 }, 75 },
76 {
77 .index = CLOCK_BOOTTIME,
78 .get_time = &ktime_get_boottime,
79 .resolution = KTIME_LOW_RES,
80 },
77 } 81 }
78}; 82};
79 83
84static int hrtimer_clock_to_base_table[MAX_CLOCKS];
85
86static inline int hrtimer_clockid_to_base(clockid_t clock_id)
87{
88 return hrtimer_clock_to_base_table[clock_id];
89}
90
91
80/* 92/*
81 * Get the coarse grained time at the softirq based on xtime and 93 * Get the coarse grained time at the softirq based on xtime and
82 * wall_to_monotonic. 94 * wall_to_monotonic.
83 */ 95 */
84static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) 96static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
85{ 97{
86 ktime_t xtim, tomono; 98 ktime_t xtim, mono, boot;
87 struct timespec xts, tom; 99 struct timespec xts, tom, slp;
88 unsigned long seq;
89 100
90 do { 101 get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp);
91 seq = read_seqbegin(&xtime_lock);
92 xts = __current_kernel_time();
93 tom = __get_wall_to_monotonic();
94 } while (read_seqretry(&xtime_lock, seq));
95 102
96 xtim = timespec_to_ktime(xts); 103 xtim = timespec_to_ktime(xts);
97 tomono = timespec_to_ktime(tom); 104 mono = ktime_add(xtim, timespec_to_ktime(tom));
98 base->clock_base[CLOCK_REALTIME].softirq_time = xtim; 105 boot = ktime_add(mono, timespec_to_ktime(slp));
99 base->clock_base[CLOCK_MONOTONIC].softirq_time = 106 base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim;
100 ktime_add(xtim, tomono); 107 base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono;
108 base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot;
101} 109}
102 110
103/* 111/*
@@ -184,10 +192,11 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
184 struct hrtimer_cpu_base *new_cpu_base; 192 struct hrtimer_cpu_base *new_cpu_base;
185 int this_cpu = smp_processor_id(); 193 int this_cpu = smp_processor_id();
186 int cpu = hrtimer_get_target(this_cpu, pinned); 194 int cpu = hrtimer_get_target(this_cpu, pinned);
195 int basenum = hrtimer_clockid_to_base(base->index);
187 196
188again: 197again:
189 new_cpu_base = &per_cpu(hrtimer_bases, cpu); 198 new_cpu_base = &per_cpu(hrtimer_bases, cpu);
190 new_base = &new_cpu_base->clock_base[base->index]; 199 new_base = &new_cpu_base->clock_base[basenum];
191 200
192 if (base != new_base) { 201 if (base != new_base) {
193 /* 202 /*
@@ -334,6 +343,11 @@ EXPORT_SYMBOL_GPL(ktime_add_safe);
334 343
335static struct debug_obj_descr hrtimer_debug_descr; 344static struct debug_obj_descr hrtimer_debug_descr;
336 345
346static void *hrtimer_debug_hint(void *addr)
347{
348 return ((struct hrtimer *) addr)->function;
349}
350
337/* 351/*
338 * fixup_init is called when: 352 * fixup_init is called when:
339 * - an active object is initialized 353 * - an active object is initialized
@@ -393,6 +407,7 @@ static int hrtimer_fixup_free(void *addr, enum debug_obj_state state)
393 407
394static struct debug_obj_descr hrtimer_debug_descr = { 408static struct debug_obj_descr hrtimer_debug_descr = {
395 .name = "hrtimer", 409 .name = "hrtimer",
410 .debug_hint = hrtimer_debug_hint,
396 .fixup_init = hrtimer_fixup_init, 411 .fixup_init = hrtimer_fixup_init,
397 .fixup_activate = hrtimer_fixup_activate, 412 .fixup_activate = hrtimer_fixup_activate,
398 .fixup_free = hrtimer_fixup_free, 413 .fixup_free = hrtimer_fixup_free,
@@ -497,7 +512,7 @@ static inline int hrtimer_is_hres_enabled(void)
497 */ 512 */
498static inline int hrtimer_hres_active(void) 513static inline int hrtimer_hres_active(void)
499{ 514{
500 return __get_cpu_var(hrtimer_bases).hres_active; 515 return __this_cpu_read(hrtimer_bases.hres_active);
501} 516}
502 517
503/* 518/*
@@ -611,24 +626,23 @@ static int hrtimer_reprogram(struct hrtimer *timer,
611static void retrigger_next_event(void *arg) 626static void retrigger_next_event(void *arg)
612{ 627{
613 struct hrtimer_cpu_base *base; 628 struct hrtimer_cpu_base *base;
614 struct timespec realtime_offset, wtm; 629 struct timespec realtime_offset, wtm, sleep;
615 unsigned long seq;
616 630
617 if (!hrtimer_hres_active()) 631 if (!hrtimer_hres_active())
618 return; 632 return;
619 633
620 do { 634 get_xtime_and_monotonic_and_sleep_offset(&realtime_offset, &wtm,
621 seq = read_seqbegin(&xtime_lock); 635 &sleep);
622 wtm = __get_wall_to_monotonic();
623 } while (read_seqretry(&xtime_lock, seq));
624 set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec); 636 set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec);
625 637
626 base = &__get_cpu_var(hrtimer_bases); 638 base = &__get_cpu_var(hrtimer_bases);
627 639
628 /* Adjust CLOCK_REALTIME offset */ 640 /* Adjust CLOCK_REALTIME offset */
629 raw_spin_lock(&base->lock); 641 raw_spin_lock(&base->lock);
630 base->clock_base[CLOCK_REALTIME].offset = 642 base->clock_base[HRTIMER_BASE_REALTIME].offset =
631 timespec_to_ktime(realtime_offset); 643 timespec_to_ktime(realtime_offset);
644 base->clock_base[HRTIMER_BASE_BOOTTIME].offset =
645 timespec_to_ktime(sleep);
632 646
633 hrtimer_force_reprogram(base, 0); 647 hrtimer_force_reprogram(base, 0);
634 raw_spin_unlock(&base->lock); 648 raw_spin_unlock(&base->lock);
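
retrigger_next_event() above now refreshes two offsets in one pass: the CLOCK_REALTIME offset (the negated wall_to_monotonic value) and the new CLOCK_BOOTTIME offset (accumulated suspend time). The relationship between the three clocks can be observed from user space; a small sketch follows, assuming the running kernel and libc already expose CLOCK_BOOTTIME through clock_gettime().

/* Observing the offsets handled above from user space: realtime minus
 * monotonic is the CLOCK_REALTIME offset, boottime minus monotonic is
 * (roughly) the accumulated suspend time. Assumes CLOCK_BOOTTIME support. */
#include <stdio.h>
#include <time.h>

#ifndef CLOCK_BOOTTIME
#define CLOCK_BOOTTIME 7	/* Linux clockid, in case older headers lack it */
#endif

static double ts_to_sec(struct timespec ts)
{
	return ts.tv_sec + ts.tv_nsec / 1e9;
}

int main(void)
{
	struct timespec real, mono, boot;

	clock_gettime(CLOCK_REALTIME, &real);
	clock_gettime(CLOCK_MONOTONIC, &mono);
	clock_gettime(CLOCK_BOOTTIME, &boot);

	printf("realtime offset: %.3f s\n", ts_to_sec(real) - ts_to_sec(mono));
	printf("time in suspend: %.3f s\n", ts_to_sec(boot) - ts_to_sec(mono));
	return 0;
}
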
@@ -673,14 +687,6 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
673} 687}
674 688
675/* 689/*
676 * Initialize the high resolution related parts of a hrtimer
677 */
678static inline void hrtimer_init_timer_hres(struct hrtimer *timer)
679{
680}
681
682
683/*
684 * When High resolution timers are active, try to reprogram. Note, that in case 690 * When High resolution timers are active, try to reprogram. Note, that in case
685 * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry 691 * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry
686 * check happens. The timer gets enqueued into the rbtree. The reprogramming 692 * check happens. The timer gets enqueued into the rbtree. The reprogramming
@@ -725,8 +731,9 @@ static int hrtimer_switch_to_hres(void)
725 return 0; 731 return 0;
726 } 732 }
727 base->hres_active = 1; 733 base->hres_active = 1;
728 base->clock_base[CLOCK_REALTIME].resolution = KTIME_HIGH_RES; 734 base->clock_base[HRTIMER_BASE_REALTIME].resolution = KTIME_HIGH_RES;
729 base->clock_base[CLOCK_MONOTONIC].resolution = KTIME_HIGH_RES; 735 base->clock_base[HRTIMER_BASE_MONOTONIC].resolution = KTIME_HIGH_RES;
736 base->clock_base[HRTIMER_BASE_BOOTTIME].resolution = KTIME_HIGH_RES;
730 737
731 tick_setup_sched_timer(); 738 tick_setup_sched_timer();
732 739
@@ -750,7 +757,6 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
750 return 0; 757 return 0;
751} 758}
752static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } 759static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
753static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { }
754 760
755#endif /* CONFIG_HIGH_RES_TIMERS */ 761#endif /* CONFIG_HIGH_RES_TIMERS */
756 762
@@ -1121,6 +1127,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1121 enum hrtimer_mode mode) 1127 enum hrtimer_mode mode)
1122{ 1128{
1123 struct hrtimer_cpu_base *cpu_base; 1129 struct hrtimer_cpu_base *cpu_base;
1130 int base;
1124 1131
1125 memset(timer, 0, sizeof(struct hrtimer)); 1132 memset(timer, 0, sizeof(struct hrtimer));
1126 1133
@@ -1129,8 +1136,8 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1129 if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS) 1136 if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS)
1130 clock_id = CLOCK_MONOTONIC; 1137 clock_id = CLOCK_MONOTONIC;
1131 1138
1132 timer->base = &cpu_base->clock_base[clock_id]; 1139 base = hrtimer_clockid_to_base(clock_id);
1133 hrtimer_init_timer_hres(timer); 1140 timer->base = &cpu_base->clock_base[base];
1134 timerqueue_init(&timer->node); 1141 timerqueue_init(&timer->node);
1135 1142
1136#ifdef CONFIG_TIMER_STATS 1143#ifdef CONFIG_TIMER_STATS
@@ -1165,9 +1172,10 @@ EXPORT_SYMBOL_GPL(hrtimer_init);
1165int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) 1172int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
1166{ 1173{
1167 struct hrtimer_cpu_base *cpu_base; 1174 struct hrtimer_cpu_base *cpu_base;
1175 int base = hrtimer_clockid_to_base(which_clock);
1168 1176
1169 cpu_base = &__raw_get_cpu_var(hrtimer_bases); 1177 cpu_base = &__raw_get_cpu_var(hrtimer_bases);
1170 *tp = ktime_to_timespec(cpu_base->clock_base[which_clock].resolution); 1178 *tp = ktime_to_timespec(cpu_base->clock_base[base].resolution);
1171 1179
1172 return 0; 1180 return 0;
1173} 1181}
@@ -1714,6 +1722,10 @@ static struct notifier_block __cpuinitdata hrtimers_nb = {
1714 1722
1715void __init hrtimers_init(void) 1723void __init hrtimers_init(void)
1716{ 1724{
1725 hrtimer_clock_to_base_table[CLOCK_REALTIME] = HRTIMER_BASE_REALTIME;
1726 hrtimer_clock_to_base_table[CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC;
1727 hrtimer_clock_to_base_table[CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME;
1728
1717 hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, 1729 hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE,
1718 (void *)(long)smp_processor_id()); 1730 (void *)(long)smp_processor_id());
1719 register_cpu_notifier(&hrtimers_nb); 1731 register_cpu_notifier(&hrtimers_nb);
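
hrtimers_init() above seeds hrtimer_clock_to_base_table so that hrtimer_clockid_to_base() can translate a clockid into an index of the now three-entry clock_base array. A tiny stand-alone mirror of that lookup follows; the HRTIMER_BASE_* enum is redeclared here only for illustration, the real definition lives in include/linux/hrtimer.h.

/* Stand-alone mirror of the clockid -> base lookup filled in above. The
 * HRTIMER_BASE_* values are redeclared here purely for illustration. */
#include <stdio.h>
#include <time.h>

#ifndef CLOCK_BOOTTIME
#define CLOCK_BOOTTIME 7	/* Linux clockid */
#endif

enum hrtimer_base_type {
	HRTIMER_BASE_REALTIME,
	HRTIMER_BASE_MONOTONIC,
	HRTIMER_BASE_BOOTTIME,
	HRTIMER_MAX_CLOCK_BASES,
};

static int clock_to_base[16];	/* sized like MAX_CLOCKS in the kernel */

int main(void)
{
	clock_to_base[CLOCK_REALTIME]  = HRTIMER_BASE_REALTIME;
	clock_to_base[CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC;
	clock_to_base[CLOCK_BOOTTIME]  = HRTIMER_BASE_BOOTTIME;

	printf("CLOCK_BOOTTIME maps to base %d\n", clock_to_base[CLOCK_BOOTTIME]);
	return 0;
}
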
@@ -1745,7 +1757,7 @@ schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta,
1745 } 1757 }
1746 1758
1747 /* 1759 /*
1748 * A NULL parameter means "inifinte" 1760 * A NULL parameter means "infinite"
1749 */ 1761 */
1750 if (!expires) { 1762 if (!expires) {
1751 schedule(); 1763 schedule();
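
With the boottime base added, the hrtimer API itself is unchanged for callers: a timer is still bound to a clockid at init time and the core resolves the base internally. Below is a hedged kernel-context sketch of the usual pattern, not code from this diff; the timer and callback names are placeholders.

/* Hedged kernel-context sketch of ordinary hrtimer usage; demo_timer and
 * demo_timer_fn are placeholders. Only the long-standing hrtimer_init/
 * hrtimer_start API is used, the base selection happens inside the core. */
#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct hrtimer demo_timer;

static enum hrtimer_restart demo_timer_fn(struct hrtimer *t)
{
	/* runs in hrtimer expiry context */
	return HRTIMER_NORESTART;
}

static void demo_timer_arm(void)
{
	hrtimer_init(&demo_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	demo_timer.function = demo_timer_fn;
	hrtimer_start(&demo_timer, ktime_set(0, 500000), HRTIMER_MODE_REL);	/* 500 us */
}
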
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 31d766bf5d2e..09bef82d74cb 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -1,5 +1,6 @@
1# Select this to activate the generic irq options below
1config HAVE_GENERIC_HARDIRQS 2config HAVE_GENERIC_HARDIRQS
2 def_bool n 3 bool
3 4
4if HAVE_GENERIC_HARDIRQS 5if HAVE_GENERIC_HARDIRQS
5menu "IRQ subsystem" 6menu "IRQ subsystem"
@@ -9,31 +10,46 @@ menu "IRQ subsystem"
9config GENERIC_HARDIRQS 10config GENERIC_HARDIRQS
10 def_bool y 11 def_bool y
11 12
12config GENERIC_HARDIRQS_NO__DO_IRQ
13 def_bool y
14
15# Select this to disable the deprecated stuff 13# Select this to disable the deprecated stuff
16config GENERIC_HARDIRQS_NO_DEPRECATED 14config GENERIC_HARDIRQS_NO_DEPRECATED
17 def_bool n 15 bool
16
17config GENERIC_HARDIRQS_NO_COMPAT
18 bool
18 19
19# Options selectable by the architecture code 20# Options selectable by the architecture code
21
22# Make sparse irq Kconfig switch below available
20config HAVE_SPARSE_IRQ 23config HAVE_SPARSE_IRQ
21 def_bool n 24 bool
22 25
26# Enable the generic irq autoprobe mechanism
23config GENERIC_IRQ_PROBE 27config GENERIC_IRQ_PROBE
24 def_bool n 28 bool
25 29
30# Use the generic /proc/interrupts implementation
31config GENERIC_IRQ_SHOW
32 bool
33
34# Support for delayed migration from interrupt context
26config GENERIC_PENDING_IRQ 35config GENERIC_PENDING_IRQ
27 def_bool n 36 bool
28 37
38# Alpha specific irq affinity mechanism
29config AUTO_IRQ_AFFINITY 39config AUTO_IRQ_AFFINITY
30 def_bool n 40 bool
31
32config IRQ_PER_CPU
33 def_bool n
34 41
42# Tasklet based software resend for pending interrupts on enable_irq()
35config HARDIRQS_SW_RESEND 43config HARDIRQS_SW_RESEND
36 def_bool n 44 bool
45
46# Preflow handler support for fasteoi (sparc64)
47config IRQ_PREFLOW_FASTEOI
48 bool
49
50# Support forced irq threading
51config IRQ_FORCED_THREADING
52 bool
37 53
38config SPARSE_IRQ 54config SPARSE_IRQ
39 bool "Support sparse irq numbering" 55 bool "Support sparse irq numbering"
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 505798f86c36..394784c57060 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -17,7 +17,7 @@
17/* 17/*
18 * Autodetection depends on the fact that any interrupt that 18 * Autodetection depends on the fact that any interrupt that
19 * comes in on to an unassigned handler will get stuck with 19 * comes in on to an unassigned handler will get stuck with
20 * "IRQ_WAITING" cleared and the interrupt disabled. 20 * "IRQS_WAITING" cleared and the interrupt disabled.
21 */ 21 */
22static DEFINE_MUTEX(probing_active); 22static DEFINE_MUTEX(probing_active);
23 23
@@ -32,7 +32,6 @@ unsigned long probe_irq_on(void)
32{ 32{
33 struct irq_desc *desc; 33 struct irq_desc *desc;
34 unsigned long mask = 0; 34 unsigned long mask = 0;
35 unsigned int status;
36 int i; 35 int i;
37 36
38 /* 37 /*
@@ -46,13 +45,7 @@ unsigned long probe_irq_on(void)
46 */ 45 */
47 for_each_irq_desc_reverse(i, desc) { 46 for_each_irq_desc_reverse(i, desc) {
48 raw_spin_lock_irq(&desc->lock); 47 raw_spin_lock_irq(&desc->lock);
49 if (!desc->action && !(desc->status & IRQ_NOPROBE)) { 48 if (!desc->action && irq_settings_can_probe(desc)) {
50 /*
51 * An old-style architecture might still have
52 * the handle_bad_irq handler there:
53 */
54 compat_irq_chip_set_default_handler(desc);
55
56 /* 49 /*
57 * Some chips need to know about probing in 50 * Some chips need to know about probing in
58 * progress: 51 * progress:
@@ -60,7 +53,7 @@ unsigned long probe_irq_on(void)
60 if (desc->irq_data.chip->irq_set_type) 53 if (desc->irq_data.chip->irq_set_type)
61 desc->irq_data.chip->irq_set_type(&desc->irq_data, 54 desc->irq_data.chip->irq_set_type(&desc->irq_data,
62 IRQ_TYPE_PROBE); 55 IRQ_TYPE_PROBE);
63 desc->irq_data.chip->irq_startup(&desc->irq_data); 56 irq_startup(desc);
64 } 57 }
65 raw_spin_unlock_irq(&desc->lock); 58 raw_spin_unlock_irq(&desc->lock);
66 } 59 }
@@ -75,10 +68,12 @@ unsigned long probe_irq_on(void)
75 */ 68 */
76 for_each_irq_desc_reverse(i, desc) { 69 for_each_irq_desc_reverse(i, desc) {
77 raw_spin_lock_irq(&desc->lock); 70 raw_spin_lock_irq(&desc->lock);
78 if (!desc->action && !(desc->status & IRQ_NOPROBE)) { 71 if (!desc->action && irq_settings_can_probe(desc)) {
79 desc->status |= IRQ_AUTODETECT | IRQ_WAITING; 72 desc->istate |= IRQS_AUTODETECT | IRQS_WAITING;
80 if (desc->irq_data.chip->irq_startup(&desc->irq_data)) 73 if (irq_startup(desc)) {
81 desc->status |= IRQ_PENDING; 74 irq_compat_set_pending(desc);
75 desc->istate |= IRQS_PENDING;
76 }
82 } 77 }
83 raw_spin_unlock_irq(&desc->lock); 78 raw_spin_unlock_irq(&desc->lock);
84 } 79 }
@@ -93,13 +88,12 @@ unsigned long probe_irq_on(void)
93 */ 88 */
94 for_each_irq_desc(i, desc) { 89 for_each_irq_desc(i, desc) {
95 raw_spin_lock_irq(&desc->lock); 90 raw_spin_lock_irq(&desc->lock);
96 status = desc->status;
97 91
98 if (status & IRQ_AUTODETECT) { 92 if (desc->istate & IRQS_AUTODETECT) {
99 /* It triggered already - consider it spurious. */ 93 /* It triggered already - consider it spurious. */
100 if (!(status & IRQ_WAITING)) { 94 if (!(desc->istate & IRQS_WAITING)) {
101 desc->status = status & ~IRQ_AUTODETECT; 95 desc->istate &= ~IRQS_AUTODETECT;
102 desc->irq_data.chip->irq_shutdown(&desc->irq_data); 96 irq_shutdown(desc);
103 } else 97 } else
104 if (i < 32) 98 if (i < 32)
105 mask |= 1 << i; 99 mask |= 1 << i;
@@ -125,20 +119,18 @@ EXPORT_SYMBOL(probe_irq_on);
125 */ 119 */
126unsigned int probe_irq_mask(unsigned long val) 120unsigned int probe_irq_mask(unsigned long val)
127{ 121{
128 unsigned int status, mask = 0; 122 unsigned int mask = 0;
129 struct irq_desc *desc; 123 struct irq_desc *desc;
130 int i; 124 int i;
131 125
132 for_each_irq_desc(i, desc) { 126 for_each_irq_desc(i, desc) {
133 raw_spin_lock_irq(&desc->lock); 127 raw_spin_lock_irq(&desc->lock);
134 status = desc->status; 128 if (desc->istate & IRQS_AUTODETECT) {
135 129 if (i < 16 && !(desc->istate & IRQS_WAITING))
136 if (status & IRQ_AUTODETECT) {
137 if (i < 16 && !(status & IRQ_WAITING))
138 mask |= 1 << i; 130 mask |= 1 << i;
139 131
140 desc->status = status & ~IRQ_AUTODETECT; 132 desc->istate &= ~IRQS_AUTODETECT;
141 desc->irq_data.chip->irq_shutdown(&desc->irq_data); 133 irq_shutdown(desc);
142 } 134 }
143 raw_spin_unlock_irq(&desc->lock); 135 raw_spin_unlock_irq(&desc->lock);
144 } 136 }
@@ -169,20 +161,18 @@ int probe_irq_off(unsigned long val)
169{ 161{
170 int i, irq_found = 0, nr_of_irqs = 0; 162 int i, irq_found = 0, nr_of_irqs = 0;
171 struct irq_desc *desc; 163 struct irq_desc *desc;
172 unsigned int status;
173 164
174 for_each_irq_desc(i, desc) { 165 for_each_irq_desc(i, desc) {
175 raw_spin_lock_irq(&desc->lock); 166 raw_spin_lock_irq(&desc->lock);
176 status = desc->status;
177 167
178 if (status & IRQ_AUTODETECT) { 168 if (desc->istate & IRQS_AUTODETECT) {
179 if (!(status & IRQ_WAITING)) { 169 if (!(desc->istate & IRQS_WAITING)) {
180 if (!nr_of_irqs) 170 if (!nr_of_irqs)
181 irq_found = i; 171 irq_found = i;
182 nr_of_irqs++; 172 nr_of_irqs++;
183 } 173 }
184 desc->status = status & ~IRQ_AUTODETECT; 174 desc->istate &= ~IRQS_AUTODETECT;
185 desc->irq_data.chip->irq_shutdown(&desc->irq_data); 175 irq_shutdown(desc);
186 } 176 }
187 raw_spin_unlock_irq(&desc->lock); 177 raw_spin_unlock_irq(&desc->lock);
188 } 178 }
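
The autoprobe conversion above is purely internal: the IRQ_* status bits move to desc->istate and the open-coded chip calls become irq_startup()/irq_shutdown(), while probe_irq_on()/probe_irq_off() keep their external behaviour. For reference, a hedged sketch of how a legacy ISA-style driver drives that interface; the my_device_*() hooks are placeholders for real hardware accesses.

/* Hedged sketch of the classic autoprobe sequence; the my_device_*() hooks
 * are placeholders for real hardware accesses. */
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/interrupt.h>

extern void my_device_trigger_irq(void);	/* placeholder */
extern void my_device_quiesce(void);		/* placeholder */

static int my_probe_irq(void)
{
	unsigned long mask;
	int irq;

	mask = probe_irq_on();		/* arm every unassigned, probeable irq */
	my_device_trigger_irq();	/* make the hardware raise its interrupt */
	mdelay(10);			/* give it time to fire */
	irq = probe_irq_off(mask);	/* >0: found, 0: none, <0: several fired */
	my_device_quiesce();

	return irq > 0 ? irq : -ENODEV;
}
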
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index baa5c4acad83..c9c0601f0615 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -19,140 +19,110 @@
19#include "internals.h" 19#include "internals.h"
20 20
21/** 21/**
22 * set_irq_chip - set the irq chip for an irq 22 * irq_set_chip - set the irq chip for an irq
23 * @irq: irq number 23 * @irq: irq number
24 * @chip: pointer to irq chip description structure 24 * @chip: pointer to irq chip description structure
25 */ 25 */
26int set_irq_chip(unsigned int irq, struct irq_chip *chip) 26int irq_set_chip(unsigned int irq, struct irq_chip *chip)
27{ 27{
28 struct irq_desc *desc = irq_to_desc(irq);
29 unsigned long flags; 28 unsigned long flags;
29 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
30 30
31 if (!desc) { 31 if (!desc)
32 WARN(1, KERN_ERR "Trying to install chip for IRQ%d\n", irq);
33 return -EINVAL; 32 return -EINVAL;
34 }
35 33
36 if (!chip) 34 if (!chip)
37 chip = &no_irq_chip; 35 chip = &no_irq_chip;
38 36
39 raw_spin_lock_irqsave(&desc->lock, flags);
40 irq_chip_set_defaults(chip); 37 irq_chip_set_defaults(chip);
41 desc->irq_data.chip = chip; 38 desc->irq_data.chip = chip;
42 raw_spin_unlock_irqrestore(&desc->lock, flags); 39 irq_put_desc_unlock(desc, flags);
43
44 return 0; 40 return 0;
45} 41}
46EXPORT_SYMBOL(set_irq_chip); 42EXPORT_SYMBOL(irq_set_chip);
47 43
48/** 44/**
49 * set_irq_type - set the irq trigger type for an irq 45 * irq_set_type - set the irq trigger type for an irq
50 * @irq: irq number 46 * @irq: irq number
51 * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h 47 * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h
52 */ 48 */
53int set_irq_type(unsigned int irq, unsigned int type) 49int irq_set_irq_type(unsigned int irq, unsigned int type)
54{ 50{
55 struct irq_desc *desc = irq_to_desc(irq);
56 unsigned long flags; 51 unsigned long flags;
57 int ret = -ENXIO; 52 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
53 int ret = 0;
58 54
59 if (!desc) { 55 if (!desc)
60 printk(KERN_ERR "Trying to set irq type for IRQ%d\n", irq); 56 return -EINVAL;
61 return -ENODEV;
62 }
63 57
64 type &= IRQ_TYPE_SENSE_MASK; 58 type &= IRQ_TYPE_SENSE_MASK;
65 if (type == IRQ_TYPE_NONE) 59 if (type != IRQ_TYPE_NONE)
66 return 0; 60 ret = __irq_set_trigger(desc, irq, type);
67 61 irq_put_desc_busunlock(desc, flags);
68 raw_spin_lock_irqsave(&desc->lock, flags);
69 ret = __irq_set_trigger(desc, irq, type);
70 raw_spin_unlock_irqrestore(&desc->lock, flags);
71 return ret; 62 return ret;
72} 63}
73EXPORT_SYMBOL(set_irq_type); 64EXPORT_SYMBOL(irq_set_irq_type);
74 65
75/** 66/**
76 * set_irq_data - set irq type data for an irq 67 * irq_set_handler_data - set irq handler data for an irq
77 * @irq: Interrupt number 68 * @irq: Interrupt number
78 * @data: Pointer to interrupt specific data 69 * @data: Pointer to interrupt specific data
79 * 70 *
80 * Set the hardware irq controller data for an irq 71 * Set the hardware irq controller data for an irq
81 */ 72 */
82int set_irq_data(unsigned int irq, void *data) 73int irq_set_handler_data(unsigned int irq, void *data)
83{ 74{
84 struct irq_desc *desc = irq_to_desc(irq);
85 unsigned long flags; 75 unsigned long flags;
76 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
86 77
87 if (!desc) { 78 if (!desc)
88 printk(KERN_ERR
89 "Trying to install controller data for IRQ%d\n", irq);
90 return -EINVAL; 79 return -EINVAL;
91 }
92
93 raw_spin_lock_irqsave(&desc->lock, flags);
94 desc->irq_data.handler_data = data; 80 desc->irq_data.handler_data = data;
95 raw_spin_unlock_irqrestore(&desc->lock, flags); 81 irq_put_desc_unlock(desc, flags);
96 return 0; 82 return 0;
97} 83}
98EXPORT_SYMBOL(set_irq_data); 84EXPORT_SYMBOL(irq_set_handler_data);
99 85
100/** 86/**
101 * set_irq_msi - set MSI descriptor data for an irq 87 * irq_set_msi_desc - set MSI descriptor data for an irq
102 * @irq: Interrupt number 88 * @irq: Interrupt number
103 * @entry: Pointer to MSI descriptor data 89 * @entry: Pointer to MSI descriptor data
104 * 90 *
105 * Set the MSI descriptor entry for an irq 91 * Set the MSI descriptor entry for an irq
106 */ 92 */
107int set_irq_msi(unsigned int irq, struct msi_desc *entry) 93int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry)
108{ 94{
109 struct irq_desc *desc = irq_to_desc(irq);
110 unsigned long flags; 95 unsigned long flags;
96 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
111 97
112 if (!desc) { 98 if (!desc)
113 printk(KERN_ERR
114 "Trying to install msi data for IRQ%d\n", irq);
115 return -EINVAL; 99 return -EINVAL;
116 }
117
118 raw_spin_lock_irqsave(&desc->lock, flags);
119 desc->irq_data.msi_desc = entry; 100 desc->irq_data.msi_desc = entry;
120 if (entry) 101 if (entry)
121 entry->irq = irq; 102 entry->irq = irq;
122 raw_spin_unlock_irqrestore(&desc->lock, flags); 103 irq_put_desc_unlock(desc, flags);
123 return 0; 104 return 0;
124} 105}
125 106
126/** 107/**
127 * set_irq_chip_data - set irq chip data for an irq 108 * irq_set_chip_data - set irq chip data for an irq
128 * @irq: Interrupt number 109 * @irq: Interrupt number
129 * @data: Pointer to chip specific data 110 * @data: Pointer to chip specific data
130 * 111 *
131 * Set the hardware irq chip data for an irq 112 * Set the hardware irq chip data for an irq
132 */ 113 */
133int set_irq_chip_data(unsigned int irq, void *data) 114int irq_set_chip_data(unsigned int irq, void *data)
134{ 115{
135 struct irq_desc *desc = irq_to_desc(irq);
136 unsigned long flags; 116 unsigned long flags;
117 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
137 118
138 if (!desc) { 119 if (!desc)
139 printk(KERN_ERR
140 "Trying to install chip data for IRQ%d\n", irq);
141 return -EINVAL;
142 }
143
144 if (!desc->irq_data.chip) {
145 printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq);
146 return -EINVAL; 120 return -EINVAL;
147 }
148
149 raw_spin_lock_irqsave(&desc->lock, flags);
150 desc->irq_data.chip_data = data; 121 desc->irq_data.chip_data = data;
151 raw_spin_unlock_irqrestore(&desc->lock, flags); 122 irq_put_desc_unlock(desc, flags);
152
153 return 0; 123 return 0;
154} 124}
155EXPORT_SYMBOL(set_irq_chip_data); 125EXPORT_SYMBOL(irq_set_chip_data);
156 126
157struct irq_data *irq_get_irq_data(unsigned int irq) 127struct irq_data *irq_get_irq_data(unsigned int irq)
158{ 128{
@@ -162,72 +132,75 @@ struct irq_data *irq_get_irq_data(unsigned int irq)
162} 132}
163EXPORT_SYMBOL_GPL(irq_get_irq_data); 133EXPORT_SYMBOL_GPL(irq_get_irq_data);
164 134
165/** 135static void irq_state_clr_disabled(struct irq_desc *desc)
166 * set_irq_nested_thread - Set/Reset the IRQ_NESTED_THREAD flag of an irq
167 *
168 * @irq: Interrupt number
169 * @nest: 0 to clear / 1 to set the IRQ_NESTED_THREAD flag
170 *
171 * The IRQ_NESTED_THREAD flag indicates that on
172 * request_threaded_irq() no separate interrupt thread should be
173 * created for the irq as the handler are called nested in the
174 * context of a demultiplexing interrupt handler thread.
175 */
176void set_irq_nested_thread(unsigned int irq, int nest)
177{ 136{
178 struct irq_desc *desc = irq_to_desc(irq); 137 desc->istate &= ~IRQS_DISABLED;
179 unsigned long flags; 138 irq_compat_clr_disabled(desc);
180
181 if (!desc)
182 return;
183
184 raw_spin_lock_irqsave(&desc->lock, flags);
185 if (nest)
186 desc->status |= IRQ_NESTED_THREAD;
187 else
188 desc->status &= ~IRQ_NESTED_THREAD;
189 raw_spin_unlock_irqrestore(&desc->lock, flags);
190} 139}
191EXPORT_SYMBOL_GPL(set_irq_nested_thread);
192 140
193/* 141static void irq_state_set_disabled(struct irq_desc *desc)
194 * default enable function
195 */
196static void default_enable(struct irq_data *data)
197{ 142{
198 struct irq_desc *desc = irq_data_to_desc(data); 143 desc->istate |= IRQS_DISABLED;
144 irq_compat_set_disabled(desc);
145}
199 146
200 desc->irq_data.chip->irq_unmask(&desc->irq_data); 147static void irq_state_clr_masked(struct irq_desc *desc)
201 desc->status &= ~IRQ_MASKED; 148{
149 desc->istate &= ~IRQS_MASKED;
150 irq_compat_clr_masked(desc);
202} 151}
203 152
204/* 153static void irq_state_set_masked(struct irq_desc *desc)
205 * default disable function
206 */
207static void default_disable(struct irq_data *data)
208{ 154{
155 desc->istate |= IRQS_MASKED;
156 irq_compat_set_masked(desc);
209} 157}
210 158
211/* 159int irq_startup(struct irq_desc *desc)
212 * default startup function
213 */
214static unsigned int default_startup(struct irq_data *data)
215{ 160{
216 struct irq_desc *desc = irq_data_to_desc(data); 161 irq_state_clr_disabled(desc);
162 desc->depth = 0;
163
164 if (desc->irq_data.chip->irq_startup) {
165 int ret = desc->irq_data.chip->irq_startup(&desc->irq_data);
166 irq_state_clr_masked(desc);
167 return ret;
168 }
217 169
218 desc->irq_data.chip->irq_enable(data); 170 irq_enable(desc);
219 return 0; 171 return 0;
220} 172}
221 173
222/* 174void irq_shutdown(struct irq_desc *desc)
223 * default shutdown function
224 */
225static void default_shutdown(struct irq_data *data)
226{ 175{
227 struct irq_desc *desc = irq_data_to_desc(data); 176 irq_state_set_disabled(desc);
177 desc->depth = 1;
178 if (desc->irq_data.chip->irq_shutdown)
179 desc->irq_data.chip->irq_shutdown(&desc->irq_data);
180 if (desc->irq_data.chip->irq_disable)
181 desc->irq_data.chip->irq_disable(&desc->irq_data);
182 else
183 desc->irq_data.chip->irq_mask(&desc->irq_data);
184 irq_state_set_masked(desc);
185}
228 186
229 desc->irq_data.chip->irq_mask(&desc->irq_data); 187void irq_enable(struct irq_desc *desc)
230 desc->status |= IRQ_MASKED; 188{
189 irq_state_clr_disabled(desc);
190 if (desc->irq_data.chip->irq_enable)
191 desc->irq_data.chip->irq_enable(&desc->irq_data);
192 else
193 desc->irq_data.chip->irq_unmask(&desc->irq_data);
194 irq_state_clr_masked(desc);
195}
196
197void irq_disable(struct irq_desc *desc)
198{
199 irq_state_set_disabled(desc);
200 if (desc->irq_data.chip->irq_disable) {
201 desc->irq_data.chip->irq_disable(&desc->irq_data);
202 irq_state_set_masked(desc);
203 }
231} 204}
232 205
233#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED 206#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
@@ -315,10 +288,6 @@ static void compat_bus_sync_unlock(struct irq_data *data)
315void irq_chip_set_defaults(struct irq_chip *chip) 288void irq_chip_set_defaults(struct irq_chip *chip)
316{ 289{
317#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED 290#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
318 /*
319 * Compat fixup functions need to be before we set the
320 * defaults for enable/disable/startup/shutdown
321 */
322 if (chip->enable) 291 if (chip->enable)
323 chip->irq_enable = compat_irq_enable; 292 chip->irq_enable = compat_irq_enable;
324 if (chip->disable) 293 if (chip->disable)
@@ -327,33 +296,8 @@ void irq_chip_set_defaults(struct irq_chip *chip)
327 chip->irq_shutdown = compat_irq_shutdown; 296 chip->irq_shutdown = compat_irq_shutdown;
328 if (chip->startup) 297 if (chip->startup)
329 chip->irq_startup = compat_irq_startup; 298 chip->irq_startup = compat_irq_startup;
330#endif
331 /*
332 * The real defaults
333 */
334 if (!chip->irq_enable)
335 chip->irq_enable = default_enable;
336 if (!chip->irq_disable)
337 chip->irq_disable = default_disable;
338 if (!chip->irq_startup)
339 chip->irq_startup = default_startup;
340 /*
341 * We use chip->irq_disable, when the user provided its own. When
342 * we have default_disable set for chip->irq_disable, then we need
343 * to use default_shutdown, otherwise the irq line is not
344 * disabled on free_irq():
345 */
346 if (!chip->irq_shutdown)
347 chip->irq_shutdown = chip->irq_disable != default_disable ?
348 chip->irq_disable : default_shutdown;
349
350#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
351 if (!chip->end) 299 if (!chip->end)
352 chip->end = dummy_irq_chip.end; 300 chip->end = dummy_irq_chip.end;
353
354 /*
355 * Now fix up the remaining compat handlers
356 */
357 if (chip->bus_lock) 301 if (chip->bus_lock)
358 chip->irq_bus_lock = compat_bus_lock; 302 chip->irq_bus_lock = compat_bus_lock;
359 if (chip->bus_sync_unlock) 303 if (chip->bus_sync_unlock)
@@ -388,22 +332,22 @@ static inline void mask_ack_irq(struct irq_desc *desc)
388 if (desc->irq_data.chip->irq_ack) 332 if (desc->irq_data.chip->irq_ack)
389 desc->irq_data.chip->irq_ack(&desc->irq_data); 333 desc->irq_data.chip->irq_ack(&desc->irq_data);
390 } 334 }
391 desc->status |= IRQ_MASKED; 335 irq_state_set_masked(desc);
392} 336}
393 337
394static inline void mask_irq(struct irq_desc *desc) 338void mask_irq(struct irq_desc *desc)
395{ 339{
396 if (desc->irq_data.chip->irq_mask) { 340 if (desc->irq_data.chip->irq_mask) {
397 desc->irq_data.chip->irq_mask(&desc->irq_data); 341 desc->irq_data.chip->irq_mask(&desc->irq_data);
398 desc->status |= IRQ_MASKED; 342 irq_state_set_masked(desc);
399 } 343 }
400} 344}
401 345
402static inline void unmask_irq(struct irq_desc *desc) 346void unmask_irq(struct irq_desc *desc)
403{ 347{
404 if (desc->irq_data.chip->irq_unmask) { 348 if (desc->irq_data.chip->irq_unmask) {
405 desc->irq_data.chip->irq_unmask(&desc->irq_data); 349 desc->irq_data.chip->irq_unmask(&desc->irq_data);
406 desc->status &= ~IRQ_MASKED; 350 irq_state_clr_masked(desc);
407 } 351 }
408} 352}
409 353
@@ -428,10 +372,11 @@ void handle_nested_irq(unsigned int irq)
428 kstat_incr_irqs_this_cpu(irq, desc); 372 kstat_incr_irqs_this_cpu(irq, desc);
429 373
430 action = desc->action; 374 action = desc->action;
431 if (unlikely(!action || (desc->status & IRQ_DISABLED))) 375 if (unlikely(!action || (desc->istate & IRQS_DISABLED)))
432 goto out_unlock; 376 goto out_unlock;
433 377
434 desc->status |= IRQ_INPROGRESS; 378 irq_compat_set_progress(desc);
379 desc->istate |= IRQS_INPROGRESS;
435 raw_spin_unlock_irq(&desc->lock); 380 raw_spin_unlock_irq(&desc->lock);
436 381
437 action_ret = action->thread_fn(action->irq, action->dev_id); 382 action_ret = action->thread_fn(action->irq, action->dev_id);
@@ -439,13 +384,21 @@ void handle_nested_irq(unsigned int irq)
439 note_interrupt(irq, desc, action_ret); 384 note_interrupt(irq, desc, action_ret);
440 385
441 raw_spin_lock_irq(&desc->lock); 386 raw_spin_lock_irq(&desc->lock);
442 desc->status &= ~IRQ_INPROGRESS; 387 desc->istate &= ~IRQS_INPROGRESS;
388 irq_compat_clr_progress(desc);
443 389
444out_unlock: 390out_unlock:
445 raw_spin_unlock_irq(&desc->lock); 391 raw_spin_unlock_irq(&desc->lock);
446} 392}
447EXPORT_SYMBOL_GPL(handle_nested_irq); 393EXPORT_SYMBOL_GPL(handle_nested_irq);
448 394
395static bool irq_check_poll(struct irq_desc *desc)
396{
397 if (!(desc->istate & IRQS_POLL_INPROGRESS))
398 return false;
399 return irq_wait_for_poll(desc);
400}
401
449/** 402/**
450 * handle_simple_irq - Simple and software-decoded IRQs. 403 * handle_simple_irq - Simple and software-decoded IRQs.
451 * @irq: the interrupt number 404 * @irq: the interrupt number
@@ -461,29 +414,20 @@ EXPORT_SYMBOL_GPL(handle_nested_irq);
461void 414void
462handle_simple_irq(unsigned int irq, struct irq_desc *desc) 415handle_simple_irq(unsigned int irq, struct irq_desc *desc)
463{ 416{
464 struct irqaction *action;
465 irqreturn_t action_ret;
466
467 raw_spin_lock(&desc->lock); 417 raw_spin_lock(&desc->lock);
468 418
469 if (unlikely(desc->status & IRQ_INPROGRESS)) 419 if (unlikely(desc->istate & IRQS_INPROGRESS))
470 goto out_unlock; 420 if (!irq_check_poll(desc))
471 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); 421 goto out_unlock;
422
423 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
472 kstat_incr_irqs_this_cpu(irq, desc); 424 kstat_incr_irqs_this_cpu(irq, desc);
473 425
474 action = desc->action; 426 if (unlikely(!desc->action || (desc->istate & IRQS_DISABLED)))
475 if (unlikely(!action || (desc->status & IRQ_DISABLED)))
476 goto out_unlock; 427 goto out_unlock;
477 428
478 desc->status |= IRQ_INPROGRESS; 429 handle_irq_event(desc);
479 raw_spin_unlock(&desc->lock);
480 430
481 action_ret = handle_IRQ_event(irq, action);
482 if (!noirqdebug)
483 note_interrupt(irq, desc, action_ret);
484
485 raw_spin_lock(&desc->lock);
486 desc->status &= ~IRQ_INPROGRESS;
487out_unlock: 431out_unlock:
488 raw_spin_unlock(&desc->lock); 432 raw_spin_unlock(&desc->lock);
489} 433}
@@ -501,42 +445,42 @@ out_unlock:
501void 445void
502handle_level_irq(unsigned int irq, struct irq_desc *desc) 446handle_level_irq(unsigned int irq, struct irq_desc *desc)
503{ 447{
504 struct irqaction *action;
505 irqreturn_t action_ret;
506
507 raw_spin_lock(&desc->lock); 448 raw_spin_lock(&desc->lock);
508 mask_ack_irq(desc); 449 mask_ack_irq(desc);
509 450
510 if (unlikely(desc->status & IRQ_INPROGRESS)) 451 if (unlikely(desc->istate & IRQS_INPROGRESS))
511 goto out_unlock; 452 if (!irq_check_poll(desc))
512 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); 453 goto out_unlock;
454
455 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
513 kstat_incr_irqs_this_cpu(irq, desc); 456 kstat_incr_irqs_this_cpu(irq, desc);
514 457
515 /* 458 /*
516 * If its disabled or no action available 459 * If its disabled or no action available
517 * keep it masked and get out of here 460 * keep it masked and get out of here
518 */ 461 */
519 action = desc->action; 462 if (unlikely(!desc->action || (desc->istate & IRQS_DISABLED)))
520 if (unlikely(!action || (desc->status & IRQ_DISABLED)))
521 goto out_unlock; 463 goto out_unlock;
522 464
523 desc->status |= IRQ_INPROGRESS; 465 handle_irq_event(desc);
524 raw_spin_unlock(&desc->lock);
525
526 action_ret = handle_IRQ_event(irq, action);
527 if (!noirqdebug)
528 note_interrupt(irq, desc, action_ret);
529 466
530 raw_spin_lock(&desc->lock); 467 if (!(desc->istate & (IRQS_DISABLED | IRQS_ONESHOT)))
531 desc->status &= ~IRQ_INPROGRESS;
532
533 if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT)))
534 unmask_irq(desc); 468 unmask_irq(desc);
535out_unlock: 469out_unlock:
536 raw_spin_unlock(&desc->lock); 470 raw_spin_unlock(&desc->lock);
537} 471}
538EXPORT_SYMBOL_GPL(handle_level_irq); 472EXPORT_SYMBOL_GPL(handle_level_irq);
539 473
474#ifdef CONFIG_IRQ_PREFLOW_FASTEOI
475static inline void preflow_handler(struct irq_desc *desc)
476{
477 if (desc->preflow_handler)
478 desc->preflow_handler(&desc->irq_data);
479}
480#else
481static inline void preflow_handler(struct irq_desc *desc) { }
482#endif
483
540/** 484/**
541 * handle_fasteoi_irq - irq handler for transparent controllers 485 * handle_fasteoi_irq - irq handler for transparent controllers
542 * @irq: the interrupt number 486 * @irq: the interrupt number
@@ -550,42 +494,41 @@ EXPORT_SYMBOL_GPL(handle_level_irq);
550void 494void
551handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) 495handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
552{ 496{
553 struct irqaction *action;
554 irqreturn_t action_ret;
555
556 raw_spin_lock(&desc->lock); 497 raw_spin_lock(&desc->lock);
557 498
558 if (unlikely(desc->status & IRQ_INPROGRESS)) 499 if (unlikely(desc->istate & IRQS_INPROGRESS))
559 goto out; 500 if (!irq_check_poll(desc))
501 goto out;
560 502
561 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); 503 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
562 kstat_incr_irqs_this_cpu(irq, desc); 504 kstat_incr_irqs_this_cpu(irq, desc);
563 505
564 /* 506 /*
565 * If its disabled or no action available 507 * If its disabled or no action available
566 * then mask it and get out of here: 508 * then mask it and get out of here:
567 */ 509 */
568 action = desc->action; 510 if (unlikely(!desc->action || (desc->istate & IRQS_DISABLED))) {
569 if (unlikely(!action || (desc->status & IRQ_DISABLED))) { 511 irq_compat_set_pending(desc);
570 desc->status |= IRQ_PENDING; 512 desc->istate |= IRQS_PENDING;
571 mask_irq(desc); 513 mask_irq(desc);
572 goto out; 514 goto out;
573 } 515 }
574 516
575 desc->status |= IRQ_INPROGRESS; 517 if (desc->istate & IRQS_ONESHOT)
576 desc->status &= ~IRQ_PENDING; 518 mask_irq(desc);
577 raw_spin_unlock(&desc->lock);
578 519
579 action_ret = handle_IRQ_event(irq, action); 520 preflow_handler(desc);
580 if (!noirqdebug) 521 handle_irq_event(desc);
581 note_interrupt(irq, desc, action_ret);
582 522
583 raw_spin_lock(&desc->lock); 523out_eoi:
584 desc->status &= ~IRQ_INPROGRESS;
585out:
586 desc->irq_data.chip->irq_eoi(&desc->irq_data); 524 desc->irq_data.chip->irq_eoi(&desc->irq_data);
587 525out_unlock:
588 raw_spin_unlock(&desc->lock); 526 raw_spin_unlock(&desc->lock);
527 return;
528out:
529 if (!(desc->irq_data.chip->flags & IRQCHIP_EOI_IF_HANDLED))
530 goto out_eoi;
531 goto out_unlock;
589} 532}
590 533
591/** 534/**
@@ -609,32 +552,28 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
609{ 552{
610 raw_spin_lock(&desc->lock); 553 raw_spin_lock(&desc->lock);
611 554
612 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); 555 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
613
614 /* 556 /*
615 * If we're currently running this IRQ, or its disabled, 557 * If we're currently running this IRQ, or its disabled,
616 * we shouldn't process the IRQ. Mark it pending, handle 558 * we shouldn't process the IRQ. Mark it pending, handle
617 * the necessary masking and go out 559 * the necessary masking and go out
618 */ 560 */
619 if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) || 561 if (unlikely((desc->istate & (IRQS_DISABLED | IRQS_INPROGRESS) ||
620 !desc->action)) { 562 !desc->action))) {
621 desc->status |= (IRQ_PENDING | IRQ_MASKED); 563 if (!irq_check_poll(desc)) {
622 mask_ack_irq(desc); 564 irq_compat_set_pending(desc);
623 goto out_unlock; 565 desc->istate |= IRQS_PENDING;
566 mask_ack_irq(desc);
567 goto out_unlock;
568 }
624 } 569 }
625 kstat_incr_irqs_this_cpu(irq, desc); 570 kstat_incr_irqs_this_cpu(irq, desc);
626 571
627 /* Start handling the irq */ 572 /* Start handling the irq */
628 desc->irq_data.chip->irq_ack(&desc->irq_data); 573 desc->irq_data.chip->irq_ack(&desc->irq_data);
629 574
630 /* Mark the IRQ currently in progress.*/
631 desc->status |= IRQ_INPROGRESS;
632
633 do { 575 do {
634 struct irqaction *action = desc->action; 576 if (unlikely(!desc->action)) {
635 irqreturn_t action_ret;
636
637 if (unlikely(!action)) {
638 mask_irq(desc); 577 mask_irq(desc);
639 goto out_unlock; 578 goto out_unlock;
640 } 579 }
@@ -644,22 +583,17 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
644 * one, we could have masked the irq. 583 * one, we could have masked the irq.
645 * Renable it, if it was not disabled in meantime. 584 * Renable it, if it was not disabled in meantime.
646 */ 585 */
647 if (unlikely((desc->status & 586 if (unlikely(desc->istate & IRQS_PENDING)) {
648 (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) == 587 if (!(desc->istate & IRQS_DISABLED) &&
649 (IRQ_PENDING | IRQ_MASKED))) { 588 (desc->istate & IRQS_MASKED))
650 unmask_irq(desc); 589 unmask_irq(desc);
651 } 590 }
652 591
653 desc->status &= ~IRQ_PENDING; 592 handle_irq_event(desc);
654 raw_spin_unlock(&desc->lock);
655 action_ret = handle_IRQ_event(irq, action);
656 if (!noirqdebug)
657 note_interrupt(irq, desc, action_ret);
658 raw_spin_lock(&desc->lock);
659 593
660 } while ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == IRQ_PENDING); 594 } while ((desc->istate & IRQS_PENDING) &&
595 !(desc->istate & IRQS_DISABLED));
661 596
662 desc->status &= ~IRQ_INPROGRESS;
663out_unlock: 597out_unlock:
664 raw_spin_unlock(&desc->lock); 598 raw_spin_unlock(&desc->lock);
665} 599}
@@ -674,103 +608,84 @@ out_unlock:
674void 608void
675handle_percpu_irq(unsigned int irq, struct irq_desc *desc) 609handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
676{ 610{
677 irqreturn_t action_ret; 611 struct irq_chip *chip = irq_desc_get_chip(desc);
678 612
679 kstat_incr_irqs_this_cpu(irq, desc); 613 kstat_incr_irqs_this_cpu(irq, desc);
680 614
681 if (desc->irq_data.chip->irq_ack) 615 if (chip->irq_ack)
682 desc->irq_data.chip->irq_ack(&desc->irq_data); 616 chip->irq_ack(&desc->irq_data);
683 617
684 action_ret = handle_IRQ_event(irq, desc->action); 618 handle_irq_event_percpu(desc, desc->action);
685 if (!noirqdebug)
686 note_interrupt(irq, desc, action_ret);
687 619
688 if (desc->irq_data.chip->irq_eoi) 620 if (chip->irq_eoi)
689 desc->irq_data.chip->irq_eoi(&desc->irq_data); 621 chip->irq_eoi(&desc->irq_data);
690} 622}
691 623
692void 624void
693__set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, 625__irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
694 const char *name) 626 const char *name)
695{ 627{
696 struct irq_desc *desc = irq_to_desc(irq);
697 unsigned long flags; 628 unsigned long flags;
629 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
698 630
699 if (!desc) { 631 if (!desc)
700 printk(KERN_ERR
701 "Trying to install type control for IRQ%d\n", irq);
702 return; 632 return;
703 }
704 633
705 if (!handle) 634 if (!handle) {
706 handle = handle_bad_irq; 635 handle = handle_bad_irq;
707 else if (desc->irq_data.chip == &no_irq_chip) { 636 } else {
708 printk(KERN_WARNING "Trying to install %sinterrupt handler " 637 if (WARN_ON(desc->irq_data.chip == &no_irq_chip))
709 "for IRQ%d\n", is_chained ? "chained " : "", irq); 638 goto out;
710 /*
711 * Some ARM implementations install a handler for really dumb
712 * interrupt hardware without setting an irq_chip. This worked
713 * with the ARM no_irq_chip but the check in setup_irq would
714 * prevent us to setup the interrupt at all. Switch it to
715 * dummy_irq_chip for easy transition.
716 */
717 desc->irq_data.chip = &dummy_irq_chip;
718 } 639 }
719 640
720 chip_bus_lock(desc);
721 raw_spin_lock_irqsave(&desc->lock, flags);
722
723 /* Uninstall? */ 641 /* Uninstall? */
724 if (handle == handle_bad_irq) { 642 if (handle == handle_bad_irq) {
725 if (desc->irq_data.chip != &no_irq_chip) 643 if (desc->irq_data.chip != &no_irq_chip)
726 mask_ack_irq(desc); 644 mask_ack_irq(desc);
727 desc->status |= IRQ_DISABLED; 645 irq_compat_set_disabled(desc);
646 desc->istate |= IRQS_DISABLED;
728 desc->depth = 1; 647 desc->depth = 1;
729 } 648 }
730 desc->handle_irq = handle; 649 desc->handle_irq = handle;
731 desc->name = name; 650 desc->name = name;
732 651
733 if (handle != handle_bad_irq && is_chained) { 652 if (handle != handle_bad_irq && is_chained) {
734 desc->status &= ~IRQ_DISABLED; 653 irq_settings_set_noprobe(desc);
735 desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE; 654 irq_settings_set_norequest(desc);
736 desc->depth = 0; 655 irq_startup(desc);
737 desc->irq_data.chip->irq_startup(&desc->irq_data);
738 } 656 }
739 raw_spin_unlock_irqrestore(&desc->lock, flags); 657out:
740 chip_bus_sync_unlock(desc); 658 irq_put_desc_busunlock(desc, flags);
741}
742EXPORT_SYMBOL_GPL(__set_irq_handler);
743
744void
745set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip,
746 irq_flow_handler_t handle)
747{
748 set_irq_chip(irq, chip);
749 __set_irq_handler(irq, handle, 0, NULL);
750} 659}
660EXPORT_SYMBOL_GPL(__irq_set_handler);
751 661
752void 662void
753set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, 663irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
754 irq_flow_handler_t handle, const char *name) 664 irq_flow_handler_t handle, const char *name)
755{ 665{
756 set_irq_chip(irq, chip); 666 irq_set_chip(irq, chip);
757 __set_irq_handler(irq, handle, 0, name); 667 __irq_set_handler(irq, handle, 0, name);
758} 668}
759 669
760void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) 670void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
761{ 671{
762 struct irq_desc *desc = irq_to_desc(irq);
763 unsigned long flags; 672 unsigned long flags;
673 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
764 674
765 if (!desc) 675 if (!desc)
766 return; 676 return;
677 irq_settings_clr_and_set(desc, clr, set);
678
679 irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU |
680 IRQD_TRIGGER_MASK | IRQD_LEVEL | IRQD_MOVE_PCNTXT);
681 if (irq_settings_has_no_balance_set(desc))
682 irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
683 if (irq_settings_is_per_cpu(desc))
684 irqd_set(&desc->irq_data, IRQD_PER_CPU);
685 if (irq_settings_can_move_pcntxt(desc))
686 irqd_set(&desc->irq_data, IRQD_MOVE_PCNTXT);
767 687
768 /* Sanitize flags */ 688 irqd_set(&desc->irq_data, irq_settings_get_trigger_mask(desc));
769 set &= IRQF_MODIFY_MASK;
770 clr &= IRQF_MODIFY_MASK;
771 689
772 raw_spin_lock_irqsave(&desc->lock, flags); 690 irq_put_desc_unlock(desc, flags);
773 desc->status &= ~clr;
774 desc->status |= set;
775 raw_spin_unlock_irqrestore(&desc->lock, flags);
776} 691}
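
The chip.c conversion also renames the setup helpers (irq_set_chip(), irq_set_chip_data(), irq_set_handler_data(), irq_set_chip_and_handler_name(), __irq_set_handler()) and routes flag updates through irq_modify_status(). Below is a hedged sketch of how a platform would now wire one interrupt line; my_irq_chip and the priv pointer are assumed to exist elsewhere.

/* Hedged sketch of wiring one interrupt line with the renamed helpers from
 * this file; my_irq_chip is assumed to be defined elsewhere (for instance
 * as in the minimal chip sketch earlier) and priv is a placeholder. */
#include <linux/irq.h>

extern struct irq_chip my_irq_chip;

static void my_setup_one_irq(unsigned int irq, void *priv)
{
	irq_set_chip_and_handler_name(irq, &my_irq_chip, handle_level_irq,
				      "level");
	irq_set_chip_data(irq, priv);

	/* clear NOREQUEST/NOPROBE left over from boot, keep everything else */
	irq_modify_status(irq, IRQ_NOREQUEST | IRQ_NOPROBE, 0);
}
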
diff --git a/kernel/irq/compat.h b/kernel/irq/compat.h
new file mode 100644
index 000000000000..6bbaf66aca85
--- /dev/null
+++ b/kernel/irq/compat.h
@@ -0,0 +1,72 @@
1/*
2 * Compat layer for transition period
3 */
4#ifndef CONFIG_GENERIC_HARDIRQS_NO_COMPAT
5static inline void irq_compat_set_progress(struct irq_desc *desc)
6{
7 desc->status |= IRQ_INPROGRESS;
8}
9
10static inline void irq_compat_clr_progress(struct irq_desc *desc)
11{
12 desc->status &= ~IRQ_INPROGRESS;
13}
14static inline void irq_compat_set_disabled(struct irq_desc *desc)
15{
16 desc->status |= IRQ_DISABLED;
17}
18static inline void irq_compat_clr_disabled(struct irq_desc *desc)
19{
20 desc->status &= ~IRQ_DISABLED;
21}
22static inline void irq_compat_set_pending(struct irq_desc *desc)
23{
24 desc->status |= IRQ_PENDING;
25}
26
27static inline void irq_compat_clr_pending(struct irq_desc *desc)
28{
29 desc->status &= ~IRQ_PENDING;
30}
31static inline void irq_compat_set_masked(struct irq_desc *desc)
32{
33 desc->status |= IRQ_MASKED;
34}
35
36static inline void irq_compat_clr_masked(struct irq_desc *desc)
37{
38 desc->status &= ~IRQ_MASKED;
39}
40static inline void irq_compat_set_move_pending(struct irq_desc *desc)
41{
42 desc->status |= IRQ_MOVE_PENDING;
43}
44
45static inline void irq_compat_clr_move_pending(struct irq_desc *desc)
46{
47 desc->status &= ~IRQ_MOVE_PENDING;
48}
49static inline void irq_compat_set_affinity(struct irq_desc *desc)
50{
51 desc->status |= IRQ_AFFINITY_SET;
52}
53
54static inline void irq_compat_clr_affinity(struct irq_desc *desc)
55{
56 desc->status &= ~IRQ_AFFINITY_SET;
57}
58#else
59static inline void irq_compat_set_progress(struct irq_desc *desc) { }
60static inline void irq_compat_clr_progress(struct irq_desc *desc) { }
61static inline void irq_compat_set_disabled(struct irq_desc *desc) { }
62static inline void irq_compat_clr_disabled(struct irq_desc *desc) { }
63static inline void irq_compat_set_pending(struct irq_desc *desc) { }
64static inline void irq_compat_clr_pending(struct irq_desc *desc) { }
65static inline void irq_compat_set_masked(struct irq_desc *desc) { }
66static inline void irq_compat_clr_masked(struct irq_desc *desc) { }
67static inline void irq_compat_set_move_pending(struct irq_desc *desc) { }
68static inline void irq_compat_clr_move_pending(struct irq_desc *desc) { }
69static inline void irq_compat_set_affinity(struct irq_desc *desc) { }
70static inline void irq_compat_clr_affinity(struct irq_desc *desc) { }
71#endif
72
diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h
new file mode 100644
index 000000000000..d1a33b7fa61d
--- /dev/null
+++ b/kernel/irq/debug.h
@@ -0,0 +1,40 @@
1/*
2 * Debugging printout:
3 */
4
5#include <linux/kallsyms.h>
6
7#define P(f) if (desc->status & f) printk("%14s set\n", #f)
8#define PS(f) if (desc->istate & f) printk("%14s set\n", #f)
9
10static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
11{
12 printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n",
13 irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled);
14 printk("->handle_irq(): %p, ", desc->handle_irq);
15 print_symbol("%s\n", (unsigned long)desc->handle_irq);
16 printk("->irq_data.chip(): %p, ", desc->irq_data.chip);
17 print_symbol("%s\n", (unsigned long)desc->irq_data.chip);
18 printk("->action(): %p\n", desc->action);
19 if (desc->action) {
20 printk("->action->handler(): %p, ", desc->action->handler);
21 print_symbol("%s\n", (unsigned long)desc->action->handler);
22 }
23
24 P(IRQ_LEVEL);
25 P(IRQ_PER_CPU);
26 P(IRQ_NOPROBE);
27 P(IRQ_NOREQUEST);
28 P(IRQ_NOAUTOEN);
29
30 PS(IRQS_AUTODETECT);
31 PS(IRQS_INPROGRESS);
32 PS(IRQS_REPLAY);
33 PS(IRQS_WAITING);
34 PS(IRQS_DISABLED);
35 PS(IRQS_PENDING);
36 PS(IRQS_MASKED);
37}
38
39#undef P
40#undef PS
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index e2347eb63306..517561fc7317 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -51,30 +51,92 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action)
51 "but no thread function available.", irq, action->name); 51 "but no thread function available.", irq, action->name);
52} 52}
53 53
54/** 54static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action)
55 * handle_IRQ_event - irq action chain handler 55{
56 * @irq: the interrupt number 56 /*
57 * @action: the interrupt action chain for this irq 57 * Wake up the handler thread for this action. In case the
58 * 58 * thread crashed and was killed we just pretend that we
59 * Handles the action chain of an irq event 59 * handled the interrupt. The hardirq handler has disabled the
60 */ 60 * device interrupt, so no irq storm is lurking. If the
61irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) 61 * RUNTHREAD bit is already set, nothing to do.
62 */
63 if (test_bit(IRQTF_DIED, &action->thread_flags) ||
64 test_and_set_bit(IRQTF_RUNTHREAD, &action->thread_flags))
65 return;
66
67 /*
68 * It's safe to OR the mask lockless here. We have only two
69 * places which write to threads_oneshot: This code and the
70 * irq thread.
71 *
72 * This code is the hard irq context and can never run on two
73 * cpus in parallel. If it ever does we have more serious
74 * problems than this bitmask.
75 *
76 * The irq threads of this irq which clear their "running" bit
77 * in threads_oneshot are serialized via desc->lock against
78 * each other and they are serialized against this code by
79 * IRQS_INPROGRESS.
80 *
81 * Hard irq handler:
82 *
83 * spin_lock(desc->lock);
84 * desc->state |= IRQS_INPROGRESS;
85 * spin_unlock(desc->lock);
86 * set_bit(IRQTF_RUNTHREAD, &action->thread_flags);
87 * desc->threads_oneshot |= mask;
88 * spin_lock(desc->lock);
89 * desc->state &= ~IRQS_INPROGRESS;
90 * spin_unlock(desc->lock);
91 *
92 * irq thread:
93 *
94 * again:
95 * spin_lock(desc->lock);
96 * if (desc->state & IRQS_INPROGRESS) {
97 * spin_unlock(desc->lock);
98 * while(desc->state & IRQS_INPROGRESS)
99 * cpu_relax();
100 * goto again;
101 * }
102 * if (!test_bit(IRQTF_RUNTHREAD, &action->thread_flags))
103 * desc->threads_oneshot &= ~mask;
104 * spin_unlock(desc->lock);
105 *
106 * So either the thread waits for us to clear IRQS_INPROGRESS
107 * or we are waiting in the flow handler for desc->lock to be
108 * released before we reach this point. The thread also checks
109 * IRQTF_RUNTHREAD under desc->lock. If set it leaves
110 * threads_oneshot untouched and runs the thread another time.
111 */
112 desc->threads_oneshot |= action->thread_mask;
113 wake_up_process(action->thread);
114}
115
116irqreturn_t
117handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
62{ 118{
63 irqreturn_t ret, retval = IRQ_NONE; 119 irqreturn_t retval = IRQ_NONE;
64 unsigned int status = 0; 120 unsigned int random = 0, irq = desc->irq_data.irq;
65 121
66 do { 122 do {
123 irqreturn_t res;
124
67 trace_irq_handler_entry(irq, action); 125 trace_irq_handler_entry(irq, action);
68 ret = action->handler(irq, action->dev_id); 126 res = action->handler(irq, action->dev_id);
69 trace_irq_handler_exit(irq, action, ret); 127 trace_irq_handler_exit(irq, action, res);
128
129 if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pF enabled interrupts\n",
130 irq, action->handler))
131 local_irq_disable();
70 132
71 switch (ret) { 133 switch (res) {
72 case IRQ_WAKE_THREAD: 134 case IRQ_WAKE_THREAD:
73 /* 135 /*
74 * Set result to handled so the spurious check 136 * Set result to handled so the spurious check
75 * does not trigger. 137 * does not trigger.
76 */ 138 */
77 ret = IRQ_HANDLED; 139 res = IRQ_HANDLED;
78 140
79 /* 141 /*
80 * Catch drivers which return WAKE_THREAD but 142 * Catch drivers which return WAKE_THREAD but
@@ -85,147 +147,56 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
85 break; 147 break;
86 } 148 }
87 149
88 /* 150 irq_wake_thread(desc, action);
89 * Wake up the handler thread for this
90 * action. In case the thread crashed and was
91 * killed we just pretend that we handled the
92 * interrupt. The hardirq handler above has
93 * disabled the device interrupt, so no irq
94 * storm is lurking.
95 */
96 if (likely(!test_bit(IRQTF_DIED,
97 &action->thread_flags))) {
98 set_bit(IRQTF_RUNTHREAD, &action->thread_flags);
99 wake_up_process(action->thread);
100 }
101 151
102 /* Fall through to add to randomness */ 152 /* Fall through to add to randomness */
103 case IRQ_HANDLED: 153 case IRQ_HANDLED:
104 status |= action->flags; 154 random |= action->flags;
105 break; 155 break;
106 156
107 default: 157 default:
108 break; 158 break;
109 } 159 }
110 160
111 retval |= ret; 161 retval |= res;
112 action = action->next; 162 action = action->next;
113 } while (action); 163 } while (action);
114 164
115 if (status & IRQF_SAMPLE_RANDOM) 165 if (random & IRQF_SAMPLE_RANDOM)
116 add_interrupt_randomness(irq); 166 add_interrupt_randomness(irq);
117 local_irq_disable();
118 167
168 if (!noirqdebug)
169 note_interrupt(irq, desc, retval);
119 return retval; 170 return retval;
120} 171}
121 172
122#ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ 173irqreturn_t handle_irq_event(struct irq_desc *desc)
174{
175 struct irqaction *action = desc->action;
176 irqreturn_t ret;
123 177
124#ifdef CONFIG_ENABLE_WARN_DEPRECATED 178 irq_compat_clr_pending(desc);
125# warning __do_IRQ is deprecated. Please convert to proper flow handlers 179 desc->istate &= ~IRQS_PENDING;
126#endif 180 irq_compat_set_progress(desc);
181 desc->istate |= IRQS_INPROGRESS;
182 raw_spin_unlock(&desc->lock);
183
184 ret = handle_irq_event_percpu(desc, action);
185
186 raw_spin_lock(&desc->lock);
187 desc->istate &= ~IRQS_INPROGRESS;
188 irq_compat_clr_progress(desc);
189 return ret;
190}
127 191
128/** 192/**
129 * __do_IRQ - original all in one highlevel IRQ handler 193 * handle_IRQ_event - irq action chain handler
130 * @irq: the interrupt number 194 * @irq: the interrupt number
195 * @action: the interrupt action chain for this irq
131 * 196 *
132 * __do_IRQ handles all normal device IRQ's (the special 197 * Handles the action chain of an irq event
133 * SMP cross-CPU interrupts have their own specific
134 * handlers).
135 *
136 * This is the original x86 implementation which is used for every
137 * interrupt type.
138 */ 198 */
139unsigned int __do_IRQ(unsigned int irq) 199irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
140{ 200{
141 struct irq_desc *desc = irq_to_desc(irq); 201 return handle_irq_event_percpu(irq_to_desc(irq), action);
142 struct irqaction *action;
143 unsigned int status;
144
145 kstat_incr_irqs_this_cpu(irq, desc);
146
147 if (CHECK_IRQ_PER_CPU(desc->status)) {
148 irqreturn_t action_ret;
149
150 /*
151 * No locking required for CPU-local interrupts:
152 */
153 if (desc->irq_data.chip->ack)
154 desc->irq_data.chip->ack(irq);
155 if (likely(!(desc->status & IRQ_DISABLED))) {
156 action_ret = handle_IRQ_event(irq, desc->action);
157 if (!noirqdebug)
158 note_interrupt(irq, desc, action_ret);
159 }
160 desc->irq_data.chip->end(irq);
161 return 1;
162 }
163
164 raw_spin_lock(&desc->lock);
165 if (desc->irq_data.chip->ack)
166 desc->irq_data.chip->ack(irq);
167 /*
168 * REPLAY is when Linux resends an IRQ that was dropped earlier
169 * WAITING is used by probe to mark irqs that are being tested
170 */
171 status = desc->status & ~(IRQ_REPLAY | IRQ_WAITING);
172 status |= IRQ_PENDING; /* we _want_ to handle it */
173
174 /*
175 * If the IRQ is disabled for whatever reason, we cannot
176 * use the action we have.
177 */
178 action = NULL;
179 if (likely(!(status & (IRQ_DISABLED | IRQ_INPROGRESS)))) {
180 action = desc->action;
181 status &= ~IRQ_PENDING; /* we commit to handling */
182 status |= IRQ_INPROGRESS; /* we are handling it */
183 }
184 desc->status = status;
185
186 /*
187 * If there is no IRQ handler or it was disabled, exit early.
188 * Since we set PENDING, if another processor is handling
189 * a different instance of this same irq, the other processor
190 * will take care of it.
191 */
192 if (unlikely(!action))
193 goto out;
194
195 /*
196 * Edge triggered interrupts need to remember
197 * pending events.
198 * This applies to any hw interrupts that allow a second
199 * instance of the same irq to arrive while we are in do_IRQ
200 * or in the handler. But the code here only handles the _second_
201 * instance of the irq, not the third or fourth. So it is mostly
202 * useful for irq hardware that does not mask cleanly in an
203 * SMP environment.
204 */
205 for (;;) {
206 irqreturn_t action_ret;
207
208 raw_spin_unlock(&desc->lock);
209
210 action_ret = handle_IRQ_event(irq, action);
211 if (!noirqdebug)
212 note_interrupt(irq, desc, action_ret);
213
214 raw_spin_lock(&desc->lock);
215 if (likely(!(desc->status & IRQ_PENDING)))
216 break;
217 desc->status &= ~IRQ_PENDING;
218 }
219 desc->status &= ~IRQ_INPROGRESS;
220
221out:
222 /*
223 * The ->end() handler has to deal with interrupts which got
224 * disabled while the handler was running.
225 */
226 desc->irq_data.chip->end(irq);
227 raw_spin_unlock(&desc->lock);
228
229 return 1;
230} 202}
231#endif
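
The heart of the new handle_irq_event_percpu() above is a walk over the shared action chain that ORs each handler's return value into a single verdict for the spurious-interrupt logic. A stripped-down userspace sketch of just that accumulation (the types and handlers below are invented stand-ins, not kernel API):

#include <stdio.h>

enum irqreturn { IRQ_NONE = 0, IRQ_HANDLED = 1 };

struct action {
        enum irqreturn (*handler)(void *dev_id);
        void *dev_id;
        struct action *next;
};

/* Walk the chain the way handle_irq_event_percpu() does: run every
 * handler and OR the results into one combined return value. */
static enum irqreturn run_chain(struct action *action)
{
        enum irqreturn retval = IRQ_NONE;

        do {
                retval |= action->handler(action->dev_id);
                action = action->next;
        } while (action);

        return retval;  /* IRQ_NONE here would feed the spurious logic */
}

static enum irqreturn noisy(void *dev)  { (void)dev; return IRQ_HANDLED; }
static enum irqreturn silent(void *dev) { (void)dev; return IRQ_NONE; }

int main(void)
{
        struct action tail = { silent, NULL, NULL };
        struct action head = { noisy, NULL, &tail };

        printf("chain result: %d\n", run_chain(&head));
        return 0;
}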
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 4571ae7e085a..6c6ec9a49027 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -1,27 +1,101 @@
1/* 1/*
2 * IRQ subsystem internal functions and variables: 2 * IRQ subsystem internal functions and variables:
3 *
4 * Do not ever include this file from anything else than
5 * kernel/irq/. Do not even think about using any information outside
6 * of this file for your non core code.
3 */ 7 */
4#include <linux/irqdesc.h> 8#include <linux/irqdesc.h>
5 9
10#ifdef CONFIG_SPARSE_IRQ
11# define IRQ_BITMAP_BITS (NR_IRQS + 8196)
12#else
13# define IRQ_BITMAP_BITS NR_IRQS
14#endif
15
16#define istate core_internal_state__do_not_mess_with_it
17
18#ifdef CONFIG_GENERIC_HARDIRQS_NO_COMPAT
19# define status status_use_accessors
20#endif
21
6extern int noirqdebug; 22extern int noirqdebug;
7 23
24/*
25 * Bits used by threaded handlers:
26 * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run
27 * IRQTF_DIED - handler thread died
28 * IRQTF_WARNED - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed
29 * IRQTF_AFFINITY - irq thread is requested to adjust affinity
30 * IRQTF_FORCED_THREAD - irq action is force threaded
31 */
32enum {
33 IRQTF_RUNTHREAD,
34 IRQTF_DIED,
35 IRQTF_WARNED,
36 IRQTF_AFFINITY,
37 IRQTF_FORCED_THREAD,
38};
39
40/*
41 * Bit masks for desc->state
42 *
43 * IRQS_AUTODETECT - autodetection in progress
44 * IRQS_SPURIOUS_DISABLED - was disabled due to spurious interrupt
45 * detection
46 * IRQS_POLL_INPROGRESS - polling in progress
47 * IRQS_INPROGRESS - Interrupt in progress
48 * IRQS_ONESHOT - irq is not unmasked in primary handler
49 * IRQS_REPLAY - irq is replayed
50 * IRQS_WAITING - irq is waiting
51 * IRQS_DISABLED - irq is disabled
52 * IRQS_PENDING - irq is pending and replayed later
53 * IRQS_MASKED - irq is masked
54 * IRQS_SUSPENDED - irq is suspended
55 */
56enum {
57 IRQS_AUTODETECT = 0x00000001,
58 IRQS_SPURIOUS_DISABLED = 0x00000002,
59 IRQS_POLL_INPROGRESS = 0x00000008,
60 IRQS_INPROGRESS = 0x00000010,
61 IRQS_ONESHOT = 0x00000020,
62 IRQS_REPLAY = 0x00000040,
63 IRQS_WAITING = 0x00000080,
64 IRQS_DISABLED = 0x00000100,
65 IRQS_PENDING = 0x00000200,
66 IRQS_MASKED = 0x00000400,
67 IRQS_SUSPENDED = 0x00000800,
68};
69
70#include "compat.h"
71#include "debug.h"
72#include "settings.h"
73
8#define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) 74#define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data)
9 75
10/* Set default functions for irq_chip structures: */ 76/* Set default functions for irq_chip structures: */
11extern void irq_chip_set_defaults(struct irq_chip *chip); 77extern void irq_chip_set_defaults(struct irq_chip *chip);
12 78
13/* Set default handler: */
14extern void compat_irq_chip_set_default_handler(struct irq_desc *desc);
15
16extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, 79extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
17 unsigned long flags); 80 unsigned long flags);
18extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); 81extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp);
19extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); 82extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume);
20 83
84extern int irq_startup(struct irq_desc *desc);
85extern void irq_shutdown(struct irq_desc *desc);
86extern void irq_enable(struct irq_desc *desc);
87extern void irq_disable(struct irq_desc *desc);
88extern void mask_irq(struct irq_desc *desc);
89extern void unmask_irq(struct irq_desc *desc);
90
21extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); 91extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
22 92
93irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action);
94irqreturn_t handle_irq_event(struct irq_desc *desc);
95
23/* Resending of interrupts :*/ 96/* Resending of interrupts :*/
24void check_irq_resend(struct irq_desc *desc, unsigned int irq); 97void check_irq_resend(struct irq_desc *desc, unsigned int irq);
98bool irq_wait_for_poll(struct irq_desc *desc);
25 99
26#ifdef CONFIG_PROC_FS 100#ifdef CONFIG_PROC_FS
27extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); 101extern void register_irq_proc(unsigned int irq, struct irq_desc *desc);
@@ -37,20 +111,10 @@ static inline void unregister_handler_proc(unsigned int irq,
37 struct irqaction *action) { } 111 struct irqaction *action) { }
38#endif 112#endif
39 113
40extern int irq_select_affinity_usr(unsigned int irq); 114extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask);
41 115
42extern void irq_set_thread_affinity(struct irq_desc *desc); 116extern void irq_set_thread_affinity(struct irq_desc *desc);
43 117
44#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
45static inline void irq_end(unsigned int irq, struct irq_desc *desc)
46{
47 if (desc->irq_data.chip && desc->irq_data.chip->end)
48 desc->irq_data.chip->end(irq);
49}
50#else
51static inline void irq_end(unsigned int irq, struct irq_desc *desc) { }
52#endif
53
54/* Inline functions for support of irq chips on slow busses */ 118/* Inline functions for support of irq chips on slow busses */
55static inline void chip_bus_lock(struct irq_desc *desc) 119static inline void chip_bus_lock(struct irq_desc *desc)
56{ 120{
@@ -64,43 +128,60 @@ static inline void chip_bus_sync_unlock(struct irq_desc *desc)
64 desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data); 128 desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data);
65} 129}
66 130
131struct irq_desc *
132__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus);
133void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus);
134
135static inline struct irq_desc *
136irq_get_desc_buslock(unsigned int irq, unsigned long *flags)
137{
138 return __irq_get_desc_lock(irq, flags, true);
139}
140
141static inline void
142irq_put_desc_busunlock(struct irq_desc *desc, unsigned long flags)
143{
144 __irq_put_desc_unlock(desc, flags, true);
145}
146
147static inline struct irq_desc *
148irq_get_desc_lock(unsigned int irq, unsigned long *flags)
149{
150 return __irq_get_desc_lock(irq, flags, false);
151}
152
153static inline void
154irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags)
155{
156 __irq_put_desc_unlock(desc, flags, false);
157}
158
67/* 159/*
68 * Debugging printout: 160 * Manipulation functions for irq_data.state
69 */ 161 */
162static inline void irqd_set_move_pending(struct irq_data *d)
163{
164 d->state_use_accessors |= IRQD_SETAFFINITY_PENDING;
165 irq_compat_set_move_pending(irq_data_to_desc(d));
166}
70 167
71#include <linux/kallsyms.h> 168static inline void irqd_clr_move_pending(struct irq_data *d)
72 169{
73#define P(f) if (desc->status & f) printk("%14s set\n", #f) 170 d->state_use_accessors &= ~IRQD_SETAFFINITY_PENDING;
171 irq_compat_clr_move_pending(irq_data_to_desc(d));
172}
74 173
75static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) 174static inline void irqd_clear(struct irq_data *d, unsigned int mask)
76{ 175{
77 printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n", 176 d->state_use_accessors &= ~mask;
78 irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled);
79 printk("->handle_irq(): %p, ", desc->handle_irq);
80 print_symbol("%s\n", (unsigned long)desc->handle_irq);
81 printk("->irq_data.chip(): %p, ", desc->irq_data.chip);
82 print_symbol("%s\n", (unsigned long)desc->irq_data.chip);
83 printk("->action(): %p\n", desc->action);
84 if (desc->action) {
85 printk("->action->handler(): %p, ", desc->action->handler);
86 print_symbol("%s\n", (unsigned long)desc->action->handler);
87 }
88
89 P(IRQ_INPROGRESS);
90 P(IRQ_DISABLED);
91 P(IRQ_PENDING);
92 P(IRQ_REPLAY);
93 P(IRQ_AUTODETECT);
94 P(IRQ_WAITING);
95 P(IRQ_LEVEL);
96 P(IRQ_MASKED);
97#ifdef CONFIG_IRQ_PER_CPU
98 P(IRQ_PER_CPU);
99#endif
100 P(IRQ_NOPROBE);
101 P(IRQ_NOREQUEST);
102 P(IRQ_NOAUTOEN);
103} 177}
104 178
105#undef P 179static inline void irqd_set(struct irq_data *d, unsigned int mask)
180{
181 d->state_use_accessors |= mask;
182}
106 183
184static inline bool irqd_has_set(struct irq_data *d, unsigned int mask)
185{
186 return d->state_use_accessors & mask;
187}
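
The __irq_get_desc_lock()/__irq_put_desc_unlock() pair above bundles "look up the descriptor, optionally take the bus lock, then take desc->lock" into one helper, with thin inline wrappers selecting the bus variant. A toy pthread sketch of the same lookup-and-lock pairing (all names are made up; the two mutexes merely stand in for the chip bus lock and desc->lock):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct toy_desc {
        pthread_mutex_t bus_lock;       /* plays the chip bus lock */
        pthread_mutex_t lock;           /* plays desc->lock */
        int value;
};

static struct toy_desc the_desc = {
        PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, 0
};

/* Mirrors __irq_get_desc_lock(): look up, optionally take the outer
 * bus lock, then take the inner lock; NULL means "no such descriptor". */
static struct toy_desc *toy_get_desc_lock(struct toy_desc *desc, bool bus)
{
        if (!desc)
                return NULL;
        if (bus)
                pthread_mutex_lock(&desc->bus_lock);
        pthread_mutex_lock(&desc->lock);
        return desc;
}

/* Mirrors __irq_put_desc_unlock(): release in the opposite order. */
static void toy_put_desc_unlock(struct toy_desc *desc, bool bus)
{
        pthread_mutex_unlock(&desc->lock);
        if (bus)
                pthread_mutex_unlock(&desc->bus_lock);
}

int main(void)
{
        struct toy_desc *d = toy_get_desc_lock(&the_desc, true);

        if (d) {
                d->value++;             /* critical section */
                toy_put_desc_unlock(d, true);
        }
        printf("value=%d\n", the_desc.value);
        return 0;
}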
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 9988d03797f5..dbccc799407f 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -72,18 +72,22 @@ static inline int desc_node(struct irq_desc *desc) { return 0; }
72 72
73static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) 73static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node)
74{ 74{
75 int cpu;
76
75 desc->irq_data.irq = irq; 77 desc->irq_data.irq = irq;
76 desc->irq_data.chip = &no_irq_chip; 78 desc->irq_data.chip = &no_irq_chip;
77 desc->irq_data.chip_data = NULL; 79 desc->irq_data.chip_data = NULL;
78 desc->irq_data.handler_data = NULL; 80 desc->irq_data.handler_data = NULL;
79 desc->irq_data.msi_desc = NULL; 81 desc->irq_data.msi_desc = NULL;
80 desc->status = IRQ_DEFAULT_INIT_FLAGS; 82 irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS);
83 desc->istate = IRQS_DISABLED;
81 desc->handle_irq = handle_bad_irq; 84 desc->handle_irq = handle_bad_irq;
82 desc->depth = 1; 85 desc->depth = 1;
83 desc->irq_count = 0; 86 desc->irq_count = 0;
84 desc->irqs_unhandled = 0; 87 desc->irqs_unhandled = 0;
85 desc->name = NULL; 88 desc->name = NULL;
86 memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs))); 89 for_each_possible_cpu(cpu)
90 *per_cpu_ptr(desc->kstat_irqs, cpu) = 0;
87 desc_smp_init(desc, node); 91 desc_smp_init(desc, node);
88} 92}
89 93
@@ -91,7 +95,7 @@ int nr_irqs = NR_IRQS;
91EXPORT_SYMBOL_GPL(nr_irqs); 95EXPORT_SYMBOL_GPL(nr_irqs);
92 96
93static DEFINE_MUTEX(sparse_irq_lock); 97static DEFINE_MUTEX(sparse_irq_lock);
94static DECLARE_BITMAP(allocated_irqs, NR_IRQS); 98static DECLARE_BITMAP(allocated_irqs, IRQ_BITMAP_BITS);
95 99
96#ifdef CONFIG_SPARSE_IRQ 100#ifdef CONFIG_SPARSE_IRQ
97 101
@@ -133,8 +137,7 @@ static struct irq_desc *alloc_desc(int irq, int node)
133 if (!desc) 137 if (!desc)
134 return NULL; 138 return NULL;
135 /* allocate based on nr_cpu_ids */ 139 /* allocate based on nr_cpu_ids */
136 desc->kstat_irqs = kzalloc_node(nr_cpu_ids * sizeof(*desc->kstat_irqs), 140 desc->kstat_irqs = alloc_percpu(unsigned int);
137 gfp, node);
138 if (!desc->kstat_irqs) 141 if (!desc->kstat_irqs)
139 goto err_desc; 142 goto err_desc;
140 143
@@ -149,7 +152,7 @@ static struct irq_desc *alloc_desc(int irq, int node)
149 return desc; 152 return desc;
150 153
151err_kstat: 154err_kstat:
152 kfree(desc->kstat_irqs); 155 free_percpu(desc->kstat_irqs);
153err_desc: 156err_desc:
154 kfree(desc); 157 kfree(desc);
155 return NULL; 158 return NULL;
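
alloc_desc() above keeps the usual kernel unwind shape: allocate in order, and on failure jump to labels that free what already exists in reverse order (now with free_percpu() instead of kfree() for the counters). A plain-C sketch of that goto-unwind idiom (structure and field names invented):

#include <stdio.h>
#include <stdlib.h>

struct toy_desc {
        unsigned int *kstat;
        char *name;
};

/* Allocate in order; on failure, unwind in reverse via goto labels. */
static struct toy_desc *toy_alloc_desc(void)
{
        struct toy_desc *desc = malloc(sizeof(*desc));

        if (!desc)
                return NULL;

        desc->kstat = calloc(4, sizeof(*desc->kstat));
        if (!desc->kstat)
                goto err_desc;

        desc->name = malloc(16);
        if (!desc->name)
                goto err_kstat;

        return desc;

err_kstat:
        free(desc->kstat);
err_desc:
        free(desc);
        return NULL;
}

int main(void)
{
        struct toy_desc *d = toy_alloc_desc();

        printf("alloc %s\n", d ? "ok" : "failed");
        if (d) {
                free(d->name);
                free(d->kstat);
                free(d);
        }
        return 0;
}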
@@ -166,7 +169,7 @@ static void free_desc(unsigned int irq)
166 mutex_unlock(&sparse_irq_lock); 169 mutex_unlock(&sparse_irq_lock);
167 170
168 free_masks(desc); 171 free_masks(desc);
169 kfree(desc->kstat_irqs); 172 free_percpu(desc->kstat_irqs);
170 kfree(desc); 173 kfree(desc);
171} 174}
172 175
@@ -204,6 +207,14 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
204 return NULL; 207 return NULL;
205} 208}
206 209
210static int irq_expand_nr_irqs(unsigned int nr)
211{
212 if (nr > IRQ_BITMAP_BITS)
213 return -ENOMEM;
214 nr_irqs = nr;
215 return 0;
216}
217
207int __init early_irq_init(void) 218int __init early_irq_init(void)
208{ 219{
209 int i, initcnt, node = first_online_node; 220 int i, initcnt, node = first_online_node;
@@ -215,6 +226,15 @@ int __init early_irq_init(void)
215 initcnt = arch_probe_nr_irqs(); 226 initcnt = arch_probe_nr_irqs();
216 printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d %d\n", NR_IRQS, nr_irqs, initcnt); 227 printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d %d\n", NR_IRQS, nr_irqs, initcnt);
217 228
229 if (WARN_ON(nr_irqs > IRQ_BITMAP_BITS))
230 nr_irqs = IRQ_BITMAP_BITS;
231
232 if (WARN_ON(initcnt > IRQ_BITMAP_BITS))
233 initcnt = IRQ_BITMAP_BITS;
234
235 if (initcnt > nr_irqs)
236 nr_irqs = initcnt;
237
218 for (i = 0; i < initcnt; i++) { 238 for (i = 0; i < initcnt; i++) {
219 desc = alloc_desc(i, node); 239 desc = alloc_desc(i, node);
220 set_bit(i, allocated_irqs); 240 set_bit(i, allocated_irqs);
@@ -227,14 +247,13 @@ int __init early_irq_init(void)
227 247
228struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { 248struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
229 [0 ... NR_IRQS-1] = { 249 [0 ... NR_IRQS-1] = {
230 .status = IRQ_DEFAULT_INIT_FLAGS, 250 .istate = IRQS_DISABLED,
231 .handle_irq = handle_bad_irq, 251 .handle_irq = handle_bad_irq,
232 .depth = 1, 252 .depth = 1,
233 .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock), 253 .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock),
234 } 254 }
235}; 255};
236 256
237static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS];
238int __init early_irq_init(void) 257int __init early_irq_init(void)
239{ 258{
240 int count, i, node = first_online_node; 259 int count, i, node = first_online_node;
@@ -250,7 +269,8 @@ int __init early_irq_init(void)
250 for (i = 0; i < count; i++) { 269 for (i = 0; i < count; i++) {
251 desc[i].irq_data.irq = i; 270 desc[i].irq_data.irq = i;
252 desc[i].irq_data.chip = &no_irq_chip; 271 desc[i].irq_data.chip = &no_irq_chip;
253 desc[i].kstat_irqs = kstat_irqs_all[i]; 272 desc[i].kstat_irqs = alloc_percpu(unsigned int);
273 irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS);
254 alloc_masks(desc + i, GFP_KERNEL, node); 274 alloc_masks(desc + i, GFP_KERNEL, node);
255 desc_smp_init(desc + i, node); 275 desc_smp_init(desc + i, node);
256 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); 276 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
@@ -277,6 +297,12 @@ static inline int alloc_descs(unsigned int start, unsigned int cnt, int node)
277{ 297{
278 return start; 298 return start;
279} 299}
300
301static int irq_expand_nr_irqs(unsigned int nr)
302{
303 return -ENOMEM;
304}
305
280#endif /* !CONFIG_SPARSE_IRQ */ 306#endif /* !CONFIG_SPARSE_IRQ */
281 307
282/* Dynamic interrupt handling */ 308/* Dynamic interrupt handling */
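
The sparse allocator tracks every descriptor in the allocated_irqs bitmap (now sized IRQ_BITMAP_BITS) and, in the irq_alloc_descs() hunk below, claims a contiguous free range with bitmap_find_next_zero_area() followed by bitmap_set(). A toy find-then-mark version of that allocation step, array-backed and with invented names:

#include <stdio.h>
#include <string.h>

#define TOY_NR_IRQS 16

static unsigned char allocated[TOY_NR_IRQS];    /* 0 = free, 1 = taken */

/* Toy version of the find-then-set step in irq_alloc_descs(): look for
 * cnt adjacent free slots starting at 'from' and mark them allocated. */
static int toy_alloc_range(unsigned int from, unsigned int cnt)
{
        for (unsigned int start = from; start + cnt <= TOY_NR_IRQS; start++) {
                unsigned int i;

                for (i = 0; i < cnt; i++)
                        if (allocated[start + i])
                                break;
                if (i == cnt) {
                        memset(allocated + start, 1, cnt);
                        return (int)start;
                }
        }
        return -1;      /* the real code tries irq_expand_nr_irqs() here */
}

int main(void)
{
        allocated[2] = 1;       /* pretend irq 2 is already in use */
        printf("range starts at %d\n", toy_alloc_range(0, 3));  /* -> 3 */
        return 0;
}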
@@ -320,14 +346,17 @@ irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node)
320 346
321 mutex_lock(&sparse_irq_lock); 347 mutex_lock(&sparse_irq_lock);
322 348
323 start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0); 349 start = bitmap_find_next_zero_area(allocated_irqs, IRQ_BITMAP_BITS,
350 from, cnt, 0);
324 ret = -EEXIST; 351 ret = -EEXIST;
325 if (irq >=0 && start != irq) 352 if (irq >=0 && start != irq)
326 goto err; 353 goto err;
327 354
328 ret = -ENOMEM; 355 if (start + cnt > nr_irqs) {
329 if (start >= nr_irqs) 356 ret = irq_expand_nr_irqs(start + cnt);
330 goto err; 357 if (ret)
358 goto err;
359 }
331 360
332 bitmap_set(allocated_irqs, start, cnt); 361 bitmap_set(allocated_irqs, start, cnt);
333 mutex_unlock(&sparse_irq_lock); 362 mutex_unlock(&sparse_irq_lock);
@@ -374,6 +403,26 @@ unsigned int irq_get_next_irq(unsigned int offset)
374 return find_next_bit(allocated_irqs, nr_irqs, offset); 403 return find_next_bit(allocated_irqs, nr_irqs, offset);
375} 404}
376 405
406struct irq_desc *
407__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus)
408{
409 struct irq_desc *desc = irq_to_desc(irq);
410
411 if (desc) {
412 if (bus)
413 chip_bus_lock(desc);
414 raw_spin_lock_irqsave(&desc->lock, *flags);
415 }
416 return desc;
417}
418
419void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus)
420{
421 raw_spin_unlock_irqrestore(&desc->lock, flags);
422 if (bus)
423 chip_bus_sync_unlock(desc);
424}
425
377/** 426/**
378 * dynamic_irq_cleanup - cleanup a dynamically allocated irq 427 * dynamic_irq_cleanup - cleanup a dynamically allocated irq
379 * @irq: irq number to initialize 428 * @irq: irq number to initialize
@@ -391,7 +440,9 @@ void dynamic_irq_cleanup(unsigned int irq)
391unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) 440unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
392{ 441{
393 struct irq_desc *desc = irq_to_desc(irq); 442 struct irq_desc *desc = irq_to_desc(irq);
394 return desc ? desc->kstat_irqs[cpu] : 0; 443
444 return desc && desc->kstat_irqs ?
445 *per_cpu_ptr(desc->kstat_irqs, cpu) : 0;
395} 446}
396 447
397#ifdef CONFIG_GENERIC_HARDIRQS 448#ifdef CONFIG_GENERIC_HARDIRQS
@@ -401,10 +452,10 @@ unsigned int kstat_irqs(unsigned int irq)
401 int cpu; 452 int cpu;
402 int sum = 0; 453 int sum = 0;
403 454
404 if (!desc) 455 if (!desc || !desc->kstat_irqs)
405 return 0; 456 return 0;
406 for_each_possible_cpu(cpu) 457 for_each_possible_cpu(cpu)
407 sum += desc->kstat_irqs[cpu]; 458 sum += *per_cpu_ptr(desc->kstat_irqs, cpu);
408 return sum; 459 return sum;
409} 460}
410#endif /* CONFIG_GENERIC_HARDIRQS */ 461#endif /* CONFIG_GENERIC_HARDIRQS */
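
kstat_irqs() above now sums one per-CPU slot per possible CPU instead of indexing a flat NR_CPUS array. The same shape in a self-contained sketch, with an ordinary array standing in for the per-CPU allocation:

#include <stdio.h>

#define TOY_NR_CPUS 4

/* Stand-in for one descriptor's per-CPU interrupt counters. */
struct toy_desc {
        unsigned int kstat_irqs[TOY_NR_CPUS];
};

/* Same shape as kstat_irqs(): 0 for a missing descriptor, otherwise the
 * sum over every possible CPU's slot. */
static unsigned int toy_kstat_irqs(const struct toy_desc *desc)
{
        unsigned int sum = 0;

        if (!desc)
                return 0;
        for (int cpu = 0; cpu < TOY_NR_CPUS; cpu++)
                sum += desc->kstat_irqs[cpu];
        return sum;
}

int main(void)
{
        struct toy_desc d = { { 5, 0, 2, 1 } };

        printf("total: %u\n", toy_kstat_irqs(&d));      /* 8 */
        return 0;
}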
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 91a5fa25054e..acd599a43bfb 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -17,6 +17,17 @@
17 17
18#include "internals.h" 18#include "internals.h"
19 19
20#ifdef CONFIG_IRQ_FORCED_THREADING
21__read_mostly bool force_irqthreads;
22
23static int __init setup_forced_irqthreads(char *arg)
24{
25 force_irqthreads = true;
26 return 0;
27}
28early_param("threadirqs", setup_forced_irqthreads);
29#endif
30
20/** 31/**
21 * synchronize_irq - wait for pending IRQ handlers (on other CPUs) 32 * synchronize_irq - wait for pending IRQ handlers (on other CPUs)
22 * @irq: interrupt number to wait for 33 * @irq: interrupt number to wait for
@@ -30,7 +41,7 @@
30void synchronize_irq(unsigned int irq) 41void synchronize_irq(unsigned int irq)
31{ 42{
32 struct irq_desc *desc = irq_to_desc(irq); 43 struct irq_desc *desc = irq_to_desc(irq);
33 unsigned int status; 44 unsigned int state;
34 45
35 if (!desc) 46 if (!desc)
36 return; 47 return;
@@ -42,16 +53,16 @@ void synchronize_irq(unsigned int irq)
42 * Wait until we're out of the critical section. This might 53 * Wait until we're out of the critical section. This might
43 * give the wrong answer due to the lack of memory barriers. 54 * give the wrong answer due to the lack of memory barriers.
44 */ 55 */
45 while (desc->status & IRQ_INPROGRESS) 56 while (desc->istate & IRQS_INPROGRESS)
46 cpu_relax(); 57 cpu_relax();
47 58
48 /* Ok, that indicated we're done: double-check carefully. */ 59 /* Ok, that indicated we're done: double-check carefully. */
49 raw_spin_lock_irqsave(&desc->lock, flags); 60 raw_spin_lock_irqsave(&desc->lock, flags);
50 status = desc->status; 61 state = desc->istate;
51 raw_spin_unlock_irqrestore(&desc->lock, flags); 62 raw_spin_unlock_irqrestore(&desc->lock, flags);
52 63
53 /* Oops, that failed? */ 64 /* Oops, that failed? */
54 } while (status & IRQ_INPROGRESS); 65 } while (state & IRQS_INPROGRESS);
55 66
56 /* 67 /*
57 * We made sure that no hardirq handler is running. Now verify 68 * We made sure that no hardirq handler is running. Now verify

@@ -73,8 +84,8 @@ int irq_can_set_affinity(unsigned int irq)
73{ 84{
74 struct irq_desc *desc = irq_to_desc(irq); 85 struct irq_desc *desc = irq_to_desc(irq);
75 86
76 if (CHECK_IRQ_PER_CPU(desc->status) || !desc->irq_data.chip || 87 if (!desc || !irqd_can_balance(&desc->irq_data) ||
77 !desc->irq_data.chip->irq_set_affinity) 88 !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity)
78 return 0; 89 return 0;
79 90
80 return 1; 91 return 1;
@@ -100,67 +111,169 @@ void irq_set_thread_affinity(struct irq_desc *desc)
100 } 111 }
101} 112}
102 113
114#ifdef CONFIG_GENERIC_PENDING_IRQ
115static inline bool irq_can_move_pcntxt(struct irq_desc *desc)
116{
117 return irq_settings_can_move_pcntxt(desc);
118}
119static inline bool irq_move_pending(struct irq_desc *desc)
120{
121 return irqd_is_setaffinity_pending(&desc->irq_data);
122}
123static inline void
124irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask)
125{
126 cpumask_copy(desc->pending_mask, mask);
127}
128static inline void
129irq_get_pending(struct cpumask *mask, struct irq_desc *desc)
130{
131 cpumask_copy(mask, desc->pending_mask);
132}
133#else
134static inline bool irq_can_move_pcntxt(struct irq_desc *desc) { return true; }
135static inline bool irq_move_pending(struct irq_desc *desc) { return false; }
136static inline void
137irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) { }
138static inline void
139irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { }
140#endif
141
103/** 142/**
104 * irq_set_affinity - Set the irq affinity of a given irq 143 * irq_set_affinity - Set the irq affinity of a given irq
105 * @irq: Interrupt to set affinity 144 * @irq: Interrupt to set affinity
106 * @cpumask: cpumask 145 * @cpumask: cpumask
107 * 146 *
108 */ 147 */
109int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) 148int irq_set_affinity(unsigned int irq, const struct cpumask *mask)
110{ 149{
111 struct irq_desc *desc = irq_to_desc(irq); 150 struct irq_desc *desc = irq_to_desc(irq);
112 struct irq_chip *chip = desc->irq_data.chip; 151 struct irq_chip *chip = desc->irq_data.chip;
113 unsigned long flags; 152 unsigned long flags;
153 int ret = 0;
114 154
115 if (!chip->irq_set_affinity) 155 if (!chip->irq_set_affinity)
116 return -EINVAL; 156 return -EINVAL;
117 157
118 raw_spin_lock_irqsave(&desc->lock, flags); 158 raw_spin_lock_irqsave(&desc->lock, flags);
119 159
120#ifdef CONFIG_GENERIC_PENDING_IRQ 160 if (irq_can_move_pcntxt(desc)) {
121 if (desc->status & IRQ_MOVE_PCNTXT) { 161 ret = chip->irq_set_affinity(&desc->irq_data, mask, false);
122 if (!chip->irq_set_affinity(&desc->irq_data, cpumask, false)) { 162 switch (ret) {
123 cpumask_copy(desc->irq_data.affinity, cpumask); 163 case IRQ_SET_MASK_OK:
164 cpumask_copy(desc->irq_data.affinity, mask);
165 case IRQ_SET_MASK_OK_NOCOPY:
124 irq_set_thread_affinity(desc); 166 irq_set_thread_affinity(desc);
167 ret = 0;
125 } 168 }
169 } else {
170 irqd_set_move_pending(&desc->irq_data);
171 irq_copy_pending(desc, mask);
126 } 172 }
127 else { 173
128 desc->status |= IRQ_MOVE_PENDING; 174 if (desc->affinity_notify) {
129 cpumask_copy(desc->pending_mask, cpumask); 175 kref_get(&desc->affinity_notify->kref);
130 } 176 schedule_work(&desc->affinity_notify->work);
131#else
132 if (!chip->irq_set_affinity(&desc->irq_data, cpumask, false)) {
133 cpumask_copy(desc->irq_data.affinity, cpumask);
134 irq_set_thread_affinity(desc);
135 } 177 }
136#endif 178 irq_compat_set_affinity(desc);
137 desc->status |= IRQ_AFFINITY_SET; 179 irqd_set(&desc->irq_data, IRQD_AFFINITY_SET);
138 raw_spin_unlock_irqrestore(&desc->lock, flags); 180 raw_spin_unlock_irqrestore(&desc->lock, flags);
139 return 0; 181 return ret;
140} 182}
141 183
142int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) 184int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
143{ 185{
186 unsigned long flags;
187 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
188
189 if (!desc)
190 return -EINVAL;
191 desc->affinity_hint = m;
192 irq_put_desc_unlock(desc, flags);
193 return 0;
194}
195EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
196
197static void irq_affinity_notify(struct work_struct *work)
198{
199 struct irq_affinity_notify *notify =
200 container_of(work, struct irq_affinity_notify, work);
201 struct irq_desc *desc = irq_to_desc(notify->irq);
202 cpumask_var_t cpumask;
203 unsigned long flags;
204
205 if (!desc || !alloc_cpumask_var(&cpumask, GFP_KERNEL))
206 goto out;
207
208 raw_spin_lock_irqsave(&desc->lock, flags);
209 if (irq_move_pending(desc))
210 irq_get_pending(cpumask, desc);
211 else
212 cpumask_copy(cpumask, desc->irq_data.affinity);
213 raw_spin_unlock_irqrestore(&desc->lock, flags);
214
215 notify->notify(notify, cpumask);
216
217 free_cpumask_var(cpumask);
218out:
219 kref_put(&notify->kref, notify->release);
220}
221
222/**
223 * irq_set_affinity_notifier - control notification of IRQ affinity changes
224 * @irq: Interrupt for which to enable/disable notification
225 * @notify: Context for notification, or %NULL to disable
226 * notification. Function pointers must be initialised;
227 * the other fields will be initialised by this function.
228 *
229 * Must be called in process context. Notification may only be enabled
230 * after the IRQ is allocated and must be disabled before the IRQ is
231 * freed using free_irq().
232 */
233int
234irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
235{
144 struct irq_desc *desc = irq_to_desc(irq); 236 struct irq_desc *desc = irq_to_desc(irq);
237 struct irq_affinity_notify *old_notify;
145 unsigned long flags; 238 unsigned long flags;
146 239
240 /* The release function is promised process context */
241 might_sleep();
242
147 if (!desc) 243 if (!desc)
148 return -EINVAL; 244 return -EINVAL;
149 245
246 /* Complete initialisation of *notify */
247 if (notify) {
248 notify->irq = irq;
249 kref_init(&notify->kref);
250 INIT_WORK(&notify->work, irq_affinity_notify);
251 }
252
150 raw_spin_lock_irqsave(&desc->lock, flags); 253 raw_spin_lock_irqsave(&desc->lock, flags);
151 desc->affinity_hint = m; 254 old_notify = desc->affinity_notify;
255 desc->affinity_notify = notify;
152 raw_spin_unlock_irqrestore(&desc->lock, flags); 256 raw_spin_unlock_irqrestore(&desc->lock, flags);
153 257
258 if (old_notify)
259 kref_put(&old_notify->kref, old_notify->release);
260
154 return 0; 261 return 0;
155} 262}
156EXPORT_SYMBOL_GPL(irq_set_affinity_hint); 263EXPORT_SYMBOL_GPL(irq_set_affinity_notifier);
157 264
158#ifndef CONFIG_AUTO_IRQ_AFFINITY 265#ifndef CONFIG_AUTO_IRQ_AFFINITY
159/* 266/*
160 * Generic version of the affinity autoselector. 267 * Generic version of the affinity autoselector.
161 */ 268 */
162static int setup_affinity(unsigned int irq, struct irq_desc *desc) 269static int
270setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
163{ 271{
272 struct irq_chip *chip = irq_desc_get_chip(desc);
273 struct cpumask *set = irq_default_affinity;
274 int ret;
275
276 /* Excludes PER_CPU and NO_BALANCE interrupts */
164 if (!irq_can_set_affinity(irq)) 277 if (!irq_can_set_affinity(irq))
165 return 0; 278 return 0;
166 279
@@ -168,22 +281,29 @@ static int setup_affinity(unsigned int irq, struct irq_desc *desc)
168 * Preserve a userspace affinity setup, but make sure that 281
169 * one of the targets is online. 282 * one of the targets is online.
170 */ 283 */
171 if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) { 284 if (irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) {
172 if (cpumask_any_and(desc->irq_data.affinity, cpu_online_mask) 285 if (cpumask_intersects(desc->irq_data.affinity,
173 < nr_cpu_ids) 286 cpu_online_mask))
174 goto set_affinity; 287 set = desc->irq_data.affinity;
175 else 288 else {
176 desc->status &= ~IRQ_AFFINITY_SET; 289 irq_compat_clr_affinity(desc);
290 irqd_clear(&desc->irq_data, IRQD_AFFINITY_SET);
291 }
177 } 292 }
178 293
179 cpumask_and(desc->irq_data.affinity, cpu_online_mask, irq_default_affinity); 294 cpumask_and(mask, cpu_online_mask, set);
180set_affinity: 295 ret = chip->irq_set_affinity(&desc->irq_data, mask, false);
181 desc->irq_data.chip->irq_set_affinity(&desc->irq_data, desc->irq_data.affinity, false); 296 switch (ret) {
182 297 case IRQ_SET_MASK_OK:
298 cpumask_copy(desc->irq_data.affinity, mask);
299 case IRQ_SET_MASK_OK_NOCOPY:
300 irq_set_thread_affinity(desc);
301 }
183 return 0; 302 return 0;
184} 303}
185#else 304#else
186static inline int setup_affinity(unsigned int irq, struct irq_desc *d) 305static inline int
306setup_affinity(unsigned int irq, struct irq_desc *d, struct cpumask *mask)
187{ 307{
188 return irq_select_affinity(irq); 308 return irq_select_affinity(irq);
189} 309}
@@ -192,23 +312,21 @@ static inline int setup_affinity(unsigned int irq, struct irq_desc *d)
192/* 312/*
193 * Called when affinity is set via /proc/irq 313 * Called when affinity is set via /proc/irq
194 */ 314 */
195int irq_select_affinity_usr(unsigned int irq) 315int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask)
196{ 316{
197 struct irq_desc *desc = irq_to_desc(irq); 317 struct irq_desc *desc = irq_to_desc(irq);
198 unsigned long flags; 318 unsigned long flags;
199 int ret; 319 int ret;
200 320
201 raw_spin_lock_irqsave(&desc->lock, flags); 321 raw_spin_lock_irqsave(&desc->lock, flags);
202 ret = setup_affinity(irq, desc); 322 ret = setup_affinity(irq, desc, mask);
203 if (!ret)
204 irq_set_thread_affinity(desc);
205 raw_spin_unlock_irqrestore(&desc->lock, flags); 323 raw_spin_unlock_irqrestore(&desc->lock, flags);
206
207 return ret; 324 return ret;
208} 325}
209 326
210#else 327#else
211static inline int setup_affinity(unsigned int irq, struct irq_desc *desc) 328static inline int
329setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
212{ 330{
213 return 0; 331 return 0;
214} 332}
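
Both irq_set_affinity() and setup_affinity() above handle the chip callback's return code with the same deliberate fallthrough: IRQ_SET_MASK_OK means "copy the requested mask into irq_data.affinity", IRQ_SET_MASK_OK_NOCOPY means the chip already wrote the effective mask itself, and both cases go on to the thread affinity update. A compact sketch of that switch (the enum and mask representation below are stand-ins, not the kernel definitions):

#include <stdio.h>
#include <string.h>

enum { SET_MASK_OK, SET_MASK_OK_NOCOPY, SET_MASK_FAIL };

#define MASK_LEN 8

/* OK copies the requested mask, NOCOPY keeps whatever the chip wrote,
 * and both fall through to the (stand-in) thread affinity update. */
static int apply_affinity(char *stored, const char *requested, int chip_ret)
{
        switch (chip_ret) {
        case SET_MASK_OK:
                memcpy(stored, requested, MASK_LEN);
                /* fall through */
        case SET_MASK_OK_NOCOPY:
                return 0;       /* corresponds to irq_set_thread_affinity() */
        default:
                return -1;
        }
}

int main(void)
{
        char stored[MASK_LEN + 1]    = "00001111";
        char requested[MASK_LEN + 1] = "11110000";

        apply_affinity(stored, requested, SET_MASK_OK);
        printf("%s\n", stored);
        return 0;
}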
@@ -219,13 +337,23 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend)
219 if (suspend) { 337 if (suspend) {
220 if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND)) 338 if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND))
221 return; 339 return;
222 desc->status |= IRQ_SUSPENDED; 340 desc->istate |= IRQS_SUSPENDED;
223 } 341 }
224 342
225 if (!desc->depth++) { 343 if (!desc->depth++)
226 desc->status |= IRQ_DISABLED; 344 irq_disable(desc);
227 desc->irq_data.chip->irq_disable(&desc->irq_data); 345}
228 } 346
347static int __disable_irq_nosync(unsigned int irq)
348{
349 unsigned long flags;
350 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
351
352 if (!desc)
353 return -EINVAL;
354 __disable_irq(desc, irq, false);
355 irq_put_desc_busunlock(desc, flags);
356 return 0;
229} 357}
230 358
231/** 359/**
@@ -241,17 +369,7 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend)
241 */ 369 */
242void disable_irq_nosync(unsigned int irq) 370void disable_irq_nosync(unsigned int irq)
243{ 371{
244 struct irq_desc *desc = irq_to_desc(irq); 372 __disable_irq_nosync(irq);
245 unsigned long flags;
246
247 if (!desc)
248 return;
249
250 chip_bus_lock(desc);
251 raw_spin_lock_irqsave(&desc->lock, flags);
252 __disable_irq(desc, irq, false);
253 raw_spin_unlock_irqrestore(&desc->lock, flags);
254 chip_bus_sync_unlock(desc);
255} 373}
256EXPORT_SYMBOL(disable_irq_nosync); 374EXPORT_SYMBOL(disable_irq_nosync);
257 375
@@ -269,21 +387,24 @@ EXPORT_SYMBOL(disable_irq_nosync);
269 */ 387 */
270void disable_irq(unsigned int irq) 388void disable_irq(unsigned int irq)
271{ 389{
272 struct irq_desc *desc = irq_to_desc(irq); 390 if (!__disable_irq_nosync(irq))
273
274 if (!desc)
275 return;
276
277 disable_irq_nosync(irq);
278 if (desc->action)
279 synchronize_irq(irq); 391 synchronize_irq(irq);
280} 392}
281EXPORT_SYMBOL(disable_irq); 393EXPORT_SYMBOL(disable_irq);
282 394
283void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) 395void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
284{ 396{
285 if (resume) 397 if (resume) {
286 desc->status &= ~IRQ_SUSPENDED; 398 if (!(desc->istate & IRQS_SUSPENDED)) {
399 if (!desc->action)
400 return;
401 if (!(desc->action->flags & IRQF_FORCE_RESUME))
402 return;
403 /* Pretend that it got disabled ! */
404 desc->depth++;
405 }
406 desc->istate &= ~IRQS_SUSPENDED;
407 }
287 408
288 switch (desc->depth) { 409 switch (desc->depth) {
289 case 0: 410 case 0:
@@ -291,12 +412,11 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
291 WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq); 412 WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq);
292 break; 413 break;
293 case 1: { 414 case 1: {
294 unsigned int status = desc->status & ~IRQ_DISABLED; 415 if (desc->istate & IRQS_SUSPENDED)
295
296 if (desc->status & IRQ_SUSPENDED)
297 goto err_out; 416 goto err_out;
298 /* Prevent probing on this irq: */ 417 /* Prevent probing on this irq: */
299 desc->status = status | IRQ_NOPROBE; 418 irq_settings_set_noprobe(desc);
419 irq_enable(desc);
300 check_irq_resend(desc, irq); 420 check_irq_resend(desc, irq);
301 /* fall-through */ 421 /* fall-through */
302 } 422 }
@@ -318,21 +438,18 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
318 */ 438 */
319void enable_irq(unsigned int irq) 439void enable_irq(unsigned int irq)
320{ 440{
321 struct irq_desc *desc = irq_to_desc(irq);
322 unsigned long flags; 441 unsigned long flags;
442 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
323 443
324 if (!desc) 444 if (!desc)
325 return; 445 return;
446 if (WARN(!desc->irq_data.chip,
447 KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq))
448 goto out;
326 449
327 if (WARN(!desc->irq_data.chip || !desc->irq_data.chip->irq_enable,
328 KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq))
329 return;
330
331 chip_bus_lock(desc);
332 raw_spin_lock_irqsave(&desc->lock, flags);
333 __enable_irq(desc, irq, false); 450 __enable_irq(desc, irq, false);
334 raw_spin_unlock_irqrestore(&desc->lock, flags); 451out:
335 chip_bus_sync_unlock(desc); 452 irq_put_desc_busunlock(desc, flags);
336} 453}
337EXPORT_SYMBOL(enable_irq); 454EXPORT_SYMBOL(enable_irq);
338 455
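
__disable_irq() and __enable_irq() above rely on desc->depth as a nesting counter: only the 0 -> 1 transition really disables the line, and only the matching 1 -> 0 transition re-enables it, with an "Unbalanced enable" warning on underflow. A toy model of that depth counting (all types here are invented):

#include <stdio.h>

struct toy_irq {
        unsigned int depth;
        int hw_enabled;
};

static void toy_disable(struct toy_irq *irq)
{
        if (!irq->depth++)
                irq->hw_enabled = 0;    /* corresponds to irq_disable(desc) */
}

static void toy_enable(struct toy_irq *irq)
{
        if (!irq->depth) {
                fprintf(stderr, "Unbalanced enable\n");
                return;
        }
        if (!--irq->depth)
                irq->hw_enabled = 1;    /* corresponds to irq_enable(desc) */
}

int main(void)
{
        struct toy_irq irq = { 0, 1 };

        toy_disable(&irq);      /* line goes off */
        toy_disable(&irq);      /* nested, nothing happens */
        toy_enable(&irq);       /* still off */
        toy_enable(&irq);       /* back on */
        printf("depth=%u hw=%d\n", irq.depth, irq.hw_enabled);
        return 0;
}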
@@ -348,7 +465,7 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on)
348} 465}
349 466
350/** 467/**
351 * set_irq_wake - control irq power management wakeup 468 * irq_set_irq_wake - control irq power management wakeup
352 * @irq: interrupt to control 469 * @irq: interrupt to control
353 * @on: enable/disable power management wakeup 470 * @on: enable/disable power management wakeup
354 * 471 *
@@ -359,23 +476,22 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on)
359 * Wakeup mode lets this IRQ wake the system from sleep 476 * Wakeup mode lets this IRQ wake the system from sleep
360 * states like "suspend to RAM". 477 * states like "suspend to RAM".
361 */ 478 */
362int set_irq_wake(unsigned int irq, unsigned int on) 479int irq_set_irq_wake(unsigned int irq, unsigned int on)
363{ 480{
364 struct irq_desc *desc = irq_to_desc(irq);
365 unsigned long flags; 481 unsigned long flags;
482 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
366 int ret = 0; 483 int ret = 0;
367 484
368 /* wakeup-capable irqs can be shared between drivers that 485 /* wakeup-capable irqs can be shared between drivers that
369 * don't need to have the same sleep mode behaviors. 486 * don't need to have the same sleep mode behaviors.
370 */ 487 */
371 raw_spin_lock_irqsave(&desc->lock, flags);
372 if (on) { 488 if (on) {
373 if (desc->wake_depth++ == 0) { 489 if (desc->wake_depth++ == 0) {
374 ret = set_irq_wake_real(irq, on); 490 ret = set_irq_wake_real(irq, on);
375 if (ret) 491 if (ret)
376 desc->wake_depth = 0; 492 desc->wake_depth = 0;
377 else 493 else
378 desc->status |= IRQ_WAKEUP; 494 irqd_set(&desc->irq_data, IRQD_WAKEUP_STATE);
379 } 495 }
380 } else { 496 } else {
381 if (desc->wake_depth == 0) { 497 if (desc->wake_depth == 0) {
@@ -385,14 +501,13 @@ int set_irq_wake(unsigned int irq, unsigned int on)
385 if (ret) 501 if (ret)
386 desc->wake_depth = 1; 502 desc->wake_depth = 1;
387 else 503 else
388 desc->status &= ~IRQ_WAKEUP; 504 irqd_clear(&desc->irq_data, IRQD_WAKEUP_STATE);
389 } 505 }
390 } 506 }
391 507 irq_put_desc_busunlock(desc, flags);
392 raw_spin_unlock_irqrestore(&desc->lock, flags);
393 return ret; 508 return ret;
394} 509}
395EXPORT_SYMBOL(set_irq_wake); 510EXPORT_SYMBOL(irq_set_irq_wake);
396 511
397/* 512/*
398 * Internal function that tells the architecture code whether a 513 * Internal function that tells the architecture code whether a
@@ -401,43 +516,27 @@ EXPORT_SYMBOL(set_irq_wake);
401 */ 516 */
402int can_request_irq(unsigned int irq, unsigned long irqflags) 517int can_request_irq(unsigned int irq, unsigned long irqflags)
403{ 518{
404 struct irq_desc *desc = irq_to_desc(irq);
405 struct irqaction *action;
406 unsigned long flags; 519 unsigned long flags;
520 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
521 int canrequest = 0;
407 522
408 if (!desc) 523 if (!desc)
409 return 0; 524 return 0;
410 525
411 if (desc->status & IRQ_NOREQUEST) 526 if (irq_settings_can_request(desc)) {
412 return 0; 527 if (desc->action)
413 528 if (irqflags & desc->action->flags & IRQF_SHARED)
414 raw_spin_lock_irqsave(&desc->lock, flags); 529 canrequest =1;
415 action = desc->action; 530 }
416 if (action) 531 irq_put_desc_unlock(desc, flags);
417 if (irqflags & action->flags & IRQF_SHARED) 532 return canrequest;
418 action = NULL;
419
420 raw_spin_unlock_irqrestore(&desc->lock, flags);
421
422 return !action;
423}
424
425void compat_irq_chip_set_default_handler(struct irq_desc *desc)
426{
427 /*
428 * If the architecture still has not overriden
429 * the flow handler then zap the default. This
430 * should catch incorrect flow-type setting.
431 */
432 if (desc->handle_irq == &handle_bad_irq)
433 desc->handle_irq = NULL;
434} 533}
435 534
436int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, 535int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
437 unsigned long flags) 536 unsigned long flags)
438{ 537{
439 int ret;
440 struct irq_chip *chip = desc->irq_data.chip; 538 struct irq_chip *chip = desc->irq_data.chip;
539 int ret, unmask = 0;
441 540
442 if (!chip || !chip->irq_set_type) { 541 if (!chip || !chip->irq_set_type) {
443 /* 542 /*
@@ -449,23 +548,43 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
449 return 0; 548 return 0;
450 } 549 }
451 550
551 flags &= IRQ_TYPE_SENSE_MASK;
552
553 if (chip->flags & IRQCHIP_SET_TYPE_MASKED) {
554 if (!(desc->istate & IRQS_MASKED))
555 mask_irq(desc);
556 if (!(desc->istate & IRQS_DISABLED))
557 unmask = 1;
558 }
559
452 /* caller masked out all except trigger mode flags */ 560 /* caller masked out all except trigger mode flags */
453 ret = chip->irq_set_type(&desc->irq_data, flags); 561 ret = chip->irq_set_type(&desc->irq_data, flags);
454 562
455 if (ret) 563 switch (ret) {
456 pr_err("setting trigger mode %lu for irq %u failed (%pF)\n", 564 case IRQ_SET_MASK_OK:
457 flags, irq, chip->irq_set_type); 565 irqd_clear(&desc->irq_data, IRQD_TRIGGER_MASK);
458 else { 566 irqd_set(&desc->irq_data, flags);
459 if (flags & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH)) 567
460 flags |= IRQ_LEVEL; 568 case IRQ_SET_MASK_OK_NOCOPY:
461 /* note that IRQF_TRIGGER_MASK == IRQ_TYPE_SENSE_MASK */ 569 flags = irqd_get_trigger_type(&desc->irq_data);
462 desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK); 570 irq_settings_set_trigger_mask(desc, flags);
463 desc->status |= flags; 571 irqd_clear(&desc->irq_data, IRQD_LEVEL);
572 irq_settings_clr_level(desc);
573 if (flags & IRQ_TYPE_LEVEL_MASK) {
574 irq_settings_set_level(desc);
575 irqd_set(&desc->irq_data, IRQD_LEVEL);
576 }
464 577
465 if (chip != desc->irq_data.chip) 578 if (chip != desc->irq_data.chip)
466 irq_chip_set_defaults(desc->irq_data.chip); 579 irq_chip_set_defaults(desc->irq_data.chip);
580 ret = 0;
581 break;
582 default:
583 pr_err("setting trigger mode %lu for irq %u failed (%pF)\n",
584 flags, irq, chip->irq_set_type);
467 } 585 }
468 586 if (unmask)
587 unmask_irq(desc);
469 return ret; 588 return ret;
470} 589}
471 590
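
For chips flagged IRQCHIP_SET_TYPE_MASKED, __irq_set_trigger() above masks the line before calling chip->irq_set_type() and only unmasks it afterwards if the interrupt was not disabled to begin with. A small sketch of that mask-around-the-change logic (boolean fields stand in for the IRQS_MASKED/IRQS_DISABLED state bits):

#include <stdbool.h>
#include <stdio.h>

struct toy_line {
        bool masked;
        bool disabled;
        int trigger;
};

/* Mask while the trigger type changes; unmask only if the line was
 * not disabled before the call. */
static void toy_set_trigger(struct toy_line *line, int type)
{
        bool unmask = false;

        if (!line->masked)
                line->masked = true;            /* mask_irq(desc) */
        if (!line->disabled)
                unmask = true;

        line->trigger = type;                   /* chip->irq_set_type() */

        if (unmask)
                line->masked = false;           /* unmask_irq(desc) */
}

int main(void)
{
        struct toy_line line = { false, false, 0 };

        toy_set_trigger(&line, 3);
        printf("trigger=%d masked=%d\n", line.trigger, line.masked);
        return 0;
}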
@@ -509,8 +628,11 @@ static int irq_wait_for_interrupt(struct irqaction *action)
509 * handler finished. unmask if the interrupt has not been disabled and 628 * handler finished. unmask if the interrupt has not been disabled and
510 * is marked MASKED. 629 * is marked MASKED.
511 */ 630 */
512static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc) 631static void irq_finalize_oneshot(struct irq_desc *desc,
632 struct irqaction *action, bool force)
513{ 633{
634 if (!(desc->istate & IRQS_ONESHOT))
635 return;
514again: 636again:
515 chip_bus_lock(desc); 637 chip_bus_lock(desc);
516 raw_spin_lock_irq(&desc->lock); 638 raw_spin_lock_irq(&desc->lock);
@@ -522,26 +644,44 @@ again:
522 * The thread is faster done than the hard interrupt handler 644 * The thread is faster done than the hard interrupt handler
523 * on the other CPU. If we unmask the irq line then the 645 * on the other CPU. If we unmask the irq line then the
524 * interrupt can come in again and masks the line, leaves due 646 * interrupt can come in again and masks the line, leaves due
525 * to IRQ_INPROGRESS and the irq line is masked forever. 647 * to IRQS_INPROGRESS and the irq line is masked forever.
648 *
649 * This also serializes the state of shared oneshot handlers
650 * versus "desc->threads_oneshot |= action->thread_mask;" in
651 * irq_wake_thread(). See the comment there which explains the
652 * serialization.
526 */ 653 */
527 if (unlikely(desc->status & IRQ_INPROGRESS)) { 654 if (unlikely(desc->istate & IRQS_INPROGRESS)) {
528 raw_spin_unlock_irq(&desc->lock); 655 raw_spin_unlock_irq(&desc->lock);
529 chip_bus_sync_unlock(desc); 656 chip_bus_sync_unlock(desc);
530 cpu_relax(); 657 cpu_relax();
531 goto again; 658 goto again;
532 } 659 }
533 660
534 if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) { 661 /*
535 desc->status &= ~IRQ_MASKED; 662 * Now check again, whether the thread should run. Otherwise
663 * we would clear the threads_oneshot bit of this thread which
664 * was just set.
665 */
666 if (!force && test_bit(IRQTF_RUNTHREAD, &action->thread_flags))
667 goto out_unlock;
668
669 desc->threads_oneshot &= ~action->thread_mask;
670
671 if (!desc->threads_oneshot && !(desc->istate & IRQS_DISABLED) &&
672 (desc->istate & IRQS_MASKED)) {
673 irq_compat_clr_masked(desc);
674 desc->istate &= ~IRQS_MASKED;
536 desc->irq_data.chip->irq_unmask(&desc->irq_data); 675 desc->irq_data.chip->irq_unmask(&desc->irq_data);
537 } 676 }
677out_unlock:
538 raw_spin_unlock_irq(&desc->lock); 678 raw_spin_unlock_irq(&desc->lock);
539 chip_bus_sync_unlock(desc); 679 chip_bus_sync_unlock(desc);
540} 680}
541 681
542#ifdef CONFIG_SMP 682#ifdef CONFIG_SMP
543/* 683/*
544 * Check whether we need to change the affinity of the interrupt thread. 684
545 */ 685 */
546static void 686static void
547irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) 687irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
@@ -573,16 +713,49 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { }
573#endif 713#endif
574 714
575/* 715/*
716 * Interrupts which are not explicitly requested as threaded
717 * interrupts rely on the implicit bh/preempt disable of the hard irq
718 * context. So we need to disable bh here to avoid deadlocks and other
719 * side effects.
720 */
721static void
722irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
723{
724 local_bh_disable();
725 action->thread_fn(action->irq, action->dev_id);
726 irq_finalize_oneshot(desc, action, false);
727 local_bh_enable();
728}
729
730/*
731 * Interrupts explicitly requested as threaded interrupts want to be
732 * preemptible - many of them need to sleep and wait for slow busses to
733 * complete.
734 */
735static void irq_thread_fn(struct irq_desc *desc, struct irqaction *action)
736{
737 action->thread_fn(action->irq, action->dev_id);
738 irq_finalize_oneshot(desc, action, false);
739}
740
741/*
576 * Interrupt handler thread 742 * Interrupt handler thread
577 */ 743 */
578static int irq_thread(void *data) 744static int irq_thread(void *data)
579{ 745{
580 static struct sched_param param = { 746 static const struct sched_param param = {
581 .sched_priority = MAX_USER_RT_PRIO/2, 747 .sched_priority = MAX_USER_RT_PRIO/2,
582 }; 748 };
583 struct irqaction *action = data; 749 struct irqaction *action = data;
584 struct irq_desc *desc = irq_to_desc(action->irq); 750 struct irq_desc *desc = irq_to_desc(action->irq);
585 int wake, oneshot = desc->status & IRQ_ONESHOT; 751 void (*handler_fn)(struct irq_desc *desc, struct irqaction *action);
752 int wake;
753
754 if (force_irqthreads & test_bit(IRQTF_FORCED_THREAD,
755 &action->thread_flags))
756 handler_fn = irq_forced_thread_fn;
757 else
758 handler_fn = irq_thread_fn;
586 759
587 sched_setscheduler(current, SCHED_FIFO, &param); 760 sched_setscheduler(current, SCHED_FIFO, &param);
588 current->irqaction = action; 761 current->irqaction = action;
@@ -594,23 +767,20 @@ static int irq_thread(void *data)
594 atomic_inc(&desc->threads_active); 767 atomic_inc(&desc->threads_active);
595 768
596 raw_spin_lock_irq(&desc->lock); 769 raw_spin_lock_irq(&desc->lock);
597 if (unlikely(desc->status & IRQ_DISABLED)) { 770 if (unlikely(desc->istate & IRQS_DISABLED)) {
598 /* 771 /*
599 * CHECKME: We might need a dedicated 772 * CHECKME: We might need a dedicated
600 * IRQ_THREAD_PENDING flag here, which 773 * IRQ_THREAD_PENDING flag here, which
601 * retriggers the thread in check_irq_resend() 774 * retriggers the thread in check_irq_resend()
602 * but AFAICT IRQ_PENDING should be fine as it 775 * but AFAICT IRQS_PENDING should be fine as it
603 * retriggers the interrupt itself --- tglx 776 * retriggers the interrupt itself --- tglx
604 */ 777 */
605 desc->status |= IRQ_PENDING; 778 irq_compat_set_pending(desc);
779 desc->istate |= IRQS_PENDING;
606 raw_spin_unlock_irq(&desc->lock); 780 raw_spin_unlock_irq(&desc->lock);
607 } else { 781 } else {
608 raw_spin_unlock_irq(&desc->lock); 782 raw_spin_unlock_irq(&desc->lock);
609 783 handler_fn(desc, action);
610 action->thread_fn(action->irq, action->dev_id);
611
612 if (oneshot)
613 irq_finalize_oneshot(action->irq, desc);
614 } 784 }
615 785
616 wake = atomic_dec_and_test(&desc->threads_active); 786 wake = atomic_dec_and_test(&desc->threads_active);
@@ -619,6 +789,9 @@ static int irq_thread(void *data)
619 wake_up(&desc->wait_for_threads); 789 wake_up(&desc->wait_for_threads);
620 } 790 }
621 791
792 /* Prevent a stale desc->threads_oneshot */
793 irq_finalize_oneshot(desc, action, true);
794
622 /* 795 /*
623 * Clear irqaction. Otherwise exit_irq_thread() would make 796 * Clear irqaction. Otherwise exit_irq_thread() would make
624 * fuzz about an active irq thread going into nirvana. 797 * fuzz about an active irq thread going into nirvana.
@@ -633,6 +806,7 @@ static int irq_thread(void *data)
633void exit_irq_thread(void) 806void exit_irq_thread(void)
634{ 807{
635 struct task_struct *tsk = current; 808 struct task_struct *tsk = current;
809 struct irq_desc *desc;
636 810
637 if (!tsk->irqaction) 811 if (!tsk->irqaction)
638 return; 812 return;
@@ -641,6 +815,14 @@ void exit_irq_thread(void)
641 "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", 815 "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n",
642 tsk->comm ? tsk->comm : "", tsk->pid, tsk->irqaction->irq); 816 tsk->comm ? tsk->comm : "", tsk->pid, tsk->irqaction->irq);
643 817
818 desc = irq_to_desc(tsk->irqaction->irq);
819
820 /*
821 * Prevent a stale desc->threads_oneshot. Must be called
822 * before setting the IRQTF_DIED flag.
823 */
824 irq_finalize_oneshot(desc, tsk->irqaction, true);
825
644 /* 826 /*
645 * Set the THREAD DIED flag to prevent further wakeups of the 827 * Set the THREAD DIED flag to prevent further wakeups of the
646 * soon to be gone threaded handler. 828 * soon to be gone threaded handler.
@@ -648,6 +830,22 @@ void exit_irq_thread(void)
648 set_bit(IRQTF_DIED, &tsk->irqaction->flags); 830 set_bit(IRQTF_DIED, &tsk->irqaction->flags);
649} 831}
650 832
833static void irq_setup_forced_threading(struct irqaction *new)
834{
835 if (!force_irqthreads)
836 return;
837 if (new->flags & (IRQF_NO_THREAD | IRQF_PERCPU | IRQF_ONESHOT))
838 return;
839
840 new->flags |= IRQF_ONESHOT;
841
842 if (!new->thread_fn) {
843 set_bit(IRQTF_FORCED_THREAD, &new->thread_flags);
844 new->thread_fn = new->handler;
845 new->handler = irq_default_primary_handler;
846 }
847}
848
651/* 849/*
652 * Internal function to register an irqaction - typically used to 850 * Internal function to register an irqaction - typically used to
653 * allocate special interrupts that are part of the architecture. 851 * allocate special interrupts that are part of the architecture.
@@ -657,9 +855,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
657{ 855{
658 struct irqaction *old, **old_ptr; 856 struct irqaction *old, **old_ptr;
659 const char *old_name = NULL; 857 const char *old_name = NULL;
660 unsigned long flags; 858 unsigned long flags, thread_mask = 0;
661 int nested, shared = 0; 859 int ret, nested, shared = 0;
662 int ret; 860 cpumask_var_t mask;
663 861
664 if (!desc) 862 if (!desc)
665 return -EINVAL; 863 return -EINVAL;
@@ -683,15 +881,11 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
683 rand_initialize_irq(irq); 881 rand_initialize_irq(irq);
684 } 882 }
685 883
686 /* Oneshot interrupts are not allowed with shared */
687 if ((new->flags & IRQF_ONESHOT) && (new->flags & IRQF_SHARED))
688 return -EINVAL;
689
690 /* 884 /*
691 * Check whether the interrupt nests into another interrupt 885 * Check whether the interrupt nests into another interrupt
692 * thread. 886 * thread.
693 */ 887 */
694 nested = desc->status & IRQ_NESTED_THREAD; 888 nested = irq_settings_is_nested_thread(desc);
695 if (nested) { 889 if (nested) {
696 if (!new->thread_fn) 890 if (!new->thread_fn)
697 return -EINVAL; 891 return -EINVAL;
@@ -701,6 +895,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
701 * dummy function which warns when called. 895 * dummy function which warns when called.
702 */ 896 */
703 new->handler = irq_nested_primary_handler; 897 new->handler = irq_nested_primary_handler;
898 } else {
899 irq_setup_forced_threading(new);
704 } 900 }
705 901
706 /* 902 /*
@@ -724,6 +920,11 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
724 new->thread = t; 920 new->thread = t;
725 } 921 }
726 922
923 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) {
924 ret = -ENOMEM;
925 goto out_thread;
926 }
927
727 /* 928 /*
728 * The following block of code has to be executed atomically 929 * The following block of code has to be executed atomically
729 */ 930 */
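
The hunk below gives every shared ONESHOT action its own bit in desc->threads_oneshot: the masks of the already-installed actions are OR-ed together while walking the chain, and the new action takes the lowest bit that is still clear ("1 << ffz(thread_mask)"); the real code bails out with -EBUSY first if every bit is already taken. A toy version of that bit assignment, with a simple loop standing in for ffz():

#include <stdio.h>

/* Return the index of the lowest zero bit (bounded for the toy case). */
static unsigned long first_zero_bit(unsigned long word)
{
        unsigned long bit = 0;

        while (bit < 8 * sizeof(word) && (word & (1UL << bit)))
                bit++;
        return bit;
}

int main(void)
{
        unsigned long thread_mask = 0;
        unsigned long mask[3];

        for (int i = 0; i < 3; i++) {
                mask[i] = 1UL << first_zero_bit(thread_mask);
                thread_mask |= mask[i];
        }
        printf("masks: %#lx %#lx %#lx\n", mask[0], mask[1], mask[2]);
        return 0;
}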
@@ -735,29 +936,40 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
735 * Can't share interrupts unless both agree to and are 936 * Can't share interrupts unless both agree to and are
736 * the same type (level, edge, polarity). So both flag 937 * the same type (level, edge, polarity). So both flag
737 * fields must have IRQF_SHARED set and the bits which 938 * fields must have IRQF_SHARED set and the bits which
738 * set the trigger type must match. 939 * set the trigger type must match. Also all must
940 * agree on ONESHOT.
739 */ 941 */
740 if (!((old->flags & new->flags) & IRQF_SHARED) || 942 if (!((old->flags & new->flags) & IRQF_SHARED) ||
741 ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK)) { 943 ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) ||
944 ((old->flags ^ new->flags) & IRQF_ONESHOT)) {
742 old_name = old->name; 945 old_name = old->name;
743 goto mismatch; 946 goto mismatch;
744 } 947 }
745 948
746#if defined(CONFIG_IRQ_PER_CPU)
747 /* All handlers must agree on per-cpuness */ 949 /* All handlers must agree on per-cpuness */
748 if ((old->flags & IRQF_PERCPU) != 950 if ((old->flags & IRQF_PERCPU) !=
749 (new->flags & IRQF_PERCPU)) 951 (new->flags & IRQF_PERCPU))
750 goto mismatch; 952 goto mismatch;
751#endif
752 953
753 /* add new interrupt at end of irq queue */ 954 /* add new interrupt at end of irq queue */
754 do { 955 do {
956 thread_mask |= old->thread_mask;
755 old_ptr = &old->next; 957 old_ptr = &old->next;
756 old = *old_ptr; 958 old = *old_ptr;
757 } while (old); 959 } while (old);
758 shared = 1; 960 shared = 1;
759 } 961 }
760 962
963 /*
964 * Setup the thread mask for this irqaction. Unlikely to have
965 * 32 resp 64 irqs sharing one line, but who knows.
966 */
967 if (new->flags & IRQF_ONESHOT && thread_mask == ~0UL) {
968 ret = -EBUSY;
969 goto out_mask;
970 }
971 new->thread_mask = 1 << ffz(thread_mask);
972
761 if (!shared) { 973 if (!shared) {
762 irq_chip_set_defaults(desc->irq_data.chip); 974 irq_chip_set_defaults(desc->irq_data.chip);
763 975
@@ -769,42 +981,44 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
769 new->flags & IRQF_TRIGGER_MASK); 981 new->flags & IRQF_TRIGGER_MASK);
770 982
771 if (ret) 983 if (ret)
772 goto out_thread; 984 goto out_mask;
773 } else 985 }
774 compat_irq_chip_set_default_handler(desc);
775#if defined(CONFIG_IRQ_PER_CPU)
776 if (new->flags & IRQF_PERCPU)
777 desc->status |= IRQ_PER_CPU;
778#endif
779 986
780 desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | IRQ_ONESHOT | 987 desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \
781 IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED); 988 IRQS_INPROGRESS | IRQS_ONESHOT | \
989 IRQS_WAITING);
990
991 if (new->flags & IRQF_PERCPU) {
992 irqd_set(&desc->irq_data, IRQD_PER_CPU);
993 irq_settings_set_per_cpu(desc);
994 }
782 995
783 if (new->flags & IRQF_ONESHOT) 996 if (new->flags & IRQF_ONESHOT)
784 desc->status |= IRQ_ONESHOT; 997 desc->istate |= IRQS_ONESHOT;
785 998
786 if (!(desc->status & IRQ_NOAUTOEN)) { 999 if (irq_settings_can_autoenable(desc))
787 desc->depth = 0; 1000 irq_startup(desc);
788 desc->status &= ~IRQ_DISABLED; 1001 else
789 desc->irq_data.chip->irq_startup(&desc->irq_data);
790 } else
791 /* Undo nested disables: */ 1002 /* Undo nested disables: */
792 desc->depth = 1; 1003 desc->depth = 1;
793 1004
794 /* Exclude IRQ from balancing if requested */ 1005 /* Exclude IRQ from balancing if requested */
795 if (new->flags & IRQF_NOBALANCING) 1006 if (new->flags & IRQF_NOBALANCING) {
796 desc->status |= IRQ_NO_BALANCING; 1007 irq_settings_set_no_balancing(desc);
1008 irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
1009 }
797 1010
798 /* Set default affinity mask once everything is setup */ 1011 /* Set default affinity mask once everything is setup */
799 setup_affinity(irq, desc); 1012 setup_affinity(irq, desc, mask);
800 1013
801 } else if ((new->flags & IRQF_TRIGGER_MASK) 1014 } else if (new->flags & IRQF_TRIGGER_MASK) {
802 && (new->flags & IRQF_TRIGGER_MASK) 1015 unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK;
803 != (desc->status & IRQ_TYPE_SENSE_MASK)) { 1016 unsigned int omsk = irq_settings_get_trigger_mask(desc);
804 /* hope the handler works with the actual trigger mode... */ 1017
805 pr_warning("IRQ %d uses trigger mode %d; requested %d\n", 1018 if (nmsk != omsk)
806 irq, (int)(desc->status & IRQ_TYPE_SENSE_MASK), 1019 /* hope the handler works with current trigger mode */
807 (int)(new->flags & IRQF_TRIGGER_MASK)); 1020 pr_warning("IRQ %d uses trigger mode %u; requested %u\n",
1021 irq, nmsk, omsk);
808 } 1022 }
809 1023
810 new->irq = irq; 1024 new->irq = irq;
@@ -818,8 +1032,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
818 * Check whether we disabled the irq via the spurious handler 1032 * Check whether we disabled the irq via the spurious handler
819 * before. Reenable it and give it another chance. 1033 * before. Reenable it and give it another chance.
820 */ 1034 */
821 if (shared && (desc->status & IRQ_SPURIOUS_DISABLED)) { 1035 if (shared && (desc->istate & IRQS_SPURIOUS_DISABLED)) {
822 desc->status &= ~IRQ_SPURIOUS_DISABLED; 1036 desc->istate &= ~IRQS_SPURIOUS_DISABLED;
823 __enable_irq(desc, irq, false); 1037 __enable_irq(desc, irq, false);
824 } 1038 }
825 1039
@@ -849,6 +1063,9 @@ mismatch:
849#endif 1063#endif
850 ret = -EBUSY; 1064 ret = -EBUSY;
851 1065
1066out_mask:
1067 free_cpumask_var(mask);
1068
852out_thread: 1069out_thread:
853 raw_spin_unlock_irqrestore(&desc->lock, flags); 1070 raw_spin_unlock_irqrestore(&desc->lock, flags);
854 if (new->thread) { 1071 if (new->thread) {
@@ -871,9 +1088,14 @@ out_thread:
871 */ 1088 */
872int setup_irq(unsigned int irq, struct irqaction *act) 1089int setup_irq(unsigned int irq, struct irqaction *act)
873{ 1090{
1091 int retval;
874 struct irq_desc *desc = irq_to_desc(irq); 1092 struct irq_desc *desc = irq_to_desc(irq);
875 1093
876 return __setup_irq(irq, desc, act); 1094 chip_bus_lock(desc);
1095 retval = __setup_irq(irq, desc, act);
1096 chip_bus_sync_unlock(desc);
1097
1098 return retval;
877} 1099}
878EXPORT_SYMBOL_GPL(setup_irq); 1100EXPORT_SYMBOL_GPL(setup_irq);
879 1101
@@ -924,13 +1146,8 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
924#endif 1146#endif
925 1147
926 /* If this was the last handler, shut down the IRQ line: */ 1148 /* If this was the last handler, shut down the IRQ line: */
927 if (!desc->action) { 1149 if (!desc->action)
928 desc->status |= IRQ_DISABLED; 1150 irq_shutdown(desc);
929 if (desc->irq_data.chip->irq_shutdown)
930 desc->irq_data.chip->irq_shutdown(&desc->irq_data);
931 else
932 desc->irq_data.chip->irq_disable(&desc->irq_data);
933 }
934 1151
935#ifdef CONFIG_SMP 1152#ifdef CONFIG_SMP
936 /* make sure affinity_hint is cleaned up */ 1153 /* make sure affinity_hint is cleaned up */
@@ -1004,6 +1221,11 @@ void free_irq(unsigned int irq, void *dev_id)
1004 if (!desc) 1221 if (!desc)
1005 return; 1222 return;
1006 1223
1224#ifdef CONFIG_SMP
1225 if (WARN_ON(desc->affinity_notify))
1226 desc->affinity_notify = NULL;
1227#endif
1228
1007 chip_bus_lock(desc); 1229 chip_bus_lock(desc);
1008 kfree(__free_irq(irq, dev_id)); 1230 kfree(__free_irq(irq, dev_id));
1009 chip_bus_sync_unlock(desc); 1231 chip_bus_sync_unlock(desc);
@@ -1074,7 +1296,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
1074 if (!desc) 1296 if (!desc)
1075 return -EINVAL; 1297 return -EINVAL;
1076 1298
1077 if (desc->status & IRQ_NOREQUEST) 1299 if (!irq_settings_can_request(desc))
1078 return -EINVAL; 1300 return -EINVAL;
1079 1301
1080 if (!handler) { 1302 if (!handler) {
@@ -1100,7 +1322,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
1100 if (retval) 1322 if (retval)
1101 kfree(action); 1323 kfree(action);
1102 1324
1103#ifdef CONFIG_DEBUG_SHIRQ 1325#ifdef CONFIG_DEBUG_SHIRQ_FIXME
1104 if (!retval && (irqflags & IRQF_SHARED)) { 1326 if (!retval && (irqflags & IRQF_SHARED)) {
1105 /* 1327 /*
1106 * It's a shared IRQ -- the driver ought to be prepared for it 1328 * It's a shared IRQ -- the driver ought to be prepared for it
@@ -1149,7 +1371,7 @@ int request_any_context_irq(unsigned int irq, irq_handler_t handler,
1149 if (!desc) 1371 if (!desc)
1150 return -EINVAL; 1372 return -EINVAL;
1151 1373
1152 if (desc->status & IRQ_NESTED_THREAD) { 1374 if (irq_settings_is_nested_thread(desc)) {
1153 ret = request_threaded_irq(irq, NULL, handler, 1375 ret = request_threaded_irq(irq, NULL, handler,
1154 flags, name, dev_id); 1376 flags, name, dev_id);
1155 return !ret ? IRQC_IS_NESTED : ret; 1377 return !ret ? IRQC_IS_NESTED : ret;
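Note on the __setup_irq() changes above: the old "oneshot interrupts are not allowed with shared" rule is dropped in favour of requiring every sharer to agree on IRQF_ONESHOT, and each action on such a line gets its own bit in threads_oneshot via thread_mask = 1 << ffz(thread_mask). A minimal driver-side sketch of the case this enables, a shared line with a oneshot threaded handler, follows; the foo_* device, its register and all names are invented, only request_threaded_irq() and the IRQF_* flags are real kernel API.

#include <linux/interrupt.h>
#include <linux/io.h>

struct foo_dev {
        void __iomem *regs;             /* hypothetical "status/ack" register */
        int irq;
};

/* Primary handler: brief, runs with the line masked because of IRQF_ONESHOT. */
static irqreturn_t foo_quick_check(int irq, void *dev_id)
{
        struct foo_dev *foo = dev_id;

        if (!readl(foo->regs))          /* not our device, the line is shared */
                return IRQ_NONE;
        return IRQ_WAKE_THREAD;         /* defer the real work to the thread */
}

/* Threaded handler: may sleep; the line is unmasked again after it returns. */
static irqreturn_t foo_thread_fn(int irq, void *dev_id)
{
        struct foo_dev *foo = dev_id;

        writel(0, foo->regs);           /* acknowledge in the imaginary device */
        return IRQ_HANDLED;
}

static int foo_request(struct foo_dev *foo)
{
        return request_threaded_irq(foo->irq, foo_quick_check, foo_thread_fn,
                                    IRQF_SHARED | IRQF_ONESHOT, "foo", foo);
}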
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 1d2541940480..ec4806d4778b 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -4,23 +4,23 @@
4 4
5#include "internals.h" 5#include "internals.h"
6 6
7void move_masked_irq(int irq) 7void irq_move_masked_irq(struct irq_data *idata)
8{ 8{
9 struct irq_desc *desc = irq_to_desc(irq); 9 struct irq_desc *desc = irq_data_to_desc(idata);
10 struct irq_chip *chip = desc->irq_data.chip; 10 struct irq_chip *chip = idata->chip;
11 11
12 if (likely(!(desc->status & IRQ_MOVE_PENDING))) 12 if (likely(!irqd_is_setaffinity_pending(&desc->irq_data)))
13 return; 13 return;
14 14
15 /* 15 /*
16 * Paranoia: cpu-local interrupts shouldn't be calling in here anyway. 16 * Paranoia: cpu-local interrupts shouldn't be calling in here anyway.
17 */ 17 */
18 if (CHECK_IRQ_PER_CPU(desc->status)) { 18 if (!irqd_can_balance(&desc->irq_data)) {
19 WARN_ON(1); 19 WARN_ON(1);
20 return; 20 return;
21 } 21 }
22 22
23 desc->status &= ~IRQ_MOVE_PENDING; 23 irqd_clr_move_pending(&desc->irq_data);
24 24
25 if (unlikely(cpumask_empty(desc->pending_mask))) 25 if (unlikely(cpumask_empty(desc->pending_mask)))
26 return; 26 return;
@@ -53,18 +53,36 @@ void move_masked_irq(int irq)
53 cpumask_clear(desc->pending_mask); 53 cpumask_clear(desc->pending_mask);
54} 54}
55 55
56void move_native_irq(int irq) 56void move_masked_irq(int irq)
57{
58 irq_move_masked_irq(irq_get_irq_data(irq));
59}
60
61void irq_move_irq(struct irq_data *idata)
57{ 62{
58 struct irq_desc *desc = irq_to_desc(irq); 63 struct irq_desc *desc = irq_data_to_desc(idata);
64 bool masked;
59 65
60 if (likely(!(desc->status & IRQ_MOVE_PENDING))) 66 if (likely(!irqd_is_setaffinity_pending(idata)))
61 return; 67 return;
62 68
63 if (unlikely(desc->status & IRQ_DISABLED)) 69 if (unlikely(desc->istate & IRQS_DISABLED))
64 return; 70 return;
65 71
66 desc->irq_data.chip->irq_mask(&desc->irq_data); 72 /*
67 move_masked_irq(irq); 73 * Be careful vs. already masked interrupts. If this is a
68 desc->irq_data.chip->irq_unmask(&desc->irq_data); 74 * threaded interrupt with ONESHOT set, we can end up with an
75 * interrupt storm.
76 */
77 masked = desc->istate & IRQS_MASKED;
78 if (!masked)
79 idata->chip->irq_mask(idata);
80 irq_move_masked_irq(idata);
81 if (!masked)
82 idata->chip->irq_unmask(idata);
69} 83}
70 84
85void move_native_irq(int irq)
86{
87 irq_move_irq(irq_get_irq_data(irq));
88}
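irq_move_irq() above is intended to be called from chip code on the interrupt path, typically from an ack/eoi hook, and it only wraps irq_move_masked_irq() in a mask/unmask pair when the line is not already masked, which is what keeps ONESHOT threaded interrupts from storming. A hypothetical chip wiring it up could look like the sketch below; the chip and its callbacks are illustrative, irq_move_irq() and struct irq_chip are the real interfaces.

#include <linux/irq.h>

static void my_chip_ack(struct irq_data *data)
{
        /* hardware-specific acknowledge would go here */

        /* Complete a pending affinity change, if user space requested one. */
        irq_move_irq(data);
}

static struct irq_chip my_chip = {
        .name           = "my-chip",
        .irq_ack        = my_chip_ack,
        /* .irq_mask, .irq_unmask, .irq_set_affinity, ... omitted */
};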
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index 0d4005d85b03..f76fc00c9877 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -18,7 +18,7 @@
18 * During system-wide suspend or hibernation device drivers need to be prevented 18 * During system-wide suspend or hibernation device drivers need to be prevented
19 * from receiving interrupts and this function is provided for this purpose. 19 * from receiving interrupts and this function is provided for this purpose.
20 * It marks all interrupt lines in use, except for the timer ones, as disabled 20 * It marks all interrupt lines in use, except for the timer ones, as disabled
21 * and sets the IRQ_SUSPENDED flag for each of them. 21 * and sets the IRQS_SUSPENDED flag for each of them.
22 */ 22 */
23void suspend_device_irqs(void) 23void suspend_device_irqs(void)
24{ 24{
@@ -34,7 +34,7 @@ void suspend_device_irqs(void)
34 } 34 }
35 35
36 for_each_irq_desc(irq, desc) 36 for_each_irq_desc(irq, desc)
37 if (desc->status & IRQ_SUSPENDED) 37 if (desc->istate & IRQS_SUSPENDED)
38 synchronize_irq(irq); 38 synchronize_irq(irq);
39} 39}
40EXPORT_SYMBOL_GPL(suspend_device_irqs); 40EXPORT_SYMBOL_GPL(suspend_device_irqs);
@@ -43,7 +43,7 @@ EXPORT_SYMBOL_GPL(suspend_device_irqs);
43 * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs() 43 * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs()
44 * 44 *
45 * Enable all interrupt lines previously disabled by suspend_device_irqs() that 45 * Enable all interrupt lines previously disabled by suspend_device_irqs() that
46 * have the IRQ_SUSPENDED flag set. 46 * have the IRQS_SUSPENDED flag set.
47 */ 47 */
48void resume_device_irqs(void) 48void resume_device_irqs(void)
49{ 49{
@@ -53,9 +53,6 @@ void resume_device_irqs(void)
53 for_each_irq_desc(irq, desc) { 53 for_each_irq_desc(irq, desc) {
54 unsigned long flags; 54 unsigned long flags;
55 55
56 if (!(desc->status & IRQ_SUSPENDED))
57 continue;
58
59 raw_spin_lock_irqsave(&desc->lock, flags); 56 raw_spin_lock_irqsave(&desc->lock, flags);
60 __enable_irq(desc, irq, true); 57 __enable_irq(desc, irq, true);
61 raw_spin_unlock_irqrestore(&desc->lock, flags); 58 raw_spin_unlock_irqrestore(&desc->lock, flags);
@@ -71,9 +68,24 @@ int check_wakeup_irqs(void)
71 struct irq_desc *desc; 68 struct irq_desc *desc;
72 int irq; 69 int irq;
73 70
74 for_each_irq_desc(irq, desc) 71 for_each_irq_desc(irq, desc) {
75 if ((desc->status & IRQ_WAKEUP) && (desc->status & IRQ_PENDING)) 72 if (irqd_is_wakeup_set(&desc->irq_data)) {
76 return -EBUSY; 73 if (desc->istate & IRQS_PENDING)
74 return -EBUSY;
75 continue;
76 }
77 /*
78 * Check the non wakeup interrupts whether they need
79 * to be masked before finally going into suspend
80 * state. That's for hardware which has no wakeup
81 * source configuration facility. The chip
82 * implementation indicates that with
83 * IRQCHIP_MASK_ON_SUSPEND.
84 */
85 if (desc->istate & IRQS_SUSPENDED &&
86 irq_desc_get_chip(desc)->flags & IRQCHIP_MASK_ON_SUSPEND)
87 mask_irq(desc);
88 }
77 89
78 return 0; 90 return 0;
79} 91}
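check_wakeup_irqs() now has two jobs: abort suspend with -EBUSY if a wakeup line already has an interrupt pending, and mask every suspended non-wakeup line whose chip advertises IRQCHIP_MASK_ON_SUSPEND. A sketch of both sides, with a fictional chip and driver; IRQCHIP_MASK_ON_SUSPEND, enable_irq_wake() and disable_irq_wake() are real interfaces.

#include <linux/irq.h>
#include <linux/interrupt.h>

/*
 * A controller with no wakeup-source configuration register: letting the
 * core mask its non-wakeup lines late in suspend is the only way to keep
 * them quiet.
 */
static struct irq_chip dumb_pic = {
        .name   = "dumb-pic",
        .flags  = IRQCHIP_MASK_ON_SUSPEND,
        /* .irq_mask, .irq_unmask, ... omitted */
};

/* Driver side: mark one line as a wakeup source before suspending. */
static int mydev_suspend(int irq)
{
        return enable_irq_wake(irq);    /* checked via irqd_is_wakeup_set() above */
}

static int mydev_resume(int irq)
{
        return disable_irq_wake(irq);
}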
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 6c8a2a9f8a7b..4cc2e5ed0bec 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -11,6 +11,7 @@
11#include <linux/proc_fs.h> 11#include <linux/proc_fs.h>
12#include <linux/seq_file.h> 12#include <linux/seq_file.h>
13#include <linux/interrupt.h> 13#include <linux/interrupt.h>
14#include <linux/kernel_stat.h>
14 15
15#include "internals.h" 16#include "internals.h"
16 17
@@ -24,7 +25,7 @@ static int irq_affinity_proc_show(struct seq_file *m, void *v)
24 const struct cpumask *mask = desc->irq_data.affinity; 25 const struct cpumask *mask = desc->irq_data.affinity;
25 26
26#ifdef CONFIG_GENERIC_PENDING_IRQ 27#ifdef CONFIG_GENERIC_PENDING_IRQ
27 if (desc->status & IRQ_MOVE_PENDING) 28 if (irqd_is_setaffinity_pending(&desc->irq_data))
28 mask = desc->pending_mask; 29 mask = desc->pending_mask;
29#endif 30#endif
30 seq_cpumask(m, mask); 31 seq_cpumask(m, mask);
@@ -65,8 +66,7 @@ static ssize_t irq_affinity_proc_write(struct file *file,
65 cpumask_var_t new_value; 66 cpumask_var_t new_value;
66 int err; 67 int err;
67 68
68 if (!irq_to_desc(irq)->irq_data.chip->irq_set_affinity || no_irq_affinity || 69 if (!irq_can_set_affinity(irq) || no_irq_affinity)
69 irq_balancing_disabled(irq))
70 return -EIO; 70 return -EIO;
71 71
72 if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) 72 if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
@@ -89,7 +89,7 @@ static ssize_t irq_affinity_proc_write(struct file *file,
89 if (!cpumask_intersects(new_value, cpu_online_mask)) { 89 if (!cpumask_intersects(new_value, cpu_online_mask)) {
90 /* Special case for empty set - allow the architecture 90 /* Special case for empty set - allow the architecture
91 code to set default SMP affinity. */ 91 code to set default SMP affinity. */
92 err = irq_select_affinity_usr(irq) ? -EINVAL : count; 92 err = irq_select_affinity_usr(irq, new_value) ? -EINVAL : count;
93 } else { 93 } else {
94 irq_set_affinity(irq, new_value); 94 irq_set_affinity(irq, new_value);
95 err = count; 95 err = count;
@@ -357,3 +357,65 @@ void init_irq_proc(void)
357 } 357 }
358} 358}
359 359
360#ifdef CONFIG_GENERIC_IRQ_SHOW
361
362int __weak arch_show_interrupts(struct seq_file *p, int prec)
363{
364 return 0;
365}
366
367int show_interrupts(struct seq_file *p, void *v)
368{
369 static int prec;
370
371 unsigned long flags, any_count = 0;
372 int i = *(loff_t *) v, j;
373 struct irqaction *action;
374 struct irq_desc *desc;
375
376 if (i > nr_irqs)
377 return 0;
378
379 if (i == nr_irqs)
380 return arch_show_interrupts(p, prec);
381
382 /* print header and calculate the width of the first column */
383 if (i == 0) {
384 for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec)
385 j *= 10;
386
387 seq_printf(p, "%*s", prec + 8, "");
388 for_each_online_cpu(j)
389 seq_printf(p, "CPU%-8d", j);
390 seq_putc(p, '\n');
391 }
392
393 desc = irq_to_desc(i);
394 if (!desc)
395 return 0;
396
397 raw_spin_lock_irqsave(&desc->lock, flags);
398 for_each_online_cpu(j)
399 any_count |= kstat_irqs_cpu(i, j);
400 action = desc->action;
401 if (!action && !any_count)
402 goto out;
403
404 seq_printf(p, "%*d: ", prec, i);
405 for_each_online_cpu(j)
406 seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
407 seq_printf(p, " %8s", desc->irq_data.chip->name);
408 seq_printf(p, "-%-8s", desc->name);
409
410 if (action) {
411 seq_printf(p, " %s", action->name);
412 while ((action = action->next) != NULL)
413 seq_printf(p, ", %s", action->name);
414 }
415
416 seq_putc(p, '\n');
417out:
418 raw_spin_unlock_irqrestore(&desc->lock, flags);
419 return 0;
420}
421#endif
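The generic show_interrupts() above prints one row per IRQ: the per-CPU counts, then the chip name, the descriptor name and the action names, and it sizes the first column from nr_irqs. The width calculation is easy to misread, so here is a standalone restatement with a few sample nr_irqs values (the values themselves are arbitrary):

#include <stdio.h>

static int column_width(int nr_irqs)
{
        int prec, j;

        /* same loop as in show_interrupts(): at least 3, at most 10 digits */
        for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec)
                j *= 10;
        return prec;
}

int main(void)
{
        /* prints "3 4 6": 512 irqs fit in 3 digits, 4096 in 4, 100000 in 6 */
        printf("%d %d %d\n", column_width(512), column_width(4096),
               column_width(100000));
        return 0;
}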
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 891115a929aa..ad683a99b1ec 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -23,7 +23,7 @@
23#ifdef CONFIG_HARDIRQS_SW_RESEND 23#ifdef CONFIG_HARDIRQS_SW_RESEND
24 24
25/* Bitmap to handle software resend of interrupts: */ 25/* Bitmap to handle software resend of interrupts: */
26static DECLARE_BITMAP(irqs_resend, NR_IRQS); 26static DECLARE_BITMAP(irqs_resend, IRQ_BITMAP_BITS);
27 27
28/* 28/*
29 * Run software resends of IRQ's 29 * Run software resends of IRQ's
@@ -55,20 +55,19 @@ static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0);
55 */ 55 */
56void check_irq_resend(struct irq_desc *desc, unsigned int irq) 56void check_irq_resend(struct irq_desc *desc, unsigned int irq)
57{ 57{
58 unsigned int status = desc->status;
59
60 /*
61 * Make sure the interrupt is enabled, before resending it:
62 */
63 desc->irq_data.chip->irq_enable(&desc->irq_data);
64
65 /* 58 /*
66 * We do not resend level type interrupts. Level type 59 * We do not resend level type interrupts. Level type
67 * interrupts are resent by hardware when they are still 60 * interrupts are resent by hardware when they are still
68 * active. 61 * active.
69 */ 62 */
70 if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { 63 if (irq_settings_is_level(desc))
71 desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY; 64 return;
65 if (desc->istate & IRQS_REPLAY)
66 return;
67 if (desc->istate & IRQS_PENDING) {
68 irq_compat_clr_pending(desc);
69 desc->istate &= ~IRQS_PENDING;
70 desc->istate |= IRQS_REPLAY;
72 71
73 if (!desc->irq_data.chip->irq_retrigger || 72 if (!desc->irq_data.chip->irq_retrigger ||
74 !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) { 73 !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) {
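check_irq_resend() now flips a lost edge interrupt from IRQS_PENDING to IRQS_REPLAY and prefers a hardware retrigger over the software resend tasklet. A chip that can retrigger in hardware only has to supply irq_retrigger(); the controller below and its "software set pending" register are invented for illustration.

#include <linux/irq.h>
#include <linux/io.h>

static int fake_chip_retrigger(struct irq_data *data)
{
        /* register base stashed in chip_data when the irq was set up */
        void __iomem *swpend = (void __iomem *)data->chip_data;

        /* ask the controller to re-assert the edge interrupt in hardware */
        writel(1 << (data->irq % 32), swpend);
        return 1;       /* non-zero: done, no software resend needed */
}

static struct irq_chip fake_chip = {
        .name           = "fake",
        .irq_retrigger  = fake_chip_retrigger,
        /* .irq_mask, .irq_unmask, .irq_ack, ... omitted */
};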
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
new file mode 100644
index 000000000000..0227ad358272
--- /dev/null
+++ b/kernel/irq/settings.h
@@ -0,0 +1,138 @@
1/*
2 * Internal header to deal with irq_desc->status which will be renamed
3 * to irq_desc->settings.
4 */
5enum {
6 _IRQ_DEFAULT_INIT_FLAGS = IRQ_DEFAULT_INIT_FLAGS,
7 _IRQ_PER_CPU = IRQ_PER_CPU,
8 _IRQ_LEVEL = IRQ_LEVEL,
9 _IRQ_NOPROBE = IRQ_NOPROBE,
10 _IRQ_NOREQUEST = IRQ_NOREQUEST,
11 _IRQ_NOAUTOEN = IRQ_NOAUTOEN,
12 _IRQ_MOVE_PCNTXT = IRQ_MOVE_PCNTXT,
13 _IRQ_NO_BALANCING = IRQ_NO_BALANCING,
14 _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD,
15 _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK,
16};
17
18#define IRQ_INPROGRESS GOT_YOU_MORON
19#define IRQ_REPLAY GOT_YOU_MORON
20#define IRQ_WAITING GOT_YOU_MORON
21#define IRQ_DISABLED GOT_YOU_MORON
22#define IRQ_PENDING GOT_YOU_MORON
23#define IRQ_MASKED GOT_YOU_MORON
24#define IRQ_WAKEUP GOT_YOU_MORON
25#define IRQ_MOVE_PENDING GOT_YOU_MORON
26#define IRQ_PER_CPU GOT_YOU_MORON
27#define IRQ_NO_BALANCING GOT_YOU_MORON
28#define IRQ_AFFINITY_SET GOT_YOU_MORON
29#define IRQ_LEVEL GOT_YOU_MORON
30#define IRQ_NOPROBE GOT_YOU_MORON
31#define IRQ_NOREQUEST GOT_YOU_MORON
32#define IRQ_NOAUTOEN GOT_YOU_MORON
33#define IRQ_NESTED_THREAD GOT_YOU_MORON
34#undef IRQF_MODIFY_MASK
35#define IRQF_MODIFY_MASK GOT_YOU_MORON
36
37static inline void
38irq_settings_clr_and_set(struct irq_desc *desc, u32 clr, u32 set)
39{
40 desc->status &= ~(clr & _IRQF_MODIFY_MASK);
41 desc->status |= (set & _IRQF_MODIFY_MASK);
42}
43
44static inline bool irq_settings_is_per_cpu(struct irq_desc *desc)
45{
46 return desc->status & _IRQ_PER_CPU;
47}
48
49static inline void irq_settings_set_per_cpu(struct irq_desc *desc)
50{
51 desc->status |= _IRQ_PER_CPU;
52}
53
54static inline void irq_settings_set_no_balancing(struct irq_desc *desc)
55{
56 desc->status |= _IRQ_NO_BALANCING;
57}
58
59static inline bool irq_settings_has_no_balance_set(struct irq_desc *desc)
60{
61 return desc->status & _IRQ_NO_BALANCING;
62}
63
64static inline u32 irq_settings_get_trigger_mask(struct irq_desc *desc)
65{
66 return desc->status & IRQ_TYPE_SENSE_MASK;
67}
68
69static inline void
70irq_settings_set_trigger_mask(struct irq_desc *desc, u32 mask)
71{
72 desc->status &= ~IRQ_TYPE_SENSE_MASK;
73 desc->status |= mask & IRQ_TYPE_SENSE_MASK;
74}
75
76static inline bool irq_settings_is_level(struct irq_desc *desc)
77{
78 return desc->status & _IRQ_LEVEL;
79}
80
81static inline void irq_settings_clr_level(struct irq_desc *desc)
82{
83 desc->status &= ~_IRQ_LEVEL;
84}
85
86static inline void irq_settings_set_level(struct irq_desc *desc)
87{
88 desc->status |= _IRQ_LEVEL;
89}
90
91static inline bool irq_settings_can_request(struct irq_desc *desc)
92{
93 return !(desc->status & _IRQ_NOREQUEST);
94}
95
96static inline void irq_settings_clr_norequest(struct irq_desc *desc)
97{
98 desc->status &= ~_IRQ_NOREQUEST;
99}
100
101static inline void irq_settings_set_norequest(struct irq_desc *desc)
102{
103 desc->status |= _IRQ_NOREQUEST;
104}
105
106static inline bool irq_settings_can_probe(struct irq_desc *desc)
107{
108 return !(desc->status & _IRQ_NOPROBE);
109}
110
111static inline void irq_settings_clr_noprobe(struct irq_desc *desc)
112{
113 desc->status &= ~_IRQ_NOPROBE;
114}
115
116static inline void irq_settings_set_noprobe(struct irq_desc *desc)
117{
118 desc->status |= _IRQ_NOPROBE;
119}
120
121static inline bool irq_settings_can_move_pcntxt(struct irq_desc *desc)
122{
123 return desc->status & _IRQ_MOVE_PCNTXT;
124}
125
126static inline bool irq_settings_can_autoenable(struct irq_desc *desc)
127{
128 return !(desc->status & _IRQ_NOAUTOEN);
129}
130
131static inline bool irq_settings_is_nested_thread(struct irq_desc *desc)
132{
133 return desc->status & _IRQ_NESTED_THREAD;
134}
135
136/* Nothing should touch desc->status from now on */
137#undef status
138#define status USE_THE_PROPER_WRAPPERS_YOU_MORON
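The poisoning defines above make any leftover open-coded test of desc->status inside kernel/irq/ fail to compile, forcing core code through the accessors. A tiny before/after sketch, as it would sit in a kernel/irq/*.c file that includes this header (the helper name is made up):

/* Old style, no longer builds here:
 *      if (desc->status & IRQ_LEVEL)      ... expands to GOT_YOU_MORON ...
 *      desc->status |= IRQ_NOPROBE;       ... likewise ...
 */

/* New style, using the wrappers defined above: */
static void mark_level_and_unprobeable(struct irq_desc *desc)
{
        irq_settings_set_level(desc);
        irq_settings_set_noprobe(desc);
}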
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 3089d3b9d5f3..dd586ebf9c8c 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -21,70 +21,94 @@ static int irqfixup __read_mostly;
21#define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10) 21#define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10)
22static void poll_spurious_irqs(unsigned long dummy); 22static void poll_spurious_irqs(unsigned long dummy);
23static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0); 23static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0);
24static int irq_poll_cpu;
25static atomic_t irq_poll_active;
26
27/*
28 * We wait here for a poller to finish.
29 *
30 * If the poll runs on this CPU, then we yell loudly and return
31 * false. That will leave the interrupt line disabled in the worst
32 * case, but it should never happen.
33 *
34 * We wait until the poller is done and then recheck disabled and
35 * action (about to be disabled). Only if it's still active, we return
36 * true and let the handler run.
37 */
38bool irq_wait_for_poll(struct irq_desc *desc)
39{
40 if (WARN_ONCE(irq_poll_cpu == smp_processor_id(),
41 "irq poll in progress on cpu %d for irq %d\n",
42 smp_processor_id(), desc->irq_data.irq))
43 return false;
44
45#ifdef CONFIG_SMP
46 do {
47 raw_spin_unlock(&desc->lock);
48 while (desc->istate & IRQS_INPROGRESS)
49 cpu_relax();
50 raw_spin_lock(&desc->lock);
51 } while (desc->istate & IRQS_INPROGRESS);
52 /* Might have been disabled in meantime */
53 return !(desc->istate & IRQS_DISABLED) && desc->action;
54#else
55 return false;
56#endif
57}
58
24 59
25/* 60/*
26 * Recovery handler for misrouted interrupts. 61 * Recovery handler for misrouted interrupts.
27 */ 62 */
28static int try_one_irq(int irq, struct irq_desc *desc) 63static int try_one_irq(int irq, struct irq_desc *desc, bool force)
29{ 64{
65 irqreturn_t ret = IRQ_NONE;
30 struct irqaction *action; 66 struct irqaction *action;
31 int ok = 0, work = 0;
32 67
33 raw_spin_lock(&desc->lock); 68 raw_spin_lock(&desc->lock);
34 /* Already running on another processor */
35 if (desc->status & IRQ_INPROGRESS) {
36 /*
37 * Already running: If it is shared get the other
38 * CPU to go looking for our mystery interrupt too
39 */
40 if (desc->action && (desc->action->flags & IRQF_SHARED))
41 desc->status |= IRQ_PENDING;
42 raw_spin_unlock(&desc->lock);
43 return ok;
44 }
45 /* Honour the normal IRQ locking */
46 desc->status |= IRQ_INPROGRESS;
47 action = desc->action;
48 raw_spin_unlock(&desc->lock);
49 69
50 while (action) { 70 /* PER_CPU and nested thread interrupts are never polled */
51 /* Only shared IRQ handlers are safe to call */ 71 if (irq_settings_is_per_cpu(desc) || irq_settings_is_nested_thread(desc))
52 if (action->flags & IRQF_SHARED) { 72 goto out;
53 if (action->handler(irq, action->dev_id) ==
54 IRQ_HANDLED)
55 ok = 1;
56 }
57 action = action->next;
58 }
59 local_irq_disable();
60 /* Now clean up the flags */
61 raw_spin_lock(&desc->lock);
62 action = desc->action;
63 73
64 /* 74 /*
65 * While we were looking for a fixup someone queued a real 75 * Do not poll disabled interrupts unless the spurious
 66 * IRQ clashing with our walk: 76 * disabled poller asks explicitly.
67 */ 77 */
68 while ((desc->status & IRQ_PENDING) && action) { 78 if ((desc->istate & IRQS_DISABLED) && !force)
79 goto out;
80
81 /*
82 * All handlers must agree on IRQF_SHARED, so we test just the
83 * first. Check for action->next as well.
84 */
85 action = desc->action;
86 if (!action || !(action->flags & IRQF_SHARED) ||
87 (action->flags & __IRQF_TIMER) || !action->next)
88 goto out;
89
90 /* Already running on another processor */
91 if (desc->istate & IRQS_INPROGRESS) {
69 /* 92 /*
70 * Perform real IRQ processing for the IRQ we deferred 93 * Already running: If it is shared get the other
94 * CPU to go looking for our mystery interrupt too
71 */ 95 */
72 work = 1; 96 irq_compat_set_pending(desc);
73 raw_spin_unlock(&desc->lock); 97 desc->istate |= IRQS_PENDING;
74 handle_IRQ_event(irq, action); 98 goto out;
75 raw_spin_lock(&desc->lock);
76 desc->status &= ~IRQ_PENDING;
77 } 99 }
78 desc->status &= ~IRQ_INPROGRESS;
79 /*
80 * If we did actual work for the real IRQ line we must let the
81 * IRQ controller clean up too
82 */
83 if (work)
84 irq_end(irq, desc);
85 raw_spin_unlock(&desc->lock);
86 100
87 return ok; 101 /* Mark it poll in progress */
102 desc->istate |= IRQS_POLL_INPROGRESS;
103 do {
104 if (handle_irq_event(desc) == IRQ_HANDLED)
105 ret = IRQ_HANDLED;
106 action = desc->action;
107 } while ((desc->istate & IRQS_PENDING) && action);
108 desc->istate &= ~IRQS_POLL_INPROGRESS;
109out:
110 raw_spin_unlock(&desc->lock);
111 return ret == IRQ_HANDLED;
88} 112}
89 113
90static int misrouted_irq(int irq) 114static int misrouted_irq(int irq)
@@ -92,6 +116,11 @@ static int misrouted_irq(int irq)
92 struct irq_desc *desc; 116 struct irq_desc *desc;
93 int i, ok = 0; 117 int i, ok = 0;
94 118
 119 if (atomic_inc_return(&irq_poll_active) != 1)
120 goto out;
121
122 irq_poll_cpu = smp_processor_id();
123
95 for_each_irq_desc(i, desc) { 124 for_each_irq_desc(i, desc) {
96 if (!i) 125 if (!i)
97 continue; 126 continue;
@@ -99,9 +128,11 @@ static int misrouted_irq(int irq)
99 if (i == irq) /* Already tried */ 128 if (i == irq) /* Already tried */
100 continue; 129 continue;
101 130
102 if (try_one_irq(i, desc)) 131 if (try_one_irq(i, desc, false))
103 ok = 1; 132 ok = 1;
104 } 133 }
134out:
135 atomic_dec(&irq_poll_active);
105 /* So the caller can adjust the irq error counts */ 136 /* So the caller can adjust the irq error counts */
106 return ok; 137 return ok;
107} 138}
@@ -111,23 +142,28 @@ static void poll_spurious_irqs(unsigned long dummy)
111 struct irq_desc *desc; 142 struct irq_desc *desc;
112 int i; 143 int i;
113 144
145 if (atomic_inc_return(&irq_poll_active) != 1)
146 goto out;
147 irq_poll_cpu = smp_processor_id();
148
114 for_each_irq_desc(i, desc) { 149 for_each_irq_desc(i, desc) {
115 unsigned int status; 150 unsigned int state;
116 151
117 if (!i) 152 if (!i)
118 continue; 153 continue;
119 154
120 /* Racy but it doesn't matter */ 155 /* Racy but it doesn't matter */
121 status = desc->status; 156 state = desc->istate;
122 barrier(); 157 barrier();
123 if (!(status & IRQ_SPURIOUS_DISABLED)) 158 if (!(state & IRQS_SPURIOUS_DISABLED))
124 continue; 159 continue;
125 160
126 local_irq_disable(); 161 local_irq_disable();
127 try_one_irq(i, desc); 162 try_one_irq(i, desc, true);
128 local_irq_enable(); 163 local_irq_enable();
129 } 164 }
130 165out:
166 atomic_dec(&irq_poll_active);
131 mod_timer(&poll_spurious_irq_timer, 167 mod_timer(&poll_spurious_irq_timer,
132 jiffies + POLL_SPURIOUS_IRQ_INTERVAL); 168 jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
133} 169}
@@ -139,15 +175,13 @@ static void poll_spurious_irqs(unsigned long dummy)
139 * 175 *
140 * (The other 100-of-100,000 interrupts may have been a correctly 176 * (The other 100-of-100,000 interrupts may have been a correctly
141 * functioning device sharing an IRQ with the failing one) 177 * functioning device sharing an IRQ with the failing one)
142 *
143 * Called under desc->lock
144 */ 178 */
145
146static void 179static void
147__report_bad_irq(unsigned int irq, struct irq_desc *desc, 180__report_bad_irq(unsigned int irq, struct irq_desc *desc,
148 irqreturn_t action_ret) 181 irqreturn_t action_ret)
149{ 182{
150 struct irqaction *action; 183 struct irqaction *action;
184 unsigned long flags;
151 185
152 if (action_ret != IRQ_HANDLED && action_ret != IRQ_NONE) { 186 if (action_ret != IRQ_HANDLED && action_ret != IRQ_NONE) {
153 printk(KERN_ERR "irq event %d: bogus return value %x\n", 187 printk(KERN_ERR "irq event %d: bogus return value %x\n",
@@ -159,6 +193,13 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc,
159 dump_stack(); 193 dump_stack();
160 printk(KERN_ERR "handlers:\n"); 194 printk(KERN_ERR "handlers:\n");
161 195
196 /*
197 * We need to take desc->lock here. note_interrupt() is called
198 * w/o desc->lock held, but IRQ_PROGRESS set. We might race
199 * with something else removing an action. It's ok to take
200 * desc->lock here. See synchronize_irq().
201 */
202 raw_spin_lock_irqsave(&desc->lock, flags);
162 action = desc->action; 203 action = desc->action;
163 while (action) { 204 while (action) {
164 printk(KERN_ERR "[<%p>]", action->handler); 205 printk(KERN_ERR "[<%p>]", action->handler);
@@ -167,6 +208,7 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc,
167 printk("\n"); 208 printk("\n");
168 action = action->next; 209 action = action->next;
169 } 210 }
211 raw_spin_unlock_irqrestore(&desc->lock, flags);
170} 212}
171 213
172static void 214static void
@@ -218,6 +260,9 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc,
218void note_interrupt(unsigned int irq, struct irq_desc *desc, 260void note_interrupt(unsigned int irq, struct irq_desc *desc,
219 irqreturn_t action_ret) 261 irqreturn_t action_ret)
220{ 262{
263 if (desc->istate & IRQS_POLL_INPROGRESS)
264 return;
265
221 if (unlikely(action_ret != IRQ_HANDLED)) { 266 if (unlikely(action_ret != IRQ_HANDLED)) {
222 /* 267 /*
223 * If we are seeing only the odd spurious IRQ caused by 268 * If we are seeing only the odd spurious IRQ caused by
@@ -254,9 +299,9 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
254 * Now kill the IRQ 299 * Now kill the IRQ
255 */ 300 */
256 printk(KERN_EMERG "Disabling IRQ #%d\n", irq); 301 printk(KERN_EMERG "Disabling IRQ #%d\n", irq);
257 desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED; 302 desc->istate |= IRQS_SPURIOUS_DISABLED;
258 desc->depth++; 303 desc->depth++;
259 desc->irq_data.chip->irq_disable(&desc->irq_data); 304 irq_disable(desc);
260 305
261 mod_timer(&poll_spurious_irq_timer, 306 mod_timer(&poll_spurious_irq_timer,
262 jiffies + POLL_SPURIOUS_IRQ_INTERVAL); 307 jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
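Both pollers above use the same atomic_inc_return() guard so that only one of them walks the descriptors at a time, and they record irq_poll_cpu so irq_wait_for_poll() can detect a handler being polled on its own CPU. A condensed restatement of the guard, with the actual descriptor walk left as a placeholder:

#include <linux/atomic.h>
#include <linux/smp.h>

static atomic_t poll_active;
static int poll_cpu;

static void poll_all_lines_once(void)
{
        if (atomic_inc_return(&poll_active) != 1)
                goto out;               /* another poller is already running */

        poll_cpu = smp_processor_id();  /* lets waiters spot a self-poll */

        /* ... for_each_irq_desc() { try_one_irq(...); } ... */
out:
        atomic_dec(&poll_active);
}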
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 90f881904bb1..c58fa7da8aef 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -77,21 +77,21 @@ void __weak arch_irq_work_raise(void)
77 */ 77 */
78static void __irq_work_queue(struct irq_work *entry) 78static void __irq_work_queue(struct irq_work *entry)
79{ 79{
80 struct irq_work **head, *next; 80 struct irq_work *next;
81 81
82 head = &get_cpu_var(irq_work_list); 82 preempt_disable();
83 83
84 do { 84 do {
85 next = *head; 85 next = __this_cpu_read(irq_work_list);
86 /* Can assign non-atomic because we keep the flags set. */ 86 /* Can assign non-atomic because we keep the flags set. */
87 entry->next = next_flags(next, IRQ_WORK_FLAGS); 87 entry->next = next_flags(next, IRQ_WORK_FLAGS);
88 } while (cmpxchg(head, next, entry) != next); 88 } while (this_cpu_cmpxchg(irq_work_list, next, entry) != next);
89 89
90 /* The list was empty, raise self-interrupt to start processing. */ 90 /* The list was empty, raise self-interrupt to start processing. */
91 if (!irq_work_next(entry)) 91 if (!irq_work_next(entry))
92 arch_irq_work_raise(); 92 arch_irq_work_raise();
93 93
94 put_cpu_var(irq_work_list); 94 preempt_enable();
95} 95}
96 96
97/* 97/*
@@ -120,16 +120,16 @@ EXPORT_SYMBOL_GPL(irq_work_queue);
120 */ 120 */
121void irq_work_run(void) 121void irq_work_run(void)
122{ 122{
123 struct irq_work *list, **head; 123 struct irq_work *list;
124 124
125 head = &__get_cpu_var(irq_work_list); 125 if (this_cpu_read(irq_work_list) == NULL)
126 if (*head == NULL)
127 return; 126 return;
128 127
129 BUG_ON(!in_irq()); 128 BUG_ON(!in_irq());
130 BUG_ON(!irqs_disabled()); 129 BUG_ON(!irqs_disabled());
131 130
132 list = xchg(head, NULL); 131 list = this_cpu_xchg(irq_work_list, NULL);
132
133 while (list != NULL) { 133 while (list != NULL) {
134 struct irq_work *entry = list; 134 struct irq_work *entry = list;
135 135
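The irq_work conversion above replaces the get_cpu_var()/put_cpu_var() pair with this_cpu_*() operations: a lock-free push onto a per-CPU singly linked list via this_cpu_cmpxchg(), and a drain that takes the whole list at once with this_cpu_xchg(). The same pattern, reduced to its essentials (struct item is made up):

#include <linux/percpu.h>
#include <linux/preempt.h>

struct item {
        struct item *next;
};

static DEFINE_PER_CPU(struct item *, pending);

static void push_item(struct item *it)
{
        struct item *head;

        preempt_disable();              /* stay on one CPU across the loop */
        do {
                head = __this_cpu_read(pending);
                it->next = head;
        } while (this_cpu_cmpxchg(pending, head, it) != head);
        preempt_enable();
}

static struct item *drain_items(void)
{
        /* atomically take the whole list; the caller walks it privately */
        return this_cpu_xchg(pending, NULL);
}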
diff --git a/kernel/kexec.c b/kernel/kexec.c
index b55045bc7563..ec19b92c7ebd 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -163,7 +163,7 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
163 * just verifies it is an address we can use. 163 * just verifies it is an address we can use.
164 * 164 *
165 * Since the kernel does everything in page size chunks ensure 165 * Since the kernel does everything in page size chunks ensure
166 * the destination addreses are page aligned. Too many 166 * the destination addresses are page aligned. Too many
167 * special cases crop of when we don't do this. The most 167 * special cases crop of when we don't do this. The most
168 * insidious is getting overlapping destination addresses 168 * insidious is getting overlapping destination addresses
169 * simply because addresses are changed to page size 169 * simply because addresses are changed to page size
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 7663e5df0e6f..77981813a1e7 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -317,12 +317,12 @@ void __kprobes free_optinsn_slot(kprobe_opcode_t * slot, int dirty)
317/* We have preemption disabled.. so it is safe to use __ versions */ 317/* We have preemption disabled.. so it is safe to use __ versions */
318static inline void set_kprobe_instance(struct kprobe *kp) 318static inline void set_kprobe_instance(struct kprobe *kp)
319{ 319{
320 __get_cpu_var(kprobe_instance) = kp; 320 __this_cpu_write(kprobe_instance, kp);
321} 321}
322 322
323static inline void reset_kprobe_instance(void) 323static inline void reset_kprobe_instance(void)
324{ 324{
325 __get_cpu_var(kprobe_instance) = NULL; 325 __this_cpu_write(kprobe_instance, NULL);
326} 326}
327 327
328/* 328/*
@@ -965,7 +965,7 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
965static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, 965static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
966 int trapnr) 966 int trapnr)
967{ 967{
968 struct kprobe *cur = __get_cpu_var(kprobe_instance); 968 struct kprobe *cur = __this_cpu_read(kprobe_instance);
969 969
970 /* 970 /*
971 * if we faulted "during" the execution of a user specified 971 * if we faulted "during" the execution of a user specified
@@ -980,7 +980,7 @@ static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
980 980
981static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs) 981static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
982{ 982{
983 struct kprobe *cur = __get_cpu_var(kprobe_instance); 983 struct kprobe *cur = __this_cpu_read(kprobe_instance);
984 int ret = 0; 984 int ret = 0;
985 985
986 if (cur && cur->break_handler) { 986 if (cur && cur->break_handler) {
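The kprobes hunks are a mechanical conversion from __get_cpu_var() to __this_cpu_read()/__this_cpu_write(), which lets architectures use a single instruction instead of first computing the per-CPU address. The general shape, for any per-CPU scalar (the variable is illustrative; as in kprobes, callers are expected to run with preemption disabled):

#include <linux/percpu.h>

static DEFINE_PER_CPU(unsigned long, example_counter);

static void bump_example_counter(void)
{
        /* old:  __get_cpu_var(example_counter)++;  */
        __this_cpu_write(example_counter,
                         __this_cpu_read(example_counter) + 1);
}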
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 5355cfd44a3f..c55afba990a3 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -148,7 +148,7 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
148 wait_for_completion(&create.done); 148 wait_for_completion(&create.done);
149 149
150 if (!IS_ERR(create.result)) { 150 if (!IS_ERR(create.result)) {
151 static struct sched_param param = { .sched_priority = 0 }; 151 static const struct sched_param param = { .sched_priority = 0 };
152 va_list args; 152 va_list args;
153 153
154 va_start(args, namefmt); 154 va_start(args, namefmt);
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 17110a4a4fc2..ee74b35e528d 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -241,24 +241,19 @@ static int lstats_show(struct seq_file *m, void *v)
241 seq_puts(m, "Latency Top version : v0.1\n"); 241 seq_puts(m, "Latency Top version : v0.1\n");
242 242
243 for (i = 0; i < MAXLR; i++) { 243 for (i = 0; i < MAXLR; i++) {
244 if (latency_record[i].backtrace[0]) { 244 struct latency_record *lr = &latency_record[i];
245
246 if (lr->backtrace[0]) {
245 int q; 247 int q;
246 seq_printf(m, "%i %lu %lu ", 248 seq_printf(m, "%i %lu %lu",
247 latency_record[i].count, 249 lr->count, lr->time, lr->max);
248 latency_record[i].time,
249 latency_record[i].max);
250 for (q = 0; q < LT_BACKTRACEDEPTH; q++) { 250 for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
251 char sym[KSYM_SYMBOL_LEN]; 251 unsigned long bt = lr->backtrace[q];
252 char *c; 252 if (!bt)
253 if (!latency_record[i].backtrace[q])
254 break; 253 break;
255 if (latency_record[i].backtrace[q] == ULONG_MAX) 254 if (bt == ULONG_MAX)
256 break; 255 break;
257 sprint_symbol(sym, latency_record[i].backtrace[q]); 256 seq_printf(m, " %ps", (void *)bt);
258 c = strchr(sym, '+');
259 if (c)
260 *c = 0;
261 seq_printf(m, "%s ", sym);
262 } 257 }
263 seq_printf(m, "\n"); 258 seq_printf(m, "\n");
264 } 259 }
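The latencytop cleanup leans on the "%ps" printk extension, which prints the bare symbol name for a kernel text address (where "%pS" would append +offset/size), so the manual sprint_symbol() plus strchr('+') truncation could go. Illustrative use outside a seq_file:

#include <linux/kernel.h>

static void show_caller(unsigned long addr)
{
        /* prints e.g. "caller: msleep" rather than "caller: msleep+0x1c/0x30" */
        printk(KERN_DEBUG "caller: %ps\n", (void *)addr);
}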
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 42ba65dff7d9..0d2058da80f5 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2292,22 +2292,6 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark)
2292} 2292}
2293 2293
2294/* 2294/*
2295 * Debugging helper: via this flag we know that we are in
2296 * 'early bootup code', and will warn about any invalid irqs-on event:
2297 */
2298static int early_boot_irqs_enabled;
2299
2300void early_boot_irqs_off(void)
2301{
2302 early_boot_irqs_enabled = 0;
2303}
2304
2305void early_boot_irqs_on(void)
2306{
2307 early_boot_irqs_enabled = 1;
2308}
2309
2310/*
2311 * Hardirqs will be enabled: 2295 * Hardirqs will be enabled:
2312 */ 2296 */
2313void trace_hardirqs_on_caller(unsigned long ip) 2297void trace_hardirqs_on_caller(unsigned long ip)
@@ -2319,7 +2303,7 @@ void trace_hardirqs_on_caller(unsigned long ip)
2319 if (unlikely(!debug_locks || current->lockdep_recursion)) 2303 if (unlikely(!debug_locks || current->lockdep_recursion))
2320 return; 2304 return;
2321 2305
2322 if (DEBUG_LOCKS_WARN_ON(unlikely(!early_boot_irqs_enabled))) 2306 if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled)))
2323 return; 2307 return;
2324 2308
2325 if (unlikely(curr->hardirqs_enabled)) { 2309 if (unlikely(curr->hardirqs_enabled)) {
diff --git a/kernel/module.c b/kernel/module.c
index 34e00b708fad..efa290ea94bf 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2460,9 +2460,9 @@ static void find_module_sections(struct module *mod, struct load_info *info)
2460#endif 2460#endif
2461 2461
2462#ifdef CONFIG_TRACEPOINTS 2462#ifdef CONFIG_TRACEPOINTS
2463 mod->tracepoints = section_objs(info, "__tracepoints", 2463 mod->tracepoints_ptrs = section_objs(info, "__tracepoints_ptrs",
2464 sizeof(*mod->tracepoints), 2464 sizeof(*mod->tracepoints_ptrs),
2465 &mod->num_tracepoints); 2465 &mod->num_tracepoints);
2466#endif 2466#endif
2467#ifdef HAVE_JUMP_LABEL 2467#ifdef HAVE_JUMP_LABEL
2468 mod->jump_entries = section_objs(info, "__jump_table", 2468 mod->jump_entries = section_objs(info, "__jump_table",
@@ -3393,7 +3393,7 @@ void module_layout(struct module *mod,
3393 struct modversion_info *ver, 3393 struct modversion_info *ver,
3394 struct kernel_param *kp, 3394 struct kernel_param *kp,
3395 struct kernel_symbol *ks, 3395 struct kernel_symbol *ks,
3396 struct tracepoint *tp) 3396 struct tracepoint * const *tp)
3397{ 3397{
3398} 3398}
3399EXPORT_SYMBOL(module_layout); 3399EXPORT_SYMBOL(module_layout);
@@ -3407,8 +3407,8 @@ void module_update_tracepoints(void)
3407 mutex_lock(&module_mutex); 3407 mutex_lock(&module_mutex);
3408 list_for_each_entry(mod, &modules, list) 3408 list_for_each_entry(mod, &modules, list)
3409 if (!mod->taints) 3409 if (!mod->taints)
3410 tracepoint_update_probe_range(mod->tracepoints, 3410 tracepoint_update_probe_range(mod->tracepoints_ptrs,
3411 mod->tracepoints + mod->num_tracepoints); 3411 mod->tracepoints_ptrs + mod->num_tracepoints);
3412 mutex_unlock(&module_mutex); 3412 mutex_unlock(&module_mutex);
3413} 3413}
3414 3414
@@ -3432,8 +3432,8 @@ int module_get_iter_tracepoints(struct tracepoint_iter *iter)
3432 else if (iter_mod > iter->module) 3432 else if (iter_mod > iter->module)
3433 iter->tracepoint = NULL; 3433 iter->tracepoint = NULL;
3434 found = tracepoint_get_iter_range(&iter->tracepoint, 3434 found = tracepoint_get_iter_range(&iter->tracepoint,
3435 iter_mod->tracepoints, 3435 iter_mod->tracepoints_ptrs,
3436 iter_mod->tracepoints 3436 iter_mod->tracepoints_ptrs
3437 + iter_mod->num_tracepoints); 3437 + iter_mod->num_tracepoints);
3438 if (found) { 3438 if (found) {
3439 iter->module = iter_mod; 3439 iter->module = iter_mod;
diff --git a/kernel/panic.c b/kernel/panic.c
index 4c13b1a88ebb..991bb87a1704 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -34,6 +34,7 @@ static int pause_on_oops_flag;
34static DEFINE_SPINLOCK(pause_on_oops_lock); 34static DEFINE_SPINLOCK(pause_on_oops_lock);
35 35
36int panic_timeout; 36int panic_timeout;
37EXPORT_SYMBOL_GPL(panic_timeout);
37 38
38ATOMIC_NOTIFIER_HEAD(panic_notifier_list); 39ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
39 40
diff --git a/kernel/params.c b/kernel/params.c
index 08107d181758..0da1411222b9 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -719,9 +719,7 @@ void destroy_params(const struct kernel_param *params, unsigned num)
719 params[i].ops->free(params[i].arg); 719 params[i].ops->free(params[i].arg);
720} 720}
721 721
722static void __init kernel_add_sysfs_param(const char *name, 722static struct module_kobject * __init locate_module_kobject(const char *name)
723 struct kernel_param *kparam,
724 unsigned int name_skip)
725{ 723{
726 struct module_kobject *mk; 724 struct module_kobject *mk;
727 struct kobject *kobj; 725 struct kobject *kobj;
@@ -729,10 +727,7 @@ static void __init kernel_add_sysfs_param(const char *name,
729 727
730 kobj = kset_find_obj(module_kset, name); 728 kobj = kset_find_obj(module_kset, name);
731 if (kobj) { 729 if (kobj) {
732 /* We already have one. Remove params so we can add more. */
733 mk = to_module_kobject(kobj); 730 mk = to_module_kobject(kobj);
734 /* We need to remove it before adding parameters. */
735 sysfs_remove_group(&mk->kobj, &mk->mp->grp);
736 } else { 731 } else {
737 mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL); 732 mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL);
738 BUG_ON(!mk); 733 BUG_ON(!mk);
@@ -743,15 +738,36 @@ static void __init kernel_add_sysfs_param(const char *name,
743 "%s", name); 738 "%s", name);
744 if (err) { 739 if (err) {
745 kobject_put(&mk->kobj); 740 kobject_put(&mk->kobj);
746 printk(KERN_ERR "Module '%s' failed add to sysfs, " 741 printk(KERN_ERR
747 "error number %d\n", name, err); 742 "Module '%s' failed add to sysfs, error number %d\n",
748 printk(KERN_ERR "The system will be unstable now.\n"); 743 name, err);
749 return; 744 printk(KERN_ERR
745 "The system will be unstable now.\n");
746 return NULL;
750 } 747 }
751 /* So that exit path is even. */ 748
749 /* So that we hold reference in both cases. */
752 kobject_get(&mk->kobj); 750 kobject_get(&mk->kobj);
753 } 751 }
754 752
753 return mk;
754}
755
756static void __init kernel_add_sysfs_param(const char *name,
757 struct kernel_param *kparam,
758 unsigned int name_skip)
759{
760 struct module_kobject *mk;
761 int err;
762
763 mk = locate_module_kobject(name);
764 if (!mk)
765 return;
766
767 /* We need to remove old parameters before adding more. */
768 if (mk->mp)
769 sysfs_remove_group(&mk->kobj, &mk->mp->grp);
770
755 /* These should not fail at boot. */ 771 /* These should not fail at boot. */
756 err = add_sysfs_param(mk, kparam, kparam->name + name_skip); 772 err = add_sysfs_param(mk, kparam, kparam->name + name_skip);
757 BUG_ON(err); 773 BUG_ON(err);
@@ -796,6 +812,32 @@ static void __init param_sysfs_builtin(void)
796 } 812 }
797} 813}
798 814
815ssize_t __modver_version_show(struct module_attribute *mattr,
816 struct module *mod, char *buf)
817{
818 struct module_version_attribute *vattr =
819 container_of(mattr, struct module_version_attribute, mattr);
820
821 return sprintf(buf, "%s\n", vattr->version);
822}
823
824extern struct module_version_attribute __start___modver[], __stop___modver[];
825
826static void __init version_sysfs_builtin(void)
827{
828 const struct module_version_attribute *vattr;
829 struct module_kobject *mk;
830 int err;
831
832 for (vattr = __start___modver; vattr < __stop___modver; vattr++) {
833 mk = locate_module_kobject(vattr->module_name);
834 if (mk) {
835 err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr);
836 kobject_uevent(&mk->kobj, KOBJ_ADD);
837 kobject_put(&mk->kobj);
838 }
839 }
840}
799 841
800/* module-related sysfs stuff */ 842/* module-related sysfs stuff */
801 843
@@ -875,6 +917,7 @@ static int __init param_sysfs_init(void)
875 } 917 }
876 module_sysfs_initialized = 1; 918 module_sysfs_initialized = 1;
877 919
920 version_sysfs_builtin();
878 param_sysfs_builtin(); 921 param_sysfs_builtin();
879 922
880 return 0; 923 return 0;
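version_sysfs_builtin() above walks the __modver section, which (assuming the matching MODULE_VERSION() change in module.h from this series) is where built-in code's version strings are recorded, and exposes each one as /sys/module/<name>/version. From a driver's point of view nothing changes; the declaration below is an example with an invented version string:

#include <linux/module.h>

MODULE_DESCRIPTION("Example driver");
MODULE_LICENSE("GPL");
MODULE_VERSION("2.1");  /* built-in: recorded in __modver; modular: a module attribute */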
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 11847bf1e8cc..ed253aa24ba4 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -38,7 +38,96 @@
38 38
39#include <asm/irq_regs.h> 39#include <asm/irq_regs.h>
40 40
41atomic_t perf_task_events __read_mostly; 41struct remote_function_call {
42 struct task_struct *p;
43 int (*func)(void *info);
44 void *info;
45 int ret;
46};
47
48static void remote_function(void *data)
49{
50 struct remote_function_call *tfc = data;
51 struct task_struct *p = tfc->p;
52
53 if (p) {
54 tfc->ret = -EAGAIN;
55 if (task_cpu(p) != smp_processor_id() || !task_curr(p))
56 return;
57 }
58
59 tfc->ret = tfc->func(tfc->info);
60}
61
62/**
63 * task_function_call - call a function on the cpu on which a task runs
64 * @p: the task to evaluate
65 * @func: the function to be called
66 * @info: the function call argument
67 *
68 * Calls the function @func when the task is currently running. This might
69 * be on the current CPU, which just calls the function directly
70 *
71 * returns: @func return value, or
72 * -ESRCH - when the process isn't running
73 * -EAGAIN - when the process moved away
74 */
75static int
76task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
77{
78 struct remote_function_call data = {
79 .p = p,
80 .func = func,
81 .info = info,
82 .ret = -ESRCH, /* No such (running) process */
83 };
84
85 if (task_curr(p))
86 smp_call_function_single(task_cpu(p), remote_function, &data, 1);
87
88 return data.ret;
89}
90
91/**
92 * cpu_function_call - call a function on the cpu
93 * @func: the function to be called
94 * @info: the function call argument
95 *
96 * Calls the function @func on the remote cpu.
97 *
98 * returns: @func return value or -ENXIO when the cpu is offline
99 */
100static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
101{
102 struct remote_function_call data = {
103 .p = NULL,
104 .func = func,
105 .info = info,
106 .ret = -ENXIO, /* No such CPU */
107 };
108
109 smp_call_function_single(cpu, remote_function, &data, 1);
110
111 return data.ret;
112}
113
114#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
115 PERF_FLAG_FD_OUTPUT |\
116 PERF_FLAG_PID_CGROUP)
117
118enum event_type_t {
119 EVENT_FLEXIBLE = 0x1,
120 EVENT_PINNED = 0x2,
121 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
122};
123
124/*
125 * perf_sched_events : >0 events exist
126 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
127 */
128atomic_t perf_sched_events __read_mostly;
129static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
130
42static atomic_t nr_mmap_events __read_mostly; 131static atomic_t nr_mmap_events __read_mostly;
43static atomic_t nr_comm_events __read_mostly; 132static atomic_t nr_comm_events __read_mostly;
44static atomic_t nr_task_events __read_mostly; 133static atomic_t nr_task_events __read_mostly;
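task_function_call() and cpu_function_call() above are file-local helpers that bounce a callback to the CPU a task is currently running on (or to a given CPU) via smp_call_function_single(), returning -ESRCH, -EAGAIN or -ENXIO as documented. A usage sketch in the same spirit as the later perf callers; the callback and its payload are invented.

#include <linux/sched.h>

static int count_something(void *info)
{
        int *counter = info;

        (*counter)++;           /* runs with interrupts disabled on the target CPU */
        return 0;
}

static int poke_task(struct task_struct *p)
{
        int counter = 0;
        int ret;

        ret = task_function_call(p, count_something, &counter);
        /* -ESRCH:  the task is not running anywhere right now  */
        /* -EAGAIN: the task moved away before the IPI arrived  */
        return ret;
}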
@@ -61,10 +150,37 @@ int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */
61/* 150/*
62 * max perf event sample rate 151 * max perf event sample rate
63 */ 152 */
64int sysctl_perf_event_sample_rate __read_mostly = 100000; 153#define DEFAULT_MAX_SAMPLE_RATE 100000
154int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
155static int max_samples_per_tick __read_mostly =
156 DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
157
158int perf_proc_update_handler(struct ctl_table *table, int write,
159 void __user *buffer, size_t *lenp,
160 loff_t *ppos)
161{
162 int ret = proc_dointvec(table, write, buffer, lenp, ppos);
163
164 if (ret || !write)
165 return ret;
166
167 max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
168
169 return 0;
170}
65 171
66static atomic64_t perf_event_id; 172static atomic64_t perf_event_id;
67 173
174static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
175 enum event_type_t event_type);
176
177static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
178 enum event_type_t event_type,
179 struct task_struct *task);
180
181static void update_context_time(struct perf_event_context *ctx);
182static u64 perf_event_time(struct perf_event *event);
183
68void __weak perf_event_print_debug(void) { } 184void __weak perf_event_print_debug(void) { }
69 185
70extern __weak const char *perf_pmu_name(void) 186extern __weak const char *perf_pmu_name(void)
@@ -72,6 +188,365 @@ extern __weak const char *perf_pmu_name(void)
72 return "pmu"; 188 return "pmu";
73} 189}
74 190
191static inline u64 perf_clock(void)
192{
193 return local_clock();
194}
195
196static inline struct perf_cpu_context *
197__get_cpu_context(struct perf_event_context *ctx)
198{
199 return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
200}
201
202#ifdef CONFIG_CGROUP_PERF
203
204/*
205 * Must ensure cgroup is pinned (css_get) before calling
206 * this function. In other words, we cannot call this function
207 * if there is no cgroup event for the current CPU context.
208 */
209static inline struct perf_cgroup *
210perf_cgroup_from_task(struct task_struct *task)
211{
212 return container_of(task_subsys_state(task, perf_subsys_id),
213 struct perf_cgroup, css);
214}
215
216static inline bool
217perf_cgroup_match(struct perf_event *event)
218{
219 struct perf_event_context *ctx = event->ctx;
220 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
221
222 return !event->cgrp || event->cgrp == cpuctx->cgrp;
223}
224
225static inline void perf_get_cgroup(struct perf_event *event)
226{
227 css_get(&event->cgrp->css);
228}
229
230static inline void perf_put_cgroup(struct perf_event *event)
231{
232 css_put(&event->cgrp->css);
233}
234
235static inline void perf_detach_cgroup(struct perf_event *event)
236{
237 perf_put_cgroup(event);
238 event->cgrp = NULL;
239}
240
241static inline int is_cgroup_event(struct perf_event *event)
242{
243 return event->cgrp != NULL;
244}
245
246static inline u64 perf_cgroup_event_time(struct perf_event *event)
247{
248 struct perf_cgroup_info *t;
249
250 t = per_cpu_ptr(event->cgrp->info, event->cpu);
251 return t->time;
252}
253
254static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
255{
256 struct perf_cgroup_info *info;
257 u64 now;
258
259 now = perf_clock();
260
261 info = this_cpu_ptr(cgrp->info);
262
263 info->time += now - info->timestamp;
264 info->timestamp = now;
265}
266
267static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
268{
269 struct perf_cgroup *cgrp_out = cpuctx->cgrp;
270 if (cgrp_out)
271 __update_cgrp_time(cgrp_out);
272}
273
274static inline void update_cgrp_time_from_event(struct perf_event *event)
275{
276 struct perf_cgroup *cgrp;
277
278 /*
279 * ensure we access cgroup data only when needed and
280 * when we know the cgroup is pinned (css_get)
281 */
282 if (!is_cgroup_event(event))
283 return;
284
285 cgrp = perf_cgroup_from_task(current);
286 /*
287 * Do not update time when cgroup is not active
288 */
289 if (cgrp == event->cgrp)
290 __update_cgrp_time(event->cgrp);
291}
292
293static inline void
294perf_cgroup_set_timestamp(struct task_struct *task,
295 struct perf_event_context *ctx)
296{
297 struct perf_cgroup *cgrp;
298 struct perf_cgroup_info *info;
299
300 /*
301 * ctx->lock held by caller
302 * ensure we do not access cgroup data
303 * unless we have the cgroup pinned (css_get)
304 */
305 if (!task || !ctx->nr_cgroups)
306 return;
307
308 cgrp = perf_cgroup_from_task(task);
309 info = this_cpu_ptr(cgrp->info);
310 info->timestamp = ctx->timestamp;
311}
312
313#define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */
314#define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */
315
316/*
317 * reschedule events based on the cgroup constraint of task.
318 *
319 * mode SWOUT : schedule out everything
320 * mode SWIN : schedule in based on cgroup for next
321 */
322void perf_cgroup_switch(struct task_struct *task, int mode)
323{
324 struct perf_cpu_context *cpuctx;
325 struct pmu *pmu;
326 unsigned long flags;
327
328 /*
 329 * disable interrupts to avoid getting nr_cgroup
330 * changes via __perf_event_disable(). Also
331 * avoids preemption.
332 */
333 local_irq_save(flags);
334
335 /*
336 * we reschedule only in the presence of cgroup
337 * constrained events.
338 */
339 rcu_read_lock();
340
341 list_for_each_entry_rcu(pmu, &pmus, entry) {
342
343 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
344
345 perf_pmu_disable(cpuctx->ctx.pmu);
346
347 /*
348 * perf_cgroup_events says at least one
349 * context on this CPU has cgroup events.
350 *
351 * ctx->nr_cgroups reports the number of cgroup
352 * events for a context.
353 */
354 if (cpuctx->ctx.nr_cgroups > 0) {
355
356 if (mode & PERF_CGROUP_SWOUT) {
357 cpu_ctx_sched_out(cpuctx, EVENT_ALL);
358 /*
359 * must not be done before ctxswout due
360 * to event_filter_match() in event_sched_out()
361 */
362 cpuctx->cgrp = NULL;
363 }
364
365 if (mode & PERF_CGROUP_SWIN) {
366 /* set cgrp before ctxsw in to
367 * allow event_filter_match() to not
368 * have to pass task around
369 */
370 cpuctx->cgrp = perf_cgroup_from_task(task);
371 cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
372 }
373 }
374
375 perf_pmu_enable(cpuctx->ctx.pmu);
376 }
377
378 rcu_read_unlock();
379
380 local_irq_restore(flags);
381}
382
383static inline void perf_cgroup_sched_out(struct task_struct *task)
384{
385 perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
386}
387
388static inline void perf_cgroup_sched_in(struct task_struct *task)
389{
390 perf_cgroup_switch(task, PERF_CGROUP_SWIN);
391}
392
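/*
 * For orientation, a condensed view of the callers added further down in
 * this patch: the context-switch hooks only bother with cgroup switching
 * when the per-cpu perf_cgroup_events counter says cgroup events exist on
 * this CPU.
 *
 *	__perf_event_task_sched_out(task, next):
 *		if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
 *			perf_cgroup_sched_out(task);	// SWOUT: tear down prev
 *
 *	__perf_event_task_sched_in(task):
 *		if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
 *			perf_cgroup_sched_in(task);	// SWIN: bring in next
 */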
393static inline int perf_cgroup_connect(int fd, struct perf_event *event,
394 struct perf_event_attr *attr,
395 struct perf_event *group_leader)
396{
397 struct perf_cgroup *cgrp;
398 struct cgroup_subsys_state *css;
399 struct file *file;
400 int ret = 0, fput_needed;
401
402 file = fget_light(fd, &fput_needed);
403 if (!file)
404 return -EBADF;
405
406 css = cgroup_css_from_dir(file, perf_subsys_id);
407 if (IS_ERR(css)) {
408 ret = PTR_ERR(css);
409 goto out;
410 }
411
412 cgrp = container_of(css, struct perf_cgroup, css);
413 event->cgrp = cgrp;
414
415 /* must be done before we fput() the file */
416 perf_get_cgroup(event);
417
418 /*
419 * all events in a group must monitor
420 * the same cgroup because a task belongs
421 * to only one perf cgroup at a time
422 */
423 if (group_leader && group_leader->cgrp != cgrp) {
424 perf_detach_cgroup(event);
425 ret = -EINVAL;
426 }
427out:
428 fput_light(file, fput_needed);
429 return ret;
430}
431
432static inline void
433perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
434{
435 struct perf_cgroup_info *t;
436 t = per_cpu_ptr(event->cgrp->info, event->cpu);
437 event->shadow_ctx_time = now - t->timestamp;
438}
439
440static inline void
441perf_cgroup_defer_enabled(struct perf_event *event)
442{
443 /*
444 * when the current task's perf cgroup does not match
445 * the event's, we need to remember to call the
446 * perf_cgroup_mark_enabled() function the first time a task with
447 * a matching perf cgroup is scheduled in.
448 */
449 if (is_cgroup_event(event) && !perf_cgroup_match(event))
450 event->cgrp_defer_enabled = 1;
451}
452
453static inline void
454perf_cgroup_mark_enabled(struct perf_event *event,
455 struct perf_event_context *ctx)
456{
457 struct perf_event *sub;
458 u64 tstamp = perf_event_time(event);
459
460 if (!event->cgrp_defer_enabled)
461 return;
462
463 event->cgrp_defer_enabled = 0;
464
465 event->tstamp_enabled = tstamp - event->total_time_enabled;
466 list_for_each_entry(sub, &event->sibling_list, group_entry) {
467 if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
468 sub->tstamp_enabled = tstamp - sub->total_time_enabled;
469 sub->cgrp_defer_enabled = 0;
470 }
471 }
472}
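/*
 * The deferred-enable handshake introduced by the two helpers above works
 * in two steps, both visible in later hunks of this patch:
 *
 *   1. __perf_event_enable() runs while a non-matching task is current:
 *      event_filter_match() fails, so perf_cgroup_defer_enabled() only
 *      records cgrp_defer_enabled instead of starting to charge enabled
 *      time from an unrelated cgroup's vantage point.
 *
 *   2. ctx_pinned_sched_in()/ctx_flexible_sched_in() call
 *      perf_cgroup_mark_enabled() once a task of the monitored cgroup is
 *      scheduled in, rebasing tstamp_enabled for the event and its
 *      siblings to perf_event_time() at that moment.
 */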
473#else /* !CONFIG_CGROUP_PERF */
474
475static inline bool
476perf_cgroup_match(struct perf_event *event)
477{
478 return true;
479}
480
481static inline void perf_detach_cgroup(struct perf_event *event)
482{}
483
484static inline int is_cgroup_event(struct perf_event *event)
485{
486 return 0;
487}
488
489static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event)
490{
491 return 0;
492}
493
494static inline void update_cgrp_time_from_event(struct perf_event *event)
495{
496}
497
498static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
499{
500}
501
502static inline void perf_cgroup_sched_out(struct task_struct *task)
503{
504}
505
506static inline void perf_cgroup_sched_in(struct task_struct *task)
507{
508}
509
510static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
511 struct perf_event_attr *attr,
512 struct perf_event *group_leader)
513{
514 return -EINVAL;
515}
516
517static inline void
518perf_cgroup_set_timestamp(struct task_struct *task,
519 struct perf_event_context *ctx)
520{
521}
522
523void
524perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
525{
526}
527
528static inline void
529perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
530{
531}
532
533static inline u64 perf_cgroup_event_time(struct perf_event *event)
534{
535 return 0;
536}
537
538static inline void
539perf_cgroup_defer_enabled(struct perf_event *event)
540{
541}
542
543static inline void
544perf_cgroup_mark_enabled(struct perf_event *event,
545 struct perf_event_context *ctx)
546{
547}
548#endif
549
75void perf_pmu_disable(struct pmu *pmu) 550void perf_pmu_disable(struct pmu *pmu)
76{ 551{
77 int *count = this_cpu_ptr(pmu->pmu_disable_count); 552 int *count = this_cpu_ptr(pmu->pmu_disable_count);
@@ -237,12 +712,6 @@ static void perf_unpin_context(struct perf_event_context *ctx)
237 raw_spin_lock_irqsave(&ctx->lock, flags); 712 raw_spin_lock_irqsave(&ctx->lock, flags);
238 --ctx->pin_count; 713 --ctx->pin_count;
239 raw_spin_unlock_irqrestore(&ctx->lock, flags); 714 raw_spin_unlock_irqrestore(&ctx->lock, flags);
240 put_ctx(ctx);
241}
242
243static inline u64 perf_clock(void)
244{
245 return local_clock();
246} 715}
247 716
248/* 717/*
@@ -256,6 +725,16 @@ static void update_context_time(struct perf_event_context *ctx)
256 ctx->timestamp = now; 725 ctx->timestamp = now;
257} 726}
258 727
728static u64 perf_event_time(struct perf_event *event)
729{
730 struct perf_event_context *ctx = event->ctx;
731
732 if (is_cgroup_event(event))
733 return perf_cgroup_event_time(event);
734
735 return ctx ? ctx->time : 0;
736}
737
259/* 738/*
260 * Update the total_time_enabled and total_time_running fields for an event. 739 * Update the total_time_enabled and total_time_running fields for an event.
261 */ 740 */
@@ -267,8 +746,19 @@ static void update_event_times(struct perf_event *event)
267 if (event->state < PERF_EVENT_STATE_INACTIVE || 746 if (event->state < PERF_EVENT_STATE_INACTIVE ||
268 event->group_leader->state < PERF_EVENT_STATE_INACTIVE) 747 event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
269 return; 748 return;
270 749 /*
271 if (ctx->is_active) 750 * in cgroup mode, time_enabled represents
751 * the time the event was enabled AND active
752 * tasks were in the monitored cgroup. This is
753 * independent of the activity of the context as
754 * there may be a mix of cgroup and non-cgroup events.
755 *
756 * That is why we treat cgroup events differently
757 * here.
758 */
759 if (is_cgroup_event(event))
760 run_end = perf_event_time(event);
761 else if (ctx->is_active)
272 run_end = ctx->time; 762 run_end = ctx->time;
273 else 763 else
274 run_end = event->tstamp_stopped; 764 run_end = event->tstamp_stopped;
@@ -278,9 +768,10 @@ static void update_event_times(struct perf_event *event)
278 if (event->state == PERF_EVENT_STATE_INACTIVE) 768 if (event->state == PERF_EVENT_STATE_INACTIVE)
279 run_end = event->tstamp_stopped; 769 run_end = event->tstamp_stopped;
280 else 770 else
281 run_end = ctx->time; 771 run_end = perf_event_time(event);
282 772
283 event->total_time_running = run_end - event->tstamp_running; 773 event->total_time_running = run_end - event->tstamp_running;
774
284} 775}
285 776
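/*
 * Worked example of the distinction drawn above: an event is enabled for
 * 10ms of wall-clock time, but tasks of the monitored cgroup run on the
 * event's CPU for only 3ms of that window. A non-cgroup event reports
 * time_enabled = 10ms (ctx->time based); the cgroup event reports
 * time_enabled = 3ms, because perf_event_time() routes it through
 * perf_cgroup_event_time(), which only accrues while the cgroup is active
 * on that CPU (numbers are illustrative only).
 */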
286/* 777/*
@@ -329,6 +820,9 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
329 list_add_tail(&event->group_entry, list); 820 list_add_tail(&event->group_entry, list);
330 } 821 }
331 822
823 if (is_cgroup_event(event))
824 ctx->nr_cgroups++;
825
332 list_add_rcu(&event->event_entry, &ctx->event_list); 826 list_add_rcu(&event->event_entry, &ctx->event_list);
333 if (!ctx->nr_events) 827 if (!ctx->nr_events)
334 perf_pmu_rotate_start(ctx->pmu); 828 perf_pmu_rotate_start(ctx->pmu);
@@ -455,6 +949,9 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
455 949
456 event->attach_state &= ~PERF_ATTACH_CONTEXT; 950 event->attach_state &= ~PERF_ATTACH_CONTEXT;
457 951
952 if (is_cgroup_event(event))
953 ctx->nr_cgroups--;
954
458 ctx->nr_events--; 955 ctx->nr_events--;
459 if (event->attr.inherit_stat) 956 if (event->attr.inherit_stat)
460 ctx->nr_stat--; 957 ctx->nr_stat--;
@@ -526,7 +1023,8 @@ out:
526static inline int 1023static inline int
527event_filter_match(struct perf_event *event) 1024event_filter_match(struct perf_event *event)
528{ 1025{
529 return event->cpu == -1 || event->cpu == smp_processor_id(); 1026 return (event->cpu == -1 || event->cpu == smp_processor_id())
1027 && perf_cgroup_match(event);
530} 1028}
531 1029
532static void 1030static void
@@ -534,6 +1032,7 @@ event_sched_out(struct perf_event *event,
534 struct perf_cpu_context *cpuctx, 1032 struct perf_cpu_context *cpuctx,
535 struct perf_event_context *ctx) 1033 struct perf_event_context *ctx)
536{ 1034{
1035 u64 tstamp = perf_event_time(event);
537 u64 delta; 1036 u64 delta;
538 /* 1037 /*
539 * An event which could not be activated because of 1038 * An event which could not be activated because of
@@ -543,9 +1042,9 @@ event_sched_out(struct perf_event *event,
543 */ 1042 */
544 if (event->state == PERF_EVENT_STATE_INACTIVE 1043 if (event->state == PERF_EVENT_STATE_INACTIVE
545 && !event_filter_match(event)) { 1044 && !event_filter_match(event)) {
546 delta = ctx->time - event->tstamp_stopped; 1045 delta = tstamp - event->tstamp_stopped;
547 event->tstamp_running += delta; 1046 event->tstamp_running += delta;
548 event->tstamp_stopped = ctx->time; 1047 event->tstamp_stopped = tstamp;
549 } 1048 }
550 1049
551 if (event->state != PERF_EVENT_STATE_ACTIVE) 1050 if (event->state != PERF_EVENT_STATE_ACTIVE)
@@ -556,7 +1055,7 @@ event_sched_out(struct perf_event *event,
556 event->pending_disable = 0; 1055 event->pending_disable = 0;
557 event->state = PERF_EVENT_STATE_OFF; 1056 event->state = PERF_EVENT_STATE_OFF;
558 } 1057 }
559 event->tstamp_stopped = ctx->time; 1058 event->tstamp_stopped = tstamp;
560 event->pmu->del(event, 0); 1059 event->pmu->del(event, 0);
561 event->oncpu = -1; 1060 event->oncpu = -1;
562 1061
@@ -587,47 +1086,30 @@ group_sched_out(struct perf_event *group_event,
587 cpuctx->exclusive = 0; 1086 cpuctx->exclusive = 0;
588} 1087}
589 1088
590static inline struct perf_cpu_context *
591__get_cpu_context(struct perf_event_context *ctx)
592{
593 return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
594}
595
596/* 1089/*
597 * Cross CPU call to remove a performance event 1090 * Cross CPU call to remove a performance event
598 * 1091 *
599 * We disable the event on the hardware level first. After that we 1092 * We disable the event on the hardware level first. After that we
600 * remove it from the context list. 1093 * remove it from the context list.
601 */ 1094 */
602static void __perf_event_remove_from_context(void *info) 1095static int __perf_remove_from_context(void *info)
603{ 1096{
604 struct perf_event *event = info; 1097 struct perf_event *event = info;
605 struct perf_event_context *ctx = event->ctx; 1098 struct perf_event_context *ctx = event->ctx;
606 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 1099 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
607 1100
608 /*
609 * If this is a task context, we need to check whether it is
610 * the current task context of this cpu. If not it has been
611 * scheduled out before the smp call arrived.
612 */
613 if (ctx->task && cpuctx->task_ctx != ctx)
614 return;
615
616 raw_spin_lock(&ctx->lock); 1101 raw_spin_lock(&ctx->lock);
617
618 event_sched_out(event, cpuctx, ctx); 1102 event_sched_out(event, cpuctx, ctx);
619
620 list_del_event(event, ctx); 1103 list_del_event(event, ctx);
621
622 raw_spin_unlock(&ctx->lock); 1104 raw_spin_unlock(&ctx->lock);
1105
1106 return 0;
623} 1107}
624 1108
625 1109
626/* 1110/*
627 * Remove the event from a task's (or a CPU's) list of events. 1111 * Remove the event from a task's (or a CPU's) list of events.
628 * 1112 *
629 * Must be called with ctx->mutex held.
630 *
631 * CPU events are removed with a smp call. For task events we only 1113 * CPU events are removed with a smp call. For task events we only
632 * call when the task is on a CPU. 1114 * call when the task is on a CPU.
633 * 1115 *
@@ -638,49 +1120,48 @@ static void __perf_event_remove_from_context(void *info)
638 * When called from perf_event_exit_task, it's OK because the 1120 * When called from perf_event_exit_task, it's OK because the
639 * context has been detached from its task. 1121 * context has been detached from its task.
640 */ 1122 */
641static void perf_event_remove_from_context(struct perf_event *event) 1123static void perf_remove_from_context(struct perf_event *event)
642{ 1124{
643 struct perf_event_context *ctx = event->ctx; 1125 struct perf_event_context *ctx = event->ctx;
644 struct task_struct *task = ctx->task; 1126 struct task_struct *task = ctx->task;
645 1127
1128 lockdep_assert_held(&ctx->mutex);
1129
646 if (!task) { 1130 if (!task) {
647 /* 1131 /*
648 * Per cpu events are removed via an smp call and 1132 * Per cpu events are removed via an smp call and
649 * the removal is always successful. 1133 * the removal is always successful.
650 */ 1134 */
651 smp_call_function_single(event->cpu, 1135 cpu_function_call(event->cpu, __perf_remove_from_context, event);
652 __perf_event_remove_from_context,
653 event, 1);
654 return; 1136 return;
655 } 1137 }
656 1138
657retry: 1139retry:
658 task_oncpu_function_call(task, __perf_event_remove_from_context, 1140 if (!task_function_call(task, __perf_remove_from_context, event))
659 event); 1141 return;
660 1142
661 raw_spin_lock_irq(&ctx->lock); 1143 raw_spin_lock_irq(&ctx->lock);
662 /* 1144 /*
663 * If the context is active we need to retry the smp call. 1145 * If we failed to find a running task, but find the context active now
1146 * that we've acquired the ctx->lock, retry.
664 */ 1147 */
665 if (ctx->nr_active && !list_empty(&event->group_entry)) { 1148 if (ctx->is_active) {
666 raw_spin_unlock_irq(&ctx->lock); 1149 raw_spin_unlock_irq(&ctx->lock);
667 goto retry; 1150 goto retry;
668 } 1151 }
669 1152
670 /* 1153 /*
671 * The lock prevents that this context is scheduled in so we 1154 * Since the task isn't running, it's safe to remove the event; our
672 * can remove the event safely, if the call above did not 1155 * holding the ctx->lock ensures the task won't get scheduled in.
673 * succeed.
674 */ 1156 */
675 if (!list_empty(&event->group_entry)) 1157 list_del_event(event, ctx);
676 list_del_event(event, ctx);
677 raw_spin_unlock_irq(&ctx->lock); 1158 raw_spin_unlock_irq(&ctx->lock);
678} 1159}
679 1160
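/*
 * The retry pattern used here is reused almost verbatim by
 * perf_install_in_context() and perf_event_enable() below; in outline
 * (per the callers, task_function_call() returns 0 when the cross call
 * actually ran on the task's CPU):
 *
 *	retry:
 *		if (!task_function_call(task, __func, event))
 *			return;			// done remotely, on the task's CPU
 *		raw_spin_lock_irq(&ctx->lock);
 *		if (ctx->is_active) {		// raced with a sched-in
 *			raw_spin_unlock_irq(&ctx->lock);
 *			goto retry;
 *		}
 *		// task is not running and ctx->lock keeps it from being
 *		// scheduled in, so the update can be done right here
 *		raw_spin_unlock_irq(&ctx->lock);
 */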
680/* 1161/*
681 * Cross CPU call to disable a performance event 1162 * Cross CPU call to disable a performance event
682 */ 1163 */
683static void __perf_event_disable(void *info) 1164static int __perf_event_disable(void *info)
684{ 1165{
685 struct perf_event *event = info; 1166 struct perf_event *event = info;
686 struct perf_event_context *ctx = event->ctx; 1167 struct perf_event_context *ctx = event->ctx;
@@ -689,9 +1170,12 @@ static void __perf_event_disable(void *info)
689 /* 1170 /*
690 * If this is a per-task event, need to check whether this 1171 * If this is a per-task event, need to check whether this
691 * event's task is the current task on this cpu. 1172 * event's task is the current task on this cpu.
1173 *
1174 * Can trigger due to concurrent perf_event_context_sched_out()
1175 * flipping contexts around.
692 */ 1176 */
693 if (ctx->task && cpuctx->task_ctx != ctx) 1177 if (ctx->task && cpuctx->task_ctx != ctx)
694 return; 1178 return -EINVAL;
695 1179
696 raw_spin_lock(&ctx->lock); 1180 raw_spin_lock(&ctx->lock);
697 1181
@@ -701,6 +1185,7 @@ static void __perf_event_disable(void *info)
701 */ 1185 */
702 if (event->state >= PERF_EVENT_STATE_INACTIVE) { 1186 if (event->state >= PERF_EVENT_STATE_INACTIVE) {
703 update_context_time(ctx); 1187 update_context_time(ctx);
1188 update_cgrp_time_from_event(event);
704 update_group_times(event); 1189 update_group_times(event);
705 if (event == event->group_leader) 1190 if (event == event->group_leader)
706 group_sched_out(event, cpuctx, ctx); 1191 group_sched_out(event, cpuctx, ctx);
@@ -710,6 +1195,8 @@ static void __perf_event_disable(void *info)
710 } 1195 }
711 1196
712 raw_spin_unlock(&ctx->lock); 1197 raw_spin_unlock(&ctx->lock);
1198
1199 return 0;
713} 1200}
714 1201
715/* 1202/*
@@ -734,13 +1221,13 @@ void perf_event_disable(struct perf_event *event)
734 /* 1221 /*
735 * Disable the event on the cpu that it's on 1222 * Disable the event on the cpu that it's on
736 */ 1223 */
737 smp_call_function_single(event->cpu, __perf_event_disable, 1224 cpu_function_call(event->cpu, __perf_event_disable, event);
738 event, 1);
739 return; 1225 return;
740 } 1226 }
741 1227
742retry: 1228retry:
743 task_oncpu_function_call(task, __perf_event_disable, event); 1229 if (!task_function_call(task, __perf_event_disable, event))
1230 return;
744 1231
745 raw_spin_lock_irq(&ctx->lock); 1232 raw_spin_lock_irq(&ctx->lock);
746 /* 1233 /*
@@ -748,6 +1235,11 @@ retry:
748 */ 1235 */
749 if (event->state == PERF_EVENT_STATE_ACTIVE) { 1236 if (event->state == PERF_EVENT_STATE_ACTIVE) {
750 raw_spin_unlock_irq(&ctx->lock); 1237 raw_spin_unlock_irq(&ctx->lock);
1238 /*
1239 * Reload the task pointer, it might have been changed by
1240 * a concurrent perf_event_context_sched_out().
1241 */
1242 task = ctx->task;
751 goto retry; 1243 goto retry;
752 } 1244 }
753 1245
@@ -759,20 +1251,71 @@ retry:
759 update_group_times(event); 1251 update_group_times(event);
760 event->state = PERF_EVENT_STATE_OFF; 1252 event->state = PERF_EVENT_STATE_OFF;
761 } 1253 }
762
763 raw_spin_unlock_irq(&ctx->lock); 1254 raw_spin_unlock_irq(&ctx->lock);
764} 1255}
765 1256
1257static void perf_set_shadow_time(struct perf_event *event,
1258 struct perf_event_context *ctx,
1259 u64 tstamp)
1260{
1261 /*
1262 * use the correct time source for the time snapshot
1263 *
1264 * We could get by without this by leveraging the
1265 * fact that to get to this function, the caller
1266 * has most likely already called update_context_time()
1267 * and update_cgrp_time_xx() and thus both timestamp
1268 * are identical (or very close). Given that tstamp is,
1269 * already adjusted for cgroup, we could say that:
1270 * tstamp - ctx->timestamp
1271 * is equivalent to
1272 * tstamp - cgrp->timestamp.
1273 *
1274 * Then, in perf_output_read(), the calculation would
1275 * work with no changes because:
1276 * - event is guaranteed scheduled in
1277 * - no scheduled out in between
1278 * - thus the timestamp would be the same
1279 *
1280 * But this is a bit hairy.
1281 *
1282 * So instead, we have an explicit cgroup call to remain
1283 * within the same time source all along. We believe it
1284 * is cleaner and simpler to understand.
1285 */
1286 if (is_cgroup_event(event))
1287 perf_cgroup_set_shadow_time(event, tstamp);
1288 else
1289 event->shadow_ctx_time = tstamp - ctx->timestamp;
1290}
1291
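/*
 * Put differently: the shadow is always taken against the clock that will
 * later be used to read the event, ctx->timestamp for plain events and the
 * per-cpu cgroup timestamp for cgroup events, so the reconstruction done in
 * perf_output_read() stays within a single time source, exactly as the
 * comment above argues. E.g. with tstamp = 150us and a reference timestamp
 * of 100us (illustrative values), shadow_ctx_time = 50us in either domain.
 */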
1292#define MAX_INTERRUPTS (~0ULL)
1293
1294static void perf_log_throttle(struct perf_event *event, int enable);
1295
766static int 1296static int
767event_sched_in(struct perf_event *event, 1297event_sched_in(struct perf_event *event,
768 struct perf_cpu_context *cpuctx, 1298 struct perf_cpu_context *cpuctx,
769 struct perf_event_context *ctx) 1299 struct perf_event_context *ctx)
770{ 1300{
1301 u64 tstamp = perf_event_time(event);
1302
771 if (event->state <= PERF_EVENT_STATE_OFF) 1303 if (event->state <= PERF_EVENT_STATE_OFF)
772 return 0; 1304 return 0;
773 1305
774 event->state = PERF_EVENT_STATE_ACTIVE; 1306 event->state = PERF_EVENT_STATE_ACTIVE;
775 event->oncpu = smp_processor_id(); 1307 event->oncpu = smp_processor_id();
1308
1309 /*
1310 * Unthrottle events: since we were just scheduled in, we might have
1311 * missed several ticks already; also, for a heavily scheduling task
1312 * there is little guarantee it'll get a tick in a timely manner.
1313 */
1314 if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
1315 perf_log_throttle(event, 1);
1316 event->hw.interrupts = 0;
1317 }
1318
776 /* 1319 /*
777 * The new state must be visible before we turn it on in the hardware: 1320 * The new state must be visible before we turn it on in the hardware:
778 */ 1321 */
@@ -784,9 +1327,9 @@ event_sched_in(struct perf_event *event,
784 return -EAGAIN; 1327 return -EAGAIN;
785 } 1328 }
786 1329
787 event->tstamp_running += ctx->time - event->tstamp_stopped; 1330 event->tstamp_running += tstamp - event->tstamp_stopped;
788 1331
789 event->shadow_ctx_time = ctx->time - ctx->timestamp; 1332 perf_set_shadow_time(event, ctx, tstamp);
790 1333
791 if (!is_software_event(event)) 1334 if (!is_software_event(event))
792 cpuctx->active_oncpu++; 1335 cpuctx->active_oncpu++;
@@ -898,19 +1441,24 @@ static int group_can_go_on(struct perf_event *event,
898static void add_event_to_ctx(struct perf_event *event, 1441static void add_event_to_ctx(struct perf_event *event,
899 struct perf_event_context *ctx) 1442 struct perf_event_context *ctx)
900{ 1443{
1444 u64 tstamp = perf_event_time(event);
1445
901 list_add_event(event, ctx); 1446 list_add_event(event, ctx);
902 perf_group_attach(event); 1447 perf_group_attach(event);
903 event->tstamp_enabled = ctx->time; 1448 event->tstamp_enabled = tstamp;
904 event->tstamp_running = ctx->time; 1449 event->tstamp_running = tstamp;
905 event->tstamp_stopped = ctx->time; 1450 event->tstamp_stopped = tstamp;
906} 1451}
907 1452
1453static void perf_event_context_sched_in(struct perf_event_context *ctx,
1454 struct task_struct *tsk);
1455
908/* 1456/*
909 * Cross CPU call to install and enable a performance event 1457 * Cross CPU call to install and enable a performance event
910 * 1458 *
911 * Must be called with ctx->mutex held 1459 * Must be called with ctx->mutex held
912 */ 1460 */
913static void __perf_install_in_context(void *info) 1461static int __perf_install_in_context(void *info)
914{ 1462{
915 struct perf_event *event = info; 1463 struct perf_event *event = info;
916 struct perf_event_context *ctx = event->ctx; 1464 struct perf_event_context *ctx = event->ctx;
@@ -919,25 +1467,26 @@ static void __perf_install_in_context(void *info)
919 int err; 1467 int err;
920 1468
921 /* 1469 /*
922 * If this is a task context, we need to check whether it is 1470 * In case we're installing a new context to an already running task;
923 * the current task context of this cpu. If not it has been 1471 * this could also happen before perf_event_task_sched_in() on architectures
924 * scheduled out before the smp call arrived. 1472 * which do context switches with IRQs enabled.
925 * Or possibly this is the right context but it isn't
926 * on this cpu because it had no events.
927 */ 1473 */
928 if (ctx->task && cpuctx->task_ctx != ctx) { 1474 if (ctx->task && !cpuctx->task_ctx)
929 if (cpuctx->task_ctx || ctx->task != current) 1475 perf_event_context_sched_in(ctx, ctx->task);
930 return;
931 cpuctx->task_ctx = ctx;
932 }
933 1476
934 raw_spin_lock(&ctx->lock); 1477 raw_spin_lock(&ctx->lock);
935 ctx->is_active = 1; 1478 ctx->is_active = 1;
936 update_context_time(ctx); 1479 update_context_time(ctx);
1480 /*
1481 * update cgrp time only if current cgrp
1482 * matches event->cgrp. Must be done before
1483 * calling add_event_to_ctx()
1484 */
1485 update_cgrp_time_from_event(event);
937 1486
938 add_event_to_ctx(event, ctx); 1487 add_event_to_ctx(event, ctx);
939 1488
940 if (event->cpu != -1 && event->cpu != smp_processor_id()) 1489 if (!event_filter_match(event))
941 goto unlock; 1490 goto unlock;
942 1491
943 /* 1492 /*
@@ -974,6 +1523,8 @@ static void __perf_install_in_context(void *info)
974 1523
975unlock: 1524unlock:
976 raw_spin_unlock(&ctx->lock); 1525 raw_spin_unlock(&ctx->lock);
1526
1527 return 0;
977} 1528}
978 1529
979/* 1530/*
@@ -985,8 +1536,6 @@ unlock:
985 * If the event is attached to a task which is on a CPU we use a smp 1536 * If the event is attached to a task which is on a CPU we use a smp
986 * call to enable it in the task context. The task might have been 1537 * call to enable it in the task context. The task might have been
987 * scheduled away, but we check this in the smp call again. 1538 * scheduled away, but we check this in the smp call again.
988 *
989 * Must be called with ctx->mutex held.
990 */ 1539 */
991static void 1540static void
992perf_install_in_context(struct perf_event_context *ctx, 1541perf_install_in_context(struct perf_event_context *ctx,
@@ -995,6 +1544,8 @@ perf_install_in_context(struct perf_event_context *ctx,
995{ 1544{
996 struct task_struct *task = ctx->task; 1545 struct task_struct *task = ctx->task;
997 1546
1547 lockdep_assert_held(&ctx->mutex);
1548
998 event->ctx = ctx; 1549 event->ctx = ctx;
999 1550
1000 if (!task) { 1551 if (!task) {
@@ -1002,31 +1553,29 @@ perf_install_in_context(struct perf_event_context *ctx,
1002 * Per cpu events are installed via an smp call and 1553 * Per cpu events are installed via an smp call and
1003 * the install is always successful. 1554 * the install is always successful.
1004 */ 1555 */
1005 smp_call_function_single(cpu, __perf_install_in_context, 1556 cpu_function_call(cpu, __perf_install_in_context, event);
1006 event, 1);
1007 return; 1557 return;
1008 } 1558 }
1009 1559
1010retry: 1560retry:
1011 task_oncpu_function_call(task, __perf_install_in_context, 1561 if (!task_function_call(task, __perf_install_in_context, event))
1012 event); 1562 return;
1013 1563
1014 raw_spin_lock_irq(&ctx->lock); 1564 raw_spin_lock_irq(&ctx->lock);
1015 /* 1565 /*
1016 * we need to retry the smp call. 1566 * If we failed to find a running task, but find the context active now
1567 * that we've acquired the ctx->lock, retry.
1017 */ 1568 */
1018 if (ctx->is_active && list_empty(&event->group_entry)) { 1569 if (ctx->is_active) {
1019 raw_spin_unlock_irq(&ctx->lock); 1570 raw_spin_unlock_irq(&ctx->lock);
1020 goto retry; 1571 goto retry;
1021 } 1572 }
1022 1573
1023 /* 1574 /*
1024 * The lock prevents that this context is scheduled in so we 1575 * Since the task isn't running, its safe to add the event, us holding
1025 * can add the event safely, if it the call above did not 1576 * the ctx->lock ensures the task won't get scheduled in.
1026 * succeed.
1027 */ 1577 */
1028 if (list_empty(&event->group_entry)) 1578 add_event_to_ctx(event, ctx);
1029 add_event_to_ctx(event, ctx);
1030 raw_spin_unlock_irq(&ctx->lock); 1579 raw_spin_unlock_irq(&ctx->lock);
1031} 1580}
1032 1581
@@ -1042,21 +1591,20 @@ static void __perf_event_mark_enabled(struct perf_event *event,
1042 struct perf_event_context *ctx) 1591 struct perf_event_context *ctx)
1043{ 1592{
1044 struct perf_event *sub; 1593 struct perf_event *sub;
1594 u64 tstamp = perf_event_time(event);
1045 1595
1046 event->state = PERF_EVENT_STATE_INACTIVE; 1596 event->state = PERF_EVENT_STATE_INACTIVE;
1047 event->tstamp_enabled = ctx->time - event->total_time_enabled; 1597 event->tstamp_enabled = tstamp - event->total_time_enabled;
1048 list_for_each_entry(sub, &event->sibling_list, group_entry) { 1598 list_for_each_entry(sub, &event->sibling_list, group_entry) {
1049 if (sub->state >= PERF_EVENT_STATE_INACTIVE) { 1599 if (sub->state >= PERF_EVENT_STATE_INACTIVE)
1050 sub->tstamp_enabled = 1600 sub->tstamp_enabled = tstamp - sub->total_time_enabled;
1051 ctx->time - sub->total_time_enabled;
1052 }
1053 } 1601 }
1054} 1602}
1055 1603
1056/* 1604/*
1057 * Cross CPU call to enable a performance event 1605 * Cross CPU call to enable a performance event
1058 */ 1606 */
1059static void __perf_event_enable(void *info) 1607static int __perf_event_enable(void *info)
1060{ 1608{
1061 struct perf_event *event = info; 1609 struct perf_event *event = info;
1062 struct perf_event_context *ctx = event->ctx; 1610 struct perf_event_context *ctx = event->ctx;
@@ -1064,26 +1612,27 @@ static void __perf_event_enable(void *info)
1064 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 1612 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1065 int err; 1613 int err;
1066 1614
1067 /* 1615 if (WARN_ON_ONCE(!ctx->is_active))
1068 * If this is a per-task event, need to check whether this 1616 return -EINVAL;
1069 * event's task is the current task on this cpu.
1070 */
1071 if (ctx->task && cpuctx->task_ctx != ctx) {
1072 if (cpuctx->task_ctx || ctx->task != current)
1073 return;
1074 cpuctx->task_ctx = ctx;
1075 }
1076 1617
1077 raw_spin_lock(&ctx->lock); 1618 raw_spin_lock(&ctx->lock);
1078 ctx->is_active = 1;
1079 update_context_time(ctx); 1619 update_context_time(ctx);
1080 1620
1081 if (event->state >= PERF_EVENT_STATE_INACTIVE) 1621 if (event->state >= PERF_EVENT_STATE_INACTIVE)
1082 goto unlock; 1622 goto unlock;
1623
1624 /*
1625 * set current task's cgroup time reference point
1626 */
1627 perf_cgroup_set_timestamp(current, ctx);
1628
1083 __perf_event_mark_enabled(event, ctx); 1629 __perf_event_mark_enabled(event, ctx);
1084 1630
1085 if (event->cpu != -1 && event->cpu != smp_processor_id()) 1631 if (!event_filter_match(event)) {
1632 if (is_cgroup_event(event))
1633 perf_cgroup_defer_enabled(event);
1086 goto unlock; 1634 goto unlock;
1635 }
1087 1636
1088 /* 1637 /*
1089 * If the event is in a group and isn't the group leader, 1638 * If the event is in a group and isn't the group leader,
@@ -1116,6 +1665,8 @@ static void __perf_event_enable(void *info)
1116 1665
1117unlock: 1666unlock:
1118 raw_spin_unlock(&ctx->lock); 1667 raw_spin_unlock(&ctx->lock);
1668
1669 return 0;
1119} 1670}
1120 1671
1121/* 1672/*
@@ -1136,8 +1687,7 @@ void perf_event_enable(struct perf_event *event)
1136 /* 1687 /*
1137 * Enable the event on the cpu that it's on 1688 * Enable the event on the cpu that it's on
1138 */ 1689 */
1139 smp_call_function_single(event->cpu, __perf_event_enable, 1690 cpu_function_call(event->cpu, __perf_event_enable, event);
1140 event, 1);
1141 return; 1691 return;
1142 } 1692 }
1143 1693
@@ -1156,8 +1706,15 @@ void perf_event_enable(struct perf_event *event)
1156 event->state = PERF_EVENT_STATE_OFF; 1706 event->state = PERF_EVENT_STATE_OFF;
1157 1707
1158retry: 1708retry:
1709 if (!ctx->is_active) {
1710 __perf_event_mark_enabled(event, ctx);
1711 goto out;
1712 }
1713
1159 raw_spin_unlock_irq(&ctx->lock); 1714 raw_spin_unlock_irq(&ctx->lock);
1160 task_oncpu_function_call(task, __perf_event_enable, event); 1715
1716 if (!task_function_call(task, __perf_event_enable, event))
1717 return;
1161 1718
1162 raw_spin_lock_irq(&ctx->lock); 1719 raw_spin_lock_irq(&ctx->lock);
1163 1720
@@ -1165,15 +1722,14 @@ retry:
1165 * If the context is active and the event is still off, 1722 * If the context is active and the event is still off,
1166 * we need to retry the cross-call. 1723 * we need to retry the cross-call.
1167 */ 1724 */
1168 if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) 1725 if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) {
1726 /*
1727 * task could have been flipped by a concurrent
1728 * perf_event_context_sched_out()
1729 */
1730 task = ctx->task;
1169 goto retry; 1731 goto retry;
1170 1732 }
1171 /*
1172 * Since we have the lock this context can't be scheduled
1173 * in, so we can change the state safely.
1174 */
1175 if (event->state == PERF_EVENT_STATE_OFF)
1176 __perf_event_mark_enabled(event, ctx);
1177 1733
1178out: 1734out:
1179 raw_spin_unlock_irq(&ctx->lock); 1735 raw_spin_unlock_irq(&ctx->lock);
@@ -1193,12 +1749,6 @@ static int perf_event_refresh(struct perf_event *event, int refresh)
1193 return 0; 1749 return 0;
1194} 1750}
1195 1751
1196enum event_type_t {
1197 EVENT_FLEXIBLE = 0x1,
1198 EVENT_PINNED = 0x2,
1199 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
1200};
1201
1202static void ctx_sched_out(struct perf_event_context *ctx, 1752static void ctx_sched_out(struct perf_event_context *ctx,
1203 struct perf_cpu_context *cpuctx, 1753 struct perf_cpu_context *cpuctx,
1204 enum event_type_t event_type) 1754 enum event_type_t event_type)
@@ -1211,6 +1761,7 @@ static void ctx_sched_out(struct perf_event_context *ctx,
1211 if (likely(!ctx->nr_events)) 1761 if (likely(!ctx->nr_events))
1212 goto out; 1762 goto out;
1213 update_context_time(ctx); 1763 update_context_time(ctx);
1764 update_cgrp_time_from_cpuctx(cpuctx);
1214 1765
1215 if (!ctx->nr_active) 1766 if (!ctx->nr_active)
1216 goto out; 1767 goto out;
@@ -1323,8 +1874,8 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
1323 } 1874 }
1324} 1875}
1325 1876
1326void perf_event_context_sched_out(struct task_struct *task, int ctxn, 1877static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
1327 struct task_struct *next) 1878 struct task_struct *next)
1328{ 1879{
1329 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; 1880 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
1330 struct perf_event_context *next_ctx; 1881 struct perf_event_context *next_ctx;
@@ -1400,6 +1951,14 @@ void __perf_event_task_sched_out(struct task_struct *task,
1400 1951
1401 for_each_task_context_nr(ctxn) 1952 for_each_task_context_nr(ctxn)
1402 perf_event_context_sched_out(task, ctxn, next); 1953 perf_event_context_sched_out(task, ctxn, next);
1954
1955 /*
1956 * if cgroup events exist on this CPU, then we need
1957 * to check if we have to switch out PMU state.
1958 * cgroup events are system-wide mode only
1959 */
1960 if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
1961 perf_cgroup_sched_out(task);
1403} 1962}
1404 1963
1405static void task_ctx_sched_out(struct perf_event_context *ctx, 1964static void task_ctx_sched_out(struct perf_event_context *ctx,
@@ -1435,9 +1994,13 @@ ctx_pinned_sched_in(struct perf_event_context *ctx,
1435 list_for_each_entry(event, &ctx->pinned_groups, group_entry) { 1994 list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
1436 if (event->state <= PERF_EVENT_STATE_OFF) 1995 if (event->state <= PERF_EVENT_STATE_OFF)
1437 continue; 1996 continue;
1438 if (event->cpu != -1 && event->cpu != smp_processor_id()) 1997 if (!event_filter_match(event))
1439 continue; 1998 continue;
1440 1999
2000 /* may need to reset tstamp_enabled */
2001 if (is_cgroup_event(event))
2002 perf_cgroup_mark_enabled(event, ctx);
2003
1441 if (group_can_go_on(event, cpuctx, 1)) 2004 if (group_can_go_on(event, cpuctx, 1))
1442 group_sched_in(event, cpuctx, ctx); 2005 group_sched_in(event, cpuctx, ctx);
1443 2006
@@ -1467,9 +2030,13 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
1467 * Listen to the 'cpu' scheduling filter constraint 2030 * Listen to the 'cpu' scheduling filter constraint
1468 * of events: 2031 * of events:
1469 */ 2032 */
1470 if (event->cpu != -1 && event->cpu != smp_processor_id()) 2033 if (!event_filter_match(event))
1471 continue; 2034 continue;
1472 2035
2036 /* may need to reset tstamp_enabled */
2037 if (is_cgroup_event(event))
2038 perf_cgroup_mark_enabled(event, ctx);
2039
1473 if (group_can_go_on(event, cpuctx, can_add_hw)) { 2040 if (group_can_go_on(event, cpuctx, can_add_hw)) {
1474 if (group_sched_in(event, cpuctx, ctx)) 2041 if (group_sched_in(event, cpuctx, ctx))
1475 can_add_hw = 0; 2042 can_add_hw = 0;
@@ -1480,15 +2047,19 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
1480static void 2047static void
1481ctx_sched_in(struct perf_event_context *ctx, 2048ctx_sched_in(struct perf_event_context *ctx,
1482 struct perf_cpu_context *cpuctx, 2049 struct perf_cpu_context *cpuctx,
1483 enum event_type_t event_type) 2050 enum event_type_t event_type,
2051 struct task_struct *task)
1484{ 2052{
2053 u64 now;
2054
1485 raw_spin_lock(&ctx->lock); 2055 raw_spin_lock(&ctx->lock);
1486 ctx->is_active = 1; 2056 ctx->is_active = 1;
1487 if (likely(!ctx->nr_events)) 2057 if (likely(!ctx->nr_events))
1488 goto out; 2058 goto out;
1489 2059
1490 ctx->timestamp = perf_clock(); 2060 now = perf_clock();
1491 2061 ctx->timestamp = now;
2062 perf_cgroup_set_timestamp(task, ctx);
1492 /* 2063 /*
1493 * First go through the list and put on any pinned groups 2064 * First go through the list and put on any pinned groups
1494 * in order to give them the best chance of going on. 2065 * in order to give them the best chance of going on.
@@ -1505,11 +2076,12 @@ out:
1505} 2076}
1506 2077
1507static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, 2078static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
1508 enum event_type_t event_type) 2079 enum event_type_t event_type,
2080 struct task_struct *task)
1509{ 2081{
1510 struct perf_event_context *ctx = &cpuctx->ctx; 2082 struct perf_event_context *ctx = &cpuctx->ctx;
1511 2083
1512 ctx_sched_in(ctx, cpuctx, event_type); 2084 ctx_sched_in(ctx, cpuctx, event_type, task);
1513} 2085}
1514 2086
1515static void task_ctx_sched_in(struct perf_event_context *ctx, 2087static void task_ctx_sched_in(struct perf_event_context *ctx,
@@ -1517,15 +2089,16 @@ static void task_ctx_sched_in(struct perf_event_context *ctx,
1517{ 2089{
1518 struct perf_cpu_context *cpuctx; 2090 struct perf_cpu_context *cpuctx;
1519 2091
1520 cpuctx = __get_cpu_context(ctx); 2092 cpuctx = __get_cpu_context(ctx);
1521 if (cpuctx->task_ctx == ctx) 2093 if (cpuctx->task_ctx == ctx)
1522 return; 2094 return;
1523 2095
1524 ctx_sched_in(ctx, cpuctx, event_type); 2096 ctx_sched_in(ctx, cpuctx, event_type, NULL);
1525 cpuctx->task_ctx = ctx; 2097 cpuctx->task_ctx = ctx;
1526} 2098}
1527 2099
1528void perf_event_context_sched_in(struct perf_event_context *ctx) 2100static void perf_event_context_sched_in(struct perf_event_context *ctx,
2101 struct task_struct *task)
1529{ 2102{
1530 struct perf_cpu_context *cpuctx; 2103 struct perf_cpu_context *cpuctx;
1531 2104
@@ -1541,9 +2114,9 @@ void perf_event_context_sched_in(struct perf_event_context *ctx)
1541 */ 2114 */
1542 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 2115 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
1543 2116
1544 ctx_sched_in(ctx, cpuctx, EVENT_PINNED); 2117 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
1545 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); 2118 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
1546 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE); 2119 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
1547 2120
1548 cpuctx->task_ctx = ctx; 2121 cpuctx->task_ctx = ctx;
1549 2122
@@ -1576,14 +2149,17 @@ void __perf_event_task_sched_in(struct task_struct *task)
1576 if (likely(!ctx)) 2149 if (likely(!ctx))
1577 continue; 2150 continue;
1578 2151
1579 perf_event_context_sched_in(ctx); 2152 perf_event_context_sched_in(ctx, task);
1580 } 2153 }
2154 /*
2155 * if cgroup events exist on this CPU, then we need
2156 * to check if we have to switch in PMU state.
2157 * cgroup events are system-wide mode only
2158 */
2159 if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
2160 perf_cgroup_sched_in(task);
1581} 2161}
1582 2162
1583#define MAX_INTERRUPTS (~0ULL)
1584
1585static void perf_log_throttle(struct perf_event *event, int enable);
1586
1587static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) 2163static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
1588{ 2164{
1589 u64 frequency = event->attr.sample_freq; 2165 u64 frequency = event->attr.sample_freq;
@@ -1611,7 +2187,7 @@ static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
1611 * Reduce accuracy by one bit such that @a and @b converge 2187 * Reduce accuracy by one bit such that @a and @b converge
1612 * to a similar magnitude. 2188 * to a similar magnitude.
1613 */ 2189 */
1614#define REDUCE_FLS(a, b) \ 2190#define REDUCE_FLS(a, b) \
1615do { \ 2191do { \
1616 if (a##_fls > b##_fls) { \ 2192 if (a##_fls > b##_fls) { \
1617 a >>= 1; \ 2193 a >>= 1; \
@@ -1694,7 +2270,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
1694 if (event->state != PERF_EVENT_STATE_ACTIVE) 2270 if (event->state != PERF_EVENT_STATE_ACTIVE)
1695 continue; 2271 continue;
1696 2272
1697 if (event->cpu != -1 && event->cpu != smp_processor_id()) 2273 if (!event_filter_match(event))
1698 continue; 2274 continue;
1699 2275
1700 hwc = &event->hw; 2276 hwc = &event->hw;
@@ -1781,7 +2357,7 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
1781 if (ctx) 2357 if (ctx)
1782 rotate_ctx(ctx); 2358 rotate_ctx(ctx);
1783 2359
1784 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); 2360 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, current);
1785 if (ctx) 2361 if (ctx)
1786 task_ctx_sched_in(ctx, EVENT_FLEXIBLE); 2362 task_ctx_sched_in(ctx, EVENT_FLEXIBLE);
1787 2363
@@ -1860,7 +2436,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
1860 2436
1861 raw_spin_unlock(&ctx->lock); 2437 raw_spin_unlock(&ctx->lock);
1862 2438
1863 perf_event_context_sched_in(ctx); 2439 perf_event_context_sched_in(ctx, ctx->task);
1864out: 2440out:
1865 local_irq_restore(flags); 2441 local_irq_restore(flags);
1866} 2442}
@@ -1885,11 +2461,14 @@ static void __perf_event_read(void *info)
1885 return; 2461 return;
1886 2462
1887 raw_spin_lock(&ctx->lock); 2463 raw_spin_lock(&ctx->lock);
1888 update_context_time(ctx); 2464 if (ctx->is_active) {
2465 update_context_time(ctx);
2466 update_cgrp_time_from_event(event);
2467 }
1889 update_event_times(event); 2468 update_event_times(event);
2469 if (event->state == PERF_EVENT_STATE_ACTIVE)
2470 event->pmu->read(event);
1890 raw_spin_unlock(&ctx->lock); 2471 raw_spin_unlock(&ctx->lock);
1891
1892 event->pmu->read(event);
1893} 2472}
1894 2473
1895static inline u64 perf_event_count(struct perf_event *event) 2474static inline u64 perf_event_count(struct perf_event *event)
@@ -1916,8 +2495,10 @@ static u64 perf_event_read(struct perf_event *event)
1916 * (e.g., thread is blocked), in that case 2495 * (e.g., thread is blocked), in that case
1917 * we cannot update context time 2496 * we cannot update context time
1918 */ 2497 */
1919 if (ctx->is_active) 2498 if (ctx->is_active) {
1920 update_context_time(ctx); 2499 update_context_time(ctx);
2500 update_cgrp_time_from_event(event);
2501 }
1921 update_event_times(event); 2502 update_event_times(event);
1922 raw_spin_unlock_irqrestore(&ctx->lock, flags); 2503 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1923 } 2504 }
@@ -1983,8 +2564,7 @@ static int alloc_callchain_buffers(void)
1983 * accessed from NMI. Use a temporary manual per cpu allocation 2564 * accessed from NMI. Use a temporary manual per cpu allocation
1984 * until that gets sorted out. 2565 * until that gets sorted out.
1985 */ 2566 */
1986 size = sizeof(*entries) + sizeof(struct perf_callchain_entry *) * 2567 size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]);
1987 num_possible_cpus();
1988 2568
1989 entries = kzalloc(size, GFP_KERNEL); 2569 entries = kzalloc(size, GFP_KERNEL);
1990 if (!entries) 2570 if (!entries)
@@ -2185,13 +2765,6 @@ find_lively_task_by_vpid(pid_t vpid)
2185 if (!task) 2765 if (!task)
2186 return ERR_PTR(-ESRCH); 2766 return ERR_PTR(-ESRCH);
2187 2767
2188 /*
2189 * Can't attach events to a dying task.
2190 */
2191 err = -ESRCH;
2192 if (task->flags & PF_EXITING)
2193 goto errout;
2194
2195 /* Reuse ptrace permission checks for now. */ 2768 /* Reuse ptrace permission checks for now. */
2196 err = -EACCES; 2769 err = -EACCES;
2197 if (!ptrace_may_access(task, PTRACE_MODE_READ)) 2770 if (!ptrace_may_access(task, PTRACE_MODE_READ))
@@ -2204,6 +2777,9 @@ errout:
2204 2777
2205} 2778}
2206 2779
2780/*
2781 * Returns a matching context with refcount and pincount.
2782 */
2207static struct perf_event_context * 2783static struct perf_event_context *
2208find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) 2784find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
2209{ 2785{
@@ -2212,14 +2788,11 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
2212 unsigned long flags; 2788 unsigned long flags;
2213 int ctxn, err; 2789 int ctxn, err;
2214 2790
2215 if (!task && cpu != -1) { 2791 if (!task) {
2216 /* Must be root to operate on a CPU event: */ 2792 /* Must be root to operate on a CPU event: */
2217 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) 2793 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
2218 return ERR_PTR(-EACCES); 2794 return ERR_PTR(-EACCES);
2219 2795
2220 if (cpu < 0 || cpu >= nr_cpumask_bits)
2221 return ERR_PTR(-EINVAL);
2222
2223 /* 2796 /*
2224 * We could be clever and allow to attach a event to an 2797 * We could be clever and allow to attach a event to an
2225 * offline CPU and activate it when the CPU comes up, but 2798 * offline CPU and activate it when the CPU comes up, but
@@ -2231,6 +2804,7 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
2231 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); 2804 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
2232 ctx = &cpuctx->ctx; 2805 ctx = &cpuctx->ctx;
2233 get_ctx(ctx); 2806 get_ctx(ctx);
2807 ++ctx->pin_count;
2234 2808
2235 return ctx; 2809 return ctx;
2236 } 2810 }
@@ -2244,6 +2818,7 @@ retry:
2244 ctx = perf_lock_task_context(task, ctxn, &flags); 2818 ctx = perf_lock_task_context(task, ctxn, &flags);
2245 if (ctx) { 2819 if (ctx) {
2246 unclone_ctx(ctx); 2820 unclone_ctx(ctx);
2821 ++ctx->pin_count;
2247 raw_spin_unlock_irqrestore(&ctx->lock, flags); 2822 raw_spin_unlock_irqrestore(&ctx->lock, flags);
2248 } 2823 }
2249 2824
@@ -2255,14 +2830,29 @@ retry:
2255 2830
2256 get_ctx(ctx); 2831 get_ctx(ctx);
2257 2832
2258 if (cmpxchg(&task->perf_event_ctxp[ctxn], NULL, ctx)) { 2833 err = 0;
2259 /* 2834 mutex_lock(&task->perf_event_mutex);
2260 * We raced with some other task; use 2835 /*
2261 * the context they set. 2836 * If it has already passed perf_event_exit_task(),
2262 */ 2837 * we must see PF_EXITING; it takes this mutex too.
2838 */
2839 if (task->flags & PF_EXITING)
2840 err = -ESRCH;
2841 else if (task->perf_event_ctxp[ctxn])
2842 err = -EAGAIN;
2843 else {
2844 ++ctx->pin_count;
2845 rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
2846 }
2847 mutex_unlock(&task->perf_event_mutex);
2848
2849 if (unlikely(err)) {
2263 put_task_struct(task); 2850 put_task_struct(task);
2264 kfree(ctx); 2851 kfree(ctx);
2265 goto retry; 2852
2853 if (err == -EAGAIN)
2854 goto retry;
2855 goto errout;
2266 } 2856 }
2267 } 2857 }
2268 2858
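/*
 * In outline, the cmpxchg()-based publication is replaced by a small
 * protocol under task->perf_event_mutex:
 *
 *	mutex_lock(&task->perf_event_mutex);
 *	if (task->flags & PF_EXITING)		err = -ESRCH;	// task is going away
 *	else if (task->perf_event_ctxp[ctxn])	err = -EAGAIN;	// someone else won
 *	else					rcu_assign_pointer(...) + pin
 *	mutex_unlock(&task->perf_event_mutex);
 *
 * -EAGAIN retries with the winner's context; -ESRCH propagates to the
 * caller, which lines up with the PF_EXITING check removed from
 * find_lively_task_by_vpid() earlier in this diff.
 */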
@@ -2293,7 +2883,7 @@ static void free_event(struct perf_event *event)
2293 2883
2294 if (!event->parent) { 2884 if (!event->parent) {
2295 if (event->attach_state & PERF_ATTACH_TASK) 2885 if (event->attach_state & PERF_ATTACH_TASK)
2296 jump_label_dec(&perf_task_events); 2886 jump_label_dec(&perf_sched_events);
2297 if (event->attr.mmap || event->attr.mmap_data) 2887 if (event->attr.mmap || event->attr.mmap_data)
2298 atomic_dec(&nr_mmap_events); 2888 atomic_dec(&nr_mmap_events);
2299 if (event->attr.comm) 2889 if (event->attr.comm)
@@ -2302,6 +2892,10 @@ static void free_event(struct perf_event *event)
2302 atomic_dec(&nr_task_events); 2892 atomic_dec(&nr_task_events);
2303 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) 2893 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
2304 put_callchain_buffers(); 2894 put_callchain_buffers();
2895 if (is_cgroup_event(event)) {
2896 atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
2897 jump_label_dec(&perf_sched_events);
2898 }
2305 } 2899 }
2306 2900
2307 if (event->buffer) { 2901 if (event->buffer) {
@@ -2309,6 +2903,9 @@ static void free_event(struct perf_event *event)
2309 event->buffer = NULL; 2903 event->buffer = NULL;
2310 } 2904 }
2311 2905
2906 if (is_cgroup_event(event))
2907 perf_detach_cgroup(event);
2908
2312 if (event->destroy) 2909 if (event->destroy)
2313 event->destroy(event); 2910 event->destroy(event);
2314 2911
@@ -3893,7 +4490,7 @@ static int perf_event_task_match(struct perf_event *event)
3893 if (event->state < PERF_EVENT_STATE_INACTIVE) 4490 if (event->state < PERF_EVENT_STATE_INACTIVE)
3894 return 0; 4491 return 0;
3895 4492
3896 if (event->cpu != -1 && event->cpu != smp_processor_id()) 4493 if (!event_filter_match(event))
3897 return 0; 4494 return 0;
3898 4495
3899 if (event->attr.comm || event->attr.mmap || 4496 if (event->attr.comm || event->attr.mmap ||
@@ -4030,7 +4627,7 @@ static int perf_event_comm_match(struct perf_event *event)
4030 if (event->state < PERF_EVENT_STATE_INACTIVE) 4627 if (event->state < PERF_EVENT_STATE_INACTIVE)
4031 return 0; 4628 return 0;
4032 4629
4033 if (event->cpu != -1 && event->cpu != smp_processor_id()) 4630 if (!event_filter_match(event))
4034 return 0; 4631 return 0;
4035 4632
4036 if (event->attr.comm) 4633 if (event->attr.comm)
@@ -4178,7 +4775,7 @@ static int perf_event_mmap_match(struct perf_event *event,
4178 if (event->state < PERF_EVENT_STATE_INACTIVE) 4775 if (event->state < PERF_EVENT_STATE_INACTIVE)
4179 return 0; 4776 return 0;
4180 4777
4181 if (event->cpu != -1 && event->cpu != smp_processor_id()) 4778 if (!event_filter_match(event))
4182 return 0; 4779 return 0;
4183 4780
4184 if ((!executable && event->attr.mmap_data) || 4781 if ((!executable && event->attr.mmap_data) ||
@@ -4376,26 +4973,14 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
4376 if (unlikely(!is_sampling_event(event))) 4973 if (unlikely(!is_sampling_event(event)))
4377 return 0; 4974 return 0;
4378 4975
4379 if (!throttle) { 4976 if (unlikely(hwc->interrupts >= max_samples_per_tick)) {
4380 hwc->interrupts++; 4977 if (throttle) {
4381 } else { 4978 hwc->interrupts = MAX_INTERRUPTS;
4382 if (hwc->interrupts != MAX_INTERRUPTS) { 4979 perf_log_throttle(event, 0);
4383 hwc->interrupts++;
4384 if (HZ * hwc->interrupts >
4385 (u64)sysctl_perf_event_sample_rate) {
4386 hwc->interrupts = MAX_INTERRUPTS;
4387 perf_log_throttle(event, 0);
4388 ret = 1;
4389 }
4390 } else {
4391 /*
4392 * Keep re-disabling events even though on the previous
4393 * pass we disabled it - just in case we raced with a
4394 * sched-in and the event got enabled again:
4395 */
4396 ret = 1; 4980 ret = 1;
4397 } 4981 }
4398 } 4982 } else
4983 hwc->interrupts++;
4399 4984
4400 if (event->attr.freq) { 4985 if (event->attr.freq) {
4401 u64 now = perf_clock(); 4986 u64 now = perf_clock();
@@ -4648,7 +5233,7 @@ int perf_swevent_get_recursion_context(void)
4648} 5233}
4649EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); 5234EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
4650 5235
4651void inline perf_swevent_put_recursion_context(int rctx) 5236inline void perf_swevent_put_recursion_context(int rctx)
4652{ 5237{
4653 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); 5238 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
4654 5239
@@ -5032,6 +5617,10 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
5032 u64 period; 5617 u64 period;
5033 5618
5034 event = container_of(hrtimer, struct perf_event, hw.hrtimer); 5619 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
5620
5621 if (event->state != PERF_EVENT_STATE_ACTIVE)
5622 return HRTIMER_NORESTART;
5623
5035 event->pmu->read(event); 5624 event->pmu->read(event);
5036 5625
5037 perf_sample_data_init(&data, 0); 5626 perf_sample_data_init(&data, 0);
@@ -5058,9 +5647,6 @@ static void perf_swevent_start_hrtimer(struct perf_event *event)
5058 if (!is_sampling_event(event)) 5647 if (!is_sampling_event(event))
5059 return; 5648 return;
5060 5649
5061 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
5062 hwc->hrtimer.function = perf_swevent_hrtimer;
5063
5064 period = local64_read(&hwc->period_left); 5650 period = local64_read(&hwc->period_left);
5065 if (period) { 5651 if (period) {
5066 if (period < 0) 5652 if (period < 0)
@@ -5087,6 +5673,30 @@ static void perf_swevent_cancel_hrtimer(struct perf_event *event)
5087 } 5673 }
5088} 5674}
5089 5675
5676static void perf_swevent_init_hrtimer(struct perf_event *event)
5677{
5678 struct hw_perf_event *hwc = &event->hw;
5679
5680 if (!is_sampling_event(event))
5681 return;
5682
5683 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
5684 hwc->hrtimer.function = perf_swevent_hrtimer;
5685
5686 /*
5687 * Since hrtimers have a fixed rate, we can do a static freq->period
5688 * mapping and avoid the whole period adjust feedback stuff.
5689 */
5690 if (event->attr.freq) {
5691 long freq = event->attr.sample_freq;
5692
5693 event->attr.sample_period = NSEC_PER_SEC / freq;
5694 hwc->sample_period = event->attr.sample_period;
5695 local64_set(&hwc->period_left, hwc->sample_period);
5696 event->attr.freq = 0;
5697 }
5698}
5699
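/*
 * The static freq->period mapping above is plain arithmetic; for example,
 * attr.sample_freq = 4000 (4kHz) yields
 *
 *	sample_period = NSEC_PER_SEC / 4000 = 250000ns = 250us
 *
 * which is exact for an hrtimer-backed software event because the timer
 * fires at a fixed rate, so the usual runtime period-adjust feedback can
 * be skipped.
 */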
5090/* 5700/*
5091 * Software event: cpu wall time clock 5701 * Software event: cpu wall time clock
5092 */ 5702 */
@@ -5139,6 +5749,8 @@ static int cpu_clock_event_init(struct perf_event *event)
5139 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) 5749 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
5140 return -ENOENT; 5750 return -ENOENT;
5141 5751
5752 perf_swevent_init_hrtimer(event);
5753
5142 return 0; 5754 return 0;
5143} 5755}
5144 5756
@@ -5194,16 +5806,9 @@ static void task_clock_event_del(struct perf_event *event, int flags)
5194 5806
5195static void task_clock_event_read(struct perf_event *event) 5807static void task_clock_event_read(struct perf_event *event)
5196{ 5808{
5197 u64 time; 5809 u64 now = perf_clock();
5198 5810 u64 delta = now - event->ctx->timestamp;
5199 if (!in_nmi()) { 5811 u64 time = event->ctx->time + delta;
5200 update_context_time(event->ctx);
5201 time = event->ctx->time;
5202 } else {
5203 u64 now = perf_clock();
5204 u64 delta = now - event->ctx->timestamp;
5205 time = event->ctx->time + delta;
5206 }
5207 5812
5208 task_clock_event_update(event, time); 5813 task_clock_event_update(event, time);
5209} 5814}
@@ -5216,6 +5821,8 @@ static int task_clock_event_init(struct perf_event *event)
5216 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) 5821 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
5217 return -ENOENT; 5822 return -ENOENT;
5218 5823
5824 perf_swevent_init_hrtimer(event);
5825
5219 return 0; 5826 return 0;
5220} 5827}
5221 5828
@@ -5361,6 +5968,8 @@ free_dev:
5361 goto out; 5968 goto out;
5362} 5969}
5363 5970
5971static struct lock_class_key cpuctx_mutex;
5972
5364int perf_pmu_register(struct pmu *pmu, char *name, int type) 5973int perf_pmu_register(struct pmu *pmu, char *name, int type)
5365{ 5974{
5366 int cpu, ret; 5975 int cpu, ret;
@@ -5409,6 +6018,7 @@ skip_type:
5409 6018
5410 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); 6019 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
5411 __perf_event_init_context(&cpuctx->ctx); 6020 __perf_event_init_context(&cpuctx->ctx);
6021 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
5412 cpuctx->ctx.type = cpu_context; 6022 cpuctx->ctx.type = cpu_context;
5413 cpuctx->ctx.pmu = pmu; 6023 cpuctx->ctx.pmu = pmu;
5414 cpuctx->jiffies_interval = 1; 6024 cpuctx->jiffies_interval = 1;
@@ -5484,17 +6094,22 @@ struct pmu *perf_init_event(struct perf_event *event)
5484{ 6094{
5485 struct pmu *pmu = NULL; 6095 struct pmu *pmu = NULL;
5486 int idx; 6096 int idx;
6097 int ret;
5487 6098
5488 idx = srcu_read_lock(&pmus_srcu); 6099 idx = srcu_read_lock(&pmus_srcu);
5489 6100
5490 rcu_read_lock(); 6101 rcu_read_lock();
5491 pmu = idr_find(&pmu_idr, event->attr.type); 6102 pmu = idr_find(&pmu_idr, event->attr.type);
5492 rcu_read_unlock(); 6103 rcu_read_unlock();
5493 if (pmu) 6104 if (pmu) {
6105 ret = pmu->event_init(event);
6106 if (ret)
6107 pmu = ERR_PTR(ret);
5494 goto unlock; 6108 goto unlock;
6109 }
5495 6110
5496 list_for_each_entry_rcu(pmu, &pmus, entry) { 6111 list_for_each_entry_rcu(pmu, &pmus, entry) {
5497 int ret = pmu->event_init(event); 6112 ret = pmu->event_init(event);
5498 if (!ret) 6113 if (!ret)
5499 goto unlock; 6114 goto unlock;
5500 6115
@@ -5525,6 +6140,11 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
5525 struct hw_perf_event *hwc; 6140 struct hw_perf_event *hwc;
5526 long err; 6141 long err;
5527 6142
6143 if ((unsigned)cpu >= nr_cpu_ids) {
6144 if (!task || cpu != -1)
6145 return ERR_PTR(-EINVAL);
6146 }
6147
5528 event = kzalloc(sizeof(*event), GFP_KERNEL); 6148 event = kzalloc(sizeof(*event), GFP_KERNEL);
5529 if (!event) 6149 if (!event)
5530 return ERR_PTR(-ENOMEM); 6150 return ERR_PTR(-ENOMEM);
@@ -5573,7 +6193,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
5573 6193
5574 if (!overflow_handler && parent_event) 6194 if (!overflow_handler && parent_event)
5575 overflow_handler = parent_event->overflow_handler; 6195 overflow_handler = parent_event->overflow_handler;
5576 6196
5577 event->overflow_handler = overflow_handler; 6197 event->overflow_handler = overflow_handler;
5578 6198
5579 if (attr->disabled) 6199 if (attr->disabled)
@@ -5615,7 +6235,7 @@ done:
5615 6235
5616 if (!event->parent) { 6236 if (!event->parent) {
5617 if (event->attach_state & PERF_ATTACH_TASK) 6237 if (event->attach_state & PERF_ATTACH_TASK)
5618 jump_label_inc(&perf_task_events); 6238 jump_label_inc(&perf_sched_events);
5619 if (event->attr.mmap || event->attr.mmap_data) 6239 if (event->attr.mmap || event->attr.mmap_data)
5620 atomic_inc(&nr_mmap_events); 6240 atomic_inc(&nr_mmap_events);
5621 if (event->attr.comm) 6241 if (event->attr.comm)
@@ -5790,7 +6410,7 @@ SYSCALL_DEFINE5(perf_event_open,
5790 int err; 6410 int err;
5791 6411
5792 /* for future expandability... */ 6412 /* for future expandability... */
5793 if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT)) 6413 if (flags & ~PERF_FLAG_ALL)
5794 return -EINVAL; 6414 return -EINVAL;
5795 6415
5796 err = perf_copy_attr(attr_uptr, &attr); 6416 err = perf_copy_attr(attr_uptr, &attr);
@@ -5807,6 +6427,15 @@ SYSCALL_DEFINE5(perf_event_open,
5807 return -EINVAL; 6427 return -EINVAL;
5808 } 6428 }
5809 6429
6430 /*
6431 * In cgroup mode, the pid argument is used to pass the fd
6432 * opened to the cgroup directory in cgroupfs. The cpu argument
6433 * designates the cpu on which to monitor threads from that
6434 * cgroup.
6435 */
6436 if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
6437 return -EINVAL;
6438
5810 event_fd = get_unused_fd_flags(O_RDWR); 6439 event_fd = get_unused_fd_flags(O_RDWR);
5811 if (event_fd < 0) 6440 if (event_fd < 0)
5812 return event_fd; 6441 return event_fd;
@@ -5824,7 +6453,7 @@ SYSCALL_DEFINE5(perf_event_open,
5824 group_leader = NULL; 6453 group_leader = NULL;
5825 } 6454 }
5826 6455
5827 if (pid != -1) { 6456 if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
5828 task = find_lively_task_by_vpid(pid); 6457 task = find_lively_task_by_vpid(pid);
5829 if (IS_ERR(task)) { 6458 if (IS_ERR(task)) {
5830 err = PTR_ERR(task); 6459 err = PTR_ERR(task);
@@ -5838,6 +6467,19 @@ SYSCALL_DEFINE5(perf_event_open,
5838 goto err_task; 6467 goto err_task;
5839 } 6468 }
5840 6469
6470 if (flags & PERF_FLAG_PID_CGROUP) {
6471 err = perf_cgroup_connect(pid, event, &attr, group_leader);
6472 if (err)
6473 goto err_alloc;
6474 /*
6475 * one more event:
6476 * - that has cgroup constraint on event->cpu
6477 * - that may need work on context switch
6478 */
6479 atomic_inc(&per_cpu(perf_cgroup_events, event->cpu));
6480 jump_label_inc(&perf_sched_events);
6481 }
6482
5841 /* 6483 /*
5842 * Special case software events and allow them to be part of 6484 * Special case software events and allow them to be part of
5843 * any hardware group. 6485 * any hardware group.
@@ -5923,10 +6565,10 @@ SYSCALL_DEFINE5(perf_event_open,
5923 struct perf_event_context *gctx = group_leader->ctx; 6565 struct perf_event_context *gctx = group_leader->ctx;
5924 6566
5925 mutex_lock(&gctx->mutex); 6567 mutex_lock(&gctx->mutex);
5926 perf_event_remove_from_context(group_leader); 6568 perf_remove_from_context(group_leader);
5927 list_for_each_entry(sibling, &group_leader->sibling_list, 6569 list_for_each_entry(sibling, &group_leader->sibling_list,
5928 group_entry) { 6570 group_entry) {
5929 perf_event_remove_from_context(sibling); 6571 perf_remove_from_context(sibling);
5930 put_ctx(gctx); 6572 put_ctx(gctx);
5931 } 6573 }
5932 mutex_unlock(&gctx->mutex); 6574 mutex_unlock(&gctx->mutex);
@@ -5949,6 +6591,7 @@ SYSCALL_DEFINE5(perf_event_open,
5949 6591
5950 perf_install_in_context(ctx, event, cpu); 6592 perf_install_in_context(ctx, event, cpu);
5951 ++ctx->generation; 6593 ++ctx->generation;
6594 perf_unpin_context(ctx);
5952 mutex_unlock(&ctx->mutex); 6595 mutex_unlock(&ctx->mutex);
5953 6596
5954 event->owner = current; 6597 event->owner = current;
@@ -5974,6 +6617,7 @@ SYSCALL_DEFINE5(perf_event_open,
5974 return event_fd; 6617 return event_fd;
5975 6618
5976err_context: 6619err_context:
6620 perf_unpin_context(ctx);
5977 put_ctx(ctx); 6621 put_ctx(ctx);
5978err_alloc: 6622err_alloc:
5979 free_event(event); 6623 free_event(event);
@@ -6024,6 +6668,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
6024 mutex_lock(&ctx->mutex); 6668 mutex_lock(&ctx->mutex);
6025 perf_install_in_context(ctx, event, cpu); 6669 perf_install_in_context(ctx, event, cpu);
6026 ++ctx->generation; 6670 ++ctx->generation;
6671 perf_unpin_context(ctx);
6027 mutex_unlock(&ctx->mutex); 6672 mutex_unlock(&ctx->mutex);
6028 6673
6029 return event; 6674 return event;
@@ -6077,7 +6722,7 @@ __perf_event_exit_task(struct perf_event *child_event,
6077{ 6722{
6078 struct perf_event *parent_event; 6723 struct perf_event *parent_event;
6079 6724
6080 perf_event_remove_from_context(child_event); 6725 perf_remove_from_context(child_event);
6081 6726
6082 parent_event = child_event->parent; 6727 parent_event = child_event->parent;
6083 /* 6728 /*
@@ -6109,7 +6754,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
6109 * scheduled, so we are now safe from rescheduling changing 6754 * scheduled, so we are now safe from rescheduling changing
6110 * our context. 6755 * our context.
6111 */ 6756 */
6112 child_ctx = child->perf_event_ctxp[ctxn]; 6757 child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]);
6113 task_ctx_sched_out(child_ctx, EVENT_ALL); 6758 task_ctx_sched_out(child_ctx, EVENT_ALL);
6114 6759
6115 /* 6760 /*
@@ -6384,7 +7029,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
6384 return 0; 7029 return 0;
6385 } 7030 }
6386 7031
6387 child_ctx = child->perf_event_ctxp[ctxn]; 7032 child_ctx = child->perf_event_ctxp[ctxn];
6388 if (!child_ctx) { 7033 if (!child_ctx) {
6389 /* 7034 /*
6390 * This is executed from the parent task context, so 7035 * This is executed from the parent task context, so
@@ -6422,11 +7067,6 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
6422 unsigned long flags; 7067 unsigned long flags;
6423 int ret = 0; 7068 int ret = 0;
6424 7069
6425 child->perf_event_ctxp[ctxn] = NULL;
6426
6427 mutex_init(&child->perf_event_mutex);
6428 INIT_LIST_HEAD(&child->perf_event_list);
6429
6430 if (likely(!parent->perf_event_ctxp[ctxn])) 7070 if (likely(!parent->perf_event_ctxp[ctxn]))
6431 return 0; 7071 return 0;
6432 7072
@@ -6478,7 +7118,6 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
6478 7118
6479 raw_spin_lock_irqsave(&parent_ctx->lock, flags); 7119 raw_spin_lock_irqsave(&parent_ctx->lock, flags);
6480 parent_ctx->rotate_disable = 0; 7120 parent_ctx->rotate_disable = 0;
6481 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
6482 7121
6483 child_ctx = child->perf_event_ctxp[ctxn]; 7122 child_ctx = child->perf_event_ctxp[ctxn];
6484 7123
@@ -6486,12 +7125,11 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
6486 /* 7125 /*
6487 * Mark the child context as a clone of the parent 7126 * Mark the child context as a clone of the parent
6488 * context, or of whatever the parent is a clone of. 7127 * context, or of whatever the parent is a clone of.
6489 * Note that if the parent is a clone, it could get 7128 *
6490 * uncloned at any point, but that doesn't matter 7129 * Note that if the parent is a clone, the holding of
6491 * because the list of events and the generation 7130 * parent_ctx->lock avoids it from being uncloned.
6492 * count can't have changed since we took the mutex.
6493 */ 7131 */
6494 cloned_ctx = rcu_dereference(parent_ctx->parent_ctx); 7132 cloned_ctx = parent_ctx->parent_ctx;
6495 if (cloned_ctx) { 7133 if (cloned_ctx) {
6496 child_ctx->parent_ctx = cloned_ctx; 7134 child_ctx->parent_ctx = cloned_ctx;
6497 child_ctx->parent_gen = parent_ctx->parent_gen; 7135 child_ctx->parent_gen = parent_ctx->parent_gen;
@@ -6502,9 +7140,11 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
6502 get_ctx(child_ctx->parent_ctx); 7140 get_ctx(child_ctx->parent_ctx);
6503 } 7141 }
6504 7142
7143 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
6505 mutex_unlock(&parent_ctx->mutex); 7144 mutex_unlock(&parent_ctx->mutex);
6506 7145
6507 perf_unpin_context(parent_ctx); 7146 perf_unpin_context(parent_ctx);
7147 put_ctx(parent_ctx);
6508 7148
6509 return ret; 7149 return ret;
6510} 7150}
@@ -6516,6 +7156,10 @@ int perf_event_init_task(struct task_struct *child)
6516{ 7156{
6517 int ctxn, ret; 7157 int ctxn, ret;
6518 7158
7159 memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
7160 mutex_init(&child->perf_event_mutex);
7161 INIT_LIST_HEAD(&child->perf_event_list);
7162
6519 for_each_task_context_nr(ctxn) { 7163 for_each_task_context_nr(ctxn) {
6520 ret = perf_event_init_context(child, ctxn); 7164 ret = perf_event_init_context(child, ctxn);
6521 if (ret) 7165 if (ret)
@@ -6570,9 +7214,9 @@ static void __perf_event_exit_context(void *__info)
6570 perf_pmu_rotate_stop(ctx->pmu); 7214 perf_pmu_rotate_stop(ctx->pmu);
6571 7215
6572 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) 7216 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
6573 __perf_event_remove_from_context(event); 7217 __perf_remove_from_context(event);
6574 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) 7218 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
6575 __perf_event_remove_from_context(event); 7219 __perf_remove_from_context(event);
6576} 7220}
6577 7221
6578static void perf_event_exit_cpu_context(int cpu) 7222static void perf_event_exit_cpu_context(int cpu)
@@ -6696,3 +7340,83 @@ unlock:
6696 return ret; 7340 return ret;
6697} 7341}
6698device_initcall(perf_event_sysfs_init); 7342device_initcall(perf_event_sysfs_init);
7343
7344#ifdef CONFIG_CGROUP_PERF
7345static struct cgroup_subsys_state *perf_cgroup_create(
7346 struct cgroup_subsys *ss, struct cgroup *cont)
7347{
7348 struct perf_cgroup *jc;
7349
7350 jc = kzalloc(sizeof(*jc), GFP_KERNEL);
7351 if (!jc)
7352 return ERR_PTR(-ENOMEM);
7353
7354 jc->info = alloc_percpu(struct perf_cgroup_info);
7355 if (!jc->info) {
7356 kfree(jc);
7357 return ERR_PTR(-ENOMEM);
7358 }
7359
7360 return &jc->css;
7361}
7362
7363static void perf_cgroup_destroy(struct cgroup_subsys *ss,
7364 struct cgroup *cont)
7365{
7366 struct perf_cgroup *jc;
7367 jc = container_of(cgroup_subsys_state(cont, perf_subsys_id),
7368 struct perf_cgroup, css);
7369 free_percpu(jc->info);
7370 kfree(jc);
7371}
7372
7373static int __perf_cgroup_move(void *info)
7374{
7375 struct task_struct *task = info;
7376 perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
7377 return 0;
7378}
7379
7380static void perf_cgroup_move(struct task_struct *task)
7381{
7382 task_function_call(task, __perf_cgroup_move, task);
7383}
7384
7385static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
7386 struct cgroup *old_cgrp, struct task_struct *task,
7387 bool threadgroup)
7388{
7389 perf_cgroup_move(task);
7390 if (threadgroup) {
7391 struct task_struct *c;
7392 rcu_read_lock();
7393 list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
7394 perf_cgroup_move(c);
7395 }
7396 rcu_read_unlock();
7397 }
7398}
7399
7400static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
7401 struct cgroup *old_cgrp, struct task_struct *task)
7402{
7403 /*
7404 * cgroup_exit() is called in the copy_process() failure path.
7406 * Ignore this case since the task hasn't run yet; this avoids
7406 * trying to poke a half freed task state from generic code.
7407 */
7408 if (!(task->flags & PF_EXITING))
7409 return;
7410
7411 perf_cgroup_move(task);
7412}
7413
7414struct cgroup_subsys perf_subsys = {
7415 .name = "perf_event",
7416 .subsys_id = perf_subsys_id,
7417 .create = perf_cgroup_create,
7418 .destroy = perf_cgroup_destroy,
7419 .exit = perf_cgroup_exit,
7420 .attach = perf_cgroup_attach,
7421};
7422#endif /* CONFIG_CGROUP_PERF */
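
A rough userspace sketch of the new cgroup mode (illustration only, not part of the patch): with CONFIG_CGROUP_PERF enabled, perf_event_open() accepts PERF_FLAG_PID_CGROUP, the pid argument then carries a file descriptor for a perf_event cgroup directory, and the cpu argument must name a real CPU, as the validity check above requires. The mount point, group name and event choice below are assumptions.

#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

int main(void)
{
	struct perf_event_attr attr;
	long long count;
	int cgrp_fd, ev_fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;

	/* pid carries the cgroup directory fd; cpu must be >= 0 */
	cgrp_fd = open("/sys/fs/cgroup/perf_event/mygrp", O_RDONLY);
	if (cgrp_fd < 0) {
		perror("open cgroup dir");
		return 1;
	}

	ev_fd = syscall(__NR_perf_event_open, &attr, cgrp_fd, 0 /* cpu */,
			-1 /* group_fd */, PERF_FLAG_PID_CGROUP);
	if (ev_fd < 0) {
		perror("perf_event_open");
		return 1;
	}

	sleep(1);	/* cycles are charged only while cgroup tasks run on CPU 0 */

	if (read(ev_fd, &count, sizeof(count)) == sizeof(count))
		printf("cycles: %lld\n", count);

	close(ev_fd);
	close(cgrp_fd);
	return 0;
}
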
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 05bb7173850e..67fea9d25d55 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -176,7 +176,8 @@ static inline cputime_t virt_ticks(struct task_struct *p)
176 return p->utime; 176 return p->utime;
177} 177}
178 178
179int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) 179static int
180posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
180{ 181{
181 int error = check_clock(which_clock); 182 int error = check_clock(which_clock);
182 if (!error) { 183 if (!error) {
@@ -194,7 +195,8 @@ int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
194 return error; 195 return error;
195} 196}
196 197
197int posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp) 198static int
199posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp)
198{ 200{
199 /* 201 /*
200 * You can never reset a CPU clock, but we check for other errors 202 * You can never reset a CPU clock, but we check for other errors
@@ -317,7 +319,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
317} 319}
318 320
319 321
320int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) 322static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
321{ 323{
322 const pid_t pid = CPUCLOCK_PID(which_clock); 324 const pid_t pid = CPUCLOCK_PID(which_clock);
323 int error = -EINVAL; 325 int error = -EINVAL;
@@ -379,7 +381,7 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
379 * This is called from sys_timer_create() and do_cpu_nanosleep() with the 381 * This is called from sys_timer_create() and do_cpu_nanosleep() with the
380 * new timer already all-zeros initialized. 382 * new timer already all-zeros initialized.
381 */ 383 */
382int posix_cpu_timer_create(struct k_itimer *new_timer) 384static int posix_cpu_timer_create(struct k_itimer *new_timer)
383{ 385{
384 int ret = 0; 386 int ret = 0;
385 const pid_t pid = CPUCLOCK_PID(new_timer->it_clock); 387 const pid_t pid = CPUCLOCK_PID(new_timer->it_clock);
@@ -425,7 +427,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
425 * If we return TIMER_RETRY, it's necessary to release the timer's lock 427 * If we return TIMER_RETRY, it's necessary to release the timer's lock
426 * and try again. (This happens when the timer is in the middle of firing.) 428 * and try again. (This happens when the timer is in the middle of firing.)
427 */ 429 */
428int posix_cpu_timer_del(struct k_itimer *timer) 430static int posix_cpu_timer_del(struct k_itimer *timer)
429{ 431{
430 struct task_struct *p = timer->it.cpu.task; 432 struct task_struct *p = timer->it.cpu.task;
431 int ret = 0; 433 int ret = 0;
@@ -665,8 +667,8 @@ static int cpu_timer_sample_group(const clockid_t which_clock,
665 * If we return TIMER_RETRY, it's necessary to release the timer's lock 667 * If we return TIMER_RETRY, it's necessary to release the timer's lock
666 * and try again. (This happens when the timer is in the middle of firing.) 668 * and try again. (This happens when the timer is in the middle of firing.)
667 */ 669 */
668int posix_cpu_timer_set(struct k_itimer *timer, int flags, 670static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
669 struct itimerspec *new, struct itimerspec *old) 671 struct itimerspec *new, struct itimerspec *old)
670{ 672{
671 struct task_struct *p = timer->it.cpu.task; 673 struct task_struct *p = timer->it.cpu.task;
672 union cpu_time_count old_expires, new_expires, old_incr, val; 674 union cpu_time_count old_expires, new_expires, old_incr, val;
@@ -820,7 +822,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
820 return ret; 822 return ret;
821} 823}
822 824
823void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) 825static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
824{ 826{
825 union cpu_time_count now; 827 union cpu_time_count now;
826 struct task_struct *p = timer->it.cpu.task; 828 struct task_struct *p = timer->it.cpu.task;
@@ -1481,11 +1483,13 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
1481 return error; 1483 return error;
1482} 1484}
1483 1485
1484int posix_cpu_nsleep(const clockid_t which_clock, int flags, 1486static long posix_cpu_nsleep_restart(struct restart_block *restart_block);
1485 struct timespec *rqtp, struct timespec __user *rmtp) 1487
1488static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
1489 struct timespec *rqtp, struct timespec __user *rmtp)
1486{ 1490{
1487 struct restart_block *restart_block = 1491 struct restart_block *restart_block =
1488 &current_thread_info()->restart_block; 1492 &current_thread_info()->restart_block;
1489 struct itimerspec it; 1493 struct itimerspec it;
1490 int error; 1494 int error;
1491 1495
@@ -1501,56 +1505,47 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags,
1501 1505
1502	if (error == -ERESTART_RESTARTBLOCK) {	1506	if (error == -ERESTART_RESTARTBLOCK) {
1503 1507
1504 if (flags & TIMER_ABSTIME) 1508 if (flags & TIMER_ABSTIME)
1505 return -ERESTARTNOHAND; 1509 return -ERESTARTNOHAND;
1506 /* 1510 /*
1507 * Report back to the user the time still remaining. 1511 * Report back to the user the time still remaining.
1508 */ 1512 */
1509 if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) 1513 if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
1510 return -EFAULT; 1514 return -EFAULT;
1511 1515
1512 restart_block->fn = posix_cpu_nsleep_restart; 1516 restart_block->fn = posix_cpu_nsleep_restart;
1513 restart_block->arg0 = which_clock; 1517 restart_block->nanosleep.index = which_clock;
1514 restart_block->arg1 = (unsigned long) rmtp; 1518 restart_block->nanosleep.rmtp = rmtp;
1515 restart_block->arg2 = rqtp->tv_sec; 1519 restart_block->nanosleep.expires = timespec_to_ns(rqtp);
1516 restart_block->arg3 = rqtp->tv_nsec;
1517 } 1520 }
1518 return error; 1521 return error;
1519} 1522}
1520 1523
1521long posix_cpu_nsleep_restart(struct restart_block *restart_block) 1524static long posix_cpu_nsleep_restart(struct restart_block *restart_block)
1522{ 1525{
1523 clockid_t which_clock = restart_block->arg0; 1526 clockid_t which_clock = restart_block->nanosleep.index;
1524 struct timespec __user *rmtp;
1525 struct timespec t; 1527 struct timespec t;
1526 struct itimerspec it; 1528 struct itimerspec it;
1527 int error; 1529 int error;
1528 1530
1529 rmtp = (struct timespec __user *) restart_block->arg1; 1531 t = ns_to_timespec(restart_block->nanosleep.expires);
1530 t.tv_sec = restart_block->arg2;
1531 t.tv_nsec = restart_block->arg3;
1532 1532
1533 restart_block->fn = do_no_restart_syscall;
1534 error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it); 1533 error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it);
1535 1534
1536 if (error == -ERESTART_RESTARTBLOCK) { 1535 if (error == -ERESTART_RESTARTBLOCK) {
1536 struct timespec __user *rmtp = restart_block->nanosleep.rmtp;
1537 /* 1537 /*
1538 * Report back to the user the time still remaining. 1538 * Report back to the user the time still remaining.
1539 */ 1539 */
1540 if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) 1540 if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
1541 return -EFAULT; 1541 return -EFAULT;
1542 1542
1543 restart_block->fn = posix_cpu_nsleep_restart; 1543 restart_block->nanosleep.expires = timespec_to_ns(&t);
1544 restart_block->arg0 = which_clock;
1545 restart_block->arg1 = (unsigned long) rmtp;
1546 restart_block->arg2 = t.tv_sec;
1547 restart_block->arg3 = t.tv_nsec;
1548 } 1544 }
1549 return error; 1545 return error;
1550 1546
1551} 1547}
1552 1548
1553
1554#define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED) 1549#define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED)
1555#define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED) 1550#define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED)
1556 1551
@@ -1594,38 +1589,37 @@ static int thread_cpu_timer_create(struct k_itimer *timer)
1594 timer->it_clock = THREAD_CLOCK; 1589 timer->it_clock = THREAD_CLOCK;
1595 return posix_cpu_timer_create(timer); 1590 return posix_cpu_timer_create(timer);
1596} 1591}
1597static int thread_cpu_nsleep(const clockid_t which_clock, int flags, 1592
1598 struct timespec *rqtp, struct timespec __user *rmtp) 1593struct k_clock clock_posix_cpu = {
1599{ 1594 .clock_getres = posix_cpu_clock_getres,
1600 return -EINVAL; 1595 .clock_set = posix_cpu_clock_set,
1601} 1596 .clock_get = posix_cpu_clock_get,
1602static long thread_cpu_nsleep_restart(struct restart_block *restart_block) 1597 .timer_create = posix_cpu_timer_create,
1603{ 1598 .nsleep = posix_cpu_nsleep,
1604 return -EINVAL; 1599 .nsleep_restart = posix_cpu_nsleep_restart,
1605} 1600 .timer_set = posix_cpu_timer_set,
1601 .timer_del = posix_cpu_timer_del,
1602 .timer_get = posix_cpu_timer_get,
1603};
1606 1604
1607static __init int init_posix_cpu_timers(void) 1605static __init int init_posix_cpu_timers(void)
1608{ 1606{
1609 struct k_clock process = { 1607 struct k_clock process = {
1610 .clock_getres = process_cpu_clock_getres, 1608 .clock_getres = process_cpu_clock_getres,
1611 .clock_get = process_cpu_clock_get, 1609 .clock_get = process_cpu_clock_get,
1612 .clock_set = do_posix_clock_nosettime, 1610 .timer_create = process_cpu_timer_create,
1613 .timer_create = process_cpu_timer_create, 1611 .nsleep = process_cpu_nsleep,
1614 .nsleep = process_cpu_nsleep, 1612 .nsleep_restart = process_cpu_nsleep_restart,
1615 .nsleep_restart = process_cpu_nsleep_restart,
1616 }; 1613 };
1617 struct k_clock thread = { 1614 struct k_clock thread = {
1618 .clock_getres = thread_cpu_clock_getres, 1615 .clock_getres = thread_cpu_clock_getres,
1619 .clock_get = thread_cpu_clock_get, 1616 .clock_get = thread_cpu_clock_get,
1620 .clock_set = do_posix_clock_nosettime, 1617 .timer_create = thread_cpu_timer_create,
1621 .timer_create = thread_cpu_timer_create,
1622 .nsleep = thread_cpu_nsleep,
1623 .nsleep_restart = thread_cpu_nsleep_restart,
1624 }; 1618 };
1625 struct timespec ts; 1619 struct timespec ts;
1626 1620
1627 register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process); 1621 posix_timers_register_clock(CLOCK_PROCESS_CPUTIME_ID, &process);
1628 register_posix_clock(CLOCK_THREAD_CPUTIME_ID, &thread); 1622 posix_timers_register_clock(CLOCK_THREAD_CPUTIME_ID, &thread);
1629 1623
1630 cputime_to_timespec(cputime_one_jiffy, &ts); 1624 cputime_to_timespec(cputime_one_jiffy, &ts);
1631 onecputick = ts.tv_nsec; 1625 onecputick = ts.tv_nsec;
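
The CPU-clock code above mainly changes how it is reached: the handlers become static, dynamically encoded (negative) clockids go through the shared clock_posix_cpu table, and CLOCK_PROCESS_CPUTIME_ID / CLOCK_THREAD_CPUTIME_ID stay registered from init_posix_cpu_timers(). A minimal userspace sketch of the unchanged syscall behaviour (older glibc may need -lrt):

#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec res, used;

	/* both calls resolve to the CPU-clock handlers registered in init_posix_cpu_timers() */
	if (clock_getres(CLOCK_PROCESS_CPUTIME_ID, &res) == 0)
		printf("process CPU clock resolution: %ld ns\n", res.tv_nsec);

	if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &used) == 0)
		printf("thread CPU time: %ld.%09ld s\n",
		       (long)used.tv_sec, used.tv_nsec);

	return 0;
}
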
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 93bd2eb2bc53..4c0124919f9a 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -41,6 +41,7 @@
41#include <linux/init.h> 41#include <linux/init.h>
42#include <linux/compiler.h> 42#include <linux/compiler.h>
43#include <linux/idr.h> 43#include <linux/idr.h>
44#include <linux/posix-clock.h>
44#include <linux/posix-timers.h> 45#include <linux/posix-timers.h>
45#include <linux/syscalls.h> 46#include <linux/syscalls.h>
46#include <linux/wait.h> 47#include <linux/wait.h>
@@ -81,6 +82,14 @@ static DEFINE_SPINLOCK(idr_lock);
81#error "SIGEV_THREAD_ID must not share bit with other SIGEV values!" 82#error "SIGEV_THREAD_ID must not share bit with other SIGEV values!"
82#endif 83#endif
83 84
85/*
86 * parisc wants ENOTSUP instead of EOPNOTSUPP
87 */
88#ifndef ENOTSUP
89# define ENANOSLEEP_NOTSUP EOPNOTSUPP
90#else
91# define ENANOSLEEP_NOTSUP ENOTSUP
92#endif
84 93
85/* 94/*
86 * The timer ID is turned into a timer address by idr_find(). 95 * The timer ID is turned into a timer address by idr_find().
@@ -94,11 +103,7 @@ static DEFINE_SPINLOCK(idr_lock);
94/* 103/*
95 * CLOCKs: The POSIX standard calls for a couple of clocks and allows us 104 * CLOCKs: The POSIX standard calls for a couple of clocks and allows us
96 * to implement others. This structure defines the various 105 * to implement others. This structure defines the various
97 * clocks and allows the possibility of adding others. We 106 * clocks.
98 * provide an interface to add clocks to the table and expect
99 * the "arch" code to add at least one clock that is high
100 * resolution. Here we define the standard CLOCK_REALTIME as a
101 * 1/HZ resolution clock.
102 * 107 *
103 * RESOLUTION: Clock resolution is used to round up timer and interval 108 * RESOLUTION: Clock resolution is used to round up timer and interval
104 * times, NOT to report clock times, which are reported with as 109 * times, NOT to report clock times, which are reported with as
@@ -108,20 +113,13 @@ static DEFINE_SPINLOCK(idr_lock);
108 * necessary code is written. The standard says we should say 113 * necessary code is written. The standard says we should say
109 * something about this issue in the documentation... 114 * something about this issue in the documentation...
110 * 115 *
111 * FUNCTIONS: The CLOCKs structure defines possible functions to handle 116 * FUNCTIONS: The CLOCKs structure defines possible functions to
112 * various clock functions. For clocks that use the standard 117 * handle various clock functions.
113 * system timer code these entries should be NULL. This will
114 * allow dispatch without the overhead of indirect function
115 * calls. CLOCKS that depend on other sources (e.g. WWV or GPS)
116 * must supply functions here, even if the function just returns
117 * ENOSYS. The standard POSIX timer management code assumes the
118 * following: 1.) The k_itimer struct (sched.h) is used for the
119 * timer. 2.) The list, it_lock, it_clock, it_id and it_pid
120 * fields are not modified by timer code.
121 * 118 *
122 * At this time all functions EXCEPT clock_nanosleep can be 119 * The standard POSIX timer management code assumes the
123 * redirected by the CLOCKS structure. Clock_nanosleep is in 120 * following: 1.) The k_itimer struct (sched.h) is used for
124 * there, but the code ignores it. 121 * the timer. 2.) The list, it_lock, it_clock, it_id and
122 * it_pid fields are not modified by timer code.
125 * 123 *
126 * Permissions: It is assumed that the clock_settime() function defined 124 * Permissions: It is assumed that the clock_settime() function defined
127 * for each clock will take care of permission checks. Some 125 * for each clock will take care of permission checks. Some
@@ -138,6 +136,7 @@ static struct k_clock posix_clocks[MAX_CLOCKS];
138 */ 136 */
139static int common_nsleep(const clockid_t, int flags, struct timespec *t, 137static int common_nsleep(const clockid_t, int flags, struct timespec *t,
140 struct timespec __user *rmtp); 138 struct timespec __user *rmtp);
139static int common_timer_create(struct k_itimer *new_timer);
141static void common_timer_get(struct k_itimer *, struct itimerspec *); 140static void common_timer_get(struct k_itimer *, struct itimerspec *);
142static int common_timer_set(struct k_itimer *, int, 141static int common_timer_set(struct k_itimer *, int,
143 struct itimerspec *, struct itimerspec *); 142 struct itimerspec *, struct itimerspec *);
@@ -158,76 +157,24 @@ static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
158 spin_unlock_irqrestore(&timr->it_lock, flags); 157 spin_unlock_irqrestore(&timr->it_lock, flags);
159} 158}
160 159
161/* 160/* Get clock_realtime */
162 * Call the k_clock hook function if non-null, or the default function. 161static int posix_clock_realtime_get(clockid_t which_clock, struct timespec *tp)
163 */
164#define CLOCK_DISPATCH(clock, call, arglist) \
165 ((clock) < 0 ? posix_cpu_##call arglist : \
166 (posix_clocks[clock].call != NULL \
167 ? (*posix_clocks[clock].call) arglist : common_##call arglist))
168
169/*
170 * Default clock hook functions when the struct k_clock passed
171 * to register_posix_clock leaves a function pointer null.
172 *
173 * The function common_CALL is the default implementation for
174 * the function pointer CALL in struct k_clock.
175 */
176
177static inline int common_clock_getres(const clockid_t which_clock,
178 struct timespec *tp)
179{
180 tp->tv_sec = 0;
181 tp->tv_nsec = posix_clocks[which_clock].res;
182 return 0;
183}
184
185/*
186 * Get real time for posix timers
187 */
188static int common_clock_get(clockid_t which_clock, struct timespec *tp)
189{ 162{
190 ktime_get_real_ts(tp); 163 ktime_get_real_ts(tp);
191 return 0; 164 return 0;
192} 165}
193 166
194static inline int common_clock_set(const clockid_t which_clock, 167/* Set clock_realtime */
195 struct timespec *tp) 168static int posix_clock_realtime_set(const clockid_t which_clock,
169 const struct timespec *tp)
196{ 170{
197 return do_sys_settimeofday(tp, NULL); 171 return do_sys_settimeofday(tp, NULL);
198} 172}
199 173
200static int common_timer_create(struct k_itimer *new_timer) 174static int posix_clock_realtime_adj(const clockid_t which_clock,
201{ 175 struct timex *t)
202 hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0);
203 return 0;
204}
205
206static int no_timer_create(struct k_itimer *new_timer)
207{
208 return -EOPNOTSUPP;
209}
210
211static int no_nsleep(const clockid_t which_clock, int flags,
212 struct timespec *tsave, struct timespec __user *rmtp)
213{
214 return -EOPNOTSUPP;
215}
216
217/*
218 * Return nonzero if we know a priori this clockid_t value is bogus.
219 */
220static inline int invalid_clockid(const clockid_t which_clock)
221{ 176{
222 if (which_clock < 0) /* CPU clock, posix_cpu_* will check it */ 177 return do_adjtimex(t);
223 return 0;
224 if ((unsigned) which_clock >= MAX_CLOCKS)
225 return 1;
226 if (posix_clocks[which_clock].clock_getres != NULL)
227 return 0;
228 if (posix_clocks[which_clock].res != 0)
229 return 0;
230 return 1;
231} 178}
232 179
233/* 180/*
@@ -240,7 +187,7 @@ static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp)
240} 187}
241 188
242/* 189/*
243 * Get monotonic time for posix timers 190 * Get monotonic-raw time for posix timers
244 */ 191 */
245static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp) 192static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp)
246{ 193{
@@ -267,46 +214,70 @@ static int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp
267 *tp = ktime_to_timespec(KTIME_LOW_RES); 214 *tp = ktime_to_timespec(KTIME_LOW_RES);
268 return 0; 215 return 0;
269} 216}
217
218static int posix_get_boottime(const clockid_t which_clock, struct timespec *tp)
219{
220 get_monotonic_boottime(tp);
221 return 0;
222}
223
224
270/* 225/*
271 * Initialize everything, well, just everything in Posix clocks/timers ;) 226 * Initialize everything, well, just everything in Posix clocks/timers ;)
272 */ 227 */
273static __init int init_posix_timers(void) 228static __init int init_posix_timers(void)
274{ 229{
275 struct k_clock clock_realtime = { 230 struct k_clock clock_realtime = {
276 .clock_getres = hrtimer_get_res, 231 .clock_getres = hrtimer_get_res,
232 .clock_get = posix_clock_realtime_get,
233 .clock_set = posix_clock_realtime_set,
234 .clock_adj = posix_clock_realtime_adj,
235 .nsleep = common_nsleep,
236 .nsleep_restart = hrtimer_nanosleep_restart,
237 .timer_create = common_timer_create,
238 .timer_set = common_timer_set,
239 .timer_get = common_timer_get,
240 .timer_del = common_timer_del,
277 }; 241 };
278 struct k_clock clock_monotonic = { 242 struct k_clock clock_monotonic = {
279 .clock_getres = hrtimer_get_res, 243 .clock_getres = hrtimer_get_res,
280 .clock_get = posix_ktime_get_ts, 244 .clock_get = posix_ktime_get_ts,
281 .clock_set = do_posix_clock_nosettime, 245 .nsleep = common_nsleep,
246 .nsleep_restart = hrtimer_nanosleep_restart,
247 .timer_create = common_timer_create,
248 .timer_set = common_timer_set,
249 .timer_get = common_timer_get,
250 .timer_del = common_timer_del,
282 }; 251 };
283 struct k_clock clock_monotonic_raw = { 252 struct k_clock clock_monotonic_raw = {
284 .clock_getres = hrtimer_get_res, 253 .clock_getres = hrtimer_get_res,
285 .clock_get = posix_get_monotonic_raw, 254 .clock_get = posix_get_monotonic_raw,
286 .clock_set = do_posix_clock_nosettime,
287 .timer_create = no_timer_create,
288 .nsleep = no_nsleep,
289 }; 255 };
290 struct k_clock clock_realtime_coarse = { 256 struct k_clock clock_realtime_coarse = {
291 .clock_getres = posix_get_coarse_res, 257 .clock_getres = posix_get_coarse_res,
292 .clock_get = posix_get_realtime_coarse, 258 .clock_get = posix_get_realtime_coarse,
293 .clock_set = do_posix_clock_nosettime,
294 .timer_create = no_timer_create,
295 .nsleep = no_nsleep,
296 }; 259 };
297 struct k_clock clock_monotonic_coarse = { 260 struct k_clock clock_monotonic_coarse = {
298 .clock_getres = posix_get_coarse_res, 261 .clock_getres = posix_get_coarse_res,
299 .clock_get = posix_get_monotonic_coarse, 262 .clock_get = posix_get_monotonic_coarse,
300 .clock_set = do_posix_clock_nosettime, 263 };
301 .timer_create = no_timer_create, 264 struct k_clock clock_boottime = {
302 .nsleep = no_nsleep, 265 .clock_getres = hrtimer_get_res,
266 .clock_get = posix_get_boottime,
267 .nsleep = common_nsleep,
268 .nsleep_restart = hrtimer_nanosleep_restart,
269 .timer_create = common_timer_create,
270 .timer_set = common_timer_set,
271 .timer_get = common_timer_get,
272 .timer_del = common_timer_del,
303 }; 273 };
304 274
305 register_posix_clock(CLOCK_REALTIME, &clock_realtime); 275 posix_timers_register_clock(CLOCK_REALTIME, &clock_realtime);
306 register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); 276 posix_timers_register_clock(CLOCK_MONOTONIC, &clock_monotonic);
307 register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw); 277 posix_timers_register_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw);
308 register_posix_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse); 278 posix_timers_register_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse);
309 register_posix_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse); 279 posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse);
280 posix_timers_register_clock(CLOCK_BOOTTIME, &clock_boottime);
310 281
311 posix_timers_cache = kmem_cache_create("posix_timers_cache", 282 posix_timers_cache = kmem_cache_create("posix_timers_cache",
312 sizeof (struct k_itimer), 0, SLAB_PANIC, 283 sizeof (struct k_itimer), 0, SLAB_PANIC,
@@ -482,17 +453,29 @@ static struct pid *good_sigevent(sigevent_t * event)
482 return task_pid(rtn); 453 return task_pid(rtn);
483} 454}
484 455
485void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock) 456void posix_timers_register_clock(const clockid_t clock_id,
457 struct k_clock *new_clock)
486{ 458{
487 if ((unsigned) clock_id >= MAX_CLOCKS) { 459 if ((unsigned) clock_id >= MAX_CLOCKS) {
488 printk("POSIX clock register failed for clock_id %d\n", 460 printk(KERN_WARNING "POSIX clock register failed for clock_id %d\n",
461 clock_id);
462 return;
463 }
464
465 if (!new_clock->clock_get) {
466 printk(KERN_WARNING "POSIX clock id %d lacks clock_get()\n",
467 clock_id);
468 return;
469 }
470 if (!new_clock->clock_getres) {
471 printk(KERN_WARNING "POSIX clock id %d lacks clock_getres()\n",
489 clock_id); 472 clock_id);
490 return; 473 return;
491 } 474 }
492 475
493 posix_clocks[clock_id] = *new_clock; 476 posix_clocks[clock_id] = *new_clock;
494} 477}
495EXPORT_SYMBOL_GPL(register_posix_clock); 478EXPORT_SYMBOL_GPL(posix_timers_register_clock);
496 479
497static struct k_itimer * alloc_posix_timer(void) 480static struct k_itimer * alloc_posix_timer(void)
498{ 481{
@@ -523,19 +506,39 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
523 kmem_cache_free(posix_timers_cache, tmr); 506 kmem_cache_free(posix_timers_cache, tmr);
524} 507}
525 508
509static struct k_clock *clockid_to_kclock(const clockid_t id)
510{
511 if (id < 0)
512 return (id & CLOCKFD_MASK) == CLOCKFD ?
513 &clock_posix_dynamic : &clock_posix_cpu;
514
515 if (id >= MAX_CLOCKS || !posix_clocks[id].clock_getres)
516 return NULL;
517 return &posix_clocks[id];
518}
519
520static int common_timer_create(struct k_itimer *new_timer)
521{
522 hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0);
523 return 0;
524}
525
526/* Create a POSIX.1b interval timer. */ 526/* Create a POSIX.1b interval timer. */
527 527
528SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, 528SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
529 struct sigevent __user *, timer_event_spec, 529 struct sigevent __user *, timer_event_spec,
530 timer_t __user *, created_timer_id) 530 timer_t __user *, created_timer_id)
531{ 531{
532 struct k_clock *kc = clockid_to_kclock(which_clock);
532 struct k_itimer *new_timer; 533 struct k_itimer *new_timer;
533 int error, new_timer_id; 534 int error, new_timer_id;
534 sigevent_t event; 535 sigevent_t event;
535 int it_id_set = IT_ID_NOT_SET; 536 int it_id_set = IT_ID_NOT_SET;
536 537
537 if (invalid_clockid(which_clock)) 538 if (!kc)
538 return -EINVAL; 539 return -EINVAL;
540 if (!kc->timer_create)
541 return -EOPNOTSUPP;
539 542
540 new_timer = alloc_posix_timer(); 543 new_timer = alloc_posix_timer();
541 if (unlikely(!new_timer)) 544 if (unlikely(!new_timer))
@@ -597,7 +600,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
597 goto out; 600 goto out;
598 } 601 }
599 602
600 error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer)); 603 error = kc->timer_create(new_timer);
601 if (error) 604 if (error)
602 goto out; 605 goto out;
603 606
@@ -607,7 +610,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
607 spin_unlock_irq(&current->sighand->siglock); 610 spin_unlock_irq(&current->sighand->siglock);
608 611
609 return 0; 612 return 0;
610 /* 613 /*
611 * In the case of the timer belonging to another task, after 614 * In the case of the timer belonging to another task, after
612 * the task is unlocked, the timer is owned by the other task 615 * the task is unlocked, the timer is owned by the other task
613 * and may cease to exist at any time. Don't use or modify 616 * and may cease to exist at any time. Don't use or modify
@@ -709,22 +712,28 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
709SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, 712SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
710 struct itimerspec __user *, setting) 713 struct itimerspec __user *, setting)
711{ 714{
712 struct k_itimer *timr;
713 struct itimerspec cur_setting; 715 struct itimerspec cur_setting;
716 struct k_itimer *timr;
717 struct k_clock *kc;
714 unsigned long flags; 718 unsigned long flags;
719 int ret = 0;
715 720
716 timr = lock_timer(timer_id, &flags); 721 timr = lock_timer(timer_id, &flags);
717 if (!timr) 722 if (!timr)
718 return -EINVAL; 723 return -EINVAL;
719 724
720 CLOCK_DISPATCH(timr->it_clock, timer_get, (timr, &cur_setting)); 725 kc = clockid_to_kclock(timr->it_clock);
726 if (WARN_ON_ONCE(!kc || !kc->timer_get))
727 ret = -EINVAL;
728 else
729 kc->timer_get(timr, &cur_setting);
721 730
722 unlock_timer(timr, flags); 731 unlock_timer(timr, flags);
723 732
724 if (copy_to_user(setting, &cur_setting, sizeof (cur_setting))) 733 if (!ret && copy_to_user(setting, &cur_setting, sizeof (cur_setting)))
725 return -EFAULT; 734 return -EFAULT;
726 735
727 return 0; 736 return ret;
728} 737}
729 738
730/* 739/*
@@ -813,6 +822,7 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
813 int error = 0; 822 int error = 0;
814 unsigned long flag; 823 unsigned long flag;
815 struct itimerspec *rtn = old_setting ? &old_spec : NULL; 824 struct itimerspec *rtn = old_setting ? &old_spec : NULL;
825 struct k_clock *kc;
816 826
817 if (!new_setting) 827 if (!new_setting)
818 return -EINVAL; 828 return -EINVAL;
@@ -828,8 +838,11 @@ retry:
828 if (!timr) 838 if (!timr)
829 return -EINVAL; 839 return -EINVAL;
830 840
831 error = CLOCK_DISPATCH(timr->it_clock, timer_set, 841 kc = clockid_to_kclock(timr->it_clock);
832 (timr, flags, &new_spec, rtn)); 842 if (WARN_ON_ONCE(!kc || !kc->timer_set))
843 error = -EINVAL;
844 else
845 error = kc->timer_set(timr, flags, &new_spec, rtn);
833 846
834 unlock_timer(timr, flag); 847 unlock_timer(timr, flag);
835 if (error == TIMER_RETRY) { 848 if (error == TIMER_RETRY) {
@@ -844,7 +857,7 @@ retry:
844 return error; 857 return error;
845} 858}
846 859
847static inline int common_timer_del(struct k_itimer *timer) 860static int common_timer_del(struct k_itimer *timer)
848{ 861{
849 timer->it.real.interval.tv64 = 0; 862 timer->it.real.interval.tv64 = 0;
850 863
@@ -855,7 +868,11 @@ static inline int common_timer_del(struct k_itimer *timer)
855 868
856static inline int timer_delete_hook(struct k_itimer *timer) 869static inline int timer_delete_hook(struct k_itimer *timer)
857{ 870{
858 return CLOCK_DISPATCH(timer->it_clock, timer_del, (timer)); 871 struct k_clock *kc = clockid_to_kclock(timer->it_clock);
872
873 if (WARN_ON_ONCE(!kc || !kc->timer_del))
874 return -EINVAL;
875 return kc->timer_del(timer);
859} 876}
860 877
861/* Delete a POSIX.1b interval timer. */ 878/* Delete a POSIX.1b interval timer. */
@@ -927,69 +944,76 @@ void exit_itimers(struct signal_struct *sig)
927 } 944 }
928} 945}
929 946
930/* Not available / possible... functions */
931int do_posix_clock_nosettime(const clockid_t clockid, struct timespec *tp)
932{
933 return -EINVAL;
934}
935EXPORT_SYMBOL_GPL(do_posix_clock_nosettime);
936
937int do_posix_clock_nonanosleep(const clockid_t clock, int flags,
938 struct timespec *t, struct timespec __user *r)
939{
940#ifndef ENOTSUP
941 return -EOPNOTSUPP; /* aka ENOTSUP in userland for POSIX */
942#else /* parisc does define it separately. */
943 return -ENOTSUP;
944#endif
945}
946EXPORT_SYMBOL_GPL(do_posix_clock_nonanosleep);
947
948SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, 947SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
949 const struct timespec __user *, tp) 948 const struct timespec __user *, tp)
950{ 949{
950 struct k_clock *kc = clockid_to_kclock(which_clock);
951 struct timespec new_tp; 951 struct timespec new_tp;
952 952
953 if (invalid_clockid(which_clock)) 953 if (!kc || !kc->clock_set)
954 return -EINVAL; 954 return -EINVAL;
955
955 if (copy_from_user(&new_tp, tp, sizeof (*tp))) 956 if (copy_from_user(&new_tp, tp, sizeof (*tp)))
956 return -EFAULT; 957 return -EFAULT;
957 958
958 return CLOCK_DISPATCH(which_clock, clock_set, (which_clock, &new_tp)); 959 return kc->clock_set(which_clock, &new_tp);
959} 960}
960 961
961SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, 962SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
962 struct timespec __user *,tp) 963 struct timespec __user *,tp)
963{ 964{
965 struct k_clock *kc = clockid_to_kclock(which_clock);
964 struct timespec kernel_tp; 966 struct timespec kernel_tp;
965 int error; 967 int error;
966 968
967 if (invalid_clockid(which_clock)) 969 if (!kc)
968 return -EINVAL; 970 return -EINVAL;
969 error = CLOCK_DISPATCH(which_clock, clock_get, 971
970 (which_clock, &kernel_tp)); 972 error = kc->clock_get(which_clock, &kernel_tp);
973
971 if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) 974 if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp)))
972 error = -EFAULT; 975 error = -EFAULT;
973 976
974 return error; 977 return error;
978}
979
980SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock,
981 struct timex __user *, utx)
982{
983 struct k_clock *kc = clockid_to_kclock(which_clock);
984 struct timex ktx;
985 int err;
986
987 if (!kc)
988 return -EINVAL;
989 if (!kc->clock_adj)
990 return -EOPNOTSUPP;
991
992 if (copy_from_user(&ktx, utx, sizeof(ktx)))
993 return -EFAULT;
994
995 err = kc->clock_adj(which_clock, &ktx);
996
997 if (!err && copy_to_user(utx, &ktx, sizeof(ktx)))
998 return -EFAULT;
975 999
1000 return err;
976} 1001}
977 1002
978SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, 1003SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
979 struct timespec __user *, tp) 1004 struct timespec __user *, tp)
980{ 1005{
1006 struct k_clock *kc = clockid_to_kclock(which_clock);
981 struct timespec rtn_tp; 1007 struct timespec rtn_tp;
982 int error; 1008 int error;
983 1009
984 if (invalid_clockid(which_clock)) 1010 if (!kc)
985 return -EINVAL; 1011 return -EINVAL;
986 1012
987 error = CLOCK_DISPATCH(which_clock, clock_getres, 1013 error = kc->clock_getres(which_clock, &rtn_tp);
988 (which_clock, &rtn_tp));
989 1014
990 if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) { 1015 if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp)))
991 error = -EFAULT; 1016 error = -EFAULT;
992 }
993 1017
994 return error; 1018 return error;
995} 1019}
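
The new clock_adjtime() system call above forwards a struct timex to the clock's clock_adj() method, which for CLOCK_REALTIME is do_adjtimex(). A hedged userspace sketch, assuming a libc that provides the clock_adjtime() wrapper (otherwise syscall(__NR_clock_adjtime, ...) behaves the same):

#include <stdio.h>
#include <string.h>
#include <sys/timex.h>
#include <time.h>

int main(void)
{
	struct timex tx;
	int state;

	memset(&tx, 0, sizeof(tx));
	tx.modes = 0;			/* no modifications, just query */

	state = clock_adjtime(CLOCK_REALTIME, &tx);
	if (state < 0) {
		perror("clock_adjtime");
		return 1;
	}

	/* on success the return value is the clock state (TIME_OK, ...) */
	printf("state %d, freq offset %ld, max error %ld us\n",
	       state, tx.freq, tx.maxerror);
	return 0;
}
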
@@ -1009,10 +1033,13 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
1009 const struct timespec __user *, rqtp, 1033 const struct timespec __user *, rqtp,
1010 struct timespec __user *, rmtp) 1034 struct timespec __user *, rmtp)
1011{ 1035{
1036 struct k_clock *kc = clockid_to_kclock(which_clock);
1012 struct timespec t; 1037 struct timespec t;
1013 1038
1014 if (invalid_clockid(which_clock)) 1039 if (!kc)
1015 return -EINVAL; 1040 return -EINVAL;
1041 if (!kc->nsleep)
1042 return -ENANOSLEEP_NOTSUP;
1016 1043
1017 if (copy_from_user(&t, rqtp, sizeof (struct timespec))) 1044 if (copy_from_user(&t, rqtp, sizeof (struct timespec)))
1018 return -EFAULT; 1045 return -EFAULT;
@@ -1020,27 +1047,20 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
1020 if (!timespec_valid(&t)) 1047 if (!timespec_valid(&t))
1021 return -EINVAL; 1048 return -EINVAL;
1022 1049
1023 return CLOCK_DISPATCH(which_clock, nsleep, 1050 return kc->nsleep(which_clock, flags, &t, rmtp);
1024 (which_clock, flags, &t, rmtp));
1025}
1026
1027/*
1028 * nanosleep_restart for monotonic and realtime clocks
1029 */
1030static int common_nsleep_restart(struct restart_block *restart_block)
1031{
1032 return hrtimer_nanosleep_restart(restart_block);
1033} 1051}
1034 1052
1035/* 1053/*
1036 * This will restart clock_nanosleep. This is required only by 1054 * This will restart clock_nanosleep. This is required only by
1037 * compat_clock_nanosleep_restart for now. 1055 * compat_clock_nanosleep_restart for now.
1038 */ 1056 */
1039long 1057long clock_nanosleep_restart(struct restart_block *restart_block)
1040clock_nanosleep_restart(struct restart_block *restart_block)
1041{ 1058{
1042 clockid_t which_clock = restart_block->arg0; 1059 clockid_t which_clock = restart_block->nanosleep.index;
1060 struct k_clock *kc = clockid_to_kclock(which_clock);
1061
1062 if (WARN_ON_ONCE(!kc || !kc->nsleep_restart))
1063 return -EINVAL;
1043 1064
1044 return CLOCK_DISPATCH(which_clock, nsleep_restart, 1065 return kc->nsleep_restart(restart_block);
1045 (restart_block));
1046} 1066}
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index a5aff3ebad38..265729966ece 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -100,13 +100,9 @@ config PM_SLEEP_ADVANCED_DEBUG
100 depends on PM_ADVANCED_DEBUG 100 depends on PM_ADVANCED_DEBUG
101 default n 101 default n
102 102
103config SUSPEND_NVS
104 bool
105
106config SUSPEND 103config SUSPEND
107 bool "Suspend to RAM and standby" 104 bool "Suspend to RAM and standby"
108 depends on PM && ARCH_SUSPEND_POSSIBLE 105 depends on PM && ARCH_SUSPEND_POSSIBLE
109 select SUSPEND_NVS if HAS_IOMEM
110 default y 106 default y
111 ---help--- 107 ---help---
112 Allow the system to enter sleep states in which main memory is 108 Allow the system to enter sleep states in which main memory is
@@ -140,7 +136,6 @@ config HIBERNATION
140 depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE 136 depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE
141 select LZO_COMPRESS 137 select LZO_COMPRESS
142 select LZO_DECOMPRESS 138 select LZO_DECOMPRESS
143 select SUSPEND_NVS if HAS_IOMEM
144 ---help--- 139 ---help---
145 Enable the suspend to disk (STD) functionality, which is usually 140 Enable the suspend to disk (STD) functionality, which is usually
146 called "hibernation" in user interfaces. STD checkpoints the 141 called "hibernation" in user interfaces. STD checkpoints the
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index f9063c6b185d..c350e18b53e3 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -1,7 +1,4 @@
1 1ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG
2ifeq ($(CONFIG_PM_DEBUG),y)
3EXTRA_CFLAGS += -DDEBUG
4endif
5 2
6obj-$(CONFIG_PM) += main.o 3obj-$(CONFIG_PM) += main.o
7obj-$(CONFIG_PM_SLEEP) += console.o 4obj-$(CONFIG_PM_SLEEP) += console.o
@@ -10,6 +7,5 @@ obj-$(CONFIG_SUSPEND) += suspend.o
10obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o 7obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o
11obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ 8obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \
12 block_io.o 9 block_io.o
13obj-$(CONFIG_SUSPEND_NVS) += nvs.o
14 10
15obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o 11obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 048d0b514831..1832bd264219 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -51,18 +51,18 @@ enum {
51 51
52static int hibernation_mode = HIBERNATION_SHUTDOWN; 52static int hibernation_mode = HIBERNATION_SHUTDOWN;
53 53
54static struct platform_hibernation_ops *hibernation_ops; 54static const struct platform_hibernation_ops *hibernation_ops;
55 55
56/** 56/**
57 * hibernation_set_ops - set the global hibernate operations 57 * hibernation_set_ops - set the global hibernate operations
58 * @ops: the hibernation operations to use in subsequent hibernation transitions 58 * @ops: the hibernation operations to use in subsequent hibernation transitions
59 */ 59 */
60 60
61void hibernation_set_ops(struct platform_hibernation_ops *ops) 61void hibernation_set_ops(const struct platform_hibernation_ops *ops)
62{ 62{
63 if (ops && !(ops->begin && ops->end && ops->pre_snapshot 63 if (ops && !(ops->begin && ops->end && ops->pre_snapshot
64 && ops->prepare && ops->finish && ops->enter && ops->pre_restore 64 && ops->prepare && ops->finish && ops->enter && ops->pre_restore
65 && ops->restore_cleanup)) { 65 && ops->restore_cleanup && ops->leave)) {
66 WARN_ON(1); 66 WARN_ON(1);
67 return; 67 return;
68 } 68 }
@@ -278,7 +278,7 @@ static int create_image(int platform_mode)
278 goto Enable_irqs; 278 goto Enable_irqs;
279 } 279 }
280 280
281 if (hibernation_test(TEST_CORE) || !pm_check_wakeup_events()) 281 if (hibernation_test(TEST_CORE) || pm_wakeup_pending())
282 goto Power_up; 282 goto Power_up;
283 283
284 in_suspend = 1; 284 in_suspend = 1;
@@ -516,7 +516,7 @@ int hibernation_platform_enter(void)
516 516
517 local_irq_disable(); 517 local_irq_disable();
518 sysdev_suspend(PMSG_HIBERNATE); 518 sysdev_suspend(PMSG_HIBERNATE);
519 if (!pm_check_wakeup_events()) { 519 if (pm_wakeup_pending()) {
520 error = -EAGAIN; 520 error = -EAGAIN;
521 goto Power_up; 521 goto Power_up;
522 } 522 }
@@ -647,6 +647,7 @@ int hibernate(void)
647 swsusp_free(); 647 swsusp_free();
648 if (!error) 648 if (!error)
649 power_down(); 649 power_down();
650 in_suspend = 0;
650 pm_restore_gfp_mask(); 651 pm_restore_gfp_mask();
651 } else { 652 } else {
652 pr_debug("PM: Image restored successfully.\n"); 653 pr_debug("PM: Image restored successfully.\n");
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 7b5db6a8561e..701853042c28 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -326,7 +326,7 @@ EXPORT_SYMBOL_GPL(pm_wq);
326 326
327static int __init pm_start_workqueue(void) 327static int __init pm_start_workqueue(void)
328{ 328{
329 pm_wq = alloc_workqueue("pm", WQ_FREEZEABLE, 0); 329 pm_wq = alloc_workqueue("pm", WQ_FREEZABLE, 0);
330 330
331 return pm_wq ? 0 : -ENOMEM; 331 return pm_wq ? 0 : -ENOMEM;
332} 332}
diff --git a/kernel/power/nvs.c b/kernel/power/nvs.c
deleted file mode 100644
index 1836db60bbb6..000000000000
--- a/kernel/power/nvs.c
+++ /dev/null
@@ -1,136 +0,0 @@
1/*
2 * linux/kernel/power/hibernate_nvs.c - Routines for handling NVS memory
3 *
4 * Copyright (C) 2008,2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
5 *
6 * This file is released under the GPLv2.
7 */
8
9#include <linux/io.h>
10#include <linux/kernel.h>
11#include <linux/list.h>
12#include <linux/mm.h>
13#include <linux/slab.h>
14#include <linux/suspend.h>
15
16/*
17 * Platforms, like ACPI, may want us to save some memory used by them during
18 * suspend and to restore the contents of this memory during the subsequent
19 * resume. The code below implements a mechanism allowing us to do that.
20 */
21
22struct nvs_page {
23 unsigned long phys_start;
24 unsigned int size;
25 void *kaddr;
26 void *data;
27 struct list_head node;
28};
29
30static LIST_HEAD(nvs_list);
31
32/**
33 * suspend_nvs_register - register platform NVS memory region to save
34 * @start - physical address of the region
35 * @size - size of the region
36 *
37 * The NVS region need not be page-aligned (both ends) and we arrange
38 * things so that the data from page-aligned addresses in this region will
39 * be copied into separate RAM pages.
40 */
41int suspend_nvs_register(unsigned long start, unsigned long size)
42{
43 struct nvs_page *entry, *next;
44
45 while (size > 0) {
46 unsigned int nr_bytes;
47
48 entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL);
49 if (!entry)
50 goto Error;
51
52 list_add_tail(&entry->node, &nvs_list);
53 entry->phys_start = start;
54 nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK);
55 entry->size = (size < nr_bytes) ? size : nr_bytes;
56
57 start += entry->size;
58 size -= entry->size;
59 }
60 return 0;
61
62 Error:
63 list_for_each_entry_safe(entry, next, &nvs_list, node) {
64 list_del(&entry->node);
65 kfree(entry);
66 }
67 return -ENOMEM;
68}
69
70/**
71 * suspend_nvs_free - free data pages allocated for saving NVS regions
72 */
73void suspend_nvs_free(void)
74{
75 struct nvs_page *entry;
76
77 list_for_each_entry(entry, &nvs_list, node)
78 if (entry->data) {
79 free_page((unsigned long)entry->data);
80 entry->data = NULL;
81 if (entry->kaddr) {
82 iounmap(entry->kaddr);
83 entry->kaddr = NULL;
84 }
85 }
86}
87
88/**
89 * suspend_nvs_alloc - allocate memory necessary for saving NVS regions
90 */
91int suspend_nvs_alloc(void)
92{
93 struct nvs_page *entry;
94
95 list_for_each_entry(entry, &nvs_list, node) {
96 entry->data = (void *)__get_free_page(GFP_KERNEL);
97 if (!entry->data) {
98 suspend_nvs_free();
99 return -ENOMEM;
100 }
101 }
102 return 0;
103}
104
105/**
106 * suspend_nvs_save - save NVS memory regions
107 */
108void suspend_nvs_save(void)
109{
110 struct nvs_page *entry;
111
112 printk(KERN_INFO "PM: Saving platform NVS memory\n");
113
114 list_for_each_entry(entry, &nvs_list, node)
115 if (entry->data) {
116 entry->kaddr = ioremap(entry->phys_start, entry->size);
117 memcpy(entry->data, entry->kaddr, entry->size);
118 }
119}
120
121/**
122 * suspend_nvs_restore - restore NVS memory regions
123 *
124 * This function is going to be called with interrupts disabled, so it
125 * cannot iounmap the virtual addresses used to access the NVS region.
126 */
127void suspend_nvs_restore(void)
128{
129 struct nvs_page *entry;
130
131 printk(KERN_INFO "PM: Restoring platform NVS memory\n");
132
133 list_for_each_entry(entry, &nvs_list, node)
134 if (entry->data)
135 memcpy(entry->kaddr, entry->data, entry->size);
136}
diff --git a/kernel/power/process.c b/kernel/power/process.c
index e50b4c1b2a0f..0cf3a27a6c9d 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -22,7 +22,7 @@
22 */ 22 */
23#define TIMEOUT (20 * HZ) 23#define TIMEOUT (20 * HZ)
24 24
25static inline int freezeable(struct task_struct * p) 25static inline int freezable(struct task_struct * p)
26{ 26{
27 if ((p == current) || 27 if ((p == current) ||
28 (p->flags & PF_NOFREEZE) || 28 (p->flags & PF_NOFREEZE) ||
@@ -53,7 +53,7 @@ static int try_to_freeze_tasks(bool sig_only)
53 todo = 0; 53 todo = 0;
54 read_lock(&tasklist_lock); 54 read_lock(&tasklist_lock);
55 do_each_thread(g, p) { 55 do_each_thread(g, p) {
56 if (frozen(p) || !freezeable(p)) 56 if (frozen(p) || !freezable(p))
57 continue; 57 continue;
58 58
59 if (!freeze_task(p, sig_only)) 59 if (!freeze_task(p, sig_only))
@@ -64,6 +64,12 @@ static int try_to_freeze_tasks(bool sig_only)
64 * perturb a task in TASK_STOPPED or TASK_TRACED. 64 * perturb a task in TASK_STOPPED or TASK_TRACED.
65 * It is "frozen enough". If the task does wake 65 * It is "frozen enough". If the task does wake
66 * up, it will immediately call try_to_freeze. 66 * up, it will immediately call try_to_freeze.
67 *
68 * Because freeze_task() goes through p's
69 * scheduler lock after setting TIF_FREEZE, it's
70 * guaranteed that either we see TASK_RUNNING or
71 * try_to_stop() after schedule() in ptrace/signal
72 * stop sees TIF_FREEZE.
67 */ 73 */
68 if (!task_is_stopped_or_traced(p) && 74 if (!task_is_stopped_or_traced(p) &&
69 !freezer_should_skip(p)) 75 !freezer_should_skip(p))
@@ -79,7 +85,7 @@ static int try_to_freeze_tasks(bool sig_only)
79 if (!todo || time_after(jiffies, end_time)) 85 if (!todo || time_after(jiffies, end_time))
80 break; 86 break;
81 87
82 if (!pm_check_wakeup_events()) { 88 if (pm_wakeup_pending()) {
83 wakeup = true; 89 wakeup = true;
84 break; 90 break;
85 } 91 }
@@ -161,7 +167,7 @@ static void thaw_tasks(bool nosig_only)
161 167
162 read_lock(&tasklist_lock); 168 read_lock(&tasklist_lock);
163 do_each_thread(g, p) { 169 do_each_thread(g, p) {
164 if (!freezeable(p)) 170 if (!freezable(p))
165 continue; 171 continue;
166 172
167 if (nosig_only && should_send_signal(p)) 173 if (nosig_only && should_send_signal(p))
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 0dac75ea4456..64db648ff911 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1519,11 +1519,8 @@ static int
1519swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, 1519swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
1520 unsigned int nr_pages, unsigned int nr_highmem) 1520 unsigned int nr_pages, unsigned int nr_highmem)
1521{ 1521{
1522 int error = 0;
1523
1524 if (nr_highmem > 0) { 1522 if (nr_highmem > 0) {
1525 error = get_highmem_buffer(PG_ANY); 1523 if (get_highmem_buffer(PG_ANY))
1526 if (error)
1527 goto err_out; 1524 goto err_out;
1528 if (nr_highmem > alloc_highmem) { 1525 if (nr_highmem > alloc_highmem) {
1529 nr_highmem -= alloc_highmem; 1526 nr_highmem -= alloc_highmem;
@@ -1546,7 +1543,7 @@ swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
1546 1543
1547 err_out: 1544 err_out:
1548 swsusp_free(); 1545 swsusp_free();
1549 return error; 1546 return -ENOMEM;
1550} 1547}
1551 1548
1552asmlinkage int swsusp_save(void) 1549asmlinkage int swsusp_save(void)
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 031d5e3a6197..de6f86bfa303 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -31,13 +31,13 @@ const char *const pm_states[PM_SUSPEND_MAX] = {
31 [PM_SUSPEND_MEM] = "mem", 31 [PM_SUSPEND_MEM] = "mem",
32}; 32};
33 33
34static struct platform_suspend_ops *suspend_ops; 34static const struct platform_suspend_ops *suspend_ops;
35 35
36/** 36/**
37 * suspend_set_ops - Set the global suspend method table. 37 * suspend_set_ops - Set the global suspend method table.
38 * @ops: Pointer to ops structure. 38 * @ops: Pointer to ops structure.
39 */ 39 */
40void suspend_set_ops(struct platform_suspend_ops *ops) 40void suspend_set_ops(const struct platform_suspend_ops *ops)
41{ 41{
42 mutex_lock(&pm_mutex); 42 mutex_lock(&pm_mutex);
43 suspend_ops = ops; 43 suspend_ops = ops;
@@ -164,7 +164,7 @@ static int suspend_enter(suspend_state_t state)
164 164
165 error = sysdev_suspend(PMSG_SUSPEND); 165 error = sysdev_suspend(PMSG_SUSPEND);
166 if (!error) { 166 if (!error) {
167 if (!suspend_test(TEST_CORE) && pm_check_wakeup_events()) { 167 if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) {
168 error = suspend_ops->enter(state); 168 error = suspend_ops->enter(state);
169 events_check_enabled = false; 169 events_check_enabled = false;
170 } 170 }
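
Two independent changes meet in this file: platform_suspend_ops tables may now be const (suspend_set_ops() takes a const pointer), and the pre-enter gate uses pm_wakeup_pending() instead of pm_check_wakeup_events(), again with the condition inverted via De Morgan. A sketch of how a platform could register a read-only ops table after this change; my_pm_enter(), my_pm_ops and my_pm_init() are hypothetical:

/* Hypothetical platform code registering a const suspend ops table. */
static int my_pm_enter(suspend_state_t state)
{
	/* program the platform to enter 'state' here */
	return 0;
}

static const struct platform_suspend_ops my_pm_ops = {
	.valid	= suspend_valid_only_mem,	/* only PM_SUSPEND_MEM supported */
	.enter	= my_pm_enter,
};

static int __init my_pm_init(void)
{
	suspend_set_ops(&my_pm_ops);		/* const pointer is accepted now */
	return 0;
}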
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 8c7e4832b9be..7c97c3a0eee3 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -224,7 +224,7 @@ static int swsusp_swap_check(void)
224 return res; 224 return res;
225 225
226 root_swap = res; 226 root_swap = res;
227 res = blkdev_get(hib_resume_bdev, FMODE_WRITE); 227 res = blkdev_get(hib_resume_bdev, FMODE_WRITE, NULL);
228 if (res) 228 if (res)
229 return res; 229 return res;
230 230
@@ -888,7 +888,7 @@ out_finish:
888/** 888/**
889 * swsusp_read - read the hibernation image. 889 * swsusp_read - read the hibernation image.
890 * @flags_p: flags passed by the "frozen" kernel in the image header should 890 * @flags_p: flags passed by the "frozen" kernel in the image header should
891 * be written into this memeory location 891 * be written into this memory location
892 */ 892 */
893 893
894int swsusp_read(unsigned int *flags_p) 894int swsusp_read(unsigned int *flags_p)
@@ -930,7 +930,8 @@ int swsusp_check(void)
930{ 930{
931 int error; 931 int error;
932 932
933 hib_resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); 933 hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device,
934 FMODE_READ, NULL);
934 if (!IS_ERR(hib_resume_bdev)) { 935 if (!IS_ERR(hib_resume_bdev)) {
935 set_blocksize(hib_resume_bdev, PAGE_SIZE); 936 set_blocksize(hib_resume_bdev, PAGE_SIZE);
936 clear_page(swsusp_header); 937 clear_page(swsusp_header);
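
The block-device open calls here follow an API change elsewhere in the tree: blkdev_get() gained a holder argument, and open_by_devnum() was replaced by blkdev_get_by_dev(), which takes the device number, mode and holder in one call. A hedged sketch of the new open/close pairing, independent of the hibernation code; the example_* wrappers are made up:

/* Minimal open/close pattern for the updated block device API. */
static struct block_device *example_open(dev_t dev)
{
	struct block_device *bdev;

	bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL);  /* NULL: no exclusive holder */
	if (IS_ERR(bdev))
		return bdev;

	set_blocksize(bdev, PAGE_SIZE);
	return bdev;
}

static void example_close(struct block_device *bdev)
{
	blkdev_put(bdev, FMODE_READ);	/* mode must match the one used to open */
}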
diff --git a/kernel/printk.c b/kernel/printk.c
index ab3ffc5b3b64..36231525e22f 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -39,16 +39,11 @@
39#include <linux/syslog.h> 39#include <linux/syslog.h>
40#include <linux/cpu.h> 40#include <linux/cpu.h>
41#include <linux/notifier.h> 41#include <linux/notifier.h>
42#include <linux/rculist.h>
42 43
43#include <asm/uaccess.h> 44#include <asm/uaccess.h>
44 45
45/* 46/*
46 * for_each_console() allows you to iterate on each console
47 */
48#define for_each_console(con) \
49 for (con = console_drivers; con != NULL; con = con->next)
50
51/*
52 * Architectures can override it: 47 * Architectures can override it:
53 */ 48 */
54void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) 49void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
@@ -102,7 +97,7 @@ static int console_locked, console_suspended;
102/* 97/*
103 * logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars 98 * logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars
104 * It is also used in interesting ways to provide interlocking in 99 * It is also used in interesting ways to provide interlocking in
105 * release_console_sem(). 100 * console_unlock();.
106 */ 101 */
107static DEFINE_SPINLOCK(logbuf_lock); 102static DEFINE_SPINLOCK(logbuf_lock);
108 103
@@ -267,25 +262,47 @@ int dmesg_restrict = 1;
267int dmesg_restrict; 262int dmesg_restrict;
268#endif 263#endif
269 264
265static int syslog_action_restricted(int type)
266{
267 if (dmesg_restrict)
268 return 1;
269 /* Unless restricted, we allow "read all" and "get buffer size" for everybody */
270 return type != SYSLOG_ACTION_READ_ALL && type != SYSLOG_ACTION_SIZE_BUFFER;
271}
272
273static int check_syslog_permissions(int type, bool from_file)
274{
275 /*
276 * If this is from /proc/kmsg and we've already opened it, then we've
277 * already done the capabilities checks at open time.
278 */
279 if (from_file && type != SYSLOG_ACTION_OPEN)
280 return 0;
281
282 if (syslog_action_restricted(type)) {
283 if (capable(CAP_SYSLOG))
284 return 0;
285 /* For historical reasons, accept CAP_SYS_ADMIN too, with a warning */
286 if (capable(CAP_SYS_ADMIN)) {
287 WARN_ONCE(1, "Attempt to access syslog with CAP_SYS_ADMIN "
288 "but no CAP_SYSLOG (deprecated).\n");
289 return 0;
290 }
291 return -EPERM;
292 }
293 return 0;
294}
295
270int do_syslog(int type, char __user *buf, int len, bool from_file) 296int do_syslog(int type, char __user *buf, int len, bool from_file)
271{ 297{
272 unsigned i, j, limit, count; 298 unsigned i, j, limit, count;
273 int do_clear = 0; 299 int do_clear = 0;
274 char c; 300 char c;
275 int error = 0; 301 int error;
276 302
277 /* 303 error = check_syslog_permissions(type, from_file);
278 * If this is from /proc/kmsg we only do the capabilities checks 304 if (error)
279 * at open time. 305 goto out;
280 */
281 if (type == SYSLOG_ACTION_OPEN || !from_file) {
282 if (dmesg_restrict && !capable(CAP_SYS_ADMIN))
283 return -EPERM;
284 if ((type != SYSLOG_ACTION_READ_ALL &&
285 type != SYSLOG_ACTION_SIZE_BUFFER) &&
286 !capable(CAP_SYS_ADMIN))
287 return -EPERM;
288 }
289 306
290 error = security_syslog(type); 307 error = security_syslog(type);
291 if (error) 308 if (error)
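
The net effect of check_syslog_permissions() above: with dmesg_restrict clear, unprivileged callers may still read the whole buffer and query its size, while every other action needs CAP_SYSLOG (CAP_SYS_ADMIN is still accepted, with a one-time deprecation warning). A small userspace illustration of the two unprivileged actions via klogctl(); the numeric command values 10 and 3 correspond to SYSLOG_ACTION_SIZE_BUFFER and SYSLOG_ACTION_READ_ALL:

/* Userspace illustration only: reads the kernel log without privileges
 * as long as kernel.dmesg_restrict is 0. */
#include <stdio.h>
#include <stdlib.h>
#include <sys/klog.h>

int main(void)
{
	int len = klogctl(10, NULL, 0);		/* SYSLOG_ACTION_SIZE_BUFFER */
	char *buf;

	if (len < 0)
		return 1;
	buf = malloc(len);
	if (!buf)
		return 1;
	len = klogctl(3, buf, len);		/* SYSLOG_ACTION_READ_ALL */
	if (len > 0)
		fwrite(buf, 1, len, stdout);
	free(buf);
	return 0;
}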
@@ -500,7 +517,7 @@ static void _call_console_drivers(unsigned start,
500/* 517/*
501 * Call the console drivers, asking them to write out 518 * Call the console drivers, asking them to write out
502 * log_buf[start] to log_buf[end - 1]. 519 * log_buf[start] to log_buf[end - 1].
503 * The console_sem must be held. 520 * The console_lock must be held.
504 */ 521 */
505static void call_console_drivers(unsigned start, unsigned end) 522static void call_console_drivers(unsigned start, unsigned end)
506{ 523{
@@ -603,11 +620,11 @@ static int have_callable_console(void)
603 * 620 *
604 * This is printk(). It can be called from any context. We want it to work. 621 * This is printk(). It can be called from any context. We want it to work.
605 * 622 *
606 * We try to grab the console_sem. If we succeed, it's easy - we log the output and 623 * We try to grab the console_lock. If we succeed, it's easy - we log the output and
607 * call the console drivers. If we fail to get the semaphore we place the output 624 * call the console drivers. If we fail to get the semaphore we place the output
608 * into the log buffer and return. The current holder of the console_sem will 625 * into the log buffer and return. The current holder of the console_sem will
609 * notice the new output in release_console_sem() and will send it to the 626 * notice the new output in console_unlock(); and will send it to the
610 * consoles before releasing the semaphore. 627 * consoles before releasing the lock.
611 * 628 *
612 * One effect of this deferred printing is that code which calls printk() and 629 * One effect of this deferred printing is that code which calls printk() and
613 * then changes console_loglevel may break. This is because console_loglevel 630 * then changes console_loglevel may break. This is because console_loglevel
@@ -658,19 +675,19 @@ static inline int can_use_console(unsigned int cpu)
658/* 675/*
659 * Try to get console ownership to actually show the kernel 676 * Try to get console ownership to actually show the kernel
660 * messages from a 'printk'. Return true (and with the 677 * messages from a 'printk'. Return true (and with the
661 * console_semaphore held, and 'console_locked' set) if it 678 * console_lock held, and 'console_locked' set) if it
662 * is successful, false otherwise. 679 * is successful, false otherwise.
663 * 680 *
664 * This gets called with the 'logbuf_lock' spinlock held and 681 * This gets called with the 'logbuf_lock' spinlock held and
665 * interrupts disabled. It should return with 'lockbuf_lock' 682 * interrupts disabled. It should return with 'lockbuf_lock'
666 * released but interrupts still disabled. 683 * released but interrupts still disabled.
667 */ 684 */
668static int acquire_console_semaphore_for_printk(unsigned int cpu) 685static int console_trylock_for_printk(unsigned int cpu)
669 __releases(&logbuf_lock) 686 __releases(&logbuf_lock)
670{ 687{
671 int retval = 0; 688 int retval = 0;
672 689
673 if (!try_acquire_console_sem()) { 690 if (console_trylock()) {
674 retval = 1; 691 retval = 1;
675 692
676 /* 693 /*
@@ -826,12 +843,12 @@ asmlinkage int vprintk(const char *fmt, va_list args)
826 * actual magic (print out buffers, wake up klogd, 843 * actual magic (print out buffers, wake up klogd,
827 * etc). 844 * etc).
828 * 845 *
829 * The acquire_console_semaphore_for_printk() function 846 * The console_trylock_for_printk() function
830 * will release 'logbuf_lock' regardless of whether it 847 * will release 'logbuf_lock' regardless of whether it
831 * actually gets the semaphore or not. 848 * actually gets the semaphore or not.
832 */ 849 */
833 if (acquire_console_semaphore_for_printk(this_cpu)) 850 if (console_trylock_for_printk(this_cpu))
834 release_console_sem(); 851 console_unlock();
835 852
836 lockdep_on(); 853 lockdep_on();
837out_restore_irqs: 854out_restore_irqs:
@@ -992,7 +1009,7 @@ void suspend_console(void)
992 if (!console_suspend_enabled) 1009 if (!console_suspend_enabled)
993 return; 1010 return;
994 printk("Suspending console(s) (use no_console_suspend to debug)\n"); 1011 printk("Suspending console(s) (use no_console_suspend to debug)\n");
995 acquire_console_sem(); 1012 console_lock();
996 console_suspended = 1; 1013 console_suspended = 1;
997 up(&console_sem); 1014 up(&console_sem);
998} 1015}
@@ -1003,7 +1020,7 @@ void resume_console(void)
1003 return; 1020 return;
1004 down(&console_sem); 1021 down(&console_sem);
1005 console_suspended = 0; 1022 console_suspended = 0;
1006 release_console_sem(); 1023 console_unlock();
1007} 1024}
1008 1025
1009/** 1026/**
@@ -1026,21 +1043,21 @@ static int __cpuinit console_cpu_notify(struct notifier_block *self,
1026 case CPU_DYING: 1043 case CPU_DYING:
1027 case CPU_DOWN_FAILED: 1044 case CPU_DOWN_FAILED:
1028 case CPU_UP_CANCELED: 1045 case CPU_UP_CANCELED:
1029 acquire_console_sem(); 1046 console_lock();
1030 release_console_sem(); 1047 console_unlock();
1031 } 1048 }
1032 return NOTIFY_OK; 1049 return NOTIFY_OK;
1033} 1050}
1034 1051
1035/** 1052/**
1036 * acquire_console_sem - lock the console system for exclusive use. 1053 * console_lock - lock the console system for exclusive use.
1037 * 1054 *
1038 * Acquires a semaphore which guarantees that the caller has 1055 * Acquires a lock which guarantees that the caller has
1039 * exclusive access to the console system and the console_drivers list. 1056 * exclusive access to the console system and the console_drivers list.
1040 * 1057 *
1041 * Can sleep, returns nothing. 1058 * Can sleep, returns nothing.
1042 */ 1059 */
1043void acquire_console_sem(void) 1060void console_lock(void)
1044{ 1061{
1045 BUG_ON(in_interrupt()); 1062 BUG_ON(in_interrupt());
1046 down(&console_sem); 1063 down(&console_sem);
@@ -1049,21 +1066,29 @@ void acquire_console_sem(void)
1049 console_locked = 1; 1066 console_locked = 1;
1050 console_may_schedule = 1; 1067 console_may_schedule = 1;
1051} 1068}
1052EXPORT_SYMBOL(acquire_console_sem); 1069EXPORT_SYMBOL(console_lock);
1053 1070
1054int try_acquire_console_sem(void) 1071/**
1072 * console_trylock - try to lock the console system for exclusive use.
1073 *
1074 * Tried to acquire a lock which guarantees that the caller has
1075 * exclusive access to the console system and the console_drivers list.
1076 *
1077 * returns 1 on success, and 0 on failure to acquire the lock.
1078 */
1079int console_trylock(void)
1055{ 1080{
1056 if (down_trylock(&console_sem)) 1081 if (down_trylock(&console_sem))
1057 return -1; 1082 return 0;
1058 if (console_suspended) { 1083 if (console_suspended) {
1059 up(&console_sem); 1084 up(&console_sem);
1060 return -1; 1085 return 0;
1061 } 1086 }
1062 console_locked = 1; 1087 console_locked = 1;
1063 console_may_schedule = 0; 1088 console_may_schedule = 0;
1064 return 0; 1089 return 1;
1065} 1090}
1066EXPORT_SYMBOL(try_acquire_console_sem); 1091EXPORT_SYMBOL(console_trylock);
1067 1092
1068int is_console_locked(void) 1093int is_console_locked(void)
1069{ 1094{
@@ -1094,20 +1119,20 @@ void wake_up_klogd(void)
1094} 1119}
1095 1120
1096/** 1121/**
1097 * release_console_sem - unlock the console system 1122 * console_unlock - unlock the console system
1098 * 1123 *
1099 * Releases the semaphore which the caller holds on the console system 1124 * Releases the console_lock which the caller holds on the console system
1100 * and the console driver list. 1125 * and the console driver list.
1101 * 1126 *
1102 * While the semaphore was held, console output may have been buffered 1127 * While the console_lock was held, console output may have been buffered
1103 * by printk(). If this is the case, release_console_sem() emits 1128 * by printk(). If this is the case, console_unlock(); emits
1104 * the output prior to releasing the semaphore. 1129 * the output prior to releasing the lock.
1105 * 1130 *
1106 * If there is output waiting for klogd, we wake it up. 1131 * If there is output waiting for klogd, we wake it up.
1107 * 1132 *
1108 * release_console_sem() may be called from any context. 1133 * console_unlock(); may be called from any context.
1109 */ 1134 */
1110void release_console_sem(void) 1135void console_unlock(void)
1111{ 1136{
1112 unsigned long flags; 1137 unsigned long flags;
1113 unsigned _con_start, _log_end; 1138 unsigned _con_start, _log_end;
@@ -1140,7 +1165,7 @@ void release_console_sem(void)
1140 if (wake_klogd) 1165 if (wake_klogd)
1141 wake_up_klogd(); 1166 wake_up_klogd();
1142} 1167}
1143EXPORT_SYMBOL(release_console_sem); 1168EXPORT_SYMBOL(console_unlock);
1144 1169
1145/** 1170/**
1146 * console_conditional_schedule - yield the CPU if required 1171 * console_conditional_schedule - yield the CPU if required
@@ -1149,7 +1174,7 @@ EXPORT_SYMBOL(release_console_sem);
1149 * if this CPU should yield the CPU to another task, do 1174 * if this CPU should yield the CPU to another task, do
1150 * so here. 1175 * so here.
1151 * 1176 *
1152 * Must be called within acquire_console_sem(). 1177 * Must be called within console_lock();.
1153 */ 1178 */
1154void __sched console_conditional_schedule(void) 1179void __sched console_conditional_schedule(void)
1155{ 1180{
@@ -1170,14 +1195,14 @@ void console_unblank(void)
1170 if (down_trylock(&console_sem) != 0) 1195 if (down_trylock(&console_sem) != 0)
1171 return; 1196 return;
1172 } else 1197 } else
1173 acquire_console_sem(); 1198 console_lock();
1174 1199
1175 console_locked = 1; 1200 console_locked = 1;
1176 console_may_schedule = 0; 1201 console_may_schedule = 0;
1177 for_each_console(c) 1202 for_each_console(c)
1178 if ((c->flags & CON_ENABLED) && c->unblank) 1203 if ((c->flags & CON_ENABLED) && c->unblank)
1179 c->unblank(); 1204 c->unblank();
1180 release_console_sem(); 1205 console_unlock();
1181} 1206}
1182 1207
1183/* 1208/*
@@ -1188,7 +1213,7 @@ struct tty_driver *console_device(int *index)
1188 struct console *c; 1213 struct console *c;
1189 struct tty_driver *driver = NULL; 1214 struct tty_driver *driver = NULL;
1190 1215
1191 acquire_console_sem(); 1216 console_lock();
1192 for_each_console(c) { 1217 for_each_console(c) {
1193 if (!c->device) 1218 if (!c->device)
1194 continue; 1219 continue;
@@ -1196,7 +1221,7 @@ struct tty_driver *console_device(int *index)
1196 if (driver) 1221 if (driver)
1197 break; 1222 break;
1198 } 1223 }
1199 release_console_sem(); 1224 console_unlock();
1200 return driver; 1225 return driver;
1201} 1226}
1202 1227
@@ -1207,17 +1232,17 @@ struct tty_driver *console_device(int *index)
1207 */ 1232 */
1208void console_stop(struct console *console) 1233void console_stop(struct console *console)
1209{ 1234{
1210 acquire_console_sem(); 1235 console_lock();
1211 console->flags &= ~CON_ENABLED; 1236 console->flags &= ~CON_ENABLED;
1212 release_console_sem(); 1237 console_unlock();
1213} 1238}
1214EXPORT_SYMBOL(console_stop); 1239EXPORT_SYMBOL(console_stop);
1215 1240
1216void console_start(struct console *console) 1241void console_start(struct console *console)
1217{ 1242{
1218 acquire_console_sem(); 1243 console_lock();
1219 console->flags |= CON_ENABLED; 1244 console->flags |= CON_ENABLED;
1220 release_console_sem(); 1245 console_unlock();
1221} 1246}
1222EXPORT_SYMBOL(console_start); 1247EXPORT_SYMBOL(console_start);
1223 1248
@@ -1339,7 +1364,7 @@ void register_console(struct console *newcon)
1339 * Put this console in the list - keep the 1364 * Put this console in the list - keep the
1340 * preferred driver at the head of the list. 1365 * preferred driver at the head of the list.
1341 */ 1366 */
1342 acquire_console_sem(); 1367 console_lock();
1343 if ((newcon->flags & CON_CONSDEV) || console_drivers == NULL) { 1368 if ((newcon->flags & CON_CONSDEV) || console_drivers == NULL) {
1344 newcon->next = console_drivers; 1369 newcon->next = console_drivers;
1345 console_drivers = newcon; 1370 console_drivers = newcon;
@@ -1351,14 +1376,15 @@ void register_console(struct console *newcon)
1351 } 1376 }
1352 if (newcon->flags & CON_PRINTBUFFER) { 1377 if (newcon->flags & CON_PRINTBUFFER) {
1353 /* 1378 /*
1354 * release_console_sem() will print out the buffered messages 1379 * console_unlock(); will print out the buffered messages
1355 * for us. 1380 * for us.
1356 */ 1381 */
1357 spin_lock_irqsave(&logbuf_lock, flags); 1382 spin_lock_irqsave(&logbuf_lock, flags);
1358 con_start = log_start; 1383 con_start = log_start;
1359 spin_unlock_irqrestore(&logbuf_lock, flags); 1384 spin_unlock_irqrestore(&logbuf_lock, flags);
1360 } 1385 }
1361 release_console_sem(); 1386 console_unlock();
1387 console_sysfs_notify();
1362 1388
1363 /* 1389 /*
1364 * By unregistering the bootconsoles after we enable the real console 1390 * By unregistering the bootconsoles after we enable the real console
@@ -1394,7 +1420,7 @@ int unregister_console(struct console *console)
1394 return braille_unregister_console(console); 1420 return braille_unregister_console(console);
1395#endif 1421#endif
1396 1422
1397 acquire_console_sem(); 1423 console_lock();
1398 if (console_drivers == console) { 1424 if (console_drivers == console) {
1399 console_drivers=console->next; 1425 console_drivers=console->next;
1400 res = 0; 1426 res = 0;
@@ -1416,7 +1442,8 @@ int unregister_console(struct console *console)
1416 if (console_drivers != NULL && console->flags & CON_CONSDEV) 1442 if (console_drivers != NULL && console->flags & CON_CONSDEV)
1417 console_drivers->flags |= CON_CONSDEV; 1443 console_drivers->flags |= CON_CONSDEV;
1418 1444
1419 release_console_sem(); 1445 console_unlock();
1446 console_sysfs_notify();
1420 return res; 1447 return res;
1421} 1448}
1422EXPORT_SYMBOL(unregister_console); 1449EXPORT_SYMBOL(unregister_console);
@@ -1500,7 +1527,7 @@ int kmsg_dump_register(struct kmsg_dumper *dumper)
1500 /* Don't allow registering multiple times */ 1527 /* Don't allow registering multiple times */
1501 if (!dumper->registered) { 1528 if (!dumper->registered) {
1502 dumper->registered = 1; 1529 dumper->registered = 1;
1503 list_add_tail(&dumper->list, &dump_list); 1530 list_add_tail_rcu(&dumper->list, &dump_list);
1504 err = 0; 1531 err = 0;
1505 } 1532 }
1506 spin_unlock_irqrestore(&dump_list_lock, flags); 1533 spin_unlock_irqrestore(&dump_list_lock, flags);
@@ -1524,29 +1551,16 @@ int kmsg_dump_unregister(struct kmsg_dumper *dumper)
1524 spin_lock_irqsave(&dump_list_lock, flags); 1551 spin_lock_irqsave(&dump_list_lock, flags);
1525 if (dumper->registered) { 1552 if (dumper->registered) {
1526 dumper->registered = 0; 1553 dumper->registered = 0;
1527 list_del(&dumper->list); 1554 list_del_rcu(&dumper->list);
1528 err = 0; 1555 err = 0;
1529 } 1556 }
1530 spin_unlock_irqrestore(&dump_list_lock, flags); 1557 spin_unlock_irqrestore(&dump_list_lock, flags);
1558 synchronize_rcu();
1531 1559
1532 return err; 1560 return err;
1533} 1561}
1534EXPORT_SYMBOL_GPL(kmsg_dump_unregister); 1562EXPORT_SYMBOL_GPL(kmsg_dump_unregister);
1535 1563
1536static const char * const kmsg_reasons[] = {
1537 [KMSG_DUMP_OOPS] = "oops",
1538 [KMSG_DUMP_PANIC] = "panic",
1539 [KMSG_DUMP_KEXEC] = "kexec",
1540};
1541
1542static const char *kmsg_to_str(enum kmsg_dump_reason reason)
1543{
1544 if (reason >= ARRAY_SIZE(kmsg_reasons) || reason < 0)
1545 return "unknown";
1546
1547 return kmsg_reasons[reason];
1548}
1549
1550/** 1564/**
1551 * kmsg_dump - dump kernel log to kernel message dumpers. 1565 * kmsg_dump - dump kernel log to kernel message dumpers.
1552 * @reason: the reason (oops, panic etc) for dumping 1566 * @reason: the reason (oops, panic etc) for dumping
@@ -1585,13 +1599,9 @@ void kmsg_dump(enum kmsg_dump_reason reason)
1585 l2 = chars; 1599 l2 = chars;
1586 } 1600 }
1587 1601
1588 if (!spin_trylock_irqsave(&dump_list_lock, flags)) { 1602 rcu_read_lock();
1589 printk(KERN_ERR "dump_kmsg: dump list lock is held during %s, skipping dump\n", 1603 list_for_each_entry_rcu(dumper, &dump_list, list)
1590 kmsg_to_str(reason));
1591 return;
1592 }
1593 list_for_each_entry(dumper, &dump_list, list)
1594 dumper->dump(dumper, reason, s1, l1, s2, l2); 1604 dumper->dump(dumper, reason, s1, l1, s2, l2);
1595 spin_unlock_irqrestore(&dump_list_lock, flags); 1605 rcu_read_unlock();
1596} 1606}
1597#endif 1607#endif
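
The final printk.c hunks convert the kmsg dumper list from a trylock-protected list (which could silently skip the dump if dump_list_lock was contended at oops/panic time) to an RCU-protected one: writers still take the spinlock and wait for a grace period after unregistering, while kmsg_dump() walks the list under rcu_read_lock() only. A generic sketch of that reader/writer split, using made-up names my_lock/my_list/my_node:

/* Generic RCU-protected list pattern mirrored by the kmsg_dump changes. */
static DEFINE_SPINLOCK(my_lock);
static LIST_HEAD(my_list);

struct my_node {
	struct list_head list;
	void (*callback)(struct my_node *n);
};

static void my_register(struct my_node *n)
{
	spin_lock(&my_lock);
	list_add_tail_rcu(&n->list, &my_list);
	spin_unlock(&my_lock);
}

static void my_unregister(struct my_node *n)
{
	spin_lock(&my_lock);
	list_del_rcu(&n->list);
	spin_unlock(&my_lock);
	synchronize_rcu();	/* no reader can still see 'n' after this */
}

static void my_walk(void)	/* lock-free for readers, never skipped */
{
	struct my_node *n;

	rcu_read_lock();
	list_for_each_entry_rcu(n, &my_list, list)
		n->callback(n);
	rcu_read_unlock();
}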
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 99bbaa3e5b0d..e2302e40b360 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -163,7 +163,7 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode)
163 return !err; 163 return !err;
164} 164}
165 165
166int ptrace_attach(struct task_struct *task) 166static int ptrace_attach(struct task_struct *task)
167{ 167{
168 int retval; 168 int retval;
169 169
@@ -219,7 +219,7 @@ out:
219 * Performs checks and sets PT_PTRACED. 219 * Performs checks and sets PT_PTRACED.
220 * Should be used by all ptrace implementations for PTRACE_TRACEME. 220 * Should be used by all ptrace implementations for PTRACE_TRACEME.
221 */ 221 */
222int ptrace_traceme(void) 222static int ptrace_traceme(void)
223{ 223{
224 int ret = -EPERM; 224 int ret = -EPERM;
225 225
@@ -293,7 +293,7 @@ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
293 return false; 293 return false;
294} 294}
295 295
296int ptrace_detach(struct task_struct *child, unsigned int data) 296static int ptrace_detach(struct task_struct *child, unsigned int data)
297{ 297{
298 bool dead = false; 298 bool dead = false;
299 299
@@ -313,7 +313,7 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
313 child->exit_code = data; 313 child->exit_code = data;
314 dead = __ptrace_detach(current, child); 314 dead = __ptrace_detach(current, child);
315 if (!child->exit_state) 315 if (!child->exit_state)
316 wake_up_process(child); 316 wake_up_state(child, TASK_TRACED | TASK_STOPPED);
317 } 317 }
318 write_unlock_irq(&tasklist_lock); 318 write_unlock_irq(&tasklist_lock);
319 319
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 034493724749..0c343b9a46d5 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -189,7 +189,8 @@ static int rcu_kthread(void *arg)
189 unsigned long flags; 189 unsigned long flags;
190 190
191 for (;;) { 191 for (;;) {
192 wait_event(rcu_kthread_wq, have_rcu_kthread_work != 0); 192 wait_event_interruptible(rcu_kthread_wq,
193 have_rcu_kthread_work != 0);
193 morework = rcu_boost(); 194 morework = rcu_boost();
194 local_irq_save(flags); 195 local_irq_save(flags);
195 work = have_rcu_kthread_work; 196 work = have_rcu_kthread_work;
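
Switching from wait_event() to wait_event_interruptible() lets the RCU kthread sleep in TASK_INTERRUPTIBLE rather than TASK_UNINTERRUPTIBLE, presumably so an idle kthread does not show up as an uninterruptible sleeper (and inflate the load average); since the loop re-checks its condition anyway, a spurious early return is harmless. The usual shape of such a kthread loop, sketched with a hypothetical have_work flag and wait queue:

/* Hedged sketch of the kthread wait pattern used above. */
static DECLARE_WAIT_QUEUE_HEAD(example_wq);
static int have_work;

static int example_kthread(void *arg)
{
	for (;;) {
		wait_event_interruptible(example_wq, have_work != 0);
		have_work = 0;
		/* process the queued work here */
		if (kthread_should_stop())
			break;
	}
	return 0;
}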
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index d0ddfea6579d..dd4aea806f8e 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -364,8 +364,8 @@ void rcu_irq_exit(void)
364 WARN_ON_ONCE(rdtp->dynticks & 0x1); 364 WARN_ON_ONCE(rdtp->dynticks & 0x1);
365 365
366 /* If the interrupt queued a callback, get out of dyntick mode. */ 366 /* If the interrupt queued a callback, get out of dyntick mode. */
367 if (__get_cpu_var(rcu_sched_data).nxtlist || 367 if (__this_cpu_read(rcu_sched_data.nxtlist) ||
368 __get_cpu_var(rcu_bh_data).nxtlist) 368 __this_cpu_read(rcu_bh_data.nxtlist))
369 set_need_resched(); 369 set_need_resched();
370} 370}
371 371
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
index ddabb54bb5c8..3c7cbc2c33be 100644
--- a/kernel/rtmutex-debug.c
+++ b/kernel/rtmutex-debug.c
@@ -215,7 +215,6 @@ void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
215 put_pid(waiter->deadlock_task_pid); 215 put_pid(waiter->deadlock_task_pid);
216 TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry)); 216 TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry));
217 TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); 217 TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
218 TRACE_WARN_ON(waiter->task);
219 memset(waiter, 0x22, sizeof(*waiter)); 218 memset(waiter, 0x22, sizeof(*waiter));
220} 219}
221 220
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 66cb89bc5ef1..5c9ccd380966 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -9,7 +9,6 @@
9#include <linux/kthread.h> 9#include <linux/kthread.h>
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/sched.h> 11#include <linux/sched.h>
12#include <linux/smp_lock.h>
13#include <linux/spinlock.h> 12#include <linux/spinlock.h>
14#include <linux/sysdev.h> 13#include <linux/sysdev.h>
15#include <linux/timer.h> 14#include <linux/timer.h>
@@ -27,7 +26,6 @@ struct test_thread_data {
27 int opcode; 26 int opcode;
28 int opdata; 27 int opdata;
29 int mutexes[MAX_RT_TEST_MUTEXES]; 28 int mutexes[MAX_RT_TEST_MUTEXES];
30 int bkl;
31 int event; 29 int event;
32 struct sys_device sysdev; 30 struct sys_device sysdev;
33}; 31};
@@ -46,9 +44,8 @@ enum test_opcodes {
46 RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */ 44 RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */
47 RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */ 45 RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */
48 RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */ 46 RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */
49 RTTEST_LOCKBKL, /* 9 Lock BKL */ 47 /* 9, 10 - reserved for BKL commemoration */
50 RTTEST_UNLOCKBKL, /* 10 Unlock BKL */ 48 RTTEST_SIGNAL = 11, /* 11 Signal other test thread, data = thread id */
51 RTTEST_SIGNAL, /* 11 Signal other test thread, data = thread id */
52 RTTEST_RESETEVENT = 98, /* 98 Reset event counter */ 49 RTTEST_RESETEVENT = 98, /* 98 Reset event counter */
53 RTTEST_RESET = 99, /* 99 Reset all pending operations */ 50 RTTEST_RESET = 99, /* 99 Reset all pending operations */
54}; 51};
@@ -74,13 +71,6 @@ static int handle_op(struct test_thread_data *td, int lockwakeup)
74 td->mutexes[i] = 0; 71 td->mutexes[i] = 0;
75 } 72 }
76 } 73 }
77
78 if (!lockwakeup && td->bkl == 4) {
79#ifdef CONFIG_LOCK_KERNEL
80 unlock_kernel();
81#endif
82 td->bkl = 0;
83 }
84 return 0; 74 return 0;
85 75
86 case RTTEST_RESETEVENT: 76 case RTTEST_RESETEVENT:
@@ -131,25 +121,6 @@ static int handle_op(struct test_thread_data *td, int lockwakeup)
131 td->mutexes[id] = 0; 121 td->mutexes[id] = 0;
132 return 0; 122 return 0;
133 123
134 case RTTEST_LOCKBKL:
135 if (td->bkl)
136 return 0;
137 td->bkl = 1;
138#ifdef CONFIG_LOCK_KERNEL
139 lock_kernel();
140#endif
141 td->bkl = 4;
142 return 0;
143
144 case RTTEST_UNLOCKBKL:
145 if (td->bkl != 4)
146 break;
147#ifdef CONFIG_LOCK_KERNEL
148 unlock_kernel();
149#endif
150 td->bkl = 0;
151 return 0;
152
153 default: 124 default:
154 break; 125 break;
155 } 126 }
@@ -196,7 +167,6 @@ void schedule_rt_mutex_test(struct rt_mutex *mutex)
196 td->event = atomic_add_return(1, &rttest_event); 167 td->event = atomic_add_return(1, &rttest_event);
197 break; 168 break;
198 169
199 case RTTEST_LOCKBKL:
200 default: 170 default:
201 break; 171 break;
202 } 172 }
@@ -229,8 +199,6 @@ void schedule_rt_mutex_test(struct rt_mutex *mutex)
229 td->event = atomic_add_return(1, &rttest_event); 199 td->event = atomic_add_return(1, &rttest_event);
230 return; 200 return;
231 201
232 case RTTEST_LOCKBKL:
233 return;
234 default: 202 default:
235 return; 203 return;
236 } 204 }
@@ -380,11 +348,11 @@ static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute
380 spin_lock(&rttest_lock); 348 spin_lock(&rttest_lock);
381 349
382 curr += sprintf(curr, 350 curr += sprintf(curr,
383 "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, K: %d, M:", 351 "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, M:",
384 td->opcode, td->event, tsk->state, 352 td->opcode, td->event, tsk->state,
385 (MAX_RT_PRIO - 1) - tsk->prio, 353 (MAX_RT_PRIO - 1) - tsk->prio,
386 (MAX_RT_PRIO - 1) - tsk->normal_prio, 354 (MAX_RT_PRIO - 1) - tsk->normal_prio,
387 tsk->pi_blocked_on, td->bkl); 355 tsk->pi_blocked_on);
388 356
389 for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--) 357 for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--)
390 curr += sprintf(curr, "%d", td->mutexes[i]); 358 curr += sprintf(curr, "%d", td->mutexes[i]);
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index a9604815786a..ab449117aaf2 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -20,41 +20,34 @@
20/* 20/*
21 * lock->owner state tracking: 21 * lock->owner state tracking:
22 * 22 *
23 * lock->owner holds the task_struct pointer of the owner. Bit 0 and 1 23 * lock->owner holds the task_struct pointer of the owner. Bit 0
24 * are used to keep track of the "owner is pending" and "lock has 24 * is used to keep track of the "lock has waiters" state.
25 * waiters" state.
26 * 25 *
27 * owner bit1 bit0 26 * owner bit0
28 * NULL 0 0 lock is free (fast acquire possible) 27 * NULL 0 lock is free (fast acquire possible)
29 * NULL 0 1 invalid state 28 * NULL 1 lock is free and has waiters and the top waiter
30 * NULL 1 0 Transitional State* 29 * is going to take the lock*
31 * NULL 1 1 invalid state 30 * taskpointer 0 lock is held (fast release possible)
32 * taskpointer 0 0 lock is held (fast release possible) 31 * taskpointer 1 lock is held and has waiters**
33 * taskpointer 0 1 task is pending owner
34 * taskpointer 1 0 lock is held and has waiters
35 * taskpointer 1 1 task is pending owner and lock has more waiters
36 *
37 * Pending ownership is assigned to the top (highest priority)
38 * waiter of the lock, when the lock is released. The thread is woken
39 * up and can now take the lock. Until the lock is taken (bit 0
40 * cleared) a competing higher priority thread can steal the lock
41 * which puts the woken up thread back on the waiters list.
42 * 32 *
43 * The fast atomic compare exchange based acquire and release is only 33 * The fast atomic compare exchange based acquire and release is only
44 * possible when bit 0 and 1 of lock->owner are 0. 34 * possible when bit 0 of lock->owner is 0.
35 *
36 * (*) It also can be a transitional state when grabbing the lock
37 * with ->wait_lock is held. To prevent any fast path cmpxchg to the lock,
38 * we need to set the bit0 before looking at the lock, and the owner may be
39 * NULL in this small time, hence this can be a transitional state.
45 * 40 *
46 * (*) There's a small time where the owner can be NULL and the 41 * (**) There is a small time when bit 0 is set but there are no
47 * "lock has waiters" bit is set. This can happen when grabbing the lock. 42 * waiters. This can happen when grabbing the lock in the slow path.
48 * To prevent a cmpxchg of the owner releasing the lock, we need to set this 43 * To prevent a cmpxchg of the owner releasing the lock, we need to
49 * bit before looking at the lock, hence the reason this is a transitional 44 * set this bit before looking at the lock.
50 * state.
51 */ 45 */
52 46
53static void 47static void
54rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner, 48rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner)
55 unsigned long mask)
56{ 49{
57 unsigned long val = (unsigned long)owner | mask; 50 unsigned long val = (unsigned long)owner;
58 51
59 if (rt_mutex_has_waiters(lock)) 52 if (rt_mutex_has_waiters(lock))
60 val |= RT_MUTEX_HAS_WAITERS; 53 val |= RT_MUTEX_HAS_WAITERS;
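
With pending ownership gone, lock->owner carries only the task pointer plus a single flag: bit 0 (RT_MUTEX_HAS_WAITERS) marks a non-empty wait list, and the cmpxchg fast paths are only allowed while the whole word is unflagged. A hedged sketch of how such a pointer-plus-flag encoding is decoded, mirroring rt_mutex_owner() in rtmutex_common.h further down; the EXAMPLE_* names are made up:

/* Illustration of the owner-word encoding described above. */
#define EXAMPLE_HAS_WAITERS	1UL
#define EXAMPLE_MASKALL		1UL

static inline struct task_struct *example_owner(unsigned long owner_word)
{
	/* strip the flag bit to recover the task pointer (or NULL) */
	return (struct task_struct *)(owner_word & ~EXAMPLE_MASKALL);
}

static inline bool example_fast_acquire_possible(unsigned long owner_word)
{
	/* free lock with no waiters: the whole word is 0 */
	return owner_word == 0;
}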
@@ -203,15 +196,14 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
203 * reached or the state of the chain has changed while we 196 * reached or the state of the chain has changed while we
204 * dropped the locks. 197 * dropped the locks.
205 */ 198 */
206 if (!waiter || !waiter->task) 199 if (!waiter)
207 goto out_unlock_pi; 200 goto out_unlock_pi;
208 201
209 /* 202 /*
210 * Check the orig_waiter state. After we dropped the locks, 203 * Check the orig_waiter state. After we dropped the locks,
211 * the previous owner of the lock might have released the lock 204 * the previous owner of the lock might have released the lock.
212 * and made us the pending owner:
213 */ 205 */
214 if (orig_waiter && !orig_waiter->task) 206 if (orig_waiter && !rt_mutex_owner(orig_lock))
215 goto out_unlock_pi; 207 goto out_unlock_pi;
216 208
217 /* 209 /*
@@ -254,6 +246,17 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
254 246
255 /* Release the task */ 247 /* Release the task */
256 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 248 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
249 if (!rt_mutex_owner(lock)) {
250 /*
251 * If the requeue above changed the top waiter, then we need
252 * to wake the new top waiter up to try to get the lock.
253 */
254
255 if (top_waiter != rt_mutex_top_waiter(lock))
256 wake_up_process(rt_mutex_top_waiter(lock)->task);
257 raw_spin_unlock(&lock->wait_lock);
258 goto out_put_task;
259 }
257 put_task_struct(task); 260 put_task_struct(task);
258 261
259 /* Grab the next task */ 262 /* Grab the next task */
@@ -296,78 +299,16 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
296} 299}
297 300
298/* 301/*
299 * Optimization: check if we can steal the lock from the
300 * assigned pending owner [which might not have taken the
301 * lock yet]:
302 */
303static inline int try_to_steal_lock(struct rt_mutex *lock,
304 struct task_struct *task)
305{
306 struct task_struct *pendowner = rt_mutex_owner(lock);
307 struct rt_mutex_waiter *next;
308 unsigned long flags;
309
310 if (!rt_mutex_owner_pending(lock))
311 return 0;
312
313 if (pendowner == task)
314 return 1;
315
316 raw_spin_lock_irqsave(&pendowner->pi_lock, flags);
317 if (task->prio >= pendowner->prio) {
318 raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
319 return 0;
320 }
321
322 /*
323 * Check if a waiter is enqueued on the pending owners
324 * pi_waiters list. Remove it and readjust pending owners
325 * priority.
326 */
327 if (likely(!rt_mutex_has_waiters(lock))) {
328 raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
329 return 1;
330 }
331
332 /* No chain handling, pending owner is not blocked on anything: */
333 next = rt_mutex_top_waiter(lock);
334 plist_del(&next->pi_list_entry, &pendowner->pi_waiters);
335 __rt_mutex_adjust_prio(pendowner);
336 raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
337
338 /*
339 * We are going to steal the lock and a waiter was
340 * enqueued on the pending owners pi_waiters queue. So
341 * we have to enqueue this waiter into
342 * task->pi_waiters list. This covers the case,
343 * where task is boosted because it holds another
344 * lock and gets unboosted because the booster is
345 * interrupted, so we would delay a waiter with higher
346 * priority as task->normal_prio.
347 *
348 * Note: in the rare case of a SCHED_OTHER task changing
349 * its priority and thus stealing the lock, next->task
350 * might be task:
351 */
352 if (likely(next->task != task)) {
353 raw_spin_lock_irqsave(&task->pi_lock, flags);
354 plist_add(&next->pi_list_entry, &task->pi_waiters);
355 __rt_mutex_adjust_prio(task);
356 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
357 }
358 return 1;
359}
360
361/*
362 * Try to take an rt-mutex 302 * Try to take an rt-mutex
363 * 303 *
364 * This fails
365 * - when the lock has a real owner
366 * - when a different pending owner exists and has higher priority than current
367 *
368 * Must be called with lock->wait_lock held. 304 * Must be called with lock->wait_lock held.
305 *
306 * @lock: the lock to be acquired.
307 * @task: the task which wants to acquire the lock
308 * @waiter: the waiter that is queued to the lock's wait list. (could be NULL)
369 */ 309 */
370static int try_to_take_rt_mutex(struct rt_mutex *lock) 310static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
311 struct rt_mutex_waiter *waiter)
371{ 312{
372 /* 313 /*
373 * We have to be careful here if the atomic speedups are 314 * We have to be careful here if the atomic speedups are
@@ -390,15 +331,52 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock)
390 */ 331 */
391 mark_rt_mutex_waiters(lock); 332 mark_rt_mutex_waiters(lock);
392 333
393 if (rt_mutex_owner(lock) && !try_to_steal_lock(lock, current)) 334 if (rt_mutex_owner(lock))
394 return 0; 335 return 0;
395 336
337 /*
338 * It will get the lock because of one of these conditions:
339 * 1) there is no waiter
340 * 2) higher priority than waiters
341 * 3) it is top waiter
342 */
343 if (rt_mutex_has_waiters(lock)) {
344 if (task->prio >= rt_mutex_top_waiter(lock)->list_entry.prio) {
345 if (!waiter || waiter != rt_mutex_top_waiter(lock))
346 return 0;
347 }
348 }
349
350 if (waiter || rt_mutex_has_waiters(lock)) {
351 unsigned long flags;
352 struct rt_mutex_waiter *top;
353
354 raw_spin_lock_irqsave(&task->pi_lock, flags);
355
356 /* remove the queued waiter. */
357 if (waiter) {
358 plist_del(&waiter->list_entry, &lock->wait_list);
359 task->pi_blocked_on = NULL;
360 }
361
362 /*
363 * We have to enqueue the top waiter(if it exists) into
364 * task->pi_waiters list.
365 */
366 if (rt_mutex_has_waiters(lock)) {
367 top = rt_mutex_top_waiter(lock);
368 top->pi_list_entry.prio = top->list_entry.prio;
369 plist_add(&top->pi_list_entry, &task->pi_waiters);
370 }
371 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
372 }
373
396 /* We got the lock. */ 374 /* We got the lock. */
397 debug_rt_mutex_lock(lock); 375 debug_rt_mutex_lock(lock);
398 376
399 rt_mutex_set_owner(lock, current, 0); 377 rt_mutex_set_owner(lock, task);
400 378
401 rt_mutex_deadlock_account_lock(lock, current); 379 rt_mutex_deadlock_account_lock(lock, task);
402 380
403 return 1; 381 return 1;
404} 382}
@@ -436,6 +414,9 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
436 414
437 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 415 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
438 416
417 if (!owner)
418 return 0;
419
439 if (waiter == rt_mutex_top_waiter(lock)) { 420 if (waiter == rt_mutex_top_waiter(lock)) {
440 raw_spin_lock_irqsave(&owner->pi_lock, flags); 421 raw_spin_lock_irqsave(&owner->pi_lock, flags);
441 plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); 422 plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters);
@@ -472,21 +453,18 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
472/* 453/*
473 * Wake up the next waiter on the lock. 454 * Wake up the next waiter on the lock.
474 * 455 *
475 * Remove the top waiter from the current tasks waiter list and from 456 * Remove the top waiter from the current tasks waiter list and wake it up.
476 * the lock waiter list. Set it as pending owner. Then wake it up.
477 * 457 *
478 * Called with lock->wait_lock held. 458 * Called with lock->wait_lock held.
479 */ 459 */
480static void wakeup_next_waiter(struct rt_mutex *lock) 460static void wakeup_next_waiter(struct rt_mutex *lock)
481{ 461{
482 struct rt_mutex_waiter *waiter; 462 struct rt_mutex_waiter *waiter;
483 struct task_struct *pendowner;
484 unsigned long flags; 463 unsigned long flags;
485 464
486 raw_spin_lock_irqsave(&current->pi_lock, flags); 465 raw_spin_lock_irqsave(&current->pi_lock, flags);
487 466
488 waiter = rt_mutex_top_waiter(lock); 467 waiter = rt_mutex_top_waiter(lock);
489 plist_del(&waiter->list_entry, &lock->wait_list);
490 468
491 /* 469 /*
492 * Remove it from current->pi_waiters. We do not adjust a 470 * Remove it from current->pi_waiters. We do not adjust a
@@ -495,43 +473,19 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
495 * lock->wait_lock. 473 * lock->wait_lock.
496 */ 474 */
497 plist_del(&waiter->pi_list_entry, &current->pi_waiters); 475 plist_del(&waiter->pi_list_entry, &current->pi_waiters);
498 pendowner = waiter->task;
499 waiter->task = NULL;
500 476
501 rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING); 477 rt_mutex_set_owner(lock, NULL);
502 478
503 raw_spin_unlock_irqrestore(&current->pi_lock, flags); 479 raw_spin_unlock_irqrestore(&current->pi_lock, flags);
504 480
505 /* 481 wake_up_process(waiter->task);
506 * Clear the pi_blocked_on variable and enqueue a possible
507 * waiter into the pi_waiters list of the pending owner. This
508 * prevents that in case the pending owner gets unboosted a
509 * waiter with higher priority than pending-owner->normal_prio
510 * is blocked on the unboosted (pending) owner.
511 */
512 raw_spin_lock_irqsave(&pendowner->pi_lock, flags);
513
514 WARN_ON(!pendowner->pi_blocked_on);
515 WARN_ON(pendowner->pi_blocked_on != waiter);
516 WARN_ON(pendowner->pi_blocked_on->lock != lock);
517
518 pendowner->pi_blocked_on = NULL;
519
520 if (rt_mutex_has_waiters(lock)) {
521 struct rt_mutex_waiter *next;
522
523 next = rt_mutex_top_waiter(lock);
524 plist_add(&next->pi_list_entry, &pendowner->pi_waiters);
525 }
526 raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
527
528 wake_up_process(pendowner);
529} 482}
530 483
531/* 484/*
532 * Remove a waiter from a lock 485 * Remove a waiter from a lock and give up
533 * 486 *
534 * Must be called with lock->wait_lock held 487 * Must be called with lock->wait_lock held and
488 * have just failed to try_to_take_rt_mutex().
535 */ 489 */
536static void remove_waiter(struct rt_mutex *lock, 490static void remove_waiter(struct rt_mutex *lock,
537 struct rt_mutex_waiter *waiter) 491 struct rt_mutex_waiter *waiter)
@@ -543,11 +497,13 @@ static void remove_waiter(struct rt_mutex *lock,
543 497
544 raw_spin_lock_irqsave(&current->pi_lock, flags); 498 raw_spin_lock_irqsave(&current->pi_lock, flags);
545 plist_del(&waiter->list_entry, &lock->wait_list); 499 plist_del(&waiter->list_entry, &lock->wait_list);
546 waiter->task = NULL;
547 current->pi_blocked_on = NULL; 500 current->pi_blocked_on = NULL;
548 raw_spin_unlock_irqrestore(&current->pi_lock, flags); 501 raw_spin_unlock_irqrestore(&current->pi_lock, flags);
549 502
550 if (first && owner != current) { 503 if (!owner)
504 return;
505
506 if (first) {
551 507
552 raw_spin_lock_irqsave(&owner->pi_lock, flags); 508 raw_spin_lock_irqsave(&owner->pi_lock, flags);
553 509
@@ -614,21 +570,19 @@ void rt_mutex_adjust_pi(struct task_struct *task)
614 * or TASK_UNINTERRUPTIBLE) 570 * or TASK_UNINTERRUPTIBLE)
615 * @timeout: the pre-initialized and started timer, or NULL for none 571 * @timeout: the pre-initialized and started timer, or NULL for none
616 * @waiter: the pre-initialized rt_mutex_waiter 572 * @waiter: the pre-initialized rt_mutex_waiter
617 * @detect_deadlock: passed to task_blocks_on_rt_mutex
618 * 573 *
619 * lock->wait_lock must be held by the caller. 574 * lock->wait_lock must be held by the caller.
620 */ 575 */
621static int __sched 576static int __sched
622__rt_mutex_slowlock(struct rt_mutex *lock, int state, 577__rt_mutex_slowlock(struct rt_mutex *lock, int state,
623 struct hrtimer_sleeper *timeout, 578 struct hrtimer_sleeper *timeout,
624 struct rt_mutex_waiter *waiter, 579 struct rt_mutex_waiter *waiter)
625 int detect_deadlock)
626{ 580{
627 int ret = 0; 581 int ret = 0;
628 582
629 for (;;) { 583 for (;;) {
630 /* Try to acquire the lock: */ 584 /* Try to acquire the lock: */
631 if (try_to_take_rt_mutex(lock)) 585 if (try_to_take_rt_mutex(lock, current, waiter))
632 break; 586 break;
633 587
634 /* 588 /*
@@ -645,39 +599,11 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
645 break; 599 break;
646 } 600 }
647 601
648 /*
649 * waiter->task is NULL the first time we come here and
650 * when we have been woken up by the previous owner
651 * but the lock got stolen by a higher prio task.
652 */
653 if (!waiter->task) {
654 ret = task_blocks_on_rt_mutex(lock, waiter, current,
655 detect_deadlock);
656 /*
657 * If we got woken up by the owner then start loop
658 * all over without going into schedule to try
659 * to get the lock now:
660 */
661 if (unlikely(!waiter->task)) {
662 /*
663 * Reset the return value. We might
664 * have returned with -EDEADLK and the
665 * owner released the lock while we
666 * were walking the pi chain.
667 */
668 ret = 0;
669 continue;
670 }
671 if (unlikely(ret))
672 break;
673 }
674
675 raw_spin_unlock(&lock->wait_lock); 602 raw_spin_unlock(&lock->wait_lock);
676 603
677 debug_rt_mutex_print_deadlock(waiter); 604 debug_rt_mutex_print_deadlock(waiter);
678 605
679 if (waiter->task) 606 schedule_rt_mutex(lock);
680 schedule_rt_mutex(lock);
681 607
682 raw_spin_lock(&lock->wait_lock); 608 raw_spin_lock(&lock->wait_lock);
683 set_current_state(state); 609 set_current_state(state);
@@ -698,12 +624,11 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
698 int ret = 0; 624 int ret = 0;
699 625
700 debug_rt_mutex_init_waiter(&waiter); 626 debug_rt_mutex_init_waiter(&waiter);
701 waiter.task = NULL;
702 627
703 raw_spin_lock(&lock->wait_lock); 628 raw_spin_lock(&lock->wait_lock);
704 629
705 /* Try to acquire the lock again: */ 630 /* Try to acquire the lock again: */
706 if (try_to_take_rt_mutex(lock)) { 631 if (try_to_take_rt_mutex(lock, current, NULL)) {
707 raw_spin_unlock(&lock->wait_lock); 632 raw_spin_unlock(&lock->wait_lock);
708 return 0; 633 return 0;
709 } 634 }
@@ -717,12 +642,14 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
717 timeout->task = NULL; 642 timeout->task = NULL;
718 } 643 }
719 644
720 ret = __rt_mutex_slowlock(lock, state, timeout, &waiter, 645 ret = task_blocks_on_rt_mutex(lock, &waiter, current, detect_deadlock);
721 detect_deadlock); 646
647 if (likely(!ret))
648 ret = __rt_mutex_slowlock(lock, state, timeout, &waiter);
722 649
723 set_current_state(TASK_RUNNING); 650 set_current_state(TASK_RUNNING);
724 651
725 if (unlikely(waiter.task)) 652 if (unlikely(ret))
726 remove_waiter(lock, &waiter); 653 remove_waiter(lock, &waiter);
727 654
728 /* 655 /*
@@ -737,14 +664,6 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
737 if (unlikely(timeout)) 664 if (unlikely(timeout))
738 hrtimer_cancel(&timeout->timer); 665 hrtimer_cancel(&timeout->timer);
739 666
740 /*
741 * Readjust priority, when we did not get the lock. We might
742 * have been the pending owner and boosted. Since we did not
743 * take the lock, the PI boost has to go.
744 */
745 if (unlikely(ret))
746 rt_mutex_adjust_prio(current);
747
748 debug_rt_mutex_free_waiter(&waiter); 667 debug_rt_mutex_free_waiter(&waiter);
749 668
750 return ret; 669 return ret;
@@ -762,7 +681,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lock)
762 681
763 if (likely(rt_mutex_owner(lock) != current)) { 682 if (likely(rt_mutex_owner(lock) != current)) {
764 683
765 ret = try_to_take_rt_mutex(lock); 684 ret = try_to_take_rt_mutex(lock, current, NULL);
766 /* 685 /*
767 * try_to_take_rt_mutex() sets the lock waiters 686 * try_to_take_rt_mutex() sets the lock waiters
768 * bit unconditionally. Clean this up. 687 * bit unconditionally. Clean this up.
@@ -992,7 +911,7 @@ void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
992{ 911{
993 __rt_mutex_init(lock, NULL); 912 __rt_mutex_init(lock, NULL);
994 debug_rt_mutex_proxy_lock(lock, proxy_owner); 913 debug_rt_mutex_proxy_lock(lock, proxy_owner);
995 rt_mutex_set_owner(lock, proxy_owner, 0); 914 rt_mutex_set_owner(lock, proxy_owner);
996 rt_mutex_deadlock_account_lock(lock, proxy_owner); 915 rt_mutex_deadlock_account_lock(lock, proxy_owner);
997} 916}
998 917
@@ -1008,7 +927,7 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock,
1008 struct task_struct *proxy_owner) 927 struct task_struct *proxy_owner)
1009{ 928{
1010 debug_rt_mutex_proxy_unlock(lock); 929 debug_rt_mutex_proxy_unlock(lock);
1011 rt_mutex_set_owner(lock, NULL, 0); 930 rt_mutex_set_owner(lock, NULL);
1012 rt_mutex_deadlock_account_unlock(proxy_owner); 931 rt_mutex_deadlock_account_unlock(proxy_owner);
1013} 932}
1014 933
@@ -1034,20 +953,14 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1034 953
1035 raw_spin_lock(&lock->wait_lock); 954 raw_spin_lock(&lock->wait_lock);
1036 955
1037 mark_rt_mutex_waiters(lock); 956 if (try_to_take_rt_mutex(lock, task, NULL)) {
1038
1039 if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) {
1040 /* We got the lock for task. */
1041 debug_rt_mutex_lock(lock);
1042 rt_mutex_set_owner(lock, task, 0);
1043 raw_spin_unlock(&lock->wait_lock); 957 raw_spin_unlock(&lock->wait_lock);
1044 rt_mutex_deadlock_account_lock(lock, task);
1045 return 1; 958 return 1;
1046 } 959 }
1047 960
1048 ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock); 961 ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock);
1049 962
1050 if (ret && !waiter->task) { 963 if (ret && !rt_mutex_owner(lock)) {
1051 /* 964 /*
1052 * Reset the return value. We might have 965 * Reset the return value. We might have
1053 * returned with -EDEADLK and the owner 966 * returned with -EDEADLK and the owner
@@ -1056,6 +969,10 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1056 */ 969 */
1057 ret = 0; 970 ret = 0;
1058 } 971 }
972
973 if (unlikely(ret))
974 remove_waiter(lock, waiter);
975
1059 raw_spin_unlock(&lock->wait_lock); 976 raw_spin_unlock(&lock->wait_lock);
1060 977
1061 debug_rt_mutex_print_deadlock(waiter); 978 debug_rt_mutex_print_deadlock(waiter);
@@ -1110,12 +1027,11 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
1110 1027
1111 set_current_state(TASK_INTERRUPTIBLE); 1028 set_current_state(TASK_INTERRUPTIBLE);
1112 1029
1113 ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, 1030 ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
1114 detect_deadlock);
1115 1031
1116 set_current_state(TASK_RUNNING); 1032 set_current_state(TASK_RUNNING);
1117 1033
1118 if (unlikely(waiter->task)) 1034 if (unlikely(ret))
1119 remove_waiter(lock, waiter); 1035 remove_waiter(lock, waiter);
1120 1036
1121 /* 1037 /*
@@ -1126,13 +1042,5 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
1126 1042
1127 raw_spin_unlock(&lock->wait_lock); 1043 raw_spin_unlock(&lock->wait_lock);
1128 1044
1129 /*
1130 * Readjust priority, when we did not get the lock. We might have been
1131 * the pending owner and boosted. Since we did not take the lock, the
1132 * PI boost has to go.
1133 */
1134 if (unlikely(ret))
1135 rt_mutex_adjust_prio(current);
1136
1137 return ret; 1045 return ret;
1138} 1046}
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h
index 97a2f81866af..53a66c85261b 100644
--- a/kernel/rtmutex_common.h
+++ b/kernel/rtmutex_common.h
@@ -91,9 +91,8 @@ task_top_pi_waiter(struct task_struct *p)
91/* 91/*
92 * lock->owner state tracking: 92 * lock->owner state tracking:
93 */ 93 */
94#define RT_MUTEX_OWNER_PENDING 1UL 94#define RT_MUTEX_HAS_WAITERS 1UL
95#define RT_MUTEX_HAS_WAITERS 2UL 95#define RT_MUTEX_OWNER_MASKALL 1UL
96#define RT_MUTEX_OWNER_MASKALL 3UL
97 96
98static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) 97static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
99{ 98{
@@ -101,17 +100,6 @@ static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
101 ((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL); 100 ((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL);
102} 101}
103 102
104static inline struct task_struct *rt_mutex_real_owner(struct rt_mutex *lock)
105{
106 return (struct task_struct *)
107 ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS);
108}
109
110static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock)
111{
112 return (unsigned long)lock->owner & RT_MUTEX_OWNER_PENDING;
113}
114
115/* 103/*
116 * PI-futex support (proxy locking functions, etc.): 104 * PI-futex support (proxy locking functions, etc.):
117 */ 105 */
diff --git a/kernel/sched.c b/kernel/sched.c
index 04949089e760..c8e40b7005c0 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -278,14 +278,12 @@ struct task_group {
278#endif 278#endif
279}; 279};
280 280
281#define root_task_group init_task_group
282
283/* task_group_lock serializes the addition/removal of task groups */ 281/* task_group_lock serializes the addition/removal of task groups */
284static DEFINE_SPINLOCK(task_group_lock); 282static DEFINE_SPINLOCK(task_group_lock);
285 283
286#ifdef CONFIG_FAIR_GROUP_SCHED 284#ifdef CONFIG_FAIR_GROUP_SCHED
287 285
288# define INIT_TASK_GROUP_LOAD NICE_0_LOAD 286# define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
289 287
290/* 288/*
291 * A weight of 0 or 1 can cause arithmetics problems. 289 * A weight of 0 or 1 can cause arithmetics problems.
@@ -298,13 +296,13 @@ static DEFINE_SPINLOCK(task_group_lock);
298#define MIN_SHARES 2 296#define MIN_SHARES 2
299#define MAX_SHARES (1UL << 18) 297#define MAX_SHARES (1UL << 18)
300 298
301static int init_task_group_load = INIT_TASK_GROUP_LOAD; 299static int root_task_group_load = ROOT_TASK_GROUP_LOAD;
302#endif 300#endif
303 301
304/* Default task group. 302/* Default task group.
305 * Every task in system belong to this group at bootup. 303 * Every task in system belong to this group at bootup.
306 */ 304 */
307struct task_group init_task_group; 305struct task_group root_task_group;
308 306
309#endif /* CONFIG_CGROUP_SCHED */ 307#endif /* CONFIG_CGROUP_SCHED */
310 308
@@ -326,7 +324,7 @@ struct cfs_rq {
326 * 'curr' points to currently running entity on this cfs_rq. 324 * 'curr' points to currently running entity on this cfs_rq.
327 * It is set to NULL otherwise (i.e when none are currently running). 325 * It is set to NULL otherwise (i.e when none are currently running).
328 */ 326 */
329 struct sched_entity *curr, *next, *last; 327 struct sched_entity *curr, *next, *last, *skip;
330 328
331 unsigned int nr_spread_over; 329 unsigned int nr_spread_over;
332 330
@@ -555,9 +553,6 @@ struct rq {
555 /* try_to_wake_up() stats */ 553 /* try_to_wake_up() stats */
556 unsigned int ttwu_count; 554 unsigned int ttwu_count;
557 unsigned int ttwu_local; 555 unsigned int ttwu_local;
558
559 /* BKL stats */
560 unsigned int bkl_count;
561#endif 556#endif
562}; 557};
563 558
@@ -743,7 +738,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
743 buf[cnt] = 0; 738 buf[cnt] = 0;
744 cmp = strstrip(buf); 739 cmp = strstrip(buf);
745 740
746 if (strncmp(buf, "NO_", 3) == 0) { 741 if (strncmp(cmp, "NO_", 3) == 0) {
747 neg = 1; 742 neg = 1;
748 cmp += 3; 743 cmp += 3;
749 } 744 }
@@ -1688,6 +1683,39 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1688 __release(rq2->lock); 1683 __release(rq2->lock);
1689} 1684}
1690 1685
1686#else /* CONFIG_SMP */
1687
1688/*
1689 * double_rq_lock - safely lock two runqueues
1690 *
1691 * Note this does not disable interrupts like task_rq_lock,
1692 * you need to do so manually before calling.
1693 */
1694static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1695 __acquires(rq1->lock)
1696 __acquires(rq2->lock)
1697{
1698 BUG_ON(!irqs_disabled());
1699 BUG_ON(rq1 != rq2);
1700 raw_spin_lock(&rq1->lock);
1701 __acquire(rq2->lock); /* Fake it out ;) */
1702}
1703
1704/*
1705 * double_rq_unlock - safely unlock two runqueues
1706 *
1707 * Note this does not restore interrupts like task_rq_unlock,
1708 * you need to do so manually after calling.
1709 */
1710static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1711 __releases(rq1->lock)
1712 __releases(rq2->lock)
1713{
1714 BUG_ON(rq1 != rq2);
1715 raw_spin_unlock(&rq1->lock);
1716 __release(rq2->lock);
1717}
1718
1691#endif 1719#endif
1692 1720
1693static void calc_load_account_idle(struct rq *this_rq); 1721static void calc_load_account_idle(struct rq *this_rq);
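The !CONFIG_SMP variants of double_rq_lock()/double_rq_unlock() added above rely on there being exactly one runqueue, so both arguments alias and taking a single lock covers "both"; the __acquire/__release lines only keep sparse's lock-balance checking happy. A minimal userspace sketch of the same idea (illustrative names, not kernel code):

#include <assert.h>
#include <pthread.h>

struct runqueue { pthread_mutex_t lock; };

static void double_rq_lock_up(struct runqueue *rq1, struct runqueue *rq2)
{
	assert(rq1 == rq2);              /* uniprocessor: a single runqueue */
	pthread_mutex_lock(&rq1->lock);
	/* nothing to take for rq2 -- it is the same object */
}

static void double_rq_unlock_up(struct runqueue *rq1, struct runqueue *rq2)
{
	assert(rq1 == rq2);
	pthread_mutex_unlock(&rq1->lock);
}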
@@ -1882,7 +1910,7 @@ void account_system_vtime(struct task_struct *curr)
1882 */ 1910 */
1883 if (hardirq_count()) 1911 if (hardirq_count())
1884 __this_cpu_add(cpu_hardirq_time, delta); 1912 __this_cpu_add(cpu_hardirq_time, delta);
1885 else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD)) 1913 else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
1886 __this_cpu_add(cpu_softirq_time, delta); 1914 __this_cpu_add(cpu_softirq_time, delta);
1887 1915
1888 irq_time_write_end(); 1916 irq_time_write_end();
@@ -1922,8 +1950,40 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
1922 sched_rt_avg_update(rq, irq_delta); 1950 sched_rt_avg_update(rq, irq_delta);
1923} 1951}
1924 1952
1953static int irqtime_account_hi_update(void)
1954{
1955 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
1956 unsigned long flags;
1957 u64 latest_ns;
1958 int ret = 0;
1959
1960 local_irq_save(flags);
1961 latest_ns = this_cpu_read(cpu_hardirq_time);
1962 if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq))
1963 ret = 1;
1964 local_irq_restore(flags);
1965 return ret;
1966}
1967
1968static int irqtime_account_si_update(void)
1969{
1970 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
1971 unsigned long flags;
1972 u64 latest_ns;
1973 int ret = 0;
1974
1975 local_irq_save(flags);
1976 latest_ns = this_cpu_read(cpu_softirq_time);
1977 if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq))
1978 ret = 1;
1979 local_irq_restore(flags);
1980 return ret;
1981}
1982
1925#else /* CONFIG_IRQ_TIME_ACCOUNTING */ 1983#else /* CONFIG_IRQ_TIME_ACCOUNTING */
1926 1984
1985#define sched_clock_irqtime (0)
1986
1927static void update_rq_clock_task(struct rq *rq, s64 delta) 1987static void update_rq_clock_task(struct rq *rq, s64 delta)
1928{ 1988{
1929 rq->clock_task += delta; 1989 rq->clock_task += delta;
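The two helpers added above each answer one question: has more hard-irq (or soft-irq) time accumulated in the per-cpu nanosecond counters than has already been folded into the corresponding cpustat field? The !CONFIG_IRQ_TIME_ACCOUNTING branch defines sched_clock_irqtime as 0 so the callers compile away. A rough userspace model of that check, with an assumed tick length:

#include <stdbool.h>
#include <stdint.h>

#define NSEC_PER_TICK 4000000ULL     /* assumes HZ=250, illustrative only */

/* true when the ns counter has run ahead of what cpustat already holds */
static bool irqtime_needs_update(uint64_t irqtime_ns, uint64_t accounted_ticks)
{
	return irqtime_ns > accounted_ticks * NSEC_PER_TICK;
}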
@@ -2027,14 +2087,14 @@ inline int task_curr(const struct task_struct *p)
2027 2087
2028static inline void check_class_changed(struct rq *rq, struct task_struct *p, 2088static inline void check_class_changed(struct rq *rq, struct task_struct *p,
2029 const struct sched_class *prev_class, 2089 const struct sched_class *prev_class,
2030 int oldprio, int running) 2090 int oldprio)
2031{ 2091{
2032 if (prev_class != p->sched_class) { 2092 if (prev_class != p->sched_class) {
2033 if (prev_class->switched_from) 2093 if (prev_class->switched_from)
2034 prev_class->switched_from(rq, p, running); 2094 prev_class->switched_from(rq, p);
2035 p->sched_class->switched_to(rq, p, running); 2095 p->sched_class->switched_to(rq, p);
2036 } else 2096 } else if (oldprio != p->prio)
2037 p->sched_class->prio_changed(rq, p, oldprio, running); 2097 p->sched_class->prio_changed(rq, p, oldprio);
2038} 2098}
2039 2099
2040static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) 2100static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
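With the reworked check_class_changed() above, the class callbacks no longer receive a "running" flag, and prio_changed fires only when the priority actually moved. A tiny stand-alone model of the new notification rule (illustrative types, not the kernel interfaces):

#include <stdio.h>

struct klass { const char *name; };

static void notify_change(const struct klass *prev, const struct klass *cur,
			  int oldprio, int newprio)
{
	if (prev != cur)
		printf("switched class %s -> %s\n", prev->name, cur->name);
	else if (oldprio != newprio)
		printf("prio changed %d -> %d\n", oldprio, newprio);
	/* no callback at all when neither class nor priority changed */
}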
@@ -2226,7 +2286,10 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2226 * yield - it could be a while. 2286 * yield - it could be a while.
2227 */ 2287 */
2228 if (unlikely(on_rq)) { 2288 if (unlikely(on_rq)) {
2229 schedule_timeout_uninterruptible(1); 2289 ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
2290
2291 set_current_state(TASK_UNINTERRUPTIBLE);
2292 schedule_hrtimeout(&to, HRTIMER_MODE_REL);
2230 continue; 2293 continue;
2231 } 2294 }
2232 2295
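wait_task_inactive() now sleeps via a relative hrtimer armed for one tick's worth of nanoseconds instead of schedule_timeout_uninterruptible(1), giving a more precise short wait than the jiffy-granular timer. A userspace sketch of "sleep roughly one tick" (the HZ value is an assumption for the example):

#include <time.h>

#define HZ 250                        /* assumed for the example */

static void sleep_about_one_tick(void)
{
	struct timespec ts = { .tv_sec = 0, .tv_nsec = 1000000000L / HZ };
	nanosleep(&ts, NULL);
}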
@@ -2267,27 +2330,6 @@ void kick_process(struct task_struct *p)
2267EXPORT_SYMBOL_GPL(kick_process); 2330EXPORT_SYMBOL_GPL(kick_process);
2268#endif /* CONFIG_SMP */ 2331#endif /* CONFIG_SMP */
2269 2332
2270/**
2271 * task_oncpu_function_call - call a function on the cpu on which a task runs
2272 * @p: the task to evaluate
2273 * @func: the function to be called
2274 * @info: the function call argument
2275 *
2276 * Calls the function @func when the task is currently running. This might
2277 * be on the current CPU, which just calls the function directly
2278 */
2279void task_oncpu_function_call(struct task_struct *p,
2280 void (*func) (void *info), void *info)
2281{
2282 int cpu;
2283
2284 preempt_disable();
2285 cpu = task_cpu(p);
2286 if (task_curr(p))
2287 smp_call_function_single(cpu, func, info, 1);
2288 preempt_enable();
2289}
2290
2291#ifdef CONFIG_SMP 2333#ifdef CONFIG_SMP
2292/* 2334/*
2293 * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held. 2335 * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held.
@@ -2507,7 +2549,7 @@ out:
2507 * try_to_wake_up_local - try to wake up a local task with rq lock held 2549 * try_to_wake_up_local - try to wake up a local task with rq lock held
2508 * @p: the thread to be awakened 2550 * @p: the thread to be awakened
2509 * 2551 *
2510 * Put @p on the run-queue if it's not alredy there. The caller must 2552 * Put @p on the run-queue if it's not already there. The caller must
2511 * ensure that this_rq() is locked, @p is bound to this_rq() and not 2553 * ensure that this_rq() is locked, @p is bound to this_rq() and not
2512 * the current task. this_rq() stays locked over invocation. 2554 * the current task. this_rq() stays locked over invocation.
2513 */ 2555 */
@@ -2568,6 +2610,7 @@ static void __sched_fork(struct task_struct *p)
2568 p->se.sum_exec_runtime = 0; 2610 p->se.sum_exec_runtime = 0;
2569 p->se.prev_sum_exec_runtime = 0; 2611 p->se.prev_sum_exec_runtime = 0;
2570 p->se.nr_migrations = 0; 2612 p->se.nr_migrations = 0;
2613 p->se.vruntime = 0;
2571 2614
2572#ifdef CONFIG_SCHEDSTATS 2615#ifdef CONFIG_SCHEDSTATS
2573 memset(&p->se.statistics, 0, sizeof(p->se.statistics)); 2616 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
@@ -2778,9 +2821,12 @@ static inline void
2778prepare_task_switch(struct rq *rq, struct task_struct *prev, 2821prepare_task_switch(struct rq *rq, struct task_struct *prev,
2779 struct task_struct *next) 2822 struct task_struct *next)
2780{ 2823{
2824 sched_info_switch(prev, next);
2825 perf_event_task_sched_out(prev, next);
2781 fire_sched_out_preempt_notifiers(prev, next); 2826 fire_sched_out_preempt_notifiers(prev, next);
2782 prepare_lock_switch(rq, next); 2827 prepare_lock_switch(rq, next);
2783 prepare_arch_switch(next); 2828 prepare_arch_switch(next);
2829 trace_sched_switch(prev, next);
2784} 2830}
2785 2831
2786/** 2832/**
@@ -2913,7 +2959,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
2913 struct mm_struct *mm, *oldmm; 2959 struct mm_struct *mm, *oldmm;
2914 2960
2915 prepare_task_switch(rq, prev, next); 2961 prepare_task_switch(rq, prev, next);
2916 trace_sched_switch(prev, next); 2962
2917 mm = next->mm; 2963 mm = next->mm;
2918 oldmm = prev->active_mm; 2964 oldmm = prev->active_mm;
2919 /* 2965 /*
@@ -3570,6 +3616,32 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
3570} 3616}
3571 3617
3572/* 3618/*
3619 * Account system cpu time to a process and desired cpustat field
3620 * @p: the process that the cpu time gets accounted to
3621 * @cputime: the cpu time spent in kernel space since the last update
3622 * @cputime_scaled: cputime scaled by cpu frequency
3623 * @target_cputime64: pointer to cpustat field that has to be updated
3624 */
3625static inline
3626void __account_system_time(struct task_struct *p, cputime_t cputime,
3627 cputime_t cputime_scaled, cputime64_t *target_cputime64)
3628{
3629 cputime64_t tmp = cputime_to_cputime64(cputime);
3630
3631 /* Add system time to process. */
3632 p->stime = cputime_add(p->stime, cputime);
3633 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
3634 account_group_system_time(p, cputime);
3635
3636 /* Add system time to cpustat. */
3637 *target_cputime64 = cputime64_add(*target_cputime64, tmp);
3638 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
3639
3640 /* Account for system time used */
3641 acct_update_integrals(p);
3642}
3643
3644/*
3573 * Account system cpu time to a process. 3645 * Account system cpu time to a process.
3574 * @p: the process that the cpu time gets accounted to 3646 * @p: the process that the cpu time gets accounted to
3575 * @hardirq_offset: the offset to subtract from hardirq_count() 3647 * @hardirq_offset: the offset to subtract from hardirq_count()
@@ -3580,36 +3652,26 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
3580 cputime_t cputime, cputime_t cputime_scaled) 3652 cputime_t cputime, cputime_t cputime_scaled)
3581{ 3653{
3582 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3654 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3583 cputime64_t tmp; 3655 cputime64_t *target_cputime64;
3584 3656
3585 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { 3657 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
3586 account_guest_time(p, cputime, cputime_scaled); 3658 account_guest_time(p, cputime, cputime_scaled);
3587 return; 3659 return;
3588 } 3660 }
3589 3661
3590 /* Add system time to process. */
3591 p->stime = cputime_add(p->stime, cputime);
3592 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
3593 account_group_system_time(p, cputime);
3594
3595 /* Add system time to cpustat. */
3596 tmp = cputime_to_cputime64(cputime);
3597 if (hardirq_count() - hardirq_offset) 3662 if (hardirq_count() - hardirq_offset)
3598 cpustat->irq = cputime64_add(cpustat->irq, tmp); 3663 target_cputime64 = &cpustat->irq;
3599 else if (in_serving_softirq()) 3664 else if (in_serving_softirq())
3600 cpustat->softirq = cputime64_add(cpustat->softirq, tmp); 3665 target_cputime64 = &cpustat->softirq;
3601 else 3666 else
3602 cpustat->system = cputime64_add(cpustat->system, tmp); 3667 target_cputime64 = &cpustat->system;
3603 3668
3604 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime); 3669 __account_system_time(p, cputime, cputime_scaled, target_cputime64);
3605
3606 /* Account for system time used */
3607 acct_update_integrals(p);
3608} 3670}
3609 3671
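The refactoring just above moves the shared bookkeeping into __account_system_time() and leaves account_system_time() to do nothing more than pick the destination cpustat counter. The resulting pattern, as a self-contained C sketch with made-up field names:

#include <stdint.h>

struct cpu_stat { uint64_t irq, softirq, system; };

static void charge(uint64_t *target, uint64_t delta)
{
	*target += delta;                 /* shared bookkeeping lives here */
}

static void account_system(struct cpu_stat *st, int in_hardirq,
			   int in_softirq, uint64_t delta)
{
	uint64_t *target;

	if (in_hardirq)
		target = &st->irq;
	else if (in_softirq)
		target = &st->softirq;
	else
		target = &st->system;

	charge(target, delta);
}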
3610/* 3672/*
3611 * Account for involuntary wait time. 3673 * Account for involuntary wait time.
3612 * @steal: the cpu time spent in involuntary wait 3674 * @cputime: the cpu time spent in involuntary wait
3613 */ 3675 */
3614void account_steal_time(cputime_t cputime) 3676void account_steal_time(cputime_t cputime)
3615{ 3677{
@@ -3637,6 +3699,73 @@ void account_idle_time(cputime_t cputime)
3637 3699
3638#ifndef CONFIG_VIRT_CPU_ACCOUNTING 3700#ifndef CONFIG_VIRT_CPU_ACCOUNTING
3639 3701
3702#ifdef CONFIG_IRQ_TIME_ACCOUNTING
3703/*
3704 * Account a tick to a process and cpustat
3705 * @p: the process that the cpu time gets accounted to
3706 * @user_tick: is the tick from userspace
3707 * @rq: the pointer to rq
3708 *
3709 * Tick demultiplexing follows the order
3710 * - pending hardirq update
3711 * - pending softirq update
3712 * - user_time
3713 * - idle_time
3714 * - system time
3715 * - check for guest_time
3716 * - else account as system_time
3717 *
3718 * Check for hardirq is done both for system and user time as there is
3719 * no timer going off while we are on hardirq and hence we may never get an
3720 * opportunity to update it solely in system time.
3721 * p->stime and friends are only updated on system time and not on irq
3722 * softirq as those do not count in task exec_runtime any more.
3723 */
3724static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
3725 struct rq *rq)
3726{
3727 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
3728 cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy);
3729 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3730
3731 if (irqtime_account_hi_update()) {
3732 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3733 } else if (irqtime_account_si_update()) {
3734 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3735 } else if (this_cpu_ksoftirqd() == p) {
3736 /*
3737 * ksoftirqd time do not get accounted in cpu_softirq_time.
3738 * So, we have to handle it separately here.
3739 * Also, p->stime needs to be updated for ksoftirqd.
3740 */
3741 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
3742 &cpustat->softirq);
3743 } else if (user_tick) {
3744 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
3745 } else if (p == rq->idle) {
3746 account_idle_time(cputime_one_jiffy);
3747 } else if (p->flags & PF_VCPU) { /* System time or guest time */
3748 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
3749 } else {
3750 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
3751 &cpustat->system);
3752 }
3753}
3754
3755static void irqtime_account_idle_ticks(int ticks)
3756{
3757 int i;
3758 struct rq *rq = this_rq();
3759
3760 for (i = 0; i < ticks; i++)
3761 irqtime_account_process_tick(current, 0, rq);
3762}
3763#else /* CONFIG_IRQ_TIME_ACCOUNTING */
3764static void irqtime_account_idle_ticks(int ticks) {}
3765static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
3766 struct rq *rq) {}
3767#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
3768
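The comment block above spells out the demultiplexing order for a tick when precise irq-time accounting is enabled, and irqtime_account_process_tick() simply walks that order top to bottom. Condensed into a plain C classifier (names are illustrative, not kernel APIs):

enum tick_target { T_IRQ, T_SOFTIRQ, T_KSOFTIRQD, T_USER, T_IDLE, T_GUEST, T_SYSTEM };

static enum tick_target classify_tick(int hi_pending, int si_pending,
				      int is_ksoftirqd, int user_tick,
				      int is_idle, int is_guest)
{
	if (hi_pending)    return T_IRQ;        /* pending hardirq time first  */
	if (si_pending)    return T_SOFTIRQ;    /* then pending softirq time   */
	if (is_ksoftirqd)  return T_KSOFTIRQD;  /* ksoftirqd charged separately */
	if (user_tick)     return T_USER;
	if (is_idle)       return T_IDLE;
	if (is_guest)      return T_GUEST;
	return T_SYSTEM;
}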
3640/* 3769/*
3641 * Account a single tick of cpu time. 3770 * Account a single tick of cpu time.
3642 * @p: the process that the cpu time gets accounted to 3771 * @p: the process that the cpu time gets accounted to
@@ -3647,6 +3776,11 @@ void account_process_tick(struct task_struct *p, int user_tick)
3647 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); 3776 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
3648 struct rq *rq = this_rq(); 3777 struct rq *rq = this_rq();
3649 3778
3779 if (sched_clock_irqtime) {
3780 irqtime_account_process_tick(p, user_tick, rq);
3781 return;
3782 }
3783
3650 if (user_tick) 3784 if (user_tick)
3651 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); 3785 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
3652 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) 3786 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
@@ -3672,6 +3806,12 @@ void account_steal_ticks(unsigned long ticks)
3672 */ 3806 */
3673void account_idle_ticks(unsigned long ticks) 3807void account_idle_ticks(unsigned long ticks)
3674{ 3808{
3809
3810 if (sched_clock_irqtime) {
3811 irqtime_account_idle_ticks(ticks);
3812 return;
3813 }
3814
3675 account_idle_time(jiffies_to_cputime(ticks)); 3815 account_idle_time(jiffies_to_cputime(ticks));
3676} 3816}
3677 3817
@@ -3889,7 +4029,7 @@ static inline void schedule_debug(struct task_struct *prev)
3889 schedstat_inc(this_rq(), sched_count); 4029 schedstat_inc(this_rq(), sched_count);
3890#ifdef CONFIG_SCHEDSTATS 4030#ifdef CONFIG_SCHEDSTATS
3891 if (unlikely(prev->lock_depth >= 0)) { 4031 if (unlikely(prev->lock_depth >= 0)) {
3892 schedstat_inc(this_rq(), bkl_count); 4032 schedstat_inc(this_rq(), rq_sched_info.bkl_count);
3893 schedstat_inc(prev, sched_info.bkl_count); 4033 schedstat_inc(prev, sched_info.bkl_count);
3894 } 4034 }
3895#endif 4035#endif
@@ -3991,9 +4131,6 @@ need_resched_nonpreemptible:
3991 rq->skip_clock_update = 0; 4131 rq->skip_clock_update = 0;
3992 4132
3993 if (likely(prev != next)) { 4133 if (likely(prev != next)) {
3994 sched_info_switch(prev, next);
3995 perf_event_task_sched_out(prev, next);
3996
3997 rq->nr_switches++; 4134 rq->nr_switches++;
3998 rq->curr = next; 4135 rq->curr = next;
3999 ++*switch_count; 4136 ++*switch_count;
@@ -4215,6 +4352,7 @@ void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
4215{ 4352{
4216 __wake_up_common(q, mode, 1, 0, key); 4353 __wake_up_common(q, mode, 1, 0, key);
4217} 4354}
4355EXPORT_SYMBOL_GPL(__wake_up_locked_key);
4218 4356
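The new EXPORT_SYMBOL_GPL lets modules issue a keyed wakeup while they already hold the waitqueue lock. A hypothetical module-side use, assuming only the standard waitqueue primitives:

#include <linux/sched.h>
#include <linux/wait.h>

static void wake_one_locked(wait_queue_head_t *q, void *key)
{
	unsigned long flags;

	spin_lock_irqsave(&q->lock, flags);
	/* ... state update that must be atomic with the wakeup ... */
	__wake_up_locked_key(q, TASK_NORMAL, key);
	spin_unlock_irqrestore(&q->lock, flags);
}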
4219/** 4357/**
4220 * __wake_up_sync_key - wake up threads blocked on a waitqueue. 4358 * __wake_up_sync_key - wake up threads blocked on a waitqueue.
@@ -4572,11 +4710,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4572 4710
4573 if (running) 4711 if (running)
4574 p->sched_class->set_curr_task(rq); 4712 p->sched_class->set_curr_task(rq);
4575 if (on_rq) { 4713 if (on_rq)
4576 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); 4714 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
4577 4715
4578 check_class_changed(rq, p, prev_class, oldprio, running); 4716 check_class_changed(rq, p, prev_class, oldprio);
4579 }
4580 task_rq_unlock(rq, &flags); 4717 task_rq_unlock(rq, &flags);
4581} 4718}
4582 4719
@@ -4824,12 +4961,15 @@ recheck:
4824 param->sched_priority > rlim_rtprio) 4961 param->sched_priority > rlim_rtprio)
4825 return -EPERM; 4962 return -EPERM;
4826 } 4963 }
4964
4827 /* 4965 /*
4828 * Like positive nice levels, dont allow tasks to 4966 * Treat SCHED_IDLE as nice 20. Only allow a switch to
4829 * move out of SCHED_IDLE either: 4967 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
4830 */ 4968 */
4831 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) 4969 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
4832 return -EPERM; 4970 if (!can_nice(p, TASK_NICE(p)))
4971 return -EPERM;
4972 }
4833 4973
4834 /* can't change other user's priorities */ 4974 /* can't change other user's priorities */
4835 if (!check_same_owner(p)) 4975 if (!check_same_owner(p))
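The replacement above relaxes the old blanket ban on leaving SCHED_IDLE: the switch is treated like asking for the task's current nice level, so it succeeds only if RLIMIT_NICE (or CAP_SYS_NICE, not shown) would permit it. A userspace approximation of the can_nice() test; the 20-minus-nice encoding of RLIMIT_NICE is assumed:

#include <sys/resource.h>

static int may_leave_sched_idle(int current_nice)
{
	struct rlimit rl;

	if (getrlimit(RLIMIT_NICE, &rl) != 0)
		return 0;
	/* RLIMIT_NICE stores 20 - nice, so compare in that space */
	return (20 - current_nice) <= (int)rl.rlim_cur;
}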
@@ -4873,7 +5013,8 @@ recheck:
4873 * assigned. 5013 * assigned.
4874 */ 5014 */
4875 if (rt_bandwidth_enabled() && rt_policy(policy) && 5015 if (rt_bandwidth_enabled() && rt_policy(policy) &&
4876 task_group(p)->rt_bandwidth.rt_runtime == 0) { 5016 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
5017 !task_group_is_autogroup(task_group(p))) {
4877 __task_rq_unlock(rq); 5018 __task_rq_unlock(rq);
4878 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 5019 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4879 return -EPERM; 5020 return -EPERM;
@@ -4903,11 +5044,10 @@ recheck:
4903 5044
4904 if (running) 5045 if (running)
4905 p->sched_class->set_curr_task(rq); 5046 p->sched_class->set_curr_task(rq);
4906 if (on_rq) { 5047 if (on_rq)
4907 activate_task(rq, p, 0); 5048 activate_task(rq, p, 0);
4908 5049
4909 check_class_changed(rq, p, prev_class, oldprio, running); 5050 check_class_changed(rq, p, prev_class, oldprio);
4910 }
4911 __task_rq_unlock(rq); 5051 __task_rq_unlock(rq);
4912 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 5052 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4913 5053
@@ -5324,6 +5464,65 @@ void __sched yield(void)
5324} 5464}
5325EXPORT_SYMBOL(yield); 5465EXPORT_SYMBOL(yield);
5326 5466
5467/**
5468 * yield_to - yield the current processor to another thread in
5469 * your thread group, or accelerate that thread toward the
5470 * processor it's on.
5471 *
5472 * It's the caller's job to ensure that the target task struct
5473 * can't go away on us before we can do any checks.
5474 *
5475 * Returns true if we indeed boosted the target task.
5476 */
5477bool __sched yield_to(struct task_struct *p, bool preempt)
5478{
5479 struct task_struct *curr = current;
5480 struct rq *rq, *p_rq;
5481 unsigned long flags;
5482 bool yielded = 0;
5483
5484 local_irq_save(flags);
5485 rq = this_rq();
5486
5487again:
5488 p_rq = task_rq(p);
5489 double_rq_lock(rq, p_rq);
5490 while (task_rq(p) != p_rq) {
5491 double_rq_unlock(rq, p_rq);
5492 goto again;
5493 }
5494
5495 if (!curr->sched_class->yield_to_task)
5496 goto out;
5497
5498 if (curr->sched_class != p->sched_class)
5499 goto out;
5500
5501 if (task_running(p_rq, p) || p->state)
5502 goto out;
5503
5504 yielded = curr->sched_class->yield_to_task(rq, p, preempt);
5505 if (yielded) {
5506 schedstat_inc(rq, yld_count);
5507 /*
5508 * Make p's CPU reschedule; pick_next_entity takes care of
5509 * fairness.
5510 */
5511 if (preempt && rq != p_rq)
5512 resched_task(p_rq->curr);
5513 }
5514
5515out:
5516 double_rq_unlock(rq, p_rq);
5517 local_irq_restore(flags);
5518
5519 if (yielded)
5520 schedule();
5521
5522 return yielded;
5523}
5524EXPORT_SYMBOL_GPL(yield_to);
5525
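yield_to() is exported so code such as a virtualization layer can hand its timeslice to a specific sibling, typically the thread holding a lock the caller is spinning on. A hypothetical caller, assuming only the interface added above:

#include <linux/sched.h>          /* yield_to(), struct task_struct */

static void boost_lock_holder(struct task_struct *holder)
{
	/* preempt=true: reschedule the holder's CPU if the yield worked */
	if (yield_to(holder, true))
		return;               /* we yielded and the holder was boosted */
	/* otherwise fall back to an ordinary spin or sleep */
}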
5327/* 5526/*
5328 * This task is about to go to sleep on IO. Increment rq->nr_iowait so 5527 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
5329 * that process accounting knows that this is a task in IO wait state. 5528 * that process accounting knows that this is a task in IO wait state.
@@ -5572,7 +5771,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5572 * The idle tasks have their own, simple scheduling class: 5771 * The idle tasks have their own, simple scheduling class:
5573 */ 5772 */
5574 idle->sched_class = &idle_sched_class; 5773 idle->sched_class = &idle_sched_class;
5575 ftrace_graph_init_task(idle); 5774 ftrace_graph_init_idle_task(idle, cpu);
5576} 5775}
5577 5776
5578/* 5777/*
@@ -7797,6 +7996,10 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
7797 INIT_LIST_HEAD(&cfs_rq->tasks); 7996 INIT_LIST_HEAD(&cfs_rq->tasks);
7798#ifdef CONFIG_FAIR_GROUP_SCHED 7997#ifdef CONFIG_FAIR_GROUP_SCHED
7799 cfs_rq->rq = rq; 7998 cfs_rq->rq = rq;
7999 /* allow initial update_cfs_load() to truncate */
8000#ifdef CONFIG_SMP
8001 cfs_rq->load_stamp = 1;
8002#endif
7800#endif 8003#endif
7801 cfs_rq->min_vruntime = (u64)(-(1LL << 20)); 8004 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
7802} 8005}
@@ -7848,7 +8051,7 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
7848 cfs_rq->tg = tg; 8051 cfs_rq->tg = tg;
7849 8052
7850 tg->se[cpu] = se; 8053 tg->se[cpu] = se;
7851 /* se could be NULL for init_task_group */ 8054 /* se could be NULL for root_task_group */
7852 if (!se) 8055 if (!se)
7853 return; 8056 return;
7854 8057
@@ -7908,18 +8111,18 @@ void __init sched_init(void)
7908 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); 8111 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
7909 8112
7910#ifdef CONFIG_FAIR_GROUP_SCHED 8113#ifdef CONFIG_FAIR_GROUP_SCHED
7911 init_task_group.se = (struct sched_entity **)ptr; 8114 root_task_group.se = (struct sched_entity **)ptr;
7912 ptr += nr_cpu_ids * sizeof(void **); 8115 ptr += nr_cpu_ids * sizeof(void **);
7913 8116
7914 init_task_group.cfs_rq = (struct cfs_rq **)ptr; 8117 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
7915 ptr += nr_cpu_ids * sizeof(void **); 8118 ptr += nr_cpu_ids * sizeof(void **);
7916 8119
7917#endif /* CONFIG_FAIR_GROUP_SCHED */ 8120#endif /* CONFIG_FAIR_GROUP_SCHED */
7918#ifdef CONFIG_RT_GROUP_SCHED 8121#ifdef CONFIG_RT_GROUP_SCHED
7919 init_task_group.rt_se = (struct sched_rt_entity **)ptr; 8122 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
7920 ptr += nr_cpu_ids * sizeof(void **); 8123 ptr += nr_cpu_ids * sizeof(void **);
7921 8124
7922 init_task_group.rt_rq = (struct rt_rq **)ptr; 8125 root_task_group.rt_rq = (struct rt_rq **)ptr;
7923 ptr += nr_cpu_ids * sizeof(void **); 8126 ptr += nr_cpu_ids * sizeof(void **);
7924 8127
7925#endif /* CONFIG_RT_GROUP_SCHED */ 8128#endif /* CONFIG_RT_GROUP_SCHED */
@@ -7939,13 +8142,13 @@ void __init sched_init(void)
7939 global_rt_period(), global_rt_runtime()); 8142 global_rt_period(), global_rt_runtime());
7940 8143
7941#ifdef CONFIG_RT_GROUP_SCHED 8144#ifdef CONFIG_RT_GROUP_SCHED
7942 init_rt_bandwidth(&init_task_group.rt_bandwidth, 8145 init_rt_bandwidth(&root_task_group.rt_bandwidth,
7943 global_rt_period(), global_rt_runtime()); 8146 global_rt_period(), global_rt_runtime());
7944#endif /* CONFIG_RT_GROUP_SCHED */ 8147#endif /* CONFIG_RT_GROUP_SCHED */
7945 8148
7946#ifdef CONFIG_CGROUP_SCHED 8149#ifdef CONFIG_CGROUP_SCHED
7947 list_add(&init_task_group.list, &task_groups); 8150 list_add(&root_task_group.list, &task_groups);
7948 INIT_LIST_HEAD(&init_task_group.children); 8151 INIT_LIST_HEAD(&root_task_group.children);
7949 autogroup_init(&init_task); 8152 autogroup_init(&init_task);
7950#endif /* CONFIG_CGROUP_SCHED */ 8153#endif /* CONFIG_CGROUP_SCHED */
7951 8154
@@ -7960,34 +8163,34 @@ void __init sched_init(void)
7960 init_cfs_rq(&rq->cfs, rq); 8163 init_cfs_rq(&rq->cfs, rq);
7961 init_rt_rq(&rq->rt, rq); 8164 init_rt_rq(&rq->rt, rq);
7962#ifdef CONFIG_FAIR_GROUP_SCHED 8165#ifdef CONFIG_FAIR_GROUP_SCHED
7963 init_task_group.shares = init_task_group_load; 8166 root_task_group.shares = root_task_group_load;
7964 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 8167 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
7965 /* 8168 /*
7966 * How much cpu bandwidth does init_task_group get? 8169 * How much cpu bandwidth does root_task_group get?
7967 * 8170 *
7968 * In case of task-groups formed thr' the cgroup filesystem, it 8171 * In case of task-groups formed thr' the cgroup filesystem, it
7969 * gets 100% of the cpu resources in the system. This overall 8172 * gets 100% of the cpu resources in the system. This overall
7970 * system cpu resource is divided among the tasks of 8173 * system cpu resource is divided among the tasks of
7971 * init_task_group and its child task-groups in a fair manner, 8174 * root_task_group and its child task-groups in a fair manner,
7972 * based on each entity's (task or task-group's) weight 8175 * based on each entity's (task or task-group's) weight
7973 * (se->load.weight). 8176 * (se->load.weight).
7974 * 8177 *
7975 * In other words, if init_task_group has 10 tasks of weight 8178 * In other words, if root_task_group has 10 tasks of weight
7976 * 1024) and two child groups A0 and A1 (of weight 1024 each), 8179 * 1024) and two child groups A0 and A1 (of weight 1024 each),
7977 * then A0's share of the cpu resource is: 8180 * then A0's share of the cpu resource is:
7978 * 8181 *
7979 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% 8182 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
7980 * 8183 *
7981 * We achieve this by letting init_task_group's tasks sit 8184 * We achieve this by letting root_task_group's tasks sit
7982 * directly in rq->cfs (i.e init_task_group->se[] = NULL). 8185 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
7983 */ 8186 */
7984 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, NULL); 8187 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
7985#endif /* CONFIG_FAIR_GROUP_SCHED */ 8188#endif /* CONFIG_FAIR_GROUP_SCHED */
7986 8189
7987 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; 8190 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
7988#ifdef CONFIG_RT_GROUP_SCHED 8191#ifdef CONFIG_RT_GROUP_SCHED
7989 INIT_LIST_HEAD(&rq->leaf_rt_rq_list); 8192 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
7990 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, NULL); 8193 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
7991#endif 8194#endif
7992 8195
7993 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 8196 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
@@ -8110,6 +8313,8 @@ EXPORT_SYMBOL(__might_sleep);
8110#ifdef CONFIG_MAGIC_SYSRQ 8313#ifdef CONFIG_MAGIC_SYSRQ
8111static void normalize_task(struct rq *rq, struct task_struct *p) 8314static void normalize_task(struct rq *rq, struct task_struct *p)
8112{ 8315{
8316 const struct sched_class *prev_class = p->sched_class;
8317 int old_prio = p->prio;
8113 int on_rq; 8318 int on_rq;
8114 8319
8115 on_rq = p->se.on_rq; 8320 on_rq = p->se.on_rq;
@@ -8120,6 +8325,8 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
8120 activate_task(rq, p, 0); 8325 activate_task(rq, p, 0);
8121 resched_task(rq->curr); 8326 resched_task(rq->curr);
8122 } 8327 }
8328
8329 check_class_changed(rq, p, prev_class, old_prio);
8123} 8330}
8124 8331
8125void normalize_rt_tasks(void) 8332void normalize_rt_tasks(void)
@@ -8379,6 +8586,7 @@ static void free_sched_group(struct task_group *tg)
8379{ 8586{
8380 free_fair_sched_group(tg); 8587 free_fair_sched_group(tg);
8381 free_rt_sched_group(tg); 8588 free_rt_sched_group(tg);
8589 autogroup_free(tg);
8382 kfree(tg); 8590 kfree(tg);
8383} 8591}
8384 8592
@@ -8510,7 +8718,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8510 /* Propagate contribution to hierarchy */ 8718 /* Propagate contribution to hierarchy */
8511 raw_spin_lock_irqsave(&rq->lock, flags); 8719 raw_spin_lock_irqsave(&rq->lock, flags);
8512 for_each_sched_entity(se) 8720 for_each_sched_entity(se)
8513 update_cfs_shares(group_cfs_rq(se), 0); 8721 update_cfs_shares(group_cfs_rq(se));
8514 raw_spin_unlock_irqrestore(&rq->lock, flags); 8722 raw_spin_unlock_irqrestore(&rq->lock, flags);
8515 } 8723 }
8516 8724
@@ -8812,7 +9020,7 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
8812 9020
8813 if (!cgrp->parent) { 9021 if (!cgrp->parent) {
8814 /* This is early initialization for the top cgroup */ 9022 /* This is early initialization for the top cgroup */
8815 return &init_task_group.css; 9023 return &root_task_group.css;
8816 } 9024 }
8817 9025
8818 parent = cgroup_tg(cgrp->parent); 9026 parent = cgroup_tg(cgrp->parent);
@@ -8883,6 +9091,21 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
8883 } 9091 }
8884} 9092}
8885 9093
9094static void
9095cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
9096 struct cgroup *old_cgrp, struct task_struct *task)
9097{
9098 /*
9099 * cgroup_exit() is called in the copy_process() failure path.
9100 * Ignore this case since the task hasn't ran yet, this avoids
9101 * trying to poke a half freed task state from generic code.
9102 */
9103 if (!(task->flags & PF_EXITING))
9104 return;
9105
9106 sched_move_task(task);
9107}
9108
8886#ifdef CONFIG_FAIR_GROUP_SCHED 9109#ifdef CONFIG_FAIR_GROUP_SCHED
8887static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, 9110static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
8888 u64 shareval) 9111 u64 shareval)
@@ -8955,6 +9178,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {
8955 .destroy = cpu_cgroup_destroy, 9178 .destroy = cpu_cgroup_destroy,
8956 .can_attach = cpu_cgroup_can_attach, 9179 .can_attach = cpu_cgroup_can_attach,
8957 .attach = cpu_cgroup_attach, 9180 .attach = cpu_cgroup_attach,
9181 .exit = cpu_cgroup_exit,
8958 .populate = cpu_cgroup_populate, 9182 .populate = cpu_cgroup_populate,
8959 .subsys_id = cpu_cgroup_subsys_id, 9183 .subsys_id = cpu_cgroup_subsys_id,
8960 .early_init = 1, 9184 .early_init = 1,
diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c
index c80fedcd476b..5946ac515602 100644
--- a/kernel/sched_autogroup.c
+++ b/kernel/sched_autogroup.c
@@ -9,10 +9,9 @@ unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
9static struct autogroup autogroup_default; 9static struct autogroup autogroup_default;
10static atomic_t autogroup_seq_nr; 10static atomic_t autogroup_seq_nr;
11 11
12static void autogroup_init(struct task_struct *init_task) 12static void __init autogroup_init(struct task_struct *init_task)
13{ 13{
14 autogroup_default.tg = &init_task_group; 14 autogroup_default.tg = &root_task_group;
15 init_task_group.autogroup = &autogroup_default;
16 kref_init(&autogroup_default.kref); 15 kref_init(&autogroup_default.kref);
17 init_rwsem(&autogroup_default.lock); 16 init_rwsem(&autogroup_default.lock);
18 init_task->signal->autogroup = &autogroup_default; 17 init_task->signal->autogroup = &autogroup_default;
@@ -27,6 +26,11 @@ static inline void autogroup_destroy(struct kref *kref)
27{ 26{
28 struct autogroup *ag = container_of(kref, struct autogroup, kref); 27 struct autogroup *ag = container_of(kref, struct autogroup, kref);
29 28
29#ifdef CONFIG_RT_GROUP_SCHED
30 /* We've redirected RT tasks to the root task group... */
31 ag->tg->rt_se = NULL;
32 ag->tg->rt_rq = NULL;
33#endif
30 sched_destroy_group(ag->tg); 34 sched_destroy_group(ag->tg);
31} 35}
32 36
@@ -55,6 +59,10 @@ static inline struct autogroup *autogroup_task_get(struct task_struct *p)
55 return ag; 59 return ag;
56} 60}
57 61
62#ifdef CONFIG_RT_GROUP_SCHED
63static void free_rt_sched_group(struct task_group *tg);
64#endif
65
58static inline struct autogroup *autogroup_create(void) 66static inline struct autogroup *autogroup_create(void)
59{ 67{
60 struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL); 68 struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL);
@@ -63,7 +71,7 @@ static inline struct autogroup *autogroup_create(void)
63 if (!ag) 71 if (!ag)
64 goto out_fail; 72 goto out_fail;
65 73
66 tg = sched_create_group(&init_task_group); 74 tg = sched_create_group(&root_task_group);
67 75
68 if (IS_ERR(tg)) 76 if (IS_ERR(tg))
69 goto out_free; 77 goto out_free;
@@ -72,6 +80,19 @@ static inline struct autogroup *autogroup_create(void)
72 init_rwsem(&ag->lock); 80 init_rwsem(&ag->lock);
73 ag->id = atomic_inc_return(&autogroup_seq_nr); 81 ag->id = atomic_inc_return(&autogroup_seq_nr);
74 ag->tg = tg; 82 ag->tg = tg;
83#ifdef CONFIG_RT_GROUP_SCHED
84 /*
85 * Autogroup RT tasks are redirected to the root task group
86 * so we don't have to move tasks around upon policy change,
87 * or flail around trying to allocate bandwidth on the fly.
88 * A bandwidth exception in __sched_setscheduler() allows
89 * the policy change to proceed. Thereafter, task_group()
90 * returns &root_task_group, so zero bandwidth is required.
91 */
92 free_rt_sched_group(tg);
93 tg->rt_se = root_task_group.rt_se;
94 tg->rt_rq = root_task_group.rt_rq;
95#endif
75 tg->autogroup = ag; 96 tg->autogroup = ag;
76 97
77 return ag; 98 return ag;
@@ -106,6 +127,11 @@ task_wants_autogroup(struct task_struct *p, struct task_group *tg)
106 return true; 127 return true;
107} 128}
108 129
130static inline bool task_group_is_autogroup(struct task_group *tg)
131{
132 return !!tg->autogroup;
133}
134
109static inline struct task_group * 135static inline struct task_group *
110autogroup_task_group(struct task_struct *p, struct task_group *tg) 136autogroup_task_group(struct task_struct *p, struct task_group *tg)
111{ 137{
@@ -134,11 +160,15 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
134 160
135 p->signal->autogroup = autogroup_kref_get(ag); 161 p->signal->autogroup = autogroup_kref_get(ag);
136 162
163 if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled))
164 goto out;
165
137 t = p; 166 t = p;
138 do { 167 do {
139 sched_move_task(t); 168 sched_move_task(t);
140 } while_each_thread(p, t); 169 } while_each_thread(p, t);
141 170
171out:
142 unlock_task_sighand(p, &flags); 172 unlock_task_sighand(p, &flags);
143 autogroup_kref_put(prev); 173 autogroup_kref_put(prev);
144} 174}
@@ -220,10 +250,14 @@ void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m)
220{ 250{
221 struct autogroup *ag = autogroup_task_get(p); 251 struct autogroup *ag = autogroup_task_get(p);
222 252
253 if (!task_group_is_autogroup(ag->tg))
254 goto out;
255
223 down_read(&ag->lock); 256 down_read(&ag->lock);
224 seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice); 257 seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice);
225 up_read(&ag->lock); 258 up_read(&ag->lock);
226 259
260out:
227 autogroup_kref_put(ag); 261 autogroup_kref_put(ag);
228} 262}
229#endif /* CONFIG_PROC_FS */ 263#endif /* CONFIG_PROC_FS */
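proc_sched_autogroup_show_task() above now prints nothing for tasks that are not in an autogroup. For reference, a userspace snippet reading the proc file it backs (path and output format as produced by the seq_printf above):

#include <stdio.h>

static void show_autogroup(int pid)
{
	char path[64], line[128];
	FILE *f;

	snprintf(path, sizeof(path), "/proc/%d/autogroup", pid);
	f = fopen(path, "r");
	if (!f)
		return;
	if (fgets(line, sizeof(line), f))
		fputs(line, stdout);      /* e.g. "/autogroup-42 nice 0" */
	fclose(f);
}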
@@ -231,6 +265,9 @@ void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m)
231#ifdef CONFIG_SCHED_DEBUG 265#ifdef CONFIG_SCHED_DEBUG
232static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) 266static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
233{ 267{
268 if (!task_group_is_autogroup(tg))
269 return 0;
270
234 return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); 271 return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
235} 272}
236#endif /* CONFIG_SCHED_DEBUG */ 273#endif /* CONFIG_SCHED_DEBUG */
diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h
index 5358e241cb20..05577055cfca 100644
--- a/kernel/sched_autogroup.h
+++ b/kernel/sched_autogroup.h
@@ -1,6 +1,11 @@
1#ifdef CONFIG_SCHED_AUTOGROUP 1#ifdef CONFIG_SCHED_AUTOGROUP
2 2
3struct autogroup { 3struct autogroup {
4 /*
5 * reference doesn't mean how many thread attach to this
6 * autogroup now. It just stands for the number of task
7 * could use this autogroup.
8 */
4 struct kref kref; 9 struct kref kref;
5 struct task_group *tg; 10 struct task_group *tg;
6 struct rw_semaphore lock; 11 struct rw_semaphore lock;
@@ -15,6 +20,10 @@ autogroup_task_group(struct task_struct *p, struct task_group *tg);
15 20
16static inline void autogroup_init(struct task_struct *init_task) { } 21static inline void autogroup_init(struct task_struct *init_task) { }
17static inline void autogroup_free(struct task_group *tg) { } 22static inline void autogroup_free(struct task_group *tg) { }
23static inline bool task_group_is_autogroup(struct task_group *tg)
24{
25 return 0;
26}
18 27
19static inline struct task_group * 28static inline struct task_group *
20autogroup_task_group(struct task_struct *p, struct task_group *tg) 29autogroup_task_group(struct task_struct *p, struct task_group *tg)
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 1dfae3d014b5..7bacd83a4158 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -16,6 +16,8 @@
16#include <linux/kallsyms.h> 16#include <linux/kallsyms.h>
17#include <linux/utsname.h> 17#include <linux/utsname.h>
18 18
19static DEFINE_SPINLOCK(sched_debug_lock);
20
19/* 21/*
20 * This allows printing both to /proc/sched_debug and 22 * This allows printing both to /proc/sched_debug and
21 * to the console 23 * to the console
@@ -86,6 +88,26 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
86} 88}
87#endif 89#endif
88 90
91#ifdef CONFIG_CGROUP_SCHED
92static char group_path[PATH_MAX];
93
94static char *task_group_path(struct task_group *tg)
95{
96 if (autogroup_path(tg, group_path, PATH_MAX))
97 return group_path;
98
99 /*
100 * May be NULL if the underlying cgroup isn't fully-created yet
101 */
102 if (!tg->css.cgroup) {
103 group_path[0] = '\0';
104 return group_path;
105 }
106 cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
107 return group_path;
108}
109#endif
110
89static void 111static void
90print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) 112print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
91{ 113{
@@ -108,6 +130,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
108 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", 130 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
109 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); 131 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
110#endif 132#endif
133#ifdef CONFIG_CGROUP_SCHED
134 SEQ_printf(m, " %s", task_group_path(task_group(p)));
135#endif
111 136
112 SEQ_printf(m, "\n"); 137 SEQ_printf(m, "\n");
113} 138}
@@ -144,13 +169,17 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
144 struct sched_entity *last; 169 struct sched_entity *last;
145 unsigned long flags; 170 unsigned long flags;
146 171
172#ifdef CONFIG_FAIR_GROUP_SCHED
173 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg));
174#else
147 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); 175 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
176#endif
148 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", 177 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
149 SPLIT_NS(cfs_rq->exec_clock)); 178 SPLIT_NS(cfs_rq->exec_clock));
150 179
151 raw_spin_lock_irqsave(&rq->lock, flags); 180 raw_spin_lock_irqsave(&rq->lock, flags);
152 if (cfs_rq->rb_leftmost) 181 if (cfs_rq->rb_leftmost)
153 MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime; 182 MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime;
154 last = __pick_last_entity(cfs_rq); 183 last = __pick_last_entity(cfs_rq);
155 if (last) 184 if (last)
156 max_vruntime = last->vruntime; 185 max_vruntime = last->vruntime;
@@ -191,7 +220,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
191 220
192void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) 221void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
193{ 222{
223#ifdef CONFIG_RT_GROUP_SCHED
224 SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg));
225#else
194 SEQ_printf(m, "\nrt_rq[%d]:\n", cpu); 226 SEQ_printf(m, "\nrt_rq[%d]:\n", cpu);
227#endif
195 228
196#define P(x) \ 229#define P(x) \
197 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x)) 230 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x))
@@ -212,6 +245,7 @@ extern __read_mostly int sched_clock_running;
212static void print_cpu(struct seq_file *m, int cpu) 245static void print_cpu(struct seq_file *m, int cpu)
213{ 246{
214 struct rq *rq = cpu_rq(cpu); 247 struct rq *rq = cpu_rq(cpu);
248 unsigned long flags;
215 249
216#ifdef CONFIG_X86 250#ifdef CONFIG_X86
217 { 251 {
@@ -262,14 +296,20 @@ static void print_cpu(struct seq_file *m, int cpu)
262 P(ttwu_count); 296 P(ttwu_count);
263 P(ttwu_local); 297 P(ttwu_local);
264 298
265 P(bkl_count); 299 SEQ_printf(m, " .%-30s: %d\n", "bkl_count",
300 rq->rq_sched_info.bkl_count);
266 301
267#undef P 302#undef P
303#undef P64
268#endif 304#endif
305 spin_lock_irqsave(&sched_debug_lock, flags);
269 print_cfs_stats(m, cpu); 306 print_cfs_stats(m, cpu);
270 print_rt_stats(m, cpu); 307 print_rt_stats(m, cpu);
271 308
309 rcu_read_lock();
272 print_rq(m, rq, cpu); 310 print_rq(m, rq, cpu);
311 rcu_read_unlock();
312 spin_unlock_irqrestore(&sched_debug_lock, flags);
273} 313}
274 314
275static const char *sched_tunable_scaling_names[] = { 315static const char *sched_tunable_scaling_names[] = {
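print_cpu() now serializes whole-CPU dumps with a private spinlock, presumably to keep concurrent readers from interleaving output and racing on the shared group_path buffer added above, and it holds RCU around print_rq(), which walks the task list. The bare pattern, as a hypothetical helper built from stock kernel primitives:

#include <linux/rcupdate.h>
#include <linux/seq_file.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(dump_lock);

static void dump_cpu(struct seq_file *m, int cpu)
{
	unsigned long flags;

	spin_lock_irqsave(&dump_lock, flags);
	seq_printf(m, "cpu#%d\n", cpu);
	/* ... per-cpu stats ... */
	rcu_read_lock();                  /* the task list is RCU protected */
	/* ... walk and print tasks ... */
	rcu_read_unlock();
	spin_unlock_irqrestore(&dump_lock, flags);
}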
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index c62ebae65cf0..3f7ec9e27ee1 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -69,14 +69,6 @@ static unsigned int sched_nr_latency = 8;
69unsigned int sysctl_sched_child_runs_first __read_mostly; 69unsigned int sysctl_sched_child_runs_first __read_mostly;
70 70
71/* 71/*
72 * sys_sched_yield() compat mode
73 *
74 * This option switches the agressive yield implementation of the
75 * old scheduler back on.
76 */
77unsigned int __read_mostly sysctl_sched_compat_yield;
78
79/*
80 * SCHED_OTHER wake-up granularity. 72 * SCHED_OTHER wake-up granularity.
81 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) 73 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
82 * 74 *
@@ -419,7 +411,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
419 rb_erase(&se->run_node, &cfs_rq->tasks_timeline); 411 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
420} 412}
421 413
422static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) 414static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
423{ 415{
424 struct rb_node *left = cfs_rq->rb_leftmost; 416 struct rb_node *left = cfs_rq->rb_leftmost;
425 417
@@ -429,6 +421,17 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
429 return rb_entry(left, struct sched_entity, run_node); 421 return rb_entry(left, struct sched_entity, run_node);
430} 422}
431 423
424static struct sched_entity *__pick_next_entity(struct sched_entity *se)
425{
426 struct rb_node *next = rb_next(&se->run_node);
427
428 if (!next)
429 return NULL;
430
431 return rb_entry(next, struct sched_entity, run_node);
432}
433
434#ifdef CONFIG_SCHED_DEBUG
432static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) 435static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
433{ 436{
434 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); 437 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
@@ -443,7 +446,6 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
443 * Scheduling class statistics methods: 446 * Scheduling class statistics methods:
444 */ 447 */
445 448
446#ifdef CONFIG_SCHED_DEBUG
447int sched_proc_update_handler(struct ctl_table *table, int write, 449int sched_proc_update_handler(struct ctl_table *table, int write,
448 void __user *buffer, size_t *lenp, 450 void __user *buffer, size_t *lenp,
449 loff_t *ppos) 451 loff_t *ppos)
@@ -540,7 +542,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
540} 542}
541 543
542static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update); 544static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
543static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta); 545static void update_cfs_shares(struct cfs_rq *cfs_rq);
544 546
545/* 547/*
546 * Update the current task's runtime statistics. Skip current tasks that 548 * Update the current task's runtime statistics. Skip current tasks that
@@ -699,7 +701,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
699 cfs_rq->nr_running--; 701 cfs_rq->nr_running--;
700} 702}
701 703
702#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED 704#ifdef CONFIG_FAIR_GROUP_SCHED
705# ifdef CONFIG_SMP
703static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq, 706static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
704 int global_update) 707 int global_update)
705{ 708{
@@ -721,10 +724,10 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
721 u64 now, delta; 724 u64 now, delta;
722 unsigned long load = cfs_rq->load.weight; 725 unsigned long load = cfs_rq->load.weight;
723 726
724 if (!cfs_rq) 727 if (cfs_rq->tg == &root_task_group)
725 return; 728 return;
726 729
727 now = rq_of(cfs_rq)->clock; 730 now = rq_of(cfs_rq)->clock_task;
728 delta = now - cfs_rq->load_stamp; 731 delta = now - cfs_rq->load_stamp;
729 732
730 /* truncate load history at 4 idle periods */ 733 /* truncate load history at 4 idle periods */
@@ -732,6 +735,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
732 now - cfs_rq->load_last > 4 * period) { 735 now - cfs_rq->load_last > 4 * period) {
733 cfs_rq->load_period = 0; 736 cfs_rq->load_period = 0;
734 cfs_rq->load_avg = 0; 737 cfs_rq->load_avg = 0;
738 delta = period - 1;
735 } 739 }
736 740
737 cfs_rq->load_stamp = now; 741 cfs_rq->load_stamp = now;
@@ -762,6 +766,49 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
762 list_del_leaf_cfs_rq(cfs_rq); 766 list_del_leaf_cfs_rq(cfs_rq);
763} 767}
764 768
769static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
770{
771 long load_weight, load, shares;
772
773 load = cfs_rq->load.weight;
774
775 load_weight = atomic_read(&tg->load_weight);
776 load_weight += load;
777 load_weight -= cfs_rq->load_contribution;
778
779 shares = (tg->shares * load);
780 if (load_weight)
781 shares /= load_weight;
782
783 if (shares < MIN_SHARES)
784 shares = MIN_SHARES;
785 if (shares > tg->shares)
786 shares = tg->shares;
787
788 return shares;
789}
790
791static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
792{
793 if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
794 update_cfs_load(cfs_rq, 0);
795 update_cfs_shares(cfs_rq);
796 }
797}
798# else /* CONFIG_SMP */
799static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
800{
801}
802
803static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
804{
805 return tg->shares;
806}
807
808static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
809{
810}
811# endif /* CONFIG_SMP */
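calc_cfs_shares() above rescales the group's total shares by this runqueue's portion of the group-wide load, substituting the fresh local weight for the possibly stale recorded contribution, then clamps the result to [MIN_SHARES, tg->shares]. The arithmetic, runnable in plain C with a small worked example:

#include <stdio.h>

#define MIN_SHARES 2L

static long calc_shares(long tg_shares, long tg_load_weight,
			long local_contribution, long local_load)
{
	long load_weight = tg_load_weight - local_contribution + local_load;
	long shares = load_weight ? (tg_shares * local_load) / load_weight
				  : tg_shares * local_load;

	if (shares < MIN_SHARES)
		shares = MIN_SHARES;
	if (shares > tg_shares)
		shares = tg_shares;
	return shares;
}

int main(void)
{
	/* group weight 1024, total group load 3072, this cpu contributes 1024 */
	printf("%ld\n", calc_shares(1024, 3072, 1024, 1024));   /* prints 341 */
	return 0;
}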
765static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, 812static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
766 unsigned long weight) 813 unsigned long weight)
767{ 814{
@@ -778,51 +825,30 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
778 account_entity_enqueue(cfs_rq, se); 825 account_entity_enqueue(cfs_rq, se);
779} 826}
780 827
781static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) 828static void update_cfs_shares(struct cfs_rq *cfs_rq)
782{ 829{
783 struct task_group *tg; 830 struct task_group *tg;
784 struct sched_entity *se; 831 struct sched_entity *se;
785 long load_weight, load, shares; 832 long shares;
786
787 if (!cfs_rq)
788 return;
789 833
790 tg = cfs_rq->tg; 834 tg = cfs_rq->tg;
791 se = tg->se[cpu_of(rq_of(cfs_rq))]; 835 se = tg->se[cpu_of(rq_of(cfs_rq))];
792 if (!se) 836 if (!se)
793 return; 837 return;
794 838#ifndef CONFIG_SMP
795 load = cfs_rq->load.weight + weight_delta; 839 if (likely(se->load.weight == tg->shares))
796 840 return;
797 load_weight = atomic_read(&tg->load_weight); 841#endif
798 load_weight -= cfs_rq->load_contribution; 842 shares = calc_cfs_shares(cfs_rq, tg);
799 load_weight += load;
800
801 shares = (tg->shares * load);
802 if (load_weight)
803 shares /= load_weight;
804
805 if (shares < MIN_SHARES)
806 shares = MIN_SHARES;
807 if (shares > tg->shares)
808 shares = tg->shares;
809 843
810 reweight_entity(cfs_rq_of(se), se, shares); 844 reweight_entity(cfs_rq_of(se), se, shares);
811} 845}
812
813static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
814{
815 if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
816 update_cfs_load(cfs_rq, 0);
817 update_cfs_shares(cfs_rq, 0);
818 }
819}
820#else /* CONFIG_FAIR_GROUP_SCHED */ 846#else /* CONFIG_FAIR_GROUP_SCHED */
821static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) 847static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
822{ 848{
823} 849}
824 850
825static inline void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) 851static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
826{ 852{
827} 853}
828 854
@@ -953,8 +979,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
953 */ 979 */
954 update_curr(cfs_rq); 980 update_curr(cfs_rq);
955 update_cfs_load(cfs_rq, 0); 981 update_cfs_load(cfs_rq, 0);
956 update_cfs_shares(cfs_rq, se->load.weight);
957 account_entity_enqueue(cfs_rq, se); 982 account_entity_enqueue(cfs_rq, se);
983 update_cfs_shares(cfs_rq);
958 984
959 if (flags & ENQUEUE_WAKEUP) { 985 if (flags & ENQUEUE_WAKEUP) {
960 place_entity(cfs_rq, se, 0); 986 place_entity(cfs_rq, se, 0);
@@ -971,19 +997,49 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
971 list_add_leaf_cfs_rq(cfs_rq); 997 list_add_leaf_cfs_rq(cfs_rq);
972} 998}
973 999
974static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) 1000static void __clear_buddies_last(struct sched_entity *se)
975{ 1001{
976 if (!se || cfs_rq->last == se) 1002 for_each_sched_entity(se) {
977 cfs_rq->last = NULL; 1003 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1004 if (cfs_rq->last == se)
1005 cfs_rq->last = NULL;
1006 else
1007 break;
1008 }
1009}
978 1010
979 if (!se || cfs_rq->next == se) 1011static void __clear_buddies_next(struct sched_entity *se)
980 cfs_rq->next = NULL; 1012{
1013 for_each_sched_entity(se) {
1014 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1015 if (cfs_rq->next == se)
1016 cfs_rq->next = NULL;
1017 else
1018 break;
1019 }
1020}
1021
1022static void __clear_buddies_skip(struct sched_entity *se)
1023{
1024 for_each_sched_entity(se) {
1025 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1026 if (cfs_rq->skip == se)
1027 cfs_rq->skip = NULL;
1028 else
1029 break;
1030 }
981} 1031}
982 1032
983static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) 1033static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
984{ 1034{
985 for_each_sched_entity(se) 1035 if (cfs_rq->last == se)
986 __clear_buddies(cfs_rq_of(se), se); 1036 __clear_buddies_last(se);
1037
1038 if (cfs_rq->next == se)
1039 __clear_buddies_next(se);
1040
1041 if (cfs_rq->skip == se)
1042 __clear_buddies_skip(se);
987} 1043}
988 1044
989static void 1045static void
@@ -1016,7 +1072,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1016 update_cfs_load(cfs_rq, 0); 1072 update_cfs_load(cfs_rq, 0);
1017 account_entity_dequeue(cfs_rq, se); 1073 account_entity_dequeue(cfs_rq, se);
1018 update_min_vruntime(cfs_rq); 1074 update_min_vruntime(cfs_rq);
1019 update_cfs_shares(cfs_rq, 0); 1075 update_cfs_shares(cfs_rq);
1020 1076
1021 /* 1077 /*
1022 * Normalize the entity after updating the min_vruntime because the 1078 * Normalize the entity after updating the min_vruntime because the
@@ -1059,9 +1115,12 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
1059 return; 1115 return;
1060 1116
1061 if (cfs_rq->nr_running > 1) { 1117 if (cfs_rq->nr_running > 1) {
1062 struct sched_entity *se = __pick_next_entity(cfs_rq); 1118 struct sched_entity *se = __pick_first_entity(cfs_rq);
1063 s64 delta = curr->vruntime - se->vruntime; 1119 s64 delta = curr->vruntime - se->vruntime;
1064 1120
1121 if (delta < 0)
1122 return;
1123
1065 if (delta > ideal_runtime) 1124 if (delta > ideal_runtime)
1066 resched_task(rq_of(cfs_rq)->curr); 1125 resched_task(rq_of(cfs_rq)->curr);
1067 } 1126 }
@@ -1100,13 +1159,27 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
1100static int 1159static int
1101wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); 1160wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
1102 1161
1162/*
1163 * Pick the next process, keeping these things in mind, in this order:
1164 * 1) keep things fair between processes/task groups
1165 * 2) pick the "next" process, since someone really wants that to run
1166 * 3) pick the "last" process, for cache locality
1167 * 4) do not run the "skip" process, if something else is available
1168 */
1103static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) 1169static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
1104{ 1170{
1105 struct sched_entity *se = __pick_next_entity(cfs_rq); 1171 struct sched_entity *se = __pick_first_entity(cfs_rq);
1106 struct sched_entity *left = se; 1172 struct sched_entity *left = se;
1107 1173
1108 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) 1174 /*
1109 se = cfs_rq->next; 1175 * Avoid running the skip buddy, if running something else can
1176 * be done without getting too unfair.
1177 */
1178 if (cfs_rq->skip == se) {
1179 struct sched_entity *second = __pick_next_entity(se);
1180 if (second && wakeup_preempt_entity(second, left) < 1)
1181 se = second;
1182 }
1110 1183
1111 /* 1184 /*
1112 * Prefer last buddy, try to return the CPU to a preempted task. 1185 * Prefer last buddy, try to return the CPU to a preempted task.
@@ -1114,6 +1187,12 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
1114 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) 1187 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
1115 se = cfs_rq->last; 1188 se = cfs_rq->last;
1116 1189
1190 /*
1191 * Someone really wants this to run. If it's not unfair, run it.
1192 */
1193 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
1194 se = cfs_rq->next;
1195
1117 clear_buddies(cfs_rq, se); 1196 clear_buddies(cfs_rq, se);
1118 1197
1119 return se; 1198 return se;
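The reworked pick_next_entity() above encodes the priority order from its new comment: start from the leftmost (fairest) entity, sidestep the skip buddy if the second-leftmost choice is not too unfair, then let the last and finally the next buddy override when they pass the same fairness test. A simplified stand-alone model, where the unfairness check stands in for wakeup_preempt_entity() and the granularity is an assumed constant:

struct entity { long long vruntime; };

#define GRAN 1000000LL                 /* assumed wakeup granularity, ns */

/* stand-in for "wakeup_preempt_entity(cand, left) < 1" */
static int fair_enough(const struct entity *cand, const struct entity *left)
{
	return cand && cand->vruntime - left->vruntime <= GRAN;
}

static const struct entity *
pick(const struct entity *left, const struct entity *second,
     const struct entity *skip, const struct entity *last,
     const struct entity *next)
{
	const struct entity *se = left;

	if (skip == se && fair_enough(second, left))
		se = second;               /* 4) avoid the skip buddy      */
	if (fair_enough(last, left))
		se = last;                 /* 3) cache-hot last buddy      */
	if (fair_enough(next, left))
		se = next;                 /* 2) someone wants this to run */
	return se;                         /* 1) else leftmost is fairest  */
}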
@@ -1254,7 +1333,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1254 struct cfs_rq *cfs_rq = cfs_rq_of(se); 1333 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1255 1334
1256 update_cfs_load(cfs_rq, 0); 1335 update_cfs_load(cfs_rq, 0);
1257 update_cfs_shares(cfs_rq, 0); 1336 update_cfs_shares(cfs_rq);
1258 } 1337 }
1259 1338
1260 hrtick_update(rq); 1339 hrtick_update(rq);
@@ -1284,58 +1363,12 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1284 struct cfs_rq *cfs_rq = cfs_rq_of(se); 1363 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1285 1364
1286 update_cfs_load(cfs_rq, 0); 1365 update_cfs_load(cfs_rq, 0);
1287 update_cfs_shares(cfs_rq, 0); 1366 update_cfs_shares(cfs_rq);
1288 } 1367 }
1289 1368
1290 hrtick_update(rq); 1369 hrtick_update(rq);
1291} 1370}
1292 1371
1293/*
1294 * sched_yield() support is very simple - we dequeue and enqueue.
1295 *
1296 * If compat_yield is turned on then we requeue to the end of the tree.
1297 */
1298static void yield_task_fair(struct rq *rq)
1299{
1300 struct task_struct *curr = rq->curr;
1301 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1302 struct sched_entity *rightmost, *se = &curr->se;
1303
1304 /*
1305 * Are we the only task in the tree?
1306 */
1307 if (unlikely(cfs_rq->nr_running == 1))
1308 return;
1309
1310 clear_buddies(cfs_rq, se);
1311
1312 if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) {
1313 update_rq_clock(rq);
1314 /*
1315 * Update run-time statistics of the 'current'.
1316 */
1317 update_curr(cfs_rq);
1318
1319 return;
1320 }
1321 /*
1322 * Find the rightmost entry in the rbtree:
1323 */
1324 rightmost = __pick_last_entity(cfs_rq);
1325 /*
1326 * Already in the rightmost position?
1327 */
1328 if (unlikely(!rightmost || entity_before(rightmost, se)))
1329 return;
1330
1331 /*
1332 * Minimally necessary key value to be last in the tree:
1333 * Upon rescheduling, sched_class::put_prev_task() will place
1334 * 'current' within the tree based on its new key value.
1335 */
1336 se->vruntime = rightmost->vruntime + 1;
1337}
1338
1339#ifdef CONFIG_SMP 1372#ifdef CONFIG_SMP
1340 1373
1341static void task_waking_fair(struct rq *rq, struct task_struct *p) 1374static void task_waking_fair(struct rq *rq, struct task_struct *p)
@@ -1362,27 +1395,27 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
1362 return wl; 1395 return wl;
1363 1396
1364 for_each_sched_entity(se) { 1397 for_each_sched_entity(se) {
1365 long S, rw, s, a, b; 1398 long lw, w;
1366 1399
1367 S = se->my_q->tg->shares; 1400 tg = se->my_q->tg;
1368 s = se->load.weight; 1401 w = se->my_q->load.weight;
1369 rw = se->my_q->load.weight;
1370 1402
1371 a = S*(rw + wl); 1403 /* use this cpu's instantaneous contribution */
1372 b = S*rw + s*wg; 1404 lw = atomic_read(&tg->load_weight);
1405 lw -= se->my_q->load_contribution;
1406 lw += w + wg;
1373 1407
1374 wl = s*(a-b); 1408 wl += w;
1375 1409
1376 if (likely(b)) 1410 if (lw > 0 && wl < lw)
1377 wl /= b; 1411 wl = (wl * tg->shares) / lw;
1412 else
1413 wl = tg->shares;
1378 1414
1379 /* 1415 /* zero point is MIN_SHARES */
1380 * Assume the group is already running and will 1416 if (wl < MIN_SHARES)
1381 * thus already be accounted for in the weight. 1417 wl = MIN_SHARES;
1382 * 1418 wl -= se->load.weight;
1383 * That is, moving shares between CPUs, does not
1384 * alter the group weight.
1385 */
1386 wg = 0; 1419 wg = 0;
1387 } 1420 }
1388 1421
@@ -1401,7 +1434,7 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
1401 1434
1402static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) 1435static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1403{ 1436{
1404 unsigned long this_load, load; 1437 s64 this_load, load;
1405 int idx, this_cpu, prev_cpu; 1438 int idx, this_cpu, prev_cpu;
1406 unsigned long tl_per_task; 1439 unsigned long tl_per_task;
1407 struct task_group *tg; 1440 struct task_group *tg;
@@ -1440,8 +1473,8 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1440 * Otherwise check if either cpus are near enough in load to allow this 1473 * Otherwise check if either cpus are near enough in load to allow this
1441 * task to be woken on this_cpu. 1474 * task to be woken on this_cpu.
1442 */ 1475 */
1443 if (this_load) { 1476 if (this_load > 0) {
1444 unsigned long this_eff_load, prev_eff_load; 1477 s64 this_eff_load, prev_eff_load;
1445 1478
1446 this_eff_load = 100; 1479 this_eff_load = 100;
1447 this_eff_load *= power_of(prev_cpu); 1480 this_eff_load *= power_of(prev_cpu);
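
The s64 switch in wake_affine() above follows from the effective_load() rework: because the old entity weight is subtracted at the end, the returned delta, and with it this_load and the scaled eff_load products, can legitimately go negative, which unsigned arithmetic would silently turn into a huge positive load. A two-line illustration of the wraparound being avoided (generic C, not scheduler code):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	int64_t load = 100, delta = -150;	/* effective_load() may now be negative */
	uint64_t as_unsigned = (uint64_t)(load + delta);

	/* the unsigned view wraps to ~1.8e19 and defeats any "> 0" style check */
	printf("signed: %lld  unsigned: %llu\n",
	       (long long)(load + delta), (unsigned long long)as_unsigned);
	return 0;
}
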
@@ -1806,6 +1839,14 @@ static void set_next_buddy(struct sched_entity *se)
1806 } 1839 }
1807} 1840}
1808 1841
1842static void set_skip_buddy(struct sched_entity *se)
1843{
1844 if (likely(task_of(se)->policy != SCHED_IDLE)) {
1845 for_each_sched_entity(se)
1846 cfs_rq_of(se)->skip = se;
1847 }
1848}
1849
1809/* 1850/*
1810 * Preempt the current task with a newly woken task if needed: 1851 * Preempt the current task with a newly woken task if needed:
1811 */ 1852 */
@@ -1829,16 +1870,18 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1829 if (test_tsk_need_resched(curr)) 1870 if (test_tsk_need_resched(curr))
1830 return; 1871 return;
1831 1872
1873 /* Idle tasks are by definition preempted by non-idle tasks. */
1874 if (unlikely(curr->policy == SCHED_IDLE) &&
1875 likely(p->policy != SCHED_IDLE))
1876 goto preempt;
1877
1832 /* 1878 /*
1833 * Batch and idle tasks do not preempt (their preemption is driven by 1879 * Batch and idle tasks do not preempt non-idle tasks (their preemption
1834 * the tick): 1880 * is driven by the tick):
1835 */ 1881 */
1836 if (unlikely(p->policy != SCHED_NORMAL)) 1882 if (unlikely(p->policy != SCHED_NORMAL))
1837 return; 1883 return;
1838 1884
1839 /* Idle tasks are by definition preempted by everybody. */
1840 if (unlikely(curr->policy == SCHED_IDLE))
1841 goto preempt;
1842 1885
1843 if (!sched_feat(WAKEUP_PREEMPT)) 1886 if (!sched_feat(WAKEUP_PREEMPT))
1844 return; 1887 return;
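
The reordering above matters because the old sequence returned early for any non-SCHED_NORMAL waker, so a SCHED_BATCH or SCHED_IDLE task could never preempt an idle current task; checking the idle-current case first restores that. A reduced model of only the policy portion of the wakeup-preemption decision (local enum values, with the later vruntime/buddy checks collapsed into the final return):

#include <stdbool.h>
#include <stdio.h>

/* local stand-ins for the scheduling policies involved */
enum policy { POL_NORMAL, POL_BATCH, POL_IDLE };

/* does a waking task p get past the policy checks and preempt curr? */
static bool policy_preempts(enum policy curr, enum policy p)
{
	/* idle tasks are preempted by any non-idle task, even batch */
	if (curr == POL_IDLE && p != POL_IDLE)
		return true;
	/* otherwise batch and idle wakers never preempt; they wait for the tick */
	if (p != POL_NORMAL)
		return false;
	return true;	/* normal vs normal: decided later by the vruntime checks */
}

int main(void)
{
	printf("%d\n", policy_preempts(POL_IDLE, POL_BATCH));	/* 1 with the new order */
	printf("%d\n", policy_preempts(POL_NORMAL, POL_BATCH));	/* 0 */
	return 0;
}
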
@@ -1904,6 +1947,51 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
1904 } 1947 }
1905} 1948}
1906 1949
1950/*
1951 * sched_yield() is very simple
1952 *
1953 * The magic of dealing with the ->skip buddy is in pick_next_entity.
1954 */
1955static void yield_task_fair(struct rq *rq)
1956{
1957 struct task_struct *curr = rq->curr;
1958 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1959 struct sched_entity *se = &curr->se;
1960
1961 /*
1962 * Are we the only task in the tree?
1963 */
1964 if (unlikely(rq->nr_running == 1))
1965 return;
1966
1967 clear_buddies(cfs_rq, se);
1968
1969 if (curr->policy != SCHED_BATCH) {
1970 update_rq_clock(rq);
1971 /*
1972 * Update run-time statistics of the 'current'.
1973 */
1974 update_curr(cfs_rq);
1975 }
1976
1977 set_skip_buddy(se);
1978}
1979
1980static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
1981{
1982 struct sched_entity *se = &p->se;
1983
1984 if (!se->on_rq)
1985 return false;
1986
1987 /* Tell the scheduler that we'd really like pse to run next. */
1988 set_next_buddy(se);
1989
1990 yield_task_fair(rq);
1991
1992 return true;
1993}
1994
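
Taken together, yield_to_task_fair() marks the target as the next buddy and then goes through yield_task_fair(), which marks the yielder as the skip buddy, so the following pick strongly favours the target without the old compat_yield trick of rewriting vruntime. A toy model of that combined effect (the fairness gating and the per-level hierarchy walk are deliberately elided):

#include <stdio.h>

struct ent { const char *name; };

struct cfs_rq { struct ent *next, *skip; };

/* directed yield: hint at the target, then mark ourselves skippable */
static void yield_to(struct cfs_rq *rq, struct ent *self, struct ent *target)
{
	rq->next = target;	/* set_next_buddy(target) */
	rq->skip = self;	/* yield_task_fair() -> set_skip_buddy(self) */
}

/* reduced pick: honour the hints, fall back to the leftmost entity */
static struct ent *pick(struct cfs_rq *rq, struct ent *leftmost)
{
	if (rq->next)
		return rq->next;
	if (rq->skip == leftmost)
		return NULL;	/* "anything but the skipper", details elided */
	return leftmost;
}

int main(void)
{
	struct ent vcpu0 = { "vcpu0" }, vcpu1 = { "vcpu1" };
	struct cfs_rq rq = { 0 };

	yield_to(&rq, &vcpu0, &vcpu1);	/* vcpu0 spins on a lock vcpu1 holds */
	printf("next to run: %s\n", pick(&rq, &vcpu0)->name);	/* -> vcpu1 */
	return 0;
}
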
1907#ifdef CONFIG_SMP 1995#ifdef CONFIG_SMP
1908/************************************************** 1996/**************************************************
1909 * Fair scheduling class load-balancing methods: 1997 * Fair scheduling class load-balancing methods:
@@ -2095,7 +2183,7 @@ static int update_shares_cpu(struct task_group *tg, int cpu)
2095 * We need to update shares after updating tg->load_weight in 2183 * We need to update shares after updating tg->load_weight in
2096 * order to adjust the weight of groups with long running tasks. 2184 * order to adjust the weight of groups with long running tasks.
2097 */ 2185 */
2098 update_cfs_shares(cfs_rq, 0); 2186 update_cfs_shares(cfs_rq);
2099 2187
2100 raw_spin_unlock_irqrestore(&rq->lock, flags); 2188 raw_spin_unlock_irqrestore(&rq->lock, flags);
2101 2189
@@ -2582,7 +2670,6 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
2582 * @this_cpu: Cpu for which load balance is currently performed. 2670 * @this_cpu: Cpu for which load balance is currently performed.
2583 * @idle: Idle status of this_cpu 2671 * @idle: Idle status of this_cpu
2584 * @load_idx: Load index of sched_domain of this_cpu for load calc. 2672 * @load_idx: Load index of sched_domain of this_cpu for load calc.
2585 * @sd_idle: Idle status of the sched_domain containing group.
2586 * @local_group: Does group contain this_cpu. 2673 * @local_group: Does group contain this_cpu.
2587 * @cpus: Set of cpus considered for load balancing. 2674 * @cpus: Set of cpus considered for load balancing.
2588 * @balance: Should we balance. 2675 * @balance: Should we balance.
@@ -2590,7 +2677,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
2590 */ 2677 */
2591static inline void update_sg_lb_stats(struct sched_domain *sd, 2678static inline void update_sg_lb_stats(struct sched_domain *sd,
2592 struct sched_group *group, int this_cpu, 2679 struct sched_group *group, int this_cpu,
2593 enum cpu_idle_type idle, int load_idx, int *sd_idle, 2680 enum cpu_idle_type idle, int load_idx,
2594 int local_group, const struct cpumask *cpus, 2681 int local_group, const struct cpumask *cpus,
2595 int *balance, struct sg_lb_stats *sgs) 2682 int *balance, struct sg_lb_stats *sgs)
2596{ 2683{
@@ -2610,9 +2697,6 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2610 for_each_cpu_and(i, sched_group_cpus(group), cpus) { 2697 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
2611 struct rq *rq = cpu_rq(i); 2698 struct rq *rq = cpu_rq(i);
2612 2699
2613 if (*sd_idle && rq->nr_running)
2614 *sd_idle = 0;
2615
2616 /* Bias balancing toward cpus of our domain */ 2700 /* Bias balancing toward cpus of our domain */
2617 if (local_group) { 2701 if (local_group) {
2618 if (idle_cpu(i) && !first_idle_cpu) { 2702 if (idle_cpu(i) && !first_idle_cpu) {
@@ -2657,7 +2741,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2657 2741
2658 /* 2742 /*
2659 * Consider the group unbalanced when the imbalance is larger 2743 * Consider the group unbalanced when the imbalance is larger
2660 * than the average weight of two tasks. 2744 * than the average weight of a task.
2661 * 2745 *
2662 * APZ: with cgroup the avg task weight can vary wildly and 2746 * APZ: with cgroup the avg task weight can vary wildly and
2663 * might not be a suitable number - should we keep a 2747 * might not be a suitable number - should we keep a
@@ -2667,7 +2751,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2667 if (sgs->sum_nr_running) 2751 if (sgs->sum_nr_running)
2668 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; 2752 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
2669 2753
2670 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task && max_nr_running > 1) 2754 if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1)
2671 sgs->group_imb = 1; 2755 sgs->group_imb = 1;
2672 2756
2673 sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); 2757 sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
@@ -2727,15 +2811,13 @@ static bool update_sd_pick_busiest(struct sched_domain *sd,
2727 * @sd: sched_domain whose statistics are to be updated. 2811 * @sd: sched_domain whose statistics are to be updated.
2728 * @this_cpu: Cpu for which load balance is currently performed. 2812 * @this_cpu: Cpu for which load balance is currently performed.
2729 * @idle: Idle status of this_cpu 2813 * @idle: Idle status of this_cpu
2730 * @sd_idle: Idle status of the sched_domain containing sg.
2731 * @cpus: Set of cpus considered for load balancing. 2814 * @cpus: Set of cpus considered for load balancing.
2732 * @balance: Should we balance. 2815 * @balance: Should we balance.
2733 * @sds: variable to hold the statistics for this sched_domain. 2816 * @sds: variable to hold the statistics for this sched_domain.
2734 */ 2817 */
2735static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, 2818static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2736 enum cpu_idle_type idle, int *sd_idle, 2819 enum cpu_idle_type idle, const struct cpumask *cpus,
2737 const struct cpumask *cpus, int *balance, 2820 int *balance, struct sd_lb_stats *sds)
2738 struct sd_lb_stats *sds)
2739{ 2821{
2740 struct sched_domain *child = sd->child; 2822 struct sched_domain *child = sd->child;
2741 struct sched_group *sg = sd->groups; 2823 struct sched_group *sg = sd->groups;
@@ -2753,7 +2835,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2753 2835
2754 local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg)); 2836 local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg));
2755 memset(&sgs, 0, sizeof(sgs)); 2837 memset(&sgs, 0, sizeof(sgs));
2756 update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle, 2838 update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx,
2757 local_group, cpus, balance, &sgs); 2839 local_group, cpus, balance, &sgs);
2758 2840
2759 if (local_group && !(*balance)) 2841 if (local_group && !(*balance))
@@ -3005,7 +3087,6 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3005 * @imbalance: Variable which stores amount of weighted load which should 3087 * @imbalance: Variable which stores amount of weighted load which should
3006 * be moved to restore balance/put a group to idle. 3088 * be moved to restore balance/put a group to idle.
3007 * @idle: The idle status of this_cpu. 3089 * @idle: The idle status of this_cpu.
3008 * @sd_idle: The idleness of sd
3009 * @cpus: The set of CPUs under consideration for load-balancing. 3090 * @cpus: The set of CPUs under consideration for load-balancing.
3010 * @balance: Pointer to a variable indicating if this_cpu 3091 * @balance: Pointer to a variable indicating if this_cpu
3011 * is the appropriate cpu to perform load balancing at this_level. 3092 * is the appropriate cpu to perform load balancing at this_level.
@@ -3018,7 +3099,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3018static struct sched_group * 3099static struct sched_group *
3019find_busiest_group(struct sched_domain *sd, int this_cpu, 3100find_busiest_group(struct sched_domain *sd, int this_cpu,
3020 unsigned long *imbalance, enum cpu_idle_type idle, 3101 unsigned long *imbalance, enum cpu_idle_type idle,
3021 int *sd_idle, const struct cpumask *cpus, int *balance) 3102 const struct cpumask *cpus, int *balance)
3022{ 3103{
3023 struct sd_lb_stats sds; 3104 struct sd_lb_stats sds;
3024 3105
@@ -3028,22 +3109,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 3028 * Compute the various statistics relevant for load balancing at 3109
3029 * this level. 3110 * this level.
3030 */ 3111 */
3031 update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus, 3112 update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds);
3032 balance, &sds); 3113
3033 3114 /*
3034 /* Cases where imbalance does not exist from POV of this_cpu */ 3115 * this_cpu is not the appropriate cpu to perform load balancing at
3035 /* 1) this_cpu is not the appropriate cpu to perform load balancing 3116 * this level.
3036 * at this level.
3037 * 2) There is no busy sibling group to pull from.
3038 * 3) This group is the busiest group.
3039 * 4) This group is more busy than the avg busieness at this
3040 * sched_domain.
3041 * 5) The imbalance is within the specified limit.
3042 *
3043 * Note: when doing newidle balance, if the local group has excess
3044 * capacity (i.e. nr_running < group_capacity) and the busiest group
3045 * does not have any capacity, we force a load balance to pull tasks
3046 * to the local group. In this case, we skip past checks 3, 4 and 5.
3047 */ 3117 */
3048 if (!(*balance)) 3118 if (!(*balance))
3049 goto ret; 3119 goto ret;
@@ -3052,41 +3122,55 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3052 check_asym_packing(sd, &sds, this_cpu, imbalance)) 3122 check_asym_packing(sd, &sds, this_cpu, imbalance))
3053 return sds.busiest; 3123 return sds.busiest;
3054 3124
3125 /* There is no busy sibling group to pull tasks from */
3055 if (!sds.busiest || sds.busiest_nr_running == 0) 3126 if (!sds.busiest || sds.busiest_nr_running == 0)
3056 goto out_balanced; 3127 goto out_balanced;
3057 3128
3058 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ 3129 /*
3130 * If the busiest group is imbalanced the below checks don't
 3131 * work because they assume all things are equal, which typically
3132 * isn't true due to cpus_allowed constraints and the like.
3133 */
3134 if (sds.group_imb)
3135 goto force_balance;
3136
3137 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
3059 if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity && 3138 if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
3060 !sds.busiest_has_capacity) 3139 !sds.busiest_has_capacity)
3061 goto force_balance; 3140 goto force_balance;
3062 3141
3142 /*
3143 * If the local group is more busy than the selected busiest group
3144 * don't try and pull any tasks.
3145 */
3063 if (sds.this_load >= sds.max_load) 3146 if (sds.this_load >= sds.max_load)
3064 goto out_balanced; 3147 goto out_balanced;
3065 3148
3149 /*
3150 * Don't pull any tasks if this group is already above the domain
3151 * average load.
3152 */
3066 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; 3153 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
3067
3068 if (sds.this_load >= sds.avg_load) 3154 if (sds.this_load >= sds.avg_load)
3069 goto out_balanced; 3155 goto out_balanced;
3070 3156
3071 /* 3157 if (idle == CPU_IDLE) {
3072 * In the CPU_NEWLY_IDLE, use imbalance_pct to be conservative.
3073 * And to check for busy balance use !idle_cpu instead of
3074 * CPU_NOT_IDLE. This is because HT siblings will use CPU_NOT_IDLE
3075 * even when they are idle.
3076 */
3077 if (idle == CPU_NEWLY_IDLE || !idle_cpu(this_cpu)) {
3078 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
3079 goto out_balanced;
3080 } else {
3081 /* 3158 /*
3082 * This cpu is idle. If the busiest group load doesn't 3159 * This cpu is idle. If the busiest group load doesn't
3083 * have more tasks than the number of available cpu's and 3160 * have more tasks than the number of available cpu's and
3084 * there is no imbalance between this and busiest group 3161 * there is no imbalance between this and busiest group
3085 * wrt to idle cpu's, it is balanced. 3162 * wrt to idle cpu's, it is balanced.
3086 */ 3163 */
3087 if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) && 3164 if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
3088 sds.busiest_nr_running <= sds.busiest_group_weight) 3165 sds.busiest_nr_running <= sds.busiest_group_weight)
3089 goto out_balanced; 3166 goto out_balanced;
3167 } else {
3168 /*
3169 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
3170 * imbalance_pct to be conservative.
3171 */
3172 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
3173 goto out_balanced;
3090 } 3174 }
3091 3175
3092force_balance: 3176force_balance:
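
With sd_idle gone, the decision chain in find_busiest_group() reads top to bottom: nothing to pull, forced balance for cpus_allowed-skewed (group_imb) or capacity-starved newly-idle cases, bail out when the local group is at or above the busiest or average load, then an idle-cpu-count test for CPU_IDLE and the imbalance_pct test otherwise. A condensed userspace model of that ordering (asym packing and the balance-cpu handoff are omitted; the stats struct is local to the example and merely echoes the sd_lb_stats field names):

#include <stdbool.h>
#include <stdio.h>

enum idle_type { CPU_IDLE, CPU_NOT_IDLE, CPU_NEWLY_IDLE };

struct stats {
	bool busiest, group_imb, this_has_capacity, busiest_has_capacity;
	long this_load, max_load, avg_load;
	int this_idle_cpus, busiest_idle_cpus;
	int busiest_nr_running, busiest_group_weight;
	int imbalance_pct;
};

/* condensed model of the reordered checks in find_busiest_group() */
static bool should_balance(const struct stats *s, enum idle_type idle)
{
	if (!s->busiest)
		return false;			/* no busy sibling group to pull from */
	if (s->group_imb)
		return true;			/* cpus_allowed skew: force the balance */
	if (idle == CPU_NEWLY_IDLE && s->this_has_capacity &&
	    !s->busiest_has_capacity)
		return true;			/* newly idle trumps SMP nice */
	if (s->this_load >= s->max_load)
		return false;			/* we are the busy one */
	if (s->this_load >= s->avg_load)
		return false;			/* already above the domain average */
	if (idle == CPU_IDLE)
		return !(s->this_idle_cpus <= s->busiest_idle_cpus + 1 &&
			 s->busiest_nr_running <= s->busiest_group_weight);
	/* CPU_NEWLY_IDLE and CPU_NOT_IDLE: be conservative via imbalance_pct */
	return 100 * s->max_load > s->imbalance_pct * s->this_load;
}

int main(void)
{
	struct stats s = { .busiest = true, .this_load = 50, .max_load = 200,
			   .avg_load = 120, .imbalance_pct = 125 };

	printf("balance: %d\n", should_balance(&s, CPU_NOT_IDLE));	/* prints 1 */
	return 0;
}
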
@@ -3165,7 +3249,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
3165/* Working cpumask for load_balance and load_balance_newidle. */ 3249/* Working cpumask for load_balance and load_balance_newidle. */
3166static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); 3250static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
3167 3251
3168static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle, 3252static int need_active_balance(struct sched_domain *sd, int idle,
3169 int busiest_cpu, int this_cpu) 3253 int busiest_cpu, int this_cpu)
3170{ 3254{
3171 if (idle == CPU_NEWLY_IDLE) { 3255 if (idle == CPU_NEWLY_IDLE) {
@@ -3197,10 +3281,6 @@ static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle,
3197 * move_tasks() will succeed. ld_moved will be true and this 3281 * move_tasks() will succeed. ld_moved will be true and this
3198 * active balance code will not be triggered. 3282 * active balance code will not be triggered.
3199 */ 3283 */
3200 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3201 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3202 return 0;
3203
3204 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP) 3284 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
3205 return 0; 3285 return 0;
3206 } 3286 }
@@ -3218,7 +3298,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
3218 struct sched_domain *sd, enum cpu_idle_type idle, 3298 struct sched_domain *sd, enum cpu_idle_type idle,
3219 int *balance) 3299 int *balance)
3220{ 3300{
3221 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; 3301 int ld_moved, all_pinned = 0, active_balance = 0;
3222 struct sched_group *group; 3302 struct sched_group *group;
3223 unsigned long imbalance; 3303 unsigned long imbalance;
3224 struct rq *busiest; 3304 struct rq *busiest;
@@ -3227,20 +3307,10 @@ static int load_balance(int this_cpu, struct rq *this_rq,
3227 3307
3228 cpumask_copy(cpus, cpu_active_mask); 3308 cpumask_copy(cpus, cpu_active_mask);
3229 3309
3230 /*
3231 * When power savings policy is enabled for the parent domain, idle
3232 * sibling can pick up load irrespective of busy siblings. In this case,
3233 * let the state of idle sibling percolate up as CPU_IDLE, instead of
3234 * portraying it as CPU_NOT_IDLE.
3235 */
3236 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
3237 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3238 sd_idle = 1;
3239
3240 schedstat_inc(sd, lb_count[idle]); 3310 schedstat_inc(sd, lb_count[idle]);
3241 3311
3242redo: 3312redo:
3243 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, 3313 group = find_busiest_group(sd, this_cpu, &imbalance, idle,
3244 cpus, balance); 3314 cpus, balance);
3245 3315
3246 if (*balance == 0) 3316 if (*balance == 0)
@@ -3302,8 +3372,7 @@ redo:
3302 if (idle != CPU_NEWLY_IDLE) 3372 if (idle != CPU_NEWLY_IDLE)
3303 sd->nr_balance_failed++; 3373 sd->nr_balance_failed++;
3304 3374
3305 if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest), 3375 if (need_active_balance(sd, idle, cpu_of(busiest), this_cpu)) {
3306 this_cpu)) {
3307 raw_spin_lock_irqsave(&busiest->lock, flags); 3376 raw_spin_lock_irqsave(&busiest->lock, flags);
3308 3377
3309 /* don't kick the active_load_balance_cpu_stop, 3378 /* don't kick the active_load_balance_cpu_stop,
@@ -3358,10 +3427,6 @@ redo:
3358 sd->balance_interval *= 2; 3427 sd->balance_interval *= 2;
3359 } 3428 }
3360 3429
3361 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3362 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3363 ld_moved = -1;
3364
3365 goto out; 3430 goto out;
3366 3431
3367out_balanced: 3432out_balanced:
@@ -3375,11 +3440,7 @@ out_one_pinned:
3375 (sd->balance_interval < sd->max_interval)) 3440 (sd->balance_interval < sd->max_interval))
3376 sd->balance_interval *= 2; 3441 sd->balance_interval *= 2;
3377 3442
3378 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 3443 ld_moved = 0;
3379 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3380 ld_moved = -1;
3381 else
3382 ld_moved = 0;
3383out: 3444out:
3384 return ld_moved; 3445 return ld_moved;
3385} 3446}
@@ -3803,8 +3864,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3803 if (load_balance(cpu, rq, sd, idle, &balance)) { 3864 if (load_balance(cpu, rq, sd, idle, &balance)) {
3804 /* 3865 /*
3805 * We've pulled tasks over so either we're no 3866 * We've pulled tasks over so either we're no
3806 * longer idle, or one of our SMT siblings is 3867 * longer idle.
3807 * not idle.
3808 */ 3868 */
3809 idle = CPU_NOT_IDLE; 3869 idle = CPU_NOT_IDLE;
3810 } 3870 }
@@ -4051,33 +4111,62 @@ static void task_fork_fair(struct task_struct *p)
4051 * Priority of the task has changed. Check to see if we preempt 4111 * Priority of the task has changed. Check to see if we preempt
4052 * the current task. 4112 * the current task.
4053 */ 4113 */
4054static void prio_changed_fair(struct rq *rq, struct task_struct *p, 4114static void
4055 int oldprio, int running) 4115prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
4056{ 4116{
4117 if (!p->se.on_rq)
4118 return;
4119
4057 /* 4120 /*
4058 * Reschedule if we are currently running on this runqueue and 4121 * Reschedule if we are currently running on this runqueue and
4059 * our priority decreased, or if we are not currently running on 4122 * our priority decreased, or if we are not currently running on
4060 * this runqueue and our priority is higher than the current's 4123 * this runqueue and our priority is higher than the current's
4061 */ 4124 */
4062 if (running) { 4125 if (rq->curr == p) {
4063 if (p->prio > oldprio) 4126 if (p->prio > oldprio)
4064 resched_task(rq->curr); 4127 resched_task(rq->curr);
4065 } else 4128 } else
4066 check_preempt_curr(rq, p, 0); 4129 check_preempt_curr(rq, p, 0);
4067} 4130}
4068 4131
4132static void switched_from_fair(struct rq *rq, struct task_struct *p)
4133{
4134 struct sched_entity *se = &p->se;
4135 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4136
4137 /*
4138 * Ensure the task's vruntime is normalized, so that when its
4139 * switched back to the fair class the enqueue_entity(.flags=0) will
4140 * do the right thing.
4141 *
4142 * If it was on_rq, then the dequeue_entity(.flags=0) will already
4143 * have normalized the vruntime, if it was !on_rq, then only when
4144 * the task is sleeping will it still have non-normalized vruntime.
4145 */
4146 if (!se->on_rq && p->state != TASK_RUNNING) {
4147 /*
4148 * Fix up our vruntime so that the current sleep doesn't
4149 * cause 'unlimited' sleep bonus.
4150 */
4151 place_entity(cfs_rq, se, 0);
4152 se->vruntime -= cfs_rq->min_vruntime;
4153 }
4154}
4155
4069/* 4156/*
4070 * We switched to the sched_fair class. 4157 * We switched to the sched_fair class.
4071 */ 4158 */
4072static void switched_to_fair(struct rq *rq, struct task_struct *p, 4159static void switched_to_fair(struct rq *rq, struct task_struct *p)
4073 int running)
4074{ 4160{
4161 if (!p->se.on_rq)
4162 return;
4163
4075 /* 4164 /*
4076 * We were most likely switched from sched_rt, so 4165 * We were most likely switched from sched_rt, so
4077 * kick off the schedule if running, otherwise just see 4166 * kick off the schedule if running, otherwise just see
4078 * if we can still preempt the current task. 4167 * if we can still preempt the current task.
4079 */ 4168 */
4080 if (running) 4169 if (rq->curr == p)
4081 resched_task(rq->curr); 4170 resched_task(rq->curr);
4082 else 4171 else
4083 check_preempt_curr(rq, p, 0); 4172 check_preempt_curr(rq, p, 0);
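
This hunk shows the sched_class interface change threaded through the rest of the series: prio_changed/switched_from/switched_to lose the running argument, each class derives it from rq->curr == p, and tasks that are not queued return early. A stripped-down model of the new prio_changed shape (toy task/rq structs, prints standing in for resched_task() and check_preempt_curr()):

#include <stdbool.h>
#include <stdio.h>

struct task { int prio; bool on_rq; };
struct rq   { struct task *curr; };

static void resched(struct rq *rq)                        { (void)rq; puts("resched current"); }
static void check_preempt(struct rq *rq, struct task *p)  { (void)rq; (void)p; puts("check preemption"); }

/* new shape: no 'running' parameter, it is derived from rq->curr */
static void prio_changed(struct rq *rq, struct task *p, int oldprio)
{
	if (!p->on_rq)
		return;			/* dequeued tasks get fixed up at enqueue time */

	if (rq->curr == p) {		/* this was the old 'running' flag */
		if (p->prio > oldprio)
			resched(rq);	/* lower priority while running: give it up */
	} else {
		check_preempt(rq, p);	/* maybe p now beats whoever is running */
	}
}

int main(void)
{
	struct task t = { .prio = 120, .on_rq = true };
	struct rq rq = { .curr = &t };

	prio_changed(&rq, &t, 110);	/* prio number rose 110 -> 120, so priority dropped */
	return 0;
}
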
@@ -4143,6 +4232,7 @@ static const struct sched_class fair_sched_class = {
4143 .enqueue_task = enqueue_task_fair, 4232 .enqueue_task = enqueue_task_fair,
4144 .dequeue_task = dequeue_task_fair, 4233 .dequeue_task = dequeue_task_fair,
4145 .yield_task = yield_task_fair, 4234 .yield_task = yield_task_fair,
4235 .yield_to_task = yield_to_task_fair,
4146 4236
4147 .check_preempt_curr = check_preempt_wakeup, 4237 .check_preempt_curr = check_preempt_wakeup,
4148 4238
@@ -4163,6 +4253,7 @@ static const struct sched_class fair_sched_class = {
4163 .task_fork = task_fork_fair, 4253 .task_fork = task_fork_fair,
4164 4254
4165 .prio_changed = prio_changed_fair, 4255 .prio_changed = prio_changed_fair,
4256 .switched_from = switched_from_fair,
4166 .switched_to = switched_to_fair, 4257 .switched_to = switched_to_fair,
4167 4258
4168 .get_rr_interval = get_rr_interval_fair, 4259 .get_rr_interval = get_rr_interval_fair,
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 9fa0f402c87c..c82f26c1b7c3 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -52,31 +52,15 @@ static void set_curr_task_idle(struct rq *rq)
52{ 52{
53} 53}
54 54
55static void switched_to_idle(struct rq *rq, struct task_struct *p, 55static void switched_to_idle(struct rq *rq, struct task_struct *p)
56 int running)
57{ 56{
58 /* Can this actually happen?? */ 57 BUG();
59 if (running)
60 resched_task(rq->curr);
61 else
62 check_preempt_curr(rq, p, 0);
63} 58}
64 59
65static void prio_changed_idle(struct rq *rq, struct task_struct *p, 60static void
66 int oldprio, int running) 61prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio)
67{ 62{
68 /* This can happen for hot plug CPUS */ 63 BUG();
69
70 /*
71 * Reschedule if we are currently running on this runqueue and
72 * our priority decreased, or if we are not currently running on
73 * this runqueue and our priority is higher than the current's
74 */
75 if (running) {
76 if (p->prio > oldprio)
77 resched_task(rq->curr);
78 } else
79 check_preempt_curr(rq, p, 0);
80} 64}
81 65
82static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) 66static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index c914ec747ca6..db308cb08b75 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -210,11 +210,12 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
210 210
211static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) 211static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
212{ 212{
213 int this_cpu = smp_processor_id();
214 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; 213 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
215 struct sched_rt_entity *rt_se; 214 struct sched_rt_entity *rt_se;
216 215
217 rt_se = rt_rq->tg->rt_se[this_cpu]; 216 int cpu = cpu_of(rq_of_rt_rq(rt_rq));
217
218 rt_se = rt_rq->tg->rt_se[cpu];
218 219
219 if (rt_rq->rt_nr_running) { 220 if (rt_rq->rt_nr_running) {
220 if (rt_se && !on_rt_rq(rt_se)) 221 if (rt_se && !on_rt_rq(rt_se))
@@ -226,10 +227,10 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
226 227
227static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) 228static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
228{ 229{
229 int this_cpu = smp_processor_id();
230 struct sched_rt_entity *rt_se; 230 struct sched_rt_entity *rt_se;
231 int cpu = cpu_of(rq_of_rt_rq(rt_rq));
231 232
232 rt_se = rt_rq->tg->rt_se[this_cpu]; 233 rt_se = rt_rq->tg->rt_se[cpu];
233 234
234 if (rt_se && on_rt_rq(rt_se)) 235 if (rt_se && on_rt_rq(rt_se))
235 dequeue_rt_entity(rt_se); 236 dequeue_rt_entity(rt_se);
@@ -565,8 +566,11 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
565 if (rt_rq->rt_time || rt_rq->rt_nr_running) 566 if (rt_rq->rt_time || rt_rq->rt_nr_running)
566 idle = 0; 567 idle = 0;
567 raw_spin_unlock(&rt_rq->rt_runtime_lock); 568 raw_spin_unlock(&rt_rq->rt_runtime_lock);
568 } else if (rt_rq->rt_nr_running) 569 } else if (rt_rq->rt_nr_running) {
569 idle = 0; 570 idle = 0;
571 if (!rt_rq_throttled(rt_rq))
572 enqueue = 1;
573 }
570 574
571 if (enqueue) 575 if (enqueue)
572 sched_rt_rq_enqueue(rt_rq); 576 sched_rt_rq_enqueue(rt_rq);
@@ -625,7 +629,7 @@ static void update_curr_rt(struct rq *rq)
625 struct rt_rq *rt_rq = rt_rq_of_se(rt_se); 629 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
626 u64 delta_exec; 630 u64 delta_exec;
627 631
628 if (!task_has_rt_policy(curr)) 632 if (curr->sched_class != &rt_sched_class)
629 return; 633 return;
630 634
631 delta_exec = rq->clock_task - curr->se.exec_start; 635 delta_exec = rq->clock_task - curr->se.exec_start;
@@ -1595,8 +1599,7 @@ static void rq_offline_rt(struct rq *rq)
1595 * When switch from the rt queue, we bring ourselves to a position 1599 * When switch from the rt queue, we bring ourselves to a position
1596 * that we might want to pull RT tasks from other runqueues. 1600 * that we might want to pull RT tasks from other runqueues.
1597 */ 1601 */
1598static void switched_from_rt(struct rq *rq, struct task_struct *p, 1602static void switched_from_rt(struct rq *rq, struct task_struct *p)
1599 int running)
1600{ 1603{
1601 /* 1604 /*
1602 * If there are other RT tasks then we will reschedule 1605 * If there are other RT tasks then we will reschedule
@@ -1605,7 +1608,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p,
1605 * we may need to handle the pulling of RT tasks 1608 * we may need to handle the pulling of RT tasks
1606 * now. 1609 * now.
1607 */ 1610 */
1608 if (!rq->rt.rt_nr_running) 1611 if (p->se.on_rq && !rq->rt.rt_nr_running)
1609 pull_rt_task(rq); 1612 pull_rt_task(rq);
1610} 1613}
1611 1614
@@ -1624,8 +1627,7 @@ static inline void init_sched_rt_class(void)
1624 * with RT tasks. In this case we try to push them off to 1627 * with RT tasks. In this case we try to push them off to
1625 * other runqueues. 1628 * other runqueues.
1626 */ 1629 */
1627static void switched_to_rt(struct rq *rq, struct task_struct *p, 1630static void switched_to_rt(struct rq *rq, struct task_struct *p)
1628 int running)
1629{ 1631{
1630 int check_resched = 1; 1632 int check_resched = 1;
1631 1633
@@ -1636,7 +1638,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p,
1636 * If that current running task is also an RT task 1638 * If that current running task is also an RT task
1637 * then see if we can move to another run queue. 1639 * then see if we can move to another run queue.
1638 */ 1640 */
1639 if (!running) { 1641 if (p->se.on_rq && rq->curr != p) {
1640#ifdef CONFIG_SMP 1642#ifdef CONFIG_SMP
1641 if (rq->rt.overloaded && push_rt_task(rq) && 1643 if (rq->rt.overloaded && push_rt_task(rq) &&
1642 /* Don't resched if we changed runqueues */ 1644 /* Don't resched if we changed runqueues */
@@ -1652,10 +1654,13 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p,
1652 * Priority of the task has changed. This may cause 1654 * Priority of the task has changed. This may cause
1653 * us to initiate a push or pull. 1655 * us to initiate a push or pull.
1654 */ 1656 */
1655static void prio_changed_rt(struct rq *rq, struct task_struct *p, 1657static void
1656 int oldprio, int running) 1658prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
1657{ 1659{
1658 if (running) { 1660 if (!p->se.on_rq)
1661 return;
1662
1663 if (rq->curr == p) {
1659#ifdef CONFIG_SMP 1664#ifdef CONFIG_SMP
1660 /* 1665 /*
1661 * If our priority decreases while running, we 1666 * If our priority decreases while running, we
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c
index 2bf6b47058c1..84ec9bcf82d9 100644
--- a/kernel/sched_stoptask.c
+++ b/kernel/sched_stoptask.c
@@ -59,14 +59,13 @@ static void set_curr_task_stop(struct rq *rq)
59{ 59{
60} 60}
61 61
62static void switched_to_stop(struct rq *rq, struct task_struct *p, 62static void switched_to_stop(struct rq *rq, struct task_struct *p)
63 int running)
64{ 63{
65 BUG(); /* its impossible to change to this class */ 64 BUG(); /* its impossible to change to this class */
66} 65}
67 66
68static void prio_changed_stop(struct rq *rq, struct task_struct *p, 67static void
69 int oldprio, int running) 68prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio)
70{ 69{
71 BUG(); /* how!?, what priority? */ 70 BUG(); /* how!?, what priority? */
72} 71}
diff --git a/kernel/smp.c b/kernel/smp.c
index 12ed8b013e2d..9910744f0856 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -13,6 +13,7 @@
13#include <linux/smp.h> 13#include <linux/smp.h>
14#include <linux/cpu.h> 14#include <linux/cpu.h>
15 15
16#ifdef CONFIG_USE_GENERIC_SMP_HELPERS
16static struct { 17static struct {
17 struct list_head queue; 18 struct list_head queue;
18 raw_spinlock_t lock; 19 raw_spinlock_t lock;
@@ -193,23 +194,52 @@ void generic_smp_call_function_interrupt(void)
193 */ 194 */
194 list_for_each_entry_rcu(data, &call_function.queue, csd.list) { 195 list_for_each_entry_rcu(data, &call_function.queue, csd.list) {
195 int refs; 196 int refs;
197 void (*func) (void *info);
196 198
197 if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) 199 /*
200 * Since we walk the list without any locks, we might
201 * see an entry that was completed, removed from the
202 * list and is in the process of being reused.
203 *
204 * We must check that the cpu is in the cpumask before
205 * checking the refs, and both must be set before
206 * executing the callback on this cpu.
207 */
208
209 if (!cpumask_test_cpu(cpu, data->cpumask))
210 continue;
211
212 smp_rmb();
213
214 if (atomic_read(&data->refs) == 0)
198 continue; 215 continue;
199 216
217 func = data->csd.func; /* for later warn */
200 data->csd.func(data->csd.info); 218 data->csd.func(data->csd.info);
201 219
220 /*
221 * If the cpu mask is not still set then it enabled interrupts,
222 * we took another smp interrupt, and executed the function
223 * twice on this cpu. In theory that copy decremented refs.
224 */
225 if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) {
226 WARN(1, "%pS enabled interrupts and double executed\n",
227 func);
228 continue;
229 }
230
202 refs = atomic_dec_return(&data->refs); 231 refs = atomic_dec_return(&data->refs);
203 WARN_ON(refs < 0); 232 WARN_ON(refs < 0);
204 if (!refs) {
205 raw_spin_lock(&call_function.lock);
206 list_del_rcu(&data->csd.list);
207 raw_spin_unlock(&call_function.lock);
208 }
209 233
210 if (refs) 234 if (refs)
211 continue; 235 continue;
212 236
237 WARN_ON(!cpumask_empty(data->cpumask));
238
239 raw_spin_lock(&call_function.lock);
240 list_del_rcu(&data->csd.list);
241 raw_spin_unlock(&call_function.lock);
242
213 csd_unlock(&data->csd); 243 csd_unlock(&data->csd);
214 } 244 }
215 245
@@ -429,7 +459,7 @@ void smp_call_function_many(const struct cpumask *mask,
429 * can't happen. 459 * can't happen.
430 */ 460 */
431 WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() 461 WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
432 && !oops_in_progress); 462 && !oops_in_progress && !early_boot_irqs_disabled);
433 463
434 /* So, what's a CPU they want? Ignoring this one. */ 464 /* So, what's a CPU they want? Ignoring this one. */
435 cpu = cpumask_first_and(mask, cpu_online_mask); 465 cpu = cpumask_first_and(mask, cpu_online_mask);
@@ -453,11 +483,21 @@ void smp_call_function_many(const struct cpumask *mask,
453 483
454 data = &__get_cpu_var(cfd_data); 484 data = &__get_cpu_var(cfd_data);
455 csd_lock(&data->csd); 485 csd_lock(&data->csd);
486 BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask));
456 487
457 data->csd.func = func; 488 data->csd.func = func;
458 data->csd.info = info; 489 data->csd.info = info;
459 cpumask_and(data->cpumask, mask, cpu_online_mask); 490 cpumask_and(data->cpumask, mask, cpu_online_mask);
460 cpumask_clear_cpu(this_cpu, data->cpumask); 491 cpumask_clear_cpu(this_cpu, data->cpumask);
492
493 /*
 494 * To ensure the interrupt handler gets a complete view
495 * we order the cpumask and refs writes and order the read
496 * of them in the interrupt handler. In addition we may
497 * only clear our own cpu bit from the mask.
498 */
499 smp_wmb();
500
461 atomic_set(&data->refs, cpumask_weight(data->cpumask)); 501 atomic_set(&data->refs, cpumask_weight(data->cpumask));
462 502
463 raw_spin_lock_irqsave(&call_function.lock, flags); 503 raw_spin_lock_irqsave(&call_function.lock, flags);
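
The smp_wmb() added here pairs with the smp_rmb() in the interrupt-handler hunk above: the sender publishes func/info and the cpumask before refs becomes non-zero, so a receiver that sees its cpumask bit and, after the read barrier, a non-zero refs count also sees valid func/info rather than a recycled entry. A reduced model of that pairing using C11 fences in place of the kernel barriers (run single-threaded here, so it demonstrates the intended ordering rather than the race):

#include <stdatomic.h>
#include <stdio.h>

/* stand-ins for data->csd.func/info, "our bit" in data->cpumask, data->refs */
static void (*volatile func)(void *);
static void *volatile info;
static atomic_int cpu_bit;
static atomic_int refs;

static void callback(void *arg) { printf("callback(%s)\n", (const char *)arg); }

static void sender(void)
{
	func = callback;
	info = (void *)"hello";
	atomic_store_explicit(&cpu_bit, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_release);	/* smp_wmb() */
	atomic_store_explicit(&refs, 1, memory_order_relaxed);
}

static void receiver(void)
{
	if (!atomic_load_explicit(&cpu_bit, memory_order_relaxed))
		return;					/* not aimed at this cpu (yet) */
	atomic_thread_fence(memory_order_acquire);	/* smp_rmb() */
	if (atomic_load_explicit(&refs, memory_order_relaxed) == 0)
		return;					/* completed entry being reused */
	func(info);					/* func/info are visible by now */
}

int main(void)
{
	sender();
	receiver();
	return 0;
}
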
@@ -529,3 +569,24 @@ void ipi_call_unlock_irq(void)
529{ 569{
530 raw_spin_unlock_irq(&call_function.lock); 570 raw_spin_unlock_irq(&call_function.lock);
531} 571}
572#endif /* USE_GENERIC_SMP_HELPERS */
573
574/*
575 * Call a function on all processors. May be used during early boot while
576 * early_boot_irqs_disabled is set. Use local_irq_save/restore() instead
577 * of local_irq_disable/enable().
578 */
579int on_each_cpu(void (*func) (void *info), void *info, int wait)
580{
581 unsigned long flags;
582 int ret = 0;
583
584 preempt_disable();
585 ret = smp_call_function(func, info, wait);
586 local_irq_save(flags);
587 func(info);
588 local_irq_restore(flags);
589 preempt_enable();
590 return ret;
591}
592EXPORT_SYMBOL(on_each_cpu);
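
With on_each_cpu() now in smp.c and using local_irq_save()/restore() for the local call, it is also usable while early_boot_irqs_disabled is set. A hedged usage sketch (kernel context only, it will not build outside the tree; flush_local_cache() and flush_all_caches() are hypothetical names, not existing kernel functions):

#include <linux/smp.h>

/* hypothetical per-cpu cleanup; runs on every online cpu, with the local
 * invocation done under local_irq_save()/restore() */
static void flush_local_cache(void *info)
{
	(void)info;
	/* ... per-cpu work ... */
}

static void flush_all_caches(void)
{
	/* wait == 1: return only after every cpu has run the callback */
	on_each_cpu(flush_local_cache, NULL, 1);
}
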
diff --git a/kernel/softirq.c b/kernel/softirq.c
index d4d918a91881..56e5dec837f0 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -54,7 +54,7 @@ EXPORT_SYMBOL(irq_stat);
54 54
55static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp; 55static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
56 56
57static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); 57DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
58 58
59char *softirq_to_name[NR_SOFTIRQS] = { 59char *softirq_to_name[NR_SOFTIRQS] = {
60 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", 60 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL",
@@ -70,7 +70,7 @@ char *softirq_to_name[NR_SOFTIRQS] = {
70static void wakeup_softirqd(void) 70static void wakeup_softirqd(void)
71{ 71{
72 /* Interrupts are disabled: no need to stop preemption */ 72 /* Interrupts are disabled: no need to stop preemption */
73 struct task_struct *tsk = __get_cpu_var(ksoftirqd); 73 struct task_struct *tsk = __this_cpu_read(ksoftirqd);
74 74
75 if (tsk && tsk->state != TASK_RUNNING) 75 if (tsk && tsk->state != TASK_RUNNING)
76 wake_up_process(tsk); 76 wake_up_process(tsk);
@@ -311,9 +311,21 @@ void irq_enter(void)
311} 311}
312 312
313#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED 313#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
314# define invoke_softirq() __do_softirq() 314static inline void invoke_softirq(void)
315{
316 if (!force_irqthreads)
317 __do_softirq();
318 else
319 wakeup_softirqd();
320}
315#else 321#else
316# define invoke_softirq() do_softirq() 322static inline void invoke_softirq(void)
323{
324 if (!force_irqthreads)
325 do_softirq();
326 else
327 wakeup_softirqd();
328}
317#endif 329#endif
318 330
319/* 331/*
@@ -388,8 +400,8 @@ void __tasklet_schedule(struct tasklet_struct *t)
388 400
389 local_irq_save(flags); 401 local_irq_save(flags);
390 t->next = NULL; 402 t->next = NULL;
391 *__get_cpu_var(tasklet_vec).tail = t; 403 *__this_cpu_read(tasklet_vec.tail) = t;
392 __get_cpu_var(tasklet_vec).tail = &(t->next); 404 __this_cpu_write(tasklet_vec.tail, &(t->next));
393 raise_softirq_irqoff(TASKLET_SOFTIRQ); 405 raise_softirq_irqoff(TASKLET_SOFTIRQ);
394 local_irq_restore(flags); 406 local_irq_restore(flags);
395} 407}
@@ -402,8 +414,8 @@ void __tasklet_hi_schedule(struct tasklet_struct *t)
402 414
403 local_irq_save(flags); 415 local_irq_save(flags);
404 t->next = NULL; 416 t->next = NULL;
405 *__get_cpu_var(tasklet_hi_vec).tail = t; 417 *__this_cpu_read(tasklet_hi_vec.tail) = t;
406 __get_cpu_var(tasklet_hi_vec).tail = &(t->next); 418 __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
407 raise_softirq_irqoff(HI_SOFTIRQ); 419 raise_softirq_irqoff(HI_SOFTIRQ);
408 local_irq_restore(flags); 420 local_irq_restore(flags);
409} 421}
@@ -414,8 +426,8 @@ void __tasklet_hi_schedule_first(struct tasklet_struct *t)
414{ 426{
415 BUG_ON(!irqs_disabled()); 427 BUG_ON(!irqs_disabled());
416 428
417 t->next = __get_cpu_var(tasklet_hi_vec).head; 429 t->next = __this_cpu_read(tasklet_hi_vec.head);
418 __get_cpu_var(tasklet_hi_vec).head = t; 430 __this_cpu_write(tasklet_hi_vec.head, t);
419 __raise_softirq_irqoff(HI_SOFTIRQ); 431 __raise_softirq_irqoff(HI_SOFTIRQ);
420} 432}
421 433
@@ -426,9 +438,9 @@ static void tasklet_action(struct softirq_action *a)
426 struct tasklet_struct *list; 438 struct tasklet_struct *list;
427 439
428 local_irq_disable(); 440 local_irq_disable();
429 list = __get_cpu_var(tasklet_vec).head; 441 list = __this_cpu_read(tasklet_vec.head);
430 __get_cpu_var(tasklet_vec).head = NULL; 442 __this_cpu_write(tasklet_vec.head, NULL);
431 __get_cpu_var(tasklet_vec).tail = &__get_cpu_var(tasklet_vec).head; 443 __this_cpu_write(tasklet_vec.tail, &__get_cpu_var(tasklet_vec).head);
432 local_irq_enable(); 444 local_irq_enable();
433 445
434 while (list) { 446 while (list) {
@@ -449,8 +461,8 @@ static void tasklet_action(struct softirq_action *a)
449 461
450 local_irq_disable(); 462 local_irq_disable();
451 t->next = NULL; 463 t->next = NULL;
452 *__get_cpu_var(tasklet_vec).tail = t; 464 *__this_cpu_read(tasklet_vec.tail) = t;
453 __get_cpu_var(tasklet_vec).tail = &(t->next); 465 __this_cpu_write(tasklet_vec.tail, &(t->next));
454 __raise_softirq_irqoff(TASKLET_SOFTIRQ); 466 __raise_softirq_irqoff(TASKLET_SOFTIRQ);
455 local_irq_enable(); 467 local_irq_enable();
456 } 468 }
@@ -461,9 +473,9 @@ static void tasklet_hi_action(struct softirq_action *a)
461 struct tasklet_struct *list; 473 struct tasklet_struct *list;
462 474
463 local_irq_disable(); 475 local_irq_disable();
464 list = __get_cpu_var(tasklet_hi_vec).head; 476 list = __this_cpu_read(tasklet_hi_vec.head);
465 __get_cpu_var(tasklet_hi_vec).head = NULL; 477 __this_cpu_write(tasklet_hi_vec.head, NULL);
466 __get_cpu_var(tasklet_hi_vec).tail = &__get_cpu_var(tasklet_hi_vec).head; 478 __this_cpu_write(tasklet_hi_vec.tail, &__get_cpu_var(tasklet_hi_vec).head);
467 local_irq_enable(); 479 local_irq_enable();
468 480
469 while (list) { 481 while (list) {
@@ -484,8 +496,8 @@ static void tasklet_hi_action(struct softirq_action *a)
484 496
485 local_irq_disable(); 497 local_irq_disable();
486 t->next = NULL; 498 t->next = NULL;
487 *__get_cpu_var(tasklet_hi_vec).tail = t; 499 *__this_cpu_read(tasklet_hi_vec.tail) = t;
488 __get_cpu_var(tasklet_hi_vec).tail = &(t->next); 500 __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
489 __raise_softirq_irqoff(HI_SOFTIRQ); 501 __raise_softirq_irqoff(HI_SOFTIRQ);
490 local_irq_enable(); 502 local_irq_enable();
491 } 503 }
@@ -721,7 +733,6 @@ static int run_ksoftirqd(void * __bind_cpu)
721{ 733{
722 set_current_state(TASK_INTERRUPTIBLE); 734 set_current_state(TASK_INTERRUPTIBLE);
723 735
724 current->flags |= PF_KSOFTIRQD;
725 while (!kthread_should_stop()) { 736 while (!kthread_should_stop()) {
726 preempt_disable(); 737 preempt_disable();
727 if (!local_softirq_pending()) { 738 if (!local_softirq_pending()) {
@@ -738,7 +749,10 @@ static int run_ksoftirqd(void * __bind_cpu)
738 don't process */ 749 don't process */
739 if (cpu_is_offline((long)__bind_cpu)) 750 if (cpu_is_offline((long)__bind_cpu))
740 goto wait_to_die; 751 goto wait_to_die;
741 do_softirq(); 752 local_irq_disable();
753 if (local_softirq_pending())
754 __do_softirq();
755 local_irq_enable();
742 preempt_enable_no_resched(); 756 preempt_enable_no_resched();
743 cond_resched(); 757 cond_resched();
744 preempt_disable(); 758 preempt_disable();
@@ -802,16 +816,16 @@ static void takeover_tasklets(unsigned int cpu)
802 816
803 /* Find end, append list for that CPU. */ 817 /* Find end, append list for that CPU. */
804 if (&per_cpu(tasklet_vec, cpu).head != per_cpu(tasklet_vec, cpu).tail) { 818 if (&per_cpu(tasklet_vec, cpu).head != per_cpu(tasklet_vec, cpu).tail) {
805 *(__get_cpu_var(tasklet_vec).tail) = per_cpu(tasklet_vec, cpu).head; 819 *__this_cpu_read(tasklet_vec.tail) = per_cpu(tasklet_vec, cpu).head;
806 __get_cpu_var(tasklet_vec).tail = per_cpu(tasklet_vec, cpu).tail; 820 this_cpu_write(tasklet_vec.tail, per_cpu(tasklet_vec, cpu).tail);
807 per_cpu(tasklet_vec, cpu).head = NULL; 821 per_cpu(tasklet_vec, cpu).head = NULL;
808 per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head; 822 per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head;
809 } 823 }
810 raise_softirq_irqoff(TASKLET_SOFTIRQ); 824 raise_softirq_irqoff(TASKLET_SOFTIRQ);
811 825
812 if (&per_cpu(tasklet_hi_vec, cpu).head != per_cpu(tasklet_hi_vec, cpu).tail) { 826 if (&per_cpu(tasklet_hi_vec, cpu).head != per_cpu(tasklet_hi_vec, cpu).tail) {
813 *__get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).head; 827 *__this_cpu_read(tasklet_hi_vec.tail) = per_cpu(tasklet_hi_vec, cpu).head;
814 __get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).tail; 828 __this_cpu_write(tasklet_hi_vec.tail, per_cpu(tasklet_hi_vec, cpu).tail);
815 per_cpu(tasklet_hi_vec, cpu).head = NULL; 829 per_cpu(tasklet_hi_vec, cpu).head = NULL;
816 per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head; 830 per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head;
817 } 831 }
@@ -853,7 +867,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
853 cpumask_any(cpu_online_mask)); 867 cpumask_any(cpu_online_mask));
854 case CPU_DEAD: 868 case CPU_DEAD:
855 case CPU_DEAD_FROZEN: { 869 case CPU_DEAD_FROZEN: {
856 static struct sched_param param = { 870 static const struct sched_param param = {
857 .sched_priority = MAX_RT_PRIO-1 871 .sched_priority = MAX_RT_PRIO-1
858 }; 872 };
859 873
@@ -885,25 +899,6 @@ static __init int spawn_ksoftirqd(void)
885} 899}
886early_initcall(spawn_ksoftirqd); 900early_initcall(spawn_ksoftirqd);
887 901
888#ifdef CONFIG_SMP
889/*
890 * Call a function on all processors
891 */
892int on_each_cpu(void (*func) (void *info), void *info, int wait)
893{
894 int ret = 0;
895
896 preempt_disable();
897 ret = smp_call_function(func, info, wait);
898 local_irq_disable();
899 func(info);
900 local_irq_enable();
901 preempt_enable();
902 return ret;
903}
904EXPORT_SYMBOL(on_each_cpu);
905#endif
906
907/* 902/*
908 * [ These __weak aliases are kept in a separate compilation unit, so that 903 * [ These __weak aliases are kept in a separate compilation unit, so that
909 * GCC does not inline them incorrectly. ] 904 * GCC does not inline them incorrectly. ]
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 98d8c1e80edb..73ce23feaea9 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -156,6 +156,16 @@ void __srcu_read_unlock(struct srcu_struct *sp, int idx)
156EXPORT_SYMBOL_GPL(__srcu_read_unlock); 156EXPORT_SYMBOL_GPL(__srcu_read_unlock);
157 157
158/* 158/*
159 * We use an adaptive strategy for synchronize_srcu() and especially for
160 * synchronize_srcu_expedited(). We spin for a fixed time period
161 * (defined below) to allow SRCU readers to exit their read-side critical
162 * sections. If there are still some readers after 10 microseconds,
163 * we repeatedly block for 1-millisecond time periods. This approach
164 * has done well in testing, so there is no need for a config parameter.
165 */
166#define SYNCHRONIZE_SRCU_READER_DELAY 10
167
168/*
159 * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). 169 * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
160 */ 170 */
161static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) 171static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
@@ -207,11 +217,12 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
207 * will have finished executing. We initially give readers 217 * will have finished executing. We initially give readers
208 * an arbitrarily chosen 10 microseconds to get out of their 218 * an arbitrarily chosen 10 microseconds to get out of their
209 * SRCU read-side critical sections, then loop waiting 1/HZ 219 * SRCU read-side critical sections, then loop waiting 1/HZ
210 * seconds per iteration. 220 * seconds per iteration. The 10-microsecond value has done
221 * very well in testing.
211 */ 222 */
212 223
213 if (srcu_readers_active_idx(sp, idx)) 224 if (srcu_readers_active_idx(sp, idx))
214 udelay(CONFIG_SRCU_SYNCHRONIZE_DELAY); 225 udelay(SYNCHRONIZE_SRCU_READER_DELAY);
215 while (srcu_readers_active_idx(sp, idx)) 226 while (srcu_readers_active_idx(sp, idx))
216 schedule_timeout_interruptible(1); 227 schedule_timeout_interruptible(1);
217 228
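
The new SYNCHRONIZE_SRCU_READER_DELAY constant folds the old CONFIG_SRCU_SYNCHRONIZE_DELAY value of 10 microseconds into the code and documents the adaptive strategy: one short wait for readers to drain, then sleeps of roughly a tick until the active count for the old index reaches zero. A userspace model of that wait shape (readers_active() is a stub that pretends readers drain after a few polls):

#include <stdio.h>
#include <unistd.h>

#define SYNCHRONIZE_SRCU_READER_DELAY 10	/* microseconds, as in the patch */

/* stub for srcu_readers_active_idx(sp, idx): pretend readers drain slowly */
static int readers_active(void)
{
	static int polls_left = 3;
	return polls_left-- > 0;
}

int main(void)
{
	/* first give readers a single short window to get out... */
	if (readers_active())
		usleep(SYNCHRONIZE_SRCU_READER_DELAY);
	/* ...then fall back to sleeping roughly a tick (1/HZ) per retry */
	while (readers_active())
		usleep(1000);
	puts("grace period for the old index is complete");
	return 0;
}
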
diff --git a/kernel/sys.c b/kernel/sys.c
index 2745dcdb6c6c..18da702ec813 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -43,6 +43,8 @@
43#include <linux/kprobes.h> 43#include <linux/kprobes.h>
44#include <linux/user_namespace.h> 44#include <linux/user_namespace.h>
45 45
46#include <linux/kmsg_dump.h>
47
46#include <asm/uaccess.h> 48#include <asm/uaccess.h>
47#include <asm/io.h> 49#include <asm/io.h>
48#include <asm/unistd.h> 50#include <asm/unistd.h>
@@ -285,6 +287,7 @@ out_unlock:
285 */ 287 */
286void emergency_restart(void) 288void emergency_restart(void)
287{ 289{
290 kmsg_dump(KMSG_DUMP_EMERG);
288 machine_emergency_restart(); 291 machine_emergency_restart();
289} 292}
290EXPORT_SYMBOL_GPL(emergency_restart); 293EXPORT_SYMBOL_GPL(emergency_restart);
@@ -312,6 +315,7 @@ void kernel_restart(char *cmd)
312 printk(KERN_EMERG "Restarting system.\n"); 315 printk(KERN_EMERG "Restarting system.\n");
313 else 316 else
314 printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd); 317 printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd);
318 kmsg_dump(KMSG_DUMP_RESTART);
315 machine_restart(cmd); 319 machine_restart(cmd);
316} 320}
317EXPORT_SYMBOL_GPL(kernel_restart); 321EXPORT_SYMBOL_GPL(kernel_restart);
@@ -333,6 +337,7 @@ void kernel_halt(void)
333 kernel_shutdown_prepare(SYSTEM_HALT); 337 kernel_shutdown_prepare(SYSTEM_HALT);
334 sysdev_shutdown(); 338 sysdev_shutdown();
335 printk(KERN_EMERG "System halted.\n"); 339 printk(KERN_EMERG "System halted.\n");
340 kmsg_dump(KMSG_DUMP_HALT);
336 machine_halt(); 341 machine_halt();
337} 342}
338 343
@@ -351,6 +356,7 @@ void kernel_power_off(void)
351 disable_nonboot_cpus(); 356 disable_nonboot_cpus();
352 sysdev_shutdown(); 357 sysdev_shutdown();
353 printk(KERN_EMERG "Power down.\n"); 358 printk(KERN_EMERG "Power down.\n");
359 kmsg_dump(KMSG_DUMP_POWEROFF);
354 machine_power_off(); 360 machine_power_off();
355} 361}
356EXPORT_SYMBOL_GPL(kernel_power_off); 362EXPORT_SYMBOL_GPL(kernel_power_off);
@@ -1379,7 +1385,8 @@ static int check_prlimit_permission(struct task_struct *task)
1379 const struct cred *cred = current_cred(), *tcred; 1385 const struct cred *cred = current_cred(), *tcred;
1380 1386
1381 tcred = __task_cred(task); 1387 tcred = __task_cred(task);
1382 if ((cred->uid != tcred->euid || 1388 if (current != task &&
1389 (cred->uid != tcred->euid ||
1383 cred->uid != tcred->suid || 1390 cred->uid != tcred->suid ||
1384 cred->uid != tcred->uid || 1391 cred->uid != tcred->uid ||
1385 cred->gid != tcred->egid || 1392 cred->gid != tcred->egid ||
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index c782fe9924c7..25cc41cd8f33 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -186,3 +186,8 @@ cond_syscall(sys_perf_event_open);
186/* fanotify! */ 186/* fanotify! */
187cond_syscall(sys_fanotify_init); 187cond_syscall(sys_fanotify_init);
188cond_syscall(sys_fanotify_mark); 188cond_syscall(sys_fanotify_mark);
189
190/* open by handle */
191cond_syscall(sys_name_to_handle_at);
192cond_syscall(sys_open_by_handle_at);
193cond_syscall(compat_sys_open_by_handle_at);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ae5cbb1e3ced..51054fea5d99 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -24,6 +24,7 @@
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/sysctl.h> 25#include <linux/sysctl.h>
26#include <linux/signal.h> 26#include <linux/signal.h>
27#include <linux/printk.h>
27#include <linux/proc_fs.h> 28#include <linux/proc_fs.h>
28#include <linux/security.h> 29#include <linux/security.h>
29#include <linux/ctype.h> 30#include <linux/ctype.h>
@@ -169,7 +170,8 @@ static int proc_taint(struct ctl_table *table, int write,
169#endif 170#endif
170 171
171#ifdef CONFIG_MAGIC_SYSRQ 172#ifdef CONFIG_MAGIC_SYSRQ
172static int __sysrq_enabled; /* Note: sysrq code ises it's own private copy */ 173/* Note: sysrq code uses it's own private copy */
174static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE;
173 175
174static int sysrq_sysctl_handler(ctl_table *table, int write, 176static int sysrq_sysctl_handler(ctl_table *table, int write,
175 void __user *buffer, size_t *lenp, 177 void __user *buffer, size_t *lenp,
@@ -192,9 +194,9 @@ static int sysrq_sysctl_handler(ctl_table *table, int write,
192static struct ctl_table root_table[]; 194static struct ctl_table root_table[];
193static struct ctl_table_root sysctl_table_root; 195static struct ctl_table_root sysctl_table_root;
194static struct ctl_table_header root_table_header = { 196static struct ctl_table_header root_table_header = {
195 .count = 1, 197 {{.count = 1,
196 .ctl_table = root_table, 198 .ctl_table = root_table,
197 .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list), 199 .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list),}},
198 .root = &sysctl_table_root, 200 .root = &sysctl_table_root,
199 .set = &sysctl_table_root.default_set, 201 .set = &sysctl_table_root.default_set,
200}; 202};
@@ -245,10 +247,6 @@ static struct ctl_table root_table[] = {
245 .mode = 0555, 247 .mode = 0555,
246 .child = dev_table, 248 .child = dev_table,
247 }, 249 },
248/*
249 * NOTE: do not add new entries to this table unless you have read
250 * Documentation/sysctl/ctl_unnumbered.txt
251 */
252 { } 250 { }
253}; 251};
254 252
@@ -363,20 +361,13 @@ static struct ctl_table kern_table[] = {
363 .mode = 0644, 361 .mode = 0644,
364 .proc_handler = sched_rt_handler, 362 .proc_handler = sched_rt_handler,
365 }, 363 },
366 {
367 .procname = "sched_compat_yield",
368 .data = &sysctl_sched_compat_yield,
369 .maxlen = sizeof(unsigned int),
370 .mode = 0644,
371 .proc_handler = proc_dointvec,
372 },
373#ifdef CONFIG_SCHED_AUTOGROUP 364#ifdef CONFIG_SCHED_AUTOGROUP
374 { 365 {
375 .procname = "sched_autogroup_enabled", 366 .procname = "sched_autogroup_enabled",
376 .data = &sysctl_sched_autogroup_enabled, 367 .data = &sysctl_sched_autogroup_enabled,
377 .maxlen = sizeof(unsigned int), 368 .maxlen = sizeof(unsigned int),
378 .mode = 0644, 369 .mode = 0644,
379 .proc_handler = proc_dointvec, 370 .proc_handler = proc_dointvec_minmax,
380 .extra1 = &zero, 371 .extra1 = &zero,
381 .extra2 = &one, 372 .extra2 = &one,
382 }, 373 },
@@ -710,6 +701,15 @@ static struct ctl_table kern_table[] = {
710 .extra1 = &zero, 701 .extra1 = &zero,
711 .extra2 = &one, 702 .extra2 = &one,
712 }, 703 },
704 {
705 .procname = "kptr_restrict",
706 .data = &kptr_restrict,
707 .maxlen = sizeof(int),
708 .mode = 0644,
709 .proc_handler = proc_dointvec_minmax,
710 .extra1 = &zero,
711 .extra2 = &two,
712 },
713#endif 713#endif
714 { 714 {
715 .procname = "ngroups_max", 715 .procname = "ngroups_max",
@@ -941,7 +941,7 @@ static struct ctl_table kern_table[] = {
941 .data = &sysctl_perf_event_sample_rate, 941 .data = &sysctl_perf_event_sample_rate,
942 .maxlen = sizeof(sysctl_perf_event_sample_rate), 942 .maxlen = sizeof(sysctl_perf_event_sample_rate),
943 .mode = 0644, 943 .mode = 0644,
944 .proc_handler = proc_dointvec, 944 .proc_handler = perf_proc_update_handler,
945 }, 945 },
946#endif 946#endif
947#ifdef CONFIG_KMEMCHECK 947#ifdef CONFIG_KMEMCHECK
@@ -962,10 +962,6 @@ static struct ctl_table kern_table[] = {
962 .proc_handler = proc_dointvec, 962 .proc_handler = proc_dointvec,
963 }, 963 },
964#endif 964#endif
965/*
966 * NOTE: do not add new entries to this table unless you have read
967 * Documentation/sysctl/ctl_unnumbered.txt
968 */
969 { } 965 { }
970}; 966};
971 967
@@ -1326,11 +1322,6 @@ static struct ctl_table vm_table[] = {
1326 .extra2 = &one, 1322 .extra2 = &one,
1327 }, 1323 },
1328#endif 1324#endif
1329
1330/*
1331 * NOTE: do not add new entries to this table unless you have read
1332 * Documentation/sysctl/ctl_unnumbered.txt
1333 */
1334 { } 1325 { }
1335}; 1326};
1336 1327
@@ -1486,10 +1477,6 @@ static struct ctl_table fs_table[] = {
1486 .proc_handler = &pipe_proc_fn, 1477 .proc_handler = &pipe_proc_fn,
1487 .extra1 = &pipe_min_size, 1478 .extra1 = &pipe_min_size,
1488 }, 1479 },
1489/*
1490 * NOTE: do not add new entries to this table unless you have read
1491 * Documentation/sysctl/ctl_unnumbered.txt
1492 */
1493 { } 1480 { }
1494}; 1481};
1495 1482
@@ -1573,11 +1560,16 @@ void sysctl_head_get(struct ctl_table_header *head)
1573 spin_unlock(&sysctl_lock); 1560 spin_unlock(&sysctl_lock);
1574} 1561}
1575 1562
1563static void free_head(struct rcu_head *rcu)
1564{
1565 kfree(container_of(rcu, struct ctl_table_header, rcu));
1566}
1567
1576void sysctl_head_put(struct ctl_table_header *head) 1568void sysctl_head_put(struct ctl_table_header *head)
1577{ 1569{
1578 spin_lock(&sysctl_lock); 1570 spin_lock(&sysctl_lock);
1579 if (!--head->count) 1571 if (!--head->count)
1580 kfree(head); 1572 call_rcu(&head->rcu, free_head);
1581 spin_unlock(&sysctl_lock); 1573 spin_unlock(&sysctl_lock);
1582} 1574}
1583 1575
@@ -1954,10 +1946,10 @@ void unregister_sysctl_table(struct ctl_table_header * header)
1954 start_unregistering(header); 1946 start_unregistering(header);
1955 if (!--header->parent->count) { 1947 if (!--header->parent->count) {
1956 WARN_ON(1); 1948 WARN_ON(1);
1957 kfree(header->parent); 1949 call_rcu(&header->parent->rcu, free_head);
1958 } 1950 }
1959 if (!--header->count) 1951 if (!--header->count)
1960 kfree(header); 1952 call_rcu(&header->rcu, free_head);
1961 spin_unlock(&sysctl_lock); 1953 spin_unlock(&sysctl_lock);
1962} 1954}
1963 1955
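The two hunks above replace an immediate kfree() of a ctl_table_header with call_rcu(), so code still holding an RCU read-side reference to a header cannot see its memory reused before a grace period has elapsed. A minimal sketch of the same deferred-free pattern, using an illustrative struct that is not part of this patch:

	#include <linux/kernel.h>
	#include <linux/rcupdate.h>
	#include <linux/slab.h>

	struct example_node {
		int value;
		struct rcu_head rcu;
	};

	static void example_free_rcu(struct rcu_head *rcu)
	{
		/* runs only after all pre-existing RCU readers have finished */
		kfree(container_of(rcu, struct example_node, rcu));
	}

	static void example_release(struct example_node *node)
	{
		/* defer the kfree() instead of freeing under the spinlock */
		call_rcu(&node->rcu, example_free_rcu);
	}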
@@ -2899,7 +2891,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
2899 } 2891 }
2900} 2892}
2901 2893
2902#else /* CONFIG_PROC_FS */ 2894#else /* CONFIG_PROC_SYSCTL */
2903 2895
2904int proc_dostring(struct ctl_table *table, int write, 2896int proc_dostring(struct ctl_table *table, int write,
2905 void __user *buffer, size_t *lenp, loff_t *ppos) 2897 void __user *buffer, size_t *lenp, loff_t *ppos)
@@ -2951,7 +2943,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
2951} 2943}
2952 2944
2953 2945
2954#endif /* CONFIG_PROC_FS */ 2946#endif /* CONFIG_PROC_SYSCTL */
2955 2947
2956/* 2948/*
2957 * No sense putting this after each symbol definition, twice, 2949 * No sense putting this after each symbol definition, twice,
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 4b2545a136ff..3b8e028b9601 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -1192,7 +1192,7 @@ static ssize_t bin_dn_node_address(struct file *file,
1192 1192
1193 buf[result] = '\0'; 1193 buf[result] = '\0';
1194 1194
1195 /* Convert the decnet addresss to binary */ 1195 /* Convert the decnet address to binary */
1196 result = -EIO; 1196 result = -EIO;
1197 nodep = strchr(buf, '.') + 1; 1197 nodep = strchr(buf, '.') + 1;
1198 if (!nodep) 1198 if (!nodep)
@@ -1321,13 +1321,11 @@ static ssize_t binary_sysctl(const int *name, int nlen,
1321 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) 1321 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1322{ 1322{
1323 const struct bin_table *table = NULL; 1323 const struct bin_table *table = NULL;
1324 struct nameidata nd;
1325 struct vfsmount *mnt; 1324 struct vfsmount *mnt;
1326 struct file *file; 1325 struct file *file;
1327 ssize_t result; 1326 ssize_t result;
1328 char *pathname; 1327 char *pathname;
1329 int flags; 1328 int flags;
1330 int acc_mode;
1331 1329
1332 pathname = sysctl_getname(name, nlen, &table); 1330 pathname = sysctl_getname(name, nlen, &table);
1333 result = PTR_ERR(pathname); 1331 result = PTR_ERR(pathname);
@@ -1337,28 +1335,17 @@ static ssize_t binary_sysctl(const int *name, int nlen,
1337 /* How should the sysctl be accessed? */ 1335 /* How should the sysctl be accessed? */
1338 if (oldval && oldlen && newval && newlen) { 1336 if (oldval && oldlen && newval && newlen) {
1339 flags = O_RDWR; 1337 flags = O_RDWR;
1340 acc_mode = MAY_READ | MAY_WRITE;
1341 } else if (newval && newlen) { 1338 } else if (newval && newlen) {
1342 flags = O_WRONLY; 1339 flags = O_WRONLY;
1343 acc_mode = MAY_WRITE;
1344 } else if (oldval && oldlen) { 1340 } else if (oldval && oldlen) {
1345 flags = O_RDONLY; 1341 flags = O_RDONLY;
1346 acc_mode = MAY_READ;
1347 } else { 1342 } else {
1348 result = 0; 1343 result = 0;
1349 goto out_putname; 1344 goto out_putname;
1350 } 1345 }
1351 1346
1352 mnt = current->nsproxy->pid_ns->proc_mnt; 1347 mnt = current->nsproxy->pid_ns->proc_mnt;
1353 result = vfs_path_lookup(mnt->mnt_root, mnt, pathname, 0, &nd); 1348 file = file_open_root(mnt->mnt_root, mnt, pathname, flags);
1354 if (result)
1355 goto out_putname;
1356
1357 result = may_open(&nd.path, acc_mode, flags);
1358 if (result)
1359 goto out_putpath;
1360
1361 file = dentry_open(nd.path.dentry, nd.path.mnt, flags, current_cred());
1362 result = PTR_ERR(file); 1349 result = PTR_ERR(file);
1363 if (IS_ERR(file)) 1350 if (IS_ERR(file))
1364 goto out_putname; 1351 goto out_putname;
@@ -1370,10 +1357,6 @@ out_putname:
1370 putname(pathname); 1357 putname(pathname);
1371out: 1358out:
1372 return result; 1359 return result;
1373
1374out_putpath:
1375 path_put(&nd.path);
1376 goto out_putname;
1377} 1360}
1378 1361
1379 1362
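The rewrite above collapses the vfs_path_lookup()/may_open()/dentry_open() sequence into a single file_open_root() call, which resolves a path relative to the given root and mount and performs the permission checks as part of the normal open path. A hedged usage sketch; the pathname and the helper name are purely illustrative:

	#include <linux/fs.h>
	#include <linux/mount.h>
	#include <linux/file.h>
	#include <linux/fcntl.h>
	#include <linux/err.h>

	/* open a file relative to the root dentry of an already-mounted fs */
	static int example_open_relative(struct vfsmount *mnt)
	{
		struct file *file;

		file = file_open_root(mnt->mnt_root, mnt, "sys/kernel/ostype", O_RDONLY);
		if (IS_ERR(file))
			return PTR_ERR(file);

		fput(file);	/* real code would read from the file first */
		return 0;
	}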
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 3308fd7f1b52..3971c6b9d58d 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -89,8 +89,7 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
89 return -ENOMEM; 89 return -ENOMEM;
90 90
91 if (!info) { 91 if (!info) {
92 int seq = get_cpu_var(taskstats_seqnum)++; 92 int seq = this_cpu_inc_return(taskstats_seqnum) - 1;
93 put_cpu_var(taskstats_seqnum);
94 93
95 reply = genlmsg_put(skb, 0, seq, &family, 0, cmd); 94 reply = genlmsg_put(skb, 0, seq, &family, 0, cmd);
96 } else 95 } else
@@ -349,7 +348,7 @@ static int parse(struct nlattr *na, struct cpumask *mask)
349 return ret; 348 return ret;
350} 349}
351 350
352#ifdef CONFIG_IA64 351#if defined(CONFIG_64BIT) && !defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
353#define TASKSTATS_NEEDS_PADDING 1 352#define TASKSTATS_NEEDS_PADDING 1
354#endif 353#endif
355 354
@@ -612,7 +611,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
612 fill_tgid_exit(tsk); 611 fill_tgid_exit(tsk);
613 } 612 }
614 613
615 listeners = &__raw_get_cpu_var(listener_array); 614 listeners = __this_cpu_ptr(&listener_array);
616 if (list_empty(&listeners->list)) 615 if (list_empty(&listeners->list))
617 return; 616 return;
618 617
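The taskstats change swaps a get_cpu_var()/put_cpu_var() pair for this_cpu_inc_return(), which increments the per-CPU counter and returns the new value in one preemption-safe operation; subtracting one preserves the old post-increment semantics. A small sketch of the same idiom, with an illustrative counter name:

	#include <linux/percpu.h>

	static DEFINE_PER_CPU(int, example_seqnum);

	static int example_next_seq(void)
	{
		/* a single per-CPU read-modify-write; no explicit
		 * preempt disable/enable pair is needed here */
		return this_cpu_inc_return(example_seqnum) - 1;
	}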
diff --git a/kernel/time.c b/kernel/time.c
index ba9b338d1835..8e8dc6d705c9 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -150,7 +150,7 @@ static inline void warp_clock(void)
150 * various programs will get confused when the clock gets warped. 150 * various programs will get confused when the clock gets warped.
151 */ 151 */
152 152
153int do_sys_settimeofday(struct timespec *tv, struct timezone *tz) 153int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz)
154{ 154{
155 static int firsttime = 1; 155 static int firsttime = 1;
156 int error = 0; 156 int error = 0;
@@ -238,7 +238,7 @@ EXPORT_SYMBOL(current_fs_time);
238 * Avoid unnecessary multiplications/divisions in the 238 * Avoid unnecessary multiplications/divisions in the
239 * two most common HZ cases: 239 * two most common HZ cases:
240 */ 240 */
241unsigned int inline jiffies_to_msecs(const unsigned long j) 241inline unsigned int jiffies_to_msecs(const unsigned long j)
242{ 242{
243#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) 243#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
244 return (MSEC_PER_SEC / HZ) * j; 244 return (MSEC_PER_SEC / HZ) * j;
@@ -254,7 +254,7 @@ unsigned int inline jiffies_to_msecs(const unsigned long j)
254} 254}
255EXPORT_SYMBOL(jiffies_to_msecs); 255EXPORT_SYMBOL(jiffies_to_msecs);
256 256
257unsigned int inline jiffies_to_usecs(const unsigned long j) 257inline unsigned int jiffies_to_usecs(const unsigned long j)
258{ 258{
259#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) 259#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
260 return (USEC_PER_SEC / HZ) * j; 260 return (USEC_PER_SEC / HZ) * j;
@@ -645,7 +645,7 @@ u64 nsec_to_clock_t(u64 x)
645} 645}
646 646
647/** 647/**
648 * nsecs_to_jiffies - Convert nsecs in u64 to jiffies 648 * nsecs_to_jiffies64 - Convert nsecs in u64 to jiffies64
649 * 649 *
650 * @n: nsecs in u64 650 * @n: nsecs in u64
651 * 651 *
@@ -657,7 +657,7 @@ u64 nsec_to_clock_t(u64 x)
657 * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512) 657 * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
658 * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years 658 * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
659 */ 659 */
660unsigned long nsecs_to_jiffies(u64 n) 660u64 nsecs_to_jiffies64(u64 n)
661{ 661{
662#if (NSEC_PER_SEC % HZ) == 0 662#if (NSEC_PER_SEC % HZ) == 0
663 /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */ 663 /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */
@@ -674,22 +674,23 @@ unsigned long nsecs_to_jiffies(u64 n)
674#endif 674#endif
675} 675}
676 676
677#if (BITS_PER_LONG < 64) 677/**
678u64 get_jiffies_64(void) 678 * nsecs_to_jiffies - Convert nsecs in u64 to jiffies
679 *
680 * @n: nsecs in u64
681 *
682 * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64.
683 * And this doesn't return MAX_JIFFY_OFFSET since this function is designed
684 * for scheduler, not for use in device drivers to calculate timeout value.
685 *
686 * note:
687 * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
688 * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
689 */
690unsigned long nsecs_to_jiffies(u64 n)
679{ 691{
680 unsigned long seq; 692 return (unsigned long)nsecs_to_jiffies64(n);
681 u64 ret;
682
683 do {
684 seq = read_seqbegin(&xtime_lock);
685 ret = jiffies_64;
686 } while (read_seqretry(&xtime_lock, seq));
687 return ret;
688} 693}
689EXPORT_SYMBOL(get_jiffies_64);
690#endif
691
692EXPORT_SYMBOL(jiffies);
693 694
694/* 695/*
695 * Add two timespec values and do a safety check for overflow. 696 * Add two timespec values and do a safety check for overflow.
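The split above keeps the old nsecs_to_jiffies() as a thin unsigned long wrapper around the new 64-bit nsecs_to_jiffies64(). As a rough worked example (assuming HZ = 1000, so NSEC_PER_SEC % HZ == 0 and the common-case branch divides by NSEC_PER_SEC / HZ): 2,500,000 ns / 1,000,000 truncates to 2 jiffies. Callers such as the scheduler that need the full 64-bit result can now use nsecs_to_jiffies64() directly instead of losing the high bits in the unsigned long cast.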
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index ee266620b06c..b0425991e9ac 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,4 +1,5 @@
1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o timeconv.o 1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o
2obj-y += timeconv.o posix-clock.o
2 3
3obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o 4obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o
4obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o 5obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index d7395fdfb9f3..0d74b9ba90c8 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -18,7 +18,6 @@
18#include <linux/notifier.h> 18#include <linux/notifier.h>
19#include <linux/smp.h> 19#include <linux/smp.h>
20#include <linux/sysdev.h> 20#include <linux/sysdev.h>
21#include <linux/tick.h>
22 21
23#include "tick-internal.h" 22#include "tick-internal.h"
24 23
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index c18d7efa1b4b..6519cf62d9cd 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -113,7 +113,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time);
113 * @shift: pointer to shift variable 113 * @shift: pointer to shift variable
114 * @from: frequency to convert from 114 * @from: frequency to convert from
115 * @to: frequency to convert to 115 * @to: frequency to convert to
116 * @minsec: guaranteed runtime conversion range in seconds 116 * @maxsec: guaranteed runtime conversion range in seconds
117 * 117 *
118 * The function evaluates the shift/mult pair for the scaled math 118 * The function evaluates the shift/mult pair for the scaled math
119 * operations of clocksources and clockevents. 119 * operations of clocksources and clockevents.
@@ -122,7 +122,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time);
122 * NSEC_PER_SEC == 1GHz and @from is the counter frequency. For clock 122 * NSEC_PER_SEC == 1GHz and @from is the counter frequency. For clock
123 * event @to is the counter frequency and @from is NSEC_PER_SEC. 123 * event @to is the counter frequency and @from is NSEC_PER_SEC.
124 * 124 *
125 * The @minsec conversion range argument controls the time frame in 125 * The @maxsec conversion range argument controls the time frame in
126 * seconds which must be covered by the runtime conversion with the 126 * seconds which must be covered by the runtime conversion with the
127 * calculated mult and shift factors. This guarantees that no 64bit 127 * calculated mult and shift factors. This guarantees that no 64bit
128 * overflow happens when the input value of the conversion is 128 * overflow happens when the input value of the conversion is
@@ -131,7 +131,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time);
131 * factors. 131 * factors.
132 */ 132 */
133void 133void
134clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec) 134clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec)
135{ 135{
136 u64 tmp; 136 u64 tmp;
137 u32 sft, sftacc= 32; 137 u32 sft, sftacc= 32;
@@ -140,7 +140,7 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec)
140 * Calculate the shift factor which is limiting the conversion 140 * Calculate the shift factor which is limiting the conversion
141 * range: 141 * range:
142 */ 142 */
143 tmp = ((u64)minsec * from) >> 32; 143 tmp = ((u64)maxsec * from) >> 32;
144 while (tmp) { 144 while (tmp) {
145 tmp >>=1; 145 tmp >>=1;
146 sftacc--; 146 sftacc--;
@@ -152,6 +152,7 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec)
152 */ 152 */
153 for (sft = 32; sft > 0; sft--) { 153 for (sft = 32; sft > 0; sft--) {
154 tmp = (u64) to << sft; 154 tmp = (u64) to << sft;
155 tmp += from / 2;
155 do_div(tmp, from); 156 do_div(tmp, from);
156 if ((tmp >> sftacc) == 0) 157 if ((tmp >> sftacc) == 0)
157 break; 158 break;
@@ -678,7 +679,7 @@ EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale);
678int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) 679int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
679{ 680{
680 681
681 /* Intialize mult/shift and max_idle_ns */ 682 /* Initialize mult/shift and max_idle_ns */
682 __clocksource_updatefreq_scale(cs, scale, freq); 683 __clocksource_updatefreq_scale(cs, scale, freq);
683 684
684 /* Add clocksource to the clocksource list */ 685
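The added "tmp += from / 2" makes the mult computation round to nearest instead of truncating. A worked example under assumed inputs (not taken from this patch): for from = 24 MHz, to = NSEC_PER_SEC and a candidate shift of 24, (10^9 << 24) / 24,000,000 = 699,050,666.67, so the old code stored mult = 699050666 while the rounded version stores 699050667, keeping the per-cycle conversion error within half a unit of the last place rather than a full unit.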
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 5404a8456909..b2fa506667c0 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -22,8 +22,11 @@
22************************************************************************/ 22************************************************************************/
23#include <linux/clocksource.h> 23#include <linux/clocksource.h>
24#include <linux/jiffies.h> 24#include <linux/jiffies.h>
25#include <linux/module.h>
25#include <linux/init.h> 26#include <linux/init.h>
26 27
28#include "tick-internal.h"
29
27/* The Jiffies based clocksource is the lowest common 30/* The Jiffies based clocksource is the lowest common
28 * denominator clock source which should function on 31 * denominator clock source which should function on
29 * all systems. It has the same coarse resolution as 32 * all systems. It has the same coarse resolution as
@@ -64,6 +67,23 @@ struct clocksource clocksource_jiffies = {
64 .shift = JIFFIES_SHIFT, 67 .shift = JIFFIES_SHIFT,
65}; 68};
66 69
70#if (BITS_PER_LONG < 64)
71u64 get_jiffies_64(void)
72{
73 unsigned long seq;
74 u64 ret;
75
76 do {
77 seq = read_seqbegin(&xtime_lock);
78 ret = jiffies_64;
79 } while (read_seqretry(&xtime_lock, seq));
80 return ret;
81}
82EXPORT_SYMBOL(get_jiffies_64);
83#endif
84
85EXPORT_SYMBOL(jiffies);
86
67static int __init init_jiffies_clocksource(void) 87static int __init init_jiffies_clocksource(void)
68{ 88{
69 return clocksource_register(&clocksource_jiffies); 89 return clocksource_register(&clocksource_jiffies);
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index d2321891538f..5f1bb8e2008f 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -14,6 +14,9 @@
14#include <linux/timex.h> 14#include <linux/timex.h>
15#include <linux/time.h> 15#include <linux/time.h>
16#include <linux/mm.h> 16#include <linux/mm.h>
17#include <linux/module.h>
18
19#include "tick-internal.h"
17 20
18/* 21/*
19 * NTP timekeeping variables: 22 * NTP timekeeping variables:
@@ -74,6 +77,162 @@ static long time_adjust;
74/* constant (boot-param configurable) NTP tick adjustment (upscaled) */ 77/* constant (boot-param configurable) NTP tick adjustment (upscaled) */
75static s64 ntp_tick_adj; 78static s64 ntp_tick_adj;
76 79
80#ifdef CONFIG_NTP_PPS
81
82/*
83 * The following variables are used when a pulse-per-second (PPS) signal
84 * is available. They establish the engineering parameters of the clock
85 * discipline loop when controlled by the PPS signal.
86 */
87#define PPS_VALID 10 /* PPS signal watchdog max (s) */
88#define PPS_POPCORN 4 /* popcorn spike threshold (shift) */
89#define PPS_INTMIN 2 /* min freq interval (s) (shift) */
90#define PPS_INTMAX 8 /* max freq interval (s) (shift) */
91#define PPS_INTCOUNT 4 /* number of consecutive good intervals to
92 increase pps_shift or consecutive bad
93 intervals to decrease it */
94#define PPS_MAXWANDER 100000 /* max PPS freq wander (ns/s) */
95
96static int pps_valid; /* signal watchdog counter */
97static long pps_tf[3]; /* phase median filter */
98static long pps_jitter; /* current jitter (ns) */
99static struct timespec pps_fbase; /* beginning of the last freq interval */
100static int pps_shift; /* current interval duration (s) (shift) */
101static int pps_intcnt; /* interval counter */
102static s64 pps_freq; /* frequency offset (scaled ns/s) */
103static long pps_stabil; /* current stability (scaled ns/s) */
104
105/*
106 * PPS signal quality monitors
107 */
108static long pps_calcnt; /* calibration intervals */
109static long pps_jitcnt; /* jitter limit exceeded */
110static long pps_stbcnt; /* stability limit exceeded */
111static long pps_errcnt; /* calibration errors */
112
113
114/* PPS kernel consumer compensates the whole phase error immediately.
115 * Otherwise, reduce the offset by a fixed factor times the time constant.
116 */
117static inline s64 ntp_offset_chunk(s64 offset)
118{
119 if (time_status & STA_PPSTIME && time_status & STA_PPSSIGNAL)
120 return offset;
121 else
122 return shift_right(offset, SHIFT_PLL + time_constant);
123}
124
125static inline void pps_reset_freq_interval(void)
126{
127 /* the PPS calibration interval may end
128 surprisingly early */
129 pps_shift = PPS_INTMIN;
130 pps_intcnt = 0;
131}
132
133/**
134 * pps_clear - Clears the PPS state variables
135 *
136 * Must be called while holding a write on the xtime_lock
137 */
138static inline void pps_clear(void)
139{
140 pps_reset_freq_interval();
141 pps_tf[0] = 0;
142 pps_tf[1] = 0;
143 pps_tf[2] = 0;
144 pps_fbase.tv_sec = pps_fbase.tv_nsec = 0;
145 pps_freq = 0;
146}
147
148/* Decrease pps_valid to indicate that another second has passed since
149 * the last PPS signal. When it reaches 0, indicate that PPS signal is
150 * missing.
151 *
152 * Must be called while holding a write on the xtime_lock
153 */
154static inline void pps_dec_valid(void)
155{
156 if (pps_valid > 0)
157 pps_valid--;
158 else {
159 time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER |
160 STA_PPSWANDER | STA_PPSERROR);
161 pps_clear();
162 }
163}
164
165static inline void pps_set_freq(s64 freq)
166{
167 pps_freq = freq;
168}
169
170static inline int is_error_status(int status)
171{
172 return (time_status & (STA_UNSYNC|STA_CLOCKERR))
173 /* PPS signal lost when either PPS time or
174 * PPS frequency synchronization requested
175 */
176 || ((time_status & (STA_PPSFREQ|STA_PPSTIME))
177 && !(time_status & STA_PPSSIGNAL))
178 /* PPS jitter exceeded when
179 * PPS time synchronization requested */
180 || ((time_status & (STA_PPSTIME|STA_PPSJITTER))
181 == (STA_PPSTIME|STA_PPSJITTER))
182 /* PPS wander exceeded or calibration error when
183 * PPS frequency synchronization requested
184 */
185 || ((time_status & STA_PPSFREQ)
186 && (time_status & (STA_PPSWANDER|STA_PPSERROR)));
187}
188
189static inline void pps_fill_timex(struct timex *txc)
190{
191 txc->ppsfreq = shift_right((pps_freq >> PPM_SCALE_INV_SHIFT) *
192 PPM_SCALE_INV, NTP_SCALE_SHIFT);
193 txc->jitter = pps_jitter;
194 if (!(time_status & STA_NANO))
195 txc->jitter /= NSEC_PER_USEC;
196 txc->shift = pps_shift;
197 txc->stabil = pps_stabil;
198 txc->jitcnt = pps_jitcnt;
199 txc->calcnt = pps_calcnt;
200 txc->errcnt = pps_errcnt;
201 txc->stbcnt = pps_stbcnt;
202}
203
204#else /* !CONFIG_NTP_PPS */
205
206static inline s64 ntp_offset_chunk(s64 offset)
207{
208 return shift_right(offset, SHIFT_PLL + time_constant);
209}
210
211static inline void pps_reset_freq_interval(void) {}
212static inline void pps_clear(void) {}
213static inline void pps_dec_valid(void) {}
214static inline void pps_set_freq(s64 freq) {}
215
216static inline int is_error_status(int status)
217{
218 return status & (STA_UNSYNC|STA_CLOCKERR);
219}
220
221static inline void pps_fill_timex(struct timex *txc)
222{
223 /* PPS is not implemented, so these are zero */
224 txc->ppsfreq = 0;
225 txc->jitter = 0;
226 txc->shift = 0;
227 txc->stabil = 0;
228 txc->jitcnt = 0;
229 txc->calcnt = 0;
230 txc->errcnt = 0;
231 txc->stbcnt = 0;
232}
233
234#endif /* CONFIG_NTP_PPS */
235
77/* 236/*
78 * NTP methods: 237 * NTP methods:
79 */ 238 */
@@ -185,6 +344,9 @@ void ntp_clear(void)
185 344
186 tick_length = tick_length_base; 345 tick_length = tick_length_base;
187 time_offset = 0; 346 time_offset = 0;
347
348 /* Clear PPS state variables */
349 pps_clear();
188} 350}
189 351
190/* 352/*
@@ -250,16 +412,16 @@ void second_overflow(void)
250 time_status |= STA_UNSYNC; 412 time_status |= STA_UNSYNC;
251 } 413 }
252 414
253 /* 415 /* Compute the phase adjustment for the next second */
254 * Compute the phase adjustment for the next second. The offset is
255 * reduced by a fixed factor times the time constant.
256 */
257 tick_length = tick_length_base; 416 tick_length = tick_length_base;
258 417
259 delta = shift_right(time_offset, SHIFT_PLL + time_constant); 418 delta = ntp_offset_chunk(time_offset);
260 time_offset -= delta; 419 time_offset -= delta;
261 tick_length += delta; 420 tick_length += delta;
262 421
422 /* Check PPS signal */
423 pps_dec_valid();
424
263 if (!time_adjust) 425 if (!time_adjust)
264 return; 426 return;
265 427
@@ -369,6 +531,8 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts)
369 if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) { 531 if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) {
370 time_state = TIME_OK; 532 time_state = TIME_OK;
371 time_status = STA_UNSYNC; 533 time_status = STA_UNSYNC;
534 /* restart PPS frequency calibration */
535 pps_reset_freq_interval();
372 } 536 }
373 537
374 /* 538 /*
@@ -418,6 +582,8 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts
418 time_freq = txc->freq * PPM_SCALE; 582 time_freq = txc->freq * PPM_SCALE;
419 time_freq = min(time_freq, MAXFREQ_SCALED); 583 time_freq = min(time_freq, MAXFREQ_SCALED);
420 time_freq = max(time_freq, -MAXFREQ_SCALED); 584 time_freq = max(time_freq, -MAXFREQ_SCALED);
585 /* update pps_freq */
586 pps_set_freq(time_freq);
421 } 587 }
422 588
423 if (txc->modes & ADJ_MAXERROR) 589 if (txc->modes & ADJ_MAXERROR)
@@ -482,6 +648,17 @@ int do_adjtimex(struct timex *txc)
482 hrtimer_cancel(&leap_timer); 648 hrtimer_cancel(&leap_timer);
483 } 649 }
484 650
651 if (txc->modes & ADJ_SETOFFSET) {
652 struct timespec delta;
653 delta.tv_sec = txc->time.tv_sec;
654 delta.tv_nsec = txc->time.tv_usec;
655 if (!(txc->modes & ADJ_NANO))
656 delta.tv_nsec *= 1000;
657 result = timekeeping_inject_offset(&delta);
658 if (result)
659 return result;
660 }
661
485 getnstimeofday(&ts); 662 getnstimeofday(&ts);
486 663
487 write_seqlock_irq(&xtime_lock); 664 write_seqlock_irq(&xtime_lock);
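The new ADJ_SETOFFSET branch lets adjtimex() apply a one-shot offset through timekeeping_inject_offset() instead of stepping the clock with settimeofday(). A hypothetical userspace sketch; the helper name and the 1.5 s value are illustrative, and it assumes ADJ_SETOFFSET and ADJ_NANO are visible in the installed headers:

	#include <sys/timex.h>

	/* step the system clock forward by 1.5 seconds in one call */
	static int step_clock_forward(void)
	{
		struct timex tx = { 0 };

		tx.modes = ADJ_SETOFFSET | ADJ_NANO;
		tx.time.tv_sec = 1;
		tx.time.tv_usec = 500000000;	/* nanoseconds, because ADJ_NANO is set */

		return adjtimex(&tx);		/* needs CAP_SYS_TIME to succeed */
	}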
@@ -508,7 +685,8 @@ int do_adjtimex(struct timex *txc)
508 } 685 }
509 686
510 result = time_state; /* mostly `TIME_OK' */ 687 result = time_state; /* mostly `TIME_OK' */
511 if (time_status & (STA_UNSYNC|STA_CLOCKERR)) 688 /* check for errors */
689 if (is_error_status(time_status))
512 result = TIME_ERROR; 690 result = TIME_ERROR;
513 691
514 txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) * 692 txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) *
@@ -522,15 +700,8 @@ int do_adjtimex(struct timex *txc)
522 txc->tick = tick_usec; 700 txc->tick = tick_usec;
523 txc->tai = time_tai; 701 txc->tai = time_tai;
524 702
525 /* PPS is not implemented, so these are zero */ 703 /* fill PPS status fields */
526 txc->ppsfreq = 0; 704 pps_fill_timex(txc);
527 txc->jitter = 0;
528 txc->shift = 0;
529 txc->stabil = 0;
530 txc->jitcnt = 0;
531 txc->calcnt = 0;
532 txc->errcnt = 0;
533 txc->stbcnt = 0;
534 705
535 write_sequnlock_irq(&xtime_lock); 706 write_sequnlock_irq(&xtime_lock);
536 707
@@ -544,6 +715,243 @@ int do_adjtimex(struct timex *txc)
544 return result; 715 return result;
545} 716}
546 717
718#ifdef CONFIG_NTP_PPS
719
720/* actually struct pps_normtime is good old struct timespec, but it is
721 * semantically different (and it is the reason why it was invented):
722 * pps_normtime.nsec has a range of ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ]
723 * while timespec.tv_nsec has a range of [0, NSEC_PER_SEC) */
724struct pps_normtime {
725 __kernel_time_t sec; /* seconds */
726 long nsec; /* nanoseconds */
727};
728
729/* normalize the timestamp so that nsec is in the
730 ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] interval */
731static inline struct pps_normtime pps_normalize_ts(struct timespec ts)
732{
733 struct pps_normtime norm = {
734 .sec = ts.tv_sec,
735 .nsec = ts.tv_nsec
736 };
737
738 if (norm.nsec > (NSEC_PER_SEC >> 1)) {
739 norm.nsec -= NSEC_PER_SEC;
740 norm.sec++;
741 }
742
743 return norm;
744}
745
746/* get current phase correction and jitter */
747static inline long pps_phase_filter_get(long *jitter)
748{
749 *jitter = pps_tf[0] - pps_tf[1];
750 if (*jitter < 0)
751 *jitter = -*jitter;
752
753 /* TODO: test various filters */
754 return pps_tf[0];
755}
756
757/* add the sample to the phase filter */
758static inline void pps_phase_filter_add(long err)
759{
760 pps_tf[2] = pps_tf[1];
761 pps_tf[1] = pps_tf[0];
762 pps_tf[0] = err;
763}
764
765/* decrease frequency calibration interval length.
766 * It is halved after four consecutive unstable intervals.
767 */
768static inline void pps_dec_freq_interval(void)
769{
770 if (--pps_intcnt <= -PPS_INTCOUNT) {
771 pps_intcnt = -PPS_INTCOUNT;
772 if (pps_shift > PPS_INTMIN) {
773 pps_shift--;
774 pps_intcnt = 0;
775 }
776 }
777}
778
779/* increase frequency calibration interval length.
780 * It is doubled after four consecutive stable intervals.
781 */
782static inline void pps_inc_freq_interval(void)
783{
784 if (++pps_intcnt >= PPS_INTCOUNT) {
785 pps_intcnt = PPS_INTCOUNT;
786 if (pps_shift < PPS_INTMAX) {
787 pps_shift++;
788 pps_intcnt = 0;
789 }
790 }
791}
792
793/* update clock frequency based on MONOTONIC_RAW clock PPS signal
794 * timestamps
795 *
796 * At the end of the calibration interval the difference between the
797 * first and last MONOTONIC_RAW clock timestamps divided by the length
798 * of the interval becomes the frequency update. If the interval was
799 * too long, the data are discarded.
800 * Returns the difference between old and new frequency values.
801 */
802static long hardpps_update_freq(struct pps_normtime freq_norm)
803{
804 long delta, delta_mod;
805 s64 ftemp;
806
807 /* check if the frequency interval was too long */
808 if (freq_norm.sec > (2 << pps_shift)) {
809 time_status |= STA_PPSERROR;
810 pps_errcnt++;
811 pps_dec_freq_interval();
812 pr_err("hardpps: PPSERROR: interval too long - %ld s\n",
813 freq_norm.sec);
814 return 0;
815 }
816
817 /* here the raw frequency offset and wander (stability) is
818 * calculated. If the wander is less than the wander threshold
819 * the interval is increased; otherwise it is decreased.
820 */
821 ftemp = div_s64(((s64)(-freq_norm.nsec)) << NTP_SCALE_SHIFT,
822 freq_norm.sec);
823 delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT);
824 pps_freq = ftemp;
825 if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) {
826 pr_warning("hardpps: PPSWANDER: change=%ld\n", delta);
827 time_status |= STA_PPSWANDER;
828 pps_stbcnt++;
829 pps_dec_freq_interval();
830 } else { /* good sample */
831 pps_inc_freq_interval();
832 }
833
834 /* the stability metric is calculated as the average of recent
835 * frequency changes, but is used only for performance
836 * monitoring
837 */
838 delta_mod = delta;
839 if (delta_mod < 0)
840 delta_mod = -delta_mod;
841 pps_stabil += (div_s64(((s64)delta_mod) <<
842 (NTP_SCALE_SHIFT - SHIFT_USEC),
843 NSEC_PER_USEC) - pps_stabil) >> PPS_INTMIN;
844
845 /* if enabled, the system clock frequency is updated */
846 if ((time_status & STA_PPSFREQ) != 0 &&
847 (time_status & STA_FREQHOLD) == 0) {
848 time_freq = pps_freq;
849 ntp_update_frequency();
850 }
851
852 return delta;
853}
854
855/* correct REALTIME clock phase error against PPS signal */
856static void hardpps_update_phase(long error)
857{
858 long correction = -error;
859 long jitter;
860
861 /* add the sample to the median filter */
862 pps_phase_filter_add(correction);
863 correction = pps_phase_filter_get(&jitter);
864
865 /* Nominal jitter is due to PPS signal noise. If it exceeds the
866 * threshold, the sample is discarded; otherwise, if so enabled,
867 * the time offset is updated.
868 */
869 if (jitter > (pps_jitter << PPS_POPCORN)) {
870 pr_warning("hardpps: PPSJITTER: jitter=%ld, limit=%ld\n",
871 jitter, (pps_jitter << PPS_POPCORN));
872 time_status |= STA_PPSJITTER;
873 pps_jitcnt++;
874 } else if (time_status & STA_PPSTIME) {
875 /* correct the time using the phase offset */
876 time_offset = div_s64(((s64)correction) << NTP_SCALE_SHIFT,
877 NTP_INTERVAL_FREQ);
878 /* cancel running adjtime() */
879 time_adjust = 0;
880 }
881 /* update jitter */
882 pps_jitter += (jitter - pps_jitter) >> PPS_INTMIN;
883}
884
885/*
886 * hardpps() - discipline CPU clock oscillator to external PPS signal
887 *
888 * This routine is called at each PPS signal arrival in order to
889 * discipline the CPU clock oscillator to the PPS signal. It takes two
890 * parameters: REALTIME and MONOTONIC_RAW clock timestamps. The former
891 * is used to correct clock phase error and the latter is used to
892 * correct the frequency.
893 *
894 * This code is based on David Mills's reference nanokernel
895 * implementation. It was mostly rewritten but keeps the same idea.
896 */
897void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
898{
899 struct pps_normtime pts_norm, freq_norm;
900 unsigned long flags;
901
902 pts_norm = pps_normalize_ts(*phase_ts);
903
904 write_seqlock_irqsave(&xtime_lock, flags);
905
906 /* clear the error bits, they will be set again if needed */
907 time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR);
908
909 /* indicate signal presence */
910 time_status |= STA_PPSSIGNAL;
911 pps_valid = PPS_VALID;
912
913 /* when called for the first time,
914 * just start the frequency interval */
915 if (unlikely(pps_fbase.tv_sec == 0)) {
916 pps_fbase = *raw_ts;
917 write_sequnlock_irqrestore(&xtime_lock, flags);
918 return;
919 }
920
921 /* ok, now we have a base for frequency calculation */
922 freq_norm = pps_normalize_ts(timespec_sub(*raw_ts, pps_fbase));
923
924 /* check that the signal is in the range
925 * [1s - MAXFREQ us, 1s + MAXFREQ us], otherwise reject it */
926 if ((freq_norm.sec == 0) ||
927 (freq_norm.nsec > MAXFREQ * freq_norm.sec) ||
928 (freq_norm.nsec < -MAXFREQ * freq_norm.sec)) {
929 time_status |= STA_PPSJITTER;
930 /* restart the frequency calibration interval */
931 pps_fbase = *raw_ts;
932 write_sequnlock_irqrestore(&xtime_lock, flags);
933 pr_err("hardpps: PPSJITTER: bad pulse\n");
934 return;
935 }
936
937 /* signal is ok */
938
939 /* check if the current frequency interval is finished */
940 if (freq_norm.sec >= (1 << pps_shift)) {
941 pps_calcnt++;
942 /* restart the frequency calibration interval */
943 pps_fbase = *raw_ts;
944 hardpps_update_freq(freq_norm);
945 }
946
947 hardpps_update_phase(pts_norm.nsec);
948
949 write_sequnlock_irqrestore(&xtime_lock, flags);
950}
951EXPORT_SYMBOL(hardpps);
952
953#endif /* CONFIG_NTP_PPS */
954
547static int __init ntp_tick_adj_setup(char *str) 955static int __init ntp_tick_adj_setup(char *str)
548{ 956{
549 ntp_tick_adj = simple_strtol(str, NULL, 0); 957 ntp_tick_adj = simple_strtol(str, NULL, 0);
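To make the calibration-interval logic above concrete: the interval length is 2^pps_shift seconds, bounded by PPS_INTMIN and PPS_INTMAX, so it ranges from 4 s to 256 s. Four consecutive stable intervals (frequency change within PPS_MAXWANDER) double the interval, and four consecutive noisy ones halve it. As a rough worked example, assuming the NTP_SCALE_SHIFT scaling used elsewhere in this file: if the raw clock drifted by +128 ns over a 64 s interval (pps_shift = 6), hardpps_update_freq() computes ftemp = (-128 << NTP_SCALE_SHIFT) / 64, i.e. -2 ns/s (about -2 ppb) once the scaling is removed, and the difference from the previous pps_freq is what gets compared against the wander threshold.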
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
new file mode 100644
index 000000000000..25028dd4fa18
--- /dev/null
+++ b/kernel/time/posix-clock.c
@@ -0,0 +1,451 @@
1/*
2 * posix-clock.c - support for dynamic clock devices
3 *
4 * Copyright (C) 2010 OMICRON electronics GmbH
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 */
20#include <linux/device.h>
21#include <linux/file.h>
22#include <linux/mutex.h>
23#include <linux/posix-clock.h>
24#include <linux/slab.h>
25#include <linux/syscalls.h>
26#include <linux/uaccess.h>
27
28static void delete_clock(struct kref *kref);
29
30/*
31 * Returns NULL if the posix_clock instance attached to 'fp' is old and stale.
32 */
33static struct posix_clock *get_posix_clock(struct file *fp)
34{
35 struct posix_clock *clk = fp->private_data;
36
37 mutex_lock(&clk->mutex);
38
39 if (!clk->zombie)
40 return clk;
41
42 mutex_unlock(&clk->mutex);
43
44 return NULL;
45}
46
47static void put_posix_clock(struct posix_clock *clk)
48{
49 mutex_unlock(&clk->mutex);
50}
51
52static ssize_t posix_clock_read(struct file *fp, char __user *buf,
53 size_t count, loff_t *ppos)
54{
55 struct posix_clock *clk = get_posix_clock(fp);
56 int err = -EINVAL;
57
58 if (!clk)
59 return -ENODEV;
60
61 if (clk->ops.read)
62 err = clk->ops.read(clk, fp->f_flags, buf, count);
63
64 put_posix_clock(clk);
65
66 return err;
67}
68
69static unsigned int posix_clock_poll(struct file *fp, poll_table *wait)
70{
71 struct posix_clock *clk = get_posix_clock(fp);
72 int result = 0;
73
74 if (!clk)
75 return -ENODEV;
76
77 if (clk->ops.poll)
78 result = clk->ops.poll(clk, fp, wait);
79
80 put_posix_clock(clk);
81
82 return result;
83}
84
85static int posix_clock_fasync(int fd, struct file *fp, int on)
86{
87 struct posix_clock *clk = get_posix_clock(fp);
88 int err = 0;
89
90 if (!clk)
91 return -ENODEV;
92
93 if (clk->ops.fasync)
94 err = clk->ops.fasync(clk, fd, fp, on);
95
96 put_posix_clock(clk);
97
98 return err;
99}
100
101static int posix_clock_mmap(struct file *fp, struct vm_area_struct *vma)
102{
103 struct posix_clock *clk = get_posix_clock(fp);
104 int err = -ENODEV;
105
106 if (!clk)
107 return -ENODEV;
108
109 if (clk->ops.mmap)
110 err = clk->ops.mmap(clk, vma);
111
112 put_posix_clock(clk);
113
114 return err;
115}
116
117static long posix_clock_ioctl(struct file *fp,
118 unsigned int cmd, unsigned long arg)
119{
120 struct posix_clock *clk = get_posix_clock(fp);
121 int err = -ENOTTY;
122
123 if (!clk)
124 return -ENODEV;
125
126 if (clk->ops.ioctl)
127 err = clk->ops.ioctl(clk, cmd, arg);
128
129 put_posix_clock(clk);
130
131 return err;
132}
133
134#ifdef CONFIG_COMPAT
135static long posix_clock_compat_ioctl(struct file *fp,
136 unsigned int cmd, unsigned long arg)
137{
138 struct posix_clock *clk = get_posix_clock(fp);
139 int err = -ENOTTY;
140
141 if (!clk)
142 return -ENODEV;
143
144 if (clk->ops.ioctl)
145 err = clk->ops.ioctl(clk, cmd, arg);
146
147 put_posix_clock(clk);
148
149 return err;
150}
151#endif
152
153static int posix_clock_open(struct inode *inode, struct file *fp)
154{
155 int err;
156 struct posix_clock *clk =
157 container_of(inode->i_cdev, struct posix_clock, cdev);
158
159 mutex_lock(&clk->mutex);
160
161 if (clk->zombie) {
162 err = -ENODEV;
163 goto out;
164 }
165 if (clk->ops.open)
166 err = clk->ops.open(clk, fp->f_mode);
167 else
168 err = 0;
169
170 if (!err) {
171 kref_get(&clk->kref);
172 fp->private_data = clk;
173 }
174out:
175 mutex_unlock(&clk->mutex);
176 return err;
177}
178
179static int posix_clock_release(struct inode *inode, struct file *fp)
180{
181 struct posix_clock *clk = fp->private_data;
182 int err = 0;
183
184 if (clk->ops.release)
185 err = clk->ops.release(clk);
186
187 kref_put(&clk->kref, delete_clock);
188
189 fp->private_data = NULL;
190
191 return err;
192}
193
194static const struct file_operations posix_clock_file_operations = {
195 .owner = THIS_MODULE,
196 .llseek = no_llseek,
197 .read = posix_clock_read,
198 .poll = posix_clock_poll,
199 .unlocked_ioctl = posix_clock_ioctl,
200 .open = posix_clock_open,
201 .release = posix_clock_release,
202 .fasync = posix_clock_fasync,
203 .mmap = posix_clock_mmap,
204#ifdef CONFIG_COMPAT
205 .compat_ioctl = posix_clock_compat_ioctl,
206#endif
207};
208
209int posix_clock_register(struct posix_clock *clk, dev_t devid)
210{
211 int err;
212
213 kref_init(&clk->kref);
214 mutex_init(&clk->mutex);
215
216 cdev_init(&clk->cdev, &posix_clock_file_operations);
217 clk->cdev.owner = clk->ops.owner;
218 err = cdev_add(&clk->cdev, devid, 1);
219 if (err)
220 goto no_cdev;
221
222 return err;
223no_cdev:
224 mutex_destroy(&clk->mutex);
225 return err;
226}
227EXPORT_SYMBOL_GPL(posix_clock_register);
228
229static void delete_clock(struct kref *kref)
230{
231 struct posix_clock *clk = container_of(kref, struct posix_clock, kref);
232 mutex_destroy(&clk->mutex);
233 if (clk->release)
234 clk->release(clk);
235}
236
237void posix_clock_unregister(struct posix_clock *clk)
238{
239 cdev_del(&clk->cdev);
240
241 mutex_lock(&clk->mutex);
242 clk->zombie = true;
243 mutex_unlock(&clk->mutex);
244
245 kref_put(&clk->kref, delete_clock);
246}
247EXPORT_SYMBOL_GPL(posix_clock_unregister);
248
249struct posix_clock_desc {
250 struct file *fp;
251 struct posix_clock *clk;
252};
253
254static int get_clock_desc(const clockid_t id, struct posix_clock_desc *cd)
255{
256 struct file *fp = fget(CLOCKID_TO_FD(id));
257 int err = -EINVAL;
258
259 if (!fp)
260 return err;
261
262 if (fp->f_op->open != posix_clock_open || !fp->private_data)
263 goto out;
264
265 cd->fp = fp;
266 cd->clk = get_posix_clock(fp);
267
268 err = cd->clk ? 0 : -ENODEV;
269out:
270 if (err)
271 fput(fp);
272 return err;
273}
274
275static void put_clock_desc(struct posix_clock_desc *cd)
276{
277 put_posix_clock(cd->clk);
278 fput(cd->fp);
279}
280
281static int pc_clock_adjtime(clockid_t id, struct timex *tx)
282{
283 struct posix_clock_desc cd;
284 int err;
285
286 err = get_clock_desc(id, &cd);
287 if (err)
288 return err;
289
290 if ((cd.fp->f_mode & FMODE_WRITE) == 0) {
291 err = -EACCES;
292 goto out;
293 }
294
295 if (cd.clk->ops.clock_adjtime)
296 err = cd.clk->ops.clock_adjtime(cd.clk, tx);
297 else
298 err = -EOPNOTSUPP;
299out:
300 put_clock_desc(&cd);
301
302 return err;
303}
304
305static int pc_clock_gettime(clockid_t id, struct timespec *ts)
306{
307 struct posix_clock_desc cd;
308 int err;
309
310 err = get_clock_desc(id, &cd);
311 if (err)
312 return err;
313
314 if (cd.clk->ops.clock_gettime)
315 err = cd.clk->ops.clock_gettime(cd.clk, ts);
316 else
317 err = -EOPNOTSUPP;
318
319 put_clock_desc(&cd);
320
321 return err;
322}
323
324static int pc_clock_getres(clockid_t id, struct timespec *ts)
325{
326 struct posix_clock_desc cd;
327 int err;
328
329 err = get_clock_desc(id, &cd);
330 if (err)
331 return err;
332
333 if (cd.clk->ops.clock_getres)
334 err = cd.clk->ops.clock_getres(cd.clk, ts);
335 else
336 err = -EOPNOTSUPP;
337
338 put_clock_desc(&cd);
339
340 return err;
341}
342
343static int pc_clock_settime(clockid_t id, const struct timespec *ts)
344{
345 struct posix_clock_desc cd;
346 int err;
347
348 err = get_clock_desc(id, &cd);
349 if (err)
350 return err;
351
352 if ((cd.fp->f_mode & FMODE_WRITE) == 0) {
353 err = -EACCES;
354 goto out;
355 }
356
357 if (cd.clk->ops.clock_settime)
358 err = cd.clk->ops.clock_settime(cd.clk, ts);
359 else
360 err = -EOPNOTSUPP;
361out:
362 put_clock_desc(&cd);
363
364 return err;
365}
366
367static int pc_timer_create(struct k_itimer *kit)
368{
369 clockid_t id = kit->it_clock;
370 struct posix_clock_desc cd;
371 int err;
372
373 err = get_clock_desc(id, &cd);
374 if (err)
375 return err;
376
377 if (cd.clk->ops.timer_create)
378 err = cd.clk->ops.timer_create(cd.clk, kit);
379 else
380 err = -EOPNOTSUPP;
381
382 put_clock_desc(&cd);
383
384 return err;
385}
386
387static int pc_timer_delete(struct k_itimer *kit)
388{
389 clockid_t id = kit->it_clock;
390 struct posix_clock_desc cd;
391 int err;
392
393 err = get_clock_desc(id, &cd);
394 if (err)
395 return err;
396
397 if (cd.clk->ops.timer_delete)
398 err = cd.clk->ops.timer_delete(cd.clk, kit);
399 else
400 err = -EOPNOTSUPP;
401
402 put_clock_desc(&cd);
403
404 return err;
405}
406
407static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec *ts)
408{
409 clockid_t id = kit->it_clock;
410 struct posix_clock_desc cd;
411
412 if (get_clock_desc(id, &cd))
413 return;
414
415 if (cd.clk->ops.timer_gettime)
416 cd.clk->ops.timer_gettime(cd.clk, kit, ts);
417
418 put_clock_desc(&cd);
419}
420
421static int pc_timer_settime(struct k_itimer *kit, int flags,
422 struct itimerspec *ts, struct itimerspec *old)
423{
424 clockid_t id = kit->it_clock;
425 struct posix_clock_desc cd;
426 int err;
427
428 err = get_clock_desc(id, &cd);
429 if (err)
430 return err;
431
432 if (cd.clk->ops.timer_settime)
433 err = cd.clk->ops.timer_settime(cd.clk, kit, flags, ts, old);
434 else
435 err = -EOPNOTSUPP;
436
437 put_clock_desc(&cd);
438
439 return err;
440}
441
442struct k_clock clock_posix_dynamic = {
443 .clock_getres = pc_clock_getres,
444 .clock_set = pc_clock_settime,
445 .clock_get = pc_clock_gettime,
446 .clock_adj = pc_clock_adjtime,
447 .timer_create = pc_timer_create,
448 .timer_set = pc_timer_settime,
449 .timer_del = pc_timer_delete,
450 .timer_get = pc_timer_gettime,
451};
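For context on how the new core is meant to be consumed: a clock driver embeds a struct posix_clock, fills in the ops, and registers it against a char-device number; userspace then reaches it through an fd-based clockid. A minimal hypothetical driver sketch (every "my_*" name is illustrative, and it assumes the ops structure is named posix_clock_operations as in the accompanying header):

	#include <linux/posix-clock.h>
	#include <linux/module.h>

	static struct posix_clock my_clock;

	static int my_clock_gettime(struct posix_clock *pc, struct timespec *ts)
	{
		/* a real driver would read its hardware counter here */
		ts->tv_sec = 0;
		ts->tv_nsec = 0;
		return 0;
	}

	static struct posix_clock_operations my_clock_ops = {
		.owner		= THIS_MODULE,
		.clock_gettime	= my_clock_gettime,
	};

	static int my_clock_create(dev_t devid)
	{
		my_clock.ops = my_clock_ops;
		/* adds the cdev; tear down with posix_clock_unregister(&my_clock) */
		return posix_clock_register(&my_clock, devid);
	}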
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 48b2761b5668..da800ffa810c 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -18,7 +18,6 @@
18#include <linux/percpu.h> 18#include <linux/percpu.h>
19#include <linux/profile.h> 19#include <linux/profile.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/tick.h>
22 21
23#include "tick-internal.h" 22#include "tick-internal.h"
24 23
@@ -600,4 +599,14 @@ int tick_broadcast_oneshot_active(void)
600 return tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT; 599 return tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT;
601} 600}
602 601
602/*
603 * Check whether the broadcast device supports oneshot.
604 */
605bool tick_broadcast_oneshot_available(void)
606{
607 struct clock_event_device *bc = tick_broadcast_device.evtdev;
608
609 return bc ? bc->features & CLOCK_EVT_FEAT_ONESHOT : false;
610}
611
603#endif 612#endif
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index b6b898d2eeef..119528de8235 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -18,7 +18,6 @@
18#include <linux/percpu.h> 18#include <linux/percpu.h>
19#include <linux/profile.h> 19#include <linux/profile.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/tick.h>
22 21
23#include <asm/irq_regs.h> 22#include <asm/irq_regs.h>
24 23
@@ -49,9 +48,13 @@ struct tick_device *tick_get_device(int cpu)
49 */ 48 */
50int tick_is_oneshot_available(void) 49int tick_is_oneshot_available(void)
51{ 50{
52 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; 51 struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
53 52
54 return dev && (dev->features & CLOCK_EVT_FEAT_ONESHOT); 53 if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT))
54 return 0;
55 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))
56 return 1;
57 return tick_broadcast_oneshot_available();
55} 58}
56 59
57/* 60/*
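Two things happen in the hunk above: a per-CPU device that stops in deep C-states (CLOCK_EVT_FEAT_C3STOP) now only counts as oneshot capable when the broadcast device can take over, and __get_cpu_var() is replaced by __this_cpu_read(), which reads a field of a per-CPU variable without first forming its address (typically a single segment-prefixed load on x86). The same conversion appears in tick-oneshot.c below. A small sketch with an illustrative per-CPU struct:

	#include <linux/percpu.h>

	struct example_tick_state {
		int mode;
	};

	static DEFINE_PER_CPU(struct example_tick_state, example_tick_state);

	static int example_current_mode(void)
	{
		/* equivalent to __get_cpu_var(example_tick_state).mode,
		 * but the field is read directly via the per-CPU offset */
		return __this_cpu_read(example_tick_state.mode);
	}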
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 290eefbc1f60..1009b06d6f89 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -1,6 +1,10 @@
1/* 1/*
2 * tick internal variable and functions used by low/high res code 2 * tick internal variable and functions used by low/high res code
3 */ 3 */
4#include <linux/hrtimer.h>
5#include <linux/tick.h>
6
7#ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD
4 8
5#define TICK_DO_TIMER_NONE -1 9#define TICK_DO_TIMER_NONE -1
6#define TICK_DO_TIMER_BOOT -2 10#define TICK_DO_TIMER_BOOT -2
@@ -36,6 +40,7 @@ extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup);
36extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc); 40extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc);
37extern int tick_broadcast_oneshot_active(void); 41extern int tick_broadcast_oneshot_active(void);
38extern void tick_check_oneshot_broadcast(int cpu); 42extern void tick_check_oneshot_broadcast(int cpu);
43bool tick_broadcast_oneshot_available(void);
39# else /* BROADCAST */ 44# else /* BROADCAST */
40static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) 45static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
41{ 46{
@@ -46,6 +51,7 @@ static inline void tick_broadcast_switch_to_oneshot(void) { }
46static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } 51static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
47static inline int tick_broadcast_oneshot_active(void) { return 0; } 52static inline int tick_broadcast_oneshot_active(void) { return 0; }
48static inline void tick_check_oneshot_broadcast(int cpu) { } 53static inline void tick_check_oneshot_broadcast(int cpu) { }
54static inline bool tick_broadcast_oneshot_available(void) { return true; }
49# endif /* !BROADCAST */ 55# endif /* !BROADCAST */
50 56
51#else /* !ONESHOT */ 57#else /* !ONESHOT */
@@ -76,6 +82,7 @@ static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
76 return 0; 82 return 0;
77} 83}
78static inline int tick_broadcast_oneshot_active(void) { return 0; } 84static inline int tick_broadcast_oneshot_active(void) { return 0; }
85static inline bool tick_broadcast_oneshot_available(void) { return false; }
79#endif /* !TICK_ONESHOT */ 86#endif /* !TICK_ONESHOT */
80 87
81/* 88/*
@@ -132,3 +139,8 @@ static inline int tick_device_is_functional(struct clock_event_device *dev)
132{ 139{
133 return !(dev->features & CLOCK_EVT_FEAT_DUMMY); 140 return !(dev->features & CLOCK_EVT_FEAT_DUMMY);
134} 141}
142
143#endif
144
145extern void do_timer(unsigned long ticks);
146extern seqlock_t xtime_lock;
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index aada0e52680a..2d04411a5f05 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -18,7 +18,6 @@
18#include <linux/percpu.h> 18#include <linux/percpu.h>
19#include <linux/profile.h> 19#include <linux/profile.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/tick.h>
22 21
23#include "tick-internal.h" 22#include "tick-internal.h"
24 23
@@ -95,7 +94,7 @@ int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires,
95 */ 94 */
96int tick_program_event(ktime_t expires, int force) 95int tick_program_event(ktime_t expires, int force)
97{ 96{
98 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; 97 struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
99 98
100 return tick_dev_program_event(dev, expires, force); 99 return tick_dev_program_event(dev, expires, force);
101} 100}
@@ -167,7 +166,7 @@ int tick_oneshot_mode_active(void)
167 int ret; 166 int ret;
168 167
169 local_irq_save(flags); 168 local_irq_save(flags);
170 ret = __get_cpu_var(tick_cpu_device).mode == TICKDEV_MODE_ONESHOT; 169 ret = __this_cpu_read(tick_cpu_device.mode) == TICKDEV_MODE_ONESHOT;
171 local_irq_restore(flags); 170 local_irq_restore(flags);
172 171
173 return ret; 172 return ret;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 3e216e01bbd1..d5097c44b407 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -19,7 +19,6 @@
19#include <linux/percpu.h> 19#include <linux/percpu.h>
20#include <linux/profile.h> 20#include <linux/profile.h>
21#include <linux/sched.h> 21#include <linux/sched.h>
22#include <linux/tick.h>
23#include <linux/module.h> 22#include <linux/module.h>
24 23
25#include <asm/irq_regs.h> 24#include <asm/irq_regs.h>
@@ -642,8 +641,7 @@ static void tick_nohz_switch_to_nohz(void)
642 } 641 }
643 local_irq_enable(); 642 local_irq_enable();
644 643
645 printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", 644 printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", smp_processor_id());
646 smp_processor_id());
647} 645}
648 646
649/* 647/*
@@ -795,8 +793,10 @@ void tick_setup_sched_timer(void)
795 } 793 }
796 794
797#ifdef CONFIG_NO_HZ 795#ifdef CONFIG_NO_HZ
798 if (tick_nohz_enabled) 796 if (tick_nohz_enabled) {
799 ts->nohz_mode = NOHZ_MODE_HIGHRES; 797 ts->nohz_mode = NOHZ_MODE_HIGHRES;
798 printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", smp_processor_id());
799 }
800#endif 800#endif
801} 801}
802#endif /* HIGH_RES_TIMERS */ 802#endif /* HIGH_RES_TIMERS */
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 5bb86da82003..3bd7e3d5c632 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -49,7 +49,7 @@ struct timekeeper {
49 u32 mult; 49 u32 mult;
50}; 50};
51 51
52struct timekeeper timekeeper; 52static struct timekeeper timekeeper;
53 53
54/** 54/**
55 * timekeeper_setup_internals - Set up internals to use clocksource clock. 55 * timekeeper_setup_internals - Set up internals to use clocksource clock.
@@ -164,7 +164,7 @@ static struct timespec total_sleep_time;
164/* 164/*
165 * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. 165 * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock.
166 */ 166 */
167struct timespec raw_time; 167static struct timespec raw_time;
168 168
169/* flag for if timekeeping is suspended */ 169/* flag for if timekeeping is suspended */
170int __read_mostly timekeeping_suspended; 170int __read_mostly timekeeping_suspended;
@@ -288,6 +288,49 @@ void ktime_get_ts(struct timespec *ts)
288} 288}
289EXPORT_SYMBOL_GPL(ktime_get_ts); 289EXPORT_SYMBOL_GPL(ktime_get_ts);
290 290
291#ifdef CONFIG_NTP_PPS
292
293/**
294 * getnstime_raw_and_real - get day and raw monotonic time in timespec format
295 * @ts_raw: pointer to the timespec to be set to raw monotonic time
296 * @ts_real: pointer to the timespec to be set to the time of day
297 *
298 * This function reads both the time of day and raw monotonic time at the
299 * same time atomically and stores the resulting timestamps in timespec
300 * format.
301 */
302void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
303{
304 unsigned long seq;
305 s64 nsecs_raw, nsecs_real;
306
307 WARN_ON_ONCE(timekeeping_suspended);
308
309 do {
310 u32 arch_offset;
311
312 seq = read_seqbegin(&xtime_lock);
313
314 *ts_raw = raw_time;
315 *ts_real = xtime;
316
317 nsecs_raw = timekeeping_get_ns_raw();
318 nsecs_real = timekeeping_get_ns();
319
320 /* If arch requires, add in gettimeoffset() */
321 arch_offset = arch_gettimeoffset();
322 nsecs_raw += arch_offset;
323 nsecs_real += arch_offset;
324
325 } while (read_seqretry(&xtime_lock, seq));
326
327 timespec_add_ns(ts_raw, nsecs_raw);
328 timespec_add_ns(ts_real, nsecs_real);
329}
330EXPORT_SYMBOL(getnstime_raw_and_real);
331
332#endif /* CONFIG_NTP_PPS */
333
291/** 334/**
292 * do_gettimeofday - Returns the time of day in a timeval 335 * do_gettimeofday - Returns the time of day in a timeval
293 * @tv: pointer to the timeval to be set 336 * @tv: pointer to the timeval to be set
@@ -310,7 +353,7 @@ EXPORT_SYMBOL(do_gettimeofday);
310 * 353 *
311 * Sets the time of day to the new time and update NTP and notify hrtimers 354 * Sets the time of day to the new time and update NTP and notify hrtimers
312 */ 355 */
313int do_settimeofday(struct timespec *tv) 356int do_settimeofday(const struct timespec *tv)
314{ 357{
315 struct timespec ts_delta; 358 struct timespec ts_delta;
316 unsigned long flags; 359 unsigned long flags;
@@ -344,6 +387,42 @@ int do_settimeofday(struct timespec *tv)
344 387
345EXPORT_SYMBOL(do_settimeofday); 388EXPORT_SYMBOL(do_settimeofday);
346 389
390
391/**
392 * timekeeping_inject_offset - Adds or subtracts from the current time.
393 * @tv: pointer to the timespec variable containing the offset
394 *
395 * Adds or subtracts an offset value from the current time.
396 */
397int timekeeping_inject_offset(struct timespec *ts)
398{
399 unsigned long flags;
400
401 if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC)
402 return -EINVAL;
403
404 write_seqlock_irqsave(&xtime_lock, flags);
405
406 timekeeping_forward_now();
407
408 xtime = timespec_add(xtime, *ts);
409 wall_to_monotonic = timespec_sub(wall_to_monotonic, *ts);
410
411 timekeeper.ntp_error = 0;
412 ntp_clear();
413
414 update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
415 timekeeper.mult);
416
417 write_sequnlock_irqrestore(&xtime_lock, flags);
418
419 /* signal hrtimers about time change */
420 clock_was_set();
421
422 return 0;
423}
424EXPORT_SYMBOL(timekeeping_inject_offset);
425
347/** 426/**
348 * change_clocksource - Swaps clocksources if a new one is available 427 * change_clocksource - Swaps clocksources if a new one is available
349 * 428 *
@@ -736,7 +815,7 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
736 * 815 *
737 * Called from the timer interrupt, must hold a write on xtime_lock. 816 * Called from the timer interrupt, must hold a write on xtime_lock.
738 */ 817 */
739void update_wall_time(void) 818static void update_wall_time(void)
740{ 819{
741 struct clocksource *clock; 820 struct clocksource *clock;
742 cycle_t offset; 821 cycle_t offset;
@@ -828,7 +907,7 @@ void update_wall_time(void)
828 * getboottime - Return the real time of system boot. 907 * getboottime - Return the real time of system boot.
829 * @ts: pointer to the timespec to be set 908 * @ts: pointer to the timespec to be set
830 * 909 *
831 * Returns the time of day in a timespec. 910 * Returns the wall-time of boot in a timespec.
832 * 911 *
833 * This is based on the wall_to_monotonic offset and the total suspend 912 * This is based on the wall_to_monotonic offset and the total suspend
834 * time. Calls to settimeofday will affect the value returned (which 913 * time. Calls to settimeofday will affect the value returned (which
@@ -846,6 +925,55 @@ void getboottime(struct timespec *ts)
846} 925}
847EXPORT_SYMBOL_GPL(getboottime); 926EXPORT_SYMBOL_GPL(getboottime);
848 927
928
929/**
930 * get_monotonic_boottime - Returns monotonic time since boot
931 * @ts: pointer to the timespec to be set
932 *
933 * Returns the monotonic time since boot in a timespec.
934 *
935 * This is similar to CLOCK_MONOTONIC/ktime_get_ts, but also
936 * includes the time spent in suspend.
937 */
938void get_monotonic_boottime(struct timespec *ts)
939{
940 struct timespec tomono, sleep;
941 unsigned int seq;
942 s64 nsecs;
943
944 WARN_ON(timekeeping_suspended);
945
946 do {
947 seq = read_seqbegin(&xtime_lock);
948 *ts = xtime;
949 tomono = wall_to_monotonic;
950 sleep = total_sleep_time;
951 nsecs = timekeeping_get_ns();
952
953 } while (read_seqretry(&xtime_lock, seq));
954
955 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec,
956 ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec + nsecs);
957}
958EXPORT_SYMBOL_GPL(get_monotonic_boottime);
959
960/**
961 * ktime_get_boottime - Returns monotonic time since boot in a ktime
962 *
963 * Returns the monotonic time since boot in a ktime
964 *
 965 * This is similar to CLOCK_MONOTONIC/ktime_get, but also
966 * includes the time spent in suspend.
967 */
968ktime_t ktime_get_boottime(void)
969{
970 struct timespec ts;
971
972 get_monotonic_boottime(&ts);
973 return timespec_to_ktime(ts);
974}
975EXPORT_SYMBOL_GPL(ktime_get_boottime);
976
849/** 977/**
850 * monotonic_to_bootbased - Convert the monotonic time to boot based. 978 * monotonic_to_bootbased - Convert the monotonic time to boot based.
851 * @ts: pointer to the timespec to be converted 979 * @ts: pointer to the timespec to be converted
@@ -867,11 +995,6 @@ struct timespec __current_kernel_time(void)
867 return xtime; 995 return xtime;
868} 996}
869 997
870struct timespec __get_wall_to_monotonic(void)
871{
872 return wall_to_monotonic;
873}
874
875struct timespec current_kernel_time(void) 998struct timespec current_kernel_time(void)
876{ 999{
877 struct timespec now; 1000 struct timespec now;
@@ -903,3 +1026,48 @@ struct timespec get_monotonic_coarse(void)
903 now.tv_nsec + mono.tv_nsec); 1026 now.tv_nsec + mono.tv_nsec);
904 return now; 1027 return now;
905} 1028}
1029
1030/*
1031 * The 64-bit jiffies value is not atomic - you MUST NOT read it
1032 * without sampling the sequence number in xtime_lock.
1033 * jiffies is defined in the linker script...
1034 */
1035void do_timer(unsigned long ticks)
1036{
1037 jiffies_64 += ticks;
1038 update_wall_time();
1039 calc_global_load(ticks);
1040}
1041
1042/**
1043 * get_xtime_and_monotonic_and_sleep_offset() - get xtime, wall_to_monotonic,
1044 * and sleep offsets.
1045 * @xtim: pointer to timespec to be set with xtime
1046 * @wtom: pointer to timespec to be set with wall_to_monotonic
1047 * @sleep: pointer to timespec to be set with time in suspend
1048 */
1049void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
1050 struct timespec *wtom, struct timespec *sleep)
1051{
1052 unsigned long seq;
1053
1054 do {
1055 seq = read_seqbegin(&xtime_lock);
1056 *xtim = xtime;
1057 *wtom = wall_to_monotonic;
1058 *sleep = total_sleep_time;
1059 } while (read_seqretry(&xtime_lock, seq));
1060}
1061
1062/**
1063 * xtime_update() - advances the timekeeping infrastructure
1064 * @ticks: number of ticks that have elapsed since the last call.
1065 *
1066 * Must be called with interrupts disabled.
1067 */
1068void xtime_update(unsigned long ticks)
1069{
1070 write_seqlock(&xtime_lock);
1071 do_timer(ticks);
1072 write_sequnlock(&xtime_lock);
1073}
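
The helpers above hand out a consistent snapshot of xtime, wall_to_monotonic and total_sleep_time so callers never have to touch xtime_lock themselves. Below is a minimal sketch of how a caller could combine the three values into a CLOCK_BOOTTIME-style timespec; it is not part of the patch, example_boottime() is an invented name, and it assumes the new helper's declaration is visible through linux/time.h.

#include <linux/time.h>

static struct timespec example_boottime(void)
{
	struct timespec xtim, wtom, sleep;

	/* one retry-consistent snapshot, taken under xtime_lock by the helper */
	get_xtime_and_monotonic_and_sleep_offset(&xtim, &wtom, &sleep);

	/*
	 * wall time + wall_to_monotonic = monotonic time; adding the
	 * accumulated sleep offset gives monotonic time including suspend.
	 */
	return timespec_add(timespec_add(xtim, wtom), sleep);
}
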
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 32a19f9397fc..3258455549f4 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -41,7 +41,7 @@ static void print_name_offset(struct seq_file *m, void *sym)
41 char symname[KSYM_NAME_LEN]; 41 char symname[KSYM_NAME_LEN];
42 42
43 if (lookup_symbol_name((unsigned long)sym, symname) < 0) 43 if (lookup_symbol_name((unsigned long)sym, symname) < 0)
44 SEQ_printf(m, "<%p>", sym); 44 SEQ_printf(m, "<%pK>", sym);
45 else 45 else
46 SEQ_printf(m, "%s", symname); 46 SEQ_printf(m, "%s", symname);
47} 47}
@@ -112,7 +112,7 @@ next_one:
112static void 112static void
113print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) 113print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
114{ 114{
115 SEQ_printf(m, " .base: %p\n", base); 115 SEQ_printf(m, " .base: %pK\n", base);
116 SEQ_printf(m, " .index: %d\n", 116 SEQ_printf(m, " .index: %d\n",
117 base->index); 117 base->index);
118 SEQ_printf(m, " .resolution: %Lu nsecs\n", 118 SEQ_printf(m, " .resolution: %Lu nsecs\n",
diff --git a/kernel/timer.c b/kernel/timer.c
index 43ca9936f2d0..fd6198692b57 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -404,6 +404,11 @@ static void timer_stats_account_timer(struct timer_list *timer) {}
404 404
405static struct debug_obj_descr timer_debug_descr; 405static struct debug_obj_descr timer_debug_descr;
406 406
407static void *timer_debug_hint(void *addr)
408{
409 return ((struct timer_list *) addr)->function;
410}
411
407/* 412/*
408 * fixup_init is called when: 413 * fixup_init is called when:
409 * - an active object is initialized 414 * - an active object is initialized
@@ -477,6 +482,7 @@ static int timer_fixup_free(void *addr, enum debug_obj_state state)
477 482
478static struct debug_obj_descr timer_debug_descr = { 483static struct debug_obj_descr timer_debug_descr = {
479 .name = "timer_list", 484 .name = "timer_list",
485 .debug_hint = timer_debug_hint,
480 .fixup_init = timer_fixup_init, 486 .fixup_init = timer_fixup_init,
481 .fixup_activate = timer_fixup_activate, 487 .fixup_activate = timer_fixup_activate,
482 .fixup_free = timer_fixup_free, 488 .fixup_free = timer_fixup_free,
@@ -959,20 +965,45 @@ EXPORT_SYMBOL(try_to_del_timer_sync);
959 * 965 *
960 * Synchronization rules: Callers must prevent restarting of the timer, 966 * Synchronization rules: Callers must prevent restarting of the timer,
961 * otherwise this function is meaningless. It must not be called from 967 * otherwise this function is meaningless. It must not be called from
962 * hardirq contexts. The caller must not hold locks which would prevent 968 * interrupt contexts. The caller must not hold locks which would prevent
963 * completion of the timer's handler. The timer's handler must not call 969 * completion of the timer's handler. The timer's handler must not call
964 * add_timer_on(). Upon exit the timer is not queued and the handler is 970 * add_timer_on(). Upon exit the timer is not queued and the handler is
965 * not running on any CPU. 971 * not running on any CPU.
966 * 972 *
973 * Note: You must not hold locks that are held in interrupt context
974 * while calling this function. Even if the lock has nothing to do
975 * with the timer in question. Here's why:
976 *
 977 *    CPU0                             CPU1
 978 *    ----                             ----
 979 *                                   <SOFTIRQ>
 980 *                                   call_timer_fn();
 981 *                                     base->running_timer = mytimer;
 982 *  spin_lock_irq(somelock);
 983 *                                     <IRQ>
 984 *                                        spin_lock(somelock);
 985 *  del_timer_sync(mytimer);
 986 *   while (base->running_timer == mytimer);
987 *
988 * Now del_timer_sync() will never return and never release somelock.
989 * The interrupt on the other CPU is waiting to grab somelock but
990 * it has interrupted the softirq that CPU0 is waiting to finish.
991 *
967 * The function returns whether it has deactivated a pending timer or not. 992 * The function returns whether it has deactivated a pending timer or not.
968 */ 993 */
969int del_timer_sync(struct timer_list *timer) 994int del_timer_sync(struct timer_list *timer)
970{ 995{
971#ifdef CONFIG_LOCKDEP 996#ifdef CONFIG_LOCKDEP
972 local_bh_disable(); 997 unsigned long flags;
998
999 /*
1000 * If lockdep gives a backtrace here, please reference
1001 * the synchronization rules above.
1002 */
1003 local_irq_save(flags);
973 lock_map_acquire(&timer->lockdep_map); 1004 lock_map_acquire(&timer->lockdep_map);
974 lock_map_release(&timer->lockdep_map); 1005 lock_map_release(&timer->lockdep_map);
975 local_bh_enable(); 1006 local_irq_restore(flags);
976#endif 1007#endif
977 /* 1008 /*
978 * don't use it in hardirq context, because it 1009 * don't use it in hardirq context, because it
@@ -1293,19 +1324,6 @@ void run_local_timers(void)
1293 raise_softirq(TIMER_SOFTIRQ); 1324 raise_softirq(TIMER_SOFTIRQ);
1294} 1325}
1295 1326
1296/*
1297 * The 64-bit jiffies value is not atomic - you MUST NOT read it
1298 * without sampling the sequence number in xtime_lock.
1299 * jiffies is defined in the linker script...
1300 */
1301
1302void do_timer(unsigned long ticks)
1303{
1304 jiffies_64 += ticks;
1305 update_wall_time();
1306 calc_global_load(ticks);
1307}
1308
1309#ifdef __ARCH_WANT_SYS_ALARM 1327#ifdef __ARCH_WANT_SYS_ALARM
1310 1328
1311/* 1329/*
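
The deadlock note added to del_timer_sync() above boils down to one rule: never call it while holding any lock that an interrupt handler can also take, even one unrelated to the timer. A hedged illustration of the wrong and the safe ordering in a hypothetical driver (mydev, its lock and its poll_timer are all invented for the example):

#include <linux/spinlock.h>
#include <linux/timer.h>

struct mydev {				/* hypothetical driver state */
	spinlock_t lock;		/* also taken from the device's IRQ handler */
	struct timer_list poll_timer;
	int shutting_down;
};

static void mydev_teardown(struct mydev *dev)
{
	/*
	 * WRONG (this is the CPU0/CPU1 scenario above):
	 *
	 *	spin_lock_irq(&dev->lock);
	 *	del_timer_sync(&dev->poll_timer);
	 *	spin_unlock_irq(&dev->lock);
	 *
	 * RIGHT: stop the timer first, with no interrupt-side locks held.
	 */
	del_timer_sync(&dev->poll_timer);

	spin_lock_irq(&dev->lock);
	dev->shutting_down = 1;
	spin_unlock_irq(&dev->lock);
}
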
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 53f338190b26..761c510a06c5 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -52,7 +52,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
52endif 52endif
53obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o 53obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
54obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o 54obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
55obj-$(CONFIG_EVENT_TRACING) += power-traces.o 55obj-$(CONFIG_TRACEPOINTS) += power-traces.o
56ifeq ($(CONFIG_TRACING),y) 56ifeq ($(CONFIG_TRACING),y)
57obj-$(CONFIG_KGDB_KDB) += trace_kdb.o 57obj-$(CONFIG_KGDB_KDB) += trace_kdb.o
58endif 58endif
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 7b8ec0281548..cbafed7d4f38 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -138,6 +138,13 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...)
138 !blk_tracer_enabled)) 138 !blk_tracer_enabled))
139 return; 139 return;
140 140
141 /*
142 * If the BLK_TC_NOTIFY action mask isn't set, don't send any note
143 * message to the trace.
144 */
145 if (!(bt->act_mask & BLK_TC_NOTIFY))
146 return;
147
141 local_irq_save(flags); 148 local_irq_save(flags);
142 buf = per_cpu_ptr(bt->msg_data, smp_processor_id()); 149 buf = per_cpu_ptr(bt->msg_data, smp_processor_id());
143 va_start(args, fmt); 150 va_start(args, fmt);
@@ -758,53 +765,58 @@ static void blk_add_trace_rq_complete(void *ignore,
758 * @q: queue the io is for 765 * @q: queue the io is for
759 * @bio: the source bio 766 * @bio: the source bio
760 * @what: the action 767 * @what: the action
768 * @error: error, if any
761 * 769 *
762 * Description: 770 * Description:
763 * Records an action against a bio. Will log the bio offset + size. 771 * Records an action against a bio. Will log the bio offset + size.
764 * 772 *
765 **/ 773 **/
766static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, 774static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
767 u32 what) 775 u32 what, int error)
768{ 776{
769 struct blk_trace *bt = q->blk_trace; 777 struct blk_trace *bt = q->blk_trace;
770 778
771 if (likely(!bt)) 779 if (likely(!bt))
772 return; 780 return;
773 781
782 if (!error && !bio_flagged(bio, BIO_UPTODATE))
783 error = EIO;
784
774 __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, 785 __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what,
775 !bio_flagged(bio, BIO_UPTODATE), 0, NULL); 786 error, 0, NULL);
776} 787}
777 788
778static void blk_add_trace_bio_bounce(void *ignore, 789static void blk_add_trace_bio_bounce(void *ignore,
779 struct request_queue *q, struct bio *bio) 790 struct request_queue *q, struct bio *bio)
780{ 791{
781 blk_add_trace_bio(q, bio, BLK_TA_BOUNCE); 792 blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0);
782} 793}
783 794
784static void blk_add_trace_bio_complete(void *ignore, 795static void blk_add_trace_bio_complete(void *ignore,
785 struct request_queue *q, struct bio *bio) 796 struct request_queue *q, struct bio *bio,
797 int error)
786{ 798{
787 blk_add_trace_bio(q, bio, BLK_TA_COMPLETE); 799 blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error);
788} 800}
789 801
790static void blk_add_trace_bio_backmerge(void *ignore, 802static void blk_add_trace_bio_backmerge(void *ignore,
791 struct request_queue *q, 803 struct request_queue *q,
792 struct bio *bio) 804 struct bio *bio)
793{ 805{
794 blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE); 806 blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0);
795} 807}
796 808
797static void blk_add_trace_bio_frontmerge(void *ignore, 809static void blk_add_trace_bio_frontmerge(void *ignore,
798 struct request_queue *q, 810 struct request_queue *q,
799 struct bio *bio) 811 struct bio *bio)
800{ 812{
801 blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE); 813 blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0);
802} 814}
803 815
804static void blk_add_trace_bio_queue(void *ignore, 816static void blk_add_trace_bio_queue(void *ignore,
805 struct request_queue *q, struct bio *bio) 817 struct request_queue *q, struct bio *bio)
806{ 818{
807 blk_add_trace_bio(q, bio, BLK_TA_QUEUE); 819 blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0);
808} 820}
809 821
810static void blk_add_trace_getrq(void *ignore, 822static void blk_add_trace_getrq(void *ignore,
@@ -812,7 +824,7 @@ static void blk_add_trace_getrq(void *ignore,
812 struct bio *bio, int rw) 824 struct bio *bio, int rw)
813{ 825{
814 if (bio) 826 if (bio)
815 blk_add_trace_bio(q, bio, BLK_TA_GETRQ); 827 blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0);
816 else { 828 else {
817 struct blk_trace *bt = q->blk_trace; 829 struct blk_trace *bt = q->blk_trace;
818 830
@@ -827,7 +839,7 @@ static void blk_add_trace_sleeprq(void *ignore,
827 struct bio *bio, int rw) 839 struct bio *bio, int rw)
828{ 840{
829 if (bio) 841 if (bio)
830 blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ); 842 blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0);
831 else { 843 else {
832 struct blk_trace *bt = q->blk_trace; 844 struct blk_trace *bt = q->blk_trace;
833 845
@@ -887,7 +899,7 @@ static void blk_add_trace_split(void *ignore,
887} 899}
888 900
889/** 901/**
890 * blk_add_trace_remap - Add a trace for a remap operation 902 * blk_add_trace_bio_remap - Add a trace for a bio-remap operation
891 * @ignore: trace callback data parameter (not used) 903 * @ignore: trace callback data parameter (not used)
892 * @q: queue the io is for 904 * @q: queue the io is for
893 * @bio: the source bio 905 * @bio: the source bio
@@ -899,9 +911,9 @@ static void blk_add_trace_split(void *ignore,
899 * it spans a stripe (or similar). Add a trace for that action. 911 * it spans a stripe (or similar). Add a trace for that action.
900 * 912 *
901 **/ 913 **/
902static void blk_add_trace_remap(void *ignore, 914static void blk_add_trace_bio_remap(void *ignore,
903 struct request_queue *q, struct bio *bio, 915 struct request_queue *q, struct bio *bio,
904 dev_t dev, sector_t from) 916 dev_t dev, sector_t from)
905{ 917{
906 struct blk_trace *bt = q->blk_trace; 918 struct blk_trace *bt = q->blk_trace;
907 struct blk_io_trace_remap r; 919 struct blk_io_trace_remap r;
@@ -1016,7 +1028,7 @@ static void blk_register_tracepoints(void)
1016 WARN_ON(ret); 1028 WARN_ON(ret);
1017 ret = register_trace_block_split(blk_add_trace_split, NULL); 1029 ret = register_trace_block_split(blk_add_trace_split, NULL);
1018 WARN_ON(ret); 1030 WARN_ON(ret);
1019 ret = register_trace_block_remap(blk_add_trace_remap, NULL); 1031 ret = register_trace_block_bio_remap(blk_add_trace_bio_remap, NULL);
1020 WARN_ON(ret); 1032 WARN_ON(ret);
1021 ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); 1033 ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
1022 WARN_ON(ret); 1034 WARN_ON(ret);
@@ -1025,7 +1037,7 @@ static void blk_register_tracepoints(void)
1025static void blk_unregister_tracepoints(void) 1037static void blk_unregister_tracepoints(void)
1026{ 1038{
1027 unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); 1039 unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
1028 unregister_trace_block_remap(blk_add_trace_remap, NULL); 1040 unregister_trace_block_bio_remap(blk_add_trace_bio_remap, NULL);
1029 unregister_trace_block_split(blk_add_trace_split, NULL); 1041 unregister_trace_block_split(blk_add_trace_split, NULL);
1030 unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL); 1042 unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL);
1031 unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL); 1043 unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL);
@@ -1815,21 +1827,5 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
1815 rwbs[i] = '\0'; 1827 rwbs[i] = '\0';
1816} 1828}
1817 1829
1818void blk_fill_rwbs_rq(char *rwbs, struct request *rq)
1819{
1820 int rw = rq->cmd_flags & 0x03;
1821 int bytes;
1822
1823 if (rq->cmd_flags & REQ_DISCARD)
1824 rw |= REQ_DISCARD;
1825
1826 if (rq->cmd_flags & REQ_SECURE)
1827 rw |= REQ_SECURE;
1828
1829 bytes = blk_rq_bytes(rq);
1830
1831 blk_fill_rwbs(rwbs, rw, bytes);
1832}
1833
1834#endif /* CONFIG_EVENT_TRACING */ 1830#endif /* CONFIG_EVENT_TRACING */
1835 1831
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index f3dadae83883..888b611897d3 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3328,7 +3328,7 @@ static int start_graph_tracing(void)
3328 /* The cpu_boot init_task->ret_stack will never be freed */ 3328 /* The cpu_boot init_task->ret_stack will never be freed */
3329 for_each_online_cpu(cpu) { 3329 for_each_online_cpu(cpu) {
3330 if (!idle_task(cpu)->ret_stack) 3330 if (!idle_task(cpu)->ret_stack)
3331 ftrace_graph_init_task(idle_task(cpu)); 3331 ftrace_graph_init_idle_task(idle_task(cpu), cpu);
3332 } 3332 }
3333 3333
3334 do { 3334 do {
@@ -3418,6 +3418,49 @@ void unregister_ftrace_graph(void)
3418 mutex_unlock(&ftrace_lock); 3418 mutex_unlock(&ftrace_lock);
3419} 3419}
3420 3420
3421static DEFINE_PER_CPU(struct ftrace_ret_stack *, idle_ret_stack);
3422
3423static void
3424graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack)
3425{
3426 atomic_set(&t->tracing_graph_pause, 0);
3427 atomic_set(&t->trace_overrun, 0);
3428 t->ftrace_timestamp = 0;
3429 /* make curr_ret_stack visible before we add the ret_stack */
3430 smp_wmb();
3431 t->ret_stack = ret_stack;
3432}
3433
3434/*
3435 * Allocate a return stack for the idle task. May be the first
3436 * time through, or it may be done by CPU hotplug online.
3437 */
3438void ftrace_graph_init_idle_task(struct task_struct *t, int cpu)
3439{
3440 t->curr_ret_stack = -1;
3441 /*
3442 * The idle task has no parent, it either has its own
3443 * stack or no stack at all.
3444 */
3445 if (t->ret_stack)
3446 WARN_ON(t->ret_stack != per_cpu(idle_ret_stack, cpu));
3447
3448 if (ftrace_graph_active) {
3449 struct ftrace_ret_stack *ret_stack;
3450
3451 ret_stack = per_cpu(idle_ret_stack, cpu);
3452 if (!ret_stack) {
3453 ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH
3454 * sizeof(struct ftrace_ret_stack),
3455 GFP_KERNEL);
3456 if (!ret_stack)
3457 return;
3458 per_cpu(idle_ret_stack, cpu) = ret_stack;
3459 }
3460 graph_init_task(t, ret_stack);
3461 }
3462}
3463
3421/* Allocate a return stack for newly created task */ 3464/* Allocate a return stack for newly created task */
3422void ftrace_graph_init_task(struct task_struct *t) 3465void ftrace_graph_init_task(struct task_struct *t)
3423{ 3466{
@@ -3433,12 +3476,7 @@ void ftrace_graph_init_task(struct task_struct *t)
3433 GFP_KERNEL); 3476 GFP_KERNEL);
3434 if (!ret_stack) 3477 if (!ret_stack)
3435 return; 3478 return;
3436 atomic_set(&t->tracing_graph_pause, 0); 3479 graph_init_task(t, ret_stack);
3437 atomic_set(&t->trace_overrun, 0);
3438 t->ftrace_timestamp = 0;
3439 /* make curr_ret_stack visable before we add the ret_stack */
3440 smp_wmb();
3441 t->ret_stack = ret_stack;
3442 } 3480 }
3443} 3481}
3444 3482
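
ftrace_graph_init_idle_task() keeps one lazily allocated ret_stack per CPU and reuses it across hotplug instead of reallocating on every online. The same shape in a self-contained, hypothetical form (example_buf and example_get_buf() are not kernel symbols):

#include <linux/percpu.h>
#include <linux/slab.h>

static DEFINE_PER_CPU(int *, example_buf);

static int *example_get_buf(int cpu)
{
	int *buf = per_cpu(example_buf, cpu);

	if (!buf) {
		buf = kmalloc(128 * sizeof(*buf), GFP_KERNEL);
		if (!buf)
			return NULL;
		per_cpu(example_buf, cpu) = buf;
	}
	/* intentionally never freed: reused when the CPU comes back online */
	return buf;
}
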
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index bd1c35a4fbcc..db7b439d23ee 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -5,7 +5,6 @@
5 */ 5 */
6#include <linux/ring_buffer.h> 6#include <linux/ring_buffer.h>
7#include <linux/trace_clock.h> 7#include <linux/trace_clock.h>
8#include <linux/ftrace_irq.h>
9#include <linux/spinlock.h> 8#include <linux/spinlock.h>
10#include <linux/debugfs.h> 9#include <linux/debugfs.h>
11#include <linux/uaccess.h> 10#include <linux/uaccess.h>
@@ -1429,6 +1428,17 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
1429} 1428}
1430EXPORT_SYMBOL_GPL(ring_buffer_resize); 1429EXPORT_SYMBOL_GPL(ring_buffer_resize);
1431 1430
1431void ring_buffer_change_overwrite(struct ring_buffer *buffer, int val)
1432{
1433 mutex_lock(&buffer->mutex);
1434 if (val)
1435 buffer->flags |= RB_FL_OVERWRITE;
1436 else
1437 buffer->flags &= ~RB_FL_OVERWRITE;
1438 mutex_unlock(&buffer->mutex);
1439}
1440EXPORT_SYMBOL_GPL(ring_buffer_change_overwrite);
1441
1432static inline void * 1442static inline void *
1433__rb_data_page_index(struct buffer_data_page *bpage, unsigned index) 1443__rb_data_page_index(struct buffer_data_page *bpage, unsigned index)
1434{ 1444{
@@ -2162,11 +2172,19 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2162 if (likely(ts >= cpu_buffer->write_stamp)) { 2172 if (likely(ts >= cpu_buffer->write_stamp)) {
2163 delta = diff; 2173 delta = diff;
2164 if (unlikely(test_time_stamp(delta))) { 2174 if (unlikely(test_time_stamp(delta))) {
2175 int local_clock_stable = 1;
2176#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
2177 local_clock_stable = sched_clock_stable;
2178#endif
2165 WARN_ONCE(delta > (1ULL << 59), 2179 WARN_ONCE(delta > (1ULL << 59),
2166 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n", 2180 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s",
2167 (unsigned long long)delta, 2181 (unsigned long long)delta,
2168 (unsigned long long)ts, 2182 (unsigned long long)ts,
2169 (unsigned long long)cpu_buffer->write_stamp); 2183 (unsigned long long)cpu_buffer->write_stamp,
2184 local_clock_stable ? "" :
2185 "If you just came from a suspend/resume,\n"
2186 "please switch to the trace global clock:\n"
2187 " echo global > /sys/kernel/debug/tracing/trace_clock\n");
2170 add_timestamp = 1; 2188 add_timestamp = 1;
2171 } 2189 }
2172 } 2190 }
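
ring_buffer_change_overwrite() lets a tracer flip at runtime between overwrite mode (old events are dropped when the buffer is full) and producer-stall mode, which the trace.c hunks below wire up to the new "overwrite" option. A hedged usage sketch, with example_* names invented for illustration:

#include <linux/errno.h>
#include <linux/ring_buffer.h>

static struct ring_buffer *example_buffer;

static int example_setup(unsigned long size, int overwrite)
{
	example_buffer = ring_buffer_alloc(size, overwrite ? RB_FL_OVERWRITE : 0);
	return example_buffer ? 0 : -ENOMEM;
}

static void example_set_overwrite(int overwrite)
{
	/* safe at runtime: the flag is flipped under the buffer mutex */
	ring_buffer_change_overwrite(example_buffer, overwrite);
}
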
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index f8cf959bad45..9541c27c1cf2 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -41,8 +41,6 @@
41#include "trace.h" 41#include "trace.h"
42#include "trace_output.h" 42#include "trace_output.h"
43 43
44#define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE)
45
46/* 44/*
47 * On boot up, the ring buffer is set to the minimum size, so that 45 * On boot up, the ring buffer is set to the minimum size, so that
48 * we do not waste memory on systems that are not using tracing. 46 * we do not waste memory on systems that are not using tracing.
@@ -340,7 +338,7 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
340/* trace_flags holds trace_options default values */ 338/* trace_flags holds trace_options default values */
341unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | 339unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
342 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | 340 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
343 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD; 341 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE;
344 342
345static int trace_stop_count; 343static int trace_stop_count;
346static DEFINE_SPINLOCK(tracing_start_lock); 344static DEFINE_SPINLOCK(tracing_start_lock);
@@ -425,6 +423,7 @@ static const char *trace_options[] = {
425 "sleep-time", 423 "sleep-time",
426 "graph-time", 424 "graph-time",
427 "record-cmd", 425 "record-cmd",
426 "overwrite",
428 NULL 427 NULL
429}; 428};
430 429
@@ -780,6 +779,11 @@ __acquires(kernel_lock)
780 tracing_reset_online_cpus(tr); 779 tracing_reset_online_cpus(tr);
781 780
782 current_trace = type; 781 current_trace = type;
782
783 /* If we expanded the buffers, make sure the max is expanded too */
784 if (ring_buffer_expanded && type->use_max_tr)
785 ring_buffer_resize(max_tr.buffer, trace_buf_size);
786
783 /* the test is responsible for initializing and enabling */ 787 /* the test is responsible for initializing and enabling */
784 pr_info("Testing tracer %s: ", type->name); 788 pr_info("Testing tracer %s: ", type->name);
785 ret = type->selftest(type, tr); 789 ret = type->selftest(type, tr);
@@ -792,6 +796,10 @@ __acquires(kernel_lock)
792 /* Only reset on passing, to avoid touching corrupted buffers */ 796 /* Only reset on passing, to avoid touching corrupted buffers */
793 tracing_reset_online_cpus(tr); 797 tracing_reset_online_cpus(tr);
794 798
799 /* Shrink the max buffer again */
800 if (ring_buffer_expanded && type->use_max_tr)
801 ring_buffer_resize(max_tr.buffer, 1);
802
795 printk(KERN_CONT "PASSED\n"); 803 printk(KERN_CONT "PASSED\n");
796 } 804 }
797#endif 805#endif
@@ -1102,7 +1110,6 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
1102 1110
1103 entry->preempt_count = pc & 0xff; 1111 entry->preempt_count = pc & 0xff;
1104 entry->pid = (tsk) ? tsk->pid : 0; 1112 entry->pid = (tsk) ? tsk->pid : 0;
1105 entry->lock_depth = (tsk) ? tsk->lock_depth : 0;
1106 entry->flags = 1113 entry->flags =
1107#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT 1114#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
1108 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | 1115 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
@@ -1313,12 +1320,10 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1313 1320
1314 __this_cpu_inc(user_stack_count); 1321 __this_cpu_inc(user_stack_count);
1315 1322
1316
1317
1318 event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, 1323 event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,
1319 sizeof(*entry), flags, pc); 1324 sizeof(*entry), flags, pc);
1320 if (!event) 1325 if (!event)
1321 return; 1326 goto out_drop_count;
1322 entry = ring_buffer_event_data(event); 1327 entry = ring_buffer_event_data(event);
1323 1328
1324 entry->tgid = current->tgid; 1329 entry->tgid = current->tgid;
@@ -1333,8 +1338,8 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1333 if (!filter_check_discard(call, entry, buffer, event)) 1338 if (!filter_check_discard(call, entry, buffer, event))
1334 ring_buffer_unlock_commit(buffer, event); 1339 ring_buffer_unlock_commit(buffer, event);
1335 1340
1341 out_drop_count:
1336 __this_cpu_dec(user_stack_count); 1342 __this_cpu_dec(user_stack_count);
1337
1338 out: 1343 out:
1339 preempt_enable(); 1344 preempt_enable();
1340} 1345}
@@ -1751,10 +1756,9 @@ static void print_lat_help_header(struct seq_file *m)
1751 seq_puts(m, "# | / _----=> need-resched \n"); 1756 seq_puts(m, "# | / _----=> need-resched \n");
1752 seq_puts(m, "# || / _---=> hardirq/softirq \n"); 1757 seq_puts(m, "# || / _---=> hardirq/softirq \n");
1753 seq_puts(m, "# ||| / _--=> preempt-depth \n"); 1758 seq_puts(m, "# ||| / _--=> preempt-depth \n");
1754 seq_puts(m, "# |||| /_--=> lock-depth \n"); 1759 seq_puts(m, "# |||| / delay \n");
1755 seq_puts(m, "# |||||/ delay \n"); 1760 seq_puts(m, "# cmd pid ||||| time | caller \n");
1756 seq_puts(m, "# cmd pid |||||| time | caller \n"); 1761 seq_puts(m, "# \\ / ||||| \\ | / \n");
1757 seq_puts(m, "# \\ / |||||| \\ | / \n");
1758} 1762}
1759 1763
1760static void print_func_help_header(struct seq_file *m) 1764static void print_func_help_header(struct seq_file *m)
@@ -2531,6 +2535,9 @@ static void set_tracer_flags(unsigned int mask, int enabled)
2531 2535
2532 if (mask == TRACE_ITER_RECORD_CMD) 2536 if (mask == TRACE_ITER_RECORD_CMD)
2533 trace_event_enable_cmd_record(enabled); 2537 trace_event_enable_cmd_record(enabled);
2538
2539 if (mask == TRACE_ITER_OVERWRITE)
2540 ring_buffer_change_overwrite(global_trace.buffer, enabled);
2534} 2541}
2535 2542
2536static ssize_t 2543static ssize_t
@@ -2712,6 +2719,10 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf,
2712 2719
2713 mutex_lock(&trace_types_lock); 2720 mutex_lock(&trace_types_lock);
2714 if (tracer_enabled ^ val) { 2721 if (tracer_enabled ^ val) {
2722
2723 /* Only need to warn if this is used to change the state */
2724 WARN_ONCE(1, "tracing_enabled is deprecated. Use tracing_on");
2725
2715 if (val) { 2726 if (val) {
2716 tracer_enabled = 1; 2727 tracer_enabled = 1;
2717 if (current_trace->start) 2728 if (current_trace->start)
@@ -4553,9 +4564,11 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
4553__init static int tracer_alloc_buffers(void) 4564__init static int tracer_alloc_buffers(void)
4554{ 4565{
4555 int ring_buf_size; 4566 int ring_buf_size;
4567 enum ring_buffer_flags rb_flags;
4556 int i; 4568 int i;
4557 int ret = -ENOMEM; 4569 int ret = -ENOMEM;
4558 4570
4571
4559 if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) 4572 if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL))
4560 goto out; 4573 goto out;
4561 4574
@@ -4568,12 +4581,13 @@ __init static int tracer_alloc_buffers(void)
4568 else 4581 else
4569 ring_buf_size = 1; 4582 ring_buf_size = 1;
4570 4583
4584 rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0;
4585
4571 cpumask_copy(tracing_buffer_mask, cpu_possible_mask); 4586 cpumask_copy(tracing_buffer_mask, cpu_possible_mask);
4572 cpumask_copy(tracing_cpumask, cpu_all_mask); 4587 cpumask_copy(tracing_cpumask, cpu_all_mask);
4573 4588
4574 /* TODO: make the number of buffers hot pluggable with CPUS */ 4589 /* TODO: make the number of buffers hot pluggable with CPUS */
4575 global_trace.buffer = ring_buffer_alloc(ring_buf_size, 4590 global_trace.buffer = ring_buffer_alloc(ring_buf_size, rb_flags);
4576 TRACE_BUFFER_FLAGS);
4577 if (!global_trace.buffer) { 4591 if (!global_trace.buffer) {
4578 printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); 4592 printk(KERN_ERR "tracer: failed to allocate ring buffer!\n");
4579 WARN_ON(1); 4593 WARN_ON(1);
@@ -4583,7 +4597,7 @@ __init static int tracer_alloc_buffers(void)
4583 4597
4584 4598
4585#ifdef CONFIG_TRACER_MAX_TRACE 4599#ifdef CONFIG_TRACER_MAX_TRACE
4586 max_tr.buffer = ring_buffer_alloc(1, TRACE_BUFFER_FLAGS); 4600 max_tr.buffer = ring_buffer_alloc(1, rb_flags);
4587 if (!max_tr.buffer) { 4601 if (!max_tr.buffer) {
4588 printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); 4602 printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n");
4589 WARN_ON(1); 4603 WARN_ON(1);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 9021f8c0c0c3..5e9dfc6286dd 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -272,8 +272,8 @@ struct tracer {
272 /* If you handled the flag setting, return 0 */ 272 /* If you handled the flag setting, return 0 */
273 int (*set_flag)(u32 old_flags, u32 bit, int set); 273 int (*set_flag)(u32 old_flags, u32 bit, int set);
274 struct tracer *next; 274 struct tracer *next;
275 int print_max;
276 struct tracer_flags *flags; 275 struct tracer_flags *flags;
276 int print_max;
277 int use_max_tr; 277 int use_max_tr;
278}; 278};
279 279
@@ -606,6 +606,7 @@ enum trace_iterator_flags {
606 TRACE_ITER_SLEEP_TIME = 0x40000, 606 TRACE_ITER_SLEEP_TIME = 0x40000,
607 TRACE_ITER_GRAPH_TIME = 0x80000, 607 TRACE_ITER_GRAPH_TIME = 0x80000,
608 TRACE_ITER_RECORD_CMD = 0x100000, 608 TRACE_ITER_RECORD_CMD = 0x100000,
609 TRACE_ITER_OVERWRITE = 0x200000,
609}; 610};
610 611
611/* 612/*
@@ -661,8 +662,10 @@ struct ftrace_event_field {
661}; 662};
662 663
663struct event_filter { 664struct event_filter {
664 int n_preds; 665 int n_preds; /* Number assigned */
665 struct filter_pred **preds; 666 int a_preds; /* allocated */
667 struct filter_pred *preds;
668 struct filter_pred *root;
666 char *filter_string; 669 char *filter_string;
667}; 670};
668 671
@@ -674,11 +677,23 @@ struct event_subsystem {
674 int nr_events; 677 int nr_events;
675}; 678};
676 679
680#define FILTER_PRED_INVALID ((unsigned short)-1)
681#define FILTER_PRED_IS_RIGHT (1 << 15)
682#define FILTER_PRED_FOLD (1 << 15)
683
684/*
685 * The max preds is the size of unsigned short with
686 * two flags at the MSBs. One bit is used for both the IS_RIGHT
687 * and FOLD flags. The other is reserved.
688 *
689 * 2^14 preds is way more than enough.
690 */
691#define MAX_FILTER_PRED 16384
692
677struct filter_pred; 693struct filter_pred;
678struct regex; 694struct regex;
679 695
680typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event, 696typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event);
681 int val1, int val2);
682 697
683typedef int (*regex_match_func)(char *str, struct regex *r, int len); 698typedef int (*regex_match_func)(char *str, struct regex *r, int len);
684 699
@@ -700,11 +715,23 @@ struct filter_pred {
700 filter_pred_fn_t fn; 715 filter_pred_fn_t fn;
701 u64 val; 716 u64 val;
702 struct regex regex; 717 struct regex regex;
703 char *field_name; 718 /*
719 * Leaf nodes use field_name, ops is used by AND and OR
720 * nodes. The field_name is always freed when freeing a pred.
721 * We can overload field_name for ops and have it freed
722 * as well.
723 */
724 union {
725 char *field_name;
726 unsigned short *ops;
727 };
704 int offset; 728 int offset;
705 int not; 729 int not;
706 int op; 730 int op;
707 int pop_n; 731 unsigned short index;
732 unsigned short parent;
733 unsigned short left;
734 unsigned short right;
708}; 735};
709 736
710extern struct list_head ftrace_common_fields; 737extern struct list_head ftrace_common_fields;
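
The reworked filter_pred stores the whole expression tree in one flat array: left and right hold child indices, parent holds the parent's index with FILTER_PRED_IS_RIGHT tagging which child we are, and a left of FILTER_PRED_INVALID marks a leaf. A small user-space model of just that plumbing follows; the FOLD bit and the real kernel types are deliberately left out.

#include <stdio.h>

#define PRED_INVALID	((unsigned short)-1)
#define PRED_IS_RIGHT	(1 << 15)

struct pred {
	unsigned short index, parent, left, right;
	const char *what;
};

int main(void)
{
	/* "a == 1 && b == 2": leaves at slots 0 and 1, the AND at slot 2 (the root) */
	struct pred p[3] = {
		{ .index = 0, .parent = 2,                 .left = PRED_INVALID, .what = "a == 1" },
		{ .index = 1, .parent = 2 | PRED_IS_RIGHT, .left = PRED_INVALID, .what = "b == 2" },
		{ .index = 2, .left = 0, .right = 1,       .what = "&&" }, /* parent unused for the root */
	};
	const struct pred *leaf = &p[1];
	const struct pred *up = &p[leaf->parent & ~PRED_IS_RIGHT];

	printf("\"%s\" is the %s child of \"%s\"\n", leaf->what,
	       (leaf->parent & PRED_IS_RIGHT) ? "right" : "left", up->what);
	return 0;
}
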
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index e3dfecaf13e6..1516cb3ec549 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -53,7 +53,7 @@
53 */ 53 */
54 54
55/* 55/*
56 * Function trace entry - function address and parent function addres: 56 * Function trace entry - function address and parent function address:
57 */ 57 */
58FTRACE_ENTRY(function, ftrace_entry, 58FTRACE_ENTRY(function, ftrace_entry,
59 59
@@ -109,12 +109,12 @@ FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry,
109 */ 109 */
110#define FTRACE_CTX_FIELDS \ 110#define FTRACE_CTX_FIELDS \
111 __field( unsigned int, prev_pid ) \ 111 __field( unsigned int, prev_pid ) \
112 __field( unsigned int, next_pid ) \
113 __field( unsigned int, next_cpu ) \
112 __field( unsigned char, prev_prio ) \ 114 __field( unsigned char, prev_prio ) \
113 __field( unsigned char, prev_state ) \ 115 __field( unsigned char, prev_state ) \
114 __field( unsigned int, next_pid ) \
115 __field( unsigned char, next_prio ) \ 116 __field( unsigned char, next_prio ) \
116 __field( unsigned char, next_state ) \ 117 __field( unsigned char, next_state )
117 __field( unsigned int, next_cpu )
118 118
119FTRACE_ENTRY(context_switch, ctx_switch_entry, 119FTRACE_ENTRY(context_switch, ctx_switch_entry,
120 120
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 35fde09b81de..e88f74fe1d4c 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -116,7 +116,6 @@ static int trace_define_common_fields(void)
116 __common_field(unsigned char, flags); 116 __common_field(unsigned char, flags);
117 __common_field(unsigned char, preempt_count); 117 __common_field(unsigned char, preempt_count);
118 __common_field(int, pid); 118 __common_field(int, pid);
119 __common_field(int, lock_depth);
120 119
121 return ret; 120 return ret;
122} 121}
@@ -326,6 +325,7 @@ int trace_set_clr_event(const char *system, const char *event, int set)
326{ 325{
327 return __ftrace_set_clr_event(NULL, system, event, set); 326 return __ftrace_set_clr_event(NULL, system, event, set);
328} 327}
328EXPORT_SYMBOL_GPL(trace_set_clr_event);
329 329
330/* 128 should be much more than enough */ 330/* 128 should be much more than enough */
331#define EVENT_BUF_SIZE 127 331#define EVENT_BUF_SIZE 127
@@ -1284,7 +1284,7 @@ trace_create_file_ops(struct module *mod)
1284static void trace_module_add_events(struct module *mod) 1284static void trace_module_add_events(struct module *mod)
1285{ 1285{
1286 struct ftrace_module_file_ops *file_ops = NULL; 1286 struct ftrace_module_file_ops *file_ops = NULL;
1287 struct ftrace_event_call *call, *start, *end; 1287 struct ftrace_event_call **call, **start, **end;
1288 1288
1289 start = mod->trace_events; 1289 start = mod->trace_events;
1290 end = mod->trace_events + mod->num_trace_events; 1290 end = mod->trace_events + mod->num_trace_events;
@@ -1297,7 +1297,7 @@ static void trace_module_add_events(struct module *mod)
1297 return; 1297 return;
1298 1298
1299 for_each_event(call, start, end) { 1299 for_each_event(call, start, end) {
1300 __trace_add_event_call(call, mod, 1300 __trace_add_event_call(*call, mod,
1301 &file_ops->id, &file_ops->enable, 1301 &file_ops->id, &file_ops->enable,
1302 &file_ops->filter, &file_ops->format); 1302 &file_ops->filter, &file_ops->format);
1303 } 1303 }
@@ -1367,8 +1367,8 @@ static struct notifier_block trace_module_nb = {
1367 .priority = 0, 1367 .priority = 0,
1368}; 1368};
1369 1369
1370extern struct ftrace_event_call __start_ftrace_events[]; 1370extern struct ftrace_event_call *__start_ftrace_events[];
1371extern struct ftrace_event_call __stop_ftrace_events[]; 1371extern struct ftrace_event_call *__stop_ftrace_events[];
1372 1372
1373static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata; 1373static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata;
1374 1374
@@ -1384,7 +1384,7 @@ __setup("trace_event=", setup_trace_event);
1384 1384
1385static __init int event_trace_init(void) 1385static __init int event_trace_init(void)
1386{ 1386{
1387 struct ftrace_event_call *call; 1387 struct ftrace_event_call **call;
1388 struct dentry *d_tracer; 1388 struct dentry *d_tracer;
1389 struct dentry *entry; 1389 struct dentry *entry;
1390 struct dentry *d_events; 1390 struct dentry *d_events;
@@ -1430,7 +1430,7 @@ static __init int event_trace_init(void)
1430 pr_warning("tracing: Failed to allocate common fields"); 1430 pr_warning("tracing: Failed to allocate common fields");
1431 1431
1432 for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { 1432 for_each_event(call, __start_ftrace_events, __stop_ftrace_events) {
1433 __trace_add_event_call(call, NULL, &ftrace_event_id_fops, 1433 __trace_add_event_call(*call, NULL, &ftrace_event_id_fops,
1434 &ftrace_enable_fops, 1434 &ftrace_enable_fops,
1435 &ftrace_event_filter_fops, 1435 &ftrace_event_filter_fops,
1436 &ftrace_event_format_fops); 1436 &ftrace_event_format_fops);
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 36d40104b17f..3249b4f77ef0 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -123,9 +123,13 @@ struct filter_parse_state {
123 } operand; 123 } operand;
124}; 124};
125 125
126struct pred_stack {
127 struct filter_pred **preds;
128 int index;
129};
130
126#define DEFINE_COMPARISON_PRED(type) \ 131#define DEFINE_COMPARISON_PRED(type) \
127static int filter_pred_##type(struct filter_pred *pred, void *event, \ 132static int filter_pred_##type(struct filter_pred *pred, void *event) \
128 int val1, int val2) \
129{ \ 133{ \
130 type *addr = (type *)(event + pred->offset); \ 134 type *addr = (type *)(event + pred->offset); \
131 type val = (type)pred->val; \ 135 type val = (type)pred->val; \
@@ -152,8 +156,7 @@ static int filter_pred_##type(struct filter_pred *pred, void *event, \
152} 156}
153 157
154#define DEFINE_EQUALITY_PRED(size) \ 158#define DEFINE_EQUALITY_PRED(size) \
155static int filter_pred_##size(struct filter_pred *pred, void *event, \ 159static int filter_pred_##size(struct filter_pred *pred, void *event) \
156 int val1, int val2) \
157{ \ 160{ \
158 u##size *addr = (u##size *)(event + pred->offset); \ 161 u##size *addr = (u##size *)(event + pred->offset); \
159 u##size val = (u##size)pred->val; \ 162 u##size val = (u##size)pred->val; \
@@ -178,23 +181,8 @@ DEFINE_EQUALITY_PRED(32);
178DEFINE_EQUALITY_PRED(16); 181DEFINE_EQUALITY_PRED(16);
179DEFINE_EQUALITY_PRED(8); 182DEFINE_EQUALITY_PRED(8);
180 183
181static int filter_pred_and(struct filter_pred *pred __attribute((unused)),
182 void *event __attribute((unused)),
183 int val1, int val2)
184{
185 return val1 && val2;
186}
187
188static int filter_pred_or(struct filter_pred *pred __attribute((unused)),
189 void *event __attribute((unused)),
190 int val1, int val2)
191{
192 return val1 || val2;
193}
194
195/* Filter predicate for fixed sized arrays of characters */ 184/* Filter predicate for fixed sized arrays of characters */
196static int filter_pred_string(struct filter_pred *pred, void *event, 185static int filter_pred_string(struct filter_pred *pred, void *event)
197 int val1, int val2)
198{ 186{
199 char *addr = (char *)(event + pred->offset); 187 char *addr = (char *)(event + pred->offset);
200 int cmp, match; 188 int cmp, match;
@@ -207,8 +195,7 @@ static int filter_pred_string(struct filter_pred *pred, void *event,
207} 195}
208 196
209/* Filter predicate for char * pointers */ 197/* Filter predicate for char * pointers */
210static int filter_pred_pchar(struct filter_pred *pred, void *event, 198static int filter_pred_pchar(struct filter_pred *pred, void *event)
211 int val1, int val2)
212{ 199{
213 char **addr = (char **)(event + pred->offset); 200 char **addr = (char **)(event + pred->offset);
214 int cmp, match; 201 int cmp, match;
@@ -231,8 +218,7 @@ static int filter_pred_pchar(struct filter_pred *pred, void *event,
231 * and add it to the address of the entry, and at last we have 218 * and add it to the address of the entry, and at last we have
232 * the address of the string. 219 * the address of the string.
233 */ 220 */
234static int filter_pred_strloc(struct filter_pred *pred, void *event, 221static int filter_pred_strloc(struct filter_pred *pred, void *event)
235 int val1, int val2)
236{ 222{
237 u32 str_item = *(u32 *)(event + pred->offset); 223 u32 str_item = *(u32 *)(event + pred->offset);
238 int str_loc = str_item & 0xffff; 224 int str_loc = str_item & 0xffff;
@@ -247,8 +233,7 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event,
247 return match; 233 return match;
248} 234}
249 235
250static int filter_pred_none(struct filter_pred *pred, void *event, 236static int filter_pred_none(struct filter_pred *pred, void *event)
251 int val1, int val2)
252{ 237{
253 return 0; 238 return 0;
254} 239}
@@ -377,32 +362,147 @@ static void filter_build_regex(struct filter_pred *pred)
377 pred->not ^= not; 362 pred->not ^= not;
378} 363}
379 364
365enum move_type {
366 MOVE_DOWN,
367 MOVE_UP_FROM_LEFT,
368 MOVE_UP_FROM_RIGHT
369};
370
371static struct filter_pred *
372get_pred_parent(struct filter_pred *pred, struct filter_pred *preds,
373 int index, enum move_type *move)
374{
375 if (pred->parent & FILTER_PRED_IS_RIGHT)
376 *move = MOVE_UP_FROM_RIGHT;
377 else
378 *move = MOVE_UP_FROM_LEFT;
379 pred = &preds[pred->parent & ~FILTER_PRED_IS_RIGHT];
380
381 return pred;
382}
383
384/*
 385 * A series of ANDs or ORs was found together. Instead of
386 * climbing up and down the tree branches, an array of the
387 * ops were made in order of checks. We can just move across
388 * the array and short circuit if needed.
389 */
390static int process_ops(struct filter_pred *preds,
391 struct filter_pred *op, void *rec)
392{
393 struct filter_pred *pred;
394 int type;
395 int match;
396 int i;
397
398 /*
399 * Micro-optimization: We set type to true if op
400 * is an OR and false otherwise (AND). Then we
401 * just need to test if the match is equal to
402 * the type, and if it is, we can short circuit the
403 * rest of the checks:
404 *
405 * if ((match && op->op == OP_OR) ||
406 * (!match && op->op == OP_AND))
407 * return match;
408 */
409 type = op->op == OP_OR;
410
411 for (i = 0; i < op->val; i++) {
412 pred = &preds[op->ops[i]];
413 match = pred->fn(pred, rec);
414 if (!!match == type)
415 return match;
416 }
417 return match;
418}
419
380/* return 1 if event matches, 0 otherwise (discard) */ 420/* return 1 if event matches, 0 otherwise (discard) */
381int filter_match_preds(struct event_filter *filter, void *rec) 421int filter_match_preds(struct event_filter *filter, void *rec)
382{ 422{
383 int match, top = 0, val1 = 0, val2 = 0; 423 int match = -1;
384 int stack[MAX_FILTER_PRED]; 424 enum move_type move = MOVE_DOWN;
425 struct filter_pred *preds;
385 struct filter_pred *pred; 426 struct filter_pred *pred;
386 int i; 427 struct filter_pred *root;
428 int n_preds;
429 int done = 0;
430
431 /* no filter is considered a match */
432 if (!filter)
433 return 1;
434
435 n_preds = filter->n_preds;
436
437 if (!n_preds)
438 return 1;
439
440 /*
 441 * n_preds, root and filter->preds are protected with preemption disabled.
442 */
443 preds = rcu_dereference_sched(filter->preds);
444 root = rcu_dereference_sched(filter->root);
445 if (!root)
446 return 1;
447
448 pred = root;
387 449
388 for (i = 0; i < filter->n_preds; i++) { 450 /* match is currently meaningless */
389 pred = filter->preds[i]; 451 match = -1;
390 if (!pred->pop_n) { 452
391 match = pred->fn(pred, rec, val1, val2); 453 do {
392 stack[top++] = match; 454 switch (move) {
455 case MOVE_DOWN:
456 /* only AND and OR have children */
457 if (pred->left != FILTER_PRED_INVALID) {
458 /* If ops is set, then it was folded. */
459 if (!pred->ops) {
460 /* keep going to down the left side */
461 pred = &preds[pred->left];
462 continue;
463 }
464 /* We can treat folded ops as a leaf node */
465 match = process_ops(preds, pred, rec);
466 } else
467 match = pred->fn(pred, rec);
468 /* If this pred is the only pred */
469 if (pred == root)
470 break;
471 pred = get_pred_parent(pred, preds,
472 pred->parent, &move);
473 continue;
474 case MOVE_UP_FROM_LEFT:
475 /*
476 * Check for short circuits.
477 *
478 * Optimization: !!match == (pred->op == OP_OR)
479 * is the same as:
480 * if ((match && pred->op == OP_OR) ||
481 * (!match && pred->op == OP_AND))
482 */
483 if (!!match == (pred->op == OP_OR)) {
484 if (pred == root)
485 break;
486 pred = get_pred_parent(pred, preds,
487 pred->parent, &move);
488 continue;
489 }
490 /* now go down the right side of the tree. */
491 pred = &preds[pred->right];
492 move = MOVE_DOWN;
493 continue;
494 case MOVE_UP_FROM_RIGHT:
495 /* We finished this equation. */
496 if (pred == root)
497 break;
498 pred = get_pred_parent(pred, preds,
499 pred->parent, &move);
393 continue; 500 continue;
394 } 501 }
395 if (pred->pop_n > top) { 502 done = 1;
396 WARN_ON_ONCE(1); 503 } while (!done);
397 return 0;
398 }
399 val1 = stack[--top];
400 val2 = stack[--top];
401 match = pred->fn(pred, rec, val1, val2);
402 stack[top++] = match;
403 }
404 504
405 return stack[--top]; 505 return match;
406} 506}
407EXPORT_SYMBOL_GPL(filter_match_preds); 507EXPORT_SYMBOL_GPL(filter_match_preds);
408 508
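
The walker's short-circuit test leans on the identity spelled out in the comments: !!match == (op == OP_OR) behaves exactly like "(match && OR) || (!match && AND)". A quick stand-alone check of that equivalence (user-space only; the OP_* values are local to the example):

#include <assert.h>

enum { OP_AND, OP_OR };

int main(void)
{
	for (int op = OP_AND; op <= OP_OR; op++)
		for (int match = 0; match <= 1; match++) {
			int longhand = (match && op == OP_OR) ||
				       (!match && op == OP_AND);
			int shorthand = !!match == (op == OP_OR);

			assert(longhand == shorthand);
		}
	return 0;
}
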
@@ -414,6 +514,9 @@ static void parse_error(struct filter_parse_state *ps, int err, int pos)
414 514
415static void remove_filter_string(struct event_filter *filter) 515static void remove_filter_string(struct event_filter *filter)
416{ 516{
517 if (!filter)
518 return;
519
417 kfree(filter->filter_string); 520 kfree(filter->filter_string);
418 filter->filter_string = NULL; 521 filter->filter_string = NULL;
419} 522}
@@ -473,9 +576,10 @@ static void append_filter_err(struct filter_parse_state *ps,
473 576
474void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) 577void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s)
475{ 578{
476 struct event_filter *filter = call->filter; 579 struct event_filter *filter;
477 580
478 mutex_lock(&event_mutex); 581 mutex_lock(&event_mutex);
582 filter = call->filter;
479 if (filter && filter->filter_string) 583 if (filter && filter->filter_string)
480 trace_seq_printf(s, "%s\n", filter->filter_string); 584 trace_seq_printf(s, "%s\n", filter->filter_string);
481 else 585 else
@@ -486,9 +590,10 @@ void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s)
486void print_subsystem_event_filter(struct event_subsystem *system, 590void print_subsystem_event_filter(struct event_subsystem *system,
487 struct trace_seq *s) 591 struct trace_seq *s)
488{ 592{
489 struct event_filter *filter = system->filter; 593 struct event_filter *filter;
490 594
491 mutex_lock(&event_mutex); 595 mutex_lock(&event_mutex);
596 filter = system->filter;
492 if (filter && filter->filter_string) 597 if (filter && filter->filter_string)
493 trace_seq_printf(s, "%s\n", filter->filter_string); 598 trace_seq_printf(s, "%s\n", filter->filter_string);
494 else 599 else
@@ -539,10 +644,58 @@ static void filter_clear_pred(struct filter_pred *pred)
539 pred->regex.len = 0; 644 pred->regex.len = 0;
540} 645}
541 646
542static int filter_set_pred(struct filter_pred *dest, 647static int __alloc_pred_stack(struct pred_stack *stack, int n_preds)
648{
649 stack->preds = kzalloc(sizeof(*stack->preds)*(n_preds + 1), GFP_KERNEL);
650 if (!stack->preds)
651 return -ENOMEM;
652 stack->index = n_preds;
653 return 0;
654}
655
656static void __free_pred_stack(struct pred_stack *stack)
657{
658 kfree(stack->preds);
659 stack->index = 0;
660}
661
662static int __push_pred_stack(struct pred_stack *stack,
663 struct filter_pred *pred)
664{
665 int index = stack->index;
666
667 if (WARN_ON(index == 0))
668 return -ENOSPC;
669
670 stack->preds[--index] = pred;
671 stack->index = index;
672 return 0;
673}
674
675static struct filter_pred *
676__pop_pred_stack(struct pred_stack *stack)
677{
678 struct filter_pred *pred;
679 int index = stack->index;
680
681 pred = stack->preds[index++];
682 if (!pred)
683 return NULL;
684
685 stack->index = index;
686 return pred;
687}
688
689static int filter_set_pred(struct event_filter *filter,
690 int idx,
691 struct pred_stack *stack,
543 struct filter_pred *src, 692 struct filter_pred *src,
544 filter_pred_fn_t fn) 693 filter_pred_fn_t fn)
545{ 694{
695 struct filter_pred *dest = &filter->preds[idx];
696 struct filter_pred *left;
697 struct filter_pred *right;
698
546 *dest = *src; 699 *dest = *src;
547 if (src->field_name) { 700 if (src->field_name) {
548 dest->field_name = kstrdup(src->field_name, GFP_KERNEL); 701 dest->field_name = kstrdup(src->field_name, GFP_KERNEL);
@@ -550,116 +703,140 @@ static int filter_set_pred(struct filter_pred *dest,
550 return -ENOMEM; 703 return -ENOMEM;
551 } 704 }
552 dest->fn = fn; 705 dest->fn = fn;
706 dest->index = idx;
553 707
554 return 0; 708 if (dest->op == OP_OR || dest->op == OP_AND) {
709 right = __pop_pred_stack(stack);
710 left = __pop_pred_stack(stack);
711 if (!left || !right)
712 return -EINVAL;
713 /*
714 * If both children can be folded
715 * and they are the same op as this op or a leaf,
716 * then this op can be folded.
717 */
718 if (left->index & FILTER_PRED_FOLD &&
719 (left->op == dest->op ||
720 left->left == FILTER_PRED_INVALID) &&
721 right->index & FILTER_PRED_FOLD &&
722 (right->op == dest->op ||
723 right->left == FILTER_PRED_INVALID))
724 dest->index |= FILTER_PRED_FOLD;
725
726 dest->left = left->index & ~FILTER_PRED_FOLD;
727 dest->right = right->index & ~FILTER_PRED_FOLD;
728 left->parent = dest->index & ~FILTER_PRED_FOLD;
729 right->parent = dest->index | FILTER_PRED_IS_RIGHT;
730 } else {
731 /*
732 * Make dest->left invalid to be used as a quick
733 * way to know this is a leaf node.
734 */
735 dest->left = FILTER_PRED_INVALID;
736
737 /* All leafs allow folding the parent ops. */
738 dest->index |= FILTER_PRED_FOLD;
739 }
740
741 return __push_pred_stack(stack, dest);
555} 742}
556 743
557static void filter_disable_preds(struct ftrace_event_call *call) 744static void __free_preds(struct event_filter *filter)
558{ 745{
559 struct event_filter *filter = call->filter;
560 int i; 746 int i;
561 747
562 call->flags &= ~TRACE_EVENT_FL_FILTERED; 748 if (filter->preds) {
749 for (i = 0; i < filter->a_preds; i++)
750 kfree(filter->preds[i].field_name);
751 kfree(filter->preds);
752 filter->preds = NULL;
753 }
754 filter->a_preds = 0;
563 filter->n_preds = 0; 755 filter->n_preds = 0;
564
565 for (i = 0; i < MAX_FILTER_PRED; i++)
566 filter->preds[i]->fn = filter_pred_none;
567} 756}
568 757
569static void __free_preds(struct event_filter *filter) 758static void filter_disable(struct ftrace_event_call *call)
570{ 759{
571 int i; 760 call->flags &= ~TRACE_EVENT_FL_FILTERED;
761}
572 762
763static void __free_filter(struct event_filter *filter)
764{
573 if (!filter) 765 if (!filter)
574 return; 766 return;
575 767
576 for (i = 0; i < MAX_FILTER_PRED; i++) { 768 __free_preds(filter);
577 if (filter->preds[i])
578 filter_free_pred(filter->preds[i]);
579 }
580 kfree(filter->preds);
581 kfree(filter->filter_string); 769 kfree(filter->filter_string);
582 kfree(filter); 770 kfree(filter);
583} 771}
584 772
773/*
774 * Called when destroying the ftrace_event_call.
775 * The call is being freed, so we do not need to worry about
776 * the call being currently used. This is for module code removing
777 * the tracepoints from within it.
778 */
585void destroy_preds(struct ftrace_event_call *call) 779void destroy_preds(struct ftrace_event_call *call)
586{ 780{
587 __free_preds(call->filter); 781 __free_filter(call->filter);
588 call->filter = NULL; 782 call->filter = NULL;
589 call->flags &= ~TRACE_EVENT_FL_FILTERED;
590} 783}
591 784
592static struct event_filter *__alloc_preds(void) 785static struct event_filter *__alloc_filter(void)
593{ 786{
594 struct event_filter *filter; 787 struct event_filter *filter;
788
789 filter = kzalloc(sizeof(*filter), GFP_KERNEL);
790 return filter;
791}
792
793static int __alloc_preds(struct event_filter *filter, int n_preds)
794{
595 struct filter_pred *pred; 795 struct filter_pred *pred;
596 int i; 796 int i;
597 797
598 filter = kzalloc(sizeof(*filter), GFP_KERNEL); 798 if (filter->preds)
599 if (!filter) 799 __free_preds(filter);
600 return ERR_PTR(-ENOMEM);
601 800
602 filter->n_preds = 0; 801 filter->preds =
802 kzalloc(sizeof(*filter->preds) * n_preds, GFP_KERNEL);
603 803
604 filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL);
605 if (!filter->preds) 804 if (!filter->preds)
606 goto oom; 805 return -ENOMEM;
607 806
608 for (i = 0; i < MAX_FILTER_PRED; i++) { 807 filter->a_preds = n_preds;
609 pred = kzalloc(sizeof(*pred), GFP_KERNEL); 808 filter->n_preds = 0;
610 if (!pred) 809
611 goto oom; 810 for (i = 0; i < n_preds; i++) {
811 pred = &filter->preds[i];
612 pred->fn = filter_pred_none; 812 pred->fn = filter_pred_none;
613 filter->preds[i] = pred;
614 } 813 }
615 814
616 return filter;
617
618oom:
619 __free_preds(filter);
620 return ERR_PTR(-ENOMEM);
621}
622
623static int init_preds(struct ftrace_event_call *call)
624{
625 if (call->filter)
626 return 0;
627
628 call->flags &= ~TRACE_EVENT_FL_FILTERED;
629 call->filter = __alloc_preds();
630 if (IS_ERR(call->filter))
631 return PTR_ERR(call->filter);
632
633 return 0; 815 return 0;
634} 816}
635 817
636static int init_subsystem_preds(struct event_subsystem *system) 818static void filter_free_subsystem_preds(struct event_subsystem *system)
637{ 819{
638 struct ftrace_event_call *call; 820 struct ftrace_event_call *call;
639 int err;
640 821
641 list_for_each_entry(call, &ftrace_events, list) { 822 list_for_each_entry(call, &ftrace_events, list) {
642 if (strcmp(call->class->system, system->name) != 0) 823 if (strcmp(call->class->system, system->name) != 0)
643 continue; 824 continue;
644 825
645 err = init_preds(call); 826 filter_disable(call);
646 if (err) 827 remove_filter_string(call->filter);
647 return err;
648 } 828 }
649
650 return 0;
651} 829}
652 830
653static void filter_free_subsystem_preds(struct event_subsystem *system) 831static void filter_free_subsystem_filters(struct event_subsystem *system)
654{ 832{
655 struct ftrace_event_call *call; 833 struct ftrace_event_call *call;
656 834
657 list_for_each_entry(call, &ftrace_events, list) { 835 list_for_each_entry(call, &ftrace_events, list) {
658 if (strcmp(call->class->system, system->name) != 0) 836 if (strcmp(call->class->system, system->name) != 0)
659 continue; 837 continue;
660 838 __free_filter(call->filter);
661 filter_disable_preds(call); 839 call->filter = NULL;
662 remove_filter_string(call->filter);
663 } 840 }
664} 841}
665 842
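
filter_set_pred() uses the pred stack to turn the postfix predicate list into the tree: leaves are pushed, and every AND/OR pops its right and then its left child, links them, and pushes itself back as the subtree root. The same mechanics in a stripped-down user-space form (pointers instead of index fields, no FOLD handling):

#include <stdio.h>

struct node { const char *what; struct node *left, *right; };

int main(void)
{
	/* postfix for "a == 1 || b == 2": [a == 1] [b == 2] [||] */
	struct node preds[3] = { { "a == 1" }, { "b == 2" }, { "||" } };
	struct node *stack[4];
	int top = 0;

	stack[top++] = &preds[0];		/* leaf */
	stack[top++] = &preds[1];		/* leaf */

	preds[2].right = stack[--top];		/* operator: pop right, then left */
	preds[2].left = stack[--top];
	stack[top++] = &preds[2];		/* push the subtree root back */

	printf("root \"%s\": left \"%s\", right \"%s\"\n", stack[top - 1]->what,
	       stack[top - 1]->left->what, stack[top - 1]->right->what);
	return 0;
}
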
@@ -667,18 +844,19 @@ static int filter_add_pred_fn(struct filter_parse_state *ps,
667 struct ftrace_event_call *call, 844 struct ftrace_event_call *call,
668 struct event_filter *filter, 845 struct event_filter *filter,
669 struct filter_pred *pred, 846 struct filter_pred *pred,
847 struct pred_stack *stack,
670 filter_pred_fn_t fn) 848 filter_pred_fn_t fn)
671{ 849{
672 int idx, err; 850 int idx, err;
673 851
674 if (filter->n_preds == MAX_FILTER_PRED) { 852 if (WARN_ON(filter->n_preds == filter->a_preds)) {
675 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); 853 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
676 return -ENOSPC; 854 return -ENOSPC;
677 } 855 }
678 856
679 idx = filter->n_preds; 857 idx = filter->n_preds;
680 filter_clear_pred(filter->preds[idx]); 858 filter_clear_pred(&filter->preds[idx]);
681 err = filter_set_pred(filter->preds[idx], pred, fn); 859 err = filter_set_pred(filter, idx, stack, pred, fn);
682 if (err) 860 if (err)
683 return err; 861 return err;
684 862
@@ -763,6 +941,7 @@ static int filter_add_pred(struct filter_parse_state *ps,
763 struct ftrace_event_call *call, 941 struct ftrace_event_call *call,
764 struct event_filter *filter, 942 struct event_filter *filter,
765 struct filter_pred *pred, 943 struct filter_pred *pred,
944 struct pred_stack *stack,
766 bool dry_run) 945 bool dry_run)
767{ 946{
768 struct ftrace_event_field *field; 947 struct ftrace_event_field *field;
@@ -770,17 +949,12 @@ static int filter_add_pred(struct filter_parse_state *ps,
770 unsigned long long val; 949 unsigned long long val;
771 int ret; 950 int ret;
772 951
773 pred->fn = filter_pred_none; 952 fn = pred->fn = filter_pred_none;
774 953
775 if (pred->op == OP_AND) { 954 if (pred->op == OP_AND)
776 pred->pop_n = 2;
777 fn = filter_pred_and;
778 goto add_pred_fn; 955 goto add_pred_fn;
779 } else if (pred->op == OP_OR) { 956 else if (pred->op == OP_OR)
780 pred->pop_n = 2;
781 fn = filter_pred_or;
782 goto add_pred_fn; 957 goto add_pred_fn;
783 }
784 958
785 field = find_event_field(call, pred->field_name); 959 field = find_event_field(call, pred->field_name);
786 if (!field) { 960 if (!field) {
@@ -829,7 +1003,7 @@ static int filter_add_pred(struct filter_parse_state *ps,
829 1003
830add_pred_fn: 1004add_pred_fn:
831 if (!dry_run) 1005 if (!dry_run)
832 return filter_add_pred_fn(ps, call, filter, pred, fn); 1006 return filter_add_pred_fn(ps, call, filter, pred, stack, fn);
833 return 0; 1007 return 0;
834} 1008}
835 1009
@@ -1187,6 +1361,234 @@ static int check_preds(struct filter_parse_state *ps)
1187 return 0; 1361 return 0;
1188} 1362}
1189 1363
1364static int count_preds(struct filter_parse_state *ps)
1365{
1366 struct postfix_elt *elt;
1367 int n_preds = 0;
1368
1369 list_for_each_entry(elt, &ps->postfix, list) {
1370 if (elt->op == OP_NONE)
1371 continue;
1372 n_preds++;
1373 }
1374
1375 return n_preds;
1376}
1377
1378/*
1379 * The tree is walked when an event is filtered. If the tree is not correctly
1380 * built, the walk may loop forever. Check here that the walk does
1381 * indeed terminate.
1382 */
1383static int check_pred_tree(struct event_filter *filter,
1384 struct filter_pred *root)
1385{
1386 struct filter_pred *preds;
1387 struct filter_pred *pred;
1388 enum move_type move = MOVE_DOWN;
1389 int count = 0;
1390 int done = 0;
1391 int max;
1392
1393 /*
1394 * A node can be hit at most three times:
1395 * once going down, once coming up from the left, and
1396 * once coming up from the right. This is more than enough
1397 * since leaves are only hit a single time.
1398 */
1399 max = 3 * filter->n_preds;
1400
1401 preds = filter->preds;
1402 if (!preds)
1403 return -EINVAL;
1404 pred = root;
1405
1406 do {
1407 if (WARN_ON(count++ > max))
1408 return -EINVAL;
1409
1410 switch (move) {
1411 case MOVE_DOWN:
1412 if (pred->left != FILTER_PRED_INVALID) {
1413 pred = &preds[pred->left];
1414 continue;
1415 }
1416 /* A leaf at the root is just a leaf in the tree */
1417 if (pred == root)
1418 break;
1419 pred = get_pred_parent(pred, preds,
1420 pred->parent, &move);
1421 continue;
1422 case MOVE_UP_FROM_LEFT:
1423 pred = &preds[pred->right];
1424 move = MOVE_DOWN;
1425 continue;
1426 case MOVE_UP_FROM_RIGHT:
1427 if (pred == root)
1428 break;
1429 pred = get_pred_parent(pred, preds,
1430 pred->parent, &move);
1431 continue;
1432 }
1433 done = 1;
1434 } while (!done);
1435
1436 /* We are fine. */
1437 return 0;
1438}
1439
1440static int count_leafs(struct filter_pred *preds, struct filter_pred *root)
1441{
1442 struct filter_pred *pred;
1443 enum move_type move = MOVE_DOWN;
1444 int count = 0;
1445 int done = 0;
1446
1447 pred = root;
1448
1449 do {
1450 switch (move) {
1451 case MOVE_DOWN:
1452 if (pred->left != FILTER_PRED_INVALID) {
1453 pred = &preds[pred->left];
1454 continue;
1455 }
1456 /* A leaf at the root is just a leaf in the tree */
1457 if (pred == root)
1458 return 1;
1459 count++;
1460 pred = get_pred_parent(pred, preds,
1461 pred->parent, &move);
1462 continue;
1463 case MOVE_UP_FROM_LEFT:
1464 pred = &preds[pred->right];
1465 move = MOVE_DOWN;
1466 continue;
1467 case MOVE_UP_FROM_RIGHT:
1468 if (pred == root)
1469 break;
1470 pred = get_pred_parent(pred, preds,
1471 pred->parent, &move);
1472 continue;
1473 }
1474 done = 1;
1475 } while (!done);
1476
1477 return count;
1478}
1479
1480static int fold_pred(struct filter_pred *preds, struct filter_pred *root)
1481{
1482 struct filter_pred *pred;
1483 enum move_type move = MOVE_DOWN;
1484 int count = 0;
1485 int children;
1486 int done = 0;
1487
1488 /* No need to keep the fold flag */
1489 root->index &= ~FILTER_PRED_FOLD;
1490
1491 /* If the root is a leaf then do nothing */
1492 if (root->left == FILTER_PRED_INVALID)
1493 return 0;
1494
1495 /* count the children */
1496 children = count_leafs(preds, &preds[root->left]);
1497 children += count_leafs(preds, &preds[root->right]);
1498
1499 root->ops = kzalloc(sizeof(*root->ops) * children, GFP_KERNEL);
1500 if (!root->ops)
1501 return -ENOMEM;
1502
1503 root->val = children;
1504
1505 pred = root;
1506 do {
1507 switch (move) {
1508 case MOVE_DOWN:
1509 if (pred->left != FILTER_PRED_INVALID) {
1510 pred = &preds[pred->left];
1511 continue;
1512 }
1513 if (WARN_ON(count == children))
1514 return -EINVAL;
1515 pred->index &= ~FILTER_PRED_FOLD;
1516 root->ops[count++] = pred->index;
1517 pred = get_pred_parent(pred, preds,
1518 pred->parent, &move);
1519 continue;
1520 case MOVE_UP_FROM_LEFT:
1521 pred = &preds[pred->right];
1522 move = MOVE_DOWN;
1523 continue;
1524 case MOVE_UP_FROM_RIGHT:
1525 if (pred == root)
1526 break;
1527 pred = get_pred_parent(pred, preds,
1528 pred->parent, &move);
1529 continue;
1530 }
1531 done = 1;
1532 } while (!done);
1533
1534 return 0;
1535}
1536
1537/*
1538 * To optimize the processing of the ops, if we have several "ors" or
1539 * "ands" together, we can put them in an array and process them all
1540 * together speeding up the filter logic.
1541 */
1542static int fold_pred_tree(struct event_filter *filter,
1543 struct filter_pred *root)
1544{
1545 struct filter_pred *preds;
1546 struct filter_pred *pred;
1547 enum move_type move = MOVE_DOWN;
1548 int done = 0;
1549 int err;
1550
1551 preds = filter->preds;
1552 if (!preds)
1553 return -EINVAL;
1554 pred = root;
1555
1556 do {
1557 switch (move) {
1558 case MOVE_DOWN:
1559 if (pred->index & FILTER_PRED_FOLD) {
1560 err = fold_pred(preds, pred);
1561 if (err)
1562 return err;
1563 /* Folded nodes are like leaves */
1564 } else if (pred->left != FILTER_PRED_INVALID) {
1565 pred = &preds[pred->left];
1566 continue;
1567 }
1568
1569 /* A leaf at the root is just a leaf in the tree */
1570 if (pred == root)
1571 break;
1572 pred = get_pred_parent(pred, preds,
1573 pred->parent, &move);
1574 continue;
1575 case MOVE_UP_FROM_LEFT:
1576 pred = &preds[pred->right];
1577 move = MOVE_DOWN;
1578 continue;
1579 case MOVE_UP_FROM_RIGHT:
1580 if (pred == root)
1581 break;
1582 pred = get_pred_parent(pred, preds,
1583 pred->parent, &move);
1584 continue;
1585 }
1586 done = 1;
1587 } while (!done);
1588
1589 return 0;
1590}
1591
1190static int replace_preds(struct ftrace_event_call *call, 1592static int replace_preds(struct ftrace_event_call *call,
1191 struct event_filter *filter, 1593 struct event_filter *filter,
1192 struct filter_parse_state *ps, 1594 struct filter_parse_state *ps,
@@ -1195,14 +1597,32 @@ static int replace_preds(struct ftrace_event_call *call,
1195{ 1597{
1196 char *operand1 = NULL, *operand2 = NULL; 1598 char *operand1 = NULL, *operand2 = NULL;
1197 struct filter_pred *pred; 1599 struct filter_pred *pred;
1600 struct filter_pred *root;
1198 struct postfix_elt *elt; 1601 struct postfix_elt *elt;
1602 struct pred_stack stack = { }; /* init to NULL */
1199 int err; 1603 int err;
1200 int n_preds = 0; 1604 int n_preds = 0;
1201 1605
1606 n_preds = count_preds(ps);
1607 if (n_preds >= MAX_FILTER_PRED) {
1608 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
1609 return -ENOSPC;
1610 }
1611
1202 err = check_preds(ps); 1612 err = check_preds(ps);
1203 if (err) 1613 if (err)
1204 return err; 1614 return err;
1205 1615
1616 if (!dry_run) {
1617 err = __alloc_pred_stack(&stack, n_preds);
1618 if (err)
1619 return err;
1620 err = __alloc_preds(filter, n_preds);
1621 if (err)
1622 goto fail;
1623 }
1624
1625 n_preds = 0;
1206 list_for_each_entry(elt, &ps->postfix, list) { 1626 list_for_each_entry(elt, &ps->postfix, list) {
1207 if (elt->op == OP_NONE) { 1627 if (elt->op == OP_NONE) {
1208 if (!operand1) 1628 if (!operand1)
@@ -1211,14 +1631,16 @@ static int replace_preds(struct ftrace_event_call *call,
1211 operand2 = elt->operand; 1631 operand2 = elt->operand;
1212 else { 1632 else {
1213 parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0); 1633 parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0);
1214 return -EINVAL; 1634 err = -EINVAL;
1635 goto fail;
1215 } 1636 }
1216 continue; 1637 continue;
1217 } 1638 }
1218 1639
1219 if (n_preds++ == MAX_FILTER_PRED) { 1640 if (WARN_ON(n_preds++ == MAX_FILTER_PRED)) {
1220 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); 1641 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
1221 return -ENOSPC; 1642 err = -ENOSPC;
1643 goto fail;
1222 } 1644 }
1223 1645
1224 if (elt->op == OP_AND || elt->op == OP_OR) { 1646 if (elt->op == OP_AND || elt->op == OP_OR) {
@@ -1228,76 +1650,181 @@ static int replace_preds(struct ftrace_event_call *call,
1228 1650
1229 if (!operand1 || !operand2) { 1651 if (!operand1 || !operand2) {
1230 parse_error(ps, FILT_ERR_MISSING_FIELD, 0); 1652 parse_error(ps, FILT_ERR_MISSING_FIELD, 0);
1231 return -EINVAL; 1653 err = -EINVAL;
1654 goto fail;
1232 } 1655 }
1233 1656
1234 pred = create_pred(elt->op, operand1, operand2); 1657 pred = create_pred(elt->op, operand1, operand2);
1235add_pred: 1658add_pred:
1236 if (!pred) 1659 if (!pred) {
1237 return -ENOMEM; 1660 err = -ENOMEM;
1238 err = filter_add_pred(ps, call, filter, pred, dry_run); 1661 goto fail;
1662 }
1663 err = filter_add_pred(ps, call, filter, pred, &stack, dry_run);
1239 filter_free_pred(pred); 1664 filter_free_pred(pred);
1240 if (err) 1665 if (err)
1241 return err; 1666 goto fail;
1242 1667
1243 operand1 = operand2 = NULL; 1668 operand1 = operand2 = NULL;
1244 } 1669 }
1245 1670
1246 return 0; 1671 if (!dry_run) {
1672 /* We should have one item left on the stack */
1673 pred = __pop_pred_stack(&stack);
1674 if (!pred)
1675 return -EINVAL;
1676 /* This item is where we start from in matching */
1677 root = pred;
1678 /* Make sure the stack is empty */
1679 pred = __pop_pred_stack(&stack);
1680 if (WARN_ON(pred)) {
1681 err = -EINVAL;
1682 filter->root = NULL;
1683 goto fail;
1684 }
1685 err = check_pred_tree(filter, root);
1686 if (err)
1687 goto fail;
1688
1689 /* Optimize the tree */
1690 err = fold_pred_tree(filter, root);
1691 if (err)
1692 goto fail;
1693
1694 /* We don't set root until we know it works */
1695 barrier();
1696 filter->root = root;
1697 }
1698
1699 err = 0;
1700fail:
1701 __free_pred_stack(&stack);
1702 return err;
1247} 1703}
1248 1704
1705struct filter_list {
1706 struct list_head list;
1707 struct event_filter *filter;
1708};
1709
1249static int replace_system_preds(struct event_subsystem *system, 1710static int replace_system_preds(struct event_subsystem *system,
1250 struct filter_parse_state *ps, 1711 struct filter_parse_state *ps,
1251 char *filter_string) 1712 char *filter_string)
1252{ 1713{
1253 struct ftrace_event_call *call; 1714 struct ftrace_event_call *call;
1715 struct filter_list *filter_item;
1716 struct filter_list *tmp;
1717 LIST_HEAD(filter_list);
1254 bool fail = true; 1718 bool fail = true;
1255 int err; 1719 int err;
1256 1720
1257 list_for_each_entry(call, &ftrace_events, list) { 1721 list_for_each_entry(call, &ftrace_events, list) {
1258 struct event_filter *filter = call->filter;
1259 1722
1260 if (strcmp(call->class->system, system->name) != 0) 1723 if (strcmp(call->class->system, system->name) != 0)
1261 continue; 1724 continue;
1262 1725
1263 /* try to see if the filter can be applied */ 1726 /*
1264 err = replace_preds(call, filter, ps, filter_string, true); 1727 * Try to see if the filter can be applied
1728 * (filter arg is ignored on dry_run)
1729 */
1730 err = replace_preds(call, NULL, ps, filter_string, true);
1265 if (err) 1731 if (err)
1732 goto fail;
1733 }
1734
1735 list_for_each_entry(call, &ftrace_events, list) {
1736 struct event_filter *filter;
1737
1738 if (strcmp(call->class->system, system->name) != 0)
1266 continue; 1739 continue;
1267 1740
1268 /* really apply the filter */ 1741 filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL);
1269 filter_disable_preds(call); 1742 if (!filter_item)
1270 err = replace_preds(call, filter, ps, filter_string, false); 1743 goto fail_mem;
1744
1745 list_add_tail(&filter_item->list, &filter_list);
1746
1747 filter_item->filter = __alloc_filter();
1748 if (!filter_item->filter)
1749 goto fail_mem;
1750 filter = filter_item->filter;
1751
1752 /* Can only fail on no memory */
1753 err = replace_filter_string(filter, filter_string);
1271 if (err) 1754 if (err)
1272 filter_disable_preds(call); 1755 goto fail_mem;
1273 else { 1756
1757 err = replace_preds(call, filter, ps, filter_string, false);
1758 if (err) {
1759 filter_disable(call);
1760 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
1761 append_filter_err(ps, filter);
1762 } else
1274 call->flags |= TRACE_EVENT_FL_FILTERED; 1763 call->flags |= TRACE_EVENT_FL_FILTERED;
1275 replace_filter_string(filter, filter_string); 1764 /*
1276 } 1765 * Regardless of if this returned an error, we still
1766 * replace the filter for the call.
1767 */
1768 filter = call->filter;
1769 call->filter = filter_item->filter;
1770 filter_item->filter = filter;
1771
1277 fail = false; 1772 fail = false;
1278 } 1773 }
1279 1774
1280 if (fail) { 1775 if (fail)
1281 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); 1776 goto fail;
1282 return -EINVAL; 1777
1778 /*
1779 * The calls can still be using the old filters.
1780 * Do a synchronize_sched() to ensure all calls are
1781 * done with them before we free them.
1782 */
1783 synchronize_sched();
1784 list_for_each_entry_safe(filter_item, tmp, &filter_list, list) {
1785 __free_filter(filter_item->filter);
1786 list_del(&filter_item->list);
1787 kfree(filter_item);
1283 } 1788 }
1284 return 0; 1789 return 0;
1790 fail:
1791 /* No call succeeded */
1792 list_for_each_entry_safe(filter_item, tmp, &filter_list, list) {
1793 list_del(&filter_item->list);
1794 kfree(filter_item);
1795 }
1796 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
1797 return -EINVAL;
1798 fail_mem:
1799 /* If any call succeeded, we still need to sync */
1800 if (!fail)
1801 synchronize_sched();
1802 list_for_each_entry_safe(filter_item, tmp, &filter_list, list) {
1803 __free_filter(filter_item->filter);
1804 list_del(&filter_item->list);
1805 kfree(filter_item);
1806 }
1807 return -ENOMEM;
1285} 1808}
1286 1809
1287int apply_event_filter(struct ftrace_event_call *call, char *filter_string) 1810int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1288{ 1811{
1289 int err;
1290 struct filter_parse_state *ps; 1812 struct filter_parse_state *ps;
1813 struct event_filter *filter;
1814 struct event_filter *tmp;
1815 int err = 0;
1291 1816
1292 mutex_lock(&event_mutex); 1817 mutex_lock(&event_mutex);
1293 1818
1294 err = init_preds(call);
1295 if (err)
1296 goto out_unlock;
1297
1298 if (!strcmp(strstrip(filter_string), "0")) { 1819 if (!strcmp(strstrip(filter_string), "0")) {
1299 filter_disable_preds(call); 1820 filter_disable(call);
1300 remove_filter_string(call->filter); 1821 filter = call->filter;
1822 if (!filter)
1823 goto out_unlock;
1824 call->filter = NULL;
1825 /* Make sure the filter is not being used */
1826 synchronize_sched();
1827 __free_filter(filter);
1301 goto out_unlock; 1828 goto out_unlock;
1302 } 1829 }
1303 1830
@@ -1306,22 +1833,41 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1306 if (!ps) 1833 if (!ps)
1307 goto out_unlock; 1834 goto out_unlock;
1308 1835
1309 filter_disable_preds(call); 1836 filter = __alloc_filter();
1310 replace_filter_string(call->filter, filter_string); 1837 if (!filter) {
1838 kfree(ps);
1839 goto out_unlock;
1840 }
1841
1842 replace_filter_string(filter, filter_string);
1311 1843
1312 parse_init(ps, filter_ops, filter_string); 1844 parse_init(ps, filter_ops, filter_string);
1313 err = filter_parse(ps); 1845 err = filter_parse(ps);
1314 if (err) { 1846 if (err) {
1315 append_filter_err(ps, call->filter); 1847 append_filter_err(ps, filter);
1316 goto out; 1848 goto out;
1317 } 1849 }
1318 1850
1319 err = replace_preds(call, call->filter, ps, filter_string, false); 1851 err = replace_preds(call, filter, ps, filter_string, false);
1320 if (err) 1852 if (err) {
1321 append_filter_err(ps, call->filter); 1853 filter_disable(call);
1322 else 1854 append_filter_err(ps, filter);
1855 } else
1323 call->flags |= TRACE_EVENT_FL_FILTERED; 1856 call->flags |= TRACE_EVENT_FL_FILTERED;
1324out: 1857out:
1858 /*
1859 * Always swap the call filter with the new filter
1860 * even if there was an error. If there was an error
1861 * in the filter, we disable the filter and show the error
1862 * string
1863 */
1864 tmp = call->filter;
1865 call->filter = filter;
1866 if (tmp) {
1867 /* Make sure the call is done with the filter */
1868 synchronize_sched();
1869 __free_filter(tmp);
1870 }
1325 filter_opstack_clear(ps); 1871 filter_opstack_clear(ps);
1326 postfix_clear(ps); 1872 postfix_clear(ps);
1327 kfree(ps); 1873 kfree(ps);
@@ -1334,18 +1880,21 @@ out_unlock:
1334int apply_subsystem_event_filter(struct event_subsystem *system, 1880int apply_subsystem_event_filter(struct event_subsystem *system,
1335 char *filter_string) 1881 char *filter_string)
1336{ 1882{
1337 int err;
1338 struct filter_parse_state *ps; 1883 struct filter_parse_state *ps;
1884 struct event_filter *filter;
1885 int err = 0;
1339 1886
1340 mutex_lock(&event_mutex); 1887 mutex_lock(&event_mutex);
1341 1888
1342 err = init_subsystem_preds(system);
1343 if (err)
1344 goto out_unlock;
1345
1346 if (!strcmp(strstrip(filter_string), "0")) { 1889 if (!strcmp(strstrip(filter_string), "0")) {
1347 filter_free_subsystem_preds(system); 1890 filter_free_subsystem_preds(system);
1348 remove_filter_string(system->filter); 1891 remove_filter_string(system->filter);
1892 filter = system->filter;
1893 system->filter = NULL;
1894 /* Ensure all filters are no longer used */
1895 synchronize_sched();
1896 filter_free_subsystem_filters(system);
1897 __free_filter(filter);
1349 goto out_unlock; 1898 goto out_unlock;
1350 } 1899 }
1351 1900
@@ -1354,7 +1903,17 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1354 if (!ps) 1903 if (!ps)
1355 goto out_unlock; 1904 goto out_unlock;
1356 1905
1357 replace_filter_string(system->filter, filter_string); 1906 filter = __alloc_filter();
1907 if (!filter)
1908 goto out;
1909
1910 replace_filter_string(filter, filter_string);
1911 /*
1912 * No event actually uses the system filter;
1913 * we can free it without synchronize_sched().
1914 */
1915 __free_filter(system->filter);
1916 system->filter = filter;
1358 1917
1359 parse_init(ps, filter_ops, filter_string); 1918 parse_init(ps, filter_ops, filter_string);
1360 err = filter_parse(ps); 1919 err = filter_parse(ps);
@@ -1384,7 +1943,7 @@ void ftrace_profile_free_filter(struct perf_event *event)
1384 struct event_filter *filter = event->filter; 1943 struct event_filter *filter = event->filter;
1385 1944
1386 event->filter = NULL; 1945 event->filter = NULL;
1387 __free_preds(filter); 1946 __free_filter(filter);
1388} 1947}
1389 1948
1390int ftrace_profile_set_filter(struct perf_event *event, int event_id, 1949int ftrace_profile_set_filter(struct perf_event *event, int event_id,
@@ -1410,8 +1969,8 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
1410 if (event->filter) 1969 if (event->filter)
1411 goto out_unlock; 1970 goto out_unlock;
1412 1971
1413 filter = __alloc_preds(); 1972 filter = __alloc_filter();
1414 if (IS_ERR(filter)) { 1973 if (!filter) {
1415 err = PTR_ERR(filter); 1974 err = PTR_ERR(filter);
1416 goto out_unlock; 1975 goto out_unlock;
1417 } 1976 }
@@ -1419,7 +1978,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
1419 err = -ENOMEM; 1978 err = -ENOMEM;
1420 ps = kzalloc(sizeof(*ps), GFP_KERNEL); 1979 ps = kzalloc(sizeof(*ps), GFP_KERNEL);
1421 if (!ps) 1980 if (!ps)
1422 goto free_preds; 1981 goto free_filter;
1423 1982
1424 parse_init(ps, filter_ops, filter_str); 1983 parse_init(ps, filter_ops, filter_str);
1425 err = filter_parse(ps); 1984 err = filter_parse(ps);
@@ -1435,9 +1994,9 @@ free_ps:
1435 postfix_clear(ps); 1994 postfix_clear(ps);
1436 kfree(ps); 1995 kfree(ps);
1437 1996
1438free_preds: 1997free_filter:
1439 if (err) 1998 if (err)
1440 __free_preds(filter); 1999 __free_filter(filter);
1441 2000
1442out_unlock: 2001out_unlock:
1443 mutex_unlock(&event_mutex); 2002 mutex_unlock(&event_mutex);
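
The count_leafs(), check_pred_tree() and fold_pred_tree() routines above all share the same stack-free traversal: walk down left children until a leaf is reached, then climb back to the parent, descending into the right subtree whenever the climb came up from a left child. Below is a minimal userspace sketch of that MOVE_DOWN / MOVE_UP_FROM_LEFT / MOVE_UP_FROM_RIGHT walk over an array-indexed tree; struct node, PRED_INVALID, is_left_child and count_leaves() are names invented for the illustration (the kernel derives the climb direction from the pred's index and parent fields instead).

/*
 * Standalone sketch of the iterative predicate-tree walk.
 * Not kernel code: node layout and the direction flag are
 * simplified stand-ins.
 */
#include <assert.h>
#include <stdio.h>

#define PRED_INVALID (~0u)	/* "no child": marks a leaf */

enum move { MOVE_DOWN, MOVE_UP_FROM_LEFT, MOVE_UP_FROM_RIGHT };

struct node {
	unsigned int left;	/* index of left child, or PRED_INVALID */
	unsigned int right;	/* index of right child */
	unsigned int parent;	/* index of parent node */
	int is_left_child;	/* which side of the parent we hang off */
};

/* Step up to the parent and record which side we came from. */
static struct node *get_parent(struct node *nodes, struct node *self,
			       enum move *move)
{
	*move = self->is_left_child ? MOVE_UP_FROM_LEFT : MOVE_UP_FROM_RIGHT;
	return &nodes[self->parent];
}

/* Count the leaves below (and including) root without recursion. */
static int count_leaves(struct node *nodes, struct node *root)
{
	enum move move = MOVE_DOWN;
	struct node *n = root;
	int count = 0;

	for (;;) {
		switch (move) {
		case MOVE_DOWN:
			if (n->left != PRED_INVALID) {
				n = &nodes[n->left];
				continue;
			}
			count++;			/* found a leaf */
			if (n == root)
				return count;
			n = get_parent(nodes, n, &move);
			continue;
		case MOVE_UP_FROM_LEFT:
			/* left subtree done, now walk the right one */
			n = &nodes[n->right];
			move = MOVE_DOWN;
			continue;
		case MOVE_UP_FROM_RIGHT:
			/* both subtrees done */
			if (n == root)
				return count;
			n = get_parent(nodes, n, &move);
			continue;
		}
	}
}

int main(void)
{
	/* (a && b) || c : node 0 is the OR root, node 1 the AND */
	struct node nodes[] = {
		[0] = { .left = 1, .right = 4, .parent = 0 },
		[1] = { .left = 2, .right = 3, .parent = 0, .is_left_child = 1 },
		[2] = { .left = PRED_INVALID, .parent = 1, .is_left_child = 1 },
		[3] = { .left = PRED_INVALID, .parent = 1 },
		[4] = { .left = PRED_INVALID, .parent = 0 },
	};
	int leaves = count_leaves(nodes, &nodes[0]);

	assert(leaves == 3);
	printf("leaves: %d\n", leaves);
	return 0;
}

Since each node is entered at most once per state, the walk touches a node no more than three times, which is why check_pred_tree() can treat anything past 3 * n_preds steps as a malformed tree.
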
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 4b74d71705c0..bbeec31e0ae3 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -161,13 +161,13 @@ struct ftrace_event_class event_class_ftrace_##call = { \
161 .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ 161 .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\
162}; \ 162}; \
163 \ 163 \
164struct ftrace_event_call __used \ 164struct ftrace_event_call __used event_##call = { \
165__attribute__((__aligned__(4))) \
166__attribute__((section("_ftrace_events"))) event_##call = { \
167 .name = #call, \ 165 .name = #call, \
168 .event.type = etype, \ 166 .event.type = etype, \
169 .class = &event_class_ftrace_##call, \ 167 .class = &event_class_ftrace_##call, \
170 .print_fmt = print, \ 168 .print_fmt = print, \
171}; \ 169}; \
170struct ftrace_event_call __used \
171__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call;
172 172
173#include "trace_entries.h" 173#include "trace_entries.h"
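
The trace_export.c hunk above stops placing each ftrace_event_call struct directly in the _ftrace_events section (with a forced 4-byte alignment) and instead emits a pointer to the struct into that section. Below is a hedged userspace sketch of the underlying pattern, using an invented section name ("my_events") and event type; it assumes GCC and GNU ld on an ELF target, where the linker provides __start_<section>/__stop_<section> symbols for any section whose name is a valid C identifier.

/*
 * Sketch only: register objects by planting pointers to them in a
 * named section, then walk the section at runtime.  Names are made up.
 */
#include <stdio.h>

struct event {
	const char *name;
};

#define DEFINE_EVENT(ev)						\
	static struct event ev = { .name = #ev };			\
	static struct event *const __ptr_##ev				\
	__attribute__((used, section("my_events"))) = &ev

DEFINE_EVENT(sched_switch);
DEFINE_EVENT(irq_handler_entry);

/* Emitted by the linker around the "my_events" section. */
extern struct event *const __start_my_events[];
extern struct event *const __stop_my_events[];

int main(void)
{
	struct event *const *iter;

	for (iter = __start_my_events; iter < __stop_my_events; iter++)
		printf("registered event: %s\n", (*iter)->name);
	return 0;
}

The tracepoint.c hunks further down make the same struct-to-pointer switch for __tracepoints_ptrs and iterate the section with one extra dereference per entry, much like the loop above.
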
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 5cf8c602b880..92b6e1e12d98 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -453,14 +453,6 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1)
453 * Stubs: 453 * Stubs:
454 */ 454 */
455 455
456void early_boot_irqs_off(void)
457{
458}
459
460void early_boot_irqs_on(void)
461{
462}
463
464void trace_softirqs_on(unsigned long ip) 456void trace_softirqs_on(unsigned long ip)
465{ 457{
466} 458}
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 2dec9bcde8b4..8435b43b1782 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -353,6 +353,43 @@ static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
353 kfree(data); 353 kfree(data);
354} 354}
355 355
356/* Bitfield fetch function */
357struct bitfield_fetch_param {
358 struct fetch_param orig;
359 unsigned char hi_shift;
360 unsigned char low_shift;
361};
362
363#define DEFINE_FETCH_bitfield(type) \
364static __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs,\
365 void *data, void *dest) \
366{ \
367 struct bitfield_fetch_param *bprm = data; \
368 type buf = 0; \
369 call_fetch(&bprm->orig, regs, &buf); \
370 if (buf) { \
371 buf <<= bprm->hi_shift; \
372 buf >>= bprm->low_shift; \
373 } \
374 *(type *)dest = buf; \
375}
376DEFINE_BASIC_FETCH_FUNCS(bitfield)
377#define fetch_bitfield_string NULL
378#define fetch_bitfield_string_size NULL
379
380static __kprobes void
381free_bitfield_fetch_param(struct bitfield_fetch_param *data)
382{
383 /*
384 * Don't check the bitfield itself, because this must be the
385 * last fetch function.
386 */
387 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
388 free_deref_fetch_param(data->orig.data);
389 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
390 free_symbol_cache(data->orig.data);
391 kfree(data);
392}
356/* Default (unsigned long) fetch type */ 393/* Default (unsigned long) fetch type */
357#define __DEFAULT_FETCH_TYPE(t) u##t 394#define __DEFAULT_FETCH_TYPE(t) u##t
358#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) 395#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t)
@@ -367,6 +404,7 @@ enum {
367 FETCH_MTD_memory, 404 FETCH_MTD_memory,
368 FETCH_MTD_symbol, 405 FETCH_MTD_symbol,
369 FETCH_MTD_deref, 406 FETCH_MTD_deref,
407 FETCH_MTD_bitfield,
370 FETCH_MTD_END, 408 FETCH_MTD_END,
371}; 409};
372 410
@@ -387,6 +425,7 @@ ASSIGN_FETCH_FUNC(retval, ftype), \
387ASSIGN_FETCH_FUNC(memory, ftype), \ 425ASSIGN_FETCH_FUNC(memory, ftype), \
388ASSIGN_FETCH_FUNC(symbol, ftype), \ 426ASSIGN_FETCH_FUNC(symbol, ftype), \
389ASSIGN_FETCH_FUNC(deref, ftype), \ 427ASSIGN_FETCH_FUNC(deref, ftype), \
428ASSIGN_FETCH_FUNC(bitfield, ftype), \
390 } \ 429 } \
391 } 430 }
392 431
@@ -430,9 +469,33 @@ static const struct fetch_type *find_fetch_type(const char *type)
430 if (!type) 469 if (!type)
431 type = DEFAULT_FETCH_TYPE_STR; 470 type = DEFAULT_FETCH_TYPE_STR;
432 471
472 /* Special case: bitfield */
473 if (*type == 'b') {
474 unsigned long bs;
475 type = strchr(type, '/');
476 if (!type)
477 goto fail;
478 type++;
479 if (strict_strtoul(type, 0, &bs))
480 goto fail;
481 switch (bs) {
482 case 8:
483 return find_fetch_type("u8");
484 case 16:
485 return find_fetch_type("u16");
486 case 32:
487 return find_fetch_type("u32");
488 case 64:
489 return find_fetch_type("u64");
490 default:
491 goto fail;
492 }
493 }
494
433 for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++) 495 for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++)
434 if (strcmp(type, fetch_type_table[i].name) == 0) 496 if (strcmp(type, fetch_type_table[i].name) == 0)
435 return &fetch_type_table[i]; 497 return &fetch_type_table[i];
498fail:
436 return NULL; 499 return NULL;
437} 500}
438 501
@@ -586,7 +649,9 @@ error:
586 649
587static void free_probe_arg(struct probe_arg *arg) 650static void free_probe_arg(struct probe_arg *arg)
588{ 651{
589 if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) 652 if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn))
653 free_bitfield_fetch_param(arg->fetch.data);
654 else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
590 free_deref_fetch_param(arg->fetch.data); 655 free_deref_fetch_param(arg->fetch.data);
591 else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) 656 else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn))
592 free_symbol_cache(arg->fetch.data); 657 free_symbol_cache(arg->fetch.data);
@@ -767,16 +832,15 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t,
767 } 832 }
768 break; 833 break;
769 case '+': /* deref memory */ 834 case '+': /* deref memory */
835 arg++; /* Skip '+', because strict_strtol() rejects it. */
770 case '-': 836 case '-':
771 tmp = strchr(arg, '('); 837 tmp = strchr(arg, '(');
772 if (!tmp) 838 if (!tmp)
773 break; 839 break;
774 *tmp = '\0'; 840 *tmp = '\0';
775 ret = strict_strtol(arg + 1, 0, &offset); 841 ret = strict_strtol(arg, 0, &offset);
776 if (ret) 842 if (ret)
777 break; 843 break;
778 if (arg[0] == '-')
779 offset = -offset;
780 arg = tmp + 1; 844 arg = tmp + 1;
781 tmp = strrchr(arg, ')'); 845 tmp = strrchr(arg, ')');
782 if (tmp) { 846 if (tmp) {
@@ -807,6 +871,41 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t,
807 return ret; 871 return ret;
808} 872}
809 873
874#define BYTES_TO_BITS(nb) ((BITS_PER_LONG * (nb)) / sizeof(long))
875
876/* Bitfield type needs to be parsed into a fetch function */
877static int __parse_bitfield_probe_arg(const char *bf,
878 const struct fetch_type *t,
879 struct fetch_param *f)
880{
881 struct bitfield_fetch_param *bprm;
882 unsigned long bw, bo;
883 char *tail;
884
885 if (*bf != 'b')
886 return 0;
887
888 bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
889 if (!bprm)
890 return -ENOMEM;
891 bprm->orig = *f;
892 f->fn = t->fetch[FETCH_MTD_bitfield];
893 f->data = (void *)bprm;
894
895 bw = simple_strtoul(bf + 1, &tail, 0); /* Use simple one */
896 if (bw == 0 || *tail != '@')
897 return -EINVAL;
898
899 bf = tail + 1;
900 bo = simple_strtoul(bf, &tail, 0);
901 if (tail == bf || *tail != '/')
902 return -EINVAL;
903
904 bprm->hi_shift = BYTES_TO_BITS(t->size) - (bw + bo);
905 bprm->low_shift = bprm->hi_shift + bo;
906 return (BYTES_TO_BITS(t->size) < (bw + bo)) ? -EINVAL : 0;
907}
908
810/* String length checking wrapper */ 909/* String length checking wrapper */
811static int parse_probe_arg(char *arg, struct trace_probe *tp, 910static int parse_probe_arg(char *arg, struct trace_probe *tp,
812 struct probe_arg *parg, int is_return) 911 struct probe_arg *parg, int is_return)
@@ -836,6 +935,8 @@ static int parse_probe_arg(char *arg, struct trace_probe *tp,
836 parg->offset = tp->size; 935 parg->offset = tp->size;
837 tp->size += parg->type->size; 936 tp->size += parg->type->size;
838 ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return); 937 ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return);
938 if (ret >= 0 && t != NULL)
939 ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch);
839 if (ret >= 0) { 940 if (ret >= 0) {
840 parg->fetch_size.fn = get_fetch_size_function(parg->type, 941 parg->fetch_size.fn = get_fetch_size_function(parg->type,
841 parg->fetch.fn); 942 parg->fetch.fn);
@@ -1130,7 +1231,7 @@ static int command_trace_probe(const char *buf)
1130 return ret; 1231 return ret;
1131} 1232}
1132 1233
1133#define WRITE_BUFSIZE 128 1234#define WRITE_BUFSIZE 4096
1134 1235
1135static ssize_t probes_write(struct file *file, const char __user *buffer, 1236static ssize_t probes_write(struct file *file, const char __user *buffer,
1136 size_t count, loff_t *ppos) 1237 size_t count, loff_t *ppos)
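
The bitfield fetch added above boils down to two shifts computed from the container width in bits, the field width bw and the bit offset bo: hi_shift = nbits - (bw + bo) discards everything above the field, and low_shift = hi_shift + bo then discards everything below it. Here is a small userspace sketch of that arithmetic, assuming a fixed 32-bit container and an invented helper name, checked against the familiar shift-and-mask form; it is an illustration, not the kernel fetch code.

/* Sketch: extract a bw-bit field at bit offset bo (from the LSB). */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t extract_bitfield(uint32_t val, unsigned int bw, unsigned int bo)
{
	const unsigned int nbits = 32;		   /* container size in bits */
	unsigned int hi_shift = nbits - (bw + bo); /* drop bits above field  */
	unsigned int low_shift = hi_shift + bo;	   /* then bits below it     */

	/* assumes 0 < bw and bw + bo <= nbits, so no undefined shifts */
	val <<= hi_shift;
	val >>= low_shift;
	return val;
}

int main(void)
{
	uint32_t reg = 0xdeadbeef;
	unsigned int bw = 4, bo = 8;	/* roughly a "b4@8/32" kprobe type */
	uint32_t field = extract_bitfield(reg, bw, bo);

	/* the two-shift form equals the usual mask-and-shift form */
	assert(field == ((reg >> bo) & ((1u << bw) - 1)));
	printf("field = 0x%x\n", field);	/* prints 0xe */
	return 0;
}

In the kernel version the container size comes from the parsed type string (find_fetch_type() maps the /8, /16, /32 and /64 suffixes onto u8..u64), and __parse_bitfield_probe_arg() stores the two shift amounts once in the bitfield_fetch_param.
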
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 02272baa2206..456be9063c2d 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -529,24 +529,34 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
529 * @entry: The trace entry field from the ring buffer 529 * @entry: The trace entry field from the ring buffer
530 * 530 *
531 * Prints the generic fields of irqs off, in hard or softirq, preempt 531 * Prints the generic fields of irqs off, in hard or softirq, preempt
532 * count and lock depth. 532 * count.
533 */ 533 */
534int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) 534int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
535{ 535{
536 int hardirq, softirq; 536 char hardsoft_irq;
537 char need_resched;
538 char irqs_off;
539 int hardirq;
540 int softirq;
537 int ret; 541 int ret;
538 542
539 hardirq = entry->flags & TRACE_FLAG_HARDIRQ; 543 hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
540 softirq = entry->flags & TRACE_FLAG_SOFTIRQ; 544 softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
541 545
546 irqs_off =
547 (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
548 (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' :
549 '.';
550 need_resched =
551 (entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.';
552 hardsoft_irq =
553 (hardirq && softirq) ? 'H' :
554 hardirq ? 'h' :
555 softirq ? 's' :
556 '.';
557
542 if (!trace_seq_printf(s, "%c%c%c", 558 if (!trace_seq_printf(s, "%c%c%c",
543 (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : 559 irqs_off, need_resched, hardsoft_irq))
544 (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ?
545 'X' : '.',
546 (entry->flags & TRACE_FLAG_NEED_RESCHED) ?
547 'N' : '.',
548 (hardirq && softirq) ? 'H' :
549 hardirq ? 'h' : softirq ? 's' : '.'))
550 return 0; 560 return 0;
551 561
552 if (entry->preempt_count) 562 if (entry->preempt_count)
@@ -554,13 +564,7 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
554 else 564 else
555 ret = trace_seq_putc(s, '.'); 565 ret = trace_seq_putc(s, '.');
556 566
557 if (!ret) 567 return ret;
558 return 0;
559
560 if (entry->lock_depth < 0)
561 return trace_seq_putc(s, '.');
562
563 return trace_seq_printf(s, "%d", entry->lock_depth);
564} 568}
565 569
566static int 570static int
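
The trace_output.c change above computes the three latency-format characters (irqs-off, need-resched, hard/soft irq) into local variables and prints them with a single trace_seq_printf() call, instead of nesting the conditionals inside the format arguments. The sketch below mirrors just that selection logic in plain C; the FLAG_* values are stand-ins for the kernel's TRACE_FLAG_* bits.

/* Sketch of the latency flag-character selection, stand-in flag bits. */
#include <stdio.h>

#define FLAG_IRQS_OFF		0x01
#define FLAG_IRQS_NOSUPPORT	0x02
#define FLAG_NEED_RESCHED	0x04
#define FLAG_HARDIRQ		0x08
#define FLAG_SOFTIRQ		0x10

static void lat_fmt(char out[4], unsigned int flags)
{
	int hardirq = flags & FLAG_HARDIRQ;
	int softirq = flags & FLAG_SOFTIRQ;

	/* 'd' = irqs off, 'X' = arch cannot tell, '.' = irqs on */
	out[0] = (flags & FLAG_IRQS_OFF) ? 'd' :
		 (flags & FLAG_IRQS_NOSUPPORT) ? 'X' : '.';
	/* 'N' = need_resched is set */
	out[1] = (flags & FLAG_NEED_RESCHED) ? 'N' : '.';
	/* 'H' = both hard and soft irq, otherwise 'h', 's' or '.' */
	out[2] = (hardirq && softirq) ? 'H' :
		 hardirq ? 'h' : softirq ? 's' : '.';
	out[3] = '\0';
}

int main(void)
{
	char buf[4];

	lat_fmt(buf, FLAG_IRQS_OFF | FLAG_HARDIRQ);
	printf("%s\n", buf);	/* prints "d.h" */
	return 0;
}
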
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 8f758d070c43..7e62c0a18456 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -247,51 +247,3 @@ void tracing_sched_switch_assign_trace(struct trace_array *tr)
247 ctx_trace = tr; 247 ctx_trace = tr;
248} 248}
249 249
250static void stop_sched_trace(struct trace_array *tr)
251{
252 tracing_stop_sched_switch_record();
253}
254
255static int sched_switch_trace_init(struct trace_array *tr)
256{
257 ctx_trace = tr;
258 tracing_reset_online_cpus(tr);
259 tracing_start_sched_switch_record();
260 return 0;
261}
262
263static void sched_switch_trace_reset(struct trace_array *tr)
264{
265 if (sched_ref)
266 stop_sched_trace(tr);
267}
268
269static void sched_switch_trace_start(struct trace_array *tr)
270{
271 sched_stopped = 0;
272}
273
274static void sched_switch_trace_stop(struct trace_array *tr)
275{
276 sched_stopped = 1;
277}
278
279static struct tracer sched_switch_trace __read_mostly =
280{
281 .name = "sched_switch",
282 .init = sched_switch_trace_init,
283 .reset = sched_switch_trace_reset,
284 .start = sched_switch_trace_start,
285 .stop = sched_switch_trace_stop,
286 .wait_pipe = poll_wait_pipe,
287#ifdef CONFIG_FTRACE_SELFTEST
288 .selftest = trace_selftest_startup_sched_switch,
289#endif
290};
291
292__init static int init_sched_switch_trace(void)
293{
294 return register_tracer(&sched_switch_trace);
295}
296device_initcall(init_sched_switch_trace);
297
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 562c56e048fd..659732eba07c 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -558,7 +558,7 @@ trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr)
558static int trace_wakeup_test_thread(void *data) 558static int trace_wakeup_test_thread(void *data)
559{ 559{
560 /* Make this a RT thread, doesn't need to be too high */ 560 /* Make this a RT thread, doesn't need to be too high */
561 static struct sched_param param = { .sched_priority = 5 }; 561 static const struct sched_param param = { .sched_priority = 5 };
562 struct completion *x = data; 562 struct completion *x = data;
563 563
564 sched_setscheduler(current, SCHED_FIFO, &param); 564 sched_setscheduler(current, SCHED_FIFO, &param);
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index bac752f0cfb5..ee7b5a0bb9f8 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -23,9 +23,6 @@ static int syscall_exit_register(struct ftrace_event_call *event,
23static int syscall_enter_define_fields(struct ftrace_event_call *call); 23static int syscall_enter_define_fields(struct ftrace_event_call *call);
24static int syscall_exit_define_fields(struct ftrace_event_call *call); 24static int syscall_exit_define_fields(struct ftrace_event_call *call);
25 25
26/* All syscall exit events have the same fields */
27static LIST_HEAD(syscall_exit_fields);
28
29static struct list_head * 26static struct list_head *
30syscall_get_enter_fields(struct ftrace_event_call *call) 27syscall_get_enter_fields(struct ftrace_event_call *call)
31{ 28{
@@ -34,61 +31,66 @@ syscall_get_enter_fields(struct ftrace_event_call *call)
34 return &entry->enter_fields; 31 return &entry->enter_fields;
35} 32}
36 33
37static struct list_head *
38syscall_get_exit_fields(struct ftrace_event_call *call)
39{
40 return &syscall_exit_fields;
41}
42
43struct trace_event_functions enter_syscall_print_funcs = { 34struct trace_event_functions enter_syscall_print_funcs = {
44 .trace = print_syscall_enter, 35 .trace = print_syscall_enter,
45}; 36};
46 37
47struct trace_event_functions exit_syscall_print_funcs = { 38struct trace_event_functions exit_syscall_print_funcs = {
48 .trace = print_syscall_exit, 39 .trace = print_syscall_exit,
49}; 40};
50 41
51struct ftrace_event_class event_class_syscall_enter = { 42struct ftrace_event_class event_class_syscall_enter = {
52 .system = "syscalls", 43 .system = "syscalls",
53 .reg = syscall_enter_register, 44 .reg = syscall_enter_register,
54 .define_fields = syscall_enter_define_fields, 45 .define_fields = syscall_enter_define_fields,
55 .get_fields = syscall_get_enter_fields, 46 .get_fields = syscall_get_enter_fields,
56 .raw_init = init_syscall_trace, 47 .raw_init = init_syscall_trace,
57}; 48};
58 49
59struct ftrace_event_class event_class_syscall_exit = { 50struct ftrace_event_class event_class_syscall_exit = {
60 .system = "syscalls", 51 .system = "syscalls",
61 .reg = syscall_exit_register, 52 .reg = syscall_exit_register,
62 .define_fields = syscall_exit_define_fields, 53 .define_fields = syscall_exit_define_fields,
63 .get_fields = syscall_get_exit_fields, 54 .fields = LIST_HEAD_INIT(event_class_syscall_exit.fields),
64 .raw_init = init_syscall_trace, 55 .raw_init = init_syscall_trace,
65}; 56};
66 57
67extern unsigned long __start_syscalls_metadata[]; 58extern struct syscall_metadata *__start_syscalls_metadata[];
68extern unsigned long __stop_syscalls_metadata[]; 59extern struct syscall_metadata *__stop_syscalls_metadata[];
69 60
70static struct syscall_metadata **syscalls_metadata; 61static struct syscall_metadata **syscalls_metadata;
71 62
72static struct syscall_metadata *find_syscall_meta(unsigned long syscall) 63#ifndef ARCH_HAS_SYSCALL_MATCH_SYM_NAME
64static inline bool arch_syscall_match_sym_name(const char *sym, const char *name)
65{
66 /*
67 * Only compare after the "sys" prefix. Archs that use
68 * syscall wrappers may have syscall symbol aliases prefixed
69 * with "SyS" instead of "sys", leading to an unwanted
70 * mismatch.
71 */
72 return !strcmp(sym + 3, name + 3);
73}
74#endif
75
76static __init struct syscall_metadata *
77find_syscall_meta(unsigned long syscall)
73{ 78{
74 struct syscall_metadata *start; 79 struct syscall_metadata **start;
75 struct syscall_metadata *stop; 80 struct syscall_metadata **stop;
76 char str[KSYM_SYMBOL_LEN]; 81 char str[KSYM_SYMBOL_LEN];
77 82
78 83
79 start = (struct syscall_metadata *)__start_syscalls_metadata; 84 start = __start_syscalls_metadata;
80 stop = (struct syscall_metadata *)__stop_syscalls_metadata; 85 stop = __stop_syscalls_metadata;
81 kallsyms_lookup(syscall, NULL, NULL, NULL, str); 86 kallsyms_lookup(syscall, NULL, NULL, NULL, str);
82 87
88 if (arch_syscall_match_sym_name(str, "sys_ni_syscall"))
89 return NULL;
90
83 for ( ; start < stop; start++) { 91 for ( ; start < stop; start++) {
84 /* 92 if ((*start)->name && arch_syscall_match_sym_name(str, (*start)->name))
85 * Only compare after the "sys" prefix. Archs that use 93 return *start;
86 * syscall wrappers may have syscalls symbols aliases prefixed
87 * with "SyS" instead of "sys", leading to an unwanted
88 * mismatch.
89 */
90 if (start->name && !strcmp(start->name + 3, str + 3))
91 return start;
92 } 94 }
93 return NULL; 95 return NULL;
94} 96}
@@ -367,7 +369,7 @@ int reg_event_syscall_enter(struct ftrace_event_call *call)
367 int num; 369 int num;
368 370
369 num = ((struct syscall_metadata *)call->data)->syscall_nr; 371 num = ((struct syscall_metadata *)call->data)->syscall_nr;
370 if (num < 0 || num >= NR_syscalls) 372 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
371 return -ENOSYS; 373 return -ENOSYS;
372 mutex_lock(&syscall_trace_lock); 374 mutex_lock(&syscall_trace_lock);
373 if (!sys_refcount_enter) 375 if (!sys_refcount_enter)
@@ -385,7 +387,7 @@ void unreg_event_syscall_enter(struct ftrace_event_call *call)
385 int num; 387 int num;
386 388
387 num = ((struct syscall_metadata *)call->data)->syscall_nr; 389 num = ((struct syscall_metadata *)call->data)->syscall_nr;
388 if (num < 0 || num >= NR_syscalls) 390 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
389 return; 391 return;
390 mutex_lock(&syscall_trace_lock); 392 mutex_lock(&syscall_trace_lock);
391 sys_refcount_enter--; 393 sys_refcount_enter--;
@@ -401,7 +403,7 @@ int reg_event_syscall_exit(struct ftrace_event_call *call)
401 int num; 403 int num;
402 404
403 num = ((struct syscall_metadata *)call->data)->syscall_nr; 405 num = ((struct syscall_metadata *)call->data)->syscall_nr;
404 if (num < 0 || num >= NR_syscalls) 406 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
405 return -ENOSYS; 407 return -ENOSYS;
406 mutex_lock(&syscall_trace_lock); 408 mutex_lock(&syscall_trace_lock);
407 if (!sys_refcount_exit) 409 if (!sys_refcount_exit)
@@ -419,7 +421,7 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call)
419 int num; 421 int num;
420 422
421 num = ((struct syscall_metadata *)call->data)->syscall_nr; 423 num = ((struct syscall_metadata *)call->data)->syscall_nr;
422 if (num < 0 || num >= NR_syscalls) 424 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
423 return; 425 return;
424 mutex_lock(&syscall_trace_lock); 426 mutex_lock(&syscall_trace_lock);
425 sys_refcount_exit--; 427 sys_refcount_exit--;
@@ -432,6 +434,14 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call)
432int init_syscall_trace(struct ftrace_event_call *call) 434int init_syscall_trace(struct ftrace_event_call *call)
433{ 435{
434 int id; 436 int id;
437 int num;
438
439 num = ((struct syscall_metadata *)call->data)->syscall_nr;
440 if (num < 0 || num >= NR_syscalls) {
441 pr_debug("syscall %s metadata not mapped, disabling ftrace event\n",
442 ((struct syscall_metadata *)call->data)->name);
443 return -ENOSYS;
444 }
435 445
436 if (set_syscall_print_fmt(call) < 0) 446 if (set_syscall_print_fmt(call) < 0)
437 return -ENOMEM; 447 return -ENOMEM;
@@ -446,7 +456,7 @@ int init_syscall_trace(struct ftrace_event_call *call)
446 return id; 456 return id;
447} 457}
448 458
449unsigned long __init arch_syscall_addr(int nr) 459unsigned long __init __weak arch_syscall_addr(int nr)
450{ 460{
451 return (unsigned long)sys_call_table[nr]; 461 return (unsigned long)sys_call_table[nr];
452} 462}
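
The trace_syscalls.c change above turns the inline prefix-skipping comparison into an overridable default, arch_syscall_match_sym_name(), and makes find_syscall_meta() bail out early when the looked-up symbol is sys_ni_syscall. A minimal userspace sketch of the comparison itself, with a hypothetical helper name:

/* Sketch: match syscall symbols while ignoring the "sys"/"SyS" prefix. */
#include <assert.h>
#include <string.h>

static int match_sym_name(const char *sym, const char *name)
{
	/* both strings are assumed to start with a three-char prefix */
	return strcmp(sym + 3, name + 3) == 0;
}

int main(void)
{
	assert(match_sym_name("SyS_open", "sys_open"));	/* wrapper alias */
	assert(match_sym_name("sys_read", "sys_read"));
	assert(!match_sym_name("sys_read", "sys_write"));
	return 0;
}
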
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index e95ee7f31d43..68187af4889e 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -27,8 +27,8 @@
27#include <linux/sched.h> 27#include <linux/sched.h>
28#include <linux/jump_label.h> 28#include <linux/jump_label.h>
29 29
30extern struct tracepoint __start___tracepoints[]; 30extern struct tracepoint * const __start___tracepoints_ptrs[];
31extern struct tracepoint __stop___tracepoints[]; 31extern struct tracepoint * const __stop___tracepoints_ptrs[];
32 32
33/* Set to 1 to enable tracepoint debug output */ 33/* Set to 1 to enable tracepoint debug output */
34static const int tracepoint_debug; 34static const int tracepoint_debug;
@@ -298,10 +298,10 @@ static void disable_tracepoint(struct tracepoint *elem)
298 * 298 *
299 * Updates the probe callback corresponding to a range of tracepoints. 299 * Updates the probe callback corresponding to a range of tracepoints.
300 */ 300 */
301void 301void tracepoint_update_probe_range(struct tracepoint * const *begin,
302tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end) 302 struct tracepoint * const *end)
303{ 303{
304 struct tracepoint *iter; 304 struct tracepoint * const *iter;
305 struct tracepoint_entry *mark_entry; 305 struct tracepoint_entry *mark_entry;
306 306
307 if (!begin) 307 if (!begin)
@@ -309,12 +309,12 @@ tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end)
309 309
310 mutex_lock(&tracepoints_mutex); 310 mutex_lock(&tracepoints_mutex);
311 for (iter = begin; iter < end; iter++) { 311 for (iter = begin; iter < end; iter++) {
312 mark_entry = get_tracepoint(iter->name); 312 mark_entry = get_tracepoint((*iter)->name);
313 if (mark_entry) { 313 if (mark_entry) {
314 set_tracepoint(&mark_entry, iter, 314 set_tracepoint(&mark_entry, *iter,
315 !!mark_entry->refcount); 315 !!mark_entry->refcount);
316 } else { 316 } else {
317 disable_tracepoint(iter); 317 disable_tracepoint(*iter);
318 } 318 }
319 } 319 }
320 mutex_unlock(&tracepoints_mutex); 320 mutex_unlock(&tracepoints_mutex);
@@ -326,8 +326,8 @@ tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end)
326static void tracepoint_update_probes(void) 326static void tracepoint_update_probes(void)
327{ 327{
328 /* Core kernel tracepoints */ 328 /* Core kernel tracepoints */
329 tracepoint_update_probe_range(__start___tracepoints, 329 tracepoint_update_probe_range(__start___tracepoints_ptrs,
330 __stop___tracepoints); 330 __stop___tracepoints_ptrs);
331 /* tracepoints in modules. */ 331 /* tracepoints in modules. */
332 module_update_tracepoints(); 332 module_update_tracepoints();
333} 333}
@@ -514,8 +514,8 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_update_all);
514 * Will return the first tracepoint in the range if the input tracepoint is 514 * Will return the first tracepoint in the range if the input tracepoint is
515 * NULL. 515 * NULL.
516 */ 516 */
517int tracepoint_get_iter_range(struct tracepoint **tracepoint, 517int tracepoint_get_iter_range(struct tracepoint * const **tracepoint,
518 struct tracepoint *begin, struct tracepoint *end) 518 struct tracepoint * const *begin, struct tracepoint * const *end)
519{ 519{
520 if (!*tracepoint && begin != end) { 520 if (!*tracepoint && begin != end) {
521 *tracepoint = begin; 521 *tracepoint = begin;
@@ -534,7 +534,8 @@ static void tracepoint_get_iter(struct tracepoint_iter *iter)
534 /* Core kernel tracepoints */ 534 /* Core kernel tracepoints */
535 if (!iter->module) { 535 if (!iter->module) {
536 found = tracepoint_get_iter_range(&iter->tracepoint, 536 found = tracepoint_get_iter_range(&iter->tracepoint,
537 __start___tracepoints, __stop___tracepoints); 537 __start___tracepoints_ptrs,
538 __stop___tracepoints_ptrs);
538 if (found) 539 if (found)
539 goto end; 540 goto end;
540 } 541 }
@@ -585,8 +586,8 @@ int tracepoint_module_notify(struct notifier_block *self,
585 switch (val) { 586 switch (val) {
586 case MODULE_STATE_COMING: 587 case MODULE_STATE_COMING:
587 case MODULE_STATE_GOING: 588 case MODULE_STATE_GOING:
588 tracepoint_update_probe_range(mod->tracepoints, 589 tracepoint_update_probe_range(mod->tracepoints_ptrs,
589 mod->tracepoints + mod->num_tracepoints); 590 mod->tracepoints_ptrs + mod->num_tracepoints);
590 break; 591 break;
591 } 592 }
592 return 0; 593 return 0;
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 25915832291a..9da289c34f22 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -12,6 +12,8 @@
12#include <linux/highuid.h> 12#include <linux/highuid.h>
13#include <linux/cred.h> 13#include <linux/cred.h>
14 14
15static struct kmem_cache *user_ns_cachep __read_mostly;
16
15/* 17/*
16 * Create a new user namespace, deriving the creator from the user in the 18 * Create a new user namespace, deriving the creator from the user in the
17 * passed credentials, and replacing that user with the new root user for the 19 * passed credentials, and replacing that user with the new root user for the
@@ -26,7 +28,7 @@ int create_user_ns(struct cred *new)
26 struct user_struct *root_user; 28 struct user_struct *root_user;
27 int n; 29 int n;
28 30
29 ns = kmalloc(sizeof(struct user_namespace), GFP_KERNEL); 31 ns = kmem_cache_alloc(user_ns_cachep, GFP_KERNEL);
30 if (!ns) 32 if (!ns)
31 return -ENOMEM; 33 return -ENOMEM;
32 34
@@ -38,7 +40,7 @@ int create_user_ns(struct cred *new)
38 /* Alloc new root user. */ 40 /* Alloc new root user. */
39 root_user = alloc_uid(ns, 0); 41 root_user = alloc_uid(ns, 0);
40 if (!root_user) { 42 if (!root_user) {
41 kfree(ns); 43 kmem_cache_free(user_ns_cachep, ns);
42 return -ENOMEM; 44 return -ENOMEM;
43 } 45 }
44 46
@@ -71,7 +73,7 @@ static void free_user_ns_work(struct work_struct *work)
71 struct user_namespace *ns = 73 struct user_namespace *ns =
72 container_of(work, struct user_namespace, destroyer); 74 container_of(work, struct user_namespace, destroyer);
73 free_uid(ns->creator); 75 free_uid(ns->creator);
74 kfree(ns); 76 kmem_cache_free(user_ns_cachep, ns);
75} 77}
76 78
77void free_user_ns(struct kref *kref) 79void free_user_ns(struct kref *kref)
@@ -126,3 +128,10 @@ gid_t user_ns_map_gid(struct user_namespace *to, const struct cred *cred, gid_t
126 /* No useful relationship so no mapping */ 128 /* No useful relationship so no mapping */
127 return overflowgid; 129 return overflowgid;
128} 130}
131
132static __init int user_namespaces_init(void)
133{
134 user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC);
135 return 0;
136}
137module_init(user_namespaces_init);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 6e7b575ac33c..18bb15776c57 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -27,7 +27,7 @@
27#include <asm/irq_regs.h> 27#include <asm/irq_regs.h>
28#include <linux/perf_event.h> 28#include <linux/perf_event.h>
29 29
30int watchdog_enabled; 30int watchdog_enabled = 1;
31int __read_mostly softlockup_thresh = 60; 31int __read_mostly softlockup_thresh = 60;
32 32
33static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); 33static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
@@ -43,9 +43,6 @@ static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
43static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); 43static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
44#endif 44#endif
45 45
46static int no_watchdog;
47
48
49/* boot commands */ 46/* boot commands */
50/* 47/*
51 * Should we panic when a soft-lockup or hard-lockup occurs: 48 * Should we panic when a soft-lockup or hard-lockup occurs:
@@ -58,7 +55,7 @@ static int __init hardlockup_panic_setup(char *str)
58 if (!strncmp(str, "panic", 5)) 55 if (!strncmp(str, "panic", 5))
59 hardlockup_panic = 1; 56 hardlockup_panic = 1;
60 else if (!strncmp(str, "0", 1)) 57 else if (!strncmp(str, "0", 1))
61 no_watchdog = 1; 58 watchdog_enabled = 0;
62 return 1; 59 return 1;
63} 60}
64__setup("nmi_watchdog=", hardlockup_panic_setup); 61__setup("nmi_watchdog=", hardlockup_panic_setup);
@@ -77,7 +74,7 @@ __setup("softlockup_panic=", softlockup_panic_setup);
77 74
78static int __init nowatchdog_setup(char *str) 75static int __init nowatchdog_setup(char *str)
79{ 76{
80 no_watchdog = 1; 77 watchdog_enabled = 0;
81 return 1; 78 return 1;
82} 79}
83__setup("nowatchdog", nowatchdog_setup); 80__setup("nowatchdog", nowatchdog_setup);
@@ -85,7 +82,7 @@ __setup("nowatchdog", nowatchdog_setup);
85/* deprecated */ 82/* deprecated */
86static int __init nosoftlockup_setup(char *str) 83static int __init nosoftlockup_setup(char *str)
87{ 84{
88 no_watchdog = 1; 85 watchdog_enabled = 0;
89 return 1; 86 return 1;
90} 87}
91__setup("nosoftlockup", nosoftlockup_setup); 88__setup("nosoftlockup", nosoftlockup_setup);
@@ -118,12 +115,12 @@ static void __touch_watchdog(void)
118{ 115{
119 int this_cpu = smp_processor_id(); 116 int this_cpu = smp_processor_id();
120 117
121 __get_cpu_var(watchdog_touch_ts) = get_timestamp(this_cpu); 118 __this_cpu_write(watchdog_touch_ts, get_timestamp(this_cpu));
122} 119}
123 120
124void touch_softlockup_watchdog(void) 121void touch_softlockup_watchdog(void)
125{ 122{
126 __raw_get_cpu_var(watchdog_touch_ts) = 0; 123 __this_cpu_write(watchdog_touch_ts, 0);
127} 124}
128EXPORT_SYMBOL(touch_softlockup_watchdog); 125EXPORT_SYMBOL(touch_softlockup_watchdog);
129 126
@@ -167,12 +164,12 @@ void touch_softlockup_watchdog_sync(void)
167/* watchdog detector functions */ 164/* watchdog detector functions */
168static int is_hardlockup(void) 165static int is_hardlockup(void)
169{ 166{
170 unsigned long hrint = __get_cpu_var(hrtimer_interrupts); 167 unsigned long hrint = __this_cpu_read(hrtimer_interrupts);
171 168
172 if (__get_cpu_var(hrtimer_interrupts_saved) == hrint) 169 if (__this_cpu_read(hrtimer_interrupts_saved) == hrint)
173 return 1; 170 return 1;
174 171
175 __get_cpu_var(hrtimer_interrupts_saved) = hrint; 172 __this_cpu_write(hrtimer_interrupts_saved, hrint);
176 return 0; 173 return 0;
177} 174}
178#endif 175#endif
@@ -205,8 +202,8 @@ static void watchdog_overflow_callback(struct perf_event *event, int nmi,
205 /* Ensure the watchdog never gets throttled */ 202 /* Ensure the watchdog never gets throttled */
206 event->hw.interrupts = 0; 203 event->hw.interrupts = 0;
207 204
208 if (__get_cpu_var(watchdog_nmi_touch) == true) { 205 if (__this_cpu_read(watchdog_nmi_touch) == true) {
209 __get_cpu_var(watchdog_nmi_touch) = false; 206 __this_cpu_write(watchdog_nmi_touch, false);
210 return; 207 return;
211 } 208 }
212 209
@@ -220,7 +217,7 @@ static void watchdog_overflow_callback(struct perf_event *event, int nmi,
220 int this_cpu = smp_processor_id(); 217 int this_cpu = smp_processor_id();
221 218
222 /* only print hardlockups once */ 219 /* only print hardlockups once */
223 if (__get_cpu_var(hard_watchdog_warn) == true) 220 if (__this_cpu_read(hard_watchdog_warn) == true)
224 return; 221 return;
225 222
226 if (hardlockup_panic) 223 if (hardlockup_panic)
@@ -228,16 +225,16 @@ static void watchdog_overflow_callback(struct perf_event *event, int nmi,
228 else 225 else
229 WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu); 226 WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu);
230 227
231 __get_cpu_var(hard_watchdog_warn) = true; 228 __this_cpu_write(hard_watchdog_warn, true);
232 return; 229 return;
233 } 230 }
234 231
235 __get_cpu_var(hard_watchdog_warn) = false; 232 __this_cpu_write(hard_watchdog_warn, false);
236 return; 233 return;
237} 234}
238static void watchdog_interrupt_count(void) 235static void watchdog_interrupt_count(void)
239{ 236{
240 __get_cpu_var(hrtimer_interrupts)++; 237 __this_cpu_inc(hrtimer_interrupts);
241} 238}
242#else 239#else
243static inline void watchdog_interrupt_count(void) { return; } 240static inline void watchdog_interrupt_count(void) { return; }
@@ -246,7 +243,7 @@ static inline void watchdog_interrupt_count(void) { return; }
246/* watchdog kicker functions */ 243/* watchdog kicker functions */
247static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) 244static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
248{ 245{
249 unsigned long touch_ts = __get_cpu_var(watchdog_touch_ts); 246 unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts);
250 struct pt_regs *regs = get_irq_regs(); 247 struct pt_regs *regs = get_irq_regs();
251 int duration; 248 int duration;
252 249
@@ -254,18 +251,18 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
254 watchdog_interrupt_count(); 251 watchdog_interrupt_count();
255 252
256 /* kick the softlockup detector */ 253 /* kick the softlockup detector */
257 wake_up_process(__get_cpu_var(softlockup_watchdog)); 254 wake_up_process(__this_cpu_read(softlockup_watchdog));
258 255
259 /* .. and repeat */ 256 /* .. and repeat */
260 hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period())); 257 hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period()));
261 258
262 if (touch_ts == 0) { 259 if (touch_ts == 0) {
263 if (unlikely(__get_cpu_var(softlockup_touch_sync))) { 260 if (unlikely(__this_cpu_read(softlockup_touch_sync))) {
264 /* 261 /*
265 * If the time stamp was touched atomically 262 * If the time stamp was touched atomically
266 * make sure the scheduler tick is up to date. 263 * make sure the scheduler tick is up to date.
267 */ 264 */
268 __get_cpu_var(softlockup_touch_sync) = false; 265 __this_cpu_write(softlockup_touch_sync, false);
269 sched_clock_tick(); 266 sched_clock_tick();
270 } 267 }
271 __touch_watchdog(); 268 __touch_watchdog();
@@ -281,7 +278,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
281 duration = is_softlockup(touch_ts); 278 duration = is_softlockup(touch_ts);
282 if (unlikely(duration)) { 279 if (unlikely(duration)) {
283 /* only warn once */ 280 /* only warn once */
284 if (__get_cpu_var(soft_watchdog_warn) == true) 281 if (__this_cpu_read(soft_watchdog_warn) == true)
285 return HRTIMER_RESTART; 282 return HRTIMER_RESTART;
286 283
287 printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", 284 printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
@@ -296,9 +293,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
296 293
297 if (softlockup_panic) 294 if (softlockup_panic)
298 panic("softlockup: hung tasks"); 295 panic("softlockup: hung tasks");
299 __get_cpu_var(soft_watchdog_warn) = true; 296 __this_cpu_write(soft_watchdog_warn, true);
300 } else 297 } else
301 __get_cpu_var(soft_watchdog_warn) = false; 298 __this_cpu_write(soft_watchdog_warn, false);
302 299
303 return HRTIMER_RESTART; 300 return HRTIMER_RESTART;
304} 301}
@@ -366,8 +363,14 @@ static int watchdog_nmi_enable(int cpu)
366 goto out_save; 363 goto out_save;
367 } 364 }
368 365
369 printk(KERN_ERR "NMI watchdog disabled for cpu%i: unable to create perf event: %ld\n", 366
370 cpu, PTR_ERR(event)); 367 /* vary the KERN level based on the returned errno */
368 if (PTR_ERR(event) == -EOPNOTSUPP)
369 printk(KERN_INFO "NMI watchdog disabled (cpu%i): not supported (no LAPIC?)\n", cpu);
370 else if (PTR_ERR(event) == -ENOENT)
371 printk(KERN_WARNING "NMI watchdog disabled (cpu%i): hardware events not enabled\n", cpu);
372 else
373 printk(KERN_ERR "NMI watchdog disabled (cpu%i): unable to create perf event: %ld\n", cpu, PTR_ERR(event));
371 return PTR_ERR(event); 374 return PTR_ERR(event);
372 375
373 /* success path */ 376 /* success path */
@@ -432,9 +435,6 @@ static int watchdog_enable(int cpu)
 		wake_up_process(p);
 	}
 
-	/* if any cpu succeeds, watchdog is considered enabled for the system */
-	watchdog_enabled = 1;
-
 	return 0;
 }
 
@@ -462,12 +462,16 @@ static void watchdog_disable(int cpu)
 static void watchdog_enable_all_cpus(void)
 {
 	int cpu;
-	int result = 0;
+
+	watchdog_enabled = 0;
 
 	for_each_online_cpu(cpu)
-		result += watchdog_enable(cpu);
+		if (!watchdog_enable(cpu))
+			/* if any cpu succeeds, watchdog is considered
+			   enabled for the system */
+			watchdog_enabled = 1;
 
-	if (result)
+	if (!watchdog_enabled)
 		printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n");
 
 }
@@ -476,9 +480,6 @@ static void watchdog_disable_all_cpus(void)
 {
 	int cpu;
 
-	if (no_watchdog)
-		return;
-
 	for_each_online_cpu(cpu)
 		watchdog_disable(cpu);
 
@@ -498,10 +499,12 @@ int proc_dowatchdog_enabled(struct ctl_table *table, int write,
 {
 	proc_dointvec(table, write, buffer, length, ppos);
 
-	if (watchdog_enabled)
-		watchdog_enable_all_cpus();
-	else
-		watchdog_disable_all_cpus();
+	if (write) {
+		if (watchdog_enabled)
+			watchdog_enable_all_cpus();
+		else
+			watchdog_disable_all_cpus();
+	}
 	return 0;
 }
 
@@ -530,7 +533,8 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		break;
 	case CPU_ONLINE:
 	case CPU_ONLINE_FROZEN:
-		err = watchdog_enable(hotcpu);
+		if (watchdog_enabled)
+			err = watchdog_enable(hotcpu);
 		break;
 #ifdef CONFIG_HOTPLUG_CPU
 	case CPU_UP_CANCELED:
@@ -555,9 +559,6 @@ void __init lockup_detector_init(void)
 	void *cpu = (void *)(long)smp_processor_id();
 	int err;
 
-	if (no_watchdog)
-		return;
-
 	err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
 	WARN_ON(notifier_to_errno(err));
 
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index e785b0f2aea5..b5fe4c00eb3c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -79,7 +79,9 @@ enum {
 	MAX_IDLE_WORKERS_RATIO	= 4,		/* 1/4 of busy can be idle */
 	IDLE_WORKER_TIMEOUT	= 300 * HZ,	/* keep idle ones for 5 mins */
 
-	MAYDAY_INITIAL_TIMEOUT	= HZ / 100,	/* call for help after 10ms */
+	MAYDAY_INITIAL_TIMEOUT	= HZ / 100 >= 2 ? HZ / 100 : 2,
+						/* call for help after 10ms
+						   (min two ticks) */
 	MAYDAY_INTERVAL		= HZ / 10,	/* and then every 100ms */
 	CREATE_COOLDOWN		= HZ,		/* time to breath after fail */
 	TRUSTEE_COOLDOWN	= HZ / 10,	/* for trustee draining */
@@ -314,6 +316,11 @@ static inline int __next_wq_cpu(int cpu, const struct cpumask *mask,
 
 static struct debug_obj_descr work_debug_descr;
 
+static void *work_debug_hint(void *addr)
+{
+	return ((struct work_struct *) addr)->func;
+}
+
 /*
  * fixup_init is called when:
  * - an active object is initialized
@@ -385,6 +392,7 @@ static int work_fixup_free(void *addr, enum debug_obj_state state)
 
 static struct debug_obj_descr work_debug_descr = {
 	.name		= "work_struct",
+	.debug_hint	= work_debug_hint,
 	.fixup_init	= work_fixup_init,
 	.fixup_activate	= work_fixup_activate,
 	.fixup_free	= work_fixup_free,
@@ -768,7 +776,11 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
 
 	worker->flags &= ~flags;
 
-	/* if transitioning out of NOT_RUNNING, increment nr_running */
+	/*
+	 * If transitioning out of NOT_RUNNING, increment nr_running. Note
+	 * that the nested NOT_RUNNING is not a noop. NOT_RUNNING is mask
+	 * of multiple flags, not a single flag.
+	 */
 	if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
 		if (!(worker->flags & WORKER_NOT_RUNNING))
 			atomic_inc(get_gcwq_nr_running(gcwq->cpu));
@@ -932,6 +944,38 @@ static void insert_work(struct cpu_workqueue_struct *cwq,
 	wake_up_worker(gcwq);
 }
 
+/*
+ * Test whether @work is being queued from another work executing on the
+ * same workqueue. This is rather expensive and should only be used from
+ * cold paths.
+ */
+static bool is_chained_work(struct workqueue_struct *wq)
+{
+	unsigned long flags;
+	unsigned int cpu;
+
+	for_each_gcwq_cpu(cpu) {
+		struct global_cwq *gcwq = get_gcwq(cpu);
+		struct worker *worker;
+		struct hlist_node *pos;
+		int i;
+
+		spin_lock_irqsave(&gcwq->lock, flags);
+		for_each_busy_worker(worker, i, pos, gcwq) {
+			if (worker->task != current)
+				continue;
+			spin_unlock_irqrestore(&gcwq->lock, flags);
+			/*
+			 * I'm @worker, no locking necessary. See if @work
+			 * is headed to the same workqueue.
+			 */
+			return worker->current_cwq->wq == wq;
+		}
+		spin_unlock_irqrestore(&gcwq->lock, flags);
+	}
+	return false;
+}
+
 static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
 			 struct work_struct *work)
 {
@@ -943,7 +987,9 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
 
 	debug_work_activate(work);
 
-	if (WARN_ON_ONCE(wq->flags & WQ_DYING))
+	/* if dying, only works from the same workqueue are allowed */
+	if (unlikely(wq->flags & WQ_DYING) &&
+	    WARN_ON_ONCE(!is_chained_work(wq)))
 		return;
 
 	/* determine gcwq to use */
@@ -1806,7 +1852,7 @@ __acquires(&gcwq->lock)
 	spin_unlock_irq(&gcwq->lock);
 
 	work_clear_pending(work);
-	lock_map_acquire(&cwq->wq->lockdep_map);
+	lock_map_acquire_read(&cwq->wq->lockdep_map);
 	lock_map_acquire(&lockdep_map);
 	trace_workqueue_execute_start(work);
 	f(work);
@@ -2009,6 +2055,15 @@ repeat:
 			move_linked_works(work, scheduled, &n);
 
 		process_scheduled_works(rescuer);
+
+		/*
+		 * Leave this gcwq. If keep_working() is %true, notify a
+		 * regular worker; otherwise, we end up with 0 concurrency
+		 * and stalling the execution.
+		 */
+		if (keep_working(gcwq))
+			wake_up_worker(gcwq);
+
 		spin_unlock_irq(&gcwq->lock);
 	}
 
@@ -2350,8 +2405,18 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
 	insert_wq_barrier(cwq, barr, work, worker);
 	spin_unlock_irq(&gcwq->lock);
 
-	lock_map_acquire(&cwq->wq->lockdep_map);
+	/*
+	 * If @max_active is 1 or rescuer is in use, flushing another work
+	 * item on the same workqueue may lead to deadlock. Make sure the
+	 * flusher is not running on the same workqueue by verifying write
+	 * access.
+	 */
+	if (cwq->wq->saved_max_active == 1 || cwq->wq->flags & WQ_RESCUER)
+		lock_map_acquire(&cwq->wq->lockdep_map);
+	else
+		lock_map_acquire_read(&cwq->wq->lockdep_map);
 	lock_map_release(&cwq->wq->lockdep_map);
+
 	return true;
 already_gone:
 	spin_unlock_irq(&gcwq->lock);
@@ -2908,7 +2973,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name,
 	 */
 	spin_lock(&workqueue_lock);
 
-	if (workqueue_freezing && wq->flags & WQ_FREEZEABLE)
+	if (workqueue_freezing && wq->flags & WQ_FREEZABLE)
 		for_each_cwq_cpu(cpu, wq)
 			get_cwq(cpu, wq)->max_active = 0;
 
@@ -2936,11 +3001,35 @@ EXPORT_SYMBOL_GPL(__alloc_workqueue_key);
  */
 void destroy_workqueue(struct workqueue_struct *wq)
 {
+	unsigned int flush_cnt = 0;
 	unsigned int cpu;
 
+	/*
+	 * Mark @wq dying and drain all pending works. Once WQ_DYING is
+	 * set, only chain queueing is allowed. IOW, only currently
+	 * pending or running work items on @wq can queue further work
+	 * items on it. @wq is flushed repeatedly until it becomes empty.
+	 * The number of flushing is detemined by the depth of chaining and
+	 * should be relatively short. Whine if it takes too long.
+	 */
 	wq->flags |= WQ_DYING;
+reflush:
 	flush_workqueue(wq);
 
+	for_each_cwq_cpu(cpu, wq) {
+		struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
+
+		if (!cwq->nr_active && list_empty(&cwq->delayed_works))
+			continue;
+
+		if (++flush_cnt == 10 ||
+		    (flush_cnt % 100 == 0 && flush_cnt <= 1000))
+			printk(KERN_WARNING "workqueue %s: flush on "
+			       "destruction isn't complete after %u tries\n",
+			       wq->name, flush_cnt);
+		goto reflush;
+	}
+
 	/*
 	 * wq list is used to freeze wq, remove from list after
 	 * flushing is complete in case freeze races us.
@@ -2996,7 +3085,7 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
 
 		spin_lock_irq(&gcwq->lock);
 
-		if (!(wq->flags & WQ_FREEZEABLE) ||
+		if (!(wq->flags & WQ_FREEZABLE) ||
 		    !(gcwq->flags & GCWQ_FREEZING))
 			get_cwq(gcwq->cpu, wq)->max_active = max_active;
 
@@ -3246,7 +3335,7 @@ static int __cpuinit trustee_thread(void *__gcwq)
 	 * want to get it over with ASAP - spam rescuers, wake up as
 	 * many idlers as necessary and create new ones till the
 	 * worklist is empty. Note that if the gcwq is frozen, there
-	 * may be frozen works in freezeable cwqs. Don't declare
+	 * may be frozen works in freezable cwqs. Don't declare
 	 * completion while frozen.
 	 */
 	while (gcwq->nr_workers != gcwq->nr_idle ||
@@ -3504,9 +3593,9 @@ EXPORT_SYMBOL_GPL(work_on_cpu);
 /**
  * freeze_workqueues_begin - begin freezing workqueues
  *
- * Start freezing workqueues. After this function returns, all
- * freezeable workqueues will queue new works to their frozen_works
- * list instead of gcwq->worklist.
+ * Start freezing workqueues. After this function returns, all freezable
+ * workqueues will queue new works to their frozen_works list instead of
+ * gcwq->worklist.
  *
  * CONTEXT:
  * Grabs and releases workqueue_lock and gcwq->lock's.
@@ -3532,7 +3621,7 @@ void freeze_workqueues_begin(void)
 		list_for_each_entry(wq, &workqueues, list) {
 			struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
 
-			if (cwq && wq->flags & WQ_FREEZEABLE)
+			if (cwq && wq->flags & WQ_FREEZABLE)
 				cwq->max_active = 0;
 		}
 
@@ -3543,7 +3632,7 @@ void freeze_workqueues_begin(void)
 }
 
 /**
- * freeze_workqueues_busy - are freezeable workqueues still busy?
+ * freeze_workqueues_busy - are freezable workqueues still busy?
  *
  * Check whether freezing is complete. This function must be called
  * between freeze_workqueues_begin() and thaw_workqueues().
@@ -3552,8 +3641,8 @@ void freeze_workqueues_begin(void)
  * Grabs and releases workqueue_lock.
  *
  * RETURNS:
- * %true if some freezeable workqueues are still busy. %false if
- * freezing is complete.
+ * %true if some freezable workqueues are still busy. %false if freezing
+ * is complete.
  */
 bool freeze_workqueues_busy(void)
 {
@@ -3573,7 +3662,7 @@ bool freeze_workqueues_busy(void)
 		list_for_each_entry(wq, &workqueues, list) {
 			struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
 
-			if (!cwq || !(wq->flags & WQ_FREEZEABLE))
+			if (!cwq || !(wq->flags & WQ_FREEZABLE))
 				continue;
 
 			BUG_ON(cwq->nr_active < 0);
@@ -3618,7 +3707,7 @@ void thaw_workqueues(void)
 		list_for_each_entry(wq, &workqueues, list) {
 			struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
 
-			if (!cwq || !(wq->flags & WQ_FREEZEABLE))
+			if (!cwq || !(wq->flags & WQ_FREEZABLE))
 				continue;
 
 			/* restore max_active and repopulate worklist */