Diffstat (limited to 'kernel')
109 files changed, 6720 insertions, 3126 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 0b5ff083fa22..353d3fe8ba33 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -43,7 +43,7 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
 obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
 obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
-obj-$(CONFIG_USE_GENERIC_SMP_HELPERS) += smp.o
+obj-$(CONFIG_SMP) += smp.o
 ifneq ($(CONFIG_SMP),y)
 obj-y += up.o
 endif
@@ -100,6 +100,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace/
 obj-$(CONFIG_TRACING) += trace/
 obj-$(CONFIG_X86_DS) += trace/
 obj-$(CONFIG_RING_BUFFER) += trace/
+obj-$(CONFIG_TRACEPOINTS) += trace/
 obj-$(CONFIG_SMP) += sched_cpupri.o
 obj-$(CONFIG_IRQ_WORK) += irq_work.o
 obj-$(CONFIG_PERF_EVENTS) += perf_event.o
@@ -121,7 +122,7 @@ $(obj)/configs.o: $(obj)/config_data.h
 # config_data.h contains the same information as ikconfig.h but gzipped.
 # Info from config_data can be extracted from /proc/config*
 targets += config_data.gz
-$(obj)/config_data.gz: .config FORCE
+$(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
 $(call if_changed,gzip)
 
 quiet_cmd_ikconfiggz = IKCFG $@
diff --git a/kernel/audit.c b/kernel/audit.c
index 77770a034d59..e4956244ae50 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -400,7 +400,7 @@ static void kauditd_send_skb(struct sk_buff *skb)
 if (err < 0) {
 BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */
 printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid);
-audit_log_lost("auditd dissapeared\n");
+audit_log_lost("auditd disappeared\n");
 audit_pid = 0;
 /* we might get lucky and get this in the next auditd */
 audit_hold_skb(skb);
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index d2e3c7866460..e683869365d9 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -144,9 +144,9 @@ int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev)
 }
 
 /* Initialize a parent watch entry. */
-static struct audit_parent *audit_init_parent(struct nameidata *ndp)
+static struct audit_parent *audit_init_parent(struct path *path)
 {
-struct inode *inode = ndp->path.dentry->d_inode;
+struct inode *inode = path->dentry->d_inode;
 struct audit_parent *parent;
 int ret;
 
@@ -353,53 +353,40 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
 }
 
 /* Get path information necessary for adding watches. */
-static int audit_get_nd(char *path, struct nameidata **ndp, struct nameidata **ndw)
+static int audit_get_nd(struct audit_watch *watch, struct path *parent)
 {
-struct nameidata *ndparent, *ndwatch;
+struct nameidata nd;
+struct dentry *d;
 int err;
 
-ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL);
-if (unlikely(!ndparent))
-return -ENOMEM;
+err = kern_path_parent(watch->path, &nd);
+if (err)
+return err;
 
-ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL);
-if (unlikely(!ndwatch)) {
-kfree(ndparent);
-return -ENOMEM;
+if (nd.last_type != LAST_NORM) {
+path_put(&nd.path);
+return -EINVAL;
 }
 
-err = path_lookup(path, LOOKUP_PARENT, ndparent);
-if (err) {
-kfree(ndparent);
-kfree(ndwatch);
-return err;
+mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
+d = lookup_one_len(nd.last.name, nd.path.dentry, nd.last.len);
+if (IS_ERR(d)) {
+mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
+path_put(&nd.path);
+return PTR_ERR(d);
 }
-
-err = path_lookup(path, 0, ndwatch);
-if (err) {
-kfree(ndwatch);
-ndwatch = NULL;
+if (d->d_inode) {
+/* update watch filter fields */
+watch->dev = d->d_inode->i_sb->s_dev;
+watch->ino = d->d_inode->i_ino;
 }
+mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
 
-*ndp = ndparent;
-*ndw = ndwatch;
-
+*parent = nd.path;
+dput(d);
 return 0;
 }
 
-/* Release resources used for watch path information. */
-static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw)
-{
-if (ndp) {
-path_put(&ndp->path);
-kfree(ndp);
-}
-if (ndw) {
-path_put(&ndw->path);
-kfree(ndw);
-}
-}
-
 /* Associate the given rule with an existing parent.
  * Caller must hold audit_filter_mutex. */
 static void audit_add_to_parent(struct audit_krule *krule,
@@ -440,31 +427,24 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list)
 {
 struct audit_watch *watch = krule->watch;
 struct audit_parent *parent;
-struct nameidata *ndp = NULL, *ndw = NULL;
+struct path parent_path;
 int h, ret = 0;
 
 mutex_unlock(&audit_filter_mutex);
 
 /* Avoid calling path_lookup under audit_filter_mutex. */
-ret = audit_get_nd(watch->path, &ndp, &ndw);
-if (ret) {
-/* caller expects mutex locked */
-mutex_lock(&audit_filter_mutex);
-goto error;
-}
+ret = audit_get_nd(watch, &parent_path);
 
+/* caller expects mutex locked */
 mutex_lock(&audit_filter_mutex);
 
-/* update watch filter fields */
-if (ndw) {
-watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev;
-watch->ino = ndw->path.dentry->d_inode->i_ino;
-}
+if (ret)
+return ret;
 
 /* either find an old parent or attach a new one */
-parent = audit_find_parent(ndp->path.dentry->d_inode);
+parent = audit_find_parent(parent_path.dentry->d_inode);
 if (!parent) {
-parent = audit_init_parent(ndp);
+parent = audit_init_parent(&parent_path);
 if (IS_ERR(parent)) {
 ret = PTR_ERR(parent);
 goto error;
@@ -479,9 +459,8 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list)
 h = audit_hash_ino((u32)watch->ino);
 *list = &audit_inode_hash[h];
 error:
-audit_put_nd(ndp, ndw); /* NULL args OK */
+path_put(&parent_path);
 return ret;
-
 }
 
 void audit_remove_watch_rule(struct audit_krule *krule)
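[Editorial sketch, not part of the patch] The new audit_get_nd() above is built on the kern_path_parent() + lookup_one_len() pattern: pin the parent directory, then resolve the final component under the parent's i_mutex. A minimal, illustrative version of that pattern (the function name example_resolve_parent is hypothetical):

/* Illustrative only; assumes <linux/namei.h> and <linux/fs.h> of this era. */
static int example_resolve_parent(const char *pathname)
{
	struct nameidata nd;
	struct dentry *d;
	int err;

	err = kern_path_parent(pathname, &nd);	/* pins nd.path, the parent dir */
	if (err)
		return err;

	mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
	d = lookup_one_len(nd.last.name, nd.path.dentry, nd.last.len);
	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
	if (IS_ERR(d)) {
		path_put(&nd.path);
		return PTR_ERR(d);
	}

	if (d->d_inode) {
		/* final component exists; inode fields can be sampled here */
	}

	dput(d);
	path_put(&nd.path);	/* the real audit_get_nd() keeps this reference */
	return 0;
}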
diff --git a/kernel/capability.c b/kernel/capability.c
index 2f05303715a5..9e9385f132c8 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -306,7 +306,7 @@ int capable(int cap)
 BUG();
 }
 
-if (security_capable(cap) == 0) {
+if (security_capable(current_cred(), cap) == 0) {
 current->flags |= PF_SUPERPRIV;
 return 1;
 }
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 66a416b42c18..95362d15128c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -764,6 +764,7 @@ EXPORT_SYMBOL_GPL(cgroup_unlock);
 */
 
 static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode);
+static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *);
 static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
 static int cgroup_populate_dir(struct cgroup *cgrp);
 static const struct inode_operations cgroup_dir_inode_operations;
@@ -860,6 +861,11 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
 iput(inode);
 }
 
+static int cgroup_delete(const struct dentry *d)
+{
+return 1;
+}
+
 static void remove_dir(struct dentry *d)
 {
 struct dentry *parent = dget(d->d_parent);
@@ -874,25 +880,29 @@ static void cgroup_clear_directory(struct dentry *dentry)
 struct list_head *node;
 
 BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
-spin_lock(&dcache_lock);
+spin_lock(&dentry->d_lock);
 node = dentry->d_subdirs.next;
 while (node != &dentry->d_subdirs) {
 struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
+
+spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
 list_del_init(node);
 if (d->d_inode) {
 /* This should never be called on a cgroup
 * directory with child cgroups */
 BUG_ON(d->d_inode->i_mode & S_IFDIR);
-d = dget_locked(d);
-spin_unlock(&dcache_lock);
+dget_dlock(d);
+spin_unlock(&d->d_lock);
+spin_unlock(&dentry->d_lock);
 d_delete(d);
 simple_unlink(dentry->d_inode, d);
 dput(d);
-spin_lock(&dcache_lock);
-}
+spin_lock(&dentry->d_lock);
+} else
+spin_unlock(&d->d_lock);
 node = dentry->d_subdirs.next;
 }
-spin_unlock(&dcache_lock);
+spin_unlock(&dentry->d_lock);
 }
 
 /*
@@ -900,11 +910,16 @@ static void cgroup_clear_directory(struct dentry *dentry)
 */
 static void cgroup_d_remove_dir(struct dentry *dentry)
 {
+struct dentry *parent;
+
 cgroup_clear_directory(dentry);
 
-spin_lock(&dcache_lock);
+parent = dentry->d_parent;
+spin_lock(&parent->d_lock);
+spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
 list_del_init(&dentry->d_u.d_child);
-spin_unlock(&dcache_lock);
+spin_unlock(&dentry->d_lock);
+spin_unlock(&parent->d_lock);
 remove_dir(dentry);
 }
 
@@ -1440,6 +1455,11 @@ static int cgroup_set_super(struct super_block *sb, void *data)
 
 static int cgroup_get_rootdir(struct super_block *sb)
 {
+static const struct dentry_operations cgroup_dops = {
+.d_iput = cgroup_diput,
+.d_delete = cgroup_delete,
+};
+
 struct inode *inode =
 cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb);
 struct dentry *dentry;
@@ -1457,6 +1477,8 @@ static int cgroup_get_rootdir(struct super_block *sb)
 return -ENOMEM;
 }
 sb->s_root = dentry;
+/* for everything else we want ->d_op set */
+sb->s_d_op = &cgroup_dops;
 return 0;
 }
 
@@ -2180,12 +2202,20 @@ static const struct file_operations cgroup_file_operations = {
 };
 
 static const struct inode_operations cgroup_dir_inode_operations = {
-.lookup = simple_lookup,
+.lookup = cgroup_lookup,
 .mkdir = cgroup_mkdir,
 .rmdir = cgroup_rmdir,
 .rename = cgroup_rename,
 };
 
+static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
+{
+if (dentry->d_name.len > NAME_MAX)
+return ERR_PTR(-ENAMETOOLONG);
+d_add(dentry, NULL);
+return NULL;
+}
+
 /*
 * Check if a file is a control file
 */
@@ -2199,10 +2229,6 @@ static inline struct cftype *__file_cft(struct file *file)
 static int cgroup_create_file(struct dentry *dentry, mode_t mode,
 struct super_block *sb)
 {
-static const struct dentry_operations cgroup_dops = {
-.d_iput = cgroup_diput,
-};
-
 struct inode *inode;
 
 if (!dentry)
@@ -2228,7 +2254,6 @@ static int cgroup_create_file(struct dentry *dentry, mode_t mode,
 inode->i_size = 0;
 inode->i_fop = &cgroup_file_operations;
 }
-dentry->d_op = &cgroup_dops;
 d_instantiate(dentry, inode);
 dget(dentry); /* Extra count - pin the dentry in core */
 return 0;
@@ -3638,9 +3663,7 @@ again:
 list_del(&cgrp->sibling);
 cgroup_unlock_hierarchy(cgrp->root);
 
-spin_lock(&cgrp->dentry->d_lock);
 d = dget(cgrp->dentry);
-spin_unlock(&d->d_lock);
 
 cgroup_d_remove_dir(d);
 dput(d);
@@ -4207,20 +4230,8 @@ void cgroup_post_fork(struct task_struct *child)
 */
 void cgroup_exit(struct task_struct *tsk, int run_callbacks)
 {
-int i;
 struct css_set *cg;
-
-if (run_callbacks && need_forkexit_callback) {
-/*
-* modular subsystems can't use callbacks, so no need to lock
-* the subsys array
-*/
-for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
-struct cgroup_subsys *ss = subsys[i];
-if (ss->exit)
-ss->exit(ss, tsk);
-}
-}
+int i;
 
 /*
 * Unlink from the css_set task list if necessary.
@@ -4238,7 +4249,24 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
 task_lock(tsk);
 cg = tsk->cgroups;
 tsk->cgroups = &init_css_set;
+
+if (run_callbacks && need_forkexit_callback) {
+/*
+* modular subsystems can't use callbacks, so no need to lock
+* the subsys array
+*/
+for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
+struct cgroup_subsys *ss = subsys[i];
+if (ss->exit) {
+struct cgroup *old_cgrp =
+rcu_dereference_raw(cg->subsys[i])->cgroup;
+struct cgroup *cgrp = task_cgroup(tsk, i);
+ss->exit(ss, cgrp, old_cgrp, tsk);
+}
+}
+}
 task_unlock(tsk);
+
 if (cg)
 put_css_set_taskexit(cg);
 }
@@ -4790,6 +4818,29 @@ css_get_next(struct cgroup_subsys *ss, int id,
 return ret;
 }
 
+/*
+* get corresponding css from file open on cgroupfs directory
+*/
+struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
+{
+struct cgroup *cgrp;
+struct inode *inode;
+struct cgroup_subsys_state *css;
+
+inode = f->f_dentry->d_inode;
+/* check in cgroup filesystem dir */
+if (inode->i_op != &cgroup_dir_inode_operations)
+return ERR_PTR(-EBADF);
+
+if (id < 0 || id >= CGROUP_SUBSYS_COUNT)
+return ERR_PTR(-EINVAL);
+
+/* get cgroup */
+cgrp = __d_cgrp(f->f_dentry);
+css = cgrp->subsys[id];
+return css ? css : ERR_PTR(-ENOENT);
+}
+
 #ifdef CONFIG_CGROUP_DEBUG
 static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
 struct cgroup *cont)
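[Editorial sketch, not part of the patch] The two locking hunks above are the standard dcache-scaling conversion: the global dcache_lock is gone, so a parent's d_subdirs list is now walked under the parent's d_lock, and each child is taken with spin_lock_nested(..., DENTRY_D_LOCK_NESTED). A minimal illustration of that lock order (example_walk_children is hypothetical):

/* Illustrative only; assumes <linux/dcache.h> and <linux/list.h>. */
static void example_walk_children(struct dentry *parent)
{
	struct dentry *child;

	spin_lock(&parent->d_lock);
	list_for_each_entry(child, &parent->d_subdirs, d_u.d_child) {
		/* child nests inside parent, hence the _nested annotation */
		spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
		/* child's name and inode pointer are stable here */
		spin_unlock(&child->d_lock);
	}
	spin_unlock(&parent->d_lock);
}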
diff --git a/kernel/compat.c b/kernel/compat.c
index c9e2ec0b34a8..38b1d2c1cbe8 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -52,6 +52,64 @@ static int compat_put_timeval(struct compat_timeval __user *o,
 put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0;
 }
 
+static int compat_get_timex(struct timex *txc, struct compat_timex __user *utp)
+{
+memset(txc, 0, sizeof(struct timex));
+
+if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) ||
+__get_user(txc->modes, &utp->modes) ||
+__get_user(txc->offset, &utp->offset) ||
+__get_user(txc->freq, &utp->freq) ||
+__get_user(txc->maxerror, &utp->maxerror) ||
+__get_user(txc->esterror, &utp->esterror) ||
+__get_user(txc->status, &utp->status) ||
+__get_user(txc->constant, &utp->constant) ||
+__get_user(txc->precision, &utp->precision) ||
+__get_user(txc->tolerance, &utp->tolerance) ||
+__get_user(txc->time.tv_sec, &utp->time.tv_sec) ||
+__get_user(txc->time.tv_usec, &utp->time.tv_usec) ||
+__get_user(txc->tick, &utp->tick) ||
+__get_user(txc->ppsfreq, &utp->ppsfreq) ||
+__get_user(txc->jitter, &utp->jitter) ||
+__get_user(txc->shift, &utp->shift) ||
+__get_user(txc->stabil, &utp->stabil) ||
+__get_user(txc->jitcnt, &utp->jitcnt) ||
+__get_user(txc->calcnt, &utp->calcnt) ||
+__get_user(txc->errcnt, &utp->errcnt) ||
+__get_user(txc->stbcnt, &utp->stbcnt))
+return -EFAULT;
+
+return 0;
+}
+
+static int compat_put_timex(struct compat_timex __user *utp, struct timex *txc)
+{
+if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) ||
+__put_user(txc->modes, &utp->modes) ||
+__put_user(txc->offset, &utp->offset) ||
+__put_user(txc->freq, &utp->freq) ||
+__put_user(txc->maxerror, &utp->maxerror) ||
+__put_user(txc->esterror, &utp->esterror) ||
+__put_user(txc->status, &utp->status) ||
+__put_user(txc->constant, &utp->constant) ||
+__put_user(txc->precision, &utp->precision) ||
+__put_user(txc->tolerance, &utp->tolerance) ||
+__put_user(txc->time.tv_sec, &utp->time.tv_sec) ||
+__put_user(txc->time.tv_usec, &utp->time.tv_usec) ||
+__put_user(txc->tick, &utp->tick) ||
+__put_user(txc->ppsfreq, &utp->ppsfreq) ||
+__put_user(txc->jitter, &utp->jitter) ||
+__put_user(txc->shift, &utp->shift) ||
+__put_user(txc->stabil, &utp->stabil) ||
+__put_user(txc->jitcnt, &utp->jitcnt) ||
+__put_user(txc->calcnt, &utp->calcnt) ||
+__put_user(txc->errcnt, &utp->errcnt) ||
+__put_user(txc->stbcnt, &utp->stbcnt) ||
+__put_user(txc->tai, &utp->tai))
+return -EFAULT;
+return 0;
+}
+
 asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv,
 struct timezone __user *tz)
 {
@@ -617,6 +675,29 @@ long compat_sys_clock_gettime(clockid_t which_clock,
 return err;
 }
 
+long compat_sys_clock_adjtime(clockid_t which_clock,
+struct compat_timex __user *utp)
+{
+struct timex txc;
+mm_segment_t oldfs;
+int err, ret;
+
+err = compat_get_timex(&txc, utp);
+if (err)
+return err;
+
+oldfs = get_fs();
+set_fs(KERNEL_DS);
+ret = sys_clock_adjtime(which_clock, (struct timex __user *) &txc);
+set_fs(oldfs);
+
+err = compat_put_timex(utp, &txc);
+if (err)
+return err;
+
+return ret;
+}
+
 long compat_sys_clock_getres(clockid_t which_clock,
 struct compat_timespec __user *tp)
 {
@@ -951,58 +1032,17 @@ asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat
 asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp)
 {
 struct timex txc;
-int ret;
-
-memset(&txc, 0, sizeof(struct timex));
+int err, ret;
 
-if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) ||
-__get_user(txc.modes, &utp->modes) ||
-__get_user(txc.offset, &utp->offset) ||
-__get_user(txc.freq, &utp->freq) ||
-__get_user(txc.maxerror, &utp->maxerror) ||
-__get_user(txc.esterror, &utp->esterror) ||
-__get_user(txc.status, &utp->status) ||
-__get_user(txc.constant, &utp->constant) ||
-__get_user(txc.precision, &utp->precision) ||
-__get_user(txc.tolerance, &utp->tolerance) ||
-__get_user(txc.time.tv_sec, &utp->time.tv_sec) ||
-__get_user(txc.time.tv_usec, &utp->time.tv_usec) ||
-__get_user(txc.tick, &utp->tick) ||
-__get_user(txc.ppsfreq, &utp->ppsfreq) ||
-__get_user(txc.jitter, &utp->jitter) ||
-__get_user(txc.shift, &utp->shift) ||
-__get_user(txc.stabil, &utp->stabil) ||
-__get_user(txc.jitcnt, &utp->jitcnt) ||
-__get_user(txc.calcnt, &utp->calcnt) ||
-__get_user(txc.errcnt, &utp->errcnt) ||
-__get_user(txc.stbcnt, &utp->stbcnt))
-return -EFAULT;
+err = compat_get_timex(&txc, utp);
+if (err)
+return err;
 
 ret = do_adjtimex(&txc);
 
-if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) ||
-__put_user(txc.modes, &utp->modes) ||
-__put_user(txc.offset, &utp->offset) ||
-__put_user(txc.freq, &utp->freq) ||
-__put_user(txc.maxerror, &utp->maxerror) ||
-__put_user(txc.esterror, &utp->esterror) ||
-__put_user(txc.status, &utp->status) ||
-__put_user(txc.constant, &utp->constant) ||
-__put_user(txc.precision, &utp->precision) ||
-__put_user(txc.tolerance, &utp->tolerance) ||
-__put_user(txc.time.tv_sec, &utp->time.tv_sec) ||
-__put_user(txc.time.tv_usec, &utp->time.tv_usec) ||
-__put_user(txc.tick, &utp->tick) ||
-__put_user(txc.ppsfreq, &utp->ppsfreq) ||
-__put_user(txc.jitter, &utp->jitter) ||
-__put_user(txc.shift, &utp->shift) ||
-__put_user(txc.stabil, &utp->stabil) ||
-__put_user(txc.jitcnt, &utp->jitcnt) ||
-__put_user(txc.calcnt, &utp->calcnt) ||
-__put_user(txc.errcnt, &utp->errcnt) ||
-__put_user(txc.stbcnt, &utp->stbcnt) ||
-__put_user(txc.tai, &utp->tai))
-ret = -EFAULT;
+err = compat_put_timex(utp, &txc);
+if (err)
+return err;
 
 return ret;
 }
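[Editorial sketch, not part of the patch] The new compat_sys_clock_adjtime() above follows the usual compat wrapper shape: convert the 32-bit user structure with compat_get_timex(), run the native syscall on a kernel copy under set_fs(KERNEL_DS) so the __user-annotated argument check accepts it, then convert the result back with compat_put_timex(). A hedged illustration of that shape (example_compat_adjtimex is hypothetical; the real compat_sys_adjtimex() in the hunk above avoids set_fs by calling do_adjtimex() directly):

/* Illustrative only; mirrors the pattern used by compat_sys_clock_adjtime(). */
static long example_compat_adjtimex(struct compat_timex __user *utp)
{
	struct timex txc;
	mm_segment_t oldfs;
	long ret;
	int err;

	err = compat_get_timex(&txc, utp);	/* 32-bit user layout -> native */
	if (err)
		return err;

	oldfs = get_fs();
	set_fs(KERNEL_DS);			/* let a kernel pointer pass the __user check */
	ret = sys_adjtimex((struct timex __user *)&txc);
	set_fs(oldfs);

	err = compat_put_timex(utp, &txc);	/* native -> 32-bit user layout */
	if (err)
		return err;

	return ret;
}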
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 4349935c2ad8..e92e98189032 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1575,8 +1575,10 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
 return -ENODEV;
 
 trialcs = alloc_trial_cpuset(cs);
-if (!trialcs)
-return -ENOMEM;
+if (!trialcs) {
+retval = -ENOMEM;
+goto out;
+}
 
 switch (cft->private) {
 case FILE_CPULIST:
@@ -1591,6 +1593,7 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
 }
 
 free_trial_cpuset(trialcs);
+out:
 cgroup_unlock();
 return retval;
 }
diff --git a/kernel/cred.c b/kernel/cred.c
index 6a1aa004e376..2343c132c5a7 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -35,7 +35,7 @@ static struct kmem_cache *cred_jar;
 static struct thread_group_cred init_tgcred = {
 .usage = ATOMIC_INIT(2),
 .tgid = 0,
-.lock = SPIN_LOCK_UNLOCKED,
+.lock = __SPIN_LOCK_UNLOCKED(init_cred.tgcred.lock),
 };
 #endif
 
@@ -252,13 +252,13 @@ struct cred *cred_alloc_blank(void)
 #endif
 
 atomic_set(&new->usage, 1);
+#ifdef CONFIG_DEBUG_CREDENTIALS
+new->magic = CRED_MAGIC;
+#endif
 
 if (security_cred_alloc_blank(new, GFP_KERNEL) < 0)
 goto error;
 
-#ifdef CONFIG_DEBUG_CREDENTIALS
-new->magic = CRED_MAGIC;
-#endif
 return new;
 
 error:
@@ -657,6 +657,8 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
 validate_creds(old);
 
 *new = *old;
+atomic_set(&new->usage, 1);
+set_cred_subscribers(new, 0);
 get_uid(new->user);
 get_group_info(new->group_info);
 
@@ -674,8 +676,6 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
 if (security_prepare_creds(new, old, GFP_KERNEL) < 0)
 goto error;
 
-atomic_set(&new->usage, 1);
-set_cred_subscribers(new, 0);
 put_cred(old);
 validate_creds(new);
 return new;
@@ -748,7 +748,11 @@ bool creds_are_invalid(const struct cred *cred)
 if (cred->magic != CRED_MAGIC)
 return true;
 #ifdef CONFIG_SECURITY_SELINUX
-if (selinux_is_enabled()) {
+/*
+* cred->security == NULL if security_cred_alloc_blank() or
+* security_prepare_creds() returned an error.
+*/
+if (selinux_is_enabled() && cred->security) {
 if ((unsigned long) cred->security < PAGE_SIZE)
 return true;
 if ((*(u32 *)cred->security & 0xffffff00) ==
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index a6e729766821..bd3e8e29caa3 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -2914,7 +2914,7 @@ static void __init kdb_cmd_init(void)
 }
 }
 
-/* Intialize kdb_printf, breakpoint tables and kdb state */
+/* Initialize kdb_printf, breakpoint tables and kdb state */
 void __init kdb_init(int lvl)
 {
 static int kdb_init_lvl = KDB_NOT_INITIALIZED;
diff --git a/kernel/exit.c b/kernel/exit.c
index 676149a4ac5f..f9a45ebcc7b1 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -69,7 +69,7 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
 
 list_del_rcu(&p->tasks);
 list_del_init(&p->sibling);
-__get_cpu_var(process_counts)--;
+__this_cpu_dec(process_counts);
 }
 list_del_rcu(&p->thread_group);
 }
@@ -994,6 +994,15 @@ NORET_TYPE void do_exit(long code)
 exit_fs(tsk);
 check_stack_usage();
 exit_thread();
+
+/*
+* Flush inherited counters to the parent - before the parent
+* gets woken up by child-exit notifications.
+*
+* because of cgroup mode, must be called before cgroup_exit()
+*/
+perf_event_exit_task(tsk);
+
 cgroup_exit(tsk, 1);
 
 if (group_dead)
@@ -1007,11 +1016,6 @@ NORET_TYPE void do_exit(long code)
 * FIXME: do that only when needed, using sched_exit tracepoint
 */
 flush_ptrace_hw_breakpoint(tsk);
-/*
-* Flush inherited counters to the parent - before the parent
-* gets woken up by child-exit notifications.
-*/
-perf_event_exit_task(tsk);
 
 exit_notify(tsk, group_dead);
 #ifdef CONFIG_NUMA
diff --git a/kernel/fork.c b/kernel/fork.c
index 7d164e25b0f0..25e429152ddc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -66,6 +66,7 @@
 #include <linux/posix-timers.h>
 #include <linux/user-return-notifier.h>
 #include <linux/oom.h>
+#include <linux/khugepaged.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -169,15 +170,14 @@ EXPORT_SYMBOL(free_task);
 static inline void free_signal_struct(struct signal_struct *sig)
 {
 taskstats_tgid_free(sig);
+sched_autogroup_exit(sig);
 kmem_cache_free(signal_cachep, sig);
 }
 
 static inline void put_signal_struct(struct signal_struct *sig)
 {
-if (atomic_dec_and_test(&sig->sigcnt)) {
-sched_autogroup_exit(sig);
+if (atomic_dec_and_test(&sig->sigcnt))
 free_signal_struct(sig);
-}
 }
 
 void __put_task_struct(struct task_struct *tsk)
@@ -331,6 +331,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 retval = ksm_fork(mm, oldmm);
 if (retval)
 goto out;
+retval = khugepaged_fork(mm, oldmm);
+if (retval)
+goto out;
 
 prev = NULL;
 for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
@@ -530,6 +533,9 @@ void __mmdrop(struct mm_struct *mm)
 mm_free_pgd(mm);
 destroy_context(mm);
 mmu_notifier_mm_destroy(mm);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+VM_BUG_ON(mm->pmd_huge_pte);
+#endif
 free_mm(mm);
 }
 EXPORT_SYMBOL_GPL(__mmdrop);
@@ -544,6 +550,7 @@ void mmput(struct mm_struct *mm)
 if (atomic_dec_and_test(&mm->mm_users)) {
 exit_aio(mm);
 ksm_exit(mm);
+khugepaged_exit(mm); /* must run before exit_mmap */
 exit_mmap(mm);
 set_mm_exe_file(mm, NULL);
 if (!list_empty(&mm->mmlist)) {
@@ -670,6 +677,10 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
 mm->token_priority = 0;
 mm->last_interval = 0;
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+mm->pmd_huge_pte = NULL;
+#endif
+
 if (!mm_init(mm, tsk))
 goto fail_nomem;
 
@@ -911,6 +922,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 
 sig->oom_adj = current->signal->oom_adj;
 sig->oom_score_adj = current->signal->oom_score_adj;
+sig->oom_score_adj_min = current->signal->oom_score_adj_min;
 
 mutex_init(&sig->cred_guard_mutex);
 
@@ -1286,7 +1298,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 attach_pid(p, PIDTYPE_SID, task_session(current));
 list_add_tail(&p->sibling, &p->real_parent->children);
 list_add_tail_rcu(&p->tasks, &init_task.tasks);
-__get_cpu_var(process_counts)++;
+__this_cpu_inc(process_counts);
 }
 attach_pid(p, PIDTYPE_PID, pid);
 nr_threads++;
@@ -1318,7 +1330,7 @@ bad_fork_cleanup_mm:
 }
 bad_fork_cleanup_signal:
 if (!(clone_flags & CLONE_THREAD))
-put_signal_struct(p->signal);
+free_signal_struct(p->signal);
 bad_fork_cleanup_sighand:
 __cleanup_sighand(p->sighand);
 bad_fork_cleanup_fs:
@@ -1411,23 +1423,6 @@ long do_fork(unsigned long clone_flags,
 }
 
 /*
-* We hope to recycle these flags after 2.6.26
-*/
-if (unlikely(clone_flags & CLONE_STOPPED)) {
-static int __read_mostly count = 100;
-
-if (count > 0 && printk_ratelimit()) {
-char comm[TASK_COMM_LEN];
-
-count--;
-printk(KERN_INFO "fork(): process `%s' used deprecated "
-"clone flags 0x%lx\n",
-get_task_comm(comm, current),
-clone_flags & CLONE_STOPPED);
-}
-}
-
-/*
 * When called from kernel_thread, don't do user tracing stuff.
 */
 if (likely(user_mode(regs)))
@@ -1465,16 +1460,7 @@ long do_fork(unsigned long clone_flags,
 */
 p->flags &= ~PF_STARTING;
 
-if (unlikely(clone_flags & CLONE_STOPPED)) {
-/*
-* We'll start up with an immediate SIGSTOP.
-*/
-sigaddset(&p->pending.signal, SIGSTOP);
-set_tsk_thread_flag(p, TIF_SIGPENDING);
-__set_task_state(p, TASK_STOPPED);
-} else {
-wake_up_new_task(p, clone_flags);
-}
+wake_up_new_task(p, clone_flags);
 
 tracehook_report_clone_complete(trace, regs,
 clone_flags, nr, p);
diff --git a/kernel/freezer.c b/kernel/freezer.c
index bd1d42b17cb2..66ecd2ead215 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -104,8 +104,13 @@ bool freeze_task(struct task_struct *p, bool sig_only)
 }
 
 if (should_send_signal(p)) {
-if (!signal_pending(p))
-fake_signal_wake_up(p);
+fake_signal_wake_up(p);
+/*
+* fake_signal_wake_up() goes through p's scheduler
+* lock and guarantees that TASK_STOPPED/TRACED ->
+* TASK_RUNNING transition can't race with task state
+* testing in try_to_freeze_tasks().
+*/
 } else if (sig_only) {
 return false;
 } else {
diff --git a/kernel/futex.c b/kernel/futex.c index 3019b92e6917..bda415715382 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
@@ -233,7 +233,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key) | |||
233 | { | 233 | { |
234 | unsigned long address = (unsigned long)uaddr; | 234 | unsigned long address = (unsigned long)uaddr; |
235 | struct mm_struct *mm = current->mm; | 235 | struct mm_struct *mm = current->mm; |
236 | struct page *page; | 236 | struct page *page, *page_head; |
237 | int err; | 237 | int err; |
238 | 238 | ||
239 | /* | 239 | /* |
@@ -265,11 +265,46 @@ again: | |||
265 | if (err < 0) | 265 | if (err < 0) |
266 | return err; | 266 | return err; |
267 | 267 | ||
268 | page = compound_head(page); | 268 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
269 | lock_page(page); | 269 | page_head = page; |
270 | if (!page->mapping) { | 270 | if (unlikely(PageTail(page))) { |
271 | unlock_page(page); | ||
272 | put_page(page); | 271 | put_page(page); |
272 | /* serialize against __split_huge_page_splitting() */ | ||
273 | local_irq_disable(); | ||
274 | if (likely(__get_user_pages_fast(address, 1, 1, &page) == 1)) { | ||
275 | page_head = compound_head(page); | ||
276 | /* | ||
277 | * page_head is valid pointer but we must pin | ||
278 | * it before taking the PG_lock and/or | ||
279 | * PG_compound_lock. The moment we re-enable | ||
280 | * irqs __split_huge_page_splitting() can | ||
281 | * return and the head page can be freed from | ||
282 | * under us. We can't take the PG_lock and/or | ||
283 | * PG_compound_lock on a page that could be | ||
284 | * freed from under us. | ||
285 | */ | ||
286 | if (page != page_head) { | ||
287 | get_page(page_head); | ||
288 | put_page(page); | ||
289 | } | ||
290 | local_irq_enable(); | ||
291 | } else { | ||
292 | local_irq_enable(); | ||
293 | goto again; | ||
294 | } | ||
295 | } | ||
296 | #else | ||
297 | page_head = compound_head(page); | ||
298 | if (page != page_head) { | ||
299 | get_page(page_head); | ||
300 | put_page(page); | ||
301 | } | ||
302 | #endif | ||
303 | |||
304 | lock_page(page_head); | ||
305 | if (!page_head->mapping) { | ||
306 | unlock_page(page_head); | ||
307 | put_page(page_head); | ||
273 | goto again; | 308 | goto again; |
274 | } | 309 | } |
275 | 310 | ||
@@ -280,20 +315,20 @@ again: | |||
280 | * it's a read-only handle, it's expected that futexes attach to | 315 | * it's a read-only handle, it's expected that futexes attach to |
281 | * the object not the particular process. | 316 | * the object not the particular process. |
282 | */ | 317 | */ |
283 | if (PageAnon(page)) { | 318 | if (PageAnon(page_head)) { |
284 | key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ | 319 | key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ |
285 | key->private.mm = mm; | 320 | key->private.mm = mm; |
286 | key->private.address = address; | 321 | key->private.address = address; |
287 | } else { | 322 | } else { |
288 | key->both.offset |= FUT_OFF_INODE; /* inode-based key */ | 323 | key->both.offset |= FUT_OFF_INODE; /* inode-based key */ |
289 | key->shared.inode = page->mapping->host; | 324 | key->shared.inode = page_head->mapping->host; |
290 | key->shared.pgoff = page->index; | 325 | key->shared.pgoff = page_head->index; |
291 | } | 326 | } |
292 | 327 | ||
293 | get_futex_key_refs(key); | 328 | get_futex_key_refs(key); |
294 | 329 | ||
295 | unlock_page(page); | 330 | unlock_page(page_head); |
296 | put_page(page); | 331 | put_page(page_head); |
297 | return 0; | 332 | return 0; |
298 | } | 333 | } |
299 | 334 | ||
@@ -346,15 +381,16 @@ static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, | |||
346 | return NULL; | 381 | return NULL; |
347 | } | 382 | } |
348 | 383 | ||
349 | static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval) | 384 | static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr, |
385 | u32 uval, u32 newval) | ||
350 | { | 386 | { |
351 | u32 curval; | 387 | int ret; |
352 | 388 | ||
353 | pagefault_disable(); | 389 | pagefault_disable(); |
354 | curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); | 390 | ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval); |
355 | pagefault_enable(); | 391 | pagefault_enable(); |
356 | 392 | ||
357 | return curval; | 393 | return ret; |
358 | } | 394 | } |
359 | 395 | ||
360 | static int get_futex_value_locked(u32 *dest, u32 __user *from) | 396 | static int get_futex_value_locked(u32 *dest, u32 __user *from) |
@@ -639,7 +675,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, | |||
639 | struct task_struct *task, int set_waiters) | 675 | struct task_struct *task, int set_waiters) |
640 | { | 676 | { |
641 | int lock_taken, ret, ownerdied = 0; | 677 | int lock_taken, ret, ownerdied = 0; |
642 | u32 uval, newval, curval; | 678 | u32 uval, newval, curval, vpid = task_pid_vnr(task); |
643 | 679 | ||
644 | retry: | 680 | retry: |
645 | ret = lock_taken = 0; | 681 | ret = lock_taken = 0; |
@@ -649,19 +685,17 @@ retry: | |||
649 | * (by doing a 0 -> TID atomic cmpxchg), while holding all | 685 | * (by doing a 0 -> TID atomic cmpxchg), while holding all |
650 | * the locks. It will most likely not succeed. | 686 | * the locks. It will most likely not succeed. |
651 | */ | 687 | */ |
652 | newval = task_pid_vnr(task); | 688 | newval = vpid; |
653 | if (set_waiters) | 689 | if (set_waiters) |
654 | newval |= FUTEX_WAITERS; | 690 | newval |= FUTEX_WAITERS; |
655 | 691 | ||
656 | curval = cmpxchg_futex_value_locked(uaddr, 0, newval); | 692 | if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, 0, newval))) |
657 | |||
658 | if (unlikely(curval == -EFAULT)) | ||
659 | return -EFAULT; | 693 | return -EFAULT; |
660 | 694 | ||
661 | /* | 695 | /* |
662 | * Detect deadlocks. | 696 | * Detect deadlocks. |
663 | */ | 697 | */ |
664 | if ((unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(task)))) | 698 | if ((unlikely((curval & FUTEX_TID_MASK) == vpid))) |
665 | return -EDEADLK; | 699 | return -EDEADLK; |
666 | 700 | ||
667 | /* | 701 | /* |
@@ -688,14 +722,12 @@ retry: | |||
688 | */ | 722 | */ |
689 | if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { | 723 | if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { |
690 | /* Keep the OWNER_DIED bit */ | 724 | /* Keep the OWNER_DIED bit */ |
691 | newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(task); | 725 | newval = (curval & ~FUTEX_TID_MASK) | vpid; |
692 | ownerdied = 0; | 726 | ownerdied = 0; |
693 | lock_taken = 1; | 727 | lock_taken = 1; |
694 | } | 728 | } |
695 | 729 | ||
696 | curval = cmpxchg_futex_value_locked(uaddr, uval, newval); | 730 | if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))) |
697 | |||
698 | if (unlikely(curval == -EFAULT)) | ||
699 | return -EFAULT; | 731 | return -EFAULT; |
700 | if (unlikely(curval != uval)) | 732 | if (unlikely(curval != uval)) |
701 | goto retry; | 733 | goto retry; |
@@ -740,6 +772,24 @@ retry: | |||
740 | return ret; | 772 | return ret; |
741 | } | 773 | } |
742 | 774 | ||
775 | /** | ||
776 | * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket | ||
777 | * @q: The futex_q to unqueue | ||
778 | * | ||
779 | * The q->lock_ptr must not be NULL and must be held by the caller. | ||
780 | */ | ||
781 | static void __unqueue_futex(struct futex_q *q) | ||
782 | { | ||
783 | struct futex_hash_bucket *hb; | ||
784 | |||
785 | if (WARN_ON(!q->lock_ptr || !spin_is_locked(q->lock_ptr) | ||
786 | || plist_node_empty(&q->list))) | ||
787 | return; | ||
788 | |||
789 | hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock); | ||
790 | plist_del(&q->list, &hb->chain); | ||
791 | } | ||
792 | |||
743 | /* | 793 | /* |
744 | * The hash bucket lock must be held when this is called. | 794 | * The hash bucket lock must be held when this is called. |
745 | * Afterwards, the futex_q must not be accessed. | 795 | * Afterwards, the futex_q must not be accessed. |
@@ -757,7 +807,7 @@ static void wake_futex(struct futex_q *q) | |||
757 | */ | 807 | */ |
758 | get_task_struct(p); | 808 | get_task_struct(p); |
759 | 809 | ||
760 | plist_del(&q->list, &q->list.plist); | 810 | __unqueue_futex(q); |
761 | /* | 811 | /* |
762 | * The waiting task can free the futex_q as soon as | 812 | * The waiting task can free the futex_q as soon as |
763 | * q->lock_ptr = NULL is written, without taking any locks. A | 813 | * q->lock_ptr = NULL is written, without taking any locks. A |
@@ -791,10 +841,9 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) | |||
791 | new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); | 841 | new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); |
792 | 842 | ||
793 | /* | 843 | /* |
794 | * This happens when we have stolen the lock and the original | 844 | * It is possible that the next waiter (the one that brought |
795 | * pending owner did not enqueue itself back on the rt_mutex. | 845 | * this owner to the kernel) timed out and is no longer |
796 | * Thats not a tragedy. We know that way, that a lock waiter | 846 | * waiting on the lock. |
797 | * is on the fly. We make the futex_q waiter the pending owner. | ||
798 | */ | 847 | */ |
799 | if (!new_owner) | 848 | if (!new_owner) |
800 | new_owner = this->task; | 849 | new_owner = this->task; |
@@ -809,9 +858,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) | |||
809 | 858 | ||
810 | newval = FUTEX_WAITERS | task_pid_vnr(new_owner); | 859 | newval = FUTEX_WAITERS | task_pid_vnr(new_owner); |
811 | 860 | ||
812 | curval = cmpxchg_futex_value_locked(uaddr, uval, newval); | 861 | if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) |
813 | |||
814 | if (curval == -EFAULT) | ||
815 | ret = -EFAULT; | 862 | ret = -EFAULT; |
816 | else if (curval != uval) | 863 | else if (curval != uval) |
817 | ret = -EINVAL; | 864 | ret = -EINVAL; |
@@ -846,10 +893,8 @@ static int unlock_futex_pi(u32 __user *uaddr, u32 uval) | |||
846 | * There is no waiter, so we unlock the futex. The owner died | 893 | * There is no waiter, so we unlock the futex. The owner died |
847 | * bit has not to be preserved here. We are the owner: | 894 | * bit has not to be preserved here. We are the owner: |
848 | */ | 895 | */ |
849 | oldval = cmpxchg_futex_value_locked(uaddr, uval, 0); | 896 | if (cmpxchg_futex_value_locked(&oldval, uaddr, uval, 0)) |
850 | 897 | return -EFAULT; | |
851 | if (oldval == -EFAULT) | ||
852 | return oldval; | ||
853 | if (oldval != uval) | 898 | if (oldval != uval) |
854 | return -EAGAIN; | 899 | return -EAGAIN; |
855 | 900 | ||
@@ -1037,9 +1082,6 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, | |||
1037 | plist_del(&q->list, &hb1->chain); | 1082 | plist_del(&q->list, &hb1->chain); |
1038 | plist_add(&q->list, &hb2->chain); | 1083 | plist_add(&q->list, &hb2->chain); |
1039 | q->lock_ptr = &hb2->lock; | 1084 | q->lock_ptr = &hb2->lock; |
1040 | #ifdef CONFIG_DEBUG_PI_LIST | ||
1041 | q->list.plist.spinlock = &hb2->lock; | ||
1042 | #endif | ||
1043 | } | 1085 | } |
1044 | get_futex_key_refs(key2); | 1086 | get_futex_key_refs(key2); |
1045 | q->key = *key2; | 1087 | q->key = *key2; |
@@ -1066,16 +1108,12 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, | |||
1066 | get_futex_key_refs(key); | 1108 | get_futex_key_refs(key); |
1067 | q->key = *key; | 1109 | q->key = *key; |
1068 | 1110 | ||
1069 | WARN_ON(plist_node_empty(&q->list)); | 1111 | __unqueue_futex(q); |
1070 | plist_del(&q->list, &q->list.plist); | ||
1071 | 1112 | ||
1072 | WARN_ON(!q->rt_waiter); | 1113 | WARN_ON(!q->rt_waiter); |
1073 | q->rt_waiter = NULL; | 1114 | q->rt_waiter = NULL; |
1074 | 1115 | ||
1075 | q->lock_ptr = &hb->lock; | 1116 | q->lock_ptr = &hb->lock; |
1076 | #ifdef CONFIG_DEBUG_PI_LIST | ||
1077 | q->list.plist.spinlock = &hb->lock; | ||
1078 | #endif | ||
1079 | 1117 | ||
1080 | wake_up_state(q->task, TASK_NORMAL); | 1118 | wake_up_state(q->task, TASK_NORMAL); |
1081 | } | 1119 | } |
@@ -1423,9 +1461,6 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) | |||
1423 | prio = min(current->normal_prio, MAX_RT_PRIO); | 1461 | prio = min(current->normal_prio, MAX_RT_PRIO); |
1424 | 1462 | ||
1425 | plist_node_init(&q->list, prio); | 1463 | plist_node_init(&q->list, prio); |
1426 | #ifdef CONFIG_DEBUG_PI_LIST | ||
1427 | q->list.plist.spinlock = &hb->lock; | ||
1428 | #endif | ||
1429 | plist_add(&q->list, &hb->chain); | 1464 | plist_add(&q->list, &hb->chain); |
1430 | q->task = current; | 1465 | q->task = current; |
1431 | spin_unlock(&hb->lock); | 1466 | spin_unlock(&hb->lock); |
@@ -1470,8 +1505,7 @@ retry: | |||
1470 | spin_unlock(lock_ptr); | 1505 | spin_unlock(lock_ptr); |
1471 | goto retry; | 1506 | goto retry; |
1472 | } | 1507 | } |
1473 | WARN_ON(plist_node_empty(&q->list)); | 1508 | __unqueue_futex(q); |
1474 | plist_del(&q->list, &q->list.plist); | ||
1475 | 1509 | ||
1476 | BUG_ON(q->pi_state); | 1510 | BUG_ON(q->pi_state); |
1477 | 1511 | ||
@@ -1491,8 +1525,7 @@ retry: | |||
1491 | static void unqueue_me_pi(struct futex_q *q) | 1525 | static void unqueue_me_pi(struct futex_q *q) |
1492 | __releases(q->lock_ptr) | 1526 | __releases(q->lock_ptr) |
1493 | { | 1527 | { |
1494 | WARN_ON(plist_node_empty(&q->list)); | 1528 | __unqueue_futex(q); |
1495 | plist_del(&q->list, &q->list.plist); | ||
1496 | 1529 | ||
1497 | BUG_ON(!q->pi_state); | 1530 | BUG_ON(!q->pi_state); |
1498 | free_pi_state(q->pi_state); | 1531 | free_pi_state(q->pi_state); |
@@ -1522,10 +1555,10 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, | |||
1522 | 1555 | ||
1523 | /* | 1556 | /* |
1524 | * We are here either because we stole the rtmutex from the | 1557 | * We are here either because we stole the rtmutex from the |
1525 | * pending owner or we are the pending owner which failed to | 1558 | * previous highest priority waiter or we are the highest priority |
1526 | * get the rtmutex. We have to replace the pending owner TID | 1559 | * waiter but failed to get the rtmutex the first time. |
1527 | * in the user space variable. This must be atomic as we have | 1560 | * We have to replace the newowner TID in the user space variable. |
1528 | * to preserve the owner died bit here. | 1561 | * This must be atomic as we have to preserve the owner died bit here. |
1529 | * | 1562 | * |
1530 | * Note: We write the user space value _before_ changing the pi_state | 1563 | * Note: We write the user space value _before_ changing the pi_state |
1531 | * because we can fault here. Imagine swapped out pages or a fork | 1564 | * because we can fault here. Imagine swapped out pages or a fork |
@@ -1544,9 +1577,7 @@ retry: | |||
1544 | while (1) { | 1577 | while (1) { |
1545 | newval = (uval & FUTEX_OWNER_DIED) | newtid; | 1578 | newval = (uval & FUTEX_OWNER_DIED) | newtid; |
1546 | 1579 | ||
1547 | curval = cmpxchg_futex_value_locked(uaddr, uval, newval); | 1580 | if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) |
1548 | |||
1549 | if (curval == -EFAULT) | ||
1550 | goto handle_fault; | 1581 | goto handle_fault; |
1551 | if (curval == uval) | 1582 | if (curval == uval) |
1552 | break; | 1583 | break; |
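Note: the loop above swaps the new owner's TID into the user-space futex word while preserving the FUTEX_OWNER_DIED bit, retrying when it races with another update. A stand-alone user-space analogue of that compare-exchange pattern (illustration only, not kernel code; the kernel version additionally handles faults via the handle_fault path):

#include <linux/futex.h>	/* FUTEX_OWNER_DIED */
#include <stdint.h>

/* Swap the new owner TID into *uaddr, keeping the owner-died bit intact. */
static void set_new_owner(uint32_t *uaddr, uint32_t newtid)
{
	uint32_t uval = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
	uint32_t newval;

	do {
		newval = (uval & FUTEX_OWNER_DIED) | newtid;
		/* On failure the builtin reloads the current value into uval. */
	} while (!__atomic_compare_exchange_n(uaddr, &uval, newval, 0,
					      __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST));
}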
@@ -1574,8 +1605,8 @@ retry: | |||
1574 | 1605 | ||
1575 | /* | 1606 | /* |
1576 | * To handle the page fault we need to drop the hash bucket | 1607 | * To handle the page fault we need to drop the hash bucket |
1577 | * lock here. That gives the other task (either the pending | 1608 | * lock here. That gives the other task (either the highest priority |
1578 | * owner itself or the task which stole the rtmutex) the | 1609 | * waiter itself or the task which stole the rtmutex) the |
1579 | * chance to try the fixup of the pi_state. So once we are | 1610 | * chance to try the fixup of the pi_state. So once we are |
1580 | * back from handling the fault we need to check the pi_state | 1611 | * back from handling the fault we need to check the pi_state |
1581 | * after reacquiring the hash bucket lock and before trying to | 1612 | * after reacquiring the hash bucket lock and before trying to |
@@ -1651,18 +1682,20 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) | |||
1651 | /* | 1682 | /* |
1652 | * pi_state is incorrect, some other task did a lock steal and | 1683 | * pi_state is incorrect, some other task did a lock steal and |
1653 | * we returned due to timeout or signal without taking the | 1684 | * we returned due to timeout or signal without taking the |
1654 | * rt_mutex. Too late. We can access the rt_mutex_owner without | 1685 | * rt_mutex. Too late. |
1655 | * locking, as the other task is now blocked on the hash bucket | ||
1656 | * lock. Fix the state up. | ||
1657 | */ | 1686 | */ |
1687 | raw_spin_lock(&q->pi_state->pi_mutex.wait_lock); | ||
1658 | owner = rt_mutex_owner(&q->pi_state->pi_mutex); | 1688 | owner = rt_mutex_owner(&q->pi_state->pi_mutex); |
1689 | if (!owner) | ||
1690 | owner = rt_mutex_next_owner(&q->pi_state->pi_mutex); | ||
1691 | raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock); | ||
1659 | ret = fixup_pi_state_owner(uaddr, q, owner); | 1692 | ret = fixup_pi_state_owner(uaddr, q, owner); |
1660 | goto out; | 1693 | goto out; |
1661 | } | 1694 | } |
1662 | 1695 | ||
1663 | /* | 1696 | /* |
1664 | * Paranoia check. If we did not take the lock, then we should not be | 1697 | * Paranoia check. If we did not take the lock, then we should not be |
1665 | * the owner, nor the pending owner, of the rt_mutex. | 1698 | * the owner of the rt_mutex. |
1666 | */ | 1699 | */ |
1667 | if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) | 1700 | if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) |
1668 | printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p " | 1701 | printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p " |
@@ -1747,13 +1780,14 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, | |||
1747 | * | 1780 | * |
1748 | * The basic logical guarantee of a futex is that it blocks ONLY | 1781 | * The basic logical guarantee of a futex is that it blocks ONLY |
1749 | * if cond(var) is known to be true at the time of blocking, for | 1782 | * if cond(var) is known to be true at the time of blocking, for |
1750 | * any cond. If we queued after testing *uaddr, that would open | 1783 | * any cond. If we locked the hash-bucket after testing *uaddr, that |
1751 | * a race condition where we could block indefinitely with | 1784 | * would open a race condition where we could block indefinitely with |
1752 | * cond(var) false, which would violate the guarantee. | 1785 | * cond(var) false, which would violate the guarantee. |
1753 | * | 1786 | * |
1754 | * A consequence is that futex_wait() can return zero and absorb | 1787 | * On the other hand, we insert q and release the hash-bucket only |
1755 | * a wakeup when *uaddr != val on entry to the syscall. This is | 1788 | * after testing *uaddr. This guarantees that futex_wait() will NOT |
1756 | * rare, but normal. | 1789 | * absorb a wakeup if *uaddr does not match the desired values |
1790 | * while the syscall executes. | ||
1757 | */ | 1791 | */ |
1758 | retry: | 1792 | retry: |
1759 | ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key); | 1793 | ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key); |
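Note: the guarantee described in the reworked comment is what makes the classic user-space wait loop safe: the kernel re-reads the futex word under the hash-bucket lock before queueing, so a concurrent wake is never missed. A minimal user-space sketch of the resulting usage pattern (illustrative only; EINTR/errno handling omitted):

#include <linux/futex.h>
#include <stdatomic.h>
#include <sys/syscall.h>
#include <unistd.h>

static atomic_int futex_word;

/* Sleep while futex_word == val; FUTEX_WAIT returns immediately with
 * EWOULDBLOCK if the word no longer holds val by the time it is checked. */
static void wait_while_equal(int val)
{
	while (atomic_load(&futex_word) == val)
		syscall(SYS_futex, &futex_word, FUTEX_WAIT, val, NULL, NULL, 0);
}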
@@ -2012,9 +2046,9 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) | |||
2012 | { | 2046 | { |
2013 | struct futex_hash_bucket *hb; | 2047 | struct futex_hash_bucket *hb; |
2014 | struct futex_q *this, *next; | 2048 | struct futex_q *this, *next; |
2015 | u32 uval; | ||
2016 | struct plist_head *head; | 2049 | struct plist_head *head; |
2017 | union futex_key key = FUTEX_KEY_INIT; | 2050 | union futex_key key = FUTEX_KEY_INIT; |
2051 | u32 uval, vpid = task_pid_vnr(current); | ||
2018 | int ret; | 2052 | int ret; |
2019 | 2053 | ||
2020 | retry: | 2054 | retry: |
@@ -2023,7 +2057,7 @@ retry: | |||
2023 | /* | 2057 | /* |
2024 | * We release only a lock we actually own: | 2058 | * We release only a lock we actually own: |
2025 | */ | 2059 | */ |
2026 | if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) | 2060 | if ((uval & FUTEX_TID_MASK) != vpid) |
2027 | return -EPERM; | 2061 | return -EPERM; |
2028 | 2062 | ||
2029 | ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key); | 2063 | ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key); |
@@ -2038,17 +2072,14 @@ retry: | |||
2038 | * again. If it succeeds then we can return without waking | 2072 | * again. If it succeeds then we can return without waking |
2039 | * anyone else up: | 2073 | * anyone else up: |
2040 | */ | 2074 | */ |
2041 | if (!(uval & FUTEX_OWNER_DIED)) | 2075 | if (!(uval & FUTEX_OWNER_DIED) && |
2042 | uval = cmpxchg_futex_value_locked(uaddr, task_pid_vnr(current), 0); | 2076 | cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0)) |
2043 | |||
2044 | |||
2045 | if (unlikely(uval == -EFAULT)) | ||
2046 | goto pi_faulted; | 2077 | goto pi_faulted; |
2047 | /* | 2078 | /* |
2048 | * Rare case: we managed to release the lock atomically, | 2079 | * Rare case: we managed to release the lock atomically, |
2049 | * no need to wake anyone else up: | 2080 | * no need to wake anyone else up: |
2050 | */ | 2081 | */ |
2051 | if (unlikely(uval == task_pid_vnr(current))) | 2082 | if (unlikely(uval == vpid)) |
2052 | goto out_unlock; | 2083 | goto out_unlock; |
2053 | 2084 | ||
2054 | /* | 2085 | /* |
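Note: the TID check above is the kernel half of the PI-futex protocol: user space keeps the owner's TID in the futex word, takes and releases the lock with a plain compare-exchange when uncontended, and only enters the kernel once waiter bits are set. A rough user-space sketch of the unlock side (error handling omitted; FUTEX_LOCK_PI is the matching slow path for acquisition):

#include <linux/futex.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>

static uint32_t pi_lock;	/* 0 = unlocked, else owner TID plus flag bits */

static void pi_unlock(void)
{
	uint32_t tid = syscall(SYS_gettid);

	/* Fast path: no waiters, the word still holds exactly our TID. */
	if (__sync_bool_compare_and_swap(&pi_lock, tid, 0))
		return;

	/* Slow path: let the kernel hand the lock over; it refuses with
	 * -EPERM unless the TID bits match the caller, as seen above. */
	syscall(SYS_futex, &pi_lock, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0);
}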
@@ -2133,7 +2164,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, | |||
2133 | * We were woken prior to requeue by a timeout or a signal. | 2164 | * We were woken prior to requeue by a timeout or a signal. |
2134 | * Unqueue the futex_q and determine which it was. | 2165 | * Unqueue the futex_q and determine which it was. |
2135 | */ | 2166 | */ |
2136 | plist_del(&q->list, &q->list.plist); | 2167 | plist_del(&q->list, &hb->chain); |
2137 | 2168 | ||
2138 | /* Handle spurious wakeups gracefully */ | 2169 | /* Handle spurious wakeups gracefully */ |
2139 | ret = -EWOULDBLOCK; | 2170 | ret = -EWOULDBLOCK; |
@@ -2429,11 +2460,20 @@ retry: | |||
2429 | * userspace. | 2460 | * userspace. |
2430 | */ | 2461 | */ |
2431 | mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; | 2462 | mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; |
2432 | nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval); | 2463 | /* |
2433 | 2464 | * We are not holding a lock here, but we want to have | |
2434 | if (nval == -EFAULT) | 2465 | * the pagefault_disable/enable() protection because |
2435 | return -1; | 2466 | * we want to handle the fault gracefully. If the |
2436 | 2467 | * access fails we try to fault in the futex with R/W | |
2468 | * verification via get_user_pages. get_user() above | ||
2469 | * does not guarantee R/W access. If that fails we | ||
2470 | * give up and leave the futex locked. | ||
2471 | */ | ||
2472 | if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) { | ||
2473 | if (fault_in_user_writeable(uaddr)) | ||
2474 | return -1; | ||
2475 | goto retry; | ||
2476 | } | ||
2437 | if (nval != uval) | 2477 | if (nval != uval) |
2438 | goto retry; | 2478 | goto retry; |
2439 | 2479 | ||
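Note: this hunk hardens the exit-time robust-futex walk: each futex word still held by the dead task gets FUTEX_OWNER_DIED set while FUTEX_WAITERS is preserved, and a fault now triggers a writable fault-in plus retry instead of giving up immediately. For context, a minimal sketch of how user space registers such a list (assuming an otherwise empty list; glibc normally does this per thread):

#include <linux/futex.h>	/* struct robust_list_head */
#include <sys/syscall.h>
#include <unistd.h>

static struct robust_list_head head = {
	.list		 = { &head.list },	/* empty list points to itself */
	.futex_offset	 = 0,
	.list_op_pending = NULL,
};

int main(void)
{
	/* From now on the kernel walks `head` when this thread exits and
	 * applies the owner-died marking above to every lock still listed. */
	return syscall(SYS_set_robust_list, &head, sizeof(head)) != 0;
}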
@@ -2644,8 +2684,7 @@ static int __init futex_init(void) | |||
2644 | * implementation, the non-functional ones will return | 2684 | * implementation, the non-functional ones will return |
2645 | * -ENOSYS. | 2685 | * -ENOSYS. |
2646 | */ | 2686 | */ |
2647 | curval = cmpxchg_futex_value_locked(NULL, 0, 0); | 2687 | if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT) |
2648 | if (curval == -EFAULT) | ||
2649 | futex_cmpxchg_enabled = 1; | 2688 | futex_cmpxchg_enabled = 1; |
2650 | 2689 | ||
2651 | for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { | 2690 | for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { |
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index f2429fc3438c..9017478c5d4c 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c | |||
@@ -53,11 +53,10 @@ | |||
53 | /* | 53 | /* |
54 | * The timer bases: | 54 | * The timer bases: |
55 | * | 55 | * |
56 | * Note: If we want to add new timer bases, we have to skip the two | 56 | * There are more clockids then hrtimer bases. Thus, we index |
57 | * clock ids captured by the cpu-timers. We do this by holding empty | 57 | * into the timer bases by the hrtimer_base_type enum. When trying |
58 | * entries rather than doing math adjustment of the clock ids. | 58 | * to reach a base using a clockid, hrtimer_clockid_to_base() |
59 | * This ensures that we capture erroneous accesses to these clock ids | 59 | * is used to convert from clockid to the proper hrtimer_base_type. |
60 | * rather than moving them into the range of valid clock id's. | ||
61 | */ | 60 | */ |
62 | DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = | 61 | DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = |
63 | { | 62 | { |
@@ -74,30 +73,39 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = | |||
74 | .get_time = &ktime_get, | 73 | .get_time = &ktime_get, |
75 | .resolution = KTIME_LOW_RES, | 74 | .resolution = KTIME_LOW_RES, |
76 | }, | 75 | }, |
76 | { | ||
77 | .index = CLOCK_BOOTTIME, | ||
78 | .get_time = &ktime_get_boottime, | ||
79 | .resolution = KTIME_LOW_RES, | ||
80 | }, | ||
77 | } | 81 | } |
78 | }; | 82 | }; |
79 | 83 | ||
84 | static int hrtimer_clock_to_base_table[MAX_CLOCKS]; | ||
85 | |||
86 | static inline int hrtimer_clockid_to_base(clockid_t clock_id) | ||
87 | { | ||
88 | return hrtimer_clock_to_base_table[clock_id]; | ||
89 | } | ||
90 | |||
91 | |||
80 | /* | 92 | /* |
81 | * Get the coarse grained time at the softirq based on xtime and | 93 | * Get the coarse grained time at the softirq based on xtime and |
82 | * wall_to_monotonic. | 94 | * wall_to_monotonic. |
83 | */ | 95 | */ |
84 | static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) | 96 | static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) |
85 | { | 97 | { |
86 | ktime_t xtim, tomono; | 98 | ktime_t xtim, mono, boot; |
87 | struct timespec xts, tom; | 99 | struct timespec xts, tom, slp; |
88 | unsigned long seq; | ||
89 | 100 | ||
90 | do { | 101 | get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp); |
91 | seq = read_seqbegin(&xtime_lock); | ||
92 | xts = __current_kernel_time(); | ||
93 | tom = __get_wall_to_monotonic(); | ||
94 | } while (read_seqretry(&xtime_lock, seq)); | ||
95 | 102 | ||
96 | xtim = timespec_to_ktime(xts); | 103 | xtim = timespec_to_ktime(xts); |
97 | tomono = timespec_to_ktime(tom); | 104 | mono = ktime_add(xtim, timespec_to_ktime(tom)); |
98 | base->clock_base[CLOCK_REALTIME].softirq_time = xtim; | 105 | boot = ktime_add(mono, timespec_to_ktime(slp)); |
99 | base->clock_base[CLOCK_MONOTONIC].softirq_time = | 106 | base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim; |
100 | ktime_add(xtim, tomono); | 107 | base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono; |
108 | base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot; | ||
101 | } | 109 | } |
102 | 110 | ||
103 | /* | 111 | /* |
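Note: a stand-alone model of the clockid-to-base indirection added here (illustration only; the real table spans MAX_CLOCKS and is filled in hrtimers_init(), further down in this diff):

#include <stdio.h>
#include <time.h>

#ifndef CLOCK_BOOTTIME
#define CLOCK_BOOTTIME 7	/* value from linux/time.h */
#endif

enum base_type { BASE_REALTIME, BASE_MONOTONIC, BASE_BOOTTIME, BASE_MAX };

static int clock_to_base[16];	/* assumption: large enough for the ids used */

static int clockid_to_base(clockid_t id)
{
	return clock_to_base[id];
}

int main(void)
{
	clock_to_base[CLOCK_REALTIME]  = BASE_REALTIME;
	clock_to_base[CLOCK_MONOTONIC] = BASE_MONOTONIC;
	clock_to_base[CLOCK_BOOTTIME]  = BASE_BOOTTIME;

	printf("CLOCK_BOOTTIME -> base %d\n", clockid_to_base(CLOCK_BOOTTIME));
	return 0;
}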
@@ -184,10 +192,11 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, | |||
184 | struct hrtimer_cpu_base *new_cpu_base; | 192 | struct hrtimer_cpu_base *new_cpu_base; |
185 | int this_cpu = smp_processor_id(); | 193 | int this_cpu = smp_processor_id(); |
186 | int cpu = hrtimer_get_target(this_cpu, pinned); | 194 | int cpu = hrtimer_get_target(this_cpu, pinned); |
195 | int basenum = hrtimer_clockid_to_base(base->index); | ||
187 | 196 | ||
188 | again: | 197 | again: |
189 | new_cpu_base = &per_cpu(hrtimer_bases, cpu); | 198 | new_cpu_base = &per_cpu(hrtimer_bases, cpu); |
190 | new_base = &new_cpu_base->clock_base[base->index]; | 199 | new_base = &new_cpu_base->clock_base[basenum]; |
191 | 200 | ||
192 | if (base != new_base) { | 201 | if (base != new_base) { |
193 | /* | 202 | /* |
@@ -334,6 +343,11 @@ EXPORT_SYMBOL_GPL(ktime_add_safe); | |||
334 | 343 | ||
335 | static struct debug_obj_descr hrtimer_debug_descr; | 344 | static struct debug_obj_descr hrtimer_debug_descr; |
336 | 345 | ||
346 | static void *hrtimer_debug_hint(void *addr) | ||
347 | { | ||
348 | return ((struct hrtimer *) addr)->function; | ||
349 | } | ||
350 | |||
337 | /* | 351 | /* |
338 | * fixup_init is called when: | 352 | * fixup_init is called when: |
339 | * - an active object is initialized | 353 | * - an active object is initialized |
@@ -393,6 +407,7 @@ static int hrtimer_fixup_free(void *addr, enum debug_obj_state state) | |||
393 | 407 | ||
394 | static struct debug_obj_descr hrtimer_debug_descr = { | 408 | static struct debug_obj_descr hrtimer_debug_descr = { |
395 | .name = "hrtimer", | 409 | .name = "hrtimer", |
410 | .debug_hint = hrtimer_debug_hint, | ||
396 | .fixup_init = hrtimer_fixup_init, | 411 | .fixup_init = hrtimer_fixup_init, |
397 | .fixup_activate = hrtimer_fixup_activate, | 412 | .fixup_activate = hrtimer_fixup_activate, |
398 | .fixup_free = hrtimer_fixup_free, | 413 | .fixup_free = hrtimer_fixup_free, |
@@ -497,7 +512,7 @@ static inline int hrtimer_is_hres_enabled(void) | |||
497 | */ | 512 | */ |
498 | static inline int hrtimer_hres_active(void) | 513 | static inline int hrtimer_hres_active(void) |
499 | { | 514 | { |
500 | return __get_cpu_var(hrtimer_bases).hres_active; | 515 | return __this_cpu_read(hrtimer_bases.hres_active); |
501 | } | 516 | } |
502 | 517 | ||
503 | /* | 518 | /* |
@@ -611,24 +626,23 @@ static int hrtimer_reprogram(struct hrtimer *timer, | |||
611 | static void retrigger_next_event(void *arg) | 626 | static void retrigger_next_event(void *arg) |
612 | { | 627 | { |
613 | struct hrtimer_cpu_base *base; | 628 | struct hrtimer_cpu_base *base; |
614 | struct timespec realtime_offset, wtm; | 629 | struct timespec realtime_offset, wtm, sleep; |
615 | unsigned long seq; | ||
616 | 630 | ||
617 | if (!hrtimer_hres_active()) | 631 | if (!hrtimer_hres_active()) |
618 | return; | 632 | return; |
619 | 633 | ||
620 | do { | 634 | get_xtime_and_monotonic_and_sleep_offset(&realtime_offset, &wtm, |
621 | seq = read_seqbegin(&xtime_lock); | 635 | &sleep); |
622 | wtm = __get_wall_to_monotonic(); | ||
623 | } while (read_seqretry(&xtime_lock, seq)); | ||
624 | set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec); | 636 | set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec); |
625 | 637 | ||
626 | base = &__get_cpu_var(hrtimer_bases); | 638 | base = &__get_cpu_var(hrtimer_bases); |
627 | 639 | ||
628 | /* Adjust CLOCK_REALTIME offset */ | 640 | /* Adjust CLOCK_REALTIME offset */ |
629 | raw_spin_lock(&base->lock); | 641 | raw_spin_lock(&base->lock); |
630 | base->clock_base[CLOCK_REALTIME].offset = | 642 | base->clock_base[HRTIMER_BASE_REALTIME].offset = |
631 | timespec_to_ktime(realtime_offset); | 643 | timespec_to_ktime(realtime_offset); |
644 | base->clock_base[HRTIMER_BASE_BOOTTIME].offset = | ||
645 | timespec_to_ktime(sleep); | ||
632 | 646 | ||
633 | hrtimer_force_reprogram(base, 0); | 647 | hrtimer_force_reprogram(base, 0); |
634 | raw_spin_unlock(&base->lock); | 648 | raw_spin_unlock(&base->lock); |
@@ -673,14 +687,6 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) | |||
673 | } | 687 | } |
674 | 688 | ||
675 | /* | 689 | /* |
676 | * Initialize the high resolution related parts of a hrtimer | ||
677 | */ | ||
678 | static inline void hrtimer_init_timer_hres(struct hrtimer *timer) | ||
679 | { | ||
680 | } | ||
681 | |||
682 | |||
683 | /* | ||
684 | * When High resolution timers are active, try to reprogram. Note, that in case | 690 | * When High resolution timers are active, try to reprogram. Note, that in case |
685 | * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry | 691 | * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry |
686 | * check happens. The timer gets enqueued into the rbtree. The reprogramming | 692 | * check happens. The timer gets enqueued into the rbtree. The reprogramming |
@@ -725,8 +731,9 @@ static int hrtimer_switch_to_hres(void) | |||
725 | return 0; | 731 | return 0; |
726 | } | 732 | } |
727 | base->hres_active = 1; | 733 | base->hres_active = 1; |
728 | base->clock_base[CLOCK_REALTIME].resolution = KTIME_HIGH_RES; | 734 | base->clock_base[HRTIMER_BASE_REALTIME].resolution = KTIME_HIGH_RES; |
729 | base->clock_base[CLOCK_MONOTONIC].resolution = KTIME_HIGH_RES; | 735 | base->clock_base[HRTIMER_BASE_MONOTONIC].resolution = KTIME_HIGH_RES; |
736 | base->clock_base[HRTIMER_BASE_BOOTTIME].resolution = KTIME_HIGH_RES; | ||
730 | 737 | ||
731 | tick_setup_sched_timer(); | 738 | tick_setup_sched_timer(); |
732 | 739 | ||
@@ -750,7 +757,6 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, | |||
750 | return 0; | 757 | return 0; |
751 | } | 758 | } |
752 | static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } | 759 | static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } |
753 | static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { } | ||
754 | 760 | ||
755 | #endif /* CONFIG_HIGH_RES_TIMERS */ | 761 | #endif /* CONFIG_HIGH_RES_TIMERS */ |
756 | 762 | ||
@@ -1121,6 +1127,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, | |||
1121 | enum hrtimer_mode mode) | 1127 | enum hrtimer_mode mode) |
1122 | { | 1128 | { |
1123 | struct hrtimer_cpu_base *cpu_base; | 1129 | struct hrtimer_cpu_base *cpu_base; |
1130 | int base; | ||
1124 | 1131 | ||
1125 | memset(timer, 0, sizeof(struct hrtimer)); | 1132 | memset(timer, 0, sizeof(struct hrtimer)); |
1126 | 1133 | ||
@@ -1129,8 +1136,8 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, | |||
1129 | if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS) | 1136 | if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS) |
1130 | clock_id = CLOCK_MONOTONIC; | 1137 | clock_id = CLOCK_MONOTONIC; |
1131 | 1138 | ||
1132 | timer->base = &cpu_base->clock_base[clock_id]; | 1139 | base = hrtimer_clockid_to_base(clock_id); |
1133 | hrtimer_init_timer_hres(timer); | 1140 | timer->base = &cpu_base->clock_base[base]; |
1134 | timerqueue_init(&timer->node); | 1141 | timerqueue_init(&timer->node); |
1135 | 1142 | ||
1136 | #ifdef CONFIG_TIMER_STATS | 1143 | #ifdef CONFIG_TIMER_STATS |
@@ -1165,9 +1172,10 @@ EXPORT_SYMBOL_GPL(hrtimer_init); | |||
1165 | int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) | 1172 | int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) |
1166 | { | 1173 | { |
1167 | struct hrtimer_cpu_base *cpu_base; | 1174 | struct hrtimer_cpu_base *cpu_base; |
1175 | int base = hrtimer_clockid_to_base(which_clock); | ||
1168 | 1176 | ||
1169 | cpu_base = &__raw_get_cpu_var(hrtimer_bases); | 1177 | cpu_base = &__raw_get_cpu_var(hrtimer_bases); |
1170 | *tp = ktime_to_timespec(cpu_base->clock_base[which_clock].resolution); | 1178 | *tp = ktime_to_timespec(cpu_base->clock_base[base].resolution); |
1171 | 1179 | ||
1172 | return 0; | 1180 | return 0; |
1173 | } | 1181 | } |
@@ -1714,6 +1722,10 @@ static struct notifier_block __cpuinitdata hrtimers_nb = { | |||
1714 | 1722 | ||
1715 | void __init hrtimers_init(void) | 1723 | void __init hrtimers_init(void) |
1716 | { | 1724 | { |
1725 | hrtimer_clock_to_base_table[CLOCK_REALTIME] = HRTIMER_BASE_REALTIME; | ||
1726 | hrtimer_clock_to_base_table[CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC; | ||
1727 | hrtimer_clock_to_base_table[CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME; | ||
1728 | |||
1717 | hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, | 1729 | hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, |
1718 | (void *)(long)smp_processor_id()); | 1730 | (void *)(long)smp_processor_id()); |
1719 | register_cpu_notifier(&hrtimers_nb); | 1731 | register_cpu_notifier(&hrtimers_nb); |
@@ -1745,7 +1757,7 @@ schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta, | |||
1745 | } | 1757 | } |
1746 | 1758 | ||
1747 | /* | 1759 | /* |
1748 | * A NULL parameter means "inifinte" | 1760 | * A NULL parameter means "infinite" |
1749 | */ | 1761 | */ |
1750 | if (!expires) { | 1762 | if (!expires) { |
1751 | schedule(); | 1763 | schedule(); |
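Note: the boottime base backs CLOCK_BOOTTIME, which tracks CLOCK_MONOTONIC plus the accumulated sleep offset (the slp/sleep values handled above). A quick user-space check (sketch; the clock id is guarded in case older headers lack it):

#include <stdio.h>
#include <time.h>

#ifndef CLOCK_BOOTTIME
#define CLOCK_BOOTTIME 7	/* value from linux/time.h */
#endif

int main(void)
{
	struct timespec mono, boot;

	clock_gettime(CLOCK_MONOTONIC, &mono);
	clock_gettime(CLOCK_BOOTTIME, &boot);	/* monotonic + time spent suspended */

	printf("monotonic %ld.%09ld  boottime %ld.%09ld\n",
	       (long)mono.tv_sec, mono.tv_nsec,
	       (long)boot.tv_sec, boot.tv_nsec);
	return 0;
}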
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 31d766bf5d2e..09bef82d74cb 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig | |||
@@ -1,5 +1,6 @@ | |||
1 | # Select this to activate the generic irq options below | ||
1 | config HAVE_GENERIC_HARDIRQS | 2 | config HAVE_GENERIC_HARDIRQS |
2 | def_bool n | 3 | bool |
3 | 4 | ||
4 | if HAVE_GENERIC_HARDIRQS | 5 | if HAVE_GENERIC_HARDIRQS |
5 | menu "IRQ subsystem" | 6 | menu "IRQ subsystem" |
@@ -9,31 +10,46 @@ menu "IRQ subsystem" | |||
9 | config GENERIC_HARDIRQS | 10 | config GENERIC_HARDIRQS |
10 | def_bool y | 11 | def_bool y |
11 | 12 | ||
12 | config GENERIC_HARDIRQS_NO__DO_IRQ | ||
13 | def_bool y | ||
14 | |||
15 | # Select this to disable the deprecated stuff | 13 | # Select this to disable the deprecated stuff |
16 | config GENERIC_HARDIRQS_NO_DEPRECATED | 14 | config GENERIC_HARDIRQS_NO_DEPRECATED |
17 | def_bool n | 15 | bool |
16 | |||
17 | config GENERIC_HARDIRQS_NO_COMPAT | ||
18 | bool | ||
18 | 19 | ||
19 | # Options selectable by the architecture code | 20 | # Options selectable by the architecture code |
21 | |||
22 | # Make sparse irq Kconfig switch below available | ||
20 | config HAVE_SPARSE_IRQ | 23 | config HAVE_SPARSE_IRQ |
21 | def_bool n | 24 | bool |
22 | 25 | ||
26 | # Enable the generic irq autoprobe mechanism | ||
23 | config GENERIC_IRQ_PROBE | 27 | config GENERIC_IRQ_PROBE |
24 | def_bool n | 28 | bool |
25 | 29 | ||
30 | # Use the generic /proc/interrupts implementation | ||
31 | config GENERIC_IRQ_SHOW | ||
32 | bool | ||
33 | |||
34 | # Support for delayed migration from interrupt context | ||
26 | config GENERIC_PENDING_IRQ | 35 | config GENERIC_PENDING_IRQ |
27 | def_bool n | 36 | bool |
28 | 37 | ||
38 | # Alpha specific irq affinity mechanism | ||
29 | config AUTO_IRQ_AFFINITY | 39 | config AUTO_IRQ_AFFINITY |
30 | def_bool n | 40 | bool |
31 | |||
32 | config IRQ_PER_CPU | ||
33 | def_bool n | ||
34 | 41 | ||
42 | # Tasklet based software resend for pending interrupts on enable_irq() | ||
35 | config HARDIRQS_SW_RESEND | 43 | config HARDIRQS_SW_RESEND |
36 | def_bool n | 44 | bool |
45 | |||
46 | # Preflow handler support for fasteoi (sparc64) | ||
47 | config IRQ_PREFLOW_FASTEOI | ||
48 | bool | ||
49 | |||
50 | # Support forced irq threading | ||
51 | config IRQ_FORCED_THREADING | ||
52 | bool | ||
37 | 53 | ||
38 | config SPARSE_IRQ | 54 | config SPARSE_IRQ |
39 | bool "Support sparse irq numbering" | 55 | bool "Support sparse irq numbering" |
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 505798f86c36..394784c57060 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c | |||
@@ -17,7 +17,7 @@ | |||
17 | /* | 17 | /* |
18 | * Autodetection depends on the fact that any interrupt that | 18 | * Autodetection depends on the fact that any interrupt that |
19 | * comes in on to an unassigned handler will get stuck with | 19 | * comes in on to an unassigned handler will get stuck with |
20 | * "IRQ_WAITING" cleared and the interrupt disabled. | 20 | * "IRQS_WAITING" cleared and the interrupt disabled. |
21 | */ | 21 | */ |
22 | static DEFINE_MUTEX(probing_active); | 22 | static DEFINE_MUTEX(probing_active); |
23 | 23 | ||
@@ -32,7 +32,6 @@ unsigned long probe_irq_on(void) | |||
32 | { | 32 | { |
33 | struct irq_desc *desc; | 33 | struct irq_desc *desc; |
34 | unsigned long mask = 0; | 34 | unsigned long mask = 0; |
35 | unsigned int status; | ||
36 | int i; | 35 | int i; |
37 | 36 | ||
38 | /* | 37 | /* |
@@ -46,13 +45,7 @@ unsigned long probe_irq_on(void) | |||
46 | */ | 45 | */ |
47 | for_each_irq_desc_reverse(i, desc) { | 46 | for_each_irq_desc_reverse(i, desc) { |
48 | raw_spin_lock_irq(&desc->lock); | 47 | raw_spin_lock_irq(&desc->lock); |
49 | if (!desc->action && !(desc->status & IRQ_NOPROBE)) { | 48 | if (!desc->action && irq_settings_can_probe(desc)) { |
50 | /* | ||
51 | * An old-style architecture might still have | ||
52 | * the handle_bad_irq handler there: | ||
53 | */ | ||
54 | compat_irq_chip_set_default_handler(desc); | ||
55 | |||
56 | /* | 49 | /* |
57 | * Some chips need to know about probing in | 50 | * Some chips need to know about probing in |
58 | * progress: | 51 | * progress: |
@@ -60,7 +53,7 @@ unsigned long probe_irq_on(void) | |||
60 | if (desc->irq_data.chip->irq_set_type) | 53 | if (desc->irq_data.chip->irq_set_type) |
61 | desc->irq_data.chip->irq_set_type(&desc->irq_data, | 54 | desc->irq_data.chip->irq_set_type(&desc->irq_data, |
62 | IRQ_TYPE_PROBE); | 55 | IRQ_TYPE_PROBE); |
63 | desc->irq_data.chip->irq_startup(&desc->irq_data); | 56 | irq_startup(desc); |
64 | } | 57 | } |
65 | raw_spin_unlock_irq(&desc->lock); | 58 | raw_spin_unlock_irq(&desc->lock); |
66 | } | 59 | } |
@@ -75,10 +68,12 @@ unsigned long probe_irq_on(void) | |||
75 | */ | 68 | */ |
76 | for_each_irq_desc_reverse(i, desc) { | 69 | for_each_irq_desc_reverse(i, desc) { |
77 | raw_spin_lock_irq(&desc->lock); | 70 | raw_spin_lock_irq(&desc->lock); |
78 | if (!desc->action && !(desc->status & IRQ_NOPROBE)) { | 71 | if (!desc->action && irq_settings_can_probe(desc)) { |
79 | desc->status |= IRQ_AUTODETECT | IRQ_WAITING; | 72 | desc->istate |= IRQS_AUTODETECT | IRQS_WAITING; |
80 | if (desc->irq_data.chip->irq_startup(&desc->irq_data)) | 73 | if (irq_startup(desc)) { |
81 | desc->status |= IRQ_PENDING; | 74 | irq_compat_set_pending(desc); |
75 | desc->istate |= IRQS_PENDING; | ||
76 | } | ||
82 | } | 77 | } |
83 | raw_spin_unlock_irq(&desc->lock); | 78 | raw_spin_unlock_irq(&desc->lock); |
84 | } | 79 | } |
@@ -93,13 +88,12 @@ unsigned long probe_irq_on(void) | |||
93 | */ | 88 | */ |
94 | for_each_irq_desc(i, desc) { | 89 | for_each_irq_desc(i, desc) { |
95 | raw_spin_lock_irq(&desc->lock); | 90 | raw_spin_lock_irq(&desc->lock); |
96 | status = desc->status; | ||
97 | 91 | ||
98 | if (status & IRQ_AUTODETECT) { | 92 | if (desc->istate & IRQS_AUTODETECT) { |
99 | /* It triggered already - consider it spurious. */ | 93 | /* It triggered already - consider it spurious. */ |
100 | if (!(status & IRQ_WAITING)) { | 94 | if (!(desc->istate & IRQS_WAITING)) { |
101 | desc->status = status & ~IRQ_AUTODETECT; | 95 | desc->istate &= ~IRQS_AUTODETECT; |
102 | desc->irq_data.chip->irq_shutdown(&desc->irq_data); | 96 | irq_shutdown(desc); |
103 | } else | 97 | } else |
104 | if (i < 32) | 98 | if (i < 32) |
105 | mask |= 1 << i; | 99 | mask |= 1 << i; |
@@ -125,20 +119,18 @@ EXPORT_SYMBOL(probe_irq_on); | |||
125 | */ | 119 | */ |
126 | unsigned int probe_irq_mask(unsigned long val) | 120 | unsigned int probe_irq_mask(unsigned long val) |
127 | { | 121 | { |
128 | unsigned int status, mask = 0; | 122 | unsigned int mask = 0; |
129 | struct irq_desc *desc; | 123 | struct irq_desc *desc; |
130 | int i; | 124 | int i; |
131 | 125 | ||
132 | for_each_irq_desc(i, desc) { | 126 | for_each_irq_desc(i, desc) { |
133 | raw_spin_lock_irq(&desc->lock); | 127 | raw_spin_lock_irq(&desc->lock); |
134 | status = desc->status; | 128 | if (desc->istate & IRQS_AUTODETECT) { |
135 | 129 | if (i < 16 && !(desc->istate & IRQS_WAITING)) | |
136 | if (status & IRQ_AUTODETECT) { | ||
137 | if (i < 16 && !(status & IRQ_WAITING)) | ||
138 | mask |= 1 << i; | 130 | mask |= 1 << i; |
139 | 131 | ||
140 | desc->status = status & ~IRQ_AUTODETECT; | 132 | desc->istate &= ~IRQS_AUTODETECT; |
141 | desc->irq_data.chip->irq_shutdown(&desc->irq_data); | 133 | irq_shutdown(desc); |
142 | } | 134 | } |
143 | raw_spin_unlock_irq(&desc->lock); | 135 | raw_spin_unlock_irq(&desc->lock); |
144 | } | 136 | } |
@@ -169,20 +161,18 @@ int probe_irq_off(unsigned long val) | |||
169 | { | 161 | { |
170 | int i, irq_found = 0, nr_of_irqs = 0; | 162 | int i, irq_found = 0, nr_of_irqs = 0; |
171 | struct irq_desc *desc; | 163 | struct irq_desc *desc; |
172 | unsigned int status; | ||
173 | 164 | ||
174 | for_each_irq_desc(i, desc) { | 165 | for_each_irq_desc(i, desc) { |
175 | raw_spin_lock_irq(&desc->lock); | 166 | raw_spin_lock_irq(&desc->lock); |
176 | status = desc->status; | ||
177 | 167 | ||
178 | if (status & IRQ_AUTODETECT) { | 168 | if (desc->istate & IRQS_AUTODETECT) { |
179 | if (!(status & IRQ_WAITING)) { | 169 | if (!(desc->istate & IRQS_WAITING)) { |
180 | if (!nr_of_irqs) | 170 | if (!nr_of_irqs) |
181 | irq_found = i; | 171 | irq_found = i; |
182 | nr_of_irqs++; | 172 | nr_of_irqs++; |
183 | } | 173 | } |
184 | desc->status = status & ~IRQ_AUTODETECT; | 174 | desc->istate &= ~IRQS_AUTODETECT; |
185 | desc->irq_data.chip->irq_shutdown(&desc->irq_data); | 175 | irq_shutdown(desc); |
186 | } | 176 | } |
187 | raw_spin_unlock_irq(&desc->lock); | 177 | raw_spin_unlock_irq(&desc->lock); |
188 | } | 178 | } |
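Note: the driver-facing probing API is unchanged by the move from desc->status to desc->istate; only the internal bookkeeping bits differ. A sketch of the usual calling sequence (my_device_trigger_irq() is a hypothetical stand-in for whatever makes the hardware raise exactly one interrupt):

#include <linux/interrupt.h>

static void my_device_trigger_irq(void);	/* hypothetical device hook */

static int my_probe_irq(void)
{
	unsigned long mask = probe_irq_on();	/* arm autodetection */

	my_device_trigger_irq();

	/* > 0: the single irq that fired, 0: none, < 0: more than one. */
	return probe_irq_off(mask);
}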
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index baa5c4acad83..c9c0601f0615 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c | |||
@@ -19,140 +19,110 @@ | |||
19 | #include "internals.h" | 19 | #include "internals.h" |
20 | 20 | ||
21 | /** | 21 | /** |
22 | * set_irq_chip - set the irq chip for an irq | 22 | * irq_set_chip - set the irq chip for an irq |
23 | * @irq: irq number | 23 | * @irq: irq number |
24 | * @chip: pointer to irq chip description structure | 24 | * @chip: pointer to irq chip description structure |
25 | */ | 25 | */ |
26 | int set_irq_chip(unsigned int irq, struct irq_chip *chip) | 26 | int irq_set_chip(unsigned int irq, struct irq_chip *chip) |
27 | { | 27 | { |
28 | struct irq_desc *desc = irq_to_desc(irq); | ||
29 | unsigned long flags; | 28 | unsigned long flags; |
29 | struct irq_desc *desc = irq_get_desc_lock(irq, &flags); | ||
30 | 30 | ||
31 | if (!desc) { | 31 | if (!desc) |
32 | WARN(1, KERN_ERR "Trying to install chip for IRQ%d\n", irq); | ||
33 | return -EINVAL; | 32 | return -EINVAL; |
34 | } | ||
35 | 33 | ||
36 | if (!chip) | 34 | if (!chip) |
37 | chip = &no_irq_chip; | 35 | chip = &no_irq_chip; |
38 | 36 | ||
39 | raw_spin_lock_irqsave(&desc->lock, flags); | ||
40 | irq_chip_set_defaults(chip); | 37 | irq_chip_set_defaults(chip); |
41 | desc->irq_data.chip = chip; | 38 | desc->irq_data.chip = chip; |
42 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 39 | irq_put_desc_unlock(desc, flags); |
43 | |||
44 | return 0; | 40 | return 0; |
45 | } | 41 | } |
46 | EXPORT_SYMBOL(set_irq_chip); | 42 | EXPORT_SYMBOL(irq_set_chip); |
47 | 43 | ||
48 | /** | 44 | /** |
49 | * set_irq_type - set the irq trigger type for an irq | 45 | * irq_set_type - set the irq trigger type for an irq |
50 | * @irq: irq number | 46 | * @irq: irq number |
51 | * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h | 47 | * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h |
52 | */ | 48 | */ |
53 | int set_irq_type(unsigned int irq, unsigned int type) | 49 | int irq_set_irq_type(unsigned int irq, unsigned int type) |
54 | { | 50 | { |
55 | struct irq_desc *desc = irq_to_desc(irq); | ||
56 | unsigned long flags; | 51 | unsigned long flags; |
57 | int ret = -ENXIO; | 52 | struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); |
53 | int ret = 0; | ||
58 | 54 | ||
59 | if (!desc) { | 55 | if (!desc) |
60 | printk(KERN_ERR "Trying to set irq type for IRQ%d\n", irq); | 56 | return -EINVAL; |
61 | return -ENODEV; | ||
62 | } | ||
63 | 57 | ||
64 | type &= IRQ_TYPE_SENSE_MASK; | 58 | type &= IRQ_TYPE_SENSE_MASK; |
65 | if (type == IRQ_TYPE_NONE) | 59 | if (type != IRQ_TYPE_NONE) |
66 | return 0; | 60 | ret = __irq_set_trigger(desc, irq, type); |
67 | 61 | irq_put_desc_busunlock(desc, flags); | |
68 | raw_spin_lock_irqsave(&desc->lock, flags); | ||
69 | ret = __irq_set_trigger(desc, irq, type); | ||
70 | raw_spin_unlock_irqrestore(&desc->lock, flags); | ||
71 | return ret; | 62 | return ret; |
72 | } | 63 | } |
73 | EXPORT_SYMBOL(set_irq_type); | 64 | EXPORT_SYMBOL(irq_set_irq_type); |
74 | 65 | ||
75 | /** | 66 | /** |
76 | * set_irq_data - set irq type data for an irq | 67 | * irq_set_handler_data - set irq handler data for an irq |
77 | * @irq: Interrupt number | 68 | * @irq: Interrupt number |
78 | * @data: Pointer to interrupt specific data | 69 | * @data: Pointer to interrupt specific data |
79 | * | 70 | * |
80 | * Set the hardware irq controller data for an irq | 71 | * Set the hardware irq controller data for an irq |
81 | */ | 72 | */ |
82 | int set_irq_data(unsigned int irq, void *data) | 73 | int irq_set_handler_data(unsigned int irq, void *data) |
83 | { | 74 | { |
84 | struct irq_desc *desc = irq_to_desc(irq); | ||
85 | unsigned long flags; | 75 | unsigned long flags; |
76 | struct irq_desc *desc = irq_get_desc_lock(irq, &flags); | ||
86 | 77 | ||
87 | if (!desc) { | 78 | if (!desc) |
88 | printk(KERN_ERR | ||
89 | "Trying to install controller data for IRQ%d\n", irq); | ||
90 | return -EINVAL; | 79 | return -EINVAL; |
91 | } | ||
92 | |||
93 | raw_spin_lock_irqsave(&desc->lock, flags); | ||
94 | desc->irq_data.handler_data = data; | 80 | desc->irq_data.handler_data = data; |
95 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 81 | irq_put_desc_unlock(desc, flags); |
96 | return 0; | 82 | return 0; |
97 | } | 83 | } |
98 | EXPORT_SYMBOL(set_irq_data); | 84 | EXPORT_SYMBOL(irq_set_handler_data); |
99 | 85 | ||
100 | /** | 86 | /** |
101 | * set_irq_msi - set MSI descriptor data for an irq | 87 | * irq_set_msi_desc - set MSI descriptor data for an irq |
102 | * @irq: Interrupt number | 88 | * @irq: Interrupt number |
103 | * @entry: Pointer to MSI descriptor data | 89 | * @entry: Pointer to MSI descriptor data |
104 | * | 90 | * |
105 | * Set the MSI descriptor entry for an irq | 91 | * Set the MSI descriptor entry for an irq |
106 | */ | 92 | */ |
107 | int set_irq_msi(unsigned int irq, struct msi_desc *entry) | 93 | int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry) |
108 | { | 94 | { |
109 | struct irq_desc *desc = irq_to_desc(irq); | ||
110 | unsigned long flags; | 95 | unsigned long flags; |
96 | struct irq_desc *desc = irq_get_desc_lock(irq, &flags); | ||
111 | 97 | ||
112 | if (!desc) { | 98 | if (!desc) |
113 | printk(KERN_ERR | ||
114 | "Trying to install msi data for IRQ%d\n", irq); | ||
115 | return -EINVAL; | 99 | return -EINVAL; |
116 | } | ||
117 | |||
118 | raw_spin_lock_irqsave(&desc->lock, flags); | ||
119 | desc->irq_data.msi_desc = entry; | 100 | desc->irq_data.msi_desc = entry; |
120 | if (entry) | 101 | if (entry) |
121 | entry->irq = irq; | 102 | entry->irq = irq; |
122 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 103 | irq_put_desc_unlock(desc, flags); |
123 | return 0; | 104 | return 0; |
124 | } | 105 | } |
125 | 106 | ||
126 | /** | 107 | /** |
127 | * set_irq_chip_data - set irq chip data for an irq | 108 | * irq_set_chip_data - set irq chip data for an irq |
128 | * @irq: Interrupt number | 109 | * @irq: Interrupt number |
129 | * @data: Pointer to chip specific data | 110 | * @data: Pointer to chip specific data |
130 | * | 111 | * |
131 | * Set the hardware irq chip data for an irq | 112 | * Set the hardware irq chip data for an irq |
132 | */ | 113 | */ |
133 | int set_irq_chip_data(unsigned int irq, void *data) | 114 | int irq_set_chip_data(unsigned int irq, void *data) |
134 | { | 115 | { |
135 | struct irq_desc *desc = irq_to_desc(irq); | ||
136 | unsigned long flags; | 116 | unsigned long flags; |
117 | struct irq_desc *desc = irq_get_desc_lock(irq, &flags); | ||
137 | 118 | ||
138 | if (!desc) { | 119 | if (!desc) |
139 | printk(KERN_ERR | ||
140 | "Trying to install chip data for IRQ%d\n", irq); | ||
141 | return -EINVAL; | ||
142 | } | ||
143 | |||
144 | if (!desc->irq_data.chip) { | ||
145 | printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq); | ||
146 | return -EINVAL; | 120 | return -EINVAL; |
147 | } | ||
148 | |||
149 | raw_spin_lock_irqsave(&desc->lock, flags); | ||
150 | desc->irq_data.chip_data = data; | 121 | desc->irq_data.chip_data = data; |
151 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 122 | irq_put_desc_unlock(desc, flags); |
152 | |||
153 | return 0; | 123 | return 0; |
154 | } | 124 | } |
155 | EXPORT_SYMBOL(set_irq_chip_data); | 125 | EXPORT_SYMBOL(irq_set_chip_data); |
156 | 126 | ||
157 | struct irq_data *irq_get_irq_data(unsigned int irq) | 127 | struct irq_data *irq_get_irq_data(unsigned int irq) |
158 | { | 128 | { |
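Note: the renamed setters keep their old semantics; the descriptor lookup and locking simply move into the irq_get_desc_lock()/irq_put_desc_unlock() helpers. A typical setup sequence with the new names (sketch; my_chip, my_priv and MY_IRQ are made-up driver objects):

#include <linux/irq.h>

static struct irq_chip my_chip;	/* hypothetical chip description */
static int my_priv;		/* hypothetical per-irq driver data */
#define MY_IRQ	42

static int my_setup_irq(void)
{
	int ret;

	ret = irq_set_chip(MY_IRQ, &my_chip);
	if (ret)
		return ret;
	irq_set_chip_data(MY_IRQ, &my_priv);
	return irq_set_irq_type(MY_IRQ, IRQ_TYPE_LEVEL_HIGH);
}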
@@ -162,72 +132,75 @@ struct irq_data *irq_get_irq_data(unsigned int irq) | |||
162 | } | 132 | } |
163 | EXPORT_SYMBOL_GPL(irq_get_irq_data); | 133 | EXPORT_SYMBOL_GPL(irq_get_irq_data); |
164 | 134 | ||
165 | /** | 135 | static void irq_state_clr_disabled(struct irq_desc *desc) |
166 | * set_irq_nested_thread - Set/Reset the IRQ_NESTED_THREAD flag of an irq | ||
167 | * | ||
168 | * @irq: Interrupt number | ||
169 | * @nest: 0 to clear / 1 to set the IRQ_NESTED_THREAD flag | ||
170 | * | ||
171 | * The IRQ_NESTED_THREAD flag indicates that on | ||
172 | * request_threaded_irq() no separate interrupt thread should be | ||
173 | * created for the irq as the handler are called nested in the | ||
174 | * context of a demultiplexing interrupt handler thread. | ||
175 | */ | ||
176 | void set_irq_nested_thread(unsigned int irq, int nest) | ||
177 | { | 136 | { |
178 | struct irq_desc *desc = irq_to_desc(irq); | 137 | desc->istate &= ~IRQS_DISABLED; |
179 | unsigned long flags; | 138 | irq_compat_clr_disabled(desc); |
180 | |||
181 | if (!desc) | ||
182 | return; | ||
183 | |||
184 | raw_spin_lock_irqsave(&desc->lock, flags); | ||
185 | if (nest) | ||
186 | desc->status |= IRQ_NESTED_THREAD; | ||
187 | else | ||
188 | desc->status &= ~IRQ_NESTED_THREAD; | ||
189 | raw_spin_unlock_irqrestore(&desc->lock, flags); | ||
190 | } | 139 | } |
191 | EXPORT_SYMBOL_GPL(set_irq_nested_thread); | ||
192 | 140 | ||
193 | /* | 141 | static void irq_state_set_disabled(struct irq_desc *desc) |
194 | * default enable function | ||
195 | */ | ||
196 | static void default_enable(struct irq_data *data) | ||
197 | { | 142 | { |
198 | struct irq_desc *desc = irq_data_to_desc(data); | 143 | desc->istate |= IRQS_DISABLED; |
144 | irq_compat_set_disabled(desc); | ||
145 | } | ||
199 | 146 | ||
200 | desc->irq_data.chip->irq_unmask(&desc->irq_data); | 147 | static void irq_state_clr_masked(struct irq_desc *desc) |
201 | desc->status &= ~IRQ_MASKED; | 148 | { |
149 | desc->istate &= ~IRQS_MASKED; | ||
150 | irq_compat_clr_masked(desc); | ||
202 | } | 151 | } |
203 | 152 | ||
204 | /* | 153 | static void irq_state_set_masked(struct irq_desc *desc) |
205 | * default disable function | ||
206 | */ | ||
207 | static void default_disable(struct irq_data *data) | ||
208 | { | 154 | { |
155 | desc->istate |= IRQS_MASKED; | ||
156 | irq_compat_set_masked(desc); | ||
209 | } | 157 | } |
210 | 158 | ||
211 | /* | 159 | int irq_startup(struct irq_desc *desc) |
212 | * default startup function | ||
213 | */ | ||
214 | static unsigned int default_startup(struct irq_data *data) | ||
215 | { | 160 | { |
216 | struct irq_desc *desc = irq_data_to_desc(data); | 161 | irq_state_clr_disabled(desc); |
162 | desc->depth = 0; | ||
163 | |||
164 | if (desc->irq_data.chip->irq_startup) { | ||
165 | int ret = desc->irq_data.chip->irq_startup(&desc->irq_data); | ||
166 | irq_state_clr_masked(desc); | ||
167 | return ret; | ||
168 | } | ||
217 | 169 | ||
218 | desc->irq_data.chip->irq_enable(data); | 170 | irq_enable(desc); |
219 | return 0; | 171 | return 0; |
220 | } | 172 | } |
221 | 173 | ||
222 | /* | 174 | void irq_shutdown(struct irq_desc *desc) |
223 | * default shutdown function | ||
224 | */ | ||
225 | static void default_shutdown(struct irq_data *data) | ||
226 | { | 175 | { |
227 | struct irq_desc *desc = irq_data_to_desc(data); | 176 | irq_state_set_disabled(desc); |
177 | desc->depth = 1; | ||
178 | if (desc->irq_data.chip->irq_shutdown) | ||
179 | desc->irq_data.chip->irq_shutdown(&desc->irq_data); | ||
180 | if (desc->irq_data.chip->irq_disable) | ||
181 | desc->irq_data.chip->irq_disable(&desc->irq_data); | ||
182 | else | ||
183 | desc->irq_data.chip->irq_mask(&desc->irq_data); | ||
184 | irq_state_set_masked(desc); | ||
185 | } | ||
228 | 186 | ||
229 | desc->irq_data.chip->irq_mask(&desc->irq_data); | 187 | void irq_enable(struct irq_desc *desc) |
230 | desc->status |= IRQ_MASKED; | 188 | { |
189 | irq_state_clr_disabled(desc); | ||
190 | if (desc->irq_data.chip->irq_enable) | ||
191 | desc->irq_data.chip->irq_enable(&desc->irq_data); | ||
192 | else | ||
193 | desc->irq_data.chip->irq_unmask(&desc->irq_data); | ||
194 | irq_state_clr_masked(desc); | ||
195 | } | ||
196 | |||
197 | void irq_disable(struct irq_desc *desc) | ||
198 | { | ||
199 | irq_state_set_disabled(desc); | ||
200 | if (desc->irq_data.chip->irq_disable) { | ||
201 | desc->irq_data.chip->irq_disable(&desc->irq_data); | ||
202 | irq_state_set_masked(desc); | ||
203 | } | ||
231 | } | 204 | } |
232 | 205 | ||
233 | #ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED | 206 | #ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED |
@@ -315,10 +288,6 @@ static void compat_bus_sync_unlock(struct irq_data *data) | |||
315 | void irq_chip_set_defaults(struct irq_chip *chip) | 288 | void irq_chip_set_defaults(struct irq_chip *chip) |
316 | { | 289 | { |
317 | #ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED | 290 | #ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED |
318 | /* | ||
319 | * Compat fixup functions need to be before we set the | ||
320 | * defaults for enable/disable/startup/shutdown | ||
321 | */ | ||
322 | if (chip->enable) | 291 | if (chip->enable) |
323 | chip->irq_enable = compat_irq_enable; | 292 | chip->irq_enable = compat_irq_enable; |
324 | if (chip->disable) | 293 | if (chip->disable) |
@@ -327,33 +296,8 @@ void irq_chip_set_defaults(struct irq_chip *chip) | |||
327 | chip->irq_shutdown = compat_irq_shutdown; | 296 | chip->irq_shutdown = compat_irq_shutdown; |
328 | if (chip->startup) | 297 | if (chip->startup) |
329 | chip->irq_startup = compat_irq_startup; | 298 | chip->irq_startup = compat_irq_startup; |
330 | #endif | ||
331 | /* | ||
332 | * The real defaults | ||
333 | */ | ||
334 | if (!chip->irq_enable) | ||
335 | chip->irq_enable = default_enable; | ||
336 | if (!chip->irq_disable) | ||
337 | chip->irq_disable = default_disable; | ||
338 | if (!chip->irq_startup) | ||
339 | chip->irq_startup = default_startup; | ||
340 | /* | ||
341 | * We use chip->irq_disable, when the user provided its own. When | ||
342 | * we have default_disable set for chip->irq_disable, then we need | ||
343 | * to use default_shutdown, otherwise the irq line is not | ||
344 | * disabled on free_irq(): | ||
345 | */ | ||
346 | if (!chip->irq_shutdown) | ||
347 | chip->irq_shutdown = chip->irq_disable != default_disable ? | ||
348 | chip->irq_disable : default_shutdown; | ||
349 | |||
350 | #ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED | ||
351 | if (!chip->end) | 299 | if (!chip->end) |
352 | chip->end = dummy_irq_chip.end; | 300 | chip->end = dummy_irq_chip.end; |
353 | |||
354 | /* | ||
355 | * Now fix up the remaining compat handlers | ||
356 | */ | ||
357 | if (chip->bus_lock) | 301 | if (chip->bus_lock) |
358 | chip->irq_bus_lock = compat_bus_lock; | 302 | chip->irq_bus_lock = compat_bus_lock; |
359 | if (chip->bus_sync_unlock) | 303 | if (chip->bus_sync_unlock) |
@@ -388,22 +332,22 @@ static inline void mask_ack_irq(struct irq_desc *desc) | |||
388 | if (desc->irq_data.chip->irq_ack) | 332 | if (desc->irq_data.chip->irq_ack) |
389 | desc->irq_data.chip->irq_ack(&desc->irq_data); | 333 | desc->irq_data.chip->irq_ack(&desc->irq_data); |
390 | } | 334 | } |
391 | desc->status |= IRQ_MASKED; | 335 | irq_state_set_masked(desc); |
392 | } | 336 | } |
393 | 337 | ||
394 | static inline void mask_irq(struct irq_desc *desc) | 338 | void mask_irq(struct irq_desc *desc) |
395 | { | 339 | { |
396 | if (desc->irq_data.chip->irq_mask) { | 340 | if (desc->irq_data.chip->irq_mask) { |
397 | desc->irq_data.chip->irq_mask(&desc->irq_data); | 341 | desc->irq_data.chip->irq_mask(&desc->irq_data); |
398 | desc->status |= IRQ_MASKED; | 342 | irq_state_set_masked(desc); |
399 | } | 343 | } |
400 | } | 344 | } |
401 | 345 | ||
402 | static inline void unmask_irq(struct irq_desc *desc) | 346 | void unmask_irq(struct irq_desc *desc) |
403 | { | 347 | { |
404 | if (desc->irq_data.chip->irq_unmask) { | 348 | if (desc->irq_data.chip->irq_unmask) { |
405 | desc->irq_data.chip->irq_unmask(&desc->irq_data); | 349 | desc->irq_data.chip->irq_unmask(&desc->irq_data); |
406 | desc->status &= ~IRQ_MASKED; | 350 | irq_state_clr_masked(desc); |
407 | } | 351 | } |
408 | } | 352 | } |
409 | 353 | ||
@@ -428,10 +372,11 @@ void handle_nested_irq(unsigned int irq) | |||
428 | kstat_incr_irqs_this_cpu(irq, desc); | 372 | kstat_incr_irqs_this_cpu(irq, desc); |
429 | 373 | ||
430 | action = desc->action; | 374 | action = desc->action; |
431 | if (unlikely(!action || (desc->status & IRQ_DISABLED))) | 375 | if (unlikely(!action || (desc->istate & IRQS_DISABLED))) |
432 | goto out_unlock; | 376 | goto out_unlock; |
433 | 377 | ||
434 | desc->status |= IRQ_INPROGRESS; | 378 | irq_compat_set_progress(desc); |
379 | desc->istate |= IRQS_INPROGRESS; | ||
435 | raw_spin_unlock_irq(&desc->lock); | 380 | raw_spin_unlock_irq(&desc->lock); |
436 | 381 | ||
437 | action_ret = action->thread_fn(action->irq, action->dev_id); | 382 | action_ret = action->thread_fn(action->irq, action->dev_id); |
@@ -439,13 +384,21 @@ void handle_nested_irq(unsigned int irq) | |||
439 | note_interrupt(irq, desc, action_ret); | 384 | note_interrupt(irq, desc, action_ret); |
440 | 385 | ||
441 | raw_spin_lock_irq(&desc->lock); | 386 | raw_spin_lock_irq(&desc->lock); |
442 | desc->status &= ~IRQ_INPROGRESS; | 387 | desc->istate &= ~IRQS_INPROGRESS; |
388 | irq_compat_clr_progress(desc); | ||
443 | 389 | ||
444 | out_unlock: | 390 | out_unlock: |
445 | raw_spin_unlock_irq(&desc->lock); | 391 | raw_spin_unlock_irq(&desc->lock); |
446 | } | 392 | } |
447 | EXPORT_SYMBOL_GPL(handle_nested_irq); | 393 | EXPORT_SYMBOL_GPL(handle_nested_irq); |
448 | 394 | ||
395 | static bool irq_check_poll(struct irq_desc *desc) | ||
396 | { | ||
397 | if (!(desc->istate & IRQS_POLL_INPROGRESS)) | ||
398 | return false; | ||
399 | return irq_wait_for_poll(desc); | ||
400 | } | ||
401 | |||
449 | /** | 402 | /** |
450 | * handle_simple_irq - Simple and software-decoded IRQs. | 403 | * handle_simple_irq - Simple and software-decoded IRQs. |
451 | * @irq: the interrupt number | 404 | * @irq: the interrupt number |
@@ -461,29 +414,20 @@ EXPORT_SYMBOL_GPL(handle_nested_irq); | |||
461 | void | 414 | void |
462 | handle_simple_irq(unsigned int irq, struct irq_desc *desc) | 415 | handle_simple_irq(unsigned int irq, struct irq_desc *desc) |
463 | { | 416 | { |
464 | struct irqaction *action; | ||
465 | irqreturn_t action_ret; | ||
466 | |||
467 | raw_spin_lock(&desc->lock); | 417 | raw_spin_lock(&desc->lock); |
468 | 418 | ||
469 | if (unlikely(desc->status & IRQ_INPROGRESS)) | 419 | if (unlikely(desc->istate & IRQS_INPROGRESS)) |
470 | goto out_unlock; | 420 | if (!irq_check_poll(desc)) |
471 | desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); | 421 | goto out_unlock; |
422 | |||
423 | desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); | ||
472 | kstat_incr_irqs_this_cpu(irq, desc); | 424 | kstat_incr_irqs_this_cpu(irq, desc); |
473 | 425 | ||
474 | action = desc->action; | 426 | if (unlikely(!desc->action || (desc->istate & IRQS_DISABLED))) |
475 | if (unlikely(!action || (desc->status & IRQ_DISABLED))) | ||
476 | goto out_unlock; | 427 | goto out_unlock; |
477 | 428 | ||
478 | desc->status |= IRQ_INPROGRESS; | 429 | handle_irq_event(desc); |
479 | raw_spin_unlock(&desc->lock); | ||
480 | 430 | ||
481 | action_ret = handle_IRQ_event(irq, action); | ||
482 | if (!noirqdebug) | ||
483 | note_interrupt(irq, desc, action_ret); | ||
484 | |||
485 | raw_spin_lock(&desc->lock); | ||
486 | desc->status &= ~IRQ_INPROGRESS; | ||
487 | out_unlock: | 431 | out_unlock: |
488 | raw_spin_unlock(&desc->lock); | 432 | raw_spin_unlock(&desc->lock); |
489 | } | 433 | } |
@@ -501,42 +445,42 @@ out_unlock: | |||
501 | void | 445 | void |
502 | handle_level_irq(unsigned int irq, struct irq_desc *desc) | 446 | handle_level_irq(unsigned int irq, struct irq_desc *desc) |
503 | { | 447 | { |
504 | struct irqaction *action; | ||
505 | irqreturn_t action_ret; | ||
506 | |||
507 | raw_spin_lock(&desc->lock); | 448 | raw_spin_lock(&desc->lock); |
508 | mask_ack_irq(desc); | 449 | mask_ack_irq(desc); |
509 | 450 | ||
510 | if (unlikely(desc->status & IRQ_INPROGRESS)) | 451 | if (unlikely(desc->istate & IRQS_INPROGRESS)) |
511 | goto out_unlock; | 452 | if (!irq_check_poll(desc)) |
512 | desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); | 453 | goto out_unlock; |
454 | |||
455 | desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); | ||
513 | kstat_incr_irqs_this_cpu(irq, desc); | 456 | kstat_incr_irqs_this_cpu(irq, desc); |
514 | 457 | ||
515 | /* | 458 | /* |
516 | * If its disabled or no action available | 459 | * If its disabled or no action available |
517 | * keep it masked and get out of here | 460 | * keep it masked and get out of here |
518 | */ | 461 | */ |
519 | action = desc->action; | 462 | if (unlikely(!desc->action || (desc->istate & IRQS_DISABLED))) |
520 | if (unlikely(!action || (desc->status & IRQ_DISABLED))) | ||
521 | goto out_unlock; | 463 | goto out_unlock; |
522 | 464 | ||
523 | desc->status |= IRQ_INPROGRESS; | 465 | handle_irq_event(desc); |
524 | raw_spin_unlock(&desc->lock); | ||
525 | |||
526 | action_ret = handle_IRQ_event(irq, action); | ||
527 | if (!noirqdebug) | ||
528 | note_interrupt(irq, desc, action_ret); | ||
529 | 466 | ||
530 | raw_spin_lock(&desc->lock); | 467 | if (!(desc->istate & (IRQS_DISABLED | IRQS_ONESHOT))) |
531 | desc->status &= ~IRQ_INPROGRESS; | ||
532 | |||
533 | if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT))) | ||
534 | unmask_irq(desc); | 468 | unmask_irq(desc); |
535 | out_unlock: | 469 | out_unlock: |
536 | raw_spin_unlock(&desc->lock); | 470 | raw_spin_unlock(&desc->lock); |
537 | } | 471 | } |
538 | EXPORT_SYMBOL_GPL(handle_level_irq); | 472 | EXPORT_SYMBOL_GPL(handle_level_irq); |
539 | 473 | ||
474 | #ifdef CONFIG_IRQ_PREFLOW_FASTEOI | ||
475 | static inline void preflow_handler(struct irq_desc *desc) | ||
476 | { | ||
477 | if (desc->preflow_handler) | ||
478 | desc->preflow_handler(&desc->irq_data); | ||
479 | } | ||
480 | #else | ||
481 | static inline void preflow_handler(struct irq_desc *desc) { } | ||
482 | #endif | ||
483 | |||
540 | /** | 484 | /** |
541 | * handle_fasteoi_irq - irq handler for transparent controllers | 485 | * handle_fasteoi_irq - irq handler for transparent controllers |
542 | * @irq: the interrupt number | 486 | * @irq: the interrupt number |
@@ -550,42 +494,41 @@ EXPORT_SYMBOL_GPL(handle_level_irq); | |||
550 | void | 494 | void |
551 | handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) | 495 | handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) |
552 | { | 496 | { |
553 | struct irqaction *action; | ||
554 | irqreturn_t action_ret; | ||
555 | |||
556 | raw_spin_lock(&desc->lock); | 497 | raw_spin_lock(&desc->lock); |
557 | 498 | ||
558 | if (unlikely(desc->status & IRQ_INPROGRESS)) | 499 | if (unlikely(desc->istate & IRQS_INPROGRESS)) |
559 | goto out; | 500 | if (!irq_check_poll(desc)) |
501 | goto out; | ||
560 | 502 | ||
561 | desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); | 503 | desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); |
562 | kstat_incr_irqs_this_cpu(irq, desc); | 504 | kstat_incr_irqs_this_cpu(irq, desc); |
563 | 505 | ||
564 | /* | 506 | /* |
565 | * If its disabled or no action available | 507 | * If its disabled or no action available |
566 | * then mask it and get out of here: | 508 | * then mask it and get out of here: |
567 | */ | 509 | */ |
568 | action = desc->action; | 510 | if (unlikely(!desc->action || (desc->istate & IRQS_DISABLED))) { |
569 | if (unlikely(!action || (desc->status & IRQ_DISABLED))) { | 511 | irq_compat_set_pending(desc); |
570 | desc->status |= IRQ_PENDING; | 512 | desc->istate |= IRQS_PENDING; |
571 | mask_irq(desc); | 513 | mask_irq(desc); |
572 | goto out; | 514 | goto out; |
573 | } | 515 | } |
574 | 516 | ||
575 | desc->status |= IRQ_INPROGRESS; | 517 | if (desc->istate & IRQS_ONESHOT) |
576 | desc->status &= ~IRQ_PENDING; | 518 | mask_irq(desc); |
577 | raw_spin_unlock(&desc->lock); | ||
578 | 519 | ||
579 | action_ret = handle_IRQ_event(irq, action); | 520 | preflow_handler(desc); |
580 | if (!noirqdebug) | 521 | handle_irq_event(desc); |
581 | note_interrupt(irq, desc, action_ret); | ||
582 | 522 | ||
583 | raw_spin_lock(&desc->lock); | 523 | out_eoi: |
584 | desc->status &= ~IRQ_INPROGRESS; | ||
585 | out: | ||
586 | desc->irq_data.chip->irq_eoi(&desc->irq_data); | 524 | desc->irq_data.chip->irq_eoi(&desc->irq_data); |
587 | 525 | out_unlock: | |
588 | raw_spin_unlock(&desc->lock); | 526 | raw_spin_unlock(&desc->lock); |
527 | return; | ||
528 | out: | ||
529 | if (!(desc->irq_data.chip->flags & IRQCHIP_EOI_IF_HANDLED)) | ||
530 | goto out_eoi; | ||
531 | goto out_unlock; | ||
589 | } | 532 | } |
590 | 533 | ||
591 | /** | 534 | /** |
@@ -609,32 +552,28 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) | |||
609 | { | 552 | { |
610 | raw_spin_lock(&desc->lock); | 553 | raw_spin_lock(&desc->lock); |
611 | 554 | ||
612 | desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); | 555 | desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); |
613 | |||
614 | /* | 556 | /* |
615 | * If we're currently running this IRQ, or its disabled, | 557 | * If we're currently running this IRQ, or its disabled, |
616 | * we shouldn't process the IRQ. Mark it pending, handle | 558 | * we shouldn't process the IRQ. Mark it pending, handle |
617 | * the necessary masking and go out | 559 | * the necessary masking and go out |
618 | */ | 560 | */ |
619 | if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) || | 561 | if (unlikely((desc->istate & (IRQS_DISABLED | IRQS_INPROGRESS) || |
620 | !desc->action)) { | 562 | !desc->action))) { |
621 | desc->status |= (IRQ_PENDING | IRQ_MASKED); | 563 | if (!irq_check_poll(desc)) { |
622 | mask_ack_irq(desc); | 564 | irq_compat_set_pending(desc); |
623 | goto out_unlock; | 565 | desc->istate |= IRQS_PENDING; |
566 | mask_ack_irq(desc); | ||
567 | goto out_unlock; | ||
568 | } | ||
624 | } | 569 | } |
625 | kstat_incr_irqs_this_cpu(irq, desc); | 570 | kstat_incr_irqs_this_cpu(irq, desc); |
626 | 571 | ||
627 | /* Start handling the irq */ | 572 | /* Start handling the irq */ |
628 | desc->irq_data.chip->irq_ack(&desc->irq_data); | 573 | desc->irq_data.chip->irq_ack(&desc->irq_data); |
629 | 574 | ||
630 | /* Mark the IRQ currently in progress.*/ | ||
631 | desc->status |= IRQ_INPROGRESS; | ||
632 | |||
633 | do { | 575 | do { |
634 | struct irqaction *action = desc->action; | 576 | if (unlikely(!desc->action)) { |
635 | irqreturn_t action_ret; | ||
636 | |||
637 | if (unlikely(!action)) { | ||
638 | mask_irq(desc); | 577 | mask_irq(desc); |
639 | goto out_unlock; | 578 | goto out_unlock; |
640 | } | 579 | } |
@@ -644,22 +583,17 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) | |||
644 | * one, we could have masked the irq. | 583 | * one, we could have masked the irq. |
645 | * Renable it, if it was not disabled in meantime. | 584 | * Renable it, if it was not disabled in meantime. |
646 | */ | 585 | */ |
647 | if (unlikely((desc->status & | 586 | if (unlikely(desc->istate & IRQS_PENDING)) { |
648 | (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) == | 587 | if (!(desc->istate & IRQS_DISABLED) && |
649 | (IRQ_PENDING | IRQ_MASKED))) { | 588 | (desc->istate & IRQS_MASKED)) |
650 | unmask_irq(desc); | 589 | unmask_irq(desc); |
651 | } | 590 | } |
652 | 591 | ||
653 | desc->status &= ~IRQ_PENDING; | 592 | handle_irq_event(desc); |
654 | raw_spin_unlock(&desc->lock); | ||
655 | action_ret = handle_IRQ_event(irq, action); | ||
656 | if (!noirqdebug) | ||
657 | note_interrupt(irq, desc, action_ret); | ||
658 | raw_spin_lock(&desc->lock); | ||
659 | 593 | ||
660 | } while ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == IRQ_PENDING); | 594 | } while ((desc->istate & IRQS_PENDING) && |
595 | !(desc->istate & IRQS_DISABLED)); | ||
661 | 596 | ||
662 | desc->status &= ~IRQ_INPROGRESS; | ||
663 | out_unlock: | 597 | out_unlock: |
664 | raw_spin_unlock(&desc->lock); | 598 | raw_spin_unlock(&desc->lock); |
665 | } | 599 | } |
@@ -674,103 +608,84 @@ out_unlock: | |||
674 | void | 608 | void |
675 | handle_percpu_irq(unsigned int irq, struct irq_desc *desc) | 609 | handle_percpu_irq(unsigned int irq, struct irq_desc *desc) |
676 | { | 610 | { |
677 | irqreturn_t action_ret; | 611 | struct irq_chip *chip = irq_desc_get_chip(desc); |
678 | 612 | ||
679 | kstat_incr_irqs_this_cpu(irq, desc); | 613 | kstat_incr_irqs_this_cpu(irq, desc); |
680 | 614 | ||
681 | if (desc->irq_data.chip->irq_ack) | 615 | if (chip->irq_ack) |
682 | desc->irq_data.chip->irq_ack(&desc->irq_data); | 616 | chip->irq_ack(&desc->irq_data); |
683 | 617 | ||
684 | action_ret = handle_IRQ_event(irq, desc->action); | 618 | handle_irq_event_percpu(desc, desc->action); |
685 | if (!noirqdebug) | ||
686 | note_interrupt(irq, desc, action_ret); | ||
687 | 619 | ||
688 | if (desc->irq_data.chip->irq_eoi) | 620 | if (chip->irq_eoi) |
689 | desc->irq_data.chip->irq_eoi(&desc->irq_data); | 621 | chip->irq_eoi(&desc->irq_data); |
690 | } | 622 | } |
691 | 623 | ||
692 | void | 624 | void |
693 | __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, | 625 | __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, |
694 | const char *name) | 626 | const char *name) |
695 | { | 627 | { |
696 | struct irq_desc *desc = irq_to_desc(irq); | ||
697 | unsigned long flags; | 628 | unsigned long flags; |
629 | struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); | ||
698 | 630 | ||
699 | if (!desc) { | 631 | if (!desc) |
700 | printk(KERN_ERR | ||
701 | "Trying to install type control for IRQ%d\n", irq); | ||
702 | return; | 632 | return; |
703 | } | ||
704 | 633 | ||
705 | if (!handle) | 634 | if (!handle) { |
706 | handle = handle_bad_irq; | 635 | handle = handle_bad_irq; |
707 | else if (desc->irq_data.chip == &no_irq_chip) { | 636 | } else { |
708 | printk(KERN_WARNING "Trying to install %sinterrupt handler " | 637 | if (WARN_ON(desc->irq_data.chip == &no_irq_chip)) |
709 | "for IRQ%d\n", is_chained ? "chained " : "", irq); | 638 | goto out; |
710 | /* | ||
711 | * Some ARM implementations install a handler for really dumb | ||
712 | * interrupt hardware without setting an irq_chip. This worked | ||
713 | * with the ARM no_irq_chip but the check in setup_irq would | ||
714 | * prevent us to setup the interrupt at all. Switch it to | ||
715 | * dummy_irq_chip for easy transition. | ||
716 | */ | ||
717 | desc->irq_data.chip = &dummy_irq_chip; | ||
718 | } | 639 | } |
719 | 640 | ||
720 | chip_bus_lock(desc); | ||
721 | raw_spin_lock_irqsave(&desc->lock, flags); | ||
722 | |||
723 | /* Uninstall? */ | 641 | /* Uninstall? */ |
724 | if (handle == handle_bad_irq) { | 642 | if (handle == handle_bad_irq) { |
725 | if (desc->irq_data.chip != &no_irq_chip) | 643 | if (desc->irq_data.chip != &no_irq_chip) |
726 | mask_ack_irq(desc); | 644 | mask_ack_irq(desc); |
727 | desc->status |= IRQ_DISABLED; | 645 | irq_compat_set_disabled(desc); |
646 | desc->istate |= IRQS_DISABLED; | ||
728 | desc->depth = 1; | 647 | desc->depth = 1; |
729 | } | 648 | } |
730 | desc->handle_irq = handle; | 649 | desc->handle_irq = handle; |
731 | desc->name = name; | 650 | desc->name = name; |
732 | 651 | ||
733 | if (handle != handle_bad_irq && is_chained) { | 652 | if (handle != handle_bad_irq && is_chained) { |
734 | desc->status &= ~IRQ_DISABLED; | 653 | irq_settings_set_noprobe(desc); |
735 | desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE; | 654 | irq_settings_set_norequest(desc); |
736 | desc->depth = 0; | 655 | irq_startup(desc); |
737 | desc->irq_data.chip->irq_startup(&desc->irq_data); | ||
738 | } | 656 | } |
739 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 657 | out: |
740 | chip_bus_sync_unlock(desc); | 658 | irq_put_desc_busunlock(desc, flags); |
741 | } | ||
742 | EXPORT_SYMBOL_GPL(__set_irq_handler); | ||
743 | |||
744 | void | ||
745 | set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip, | ||
746 | irq_flow_handler_t handle) | ||
747 | { | ||
748 | set_irq_chip(irq, chip); | ||
749 | __set_irq_handler(irq, handle, 0, NULL); | ||
750 | } | 659 | } |
660 | EXPORT_SYMBOL_GPL(__irq_set_handler); | ||
751 | 661 | ||
752 | void | 662 | void |
753 | set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, | 663 | irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, |
754 | irq_flow_handler_t handle, const char *name) | 664 | irq_flow_handler_t handle, const char *name) |
755 | { | 665 | { |
756 | set_irq_chip(irq, chip); | 666 | irq_set_chip(irq, chip); |
757 | __set_irq_handler(irq, handle, 0, name); | 667 | __irq_set_handler(irq, handle, 0, name); |
758 | } | 668 | } |
759 | 669 | ||
760 | void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) | 670 | void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) |
761 | { | 671 | { |
762 | struct irq_desc *desc = irq_to_desc(irq); | ||
763 | unsigned long flags; | 672 | unsigned long flags; |
673 | struct irq_desc *desc = irq_get_desc_lock(irq, &flags); | ||
764 | 674 | ||
765 | if (!desc) | 675 | if (!desc) |
766 | return; | 676 | return; |
677 | irq_settings_clr_and_set(desc, clr, set); | ||
678 | |||
679 | irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU | | ||
680 | IRQD_TRIGGER_MASK | IRQD_LEVEL | IRQD_MOVE_PCNTXT); | ||
681 | if (irq_settings_has_no_balance_set(desc)) | ||
682 | irqd_set(&desc->irq_data, IRQD_NO_BALANCING); | ||
683 | if (irq_settings_is_per_cpu(desc)) | ||
684 | irqd_set(&desc->irq_data, IRQD_PER_CPU); | ||
685 | if (irq_settings_can_move_pcntxt(desc)) | ||
686 | irqd_set(&desc->irq_data, IRQD_MOVE_PCNTXT); | ||
767 | 687 | ||
768 | /* Sanitize flags */ | 688 | irqd_set(&desc->irq_data, irq_settings_get_trigger_mask(desc)); |
769 | set &= IRQF_MODIFY_MASK; | ||
770 | clr &= IRQF_MODIFY_MASK; | ||
771 | 689 | ||
772 | raw_spin_lock_irqsave(&desc->lock, flags); | 690 | irq_put_desc_unlock(desc, flags); |
773 | desc->status &= ~clr; | ||
774 | desc->status |= set; | ||
775 | raw_spin_unlock_irqrestore(&desc->lock, flags); | ||
776 | } | 691 | } |
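Editor's note: besides the flow-handler rework, chip.c renames __set_irq_handler() to __irq_set_handler() and set_irq_chip_and_handler_name() to irq_set_chip_and_handler_name(), and per-irq flag updates now go through irq_modify_status(). A sketch of how a platform init path might use the renamed setters; the chip pointer, handler choice and flag bits are illustrative:

	#include <linux/irq.h>

	static void example_init_one_irq(unsigned int irq, struct irq_chip *chip)
	{
		/* formerly set_irq_chip_and_handler_name() */
		irq_set_chip_and_handler_name(irq, chip, handle_level_irq, "level");

		/* flag bits are adjusted via the single remaining accessor */
		irq_modify_status(irq, IRQ_NOREQUEST, IRQ_NOPROBE);
	}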
diff --git a/kernel/irq/compat.h b/kernel/irq/compat.h new file mode 100644 index 000000000000..6bbaf66aca85 --- /dev/null +++ b/kernel/irq/compat.h | |||
@@ -0,0 +1,72 @@ | |||
1 | /* | ||
2 | * Compat layer for transition period | ||
3 | */ | ||
4 | #ifndef CONFIG_GENERIC_HARDIRQS_NO_COMPAT | ||
5 | static inline void irq_compat_set_progress(struct irq_desc *desc) | ||
6 | { | ||
7 | desc->status |= IRQ_INPROGRESS; | ||
8 | } | ||
9 | |||
10 | static inline void irq_compat_clr_progress(struct irq_desc *desc) | ||
11 | { | ||
12 | desc->status &= ~IRQ_INPROGRESS; | ||
13 | } | ||
14 | static inline void irq_compat_set_disabled(struct irq_desc *desc) | ||
15 | { | ||
16 | desc->status |= IRQ_DISABLED; | ||
17 | } | ||
18 | static inline void irq_compat_clr_disabled(struct irq_desc *desc) | ||
19 | { | ||
20 | desc->status &= ~IRQ_DISABLED; | ||
21 | } | ||
22 | static inline void irq_compat_set_pending(struct irq_desc *desc) | ||
23 | { | ||
24 | desc->status |= IRQ_PENDING; | ||
25 | } | ||
26 | |||
27 | static inline void irq_compat_clr_pending(struct irq_desc *desc) | ||
28 | { | ||
29 | desc->status &= ~IRQ_PENDING; | ||
30 | } | ||
31 | static inline void irq_compat_set_masked(struct irq_desc *desc) | ||
32 | { | ||
33 | desc->status |= IRQ_MASKED; | ||
34 | } | ||
35 | |||
36 | static inline void irq_compat_clr_masked(struct irq_desc *desc) | ||
37 | { | ||
38 | desc->status &= ~IRQ_MASKED; | ||
39 | } | ||
40 | static inline void irq_compat_set_move_pending(struct irq_desc *desc) | ||
41 | { | ||
42 | desc->status |= IRQ_MOVE_PENDING; | ||
43 | } | ||
44 | |||
45 | static inline void irq_compat_clr_move_pending(struct irq_desc *desc) | ||
46 | { | ||
47 | desc->status &= ~IRQ_MOVE_PENDING; | ||
48 | } | ||
49 | static inline void irq_compat_set_affinity(struct irq_desc *desc) | ||
50 | { | ||
51 | desc->status |= IRQ_AFFINITY_SET; | ||
52 | } | ||
53 | |||
54 | static inline void irq_compat_clr_affinity(struct irq_desc *desc) | ||
55 | { | ||
56 | desc->status &= ~IRQ_AFFINITY_SET; | ||
57 | } | ||
58 | #else | ||
59 | static inline void irq_compat_set_progress(struct irq_desc *desc) { } | ||
60 | static inline void irq_compat_clr_progress(struct irq_desc *desc) { } | ||
61 | static inline void irq_compat_set_disabled(struct irq_desc *desc) { } | ||
62 | static inline void irq_compat_clr_disabled(struct irq_desc *desc) { } | ||
63 | static inline void irq_compat_set_pending(struct irq_desc *desc) { } | ||
64 | static inline void irq_compat_clr_pending(struct irq_desc *desc) { } | ||
65 | static inline void irq_compat_set_masked(struct irq_desc *desc) { } | ||
66 | static inline void irq_compat_clr_masked(struct irq_desc *desc) { } | ||
67 | static inline void irq_compat_set_move_pending(struct irq_desc *desc) { } | ||
68 | static inline void irq_compat_clr_move_pending(struct irq_desc *desc) { } | ||
69 | static inline void irq_compat_set_affinity(struct irq_desc *desc) { } | ||
70 | static inline void irq_compat_clr_affinity(struct irq_desc *desc) { } | ||
71 | #endif | ||
72 | |||
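Editor's note: while this compat layer is in place, the core pairs every istate update with the matching irq_compat_*() call so the legacy desc->status word stays coherent. A sketch of the pairing as used by the flow handlers above (desc->lock is held by the callers):

	/* mark an interrupt pending during the transition period */
	irq_compat_set_pending(desc);
	desc->istate |= IRQS_PENDING;

	/* ... and clear it again the same way */
	irq_compat_clr_pending(desc);
	desc->istate &= ~IRQS_PENDING;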
diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h new file mode 100644 index 000000000000..d1a33b7fa61d --- /dev/null +++ b/kernel/irq/debug.h | |||
@@ -0,0 +1,40 @@ | |||
1 | /* | ||
2 | * Debugging printout: | ||
3 | */ | ||
4 | |||
5 | #include <linux/kallsyms.h> | ||
6 | |||
7 | #define P(f) if (desc->status & f) printk("%14s set\n", #f) | ||
8 | #define PS(f) if (desc->istate & f) printk("%14s set\n", #f) | ||
9 | |||
10 | static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) | ||
11 | { | ||
12 | printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n", | ||
13 | irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled); | ||
14 | printk("->handle_irq(): %p, ", desc->handle_irq); | ||
15 | print_symbol("%s\n", (unsigned long)desc->handle_irq); | ||
16 | printk("->irq_data.chip(): %p, ", desc->irq_data.chip); | ||
17 | print_symbol("%s\n", (unsigned long)desc->irq_data.chip); | ||
18 | printk("->action(): %p\n", desc->action); | ||
19 | if (desc->action) { | ||
20 | printk("->action->handler(): %p, ", desc->action->handler); | ||
21 | print_symbol("%s\n", (unsigned long)desc->action->handler); | ||
22 | } | ||
23 | |||
24 | P(IRQ_LEVEL); | ||
25 | P(IRQ_PER_CPU); | ||
26 | P(IRQ_NOPROBE); | ||
27 | P(IRQ_NOREQUEST); | ||
28 | P(IRQ_NOAUTOEN); | ||
29 | |||
30 | PS(IRQS_AUTODETECT); | ||
31 | PS(IRQS_INPROGRESS); | ||
32 | PS(IRQS_REPLAY); | ||
33 | PS(IRQS_WAITING); | ||
34 | PS(IRQS_DISABLED); | ||
35 | PS(IRQS_PENDING); | ||
36 | PS(IRQS_MASKED); | ||
37 | } | ||
38 | |||
39 | #undef P | ||
40 | #undef PS | ||
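Editor's note: print_irq_desc() is the dump helper used by the bad-irq path; a minimal caller, assuming it lives inside kernel/irq/ where this header is visible:

	static void example_report_bad_irq(unsigned int irq, struct irq_desc *desc)
	{
		printk(KERN_ERR "unexpected state on IRQ %d\n", irq);
		print_irq_desc(irq, desc);
	}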
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index e2347eb63306..517561fc7317 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c | |||
@@ -51,30 +51,92 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action) | |||
51 | "but no thread function available.", irq, action->name); | 51 | "but no thread function available.", irq, action->name); |
52 | } | 52 | } |
53 | 53 | ||
54 | /** | 54 | static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action) |
55 | * handle_IRQ_event - irq action chain handler | 55 | { |
56 | * @irq: the interrupt number | 56 | /* |
57 | * @action: the interrupt action chain for this irq | 57 | * Wake up the handler thread for this action. In case the |
58 | * | 58 | * thread crashed and was killed we just pretend that we |
59 | * Handles the action chain of an irq event | 59 | * handled the interrupt. The hardirq handler has disabled the |
60 | */ | 60 | * device interrupt, so no irq storm is lurking. If the |
61 | irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) | 61 | * RUNTHREAD bit is already set, nothing to do. |
62 | */ | ||
63 | if (test_bit(IRQTF_DIED, &action->thread_flags) || | ||
64 | test_and_set_bit(IRQTF_RUNTHREAD, &action->thread_flags)) | ||
65 | return; | ||
66 | |||
67 | /* | ||
68 | * It's safe to OR the mask lockless here. We have only two | ||
69 | * places which write to threads_oneshot: This code and the | ||
70 | * irq thread. | ||
71 | * | ||
72 | * This code is the hard irq context and can never run on two | ||
73 | * cpus in parallel. If it ever does we have more serious | ||
74 | * problems than this bitmask. | ||
75 | * | ||
76 | * The irq threads of this irq which clear their "running" bit | ||
77 | * in threads_oneshot are serialized via desc->lock against | ||
78 | * each other and they are serialized against this code by | ||
79 | * IRQS_INPROGRESS. | ||
80 | * | ||
81 | * Hard irq handler: | ||
82 | * | ||
83 | * spin_lock(desc->lock); | ||
84 | * desc->state |= IRQS_INPROGRESS; | ||
85 | * spin_unlock(desc->lock); | ||
86 | * set_bit(IRQTF_RUNTHREAD, &action->thread_flags); | ||
87 | * desc->threads_oneshot |= mask; | ||
88 | * spin_lock(desc->lock); | ||
89 | * desc->state &= ~IRQS_INPROGRESS; | ||
90 | * spin_unlock(desc->lock); | ||
91 | * | ||
92 | * irq thread: | ||
93 | * | ||
94 | * again: | ||
95 | * spin_lock(desc->lock); | ||
96 | * if (desc->state & IRQS_INPROGRESS) { | ||
97 | * spin_unlock(desc->lock); | ||
98 | * while(desc->state & IRQS_INPROGRESS) | ||
99 | * cpu_relax(); | ||
100 | * goto again; | ||
101 | * } | ||
102 | * if (!test_bit(IRQTF_RUNTHREAD, &action->thread_flags)) | ||
103 | * desc->threads_oneshot &= ~mask; | ||
104 | * spin_unlock(desc->lock); | ||
105 | * | ||
106 | * So either the thread waits for us to clear IRQS_INPROGRESS | ||
107 | * or we are waiting in the flow handler for desc->lock to be | ||
108 | * released before we reach this point. The thread also checks | ||
109 | * IRQTF_RUNTHREAD under desc->lock. If set it leaves | ||
110 | * threads_oneshot untouched and runs the thread another time. | ||
111 | */ | ||
112 | desc->threads_oneshot |= action->thread_mask; | ||
113 | wake_up_process(action->thread); | ||
114 | } | ||
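Editor's note: the serialization comment above reduces to a small mask protocol: the hard-irq side sets one bit per action in threads_oneshot before waking that thread, and each thread clears its own bit under desc->lock when it finishes. An illustration of the bookkeeping only (plain C, not kernel code; for oneshot interrupts the line is expected to stay masked until the word drops back to zero):

	unsigned long threads_oneshot = 0;
	const unsigned long mask_a = 1UL << 0;	/* action A's thread_mask */
	const unsigned long mask_b = 1UL << 1;	/* action B's thread_mask */

	threads_oneshot |= mask_a;		/* hard irq wakes thread A */
	threads_oneshot |= mask_b;		/* hard irq wakes thread B */

	threads_oneshot &= ~mask_a;		/* thread A done */
	threads_oneshot &= ~mask_b;		/* thread B done, unmask now possible */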
115 | |||
116 | irqreturn_t | ||
117 | handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) | ||
62 | { | 118 | { |
63 | irqreturn_t ret, retval = IRQ_NONE; | 119 | irqreturn_t retval = IRQ_NONE; |
64 | unsigned int status = 0; | 120 | unsigned int random = 0, irq = desc->irq_data.irq; |
65 | 121 | ||
66 | do { | 122 | do { |
123 | irqreturn_t res; | ||
124 | |||
67 | trace_irq_handler_entry(irq, action); | 125 | trace_irq_handler_entry(irq, action); |
68 | ret = action->handler(irq, action->dev_id); | 126 | res = action->handler(irq, action->dev_id); |
69 | trace_irq_handler_exit(irq, action, ret); | 127 | trace_irq_handler_exit(irq, action, res); |
128 | |||
129 | if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pF enabled interrupts\n", | ||
130 | irq, action->handler)) | ||
131 | local_irq_disable(); | ||
70 | 132 | ||
71 | switch (ret) { | 133 | switch (res) { |
72 | case IRQ_WAKE_THREAD: | 134 | case IRQ_WAKE_THREAD: |
73 | /* | 135 | /* |
74 | * Set result to handled so the spurious check | 136 | * Set result to handled so the spurious check |
75 | * does not trigger. | 137 | * does not trigger. |
76 | */ | 138 | */ |
77 | ret = IRQ_HANDLED; | 139 | res = IRQ_HANDLED; |
78 | 140 | ||
79 | /* | 141 | /* |
80 | * Catch drivers which return WAKE_THREAD but | 142 | * Catch drivers which return WAKE_THREAD but |
@@ -85,147 +147,56 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) | |||
85 | break; | 147 | break; |
86 | } | 148 | } |
87 | 149 | ||
88 | /* | 150 | irq_wake_thread(desc, action); |
89 | * Wake up the handler thread for this | ||
90 | * action. In case the thread crashed and was | ||
91 | * killed we just pretend that we handled the | ||
92 | * interrupt. The hardirq handler above has | ||
93 | * disabled the device interrupt, so no irq | ||
94 | * storm is lurking. | ||
95 | */ | ||
96 | if (likely(!test_bit(IRQTF_DIED, | ||
97 | &action->thread_flags))) { | ||
98 | set_bit(IRQTF_RUNTHREAD, &action->thread_flags); | ||
99 | wake_up_process(action->thread); | ||
100 | } | ||
101 | 151 | ||
102 | /* Fall through to add to randomness */ | 152 | /* Fall through to add to randomness */ |
103 | case IRQ_HANDLED: | 153 | case IRQ_HANDLED: |
104 | status |= action->flags; | 154 | random |= action->flags; |
105 | break; | 155 | break; |
106 | 156 | ||
107 | default: | 157 | default: |
108 | break; | 158 | break; |
109 | } | 159 | } |
110 | 160 | ||
111 | retval |= ret; | 161 | retval |= res; |
112 | action = action->next; | 162 | action = action->next; |
113 | } while (action); | 163 | } while (action); |
114 | 164 | ||
115 | if (status & IRQF_SAMPLE_RANDOM) | 165 | if (random & IRQF_SAMPLE_RANDOM) |
116 | add_interrupt_randomness(irq); | 166 | add_interrupt_randomness(irq); |
117 | local_irq_disable(); | ||
118 | 167 | ||
168 | if (!noirqdebug) | ||
169 | note_interrupt(irq, desc, retval); | ||
119 | return retval; | 170 | return retval; |
120 | } | 171 | } |
121 | 172 | ||
122 | #ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ | 173 | irqreturn_t handle_irq_event(struct irq_desc *desc) |
174 | { | ||
175 | struct irqaction *action = desc->action; | ||
176 | irqreturn_t ret; | ||
123 | 177 | ||
124 | #ifdef CONFIG_ENABLE_WARN_DEPRECATED | 178 | irq_compat_clr_pending(desc); |
125 | # warning __do_IRQ is deprecated. Please convert to proper flow handlers | 179 | desc->istate &= ~IRQS_PENDING; |
126 | #endif | 180 | irq_compat_set_progress(desc); |
181 | desc->istate |= IRQS_INPROGRESS; | ||
182 | raw_spin_unlock(&desc->lock); | ||
183 | |||
184 | ret = handle_irq_event_percpu(desc, action); | ||
185 | |||
186 | raw_spin_lock(&desc->lock); | ||
187 | desc->istate &= ~IRQS_INPROGRESS; | ||
188 | irq_compat_clr_progress(desc); | ||
189 | return ret; | ||
190 | } | ||
127 | 191 | ||
128 | /** | 192 | /** |
129 | * __do_IRQ - original all in one highlevel IRQ handler | 193 | * handle_IRQ_event - irq action chain handler |
130 | * @irq: the interrupt number | 194 | * @irq: the interrupt number |
195 | * @action: the interrupt action chain for this irq | ||
131 | * | 196 | * |
132 | * __do_IRQ handles all normal device IRQ's (the special | 197 | * Handles the action chain of an irq event |
133 | * SMP cross-CPU interrupts have their own specific | ||
134 | * handlers). | ||
135 | * | ||
136 | * This is the original x86 implementation which is used for every | ||
137 | * interrupt type. | ||
138 | */ | 198 | */ |
139 | unsigned int __do_IRQ(unsigned int irq) | 199 | irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) |
140 | { | 200 | { |
141 | struct irq_desc *desc = irq_to_desc(irq); | 201 | return handle_irq_event_percpu(irq_to_desc(irq), action); |
142 | struct irqaction *action; | ||
143 | unsigned int status; | ||
144 | |||
145 | kstat_incr_irqs_this_cpu(irq, desc); | ||
146 | |||
147 | if (CHECK_IRQ_PER_CPU(desc->status)) { | ||
148 | irqreturn_t action_ret; | ||
149 | |||
150 | /* | ||
151 | * No locking required for CPU-local interrupts: | ||
152 | */ | ||
153 | if (desc->irq_data.chip->ack) | ||
154 | desc->irq_data.chip->ack(irq); | ||
155 | if (likely(!(desc->status & IRQ_DISABLED))) { | ||
156 | action_ret = handle_IRQ_event(irq, desc->action); | ||
157 | if (!noirqdebug) | ||
158 | note_interrupt(irq, desc, action_ret); | ||
159 | } | ||
160 | desc->irq_data.chip->end(irq); | ||
161 | return 1; | ||
162 | } | ||
163 | |||
164 | raw_spin_lock(&desc->lock); | ||
165 | if (desc->irq_data.chip->ack) | ||
166 | desc->irq_data.chip->ack(irq); | ||
167 | /* | ||
168 | * REPLAY is when Linux resends an IRQ that was dropped earlier | ||
169 | * WAITING is used by probe to mark irqs that are being tested | ||
170 | */ | ||
171 | status = desc->status & ~(IRQ_REPLAY | IRQ_WAITING); | ||
172 | status |= IRQ_PENDING; /* we _want_ to handle it */ | ||
173 | |||
174 | /* | ||
175 | * If the IRQ is disabled for whatever reason, we cannot | ||
176 | * use the action we have. | ||
177 | */ | ||
178 | action = NULL; | ||
179 | if (likely(!(status & (IRQ_DISABLED | IRQ_INPROGRESS)))) { | ||
180 | action = desc->action; | ||
181 | status &= ~IRQ_PENDING; /* we commit to handling */ | ||
182 | status |= IRQ_INPROGRESS; /* we are handling it */ | ||
183 | } | ||
184 | desc->status = status; | ||
185 | |||
186 | /* | ||
187 | * If there is no IRQ handler or it was disabled, exit early. | ||
188 | * Since we set PENDING, if another processor is handling | ||
189 | * a different instance of this same irq, the other processor | ||
190 | * will take care of it. | ||
191 | */ | ||
192 | if (unlikely(!action)) | ||
193 | goto out; | ||
194 | |||
195 | /* | ||
196 | * Edge triggered interrupts need to remember | ||
197 | * pending events. | ||
198 | * This applies to any hw interrupts that allow a second | ||
199 | * instance of the same irq to arrive while we are in do_IRQ | ||
200 | * or in the handler. But the code here only handles the _second_ | ||
201 | * instance of the irq, not the third or fourth. So it is mostly | ||
202 | * useful for irq hardware that does not mask cleanly in an | ||
203 | * SMP environment. | ||
204 | */ | ||
205 | for (;;) { | ||
206 | irqreturn_t action_ret; | ||
207 | |||
208 | raw_spin_unlock(&desc->lock); | ||
209 | |||
210 | action_ret = handle_IRQ_event(irq, action); | ||
211 | if (!noirqdebug) | ||
212 | note_interrupt(irq, desc, action_ret); | ||
213 | |||
214 | raw_spin_lock(&desc->lock); | ||
215 | if (likely(!(desc->status & IRQ_PENDING))) | ||
216 | break; | ||
217 | desc->status &= ~IRQ_PENDING; | ||
218 | } | ||
219 | desc->status &= ~IRQ_INPROGRESS; | ||
220 | |||
221 | out: | ||
222 | /* | ||
223 | * The ->end() handler has to deal with interrupts which got | ||
224 | * disabled while the handler was running. | ||
225 | */ | ||
226 | desc->irq_data.chip->end(irq); | ||
227 | raw_spin_unlock(&desc->lock); | ||
228 | |||
229 | return 1; | ||
230 | } | 202 | } |
231 | #endif | ||
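Editor's note: handle_irq_event_percpu() is what ends up calling a driver's primary handler and, through irq_wake_thread(), its threaded handler. A hedged sketch of a driver exercising the IRQ_WAKE_THREAD path; the device pointer, name and register handling are hypothetical:

	#include <linux/interrupt.h>

	static irqreturn_t example_quick_check(int irq, void *dev_id)
	{
		/* ack/disable the source in the device, then defer the work */
		return IRQ_WAKE_THREAD;
	}

	static irqreturn_t example_thread_fn(int irq, void *dev_id)
	{
		/* the part that may sleep runs here, in process context */
		return IRQ_HANDLED;
	}

	static int example_setup(unsigned int irq, void *dev)
	{
		return request_threaded_irq(irq, example_quick_check,
					    example_thread_fn, 0, "example", dev);
	}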
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 4571ae7e085a..6c6ec9a49027 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h | |||
@@ -1,27 +1,101 @@ | |||
1 | /* | 1 | /* |
2 | * IRQ subsystem internal functions and variables: | 2 | * IRQ subsystem internal functions and variables: |
3 | * | ||
4 | * Do not ever include this file from anything else than | ||
5 | * kernel/irq/. Do not even think about using any information outside | ||
6 | * of this file for your non core code. | ||
3 | */ | 7 | */ |
4 | #include <linux/irqdesc.h> | 8 | #include <linux/irqdesc.h> |
5 | 9 | ||
10 | #ifdef CONFIG_SPARSE_IRQ | ||
11 | # define IRQ_BITMAP_BITS (NR_IRQS + 8196) | ||
12 | #else | ||
13 | # define IRQ_BITMAP_BITS NR_IRQS | ||
14 | #endif | ||
15 | |||
16 | #define istate core_internal_state__do_not_mess_with_it | ||
17 | |||
18 | #ifdef CONFIG_GENERIC_HARDIRQS_NO_COMPAT | ||
19 | # define status status_use_accessors | ||
20 | #endif | ||
21 | |||
6 | extern int noirqdebug; | 22 | extern int noirqdebug; |
7 | 23 | ||
24 | /* | ||
25 | * Bits used by threaded handlers: | ||
26 | * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run | ||
27 | * IRQTF_DIED - handler thread died | ||
28 | * IRQTF_WARNED - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed | ||
29 | * IRQTF_AFFINITY - irq thread is requested to adjust affinity | ||
30 | * IRQTF_FORCED_THREAD - irq action is force threaded | ||
31 | */ | ||
32 | enum { | ||
33 | IRQTF_RUNTHREAD, | ||
34 | IRQTF_DIED, | ||
35 | IRQTF_WARNED, | ||
36 | IRQTF_AFFINITY, | ||
37 | IRQTF_FORCED_THREAD, | ||
38 | }; | ||
39 | |||
40 | /* | ||
41 | * Bit masks for desc->state | ||
42 | * | ||
43 | * IRQS_AUTODETECT - autodetection in progress | ||
44 | * IRQS_SPURIOUS_DISABLED - was disabled due to spurious interrupt | ||
45 | * detection | ||
46 | * IRQS_POLL_INPROGRESS - polling in progress | ||
47 | * IRQS_INPROGRESS - Interrupt in progress | ||
48 | * IRQS_ONESHOT - irq is not unmasked in primary handler | ||
49 | * IRQS_REPLAY - irq is replayed | ||
50 | * IRQS_WAITING - irq is waiting | ||
51 | * IRQS_DISABLED - irq is disabled | ||
52 | * IRQS_PENDING - irq is pending and replayed later | ||
53 | * IRQS_MASKED - irq is masked | ||
54 | * IRQS_SUSPENDED - irq is suspended | ||
55 | */ | ||
56 | enum { | ||
57 | IRQS_AUTODETECT = 0x00000001, | ||
58 | IRQS_SPURIOUS_DISABLED = 0x00000002, | ||
59 | IRQS_POLL_INPROGRESS = 0x00000008, | ||
60 | IRQS_INPROGRESS = 0x00000010, | ||
61 | IRQS_ONESHOT = 0x00000020, | ||
62 | IRQS_REPLAY = 0x00000040, | ||
63 | IRQS_WAITING = 0x00000080, | ||
64 | IRQS_DISABLED = 0x00000100, | ||
65 | IRQS_PENDING = 0x00000200, | ||
66 | IRQS_MASKED = 0x00000400, | ||
67 | IRQS_SUSPENDED = 0x00000800, | ||
68 | }; | ||
69 | |||
70 | #include "compat.h" | ||
71 | #include "debug.h" | ||
72 | #include "settings.h" | ||
73 | |||
8 | #define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) | 74 | #define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) |
9 | 75 | ||
10 | /* Set default functions for irq_chip structures: */ | 76 | /* Set default functions for irq_chip structures: */ |
11 | extern void irq_chip_set_defaults(struct irq_chip *chip); | 77 | extern void irq_chip_set_defaults(struct irq_chip *chip); |
12 | 78 | ||
13 | /* Set default handler: */ | ||
14 | extern void compat_irq_chip_set_default_handler(struct irq_desc *desc); | ||
15 | |||
16 | extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, | 79 | extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, |
17 | unsigned long flags); | 80 | unsigned long flags); |
18 | extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); | 81 | extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); |
19 | extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); | 82 | extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); |
20 | 83 | ||
84 | extern int irq_startup(struct irq_desc *desc); | ||
85 | extern void irq_shutdown(struct irq_desc *desc); | ||
86 | extern void irq_enable(struct irq_desc *desc); | ||
87 | extern void irq_disable(struct irq_desc *desc); | ||
88 | extern void mask_irq(struct irq_desc *desc); | ||
89 | extern void unmask_irq(struct irq_desc *desc); | ||
90 | |||
21 | extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); | 91 | extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); |
22 | 92 | ||
93 | irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action); | ||
94 | irqreturn_t handle_irq_event(struct irq_desc *desc); | ||
95 | |||
23 | /* Resending of interrupts :*/ | 96 | /* Resending of interrupts :*/ |
24 | void check_irq_resend(struct irq_desc *desc, unsigned int irq); | 97 | void check_irq_resend(struct irq_desc *desc, unsigned int irq); |
98 | bool irq_wait_for_poll(struct irq_desc *desc); | ||
25 | 99 | ||
26 | #ifdef CONFIG_PROC_FS | 100 | #ifdef CONFIG_PROC_FS |
27 | extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); | 101 | extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); |
@@ -37,20 +111,10 @@ static inline void unregister_handler_proc(unsigned int irq, | |||
37 | struct irqaction *action) { } | 111 | struct irqaction *action) { } |
38 | #endif | 112 | #endif |
39 | 113 | ||
40 | extern int irq_select_affinity_usr(unsigned int irq); | 114 | extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask); |
41 | 115 | ||
42 | extern void irq_set_thread_affinity(struct irq_desc *desc); | 116 | extern void irq_set_thread_affinity(struct irq_desc *desc); |
43 | 117 | ||
44 | #ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED | ||
45 | static inline void irq_end(unsigned int irq, struct irq_desc *desc) | ||
46 | { | ||
47 | if (desc->irq_data.chip && desc->irq_data.chip->end) | ||
48 | desc->irq_data.chip->end(irq); | ||
49 | } | ||
50 | #else | ||
51 | static inline void irq_end(unsigned int irq, struct irq_desc *desc) { } | ||
52 | #endif | ||
53 | |||
54 | /* Inline functions for support of irq chips on slow busses */ | 118 | /* Inline functions for support of irq chips on slow busses */ |
55 | static inline void chip_bus_lock(struct irq_desc *desc) | 119 | static inline void chip_bus_lock(struct irq_desc *desc) |
56 | { | 120 | { |
@@ -64,43 +128,60 @@ static inline void chip_bus_sync_unlock(struct irq_desc *desc) | |||
64 | desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data); | 128 | desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data); |
65 | } | 129 | } |
66 | 130 | ||
131 | struct irq_desc * | ||
132 | __irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus); | ||
133 | void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus); | ||
134 | |||
135 | static inline struct irq_desc * | ||
136 | irq_get_desc_buslock(unsigned int irq, unsigned long *flags) | ||
137 | { | ||
138 | return __irq_get_desc_lock(irq, flags, true); | ||
139 | } | ||
140 | |||
141 | static inline void | ||
142 | irq_put_desc_busunlock(struct irq_desc *desc, unsigned long flags) | ||
143 | { | ||
144 | __irq_put_desc_unlock(desc, flags, true); | ||
145 | } | ||
146 | |||
147 | static inline struct irq_desc * | ||
148 | irq_get_desc_lock(unsigned int irq, unsigned long *flags) | ||
149 | { | ||
150 | return __irq_get_desc_lock(irq, flags, false); | ||
151 | } | ||
152 | |||
153 | static inline void | ||
154 | irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags) | ||
155 | { | ||
156 | __irq_put_desc_unlock(desc, flags, false); | ||
157 | } | ||
158 | |||
67 | /* | 159 | /* |
68 | * Debugging printout: | 160 | * Manipulation functions for irq_data.state |
69 | */ | 161 | */ |
162 | static inline void irqd_set_move_pending(struct irq_data *d) | ||
163 | { | ||
164 | d->state_use_accessors |= IRQD_SETAFFINITY_PENDING; | ||
165 | irq_compat_set_move_pending(irq_data_to_desc(d)); | ||
166 | } | ||
70 | 167 | ||
71 | #include <linux/kallsyms.h> | 168 | static inline void irqd_clr_move_pending(struct irq_data *d) |
72 | 169 | { | |
73 | #define P(f) if (desc->status & f) printk("%14s set\n", #f) | 170 | d->state_use_accessors &= ~IRQD_SETAFFINITY_PENDING; |
171 | irq_compat_clr_move_pending(irq_data_to_desc(d)); | ||
172 | } | ||
74 | 173 | ||
75 | static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) | 174 | static inline void irqd_clear(struct irq_data *d, unsigned int mask) |
76 | { | 175 | { |
77 | printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n", | 176 | d->state_use_accessors &= ~mask; |
78 | irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled); | ||
79 | printk("->handle_irq(): %p, ", desc->handle_irq); | ||
80 | print_symbol("%s\n", (unsigned long)desc->handle_irq); | ||
81 | printk("->irq_data.chip(): %p, ", desc->irq_data.chip); | ||
82 | print_symbol("%s\n", (unsigned long)desc->irq_data.chip); | ||
83 | printk("->action(): %p\n", desc->action); | ||
84 | if (desc->action) { | ||
85 | printk("->action->handler(): %p, ", desc->action->handler); | ||
86 | print_symbol("%s\n", (unsigned long)desc->action->handler); | ||
87 | } | ||
88 | |||
89 | P(IRQ_INPROGRESS); | ||
90 | P(IRQ_DISABLED); | ||
91 | P(IRQ_PENDING); | ||
92 | P(IRQ_REPLAY); | ||
93 | P(IRQ_AUTODETECT); | ||
94 | P(IRQ_WAITING); | ||
95 | P(IRQ_LEVEL); | ||
96 | P(IRQ_MASKED); | ||
97 | #ifdef CONFIG_IRQ_PER_CPU | ||
98 | P(IRQ_PER_CPU); | ||
99 | #endif | ||
100 | P(IRQ_NOPROBE); | ||
101 | P(IRQ_NOREQUEST); | ||
102 | P(IRQ_NOAUTOEN); | ||
103 | } | 177 | } |
104 | 178 | ||
105 | #undef P | 179 | static inline void irqd_set(struct irq_data *d, unsigned int mask) |
180 | { | ||
181 | d->state_use_accessors |= mask; | ||
182 | } | ||
106 | 183 | ||
184 | static inline bool irqd_has_set(struct irq_data *d, unsigned int mask) | ||
185 | { | ||
186 | return d->state_use_accessors & mask; | ||
187 | } | ||
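Editor's note: inside kernel/irq/ the new lock helpers and the istate bits are meant to be used together; a sketch of a core-internal check combining them (the function name is illustrative):

	static bool example_irq_is_disabled(unsigned int irq)
	{
		unsigned long flags;
		struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
		bool ret;

		if (!desc)
			return false;
		ret = !!(desc->istate & IRQS_DISABLED);
		irq_put_desc_unlock(desc, flags);
		return ret;
	}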
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 9988d03797f5..dbccc799407f 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c | |||
@@ -72,18 +72,22 @@ static inline int desc_node(struct irq_desc *desc) { return 0; } | |||
72 | 72 | ||
73 | static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) | 73 | static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) |
74 | { | 74 | { |
75 | int cpu; | ||
76 | |||
75 | desc->irq_data.irq = irq; | 77 | desc->irq_data.irq = irq; |
76 | desc->irq_data.chip = &no_irq_chip; | 78 | desc->irq_data.chip = &no_irq_chip; |
77 | desc->irq_data.chip_data = NULL; | 79 | desc->irq_data.chip_data = NULL; |
78 | desc->irq_data.handler_data = NULL; | 80 | desc->irq_data.handler_data = NULL; |
79 | desc->irq_data.msi_desc = NULL; | 81 | desc->irq_data.msi_desc = NULL; |
80 | desc->status = IRQ_DEFAULT_INIT_FLAGS; | 82 | irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS); |
83 | desc->istate = IRQS_DISABLED; | ||
81 | desc->handle_irq = handle_bad_irq; | 84 | desc->handle_irq = handle_bad_irq; |
82 | desc->depth = 1; | 85 | desc->depth = 1; |
83 | desc->irq_count = 0; | 86 | desc->irq_count = 0; |
84 | desc->irqs_unhandled = 0; | 87 | desc->irqs_unhandled = 0; |
85 | desc->name = NULL; | 88 | desc->name = NULL; |
86 | memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs))); | 89 | for_each_possible_cpu(cpu) |
90 | *per_cpu_ptr(desc->kstat_irqs, cpu) = 0; | ||
87 | desc_smp_init(desc, node); | 91 | desc_smp_init(desc, node); |
88 | } | 92 | } |
89 | 93 | ||
@@ -91,7 +95,7 @@ int nr_irqs = NR_IRQS; | |||
91 | EXPORT_SYMBOL_GPL(nr_irqs); | 95 | EXPORT_SYMBOL_GPL(nr_irqs); |
92 | 96 | ||
93 | static DEFINE_MUTEX(sparse_irq_lock); | 97 | static DEFINE_MUTEX(sparse_irq_lock); |
94 | static DECLARE_BITMAP(allocated_irqs, NR_IRQS); | 98 | static DECLARE_BITMAP(allocated_irqs, IRQ_BITMAP_BITS); |
95 | 99 | ||
96 | #ifdef CONFIG_SPARSE_IRQ | 100 | #ifdef CONFIG_SPARSE_IRQ |
97 | 101 | ||
@@ -133,8 +137,7 @@ static struct irq_desc *alloc_desc(int irq, int node) | |||
133 | if (!desc) | 137 | if (!desc) |
134 | return NULL; | 138 | return NULL; |
135 | /* allocate based on nr_cpu_ids */ | 139 | /* allocate based on nr_cpu_ids */ |
136 | desc->kstat_irqs = kzalloc_node(nr_cpu_ids * sizeof(*desc->kstat_irqs), | 140 | desc->kstat_irqs = alloc_percpu(unsigned int); |
137 | gfp, node); | ||
138 | if (!desc->kstat_irqs) | 141 | if (!desc->kstat_irqs) |
139 | goto err_desc; | 142 | goto err_desc; |
140 | 143 | ||
@@ -149,7 +152,7 @@ static struct irq_desc *alloc_desc(int irq, int node) | |||
149 | return desc; | 152 | return desc; |
150 | 153 | ||
151 | err_kstat: | 154 | err_kstat: |
152 | kfree(desc->kstat_irqs); | 155 | free_percpu(desc->kstat_irqs); |
153 | err_desc: | 156 | err_desc: |
154 | kfree(desc); | 157 | kfree(desc); |
155 | return NULL; | 158 | return NULL; |
@@ -166,7 +169,7 @@ static void free_desc(unsigned int irq) | |||
166 | mutex_unlock(&sparse_irq_lock); | 169 | mutex_unlock(&sparse_irq_lock); |
167 | 170 | ||
168 | free_masks(desc); | 171 | free_masks(desc); |
169 | kfree(desc->kstat_irqs); | 172 | free_percpu(desc->kstat_irqs); |
170 | kfree(desc); | 173 | kfree(desc); |
171 | } | 174 | } |
172 | 175 | ||
@@ -204,6 +207,14 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) | |||
204 | return NULL; | 207 | return NULL; |
205 | } | 208 | } |
206 | 209 | ||
210 | static int irq_expand_nr_irqs(unsigned int nr) | ||
211 | { | ||
212 | if (nr > IRQ_BITMAP_BITS) | ||
213 | return -ENOMEM; | ||
214 | nr_irqs = nr; | ||
215 | return 0; | ||
216 | } | ||
217 | |||
207 | int __init early_irq_init(void) | 218 | int __init early_irq_init(void) |
208 | { | 219 | { |
209 | int i, initcnt, node = first_online_node; | 220 | int i, initcnt, node = first_online_node; |
@@ -215,6 +226,15 @@ int __init early_irq_init(void) | |||
215 | initcnt = arch_probe_nr_irqs(); | 226 | initcnt = arch_probe_nr_irqs(); |
216 | printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d %d\n", NR_IRQS, nr_irqs, initcnt); | 227 | printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d %d\n", NR_IRQS, nr_irqs, initcnt); |
217 | 228 | ||
229 | if (WARN_ON(nr_irqs > IRQ_BITMAP_BITS)) | ||
230 | nr_irqs = IRQ_BITMAP_BITS; | ||
231 | |||
232 | if (WARN_ON(initcnt > IRQ_BITMAP_BITS)) | ||
233 | initcnt = IRQ_BITMAP_BITS; | ||
234 | |||
235 | if (initcnt > nr_irqs) | ||
236 | nr_irqs = initcnt; | ||
237 | |||
218 | for (i = 0; i < initcnt; i++) { | 238 | for (i = 0; i < initcnt; i++) { |
219 | desc = alloc_desc(i, node); | 239 | desc = alloc_desc(i, node); |
220 | set_bit(i, allocated_irqs); | 240 | set_bit(i, allocated_irqs); |
@@ -227,14 +247,13 @@ int __init early_irq_init(void) | |||
227 | 247 | ||
228 | struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { | 248 | struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { |
229 | [0 ... NR_IRQS-1] = { | 249 | [0 ... NR_IRQS-1] = { |
230 | .status = IRQ_DEFAULT_INIT_FLAGS, | 250 | .istate = IRQS_DISABLED, |
231 | .handle_irq = handle_bad_irq, | 251 | .handle_irq = handle_bad_irq, |
232 | .depth = 1, | 252 | .depth = 1, |
233 | .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock), | 253 | .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock), |
234 | } | 254 | } |
235 | }; | 255 | }; |
236 | 256 | ||
237 | static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS]; | ||
238 | int __init early_irq_init(void) | 257 | int __init early_irq_init(void) |
239 | { | 258 | { |
240 | int count, i, node = first_online_node; | 259 | int count, i, node = first_online_node; |
@@ -250,7 +269,8 @@ int __init early_irq_init(void) | |||
250 | for (i = 0; i < count; i++) { | 269 | for (i = 0; i < count; i++) { |
251 | desc[i].irq_data.irq = i; | 270 | desc[i].irq_data.irq = i; |
252 | desc[i].irq_data.chip = &no_irq_chip; | 271 | desc[i].irq_data.chip = &no_irq_chip; |
253 | desc[i].kstat_irqs = kstat_irqs_all[i]; | 272 | desc[i].kstat_irqs = alloc_percpu(unsigned int); |
273 | irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS); | ||
254 | alloc_masks(desc + i, GFP_KERNEL, node); | 274 | alloc_masks(desc + i, GFP_KERNEL, node); |
255 | desc_smp_init(desc + i, node); | 275 | desc_smp_init(desc + i, node); |
256 | lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); | 276 | lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); |
@@ -277,6 +297,12 @@ static inline int alloc_descs(unsigned int start, unsigned int cnt, int node) | |||
277 | { | 297 | { |
278 | return start; | 298 | return start; |
279 | } | 299 | } |
300 | |||
301 | static int irq_expand_nr_irqs(unsigned int nr) | ||
302 | { | ||
303 | return -ENOMEM; | ||
304 | } | ||
305 | |||
280 | #endif /* !CONFIG_SPARSE_IRQ */ | 306 | #endif /* !CONFIG_SPARSE_IRQ */ |
281 | 307 | ||
282 | /* Dynamic interrupt handling */ | 308 | /* Dynamic interrupt handling */ |
@@ -320,14 +346,17 @@ irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node) | |||
320 | 346 | ||
321 | mutex_lock(&sparse_irq_lock); | 347 | mutex_lock(&sparse_irq_lock); |
322 | 348 | ||
323 | start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0); | 349 | start = bitmap_find_next_zero_area(allocated_irqs, IRQ_BITMAP_BITS, |
350 | from, cnt, 0); | ||
324 | ret = -EEXIST; | 351 | ret = -EEXIST; |
325 | if (irq >=0 && start != irq) | 352 | if (irq >=0 && start != irq) |
326 | goto err; | 353 | goto err; |
327 | 354 | ||
328 | ret = -ENOMEM; | 355 | if (start + cnt > nr_irqs) { |
329 | if (start >= nr_irqs) | 356 | ret = irq_expand_nr_irqs(start + cnt); |
330 | goto err; | 357 | if (ret) |
358 | goto err; | ||
359 | } | ||
331 | 360 | ||
332 | bitmap_set(allocated_irqs, start, cnt); | 361 | bitmap_set(allocated_irqs, start, cnt); |
333 | mutex_unlock(&sparse_irq_lock); | 362 | mutex_unlock(&sparse_irq_lock); |
@@ -374,6 +403,26 @@ unsigned int irq_get_next_irq(unsigned int offset) | |||
374 | return find_next_bit(allocated_irqs, nr_irqs, offset); | 403 | return find_next_bit(allocated_irqs, nr_irqs, offset); |
375 | } | 404 | } |
376 | 405 | ||
406 | struct irq_desc * | ||
407 | __irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus) | ||
408 | { | ||
409 | struct irq_desc *desc = irq_to_desc(irq); | ||
410 | |||
411 | if (desc) { | ||
412 | if (bus) | ||
413 | chip_bus_lock(desc); | ||
414 | raw_spin_lock_irqsave(&desc->lock, *flags); | ||
415 | } | ||
416 | return desc; | ||
417 | } | ||
418 | |||
419 | void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus) | ||
420 | { | ||
421 | raw_spin_unlock_irqrestore(&desc->lock, flags); | ||
422 | if (bus) | ||
423 | chip_bus_sync_unlock(desc); | ||
424 | } | ||
425 | |||
377 | /** | 426 | /** |
378 | * dynamic_irq_cleanup - cleanup a dynamically allocated irq | 427 | * dynamic_irq_cleanup - cleanup a dynamically allocated irq |
379 | * @irq: irq number to initialize | 428 | * @irq: irq number to initialize |
@@ -391,7 +440,9 @@ void dynamic_irq_cleanup(unsigned int irq) | |||
391 | unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) | 440 | unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) |
392 | { | 441 | { |
393 | struct irq_desc *desc = irq_to_desc(irq); | 442 | struct irq_desc *desc = irq_to_desc(irq); |
394 | return desc ? desc->kstat_irqs[cpu] : 0; | 443 | |
444 | return desc && desc->kstat_irqs ? | ||
445 | *per_cpu_ptr(desc->kstat_irqs, cpu) : 0; | ||
395 | } | 446 | } |
396 | 447 | ||
397 | #ifdef CONFIG_GENERIC_HARDIRQS | 448 | #ifdef CONFIG_GENERIC_HARDIRQS |
@@ -401,10 +452,10 @@ unsigned int kstat_irqs(unsigned int irq) | |||
401 | int cpu; | 452 | int cpu; |
402 | int sum = 0; | 453 | int sum = 0; |
403 | 454 | ||
404 | if (!desc) | 455 | if (!desc || !desc->kstat_irqs) |
405 | return 0; | 456 | return 0; |
406 | for_each_possible_cpu(cpu) | 457 | for_each_possible_cpu(cpu) |
407 | sum += desc->kstat_irqs[cpu]; | 458 | sum += *per_cpu_ptr(desc->kstat_irqs, cpu); |
408 | return sum; | 459 | return sum; |
409 | } | 460 | } |
410 | #endif /* CONFIG_GENERIC_HARDIRQS */ | 461 | #endif /* CONFIG_GENERIC_HARDIRQS */ |
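Editor's note: with the statistics now stored per cpu, outside users keep going through the accessors; a sketch of summing the counts for one interrupt, following the same pattern as kstat_irqs() above:

	#include <linux/kernel_stat.h>

	static unsigned int example_irq_total(unsigned int irq)
	{
		unsigned int sum = 0;
		int cpu;

		for_each_possible_cpu(cpu)
			sum += kstat_irqs_cpu(irq, cpu);
		return sum;
	}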
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 91a5fa25054e..acd599a43bfb 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
@@ -17,6 +17,17 @@ | |||
17 | 17 | ||
18 | #include "internals.h" | 18 | #include "internals.h" |
19 | 19 | ||
20 | #ifdef CONFIG_IRQ_FORCED_THREADING | ||
21 | __read_mostly bool force_irqthreads; | ||
22 | |||
23 | static int __init setup_forced_irqthreads(char *arg) | ||
24 | { | ||
25 | force_irqthreads = true; | ||
26 | return 0; | ||
27 | } | ||
28 | early_param("threadirqs", setup_forced_irqthreads); | ||
29 | #endif | ||
30 | |||
20 | /** | 31 | /** |
21 | * synchronize_irq - wait for pending IRQ handlers (on other CPUs) | 32 | * synchronize_irq - wait for pending IRQ handlers (on other CPUs) |
22 | * @irq: interrupt number to wait for | 33 | * @irq: interrupt number to wait for |
@@ -30,7 +41,7 @@ | |||
30 | void synchronize_irq(unsigned int irq) | 41 | void synchronize_irq(unsigned int irq) |
31 | { | 42 | { |
32 | struct irq_desc *desc = irq_to_desc(irq); | 43 | struct irq_desc *desc = irq_to_desc(irq); |
33 | unsigned int status; | 44 | unsigned int state; |
34 | 45 | ||
35 | if (!desc) | 46 | if (!desc) |
36 | return; | 47 | return; |
@@ -42,16 +53,16 @@ void synchronize_irq(unsigned int irq) | |||
42 | * Wait until we're out of the critical section. This might | 53 | * Wait until we're out of the critical section. This might |
43 | * give the wrong answer due to the lack of memory barriers. | 54 | * give the wrong answer due to the lack of memory barriers. |
44 | */ | 55 | */ |
45 | while (desc->status & IRQ_INPROGRESS) | 56 | while (desc->istate & IRQS_INPROGRESS) |
46 | cpu_relax(); | 57 | cpu_relax(); |
47 | 58 | ||
48 | /* Ok, that indicated we're done: double-check carefully. */ | 59 | /* Ok, that indicated we're done: double-check carefully. */ |
49 | raw_spin_lock_irqsave(&desc->lock, flags); | 60 | raw_spin_lock_irqsave(&desc->lock, flags); |
50 | status = desc->status; | 61 | state = desc->istate; |
51 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 62 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
52 | 63 | ||
53 | /* Oops, that failed? */ | 64 | /* Oops, that failed? */ |
54 | } while (status & IRQ_INPROGRESS); | 65 | } while (state & IRQS_INPROGRESS); |
55 | 66 | ||
56 | /* | 67 | /* |
57 | * We made sure that no hardirq handler is running. Now verify | 68 | * We made sure that no hardirq handler is running. Now verify |
@@ -73,8 +84,8 @@ int irq_can_set_affinity(unsigned int irq) | |||
73 | { | 84 | { |
74 | struct irq_desc *desc = irq_to_desc(irq); | 85 | struct irq_desc *desc = irq_to_desc(irq); |
75 | 86 | ||
76 | if (CHECK_IRQ_PER_CPU(desc->status) || !desc->irq_data.chip || | 87 | if (!desc || !irqd_can_balance(&desc->irq_data) || |
77 | !desc->irq_data.chip->irq_set_affinity) | 88 | !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity) |
78 | return 0; | 89 | return 0; |
79 | 90 | ||
80 | return 1; | 91 | return 1; |
@@ -100,67 +111,169 @@ void irq_set_thread_affinity(struct irq_desc *desc) | |||
100 | } | 111 | } |
101 | } | 112 | } |
102 | 113 | ||
114 | #ifdef CONFIG_GENERIC_PENDING_IRQ | ||
115 | static inline bool irq_can_move_pcntxt(struct irq_desc *desc) | ||
116 | { | ||
117 | return irq_settings_can_move_pcntxt(desc); | ||
118 | } | ||
119 | static inline bool irq_move_pending(struct irq_desc *desc) | ||
120 | { | ||
121 | return irqd_is_setaffinity_pending(&desc->irq_data); | ||
122 | } | ||
123 | static inline void | ||
124 | irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) | ||
125 | { | ||
126 | cpumask_copy(desc->pending_mask, mask); | ||
127 | } | ||
128 | static inline void | ||
129 | irq_get_pending(struct cpumask *mask, struct irq_desc *desc) | ||
130 | { | ||
131 | cpumask_copy(mask, desc->pending_mask); | ||
132 | } | ||
133 | #else | ||
134 | static inline bool irq_can_move_pcntxt(struct irq_desc *desc) { return true; } | ||
135 | static inline bool irq_move_pending(struct irq_desc *desc) { return false; } | ||
136 | static inline void | ||
137 | irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) { } | ||
138 | static inline void | ||
139 | irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { } | ||
140 | #endif | ||
141 | |||
103 | /** | 142 | /** |
104 | * irq_set_affinity - Set the irq affinity of a given irq | 143 | * irq_set_affinity - Set the irq affinity of a given irq |
105 | * @irq: Interrupt to set affinity | 144 | * @irq: Interrupt to set affinity |
106 | * @cpumask: cpumask | 145 | * @cpumask: cpumask |
107 | * | 146 | * |
108 | */ | 147 | */ |
109 | int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) | 148 | int irq_set_affinity(unsigned int irq, const struct cpumask *mask) |
110 | { | 149 | { |
111 | struct irq_desc *desc = irq_to_desc(irq); | 150 | struct irq_desc *desc = irq_to_desc(irq); |
112 | struct irq_chip *chip = desc->irq_data.chip; | 151 | struct irq_chip *chip = desc->irq_data.chip; |
113 | unsigned long flags; | 152 | unsigned long flags; |
153 | int ret = 0; | ||
114 | 154 | ||
115 | if (!chip->irq_set_affinity) | 155 | if (!chip->irq_set_affinity) |
116 | return -EINVAL; | 156 | return -EINVAL; |
117 | 157 | ||
118 | raw_spin_lock_irqsave(&desc->lock, flags); | 158 | raw_spin_lock_irqsave(&desc->lock, flags); |
119 | 159 | ||
120 | #ifdef CONFIG_GENERIC_PENDING_IRQ | 160 | if (irq_can_move_pcntxt(desc)) { |
121 | if (desc->status & IRQ_MOVE_PCNTXT) { | 161 | ret = chip->irq_set_affinity(&desc->irq_data, mask, false); |
122 | if (!chip->irq_set_affinity(&desc->irq_data, cpumask, false)) { | 162 | switch (ret) { |
123 | cpumask_copy(desc->irq_data.affinity, cpumask); | 163 | case IRQ_SET_MASK_OK: |
164 | cpumask_copy(desc->irq_data.affinity, mask); | ||
165 | case IRQ_SET_MASK_OK_NOCOPY: | ||
124 | irq_set_thread_affinity(desc); | 166 | irq_set_thread_affinity(desc); |
167 | ret = 0; | ||
125 | } | 168 | } |
169 | } else { | ||
170 | irqd_set_move_pending(&desc->irq_data); | ||
171 | irq_copy_pending(desc, mask); | ||
126 | } | 172 | } |
127 | else { | 173 | |
128 | desc->status |= IRQ_MOVE_PENDING; | 174 | if (desc->affinity_notify) { |
129 | cpumask_copy(desc->pending_mask, cpumask); | 175 | kref_get(&desc->affinity_notify->kref); |
130 | } | 176 | schedule_work(&desc->affinity_notify->work); |
131 | #else | ||
132 | if (!chip->irq_set_affinity(&desc->irq_data, cpumask, false)) { | ||
133 | cpumask_copy(desc->irq_data.affinity, cpumask); | ||
134 | irq_set_thread_affinity(desc); | ||
135 | } | 177 | } |
136 | #endif | 178 | irq_compat_set_affinity(desc); |
137 | desc->status |= IRQ_AFFINITY_SET; | 179 | irqd_set(&desc->irq_data, IRQD_AFFINITY_SET); |
138 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 180 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
139 | return 0; | 181 | return ret; |
140 | } | 182 | } |
141 | 183 | ||
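Editor's note: irq_set_affinity() now propagates the chip callback's return code and understands both IRQ_SET_MASK_OK and IRQ_SET_MASK_OK_NOCOPY. A sketch of an irq_chip callback using the NOCOPY variant; the routing-register write is a placeholder:

	static int example_chip_set_affinity(struct irq_data *data,
					     const struct cpumask *mask, bool force)
	{
		unsigned int cpu = cpumask_first_and(mask, cpu_online_mask);

		if (cpu >= nr_cpu_ids)
			return -EINVAL;

		/* program the hardware route to 'cpu' here ... */

		cpumask_copy(data->affinity, cpumask_of(cpu));
		return IRQ_SET_MASK_OK_NOCOPY;	/* core skips its own copy */
	}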
142 | int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) | 184 | int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) |
143 | { | 185 | { |
186 | unsigned long flags; | ||
187 | struct irq_desc *desc = irq_get_desc_lock(irq, &flags); | ||
188 | |||
189 | if (!desc) | ||
190 | return -EINVAL; | ||
191 | desc->affinity_hint = m; | ||
192 | irq_put_desc_unlock(desc, flags); | ||
193 | return 0; | ||
194 | } | ||
195 | EXPORT_SYMBOL_GPL(irq_set_affinity_hint); | ||
196 | |||
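Editor's note: irq_set_affinity_hint() keeps its external contract while using the new descriptor lock helpers internally. Typical driver usage, sketched with an illustrative per-queue irq:

	static void example_hint_queue(unsigned int irq, int cpu)
	{
		irq_set_affinity_hint(irq, cpumask_of(cpu));
	}

	static void example_unhint_queue(unsigned int irq)
	{
		irq_set_affinity_hint(irq, NULL);	/* clear before free_irq() */
	}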
197 | static void irq_affinity_notify(struct work_struct *work) | ||
198 | { | ||
199 | struct irq_affinity_notify *notify = | ||
200 | container_of(work, struct irq_affinity_notify, work); | ||
201 | struct irq_desc *desc = irq_to_desc(notify->irq); | ||
202 | cpumask_var_t cpumask; | ||
203 | unsigned long flags; | ||
204 | |||
205 | if (!desc || !alloc_cpumask_var(&cpumask, GFP_KERNEL)) | ||
206 | goto out; | ||
207 | |||
208 | raw_spin_lock_irqsave(&desc->lock, flags); | ||
209 | if (irq_move_pending(desc)) | ||
210 | irq_get_pending(cpumask, desc); | ||
211 | else | ||
212 | cpumask_copy(cpumask, desc->irq_data.affinity); | ||
213 | raw_spin_unlock_irqrestore(&desc->lock, flags); | ||
214 | |||
215 | notify->notify(notify, cpumask); | ||
216 | |||
217 | free_cpumask_var(cpumask); | ||
218 | out: | ||
219 | kref_put(¬ify->kref, notify->release); | ||
220 | } | ||
221 | |||
222 | /** | ||
223 | * irq_set_affinity_notifier - control notification of IRQ affinity changes | ||
224 | * @irq: Interrupt for which to enable/disable notification | ||
225 | * @notify: Context for notification, or %NULL to disable | ||
226 | * notification. Function pointers must be initialised; | ||
227 | * the other fields will be initialised by this function. | ||
228 | * | ||
229 | * Must be called in process context. Notification may only be enabled | ||
230 | * after the IRQ is allocated and must be disabled before the IRQ is | ||
231 | * freed using free_irq(). | ||
232 | */ | ||
233 | int | ||
234 | irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) | ||
235 | { | ||
144 | struct irq_desc *desc = irq_to_desc(irq); | 236 | struct irq_desc *desc = irq_to_desc(irq); |
237 | struct irq_affinity_notify *old_notify; | ||
145 | unsigned long flags; | 238 | unsigned long flags; |
146 | 239 | ||
240 | /* The release function is promised process context */ | ||
241 | might_sleep(); | ||
242 | |||
147 | if (!desc) | 243 | if (!desc) |
148 | return -EINVAL; | 244 | return -EINVAL; |
149 | 245 | ||
246 | /* Complete initialisation of *notify */ | ||
247 | if (notify) { | ||
248 | notify->irq = irq; | ||
249 | kref_init(¬ify->kref); | ||
250 | INIT_WORK(¬ify->work, irq_affinity_notify); | ||
251 | } | ||
252 | |||
150 | raw_spin_lock_irqsave(&desc->lock, flags); | 253 | raw_spin_lock_irqsave(&desc->lock, flags); |
151 | desc->affinity_hint = m; | 254 | old_notify = desc->affinity_notify; |
255 | desc->affinity_notify = notify; | ||
152 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 256 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
153 | 257 | ||
258 | if (old_notify) | ||
259 | kref_put(&old_notify->kref, old_notify->release); | ||
260 | |||
154 | return 0; | 261 | return 0; |
155 | } | 262 | } |
156 | EXPORT_SYMBOL_GPL(irq_set_affinity_hint); | 263 | EXPORT_SYMBOL_GPL(irq_set_affinity_notifier); |
157 | 264 | ||
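Editor's note: a hedged usage sketch for the new notifier API, assuming the struct irq_affinity_notify layout added alongside this code in <linux/interrupt.h>; the example uses a static notifier so release() has nothing to free:

	#include <linux/interrupt.h>
	#include <linux/kref.h>

	static struct irq_affinity_notify example_notify;

	static void example_affinity_changed(struct irq_affinity_notify *notify,
					     const cpumask_t *mask)
	{
		/* re-steer queues / refresh per-cpu hints here */
	}

	static void example_affinity_release(struct kref *ref)
	{
		/* statically allocated, nothing to free */
	}

	static int example_watch_irq(unsigned int irq)
	{
		example_notify.notify = example_affinity_changed;
		example_notify.release = example_affinity_release;
		return irq_set_affinity_notifier(irq, &example_notify);
	}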
158 | #ifndef CONFIG_AUTO_IRQ_AFFINITY | 265 | #ifndef CONFIG_AUTO_IRQ_AFFINITY |
159 | /* | 266 | /* |
160 | * Generic version of the affinity autoselector. | 267 | * Generic version of the affinity autoselector. |
161 | */ | 268 | */ |
162 | static int setup_affinity(unsigned int irq, struct irq_desc *desc) | 269 | static int |
270 | setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) | ||
163 | { | 271 | { |
272 | struct irq_chip *chip = irq_desc_get_chip(desc); | ||
273 | struct cpumask *set = irq_default_affinity; | ||
274 | int ret; | ||
275 | |||
276 | /* Excludes PER_CPU and NO_BALANCE interrupts */ | ||
164 | if (!irq_can_set_affinity(irq)) | 277 | if (!irq_can_set_affinity(irq)) |
165 | return 0; | 278 | return 0; |
166 | 279 | ||
@@ -168,22 +281,29 @@ static int setup_affinity(unsigned int irq, struct irq_desc *desc) | |||
168 | * Preserve a userspace affinity setup, but make sure that | 281 | * Preserve a userspace affinity setup, but make sure that |
169 | * one of the targets is online. | 282 | * one of the targets is online. |
170 | */ | 283 | */ |
171 | if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) { | 284 | if (irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) { |
172 | if (cpumask_any_and(desc->irq_data.affinity, cpu_online_mask) | 285 | if (cpumask_intersects(desc->irq_data.affinity, |
173 | < nr_cpu_ids) | 286 | cpu_online_mask)) |
174 | goto set_affinity; | 287 | set = desc->irq_data.affinity; |
175 | else | 288 | else { |
176 | desc->status &= ~IRQ_AFFINITY_SET; | 289 | irq_compat_clr_affinity(desc); |
290 | irqd_clear(&desc->irq_data, IRQD_AFFINITY_SET); | ||
291 | } | ||
177 | } | 292 | } |
178 | 293 | ||
179 | cpumask_and(desc->irq_data.affinity, cpu_online_mask, irq_default_affinity); | 294 | cpumask_and(mask, cpu_online_mask, set); |
180 | set_affinity: | 295 | ret = chip->irq_set_affinity(&desc->irq_data, mask, false); |
181 | desc->irq_data.chip->irq_set_affinity(&desc->irq_data, desc->irq_data.affinity, false); | 296 | switch (ret) { |
182 | 297 | case IRQ_SET_MASK_OK: | |
298 | cpumask_copy(desc->irq_data.affinity, mask); | ||
299 | case IRQ_SET_MASK_OK_NOCOPY: | ||
300 | irq_set_thread_affinity(desc); | ||
301 | } | ||
183 | return 0; | 302 | return 0; |
184 | } | 303 | } |
185 | #else | 304 | #else |
186 | static inline int setup_affinity(unsigned int irq, struct irq_desc *d) | 305 | static inline int |
306 | setup_affinity(unsigned int irq, struct irq_desc *d, struct cpumask *mask) | ||
187 | { | 307 | { |
188 | return irq_select_affinity(irq); | 308 | return irq_select_affinity(irq); |
189 | } | 309 | } |
@@ -192,23 +312,21 @@ static inline int setup_affinity(unsigned int irq, struct irq_desc *d) | |||
192 | /* | 312 | /* |
193 | * Called when affinity is set via /proc/irq | 313 | * Called when affinity is set via /proc/irq |
194 | */ | 314 | */ |
195 | int irq_select_affinity_usr(unsigned int irq) | 315 | int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask) |
196 | { | 316 | { |
197 | struct irq_desc *desc = irq_to_desc(irq); | 317 | struct irq_desc *desc = irq_to_desc(irq); |
198 | unsigned long flags; | 318 | unsigned long flags; |
199 | int ret; | 319 | int ret; |
200 | 320 | ||
201 | raw_spin_lock_irqsave(&desc->lock, flags); | 321 | raw_spin_lock_irqsave(&desc->lock, flags); |
202 | ret = setup_affinity(irq, desc); | 322 | ret = setup_affinity(irq, desc, mask); |
203 | if (!ret) | ||
204 | irq_set_thread_affinity(desc); | ||
205 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 323 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
206 | |||
207 | return ret; | 324 | return ret; |
208 | } | 325 | } |
209 | 326 | ||
210 | #else | 327 | #else |
211 | static inline int setup_affinity(unsigned int irq, struct irq_desc *desc) | 328 | static inline int |
329 | setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) | ||
212 | { | 330 | { |
213 | return 0; | 331 | return 0; |
214 | } | 332 | } |
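For reference, setup_affinity() above dispatches on the return value of the chip's ->irq_set_affinity() callback: IRQ_SET_MASK_OK makes the core copy the mask into irq_data.affinity before updating the irq thread affinity, while IRQ_SET_MASK_OK_NOCOPY skips the copy because the chip already stored an adjusted mask. A minimal sketch of such a callback, with a purely hypothetical chip that is not part of this patch:

#include <linux/irq.h>

static int demo_irq_set_affinity(struct irq_data *data,
				 const struct cpumask *mask, bool force)
{
	/* program the (imaginary) routing register for data->irq here */

	/*
	 * Let the core copy @mask into data->affinity and fix up the
	 * irq thread affinity, exactly as the switch above does.
	 */
	return IRQ_SET_MASK_OK;
}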
@@ -219,13 +337,23 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) | |||
219 | if (suspend) { | 337 | if (suspend) { |
220 | if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND)) | 338 | if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND)) |
221 | return; | 339 | return; |
222 | desc->status |= IRQ_SUSPENDED; | 340 | desc->istate |= IRQS_SUSPENDED; |
223 | } | 341 | } |
224 | 342 | ||
225 | if (!desc->depth++) { | 343 | if (!desc->depth++) |
226 | desc->status |= IRQ_DISABLED; | 344 | irq_disable(desc); |
227 | desc->irq_data.chip->irq_disable(&desc->irq_data); | 345 | } |
228 | } | 346 | |
347 | static int __disable_irq_nosync(unsigned int irq) | ||
348 | { | ||
349 | unsigned long flags; | ||
350 | struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); | ||
351 | |||
352 | if (!desc) | ||
353 | return -EINVAL; | ||
354 | __disable_irq(desc, irq, false); | ||
355 | irq_put_desc_busunlock(desc, flags); | ||
356 | return 0; | ||
229 | } | 357 | } |
230 | 358 | ||
231 | /** | 359 | /** |
@@ -241,17 +369,7 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) | |||
241 | */ | 369 | */ |
242 | void disable_irq_nosync(unsigned int irq) | 370 | void disable_irq_nosync(unsigned int irq) |
243 | { | 371 | { |
244 | struct irq_desc *desc = irq_to_desc(irq); | 372 | __disable_irq_nosync(irq); |
245 | unsigned long flags; | ||
246 | |||
247 | if (!desc) | ||
248 | return; | ||
249 | |||
250 | chip_bus_lock(desc); | ||
251 | raw_spin_lock_irqsave(&desc->lock, flags); | ||
252 | __disable_irq(desc, irq, false); | ||
253 | raw_spin_unlock_irqrestore(&desc->lock, flags); | ||
254 | chip_bus_sync_unlock(desc); | ||
255 | } | 373 | } |
256 | EXPORT_SYMBOL(disable_irq_nosync); | 374 | EXPORT_SYMBOL(disable_irq_nosync); |
257 | 375 | ||
@@ -269,21 +387,24 @@ EXPORT_SYMBOL(disable_irq_nosync); | |||
269 | */ | 387 | */ |
270 | void disable_irq(unsigned int irq) | 388 | void disable_irq(unsigned int irq) |
271 | { | 389 | { |
272 | struct irq_desc *desc = irq_to_desc(irq); | 390 | if (!__disable_irq_nosync(irq)) |
273 | |||
274 | if (!desc) | ||
275 | return; | ||
276 | |||
277 | disable_irq_nosync(irq); | ||
278 | if (desc->action) | ||
279 | synchronize_irq(irq); | 391 | synchronize_irq(irq); |
280 | } | 392 | } |
281 | EXPORT_SYMBOL(disable_irq); | 393 | EXPORT_SYMBOL(disable_irq); |
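disable_irq()/enable_irq() remain reference counted via desc->depth, so nested disables from independent call sites still pair up after this rework. A short usage sketch, assuming a driver that owns the irq number:

#include <linux/interrupt.h>

static void demo_quiesce(unsigned int irq)
{
	disable_irq(irq);	/* waits for running handlers, depth 0 -> 1 */
	disable_irq(irq);	/* nested disable, depth 1 -> 2 */

	/* reconfigure the device while the line is guaranteed quiet */

	enable_irq(irq);	/* depth 2 -> 1, line still off */
	enable_irq(irq);	/* depth 1 -> 0, line enabled again */
}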
282 | 394 | ||
283 | void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) | 395 | void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) |
284 | { | 396 | { |
285 | if (resume) | 397 | if (resume) { |
286 | desc->status &= ~IRQ_SUSPENDED; | 398 | if (!(desc->istate & IRQS_SUSPENDED)) { |
399 | if (!desc->action) | ||
400 | return; | ||
401 | if (!(desc->action->flags & IRQF_FORCE_RESUME)) | ||
402 | return; | ||
403 | /* Pretend that it got disabled ! */ | ||
404 | desc->depth++; | ||
405 | } | ||
406 | desc->istate &= ~IRQS_SUSPENDED; | ||
407 | } | ||
287 | 408 | ||
288 | switch (desc->depth) { | 409 | switch (desc->depth) { |
289 | case 0: | 410 | case 0: |
@@ -291,12 +412,11 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) | |||
291 | WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq); | 412 | WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq); |
292 | break; | 413 | break; |
293 | case 1: { | 414 | case 1: { |
294 | unsigned int status = desc->status & ~IRQ_DISABLED; | 415 | if (desc->istate & IRQS_SUSPENDED) |
295 | |||
296 | if (desc->status & IRQ_SUSPENDED) | ||
297 | goto err_out; | 416 | goto err_out; |
298 | /* Prevent probing on this irq: */ | 417 | /* Prevent probing on this irq: */ |
299 | desc->status = status | IRQ_NOPROBE; | 418 | irq_settings_set_noprobe(desc); |
419 | irq_enable(desc); | ||
300 | check_irq_resend(desc, irq); | 420 | check_irq_resend(desc, irq); |
301 | /* fall-through */ | 421 | /* fall-through */ |
302 | } | 422 | } |
@@ -318,21 +438,18 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) | |||
318 | */ | 438 | */ |
319 | void enable_irq(unsigned int irq) | 439 | void enable_irq(unsigned int irq) |
320 | { | 440 | { |
321 | struct irq_desc *desc = irq_to_desc(irq); | ||
322 | unsigned long flags; | 441 | unsigned long flags; |
442 | struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); | ||
323 | 443 | ||
324 | if (!desc) | 444 | if (!desc) |
325 | return; | 445 | return; |
446 | if (WARN(!desc->irq_data.chip, | ||
447 | KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq)) | ||
448 | goto out; | ||
326 | 449 | ||
327 | if (WARN(!desc->irq_data.chip || !desc->irq_data.chip->irq_enable, | ||
328 | KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq)) | ||
329 | return; | ||
330 | |||
331 | chip_bus_lock(desc); | ||
332 | raw_spin_lock_irqsave(&desc->lock, flags); | ||
333 | __enable_irq(desc, irq, false); | 450 | __enable_irq(desc, irq, false); |
334 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 451 | out: |
335 | chip_bus_sync_unlock(desc); | 452 | irq_put_desc_busunlock(desc, flags); |
336 | } | 453 | } |
337 | EXPORT_SYMBOL(enable_irq); | 454 | EXPORT_SYMBOL(enable_irq); |
338 | 455 | ||
@@ -348,7 +465,7 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on) | |||
348 | } | 465 | } |
349 | 466 | ||
350 | /** | 467 | /** |
351 | * set_irq_wake - control irq power management wakeup | 468 | * irq_set_irq_wake - control irq power management wakeup |
352 | * @irq: interrupt to control | 469 | * @irq: interrupt to control |
353 | * @on: enable/disable power management wakeup | 470 | * @on: enable/disable power management wakeup |
354 | * | 471 | * |
@@ -359,23 +476,22 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on) | |||
359 | * Wakeup mode lets this IRQ wake the system from sleep | 476 | * Wakeup mode lets this IRQ wake the system from sleep |
360 | * states like "suspend to RAM". | 477 | * states like "suspend to RAM". |
361 | */ | 478 | */ |
362 | int set_irq_wake(unsigned int irq, unsigned int on) | 479 | int irq_set_irq_wake(unsigned int irq, unsigned int on) |
363 | { | 480 | { |
364 | struct irq_desc *desc = irq_to_desc(irq); | ||
365 | unsigned long flags; | 481 | unsigned long flags; |
482 | struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); | ||
366 | int ret = 0; | 483 | int ret = 0; |
367 | 484 | ||
368 | /* wakeup-capable irqs can be shared between drivers that | 485 | /* wakeup-capable irqs can be shared between drivers that |
369 | * don't need to have the same sleep mode behaviors. | 486 | * don't need to have the same sleep mode behaviors. |
370 | */ | 487 | */ |
371 | raw_spin_lock_irqsave(&desc->lock, flags); | ||
372 | if (on) { | 488 | if (on) { |
373 | if (desc->wake_depth++ == 0) { | 489 | if (desc->wake_depth++ == 0) { |
374 | ret = set_irq_wake_real(irq, on); | 490 | ret = set_irq_wake_real(irq, on); |
375 | if (ret) | 491 | if (ret) |
376 | desc->wake_depth = 0; | 492 | desc->wake_depth = 0; |
377 | else | 493 | else |
378 | desc->status |= IRQ_WAKEUP; | 494 | irqd_set(&desc->irq_data, IRQD_WAKEUP_STATE); |
379 | } | 495 | } |
380 | } else { | 496 | } else { |
381 | if (desc->wake_depth == 0) { | 497 | if (desc->wake_depth == 0) { |
@@ -385,14 +501,13 @@ int set_irq_wake(unsigned int irq, unsigned int on) | |||
385 | if (ret) | 501 | if (ret) |
386 | desc->wake_depth = 1; | 502 | desc->wake_depth = 1; |
387 | else | 503 | else |
388 | desc->status &= ~IRQ_WAKEUP; | 504 | irqd_clear(&desc->irq_data, IRQD_WAKEUP_STATE); |
389 | } | 505 | } |
390 | } | 506 | } |
391 | 507 | irq_put_desc_busunlock(desc, flags); | |
392 | raw_spin_unlock_irqrestore(&desc->lock, flags); | ||
393 | return ret; | 508 | return ret; |
394 | } | 509 | } |
395 | EXPORT_SYMBOL(set_irq_wake); | 510 | EXPORT_SYMBOL(irq_set_irq_wake); |
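Drivers normally reach irq_set_irq_wake() through the enable_irq_wake()/disable_irq_wake() wrappers, and the wake_depth refcount above lets several users of a shared wakeup line stack their requests. A sketch with hypothetical suspend/resume hooks:

#include <linux/interrupt.h>

static int demo_suspend(unsigned int wake_irq, bool may_wakeup)
{
	if (may_wakeup)
		return enable_irq_wake(wake_irq);	/* irq_set_irq_wake(irq, 1) */
	return 0;
}

static int demo_resume(unsigned int wake_irq, bool may_wakeup)
{
	if (may_wakeup)
		return disable_irq_wake(wake_irq);	/* irq_set_irq_wake(irq, 0) */
	return 0;
}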
396 | 511 | ||
397 | /* | 512 | /* |
398 | * Internal function that tells the architecture code whether a | 513 | * Internal function that tells the architecture code whether a |
@@ -401,43 +516,27 @@ EXPORT_SYMBOL(set_irq_wake); | |||
401 | */ | 516 | */ |
402 | int can_request_irq(unsigned int irq, unsigned long irqflags) | 517 | int can_request_irq(unsigned int irq, unsigned long irqflags) |
403 | { | 518 | { |
404 | struct irq_desc *desc = irq_to_desc(irq); | ||
405 | struct irqaction *action; | ||
406 | unsigned long flags; | 519 | unsigned long flags; |
520 | struct irq_desc *desc = irq_get_desc_lock(irq, &flags); | ||
521 | int canrequest = 0; | ||
407 | 522 | ||
408 | if (!desc) | 523 | if (!desc) |
409 | return 0; | 524 | return 0; |
410 | 525 | ||
411 | if (desc->status & IRQ_NOREQUEST) | 526 | if (irq_settings_can_request(desc)) { |
412 | return 0; | 527 | if (desc->action) |
413 | 528 | if (irqflags & desc->action->flags & IRQF_SHARED) | |
414 | raw_spin_lock_irqsave(&desc->lock, flags); | 529 | canrequest = 1; |
415 | action = desc->action; | 530 | } |
416 | if (action) | 531 | irq_put_desc_unlock(desc, flags); |
417 | if (irqflags & action->flags & IRQF_SHARED) | 532 | return canrequest; |
418 | action = NULL; | ||
419 | |||
420 | raw_spin_unlock_irqrestore(&desc->lock, flags); | ||
421 | |||
422 | return !action; | ||
423 | } | ||
424 | |||
425 | void compat_irq_chip_set_default_handler(struct irq_desc *desc) | ||
426 | { | ||
427 | /* | ||
428 | * If the architecture still has not overriden | ||
429 | * the flow handler then zap the default. This | ||
430 | * should catch incorrect flow-type setting. | ||
431 | */ | ||
432 | if (desc->handle_irq == &handle_bad_irq) | ||
433 | desc->handle_irq = NULL; | ||
434 | } | 533 | } |
435 | 534 | ||
436 | int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, | 535 | int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, |
437 | unsigned long flags) | 536 | unsigned long flags) |
438 | { | 537 | { |
439 | int ret; | ||
440 | struct irq_chip *chip = desc->irq_data.chip; | 538 | struct irq_chip *chip = desc->irq_data.chip; |
539 | int ret, unmask = 0; | ||
441 | 540 | ||
442 | if (!chip || !chip->irq_set_type) { | 541 | if (!chip || !chip->irq_set_type) { |
443 | /* | 542 | /* |
@@ -449,23 +548,43 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, | |||
449 | return 0; | 548 | return 0; |
450 | } | 549 | } |
451 | 550 | ||
551 | flags &= IRQ_TYPE_SENSE_MASK; | ||
552 | |||
553 | if (chip->flags & IRQCHIP_SET_TYPE_MASKED) { | ||
554 | if (!(desc->istate & IRQS_MASKED)) | ||
555 | mask_irq(desc); | ||
556 | if (!(desc->istate & IRQS_DISABLED)) | ||
557 | unmask = 1; | ||
558 | } | ||
559 | |||
452 | /* caller masked out all except trigger mode flags */ | 560 | /* caller masked out all except trigger mode flags */ |
453 | ret = chip->irq_set_type(&desc->irq_data, flags); | 561 | ret = chip->irq_set_type(&desc->irq_data, flags); |
454 | 562 | ||
455 | if (ret) | 563 | switch (ret) { |
456 | pr_err("setting trigger mode %lu for irq %u failed (%pF)\n", | 564 | case IRQ_SET_MASK_OK: |
457 | flags, irq, chip->irq_set_type); | 565 | irqd_clear(&desc->irq_data, IRQD_TRIGGER_MASK); |
458 | else { | 566 | irqd_set(&desc->irq_data, flags); |
459 | if (flags & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH)) | 567 | |
460 | flags |= IRQ_LEVEL; | 568 | case IRQ_SET_MASK_OK_NOCOPY: |
461 | /* note that IRQF_TRIGGER_MASK == IRQ_TYPE_SENSE_MASK */ | 569 | flags = irqd_get_trigger_type(&desc->irq_data); |
462 | desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK); | 570 | irq_settings_set_trigger_mask(desc, flags); |
463 | desc->status |= flags; | 571 | irqd_clear(&desc->irq_data, IRQD_LEVEL); |
572 | irq_settings_clr_level(desc); | ||
573 | if (flags & IRQ_TYPE_LEVEL_MASK) { | ||
574 | irq_settings_set_level(desc); | ||
575 | irqd_set(&desc->irq_data, IRQD_LEVEL); | ||
576 | } | ||
464 | 577 | ||
465 | if (chip != desc->irq_data.chip) | 578 | if (chip != desc->irq_data.chip) |
466 | irq_chip_set_defaults(desc->irq_data.chip); | 579 | irq_chip_set_defaults(desc->irq_data.chip); |
580 | ret = 0; | ||
581 | break; | ||
582 | default: | ||
583 | pr_err("setting trigger mode %lu for irq %u failed (%pF)\n", | ||
584 | flags, irq, chip->irq_set_type); | ||
467 | } | 585 | } |
468 | 586 | if (unmask) | |
587 | unmask_irq(desc); | ||
469 | return ret; | 588 | return ret; |
470 | } | 589 | } |
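An irq chip that sets IRQCHIP_SET_TYPE_MASKED gets its ->irq_set_type() callback invoked with the line masked by the core, and returning IRQ_SET_MASK_OK tells the core to record the new trigger type in irq_data itself. A hypothetical chip sketch, names invented:

#include <linux/irq.h>

static int demo_irq_set_type(struct irq_data *data, unsigned int type)
{
	switch (type & IRQ_TYPE_SENSE_MASK) {
	case IRQ_TYPE_EDGE_RISING:
	case IRQ_TYPE_LEVEL_HIGH:
		/*
		 * Write the (imaginary) polarity/edge register here;
		 * the core has already masked the line for us.
		 */
		return IRQ_SET_MASK_OK;
	default:
		return -EINVAL;
	}
}

static struct irq_chip demo_chip = {
	.name		= "demo",
	.irq_set_type	= demo_irq_set_type,
	.flags		= IRQCHIP_SET_TYPE_MASKED,
};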
471 | 590 | ||
@@ -509,8 +628,11 @@ static int irq_wait_for_interrupt(struct irqaction *action) | |||
509 | * handler finished. unmask if the interrupt has not been disabled and | 628 | * handler finished. unmask if the interrupt has not been disabled and |
510 | * is marked MASKED. | 629 | * is marked MASKED. |
511 | */ | 630 | */ |
512 | static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc) | 631 | static void irq_finalize_oneshot(struct irq_desc *desc, |
632 | struct irqaction *action, bool force) | ||
513 | { | 633 | { |
634 | if (!(desc->istate & IRQS_ONESHOT)) | ||
635 | return; | ||
514 | again: | 636 | again: |
515 | chip_bus_lock(desc); | 637 | chip_bus_lock(desc); |
516 | raw_spin_lock_irq(&desc->lock); | 638 | raw_spin_lock_irq(&desc->lock); |
@@ -522,26 +644,44 @@ again: | |||
522 | * The thread is faster done than the hard interrupt handler | 644 | * The thread is faster done than the hard interrupt handler |
523 | * on the other CPU. If we unmask the irq line then the | 645 | * on the other CPU. If we unmask the irq line then the |
524 | * interrupt can come in again and masks the line, leaves due | 646 | * interrupt can come in again and masks the line, leaves due |
525 | * to IRQ_INPROGRESS and the irq line is masked forever. | 647 | * to IRQS_INPROGRESS and the irq line is masked forever. |
648 | * | ||
649 | * This also serializes the state of shared oneshot handlers | ||
650 | * versus "desc->threads_onehsot |= action->thread_mask;" in | ||
651 | * irq_wake_thread(). See the comment there which explains the | ||
652 | * serialization. | ||
526 | */ | 653 | */ |
527 | if (unlikely(desc->status & IRQ_INPROGRESS)) { | 654 | if (unlikely(desc->istate & IRQS_INPROGRESS)) { |
528 | raw_spin_unlock_irq(&desc->lock); | 655 | raw_spin_unlock_irq(&desc->lock); |
529 | chip_bus_sync_unlock(desc); | 656 | chip_bus_sync_unlock(desc); |
530 | cpu_relax(); | 657 | cpu_relax(); |
531 | goto again; | 658 | goto again; |
532 | } | 659 | } |
533 | 660 | ||
534 | if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) { | 661 | /* |
535 | desc->status &= ~IRQ_MASKED; | 662 | * Now check again, whether the thread should run. Otherwise |
663 | * we would clear the threads_oneshot bit of this thread which | ||
664 | * was just set. | ||
665 | */ | ||
666 | if (!force && test_bit(IRQTF_RUNTHREAD, &action->thread_flags)) | ||
667 | goto out_unlock; | ||
668 | |||
669 | desc->threads_oneshot &= ~action->thread_mask; | ||
670 | |||
671 | if (!desc->threads_oneshot && !(desc->istate & IRQS_DISABLED) && | ||
672 | (desc->istate & IRQS_MASKED)) { | ||
673 | irq_compat_clr_masked(desc); | ||
674 | desc->istate &= ~IRQS_MASKED; | ||
536 | desc->irq_data.chip->irq_unmask(&desc->irq_data); | 675 | desc->irq_data.chip->irq_unmask(&desc->irq_data); |
537 | } | 676 | } |
677 | out_unlock: | ||
538 | raw_spin_unlock_irq(&desc->lock); | 678 | raw_spin_unlock_irq(&desc->lock); |
539 | chip_bus_sync_unlock(desc); | 679 | chip_bus_sync_unlock(desc); |
540 | } | 680 | } |
541 | 681 | ||
542 | #ifdef CONFIG_SMP | 682 | #ifdef CONFIG_SMP |
543 | /* | 683 | /* |
544 | * Check whether we need to change the affinity of the interrupt thread. | 684 | * Check whether we need to change the affinity of the interrupt thread. |
545 | */ | 685 | */ |
546 | static void | 686 | static void |
547 | irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) | 687 | irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) |
@@ -573,16 +713,49 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { } | |||
573 | #endif | 713 | #endif |
574 | 714 | ||
575 | /* | 715 | /* |
716 | * Interrupts which are not explicitly requested as threaded | ||
717 | * interrupts rely on the implicit bh/preempt disable of the hard irq | ||
718 | * context. So we need to disable bh here to avoid deadlocks and other | ||
719 | * side effects. | ||
720 | */ | ||
721 | static void | ||
722 | irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action) | ||
723 | { | ||
724 | local_bh_disable(); | ||
725 | action->thread_fn(action->irq, action->dev_id); | ||
726 | irq_finalize_oneshot(desc, action, false); | ||
727 | local_bh_enable(); | ||
728 | } | ||
729 | |||
730 | /* | ||
731 | * Interrupts explicitly requested as threaded interrupts want to be | ||
732 | * preemptible - many of them need to sleep and wait for slow busses to | ||
733 | * complete. | ||
734 | */ | ||
735 | static void irq_thread_fn(struct irq_desc *desc, struct irqaction *action) | ||
736 | { | ||
737 | action->thread_fn(action->irq, action->dev_id); | ||
738 | irq_finalize_oneshot(desc, action, false); | ||
739 | } | ||
740 | |||
741 | /* | ||
576 | * Interrupt handler thread | 742 | * Interrupt handler thread |
577 | */ | 743 | */ |
578 | static int irq_thread(void *data) | 744 | static int irq_thread(void *data) |
579 | { | 745 | { |
580 | static struct sched_param param = { | 746 | static const struct sched_param param = { |
581 | .sched_priority = MAX_USER_RT_PRIO/2, | 747 | .sched_priority = MAX_USER_RT_PRIO/2, |
582 | }; | 748 | }; |
583 | struct irqaction *action = data; | 749 | struct irqaction *action = data; |
584 | struct irq_desc *desc = irq_to_desc(action->irq); | 750 | struct irq_desc *desc = irq_to_desc(action->irq); |
585 | int wake, oneshot = desc->status & IRQ_ONESHOT; | 751 | void (*handler_fn)(struct irq_desc *desc, struct irqaction *action); |
752 | int wake; | ||
753 | |||
754 | if (force_irqthreads && test_bit(IRQTF_FORCED_THREAD, | ||
755 | &action->thread_flags)) | ||
756 | handler_fn = irq_forced_thread_fn; | ||
757 | else | ||
758 | handler_fn = irq_thread_fn; | ||
586 | 759 | ||
587 | sched_setscheduler(current, SCHED_FIFO, ¶m); | 760 | sched_setscheduler(current, SCHED_FIFO, ¶m); |
588 | current->irqaction = action; | 761 | current->irqaction = action; |
@@ -594,23 +767,20 @@ static int irq_thread(void *data) | |||
594 | atomic_inc(&desc->threads_active); | 767 | atomic_inc(&desc->threads_active); |
595 | 768 | ||
596 | raw_spin_lock_irq(&desc->lock); | 769 | raw_spin_lock_irq(&desc->lock); |
597 | if (unlikely(desc->status & IRQ_DISABLED)) { | 770 | if (unlikely(desc->istate & IRQS_DISABLED)) { |
598 | /* | 771 | /* |
599 | * CHECKME: We might need a dedicated | 772 | * CHECKME: We might need a dedicated |
600 | * IRQ_THREAD_PENDING flag here, which | 773 | * IRQ_THREAD_PENDING flag here, which |
601 | * retriggers the thread in check_irq_resend() | 774 | * retriggers the thread in check_irq_resend() |
602 | * but AFAICT IRQ_PENDING should be fine as it | 775 | * but AFAICT IRQS_PENDING should be fine as it |
603 | * retriggers the interrupt itself --- tglx | 776 | * retriggers the interrupt itself --- tglx |
604 | */ | 777 | */ |
605 | desc->status |= IRQ_PENDING; | 778 | irq_compat_set_pending(desc); |
779 | desc->istate |= IRQS_PENDING; | ||
606 | raw_spin_unlock_irq(&desc->lock); | 780 | raw_spin_unlock_irq(&desc->lock); |
607 | } else { | 781 | } else { |
608 | raw_spin_unlock_irq(&desc->lock); | 782 | raw_spin_unlock_irq(&desc->lock); |
609 | 783 | handler_fn(desc, action); | |
610 | action->thread_fn(action->irq, action->dev_id); | ||
611 | |||
612 | if (oneshot) | ||
613 | irq_finalize_oneshot(action->irq, desc); | ||
614 | } | 784 | } |
615 | 785 | ||
616 | wake = atomic_dec_and_test(&desc->threads_active); | 786 | wake = atomic_dec_and_test(&desc->threads_active); |
@@ -619,6 +789,9 @@ static int irq_thread(void *data) | |||
619 | wake_up(&desc->wait_for_threads); | 789 | wake_up(&desc->wait_for_threads); |
620 | } | 790 | } |
621 | 791 | ||
792 | /* Prevent a stale desc->threads_oneshot */ | ||
793 | irq_finalize_oneshot(desc, action, true); | ||
794 | |||
622 | /* | 795 | /* |
623 | * Clear irqaction. Otherwise exit_irq_thread() would make | 796 | * Clear irqaction. Otherwise exit_irq_thread() would make |
624 | * fuzz about an active irq thread going into nirvana. | 797 | * fuzz about an active irq thread going into nirvana. |
@@ -633,6 +806,7 @@ static int irq_thread(void *data) | |||
633 | void exit_irq_thread(void) | 806 | void exit_irq_thread(void) |
634 | { | 807 | { |
635 | struct task_struct *tsk = current; | 808 | struct task_struct *tsk = current; |
809 | struct irq_desc *desc; | ||
636 | 810 | ||
637 | if (!tsk->irqaction) | 811 | if (!tsk->irqaction) |
638 | return; | 812 | return; |
@@ -641,6 +815,14 @@ void exit_irq_thread(void) | |||
641 | "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", | 815 | "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", |
642 | tsk->comm ? tsk->comm : "", tsk->pid, tsk->irqaction->irq); | 816 | tsk->comm ? tsk->comm : "", tsk->pid, tsk->irqaction->irq); |
643 | 817 | ||
818 | desc = irq_to_desc(tsk->irqaction->irq); | ||
819 | |||
820 | /* | ||
821 | * Prevent a stale desc->threads_oneshot. Must be called | ||
822 | * before setting the IRQTF_DIED flag. | ||
823 | */ | ||
824 | irq_finalize_oneshot(desc, tsk->irqaction, true); | ||
825 | |||
644 | /* | 826 | /* |
645 | * Set the THREAD DIED flag to prevent further wakeups of the | 827 | * Set the THREAD DIED flag to prevent further wakeups of the |
646 | * soon to be gone threaded handler. | 828 | * soon to be gone threaded handler. |
@@ -648,6 +830,22 @@ void exit_irq_thread(void) | |||
648 | set_bit(IRQTF_DIED, &tsk->irqaction->flags); | 830 | set_bit(IRQTF_DIED, &tsk->irqaction->flags); |
649 | } | 831 | } |
650 | 832 | ||
833 | static void irq_setup_forced_threading(struct irqaction *new) | ||
834 | { | ||
835 | if (!force_irqthreads) | ||
836 | return; | ||
837 | if (new->flags & (IRQF_NO_THREAD | IRQF_PERCPU | IRQF_ONESHOT)) | ||
838 | return; | ||
839 | |||
840 | new->flags |= IRQF_ONESHOT; | ||
841 | |||
842 | if (!new->thread_fn) { | ||
843 | set_bit(IRQTF_FORCED_THREAD, &new->thread_flags); | ||
844 | new->thread_fn = new->handler; | ||
845 | new->handler = irq_default_primary_handler; | ||
846 | } | ||
847 | } | ||
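irq_setup_forced_threading() only takes effect when force_irqthreads is set via the "threadirqs" command line switch; it turns the primary handler into the threaded handler and installs the default primary handler in its place. Interrupts that must stay in hard irq context opt out with IRQF_NO_THREAD. A driver-side sketch with hypothetical names:

#include <linux/interrupt.h>

static irqreturn_t demo_isr(int irq, void *dev_id)
{
	/*
	 * Runs in hard irq context, or in an irq thread with bh disabled
	 * when the kernel was booted with threadirqs.
	 */
	return IRQ_HANDLED;
}

static int demo_attach(unsigned int irq, void *dev)
{
	/* IRQF_NO_THREAD keeps this handler out of forced threading */
	return request_irq(irq, demo_isr, IRQF_NO_THREAD, "demo", dev);
}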
848 | |||
651 | /* | 849 | /* |
652 | * Internal function to register an irqaction - typically used to | 850 | * Internal function to register an irqaction - typically used to |
653 | * allocate special interrupts that are part of the architecture. | 851 | * allocate special interrupts that are part of the architecture. |
@@ -657,9 +855,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
657 | { | 855 | { |
658 | struct irqaction *old, **old_ptr; | 856 | struct irqaction *old, **old_ptr; |
659 | const char *old_name = NULL; | 857 | const char *old_name = NULL; |
660 | unsigned long flags; | 858 | unsigned long flags, thread_mask = 0; |
661 | int nested, shared = 0; | 859 | int ret, nested, shared = 0; |
662 | int ret; | 860 | cpumask_var_t mask; |
663 | 861 | ||
664 | if (!desc) | 862 | if (!desc) |
665 | return -EINVAL; | 863 | return -EINVAL; |
@@ -683,15 +881,11 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
683 | rand_initialize_irq(irq); | 881 | rand_initialize_irq(irq); |
684 | } | 882 | } |
685 | 883 | ||
686 | /* Oneshot interrupts are not allowed with shared */ | ||
687 | if ((new->flags & IRQF_ONESHOT) && (new->flags & IRQF_SHARED)) | ||
688 | return -EINVAL; | ||
689 | |||
690 | /* | 884 | /* |
691 | * Check whether the interrupt nests into another interrupt | 885 | * Check whether the interrupt nests into another interrupt |
692 | * thread. | 886 | * thread. |
693 | */ | 887 | */ |
694 | nested = desc->status & IRQ_NESTED_THREAD; | 888 | nested = irq_settings_is_nested_thread(desc); |
695 | if (nested) { | 889 | if (nested) { |
696 | if (!new->thread_fn) | 890 | if (!new->thread_fn) |
697 | return -EINVAL; | 891 | return -EINVAL; |
@@ -701,6 +895,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
701 | * dummy function which warns when called. | 895 | * dummy function which warns when called. |
702 | */ | 896 | */ |
703 | new->handler = irq_nested_primary_handler; | 897 | new->handler = irq_nested_primary_handler; |
898 | } else { | ||
899 | irq_setup_forced_threading(new); | ||
704 | } | 900 | } |
705 | 901 | ||
706 | /* | 902 | /* |
@@ -724,6 +920,11 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
724 | new->thread = t; | 920 | new->thread = t; |
725 | } | 921 | } |
726 | 922 | ||
923 | if (!alloc_cpumask_var(&mask, GFP_KERNEL)) { | ||
924 | ret = -ENOMEM; | ||
925 | goto out_thread; | ||
926 | } | ||
927 | |||
727 | /* | 928 | /* |
728 | * The following block of code has to be executed atomically | 929 | * The following block of code has to be executed atomically |
729 | */ | 930 | */ |
@@ -735,29 +936,40 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
735 | * Can't share interrupts unless both agree to and are | 936 | * Can't share interrupts unless both agree to and are |
736 | * the same type (level, edge, polarity). So both flag | 937 | * the same type (level, edge, polarity). So both flag |
737 | * fields must have IRQF_SHARED set and the bits which | 938 | * fields must have IRQF_SHARED set and the bits which |
738 | * set the trigger type must match. | 939 | * set the trigger type must match. Also all must |
940 | * agree on ONESHOT. | ||
739 | */ | 941 | */ |
740 | if (!((old->flags & new->flags) & IRQF_SHARED) || | 942 | if (!((old->flags & new->flags) & IRQF_SHARED) || |
741 | ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK)) { | 943 | ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) || |
944 | ((old->flags ^ new->flags) & IRQF_ONESHOT)) { | ||
742 | old_name = old->name; | 945 | old_name = old->name; |
743 | goto mismatch; | 946 | goto mismatch; |
744 | } | 947 | } |
745 | 948 | ||
746 | #if defined(CONFIG_IRQ_PER_CPU) | ||
747 | /* All handlers must agree on per-cpuness */ | 949 | /* All handlers must agree on per-cpuness */ |
748 | if ((old->flags & IRQF_PERCPU) != | 950 | if ((old->flags & IRQF_PERCPU) != |
749 | (new->flags & IRQF_PERCPU)) | 951 | (new->flags & IRQF_PERCPU)) |
750 | goto mismatch; | 952 | goto mismatch; |
751 | #endif | ||
752 | 953 | ||
753 | /* add new interrupt at end of irq queue */ | 954 | /* add new interrupt at end of irq queue */ |
754 | do { | 955 | do { |
956 | thread_mask |= old->thread_mask; | ||
755 | old_ptr = &old->next; | 957 | old_ptr = &old->next; |
756 | old = *old_ptr; | 958 | old = *old_ptr; |
757 | } while (old); | 959 | } while (old); |
758 | shared = 1; | 960 | shared = 1; |
759 | } | 961 | } |
760 | 962 | ||
963 | /* | ||
964 | * Setup the thread mask for this irqaction. Unlikely to have | ||
965 | * 32 or 64 irqs sharing one line, but who knows. | ||
966 | */ | ||
967 | if (new->flags & IRQF_ONESHOT && thread_mask == ~0UL) { | ||
968 | ret = -EBUSY; | ||
969 | goto out_mask; | ||
970 | } | ||
971 | new->thread_mask = 1 << ffz(thread_mask); | ||
972 | |||
761 | if (!shared) { | 973 | if (!shared) { |
762 | irq_chip_set_defaults(desc->irq_data.chip); | 974 | irq_chip_set_defaults(desc->irq_data.chip); |
763 | 975 | ||
@@ -769,42 +981,44 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
769 | new->flags & IRQF_TRIGGER_MASK); | 981 | new->flags & IRQF_TRIGGER_MASK); |
770 | 982 | ||
771 | if (ret) | 983 | if (ret) |
772 | goto out_thread; | 984 | goto out_mask; |
773 | } else | 985 | } |
774 | compat_irq_chip_set_default_handler(desc); | ||
775 | #if defined(CONFIG_IRQ_PER_CPU) | ||
776 | if (new->flags & IRQF_PERCPU) | ||
777 | desc->status |= IRQ_PER_CPU; | ||
778 | #endif | ||
779 | 986 | ||
780 | desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | IRQ_ONESHOT | | 987 | desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \ |
781 | IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED); | 988 | IRQS_INPROGRESS | IRQS_ONESHOT | \ |
989 | IRQS_WAITING); | ||
990 | |||
991 | if (new->flags & IRQF_PERCPU) { | ||
992 | irqd_set(&desc->irq_data, IRQD_PER_CPU); | ||
993 | irq_settings_set_per_cpu(desc); | ||
994 | } | ||
782 | 995 | ||
783 | if (new->flags & IRQF_ONESHOT) | 996 | if (new->flags & IRQF_ONESHOT) |
784 | desc->status |= IRQ_ONESHOT; | 997 | desc->istate |= IRQS_ONESHOT; |
785 | 998 | ||
786 | if (!(desc->status & IRQ_NOAUTOEN)) { | 999 | if (irq_settings_can_autoenable(desc)) |
787 | desc->depth = 0; | 1000 | irq_startup(desc); |
788 | desc->status &= ~IRQ_DISABLED; | 1001 | else |
789 | desc->irq_data.chip->irq_startup(&desc->irq_data); | ||
790 | } else | ||
791 | /* Undo nested disables: */ | 1002 | /* Undo nested disables: */ |
792 | desc->depth = 1; | 1003 | desc->depth = 1; |
793 | 1004 | ||
794 | /* Exclude IRQ from balancing if requested */ | 1005 | /* Exclude IRQ from balancing if requested */ |
795 | if (new->flags & IRQF_NOBALANCING) | 1006 | if (new->flags & IRQF_NOBALANCING) { |
796 | desc->status |= IRQ_NO_BALANCING; | 1007 | irq_settings_set_no_balancing(desc); |
1008 | irqd_set(&desc->irq_data, IRQD_NO_BALANCING); | ||
1009 | } | ||
797 | 1010 | ||
798 | /* Set default affinity mask once everything is setup */ | 1011 | /* Set default affinity mask once everything is setup */ |
799 | setup_affinity(irq, desc); | 1012 | setup_affinity(irq, desc, mask); |
800 | 1013 | ||
801 | } else if ((new->flags & IRQF_TRIGGER_MASK) | 1014 | } else if (new->flags & IRQF_TRIGGER_MASK) { |
802 | && (new->flags & IRQF_TRIGGER_MASK) | 1015 | unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK; |
803 | != (desc->status & IRQ_TYPE_SENSE_MASK)) { | 1016 | unsigned int omsk = irq_settings_get_trigger_mask(desc); |
804 | /* hope the handler works with the actual trigger mode... */ | 1017 | |
805 | pr_warning("IRQ %d uses trigger mode %d; requested %d\n", | 1018 | if (nmsk != omsk) |
806 | irq, (int)(desc->status & IRQ_TYPE_SENSE_MASK), | 1019 | /* hope the handler works with current trigger mode */ |
807 | (int)(new->flags & IRQF_TRIGGER_MASK)); | 1020 | pr_warning("IRQ %d uses trigger mode %u; requested %u\n", |
1021 | irq, nmsk, omsk); | ||
808 | } | 1022 | } |
809 | 1023 | ||
810 | new->irq = irq; | 1024 | new->irq = irq; |
@@ -818,8 +1032,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
818 | * Check whether we disabled the irq via the spurious handler | 1032 | * Check whether we disabled the irq via the spurious handler |
819 | * before. Reenable it and give it another chance. | 1033 | * before. Reenable it and give it another chance. |
820 | */ | 1034 | */ |
821 | if (shared && (desc->status & IRQ_SPURIOUS_DISABLED)) { | 1035 | if (shared && (desc->istate & IRQS_SPURIOUS_DISABLED)) { |
822 | desc->status &= ~IRQ_SPURIOUS_DISABLED; | 1036 | desc->istate &= ~IRQS_SPURIOUS_DISABLED; |
823 | __enable_irq(desc, irq, false); | 1037 | __enable_irq(desc, irq, false); |
824 | } | 1038 | } |
825 | 1039 | ||
@@ -849,6 +1063,9 @@ mismatch: | |||
849 | #endif | 1063 | #endif |
850 | ret = -EBUSY; | 1064 | ret = -EBUSY; |
851 | 1065 | ||
1066 | out_mask: | ||
1067 | free_cpumask_var(mask); | ||
1068 | |||
852 | out_thread: | 1069 | out_thread: |
853 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 1070 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
854 | if (new->thread) { | 1071 | if (new->thread) { |
@@ -871,9 +1088,14 @@ out_thread: | |||
871 | */ | 1088 | */ |
872 | int setup_irq(unsigned int irq, struct irqaction *act) | 1089 | int setup_irq(unsigned int irq, struct irqaction *act) |
873 | { | 1090 | { |
1091 | int retval; | ||
874 | struct irq_desc *desc = irq_to_desc(irq); | 1092 | struct irq_desc *desc = irq_to_desc(irq); |
875 | 1093 | ||
876 | return __setup_irq(irq, desc, act); | 1094 | chip_bus_lock(desc); |
1095 | retval = __setup_irq(irq, desc, act); | ||
1096 | chip_bus_sync_unlock(desc); | ||
1097 | |||
1098 | return retval; | ||
877 | } | 1099 | } |
878 | EXPORT_SYMBOL_GPL(setup_irq); | 1100 | EXPORT_SYMBOL_GPL(setup_irq); |
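With the per-action thread_mask bookkeeping added above, oneshot interrupts may now be shared as long as every action on the line agrees on IRQF_ONESHOT. The typical consumer is a threaded handler for a device behind a slow bus; a sketch with hypothetical names:

#include <linux/interrupt.h>

static irqreturn_t demo_hardirq(int irq, void *dev_id)
{
	/*
	 * Quick check in hard irq context; the line stays masked
	 * until the thread below finishes (IRQF_ONESHOT).
	 */
	return IRQ_WAKE_THREAD;
}

static irqreturn_t demo_thread_fn(int irq, void *dev_id)
{
	/* talk to the slow bus here, sleeping is fine */
	return IRQ_HANDLED;
}

static int demo_request(unsigned int irq, void *dev)
{
	return request_threaded_irq(irq, demo_hardirq, demo_thread_fn,
				    IRQF_SHARED | IRQF_ONESHOT, "demo", dev);
}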
879 | 1101 | ||
@@ -924,13 +1146,8 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) | |||
924 | #endif | 1146 | #endif |
925 | 1147 | ||
926 | /* If this was the last handler, shut down the IRQ line: */ | 1148 | /* If this was the last handler, shut down the IRQ line: */ |
927 | if (!desc->action) { | 1149 | if (!desc->action) |
928 | desc->status |= IRQ_DISABLED; | 1150 | irq_shutdown(desc); |
929 | if (desc->irq_data.chip->irq_shutdown) | ||
930 | desc->irq_data.chip->irq_shutdown(&desc->irq_data); | ||
931 | else | ||
932 | desc->irq_data.chip->irq_disable(&desc->irq_data); | ||
933 | } | ||
934 | 1151 | ||
935 | #ifdef CONFIG_SMP | 1152 | #ifdef CONFIG_SMP |
936 | /* make sure affinity_hint is cleaned up */ | 1153 | /* make sure affinity_hint is cleaned up */ |
@@ -1004,6 +1221,11 @@ void free_irq(unsigned int irq, void *dev_id) | |||
1004 | if (!desc) | 1221 | if (!desc) |
1005 | return; | 1222 | return; |
1006 | 1223 | ||
1224 | #ifdef CONFIG_SMP | ||
1225 | if (WARN_ON(desc->affinity_notify)) | ||
1226 | desc->affinity_notify = NULL; | ||
1227 | #endif | ||
1228 | |||
1007 | chip_bus_lock(desc); | 1229 | chip_bus_lock(desc); |
1008 | kfree(__free_irq(irq, dev_id)); | 1230 | kfree(__free_irq(irq, dev_id)); |
1009 | chip_bus_sync_unlock(desc); | 1231 | chip_bus_sync_unlock(desc); |
@@ -1074,7 +1296,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, | |||
1074 | if (!desc) | 1296 | if (!desc) |
1075 | return -EINVAL; | 1297 | return -EINVAL; |
1076 | 1298 | ||
1077 | if (desc->status & IRQ_NOREQUEST) | 1299 | if (!irq_settings_can_request(desc)) |
1078 | return -EINVAL; | 1300 | return -EINVAL; |
1079 | 1301 | ||
1080 | if (!handler) { | 1302 | if (!handler) { |
@@ -1100,7 +1322,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, | |||
1100 | if (retval) | 1322 | if (retval) |
1101 | kfree(action); | 1323 | kfree(action); |
1102 | 1324 | ||
1103 | #ifdef CONFIG_DEBUG_SHIRQ | 1325 | #ifdef CONFIG_DEBUG_SHIRQ_FIXME |
1104 | if (!retval && (irqflags & IRQF_SHARED)) { | 1326 | if (!retval && (irqflags & IRQF_SHARED)) { |
1105 | /* | 1327 | /* |
1106 | * It's a shared IRQ -- the driver ought to be prepared for it | 1328 | * It's a shared IRQ -- the driver ought to be prepared for it |
@@ -1149,7 +1371,7 @@ int request_any_context_irq(unsigned int irq, irq_handler_t handler, | |||
1149 | if (!desc) | 1371 | if (!desc) |
1150 | return -EINVAL; | 1372 | return -EINVAL; |
1151 | 1373 | ||
1152 | if (desc->status & IRQ_NESTED_THREAD) { | 1374 | if (irq_settings_is_nested_thread(desc)) { |
1153 | ret = request_threaded_irq(irq, NULL, handler, | 1375 | ret = request_threaded_irq(irq, NULL, handler, |
1154 | flags, name, dev_id); | 1376 | flags, name, dev_id); |
1155 | return !ret ? IRQC_IS_NESTED : ret; | 1377 | return !ret ? IRQC_IS_NESTED : ret; |
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 1d2541940480..ec4806d4778b 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c | |||
@@ -4,23 +4,23 @@ | |||
4 | 4 | ||
5 | #include "internals.h" | 5 | #include "internals.h" |
6 | 6 | ||
7 | void move_masked_irq(int irq) | 7 | void irq_move_masked_irq(struct irq_data *idata) |
8 | { | 8 | { |
9 | struct irq_desc *desc = irq_to_desc(irq); | 9 | struct irq_desc *desc = irq_data_to_desc(idata); |
10 | struct irq_chip *chip = desc->irq_data.chip; | 10 | struct irq_chip *chip = idata->chip; |
11 | 11 | ||
12 | if (likely(!(desc->status & IRQ_MOVE_PENDING))) | 12 | if (likely(!irqd_is_setaffinity_pending(&desc->irq_data))) |
13 | return; | 13 | return; |
14 | 14 | ||
15 | /* | 15 | /* |
16 | * Paranoia: cpu-local interrupts shouldn't be calling in here anyway. | 16 | * Paranoia: cpu-local interrupts shouldn't be calling in here anyway. |
17 | */ | 17 | */ |
18 | if (CHECK_IRQ_PER_CPU(desc->status)) { | 18 | if (!irqd_can_balance(&desc->irq_data)) { |
19 | WARN_ON(1); | 19 | WARN_ON(1); |
20 | return; | 20 | return; |
21 | } | 21 | } |
22 | 22 | ||
23 | desc->status &= ~IRQ_MOVE_PENDING; | 23 | irqd_clr_move_pending(&desc->irq_data); |
24 | 24 | ||
25 | if (unlikely(cpumask_empty(desc->pending_mask))) | 25 | if (unlikely(cpumask_empty(desc->pending_mask))) |
26 | return; | 26 | return; |
@@ -53,18 +53,36 @@ void move_masked_irq(int irq) | |||
53 | cpumask_clear(desc->pending_mask); | 53 | cpumask_clear(desc->pending_mask); |
54 | } | 54 | } |
55 | 55 | ||
56 | void move_native_irq(int irq) | 56 | void move_masked_irq(int irq) |
57 | { | ||
58 | irq_move_masked_irq(irq_get_irq_data(irq)); | ||
59 | } | ||
60 | |||
61 | void irq_move_irq(struct irq_data *idata) | ||
57 | { | 62 | { |
58 | struct irq_desc *desc = irq_to_desc(irq); | 63 | struct irq_desc *desc = irq_data_to_desc(idata); |
64 | bool masked; | ||
59 | 65 | ||
60 | if (likely(!(desc->status & IRQ_MOVE_PENDING))) | 66 | if (likely(!irqd_is_setaffinity_pending(idata))) |
61 | return; | 67 | return; |
62 | 68 | ||
63 | if (unlikely(desc->status & IRQ_DISABLED)) | 69 | if (unlikely(desc->istate & IRQS_DISABLED)) |
64 | return; | 70 | return; |
65 | 71 | ||
66 | desc->irq_data.chip->irq_mask(&desc->irq_data); | 72 | /* |
67 | move_masked_irq(irq); | 73 | * Be careful vs. already masked interrupts. If this is a |
68 | desc->irq_data.chip->irq_unmask(&desc->irq_data); | 74 | * threaded interrupt with ONESHOT set, we can end up with an |
75 | * interrupt storm. | ||
76 | */ | ||
77 | masked = desc->istate & IRQS_MASKED; | ||
78 | if (!masked) | ||
79 | idata->chip->irq_mask(idata); | ||
80 | irq_move_masked_irq(idata); | ||
81 | if (!masked) | ||
82 | idata->chip->irq_unmask(idata); | ||
69 | } | 83 | } |
70 | 84 | ||
85 | void move_native_irq(int irq) | ||
86 | { | ||
87 | irq_move_irq(irq_get_irq_data(irq)); | ||
88 | } | ||
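irq_move_irq() is meant to be called from the chip/flow code while the interrupt is being serviced: it masks the line itself unless it is already masked, performs the pending affinity change, and unmasks again, which avoids the interrupt storm mentioned in the comment for oneshot lines. A sketch of the calling convention, with a hypothetical ack path:

#include <linux/irq.h>

static void demo_irq_ack(struct irq_data *data)
{
	/* acknowledge the (imaginary) controller for data->irq here */

	/* safe on an unmasked line: masks around the migration if needed */
	irq_move_irq(data);
}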
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index 0d4005d85b03..f76fc00c9877 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c | |||
@@ -18,7 +18,7 @@ | |||
18 | * During system-wide suspend or hibernation device drivers need to be prevented | 18 | * During system-wide suspend or hibernation device drivers need to be prevented |
19 | * from receiving interrupts and this function is provided for this purpose. | 19 | * from receiving interrupts and this function is provided for this purpose. |
20 | * It marks all interrupt lines in use, except for the timer ones, as disabled | 20 | * It marks all interrupt lines in use, except for the timer ones, as disabled |
21 | * and sets the IRQ_SUSPENDED flag for each of them. | 21 | * and sets the IRQS_SUSPENDED flag for each of them. |
22 | */ | 22 | */ |
23 | void suspend_device_irqs(void) | 23 | void suspend_device_irqs(void) |
24 | { | 24 | { |
@@ -34,7 +34,7 @@ void suspend_device_irqs(void) | |||
34 | } | 34 | } |
35 | 35 | ||
36 | for_each_irq_desc(irq, desc) | 36 | for_each_irq_desc(irq, desc) |
37 | if (desc->status & IRQ_SUSPENDED) | 37 | if (desc->istate & IRQS_SUSPENDED) |
38 | synchronize_irq(irq); | 38 | synchronize_irq(irq); |
39 | } | 39 | } |
40 | EXPORT_SYMBOL_GPL(suspend_device_irqs); | 40 | EXPORT_SYMBOL_GPL(suspend_device_irqs); |
@@ -43,7 +43,7 @@ EXPORT_SYMBOL_GPL(suspend_device_irqs); | |||
43 | * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs() | 43 | * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs() |
44 | * | 44 | * |
45 | * Enable all interrupt lines previously disabled by suspend_device_irqs() that | 45 | * Enable all interrupt lines previously disabled by suspend_device_irqs() that |
46 | * have the IRQ_SUSPENDED flag set. | 46 | * have the IRQS_SUSPENDED flag set. |
47 | */ | 47 | */ |
48 | void resume_device_irqs(void) | 48 | void resume_device_irqs(void) |
49 | { | 49 | { |
@@ -53,9 +53,6 @@ void resume_device_irqs(void) | |||
53 | for_each_irq_desc(irq, desc) { | 53 | for_each_irq_desc(irq, desc) { |
54 | unsigned long flags; | 54 | unsigned long flags; |
55 | 55 | ||
56 | if (!(desc->status & IRQ_SUSPENDED)) | ||
57 | continue; | ||
58 | |||
59 | raw_spin_lock_irqsave(&desc->lock, flags); | 56 | raw_spin_lock_irqsave(&desc->lock, flags); |
60 | __enable_irq(desc, irq, true); | 57 | __enable_irq(desc, irq, true); |
61 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 58 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
@@ -71,9 +68,24 @@ int check_wakeup_irqs(void) | |||
71 | struct irq_desc *desc; | 68 | struct irq_desc *desc; |
72 | int irq; | 69 | int irq; |
73 | 70 | ||
74 | for_each_irq_desc(irq, desc) | 71 | for_each_irq_desc(irq, desc) { |
75 | if ((desc->status & IRQ_WAKEUP) && (desc->status & IRQ_PENDING)) | 72 | if (irqd_is_wakeup_set(&desc->irq_data)) { |
76 | return -EBUSY; | 73 | if (desc->istate & IRQS_PENDING) |
74 | return -EBUSY; | ||
75 | continue; | ||
76 | } | ||
77 | /* | ||
78 | * Check whether the non-wakeup interrupts need | ||
79 | * to be masked before finally going into suspend | ||
80 | * state. That's for hardware which has no wakeup | ||
81 | * source configuration facility. The chip | ||
82 | * implementation indicates that with | ||
83 | * IRQCHIP_MASK_ON_SUSPEND. | ||
84 | */ | ||
85 | if (desc->istate & IRQS_SUSPENDED && | ||
86 | irq_desc_get_chip(desc)->flags & IRQCHIP_MASK_ON_SUSPEND) | ||
87 | mask_irq(desc); | ||
88 | } | ||
77 | 89 | ||
78 | return 0; | 90 | return 0; |
79 | } | 91 | } |
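The new IRQCHIP_MASK_ON_SUSPEND handling above is aimed at controllers with no wakeup source configuration of their own: the core masks every non-wakeup line late in the suspend path. A chip opts in by setting the flag; a minimal, hypothetical sketch:

#include <linux/irq.h>

static void demo_mask(struct irq_data *data)
{
	/* set the bit for data->irq in the (imaginary) mask register */
}

static void demo_unmask(struct irq_data *data)
{
	/* clear the bit for data->irq in the (imaginary) mask register */
}

static struct irq_chip demo_wakeup_chip = {
	.name		= "demo",
	.irq_mask	= demo_mask,
	.irq_unmask	= demo_unmask,
	.flags		= IRQCHIP_MASK_ON_SUSPEND,
};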
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 6c8a2a9f8a7b..4cc2e5ed0bec 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c | |||
@@ -11,6 +11,7 @@ | |||
11 | #include <linux/proc_fs.h> | 11 | #include <linux/proc_fs.h> |
12 | #include <linux/seq_file.h> | 12 | #include <linux/seq_file.h> |
13 | #include <linux/interrupt.h> | 13 | #include <linux/interrupt.h> |
14 | #include <linux/kernel_stat.h> | ||
14 | 15 | ||
15 | #include "internals.h" | 16 | #include "internals.h" |
16 | 17 | ||
@@ -24,7 +25,7 @@ static int irq_affinity_proc_show(struct seq_file *m, void *v) | |||
24 | const struct cpumask *mask = desc->irq_data.affinity; | 25 | const struct cpumask *mask = desc->irq_data.affinity; |
25 | 26 | ||
26 | #ifdef CONFIG_GENERIC_PENDING_IRQ | 27 | #ifdef CONFIG_GENERIC_PENDING_IRQ |
27 | if (desc->status & IRQ_MOVE_PENDING) | 28 | if (irqd_is_setaffinity_pending(&desc->irq_data)) |
28 | mask = desc->pending_mask; | 29 | mask = desc->pending_mask; |
29 | #endif | 30 | #endif |
30 | seq_cpumask(m, mask); | 31 | seq_cpumask(m, mask); |
@@ -65,8 +66,7 @@ static ssize_t irq_affinity_proc_write(struct file *file, | |||
65 | cpumask_var_t new_value; | 66 | cpumask_var_t new_value; |
66 | int err; | 67 | int err; |
67 | 68 | ||
68 | if (!irq_to_desc(irq)->irq_data.chip->irq_set_affinity || no_irq_affinity || | 69 | if (!irq_can_set_affinity(irq) || no_irq_affinity) |
69 | irq_balancing_disabled(irq)) | ||
70 | return -EIO; | 70 | return -EIO; |
71 | 71 | ||
72 | if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) | 72 | if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) |
@@ -89,7 +89,7 @@ static ssize_t irq_affinity_proc_write(struct file *file, | |||
89 | if (!cpumask_intersects(new_value, cpu_online_mask)) { | 89 | if (!cpumask_intersects(new_value, cpu_online_mask)) { |
90 | /* Special case for empty set - allow the architecture | 90 | /* Special case for empty set - allow the architecture |
91 | code to set default SMP affinity. */ | 91 | code to set default SMP affinity. */ |
92 | err = irq_select_affinity_usr(irq) ? -EINVAL : count; | 92 | err = irq_select_affinity_usr(irq, new_value) ? -EINVAL : count; |
93 | } else { | 93 | } else { |
94 | irq_set_affinity(irq, new_value); | 94 | irq_set_affinity(irq, new_value); |
95 | err = count; | 95 | err = count; |
@@ -357,3 +357,65 @@ void init_irq_proc(void) | |||
357 | } | 357 | } |
358 | } | 358 | } |
359 | 359 | ||
360 | #ifdef CONFIG_GENERIC_IRQ_SHOW | ||
361 | |||
362 | int __weak arch_show_interrupts(struct seq_file *p, int prec) | ||
363 | { | ||
364 | return 0; | ||
365 | } | ||
366 | |||
367 | int show_interrupts(struct seq_file *p, void *v) | ||
368 | { | ||
369 | static int prec; | ||
370 | |||
371 | unsigned long flags, any_count = 0; | ||
372 | int i = *(loff_t *) v, j; | ||
373 | struct irqaction *action; | ||
374 | struct irq_desc *desc; | ||
375 | |||
376 | if (i > nr_irqs) | ||
377 | return 0; | ||
378 | |||
379 | if (i == nr_irqs) | ||
380 | return arch_show_interrupts(p, prec); | ||
381 | |||
382 | /* print header and calculate the width of the first column */ | ||
383 | if (i == 0) { | ||
384 | for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec) | ||
385 | j *= 10; | ||
386 | |||
387 | seq_printf(p, "%*s", prec + 8, ""); | ||
388 | for_each_online_cpu(j) | ||
389 | seq_printf(p, "CPU%-8d", j); | ||
390 | seq_putc(p, '\n'); | ||
391 | } | ||
392 | |||
393 | desc = irq_to_desc(i); | ||
394 | if (!desc) | ||
395 | return 0; | ||
396 | |||
397 | raw_spin_lock_irqsave(&desc->lock, flags); | ||
398 | for_each_online_cpu(j) | ||
399 | any_count |= kstat_irqs_cpu(i, j); | ||
400 | action = desc->action; | ||
401 | if (!action && !any_count) | ||
402 | goto out; | ||
403 | |||
404 | seq_printf(p, "%*d: ", prec, i); | ||
405 | for_each_online_cpu(j) | ||
406 | seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); | ||
407 | seq_printf(p, " %8s", desc->irq_data.chip->name); | ||
408 | seq_printf(p, "-%-8s", desc->name); | ||
409 | |||
410 | if (action) { | ||
411 | seq_printf(p, " %s", action->name); | ||
412 | while ((action = action->next) != NULL) | ||
413 | seq_printf(p, ", %s", action->name); | ||
414 | } | ||
415 | |||
416 | seq_putc(p, '\n'); | ||
417 | out: | ||
418 | raw_spin_unlock_irqrestore(&desc->lock, flags); | ||
419 | return 0; | ||
420 | } | ||
421 | #endif | ||
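With CONFIG_GENERIC_IRQ_SHOW an architecture no longer implements show_interrupts() itself; it only overrides the weak arch_show_interrupts() to append its summary rows (NMIs, error counts and the like) below the per-irq table. A minimal sketch of such an override, with a placeholder counter:

#include <linux/interrupt.h>
#include <linux/seq_file.h>

int arch_show_interrupts(struct seq_file *p, int prec)
{
	/* e.g. an architecture-wide error counter, 0 here as a placeholder */
	seq_printf(p, "%*s: %10u\n", prec, "ERR", 0U);
	return 0;
}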
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 891115a929aa..ad683a99b1ec 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c | |||
@@ -23,7 +23,7 @@ | |||
23 | #ifdef CONFIG_HARDIRQS_SW_RESEND | 23 | #ifdef CONFIG_HARDIRQS_SW_RESEND |
24 | 24 | ||
25 | /* Bitmap to handle software resend of interrupts: */ | 25 | /* Bitmap to handle software resend of interrupts: */ |
26 | static DECLARE_BITMAP(irqs_resend, NR_IRQS); | 26 | static DECLARE_BITMAP(irqs_resend, IRQ_BITMAP_BITS); |
27 | 27 | ||
28 | /* | 28 | /* |
29 | * Run software resends of IRQ's | 29 | * Run software resends of IRQ's |
@@ -55,20 +55,19 @@ static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0); | |||
55 | */ | 55 | */ |
56 | void check_irq_resend(struct irq_desc *desc, unsigned int irq) | 56 | void check_irq_resend(struct irq_desc *desc, unsigned int irq) |
57 | { | 57 | { |
58 | unsigned int status = desc->status; | ||
59 | |||
60 | /* | ||
61 | * Make sure the interrupt is enabled, before resending it: | ||
62 | */ | ||
63 | desc->irq_data.chip->irq_enable(&desc->irq_data); | ||
64 | |||
65 | /* | 58 | /* |
66 | * We do not resend level type interrupts. Level type | 59 | * We do not resend level type interrupts. Level type |
67 | * interrupts are resent by hardware when they are still | 60 | * interrupts are resent by hardware when they are still |
68 | * active. | 61 | * active. |
69 | */ | 62 | */ |
70 | if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { | 63 | if (irq_settings_is_level(desc)) |
71 | desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY; | 64 | return; |
65 | if (desc->istate & IRQS_REPLAY) | ||
66 | return; | ||
67 | if (desc->istate & IRQS_PENDING) { | ||
68 | irq_compat_clr_pending(desc); | ||
69 | desc->istate &= ~IRQS_PENDING; | ||
70 | desc->istate |= IRQS_REPLAY; | ||
72 | 71 | ||
73 | if (!desc->irq_data.chip->irq_retrigger || | 72 | if (!desc->irq_data.chip->irq_retrigger || |
74 | !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) { | 73 | !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) { |
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h new file mode 100644 index 000000000000..0227ad358272 --- /dev/null +++ b/kernel/irq/settings.h | |||
@@ -0,0 +1,138 @@ | |||
1 | /* | ||
2 | * Internal header to deal with irq_desc->status which will be renamed | ||
3 | * to irq_desc->settings. | ||
4 | */ | ||
5 | enum { | ||
6 | _IRQ_DEFAULT_INIT_FLAGS = IRQ_DEFAULT_INIT_FLAGS, | ||
7 | _IRQ_PER_CPU = IRQ_PER_CPU, | ||
8 | _IRQ_LEVEL = IRQ_LEVEL, | ||
9 | _IRQ_NOPROBE = IRQ_NOPROBE, | ||
10 | _IRQ_NOREQUEST = IRQ_NOREQUEST, | ||
11 | _IRQ_NOAUTOEN = IRQ_NOAUTOEN, | ||
12 | _IRQ_MOVE_PCNTXT = IRQ_MOVE_PCNTXT, | ||
13 | _IRQ_NO_BALANCING = IRQ_NO_BALANCING, | ||
14 | _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD, | ||
15 | _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK, | ||
16 | }; | ||
17 | |||
18 | #define IRQ_INPROGRESS GOT_YOU_MORON | ||
19 | #define IRQ_REPLAY GOT_YOU_MORON | ||
20 | #define IRQ_WAITING GOT_YOU_MORON | ||
21 | #define IRQ_DISABLED GOT_YOU_MORON | ||
22 | #define IRQ_PENDING GOT_YOU_MORON | ||
23 | #define IRQ_MASKED GOT_YOU_MORON | ||
24 | #define IRQ_WAKEUP GOT_YOU_MORON | ||
25 | #define IRQ_MOVE_PENDING GOT_YOU_MORON | ||
26 | #define IRQ_PER_CPU GOT_YOU_MORON | ||
27 | #define IRQ_NO_BALANCING GOT_YOU_MORON | ||
28 | #define IRQ_AFFINITY_SET GOT_YOU_MORON | ||
29 | #define IRQ_LEVEL GOT_YOU_MORON | ||
30 | #define IRQ_NOPROBE GOT_YOU_MORON | ||
31 | #define IRQ_NOREQUEST GOT_YOU_MORON | ||
32 | #define IRQ_NOAUTOEN GOT_YOU_MORON | ||
33 | #define IRQ_NESTED_THREAD GOT_YOU_MORON | ||
34 | #undef IRQF_MODIFY_MASK | ||
35 | #define IRQF_MODIFY_MASK GOT_YOU_MORON | ||
36 | |||
37 | static inline void | ||
38 | irq_settings_clr_and_set(struct irq_desc *desc, u32 clr, u32 set) | ||
39 | { | ||
40 | desc->status &= ~(clr & _IRQF_MODIFY_MASK); | ||
41 | desc->status |= (set & _IRQF_MODIFY_MASK); | ||
42 | } | ||
43 | |||
44 | static inline bool irq_settings_is_per_cpu(struct irq_desc *desc) | ||
45 | { | ||
46 | return desc->status & _IRQ_PER_CPU; | ||
47 | } | ||
48 | |||
49 | static inline void irq_settings_set_per_cpu(struct irq_desc *desc) | ||
50 | { | ||
51 | desc->status |= _IRQ_PER_CPU; | ||
52 | } | ||
53 | |||
54 | static inline void irq_settings_set_no_balancing(struct irq_desc *desc) | ||
55 | { | ||
56 | desc->status |= _IRQ_NO_BALANCING; | ||
57 | } | ||
58 | |||
59 | static inline bool irq_settings_has_no_balance_set(struct irq_desc *desc) | ||
60 | { | ||
61 | return desc->status & _IRQ_NO_BALANCING; | ||
62 | } | ||
63 | |||
64 | static inline u32 irq_settings_get_trigger_mask(struct irq_desc *desc) | ||
65 | { | ||
66 | return desc->status & IRQ_TYPE_SENSE_MASK; | ||
67 | } | ||
68 | |||
69 | static inline void | ||
70 | irq_settings_set_trigger_mask(struct irq_desc *desc, u32 mask) | ||
71 | { | ||
72 | desc->status &= ~IRQ_TYPE_SENSE_MASK; | ||
73 | desc->status |= mask & IRQ_TYPE_SENSE_MASK; | ||
74 | } | ||
75 | |||
76 | static inline bool irq_settings_is_level(struct irq_desc *desc) | ||
77 | { | ||
78 | return desc->status & _IRQ_LEVEL; | ||
79 | } | ||
80 | |||
81 | static inline void irq_settings_clr_level(struct irq_desc *desc) | ||
82 | { | ||
83 | desc->status &= ~_IRQ_LEVEL; | ||
84 | } | ||
85 | |||
86 | static inline void irq_settings_set_level(struct irq_desc *desc) | ||
87 | { | ||
88 | desc->status |= _IRQ_LEVEL; | ||
89 | } | ||
90 | |||
91 | static inline bool irq_settings_can_request(struct irq_desc *desc) | ||
92 | { | ||
93 | return !(desc->status & _IRQ_NOREQUEST); | ||
94 | } | ||
95 | |||
96 | static inline void irq_settings_clr_norequest(struct irq_desc *desc) | ||
97 | { | ||
98 | desc->status &= ~_IRQ_NOREQUEST; | ||
99 | } | ||
100 | |||
101 | static inline void irq_settings_set_norequest(struct irq_desc *desc) | ||
102 | { | ||
103 | desc->status |= _IRQ_NOREQUEST; | ||
104 | } | ||
105 | |||
106 | static inline bool irq_settings_can_probe(struct irq_desc *desc) | ||
107 | { | ||
108 | return !(desc->status & _IRQ_NOPROBE); | ||
109 | } | ||
110 | |||
111 | static inline void irq_settings_clr_noprobe(struct irq_desc *desc) | ||
112 | { | ||
113 | desc->status &= ~_IRQ_NOPROBE; | ||
114 | } | ||
115 | |||
116 | static inline void irq_settings_set_noprobe(struct irq_desc *desc) | ||
117 | { | ||
118 | desc->status |= _IRQ_NOPROBE; | ||
119 | } | ||
120 | |||
121 | static inline bool irq_settings_can_move_pcntxt(struct irq_desc *desc) | ||
122 | { | ||
123 | return desc->status & _IRQ_MOVE_PCNTXT; | ||
124 | } | ||
125 | |||
126 | static inline bool irq_settings_can_autoenable(struct irq_desc *desc) | ||
127 | { | ||
128 | return !(desc->status & _IRQ_NOAUTOEN); | ||
129 | } | ||
130 | |||
131 | static inline bool irq_settings_is_nested_thread(struct irq_desc *desc) | ||
132 | { | ||
133 | return desc->status & _IRQ_NESTED_THREAD; | ||
134 | } | ||
135 | |||
136 | /* Nothing should touch desc->status from now on */ | ||
137 | #undef status | ||
138 | #define status USE_THE_PROPER_WRAPPERS_YOU_MORON | ||
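The net effect of this header is mechanical: every open-coded test of desc->status against the old IRQ_* bits is routed through an accessor, and any straggler now fails to build thanks to the GOT_YOU_MORON poisoning. The conversion pattern, sketched for an illustrative file inside kernel/irq/ that includes the new header directly:

#include <linux/irq.h>
#include "settings.h"	/* kernel/irq internal header added above */

static bool demo_line_is_level(struct irq_desc *desc)
{
	/* was: return desc->status & IRQ_LEVEL; */
	return irq_settings_is_level(desc);
}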
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 3089d3b9d5f3..dd586ebf9c8c 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c | |||
@@ -21,70 +21,94 @@ static int irqfixup __read_mostly; | |||
21 | #define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10) | 21 | #define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10) |
22 | static void poll_spurious_irqs(unsigned long dummy); | 22 | static void poll_spurious_irqs(unsigned long dummy); |
23 | static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0); | 23 | static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0); |
24 | static int irq_poll_cpu; | ||
25 | static atomic_t irq_poll_active; | ||
26 | |||
27 | /* | ||
28 | * We wait here for a poller to finish. | ||
29 | * | ||
30 | * If the poll runs on this CPU, then we yell loudly and return | ||
31 | * false. That will leave the interrupt line disabled in the worst | ||
32 | * case, but it should never happen. | ||
33 | * | ||
34 | * We wait until the poller is done and then recheck disabled and | ||
35 | * action (about to be disabled). Only if it's still active, we return | ||
36 | * true and let the handler run. | ||
37 | */ | ||
38 | bool irq_wait_for_poll(struct irq_desc *desc) | ||
39 | { | ||
40 | if (WARN_ONCE(irq_poll_cpu == smp_processor_id(), | ||
41 | "irq poll in progress on cpu %d for irq %d\n", | ||
42 | smp_processor_id(), desc->irq_data.irq)) | ||
43 | return false; | ||
44 | |||
45 | #ifdef CONFIG_SMP | ||
46 | do { | ||
47 | raw_spin_unlock(&desc->lock); | ||
48 | while (desc->istate & IRQS_INPROGRESS) | ||
49 | cpu_relax(); | ||
50 | raw_spin_lock(&desc->lock); | ||
51 | } while (desc->istate & IRQS_INPROGRESS); | ||
52 | /* Might have been disabled in meantime */ | ||
53 | return !(desc->istate & IRQS_DISABLED) && desc->action; | ||
54 | #else | ||
55 | return false; | ||
56 | #endif | ||
57 | } | ||
58 | |||
24 | 59 | ||
25 | /* | 60 | /* |
26 | * Recovery handler for misrouted interrupts. | 61 | * Recovery handler for misrouted interrupts. |
27 | */ | 62 | */ |
28 | static int try_one_irq(int irq, struct irq_desc *desc) | 63 | static int try_one_irq(int irq, struct irq_desc *desc, bool force) |
29 | { | 64 | { |
65 | irqreturn_t ret = IRQ_NONE; | ||
30 | struct irqaction *action; | 66 | struct irqaction *action; |
31 | int ok = 0, work = 0; | ||
32 | 67 | ||
33 | raw_spin_lock(&desc->lock); | 68 | raw_spin_lock(&desc->lock); |
34 | /* Already running on another processor */ | ||
35 | if (desc->status & IRQ_INPROGRESS) { | ||
36 | /* | ||
37 | * Already running: If it is shared get the other | ||
38 | * CPU to go looking for our mystery interrupt too | ||
39 | */ | ||
40 | if (desc->action && (desc->action->flags & IRQF_SHARED)) | ||
41 | desc->status |= IRQ_PENDING; | ||
42 | raw_spin_unlock(&desc->lock); | ||
43 | return ok; | ||
44 | } | ||
45 | /* Honour the normal IRQ locking */ | ||
46 | desc->status |= IRQ_INPROGRESS; | ||
47 | action = desc->action; | ||
48 | raw_spin_unlock(&desc->lock); | ||
49 | 69 | ||
50 | while (action) { | 70 | /* PER_CPU and nested thread interrupts are never polled */ |
51 | /* Only shared IRQ handlers are safe to call */ | 71 | if (irq_settings_is_per_cpu(desc) || irq_settings_is_nested_thread(desc)) |
52 | if (action->flags & IRQF_SHARED) { | 72 | goto out; |
53 | if (action->handler(irq, action->dev_id) == | ||
54 | IRQ_HANDLED) | ||
55 | ok = 1; | ||
56 | } | ||
57 | action = action->next; | ||
58 | } | ||
59 | local_irq_disable(); | ||
60 | /* Now clean up the flags */ | ||
61 | raw_spin_lock(&desc->lock); | ||
62 | action = desc->action; | ||
63 | 73 | ||
64 | /* | 74 | /* |
65 | * While we were looking for a fixup someone queued a real | 75 | * Do not poll disabled interrupts unless the spurious |
66 | * IRQ clashing with our walk: | 76 | * disabled poller asks explicitly. |
67 | */ | 77 | */ |
68 | while ((desc->status & IRQ_PENDING) && action) { | 78 | if ((desc->istate & IRQS_DISABLED) && !force) |
79 | goto out; | ||
80 | |||
81 | /* | ||
82 | * All handlers must agree on IRQF_SHARED, so we test just the | ||
83 | * first. Check for action->next as well. | ||
84 | */ | ||
85 | action = desc->action; | ||
86 | if (!action || !(action->flags & IRQF_SHARED) || | ||
87 | (action->flags & __IRQF_TIMER) || !action->next) | ||
88 | goto out; | ||
89 | |||
90 | /* Already running on another processor */ | ||
91 | if (desc->istate & IRQS_INPROGRESS) { | ||
69 | /* | 92 | /* |
70 | * Perform real IRQ processing for the IRQ we deferred | 93 | * Already running: If it is shared get the other |
94 | * CPU to go looking for our mystery interrupt too | ||
71 | */ | 95 | */ |
72 | work = 1; | 96 | irq_compat_set_pending(desc); |
73 | raw_spin_unlock(&desc->lock); | 97 | desc->istate |= IRQS_PENDING; |
74 | handle_IRQ_event(irq, action); | 98 | goto out; |
75 | raw_spin_lock(&desc->lock); | ||
76 | desc->status &= ~IRQ_PENDING; | ||
77 | } | 99 | } |
78 | desc->status &= ~IRQ_INPROGRESS; | ||
79 | /* | ||
80 | * If we did actual work for the real IRQ line we must let the | ||
81 | * IRQ controller clean up too | ||
82 | */ | ||
83 | if (work) | ||
84 | irq_end(irq, desc); | ||
85 | raw_spin_unlock(&desc->lock); | ||
86 | 100 | ||
87 | return ok; | 101 | /* Mark it poll in progress */ |
102 | desc->istate |= IRQS_POLL_INPROGRESS; | ||
103 | do { | ||
104 | if (handle_irq_event(desc) == IRQ_HANDLED) | ||
105 | ret = IRQ_HANDLED; | ||
106 | action = desc->action; | ||
107 | } while ((desc->istate & IRQS_PENDING) && action); | ||
108 | desc->istate &= ~IRQS_POLL_INPROGRESS; | ||
109 | out: | ||
110 | raw_spin_unlock(&desc->lock); | ||
111 | return ret == IRQ_HANDLED; | ||
88 | } | 112 | } |
89 | 113 | ||
90 | static int misrouted_irq(int irq) | 114 | static int misrouted_irq(int irq) |
@@ -92,6 +116,11 @@ static int misrouted_irq(int irq) | |||
92 | struct irq_desc *desc; | 116 | struct irq_desc *desc; |
93 | int i, ok = 0; | 117 | int i, ok = 0; |
94 | 118 | ||
119 | if (atomic_inc_return(&irq_poll_active) == 1) | ||
120 | goto out; | ||
121 | |||
122 | irq_poll_cpu = smp_processor_id(); | ||
123 | |||
95 | for_each_irq_desc(i, desc) { | 124 | for_each_irq_desc(i, desc) { |
96 | if (!i) | 125 | if (!i) |
97 | continue; | 126 | continue; |
@@ -99,9 +128,11 @@ static int misrouted_irq(int irq) | |||
99 | if (i == irq) /* Already tried */ | 128 | if (i == irq) /* Already tried */ |
100 | continue; | 129 | continue; |
101 | 130 | ||
102 | if (try_one_irq(i, desc)) | 131 | if (try_one_irq(i, desc, false)) |
103 | ok = 1; | 132 | ok = 1; |
104 | } | 133 | } |
134 | out: | ||
135 | atomic_dec(&irq_poll_active); | ||
105 | /* So the caller can adjust the irq error counts */ | 136 | /* So the caller can adjust the irq error counts */ |
106 | return ok; | 137 | return ok; |
107 | } | 138 | } |
@@ -111,23 +142,28 @@ static void poll_spurious_irqs(unsigned long dummy) | |||
111 | struct irq_desc *desc; | 142 | struct irq_desc *desc; |
112 | int i; | 143 | int i; |
113 | 144 | ||
145 | if (atomic_inc_return(&irq_poll_active) != 1) | ||
146 | goto out; | ||
147 | irq_poll_cpu = smp_processor_id(); | ||
148 | |||
114 | for_each_irq_desc(i, desc) { | 149 | for_each_irq_desc(i, desc) { |
115 | unsigned int status; | 150 | unsigned int state; |
116 | 151 | ||
117 | if (!i) | 152 | if (!i) |
118 | continue; | 153 | continue; |
119 | 154 | ||
120 | /* Racy but it doesn't matter */ | 155 | /* Racy but it doesn't matter */ |
121 | status = desc->status; | 156 | state = desc->istate; |
122 | barrier(); | 157 | barrier(); |
123 | if (!(status & IRQ_SPURIOUS_DISABLED)) | 158 | if (!(state & IRQS_SPURIOUS_DISABLED)) |
124 | continue; | 159 | continue; |
125 | 160 | ||
126 | local_irq_disable(); | 161 | local_irq_disable(); |
127 | try_one_irq(i, desc); | 162 | try_one_irq(i, desc, true); |
128 | local_irq_enable(); | 163 | local_irq_enable(); |
129 | } | 164 | } |
130 | 165 | out: | |
166 | atomic_dec(&irq_poll_active); | ||
131 | mod_timer(&poll_spurious_irq_timer, | 167 | mod_timer(&poll_spurious_irq_timer, |
132 | jiffies + POLL_SPURIOUS_IRQ_INTERVAL); | 168 | jiffies + POLL_SPURIOUS_IRQ_INTERVAL); |
133 | } | 169 | } |
@@ -139,15 +175,13 @@ static void poll_spurious_irqs(unsigned long dummy) | |||
139 | * | 175 | * |
140 | * (The other 100-of-100,000 interrupts may have been a correctly | 176 | * (The other 100-of-100,000 interrupts may have been a correctly |
141 | * functioning device sharing an IRQ with the failing one) | 177 | * functioning device sharing an IRQ with the failing one) |
142 | * | ||
143 | * Called under desc->lock | ||
144 | */ | 178 | */ |
145 | |||
146 | static void | 179 | static void |
147 | __report_bad_irq(unsigned int irq, struct irq_desc *desc, | 180 | __report_bad_irq(unsigned int irq, struct irq_desc *desc, |
148 | irqreturn_t action_ret) | 181 | irqreturn_t action_ret) |
149 | { | 182 | { |
150 | struct irqaction *action; | 183 | struct irqaction *action; |
184 | unsigned long flags; | ||
151 | 185 | ||
152 | if (action_ret != IRQ_HANDLED && action_ret != IRQ_NONE) { | 186 | if (action_ret != IRQ_HANDLED && action_ret != IRQ_NONE) { |
153 | printk(KERN_ERR "irq event %d: bogus return value %x\n", | 187 | printk(KERN_ERR "irq event %d: bogus return value %x\n", |
@@ -159,6 +193,13 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc, | |||
159 | dump_stack(); | 193 | dump_stack(); |
160 | printk(KERN_ERR "handlers:\n"); | 194 | printk(KERN_ERR "handlers:\n"); |
161 | 195 | ||
196 | /* | ||
197 | * We need to take desc->lock here. note_interrupt() is called | ||
198 | * w/o desc->lock held, but IRQ_PROGRESS set. We might race | ||
199 | * with something else removing an action. It's ok to take | ||
200 | * desc->lock here. See synchronize_irq(). | ||
201 | */ | ||
202 | raw_spin_lock_irqsave(&desc->lock, flags); | ||
162 | action = desc->action; | 203 | action = desc->action; |
163 | while (action) { | 204 | while (action) { |
164 | printk(KERN_ERR "[<%p>]", action->handler); | 205 | printk(KERN_ERR "[<%p>]", action->handler); |
@@ -167,6 +208,7 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc, | |||
167 | printk("\n"); | 208 | printk("\n"); |
168 | action = action->next; | 209 | action = action->next; |
169 | } | 210 | } |
211 | raw_spin_unlock_irqrestore(&desc->lock, flags); | ||
170 | } | 212 | } |
171 | 213 | ||
172 | static void | 214 | static void |
@@ -218,6 +260,9 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc, | |||
218 | void note_interrupt(unsigned int irq, struct irq_desc *desc, | 260 | void note_interrupt(unsigned int irq, struct irq_desc *desc, |
219 | irqreturn_t action_ret) | 261 | irqreturn_t action_ret) |
220 | { | 262 | { |
263 | if (desc->istate & IRQS_POLL_INPROGRESS) | ||
264 | return; | ||
265 | |||
221 | if (unlikely(action_ret != IRQ_HANDLED)) { | 266 | if (unlikely(action_ret != IRQ_HANDLED)) { |
222 | /* | 267 | /* |
223 | * If we are seeing only the odd spurious IRQ caused by | 268 | * If we are seeing only the odd spurious IRQ caused by |
@@ -254,9 +299,9 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, | |||
254 | * Now kill the IRQ | 299 | * Now kill the IRQ |
255 | */ | 300 | */ |
256 | printk(KERN_EMERG "Disabling IRQ #%d\n", irq); | 301 | printk(KERN_EMERG "Disabling IRQ #%d\n", irq); |
257 | desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED; | 302 | desc->istate |= IRQS_SPURIOUS_DISABLED; |
258 | desc->depth++; | 303 | desc->depth++; |
259 | desc->irq_data.chip->irq_disable(&desc->irq_data); | 304 | irq_disable(desc); |
260 | 305 | ||
261 | mod_timer(&poll_spurious_irq_timer, | 306 | mod_timer(&poll_spurious_irq_timer, |
262 | jiffies + POLL_SPURIOUS_IRQ_INTERVAL); | 307 | jiffies + POLL_SPURIOUS_IRQ_INTERVAL); |
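
For illustration, a minimal userspace sketch (plain C11, not kernel code) of the single-poller gate used in poll_spurious_irqs() above: the first caller to raise the counter from zero gets to poll, everyone else backs off immediately. The per-CPU bookkeeping, descriptor locking and irq_poll_cpu recording are left out. Build with gcc -std=c11 -pthread.

#include <stdatomic.h>
#include <stdio.h>
#include <pthread.h>

static atomic_int poll_active = 0;     /* stand-in for irq_poll_active */

static void poll_all_irqs(void)
{
	/* stand-in for walking every interrupt descriptor */
	puts("polling");
}

static void *try_poll(void *arg)
{
	(void)arg;
	/* Only the thread that raises the count from 0 to 1 may poll;
	 * every later entrant backs off straight away. */
	if (atomic_fetch_add(&poll_active, 1) + 1 != 1)
		goto out;
	poll_all_irqs();
out:
	atomic_fetch_sub(&poll_active, 1);
	return NULL;
}

int main(void)
{
	pthread_t t[4];
	for (int i = 0; i < 4; i++)
		pthread_create(&t[i], NULL, try_poll, NULL);
	for (int i = 0; i < 4; i++)
		pthread_join(t[i], NULL);
	return 0;
}

In the kernel variant, recording irq_poll_cpu additionally lets irq_wait_for_poll() warn if the poll ever recurses onto the polling CPU itself.
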
diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 90f881904bb1..c58fa7da8aef 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c | |||
@@ -77,21 +77,21 @@ void __weak arch_irq_work_raise(void) | |||
77 | */ | 77 | */ |
78 | static void __irq_work_queue(struct irq_work *entry) | 78 | static void __irq_work_queue(struct irq_work *entry) |
79 | { | 79 | { |
80 | struct irq_work **head, *next; | 80 | struct irq_work *next; |
81 | 81 | ||
82 | head = &get_cpu_var(irq_work_list); | 82 | preempt_disable(); |
83 | 83 | ||
84 | do { | 84 | do { |
85 | next = *head; | 85 | next = __this_cpu_read(irq_work_list); |
86 | /* Can assign non-atomic because we keep the flags set. */ | 86 | /* Can assign non-atomic because we keep the flags set. */ |
87 | entry->next = next_flags(next, IRQ_WORK_FLAGS); | 87 | entry->next = next_flags(next, IRQ_WORK_FLAGS); |
88 | } while (cmpxchg(head, next, entry) != next); | 88 | } while (this_cpu_cmpxchg(irq_work_list, next, entry) != next); |
89 | 89 | ||
90 | /* The list was empty, raise self-interrupt to start processing. */ | 90 | /* The list was empty, raise self-interrupt to start processing. */ |
91 | if (!irq_work_next(entry)) | 91 | if (!irq_work_next(entry)) |
92 | arch_irq_work_raise(); | 92 | arch_irq_work_raise(); |
93 | 93 | ||
94 | put_cpu_var(irq_work_list); | 94 | preempt_enable(); |
95 | } | 95 | } |
96 | 96 | ||
97 | /* | 97 | /* |
@@ -120,16 +120,16 @@ EXPORT_SYMBOL_GPL(irq_work_queue); | |||
120 | */ | 120 | */ |
121 | void irq_work_run(void) | 121 | void irq_work_run(void) |
122 | { | 122 | { |
123 | struct irq_work *list, **head; | 123 | struct irq_work *list; |
124 | 124 | ||
125 | head = &__get_cpu_var(irq_work_list); | 125 | if (this_cpu_read(irq_work_list) == NULL) |
126 | if (*head == NULL) | ||
127 | return; | 126 | return; |
128 | 127 | ||
129 | BUG_ON(!in_irq()); | 128 | BUG_ON(!in_irq()); |
130 | BUG_ON(!irqs_disabled()); | 129 | BUG_ON(!irqs_disabled()); |
131 | 130 | ||
132 | list = xchg(head, NULL); | 131 | list = this_cpu_xchg(irq_work_list, NULL); |
132 | |||
133 | while (list != NULL) { | 133 | while (list != NULL) { |
134 | struct irq_work *entry = list; | 134 | struct irq_work *entry = list; |
135 | 135 | ||
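
The irq_work change above keeps the same lock-free idea: work items are pushed onto a per-CPU singly linked list with a compare-and-swap and drained in one shot with an exchange; only the accessors change to the this_cpu_*() forms. A userspace sketch of just that push/drain pattern (the per-CPU placement and the IRQ_WORK_FLAGS tagging of the pointer's low bits are omitted); build with gcc -std=c11.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct work {
	struct work *next;
	int payload;
};

static _Atomic(struct work *) work_list;

static void work_queue(struct work *w)
{
	struct work *head = atomic_load(&work_list);

	/* Retry until the head is atomically swung to the new entry. */
	do {
		w->next = head;
	} while (!atomic_compare_exchange_weak(&work_list, &head, w));

	/* If the old head was NULL the list was empty: this is the point
	 * where the kernel raises the self-interrupt to start processing. */
	if (!head)
		puts("raise");
}

static void work_run(void)
{
	/* Grab the whole list in one shot, leaving it empty for producers. */
	struct work *w = atomic_exchange(&work_list, NULL);

	while (w) {
		struct work *next = w->next;
		printf("run %d\n", w->payload);
		free(w);
		w = next;
	}
}

int main(void)
{
	for (int i = 0; i < 3; i++) {
		struct work *w = malloc(sizeof(*w));
		w->payload = i;
		work_queue(w);
	}
	work_run();
	return 0;
}
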
diff --git a/kernel/kexec.c b/kernel/kexec.c index b55045bc7563..ec19b92c7ebd 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c | |||
@@ -163,7 +163,7 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, | |||
163 | * just verifies it is an address we can use. | 163 | * just verifies it is an address we can use. |
164 | * | 164 | * |
165 | * Since the kernel does everything in page size chunks ensure | 165 | * Since the kernel does everything in page size chunks ensure |
166 | * the destination addreses are page aligned. Too many | 166 | * the destination addresses are page aligned. Too many |
167 | * special cases crop of when we don't do this. The most | 167 | * special cases crop of when we don't do this. The most |
168 | * insidious is getting overlapping destination addresses | 168 | * insidious is getting overlapping destination addresses |
169 | * simply because addresses are changed to page size | 169 | * simply because addresses are changed to page size |
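
The comment being fixed above is about requiring page-aligned destination addresses; as a quick userspace illustration of the usual alignment test (4 KiB pages assumed):

#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE 4096u

static int page_aligned(uint64_t addr)
{
	/* aligned iff the low page-offset bits are all zero */
	return (addr & (PAGE_SIZE - 1)) == 0;
}

int main(void)
{
	printf("%d %d\n", page_aligned(0x100000), page_aligned(0x100010));
	return 0;
}
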
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 7663e5df0e6f..77981813a1e7 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
@@ -317,12 +317,12 @@ void __kprobes free_optinsn_slot(kprobe_opcode_t * slot, int dirty) | |||
317 | /* We have preemption disabled.. so it is safe to use __ versions */ | 317 | /* We have preemption disabled.. so it is safe to use __ versions */ |
318 | static inline void set_kprobe_instance(struct kprobe *kp) | 318 | static inline void set_kprobe_instance(struct kprobe *kp) |
319 | { | 319 | { |
320 | __get_cpu_var(kprobe_instance) = kp; | 320 | __this_cpu_write(kprobe_instance, kp); |
321 | } | 321 | } |
322 | 322 | ||
323 | static inline void reset_kprobe_instance(void) | 323 | static inline void reset_kprobe_instance(void) |
324 | { | 324 | { |
325 | __get_cpu_var(kprobe_instance) = NULL; | 325 | __this_cpu_write(kprobe_instance, NULL); |
326 | } | 326 | } |
327 | 327 | ||
328 | /* | 328 | /* |
@@ -965,7 +965,7 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs, | |||
965 | static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, | 965 | static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, |
966 | int trapnr) | 966 | int trapnr) |
967 | { | 967 | { |
968 | struct kprobe *cur = __get_cpu_var(kprobe_instance); | 968 | struct kprobe *cur = __this_cpu_read(kprobe_instance); |
969 | 969 | ||
970 | /* | 970 | /* |
971 | * if we faulted "during" the execution of a user specified | 971 | * if we faulted "during" the execution of a user specified |
@@ -980,7 +980,7 @@ static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, | |||
980 | 980 | ||
981 | static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs) | 981 | static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs) |
982 | { | 982 | { |
983 | struct kprobe *cur = __get_cpu_var(kprobe_instance); | 983 | struct kprobe *cur = __this_cpu_read(kprobe_instance); |
984 | int ret = 0; | 984 | int ret = 0; |
985 | 985 | ||
986 | if (cur && cur->break_handler) { | 986 | if (cur && cur->break_handler) { |
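
The kprobes change swaps the address-then-dereference per-CPU accessors for single __this_cpu_read()/__this_cpu_write() operations. As a rough userspace analogy only, the per-CPU "current kprobe instance" slot behaves much like a thread-local pointer:

#include <stdio.h>

struct kprobe { const char *name; };

static _Thread_local struct kprobe *kprobe_instance;

static void set_kprobe_instance(struct kprobe *kp)
{
	kprobe_instance = kp;                 /* analogue of __this_cpu_write() */
}

static void reset_kprobe_instance(void)
{
	kprobe_instance = NULL;
}

static void fault_handler(void)
{
	struct kprobe *cur = kprobe_instance; /* analogue of __this_cpu_read() */

	if (cur)
		printf("fault while in %s\n", cur->name);
}

int main(void)
{
	struct kprobe kp = { .name = "demo" };

	set_kprobe_instance(&kp);
	fault_handler();
	reset_kprobe_instance();
	return 0;
}
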
diff --git a/kernel/kthread.c b/kernel/kthread.c index 5355cfd44a3f..c55afba990a3 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
@@ -148,7 +148,7 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), | |||
148 | wait_for_completion(&create.done); | 148 | wait_for_completion(&create.done); |
149 | 149 | ||
150 | if (!IS_ERR(create.result)) { | 150 | if (!IS_ERR(create.result)) { |
151 | static struct sched_param param = { .sched_priority = 0 }; | 151 | static const struct sched_param param = { .sched_priority = 0 }; |
152 | va_list args; | 152 | va_list args; |
153 | 153 | ||
154 | va_start(args, namefmt); | 154 | va_start(args, namefmt); |
diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 17110a4a4fc2..ee74b35e528d 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c | |||
@@ -241,24 +241,19 @@ static int lstats_show(struct seq_file *m, void *v) | |||
241 | seq_puts(m, "Latency Top version : v0.1\n"); | 241 | seq_puts(m, "Latency Top version : v0.1\n"); |
242 | 242 | ||
243 | for (i = 0; i < MAXLR; i++) { | 243 | for (i = 0; i < MAXLR; i++) { |
244 | if (latency_record[i].backtrace[0]) { | 244 | struct latency_record *lr = &latency_record[i]; |
245 | |||
246 | if (lr->backtrace[0]) { | ||
245 | int q; | 247 | int q; |
246 | seq_printf(m, "%i %lu %lu ", | 248 | seq_printf(m, "%i %lu %lu", |
247 | latency_record[i].count, | 249 | lr->count, lr->time, lr->max); |
248 | latency_record[i].time, | ||
249 | latency_record[i].max); | ||
250 | for (q = 0; q < LT_BACKTRACEDEPTH; q++) { | 250 | for (q = 0; q < LT_BACKTRACEDEPTH; q++) { |
251 | char sym[KSYM_SYMBOL_LEN]; | 251 | unsigned long bt = lr->backtrace[q]; |
252 | char *c; | 252 | if (!bt) |
253 | if (!latency_record[i].backtrace[q]) | ||
254 | break; | 253 | break; |
255 | if (latency_record[i].backtrace[q] == ULONG_MAX) | 254 | if (bt == ULONG_MAX) |
256 | break; | 255 | break; |
257 | sprint_symbol(sym, latency_record[i].backtrace[q]); | 256 | seq_printf(m, " %ps", (void *)bt); |
258 | c = strchr(sym, '+'); | ||
259 | if (c) | ||
260 | *c = 0; | ||
261 | seq_printf(m, "%s ", sym); | ||
262 | } | 257 | } |
263 | seq_printf(m, "\n"); | 258 | seq_printf(m, "\n"); |
264 | } | 259 | } |
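
The latencytop change replaces the sprint_symbol()-and-trim dance with the %ps printk conversion, which prints the symbol name for a code address directly. A userspace sketch of the same idea using dladdr() (glibc assumed; build with gcc demo.c and possibly -ldl):

#define _GNU_SOURCE
#include <dlfcn.h>
#include <stdio.h>

int main(void)
{
	void *addr = (void *)printf;   /* any code address will do */
	Dl_info info;

	/* dladdr() plays roughly the role the kernel's symbol lookup plays
	 * behind %ps: address in, symbol name out. */
	if (dladdr(addr, &info) && info.dli_sname)
		printf("%p is %s\n", addr, info.dli_sname);
	else
		printf("%p (no symbol found)\n", addr);
	return 0;
}
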
diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 42ba65dff7d9..0d2058da80f5 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c | |||
@@ -2292,22 +2292,6 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark) | |||
2292 | } | 2292 | } |
2293 | 2293 | ||
2294 | /* | 2294 | /* |
2295 | * Debugging helper: via this flag we know that we are in | ||
2296 | * 'early bootup code', and will warn about any invalid irqs-on event: | ||
2297 | */ | ||
2298 | static int early_boot_irqs_enabled; | ||
2299 | |||
2300 | void early_boot_irqs_off(void) | ||
2301 | { | ||
2302 | early_boot_irqs_enabled = 0; | ||
2303 | } | ||
2304 | |||
2305 | void early_boot_irqs_on(void) | ||
2306 | { | ||
2307 | early_boot_irqs_enabled = 1; | ||
2308 | } | ||
2309 | |||
2310 | /* | ||
2311 | * Hardirqs will be enabled: | 2295 | * Hardirqs will be enabled: |
2312 | */ | 2296 | */ |
2313 | void trace_hardirqs_on_caller(unsigned long ip) | 2297 | void trace_hardirqs_on_caller(unsigned long ip) |
@@ -2319,7 +2303,7 @@ void trace_hardirqs_on_caller(unsigned long ip) | |||
2319 | if (unlikely(!debug_locks || current->lockdep_recursion)) | 2303 | if (unlikely(!debug_locks || current->lockdep_recursion)) |
2320 | return; | 2304 | return; |
2321 | 2305 | ||
2322 | if (DEBUG_LOCKS_WARN_ON(unlikely(!early_boot_irqs_enabled))) | 2306 | if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled))) |
2323 | return; | 2307 | return; |
2324 | 2308 | ||
2325 | if (unlikely(curr->hardirqs_enabled)) { | 2309 | if (unlikely(curr->hardirqs_enabled)) { |
diff --git a/kernel/module.c b/kernel/module.c index 34e00b708fad..efa290ea94bf 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -2460,9 +2460,9 @@ static void find_module_sections(struct module *mod, struct load_info *info) | |||
2460 | #endif | 2460 | #endif |
2461 | 2461 | ||
2462 | #ifdef CONFIG_TRACEPOINTS | 2462 | #ifdef CONFIG_TRACEPOINTS |
2463 | mod->tracepoints = section_objs(info, "__tracepoints", | 2463 | mod->tracepoints_ptrs = section_objs(info, "__tracepoints_ptrs", |
2464 | sizeof(*mod->tracepoints), | 2464 | sizeof(*mod->tracepoints_ptrs), |
2465 | &mod->num_tracepoints); | 2465 | &mod->num_tracepoints); |
2466 | #endif | 2466 | #endif |
2467 | #ifdef HAVE_JUMP_LABEL | 2467 | #ifdef HAVE_JUMP_LABEL |
2468 | mod->jump_entries = section_objs(info, "__jump_table", | 2468 | mod->jump_entries = section_objs(info, "__jump_table", |
@@ -3393,7 +3393,7 @@ void module_layout(struct module *mod, | |||
3393 | struct modversion_info *ver, | 3393 | struct modversion_info *ver, |
3394 | struct kernel_param *kp, | 3394 | struct kernel_param *kp, |
3395 | struct kernel_symbol *ks, | 3395 | struct kernel_symbol *ks, |
3396 | struct tracepoint *tp) | 3396 | struct tracepoint * const *tp) |
3397 | { | 3397 | { |
3398 | } | 3398 | } |
3399 | EXPORT_SYMBOL(module_layout); | 3399 | EXPORT_SYMBOL(module_layout); |
@@ -3407,8 +3407,8 @@ void module_update_tracepoints(void) | |||
3407 | mutex_lock(&module_mutex); | 3407 | mutex_lock(&module_mutex); |
3408 | list_for_each_entry(mod, &modules, list) | 3408 | list_for_each_entry(mod, &modules, list) |
3409 | if (!mod->taints) | 3409 | if (!mod->taints) |
3410 | tracepoint_update_probe_range(mod->tracepoints, | 3410 | tracepoint_update_probe_range(mod->tracepoints_ptrs, |
3411 | mod->tracepoints + mod->num_tracepoints); | 3411 | mod->tracepoints_ptrs + mod->num_tracepoints); |
3412 | mutex_unlock(&module_mutex); | 3412 | mutex_unlock(&module_mutex); |
3413 | } | 3413 | } |
3414 | 3414 | ||
@@ -3432,8 +3432,8 @@ int module_get_iter_tracepoints(struct tracepoint_iter *iter) | |||
3432 | else if (iter_mod > iter->module) | 3432 | else if (iter_mod > iter->module) |
3433 | iter->tracepoint = NULL; | 3433 | iter->tracepoint = NULL; |
3434 | found = tracepoint_get_iter_range(&iter->tracepoint, | 3434 | found = tracepoint_get_iter_range(&iter->tracepoint, |
3435 | iter_mod->tracepoints, | 3435 | iter_mod->tracepoints_ptrs, |
3436 | iter_mod->tracepoints | 3436 | iter_mod->tracepoints_ptrs |
3437 | + iter_mod->num_tracepoints); | 3437 | + iter_mod->num_tracepoints); |
3438 | if (found) { | 3438 | if (found) { |
3439 | iter->module = iter_mod; | 3439 | iter->module = iter_mod; |
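
The module changes above switch from an embedded array of struct tracepoint to an array of pointers (__tracepoints_ptrs), so every walk takes one extra dereference. A small userspace sketch of that layout:

#include <stdio.h>

struct tracepoint { const char *name; };

static struct tracepoint tp_a = { "a" }, tp_b = { "b" };

/* analogue of what the __tracepoints_ptrs section now contains */
static struct tracepoint * const tracepoints_ptrs[] = { &tp_a, &tp_b };

int main(void)
{
	for (unsigned i = 0; i < sizeof(tracepoints_ptrs) / sizeof(tracepoints_ptrs[0]); i++)
		printf("tracepoint: %s\n", tracepoints_ptrs[i]->name);
	return 0;
}
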
diff --git a/kernel/panic.c b/kernel/panic.c index 4c13b1a88ebb..991bb87a1704 100644 --- a/kernel/panic.c +++ b/kernel/panic.c | |||
@@ -34,6 +34,7 @@ static int pause_on_oops_flag; | |||
34 | static DEFINE_SPINLOCK(pause_on_oops_lock); | 34 | static DEFINE_SPINLOCK(pause_on_oops_lock); |
35 | 35 | ||
36 | int panic_timeout; | 36 | int panic_timeout; |
37 | EXPORT_SYMBOL_GPL(panic_timeout); | ||
37 | 38 | ||
38 | ATOMIC_NOTIFIER_HEAD(panic_notifier_list); | 39 | ATOMIC_NOTIFIER_HEAD(panic_notifier_list); |
39 | 40 | ||
diff --git a/kernel/params.c b/kernel/params.c index 08107d181758..0da1411222b9 100644 --- a/kernel/params.c +++ b/kernel/params.c | |||
@@ -719,9 +719,7 @@ void destroy_params(const struct kernel_param *params, unsigned num) | |||
719 | params[i].ops->free(params[i].arg); | 719 | params[i].ops->free(params[i].arg); |
720 | } | 720 | } |
721 | 721 | ||
722 | static void __init kernel_add_sysfs_param(const char *name, | 722 | static struct module_kobject * __init locate_module_kobject(const char *name) |
723 | struct kernel_param *kparam, | ||
724 | unsigned int name_skip) | ||
725 | { | 723 | { |
726 | struct module_kobject *mk; | 724 | struct module_kobject *mk; |
727 | struct kobject *kobj; | 725 | struct kobject *kobj; |
@@ -729,10 +727,7 @@ static void __init kernel_add_sysfs_param(const char *name, | |||
729 | 727 | ||
730 | kobj = kset_find_obj(module_kset, name); | 728 | kobj = kset_find_obj(module_kset, name); |
731 | if (kobj) { | 729 | if (kobj) { |
732 | /* We already have one. Remove params so we can add more. */ | ||
733 | mk = to_module_kobject(kobj); | 730 | mk = to_module_kobject(kobj); |
734 | /* We need to remove it before adding parameters. */ | ||
735 | sysfs_remove_group(&mk->kobj, &mk->mp->grp); | ||
736 | } else { | 731 | } else { |
737 | mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL); | 732 | mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL); |
738 | BUG_ON(!mk); | 733 | BUG_ON(!mk); |
@@ -743,15 +738,36 @@ static void __init kernel_add_sysfs_param(const char *name, | |||
743 | "%s", name); | 738 | "%s", name); |
744 | if (err) { | 739 | if (err) { |
745 | kobject_put(&mk->kobj); | 740 | kobject_put(&mk->kobj); |
746 | printk(KERN_ERR "Module '%s' failed add to sysfs, " | 741 | printk(KERN_ERR |
747 | "error number %d\n", name, err); | 742 | "Module '%s' failed add to sysfs, error number %d\n", |
748 | printk(KERN_ERR "The system will be unstable now.\n"); | 743 | name, err); |
749 | return; | 744 | printk(KERN_ERR |
745 | "The system will be unstable now.\n"); | ||
746 | return NULL; | ||
750 | } | 747 | } |
751 | /* So that exit path is even. */ | 748 | |
749 | /* So that we hold reference in both cases. */ | ||
752 | kobject_get(&mk->kobj); | 750 | kobject_get(&mk->kobj); |
753 | } | 751 | } |
754 | 752 | ||
753 | return mk; | ||
754 | } | ||
755 | |||
756 | static void __init kernel_add_sysfs_param(const char *name, | ||
757 | struct kernel_param *kparam, | ||
758 | unsigned int name_skip) | ||
759 | { | ||
760 | struct module_kobject *mk; | ||
761 | int err; | ||
762 | |||
763 | mk = locate_module_kobject(name); | ||
764 | if (!mk) | ||
765 | return; | ||
766 | |||
767 | /* We need to remove old parameters before adding more. */ | ||
768 | if (mk->mp) | ||
769 | sysfs_remove_group(&mk->kobj, &mk->mp->grp); | ||
770 | |||
755 | /* These should not fail at boot. */ | 771 | /* These should not fail at boot. */ |
756 | err = add_sysfs_param(mk, kparam, kparam->name + name_skip); | 772 | err = add_sysfs_param(mk, kparam, kparam->name + name_skip); |
757 | BUG_ON(err); | 773 | BUG_ON(err); |
@@ -796,6 +812,32 @@ static void __init param_sysfs_builtin(void) | |||
796 | } | 812 | } |
797 | } | 813 | } |
798 | 814 | ||
815 | ssize_t __modver_version_show(struct module_attribute *mattr, | ||
816 | struct module *mod, char *buf) | ||
817 | { | ||
818 | struct module_version_attribute *vattr = | ||
819 | container_of(mattr, struct module_version_attribute, mattr); | ||
820 | |||
821 | return sprintf(buf, "%s\n", vattr->version); | ||
822 | } | ||
823 | |||
824 | extern struct module_version_attribute __start___modver[], __stop___modver[]; | ||
825 | |||
826 | static void __init version_sysfs_builtin(void) | ||
827 | { | ||
828 | const struct module_version_attribute *vattr; | ||
829 | struct module_kobject *mk; | ||
830 | int err; | ||
831 | |||
832 | for (vattr = __start___modver; vattr < __stop___modver; vattr++) { | ||
833 | mk = locate_module_kobject(vattr->module_name); | ||
834 | if (mk) { | ||
835 | err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr); | ||
836 | kobject_uevent(&mk->kobj, KOBJ_ADD); | ||
837 | kobject_put(&mk->kobj); | ||
838 | } | ||
839 | } | ||
840 | } | ||
799 | 841 | ||
800 | /* module-related sysfs stuff */ | 842 | /* module-related sysfs stuff */ |
801 | 843 | ||
@@ -875,6 +917,7 @@ static int __init param_sysfs_init(void) | |||
875 | } | 917 | } |
876 | module_sysfs_initialized = 1; | 918 | module_sysfs_initialized = 1; |
877 | 919 | ||
920 | version_sysfs_builtin(); | ||
878 | param_sysfs_builtin(); | 921 | param_sysfs_builtin(); |
879 | 922 | ||
880 | return 0; | 923 | return 0; |
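
version_sysfs_builtin() above walks the __start___modver..__stop___modver range that the linker assembles from a dedicated section. The same iterable-section trick works in userspace with the GNU toolchain; a sketch using a hypothetical "modver" section:

#include <stdio.h>

struct modver {
	const char *module_name;
	const char *version;
};

#define MODVER(mod, ver) \
	static const struct modver _mv_##mod \
	__attribute__((used, section("modver"))) = { #mod, ver }

MODVER(foo, "1.0");
MODVER(bar, "2.3");

/* The GNU linker provides __start_<sect>/__stop_<sect> for this section. */
extern const struct modver __start_modver[], __stop_modver[];

int main(void)
{
	for (const struct modver *v = __start_modver; v < __stop_modver; v++)
		printf("%s: %s\n", v->module_name, v->version);
	return 0;
}
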
diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 11847bf1e8cc..ed253aa24ba4 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c | |||
@@ -38,7 +38,96 @@ | |||
38 | 38 | ||
39 | #include <asm/irq_regs.h> | 39 | #include <asm/irq_regs.h> |
40 | 40 | ||
41 | atomic_t perf_task_events __read_mostly; | 41 | struct remote_function_call { |
42 | struct task_struct *p; | ||
43 | int (*func)(void *info); | ||
44 | void *info; | ||
45 | int ret; | ||
46 | }; | ||
47 | |||
48 | static void remote_function(void *data) | ||
49 | { | ||
50 | struct remote_function_call *tfc = data; | ||
51 | struct task_struct *p = tfc->p; | ||
52 | |||
53 | if (p) { | ||
54 | tfc->ret = -EAGAIN; | ||
55 | if (task_cpu(p) != smp_processor_id() || !task_curr(p)) | ||
56 | return; | ||
57 | } | ||
58 | |||
59 | tfc->ret = tfc->func(tfc->info); | ||
60 | } | ||
61 | |||
62 | /** | ||
63 | * task_function_call - call a function on the cpu on which a task runs | ||
64 | * @p: the task to evaluate | ||
65 | * @func: the function to be called | ||
66 | * @info: the function call argument | ||
67 | * | ||
68 | * Calls the function @func when the task is currently running. This might | ||
69 | * be on the current CPU, which just calls the function directly | ||
70 | * | ||
71 | * returns: @func return value, or | ||
72 | * -ESRCH - when the process isn't running | ||
73 | * -EAGAIN - when the process moved away | ||
74 | */ | ||
75 | static int | ||
76 | task_function_call(struct task_struct *p, int (*func) (void *info), void *info) | ||
77 | { | ||
78 | struct remote_function_call data = { | ||
79 | .p = p, | ||
80 | .func = func, | ||
81 | .info = info, | ||
82 | .ret = -ESRCH, /* No such (running) process */ | ||
83 | }; | ||
84 | |||
85 | if (task_curr(p)) | ||
86 | smp_call_function_single(task_cpu(p), remote_function, &data, 1); | ||
87 | |||
88 | return data.ret; | ||
89 | } | ||
90 | |||
91 | /** | ||
92 | * cpu_function_call - call a function on the cpu | ||
93 | * @func: the function to be called | ||
94 | * @info: the function call argument | ||
95 | * | ||
96 | * Calls the function @func on the remote cpu. | ||
97 | * | ||
98 | * returns: @func return value or -ENXIO when the cpu is offline | ||
99 | */ | ||
100 | static int cpu_function_call(int cpu, int (*func) (void *info), void *info) | ||
101 | { | ||
102 | struct remote_function_call data = { | ||
103 | .p = NULL, | ||
104 | .func = func, | ||
105 | .info = info, | ||
106 | .ret = -ENXIO, /* No such CPU */ | ||
107 | }; | ||
108 | |||
109 | smp_call_function_single(cpu, remote_function, &data, 1); | ||
110 | |||
111 | return data.ret; | ||
112 | } | ||
113 | |||
114 | #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ | ||
115 | PERF_FLAG_FD_OUTPUT |\ | ||
116 | PERF_FLAG_PID_CGROUP) | ||
117 | |||
118 | enum event_type_t { | ||
119 | EVENT_FLEXIBLE = 0x1, | ||
120 | EVENT_PINNED = 0x2, | ||
121 | EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, | ||
122 | }; | ||
123 | |||
124 | /* | ||
125 | * perf_sched_events : >0 events exist | ||
126 | * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu | ||
127 | */ | ||
128 | atomic_t perf_sched_events __read_mostly; | ||
129 | static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); | ||
130 | |||
42 | static atomic_t nr_mmap_events __read_mostly; | 131 | static atomic_t nr_mmap_events __read_mostly; |
43 | static atomic_t nr_comm_events __read_mostly; | 132 | static atomic_t nr_comm_events __read_mostly; |
44 | static atomic_t nr_task_events __read_mostly; | 133 | static atomic_t nr_task_events __read_mostly; |
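
task_function_call()/cpu_function_call() above bundle a callback, its argument and a return slot into one structure, run it remotely, and let a guard inside remote_function() turn "the task moved away" into -EAGAIN. A userspace sketch of that packaging, with a thread standing in for the cross-CPU call and a hypothetical target_cpu field standing in for task_cpu(); build with -pthread:

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

struct remote_function_call {
	int target_cpu;                /* stand-in for task_cpu(p) */
	int (*func)(void *info);
	void *info;
	int ret;
};

static int worker_cpu = 0;             /* stand-in for smp_processor_id() */

static void *remote_function(void *data)
{
	struct remote_function_call *tfc = data;

	/* Guard: refuse with -EAGAIN if the target is no longer where the
	 * caller expected it, mirroring the "task moved away" check above. */
	tfc->ret = -EAGAIN;
	if (tfc->target_cpu != worker_cpu)
		return NULL;

	tfc->ret = tfc->func(tfc->info);
	return NULL;
}

static int do_work(void *info)
{
	printf("running: %s\n", (const char *)info);
	return 0;
}

int main(void)
{
	struct remote_function_call data = {
		.target_cpu = 0,
		.func = do_work,
		.info = "hello",
		.ret = -ESRCH,         /* default: no such (running) target */
	};
	pthread_t t;

	pthread_create(&t, NULL, remote_function, &data);
	pthread_join(t, NULL);
	printf("ret = %d\n", data.ret);
	return 0;
}
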
@@ -61,10 +150,37 @@ int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */ | |||
61 | /* | 150 | /* |
62 | * max perf event sample rate | 151 | * max perf event sample rate |
63 | */ | 152 | */ |
64 | int sysctl_perf_event_sample_rate __read_mostly = 100000; | 153 | #define DEFAULT_MAX_SAMPLE_RATE 100000 |
154 | int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE; | ||
155 | static int max_samples_per_tick __read_mostly = | ||
156 | DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); | ||
157 | |||
158 | int perf_proc_update_handler(struct ctl_table *table, int write, | ||
159 | void __user *buffer, size_t *lenp, | ||
160 | loff_t *ppos) | ||
161 | { | ||
162 | int ret = proc_dointvec(table, write, buffer, lenp, ppos); | ||
163 | |||
164 | if (ret || !write) | ||
165 | return ret; | ||
166 | |||
167 | max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ); | ||
168 | |||
169 | return 0; | ||
170 | } | ||
65 | 171 | ||
66 | static atomic64_t perf_event_id; | 172 | static atomic64_t perf_event_id; |
67 | 173 | ||
174 | static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, | ||
175 | enum event_type_t event_type); | ||
176 | |||
177 | static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, | ||
178 | enum event_type_t event_type, | ||
179 | struct task_struct *task); | ||
180 | |||
181 | static void update_context_time(struct perf_event_context *ctx); | ||
182 | static u64 perf_event_time(struct perf_event *event); | ||
183 | |||
68 | void __weak perf_event_print_debug(void) { } | 184 | void __weak perf_event_print_debug(void) { } |
69 | 185 | ||
70 | extern __weak const char *perf_pmu_name(void) | 186 | extern __weak const char *perf_pmu_name(void) |
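
The sysctl handler above recomputes max_samples_per_tick whenever the sample rate is written. A quick check of the DIV_ROUND_UP() arithmetic, assuming HZ=250:

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	int hz = 250;                  /* assumed tick rate */
	int rate = 100000;             /* DEFAULT_MAX_SAMPLE_RATE */

	/* 100000 samples/sec over 250 ticks/sec -> 400 samples per tick */
	printf("max_samples_per_tick = %d\n", DIV_ROUND_UP(rate, hz));
	return 0;
}
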
@@ -72,6 +188,365 @@ extern __weak const char *perf_pmu_name(void) | |||
72 | return "pmu"; | 188 | return "pmu"; |
73 | } | 189 | } |
74 | 190 | ||
191 | static inline u64 perf_clock(void) | ||
192 | { | ||
193 | return local_clock(); | ||
194 | } | ||
195 | |||
196 | static inline struct perf_cpu_context * | ||
197 | __get_cpu_context(struct perf_event_context *ctx) | ||
198 | { | ||
199 | return this_cpu_ptr(ctx->pmu->pmu_cpu_context); | ||
200 | } | ||
201 | |||
202 | #ifdef CONFIG_CGROUP_PERF | ||
203 | |||
204 | /* | ||
205 | * Must ensure cgroup is pinned (css_get) before calling | ||
206 | * this function. In other words, we cannot call this function | ||
207 | * if there is no cgroup event for the current CPU context. | ||
208 | */ | ||
209 | static inline struct perf_cgroup * | ||
210 | perf_cgroup_from_task(struct task_struct *task) | ||
211 | { | ||
212 | return container_of(task_subsys_state(task, perf_subsys_id), | ||
213 | struct perf_cgroup, css); | ||
214 | } | ||
215 | |||
216 | static inline bool | ||
217 | perf_cgroup_match(struct perf_event *event) | ||
218 | { | ||
219 | struct perf_event_context *ctx = event->ctx; | ||
220 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); | ||
221 | |||
222 | return !event->cgrp || event->cgrp == cpuctx->cgrp; | ||
223 | } | ||
224 | |||
225 | static inline void perf_get_cgroup(struct perf_event *event) | ||
226 | { | ||
227 | css_get(&event->cgrp->css); | ||
228 | } | ||
229 | |||
230 | static inline void perf_put_cgroup(struct perf_event *event) | ||
231 | { | ||
232 | css_put(&event->cgrp->css); | ||
233 | } | ||
234 | |||
235 | static inline void perf_detach_cgroup(struct perf_event *event) | ||
236 | { | ||
237 | perf_put_cgroup(event); | ||
238 | event->cgrp = NULL; | ||
239 | } | ||
240 | |||
241 | static inline int is_cgroup_event(struct perf_event *event) | ||
242 | { | ||
243 | return event->cgrp != NULL; | ||
244 | } | ||
245 | |||
246 | static inline u64 perf_cgroup_event_time(struct perf_event *event) | ||
247 | { | ||
248 | struct perf_cgroup_info *t; | ||
249 | |||
250 | t = per_cpu_ptr(event->cgrp->info, event->cpu); | ||
251 | return t->time; | ||
252 | } | ||
253 | |||
254 | static inline void __update_cgrp_time(struct perf_cgroup *cgrp) | ||
255 | { | ||
256 | struct perf_cgroup_info *info; | ||
257 | u64 now; | ||
258 | |||
259 | now = perf_clock(); | ||
260 | |||
261 | info = this_cpu_ptr(cgrp->info); | ||
262 | |||
263 | info->time += now - info->timestamp; | ||
264 | info->timestamp = now; | ||
265 | } | ||
266 | |||
267 | static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) | ||
268 | { | ||
269 | struct perf_cgroup *cgrp_out = cpuctx->cgrp; | ||
270 | if (cgrp_out) | ||
271 | __update_cgrp_time(cgrp_out); | ||
272 | } | ||
273 | |||
274 | static inline void update_cgrp_time_from_event(struct perf_event *event) | ||
275 | { | ||
276 | struct perf_cgroup *cgrp; | ||
277 | |||
278 | /* | ||
279 | * ensure we access cgroup data only when needed and | ||
280 | * when we know the cgroup is pinned (css_get) | ||
281 | */ | ||
282 | if (!is_cgroup_event(event)) | ||
283 | return; | ||
284 | |||
285 | cgrp = perf_cgroup_from_task(current); | ||
286 | /* | ||
287 | * Do not update time when cgroup is not active | ||
288 | */ | ||
289 | if (cgrp == event->cgrp) | ||
290 | __update_cgrp_time(event->cgrp); | ||
291 | } | ||
292 | |||
293 | static inline void | ||
294 | perf_cgroup_set_timestamp(struct task_struct *task, | ||
295 | struct perf_event_context *ctx) | ||
296 | { | ||
297 | struct perf_cgroup *cgrp; | ||
298 | struct perf_cgroup_info *info; | ||
299 | |||
300 | /* | ||
301 | * ctx->lock held by caller | ||
302 | * ensure we do not access cgroup data | ||
303 | * unless we have the cgroup pinned (css_get) | ||
304 | */ | ||
305 | if (!task || !ctx->nr_cgroups) | ||
306 | return; | ||
307 | |||
308 | cgrp = perf_cgroup_from_task(task); | ||
309 | info = this_cpu_ptr(cgrp->info); | ||
310 | info->timestamp = ctx->timestamp; | ||
311 | } | ||
312 | |||
313 | #define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */ | ||
314 | #define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */ | ||
315 | |||
316 | /* | ||
317 | * reschedule events based on the cgroup constraint of task. | ||
318 | * | ||
319 | * mode SWOUT : schedule out everything | ||
320 | * mode SWIN : schedule in based on cgroup for next | ||
321 | */ | ||
322 | void perf_cgroup_switch(struct task_struct *task, int mode) | ||
323 | { | ||
324 | struct perf_cpu_context *cpuctx; | ||
325 | struct pmu *pmu; | ||
326 | unsigned long flags; | ||
327 | |||
328 | /* | ||
329 | * disable interrupts to avoid getting nr_cgroup | ||
330 | * changes via __perf_event_disable(). Also | ||
331 | * avoids preemption. | ||
332 | */ | ||
333 | local_irq_save(flags); | ||
334 | |||
335 | /* | ||
336 | * we reschedule only in the presence of cgroup | ||
337 | * constrained events. | ||
338 | */ | ||
339 | rcu_read_lock(); | ||
340 | |||
341 | list_for_each_entry_rcu(pmu, &pmus, entry) { | ||
342 | |||
343 | cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); | ||
344 | |||
345 | perf_pmu_disable(cpuctx->ctx.pmu); | ||
346 | |||
347 | /* | ||
348 | * perf_cgroup_events says at least one | ||
349 | * context on this CPU has cgroup events. | ||
350 | * | ||
351 | * ctx->nr_cgroups reports the number of cgroup | ||
352 | * events for a context. | ||
353 | */ | ||
354 | if (cpuctx->ctx.nr_cgroups > 0) { | ||
355 | |||
356 | if (mode & PERF_CGROUP_SWOUT) { | ||
357 | cpu_ctx_sched_out(cpuctx, EVENT_ALL); | ||
358 | /* | ||
359 | * must not be done before ctxswout due | ||
360 | * to event_filter_match() in event_sched_out() | ||
361 | */ | ||
362 | cpuctx->cgrp = NULL; | ||
363 | } | ||
364 | |||
365 | if (mode & PERF_CGROUP_SWIN) { | ||
366 | /* set cgrp before ctxsw in to | ||
367 | * allow event_filter_match() to not | ||
368 | * have to pass task around | ||
369 | */ | ||
370 | cpuctx->cgrp = perf_cgroup_from_task(task); | ||
371 | cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); | ||
372 | } | ||
373 | } | ||
374 | |||
375 | perf_pmu_enable(cpuctx->ctx.pmu); | ||
376 | } | ||
377 | |||
378 | rcu_read_unlock(); | ||
379 | |||
380 | local_irq_restore(flags); | ||
381 | } | ||
382 | |||
383 | static inline void perf_cgroup_sched_out(struct task_struct *task) | ||
384 | { | ||
385 | perf_cgroup_switch(task, PERF_CGROUP_SWOUT); | ||
386 | } | ||
387 | |||
388 | static inline void perf_cgroup_sched_in(struct task_struct *task) | ||
389 | { | ||
390 | perf_cgroup_switch(task, PERF_CGROUP_SWIN); | ||
391 | } | ||
392 | |||
393 | static inline int perf_cgroup_connect(int fd, struct perf_event *event, | ||
394 | struct perf_event_attr *attr, | ||
395 | struct perf_event *group_leader) | ||
396 | { | ||
397 | struct perf_cgroup *cgrp; | ||
398 | struct cgroup_subsys_state *css; | ||
399 | struct file *file; | ||
400 | int ret = 0, fput_needed; | ||
401 | |||
402 | file = fget_light(fd, &fput_needed); | ||
403 | if (!file) | ||
404 | return -EBADF; | ||
405 | |||
406 | css = cgroup_css_from_dir(file, perf_subsys_id); | ||
407 | if (IS_ERR(css)) { | ||
408 | ret = PTR_ERR(css); | ||
409 | goto out; | ||
410 | } | ||
411 | |||
412 | cgrp = container_of(css, struct perf_cgroup, css); | ||
413 | event->cgrp = cgrp; | ||
414 | |||
415 | /* must be done before we fput() the file */ | ||
416 | perf_get_cgroup(event); | ||
417 | |||
418 | /* | ||
419 | * all events in a group must monitor | ||
420 | * the same cgroup because a task belongs | ||
421 | * to only one perf cgroup at a time | ||
422 | */ | ||
423 | if (group_leader && group_leader->cgrp != cgrp) { | ||
424 | perf_detach_cgroup(event); | ||
425 | ret = -EINVAL; | ||
426 | } | ||
427 | out: | ||
428 | fput_light(file, fput_needed); | ||
429 | return ret; | ||
430 | } | ||
431 | |||
432 | static inline void | ||
433 | perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) | ||
434 | { | ||
435 | struct perf_cgroup_info *t; | ||
436 | t = per_cpu_ptr(event->cgrp->info, event->cpu); | ||
437 | event->shadow_ctx_time = now - t->timestamp; | ||
438 | } | ||
439 | |||
440 | static inline void | ||
441 | perf_cgroup_defer_enabled(struct perf_event *event) | ||
442 | { | ||
443 | /* | ||
444 | * when the current task's perf cgroup does not match | ||
445 | * the event's, we need to remember to call the | ||
446 | * perf_mark_enable() function the first time a task with | ||
447 | * a matching perf cgroup is scheduled in. | ||
448 | */ | ||
449 | if (is_cgroup_event(event) && !perf_cgroup_match(event)) | ||
450 | event->cgrp_defer_enabled = 1; | ||
451 | } | ||
452 | |||
453 | static inline void | ||
454 | perf_cgroup_mark_enabled(struct perf_event *event, | ||
455 | struct perf_event_context *ctx) | ||
456 | { | ||
457 | struct perf_event *sub; | ||
458 | u64 tstamp = perf_event_time(event); | ||
459 | |||
460 | if (!event->cgrp_defer_enabled) | ||
461 | return; | ||
462 | |||
463 | event->cgrp_defer_enabled = 0; | ||
464 | |||
465 | event->tstamp_enabled = tstamp - event->total_time_enabled; | ||
466 | list_for_each_entry(sub, &event->sibling_list, group_entry) { | ||
467 | if (sub->state >= PERF_EVENT_STATE_INACTIVE) { | ||
468 | sub->tstamp_enabled = tstamp - sub->total_time_enabled; | ||
469 | sub->cgrp_defer_enabled = 0; | ||
470 | } | ||
471 | } | ||
472 | } | ||
473 | #else /* !CONFIG_CGROUP_PERF */ | ||
474 | |||
475 | static inline bool | ||
476 | perf_cgroup_match(struct perf_event *event) | ||
477 | { | ||
478 | return true; | ||
479 | } | ||
480 | |||
481 | static inline void perf_detach_cgroup(struct perf_event *event) | ||
482 | {} | ||
483 | |||
484 | static inline int is_cgroup_event(struct perf_event *event) | ||
485 | { | ||
486 | return 0; | ||
487 | } | ||
488 | |||
489 | static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event) | ||
490 | { | ||
491 | return 0; | ||
492 | } | ||
493 | |||
494 | static inline void update_cgrp_time_from_event(struct perf_event *event) | ||
495 | { | ||
496 | } | ||
497 | |||
498 | static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) | ||
499 | { | ||
500 | } | ||
501 | |||
502 | static inline void perf_cgroup_sched_out(struct task_struct *task) | ||
503 | { | ||
504 | } | ||
505 | |||
506 | static inline void perf_cgroup_sched_in(struct task_struct *task) | ||
507 | { | ||
508 | } | ||
509 | |||
510 | static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event, | ||
511 | struct perf_event_attr *attr, | ||
512 | struct perf_event *group_leader) | ||
513 | { | ||
514 | return -EINVAL; | ||
515 | } | ||
516 | |||
517 | static inline void | ||
518 | perf_cgroup_set_timestamp(struct task_struct *task, | ||
519 | struct perf_event_context *ctx) | ||
520 | { | ||
521 | } | ||
522 | |||
523 | void | ||
524 | perf_cgroup_switch(struct task_struct *task, struct task_struct *next) | ||
525 | { | ||
526 | } | ||
527 | |||
528 | static inline void | ||
529 | perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) | ||
530 | { | ||
531 | } | ||
532 | |||
533 | static inline u64 perf_cgroup_event_time(struct perf_event *event) | ||
534 | { | ||
535 | return 0; | ||
536 | } | ||
537 | |||
538 | static inline void | ||
539 | perf_cgroup_defer_enabled(struct perf_event *event) | ||
540 | { | ||
541 | } | ||
542 | |||
543 | static inline void | ||
544 | perf_cgroup_mark_enabled(struct perf_event *event, | ||
545 | struct perf_event_context *ctx) | ||
546 | { | ||
547 | } | ||
548 | #endif | ||
549 | |||
75 | void perf_pmu_disable(struct pmu *pmu) | 550 | void perf_pmu_disable(struct pmu *pmu) |
76 | { | 551 | { |
77 | int *count = this_cpu_ptr(pmu->pmu_disable_count); | 552 | int *count = this_cpu_ptr(pmu->pmu_disable_count); |
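
A minimal userspace sketch of the __update_cgrp_time() bookkeeping introduced above: every update folds the elapsed interval into ->time and restamps ->timestamp, so the accumulated value only covers periods while the cgroup was actually being tracked (link with -lrt on older glibc):

#include <stdio.h>
#include <stdint.h>
#include <time.h>

struct cgroup_time {
	uint64_t time;       /* accumulated monitored nanoseconds */
	uint64_t timestamp;  /* when the current interval started */
};

static uint64_t now_ns(void)
{
	struct timespec ts;
	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
}

static void update_cgrp_time(struct cgroup_time *info)
{
	uint64_t now = now_ns();

	info->time += now - info->timestamp;
	info->timestamp = now;
}

int main(void)
{
	struct cgroup_time info = { .time = 0, .timestamp = now_ns() };
	struct timespec d = { .tv_sec = 0, .tv_nsec = 10 * 1000 * 1000 };

	nanosleep(&d, NULL);
	update_cgrp_time(&info);
	printf("accumulated %llu ns\n", (unsigned long long)info.time);
	return 0;
}
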
@@ -237,12 +712,6 @@ static void perf_unpin_context(struct perf_event_context *ctx) | |||
237 | raw_spin_lock_irqsave(&ctx->lock, flags); | 712 | raw_spin_lock_irqsave(&ctx->lock, flags); |
238 | --ctx->pin_count; | 713 | --ctx->pin_count; |
239 | raw_spin_unlock_irqrestore(&ctx->lock, flags); | 714 | raw_spin_unlock_irqrestore(&ctx->lock, flags); |
240 | put_ctx(ctx); | ||
241 | } | ||
242 | |||
243 | static inline u64 perf_clock(void) | ||
244 | { | ||
245 | return local_clock(); | ||
246 | } | 715 | } |
247 | 716 | ||
248 | /* | 717 | /* |
@@ -256,6 +725,16 @@ static void update_context_time(struct perf_event_context *ctx) | |||
256 | ctx->timestamp = now; | 725 | ctx->timestamp = now; |
257 | } | 726 | } |
258 | 727 | ||
728 | static u64 perf_event_time(struct perf_event *event) | ||
729 | { | ||
730 | struct perf_event_context *ctx = event->ctx; | ||
731 | |||
732 | if (is_cgroup_event(event)) | ||
733 | return perf_cgroup_event_time(event); | ||
734 | |||
735 | return ctx ? ctx->time : 0; | ||
736 | } | ||
737 | |||
259 | /* | 738 | /* |
260 | * Update the total_time_enabled and total_time_running fields for a event. | 739 | * Update the total_time_enabled and total_time_running fields for a event. |
261 | */ | 740 | */ |
@@ -267,8 +746,19 @@ static void update_event_times(struct perf_event *event) | |||
267 | if (event->state < PERF_EVENT_STATE_INACTIVE || | 746 | if (event->state < PERF_EVENT_STATE_INACTIVE || |
268 | event->group_leader->state < PERF_EVENT_STATE_INACTIVE) | 747 | event->group_leader->state < PERF_EVENT_STATE_INACTIVE) |
269 | return; | 748 | return; |
270 | 749 | /* | |
271 | if (ctx->is_active) | 750 | * in cgroup mode, time_enabled represents |
751 | * the time the event was enabled AND active | ||
752 | * tasks were in the monitored cgroup. This is | ||
753 | * independent of the activity of the context as | ||
754 | * there may be a mix of cgroup and non-cgroup events. | ||
755 | * | ||
756 | * That is why we treat cgroup events differently | ||
757 | * here. | ||
758 | */ | ||
759 | if (is_cgroup_event(event)) | ||
760 | run_end = perf_event_time(event); | ||
761 | else if (ctx->is_active) | ||
272 | run_end = ctx->time; | 762 | run_end = ctx->time; |
273 | else | 763 | else |
274 | run_end = event->tstamp_stopped; | 764 | run_end = event->tstamp_stopped; |
@@ -278,9 +768,10 @@ static void update_event_times(struct perf_event *event) | |||
278 | if (event->state == PERF_EVENT_STATE_INACTIVE) | 768 | if (event->state == PERF_EVENT_STATE_INACTIVE) |
279 | run_end = event->tstamp_stopped; | 769 | run_end = event->tstamp_stopped; |
280 | else | 770 | else |
281 | run_end = ctx->time; | 771 | run_end = perf_event_time(event); |
282 | 772 | ||
283 | event->total_time_running = run_end - event->tstamp_running; | 773 | event->total_time_running = run_end - event->tstamp_running; |
774 | |||
284 | } | 775 | } |
285 | 776 | ||
286 | /* | 777 | /* |
@@ -329,6 +820,9 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) | |||
329 | list_add_tail(&event->group_entry, list); | 820 | list_add_tail(&event->group_entry, list); |
330 | } | 821 | } |
331 | 822 | ||
823 | if (is_cgroup_event(event)) | ||
824 | ctx->nr_cgroups++; | ||
825 | |||
332 | list_add_rcu(&event->event_entry, &ctx->event_list); | 826 | list_add_rcu(&event->event_entry, &ctx->event_list); |
333 | if (!ctx->nr_events) | 827 | if (!ctx->nr_events) |
334 | perf_pmu_rotate_start(ctx->pmu); | 828 | perf_pmu_rotate_start(ctx->pmu); |
@@ -455,6 +949,9 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) | |||
455 | 949 | ||
456 | event->attach_state &= ~PERF_ATTACH_CONTEXT; | 950 | event->attach_state &= ~PERF_ATTACH_CONTEXT; |
457 | 951 | ||
952 | if (is_cgroup_event(event)) | ||
953 | ctx->nr_cgroups--; | ||
954 | |||
458 | ctx->nr_events--; | 955 | ctx->nr_events--; |
459 | if (event->attr.inherit_stat) | 956 | if (event->attr.inherit_stat) |
460 | ctx->nr_stat--; | 957 | ctx->nr_stat--; |
@@ -526,7 +1023,8 @@ out: | |||
526 | static inline int | 1023 | static inline int |
527 | event_filter_match(struct perf_event *event) | 1024 | event_filter_match(struct perf_event *event) |
528 | { | 1025 | { |
529 | return event->cpu == -1 || event->cpu == smp_processor_id(); | 1026 | return (event->cpu == -1 || event->cpu == smp_processor_id()) |
1027 | && perf_cgroup_match(event); | ||
530 | } | 1028 | } |
531 | 1029 | ||
532 | static void | 1030 | static void |
@@ -534,6 +1032,7 @@ event_sched_out(struct perf_event *event, | |||
534 | struct perf_cpu_context *cpuctx, | 1032 | struct perf_cpu_context *cpuctx, |
535 | struct perf_event_context *ctx) | 1033 | struct perf_event_context *ctx) |
536 | { | 1034 | { |
1035 | u64 tstamp = perf_event_time(event); | ||
537 | u64 delta; | 1036 | u64 delta; |
538 | /* | 1037 | /* |
539 | * An event which could not be activated because of | 1038 | * An event which could not be activated because of |
@@ -543,9 +1042,9 @@ event_sched_out(struct perf_event *event, | |||
543 | */ | 1042 | */ |
544 | if (event->state == PERF_EVENT_STATE_INACTIVE | 1043 | if (event->state == PERF_EVENT_STATE_INACTIVE |
545 | && !event_filter_match(event)) { | 1044 | && !event_filter_match(event)) { |
546 | delta = ctx->time - event->tstamp_stopped; | 1045 | delta = tstamp - event->tstamp_stopped; |
547 | event->tstamp_running += delta; | 1046 | event->tstamp_running += delta; |
548 | event->tstamp_stopped = ctx->time; | 1047 | event->tstamp_stopped = tstamp; |
549 | } | 1048 | } |
550 | 1049 | ||
551 | if (event->state != PERF_EVENT_STATE_ACTIVE) | 1050 | if (event->state != PERF_EVENT_STATE_ACTIVE) |
@@ -556,7 +1055,7 @@ event_sched_out(struct perf_event *event, | |||
556 | event->pending_disable = 0; | 1055 | event->pending_disable = 0; |
557 | event->state = PERF_EVENT_STATE_OFF; | 1056 | event->state = PERF_EVENT_STATE_OFF; |
558 | } | 1057 | } |
559 | event->tstamp_stopped = ctx->time; | 1058 | event->tstamp_stopped = tstamp; |
560 | event->pmu->del(event, 0); | 1059 | event->pmu->del(event, 0); |
561 | event->oncpu = -1; | 1060 | event->oncpu = -1; |
562 | 1061 | ||
@@ -587,47 +1086,30 @@ group_sched_out(struct perf_event *group_event, | |||
587 | cpuctx->exclusive = 0; | 1086 | cpuctx->exclusive = 0; |
588 | } | 1087 | } |
589 | 1088 | ||
590 | static inline struct perf_cpu_context * | ||
591 | __get_cpu_context(struct perf_event_context *ctx) | ||
592 | { | ||
593 | return this_cpu_ptr(ctx->pmu->pmu_cpu_context); | ||
594 | } | ||
595 | |||
596 | /* | 1089 | /* |
597 | * Cross CPU call to remove a performance event | 1090 | * Cross CPU call to remove a performance event |
598 | * | 1091 | * |
599 | * We disable the event on the hardware level first. After that we | 1092 | * We disable the event on the hardware level first. After that we |
600 | * remove it from the context list. | 1093 | * remove it from the context list. |
601 | */ | 1094 | */ |
602 | static void __perf_event_remove_from_context(void *info) | 1095 | static int __perf_remove_from_context(void *info) |
603 | { | 1096 | { |
604 | struct perf_event *event = info; | 1097 | struct perf_event *event = info; |
605 | struct perf_event_context *ctx = event->ctx; | 1098 | struct perf_event_context *ctx = event->ctx; |
606 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); | 1099 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); |
607 | 1100 | ||
608 | /* | ||
609 | * If this is a task context, we need to check whether it is | ||
610 | * the current task context of this cpu. If not it has been | ||
611 | * scheduled out before the smp call arrived. | ||
612 | */ | ||
613 | if (ctx->task && cpuctx->task_ctx != ctx) | ||
614 | return; | ||
615 | |||
616 | raw_spin_lock(&ctx->lock); | 1101 | raw_spin_lock(&ctx->lock); |
617 | |||
618 | event_sched_out(event, cpuctx, ctx); | 1102 | event_sched_out(event, cpuctx, ctx); |
619 | |||
620 | list_del_event(event, ctx); | 1103 | list_del_event(event, ctx); |
621 | |||
622 | raw_spin_unlock(&ctx->lock); | 1104 | raw_spin_unlock(&ctx->lock); |
1105 | |||
1106 | return 0; | ||
623 | } | 1107 | } |
624 | 1108 | ||
625 | 1109 | ||
626 | /* | 1110 | /* |
627 | * Remove the event from a task's (or a CPU's) list of events. | 1111 | * Remove the event from a task's (or a CPU's) list of events. |
628 | * | 1112 | * |
629 | * Must be called with ctx->mutex held. | ||
630 | * | ||
631 | * CPU events are removed with a smp call. For task events we only | 1113 | * CPU events are removed with a smp call. For task events we only |
632 | * call when the task is on a CPU. | 1114 | * call when the task is on a CPU. |
633 | * | 1115 | * |
@@ -638,49 +1120,48 @@ static void __perf_event_remove_from_context(void *info) | |||
638 | * When called from perf_event_exit_task, it's OK because the | 1120 | * When called from perf_event_exit_task, it's OK because the |
639 | * context has been detached from its task. | 1121 | * context has been detached from its task. |
640 | */ | 1122 | */ |
641 | static void perf_event_remove_from_context(struct perf_event *event) | 1123 | static void perf_remove_from_context(struct perf_event *event) |
642 | { | 1124 | { |
643 | struct perf_event_context *ctx = event->ctx; | 1125 | struct perf_event_context *ctx = event->ctx; |
644 | struct task_struct *task = ctx->task; | 1126 | struct task_struct *task = ctx->task; |
645 | 1127 | ||
1128 | lockdep_assert_held(&ctx->mutex); | ||
1129 | |||
646 | if (!task) { | 1130 | if (!task) { |
647 | /* | 1131 | /* |
648 | * Per cpu events are removed via an smp call and | 1132 | * Per cpu events are removed via an smp call and |
649 | * the removal is always successful. | 1133 | * the removal is always successful. |
650 | */ | 1134 | */ |
651 | smp_call_function_single(event->cpu, | 1135 | cpu_function_call(event->cpu, __perf_remove_from_context, event); |
652 | __perf_event_remove_from_context, | ||
653 | event, 1); | ||
654 | return; | 1136 | return; |
655 | } | 1137 | } |
656 | 1138 | ||
657 | retry: | 1139 | retry: |
658 | task_oncpu_function_call(task, __perf_event_remove_from_context, | 1140 | if (!task_function_call(task, __perf_remove_from_context, event)) |
659 | event); | 1141 | return; |
660 | 1142 | ||
661 | raw_spin_lock_irq(&ctx->lock); | 1143 | raw_spin_lock_irq(&ctx->lock); |
662 | /* | 1144 | /* |
663 | * If the context is active we need to retry the smp call. | 1145 | * If we failed to find a running task, but find the context active now |
1146 | * that we've acquired the ctx->lock, retry. | ||
664 | */ | 1147 | */ |
665 | if (ctx->nr_active && !list_empty(&event->group_entry)) { | 1148 | if (ctx->is_active) { |
666 | raw_spin_unlock_irq(&ctx->lock); | 1149 | raw_spin_unlock_irq(&ctx->lock); |
667 | goto retry; | 1150 | goto retry; |
668 | } | 1151 | } |
669 | 1152 | ||
670 | /* | 1153 | /* |
671 | * The lock prevents that this context is scheduled in so we | 1154 | * Since the task isn't running, it's safe to remove the event, us |
672 | * can remove the event safely, if the call above did not | 1155 | * holding the ctx->lock ensures the task won't get scheduled in. |
673 | * succeed. | ||
674 | */ | 1156 | */ |
675 | if (!list_empty(&event->group_entry)) | 1157 | list_del_event(event, ctx); |
676 | list_del_event(event, ctx); | ||
677 | raw_spin_unlock_irq(&ctx->lock); | 1158 | raw_spin_unlock_irq(&ctx->lock); |
678 | } | 1159 | } |
679 | 1160 | ||
680 | /* | 1161 | /* |
681 | * Cross CPU call to disable a performance event | 1162 | * Cross CPU call to disable a performance event |
682 | */ | 1163 | */ |
683 | static void __perf_event_disable(void *info) | 1164 | static int __perf_event_disable(void *info) |
684 | { | 1165 | { |
685 | struct perf_event *event = info; | 1166 | struct perf_event *event = info; |
686 | struct perf_event_context *ctx = event->ctx; | 1167 | struct perf_event_context *ctx = event->ctx; |
@@ -689,9 +1170,12 @@ static void __perf_event_disable(void *info) | |||
689 | /* | 1170 | /* |
690 | * If this is a per-task event, need to check whether this | 1171 | * If this is a per-task event, need to check whether this |
691 | * event's task is the current task on this cpu. | 1172 | * event's task is the current task on this cpu. |
1173 | * | ||
1174 | * Can trigger due to concurrent perf_event_context_sched_out() | ||
1175 | * flipping contexts around. | ||
692 | */ | 1176 | */ |
693 | if (ctx->task && cpuctx->task_ctx != ctx) | 1177 | if (ctx->task && cpuctx->task_ctx != ctx) |
694 | return; | 1178 | return -EINVAL; |
695 | 1179 | ||
696 | raw_spin_lock(&ctx->lock); | 1180 | raw_spin_lock(&ctx->lock); |
697 | 1181 | ||
@@ -701,6 +1185,7 @@ static void __perf_event_disable(void *info) | |||
701 | */ | 1185 | */ |
702 | if (event->state >= PERF_EVENT_STATE_INACTIVE) { | 1186 | if (event->state >= PERF_EVENT_STATE_INACTIVE) { |
703 | update_context_time(ctx); | 1187 | update_context_time(ctx); |
1188 | update_cgrp_time_from_event(event); | ||
704 | update_group_times(event); | 1189 | update_group_times(event); |
705 | if (event == event->group_leader) | 1190 | if (event == event->group_leader) |
706 | group_sched_out(event, cpuctx, ctx); | 1191 | group_sched_out(event, cpuctx, ctx); |
@@ -710,6 +1195,8 @@ static void __perf_event_disable(void *info) | |||
710 | } | 1195 | } |
711 | 1196 | ||
712 | raw_spin_unlock(&ctx->lock); | 1197 | raw_spin_unlock(&ctx->lock); |
1198 | |||
1199 | return 0; | ||
713 | } | 1200 | } |
714 | 1201 | ||
715 | /* | 1202 | /* |
@@ -734,13 +1221,13 @@ void perf_event_disable(struct perf_event *event) | |||
734 | /* | 1221 | /* |
735 | * Disable the event on the cpu that it's on | 1222 | * Disable the event on the cpu that it's on |
736 | */ | 1223 | */ |
737 | smp_call_function_single(event->cpu, __perf_event_disable, | 1224 | cpu_function_call(event->cpu, __perf_event_disable, event); |
738 | event, 1); | ||
739 | return; | 1225 | return; |
740 | } | 1226 | } |
741 | 1227 | ||
742 | retry: | 1228 | retry: |
743 | task_oncpu_function_call(task, __perf_event_disable, event); | 1229 | if (!task_function_call(task, __perf_event_disable, event)) |
1230 | return; | ||
744 | 1231 | ||
745 | raw_spin_lock_irq(&ctx->lock); | 1232 | raw_spin_lock_irq(&ctx->lock); |
746 | /* | 1233 | /* |
@@ -748,6 +1235,11 @@ retry: | |||
748 | */ | 1235 | */ |
749 | if (event->state == PERF_EVENT_STATE_ACTIVE) { | 1236 | if (event->state == PERF_EVENT_STATE_ACTIVE) { |
750 | raw_spin_unlock_irq(&ctx->lock); | 1237 | raw_spin_unlock_irq(&ctx->lock); |
1238 | /* | ||
1239 | * Reload the task pointer, it might have been changed by | ||
1240 | * a concurrent perf_event_context_sched_out(). | ||
1241 | */ | ||
1242 | task = ctx->task; | ||
751 | goto retry; | 1243 | goto retry; |
752 | } | 1244 | } |
753 | 1245 | ||
@@ -759,20 +1251,71 @@ retry: | |||
759 | update_group_times(event); | 1251 | update_group_times(event); |
760 | event->state = PERF_EVENT_STATE_OFF; | 1252 | event->state = PERF_EVENT_STATE_OFF; |
761 | } | 1253 | } |
762 | |||
763 | raw_spin_unlock_irq(&ctx->lock); | 1254 | raw_spin_unlock_irq(&ctx->lock); |
764 | } | 1255 | } |
765 | 1256 | ||
1257 | static void perf_set_shadow_time(struct perf_event *event, | ||
1258 | struct perf_event_context *ctx, | ||
1259 | u64 tstamp) | ||
1260 | { | ||
1261 | /* | ||
1262 | * use the correct time source for the time snapshot | ||
1263 | * | ||
1264 | * We could get by without this by leveraging the | ||
1265 | * fact that to get to this function, the caller | ||
1266 | * has most likely already called update_context_time() | ||
1267 | * and update_cgrp_time_xx() and thus both timestamps | ||
1268 | * are identical (or very close). Given that tstamp is | ||
1269 | * already adjusted for cgroup, we could say that: | ||
1270 | * tstamp - ctx->timestamp | ||
1271 | * is equivalent to | ||
1272 | * tstamp - cgrp->timestamp. | ||
1273 | * | ||
1274 | * Then, in perf_output_read(), the calculation would | ||
1275 | * work with no changes because: | ||
1276 | * - event is guaranteed scheduled in | ||
1277 | * - no scheduled out in between | ||
1278 | * - thus the timestamp would be the same | ||
1279 | * | ||
1280 | * But this is a bit hairy. | ||
1281 | * | ||
1282 | * So instead, we have an explicit cgroup call to remain | ||
1283 | * within the same time source all along. We believe it | ||
1284 | * is cleaner and simpler to understand. | ||
1285 | */ | ||
1286 | if (is_cgroup_event(event)) | ||
1287 | perf_cgroup_set_shadow_time(event, tstamp); | ||
1288 | else | ||
1289 | event->shadow_ctx_time = tstamp - ctx->timestamp; | ||
1290 | } | ||
1291 | |||
1292 | #define MAX_INTERRUPTS (~0ULL) | ||
1293 | |||
1294 | static void perf_log_throttle(struct perf_event *event, int enable); | ||
1295 | |||
766 | static int | 1296 | static int |
767 | event_sched_in(struct perf_event *event, | 1297 | event_sched_in(struct perf_event *event, |
768 | struct perf_cpu_context *cpuctx, | 1298 | struct perf_cpu_context *cpuctx, |
769 | struct perf_event_context *ctx) | 1299 | struct perf_event_context *ctx) |
770 | { | 1300 | { |
1301 | u64 tstamp = perf_event_time(event); | ||
1302 | |||
771 | if (event->state <= PERF_EVENT_STATE_OFF) | 1303 | if (event->state <= PERF_EVENT_STATE_OFF) |
772 | return 0; | 1304 | return 0; |
773 | 1305 | ||
774 | event->state = PERF_EVENT_STATE_ACTIVE; | 1306 | event->state = PERF_EVENT_STATE_ACTIVE; |
775 | event->oncpu = smp_processor_id(); | 1307 | event->oncpu = smp_processor_id(); |
1308 | |||
1309 | /* | ||
1310 | * Unthrottle events, since we just got scheduled in we might have missed | ||
1311 | * several ticks already; also, for a heavily scheduling task there is little | ||
1312 | * guarantee it'll get a tick in a timely manner. | ||
1313 | */ | ||
1314 | if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) { | ||
1315 | perf_log_throttle(event, 1); | ||
1316 | event->hw.interrupts = 0; | ||
1317 | } | ||
1318 | |||
776 | /* | 1319 | /* |
777 | * The new state must be visible before we turn it on in the hardware: | 1320 | * The new state must be visible before we turn it on in the hardware: |
778 | */ | 1321 | */ |
@@ -784,9 +1327,9 @@ event_sched_in(struct perf_event *event, | |||
784 | return -EAGAIN; | 1327 | return -EAGAIN; |
785 | } | 1328 | } |
786 | 1329 | ||
787 | event->tstamp_running += ctx->time - event->tstamp_stopped; | 1330 | event->tstamp_running += tstamp - event->tstamp_stopped; |
788 | 1331 | ||
789 | event->shadow_ctx_time = ctx->time - ctx->timestamp; | 1332 | perf_set_shadow_time(event, ctx, tstamp); |
790 | 1333 | ||
791 | if (!is_software_event(event)) | 1334 | if (!is_software_event(event)) |
792 | cpuctx->active_oncpu++; | 1335 | cpuctx->active_oncpu++; |
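
The perf_set_shadow_time() comment above boils down to simple u64 arithmetic: accumulate running time as tstamp - tstamp_stopped and record a shadow offset against whichever time source (context or cgroup) the event uses. A minimal userspace sketch of that bookkeeping, with toy field names standing in for the perf_event/ctx members (every name here is illustrative, not the kernel's):

#include <stdint.h>
#include <stdio.h>

struct toy_event {
	int      is_cgroup;       /* stand-in for is_cgroup_event() */
	uint64_t shadow_time;     /* stand-in for shadow_ctx_time */
	uint64_t tstamp_running;
	uint64_t tstamp_stopped;
};

static void toy_sched_in(struct toy_event *e, uint64_t tstamp,
			 uint64_t ctx_ts, uint64_t cgrp_ts)
{
	/* accumulate the gap since the event was last stopped */
	e->tstamp_running += tstamp - e->tstamp_stopped;
	/* stay within a single time source, as the comment argues */
	e->shadow_time = tstamp - (e->is_cgroup ? cgrp_ts : ctx_ts);
}

int main(void)
{
	struct toy_event e = { 0, 0, 0, 1000 };

	toy_sched_in(&e, 1500, 200, 300);
	printf("running=%llu shadow=%llu\n",
	       (unsigned long long)e.tstamp_running,
	       (unsigned long long)e.shadow_time);
	return 0;
}
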
@@ -898,19 +1441,24 @@ static int group_can_go_on(struct perf_event *event, | |||
898 | static void add_event_to_ctx(struct perf_event *event, | 1441 | static void add_event_to_ctx(struct perf_event *event, |
899 | struct perf_event_context *ctx) | 1442 | struct perf_event_context *ctx) |
900 | { | 1443 | { |
1444 | u64 tstamp = perf_event_time(event); | ||
1445 | |||
901 | list_add_event(event, ctx); | 1446 | list_add_event(event, ctx); |
902 | perf_group_attach(event); | 1447 | perf_group_attach(event); |
903 | event->tstamp_enabled = ctx->time; | 1448 | event->tstamp_enabled = tstamp; |
904 | event->tstamp_running = ctx->time; | 1449 | event->tstamp_running = tstamp; |
905 | event->tstamp_stopped = ctx->time; | 1450 | event->tstamp_stopped = tstamp; |
906 | } | 1451 | } |
907 | 1452 | ||
1453 | static void perf_event_context_sched_in(struct perf_event_context *ctx, | ||
1454 | struct task_struct *tsk); | ||
1455 | |||
908 | /* | 1456 | /* |
909 | * Cross CPU call to install and enable a performance event | 1457 | * Cross CPU call to install and enable a performance event |
910 | * | 1458 | * |
911 | * Must be called with ctx->mutex held | 1459 | * Must be called with ctx->mutex held |
912 | */ | 1460 | */ |
913 | static void __perf_install_in_context(void *info) | 1461 | static int __perf_install_in_context(void *info) |
914 | { | 1462 | { |
915 | struct perf_event *event = info; | 1463 | struct perf_event *event = info; |
916 | struct perf_event_context *ctx = event->ctx; | 1464 | struct perf_event_context *ctx = event->ctx; |
@@ -919,25 +1467,26 @@ static void __perf_install_in_context(void *info) | |||
919 | int err; | 1467 | int err; |
920 | 1468 | ||
921 | /* | 1469 | /* |
922 | * If this is a task context, we need to check whether it is | 1470 | * In case we're installing a new context to an already running task, |
923 | * the current task context of this cpu. If not it has been | 1471 | * this could also happen before perf_event_task_sched_in() on architectures |
924 | * scheduled out before the smp call arrived. | 1472 | * which do context switches with IRQs enabled. |
925 | * Or possibly this is the right context but it isn't | ||
926 | * on this cpu because it had no events. | ||
927 | */ | 1473 | */ |
928 | if (ctx->task && cpuctx->task_ctx != ctx) { | 1474 | if (ctx->task && !cpuctx->task_ctx) |
929 | if (cpuctx->task_ctx || ctx->task != current) | 1475 | perf_event_context_sched_in(ctx, ctx->task); |
930 | return; | ||
931 | cpuctx->task_ctx = ctx; | ||
932 | } | ||
933 | 1476 | ||
934 | raw_spin_lock(&ctx->lock); | 1477 | raw_spin_lock(&ctx->lock); |
935 | ctx->is_active = 1; | 1478 | ctx->is_active = 1; |
936 | update_context_time(ctx); | 1479 | update_context_time(ctx); |
1480 | /* | ||
1481 | * update cgrp time only if current cgrp | ||
1482 | * matches event->cgrp. Must be done before | ||
1483 | * calling add_event_to_ctx() | ||
1484 | */ | ||
1485 | update_cgrp_time_from_event(event); | ||
937 | 1486 | ||
938 | add_event_to_ctx(event, ctx); | 1487 | add_event_to_ctx(event, ctx); |
939 | 1488 | ||
940 | if (event->cpu != -1 && event->cpu != smp_processor_id()) | 1489 | if (!event_filter_match(event)) |
941 | goto unlock; | 1490 | goto unlock; |
942 | 1491 | ||
943 | /* | 1492 | /* |
@@ -974,6 +1523,8 @@ static void __perf_install_in_context(void *info) | |||
974 | 1523 | ||
975 | unlock: | 1524 | unlock: |
976 | raw_spin_unlock(&ctx->lock); | 1525 | raw_spin_unlock(&ctx->lock); |
1526 | |||
1527 | return 0; | ||
977 | } | 1528 | } |
978 | 1529 | ||
979 | /* | 1530 | /* |
@@ -985,8 +1536,6 @@ unlock: | |||
985 | * If the event is attached to a task which is on a CPU we use a smp | 1536 | * If the event is attached to a task which is on a CPU we use a smp |
986 | * call to enable it in the task context. The task might have been | 1537 | * call to enable it in the task context. The task might have been |
987 | * scheduled away, but we check this in the smp call again. | 1538 | * scheduled away, but we check this in the smp call again. |
988 | * | ||
989 | * Must be called with ctx->mutex held. | ||
990 | */ | 1539 | */ |
991 | static void | 1540 | static void |
992 | perf_install_in_context(struct perf_event_context *ctx, | 1541 | perf_install_in_context(struct perf_event_context *ctx, |
@@ -995,6 +1544,8 @@ perf_install_in_context(struct perf_event_context *ctx, | |||
995 | { | 1544 | { |
996 | struct task_struct *task = ctx->task; | 1545 | struct task_struct *task = ctx->task; |
997 | 1546 | ||
1547 | lockdep_assert_held(&ctx->mutex); | ||
1548 | |||
998 | event->ctx = ctx; | 1549 | event->ctx = ctx; |
999 | 1550 | ||
1000 | if (!task) { | 1551 | if (!task) { |
@@ -1002,31 +1553,29 @@ perf_install_in_context(struct perf_event_context *ctx, | |||
1002 | * Per cpu events are installed via an smp call and | 1553 | * Per cpu events are installed via an smp call and |
1003 | * the install is always successful. | 1554 | * the install is always successful. |
1004 | */ | 1555 | */ |
1005 | smp_call_function_single(cpu, __perf_install_in_context, | 1556 | cpu_function_call(cpu, __perf_install_in_context, event); |
1006 | event, 1); | ||
1007 | return; | 1557 | return; |
1008 | } | 1558 | } |
1009 | 1559 | ||
1010 | retry: | 1560 | retry: |
1011 | task_oncpu_function_call(task, __perf_install_in_context, | 1561 | if (!task_function_call(task, __perf_install_in_context, event)) |
1012 | event); | 1562 | return; |
1013 | 1563 | ||
1014 | raw_spin_lock_irq(&ctx->lock); | 1564 | raw_spin_lock_irq(&ctx->lock); |
1015 | /* | 1565 | /* |
1016 | * we need to retry the smp call. | 1566 | * If we failed to find a running task, but find the context active now |
1567 | * that we've acquired the ctx->lock, retry. | ||
1017 | */ | 1568 | */ |
1018 | if (ctx->is_active && list_empty(&event->group_entry)) { | 1569 | if (ctx->is_active) { |
1019 | raw_spin_unlock_irq(&ctx->lock); | 1570 | raw_spin_unlock_irq(&ctx->lock); |
1020 | goto retry; | 1571 | goto retry; |
1021 | } | 1572 | } |
1022 | 1573 | ||
1023 | /* | 1574 | /* |
1024 | * The lock prevents that this context is scheduled in so we | 1575 | * Since the task isn't running, it's safe to add the event; our holding |
1025 | * can add the event safely, if it the call above did not | 1576 | * the ctx->lock ensures the task won't get scheduled in. |
1026 | * succeed. | ||
1027 | */ | 1577 | */ |
1028 | if (list_empty(&event->group_entry)) | 1578 | add_event_to_ctx(event, ctx); |
1029 | add_event_to_ctx(event, ctx); | ||
1030 | raw_spin_unlock_irq(&ctx->lock); | 1579 | raw_spin_unlock_irq(&ctx->lock); |
1031 | } | 1580 | } |
1032 | 1581 | ||
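
The reworked perf_install_in_context() above follows a try-remote-then-lock shape: ask the task's CPU to do the install; if the task was not running, take ctx->lock, and if the context became active in the meantime, retry, otherwise add the event directly. A simplified, single-threaded control-flow sketch; try_call_on_task() is a made-up stand-in for task_function_call(), and none of this is the kernel code:

#include <pthread.h>
#include <stdio.h>

struct toy_ctx {
	pthread_mutex_t lock;   /* stands in for ctx->lock */
	int is_active;          /* context currently scheduled in */
	int installed;          /* event has been added */
};

/* assumption: returns 0 when the "task" was running and the remote call ran */
static int try_call_on_task(struct toy_ctx *ctx)
{
	(void)ctx;
	return -1;              /* pretend the task was not running */
}

static void install_in_context(struct toy_ctx *ctx)
{
retry:
	if (!try_call_on_task(ctx))
		return;                          /* remote install succeeded */

	pthread_mutex_lock(&ctx->lock);
	if (ctx->is_active) {                    /* raced with sched-in: retry */
		pthread_mutex_unlock(&ctx->lock);
		goto retry;
	}
	ctx->installed = 1;                      /* lock keeps it scheduled out */
	pthread_mutex_unlock(&ctx->lock);
}

int main(void)
{
	struct toy_ctx ctx = { .is_active = 0, .installed = 0 };

	pthread_mutex_init(&ctx.lock, NULL);
	install_in_context(&ctx);
	printf("installed=%d\n", ctx.installed);
	pthread_mutex_destroy(&ctx.lock);
	return 0;
}
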
@@ -1042,21 +1591,20 @@ static void __perf_event_mark_enabled(struct perf_event *event, | |||
1042 | struct perf_event_context *ctx) | 1591 | struct perf_event_context *ctx) |
1043 | { | 1592 | { |
1044 | struct perf_event *sub; | 1593 | struct perf_event *sub; |
1594 | u64 tstamp = perf_event_time(event); | ||
1045 | 1595 | ||
1046 | event->state = PERF_EVENT_STATE_INACTIVE; | 1596 | event->state = PERF_EVENT_STATE_INACTIVE; |
1047 | event->tstamp_enabled = ctx->time - event->total_time_enabled; | 1597 | event->tstamp_enabled = tstamp - event->total_time_enabled; |
1048 | list_for_each_entry(sub, &event->sibling_list, group_entry) { | 1598 | list_for_each_entry(sub, &event->sibling_list, group_entry) { |
1049 | if (sub->state >= PERF_EVENT_STATE_INACTIVE) { | 1599 | if (sub->state >= PERF_EVENT_STATE_INACTIVE) |
1050 | sub->tstamp_enabled = | 1600 | sub->tstamp_enabled = tstamp - sub->total_time_enabled; |
1051 | ctx->time - sub->total_time_enabled; | ||
1052 | } | ||
1053 | } | 1601 | } |
1054 | } | 1602 | } |
1055 | 1603 | ||
1056 | /* | 1604 | /* |
1057 | * Cross CPU call to enable a performance event | 1605 | * Cross CPU call to enable a performance event |
1058 | */ | 1606 | */ |
1059 | static void __perf_event_enable(void *info) | 1607 | static int __perf_event_enable(void *info) |
1060 | { | 1608 | { |
1061 | struct perf_event *event = info; | 1609 | struct perf_event *event = info; |
1062 | struct perf_event_context *ctx = event->ctx; | 1610 | struct perf_event_context *ctx = event->ctx; |
@@ -1064,26 +1612,27 @@ static void __perf_event_enable(void *info) | |||
1064 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); | 1612 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); |
1065 | int err; | 1613 | int err; |
1066 | 1614 | ||
1067 | /* | 1615 | if (WARN_ON_ONCE(!ctx->is_active)) |
1068 | * If this is a per-task event, need to check whether this | 1616 | return -EINVAL; |
1069 | * event's task is the current task on this cpu. | ||
1070 | */ | ||
1071 | if (ctx->task && cpuctx->task_ctx != ctx) { | ||
1072 | if (cpuctx->task_ctx || ctx->task != current) | ||
1073 | return; | ||
1074 | cpuctx->task_ctx = ctx; | ||
1075 | } | ||
1076 | 1617 | ||
1077 | raw_spin_lock(&ctx->lock); | 1618 | raw_spin_lock(&ctx->lock); |
1078 | ctx->is_active = 1; | ||
1079 | update_context_time(ctx); | 1619 | update_context_time(ctx); |
1080 | 1620 | ||
1081 | if (event->state >= PERF_EVENT_STATE_INACTIVE) | 1621 | if (event->state >= PERF_EVENT_STATE_INACTIVE) |
1082 | goto unlock; | 1622 | goto unlock; |
1623 | |||
1624 | /* | ||
1625 | * set current task's cgroup time reference point | ||
1626 | */ | ||
1627 | perf_cgroup_set_timestamp(current, ctx); | ||
1628 | |||
1083 | __perf_event_mark_enabled(event, ctx); | 1629 | __perf_event_mark_enabled(event, ctx); |
1084 | 1630 | ||
1085 | if (event->cpu != -1 && event->cpu != smp_processor_id()) | 1631 | if (!event_filter_match(event)) { |
1632 | if (is_cgroup_event(event)) | ||
1633 | perf_cgroup_defer_enabled(event); | ||
1086 | goto unlock; | 1634 | goto unlock; |
1635 | } | ||
1087 | 1636 | ||
1088 | /* | 1637 | /* |
1089 | * If the event is in a group and isn't the group leader, | 1638 | * If the event is in a group and isn't the group leader, |
@@ -1116,6 +1665,8 @@ static void __perf_event_enable(void *info) | |||
1116 | 1665 | ||
1117 | unlock: | 1666 | unlock: |
1118 | raw_spin_unlock(&ctx->lock); | 1667 | raw_spin_unlock(&ctx->lock); |
1668 | |||
1669 | return 0; | ||
1119 | } | 1670 | } |
1120 | 1671 | ||
1121 | /* | 1672 | /* |
@@ -1136,8 +1687,7 @@ void perf_event_enable(struct perf_event *event) | |||
1136 | /* | 1687 | /* |
1137 | * Enable the event on the cpu that it's on | 1688 | * Enable the event on the cpu that it's on |
1138 | */ | 1689 | */ |
1139 | smp_call_function_single(event->cpu, __perf_event_enable, | 1690 | cpu_function_call(event->cpu, __perf_event_enable, event); |
1140 | event, 1); | ||
1141 | return; | 1691 | return; |
1142 | } | 1692 | } |
1143 | 1693 | ||
@@ -1156,8 +1706,15 @@ void perf_event_enable(struct perf_event *event) | |||
1156 | event->state = PERF_EVENT_STATE_OFF; | 1706 | event->state = PERF_EVENT_STATE_OFF; |
1157 | 1707 | ||
1158 | retry: | 1708 | retry: |
1709 | if (!ctx->is_active) { | ||
1710 | __perf_event_mark_enabled(event, ctx); | ||
1711 | goto out; | ||
1712 | } | ||
1713 | |||
1159 | raw_spin_unlock_irq(&ctx->lock); | 1714 | raw_spin_unlock_irq(&ctx->lock); |
1160 | task_oncpu_function_call(task, __perf_event_enable, event); | 1715 | |
1716 | if (!task_function_call(task, __perf_event_enable, event)) | ||
1717 | return; | ||
1161 | 1718 | ||
1162 | raw_spin_lock_irq(&ctx->lock); | 1719 | raw_spin_lock_irq(&ctx->lock); |
1163 | 1720 | ||
@@ -1165,15 +1722,14 @@ retry: | |||
1165 | * If the context is active and the event is still off, | 1722 | * If the context is active and the event is still off, |
1166 | * we need to retry the cross-call. | 1723 | * we need to retry the cross-call. |
1167 | */ | 1724 | */ |
1168 | if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) | 1725 | if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) { |
1726 | /* | ||
1727 | * task could have been flipped by a concurrent | ||
1728 | * perf_event_context_sched_out() | ||
1729 | */ | ||
1730 | task = ctx->task; | ||
1169 | goto retry; | 1731 | goto retry; |
1170 | 1732 | } | |
1171 | /* | ||
1172 | * Since we have the lock this context can't be scheduled | ||
1173 | * in, so we can change the state safely. | ||
1174 | */ | ||
1175 | if (event->state == PERF_EVENT_STATE_OFF) | ||
1176 | __perf_event_mark_enabled(event, ctx); | ||
1177 | 1733 | ||
1178 | out: | 1734 | out: |
1179 | raw_spin_unlock_irq(&ctx->lock); | 1735 | raw_spin_unlock_irq(&ctx->lock); |
@@ -1193,12 +1749,6 @@ static int perf_event_refresh(struct perf_event *event, int refresh) | |||
1193 | return 0; | 1749 | return 0; |
1194 | } | 1750 | } |
1195 | 1751 | ||
1196 | enum event_type_t { | ||
1197 | EVENT_FLEXIBLE = 0x1, | ||
1198 | EVENT_PINNED = 0x2, | ||
1199 | EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, | ||
1200 | }; | ||
1201 | |||
1202 | static void ctx_sched_out(struct perf_event_context *ctx, | 1752 | static void ctx_sched_out(struct perf_event_context *ctx, |
1203 | struct perf_cpu_context *cpuctx, | 1753 | struct perf_cpu_context *cpuctx, |
1204 | enum event_type_t event_type) | 1754 | enum event_type_t event_type) |
@@ -1211,6 +1761,7 @@ static void ctx_sched_out(struct perf_event_context *ctx, | |||
1211 | if (likely(!ctx->nr_events)) | 1761 | if (likely(!ctx->nr_events)) |
1212 | goto out; | 1762 | goto out; |
1213 | update_context_time(ctx); | 1763 | update_context_time(ctx); |
1764 | update_cgrp_time_from_cpuctx(cpuctx); | ||
1214 | 1765 | ||
1215 | if (!ctx->nr_active) | 1766 | if (!ctx->nr_active) |
1216 | goto out; | 1767 | goto out; |
@@ -1323,8 +1874,8 @@ static void perf_event_sync_stat(struct perf_event_context *ctx, | |||
1323 | } | 1874 | } |
1324 | } | 1875 | } |
1325 | 1876 | ||
1326 | void perf_event_context_sched_out(struct task_struct *task, int ctxn, | 1877 | static void perf_event_context_sched_out(struct task_struct *task, int ctxn, |
1327 | struct task_struct *next) | 1878 | struct task_struct *next) |
1328 | { | 1879 | { |
1329 | struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; | 1880 | struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; |
1330 | struct perf_event_context *next_ctx; | 1881 | struct perf_event_context *next_ctx; |
@@ -1400,6 +1951,14 @@ void __perf_event_task_sched_out(struct task_struct *task, | |||
1400 | 1951 | ||
1401 | for_each_task_context_nr(ctxn) | 1952 | for_each_task_context_nr(ctxn) |
1402 | perf_event_context_sched_out(task, ctxn, next); | 1953 | perf_event_context_sched_out(task, ctxn, next); |
1954 | |||
1955 | /* | ||
1956 | * if cgroup events exist on this CPU, then we need | ||
1957 | * to check if we have to switch out PMU state. | ||
1958 | * cgroup events are in system-wide mode only | ||
1959 | */ | ||
1960 | if (atomic_read(&__get_cpu_var(perf_cgroup_events))) | ||
1961 | perf_cgroup_sched_out(task); | ||
1403 | } | 1962 | } |
1404 | 1963 | ||
1405 | static void task_ctx_sched_out(struct perf_event_context *ctx, | 1964 | static void task_ctx_sched_out(struct perf_event_context *ctx, |
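
Both context-switch paths now gate the cgroup work on a counter of live cgroup events, so tasks that never use cgroup events only pay for one read. A hypothetical userspace sketch of that gating idea; the kernel actually uses a per-cpu counter plus the perf_sched_events jump label, the plain atomic here is only an illustration:

#include <stdatomic.h>
#include <stdio.h>

static atomic_int cgroup_events;        /* stand-in for perf_cgroup_events */

static void cgroup_switch(void)
{
	puts("switching cgroup PMU state");
}

static void sched_out(void)
{
	/* fast path: skip the expensive work unless a cgroup event exists */
	if (atomic_load(&cgroup_events))
		cgroup_switch();
}

int main(void)
{
	sched_out();                            /* gate closed: no output */
	atomic_fetch_add(&cgroup_events, 1);    /* a cgroup event is created */
	sched_out();                            /* gate open: does the work */
	return 0;
}
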
@@ -1435,9 +1994,13 @@ ctx_pinned_sched_in(struct perf_event_context *ctx, | |||
1435 | list_for_each_entry(event, &ctx->pinned_groups, group_entry) { | 1994 | list_for_each_entry(event, &ctx->pinned_groups, group_entry) { |
1436 | if (event->state <= PERF_EVENT_STATE_OFF) | 1995 | if (event->state <= PERF_EVENT_STATE_OFF) |
1437 | continue; | 1996 | continue; |
1438 | if (event->cpu != -1 && event->cpu != smp_processor_id()) | 1997 | if (!event_filter_match(event)) |
1439 | continue; | 1998 | continue; |
1440 | 1999 | ||
2000 | /* may need to reset tstamp_enabled */ | ||
2001 | if (is_cgroup_event(event)) | ||
2002 | perf_cgroup_mark_enabled(event, ctx); | ||
2003 | |||
1441 | if (group_can_go_on(event, cpuctx, 1)) | 2004 | if (group_can_go_on(event, cpuctx, 1)) |
1442 | group_sched_in(event, cpuctx, ctx); | 2005 | group_sched_in(event, cpuctx, ctx); |
1443 | 2006 | ||
@@ -1467,9 +2030,13 @@ ctx_flexible_sched_in(struct perf_event_context *ctx, | |||
1467 | * Listen to the 'cpu' scheduling filter constraint | 2030 | * Listen to the 'cpu' scheduling filter constraint |
1468 | * of events: | 2031 | * of events: |
1469 | */ | 2032 | */ |
1470 | if (event->cpu != -1 && event->cpu != smp_processor_id()) | 2033 | if (!event_filter_match(event)) |
1471 | continue; | 2034 | continue; |
1472 | 2035 | ||
2036 | /* may need to reset tstamp_enabled */ | ||
2037 | if (is_cgroup_event(event)) | ||
2038 | perf_cgroup_mark_enabled(event, ctx); | ||
2039 | |||
1473 | if (group_can_go_on(event, cpuctx, can_add_hw)) { | 2040 | if (group_can_go_on(event, cpuctx, can_add_hw)) { |
1474 | if (group_sched_in(event, cpuctx, ctx)) | 2041 | if (group_sched_in(event, cpuctx, ctx)) |
1475 | can_add_hw = 0; | 2042 | can_add_hw = 0; |
@@ -1480,15 +2047,19 @@ ctx_flexible_sched_in(struct perf_event_context *ctx, | |||
1480 | static void | 2047 | static void |
1481 | ctx_sched_in(struct perf_event_context *ctx, | 2048 | ctx_sched_in(struct perf_event_context *ctx, |
1482 | struct perf_cpu_context *cpuctx, | 2049 | struct perf_cpu_context *cpuctx, |
1483 | enum event_type_t event_type) | 2050 | enum event_type_t event_type, |
2051 | struct task_struct *task) | ||
1484 | { | 2052 | { |
2053 | u64 now; | ||
2054 | |||
1485 | raw_spin_lock(&ctx->lock); | 2055 | raw_spin_lock(&ctx->lock); |
1486 | ctx->is_active = 1; | 2056 | ctx->is_active = 1; |
1487 | if (likely(!ctx->nr_events)) | 2057 | if (likely(!ctx->nr_events)) |
1488 | goto out; | 2058 | goto out; |
1489 | 2059 | ||
1490 | ctx->timestamp = perf_clock(); | 2060 | now = perf_clock(); |
1491 | 2061 | ctx->timestamp = now; | |
2062 | perf_cgroup_set_timestamp(task, ctx); | ||
1492 | /* | 2063 | /* |
1493 | * First go through the list and put on any pinned groups | 2064 | * First go through the list and put on any pinned groups |
1494 | * in order to give them the best chance of going on. | 2065 | * in order to give them the best chance of going on. |
@@ -1505,11 +2076,12 @@ out: | |||
1505 | } | 2076 | } |
1506 | 2077 | ||
1507 | static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, | 2078 | static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, |
1508 | enum event_type_t event_type) | 2079 | enum event_type_t event_type, |
2080 | struct task_struct *task) | ||
1509 | { | 2081 | { |
1510 | struct perf_event_context *ctx = &cpuctx->ctx; | 2082 | struct perf_event_context *ctx = &cpuctx->ctx; |
1511 | 2083 | ||
1512 | ctx_sched_in(ctx, cpuctx, event_type); | 2084 | ctx_sched_in(ctx, cpuctx, event_type, task); |
1513 | } | 2085 | } |
1514 | 2086 | ||
1515 | static void task_ctx_sched_in(struct perf_event_context *ctx, | 2087 | static void task_ctx_sched_in(struct perf_event_context *ctx, |
@@ -1517,15 +2089,16 @@ static void task_ctx_sched_in(struct perf_event_context *ctx, | |||
1517 | { | 2089 | { |
1518 | struct perf_cpu_context *cpuctx; | 2090 | struct perf_cpu_context *cpuctx; |
1519 | 2091 | ||
1520 | cpuctx = __get_cpu_context(ctx); | 2092 | cpuctx = __get_cpu_context(ctx); |
1521 | if (cpuctx->task_ctx == ctx) | 2093 | if (cpuctx->task_ctx == ctx) |
1522 | return; | 2094 | return; |
1523 | 2095 | ||
1524 | ctx_sched_in(ctx, cpuctx, event_type); | 2096 | ctx_sched_in(ctx, cpuctx, event_type, NULL); |
1525 | cpuctx->task_ctx = ctx; | 2097 | cpuctx->task_ctx = ctx; |
1526 | } | 2098 | } |
1527 | 2099 | ||
1528 | void perf_event_context_sched_in(struct perf_event_context *ctx) | 2100 | static void perf_event_context_sched_in(struct perf_event_context *ctx, |
2101 | struct task_struct *task) | ||
1529 | { | 2102 | { |
1530 | struct perf_cpu_context *cpuctx; | 2103 | struct perf_cpu_context *cpuctx; |
1531 | 2104 | ||
@@ -1541,9 +2114,9 @@ void perf_event_context_sched_in(struct perf_event_context *ctx) | |||
1541 | */ | 2114 | */ |
1542 | cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); | 2115 | cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); |
1543 | 2116 | ||
1544 | ctx_sched_in(ctx, cpuctx, EVENT_PINNED); | 2117 | ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task); |
1545 | cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); | 2118 | cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task); |
1546 | ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE); | 2119 | ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task); |
1547 | 2120 | ||
1548 | cpuctx->task_ctx = ctx; | 2121 | cpuctx->task_ctx = ctx; |
1549 | 2122 | ||
@@ -1576,14 +2149,17 @@ void __perf_event_task_sched_in(struct task_struct *task) | |||
1576 | if (likely(!ctx)) | 2149 | if (likely(!ctx)) |
1577 | continue; | 2150 | continue; |
1578 | 2151 | ||
1579 | perf_event_context_sched_in(ctx); | 2152 | perf_event_context_sched_in(ctx, task); |
1580 | } | 2153 | } |
2154 | /* | ||
2155 | * if cgroup events exist on this CPU, then we need | ||
2156 | * to check if we have to switch in PMU state. | ||
2157 | * cgroup events are in system-wide mode only | ||
2158 | */ | ||
2159 | if (atomic_read(&__get_cpu_var(perf_cgroup_events))) | ||
2160 | perf_cgroup_sched_in(task); | ||
1581 | } | 2161 | } |
1582 | 2162 | ||
1583 | #define MAX_INTERRUPTS (~0ULL) | ||
1584 | |||
1585 | static void perf_log_throttle(struct perf_event *event, int enable); | ||
1586 | |||
1587 | static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) | 2163 | static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) |
1588 | { | 2164 | { |
1589 | u64 frequency = event->attr.sample_freq; | 2165 | u64 frequency = event->attr.sample_freq; |
@@ -1611,7 +2187,7 @@ static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) | |||
1611 | * Reduce accuracy by one bit such that @a and @b converge | 2187 | * Reduce accuracy by one bit such that @a and @b converge |
1612 | * to a similar magnitude. | 2188 | * to a similar magnitude. |
1613 | */ | 2189 | */ |
1614 | #define REDUCE_FLS(a, b) \ | 2190 | #define REDUCE_FLS(a, b) \ |
1615 | do { \ | 2191 | do { \ |
1616 | if (a##_fls > b##_fls) { \ | 2192 | if (a##_fls > b##_fls) { \ |
1617 | a >>= 1; \ | 2193 | a >>= 1; \ |
@@ -1694,7 +2270,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period) | |||
1694 | if (event->state != PERF_EVENT_STATE_ACTIVE) | 2270 | if (event->state != PERF_EVENT_STATE_ACTIVE) |
1695 | continue; | 2271 | continue; |
1696 | 2272 | ||
1697 | if (event->cpu != -1 && event->cpu != smp_processor_id()) | 2273 | if (!event_filter_match(event)) |
1698 | continue; | 2274 | continue; |
1699 | 2275 | ||
1700 | hwc = &event->hw; | 2276 | hwc = &event->hw; |
@@ -1781,7 +2357,7 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx) | |||
1781 | if (ctx) | 2357 | if (ctx) |
1782 | rotate_ctx(ctx); | 2358 | rotate_ctx(ctx); |
1783 | 2359 | ||
1784 | cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); | 2360 | cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, current); |
1785 | if (ctx) | 2361 | if (ctx) |
1786 | task_ctx_sched_in(ctx, EVENT_FLEXIBLE); | 2362 | task_ctx_sched_in(ctx, EVENT_FLEXIBLE); |
1787 | 2363 | ||
@@ -1860,7 +2436,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx) | |||
1860 | 2436 | ||
1861 | raw_spin_unlock(&ctx->lock); | 2437 | raw_spin_unlock(&ctx->lock); |
1862 | 2438 | ||
1863 | perf_event_context_sched_in(ctx); | 2439 | perf_event_context_sched_in(ctx, ctx->task); |
1864 | out: | 2440 | out: |
1865 | local_irq_restore(flags); | 2441 | local_irq_restore(flags); |
1866 | } | 2442 | } |
@@ -1885,11 +2461,14 @@ static void __perf_event_read(void *info) | |||
1885 | return; | 2461 | return; |
1886 | 2462 | ||
1887 | raw_spin_lock(&ctx->lock); | 2463 | raw_spin_lock(&ctx->lock); |
1888 | update_context_time(ctx); | 2464 | if (ctx->is_active) { |
2465 | update_context_time(ctx); | ||
2466 | update_cgrp_time_from_event(event); | ||
2467 | } | ||
1889 | update_event_times(event); | 2468 | update_event_times(event); |
2469 | if (event->state == PERF_EVENT_STATE_ACTIVE) | ||
2470 | event->pmu->read(event); | ||
1890 | raw_spin_unlock(&ctx->lock); | 2471 | raw_spin_unlock(&ctx->lock); |
1891 | |||
1892 | event->pmu->read(event); | ||
1893 | } | 2472 | } |
1894 | 2473 | ||
1895 | static inline u64 perf_event_count(struct perf_event *event) | 2474 | static inline u64 perf_event_count(struct perf_event *event) |
@@ -1916,8 +2495,10 @@ static u64 perf_event_read(struct perf_event *event) | |||
1916 | * (e.g., thread is blocked), in that case | 2495 | * (e.g., thread is blocked), in that case |
1917 | * we cannot update context time | 2496 | * we cannot update context time |
1918 | */ | 2497 | */ |
1919 | if (ctx->is_active) | 2498 | if (ctx->is_active) { |
1920 | update_context_time(ctx); | 2499 | update_context_time(ctx); |
2500 | update_cgrp_time_from_event(event); | ||
2501 | } | ||
1921 | update_event_times(event); | 2502 | update_event_times(event); |
1922 | raw_spin_unlock_irqrestore(&ctx->lock, flags); | 2503 | raw_spin_unlock_irqrestore(&ctx->lock, flags); |
1923 | } | 2504 | } |
@@ -1983,8 +2564,7 @@ static int alloc_callchain_buffers(void) | |||
1983 | * accessed from NMI. Use a temporary manual per cpu allocation | 2564 | * accessed from NMI. Use a temporary manual per cpu allocation |
1984 | * until that gets sorted out. | 2565 | * until that gets sorted out. |
1985 | */ | 2566 | */ |
1986 | size = sizeof(*entries) + sizeof(struct perf_callchain_entry *) * | 2567 | size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]); |
1987 | num_possible_cpus(); | ||
1988 | 2568 | ||
1989 | entries = kzalloc(size, GFP_KERNEL); | 2569 | entries = kzalloc(size, GFP_KERNEL); |
1990 | if (!entries) | 2570 | if (!entries) |
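
The callchain allocation above now sizes the buffer with offsetof() over the end of the flexible array instead of a sizeof plus multiply. A self-contained sketch of the idiom; NR_CPUS is a made-up constant standing in for nr_cpu_ids:

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

enum { NR_CPUS = 8 };                    /* assumption: stand-in for nr_cpu_ids */

struct entries {
	int   refcount;
	void *cpu_entries[];             /* flexible array member */
};

int main(void)
{
	/* header plus NR_CPUS pointers, computed in a single expression */
	size_t size = offsetof(struct entries, cpu_entries[NR_CPUS]);
	struct entries *e = calloc(1, size);

	printf("allocated %zu bytes for %d cpus\n", size, NR_CPUS);
	free(e);
	return 0;
}
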
@@ -2185,13 +2765,6 @@ find_lively_task_by_vpid(pid_t vpid) | |||
2185 | if (!task) | 2765 | if (!task) |
2186 | return ERR_PTR(-ESRCH); | 2766 | return ERR_PTR(-ESRCH); |
2187 | 2767 | ||
2188 | /* | ||
2189 | * Can't attach events to a dying task. | ||
2190 | */ | ||
2191 | err = -ESRCH; | ||
2192 | if (task->flags & PF_EXITING) | ||
2193 | goto errout; | ||
2194 | |||
2195 | /* Reuse ptrace permission checks for now. */ | 2768 | /* Reuse ptrace permission checks for now. */ |
2196 | err = -EACCES; | 2769 | err = -EACCES; |
2197 | if (!ptrace_may_access(task, PTRACE_MODE_READ)) | 2770 | if (!ptrace_may_access(task, PTRACE_MODE_READ)) |
@@ -2204,6 +2777,9 @@ errout: | |||
2204 | 2777 | ||
2205 | } | 2778 | } |
2206 | 2779 | ||
2780 | /* | ||
2781 | * Returns a matching context with refcount and pincount. | ||
2782 | */ | ||
2207 | static struct perf_event_context * | 2783 | static struct perf_event_context * |
2208 | find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) | 2784 | find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) |
2209 | { | 2785 | { |
@@ -2212,14 +2788,11 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) | |||
2212 | unsigned long flags; | 2788 | unsigned long flags; |
2213 | int ctxn, err; | 2789 | int ctxn, err; |
2214 | 2790 | ||
2215 | if (!task && cpu != -1) { | 2791 | if (!task) { |
2216 | /* Must be root to operate on a CPU event: */ | 2792 | /* Must be root to operate on a CPU event: */ |
2217 | if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) | 2793 | if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) |
2218 | return ERR_PTR(-EACCES); | 2794 | return ERR_PTR(-EACCES); |
2219 | 2795 | ||
2220 | if (cpu < 0 || cpu >= nr_cpumask_bits) | ||
2221 | return ERR_PTR(-EINVAL); | ||
2222 | |||
2223 | /* | 2796 | /* |
2224 | * We could be clever and allow to attach a event to an | 2797 | * We could be clever and allow to attach a event to an |
2225 | * offline CPU and activate it when the CPU comes up, but | 2798 | * offline CPU and activate it when the CPU comes up, but |
@@ -2231,6 +2804,7 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) | |||
2231 | cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); | 2804 | cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); |
2232 | ctx = &cpuctx->ctx; | 2805 | ctx = &cpuctx->ctx; |
2233 | get_ctx(ctx); | 2806 | get_ctx(ctx); |
2807 | ++ctx->pin_count; | ||
2234 | 2808 | ||
2235 | return ctx; | 2809 | return ctx; |
2236 | } | 2810 | } |
@@ -2244,6 +2818,7 @@ retry: | |||
2244 | ctx = perf_lock_task_context(task, ctxn, &flags); | 2818 | ctx = perf_lock_task_context(task, ctxn, &flags); |
2245 | if (ctx) { | 2819 | if (ctx) { |
2246 | unclone_ctx(ctx); | 2820 | unclone_ctx(ctx); |
2821 | ++ctx->pin_count; | ||
2247 | raw_spin_unlock_irqrestore(&ctx->lock, flags); | 2822 | raw_spin_unlock_irqrestore(&ctx->lock, flags); |
2248 | } | 2823 | } |
2249 | 2824 | ||
@@ -2255,14 +2830,29 @@ retry: | |||
2255 | 2830 | ||
2256 | get_ctx(ctx); | 2831 | get_ctx(ctx); |
2257 | 2832 | ||
2258 | if (cmpxchg(&task->perf_event_ctxp[ctxn], NULL, ctx)) { | 2833 | err = 0; |
2259 | /* | 2834 | mutex_lock(&task->perf_event_mutex); |
2260 | * We raced with some other task; use | 2835 | /* |
2261 | * the context they set. | 2836 | * If it has already passed perf_event_exit_task(), |
2262 | */ | 2837 | * we must see PF_EXITING; it takes this mutex too. |
2838 | */ | ||
2839 | if (task->flags & PF_EXITING) | ||
2840 | err = -ESRCH; | ||
2841 | else if (task->perf_event_ctxp[ctxn]) | ||
2842 | err = -EAGAIN; | ||
2843 | else { | ||
2844 | ++ctx->pin_count; | ||
2845 | rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx); | ||
2846 | } | ||
2847 | mutex_unlock(&task->perf_event_mutex); | ||
2848 | |||
2849 | if (unlikely(err)) { | ||
2263 | put_task_struct(task); | 2850 | put_task_struct(task); |
2264 | kfree(ctx); | 2851 | kfree(ctx); |
2265 | goto retry; | 2852 | |
2853 | if (err == -EAGAIN) | ||
2854 | goto retry; | ||
2855 | goto errout; | ||
2266 | } | 2856 | } |
2267 | } | 2857 | } |
2268 | 2858 | ||
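
find_get_context() now publishes the new context under task->perf_event_mutex instead of a bare cmpxchg, which lets it distinguish "task is exiting" (-ESRCH) from "somebody else installed a context first" (-EAGAIN, retry). A toy model of that decision with invented structures; it only mirrors the error-code logic, not the kernel data:

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

struct toy_task {
	pthread_mutex_t lock;    /* stands in for perf_event_mutex */
	int             exiting; /* stands in for PF_EXITING */
	void           *ctx;     /* stands in for perf_event_ctxp[ctxn] */
};

static int publish_ctx(struct toy_task *t, void *new_ctx)
{
	int err = 0;

	pthread_mutex_lock(&t->lock);
	if (t->exiting)
		err = -ESRCH;        /* task already ran its exit path */
	else if (t->ctx)
		err = -EAGAIN;       /* lost the race: caller retries */
	else
		t->ctx = new_ctx;    /* we won: publish our context */
	pthread_mutex_unlock(&t->lock);

	return err;
}

int main(void)
{
	struct toy_task t = { .exiting = 0, .ctx = NULL };
	int a, b;

	pthread_mutex_init(&t.lock, NULL);
	printf("first:  %d\n", publish_ctx(&t, &a));   /* 0 */
	printf("second: %d\n", publish_ctx(&t, &b));   /* -EAGAIN */
	pthread_mutex_destroy(&t.lock);
	return 0;
}
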
@@ -2293,7 +2883,7 @@ static void free_event(struct perf_event *event) | |||
2293 | 2883 | ||
2294 | if (!event->parent) { | 2884 | if (!event->parent) { |
2295 | if (event->attach_state & PERF_ATTACH_TASK) | 2885 | if (event->attach_state & PERF_ATTACH_TASK) |
2296 | jump_label_dec(&perf_task_events); | 2886 | jump_label_dec(&perf_sched_events); |
2297 | if (event->attr.mmap || event->attr.mmap_data) | 2887 | if (event->attr.mmap || event->attr.mmap_data) |
2298 | atomic_dec(&nr_mmap_events); | 2888 | atomic_dec(&nr_mmap_events); |
2299 | if (event->attr.comm) | 2889 | if (event->attr.comm) |
@@ -2302,6 +2892,10 @@ static void free_event(struct perf_event *event) | |||
2302 | atomic_dec(&nr_task_events); | 2892 | atomic_dec(&nr_task_events); |
2303 | if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) | 2893 | if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) |
2304 | put_callchain_buffers(); | 2894 | put_callchain_buffers(); |
2895 | if (is_cgroup_event(event)) { | ||
2896 | atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); | ||
2897 | jump_label_dec(&perf_sched_events); | ||
2898 | } | ||
2305 | } | 2899 | } |
2306 | 2900 | ||
2307 | if (event->buffer) { | 2901 | if (event->buffer) { |
@@ -2309,6 +2903,9 @@ static void free_event(struct perf_event *event) | |||
2309 | event->buffer = NULL; | 2903 | event->buffer = NULL; |
2310 | } | 2904 | } |
2311 | 2905 | ||
2906 | if (is_cgroup_event(event)) | ||
2907 | perf_detach_cgroup(event); | ||
2908 | |||
2312 | if (event->destroy) | 2909 | if (event->destroy) |
2313 | event->destroy(event); | 2910 | event->destroy(event); |
2314 | 2911 | ||
@@ -3893,7 +4490,7 @@ static int perf_event_task_match(struct perf_event *event) | |||
3893 | if (event->state < PERF_EVENT_STATE_INACTIVE) | 4490 | if (event->state < PERF_EVENT_STATE_INACTIVE) |
3894 | return 0; | 4491 | return 0; |
3895 | 4492 | ||
3896 | if (event->cpu != -1 && event->cpu != smp_processor_id()) | 4493 | if (!event_filter_match(event)) |
3897 | return 0; | 4494 | return 0; |
3898 | 4495 | ||
3899 | if (event->attr.comm || event->attr.mmap || | 4496 | if (event->attr.comm || event->attr.mmap || |
@@ -4030,7 +4627,7 @@ static int perf_event_comm_match(struct perf_event *event) | |||
4030 | if (event->state < PERF_EVENT_STATE_INACTIVE) | 4627 | if (event->state < PERF_EVENT_STATE_INACTIVE) |
4031 | return 0; | 4628 | return 0; |
4032 | 4629 | ||
4033 | if (event->cpu != -1 && event->cpu != smp_processor_id()) | 4630 | if (!event_filter_match(event)) |
4034 | return 0; | 4631 | return 0; |
4035 | 4632 | ||
4036 | if (event->attr.comm) | 4633 | if (event->attr.comm) |
@@ -4178,7 +4775,7 @@ static int perf_event_mmap_match(struct perf_event *event, | |||
4178 | if (event->state < PERF_EVENT_STATE_INACTIVE) | 4775 | if (event->state < PERF_EVENT_STATE_INACTIVE) |
4179 | return 0; | 4776 | return 0; |
4180 | 4777 | ||
4181 | if (event->cpu != -1 && event->cpu != smp_processor_id()) | 4778 | if (!event_filter_match(event)) |
4182 | return 0; | 4779 | return 0; |
4183 | 4780 | ||
4184 | if ((!executable && event->attr.mmap_data) || | 4781 | if ((!executable && event->attr.mmap_data) || |
@@ -4376,26 +4973,14 @@ static int __perf_event_overflow(struct perf_event *event, int nmi, | |||
4376 | if (unlikely(!is_sampling_event(event))) | 4973 | if (unlikely(!is_sampling_event(event))) |
4377 | return 0; | 4974 | return 0; |
4378 | 4975 | ||
4379 | if (!throttle) { | 4976 | if (unlikely(hwc->interrupts >= max_samples_per_tick)) { |
4380 | hwc->interrupts++; | 4977 | if (throttle) { |
4381 | } else { | 4978 | hwc->interrupts = MAX_INTERRUPTS; |
4382 | if (hwc->interrupts != MAX_INTERRUPTS) { | 4979 | perf_log_throttle(event, 0); |
4383 | hwc->interrupts++; | ||
4384 | if (HZ * hwc->interrupts > | ||
4385 | (u64)sysctl_perf_event_sample_rate) { | ||
4386 | hwc->interrupts = MAX_INTERRUPTS; | ||
4387 | perf_log_throttle(event, 0); | ||
4388 | ret = 1; | ||
4389 | } | ||
4390 | } else { | ||
4391 | /* | ||
4392 | * Keep re-disabling events even though on the previous | ||
4393 | * pass we disabled it - just in case we raced with a | ||
4394 | * sched-in and the event got enabled again: | ||
4395 | */ | ||
4396 | ret = 1; | 4980 | ret = 1; |
4397 | } | 4981 | } |
4398 | } | 4982 | } else |
4983 | hwc->interrupts++; | ||
4399 | 4984 | ||
4400 | if (event->attr.freq) { | 4985 | if (event->attr.freq) { |
4401 | u64 now = perf_clock(); | 4986 | u64 now = perf_clock(); |
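
The rewritten overflow path compares the per-event interrupt count against max_samples_per_tick up front: below the limit it just counts, at the limit it saturates the counter at MAX_INTERRUPTS and reports a throttle, and the unthrottle path seen in event_sched_in() earlier resets the counter to zero. A standalone sketch of that counter logic with a small made-up limit:

#include <stdint.h>
#include <stdio.h>

#define MAX_INTERRUPTS (~0ULL)

static uint64_t max_samples_per_tick = 4;   /* assumption: tiny limit for demo */
static uint64_t interrupts;

/* returns 1 when the event should be throttled */
static int overflow(int throttle)
{
	if (interrupts >= max_samples_per_tick) {
		if (throttle) {
			interrupts = MAX_INTERRUPTS;  /* saturate */
			return 1;
		}
	} else {
		interrupts++;
	}
	return 0;
}

/* what the sched-in/unthrottle path does */
static void unthrottle(void)
{
	if (interrupts == MAX_INTERRUPTS)
		interrupts = 0;
}

int main(void)
{
	for (int i = 0; i < 6; i++)
		printf("overflow %d -> throttled=%d\n", i, overflow(1));
	unthrottle();
	printf("after unthrottle -> throttled=%d\n", overflow(1));
	return 0;
}
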
@@ -4648,7 +5233,7 @@ int perf_swevent_get_recursion_context(void) | |||
4648 | } | 5233 | } |
4649 | EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); | 5234 | EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); |
4650 | 5235 | ||
4651 | void inline perf_swevent_put_recursion_context(int rctx) | 5236 | inline void perf_swevent_put_recursion_context(int rctx) |
4652 | { | 5237 | { |
4653 | struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); | 5238 | struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); |
4654 | 5239 | ||
@@ -5032,6 +5617,10 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) | |||
5032 | u64 period; | 5617 | u64 period; |
5033 | 5618 | ||
5034 | event = container_of(hrtimer, struct perf_event, hw.hrtimer); | 5619 | event = container_of(hrtimer, struct perf_event, hw.hrtimer); |
5620 | |||
5621 | if (event->state != PERF_EVENT_STATE_ACTIVE) | ||
5622 | return HRTIMER_NORESTART; | ||
5623 | |||
5035 | event->pmu->read(event); | 5624 | event->pmu->read(event); |
5036 | 5625 | ||
5037 | perf_sample_data_init(&data, 0); | 5626 | perf_sample_data_init(&data, 0); |
@@ -5058,9 +5647,6 @@ static void perf_swevent_start_hrtimer(struct perf_event *event) | |||
5058 | if (!is_sampling_event(event)) | 5647 | if (!is_sampling_event(event)) |
5059 | return; | 5648 | return; |
5060 | 5649 | ||
5061 | hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
5062 | hwc->hrtimer.function = perf_swevent_hrtimer; | ||
5063 | |||
5064 | period = local64_read(&hwc->period_left); | 5650 | period = local64_read(&hwc->period_left); |
5065 | if (period) { | 5651 | if (period) { |
5066 | if (period < 0) | 5652 | if (period < 0) |
@@ -5087,6 +5673,30 @@ static void perf_swevent_cancel_hrtimer(struct perf_event *event) | |||
5087 | } | 5673 | } |
5088 | } | 5674 | } |
5089 | 5675 | ||
5676 | static void perf_swevent_init_hrtimer(struct perf_event *event) | ||
5677 | { | ||
5678 | struct hw_perf_event *hwc = &event->hw; | ||
5679 | |||
5680 | if (!is_sampling_event(event)) | ||
5681 | return; | ||
5682 | |||
5683 | hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
5684 | hwc->hrtimer.function = perf_swevent_hrtimer; | ||
5685 | |||
5686 | /* | ||
5687 | * Since hrtimers have a fixed rate, we can do a static freq->period | ||
5688 | * mapping and avoid the whole period adjust feedback stuff. | ||
5689 | */ | ||
5690 | if (event->attr.freq) { | ||
5691 | long freq = event->attr.sample_freq; | ||
5692 | |||
5693 | event->attr.sample_period = NSEC_PER_SEC / freq; | ||
5694 | hwc->sample_period = event->attr.sample_period; | ||
5695 | local64_set(&hwc->period_left, hwc->sample_period); | ||
5696 | event->attr.freq = 0; | ||
5697 | } | ||
5698 | } | ||
5699 | |||
5090 | /* | 5700 | /* |
5091 | * Software event: cpu wall time clock | 5701 | * Software event: cpu wall time clock |
5092 | */ | 5702 | */ |
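
perf_swevent_init_hrtimer() converts a frequency request into a fixed period once, since an hrtimer fires at a constant rate anyway. The mapping is just NSEC_PER_SEC / sample_freq; a trivial check of the arithmetic with an assumed 4 kHz rate:

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

int main(void)
{
	long freq = 4000;                        /* assumption: 4 kHz sampling */
	uint64_t period = NSEC_PER_SEC / (uint64_t)freq;

	/* 4000 Hz becomes a fixed 250000 ns timer period */
	printf("freq=%ld Hz -> period=%llu ns\n",
	       freq, (unsigned long long)period);
	return 0;
}
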
@@ -5139,6 +5749,8 @@ static int cpu_clock_event_init(struct perf_event *event) | |||
5139 | if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) | 5749 | if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) |
5140 | return -ENOENT; | 5750 | return -ENOENT; |
5141 | 5751 | ||
5752 | perf_swevent_init_hrtimer(event); | ||
5753 | |||
5142 | return 0; | 5754 | return 0; |
5143 | } | 5755 | } |
5144 | 5756 | ||
@@ -5194,16 +5806,9 @@ static void task_clock_event_del(struct perf_event *event, int flags) | |||
5194 | 5806 | ||
5195 | static void task_clock_event_read(struct perf_event *event) | 5807 | static void task_clock_event_read(struct perf_event *event) |
5196 | { | 5808 | { |
5197 | u64 time; | 5809 | u64 now = perf_clock(); |
5198 | 5810 | u64 delta = now - event->ctx->timestamp; | |
5199 | if (!in_nmi()) { | 5811 | u64 time = event->ctx->time + delta; |
5200 | update_context_time(event->ctx); | ||
5201 | time = event->ctx->time; | ||
5202 | } else { | ||
5203 | u64 now = perf_clock(); | ||
5204 | u64 delta = now - event->ctx->timestamp; | ||
5205 | time = event->ctx->time + delta; | ||
5206 | } | ||
5207 | 5812 | ||
5208 | task_clock_event_update(event, time); | 5813 | task_clock_event_update(event, time); |
5209 | } | 5814 | } |
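
With the in_nmi() special case gone, task_clock_event_read() always extrapolates: the reported time is the context's accumulated time plus however long has passed since the context timestamp was last updated. A small sketch of that formula with made-up numbers:

#include <stdint.h>
#include <stdio.h>

/* time = ctx->time + (now - ctx->timestamp), as in the new read path */
static uint64_t read_task_clock(uint64_t ctx_time, uint64_t ctx_timestamp,
				uint64_t now)
{
	return ctx_time + (now - ctx_timestamp);
}

int main(void)
{
	/* 1200 ns accumulated, last updated at t=5000, read at t=5600 */
	printf("task clock = %llu ns\n",
	       (unsigned long long)read_task_clock(1200, 5000, 5600));
	return 0;
}
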
@@ -5216,6 +5821,8 @@ static int task_clock_event_init(struct perf_event *event) | |||
5216 | if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) | 5821 | if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) |
5217 | return -ENOENT; | 5822 | return -ENOENT; |
5218 | 5823 | ||
5824 | perf_swevent_init_hrtimer(event); | ||
5825 | |||
5219 | return 0; | 5826 | return 0; |
5220 | } | 5827 | } |
5221 | 5828 | ||
@@ -5361,6 +5968,8 @@ free_dev: | |||
5361 | goto out; | 5968 | goto out; |
5362 | } | 5969 | } |
5363 | 5970 | ||
5971 | static struct lock_class_key cpuctx_mutex; | ||
5972 | |||
5364 | int perf_pmu_register(struct pmu *pmu, char *name, int type) | 5973 | int perf_pmu_register(struct pmu *pmu, char *name, int type) |
5365 | { | 5974 | { |
5366 | int cpu, ret; | 5975 | int cpu, ret; |
@@ -5409,6 +6018,7 @@ skip_type: | |||
5409 | 6018 | ||
5410 | cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); | 6019 | cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); |
5411 | __perf_event_init_context(&cpuctx->ctx); | 6020 | __perf_event_init_context(&cpuctx->ctx); |
6021 | lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); | ||
5412 | cpuctx->ctx.type = cpu_context; | 6022 | cpuctx->ctx.type = cpu_context; |
5413 | cpuctx->ctx.pmu = pmu; | 6023 | cpuctx->ctx.pmu = pmu; |
5414 | cpuctx->jiffies_interval = 1; | 6024 | cpuctx->jiffies_interval = 1; |
@@ -5484,17 +6094,22 @@ struct pmu *perf_init_event(struct perf_event *event) | |||
5484 | { | 6094 | { |
5485 | struct pmu *pmu = NULL; | 6095 | struct pmu *pmu = NULL; |
5486 | int idx; | 6096 | int idx; |
6097 | int ret; | ||
5487 | 6098 | ||
5488 | idx = srcu_read_lock(&pmus_srcu); | 6099 | idx = srcu_read_lock(&pmus_srcu); |
5489 | 6100 | ||
5490 | rcu_read_lock(); | 6101 | rcu_read_lock(); |
5491 | pmu = idr_find(&pmu_idr, event->attr.type); | 6102 | pmu = idr_find(&pmu_idr, event->attr.type); |
5492 | rcu_read_unlock(); | 6103 | rcu_read_unlock(); |
5493 | if (pmu) | 6104 | if (pmu) { |
6105 | ret = pmu->event_init(event); | ||
6106 | if (ret) | ||
6107 | pmu = ERR_PTR(ret); | ||
5494 | goto unlock; | 6108 | goto unlock; |
6109 | } | ||
5495 | 6110 | ||
5496 | list_for_each_entry_rcu(pmu, &pmus, entry) { | 6111 | list_for_each_entry_rcu(pmu, &pmus, entry) { |
5497 | int ret = pmu->event_init(event); | 6112 | ret = pmu->event_init(event); |
5498 | if (!ret) | 6113 | if (!ret) |
5499 | goto unlock; | 6114 | goto unlock; |
5500 | 6115 | ||
@@ -5525,6 +6140,11 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, | |||
5525 | struct hw_perf_event *hwc; | 6140 | struct hw_perf_event *hwc; |
5526 | long err; | 6141 | long err; |
5527 | 6142 | ||
6143 | if ((unsigned)cpu >= nr_cpu_ids) { | ||
6144 | if (!task || cpu != -1) | ||
6145 | return ERR_PTR(-EINVAL); | ||
6146 | } | ||
6147 | |||
5528 | event = kzalloc(sizeof(*event), GFP_KERNEL); | 6148 | event = kzalloc(sizeof(*event), GFP_KERNEL); |
5529 | if (!event) | 6149 | if (!event) |
5530 | return ERR_PTR(-ENOMEM); | 6150 | return ERR_PTR(-ENOMEM); |
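
The cpu sanity check that used to live in find_get_context() now sits in perf_event_alloc() and relies on the unsigned-cast idiom: casting a possibly negative int to unsigned makes both "negative" and "too large" fail the same >= comparison, with -1 allowed only for per-task events. A standalone sketch of that check, with nr_cpu_ids faked as a constant:

#include <stdio.h>

static unsigned int nr_cpu_ids = 8;     /* assumption: stand-in value */

/* returns 1 if the cpu argument would be accepted */
static int cpu_arg_valid(int cpu, int has_task)
{
	if ((unsigned int)cpu >= nr_cpu_ids)    /* catches cpu < 0 too */
		return has_task && cpu == -1;   /* only the -1 wildcard passes */
	return 1;
}

int main(void)
{
	printf("%d %d %d %d\n",
	       cpu_arg_valid(3, 0),     /* 1: in range */
	       cpu_arg_valid(-1, 1),    /* 1: wildcard with a task */
	       cpu_arg_valid(-1, 0),    /* 0: wildcard needs a task */
	       cpu_arg_valid(99, 1));   /* 0: out of range */
	return 0;
}
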
@@ -5573,7 +6193,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, | |||
5573 | 6193 | ||
5574 | if (!overflow_handler && parent_event) | 6194 | if (!overflow_handler && parent_event) |
5575 | overflow_handler = parent_event->overflow_handler; | 6195 | overflow_handler = parent_event->overflow_handler; |
5576 | 6196 | ||
5577 | event->overflow_handler = overflow_handler; | 6197 | event->overflow_handler = overflow_handler; |
5578 | 6198 | ||
5579 | if (attr->disabled) | 6199 | if (attr->disabled) |
@@ -5615,7 +6235,7 @@ done: | |||
5615 | 6235 | ||
5616 | if (!event->parent) { | 6236 | if (!event->parent) { |
5617 | if (event->attach_state & PERF_ATTACH_TASK) | 6237 | if (event->attach_state & PERF_ATTACH_TASK) |
5618 | jump_label_inc(&perf_task_events); | 6238 | jump_label_inc(&perf_sched_events); |
5619 | if (event->attr.mmap || event->attr.mmap_data) | 6239 | if (event->attr.mmap || event->attr.mmap_data) |
5620 | atomic_inc(&nr_mmap_events); | 6240 | atomic_inc(&nr_mmap_events); |
5621 | if (event->attr.comm) | 6241 | if (event->attr.comm) |
@@ -5790,7 +6410,7 @@ SYSCALL_DEFINE5(perf_event_open, | |||
5790 | int err; | 6410 | int err; |
5791 | 6411 | ||
5792 | /* for future expandability... */ | 6412 | /* for future expandability... */ |
5793 | if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT)) | 6413 | if (flags & ~PERF_FLAG_ALL) |
5794 | return -EINVAL; | 6414 | return -EINVAL; |
5795 | 6415 | ||
5796 | err = perf_copy_attr(attr_uptr, &attr); | 6416 | err = perf_copy_attr(attr_uptr, &attr); |
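
Replacing the open-coded flag list with a single PERF_FLAG_ALL mask keeps the "reject unknown flags" check in one place, so future flags only extend the mask. A generic sketch of the pattern with hypothetical flag names, not the perf ABI values:

#include <errno.h>
#include <stdio.h>

#define FLAG_A   (1U << 0)
#define FLAG_B   (1U << 1)
#define FLAG_C   (1U << 2)
#define FLAG_ALL (FLAG_A | FLAG_B | FLAG_C)

static int check_flags(unsigned int flags)
{
	/* anything outside the known set is an error, for future expandability */
	if (flags & ~FLAG_ALL)
		return -EINVAL;
	return 0;
}

int main(void)
{
	printf("known: %d  unknown: %d\n",
	       check_flags(FLAG_A | FLAG_C), check_flags(1U << 5));
	return 0;
}
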
@@ -5807,6 +6427,15 @@ SYSCALL_DEFINE5(perf_event_open, | |||
5807 | return -EINVAL; | 6427 | return -EINVAL; |
5808 | } | 6428 | } |
5809 | 6429 | ||
6430 | /* | ||
6431 | * In cgroup mode, the pid argument is used to pass the fd | ||
6432 | * opened to the cgroup directory in cgroupfs. The cpu argument | ||
6433 | * designates the cpu on which to monitor threads from that | ||
6434 | * cgroup. | ||
6435 | */ | ||
6436 | if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1)) | ||
6437 | return -EINVAL; | ||
6438 | |||
5810 | event_fd = get_unused_fd_flags(O_RDWR); | 6439 | event_fd = get_unused_fd_flags(O_RDWR); |
5811 | if (event_fd < 0) | 6440 | if (event_fd < 0) |
5812 | return event_fd; | 6441 | return event_fd; |
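
The hunk above documents the cgroup-mode calling convention: pid carries a file descriptor for the cgroup directory and cpu must name a real CPU. A hedged userspace usage sketch; the cgroup path is an assumption, and the call only succeeds on a kernel with CONFIG_CGROUP_PERF and a mounted perf_event cgroup hierarchy:

#include <fcntl.h>
#include <linux/perf_event.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	struct perf_event_attr attr;
	/* assumption: a perf_event cgroup already exists at this path */
	int cgrp_fd = open("/sys/fs/cgroup/perf_event/mygrp", O_RDONLY);
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;

	/* pid carries the cgroup fd; cpu must be a real CPU, not -1 */
	fd = syscall(__NR_perf_event_open, &attr, cgrp_fd, 0 /* cpu */,
		     -1 /* group_fd */, PERF_FLAG_PID_CGROUP);
	if (fd < 0)
		perror("perf_event_open");
	else
		close(fd);
	if (cgrp_fd >= 0)
		close(cgrp_fd);
	return 0;
}
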
@@ -5824,7 +6453,7 @@ SYSCALL_DEFINE5(perf_event_open, | |||
5824 | group_leader = NULL; | 6453 | group_leader = NULL; |
5825 | } | 6454 | } |
5826 | 6455 | ||
5827 | if (pid != -1) { | 6456 | if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) { |
5828 | task = find_lively_task_by_vpid(pid); | 6457 | task = find_lively_task_by_vpid(pid); |
5829 | if (IS_ERR(task)) { | 6458 | if (IS_ERR(task)) { |
5830 | err = PTR_ERR(task); | 6459 | err = PTR_ERR(task); |
@@ -5838,6 +6467,19 @@ SYSCALL_DEFINE5(perf_event_open, | |||
5838 | goto err_task; | 6467 | goto err_task; |
5839 | } | 6468 | } |
5840 | 6469 | ||
6470 | if (flags & PERF_FLAG_PID_CGROUP) { | ||
6471 | err = perf_cgroup_connect(pid, event, &attr, group_leader); | ||
6472 | if (err) | ||
6473 | goto err_alloc; | ||
6474 | /* | ||
6475 | * one more event: | ||
6476 | * - that has cgroup constraint on event->cpu | ||
6477 | * - that may need work on context switch | ||
6478 | */ | ||
6479 | atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); | ||
6480 | jump_label_inc(&perf_sched_events); | ||
6481 | } | ||
6482 | |||
5841 | /* | 6483 | /* |
5842 | * Special case software events and allow them to be part of | 6484 | * Special case software events and allow them to be part of |
5843 | * any hardware group. | 6485 | * any hardware group. |
@@ -5923,10 +6565,10 @@ SYSCALL_DEFINE5(perf_event_open, | |||
5923 | struct perf_event_context *gctx = group_leader->ctx; | 6565 | struct perf_event_context *gctx = group_leader->ctx; |
5924 | 6566 | ||
5925 | mutex_lock(&gctx->mutex); | 6567 | mutex_lock(&gctx->mutex); |
5926 | perf_event_remove_from_context(group_leader); | 6568 | perf_remove_from_context(group_leader); |
5927 | list_for_each_entry(sibling, &group_leader->sibling_list, | 6569 | list_for_each_entry(sibling, &group_leader->sibling_list, |
5928 | group_entry) { | 6570 | group_entry) { |
5929 | perf_event_remove_from_context(sibling); | 6571 | perf_remove_from_context(sibling); |
5930 | put_ctx(gctx); | 6572 | put_ctx(gctx); |
5931 | } | 6573 | } |
5932 | mutex_unlock(&gctx->mutex); | 6574 | mutex_unlock(&gctx->mutex); |
@@ -5949,6 +6591,7 @@ SYSCALL_DEFINE5(perf_event_open, | |||
5949 | 6591 | ||
5950 | perf_install_in_context(ctx, event, cpu); | 6592 | perf_install_in_context(ctx, event, cpu); |
5951 | ++ctx->generation; | 6593 | ++ctx->generation; |
6594 | perf_unpin_context(ctx); | ||
5952 | mutex_unlock(&ctx->mutex); | 6595 | mutex_unlock(&ctx->mutex); |
5953 | 6596 | ||
5954 | event->owner = current; | 6597 | event->owner = current; |
@@ -5974,6 +6617,7 @@ SYSCALL_DEFINE5(perf_event_open, | |||
5974 | return event_fd; | 6617 | return event_fd; |
5975 | 6618 | ||
5976 | err_context: | 6619 | err_context: |
6620 | perf_unpin_context(ctx); | ||
5977 | put_ctx(ctx); | 6621 | put_ctx(ctx); |
5978 | err_alloc: | 6622 | err_alloc: |
5979 | free_event(event); | 6623 | free_event(event); |
@@ -6024,6 +6668,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, | |||
6024 | mutex_lock(&ctx->mutex); | 6668 | mutex_lock(&ctx->mutex); |
6025 | perf_install_in_context(ctx, event, cpu); | 6669 | perf_install_in_context(ctx, event, cpu); |
6026 | ++ctx->generation; | 6670 | ++ctx->generation; |
6671 | perf_unpin_context(ctx); | ||
6027 | mutex_unlock(&ctx->mutex); | 6672 | mutex_unlock(&ctx->mutex); |
6028 | 6673 | ||
6029 | return event; | 6674 | return event; |
@@ -6077,7 +6722,7 @@ __perf_event_exit_task(struct perf_event *child_event, | |||
6077 | { | 6722 | { |
6078 | struct perf_event *parent_event; | 6723 | struct perf_event *parent_event; |
6079 | 6724 | ||
6080 | perf_event_remove_from_context(child_event); | 6725 | perf_remove_from_context(child_event); |
6081 | 6726 | ||
6082 | parent_event = child_event->parent; | 6727 | parent_event = child_event->parent; |
6083 | /* | 6728 | /* |
@@ -6109,7 +6754,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) | |||
6109 | * scheduled, so we are now safe from rescheduling changing | 6754 | * scheduled, so we are now safe from rescheduling changing |
6110 | * our context. | 6755 | * our context. |
6111 | */ | 6756 | */ |
6112 | child_ctx = child->perf_event_ctxp[ctxn]; | 6757 | child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]); |
6113 | task_ctx_sched_out(child_ctx, EVENT_ALL); | 6758 | task_ctx_sched_out(child_ctx, EVENT_ALL); |
6114 | 6759 | ||
6115 | /* | 6760 | /* |
@@ -6384,7 +7029,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent, | |||
6384 | return 0; | 7029 | return 0; |
6385 | } | 7030 | } |
6386 | 7031 | ||
6387 | child_ctx = child->perf_event_ctxp[ctxn]; | 7032 | child_ctx = child->perf_event_ctxp[ctxn]; |
6388 | if (!child_ctx) { | 7033 | if (!child_ctx) { |
6389 | /* | 7034 | /* |
6390 | * This is executed from the parent task context, so | 7035 | * This is executed from the parent task context, so |
@@ -6422,11 +7067,6 @@ int perf_event_init_context(struct task_struct *child, int ctxn) | |||
6422 | unsigned long flags; | 7067 | unsigned long flags; |
6423 | int ret = 0; | 7068 | int ret = 0; |
6424 | 7069 | ||
6425 | child->perf_event_ctxp[ctxn] = NULL; | ||
6426 | |||
6427 | mutex_init(&child->perf_event_mutex); | ||
6428 | INIT_LIST_HEAD(&child->perf_event_list); | ||
6429 | |||
6430 | if (likely(!parent->perf_event_ctxp[ctxn])) | 7070 | if (likely(!parent->perf_event_ctxp[ctxn])) |
6431 | return 0; | 7071 | return 0; |
6432 | 7072 | ||
@@ -6478,7 +7118,6 @@ int perf_event_init_context(struct task_struct *child, int ctxn) | |||
6478 | 7118 | ||
6479 | raw_spin_lock_irqsave(&parent_ctx->lock, flags); | 7119 | raw_spin_lock_irqsave(&parent_ctx->lock, flags); |
6480 | parent_ctx->rotate_disable = 0; | 7120 | parent_ctx->rotate_disable = 0; |
6481 | raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); | ||
6482 | 7121 | ||
6483 | child_ctx = child->perf_event_ctxp[ctxn]; | 7122 | child_ctx = child->perf_event_ctxp[ctxn]; |
6484 | 7123 | ||
@@ -6486,12 +7125,11 @@ int perf_event_init_context(struct task_struct *child, int ctxn) | |||
6486 | /* | 7125 | /* |
6487 | * Mark the child context as a clone of the parent | 7126 | * Mark the child context as a clone of the parent |
6488 | * context, or of whatever the parent is a clone of. | 7127 | * context, or of whatever the parent is a clone of. |
6489 | * Note that if the parent is a clone, it could get | 7128 | * |
6490 | * uncloned at any point, but that doesn't matter | 7129 | * Note that if the parent is a clone, the holding of |
6491 | * because the list of events and the generation | 7130 | * parent_ctx->lock avoids it from being uncloned. |
6492 | * count can't have changed since we took the mutex. | ||
6493 | */ | 7131 | */ |
6494 | cloned_ctx = rcu_dereference(parent_ctx->parent_ctx); | 7132 | cloned_ctx = parent_ctx->parent_ctx; |
6495 | if (cloned_ctx) { | 7133 | if (cloned_ctx) { |
6496 | child_ctx->parent_ctx = cloned_ctx; | 7134 | child_ctx->parent_ctx = cloned_ctx; |
6497 | child_ctx->parent_gen = parent_ctx->parent_gen; | 7135 | child_ctx->parent_gen = parent_ctx->parent_gen; |
@@ -6502,9 +7140,11 @@ int perf_event_init_context(struct task_struct *child, int ctxn) | |||
6502 | get_ctx(child_ctx->parent_ctx); | 7140 | get_ctx(child_ctx->parent_ctx); |
6503 | } | 7141 | } |
6504 | 7142 | ||
7143 | raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); | ||
6505 | mutex_unlock(&parent_ctx->mutex); | 7144 | mutex_unlock(&parent_ctx->mutex); |
6506 | 7145 | ||
6507 | perf_unpin_context(parent_ctx); | 7146 | perf_unpin_context(parent_ctx); |
7147 | put_ctx(parent_ctx); | ||
6508 | 7148 | ||
6509 | return ret; | 7149 | return ret; |
6510 | } | 7150 | } |
@@ -6516,6 +7156,10 @@ int perf_event_init_task(struct task_struct *child) | |||
6516 | { | 7156 | { |
6517 | int ctxn, ret; | 7157 | int ctxn, ret; |
6518 | 7158 | ||
7159 | memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp)); | ||
7160 | mutex_init(&child->perf_event_mutex); | ||
7161 | INIT_LIST_HEAD(&child->perf_event_list); | ||
7162 | |||
6519 | for_each_task_context_nr(ctxn) { | 7163 | for_each_task_context_nr(ctxn) { |
6520 | ret = perf_event_init_context(child, ctxn); | 7164 | ret = perf_event_init_context(child, ctxn); |
6521 | if (ret) | 7165 | if (ret) |
@@ -6570,9 +7214,9 @@ static void __perf_event_exit_context(void *__info) | |||
6570 | perf_pmu_rotate_stop(ctx->pmu); | 7214 | perf_pmu_rotate_stop(ctx->pmu); |
6571 | 7215 | ||
6572 | list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) | 7216 | list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) |
6573 | __perf_event_remove_from_context(event); | 7217 | __perf_remove_from_context(event); |
6574 | list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) | 7218 | list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) |
6575 | __perf_event_remove_from_context(event); | 7219 | __perf_remove_from_context(event); |
6576 | } | 7220 | } |
6577 | 7221 | ||
6578 | static void perf_event_exit_cpu_context(int cpu) | 7222 | static void perf_event_exit_cpu_context(int cpu) |
@@ -6696,3 +7340,83 @@ unlock: | |||
6696 | return ret; | 7340 | return ret; |
6697 | } | 7341 | } |
6698 | device_initcall(perf_event_sysfs_init); | 7342 | device_initcall(perf_event_sysfs_init); |
7343 | |||
7344 | #ifdef CONFIG_CGROUP_PERF | ||
7345 | static struct cgroup_subsys_state *perf_cgroup_create( | ||
7346 | struct cgroup_subsys *ss, struct cgroup *cont) | ||
7347 | { | ||
7348 | struct perf_cgroup *jc; | ||
7349 | |||
7350 | jc = kzalloc(sizeof(*jc), GFP_KERNEL); | ||
7351 | if (!jc) | ||
7352 | return ERR_PTR(-ENOMEM); | ||
7353 | |||
7354 | jc->info = alloc_percpu(struct perf_cgroup_info); | ||
7355 | if (!jc->info) { | ||
7356 | kfree(jc); | ||
7357 | return ERR_PTR(-ENOMEM); | ||
7358 | } | ||
7359 | |||
7360 | return &jc->css; | ||
7361 | } | ||
7362 | |||
7363 | static void perf_cgroup_destroy(struct cgroup_subsys *ss, | ||
7364 | struct cgroup *cont) | ||
7365 | { | ||
7366 | struct perf_cgroup *jc; | ||
7367 | jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), | ||
7368 | struct perf_cgroup, css); | ||
7369 | free_percpu(jc->info); | ||
7370 | kfree(jc); | ||
7371 | } | ||
7372 | |||
7373 | static int __perf_cgroup_move(void *info) | ||
7374 | { | ||
7375 | struct task_struct *task = info; | ||
7376 | perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN); | ||
7377 | return 0; | ||
7378 | } | ||
7379 | |||
7380 | static void perf_cgroup_move(struct task_struct *task) | ||
7381 | { | ||
7382 | task_function_call(task, __perf_cgroup_move, task); | ||
7383 | } | ||
7384 | |||
7385 | static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, | ||
7386 | struct cgroup *old_cgrp, struct task_struct *task, | ||
7387 | bool threadgroup) | ||
7388 | { | ||
7389 | perf_cgroup_move(task); | ||
7390 | if (threadgroup) { | ||
7391 | struct task_struct *c; | ||
7392 | rcu_read_lock(); | ||
7393 | list_for_each_entry_rcu(c, &task->thread_group, thread_group) { | ||
7394 | perf_cgroup_move(c); | ||
7395 | } | ||
7396 | rcu_read_unlock(); | ||
7397 | } | ||
7398 | } | ||
7399 | |||
7400 | static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, | ||
7401 | struct cgroup *old_cgrp, struct task_struct *task) | ||
7402 | { | ||
7403 | /* | ||
7404 | * cgroup_exit() is called in the copy_process() failure path. | ||
7405 | * Ignore this case since the task hasn't run yet; this avoids | ||
7406 | * trying to poke a half freed task state from generic code. | ||
7407 | */ | ||
7408 | if (!(task->flags & PF_EXITING)) | ||
7409 | return; | ||
7410 | |||
7411 | perf_cgroup_move(task); | ||
7412 | } | ||
7413 | |||
7414 | struct cgroup_subsys perf_subsys = { | ||
7415 | .name = "perf_event", | ||
7416 | .subsys_id = perf_subsys_id, | ||
7417 | .create = perf_cgroup_create, | ||
7418 | .destroy = perf_cgroup_destroy, | ||
7419 | .exit = perf_cgroup_exit, | ||
7420 | .attach = perf_cgroup_attach, | ||
7421 | }; | ||
7422 | #endif /* CONFIG_CGROUP_PERF */ | ||
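With CONFIG_CGROUP_PERF enabled, the block above registers a "perf_event" cgroup controller whose attach and exit callbacks resynchronize a task's cgroup-bound events through perf_cgroup_switch(). The user-visible effect is simply that the controller shows up in /proc/cgroups; the short userspace probe below only illustrates that and is not part of the patch.

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/cgroups", "r");	/* one row per registered controller */

	if (!f) {
		perror("/proc/cgroups");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		if (!strncmp(line, "perf_event", 10))
			fputs(line, stdout);	/* name, hierarchy id, num_cgroups, enabled */
	fclose(f);
	return 0;
}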
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 05bb7173850e..67fea9d25d55 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c | |||
@@ -176,7 +176,8 @@ static inline cputime_t virt_ticks(struct task_struct *p) | |||
176 | return p->utime; | 176 | return p->utime; |
177 | } | 177 | } |
178 | 178 | ||
179 | int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) | 179 | static int |
180 | posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) | ||
180 | { | 181 | { |
181 | int error = check_clock(which_clock); | 182 | int error = check_clock(which_clock); |
182 | if (!error) { | 183 | if (!error) { |
@@ -194,7 +195,8 @@ int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) | |||
194 | return error; | 195 | return error; |
195 | } | 196 | } |
196 | 197 | ||
197 | int posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp) | 198 | static int |
199 | posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp) | ||
198 | { | 200 | { |
199 | /* | 201 | /* |
200 | * You can never reset a CPU clock, but we check for other errors | 202 | * You can never reset a CPU clock, but we check for other errors |
@@ -317,7 +319,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock, | |||
317 | } | 319 | } |
318 | 320 | ||
319 | 321 | ||
320 | int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) | 322 | static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) |
321 | { | 323 | { |
322 | const pid_t pid = CPUCLOCK_PID(which_clock); | 324 | const pid_t pid = CPUCLOCK_PID(which_clock); |
323 | int error = -EINVAL; | 325 | int error = -EINVAL; |
@@ -379,7 +381,7 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) | |||
379 | * This is called from sys_timer_create() and do_cpu_nanosleep() with the | 381 | * This is called from sys_timer_create() and do_cpu_nanosleep() with the |
380 | * new timer already all-zeros initialized. | 382 | * new timer already all-zeros initialized. |
381 | */ | 383 | */ |
382 | int posix_cpu_timer_create(struct k_itimer *new_timer) | 384 | static int posix_cpu_timer_create(struct k_itimer *new_timer) |
383 | { | 385 | { |
384 | int ret = 0; | 386 | int ret = 0; |
385 | const pid_t pid = CPUCLOCK_PID(new_timer->it_clock); | 387 | const pid_t pid = CPUCLOCK_PID(new_timer->it_clock); |
@@ -425,7 +427,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer) | |||
425 | * If we return TIMER_RETRY, it's necessary to release the timer's lock | 427 | * If we return TIMER_RETRY, it's necessary to release the timer's lock |
426 | * and try again. (This happens when the timer is in the middle of firing.) | 428 | * and try again. (This happens when the timer is in the middle of firing.) |
427 | */ | 429 | */ |
428 | int posix_cpu_timer_del(struct k_itimer *timer) | 430 | static int posix_cpu_timer_del(struct k_itimer *timer) |
429 | { | 431 | { |
430 | struct task_struct *p = timer->it.cpu.task; | 432 | struct task_struct *p = timer->it.cpu.task; |
431 | int ret = 0; | 433 | int ret = 0; |
@@ -665,8 +667,8 @@ static int cpu_timer_sample_group(const clockid_t which_clock, | |||
665 | * If we return TIMER_RETRY, it's necessary to release the timer's lock | 667 | * If we return TIMER_RETRY, it's necessary to release the timer's lock |
666 | * and try again. (This happens when the timer is in the middle of firing.) | 668 | * and try again. (This happens when the timer is in the middle of firing.) |
667 | */ | 669 | */ |
668 | int posix_cpu_timer_set(struct k_itimer *timer, int flags, | 670 | static int posix_cpu_timer_set(struct k_itimer *timer, int flags, |
669 | struct itimerspec *new, struct itimerspec *old) | 671 | struct itimerspec *new, struct itimerspec *old) |
670 | { | 672 | { |
671 | struct task_struct *p = timer->it.cpu.task; | 673 | struct task_struct *p = timer->it.cpu.task; |
672 | union cpu_time_count old_expires, new_expires, old_incr, val; | 674 | union cpu_time_count old_expires, new_expires, old_incr, val; |
@@ -820,7 +822,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags, | |||
820 | return ret; | 822 | return ret; |
821 | } | 823 | } |
822 | 824 | ||
823 | void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) | 825 | static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) |
824 | { | 826 | { |
825 | union cpu_time_count now; | 827 | union cpu_time_count now; |
826 | struct task_struct *p = timer->it.cpu.task; | 828 | struct task_struct *p = timer->it.cpu.task; |
@@ -1481,11 +1483,13 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, | |||
1481 | return error; | 1483 | return error; |
1482 | } | 1484 | } |
1483 | 1485 | ||
1484 | int posix_cpu_nsleep(const clockid_t which_clock, int flags, | 1486 | static long posix_cpu_nsleep_restart(struct restart_block *restart_block); |
1485 | struct timespec *rqtp, struct timespec __user *rmtp) | 1487 | |
1488 | static int posix_cpu_nsleep(const clockid_t which_clock, int flags, | ||
1489 | struct timespec *rqtp, struct timespec __user *rmtp) | ||
1486 | { | 1490 | { |
1487 | struct restart_block *restart_block = | 1491 | struct restart_block *restart_block = |
1488 | ¤t_thread_info()->restart_block; | 1492 | ¤t_thread_info()->restart_block; |
1489 | struct itimerspec it; | 1493 | struct itimerspec it; |
1490 | int error; | 1494 | int error; |
1491 | 1495 | ||
@@ -1501,56 +1505,47 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags, | |||
1501 | 1505 | ||
1502 | if (error == -ERESTART_RESTARTBLOCK) { | 1506 | if (error == -ERESTART_RESTARTBLOCK) { |
1503 | 1507 | ||
1504 | if (flags & TIMER_ABSTIME) | 1508 | if (flags & TIMER_ABSTIME) |
1505 | return -ERESTARTNOHAND; | 1509 | return -ERESTARTNOHAND; |
1506 | /* | 1510 | /* |
1507 | * Report back to the user the time still remaining. | 1511 | * Report back to the user the time still remaining. |
1508 | */ | 1512 | */ |
1509 | if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) | 1513 | if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) |
1510 | return -EFAULT; | 1514 | return -EFAULT; |
1511 | 1515 | ||
1512 | restart_block->fn = posix_cpu_nsleep_restart; | 1516 | restart_block->fn = posix_cpu_nsleep_restart; |
1513 | restart_block->arg0 = which_clock; | 1517 | restart_block->nanosleep.index = which_clock; |
1514 | restart_block->arg1 = (unsigned long) rmtp; | 1518 | restart_block->nanosleep.rmtp = rmtp; |
1515 | restart_block->arg2 = rqtp->tv_sec; | 1519 | restart_block->nanosleep.expires = timespec_to_ns(rqtp); |
1516 | restart_block->arg3 = rqtp->tv_nsec; | ||
1517 | } | 1520 | } |
1518 | return error; | 1521 | return error; |
1519 | } | 1522 | } |
1520 | 1523 | ||
1521 | long posix_cpu_nsleep_restart(struct restart_block *restart_block) | 1524 | static long posix_cpu_nsleep_restart(struct restart_block *restart_block) |
1522 | { | 1525 | { |
1523 | clockid_t which_clock = restart_block->arg0; | 1526 | clockid_t which_clock = restart_block->nanosleep.index; |
1524 | struct timespec __user *rmtp; | ||
1525 | struct timespec t; | 1527 | struct timespec t; |
1526 | struct itimerspec it; | 1528 | struct itimerspec it; |
1527 | int error; | 1529 | int error; |
1528 | 1530 | ||
1529 | rmtp = (struct timespec __user *) restart_block->arg1; | 1531 | t = ns_to_timespec(restart_block->nanosleep.expires); |
1530 | t.tv_sec = restart_block->arg2; | ||
1531 | t.tv_nsec = restart_block->arg3; | ||
1532 | 1532 | ||
1533 | restart_block->fn = do_no_restart_syscall; | ||
1534 | error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it); | 1533 | error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it); |
1535 | 1534 | ||
1536 | if (error == -ERESTART_RESTARTBLOCK) { | 1535 | if (error == -ERESTART_RESTARTBLOCK) { |
1536 | struct timespec __user *rmtp = restart_block->nanosleep.rmtp; | ||
1537 | /* | 1537 | /* |
1538 | * Report back to the user the time still remaining. | 1538 | * Report back to the user the time still remaining. |
1539 | */ | 1539 | */ |
1540 | if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) | 1540 | if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) |
1541 | return -EFAULT; | 1541 | return -EFAULT; |
1542 | 1542 | ||
1543 | restart_block->fn = posix_cpu_nsleep_restart; | 1543 | restart_block->nanosleep.expires = timespec_to_ns(&t); |
1544 | restart_block->arg0 = which_clock; | ||
1545 | restart_block->arg1 = (unsigned long) rmtp; | ||
1546 | restart_block->arg2 = t.tv_sec; | ||
1547 | restart_block->arg3 = t.tv_nsec; | ||
1548 | } | 1544 | } |
1549 | return error; | 1545 | return error; |
1550 | 1546 | ||
1551 | } | 1547 | } |
1552 | 1548 | ||
1553 | |||
1554 | #define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED) | 1549 | #define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED) |
1555 | #define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED) | 1550 | #define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED) |
1556 | 1551 | ||
@@ -1594,38 +1589,37 @@ static int thread_cpu_timer_create(struct k_itimer *timer) | |||
1594 | timer->it_clock = THREAD_CLOCK; | 1589 | timer->it_clock = THREAD_CLOCK; |
1595 | return posix_cpu_timer_create(timer); | 1590 | return posix_cpu_timer_create(timer); |
1596 | } | 1591 | } |
1597 | static int thread_cpu_nsleep(const clockid_t which_clock, int flags, | 1592 | |
1598 | struct timespec *rqtp, struct timespec __user *rmtp) | 1593 | struct k_clock clock_posix_cpu = { |
1599 | { | 1594 | .clock_getres = posix_cpu_clock_getres, |
1600 | return -EINVAL; | 1595 | .clock_set = posix_cpu_clock_set, |
1601 | } | 1596 | .clock_get = posix_cpu_clock_get, |
1602 | static long thread_cpu_nsleep_restart(struct restart_block *restart_block) | 1597 | .timer_create = posix_cpu_timer_create, |
1603 | { | 1598 | .nsleep = posix_cpu_nsleep, |
1604 | return -EINVAL; | 1599 | .nsleep_restart = posix_cpu_nsleep_restart, |
1605 | } | 1600 | .timer_set = posix_cpu_timer_set, |
1601 | .timer_del = posix_cpu_timer_del, | ||
1602 | .timer_get = posix_cpu_timer_get, | ||
1603 | }; | ||
1606 | 1604 | ||
1607 | static __init int init_posix_cpu_timers(void) | 1605 | static __init int init_posix_cpu_timers(void) |
1608 | { | 1606 | { |
1609 | struct k_clock process = { | 1607 | struct k_clock process = { |
1610 | .clock_getres = process_cpu_clock_getres, | 1608 | .clock_getres = process_cpu_clock_getres, |
1611 | .clock_get = process_cpu_clock_get, | 1609 | .clock_get = process_cpu_clock_get, |
1612 | .clock_set = do_posix_clock_nosettime, | 1610 | .timer_create = process_cpu_timer_create, |
1613 | .timer_create = process_cpu_timer_create, | 1611 | .nsleep = process_cpu_nsleep, |
1614 | .nsleep = process_cpu_nsleep, | 1612 | .nsleep_restart = process_cpu_nsleep_restart, |
1615 | .nsleep_restart = process_cpu_nsleep_restart, | ||
1616 | }; | 1613 | }; |
1617 | struct k_clock thread = { | 1614 | struct k_clock thread = { |
1618 | .clock_getres = thread_cpu_clock_getres, | 1615 | .clock_getres = thread_cpu_clock_getres, |
1619 | .clock_get = thread_cpu_clock_get, | 1616 | .clock_get = thread_cpu_clock_get, |
1620 | .clock_set = do_posix_clock_nosettime, | 1617 | .timer_create = thread_cpu_timer_create, |
1621 | .timer_create = thread_cpu_timer_create, | ||
1622 | .nsleep = thread_cpu_nsleep, | ||
1623 | .nsleep_restart = thread_cpu_nsleep_restart, | ||
1624 | }; | 1618 | }; |
1625 | struct timespec ts; | 1619 | struct timespec ts; |
1626 | 1620 | ||
1627 | register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process); | 1621 | posix_timers_register_clock(CLOCK_PROCESS_CPUTIME_ID, &process); |
1628 | register_posix_clock(CLOCK_THREAD_CPUTIME_ID, &thread); | 1622 | posix_timers_register_clock(CLOCK_THREAD_CPUTIME_ID, &thread); |
1629 | 1623 | ||
1630 | cputime_to_timespec(cputime_one_jiffy, &ts); | 1624 | cputime_to_timespec(cputime_one_jiffy, &ts); |
1631 | onecputick = ts.tv_nsec; | 1625 | onecputick = ts.tv_nsec; |
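The posix-cpu-timers.c changes above do two things: the untyped restart_block arg0..arg3 slots are replaced by the dedicated nanosleep.index/rmtp/expires fields, and the per-clock callbacks are gathered into the exported clock_posix_cpu table instead of being reached through the CLOCK_DISPATCH macro. A compressed sketch of the new restart pattern follows; my_nsleep() and my_nsleep_restart() are hypothetical stand-ins for the posix_cpu_* functions, and the snippet assumes kernel context rather than being a buildable unit on its own.

static long my_nsleep_restart(struct restart_block *rb);

static int my_nsleep(const clockid_t which_clock, int flags,
		     struct timespec *rqtp, struct timespec __user *rmtp)
{
	struct restart_block *rb = &current_thread_info()->restart_block;

	/* ... the sleep was interrupted and has to be restarted ... */
	rb->fn = my_nsleep_restart;
	rb->nanosleep.index = which_clock;		/* clock to resume against */
	rb->nanosleep.rmtp = rmtp;			/* user buffer for the remainder */
	rb->nanosleep.expires = timespec_to_ns(rqtp);	/* one 64-bit ns value replaces arg2/arg3 */
	return -ERESTART_RESTARTBLOCK;
}

static long my_nsleep_restart(struct restart_block *rb)
{
	struct timespec t = ns_to_timespec(rb->nanosleep.expires);

	/* resume an absolute sleep on rb->nanosleep.index using t, then
	   copy any remainder to rb->nanosleep.rmtp as shown above */
	return 0;
}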
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 93bd2eb2bc53..4c0124919f9a 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c | |||
@@ -41,6 +41,7 @@ | |||
41 | #include <linux/init.h> | 41 | #include <linux/init.h> |
42 | #include <linux/compiler.h> | 42 | #include <linux/compiler.h> |
43 | #include <linux/idr.h> | 43 | #include <linux/idr.h> |
44 | #include <linux/posix-clock.h> | ||
44 | #include <linux/posix-timers.h> | 45 | #include <linux/posix-timers.h> |
45 | #include <linux/syscalls.h> | 46 | #include <linux/syscalls.h> |
46 | #include <linux/wait.h> | 47 | #include <linux/wait.h> |
@@ -81,6 +82,14 @@ static DEFINE_SPINLOCK(idr_lock); | |||
81 | #error "SIGEV_THREAD_ID must not share bit with other SIGEV values!" | 82 | #error "SIGEV_THREAD_ID must not share bit with other SIGEV values!" |
82 | #endif | 83 | #endif |
83 | 84 | ||
85 | /* | ||
86 | * parisc wants ENOTSUP instead of EOPNOTSUPP | ||
87 | */ | ||
88 | #ifndef ENOTSUP | ||
89 | # define ENANOSLEEP_NOTSUP EOPNOTSUPP | ||
90 | #else | ||
91 | # define ENANOSLEEP_NOTSUP ENOTSUP | ||
92 | #endif | ||
84 | 93 | ||
85 | /* | 94 | /* |
86 | * The timer ID is turned into a timer address by idr_find(). | 95 | * The timer ID is turned into a timer address by idr_find(). |
@@ -94,11 +103,7 @@ static DEFINE_SPINLOCK(idr_lock); | |||
94 | /* | 103 | /* |
95 | * CLOCKs: The POSIX standard calls for a couple of clocks and allows us | 104 | * CLOCKs: The POSIX standard calls for a couple of clocks and allows us |
96 | * to implement others. This structure defines the various | 105 | * to implement others. This structure defines the various |
97 | * clocks and allows the possibility of adding others. We | 106 | * clocks. |
98 | * provide an interface to add clocks to the table and expect | ||
99 | * the "arch" code to add at least one clock that is high | ||
100 | * resolution. Here we define the standard CLOCK_REALTIME as a | ||
101 | * 1/HZ resolution clock. | ||
102 | * | 107 | * |
103 | * RESOLUTION: Clock resolution is used to round up timer and interval | 108 | * RESOLUTION: Clock resolution is used to round up timer and interval |
104 | * times, NOT to report clock times, which are reported with as | 109 | * times, NOT to report clock times, which are reported with as |
@@ -108,20 +113,13 @@ static DEFINE_SPINLOCK(idr_lock); | |||
108 | * necessary code is written. The standard says we should say | 113 | * necessary code is written. The standard says we should say |
109 | * something about this issue in the documentation... | 114 | * something about this issue in the documentation... |
110 | * | 115 | * |
111 | * FUNCTIONS: The CLOCKs structure defines possible functions to handle | 116 | * FUNCTIONS: The CLOCKs structure defines possible functions to |
112 | * various clock functions. For clocks that use the standard | 117 | * handle various clock functions. |
113 | * system timer code these entries should be NULL. This will | ||
114 | * allow dispatch without the overhead of indirect function | ||
115 | * calls. CLOCKS that depend on other sources (e.g. WWV or GPS) | ||
116 | * must supply functions here, even if the function just returns | ||
117 | * ENOSYS. The standard POSIX timer management code assumes the | ||
118 | * following: 1.) The k_itimer struct (sched.h) is used for the | ||
119 | * timer. 2.) The list, it_lock, it_clock, it_id and it_pid | ||
120 | * fields are not modified by timer code. | ||
121 | * | 118 | * |
122 | * At this time all functions EXCEPT clock_nanosleep can be | 119 | * The standard POSIX timer management code assumes the |
123 | * redirected by the CLOCKS structure. Clock_nanosleep is in | 120 | * following: 1.) The k_itimer struct (sched.h) is used for |
124 | * there, but the code ignores it. | 121 | * the timer. 2.) The list, it_lock, it_clock, it_id and |
122 | * it_pid fields are not modified by timer code. | ||
125 | * | 123 | * |
126 | * Permissions: It is assumed that the clock_settime() function defined | 124 | * Permissions: It is assumed that the clock_settime() function defined |
127 | * for each clock will take care of permission checks. Some | 125 | * for each clock will take care of permission checks. Some |
@@ -138,6 +136,7 @@ static struct k_clock posix_clocks[MAX_CLOCKS]; | |||
138 | */ | 136 | */ |
139 | static int common_nsleep(const clockid_t, int flags, struct timespec *t, | 137 | static int common_nsleep(const clockid_t, int flags, struct timespec *t, |
140 | struct timespec __user *rmtp); | 138 | struct timespec __user *rmtp); |
139 | static int common_timer_create(struct k_itimer *new_timer); | ||
141 | static void common_timer_get(struct k_itimer *, struct itimerspec *); | 140 | static void common_timer_get(struct k_itimer *, struct itimerspec *); |
142 | static int common_timer_set(struct k_itimer *, int, | 141 | static int common_timer_set(struct k_itimer *, int, |
143 | struct itimerspec *, struct itimerspec *); | 142 | struct itimerspec *, struct itimerspec *); |
@@ -158,76 +157,24 @@ static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) | |||
158 | spin_unlock_irqrestore(&timr->it_lock, flags); | 157 | spin_unlock_irqrestore(&timr->it_lock, flags); |
159 | } | 158 | } |
160 | 159 | ||
161 | /* | 160 | /* Get clock_realtime */ |
162 | * Call the k_clock hook function if non-null, or the default function. | 161 | static int posix_clock_realtime_get(clockid_t which_clock, struct timespec *tp) |
163 | */ | ||
164 | #define CLOCK_DISPATCH(clock, call, arglist) \ | ||
165 | ((clock) < 0 ? posix_cpu_##call arglist : \ | ||
166 | (posix_clocks[clock].call != NULL \ | ||
167 | ? (*posix_clocks[clock].call) arglist : common_##call arglist)) | ||
168 | |||
169 | /* | ||
170 | * Default clock hook functions when the struct k_clock passed | ||
171 | * to register_posix_clock leaves a function pointer null. | ||
172 | * | ||
173 | * The function common_CALL is the default implementation for | ||
174 | * the function pointer CALL in struct k_clock. | ||
175 | */ | ||
176 | |||
177 | static inline int common_clock_getres(const clockid_t which_clock, | ||
178 | struct timespec *tp) | ||
179 | { | ||
180 | tp->tv_sec = 0; | ||
181 | tp->tv_nsec = posix_clocks[which_clock].res; | ||
182 | return 0; | ||
183 | } | ||
184 | |||
185 | /* | ||
186 | * Get real time for posix timers | ||
187 | */ | ||
188 | static int common_clock_get(clockid_t which_clock, struct timespec *tp) | ||
189 | { | 162 | { |
190 | ktime_get_real_ts(tp); | 163 | ktime_get_real_ts(tp); |
191 | return 0; | 164 | return 0; |
192 | } | 165 | } |
193 | 166 | ||
194 | static inline int common_clock_set(const clockid_t which_clock, | 167 | /* Set clock_realtime */ |
195 | struct timespec *tp) | 168 | static int posix_clock_realtime_set(const clockid_t which_clock, |
169 | const struct timespec *tp) | ||
196 | { | 170 | { |
197 | return do_sys_settimeofday(tp, NULL); | 171 | return do_sys_settimeofday(tp, NULL); |
198 | } | 172 | } |
199 | 173 | ||
200 | static int common_timer_create(struct k_itimer *new_timer) | 174 | static int posix_clock_realtime_adj(const clockid_t which_clock, |
201 | { | 175 | struct timex *t) |
202 | hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0); | ||
203 | return 0; | ||
204 | } | ||
205 | |||
206 | static int no_timer_create(struct k_itimer *new_timer) | ||
207 | { | ||
208 | return -EOPNOTSUPP; | ||
209 | } | ||
210 | |||
211 | static int no_nsleep(const clockid_t which_clock, int flags, | ||
212 | struct timespec *tsave, struct timespec __user *rmtp) | ||
213 | { | ||
214 | return -EOPNOTSUPP; | ||
215 | } | ||
216 | |||
217 | /* | ||
218 | * Return nonzero if we know a priori this clockid_t value is bogus. | ||
219 | */ | ||
220 | static inline int invalid_clockid(const clockid_t which_clock) | ||
221 | { | 176 | { |
222 | if (which_clock < 0) /* CPU clock, posix_cpu_* will check it */ | 177 | return do_adjtimex(t); |
223 | return 0; | ||
224 | if ((unsigned) which_clock >= MAX_CLOCKS) | ||
225 | return 1; | ||
226 | if (posix_clocks[which_clock].clock_getres != NULL) | ||
227 | return 0; | ||
228 | if (posix_clocks[which_clock].res != 0) | ||
229 | return 0; | ||
230 | return 1; | ||
231 | } | 178 | } |
232 | 179 | ||
233 | /* | 180 | /* |
@@ -240,7 +187,7 @@ static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp) | |||
240 | } | 187 | } |
241 | 188 | ||
242 | /* | 189 | /* |
243 | * Get monotonic time for posix timers | 190 | * Get monotonic-raw time for posix timers |
244 | */ | 191 | */ |
245 | static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp) | 192 | static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp) |
246 | { | 193 | { |
@@ -267,46 +214,70 @@ static int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp | |||
267 | *tp = ktime_to_timespec(KTIME_LOW_RES); | 214 | *tp = ktime_to_timespec(KTIME_LOW_RES); |
268 | return 0; | 215 | return 0; |
269 | } | 216 | } |
217 | |||
218 | static int posix_get_boottime(const clockid_t which_clock, struct timespec *tp) | ||
219 | { | ||
220 | get_monotonic_boottime(tp); | ||
221 | return 0; | ||
222 | } | ||
223 | |||
224 | |||
270 | /* | 225 | /* |
271 | * Initialize everything, well, just everything in Posix clocks/timers ;) | 226 | * Initialize everything, well, just everything in Posix clocks/timers ;) |
272 | */ | 227 | */ |
273 | static __init int init_posix_timers(void) | 228 | static __init int init_posix_timers(void) |
274 | { | 229 | { |
275 | struct k_clock clock_realtime = { | 230 | struct k_clock clock_realtime = { |
276 | .clock_getres = hrtimer_get_res, | 231 | .clock_getres = hrtimer_get_res, |
232 | .clock_get = posix_clock_realtime_get, | ||
233 | .clock_set = posix_clock_realtime_set, | ||
234 | .clock_adj = posix_clock_realtime_adj, | ||
235 | .nsleep = common_nsleep, | ||
236 | .nsleep_restart = hrtimer_nanosleep_restart, | ||
237 | .timer_create = common_timer_create, | ||
238 | .timer_set = common_timer_set, | ||
239 | .timer_get = common_timer_get, | ||
240 | .timer_del = common_timer_del, | ||
277 | }; | 241 | }; |
278 | struct k_clock clock_monotonic = { | 242 | struct k_clock clock_monotonic = { |
279 | .clock_getres = hrtimer_get_res, | 243 | .clock_getres = hrtimer_get_res, |
280 | .clock_get = posix_ktime_get_ts, | 244 | .clock_get = posix_ktime_get_ts, |
281 | .clock_set = do_posix_clock_nosettime, | 245 | .nsleep = common_nsleep, |
246 | .nsleep_restart = hrtimer_nanosleep_restart, | ||
247 | .timer_create = common_timer_create, | ||
248 | .timer_set = common_timer_set, | ||
249 | .timer_get = common_timer_get, | ||
250 | .timer_del = common_timer_del, | ||
282 | }; | 251 | }; |
283 | struct k_clock clock_monotonic_raw = { | 252 | struct k_clock clock_monotonic_raw = { |
284 | .clock_getres = hrtimer_get_res, | 253 | .clock_getres = hrtimer_get_res, |
285 | .clock_get = posix_get_monotonic_raw, | 254 | .clock_get = posix_get_monotonic_raw, |
286 | .clock_set = do_posix_clock_nosettime, | ||
287 | .timer_create = no_timer_create, | ||
288 | .nsleep = no_nsleep, | ||
289 | }; | 255 | }; |
290 | struct k_clock clock_realtime_coarse = { | 256 | struct k_clock clock_realtime_coarse = { |
291 | .clock_getres = posix_get_coarse_res, | 257 | .clock_getres = posix_get_coarse_res, |
292 | .clock_get = posix_get_realtime_coarse, | 258 | .clock_get = posix_get_realtime_coarse, |
293 | .clock_set = do_posix_clock_nosettime, | ||
294 | .timer_create = no_timer_create, | ||
295 | .nsleep = no_nsleep, | ||
296 | }; | 259 | }; |
297 | struct k_clock clock_monotonic_coarse = { | 260 | struct k_clock clock_monotonic_coarse = { |
298 | .clock_getres = posix_get_coarse_res, | 261 | .clock_getres = posix_get_coarse_res, |
299 | .clock_get = posix_get_monotonic_coarse, | 262 | .clock_get = posix_get_monotonic_coarse, |
300 | .clock_set = do_posix_clock_nosettime, | 263 | }; |
301 | .timer_create = no_timer_create, | 264 | struct k_clock clock_boottime = { |
302 | .nsleep = no_nsleep, | 265 | .clock_getres = hrtimer_get_res, |
266 | .clock_get = posix_get_boottime, | ||
267 | .nsleep = common_nsleep, | ||
268 | .nsleep_restart = hrtimer_nanosleep_restart, | ||
269 | .timer_create = common_timer_create, | ||
270 | .timer_set = common_timer_set, | ||
271 | .timer_get = common_timer_get, | ||
272 | .timer_del = common_timer_del, | ||
303 | }; | 273 | }; |
304 | 274 | ||
305 | register_posix_clock(CLOCK_REALTIME, &clock_realtime); | 275 | posix_timers_register_clock(CLOCK_REALTIME, &clock_realtime); |
306 | register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); | 276 | posix_timers_register_clock(CLOCK_MONOTONIC, &clock_monotonic); |
307 | register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw); | 277 | posix_timers_register_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw); |
308 | register_posix_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse); | 278 | posix_timers_register_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse); |
309 | register_posix_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse); | 279 | posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse); |
280 | posix_timers_register_clock(CLOCK_BOOTTIME, &clock_boottime); | ||
310 | 281 | ||
311 | posix_timers_cache = kmem_cache_create("posix_timers_cache", | 282 | posix_timers_cache = kmem_cache_create("posix_timers_cache", |
312 | sizeof (struct k_itimer), 0, SLAB_PANIC, | 283 | sizeof (struct k_itimer), 0, SLAB_PANIC, |
@@ -482,17 +453,29 @@ static struct pid *good_sigevent(sigevent_t * event) | |||
482 | return task_pid(rtn); | 453 | return task_pid(rtn); |
483 | } | 454 | } |
484 | 455 | ||
485 | void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock) | 456 | void posix_timers_register_clock(const clockid_t clock_id, |
457 | struct k_clock *new_clock) | ||
486 | { | 458 | { |
487 | if ((unsigned) clock_id >= MAX_CLOCKS) { | 459 | if ((unsigned) clock_id >= MAX_CLOCKS) { |
488 | printk("POSIX clock register failed for clock_id %d\n", | 460 | printk(KERN_WARNING "POSIX clock register failed for clock_id %d\n", |
461 | clock_id); | ||
462 | return; | ||
463 | } | ||
464 | |||
465 | if (!new_clock->clock_get) { | ||
466 | printk(KERN_WARNING "POSIX clock id %d lacks clock_get()\n", | ||
467 | clock_id); | ||
468 | return; | ||
469 | } | ||
470 | if (!new_clock->clock_getres) { | ||
471 | printk(KERN_WARNING "POSIX clock id %d lacks clock_getres()\n", | ||
489 | clock_id); | 472 | clock_id); |
490 | return; | 473 | return; |
491 | } | 474 | } |
492 | 475 | ||
493 | posix_clocks[clock_id] = *new_clock; | 476 | posix_clocks[clock_id] = *new_clock; |
494 | } | 477 | } |
495 | EXPORT_SYMBOL_GPL(register_posix_clock); | 478 | EXPORT_SYMBOL_GPL(posix_timers_register_clock); |
496 | 479 | ||
497 | static struct k_itimer * alloc_posix_timer(void) | 480 | static struct k_itimer * alloc_posix_timer(void) |
498 | { | 481 | { |
@@ -523,19 +506,39 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set) | |||
523 | kmem_cache_free(posix_timers_cache, tmr); | 506 | kmem_cache_free(posix_timers_cache, tmr); |
524 | } | 507 | } |
525 | 508 | ||
509 | static struct k_clock *clockid_to_kclock(const clockid_t id) | ||
510 | { | ||
511 | if (id < 0) | ||
512 | return (id & CLOCKFD_MASK) == CLOCKFD ? | ||
513 | &clock_posix_dynamic : &clock_posix_cpu; | ||
514 | |||
515 | if (id >= MAX_CLOCKS || !posix_clocks[id].clock_getres) | ||
516 | return NULL; | ||
517 | return &posix_clocks[id]; | ||
518 | } | ||
519 | |||
520 | static int common_timer_create(struct k_itimer *new_timer) | ||
521 | { | ||
522 | hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0); | ||
523 | return 0; | ||
524 | } | ||
525 | |||
526 | /* Create a POSIX.1b interval timer. */ | 526 | /* Create a POSIX.1b interval timer. */ |
527 | 527 | ||
528 | SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, | 528 | SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, |
529 | struct sigevent __user *, timer_event_spec, | 529 | struct sigevent __user *, timer_event_spec, |
530 | timer_t __user *, created_timer_id) | 530 | timer_t __user *, created_timer_id) |
531 | { | 531 | { |
532 | struct k_clock *kc = clockid_to_kclock(which_clock); | ||
532 | struct k_itimer *new_timer; | 533 | struct k_itimer *new_timer; |
533 | int error, new_timer_id; | 534 | int error, new_timer_id; |
534 | sigevent_t event; | 535 | sigevent_t event; |
535 | int it_id_set = IT_ID_NOT_SET; | 536 | int it_id_set = IT_ID_NOT_SET; |
536 | 537 | ||
537 | if (invalid_clockid(which_clock)) | 538 | if (!kc) |
538 | return -EINVAL; | 539 | return -EINVAL; |
540 | if (!kc->timer_create) | ||
541 | return -EOPNOTSUPP; | ||
539 | 542 | ||
540 | new_timer = alloc_posix_timer(); | 543 | new_timer = alloc_posix_timer(); |
541 | if (unlikely(!new_timer)) | 544 | if (unlikely(!new_timer)) |
@@ -597,7 +600,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, | |||
597 | goto out; | 600 | goto out; |
598 | } | 601 | } |
599 | 602 | ||
600 | error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer)); | 603 | error = kc->timer_create(new_timer); |
601 | if (error) | 604 | if (error) |
602 | goto out; | 605 | goto out; |
603 | 606 | ||
@@ -607,7 +610,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, | |||
607 | spin_unlock_irq(¤t->sighand->siglock); | 610 | spin_unlock_irq(¤t->sighand->siglock); |
608 | 611 | ||
609 | return 0; | 612 | return 0; |
610 | /* | 613 | /* |
611 | * In the case of the timer belonging to another task, after | 614 | * In the case of the timer belonging to another task, after |
612 | * the task is unlocked, the timer is owned by the other task | 615 | * the task is unlocked, the timer is owned by the other task |
613 | * and may cease to exist at any time. Don't use or modify | 616 | * and may cease to exist at any time. Don't use or modify |
@@ -709,22 +712,28 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) | |||
709 | SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, | 712 | SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, |
710 | struct itimerspec __user *, setting) | 713 | struct itimerspec __user *, setting) |
711 | { | 714 | { |
712 | struct k_itimer *timr; | ||
713 | struct itimerspec cur_setting; | 715 | struct itimerspec cur_setting; |
716 | struct k_itimer *timr; | ||
717 | struct k_clock *kc; | ||
714 | unsigned long flags; | 718 | unsigned long flags; |
719 | int ret = 0; | ||
715 | 720 | ||
716 | timr = lock_timer(timer_id, &flags); | 721 | timr = lock_timer(timer_id, &flags); |
717 | if (!timr) | 722 | if (!timr) |
718 | return -EINVAL; | 723 | return -EINVAL; |
719 | 724 | ||
720 | CLOCK_DISPATCH(timr->it_clock, timer_get, (timr, &cur_setting)); | 725 | kc = clockid_to_kclock(timr->it_clock); |
726 | if (WARN_ON_ONCE(!kc || !kc->timer_get)) | ||
727 | ret = -EINVAL; | ||
728 | else | ||
729 | kc->timer_get(timr, &cur_setting); | ||
721 | 730 | ||
722 | unlock_timer(timr, flags); | 731 | unlock_timer(timr, flags); |
723 | 732 | ||
724 | if (copy_to_user(setting, &cur_setting, sizeof (cur_setting))) | 733 | if (!ret && copy_to_user(setting, &cur_setting, sizeof (cur_setting))) |
725 | return -EFAULT; | 734 | return -EFAULT; |
726 | 735 | ||
727 | return 0; | 736 | return ret; |
728 | } | 737 | } |
729 | 738 | ||
730 | /* | 739 | /* |
@@ -813,6 +822,7 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, | |||
813 | int error = 0; | 822 | int error = 0; |
814 | unsigned long flag; | 823 | unsigned long flag; |
815 | struct itimerspec *rtn = old_setting ? &old_spec : NULL; | 824 | struct itimerspec *rtn = old_setting ? &old_spec : NULL; |
825 | struct k_clock *kc; | ||
816 | 826 | ||
817 | if (!new_setting) | 827 | if (!new_setting) |
818 | return -EINVAL; | 828 | return -EINVAL; |
@@ -828,8 +838,11 @@ retry: | |||
828 | if (!timr) | 838 | if (!timr) |
829 | return -EINVAL; | 839 | return -EINVAL; |
830 | 840 | ||
831 | error = CLOCK_DISPATCH(timr->it_clock, timer_set, | 841 | kc = clockid_to_kclock(timr->it_clock); |
832 | (timr, flags, &new_spec, rtn)); | 842 | if (WARN_ON_ONCE(!kc || !kc->timer_set)) |
843 | error = -EINVAL; | ||
844 | else | ||
845 | error = kc->timer_set(timr, flags, &new_spec, rtn); | ||
833 | 846 | ||
834 | unlock_timer(timr, flag); | 847 | unlock_timer(timr, flag); |
835 | if (error == TIMER_RETRY) { | 848 | if (error == TIMER_RETRY) { |
@@ -844,7 +857,7 @@ retry: | |||
844 | return error; | 857 | return error; |
845 | } | 858 | } |
846 | 859 | ||
847 | static inline int common_timer_del(struct k_itimer *timer) | 860 | static int common_timer_del(struct k_itimer *timer) |
848 | { | 861 | { |
849 | timer->it.real.interval.tv64 = 0; | 862 | timer->it.real.interval.tv64 = 0; |
850 | 863 | ||
@@ -855,7 +868,11 @@ static inline int common_timer_del(struct k_itimer *timer) | |||
855 | 868 | ||
856 | static inline int timer_delete_hook(struct k_itimer *timer) | 869 | static inline int timer_delete_hook(struct k_itimer *timer) |
857 | { | 870 | { |
858 | return CLOCK_DISPATCH(timer->it_clock, timer_del, (timer)); | 871 | struct k_clock *kc = clockid_to_kclock(timer->it_clock); |
872 | |||
873 | if (WARN_ON_ONCE(!kc || !kc->timer_del)) | ||
874 | return -EINVAL; | ||
875 | return kc->timer_del(timer); | ||
859 | } | 876 | } |
860 | 877 | ||
861 | /* Delete a POSIX.1b interval timer. */ | 878 | /* Delete a POSIX.1b interval timer. */ |
@@ -927,69 +944,76 @@ void exit_itimers(struct signal_struct *sig) | |||
927 | } | 944 | } |
928 | } | 945 | } |
929 | 946 | ||
930 | /* Not available / possible... functions */ | ||
931 | int do_posix_clock_nosettime(const clockid_t clockid, struct timespec *tp) | ||
932 | { | ||
933 | return -EINVAL; | ||
934 | } | ||
935 | EXPORT_SYMBOL_GPL(do_posix_clock_nosettime); | ||
936 | |||
937 | int do_posix_clock_nonanosleep(const clockid_t clock, int flags, | ||
938 | struct timespec *t, struct timespec __user *r) | ||
939 | { | ||
940 | #ifndef ENOTSUP | ||
941 | return -EOPNOTSUPP; /* aka ENOTSUP in userland for POSIX */ | ||
942 | #else /* parisc does define it separately. */ | ||
943 | return -ENOTSUP; | ||
944 | #endif | ||
945 | } | ||
946 | EXPORT_SYMBOL_GPL(do_posix_clock_nonanosleep); | ||
947 | |||
948 | SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, | 947 | SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, |
949 | const struct timespec __user *, tp) | 948 | const struct timespec __user *, tp) |
950 | { | 949 | { |
950 | struct k_clock *kc = clockid_to_kclock(which_clock); | ||
951 | struct timespec new_tp; | 951 | struct timespec new_tp; |
952 | 952 | ||
953 | if (invalid_clockid(which_clock)) | 953 | if (!kc || !kc->clock_set) |
954 | return -EINVAL; | 954 | return -EINVAL; |
955 | |||
955 | if (copy_from_user(&new_tp, tp, sizeof (*tp))) | 956 | if (copy_from_user(&new_tp, tp, sizeof (*tp))) |
956 | return -EFAULT; | 957 | return -EFAULT; |
957 | 958 | ||
958 | return CLOCK_DISPATCH(which_clock, clock_set, (which_clock, &new_tp)); | 959 | return kc->clock_set(which_clock, &new_tp); |
959 | } | 960 | } |
960 | 961 | ||
961 | SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, | 962 | SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, |
962 | struct timespec __user *,tp) | 963 | struct timespec __user *,tp) |
963 | { | 964 | { |
965 | struct k_clock *kc = clockid_to_kclock(which_clock); | ||
964 | struct timespec kernel_tp; | 966 | struct timespec kernel_tp; |
965 | int error; | 967 | int error; |
966 | 968 | ||
967 | if (invalid_clockid(which_clock)) | 969 | if (!kc) |
968 | return -EINVAL; | 970 | return -EINVAL; |
969 | error = CLOCK_DISPATCH(which_clock, clock_get, | 971 | |
970 | (which_clock, &kernel_tp)); | 972 | error = kc->clock_get(which_clock, &kernel_tp); |
973 | |||
971 | if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) | 974 | if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) |
972 | error = -EFAULT; | 975 | error = -EFAULT; |
973 | 976 | ||
974 | return error; | 977 | return error; |
978 | } | ||
979 | |||
980 | SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock, | ||
981 | struct timex __user *, utx) | ||
982 | { | ||
983 | struct k_clock *kc = clockid_to_kclock(which_clock); | ||
984 | struct timex ktx; | ||
985 | int err; | ||
986 | |||
987 | if (!kc) | ||
988 | return -EINVAL; | ||
989 | if (!kc->clock_adj) | ||
990 | return -EOPNOTSUPP; | ||
991 | |||
992 | if (copy_from_user(&ktx, utx, sizeof(ktx))) | ||
993 | return -EFAULT; | ||
994 | |||
995 | err = kc->clock_adj(which_clock, &ktx); | ||
996 | |||
997 | if (!err && copy_to_user(utx, &ktx, sizeof(ktx))) | ||
998 | return -EFAULT; | ||
975 | 999 | ||
1000 | return err; | ||
976 | } | 1001 | } |
977 | 1002 | ||
978 | SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, | 1003 | SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, |
979 | struct timespec __user *, tp) | 1004 | struct timespec __user *, tp) |
980 | { | 1005 | { |
1006 | struct k_clock *kc = clockid_to_kclock(which_clock); | ||
981 | struct timespec rtn_tp; | 1007 | struct timespec rtn_tp; |
982 | int error; | 1008 | int error; |
983 | 1009 | ||
984 | if (invalid_clockid(which_clock)) | 1010 | if (!kc) |
985 | return -EINVAL; | 1011 | return -EINVAL; |
986 | 1012 | ||
987 | error = CLOCK_DISPATCH(which_clock, clock_getres, | 1013 | error = kc->clock_getres(which_clock, &rtn_tp); |
988 | (which_clock, &rtn_tp)); | ||
989 | 1014 | ||
990 | if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) { | 1015 | if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) |
991 | error = -EFAULT; | 1016 | error = -EFAULT; |
992 | } | ||
993 | 1017 | ||
994 | return error; | 1018 | return error; |
995 | } | 1019 | } |
@@ -1009,10 +1033,13 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, | |||
1009 | const struct timespec __user *, rqtp, | 1033 | const struct timespec __user *, rqtp, |
1010 | struct timespec __user *, rmtp) | 1034 | struct timespec __user *, rmtp) |
1011 | { | 1035 | { |
1036 | struct k_clock *kc = clockid_to_kclock(which_clock); | ||
1012 | struct timespec t; | 1037 | struct timespec t; |
1013 | 1038 | ||
1014 | if (invalid_clockid(which_clock)) | 1039 | if (!kc) |
1015 | return -EINVAL; | 1040 | return -EINVAL; |
1041 | if (!kc->nsleep) | ||
1042 | return -ENANOSLEEP_NOTSUP; | ||
1016 | 1043 | ||
1017 | if (copy_from_user(&t, rqtp, sizeof (struct timespec))) | 1044 | if (copy_from_user(&t, rqtp, sizeof (struct timespec))) |
1018 | return -EFAULT; | 1045 | return -EFAULT; |
@@ -1020,27 +1047,20 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, | |||
1020 | if (!timespec_valid(&t)) | 1047 | if (!timespec_valid(&t)) |
1021 | return -EINVAL; | 1048 | return -EINVAL; |
1022 | 1049 | ||
1023 | return CLOCK_DISPATCH(which_clock, nsleep, | 1050 | return kc->nsleep(which_clock, flags, &t, rmtp); |
1024 | (which_clock, flags, &t, rmtp)); | ||
1025 | } | ||
1026 | |||
1027 | /* | ||
1028 | * nanosleep_restart for monotonic and realtime clocks | ||
1029 | */ | ||
1030 | static int common_nsleep_restart(struct restart_block *restart_block) | ||
1031 | { | ||
1032 | return hrtimer_nanosleep_restart(restart_block); | ||
1033 | } | 1051 | } |
1034 | 1052 | ||
1035 | /* | 1053 | /* |
1036 | * This will restart clock_nanosleep. This is required only by | 1054 | * This will restart clock_nanosleep. This is required only by |
1037 | * compat_clock_nanosleep_restart for now. | 1055 | * compat_clock_nanosleep_restart for now. |
1038 | */ | 1056 | */ |
1039 | long | 1057 | long clock_nanosleep_restart(struct restart_block *restart_block) |
1040 | clock_nanosleep_restart(struct restart_block *restart_block) | ||
1041 | { | 1058 | { |
1042 | clockid_t which_clock = restart_block->arg0; | 1059 | clockid_t which_clock = restart_block->nanosleep.index; |
1060 | struct k_clock *kc = clockid_to_kclock(which_clock); | ||
1061 | |||
1062 | if (WARN_ON_ONCE(!kc || !kc->nsleep_restart)) | ||
1063 | return -EINVAL; | ||
1043 | 1064 | ||
1044 | return CLOCK_DISPATCH(which_clock, nsleep_restart, | 1065 | return kc->nsleep_restart(restart_block); |
1045 | (restart_block)); | ||
1046 | } | 1066 | } |
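Taken together, the posix-timers.c hunks replace the CLOCK_DISPATCH() macro with a clockid_to_kclock() table lookup, make clock_get/clock_getres mandatory at registration time, add the clock_adjtime() syscall, and register CLOCK_BOOTTIME as a new clock id. The small userspace program below only probes that last, user-visible piece; the #ifndef fallback is just for older headers and nothing in it comes from the patch itself.

#include <stdio.h>
#include <time.h>

#ifndef CLOCK_BOOTTIME
#define CLOCK_BOOTTIME 7	/* clock id used by the kernel's table */
#endif

int main(void)
{
	struct timespec ts;

	if (clock_gettime(CLOCK_BOOTTIME, &ts)) {
		/* EINVAL here means the running kernel predates this change */
		perror("clock_gettime(CLOCK_BOOTTIME)");
		return 1;
	}
	printf("time since boot, suspend included: %ld.%09ld s\n",
	       (long)ts.tv_sec, ts.tv_nsec);
	return 0;
}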
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index a5aff3ebad38..265729966ece 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
@@ -100,13 +100,9 @@ config PM_SLEEP_ADVANCED_DEBUG | |||
100 | depends on PM_ADVANCED_DEBUG | 100 | depends on PM_ADVANCED_DEBUG |
101 | default n | 101 | default n |
102 | 102 | ||
103 | config SUSPEND_NVS | ||
104 | bool | ||
105 | |||
106 | config SUSPEND | 103 | config SUSPEND |
107 | bool "Suspend to RAM and standby" | 104 | bool "Suspend to RAM and standby" |
108 | depends on PM && ARCH_SUSPEND_POSSIBLE | 105 | depends on PM && ARCH_SUSPEND_POSSIBLE |
109 | select SUSPEND_NVS if HAS_IOMEM | ||
110 | default y | 106 | default y |
111 | ---help--- | 107 | ---help--- |
112 | Allow the system to enter sleep states in which main memory is | 108 | Allow the system to enter sleep states in which main memory is |
@@ -140,7 +136,6 @@ config HIBERNATION | |||
140 | depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE | 136 | depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE |
141 | select LZO_COMPRESS | 137 | select LZO_COMPRESS |
142 | select LZO_DECOMPRESS | 138 | select LZO_DECOMPRESS |
143 | select SUSPEND_NVS if HAS_IOMEM | ||
144 | ---help--- | 139 | ---help--- |
145 | Enable the suspend to disk (STD) functionality, which is usually | 140 | Enable the suspend to disk (STD) functionality, which is usually |
146 | called "hibernation" in user interfaces. STD checkpoints the | 141 | called "hibernation" in user interfaces. STD checkpoints the |
diff --git a/kernel/power/Makefile b/kernel/power/Makefile index f9063c6b185d..c350e18b53e3 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile | |||
@@ -1,7 +1,4 @@ | |||
1 | 1 | ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG | |
2 | ifeq ($(CONFIG_PM_DEBUG),y) | ||
3 | EXTRA_CFLAGS += -DDEBUG | ||
4 | endif | ||
5 | 2 | ||
6 | obj-$(CONFIG_PM) += main.o | 3 | obj-$(CONFIG_PM) += main.o |
7 | obj-$(CONFIG_PM_SLEEP) += console.o | 4 | obj-$(CONFIG_PM_SLEEP) += console.o |
@@ -10,6 +7,5 @@ obj-$(CONFIG_SUSPEND) += suspend.o | |||
10 | obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o | 7 | obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o |
11 | obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ | 8 | obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ |
12 | block_io.o | 9 | block_io.o |
13 | obj-$(CONFIG_SUSPEND_NVS) += nvs.o | ||
14 | 10 | ||
15 | obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o | 11 | obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o |
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 048d0b514831..1832bd264219 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c | |||
@@ -51,18 +51,18 @@ enum { | |||
51 | 51 | ||
52 | static int hibernation_mode = HIBERNATION_SHUTDOWN; | 52 | static int hibernation_mode = HIBERNATION_SHUTDOWN; |
53 | 53 | ||
54 | static struct platform_hibernation_ops *hibernation_ops; | 54 | static const struct platform_hibernation_ops *hibernation_ops; |
55 | 55 | ||
56 | /** | 56 | /** |
57 | * hibernation_set_ops - set the global hibernate operations | 57 | * hibernation_set_ops - set the global hibernate operations |
58 | * @ops: the hibernation operations to use in subsequent hibernation transitions | 58 | * @ops: the hibernation operations to use in subsequent hibernation transitions |
59 | */ | 59 | */ |
60 | 60 | ||
61 | void hibernation_set_ops(struct platform_hibernation_ops *ops) | 61 | void hibernation_set_ops(const struct platform_hibernation_ops *ops) |
62 | { | 62 | { |
63 | if (ops && !(ops->begin && ops->end && ops->pre_snapshot | 63 | if (ops && !(ops->begin && ops->end && ops->pre_snapshot |
64 | && ops->prepare && ops->finish && ops->enter && ops->pre_restore | 64 | && ops->prepare && ops->finish && ops->enter && ops->pre_restore |
65 | && ops->restore_cleanup)) { | 65 | && ops->restore_cleanup && ops->leave)) { |
66 | WARN_ON(1); | 66 | WARN_ON(1); |
67 | return; | 67 | return; |
68 | } | 68 | } |
@@ -278,7 +278,7 @@ static int create_image(int platform_mode) | |||
278 | goto Enable_irqs; | 278 | goto Enable_irqs; |
279 | } | 279 | } |
280 | 280 | ||
281 | if (hibernation_test(TEST_CORE) || !pm_check_wakeup_events()) | 281 | if (hibernation_test(TEST_CORE) || pm_wakeup_pending()) |
282 | goto Power_up; | 282 | goto Power_up; |
283 | 283 | ||
284 | in_suspend = 1; | 284 | in_suspend = 1; |
@@ -516,7 +516,7 @@ int hibernation_platform_enter(void) | |||
516 | 516 | ||
517 | local_irq_disable(); | 517 | local_irq_disable(); |
518 | sysdev_suspend(PMSG_HIBERNATE); | 518 | sysdev_suspend(PMSG_HIBERNATE); |
519 | if (!pm_check_wakeup_events()) { | 519 | if (pm_wakeup_pending()) { |
520 | error = -EAGAIN; | 520 | error = -EAGAIN; |
521 | goto Power_up; | 521 | goto Power_up; |
522 | } | 522 | } |
@@ -647,6 +647,7 @@ int hibernate(void) | |||
647 | swsusp_free(); | 647 | swsusp_free(); |
648 | if (!error) | 648 | if (!error) |
649 | power_down(); | 649 | power_down(); |
650 | in_suspend = 0; | ||
650 | pm_restore_gfp_mask(); | 651 | pm_restore_gfp_mask(); |
651 | } else { | 652 | } else { |
652 | pr_debug("PM: Image restored successfully.\n"); | 653 | pr_debug("PM: Image restored successfully.\n"); |
diff --git a/kernel/power/main.c b/kernel/power/main.c index 7b5db6a8561e..701853042c28 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
@@ -326,7 +326,7 @@ EXPORT_SYMBOL_GPL(pm_wq); | |||
326 | 326 | ||
327 | static int __init pm_start_workqueue(void) | 327 | static int __init pm_start_workqueue(void) |
328 | { | 328 | { |
329 | pm_wq = alloc_workqueue("pm", WQ_FREEZEABLE, 0); | 329 | pm_wq = alloc_workqueue("pm", WQ_FREEZABLE, 0); |
330 | 330 | ||
331 | return pm_wq ? 0 : -ENOMEM; | 331 | return pm_wq ? 0 : -ENOMEM; |
332 | } | 332 | } |
diff --git a/kernel/power/nvs.c b/kernel/power/nvs.c deleted file mode 100644 index 1836db60bbb6..000000000000 --- a/kernel/power/nvs.c +++ /dev/null | |||
@@ -1,136 +0,0 @@ | |||
1 | /* | ||
2 | * linux/kernel/power/hibernate_nvs.c - Routines for handling NVS memory | ||
3 | * | ||
4 | * Copyright (C) 2008,2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc. | ||
5 | * | ||
6 | * This file is released under the GPLv2. | ||
7 | */ | ||
8 | |||
9 | #include <linux/io.h> | ||
10 | #include <linux/kernel.h> | ||
11 | #include <linux/list.h> | ||
12 | #include <linux/mm.h> | ||
13 | #include <linux/slab.h> | ||
14 | #include <linux/suspend.h> | ||
15 | |||
16 | /* | ||
17 | * Platforms, like ACPI, may want us to save some memory used by them during | ||
18 | * suspend and to restore the contents of this memory during the subsequent | ||
19 | * resume. The code below implements a mechanism allowing us to do that. | ||
20 | */ | ||
21 | |||
22 | struct nvs_page { | ||
23 | unsigned long phys_start; | ||
24 | unsigned int size; | ||
25 | void *kaddr; | ||
26 | void *data; | ||
27 | struct list_head node; | ||
28 | }; | ||
29 | |||
30 | static LIST_HEAD(nvs_list); | ||
31 | |||
32 | /** | ||
33 | * suspend_nvs_register - register platform NVS memory region to save | ||
34 | * @start - physical address of the region | ||
35 | * @size - size of the region | ||
36 | * | ||
37 | * The NVS region need not be page-aligned (both ends) and we arrange | ||
38 | * things so that the data from page-aligned addresses in this region will | ||
39 | * be copied into separate RAM pages. | ||
40 | */ | ||
41 | int suspend_nvs_register(unsigned long start, unsigned long size) | ||
42 | { | ||
43 | struct nvs_page *entry, *next; | ||
44 | |||
45 | while (size > 0) { | ||
46 | unsigned int nr_bytes; | ||
47 | |||
48 | entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL); | ||
49 | if (!entry) | ||
50 | goto Error; | ||
51 | |||
52 | list_add_tail(&entry->node, &nvs_list); | ||
53 | entry->phys_start = start; | ||
54 | nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK); | ||
55 | entry->size = (size < nr_bytes) ? size : nr_bytes; | ||
56 | |||
57 | start += entry->size; | ||
58 | size -= entry->size; | ||
59 | } | ||
60 | return 0; | ||
61 | |||
62 | Error: | ||
63 | list_for_each_entry_safe(entry, next, &nvs_list, node) { | ||
64 | list_del(&entry->node); | ||
65 | kfree(entry); | ||
66 | } | ||
67 | return -ENOMEM; | ||
68 | } | ||
69 | |||
70 | /** | ||
71 | * suspend_nvs_free - free data pages allocated for saving NVS regions | ||
72 | */ | ||
73 | void suspend_nvs_free(void) | ||
74 | { | ||
75 | struct nvs_page *entry; | ||
76 | |||
77 | list_for_each_entry(entry, &nvs_list, node) | ||
78 | if (entry->data) { | ||
79 | free_page((unsigned long)entry->data); | ||
80 | entry->data = NULL; | ||
81 | if (entry->kaddr) { | ||
82 | iounmap(entry->kaddr); | ||
83 | entry->kaddr = NULL; | ||
84 | } | ||
85 | } | ||
86 | } | ||
87 | |||
88 | /** | ||
89 | * suspend_nvs_alloc - allocate memory necessary for saving NVS regions | ||
90 | */ | ||
91 | int suspend_nvs_alloc(void) | ||
92 | { | ||
93 | struct nvs_page *entry; | ||
94 | |||
95 | list_for_each_entry(entry, &nvs_list, node) { | ||
96 | entry->data = (void *)__get_free_page(GFP_KERNEL); | ||
97 | if (!entry->data) { | ||
98 | suspend_nvs_free(); | ||
99 | return -ENOMEM; | ||
100 | } | ||
101 | } | ||
102 | return 0; | ||
103 | } | ||
104 | |||
105 | /** | ||
106 | * suspend_nvs_save - save NVS memory regions | ||
107 | */ | ||
108 | void suspend_nvs_save(void) | ||
109 | { | ||
110 | struct nvs_page *entry; | ||
111 | |||
112 | printk(KERN_INFO "PM: Saving platform NVS memory\n"); | ||
113 | |||
114 | list_for_each_entry(entry, &nvs_list, node) | ||
115 | if (entry->data) { | ||
116 | entry->kaddr = ioremap(entry->phys_start, entry->size); | ||
117 | memcpy(entry->data, entry->kaddr, entry->size); | ||
118 | } | ||
119 | } | ||
120 | |||
121 | /** | ||
122 | * suspend_nvs_restore - restore NVS memory regions | ||
123 | * | ||
124 | * This function is going to be called with interrupts disabled, so it | ||
125 | * cannot iounmap the virtual addresses used to access the NVS region. | ||
126 | */ | ||
127 | void suspend_nvs_restore(void) | ||
128 | { | ||
129 | struct nvs_page *entry; | ||
130 | |||
131 | printk(KERN_INFO "PM: Restoring platform NVS memory\n"); | ||
132 | |||
133 | list_for_each_entry(entry, &nvs_list, node) | ||
134 | if (entry->data) | ||
135 | memcpy(entry->kaddr, entry->data, entry->size); | ||
136 | } | ||
diff --git a/kernel/power/process.c b/kernel/power/process.c index e50b4c1b2a0f..0cf3a27a6c9d 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c | |||
@@ -22,7 +22,7 @@ | |||
22 | */ | 22 | */ |
23 | #define TIMEOUT (20 * HZ) | 23 | #define TIMEOUT (20 * HZ) |
24 | 24 | ||
25 | static inline int freezeable(struct task_struct * p) | 25 | static inline int freezable(struct task_struct * p) |
26 | { | 26 | { |
27 | if ((p == current) || | 27 | if ((p == current) || |
28 | (p->flags & PF_NOFREEZE) || | 28 | (p->flags & PF_NOFREEZE) || |
@@ -53,7 +53,7 @@ static int try_to_freeze_tasks(bool sig_only) | |||
53 | todo = 0; | 53 | todo = 0; |
54 | read_lock(&tasklist_lock); | 54 | read_lock(&tasklist_lock); |
55 | do_each_thread(g, p) { | 55 | do_each_thread(g, p) { |
56 | if (frozen(p) || !freezeable(p)) | 56 | if (frozen(p) || !freezable(p)) |
57 | continue; | 57 | continue; |
58 | 58 | ||
59 | if (!freeze_task(p, sig_only)) | 59 | if (!freeze_task(p, sig_only)) |
@@ -64,6 +64,12 @@ static int try_to_freeze_tasks(bool sig_only) | |||
64 | * perturb a task in TASK_STOPPED or TASK_TRACED. | 64 | * perturb a task in TASK_STOPPED or TASK_TRACED. |
65 | * It is "frozen enough". If the task does wake | 65 | * It is "frozen enough". If the task does wake |
66 | * up, it will immediately call try_to_freeze. | 66 | * up, it will immediately call try_to_freeze. |
67 | * | ||
68 | * Because freeze_task() goes through p's | ||
69 | * scheduler lock after setting TIF_FREEZE, it's | ||
70 | * guaranteed that either we see TASK_RUNNING or | ||
71 | * try_to_stop() after schedule() in ptrace/signal | ||
72 | * stop sees TIF_FREEZE. | ||
67 | */ | 73 | */ |
68 | if (!task_is_stopped_or_traced(p) && | 74 | if (!task_is_stopped_or_traced(p) && |
69 | !freezer_should_skip(p)) | 75 | !freezer_should_skip(p)) |
@@ -79,7 +85,7 @@ static int try_to_freeze_tasks(bool sig_only) | |||
79 | if (!todo || time_after(jiffies, end_time)) | 85 | if (!todo || time_after(jiffies, end_time)) |
80 | break; | 86 | break; |
81 | 87 | ||
82 | if (!pm_check_wakeup_events()) { | 88 | if (pm_wakeup_pending()) { |
83 | wakeup = true; | 89 | wakeup = true; |
84 | break; | 90 | break; |
85 | } | 91 | } |
@@ -161,7 +167,7 @@ static void thaw_tasks(bool nosig_only) | |||
161 | 167 | ||
162 | read_lock(&tasklist_lock); | 168 | read_lock(&tasklist_lock); |
163 | do_each_thread(g, p) { | 169 | do_each_thread(g, p) { |
164 | if (!freezeable(p)) | 170 | if (!freezable(p)) |
165 | continue; | 171 | continue; |
166 | 172 | ||
167 | if (nosig_only && should_send_signal(p)) | 173 | if (nosig_only && should_send_signal(p)) |
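The process.c hunks rename freezeable() to freezable() and document the TIF_FREEZE ordering guarantee provided by freeze_task(); the contract seen by kernel threads does not change. For reference, the usual pattern on the consumer side looks like the sketch below, where foo_thread() is a hypothetical kthread function and not something introduced by this patch.

static int foo_thread(void *data)
{
	set_freezable();			/* clear PF_NOFREEZE so the freezer may stop us */
	while (!kthread_should_stop()) {
		try_to_freeze();		/* park here while tasks are being frozen */
		/* ... do one unit of work ... */
		schedule_timeout_interruptible(HZ);
	}
	return 0;
}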
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 0dac75ea4456..64db648ff911 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
@@ -1519,11 +1519,8 @@ static int | |||
1519 | swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, | 1519 | swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, |
1520 | unsigned int nr_pages, unsigned int nr_highmem) | 1520 | unsigned int nr_pages, unsigned int nr_highmem) |
1521 | { | 1521 | { |
1522 | int error = 0; | ||
1523 | |||
1524 | if (nr_highmem > 0) { | 1522 | if (nr_highmem > 0) { |
1525 | error = get_highmem_buffer(PG_ANY); | 1523 | if (get_highmem_buffer(PG_ANY)) |
1526 | if (error) | ||
1527 | goto err_out; | 1524 | goto err_out; |
1528 | if (nr_highmem > alloc_highmem) { | 1525 | if (nr_highmem > alloc_highmem) { |
1529 | nr_highmem -= alloc_highmem; | 1526 | nr_highmem -= alloc_highmem; |
@@ -1546,7 +1543,7 @@ swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, | |||
1546 | 1543 | ||
1547 | err_out: | 1544 | err_out: |
1548 | swsusp_free(); | 1545 | swsusp_free(); |
1549 | return error; | 1546 | return -ENOMEM; |
1550 | } | 1547 | } |
1551 | 1548 | ||
1552 | asmlinkage int swsusp_save(void) | 1549 | asmlinkage int swsusp_save(void) |
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 031d5e3a6197..de6f86bfa303 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c | |||
@@ -31,13 +31,13 @@ const char *const pm_states[PM_SUSPEND_MAX] = { | |||
31 | [PM_SUSPEND_MEM] = "mem", | 31 | [PM_SUSPEND_MEM] = "mem", |
32 | }; | 32 | }; |
33 | 33 | ||
34 | static struct platform_suspend_ops *suspend_ops; | 34 | static const struct platform_suspend_ops *suspend_ops; |
35 | 35 | ||
36 | /** | 36 | /** |
37 | * suspend_set_ops - Set the global suspend method table. | 37 | * suspend_set_ops - Set the global suspend method table. |
38 | * @ops: Pointer to ops structure. | 38 | * @ops: Pointer to ops structure. |
39 | */ | 39 | */ |
40 | void suspend_set_ops(struct platform_suspend_ops *ops) | 40 | void suspend_set_ops(const struct platform_suspend_ops *ops) |
41 | { | 41 | { |
42 | mutex_lock(&pm_mutex); | 42 | mutex_lock(&pm_mutex); |
43 | suspend_ops = ops; | 43 | suspend_ops = ops; |
@@ -164,7 +164,7 @@ static int suspend_enter(suspend_state_t state) | |||
164 | 164 | ||
165 | error = sysdev_suspend(PMSG_SUSPEND); | 165 | error = sysdev_suspend(PMSG_SUSPEND); |
166 | if (!error) { | 166 | if (!error) { |
167 | if (!suspend_test(TEST_CORE) && pm_check_wakeup_events()) { | 167 | if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) { |
168 | error = suspend_ops->enter(state); | 168 | error = suspend_ops->enter(state); |
169 | events_check_enabled = false; | 169 | events_check_enabled = false; |
170 | } | 170 | } |
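Only the const qualifier changes in suspend_set_ops(), so existing platforms keep working and the ops table can now live in rodata. A hedged caller sketch, with the .valid/.enter callbacks and suspend_valid_only_mem() assumed from the usual platform_suspend_ops layout rather than taken from this diff:

    #include <linux/init.h>
    #include <linux/suspend.h>

    static int myplat_enter(suspend_state_t state)
    {
            /* program the platform's sleep state here */
            return 0;
    }

    static const struct platform_suspend_ops myplat_suspend_ops = {
            .valid = suspend_valid_only_mem,        /* assumed common helper */
            .enter = myplat_enter,
    };

    static int __init myplat_pm_init(void)
    {
            suspend_set_ops(&myplat_suspend_ops);   /* now takes a const pointer */
            return 0;
    }
    late_initcall(myplat_pm_init);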
diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 8c7e4832b9be..7c97c3a0eee3 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c | |||
@@ -224,7 +224,7 @@ static int swsusp_swap_check(void) | |||
224 | return res; | 224 | return res; |
225 | 225 | ||
226 | root_swap = res; | 226 | root_swap = res; |
227 | res = blkdev_get(hib_resume_bdev, FMODE_WRITE); | 227 | res = blkdev_get(hib_resume_bdev, FMODE_WRITE, NULL); |
228 | if (res) | 228 | if (res) |
229 | return res; | 229 | return res; |
230 | 230 | ||
@@ -888,7 +888,7 @@ out_finish: | |||
888 | /** | 888 | /** |
889 | * swsusp_read - read the hibernation image. | 889 | * swsusp_read - read the hibernation image. |
890 | * @flags_p: flags passed by the "frozen" kernel in the image header should | 890 | * @flags_p: flags passed by the "frozen" kernel in the image header should |
891 | * be written into this memeory location | 891 | * be written into this memory location |
892 | */ | 892 | */ |
893 | 893 | ||
894 | int swsusp_read(unsigned int *flags_p) | 894 | int swsusp_read(unsigned int *flags_p) |
@@ -930,7 +930,8 @@ int swsusp_check(void) | |||
930 | { | 930 | { |
931 | int error; | 931 | int error; |
932 | 932 | ||
933 | hib_resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); | 933 | hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, |
934 | FMODE_READ, NULL); | ||
934 | if (!IS_ERR(hib_resume_bdev)) { | 935 | if (!IS_ERR(hib_resume_bdev)) { |
935 | set_blocksize(hib_resume_bdev, PAGE_SIZE); | 936 | set_blocksize(hib_resume_bdev, PAGE_SIZE); |
936 | clear_page(swsusp_header); | 937 | clear_page(swsusp_header); |
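Both open helpers grow a third "holder" argument in this series; the hibernation code passes NULL because it does not claim the device exclusively. A small sketch of the updated convention (the wrapper and its name are hypothetical):

    #include <linux/blkdev.h>
    #include <linux/fs.h>

    /* Hypothetical wrapper mirroring the call sites above: open the resume
     * device read-only with no exclusive holder. */
    static struct block_device *open_resume_bdev(dev_t devt)
    {
            struct block_device *bdev = blkdev_get_by_dev(devt, FMODE_READ, NULL);

            if (IS_ERR(bdev))
                    return bdev;                    /* ERR_PTR-encoded error */
            set_blocksize(bdev, PAGE_SIZE);
            return bdev;
    }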
diff --git a/kernel/printk.c b/kernel/printk.c index ab3ffc5b3b64..36231525e22f 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
@@ -39,16 +39,11 @@ | |||
39 | #include <linux/syslog.h> | 39 | #include <linux/syslog.h> |
40 | #include <linux/cpu.h> | 40 | #include <linux/cpu.h> |
41 | #include <linux/notifier.h> | 41 | #include <linux/notifier.h> |
42 | #include <linux/rculist.h> | ||
42 | 43 | ||
43 | #include <asm/uaccess.h> | 44 | #include <asm/uaccess.h> |
44 | 45 | ||
45 | /* | 46 | /* |
46 | * for_each_console() allows you to iterate on each console | ||
47 | */ | ||
48 | #define for_each_console(con) \ | ||
49 | for (con = console_drivers; con != NULL; con = con->next) | ||
50 | |||
51 | /* | ||
52 | * Architectures can override it: | 47 | * Architectures can override it: |
53 | */ | 48 | */ |
54 | void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) | 49 | void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) |
@@ -102,7 +97,7 @@ static int console_locked, console_suspended; | |||
102 | /* | 97 | /* |
103 | * logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars | 98 | * logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars |
104 | * It is also used in interesting ways to provide interlocking in | 99 | * It is also used in interesting ways to provide interlocking in |
105 | * release_console_sem(). | 100 | * console_unlock(). |
106 | */ | 101 | */ |
107 | static DEFINE_SPINLOCK(logbuf_lock); | 102 | static DEFINE_SPINLOCK(logbuf_lock); |
108 | 103 | ||
@@ -267,25 +262,47 @@ int dmesg_restrict = 1; | |||
267 | int dmesg_restrict; | 262 | int dmesg_restrict; |
268 | #endif | 263 | #endif |
269 | 264 | ||
265 | static int syslog_action_restricted(int type) | ||
266 | { | ||
267 | if (dmesg_restrict) | ||
268 | return 1; | ||
269 | /* Unless restricted, we allow "read all" and "get buffer size" for everybody */ | ||
270 | return type != SYSLOG_ACTION_READ_ALL && type != SYSLOG_ACTION_SIZE_BUFFER; | ||
271 | } | ||
272 | |||
273 | static int check_syslog_permissions(int type, bool from_file) | ||
274 | { | ||
275 | /* | ||
276 | * If this is from /proc/kmsg and we've already opened it, then we've | ||
277 | * already done the capabilities checks at open time. | ||
278 | */ | ||
279 | if (from_file && type != SYSLOG_ACTION_OPEN) | ||
280 | return 0; | ||
281 | |||
282 | if (syslog_action_restricted(type)) { | ||
283 | if (capable(CAP_SYSLOG)) | ||
284 | return 0; | ||
285 | /* For historical reasons, accept CAP_SYS_ADMIN too, with a warning */ | ||
286 | if (capable(CAP_SYS_ADMIN)) { | ||
287 | WARN_ONCE(1, "Attempt to access syslog with CAP_SYS_ADMIN " | ||
288 | "but no CAP_SYSLOG (deprecated).\n"); | ||
289 | return 0; | ||
290 | } | ||
291 | return -EPERM; | ||
292 | } | ||
293 | return 0; | ||
294 | } | ||
295 | |||
270 | int do_syslog(int type, char __user *buf, int len, bool from_file) | 296 | int do_syslog(int type, char __user *buf, int len, bool from_file) |
271 | { | 297 | { |
272 | unsigned i, j, limit, count; | 298 | unsigned i, j, limit, count; |
273 | int do_clear = 0; | 299 | int do_clear = 0; |
274 | char c; | 300 | char c; |
275 | int error = 0; | 301 | int error; |
276 | 302 | ||
277 | /* | 303 | error = check_syslog_permissions(type, from_file); |
278 | * If this is from /proc/kmsg we only do the capabilities checks | 304 | if (error) |
279 | * at open time. | 305 | goto out; |
280 | */ | ||
281 | if (type == SYSLOG_ACTION_OPEN || !from_file) { | ||
282 | if (dmesg_restrict && !capable(CAP_SYS_ADMIN)) | ||
283 | return -EPERM; | ||
284 | if ((type != SYSLOG_ACTION_READ_ALL && | ||
285 | type != SYSLOG_ACTION_SIZE_BUFFER) && | ||
286 | !capable(CAP_SYS_ADMIN)) | ||
287 | return -EPERM; | ||
288 | } | ||
289 | 306 | ||
290 | error = security_syslog(type); | 307 | error = security_syslog(type); |
291 | if (error) | 308 | if (error) |
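The effect of check_syslog_permissions() is easiest to see from user space: reading the whole log buffer and querying its size stay unprivileged while dmesg_restrict is 0, and require CAP_SYSLOG (or, with a deprecation warning, CAP_SYS_ADMIN) once it is 1. A small probe using the standard syslog(2) action numbers; everything below is ordinary libc, nothing is taken from the patch:

    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/klog.h>

    int main(void)
    {
            int size = klogctl(10, NULL, 0);        /* SYSLOG_ACTION_SIZE_BUFFER */
            char *buf;
            int len;

            if (size <= 0)
                    return 1;
            buf = malloc(size);
            if (!buf)
                    return 1;
            len = klogctl(3, buf, size);            /* SYSLOG_ACTION_READ_ALL */
            if (len < 0) {
                    perror("klogctl");              /* EPERM once restricted */
                    free(buf);
                    return 1;
            }
            fwrite(buf, 1, len, stdout);
            free(buf);
            return 0;
    }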
@@ -500,7 +517,7 @@ static void _call_console_drivers(unsigned start, | |||
500 | /* | 517 | /* |
501 | * Call the console drivers, asking them to write out | 518 | * Call the console drivers, asking them to write out |
502 | * log_buf[start] to log_buf[end - 1]. | 519 | * log_buf[start] to log_buf[end - 1]. |
503 | * The console_sem must be held. | 520 | * The console_lock must be held. |
504 | */ | 521 | */ |
505 | static void call_console_drivers(unsigned start, unsigned end) | 522 | static void call_console_drivers(unsigned start, unsigned end) |
506 | { | 523 | { |
@@ -603,11 +620,11 @@ static int have_callable_console(void) | |||
603 | * | 620 | * |
604 | * This is printk(). It can be called from any context. We want it to work. | 621 | * This is printk(). It can be called from any context. We want it to work. |
605 | * | 622 | * |
606 | * We try to grab the console_sem. If we succeed, it's easy - we log the output and | 623 | * We try to grab the console_lock. If we succeed, it's easy - we log the output and |
607 | * call the console drivers. If we fail to get the semaphore we place the output | 624 | * call the console drivers. If we fail to get the semaphore we place the output |
608 | * into the log buffer and return. The current holder of the console_sem will | 625 | * into the log buffer and return. The current holder of the console_sem will |
609 | * notice the new output in release_console_sem() and will send it to the | 626 | * notice the new output in console_unlock() and will send it to the |
610 | * consoles before releasing the semaphore. | 627 | * consoles before releasing the lock. |
611 | * | 628 | * |
612 | * One effect of this deferred printing is that code which calls printk() and | 629 | * One effect of this deferred printing is that code which calls printk() and |
613 | * then changes console_loglevel may break. This is because console_loglevel | 630 | * then changes console_loglevel may break. This is because console_loglevel |
@@ -658,19 +675,19 @@ static inline int can_use_console(unsigned int cpu) | |||
658 | /* | 675 | /* |
659 | * Try to get console ownership to actually show the kernel | 676 | * Try to get console ownership to actually show the kernel |
660 | * messages from a 'printk'. Return true (and with the | 677 | * messages from a 'printk'. Return true (and with the |
661 | * console_semaphore held, and 'console_locked' set) if it | 678 | * console_lock held, and 'console_locked' set) if it |
662 | * is successful, false otherwise. | 679 | * is successful, false otherwise. |
663 | * | 680 | * |
664 | * This gets called with the 'logbuf_lock' spinlock held and | 681 | * This gets called with the 'logbuf_lock' spinlock held and |
665 | * interrupts disabled. It should return with 'lockbuf_lock' | 682 | * interrupts disabled. It should return with 'lockbuf_lock' |
666 | * released but interrupts still disabled. | 683 | * released but interrupts still disabled. |
667 | */ | 684 | */ |
668 | static int acquire_console_semaphore_for_printk(unsigned int cpu) | 685 | static int console_trylock_for_printk(unsigned int cpu) |
669 | __releases(&logbuf_lock) | 686 | __releases(&logbuf_lock) |
670 | { | 687 | { |
671 | int retval = 0; | 688 | int retval = 0; |
672 | 689 | ||
673 | if (!try_acquire_console_sem()) { | 690 | if (console_trylock()) { |
674 | retval = 1; | 691 | retval = 1; |
675 | 692 | ||
676 | /* | 693 | /* |
@@ -826,12 +843,12 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
826 | * actual magic (print out buffers, wake up klogd, | 843 | * actual magic (print out buffers, wake up klogd, |
827 | * etc). | 844 | * etc). |
828 | * | 845 | * |
829 | * The acquire_console_semaphore_for_printk() function | 846 | * The console_trylock_for_printk() function |
830 | * will release 'logbuf_lock' regardless of whether it | 847 | * will release 'logbuf_lock' regardless of whether it |
831 | * actually gets the semaphore or not. | 848 | * actually gets the semaphore or not. |
832 | */ | 849 | */ |
833 | if (acquire_console_semaphore_for_printk(this_cpu)) | 850 | if (console_trylock_for_printk(this_cpu)) |
834 | release_console_sem(); | 851 | console_unlock(); |
835 | 852 | ||
836 | lockdep_on(); | 853 | lockdep_on(); |
837 | out_restore_irqs: | 854 | out_restore_irqs: |
@@ -992,7 +1009,7 @@ void suspend_console(void) | |||
992 | if (!console_suspend_enabled) | 1009 | if (!console_suspend_enabled) |
993 | return; | 1010 | return; |
994 | printk("Suspending console(s) (use no_console_suspend to debug)\n"); | 1011 | printk("Suspending console(s) (use no_console_suspend to debug)\n"); |
995 | acquire_console_sem(); | 1012 | console_lock(); |
996 | console_suspended = 1; | 1013 | console_suspended = 1; |
997 | up(&console_sem); | 1014 | up(&console_sem); |
998 | } | 1015 | } |
@@ -1003,7 +1020,7 @@ void resume_console(void) | |||
1003 | return; | 1020 | return; |
1004 | down(&console_sem); | 1021 | down(&console_sem); |
1005 | console_suspended = 0; | 1022 | console_suspended = 0; |
1006 | release_console_sem(); | 1023 | console_unlock(); |
1007 | } | 1024 | } |
1008 | 1025 | ||
1009 | /** | 1026 | /** |
@@ -1026,21 +1043,21 @@ static int __cpuinit console_cpu_notify(struct notifier_block *self, | |||
1026 | case CPU_DYING: | 1043 | case CPU_DYING: |
1027 | case CPU_DOWN_FAILED: | 1044 | case CPU_DOWN_FAILED: |
1028 | case CPU_UP_CANCELED: | 1045 | case CPU_UP_CANCELED: |
1029 | acquire_console_sem(); | 1046 | console_lock(); |
1030 | release_console_sem(); | 1047 | console_unlock(); |
1031 | } | 1048 | } |
1032 | return NOTIFY_OK; | 1049 | return NOTIFY_OK; |
1033 | } | 1050 | } |
1034 | 1051 | ||
1035 | /** | 1052 | /** |
1036 | * acquire_console_sem - lock the console system for exclusive use. | 1053 | * console_lock - lock the console system for exclusive use. |
1037 | * | 1054 | * |
1038 | * Acquires a semaphore which guarantees that the caller has | 1055 | * Acquires a lock which guarantees that the caller has |
1039 | * exclusive access to the console system and the console_drivers list. | 1056 | * exclusive access to the console system and the console_drivers list. |
1040 | * | 1057 | * |
1041 | * Can sleep, returns nothing. | 1058 | * Can sleep, returns nothing. |
1042 | */ | 1059 | */ |
1043 | void acquire_console_sem(void) | 1060 | void console_lock(void) |
1044 | { | 1061 | { |
1045 | BUG_ON(in_interrupt()); | 1062 | BUG_ON(in_interrupt()); |
1046 | down(&console_sem); | 1063 | down(&console_sem); |
@@ -1049,21 +1066,29 @@ void acquire_console_sem(void) | |||
1049 | console_locked = 1; | 1066 | console_locked = 1; |
1050 | console_may_schedule = 1; | 1067 | console_may_schedule = 1; |
1051 | } | 1068 | } |
1052 | EXPORT_SYMBOL(acquire_console_sem); | 1069 | EXPORT_SYMBOL(console_lock); |
1053 | 1070 | ||
1054 | int try_acquire_console_sem(void) | 1071 | /** |
1072 | * console_trylock - try to lock the console system for exclusive use. | ||
1073 | * | ||
1074 | * Tries to acquire a lock which guarantees that the caller has | ||
1075 | * exclusive access to the console system and the console_drivers list. | ||
1076 | * | ||
1077 | * returns 1 on success, and 0 on failure to acquire the lock. | ||
1078 | */ | ||
1079 | int console_trylock(void) | ||
1055 | { | 1080 | { |
1056 | if (down_trylock(&console_sem)) | 1081 | if (down_trylock(&console_sem)) |
1057 | return -1; | 1082 | return 0; |
1058 | if (console_suspended) { | 1083 | if (console_suspended) { |
1059 | up(&console_sem); | 1084 | up(&console_sem); |
1060 | return -1; | 1085 | return 0; |
1061 | } | 1086 | } |
1062 | console_locked = 1; | 1087 | console_locked = 1; |
1063 | console_may_schedule = 0; | 1088 | console_may_schedule = 0; |
1064 | return 0; | 1089 | return 1; |
1065 | } | 1090 | } |
1066 | EXPORT_SYMBOL(try_acquire_console_sem); | 1091 | EXPORT_SYMBOL(console_trylock); |
1067 | 1092 | ||
1068 | int is_console_locked(void) | 1093 | int is_console_locked(void) |
1069 | { | 1094 | { |
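Note the flipped return convention: console_trylock() returns 1 when the lock was taken, while the old try_acquire_console_sem() returned 0 on success and -1 on failure, so call sites now read naturally. An illustrative caller (the wrapper name is made up):

    /* Flush buffered output if the console system can be locked without
     * sleeping; otherwise the current holder will flush on its unlock. */
    static void try_flush_console(void)
    {
            if (console_trylock()) {
                    /* exclusive access to console_drivers here */
                    console_unlock();       /* also prints buffered messages */
            }
    }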
@@ -1094,20 +1119,20 @@ void wake_up_klogd(void) | |||
1094 | } | 1119 | } |
1095 | 1120 | ||
1096 | /** | 1121 | /** |
1097 | * release_console_sem - unlock the console system | 1122 | * console_unlock - unlock the console system |
1098 | * | 1123 | * |
1099 | * Releases the semaphore which the caller holds on the console system | 1124 | * Releases the console_lock which the caller holds on the console system |
1100 | * and the console driver list. | 1125 | * and the console driver list. |
1101 | * | 1126 | * |
1102 | * While the semaphore was held, console output may have been buffered | 1127 | * While the console_lock was held, console output may have been buffered |
1103 | * by printk(). If this is the case, release_console_sem() emits | 1128 | * by printk(). If this is the case, console_unlock() emits |
1104 | * the output prior to releasing the semaphore. | 1129 | * the output prior to releasing the lock. |
1105 | * | 1130 | * |
1106 | * If there is output waiting for klogd, we wake it up. | 1131 | * If there is output waiting for klogd, we wake it up. |
1107 | * | 1132 | * |
1108 | * release_console_sem() may be called from any context. | 1133 | * console_unlock() may be called from any context. |
1109 | */ | 1134 | */ |
1110 | void release_console_sem(void) | 1135 | void console_unlock(void) |
1111 | { | 1136 | { |
1112 | unsigned long flags; | 1137 | unsigned long flags; |
1113 | unsigned _con_start, _log_end; | 1138 | unsigned _con_start, _log_end; |
@@ -1140,7 +1165,7 @@ void release_console_sem(void) | |||
1140 | if (wake_klogd) | 1165 | if (wake_klogd) |
1141 | wake_up_klogd(); | 1166 | wake_up_klogd(); |
1142 | } | 1167 | } |
1143 | EXPORT_SYMBOL(release_console_sem); | 1168 | EXPORT_SYMBOL(console_unlock); |
1144 | 1169 | ||
1145 | /** | 1170 | /** |
1146 | * console_conditional_schedule - yield the CPU if required | 1171 | * console_conditional_schedule - yield the CPU if required |
@@ -1149,7 +1174,7 @@ EXPORT_SYMBOL(release_console_sem); | |||
1149 | * if this CPU should yield the CPU to another task, do | 1174 | * if this CPU should yield the CPU to another task, do |
1150 | * so here. | 1175 | * so here. |
1151 | * | 1176 | * |
1152 | * Must be called within acquire_console_sem(). | 1177 | * Must be called within console_lock(). |
1153 | */ | 1178 | */ |
1154 | void __sched console_conditional_schedule(void) | 1179 | void __sched console_conditional_schedule(void) |
1155 | { | 1180 | { |
@@ -1170,14 +1195,14 @@ void console_unblank(void) | |||
1170 | if (down_trylock(&console_sem) != 0) | 1195 | if (down_trylock(&console_sem) != 0) |
1171 | return; | 1196 | return; |
1172 | } else | 1197 | } else |
1173 | acquire_console_sem(); | 1198 | console_lock(); |
1174 | 1199 | ||
1175 | console_locked = 1; | 1200 | console_locked = 1; |
1176 | console_may_schedule = 0; | 1201 | console_may_schedule = 0; |
1177 | for_each_console(c) | 1202 | for_each_console(c) |
1178 | if ((c->flags & CON_ENABLED) && c->unblank) | 1203 | if ((c->flags & CON_ENABLED) && c->unblank) |
1179 | c->unblank(); | 1204 | c->unblank(); |
1180 | release_console_sem(); | 1205 | console_unlock(); |
1181 | } | 1206 | } |
1182 | 1207 | ||
1183 | /* | 1208 | /* |
@@ -1188,7 +1213,7 @@ struct tty_driver *console_device(int *index) | |||
1188 | struct console *c; | 1213 | struct console *c; |
1189 | struct tty_driver *driver = NULL; | 1214 | struct tty_driver *driver = NULL; |
1190 | 1215 | ||
1191 | acquire_console_sem(); | 1216 | console_lock(); |
1192 | for_each_console(c) { | 1217 | for_each_console(c) { |
1193 | if (!c->device) | 1218 | if (!c->device) |
1194 | continue; | 1219 | continue; |
@@ -1196,7 +1221,7 @@ struct tty_driver *console_device(int *index) | |||
1196 | if (driver) | 1221 | if (driver) |
1197 | break; | 1222 | break; |
1198 | } | 1223 | } |
1199 | release_console_sem(); | 1224 | console_unlock(); |
1200 | return driver; | 1225 | return driver; |
1201 | } | 1226 | } |
1202 | 1227 | ||
@@ -1207,17 +1232,17 @@ struct tty_driver *console_device(int *index) | |||
1207 | */ | 1232 | */ |
1208 | void console_stop(struct console *console) | 1233 | void console_stop(struct console *console) |
1209 | { | 1234 | { |
1210 | acquire_console_sem(); | 1235 | console_lock(); |
1211 | console->flags &= ~CON_ENABLED; | 1236 | console->flags &= ~CON_ENABLED; |
1212 | release_console_sem(); | 1237 | console_unlock(); |
1213 | } | 1238 | } |
1214 | EXPORT_SYMBOL(console_stop); | 1239 | EXPORT_SYMBOL(console_stop); |
1215 | 1240 | ||
1216 | void console_start(struct console *console) | 1241 | void console_start(struct console *console) |
1217 | { | 1242 | { |
1218 | acquire_console_sem(); | 1243 | console_lock(); |
1219 | console->flags |= CON_ENABLED; | 1244 | console->flags |= CON_ENABLED; |
1220 | release_console_sem(); | 1245 | console_unlock(); |
1221 | } | 1246 | } |
1222 | EXPORT_SYMBOL(console_start); | 1247 | EXPORT_SYMBOL(console_start); |
1223 | 1248 | ||
@@ -1339,7 +1364,7 @@ void register_console(struct console *newcon) | |||
1339 | * Put this console in the list - keep the | 1364 | * Put this console in the list - keep the |
1340 | * preferred driver at the head of the list. | 1365 | * preferred driver at the head of the list. |
1341 | */ | 1366 | */ |
1342 | acquire_console_sem(); | 1367 | console_lock(); |
1343 | if ((newcon->flags & CON_CONSDEV) || console_drivers == NULL) { | 1368 | if ((newcon->flags & CON_CONSDEV) || console_drivers == NULL) { |
1344 | newcon->next = console_drivers; | 1369 | newcon->next = console_drivers; |
1345 | console_drivers = newcon; | 1370 | console_drivers = newcon; |
@@ -1351,14 +1376,15 @@ void register_console(struct console *newcon) | |||
1351 | } | 1376 | } |
1352 | if (newcon->flags & CON_PRINTBUFFER) { | 1377 | if (newcon->flags & CON_PRINTBUFFER) { |
1353 | /* | 1378 | /* |
1354 | * release_console_sem() will print out the buffered messages | 1379 | * console_unlock() will print out the buffered messages |
1355 | * for us. | 1380 | * for us. |
1356 | */ | 1381 | */ |
1357 | spin_lock_irqsave(&logbuf_lock, flags); | 1382 | spin_lock_irqsave(&logbuf_lock, flags); |
1358 | con_start = log_start; | 1383 | con_start = log_start; |
1359 | spin_unlock_irqrestore(&logbuf_lock, flags); | 1384 | spin_unlock_irqrestore(&logbuf_lock, flags); |
1360 | } | 1385 | } |
1361 | release_console_sem(); | 1386 | console_unlock(); |
1387 | console_sysfs_notify(); | ||
1362 | 1388 | ||
1363 | /* | 1389 | /* |
1364 | * By unregistering the bootconsoles after we enable the real console | 1390 | * By unregistering the bootconsoles after we enable the real console |
@@ -1394,7 +1420,7 @@ int unregister_console(struct console *console) | |||
1394 | return braille_unregister_console(console); | 1420 | return braille_unregister_console(console); |
1395 | #endif | 1421 | #endif |
1396 | 1422 | ||
1397 | acquire_console_sem(); | 1423 | console_lock(); |
1398 | if (console_drivers == console) { | 1424 | if (console_drivers == console) { |
1399 | console_drivers=console->next; | 1425 | console_drivers=console->next; |
1400 | res = 0; | 1426 | res = 0; |
@@ -1416,7 +1442,8 @@ int unregister_console(struct console *console) | |||
1416 | if (console_drivers != NULL && console->flags & CON_CONSDEV) | 1442 | if (console_drivers != NULL && console->flags & CON_CONSDEV) |
1417 | console_drivers->flags |= CON_CONSDEV; | 1443 | console_drivers->flags |= CON_CONSDEV; |
1418 | 1444 | ||
1419 | release_console_sem(); | 1445 | console_unlock(); |
1446 | console_sysfs_notify(); | ||
1420 | return res; | 1447 | return res; |
1421 | } | 1448 | } |
1422 | EXPORT_SYMBOL(unregister_console); | 1449 | EXPORT_SYMBOL(unregister_console); |
@@ -1500,7 +1527,7 @@ int kmsg_dump_register(struct kmsg_dumper *dumper) | |||
1500 | /* Don't allow registering multiple times */ | 1527 | /* Don't allow registering multiple times */ |
1501 | if (!dumper->registered) { | 1528 | if (!dumper->registered) { |
1502 | dumper->registered = 1; | 1529 | dumper->registered = 1; |
1503 | list_add_tail(&dumper->list, &dump_list); | 1530 | list_add_tail_rcu(&dumper->list, &dump_list); |
1504 | err = 0; | 1531 | err = 0; |
1505 | } | 1532 | } |
1506 | spin_unlock_irqrestore(&dump_list_lock, flags); | 1533 | spin_unlock_irqrestore(&dump_list_lock, flags); |
@@ -1524,29 +1551,16 @@ int kmsg_dump_unregister(struct kmsg_dumper *dumper) | |||
1524 | spin_lock_irqsave(&dump_list_lock, flags); | 1551 | spin_lock_irqsave(&dump_list_lock, flags); |
1525 | if (dumper->registered) { | 1552 | if (dumper->registered) { |
1526 | dumper->registered = 0; | 1553 | dumper->registered = 0; |
1527 | list_del(&dumper->list); | 1554 | list_del_rcu(&dumper->list); |
1528 | err = 0; | 1555 | err = 0; |
1529 | } | 1556 | } |
1530 | spin_unlock_irqrestore(&dump_list_lock, flags); | 1557 | spin_unlock_irqrestore(&dump_list_lock, flags); |
1558 | synchronize_rcu(); | ||
1531 | 1559 | ||
1532 | return err; | 1560 | return err; |
1533 | } | 1561 | } |
1534 | EXPORT_SYMBOL_GPL(kmsg_dump_unregister); | 1562 | EXPORT_SYMBOL_GPL(kmsg_dump_unregister); |
1535 | 1563 | ||
1536 | static const char * const kmsg_reasons[] = { | ||
1537 | [KMSG_DUMP_OOPS] = "oops", | ||
1538 | [KMSG_DUMP_PANIC] = "panic", | ||
1539 | [KMSG_DUMP_KEXEC] = "kexec", | ||
1540 | }; | ||
1541 | |||
1542 | static const char *kmsg_to_str(enum kmsg_dump_reason reason) | ||
1543 | { | ||
1544 | if (reason >= ARRAY_SIZE(kmsg_reasons) || reason < 0) | ||
1545 | return "unknown"; | ||
1546 | |||
1547 | return kmsg_reasons[reason]; | ||
1548 | } | ||
1549 | |||
1550 | /** | 1564 | /** |
1551 | * kmsg_dump - dump kernel log to kernel message dumpers. | 1565 | * kmsg_dump - dump kernel log to kernel message dumpers. |
1552 | * @reason: the reason (oops, panic etc) for dumping | 1566 | * @reason: the reason (oops, panic etc) for dumping |
@@ -1585,13 +1599,9 @@ void kmsg_dump(enum kmsg_dump_reason reason) | |||
1585 | l2 = chars; | 1599 | l2 = chars; |
1586 | } | 1600 | } |
1587 | 1601 | ||
1588 | if (!spin_trylock_irqsave(&dump_list_lock, flags)) { | 1602 | rcu_read_lock(); |
1589 | printk(KERN_ERR "dump_kmsg: dump list lock is held during %s, skipping dump\n", | 1603 | list_for_each_entry_rcu(dumper, &dump_list, list) |
1590 | kmsg_to_str(reason)); | ||
1591 | return; | ||
1592 | } | ||
1593 | list_for_each_entry(dumper, &dump_list, list) | ||
1594 | dumper->dump(dumper, reason, s1, l1, s2, l2); | 1604 | dumper->dump(dumper, reason, s1, l1, s2, l2); |
1595 | spin_unlock_irqrestore(&dump_list_lock, flags); | 1605 | rcu_read_unlock(); |
1596 | } | 1606 | } |
1597 | #endif | 1607 | #endif |
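With the dumper list walked under rcu_read_lock(), a dump callback must not sleep, and kmsg_dump_unregister() now waits for in-flight callbacks via synchronize_rcu(). A minimal dumper sketch built around the dumper->dump(dumper, reason, s1, l1, s2, l2) call shown above; the parameter types are assumed from that call rather than quoted from the header:

    #include <linux/kernel.h>
    #include <linux/kmsg_dump.h>
    #include <linux/module.h>

    static void demo_dump(struct kmsg_dumper *dumper,
                          enum kmsg_dump_reason reason,
                          const char *s1, unsigned long l1,
                          const char *s2, unsigned long l2)
    {
            /* s1/l1 and s2/l2 are the two wrapped halves of the log buffer */
            pr_info("kmsg dump (reason %d): %lu + %lu bytes\n", reason, l1, l2);
    }

    static struct kmsg_dumper demo_dumper = {
            .dump = demo_dump,
    };

    static int __init demo_init(void)
    {
            return kmsg_dump_register(&demo_dumper);
    }

    static void __exit demo_exit(void)
    {
            kmsg_dump_unregister(&demo_dumper);     /* returns after an RCU grace period */
    }

    module_init(demo_init);
    module_exit(demo_exit);
    MODULE_LICENSE("GPL");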
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 99bbaa3e5b0d..e2302e40b360 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
@@ -163,7 +163,7 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode) | |||
163 | return !err; | 163 | return !err; |
164 | } | 164 | } |
165 | 165 | ||
166 | int ptrace_attach(struct task_struct *task) | 166 | static int ptrace_attach(struct task_struct *task) |
167 | { | 167 | { |
168 | int retval; | 168 | int retval; |
169 | 169 | ||
@@ -219,7 +219,7 @@ out: | |||
219 | * Performs checks and sets PT_PTRACED. | 219 | * Performs checks and sets PT_PTRACED. |
220 | * Should be used by all ptrace implementations for PTRACE_TRACEME. | 220 | * Should be used by all ptrace implementations for PTRACE_TRACEME. |
221 | */ | 221 | */ |
222 | int ptrace_traceme(void) | 222 | static int ptrace_traceme(void) |
223 | { | 223 | { |
224 | int ret = -EPERM; | 224 | int ret = -EPERM; |
225 | 225 | ||
@@ -293,7 +293,7 @@ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p) | |||
293 | return false; | 293 | return false; |
294 | } | 294 | } |
295 | 295 | ||
296 | int ptrace_detach(struct task_struct *child, unsigned int data) | 296 | static int ptrace_detach(struct task_struct *child, unsigned int data) |
297 | { | 297 | { |
298 | bool dead = false; | 298 | bool dead = false; |
299 | 299 | ||
@@ -313,7 +313,7 @@ int ptrace_detach(struct task_struct *child, unsigned int data) | |||
313 | child->exit_code = data; | 313 | child->exit_code = data; |
314 | dead = __ptrace_detach(current, child); | 314 | dead = __ptrace_detach(current, child); |
315 | if (!child->exit_state) | 315 | if (!child->exit_state) |
316 | wake_up_process(child); | 316 | wake_up_state(child, TASK_TRACED | TASK_STOPPED); |
317 | } | 317 | } |
318 | write_unlock_irq(&tasklist_lock); | 318 | write_unlock_irq(&tasklist_lock); |
319 | 319 | ||
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 034493724749..0c343b9a46d5 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c | |||
@@ -189,7 +189,8 @@ static int rcu_kthread(void *arg) | |||
189 | unsigned long flags; | 189 | unsigned long flags; |
190 | 190 | ||
191 | for (;;) { | 191 | for (;;) { |
192 | wait_event(rcu_kthread_wq, have_rcu_kthread_work != 0); | 192 | wait_event_interruptible(rcu_kthread_wq, |
193 | have_rcu_kthread_work != 0); | ||
193 | morework = rcu_boost(); | 194 | morework = rcu_boost(); |
194 | local_irq_save(flags); | 195 | local_irq_save(flags); |
195 | work = have_rcu_kthread_work; | 196 | work = have_rcu_kthread_work; |
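Sleeping interruptibly here keeps the idle RCU kthread from being counted as an uninterruptible task (which would inflate load average); that motivation is inferred, not stated in the hunk. The generic shape of the pattern, with the wait queue and condition named hypothetically:

    #include <linux/kthread.h>
    #include <linux/wait.h>

    static DECLARE_WAIT_QUEUE_HEAD(my_wq);          /* placeholder names */
    static int my_work_pending;

    static int my_kthread(void *arg)
    {
            while (!kthread_should_stop()) {
                    wait_event_interruptible(my_wq,
                            my_work_pending || kthread_should_stop());
                    /* a signal or spurious wakeup just re-checks the condition */
                    my_work_pending = 0;            /* consume the work (sketch) */
            }
            return 0;
    }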
diff --git a/kernel/rcutree.c b/kernel/rcutree.c index d0ddfea6579d..dd4aea806f8e 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c | |||
@@ -364,8 +364,8 @@ void rcu_irq_exit(void) | |||
364 | WARN_ON_ONCE(rdtp->dynticks & 0x1); | 364 | WARN_ON_ONCE(rdtp->dynticks & 0x1); |
365 | 365 | ||
366 | /* If the interrupt queued a callback, get out of dyntick mode. */ | 366 | /* If the interrupt queued a callback, get out of dyntick mode. */ |
367 | if (__get_cpu_var(rcu_sched_data).nxtlist || | 367 | if (__this_cpu_read(rcu_sched_data.nxtlist) || |
368 | __get_cpu_var(rcu_bh_data).nxtlist) | 368 | __this_cpu_read(rcu_bh_data.nxtlist)) |
369 | set_need_resched(); | 369 | set_need_resched(); |
370 | } | 370 | } |
371 | 371 | ||
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c index ddabb54bb5c8..3c7cbc2c33be 100644 --- a/kernel/rtmutex-debug.c +++ b/kernel/rtmutex-debug.c | |||
@@ -215,7 +215,6 @@ void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) | |||
215 | put_pid(waiter->deadlock_task_pid); | 215 | put_pid(waiter->deadlock_task_pid); |
216 | TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry)); | 216 | TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry)); |
217 | TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); | 217 | TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); |
218 | TRACE_WARN_ON(waiter->task); | ||
219 | memset(waiter, 0x22, sizeof(*waiter)); | 218 | memset(waiter, 0x22, sizeof(*waiter)); |
220 | } | 219 | } |
221 | 220 | ||
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c index 66cb89bc5ef1..5c9ccd380966 100644 --- a/kernel/rtmutex-tester.c +++ b/kernel/rtmutex-tester.c | |||
@@ -9,7 +9,6 @@ | |||
9 | #include <linux/kthread.h> | 9 | #include <linux/kthread.h> |
10 | #include <linux/module.h> | 10 | #include <linux/module.h> |
11 | #include <linux/sched.h> | 11 | #include <linux/sched.h> |
12 | #include <linux/smp_lock.h> | ||
13 | #include <linux/spinlock.h> | 12 | #include <linux/spinlock.h> |
14 | #include <linux/sysdev.h> | 13 | #include <linux/sysdev.h> |
15 | #include <linux/timer.h> | 14 | #include <linux/timer.h> |
@@ -27,7 +26,6 @@ struct test_thread_data { | |||
27 | int opcode; | 26 | int opcode; |
28 | int opdata; | 27 | int opdata; |
29 | int mutexes[MAX_RT_TEST_MUTEXES]; | 28 | int mutexes[MAX_RT_TEST_MUTEXES]; |
30 | int bkl; | ||
31 | int event; | 29 | int event; |
32 | struct sys_device sysdev; | 30 | struct sys_device sysdev; |
33 | }; | 31 | }; |
@@ -46,9 +44,8 @@ enum test_opcodes { | |||
46 | RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */ | 44 | RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */ |
47 | RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */ | 45 | RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */ |
48 | RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */ | 46 | RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */ |
49 | RTTEST_LOCKBKL, /* 9 Lock BKL */ | 47 | /* 9, 10 - reserved for BKL commemoration */ |
50 | RTTEST_UNLOCKBKL, /* 10 Unlock BKL */ | 48 | RTTEST_SIGNAL = 11, /* 11 Signal other test thread, data = thread id */ |
51 | RTTEST_SIGNAL, /* 11 Signal other test thread, data = thread id */ | ||
52 | RTTEST_RESETEVENT = 98, /* 98 Reset event counter */ | 49 | RTTEST_RESETEVENT = 98, /* 98 Reset event counter */ |
53 | RTTEST_RESET = 99, /* 99 Reset all pending operations */ | 50 | RTTEST_RESET = 99, /* 99 Reset all pending operations */ |
54 | }; | 51 | }; |
@@ -74,13 +71,6 @@ static int handle_op(struct test_thread_data *td, int lockwakeup) | |||
74 | td->mutexes[i] = 0; | 71 | td->mutexes[i] = 0; |
75 | } | 72 | } |
76 | } | 73 | } |
77 | |||
78 | if (!lockwakeup && td->bkl == 4) { | ||
79 | #ifdef CONFIG_LOCK_KERNEL | ||
80 | unlock_kernel(); | ||
81 | #endif | ||
82 | td->bkl = 0; | ||
83 | } | ||
84 | return 0; | 74 | return 0; |
85 | 75 | ||
86 | case RTTEST_RESETEVENT: | 76 | case RTTEST_RESETEVENT: |
@@ -131,25 +121,6 @@ static int handle_op(struct test_thread_data *td, int lockwakeup) | |||
131 | td->mutexes[id] = 0; | 121 | td->mutexes[id] = 0; |
132 | return 0; | 122 | return 0; |
133 | 123 | ||
134 | case RTTEST_LOCKBKL: | ||
135 | if (td->bkl) | ||
136 | return 0; | ||
137 | td->bkl = 1; | ||
138 | #ifdef CONFIG_LOCK_KERNEL | ||
139 | lock_kernel(); | ||
140 | #endif | ||
141 | td->bkl = 4; | ||
142 | return 0; | ||
143 | |||
144 | case RTTEST_UNLOCKBKL: | ||
145 | if (td->bkl != 4) | ||
146 | break; | ||
147 | #ifdef CONFIG_LOCK_KERNEL | ||
148 | unlock_kernel(); | ||
149 | #endif | ||
150 | td->bkl = 0; | ||
151 | return 0; | ||
152 | |||
153 | default: | 124 | default: |
154 | break; | 125 | break; |
155 | } | 126 | } |
@@ -196,7 +167,6 @@ void schedule_rt_mutex_test(struct rt_mutex *mutex) | |||
196 | td->event = atomic_add_return(1, &rttest_event); | 167 | td->event = atomic_add_return(1, &rttest_event); |
197 | break; | 168 | break; |
198 | 169 | ||
199 | case RTTEST_LOCKBKL: | ||
200 | default: | 170 | default: |
201 | break; | 171 | break; |
202 | } | 172 | } |
@@ -229,8 +199,6 @@ void schedule_rt_mutex_test(struct rt_mutex *mutex) | |||
229 | td->event = atomic_add_return(1, &rttest_event); | 199 | td->event = atomic_add_return(1, &rttest_event); |
230 | return; | 200 | return; |
231 | 201 | ||
232 | case RTTEST_LOCKBKL: | ||
233 | return; | ||
234 | default: | 202 | default: |
235 | return; | 203 | return; |
236 | } | 204 | } |
@@ -380,11 +348,11 @@ static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute | |||
380 | spin_lock(&rttest_lock); | 348 | spin_lock(&rttest_lock); |
381 | 349 | ||
382 | curr += sprintf(curr, | 350 | curr += sprintf(curr, |
383 | "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, K: %d, M:", | 351 | "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, M:", |
384 | td->opcode, td->event, tsk->state, | 352 | td->opcode, td->event, tsk->state, |
385 | (MAX_RT_PRIO - 1) - tsk->prio, | 353 | (MAX_RT_PRIO - 1) - tsk->prio, |
386 | (MAX_RT_PRIO - 1) - tsk->normal_prio, | 354 | (MAX_RT_PRIO - 1) - tsk->normal_prio, |
387 | tsk->pi_blocked_on, td->bkl); | 355 | tsk->pi_blocked_on); |
388 | 356 | ||
389 | for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--) | 357 | for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--) |
390 | curr += sprintf(curr, "%d", td->mutexes[i]); | 358 | curr += sprintf(curr, "%d", td->mutexes[i]); |
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index a9604815786a..ab449117aaf2 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c | |||
@@ -20,41 +20,34 @@ | |||
20 | /* | 20 | /* |
21 | * lock->owner state tracking: | 21 | * lock->owner state tracking: |
22 | * | 22 | * |
23 | * lock->owner holds the task_struct pointer of the owner. Bit 0 and 1 | 23 | * lock->owner holds the task_struct pointer of the owner. Bit 0 |
24 | * are used to keep track of the "owner is pending" and "lock has | 24 | * is used to keep track of the "lock has waiters" state. |
25 | * waiters" state. | ||
26 | * | 25 | * |
27 | * owner bit1 bit0 | 26 | * owner bit0 |
28 | * NULL 0 0 lock is free (fast acquire possible) | 27 | * NULL 0 lock is free (fast acquire possible) |
29 | * NULL 0 1 invalid state | 28 | * NULL 1 lock is free and has waiters and the top waiter |
30 | * NULL 1 0 Transitional State* | 29 | * is going to take the lock* |
31 | * NULL 1 1 invalid state | 30 | * taskpointer 0 lock is held (fast release possible) |
32 | * taskpointer 0 0 lock is held (fast release possible) | 31 | * taskpointer 1 lock is held and has waiters** |
33 | * taskpointer 0 1 task is pending owner | ||
34 | * taskpointer 1 0 lock is held and has waiters | ||
35 | * taskpointer 1 1 task is pending owner and lock has more waiters | ||
36 | * | ||
37 | * Pending ownership is assigned to the top (highest priority) | ||
38 | * waiter of the lock, when the lock is released. The thread is woken | ||
39 | * up and can now take the lock. Until the lock is taken (bit 0 | ||
40 | * cleared) a competing higher priority thread can steal the lock | ||
41 | * which puts the woken up thread back on the waiters list. | ||
42 | * | 32 | * |
43 | * The fast atomic compare exchange based acquire and release is only | 33 | * The fast atomic compare exchange based acquire and release is only |
44 | * possible when bit 0 and 1 of lock->owner are 0. | 34 | * possible when bit 0 of lock->owner is 0. |
35 | * | ||
36 | * (*) It also can be a transitional state when grabbing the lock | ||
37 | * with ->wait_lock held. To prevent any fast path cmpxchg to the lock, | ||
38 | * we need to set bit 0 before looking at the lock, and the owner may be | ||
39 | * NULL in this small time, hence this can be a transitional state. | ||
45 | * | 40 | * |
46 | * (*) There's a small time where the owner can be NULL and the | 41 | * (**) There is a small time when bit 0 is set but there are no |
47 | * "lock has waiters" bit is set. This can happen when grabbing the lock. | 42 | * waiters. This can happen when grabbing the lock in the slow path. |
48 | * To prevent a cmpxchg of the owner releasing the lock, we need to set this | 43 | * To prevent a cmpxchg of the owner releasing the lock, we need to |
49 | * bit before looking at the lock, hence the reason this is a transitional | 44 | * set this bit before looking at the lock. |
50 | * state. | ||
51 | */ | 45 | */ |
52 | 46 | ||
53 | static void | 47 | static void |
54 | rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner, | 48 | rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner) |
55 | unsigned long mask) | ||
56 | { | 49 | { |
57 | unsigned long val = (unsigned long)owner | mask; | 50 | unsigned long val = (unsigned long)owner; |
58 | 51 | ||
59 | if (rt_mutex_has_waiters(lock)) | 52 | if (rt_mutex_has_waiters(lock)) |
60 | val |= RT_MUTEX_HAS_WAITERS; | 53 | val |= RT_MUTEX_HAS_WAITERS; |
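The simplified encoding relies on task_struct pointers being at least word aligned, leaving bit 0 free for the "has waiters" flag. A stand-alone user-space illustration of the same trick (the names mirror the kernel's, but nothing below is kernel code):

    #include <assert.h>
    #include <stdio.h>

    #define RT_MUTEX_HAS_WAITERS    1UL
    #define RT_MUTEX_OWNER_MASKALL  1UL

    struct task { int dummy; } __attribute__((aligned(4)));

    static struct task *owner_of(unsigned long word)
    {
            return (struct task *)(word & ~RT_MUTEX_OWNER_MASKALL);
    }

    int main(void)
    {
            static struct task t;
            unsigned long word = (unsigned long)&t | RT_MUTEX_HAS_WAITERS;

            assert(owner_of(word) == &t);           /* owner survives the flag */
            printf("has waiters: %lu\n", word & RT_MUTEX_HAS_WAITERS);
            return 0;
    }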
@@ -203,15 +196,14 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, | |||
203 | * reached or the state of the chain has changed while we | 196 | * reached or the state of the chain has changed while we |
204 | * dropped the locks. | 197 | * dropped the locks. |
205 | */ | 198 | */ |
206 | if (!waiter || !waiter->task) | 199 | if (!waiter) |
207 | goto out_unlock_pi; | 200 | goto out_unlock_pi; |
208 | 201 | ||
209 | /* | 202 | /* |
210 | * Check the orig_waiter state. After we dropped the locks, | 203 | * Check the orig_waiter state. After we dropped the locks, |
211 | * the previous owner of the lock might have released the lock | 204 | * the previous owner of the lock might have released the lock. |
212 | * and made us the pending owner: | ||
213 | */ | 205 | */ |
214 | if (orig_waiter && !orig_waiter->task) | 206 | if (orig_waiter && !rt_mutex_owner(orig_lock)) |
215 | goto out_unlock_pi; | 207 | goto out_unlock_pi; |
216 | 208 | ||
217 | /* | 209 | /* |
@@ -254,6 +246,17 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, | |||
254 | 246 | ||
255 | /* Release the task */ | 247 | /* Release the task */ |
256 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); | 248 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); |
249 | if (!rt_mutex_owner(lock)) { | ||
250 | /* | ||
251 | * If the requeue above changed the top waiter, then we need | ||
252 | * to wake the new top waiter up to try to get the lock. | ||
253 | */ | ||
254 | |||
255 | if (top_waiter != rt_mutex_top_waiter(lock)) | ||
256 | wake_up_process(rt_mutex_top_waiter(lock)->task); | ||
257 | raw_spin_unlock(&lock->wait_lock); | ||
258 | goto out_put_task; | ||
259 | } | ||
257 | put_task_struct(task); | 260 | put_task_struct(task); |
258 | 261 | ||
259 | /* Grab the next task */ | 262 | /* Grab the next task */ |
@@ -296,78 +299,16 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, | |||
296 | } | 299 | } |
297 | 300 | ||
298 | /* | 301 | /* |
299 | * Optimization: check if we can steal the lock from the | ||
300 | * assigned pending owner [which might not have taken the | ||
301 | * lock yet]: | ||
302 | */ | ||
303 | static inline int try_to_steal_lock(struct rt_mutex *lock, | ||
304 | struct task_struct *task) | ||
305 | { | ||
306 | struct task_struct *pendowner = rt_mutex_owner(lock); | ||
307 | struct rt_mutex_waiter *next; | ||
308 | unsigned long flags; | ||
309 | |||
310 | if (!rt_mutex_owner_pending(lock)) | ||
311 | return 0; | ||
312 | |||
313 | if (pendowner == task) | ||
314 | return 1; | ||
315 | |||
316 | raw_spin_lock_irqsave(&pendowner->pi_lock, flags); | ||
317 | if (task->prio >= pendowner->prio) { | ||
318 | raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags); | ||
319 | return 0; | ||
320 | } | ||
321 | |||
322 | /* | ||
323 | * Check if a waiter is enqueued on the pending owners | ||
324 | * pi_waiters list. Remove it and readjust pending owners | ||
325 | * priority. | ||
326 | */ | ||
327 | if (likely(!rt_mutex_has_waiters(lock))) { | ||
328 | raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags); | ||
329 | return 1; | ||
330 | } | ||
331 | |||
332 | /* No chain handling, pending owner is not blocked on anything: */ | ||
333 | next = rt_mutex_top_waiter(lock); | ||
334 | plist_del(&next->pi_list_entry, &pendowner->pi_waiters); | ||
335 | __rt_mutex_adjust_prio(pendowner); | ||
336 | raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags); | ||
337 | |||
338 | /* | ||
339 | * We are going to steal the lock and a waiter was | ||
340 | * enqueued on the pending owners pi_waiters queue. So | ||
341 | * we have to enqueue this waiter into | ||
342 | * task->pi_waiters list. This covers the case, | ||
343 | * where task is boosted because it holds another | ||
344 | * lock and gets unboosted because the booster is | ||
345 | * interrupted, so we would delay a waiter with higher | ||
346 | * priority as task->normal_prio. | ||
347 | * | ||
348 | * Note: in the rare case of a SCHED_OTHER task changing | ||
349 | * its priority and thus stealing the lock, next->task | ||
350 | * might be task: | ||
351 | */ | ||
352 | if (likely(next->task != task)) { | ||
353 | raw_spin_lock_irqsave(&task->pi_lock, flags); | ||
354 | plist_add(&next->pi_list_entry, &task->pi_waiters); | ||
355 | __rt_mutex_adjust_prio(task); | ||
356 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); | ||
357 | } | ||
358 | return 1; | ||
359 | } | ||
360 | |||
361 | /* | ||
362 | * Try to take an rt-mutex | 302 | * Try to take an rt-mutex |
363 | * | 303 | * |
364 | * This fails | ||
365 | * - when the lock has a real owner | ||
366 | * - when a different pending owner exists and has higher priority than current | ||
367 | * | ||
368 | * Must be called with lock->wait_lock held. | 304 | * Must be called with lock->wait_lock held. |
305 | * | ||
306 | * @lock: the lock to be acquired. | ||
307 | * @task: the task which wants to acquire the lock | ||
308 | * @waiter: the waiter that is queued to the lock's wait list. (could be NULL) | ||
369 | */ | 309 | */ |
370 | static int try_to_take_rt_mutex(struct rt_mutex *lock) | 310 | static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, |
311 | struct rt_mutex_waiter *waiter) | ||
371 | { | 312 | { |
372 | /* | 313 | /* |
373 | * We have to be careful here if the atomic speedups are | 314 | * We have to be careful here if the atomic speedups are |
@@ -390,15 +331,52 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock) | |||
390 | */ | 331 | */ |
391 | mark_rt_mutex_waiters(lock); | 332 | mark_rt_mutex_waiters(lock); |
392 | 333 | ||
393 | if (rt_mutex_owner(lock) && !try_to_steal_lock(lock, current)) | 334 | if (rt_mutex_owner(lock)) |
394 | return 0; | 335 | return 0; |
395 | 336 | ||
337 | /* | ||
338 | * It will get the lock because of one of these conditions: | ||
339 | * 1) there is no waiter | ||
340 | * 2) higher priority than waiters | ||
341 | * 3) it is top waiter | ||
342 | */ | ||
343 | if (rt_mutex_has_waiters(lock)) { | ||
344 | if (task->prio >= rt_mutex_top_waiter(lock)->list_entry.prio) { | ||
345 | if (!waiter || waiter != rt_mutex_top_waiter(lock)) | ||
346 | return 0; | ||
347 | } | ||
348 | } | ||
349 | |||
350 | if (waiter || rt_mutex_has_waiters(lock)) { | ||
351 | unsigned long flags; | ||
352 | struct rt_mutex_waiter *top; | ||
353 | |||
354 | raw_spin_lock_irqsave(&task->pi_lock, flags); | ||
355 | |||
356 | /* remove the queued waiter. */ | ||
357 | if (waiter) { | ||
358 | plist_del(&waiter->list_entry, &lock->wait_list); | ||
359 | task->pi_blocked_on = NULL; | ||
360 | } | ||
361 | |||
362 | /* | ||
363 | * We have to enqueue the top waiter(if it exists) into | ||
364 | * task->pi_waiters list. | ||
365 | */ | ||
366 | if (rt_mutex_has_waiters(lock)) { | ||
367 | top = rt_mutex_top_waiter(lock); | ||
368 | top->pi_list_entry.prio = top->list_entry.prio; | ||
369 | plist_add(&top->pi_list_entry, &task->pi_waiters); | ||
370 | } | ||
371 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); | ||
372 | } | ||
373 | |||
396 | /* We got the lock. */ | 374 | /* We got the lock. */ |
397 | debug_rt_mutex_lock(lock); | 375 | debug_rt_mutex_lock(lock); |
398 | 376 | ||
399 | rt_mutex_set_owner(lock, current, 0); | 377 | rt_mutex_set_owner(lock, task); |
400 | 378 | ||
401 | rt_mutex_deadlock_account_lock(lock, current); | 379 | rt_mutex_deadlock_account_lock(lock, task); |
402 | 380 | ||
403 | return 1; | 381 | return 1; |
404 | } | 382 | } |
@@ -436,6 +414,9 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, | |||
436 | 414 | ||
437 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); | 415 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); |
438 | 416 | ||
417 | if (!owner) | ||
418 | return 0; | ||
419 | |||
439 | if (waiter == rt_mutex_top_waiter(lock)) { | 420 | if (waiter == rt_mutex_top_waiter(lock)) { |
440 | raw_spin_lock_irqsave(&owner->pi_lock, flags); | 421 | raw_spin_lock_irqsave(&owner->pi_lock, flags); |
441 | plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); | 422 | plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); |
@@ -472,21 +453,18 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, | |||
472 | /* | 453 | /* |
473 | * Wake up the next waiter on the lock. | 454 | * Wake up the next waiter on the lock. |
474 | * | 455 | * |
475 | * Remove the top waiter from the current tasks waiter list and from | 456 | * Remove the top waiter from the current tasks waiter list and wake it up. |
476 | * the lock waiter list. Set it as pending owner. Then wake it up. | ||
477 | * | 457 | * |
478 | * Called with lock->wait_lock held. | 458 | * Called with lock->wait_lock held. |
479 | */ | 459 | */ |
480 | static void wakeup_next_waiter(struct rt_mutex *lock) | 460 | static void wakeup_next_waiter(struct rt_mutex *lock) |
481 | { | 461 | { |
482 | struct rt_mutex_waiter *waiter; | 462 | struct rt_mutex_waiter *waiter; |
483 | struct task_struct *pendowner; | ||
484 | unsigned long flags; | 463 | unsigned long flags; |
485 | 464 | ||
486 | raw_spin_lock_irqsave(¤t->pi_lock, flags); | 465 | raw_spin_lock_irqsave(¤t->pi_lock, flags); |
487 | 466 | ||
488 | waiter = rt_mutex_top_waiter(lock); | 467 | waiter = rt_mutex_top_waiter(lock); |
489 | plist_del(&waiter->list_entry, &lock->wait_list); | ||
490 | 468 | ||
491 | /* | 469 | /* |
492 | * Remove it from current->pi_waiters. We do not adjust a | 470 | * Remove it from current->pi_waiters. We do not adjust a |
@@ -495,43 +473,19 @@ static void wakeup_next_waiter(struct rt_mutex *lock) | |||
495 | * lock->wait_lock. | 473 | * lock->wait_lock. |
496 | */ | 474 | */ |
497 | plist_del(&waiter->pi_list_entry, ¤t->pi_waiters); | 475 | plist_del(&waiter->pi_list_entry, ¤t->pi_waiters); |
498 | pendowner = waiter->task; | ||
499 | waiter->task = NULL; | ||
500 | 476 | ||
501 | rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING); | 477 | rt_mutex_set_owner(lock, NULL); |
502 | 478 | ||
503 | raw_spin_unlock_irqrestore(¤t->pi_lock, flags); | 479 | raw_spin_unlock_irqrestore(¤t->pi_lock, flags); |
504 | 480 | ||
505 | /* | 481 | wake_up_process(waiter->task); |
506 | * Clear the pi_blocked_on variable and enqueue a possible | ||
507 | * waiter into the pi_waiters list of the pending owner. This | ||
508 | * prevents that in case the pending owner gets unboosted a | ||
509 | * waiter with higher priority than pending-owner->normal_prio | ||
510 | * is blocked on the unboosted (pending) owner. | ||
511 | */ | ||
512 | raw_spin_lock_irqsave(&pendowner->pi_lock, flags); | ||
513 | |||
514 | WARN_ON(!pendowner->pi_blocked_on); | ||
515 | WARN_ON(pendowner->pi_blocked_on != waiter); | ||
516 | WARN_ON(pendowner->pi_blocked_on->lock != lock); | ||
517 | |||
518 | pendowner->pi_blocked_on = NULL; | ||
519 | |||
520 | if (rt_mutex_has_waiters(lock)) { | ||
521 | struct rt_mutex_waiter *next; | ||
522 | |||
523 | next = rt_mutex_top_waiter(lock); | ||
524 | plist_add(&next->pi_list_entry, &pendowner->pi_waiters); | ||
525 | } | ||
526 | raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags); | ||
527 | |||
528 | wake_up_process(pendowner); | ||
529 | } | 482 | } |
530 | 483 | ||
531 | /* | 484 | /* |
532 | * Remove a waiter from a lock | 485 | * Remove a waiter from a lock and give up |
533 | * | 486 | * |
534 | * Must be called with lock->wait_lock held | 487 | * Must be called with lock->wait_lock held and |
488 | * have just failed to try_to_take_rt_mutex(). | ||
535 | */ | 489 | */ |
536 | static void remove_waiter(struct rt_mutex *lock, | 490 | static void remove_waiter(struct rt_mutex *lock, |
537 | struct rt_mutex_waiter *waiter) | 491 | struct rt_mutex_waiter *waiter) |
@@ -543,11 +497,13 @@ static void remove_waiter(struct rt_mutex *lock, | |||
543 | 497 | ||
544 | raw_spin_lock_irqsave(¤t->pi_lock, flags); | 498 | raw_spin_lock_irqsave(¤t->pi_lock, flags); |
545 | plist_del(&waiter->list_entry, &lock->wait_list); | 499 | plist_del(&waiter->list_entry, &lock->wait_list); |
546 | waiter->task = NULL; | ||
547 | current->pi_blocked_on = NULL; | 500 | current->pi_blocked_on = NULL; |
548 | raw_spin_unlock_irqrestore(¤t->pi_lock, flags); | 501 | raw_spin_unlock_irqrestore(¤t->pi_lock, flags); |
549 | 502 | ||
550 | if (first && owner != current) { | 503 | if (!owner) |
504 | return; | ||
505 | |||
506 | if (first) { | ||
551 | 507 | ||
552 | raw_spin_lock_irqsave(&owner->pi_lock, flags); | 508 | raw_spin_lock_irqsave(&owner->pi_lock, flags); |
553 | 509 | ||
@@ -614,21 +570,19 @@ void rt_mutex_adjust_pi(struct task_struct *task) | |||
614 | * or TASK_UNINTERRUPTIBLE) | 570 | * or TASK_UNINTERRUPTIBLE) |
615 | * @timeout: the pre-initialized and started timer, or NULL for none | 571 | * @timeout: the pre-initialized and started timer, or NULL for none |
616 | * @waiter: the pre-initialized rt_mutex_waiter | 572 | * @waiter: the pre-initialized rt_mutex_waiter |
617 | * @detect_deadlock: passed to task_blocks_on_rt_mutex | ||
618 | * | 573 | * |
619 | * lock->wait_lock must be held by the caller. | 574 | * lock->wait_lock must be held by the caller. |
620 | */ | 575 | */ |
621 | static int __sched | 576 | static int __sched |
622 | __rt_mutex_slowlock(struct rt_mutex *lock, int state, | 577 | __rt_mutex_slowlock(struct rt_mutex *lock, int state, |
623 | struct hrtimer_sleeper *timeout, | 578 | struct hrtimer_sleeper *timeout, |
624 | struct rt_mutex_waiter *waiter, | 579 | struct rt_mutex_waiter *waiter) |
625 | int detect_deadlock) | ||
626 | { | 580 | { |
627 | int ret = 0; | 581 | int ret = 0; |
628 | 582 | ||
629 | for (;;) { | 583 | for (;;) { |
630 | /* Try to acquire the lock: */ | 584 | /* Try to acquire the lock: */ |
631 | if (try_to_take_rt_mutex(lock)) | 585 | if (try_to_take_rt_mutex(lock, current, waiter)) |
632 | break; | 586 | break; |
633 | 587 | ||
634 | /* | 588 | /* |
@@ -645,39 +599,11 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, | |||
645 | break; | 599 | break; |
646 | } | 600 | } |
647 | 601 | ||
648 | /* | ||
649 | * waiter->task is NULL the first time we come here and | ||
650 | * when we have been woken up by the previous owner | ||
651 | * but the lock got stolen by a higher prio task. | ||
652 | */ | ||
653 | if (!waiter->task) { | ||
654 | ret = task_blocks_on_rt_mutex(lock, waiter, current, | ||
655 | detect_deadlock); | ||
656 | /* | ||
657 | * If we got woken up by the owner then start loop | ||
658 | * all over without going into schedule to try | ||
659 | * to get the lock now: | ||
660 | */ | ||
661 | if (unlikely(!waiter->task)) { | ||
662 | /* | ||
663 | * Reset the return value. We might | ||
664 | * have returned with -EDEADLK and the | ||
665 | * owner released the lock while we | ||
666 | * were walking the pi chain. | ||
667 | */ | ||
668 | ret = 0; | ||
669 | continue; | ||
670 | } | ||
671 | if (unlikely(ret)) | ||
672 | break; | ||
673 | } | ||
674 | |||
675 | raw_spin_unlock(&lock->wait_lock); | 602 | raw_spin_unlock(&lock->wait_lock); |
676 | 603 | ||
677 | debug_rt_mutex_print_deadlock(waiter); | 604 | debug_rt_mutex_print_deadlock(waiter); |
678 | 605 | ||
679 | if (waiter->task) | 606 | schedule_rt_mutex(lock); |
680 | schedule_rt_mutex(lock); | ||
681 | 607 | ||
682 | raw_spin_lock(&lock->wait_lock); | 608 | raw_spin_lock(&lock->wait_lock); |
683 | set_current_state(state); | 609 | set_current_state(state); |
@@ -698,12 +624,11 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, | |||
698 | int ret = 0; | 624 | int ret = 0; |
699 | 625 | ||
700 | debug_rt_mutex_init_waiter(&waiter); | 626 | debug_rt_mutex_init_waiter(&waiter); |
701 | waiter.task = NULL; | ||
702 | 627 | ||
703 | raw_spin_lock(&lock->wait_lock); | 628 | raw_spin_lock(&lock->wait_lock); |
704 | 629 | ||
705 | /* Try to acquire the lock again: */ | 630 | /* Try to acquire the lock again: */ |
706 | if (try_to_take_rt_mutex(lock)) { | 631 | if (try_to_take_rt_mutex(lock, current, NULL)) { |
707 | raw_spin_unlock(&lock->wait_lock); | 632 | raw_spin_unlock(&lock->wait_lock); |
708 | return 0; | 633 | return 0; |
709 | } | 634 | } |
@@ -717,12 +642,14 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, | |||
717 | timeout->task = NULL; | 642 | timeout->task = NULL; |
718 | } | 643 | } |
719 | 644 | ||
720 | ret = __rt_mutex_slowlock(lock, state, timeout, &waiter, | 645 | ret = task_blocks_on_rt_mutex(lock, &waiter, current, detect_deadlock); |
721 | detect_deadlock); | 646 | |
647 | if (likely(!ret)) | ||
648 | ret = __rt_mutex_slowlock(lock, state, timeout, &waiter); | ||
722 | 649 | ||
723 | set_current_state(TASK_RUNNING); | 650 | set_current_state(TASK_RUNNING); |
724 | 651 | ||
725 | if (unlikely(waiter.task)) | 652 | if (unlikely(ret)) |
726 | remove_waiter(lock, &waiter); | 653 | remove_waiter(lock, &waiter); |
727 | 654 | ||
728 | /* | 655 | /* |
@@ -737,14 +664,6 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, | |||
737 | if (unlikely(timeout)) | 664 | if (unlikely(timeout)) |
738 | hrtimer_cancel(&timeout->timer); | 665 | hrtimer_cancel(&timeout->timer); |
739 | 666 | ||
740 | /* | ||
741 | * Readjust priority, when we did not get the lock. We might | ||
742 | * have been the pending owner and boosted. Since we did not | ||
743 | * take the lock, the PI boost has to go. | ||
744 | */ | ||
745 | if (unlikely(ret)) | ||
746 | rt_mutex_adjust_prio(current); | ||
747 | |||
748 | debug_rt_mutex_free_waiter(&waiter); | 667 | debug_rt_mutex_free_waiter(&waiter); |
749 | 668 | ||
750 | return ret; | 669 | return ret; |
@@ -762,7 +681,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lock) | |||
762 | 681 | ||
763 | if (likely(rt_mutex_owner(lock) != current)) { | 682 | if (likely(rt_mutex_owner(lock) != current)) { |
764 | 683 | ||
765 | ret = try_to_take_rt_mutex(lock); | 684 | ret = try_to_take_rt_mutex(lock, current, NULL); |
766 | /* | 685 | /* |
767 | * try_to_take_rt_mutex() sets the lock waiters | 686 | * try_to_take_rt_mutex() sets the lock waiters |
768 | * bit unconditionally. Clean this up. | 687 | * bit unconditionally. Clean this up. |
@@ -992,7 +911,7 @@ void rt_mutex_init_proxy_locked(struct rt_mutex *lock, | |||
992 | { | 911 | { |
993 | __rt_mutex_init(lock, NULL); | 912 | __rt_mutex_init(lock, NULL); |
994 | debug_rt_mutex_proxy_lock(lock, proxy_owner); | 913 | debug_rt_mutex_proxy_lock(lock, proxy_owner); |
995 | rt_mutex_set_owner(lock, proxy_owner, 0); | 914 | rt_mutex_set_owner(lock, proxy_owner); |
996 | rt_mutex_deadlock_account_lock(lock, proxy_owner); | 915 | rt_mutex_deadlock_account_lock(lock, proxy_owner); |
997 | } | 916 | } |
998 | 917 | ||
@@ -1008,7 +927,7 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock, | |||
1008 | struct task_struct *proxy_owner) | 927 | struct task_struct *proxy_owner) |
1009 | { | 928 | { |
1010 | debug_rt_mutex_proxy_unlock(lock); | 929 | debug_rt_mutex_proxy_unlock(lock); |
1011 | rt_mutex_set_owner(lock, NULL, 0); | 930 | rt_mutex_set_owner(lock, NULL); |
1012 | rt_mutex_deadlock_account_unlock(proxy_owner); | 931 | rt_mutex_deadlock_account_unlock(proxy_owner); |
1013 | } | 932 | } |
1014 | 933 | ||
@@ -1034,20 +953,14 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, | |||
1034 | 953 | ||
1035 | raw_spin_lock(&lock->wait_lock); | 954 | raw_spin_lock(&lock->wait_lock); |
1036 | 955 | ||
1037 | mark_rt_mutex_waiters(lock); | 956 | if (try_to_take_rt_mutex(lock, task, NULL)) { |
1038 | |||
1039 | if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) { | ||
1040 | /* We got the lock for task. */ | ||
1041 | debug_rt_mutex_lock(lock); | ||
1042 | rt_mutex_set_owner(lock, task, 0); | ||
1043 | raw_spin_unlock(&lock->wait_lock); | 957 | raw_spin_unlock(&lock->wait_lock); |
1044 | rt_mutex_deadlock_account_lock(lock, task); | ||
1045 | return 1; | 958 | return 1; |
1046 | } | 959 | } |
1047 | 960 | ||
1048 | ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock); | 961 | ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock); |
1049 | 962 | ||
1050 | if (ret && !waiter->task) { | 963 | if (ret && !rt_mutex_owner(lock)) { |
1051 | /* | 964 | /* |
1052 | * Reset the return value. We might have | 965 | * Reset the return value. We might have |
1053 | * returned with -EDEADLK and the owner | 966 | * returned with -EDEADLK and the owner |
@@ -1056,6 +969,10 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, | |||
1056 | */ | 969 | */ |
1057 | ret = 0; | 970 | ret = 0; |
1058 | } | 971 | } |
972 | |||
973 | if (unlikely(ret)) | ||
974 | remove_waiter(lock, waiter); | ||
975 | |||
1059 | raw_spin_unlock(&lock->wait_lock); | 976 | raw_spin_unlock(&lock->wait_lock); |
1060 | 977 | ||
1061 | debug_rt_mutex_print_deadlock(waiter); | 978 | debug_rt_mutex_print_deadlock(waiter); |
@@ -1110,12 +1027,11 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, | |||
1110 | 1027 | ||
1111 | set_current_state(TASK_INTERRUPTIBLE); | 1028 | set_current_state(TASK_INTERRUPTIBLE); |
1112 | 1029 | ||
1113 | ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, | 1030 | ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter); |
1114 | detect_deadlock); | ||
1115 | 1031 | ||
1116 | set_current_state(TASK_RUNNING); | 1032 | set_current_state(TASK_RUNNING); |
1117 | 1033 | ||
1118 | if (unlikely(waiter->task)) | 1034 | if (unlikely(ret)) |
1119 | remove_waiter(lock, waiter); | 1035 | remove_waiter(lock, waiter); |
1120 | 1036 | ||
1121 | /* | 1037 | /* |
@@ -1126,13 +1042,5 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, | |||
1126 | 1042 | ||
1127 | raw_spin_unlock(&lock->wait_lock); | 1043 | raw_spin_unlock(&lock->wait_lock); |
1128 | 1044 | ||
1129 | /* | ||
1130 | * Readjust priority, when we did not get the lock. We might have been | ||
1131 | * the pending owner and boosted. Since we did not take the lock, the | ||
1132 | * PI boost has to go. | ||
1133 | */ | ||
1134 | if (unlikely(ret)) | ||
1135 | rt_mutex_adjust_prio(current); | ||
1136 | |||
1137 | return ret; | 1045 | return ret; |
1138 | } | 1046 | } |
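
The rt_mutex_slowlock() rework above changes the ordering: the task is enqueued as a waiter via task_blocks_on_rt_mutex() first, __rt_mutex_slowlock() then keeps retrying try_to_take_rt_mutex() until it succeeds, and the waiter is removed only when the call ultimately fails. The userspace sketch below mirrors only that enqueue-then-retry shape; struct simple_lock, try_to_take(), slowlock() and the condition variable are invented for the example, and none of the priority-inheritance machinery is modelled.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct simple_lock {
    pthread_mutex_t wait_lock;  /* plays the role of lock->wait_lock */
    pthread_cond_t  wait_cond;
    bool            owned;
    int             nr_waiters;
};

/* Caller holds wait_lock.  Succeeds only when the lock is free. */
static bool try_to_take(struct simple_lock *l)
{
    if (l->owned)
        return false;
    l->owned = true;
    return true;
}

/* Enqueue as a waiter first, then keep retrying the acquisition. */
static void slowlock(struct simple_lock *l)
{
    pthread_mutex_lock(&l->wait_lock);
    if (!try_to_take(l)) {
        l->nr_waiters++;                /* task_blocks_on_rt_mutex() analogue */
        while (!try_to_take(l))
            pthread_cond_wait(&l->wait_cond, &l->wait_lock);
        l->nr_waiters--;                /* no longer a waiter once we own it */
    }
    pthread_mutex_unlock(&l->wait_lock);
}

static void simple_unlock(struct simple_lock *l)
{
    pthread_mutex_lock(&l->wait_lock);
    l->owned = false;
    if (l->nr_waiters)
        pthread_cond_signal(&l->wait_cond);
    pthread_mutex_unlock(&l->wait_lock);
}

int main(void)
{
    struct simple_lock l = { .owned = false, .nr_waiters = 0 };

    pthread_mutex_init(&l.wait_lock, NULL);
    pthread_cond_init(&l.wait_cond, NULL);
    slowlock(&l);
    printf("owned=%d waiters=%d\n", l.owned, l.nr_waiters);
    simple_unlock(&l);
    return 0;
}

Build with -lpthread; the point is merely that acquisition and waiter bookkeeping both happen under one wait_lock, as in the patched slow path.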
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h index 97a2f81866af..53a66c85261b 100644 --- a/kernel/rtmutex_common.h +++ b/kernel/rtmutex_common.h | |||
@@ -91,9 +91,8 @@ task_top_pi_waiter(struct task_struct *p) | |||
91 | /* | 91 | /* |
92 | * lock->owner state tracking: | 92 | * lock->owner state tracking: |
93 | */ | 93 | */ |
94 | #define RT_MUTEX_OWNER_PENDING 1UL | 94 | #define RT_MUTEX_HAS_WAITERS 1UL |
95 | #define RT_MUTEX_HAS_WAITERS 2UL | 95 | #define RT_MUTEX_OWNER_MASKALL 1UL |
96 | #define RT_MUTEX_OWNER_MASKALL 3UL | ||
97 | 96 | ||
98 | static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) | 97 | static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) |
99 | { | 98 | { |
@@ -101,17 +100,6 @@ static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) | |||
101 | ((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL); | 100 | ((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL); |
102 | } | 101 | } |
103 | 102 | ||
104 | static inline struct task_struct *rt_mutex_real_owner(struct rt_mutex *lock) | ||
105 | { | ||
106 | return (struct task_struct *) | ||
107 | ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS); | ||
108 | } | ||
109 | |||
110 | static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock) | ||
111 | { | ||
112 | return (unsigned long)lock->owner & RT_MUTEX_OWNER_PENDING; | ||
113 | } | ||
114 | |||
115 | /* | 103 | /* |
116 | * PI-futex support (proxy locking functions, etc.): | 104 | * PI-futex support (proxy locking functions, etc.): |
117 | */ | 105 | */ |
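
After this cleanup only the RT_MUTEX_HAS_WAITERS bit is folded into lock->owner, and rt_mutex_owner() masks it back out. Below is a minimal, non-kernel sketch of that pointer/flag packing; struct task, pack_owner() and main() are made up for the illustration, and it assumes the owner object is at least 2-byte aligned so bit 0 of its address is always free.

#include <stdint.h>
#include <stdio.h>

#define HAS_WAITERS   1UL
#define OWNER_MASKALL 1UL

struct task {
    const char *comm;
};

/* Fold the waiters flag into the low bit of the (aligned) owner pointer. */
static uintptr_t pack_owner(struct task *owner, int has_waiters)
{
    return (uintptr_t)owner | (has_waiters ? HAS_WAITERS : 0);
}

static struct task *owner_of(uintptr_t word)
{
    return (struct task *)(word & ~OWNER_MASKALL);
}

static int has_waiters(uintptr_t word)
{
    return word & HAS_WAITERS;
}

int main(void)
{
    static struct task t = { "blocked-on-me" };
    uintptr_t word = pack_owner(&t, 1);

    printf("owner=%s waiters=%d\n", owner_of(word)->comm, has_waiters(word));
    return 0;
}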
diff --git a/kernel/sched.c b/kernel/sched.c index 04949089e760..c8e40b7005c0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -278,14 +278,12 @@ struct task_group { | |||
278 | #endif | 278 | #endif |
279 | }; | 279 | }; |
280 | 280 | ||
281 | #define root_task_group init_task_group | ||
282 | |||
283 | /* task_group_lock serializes the addition/removal of task groups */ | 281 | /* task_group_lock serializes the addition/removal of task groups */ |
284 | static DEFINE_SPINLOCK(task_group_lock); | 282 | static DEFINE_SPINLOCK(task_group_lock); |
285 | 283 | ||
286 | #ifdef CONFIG_FAIR_GROUP_SCHED | 284 | #ifdef CONFIG_FAIR_GROUP_SCHED |
287 | 285 | ||
288 | # define INIT_TASK_GROUP_LOAD NICE_0_LOAD | 286 | # define ROOT_TASK_GROUP_LOAD NICE_0_LOAD |
289 | 287 | ||
290 | /* | 288 | /* |
291 | * A weight of 0 or 1 can cause arithmetics problems. | 289 | * A weight of 0 or 1 can cause arithmetics problems. |
@@ -298,13 +296,13 @@ static DEFINE_SPINLOCK(task_group_lock); | |||
298 | #define MIN_SHARES 2 | 296 | #define MIN_SHARES 2 |
299 | #define MAX_SHARES (1UL << 18) | 297 | #define MAX_SHARES (1UL << 18) |
300 | 298 | ||
301 | static int init_task_group_load = INIT_TASK_GROUP_LOAD; | 299 | static int root_task_group_load = ROOT_TASK_GROUP_LOAD; |
302 | #endif | 300 | #endif |
303 | 301 | ||
304 | /* Default task group. | 302 | /* Default task group. |
305 | * Every task in system belong to this group at bootup. | 303 | * Every task in system belong to this group at bootup. |
306 | */ | 304 | */ |
307 | struct task_group init_task_group; | 305 | struct task_group root_task_group; |
308 | 306 | ||
309 | #endif /* CONFIG_CGROUP_SCHED */ | 307 | #endif /* CONFIG_CGROUP_SCHED */ |
310 | 308 | ||
@@ -326,7 +324,7 @@ struct cfs_rq { | |||
326 | * 'curr' points to currently running entity on this cfs_rq. | 324 | * 'curr' points to currently running entity on this cfs_rq. |
327 | * It is set to NULL otherwise (i.e when none are currently running). | 325 | * It is set to NULL otherwise (i.e when none are currently running). |
328 | */ | 326 | */ |
329 | struct sched_entity *curr, *next, *last; | 327 | struct sched_entity *curr, *next, *last, *skip; |
330 | 328 | ||
331 | unsigned int nr_spread_over; | 329 | unsigned int nr_spread_over; |
332 | 330 | ||
@@ -555,9 +553,6 @@ struct rq { | |||
555 | /* try_to_wake_up() stats */ | 553 | /* try_to_wake_up() stats */ |
556 | unsigned int ttwu_count; | 554 | unsigned int ttwu_count; |
557 | unsigned int ttwu_local; | 555 | unsigned int ttwu_local; |
558 | |||
559 | /* BKL stats */ | ||
560 | unsigned int bkl_count; | ||
561 | #endif | 556 | #endif |
562 | }; | 557 | }; |
563 | 558 | ||
@@ -743,7 +738,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, | |||
743 | buf[cnt] = 0; | 738 | buf[cnt] = 0; |
744 | cmp = strstrip(buf); | 739 | cmp = strstrip(buf); |
745 | 740 | ||
746 | if (strncmp(buf, "NO_", 3) == 0) { | 741 | if (strncmp(cmp, "NO_", 3) == 0) { |
747 | neg = 1; | 742 | neg = 1; |
748 | cmp += 3; | 743 | cmp += 3; |
749 | } | 744 | } |
@@ -1688,6 +1683,39 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2) | |||
1688 | __release(rq2->lock); | 1683 | __release(rq2->lock); |
1689 | } | 1684 | } |
1690 | 1685 | ||
1686 | #else /* CONFIG_SMP */ | ||
1687 | |||
1688 | /* | ||
1689 | * double_rq_lock - safely lock two runqueues | ||
1690 | * | ||
1691 | * Note this does not disable interrupts like task_rq_lock, | ||
1692 | * you need to do so manually before calling. | ||
1693 | */ | ||
1694 | static void double_rq_lock(struct rq *rq1, struct rq *rq2) | ||
1695 | __acquires(rq1->lock) | ||
1696 | __acquires(rq2->lock) | ||
1697 | { | ||
1698 | BUG_ON(!irqs_disabled()); | ||
1699 | BUG_ON(rq1 != rq2); | ||
1700 | raw_spin_lock(&rq1->lock); | ||
1701 | __acquire(rq2->lock); /* Fake it out ;) */ | ||
1702 | } | ||
1703 | |||
1704 | /* | ||
1705 | * double_rq_unlock - safely unlock two runqueues | ||
1706 | * | ||
1707 | * Note this does not restore interrupts like task_rq_unlock, | ||
1708 | * you need to do so manually after calling. | ||
1709 | */ | ||
1710 | static void double_rq_unlock(struct rq *rq1, struct rq *rq2) | ||
1711 | __releases(rq1->lock) | ||
1712 | __releases(rq2->lock) | ||
1713 | { | ||
1714 | BUG_ON(rq1 != rq2); | ||
1715 | raw_spin_unlock(&rq1->lock); | ||
1716 | __release(rq2->lock); | ||
1717 | } | ||
1718 | |||
1691 | #endif | 1719 | #endif |
1692 | 1720 | ||
1693 | static void calc_load_account_idle(struct rq *this_rq); | 1721 | static void calc_load_account_idle(struct rq *this_rq); |
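
The new !CONFIG_SMP variants take rq1->lock exactly once because on UP both arguments always name the same runqueue. The sketch below shows the same idea in ordinary pthreads terms — lock once when the two queues are identical, otherwise in a fixed address order to avoid an ABBA deadlock; struct rq here is a stand-in with just a mutex and a counter, not the scheduler's runqueue.

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

struct rq {
    pthread_mutex_t lock;
    int nr_running;
};

static void double_rq_lock(struct rq *rq1, struct rq *rq2)
{
    if (rq1 == rq2) {
        pthread_mutex_lock(&rq1->lock);         /* the UP case */
    } else if ((uintptr_t)rq1 < (uintptr_t)rq2) {
        pthread_mutex_lock(&rq1->lock);
        pthread_mutex_lock(&rq2->lock);
    } else {
        pthread_mutex_lock(&rq2->lock);
        pthread_mutex_lock(&rq1->lock);
    }
}

static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
{
    pthread_mutex_unlock(&rq1->lock);
    if (rq1 != rq2)
        pthread_mutex_unlock(&rq2->lock);
}

int main(void)
{
    struct rq a;

    pthread_mutex_init(&a.lock, NULL);
    a.nr_running = 0;

    double_rq_lock(&a, &a);     /* both arguments are the same queue */
    a.nr_running++;
    double_rq_unlock(&a, &a);
    printf("nr_running=%d\n", a.nr_running);
    return 0;
}

As in the kernel version, the unlock side only releases the second lock when it really is a second lock.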
@@ -1882,7 +1910,7 @@ void account_system_vtime(struct task_struct *curr) | |||
1882 | */ | 1910 | */ |
1883 | if (hardirq_count()) | 1911 | if (hardirq_count()) |
1884 | __this_cpu_add(cpu_hardirq_time, delta); | 1912 | __this_cpu_add(cpu_hardirq_time, delta); |
1885 | else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD)) | 1913 | else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) |
1886 | __this_cpu_add(cpu_softirq_time, delta); | 1914 | __this_cpu_add(cpu_softirq_time, delta); |
1887 | 1915 | ||
1888 | irq_time_write_end(); | 1916 | irq_time_write_end(); |
@@ -1922,8 +1950,40 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) | |||
1922 | sched_rt_avg_update(rq, irq_delta); | 1950 | sched_rt_avg_update(rq, irq_delta); |
1923 | } | 1951 | } |
1924 | 1952 | ||
1953 | static int irqtime_account_hi_update(void) | ||
1954 | { | ||
1955 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | ||
1956 | unsigned long flags; | ||
1957 | u64 latest_ns; | ||
1958 | int ret = 0; | ||
1959 | |||
1960 | local_irq_save(flags); | ||
1961 | latest_ns = this_cpu_read(cpu_hardirq_time); | ||
1962 | if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq)) | ||
1963 | ret = 1; | ||
1964 | local_irq_restore(flags); | ||
1965 | return ret; | ||
1966 | } | ||
1967 | |||
1968 | static int irqtime_account_si_update(void) | ||
1969 | { | ||
1970 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | ||
1971 | unsigned long flags; | ||
1972 | u64 latest_ns; | ||
1973 | int ret = 0; | ||
1974 | |||
1975 | local_irq_save(flags); | ||
1976 | latest_ns = this_cpu_read(cpu_softirq_time); | ||
1977 | if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq)) | ||
1978 | ret = 1; | ||
1979 | local_irq_restore(flags); | ||
1980 | return ret; | ||
1981 | } | ||
1982 | |||
1925 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ | 1983 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ |
1926 | 1984 | ||
1985 | #define sched_clock_irqtime (0) | ||
1986 | |||
1927 | static void update_rq_clock_task(struct rq *rq, s64 delta) | 1987 | static void update_rq_clock_task(struct rq *rq, s64 delta) |
1928 | { | 1988 | { |
1929 | rq->clock_task += delta; | 1989 | rq->clock_task += delta; |
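
irqtime_account_hi_update() and irqtime_account_si_update() only ask whether the nanoseconds accumulated in cpu_hardirq_time / cpu_softirq_time have grown past what cpustat has already been charged, i.e. whether the current tick should be billed as irq or softirq time. A rough standalone model of that comparison, assuming a fixed tick length; NSEC_PER_TICK, hi_update() and the sample numbers are inventions of the sketch, and the real code works in cputime64 units rather than whole ticks.

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_TICK 1000000ULL        /* pretend HZ=1000 for the sketch */

/*
 * Return 1 when the hardirq time measured so far exceeds what has already
 * been folded into the irq bucket, so the current tick should be charged
 * there instead of to user/system time.
 */
static int hi_update(uint64_t hardirq_time_ns, uint64_t ticks_already_charged)
{
    return hardirq_time_ns / NSEC_PER_TICK > ticks_already_charged;
}

int main(void)
{
    printf("%d\n", hi_update(2500000, 1));      /* 2.5 ticks measured, 1 charged -> 1 */
    printf("%d\n", hi_update(2500000, 3));      /* already fully charged -> 0 */
    return 0;
}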
@@ -2027,14 +2087,14 @@ inline int task_curr(const struct task_struct *p) | |||
2027 | 2087 | ||
2028 | static inline void check_class_changed(struct rq *rq, struct task_struct *p, | 2088 | static inline void check_class_changed(struct rq *rq, struct task_struct *p, |
2029 | const struct sched_class *prev_class, | 2089 | const struct sched_class *prev_class, |
2030 | int oldprio, int running) | 2090 | int oldprio) |
2031 | { | 2091 | { |
2032 | if (prev_class != p->sched_class) { | 2092 | if (prev_class != p->sched_class) { |
2033 | if (prev_class->switched_from) | 2093 | if (prev_class->switched_from) |
2034 | prev_class->switched_from(rq, p, running); | 2094 | prev_class->switched_from(rq, p); |
2035 | p->sched_class->switched_to(rq, p, running); | 2095 | p->sched_class->switched_to(rq, p); |
2036 | } else | 2096 | } else if (oldprio != p->prio) |
2037 | p->sched_class->prio_changed(rq, p, oldprio, running); | 2097 | p->sched_class->prio_changed(rq, p, oldprio); |
2038 | } | 2098 | } |
2039 | 2099 | ||
2040 | static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) | 2100 | static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) |
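
check_class_changed() now invokes prio_changed() only when the class stayed the same and the priority actually moved, and the switched_from()/switched_to() callbacks lose their 'running' argument. The toy dispatch table below reproduces just that decision; the two printf-backed classes and the trimmed struct task are fabricated for the demo.

#include <stdio.h>

struct task;

struct sched_class {
    const char *name;
    void (*switched_from)(struct task *p);
    void (*switched_to)(struct task *p);
    void (*prio_changed)(struct task *p, int oldprio);
};

struct task {
    const struct sched_class *sched_class;
    int prio;
};

static void note_from(struct task *p)
{
    (void)p;
    printf("switched_from old class\n");
}

static void note_to(struct task *p)
{
    printf("switched_to %s\n", p->sched_class->name);
}

static void note_prio(struct task *p, int oldprio)
{
    printf("prio_changed %d -> %d\n", oldprio, p->prio);
}

static const struct sched_class fair_class = { "fair", note_from, note_to, note_prio };
static const struct sched_class rt_class   = { "rt",   note_from, note_to, note_prio };

static void check_class_changed(struct task *p,
                                const struct sched_class *prev_class,
                                int oldprio)
{
    if (prev_class != p->sched_class) {
        if (prev_class->switched_from)
            prev_class->switched_from(p);
        p->sched_class->switched_to(p);
    } else if (oldprio != p->prio) {
        p->sched_class->prio_changed(p, oldprio);
    }
}

int main(void)
{
    struct task p = { &fair_class, 120 };

    p.prio = 100;
    check_class_changed(&p, &fair_class, 120);  /* same class, prio moved */
    check_class_changed(&p, &rt_class, 100);    /* class changed */
    return 0;
}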
@@ -2226,7 +2286,10 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) | |||
2226 | * yield - it could be a while. | 2286 | * yield - it could be a while. |
2227 | */ | 2287 | */ |
2228 | if (unlikely(on_rq)) { | 2288 | if (unlikely(on_rq)) { |
2229 | schedule_timeout_uninterruptible(1); | 2289 | ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ); |
2290 | |||
2291 | set_current_state(TASK_UNINTERRUPTIBLE); | ||
2292 | schedule_hrtimeout(&to, HRTIMER_MODE_REL); | ||
2230 | continue; | 2293 | continue; |
2231 | } | 2294 | } |
2232 | 2295 | ||
@@ -2267,27 +2330,6 @@ void kick_process(struct task_struct *p) | |||
2267 | EXPORT_SYMBOL_GPL(kick_process); | 2330 | EXPORT_SYMBOL_GPL(kick_process); |
2268 | #endif /* CONFIG_SMP */ | 2331 | #endif /* CONFIG_SMP */ |
2269 | 2332 | ||
2270 | /** | ||
2271 | * task_oncpu_function_call - call a function on the cpu on which a task runs | ||
2272 | * @p: the task to evaluate | ||
2273 | * @func: the function to be called | ||
2274 | * @info: the function call argument | ||
2275 | * | ||
2276 | * Calls the function @func when the task is currently running. This might | ||
2277 | * be on the current CPU, which just calls the function directly | ||
2278 | */ | ||
2279 | void task_oncpu_function_call(struct task_struct *p, | ||
2280 | void (*func) (void *info), void *info) | ||
2281 | { | ||
2282 | int cpu; | ||
2283 | |||
2284 | preempt_disable(); | ||
2285 | cpu = task_cpu(p); | ||
2286 | if (task_curr(p)) | ||
2287 | smp_call_function_single(cpu, func, info, 1); | ||
2288 | preempt_enable(); | ||
2289 | } | ||
2290 | |||
2291 | #ifdef CONFIG_SMP | 2333 | #ifdef CONFIG_SMP |
2292 | /* | 2334 | /* |
2293 | * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held. | 2335 | * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held. |
@@ -2507,7 +2549,7 @@ out: | |||
2507 | * try_to_wake_up_local - try to wake up a local task with rq lock held | 2549 | * try_to_wake_up_local - try to wake up a local task with rq lock held |
2508 | * @p: the thread to be awakened | 2550 | * @p: the thread to be awakened |
2509 | * | 2551 | * |
2510 | * Put @p on the run-queue if it's not alredy there. The caller must | 2552 | * Put @p on the run-queue if it's not already there. The caller must |
2511 | * ensure that this_rq() is locked, @p is bound to this_rq() and not | 2553 | * ensure that this_rq() is locked, @p is bound to this_rq() and not |
2512 | * the current task. this_rq() stays locked over invocation. | 2554 | * the current task. this_rq() stays locked over invocation. |
2513 | */ | 2555 | */ |
@@ -2568,6 +2610,7 @@ static void __sched_fork(struct task_struct *p) | |||
2568 | p->se.sum_exec_runtime = 0; | 2610 | p->se.sum_exec_runtime = 0; |
2569 | p->se.prev_sum_exec_runtime = 0; | 2611 | p->se.prev_sum_exec_runtime = 0; |
2570 | p->se.nr_migrations = 0; | 2612 | p->se.nr_migrations = 0; |
2613 | p->se.vruntime = 0; | ||
2571 | 2614 | ||
2572 | #ifdef CONFIG_SCHEDSTATS | 2615 | #ifdef CONFIG_SCHEDSTATS |
2573 | memset(&p->se.statistics, 0, sizeof(p->se.statistics)); | 2616 | memset(&p->se.statistics, 0, sizeof(p->se.statistics)); |
@@ -2778,9 +2821,12 @@ static inline void | |||
2778 | prepare_task_switch(struct rq *rq, struct task_struct *prev, | 2821 | prepare_task_switch(struct rq *rq, struct task_struct *prev, |
2779 | struct task_struct *next) | 2822 | struct task_struct *next) |
2780 | { | 2823 | { |
2824 | sched_info_switch(prev, next); | ||
2825 | perf_event_task_sched_out(prev, next); | ||
2781 | fire_sched_out_preempt_notifiers(prev, next); | 2826 | fire_sched_out_preempt_notifiers(prev, next); |
2782 | prepare_lock_switch(rq, next); | 2827 | prepare_lock_switch(rq, next); |
2783 | prepare_arch_switch(next); | 2828 | prepare_arch_switch(next); |
2829 | trace_sched_switch(prev, next); | ||
2784 | } | 2830 | } |
2785 | 2831 | ||
2786 | /** | 2832 | /** |
@@ -2913,7 +2959,7 @@ context_switch(struct rq *rq, struct task_struct *prev, | |||
2913 | struct mm_struct *mm, *oldmm; | 2959 | struct mm_struct *mm, *oldmm; |
2914 | 2960 | ||
2915 | prepare_task_switch(rq, prev, next); | 2961 | prepare_task_switch(rq, prev, next); |
2916 | trace_sched_switch(prev, next); | 2962 | |
2917 | mm = next->mm; | 2963 | mm = next->mm; |
2918 | oldmm = prev->active_mm; | 2964 | oldmm = prev->active_mm; |
2919 | /* | 2965 | /* |
@@ -3570,6 +3616,32 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime, | |||
3570 | } | 3616 | } |
3571 | 3617 | ||
3572 | /* | 3618 | /* |
3619 | * Account system cpu time to a process and desired cpustat field | ||
3620 | * @p: the process that the cpu time gets accounted to | ||
3621 | * @cputime: the cpu time spent in kernel space since the last update | ||
3622 | * @cputime_scaled: cputime scaled by cpu frequency | ||
3623 | * @target_cputime64: pointer to cpustat field that has to be updated | ||
3624 | */ | ||
3625 | static inline | ||
3626 | void __account_system_time(struct task_struct *p, cputime_t cputime, | ||
3627 | cputime_t cputime_scaled, cputime64_t *target_cputime64) | ||
3628 | { | ||
3629 | cputime64_t tmp = cputime_to_cputime64(cputime); | ||
3630 | |||
3631 | /* Add system time to process. */ | ||
3632 | p->stime = cputime_add(p->stime, cputime); | ||
3633 | p->stimescaled = cputime_add(p->stimescaled, cputime_scaled); | ||
3634 | account_group_system_time(p, cputime); | ||
3635 | |||
3636 | /* Add system time to cpustat. */ | ||
3637 | *target_cputime64 = cputime64_add(*target_cputime64, tmp); | ||
3638 | cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime); | ||
3639 | |||
3640 | /* Account for system time used */ | ||
3641 | acct_update_integrals(p); | ||
3642 | } | ||
3643 | |||
3644 | /* | ||
3573 | * Account system cpu time to a process. | 3645 | * Account system cpu time to a process. |
3574 | * @p: the process that the cpu time gets accounted to | 3646 | * @p: the process that the cpu time gets accounted to |
3575 | * @hardirq_offset: the offset to subtract from hardirq_count() | 3647 | * @hardirq_offset: the offset to subtract from hardirq_count() |
@@ -3580,36 +3652,26 @@ void account_system_time(struct task_struct *p, int hardirq_offset, | |||
3580 | cputime_t cputime, cputime_t cputime_scaled) | 3652 | cputime_t cputime, cputime_t cputime_scaled) |
3581 | { | 3653 | { |
3582 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 3654 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; |
3583 | cputime64_t tmp; | 3655 | cputime64_t *target_cputime64; |
3584 | 3656 | ||
3585 | if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { | 3657 | if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { |
3586 | account_guest_time(p, cputime, cputime_scaled); | 3658 | account_guest_time(p, cputime, cputime_scaled); |
3587 | return; | 3659 | return; |
3588 | } | 3660 | } |
3589 | 3661 | ||
3590 | /* Add system time to process. */ | ||
3591 | p->stime = cputime_add(p->stime, cputime); | ||
3592 | p->stimescaled = cputime_add(p->stimescaled, cputime_scaled); | ||
3593 | account_group_system_time(p, cputime); | ||
3594 | |||
3595 | /* Add system time to cpustat. */ | ||
3596 | tmp = cputime_to_cputime64(cputime); | ||
3597 | if (hardirq_count() - hardirq_offset) | 3662 | if (hardirq_count() - hardirq_offset) |
3598 | cpustat->irq = cputime64_add(cpustat->irq, tmp); | 3663 | target_cputime64 = &cpustat->irq; |
3599 | else if (in_serving_softirq()) | 3664 | else if (in_serving_softirq()) |
3600 | cpustat->softirq = cputime64_add(cpustat->softirq, tmp); | 3665 | target_cputime64 = &cpustat->softirq; |
3601 | else | 3666 | else |
3602 | cpustat->system = cputime64_add(cpustat->system, tmp); | 3667 | target_cputime64 = &cpustat->system; |
3603 | 3668 | ||
3604 | cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime); | 3669 | __account_system_time(p, cputime, cputime_scaled, target_cputime64); |
3605 | |||
3606 | /* Account for system time used */ | ||
3607 | acct_update_integrals(p); | ||
3608 | } | 3670 | } |
3609 | 3671 | ||
3610 | /* | 3672 | /* |
3611 | * Account for involuntary wait time. | 3673 | * Account for involuntary wait time. |
3612 | * @steal: the cpu time spent in involuntary wait | 3674 | * @cputime: the cpu time spent in involuntary wait |
3613 | */ | 3675 | */ |
3614 | void account_steal_time(cputime_t cputime) | 3676 | void account_steal_time(cputime_t cputime) |
3615 | { | 3677 | { |
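
The refactor above collapses the three cpustat branches into the choice of a target pointer that is handed to the shared __account_system_time() helper. A compact illustration of that pattern follows; struct cpu_usage, charge() and account_system() are illustrative names, not the kernel's structures.

#include <stdint.h>
#include <stdio.h>

struct cpu_usage {
    uint64_t system, irq, softirq;
};

/* The shared helper: everything that used to be duplicated per branch. */
static void charge(uint64_t *bucket, uint64_t delta)
{
    *bucket += delta;
}

static void account_system(struct cpu_usage *stat, int in_hardirq,
                           int in_softirq, uint64_t delta)
{
    uint64_t *target;

    if (in_hardirq)
        target = &stat->irq;
    else if (in_softirq)
        target = &stat->softirq;
    else
        target = &stat->system;

    charge(target, delta);
}

int main(void)
{
    struct cpu_usage stat = { 0, 0, 0 };

    account_system(&stat, 1, 0, 10);
    account_system(&stat, 0, 0, 5);
    printf("irq=%llu system=%llu\n",
           (unsigned long long)stat.irq, (unsigned long long)stat.system);
    return 0;
}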
@@ -3637,6 +3699,73 @@ void account_idle_time(cputime_t cputime) | |||
3637 | 3699 | ||
3638 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING | 3700 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING |
3639 | 3701 | ||
3702 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
3703 | /* | ||
3704 | * Account a tick to a process and cpustat | ||
3705 | * @p: the process that the cpu time gets accounted to | ||
3706 | * @user_tick: is the tick from userspace | ||
3707 | * @rq: the pointer to rq | ||
3708 | * | ||
3709 | * Tick demultiplexing follows the order | ||
3710 | * - pending hardirq update | ||
3711 | * - pending softirq update | ||
3712 | * - user_time | ||
3713 | * - idle_time | ||
3714 | * - system time | ||
3715 | * - check for guest_time | ||
3716 | * - else account as system_time | ||
3717 | * | ||
3718 | * Check for hardirq is done both for system and user time as there is | ||
3719 | * no timer going off while we are on hardirq and hence we may never get an | ||
3720 | * opportunity to update it solely in system time. | ||
3721 | * p->stime and friends are only updated on system time and not on irq | ||
3722 | * softirq as those do not count in task exec_runtime any more. | ||
3723 | */ | ||
3724 | static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | ||
3725 | struct rq *rq) | ||
3726 | { | ||
3727 | cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); | ||
3728 | cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy); | ||
3729 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | ||
3730 | |||
3731 | if (irqtime_account_hi_update()) { | ||
3732 | cpustat->irq = cputime64_add(cpustat->irq, tmp); | ||
3733 | } else if (irqtime_account_si_update()) { | ||
3734 | cpustat->softirq = cputime64_add(cpustat->softirq, tmp); | ||
3735 | } else if (this_cpu_ksoftirqd() == p) { | ||
3736 | /* | ||
3737 | * ksoftirqd time does not get accounted in cpu_softirq_time. | ||
3738 | * So, we have to handle it separately here. | ||
3739 | * Also, p->stime needs to be updated for ksoftirqd. | ||
3740 | */ | ||
3741 | __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, | ||
3742 | &cpustat->softirq); | ||
3743 | } else if (user_tick) { | ||
3744 | account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); | ||
3745 | } else if (p == rq->idle) { | ||
3746 | account_idle_time(cputime_one_jiffy); | ||
3747 | } else if (p->flags & PF_VCPU) { /* System time or guest time */ | ||
3748 | account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); | ||
3749 | } else { | ||
3750 | __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, | ||
3751 | &cpustat->system); | ||
3752 | } | ||
3753 | } | ||
3754 | |||
3755 | static void irqtime_account_idle_ticks(int ticks) | ||
3756 | { | ||
3757 | int i; | ||
3758 | struct rq *rq = this_rq(); | ||
3759 | |||
3760 | for (i = 0; i < ticks; i++) | ||
3761 | irqtime_account_process_tick(current, 0, rq); | ||
3762 | } | ||
3763 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ | ||
3764 | static void irqtime_account_idle_ticks(int ticks) {} | ||
3765 | static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | ||
3766 | struct rq *rq) {} | ||
3767 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ | ||
3768 | |||
3640 | /* | 3769 | /* |
3641 | * Account a single tick of cpu time. | 3770 | * Account a single tick of cpu time. |
3642 | * @p: the process that the cpu time gets accounted to | 3771 | * @p: the process that the cpu time gets accounted to |
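
The comment block in irqtime_account_process_tick() spells out the demultiplexing order for a tick: pending hardirq, pending softirq (with ksoftirqd charged to the softirq bucket as well), user time, idle, guest, and finally plain system time. The function below is only a flattened restatement of that ordering; the enum, the flag arguments and classify_tick() are not kernel interfaces.

#include <stdio.h>

enum tick_bucket { B_IRQ, B_SOFTIRQ, B_USER, B_IDLE, B_GUEST, B_SYSTEM };

static enum tick_bucket classify_tick(int hi_pending, int si_pending,
                                      int is_ksoftirqd, int user_tick,
                                      int is_idle, int is_guest)
{
    if (hi_pending)
        return B_IRQ;
    if (si_pending || is_ksoftirqd)
        return B_SOFTIRQ;
    if (user_tick)
        return B_USER;
    if (is_idle)
        return B_IDLE;
    if (is_guest)
        return B_GUEST;
    return B_SYSTEM;
}

int main(void)
{
    /* a pending softirq wins over the tick being a user tick */
    printf("%d\n", classify_tick(0, 1, 0, 1, 0, 0) == B_SOFTIRQ);
    /* nothing special pending on the idle task -> idle time */
    printf("%d\n", classify_tick(0, 0, 0, 0, 1, 0) == B_IDLE);
    return 0;
}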
@@ -3647,6 +3776,11 @@ void account_process_tick(struct task_struct *p, int user_tick) | |||
3647 | cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); | 3776 | cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); |
3648 | struct rq *rq = this_rq(); | 3777 | struct rq *rq = this_rq(); |
3649 | 3778 | ||
3779 | if (sched_clock_irqtime) { | ||
3780 | irqtime_account_process_tick(p, user_tick, rq); | ||
3781 | return; | ||
3782 | } | ||
3783 | |||
3650 | if (user_tick) | 3784 | if (user_tick) |
3651 | account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); | 3785 | account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); |
3652 | else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) | 3786 | else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) |
@@ -3672,6 +3806,12 @@ void account_steal_ticks(unsigned long ticks) | |||
3672 | */ | 3806 | */ |
3673 | void account_idle_ticks(unsigned long ticks) | 3807 | void account_idle_ticks(unsigned long ticks) |
3674 | { | 3808 | { |
3809 | |||
3810 | if (sched_clock_irqtime) { | ||
3811 | irqtime_account_idle_ticks(ticks); | ||
3812 | return; | ||
3813 | } | ||
3814 | |||
3675 | account_idle_time(jiffies_to_cputime(ticks)); | 3815 | account_idle_time(jiffies_to_cputime(ticks)); |
3676 | } | 3816 | } |
3677 | 3817 | ||
@@ -3889,7 +4029,7 @@ static inline void schedule_debug(struct task_struct *prev) | |||
3889 | schedstat_inc(this_rq(), sched_count); | 4029 | schedstat_inc(this_rq(), sched_count); |
3890 | #ifdef CONFIG_SCHEDSTATS | 4030 | #ifdef CONFIG_SCHEDSTATS |
3891 | if (unlikely(prev->lock_depth >= 0)) { | 4031 | if (unlikely(prev->lock_depth >= 0)) { |
3892 | schedstat_inc(this_rq(), bkl_count); | 4032 | schedstat_inc(this_rq(), rq_sched_info.bkl_count); |
3893 | schedstat_inc(prev, sched_info.bkl_count); | 4033 | schedstat_inc(prev, sched_info.bkl_count); |
3894 | } | 4034 | } |
3895 | #endif | 4035 | #endif |
@@ -3991,9 +4131,6 @@ need_resched_nonpreemptible: | |||
3991 | rq->skip_clock_update = 0; | 4131 | rq->skip_clock_update = 0; |
3992 | 4132 | ||
3993 | if (likely(prev != next)) { | 4133 | if (likely(prev != next)) { |
3994 | sched_info_switch(prev, next); | ||
3995 | perf_event_task_sched_out(prev, next); | ||
3996 | |||
3997 | rq->nr_switches++; | 4134 | rq->nr_switches++; |
3998 | rq->curr = next; | 4135 | rq->curr = next; |
3999 | ++*switch_count; | 4136 | ++*switch_count; |
@@ -4215,6 +4352,7 @@ void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) | |||
4215 | { | 4352 | { |
4216 | __wake_up_common(q, mode, 1, 0, key); | 4353 | __wake_up_common(q, mode, 1, 0, key); |
4217 | } | 4354 | } |
4355 | EXPORT_SYMBOL_GPL(__wake_up_locked_key); | ||
4218 | 4356 | ||
4219 | /** | 4357 | /** |
4220 | * __wake_up_sync_key - wake up threads blocked on a waitqueue. | 4358 | * __wake_up_sync_key - wake up threads blocked on a waitqueue. |
@@ -4572,11 +4710,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
4572 | 4710 | ||
4573 | if (running) | 4711 | if (running) |
4574 | p->sched_class->set_curr_task(rq); | 4712 | p->sched_class->set_curr_task(rq); |
4575 | if (on_rq) { | 4713 | if (on_rq) |
4576 | enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); | 4714 | enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); |
4577 | 4715 | ||
4578 | check_class_changed(rq, p, prev_class, oldprio, running); | 4716 | check_class_changed(rq, p, prev_class, oldprio); |
4579 | } | ||
4580 | task_rq_unlock(rq, &flags); | 4717 | task_rq_unlock(rq, &flags); |
4581 | } | 4718 | } |
4582 | 4719 | ||
@@ -4824,12 +4961,15 @@ recheck: | |||
4824 | param->sched_priority > rlim_rtprio) | 4961 | param->sched_priority > rlim_rtprio) |
4825 | return -EPERM; | 4962 | return -EPERM; |
4826 | } | 4963 | } |
4964 | |||
4827 | /* | 4965 | /* |
4828 | * Like positive nice levels, dont allow tasks to | 4966 | * Treat SCHED_IDLE as nice 20. Only allow a switch to |
4829 | * move out of SCHED_IDLE either: | 4967 | * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. |
4830 | */ | 4968 | */ |
4831 | if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) | 4969 | if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) { |
4832 | return -EPERM; | 4970 | if (!can_nice(p, TASK_NICE(p))) |
4971 | return -EPERM; | ||
4972 | } | ||
4833 | 4973 | ||
4834 | /* can't change other user's priorities */ | 4974 | /* can't change other user's priorities */ |
4835 | if (!check_same_owner(p)) | 4975 | if (!check_same_owner(p)) |
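
The SCHED_IDLE hunk above replaces the old blanket -EPERM with a conditional one: an unprivileged task may leave SCHED_IDLE for SCHED_NORMAL if RLIMIT_NICE would permit its current nice level anyway, with nice -20..19 mapped onto the rlimit scale 40..1 the way can_nice() does it. A small illustrative calculation, using made-up helper names and omitting the CAP_SYS_NICE escape hatch:

#include <stdio.h>

/* nice -20..19 maps onto RLIMIT_NICE units 40..1 */
static int nice_to_rlimit(int nice)
{
    return 20 - nice;
}

static int can_nice(int nice, unsigned long rlimit_nice_cur)
{
    return nice_to_rlimit(nice) <= (int)rlimit_nice_cur;
}

/* May an unprivileged SCHED_IDLE task switch back to SCHED_NORMAL? */
static int idle_to_normal_allowed(int task_nice, unsigned long rlimit_nice_cur)
{
    return can_nice(task_nice, rlimit_nice_cur);
}

int main(void)
{
    printf("%d\n", idle_to_normal_allowed(0, 20));      /* nice 0 needs 20 -> yes */
    printf("%d\n", idle_to_normal_allowed(0, 0));       /* tight rlimit -> no  */
    return 0;
}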
@@ -4873,7 +5013,8 @@ recheck: | |||
4873 | * assigned. | 5013 | * assigned. |
4874 | */ | 5014 | */ |
4875 | if (rt_bandwidth_enabled() && rt_policy(policy) && | 5015 | if (rt_bandwidth_enabled() && rt_policy(policy) && |
4876 | task_group(p)->rt_bandwidth.rt_runtime == 0) { | 5016 | task_group(p)->rt_bandwidth.rt_runtime == 0 && |
5017 | !task_group_is_autogroup(task_group(p))) { | ||
4877 | __task_rq_unlock(rq); | 5018 | __task_rq_unlock(rq); |
4878 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | 5019 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
4879 | return -EPERM; | 5020 | return -EPERM; |
@@ -4903,11 +5044,10 @@ recheck: | |||
4903 | 5044 | ||
4904 | if (running) | 5045 | if (running) |
4905 | p->sched_class->set_curr_task(rq); | 5046 | p->sched_class->set_curr_task(rq); |
4906 | if (on_rq) { | 5047 | if (on_rq) |
4907 | activate_task(rq, p, 0); | 5048 | activate_task(rq, p, 0); |
4908 | 5049 | ||
4909 | check_class_changed(rq, p, prev_class, oldprio, running); | 5050 | check_class_changed(rq, p, prev_class, oldprio); |
4910 | } | ||
4911 | __task_rq_unlock(rq); | 5051 | __task_rq_unlock(rq); |
4912 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | 5052 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
4913 | 5053 | ||
@@ -5324,6 +5464,65 @@ void __sched yield(void) | |||
5324 | } | 5464 | } |
5325 | EXPORT_SYMBOL(yield); | 5465 | EXPORT_SYMBOL(yield); |
5326 | 5466 | ||
5467 | /** | ||
5468 | * yield_to - yield the current processor to another thread in | ||
5469 | * your thread group, or accelerate that thread toward the | ||
5470 | * processor it's on. | ||
5471 | * | ||
5472 | * It's the caller's job to ensure that the target task struct | ||
5473 | * can't go away on us before we can do any checks. | ||
5474 | * | ||
5475 | * Returns true if we indeed boosted the target task. | ||
5476 | */ | ||
5477 | bool __sched yield_to(struct task_struct *p, bool preempt) | ||
5478 | { | ||
5479 | struct task_struct *curr = current; | ||
5480 | struct rq *rq, *p_rq; | ||
5481 | unsigned long flags; | ||
5482 | bool yielded = 0; | ||
5483 | |||
5484 | local_irq_save(flags); | ||
5485 | rq = this_rq(); | ||
5486 | |||
5487 | again: | ||
5488 | p_rq = task_rq(p); | ||
5489 | double_rq_lock(rq, p_rq); | ||
5490 | while (task_rq(p) != p_rq) { | ||
5491 | double_rq_unlock(rq, p_rq); | ||
5492 | goto again; | ||
5493 | } | ||
5494 | |||
5495 | if (!curr->sched_class->yield_to_task) | ||
5496 | goto out; | ||
5497 | |||
5498 | if (curr->sched_class != p->sched_class) | ||
5499 | goto out; | ||
5500 | |||
5501 | if (task_running(p_rq, p) || p->state) | ||
5502 | goto out; | ||
5503 | |||
5504 | yielded = curr->sched_class->yield_to_task(rq, p, preempt); | ||
5505 | if (yielded) { | ||
5506 | schedstat_inc(rq, yld_count); | ||
5507 | /* | ||
5508 | * Make p's CPU reschedule; pick_next_entity takes care of | ||
5509 | * fairness. | ||
5510 | */ | ||
5511 | if (preempt && rq != p_rq) | ||
5512 | resched_task(p_rq->curr); | ||
5513 | } | ||
5514 | |||
5515 | out: | ||
5516 | double_rq_unlock(rq, p_rq); | ||
5517 | local_irq_restore(flags); | ||
5518 | |||
5519 | if (yielded) | ||
5520 | schedule(); | ||
5521 | |||
5522 | return yielded; | ||
5523 | } | ||
5524 | EXPORT_SYMBOL_GPL(yield_to); | ||
5525 | |||
5327 | /* | 5526 | /* |
5328 | * This task is about to go to sleep on IO. Increment rq->nr_iowait so | 5527 | * This task is about to go to sleep on IO. Increment rq->nr_iowait so |
5329 | * that process accounting knows that this is a task in IO wait state. | 5528 | * that process accounting knows that this is a task in IO wait state. |
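
yield_to() has to hold both its own runqueue lock and the target's, but the target may migrate between reading task_rq(p) and taking the locks — hence the double_rq_lock()/recheck/retry loop. Below is a single-threaded, non-kernel sketch of that revalidation pattern; struct rq and struct task are trimmed stand-ins and lock_task_pair() is an invented name.

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

struct rq {
    pthread_mutex_t lock;
};

struct task {
    struct rq *rq;              /* which runqueue the task currently lives on */
};

static void lock_pair(struct rq *a, struct rq *b)
{
    if (a == b) {
        pthread_mutex_lock(&a->lock);
    } else if ((uintptr_t)a < (uintptr_t)b) {
        pthread_mutex_lock(&a->lock);
        pthread_mutex_lock(&b->lock);
    } else {
        pthread_mutex_lock(&b->lock);
        pthread_mutex_lock(&a->lock);
    }
}

static void unlock_pair(struct rq *a, struct rq *b)
{
    pthread_mutex_unlock(&a->lock);
    if (a != b)
        pthread_mutex_unlock(&b->lock);
}

/* Lock our rq plus the target's rq, retrying if the target moved meanwhile. */
static struct rq *lock_task_pair(struct rq *mine, struct task *p)
{
    struct rq *p_rq;

again:
    p_rq = p->rq;               /* unlocked snapshot */
    lock_pair(mine, p_rq);
    if (p->rq != p_rq) {        /* migrated before we got the locks */
        unlock_pair(mine, p_rq);
        goto again;
    }
    return p_rq;
}

int main(void)
{
    struct rq rq0, rq1;
    struct task p = { &rq1 };
    struct rq *p_rq;

    pthread_mutex_init(&rq0.lock, NULL);
    pthread_mutex_init(&rq1.lock, NULL);
    p_rq = lock_task_pair(&rq0, &p);
    printf("locked target rq %p\n", (void *)p_rq);
    unlock_pair(&rq0, p_rq);
    return 0;
}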
@@ -5572,7 +5771,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) | |||
5572 | * The idle tasks have their own, simple scheduling class: | 5771 | * The idle tasks have their own, simple scheduling class: |
5573 | */ | 5772 | */ |
5574 | idle->sched_class = &idle_sched_class; | 5773 | idle->sched_class = &idle_sched_class; |
5575 | ftrace_graph_init_task(idle); | 5774 | ftrace_graph_init_idle_task(idle, cpu); |
5576 | } | 5775 | } |
5577 | 5776 | ||
5578 | /* | 5777 | /* |
@@ -7797,6 +7996,10 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) | |||
7797 | INIT_LIST_HEAD(&cfs_rq->tasks); | 7996 | INIT_LIST_HEAD(&cfs_rq->tasks); |
7798 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7997 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7799 | cfs_rq->rq = rq; | 7998 | cfs_rq->rq = rq; |
7999 | /* allow initial update_cfs_load() to truncate */ | ||
8000 | #ifdef CONFIG_SMP | ||
8001 | cfs_rq->load_stamp = 1; | ||
8002 | #endif | ||
7800 | #endif | 8003 | #endif |
7801 | cfs_rq->min_vruntime = (u64)(-(1LL << 20)); | 8004 | cfs_rq->min_vruntime = (u64)(-(1LL << 20)); |
7802 | } | 8005 | } |
@@ -7848,7 +8051,7 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, | |||
7848 | cfs_rq->tg = tg; | 8051 | cfs_rq->tg = tg; |
7849 | 8052 | ||
7850 | tg->se[cpu] = se; | 8053 | tg->se[cpu] = se; |
7851 | /* se could be NULL for init_task_group */ | 8054 | /* se could be NULL for root_task_group */ |
7852 | if (!se) | 8055 | if (!se) |
7853 | return; | 8056 | return; |
7854 | 8057 | ||
@@ -7908,18 +8111,18 @@ void __init sched_init(void) | |||
7908 | ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); | 8111 | ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); |
7909 | 8112 | ||
7910 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8113 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7911 | init_task_group.se = (struct sched_entity **)ptr; | 8114 | root_task_group.se = (struct sched_entity **)ptr; |
7912 | ptr += nr_cpu_ids * sizeof(void **); | 8115 | ptr += nr_cpu_ids * sizeof(void **); |
7913 | 8116 | ||
7914 | init_task_group.cfs_rq = (struct cfs_rq **)ptr; | 8117 | root_task_group.cfs_rq = (struct cfs_rq **)ptr; |
7915 | ptr += nr_cpu_ids * sizeof(void **); | 8118 | ptr += nr_cpu_ids * sizeof(void **); |
7916 | 8119 | ||
7917 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 8120 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
7918 | #ifdef CONFIG_RT_GROUP_SCHED | 8121 | #ifdef CONFIG_RT_GROUP_SCHED |
7919 | init_task_group.rt_se = (struct sched_rt_entity **)ptr; | 8122 | root_task_group.rt_se = (struct sched_rt_entity **)ptr; |
7920 | ptr += nr_cpu_ids * sizeof(void **); | 8123 | ptr += nr_cpu_ids * sizeof(void **); |
7921 | 8124 | ||
7922 | init_task_group.rt_rq = (struct rt_rq **)ptr; | 8125 | root_task_group.rt_rq = (struct rt_rq **)ptr; |
7923 | ptr += nr_cpu_ids * sizeof(void **); | 8126 | ptr += nr_cpu_ids * sizeof(void **); |
7924 | 8127 | ||
7925 | #endif /* CONFIG_RT_GROUP_SCHED */ | 8128 | #endif /* CONFIG_RT_GROUP_SCHED */ |
@@ -7939,13 +8142,13 @@ void __init sched_init(void) | |||
7939 | global_rt_period(), global_rt_runtime()); | 8142 | global_rt_period(), global_rt_runtime()); |
7940 | 8143 | ||
7941 | #ifdef CONFIG_RT_GROUP_SCHED | 8144 | #ifdef CONFIG_RT_GROUP_SCHED |
7942 | init_rt_bandwidth(&init_task_group.rt_bandwidth, | 8145 | init_rt_bandwidth(&root_task_group.rt_bandwidth, |
7943 | global_rt_period(), global_rt_runtime()); | 8146 | global_rt_period(), global_rt_runtime()); |
7944 | #endif /* CONFIG_RT_GROUP_SCHED */ | 8147 | #endif /* CONFIG_RT_GROUP_SCHED */ |
7945 | 8148 | ||
7946 | #ifdef CONFIG_CGROUP_SCHED | 8149 | #ifdef CONFIG_CGROUP_SCHED |
7947 | list_add(&init_task_group.list, &task_groups); | 8150 | list_add(&root_task_group.list, &task_groups); |
7948 | INIT_LIST_HEAD(&init_task_group.children); | 8151 | INIT_LIST_HEAD(&root_task_group.children); |
7949 | autogroup_init(&init_task); | 8152 | autogroup_init(&init_task); |
7950 | #endif /* CONFIG_CGROUP_SCHED */ | 8153 | #endif /* CONFIG_CGROUP_SCHED */ |
7951 | 8154 | ||
@@ -7960,34 +8163,34 @@ void __init sched_init(void) | |||
7960 | init_cfs_rq(&rq->cfs, rq); | 8163 | init_cfs_rq(&rq->cfs, rq); |
7961 | init_rt_rq(&rq->rt, rq); | 8164 | init_rt_rq(&rq->rt, rq); |
7962 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8165 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7963 | init_task_group.shares = init_task_group_load; | 8166 | root_task_group.shares = root_task_group_load; |
7964 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); | 8167 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); |
7965 | /* | 8168 | /* |
7966 | * How much cpu bandwidth does init_task_group get? | 8169 | * How much cpu bandwidth does root_task_group get? |
7967 | * | 8170 | * |
7968 | * In case of task-groups formed thr' the cgroup filesystem, it | 8171 | * In case of task-groups formed thr' the cgroup filesystem, it |
7969 | * gets 100% of the cpu resources in the system. This overall | 8172 | * gets 100% of the cpu resources in the system. This overall |
7970 | * system cpu resource is divided among the tasks of | 8173 | * system cpu resource is divided among the tasks of |
7971 | * init_task_group and its child task-groups in a fair manner, | 8174 | * root_task_group and its child task-groups in a fair manner, |
7972 | * based on each entity's (task or task-group's) weight | 8175 | * based on each entity's (task or task-group's) weight |
7973 | * (se->load.weight). | 8176 | * (se->load.weight). |
7974 | * | 8177 | * |
7975 | * In other words, if init_task_group has 10 tasks of weight | 8178 | * In other words, if root_task_group has 10 tasks of weight |
7976 | * 1024) and two child groups A0 and A1 (of weight 1024 each), | 8179 | * 1024) and two child groups A0 and A1 (of weight 1024 each), |
7977 | * then A0's share of the cpu resource is: | 8180 | * then A0's share of the cpu resource is: |
7978 | * | 8181 | * |
7979 | * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% | 8182 | * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% |
7980 | * | 8183 | * |
7981 | * We achieve this by letting init_task_group's tasks sit | 8184 | * We achieve this by letting root_task_group's tasks sit |
7982 | * directly in rq->cfs (i.e init_task_group->se[] = NULL). | 8185 | * directly in rq->cfs (i.e root_task_group->se[] = NULL). |
7983 | */ | 8186 | */ |
7984 | init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, NULL); | 8187 | init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); |
7985 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 8188 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
7986 | 8189 | ||
7987 | rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; | 8190 | rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; |
7988 | #ifdef CONFIG_RT_GROUP_SCHED | 8191 | #ifdef CONFIG_RT_GROUP_SCHED |
7989 | INIT_LIST_HEAD(&rq->leaf_rt_rq_list); | 8192 | INIT_LIST_HEAD(&rq->leaf_rt_rq_list); |
7990 | init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, NULL); | 8193 | init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); |
7991 | #endif | 8194 | #endif |
7992 | 8195 | ||
7993 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) | 8196 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) |
@@ -8110,6 +8313,8 @@ EXPORT_SYMBOL(__might_sleep); | |||
8110 | #ifdef CONFIG_MAGIC_SYSRQ | 8313 | #ifdef CONFIG_MAGIC_SYSRQ |
8111 | static void normalize_task(struct rq *rq, struct task_struct *p) | 8314 | static void normalize_task(struct rq *rq, struct task_struct *p) |
8112 | { | 8315 | { |
8316 | const struct sched_class *prev_class = p->sched_class; | ||
8317 | int old_prio = p->prio; | ||
8113 | int on_rq; | 8318 | int on_rq; |
8114 | 8319 | ||
8115 | on_rq = p->se.on_rq; | 8320 | on_rq = p->se.on_rq; |
@@ -8120,6 +8325,8 @@ static void normalize_task(struct rq *rq, struct task_struct *p) | |||
8120 | activate_task(rq, p, 0); | 8325 | activate_task(rq, p, 0); |
8121 | resched_task(rq->curr); | 8326 | resched_task(rq->curr); |
8122 | } | 8327 | } |
8328 | |||
8329 | check_class_changed(rq, p, prev_class, old_prio); | ||
8123 | } | 8330 | } |
8124 | 8331 | ||
8125 | void normalize_rt_tasks(void) | 8332 | void normalize_rt_tasks(void) |
@@ -8379,6 +8586,7 @@ static void free_sched_group(struct task_group *tg) | |||
8379 | { | 8586 | { |
8380 | free_fair_sched_group(tg); | 8587 | free_fair_sched_group(tg); |
8381 | free_rt_sched_group(tg); | 8588 | free_rt_sched_group(tg); |
8589 | autogroup_free(tg); | ||
8382 | kfree(tg); | 8590 | kfree(tg); |
8383 | } | 8591 | } |
8384 | 8592 | ||
@@ -8510,7 +8718,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) | |||
8510 | /* Propagate contribution to hierarchy */ | 8718 | /* Propagate contribution to hierarchy */ |
8511 | raw_spin_lock_irqsave(&rq->lock, flags); | 8719 | raw_spin_lock_irqsave(&rq->lock, flags); |
8512 | for_each_sched_entity(se) | 8720 | for_each_sched_entity(se) |
8513 | update_cfs_shares(group_cfs_rq(se), 0); | 8721 | update_cfs_shares(group_cfs_rq(se)); |
8514 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 8722 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
8515 | } | 8723 | } |
8516 | 8724 | ||
@@ -8812,7 +9020,7 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) | |||
8812 | 9020 | ||
8813 | if (!cgrp->parent) { | 9021 | if (!cgrp->parent) { |
8814 | /* This is early initialization for the top cgroup */ | 9022 | /* This is early initialization for the top cgroup */ |
8815 | return &init_task_group.css; | 9023 | return &root_task_group.css; |
8816 | } | 9024 | } |
8817 | 9025 | ||
8818 | parent = cgroup_tg(cgrp->parent); | 9026 | parent = cgroup_tg(cgrp->parent); |
@@ -8883,6 +9091,21 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, | |||
8883 | } | 9091 | } |
8884 | } | 9092 | } |
8885 | 9093 | ||
9094 | static void | ||
9095 | cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, | ||
9096 | struct cgroup *old_cgrp, struct task_struct *task) | ||
9097 | { | ||
9098 | /* | ||
9099 | * cgroup_exit() is called in the copy_process() failure path. | ||
9100 | * Ignore this case since the task hasn't run yet; this avoids | ||
9101 | * trying to poke a half freed task state from generic code. | ||
9102 | */ | ||
9103 | if (!(task->flags & PF_EXITING)) | ||
9104 | return; | ||
9105 | |||
9106 | sched_move_task(task); | ||
9107 | } | ||
9108 | |||
8886 | #ifdef CONFIG_FAIR_GROUP_SCHED | 9109 | #ifdef CONFIG_FAIR_GROUP_SCHED |
8887 | static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, | 9110 | static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, |
8888 | u64 shareval) | 9111 | u64 shareval) |
@@ -8955,6 +9178,7 @@ struct cgroup_subsys cpu_cgroup_subsys = { | |||
8955 | .destroy = cpu_cgroup_destroy, | 9178 | .destroy = cpu_cgroup_destroy, |
8956 | .can_attach = cpu_cgroup_can_attach, | 9179 | .can_attach = cpu_cgroup_can_attach, |
8957 | .attach = cpu_cgroup_attach, | 9180 | .attach = cpu_cgroup_attach, |
9181 | .exit = cpu_cgroup_exit, | ||
8958 | .populate = cpu_cgroup_populate, | 9182 | .populate = cpu_cgroup_populate, |
8959 | .subsys_id = cpu_cgroup_subsys_id, | 9183 | .subsys_id = cpu_cgroup_subsys_id, |
8960 | .early_init = 1, | 9184 | .early_init = 1, |
diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c index c80fedcd476b..5946ac515602 100644 --- a/kernel/sched_autogroup.c +++ b/kernel/sched_autogroup.c | |||
@@ -9,10 +9,9 @@ unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; | |||
9 | static struct autogroup autogroup_default; | 9 | static struct autogroup autogroup_default; |
10 | static atomic_t autogroup_seq_nr; | 10 | static atomic_t autogroup_seq_nr; |
11 | 11 | ||
12 | static void autogroup_init(struct task_struct *init_task) | 12 | static void __init autogroup_init(struct task_struct *init_task) |
13 | { | 13 | { |
14 | autogroup_default.tg = &init_task_group; | 14 | autogroup_default.tg = &root_task_group; |
15 | init_task_group.autogroup = &autogroup_default; | ||
16 | kref_init(&autogroup_default.kref); | 15 | kref_init(&autogroup_default.kref); |
17 | init_rwsem(&autogroup_default.lock); | 16 | init_rwsem(&autogroup_default.lock); |
18 | init_task->signal->autogroup = &autogroup_default; | 17 | init_task->signal->autogroup = &autogroup_default; |
@@ -27,6 +26,11 @@ static inline void autogroup_destroy(struct kref *kref) | |||
27 | { | 26 | { |
28 | struct autogroup *ag = container_of(kref, struct autogroup, kref); | 27 | struct autogroup *ag = container_of(kref, struct autogroup, kref); |
29 | 28 | ||
29 | #ifdef CONFIG_RT_GROUP_SCHED | ||
30 | /* We've redirected RT tasks to the root task group... */ | ||
31 | ag->tg->rt_se = NULL; | ||
32 | ag->tg->rt_rq = NULL; | ||
33 | #endif | ||
30 | sched_destroy_group(ag->tg); | 34 | sched_destroy_group(ag->tg); |
31 | } | 35 | } |
32 | 36 | ||
@@ -55,6 +59,10 @@ static inline struct autogroup *autogroup_task_get(struct task_struct *p) | |||
55 | return ag; | 59 | return ag; |
56 | } | 60 | } |
57 | 61 | ||
62 | #ifdef CONFIG_RT_GROUP_SCHED | ||
63 | static void free_rt_sched_group(struct task_group *tg); | ||
64 | #endif | ||
65 | |||
58 | static inline struct autogroup *autogroup_create(void) | 66 | static inline struct autogroup *autogroup_create(void) |
59 | { | 67 | { |
60 | struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL); | 68 | struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL); |
@@ -63,7 +71,7 @@ static inline struct autogroup *autogroup_create(void) | |||
63 | if (!ag) | 71 | if (!ag) |
64 | goto out_fail; | 72 | goto out_fail; |
65 | 73 | ||
66 | tg = sched_create_group(&init_task_group); | 74 | tg = sched_create_group(&root_task_group); |
67 | 75 | ||
68 | if (IS_ERR(tg)) | 76 | if (IS_ERR(tg)) |
69 | goto out_free; | 77 | goto out_free; |
@@ -72,6 +80,19 @@ static inline struct autogroup *autogroup_create(void) | |||
72 | init_rwsem(&ag->lock); | 80 | init_rwsem(&ag->lock); |
73 | ag->id = atomic_inc_return(&autogroup_seq_nr); | 81 | ag->id = atomic_inc_return(&autogroup_seq_nr); |
74 | ag->tg = tg; | 82 | ag->tg = tg; |
83 | #ifdef CONFIG_RT_GROUP_SCHED | ||
84 | /* | ||
85 | * Autogroup RT tasks are redirected to the root task group | ||
86 | * so we don't have to move tasks around upon policy change, | ||
87 | * or flail around trying to allocate bandwidth on the fly. | ||
88 | * A bandwidth exception in __sched_setscheduler() allows | ||
89 | * the policy change to proceed. Thereafter, task_group() | ||
90 | * returns &root_task_group, so zero bandwidth is required. | ||
91 | */ | ||
92 | free_rt_sched_group(tg); | ||
93 | tg->rt_se = root_task_group.rt_se; | ||
94 | tg->rt_rq = root_task_group.rt_rq; | ||
95 | #endif | ||
75 | tg->autogroup = ag; | 96 | tg->autogroup = ag; |
76 | 97 | ||
77 | return ag; | 98 | return ag; |
@@ -106,6 +127,11 @@ task_wants_autogroup(struct task_struct *p, struct task_group *tg) | |||
106 | return true; | 127 | return true; |
107 | } | 128 | } |
108 | 129 | ||
130 | static inline bool task_group_is_autogroup(struct task_group *tg) | ||
131 | { | ||
132 | return !!tg->autogroup; | ||
133 | } | ||
134 | |||
109 | static inline struct task_group * | 135 | static inline struct task_group * |
110 | autogroup_task_group(struct task_struct *p, struct task_group *tg) | 136 | autogroup_task_group(struct task_struct *p, struct task_group *tg) |
111 | { | 137 | { |
@@ -134,11 +160,15 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) | |||
134 | 160 | ||
135 | p->signal->autogroup = autogroup_kref_get(ag); | 161 | p->signal->autogroup = autogroup_kref_get(ag); |
136 | 162 | ||
163 | if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled)) | ||
164 | goto out; | ||
165 | |||
137 | t = p; | 166 | t = p; |
138 | do { | 167 | do { |
139 | sched_move_task(t); | 168 | sched_move_task(t); |
140 | } while_each_thread(p, t); | 169 | } while_each_thread(p, t); |
141 | 170 | ||
171 | out: | ||
142 | unlock_task_sighand(p, &flags); | 172 | unlock_task_sighand(p, &flags); |
143 | autogroup_kref_put(prev); | 173 | autogroup_kref_put(prev); |
144 | } | 174 | } |
@@ -220,10 +250,14 @@ void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m) | |||
220 | { | 250 | { |
221 | struct autogroup *ag = autogroup_task_get(p); | 251 | struct autogroup *ag = autogroup_task_get(p); |
222 | 252 | ||
253 | if (!task_group_is_autogroup(ag->tg)) | ||
254 | goto out; | ||
255 | |||
223 | down_read(&ag->lock); | 256 | down_read(&ag->lock); |
224 | seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice); | 257 | seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice); |
225 | up_read(&ag->lock); | 258 | up_read(&ag->lock); |
226 | 259 | ||
260 | out: | ||
227 | autogroup_kref_put(ag); | 261 | autogroup_kref_put(ag); |
228 | } | 262 | } |
229 | #endif /* CONFIG_PROC_FS */ | 263 | #endif /* CONFIG_PROC_FS */ |
@@ -231,6 +265,9 @@ void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m) | |||
231 | #ifdef CONFIG_SCHED_DEBUG | 265 | #ifdef CONFIG_SCHED_DEBUG |
232 | static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) | 266 | static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) |
233 | { | 267 | { |
268 | if (!task_group_is_autogroup(tg)) | ||
269 | return 0; | ||
270 | |||
234 | return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); | 271 | return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); |
235 | } | 272 | } |
236 | #endif /* CONFIG_SCHED_DEBUG */ | 273 | #endif /* CONFIG_SCHED_DEBUG */ |
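
Most of the autogroup changes revolve around the kref embedded in struct autogroup: autogroup_kref_get()/put() pin the group while a signal struct or a /proc reader uses it, and autogroup_destroy() runs when the last reference goes away. The sketch below shows only that get/put lifetime rule; it is not the kernel's kref (which is atomic), the refcount here is a plain int under a mutex, and struct autogroup_like is a placeholder.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct autogroup_like {
    pthread_mutex_t ref_lock;
    int refcount;
    long id;
};

static struct autogroup_like *ag_create(long id)
{
    struct autogroup_like *ag = calloc(1, sizeof(*ag));

    if (!ag) {
        perror("calloc");
        exit(1);
    }
    pthread_mutex_init(&ag->ref_lock, NULL);
    ag->refcount = 1;           /* the creating signal struct's reference */
    ag->id = id;
    return ag;
}

static struct autogroup_like *ag_get(struct autogroup_like *ag)
{
    pthread_mutex_lock(&ag->ref_lock);
    ag->refcount++;
    pthread_mutex_unlock(&ag->ref_lock);
    return ag;
}

static void ag_put(struct autogroup_like *ag)
{
    int last;

    pthread_mutex_lock(&ag->ref_lock);
    last = --ag->refcount == 0;
    pthread_mutex_unlock(&ag->ref_lock);
    if (last) {                 /* autogroup_destroy() analogue */
        printf("destroying autogroup-%ld\n", ag->id);
        pthread_mutex_destroy(&ag->ref_lock);
        free(ag);
    }
}

int main(void)
{
    struct autogroup_like *ag = ag_create(1);

    ag_get(ag);         /* e.g. a /proc/<pid>/autogroup reader */
    ag_put(ag);
    ag_put(ag);         /* last reference: the group goes away */
    return 0;
}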
diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h index 5358e241cb20..05577055cfca 100644 --- a/kernel/sched_autogroup.h +++ b/kernel/sched_autogroup.h | |||
@@ -1,6 +1,11 @@ | |||
1 | #ifdef CONFIG_SCHED_AUTOGROUP | 1 | #ifdef CONFIG_SCHED_AUTOGROUP |
2 | 2 | ||
3 | struct autogroup { | 3 | struct autogroup { |
4 | /* | ||
5 | * The reference count is not the number of threads currently | ||
6 | * attached to this autogroup; it is the number of tasks that | ||
7 | * could still use this autogroup. | ||
8 | */ | ||
4 | struct kref kref; | 9 | struct kref kref; |
5 | struct task_group *tg; | 10 | struct task_group *tg; |
6 | struct rw_semaphore lock; | 11 | struct rw_semaphore lock; |
@@ -15,6 +20,10 @@ autogroup_task_group(struct task_struct *p, struct task_group *tg); | |||
15 | 20 | ||
16 | static inline void autogroup_init(struct task_struct *init_task) { } | 21 | static inline void autogroup_init(struct task_struct *init_task) { } |
17 | static inline void autogroup_free(struct task_group *tg) { } | 22 | static inline void autogroup_free(struct task_group *tg) { } |
23 | static inline bool task_group_is_autogroup(struct task_group *tg) | ||
24 | { | ||
25 | return 0; | ||
26 | } | ||
18 | 27 | ||
19 | static inline struct task_group * | 28 | static inline struct task_group * |
20 | autogroup_task_group(struct task_struct *p, struct task_group *tg) | 29 | autogroup_task_group(struct task_struct *p, struct task_group *tg) |
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 1dfae3d014b5..7bacd83a4158 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c | |||
@@ -16,6 +16,8 @@ | |||
16 | #include <linux/kallsyms.h> | 16 | #include <linux/kallsyms.h> |
17 | #include <linux/utsname.h> | 17 | #include <linux/utsname.h> |
18 | 18 | ||
19 | static DEFINE_SPINLOCK(sched_debug_lock); | ||
20 | |||
19 | /* | 21 | /* |
20 | * This allows printing both to /proc/sched_debug and | 22 | * This allows printing both to /proc/sched_debug and |
21 | * to the console | 23 | * to the console |
@@ -86,6 +88,26 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group | |||
86 | } | 88 | } |
87 | #endif | 89 | #endif |
88 | 90 | ||
91 | #ifdef CONFIG_CGROUP_SCHED | ||
92 | static char group_path[PATH_MAX]; | ||
93 | |||
94 | static char *task_group_path(struct task_group *tg) | ||
95 | { | ||
96 | if (autogroup_path(tg, group_path, PATH_MAX)) | ||
97 | return group_path; | ||
98 | |||
99 | /* | ||
100 | * May be NULL if the underlying cgroup isn't fully-created yet | ||
101 | */ | ||
102 | if (!tg->css.cgroup) { | ||
103 | group_path[0] = '\0'; | ||
104 | return group_path; | ||
105 | } | ||
106 | cgroup_path(tg->css.cgroup, group_path, PATH_MAX); | ||
107 | return group_path; | ||
108 | } | ||
109 | #endif | ||
110 | |||
89 | static void | 111 | static void |
90 | print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) | 112 | print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) |
91 | { | 113 | { |
@@ -108,6 +130,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) | |||
108 | SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", | 130 | SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", |
109 | 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); | 131 | 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); |
110 | #endif | 132 | #endif |
133 | #ifdef CONFIG_CGROUP_SCHED | ||
134 | SEQ_printf(m, " %s", task_group_path(task_group(p))); | ||
135 | #endif | ||
111 | 136 | ||
112 | SEQ_printf(m, "\n"); | 137 | SEQ_printf(m, "\n"); |
113 | } | 138 | } |
@@ -144,13 +169,17 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | |||
144 | struct sched_entity *last; | 169 | struct sched_entity *last; |
145 | unsigned long flags; | 170 | unsigned long flags; |
146 | 171 | ||
172 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
173 | SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg)); | ||
174 | #else | ||
147 | SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); | 175 | SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); |
176 | #endif | ||
148 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", | 177 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", |
149 | SPLIT_NS(cfs_rq->exec_clock)); | 178 | SPLIT_NS(cfs_rq->exec_clock)); |
150 | 179 | ||
151 | raw_spin_lock_irqsave(&rq->lock, flags); | 180 | raw_spin_lock_irqsave(&rq->lock, flags); |
152 | if (cfs_rq->rb_leftmost) | 181 | if (cfs_rq->rb_leftmost) |
153 | MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime; | 182 | MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime; |
154 | last = __pick_last_entity(cfs_rq); | 183 | last = __pick_last_entity(cfs_rq); |
155 | if (last) | 184 | if (last) |
156 | max_vruntime = last->vruntime; | 185 | max_vruntime = last->vruntime; |
@@ -191,7 +220,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | |||
191 | 220 | ||
192 | void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) | 221 | void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) |
193 | { | 222 | { |
223 | #ifdef CONFIG_RT_GROUP_SCHED | ||
224 | SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg)); | ||
225 | #else | ||
194 | SEQ_printf(m, "\nrt_rq[%d]:\n", cpu); | 226 | SEQ_printf(m, "\nrt_rq[%d]:\n", cpu); |
227 | #endif | ||
195 | 228 | ||
196 | #define P(x) \ | 229 | #define P(x) \ |
197 | SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x)) | 230 | SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x)) |
@@ -212,6 +245,7 @@ extern __read_mostly int sched_clock_running; | |||
212 | static void print_cpu(struct seq_file *m, int cpu) | 245 | static void print_cpu(struct seq_file *m, int cpu) |
213 | { | 246 | { |
214 | struct rq *rq = cpu_rq(cpu); | 247 | struct rq *rq = cpu_rq(cpu); |
248 | unsigned long flags; | ||
215 | 249 | ||
216 | #ifdef CONFIG_X86 | 250 | #ifdef CONFIG_X86 |
217 | { | 251 | { |
@@ -262,14 +296,20 @@ static void print_cpu(struct seq_file *m, int cpu) | |||
262 | P(ttwu_count); | 296 | P(ttwu_count); |
263 | P(ttwu_local); | 297 | P(ttwu_local); |
264 | 298 | ||
265 | P(bkl_count); | 299 | SEQ_printf(m, " .%-30s: %d\n", "bkl_count", |
300 | rq->rq_sched_info.bkl_count); | ||
266 | 301 | ||
267 | #undef P | 302 | #undef P |
303 | #undef P64 | ||
268 | #endif | 304 | #endif |
305 | spin_lock_irqsave(&sched_debug_lock, flags); | ||
269 | print_cfs_stats(m, cpu); | 306 | print_cfs_stats(m, cpu); |
270 | print_rt_stats(m, cpu); | 307 | print_rt_stats(m, cpu); |
271 | 308 | ||
309 | rcu_read_lock(); | ||
272 | print_rq(m, rq, cpu); | 310 | print_rq(m, rq, cpu); |
311 | rcu_read_unlock(); | ||
312 | spin_unlock_irqrestore(&sched_debug_lock, flags); | ||
273 | } | 313 | } |
274 | 314 | ||
275 | static const char *sched_tunable_scaling_names[] = { | 315 | static const char *sched_tunable_scaling_names[] = { |
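
task_group_path() prefers the autogroup name, falls back to the cgroup path, and prints an empty string while the underlying cgroup is still being created. A hedged reimplementation of that priority order against a fake task group follows; the struct layout and field names are simplified and do not match the kernel's.

#include <stdio.h>

struct fake_task_group {
    long autogroup_id;          /* nonzero if this group is an autogroup */
    const char *cgroup_path;    /* NULL while the cgroup is being created */
};

static const char *task_group_path(const struct fake_task_group *tg,
                                   char *buf, int buflen)
{
    if (tg->autogroup_id) {
        snprintf(buf, buflen, "/autogroup-%ld", tg->autogroup_id);
        return buf;
    }
    if (!tg->cgroup_path) {
        buf[0] = '\0';
        return buf;
    }
    snprintf(buf, buflen, "%s", tg->cgroup_path);
    return buf;
}

int main(void)
{
    char buf[128];
    struct fake_task_group ag = { 42, NULL };
    struct fake_task_group cg = { 0, "/cpu/mygroup" };

    printf("'%s'\n", task_group_path(&ag, buf, sizeof(buf)));
    printf("'%s'\n", task_group_path(&cg, buf, sizeof(buf)));
    return 0;
}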
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index c62ebae65cf0..3f7ec9e27ee1 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c | |||
@@ -69,14 +69,6 @@ static unsigned int sched_nr_latency = 8; | |||
69 | unsigned int sysctl_sched_child_runs_first __read_mostly; | 69 | unsigned int sysctl_sched_child_runs_first __read_mostly; |
70 | 70 | ||
71 | /* | 71 | /* |
72 | * sys_sched_yield() compat mode | ||
73 | * | ||
74 | * This option switches the agressive yield implementation of the | ||
75 | * old scheduler back on. | ||
76 | */ | ||
77 | unsigned int __read_mostly sysctl_sched_compat_yield; | ||
78 | |||
79 | /* | ||
80 | * SCHED_OTHER wake-up granularity. | 72 | * SCHED_OTHER wake-up granularity. |
81 | * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) | 73 | * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) |
82 | * | 74 | * |
@@ -419,7 +411,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
419 | rb_erase(&se->run_node, &cfs_rq->tasks_timeline); | 411 | rb_erase(&se->run_node, &cfs_rq->tasks_timeline); |
420 | } | 412 | } |
421 | 413 | ||
422 | static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) | 414 | static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) |
423 | { | 415 | { |
424 | struct rb_node *left = cfs_rq->rb_leftmost; | 416 | struct rb_node *left = cfs_rq->rb_leftmost; |
425 | 417 | ||
@@ -429,6 +421,17 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) | |||
429 | return rb_entry(left, struct sched_entity, run_node); | 421 | return rb_entry(left, struct sched_entity, run_node); |
430 | } | 422 | } |
431 | 423 | ||
424 | static struct sched_entity *__pick_next_entity(struct sched_entity *se) | ||
425 | { | ||
426 | struct rb_node *next = rb_next(&se->run_node); | ||
427 | |||
428 | if (!next) | ||
429 | return NULL; | ||
430 | |||
431 | return rb_entry(next, struct sched_entity, run_node); | ||
432 | } | ||
433 | |||
434 | #ifdef CONFIG_SCHED_DEBUG | ||
432 | static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) | 435 | static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) |
433 | { | 436 | { |
434 | struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); | 437 | struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); |
@@ -443,7 +446,6 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) | |||
443 | * Scheduling class statistics methods: | 446 | * Scheduling class statistics methods: |
444 | */ | 447 | */ |
445 | 448 | ||
446 | #ifdef CONFIG_SCHED_DEBUG | ||
447 | int sched_proc_update_handler(struct ctl_table *table, int write, | 449 | int sched_proc_update_handler(struct ctl_table *table, int write, |
448 | void __user *buffer, size_t *lenp, | 450 | void __user *buffer, size_t *lenp, |
449 | loff_t *ppos) | 451 | loff_t *ppos) |
@@ -540,7 +542,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
540 | } | 542 | } |
541 | 543 | ||
542 | static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update); | 544 | static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update); |
543 | static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta); | 545 | static void update_cfs_shares(struct cfs_rq *cfs_rq); |
544 | 546 | ||
545 | /* | 547 | /* |
546 | * Update the current task's runtime statistics. Skip current tasks that | 548 | * Update the current task's runtime statistics. Skip current tasks that |
@@ -699,7 +701,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
699 | cfs_rq->nr_running--; | 701 | cfs_rq->nr_running--; |
700 | } | 702 | } |
701 | 703 | ||
702 | #if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED | 704 | #ifdef CONFIG_FAIR_GROUP_SCHED |
705 | # ifdef CONFIG_SMP | ||
703 | static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq, | 706 | static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq, |
704 | int global_update) | 707 | int global_update) |
705 | { | 708 | { |
@@ -721,10 +724,10 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) | |||
721 | u64 now, delta; | 724 | u64 now, delta; |
722 | unsigned long load = cfs_rq->load.weight; | 725 | unsigned long load = cfs_rq->load.weight; |
723 | 726 | ||
724 | if (!cfs_rq) | 727 | if (cfs_rq->tg == &root_task_group) |
725 | return; | 728 | return; |
726 | 729 | ||
727 | now = rq_of(cfs_rq)->clock; | 730 | now = rq_of(cfs_rq)->clock_task; |
728 | delta = now - cfs_rq->load_stamp; | 731 | delta = now - cfs_rq->load_stamp; |
729 | 732 | ||
730 | /* truncate load history at 4 idle periods */ | 733 | /* truncate load history at 4 idle periods */ |
@@ -732,6 +735,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) | |||
732 | now - cfs_rq->load_last > 4 * period) { | 735 | now - cfs_rq->load_last > 4 * period) { |
733 | cfs_rq->load_period = 0; | 736 | cfs_rq->load_period = 0; |
734 | cfs_rq->load_avg = 0; | 737 | cfs_rq->load_avg = 0; |
738 | delta = period - 1; | ||
735 | } | 739 | } |
736 | 740 | ||
737 | cfs_rq->load_stamp = now; | 741 | cfs_rq->load_stamp = now; |
@@ -762,6 +766,49 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) | |||
762 | list_del_leaf_cfs_rq(cfs_rq); | 766 | list_del_leaf_cfs_rq(cfs_rq); |
763 | } | 767 | } |
764 | 768 | ||
769 | static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) | ||
770 | { | ||
771 | long load_weight, load, shares; | ||
772 | |||
773 | load = cfs_rq->load.weight; | ||
774 | |||
775 | load_weight = atomic_read(&tg->load_weight); | ||
776 | load_weight += load; | ||
777 | load_weight -= cfs_rq->load_contribution; | ||
778 | |||
779 | shares = (tg->shares * load); | ||
780 | if (load_weight) | ||
781 | shares /= load_weight; | ||
782 | |||
783 | if (shares < MIN_SHARES) | ||
784 | shares = MIN_SHARES; | ||
785 | if (shares > tg->shares) | ||
786 | shares = tg->shares; | ||
787 | |||
788 | return shares; | ||
789 | } | ||
790 | |||
791 | static void update_entity_shares_tick(struct cfs_rq *cfs_rq) | ||
792 | { | ||
793 | if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) { | ||
794 | update_cfs_load(cfs_rq, 0); | ||
795 | update_cfs_shares(cfs_rq); | ||
796 | } | ||
797 | } | ||
798 | # else /* CONFIG_SMP */ | ||
799 | static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) | ||
800 | { | ||
801 | } | ||
802 | |||
803 | static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) | ||
804 | { | ||
805 | return tg->shares; | ||
806 | } | ||
807 | |||
808 | static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq) | ||
809 | { | ||
810 | } | ||
811 | # endif /* CONFIG_SMP */ | ||
765 | static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, | 812 | static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, |
766 | unsigned long weight) | 813 | unsigned long weight) |
767 | { | 814 | { |
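
A minimal userspace sketch of the calc_cfs_shares() arithmetic introduced above, with illustrative weights only (MIN_SHARES here is an assumed stand-in for the kernel constant's floor role, not taken from the patch):

#include <stdio.h>

#define MIN_SHARES 2	/* assumed floor, standing in for the kernel constant */

/* shares = tg_shares * local_load / (group_load - local_contribution + local_load),
 * clamped to [MIN_SHARES, tg_shares], mirroring calc_cfs_shares() above. */
static long model_calc_shares(long tg_shares, long tg_load_weight,
			      long cfs_rq_load, long load_contribution)
{
	long load_weight = tg_load_weight + cfs_rq_load - load_contribution;
	long shares = tg_shares * cfs_rq_load;

	if (load_weight)
		shares /= load_weight;

	if (shares < MIN_SHARES)
		shares = MIN_SHARES;
	if (shares > tg_shares)
		shares = tg_shares;

	return shares;
}

int main(void)
{
	/* group weight 1024, group-wide load 3072, this cpu queues 1024 of it */
	printf("%ld\n", model_calc_shares(1024, 3072, 1024, 1024));	/* 341 */
	/* a cpu with nothing queued for the group collapses to the floor */
	printf("%ld\n", model_calc_shares(1024, 3072, 0, 0));		/* 2 */
	return 0;
}
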
@@ -778,51 +825,30 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, | |||
778 | account_entity_enqueue(cfs_rq, se); | 825 | account_entity_enqueue(cfs_rq, se); |
779 | } | 826 | } |
780 | 827 | ||
781 | static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) | 828 | static void update_cfs_shares(struct cfs_rq *cfs_rq) |
782 | { | 829 | { |
783 | struct task_group *tg; | 830 | struct task_group *tg; |
784 | struct sched_entity *se; | 831 | struct sched_entity *se; |
785 | long load_weight, load, shares; | 832 | long shares; |
786 | |||
787 | if (!cfs_rq) | ||
788 | return; | ||
789 | 833 | ||
790 | tg = cfs_rq->tg; | 834 | tg = cfs_rq->tg; |
791 | se = tg->se[cpu_of(rq_of(cfs_rq))]; | 835 | se = tg->se[cpu_of(rq_of(cfs_rq))]; |
792 | if (!se) | 836 | if (!se) |
793 | return; | 837 | return; |
794 | 838 | #ifndef CONFIG_SMP | |
795 | load = cfs_rq->load.weight + weight_delta; | 839 | if (likely(se->load.weight == tg->shares)) |
796 | 840 | return; | |
797 | load_weight = atomic_read(&tg->load_weight); | 841 | #endif |
798 | load_weight -= cfs_rq->load_contribution; | 842 | shares = calc_cfs_shares(cfs_rq, tg); |
799 | load_weight += load; | ||
800 | |||
801 | shares = (tg->shares * load); | ||
802 | if (load_weight) | ||
803 | shares /= load_weight; | ||
804 | |||
805 | if (shares < MIN_SHARES) | ||
806 | shares = MIN_SHARES; | ||
807 | if (shares > tg->shares) | ||
808 | shares = tg->shares; | ||
809 | 843 | ||
810 | reweight_entity(cfs_rq_of(se), se, shares); | 844 | reweight_entity(cfs_rq_of(se), se, shares); |
811 | } | 845 | } |
812 | |||
813 | static void update_entity_shares_tick(struct cfs_rq *cfs_rq) | ||
814 | { | ||
815 | if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) { | ||
816 | update_cfs_load(cfs_rq, 0); | ||
817 | update_cfs_shares(cfs_rq, 0); | ||
818 | } | ||
819 | } | ||
820 | #else /* CONFIG_FAIR_GROUP_SCHED */ | 846 | #else /* CONFIG_FAIR_GROUP_SCHED */ |
821 | static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) | 847 | static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) |
822 | { | 848 | { |
823 | } | 849 | } |
824 | 850 | ||
825 | static inline void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) | 851 | static inline void update_cfs_shares(struct cfs_rq *cfs_rq) |
826 | { | 852 | { |
827 | } | 853 | } |
828 | 854 | ||
@@ -953,8 +979,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
953 | */ | 979 | */ |
954 | update_curr(cfs_rq); | 980 | update_curr(cfs_rq); |
955 | update_cfs_load(cfs_rq, 0); | 981 | update_cfs_load(cfs_rq, 0); |
956 | update_cfs_shares(cfs_rq, se->load.weight); | ||
957 | account_entity_enqueue(cfs_rq, se); | 982 | account_entity_enqueue(cfs_rq, se); |
983 | update_cfs_shares(cfs_rq); | ||
958 | 984 | ||
959 | if (flags & ENQUEUE_WAKEUP) { | 985 | if (flags & ENQUEUE_WAKEUP) { |
960 | place_entity(cfs_rq, se, 0); | 986 | place_entity(cfs_rq, se, 0); |
@@ -971,19 +997,49 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
971 | list_add_leaf_cfs_rq(cfs_rq); | 997 | list_add_leaf_cfs_rq(cfs_rq); |
972 | } | 998 | } |
973 | 999 | ||
974 | static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) | 1000 | static void __clear_buddies_last(struct sched_entity *se) |
975 | { | 1001 | { |
976 | if (!se || cfs_rq->last == se) | 1002 | for_each_sched_entity(se) { |
977 | cfs_rq->last = NULL; | 1003 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
1004 | if (cfs_rq->last == se) | ||
1005 | cfs_rq->last = NULL; | ||
1006 | else | ||
1007 | break; | ||
1008 | } | ||
1009 | } | ||
978 | 1010 | ||
979 | if (!se || cfs_rq->next == se) | 1011 | static void __clear_buddies_next(struct sched_entity *se) |
980 | cfs_rq->next = NULL; | 1012 | { |
1013 | for_each_sched_entity(se) { | ||
1014 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | ||
1015 | if (cfs_rq->next == se) | ||
1016 | cfs_rq->next = NULL; | ||
1017 | else | ||
1018 | break; | ||
1019 | } | ||
1020 | } | ||
1021 | |||
1022 | static void __clear_buddies_skip(struct sched_entity *se) | ||
1023 | { | ||
1024 | for_each_sched_entity(se) { | ||
1025 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | ||
1026 | if (cfs_rq->skip == se) | ||
1027 | cfs_rq->skip = NULL; | ||
1028 | else | ||
1029 | break; | ||
1030 | } | ||
981 | } | 1031 | } |
982 | 1032 | ||
983 | static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) | 1033 | static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) |
984 | { | 1034 | { |
985 | for_each_sched_entity(se) | 1035 | if (cfs_rq->last == se) |
986 | __clear_buddies(cfs_rq_of(se), se); | 1036 | __clear_buddies_last(se); |
1037 | |||
1038 | if (cfs_rq->next == se) | ||
1039 | __clear_buddies_next(se); | ||
1040 | |||
1041 | if (cfs_rq->skip == se) | ||
1042 | __clear_buddies_skip(se); | ||
987 | } | 1043 | } |
988 | 1044 | ||
989 | static void | 1045 | static void |
@@ -1016,7 +1072,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
1016 | update_cfs_load(cfs_rq, 0); | 1072 | update_cfs_load(cfs_rq, 0); |
1017 | account_entity_dequeue(cfs_rq, se); | 1073 | account_entity_dequeue(cfs_rq, se); |
1018 | update_min_vruntime(cfs_rq); | 1074 | update_min_vruntime(cfs_rq); |
1019 | update_cfs_shares(cfs_rq, 0); | 1075 | update_cfs_shares(cfs_rq); |
1020 | 1076 | ||
1021 | /* | 1077 | /* |
1022 | * Normalize the entity after updating the min_vruntime because the | 1078 | * Normalize the entity after updating the min_vruntime because the |
@@ -1059,9 +1115,12 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) | |||
1059 | return; | 1115 | return; |
1060 | 1116 | ||
1061 | if (cfs_rq->nr_running > 1) { | 1117 | if (cfs_rq->nr_running > 1) { |
1062 | struct sched_entity *se = __pick_next_entity(cfs_rq); | 1118 | struct sched_entity *se = __pick_first_entity(cfs_rq); |
1063 | s64 delta = curr->vruntime - se->vruntime; | 1119 | s64 delta = curr->vruntime - se->vruntime; |
1064 | 1120 | ||
1121 | if (delta < 0) | ||
1122 | return; | ||
1123 | |||
1065 | if (delta > ideal_runtime) | 1124 | if (delta > ideal_runtime) |
1066 | resched_task(rq_of(cfs_rq)->curr); | 1125 | resched_task(rq_of(cfs_rq)->curr); |
1067 | } | 1126 | } |
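
A quick model of the guard added to check_preempt_tick() above: when the leftmost entity is already ahead of current, the signed gap goes negative and no preemption is considered; only a positive gap larger than the ideal slice reschedules. The nanosecond values below are invented:

#include <stdio.h>
#include <stdint.h>

/* Mirrors the check above: preempt only when current has run ahead of the
 * leftmost entity by more than its ideal slice. */
static int model_should_resched(int64_t curr_vruntime, int64_t left_vruntime,
				int64_t ideal_runtime)
{
	int64_t delta = curr_vruntime - left_vruntime;

	if (delta < 0)		/* current is behind: nothing to do here */
		return 0;

	return delta > ideal_runtime;
}

int main(void)
{
	printf("%d\n", model_should_resched(5000000, 9000000, 3000000));	/* 0 */
	printf("%d\n", model_should_resched(9000000, 5000000, 3000000));	/* 1 */
	return 0;
}
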
@@ -1100,13 +1159,27 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
1100 | static int | 1159 | static int |
1101 | wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); | 1160 | wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); |
1102 | 1161 | ||
1162 | /* | ||
1163 | * Pick the next process, keeping these things in mind, in this order: | ||
1164 | * 1) keep things fair between processes/task groups | ||
1165 | * 2) pick the "next" process, since someone really wants that to run | ||
1166 | * 3) pick the "last" process, for cache locality | ||
1167 | * 4) do not run the "skip" process, if something else is available | ||
1168 | */ | ||
1103 | static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) | 1169 | static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) |
1104 | { | 1170 | { |
1105 | struct sched_entity *se = __pick_next_entity(cfs_rq); | 1171 | struct sched_entity *se = __pick_first_entity(cfs_rq); |
1106 | struct sched_entity *left = se; | 1172 | struct sched_entity *left = se; |
1107 | 1173 | ||
1108 | if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) | 1174 | /* |
1109 | se = cfs_rq->next; | 1175 | * Avoid running the skip buddy, if running something else can |
1176 | * be done without getting too unfair. | ||
1177 | */ | ||
1178 | if (cfs_rq->skip == se) { | ||
1179 | struct sched_entity *second = __pick_next_entity(se); | ||
1180 | if (second && wakeup_preempt_entity(second, left) < 1) | ||
1181 | se = second; | ||
1182 | } | ||
1110 | 1183 | ||
1111 | /* | 1184 | /* |
1112 | * Prefer last buddy, try to return the CPU to a preempted task. | 1185 | * Prefer last buddy, try to return the CPU to a preempted task. |
@@ -1114,6 +1187,12 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) | |||
1114 | if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) | 1187 | if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) |
1115 | se = cfs_rq->last; | 1188 | se = cfs_rq->last; |
1116 | 1189 | ||
1190 | /* | ||
1191 | * Someone really wants this to run. If it's not unfair, run it. | ||
1192 | */ | ||
1193 | if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) | ||
1194 | se = cfs_rq->next; | ||
1195 | |||
1117 | clear_buddies(cfs_rq, se); | 1196 | clear_buddies(cfs_rq, se); |
1118 | 1197 | ||
1119 | return se; | 1198 | return se; |
@@ -1254,7 +1333,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
1254 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 1333 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
1255 | 1334 | ||
1256 | update_cfs_load(cfs_rq, 0); | 1335 | update_cfs_load(cfs_rq, 0); |
1257 | update_cfs_shares(cfs_rq, 0); | 1336 | update_cfs_shares(cfs_rq); |
1258 | } | 1337 | } |
1259 | 1338 | ||
1260 | hrtick_update(rq); | 1339 | hrtick_update(rq); |
@@ -1284,58 +1363,12 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
1284 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 1363 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
1285 | 1364 | ||
1286 | update_cfs_load(cfs_rq, 0); | 1365 | update_cfs_load(cfs_rq, 0); |
1287 | update_cfs_shares(cfs_rq, 0); | 1366 | update_cfs_shares(cfs_rq); |
1288 | } | 1367 | } |
1289 | 1368 | ||
1290 | hrtick_update(rq); | 1369 | hrtick_update(rq); |
1291 | } | 1370 | } |
1292 | 1371 | ||
1293 | /* | ||
1294 | * sched_yield() support is very simple - we dequeue and enqueue. | ||
1295 | * | ||
1296 | * If compat_yield is turned on then we requeue to the end of the tree. | ||
1297 | */ | ||
1298 | static void yield_task_fair(struct rq *rq) | ||
1299 | { | ||
1300 | struct task_struct *curr = rq->curr; | ||
1301 | struct cfs_rq *cfs_rq = task_cfs_rq(curr); | ||
1302 | struct sched_entity *rightmost, *se = &curr->se; | ||
1303 | |||
1304 | /* | ||
1305 | * Are we the only task in the tree? | ||
1306 | */ | ||
1307 | if (unlikely(cfs_rq->nr_running == 1)) | ||
1308 | return; | ||
1309 | |||
1310 | clear_buddies(cfs_rq, se); | ||
1311 | |||
1312 | if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) { | ||
1313 | update_rq_clock(rq); | ||
1314 | /* | ||
1315 | * Update run-time statistics of the 'current'. | ||
1316 | */ | ||
1317 | update_curr(cfs_rq); | ||
1318 | |||
1319 | return; | ||
1320 | } | ||
1321 | /* | ||
1322 | * Find the rightmost entry in the rbtree: | ||
1323 | */ | ||
1324 | rightmost = __pick_last_entity(cfs_rq); | ||
1325 | /* | ||
1326 | * Already in the rightmost position? | ||
1327 | */ | ||
1328 | if (unlikely(!rightmost || entity_before(rightmost, se))) | ||
1329 | return; | ||
1330 | |||
1331 | /* | ||
1332 | * Minimally necessary key value to be last in the tree: | ||
1333 | * Upon rescheduling, sched_class::put_prev_task() will place | ||
1334 | * 'current' within the tree based on its new key value. | ||
1335 | */ | ||
1336 | se->vruntime = rightmost->vruntime + 1; | ||
1337 | } | ||
1338 | |||
1339 | #ifdef CONFIG_SMP | 1372 | #ifdef CONFIG_SMP |
1340 | 1373 | ||
1341 | static void task_waking_fair(struct rq *rq, struct task_struct *p) | 1374 | static void task_waking_fair(struct rq *rq, struct task_struct *p) |
@@ -1362,27 +1395,27 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) | |||
1362 | return wl; | 1395 | return wl; |
1363 | 1396 | ||
1364 | for_each_sched_entity(se) { | 1397 | for_each_sched_entity(se) { |
1365 | long S, rw, s, a, b; | 1398 | long lw, w; |
1366 | 1399 | ||
1367 | S = se->my_q->tg->shares; | 1400 | tg = se->my_q->tg; |
1368 | s = se->load.weight; | 1401 | w = se->my_q->load.weight; |
1369 | rw = se->my_q->load.weight; | ||
1370 | 1402 | ||
1371 | a = S*(rw + wl); | 1403 | /* use this cpu's instantaneous contribution */ |
1372 | b = S*rw + s*wg; | 1404 | lw = atomic_read(&tg->load_weight); |
1405 | lw -= se->my_q->load_contribution; | ||
1406 | lw += w + wg; | ||
1373 | 1407 | ||
1374 | wl = s*(a-b); | 1408 | wl += w; |
1375 | 1409 | ||
1376 | if (likely(b)) | 1410 | if (lw > 0 && wl < lw) |
1377 | wl /= b; | 1411 | wl = (wl * tg->shares) / lw; |
1412 | else | ||
1413 | wl = tg->shares; | ||
1378 | 1414 | ||
1379 | /* | 1415 | /* zero point is MIN_SHARES */ |
1380 | * Assume the group is already running and will | 1416 | if (wl < MIN_SHARES) |
1381 | * thus already be accounted for in the weight. | 1417 | wl = MIN_SHARES; |
1382 | * | 1418 | wl -= se->load.weight; |
1383 | * That is, moving shares between CPUs, does not | ||
1384 | * alter the group weight. | ||
1385 | */ | ||
1386 | wg = 0; | 1419 | wg = 0; |
1387 | } | 1420 | } |
1388 | 1421 | ||
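
A userspace model of one level of the reworked effective_load() loop above, which uses the group's instantaneous load_weight rather than the old S/rw algebra; all weights below are invented for illustration:

#include <stdio.h>

#define MIN_SHARES 2	/* assumed floor, as in the kernel */

/* One iteration of the loop above: wl is the weight landing on this cpu,
 * wg the weight added group-wide; returns the change of the group entity's
 * weight, which becomes wl for the parent level. */
static long model_effective_load_step(long tg_shares, long tg_load_weight,
				      long my_q_load, long my_q_contrib,
				      long se_weight, long wl, long wg)
{
	long lw = tg_load_weight - my_q_contrib + my_q_load + wg;
	long new_weight;

	wl += my_q_load;

	if (lw > 0 && wl < lw)
		new_weight = (wl * tg_shares) / lw;
	else
		new_weight = tg_shares;

	if (new_weight < MIN_SHARES)
		new_weight = MIN_SHARES;

	return new_weight - se_weight;
}

int main(void)
{
	/* shares 1024, group-wide load 2048, this cpu holds 1024 of it,
	 * its group entity currently weighs 512, a 1024-weight task wakes here */
	printf("%ld\n",
	       model_effective_load_step(1024, 2048, 1024, 1024, 512, 1024, 1024));
	/* prints 170: the parent queue sees roughly +170 weight in this model */
	return 0;
}
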
@@ -1401,7 +1434,7 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu, | |||
1401 | 1434 | ||
1402 | static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) | 1435 | static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) |
1403 | { | 1436 | { |
1404 | unsigned long this_load, load; | 1437 | s64 this_load, load; |
1405 | int idx, this_cpu, prev_cpu; | 1438 | int idx, this_cpu, prev_cpu; |
1406 | unsigned long tl_per_task; | 1439 | unsigned long tl_per_task; |
1407 | struct task_group *tg; | 1440 | struct task_group *tg; |
@@ -1440,8 +1473,8 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) | |||
1440 | * Otherwise check if either cpus are near enough in load to allow this | 1473 | * Otherwise check if either cpus are near enough in load to allow this |
1441 | * task to be woken on this_cpu. | 1474 | * task to be woken on this_cpu. |
1442 | */ | 1475 | */ |
1443 | if (this_load) { | 1476 | if (this_load > 0) { |
1444 | unsigned long this_eff_load, prev_eff_load; | 1477 | s64 this_eff_load, prev_eff_load; |
1445 | 1478 | ||
1446 | this_eff_load = 100; | 1479 | this_eff_load = 100; |
1447 | this_eff_load *= power_of(prev_cpu); | 1480 | this_eff_load *= power_of(prev_cpu); |
@@ -1806,6 +1839,14 @@ static void set_next_buddy(struct sched_entity *se) | |||
1806 | } | 1839 | } |
1807 | } | 1840 | } |
1808 | 1841 | ||
1842 | static void set_skip_buddy(struct sched_entity *se) | ||
1843 | { | ||
1844 | if (likely(task_of(se)->policy != SCHED_IDLE)) { | ||
1845 | for_each_sched_entity(se) | ||
1846 | cfs_rq_of(se)->skip = se; | ||
1847 | } | ||
1848 | } | ||
1849 | |||
1809 | /* | 1850 | /* |
1810 | * Preempt the current task with a newly woken task if needed: | 1851 | * Preempt the current task with a newly woken task if needed: |
1811 | */ | 1852 | */ |
@@ -1829,16 +1870,18 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ | |||
1829 | if (test_tsk_need_resched(curr)) | 1870 | if (test_tsk_need_resched(curr)) |
1830 | return; | 1871 | return; |
1831 | 1872 | ||
1873 | /* Idle tasks are by definition preempted by non-idle tasks. */ | ||
1874 | if (unlikely(curr->policy == SCHED_IDLE) && | ||
1875 | likely(p->policy != SCHED_IDLE)) | ||
1876 | goto preempt; | ||
1877 | |||
1832 | /* | 1878 | /* |
1833 | * Batch and idle tasks do not preempt (their preemption is driven by | 1879 | * Batch and idle tasks do not preempt non-idle tasks (their preemption |
1834 | * the tick): | 1880 | * is driven by the tick): |
1835 | */ | 1881 | */ |
1836 | if (unlikely(p->policy != SCHED_NORMAL)) | 1882 | if (unlikely(p->policy != SCHED_NORMAL)) |
1837 | return; | 1883 | return; |
1838 | 1884 | ||
1839 | /* Idle tasks are by definition preempted by everybody. */ | ||
1840 | if (unlikely(curr->policy == SCHED_IDLE)) | ||
1841 | goto preempt; | ||
1842 | 1885 | ||
1843 | if (!sched_feat(WAKEUP_PREEMPT)) | 1886 | if (!sched_feat(WAKEUP_PREEMPT)) |
1844 | return; | 1887 | return; |
@@ -1904,6 +1947,51 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) | |||
1904 | } | 1947 | } |
1905 | } | 1948 | } |
1906 | 1949 | ||
1950 | /* | ||
1951 | * sched_yield() is very simple | ||
1952 | * | ||
1953 | * The magic of dealing with the ->skip buddy is in pick_next_entity. | ||
1954 | */ | ||
1955 | static void yield_task_fair(struct rq *rq) | ||
1956 | { | ||
1957 | struct task_struct *curr = rq->curr; | ||
1958 | struct cfs_rq *cfs_rq = task_cfs_rq(curr); | ||
1959 | struct sched_entity *se = &curr->se; | ||
1960 | |||
1961 | /* | ||
1962 | * Are we the only task in the tree? | ||
1963 | */ | ||
1964 | if (unlikely(rq->nr_running == 1)) | ||
1965 | return; | ||
1966 | |||
1967 | clear_buddies(cfs_rq, se); | ||
1968 | |||
1969 | if (curr->policy != SCHED_BATCH) { | ||
1970 | update_rq_clock(rq); | ||
1971 | /* | ||
1972 | * Update run-time statistics of the 'current'. | ||
1973 | */ | ||
1974 | update_curr(cfs_rq); | ||
1975 | } | ||
1976 | |||
1977 | set_skip_buddy(se); | ||
1978 | } | ||
1979 | |||
1980 | static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt) | ||
1981 | { | ||
1982 | struct sched_entity *se = &p->se; | ||
1983 | |||
1984 | if (!se->on_rq) | ||
1985 | return false; | ||
1986 | |||
1987 | /* Tell the scheduler that we'd really like pse to run next. */ | ||
1988 | set_next_buddy(se); | ||
1989 | |||
1990 | yield_task_fair(rq); | ||
1991 | |||
1992 | return true; | ||
1993 | } | ||
1994 | |||
1907 | #ifdef CONFIG_SMP | 1995 | #ifdef CONFIG_SMP |
1908 | /************************************************** | 1996 | /************************************************** |
1909 | * Fair scheduling class load-balancing methods: | 1997 | * Fair scheduling class load-balancing methods: |
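
The userspace entry point that now lands in the simplified yield_task_fair() is still plain sched_yield(); a trivial, runnable illustration (the observable effect depends entirely on the skip-buddy handling in pick_next_entity() above):

/* build: gcc -O2 yield_demo.c -o yield_demo */
#include <sched.h>
#include <stdio.h>

int main(void)
{
	int i;

	/* For a SCHED_OTHER task each call reaches sched_class->yield_task,
	 * i.e. yield_task_fair() above, which now only marks the skip buddy
	 * instead of requeueing current to the rightmost vruntime. */
	for (i = 0; i < 5; i++) {
		printf("yield %d\n", i);
		sched_yield();
	}
	return 0;
}
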
@@ -2095,7 +2183,7 @@ static int update_shares_cpu(struct task_group *tg, int cpu) | |||
2095 | * We need to update shares after updating tg->load_weight in | 2183 | * We need to update shares after updating tg->load_weight in |
2096 | * order to adjust the weight of groups with long running tasks. | 2184 | * order to adjust the weight of groups with long running tasks. |
2097 | */ | 2185 | */ |
2098 | update_cfs_shares(cfs_rq, 0); | 2186 | update_cfs_shares(cfs_rq); |
2099 | 2187 | ||
2100 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 2188 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
2101 | 2189 | ||
@@ -2582,7 +2670,6 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) | |||
2582 | * @this_cpu: Cpu for which load balance is currently performed. | 2670 | * @this_cpu: Cpu for which load balance is currently performed. |
2583 | * @idle: Idle status of this_cpu | 2671 | * @idle: Idle status of this_cpu |
2584 | * @load_idx: Load index of sched_domain of this_cpu for load calc. | 2672 | * @load_idx: Load index of sched_domain of this_cpu for load calc. |
2585 | * @sd_idle: Idle status of the sched_domain containing group. | ||
2586 | * @local_group: Does group contain this_cpu. | 2673 | * @local_group: Does group contain this_cpu. |
2587 | * @cpus: Set of cpus considered for load balancing. | 2674 | * @cpus: Set of cpus considered for load balancing. |
2588 | * @balance: Should we balance. | 2675 | * @balance: Should we balance. |
@@ -2590,7 +2677,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) | |||
2590 | */ | 2677 | */ |
2591 | static inline void update_sg_lb_stats(struct sched_domain *sd, | 2678 | static inline void update_sg_lb_stats(struct sched_domain *sd, |
2592 | struct sched_group *group, int this_cpu, | 2679 | struct sched_group *group, int this_cpu, |
2593 | enum cpu_idle_type idle, int load_idx, int *sd_idle, | 2680 | enum cpu_idle_type idle, int load_idx, |
2594 | int local_group, const struct cpumask *cpus, | 2681 | int local_group, const struct cpumask *cpus, |
2595 | int *balance, struct sg_lb_stats *sgs) | 2682 | int *balance, struct sg_lb_stats *sgs) |
2596 | { | 2683 | { |
@@ -2610,9 +2697,6 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
2610 | for_each_cpu_and(i, sched_group_cpus(group), cpus) { | 2697 | for_each_cpu_and(i, sched_group_cpus(group), cpus) { |
2611 | struct rq *rq = cpu_rq(i); | 2698 | struct rq *rq = cpu_rq(i); |
2612 | 2699 | ||
2613 | if (*sd_idle && rq->nr_running) | ||
2614 | *sd_idle = 0; | ||
2615 | |||
2616 | /* Bias balancing toward cpus of our domain */ | 2700 | /* Bias balancing toward cpus of our domain */ |
2617 | if (local_group) { | 2701 | if (local_group) { |
2618 | if (idle_cpu(i) && !first_idle_cpu) { | 2702 | if (idle_cpu(i) && !first_idle_cpu) { |
@@ -2657,7 +2741,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
2657 | 2741 | ||
2658 | /* | 2742 | /* |
2659 | * Consider the group unbalanced when the imbalance is larger | 2743 | * Consider the group unbalanced when the imbalance is larger |
2660 | * than the average weight of two tasks. | 2744 | * than the average weight of a task. |
2661 | * | 2745 | * |
2662 | * APZ: with cgroup the avg task weight can vary wildly and | 2746 | * APZ: with cgroup the avg task weight can vary wildly and |
2663 | * might not be a suitable number - should we keep a | 2747 | * might not be a suitable number - should we keep a |
@@ -2667,7 +2751,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
2667 | if (sgs->sum_nr_running) | 2751 | if (sgs->sum_nr_running) |
2668 | avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; | 2752 | avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; |
2669 | 2753 | ||
2670 | if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task && max_nr_running > 1) | 2754 | if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1) |
2671 | sgs->group_imb = 1; | 2755 | sgs->group_imb = 1; |
2672 | 2756 | ||
2673 | sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); | 2757 | sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); |
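
Worked example of the loosened imbalance test above, with invented per-cpu loads: a spread of max_cpu_load 3072 against min_cpu_load 1024, with an average task weight of 2048, now marks the group imbalanced (2048 >= 2048), whereas the old threshold of twice the average (2048 > 4096) did not.
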
@@ -2727,15 +2811,13 @@ static bool update_sd_pick_busiest(struct sched_domain *sd, | |||
2727 | * @sd: sched_domain whose statistics are to be updated. | 2811 | * @sd: sched_domain whose statistics are to be updated. |
2728 | * @this_cpu: Cpu for which load balance is currently performed. | 2812 | * @this_cpu: Cpu for which load balance is currently performed. |
2729 | * @idle: Idle status of this_cpu | 2813 | * @idle: Idle status of this_cpu |
2730 | * @sd_idle: Idle status of the sched_domain containing sg. | ||
2731 | * @cpus: Set of cpus considered for load balancing. | 2814 | * @cpus: Set of cpus considered for load balancing. |
2732 | * @balance: Should we balance. | 2815 | * @balance: Should we balance. |
2733 | * @sds: variable to hold the statistics for this sched_domain. | 2816 | * @sds: variable to hold the statistics for this sched_domain. |
2734 | */ | 2817 | */ |
2735 | static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | 2818 | static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, |
2736 | enum cpu_idle_type idle, int *sd_idle, | 2819 | enum cpu_idle_type idle, const struct cpumask *cpus, |
2737 | const struct cpumask *cpus, int *balance, | 2820 | int *balance, struct sd_lb_stats *sds) |
2738 | struct sd_lb_stats *sds) | ||
2739 | { | 2821 | { |
2740 | struct sched_domain *child = sd->child; | 2822 | struct sched_domain *child = sd->child; |
2741 | struct sched_group *sg = sd->groups; | 2823 | struct sched_group *sg = sd->groups; |
@@ -2753,7 +2835,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
2753 | 2835 | ||
2754 | local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg)); | 2836 | local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg)); |
2755 | memset(&sgs, 0, sizeof(sgs)); | 2837 | memset(&sgs, 0, sizeof(sgs)); |
2756 | update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle, | 2838 | update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, |
2757 | local_group, cpus, balance, &sgs); | 2839 | local_group, cpus, balance, &sgs); |
2758 | 2840 | ||
2759 | if (local_group && !(*balance)) | 2841 | if (local_group && !(*balance)) |
@@ -3005,7 +3087,6 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | |||
3005 | * @imbalance: Variable which stores amount of weighted load which should | 3087 | * @imbalance: Variable which stores amount of weighted load which should |
3006 | * be moved to restore balance/put a group to idle. | 3088 | * be moved to restore balance/put a group to idle. |
3007 | * @idle: The idle status of this_cpu. | 3089 | * @idle: The idle status of this_cpu. |
3008 | * @sd_idle: The idleness of sd | ||
3009 | * @cpus: The set of CPUs under consideration for load-balancing. | 3090 | * @cpus: The set of CPUs under consideration for load-balancing. |
3010 | * @balance: Pointer to a variable indicating if this_cpu | 3091 | * @balance: Pointer to a variable indicating if this_cpu |
3011 | * is the appropriate cpu to perform load balancing at this_level. | 3092 | * is the appropriate cpu to perform load balancing at this_level. |
@@ -3018,7 +3099,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | |||
3018 | static struct sched_group * | 3099 | static struct sched_group * |
3019 | find_busiest_group(struct sched_domain *sd, int this_cpu, | 3100 | find_busiest_group(struct sched_domain *sd, int this_cpu, |
3020 | unsigned long *imbalance, enum cpu_idle_type idle, | 3101 | unsigned long *imbalance, enum cpu_idle_type idle, |
3021 | int *sd_idle, const struct cpumask *cpus, int *balance) | 3102 | const struct cpumask *cpus, int *balance) |
3022 | { | 3103 | { |
3023 | struct sd_lb_stats sds; | 3104 | struct sd_lb_stats sds; |
3024 | 3105 | ||
@@ -3028,22 +3109,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
3028 | * Compute the various statistics relevant for load balancing at | 3109 | * Compute the various statistics relevant for load balancing at |
3029 | * this level. | 3110 | * this level. |
3030 | */ | 3111 | */ |
3031 | update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus, | 3112 | update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds); |
3032 | balance, &sds); | 3113 | |
3033 | 3114 | /* | |
3034 | /* Cases where imbalance does not exist from POV of this_cpu */ | 3115 | * this_cpu is not the appropriate cpu to perform load balancing at |
3035 | /* 1) this_cpu is not the appropriate cpu to perform load balancing | 3116 | * this level. |
3036 | * at this level. | ||
3037 | * 2) There is no busy sibling group to pull from. | ||
3038 | * 3) This group is the busiest group. | ||
3039 | * 4) This group is more busy than the avg busieness at this | ||
3040 | * sched_domain. | ||
3041 | * 5) The imbalance is within the specified limit. | ||
3042 | * | ||
3043 | * Note: when doing newidle balance, if the local group has excess | ||
3044 | * capacity (i.e. nr_running < group_capacity) and the busiest group | ||
3045 | * does not have any capacity, we force a load balance to pull tasks | ||
3046 | * to the local group. In this case, we skip past checks 3, 4 and 5. | ||
3047 | */ | 3117 | */ |
3048 | if (!(*balance)) | 3118 | if (!(*balance)) |
3049 | goto ret; | 3119 | goto ret; |
@@ -3052,41 +3122,55 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
3052 | check_asym_packing(sd, &sds, this_cpu, imbalance)) | 3122 | check_asym_packing(sd, &sds, this_cpu, imbalance)) |
3053 | return sds.busiest; | 3123 | return sds.busiest; |
3054 | 3124 | ||
3125 | /* There is no busy sibling group to pull tasks from */ | ||
3055 | if (!sds.busiest || sds.busiest_nr_running == 0) | 3126 | if (!sds.busiest || sds.busiest_nr_running == 0) |
3056 | goto out_balanced; | 3127 | goto out_balanced; |
3057 | 3128 | ||
3058 | /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ | 3129 | /* |
3130 | * If the busiest group is imbalanced the below checks don't | ||
3131 | * work because they assume all things are equal, which typically | ||
3132 | * isn't true due to cpus_allowed constraints and the like. | ||
3133 | */ | ||
3134 | if (sds.group_imb) | ||
3135 | goto force_balance; | ||
3136 | |||
3137 | /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ | ||
3059 | if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity && | 3138 | if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity && |
3060 | !sds.busiest_has_capacity) | 3139 | !sds.busiest_has_capacity) |
3061 | goto force_balance; | 3140 | goto force_balance; |
3062 | 3141 | ||
3142 | /* | ||
3143 | * If the local group is more busy than the selected busiest group | ||
3144 | * don't try and pull any tasks. | ||
3145 | */ | ||
3063 | if (sds.this_load >= sds.max_load) | 3146 | if (sds.this_load >= sds.max_load) |
3064 | goto out_balanced; | 3147 | goto out_balanced; |
3065 | 3148 | ||
3149 | /* | ||
3150 | * Don't pull any tasks if this group is already above the domain | ||
3151 | * average load. | ||
3152 | */ | ||
3066 | sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; | 3153 | sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; |
3067 | |||
3068 | if (sds.this_load >= sds.avg_load) | 3154 | if (sds.this_load >= sds.avg_load) |
3069 | goto out_balanced; | 3155 | goto out_balanced; |
3070 | 3156 | ||
3071 | /* | 3157 | if (idle == CPU_IDLE) { |
3072 | * In the CPU_NEWLY_IDLE, use imbalance_pct to be conservative. | ||
3073 | * And to check for busy balance use !idle_cpu instead of | ||
3074 | * CPU_NOT_IDLE. This is because HT siblings will use CPU_NOT_IDLE | ||
3075 | * even when they are idle. | ||
3076 | */ | ||
3077 | if (idle == CPU_NEWLY_IDLE || !idle_cpu(this_cpu)) { | ||
3078 | if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) | ||
3079 | goto out_balanced; | ||
3080 | } else { | ||
3081 | /* | 3158 | /* |
3082 | * This cpu is idle. If the busiest group load doesn't | 3159 | * This cpu is idle. If the busiest group load doesn't |
3083 | * have more tasks than the number of available cpu's and | 3160 | * have more tasks than the number of available cpu's and |
3084 | * there is no imbalance between this and busiest group | 3161 | * there is no imbalance between this and busiest group |
3085 | * wrt to idle cpu's, it is balanced. | 3162 | * wrt to idle cpu's, it is balanced. |
3086 | */ | 3163 | */ |
3087 | if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) && | 3164 | if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) && |
3088 | sds.busiest_nr_running <= sds.busiest_group_weight) | 3165 | sds.busiest_nr_running <= sds.busiest_group_weight) |
3089 | goto out_balanced; | 3166 | goto out_balanced; |
3167 | } else { | ||
3168 | /* | ||
3169 | * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use | ||
3170 | * imbalance_pct to be conservative. | ||
3171 | */ | ||
3172 | if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) | ||
3173 | goto out_balanced; | ||
3090 | } | 3174 | } |
3091 | 3175 | ||
3092 | force_balance: | 3176 | force_balance: |
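
Worked example of the imbalance_pct cut-off kept above, with illustrative values: taking sd->imbalance_pct = 125, this_load = 800 and max_load = 1000, the test 100 * 1000 <= 125 * 800 holds (100000 <= 100000), so the non-idle case bails out as balanced; max_load has to exceed 125% of this_load before tasks are pulled.
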
@@ -3165,7 +3249,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group, | |||
3165 | /* Working cpumask for load_balance and load_balance_newidle. */ | 3249 | /* Working cpumask for load_balance and load_balance_newidle. */ |
3166 | static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); | 3250 | static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); |
3167 | 3251 | ||
3168 | static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle, | 3252 | static int need_active_balance(struct sched_domain *sd, int idle, |
3169 | int busiest_cpu, int this_cpu) | 3253 | int busiest_cpu, int this_cpu) |
3170 | { | 3254 | { |
3171 | if (idle == CPU_NEWLY_IDLE) { | 3255 | if (idle == CPU_NEWLY_IDLE) { |
@@ -3197,10 +3281,6 @@ static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle, | |||
3197 | * move_tasks() will succeed. ld_moved will be true and this | 3281 | * move_tasks() will succeed. ld_moved will be true and this |
3198 | * active balance code will not be triggered. | 3282 | * active balance code will not be triggered. |
3199 | */ | 3283 | */ |
3200 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && | ||
3201 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | ||
3202 | return 0; | ||
3203 | |||
3204 | if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP) | 3284 | if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP) |
3205 | return 0; | 3285 | return 0; |
3206 | } | 3286 | } |
@@ -3218,7 +3298,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
3218 | struct sched_domain *sd, enum cpu_idle_type idle, | 3298 | struct sched_domain *sd, enum cpu_idle_type idle, |
3219 | int *balance) | 3299 | int *balance) |
3220 | { | 3300 | { |
3221 | int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; | 3301 | int ld_moved, all_pinned = 0, active_balance = 0; |
3222 | struct sched_group *group; | 3302 | struct sched_group *group; |
3223 | unsigned long imbalance; | 3303 | unsigned long imbalance; |
3224 | struct rq *busiest; | 3304 | struct rq *busiest; |
@@ -3227,20 +3307,10 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
3227 | 3307 | ||
3228 | cpumask_copy(cpus, cpu_active_mask); | 3308 | cpumask_copy(cpus, cpu_active_mask); |
3229 | 3309 | ||
3230 | /* | ||
3231 | * When power savings policy is enabled for the parent domain, idle | ||
3232 | * sibling can pick up load irrespective of busy siblings. In this case, | ||
3233 | * let the state of idle sibling percolate up as CPU_IDLE, instead of | ||
3234 | * portraying it as CPU_NOT_IDLE. | ||
3235 | */ | ||
3236 | if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && | ||
3237 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | ||
3238 | sd_idle = 1; | ||
3239 | |||
3240 | schedstat_inc(sd, lb_count[idle]); | 3310 | schedstat_inc(sd, lb_count[idle]); |
3241 | 3311 | ||
3242 | redo: | 3312 | redo: |
3243 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, | 3313 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, |
3244 | cpus, balance); | 3314 | cpus, balance); |
3245 | 3315 | ||
3246 | if (*balance == 0) | 3316 | if (*balance == 0) |
@@ -3302,8 +3372,7 @@ redo: | |||
3302 | if (idle != CPU_NEWLY_IDLE) | 3372 | if (idle != CPU_NEWLY_IDLE) |
3303 | sd->nr_balance_failed++; | 3373 | sd->nr_balance_failed++; |
3304 | 3374 | ||
3305 | if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest), | 3375 | if (need_active_balance(sd, idle, cpu_of(busiest), this_cpu)) { |
3306 | this_cpu)) { | ||
3307 | raw_spin_lock_irqsave(&busiest->lock, flags); | 3376 | raw_spin_lock_irqsave(&busiest->lock, flags); |
3308 | 3377 | ||
3309 | /* don't kick the active_load_balance_cpu_stop, | 3378 | /* don't kick the active_load_balance_cpu_stop, |
@@ -3358,10 +3427,6 @@ redo: | |||
3358 | sd->balance_interval *= 2; | 3427 | sd->balance_interval *= 2; |
3359 | } | 3428 | } |
3360 | 3429 | ||
3361 | if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && | ||
3362 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | ||
3363 | ld_moved = -1; | ||
3364 | |||
3365 | goto out; | 3430 | goto out; |
3366 | 3431 | ||
3367 | out_balanced: | 3432 | out_balanced: |
@@ -3375,11 +3440,7 @@ out_one_pinned: | |||
3375 | (sd->balance_interval < sd->max_interval)) | 3440 | (sd->balance_interval < sd->max_interval)) |
3376 | sd->balance_interval *= 2; | 3441 | sd->balance_interval *= 2; |
3377 | 3442 | ||
3378 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && | 3443 | ld_moved = 0; |
3379 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | ||
3380 | ld_moved = -1; | ||
3381 | else | ||
3382 | ld_moved = 0; | ||
3383 | out: | 3444 | out: |
3384 | return ld_moved; | 3445 | return ld_moved; |
3385 | } | 3446 | } |
@@ -3803,8 +3864,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
3803 | if (load_balance(cpu, rq, sd, idle, &balance)) { | 3864 | if (load_balance(cpu, rq, sd, idle, &balance)) { |
3804 | /* | 3865 | /* |
3805 | * We've pulled tasks over so either we're no | 3866 | * We've pulled tasks over so either we're no |
3806 | * longer idle, or one of our SMT siblings is | 3867 | * longer idle. |
3807 | * not idle. | ||
3808 | */ | 3868 | */ |
3809 | idle = CPU_NOT_IDLE; | 3869 | idle = CPU_NOT_IDLE; |
3810 | } | 3870 | } |
@@ -4051,33 +4111,62 @@ static void task_fork_fair(struct task_struct *p) | |||
4051 | * Priority of the task has changed. Check to see if we preempt | 4111 | * Priority of the task has changed. Check to see if we preempt |
4052 | * the current task. | 4112 | * the current task. |
4053 | */ | 4113 | */ |
4054 | static void prio_changed_fair(struct rq *rq, struct task_struct *p, | 4114 | static void |
4055 | int oldprio, int running) | 4115 | prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) |
4056 | { | 4116 | { |
4117 | if (!p->se.on_rq) | ||
4118 | return; | ||
4119 | |||
4057 | /* | 4120 | /* |
4058 | * Reschedule if we are currently running on this runqueue and | 4121 | * Reschedule if we are currently running on this runqueue and |
4059 | * our priority decreased, or if we are not currently running on | 4122 | * our priority decreased, or if we are not currently running on |
4060 | * this runqueue and our priority is higher than the current's | 4123 | * this runqueue and our priority is higher than the current's |
4061 | */ | 4124 | */ |
4062 | if (running) { | 4125 | if (rq->curr == p) { |
4063 | if (p->prio > oldprio) | 4126 | if (p->prio > oldprio) |
4064 | resched_task(rq->curr); | 4127 | resched_task(rq->curr); |
4065 | } else | 4128 | } else |
4066 | check_preempt_curr(rq, p, 0); | 4129 | check_preempt_curr(rq, p, 0); |
4067 | } | 4130 | } |
4068 | 4131 | ||
4132 | static void switched_from_fair(struct rq *rq, struct task_struct *p) | ||
4133 | { | ||
4134 | struct sched_entity *se = &p->se; | ||
4135 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | ||
4136 | |||
4137 | /* | ||
4138 | * Ensure the task's vruntime is normalized, so that when its | ||
4139 | * switched back to the fair class the enqueue_entity(.flags=0) will | ||
4140 | * do the right thing. | ||
4141 | * | ||
4142 | * If it was on_rq, then the dequeue_entity(.flags=0) will already | ||
4143 | * have normalized the vruntime, if it was !on_rq, then only when | ||
4144 | * the task is sleeping will it still have non-normalized vruntime. | ||
4145 | */ | ||
4146 | if (!se->on_rq && p->state != TASK_RUNNING) { | ||
4147 | /* | ||
4148 | * Fix up our vruntime so that the current sleep doesn't | ||
4149 | * cause 'unlimited' sleep bonus. | ||
4150 | */ | ||
4151 | place_entity(cfs_rq, se, 0); | ||
4152 | se->vruntime -= cfs_rq->min_vruntime; | ||
4153 | } | ||
4154 | } | ||
4155 | |||
4069 | /* | 4156 | /* |
4070 | * We switched to the sched_fair class. | 4157 | * We switched to the sched_fair class. |
4071 | */ | 4158 | */ |
4072 | static void switched_to_fair(struct rq *rq, struct task_struct *p, | 4159 | static void switched_to_fair(struct rq *rq, struct task_struct *p) |
4073 | int running) | ||
4074 | { | 4160 | { |
4161 | if (!p->se.on_rq) | ||
4162 | return; | ||
4163 | |||
4075 | /* | 4164 | /* |
4076 | * We were most likely switched from sched_rt, so | 4165 | * We were most likely switched from sched_rt, so |
4077 | * kick off the schedule if running, otherwise just see | 4166 | * kick off the schedule if running, otherwise just see |
4078 | * if we can still preempt the current task. | 4167 | * if we can still preempt the current task. |
4079 | */ | 4168 | */ |
4080 | if (running) | 4169 | if (rq->curr == p) |
4081 | resched_task(rq->curr); | 4170 | resched_task(rq->curr); |
4082 | else | 4171 | else |
4083 | check_preempt_curr(rq, p, 0); | 4172 | check_preempt_curr(rq, p, 0); |
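
A toy model of the vruntime normalization done in switched_from_fair() above (place_entity() is ignored and all numbers are invented): the sleeping task's vruntime is stored relative to the queue's min_vruntime, so re-adding the much larger min_vruntime at a later enqueue cannot turn the sleep into an unbounded bonus:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t min_vruntime_at_switch = 10000000;	/* queue min when leaving CFS */
	uint64_t se_vruntime            = 10500000;	/* task's absolute vruntime */
	uint64_t min_vruntime_at_return = 90000000;	/* queue min when coming back */

	/* switched_from_fair(): keep a relative value while off the class */
	uint64_t normalized = se_vruntime - min_vruntime_at_switch;	/* 500000 */

	/* enqueue_entity(.flags=0) later adds the current min_vruntime back,
	 * so the task resumes ~0.5ms ahead instead of ~80ms behind. */
	printf("%llu\n", (unsigned long long)(normalized + min_vruntime_at_return));
	return 0;
}
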
@@ -4143,6 +4232,7 @@ static const struct sched_class fair_sched_class = { | |||
4143 | .enqueue_task = enqueue_task_fair, | 4232 | .enqueue_task = enqueue_task_fair, |
4144 | .dequeue_task = dequeue_task_fair, | 4233 | .dequeue_task = dequeue_task_fair, |
4145 | .yield_task = yield_task_fair, | 4234 | .yield_task = yield_task_fair, |
4235 | .yield_to_task = yield_to_task_fair, | ||
4146 | 4236 | ||
4147 | .check_preempt_curr = check_preempt_wakeup, | 4237 | .check_preempt_curr = check_preempt_wakeup, |
4148 | 4238 | ||
@@ -4163,6 +4253,7 @@ static const struct sched_class fair_sched_class = { | |||
4163 | .task_fork = task_fork_fair, | 4253 | .task_fork = task_fork_fair, |
4164 | 4254 | ||
4165 | .prio_changed = prio_changed_fair, | 4255 | .prio_changed = prio_changed_fair, |
4256 | .switched_from = switched_from_fair, | ||
4166 | .switched_to = switched_to_fair, | 4257 | .switched_to = switched_to_fair, |
4167 | 4258 | ||
4168 | .get_rr_interval = get_rr_interval_fair, | 4259 | .get_rr_interval = get_rr_interval_fair, |
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 9fa0f402c87c..c82f26c1b7c3 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c | |||
@@ -52,31 +52,15 @@ static void set_curr_task_idle(struct rq *rq) | |||
52 | { | 52 | { |
53 | } | 53 | } |
54 | 54 | ||
55 | static void switched_to_idle(struct rq *rq, struct task_struct *p, | 55 | static void switched_to_idle(struct rq *rq, struct task_struct *p) |
56 | int running) | ||
57 | { | 56 | { |
58 | /* Can this actually happen?? */ | 57 | BUG(); |
59 | if (running) | ||
60 | resched_task(rq->curr); | ||
61 | else | ||
62 | check_preempt_curr(rq, p, 0); | ||
63 | } | 58 | } |
64 | 59 | ||
65 | static void prio_changed_idle(struct rq *rq, struct task_struct *p, | 60 | static void |
66 | int oldprio, int running) | 61 | prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio) |
67 | { | 62 | { |
68 | /* This can happen for hot plug CPUS */ | 63 | BUG(); |
69 | |||
70 | /* | ||
71 | * Reschedule if we are currently running on this runqueue and | ||
72 | * our priority decreased, or if we are not currently running on | ||
73 | * this runqueue and our priority is higher than the current's | ||
74 | */ | ||
75 | if (running) { | ||
76 | if (p->prio > oldprio) | ||
77 | resched_task(rq->curr); | ||
78 | } else | ||
79 | check_preempt_curr(rq, p, 0); | ||
80 | } | 64 | } |
81 | 65 | ||
82 | static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) | 66 | static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) |
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index c914ec747ca6..db308cb08b75 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c | |||
@@ -210,11 +210,12 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se); | |||
210 | 210 | ||
211 | static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) | 211 | static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) |
212 | { | 212 | { |
213 | int this_cpu = smp_processor_id(); | ||
214 | struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; | 213 | struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; |
215 | struct sched_rt_entity *rt_se; | 214 | struct sched_rt_entity *rt_se; |
216 | 215 | ||
217 | rt_se = rt_rq->tg->rt_se[this_cpu]; | 216 | int cpu = cpu_of(rq_of_rt_rq(rt_rq)); |
217 | |||
218 | rt_se = rt_rq->tg->rt_se[cpu]; | ||
218 | 219 | ||
219 | if (rt_rq->rt_nr_running) { | 220 | if (rt_rq->rt_nr_running) { |
220 | if (rt_se && !on_rt_rq(rt_se)) | 221 | if (rt_se && !on_rt_rq(rt_se)) |
@@ -226,10 +227,10 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) | |||
226 | 227 | ||
227 | static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) | 228 | static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) |
228 | { | 229 | { |
229 | int this_cpu = smp_processor_id(); | ||
230 | struct sched_rt_entity *rt_se; | 230 | struct sched_rt_entity *rt_se; |
231 | int cpu = cpu_of(rq_of_rt_rq(rt_rq)); | ||
231 | 232 | ||
232 | rt_se = rt_rq->tg->rt_se[this_cpu]; | 233 | rt_se = rt_rq->tg->rt_se[cpu]; |
233 | 234 | ||
234 | if (rt_se && on_rt_rq(rt_se)) | 235 | if (rt_se && on_rt_rq(rt_se)) |
235 | dequeue_rt_entity(rt_se); | 236 | dequeue_rt_entity(rt_se); |
@@ -565,8 +566,11 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) | |||
565 | if (rt_rq->rt_time || rt_rq->rt_nr_running) | 566 | if (rt_rq->rt_time || rt_rq->rt_nr_running) |
566 | idle = 0; | 567 | idle = 0; |
567 | raw_spin_unlock(&rt_rq->rt_runtime_lock); | 568 | raw_spin_unlock(&rt_rq->rt_runtime_lock); |
568 | } else if (rt_rq->rt_nr_running) | 569 | } else if (rt_rq->rt_nr_running) { |
569 | idle = 0; | 570 | idle = 0; |
571 | if (!rt_rq_throttled(rt_rq)) | ||
572 | enqueue = 1; | ||
573 | } | ||
570 | 574 | ||
571 | if (enqueue) | 575 | if (enqueue) |
572 | sched_rt_rq_enqueue(rt_rq); | 576 | sched_rt_rq_enqueue(rt_rq); |
@@ -625,7 +629,7 @@ static void update_curr_rt(struct rq *rq) | |||
625 | struct rt_rq *rt_rq = rt_rq_of_se(rt_se); | 629 | struct rt_rq *rt_rq = rt_rq_of_se(rt_se); |
626 | u64 delta_exec; | 630 | u64 delta_exec; |
627 | 631 | ||
628 | if (!task_has_rt_policy(curr)) | 632 | if (curr->sched_class != &rt_sched_class) |
629 | return; | 633 | return; |
630 | 634 | ||
631 | delta_exec = rq->clock_task - curr->se.exec_start; | 635 | delta_exec = rq->clock_task - curr->se.exec_start; |
@@ -1595,8 +1599,7 @@ static void rq_offline_rt(struct rq *rq) | |||
1595 | * When switch from the rt queue, we bring ourselves to a position | 1599 | * When switch from the rt queue, we bring ourselves to a position |
1596 | * that we might want to pull RT tasks from other runqueues. | 1600 | * that we might want to pull RT tasks from other runqueues. |
1597 | */ | 1601 | */ |
1598 | static void switched_from_rt(struct rq *rq, struct task_struct *p, | 1602 | static void switched_from_rt(struct rq *rq, struct task_struct *p) |
1599 | int running) | ||
1600 | { | 1603 | { |
1601 | /* | 1604 | /* |
1602 | * If there are other RT tasks then we will reschedule | 1605 | * If there are other RT tasks then we will reschedule |
@@ -1605,7 +1608,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p, | |||
1605 | * we may need to handle the pulling of RT tasks | 1608 | * we may need to handle the pulling of RT tasks |
1606 | * now. | 1609 | * now. |
1607 | */ | 1610 | */ |
1608 | if (!rq->rt.rt_nr_running) | 1611 | if (p->se.on_rq && !rq->rt.rt_nr_running) |
1609 | pull_rt_task(rq); | 1612 | pull_rt_task(rq); |
1610 | } | 1613 | } |
1611 | 1614 | ||
@@ -1624,8 +1627,7 @@ static inline void init_sched_rt_class(void) | |||
1624 | * with RT tasks. In this case we try to push them off to | 1627 | * with RT tasks. In this case we try to push them off to |
1625 | * other runqueues. | 1628 | * other runqueues. |
1626 | */ | 1629 | */ |
1627 | static void switched_to_rt(struct rq *rq, struct task_struct *p, | 1630 | static void switched_to_rt(struct rq *rq, struct task_struct *p) |
1628 | int running) | ||
1629 | { | 1631 | { |
1630 | int check_resched = 1; | 1632 | int check_resched = 1; |
1631 | 1633 | ||
@@ -1636,7 +1638,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p, | |||
1636 | * If that current running task is also an RT task | 1638 | * If that current running task is also an RT task |
1637 | * then see if we can move to another run queue. | 1639 | * then see if we can move to another run queue. |
1638 | */ | 1640 | */ |
1639 | if (!running) { | 1641 | if (p->se.on_rq && rq->curr != p) { |
1640 | #ifdef CONFIG_SMP | 1642 | #ifdef CONFIG_SMP |
1641 | if (rq->rt.overloaded && push_rt_task(rq) && | 1643 | if (rq->rt.overloaded && push_rt_task(rq) && |
1642 | /* Don't resched if we changed runqueues */ | 1644 | /* Don't resched if we changed runqueues */ |
@@ -1652,10 +1654,13 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p, | |||
1652 | * Priority of the task has changed. This may cause | 1654 | * Priority of the task has changed. This may cause |
1653 | * us to initiate a push or pull. | 1655 | * us to initiate a push or pull. |
1654 | */ | 1656 | */ |
1655 | static void prio_changed_rt(struct rq *rq, struct task_struct *p, | 1657 | static void |
1656 | int oldprio, int running) | 1658 | prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) |
1657 | { | 1659 | { |
1658 | if (running) { | 1660 | if (!p->se.on_rq) |
1661 | return; | ||
1662 | |||
1663 | if (rq->curr == p) { | ||
1659 | #ifdef CONFIG_SMP | 1664 | #ifdef CONFIG_SMP |
1660 | /* | 1665 | /* |
1661 | * If our priority decreases while running, we | 1666 | * If our priority decreases while running, we |
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c index 2bf6b47058c1..84ec9bcf82d9 100644 --- a/kernel/sched_stoptask.c +++ b/kernel/sched_stoptask.c | |||
@@ -59,14 +59,13 @@ static void set_curr_task_stop(struct rq *rq) | |||
59 | { | 59 | { |
60 | } | 60 | } |
61 | 61 | ||
62 | static void switched_to_stop(struct rq *rq, struct task_struct *p, | 62 | static void switched_to_stop(struct rq *rq, struct task_struct *p) |
63 | int running) | ||
64 | { | 63 | { |
65 | BUG(); /* it's impossible to change to this class */ | 64 | BUG(); /* it's impossible to change to this class */ |
66 | } | 65 | } |
67 | 66 | ||
68 | static void prio_changed_stop(struct rq *rq, struct task_struct *p, | 67 | static void |
69 | int oldprio, int running) | 68 | prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio) |
70 | { | 69 | { |
71 | BUG(); /* how!?, what priority? */ | 70 | BUG(); /* how!?, what priority? */ |
72 | } | 71 | } |
diff --git a/kernel/smp.c b/kernel/smp.c index 12ed8b013e2d..9910744f0856 100644 --- a/kernel/smp.c +++ b/kernel/smp.c | |||
@@ -13,6 +13,7 @@ | |||
13 | #include <linux/smp.h> | 13 | #include <linux/smp.h> |
14 | #include <linux/cpu.h> | 14 | #include <linux/cpu.h> |
15 | 15 | ||
16 | #ifdef CONFIG_USE_GENERIC_SMP_HELPERS | ||
16 | static struct { | 17 | static struct { |
17 | struct list_head queue; | 18 | struct list_head queue; |
18 | raw_spinlock_t lock; | 19 | raw_spinlock_t lock; |
@@ -193,23 +194,52 @@ void generic_smp_call_function_interrupt(void) | |||
193 | */ | 194 | */ |
194 | list_for_each_entry_rcu(data, &call_function.queue, csd.list) { | 195 | list_for_each_entry_rcu(data, &call_function.queue, csd.list) { |
195 | int refs; | 196 | int refs; |
197 | void (*func) (void *info); | ||
196 | 198 | ||
197 | if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) | 199 | /* |
200 | * Since we walk the list without any locks, we might | ||
201 | * see an entry that was completed, removed from the | ||
202 | * list and is in the process of being reused. | ||
203 | * | ||
204 | * We must check that the cpu is in the cpumask before | ||
205 | * checking the refs, and both must be set before | ||
206 | * executing the callback on this cpu. | ||
207 | */ | ||
208 | |||
209 | if (!cpumask_test_cpu(cpu, data->cpumask)) | ||
210 | continue; | ||
211 | |||
212 | smp_rmb(); | ||
213 | |||
214 | if (atomic_read(&data->refs) == 0) | ||
198 | continue; | 215 | continue; |
199 | 216 | ||
217 | func = data->csd.func; /* for later warn */ | ||
200 | data->csd.func(data->csd.info); | 218 | data->csd.func(data->csd.info); |
201 | 219 | ||
220 | /* | ||
221 | * If the cpu mask is not still set then it enabled interrupts, | ||
222 | * we took another smp interrupt, and executed the function | ||
223 | * twice on this cpu. In theory that copy decremented refs. | ||
224 | */ | ||
225 | if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) { | ||
226 | WARN(1, "%pS enabled interrupts and double executed\n", | ||
227 | func); | ||
228 | continue; | ||
229 | } | ||
230 | |||
202 | refs = atomic_dec_return(&data->refs); | 231 | refs = atomic_dec_return(&data->refs); |
203 | WARN_ON(refs < 0); | 232 | WARN_ON(refs < 0); |
204 | if (!refs) { | ||
205 | raw_spin_lock(&call_function.lock); | ||
206 | list_del_rcu(&data->csd.list); | ||
207 | raw_spin_unlock(&call_function.lock); | ||
208 | } | ||
209 | 233 | ||
210 | if (refs) | 234 | if (refs) |
211 | continue; | 235 | continue; |
212 | 236 | ||
237 | WARN_ON(!cpumask_empty(data->cpumask)); | ||
238 | |||
239 | raw_spin_lock(&call_function.lock); | ||
240 | list_del_rcu(&data->csd.list); | ||
241 | raw_spin_unlock(&call_function.lock); | ||
242 | |||
213 | csd_unlock(&data->csd); | 243 | csd_unlock(&data->csd); |
214 | } | 244 | } |
215 | 245 | ||
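The interrupt-handler hunk above pairs with an smp_wmb() added on the sending side in the next hunk: the cpumask is published before refs, and the receiver tests them in the opposite order with a read barrier in between, so a set bit plus non-zero refs guarantees valid func/info. A hedged sketch of that contract, with an illustrative, simplified element layout:

struct cfd_sketch {
        struct cpumask  cpumask;        /* which CPUs should run the callback */
        atomic_t        refs;           /* CPUs still to run it; 0 = drained */
        void            (*func)(void *info);
        void            *info;
};

/* sender: publish the mask before arming refs */
static void cfd_publish(struct cfd_sketch *d, const struct cpumask *mask)
{
        cpumask_copy(&d->cpumask, mask);                        /* 1: who should run it */
        smp_wmb();                                              /* order cpumask before refs */
        atomic_set(&d->refs, cpumask_weight(&d->cpumask));      /* 2: arm the element */
}

/* receiver: test in the opposite order */
static bool cfd_should_run(struct cfd_sketch *d, int cpu)
{
        if (!cpumask_test_cpu(cpu, &d->cpumask))
                return false;                   /* not for us, or element being reused */
        smp_rmb();                              /* pairs with the sender's smp_wmb() */
        return atomic_read(&d->refs) != 0;      /* zero means a stale, drained entry */
}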
@@ -429,7 +459,7 @@ void smp_call_function_many(const struct cpumask *mask, | |||
429 | * can't happen. | 459 | * can't happen. |
430 | */ | 460 | */ |
431 | WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() | 461 | WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() |
432 | && !oops_in_progress); | 462 | && !oops_in_progress && !early_boot_irqs_disabled); |
433 | 463 | ||
434 | /* So, what's a CPU they want? Ignoring this one. */ | 464 | /* So, what's a CPU they want? Ignoring this one. */ |
435 | cpu = cpumask_first_and(mask, cpu_online_mask); | 465 | cpu = cpumask_first_and(mask, cpu_online_mask); |
@@ -453,11 +483,21 @@ void smp_call_function_many(const struct cpumask *mask, | |||
453 | 483 | ||
454 | data = &__get_cpu_var(cfd_data); | 484 | data = &__get_cpu_var(cfd_data); |
455 | csd_lock(&data->csd); | 485 | csd_lock(&data->csd); |
486 | BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask)); | ||
456 | 487 | ||
457 | data->csd.func = func; | 488 | data->csd.func = func; |
458 | data->csd.info = info; | 489 | data->csd.info = info; |
459 | cpumask_and(data->cpumask, mask, cpu_online_mask); | 490 | cpumask_and(data->cpumask, mask, cpu_online_mask); |
460 | cpumask_clear_cpu(this_cpu, data->cpumask); | 491 | cpumask_clear_cpu(this_cpu, data->cpumask); |
492 | |||
493 | /* | ||
494 | * To ensure the interrupt handler gets an complete view | ||
493 | * To ensure the interrupt handler gets a complete view | ||
495 | * we order the cpumask and refs writes and order the read | ||
496 | * of them in the interrupt handler. In addition we may | ||
497 | * only clear our own cpu bit from the mask. | ||
498 | */ | ||
499 | smp_wmb(); | ||
500 | |||
461 | atomic_set(&data->refs, cpumask_weight(data->cpumask)); | 501 | atomic_set(&data->refs, cpumask_weight(data->cpumask)); |
462 | 502 | ||
463 | raw_spin_lock_irqsave(&call_function.lock, flags); | 503 | raw_spin_lock_irqsave(&call_function.lock, flags); |
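Usage-wise, smp_call_function_many() must be called with preemption disabled and, per the relaxed warning above, with interrupts enabled outside of early boot or an oops; it skips the calling CPU. A hedged usage sketch with illustrative names:

static void remote_flush(void *info)
{
        /* runs in IPI context on every other CPU in the mask */
}

static void flush_others(const struct cpumask *mask)
{
        preempt_disable();
        smp_call_function_many(mask, remote_flush, NULL, 1);    /* wait = 1 */
        preempt_enable();
}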
@@ -529,3 +569,24 @@ void ipi_call_unlock_irq(void) | |||
529 | { | 569 | { |
530 | raw_spin_unlock_irq(&call_function.lock); | 570 | raw_spin_unlock_irq(&call_function.lock); |
531 | } | 571 | } |
572 | #endif /* USE_GENERIC_SMP_HELPERS */ | ||
573 | |||
574 | /* | ||
575 | * Call a function on all processors. May be used during early boot while | ||
576 | * early_boot_irqs_disabled is set. Use local_irq_save/restore() instead | ||
577 | * of local_irq_disable/enable(). | ||
578 | */ | ||
579 | int on_each_cpu(void (*func) (void *info), void *info, int wait) | ||
580 | { | ||
581 | unsigned long flags; | ||
582 | int ret = 0; | ||
583 | |||
584 | preempt_disable(); | ||
585 | ret = smp_call_function(func, info, wait); | ||
586 | local_irq_save(flags); | ||
587 | func(info); | ||
588 | local_irq_restore(flags); | ||
589 | preempt_enable(); | ||
590 | return ret; | ||
591 | } | ||
592 | EXPORT_SYMBOL(on_each_cpu); | ||
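The helper moved here from softirq.c now brackets the local call with local_irq_save()/restore(), which is what makes it usable while early_boot_irqs_disabled is set; the callback also runs on the calling CPU. A hedged usage sketch:

static void count_hit(void *info)
{
        atomic_inc(info);               /* runs on every online CPU, including this one */
}

static int count_online_cpus_sketch(void)
{
        atomic_t hits = ATOMIC_INIT(0);

        on_each_cpu(count_hit, &hits, 1);       /* wait until every CPU has run it */
        return atomic_read(&hits);
}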
diff --git a/kernel/softirq.c b/kernel/softirq.c index d4d918a91881..56e5dec837f0 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
@@ -54,7 +54,7 @@ EXPORT_SYMBOL(irq_stat); | |||
54 | 54 | ||
55 | static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp; | 55 | static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp; |
56 | 56 | ||
57 | static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); | 57 | DEFINE_PER_CPU(struct task_struct *, ksoftirqd); |
58 | 58 | ||
59 | char *softirq_to_name[NR_SOFTIRQS] = { | 59 | char *softirq_to_name[NR_SOFTIRQS] = { |
60 | "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", | 60 | "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", |
@@ -70,7 +70,7 @@ char *softirq_to_name[NR_SOFTIRQS] = { | |||
70 | static void wakeup_softirqd(void) | 70 | static void wakeup_softirqd(void) |
71 | { | 71 | { |
72 | /* Interrupts are disabled: no need to stop preemption */ | 72 | /* Interrupts are disabled: no need to stop preemption */ |
73 | struct task_struct *tsk = __get_cpu_var(ksoftirqd); | 73 | struct task_struct *tsk = __this_cpu_read(ksoftirqd); |
74 | 74 | ||
75 | if (tsk && tsk->state != TASK_RUNNING) | 75 | if (tsk && tsk->state != TASK_RUNNING) |
76 | wake_up_process(tsk); | 76 | wake_up_process(tsk); |
@@ -311,9 +311,21 @@ void irq_enter(void) | |||
311 | } | 311 | } |
312 | 312 | ||
313 | #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED | 313 | #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED |
314 | # define invoke_softirq() __do_softirq() | 314 | static inline void invoke_softirq(void) |
315 | { | ||
316 | if (!force_irqthreads) | ||
317 | __do_softirq(); | ||
318 | else | ||
319 | wakeup_softirqd(); | ||
320 | } | ||
315 | #else | 321 | #else |
316 | # define invoke_softirq() do_softirq() | 322 | static inline void invoke_softirq(void) |
323 | { | ||
324 | if (!force_irqthreads) | ||
325 | do_softirq(); | ||
326 | else | ||
327 | wakeup_softirqd(); | ||
328 | } | ||
317 | #endif | 329 | #endif |
318 | 330 | ||
319 | /* | 331 | /* |
@@ -388,8 +400,8 @@ void __tasklet_schedule(struct tasklet_struct *t) | |||
388 | 400 | ||
389 | local_irq_save(flags); | 401 | local_irq_save(flags); |
390 | t->next = NULL; | 402 | t->next = NULL; |
391 | *__get_cpu_var(tasklet_vec).tail = t; | 403 | *__this_cpu_read(tasklet_vec.tail) = t; |
392 | __get_cpu_var(tasklet_vec).tail = &(t->next); | 404 | __this_cpu_write(tasklet_vec.tail, &(t->next)); |
393 | raise_softirq_irqoff(TASKLET_SOFTIRQ); | 405 | raise_softirq_irqoff(TASKLET_SOFTIRQ); |
394 | local_irq_restore(flags); | 406 | local_irq_restore(flags); |
395 | } | 407 | } |
@@ -402,8 +414,8 @@ void __tasklet_hi_schedule(struct tasklet_struct *t) | |||
402 | 414 | ||
403 | local_irq_save(flags); | 415 | local_irq_save(flags); |
404 | t->next = NULL; | 416 | t->next = NULL; |
405 | *__get_cpu_var(tasklet_hi_vec).tail = t; | 417 | *__this_cpu_read(tasklet_hi_vec.tail) = t; |
406 | __get_cpu_var(tasklet_hi_vec).tail = &(t->next); | 418 | __this_cpu_write(tasklet_hi_vec.tail, &(t->next)); |
407 | raise_softirq_irqoff(HI_SOFTIRQ); | 419 | raise_softirq_irqoff(HI_SOFTIRQ); |
408 | local_irq_restore(flags); | 420 | local_irq_restore(flags); |
409 | } | 421 | } |
@@ -414,8 +426,8 @@ void __tasklet_hi_schedule_first(struct tasklet_struct *t) | |||
414 | { | 426 | { |
415 | BUG_ON(!irqs_disabled()); | 427 | BUG_ON(!irqs_disabled()); |
416 | 428 | ||
417 | t->next = __get_cpu_var(tasklet_hi_vec).head; | 429 | t->next = __this_cpu_read(tasklet_hi_vec.head); |
418 | __get_cpu_var(tasklet_hi_vec).head = t; | 430 | __this_cpu_write(tasklet_hi_vec.head, t); |
419 | __raise_softirq_irqoff(HI_SOFTIRQ); | 431 | __raise_softirq_irqoff(HI_SOFTIRQ); |
420 | } | 432 | } |
421 | 433 | ||
@@ -426,9 +438,9 @@ static void tasklet_action(struct softirq_action *a) | |||
426 | struct tasklet_struct *list; | 438 | struct tasklet_struct *list; |
427 | 439 | ||
428 | local_irq_disable(); | 440 | local_irq_disable(); |
429 | list = __get_cpu_var(tasklet_vec).head; | 441 | list = __this_cpu_read(tasklet_vec.head); |
430 | __get_cpu_var(tasklet_vec).head = NULL; | 442 | __this_cpu_write(tasklet_vec.head, NULL); |
431 | __get_cpu_var(tasklet_vec).tail = &__get_cpu_var(tasklet_vec).head; | 443 | __this_cpu_write(tasklet_vec.tail, &__get_cpu_var(tasklet_vec).head); |
432 | local_irq_enable(); | 444 | local_irq_enable(); |
433 | 445 | ||
434 | while (list) { | 446 | while (list) { |
@@ -449,8 +461,8 @@ static void tasklet_action(struct softirq_action *a) | |||
449 | 461 | ||
450 | local_irq_disable(); | 462 | local_irq_disable(); |
451 | t->next = NULL; | 463 | t->next = NULL; |
452 | *__get_cpu_var(tasklet_vec).tail = t; | 464 | *__this_cpu_read(tasklet_vec.tail) = t; |
453 | __get_cpu_var(tasklet_vec).tail = &(t->next); | 465 | __this_cpu_write(tasklet_vec.tail, &(t->next)); |
454 | __raise_softirq_irqoff(TASKLET_SOFTIRQ); | 466 | __raise_softirq_irqoff(TASKLET_SOFTIRQ); |
455 | local_irq_enable(); | 467 | local_irq_enable(); |
456 | } | 468 | } |
@@ -461,9 +473,9 @@ static void tasklet_hi_action(struct softirq_action *a) | |||
461 | struct tasklet_struct *list; | 473 | struct tasklet_struct *list; |
462 | 474 | ||
463 | local_irq_disable(); | 475 | local_irq_disable(); |
464 | list = __get_cpu_var(tasklet_hi_vec).head; | 476 | list = __this_cpu_read(tasklet_hi_vec.head); |
465 | __get_cpu_var(tasklet_hi_vec).head = NULL; | 477 | __this_cpu_write(tasklet_hi_vec.head, NULL); |
466 | __get_cpu_var(tasklet_hi_vec).tail = &__get_cpu_var(tasklet_hi_vec).head; | 478 | __this_cpu_write(tasklet_hi_vec.tail, &__get_cpu_var(tasklet_hi_vec).head); |
467 | local_irq_enable(); | 479 | local_irq_enable(); |
468 | 480 | ||
469 | while (list) { | 481 | while (list) { |
@@ -484,8 +496,8 @@ static void tasklet_hi_action(struct softirq_action *a) | |||
484 | 496 | ||
485 | local_irq_disable(); | 497 | local_irq_disable(); |
486 | t->next = NULL; | 498 | t->next = NULL; |
487 | *__get_cpu_var(tasklet_hi_vec).tail = t; | 499 | *__this_cpu_read(tasklet_hi_vec.tail) = t; |
488 | __get_cpu_var(tasklet_hi_vec).tail = &(t->next); | 500 | __this_cpu_write(tasklet_hi_vec.tail, &(t->next)); |
489 | __raise_softirq_irqoff(HI_SOFTIRQ); | 501 | __raise_softirq_irqoff(HI_SOFTIRQ); |
490 | local_irq_enable(); | 502 | local_irq_enable(); |
491 | } | 503 | } |
@@ -721,7 +733,6 @@ static int run_ksoftirqd(void * __bind_cpu) | |||
721 | { | 733 | { |
722 | set_current_state(TASK_INTERRUPTIBLE); | 734 | set_current_state(TASK_INTERRUPTIBLE); |
723 | 735 | ||
724 | current->flags |= PF_KSOFTIRQD; | ||
725 | while (!kthread_should_stop()) { | 736 | while (!kthread_should_stop()) { |
726 | preempt_disable(); | 737 | preempt_disable(); |
727 | if (!local_softirq_pending()) { | 738 | if (!local_softirq_pending()) { |
@@ -738,7 +749,10 @@ static int run_ksoftirqd(void * __bind_cpu) | |||
738 | don't process */ | 749 | don't process */ |
739 | if (cpu_is_offline((long)__bind_cpu)) | 750 | if (cpu_is_offline((long)__bind_cpu)) |
740 | goto wait_to_die; | 751 | goto wait_to_die; |
741 | do_softirq(); | 752 | local_irq_disable(); |
753 | if (local_softirq_pending()) | ||
754 | __do_softirq(); | ||
755 | local_irq_enable(); | ||
742 | preempt_enable_no_resched(); | 756 | preempt_enable_no_resched(); |
743 | cond_resched(); | 757 | cond_resched(); |
744 | preempt_disable(); | 758 | preempt_disable(); |
@@ -802,16 +816,16 @@ static void takeover_tasklets(unsigned int cpu) | |||
802 | 816 | ||
803 | /* Find end, append list for that CPU. */ | 817 | /* Find end, append list for that CPU. */ |
804 | if (&per_cpu(tasklet_vec, cpu).head != per_cpu(tasklet_vec, cpu).tail) { | 818 | if (&per_cpu(tasklet_vec, cpu).head != per_cpu(tasklet_vec, cpu).tail) { |
805 | *(__get_cpu_var(tasklet_vec).tail) = per_cpu(tasklet_vec, cpu).head; | 819 | *__this_cpu_read(tasklet_vec.tail) = per_cpu(tasklet_vec, cpu).head; |
806 | __get_cpu_var(tasklet_vec).tail = per_cpu(tasklet_vec, cpu).tail; | 820 | this_cpu_write(tasklet_vec.tail, per_cpu(tasklet_vec, cpu).tail); |
807 | per_cpu(tasklet_vec, cpu).head = NULL; | 821 | per_cpu(tasklet_vec, cpu).head = NULL; |
808 | per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head; | 822 | per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head; |
809 | } | 823 | } |
810 | raise_softirq_irqoff(TASKLET_SOFTIRQ); | 824 | raise_softirq_irqoff(TASKLET_SOFTIRQ); |
811 | 825 | ||
812 | if (&per_cpu(tasklet_hi_vec, cpu).head != per_cpu(tasklet_hi_vec, cpu).tail) { | 826 | if (&per_cpu(tasklet_hi_vec, cpu).head != per_cpu(tasklet_hi_vec, cpu).tail) { |
813 | *__get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).head; | 827 | *__this_cpu_read(tasklet_hi_vec.tail) = per_cpu(tasklet_hi_vec, cpu).head; |
814 | __get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).tail; | 828 | __this_cpu_write(tasklet_hi_vec.tail, per_cpu(tasklet_hi_vec, cpu).tail); |
815 | per_cpu(tasklet_hi_vec, cpu).head = NULL; | 829 | per_cpu(tasklet_hi_vec, cpu).head = NULL; |
816 | per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head; | 830 | per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head; |
817 | } | 831 | } |
@@ -853,7 +867,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, | |||
853 | cpumask_any(cpu_online_mask)); | 867 | cpumask_any(cpu_online_mask)); |
854 | case CPU_DEAD: | 868 | case CPU_DEAD: |
855 | case CPU_DEAD_FROZEN: { | 869 | case CPU_DEAD_FROZEN: { |
856 | static struct sched_param param = { | 870 | static const struct sched_param param = { |
857 | .sched_priority = MAX_RT_PRIO-1 | 871 | .sched_priority = MAX_RT_PRIO-1 |
858 | }; | 872 | }; |
859 | 873 | ||
@@ -885,25 +899,6 @@ static __init int spawn_ksoftirqd(void) | |||
885 | } | 899 | } |
886 | early_initcall(spawn_ksoftirqd); | 900 | early_initcall(spawn_ksoftirqd); |
887 | 901 | ||
888 | #ifdef CONFIG_SMP | ||
889 | /* | ||
890 | * Call a function on all processors | ||
891 | */ | ||
892 | int on_each_cpu(void (*func) (void *info), void *info, int wait) | ||
893 | { | ||
894 | int ret = 0; | ||
895 | |||
896 | preempt_disable(); | ||
897 | ret = smp_call_function(func, info, wait); | ||
898 | local_irq_disable(); | ||
899 | func(info); | ||
900 | local_irq_enable(); | ||
901 | preempt_enable(); | ||
902 | return ret; | ||
903 | } | ||
904 | EXPORT_SYMBOL(on_each_cpu); | ||
905 | #endif | ||
906 | |||
907 | /* | 902 | /* |
908 | * [ These __weak aliases are kept in a separate compilation unit, so that | 903 | * [ These __weak aliases are kept in a separate compilation unit, so that |
909 | * GCC does not inline them incorrectly. ] | 904 | * GCC does not inline them incorrectly. ] |
diff --git a/kernel/srcu.c b/kernel/srcu.c index 98d8c1e80edb..73ce23feaea9 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c | |||
@@ -156,6 +156,16 @@ void __srcu_read_unlock(struct srcu_struct *sp, int idx) | |||
156 | EXPORT_SYMBOL_GPL(__srcu_read_unlock); | 156 | EXPORT_SYMBOL_GPL(__srcu_read_unlock); |
157 | 157 | ||
158 | /* | 158 | /* |
159 | * We use an adaptive strategy for synchronize_srcu() and especially for | ||
160 | * synchronize_srcu_expedited(). We spin for a fixed time period | ||
161 | * (defined below) to allow SRCU readers to exit their read-side critical | ||
162 | * sections. If there are still some readers after 10 microseconds, | ||
163 | * we repeatedly block for 1-millisecond time periods. This approach | ||
164 | * has done well in testing, so there is no need for a config parameter. | ||
165 | */ | ||
166 | #define SYNCHRONIZE_SRCU_READER_DELAY 10 | ||
167 | |||
168 | /* | ||
159 | * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). | 169 | * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). |
160 | */ | 170 | */ |
161 | static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) | 171 | static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) |
@@ -207,11 +217,12 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) | |||
207 | * will have finished executing. We initially give readers | 217 | * will have finished executing. We initially give readers |
208 | * an arbitrarily chosen 10 microseconds to get out of their | 218 | * an arbitrarily chosen 10 microseconds to get out of their |
209 | * SRCU read-side critical sections, then loop waiting 1/HZ | 219 | * SRCU read-side critical sections, then loop waiting 1/HZ |
210 | * seconds per iteration. | 220 | * seconds per iteration. The 10-microsecond value has done |
221 | * very well in testing. | ||
211 | */ | 222 | */ |
212 | 223 | ||
213 | if (srcu_readers_active_idx(sp, idx)) | 224 | if (srcu_readers_active_idx(sp, idx)) |
214 | udelay(CONFIG_SRCU_SYNCHRONIZE_DELAY); | 225 | udelay(SYNCHRONIZE_SRCU_READER_DELAY); |
215 | while (srcu_readers_active_idx(sp, idx)) | 226 | while (srcu_readers_active_idx(sp, idx)) |
216 | schedule_timeout_interruptible(1); | 227 | schedule_timeout_interruptible(1); |
217 | 228 | ||
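The new constant replaces CONFIG_SRCU_SYNCHRONIZE_DELAY with a fixed 10-microsecond spin before falling back to sleeping. A hedged sketch of that spin-then-sleep shape, with readers_active() standing in for srcu_readers_active_idx():

static void wait_for_srcu_readers(bool (*readers_active)(void))
{
        if (readers_active())
                udelay(10);                             /* cheap spin: most readers are short */
        while (readers_active())
                schedule_timeout_interruptible(1);      /* then back off, one jiffy at a time */
}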
diff --git a/kernel/sys.c b/kernel/sys.c index 2745dcdb6c6c..18da702ec813 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -43,6 +43,8 @@ | |||
43 | #include <linux/kprobes.h> | 43 | #include <linux/kprobes.h> |
44 | #include <linux/user_namespace.h> | 44 | #include <linux/user_namespace.h> |
45 | 45 | ||
46 | #include <linux/kmsg_dump.h> | ||
47 | |||
46 | #include <asm/uaccess.h> | 48 | #include <asm/uaccess.h> |
47 | #include <asm/io.h> | 49 | #include <asm/io.h> |
48 | #include <asm/unistd.h> | 50 | #include <asm/unistd.h> |
@@ -285,6 +287,7 @@ out_unlock: | |||
285 | */ | 287 | */ |
286 | void emergency_restart(void) | 288 | void emergency_restart(void) |
287 | { | 289 | { |
290 | kmsg_dump(KMSG_DUMP_EMERG); | ||
288 | machine_emergency_restart(); | 291 | machine_emergency_restart(); |
289 | } | 292 | } |
290 | EXPORT_SYMBOL_GPL(emergency_restart); | 293 | EXPORT_SYMBOL_GPL(emergency_restart); |
@@ -312,6 +315,7 @@ void kernel_restart(char *cmd) | |||
312 | printk(KERN_EMERG "Restarting system.\n"); | 315 | printk(KERN_EMERG "Restarting system.\n"); |
313 | else | 316 | else |
314 | printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd); | 317 | printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd); |
318 | kmsg_dump(KMSG_DUMP_RESTART); | ||
315 | machine_restart(cmd); | 319 | machine_restart(cmd); |
316 | } | 320 | } |
317 | EXPORT_SYMBOL_GPL(kernel_restart); | 321 | EXPORT_SYMBOL_GPL(kernel_restart); |
@@ -333,6 +337,7 @@ void kernel_halt(void) | |||
333 | kernel_shutdown_prepare(SYSTEM_HALT); | 337 | kernel_shutdown_prepare(SYSTEM_HALT); |
334 | sysdev_shutdown(); | 338 | sysdev_shutdown(); |
335 | printk(KERN_EMERG "System halted.\n"); | 339 | printk(KERN_EMERG "System halted.\n"); |
340 | kmsg_dump(KMSG_DUMP_HALT); | ||
336 | machine_halt(); | 341 | machine_halt(); |
337 | } | 342 | } |
338 | 343 | ||
@@ -351,6 +356,7 @@ void kernel_power_off(void) | |||
351 | disable_nonboot_cpus(); | 356 | disable_nonboot_cpus(); |
352 | sysdev_shutdown(); | 357 | sysdev_shutdown(); |
353 | printk(KERN_EMERG "Power down.\n"); | 358 | printk(KERN_EMERG "Power down.\n"); |
359 | kmsg_dump(KMSG_DUMP_POWEROFF); | ||
354 | machine_power_off(); | 360 | machine_power_off(); |
355 | } | 361 | } |
356 | EXPORT_SYMBOL_GPL(kernel_power_off); | 362 | EXPORT_SYMBOL_GPL(kernel_power_off); |
@@ -1379,7 +1385,8 @@ static int check_prlimit_permission(struct task_struct *task) | |||
1379 | const struct cred *cred = current_cred(), *tcred; | 1385 | const struct cred *cred = current_cred(), *tcred; |
1380 | 1386 | ||
1381 | tcred = __task_cred(task); | 1387 | tcred = __task_cred(task); |
1382 | if ((cred->uid != tcred->euid || | 1388 | if (current != task && |
1389 | (cred->uid != tcred->euid || | ||
1383 | cred->uid != tcred->suid || | 1390 | cred->uid != tcred->suid || |
1384 | cred->uid != tcred->uid || | 1391 | cred->uid != tcred->uid || |
1385 | cred->gid != tcred->egid || | 1392 | cred->gid != tcred->egid || |
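The added "current != task &&" term lets a task adjust its own resource limits without any credential match; for another task the caller's ids must still match all of the target's (any capability override sits outside this excerpt). A simplified, hedged sketch of the rule with condensed field names:

static int may_prlimit_sketch(bool same_task, uid_t uid, uid_t tgt_uid,
                              gid_t gid, gid_t tgt_gid)
{
        if (same_task)
                return 0;                       /* operating on yourself is always allowed */
        if (uid == tgt_uid && gid == tgt_gid)
                return 0;                       /* caller's credentials match the target */
        return -EPERM;
}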
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index c782fe9924c7..25cc41cd8f33 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c | |||
@@ -186,3 +186,8 @@ cond_syscall(sys_perf_event_open); | |||
186 | /* fanotify! */ | 186 | /* fanotify! */ |
187 | cond_syscall(sys_fanotify_init); | 187 | cond_syscall(sys_fanotify_init); |
188 | cond_syscall(sys_fanotify_mark); | 188 | cond_syscall(sys_fanotify_mark); |
189 | |||
190 | /* open by handle */ | ||
191 | cond_syscall(sys_name_to_handle_at); | ||
192 | cond_syscall(sys_open_by_handle_at); | ||
193 | cond_syscall(compat_sys_open_by_handle_at); | ||
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ae5cbb1e3ced..51054fea5d99 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -24,6 +24,7 @@ | |||
24 | #include <linux/slab.h> | 24 | #include <linux/slab.h> |
25 | #include <linux/sysctl.h> | 25 | #include <linux/sysctl.h> |
26 | #include <linux/signal.h> | 26 | #include <linux/signal.h> |
27 | #include <linux/printk.h> | ||
27 | #include <linux/proc_fs.h> | 28 | #include <linux/proc_fs.h> |
28 | #include <linux/security.h> | 29 | #include <linux/security.h> |
29 | #include <linux/ctype.h> | 30 | #include <linux/ctype.h> |
@@ -169,7 +170,8 @@ static int proc_taint(struct ctl_table *table, int write, | |||
169 | #endif | 170 | #endif |
170 | 171 | ||
171 | #ifdef CONFIG_MAGIC_SYSRQ | 172 | #ifdef CONFIG_MAGIC_SYSRQ |
172 | static int __sysrq_enabled; /* Note: sysrq code ises it's own private copy */ | 173 | /* Note: sysrq code uses it's own private copy */ |
172 | static int __sysrq_enabled; /* Note: sysrq code ises it's own private copy */ | 173 | /* Note: sysrq code uses its own private copy */ |
174 | static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE; | ||
173 | 175 | ||
174 | static int sysrq_sysctl_handler(ctl_table *table, int write, | 176 | static int sysrq_sysctl_handler(ctl_table *table, int write, |
175 | void __user *buffer, size_t *lenp, | 177 | void __user *buffer, size_t *lenp, |
@@ -192,9 +194,9 @@ static int sysrq_sysctl_handler(ctl_table *table, int write, | |||
192 | static struct ctl_table root_table[]; | 194 | static struct ctl_table root_table[]; |
193 | static struct ctl_table_root sysctl_table_root; | 195 | static struct ctl_table_root sysctl_table_root; |
194 | static struct ctl_table_header root_table_header = { | 196 | static struct ctl_table_header root_table_header = { |
195 | .count = 1, | 197 | {{.count = 1, |
196 | .ctl_table = root_table, | 198 | .ctl_table = root_table, |
197 | .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list), | 199 | .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list),}}, |
198 | .root = &sysctl_table_root, | 200 | .root = &sysctl_table_root, |
199 | .set = &sysctl_table_root.default_set, | 201 | .set = &sysctl_table_root.default_set, |
200 | }; | 202 | }; |
@@ -245,10 +247,6 @@ static struct ctl_table root_table[] = { | |||
245 | .mode = 0555, | 247 | .mode = 0555, |
246 | .child = dev_table, | 248 | .child = dev_table, |
247 | }, | 249 | }, |
248 | /* | ||
249 | * NOTE: do not add new entries to this table unless you have read | ||
250 | * Documentation/sysctl/ctl_unnumbered.txt | ||
251 | */ | ||
252 | { } | 250 | { } |
253 | }; | 251 | }; |
254 | 252 | ||
@@ -363,20 +361,13 @@ static struct ctl_table kern_table[] = { | |||
363 | .mode = 0644, | 361 | .mode = 0644, |
364 | .proc_handler = sched_rt_handler, | 362 | .proc_handler = sched_rt_handler, |
365 | }, | 363 | }, |
366 | { | ||
367 | .procname = "sched_compat_yield", | ||
368 | .data = &sysctl_sched_compat_yield, | ||
369 | .maxlen = sizeof(unsigned int), | ||
370 | .mode = 0644, | ||
371 | .proc_handler = proc_dointvec, | ||
372 | }, | ||
373 | #ifdef CONFIG_SCHED_AUTOGROUP | 364 | #ifdef CONFIG_SCHED_AUTOGROUP |
374 | { | 365 | { |
375 | .procname = "sched_autogroup_enabled", | 366 | .procname = "sched_autogroup_enabled", |
376 | .data = &sysctl_sched_autogroup_enabled, | 367 | .data = &sysctl_sched_autogroup_enabled, |
377 | .maxlen = sizeof(unsigned int), | 368 | .maxlen = sizeof(unsigned int), |
378 | .mode = 0644, | 369 | .mode = 0644, |
379 | .proc_handler = proc_dointvec, | 370 | .proc_handler = proc_dointvec_minmax, |
380 | .extra1 = &zero, | 371 | .extra1 = &zero, |
381 | .extra2 = &one, | 372 | .extra2 = &one, |
382 | }, | 373 | }, |
@@ -710,6 +701,15 @@ static struct ctl_table kern_table[] = { | |||
710 | .extra1 = &zero, | 701 | .extra1 = &zero, |
711 | .extra2 = &one, | 702 | .extra2 = &one, |
712 | }, | 703 | }, |
704 | { | ||
705 | .procname = "kptr_restrict", | ||
706 | .data = &kptr_restrict, | ||
707 | .maxlen = sizeof(int), | ||
708 | .mode = 0644, | ||
709 | .proc_handler = proc_dointvec_minmax, | ||
710 | .extra1 = &zero, | ||
711 | .extra2 = &two, | ||
712 | }, | ||
713 | #endif | 713 | #endif |
714 | { | 714 | { |
715 | .procname = "ngroups_max", | 715 | .procname = "ngroups_max", |
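The new kptr_restrict knob (clamped to 0..2 above) controls how the printk %pK format prints kernel pointers: 0 prints the real address, 1 hides it from readers without the required capability, 2 always prints zeros. A hedged illustration:

static void show_object_addr(const void *obj)
{
        /* %pK honours kptr_restrict; plain %p would always print the raw pointer */
        printk(KERN_INFO "object at %pK\n", obj);
}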
@@ -941,7 +941,7 @@ static struct ctl_table kern_table[] = { | |||
941 | .data = &sysctl_perf_event_sample_rate, | 941 | .data = &sysctl_perf_event_sample_rate, |
942 | .maxlen = sizeof(sysctl_perf_event_sample_rate), | 942 | .maxlen = sizeof(sysctl_perf_event_sample_rate), |
943 | .mode = 0644, | 943 | .mode = 0644, |
944 | .proc_handler = proc_dointvec, | 944 | .proc_handler = perf_proc_update_handler, |
945 | }, | 945 | }, |
946 | #endif | 946 | #endif |
947 | #ifdef CONFIG_KMEMCHECK | 947 | #ifdef CONFIG_KMEMCHECK |
@@ -962,10 +962,6 @@ static struct ctl_table kern_table[] = { | |||
962 | .proc_handler = proc_dointvec, | 962 | .proc_handler = proc_dointvec, |
963 | }, | 963 | }, |
964 | #endif | 964 | #endif |
965 | /* | ||
966 | * NOTE: do not add new entries to this table unless you have read | ||
967 | * Documentation/sysctl/ctl_unnumbered.txt | ||
968 | */ | ||
969 | { } | 965 | { } |
970 | }; | 966 | }; |
971 | 967 | ||
@@ -1326,11 +1322,6 @@ static struct ctl_table vm_table[] = { | |||
1326 | .extra2 = &one, | 1322 | .extra2 = &one, |
1327 | }, | 1323 | }, |
1328 | #endif | 1324 | #endif |
1329 | |||
1330 | /* | ||
1331 | * NOTE: do not add new entries to this table unless you have read | ||
1332 | * Documentation/sysctl/ctl_unnumbered.txt | ||
1333 | */ | ||
1334 | { } | 1325 | { } |
1335 | }; | 1326 | }; |
1336 | 1327 | ||
@@ -1486,10 +1477,6 @@ static struct ctl_table fs_table[] = { | |||
1486 | .proc_handler = &pipe_proc_fn, | 1477 | .proc_handler = &pipe_proc_fn, |
1487 | .extra1 = &pipe_min_size, | 1478 | .extra1 = &pipe_min_size, |
1488 | }, | 1479 | }, |
1489 | /* | ||
1490 | * NOTE: do not add new entries to this table unless you have read | ||
1491 | * Documentation/sysctl/ctl_unnumbered.txt | ||
1492 | */ | ||
1493 | { } | 1480 | { } |
1494 | }; | 1481 | }; |
1495 | 1482 | ||
@@ -1573,11 +1560,16 @@ void sysctl_head_get(struct ctl_table_header *head) | |||
1573 | spin_unlock(&sysctl_lock); | 1560 | spin_unlock(&sysctl_lock); |
1574 | } | 1561 | } |
1575 | 1562 | ||
1563 | static void free_head(struct rcu_head *rcu) | ||
1564 | { | ||
1565 | kfree(container_of(rcu, struct ctl_table_header, rcu)); | ||
1566 | } | ||
1567 | |||
1576 | void sysctl_head_put(struct ctl_table_header *head) | 1568 | void sysctl_head_put(struct ctl_table_header *head) |
1577 | { | 1569 | { |
1578 | spin_lock(&sysctl_lock); | 1570 | spin_lock(&sysctl_lock); |
1579 | if (!--head->count) | 1571 | if (!--head->count) |
1580 | kfree(head); | 1572 | call_rcu(&head->rcu, free_head); |
1581 | spin_unlock(&sysctl_lock); | 1573 | spin_unlock(&sysctl_lock); |
1582 | } | 1574 | } |
1583 | 1575 | ||
@@ -1954,10 +1946,10 @@ void unregister_sysctl_table(struct ctl_table_header * header) | |||
1954 | start_unregistering(header); | 1946 | start_unregistering(header); |
1955 | if (!--header->parent->count) { | 1947 | if (!--header->parent->count) { |
1956 | WARN_ON(1); | 1948 | WARN_ON(1); |
1957 | kfree(header->parent); | 1949 | call_rcu(&header->parent->rcu, free_head); |
1958 | } | 1950 | } |
1959 | if (!--header->count) | 1951 | if (!--header->count) |
1960 | kfree(header); | 1952 | call_rcu(&header->rcu, free_head); |
1961 | spin_unlock(&sysctl_lock); | 1953 | spin_unlock(&sysctl_lock); |
1962 | } | 1954 | } |
1963 | 1955 | ||
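Freeing the headers through call_rcu() lets lookups that walk the sysctl lists under rcu_read_lock() keep using an entry whose refcount just dropped to zero; the kfree() only happens after a grace period. A hedged sketch of the pattern, with illustrative names:

struct header_sketch {
        int             count;
        struct rcu_head rcu;
};

static void header_free_rcu(struct rcu_head *rcu)
{
        kfree(container_of(rcu, struct header_sketch, rcu));
}

static void header_put(struct header_sketch *h)         /* caller holds the owning lock */
{
        if (!--h->count)
                call_rcu(&h->rcu, header_free_rcu);     /* freed after a grace period */
}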
@@ -2899,7 +2891,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, | |||
2899 | } | 2891 | } |
2900 | } | 2892 | } |
2901 | 2893 | ||
2902 | #else /* CONFIG_PROC_FS */ | 2894 | #else /* CONFIG_PROC_SYSCTL */ |
2903 | 2895 | ||
2904 | int proc_dostring(struct ctl_table *table, int write, | 2896 | int proc_dostring(struct ctl_table *table, int write, |
2905 | void __user *buffer, size_t *lenp, loff_t *ppos) | 2897 | void __user *buffer, size_t *lenp, loff_t *ppos) |
@@ -2951,7 +2943,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, | |||
2951 | } | 2943 | } |
2952 | 2944 | ||
2953 | 2945 | ||
2954 | #endif /* CONFIG_PROC_FS */ | 2946 | #endif /* CONFIG_PROC_SYSCTL */ |
2955 | 2947 | ||
2956 | /* | 2948 | /* |
2957 | * No sense putting this after each symbol definition, twice, | 2949 | * No sense putting this after each symbol definition, twice, |
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 4b2545a136ff..3b8e028b9601 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c | |||
@@ -1192,7 +1192,7 @@ static ssize_t bin_dn_node_address(struct file *file, | |||
1192 | 1192 | ||
1193 | buf[result] = '\0'; | 1193 | buf[result] = '\0'; |
1194 | 1194 | ||
1195 | /* Convert the decnet addresss to binary */ | 1195 | /* Convert the decnet address to binary */ |
1196 | result = -EIO; | 1196 | result = -EIO; |
1197 | nodep = strchr(buf, '.') + 1; | 1197 | nodep = strchr(buf, '.') + 1; |
1198 | if (!nodep) | 1198 | if (!nodep) |
@@ -1321,13 +1321,11 @@ static ssize_t binary_sysctl(const int *name, int nlen, | |||
1321 | void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) | 1321 | void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) |
1322 | { | 1322 | { |
1323 | const struct bin_table *table = NULL; | 1323 | const struct bin_table *table = NULL; |
1324 | struct nameidata nd; | ||
1325 | struct vfsmount *mnt; | 1324 | struct vfsmount *mnt; |
1326 | struct file *file; | 1325 | struct file *file; |
1327 | ssize_t result; | 1326 | ssize_t result; |
1328 | char *pathname; | 1327 | char *pathname; |
1329 | int flags; | 1328 | int flags; |
1330 | int acc_mode; | ||
1331 | 1329 | ||
1332 | pathname = sysctl_getname(name, nlen, &table); | 1330 | pathname = sysctl_getname(name, nlen, &table); |
1333 | result = PTR_ERR(pathname); | 1331 | result = PTR_ERR(pathname); |
@@ -1337,28 +1335,17 @@ static ssize_t binary_sysctl(const int *name, int nlen, | |||
1337 | /* How should the sysctl be accessed? */ | 1335 | /* How should the sysctl be accessed? */ |
1338 | if (oldval && oldlen && newval && newlen) { | 1336 | if (oldval && oldlen && newval && newlen) { |
1339 | flags = O_RDWR; | 1337 | flags = O_RDWR; |
1340 | acc_mode = MAY_READ | MAY_WRITE; | ||
1341 | } else if (newval && newlen) { | 1338 | } else if (newval && newlen) { |
1342 | flags = O_WRONLY; | 1339 | flags = O_WRONLY; |
1343 | acc_mode = MAY_WRITE; | ||
1344 | } else if (oldval && oldlen) { | 1340 | } else if (oldval && oldlen) { |
1345 | flags = O_RDONLY; | 1341 | flags = O_RDONLY; |
1346 | acc_mode = MAY_READ; | ||
1347 | } else { | 1342 | } else { |
1348 | result = 0; | 1343 | result = 0; |
1349 | goto out_putname; | 1344 | goto out_putname; |
1350 | } | 1345 | } |
1351 | 1346 | ||
1352 | mnt = current->nsproxy->pid_ns->proc_mnt; | 1347 | mnt = current->nsproxy->pid_ns->proc_mnt; |
1353 | result = vfs_path_lookup(mnt->mnt_root, mnt, pathname, 0, &nd); | 1348 | file = file_open_root(mnt->mnt_root, mnt, pathname, flags); |
1354 | if (result) | ||
1355 | goto out_putname; | ||
1356 | |||
1357 | result = may_open(&nd.path, acc_mode, flags); | ||
1358 | if (result) | ||
1359 | goto out_putpath; | ||
1360 | |||
1361 | file = dentry_open(nd.path.dentry, nd.path.mnt, flags, current_cred()); | ||
1362 | result = PTR_ERR(file); | 1349 | result = PTR_ERR(file); |
1363 | if (IS_ERR(file)) | 1350 | if (IS_ERR(file)) |
1364 | goto out_putname; | 1351 | goto out_putname; |
@@ -1370,10 +1357,6 @@ out_putname: | |||
1370 | putname(pathname); | 1357 | putname(pathname); |
1371 | out: | 1358 | out: |
1372 | return result; | 1359 | return result; |
1373 | |||
1374 | out_putpath: | ||
1375 | path_put(&nd.path); | ||
1376 | goto out_putname; | ||
1377 | } | 1360 | } |
1378 | 1361 | ||
1379 | 1362 | ||
diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 3308fd7f1b52..3971c6b9d58d 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c | |||
@@ -89,8 +89,7 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, | |||
89 | return -ENOMEM; | 89 | return -ENOMEM; |
90 | 90 | ||
91 | if (!info) { | 91 | if (!info) { |
92 | int seq = get_cpu_var(taskstats_seqnum)++; | 92 | int seq = this_cpu_inc_return(taskstats_seqnum) - 1; |
93 | put_cpu_var(taskstats_seqnum); | ||
94 | 93 | ||
95 | reply = genlmsg_put(skb, 0, seq, &family, 0, cmd); | 94 | reply = genlmsg_put(skb, 0, seq, &family, 0, cmd); |
96 | } else | 95 | } else |
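this_cpu_inc_return() folds the old get_cpu_var()/put_cpu_var() pair, which bracketed the increment with preempt_disable()/preempt_enable(), into a single preempt-safe operation. A hedged one-function sketch:

static DEFINE_PER_CPU(int, seq_sketch);

static int next_seq_sketch(void)
{
        /* old: seq = get_cpu_var(seq_sketch)++; put_cpu_var(seq_sketch); */
        return this_cpu_inc_return(seq_sketch) - 1;     /* same post-increment value */
}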
@@ -349,7 +348,7 @@ static int parse(struct nlattr *na, struct cpumask *mask) | |||
349 | return ret; | 348 | return ret; |
350 | } | 349 | } |
351 | 350 | ||
352 | #ifdef CONFIG_IA64 | 351 | #if defined(CONFIG_64BIT) && !defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) |
353 | #define TASKSTATS_NEEDS_PADDING 1 | 352 | #define TASKSTATS_NEEDS_PADDING 1 |
354 | #endif | 353 | #endif |
355 | 354 | ||
@@ -612,7 +611,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead) | |||
612 | fill_tgid_exit(tsk); | 611 | fill_tgid_exit(tsk); |
613 | } | 612 | } |
614 | 613 | ||
615 | listeners = &__raw_get_cpu_var(listener_array); | 614 | listeners = __this_cpu_ptr(&listener_array); |
616 | if (list_empty(&listeners->list)) | 615 | if (list_empty(&listeners->list)) |
617 | return; | 616 | return; |
618 | 617 | ||
diff --git a/kernel/time.c b/kernel/time.c index ba9b338d1835..8e8dc6d705c9 100644 --- a/kernel/time.c +++ b/kernel/time.c | |||
@@ -150,7 +150,7 @@ static inline void warp_clock(void) | |||
150 | * various programs will get confused when the clock gets warped. | 150 | * various programs will get confused when the clock gets warped. |
151 | */ | 151 | */ |
152 | 152 | ||
153 | int do_sys_settimeofday(struct timespec *tv, struct timezone *tz) | 153 | int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz) |
154 | { | 154 | { |
155 | static int firsttime = 1; | 155 | static int firsttime = 1; |
156 | int error = 0; | 156 | int error = 0; |
@@ -238,7 +238,7 @@ EXPORT_SYMBOL(current_fs_time); | |||
238 | * Avoid unnecessary multiplications/divisions in the | 238 | * Avoid unnecessary multiplications/divisions in the |
239 | * two most common HZ cases: | 239 | * two most common HZ cases: |
240 | */ | 240 | */ |
241 | unsigned int inline jiffies_to_msecs(const unsigned long j) | 241 | inline unsigned int jiffies_to_msecs(const unsigned long j) |
242 | { | 242 | { |
243 | #if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) | 243 | #if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) |
244 | return (MSEC_PER_SEC / HZ) * j; | 244 | return (MSEC_PER_SEC / HZ) * j; |
@@ -254,7 +254,7 @@ unsigned int inline jiffies_to_msecs(const unsigned long j) | |||
254 | } | 254 | } |
255 | EXPORT_SYMBOL(jiffies_to_msecs); | 255 | EXPORT_SYMBOL(jiffies_to_msecs); |
256 | 256 | ||
257 | unsigned int inline jiffies_to_usecs(const unsigned long j) | 257 | inline unsigned int jiffies_to_usecs(const unsigned long j) |
258 | { | 258 | { |
259 | #if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) | 259 | #if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) |
260 | return (USEC_PER_SEC / HZ) * j; | 260 | return (USEC_PER_SEC / HZ) * j; |
@@ -645,7 +645,7 @@ u64 nsec_to_clock_t(u64 x) | |||
645 | } | 645 | } |
646 | 646 | ||
647 | /** | 647 | /** |
648 | * nsecs_to_jiffies - Convert nsecs in u64 to jiffies | 648 | * nsecs_to_jiffies64 - Convert nsecs in u64 to jiffies64 |
649 | * | 649 | * |
650 | * @n: nsecs in u64 | 650 | * @n: nsecs in u64 |
651 | * | 651 | * |
@@ -657,7 +657,7 @@ u64 nsec_to_clock_t(u64 x) | |||
657 | * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512) | 657 | * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512) |
658 | * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years | 658 | * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years |
659 | */ | 659 | */ |
660 | unsigned long nsecs_to_jiffies(u64 n) | 660 | u64 nsecs_to_jiffies64(u64 n) |
661 | { | 661 | { |
662 | #if (NSEC_PER_SEC % HZ) == 0 | 662 | #if (NSEC_PER_SEC % HZ) == 0 |
663 | /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */ | 663 | /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */ |
@@ -674,22 +674,23 @@ unsigned long nsecs_to_jiffies(u64 n) | |||
674 | #endif | 674 | #endif |
675 | } | 675 | } |
676 | 676 | ||
677 | #if (BITS_PER_LONG < 64) | 677 | /** |
678 | u64 get_jiffies_64(void) | 678 | * nsecs_to_jiffies - Convert nsecs in u64 to jiffies |
679 | * | ||
680 | * @n: nsecs in u64 | ||
681 | * | ||
682 | * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64. | ||
683 | * And this doesn't return MAX_JIFFY_OFFSET since this function is designed | ||
684 | * for scheduler, not for use in device drivers to calculate timeout value. | ||
685 | * | ||
686 | * note: | ||
687 | * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512) | ||
688 | * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years | ||
689 | */ | ||
690 | unsigned long nsecs_to_jiffies(u64 n) | ||
679 | { | 691 | { |
680 | unsigned long seq; | 692 | return (unsigned long)nsecs_to_jiffies64(n); |
681 | u64 ret; | ||
682 | |||
683 | do { | ||
684 | seq = read_seqbegin(&xtime_lock); | ||
685 | ret = jiffies_64; | ||
686 | } while (read_seqretry(&xtime_lock, seq)); | ||
687 | return ret; | ||
688 | } | 693 | } |
689 | EXPORT_SYMBOL(get_jiffies_64); | ||
690 | #endif | ||
691 | |||
692 | EXPORT_SYMBOL(jiffies); | ||
693 | 694 | ||
694 | /* | 695 | /* |
695 | * Add two timespec values and do a safety check for overflow. | 696 | * Add two timespec values and do a safety check for overflow. |
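nsecs_to_jiffies() is now a thin wrapper around the 64-bit core above. For the common configurations where NSEC_PER_SEC is a multiple of HZ, the conversion reduces to a single divide; a hedged sketch of that case only:

static inline u64 ns_to_jiffies64_sketch(u64 n)
{
        /*
         * Common case only (HZ = 100, 250, 1000, ...); the real helper also
         * covers HZ values that do not divide 10^9 evenly.
         */
        return div_u64(n, NSEC_PER_SEC / HZ);
}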
diff --git a/kernel/time/Makefile b/kernel/time/Makefile index ee266620b06c..b0425991e9ac 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile | |||
@@ -1,4 +1,5 @@ | |||
1 | obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o timeconv.o | 1 | obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o |
2 | obj-y += timeconv.o posix-clock.o | ||
2 | 3 | ||
3 | obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o | 4 | obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o |
4 | obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o | 5 | obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o |
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index d7395fdfb9f3..0d74b9ba90c8 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c | |||
@@ -18,7 +18,6 @@ | |||
18 | #include <linux/notifier.h> | 18 | #include <linux/notifier.h> |
19 | #include <linux/smp.h> | 19 | #include <linux/smp.h> |
20 | #include <linux/sysdev.h> | 20 | #include <linux/sysdev.h> |
21 | #include <linux/tick.h> | ||
22 | 21 | ||
23 | #include "tick-internal.h" | 22 | #include "tick-internal.h" |
24 | 23 | ||
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index c18d7efa1b4b..6519cf62d9cd 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c | |||
@@ -113,7 +113,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time); | |||
113 | * @shift: pointer to shift variable | 113 | * @shift: pointer to shift variable |
114 | * @from: frequency to convert from | 114 | * @from: frequency to convert from |
115 | * @to: frequency to convert to | 115 | * @to: frequency to convert to |
116 | * @minsec: guaranteed runtime conversion range in seconds | 116 | * @maxsec: guaranteed runtime conversion range in seconds |
117 | * | 117 | * |
118 | * The function evaluates the shift/mult pair for the scaled math | 118 | * The function evaluates the shift/mult pair for the scaled math |
119 | * operations of clocksources and clockevents. | 119 | * operations of clocksources and clockevents. |
@@ -122,7 +122,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time); | |||
122 | * NSEC_PER_SEC == 1GHz and @from is the counter frequency. For clock | 122 | * NSEC_PER_SEC == 1GHz and @from is the counter frequency. For clock |
123 | * event @to is the counter frequency and @from is NSEC_PER_SEC. | 123 | * event @to is the counter frequency and @from is NSEC_PER_SEC. |
124 | * | 124 | * |
125 | * The @minsec conversion range argument controls the time frame in | 125 | * The @maxsec conversion range argument controls the time frame in |
126 | * seconds which must be covered by the runtime conversion with the | 126 | * seconds which must be covered by the runtime conversion with the |
127 | * calculated mult and shift factors. This guarantees that no 64bit | 127 | * calculated mult and shift factors. This guarantees that no 64bit |
128 | * overflow happens when the input value of the conversion is | 128 | * overflow happens when the input value of the conversion is |
@@ -131,7 +131,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time); | |||
131 | * factors. | 131 | * factors. |
132 | */ | 132 | */ |
133 | void | 133 | void |
134 | clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec) | 134 | clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec) |
135 | { | 135 | { |
136 | u64 tmp; | 136 | u64 tmp; |
137 | u32 sft, sftacc= 32; | 137 | u32 sft, sftacc= 32; |
@@ -140,7 +140,7 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec) | |||
140 | * Calculate the shift factor which is limiting the conversion | 140 | * Calculate the shift factor which is limiting the conversion |
141 | * range: | 141 | * range: |
142 | */ | 142 | */ |
143 | tmp = ((u64)minsec * from) >> 32; | 143 | tmp = ((u64)maxsec * from) >> 32; |
144 | while (tmp) { | 144 | while (tmp) { |
145 | tmp >>=1; | 145 | tmp >>=1; |
146 | sftacc--; | 146 | sftacc--; |
@@ -152,6 +152,7 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec) | |||
152 | */ | 152 | */ |
153 | for (sft = 32; sft > 0; sft--) { | 153 | for (sft = 32; sft > 0; sft--) { |
154 | tmp = (u64) to << sft; | 154 | tmp = (u64) to << sft; |
155 | tmp += from / 2; | ||
155 | do_div(tmp, from); | 156 | do_div(tmp, from); |
156 | if ((tmp >> sftacc) == 0) | 157 | if ((tmp >> sftacc) == 0) |
157 | break; | 158 | break; |
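The added "tmp += from / 2" rounds mult to the nearest value instead of truncating, reducing the systematic error of the conversion. The resulting pair feeds the usual scaled multiply; a hedged usage sketch with illustrative numbers (a 32.768 kHz counter, 240 s guaranteed range):

static u64 example_cyc_to_ns(u64 cycles)
{
        u32 mult, shift;

        clocks_calc_mult_shift(&mult, &shift, 32768, NSEC_PER_SEC, 240);

        return (cycles * mult) >> shift;        /* same scaled math the timekeeping core uses */
}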
@@ -678,7 +679,7 @@ EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); | |||
678 | int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) | 679 | int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) |
679 | { | 680 | { |
680 | 681 | ||
681 | /* Intialize mult/shift and max_idle_ns */ | 682 | /* Initialize mult/shift and max_idle_ns */ |
682 | __clocksource_updatefreq_scale(cs, scale, freq); | 683 | __clocksource_updatefreq_scale(cs, scale, freq); |
683 | 684 | ||
684 | /* Add clocksource to the clcoksource list */ | 685 | /* Add clocksource to the clcoksource list */ |
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 5404a8456909..b2fa506667c0 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c | |||
@@ -22,8 +22,11 @@ | |||
22 | ************************************************************************/ | 22 | ************************************************************************/ |
23 | #include <linux/clocksource.h> | 23 | #include <linux/clocksource.h> |
24 | #include <linux/jiffies.h> | 24 | #include <linux/jiffies.h> |
25 | #include <linux/module.h> | ||
25 | #include <linux/init.h> | 26 | #include <linux/init.h> |
26 | 27 | ||
28 | #include "tick-internal.h" | ||
29 | |||
27 | /* The Jiffies based clocksource is the lowest common | 30 | /* The Jiffies based clocksource is the lowest common |
28 | * denominator clock source which should function on | 31 | * denominator clock source which should function on |
29 | * all systems. It has the same coarse resolution as | 32 | * all systems. It has the same coarse resolution as |
@@ -64,6 +67,23 @@ struct clocksource clocksource_jiffies = { | |||
64 | .shift = JIFFIES_SHIFT, | 67 | .shift = JIFFIES_SHIFT, |
65 | }; | 68 | }; |
66 | 69 | ||
70 | #if (BITS_PER_LONG < 64) | ||
71 | u64 get_jiffies_64(void) | ||
72 | { | ||
73 | unsigned long seq; | ||
74 | u64 ret; | ||
75 | |||
76 | do { | ||
77 | seq = read_seqbegin(&xtime_lock); | ||
78 | ret = jiffies_64; | ||
79 | } while (read_seqretry(&xtime_lock, seq)); | ||
80 | return ret; | ||
81 | } | ||
82 | EXPORT_SYMBOL(get_jiffies_64); | ||
83 | #endif | ||
84 | |||
85 | EXPORT_SYMBOL(jiffies); | ||
86 | |||
67 | static int __init init_jiffies_clocksource(void) | 87 | static int __init init_jiffies_clocksource(void) |
68 | { | 88 | { |
69 | return clocksource_register(&clocksource_jiffies); | 89 | return clocksource_register(&clocksource_jiffies); |
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index d2321891538f..5f1bb8e2008f 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c | |||
@@ -14,6 +14,9 @@ | |||
14 | #include <linux/timex.h> | 14 | #include <linux/timex.h> |
15 | #include <linux/time.h> | 15 | #include <linux/time.h> |
16 | #include <linux/mm.h> | 16 | #include <linux/mm.h> |
17 | #include <linux/module.h> | ||
18 | |||
19 | #include "tick-internal.h" | ||
17 | 20 | ||
18 | /* | 21 | /* |
19 | * NTP timekeeping variables: | 22 | * NTP timekeeping variables: |
@@ -74,6 +77,162 @@ static long time_adjust; | |||
74 | /* constant (boot-param configurable) NTP tick adjustment (upscaled) */ | 77 | /* constant (boot-param configurable) NTP tick adjustment (upscaled) */ |
75 | static s64 ntp_tick_adj; | 78 | static s64 ntp_tick_adj; |
76 | 79 | ||
80 | #ifdef CONFIG_NTP_PPS | ||
81 | |||
82 | /* | ||
83 | * The following variables are used when a pulse-per-second (PPS) signal | ||
84 | * is available. They establish the engineering parameters of the clock | ||
85 | * discipline loop when controlled by the PPS signal. | ||
86 | */ | ||
87 | #define PPS_VALID 10 /* PPS signal watchdog max (s) */ | ||
88 | #define PPS_POPCORN 4 /* popcorn spike threshold (shift) */ | ||
89 | #define PPS_INTMIN 2 /* min freq interval (s) (shift) */ | ||
90 | #define PPS_INTMAX 8 /* max freq interval (s) (shift) */ | ||
91 | #define PPS_INTCOUNT 4 /* number of consecutive good intervals to | ||
92 | increase pps_shift or consecutive bad | ||
93 | intervals to decrease it */ | ||
94 | #define PPS_MAXWANDER 100000 /* max PPS freq wander (ns/s) */ | ||
95 | |||
96 | static int pps_valid; /* signal watchdog counter */ | ||
97 | static long pps_tf[3]; /* phase median filter */ | ||
98 | static long pps_jitter; /* current jitter (ns) */ | ||
99 | static struct timespec pps_fbase; /* beginning of the last freq interval */ | ||
100 | static int pps_shift; /* current interval duration (s) (shift) */ | ||
101 | static int pps_intcnt; /* interval counter */ | ||
102 | static s64 pps_freq; /* frequency offset (scaled ns/s) */ | ||
103 | static long pps_stabil; /* current stability (scaled ns/s) */ | ||
104 | |||
105 | /* | ||
106 | * PPS signal quality monitors | ||
107 | */ | ||
108 | static long pps_calcnt; /* calibration intervals */ | ||
109 | static long pps_jitcnt; /* jitter limit exceeded */ | ||
110 | static long pps_stbcnt; /* stability limit exceeded */ | ||
111 | static long pps_errcnt; /* calibration errors */ | ||
112 | |||
113 | |||
114 | /* PPS kernel consumer compensates the whole phase error immediately. | ||
115 | * Otherwise, reduce the offset by a fixed factor times the time constant. | ||
116 | */ | ||
117 | static inline s64 ntp_offset_chunk(s64 offset) | ||
118 | { | ||
119 | if (time_status & STA_PPSTIME && time_status & STA_PPSSIGNAL) | ||
120 | return offset; | ||
121 | else | ||
122 | return shift_right(offset, SHIFT_PLL + time_constant); | ||
123 | } | ||
124 | |||
125 | static inline void pps_reset_freq_interval(void) | ||
126 | { | ||
127 | /* the PPS calibration interval may end | ||
128 | surprisingly early */ | ||
129 | pps_shift = PPS_INTMIN; | ||
130 | pps_intcnt = 0; | ||
131 | } | ||
132 | |||
133 | /** | ||
134 | * pps_clear - Clears the PPS state variables | ||
135 | * | ||
136 | * Must be called while holding a write on the xtime_lock | ||
137 | */ | ||
138 | static inline void pps_clear(void) | ||
139 | { | ||
140 | pps_reset_freq_interval(); | ||
141 | pps_tf[0] = 0; | ||
142 | pps_tf[1] = 0; | ||
143 | pps_tf[2] = 0; | ||
144 | pps_fbase.tv_sec = pps_fbase.tv_nsec = 0; | ||
145 | pps_freq = 0; | ||
146 | } | ||
147 | |||
148 | /* Decrease pps_valid to indicate that another second has passed since | ||
149 | * the last PPS signal. When it reaches 0, indicate that PPS signal is | ||
150 | * missing. | ||
151 | * | ||
152 | * Must be called while holding a write on the xtime_lock | ||
153 | */ | ||
154 | static inline void pps_dec_valid(void) | ||
155 | { | ||
156 | if (pps_valid > 0) | ||
157 | pps_valid--; | ||
158 | else { | ||
159 | time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | | ||
160 | STA_PPSWANDER | STA_PPSERROR); | ||
161 | pps_clear(); | ||
162 | } | ||
163 | } | ||
164 | |||
165 | static inline void pps_set_freq(s64 freq) | ||
166 | { | ||
167 | pps_freq = freq; | ||
168 | } | ||
169 | |||
170 | static inline int is_error_status(int status) | ||
171 | { | ||
172 | return (time_status & (STA_UNSYNC|STA_CLOCKERR)) | ||
173 | /* PPS signal lost when either PPS time or | ||
174 | * PPS frequency synchronization requested | ||
175 | */ | ||
176 | || ((time_status & (STA_PPSFREQ|STA_PPSTIME)) | ||
177 | && !(time_status & STA_PPSSIGNAL)) | ||
178 | /* PPS jitter exceeded when | ||
179 | * PPS time synchronization requested */ | ||
180 | || ((time_status & (STA_PPSTIME|STA_PPSJITTER)) | ||
181 | == (STA_PPSTIME|STA_PPSJITTER)) | ||
182 | /* PPS wander exceeded or calibration error when | ||
183 | * PPS frequency synchronization requested | ||
184 | */ | ||
185 | || ((time_status & STA_PPSFREQ) | ||
186 | && (time_status & (STA_PPSWANDER|STA_PPSERROR))); | ||
187 | } | ||
188 | |||
189 | static inline void pps_fill_timex(struct timex *txc) | ||
190 | { | ||
191 | txc->ppsfreq = shift_right((pps_freq >> PPM_SCALE_INV_SHIFT) * | ||
192 | PPM_SCALE_INV, NTP_SCALE_SHIFT); | ||
193 | txc->jitter = pps_jitter; | ||
194 | if (!(time_status & STA_NANO)) | ||
195 | txc->jitter /= NSEC_PER_USEC; | ||
196 | txc->shift = pps_shift; | ||
197 | txc->stabil = pps_stabil; | ||
198 | txc->jitcnt = pps_jitcnt; | ||
199 | txc->calcnt = pps_calcnt; | ||
200 | txc->errcnt = pps_errcnt; | ||
201 | txc->stbcnt = pps_stbcnt; | ||
202 | } | ||
203 | |||
204 | #else /* !CONFIG_NTP_PPS */ | ||
205 | |||
206 | static inline s64 ntp_offset_chunk(s64 offset) | ||
207 | { | ||
208 | return shift_right(offset, SHIFT_PLL + time_constant); | ||
209 | } | ||
210 | |||
211 | static inline void pps_reset_freq_interval(void) {} | ||
212 | static inline void pps_clear(void) {} | ||
213 | static inline void pps_dec_valid(void) {} | ||
214 | static inline void pps_set_freq(s64 freq) {} | ||
215 | |||
216 | static inline int is_error_status(int status) | ||
217 | { | ||
218 | return status & (STA_UNSYNC|STA_CLOCKERR); | ||
219 | } | ||
220 | |||
221 | static inline void pps_fill_timex(struct timex *txc) | ||
222 | { | ||
223 | /* PPS is not implemented, so these are zero */ | ||
224 | txc->ppsfreq = 0; | ||
225 | txc->jitter = 0; | ||
226 | txc->shift = 0; | ||
227 | txc->stabil = 0; | ||
228 | txc->jitcnt = 0; | ||
229 | txc->calcnt = 0; | ||
230 | txc->errcnt = 0; | ||
231 | txc->stbcnt = 0; | ||
232 | } | ||
233 | |||
234 | #endif /* CONFIG_NTP_PPS */ | ||
235 | |||
77 | /* | 236 | /* |
78 | * NTP methods: | 237 | * NTP methods: |
79 | */ | 238 | */ |
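ntp_offset_chunk() above decides how much of the accumulated phase offset is folded into tick_length each second: with PPS time synchronization active the whole offset is applied at once, otherwise only an exponentially decaying fraction controlled by SHIFT_PLL and the time constant. A hedged numeric sketch (the real code uses shift_right() so negative offsets divide correctly):

static s64 offset_chunk_sketch(s64 offset, int shift_pll, int time_constant, bool pps_time)
{
        if (pps_time)
                return offset;                          /* PPS: correct the full phase error now */
        return offset >> (shift_pll + time_constant);   /* PLL: apply 1/2^(PLL+tc) per second */
}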
@@ -185,6 +344,9 @@ void ntp_clear(void) | |||
185 | 344 | ||
186 | tick_length = tick_length_base; | 345 | tick_length = tick_length_base; |
187 | time_offset = 0; | 346 | time_offset = 0; |
347 | |||
348 | /* Clear PPS state variables */ | ||
349 | pps_clear(); | ||
188 | } | 350 | } |
189 | 351 | ||
190 | /* | 352 | /* |
@@ -250,16 +412,16 @@ void second_overflow(void) | |||
250 | time_status |= STA_UNSYNC; | 412 | time_status |= STA_UNSYNC; |
251 | } | 413 | } |
252 | 414 | ||
253 | /* | 415 | /* Compute the phase adjustment for the next second */ |
254 | * Compute the phase adjustment for the next second. The offset is | ||
255 | * reduced by a fixed factor times the time constant. | ||
256 | */ | ||
257 | tick_length = tick_length_base; | 416 | tick_length = tick_length_base; |
258 | 417 | ||
259 | delta = shift_right(time_offset, SHIFT_PLL + time_constant); | 418 | delta = ntp_offset_chunk(time_offset); |
260 | time_offset -= delta; | 419 | time_offset -= delta; |
261 | tick_length += delta; | 420 | tick_length += delta; |
262 | 421 | ||
422 | /* Check PPS signal */ | ||
423 | pps_dec_valid(); | ||
424 | |||
263 | if (!time_adjust) | 425 | if (!time_adjust) |
264 | return; | 426 | return; |
265 | 427 | ||
@@ -369,6 +531,8 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts) | |||
369 | if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) { | 531 | if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) { |
370 | time_state = TIME_OK; | 532 | time_state = TIME_OK; |
371 | time_status = STA_UNSYNC; | 533 | time_status = STA_UNSYNC; |
534 | /* restart PPS frequency calibration */ | ||
535 | pps_reset_freq_interval(); | ||
372 | } | 536 | } |
373 | 537 | ||
374 | /* | 538 | /* |
@@ -418,6 +582,8 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts | |||
418 | time_freq = txc->freq * PPM_SCALE; | 582 | time_freq = txc->freq * PPM_SCALE; |
419 | time_freq = min(time_freq, MAXFREQ_SCALED); | 583 | time_freq = min(time_freq, MAXFREQ_SCALED); |
420 | time_freq = max(time_freq, -MAXFREQ_SCALED); | 584 | time_freq = max(time_freq, -MAXFREQ_SCALED); |
585 | /* update pps_freq */ | ||
586 | pps_set_freq(time_freq); | ||
421 | } | 587 | } |
422 | 588 | ||
423 | if (txc->modes & ADJ_MAXERROR) | 589 | if (txc->modes & ADJ_MAXERROR) |
@@ -482,6 +648,17 @@ int do_adjtimex(struct timex *txc) | |||
482 | hrtimer_cancel(&leap_timer); | 648 | hrtimer_cancel(&leap_timer); |
483 | } | 649 | } |
484 | 650 | ||
651 | if (txc->modes & ADJ_SETOFFSET) { | ||
652 | struct timespec delta; | ||
653 | delta.tv_sec = txc->time.tv_sec; | ||
654 | delta.tv_nsec = txc->time.tv_usec; | ||
655 | if (!(txc->modes & ADJ_NANO)) | ||
656 | delta.tv_nsec *= 1000; | ||
657 | result = timekeeping_inject_offset(&delta); | ||
658 | if (result) | ||
659 | return result; | ||
660 | } | ||
661 | |||
485 | getnstimeofday(&ts); | 662 | getnstimeofday(&ts); |
486 | 663 | ||
487 | write_seqlock_irq(&xtime_lock); | 664 | write_seqlock_irq(&xtime_lock); |
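ADJ_SETOFFSET lets a caller step the clock by a signed offset through adjtimex(2) instead of reading the time, adding, and calling settimeofday(). A hedged userspace sketch (requires CAP_SYS_TIME; ADJ_SETOFFSET and ADJ_NANO must be provided by the installed headers):

#include <stdio.h>
#include <sys/timex.h>

int main(void)
{
        struct timex tx = { 0 };

        tx.modes = ADJ_SETOFFSET | ADJ_NANO;    /* time.tv_usec carries nanoseconds */
        tx.time.tv_sec = 0;
        tx.time.tv_usec = 500000000;            /* step the clock forward by 0.5 s */

        if (adjtimex(&tx) < 0)
                perror("adjtimex");
        return 0;
}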
@@ -508,7 +685,8 @@ int do_adjtimex(struct timex *txc) | |||
508 | } | 685 | } |
509 | 686 | ||
510 | result = time_state; /* mostly `TIME_OK' */ | 687 | result = time_state; /* mostly `TIME_OK' */ |
511 | if (time_status & (STA_UNSYNC|STA_CLOCKERR)) | 688 | /* check for errors */ |
689 | if (is_error_status(time_status)) | ||
512 | result = TIME_ERROR; | 690 | result = TIME_ERROR; |
513 | 691 | ||
514 | txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) * | 692 | txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) * |
@@ -522,15 +700,8 @@ int do_adjtimex(struct timex *txc) | |||
522 | txc->tick = tick_usec; | 700 | txc->tick = tick_usec; |
523 | txc->tai = time_tai; | 701 | txc->tai = time_tai; |
524 | 702 | ||
525 | /* PPS is not implemented, so these are zero */ | 703 | /* fill PPS status fields */ |
526 | txc->ppsfreq = 0; | 704 | pps_fill_timex(txc); |
527 | txc->jitter = 0; | ||
528 | txc->shift = 0; | ||
529 | txc->stabil = 0; | ||
530 | txc->jitcnt = 0; | ||
531 | txc->calcnt = 0; | ||
532 | txc->errcnt = 0; | ||
533 | txc->stbcnt = 0; | ||
534 | 705 | ||
535 | write_sequnlock_irq(&xtime_lock); | 706 | write_sequnlock_irq(&xtime_lock); |
536 | 707 | ||
@@ -544,6 +715,243 @@ int do_adjtimex(struct timex *txc) | |||
544 | return result; | 715 | return result; |
545 | } | 716 | } |
546 | 717 | ||
718 | #ifdef CONFIG_NTP_PPS | ||
719 | |||
720 | /* actually struct pps_normtime is good old struct timespec, but it is | ||
721 | * semantically different (and it is the reason why it was invented): | ||
722 | * pps_normtime.nsec has a range of ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] | ||
723 | * while timespec.tv_nsec has a range of [0, NSEC_PER_SEC) */ | ||
724 | struct pps_normtime { | ||
725 | __kernel_time_t sec; /* seconds */ | ||
726 | long nsec; /* nanoseconds */ | ||
727 | }; | ||
728 | |||
729 | /* normalize the timestamp so that nsec is in the | ||
730 | ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] interval */ | ||
731 | static inline struct pps_normtime pps_normalize_ts(struct timespec ts) | ||
732 | { | ||
733 | struct pps_normtime norm = { | ||
734 | .sec = ts.tv_sec, | ||
735 | .nsec = ts.tv_nsec | ||
736 | }; | ||
737 | |||
738 | if (norm.nsec > (NSEC_PER_SEC >> 1)) { | ||
739 | norm.nsec -= NSEC_PER_SEC; | ||
740 | norm.sec++; | ||
741 | } | ||
742 | |||
743 | return norm; | ||
744 | } | ||
745 | |||
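pps_normalize_ts() recenters the fractional second around zero, so a pulse arriving just before a second boundary yields a small negative phase error rather than an almost-full-second positive one. A standalone illustration of the mapping (a userspace reimplementation for clarity, not kernel code):

/* Illustration: the same normalization as pps_normalize_ts() above. */
#include <stdio.h>
#include <time.h>

#define NSEC_PER_SEC	1000000000L

struct pps_normtime { time_t sec; long nsec; };

static struct pps_normtime normalize(struct timespec ts)
{
	struct pps_normtime norm = { ts.tv_sec, ts.tv_nsec };

	if (norm.nsec > (NSEC_PER_SEC >> 1)) {
		norm.nsec -= NSEC_PER_SEC;
		norm.sec++;
	}
	return norm;
}

int main(void)
{
	struct timespec late = { 10, 900000000 };	/* 10.9 s */
	struct pps_normtime n = normalize(late);

	/* prints "11 s -100000000 ns": 100 ms early w.r.t. second 11 */
	printf("%ld s %ld ns\n", (long)n.sec, n.nsec);
	return 0;
}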
746 | /* get current phase correction and jitter */ | ||
747 | static inline long pps_phase_filter_get(long *jitter) | ||
748 | { | ||
749 | *jitter = pps_tf[0] - pps_tf[1]; | ||
750 | if (*jitter < 0) | ||
751 | *jitter = -*jitter; | ||
752 | |||
753 | /* TODO: test various filters */ | ||
754 | return pps_tf[0]; | ||
755 | } | ||
756 | |||
757 | /* add the sample to the phase filter */ | ||
758 | static inline void pps_phase_filter_add(long err) | ||
759 | { | ||
760 | pps_tf[2] = pps_tf[1]; | ||
761 | pps_tf[1] = pps_tf[0]; | ||
762 | pps_tf[0] = err; | ||
763 | } | ||
764 | |||
765 | /* decrease frequency calibration interval length. | ||
766 | * It is halved after four consecutive unstable intervals. | ||
767 | */ | ||
768 | static inline void pps_dec_freq_interval(void) | ||
769 | { | ||
770 | if (--pps_intcnt <= -PPS_INTCOUNT) { | ||
771 | pps_intcnt = -PPS_INTCOUNT; | ||
772 | if (pps_shift > PPS_INTMIN) { | ||
773 | pps_shift--; | ||
774 | pps_intcnt = 0; | ||
775 | } | ||
776 | } | ||
777 | } | ||
778 | |||
779 | /* increase frequency calibration interval length. | ||
780 | * It is doubled after four consecutive stable intervals. | ||
781 | */ | ||
782 | static inline void pps_inc_freq_interval(void) | ||
783 | { | ||
784 | if (++pps_intcnt >= PPS_INTCOUNT) { | ||
785 | pps_intcnt = PPS_INTCOUNT; | ||
786 | if (pps_shift < PPS_INTMAX) { | ||
787 | pps_shift++; | ||
788 | pps_intcnt = 0; | ||
789 | } | ||
790 | } | ||
791 | } | ||
792 | |||
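pps_shift selects the calibration interval as 2^pps_shift seconds, and the two helpers above walk it up or down only after PPS_INTCOUNT consecutive stable or unstable intervals. A small sketch of the resulting interval length; the numeric bounds are illustrative assumptions (the real values come from the PPS defines earlier in this file):

/* Sketch: calibration interval length as a function of pps_shift. */
#define PPS_INTMIN	2	/* assumed: shortest interval 2^2 = 4 s  */
#define PPS_INTMAX	8	/* assumed: longest interval 2^8 = 256 s */

static long pps_interval_seconds(int pps_shift)
{
	/* an interval ends once freq_norm.sec >= (1 << pps_shift) */
	return 1L << pps_shift;
}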
793 | /* update clock frequency based on MONOTONIC_RAW clock PPS signal | ||
794 | * timestamps | ||
795 | * | ||
796 | * At the end of the calibration interval the difference between the | ||
797 | * first and last MONOTONIC_RAW clock timestamps divided by the length | ||
798 | * of the interval becomes the frequency update. If the interval was | ||
799 | * too long, the data are discarded. | ||
800 | * Returns the difference between old and new frequency values. | ||
801 | */ | ||
802 | static long hardpps_update_freq(struct pps_normtime freq_norm) | ||
803 | { | ||
804 | long delta, delta_mod; | ||
805 | s64 ftemp; | ||
806 | |||
807 | /* check if the frequency interval was too long */ | ||
808 | if (freq_norm.sec > (2 << pps_shift)) { | ||
809 | time_status |= STA_PPSERROR; | ||
810 | pps_errcnt++; | ||
811 | pps_dec_freq_interval(); | ||
812 | pr_err("hardpps: PPSERROR: interval too long - %ld s\n", | ||
813 | freq_norm.sec); | ||
814 | return 0; | ||
815 | } | ||
816 | |||
817 | /* here the raw frequency offset and wander (stability) are | ||
818 | * calculated. If the wander is less than the wander threshold | ||
819 | * the interval is increased; otherwise it is decreased. | ||
820 | */ | ||
821 | ftemp = div_s64(((s64)(-freq_norm.nsec)) << NTP_SCALE_SHIFT, | ||
822 | freq_norm.sec); | ||
823 | delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT); | ||
824 | pps_freq = ftemp; | ||
825 | if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) { | ||
826 | pr_warning("hardpps: PPSWANDER: change=%ld\n", delta); | ||
827 | time_status |= STA_PPSWANDER; | ||
828 | pps_stbcnt++; | ||
829 | pps_dec_freq_interval(); | ||
830 | } else { /* good sample */ | ||
831 | pps_inc_freq_interval(); | ||
832 | } | ||
833 | |||
834 | /* the stability metric is calculated as the average of recent | ||
835 | * frequency changes, but is used only for performance | ||
836 | * monitoring | ||
837 | */ | ||
838 | delta_mod = delta; | ||
839 | if (delta_mod < 0) | ||
840 | delta_mod = -delta_mod; | ||
841 | pps_stabil += (div_s64(((s64)delta_mod) << | ||
842 | (NTP_SCALE_SHIFT - SHIFT_USEC), | ||
843 | NSEC_PER_USEC) - pps_stabil) >> PPS_INTMIN; | ||
844 | |||
845 | /* if enabled, the system clock frequency is updated */ | ||
846 | if ((time_status & STA_PPSFREQ) != 0 && | ||
847 | (time_status & STA_FREQHOLD) == 0) { | ||
848 | time_freq = pps_freq; | ||
849 | ntp_update_frequency(); | ||
850 | } | ||
851 | |||
852 | return delta; | ||
853 | } | ||
854 | |||
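The frequency estimate above is the negated phase drift divided by the interval length, held in the NTP fixed-point format (nanoseconds per second scaled up by NTP_SCALE_SHIFT bits). A worked sketch of that arithmetic; NTP_SCALE_SHIFT is taken to be 32 here, which is an assumption rather than something stated in this hunk:

/* Sketch of the scaling used by hardpps_update_freq(). */
#define NTP_SCALE_SHIFT	32	/* assumed value */

/* drift_ns nanoseconds accumulated over interval_s seconds */
static long long scaled_freq_from_drift(long drift_ns, long interval_s)
{
	/* result is in (ns per second) << NTP_SCALE_SHIFT, i.e. scaled ppb */
	return ((long long)(-drift_ns) << NTP_SCALE_SHIFT) / interval_s;
}

/* Example: -64 ns of drift over a 16 s interval gives a +4 ppb correction:
 * scaled_freq_from_drift(-64, 16) >> NTP_SCALE_SHIFT == 4. */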
855 | /* correct REALTIME clock phase error against PPS signal */ | ||
856 | static void hardpps_update_phase(long error) | ||
857 | { | ||
858 | long correction = -error; | ||
859 | long jitter; | ||
860 | |||
861 | /* add the sample to the median filter */ | ||
862 | pps_phase_filter_add(correction); | ||
863 | correction = pps_phase_filter_get(&jitter); | ||
864 | |||
865 | /* Nominal jitter is due to PPS signal noise. If it exceeds the | ||
866 | * threshold, the sample is discarded; otherwise, if so enabled, | ||
867 | * the time offset is updated. | ||
868 | */ | ||
869 | if (jitter > (pps_jitter << PPS_POPCORN)) { | ||
870 | pr_warning("hardpps: PPSJITTER: jitter=%ld, limit=%ld\n", | ||
871 | jitter, (pps_jitter << PPS_POPCORN)); | ||
872 | time_status |= STA_PPSJITTER; | ||
873 | pps_jitcnt++; | ||
874 | } else if (time_status & STA_PPSTIME) { | ||
875 | /* correct the time using the phase offset */ | ||
876 | time_offset = div_s64(((s64)correction) << NTP_SCALE_SHIFT, | ||
877 | NTP_INTERVAL_FREQ); | ||
878 | /* cancel running adjtime() */ | ||
879 | time_adjust = 0; | ||
880 | } | ||
881 | /* update jitter */ | ||
882 | pps_jitter += (jitter - pps_jitter) >> PPS_INTMIN; | ||
883 | } | ||
884 | |||
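The jitter test above is a popcorn-spike suppressor: a single pulse whose phase error exceeds 2^PPS_POPCORN times the running jitter estimate is counted and dropped instead of being steered against, while the exponential average still absorbs a fraction of every sample. A compact sketch of the same logic; the two shift constants are assumptions here:

/* Sketch of the spike rejection and jitter averaging used above. */
#define PPS_POPCORN	2	/* assumed: spike threshold = 4x running jitter */
#define PPS_INTMIN	2	/* assumed: averaging weight = 1/4 per sample   */

static long running_jitter;

static int pps_phase_sample_ok(long jitter)
{
	int ok = jitter <= (running_jitter << PPS_POPCORN);

	/* averaged in either case, so the estimate tracks reality */
	running_jitter += (jitter - running_jitter) >> PPS_INTMIN;
	return ok;
}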
885 | /* | ||
886 | * hardpps() - discipline CPU clock oscillator to external PPS signal | ||
887 | * | ||
888 | * This routine is called at each PPS signal arrival in order to | ||
889 | * discipline the CPU clock oscillator to the PPS signal. It takes two | ||
890 | * parameters: REALTIME and MONOTONIC_RAW clock timestamps. The former | ||
891 | * is used to correct clock phase error and the latter is used to | ||
892 | * correct the frequency. | ||
893 | * | ||
894 | * This code is based on David Mills's reference nanokernel | ||
895 | * implementation. It was mostly rewritten but keeps the same idea. | ||
896 | */ | ||
897 | void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) | ||
898 | { | ||
899 | struct pps_normtime pts_norm, freq_norm; | ||
900 | unsigned long flags; | ||
901 | |||
902 | pts_norm = pps_normalize_ts(*phase_ts); | ||
903 | |||
904 | write_seqlock_irqsave(&xtime_lock, flags); | ||
905 | |||
906 | /* clear the error bits, they will be set again if needed */ | ||
907 | time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); | ||
908 | |||
909 | /* indicate signal presence */ | ||
910 | time_status |= STA_PPSSIGNAL; | ||
911 | pps_valid = PPS_VALID; | ||
912 | |||
913 | /* when called for the first time, | ||
914 | * just start the frequency interval */ | ||
915 | if (unlikely(pps_fbase.tv_sec == 0)) { | ||
916 | pps_fbase = *raw_ts; | ||
917 | write_sequnlock_irqrestore(&xtime_lock, flags); | ||
918 | return; | ||
919 | } | ||
920 | |||
921 | /* ok, now we have a base for frequency calculation */ | ||
922 | freq_norm = pps_normalize_ts(timespec_sub(*raw_ts, pps_fbase)); | ||
923 | |||
924 | /* check that the signal is in the range | ||
925 | * [1s - MAXFREQ us, 1s + MAXFREQ us], otherwise reject it */ | ||
926 | if ((freq_norm.sec == 0) || | ||
927 | (freq_norm.nsec > MAXFREQ * freq_norm.sec) || | ||
928 | (freq_norm.nsec < -MAXFREQ * freq_norm.sec)) { | ||
929 | time_status |= STA_PPSJITTER; | ||
930 | /* restart the frequency calibration interval */ | ||
931 | pps_fbase = *raw_ts; | ||
932 | write_sequnlock_irqrestore(&xtime_lock, flags); | ||
933 | pr_err("hardpps: PPSJITTER: bad pulse\n"); | ||
934 | return; | ||
935 | } | ||
936 | |||
937 | /* signal is ok */ | ||
938 | |||
939 | /* check if the current frequency interval is finished */ | ||
940 | if (freq_norm.sec >= (1 << pps_shift)) { | ||
941 | pps_calcnt++; | ||
942 | /* restart the frequency calibration interval */ | ||
943 | pps_fbase = *raw_ts; | ||
944 | hardpps_update_freq(freq_norm); | ||
945 | } | ||
946 | |||
947 | hardpps_update_phase(pts_norm.nsec); | ||
948 | |||
949 | write_sequnlock_irqrestore(&xtime_lock, flags); | ||
950 | } | ||
951 | EXPORT_SYMBOL(hardpps); | ||
952 | |||
953 | #endif /* CONFIG_NTP_PPS */ | ||
954 | |||
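hardpps() is the whole client-side interface: a PPS source captures REALTIME and MONOTONIC_RAW snapshots as close to the pulse edge as possible and passes both in. A hedged sketch of a caller, with the interrupt plumbing invented for illustration; getnstime_raw_and_real() is the helper added to timekeeping.c later in this patch:

/* Hypothetical PPS interrupt handler feeding the new hardpps() hook. */
#include <linux/interrupt.h>
#include <linux/time.h>
#include <linux/timex.h>

static irqreturn_t pps_pulse_irq(int irq, void *dev_id)
{
	struct timespec ts_raw, ts_real;

	/* sample both clocks atomically (see the timekeeping.c hunk below) */
	getnstime_raw_and_real(&ts_raw, &ts_real);

	/* phase error from REALTIME, frequency from MONOTONIC_RAW */
	hardpps(&ts_real, &ts_raw);

	return IRQ_HANDLED;
}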
547 | static int __init ntp_tick_adj_setup(char *str) | 955 | static int __init ntp_tick_adj_setup(char *str) |
548 | { | 956 | { |
549 | ntp_tick_adj = simple_strtol(str, NULL, 0); | 957 | ntp_tick_adj = simple_strtol(str, NULL, 0); |
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c new file mode 100644 index 000000000000..25028dd4fa18 --- /dev/null +++ b/kernel/time/posix-clock.c | |||
@@ -0,0 +1,451 @@ | |||
1 | /* | ||
2 | * posix-clock.c - support for dynamic clock devices | ||
3 | * | ||
4 | * Copyright (C) 2010 OMICRON electronics GmbH | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License as published by | ||
8 | * the Free Software Foundation; either version 2 of the License, or | ||
9 | * (at your option) any later version. | ||
10 | * | ||
11 | * This program is distributed in the hope that it will be useful, | ||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
14 | * GNU General Public License for more details. | ||
15 | * | ||
16 | * You should have received a copy of the GNU General Public License | ||
17 | * along with this program; if not, write to the Free Software | ||
18 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
19 | */ | ||
20 | #include <linux/device.h> | ||
21 | #include <linux/file.h> | ||
22 | #include <linux/mutex.h> | ||
23 | #include <linux/posix-clock.h> | ||
24 | #include <linux/slab.h> | ||
25 | #include <linux/syscalls.h> | ||
26 | #include <linux/uaccess.h> | ||
27 | |||
28 | static void delete_clock(struct kref *kref); | ||
29 | |||
30 | /* | ||
31 | * Returns NULL if the posix_clock instance attached to 'fp' is old and stale. | ||
32 | */ | ||
33 | static struct posix_clock *get_posix_clock(struct file *fp) | ||
34 | { | ||
35 | struct posix_clock *clk = fp->private_data; | ||
36 | |||
37 | mutex_lock(&clk->mutex); | ||
38 | |||
39 | if (!clk->zombie) | ||
40 | return clk; | ||
41 | |||
42 | mutex_unlock(&clk->mutex); | ||
43 | |||
44 | return NULL; | ||
45 | } | ||
46 | |||
47 | static void put_posix_clock(struct posix_clock *clk) | ||
48 | { | ||
49 | mutex_unlock(&clk->mutex); | ||
50 | } | ||
51 | |||
52 | static ssize_t posix_clock_read(struct file *fp, char __user *buf, | ||
53 | size_t count, loff_t *ppos) | ||
54 | { | ||
55 | struct posix_clock *clk = get_posix_clock(fp); | ||
56 | int err = -EINVAL; | ||
57 | |||
58 | if (!clk) | ||
59 | return -ENODEV; | ||
60 | |||
61 | if (clk->ops.read) | ||
62 | err = clk->ops.read(clk, fp->f_flags, buf, count); | ||
63 | |||
64 | put_posix_clock(clk); | ||
65 | |||
66 | return err; | ||
67 | } | ||
68 | |||
69 | static unsigned int posix_clock_poll(struct file *fp, poll_table *wait) | ||
70 | { | ||
71 | struct posix_clock *clk = get_posix_clock(fp); | ||
72 | int result = 0; | ||
73 | |||
74 | if (!clk) | ||
75 | return -ENODEV; | ||
76 | |||
77 | if (clk->ops.poll) | ||
78 | result = clk->ops.poll(clk, fp, wait); | ||
79 | |||
80 | put_posix_clock(clk); | ||
81 | |||
82 | return result; | ||
83 | } | ||
84 | |||
85 | static int posix_clock_fasync(int fd, struct file *fp, int on) | ||
86 | { | ||
87 | struct posix_clock *clk = get_posix_clock(fp); | ||
88 | int err = 0; | ||
89 | |||
90 | if (!clk) | ||
91 | return -ENODEV; | ||
92 | |||
93 | if (clk->ops.fasync) | ||
94 | err = clk->ops.fasync(clk, fd, fp, on); | ||
95 | |||
96 | put_posix_clock(clk); | ||
97 | |||
98 | return err; | ||
99 | } | ||
100 | |||
101 | static int posix_clock_mmap(struct file *fp, struct vm_area_struct *vma) | ||
102 | { | ||
103 | struct posix_clock *clk = get_posix_clock(fp); | ||
104 | int err = -ENODEV; | ||
105 | |||
106 | if (!clk) | ||
107 | return -ENODEV; | ||
108 | |||
109 | if (clk->ops.mmap) | ||
110 | err = clk->ops.mmap(clk, vma); | ||
111 | |||
112 | put_posix_clock(clk); | ||
113 | |||
114 | return err; | ||
115 | } | ||
116 | |||
117 | static long posix_clock_ioctl(struct file *fp, | ||
118 | unsigned int cmd, unsigned long arg) | ||
119 | { | ||
120 | struct posix_clock *clk = get_posix_clock(fp); | ||
121 | int err = -ENOTTY; | ||
122 | |||
123 | if (!clk) | ||
124 | return -ENODEV; | ||
125 | |||
126 | if (clk->ops.ioctl) | ||
127 | err = clk->ops.ioctl(clk, cmd, arg); | ||
128 | |||
129 | put_posix_clock(clk); | ||
130 | |||
131 | return err; | ||
132 | } | ||
133 | |||
134 | #ifdef CONFIG_COMPAT | ||
135 | static long posix_clock_compat_ioctl(struct file *fp, | ||
136 | unsigned int cmd, unsigned long arg) | ||
137 | { | ||
138 | struct posix_clock *clk = get_posix_clock(fp); | ||
139 | int err = -ENOTTY; | ||
140 | |||
141 | if (!clk) | ||
142 | return -ENODEV; | ||
143 | |||
144 | if (clk->ops.ioctl) | ||
145 | err = clk->ops.ioctl(clk, cmd, arg); | ||
146 | |||
147 | put_posix_clock(clk); | ||
148 | |||
149 | return err; | ||
150 | } | ||
151 | #endif | ||
152 | |||
153 | static int posix_clock_open(struct inode *inode, struct file *fp) | ||
154 | { | ||
155 | int err; | ||
156 | struct posix_clock *clk = | ||
157 | container_of(inode->i_cdev, struct posix_clock, cdev); | ||
158 | |||
159 | mutex_lock(&clk->mutex); | ||
160 | |||
161 | if (clk->zombie) { | ||
162 | err = -ENODEV; | ||
163 | goto out; | ||
164 | } | ||
165 | if (clk->ops.open) | ||
166 | err = clk->ops.open(clk, fp->f_mode); | ||
167 | else | ||
168 | err = 0; | ||
169 | |||
170 | if (!err) { | ||
171 | kref_get(&clk->kref); | ||
172 | fp->private_data = clk; | ||
173 | } | ||
174 | out: | ||
175 | mutex_unlock(&clk->mutex); | ||
176 | return err; | ||
177 | } | ||
178 | |||
179 | static int posix_clock_release(struct inode *inode, struct file *fp) | ||
180 | { | ||
181 | struct posix_clock *clk = fp->private_data; | ||
182 | int err = 0; | ||
183 | |||
184 | if (clk->ops.release) | ||
185 | err = clk->ops.release(clk); | ||
186 | |||
187 | kref_put(&clk->kref, delete_clock); | ||
188 | |||
189 | fp->private_data = NULL; | ||
190 | |||
191 | return err; | ||
192 | } | ||
193 | |||
194 | static const struct file_operations posix_clock_file_operations = { | ||
195 | .owner = THIS_MODULE, | ||
196 | .llseek = no_llseek, | ||
197 | .read = posix_clock_read, | ||
198 | .poll = posix_clock_poll, | ||
199 | .unlocked_ioctl = posix_clock_ioctl, | ||
200 | .open = posix_clock_open, | ||
201 | .release = posix_clock_release, | ||
202 | .fasync = posix_clock_fasync, | ||
203 | .mmap = posix_clock_mmap, | ||
204 | #ifdef CONFIG_COMPAT | ||
205 | .compat_ioctl = posix_clock_compat_ioctl, | ||
206 | #endif | ||
207 | }; | ||
208 | |||
209 | int posix_clock_register(struct posix_clock *clk, dev_t devid) | ||
210 | { | ||
211 | int err; | ||
212 | |||
213 | kref_init(&clk->kref); | ||
214 | mutex_init(&clk->mutex); | ||
215 | |||
216 | cdev_init(&clk->cdev, &posix_clock_file_operations); | ||
217 | clk->cdev.owner = clk->ops.owner; | ||
218 | err = cdev_add(&clk->cdev, devid, 1); | ||
219 | if (err) | ||
220 | goto no_cdev; | ||
221 | |||
222 | return err; | ||
223 | no_cdev: | ||
224 | mutex_destroy(&clk->mutex); | ||
225 | return err; | ||
226 | } | ||
227 | EXPORT_SYMBOL_GPL(posix_clock_register); | ||
228 | |||
229 | static void delete_clock(struct kref *kref) | ||
230 | { | ||
231 | struct posix_clock *clk = container_of(kref, struct posix_clock, kref); | ||
232 | mutex_destroy(&clk->mutex); | ||
233 | if (clk->release) | ||
234 | clk->release(clk); | ||
235 | } | ||
236 | |||
237 | void posix_clock_unregister(struct posix_clock *clk) | ||
238 | { | ||
239 | cdev_del(&clk->cdev); | ||
240 | |||
241 | mutex_lock(&clk->mutex); | ||
242 | clk->zombie = true; | ||
243 | mutex_unlock(&clk->mutex); | ||
244 | |||
245 | kref_put(&clk->kref, delete_clock); | ||
246 | } | ||
247 | EXPORT_SYMBOL_GPL(posix_clock_unregister); | ||
248 | |||
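A clock driver is expected to embed struct posix_clock, fill in its ops, and register against a character device number; unregister flips the zombie flag under the mutex so in-flight file operations start returning -ENODEV, while the kref keeps the object alive until the last open file is released. A hedged sketch of a minimal user of this API (the mydrv_* names and device-number plumbing are hypothetical):

/* Hypothetical driver using the new dynamic posix clock API. */
static int mydrv_gettime(struct posix_clock *pc, struct timespec *ts)
{
	getnstimeofday(ts);	/* stand-in for reading the real hardware clock */
	return 0;
}

static struct posix_clock_operations mydrv_clock_ops = {
	.owner		= THIS_MODULE,
	.clock_gettime	= mydrv_gettime,
};

static struct posix_clock mydrv_clock;

static int mydrv_attach(dev_t devid)
{
	mydrv_clock.ops = mydrv_clock_ops;
	mydrv_clock.release = NULL;	/* or a callback that frees the driver */
	return posix_clock_register(&mydrv_clock, devid);
}

static void mydrv_detach(void)
{
	posix_clock_unregister(&mydrv_clock);
}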
249 | struct posix_clock_desc { | ||
250 | struct file *fp; | ||
251 | struct posix_clock *clk; | ||
252 | }; | ||
253 | |||
254 | static int get_clock_desc(const clockid_t id, struct posix_clock_desc *cd) | ||
255 | { | ||
256 | struct file *fp = fget(CLOCKID_TO_FD(id)); | ||
257 | int err = -EINVAL; | ||
258 | |||
259 | if (!fp) | ||
260 | return err; | ||
261 | |||
262 | if (fp->f_op->open != posix_clock_open || !fp->private_data) | ||
263 | goto out; | ||
264 | |||
265 | cd->fp = fp; | ||
266 | cd->clk = get_posix_clock(fp); | ||
267 | |||
268 | err = cd->clk ? 0 : -ENODEV; | ||
269 | out: | ||
270 | if (err) | ||
271 | fput(fp); | ||
272 | return err; | ||
273 | } | ||
274 | |||
275 | static void put_clock_desc(struct posix_clock_desc *cd) | ||
276 | { | ||
277 | put_posix_clock(cd->clk); | ||
278 | fput(cd->fp); | ||
279 | } | ||
280 | |||
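get_clock_desc() shows how a dynamic clockid resolves to an open file: the id encodes a file descriptor, and CLOCKID_TO_FD() undoes the userspace-side FD_TO_CLOCKID() packing. A sketch of the userspace half; the macro definition and the /dev/ptp0 path are reproduced from memory and should be treated as assumptions:

/* Userspace sketch: deriving a dynamic clockid from an open chardev fd. */
#include <fcntl.h>
#include <stdio.h>
#include <time.h>

#define CLOCKFD			3
#define FD_TO_CLOCKID(fd)	((~(clockid_t)(fd) << 3) | CLOCKFD)

int main(void)
{
	int fd = open("/dev/ptp0", O_RDONLY);	/* any posix_clock chardev */
	struct timespec ts;

	if (fd < 0)
		return 1;
	if (clock_gettime(FD_TO_CLOCKID(fd), &ts) == 0)
		printf("%lld.%09ld\n", (long long)ts.tv_sec, ts.tv_nsec);
	return 0;
}

Note that clock_settime() and clock_adjtime() on such an id additionally require the descriptor to have been opened for writing, matching the FMODE_WRITE checks in the pc_clock_* handlers below.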
281 | static int pc_clock_adjtime(clockid_t id, struct timex *tx) | ||
282 | { | ||
283 | struct posix_clock_desc cd; | ||
284 | int err; | ||
285 | |||
286 | err = get_clock_desc(id, &cd); | ||
287 | if (err) | ||
288 | return err; | ||
289 | |||
290 | if ((cd.fp->f_mode & FMODE_WRITE) == 0) { | ||
291 | err = -EACCES; | ||
292 | goto out; | ||
293 | } | ||
294 | |||
295 | if (cd.clk->ops.clock_adjtime) | ||
296 | err = cd.clk->ops.clock_adjtime(cd.clk, tx); | ||
297 | else | ||
298 | err = -EOPNOTSUPP; | ||
299 | out: | ||
300 | put_clock_desc(&cd); | ||
301 | |||
302 | return err; | ||
303 | } | ||
304 | |||
305 | static int pc_clock_gettime(clockid_t id, struct timespec *ts) | ||
306 | { | ||
307 | struct posix_clock_desc cd; | ||
308 | int err; | ||
309 | |||
310 | err = get_clock_desc(id, &cd); | ||
311 | if (err) | ||
312 | return err; | ||
313 | |||
314 | if (cd.clk->ops.clock_gettime) | ||
315 | err = cd.clk->ops.clock_gettime(cd.clk, ts); | ||
316 | else | ||
317 | err = -EOPNOTSUPP; | ||
318 | |||
319 | put_clock_desc(&cd); | ||
320 | |||
321 | return err; | ||
322 | } | ||
323 | |||
324 | static int pc_clock_getres(clockid_t id, struct timespec *ts) | ||
325 | { | ||
326 | struct posix_clock_desc cd; | ||
327 | int err; | ||
328 | |||
329 | err = get_clock_desc(id, &cd); | ||
330 | if (err) | ||
331 | return err; | ||
332 | |||
333 | if (cd.clk->ops.clock_getres) | ||
334 | err = cd.clk->ops.clock_getres(cd.clk, ts); | ||
335 | else | ||
336 | err = -EOPNOTSUPP; | ||
337 | |||
338 | put_clock_desc(&cd); | ||
339 | |||
340 | return err; | ||
341 | } | ||
342 | |||
343 | static int pc_clock_settime(clockid_t id, const struct timespec *ts) | ||
344 | { | ||
345 | struct posix_clock_desc cd; | ||
346 | int err; | ||
347 | |||
348 | err = get_clock_desc(id, &cd); | ||
349 | if (err) | ||
350 | return err; | ||
351 | |||
352 | if ((cd.fp->f_mode & FMODE_WRITE) == 0) { | ||
353 | err = -EACCES; | ||
354 | goto out; | ||
355 | } | ||
356 | |||
357 | if (cd.clk->ops.clock_settime) | ||
358 | err = cd.clk->ops.clock_settime(cd.clk, ts); | ||
359 | else | ||
360 | err = -EOPNOTSUPP; | ||
361 | out: | ||
362 | put_clock_desc(&cd); | ||
363 | |||
364 | return err; | ||
365 | } | ||
366 | |||
367 | static int pc_timer_create(struct k_itimer *kit) | ||
368 | { | ||
369 | clockid_t id = kit->it_clock; | ||
370 | struct posix_clock_desc cd; | ||
371 | int err; | ||
372 | |||
373 | err = get_clock_desc(id, &cd); | ||
374 | if (err) | ||
375 | return err; | ||
376 | |||
377 | if (cd.clk->ops.timer_create) | ||
378 | err = cd.clk->ops.timer_create(cd.clk, kit); | ||
379 | else | ||
380 | err = -EOPNOTSUPP; | ||
381 | |||
382 | put_clock_desc(&cd); | ||
383 | |||
384 | return err; | ||
385 | } | ||
386 | |||
387 | static int pc_timer_delete(struct k_itimer *kit) | ||
388 | { | ||
389 | clockid_t id = kit->it_clock; | ||
390 | struct posix_clock_desc cd; | ||
391 | int err; | ||
392 | |||
393 | err = get_clock_desc(id, &cd); | ||
394 | if (err) | ||
395 | return err; | ||
396 | |||
397 | if (cd.clk->ops.timer_delete) | ||
398 | err = cd.clk->ops.timer_delete(cd.clk, kit); | ||
399 | else | ||
400 | err = -EOPNOTSUPP; | ||
401 | |||
402 | put_clock_desc(&cd); | ||
403 | |||
404 | return err; | ||
405 | } | ||
406 | |||
407 | static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec *ts) | ||
408 | { | ||
409 | clockid_t id = kit->it_clock; | ||
410 | struct posix_clock_desc cd; | ||
411 | |||
412 | if (get_clock_desc(id, &cd)) | ||
413 | return; | ||
414 | |||
415 | if (cd.clk->ops.timer_gettime) | ||
416 | cd.clk->ops.timer_gettime(cd.clk, kit, ts); | ||
417 | |||
418 | put_clock_desc(&cd); | ||
419 | } | ||
420 | |||
421 | static int pc_timer_settime(struct k_itimer *kit, int flags, | ||
422 | struct itimerspec *ts, struct itimerspec *old) | ||
423 | { | ||
424 | clockid_t id = kit->it_clock; | ||
425 | struct posix_clock_desc cd; | ||
426 | int err; | ||
427 | |||
428 | err = get_clock_desc(id, &cd); | ||
429 | if (err) | ||
430 | return err; | ||
431 | |||
432 | if (cd.clk->ops.timer_settime) | ||
433 | err = cd.clk->ops.timer_settime(cd.clk, kit, flags, ts, old); | ||
434 | else | ||
435 | err = -EOPNOTSUPP; | ||
436 | |||
437 | put_clock_desc(&cd); | ||
438 | |||
439 | return err; | ||
440 | } | ||
441 | |||
442 | struct k_clock clock_posix_dynamic = { | ||
443 | .clock_getres = pc_clock_getres, | ||
444 | .clock_set = pc_clock_settime, | ||
445 | .clock_get = pc_clock_gettime, | ||
446 | .clock_adj = pc_clock_adjtime, | ||
447 | .timer_create = pc_timer_create, | ||
448 | .timer_set = pc_timer_settime, | ||
449 | .timer_del = pc_timer_delete, | ||
450 | .timer_get = pc_timer_gettime, | ||
451 | }; | ||
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 48b2761b5668..da800ffa810c 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c | |||
@@ -18,7 +18,6 @@ | |||
18 | #include <linux/percpu.h> | 18 | #include <linux/percpu.h> |
19 | #include <linux/profile.h> | 19 | #include <linux/profile.h> |
20 | #include <linux/sched.h> | 20 | #include <linux/sched.h> |
21 | #include <linux/tick.h> | ||
22 | 21 | ||
23 | #include "tick-internal.h" | 22 | #include "tick-internal.h" |
24 | 23 | ||
@@ -600,4 +599,14 @@ int tick_broadcast_oneshot_active(void) | |||
600 | return tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT; | 599 | return tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT; |
601 | } | 600 | } |
602 | 601 | ||
602 | /* | ||
603 | * Check whether the broadcast device supports oneshot. | ||
604 | */ | ||
605 | bool tick_broadcast_oneshot_available(void) | ||
606 | { | ||
607 | struct clock_event_device *bc = tick_broadcast_device.evtdev; | ||
608 | |||
609 | return bc ? bc->features & CLOCK_EVT_FEAT_ONESHOT : false; | ||
610 | } | ||
611 | |||
603 | #endif | 612 | #endif |
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index b6b898d2eeef..119528de8235 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c | |||
@@ -18,7 +18,6 @@ | |||
18 | #include <linux/percpu.h> | 18 | #include <linux/percpu.h> |
19 | #include <linux/profile.h> | 19 | #include <linux/profile.h> |
20 | #include <linux/sched.h> | 20 | #include <linux/sched.h> |
21 | #include <linux/tick.h> | ||
22 | 21 | ||
23 | #include <asm/irq_regs.h> | 22 | #include <asm/irq_regs.h> |
24 | 23 | ||
@@ -49,9 +48,13 @@ struct tick_device *tick_get_device(int cpu) | |||
49 | */ | 48 | */ |
50 | int tick_is_oneshot_available(void) | 49 | int tick_is_oneshot_available(void) |
51 | { | 50 | { |
52 | struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; | 51 | struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); |
53 | 52 | ||
54 | return dev && (dev->features & CLOCK_EVT_FEAT_ONESHOT); | 53 | if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT)) |
54 | return 0; | ||
55 | if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) | ||
56 | return 1; | ||
57 | return tick_broadcast_oneshot_available(); | ||
55 | } | 58 | } |
56 | 59 | ||
57 | /* | 60 | /* |
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 290eefbc1f60..1009b06d6f89 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h | |||
@@ -1,6 +1,10 @@ | |||
1 | /* | 1 | /* |
2 | * tick internal variable and functions used by low/high res code | 2 | * tick internal variable and functions used by low/high res code |
3 | */ | 3 | */ |
4 | #include <linux/hrtimer.h> | ||
5 | #include <linux/tick.h> | ||
6 | |||
7 | #ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD | ||
4 | 8 | ||
5 | #define TICK_DO_TIMER_NONE -1 | 9 | #define TICK_DO_TIMER_NONE -1 |
6 | #define TICK_DO_TIMER_BOOT -2 | 10 | #define TICK_DO_TIMER_BOOT -2 |
@@ -36,6 +40,7 @@ extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup); | |||
36 | extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc); | 40 | extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc); |
37 | extern int tick_broadcast_oneshot_active(void); | 41 | extern int tick_broadcast_oneshot_active(void); |
38 | extern void tick_check_oneshot_broadcast(int cpu); | 42 | extern void tick_check_oneshot_broadcast(int cpu); |
43 | bool tick_broadcast_oneshot_available(void); | ||
39 | # else /* BROADCAST */ | 44 | # else /* BROADCAST */ |
40 | static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) | 45 | static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) |
41 | { | 46 | { |
@@ -46,6 +51,7 @@ static inline void tick_broadcast_switch_to_oneshot(void) { } | |||
46 | static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } | 51 | static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } |
47 | static inline int tick_broadcast_oneshot_active(void) { return 0; } | 52 | static inline int tick_broadcast_oneshot_active(void) { return 0; } |
48 | static inline void tick_check_oneshot_broadcast(int cpu) { } | 53 | static inline void tick_check_oneshot_broadcast(int cpu) { } |
54 | static inline bool tick_broadcast_oneshot_available(void) { return true; } | ||
49 | # endif /* !BROADCAST */ | 55 | # endif /* !BROADCAST */ |
50 | 56 | ||
51 | #else /* !ONESHOT */ | 57 | #else /* !ONESHOT */ |
@@ -76,6 +82,7 @@ static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc) | |||
76 | return 0; | 82 | return 0; |
77 | } | 83 | } |
78 | static inline int tick_broadcast_oneshot_active(void) { return 0; } | 84 | static inline int tick_broadcast_oneshot_active(void) { return 0; } |
85 | static inline bool tick_broadcast_oneshot_available(void) { return false; } | ||
79 | #endif /* !TICK_ONESHOT */ | 86 | #endif /* !TICK_ONESHOT */ |
80 | 87 | ||
81 | /* | 88 | /* |
@@ -132,3 +139,8 @@ static inline int tick_device_is_functional(struct clock_event_device *dev) | |||
132 | { | 139 | { |
133 | return !(dev->features & CLOCK_EVT_FEAT_DUMMY); | 140 | return !(dev->features & CLOCK_EVT_FEAT_DUMMY); |
134 | } | 141 | } |
142 | |||
143 | #endif | ||
144 | |||
145 | extern void do_timer(unsigned long ticks); | ||
146 | extern seqlock_t xtime_lock; | ||
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index aada0e52680a..2d04411a5f05 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c | |||
@@ -18,7 +18,6 @@ | |||
18 | #include <linux/percpu.h> | 18 | #include <linux/percpu.h> |
19 | #include <linux/profile.h> | 19 | #include <linux/profile.h> |
20 | #include <linux/sched.h> | 20 | #include <linux/sched.h> |
21 | #include <linux/tick.h> | ||
22 | 21 | ||
23 | #include "tick-internal.h" | 22 | #include "tick-internal.h" |
24 | 23 | ||
@@ -95,7 +94,7 @@ int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires, | |||
95 | */ | 94 | */ |
96 | int tick_program_event(ktime_t expires, int force) | 95 | int tick_program_event(ktime_t expires, int force) |
97 | { | 96 | { |
98 | struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; | 97 | struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); |
99 | 98 | ||
100 | return tick_dev_program_event(dev, expires, force); | 99 | return tick_dev_program_event(dev, expires, force); |
101 | } | 100 | } |
@@ -167,7 +166,7 @@ int tick_oneshot_mode_active(void) | |||
167 | int ret; | 166 | int ret; |
168 | 167 | ||
169 | local_irq_save(flags); | 168 | local_irq_save(flags); |
170 | ret = __get_cpu_var(tick_cpu_device).mode == TICKDEV_MODE_ONESHOT; | 169 | ret = __this_cpu_read(tick_cpu_device.mode) == TICKDEV_MODE_ONESHOT; |
171 | local_irq_restore(flags); | 170 | local_irq_restore(flags); |
172 | 171 | ||
173 | return ret; | 172 | return ret; |
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 3e216e01bbd1..d5097c44b407 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -19,7 +19,6 @@ | |||
19 | #include <linux/percpu.h> | 19 | #include <linux/percpu.h> |
20 | #include <linux/profile.h> | 20 | #include <linux/profile.h> |
21 | #include <linux/sched.h> | 21 | #include <linux/sched.h> |
22 | #include <linux/tick.h> | ||
23 | #include <linux/module.h> | 22 | #include <linux/module.h> |
24 | 23 | ||
25 | #include <asm/irq_regs.h> | 24 | #include <asm/irq_regs.h> |
@@ -642,8 +641,7 @@ static void tick_nohz_switch_to_nohz(void) | |||
642 | } | 641 | } |
643 | local_irq_enable(); | 642 | local_irq_enable(); |
644 | 643 | ||
645 | printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", | 644 | printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", smp_processor_id()); |
646 | smp_processor_id()); | ||
647 | } | 645 | } |
648 | 646 | ||
649 | /* | 647 | /* |
@@ -795,8 +793,10 @@ void tick_setup_sched_timer(void) | |||
795 | } | 793 | } |
796 | 794 | ||
797 | #ifdef CONFIG_NO_HZ | 795 | #ifdef CONFIG_NO_HZ |
798 | if (tick_nohz_enabled) | 796 | if (tick_nohz_enabled) { |
799 | ts->nohz_mode = NOHZ_MODE_HIGHRES; | 797 | ts->nohz_mode = NOHZ_MODE_HIGHRES; |
798 | printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", smp_processor_id()); | ||
799 | } | ||
800 | #endif | 800 | #endif |
801 | } | 801 | } |
802 | #endif /* HIGH_RES_TIMERS */ | 802 | #endif /* HIGH_RES_TIMERS */ |
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 5bb86da82003..3bd7e3d5c632 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
@@ -49,7 +49,7 @@ struct timekeeper { | |||
49 | u32 mult; | 49 | u32 mult; |
50 | }; | 50 | }; |
51 | 51 | ||
52 | struct timekeeper timekeeper; | 52 | static struct timekeeper timekeeper; |
53 | 53 | ||
54 | /** | 54 | /** |
55 | * timekeeper_setup_internals - Set up internals to use clocksource clock. | 55 | * timekeeper_setup_internals - Set up internals to use clocksource clock. |
@@ -164,7 +164,7 @@ static struct timespec total_sleep_time; | |||
164 | /* | 164 | /* |
165 | * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. | 165 | * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. |
166 | */ | 166 | */ |
167 | struct timespec raw_time; | 167 | static struct timespec raw_time; |
168 | 168 | ||
169 | /* flag for if timekeeping is suspended */ | 169 | /* flag for if timekeeping is suspended */ |
170 | int __read_mostly timekeeping_suspended; | 170 | int __read_mostly timekeeping_suspended; |
@@ -288,6 +288,49 @@ void ktime_get_ts(struct timespec *ts) | |||
288 | } | 288 | } |
289 | EXPORT_SYMBOL_GPL(ktime_get_ts); | 289 | EXPORT_SYMBOL_GPL(ktime_get_ts); |
290 | 290 | ||
291 | #ifdef CONFIG_NTP_PPS | ||
292 | |||
293 | /** | ||
294 | * getnstime_raw_and_real - get time of day and raw monotonic time in timespec format | ||
295 | * @ts_raw: pointer to the timespec to be set to raw monotonic time | ||
296 | * @ts_real: pointer to the timespec to be set to the time of day | ||
297 | * | ||
298 | * This function reads both the time of day and raw monotonic time at the | ||
299 | * same time atomically and stores the resulting timestamps in timespec | ||
300 | * format. | ||
301 | */ | ||
302 | void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) | ||
303 | { | ||
304 | unsigned long seq; | ||
305 | s64 nsecs_raw, nsecs_real; | ||
306 | |||
307 | WARN_ON_ONCE(timekeeping_suspended); | ||
308 | |||
309 | do { | ||
310 | u32 arch_offset; | ||
311 | |||
312 | seq = read_seqbegin(&xtime_lock); | ||
313 | |||
314 | *ts_raw = raw_time; | ||
315 | *ts_real = xtime; | ||
316 | |||
317 | nsecs_raw = timekeeping_get_ns_raw(); | ||
318 | nsecs_real = timekeeping_get_ns(); | ||
319 | |||
320 | /* If arch requires, add in gettimeoffset() */ | ||
321 | arch_offset = arch_gettimeoffset(); | ||
322 | nsecs_raw += arch_offset; | ||
323 | nsecs_real += arch_offset; | ||
324 | |||
325 | } while (read_seqretry(&xtime_lock, seq)); | ||
326 | |||
327 | timespec_add_ns(ts_raw, nsecs_raw); | ||
328 | timespec_add_ns(ts_real, nsecs_real); | ||
329 | } | ||
330 | EXPORT_SYMBOL(getnstime_raw_and_real); | ||
331 | |||
332 | #endif /* CONFIG_NTP_PPS */ | ||
333 | |||
291 | /** | 334 | /** |
292 | * do_gettimeofday - Returns the time of day in a timeval | 335 | * do_gettimeofday - Returns the time of day in a timeval |
293 | * @tv: pointer to the timeval to be set | 336 | * @tv: pointer to the timeval to be set |
@@ -310,7 +353,7 @@ EXPORT_SYMBOL(do_gettimeofday); | |||
310 | * | 353 | * |
311 | * Sets the time of day to the new time and update NTP and notify hrtimers | 354 | * Sets the time of day to the new time and update NTP and notify hrtimers |
312 | */ | 355 | */ |
313 | int do_settimeofday(struct timespec *tv) | 356 | int do_settimeofday(const struct timespec *tv) |
314 | { | 357 | { |
315 | struct timespec ts_delta; | 358 | struct timespec ts_delta; |
316 | unsigned long flags; | 359 | unsigned long flags; |
@@ -344,6 +387,42 @@ int do_settimeofday(struct timespec *tv) | |||
344 | 387 | ||
345 | EXPORT_SYMBOL(do_settimeofday); | 388 | EXPORT_SYMBOL(do_settimeofday); |
346 | 389 | ||
390 | |||
391 | /** | ||
392 | * timekeeping_inject_offset - Adds or subtracts from the current time. | ||
393 | * @ts: pointer to the timespec variable containing the offset | ||
394 | * | ||
395 | * Adds or subtracts an offset value from the current time. | ||
396 | */ | ||
397 | int timekeeping_inject_offset(struct timespec *ts) | ||
398 | { | ||
399 | unsigned long flags; | ||
400 | |||
401 | if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) | ||
402 | return -EINVAL; | ||
403 | |||
404 | write_seqlock_irqsave(&xtime_lock, flags); | ||
405 | |||
406 | timekeeping_forward_now(); | ||
407 | |||
408 | xtime = timespec_add(xtime, *ts); | ||
409 | wall_to_monotonic = timespec_sub(wall_to_monotonic, *ts); | ||
410 | |||
411 | timekeeper.ntp_error = 0; | ||
412 | ntp_clear(); | ||
413 | |||
414 | update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, | ||
415 | timekeeper.mult); | ||
416 | |||
417 | write_sequnlock_irqrestore(&xtime_lock, flags); | ||
418 | |||
419 | /* signal hrtimers about time change */ | ||
420 | clock_was_set(); | ||
421 | |||
422 | return 0; | ||
423 | } | ||
424 | EXPORT_SYMBOL(timekeeping_inject_offset); | ||
425 | |||
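timekeeping_inject_offset() rejects any timespec whose tv_nsec falls outside [0, NSEC_PER_SEC), so a negative offset has to be supplied in normalized form with the sign carried by tv_sec. A brief illustration:

/* Sketch: a -200 ms step must be normalized before injection;
 * { .tv_sec = 0, .tv_nsec = -200000000 } would return -EINVAL. */
struct timespec delta = {
	.tv_sec		= -1,
	.tv_nsec	= 800000000,	/* -1 s + 0.8 s = -0.2 s */
};
/* timekeeping_inject_offset(&delta); */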
347 | /** | 426 | /** |
348 | * change_clocksource - Swaps clocksources if a new one is available | 427 | * change_clocksource - Swaps clocksources if a new one is available |
349 | * | 428 | * |
@@ -736,7 +815,7 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift) | |||
736 | * | 815 | * |
737 | * Called from the timer interrupt, must hold a write on xtime_lock. | 816 | * Called from the timer interrupt, must hold a write on xtime_lock. |
738 | */ | 817 | */ |
739 | void update_wall_time(void) | 818 | static void update_wall_time(void) |
740 | { | 819 | { |
741 | struct clocksource *clock; | 820 | struct clocksource *clock; |
742 | cycle_t offset; | 821 | cycle_t offset; |
@@ -828,7 +907,7 @@ void update_wall_time(void) | |||
828 | * getboottime - Return the real time of system boot. | 907 | * getboottime - Return the real time of system boot. |
829 | * @ts: pointer to the timespec to be set | 908 | * @ts: pointer to the timespec to be set |
830 | * | 909 | * |
831 | * Returns the time of day in a timespec. | 910 | * Returns the wall-time of boot in a timespec. |
832 | * | 911 | * |
833 | * This is based on the wall_to_monotonic offset and the total suspend | 912 | * This is based on the wall_to_monotonic offset and the total suspend |
834 | * time. Calls to settimeofday will affect the value returned (which | 913 | * time. Calls to settimeofday will affect the value returned (which |
@@ -846,6 +925,55 @@ void getboottime(struct timespec *ts) | |||
846 | } | 925 | } |
847 | EXPORT_SYMBOL_GPL(getboottime); | 926 | EXPORT_SYMBOL_GPL(getboottime); |
848 | 927 | ||
928 | |||
929 | /** | ||
930 | * get_monotonic_boottime - Returns monotonic time since boot | ||
931 | * @ts: pointer to the timespec to be set | ||
932 | * | ||
933 | * Returns the monotonic time since boot in a timespec. | ||
934 | * | ||
935 | * This is similar to CLOCK_MONOTONIC/ktime_get_ts, but also | ||
936 | * includes the time spent in suspend. | ||
937 | */ | ||
938 | void get_monotonic_boottime(struct timespec *ts) | ||
939 | { | ||
940 | struct timespec tomono, sleep; | ||
941 | unsigned int seq; | ||
942 | s64 nsecs; | ||
943 | |||
944 | WARN_ON(timekeeping_suspended); | ||
945 | |||
946 | do { | ||
947 | seq = read_seqbegin(&xtime_lock); | ||
948 | *ts = xtime; | ||
949 | tomono = wall_to_monotonic; | ||
950 | sleep = total_sleep_time; | ||
951 | nsecs = timekeeping_get_ns(); | ||
952 | |||
953 | } while (read_seqretry(&xtime_lock, seq)); | ||
954 | |||
955 | set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec, | ||
956 | ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec + nsecs); | ||
957 | } | ||
958 | EXPORT_SYMBOL_GPL(get_monotonic_boottime); | ||
959 | |||
960 | /** | ||
961 | * ktime_get_boottime - Returns monotonic time since boot in a ktime | ||
962 | * | ||
963 | * Returns the monotonic time since boot in a ktime | ||
964 | * | ||
965 | * This is similar to CLOCK_MONOTONIC/ktime_get, but also | ||
966 | * includes the time spent in suspend. | ||
967 | */ | ||
968 | ktime_t ktime_get_boottime(void) | ||
969 | { | ||
970 | struct timespec ts; | ||
971 | |||
972 | get_monotonic_boottime(&ts); | ||
973 | return timespec_to_ktime(ts); | ||
974 | } | ||
975 | EXPORT_SYMBOL_GPL(ktime_get_boottime); | ||
976 | |||
849 | /** | 977 | /** |
850 | * monotonic_to_bootbased - Convert the monotonic time to boot based. | 978 | * monotonic_to_bootbased - Convert the monotonic time to boot based. |
851 | * @ts: pointer to the timespec to be converted | 979 | * @ts: pointer to the timespec to be converted |
@@ -867,11 +995,6 @@ struct timespec __current_kernel_time(void) | |||
867 | return xtime; | 995 | return xtime; |
868 | } | 996 | } |
869 | 997 | ||
870 | struct timespec __get_wall_to_monotonic(void) | ||
871 | { | ||
872 | return wall_to_monotonic; | ||
873 | } | ||
874 | |||
875 | struct timespec current_kernel_time(void) | 998 | struct timespec current_kernel_time(void) |
876 | { | 999 | { |
877 | struct timespec now; | 1000 | struct timespec now; |
@@ -903,3 +1026,48 @@ struct timespec get_monotonic_coarse(void) | |||
903 | now.tv_nsec + mono.tv_nsec); | 1026 | now.tv_nsec + mono.tv_nsec); |
904 | return now; | 1027 | return now; |
905 | } | 1028 | } |
1029 | |||
1030 | /* | ||
1031 | * The 64-bit jiffies value is not atomic - you MUST NOT read it | ||
1032 | * without sampling the sequence number in xtime_lock. | ||
1033 | * jiffies is defined in the linker script... | ||
1034 | */ | ||
1035 | void do_timer(unsigned long ticks) | ||
1036 | { | ||
1037 | jiffies_64 += ticks; | ||
1038 | update_wall_time(); | ||
1039 | calc_global_load(ticks); | ||
1040 | } | ||
1041 | |||
1042 | /** | ||
1043 | * get_xtime_and_monotonic_and_sleep_offset() - get xtime, wall_to_monotonic, | ||
1044 | * and sleep offsets. | ||
1045 | * @xtim: pointer to timespec to be set with xtime | ||
1046 | * @wtom: pointer to timespec to be set with wall_to_monotonic | ||
1047 | * @sleep: pointer to timespec to be set with time in suspend | ||
1048 | */ | ||
1049 | void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, | ||
1050 | struct timespec *wtom, struct timespec *sleep) | ||
1051 | { | ||
1052 | unsigned long seq; | ||
1053 | |||
1054 | do { | ||
1055 | seq = read_seqbegin(&xtime_lock); | ||
1056 | *xtim = xtime; | ||
1057 | *wtom = wall_to_monotonic; | ||
1058 | *sleep = total_sleep_time; | ||
1059 | } while (read_seqretry(&xtime_lock, seq)); | ||
1060 | } | ||
1061 | |||
1062 | /** | ||
1063 | * xtime_update() - advances the timekeeping infrastructure | ||
1065 | * @ticks: number of ticks that have elapsed since the last call. | ||
1065 | * | ||
1066 | * Must be called with interrupts disabled. | ||
1067 | */ | ||
1068 | void xtime_update(unsigned long ticks) | ||
1069 | { | ||
1070 | write_seqlock(&xtime_lock); | ||
1071 | do_timer(ticks); | ||
1072 | write_sequnlock(&xtime_lock); | ||
1073 | } | ||
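With do_timer() moved into timekeeping.c and xtime_update() taking xtime_lock itself, architecture tick handlers no longer need to open-code the locking. A hedged sketch of the intended call pattern in a periodic timer interrupt (the handler name is invented):

/* Hypothetical arch tick handler using the new xtime_update() helper. */
static irqreturn_t arch_timer_interrupt(int irq, void *dev_id)
{
	/* previously:
	 *	write_seqlock(&xtime_lock);
	 *	do_timer(1);
	 *	write_sequnlock(&xtime_lock);
	 */
	xtime_update(1);	/* xtime_lock handled internally */
	update_process_times(user_mode(get_irq_regs()));

	return IRQ_HANDLED;
}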
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 32a19f9397fc..3258455549f4 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c | |||
@@ -41,7 +41,7 @@ static void print_name_offset(struct seq_file *m, void *sym) | |||
41 | char symname[KSYM_NAME_LEN]; | 41 | char symname[KSYM_NAME_LEN]; |
42 | 42 | ||
43 | if (lookup_symbol_name((unsigned long)sym, symname) < 0) | 43 | if (lookup_symbol_name((unsigned long)sym, symname) < 0) |
44 | SEQ_printf(m, "<%p>", sym); | 44 | SEQ_printf(m, "<%pK>", sym); |
45 | else | 45 | else |
46 | SEQ_printf(m, "%s", symname); | 46 | SEQ_printf(m, "%s", symname); |
47 | } | 47 | } |
@@ -112,7 +112,7 @@ next_one: | |||
112 | static void | 112 | static void |
113 | print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) | 113 | print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) |
114 | { | 114 | { |
115 | SEQ_printf(m, " .base: %p\n", base); | 115 | SEQ_printf(m, " .base: %pK\n", base); |
116 | SEQ_printf(m, " .index: %d\n", | 116 | SEQ_printf(m, " .index: %d\n", |
117 | base->index); | 117 | base->index); |
118 | SEQ_printf(m, " .resolution: %Lu nsecs\n", | 118 | SEQ_printf(m, " .resolution: %Lu nsecs\n", |
diff --git a/kernel/timer.c b/kernel/timer.c index 43ca9936f2d0..fd6198692b57 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
@@ -404,6 +404,11 @@ static void timer_stats_account_timer(struct timer_list *timer) {} | |||
404 | 404 | ||
405 | static struct debug_obj_descr timer_debug_descr; | 405 | static struct debug_obj_descr timer_debug_descr; |
406 | 406 | ||
407 | static void *timer_debug_hint(void *addr) | ||
408 | { | ||
409 | return ((struct timer_list *) addr)->function; | ||
410 | } | ||
411 | |||
407 | /* | 412 | /* |
408 | * fixup_init is called when: | 413 | * fixup_init is called when: |
409 | * - an active object is initialized | 414 | * - an active object is initialized |
@@ -477,6 +482,7 @@ static int timer_fixup_free(void *addr, enum debug_obj_state state) | |||
477 | 482 | ||
478 | static struct debug_obj_descr timer_debug_descr = { | 483 | static struct debug_obj_descr timer_debug_descr = { |
479 | .name = "timer_list", | 484 | .name = "timer_list", |
485 | .debug_hint = timer_debug_hint, | ||
480 | .fixup_init = timer_fixup_init, | 486 | .fixup_init = timer_fixup_init, |
481 | .fixup_activate = timer_fixup_activate, | 487 | .fixup_activate = timer_fixup_activate, |
482 | .fixup_free = timer_fixup_free, | 488 | .fixup_free = timer_fixup_free, |
@@ -959,20 +965,45 @@ EXPORT_SYMBOL(try_to_del_timer_sync); | |||
959 | * | 965 | * |
960 | * Synchronization rules: Callers must prevent restarting of the timer, | 966 | * Synchronization rules: Callers must prevent restarting of the timer, |
961 | * otherwise this function is meaningless. It must not be called from | 967 | * otherwise this function is meaningless. It must not be called from |
962 | * hardirq contexts. The caller must not hold locks which would prevent | 968 | * interrupt contexts. The caller must not hold locks which would prevent |
963 | * completion of the timer's handler. The timer's handler must not call | 969 | * completion of the timer's handler. The timer's handler must not call |
964 | * add_timer_on(). Upon exit the timer is not queued and the handler is | 970 | * add_timer_on(). Upon exit the timer is not queued and the handler is |
965 | * not running on any CPU. | 971 | * not running on any CPU. |
966 | * | 972 | * |
973 | * Note: You must not hold locks that are held in interrupt context | ||
974 | * while calling this function. Even if the lock has nothing to do | ||
975 | * with the timer in question. Here's why: | ||
976 | * | ||
977 | * CPU0 CPU1 | ||
978 | * ---- ---- | ||
979 | * <SOFTIRQ> | ||
980 | * call_timer_fn(); | ||
981 | * base->running_timer = mytimer; | ||
982 | * spin_lock_irq(somelock); | ||
983 | * <IRQ> | ||
984 | * spin_lock(somelock); | ||
985 | * del_timer_sync(mytimer); | ||
986 | * while (base->running_timer == mytimer); | ||
987 | * | ||
988 | * Now del_timer_sync() will never return and never release somelock. | ||
989 | * The interrupt on the other CPU is waiting to grab somelock but | ||
990 | * it has interrupted the softirq that CPU0 is waiting to finish. | ||
991 | * | ||
967 | * The function returns whether it has deactivated a pending timer or not. | 992 | * The function returns whether it has deactivated a pending timer or not. |
968 | */ | 993 | */ |
969 | int del_timer_sync(struct timer_list *timer) | 994 | int del_timer_sync(struct timer_list *timer) |
970 | { | 995 | { |
971 | #ifdef CONFIG_LOCKDEP | 996 | #ifdef CONFIG_LOCKDEP |
972 | local_bh_disable(); | 997 | unsigned long flags; |
998 | |||
999 | /* | ||
1000 | * If lockdep gives a backtrace here, please reference | ||
1001 | * the synchronization rules above. | ||
1002 | */ | ||
1003 | local_irq_save(flags); | ||
973 | lock_map_acquire(&timer->lockdep_map); | 1004 | lock_map_acquire(&timer->lockdep_map); |
974 | lock_map_release(&timer->lockdep_map); | 1005 | lock_map_release(&timer->lockdep_map); |
975 | local_bh_enable(); | 1006 | local_irq_restore(flags); |
976 | #endif | 1007 | #endif |
977 | /* | 1008 | /* |
978 | * don't use it in hardirq context, because it | 1009 | * don't use it in hardirq context, because it |
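The scenario spelled out in the new comment reduces to a simple rule: never call del_timer_sync() while holding any lock that an interrupt handler can take, even one unrelated to the timer. A sketch of the unsafe pattern and the straightforward fix (names hypothetical):

/* Hypothetical illustration of the del_timer_sync() locking rule above. */
static DEFINE_SPINLOCK(somelock);	/* also taken from an IRQ handler */
static struct timer_list mytimer;

static void unsafe_teardown(void)
{
	spin_lock_irq(&somelock);
	del_timer_sync(&mytimer);	/* may deadlock as described above */
	spin_unlock_irq(&somelock);
}

static void safe_teardown(void)
{
	del_timer_sync(&mytimer);	/* stop the timer first... */
	spin_lock_irq(&somelock);
	/* ...then take the lock for the remaining teardown work */
	spin_unlock_irq(&somelock);
}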
@@ -1293,19 +1324,6 @@ void run_local_timers(void) | |||
1293 | raise_softirq(TIMER_SOFTIRQ); | 1324 | raise_softirq(TIMER_SOFTIRQ); |
1294 | } | 1325 | } |
1295 | 1326 | ||
1296 | /* | ||
1297 | * The 64-bit jiffies value is not atomic - you MUST NOT read it | ||
1298 | * without sampling the sequence number in xtime_lock. | ||
1299 | * jiffies is defined in the linker script... | ||
1300 | */ | ||
1301 | |||
1302 | void do_timer(unsigned long ticks) | ||
1303 | { | ||
1304 | jiffies_64 += ticks; | ||
1305 | update_wall_time(); | ||
1306 | calc_global_load(ticks); | ||
1307 | } | ||
1308 | |||
1309 | #ifdef __ARCH_WANT_SYS_ALARM | 1327 | #ifdef __ARCH_WANT_SYS_ALARM |
1310 | 1328 | ||
1311 | /* | 1329 | /* |
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 53f338190b26..761c510a06c5 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile | |||
@@ -52,7 +52,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o | |||
52 | endif | 52 | endif |
53 | obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o | 53 | obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o |
54 | obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o | 54 | obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o |
55 | obj-$(CONFIG_EVENT_TRACING) += power-traces.o | 55 | obj-$(CONFIG_TRACEPOINTS) += power-traces.o |
56 | ifeq ($(CONFIG_TRACING),y) | 56 | ifeq ($(CONFIG_TRACING),y) |
57 | obj-$(CONFIG_KGDB_KDB) += trace_kdb.o | 57 | obj-$(CONFIG_KGDB_KDB) += trace_kdb.o |
58 | endif | 58 | endif |
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 7b8ec0281548..cbafed7d4f38 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c | |||
@@ -138,6 +138,13 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...) | |||
138 | !blk_tracer_enabled)) | 138 | !blk_tracer_enabled)) |
139 | return; | 139 | return; |
140 | 140 | ||
141 | /* | ||
142 | * If the BLK_TC_NOTIFY action mask isn't set, don't send any note | ||
143 | * message to the trace. | ||
144 | */ | ||
145 | if (!(bt->act_mask & BLK_TC_NOTIFY)) | ||
146 | return; | ||
147 | |||
141 | local_irq_save(flags); | 148 | local_irq_save(flags); |
142 | buf = per_cpu_ptr(bt->msg_data, smp_processor_id()); | 149 | buf = per_cpu_ptr(bt->msg_data, smp_processor_id()); |
143 | va_start(args, fmt); | 150 | va_start(args, fmt); |
@@ -758,53 +765,58 @@ static void blk_add_trace_rq_complete(void *ignore, | |||
758 | * @q: queue the io is for | 765 | * @q: queue the io is for |
759 | * @bio: the source bio | 766 | * @bio: the source bio |
760 | * @what: the action | 767 | * @what: the action |
768 | * @error: error, if any | ||
761 | * | 769 | * |
762 | * Description: | 770 | * Description: |
763 | * Records an action against a bio. Will log the bio offset + size. | 771 | * Records an action against a bio. Will log the bio offset + size. |
764 | * | 772 | * |
765 | **/ | 773 | **/ |
766 | static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, | 774 | static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, |
767 | u32 what) | 775 | u32 what, int error) |
768 | { | 776 | { |
769 | struct blk_trace *bt = q->blk_trace; | 777 | struct blk_trace *bt = q->blk_trace; |
770 | 778 | ||
771 | if (likely(!bt)) | 779 | if (likely(!bt)) |
772 | return; | 780 | return; |
773 | 781 | ||
782 | if (!error && !bio_flagged(bio, BIO_UPTODATE)) | ||
783 | error = EIO; | ||
784 | |||
774 | __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, | 785 | __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, |
775 | !bio_flagged(bio, BIO_UPTODATE), 0, NULL); | 786 | error, 0, NULL); |
776 | } | 787 | } |
777 | 788 | ||
778 | static void blk_add_trace_bio_bounce(void *ignore, | 789 | static void blk_add_trace_bio_bounce(void *ignore, |
779 | struct request_queue *q, struct bio *bio) | 790 | struct request_queue *q, struct bio *bio) |
780 | { | 791 | { |
781 | blk_add_trace_bio(q, bio, BLK_TA_BOUNCE); | 792 | blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0); |
782 | } | 793 | } |
783 | 794 | ||
784 | static void blk_add_trace_bio_complete(void *ignore, | 795 | static void blk_add_trace_bio_complete(void *ignore, |
785 | struct request_queue *q, struct bio *bio) | 796 | struct request_queue *q, struct bio *bio, |
797 | int error) | ||
786 | { | 798 | { |
787 | blk_add_trace_bio(q, bio, BLK_TA_COMPLETE); | 799 | blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error); |
788 | } | 800 | } |
789 | 801 | ||
790 | static void blk_add_trace_bio_backmerge(void *ignore, | 802 | static void blk_add_trace_bio_backmerge(void *ignore, |
791 | struct request_queue *q, | 803 | struct request_queue *q, |
792 | struct bio *bio) | 804 | struct bio *bio) |
793 | { | 805 | { |
794 | blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE); | 806 | blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0); |
795 | } | 807 | } |
796 | 808 | ||
797 | static void blk_add_trace_bio_frontmerge(void *ignore, | 809 | static void blk_add_trace_bio_frontmerge(void *ignore, |
798 | struct request_queue *q, | 810 | struct request_queue *q, |
799 | struct bio *bio) | 811 | struct bio *bio) |
800 | { | 812 | { |
801 | blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE); | 813 | blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0); |
802 | } | 814 | } |
803 | 815 | ||
804 | static void blk_add_trace_bio_queue(void *ignore, | 816 | static void blk_add_trace_bio_queue(void *ignore, |
805 | struct request_queue *q, struct bio *bio) | 817 | struct request_queue *q, struct bio *bio) |
806 | { | 818 | { |
807 | blk_add_trace_bio(q, bio, BLK_TA_QUEUE); | 819 | blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0); |
808 | } | 820 | } |
809 | 821 | ||
810 | static void blk_add_trace_getrq(void *ignore, | 822 | static void blk_add_trace_getrq(void *ignore, |
@@ -812,7 +824,7 @@ static void blk_add_trace_getrq(void *ignore, | |||
812 | struct bio *bio, int rw) | 824 | struct bio *bio, int rw) |
813 | { | 825 | { |
814 | if (bio) | 826 | if (bio) |
815 | blk_add_trace_bio(q, bio, BLK_TA_GETRQ); | 827 | blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0); |
816 | else { | 828 | else { |
817 | struct blk_trace *bt = q->blk_trace; | 829 | struct blk_trace *bt = q->blk_trace; |
818 | 830 | ||
@@ -827,7 +839,7 @@ static void blk_add_trace_sleeprq(void *ignore, | |||
827 | struct bio *bio, int rw) | 839 | struct bio *bio, int rw) |
828 | { | 840 | { |
829 | if (bio) | 841 | if (bio) |
830 | blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ); | 842 | blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0); |
831 | else { | 843 | else { |
832 | struct blk_trace *bt = q->blk_trace; | 844 | struct blk_trace *bt = q->blk_trace; |
833 | 845 | ||
@@ -887,7 +899,7 @@ static void blk_add_trace_split(void *ignore, | |||
887 | } | 899 | } |
888 | 900 | ||
889 | /** | 901 | /** |
890 | * blk_add_trace_remap - Add a trace for a remap operation | 902 | * blk_add_trace_bio_remap - Add a trace for a bio-remap operation |
891 | * @ignore: trace callback data parameter (not used) | 903 | * @ignore: trace callback data parameter (not used) |
892 | * @q: queue the io is for | 904 | * @q: queue the io is for |
893 | * @bio: the source bio | 905 | * @bio: the source bio |
@@ -899,9 +911,9 @@ static void blk_add_trace_split(void *ignore, | |||
899 | * it spans a stripe (or similar). Add a trace for that action. | 911 | * it spans a stripe (or similar). Add a trace for that action. |
900 | * | 912 | * |
901 | **/ | 913 | **/ |
902 | static void blk_add_trace_remap(void *ignore, | 914 | static void blk_add_trace_bio_remap(void *ignore, |
903 | struct request_queue *q, struct bio *bio, | 915 | struct request_queue *q, struct bio *bio, |
904 | dev_t dev, sector_t from) | 916 | dev_t dev, sector_t from) |
905 | { | 917 | { |
906 | struct blk_trace *bt = q->blk_trace; | 918 | struct blk_trace *bt = q->blk_trace; |
907 | struct blk_io_trace_remap r; | 919 | struct blk_io_trace_remap r; |
@@ -1016,7 +1028,7 @@ static void blk_register_tracepoints(void) | |||
1016 | WARN_ON(ret); | 1028 | WARN_ON(ret); |
1017 | ret = register_trace_block_split(blk_add_trace_split, NULL); | 1029 | ret = register_trace_block_split(blk_add_trace_split, NULL); |
1018 | WARN_ON(ret); | 1030 | WARN_ON(ret); |
1019 | ret = register_trace_block_remap(blk_add_trace_remap, NULL); | 1031 | ret = register_trace_block_bio_remap(blk_add_trace_bio_remap, NULL); |
1020 | WARN_ON(ret); | 1032 | WARN_ON(ret); |
1021 | ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); | 1033 | ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); |
1022 | WARN_ON(ret); | 1034 | WARN_ON(ret); |
@@ -1025,7 +1037,7 @@ static void blk_register_tracepoints(void) | |||
1025 | static void blk_unregister_tracepoints(void) | 1037 | static void blk_unregister_tracepoints(void) |
1026 | { | 1038 | { |
1027 | unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); | 1039 | unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); |
1028 | unregister_trace_block_remap(blk_add_trace_remap, NULL); | 1040 | unregister_trace_block_bio_remap(blk_add_trace_bio_remap, NULL); |
1029 | unregister_trace_block_split(blk_add_trace_split, NULL); | 1041 | unregister_trace_block_split(blk_add_trace_split, NULL); |
1030 | unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL); | 1042 | unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL); |
1031 | unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL); | 1043 | unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL); |
@@ -1815,21 +1827,5 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes) | |||
1815 | rwbs[i] = '\0'; | 1827 | rwbs[i] = '\0'; |
1816 | } | 1828 | } |
1817 | 1829 | ||
1818 | void blk_fill_rwbs_rq(char *rwbs, struct request *rq) | ||
1819 | { | ||
1820 | int rw = rq->cmd_flags & 0x03; | ||
1821 | int bytes; | ||
1822 | |||
1823 | if (rq->cmd_flags & REQ_DISCARD) | ||
1824 | rw |= REQ_DISCARD; | ||
1825 | |||
1826 | if (rq->cmd_flags & REQ_SECURE) | ||
1827 | rw |= REQ_SECURE; | ||
1828 | |||
1829 | bytes = blk_rq_bytes(rq); | ||
1830 | |||
1831 | blk_fill_rwbs(rwbs, rw, bytes); | ||
1832 | } | ||
1833 | |||
1834 | #endif /* CONFIG_EVENT_TRACING */ | 1830 | #endif /* CONFIG_EVENT_TRACING */ |
1835 | 1831 | ||
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f3dadae83883..888b611897d3 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
@@ -3328,7 +3328,7 @@ static int start_graph_tracing(void) | |||
3328 | /* The cpu_boot init_task->ret_stack will never be freed */ | 3328 | /* The cpu_boot init_task->ret_stack will never be freed */ |
3329 | for_each_online_cpu(cpu) { | 3329 | for_each_online_cpu(cpu) { |
3330 | if (!idle_task(cpu)->ret_stack) | 3330 | if (!idle_task(cpu)->ret_stack) |
3331 | ftrace_graph_init_task(idle_task(cpu)); | 3331 | ftrace_graph_init_idle_task(idle_task(cpu), cpu); |
3332 | } | 3332 | } |
3333 | 3333 | ||
3334 | do { | 3334 | do { |
@@ -3418,6 +3418,49 @@ void unregister_ftrace_graph(void) | |||
3418 | mutex_unlock(&ftrace_lock); | 3418 | mutex_unlock(&ftrace_lock); |
3419 | } | 3419 | } |
3420 | 3420 | ||
3421 | static DEFINE_PER_CPU(struct ftrace_ret_stack *, idle_ret_stack); | ||
3422 | |||
3423 | static void | ||
3424 | graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack) | ||
3425 | { | ||
3426 | atomic_set(&t->tracing_graph_pause, 0); | ||
3427 | atomic_set(&t->trace_overrun, 0); | ||
3428 | t->ftrace_timestamp = 0; | ||
3429 | /* make curr_ret_stack visible before we add the ret_stack */ | ||
3430 | smp_wmb(); | ||
3431 | t->ret_stack = ret_stack; | ||
3432 | } | ||
3433 | |||
3434 | /* | ||
3435 | * Allocate a return stack for the idle task. May be the first | ||
3436 | * time through, or it may be done by CPU hotplug online. | ||
3437 | */ | ||
3438 | void ftrace_graph_init_idle_task(struct task_struct *t, int cpu) | ||
3439 | { | ||
3440 | t->curr_ret_stack = -1; | ||
3441 | /* | ||
3442 | * The idle task has no parent; it either has its own | ||
3443 | * stack or no stack at all. | ||
3444 | */ | ||
3445 | if (t->ret_stack) | ||
3446 | WARN_ON(t->ret_stack != per_cpu(idle_ret_stack, cpu)); | ||
3447 | |||
3448 | if (ftrace_graph_active) { | ||
3449 | struct ftrace_ret_stack *ret_stack; | ||
3450 | |||
3451 | ret_stack = per_cpu(idle_ret_stack, cpu); | ||
3452 | if (!ret_stack) { | ||
3453 | ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH | ||
3454 | * sizeof(struct ftrace_ret_stack), | ||
3455 | GFP_KERNEL); | ||
3456 | if (!ret_stack) | ||
3457 | return; | ||
3458 | per_cpu(idle_ret_stack, cpu) = ret_stack; | ||
3459 | } | ||
3460 | graph_init_task(t, ret_stack); | ||
3461 | } | ||
3462 | } | ||
3463 | |||
3421 | /* Allocate a return stack for newly created task */ | 3464 | /* Allocate a return stack for newly created task */ |
3422 | void ftrace_graph_init_task(struct task_struct *t) | 3465 | void ftrace_graph_init_task(struct task_struct *t) |
3423 | { | 3466 | { |
@@ -3433,12 +3476,7 @@ void ftrace_graph_init_task(struct task_struct *t) | |||
3433 | GFP_KERNEL); | 3476 | GFP_KERNEL); |
3434 | if (!ret_stack) | 3477 | if (!ret_stack) |
3435 | return; | 3478 | return; |
3436 | atomic_set(&t->tracing_graph_pause, 0); | 3479 | graph_init_task(t, ret_stack); |
3437 | atomic_set(&t->trace_overrun, 0); | ||
3438 | t->ftrace_timestamp = 0; | ||
3439 | /* make curr_ret_stack visible before we add the ret_stack */ | ||
3440 | smp_wmb(); | ||
3441 | t->ret_stack = ret_stack; | ||
3442 | } | 3480 | } |
3443 | } | 3481 | } |
3444 | 3482 | ||
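
A quick way to see what the ftrace hunk above changes: the idle task's return stack is now cached per CPU and handed back out on every hotplug online instead of being reallocated. Below is a minimal user-space sketch of that allocate-once, reuse-forever pattern; NR_CPUS_DEMO, DEMO_DEPTH, demo_ret_stack and idle_stack_get() are invented for the example (DEMO_DEPTH merely stands in for FTRACE_RETFUNC_DEPTH), so this is an illustration of the pattern, not kernel code.

#include <stdio.h>
#include <stdlib.h>

#define NR_CPUS_DEMO    4
#define DEMO_DEPTH      50      /* stands in for FTRACE_RETFUNC_DEPTH */

struct demo_ret_stack { unsigned long ret, func; };

/* one cached stack per CPU, allocated on first use and never freed */
static struct demo_ret_stack *idle_stack[NR_CPUS_DEMO];

static struct demo_ret_stack *idle_stack_get(int cpu)
{
        if (!idle_stack[cpu]) {
                idle_stack[cpu] = calloc(DEMO_DEPTH, sizeof(**idle_stack));
                if (!idle_stack[cpu])
                        return NULL;    /* tracing just skips this CPU */
        }
        return idle_stack[cpu];         /* reused on every "online" event */
}

int main(void)
{
        /* two online/offline cycles on CPU 1 hand back the same buffer */
        struct demo_ret_stack *first  = idle_stack_get(1);
        struct demo_ret_stack *second = idle_stack_get(1);

        printf("reused: %s\n", first && first == second ? "yes" : "no");
        return 0;
}
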
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index bd1c35a4fbcc..db7b439d23ee 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
@@ -5,7 +5,6 @@ | |||
5 | */ | 5 | */ |
6 | #include <linux/ring_buffer.h> | 6 | #include <linux/ring_buffer.h> |
7 | #include <linux/trace_clock.h> | 7 | #include <linux/trace_clock.h> |
8 | #include <linux/ftrace_irq.h> | ||
9 | #include <linux/spinlock.h> | 8 | #include <linux/spinlock.h> |
10 | #include <linux/debugfs.h> | 9 | #include <linux/debugfs.h> |
11 | #include <linux/uaccess.h> | 10 | #include <linux/uaccess.h> |
@@ -1429,6 +1428,17 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) | |||
1429 | } | 1428 | } |
1430 | EXPORT_SYMBOL_GPL(ring_buffer_resize); | 1429 | EXPORT_SYMBOL_GPL(ring_buffer_resize); |
1431 | 1430 | ||
1431 | void ring_buffer_change_overwrite(struct ring_buffer *buffer, int val) | ||
1432 | { | ||
1433 | mutex_lock(&buffer->mutex); | ||
1434 | if (val) | ||
1435 | buffer->flags |= RB_FL_OVERWRITE; | ||
1436 | else | ||
1437 | buffer->flags &= ~RB_FL_OVERWRITE; | ||
1438 | mutex_unlock(&buffer->mutex); | ||
1439 | } | ||
1440 | EXPORT_SYMBOL_GPL(ring_buffer_change_overwrite); | ||
1441 | |||
1432 | static inline void * | 1442 | static inline void * |
1433 | __rb_data_page_index(struct buffer_data_page *bpage, unsigned index) | 1443 | __rb_data_page_index(struct buffer_data_page *bpage, unsigned index) |
1434 | { | 1444 | { |
@@ -2162,11 +2172,19 @@ rb_reserve_next_event(struct ring_buffer *buffer, | |||
2162 | if (likely(ts >= cpu_buffer->write_stamp)) { | 2172 | if (likely(ts >= cpu_buffer->write_stamp)) { |
2163 | delta = diff; | 2173 | delta = diff; |
2164 | if (unlikely(test_time_stamp(delta))) { | 2174 | if (unlikely(test_time_stamp(delta))) { |
2175 | int local_clock_stable = 1; | ||
2176 | #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK | ||
2177 | local_clock_stable = sched_clock_stable; | ||
2178 | #endif | ||
2165 | WARN_ONCE(delta > (1ULL << 59), | 2179 | WARN_ONCE(delta > (1ULL << 59), |
2166 | KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n", | 2180 | KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", |
2167 | (unsigned long long)delta, | 2181 | (unsigned long long)delta, |
2168 | (unsigned long long)ts, | 2182 | (unsigned long long)ts, |
2169 | (unsigned long long)cpu_buffer->write_stamp); | 2183 | (unsigned long long)cpu_buffer->write_stamp, |
2184 | local_clock_stable ? "" : | ||
2185 | "If you just came from a suspend/resume,\n" | ||
2186 | "please switch to the trace global clock:\n" | ||
2187 | " echo global > /sys/kernel/debug/tracing/trace_clock\n"); | ||
2170 | add_timestamp = 1; | 2188 | add_timestamp = 1; |
2171 | } | 2189 | } |
2172 | } | 2190 | } |
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f8cf959bad45..9541c27c1cf2 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
@@ -41,8 +41,6 @@ | |||
41 | #include "trace.h" | 41 | #include "trace.h" |
42 | #include "trace_output.h" | 42 | #include "trace_output.h" |
43 | 43 | ||
44 | #define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE) | ||
45 | |||
46 | /* | 44 | /* |
47 | * On boot up, the ring buffer is set to the minimum size, so that | 45 | * On boot up, the ring buffer is set to the minimum size, so that |
48 | * we do not waste memory on systems that are not using tracing. | 46 | * we do not waste memory on systems that are not using tracing. |
@@ -340,7 +338,7 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); | |||
340 | /* trace_flags holds trace_options default values */ | 338 | /* trace_flags holds trace_options default values */ |
341 | unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | | 339 | unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | |
342 | TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | | 340 | TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | |
343 | TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD; | 341 | TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE; |
344 | 342 | ||
345 | static int trace_stop_count; | 343 | static int trace_stop_count; |
346 | static DEFINE_SPINLOCK(tracing_start_lock); | 344 | static DEFINE_SPINLOCK(tracing_start_lock); |
@@ -425,6 +423,7 @@ static const char *trace_options[] = { | |||
425 | "sleep-time", | 423 | "sleep-time", |
426 | "graph-time", | 424 | "graph-time", |
427 | "record-cmd", | 425 | "record-cmd", |
426 | "overwrite", | ||
428 | NULL | 427 | NULL |
429 | }; | 428 | }; |
430 | 429 | ||
@@ -780,6 +779,11 @@ __acquires(kernel_lock) | |||
780 | tracing_reset_online_cpus(tr); | 779 | tracing_reset_online_cpus(tr); |
781 | 780 | ||
782 | current_trace = type; | 781 | current_trace = type; |
782 | |||
783 | /* If we expanded the buffers, make sure the max is expanded too */ | ||
784 | if (ring_buffer_expanded && type->use_max_tr) | ||
785 | ring_buffer_resize(max_tr.buffer, trace_buf_size); | ||
786 | |||
783 | /* the test is responsible for initializing and enabling */ | 787 | /* the test is responsible for initializing and enabling */ |
784 | pr_info("Testing tracer %s: ", type->name); | 788 | pr_info("Testing tracer %s: ", type->name); |
785 | ret = type->selftest(type, tr); | 789 | ret = type->selftest(type, tr); |
@@ -792,6 +796,10 @@ __acquires(kernel_lock) | |||
792 | /* Only reset on passing, to avoid touching corrupted buffers */ | 796 | /* Only reset on passing, to avoid touching corrupted buffers */ |
793 | tracing_reset_online_cpus(tr); | 797 | tracing_reset_online_cpus(tr); |
794 | 798 | ||
799 | /* Shrink the max buffer again */ | ||
800 | if (ring_buffer_expanded && type->use_max_tr) | ||
801 | ring_buffer_resize(max_tr.buffer, 1); | ||
802 | |||
795 | printk(KERN_CONT "PASSED\n"); | 803 | printk(KERN_CONT "PASSED\n"); |
796 | } | 804 | } |
797 | #endif | 805 | #endif |
@@ -1102,7 +1110,6 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, | |||
1102 | 1110 | ||
1103 | entry->preempt_count = pc & 0xff; | 1111 | entry->preempt_count = pc & 0xff; |
1104 | entry->pid = (tsk) ? tsk->pid : 0; | 1112 | entry->pid = (tsk) ? tsk->pid : 0; |
1105 | entry->lock_depth = (tsk) ? tsk->lock_depth : 0; | ||
1106 | entry->flags = | 1113 | entry->flags = |
1107 | #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT | 1114 | #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT |
1108 | (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | | 1115 | (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | |
@@ -1313,12 +1320,10 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) | |||
1313 | 1320 | ||
1314 | __this_cpu_inc(user_stack_count); | 1321 | __this_cpu_inc(user_stack_count); |
1315 | 1322 | ||
1316 | |||
1317 | |||
1318 | event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, | 1323 | event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, |
1319 | sizeof(*entry), flags, pc); | 1324 | sizeof(*entry), flags, pc); |
1320 | if (!event) | 1325 | if (!event) |
1321 | return; | 1326 | goto out_drop_count; |
1322 | entry = ring_buffer_event_data(event); | 1327 | entry = ring_buffer_event_data(event); |
1323 | 1328 | ||
1324 | entry->tgid = current->tgid; | 1329 | entry->tgid = current->tgid; |
@@ -1333,8 +1338,8 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) | |||
1333 | if (!filter_check_discard(call, entry, buffer, event)) | 1338 | if (!filter_check_discard(call, entry, buffer, event)) |
1334 | ring_buffer_unlock_commit(buffer, event); | 1339 | ring_buffer_unlock_commit(buffer, event); |
1335 | 1340 | ||
1341 | out_drop_count: | ||
1336 | __this_cpu_dec(user_stack_count); | 1342 | __this_cpu_dec(user_stack_count); |
1337 | |||
1338 | out: | 1343 | out: |
1339 | preempt_enable(); | 1344 | preempt_enable(); |
1340 | } | 1345 | } |
@@ -1751,10 +1756,9 @@ static void print_lat_help_header(struct seq_file *m) | |||
1751 | seq_puts(m, "# | / _----=> need-resched \n"); | 1756 | seq_puts(m, "# | / _----=> need-resched \n"); |
1752 | seq_puts(m, "# || / _---=> hardirq/softirq \n"); | 1757 | seq_puts(m, "# || / _---=> hardirq/softirq \n"); |
1753 | seq_puts(m, "# ||| / _--=> preempt-depth \n"); | 1758 | seq_puts(m, "# ||| / _--=> preempt-depth \n"); |
1754 | seq_puts(m, "# |||| /_--=> lock-depth \n"); | 1759 | seq_puts(m, "# |||| / delay \n"); |
1755 | seq_puts(m, "# |||||/ delay \n"); | 1760 | seq_puts(m, "# cmd pid ||||| time | caller \n"); |
1756 | seq_puts(m, "# cmd pid |||||| time | caller \n"); | 1761 | seq_puts(m, "# \\ / ||||| \\ | / \n"); |
1757 | seq_puts(m, "# \\ / |||||| \\ | / \n"); | ||
1758 | } | 1762 | } |
1759 | 1763 | ||
1760 | static void print_func_help_header(struct seq_file *m) | 1764 | static void print_func_help_header(struct seq_file *m) |
@@ -2531,6 +2535,9 @@ static void set_tracer_flags(unsigned int mask, int enabled) | |||
2531 | 2535 | ||
2532 | if (mask == TRACE_ITER_RECORD_CMD) | 2536 | if (mask == TRACE_ITER_RECORD_CMD) |
2533 | trace_event_enable_cmd_record(enabled); | 2537 | trace_event_enable_cmd_record(enabled); |
2538 | |||
2539 | if (mask == TRACE_ITER_OVERWRITE) | ||
2540 | ring_buffer_change_overwrite(global_trace.buffer, enabled); | ||
2534 | } | 2541 | } |
2535 | 2542 | ||
2536 | static ssize_t | 2543 | static ssize_t |
@@ -2712,6 +2719,10 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf, | |||
2712 | 2719 | ||
2713 | mutex_lock(&trace_types_lock); | 2720 | mutex_lock(&trace_types_lock); |
2714 | if (tracer_enabled ^ val) { | 2721 | if (tracer_enabled ^ val) { |
2722 | |||
2723 | /* Only need to warn if this is used to change the state */ | ||
2724 | WARN_ONCE(1, "tracing_enabled is deprecated. Use tracing_on"); | ||
2725 | |||
2715 | if (val) { | 2726 | if (val) { |
2716 | tracer_enabled = 1; | 2727 | tracer_enabled = 1; |
2717 | if (current_trace->start) | 2728 | if (current_trace->start) |
@@ -4553,9 +4564,11 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) | |||
4553 | __init static int tracer_alloc_buffers(void) | 4564 | __init static int tracer_alloc_buffers(void) |
4554 | { | 4565 | { |
4555 | int ring_buf_size; | 4566 | int ring_buf_size; |
4567 | enum ring_buffer_flags rb_flags; | ||
4556 | int i; | 4568 | int i; |
4557 | int ret = -ENOMEM; | 4569 | int ret = -ENOMEM; |
4558 | 4570 | ||
4571 | |||
4559 | if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) | 4572 | if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) |
4560 | goto out; | 4573 | goto out; |
4561 | 4574 | ||
@@ -4568,12 +4581,13 @@ __init static int tracer_alloc_buffers(void) | |||
4568 | else | 4581 | else |
4569 | ring_buf_size = 1; | 4582 | ring_buf_size = 1; |
4570 | 4583 | ||
4584 | rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0; | ||
4585 | |||
4571 | cpumask_copy(tracing_buffer_mask, cpu_possible_mask); | 4586 | cpumask_copy(tracing_buffer_mask, cpu_possible_mask); |
4572 | cpumask_copy(tracing_cpumask, cpu_all_mask); | 4587 | cpumask_copy(tracing_cpumask, cpu_all_mask); |
4573 | 4588 | ||
4574 | /* TODO: make the number of buffers hot pluggable with CPUS */ | 4589 | /* TODO: make the number of buffers hot pluggable with CPUS */ |
4575 | global_trace.buffer = ring_buffer_alloc(ring_buf_size, | 4590 | global_trace.buffer = ring_buffer_alloc(ring_buf_size, rb_flags); |
4576 | TRACE_BUFFER_FLAGS); | ||
4577 | if (!global_trace.buffer) { | 4591 | if (!global_trace.buffer) { |
4578 | printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); | 4592 | printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); |
4579 | WARN_ON(1); | 4593 | WARN_ON(1); |
@@ -4583,7 +4597,7 @@ __init static int tracer_alloc_buffers(void) | |||
4583 | 4597 | ||
4584 | 4598 | ||
4585 | #ifdef CONFIG_TRACER_MAX_TRACE | 4599 | #ifdef CONFIG_TRACER_MAX_TRACE |
4586 | max_tr.buffer = ring_buffer_alloc(1, TRACE_BUFFER_FLAGS); | 4600 | max_tr.buffer = ring_buffer_alloc(1, rb_flags); |
4587 | if (!max_tr.buffer) { | 4601 | if (!max_tr.buffer) { |
4588 | printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); | 4602 | printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); |
4589 | WARN_ON(1); | 4603 | WARN_ON(1); |
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 9021f8c0c0c3..5e9dfc6286dd 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
@@ -272,8 +272,8 @@ struct tracer { | |||
272 | /* If you handled the flag setting, return 0 */ | 272 | /* If you handled the flag setting, return 0 */ |
273 | int (*set_flag)(u32 old_flags, u32 bit, int set); | 273 | int (*set_flag)(u32 old_flags, u32 bit, int set); |
274 | struct tracer *next; | 274 | struct tracer *next; |
275 | int print_max; | ||
276 | struct tracer_flags *flags; | 275 | struct tracer_flags *flags; |
276 | int print_max; | ||
277 | int use_max_tr; | 277 | int use_max_tr; |
278 | }; | 278 | }; |
279 | 279 | ||
@@ -606,6 +606,7 @@ enum trace_iterator_flags { | |||
606 | TRACE_ITER_SLEEP_TIME = 0x40000, | 606 | TRACE_ITER_SLEEP_TIME = 0x40000, |
607 | TRACE_ITER_GRAPH_TIME = 0x80000, | 607 | TRACE_ITER_GRAPH_TIME = 0x80000, |
608 | TRACE_ITER_RECORD_CMD = 0x100000, | 608 | TRACE_ITER_RECORD_CMD = 0x100000, |
609 | TRACE_ITER_OVERWRITE = 0x200000, | ||
609 | }; | 610 | }; |
610 | 611 | ||
611 | /* | 612 | /* |
@@ -661,8 +662,10 @@ struct ftrace_event_field { | |||
661 | }; | 662 | }; |
662 | 663 | ||
663 | struct event_filter { | 664 | struct event_filter { |
664 | int n_preds; | 665 | int n_preds; /* Number assigned */ |
665 | struct filter_pred **preds; | 666 | int a_preds; /* allocated */ |
667 | struct filter_pred *preds; | ||
668 | struct filter_pred *root; | ||
666 | char *filter_string; | 669 | char *filter_string; |
667 | }; | 670 | }; |
668 | 671 | ||
@@ -674,11 +677,23 @@ struct event_subsystem { | |||
674 | int nr_events; | 677 | int nr_events; |
675 | }; | 678 | }; |
676 | 679 | ||
680 | #define FILTER_PRED_INVALID ((unsigned short)-1) | ||
681 | #define FILTER_PRED_IS_RIGHT (1 << 15) | ||
682 | #define FILTER_PRED_FOLD (1 << 15) | ||
683 | |||
684 | /* | ||
685 | * The max number of preds is limited by the size of an unsigned | ||
686 | * short with two flag bits at the MSBs. One bit is used for both the IS_RIGHT | ||
687 | * and FOLD flags. The other is reserved. | ||
688 | * | ||
689 | * 2^14 preds is way more than enough. | ||
690 | */ | ||
691 | #define MAX_FILTER_PRED 16384 | ||
692 | |||
677 | struct filter_pred; | 693 | struct filter_pred; |
678 | struct regex; | 694 | struct regex; |
679 | 695 | ||
680 | typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event, | 696 | typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event); |
681 | int val1, int val2); | ||
682 | 697 | ||
683 | typedef int (*regex_match_func)(char *str, struct regex *r, int len); | 698 | typedef int (*regex_match_func)(char *str, struct regex *r, int len); |
684 | 699 | ||
@@ -700,11 +715,23 @@ struct filter_pred { | |||
700 | filter_pred_fn_t fn; | 715 | filter_pred_fn_t fn; |
701 | u64 val; | 716 | u64 val; |
702 | struct regex regex; | 717 | struct regex regex; |
703 | char *field_name; | 718 | /* |
719 | * Leaf nodes use field_name; ops is used by AND and OR | ||
720 | * nodes. The field_name is always freed when freeing a pred. | ||
721 | * We can overload field_name for ops and have it freed | ||
722 | * as well. | ||
723 | */ | ||
724 | union { | ||
725 | char *field_name; | ||
726 | unsigned short *ops; | ||
727 | }; | ||
704 | int offset; | 728 | int offset; |
705 | int not; | 729 | int not; |
706 | int op; | 730 | int op; |
707 | int pop_n; | 731 | unsigned short index; |
732 | unsigned short parent; | ||
733 | unsigned short left; | ||
734 | unsigned short right; | ||
708 | }; | 735 | }; |
709 | 736 | ||
710 | extern struct list_head ftrace_common_fields; | 737 | extern struct list_head ftrace_common_fields; |
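
The reworked struct filter_pred above packs the tree position into 16-bit indexes, with the MSB of parent marking a right child and left == FILTER_PRED_INVALID marking a leaf. The sketch below shows how that encoding reads for a two-leaf OR; FILTER_PRED_INVALID and FILTER_PRED_IS_RIGHT are copied from the header, while demo_pred and the concrete slot numbers are made up for the example.

#include <stdio.h>

#define FILTER_PRED_INVALID  ((unsigned short)-1)
#define FILTER_PRED_IS_RIGHT (1 << 15)

struct demo_pred {
        unsigned short index;   /* my slot in the preds[] array */
        unsigned short parent;  /* parent slot, MSB set if I am the right child */
        unsigned short left;    /* child slots, FILTER_PRED_INVALID for a leaf */
        unsigned short right;
};

int main(void)
{
        /* "a == 1 || b == 2": preds[0] and preds[1] are leaves, preds[2] is the OR */
        struct demo_pred preds[3] = {
                { .index = 0, .parent = 2,                        .left = FILTER_PRED_INVALID },
                { .index = 1, .parent = 2 | FILTER_PRED_IS_RIGHT, .left = FILTER_PRED_INVALID },
                { .index = 2, .left = 0, .right = 1 },
        };
        unsigned short p = preds[1].parent;

        printf("leaf 1: parent=%u, right child=%s\n",
               p & ~FILTER_PRED_IS_RIGHT,
               (p & FILTER_PRED_IS_RIGHT) ? "yes" : "no");
        printf("node 2 is a leaf: %s\n",
               preds[2].left == FILTER_PRED_INVALID ? "yes" : "no");
        return 0;
}
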
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index e3dfecaf13e6..1516cb3ec549 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h | |||
@@ -53,7 +53,7 @@ | |||
53 | */ | 53 | */ |
54 | 54 | ||
55 | /* | 55 | /* |
56 | * Function trace entry - function address and parent function addres: | 56 | * Function trace entry - function address and parent function address: |
57 | */ | 57 | */ |
58 | FTRACE_ENTRY(function, ftrace_entry, | 58 | FTRACE_ENTRY(function, ftrace_entry, |
59 | 59 | ||
@@ -109,12 +109,12 @@ FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry, | |||
109 | */ | 109 | */ |
110 | #define FTRACE_CTX_FIELDS \ | 110 | #define FTRACE_CTX_FIELDS \ |
111 | __field( unsigned int, prev_pid ) \ | 111 | __field( unsigned int, prev_pid ) \ |
112 | __field( unsigned int, next_pid ) \ | ||
113 | __field( unsigned int, next_cpu ) \ | ||
112 | __field( unsigned char, prev_prio ) \ | 114 | __field( unsigned char, prev_prio ) \ |
113 | __field( unsigned char, prev_state ) \ | 115 | __field( unsigned char, prev_state ) \ |
114 | __field( unsigned int, next_pid ) \ | ||
115 | __field( unsigned char, next_prio ) \ | 116 | __field( unsigned char, next_prio ) \ |
116 | __field( unsigned char, next_state ) \ | 117 | __field( unsigned char, next_state ) |
117 | __field( unsigned int, next_cpu ) | ||
118 | 118 | ||
119 | FTRACE_ENTRY(context_switch, ctx_switch_entry, | 119 | FTRACE_ENTRY(context_switch, ctx_switch_entry, |
120 | 120 | ||
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 35fde09b81de..e88f74fe1d4c 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
@@ -116,7 +116,6 @@ static int trace_define_common_fields(void) | |||
116 | __common_field(unsigned char, flags); | 116 | __common_field(unsigned char, flags); |
117 | __common_field(unsigned char, preempt_count); | 117 | __common_field(unsigned char, preempt_count); |
118 | __common_field(int, pid); | 118 | __common_field(int, pid); |
119 | __common_field(int, lock_depth); | ||
120 | 119 | ||
121 | return ret; | 120 | return ret; |
122 | } | 121 | } |
@@ -326,6 +325,7 @@ int trace_set_clr_event(const char *system, const char *event, int set) | |||
326 | { | 325 | { |
327 | return __ftrace_set_clr_event(NULL, system, event, set); | 326 | return __ftrace_set_clr_event(NULL, system, event, set); |
328 | } | 327 | } |
328 | EXPORT_SYMBOL_GPL(trace_set_clr_event); | ||
329 | 329 | ||
330 | /* 128 should be much more than enough */ | 330 | /* 128 should be much more than enough */ |
331 | #define EVENT_BUF_SIZE 127 | 331 | #define EVENT_BUF_SIZE 127 |
@@ -1284,7 +1284,7 @@ trace_create_file_ops(struct module *mod) | |||
1284 | static void trace_module_add_events(struct module *mod) | 1284 | static void trace_module_add_events(struct module *mod) |
1285 | { | 1285 | { |
1286 | struct ftrace_module_file_ops *file_ops = NULL; | 1286 | struct ftrace_module_file_ops *file_ops = NULL; |
1287 | struct ftrace_event_call *call, *start, *end; | 1287 | struct ftrace_event_call **call, **start, **end; |
1288 | 1288 | ||
1289 | start = mod->trace_events; | 1289 | start = mod->trace_events; |
1290 | end = mod->trace_events + mod->num_trace_events; | 1290 | end = mod->trace_events + mod->num_trace_events; |
@@ -1297,7 +1297,7 @@ static void trace_module_add_events(struct module *mod) | |||
1297 | return; | 1297 | return; |
1298 | 1298 | ||
1299 | for_each_event(call, start, end) { | 1299 | for_each_event(call, start, end) { |
1300 | __trace_add_event_call(call, mod, | 1300 | __trace_add_event_call(*call, mod, |
1301 | &file_ops->id, &file_ops->enable, | 1301 | &file_ops->id, &file_ops->enable, |
1302 | &file_ops->filter, &file_ops->format); | 1302 | &file_ops->filter, &file_ops->format); |
1303 | } | 1303 | } |
@@ -1367,8 +1367,8 @@ static struct notifier_block trace_module_nb = { | |||
1367 | .priority = 0, | 1367 | .priority = 0, |
1368 | }; | 1368 | }; |
1369 | 1369 | ||
1370 | extern struct ftrace_event_call __start_ftrace_events[]; | 1370 | extern struct ftrace_event_call *__start_ftrace_events[]; |
1371 | extern struct ftrace_event_call __stop_ftrace_events[]; | 1371 | extern struct ftrace_event_call *__stop_ftrace_events[]; |
1372 | 1372 | ||
1373 | static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata; | 1373 | static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata; |
1374 | 1374 | ||
@@ -1384,7 +1384,7 @@ __setup("trace_event=", setup_trace_event); | |||
1384 | 1384 | ||
1385 | static __init int event_trace_init(void) | 1385 | static __init int event_trace_init(void) |
1386 | { | 1386 | { |
1387 | struct ftrace_event_call *call; | 1387 | struct ftrace_event_call **call; |
1388 | struct dentry *d_tracer; | 1388 | struct dentry *d_tracer; |
1389 | struct dentry *entry; | 1389 | struct dentry *entry; |
1390 | struct dentry *d_events; | 1390 | struct dentry *d_events; |
@@ -1430,7 +1430,7 @@ static __init int event_trace_init(void) | |||
1430 | pr_warning("tracing: Failed to allocate common fields"); | 1430 | pr_warning("tracing: Failed to allocate common fields"); |
1431 | 1431 | ||
1432 | for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { | 1432 | for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { |
1433 | __trace_add_event_call(call, NULL, &ftrace_event_id_fops, | 1433 | __trace_add_event_call(*call, NULL, &ftrace_event_id_fops, |
1434 | &ftrace_enable_fops, | 1434 | &ftrace_enable_fops, |
1435 | &ftrace_event_filter_fops, | 1435 | &ftrace_event_filter_fops, |
1436 | &ftrace_event_format_fops); | 1436 | &ftrace_event_format_fops); |
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 36d40104b17f..3249b4f77ef0 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c | |||
@@ -123,9 +123,13 @@ struct filter_parse_state { | |||
123 | } operand; | 123 | } operand; |
124 | }; | 124 | }; |
125 | 125 | ||
126 | struct pred_stack { | ||
127 | struct filter_pred **preds; | ||
128 | int index; | ||
129 | }; | ||
130 | |||
126 | #define DEFINE_COMPARISON_PRED(type) \ | 131 | #define DEFINE_COMPARISON_PRED(type) \ |
127 | static int filter_pred_##type(struct filter_pred *pred, void *event, \ | 132 | static int filter_pred_##type(struct filter_pred *pred, void *event) \ |
128 | int val1, int val2) \ | ||
129 | { \ | 133 | { \ |
130 | type *addr = (type *)(event + pred->offset); \ | 134 | type *addr = (type *)(event + pred->offset); \ |
131 | type val = (type)pred->val; \ | 135 | type val = (type)pred->val; \ |
@@ -152,8 +156,7 @@ static int filter_pred_##type(struct filter_pred *pred, void *event, \ | |||
152 | } | 156 | } |
153 | 157 | ||
154 | #define DEFINE_EQUALITY_PRED(size) \ | 158 | #define DEFINE_EQUALITY_PRED(size) \ |
155 | static int filter_pred_##size(struct filter_pred *pred, void *event, \ | 159 | static int filter_pred_##size(struct filter_pred *pred, void *event) \ |
156 | int val1, int val2) \ | ||
157 | { \ | 160 | { \ |
158 | u##size *addr = (u##size *)(event + pred->offset); \ | 161 | u##size *addr = (u##size *)(event + pred->offset); \ |
159 | u##size val = (u##size)pred->val; \ | 162 | u##size val = (u##size)pred->val; \ |
@@ -178,23 +181,8 @@ DEFINE_EQUALITY_PRED(32); | |||
178 | DEFINE_EQUALITY_PRED(16); | 181 | DEFINE_EQUALITY_PRED(16); |
179 | DEFINE_EQUALITY_PRED(8); | 182 | DEFINE_EQUALITY_PRED(8); |
180 | 183 | ||
181 | static int filter_pred_and(struct filter_pred *pred __attribute((unused)), | ||
182 | void *event __attribute((unused)), | ||
183 | int val1, int val2) | ||
184 | { | ||
185 | return val1 && val2; | ||
186 | } | ||
187 | |||
188 | static int filter_pred_or(struct filter_pred *pred __attribute((unused)), | ||
189 | void *event __attribute((unused)), | ||
190 | int val1, int val2) | ||
191 | { | ||
192 | return val1 || val2; | ||
193 | } | ||
194 | |||
195 | /* Filter predicate for fixed sized arrays of characters */ | 184 | /* Filter predicate for fixed sized arrays of characters */ |
196 | static int filter_pred_string(struct filter_pred *pred, void *event, | 185 | static int filter_pred_string(struct filter_pred *pred, void *event) |
197 | int val1, int val2) | ||
198 | { | 186 | { |
199 | char *addr = (char *)(event + pred->offset); | 187 | char *addr = (char *)(event + pred->offset); |
200 | int cmp, match; | 188 | int cmp, match; |
@@ -207,8 +195,7 @@ static int filter_pred_string(struct filter_pred *pred, void *event, | |||
207 | } | 195 | } |
208 | 196 | ||
209 | /* Filter predicate for char * pointers */ | 197 | /* Filter predicate for char * pointers */ |
210 | static int filter_pred_pchar(struct filter_pred *pred, void *event, | 198 | static int filter_pred_pchar(struct filter_pred *pred, void *event) |
211 | int val1, int val2) | ||
212 | { | 199 | { |
213 | char **addr = (char **)(event + pred->offset); | 200 | char **addr = (char **)(event + pred->offset); |
214 | int cmp, match; | 201 | int cmp, match; |
@@ -231,8 +218,7 @@ static int filter_pred_pchar(struct filter_pred *pred, void *event, | |||
231 | * and add it to the address of the entry, and at last we have | 218 | * and add it to the address of the entry, and at last we have |
232 | * the address of the string. | 219 | * the address of the string. |
233 | */ | 220 | */ |
234 | static int filter_pred_strloc(struct filter_pred *pred, void *event, | 221 | static int filter_pred_strloc(struct filter_pred *pred, void *event) |
235 | int val1, int val2) | ||
236 | { | 222 | { |
237 | u32 str_item = *(u32 *)(event + pred->offset); | 223 | u32 str_item = *(u32 *)(event + pred->offset); |
238 | int str_loc = str_item & 0xffff; | 224 | int str_loc = str_item & 0xffff; |
@@ -247,8 +233,7 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event, | |||
247 | return match; | 233 | return match; |
248 | } | 234 | } |
249 | 235 | ||
250 | static int filter_pred_none(struct filter_pred *pred, void *event, | 236 | static int filter_pred_none(struct filter_pred *pred, void *event) |
251 | int val1, int val2) | ||
252 | { | 237 | { |
253 | return 0; | 238 | return 0; |
254 | } | 239 | } |
@@ -377,32 +362,147 @@ static void filter_build_regex(struct filter_pred *pred) | |||
377 | pred->not ^= not; | 362 | pred->not ^= not; |
378 | } | 363 | } |
379 | 364 | ||
365 | enum move_type { | ||
366 | MOVE_DOWN, | ||
367 | MOVE_UP_FROM_LEFT, | ||
368 | MOVE_UP_FROM_RIGHT | ||
369 | }; | ||
370 | |||
371 | static struct filter_pred * | ||
372 | get_pred_parent(struct filter_pred *pred, struct filter_pred *preds, | ||
373 | int index, enum move_type *move) | ||
374 | { | ||
375 | if (pred->parent & FILTER_PRED_IS_RIGHT) | ||
376 | *move = MOVE_UP_FROM_RIGHT; | ||
377 | else | ||
378 | *move = MOVE_UP_FROM_LEFT; | ||
379 | pred = &preds[pred->parent & ~FILTER_PRED_IS_RIGHT]; | ||
380 | |||
381 | return pred; | ||
382 | } | ||
383 | |||
384 | /* | ||
385 | * A series of ANDs or ORs were found together. Instead of | ||
386 | * climbing up and down the tree branches, an array of the | ||
387 | * ops was made in the order of checks. We can just move across | ||
388 | * the array and short circuit if needed. | ||
389 | */ | ||
390 | static int process_ops(struct filter_pred *preds, | ||
391 | struct filter_pred *op, void *rec) | ||
392 | { | ||
393 | struct filter_pred *pred; | ||
394 | int type; | ||
395 | int match; | ||
396 | int i; | ||
397 | |||
398 | /* | ||
399 | * Micro-optimization: We set type to true if op | ||
400 | * is an OR and false otherwise (AND). Then we | ||
401 | * just need to test if the match is equal to | ||
402 | * the type, and if it is, we can short circuit the | ||
403 | * rest of the checks: | ||
404 | * | ||
405 | * if ((match && op->op == OP_OR) || | ||
406 | * (!match && op->op == OP_AND)) | ||
407 | * return match; | ||
408 | */ | ||
409 | type = op->op == OP_OR; | ||
410 | |||
411 | for (i = 0; i < op->val; i++) { | ||
412 | pred = &preds[op->ops[i]]; | ||
413 | match = pred->fn(pred, rec); | ||
414 | if (!!match == type) | ||
415 | return match; | ||
416 | } | ||
417 | return match; | ||
418 | } | ||
419 | |||
380 | /* return 1 if event matches, 0 otherwise (discard) */ | 420 | /* return 1 if event matches, 0 otherwise (discard) */ |
381 | int filter_match_preds(struct event_filter *filter, void *rec) | 421 | int filter_match_preds(struct event_filter *filter, void *rec) |
382 | { | 422 | { |
383 | int match, top = 0, val1 = 0, val2 = 0; | 423 | int match = -1; |
384 | int stack[MAX_FILTER_PRED]; | 424 | enum move_type move = MOVE_DOWN; |
425 | struct filter_pred *preds; | ||
385 | struct filter_pred *pred; | 426 | struct filter_pred *pred; |
386 | int i; | 427 | struct filter_pred *root; |
428 | int n_preds; | ||
429 | int done = 0; | ||
430 | |||
431 | /* no filter is considered a match */ | ||
432 | if (!filter) | ||
433 | return 1; | ||
434 | |||
435 | n_preds = filter->n_preds; | ||
436 | |||
437 | if (!n_preds) | ||
438 | return 1; | ||
439 | |||
440 | /* | ||
441 | * n_preds, root and filter->preds are protected with preemption disabled. | ||
442 | */ | ||
443 | preds = rcu_dereference_sched(filter->preds); | ||
444 | root = rcu_dereference_sched(filter->root); | ||
445 | if (!root) | ||
446 | return 1; | ||
447 | |||
448 | pred = root; | ||
387 | 449 | ||
388 | for (i = 0; i < filter->n_preds; i++) { | 450 | /* match is currently meaningless */ |
389 | pred = filter->preds[i]; | 451 | match = -1; |
390 | if (!pred->pop_n) { | 452 | |
391 | match = pred->fn(pred, rec, val1, val2); | 453 | do { |
392 | stack[top++] = match; | 454 | switch (move) { |
455 | case MOVE_DOWN: | ||
456 | /* only AND and OR have children */ | ||
457 | if (pred->left != FILTER_PRED_INVALID) { | ||
458 | /* If ops is set, then it was folded. */ | ||
459 | if (!pred->ops) { | ||
460 | /* keep going down the left side */ | ||
461 | pred = &preds[pred->left]; | ||
462 | continue; | ||
463 | } | ||
464 | /* We can treat folded ops as a leaf node */ | ||
465 | match = process_ops(preds, pred, rec); | ||
466 | } else | ||
467 | match = pred->fn(pred, rec); | ||
468 | /* If this pred is the only pred */ | ||
469 | if (pred == root) | ||
470 | break; | ||
471 | pred = get_pred_parent(pred, preds, | ||
472 | pred->parent, &move); | ||
473 | continue; | ||
474 | case MOVE_UP_FROM_LEFT: | ||
475 | /* | ||
476 | * Check for short circuits. | ||
477 | * | ||
478 | * Optimization: !!match == (pred->op == OP_OR) | ||
479 | * is the same as: | ||
480 | * if ((match && pred->op == OP_OR) || | ||
481 | * (!match && pred->op == OP_AND)) | ||
482 | */ | ||
483 | if (!!match == (pred->op == OP_OR)) { | ||
484 | if (pred == root) | ||
485 | break; | ||
486 | pred = get_pred_parent(pred, preds, | ||
487 | pred->parent, &move); | ||
488 | continue; | ||
489 | } | ||
490 | /* now go down the right side of the tree. */ | ||
491 | pred = &preds[pred->right]; | ||
492 | move = MOVE_DOWN; | ||
493 | continue; | ||
494 | case MOVE_UP_FROM_RIGHT: | ||
495 | /* We finished this equation. */ | ||
496 | if (pred == root) | ||
497 | break; | ||
498 | pred = get_pred_parent(pred, preds, | ||
499 | pred->parent, &move); | ||
393 | continue; | 500 | continue; |
394 | } | 501 | } |
395 | if (pred->pop_n > top) { | 502 | done = 1; |
396 | WARN_ON_ONCE(1); | 503 | } while (!done); |
397 | return 0; | ||
398 | } | ||
399 | val1 = stack[--top]; | ||
400 | val2 = stack[--top]; | ||
401 | match = pred->fn(pred, rec, val1, val2); | ||
402 | stack[top++] = match; | ||
403 | } | ||
404 | 504 | ||
405 | return stack[--top]; | 505 | return match; |
406 | } | 506 | } |
407 | EXPORT_SYMBOL_GPL(filter_match_preds); | 507 | EXPORT_SYMBOL_GPL(filter_match_preds); |
408 | 508 | ||
@@ -414,6 +514,9 @@ static void parse_error(struct filter_parse_state *ps, int err, int pos) | |||
414 | 514 | ||
415 | static void remove_filter_string(struct event_filter *filter) | 515 | static void remove_filter_string(struct event_filter *filter) |
416 | { | 516 | { |
517 | if (!filter) | ||
518 | return; | ||
519 | |||
417 | kfree(filter->filter_string); | 520 | kfree(filter->filter_string); |
418 | filter->filter_string = NULL; | 521 | filter->filter_string = NULL; |
419 | } | 522 | } |
@@ -473,9 +576,10 @@ static void append_filter_err(struct filter_parse_state *ps, | |||
473 | 576 | ||
474 | void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) | 577 | void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) |
475 | { | 578 | { |
476 | struct event_filter *filter = call->filter; | 579 | struct event_filter *filter; |
477 | 580 | ||
478 | mutex_lock(&event_mutex); | 581 | mutex_lock(&event_mutex); |
582 | filter = call->filter; | ||
479 | if (filter && filter->filter_string) | 583 | if (filter && filter->filter_string) |
480 | trace_seq_printf(s, "%s\n", filter->filter_string); | 584 | trace_seq_printf(s, "%s\n", filter->filter_string); |
481 | else | 585 | else |
@@ -486,9 +590,10 @@ void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) | |||
486 | void print_subsystem_event_filter(struct event_subsystem *system, | 590 | void print_subsystem_event_filter(struct event_subsystem *system, |
487 | struct trace_seq *s) | 591 | struct trace_seq *s) |
488 | { | 592 | { |
489 | struct event_filter *filter = system->filter; | 593 | struct event_filter *filter; |
490 | 594 | ||
491 | mutex_lock(&event_mutex); | 595 | mutex_lock(&event_mutex); |
596 | filter = system->filter; | ||
492 | if (filter && filter->filter_string) | 597 | if (filter && filter->filter_string) |
493 | trace_seq_printf(s, "%s\n", filter->filter_string); | 598 | trace_seq_printf(s, "%s\n", filter->filter_string); |
494 | else | 599 | else |
@@ -539,10 +644,58 @@ static void filter_clear_pred(struct filter_pred *pred) | |||
539 | pred->regex.len = 0; | 644 | pred->regex.len = 0; |
540 | } | 645 | } |
541 | 646 | ||
542 | static int filter_set_pred(struct filter_pred *dest, | 647 | static int __alloc_pred_stack(struct pred_stack *stack, int n_preds) |
648 | { | ||
649 | stack->preds = kzalloc(sizeof(*stack->preds)*(n_preds + 1), GFP_KERNEL); | ||
650 | if (!stack->preds) | ||
651 | return -ENOMEM; | ||
652 | stack->index = n_preds; | ||
653 | return 0; | ||
654 | } | ||
655 | |||
656 | static void __free_pred_stack(struct pred_stack *stack) | ||
657 | { | ||
658 | kfree(stack->preds); | ||
659 | stack->index = 0; | ||
660 | } | ||
661 | |||
662 | static int __push_pred_stack(struct pred_stack *stack, | ||
663 | struct filter_pred *pred) | ||
664 | { | ||
665 | int index = stack->index; | ||
666 | |||
667 | if (WARN_ON(index == 0)) | ||
668 | return -ENOSPC; | ||
669 | |||
670 | stack->preds[--index] = pred; | ||
671 | stack->index = index; | ||
672 | return 0; | ||
673 | } | ||
674 | |||
675 | static struct filter_pred * | ||
676 | __pop_pred_stack(struct pred_stack *stack) | ||
677 | { | ||
678 | struct filter_pred *pred; | ||
679 | int index = stack->index; | ||
680 | |||
681 | pred = stack->preds[index++]; | ||
682 | if (!pred) | ||
683 | return NULL; | ||
684 | |||
685 | stack->index = index; | ||
686 | return pred; | ||
687 | } | ||
688 | |||
689 | static int filter_set_pred(struct event_filter *filter, | ||
690 | int idx, | ||
691 | struct pred_stack *stack, | ||
543 | struct filter_pred *src, | 692 | struct filter_pred *src, |
544 | filter_pred_fn_t fn) | 693 | filter_pred_fn_t fn) |
545 | { | 694 | { |
695 | struct filter_pred *dest = &filter->preds[idx]; | ||
696 | struct filter_pred *left; | ||
697 | struct filter_pred *right; | ||
698 | |||
546 | *dest = *src; | 699 | *dest = *src; |
547 | if (src->field_name) { | 700 | if (src->field_name) { |
548 | dest->field_name = kstrdup(src->field_name, GFP_KERNEL); | 701 | dest->field_name = kstrdup(src->field_name, GFP_KERNEL); |
@@ -550,116 +703,140 @@ static int filter_set_pred(struct filter_pred *dest, | |||
550 | return -ENOMEM; | 703 | return -ENOMEM; |
551 | } | 704 | } |
552 | dest->fn = fn; | 705 | dest->fn = fn; |
706 | dest->index = idx; | ||
553 | 707 | ||
554 | return 0; | 708 | if (dest->op == OP_OR || dest->op == OP_AND) { |
709 | right = __pop_pred_stack(stack); | ||
710 | left = __pop_pred_stack(stack); | ||
711 | if (!left || !right) | ||
712 | return -EINVAL; | ||
713 | /* | ||
714 | * If both children can be folded | ||
715 | * and they are the same op as this op or a leaf, | ||
716 | * then this op can be folded. | ||
717 | */ | ||
718 | if (left->index & FILTER_PRED_FOLD && | ||
719 | (left->op == dest->op || | ||
720 | left->left == FILTER_PRED_INVALID) && | ||
721 | right->index & FILTER_PRED_FOLD && | ||
722 | (right->op == dest->op || | ||
723 | right->left == FILTER_PRED_INVALID)) | ||
724 | dest->index |= FILTER_PRED_FOLD; | ||
725 | |||
726 | dest->left = left->index & ~FILTER_PRED_FOLD; | ||
727 | dest->right = right->index & ~FILTER_PRED_FOLD; | ||
728 | left->parent = dest->index & ~FILTER_PRED_FOLD; | ||
729 | right->parent = dest->index | FILTER_PRED_IS_RIGHT; | ||
730 | } else { | ||
731 | /* | ||
732 | * Make dest->left invalid to be used as a quick | ||
733 | * way to know this is a leaf node. | ||
734 | */ | ||
735 | dest->left = FILTER_PRED_INVALID; | ||
736 | |||
737 | /* All leaves allow folding the parent ops. */ | ||
738 | dest->index |= FILTER_PRED_FOLD; | ||
739 | } | ||
740 | |||
741 | return __push_pred_stack(stack, dest); | ||
555 | } | 742 | } |
556 | 743 | ||
557 | static void filter_disable_preds(struct ftrace_event_call *call) | 744 | static void __free_preds(struct event_filter *filter) |
558 | { | 745 | { |
559 | struct event_filter *filter = call->filter; | ||
560 | int i; | 746 | int i; |
561 | 747 | ||
562 | call->flags &= ~TRACE_EVENT_FL_FILTERED; | 748 | if (filter->preds) { |
749 | for (i = 0; i < filter->a_preds; i++) | ||
750 | kfree(filter->preds[i].field_name); | ||
751 | kfree(filter->preds); | ||
752 | filter->preds = NULL; | ||
753 | } | ||
754 | filter->a_preds = 0; | ||
563 | filter->n_preds = 0; | 755 | filter->n_preds = 0; |
564 | |||
565 | for (i = 0; i < MAX_FILTER_PRED; i++) | ||
566 | filter->preds[i]->fn = filter_pred_none; | ||
567 | } | 756 | } |
568 | 757 | ||
569 | static void __free_preds(struct event_filter *filter) | 758 | static void filter_disable(struct ftrace_event_call *call) |
570 | { | 759 | { |
571 | int i; | 760 | call->flags &= ~TRACE_EVENT_FL_FILTERED; |
761 | } | ||
572 | 762 | ||
763 | static void __free_filter(struct event_filter *filter) | ||
764 | { | ||
573 | if (!filter) | 765 | if (!filter) |
574 | return; | 766 | return; |
575 | 767 | ||
576 | for (i = 0; i < MAX_FILTER_PRED; i++) { | 768 | __free_preds(filter); |
577 | if (filter->preds[i]) | ||
578 | filter_free_pred(filter->preds[i]); | ||
579 | } | ||
580 | kfree(filter->preds); | ||
581 | kfree(filter->filter_string); | 769 | kfree(filter->filter_string); |
582 | kfree(filter); | 770 | kfree(filter); |
583 | } | 771 | } |
584 | 772 | ||
773 | /* | ||
774 | * Called when destroying the ftrace_event_call. | ||
775 | * The call is being freed, so we do not need to worry about | ||
776 | * the call being currently used. This is for module code removing | ||
777 | * the tracepoints from within it. | ||
778 | */ | ||
585 | void destroy_preds(struct ftrace_event_call *call) | 779 | void destroy_preds(struct ftrace_event_call *call) |
586 | { | 780 | { |
587 | __free_preds(call->filter); | 781 | __free_filter(call->filter); |
588 | call->filter = NULL; | 782 | call->filter = NULL; |
589 | call->flags &= ~TRACE_EVENT_FL_FILTERED; | ||
590 | } | 783 | } |
591 | 784 | ||
592 | static struct event_filter *__alloc_preds(void) | 785 | static struct event_filter *__alloc_filter(void) |
593 | { | 786 | { |
594 | struct event_filter *filter; | 787 | struct event_filter *filter; |
788 | |||
789 | filter = kzalloc(sizeof(*filter), GFP_KERNEL); | ||
790 | return filter; | ||
791 | } | ||
792 | |||
793 | static int __alloc_preds(struct event_filter *filter, int n_preds) | ||
794 | { | ||
595 | struct filter_pred *pred; | 795 | struct filter_pred *pred; |
596 | int i; | 796 | int i; |
597 | 797 | ||
598 | filter = kzalloc(sizeof(*filter), GFP_KERNEL); | 798 | if (filter->preds) |
599 | if (!filter) | 799 | __free_preds(filter); |
600 | return ERR_PTR(-ENOMEM); | ||
601 | 800 | ||
602 | filter->n_preds = 0; | 801 | filter->preds = |
802 | kzalloc(sizeof(*filter->preds) * n_preds, GFP_KERNEL); | ||
603 | 803 | ||
604 | filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL); | ||
605 | if (!filter->preds) | 804 | if (!filter->preds) |
606 | goto oom; | 805 | return -ENOMEM; |
607 | 806 | ||
608 | for (i = 0; i < MAX_FILTER_PRED; i++) { | 807 | filter->a_preds = n_preds; |
609 | pred = kzalloc(sizeof(*pred), GFP_KERNEL); | 808 | filter->n_preds = 0; |
610 | if (!pred) | 809 | |
611 | goto oom; | 810 | for (i = 0; i < n_preds; i++) { |
811 | pred = &filter->preds[i]; | ||
612 | pred->fn = filter_pred_none; | 812 | pred->fn = filter_pred_none; |
613 | filter->preds[i] = pred; | ||
614 | } | 813 | } |
615 | 814 | ||
616 | return filter; | ||
617 | |||
618 | oom: | ||
619 | __free_preds(filter); | ||
620 | return ERR_PTR(-ENOMEM); | ||
621 | } | ||
622 | |||
623 | static int init_preds(struct ftrace_event_call *call) | ||
624 | { | ||
625 | if (call->filter) | ||
626 | return 0; | ||
627 | |||
628 | call->flags &= ~TRACE_EVENT_FL_FILTERED; | ||
629 | call->filter = __alloc_preds(); | ||
630 | if (IS_ERR(call->filter)) | ||
631 | return PTR_ERR(call->filter); | ||
632 | |||
633 | return 0; | 815 | return 0; |
634 | } | 816 | } |
635 | 817 | ||
636 | static int init_subsystem_preds(struct event_subsystem *system) | 818 | static void filter_free_subsystem_preds(struct event_subsystem *system) |
637 | { | 819 | { |
638 | struct ftrace_event_call *call; | 820 | struct ftrace_event_call *call; |
639 | int err; | ||
640 | 821 | ||
641 | list_for_each_entry(call, &ftrace_events, list) { | 822 | list_for_each_entry(call, &ftrace_events, list) { |
642 | if (strcmp(call->class->system, system->name) != 0) | 823 | if (strcmp(call->class->system, system->name) != 0) |
643 | continue; | 824 | continue; |
644 | 825 | ||
645 | err = init_preds(call); | 826 | filter_disable(call); |
646 | if (err) | 827 | remove_filter_string(call->filter); |
647 | return err; | ||
648 | } | 828 | } |
649 | |||
650 | return 0; | ||
651 | } | 829 | } |
652 | 830 | ||
653 | static void filter_free_subsystem_preds(struct event_subsystem *system) | 831 | static void filter_free_subsystem_filters(struct event_subsystem *system) |
654 | { | 832 | { |
655 | struct ftrace_event_call *call; | 833 | struct ftrace_event_call *call; |
656 | 834 | ||
657 | list_for_each_entry(call, &ftrace_events, list) { | 835 | list_for_each_entry(call, &ftrace_events, list) { |
658 | if (strcmp(call->class->system, system->name) != 0) | 836 | if (strcmp(call->class->system, system->name) != 0) |
659 | continue; | 837 | continue; |
660 | 838 | __free_filter(call->filter); | |
661 | filter_disable_preds(call); | 839 | call->filter = NULL; |
662 | remove_filter_string(call->filter); | ||
663 | } | 840 | } |
664 | } | 841 | } |
665 | 842 | ||
@@ -667,18 +844,19 @@ static int filter_add_pred_fn(struct filter_parse_state *ps, | |||
667 | struct ftrace_event_call *call, | 844 | struct ftrace_event_call *call, |
668 | struct event_filter *filter, | 845 | struct event_filter *filter, |
669 | struct filter_pred *pred, | 846 | struct filter_pred *pred, |
847 | struct pred_stack *stack, | ||
670 | filter_pred_fn_t fn) | 848 | filter_pred_fn_t fn) |
671 | { | 849 | { |
672 | int idx, err; | 850 | int idx, err; |
673 | 851 | ||
674 | if (filter->n_preds == MAX_FILTER_PRED) { | 852 | if (WARN_ON(filter->n_preds == filter->a_preds)) { |
675 | parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); | 853 | parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); |
676 | return -ENOSPC; | 854 | return -ENOSPC; |
677 | } | 855 | } |
678 | 856 | ||
679 | idx = filter->n_preds; | 857 | idx = filter->n_preds; |
680 | filter_clear_pred(filter->preds[idx]); | 858 | filter_clear_pred(&filter->preds[idx]); |
681 | err = filter_set_pred(filter->preds[idx], pred, fn); | 859 | err = filter_set_pred(filter, idx, stack, pred, fn); |
682 | if (err) | 860 | if (err) |
683 | return err; | 861 | return err; |
684 | 862 | ||
@@ -763,6 +941,7 @@ static int filter_add_pred(struct filter_parse_state *ps, | |||
763 | struct ftrace_event_call *call, | 941 | struct ftrace_event_call *call, |
764 | struct event_filter *filter, | 942 | struct event_filter *filter, |
765 | struct filter_pred *pred, | 943 | struct filter_pred *pred, |
944 | struct pred_stack *stack, | ||
766 | bool dry_run) | 945 | bool dry_run) |
767 | { | 946 | { |
768 | struct ftrace_event_field *field; | 947 | struct ftrace_event_field *field; |
@@ -770,17 +949,12 @@ static int filter_add_pred(struct filter_parse_state *ps, | |||
770 | unsigned long long val; | 949 | unsigned long long val; |
771 | int ret; | 950 | int ret; |
772 | 951 | ||
773 | pred->fn = filter_pred_none; | 952 | fn = pred->fn = filter_pred_none; |
774 | 953 | ||
775 | if (pred->op == OP_AND) { | 954 | if (pred->op == OP_AND) |
776 | pred->pop_n = 2; | ||
777 | fn = filter_pred_and; | ||
778 | goto add_pred_fn; | 955 | goto add_pred_fn; |
779 | } else if (pred->op == OP_OR) { | 956 | else if (pred->op == OP_OR) |
780 | pred->pop_n = 2; | ||
781 | fn = filter_pred_or; | ||
782 | goto add_pred_fn; | 957 | goto add_pred_fn; |
783 | } | ||
784 | 958 | ||
785 | field = find_event_field(call, pred->field_name); | 959 | field = find_event_field(call, pred->field_name); |
786 | if (!field) { | 960 | if (!field) { |
@@ -829,7 +1003,7 @@ static int filter_add_pred(struct filter_parse_state *ps, | |||
829 | 1003 | ||
830 | add_pred_fn: | 1004 | add_pred_fn: |
831 | if (!dry_run) | 1005 | if (!dry_run) |
832 | return filter_add_pred_fn(ps, call, filter, pred, fn); | 1006 | return filter_add_pred_fn(ps, call, filter, pred, stack, fn); |
833 | return 0; | 1007 | return 0; |
834 | } | 1008 | } |
835 | 1009 | ||
@@ -1187,6 +1361,234 @@ static int check_preds(struct filter_parse_state *ps) | |||
1187 | return 0; | 1361 | return 0; |
1188 | } | 1362 | } |
1189 | 1363 | ||
1364 | static int count_preds(struct filter_parse_state *ps) | ||
1365 | { | ||
1366 | struct postfix_elt *elt; | ||
1367 | int n_preds = 0; | ||
1368 | |||
1369 | list_for_each_entry(elt, &ps->postfix, list) { | ||
1370 | if (elt->op == OP_NONE) | ||
1371 | continue; | ||
1372 | n_preds++; | ||
1373 | } | ||
1374 | |||
1375 | return n_preds; | ||
1376 | } | ||
1377 | |||
1378 | /* | ||
1379 | * The tree is walked when an event is filtered. If the tree is not correctly | ||
1380 | * built, it may cause an infinite loop. Check here that walking the tree does | ||
1381 | * indeed terminate. | ||
1382 | */ | ||
1383 | static int check_pred_tree(struct event_filter *filter, | ||
1384 | struct filter_pred *root) | ||
1385 | { | ||
1386 | struct filter_pred *preds; | ||
1387 | struct filter_pred *pred; | ||
1388 | enum move_type move = MOVE_DOWN; | ||
1389 | int count = 0; | ||
1390 | int done = 0; | ||
1391 | int max; | ||
1392 | |||
1393 | /* | ||
1394 | * A node can be hit at most three times: | ||
1395 | * once going down, once coming up from the left, and | ||
1396 | * once coming up from the right. This is more than enough | ||
1397 | * since leaves are only hit a single time. | ||
1398 | */ | ||
1399 | max = 3 * filter->n_preds; | ||
1400 | |||
1401 | preds = filter->preds; | ||
1402 | if (!preds) | ||
1403 | return -EINVAL; | ||
1404 | pred = root; | ||
1405 | |||
1406 | do { | ||
1407 | if (WARN_ON(count++ > max)) | ||
1408 | return -EINVAL; | ||
1409 | |||
1410 | switch (move) { | ||
1411 | case MOVE_DOWN: | ||
1412 | if (pred->left != FILTER_PRED_INVALID) { | ||
1413 | pred = &preds[pred->left]; | ||
1414 | continue; | ||
1415 | } | ||
1416 | /* A leaf at the root is just a leaf in the tree */ | ||
1417 | if (pred == root) | ||
1418 | break; | ||
1419 | pred = get_pred_parent(pred, preds, | ||
1420 | pred->parent, &move); | ||
1421 | continue; | ||
1422 | case MOVE_UP_FROM_LEFT: | ||
1423 | pred = &preds[pred->right]; | ||
1424 | move = MOVE_DOWN; | ||
1425 | continue; | ||
1426 | case MOVE_UP_FROM_RIGHT: | ||
1427 | if (pred == root) | ||
1428 | break; | ||
1429 | pred = get_pred_parent(pred, preds, | ||
1430 | pred->parent, &move); | ||
1431 | continue; | ||
1432 | } | ||
1433 | done = 1; | ||
1434 | } while (!done); | ||
1435 | |||
1436 | /* We are fine. */ | ||
1437 | return 0; | ||
1438 | } | ||
1439 | |||
1440 | static int count_leafs(struct filter_pred *preds, struct filter_pred *root) | ||
1441 | { | ||
1442 | struct filter_pred *pred; | ||
1443 | enum move_type move = MOVE_DOWN; | ||
1444 | int count = 0; | ||
1445 | int done = 0; | ||
1446 | |||
1447 | pred = root; | ||
1448 | |||
1449 | do { | ||
1450 | switch (move) { | ||
1451 | case MOVE_DOWN: | ||
1452 | if (pred->left != FILTER_PRED_INVALID) { | ||
1453 | pred = &preds[pred->left]; | ||
1454 | continue; | ||
1455 | } | ||
1456 | /* A leaf at the root is just a leaf in the tree */ | ||
1457 | if (pred == root) | ||
1458 | return 1; | ||
1459 | count++; | ||
1460 | pred = get_pred_parent(pred, preds, | ||
1461 | pred->parent, &move); | ||
1462 | continue; | ||
1463 | case MOVE_UP_FROM_LEFT: | ||
1464 | pred = &preds[pred->right]; | ||
1465 | move = MOVE_DOWN; | ||
1466 | continue; | ||
1467 | case MOVE_UP_FROM_RIGHT: | ||
1468 | if (pred == root) | ||
1469 | break; | ||
1470 | pred = get_pred_parent(pred, preds, | ||
1471 | pred->parent, &move); | ||
1472 | continue; | ||
1473 | } | ||
1474 | done = 1; | ||
1475 | } while (!done); | ||
1476 | |||
1477 | return count; | ||
1478 | } | ||
1479 | |||
1480 | static int fold_pred(struct filter_pred *preds, struct filter_pred *root) | ||
1481 | { | ||
1482 | struct filter_pred *pred; | ||
1483 | enum move_type move = MOVE_DOWN; | ||
1484 | int count = 0; | ||
1485 | int children; | ||
1486 | int done = 0; | ||
1487 | |||
1488 | /* No need to keep the fold flag */ | ||
1489 | root->index &= ~FILTER_PRED_FOLD; | ||
1490 | |||
1491 | /* If the root is a leaf then do nothing */ | ||
1492 | if (root->left == FILTER_PRED_INVALID) | ||
1493 | return 0; | ||
1494 | |||
1495 | /* count the children */ | ||
1496 | children = count_leafs(preds, &preds[root->left]); | ||
1497 | children += count_leafs(preds, &preds[root->right]); | ||
1498 | |||
1499 | root->ops = kzalloc(sizeof(*root->ops) * children, GFP_KERNEL); | ||
1500 | if (!root->ops) | ||
1501 | return -ENOMEM; | ||
1502 | |||
1503 | root->val = children; | ||
1504 | |||
1505 | pred = root; | ||
1506 | do { | ||
1507 | switch (move) { | ||
1508 | case MOVE_DOWN: | ||
1509 | if (pred->left != FILTER_PRED_INVALID) { | ||
1510 | pred = &preds[pred->left]; | ||
1511 | continue; | ||
1512 | } | ||
1513 | if (WARN_ON(count == children)) | ||
1514 | return -EINVAL; | ||
1515 | pred->index &= ~FILTER_PRED_FOLD; | ||
1516 | root->ops[count++] = pred->index; | ||
1517 | pred = get_pred_parent(pred, preds, | ||
1518 | pred->parent, &move); | ||
1519 | continue; | ||
1520 | case MOVE_UP_FROM_LEFT: | ||
1521 | pred = &preds[pred->right]; | ||
1522 | move = MOVE_DOWN; | ||
1523 | continue; | ||
1524 | case MOVE_UP_FROM_RIGHT: | ||
1525 | if (pred == root) | ||
1526 | break; | ||
1527 | pred = get_pred_parent(pred, preds, | ||
1528 | pred->parent, &move); | ||
1529 | continue; | ||
1530 | } | ||
1531 | done = 1; | ||
1532 | } while (!done); | ||
1533 | |||
1534 | return 0; | ||
1535 | } | ||
1536 | |||
1537 | /* | ||
1538 | * To optimize the processing of the ops, if we have several "ors" or | ||
1539 | * "ands" together, we can put them in an array and process them all | ||
1540 | * together, speeding up the filter logic. | ||
1541 | */ | ||
1542 | static int fold_pred_tree(struct event_filter *filter, | ||
1543 | struct filter_pred *root) | ||
1544 | { | ||
1545 | struct filter_pred *preds; | ||
1546 | struct filter_pred *pred; | ||
1547 | enum move_type move = MOVE_DOWN; | ||
1548 | int done = 0; | ||
1549 | int err; | ||
1550 | |||
1551 | preds = filter->preds; | ||
1552 | if (!preds) | ||
1553 | return -EINVAL; | ||
1554 | pred = root; | ||
1555 | |||
1556 | do { | ||
1557 | switch (move) { | ||
1558 | case MOVE_DOWN: | ||
1559 | if (pred->index & FILTER_PRED_FOLD) { | ||
1560 | err = fold_pred(preds, pred); | ||
1561 | if (err) | ||
1562 | return err; | ||
1563 | /* Folded nodes are like leaves */ | ||
1564 | } else if (pred->left != FILTER_PRED_INVALID) { | ||
1565 | pred = &preds[pred->left]; | ||
1566 | continue; | ||
1567 | } | ||
1568 | |||
1569 | /* A leaf at the root is just a leaf in the tree */ | ||
1570 | if (pred == root) | ||
1571 | break; | ||
1572 | pred = get_pred_parent(pred, preds, | ||
1573 | pred->parent, &move); | ||
1574 | continue; | ||
1575 | case MOVE_UP_FROM_LEFT: | ||
1576 | pred = &preds[pred->right]; | ||
1577 | move = MOVE_DOWN; | ||
1578 | continue; | ||
1579 | case MOVE_UP_FROM_RIGHT: | ||
1580 | if (pred == root) | ||
1581 | break; | ||
1582 | pred = get_pred_parent(pred, preds, | ||
1583 | pred->parent, &move); | ||
1584 | continue; | ||
1585 | } | ||
1586 | done = 1; | ||
1587 | } while (!done); | ||
1588 | |||
1589 | return 0; | ||
1590 | } | ||
1591 | |||
1190 | static int replace_preds(struct ftrace_event_call *call, | 1592 | static int replace_preds(struct ftrace_event_call *call, |
1191 | struct event_filter *filter, | 1593 | struct event_filter *filter, |
1192 | struct filter_parse_state *ps, | 1594 | struct filter_parse_state *ps, |
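
To make the fold in the hunk above concrete: for a filter like "a == 1 && b == 2 && c == 3", the nested AND nodes collapse so the surviving root carries an ops[] array of leaf indexes and evaluation becomes one flat, short-circuited loop. The runnable sketch below mimics that result; demo_event, the leafN() helpers, eval_folded_and() and the index values are all made up for the example, only the shape mirrors the kernel structures.

#include <stdio.h>

#define NLEAVES 3

struct demo_event { int a, b, c; };

static int leaf0(struct demo_event *e) { return e->a == 1; }
static int leaf1(struct demo_event *e) { return e->b == 2; }
static int leaf2(struct demo_event *e) { return e->c == 3; }

static int (*leaves[NLEAVES])(struct demo_event *) = { leaf0, leaf1, leaf2 };

/* folded root: op == AND, val == number of ops, ops[] == leaf indexes */
static const unsigned short root_ops[NLEAVES] = { 0, 1, 2 };

static int eval_folded_and(struct demo_event *e)
{
        int match = 1;
        int i;

        for (i = 0; i < NLEAVES; i++) {
                match = leaves[root_ops[i]](e);
                if (!match)             /* AND: the first 0 decides the result */
                        return 0;
        }
        return match;
}

int main(void)
{
        struct demo_event hit  = { 1, 2, 3 };
        struct demo_event miss = { 1, 7, 3 };

        printf("hit:  %d\n", eval_folded_and(&hit));   /* 1 */
        printf("miss: %d\n", eval_folded_and(&miss));  /* 0, stops at leaf1 */
        return 0;
}
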
@@ -1195,14 +1597,32 @@ static int replace_preds(struct ftrace_event_call *call, | |||
1195 | { | 1597 | { |
1196 | char *operand1 = NULL, *operand2 = NULL; | 1598 | char *operand1 = NULL, *operand2 = NULL; |
1197 | struct filter_pred *pred; | 1599 | struct filter_pred *pred; |
1600 | struct filter_pred *root; | ||
1198 | struct postfix_elt *elt; | 1601 | struct postfix_elt *elt; |
1602 | struct pred_stack stack = { }; /* init to NULL */ | ||
1199 | int err; | 1603 | int err; |
1200 | int n_preds = 0; | 1604 | int n_preds = 0; |
1201 | 1605 | ||
1606 | n_preds = count_preds(ps); | ||
1607 | if (n_preds >= MAX_FILTER_PRED) { | ||
1608 | parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); | ||
1609 | return -ENOSPC; | ||
1610 | } | ||
1611 | |||
1202 | err = check_preds(ps); | 1612 | err = check_preds(ps); |
1203 | if (err) | 1613 | if (err) |
1204 | return err; | 1614 | return err; |
1205 | 1615 | ||
1616 | if (!dry_run) { | ||
1617 | err = __alloc_pred_stack(&stack, n_preds); | ||
1618 | if (err) | ||
1619 | return err; | ||
1620 | err = __alloc_preds(filter, n_preds); | ||
1621 | if (err) | ||
1622 | goto fail; | ||
1623 | } | ||
1624 | |||
1625 | n_preds = 0; | ||
1206 | list_for_each_entry(elt, &ps->postfix, list) { | 1626 | list_for_each_entry(elt, &ps->postfix, list) { |
1207 | if (elt->op == OP_NONE) { | 1627 | if (elt->op == OP_NONE) { |
1208 | if (!operand1) | 1628 | if (!operand1) |
@@ -1211,14 +1631,16 @@ static int replace_preds(struct ftrace_event_call *call, | |||
1211 | operand2 = elt->operand; | 1631 | operand2 = elt->operand; |
1212 | else { | 1632 | else { |
1213 | parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0); | 1633 | parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0); |
1214 | return -EINVAL; | 1634 | err = -EINVAL; |
1635 | goto fail; | ||
1215 | } | 1636 | } |
1216 | continue; | 1637 | continue; |
1217 | } | 1638 | } |
1218 | 1639 | ||
1219 | if (n_preds++ == MAX_FILTER_PRED) { | 1640 | if (WARN_ON(n_preds++ == MAX_FILTER_PRED)) { |
1220 | parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); | 1641 | parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); |
1221 | return -ENOSPC; | 1642 | err = -ENOSPC; |
1643 | goto fail; | ||
1222 | } | 1644 | } |
1223 | 1645 | ||
1224 | if (elt->op == OP_AND || elt->op == OP_OR) { | 1646 | if (elt->op == OP_AND || elt->op == OP_OR) { |
@@ -1228,76 +1650,181 @@ static int replace_preds(struct ftrace_event_call *call, | |||
1228 | 1650 | ||
1229 | if (!operand1 || !operand2) { | 1651 | if (!operand1 || !operand2) { |
1230 | parse_error(ps, FILT_ERR_MISSING_FIELD, 0); | 1652 | parse_error(ps, FILT_ERR_MISSING_FIELD, 0); |
1231 | return -EINVAL; | 1653 | err = -EINVAL; |
1654 | goto fail; | ||
1232 | } | 1655 | } |
1233 | 1656 | ||
1234 | pred = create_pred(elt->op, operand1, operand2); | 1657 | pred = create_pred(elt->op, operand1, operand2); |
1235 | add_pred: | 1658 | add_pred: |
1236 | if (!pred) | 1659 | if (!pred) { |
1237 | return -ENOMEM; | 1660 | err = -ENOMEM; |
1238 | err = filter_add_pred(ps, call, filter, pred, dry_run); | 1661 | goto fail; |
1662 | } | ||
1663 | err = filter_add_pred(ps, call, filter, pred, &stack, dry_run); | ||
1239 | filter_free_pred(pred); | 1664 | filter_free_pred(pred); |
1240 | if (err) | 1665 | if (err) |
1241 | return err; | 1666 | goto fail; |
1242 | 1667 | ||
1243 | operand1 = operand2 = NULL; | 1668 | operand1 = operand2 = NULL; |
1244 | } | 1669 | } |
1245 | 1670 | ||
1246 | return 0; | 1671 | if (!dry_run) { |
1672 | /* We should have one item left on the stack */ | ||
1673 | pred = __pop_pred_stack(&stack); | ||
1674 | if (!pred) | ||
1675 | return -EINVAL; | ||
1676 | /* This item is where we start from in matching */ | ||
1677 | root = pred; | ||
1678 | /* Make sure the stack is empty */ | ||
1679 | pred = __pop_pred_stack(&stack); | ||
1680 | if (WARN_ON(pred)) { | ||
1681 | err = -EINVAL; | ||
1682 | filter->root = NULL; | ||
1683 | goto fail; | ||
1684 | } | ||
1685 | err = check_pred_tree(filter, root); | ||
1686 | if (err) | ||
1687 | goto fail; | ||
1688 | |||
1689 | /* Optimize the tree */ | ||
1690 | err = fold_pred_tree(filter, root); | ||
1691 | if (err) | ||
1692 | goto fail; | ||
1693 | |||
1694 | /* We don't set root until we know it works */ | ||
1695 | barrier(); | ||
1696 | filter->root = root; | ||
1697 | } | ||
1698 | |||
1699 | err = 0; | ||
1700 | fail: | ||
1701 | __free_pred_stack(&stack); | ||
1702 | return err; | ||
1247 | } | 1703 | } |
1248 | 1704 | ||
1705 | struct filter_list { | ||
1706 | struct list_head list; | ||
1707 | struct event_filter *filter; | ||
1708 | }; | ||
1709 | |||
1249 | static int replace_system_preds(struct event_subsystem *system, | 1710 | static int replace_system_preds(struct event_subsystem *system, |
1250 | struct filter_parse_state *ps, | 1711 | struct filter_parse_state *ps, |
1251 | char *filter_string) | 1712 | char *filter_string) |
1252 | { | 1713 | { |
1253 | struct ftrace_event_call *call; | 1714 | struct ftrace_event_call *call; |
1715 | struct filter_list *filter_item; | ||
1716 | struct filter_list *tmp; | ||
1717 | LIST_HEAD(filter_list); | ||
1254 | bool fail = true; | 1718 | bool fail = true; |
1255 | int err; | 1719 | int err; |
1256 | 1720 | ||
1257 | list_for_each_entry(call, &ftrace_events, list) { | 1721 | list_for_each_entry(call, &ftrace_events, list) { |
1258 | struct event_filter *filter = call->filter; | ||
1259 | 1722 | ||
1260 | if (strcmp(call->class->system, system->name) != 0) | 1723 | if (strcmp(call->class->system, system->name) != 0) |
1261 | continue; | 1724 | continue; |
1262 | 1725 | ||
1263 | /* try to see if the filter can be applied */ | 1726 | /* |
1264 | err = replace_preds(call, filter, ps, filter_string, true); | 1727 | * Try to see if the filter can be applied |
1728 | * (filter arg is ignored on dry_run) | ||
1729 | */ | ||
1730 | err = replace_preds(call, NULL, ps, filter_string, true); | ||
1265 | if (err) | 1731 | if (err) |
1732 | goto fail; | ||
1733 | } | ||
1734 | |||
1735 | list_for_each_entry(call, &ftrace_events, list) { | ||
1736 | struct event_filter *filter; | ||
1737 | |||
1738 | if (strcmp(call->class->system, system->name) != 0) | ||
1266 | continue; | 1739 | continue; |
1267 | 1740 | ||
1268 | /* really apply the filter */ | 1741 | filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL); |
1269 | filter_disable_preds(call); | 1742 | if (!filter_item) |
1270 | err = replace_preds(call, filter, ps, filter_string, false); | 1743 | goto fail_mem; |
1744 | |||
1745 | list_add_tail(&filter_item->list, &filter_list); | ||
1746 | |||
1747 | filter_item->filter = __alloc_filter(); | ||
1748 | if (!filter_item->filter) | ||
1749 | goto fail_mem; | ||
1750 | filter = filter_item->filter; | ||
1751 | |||
1752 | /* Can only fail on no memory */ | ||
1753 | err = replace_filter_string(filter, filter_string); | ||
1271 | if (err) | 1754 | if (err) |
1272 | filter_disable_preds(call); | 1755 | goto fail_mem; |
1273 | else { | 1756 | |
1757 | err = replace_preds(call, filter, ps, filter_string, false); | ||
1758 | if (err) { | ||
1759 | filter_disable(call); | ||
1760 | parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); | ||
1761 | append_filter_err(ps, filter); | ||
1762 | } else | ||
1274 | call->flags |= TRACE_EVENT_FL_FILTERED; | 1763 | call->flags |= TRACE_EVENT_FL_FILTERED; |
1275 | replace_filter_string(filter, filter_string); | 1764 | /* |
1276 | } | 1765 | * Regardless of if this returned an error, we still |
1766 | * replace the filter for the call. | ||
1767 | */ | ||
1768 | filter = call->filter; | ||
1769 | call->filter = filter_item->filter; | ||
1770 | filter_item->filter = filter; | ||
1771 | |||
1277 | fail = false; | 1772 | fail = false; |
1278 | } | 1773 | } |
1279 | 1774 | ||
1280 | if (fail) { | 1775 | if (fail) |
1281 | parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); | 1776 | goto fail; |
1282 | return -EINVAL; | 1777 | |
1778 | /* | ||
1779 | * The calls can still be using the old filters. | ||
1780 | * Do a synchronize_sched() to ensure all calls are | ||
1781 | * done with them before we free them. | ||
1782 | */ | ||
1783 | synchronize_sched(); | ||
1784 | list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { | ||
1785 | __free_filter(filter_item->filter); | ||
1786 | list_del(&filter_item->list); | ||
1787 | kfree(filter_item); | ||
1283 | } | 1788 | } |
1284 | return 0; | 1789 | return 0; |
1790 | fail: | ||
1791 | /* No call succeeded */ | ||
1792 | list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { | ||
1793 | list_del(&filter_item->list); | ||
1794 | kfree(filter_item); | ||
1795 | } | ||
1796 | parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); | ||
1797 | return -EINVAL; | ||
1798 | fail_mem: | ||
1799 | /* If any call succeeded, we still need to sync */ | ||
1800 | if (!fail) | ||
1801 | synchronize_sched(); | ||
1802 | list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { | ||
1803 | __free_filter(filter_item->filter); | ||
1804 | list_del(&filter_item->list); | ||
1805 | kfree(filter_item); | ||
1806 | } | ||
1807 | return -ENOMEM; | ||
1285 | } | 1808 | } |
1286 | 1809 | ||
1287 | int apply_event_filter(struct ftrace_event_call *call, char *filter_string) | 1810 | int apply_event_filter(struct ftrace_event_call *call, char *filter_string) |
1288 | { | 1811 | { |
1289 | int err; | ||
1290 | struct filter_parse_state *ps; | 1812 | struct filter_parse_state *ps; |
1813 | struct event_filter *filter; | ||
1814 | struct event_filter *tmp; | ||
1815 | int err = 0; | ||
1291 | 1816 | ||
1292 | mutex_lock(&event_mutex); | 1817 | mutex_lock(&event_mutex); |
1293 | 1818 | ||
1294 | err = init_preds(call); | ||
1295 | if (err) | ||
1296 | goto out_unlock; | ||
1297 | |||
1298 | if (!strcmp(strstrip(filter_string), "0")) { | 1819 | if (!strcmp(strstrip(filter_string), "0")) { |
1299 | filter_disable_preds(call); | 1820 | filter_disable(call); |
1300 | remove_filter_string(call->filter); | 1821 | filter = call->filter; |
1822 | if (!filter) | ||
1823 | goto out_unlock; | ||
1824 | call->filter = NULL; | ||
1825 | /* Make sure the filter is not being used */ | ||
1826 | synchronize_sched(); | ||
1827 | __free_filter(filter); | ||
1301 | goto out_unlock; | 1828 | goto out_unlock; |
1302 | } | 1829 | } |
1303 | 1830 | ||
@@ -1306,22 +1833,41 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) | |||
1306 | if (!ps) | 1833 | if (!ps) |
1307 | goto out_unlock; | 1834 | goto out_unlock; |
1308 | 1835 | ||
1309 | filter_disable_preds(call); | 1836 | filter = __alloc_filter(); |
1310 | replace_filter_string(call->filter, filter_string); | 1837 | if (!filter) { |
1838 | kfree(ps); | ||
1839 | goto out_unlock; | ||
1840 | } | ||
1841 | |||
1842 | replace_filter_string(filter, filter_string); | ||
1311 | 1843 | ||
1312 | parse_init(ps, filter_ops, filter_string); | 1844 | parse_init(ps, filter_ops, filter_string); |
1313 | err = filter_parse(ps); | 1845 | err = filter_parse(ps); |
1314 | if (err) { | 1846 | if (err) { |
1315 | append_filter_err(ps, call->filter); | 1847 | append_filter_err(ps, filter); |
1316 | goto out; | 1848 | goto out; |
1317 | } | 1849 | } |
1318 | 1850 | ||
1319 | err = replace_preds(call, call->filter, ps, filter_string, false); | 1851 | err = replace_preds(call, filter, ps, filter_string, false); |
1320 | if (err) | 1852 | if (err) { |
1321 | append_filter_err(ps, call->filter); | 1853 | filter_disable(call); |
1322 | else | 1854 | append_filter_err(ps, filter); |
1855 | } else | ||
1323 | call->flags |= TRACE_EVENT_FL_FILTERED; | 1856 | call->flags |= TRACE_EVENT_FL_FILTERED; |
1324 | out: | 1857 | out: |
1858 | /* | ||
1859 | * Always swap the call filter with the new filter | ||
1860 | * even if there was an error. If there was an error | ||
1861 | * in the filter, we disable the filter and show the error | ||
1862 | * string | ||
1863 | */ | ||
1864 | tmp = call->filter; | ||
1865 | call->filter = filter; | ||
1866 | if (tmp) { | ||
1867 | /* Make sure the call is done with the filter */ | ||
1868 | synchronize_sched(); | ||
1869 | __free_filter(tmp); | ||
1870 | } | ||
1325 | filter_opstack_clear(ps); | 1871 | filter_opstack_clear(ps); |
1326 | postfix_clear(ps); | 1872 | postfix_clear(ps); |
1327 | kfree(ps); | 1873 | kfree(ps); |
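
Editor's note: the hunk above swaps in the newly built filter before reclaiming the old one, and only frees the old filter after synchronize_sched() guarantees no tracer is still reading it. A minimal userspace sketch of that publish-then-reclaim order follows; wait_for_readers() is only a stand-in for synchronize_sched(), and all names are hypothetical.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct filter { char expr[64]; };

static struct filter *current_filter;   /* what event hooks would read */

/* Stand-in for synchronize_sched(): in the kernel this waits until every
 * CPU has passed a quiescent point, so no reader can still hold the old
 * pointer.  Here it is only a placeholder. */
static void wait_for_readers(void) { }

static void install_filter(const char *expr)
{
    struct filter *new = calloc(1, sizeof(*new));
    struct filter *old;

    if (!new)
        return;
    snprintf(new->expr, sizeof(new->expr), "%s", expr);

    old = current_filter;
    current_filter = new;     /* publish the replacement first ... */
    wait_for_readers();       /* ... wait until nobody can still use 'old' ... */
    free(old);                /* ... and only then reclaim it */
}

int main(void)
{
    install_filter("prio < 100");
    install_filter("prio < 50 && pid != 0");
    printf("active filter: %s\n", current_filter->expr);
    free(current_filter);
    return 0;
}

The ordering is the point: readers never see a freed filter because the old object outlives every reader that could have observed it.
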
@@ -1334,18 +1880,21 @@ out_unlock: | |||
1334 | int apply_subsystem_event_filter(struct event_subsystem *system, | 1880 | int apply_subsystem_event_filter(struct event_subsystem *system, |
1335 | char *filter_string) | 1881 | char *filter_string) |
1336 | { | 1882 | { |
1337 | int err; | ||
1338 | struct filter_parse_state *ps; | 1883 | struct filter_parse_state *ps; |
1884 | struct event_filter *filter; | ||
1885 | int err = 0; | ||
1339 | 1886 | ||
1340 | mutex_lock(&event_mutex); | 1887 | mutex_lock(&event_mutex); |
1341 | 1888 | ||
1342 | err = init_subsystem_preds(system); | ||
1343 | if (err) | ||
1344 | goto out_unlock; | ||
1345 | |||
1346 | if (!strcmp(strstrip(filter_string), "0")) { | 1889 | if (!strcmp(strstrip(filter_string), "0")) { |
1347 | filter_free_subsystem_preds(system); | 1890 | filter_free_subsystem_preds(system); |
1348 | remove_filter_string(system->filter); | 1891 | remove_filter_string(system->filter); |
1892 | filter = system->filter; | ||
1893 | system->filter = NULL; | ||
1894 | /* Ensure all filters are no longer used */ | ||
1895 | synchronize_sched(); | ||
1896 | filter_free_subsystem_filters(system); | ||
1897 | __free_filter(filter); | ||
1349 | goto out_unlock; | 1898 | goto out_unlock; |
1350 | } | 1899 | } |
1351 | 1900 | ||
@@ -1354,7 +1903,17 @@ int apply_subsystem_event_filter(struct event_subsystem *system, | |||
1354 | if (!ps) | 1903 | if (!ps) |
1355 | goto out_unlock; | 1904 | goto out_unlock; |
1356 | 1905 | ||
1357 | replace_filter_string(system->filter, filter_string); | 1906 | filter = __alloc_filter(); |
1907 | if (!filter) | ||
1908 | goto out; | ||
1909 | |||
1910 | replace_filter_string(filter, filter_string); | ||
1911 | /* | ||
1912 | * No event actually uses the system filter | ||
1913 | * we can free it without synchronize_sched(). | ||
1914 | */ | ||
1915 | __free_filter(system->filter); | ||
1916 | system->filter = filter; | ||
1358 | 1917 | ||
1359 | parse_init(ps, filter_ops, filter_string); | 1918 | parse_init(ps, filter_ops, filter_string); |
1360 | err = filter_parse(ps); | 1919 | err = filter_parse(ps); |
@@ -1384,7 +1943,7 @@ void ftrace_profile_free_filter(struct perf_event *event) | |||
1384 | struct event_filter *filter = event->filter; | 1943 | struct event_filter *filter = event->filter; |
1385 | 1944 | ||
1386 | event->filter = NULL; | 1945 | event->filter = NULL; |
1387 | __free_preds(filter); | 1946 | __free_filter(filter); |
1388 | } | 1947 | } |
1389 | 1948 | ||
1390 | int ftrace_profile_set_filter(struct perf_event *event, int event_id, | 1949 | int ftrace_profile_set_filter(struct perf_event *event, int event_id, |
@@ -1410,8 +1969,8 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, | |||
1410 | if (event->filter) | 1969 | if (event->filter) |
1411 | goto out_unlock; | 1970 | goto out_unlock; |
1412 | 1971 | ||
1413 | filter = __alloc_preds(); | 1972 | filter = __alloc_filter(); |
1414 | if (IS_ERR(filter)) { | 1973 | if (!filter) { |
1415 | err = PTR_ERR(filter); | 1974 | err = PTR_ERR(filter); |
1416 | goto out_unlock; | 1975 | goto out_unlock; |
1417 | } | 1976 | } |
@@ -1419,7 +1978,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, | |||
1419 | err = -ENOMEM; | 1978 | err = -ENOMEM; |
1420 | ps = kzalloc(sizeof(*ps), GFP_KERNEL); | 1979 | ps = kzalloc(sizeof(*ps), GFP_KERNEL); |
1421 | if (!ps) | 1980 | if (!ps) |
1422 | goto free_preds; | 1981 | goto free_filter; |
1423 | 1982 | ||
1424 | parse_init(ps, filter_ops, filter_str); | 1983 | parse_init(ps, filter_ops, filter_str); |
1425 | err = filter_parse(ps); | 1984 | err = filter_parse(ps); |
@@ -1435,9 +1994,9 @@ free_ps: | |||
1435 | postfix_clear(ps); | 1994 | postfix_clear(ps); |
1436 | kfree(ps); | 1995 | kfree(ps); |
1437 | 1996 | ||
1438 | free_preds: | 1997 | free_filter: |
1439 | if (err) | 1998 | if (err) |
1440 | __free_preds(filter); | 1999 | __free_filter(filter); |
1441 | 2000 | ||
1442 | out_unlock: | 2001 | out_unlock: |
1443 | mutex_unlock(&event_mutex); | 2002 | mutex_unlock(&event_mutex); |
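
Editor's note: replace_preds() above builds the predicate tree by pushing operands on a predicate stack, combining two entries for each AND/OR, and insisting that exactly one entry (the root) is left at the end. A small userspace sketch of that postfix-to-tree build, with hypothetical names and no attempt at the kernel's folding optimization:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct node {
    char token[16];
    struct node *left, *right;
};

static struct node *stack[32];
static int top;

static struct node *mknode(const char *tok)
{
    struct node *n = calloc(1, sizeof(*n));
    if (n)
        snprintf(n->token, sizeof(n->token), "%s", tok);
    return n;
}

/* Operands are pushed; each "&&"/"||" pops two children; a well-formed
 * expression leaves exactly one node (the root) on the stack. */
static struct node *build_tree(const char **postfix, int count)
{
    for (int i = 0; i < count; i++) {
        struct node *n = mknode(postfix[i]);

        if (!n)
            return NULL;
        if (!strcmp(postfix[i], "&&") || !strcmp(postfix[i], "||")) {
            if (top < 2)
                return NULL;            /* malformed expression */
            n->right = stack[--top];
            n->left  = stack[--top];
        }
        if (top >= (int)(sizeof(stack) / sizeof(stack[0])))
            return NULL;                /* too deep for this demo */
        stack[top++] = n;
    }
    return top == 1 ? stack[--top] : NULL;  /* exactly one root expected */
}

int main(void)
{
    /* (a==1 && b==2) || c==3, written in postfix order */
    const char *postfix[] = { "a==1", "b==2", "&&", "c==3", "||" };
    struct node *root = build_tree(postfix, 5);

    printf("root operator: %s\n", root ? root->token : "(invalid)");
    return 0;
}
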
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 4b74d71705c0..bbeec31e0ae3 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c | |||
@@ -161,13 +161,13 @@ struct ftrace_event_class event_class_ftrace_##call = { \ | |||
161 | .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ | 161 | .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ |
162 | }; \ | 162 | }; \ |
163 | \ | 163 | \ |
164 | struct ftrace_event_call __used \ | 164 | struct ftrace_event_call __used event_##call = { \ |
165 | __attribute__((__aligned__(4))) \ | ||
166 | __attribute__((section("_ftrace_events"))) event_##call = { \ | ||
167 | .name = #call, \ | 165 | .name = #call, \ |
168 | .event.type = etype, \ | 166 | .event.type = etype, \ |
169 | .class = &event_class_ftrace_##call, \ | 167 | .class = &event_class_ftrace_##call, \ |
170 | .print_fmt = print, \ | 168 | .print_fmt = print, \ |
171 | }; \ | 169 | }; \ |
170 | struct ftrace_event_call __used \ | ||
171 | __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; | ||
172 | 172 | ||
173 | #include "trace_entries.h" | 173 | #include "trace_entries.h" |
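
Editor's note: the macro change above stops placing the whole ftrace_event_call struct in the _ftrace_events section and records a pointer to it instead. The same linker-section trick works in userspace with a GNU toolchain, which emits __start_/__stop_ symbols for any section whose name is a valid C identifier; the sketch below uses hypothetical names.

#include <stdio.h>

struct event { const char *name; };

/* Define the object normally, then register only a pointer to it in a
 * named section.  The 'used' attribute keeps the otherwise-unreferenced
 * pointer from being discarded. */
#define DEFINE_EVENT(n)                                              \
    static struct event event_##n = { .name = #n };                  \
    static struct event *ptr_##n                                     \
        __attribute__((used, section("my_events"))) = &event_##n

DEFINE_EVENT(sched_wakeup);
DEFINE_EVENT(irq_entry);

/* Provided automatically by GNU ld for the "my_events" section. */
extern struct event *__start_my_events[];
extern struct event *__stop_my_events[];

int main(void)
{
    for (struct event **p = __start_my_events; p < __stop_my_events; p++)
        printf("registered: %s\n", (*p)->name);
    return 0;
}

Keeping only pointers in the section means the structs themselves need no special alignment or placement, which is what the removal of the __aligned__(4) attribute above reflects.
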
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 5cf8c602b880..92b6e1e12d98 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c | |||
@@ -453,14 +453,6 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1) | |||
453 | * Stubs: | 453 | * Stubs: |
454 | */ | 454 | */ |
455 | 455 | ||
456 | void early_boot_irqs_off(void) | ||
457 | { | ||
458 | } | ||
459 | |||
460 | void early_boot_irqs_on(void) | ||
461 | { | ||
462 | } | ||
463 | |||
464 | void trace_softirqs_on(unsigned long ip) | 456 | void trace_softirqs_on(unsigned long ip) |
465 | { | 457 | { |
466 | } | 458 | } |
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 2dec9bcde8b4..8435b43b1782 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c | |||
@@ -353,6 +353,43 @@ static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) | |||
353 | kfree(data); | 353 | kfree(data); |
354 | } | 354 | } |
355 | 355 | ||
356 | /* Bitfield fetch function */ | ||
357 | struct bitfield_fetch_param { | ||
358 | struct fetch_param orig; | ||
359 | unsigned char hi_shift; | ||
360 | unsigned char low_shift; | ||
361 | }; | ||
362 | |||
363 | #define DEFINE_FETCH_bitfield(type) \ | ||
364 | static __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs,\ | ||
365 | void *data, void *dest) \ | ||
366 | { \ | ||
367 | struct bitfield_fetch_param *bprm = data; \ | ||
368 | type buf = 0; \ | ||
369 | call_fetch(&bprm->orig, regs, &buf); \ | ||
370 | if (buf) { \ | ||
371 | buf <<= bprm->hi_shift; \ | ||
372 | buf >>= bprm->low_shift; \ | ||
373 | } \ | ||
374 | *(type *)dest = buf; \ | ||
375 | } | ||
376 | DEFINE_BASIC_FETCH_FUNCS(bitfield) | ||
377 | #define fetch_bitfield_string NULL | ||
378 | #define fetch_bitfield_string_size NULL | ||
379 | |||
380 | static __kprobes void | ||
381 | free_bitfield_fetch_param(struct bitfield_fetch_param *data) | ||
382 | { | ||
383 | /* | ||
384 | * Don't check the bitfield itself, because this must be the | ||
385 | * last fetch function. | ||
386 | */ | ||
387 | if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) | ||
388 | free_deref_fetch_param(data->orig.data); | ||
389 | else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) | ||
390 | free_symbol_cache(data->orig.data); | ||
391 | kfree(data); | ||
392 | } | ||
356 | /* Default (unsigned long) fetch type */ | 393 | /* Default (unsigned long) fetch type */ |
357 | #define __DEFAULT_FETCH_TYPE(t) u##t | 394 | #define __DEFAULT_FETCH_TYPE(t) u##t |
358 | #define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) | 395 | #define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) |
@@ -367,6 +404,7 @@ enum { | |||
367 | FETCH_MTD_memory, | 404 | FETCH_MTD_memory, |
368 | FETCH_MTD_symbol, | 405 | FETCH_MTD_symbol, |
369 | FETCH_MTD_deref, | 406 | FETCH_MTD_deref, |
407 | FETCH_MTD_bitfield, | ||
370 | FETCH_MTD_END, | 408 | FETCH_MTD_END, |
371 | }; | 409 | }; |
372 | 410 | ||
@@ -387,6 +425,7 @@ ASSIGN_FETCH_FUNC(retval, ftype), \ | |||
387 | ASSIGN_FETCH_FUNC(memory, ftype), \ | 425 | ASSIGN_FETCH_FUNC(memory, ftype), \ |
388 | ASSIGN_FETCH_FUNC(symbol, ftype), \ | 426 | ASSIGN_FETCH_FUNC(symbol, ftype), \ |
389 | ASSIGN_FETCH_FUNC(deref, ftype), \ | 427 | ASSIGN_FETCH_FUNC(deref, ftype), \ |
428 | ASSIGN_FETCH_FUNC(bitfield, ftype), \ | ||
390 | } \ | 429 | } \ |
391 | } | 430 | } |
392 | 431 | ||
@@ -430,9 +469,33 @@ static const struct fetch_type *find_fetch_type(const char *type) | |||
430 | if (!type) | 469 | if (!type) |
431 | type = DEFAULT_FETCH_TYPE_STR; | 470 | type = DEFAULT_FETCH_TYPE_STR; |
432 | 471 | ||
472 | /* Special case: bitfield */ | ||
473 | if (*type == 'b') { | ||
474 | unsigned long bs; | ||
475 | type = strchr(type, '/'); | ||
476 | if (!type) | ||
477 | goto fail; | ||
478 | type++; | ||
479 | if (strict_strtoul(type, 0, &bs)) | ||
480 | goto fail; | ||
481 | switch (bs) { | ||
482 | case 8: | ||
483 | return find_fetch_type("u8"); | ||
484 | case 16: | ||
485 | return find_fetch_type("u16"); | ||
486 | case 32: | ||
487 | return find_fetch_type("u32"); | ||
488 | case 64: | ||
489 | return find_fetch_type("u64"); | ||
490 | default: | ||
491 | goto fail; | ||
492 | } | ||
493 | } | ||
494 | |||
433 | for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++) | 495 | for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++) |
434 | if (strcmp(type, fetch_type_table[i].name) == 0) | 496 | if (strcmp(type, fetch_type_table[i].name) == 0) |
435 | return &fetch_type_table[i]; | 497 | return &fetch_type_table[i]; |
498 | fail: | ||
436 | return NULL; | 499 | return NULL; |
437 | } | 500 | } |
438 | 501 | ||
@@ -586,7 +649,9 @@ error: | |||
586 | 649 | ||
587 | static void free_probe_arg(struct probe_arg *arg) | 650 | static void free_probe_arg(struct probe_arg *arg) |
588 | { | 651 | { |
589 | if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) | 652 | if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn)) |
653 | free_bitfield_fetch_param(arg->fetch.data); | ||
654 | else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) | ||
590 | free_deref_fetch_param(arg->fetch.data); | 655 | free_deref_fetch_param(arg->fetch.data); |
591 | else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) | 656 | else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) |
592 | free_symbol_cache(arg->fetch.data); | 657 | free_symbol_cache(arg->fetch.data); |
@@ -767,16 +832,15 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t, | |||
767 | } | 832 | } |
768 | break; | 833 | break; |
769 | case '+': /* deref memory */ | 834 | case '+': /* deref memory */ |
835 | arg++; /* Skip '+', because strict_strtol() rejects it. */ | ||
770 | case '-': | 836 | case '-': |
771 | tmp = strchr(arg, '('); | 837 | tmp = strchr(arg, '('); |
772 | if (!tmp) | 838 | if (!tmp) |
773 | break; | 839 | break; |
774 | *tmp = '\0'; | 840 | *tmp = '\0'; |
775 | ret = strict_strtol(arg + 1, 0, &offset); | 841 | ret = strict_strtol(arg, 0, &offset); |
776 | if (ret) | 842 | if (ret) |
777 | break; | 843 | break; |
778 | if (arg[0] == '-') | ||
779 | offset = -offset; | ||
780 | arg = tmp + 1; | 844 | arg = tmp + 1; |
781 | tmp = strrchr(arg, ')'); | 845 | tmp = strrchr(arg, ')'); |
782 | if (tmp) { | 846 | if (tmp) { |
@@ -807,6 +871,41 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t, | |||
807 | return ret; | 871 | return ret; |
808 | } | 872 | } |
809 | 873 | ||
874 | #define BYTES_TO_BITS(nb) ((BITS_PER_LONG * (nb)) / sizeof(long)) | ||
875 | |||
876 | /* Bitfield type needs to be parsed into a fetch function */ | ||
877 | static int __parse_bitfield_probe_arg(const char *bf, | ||
878 | const struct fetch_type *t, | ||
879 | struct fetch_param *f) | ||
880 | { | ||
881 | struct bitfield_fetch_param *bprm; | ||
882 | unsigned long bw, bo; | ||
883 | char *tail; | ||
884 | |||
885 | if (*bf != 'b') | ||
886 | return 0; | ||
887 | |||
888 | bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); | ||
889 | if (!bprm) | ||
890 | return -ENOMEM; | ||
891 | bprm->orig = *f; | ||
892 | f->fn = t->fetch[FETCH_MTD_bitfield]; | ||
893 | f->data = (void *)bprm; | ||
894 | |||
895 | bw = simple_strtoul(bf + 1, &tail, 0); /* Use simple one */ | ||
896 | if (bw == 0 || *tail != '@') | ||
897 | return -EINVAL; | ||
898 | |||
899 | bf = tail + 1; | ||
900 | bo = simple_strtoul(bf, &tail, 0); | ||
901 | if (tail == bf || *tail != '/') | ||
902 | return -EINVAL; | ||
903 | |||
904 | bprm->hi_shift = BYTES_TO_BITS(t->size) - (bw + bo); | ||
905 | bprm->low_shift = bprm->hi_shift + bo; | ||
906 | return (BYTES_TO_BITS(t->size) < (bw + bo)) ? -EINVAL : 0; | ||
907 | } | ||
908 | |||
810 | /* String length checking wrapper */ | 909 | /* String length checking wrapper */ |
811 | static int parse_probe_arg(char *arg, struct trace_probe *tp, | 910 | static int parse_probe_arg(char *arg, struct trace_probe *tp, |
812 | struct probe_arg *parg, int is_return) | 911 | struct probe_arg *parg, int is_return) |
@@ -836,6 +935,8 @@ static int parse_probe_arg(char *arg, struct trace_probe *tp, | |||
836 | parg->offset = tp->size; | 935 | parg->offset = tp->size; |
837 | tp->size += parg->type->size; | 936 | tp->size += parg->type->size; |
838 | ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return); | 937 | ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return); |
938 | if (ret >= 0 && t != NULL) | ||
939 | ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch); | ||
839 | if (ret >= 0) { | 940 | if (ret >= 0) { |
840 | parg->fetch_size.fn = get_fetch_size_function(parg->type, | 941 | parg->fetch_size.fn = get_fetch_size_function(parg->type, |
841 | parg->fetch.fn); | 942 | parg->fetch.fn); |
@@ -1130,7 +1231,7 @@ static int command_trace_probe(const char *buf) | |||
1130 | return ret; | 1231 | return ret; |
1131 | } | 1232 | } |
1132 | 1233 | ||
1133 | #define WRITE_BUFSIZE 128 | 1234 | #define WRITE_BUFSIZE 4096 |
1134 | 1235 | ||
1135 | static ssize_t probes_write(struct file *file, const char __user *buffer, | 1236 | static ssize_t probes_write(struct file *file, const char __user *buffer, |
1136 | size_t count, loff_t *ppos) | 1237 | size_t count, loff_t *ppos) |
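
Editor's note: the new b<bit-width>@<bit-offset>/<container-size> fetch type above extracts a bitfield with a left shift followed by a right shift, using the hi_shift/low_shift pair computed in __parse_bitfield_probe_arg(). A standalone sketch of that arithmetic, assuming the offset counts from the least significant bit as the shift math implies:

#include <stdio.h>
#include <stdint.h>

/* Left-shift to discard the bits above the field, then right-shift to
 * discard the bits below it.  With an unsigned container the right shift
 * zero-fills, so the result is just the field value. */
static uint32_t extract_bits(uint32_t val, unsigned int width, unsigned int offset)
{
    unsigned int container = 32;                 /* container size in bits */
    unsigned int hi_shift  = container - (width + offset);
    unsigned int low_shift = hi_shift + offset;

    val <<= hi_shift;
    val >>= low_shift;
    return val;
}

int main(void)
{
    uint32_t flags = 0x000005B4;                 /* arbitrary test value */

    /* 3-bit field starting at bit 4: bits 6..4 of 0x5B4 are 0b011 = 3 */
    printf("field = %u\n", extract_bits(flags, 3, 4));
    return 0;
}
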
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 02272baa2206..456be9063c2d 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c | |||
@@ -529,24 +529,34 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) | |||
529 | * @entry: The trace entry field from the ring buffer | 529 | * @entry: The trace entry field from the ring buffer |
530 | * | 530 | * |
531 | * Prints the generic fields of irqs off, in hard or softirq, preempt | 531 | * Prints the generic fields of irqs off, in hard or softirq, preempt |
532 | * count and lock depth. | 532 | * count. |
533 | */ | 533 | */ |
534 | int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) | 534 | int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) |
535 | { | 535 | { |
536 | int hardirq, softirq; | 536 | char hardsoft_irq; |
537 | char need_resched; | ||
538 | char irqs_off; | ||
539 | int hardirq; | ||
540 | int softirq; | ||
537 | int ret; | 541 | int ret; |
538 | 542 | ||
539 | hardirq = entry->flags & TRACE_FLAG_HARDIRQ; | 543 | hardirq = entry->flags & TRACE_FLAG_HARDIRQ; |
540 | softirq = entry->flags & TRACE_FLAG_SOFTIRQ; | 544 | softirq = entry->flags & TRACE_FLAG_SOFTIRQ; |
541 | 545 | ||
546 | irqs_off = | ||
547 | (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : | ||
548 | (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : | ||
549 | '.'; | ||
550 | need_resched = | ||
551 | (entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.'; | ||
552 | hardsoft_irq = | ||
553 | (hardirq && softirq) ? 'H' : | ||
554 | hardirq ? 'h' : | ||
555 | softirq ? 's' : | ||
556 | '.'; | ||
557 | |||
542 | if (!trace_seq_printf(s, "%c%c%c", | 558 | if (!trace_seq_printf(s, "%c%c%c", |
543 | (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : | 559 | irqs_off, need_resched, hardsoft_irq)) |
544 | (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? | ||
545 | 'X' : '.', | ||
546 | (entry->flags & TRACE_FLAG_NEED_RESCHED) ? | ||
547 | 'N' : '.', | ||
548 | (hardirq && softirq) ? 'H' : | ||
549 | hardirq ? 'h' : softirq ? 's' : '.')) | ||
550 | return 0; | 560 | return 0; |
551 | 561 | ||
552 | if (entry->preempt_count) | 562 | if (entry->preempt_count) |
@@ -554,13 +564,7 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) | |||
554 | else | 564 | else |
555 | ret = trace_seq_putc(s, '.'); | 565 | ret = trace_seq_putc(s, '.'); |
556 | 566 | ||
557 | if (!ret) | 567 | return ret; |
558 | return 0; | ||
559 | |||
560 | if (entry->lock_depth < 0) | ||
561 | return trace_seq_putc(s, '.'); | ||
562 | |||
563 | return trace_seq_printf(s, "%d", entry->lock_depth); | ||
564 | } | 568 | } |
565 | 569 | ||
566 | static int | 570 | static int |
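
Editor's note: the trace_print_lat_fmt() rewrite above computes each latency character into a local before a single formatted write instead of nesting the ternaries inside the call. A simplified userspace sketch of the same composition (the flag bits are hypothetical stand-ins, and the 'X' no-support case is omitted):

#include <stdio.h>

#define FLAG_IRQS_OFF     (1 << 0)
#define FLAG_NEED_RESCHED (1 << 1)
#define FLAG_HARDIRQ      (1 << 2)
#define FLAG_SOFTIRQ      (1 << 3)

/* Build the "dNh"-style latency prefix: irqs-off, need-resched, and
 * hard/soft irq context, each reduced to one character. */
static void lat_prefix(unsigned int flags, char out[4])
{
    int hardirq = flags & FLAG_HARDIRQ;
    int softirq = flags & FLAG_SOFTIRQ;

    out[0] = (flags & FLAG_IRQS_OFF) ? 'd' : '.';
    out[1] = (flags & FLAG_NEED_RESCHED) ? 'N' : '.';
    out[2] = (hardirq && softirq) ? 'H' :
             hardirq ? 'h' :
             softirq ? 's' : '.';
    out[3] = '\0';
}

int main(void)
{
    char buf[4];

    lat_prefix(FLAG_IRQS_OFF | FLAG_HARDIRQ, buf);
    printf("%s\n", buf);   /* prints "d.h" */
    return 0;
}
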
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 8f758d070c43..7e62c0a18456 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c | |||
@@ -247,51 +247,3 @@ void tracing_sched_switch_assign_trace(struct trace_array *tr) | |||
247 | ctx_trace = tr; | 247 | ctx_trace = tr; |
248 | } | 248 | } |
249 | 249 | ||
250 | static void stop_sched_trace(struct trace_array *tr) | ||
251 | { | ||
252 | tracing_stop_sched_switch_record(); | ||
253 | } | ||
254 | |||
255 | static int sched_switch_trace_init(struct trace_array *tr) | ||
256 | { | ||
257 | ctx_trace = tr; | ||
258 | tracing_reset_online_cpus(tr); | ||
259 | tracing_start_sched_switch_record(); | ||
260 | return 0; | ||
261 | } | ||
262 | |||
263 | static void sched_switch_trace_reset(struct trace_array *tr) | ||
264 | { | ||
265 | if (sched_ref) | ||
266 | stop_sched_trace(tr); | ||
267 | } | ||
268 | |||
269 | static void sched_switch_trace_start(struct trace_array *tr) | ||
270 | { | ||
271 | sched_stopped = 0; | ||
272 | } | ||
273 | |||
274 | static void sched_switch_trace_stop(struct trace_array *tr) | ||
275 | { | ||
276 | sched_stopped = 1; | ||
277 | } | ||
278 | |||
279 | static struct tracer sched_switch_trace __read_mostly = | ||
280 | { | ||
281 | .name = "sched_switch", | ||
282 | .init = sched_switch_trace_init, | ||
283 | .reset = sched_switch_trace_reset, | ||
284 | .start = sched_switch_trace_start, | ||
285 | .stop = sched_switch_trace_stop, | ||
286 | .wait_pipe = poll_wait_pipe, | ||
287 | #ifdef CONFIG_FTRACE_SELFTEST | ||
288 | .selftest = trace_selftest_startup_sched_switch, | ||
289 | #endif | ||
290 | }; | ||
291 | |||
292 | __init static int init_sched_switch_trace(void) | ||
293 | { | ||
294 | return register_tracer(&sched_switch_trace); | ||
295 | } | ||
296 | device_initcall(init_sched_switch_trace); | ||
297 | |||
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 562c56e048fd..659732eba07c 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c | |||
@@ -558,7 +558,7 @@ trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr) | |||
558 | static int trace_wakeup_test_thread(void *data) | 558 | static int trace_wakeup_test_thread(void *data) |
559 | { | 559 | { |
560 | /* Make this a RT thread, doesn't need to be too high */ | 560 | /* Make this a RT thread, doesn't need to be too high */ |
561 | static struct sched_param param = { .sched_priority = 5 }; | 561 | static const struct sched_param param = { .sched_priority = 5 }; |
562 | struct completion *x = data; | 562 | struct completion *x = data; |
563 | 563 | ||
564 | sched_setscheduler(current, SCHED_FIFO, &param); | 564 | sched_setscheduler(current, SCHED_FIFO, &param); |
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index bac752f0cfb5..ee7b5a0bb9f8 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c | |||
@@ -23,9 +23,6 @@ static int syscall_exit_register(struct ftrace_event_call *event, | |||
23 | static int syscall_enter_define_fields(struct ftrace_event_call *call); | 23 | static int syscall_enter_define_fields(struct ftrace_event_call *call); |
24 | static int syscall_exit_define_fields(struct ftrace_event_call *call); | 24 | static int syscall_exit_define_fields(struct ftrace_event_call *call); |
25 | 25 | ||
26 | /* All syscall exit events have the same fields */ | ||
27 | static LIST_HEAD(syscall_exit_fields); | ||
28 | |||
29 | static struct list_head * | 26 | static struct list_head * |
30 | syscall_get_enter_fields(struct ftrace_event_call *call) | 27 | syscall_get_enter_fields(struct ftrace_event_call *call) |
31 | { | 28 | { |
@@ -34,61 +31,66 @@ syscall_get_enter_fields(struct ftrace_event_call *call) | |||
34 | return &entry->enter_fields; | 31 | return &entry->enter_fields; |
35 | } | 32 | } |
36 | 33 | ||
37 | static struct list_head * | ||
38 | syscall_get_exit_fields(struct ftrace_event_call *call) | ||
39 | { | ||
40 | return &syscall_exit_fields; | ||
41 | } | ||
42 | |||
43 | struct trace_event_functions enter_syscall_print_funcs = { | 34 | struct trace_event_functions enter_syscall_print_funcs = { |
44 | .trace = print_syscall_enter, | 35 | .trace = print_syscall_enter, |
45 | }; | 36 | }; |
46 | 37 | ||
47 | struct trace_event_functions exit_syscall_print_funcs = { | 38 | struct trace_event_functions exit_syscall_print_funcs = { |
48 | .trace = print_syscall_exit, | 39 | .trace = print_syscall_exit, |
49 | }; | 40 | }; |
50 | 41 | ||
51 | struct ftrace_event_class event_class_syscall_enter = { | 42 | struct ftrace_event_class event_class_syscall_enter = { |
52 | .system = "syscalls", | 43 | .system = "syscalls", |
53 | .reg = syscall_enter_register, | 44 | .reg = syscall_enter_register, |
54 | .define_fields = syscall_enter_define_fields, | 45 | .define_fields = syscall_enter_define_fields, |
55 | .get_fields = syscall_get_enter_fields, | 46 | .get_fields = syscall_get_enter_fields, |
56 | .raw_init = init_syscall_trace, | 47 | .raw_init = init_syscall_trace, |
57 | }; | 48 | }; |
58 | 49 | ||
59 | struct ftrace_event_class event_class_syscall_exit = { | 50 | struct ftrace_event_class event_class_syscall_exit = { |
60 | .system = "syscalls", | 51 | .system = "syscalls", |
61 | .reg = syscall_exit_register, | 52 | .reg = syscall_exit_register, |
62 | .define_fields = syscall_exit_define_fields, | 53 | .define_fields = syscall_exit_define_fields, |
63 | .get_fields = syscall_get_exit_fields, | 54 | .fields = LIST_HEAD_INIT(event_class_syscall_exit.fields), |
64 | .raw_init = init_syscall_trace, | 55 | .raw_init = init_syscall_trace, |
65 | }; | 56 | }; |
66 | 57 | ||
67 | extern unsigned long __start_syscalls_metadata[]; | 58 | extern struct syscall_metadata *__start_syscalls_metadata[]; |
68 | extern unsigned long __stop_syscalls_metadata[]; | 59 | extern struct syscall_metadata *__stop_syscalls_metadata[]; |
69 | 60 | ||
70 | static struct syscall_metadata **syscalls_metadata; | 61 | static struct syscall_metadata **syscalls_metadata; |
71 | 62 | ||
72 | static struct syscall_metadata *find_syscall_meta(unsigned long syscall) | 63 | #ifndef ARCH_HAS_SYSCALL_MATCH_SYM_NAME |
64 | static inline bool arch_syscall_match_sym_name(const char *sym, const char *name) | ||
65 | { | ||
66 | /* | ||
67 | * Only compare after the "sys" prefix. Archs that use | ||
68 | * syscall wrappers may have syscalls symbols aliases prefixed | ||
69 | * with "SyS" instead of "sys", leading to an unwanted | ||
70 | * mismatch. | ||
71 | */ | ||
72 | return !strcmp(sym + 3, name + 3); | ||
73 | } | ||
74 | #endif | ||
75 | |||
76 | static __init struct syscall_metadata * | ||
77 | find_syscall_meta(unsigned long syscall) | ||
73 | { | 78 | { |
74 | struct syscall_metadata *start; | 79 | struct syscall_metadata **start; |
75 | struct syscall_metadata *stop; | 80 | struct syscall_metadata **stop; |
76 | char str[KSYM_SYMBOL_LEN]; | 81 | char str[KSYM_SYMBOL_LEN]; |
77 | 82 | ||
78 | 83 | ||
79 | start = (struct syscall_metadata *)__start_syscalls_metadata; | 84 | start = __start_syscalls_metadata; |
80 | stop = (struct syscall_metadata *)__stop_syscalls_metadata; | 85 | stop = __stop_syscalls_metadata; |
81 | kallsyms_lookup(syscall, NULL, NULL, NULL, str); | 86 | kallsyms_lookup(syscall, NULL, NULL, NULL, str); |
82 | 87 | ||
88 | if (arch_syscall_match_sym_name(str, "sys_ni_syscall")) | ||
89 | return NULL; | ||
90 | |||
83 | for ( ; start < stop; start++) { | 91 | for ( ; start < stop; start++) { |
84 | /* | 92 | if ((*start)->name && arch_syscall_match_sym_name(str, (*start)->name)) |
85 | * Only compare after the "sys" prefix. Archs that use | 93 | return *start; |
86 | * syscall wrappers may have syscalls symbols aliases prefixed | ||
87 | * with "SyS" instead of "sys", leading to an unwanted | ||
88 | * mismatch. | ||
89 | */ | ||
90 | if (start->name && !strcmp(start->name + 3, str + 3)) | ||
91 | return start; | ||
92 | } | 94 | } |
93 | return NULL; | 95 | return NULL; |
94 | } | 96 | } |
@@ -367,7 +369,7 @@ int reg_event_syscall_enter(struct ftrace_event_call *call) | |||
367 | int num; | 369 | int num; |
368 | 370 | ||
369 | num = ((struct syscall_metadata *)call->data)->syscall_nr; | 371 | num = ((struct syscall_metadata *)call->data)->syscall_nr; |
370 | if (num < 0 || num >= NR_syscalls) | 372 | if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) |
371 | return -ENOSYS; | 373 | return -ENOSYS; |
372 | mutex_lock(&syscall_trace_lock); | 374 | mutex_lock(&syscall_trace_lock); |
373 | if (!sys_refcount_enter) | 375 | if (!sys_refcount_enter) |
@@ -385,7 +387,7 @@ void unreg_event_syscall_enter(struct ftrace_event_call *call) | |||
385 | int num; | 387 | int num; |
386 | 388 | ||
387 | num = ((struct syscall_metadata *)call->data)->syscall_nr; | 389 | num = ((struct syscall_metadata *)call->data)->syscall_nr; |
388 | if (num < 0 || num >= NR_syscalls) | 390 | if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) |
389 | return; | 391 | return; |
390 | mutex_lock(&syscall_trace_lock); | 392 | mutex_lock(&syscall_trace_lock); |
391 | sys_refcount_enter--; | 393 | sys_refcount_enter--; |
@@ -401,7 +403,7 @@ int reg_event_syscall_exit(struct ftrace_event_call *call) | |||
401 | int num; | 403 | int num; |
402 | 404 | ||
403 | num = ((struct syscall_metadata *)call->data)->syscall_nr; | 405 | num = ((struct syscall_metadata *)call->data)->syscall_nr; |
404 | if (num < 0 || num >= NR_syscalls) | 406 | if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) |
405 | return -ENOSYS; | 407 | return -ENOSYS; |
406 | mutex_lock(&syscall_trace_lock); | 408 | mutex_lock(&syscall_trace_lock); |
407 | if (!sys_refcount_exit) | 409 | if (!sys_refcount_exit) |
@@ -419,7 +421,7 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call) | |||
419 | int num; | 421 | int num; |
420 | 422 | ||
421 | num = ((struct syscall_metadata *)call->data)->syscall_nr; | 423 | num = ((struct syscall_metadata *)call->data)->syscall_nr; |
422 | if (num < 0 || num >= NR_syscalls) | 424 | if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) |
423 | return; | 425 | return; |
424 | mutex_lock(&syscall_trace_lock); | 426 | mutex_lock(&syscall_trace_lock); |
425 | sys_refcount_exit--; | 427 | sys_refcount_exit--; |
@@ -432,6 +434,14 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call) | |||
432 | int init_syscall_trace(struct ftrace_event_call *call) | 434 | int init_syscall_trace(struct ftrace_event_call *call) |
433 | { | 435 | { |
434 | int id; | 436 | int id; |
437 | int num; | ||
438 | |||
439 | num = ((struct syscall_metadata *)call->data)->syscall_nr; | ||
440 | if (num < 0 || num >= NR_syscalls) { | ||
441 | pr_debug("syscall %s metadata not mapped, disabling ftrace event\n", | ||
442 | ((struct syscall_metadata *)call->data)->name); | ||
443 | return -ENOSYS; | ||
444 | } | ||
435 | 445 | ||
436 | if (set_syscall_print_fmt(call) < 0) | 446 | if (set_syscall_print_fmt(call) < 0) |
437 | return -ENOMEM; | 447 | return -ENOMEM; |
@@ -446,7 +456,7 @@ int init_syscall_trace(struct ftrace_event_call *call) | |||
446 | return id; | 456 | return id; |
447 | } | 457 | } |
448 | 458 | ||
449 | unsigned long __init arch_syscall_addr(int nr) | 459 | unsigned long __init __weak arch_syscall_addr(int nr) |
450 | { | 460 | { |
451 | return (unsigned long)sys_call_table[nr]; | 461 | return (unsigned long)sys_call_table[nr]; |
452 | } | 462 | } |
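
Editor's note: arch_syscall_match_sym_name() above compares symbol and metadata names only past the three-character prefix, so "SyS_"-wrapped symbols still match their "sys_" metadata. A tiny sketch of that comparison, assuming both strings carry such a prefix as they do in the kernel's use:

#include <stdio.h>
#include <string.h>
#include <stdbool.h>

/* Skip the "sys"/"SyS" prefix before comparing. */
static bool syscall_match_sym_name(const char *sym, const char *name)
{
    return strcmp(sym + 3, name + 3) == 0;
}

int main(void)
{
    printf("%d\n", syscall_match_sym_name("SyS_read", "sys_read"));   /* 1 */
    printf("%d\n", syscall_match_sym_name("sys_write", "sys_read"));  /* 0 */
    return 0;
}
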
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index e95ee7f31d43..68187af4889e 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c | |||
@@ -27,8 +27,8 @@ | |||
27 | #include <linux/sched.h> | 27 | #include <linux/sched.h> |
28 | #include <linux/jump_label.h> | 28 | #include <linux/jump_label.h> |
29 | 29 | ||
30 | extern struct tracepoint __start___tracepoints[]; | 30 | extern struct tracepoint * const __start___tracepoints_ptrs[]; |
31 | extern struct tracepoint __stop___tracepoints[]; | 31 | extern struct tracepoint * const __stop___tracepoints_ptrs[]; |
32 | 32 | ||
33 | /* Set to 1 to enable tracepoint debug output */ | 33 | /* Set to 1 to enable tracepoint debug output */ |
34 | static const int tracepoint_debug; | 34 | static const int tracepoint_debug; |
@@ -298,10 +298,10 @@ static void disable_tracepoint(struct tracepoint *elem) | |||
298 | * | 298 | * |
299 | * Updates the probe callback corresponding to a range of tracepoints. | 299 | * Updates the probe callback corresponding to a range of tracepoints. |
300 | */ | 300 | */ |
301 | void | 301 | void tracepoint_update_probe_range(struct tracepoint * const *begin, |
302 | tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end) | 302 | struct tracepoint * const *end) |
303 | { | 303 | { |
304 | struct tracepoint *iter; | 304 | struct tracepoint * const *iter; |
305 | struct tracepoint_entry *mark_entry; | 305 | struct tracepoint_entry *mark_entry; |
306 | 306 | ||
307 | if (!begin) | 307 | if (!begin) |
@@ -309,12 +309,12 @@ tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end) | |||
309 | 309 | ||
310 | mutex_lock(&tracepoints_mutex); | 310 | mutex_lock(&tracepoints_mutex); |
311 | for (iter = begin; iter < end; iter++) { | 311 | for (iter = begin; iter < end; iter++) { |
312 | mark_entry = get_tracepoint(iter->name); | 312 | mark_entry = get_tracepoint((*iter)->name); |
313 | if (mark_entry) { | 313 | if (mark_entry) { |
314 | set_tracepoint(&mark_entry, iter, | 314 | set_tracepoint(&mark_entry, *iter, |
315 | !!mark_entry->refcount); | 315 | !!mark_entry->refcount); |
316 | } else { | 316 | } else { |
317 | disable_tracepoint(iter); | 317 | disable_tracepoint(*iter); |
318 | } | 318 | } |
319 | } | 319 | } |
320 | mutex_unlock(&tracepoints_mutex); | 320 | mutex_unlock(&tracepoints_mutex); |
@@ -326,8 +326,8 @@ tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end) | |||
326 | static void tracepoint_update_probes(void) | 326 | static void tracepoint_update_probes(void) |
327 | { | 327 | { |
328 | /* Core kernel tracepoints */ | 328 | /* Core kernel tracepoints */ |
329 | tracepoint_update_probe_range(__start___tracepoints, | 329 | tracepoint_update_probe_range(__start___tracepoints_ptrs, |
330 | __stop___tracepoints); | 330 | __stop___tracepoints_ptrs); |
331 | /* tracepoints in modules. */ | 331 | /* tracepoints in modules. */ |
332 | module_update_tracepoints(); | 332 | module_update_tracepoints(); |
333 | } | 333 | } |
@@ -514,8 +514,8 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_update_all); | |||
514 | * Will return the first tracepoint in the range if the input tracepoint is | 514 | * Will return the first tracepoint in the range if the input tracepoint is |
515 | * NULL. | 515 | * NULL. |
516 | */ | 516 | */ |
517 | int tracepoint_get_iter_range(struct tracepoint **tracepoint, | 517 | int tracepoint_get_iter_range(struct tracepoint * const **tracepoint, |
518 | struct tracepoint *begin, struct tracepoint *end) | 518 | struct tracepoint * const *begin, struct tracepoint * const *end) |
519 | { | 519 | { |
520 | if (!*tracepoint && begin != end) { | 520 | if (!*tracepoint && begin != end) { |
521 | *tracepoint = begin; | 521 | *tracepoint = begin; |
@@ -534,7 +534,8 @@ static void tracepoint_get_iter(struct tracepoint_iter *iter) | |||
534 | /* Core kernel tracepoints */ | 534 | /* Core kernel tracepoints */ |
535 | if (!iter->module) { | 535 | if (!iter->module) { |
536 | found = tracepoint_get_iter_range(&iter->tracepoint, | 536 | found = tracepoint_get_iter_range(&iter->tracepoint, |
537 | __start___tracepoints, __stop___tracepoints); | 537 | __start___tracepoints_ptrs, |
538 | __stop___tracepoints_ptrs); | ||
538 | if (found) | 539 | if (found) |
539 | goto end; | 540 | goto end; |
540 | } | 541 | } |
@@ -585,8 +586,8 @@ int tracepoint_module_notify(struct notifier_block *self, | |||
585 | switch (val) { | 586 | switch (val) { |
586 | case MODULE_STATE_COMING: | 587 | case MODULE_STATE_COMING: |
587 | case MODULE_STATE_GOING: | 588 | case MODULE_STATE_GOING: |
588 | tracepoint_update_probe_range(mod->tracepoints, | 589 | tracepoint_update_probe_range(mod->tracepoints_ptrs, |
589 | mod->tracepoints + mod->num_tracepoints); | 590 | mod->tracepoints_ptrs + mod->num_tracepoints); |
590 | break; | 591 | break; |
591 | } | 592 | } |
592 | return 0; | 593 | return 0; |
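
Editor's note: with the section now holding pointers, the range walkers above take struct tracepoint * const * and dereference twice. A minimal userspace sketch of that iterator shape, with hypothetical data:

#include <stdio.h>

struct tracepoint { const char *name; int enabled; };

/* Walk a [begin, end) range of pointers and act on each tracepoint,
 * the way tracepoint_update_probe_range() does after this change. */
static void update_range(struct tracepoint * const *begin,
                         struct tracepoint * const *end)
{
    struct tracepoint * const *iter;

    for (iter = begin; iter < end; iter++)
        printf("%s: %s\n", (*iter)->name,
               (*iter)->enabled ? "enabled" : "disabled");
}

int main(void)
{
    struct tracepoint a = { "sched_switch", 1 };
    struct tracepoint b = { "irq_handler_entry", 0 };
    struct tracepoint *ptrs[] = { &a, &b };

    update_range(ptrs, ptrs + 2);
    return 0;
}
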
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 25915832291a..9da289c34f22 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c | |||
@@ -12,6 +12,8 @@ | |||
12 | #include <linux/highuid.h> | 12 | #include <linux/highuid.h> |
13 | #include <linux/cred.h> | 13 | #include <linux/cred.h> |
14 | 14 | ||
15 | static struct kmem_cache *user_ns_cachep __read_mostly; | ||
16 | |||
15 | /* | 17 | /* |
16 | * Create a new user namespace, deriving the creator from the user in the | 18 | * Create a new user namespace, deriving the creator from the user in the |
17 | * passed credentials, and replacing that user with the new root user for the | 19 | * passed credentials, and replacing that user with the new root user for the |
@@ -26,7 +28,7 @@ int create_user_ns(struct cred *new) | |||
26 | struct user_struct *root_user; | 28 | struct user_struct *root_user; |
27 | int n; | 29 | int n; |
28 | 30 | ||
29 | ns = kmalloc(sizeof(struct user_namespace), GFP_KERNEL); | 31 | ns = kmem_cache_alloc(user_ns_cachep, GFP_KERNEL); |
30 | if (!ns) | 32 | if (!ns) |
31 | return -ENOMEM; | 33 | return -ENOMEM; |
32 | 34 | ||
@@ -38,7 +40,7 @@ int create_user_ns(struct cred *new) | |||
38 | /* Alloc new root user. */ | 40 | /* Alloc new root user. */ |
39 | root_user = alloc_uid(ns, 0); | 41 | root_user = alloc_uid(ns, 0); |
40 | if (!root_user) { | 42 | if (!root_user) { |
41 | kfree(ns); | 43 | kmem_cache_free(user_ns_cachep, ns); |
42 | return -ENOMEM; | 44 | return -ENOMEM; |
43 | } | 45 | } |
44 | 46 | ||
@@ -71,7 +73,7 @@ static void free_user_ns_work(struct work_struct *work) | |||
71 | struct user_namespace *ns = | 73 | struct user_namespace *ns = |
72 | container_of(work, struct user_namespace, destroyer); | 74 | container_of(work, struct user_namespace, destroyer); |
73 | free_uid(ns->creator); | 75 | free_uid(ns->creator); |
74 | kfree(ns); | 76 | kmem_cache_free(user_ns_cachep, ns); |
75 | } | 77 | } |
76 | 78 | ||
77 | void free_user_ns(struct kref *kref) | 79 | void free_user_ns(struct kref *kref) |
@@ -126,3 +128,10 @@ gid_t user_ns_map_gid(struct user_namespace *to, const struct cred *cred, gid_t | |||
126 | /* No useful relationship so no mapping */ | 128 | /* No useful relationship so no mapping */ |
127 | return overflowgid; | 129 | return overflowgid; |
128 | } | 130 | } |
131 | |||
132 | static __init int user_namespaces_init(void) | ||
133 | { | ||
134 | user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC); | ||
135 | return 0; | ||
136 | } | ||
137 | module_init(user_namespaces_init); | ||
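
Editor's note: the change above moves user_namespace allocation from kmalloc() to a dedicated kmem_cache. As a rough userspace analogue only, a trivial free list shows the recycling idea behind a dedicated object cache; the real slab cache also handles alignment, per-CPU magazines, and debugging, none of which is modeled here.

#include <stdio.h>
#include <stdlib.h>

struct user_ns { int level; struct user_ns *next_free; };

static struct user_ns *free_list;

static struct user_ns *ns_alloc(void)
{
    struct user_ns *ns = free_list;

    if (ns) {
        free_list = ns->next_free;      /* reuse a cached object */
        return ns;
    }
    return calloc(1, sizeof(*ns));      /* cache miss: fall back to malloc */
}

static void ns_free(struct user_ns *ns)
{
    ns->next_free = free_list;          /* push back onto the cache */
    free_list = ns;
}

int main(void)
{
    struct user_ns *a = ns_alloc();
    ns_free(a);
    struct user_ns *b = ns_alloc();     /* recycled: same storage as 'a' */

    printf("recycled: %s\n", a == b ? "yes" : "no");
    free(b);
    return 0;
}
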
diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 6e7b575ac33c..18bb15776c57 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c | |||
@@ -27,7 +27,7 @@ | |||
27 | #include <asm/irq_regs.h> | 27 | #include <asm/irq_regs.h> |
28 | #include <linux/perf_event.h> | 28 | #include <linux/perf_event.h> |
29 | 29 | ||
30 | int watchdog_enabled; | 30 | int watchdog_enabled = 1; |
31 | int __read_mostly softlockup_thresh = 60; | 31 | int __read_mostly softlockup_thresh = 60; |
32 | 32 | ||
33 | static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); | 33 | static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); |
@@ -43,9 +43,6 @@ static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); | |||
43 | static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); | 43 | static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); |
44 | #endif | 44 | #endif |
45 | 45 | ||
46 | static int no_watchdog; | ||
47 | |||
48 | |||
49 | /* boot commands */ | 46 | /* boot commands */ |
50 | /* | 47 | /* |
51 | * Should we panic when a soft-lockup or hard-lockup occurs: | 48 | * Should we panic when a soft-lockup or hard-lockup occurs: |
@@ -58,7 +55,7 @@ static int __init hardlockup_panic_setup(char *str) | |||
58 | if (!strncmp(str, "panic", 5)) | 55 | if (!strncmp(str, "panic", 5)) |
59 | hardlockup_panic = 1; | 56 | hardlockup_panic = 1; |
60 | else if (!strncmp(str, "0", 1)) | 57 | else if (!strncmp(str, "0", 1)) |
61 | no_watchdog = 1; | 58 | watchdog_enabled = 0; |
62 | return 1; | 59 | return 1; |
63 | } | 60 | } |
64 | __setup("nmi_watchdog=", hardlockup_panic_setup); | 61 | __setup("nmi_watchdog=", hardlockup_panic_setup); |
@@ -77,7 +74,7 @@ __setup("softlockup_panic=", softlockup_panic_setup); | |||
77 | 74 | ||
78 | static int __init nowatchdog_setup(char *str) | 75 | static int __init nowatchdog_setup(char *str) |
79 | { | 76 | { |
80 | no_watchdog = 1; | 77 | watchdog_enabled = 0; |
81 | return 1; | 78 | return 1; |
82 | } | 79 | } |
83 | __setup("nowatchdog", nowatchdog_setup); | 80 | __setup("nowatchdog", nowatchdog_setup); |
@@ -85,7 +82,7 @@ __setup("nowatchdog", nowatchdog_setup); | |||
85 | /* deprecated */ | 82 | /* deprecated */ |
86 | static int __init nosoftlockup_setup(char *str) | 83 | static int __init nosoftlockup_setup(char *str) |
87 | { | 84 | { |
88 | no_watchdog = 1; | 85 | watchdog_enabled = 0; |
89 | return 1; | 86 | return 1; |
90 | } | 87 | } |
91 | __setup("nosoftlockup", nosoftlockup_setup); | 88 | __setup("nosoftlockup", nosoftlockup_setup); |
@@ -118,12 +115,12 @@ static void __touch_watchdog(void) | |||
118 | { | 115 | { |
119 | int this_cpu = smp_processor_id(); | 116 | int this_cpu = smp_processor_id(); |
120 | 117 | ||
121 | __get_cpu_var(watchdog_touch_ts) = get_timestamp(this_cpu); | 118 | __this_cpu_write(watchdog_touch_ts, get_timestamp(this_cpu)); |
122 | } | 119 | } |
123 | 120 | ||
124 | void touch_softlockup_watchdog(void) | 121 | void touch_softlockup_watchdog(void) |
125 | { | 122 | { |
126 | __raw_get_cpu_var(watchdog_touch_ts) = 0; | 123 | __this_cpu_write(watchdog_touch_ts, 0); |
127 | } | 124 | } |
128 | EXPORT_SYMBOL(touch_softlockup_watchdog); | 125 | EXPORT_SYMBOL(touch_softlockup_watchdog); |
129 | 126 | ||
@@ -167,12 +164,12 @@ void touch_softlockup_watchdog_sync(void) | |||
167 | /* watchdog detector functions */ | 164 | /* watchdog detector functions */ |
168 | static int is_hardlockup(void) | 165 | static int is_hardlockup(void) |
169 | { | 166 | { |
170 | unsigned long hrint = __get_cpu_var(hrtimer_interrupts); | 167 | unsigned long hrint = __this_cpu_read(hrtimer_interrupts); |
171 | 168 | ||
172 | if (__get_cpu_var(hrtimer_interrupts_saved) == hrint) | 169 | if (__this_cpu_read(hrtimer_interrupts_saved) == hrint) |
173 | return 1; | 170 | return 1; |
174 | 171 | ||
175 | __get_cpu_var(hrtimer_interrupts_saved) = hrint; | 172 | __this_cpu_write(hrtimer_interrupts_saved, hrint); |
176 | return 0; | 173 | return 0; |
177 | } | 174 | } |
178 | #endif | 175 | #endif |
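
Editor's note: the is_hardlockup() rewrite above keeps the same logic while switching to __this_cpu_read/__this_cpu_write: flag a lockup when the per-CPU hrtimer interrupt count has not advanced since the last check. A single-threaded sketch of that progress check:

#include <stdbool.h>
#include <stdio.h>

static unsigned long saved;   /* per-CPU in the kernel; one global here */

/* Report a stall if the heartbeat counter did not move since the
 * previous poll, otherwise remember the new value. */
static bool looks_locked_up(unsigned long heartbeat)
{
    if (heartbeat == saved)
        return true;
    saved = heartbeat;
    return false;
}

int main(void)
{
    unsigned long hb = 0;

    printf("%d\n", looks_locked_up(++hb)); /* 0: counter advanced */
    printf("%d\n", looks_locked_up(hb));   /* 1: counter stalled  */
    return 0;
}
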
@@ -205,8 +202,8 @@ static void watchdog_overflow_callback(struct perf_event *event, int nmi, | |||
205 | /* Ensure the watchdog never gets throttled */ | 202 | /* Ensure the watchdog never gets throttled */ |
206 | event->hw.interrupts = 0; | 203 | event->hw.interrupts = 0; |
207 | 204 | ||
208 | if (__get_cpu_var(watchdog_nmi_touch) == true) { | 205 | if (__this_cpu_read(watchdog_nmi_touch) == true) { |
209 | __get_cpu_var(watchdog_nmi_touch) = false; | 206 | __this_cpu_write(watchdog_nmi_touch, false); |
210 | return; | 207 | return; |
211 | } | 208 | } |
212 | 209 | ||
@@ -220,7 +217,7 @@ static void watchdog_overflow_callback(struct perf_event *event, int nmi, | |||
220 | int this_cpu = smp_processor_id(); | 217 | int this_cpu = smp_processor_id(); |
221 | 218 | ||
222 | /* only print hardlockups once */ | 219 | /* only print hardlockups once */ |
223 | if (__get_cpu_var(hard_watchdog_warn) == true) | 220 | if (__this_cpu_read(hard_watchdog_warn) == true) |
224 | return; | 221 | return; |
225 | 222 | ||
226 | if (hardlockup_panic) | 223 | if (hardlockup_panic) |
@@ -228,16 +225,16 @@ static void watchdog_overflow_callback(struct perf_event *event, int nmi, | |||
228 | else | 225 | else |
229 | WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu); | 226 | WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu); |
230 | 227 | ||
231 | __get_cpu_var(hard_watchdog_warn) = true; | 228 | __this_cpu_write(hard_watchdog_warn, true); |
232 | return; | 229 | return; |
233 | } | 230 | } |
234 | 231 | ||
235 | __get_cpu_var(hard_watchdog_warn) = false; | 232 | __this_cpu_write(hard_watchdog_warn, false); |
236 | return; | 233 | return; |
237 | } | 234 | } |
238 | static void watchdog_interrupt_count(void) | 235 | static void watchdog_interrupt_count(void) |
239 | { | 236 | { |
240 | __get_cpu_var(hrtimer_interrupts)++; | 237 | __this_cpu_inc(hrtimer_interrupts); |
241 | } | 238 | } |
242 | #else | 239 | #else |
243 | static inline void watchdog_interrupt_count(void) { return; } | 240 | static inline void watchdog_interrupt_count(void) { return; } |
@@ -246,7 +243,7 @@ static inline void watchdog_interrupt_count(void) { return; } | |||
246 | /* watchdog kicker functions */ | 243 | /* watchdog kicker functions */ |
247 | static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) | 244 | static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) |
248 | { | 245 | { |
249 | unsigned long touch_ts = __get_cpu_var(watchdog_touch_ts); | 246 | unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts); |
250 | struct pt_regs *regs = get_irq_regs(); | 247 | struct pt_regs *regs = get_irq_regs(); |
251 | int duration; | 248 | int duration; |
252 | 249 | ||
@@ -254,18 +251,18 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) | |||
254 | watchdog_interrupt_count(); | 251 | watchdog_interrupt_count(); |
255 | 252 | ||
256 | /* kick the softlockup detector */ | 253 | /* kick the softlockup detector */ |
257 | wake_up_process(__get_cpu_var(softlockup_watchdog)); | 254 | wake_up_process(__this_cpu_read(softlockup_watchdog)); |
258 | 255 | ||
259 | /* .. and repeat */ | 256 | /* .. and repeat */ |
260 | hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period())); | 257 | hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period())); |
261 | 258 | ||
262 | if (touch_ts == 0) { | 259 | if (touch_ts == 0) { |
263 | if (unlikely(__get_cpu_var(softlockup_touch_sync))) { | 260 | if (unlikely(__this_cpu_read(softlockup_touch_sync))) { |
264 | /* | 261 | /* |
265 | * If the time stamp was touched atomically | 262 | * If the time stamp was touched atomically |
266 | * make sure the scheduler tick is up to date. | 263 | * make sure the scheduler tick is up to date. |
267 | */ | 264 | */ |
268 | __get_cpu_var(softlockup_touch_sync) = false; | 265 | __this_cpu_write(softlockup_touch_sync, false); |
269 | sched_clock_tick(); | 266 | sched_clock_tick(); |
270 | } | 267 | } |
271 | __touch_watchdog(); | 268 | __touch_watchdog(); |
@@ -281,7 +278,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) | |||
281 | duration = is_softlockup(touch_ts); | 278 | duration = is_softlockup(touch_ts); |
282 | if (unlikely(duration)) { | 279 | if (unlikely(duration)) { |
283 | /* only warn once */ | 280 | /* only warn once */ |
284 | if (__get_cpu_var(soft_watchdog_warn) == true) | 281 | if (__this_cpu_read(soft_watchdog_warn) == true) |
285 | return HRTIMER_RESTART; | 282 | return HRTIMER_RESTART; |
286 | 283 | ||
287 | printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", | 284 | printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", |
@@ -296,9 +293,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) | |||
296 | 293 | ||
297 | if (softlockup_panic) | 294 | if (softlockup_panic) |
298 | panic("softlockup: hung tasks"); | 295 | panic("softlockup: hung tasks"); |
299 | __get_cpu_var(soft_watchdog_warn) = true; | 296 | __this_cpu_write(soft_watchdog_warn, true); |
300 | } else | 297 | } else |
301 | __get_cpu_var(soft_watchdog_warn) = false; | 298 | __this_cpu_write(soft_watchdog_warn, false); |
302 | 299 | ||
303 | return HRTIMER_RESTART; | 300 | return HRTIMER_RESTART; |
304 | } | 301 | } |
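
The hunks above convert the watchdog's __get_cpu_var() lvalue accesses to the __this_cpu_read()/__this_cpu_write()/__this_cpu_inc() operations. A minimal sketch of the same conversion, applied to a hypothetical per-cpu counter (demo_hits and demo_tick are illustrative names, not part of the patch):

static DEFINE_PER_CPU(unsigned long, demo_hits);        /* hypothetical counter */

static void demo_tick(void)
{
        /* was: __get_cpu_var(demo_hits)++; */
        __this_cpu_inc(demo_hits);

        /* was: if (__get_cpu_var(demo_hits) > 10) __get_cpu_var(demo_hits) = 0; */
        if (__this_cpu_read(demo_hits) > 10)
                __this_cpu_write(demo_hits, 0);
}

Naming the whole access (read, write, inc) rather than materializing the per-cpu address lets architectures such as x86 perform the operation with a single segment-prefixed instruction.
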
@@ -366,8 +363,14 @@ static int watchdog_nmi_enable(int cpu) | |||
366 | goto out_save; | 363 | goto out_save; |
367 | } | 364 | } |
368 | 365 | ||
369 | printk(KERN_ERR "NMI watchdog disabled for cpu%i: unable to create perf event: %ld\n", | 366 | |
370 | cpu, PTR_ERR(event)); | 367 | /* vary the KERN level based on the returned errno */ |
368 | if (PTR_ERR(event) == -EOPNOTSUPP) | ||
369 | printk(KERN_INFO "NMI watchdog disabled (cpu%i): not supported (no LAPIC?)\n", cpu); | ||
370 | else if (PTR_ERR(event) == -ENOENT) | ||
371 | printk(KERN_WARNING "NMI watchdog disabled (cpu%i): hardware events not enabled\n", cpu); | ||
372 | else | ||
373 | printk(KERN_ERR "NMI watchdog disabled (cpu%i): unable to create perf event: %ld\n", cpu, PTR_ERR(event)); | ||
371 | return PTR_ERR(event); | 374 | return PTR_ERR(event); |
372 | 375 | ||
373 | /* success path */ | 376 | /* success path */ |
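
The hunk above picks the printk severity from the reason the hardware perf event could not be created: -EOPNOTSUPP (no usable PMU/LAPIC) is only informational, -ENOENT (hardware events not enabled) is a warning, and everything else stays at KERN_ERR. The same decision written as a switch, purely as an illustrative sketch (the surrounding context is assumed, not taken from the patch):

        if (IS_ERR(event)) {
                switch (PTR_ERR(event)) {
                case -EOPNOTSUPP:       /* expected on machines without a usable PMU/LAPIC */
                        printk(KERN_INFO "NMI watchdog disabled (cpu%i): not supported\n", cpu);
                        break;
                case -ENOENT:           /* hardware perf events not enabled */
                        printk(KERN_WARNING "NMI watchdog disabled (cpu%i): hardware events not enabled\n", cpu);
                        break;
                default:                /* anything else is a genuine failure */
                        printk(KERN_ERR "NMI watchdog disabled (cpu%i): unable to create perf event: %ld\n",
                               cpu, PTR_ERR(event));
                }
                return PTR_ERR(event);
        }

The point is to keep dmesg quiet on hardware that simply cannot support the NMI watchdog while still flagging real failures.
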
@@ -432,9 +435,6 @@ static int watchdog_enable(int cpu) | |||
432 | wake_up_process(p); | 435 | wake_up_process(p); |
433 | } | 436 | } |
434 | 437 | ||
435 | /* if any cpu succeeds, watchdog is considered enabled for the system */ | ||
436 | watchdog_enabled = 1; | ||
437 | |||
438 | return 0; | 438 | return 0; |
439 | } | 439 | } |
440 | 440 | ||
@@ -462,12 +462,16 @@ static void watchdog_disable(int cpu) | |||
462 | static void watchdog_enable_all_cpus(void) | 462 | static void watchdog_enable_all_cpus(void) |
463 | { | 463 | { |
464 | int cpu; | 464 | int cpu; |
465 | int result = 0; | 465 | |
466 | watchdog_enabled = 0; | ||
466 | 467 | ||
467 | for_each_online_cpu(cpu) | 468 | for_each_online_cpu(cpu) |
468 | result += watchdog_enable(cpu); | 469 | if (!watchdog_enable(cpu)) |
470 | /* if any cpu succeeds, watchdog is considered | ||
471 | enabled for the system */ | ||
472 | watchdog_enabled = 1; | ||
469 | 473 | ||
470 | if (result) | 474 | if (!watchdog_enabled) |
471 | printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n"); | 475 | printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n"); |
472 | 476 | ||
473 | } | 477 | } |
@@ -476,9 +480,6 @@ static void watchdog_disable_all_cpus(void) | |||
476 | { | 480 | { |
477 | int cpu; | 481 | int cpu; |
478 | 482 | ||
479 | if (no_watchdog) | ||
480 | return; | ||
481 | |||
482 | for_each_online_cpu(cpu) | 483 | for_each_online_cpu(cpu) |
483 | watchdog_disable(cpu); | 484 | watchdog_disable(cpu); |
484 | 485 | ||
@@ -498,10 +499,12 @@ int proc_dowatchdog_enabled(struct ctl_table *table, int write, | |||
498 | { | 499 | { |
499 | proc_dointvec(table, write, buffer, length, ppos); | 500 | proc_dointvec(table, write, buffer, length, ppos); |
500 | 501 | ||
501 | if (watchdog_enabled) | 502 | if (write) { |
502 | watchdog_enable_all_cpus(); | 503 | if (watchdog_enabled) |
503 | else | 504 | watchdog_enable_all_cpus(); |
504 | watchdog_disable_all_cpus(); | 505 | else |
506 | watchdog_disable_all_cpus(); | ||
507 | } | ||
505 | return 0; | 508 | return 0; |
506 | } | 509 | } |
507 | 510 | ||
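
proc_dowatchdog_enabled() now changes watchdog state only when the handler runs for a write, so reading the sysctl is side-effect free. The same pattern on a hypothetical knob (demo_enabled and the demo_*_all() helpers are assumptions for illustration only):

static int demo_enabled;                        /* hypothetical sysctl-backed flag */

static void demo_enable_all(void)  { /* ... */ }
static void demo_disable_all(void) { /* ... */ }

static int proc_demo_enabled(struct ctl_table *table, int write,
                             void __user *buffer, size_t *length, loff_t *ppos)
{
        /* let the generic helper parse or format the integer */
        int ret = proc_dointvec(table, write, buffer, length, ppos);

        if (write) {                            /* act only when a value was stored */
                if (demo_enabled)
                        demo_enable_all();
                else
                        demo_disable_all();
        }
        return ret;
}
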
@@ -530,7 +533,8 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
530 | break; | 533 | break; |
531 | case CPU_ONLINE: | 534 | case CPU_ONLINE: |
532 | case CPU_ONLINE_FROZEN: | 535 | case CPU_ONLINE_FROZEN: |
533 | err = watchdog_enable(hotcpu); | 536 | if (watchdog_enabled) |
537 | err = watchdog_enable(hotcpu); | ||
534 | break; | 538 | break; |
535 | #ifdef CONFIG_HOTPLUG_CPU | 539 | #ifdef CONFIG_HOTPLUG_CPU |
536 | case CPU_UP_CANCELED: | 540 | case CPU_UP_CANCELED: |
@@ -555,9 +559,6 @@ void __init lockup_detector_init(void) | |||
555 | void *cpu = (void *)(long)smp_processor_id(); | 559 | void *cpu = (void *)(long)smp_processor_id(); |
556 | int err; | 560 | int err; |
557 | 561 | ||
558 | if (no_watchdog) | ||
559 | return; | ||
560 | |||
561 | err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); | 562 | err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); |
562 | WARN_ON(notifier_to_errno(err)); | 563 | WARN_ON(notifier_to_errno(err)); |
563 | 564 | ||
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index e785b0f2aea5..b5fe4c00eb3c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -79,7 +79,9 @@ enum { | |||
79 | MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */ | 79 | MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */ |
80 | IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */ | 80 | IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */ |
81 | 81 | ||
82 | MAYDAY_INITIAL_TIMEOUT = HZ / 100, /* call for help after 10ms */ | 82 | MAYDAY_INITIAL_TIMEOUT = HZ / 100 >= 2 ? HZ / 100 : 2, |
83 | /* call for help after 10ms | ||
84 | (min two ticks) */ | ||
83 | MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */ | 85 | MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */ |
84 | CREATE_COOLDOWN = HZ, /* time to breathe after fail */ | 86 | CREATE_COOLDOWN = HZ, /* time to breathe after fail */ |
85 | TRUSTEE_COOLDOWN = HZ / 10, /* for trustee draining */ | 87 | TRUSTEE_COOLDOWN = HZ / 10, /* for trustee draining */ |
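
MAYDAY_INITIAL_TIMEOUT is now clamped to a minimum of two ticks: with HZ=100 the nominal 10ms comes out to a single jiffy, and a one-jiffy timer can expire almost immediately if it is armed late in the current tick. A small standalone illustration of the arithmetic (the HZ values are just common examples):

#include <stdio.h>

int main(void)
{
        const int hz_values[] = { 100, 250, 300, 1000 };        /* common CONFIG_HZ choices */
        unsigned int i;

        for (i = 0; i < sizeof(hz_values) / sizeof(hz_values[0]); i++) {
                int hz = hz_values[i];
                int raw = hz / 100;                     /* "10ms" expressed in ticks */
                int clamped = raw >= 2 ? raw : 2;       /* new minimum of two ticks */

                printf("HZ=%4d: HZ/100 = %d tick(s) -> initial mayday timeout %d tick(s)\n",
                       hz, raw, clamped);
        }
        return 0;
}
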
@@ -314,6 +316,11 @@ static inline int __next_wq_cpu(int cpu, const struct cpumask *mask, | |||
314 | 316 | ||
315 | static struct debug_obj_descr work_debug_descr; | 317 | static struct debug_obj_descr work_debug_descr; |
316 | 318 | ||
319 | static void *work_debug_hint(void *addr) | ||
320 | { | ||
321 | return ((struct work_struct *) addr)->func; | ||
322 | } | ||
323 | |||
317 | /* | 324 | /* |
318 | * fixup_init is called when: | 325 | * fixup_init is called when: |
319 | * - an active object is initialized | 326 | * - an active object is initialized |
@@ -385,6 +392,7 @@ static int work_fixup_free(void *addr, enum debug_obj_state state) | |||
385 | 392 | ||
386 | static struct debug_obj_descr work_debug_descr = { | 393 | static struct debug_obj_descr work_debug_descr = { |
387 | .name = "work_struct", | 394 | .name = "work_struct", |
395 | .debug_hint = work_debug_hint, | ||
388 | .fixup_init = work_fixup_init, | 396 | .fixup_init = work_fixup_init, |
389 | .fixup_activate = work_fixup_activate, | 397 | .fixup_activate = work_fixup_activate, |
390 | .fixup_free = work_fixup_free, | 398 | .fixup_free = work_fixup_free, |
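
The new debug_hint callback gives CONFIG_DEBUG_OBJECTS_WORK reports something recognizable to print: the work function a misused work_struct points at, rather than just its address. The same convention applied to a hypothetical object type (demo_obj and its descriptor are assumptions, not kernel code):

struct demo_obj {
        void (*func)(struct demo_obj *obj);     /* callback the hint exposes */
        /* ... */
};

static void *demo_debug_hint(void *addr)
{
        /* return the callback so debug reports can name the culprit */
        return ((struct demo_obj *)addr)->func;
}

static struct debug_obj_descr demo_debug_descr = {
        .name           = "demo_obj",
        .debug_hint     = demo_debug_hint,
        /* .fixup_* callbacks would follow, as for work_struct above */
};
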
@@ -768,7 +776,11 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) | |||
768 | 776 | ||
769 | worker->flags &= ~flags; | 777 | worker->flags &= ~flags; |
770 | 778 | ||
771 | /* if transitioning out of NOT_RUNNING, increment nr_running */ | 779 | /* |
780 | * If transitioning out of NOT_RUNNING, increment nr_running. Note | ||
781 | * that the nested NOT_RUNNING is not a noop. NOT_RUNNING is a mask | ||
782 | * of multiple flags, not a single flag. | ||
783 | */ | ||
772 | if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) | 784 | if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) |
773 | if (!(worker->flags & WORKER_NOT_RUNNING)) | 785 | if (!(worker->flags & WORKER_NOT_RUNNING)) |
774 | atomic_inc(get_gcwq_nr_running(gcwq->cpu)); | 786 | atomic_inc(get_gcwq_nr_running(gcwq->cpu)); |
@@ -932,6 +944,38 @@ static void insert_work(struct cpu_workqueue_struct *cwq, | |||
932 | wake_up_worker(gcwq); | 944 | wake_up_worker(gcwq); |
933 | } | 945 | } |
934 | 946 | ||
947 | /* | ||
948 | * Test whether @work is being queued from another work executing on the | ||
949 | * same workqueue. This is rather expensive and should only be used from | ||
950 | * cold paths. | ||
951 | */ | ||
952 | static bool is_chained_work(struct workqueue_struct *wq) | ||
953 | { | ||
954 | unsigned long flags; | ||
955 | unsigned int cpu; | ||
956 | |||
957 | for_each_gcwq_cpu(cpu) { | ||
958 | struct global_cwq *gcwq = get_gcwq(cpu); | ||
959 | struct worker *worker; | ||
960 | struct hlist_node *pos; | ||
961 | int i; | ||
962 | |||
963 | spin_lock_irqsave(&gcwq->lock, flags); | ||
964 | for_each_busy_worker(worker, i, pos, gcwq) { | ||
965 | if (worker->task != current) | ||
966 | continue; | ||
967 | spin_unlock_irqrestore(&gcwq->lock, flags); | ||
968 | /* | ||
969 | * I'm @worker, no locking necessary. See if @work | ||
970 | * is headed to the same workqueue. | ||
971 | */ | ||
972 | return worker->current_cwq->wq == wq; | ||
973 | } | ||
974 | spin_unlock_irqrestore(&gcwq->lock, flags); | ||
975 | } | ||
976 | return false; | ||
977 | } | ||
978 | |||
935 | static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, | 979 | static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, |
936 | struct work_struct *work) | 980 | struct work_struct *work) |
937 | { | 981 | { |
@@ -943,7 +987,9 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, | |||
943 | 987 | ||
944 | debug_work_activate(work); | 988 | debug_work_activate(work); |
945 | 989 | ||
946 | if (WARN_ON_ONCE(wq->flags & WQ_DYING)) | 990 | /* if dying, only works from the same workqueue are allowed */ |
991 | if (unlikely(wq->flags & WQ_DYING) && | ||
992 | WARN_ON_ONCE(!is_chained_work(wq))) | ||
947 | return; | 993 | return; |
948 | 994 | ||
949 | /* determine gcwq to use */ | 995 | /* determine gcwq to use */ |
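
A workqueue marked WQ_DYING no longer rejects every queueing attempt: work queued by a worker that is itself executing on the same workqueue (chained work) is still accepted, and only external queueing trips the WARN_ON_ONCE. A sketch of the self-requeueing that stays legal while destroy_workqueue() drains the queue (demo_wq, demo_fn and more_to_do() are hypothetical):

static struct workqueue_struct *demo_wq;        /* hypothetical, e.g. from alloc_workqueue() */

static bool more_to_do(void);                   /* hypothetical predicate */

static void demo_fn(struct work_struct *work);
static DECLARE_WORK(demo_work, demo_fn);

static void demo_fn(struct work_struct *work)
{
        /* ... process one chunk ... */

        /* chained queueing: current is a worker of demo_wq, so this is
         * still allowed even after demo_wq has been marked WQ_DYING */
        if (more_to_do())
                queue_work(demo_wq, &demo_work);
}
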
@@ -1806,7 +1852,7 @@ __acquires(&gcwq->lock) | |||
1806 | spin_unlock_irq(&gcwq->lock); | 1852 | spin_unlock_irq(&gcwq->lock); |
1807 | 1853 | ||
1808 | work_clear_pending(work); | 1854 | work_clear_pending(work); |
1809 | lock_map_acquire(&cwq->wq->lockdep_map); | 1855 | lock_map_acquire_read(&cwq->wq->lockdep_map); |
1810 | lock_map_acquire(&lockdep_map); | 1856 | lock_map_acquire(&lockdep_map); |
1811 | trace_workqueue_execute_start(work); | 1857 | trace_workqueue_execute_start(work); |
1812 | f(work); | 1858 | f(work); |
@@ -2009,6 +2055,15 @@ repeat: | |||
2009 | move_linked_works(work, scheduled, &n); | 2055 | move_linked_works(work, scheduled, &n); |
2010 | 2056 | ||
2011 | process_scheduled_works(rescuer); | 2057 | process_scheduled_works(rescuer); |
2058 | |||
2059 | /* | ||
2060 | * Leave this gcwq. If keep_working() is %true, notify a | ||
2061 | * regular worker; otherwise, we end up with 0 concurrency | ||
2062 | * and stalling the execution. | ||
2063 | */ | ||
2064 | if (keep_working(gcwq)) | ||
2065 | wake_up_worker(gcwq); | ||
2066 | |||
2012 | spin_unlock_irq(&gcwq->lock); | 2067 | spin_unlock_irq(&gcwq->lock); |
2013 | } | 2068 | } |
2014 | 2069 | ||
@@ -2350,8 +2405,18 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, | |||
2350 | insert_wq_barrier(cwq, barr, work, worker); | 2405 | insert_wq_barrier(cwq, barr, work, worker); |
2351 | spin_unlock_irq(&gcwq->lock); | 2406 | spin_unlock_irq(&gcwq->lock); |
2352 | 2407 | ||
2353 | lock_map_acquire(&cwq->wq->lockdep_map); | 2408 | /* |
2409 | * If @max_active is 1 or rescuer is in use, flushing another work | ||
2410 | * item on the same workqueue may lead to deadlock. Make sure the | ||
2411 | * flusher is not running on the same workqueue by verifying write | ||
2412 | * access. | ||
2413 | */ | ||
2414 | if (cwq->wq->saved_max_active == 1 || cwq->wq->flags & WQ_RESCUER) | ||
2415 | lock_map_acquire(&cwq->wq->lockdep_map); | ||
2416 | else | ||
2417 | lock_map_acquire_read(&cwq->wq->lockdep_map); | ||
2354 | lock_map_release(&cwq->wq->lockdep_map); | 2418 | lock_map_release(&cwq->wq->lockdep_map); |
2419 | |||
2355 | return true; | 2420 | return true; |
2356 | already_gone: | 2421 | already_gone: |
2357 | spin_unlock_irq(&gcwq->lock); | 2422 | spin_unlock_irq(&gcwq->lock); |
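
Together with the earlier process_one_work() hunk that switches to lock_map_acquire_read(), flushing a single work item now takes the workqueue's lockdep map for reading unless the workqueue is ordered (saved_max_active == 1) or has a rescuer; in those cases flushing another item of the same workqueue from inside a work function really can deadlock, and the full acquisition keeps lockdep reporting it. A sketch of the deadlock that still must be flagged (ordered_wq and the demo work items are hypothetical):

static struct workqueue_struct *ordered_wq;     /* hypothetical: max_active == 1 */

static void second_fn(struct work_struct *work) { /* ... */ }
static DECLARE_WORK(second_work, second_fn);

static void first_fn(struct work_struct *work)
{
        queue_work(ordered_wq, &second_work);

        /* Deadlock on an ordered workqueue: second_work cannot start
         * until first_fn() returns, yet first_fn() waits for it here.
         * The write-mode lock_map_acquire() above lets lockdep see the
         * recursion and complain. */
        flush_work(&second_work);
}
static DECLARE_WORK(first_work, first_fn);

On a multi-threaded workqueue without a rescuer the same sequence is harmless, which is why the read acquisition is used there and the false positives go away.
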
@@ -2908,7 +2973,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name, | |||
2908 | */ | 2973 | */ |
2909 | spin_lock(&workqueue_lock); | 2974 | spin_lock(&workqueue_lock); |
2910 | 2975 | ||
2911 | if (workqueue_freezing && wq->flags & WQ_FREEZEABLE) | 2976 | if (workqueue_freezing && wq->flags & WQ_FREEZABLE) |
2912 | for_each_cwq_cpu(cpu, wq) | 2977 | for_each_cwq_cpu(cpu, wq) |
2913 | get_cwq(cpu, wq)->max_active = 0; | 2978 | get_cwq(cpu, wq)->max_active = 0; |
2914 | 2979 | ||
@@ -2936,11 +3001,35 @@ EXPORT_SYMBOL_GPL(__alloc_workqueue_key); | |||
2936 | */ | 3001 | */ |
2937 | void destroy_workqueue(struct workqueue_struct *wq) | 3002 | void destroy_workqueue(struct workqueue_struct *wq) |
2938 | { | 3003 | { |
3004 | unsigned int flush_cnt = 0; | ||
2939 | unsigned int cpu; | 3005 | unsigned int cpu; |
2940 | 3006 | ||
3007 | /* | ||
3008 | * Mark @wq dying and drain all pending works. Once WQ_DYING is | ||
3009 | * set, only chain queueing is allowed. IOW, only currently | ||
3010 | * pending or running work items on @wq can queue further work | ||
3011 | * items on it. @wq is flushed repeatedly until it becomes empty. | ||
3012 | * The number of flushes is determined by the depth of chaining and | ||
3013 | * should be relatively short. Whine if it takes too long. | ||
3014 | */ | ||
2941 | wq->flags |= WQ_DYING; | 3015 | wq->flags |= WQ_DYING; |
3016 | reflush: | ||
2942 | flush_workqueue(wq); | 3017 | flush_workqueue(wq); |
2943 | 3018 | ||
3019 | for_each_cwq_cpu(cpu, wq) { | ||
3020 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); | ||
3021 | |||
3022 | if (!cwq->nr_active && list_empty(&cwq->delayed_works)) | ||
3023 | continue; | ||
3024 | |||
3025 | if (++flush_cnt == 10 || | ||
3026 | (flush_cnt % 100 == 0 && flush_cnt <= 1000)) | ||
3027 | printk(KERN_WARNING "workqueue %s: flush on " | ||
3028 | "destruction isn't complete after %u tries\n", | ||
3029 | wq->name, flush_cnt); | ||
3030 | goto reflush; | ||
3031 | } | ||
3032 | |||
2944 | /* | 3033 | /* |
2945 | * wq list is used to freeze wq, remove from list after | 3034 | * wq list is used to freeze wq, remove from list after |
2946 | * flushing is complete in case freeze races us. | 3035 | * flushing is complete in case freeze races us. |
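
Because chained queueing can add new work while the queue is being torn down, a single flush_workqueue() no longer guarantees emptiness; destroy_workqueue() therefore re-flushes until every cpu_workqueue is idle and complains if that takes suspiciously many passes. A standalone sketch of the same drain-until-empty shape (the simulated backlog is purely illustrative):

#include <stdbool.h>
#include <stdio.h>

/* toy stand-ins: pretend in-flight items already chained 25 more work
 * items, and each "flush" retires three of them */
static unsigned int pending = 25;

static void flush_all(void)
{
        pending = pending > 3 ? pending - 3 : 0;
}

static bool queue_is_empty(void)
{
        return pending == 0;
}

static void drain(const char *name)
{
        unsigned int flush_cnt = 0;

        for (;;) {
                flush_all();
                if (queue_is_empty())
                        return;
                /* same whining schedule as the patch: at 10 tries,
                 * then every 100 up to 1000 */
                if (++flush_cnt == 10 ||
                    (flush_cnt % 100 == 0 && flush_cnt <= 1000))
                        fprintf(stderr, "workqueue %s: flush on destruction "
                                "isn't complete after %u tries\n",
                                name, flush_cnt);
        }
}

int main(void)
{
        drain("demo");
        return 0;
}
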
@@ -2996,7 +3085,7 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) | |||
2996 | 3085 | ||
2997 | spin_lock_irq(&gcwq->lock); | 3086 | spin_lock_irq(&gcwq->lock); |
2998 | 3087 | ||
2999 | if (!(wq->flags & WQ_FREEZEABLE) || | 3088 | if (!(wq->flags & WQ_FREEZABLE) || |
3000 | !(gcwq->flags & GCWQ_FREEZING)) | 3089 | !(gcwq->flags & GCWQ_FREEZING)) |
3001 | get_cwq(gcwq->cpu, wq)->max_active = max_active; | 3090 | get_cwq(gcwq->cpu, wq)->max_active = max_active; |
3002 | 3091 | ||
@@ -3246,7 +3335,7 @@ static int __cpuinit trustee_thread(void *__gcwq) | |||
3246 | * want to get it over with ASAP - spam rescuers, wake up as | 3335 | * want to get it over with ASAP - spam rescuers, wake up as |
3247 | * many idlers as necessary and create new ones till the | 3336 | * many idlers as necessary and create new ones till the |
3248 | * worklist is empty. Note that if the gcwq is frozen, there | 3337 | * worklist is empty. Note that if the gcwq is frozen, there |
3249 | * may be frozen works in freezeable cwqs. Don't declare | 3338 | * may be frozen works in freezable cwqs. Don't declare |
3250 | * completion while frozen. | 3339 | * completion while frozen. |
3251 | */ | 3340 | */ |
3252 | while (gcwq->nr_workers != gcwq->nr_idle || | 3341 | while (gcwq->nr_workers != gcwq->nr_idle || |
@@ -3504,9 +3593,9 @@ EXPORT_SYMBOL_GPL(work_on_cpu); | |||
3504 | /** | 3593 | /** |
3505 | * freeze_workqueues_begin - begin freezing workqueues | 3594 | * freeze_workqueues_begin - begin freezing workqueues |
3506 | * | 3595 | * |
3507 | * Start freezing workqueues. After this function returns, all | 3596 | * Start freezing workqueues. After this function returns, all freezable |
3508 | * freezeable workqueues will queue new works to their frozen_works | 3597 | * workqueues will queue new works to their frozen_works list instead of |
3509 | * list instead of gcwq->worklist. | 3598 | * gcwq->worklist. |
3510 | * | 3599 | * |
3511 | * CONTEXT: | 3600 | * CONTEXT: |
3512 | * Grabs and releases workqueue_lock and gcwq->lock's. | 3601 | * Grabs and releases workqueue_lock and gcwq->lock's. |
@@ -3532,7 +3621,7 @@ void freeze_workqueues_begin(void) | |||
3532 | list_for_each_entry(wq, &workqueues, list) { | 3621 | list_for_each_entry(wq, &workqueues, list) { |
3533 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); | 3622 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); |
3534 | 3623 | ||
3535 | if (cwq && wq->flags & WQ_FREEZEABLE) | 3624 | if (cwq && wq->flags & WQ_FREEZABLE) |
3536 | cwq->max_active = 0; | 3625 | cwq->max_active = 0; |
3537 | } | 3626 | } |
3538 | 3627 | ||
@@ -3543,7 +3632,7 @@ void freeze_workqueues_begin(void) | |||
3543 | } | 3632 | } |
3544 | 3633 | ||
3545 | /** | 3634 | /** |
3546 | * freeze_workqueues_busy - are freezeable workqueues still busy? | 3635 | * freeze_workqueues_busy - are freezable workqueues still busy? |
3547 | * | 3636 | * |
3548 | * Check whether freezing is complete. This function must be called | 3637 | * Check whether freezing is complete. This function must be called |
3549 | * between freeze_workqueues_begin() and thaw_workqueues(). | 3638 | * between freeze_workqueues_begin() and thaw_workqueues(). |
@@ -3552,8 +3641,8 @@ void freeze_workqueues_begin(void) | |||
3552 | * Grabs and releases workqueue_lock. | 3641 | * Grabs and releases workqueue_lock. |
3553 | * | 3642 | * |
3554 | * RETURNS: | 3643 | * RETURNS: |
3555 | * %true if some freezeable workqueues are still busy. %false if | 3644 | * %true if some freezable workqueues are still busy. %false if freezing |
3556 | * freezing is complete. | 3645 | * is complete. |
3557 | */ | 3646 | */ |
3558 | bool freeze_workqueues_busy(void) | 3647 | bool freeze_workqueues_busy(void) |
3559 | { | 3648 | { |
@@ -3573,7 +3662,7 @@ bool freeze_workqueues_busy(void) | |||
3573 | list_for_each_entry(wq, &workqueues, list) { | 3662 | list_for_each_entry(wq, &workqueues, list) { |
3574 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); | 3663 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); |
3575 | 3664 | ||
3576 | if (!cwq || !(wq->flags & WQ_FREEZEABLE)) | 3665 | if (!cwq || !(wq->flags & WQ_FREEZABLE)) |
3577 | continue; | 3666 | continue; |
3578 | 3667 | ||
3579 | BUG_ON(cwq->nr_active < 0); | 3668 | BUG_ON(cwq->nr_active < 0); |
@@ -3618,7 +3707,7 @@ void thaw_workqueues(void) | |||
3618 | list_for_each_entry(wq, &workqueues, list) { | 3707 | list_for_each_entry(wq, &workqueues, list) { |
3619 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); | 3708 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); |
3620 | 3709 | ||
3621 | if (!cwq || !(wq->flags & WQ_FREEZEABLE)) | 3710 | if (!cwq || !(wq->flags & WQ_FREEZABLE)) |
3622 | continue; | 3711 | continue; |
3623 | 3712 | ||
3624 | /* restore max_active and repopulate worklist */ | 3713 | /* restore max_active and repopulate worklist */ |