diff options
Diffstat (limited to 'kernel')
103 files changed, 5235 insertions, 2595 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 0b5ff083fa2..353d3fe8ba3 100644 --- a/kernel/Makefile +++ b/kernel/Makefile | |||
| @@ -43,7 +43,7 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o | |||
| 43 | obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o | 43 | obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o |
| 44 | obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o | 44 | obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o |
| 45 | obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o | 45 | obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o |
| 46 | obj-$(CONFIG_USE_GENERIC_SMP_HELPERS) += smp.o | 46 | obj-$(CONFIG_SMP) += smp.o |
| 47 | ifneq ($(CONFIG_SMP),y) | 47 | ifneq ($(CONFIG_SMP),y) |
| 48 | obj-y += up.o | 48 | obj-y += up.o |
| 49 | endif | 49 | endif |
| @@ -100,6 +100,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace/ | |||
| 100 | obj-$(CONFIG_TRACING) += trace/ | 100 | obj-$(CONFIG_TRACING) += trace/ |
| 101 | obj-$(CONFIG_X86_DS) += trace/ | 101 | obj-$(CONFIG_X86_DS) += trace/ |
| 102 | obj-$(CONFIG_RING_BUFFER) += trace/ | 102 | obj-$(CONFIG_RING_BUFFER) += trace/ |
| 103 | obj-$(CONFIG_TRACEPOINTS) += trace/ | ||
| 103 | obj-$(CONFIG_SMP) += sched_cpupri.o | 104 | obj-$(CONFIG_SMP) += sched_cpupri.o |
| 104 | obj-$(CONFIG_IRQ_WORK) += irq_work.o | 105 | obj-$(CONFIG_IRQ_WORK) += irq_work.o |
| 105 | obj-$(CONFIG_PERF_EVENTS) += perf_event.o | 106 | obj-$(CONFIG_PERF_EVENTS) += perf_event.o |
| @@ -121,7 +122,7 @@ $(obj)/configs.o: $(obj)/config_data.h | |||
| 121 | # config_data.h contains the same information as ikconfig.h but gzipped. | 122 | # config_data.h contains the same information as ikconfig.h but gzipped. |
| 122 | # Info from config_data can be extracted from /proc/config* | 123 | # Info from config_data can be extracted from /proc/config* |
| 123 | targets += config_data.gz | 124 | targets += config_data.gz |
| 124 | $(obj)/config_data.gz: .config FORCE | 125 | $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE |
| 125 | $(call if_changed,gzip) | 126 | $(call if_changed,gzip) |
| 126 | 127 | ||
| 127 | quiet_cmd_ikconfiggz = IKCFG $@ | 128 | quiet_cmd_ikconfiggz = IKCFG $@ |
diff --git a/kernel/audit.c b/kernel/audit.c index 77770a034d5..e4956244ae5 100644 --- a/kernel/audit.c +++ b/kernel/audit.c | |||
| @@ -400,7 +400,7 @@ static void kauditd_send_skb(struct sk_buff *skb) | |||
| 400 | if (err < 0) { | 400 | if (err < 0) { |
| 401 | BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */ | 401 | BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */ |
| 402 | printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); | 402 | printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); |
| 403 | audit_log_lost("auditd dissapeared\n"); | 403 | audit_log_lost("auditd disappeared\n"); |
| 404 | audit_pid = 0; | 404 | audit_pid = 0; |
| 405 | /* we might get lucky and get this in the next auditd */ | 405 | /* we might get lucky and get this in the next auditd */ |
| 406 | audit_hold_skb(skb); | 406 | audit_hold_skb(skb); |
diff --git a/kernel/capability.c b/kernel/capability.c index 2f05303715a..9e9385f132c 100644 --- a/kernel/capability.c +++ b/kernel/capability.c | |||
| @@ -306,7 +306,7 @@ int capable(int cap) | |||
| 306 | BUG(); | 306 | BUG(); |
| 307 | } | 307 | } |
| 308 | 308 | ||
| 309 | if (security_capable(cap) == 0) { | 309 | if (security_capable(current_cred(), cap) == 0) { |
| 310 | current->flags |= PF_SUPERPRIV; | 310 | current->flags |= PF_SUPERPRIV; |
| 311 | return 1; | 311 | return 1; |
| 312 | } | 312 | } |
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 66a416b42c1..b24d7027b83 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
| @@ -764,6 +764,7 @@ EXPORT_SYMBOL_GPL(cgroup_unlock); | |||
| 764 | */ | 764 | */ |
| 765 | 765 | ||
| 766 | static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode); | 766 | static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode); |
| 767 | static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *); | ||
| 767 | static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); | 768 | static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); |
| 768 | static int cgroup_populate_dir(struct cgroup *cgrp); | 769 | static int cgroup_populate_dir(struct cgroup *cgrp); |
| 769 | static const struct inode_operations cgroup_dir_inode_operations; | 770 | static const struct inode_operations cgroup_dir_inode_operations; |
| @@ -860,6 +861,11 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) | |||
| 860 | iput(inode); | 861 | iput(inode); |
| 861 | } | 862 | } |
| 862 | 863 | ||
| 864 | static int cgroup_delete(const struct dentry *d) | ||
| 865 | { | ||
| 866 | return 1; | ||
| 867 | } | ||
| 868 | |||
| 863 | static void remove_dir(struct dentry *d) | 869 | static void remove_dir(struct dentry *d) |
| 864 | { | 870 | { |
| 865 | struct dentry *parent = dget(d->d_parent); | 871 | struct dentry *parent = dget(d->d_parent); |
| @@ -874,25 +880,29 @@ static void cgroup_clear_directory(struct dentry *dentry) | |||
| 874 | struct list_head *node; | 880 | struct list_head *node; |
| 875 | 881 | ||
| 876 | BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); | 882 | BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); |
| 877 | spin_lock(&dcache_lock); | 883 | spin_lock(&dentry->d_lock); |
| 878 | node = dentry->d_subdirs.next; | 884 | node = dentry->d_subdirs.next; |
| 879 | while (node != &dentry->d_subdirs) { | 885 | while (node != &dentry->d_subdirs) { |
| 880 | struct dentry *d = list_entry(node, struct dentry, d_u.d_child); | 886 | struct dentry *d = list_entry(node, struct dentry, d_u.d_child); |
| 887 | |||
| 888 | spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); | ||
| 881 | list_del_init(node); | 889 | list_del_init(node); |
| 882 | if (d->d_inode) { | 890 | if (d->d_inode) { |
| 883 | /* This should never be called on a cgroup | 891 | /* This should never be called on a cgroup |
| 884 | * directory with child cgroups */ | 892 | * directory with child cgroups */ |
| 885 | BUG_ON(d->d_inode->i_mode & S_IFDIR); | 893 | BUG_ON(d->d_inode->i_mode & S_IFDIR); |
| 886 | d = dget_locked(d); | 894 | dget_dlock(d); |
| 887 | spin_unlock(&dcache_lock); | 895 | spin_unlock(&d->d_lock); |
| 896 | spin_unlock(&dentry->d_lock); | ||
| 888 | d_delete(d); | 897 | d_delete(d); |
| 889 | simple_unlink(dentry->d_inode, d); | 898 | simple_unlink(dentry->d_inode, d); |
| 890 | dput(d); | 899 | dput(d); |
| 891 | spin_lock(&dcache_lock); | 900 | spin_lock(&dentry->d_lock); |
| 892 | } | 901 | } else |
| 902 | spin_unlock(&d->d_lock); | ||
| 893 | node = dentry->d_subdirs.next; | 903 | node = dentry->d_subdirs.next; |
| 894 | } | 904 | } |
| 895 | spin_unlock(&dcache_lock); | 905 | spin_unlock(&dentry->d_lock); |
| 896 | } | 906 | } |
| 897 | 907 | ||
| 898 | /* | 908 | /* |
| @@ -900,11 +910,16 @@ static void cgroup_clear_directory(struct dentry *dentry) | |||
| 900 | */ | 910 | */ |
| 901 | static void cgroup_d_remove_dir(struct dentry *dentry) | 911 | static void cgroup_d_remove_dir(struct dentry *dentry) |
| 902 | { | 912 | { |
| 913 | struct dentry *parent; | ||
| 914 | |||
| 903 | cgroup_clear_directory(dentry); | 915 | cgroup_clear_directory(dentry); |
| 904 | 916 | ||
| 905 | spin_lock(&dcache_lock); | 917 | parent = dentry->d_parent; |
| 918 | spin_lock(&parent->d_lock); | ||
| 919 | spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); | ||
| 906 | list_del_init(&dentry->d_u.d_child); | 920 | list_del_init(&dentry->d_u.d_child); |
| 907 | spin_unlock(&dcache_lock); | 921 | spin_unlock(&dentry->d_lock); |
| 922 | spin_unlock(&parent->d_lock); | ||
| 908 | remove_dir(dentry); | 923 | remove_dir(dentry); |
| 909 | } | 924 | } |
| 910 | 925 | ||
| @@ -1440,6 +1455,11 @@ static int cgroup_set_super(struct super_block *sb, void *data) | |||
| 1440 | 1455 | ||
| 1441 | static int cgroup_get_rootdir(struct super_block *sb) | 1456 | static int cgroup_get_rootdir(struct super_block *sb) |
| 1442 | { | 1457 | { |
| 1458 | static const struct dentry_operations cgroup_dops = { | ||
| 1459 | .d_iput = cgroup_diput, | ||
| 1460 | .d_delete = cgroup_delete, | ||
| 1461 | }; | ||
| 1462 | |||
| 1443 | struct inode *inode = | 1463 | struct inode *inode = |
| 1444 | cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb); | 1464 | cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb); |
| 1445 | struct dentry *dentry; | 1465 | struct dentry *dentry; |
| @@ -1457,6 +1477,8 @@ static int cgroup_get_rootdir(struct super_block *sb) | |||
| 1457 | return -ENOMEM; | 1477 | return -ENOMEM; |
| 1458 | } | 1478 | } |
| 1459 | sb->s_root = dentry; | 1479 | sb->s_root = dentry; |
| 1480 | /* for everything else we want ->d_op set */ | ||
| 1481 | sb->s_d_op = &cgroup_dops; | ||
| 1460 | return 0; | 1482 | return 0; |
| 1461 | } | 1483 | } |
| 1462 | 1484 | ||
| @@ -2180,12 +2202,20 @@ static const struct file_operations cgroup_file_operations = { | |||
| 2180 | }; | 2202 | }; |
| 2181 | 2203 | ||
| 2182 | static const struct inode_operations cgroup_dir_inode_operations = { | 2204 | static const struct inode_operations cgroup_dir_inode_operations = { |
| 2183 | .lookup = simple_lookup, | 2205 | .lookup = cgroup_lookup, |
| 2184 | .mkdir = cgroup_mkdir, | 2206 | .mkdir = cgroup_mkdir, |
| 2185 | .rmdir = cgroup_rmdir, | 2207 | .rmdir = cgroup_rmdir, |
| 2186 | .rename = cgroup_rename, | 2208 | .rename = cgroup_rename, |
| 2187 | }; | 2209 | }; |
| 2188 | 2210 | ||
| 2211 | static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) | ||
| 2212 | { | ||
| 2213 | if (dentry->d_name.len > NAME_MAX) | ||
| 2214 | return ERR_PTR(-ENAMETOOLONG); | ||
| 2215 | d_add(dentry, NULL); | ||
| 2216 | return NULL; | ||
| 2217 | } | ||
| 2218 | |||
| 2189 | /* | 2219 | /* |
| 2190 | * Check if a file is a control file | 2220 | * Check if a file is a control file |
| 2191 | */ | 2221 | */ |
| @@ -2199,10 +2229,6 @@ static inline struct cftype *__file_cft(struct file *file) | |||
| 2199 | static int cgroup_create_file(struct dentry *dentry, mode_t mode, | 2229 | static int cgroup_create_file(struct dentry *dentry, mode_t mode, |
| 2200 | struct super_block *sb) | 2230 | struct super_block *sb) |
| 2201 | { | 2231 | { |
| 2202 | static const struct dentry_operations cgroup_dops = { | ||
| 2203 | .d_iput = cgroup_diput, | ||
| 2204 | }; | ||
| 2205 | |||
| 2206 | struct inode *inode; | 2232 | struct inode *inode; |
| 2207 | 2233 | ||
| 2208 | if (!dentry) | 2234 | if (!dentry) |
| @@ -2228,7 +2254,6 @@ static int cgroup_create_file(struct dentry *dentry, mode_t mode, | |||
| 2228 | inode->i_size = 0; | 2254 | inode->i_size = 0; |
| 2229 | inode->i_fop = &cgroup_file_operations; | 2255 | inode->i_fop = &cgroup_file_operations; |
| 2230 | } | 2256 | } |
| 2231 | dentry->d_op = &cgroup_dops; | ||
| 2232 | d_instantiate(dentry, inode); | 2257 | d_instantiate(dentry, inode); |
| 2233 | dget(dentry); /* Extra count - pin the dentry in core */ | 2258 | dget(dentry); /* Extra count - pin the dentry in core */ |
| 2234 | return 0; | 2259 | return 0; |
| @@ -3638,9 +3663,7 @@ again: | |||
| 3638 | list_del(&cgrp->sibling); | 3663 | list_del(&cgrp->sibling); |
| 3639 | cgroup_unlock_hierarchy(cgrp->root); | 3664 | cgroup_unlock_hierarchy(cgrp->root); |
| 3640 | 3665 | ||
| 3641 | spin_lock(&cgrp->dentry->d_lock); | ||
| 3642 | d = dget(cgrp->dentry); | 3666 | d = dget(cgrp->dentry); |
| 3643 | spin_unlock(&d->d_lock); | ||
| 3644 | 3667 | ||
| 3645 | cgroup_d_remove_dir(d); | 3668 | cgroup_d_remove_dir(d); |
| 3646 | dput(d); | 3669 | dput(d); |
diff --git a/kernel/cpu.c b/kernel/cpu.c index f6e726f1849..156cc555614 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
| @@ -189,7 +189,6 @@ static inline void check_for_tasks(int cpu) | |||
| 189 | } | 189 | } |
| 190 | 190 | ||
| 191 | struct take_cpu_down_param { | 191 | struct take_cpu_down_param { |
| 192 | struct task_struct *caller; | ||
| 193 | unsigned long mod; | 192 | unsigned long mod; |
| 194 | void *hcpu; | 193 | void *hcpu; |
| 195 | }; | 194 | }; |
| @@ -198,7 +197,6 @@ struct take_cpu_down_param { | |||
| 198 | static int __ref take_cpu_down(void *_param) | 197 | static int __ref take_cpu_down(void *_param) |
| 199 | { | 198 | { |
| 200 | struct take_cpu_down_param *param = _param; | 199 | struct take_cpu_down_param *param = _param; |
| 201 | unsigned int cpu = (unsigned long)param->hcpu; | ||
| 202 | int err; | 200 | int err; |
| 203 | 201 | ||
| 204 | /* Ensure this CPU doesn't handle any more interrupts. */ | 202 | /* Ensure this CPU doesn't handle any more interrupts. */ |
| @@ -208,11 +206,6 @@ static int __ref take_cpu_down(void *_param) | |||
| 208 | 206 | ||
| 209 | cpu_notify(CPU_DYING | param->mod, param->hcpu); | 207 | cpu_notify(CPU_DYING | param->mod, param->hcpu); |
| 210 | 208 | ||
| 211 | if (task_cpu(param->caller) == cpu) | ||
| 212 | move_task_off_dead_cpu(cpu, param->caller); | ||
| 213 | /* Force idle task to run as soon as we yield: it should | ||
| 214 | immediately notice cpu is offline and die quickly. */ | ||
| 215 | sched_idle_next(); | ||
| 216 | return 0; | 209 | return 0; |
| 217 | } | 210 | } |
| 218 | 211 | ||
| @@ -223,7 +216,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) | |||
| 223 | void *hcpu = (void *)(long)cpu; | 216 | void *hcpu = (void *)(long)cpu; |
| 224 | unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; | 217 | unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; |
| 225 | struct take_cpu_down_param tcd_param = { | 218 | struct take_cpu_down_param tcd_param = { |
| 226 | .caller = current, | ||
| 227 | .mod = mod, | 219 | .mod = mod, |
| 228 | .hcpu = hcpu, | 220 | .hcpu = hcpu, |
| 229 | }; | 221 | }; |
| @@ -253,9 +245,15 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) | |||
| 253 | } | 245 | } |
| 254 | BUG_ON(cpu_online(cpu)); | 246 | BUG_ON(cpu_online(cpu)); |
| 255 | 247 | ||
| 256 | /* Wait for it to sleep (leaving idle task). */ | 248 | /* |
| 249 | * The migration_call() CPU_DYING callback will have removed all | ||
| 250 | * runnable tasks from the cpu, there's only the idle task left now | ||
| 251 | * that the migration thread is done doing the stop_machine thing. | ||
| 252 | * | ||
| 253 | * Wait for the stop thread to go away. | ||
| 254 | */ | ||
| 257 | while (!idle_cpu(cpu)) | 255 | while (!idle_cpu(cpu)) |
| 258 | yield(); | 256 | cpu_relax(); |
| 259 | 257 | ||
| 260 | /* This actually kills the CPU. */ | 258 | /* This actually kills the CPU. */ |
| 261 | __cpu_die(cpu); | 259 | __cpu_die(cpu); |
| @@ -386,6 +384,14 @@ out: | |||
| 386 | #ifdef CONFIG_PM_SLEEP_SMP | 384 | #ifdef CONFIG_PM_SLEEP_SMP |
| 387 | static cpumask_var_t frozen_cpus; | 385 | static cpumask_var_t frozen_cpus; |
| 388 | 386 | ||
| 387 | void __weak arch_disable_nonboot_cpus_begin(void) | ||
| 388 | { | ||
| 389 | } | ||
| 390 | |||
| 391 | void __weak arch_disable_nonboot_cpus_end(void) | ||
| 392 | { | ||
| 393 | } | ||
| 394 | |||
| 389 | int disable_nonboot_cpus(void) | 395 | int disable_nonboot_cpus(void) |
| 390 | { | 396 | { |
| 391 | int cpu, first_cpu, error = 0; | 397 | int cpu, first_cpu, error = 0; |
| @@ -397,6 +403,7 @@ int disable_nonboot_cpus(void) | |||
| 397 | * with the userspace trying to use the CPU hotplug at the same time | 403 | * with the userspace trying to use the CPU hotplug at the same time |
| 398 | */ | 404 | */ |
| 399 | cpumask_clear(frozen_cpus); | 405 | cpumask_clear(frozen_cpus); |
| 406 | arch_disable_nonboot_cpus_begin(); | ||
| 400 | 407 | ||
| 401 | printk("Disabling non-boot CPUs ...\n"); | 408 | printk("Disabling non-boot CPUs ...\n"); |
| 402 | for_each_online_cpu(cpu) { | 409 | for_each_online_cpu(cpu) { |
| @@ -412,6 +419,8 @@ int disable_nonboot_cpus(void) | |||
| 412 | } | 419 | } |
| 413 | } | 420 | } |
| 414 | 421 | ||
| 422 | arch_disable_nonboot_cpus_end(); | ||
| 423 | |||
| 415 | if (!error) { | 424 | if (!error) { |
| 416 | BUG_ON(num_online_cpus() > 1); | 425 | BUG_ON(num_online_cpus() > 1); |
| 417 | /* Make sure the CPUs won't be enabled by someone else */ | 426 | /* Make sure the CPUs won't be enabled by someone else */ |
diff --git a/kernel/cred.c b/kernel/cred.c index 6a1aa004e37..3a9d6dd53a6 100644 --- a/kernel/cred.c +++ b/kernel/cred.c | |||
| @@ -252,13 +252,13 @@ struct cred *cred_alloc_blank(void) | |||
| 252 | #endif | 252 | #endif |
| 253 | 253 | ||
| 254 | atomic_set(&new->usage, 1); | 254 | atomic_set(&new->usage, 1); |
| 255 | #ifdef CONFIG_DEBUG_CREDENTIALS | ||
| 256 | new->magic = CRED_MAGIC; | ||
| 257 | #endif | ||
| 255 | 258 | ||
| 256 | if (security_cred_alloc_blank(new, GFP_KERNEL) < 0) | 259 | if (security_cred_alloc_blank(new, GFP_KERNEL) < 0) |
| 257 | goto error; | 260 | goto error; |
| 258 | 261 | ||
| 259 | #ifdef CONFIG_DEBUG_CREDENTIALS | ||
| 260 | new->magic = CRED_MAGIC; | ||
| 261 | #endif | ||
| 262 | return new; | 262 | return new; |
| 263 | 263 | ||
| 264 | error: | 264 | error: |
| @@ -657,6 +657,8 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) | |||
| 657 | validate_creds(old); | 657 | validate_creds(old); |
| 658 | 658 | ||
| 659 | *new = *old; | 659 | *new = *old; |
| 660 | atomic_set(&new->usage, 1); | ||
| 661 | set_cred_subscribers(new, 0); | ||
| 660 | get_uid(new->user); | 662 | get_uid(new->user); |
| 661 | get_group_info(new->group_info); | 663 | get_group_info(new->group_info); |
| 662 | 664 | ||
| @@ -674,8 +676,6 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) | |||
| 674 | if (security_prepare_creds(new, old, GFP_KERNEL) < 0) | 676 | if (security_prepare_creds(new, old, GFP_KERNEL) < 0) |
| 675 | goto error; | 677 | goto error; |
| 676 | 678 | ||
| 677 | atomic_set(&new->usage, 1); | ||
| 678 | set_cred_subscribers(new, 0); | ||
| 679 | put_cred(old); | 679 | put_cred(old); |
| 680 | validate_creds(new); | 680 | validate_creds(new); |
| 681 | return new; | 681 | return new; |
| @@ -748,7 +748,11 @@ bool creds_are_invalid(const struct cred *cred) | |||
| 748 | if (cred->magic != CRED_MAGIC) | 748 | if (cred->magic != CRED_MAGIC) |
| 749 | return true; | 749 | return true; |
| 750 | #ifdef CONFIG_SECURITY_SELINUX | 750 | #ifdef CONFIG_SECURITY_SELINUX |
| 751 | if (selinux_is_enabled()) { | 751 | /* |
| 752 | * cred->security == NULL if security_cred_alloc_blank() or | ||
| 753 | * security_prepare_creds() returned an error. | ||
| 754 | */ | ||
| 755 | if (selinux_is_enabled() && cred->security) { | ||
| 752 | if ((unsigned long) cred->security < PAGE_SIZE) | 756 | if ((unsigned long) cred->security < PAGE_SIZE) |
| 753 | return true; | 757 | return true; |
| 754 | if ((*(u32 *)cred->security & 0xffffff00) == | 758 | if ((*(u32 *)cred->security & 0xffffff00) == |
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 37755d62192..bd3e8e29caa 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c | |||
| @@ -82,7 +82,7 @@ static kdbtab_t kdb_base_commands[50]; | |||
| 82 | #define for_each_kdbcmd(cmd, num) \ | 82 | #define for_each_kdbcmd(cmd, num) \ |
| 83 | for ((cmd) = kdb_base_commands, (num) = 0; \ | 83 | for ((cmd) = kdb_base_commands, (num) = 0; \ |
| 84 | num < kdb_max_commands; \ | 84 | num < kdb_max_commands; \ |
| 85 | num == KDB_BASE_CMD_MAX ? cmd = kdb_commands : cmd++, num++) | 85 | num++, num == KDB_BASE_CMD_MAX ? cmd = kdb_commands : cmd++) |
| 86 | 86 | ||
| 87 | typedef struct _kdbmsg { | 87 | typedef struct _kdbmsg { |
| 88 | int km_diag; /* kdb diagnostic */ | 88 | int km_diag; /* kdb diagnostic */ |
| @@ -646,7 +646,7 @@ static int kdb_defcmd2(const char *cmdstr, const char *argv0) | |||
| 646 | } | 646 | } |
| 647 | if (!s->usable) | 647 | if (!s->usable) |
| 648 | return KDB_NOTIMP; | 648 | return KDB_NOTIMP; |
| 649 | s->command = kmalloc((s->count + 1) * sizeof(*(s->command)), GFP_KDB); | 649 | s->command = kzalloc((s->count + 1) * sizeof(*(s->command)), GFP_KDB); |
| 650 | if (!s->command) { | 650 | if (!s->command) { |
| 651 | kdb_printf("Could not allocate new kdb_defcmd table for %s\n", | 651 | kdb_printf("Could not allocate new kdb_defcmd table for %s\n", |
| 652 | cmdstr); | 652 | cmdstr); |
| @@ -2361,7 +2361,7 @@ static int kdb_pid(int argc, const char **argv) | |||
| 2361 | */ | 2361 | */ |
| 2362 | static int kdb_ll(int argc, const char **argv) | 2362 | static int kdb_ll(int argc, const char **argv) |
| 2363 | { | 2363 | { |
| 2364 | int diag; | 2364 | int diag = 0; |
| 2365 | unsigned long addr; | 2365 | unsigned long addr; |
| 2366 | long offset = 0; | 2366 | long offset = 0; |
| 2367 | unsigned long va; | 2367 | unsigned long va; |
| @@ -2400,20 +2400,21 @@ static int kdb_ll(int argc, const char **argv) | |||
| 2400 | char buf[80]; | 2400 | char buf[80]; |
| 2401 | 2401 | ||
| 2402 | if (KDB_FLAG(CMD_INTERRUPT)) | 2402 | if (KDB_FLAG(CMD_INTERRUPT)) |
| 2403 | return 0; | 2403 | goto out; |
| 2404 | 2404 | ||
| 2405 | sprintf(buf, "%s " kdb_machreg_fmt "\n", command, va); | 2405 | sprintf(buf, "%s " kdb_machreg_fmt "\n", command, va); |
| 2406 | diag = kdb_parse(buf); | 2406 | diag = kdb_parse(buf); |
| 2407 | if (diag) | 2407 | if (diag) |
| 2408 | return diag; | 2408 | goto out; |
| 2409 | 2409 | ||
| 2410 | addr = va + linkoffset; | 2410 | addr = va + linkoffset; |
| 2411 | if (kdb_getword(&va, addr, sizeof(va))) | 2411 | if (kdb_getword(&va, addr, sizeof(va))) |
| 2412 | return 0; | 2412 | goto out; |
| 2413 | } | 2413 | } |
| 2414 | kfree(command); | ||
| 2415 | 2414 | ||
| 2416 | return 0; | 2415 | out: |
| 2416 | kfree(command); | ||
| 2417 | return diag; | ||
| 2417 | } | 2418 | } |
| 2418 | 2419 | ||
| 2419 | static int kdb_kgdb(int argc, const char **argv) | 2420 | static int kdb_kgdb(int argc, const char **argv) |
| @@ -2739,13 +2740,13 @@ int kdb_register_repeat(char *cmd, | |||
| 2739 | } | 2740 | } |
| 2740 | if (kdb_commands) { | 2741 | if (kdb_commands) { |
| 2741 | memcpy(new, kdb_commands, | 2742 | memcpy(new, kdb_commands, |
| 2742 | kdb_max_commands * sizeof(*new)); | 2743 | (kdb_max_commands - KDB_BASE_CMD_MAX) * sizeof(*new)); |
| 2743 | kfree(kdb_commands); | 2744 | kfree(kdb_commands); |
| 2744 | } | 2745 | } |
| 2745 | memset(new + kdb_max_commands, 0, | 2746 | memset(new + kdb_max_commands, 0, |
| 2746 | kdb_command_extend * sizeof(*new)); | 2747 | kdb_command_extend * sizeof(*new)); |
| 2747 | kdb_commands = new; | 2748 | kdb_commands = new; |
| 2748 | kp = kdb_commands + kdb_max_commands; | 2749 | kp = kdb_commands + kdb_max_commands - KDB_BASE_CMD_MAX; |
| 2749 | kdb_max_commands += kdb_command_extend; | 2750 | kdb_max_commands += kdb_command_extend; |
| 2750 | } | 2751 | } |
| 2751 | 2752 | ||
| @@ -2913,7 +2914,7 @@ static void __init kdb_cmd_init(void) | |||
| 2913 | } | 2914 | } |
| 2914 | } | 2915 | } |
| 2915 | 2916 | ||
| 2916 | /* Intialize kdb_printf, breakpoint tables and kdb state */ | 2917 | /* Initialize kdb_printf, breakpoint tables and kdb state */ |
| 2917 | void __init kdb_init(int lvl) | 2918 | void __init kdb_init(int lvl) |
| 2918 | { | 2919 | { |
| 2919 | static int kdb_init_lvl = KDB_NOT_INITIALIZED; | 2920 | static int kdb_init_lvl = KDB_NOT_INITIALIZED; |
diff --git a/kernel/exit.c b/kernel/exit.c index b194febf579..f9a45ebcc7b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
| @@ -69,7 +69,7 @@ static void __unhash_process(struct task_struct *p, bool group_dead) | |||
| 69 | 69 | ||
| 70 | list_del_rcu(&p->tasks); | 70 | list_del_rcu(&p->tasks); |
| 71 | list_del_init(&p->sibling); | 71 | list_del_init(&p->sibling); |
| 72 | __get_cpu_var(process_counts)--; | 72 | __this_cpu_dec(process_counts); |
| 73 | } | 73 | } |
| 74 | list_del_rcu(&p->thread_group); | 74 | list_del_rcu(&p->thread_group); |
| 75 | } | 75 | } |
| @@ -96,6 +96,14 @@ static void __exit_signal(struct task_struct *tsk) | |||
| 96 | sig->tty = NULL; | 96 | sig->tty = NULL; |
| 97 | } else { | 97 | } else { |
| 98 | /* | 98 | /* |
| 99 | * This can only happen if the caller is de_thread(). | ||
| 100 | * FIXME: this is the temporary hack, we should teach | ||
| 101 | * posix-cpu-timers to handle this case correctly. | ||
| 102 | */ | ||
| 103 | if (unlikely(has_group_leader_pid(tsk))) | ||
| 104 | posix_cpu_timers_exit_group(tsk); | ||
| 105 | |||
| 106 | /* | ||
| 99 | * If there is any task waiting for the group exit | 107 | * If there is any task waiting for the group exit |
| 100 | * then notify it: | 108 | * then notify it: |
| 101 | */ | 109 | */ |
| @@ -906,6 +914,15 @@ NORET_TYPE void do_exit(long code) | |||
| 906 | if (unlikely(!tsk->pid)) | 914 | if (unlikely(!tsk->pid)) |
| 907 | panic("Attempted to kill the idle task!"); | 915 | panic("Attempted to kill the idle task!"); |
| 908 | 916 | ||
| 917 | /* | ||
| 918 | * If do_exit is called because this processes oopsed, it's possible | ||
| 919 | * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before | ||
| 920 | * continuing. Amongst other possible reasons, this is to prevent | ||
| 921 | * mm_release()->clear_child_tid() from writing to a user-controlled | ||
| 922 | * kernel address. | ||
| 923 | */ | ||
| 924 | set_fs(USER_DS); | ||
| 925 | |||
| 909 | tracehook_report_exit(&code); | 926 | tracehook_report_exit(&code); |
| 910 | 927 | ||
| 911 | validate_creds_for_do_exit(tsk); | 928 | validate_creds_for_do_exit(tsk); |
| @@ -977,6 +994,15 @@ NORET_TYPE void do_exit(long code) | |||
| 977 | exit_fs(tsk); | 994 | exit_fs(tsk); |
| 978 | check_stack_usage(); | 995 | check_stack_usage(); |
| 979 | exit_thread(); | 996 | exit_thread(); |
| 997 | |||
| 998 | /* | ||
| 999 | * Flush inherited counters to the parent - before the parent | ||
| 1000 | * gets woken up by child-exit notifications. | ||
| 1001 | * | ||
| 1002 | * because of cgroup mode, must be called before cgroup_exit() | ||
| 1003 | */ | ||
| 1004 | perf_event_exit_task(tsk); | ||
| 1005 | |||
| 980 | cgroup_exit(tsk, 1); | 1006 | cgroup_exit(tsk, 1); |
| 981 | 1007 | ||
| 982 | if (group_dead) | 1008 | if (group_dead) |
| @@ -990,11 +1016,6 @@ NORET_TYPE void do_exit(long code) | |||
| 990 | * FIXME: do that only when needed, using sched_exit tracepoint | 1016 | * FIXME: do that only when needed, using sched_exit tracepoint |
| 991 | */ | 1017 | */ |
| 992 | flush_ptrace_hw_breakpoint(tsk); | 1018 | flush_ptrace_hw_breakpoint(tsk); |
| 993 | /* | ||
| 994 | * Flush inherited counters to the parent - before the parent | ||
| 995 | * gets woken up by child-exit notifications. | ||
| 996 | */ | ||
| 997 | perf_event_exit_task(tsk); | ||
| 998 | 1019 | ||
| 999 | exit_notify(tsk, group_dead); | 1020 | exit_notify(tsk, group_dead); |
| 1000 | #ifdef CONFIG_NUMA | 1021 | #ifdef CONFIG_NUMA |
diff --git a/kernel/fork.c b/kernel/fork.c index 3b159c5991b..25e429152dd 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
| @@ -66,6 +66,7 @@ | |||
| 66 | #include <linux/posix-timers.h> | 66 | #include <linux/posix-timers.h> |
| 67 | #include <linux/user-return-notifier.h> | 67 | #include <linux/user-return-notifier.h> |
| 68 | #include <linux/oom.h> | 68 | #include <linux/oom.h> |
| 69 | #include <linux/khugepaged.h> | ||
| 69 | 70 | ||
| 70 | #include <asm/pgtable.h> | 71 | #include <asm/pgtable.h> |
| 71 | #include <asm/pgalloc.h> | 72 | #include <asm/pgalloc.h> |
| @@ -169,6 +170,7 @@ EXPORT_SYMBOL(free_task); | |||
| 169 | static inline void free_signal_struct(struct signal_struct *sig) | 170 | static inline void free_signal_struct(struct signal_struct *sig) |
| 170 | { | 171 | { |
| 171 | taskstats_tgid_free(sig); | 172 | taskstats_tgid_free(sig); |
| 173 | sched_autogroup_exit(sig); | ||
| 172 | kmem_cache_free(signal_cachep, sig); | 174 | kmem_cache_free(signal_cachep, sig); |
| 173 | } | 175 | } |
| 174 | 176 | ||
| @@ -273,6 +275,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) | |||
| 273 | 275 | ||
| 274 | setup_thread_stack(tsk, orig); | 276 | setup_thread_stack(tsk, orig); |
| 275 | clear_user_return_notifier(tsk); | 277 | clear_user_return_notifier(tsk); |
| 278 | clear_tsk_need_resched(tsk); | ||
| 276 | stackend = end_of_stack(tsk); | 279 | stackend = end_of_stack(tsk); |
| 277 | *stackend = STACK_END_MAGIC; /* for overflow detection */ | 280 | *stackend = STACK_END_MAGIC; /* for overflow detection */ |
| 278 | 281 | ||
| @@ -328,6 +331,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
| 328 | retval = ksm_fork(mm, oldmm); | 331 | retval = ksm_fork(mm, oldmm); |
| 329 | if (retval) | 332 | if (retval) |
| 330 | goto out; | 333 | goto out; |
| 334 | retval = khugepaged_fork(mm, oldmm); | ||
| 335 | if (retval) | ||
| 336 | goto out; | ||
| 331 | 337 | ||
| 332 | prev = NULL; | 338 | prev = NULL; |
| 333 | for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { | 339 | for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { |
| @@ -527,6 +533,9 @@ void __mmdrop(struct mm_struct *mm) | |||
| 527 | mm_free_pgd(mm); | 533 | mm_free_pgd(mm); |
| 528 | destroy_context(mm); | 534 | destroy_context(mm); |
| 529 | mmu_notifier_mm_destroy(mm); | 535 | mmu_notifier_mm_destroy(mm); |
| 536 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
| 537 | VM_BUG_ON(mm->pmd_huge_pte); | ||
| 538 | #endif | ||
| 530 | free_mm(mm); | 539 | free_mm(mm); |
| 531 | } | 540 | } |
| 532 | EXPORT_SYMBOL_GPL(__mmdrop); | 541 | EXPORT_SYMBOL_GPL(__mmdrop); |
| @@ -541,6 +550,7 @@ void mmput(struct mm_struct *mm) | |||
| 541 | if (atomic_dec_and_test(&mm->mm_users)) { | 550 | if (atomic_dec_and_test(&mm->mm_users)) { |
| 542 | exit_aio(mm); | 551 | exit_aio(mm); |
| 543 | ksm_exit(mm); | 552 | ksm_exit(mm); |
| 553 | khugepaged_exit(mm); /* must run before exit_mmap */ | ||
| 544 | exit_mmap(mm); | 554 | exit_mmap(mm); |
| 545 | set_mm_exe_file(mm, NULL); | 555 | set_mm_exe_file(mm, NULL); |
| 546 | if (!list_empty(&mm->mmlist)) { | 556 | if (!list_empty(&mm->mmlist)) { |
| @@ -667,6 +677,10 @@ struct mm_struct *dup_mm(struct task_struct *tsk) | |||
| 667 | mm->token_priority = 0; | 677 | mm->token_priority = 0; |
| 668 | mm->last_interval = 0; | 678 | mm->last_interval = 0; |
| 669 | 679 | ||
| 680 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
| 681 | mm->pmd_huge_pte = NULL; | ||
| 682 | #endif | ||
| 683 | |||
| 670 | if (!mm_init(mm, tsk)) | 684 | if (!mm_init(mm, tsk)) |
| 671 | goto fail_nomem; | 685 | goto fail_nomem; |
| 672 | 686 | ||
| @@ -904,9 +918,11 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) | |||
| 904 | posix_cpu_timers_init_group(sig); | 918 | posix_cpu_timers_init_group(sig); |
| 905 | 919 | ||
| 906 | tty_audit_fork(sig); | 920 | tty_audit_fork(sig); |
| 921 | sched_autogroup_fork(sig); | ||
| 907 | 922 | ||
| 908 | sig->oom_adj = current->signal->oom_adj; | 923 | sig->oom_adj = current->signal->oom_adj; |
| 909 | sig->oom_score_adj = current->signal->oom_score_adj; | 924 | sig->oom_score_adj = current->signal->oom_score_adj; |
| 925 | sig->oom_score_adj_min = current->signal->oom_score_adj_min; | ||
| 910 | 926 | ||
| 911 | mutex_init(&sig->cred_guard_mutex); | 927 | mutex_init(&sig->cred_guard_mutex); |
| 912 | 928 | ||
| @@ -1282,7 +1298,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
| 1282 | attach_pid(p, PIDTYPE_SID, task_session(current)); | 1298 | attach_pid(p, PIDTYPE_SID, task_session(current)); |
| 1283 | list_add_tail(&p->sibling, &p->real_parent->children); | 1299 | list_add_tail(&p->sibling, &p->real_parent->children); |
| 1284 | list_add_tail_rcu(&p->tasks, &init_task.tasks); | 1300 | list_add_tail_rcu(&p->tasks, &init_task.tasks); |
| 1285 | __get_cpu_var(process_counts)++; | 1301 | __this_cpu_inc(process_counts); |
| 1286 | } | 1302 | } |
| 1287 | attach_pid(p, PIDTYPE_PID, pid); | 1303 | attach_pid(p, PIDTYPE_PID, pid); |
| 1288 | nr_threads++; | 1304 | nr_threads++; |
| @@ -1407,23 +1423,6 @@ long do_fork(unsigned long clone_flags, | |||
| 1407 | } | 1423 | } |
| 1408 | 1424 | ||
| 1409 | /* | 1425 | /* |
| 1410 | * We hope to recycle these flags after 2.6.26 | ||
| 1411 | */ | ||
| 1412 | if (unlikely(clone_flags & CLONE_STOPPED)) { | ||
| 1413 | static int __read_mostly count = 100; | ||
| 1414 | |||
| 1415 | if (count > 0 && printk_ratelimit()) { | ||
| 1416 | char comm[TASK_COMM_LEN]; | ||
| 1417 | |||
| 1418 | count--; | ||
| 1419 | printk(KERN_INFO "fork(): process `%s' used deprecated " | ||
| 1420 | "clone flags 0x%lx\n", | ||
| 1421 | get_task_comm(comm, current), | ||
| 1422 | clone_flags & CLONE_STOPPED); | ||
| 1423 | } | ||
| 1424 | } | ||
| 1425 | |||
| 1426 | /* | ||
| 1427 | * When called from kernel_thread, don't do user tracing stuff. | 1426 | * When called from kernel_thread, don't do user tracing stuff. |
| 1428 | */ | 1427 | */ |
| 1429 | if (likely(user_mode(regs))) | 1428 | if (likely(user_mode(regs))) |
| @@ -1461,16 +1460,7 @@ long do_fork(unsigned long clone_flags, | |||
| 1461 | */ | 1460 | */ |
| 1462 | p->flags &= ~PF_STARTING; | 1461 | p->flags &= ~PF_STARTING; |
| 1463 | 1462 | ||
| 1464 | if (unlikely(clone_flags & CLONE_STOPPED)) { | 1463 | wake_up_new_task(p, clone_flags); |
| 1465 | /* | ||
| 1466 | * We'll start up with an immediate SIGSTOP. | ||
| 1467 | */ | ||
| 1468 | sigaddset(&p->pending.signal, SIGSTOP); | ||
| 1469 | set_tsk_thread_flag(p, TIF_SIGPENDING); | ||
| 1470 | __set_task_state(p, TASK_STOPPED); | ||
| 1471 | } else { | ||
| 1472 | wake_up_new_task(p, clone_flags); | ||
| 1473 | } | ||
| 1474 | 1464 | ||
| 1475 | tracehook_report_clone_complete(trace, regs, | 1465 | tracehook_report_clone_complete(trace, regs, |
| 1476 | clone_flags, nr, p); | 1466 | clone_flags, nr, p); |
diff --git a/kernel/freezer.c b/kernel/freezer.c index bd1d42b17cb..66ecd2ead21 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c | |||
| @@ -104,8 +104,13 @@ bool freeze_task(struct task_struct *p, bool sig_only) | |||
| 104 | } | 104 | } |
| 105 | 105 | ||
| 106 | if (should_send_signal(p)) { | 106 | if (should_send_signal(p)) { |
| 107 | if (!signal_pending(p)) | 107 | fake_signal_wake_up(p); |
| 108 | fake_signal_wake_up(p); | 108 | /* |
| 109 | * fake_signal_wake_up() goes through p's scheduler | ||
| 110 | * lock and guarantees that TASK_STOPPED/TRACED -> | ||
| 111 | * TASK_RUNNING transition can't race with task state | ||
| 112 | * testing in try_to_freeze_tasks(). | ||
| 113 | */ | ||
| 109 | } else if (sig_only) { | 114 | } else if (sig_only) { |
| 110 | return false; | 115 | return false; |
| 111 | } else { | 116 | } else { |
diff --git a/kernel/futex.c b/kernel/futex.c index 6c683b37f2c..b766d28accd 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
| @@ -69,6 +69,14 @@ int __read_mostly futex_cmpxchg_enabled; | |||
| 69 | #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) | 69 | #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) |
| 70 | 70 | ||
| 71 | /* | 71 | /* |
| 72 | * Futex flags used to encode options to functions and preserve them across | ||
| 73 | * restarts. | ||
| 74 | */ | ||
| 75 | #define FLAGS_SHARED 0x01 | ||
| 76 | #define FLAGS_CLOCKRT 0x02 | ||
| 77 | #define FLAGS_HAS_TIMEOUT 0x04 | ||
| 78 | |||
| 79 | /* | ||
| 72 | * Priority Inheritance state: | 80 | * Priority Inheritance state: |
| 73 | */ | 81 | */ |
| 74 | struct futex_pi_state { | 82 | struct futex_pi_state { |
| @@ -123,6 +131,12 @@ struct futex_q { | |||
| 123 | u32 bitset; | 131 | u32 bitset; |
| 124 | }; | 132 | }; |
| 125 | 133 | ||
| 134 | static const struct futex_q futex_q_init = { | ||
| 135 | /* list gets initialized in queue_me()*/ | ||
| 136 | .key = FUTEX_KEY_INIT, | ||
| 137 | .bitset = FUTEX_BITSET_MATCH_ANY | ||
| 138 | }; | ||
| 139 | |||
| 126 | /* | 140 | /* |
| 127 | * Hash buckets are shared by all the futex_keys that hash to the same | 141 | * Hash buckets are shared by all the futex_keys that hash to the same |
| 128 | * location. Each key may have multiple futex_q structures, one for each task | 142 | * location. Each key may have multiple futex_q structures, one for each task |
| @@ -219,7 +233,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key) | |||
| 219 | { | 233 | { |
| 220 | unsigned long address = (unsigned long)uaddr; | 234 | unsigned long address = (unsigned long)uaddr; |
| 221 | struct mm_struct *mm = current->mm; | 235 | struct mm_struct *mm = current->mm; |
| 222 | struct page *page; | 236 | struct page *page, *page_head; |
| 223 | int err; | 237 | int err; |
| 224 | 238 | ||
| 225 | /* | 239 | /* |
| @@ -251,11 +265,46 @@ again: | |||
| 251 | if (err < 0) | 265 | if (err < 0) |
| 252 | return err; | 266 | return err; |
| 253 | 267 | ||
| 254 | page = compound_head(page); | 268 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
| 255 | lock_page(page); | 269 | page_head = page; |
| 256 | if (!page->mapping) { | 270 | if (unlikely(PageTail(page))) { |
| 257 | unlock_page(page); | ||
| 258 | put_page(page); | 271 | put_page(page); |
| 272 | /* serialize against __split_huge_page_splitting() */ | ||
| 273 | local_irq_disable(); | ||
| 274 | if (likely(__get_user_pages_fast(address, 1, 1, &page) == 1)) { | ||
| 275 | page_head = compound_head(page); | ||
| 276 | /* | ||
| 277 | * page_head is valid pointer but we must pin | ||
| 278 | * it before taking the PG_lock and/or | ||
| 279 | * PG_compound_lock. The moment we re-enable | ||
| 280 | * irqs __split_huge_page_splitting() can | ||
| 281 | * return and the head page can be freed from | ||
| 282 | * under us. We can't take the PG_lock and/or | ||
| 283 | * PG_compound_lock on a page that could be | ||
| 284 | * freed from under us. | ||
| 285 | */ | ||
| 286 | if (page != page_head) { | ||
| 287 | get_page(page_head); | ||
| 288 | put_page(page); | ||
| 289 | } | ||
| 290 | local_irq_enable(); | ||
| 291 | } else { | ||
| 292 | local_irq_enable(); | ||
| 293 | goto again; | ||
| 294 | } | ||
| 295 | } | ||
| 296 | #else | ||
| 297 | page_head = compound_head(page); | ||
| 298 | if (page != page_head) { | ||
| 299 | get_page(page_head); | ||
| 300 | put_page(page); | ||
| 301 | } | ||
| 302 | #endif | ||
| 303 | |||
| 304 | lock_page(page_head); | ||
| 305 | if (!page_head->mapping) { | ||
| 306 | unlock_page(page_head); | ||
| 307 | put_page(page_head); | ||
| 259 | goto again; | 308 | goto again; |
| 260 | } | 309 | } |
| 261 | 310 | ||
| @@ -266,25 +315,24 @@ again: | |||
| 266 | * it's a read-only handle, it's expected that futexes attach to | 315 | * it's a read-only handle, it's expected that futexes attach to |
| 267 | * the object not the particular process. | 316 | * the object not the particular process. |
| 268 | */ | 317 | */ |
| 269 | if (PageAnon(page)) { | 318 | if (PageAnon(page_head)) { |
| 270 | key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ | 319 | key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ |
| 271 | key->private.mm = mm; | 320 | key->private.mm = mm; |
| 272 | key->private.address = address; | 321 | key->private.address = address; |
| 273 | } else { | 322 | } else { |
| 274 | key->both.offset |= FUT_OFF_INODE; /* inode-based key */ | 323 | key->both.offset |= FUT_OFF_INODE; /* inode-based key */ |
| 275 | key->shared.inode = page->mapping->host; | 324 | key->shared.inode = page_head->mapping->host; |
| 276 | key->shared.pgoff = page->index; | 325 | key->shared.pgoff = page_head->index; |
| 277 | } | 326 | } |
| 278 | 327 | ||
| 279 | get_futex_key_refs(key); | 328 | get_futex_key_refs(key); |
| 280 | 329 | ||
| 281 | unlock_page(page); | 330 | unlock_page(page_head); |
| 282 | put_page(page); | 331 | put_page(page_head); |
| 283 | return 0; | 332 | return 0; |
| 284 | } | 333 | } |
| 285 | 334 | ||
| 286 | static inline | 335 | static inline void put_futex_key(union futex_key *key) |
| 287 | void put_futex_key(int fshared, union futex_key *key) | ||
| 288 | { | 336 | { |
| 289 | drop_futex_key_refs(key); | 337 | drop_futex_key_refs(key); |
| 290 | } | 338 | } |
| @@ -778,10 +826,9 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) | |||
| 778 | new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); | 826 | new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); |
| 779 | 827 | ||
| 780 | /* | 828 | /* |
| 781 | * This happens when we have stolen the lock and the original | 829 | * It is possible that the next waiter (the one that brought |
| 782 | * pending owner did not enqueue itself back on the rt_mutex. | 830 | * this owner to the kernel) timed out and is no longer |
| 783 | * Thats not a tragedy. We know that way, that a lock waiter | 831 | * waiting on the lock. |
| 784 | * is on the fly. We make the futex_q waiter the pending owner. | ||
| 785 | */ | 832 | */ |
| 786 | if (!new_owner) | 833 | if (!new_owner) |
| 787 | new_owner = this->task; | 834 | new_owner = this->task; |
| @@ -870,7 +917,8 @@ double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) | |||
| 870 | /* | 917 | /* |
| 871 | * Wake up waiters matching bitset queued on this futex (uaddr). | 918 | * Wake up waiters matching bitset queued on this futex (uaddr). |
| 872 | */ | 919 | */ |
| 873 | static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) | 920 | static int |
| 921 | futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) | ||
| 874 | { | 922 | { |
| 875 | struct futex_hash_bucket *hb; | 923 | struct futex_hash_bucket *hb; |
| 876 | struct futex_q *this, *next; | 924 | struct futex_q *this, *next; |
| @@ -881,7 +929,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) | |||
| 881 | if (!bitset) | 929 | if (!bitset) |
| 882 | return -EINVAL; | 930 | return -EINVAL; |
| 883 | 931 | ||
| 884 | ret = get_futex_key(uaddr, fshared, &key); | 932 | ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key); |
| 885 | if (unlikely(ret != 0)) | 933 | if (unlikely(ret != 0)) |
| 886 | goto out; | 934 | goto out; |
| 887 | 935 | ||
| @@ -907,7 +955,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) | |||
| 907 | } | 955 | } |
| 908 | 956 | ||
| 909 | spin_unlock(&hb->lock); | 957 | spin_unlock(&hb->lock); |
| 910 | put_futex_key(fshared, &key); | 958 | put_futex_key(&key); |
| 911 | out: | 959 | out: |
| 912 | return ret; | 960 | return ret; |
| 913 | } | 961 | } |
| @@ -917,7 +965,7 @@ out: | |||
| 917 | * to this virtual address: | 965 | * to this virtual address: |
| 918 | */ | 966 | */ |
| 919 | static int | 967 | static int |
| 920 | futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, | 968 | futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, |
| 921 | int nr_wake, int nr_wake2, int op) | 969 | int nr_wake, int nr_wake2, int op) |
| 922 | { | 970 | { |
| 923 | union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; | 971 | union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; |
| @@ -927,10 +975,10 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, | |||
| 927 | int ret, op_ret; | 975 | int ret, op_ret; |
| 928 | 976 | ||
| 929 | retry: | 977 | retry: |
| 930 | ret = get_futex_key(uaddr1, fshared, &key1); | 978 | ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1); |
| 931 | if (unlikely(ret != 0)) | 979 | if (unlikely(ret != 0)) |
| 932 | goto out; | 980 | goto out; |
| 933 | ret = get_futex_key(uaddr2, fshared, &key2); | 981 | ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2); |
| 934 | if (unlikely(ret != 0)) | 982 | if (unlikely(ret != 0)) |
| 935 | goto out_put_key1; | 983 | goto out_put_key1; |
| 936 | 984 | ||
| @@ -962,11 +1010,11 @@ retry_private: | |||
| 962 | if (ret) | 1010 | if (ret) |
| 963 | goto out_put_keys; | 1011 | goto out_put_keys; |
| 964 | 1012 | ||
| 965 | if (!fshared) | 1013 | if (!(flags & FLAGS_SHARED)) |
| 966 | goto retry_private; | 1014 | goto retry_private; |
| 967 | 1015 | ||
| 968 | put_futex_key(fshared, &key2); | 1016 | put_futex_key(&key2); |
| 969 | put_futex_key(fshared, &key1); | 1017 | put_futex_key(&key1); |
| 970 | goto retry; | 1018 | goto retry; |
| 971 | } | 1019 | } |
| 972 | 1020 | ||
| @@ -996,9 +1044,9 @@ retry_private: | |||
| 996 | 1044 | ||
| 997 | double_unlock_hb(hb1, hb2); | 1045 | double_unlock_hb(hb1, hb2); |
| 998 | out_put_keys: | 1046 | out_put_keys: |
| 999 | put_futex_key(fshared, &key2); | 1047 | put_futex_key(&key2); |
| 1000 | out_put_key1: | 1048 | out_put_key1: |
| 1001 | put_futex_key(fshared, &key1); | 1049 | put_futex_key(&key1); |
| 1002 | out: | 1050 | out: |
| 1003 | return ret; | 1051 | return ret; |
| 1004 | } | 1052 | } |
| @@ -1133,13 +1181,13 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, | |||
| 1133 | /** | 1181 | /** |
| 1134 | * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 | 1182 | * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 |
| 1135 | * @uaddr1: source futex user address | 1183 | * @uaddr1: source futex user address |
| 1136 | * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED | 1184 | * @flags: futex flags (FLAGS_SHARED, etc.) |
| 1137 | * @uaddr2: target futex user address | 1185 | * @uaddr2: target futex user address |
| 1138 | * @nr_wake: number of waiters to wake (must be 1 for requeue_pi) | 1186 | * @nr_wake: number of waiters to wake (must be 1 for requeue_pi) |
| 1139 | * @nr_requeue: number of waiters to requeue (0-INT_MAX) | 1187 | * @nr_requeue: number of waiters to requeue (0-INT_MAX) |
| 1140 | * @cmpval: @uaddr1 expected value (or %NULL) | 1188 | * @cmpval: @uaddr1 expected value (or %NULL) |
| 1141 | * @requeue_pi: if we are attempting to requeue from a non-pi futex to a | 1189 | * @requeue_pi: if we are attempting to requeue from a non-pi futex to a |
| 1142 | * pi futex (pi to pi requeue is not supported) | 1190 | * pi futex (pi to pi requeue is not supported) |
| 1143 | * | 1191 | * |
| 1144 | * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire | 1192 | * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire |
| 1145 | * uaddr2 atomically on behalf of the top waiter. | 1193 | * uaddr2 atomically on behalf of the top waiter. |
| @@ -1148,9 +1196,9 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, | |||
| 1148 | * >=0 - on success, the number of tasks requeued or woken | 1196 | * >=0 - on success, the number of tasks requeued or woken |
| 1149 | * <0 - on error | 1197 | * <0 - on error |
| 1150 | */ | 1198 | */ |
| 1151 | static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, | 1199 | static int futex_requeue(u32 __user *uaddr1, unsigned int flags, |
| 1152 | int nr_wake, int nr_requeue, u32 *cmpval, | 1200 | u32 __user *uaddr2, int nr_wake, int nr_requeue, |
| 1153 | int requeue_pi) | 1201 | u32 *cmpval, int requeue_pi) |
| 1154 | { | 1202 | { |
| 1155 | union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; | 1203 | union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; |
| 1156 | int drop_count = 0, task_count = 0, ret; | 1204 | int drop_count = 0, task_count = 0, ret; |
| @@ -1191,10 +1239,10 @@ retry: | |||
| 1191 | pi_state = NULL; | 1239 | pi_state = NULL; |
| 1192 | } | 1240 | } |
| 1193 | 1241 | ||
| 1194 | ret = get_futex_key(uaddr1, fshared, &key1); | 1242 | ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1); |
| 1195 | if (unlikely(ret != 0)) | 1243 | if (unlikely(ret != 0)) |
| 1196 | goto out; | 1244 | goto out; |
| 1197 | ret = get_futex_key(uaddr2, fshared, &key2); | 1245 | ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2); |
| 1198 | if (unlikely(ret != 0)) | 1246 | if (unlikely(ret != 0)) |
| 1199 | goto out_put_key1; | 1247 | goto out_put_key1; |
| 1200 | 1248 | ||
| @@ -1216,11 +1264,11 @@ retry_private: | |||
| 1216 | if (ret) | 1264 | if (ret) |
| 1217 | goto out_put_keys; | 1265 | goto out_put_keys; |
| 1218 | 1266 | ||
| 1219 | if (!fshared) | 1267 | if (!(flags & FLAGS_SHARED)) |
| 1220 | goto retry_private; | 1268 | goto retry_private; |
| 1221 | 1269 | ||
| 1222 | put_futex_key(fshared, &key2); | 1270 | put_futex_key(&key2); |
| 1223 | put_futex_key(fshared, &key1); | 1271 | put_futex_key(&key1); |
| 1224 | goto retry; | 1272 | goto retry; |
| 1225 | } | 1273 | } |
| 1226 | if (curval != *cmpval) { | 1274 | if (curval != *cmpval) { |
| @@ -1260,8 +1308,8 @@ retry_private: | |||
| 1260 | break; | 1308 | break; |
| 1261 | case -EFAULT: | 1309 | case -EFAULT: |
| 1262 | double_unlock_hb(hb1, hb2); | 1310 | double_unlock_hb(hb1, hb2); |
| 1263 | put_futex_key(fshared, &key2); | 1311 | put_futex_key(&key2); |
| 1264 | put_futex_key(fshared, &key1); | 1312 | put_futex_key(&key1); |
| 1265 | ret = fault_in_user_writeable(uaddr2); | 1313 | ret = fault_in_user_writeable(uaddr2); |
| 1266 | if (!ret) | 1314 | if (!ret) |
| 1267 | goto retry; | 1315 | goto retry; |
| @@ -1269,8 +1317,8 @@ retry_private: | |||
| 1269 | case -EAGAIN: | 1317 | case -EAGAIN: |
| 1270 | /* The owner was exiting, try again. */ | 1318 | /* The owner was exiting, try again. */ |
| 1271 | double_unlock_hb(hb1, hb2); | 1319 | double_unlock_hb(hb1, hb2); |
| 1272 | put_futex_key(fshared, &key2); | 1320 | put_futex_key(&key2); |
| 1273 | put_futex_key(fshared, &key1); | 1321 | put_futex_key(&key1); |
| 1274 | cond_resched(); | 1322 | cond_resched(); |
| 1275 | goto retry; | 1323 | goto retry; |
| 1276 | default: | 1324 | default: |
| @@ -1352,9 +1400,9 @@ out_unlock: | |||
| 1352 | drop_futex_key_refs(&key1); | 1400 | drop_futex_key_refs(&key1); |
| 1353 | 1401 | ||
| 1354 | out_put_keys: | 1402 | out_put_keys: |
| 1355 | put_futex_key(fshared, &key2); | 1403 | put_futex_key(&key2); |
| 1356 | out_put_key1: | 1404 | out_put_key1: |
| 1357 | put_futex_key(fshared, &key1); | 1405 | put_futex_key(&key1); |
| 1358 | out: | 1406 | out: |
| 1359 | if (pi_state != NULL) | 1407 | if (pi_state != NULL) |
| 1360 | free_pi_state(pi_state); | 1408 | free_pi_state(pi_state); |
| @@ -1494,7 +1542,7 @@ static void unqueue_me_pi(struct futex_q *q) | |||
| 1494 | * private futexes. | 1542 | * private futexes. |
| 1495 | */ | 1543 | */ |
| 1496 | static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, | 1544 | static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, |
| 1497 | struct task_struct *newowner, int fshared) | 1545 | struct task_struct *newowner) |
| 1498 | { | 1546 | { |
| 1499 | u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; | 1547 | u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; |
| 1500 | struct futex_pi_state *pi_state = q->pi_state; | 1548 | struct futex_pi_state *pi_state = q->pi_state; |
| @@ -1587,20 +1635,11 @@ handle_fault: | |||
| 1587 | goto retry; | 1635 | goto retry; |
| 1588 | } | 1636 | } |
| 1589 | 1637 | ||
| 1590 | /* | ||
| 1591 | * In case we must use restart_block to restart a futex_wait, | ||
| 1592 | * we encode in the 'flags' shared capability | ||
| 1593 | */ | ||
| 1594 | #define FLAGS_SHARED 0x01 | ||
| 1595 | #define FLAGS_CLOCKRT 0x02 | ||
| 1596 | #define FLAGS_HAS_TIMEOUT 0x04 | ||
| 1597 | |||
| 1598 | static long futex_wait_restart(struct restart_block *restart); | 1638 | static long futex_wait_restart(struct restart_block *restart); |
| 1599 | 1639 | ||
| 1600 | /** | 1640 | /** |
| 1601 | * fixup_owner() - Post lock pi_state and corner case management | 1641 | * fixup_owner() - Post lock pi_state and corner case management |
| 1602 | * @uaddr: user address of the futex | 1642 | * @uaddr: user address of the futex |
| 1603 | * @fshared: whether the futex is shared (1) or not (0) | ||
| 1604 | * @q: futex_q (contains pi_state and access to the rt_mutex) | 1643 | * @q: futex_q (contains pi_state and access to the rt_mutex) |
| 1605 | * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0) | 1644 | * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0) |
| 1606 | * | 1645 | * |
| @@ -1613,8 +1652,7 @@ static long futex_wait_restart(struct restart_block *restart); | |||
| 1613 | * 0 - success, lock not taken | 1652 | * 0 - success, lock not taken |
| 1614 | * <0 - on error (-EFAULT) | 1653 | * <0 - on error (-EFAULT) |
| 1615 | */ | 1654 | */ |
| 1616 | static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q, | 1655 | static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) |
| 1617 | int locked) | ||
| 1618 | { | 1656 | { |
| 1619 | struct task_struct *owner; | 1657 | struct task_struct *owner; |
| 1620 | int ret = 0; | 1658 | int ret = 0; |
| @@ -1625,7 +1663,7 @@ static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q, | |||
| 1625 | * did a lock-steal - fix up the PI-state in that case: | 1663 | * did a lock-steal - fix up the PI-state in that case: |
| 1626 | */ | 1664 | */ |
| 1627 | if (q->pi_state->owner != current) | 1665 | if (q->pi_state->owner != current) |
| 1628 | ret = fixup_pi_state_owner(uaddr, q, current, fshared); | 1666 | ret = fixup_pi_state_owner(uaddr, q, current); |
| 1629 | goto out; | 1667 | goto out; |
| 1630 | } | 1668 | } |
| 1631 | 1669 | ||
| @@ -1652,7 +1690,7 @@ static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q, | |||
| 1652 | * lock. Fix the state up. | 1690 | * lock. Fix the state up. |
| 1653 | */ | 1691 | */ |
| 1654 | owner = rt_mutex_owner(&q->pi_state->pi_mutex); | 1692 | owner = rt_mutex_owner(&q->pi_state->pi_mutex); |
| 1655 | ret = fixup_pi_state_owner(uaddr, q, owner, fshared); | 1693 | ret = fixup_pi_state_owner(uaddr, q, owner); |
| 1656 | goto out; | 1694 | goto out; |
| 1657 | } | 1695 | } |
| 1658 | 1696 | ||
| @@ -1715,7 +1753,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, | |||
| 1715 | * futex_wait_setup() - Prepare to wait on a futex | 1753 | * futex_wait_setup() - Prepare to wait on a futex |
| 1716 | * @uaddr: the futex userspace address | 1754 | * @uaddr: the futex userspace address |
| 1717 | * @val: the expected value | 1755 | * @val: the expected value |
| 1718 | * @fshared: whether the futex is shared (1) or not (0) | 1756 | * @flags: futex flags (FLAGS_SHARED, etc.) |
| 1719 | * @q: the associated futex_q | 1757 | * @q: the associated futex_q |
| 1720 | * @hb: storage for hash_bucket pointer to be returned to caller | 1758 | * @hb: storage for hash_bucket pointer to be returned to caller |
| 1721 | * | 1759 | * |
| @@ -1728,7 +1766,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, | |||
| 1728 | * 0 - uaddr contains val and hb has been locked | 1766 | * 0 - uaddr contains val and hb has been locked |
| 1729 | * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlcoked | 1767 | * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlcoked |
| 1730 | */ | 1768 | */ |
| 1731 | static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared, | 1769 | static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, |
| 1732 | struct futex_q *q, struct futex_hash_bucket **hb) | 1770 | struct futex_q *q, struct futex_hash_bucket **hb) |
| 1733 | { | 1771 | { |
| 1734 | u32 uval; | 1772 | u32 uval; |
| @@ -1752,8 +1790,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared, | |||
| 1752 | * rare, but normal. | 1790 | * rare, but normal. |
| 1753 | */ | 1791 | */ |
| 1754 | retry: | 1792 | retry: |
| 1755 | q->key = FUTEX_KEY_INIT; | 1793 | ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key); |
| 1756 | ret = get_futex_key(uaddr, fshared, &q->key); | ||
| 1757 | if (unlikely(ret != 0)) | 1794 | if (unlikely(ret != 0)) |
| 1758 | return ret; | 1795 | return ret; |
| 1759 | 1796 | ||
| @@ -1769,10 +1806,10 @@ retry_private: | |||
| 1769 | if (ret) | 1806 | if (ret) |
| 1770 | goto out; | 1807 | goto out; |
| 1771 | 1808 | ||
| 1772 | if (!fshared) | 1809 | if (!(flags & FLAGS_SHARED)) |
| 1773 | goto retry_private; | 1810 | goto retry_private; |
| 1774 | 1811 | ||
| 1775 | put_futex_key(fshared, &q->key); | 1812 | put_futex_key(&q->key); |
| 1776 | goto retry; | 1813 | goto retry; |
| 1777 | } | 1814 | } |
| 1778 | 1815 | ||
| @@ -1783,32 +1820,29 @@ retry_private: | |||
| 1783 | 1820 | ||
| 1784 | out: | 1821 | out: |
| 1785 | if (ret) | 1822 | if (ret) |
| 1786 | put_futex_key(fshared, &q->key); | 1823 | put_futex_key(&q->key); |
| 1787 | return ret; | 1824 | return ret; |
| 1788 | } | 1825 | } |
| 1789 | 1826 | ||
| 1790 | static int futex_wait(u32 __user *uaddr, int fshared, | 1827 | static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, |
| 1791 | u32 val, ktime_t *abs_time, u32 bitset, int clockrt) | 1828 | ktime_t *abs_time, u32 bitset) |
| 1792 | { | 1829 | { |
| 1793 | struct hrtimer_sleeper timeout, *to = NULL; | 1830 | struct hrtimer_sleeper timeout, *to = NULL; |
| 1794 | struct restart_block *restart; | 1831 | struct restart_block *restart; |
| 1795 | struct futex_hash_bucket *hb; | 1832 | struct futex_hash_bucket *hb; |
| 1796 | struct futex_q q; | 1833 | struct futex_q q = futex_q_init; |
| 1797 | int ret; | 1834 | int ret; |
| 1798 | 1835 | ||
| 1799 | if (!bitset) | 1836 | if (!bitset) |
| 1800 | return -EINVAL; | 1837 | return -EINVAL; |
| 1801 | |||
| 1802 | q.pi_state = NULL; | ||
| 1803 | q.bitset = bitset; | 1838 | q.bitset = bitset; |
| 1804 | q.rt_waiter = NULL; | ||
| 1805 | q.requeue_pi_key = NULL; | ||
| 1806 | 1839 | ||
| 1807 | if (abs_time) { | 1840 | if (abs_time) { |
| 1808 | to = &timeout; | 1841 | to = &timeout; |
| 1809 | 1842 | ||
| 1810 | hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME : | 1843 | hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ? |
| 1811 | CLOCK_MONOTONIC, HRTIMER_MODE_ABS); | 1844 | CLOCK_REALTIME : CLOCK_MONOTONIC, |
| 1845 | HRTIMER_MODE_ABS); | ||
| 1812 | hrtimer_init_sleeper(to, current); | 1846 | hrtimer_init_sleeper(to, current); |
| 1813 | hrtimer_set_expires_range_ns(&to->timer, *abs_time, | 1847 | hrtimer_set_expires_range_ns(&to->timer, *abs_time, |
| 1814 | current->timer_slack_ns); | 1848 | current->timer_slack_ns); |
| @@ -1819,7 +1853,7 @@ retry: | |||
| 1819 | * Prepare to wait on uaddr. On success, holds hb lock and increments | 1853 | * Prepare to wait on uaddr. On success, holds hb lock and increments |
| 1820 | * q.key refs. | 1854 | * q.key refs. |
| 1821 | */ | 1855 | */ |
| 1822 | ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); | 1856 | ret = futex_wait_setup(uaddr, val, flags, &q, &hb); |
| 1823 | if (ret) | 1857 | if (ret) |
| 1824 | goto out; | 1858 | goto out; |
| 1825 | 1859 | ||
| @@ -1852,12 +1886,7 @@ retry: | |||
| 1852 | restart->futex.val = val; | 1886 | restart->futex.val = val; |
| 1853 | restart->futex.time = abs_time->tv64; | 1887 | restart->futex.time = abs_time->tv64; |
| 1854 | restart->futex.bitset = bitset; | 1888 | restart->futex.bitset = bitset; |
| 1855 | restart->futex.flags = FLAGS_HAS_TIMEOUT; | 1889 | restart->futex.flags = flags; |
| 1856 | |||
| 1857 | if (fshared) | ||
| 1858 | restart->futex.flags |= FLAGS_SHARED; | ||
| 1859 | if (clockrt) | ||
| 1860 | restart->futex.flags |= FLAGS_CLOCKRT; | ||
| 1861 | 1890 | ||
| 1862 | ret = -ERESTART_RESTARTBLOCK; | 1891 | ret = -ERESTART_RESTARTBLOCK; |
| 1863 | 1892 | ||
| @@ -1873,7 +1902,6 @@ out: | |||
| 1873 | static long futex_wait_restart(struct restart_block *restart) | 1902 | static long futex_wait_restart(struct restart_block *restart) |
| 1874 | { | 1903 | { |
| 1875 | u32 __user *uaddr = restart->futex.uaddr; | 1904 | u32 __user *uaddr = restart->futex.uaddr; |
| 1876 | int fshared = 0; | ||
| 1877 | ktime_t t, *tp = NULL; | 1905 | ktime_t t, *tp = NULL; |
| 1878 | 1906 | ||
| 1879 | if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { | 1907 | if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { |
| @@ -1881,11 +1909,9 @@ static long futex_wait_restart(struct restart_block *restart) | |||
| 1881 | tp = &t; | 1909 | tp = &t; |
| 1882 | } | 1910 | } |
| 1883 | restart->fn = do_no_restart_syscall; | 1911 | restart->fn = do_no_restart_syscall; |
| 1884 | if (restart->futex.flags & FLAGS_SHARED) | 1912 | |
| 1885 | fshared = 1; | 1913 | return (long)futex_wait(uaddr, restart->futex.flags, |
| 1886 | return (long)futex_wait(uaddr, fshared, restart->futex.val, tp, | 1914 | restart->futex.val, tp, restart->futex.bitset); |
| 1887 | restart->futex.bitset, | ||
| 1888 | restart->futex.flags & FLAGS_CLOCKRT); | ||
| 1889 | } | 1915 | } |
| 1890 | 1916 | ||
| 1891 | 1917 | ||
| @@ -1895,12 +1921,12 @@ static long futex_wait_restart(struct restart_block *restart) | |||
| 1895 | * if there are waiters then it will block, it does PI, etc. (Due to | 1921 | * if there are waiters then it will block, it does PI, etc. (Due to |
| 1896 | * races the kernel might see a 0 value of the futex too.) | 1922 | * races the kernel might see a 0 value of the futex too.) |
| 1897 | */ | 1923 | */ |
| 1898 | static int futex_lock_pi(u32 __user *uaddr, int fshared, | 1924 | static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect, |
| 1899 | int detect, ktime_t *time, int trylock) | 1925 | ktime_t *time, int trylock) |
| 1900 | { | 1926 | { |
| 1901 | struct hrtimer_sleeper timeout, *to = NULL; | 1927 | struct hrtimer_sleeper timeout, *to = NULL; |
| 1902 | struct futex_hash_bucket *hb; | 1928 | struct futex_hash_bucket *hb; |
| 1903 | struct futex_q q; | 1929 | struct futex_q q = futex_q_init; |
| 1904 | int res, ret; | 1930 | int res, ret; |
| 1905 | 1931 | ||
| 1906 | if (refill_pi_state_cache()) | 1932 | if (refill_pi_state_cache()) |
| @@ -1914,12 +1940,8 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, | |||
| 1914 | hrtimer_set_expires(&to->timer, *time); | 1940 | hrtimer_set_expires(&to->timer, *time); |
| 1915 | } | 1941 | } |
| 1916 | 1942 | ||
| 1917 | q.pi_state = NULL; | ||
| 1918 | q.rt_waiter = NULL; | ||
| 1919 | q.requeue_pi_key = NULL; | ||
| 1920 | retry: | 1943 | retry: |
| 1921 | q.key = FUTEX_KEY_INIT; | 1944 | ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key); |
| 1922 | ret = get_futex_key(uaddr, fshared, &q.key); | ||
| 1923 | if (unlikely(ret != 0)) | 1945 | if (unlikely(ret != 0)) |
| 1924 | goto out; | 1946 | goto out; |
| 1925 | 1947 | ||
| @@ -1941,7 +1963,7 @@ retry_private: | |||
| 1941 | * exit to complete. | 1963 | * exit to complete. |
| 1942 | */ | 1964 | */ |
| 1943 | queue_unlock(&q, hb); | 1965 | queue_unlock(&q, hb); |
| 1944 | put_futex_key(fshared, &q.key); | 1966 | put_futex_key(&q.key); |
| 1945 | cond_resched(); | 1967 | cond_resched(); |
| 1946 | goto retry; | 1968 | goto retry; |
| 1947 | default: | 1969 | default: |
| @@ -1971,7 +1993,7 @@ retry_private: | |||
| 1971 | * Fixup the pi_state owner and possibly acquire the lock if we | 1993 | * Fixup the pi_state owner and possibly acquire the lock if we |
| 1972 | * haven't already. | 1994 | * haven't already. |
| 1973 | */ | 1995 | */ |
| 1974 | res = fixup_owner(uaddr, fshared, &q, !ret); | 1996 | res = fixup_owner(uaddr, &q, !ret); |
| 1975 | /* | 1997 | /* |
| 1976 | * If fixup_owner() returned an error, propagate that. If it acquired | 1998 | * If fixup_owner() returned an error, propagate that. If it acquired |
| 1977 | * the lock, clear our -ETIMEDOUT or -EINTR. | 1999 | * the lock, clear our -ETIMEDOUT or -EINTR. |
| @@ -1995,7 +2017,7 @@ out_unlock_put_key: | |||
| 1995 | queue_unlock(&q, hb); | 2017 | queue_unlock(&q, hb); |
| 1996 | 2018 | ||
| 1997 | out_put_key: | 2019 | out_put_key: |
| 1998 | put_futex_key(fshared, &q.key); | 2020 | put_futex_key(&q.key); |
| 1999 | out: | 2021 | out: |
| 2000 | if (to) | 2022 | if (to) |
| 2001 | destroy_hrtimer_on_stack(&to->timer); | 2023 | destroy_hrtimer_on_stack(&to->timer); |
| @@ -2008,10 +2030,10 @@ uaddr_faulted: | |||
| 2008 | if (ret) | 2030 | if (ret) |
| 2009 | goto out_put_key; | 2031 | goto out_put_key; |
| 2010 | 2032 | ||
| 2011 | if (!fshared) | 2033 | if (!(flags & FLAGS_SHARED)) |
| 2012 | goto retry_private; | 2034 | goto retry_private; |
| 2013 | 2035 | ||
| 2014 | put_futex_key(fshared, &q.key); | 2036 | put_futex_key(&q.key); |
| 2015 | goto retry; | 2037 | goto retry; |
| 2016 | } | 2038 | } |
| 2017 | 2039 | ||
| @@ -2020,7 +2042,7 @@ uaddr_faulted: | |||
| 2020 | * This is the in-kernel slowpath: we look up the PI state (if any), | 2042 | * This is the in-kernel slowpath: we look up the PI state (if any), |
| 2021 | * and do the rt-mutex unlock. | 2043 | * and do the rt-mutex unlock. |
| 2022 | */ | 2044 | */ |
| 2023 | static int futex_unlock_pi(u32 __user *uaddr, int fshared) | 2045 | static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) |
| 2024 | { | 2046 | { |
| 2025 | struct futex_hash_bucket *hb; | 2047 | struct futex_hash_bucket *hb; |
| 2026 | struct futex_q *this, *next; | 2048 | struct futex_q *this, *next; |
| @@ -2038,7 +2060,7 @@ retry: | |||
| 2038 | if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) | 2060 | if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) |
| 2039 | return -EPERM; | 2061 | return -EPERM; |
| 2040 | 2062 | ||
| 2041 | ret = get_futex_key(uaddr, fshared, &key); | 2063 | ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key); |
| 2042 | if (unlikely(ret != 0)) | 2064 | if (unlikely(ret != 0)) |
| 2043 | goto out; | 2065 | goto out; |
| 2044 | 2066 | ||
| @@ -2093,14 +2115,14 @@ retry: | |||
| 2093 | 2115 | ||
| 2094 | out_unlock: | 2116 | out_unlock: |
| 2095 | spin_unlock(&hb->lock); | 2117 | spin_unlock(&hb->lock); |
| 2096 | put_futex_key(fshared, &key); | 2118 | put_futex_key(&key); |
| 2097 | 2119 | ||
| 2098 | out: | 2120 | out: |
| 2099 | return ret; | 2121 | return ret; |
| 2100 | 2122 | ||
| 2101 | pi_faulted: | 2123 | pi_faulted: |
| 2102 | spin_unlock(&hb->lock); | 2124 | spin_unlock(&hb->lock); |
| 2103 | put_futex_key(fshared, &key); | 2125 | put_futex_key(&key); |
| 2104 | 2126 | ||
| 2105 | ret = fault_in_user_writeable(uaddr); | 2127 | ret = fault_in_user_writeable(uaddr); |
| 2106 | if (!ret) | 2128 | if (!ret) |
| @@ -2160,7 +2182,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, | |||
| 2160 | /** | 2182 | /** |
| 2161 | * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 | 2183 | * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 |
| 2162 | * @uaddr: the futex we initially wait on (non-pi) | 2184 | * @uaddr: the futex we initially wait on (non-pi) |
| 2163 | * @fshared: whether the futexes are shared (1) or not (0). They must be | 2185 | * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be |
| 2164 | * the same type, no requeueing from private to shared, etc. | 2186 | * the same type, no requeueing from private to shared, etc. |
| 2165 | * @val: the expected value of uaddr | 2187 | * @val: the expected value of uaddr |
| 2166 | * @abs_time: absolute timeout | 2188 | * @abs_time: absolute timeout |
| @@ -2198,16 +2220,16 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, | |||
| 2198 | * 0 - On success | 2220 | * 0 - On success |
| 2199 | * <0 - On error | 2221 | * <0 - On error |
| 2200 | */ | 2222 | */ |
| 2201 | static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, | 2223 | static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, |
| 2202 | u32 val, ktime_t *abs_time, u32 bitset, | 2224 | u32 val, ktime_t *abs_time, u32 bitset, |
| 2203 | int clockrt, u32 __user *uaddr2) | 2225 | u32 __user *uaddr2) |
| 2204 | { | 2226 | { |
| 2205 | struct hrtimer_sleeper timeout, *to = NULL; | 2227 | struct hrtimer_sleeper timeout, *to = NULL; |
| 2206 | struct rt_mutex_waiter rt_waiter; | 2228 | struct rt_mutex_waiter rt_waiter; |
| 2207 | struct rt_mutex *pi_mutex = NULL; | 2229 | struct rt_mutex *pi_mutex = NULL; |
| 2208 | struct futex_hash_bucket *hb; | 2230 | struct futex_hash_bucket *hb; |
| 2209 | union futex_key key2; | 2231 | union futex_key key2 = FUTEX_KEY_INIT; |
| 2210 | struct futex_q q; | 2232 | struct futex_q q = futex_q_init; |
| 2211 | int res, ret; | 2233 | int res, ret; |
| 2212 | 2234 | ||
| 2213 | if (!bitset) | 2235 | if (!bitset) |
| @@ -2215,8 +2237,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, | |||
| 2215 | 2237 | ||
| 2216 | if (abs_time) { | 2238 | if (abs_time) { |
| 2217 | to = &timeout; | 2239 | to = &timeout; |
| 2218 | hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME : | 2240 | hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ? |
| 2219 | CLOCK_MONOTONIC, HRTIMER_MODE_ABS); | 2241 | CLOCK_REALTIME : CLOCK_MONOTONIC, |
| 2242 | HRTIMER_MODE_ABS); | ||
| 2220 | hrtimer_init_sleeper(to, current); | 2243 | hrtimer_init_sleeper(to, current); |
| 2221 | hrtimer_set_expires_range_ns(&to->timer, *abs_time, | 2244 | hrtimer_set_expires_range_ns(&to->timer, *abs_time, |
| 2222 | current->timer_slack_ns); | 2245 | current->timer_slack_ns); |
| @@ -2229,12 +2252,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, | |||
| 2229 | debug_rt_mutex_init_waiter(&rt_waiter); | 2252 | debug_rt_mutex_init_waiter(&rt_waiter); |
| 2230 | rt_waiter.task = NULL; | 2253 | rt_waiter.task = NULL; |
| 2231 | 2254 | ||
| 2232 | key2 = FUTEX_KEY_INIT; | 2255 | ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2); |
| 2233 | ret = get_futex_key(uaddr2, fshared, &key2); | ||
| 2234 | if (unlikely(ret != 0)) | 2256 | if (unlikely(ret != 0)) |
| 2235 | goto out; | 2257 | goto out; |
| 2236 | 2258 | ||
| 2237 | q.pi_state = NULL; | ||
| 2238 | q.bitset = bitset; | 2259 | q.bitset = bitset; |
| 2239 | q.rt_waiter = &rt_waiter; | 2260 | q.rt_waiter = &rt_waiter; |
| 2240 | q.requeue_pi_key = &key2; | 2261 | q.requeue_pi_key = &key2; |
| @@ -2243,7 +2264,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, | |||
| 2243 | * Prepare to wait on uaddr. On success, increments q.key (key1) ref | 2264 | * Prepare to wait on uaddr. On success, increments q.key (key1) ref |
| 2244 | * count. | 2265 | * count. |
| 2245 | */ | 2266 | */ |
| 2246 | ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); | 2267 | ret = futex_wait_setup(uaddr, val, flags, &q, &hb); |
| 2247 | if (ret) | 2268 | if (ret) |
| 2248 | goto out_key2; | 2269 | goto out_key2; |
| 2249 | 2270 | ||
| @@ -2273,8 +2294,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, | |||
| 2273 | */ | 2294 | */ |
| 2274 | if (q.pi_state && (q.pi_state->owner != current)) { | 2295 | if (q.pi_state && (q.pi_state->owner != current)) { |
| 2275 | spin_lock(q.lock_ptr); | 2296 | spin_lock(q.lock_ptr); |
| 2276 | ret = fixup_pi_state_owner(uaddr2, &q, current, | 2297 | ret = fixup_pi_state_owner(uaddr2, &q, current); |
| 2277 | fshared); | ||
| 2278 | spin_unlock(q.lock_ptr); | 2298 | spin_unlock(q.lock_ptr); |
| 2279 | } | 2299 | } |
| 2280 | } else { | 2300 | } else { |
| @@ -2293,7 +2313,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, | |||
| 2293 | * Fixup the pi_state owner and possibly acquire the lock if we | 2313 | * Fixup the pi_state owner and possibly acquire the lock if we |
| 2294 | * haven't already. | 2314 | * haven't already. |
| 2295 | */ | 2315 | */ |
| 2296 | res = fixup_owner(uaddr2, fshared, &q, !ret); | 2316 | res = fixup_owner(uaddr2, &q, !ret); |
| 2297 | /* | 2317 | /* |
| 2298 | * If fixup_owner() returned an error, propagate that. If it | 2318 | * If fixup_owner() returned an error, propagate that. If it |
| 2299 | * acquired the lock, clear -ETIMEDOUT or -EINTR. | 2319 | * acquired the lock, clear -ETIMEDOUT or -EINTR. |
| @@ -2324,9 +2344,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, | |||
| 2324 | } | 2344 | } |
| 2325 | 2345 | ||
| 2326 | out_put_keys: | 2346 | out_put_keys: |
| 2327 | put_futex_key(fshared, &q.key); | 2347 | put_futex_key(&q.key); |
| 2328 | out_key2: | 2348 | out_key2: |
| 2329 | put_futex_key(fshared, &key2); | 2349 | put_futex_key(&key2); |
| 2330 | 2350 | ||
| 2331 | out: | 2351 | out: |
| 2332 | if (to) { | 2352 | if (to) { |
| @@ -2489,7 +2509,8 @@ void exit_robust_list(struct task_struct *curr) | |||
| 2489 | { | 2509 | { |
| 2490 | struct robust_list_head __user *head = curr->robust_list; | 2510 | struct robust_list_head __user *head = curr->robust_list; |
| 2491 | struct robust_list __user *entry, *next_entry, *pending; | 2511 | struct robust_list __user *entry, *next_entry, *pending; |
| 2492 | unsigned int limit = ROBUST_LIST_LIMIT, pi, next_pi, pip; | 2512 | unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; |
| 2513 | unsigned int uninitialized_var(next_pi); | ||
| 2493 | unsigned long futex_offset; | 2514 | unsigned long futex_offset; |
| 2494 | int rc; | 2515 | int rc; |
| 2495 | 2516 | ||
| @@ -2550,58 +2571,57 @@ void exit_robust_list(struct task_struct *curr) | |||
| 2550 | long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, | 2571 | long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, |
| 2551 | u32 __user *uaddr2, u32 val2, u32 val3) | 2572 | u32 __user *uaddr2, u32 val2, u32 val3) |
| 2552 | { | 2573 | { |
| 2553 | int clockrt, ret = -ENOSYS; | 2574 | int ret = -ENOSYS, cmd = op & FUTEX_CMD_MASK; |
| 2554 | int cmd = op & FUTEX_CMD_MASK; | 2575 | unsigned int flags = 0; |
| 2555 | int fshared = 0; | ||
| 2556 | 2576 | ||
| 2557 | if (!(op & FUTEX_PRIVATE_FLAG)) | 2577 | if (!(op & FUTEX_PRIVATE_FLAG)) |
| 2558 | fshared = 1; | 2578 | flags |= FLAGS_SHARED; |
| 2559 | 2579 | ||
| 2560 | clockrt = op & FUTEX_CLOCK_REALTIME; | 2580 | if (op & FUTEX_CLOCK_REALTIME) { |
| 2561 | if (clockrt && cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI) | 2581 | flags |= FLAGS_CLOCKRT; |
| 2562 | return -ENOSYS; | 2582 | if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI) |
| 2583 | return -ENOSYS; | ||
| 2584 | } | ||
| 2563 | 2585 | ||
| 2564 | switch (cmd) { | 2586 | switch (cmd) { |
| 2565 | case FUTEX_WAIT: | 2587 | case FUTEX_WAIT: |
| 2566 | val3 = FUTEX_BITSET_MATCH_ANY; | 2588 | val3 = FUTEX_BITSET_MATCH_ANY; |
| 2567 | case FUTEX_WAIT_BITSET: | 2589 | case FUTEX_WAIT_BITSET: |
| 2568 | ret = futex_wait(uaddr, fshared, val, timeout, val3, clockrt); | 2590 | ret = futex_wait(uaddr, flags, val, timeout, val3); |
| 2569 | break; | 2591 | break; |
| 2570 | case FUTEX_WAKE: | 2592 | case FUTEX_WAKE: |
| 2571 | val3 = FUTEX_BITSET_MATCH_ANY; | 2593 | val3 = FUTEX_BITSET_MATCH_ANY; |
| 2572 | case FUTEX_WAKE_BITSET: | 2594 | case FUTEX_WAKE_BITSET: |
| 2573 | ret = futex_wake(uaddr, fshared, val, val3); | 2595 | ret = futex_wake(uaddr, flags, val, val3); |
| 2574 | break; | 2596 | break; |
| 2575 | case FUTEX_REQUEUE: | 2597 | case FUTEX_REQUEUE: |
| 2576 | ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 0); | 2598 | ret = futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0); |
| 2577 | break; | 2599 | break; |
| 2578 | case FUTEX_CMP_REQUEUE: | 2600 | case FUTEX_CMP_REQUEUE: |
| 2579 | ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, | 2601 | ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0); |
| 2580 | 0); | ||
| 2581 | break; | 2602 | break; |
| 2582 | case FUTEX_WAKE_OP: | 2603 | case FUTEX_WAKE_OP: |
| 2583 | ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3); | 2604 | ret = futex_wake_op(uaddr, flags, uaddr2, val, val2, val3); |
| 2584 | break; | 2605 | break; |
| 2585 | case FUTEX_LOCK_PI: | 2606 | case FUTEX_LOCK_PI: |
| 2586 | if (futex_cmpxchg_enabled) | 2607 | if (futex_cmpxchg_enabled) |
| 2587 | ret = futex_lock_pi(uaddr, fshared, val, timeout, 0); | 2608 | ret = futex_lock_pi(uaddr, flags, val, timeout, 0); |
| 2588 | break; | 2609 | break; |
| 2589 | case FUTEX_UNLOCK_PI: | 2610 | case FUTEX_UNLOCK_PI: |
| 2590 | if (futex_cmpxchg_enabled) | 2611 | if (futex_cmpxchg_enabled) |
| 2591 | ret = futex_unlock_pi(uaddr, fshared); | 2612 | ret = futex_unlock_pi(uaddr, flags); |
| 2592 | break; | 2613 | break; |
| 2593 | case FUTEX_TRYLOCK_PI: | 2614 | case FUTEX_TRYLOCK_PI: |
| 2594 | if (futex_cmpxchg_enabled) | 2615 | if (futex_cmpxchg_enabled) |
| 2595 | ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1); | 2616 | ret = futex_lock_pi(uaddr, flags, 0, timeout, 1); |
| 2596 | break; | 2617 | break; |
| 2597 | case FUTEX_WAIT_REQUEUE_PI: | 2618 | case FUTEX_WAIT_REQUEUE_PI: |
| 2598 | val3 = FUTEX_BITSET_MATCH_ANY; | 2619 | val3 = FUTEX_BITSET_MATCH_ANY; |
| 2599 | ret = futex_wait_requeue_pi(uaddr, fshared, val, timeout, val3, | 2620 | ret = futex_wait_requeue_pi(uaddr, flags, val, timeout, val3, |
| 2600 | clockrt, uaddr2); | 2621 | uaddr2); |
| 2601 | break; | 2622 | break; |
| 2602 | case FUTEX_CMP_REQUEUE_PI: | 2623 | case FUTEX_CMP_REQUEUE_PI: |
| 2603 | ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, | 2624 | ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1); |
| 2604 | 1); | ||
| 2605 | break; | 2625 | break; |
| 2606 | default: | 2626 | default: |
| 2607 | ret = -ENOSYS; | 2627 | ret = -ENOSYS; |
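The futex.c hunks above fold the separate fshared/clockrt ints into one flags word that do_futex() derives once from the syscall op; every helper (and the restart block) then carries it unchanged. A minimal sketch of that translation follows; the FLAGS_* bit values here are assumptions made for illustration, only the shape of the mapping mirrors the patch.

        #include <linux/futex.h>        /* FUTEX_PRIVATE_FLAG, FUTEX_CLOCK_REALTIME */

        #define FLAGS_SHARED            0x01    /* assumed values, illustration only */
        #define FLAGS_CLOCKRT           0x02
        #define FLAGS_HAS_TIMEOUT       0x04

        /* Map the futex op bits onto the internal flags word, as do_futex()
         * now does in one place instead of threading two ints everywhere. */
        static unsigned int futex_op_to_flags(int op)
        {
                unsigned int flags = 0;

                if (!(op & FUTEX_PRIVATE_FLAG))
                        flags |= FLAGS_SHARED;  /* key lookup takes the shared path */
                if (op & FUTEX_CLOCK_REALTIME)
                        flags |= FLAGS_CLOCKRT; /* absolute CLOCK_REALTIME timeout */
                return flags;
        }

One visible payoff in futex_wait(): the restart block can store the flags word verbatim instead of reassembling it from individual booleans on the restart path.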
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 06da4dfc339..a7934ac75e5 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c | |||
| @@ -49,7 +49,8 @@ void compat_exit_robust_list(struct task_struct *curr) | |||
| 49 | { | 49 | { |
| 50 | struct compat_robust_list_head __user *head = curr->compat_robust_list; | 50 | struct compat_robust_list_head __user *head = curr->compat_robust_list; |
| 51 | struct robust_list __user *entry, *next_entry, *pending; | 51 | struct robust_list __user *entry, *next_entry, *pending; |
| 52 | unsigned int limit = ROBUST_LIST_LIMIT, pi, next_pi, pip; | 52 | unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; |
| 53 | unsigned int uninitialized_var(next_pi); | ||
| 53 | compat_uptr_t uentry, next_uentry, upending; | 54 | compat_uptr_t uentry, next_uentry, upending; |
| 54 | compat_long_t futex_offset; | 55 | compat_long_t futex_offset; |
| 55 | int rc; | 56 | int rc; |
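Both robust-list walkers (exit_robust_list() above and the compat variant here) now wrap next_pi in uninitialized_var(). As a rough sketch of what that buys: the gcc-era definition was essentially a self-assignment, which silences a false "may be used uninitialized" warning without emitting a real initialization (definition below is an approximation, not quoted from this tree).

        /* Approximation of the kernel's gcc variant of the macro at the time. */
        #define uninitialized_var(x) x = x

        static void walker_example(void)
        {
                unsigned int uninitialized_var(next_pi);
                /* expands to: unsigned int next_pi = next_pi; */
                (void)next_pi;
        }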
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 72206cf5c6c..0c8d7c04861 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c | |||
| @@ -497,7 +497,7 @@ static inline int hrtimer_is_hres_enabled(void) | |||
| 497 | */ | 497 | */ |
| 498 | static inline int hrtimer_hres_active(void) | 498 | static inline int hrtimer_hres_active(void) |
| 499 | { | 499 | { |
| 500 | return __get_cpu_var(hrtimer_bases).hres_active; | 500 | return __this_cpu_read(hrtimer_bases.hres_active); |
| 501 | } | 501 | } |
| 502 | 502 | ||
| 503 | /* | 503 | /* |
| @@ -516,10 +516,13 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) | |||
| 516 | 516 | ||
| 517 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { | 517 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { |
| 518 | struct hrtimer *timer; | 518 | struct hrtimer *timer; |
| 519 | struct timerqueue_node *next; | ||
| 519 | 520 | ||
| 520 | if (!base->first) | 521 | next = timerqueue_getnext(&base->active); |
| 522 | if (!next) | ||
| 521 | continue; | 523 | continue; |
| 522 | timer = rb_entry(base->first, struct hrtimer, node); | 524 | timer = container_of(next, struct hrtimer, node); |
| 525 | |||
| 523 | expires = ktime_sub(hrtimer_get_expires(timer), base->offset); | 526 | expires = ktime_sub(hrtimer_get_expires(timer), base->offset); |
| 524 | /* | 527 | /* |
| 525 | * clock_was_set() has changed base->offset so the | 528 | * clock_was_set() has changed base->offset so the |
| @@ -840,48 +843,17 @@ EXPORT_SYMBOL_GPL(hrtimer_forward); | |||
| 840 | static int enqueue_hrtimer(struct hrtimer *timer, | 843 | static int enqueue_hrtimer(struct hrtimer *timer, |
| 841 | struct hrtimer_clock_base *base) | 844 | struct hrtimer_clock_base *base) |
| 842 | { | 845 | { |
| 843 | struct rb_node **link = &base->active.rb_node; | ||
| 844 | struct rb_node *parent = NULL; | ||
| 845 | struct hrtimer *entry; | ||
| 846 | int leftmost = 1; | ||
| 847 | |||
| 848 | debug_activate(timer); | 846 | debug_activate(timer); |
| 849 | 847 | ||
| 850 | /* | 848 | timerqueue_add(&base->active, &timer->node); |
| 851 | * Find the right place in the rbtree: | ||
| 852 | */ | ||
| 853 | while (*link) { | ||
| 854 | parent = *link; | ||
| 855 | entry = rb_entry(parent, struct hrtimer, node); | ||
| 856 | /* | ||
| 857 | * We dont care about collisions. Nodes with | ||
| 858 | * the same expiry time stay together. | ||
| 859 | */ | ||
| 860 | if (hrtimer_get_expires_tv64(timer) < | ||
| 861 | hrtimer_get_expires_tv64(entry)) { | ||
| 862 | link = &(*link)->rb_left; | ||
| 863 | } else { | ||
| 864 | link = &(*link)->rb_right; | ||
| 865 | leftmost = 0; | ||
| 866 | } | ||
| 867 | } | ||
| 868 | |||
| 869 | /* | ||
| 870 | * Insert the timer to the rbtree and check whether it | ||
| 871 | * replaces the first pending timer | ||
| 872 | */ | ||
| 873 | if (leftmost) | ||
| 874 | base->first = &timer->node; | ||
| 875 | 849 | ||
| 876 | rb_link_node(&timer->node, parent, link); | ||
| 877 | rb_insert_color(&timer->node, &base->active); | ||
| 878 | /* | 850 | /* |
| 879 | * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the | 851 | * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the |
| 880 | * state of a possibly running callback. | 852 | * state of a possibly running callback. |
| 881 | */ | 853 | */ |
| 882 | timer->state |= HRTIMER_STATE_ENQUEUED; | 854 | timer->state |= HRTIMER_STATE_ENQUEUED; |
| 883 | 855 | ||
| 884 | return leftmost; | 856 | return (&timer->node == base->active.next); |
| 885 | } | 857 | } |
| 886 | 858 | ||
| 887 | /* | 859 | /* |
| @@ -901,12 +873,7 @@ static void __remove_hrtimer(struct hrtimer *timer, | |||
| 901 | if (!(timer->state & HRTIMER_STATE_ENQUEUED)) | 873 | if (!(timer->state & HRTIMER_STATE_ENQUEUED)) |
| 902 | goto out; | 874 | goto out; |
| 903 | 875 | ||
| 904 | /* | 876 | if (&timer->node == timerqueue_getnext(&base->active)) { |
| 905 | * Remove the timer from the rbtree and replace the first | ||
| 906 | * entry pointer if necessary. | ||
| 907 | */ | ||
| 908 | if (base->first == &timer->node) { | ||
| 909 | base->first = rb_next(&timer->node); | ||
| 910 | #ifdef CONFIG_HIGH_RES_TIMERS | 877 | #ifdef CONFIG_HIGH_RES_TIMERS |
| 911 | /* Reprogram the clock event device. if enabled */ | 878 | /* Reprogram the clock event device. if enabled */ |
| 912 | if (reprogram && hrtimer_hres_active()) { | 879 | if (reprogram && hrtimer_hres_active()) { |
| @@ -919,7 +886,7 @@ static void __remove_hrtimer(struct hrtimer *timer, | |||
| 919 | } | 886 | } |
| 920 | #endif | 887 | #endif |
| 921 | } | 888 | } |
| 922 | rb_erase(&timer->node, &base->active); | 889 | timerqueue_del(&base->active, &timer->node); |
| 923 | out: | 890 | out: |
| 924 | timer->state = newstate; | 891 | timer->state = newstate; |
| 925 | } | 892 | } |
| @@ -1128,11 +1095,13 @@ ktime_t hrtimer_get_next_event(void) | |||
| 1128 | if (!hrtimer_hres_active()) { | 1095 | if (!hrtimer_hres_active()) { |
| 1129 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { | 1096 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { |
| 1130 | struct hrtimer *timer; | 1097 | struct hrtimer *timer; |
| 1098 | struct timerqueue_node *next; | ||
| 1131 | 1099 | ||
| 1132 | if (!base->first) | 1100 | next = timerqueue_getnext(&base->active); |
| 1101 | if (!next) | ||
| 1133 | continue; | 1102 | continue; |
| 1134 | 1103 | ||
| 1135 | timer = rb_entry(base->first, struct hrtimer, node); | 1104 | timer = container_of(next, struct hrtimer, node); |
| 1136 | delta.tv64 = hrtimer_get_expires_tv64(timer); | 1105 | delta.tv64 = hrtimer_get_expires_tv64(timer); |
| 1137 | delta = ktime_sub(delta, base->get_time()); | 1106 | delta = ktime_sub(delta, base->get_time()); |
| 1138 | if (delta.tv64 < mindelta.tv64) | 1107 | if (delta.tv64 < mindelta.tv64) |
| @@ -1162,6 +1131,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, | |||
| 1162 | 1131 | ||
| 1163 | timer->base = &cpu_base->clock_base[clock_id]; | 1132 | timer->base = &cpu_base->clock_base[clock_id]; |
| 1164 | hrtimer_init_timer_hres(timer); | 1133 | hrtimer_init_timer_hres(timer); |
| 1134 | timerqueue_init(&timer->node); | ||
| 1165 | 1135 | ||
| 1166 | #ifdef CONFIG_TIMER_STATS | 1136 | #ifdef CONFIG_TIMER_STATS |
| 1167 | timer->start_site = NULL; | 1137 | timer->start_site = NULL; |
| @@ -1278,14 +1248,14 @@ retry: | |||
| 1278 | 1248 | ||
| 1279 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { | 1249 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { |
| 1280 | ktime_t basenow; | 1250 | ktime_t basenow; |
| 1281 | struct rb_node *node; | 1251 | struct timerqueue_node *node; |
| 1282 | 1252 | ||
| 1283 | basenow = ktime_add(now, base->offset); | 1253 | basenow = ktime_add(now, base->offset); |
| 1284 | 1254 | ||
| 1285 | while ((node = base->first)) { | 1255 | while ((node = timerqueue_getnext(&base->active))) { |
| 1286 | struct hrtimer *timer; | 1256 | struct hrtimer *timer; |
| 1287 | 1257 | ||
| 1288 | timer = rb_entry(node, struct hrtimer, node); | 1258 | timer = container_of(node, struct hrtimer, node); |
| 1289 | 1259 | ||
| 1290 | /* | 1260 | /* |
| 1291 | * The immediate goal for using the softexpires is | 1261 | * The immediate goal for using the softexpires is |
| @@ -1441,7 +1411,7 @@ void hrtimer_run_pending(void) | |||
| 1441 | */ | 1411 | */ |
| 1442 | void hrtimer_run_queues(void) | 1412 | void hrtimer_run_queues(void) |
| 1443 | { | 1413 | { |
| 1444 | struct rb_node *node; | 1414 | struct timerqueue_node *node; |
| 1445 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); | 1415 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); |
| 1446 | struct hrtimer_clock_base *base; | 1416 | struct hrtimer_clock_base *base; |
| 1447 | int index, gettime = 1; | 1417 | int index, gettime = 1; |
| @@ -1451,8 +1421,7 @@ void hrtimer_run_queues(void) | |||
| 1451 | 1421 | ||
| 1452 | for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) { | 1422 | for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) { |
| 1453 | base = &cpu_base->clock_base[index]; | 1423 | base = &cpu_base->clock_base[index]; |
| 1454 | 1424 | if (!timerqueue_getnext(&base->active)) | |
| 1455 | if (!base->first) | ||
| 1456 | continue; | 1425 | continue; |
| 1457 | 1426 | ||
| 1458 | if (gettime) { | 1427 | if (gettime) { |
| @@ -1462,10 +1431,10 @@ void hrtimer_run_queues(void) | |||
| 1462 | 1431 | ||
| 1463 | raw_spin_lock(&cpu_base->lock); | 1432 | raw_spin_lock(&cpu_base->lock); |
| 1464 | 1433 | ||
| 1465 | while ((node = base->first)) { | 1434 | while ((node = timerqueue_getnext(&base->active))) { |
| 1466 | struct hrtimer *timer; | 1435 | struct hrtimer *timer; |
| 1467 | 1436 | ||
| 1468 | timer = rb_entry(node, struct hrtimer, node); | 1437 | timer = container_of(node, struct hrtimer, node); |
| 1469 | if (base->softirq_time.tv64 <= | 1438 | if (base->softirq_time.tv64 <= |
| 1470 | hrtimer_get_expires_tv64(timer)) | 1439 | hrtimer_get_expires_tv64(timer)) |
| 1471 | break; | 1440 | break; |
| @@ -1630,8 +1599,10 @@ static void __cpuinit init_hrtimers_cpu(int cpu) | |||
| 1630 | 1599 | ||
| 1631 | raw_spin_lock_init(&cpu_base->lock); | 1600 | raw_spin_lock_init(&cpu_base->lock); |
| 1632 | 1601 | ||
| 1633 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) | 1602 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { |
| 1634 | cpu_base->clock_base[i].cpu_base = cpu_base; | 1603 | cpu_base->clock_base[i].cpu_base = cpu_base; |
| 1604 | timerqueue_init_head(&cpu_base->clock_base[i].active); | ||
| 1605 | } | ||
| 1635 | 1606 | ||
| 1636 | hrtimer_init_hres(cpu_base); | 1607 | hrtimer_init_hres(cpu_base); |
| 1637 | } | 1608 | } |
| @@ -1642,10 +1613,10 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, | |||
| 1642 | struct hrtimer_clock_base *new_base) | 1613 | struct hrtimer_clock_base *new_base) |
| 1643 | { | 1614 | { |
| 1644 | struct hrtimer *timer; | 1615 | struct hrtimer *timer; |
| 1645 | struct rb_node *node; | 1616 | struct timerqueue_node *node; |
| 1646 | 1617 | ||
| 1647 | while ((node = rb_first(&old_base->active))) { | 1618 | while ((node = timerqueue_getnext(&old_base->active))) { |
| 1648 | timer = rb_entry(node, struct hrtimer, node); | 1619 | timer = container_of(node, struct hrtimer, node); |
| 1649 | BUG_ON(hrtimer_callback_running(timer)); | 1620 | BUG_ON(hrtimer_callback_running(timer)); |
| 1650 | debug_deactivate(timer); | 1621 | debug_deactivate(timer); |
| 1651 | 1622 | ||
| @@ -1774,7 +1745,7 @@ schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta, | |||
| 1774 | } | 1745 | } |
| 1775 | 1746 | ||
| 1776 | /* | 1747 | /* |
| 1777 | * A NULL parameter means "inifinte" | 1748 | * A NULL parameter means "infinite" |
| 1778 | */ | 1749 | */ |
| 1779 | if (!expires) { | 1750 | if (!expires) { |
| 1780 | schedule(); | 1751 | schedule(); |
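The hrtimer.c conversion above swaps the open-coded rbtree insertion/removal and the cached base->first pointer for the timerqueue helpers, so every "find the earliest timer" site becomes timerqueue_getnext() plus container_of(). A standalone sketch of the pattern, assuming the <linux/timerqueue.h> API introduced alongside this series and an illustrative my_timer type:

        #include <linux/kernel.h>
        #include <linux/timerqueue.h>

        struct my_timer {
                struct timerqueue_node node;    /* node.expires is the sort key */
                void (*fn)(struct my_timer *);
        };

        static struct timerqueue_head active;

        static void my_timers_init(void)
        {
                timerqueue_init_head(&active);  /* like timerqueue_init_head(&base->active) */
        }

        static void my_timer_enqueue(struct my_timer *t, ktime_t expires)
        {
                timerqueue_init(&t->node);
                t->node.expires = expires;
                timerqueue_add(&active, &t->node);      /* rbtree insert + leftmost cache */
        }

        static struct my_timer *my_timer_first(void)
        {
                struct timerqueue_node *next = timerqueue_getnext(&active);

                /* Earliest-expiring entry or NULL: replaces base->first + rb_entry(). */
                return next ? container_of(next, struct my_timer, node) : NULL;
        }

        static void my_timer_dequeue(struct my_timer *t)
        {
                timerqueue_del(&active, &t->node);
        }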
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index 2c9120f0afc..086adf25a55 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c | |||
| @@ -620,7 +620,7 @@ static struct pmu perf_breakpoint = { | |||
| 620 | .read = hw_breakpoint_pmu_read, | 620 | .read = hw_breakpoint_pmu_read, |
| 621 | }; | 621 | }; |
| 622 | 622 | ||
| 623 | static int __init init_hw_breakpoint(void) | 623 | int __init init_hw_breakpoint(void) |
| 624 | { | 624 | { |
| 625 | unsigned int **task_bp_pinned; | 625 | unsigned int **task_bp_pinned; |
| 626 | int cpu, err_cpu; | 626 | int cpu, err_cpu; |
| @@ -641,7 +641,7 @@ static int __init init_hw_breakpoint(void) | |||
| 641 | 641 | ||
| 642 | constraints_initialized = 1; | 642 | constraints_initialized = 1; |
| 643 | 643 | ||
| 644 | perf_pmu_register(&perf_breakpoint); | 644 | perf_pmu_register(&perf_breakpoint, "breakpoint", PERF_TYPE_BREAKPOINT); |
| 645 | 645 | ||
| 646 | return register_die_notifier(&hw_breakpoint_exceptions_nb); | 646 | return register_die_notifier(&hw_breakpoint_exceptions_nb); |
| 647 | 647 | ||
| @@ -655,6 +655,5 @@ static int __init init_hw_breakpoint(void) | |||
| 655 | 655 | ||
| 656 | return -ENOMEM; | 656 | return -ENOMEM; |
| 657 | } | 657 | } |
| 658 | core_initcall(init_hw_breakpoint); | ||
| 659 | 658 | ||
| 660 | 659 | ||
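init_hw_breakpoint() above loses its core_initcall() and becomes non-static, presumably so it can be invoked explicitly during perf initialisation, and the registration now passes a name and type to perf_pmu_register(). A hedged sketch of registering a pmu against that extended signature; the callbacks are empty placeholders, and the "negative type requests a dynamically allocated one" behaviour is assumed from the same patch series rather than shown here.

        #include <linux/errno.h>
        #include <linux/init.h>
        #include <linux/perf_event.h>

        static int  my_event_init(struct perf_event *event)        { return -ENOENT; }
        static int  my_add(struct perf_event *event, int flags)    { return 0; }
        static void my_del(struct perf_event *event, int flags)    { }
        static void my_start(struct perf_event *event, int flags)  { }
        static void my_stop(struct perf_event *event, int flags)   { }
        static void my_read(struct perf_event *event)              { }

        static struct pmu my_pmu = {
                .event_init     = my_event_init,
                .add            = my_add,
                .del            = my_del,
                .start          = my_start,
                .stop           = my_stop,
                .read           = my_read,
        };

        static int __init my_pmu_init(void)
        {
                /* A fixed type (PERF_TYPE_BREAKPOINT above) claims a well-known
                 * slot; a negative type is assumed to ask for a dynamic one. */
                return perf_pmu_register(&my_pmu, "my_pmu", -1);
        }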
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 31d766bf5d2..8e42fec7686 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig | |||
| @@ -9,9 +9,6 @@ menu "IRQ subsystem" | |||
| 9 | config GENERIC_HARDIRQS | 9 | config GENERIC_HARDIRQS |
| 10 | def_bool y | 10 | def_bool y |
| 11 | 11 | ||
| 12 | config GENERIC_HARDIRQS_NO__DO_IRQ | ||
| 13 | def_bool y | ||
| 14 | |||
| 15 | # Select this to disable the deprecated stuff | 12 | # Select this to disable the deprecated stuff |
| 16 | config GENERIC_HARDIRQS_NO_DEPRECATED | 13 | config GENERIC_HARDIRQS_NO_DEPRECATED |
| 17 | def_bool n | 14 | def_bool n |
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index e2347eb6330..3540a719012 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c | |||
| @@ -118,114 +118,3 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) | |||
| 118 | 118 | ||
| 119 | return retval; | 119 | return retval; |
| 120 | } | 120 | } |
| 121 | |||
| 122 | #ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ | ||
| 123 | |||
| 124 | #ifdef CONFIG_ENABLE_WARN_DEPRECATED | ||
| 125 | # warning __do_IRQ is deprecated. Please convert to proper flow handlers | ||
| 126 | #endif | ||
| 127 | |||
| 128 | /** | ||
| 129 | * __do_IRQ - original all in one highlevel IRQ handler | ||
| 130 | * @irq: the interrupt number | ||
| 131 | * | ||
| 132 | * __do_IRQ handles all normal device IRQ's (the special | ||
| 133 | * SMP cross-CPU interrupts have their own specific | ||
| 134 | * handlers). | ||
| 135 | * | ||
| 136 | * This is the original x86 implementation which is used for every | ||
| 137 | * interrupt type. | ||
| 138 | */ | ||
| 139 | unsigned int __do_IRQ(unsigned int irq) | ||
| 140 | { | ||
| 141 | struct irq_desc *desc = irq_to_desc(irq); | ||
| 142 | struct irqaction *action; | ||
| 143 | unsigned int status; | ||
| 144 | |||
| 145 | kstat_incr_irqs_this_cpu(irq, desc); | ||
| 146 | |||
| 147 | if (CHECK_IRQ_PER_CPU(desc->status)) { | ||
| 148 | irqreturn_t action_ret; | ||
| 149 | |||
| 150 | /* | ||
| 151 | * No locking required for CPU-local interrupts: | ||
| 152 | */ | ||
| 153 | if (desc->irq_data.chip->ack) | ||
| 154 | desc->irq_data.chip->ack(irq); | ||
| 155 | if (likely(!(desc->status & IRQ_DISABLED))) { | ||
| 156 | action_ret = handle_IRQ_event(irq, desc->action); | ||
| 157 | if (!noirqdebug) | ||
| 158 | note_interrupt(irq, desc, action_ret); | ||
| 159 | } | ||
| 160 | desc->irq_data.chip->end(irq); | ||
| 161 | return 1; | ||
| 162 | } | ||
| 163 | |||
| 164 | raw_spin_lock(&desc->lock); | ||
| 165 | if (desc->irq_data.chip->ack) | ||
| 166 | desc->irq_data.chip->ack(irq); | ||
| 167 | /* | ||
| 168 | * REPLAY is when Linux resends an IRQ that was dropped earlier | ||
| 169 | * WAITING is used by probe to mark irqs that are being tested | ||
| 170 | */ | ||
| 171 | status = desc->status & ~(IRQ_REPLAY | IRQ_WAITING); | ||
| 172 | status |= IRQ_PENDING; /* we _want_ to handle it */ | ||
| 173 | |||
| 174 | /* | ||
| 175 | * If the IRQ is disabled for whatever reason, we cannot | ||
| 176 | * use the action we have. | ||
| 177 | */ | ||
| 178 | action = NULL; | ||
| 179 | if (likely(!(status & (IRQ_DISABLED | IRQ_INPROGRESS)))) { | ||
| 180 | action = desc->action; | ||
| 181 | status &= ~IRQ_PENDING; /* we commit to handling */ | ||
| 182 | status |= IRQ_INPROGRESS; /* we are handling it */ | ||
| 183 | } | ||
| 184 | desc->status = status; | ||
| 185 | |||
| 186 | /* | ||
| 187 | * If there is no IRQ handler or it was disabled, exit early. | ||
| 188 | * Since we set PENDING, if another processor is handling | ||
| 189 | * a different instance of this same irq, the other processor | ||
| 190 | * will take care of it. | ||
| 191 | */ | ||
| 192 | if (unlikely(!action)) | ||
| 193 | goto out; | ||
| 194 | |||
| 195 | /* | ||
| 196 | * Edge triggered interrupts need to remember | ||
| 197 | * pending events. | ||
| 198 | * This applies to any hw interrupts that allow a second | ||
| 199 | * instance of the same irq to arrive while we are in do_IRQ | ||
| 200 | * or in the handler. But the code here only handles the _second_ | ||
| 201 | * instance of the irq, not the third or fourth. So it is mostly | ||
| 202 | * useful for irq hardware that does not mask cleanly in an | ||
| 203 | * SMP environment. | ||
| 204 | */ | ||
| 205 | for (;;) { | ||
| 206 | irqreturn_t action_ret; | ||
| 207 | |||
| 208 | raw_spin_unlock(&desc->lock); | ||
| 209 | |||
| 210 | action_ret = handle_IRQ_event(irq, action); | ||
| 211 | if (!noirqdebug) | ||
| 212 | note_interrupt(irq, desc, action_ret); | ||
| 213 | |||
| 214 | raw_spin_lock(&desc->lock); | ||
| 215 | if (likely(!(desc->status & IRQ_PENDING))) | ||
| 216 | break; | ||
| 217 | desc->status &= ~IRQ_PENDING; | ||
| 218 | } | ||
| 219 | desc->status &= ~IRQ_INPROGRESS; | ||
| 220 | |||
| 221 | out: | ||
| 222 | /* | ||
| 223 | * The ->end() handler has to deal with interrupts which got | ||
| 224 | * disabled while the handler was running. | ||
| 225 | */ | ||
| 226 | desc->irq_data.chip->end(irq); | ||
| 227 | raw_spin_unlock(&desc->lock); | ||
| 228 | |||
| 229 | return 1; | ||
| 230 | } | ||
| 231 | #endif | ||
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 4571ae7e085..99c3bc8a6fb 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h | |||
| @@ -3,6 +3,12 @@ | |||
| 3 | */ | 3 | */ |
| 4 | #include <linux/irqdesc.h> | 4 | #include <linux/irqdesc.h> |
| 5 | 5 | ||
| 6 | #ifdef CONFIG_SPARSE_IRQ | ||
| 7 | # define IRQ_BITMAP_BITS (NR_IRQS + 8196) | ||
| 8 | #else | ||
| 9 | # define IRQ_BITMAP_BITS NR_IRQS | ||
| 10 | #endif | ||
| 11 | |||
| 6 | extern int noirqdebug; | 12 | extern int noirqdebug; |
| 7 | 13 | ||
| 8 | #define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) | 14 | #define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) |
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 9988d03797f..2039bea31bd 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c | |||
| @@ -72,6 +72,8 @@ static inline int desc_node(struct irq_desc *desc) { return 0; } | |||
| 72 | 72 | ||
| 73 | static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) | 73 | static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) |
| 74 | { | 74 | { |
| 75 | int cpu; | ||
| 76 | |||
| 75 | desc->irq_data.irq = irq; | 77 | desc->irq_data.irq = irq; |
| 76 | desc->irq_data.chip = &no_irq_chip; | 78 | desc->irq_data.chip = &no_irq_chip; |
| 77 | desc->irq_data.chip_data = NULL; | 79 | desc->irq_data.chip_data = NULL; |
| @@ -83,7 +85,8 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) | |||
| 83 | desc->irq_count = 0; | 85 | desc->irq_count = 0; |
| 84 | desc->irqs_unhandled = 0; | 86 | desc->irqs_unhandled = 0; |
| 85 | desc->name = NULL; | 87 | desc->name = NULL; |
| 86 | memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs))); | 88 | for_each_possible_cpu(cpu) |
| 89 | *per_cpu_ptr(desc->kstat_irqs, cpu) = 0; | ||
| 87 | desc_smp_init(desc, node); | 90 | desc_smp_init(desc, node); |
| 88 | } | 91 | } |
| 89 | 92 | ||
| @@ -91,7 +94,7 @@ int nr_irqs = NR_IRQS; | |||
| 91 | EXPORT_SYMBOL_GPL(nr_irqs); | 94 | EXPORT_SYMBOL_GPL(nr_irqs); |
| 92 | 95 | ||
| 93 | static DEFINE_MUTEX(sparse_irq_lock); | 96 | static DEFINE_MUTEX(sparse_irq_lock); |
| 94 | static DECLARE_BITMAP(allocated_irqs, NR_IRQS); | 97 | static DECLARE_BITMAP(allocated_irqs, IRQ_BITMAP_BITS); |
| 95 | 98 | ||
| 96 | #ifdef CONFIG_SPARSE_IRQ | 99 | #ifdef CONFIG_SPARSE_IRQ |
| 97 | 100 | ||
| @@ -133,8 +136,7 @@ static struct irq_desc *alloc_desc(int irq, int node) | |||
| 133 | if (!desc) | 136 | if (!desc) |
| 134 | return NULL; | 137 | return NULL; |
| 135 | /* allocate based on nr_cpu_ids */ | 138 | /* allocate based on nr_cpu_ids */ |
| 136 | desc->kstat_irqs = kzalloc_node(nr_cpu_ids * sizeof(*desc->kstat_irqs), | 139 | desc->kstat_irqs = alloc_percpu(unsigned int); |
| 137 | gfp, node); | ||
| 138 | if (!desc->kstat_irqs) | 140 | if (!desc->kstat_irqs) |
| 139 | goto err_desc; | 141 | goto err_desc; |
| 140 | 142 | ||
| @@ -149,7 +151,7 @@ static struct irq_desc *alloc_desc(int irq, int node) | |||
| 149 | return desc; | 151 | return desc; |
| 150 | 152 | ||
| 151 | err_kstat: | 153 | err_kstat: |
| 152 | kfree(desc->kstat_irqs); | 154 | free_percpu(desc->kstat_irqs); |
| 153 | err_desc: | 155 | err_desc: |
| 154 | kfree(desc); | 156 | kfree(desc); |
| 155 | return NULL; | 157 | return NULL; |
| @@ -166,7 +168,7 @@ static void free_desc(unsigned int irq) | |||
| 166 | mutex_unlock(&sparse_irq_lock); | 168 | mutex_unlock(&sparse_irq_lock); |
| 167 | 169 | ||
| 168 | free_masks(desc); | 170 | free_masks(desc); |
| 169 | kfree(desc->kstat_irqs); | 171 | free_percpu(desc->kstat_irqs); |
| 170 | kfree(desc); | 172 | kfree(desc); |
| 171 | } | 173 | } |
| 172 | 174 | ||
| @@ -215,6 +217,15 @@ int __init early_irq_init(void) | |||
| 215 | initcnt = arch_probe_nr_irqs(); | 217 | initcnt = arch_probe_nr_irqs(); |
| 216 | printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d %d\n", NR_IRQS, nr_irqs, initcnt); | 218 | printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d %d\n", NR_IRQS, nr_irqs, initcnt); |
| 217 | 219 | ||
| 220 | if (WARN_ON(nr_irqs > IRQ_BITMAP_BITS)) | ||
| 221 | nr_irqs = IRQ_BITMAP_BITS; | ||
| 222 | |||
| 223 | if (WARN_ON(initcnt > IRQ_BITMAP_BITS)) | ||
| 224 | initcnt = IRQ_BITMAP_BITS; | ||
| 225 | |||
| 226 | if (initcnt > nr_irqs) | ||
| 227 | nr_irqs = initcnt; | ||
| 228 | |||
| 218 | for (i = 0; i < initcnt; i++) { | 229 | for (i = 0; i < initcnt; i++) { |
| 219 | desc = alloc_desc(i, node); | 230 | desc = alloc_desc(i, node); |
| 220 | set_bit(i, allocated_irqs); | 231 | set_bit(i, allocated_irqs); |
| @@ -234,7 +245,6 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { | |||
| 234 | } | 245 | } |
| 235 | }; | 246 | }; |
| 236 | 247 | ||
| 237 | static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS]; | ||
| 238 | int __init early_irq_init(void) | 248 | int __init early_irq_init(void) |
| 239 | { | 249 | { |
| 240 | int count, i, node = first_online_node; | 250 | int count, i, node = first_online_node; |
| @@ -250,7 +260,8 @@ int __init early_irq_init(void) | |||
| 250 | for (i = 0; i < count; i++) { | 260 | for (i = 0; i < count; i++) { |
| 251 | desc[i].irq_data.irq = i; | 261 | desc[i].irq_data.irq = i; |
| 252 | desc[i].irq_data.chip = &no_irq_chip; | 262 | desc[i].irq_data.chip = &no_irq_chip; |
| 253 | desc[i].kstat_irqs = kstat_irqs_all[i]; | 263 | /* TODO : do this allocation on-demand ... */ |
| 264 | desc[i].kstat_irqs = alloc_percpu(unsigned int); | ||
| 254 | alloc_masks(desc + i, GFP_KERNEL, node); | 265 | alloc_masks(desc + i, GFP_KERNEL, node); |
| 255 | desc_smp_init(desc + i, node); | 266 | desc_smp_init(desc + i, node); |
| 256 | lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); | 267 | lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); |
| @@ -275,6 +286,22 @@ static void free_desc(unsigned int irq) | |||
| 275 | 286 | ||
| 276 | static inline int alloc_descs(unsigned int start, unsigned int cnt, int node) | 287 | static inline int alloc_descs(unsigned int start, unsigned int cnt, int node) |
| 277 | { | 288 | { |
| 289 | #if defined(CONFIG_KSTAT_IRQS_ONDEMAND) | ||
| 290 | struct irq_desc *desc; | ||
| 291 | unsigned int i; | ||
| 292 | |||
| 293 | for (i = 0; i < cnt; i++) { | ||
| 294 | desc = irq_to_desc(start + i); | ||
| 295 | if (desc && !desc->kstat_irqs) { | ||
| 296 | unsigned int __percpu *stats = alloc_percpu(unsigned int); | ||
| 297 | |||
| 298 | if (!stats) | ||
| 299 | return -1; | ||
| 300 | if (cmpxchg(&desc->kstat_irqs, NULL, stats) != NULL) | ||
| 301 | free_percpu(stats); | ||
| 302 | } | ||
| 303 | } | ||
| 304 | #endif | ||
| 278 | return start; | 305 | return start; |
| 279 | } | 306 | } |
| 280 | #endif /* !CONFIG_SPARSE_IRQ */ | 307 | #endif /* !CONFIG_SPARSE_IRQ */ |
| @@ -391,7 +418,9 @@ void dynamic_irq_cleanup(unsigned int irq) | |||
| 391 | unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) | 418 | unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) |
| 392 | { | 419 | { |
| 393 | struct irq_desc *desc = irq_to_desc(irq); | 420 | struct irq_desc *desc = irq_to_desc(irq); |
| 394 | return desc ? desc->kstat_irqs[cpu] : 0; | 421 | |
| 422 | return desc && desc->kstat_irqs ? | ||
| 423 | *per_cpu_ptr(desc->kstat_irqs, cpu) : 0; | ||
| 395 | } | 424 | } |
| 396 | 425 | ||
| 397 | #ifdef CONFIG_GENERIC_HARDIRQS | 426 | #ifdef CONFIG_GENERIC_HARDIRQS |
| @@ -401,10 +430,10 @@ unsigned int kstat_irqs(unsigned int irq) | |||
| 401 | int cpu; | 430 | int cpu; |
| 402 | int sum = 0; | 431 | int sum = 0; |
| 403 | 432 | ||
| 404 | if (!desc) | 433 | if (!desc || !desc->kstat_irqs) |
| 405 | return 0; | 434 | return 0; |
| 406 | for_each_possible_cpu(cpu) | 435 | for_each_possible_cpu(cpu) |
| 407 | sum += desc->kstat_irqs[cpu]; | 436 | sum += *per_cpu_ptr(desc->kstat_irqs, cpu); |
| 408 | return sum; | 437 | return sum; |
| 409 | } | 438 | } |
| 410 | #endif /* CONFIG_GENERIC_HARDIRQS */ | 439 | #endif /* CONFIG_GENERIC_HARDIRQS */ |
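The irqdesc.c hunks above retire both the static kstat_irqs_all[NR_IRQS][NR_CPUS] array and the kzalloc'ed per-IRQ array in favour of real per-CPU counters. The underlying pattern, sketched with the generic percpu API (the counter and function names are illustrative):

        #include <linux/cpumask.h>
        #include <linux/errno.h>
        #include <linux/percpu.h>

        static unsigned int __percpu *counts;

        static int counts_init(void)
        {
                counts = alloc_percpu(unsigned int);    /* one slot per possible CPU */
                return counts ? 0 : -ENOMEM;
        }

        /* Hot path: bump this CPU's slot, no locking or cacheline bouncing. */
        static void counts_inc(void)
        {
                __this_cpu_inc(*counts);
        }

        /* Slow path (e.g. /proc/interrupts): sum the slots of every possible CPU. */
        static unsigned int counts_sum(void)
        {
                unsigned int sum = 0;
                int cpu;

                for_each_possible_cpu(cpu)
                        sum += *per_cpu_ptr(counts, cpu);
                return sum;
        }

        static void counts_free(void)
        {
                free_percpu(counts);
        }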
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 5f92acc5f95..9033c1c7082 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
| @@ -577,7 +577,9 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { } | |||
| 577 | */ | 577 | */ |
| 578 | static int irq_thread(void *data) | 578 | static int irq_thread(void *data) |
| 579 | { | 579 | { |
| 580 | struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, }; | 580 | static const struct sched_param param = { |
| 581 | .sched_priority = MAX_USER_RT_PRIO/2, | ||
| 582 | }; | ||
| 581 | struct irqaction *action = data; | 583 | struct irqaction *action = data; |
| 582 | struct irq_desc *desc = irq_to_desc(action->irq); | 584 | struct irq_desc *desc = irq_to_desc(action->irq); |
| 583 | int wake, oneshot = desc->status & IRQ_ONESHOT; | 585 | int wake, oneshot = desc->status & IRQ_ONESHOT; |
| @@ -1098,7 +1100,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, | |||
| 1098 | if (retval) | 1100 | if (retval) |
| 1099 | kfree(action); | 1101 | kfree(action); |
| 1100 | 1102 | ||
| 1101 | #ifdef CONFIG_DEBUG_SHIRQ | 1103 | #ifdef CONFIG_DEBUG_SHIRQ_FIXME |
| 1102 | if (!retval && (irqflags & IRQF_SHARED)) { | 1104 | if (!retval && (irqflags & IRQF_SHARED)) { |
| 1103 | /* | 1105 | /* |
| 1104 | * It's a shared IRQ -- the driver ought to be prepared for it | 1106 | * It's a shared IRQ -- the driver ought to be prepared for it |
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 1d254194048..441fd629ff0 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c | |||
| @@ -56,6 +56,7 @@ void move_masked_irq(int irq) | |||
| 56 | void move_native_irq(int irq) | 56 | void move_native_irq(int irq) |
| 57 | { | 57 | { |
| 58 | struct irq_desc *desc = irq_to_desc(irq); | 58 | struct irq_desc *desc = irq_to_desc(irq); |
| 59 | bool masked; | ||
| 59 | 60 | ||
| 60 | if (likely(!(desc->status & IRQ_MOVE_PENDING))) | 61 | if (likely(!(desc->status & IRQ_MOVE_PENDING))) |
| 61 | return; | 62 | return; |
| @@ -63,8 +64,15 @@ void move_native_irq(int irq) | |||
| 63 | if (unlikely(desc->status & IRQ_DISABLED)) | 64 | if (unlikely(desc->status & IRQ_DISABLED)) |
| 64 | return; | 65 | return; |
| 65 | 66 | ||
| 66 | desc->irq_data.chip->irq_mask(&desc->irq_data); | 67 | /* |
| 68 | * Be careful vs. already masked interrupts. If this is a | ||
| 69 | * threaded interrupt with ONESHOT set, we can end up with an | ||
| 70 | * interrupt storm. | ||
| 71 | */ | ||
| 72 | masked = desc->status & IRQ_MASKED; | ||
| 73 | if (!masked) | ||
| 74 | desc->irq_data.chip->irq_mask(&desc->irq_data); | ||
| 67 | move_masked_irq(irq); | 75 | move_masked_irq(irq); |
| 68 | desc->irq_data.chip->irq_unmask(&desc->irq_data); | 76 | if (!masked) |
| 77 | desc->irq_data.chip->irq_unmask(&desc->irq_data); | ||
| 69 | } | 78 | } |
| 70 | |||
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 01b1d3a8898..6c8a2a9f8a7 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c | |||
| @@ -214,7 +214,7 @@ static int irq_spurious_proc_show(struct seq_file *m, void *v) | |||
| 214 | 214 | ||
| 215 | static int irq_spurious_proc_open(struct inode *inode, struct file *file) | 215 | static int irq_spurious_proc_open(struct inode *inode, struct file *file) |
| 216 | { | 216 | { |
| 217 | return single_open(file, irq_spurious_proc_show, NULL); | 217 | return single_open(file, irq_spurious_proc_show, PDE(inode)->data); |
| 218 | } | 218 | } |
| 219 | 219 | ||
| 220 | static const struct file_operations irq_spurious_proc_fops = { | 220 | static const struct file_operations irq_spurious_proc_fops = { |
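The single_open() fix above matters because irq_spurious_proc_show() takes its IRQ from seq_file->private, which single_open() sets from its third argument; passing NULL made every per-IRQ "spurious" file report IRQ 0's statistics. A sketch of the intended plumbing with the procfs API of this kernel generation (entry and function names here are illustrative):

        #include <linux/proc_fs.h>
        #include <linux/seq_file.h>

        static int my_spurious_show(struct seq_file *m, void *v)
        {
                long irq = (long)m->private;    /* handed over by single_open() */

                seq_printf(m, "irq %ld\n", irq);
                return 0;
        }

        static int my_spurious_open(struct inode *inode, struct file *file)
        {
                /*
                 * PDE(inode)->data is whatever proc_create_data() stored for
                 * this entry -- in kernel/irq/proc.c the IRQ number cast to a
                 * pointer.
                 */
                return single_open(file, my_spurious_show, PDE(inode)->data);
        }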
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 891115a929a..dc49358b73f 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c | |||
| @@ -23,7 +23,7 @@ | |||
| 23 | #ifdef CONFIG_HARDIRQS_SW_RESEND | 23 | #ifdef CONFIG_HARDIRQS_SW_RESEND |
| 24 | 24 | ||
| 25 | /* Bitmap to handle software resend of interrupts: */ | 25 | /* Bitmap to handle software resend of interrupts: */ |
| 26 | static DECLARE_BITMAP(irqs_resend, NR_IRQS); | 26 | static DECLARE_BITMAP(irqs_resend, IRQ_BITMAP_BITS); |
| 27 | 27 | ||
| 28 | /* | 28 | /* |
| 29 | * Run software resends of IRQ's | 29 | * Run software resends of IRQ's |
diff --git a/kernel/irq_work.c b/kernel/irq_work.c index f16763ff848..c58fa7da8ae 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c | |||
| @@ -77,21 +77,21 @@ void __weak arch_irq_work_raise(void) | |||
| 77 | */ | 77 | */ |
| 78 | static void __irq_work_queue(struct irq_work *entry) | 78 | static void __irq_work_queue(struct irq_work *entry) |
| 79 | { | 79 | { |
| 80 | struct irq_work **head, *next; | 80 | struct irq_work *next; |
| 81 | 81 | ||
| 82 | head = &get_cpu_var(irq_work_list); | 82 | preempt_disable(); |
| 83 | 83 | ||
| 84 | do { | 84 | do { |
| 85 | next = *head; | 85 | next = __this_cpu_read(irq_work_list); |
| 86 | /* Can assign non-atomic because we keep the flags set. */ | 86 | /* Can assign non-atomic because we keep the flags set. */ |
| 87 | entry->next = next_flags(next, IRQ_WORK_FLAGS); | 87 | entry->next = next_flags(next, IRQ_WORK_FLAGS); |
| 88 | } while (cmpxchg(head, next, entry) != next); | 88 | } while (this_cpu_cmpxchg(irq_work_list, next, entry) != next); |
| 89 | 89 | ||
| 90 | /* The list was empty, raise self-interrupt to start processing. */ | 90 | /* The list was empty, raise self-interrupt to start processing. */ |
| 91 | if (!irq_work_next(entry)) | 91 | if (!irq_work_next(entry)) |
| 92 | arch_irq_work_raise(); | 92 | arch_irq_work_raise(); |
| 93 | 93 | ||
| 94 | put_cpu_var(irq_work_list); | 94 | preempt_enable(); |
| 95 | } | 95 | } |
| 96 | 96 | ||
| 97 | /* | 97 | /* |
| @@ -120,16 +120,16 @@ EXPORT_SYMBOL_GPL(irq_work_queue); | |||
| 120 | */ | 120 | */ |
| 121 | void irq_work_run(void) | 121 | void irq_work_run(void) |
| 122 | { | 122 | { |
| 123 | struct irq_work *list, **head; | 123 | struct irq_work *list; |
| 124 | 124 | ||
| 125 | head = &__get_cpu_var(irq_work_list); | 125 | if (this_cpu_read(irq_work_list) == NULL) |
| 126 | if (*head == NULL) | ||
| 127 | return; | 126 | return; |
| 128 | 127 | ||
| 129 | BUG_ON(!in_irq()); | 128 | BUG_ON(!in_irq()); |
| 130 | BUG_ON(!irqs_disabled()); | 129 | BUG_ON(!irqs_disabled()); |
| 131 | 130 | ||
| 132 | list = xchg(head, NULL); | 131 | list = this_cpu_xchg(irq_work_list, NULL); |
| 132 | |||
| 133 | while (list != NULL) { | 133 | while (list != NULL) { |
| 134 | struct irq_work *entry = list; | 134 | struct irq_work *entry = list; |
| 135 | 135 | ||
| @@ -145,7 +145,9 @@ void irq_work_run(void) | |||
| 145 | * Clear the BUSY bit and return to the free state if | 145 | * Clear the BUSY bit and return to the free state if |
| 146 | * no-one else claimed it meanwhile. | 146 | * no-one else claimed it meanwhile. |
| 147 | */ | 147 | */ |
| 148 | cmpxchg(&entry->next, next_flags(NULL, IRQ_WORK_BUSY), NULL); | 148 | (void)cmpxchg(&entry->next, |
| 149 | next_flags(NULL, IRQ_WORK_BUSY), | ||
| 150 | NULL); | ||
| 149 | } | 151 | } |
| 150 | } | 152 | } |
| 151 | EXPORT_SYMBOL_GPL(irq_work_run); | 153 | EXPORT_SYMBOL_GPL(irq_work_run); |
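irq_work.c above stops taking the address of the per-CPU list head (get_cpu_var() plus cmpxchg() on that pointer) and uses the this_cpu_*() accessors, which always operate on the executing CPU's slot. A small sketch of that lock-free per-CPU push/drain scheme, with an illustrative node type standing in for struct irq_work (the flag bits packed into ->next by the real code are omitted):

        #include <linux/percpu.h>
        #include <linux/preempt.h>

        struct work_node {
                struct work_node *next;
        };

        static DEFINE_PER_CPU(struct work_node *, pending_list);

        /* Push onto this CPU's list; the cmpxchg retries if an interrupt queued
         * another node between the read and the update. */
        static void pending_push(struct work_node *n)
        {
                struct work_node *head;

                preempt_disable();              /* stay on one CPU's slot */
                do {
                        head = __this_cpu_read(pending_list);
                        n->next = head;
                } while (this_cpu_cmpxchg(pending_list, head, n) != head);
                preempt_enable();
        }

        /* Detach the whole list atomically for processing, as irq_work_run() does. */
        static struct work_node *pending_drain(void)
        {
                return this_cpu_xchg(pending_list, NULL);
        }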
diff --git a/kernel/kexec.c b/kernel/kexec.c index b55045bc756..ec19b92c7eb 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c | |||
| @@ -163,7 +163,7 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, | |||
| 163 | * just verifies it is an address we can use. | 163 | * just verifies it is an address we can use. |
| 164 | * | 164 | * |
| 165 | * Since the kernel does everything in page size chunks ensure | 165 | * Since the kernel does everything in page size chunks ensure |
| 166 | * the destination addreses are page aligned. Too many | 166 | * the destination addresses are page aligned. Too many |
| 167 | * special cases crop up when we don't do this. The most | 167 | * special cases crop up when we don't do this. The most |
| 168 | * insidious is getting overlapping destination addresses | 168 | * insidious is getting overlapping destination addresses |
| 169 | * simply because addresses are changed to page size | 169 | * simply because addresses are changed to page size |
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 9737a76e106..77981813a1e 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
| @@ -317,12 +317,12 @@ void __kprobes free_optinsn_slot(kprobe_opcode_t * slot, int dirty) | |||
| 317 | /* We have preemption disabled.. so it is safe to use __ versions */ | 317 | /* We have preemption disabled.. so it is safe to use __ versions */ |
| 318 | static inline void set_kprobe_instance(struct kprobe *kp) | 318 | static inline void set_kprobe_instance(struct kprobe *kp) |
| 319 | { | 319 | { |
| 320 | __get_cpu_var(kprobe_instance) = kp; | 320 | __this_cpu_write(kprobe_instance, kp); |
| 321 | } | 321 | } |
| 322 | 322 | ||
| 323 | static inline void reset_kprobe_instance(void) | 323 | static inline void reset_kprobe_instance(void) |
| 324 | { | 324 | { |
| 325 | __get_cpu_var(kprobe_instance) = NULL; | 325 | __this_cpu_write(kprobe_instance, NULL); |
| 326 | } | 326 | } |
| 327 | 327 | ||
| 328 | /* | 328 | /* |
| @@ -354,13 +354,20 @@ static inline int kprobe_aggrprobe(struct kprobe *p) | |||
| 354 | return p->pre_handler == aggr_pre_handler; | 354 | return p->pre_handler == aggr_pre_handler; |
| 355 | } | 355 | } |
| 356 | 356 | ||
| 357 | /* Return true(!0) if the kprobe is unused */ | ||
| 358 | static inline int kprobe_unused(struct kprobe *p) | ||
| 359 | { | ||
| 360 | return kprobe_aggrprobe(p) && kprobe_disabled(p) && | ||
| 361 | list_empty(&p->list); | ||
| 362 | } | ||
| 363 | |||
| 357 | /* | 364 | /* |
| 358 | * Keep all fields in the kprobe consistent | 365 | * Keep all fields in the kprobe consistent |
| 359 | */ | 366 | */ |
| 360 | static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p) | 367 | static inline void copy_kprobe(struct kprobe *ap, struct kprobe *p) |
| 361 | { | 368 | { |
| 362 | memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t)); | 369 | memcpy(&p->opcode, &ap->opcode, sizeof(kprobe_opcode_t)); |
| 363 | memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn)); | 370 | memcpy(&p->ainsn, &ap->ainsn, sizeof(struct arch_specific_insn)); |
| 364 | } | 371 | } |
| 365 | 372 | ||
| 366 | #ifdef CONFIG_OPTPROBES | 373 | #ifdef CONFIG_OPTPROBES |
| @@ -384,6 +391,17 @@ void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs) | |||
| 384 | } | 391 | } |
| 385 | } | 392 | } |
| 386 | 393 | ||
| 394 | /* Free optimized instructions and optimized_kprobe */ | ||
| 395 | static __kprobes void free_aggr_kprobe(struct kprobe *p) | ||
| 396 | { | ||
| 397 | struct optimized_kprobe *op; | ||
| 398 | |||
| 399 | op = container_of(p, struct optimized_kprobe, kp); | ||
| 400 | arch_remove_optimized_kprobe(op); | ||
| 401 | arch_remove_kprobe(p); | ||
| 402 | kfree(op); | ||
| 403 | } | ||
| 404 | |||
| 387 | /* Return true(!0) if the kprobe is ready for optimization. */ | 405 | /* Return true(!0) if the kprobe is ready for optimization. */ |
| 388 | static inline int kprobe_optready(struct kprobe *p) | 406 | static inline int kprobe_optready(struct kprobe *p) |
| 389 | { | 407 | { |
| @@ -397,6 +415,33 @@ static inline int kprobe_optready(struct kprobe *p) | |||
| 397 | return 0; | 415 | return 0; |
| 398 | } | 416 | } |
| 399 | 417 | ||
| 418 | /* Return true(!0) if the kprobe is disarmed. Note: p must be on hash list */ | ||
| 419 | static inline int kprobe_disarmed(struct kprobe *p) | ||
| 420 | { | ||
| 421 | struct optimized_kprobe *op; | ||
| 422 | |||
| 423 | /* If kprobe is not aggr/opt probe, just return kprobe is disabled */ | ||
| 424 | if (!kprobe_aggrprobe(p)) | ||
| 425 | return kprobe_disabled(p); | ||
| 426 | |||
| 427 | op = container_of(p, struct optimized_kprobe, kp); | ||
| 428 | |||
| 429 | return kprobe_disabled(p) && list_empty(&op->list); | ||
| 430 | } | ||
| 431 | |||
| 432 | /* Return true(!0) if the probe is queued on (un)optimizing lists */ | ||
| 433 | static int __kprobes kprobe_queued(struct kprobe *p) | ||
| 434 | { | ||
| 435 | struct optimized_kprobe *op; | ||
| 436 | |||
| 437 | if (kprobe_aggrprobe(p)) { | ||
| 438 | op = container_of(p, struct optimized_kprobe, kp); | ||
| 439 | if (!list_empty(&op->list)) | ||
| 440 | return 1; | ||
| 441 | } | ||
| 442 | return 0; | ||
| 443 | } | ||
| 444 | |||
| 400 | /* | 445 | /* |
| 401 | * Return an optimized kprobe whose optimizing code replaces | 446 | * Return an optimized kprobe whose optimizing code replaces |
| 402 | * instructions including addr (exclude breakpoint). | 447 | * instructions including addr (exclude breakpoint). |
| @@ -422,30 +467,23 @@ static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr) | |||
| 422 | 467 | ||
| 423 | /* Optimization staging list, protected by kprobe_mutex */ | 468 | /* Optimization staging list, protected by kprobe_mutex */ |
| 424 | static LIST_HEAD(optimizing_list); | 469 | static LIST_HEAD(optimizing_list); |
| 470 | static LIST_HEAD(unoptimizing_list); | ||
| 425 | 471 | ||
| 426 | static void kprobe_optimizer(struct work_struct *work); | 472 | static void kprobe_optimizer(struct work_struct *work); |
| 427 | static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); | 473 | static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); |
| 474 | static DECLARE_COMPLETION(optimizer_comp); | ||
| 428 | #define OPTIMIZE_DELAY 5 | 475 | #define OPTIMIZE_DELAY 5 |
| 429 | 476 | ||
| 430 | /* Kprobe jump optimizer */ | 477 | /* |
| 431 | static __kprobes void kprobe_optimizer(struct work_struct *work) | 478 | * Optimize (replace a breakpoint with a jump) kprobes listed on |
| 479 | * optimizing_list. | ||
| 480 | */ | ||
| 481 | static __kprobes void do_optimize_kprobes(void) | ||
| 432 | { | 482 | { |
| 433 | struct optimized_kprobe *op, *tmp; | 483 | /* Optimization is never done when disarmed */ |
| 434 | 484 | if (kprobes_all_disarmed || !kprobes_allow_optimization || | |
| 435 | /* Lock modules while optimizing kprobes */ | 485 | list_empty(&optimizing_list)) |
| 436 | mutex_lock(&module_mutex); | 486 | return; |
| 437 | mutex_lock(&kprobe_mutex); | ||
| 438 | if (kprobes_all_disarmed || !kprobes_allow_optimization) | ||
| 439 | goto end; | ||
| 440 | |||
| 441 | /* | ||
| 442 | * Wait for quiesence period to ensure all running interrupts | ||
| 443 | * are done. Because optprobe may modify multiple instructions | ||
| 444 | * there is a chance that Nth instruction is interrupted. In that | ||
| 445 | * case, running interrupt can return to 2nd-Nth byte of jump | ||
| 446 | * instruction. This wait is for avoiding it. | ||
| 447 | */ | ||
| 448 | synchronize_sched(); | ||
| 449 | 487 | ||
| 450 | /* | 488 | /* |
| 451 | * The optimization/unoptimization refers online_cpus via | 489 | * The optimization/unoptimization refers online_cpus via |
| @@ -459,17 +497,111 @@ static __kprobes void kprobe_optimizer(struct work_struct *work) | |||
| 459 | */ | 497 | */ |
| 460 | get_online_cpus(); | 498 | get_online_cpus(); |
| 461 | mutex_lock(&text_mutex); | 499 | mutex_lock(&text_mutex); |
| 462 | list_for_each_entry_safe(op, tmp, &optimizing_list, list) { | 500 | arch_optimize_kprobes(&optimizing_list); |
| 463 | WARN_ON(kprobe_disabled(&op->kp)); | 501 | mutex_unlock(&text_mutex); |
| 464 | if (arch_optimize_kprobe(op) < 0) | 502 | put_online_cpus(); |
| 465 | op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; | 503 | } |
| 466 | list_del_init(&op->list); | 504 | |
| 505 | /* | ||
| 506 | * Unoptimize (replace a jump with a breakpoint and remove the breakpoint | ||
| 507 | * if need) kprobes listed on unoptimizing_list. | ||
| 508 | */ | ||
| 509 | static __kprobes void do_unoptimize_kprobes(struct list_head *free_list) | ||
| 510 | { | ||
| 511 | struct optimized_kprobe *op, *tmp; | ||
| 512 | |||
| 513 | /* Unoptimization must always be done (it cannot be skipped) */ | ||
| 514 | if (list_empty(&unoptimizing_list)) | ||
| 515 | return; | ||
| 516 | |||
| 517 | /* Ditto to do_optimize_kprobes */ | ||
| 518 | get_online_cpus(); | ||
| 519 | mutex_lock(&text_mutex); | ||
| 520 | arch_unoptimize_kprobes(&unoptimizing_list, free_list); | ||
| 521 | /* Loop free_list for disarming */ | ||
| 522 | list_for_each_entry_safe(op, tmp, free_list, list) { | ||
| 523 | /* Disarm probes if marked disabled */ | ||
| 524 | if (kprobe_disabled(&op->kp)) | ||
| 525 | arch_disarm_kprobe(&op->kp); | ||
| 526 | if (kprobe_unused(&op->kp)) { | ||
| 527 | /* | ||
| 528 | * Remove unused probes from hash list. After waiting | ||
| 529 | * for synchronization, these probes are reclaimed. | ||
| 530 | * (reclaiming is done by do_free_cleaned_kprobes.) | ||
| 531 | */ | ||
| 532 | hlist_del_rcu(&op->kp.hlist); | ||
| 533 | } else | ||
| 534 | list_del_init(&op->list); | ||
| 467 | } | 535 | } |
| 468 | mutex_unlock(&text_mutex); | 536 | mutex_unlock(&text_mutex); |
| 469 | put_online_cpus(); | 537 | put_online_cpus(); |
| 470 | end: | 538 | } |
| 539 | |||
| 540 | /* Reclaim all kprobes on the free_list */ | ||
| 541 | static __kprobes void do_free_cleaned_kprobes(struct list_head *free_list) | ||
| 542 | { | ||
| 543 | struct optimized_kprobe *op, *tmp; | ||
| 544 | |||
| 545 | list_for_each_entry_safe(op, tmp, free_list, list) { | ||
| 546 | BUG_ON(!kprobe_unused(&op->kp)); | ||
| 547 | list_del_init(&op->list); | ||
| 548 | free_aggr_kprobe(&op->kp); | ||
| 549 | } | ||
| 550 | } | ||
| 551 | |||
| 552 | /* Start optimizer after OPTIMIZE_DELAY passed */ | ||
| 553 | static __kprobes void kick_kprobe_optimizer(void) | ||
| 554 | { | ||
| 555 | if (!delayed_work_pending(&optimizing_work)) | ||
| 556 | schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY); | ||
| 557 | } | ||
| 558 | |||
| 559 | /* Kprobe jump optimizer */ | ||
| 560 | static __kprobes void kprobe_optimizer(struct work_struct *work) | ||
| 561 | { | ||
| 562 | LIST_HEAD(free_list); | ||
| 563 | |||
| 564 | /* Lock modules while optimizing kprobes */ | ||
| 565 | mutex_lock(&module_mutex); | ||
| 566 | mutex_lock(&kprobe_mutex); | ||
| 567 | |||
| 568 | /* | ||
| 569 | * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed) | ||
| 570 | * kprobes before waiting for the quiescence period. | ||
| 571 | */ | ||
| 572 | do_unoptimize_kprobes(&free_list); | ||
| 573 | |||
| 574 | /* | ||
| 575 | * Step 2: Wait for a quiescence period to ensure all running interrupts | ||
| 576 | * are done. Because optprobe may modify multiple instructions | ||
| 577 | * there is a chance that Nth instruction is interrupted. In that | ||
| 578 | * case, running interrupt can return to 2nd-Nth byte of jump | ||
| 579 | * instruction. This wait is for avoiding it. | ||
| 580 | */ | ||
| 581 | synchronize_sched(); | ||
| 582 | |||
| 583 | /* Step 3: Optimize kprobes after the quiescence period */ | ||
| 584 | do_optimize_kprobes(); | ||
| 585 | |||
| 586 | /* Step 4: Free cleaned kprobes after the quiescence period */ | ||
| 587 | do_free_cleaned_kprobes(&free_list); | ||
| 588 | |||
| 471 | mutex_unlock(&kprobe_mutex); | 589 | mutex_unlock(&kprobe_mutex); |
| 472 | mutex_unlock(&module_mutex); | 590 | mutex_unlock(&module_mutex); |
| 591 | |||
| 592 | /* Step 5: Kick optimizer again if needed */ | ||
| 593 | if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) | ||
| 594 | kick_kprobe_optimizer(); | ||
| 595 | else | ||
| 596 | /* Wake up all waiters */ | ||
| 597 | complete_all(&optimizer_comp); | ||
| 598 | } | ||
| 599 | |||
| 600 | /* Wait for completing optimization and unoptimization */ | ||
| 601 | static __kprobes void wait_for_kprobe_optimizer(void) | ||
| 602 | { | ||
| 603 | if (delayed_work_pending(&optimizing_work)) | ||
| 604 | wait_for_completion(&optimizer_comp); | ||
| 473 | } | 605 | } |
| 474 | 606 | ||
| 475 | /* Optimize kprobe if p is ready to be optimized */ | 607 | /* Optimize kprobe if p is ready to be optimized */ |
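
Editor's note: the rewritten kprobe_optimizer() above works in fixed stages: unoptimize queued probes, wait one scheduler-RCU quiescence period, optimize queued probes, then reclaim probes that ended up unused. The following is a user-space toy model of that ordering only, with invented names; it is not kernel code.

#include <stdio.h>

enum state { ST_UNUSED, ST_QUEUED_OPT, ST_QUEUED_UNOPT, ST_OPTIMIZED, ST_FREED };

struct probe { const char *name; enum state st; };

static void quiescence_wait(void) { puts("  (synchronize_sched: wait for quiescence)"); }

static void optimizer_pass(struct probe *p, int n)
{
	int i;

	/* Step 1: unoptimize everything queued for unoptimization */
	for (i = 0; i < n; i++)
		if (p[i].st == ST_QUEUED_UNOPT)
			p[i].st = ST_UNUSED;
	/* Step 2: wait until no CPU can still be executing inside an old jump */
	quiescence_wait();
	/* Step 3: optimize everything queued for optimization */
	for (i = 0; i < n; i++)
		if (p[i].st == ST_QUEUED_OPT)
			p[i].st = ST_OPTIMIZED;
	/* Step 4: reclaim probes that ended up unused after the wait */
	for (i = 0; i < n; i++)
		if (p[i].st == ST_UNUSED)
			p[i].st = ST_FREED;
}

int main(void)
{
	struct probe probes[3] = {
		{ "p1", ST_QUEUED_OPT }, { "p2", ST_QUEUED_UNOPT }, { "p3", ST_OPTIMIZED },
	};
	int i;

	optimizer_pass(probes, 3);
	for (i = 0; i < 3; i++)
		printf("%s -> %d\n", probes[i].name, probes[i].st);
	return 0;
}
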
| @@ -495,42 +627,99 @@ static __kprobes void optimize_kprobe(struct kprobe *p) | |||
| 495 | /* Check if it is already optimized. */ | 627 | /* Check if it is already optimized. */ |
| 496 | if (op->kp.flags & KPROBE_FLAG_OPTIMIZED) | 628 | if (op->kp.flags & KPROBE_FLAG_OPTIMIZED) |
| 497 | return; | 629 | return; |
| 498 | |||
| 499 | op->kp.flags |= KPROBE_FLAG_OPTIMIZED; | 630 | op->kp.flags |= KPROBE_FLAG_OPTIMIZED; |
| 500 | list_add(&op->list, &optimizing_list); | 631 | |
| 501 | if (!delayed_work_pending(&optimizing_work)) | 632 | if (!list_empty(&op->list)) |
| 502 | schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY); | 633 | /* This is under unoptimizing. Just dequeue the probe */ |
| 634 | list_del_init(&op->list); | ||
| 635 | else { | ||
| 636 | list_add(&op->list, &optimizing_list); | ||
| 637 | kick_kprobe_optimizer(); | ||
| 638 | } | ||
| 639 | } | ||
| 640 | |||
| 641 | /* Short cut to direct unoptimizing */ | ||
| 642 | static __kprobes void force_unoptimize_kprobe(struct optimized_kprobe *op) | ||
| 643 | { | ||
| 644 | get_online_cpus(); | ||
| 645 | arch_unoptimize_kprobe(op); | ||
| 646 | put_online_cpus(); | ||
| 647 | if (kprobe_disabled(&op->kp)) | ||
| 648 | arch_disarm_kprobe(&op->kp); | ||
| 503 | } | 649 | } |
| 504 | 650 | ||
| 505 | /* Unoptimize a kprobe if p is optimized */ | 651 | /* Unoptimize a kprobe if p is optimized */ |
| 506 | static __kprobes void unoptimize_kprobe(struct kprobe *p) | 652 | static __kprobes void unoptimize_kprobe(struct kprobe *p, bool force) |
| 507 | { | 653 | { |
| 508 | struct optimized_kprobe *op; | 654 | struct optimized_kprobe *op; |
| 509 | 655 | ||
| 510 | if ((p->flags & KPROBE_FLAG_OPTIMIZED) && kprobe_aggrprobe(p)) { | 656 | if (!kprobe_aggrprobe(p) || kprobe_disarmed(p)) |
| 511 | op = container_of(p, struct optimized_kprobe, kp); | 657 | return; /* This is not an optprobe nor optimized */ |
| 512 | if (!list_empty(&op->list)) | 658 | |
| 513 | /* Dequeue from the optimization queue */ | 659 | op = container_of(p, struct optimized_kprobe, kp); |
| 660 | if (!kprobe_optimized(p)) { | ||
| 661 | /* Unoptimized or unoptimizing case */ | ||
| 662 | if (force && !list_empty(&op->list)) { | ||
| 663 | /* | ||
| 664 | * Only if this is an unoptimizing kprobe and it is forced, | ||
| 665 | * forcibly unoptimize it. (No need to unoptimize an | ||
| 666 | * already-unoptimized kprobe again :) | ||
| 667 | */ | ||
| 514 | list_del_init(&op->list); | 668 | list_del_init(&op->list); |
| 515 | else | 669 | force_unoptimize_kprobe(op); |
| 516 | /* Replace jump with break */ | 670 | } |
| 517 | arch_unoptimize_kprobe(op); | 671 | return; |
| 518 | op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; | 672 | } |
| 673 | |||
| 674 | op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; | ||
| 675 | if (!list_empty(&op->list)) { | ||
| 676 | /* Dequeue from the optimization queue */ | ||
| 677 | list_del_init(&op->list); | ||
| 678 | return; | ||
| 679 | } | ||
| 680 | /* Optimized kprobe case */ | ||
| 681 | if (force) | ||
| 682 | /* Forcibly update the code: this is a special case */ | ||
| 683 | force_unoptimize_kprobe(op); | ||
| 684 | else { | ||
| 685 | list_add(&op->list, &unoptimizing_list); | ||
| 686 | kick_kprobe_optimizer(); | ||
| 519 | } | 687 | } |
| 520 | } | 688 | } |
| 521 | 689 | ||
| 690 | /* Cancel unoptimizing for reusing */ | ||
| 691 | static void reuse_unused_kprobe(struct kprobe *ap) | ||
| 692 | { | ||
| 693 | struct optimized_kprobe *op; | ||
| 694 | |||
| 695 | BUG_ON(!kprobe_unused(ap)); | ||
| 696 | /* | ||
| 697 | * An unused kprobe MUST be in the middle of delayed unoptimizing (meaning | ||
| 698 | * there is still a relative jump) and disabled. | ||
| 699 | */ | ||
| 700 | op = container_of(ap, struct optimized_kprobe, kp); | ||
| 701 | if (unlikely(list_empty(&op->list))) | ||
| 702 | printk(KERN_WARNING "Warning: found a stray unused " | ||
| 703 | "aggrprobe@%p\n", ap->addr); | ||
| 704 | /* Enable the probe again */ | ||
| 705 | ap->flags &= ~KPROBE_FLAG_DISABLED; | ||
| 706 | /* Optimize it again (remove from op->list) */ | ||
| 707 | BUG_ON(!kprobe_optready(ap)); | ||
| 708 | optimize_kprobe(ap); | ||
| 709 | } | ||
| 710 | |||
| 522 | /* Remove optimized instructions */ | 711 | /* Remove optimized instructions */ |
| 523 | static void __kprobes kill_optimized_kprobe(struct kprobe *p) | 712 | static void __kprobes kill_optimized_kprobe(struct kprobe *p) |
| 524 | { | 713 | { |
| 525 | struct optimized_kprobe *op; | 714 | struct optimized_kprobe *op; |
| 526 | 715 | ||
| 527 | op = container_of(p, struct optimized_kprobe, kp); | 716 | op = container_of(p, struct optimized_kprobe, kp); |
| 528 | if (!list_empty(&op->list)) { | 717 | if (!list_empty(&op->list)) |
| 529 | /* Dequeue from the optimization queue */ | 718 | /* Dequeue from the (un)optimization queue */ |
| 530 | list_del_init(&op->list); | 719 | list_del_init(&op->list); |
| 531 | op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; | 720 | |
| 532 | } | 721 | op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; |
| 533 | /* Don't unoptimize, because the target code will be freed. */ | 722 | /* Don't touch the code, because it is already freed. */ |
| 534 | arch_remove_optimized_kprobe(op); | 723 | arch_remove_optimized_kprobe(op); |
| 535 | } | 724 | } |
| 536 | 725 | ||
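
Editor's note: unoptimize_kprobe() above now distinguishes whether the probe is an optprobe at all, whether it is already disarmed, whether it is still queued for optimization, and whether the caller forces an immediate text patch. A small user-space sketch of that decision, with booleans standing in for the real flags and list membership (all names invented):

#include <stdbool.h>
#include <stdio.h>

enum action { DO_NOTHING, DEQUEUE_ONLY, FORCE_UNOPTIMIZE, QUEUE_UNOPTIMIZE };

static enum action unoptimize_decision(bool aggr, bool disarmed, bool optimized,
				       bool queued, bool force)
{
	if (!aggr || disarmed)
		return DO_NOTHING;		/* not an optprobe, or already disarmed */
	if (!optimized)
		/* already unoptimized, or queued for delayed unoptimizing */
		return (force && queued) ? FORCE_UNOPTIMIZE : DO_NOTHING;
	if (queued)
		return DEQUEUE_ONLY;		/* still waiting to be optimized: just drop it */
	return force ? FORCE_UNOPTIMIZE : QUEUE_UNOPTIMIZE;	/* live jump: patch now or defer */
}

int main(void)
{
	/* optimized, not queued, not forced: goes onto the unoptimizing list */
	printf("%d\n", unoptimize_decision(true, false, true, false, false));
	/* optimized but still queued for optimization: dequeuing is enough */
	printf("%d\n", unoptimize_decision(true, false, true, true, false));
	return 0;
}
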
| @@ -543,16 +732,6 @@ static __kprobes void prepare_optimized_kprobe(struct kprobe *p) | |||
| 543 | arch_prepare_optimized_kprobe(op); | 732 | arch_prepare_optimized_kprobe(op); |
| 544 | } | 733 | } |
| 545 | 734 | ||
| 546 | /* Free optimized instructions and optimized_kprobe */ | ||
| 547 | static __kprobes void free_aggr_kprobe(struct kprobe *p) | ||
| 548 | { | ||
| 549 | struct optimized_kprobe *op; | ||
| 550 | |||
| 551 | op = container_of(p, struct optimized_kprobe, kp); | ||
| 552 | arch_remove_optimized_kprobe(op); | ||
| 553 | kfree(op); | ||
| 554 | } | ||
| 555 | |||
| 556 | /* Allocate new optimized_kprobe and try to prepare optimized instructions */ | 735 | /* Allocate new optimized_kprobe and try to prepare optimized instructions */ |
| 557 | static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) | 736 | static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) |
| 558 | { | 737 | { |
| @@ -587,7 +766,8 @@ static __kprobes void try_to_optimize_kprobe(struct kprobe *p) | |||
| 587 | op = container_of(ap, struct optimized_kprobe, kp); | 766 | op = container_of(ap, struct optimized_kprobe, kp); |
| 588 | if (!arch_prepared_optinsn(&op->optinsn)) { | 767 | if (!arch_prepared_optinsn(&op->optinsn)) { |
| 589 | /* If failed to setup optimizing, fallback to kprobe */ | 768 | /* If failed to setup optimizing, fallback to kprobe */ |
| 590 | free_aggr_kprobe(ap); | 769 | arch_remove_optimized_kprobe(op); |
| 770 | kfree(op); | ||
| 591 | return; | 771 | return; |
| 592 | } | 772 | } |
| 593 | 773 | ||
| @@ -631,21 +811,16 @@ static void __kprobes unoptimize_all_kprobes(void) | |||
| 631 | return; | 811 | return; |
| 632 | 812 | ||
| 633 | kprobes_allow_optimization = false; | 813 | kprobes_allow_optimization = false; |
| 634 | printk(KERN_INFO "Kprobes globally unoptimized\n"); | ||
| 635 | get_online_cpus(); /* For avoiding text_mutex deadlock */ | ||
| 636 | mutex_lock(&text_mutex); | ||
| 637 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { | 814 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { |
| 638 | head = &kprobe_table[i]; | 815 | head = &kprobe_table[i]; |
| 639 | hlist_for_each_entry_rcu(p, node, head, hlist) { | 816 | hlist_for_each_entry_rcu(p, node, head, hlist) { |
| 640 | if (!kprobe_disabled(p)) | 817 | if (!kprobe_disabled(p)) |
| 641 | unoptimize_kprobe(p); | 818 | unoptimize_kprobe(p, false); |
| 642 | } | 819 | } |
| 643 | } | 820 | } |
| 644 | 821 | /* Wait for unoptimizing completion */ | |
| 645 | mutex_unlock(&text_mutex); | 822 | wait_for_kprobe_optimizer(); |
| 646 | put_online_cpus(); | 823 | printk(KERN_INFO "Kprobes globally unoptimized\n"); |
| 647 | /* Allow all currently running kprobes to complete */ | ||
| 648 | synchronize_sched(); | ||
| 649 | } | 824 | } |
| 650 | 825 | ||
| 651 | int sysctl_kprobes_optimization; | 826 | int sysctl_kprobes_optimization; |
| @@ -669,44 +844,60 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write, | |||
| 669 | } | 844 | } |
| 670 | #endif /* CONFIG_SYSCTL */ | 845 | #endif /* CONFIG_SYSCTL */ |
| 671 | 846 | ||
| 847 | /* Put a breakpoint for a probe. Must be called with text_mutex locked */ | ||
| 672 | static void __kprobes __arm_kprobe(struct kprobe *p) | 848 | static void __kprobes __arm_kprobe(struct kprobe *p) |
| 673 | { | 849 | { |
| 674 | struct kprobe *old_p; | 850 | struct kprobe *_p; |
| 675 | 851 | ||
| 676 | /* Check collision with other optimized kprobes */ | 852 | /* Check collision with other optimized kprobes */ |
| 677 | old_p = get_optimized_kprobe((unsigned long)p->addr); | 853 | _p = get_optimized_kprobe((unsigned long)p->addr); |
| 678 | if (unlikely(old_p)) | 854 | if (unlikely(_p)) |
| 679 | unoptimize_kprobe(old_p); /* Fallback to unoptimized kprobe */ | 855 | /* Fallback to unoptimized kprobe */ |
| 856 | unoptimize_kprobe(_p, true); | ||
| 680 | 857 | ||
| 681 | arch_arm_kprobe(p); | 858 | arch_arm_kprobe(p); |
| 682 | optimize_kprobe(p); /* Try to optimize (add kprobe to a list) */ | 859 | optimize_kprobe(p); /* Try to optimize (add kprobe to a list) */ |
| 683 | } | 860 | } |
| 684 | 861 | ||
| 685 | static void __kprobes __disarm_kprobe(struct kprobe *p) | 862 | /* Remove the breakpoint of a probe. Must be called with text_mutex locked */ |
| 863 | static void __kprobes __disarm_kprobe(struct kprobe *p, bool reopt) | ||
| 686 | { | 864 | { |
| 687 | struct kprobe *old_p; | 865 | struct kprobe *_p; |
| 688 | 866 | ||
| 689 | unoptimize_kprobe(p); /* Try to unoptimize */ | 867 | unoptimize_kprobe(p, false); /* Try to unoptimize */ |
| 690 | arch_disarm_kprobe(p); | ||
| 691 | 868 | ||
| 692 | /* If another kprobe was blocked, optimize it. */ | 869 | if (!kprobe_queued(p)) { |
| 693 | old_p = get_optimized_kprobe((unsigned long)p->addr); | 870 | arch_disarm_kprobe(p); |
| 694 | if (unlikely(old_p)) | 871 | /* If another kprobe was blocked, optimize it. */ |
| 695 | optimize_kprobe(old_p); | 872 | _p = get_optimized_kprobe((unsigned long)p->addr); |
| 873 | if (unlikely(_p) && reopt) | ||
| 874 | optimize_kprobe(_p); | ||
| 875 | } | ||
| 876 | /* TODO: reoptimize others after unoptimizing this probe */ | ||
| 696 | } | 877 | } |
| 697 | 878 | ||
| 698 | #else /* !CONFIG_OPTPROBES */ | 879 | #else /* !CONFIG_OPTPROBES */ |
| 699 | 880 | ||
| 700 | #define optimize_kprobe(p) do {} while (0) | 881 | #define optimize_kprobe(p) do {} while (0) |
| 701 | #define unoptimize_kprobe(p) do {} while (0) | 882 | #define unoptimize_kprobe(p, f) do {} while (0) |
| 702 | #define kill_optimized_kprobe(p) do {} while (0) | 883 | #define kill_optimized_kprobe(p) do {} while (0) |
| 703 | #define prepare_optimized_kprobe(p) do {} while (0) | 884 | #define prepare_optimized_kprobe(p) do {} while (0) |
| 704 | #define try_to_optimize_kprobe(p) do {} while (0) | 885 | #define try_to_optimize_kprobe(p) do {} while (0) |
| 705 | #define __arm_kprobe(p) arch_arm_kprobe(p) | 886 | #define __arm_kprobe(p) arch_arm_kprobe(p) |
| 706 | #define __disarm_kprobe(p) arch_disarm_kprobe(p) | 887 | #define __disarm_kprobe(p, o) arch_disarm_kprobe(p) |
| 888 | #define kprobe_disarmed(p) kprobe_disabled(p) | ||
| 889 | #define wait_for_kprobe_optimizer() do {} while (0) | ||
| 890 | |||
| 891 | /* There should be no unused kprobes that can be reused without optimization */ | ||
| 892 | static void reuse_unused_kprobe(struct kprobe *ap) | ||
| 893 | { | ||
| 894 | printk(KERN_ERR "Error: There should be no unused kprobe here.\n"); | ||
| 895 | BUG_ON(kprobe_unused(ap)); | ||
| 896 | } | ||
| 707 | 897 | ||
| 708 | static __kprobes void free_aggr_kprobe(struct kprobe *p) | 898 | static __kprobes void free_aggr_kprobe(struct kprobe *p) |
| 709 | { | 899 | { |
| 900 | arch_remove_kprobe(p); | ||
| 710 | kfree(p); | 901 | kfree(p); |
| 711 | } | 902 | } |
| 712 | 903 | ||
| @@ -732,11 +923,10 @@ static void __kprobes arm_kprobe(struct kprobe *kp) | |||
| 732 | /* Disarm a kprobe with text_mutex */ | 923 | /* Disarm a kprobe with text_mutex */ |
| 733 | static void __kprobes disarm_kprobe(struct kprobe *kp) | 924 | static void __kprobes disarm_kprobe(struct kprobe *kp) |
| 734 | { | 925 | { |
| 735 | get_online_cpus(); /* For avoiding text_mutex deadlock */ | 926 | /* Ditto */ |
| 736 | mutex_lock(&text_mutex); | 927 | mutex_lock(&text_mutex); |
| 737 | __disarm_kprobe(kp); | 928 | __disarm_kprobe(kp, true); |
| 738 | mutex_unlock(&text_mutex); | 929 | mutex_unlock(&text_mutex); |
| 739 | put_online_cpus(); | ||
| 740 | } | 930 | } |
| 741 | 931 | ||
| 742 | /* | 932 | /* |
| @@ -775,7 +965,7 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs, | |||
| 775 | static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, | 965 | static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, |
| 776 | int trapnr) | 966 | int trapnr) |
| 777 | { | 967 | { |
| 778 | struct kprobe *cur = __get_cpu_var(kprobe_instance); | 968 | struct kprobe *cur = __this_cpu_read(kprobe_instance); |
| 779 | 969 | ||
| 780 | /* | 970 | /* |
| 781 | * if we faulted "during" the execution of a user specified | 971 | * if we faulted "during" the execution of a user specified |
| @@ -790,7 +980,7 @@ static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, | |||
| 790 | 980 | ||
| 791 | static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs) | 981 | static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs) |
| 792 | { | 982 | { |
| 793 | struct kprobe *cur = __get_cpu_var(kprobe_instance); | 983 | struct kprobe *cur = __this_cpu_read(kprobe_instance); |
| 794 | int ret = 0; | 984 | int ret = 0; |
| 795 | 985 | ||
| 796 | if (cur && cur->break_handler) { | 986 | if (cur && cur->break_handler) { |
| @@ -942,7 +1132,7 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) | |||
| 942 | BUG_ON(kprobe_gone(ap) || kprobe_gone(p)); | 1132 | BUG_ON(kprobe_gone(ap) || kprobe_gone(p)); |
| 943 | 1133 | ||
| 944 | if (p->break_handler || p->post_handler) | 1134 | if (p->break_handler || p->post_handler) |
| 945 | unoptimize_kprobe(ap); /* Fall back to normal kprobe */ | 1135 | unoptimize_kprobe(ap, true); /* Fall back to normal kprobe */ |
| 946 | 1136 | ||
| 947 | if (p->break_handler) { | 1137 | if (p->break_handler) { |
| 948 | if (ap->break_handler) | 1138 | if (ap->break_handler) |
| @@ -993,19 +1183,21 @@ static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p) | |||
| 993 | * This is the second or subsequent kprobe at the address - handle | 1183 | * This is the second or subsequent kprobe at the address - handle |
| 994 | * the intricacies | 1184 | * the intricacies |
| 995 | */ | 1185 | */ |
| 996 | static int __kprobes register_aggr_kprobe(struct kprobe *old_p, | 1186 | static int __kprobes register_aggr_kprobe(struct kprobe *orig_p, |
| 997 | struct kprobe *p) | 1187 | struct kprobe *p) |
| 998 | { | 1188 | { |
| 999 | int ret = 0; | 1189 | int ret = 0; |
| 1000 | struct kprobe *ap = old_p; | 1190 | struct kprobe *ap = orig_p; |
| 1001 | 1191 | ||
| 1002 | if (!kprobe_aggrprobe(old_p)) { | 1192 | if (!kprobe_aggrprobe(orig_p)) { |
| 1003 | /* If old_p is not an aggr_kprobe, create new aggr_kprobe. */ | 1193 | /* If orig_p is not an aggr_kprobe, create new aggr_kprobe. */ |
| 1004 | ap = alloc_aggr_kprobe(old_p); | 1194 | ap = alloc_aggr_kprobe(orig_p); |
| 1005 | if (!ap) | 1195 | if (!ap) |
| 1006 | return -ENOMEM; | 1196 | return -ENOMEM; |
| 1007 | init_aggr_kprobe(ap, old_p); | 1197 | init_aggr_kprobe(ap, orig_p); |
| 1008 | } | 1198 | } else if (kprobe_unused(ap)) |
| 1199 | /* This probe is going to die. Rescue it */ | ||
| 1200 | reuse_unused_kprobe(ap); | ||
| 1009 | 1201 | ||
| 1010 | if (kprobe_gone(ap)) { | 1202 | if (kprobe_gone(ap)) { |
| 1011 | /* | 1203 | /* |
| @@ -1039,23 +1231,6 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p, | |||
| 1039 | return add_new_kprobe(ap, p); | 1231 | return add_new_kprobe(ap, p); |
| 1040 | } | 1232 | } |
| 1041 | 1233 | ||
| 1042 | /* Try to disable aggr_kprobe, and return 1 if succeeded.*/ | ||
| 1043 | static int __kprobes try_to_disable_aggr_kprobe(struct kprobe *p) | ||
| 1044 | { | ||
| 1045 | struct kprobe *kp; | ||
| 1046 | |||
| 1047 | list_for_each_entry_rcu(kp, &p->list, list) { | ||
| 1048 | if (!kprobe_disabled(kp)) | ||
| 1049 | /* | ||
| 1050 | * There is an active probe on the list. | ||
| 1051 | * We can't disable aggr_kprobe. | ||
| 1052 | */ | ||
| 1053 | return 0; | ||
| 1054 | } | ||
| 1055 | p->flags |= KPROBE_FLAG_DISABLED; | ||
| 1056 | return 1; | ||
| 1057 | } | ||
| 1058 | |||
| 1059 | static int __kprobes in_kprobes_functions(unsigned long addr) | 1234 | static int __kprobes in_kprobes_functions(unsigned long addr) |
| 1060 | { | 1235 | { |
| 1061 | struct kprobe_blackpoint *kb; | 1236 | struct kprobe_blackpoint *kb; |
| @@ -1098,34 +1273,33 @@ static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p) | |||
| 1098 | /* Check passed kprobe is valid and return kprobe in kprobe_table. */ | 1273 | /* Check passed kprobe is valid and return kprobe in kprobe_table. */ |
| 1099 | static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p) | 1274 | static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p) |
| 1100 | { | 1275 | { |
| 1101 | struct kprobe *old_p, *list_p; | 1276 | struct kprobe *ap, *list_p; |
| 1102 | 1277 | ||
| 1103 | old_p = get_kprobe(p->addr); | 1278 | ap = get_kprobe(p->addr); |
| 1104 | if (unlikely(!old_p)) | 1279 | if (unlikely(!ap)) |
| 1105 | return NULL; | 1280 | return NULL; |
| 1106 | 1281 | ||
| 1107 | if (p != old_p) { | 1282 | if (p != ap) { |
| 1108 | list_for_each_entry_rcu(list_p, &old_p->list, list) | 1283 | list_for_each_entry_rcu(list_p, &ap->list, list) |
| 1109 | if (list_p == p) | 1284 | if (list_p == p) |
| 1110 | /* kprobe p is a valid probe */ | 1285 | /* kprobe p is a valid probe */ |
| 1111 | goto valid; | 1286 | goto valid; |
| 1112 | return NULL; | 1287 | return NULL; |
| 1113 | } | 1288 | } |
| 1114 | valid: | 1289 | valid: |
| 1115 | return old_p; | 1290 | return ap; |
| 1116 | } | 1291 | } |
| 1117 | 1292 | ||
| 1118 | /* Return error if the kprobe is being re-registered */ | 1293 | /* Return error if the kprobe is being re-registered */ |
| 1119 | static inline int check_kprobe_rereg(struct kprobe *p) | 1294 | static inline int check_kprobe_rereg(struct kprobe *p) |
| 1120 | { | 1295 | { |
| 1121 | int ret = 0; | 1296 | int ret = 0; |
| 1122 | struct kprobe *old_p; | ||
| 1123 | 1297 | ||
| 1124 | mutex_lock(&kprobe_mutex); | 1298 | mutex_lock(&kprobe_mutex); |
| 1125 | old_p = __get_valid_kprobe(p); | 1299 | if (__get_valid_kprobe(p)) |
| 1126 | if (old_p) | ||
| 1127 | ret = -EINVAL; | 1300 | ret = -EINVAL; |
| 1128 | mutex_unlock(&kprobe_mutex); | 1301 | mutex_unlock(&kprobe_mutex); |
| 1302 | |||
| 1129 | return ret; | 1303 | return ret; |
| 1130 | } | 1304 | } |
| 1131 | 1305 | ||
| @@ -1229,67 +1403,121 @@ fail_with_jump_label: | |||
| 1229 | } | 1403 | } |
| 1230 | EXPORT_SYMBOL_GPL(register_kprobe); | 1404 | EXPORT_SYMBOL_GPL(register_kprobe); |
| 1231 | 1405 | ||
| 1406 | /* Check if all probes on the aggrprobe are disabled */ | ||
| 1407 | static int __kprobes aggr_kprobe_disabled(struct kprobe *ap) | ||
| 1408 | { | ||
| 1409 | struct kprobe *kp; | ||
| 1410 | |||
| 1411 | list_for_each_entry_rcu(kp, &ap->list, list) | ||
| 1412 | if (!kprobe_disabled(kp)) | ||
| 1413 | /* | ||
| 1414 | * There is an active probe on the list. | ||
| 1415 | * We can't disable this ap. | ||
| 1416 | */ | ||
| 1417 | return 0; | ||
| 1418 | |||
| 1419 | return 1; | ||
| 1420 | } | ||
| 1421 | |||
| 1422 | /* Disable one kprobe: must be called with kprobe_mutex held */ | ||
| 1423 | static struct kprobe *__kprobes __disable_kprobe(struct kprobe *p) | ||
| 1424 | { | ||
| 1425 | struct kprobe *orig_p; | ||
| 1426 | |||
| 1427 | /* Get an original kprobe for return */ | ||
| 1428 | orig_p = __get_valid_kprobe(p); | ||
| 1429 | if (unlikely(orig_p == NULL)) | ||
| 1430 | return NULL; | ||
| 1431 | |||
| 1432 | if (!kprobe_disabled(p)) { | ||
| 1433 | /* Disable probe if it is a child probe */ | ||
| 1434 | if (p != orig_p) | ||
| 1435 | p->flags |= KPROBE_FLAG_DISABLED; | ||
| 1436 | |||
| 1437 | /* Try to disarm and disable this/parent probe */ | ||
| 1438 | if (p == orig_p || aggr_kprobe_disabled(orig_p)) { | ||
| 1439 | disarm_kprobe(orig_p); | ||
| 1440 | orig_p->flags |= KPROBE_FLAG_DISABLED; | ||
| 1441 | } | ||
| 1442 | } | ||
| 1443 | |||
| 1444 | return orig_p; | ||
| 1445 | } | ||
| 1446 | |||
| 1232 | /* | 1447 | /* |
| 1233 | * Unregister a kprobe without a scheduler synchronization. | 1448 | * Unregister a kprobe without a scheduler synchronization. |
| 1234 | */ | 1449 | */ |
| 1235 | static int __kprobes __unregister_kprobe_top(struct kprobe *p) | 1450 | static int __kprobes __unregister_kprobe_top(struct kprobe *p) |
| 1236 | { | 1451 | { |
| 1237 | struct kprobe *old_p, *list_p; | 1452 | struct kprobe *ap, *list_p; |
| 1238 | 1453 | ||
| 1239 | old_p = __get_valid_kprobe(p); | 1454 | /* Disable kprobe. This will disarm it if needed. */ |
| 1240 | if (old_p == NULL) | 1455 | ap = __disable_kprobe(p); |
| 1456 | if (ap == NULL) | ||
| 1241 | return -EINVAL; | 1457 | return -EINVAL; |
| 1242 | 1458 | ||
| 1243 | if (old_p == p || | 1459 | if (ap == p) |
| 1244 | (kprobe_aggrprobe(old_p) && | ||
| 1245 | list_is_singular(&old_p->list))) { | ||
| 1246 | /* | 1460 | /* |
| 1247 | * Only probe on the hash list. Disarm only if kprobes are | 1461 | * This probe is an independent (and non-optimized) kprobe |
| 1248 | * enabled and not gone - otherwise, the breakpoint would | 1462 | * (not an aggrprobe). Remove from the hash list. |
| 1249 | * already have been removed. We save on flushing icache. | ||
| 1250 | */ | 1463 | */ |
| 1251 | if (!kprobes_all_disarmed && !kprobe_disabled(old_p)) | 1464 | goto disarmed; |
| 1252 | disarm_kprobe(old_p); | 1465 | |
| 1253 | hlist_del_rcu(&old_p->hlist); | 1466 | /* Following process expects this probe is an aggrprobe */ |
| 1254 | } else { | 1467 | WARN_ON(!kprobe_aggrprobe(ap)); |
| 1468 | |||
| 1469 | if (list_is_singular(&ap->list) && kprobe_disarmed(ap)) | ||
| 1470 | /* | ||
| 1471 | * !disarmed can happen if the probe is under delayed | ||
| 1472 | * unoptimizing. | ||
| 1473 | */ | ||
| 1474 | goto disarmed; | ||
| 1475 | else { | ||
| 1476 | /* If disabling probe has special handlers, update aggrprobe */ | ||
| 1255 | if (p->break_handler && !kprobe_gone(p)) | 1477 | if (p->break_handler && !kprobe_gone(p)) |
| 1256 | old_p->break_handler = NULL; | 1478 | ap->break_handler = NULL; |
| 1257 | if (p->post_handler && !kprobe_gone(p)) { | 1479 | if (p->post_handler && !kprobe_gone(p)) { |
| 1258 | list_for_each_entry_rcu(list_p, &old_p->list, list) { | 1480 | list_for_each_entry_rcu(list_p, &ap->list, list) { |
| 1259 | if ((list_p != p) && (list_p->post_handler)) | 1481 | if ((list_p != p) && (list_p->post_handler)) |
| 1260 | goto noclean; | 1482 | goto noclean; |
| 1261 | } | 1483 | } |
| 1262 | old_p->post_handler = NULL; | 1484 | ap->post_handler = NULL; |
| 1263 | } | 1485 | } |
| 1264 | noclean: | 1486 | noclean: |
| 1487 | /* | ||
| 1488 | * Remove from the aggrprobe: this path will do nothing in | ||
| 1489 | * __unregister_kprobe_bottom(). | ||
| 1490 | */ | ||
| 1265 | list_del_rcu(&p->list); | 1491 | list_del_rcu(&p->list); |
| 1266 | if (!kprobe_disabled(old_p)) { | 1492 | if (!kprobe_disabled(ap) && !kprobes_all_disarmed) |
| 1267 | try_to_disable_aggr_kprobe(old_p); | 1493 | /* |
| 1268 | if (!kprobes_all_disarmed) { | 1494 | * Try to optimize this probe again, because post |
| 1269 | if (kprobe_disabled(old_p)) | 1495 | * handler may have been changed. |
| 1270 | disarm_kprobe(old_p); | 1496 | */ |
| 1271 | else | 1497 | optimize_kprobe(ap); |
| 1272 | /* Try to optimize this probe again */ | ||
| 1273 | optimize_kprobe(old_p); | ||
| 1274 | } | ||
| 1275 | } | ||
| 1276 | } | 1498 | } |
| 1277 | return 0; | 1499 | return 0; |
| 1500 | |||
| 1501 | disarmed: | ||
| 1502 | BUG_ON(!kprobe_disarmed(ap)); | ||
| 1503 | hlist_del_rcu(&ap->hlist); | ||
| 1504 | return 0; | ||
| 1278 | } | 1505 | } |
| 1279 | 1506 | ||
| 1280 | static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) | 1507 | static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) |
| 1281 | { | 1508 | { |
| 1282 | struct kprobe *old_p; | 1509 | struct kprobe *ap; |
| 1283 | 1510 | ||
| 1284 | if (list_empty(&p->list)) | 1511 | if (list_empty(&p->list)) |
| 1512 | /* This is an independent kprobe */ | ||
| 1285 | arch_remove_kprobe(p); | 1513 | arch_remove_kprobe(p); |
| 1286 | else if (list_is_singular(&p->list)) { | 1514 | else if (list_is_singular(&p->list)) { |
| 1287 | /* "p" is the last child of an aggr_kprobe */ | 1515 | /* This is the last child of an aggrprobe */ |
| 1288 | old_p = list_entry(p->list.next, struct kprobe, list); | 1516 | ap = list_entry(p->list.next, struct kprobe, list); |
| 1289 | list_del(&p->list); | 1517 | list_del(&p->list); |
| 1290 | arch_remove_kprobe(old_p); | 1518 | free_aggr_kprobe(ap); |
| 1291 | free_aggr_kprobe(old_p); | ||
| 1292 | } | 1519 | } |
| 1520 | /* Otherwise, do nothing. */ | ||
| 1293 | } | 1521 | } |
| 1294 | 1522 | ||
| 1295 | int __kprobes register_kprobes(struct kprobe **kps, int num) | 1523 | int __kprobes register_kprobes(struct kprobe **kps, int num) |
| @@ -1607,29 +1835,13 @@ static void __kprobes kill_kprobe(struct kprobe *p) | |||
| 1607 | int __kprobes disable_kprobe(struct kprobe *kp) | 1835 | int __kprobes disable_kprobe(struct kprobe *kp) |
| 1608 | { | 1836 | { |
| 1609 | int ret = 0; | 1837 | int ret = 0; |
| 1610 | struct kprobe *p; | ||
| 1611 | 1838 | ||
| 1612 | mutex_lock(&kprobe_mutex); | 1839 | mutex_lock(&kprobe_mutex); |
| 1613 | 1840 | ||
| 1614 | /* Check whether specified probe is valid. */ | 1841 | /* Disable this kprobe */ |
| 1615 | p = __get_valid_kprobe(kp); | 1842 | if (__disable_kprobe(kp) == NULL) |
| 1616 | if (unlikely(p == NULL)) { | ||
| 1617 | ret = -EINVAL; | 1843 | ret = -EINVAL; |
| 1618 | goto out; | ||
| 1619 | } | ||
| 1620 | 1844 | ||
| 1621 | /* If the probe is already disabled (or gone), just return */ | ||
| 1622 | if (kprobe_disabled(kp)) | ||
| 1623 | goto out; | ||
| 1624 | |||
| 1625 | kp->flags |= KPROBE_FLAG_DISABLED; | ||
| 1626 | if (p != kp) | ||
| 1627 | /* When kp != p, p is always enabled. */ | ||
| 1628 | try_to_disable_aggr_kprobe(p); | ||
| 1629 | |||
| 1630 | if (!kprobes_all_disarmed && kprobe_disabled(p)) | ||
| 1631 | disarm_kprobe(p); | ||
| 1632 | out: | ||
| 1633 | mutex_unlock(&kprobe_mutex); | 1845 | mutex_unlock(&kprobe_mutex); |
| 1634 | return ret; | 1846 | return ret; |
| 1635 | } | 1847 | } |
| @@ -1927,36 +2139,27 @@ static void __kprobes disarm_all_kprobes(void) | |||
| 1927 | mutex_lock(&kprobe_mutex); | 2139 | mutex_lock(&kprobe_mutex); |
| 1928 | 2140 | ||
| 1929 | /* If kprobes are already disarmed, just return */ | 2141 | /* If kprobes are already disarmed, just return */ |
| 1930 | if (kprobes_all_disarmed) | 2142 | if (kprobes_all_disarmed) { |
| 1931 | goto already_disabled; | 2143 | mutex_unlock(&kprobe_mutex); |
| 2144 | return; | ||
| 2145 | } | ||
| 1932 | 2146 | ||
| 1933 | kprobes_all_disarmed = true; | 2147 | kprobes_all_disarmed = true; |
| 1934 | printk(KERN_INFO "Kprobes globally disabled\n"); | 2148 | printk(KERN_INFO "Kprobes globally disabled\n"); |
| 1935 | 2149 | ||
| 1936 | /* | ||
| 1937 | * Here we call get_online_cpus() for avoiding text_mutex deadlock, | ||
| 1938 | * because disarming may also unoptimize kprobes. | ||
| 1939 | */ | ||
| 1940 | get_online_cpus(); | ||
| 1941 | mutex_lock(&text_mutex); | 2150 | mutex_lock(&text_mutex); |
| 1942 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { | 2151 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { |
| 1943 | head = &kprobe_table[i]; | 2152 | head = &kprobe_table[i]; |
| 1944 | hlist_for_each_entry_rcu(p, node, head, hlist) { | 2153 | hlist_for_each_entry_rcu(p, node, head, hlist) { |
| 1945 | if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) | 2154 | if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) |
| 1946 | __disarm_kprobe(p); | 2155 | __disarm_kprobe(p, false); |
| 1947 | } | 2156 | } |
| 1948 | } | 2157 | } |
| 1949 | |||
| 1950 | mutex_unlock(&text_mutex); | 2158 | mutex_unlock(&text_mutex); |
| 1951 | put_online_cpus(); | ||
| 1952 | mutex_unlock(&kprobe_mutex); | 2159 | mutex_unlock(&kprobe_mutex); |
| 1953 | /* Allow all currently running kprobes to complete */ | ||
| 1954 | synchronize_sched(); | ||
| 1955 | return; | ||
| 1956 | 2160 | ||
| 1957 | already_disabled: | 2161 | /* Wait for disarming all kprobes by optimizer */ |
| 1958 | mutex_unlock(&kprobe_mutex); | 2162 | wait_for_kprobe_optimizer(); |
| 1959 | return; | ||
| 1960 | } | 2163 | } |
| 1961 | 2164 | ||
| 1962 | /* | 2165 | /* |
diff --git a/kernel/kthread.c b/kernel/kthread.c index 2dc3786349d..c55afba990a 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
| @@ -148,7 +148,7 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), | |||
| 148 | wait_for_completion(&create.done); | 148 | wait_for_completion(&create.done); |
| 149 | 149 | ||
| 150 | if (!IS_ERR(create.result)) { | 150 | if (!IS_ERR(create.result)) { |
| 151 | struct sched_param param = { .sched_priority = 0 }; | 151 | static const struct sched_param param = { .sched_priority = 0 }; |
| 152 | va_list args; | 152 | va_list args; |
| 153 | 153 | ||
| 154 | va_start(args, namefmt); | 154 | va_start(args, namefmt); |
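
Editor's note: the kthread.c hunk above makes the zero-priority sched_param static const, so the structure lives in read-only data instead of being rebuilt on the stack on every call. A hedged kernel-style sketch of the same idiom, assuming the const-qualified sched_setscheduler_nocheck() prototype used by this tree (example_make_normal is an invented name):

#include <linux/sched.h>

static int example_make_normal(struct task_struct *task)
{
	static const struct sched_param param = { .sched_priority = 0 };

	return sched_setscheduler_nocheck(task, SCHED_NORMAL, &param);
}
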
| @@ -265,6 +265,17 @@ int kthreadd(void *unused) | |||
| 265 | return 0; | 265 | return 0; |
| 266 | } | 266 | } |
| 267 | 267 | ||
| 268 | void __init_kthread_worker(struct kthread_worker *worker, | ||
| 269 | const char *name, | ||
| 270 | struct lock_class_key *key) | ||
| 271 | { | ||
| 272 | spin_lock_init(&worker->lock); | ||
| 273 | lockdep_set_class_and_name(&worker->lock, key, name); | ||
| 274 | INIT_LIST_HEAD(&worker->work_list); | ||
| 275 | worker->task = NULL; | ||
| 276 | } | ||
| 277 | EXPORT_SYMBOL_GPL(__init_kthread_worker); | ||
| 278 | |||
| 268 | /** | 279 | /** |
| 269 | * kthread_worker_fn - kthread function to process kthread_worker | 280 | * kthread_worker_fn - kthread function to process kthread_worker |
| 270 | * @worker_ptr: pointer to initialized kthread_worker | 281 | * @worker_ptr: pointer to initialized kthread_worker |
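
Editor's note: the new __init_kthread_worker() above backs the lockdep-aware initializers of the kthread_worker API. For orientation only, a hedged sketch of how such a worker is typically driven, assuming the DEFINE_KTHREAD_WORKER()/queue_kthread_work()/flush_kthread_worker() helpers of this kernel generation; the example_* names are invented:

#include <linux/err.h>
#include <linux/kthread.h>

static DEFINE_KTHREAD_WORKER(example_worker);

static void example_work_fn(struct kthread_work *work)
{
	/* runs in the dedicated worker thread, one queued item at a time */
}

static DEFINE_KTHREAD_WORK(example_work, example_work_fn);

static int example_start(void)
{
	struct task_struct *t;

	t = kthread_run(kthread_worker_fn, &example_worker, "example_worker");
	if (IS_ERR(t))
		return PTR_ERR(t);

	queue_kthread_work(&example_worker, &example_work);
	flush_kthread_worker(&example_worker);	/* wait until the item has run */
	return 0;
}
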
diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 877fb306d41..ee74b35e528 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c | |||
| @@ -194,14 +194,7 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) | |||
| 194 | 194 | ||
| 195 | account_global_scheduler_latency(tsk, &lat); | 195 | account_global_scheduler_latency(tsk, &lat); |
| 196 | 196 | ||
| 197 | /* | 197 | for (i = 0; i < tsk->latency_record_count; i++) { |
| 198 | * short term hack; if we're > 32 we stop; future we recycle: | ||
| 199 | */ | ||
| 200 | tsk->latency_record_count++; | ||
| 201 | if (tsk->latency_record_count >= LT_SAVECOUNT) | ||
| 202 | goto out_unlock; | ||
| 203 | |||
| 204 | for (i = 0; i < LT_SAVECOUNT; i++) { | ||
| 205 | struct latency_record *mylat; | 198 | struct latency_record *mylat; |
| 206 | int same = 1; | 199 | int same = 1; |
| 207 | 200 | ||
| @@ -227,8 +220,14 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) | |||
| 227 | } | 220 | } |
| 228 | } | 221 | } |
| 229 | 222 | ||
| 223 | /* | ||
| 224 | * short term hack; if we're > 32 we stop; future we recycle: | ||
| 225 | */ | ||
| 226 | if (tsk->latency_record_count >= LT_SAVECOUNT) | ||
| 227 | goto out_unlock; | ||
| 228 | |||
| 230 | /* Allocated a new one: */ | 229 | /* Allocated a new one: */ |
| 231 | i = tsk->latency_record_count; | 230 | i = tsk->latency_record_count++; |
| 232 | memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record)); | 231 | memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record)); |
| 233 | 232 | ||
| 234 | out_unlock: | 233 | out_unlock: |
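
Editor's note: the two latencytop hunks above stop bumping latency_record_count before searching: existing records are matched first, and a new slot is taken with i = tsk->latency_record_count++ only when there is room. A user-space toy of the corrected accounting (names invented):

#include <stdio.h>
#include <string.h>

#define LT_SAVECOUNT 32

struct record { char key[16]; int count; };

static struct record records[LT_SAVECOUNT];
static int record_count;

static void account(const char *key)
{
	int i;

	/* first try to merge with an existing entry */
	for (i = 0; i < record_count; i++) {
		if (strcmp(records[i].key, key) == 0) {
			records[i].count++;
			return;
		}
	}
	/* table full: drop the sample, as the kernel does */
	if (record_count >= LT_SAVECOUNT)
		return;
	/* allocate a new slot: i = tsk->latency_record_count++ in the patch */
	i = record_count++;
	strncpy(records[i].key, key, sizeof(records[i].key) - 1);
	records[i].count = 1;
}

int main(void)
{
	account("mutex_lock");
	account("mutex_lock");
	account("do_wait");
	printf("%s=%d %s=%d (n=%d)\n", records[0].key, records[0].count,
	       records[1].key, records[1].count, record_count);
	return 0;
}
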
| @@ -242,24 +241,19 @@ static int lstats_show(struct seq_file *m, void *v) | |||
| 242 | seq_puts(m, "Latency Top version : v0.1\n"); | 241 | seq_puts(m, "Latency Top version : v0.1\n"); |
| 243 | 242 | ||
| 244 | for (i = 0; i < MAXLR; i++) { | 243 | for (i = 0; i < MAXLR; i++) { |
| 245 | if (latency_record[i].backtrace[0]) { | 244 | struct latency_record *lr = &latency_record[i]; |
| 245 | |||
| 246 | if (lr->backtrace[0]) { | ||
| 246 | int q; | 247 | int q; |
| 247 | seq_printf(m, "%i %lu %lu ", | 248 | seq_printf(m, "%i %lu %lu", |
| 248 | latency_record[i].count, | 249 | lr->count, lr->time, lr->max); |
| 249 | latency_record[i].time, | ||
| 250 | latency_record[i].max); | ||
| 251 | for (q = 0; q < LT_BACKTRACEDEPTH; q++) { | 250 | for (q = 0; q < LT_BACKTRACEDEPTH; q++) { |
| 252 | char sym[KSYM_SYMBOL_LEN]; | 251 | unsigned long bt = lr->backtrace[q]; |
| 253 | char *c; | 252 | if (!bt) |
| 254 | if (!latency_record[i].backtrace[q]) | ||
| 255 | break; | 253 | break; |
| 256 | if (latency_record[i].backtrace[q] == ULONG_MAX) | 254 | if (bt == ULONG_MAX) |
| 257 | break; | 255 | break; |
| 258 | sprint_symbol(sym, latency_record[i].backtrace[q]); | 256 | seq_printf(m, " %ps", (void *)bt); |
| 259 | c = strchr(sym, '+'); | ||
| 260 | if (c) | ||
| 261 | *c = 0; | ||
| 262 | seq_printf(m, "%s ", sym); | ||
| 263 | } | 257 | } |
| 264 | seq_printf(m, "\n"); | 258 | seq_printf(m, "\n"); |
| 265 | } | 259 | } |
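
Editor's note: lstats_show() above now relies on the %ps printk extension instead of sprint_symbol() plus manual '+' trimming. A minimal kernel-style sketch of the two symbol formats (the function name is invented):

#include <linux/kernel.h>
#include <linux/seq_file.h>

static void example_print_symbol(struct seq_file *m, unsigned long addr)
{
	seq_printf(m, " %ps", (void *)addr);	/* symbol name only, e.g. "mutex_lock" */
	seq_printf(m, " %pS", (void *)addr);	/* symbol plus offset/size */
}
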
diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 42ba65dff7d..0d2058da80f 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c | |||
| @@ -2292,22 +2292,6 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark) | |||
| 2292 | } | 2292 | } |
| 2293 | 2293 | ||
| 2294 | /* | 2294 | /* |
| 2295 | * Debugging helper: via this flag we know that we are in | ||
| 2296 | * 'early bootup code', and will warn about any invalid irqs-on event: | ||
| 2297 | */ | ||
| 2298 | static int early_boot_irqs_enabled; | ||
| 2299 | |||
| 2300 | void early_boot_irqs_off(void) | ||
| 2301 | { | ||
| 2302 | early_boot_irqs_enabled = 0; | ||
| 2303 | } | ||
| 2304 | |||
| 2305 | void early_boot_irqs_on(void) | ||
| 2306 | { | ||
| 2307 | early_boot_irqs_enabled = 1; | ||
| 2308 | } | ||
| 2309 | |||
| 2310 | /* | ||
| 2311 | * Hardirqs will be enabled: | 2295 | * Hardirqs will be enabled: |
| 2312 | */ | 2296 | */ |
| 2313 | void trace_hardirqs_on_caller(unsigned long ip) | 2297 | void trace_hardirqs_on_caller(unsigned long ip) |
| @@ -2319,7 +2303,7 @@ void trace_hardirqs_on_caller(unsigned long ip) | |||
| 2319 | if (unlikely(!debug_locks || current->lockdep_recursion)) | 2303 | if (unlikely(!debug_locks || current->lockdep_recursion)) |
| 2320 | return; | 2304 | return; |
| 2321 | 2305 | ||
| 2322 | if (DEBUG_LOCKS_WARN_ON(unlikely(!early_boot_irqs_enabled))) | 2306 | if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled))) |
| 2323 | return; | 2307 | return; |
| 2324 | 2308 | ||
| 2325 | if (unlikely(curr->hardirqs_enabled)) { | 2309 | if (unlikely(curr->hardirqs_enabled)) { |
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 59b76c8ce9d..1969d2fc4b3 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c | |||
| @@ -494,7 +494,6 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data) | |||
| 494 | namelen += 2; | 494 | namelen += 2; |
| 495 | 495 | ||
| 496 | for (i = 0; i < LOCKSTAT_POINTS; i++) { | 496 | for (i = 0; i < LOCKSTAT_POINTS; i++) { |
| 497 | char sym[KSYM_SYMBOL_LEN]; | ||
| 498 | char ip[32]; | 497 | char ip[32]; |
| 499 | 498 | ||
| 500 | if (class->contention_point[i] == 0) | 499 | if (class->contention_point[i] == 0) |
| @@ -503,15 +502,13 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data) | |||
| 503 | if (!i) | 502 | if (!i) |
| 504 | seq_line(m, '-', 40-namelen, namelen); | 503 | seq_line(m, '-', 40-namelen, namelen); |
| 505 | 504 | ||
| 506 | sprint_symbol(sym, class->contention_point[i]); | ||
| 507 | snprintf(ip, sizeof(ip), "[<%p>]", | 505 | snprintf(ip, sizeof(ip), "[<%p>]", |
| 508 | (void *)class->contention_point[i]); | 506 | (void *)class->contention_point[i]); |
| 509 | seq_printf(m, "%40s %14lu %29s %s\n", name, | 507 | seq_printf(m, "%40s %14lu %29s %pS\n", |
| 510 | stats->contention_point[i], | 508 | name, stats->contention_point[i], |
| 511 | ip, sym); | 509 | ip, (void *)class->contention_point[i]); |
| 512 | } | 510 | } |
| 513 | for (i = 0; i < LOCKSTAT_POINTS; i++) { | 511 | for (i = 0; i < LOCKSTAT_POINTS; i++) { |
| 514 | char sym[KSYM_SYMBOL_LEN]; | ||
| 515 | char ip[32]; | 512 | char ip[32]; |
| 516 | 513 | ||
| 517 | if (class->contending_point[i] == 0) | 514 | if (class->contending_point[i] == 0) |
| @@ -520,12 +517,11 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data) | |||
| 520 | if (!i) | 517 | if (!i) |
| 521 | seq_line(m, '-', 40-namelen, namelen); | 518 | seq_line(m, '-', 40-namelen, namelen); |
| 522 | 519 | ||
| 523 | sprint_symbol(sym, class->contending_point[i]); | ||
| 524 | snprintf(ip, sizeof(ip), "[<%p>]", | 520 | snprintf(ip, sizeof(ip), "[<%p>]", |
| 525 | (void *)class->contending_point[i]); | 521 | (void *)class->contending_point[i]); |
| 526 | seq_printf(m, "%40s %14lu %29s %s\n", name, | 522 | seq_printf(m, "%40s %14lu %29s %pS\n", |
| 527 | stats->contending_point[i], | 523 | name, stats->contending_point[i], |
| 528 | ip, sym); | 524 | ip, (void *)class->contending_point[i]); |
| 529 | } | 525 | } |
| 530 | if (i) { | 526 | if (i) { |
| 531 | seq_puts(m, "\n"); | 527 | seq_puts(m, "\n"); |
diff --git a/kernel/module.c b/kernel/module.c index 437a74a7524..efa290ea94b 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
| @@ -56,6 +56,7 @@ | |||
| 56 | #include <linux/percpu.h> | 56 | #include <linux/percpu.h> |
| 57 | #include <linux/kmemleak.h> | 57 | #include <linux/kmemleak.h> |
| 58 | #include <linux/jump_label.h> | 58 | #include <linux/jump_label.h> |
| 59 | #include <linux/pfn.h> | ||
| 59 | 60 | ||
| 60 | #define CREATE_TRACE_POINTS | 61 | #define CREATE_TRACE_POINTS |
| 61 | #include <trace/events/module.h> | 62 | #include <trace/events/module.h> |
| @@ -70,6 +71,26 @@ | |||
| 70 | #define ARCH_SHF_SMALL 0 | 71 | #define ARCH_SHF_SMALL 0 |
| 71 | #endif | 72 | #endif |
| 72 | 73 | ||
| 74 | /* | ||
| 75 | * Modules' sections will be aligned on page boundaries | ||
| 76 | * to ensure complete separation of code and data, but | ||
| 77 | * only when CONFIG_DEBUG_SET_MODULE_RONX=y | ||
| 78 | */ | ||
| 79 | #ifdef CONFIG_DEBUG_SET_MODULE_RONX | ||
| 80 | # define debug_align(X) ALIGN(X, PAGE_SIZE) | ||
| 81 | #else | ||
| 82 | # define debug_align(X) (X) | ||
| 83 | #endif | ||
| 84 | |||
| 85 | /* | ||
| 86 | * Given BASE and SIZE this macro calculates the number of pages the | ||
| 87 | * memory region occupies | ||
| 88 | */ | ||
| 89 | #define MOD_NUMBER_OF_PAGES(BASE, SIZE) (((SIZE) > 0) ? \ | ||
| 90 | (PFN_DOWN((unsigned long)(BASE) + (SIZE) - 1) - \ | ||
| 91 | PFN_DOWN((unsigned long)BASE) + 1) \ | ||
| 92 | : (0UL)) | ||
| 93 | |||
| 73 | /* If this is set, the section belongs in the init part of the module */ | 94 | /* If this is set, the section belongs in the init part of the module */ |
| 74 | #define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) | 95 | #define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) |
| 75 | 96 | ||
| @@ -1542,6 +1563,115 @@ static int __unlink_module(void *_mod) | |||
| 1542 | return 0; | 1563 | return 0; |
| 1543 | } | 1564 | } |
| 1544 | 1565 | ||
| 1566 | #ifdef CONFIG_DEBUG_SET_MODULE_RONX | ||
| 1567 | /* | ||
| 1568 | * LKM RO/NX protection: protect module's text/ro-data | ||
| 1569 | * from modification and any data from execution. | ||
| 1570 | */ | ||
| 1571 | void set_page_attributes(void *start, void *end, int (*set)(unsigned long start, int num_pages)) | ||
| 1572 | { | ||
| 1573 | unsigned long begin_pfn = PFN_DOWN((unsigned long)start); | ||
| 1574 | unsigned long end_pfn = PFN_DOWN((unsigned long)end); | ||
| 1575 | |||
| 1576 | if (end_pfn > begin_pfn) | ||
| 1577 | set(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn); | ||
| 1578 | } | ||
| 1579 | |||
| 1580 | static void set_section_ro_nx(void *base, | ||
| 1581 | unsigned long text_size, | ||
| 1582 | unsigned long ro_size, | ||
| 1583 | unsigned long total_size) | ||
| 1584 | { | ||
| 1585 | /* begin and end PFNs of the current subsection */ | ||
| 1586 | unsigned long begin_pfn; | ||
| 1587 | unsigned long end_pfn; | ||
| 1588 | |||
| 1589 | /* | ||
| 1590 | * Set RO for module text and RO-data: | ||
| 1591 | * - Always protect first page. | ||
| 1592 | * - Do not protect last partial page. | ||
| 1593 | */ | ||
| 1594 | if (ro_size > 0) | ||
| 1595 | set_page_attributes(base, base + ro_size, set_memory_ro); | ||
| 1596 | |||
| 1597 | /* | ||
| 1598 | * Set NX permissions for module data: | ||
| 1599 | * - Do not protect first partial page. | ||
| 1600 | * - Always protect last page. | ||
| 1601 | */ | ||
| 1602 | if (total_size > text_size) { | ||
| 1603 | begin_pfn = PFN_UP((unsigned long)base + text_size); | ||
| 1604 | end_pfn = PFN_UP((unsigned long)base + total_size); | ||
| 1605 | if (end_pfn > begin_pfn) | ||
| 1606 | set_memory_nx(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn); | ||
| 1607 | } | ||
| 1608 | } | ||
| 1609 | |||
| 1610 | /* Setting memory back to RW+NX before releasing it */ | ||
| 1611 | void unset_section_ro_nx(struct module *mod, void *module_region) | ||
| 1612 | { | ||
| 1613 | unsigned long total_pages; | ||
| 1614 | |||
| 1615 | if (mod->module_core == module_region) { | ||
| 1616 | /* Set core as NX+RW */ | ||
| 1617 | total_pages = MOD_NUMBER_OF_PAGES(mod->module_core, mod->core_size); | ||
| 1618 | set_memory_nx((unsigned long)mod->module_core, total_pages); | ||
| 1619 | set_memory_rw((unsigned long)mod->module_core, total_pages); | ||
| 1620 | |||
| 1621 | } else if (mod->module_init == module_region) { | ||
| 1622 | /* Set init as NX+RW */ | ||
| 1623 | total_pages = MOD_NUMBER_OF_PAGES(mod->module_init, mod->init_size); | ||
| 1624 | set_memory_nx((unsigned long)mod->module_init, total_pages); | ||
| 1625 | set_memory_rw((unsigned long)mod->module_init, total_pages); | ||
| 1626 | } | ||
| 1627 | } | ||
| 1628 | |||
| 1629 | /* Iterate through all modules and set each module's text as RW */ | ||
| 1630 | void set_all_modules_text_rw() | ||
| 1631 | { | ||
| 1632 | struct module *mod; | ||
| 1633 | |||
| 1634 | mutex_lock(&module_mutex); | ||
| 1635 | list_for_each_entry_rcu(mod, &modules, list) { | ||
| 1636 | if ((mod->module_core) && (mod->core_text_size)) { | ||
| 1637 | set_page_attributes(mod->module_core, | ||
| 1638 | mod->module_core + mod->core_text_size, | ||
| 1639 | set_memory_rw); | ||
| 1640 | } | ||
| 1641 | if ((mod->module_init) && (mod->init_text_size)) { | ||
| 1642 | set_page_attributes(mod->module_init, | ||
| 1643 | mod->module_init + mod->init_text_size, | ||
| 1644 | set_memory_rw); | ||
| 1645 | } | ||
| 1646 | } | ||
| 1647 | mutex_unlock(&module_mutex); | ||
| 1648 | } | ||
| 1649 | |||
| 1650 | /* Iterate through all modules and set each module's text as RO */ | ||
| 1651 | void set_all_modules_text_ro() | ||
| 1652 | { | ||
| 1653 | struct module *mod; | ||
| 1654 | |||
| 1655 | mutex_lock(&module_mutex); | ||
| 1656 | list_for_each_entry_rcu(mod, &modules, list) { | ||
| 1657 | if ((mod->module_core) && (mod->core_text_size)) { | ||
| 1658 | set_page_attributes(mod->module_core, | ||
| 1659 | mod->module_core + mod->core_text_size, | ||
| 1660 | set_memory_ro); | ||
| 1661 | } | ||
| 1662 | if ((mod->module_init) && (mod->init_text_size)) { | ||
| 1663 | set_page_attributes(mod->module_init, | ||
| 1664 | mod->module_init + mod->init_text_size, | ||
| 1665 | set_memory_ro); | ||
| 1666 | } | ||
| 1667 | } | ||
| 1668 | mutex_unlock(&module_mutex); | ||
| 1669 | } | ||
| 1670 | #else | ||
| 1671 | static inline void set_section_ro_nx(void *base, unsigned long text_size, unsigned long ro_size, unsigned long total_size) { } | ||
| 1672 | static inline void unset_section_ro_nx(struct module *mod, void *module_region) { } | ||
| 1673 | #endif | ||
| 1674 | |||
| 1545 | /* Free a module, remove from lists, etc. */ | 1675 | /* Free a module, remove from lists, etc. */ |
| 1546 | static void free_module(struct module *mod) | 1676 | static void free_module(struct module *mod) |
| 1547 | { | 1677 | { |
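
Editor's note: set_section_ro_nx() above picks its page ranges so that the RO range rounds its end down (a page straddling the rodata/data boundary stays writable) while the NX range rounds its start up (a page straddling the text/data boundary stays executable). A user-space check of those boundaries with made-up section sizes:

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PFN_DOWN(x)	((x) >> PAGE_SHIFT)
#define PFN_UP(x)	(((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)

int main(void)
{
	unsigned long base  = 0x100000;	/* stand-in for mod->module_core */
	unsigned long text  = 0x1800;	/* core_text_size */
	unsigned long ro    = 0x2800;	/* core_ro_size (text + rodata) */
	unsigned long total = 0x4000;	/* core_size */

	/* RO end rounds down: the trailing partial rodata page stays writable */
	printf("RO pages: [%lu, %lu)\n", PFN_DOWN(base), PFN_DOWN(base + ro));
	/* NX start rounds up: the leading partial data page stays executable */
	printf("NX pages: [%lu, %lu)\n", PFN_UP(base + text), PFN_UP(base + total));
	return 0;
}
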
| @@ -1566,6 +1696,7 @@ static void free_module(struct module *mod) | |||
| 1566 | destroy_params(mod->kp, mod->num_kp); | 1696 | destroy_params(mod->kp, mod->num_kp); |
| 1567 | 1697 | ||
| 1568 | /* This may be NULL, but that's OK */ | 1698 | /* This may be NULL, but that's OK */ |
| 1699 | unset_section_ro_nx(mod, mod->module_init); | ||
| 1569 | module_free(mod, mod->module_init); | 1700 | module_free(mod, mod->module_init); |
| 1570 | kfree(mod->args); | 1701 | kfree(mod->args); |
| 1571 | percpu_modfree(mod); | 1702 | percpu_modfree(mod); |
| @@ -1574,6 +1705,7 @@ static void free_module(struct module *mod) | |||
| 1574 | lockdep_free_key_range(mod->module_core, mod->core_size); | 1705 | lockdep_free_key_range(mod->module_core, mod->core_size); |
| 1575 | 1706 | ||
| 1576 | /* Finally, free the core (containing the module structure) */ | 1707 | /* Finally, free the core (containing the module structure) */ |
| 1708 | unset_section_ro_nx(mod, mod->module_core); | ||
| 1577 | module_free(mod, mod->module_core); | 1709 | module_free(mod, mod->module_core); |
| 1578 | 1710 | ||
| 1579 | #ifdef CONFIG_MPU | 1711 | #ifdef CONFIG_MPU |
| @@ -1777,8 +1909,19 @@ static void layout_sections(struct module *mod, struct load_info *info) | |||
| 1777 | s->sh_entsize = get_offset(mod, &mod->core_size, s, i); | 1909 | s->sh_entsize = get_offset(mod, &mod->core_size, s, i); |
| 1778 | DEBUGP("\t%s\n", name); | 1910 | DEBUGP("\t%s\n", name); |
| 1779 | } | 1911 | } |
| 1780 | if (m == 0) | 1912 | switch (m) { |
| 1913 | case 0: /* executable */ | ||
| 1914 | mod->core_size = debug_align(mod->core_size); | ||
| 1781 | mod->core_text_size = mod->core_size; | 1915 | mod->core_text_size = mod->core_size; |
| 1916 | break; | ||
| 1917 | case 1: /* RO: text and ro-data */ | ||
| 1918 | mod->core_size = debug_align(mod->core_size); | ||
| 1919 | mod->core_ro_size = mod->core_size; | ||
| 1920 | break; | ||
| 1921 | case 3: /* whole core */ | ||
| 1922 | mod->core_size = debug_align(mod->core_size); | ||
| 1923 | break; | ||
| 1924 | } | ||
| 1782 | } | 1925 | } |
| 1783 | 1926 | ||
| 1784 | DEBUGP("Init section allocation order:\n"); | 1927 | DEBUGP("Init section allocation order:\n"); |
| @@ -1796,8 +1939,19 @@ static void layout_sections(struct module *mod, struct load_info *info) | |||
| 1796 | | INIT_OFFSET_MASK); | 1939 | | INIT_OFFSET_MASK); |
| 1797 | DEBUGP("\t%s\n", sname); | 1940 | DEBUGP("\t%s\n", sname); |
| 1798 | } | 1941 | } |
| 1799 | if (m == 0) | 1942 | switch (m) { |
| 1943 | case 0: /* executable */ | ||
| 1944 | mod->init_size = debug_align(mod->init_size); | ||
| 1800 | mod->init_text_size = mod->init_size; | 1945 | mod->init_text_size = mod->init_size; |
| 1946 | break; | ||
| 1947 | case 1: /* RO: text and ro-data */ | ||
| 1948 | mod->init_size = debug_align(mod->init_size); | ||
| 1949 | mod->init_ro_size = mod->init_size; | ||
| 1950 | break; | ||
| 1951 | case 3: /* whole init */ | ||
| 1952 | mod->init_size = debug_align(mod->init_size); | ||
| 1953 | break; | ||
| 1954 | } | ||
| 1801 | } | 1955 | } |
| 1802 | } | 1956 | } |
| 1803 | 1957 | ||
| @@ -2306,9 +2460,9 @@ static void find_module_sections(struct module *mod, struct load_info *info) | |||
| 2306 | #endif | 2460 | #endif |
| 2307 | 2461 | ||
| 2308 | #ifdef CONFIG_TRACEPOINTS | 2462 | #ifdef CONFIG_TRACEPOINTS |
| 2309 | mod->tracepoints = section_objs(info, "__tracepoints", | 2463 | mod->tracepoints_ptrs = section_objs(info, "__tracepoints_ptrs", |
| 2310 | sizeof(*mod->tracepoints), | 2464 | sizeof(*mod->tracepoints_ptrs), |
| 2311 | &mod->num_tracepoints); | 2465 | &mod->num_tracepoints); |
| 2312 | #endif | 2466 | #endif |
| 2313 | #ifdef HAVE_JUMP_LABEL | 2467 | #ifdef HAVE_JUMP_LABEL |
| 2314 | mod->jump_entries = section_objs(info, "__jump_table", | 2468 | mod->jump_entries = section_objs(info, "__jump_table", |
| @@ -2326,6 +2480,18 @@ static void find_module_sections(struct module *mod, struct load_info *info) | |||
| 2326 | kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) * | 2480 | kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) * |
| 2327 | mod->num_trace_events, GFP_KERNEL); | 2481 | mod->num_trace_events, GFP_KERNEL); |
| 2328 | #endif | 2482 | #endif |
| 2483 | #ifdef CONFIG_TRACING | ||
| 2484 | mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt", | ||
| 2485 | sizeof(*mod->trace_bprintk_fmt_start), | ||
| 2486 | &mod->num_trace_bprintk_fmt); | ||
| 2487 | /* | ||
| 2488 | * This section contains pointers to objects allocated by the trace | ||
| 2489 | * code; not scanning it would lead to kmemleak false positives. | ||
| 2490 | */ | ||
| 2491 | kmemleak_scan_area(mod->trace_bprintk_fmt_start, | ||
| 2492 | sizeof(*mod->trace_bprintk_fmt_start) * | ||
| 2493 | mod->num_trace_bprintk_fmt, GFP_KERNEL); | ||
| 2494 | #endif | ||
| 2329 | #ifdef CONFIG_FTRACE_MCOUNT_RECORD | 2495 | #ifdef CONFIG_FTRACE_MCOUNT_RECORD |
| 2330 | /* sechdrs[0].sh_size is always zero */ | 2496 | /* sechdrs[0].sh_size is always zero */ |
| 2331 | mod->ftrace_callsites = section_objs(info, "__mcount_loc", | 2497 | mod->ftrace_callsites = section_objs(info, "__mcount_loc", |
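
Both of the new lookups above (__tracepoints_ptrs and __trace_printk_fmt) reuse the same section_objs() idiom: find a named ELF section in the module image and hand back its start address plus sh_size divided by the element size. A self-contained model of that idiom; the struct below is an invented stand-in, not the kernel's load_info:

#include <stdio.h>
#include <string.h>

struct sec {                 /* stand-in for one ELF section header */
        const char   *name;
        void         *addr;
        unsigned long size;  /* bytes, i.e. sh_size */
};

static void *find_objs(struct sec *secs, int nsecs, const char *name,
                       unsigned long objsize, unsigned int *num)
{
        int i;

        for (i = 0; i < nsecs; i++) {
                if (strcmp(secs[i].name, name) == 0) {
                        *num = secs[i].size / objsize;
                        return secs[i].addr;
                }
        }
        *num = 0;
        return NULL;
}

int main(void)
{
        unsigned long ptrs[4];   /* pretend __tracepoints_ptrs payload */
        struct sec secs[] = {
                { "__tracepoints_ptrs", ptrs, sizeof(ptrs) },
        };
        unsigned int n;
        void *p = find_objs(secs, 1, "__tracepoints_ptrs", sizeof(ptrs[0]), &n);

        printf("%p, %u entries\n", p, n);
        return 0;
}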
| @@ -2710,6 +2876,18 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, | |||
| 2710 | blocking_notifier_call_chain(&module_notify_list, | 2876 | blocking_notifier_call_chain(&module_notify_list, |
| 2711 | MODULE_STATE_COMING, mod); | 2877 | MODULE_STATE_COMING, mod); |
| 2712 | 2878 | ||
| 2879 | /* Set RO and NX regions for core */ | ||
| 2880 | set_section_ro_nx(mod->module_core, | ||
| 2881 | mod->core_text_size, | ||
| 2882 | mod->core_ro_size, | ||
| 2883 | mod->core_size); | ||
| 2884 | |||
| 2885 | /* Set RO and NX regions for init */ | ||
| 2886 | set_section_ro_nx(mod->module_init, | ||
| 2887 | mod->init_text_size, | ||
| 2888 | mod->init_ro_size, | ||
| 2889 | mod->init_size); | ||
| 2890 | |||
| 2713 | do_mod_ctors(mod); | 2891 | do_mod_ctors(mod); |
| 2714 | /* Start the module */ | 2892 | /* Start the module */ |
| 2715 | if (mod->init != NULL) | 2893 | if (mod->init != NULL) |
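
These two calls feed the page-aligned boundaries computed in layout_sections() to the RO/NX helper added elsewhere in this patch. Its body is not in the hunks shown here, so the following is only a userspace sketch of the range arithmetic such a helper would perform; set_ro()/set_nx() stand in for the kernel's set_memory_ro()/set_memory_nx():

#include <stdio.h>

#define PAGE_SIZE 4096UL

static void set_ro(unsigned long addr, unsigned long npages)
{ printf("RO  %#lx (+%lu pages)\n", addr, npages); }

static void set_nx(unsigned long addr, unsigned long npages)
{ printf("NX  %#lx (+%lu pages)\n", addr, npages); }

/* text_size/ro_size/total_size are the page-aligned boundaries recorded by
 * layout_sections(); base is where the region was allocated. */
static void sketch_ro_nx(unsigned long base, unsigned long text_size,
                         unsigned long ro_size, unsigned long total_size)
{
        /* text + rodata become read-only ... */
        set_ro(base, ro_size / PAGE_SIZE);
        /* ... and everything past the executable part loses execute. */
        set_nx(base + text_size, (total_size - text_size) / PAGE_SIZE);
}

int main(void)
{
        sketch_ro_nx(0x10000000UL, 2 * PAGE_SIZE, 3 * PAGE_SIZE, 6 * PAGE_SIZE);
        return 0;
}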
| @@ -2753,6 +2931,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, | |||
| 2753 | mod->symtab = mod->core_symtab; | 2931 | mod->symtab = mod->core_symtab; |
| 2754 | mod->strtab = mod->core_strtab; | 2932 | mod->strtab = mod->core_strtab; |
| 2755 | #endif | 2933 | #endif |
| 2934 | unset_section_ro_nx(mod, mod->module_init); | ||
| 2756 | module_free(mod, mod->module_init); | 2935 | module_free(mod, mod->module_init); |
| 2757 | mod->module_init = NULL; | 2936 | mod->module_init = NULL; |
| 2758 | mod->init_size = 0; | 2937 | mod->init_size = 0; |
| @@ -3214,7 +3393,7 @@ void module_layout(struct module *mod, | |||
| 3214 | struct modversion_info *ver, | 3393 | struct modversion_info *ver, |
| 3215 | struct kernel_param *kp, | 3394 | struct kernel_param *kp, |
| 3216 | struct kernel_symbol *ks, | 3395 | struct kernel_symbol *ks, |
| 3217 | struct tracepoint *tp) | 3396 | struct tracepoint * const *tp) |
| 3218 | { | 3397 | { |
| 3219 | } | 3398 | } |
| 3220 | EXPORT_SYMBOL(module_layout); | 3399 | EXPORT_SYMBOL(module_layout); |
| @@ -3228,8 +3407,8 @@ void module_update_tracepoints(void) | |||
| 3228 | mutex_lock(&module_mutex); | 3407 | mutex_lock(&module_mutex); |
| 3229 | list_for_each_entry(mod, &modules, list) | 3408 | list_for_each_entry(mod, &modules, list) |
| 3230 | if (!mod->taints) | 3409 | if (!mod->taints) |
| 3231 | tracepoint_update_probe_range(mod->tracepoints, | 3410 | tracepoint_update_probe_range(mod->tracepoints_ptrs, |
| 3232 | mod->tracepoints + mod->num_tracepoints); | 3411 | mod->tracepoints_ptrs + mod->num_tracepoints); |
| 3233 | mutex_unlock(&module_mutex); | 3412 | mutex_unlock(&module_mutex); |
| 3234 | } | 3413 | } |
| 3235 | 3414 | ||
| @@ -3253,8 +3432,8 @@ int module_get_iter_tracepoints(struct tracepoint_iter *iter) | |||
| 3253 | else if (iter_mod > iter->module) | 3432 | else if (iter_mod > iter->module) |
| 3254 | iter->tracepoint = NULL; | 3433 | iter->tracepoint = NULL; |
| 3255 | found = tracepoint_get_iter_range(&iter->tracepoint, | 3434 | found = tracepoint_get_iter_range(&iter->tracepoint, |
| 3256 | iter_mod->tracepoints, | 3435 | iter_mod->tracepoints_ptrs, |
| 3257 | iter_mod->tracepoints | 3436 | iter_mod->tracepoints_ptrs |
| 3258 | + iter_mod->num_tracepoints); | 3437 | + iter_mod->num_tracepoints); |
| 3259 | if (found) { | 3438 | if (found) { |
| 3260 | iter->module = iter_mod; | 3439 | iter->module = iter_mod; |
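
The tracepoints -> tracepoints_ptrs rename (see also the module_layout() signature change above) reflects that the section now stores pointers to tracepoints rather than tracepoint structures, so iteration walks an array of pointers and needs one extra dereference. A tiny illustration with invented types:

#include <stdio.h>

struct tracepoint { const char *name; };

static void walk_ptrs(struct tracepoint * const *begin,
                      struct tracepoint * const *end)
{
        struct tracepoint * const *iter;

        for (iter = begin; iter < end; iter++)
                printf("%s\n", (*iter)->name);   /* one extra dereference */
}

int main(void)
{
        struct tracepoint a = { "sched_switch" }, b = { "irq_handler_entry" };
        struct tracepoint *ptrs[] = { &a, &b };  /* what __tracepoints_ptrs holds */

        walk_ptrs(ptrs, ptrs + 2);
        return 0;
}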
diff --git a/kernel/mutex.c b/kernel/mutex.c index 200407c1502..a5889fb28ec 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c | |||
| @@ -199,7 +199,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | |||
| 199 | * memory barriers as we'll eventually observe the right | 199 | * memory barriers as we'll eventually observe the right |
| 200 | * values at the cost of a few extra spins. | 200 | * values at the cost of a few extra spins. |
| 201 | */ | 201 | */ |
| 202 | cpu_relax(); | 202 | arch_mutex_cpu_relax(); |
| 203 | } | 203 | } |
| 204 | #endif | 204 | #endif |
| 205 | spin_lock_mutex(&lock->wait_lock, flags); | 205 | spin_lock_mutex(&lock->wait_lock, flags); |
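
arch_mutex_cpu_relax() gives an architecture a hook to use something other than the generic cpu_relax() inside this optimistic-spin loop. The usual shape of such a hook is a default macro that an arch header may pre-define; the snippet below is a hedged, compilable sketch of that pattern, not the actual kernel header:

#include <stdio.h>

static inline void cpu_relax(void) { /* e.g. a pause/yield hint */ }

/* default: fall back to the generic relax unless the arch overrides it */
#ifndef arch_mutex_cpu_relax
#define arch_mutex_cpu_relax() cpu_relax()
#endif

int main(void)
{
        int spins = 0;

        while (spins++ < 3)
                arch_mutex_cpu_relax();   /* placeholder for the spin loop */
        printf("spun %d times\n", spins - 1);
        return 0;
}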
diff --git a/kernel/panic.c b/kernel/panic.c index 4c13b1a88eb..991bb87a170 100644 --- a/kernel/panic.c +++ b/kernel/panic.c | |||
| @@ -34,6 +34,7 @@ static int pause_on_oops_flag; | |||
| 34 | static DEFINE_SPINLOCK(pause_on_oops_lock); | 34 | static DEFINE_SPINLOCK(pause_on_oops_lock); |
| 35 | 35 | ||
| 36 | int panic_timeout; | 36 | int panic_timeout; |
| 37 | EXPORT_SYMBOL_GPL(panic_timeout); | ||
| 37 | 38 | ||
| 38 | ATOMIC_NOTIFIER_HEAD(panic_notifier_list); | 39 | ATOMIC_NOTIFIER_HEAD(panic_notifier_list); |
| 39 | 40 | ||
diff --git a/kernel/params.c b/kernel/params.c index 08107d18175..0da1411222b 100644 --- a/kernel/params.c +++ b/kernel/params.c | |||
| @@ -719,9 +719,7 @@ void destroy_params(const struct kernel_param *params, unsigned num) | |||
| 719 | params[i].ops->free(params[i].arg); | 719 | params[i].ops->free(params[i].arg); |
| 720 | } | 720 | } |
| 721 | 721 | ||
| 722 | static void __init kernel_add_sysfs_param(const char *name, | 722 | static struct module_kobject * __init locate_module_kobject(const char *name) |
| 723 | struct kernel_param *kparam, | ||
| 724 | unsigned int name_skip) | ||
| 725 | { | 723 | { |
| 726 | struct module_kobject *mk; | 724 | struct module_kobject *mk; |
| 727 | struct kobject *kobj; | 725 | struct kobject *kobj; |
| @@ -729,10 +727,7 @@ static void __init kernel_add_sysfs_param(const char *name, | |||
| 729 | 727 | ||
| 730 | kobj = kset_find_obj(module_kset, name); | 728 | kobj = kset_find_obj(module_kset, name); |
| 731 | if (kobj) { | 729 | if (kobj) { |
| 732 | /* We already have one. Remove params so we can add more. */ | ||
| 733 | mk = to_module_kobject(kobj); | 730 | mk = to_module_kobject(kobj); |
| 734 | /* We need to remove it before adding parameters. */ | ||
| 735 | sysfs_remove_group(&mk->kobj, &mk->mp->grp); | ||
| 736 | } else { | 731 | } else { |
| 737 | mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL); | 732 | mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL); |
| 738 | BUG_ON(!mk); | 733 | BUG_ON(!mk); |
| @@ -743,15 +738,36 @@ static void __init kernel_add_sysfs_param(const char *name, | |||
| 743 | "%s", name); | 738 | "%s", name); |
| 744 | if (err) { | 739 | if (err) { |
| 745 | kobject_put(&mk->kobj); | 740 | kobject_put(&mk->kobj); |
| 746 | printk(KERN_ERR "Module '%s' failed add to sysfs, " | 741 | printk(KERN_ERR |
| 747 | "error number %d\n", name, err); | 742 | "Module '%s' failed add to sysfs, error number %d\n", |
| 748 | printk(KERN_ERR "The system will be unstable now.\n"); | 743 | name, err); |
| 749 | return; | 744 | printk(KERN_ERR |
| 745 | "The system will be unstable now.\n"); | ||
| 746 | return NULL; | ||
| 750 | } | 747 | } |
| 751 | /* So that exit path is even. */ | 748 | |
| 749 | /* So that we hold a reference in both cases. */ | ||

| 752 | kobject_get(&mk->kobj); | 750 | kobject_get(&mk->kobj); |
| 753 | } | 751 | } |
| 754 | 752 | ||
| 753 | return mk; | ||
| 754 | } | ||
| 755 | |||
| 756 | static void __init kernel_add_sysfs_param(const char *name, | ||
| 757 | struct kernel_param *kparam, | ||
| 758 | unsigned int name_skip) | ||
| 759 | { | ||
| 760 | struct module_kobject *mk; | ||
| 761 | int err; | ||
| 762 | |||
| 763 | mk = locate_module_kobject(name); | ||
| 764 | if (!mk) | ||
| 765 | return; | ||
| 766 | |||
| 767 | /* We need to remove old parameters before adding more. */ | ||
| 768 | if (mk->mp) | ||
| 769 | sysfs_remove_group(&mk->kobj, &mk->mp->grp); | ||
| 770 | |||
| 755 | /* These should not fail at boot. */ | 771 | /* These should not fail at boot. */ |
| 756 | err = add_sysfs_param(mk, kparam, kparam->name + name_skip); | 772 | err = add_sysfs_param(mk, kparam, kparam->name + name_skip); |
| 757 | BUG_ON(err); | 773 | BUG_ON(err); |
| @@ -796,6 +812,32 @@ static void __init param_sysfs_builtin(void) | |||
| 796 | } | 812 | } |
| 797 | } | 813 | } |
| 798 | 814 | ||
| 815 | ssize_t __modver_version_show(struct module_attribute *mattr, | ||
| 816 | struct module *mod, char *buf) | ||
| 817 | { | ||
| 818 | struct module_version_attribute *vattr = | ||
| 819 | container_of(mattr, struct module_version_attribute, mattr); | ||
| 820 | |||
| 821 | return sprintf(buf, "%s\n", vattr->version); | ||
| 822 | } | ||
| 823 | |||
| 824 | extern struct module_version_attribute __start___modver[], __stop___modver[]; | ||
| 825 | |||
| 826 | static void __init version_sysfs_builtin(void) | ||
| 827 | { | ||
| 828 | const struct module_version_attribute *vattr; | ||
| 829 | struct module_kobject *mk; | ||
| 830 | int err; | ||
| 831 | |||
| 832 | for (vattr = __start___modver; vattr < __stop___modver; vattr++) { | ||
| 833 | mk = locate_module_kobject(vattr->module_name); | ||
| 834 | if (mk) { | ||
| 835 | err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr); | ||
| 836 | kobject_uevent(&mk->kobj, KOBJ_ADD); | ||
| 837 | kobject_put(&mk->kobj); | ||
| 838 | } | ||
| 839 | } | ||
| 840 | } | ||
| 799 | 841 | ||
| 800 | /* module-related sysfs stuff */ | 842 | /* module-related sysfs stuff */ |
| 801 | 843 | ||
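
version_sysfs_builtin() walks an array the linker assembles: every built-in MODULE_VERSION() use is expected to drop a module_version_attribute into a dedicated section, and __start___modver/__stop___modver bracket that section. The program below demonstrates the same collection trick on ELF with GCC/Clang; the section name, macro and struct are invented for the example:

#include <stdio.h>

struct modver { const char *module_name; const char *version; };

/* each "user" drops one entry into the custom section */
#define DECLARE_VERSION(mod, ver)                                        \
        static const struct modver __ver_##mod                           \
        __attribute__((used, section("modver_demo"))) = { #mod, ver }

DECLARE_VERSION(dummy_a, "1.0");
DECLARE_VERSION(dummy_b, "2.3");

/* the linker synthesizes these for any ELF section with a C-identifier name */
extern const struct modver __start_modver_demo[], __stop_modver_demo[];

int main(void)
{
        const struct modver *v;

        for (v = __start_modver_demo; v < __stop_modver_demo; v++)
                printf("%s: %s\n", v->module_name, v->version);
        return 0;
}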
| @@ -875,6 +917,7 @@ static int __init param_sysfs_init(void) | |||
| 875 | } | 917 | } |
| 876 | module_sysfs_initialized = 1; | 918 | module_sysfs_initialized = 1; |
| 877 | 919 | ||
| 920 | version_sysfs_builtin(); | ||
| 878 | param_sysfs_builtin(); | 921 | param_sysfs_builtin(); |
| 879 | 922 | ||
| 880 | return 0; | 923 | return 0; |
diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 517d827f498..656222fcf76 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c | |||
| @@ -13,6 +13,7 @@ | |||
| 13 | #include <linux/mm.h> | 13 | #include <linux/mm.h> |
| 14 | #include <linux/cpu.h> | 14 | #include <linux/cpu.h> |
| 15 | #include <linux/smp.h> | 15 | #include <linux/smp.h> |
| 16 | #include <linux/idr.h> | ||
| 16 | #include <linux/file.h> | 17 | #include <linux/file.h> |
| 17 | #include <linux/poll.h> | 18 | #include <linux/poll.h> |
| 18 | #include <linux/slab.h> | 19 | #include <linux/slab.h> |
| @@ -21,7 +22,9 @@ | |||
| 21 | #include <linux/dcache.h> | 22 | #include <linux/dcache.h> |
| 22 | #include <linux/percpu.h> | 23 | #include <linux/percpu.h> |
| 23 | #include <linux/ptrace.h> | 24 | #include <linux/ptrace.h> |
| 25 | #include <linux/reboot.h> | ||
| 24 | #include <linux/vmstat.h> | 26 | #include <linux/vmstat.h> |
| 27 | #include <linux/device.h> | ||
| 25 | #include <linux/vmalloc.h> | 28 | #include <linux/vmalloc.h> |
| 26 | #include <linux/hardirq.h> | 29 | #include <linux/hardirq.h> |
| 27 | #include <linux/rculist.h> | 30 | #include <linux/rculist.h> |
| @@ -31,9 +34,16 @@ | |||
| 31 | #include <linux/kernel_stat.h> | 34 | #include <linux/kernel_stat.h> |
| 32 | #include <linux/perf_event.h> | 35 | #include <linux/perf_event.h> |
| 33 | #include <linux/ftrace_event.h> | 36 | #include <linux/ftrace_event.h> |
| 37 | #include <linux/hw_breakpoint.h> | ||
| 34 | 38 | ||
| 35 | #include <asm/irq_regs.h> | 39 | #include <asm/irq_regs.h> |
| 36 | 40 | ||
| 41 | enum event_type_t { | ||
| 42 | EVENT_FLEXIBLE = 0x1, | ||
| 43 | EVENT_PINNED = 0x2, | ||
| 44 | EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, | ||
| 45 | }; | ||
| 46 | |||
| 37 | atomic_t perf_task_events __read_mostly; | 47 | atomic_t perf_task_events __read_mostly; |
| 38 | static atomic_t nr_mmap_events __read_mostly; | 48 | static atomic_t nr_mmap_events __read_mostly; |
| 39 | static atomic_t nr_comm_events __read_mostly; | 49 | static atomic_t nr_comm_events __read_mostly; |
| @@ -61,6 +71,12 @@ int sysctl_perf_event_sample_rate __read_mostly = 100000; | |||
| 61 | 71 | ||
| 62 | static atomic64_t perf_event_id; | 72 | static atomic64_t perf_event_id; |
| 63 | 73 | ||
| 74 | static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, | ||
| 75 | enum event_type_t event_type); | ||
| 76 | |||
| 77 | static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, | ||
| 78 | enum event_type_t event_type); | ||
| 79 | |||
| 64 | void __weak perf_event_print_debug(void) { } | 80 | void __weak perf_event_print_debug(void) { } |
| 65 | 81 | ||
| 66 | extern __weak const char *perf_pmu_name(void) | 82 | extern __weak const char *perf_pmu_name(void) |
| @@ -68,6 +84,11 @@ extern __weak const char *perf_pmu_name(void) | |||
| 68 | return "pmu"; | 84 | return "pmu"; |
| 69 | } | 85 | } |
| 70 | 86 | ||
| 87 | static inline u64 perf_clock(void) | ||
| 88 | { | ||
| 89 | return local_clock(); | ||
| 90 | } | ||
| 91 | |||
| 71 | void perf_pmu_disable(struct pmu *pmu) | 92 | void perf_pmu_disable(struct pmu *pmu) |
| 72 | { | 93 | { |
| 73 | int *count = this_cpu_ptr(pmu->pmu_disable_count); | 94 | int *count = this_cpu_ptr(pmu->pmu_disable_count); |
| @@ -132,6 +153,28 @@ static void unclone_ctx(struct perf_event_context *ctx) | |||
| 132 | } | 153 | } |
| 133 | } | 154 | } |
| 134 | 155 | ||
| 156 | static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) | ||
| 157 | { | ||
| 158 | /* | ||
| 159 | * only top level events have the pid namespace they were created in | ||
| 160 | */ | ||
| 161 | if (event->parent) | ||
| 162 | event = event->parent; | ||
| 163 | |||
| 164 | return task_tgid_nr_ns(p, event->ns); | ||
| 165 | } | ||
| 166 | |||
| 167 | static u32 perf_event_tid(struct perf_event *event, struct task_struct *p) | ||
| 168 | { | ||
| 169 | /* | ||
| 170 | * only top level events have the pid namespace they were created in | ||
| 171 | */ | ||
| 172 | if (event->parent) | ||
| 173 | event = event->parent; | ||
| 174 | |||
| 175 | return task_pid_nr_ns(p, event->ns); | ||
| 176 | } | ||
| 177 | |||
| 135 | /* | 178 | /* |
| 136 | * If we inherit events we want to return the parent event id | 179 | * If we inherit events we want to return the parent event id |
| 137 | * to userspace. | 180 | * to userspace. |
| @@ -214,11 +257,6 @@ static void perf_unpin_context(struct perf_event_context *ctx) | |||
| 214 | put_ctx(ctx); | 257 | put_ctx(ctx); |
| 215 | } | 258 | } |
| 216 | 259 | ||
| 217 | static inline u64 perf_clock(void) | ||
| 218 | { | ||
| 219 | return local_clock(); | ||
| 220 | } | ||
| 221 | |||
| 222 | /* | 260 | /* |
| 223 | * Update the record of the current time in a context. | 261 | * Update the record of the current time in a context. |
| 224 | */ | 262 | */ |
| @@ -230,6 +268,12 @@ static void update_context_time(struct perf_event_context *ctx) | |||
| 230 | ctx->timestamp = now; | 268 | ctx->timestamp = now; |
| 231 | } | 269 | } |
| 232 | 270 | ||
| 271 | static u64 perf_event_time(struct perf_event *event) | ||
| 272 | { | ||
| 273 | struct perf_event_context *ctx = event->ctx; | ||
| 274 | return ctx ? ctx->time : 0; | ||
| 275 | } | ||
| 276 | |||
| 233 | /* | 277 | /* |
| 234 | * Update the total_time_enabled and total_time_running fields for a event. | 278 | * Update the total_time_enabled and total_time_running fields for a event. |
| 235 | */ | 279 | */ |
| @@ -243,7 +287,7 @@ static void update_event_times(struct perf_event *event) | |||
| 243 | return; | 287 | return; |
| 244 | 288 | ||
| 245 | if (ctx->is_active) | 289 | if (ctx->is_active) |
| 246 | run_end = ctx->time; | 290 | run_end = perf_event_time(event); |
| 247 | else | 291 | else |
| 248 | run_end = event->tstamp_stopped; | 292 | run_end = event->tstamp_stopped; |
| 249 | 293 | ||
| @@ -252,7 +296,7 @@ static void update_event_times(struct perf_event *event) | |||
| 252 | if (event->state == PERF_EVENT_STATE_INACTIVE) | 296 | if (event->state == PERF_EVENT_STATE_INACTIVE) |
| 253 | run_end = event->tstamp_stopped; | 297 | run_end = event->tstamp_stopped; |
| 254 | else | 298 | else |
| 255 | run_end = ctx->time; | 299 | run_end = perf_event_time(event); |
| 256 | 300 | ||
| 257 | event->total_time_running = run_end - event->tstamp_running; | 301 | event->total_time_running = run_end - event->tstamp_running; |
| 258 | } | 302 | } |
| @@ -311,9 +355,84 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) | |||
| 311 | ctx->nr_stat++; | 355 | ctx->nr_stat++; |
| 312 | } | 356 | } |
| 313 | 357 | ||
| 358 | /* | ||
| 359 | * Called at perf_event creation and when events are attached/detached from a | ||
| 360 | * group. | ||
| 361 | */ | ||
| 362 | static void perf_event__read_size(struct perf_event *event) | ||
| 363 | { | ||
| 364 | int entry = sizeof(u64); /* value */ | ||
| 365 | int size = 0; | ||
| 366 | int nr = 1; | ||
| 367 | |||
| 368 | if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) | ||
| 369 | size += sizeof(u64); | ||
| 370 | |||
| 371 | if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) | ||
| 372 | size += sizeof(u64); | ||
| 373 | |||
| 374 | if (event->attr.read_format & PERF_FORMAT_ID) | ||
| 375 | entry += sizeof(u64); | ||
| 376 | |||
| 377 | if (event->attr.read_format & PERF_FORMAT_GROUP) { | ||
| 378 | nr += event->group_leader->nr_siblings; | ||
| 379 | size += sizeof(u64); | ||
| 380 | } | ||
| 381 | |||
| 382 | size += entry * nr; | ||
| 383 | event->read_size = size; | ||
| 384 | } | ||
| 385 | |||
| 386 | static void perf_event__header_size(struct perf_event *event) | ||
| 387 | { | ||
| 388 | struct perf_sample_data *data; | ||
| 389 | u64 sample_type = event->attr.sample_type; | ||
| 390 | u16 size = 0; | ||
| 391 | |||
| 392 | perf_event__read_size(event); | ||
| 393 | |||
| 394 | if (sample_type & PERF_SAMPLE_IP) | ||
| 395 | size += sizeof(data->ip); | ||
| 396 | |||
| 397 | if (sample_type & PERF_SAMPLE_ADDR) | ||
| 398 | size += sizeof(data->addr); | ||
| 399 | |||
| 400 | if (sample_type & PERF_SAMPLE_PERIOD) | ||
| 401 | size += sizeof(data->period); | ||
| 402 | |||
| 403 | if (sample_type & PERF_SAMPLE_READ) | ||
| 404 | size += event->read_size; | ||
| 405 | |||
| 406 | event->header_size = size; | ||
| 407 | } | ||
| 408 | |||
| 409 | static void perf_event__id_header_size(struct perf_event *event) | ||
| 410 | { | ||
| 411 | struct perf_sample_data *data; | ||
| 412 | u64 sample_type = event->attr.sample_type; | ||
| 413 | u16 size = 0; | ||
| 414 | |||
| 415 | if (sample_type & PERF_SAMPLE_TID) | ||
| 416 | size += sizeof(data->tid_entry); | ||
| 417 | |||
| 418 | if (sample_type & PERF_SAMPLE_TIME) | ||
| 419 | size += sizeof(data->time); | ||
| 420 | |||
| 421 | if (sample_type & PERF_SAMPLE_ID) | ||
| 422 | size += sizeof(data->id); | ||
| 423 | |||
| 424 | if (sample_type & PERF_SAMPLE_STREAM_ID) | ||
| 425 | size += sizeof(data->stream_id); | ||
| 426 | |||
| 427 | if (sample_type & PERF_SAMPLE_CPU) | ||
| 428 | size += sizeof(data->cpu_entry); | ||
| 429 | |||
| 430 | event->id_header_size = size; | ||
| 431 | } | ||
| 432 | |||
| 314 | static void perf_group_attach(struct perf_event *event) | 433 | static void perf_group_attach(struct perf_event *event) |
| 315 | { | 434 | { |
| 316 | struct perf_event *group_leader = event->group_leader; | 435 | struct perf_event *group_leader = event->group_leader, *pos; |
| 317 | 436 | ||
| 318 | /* | 437 | /* |
| 319 | * We can have double attach due to group movement in perf_event_open. | 438 | * We can have double attach due to group movement in perf_event_open. |
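
perf_event__read_size() and the two header-size helpers precompute, at attach/detach time, the byte counts that the old perf_event_read_size() used to recompute on every output (its removal is visible further down). A standalone arithmetic check of the read_size formula for one assumed configuration; the FMT_* flags mirror the read_format bits but are local to the example:

#include <stdio.h>
#include <stdint.h>

#define FMT_TOTAL_TIME_ENABLED  (1U << 0)
#define FMT_TOTAL_TIME_RUNNING  (1U << 1)
#define FMT_ID                  (1U << 2)
#define FMT_GROUP               (1U << 3)

/* mirrors the read_size computation: one value per counter, an optional id
 * per counter, plus optional enabled/running times and the group count */
static int read_size(unsigned int read_format, int nr_siblings)
{
        int entry = sizeof(uint64_t);   /* value */
        int size = 0;
        int nr = 1;

        if (read_format & FMT_TOTAL_TIME_ENABLED)
                size += sizeof(uint64_t);
        if (read_format & FMT_TOTAL_TIME_RUNNING)
                size += sizeof(uint64_t);
        if (read_format & FMT_ID)
                entry += sizeof(uint64_t);
        if (read_format & FMT_GROUP) {
                nr += nr_siblings;
                size += sizeof(uint64_t);   /* the nr field itself */
        }
        return size + entry * nr;
}

int main(void)
{
        /* leader + 2 siblings, reporting ids and both times: 3*16 + 24 = 72 */
        printf("%d bytes\n", read_size(FMT_TOTAL_TIME_ENABLED |
                                       FMT_TOTAL_TIME_RUNNING |
                                       FMT_ID | FMT_GROUP, 2));
        return 0;
}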
| @@ -332,6 +451,11 @@ static void perf_group_attach(struct perf_event *event) | |||
| 332 | 451 | ||
| 333 | list_add_tail(&event->group_entry, &group_leader->sibling_list); | 452 | list_add_tail(&event->group_entry, &group_leader->sibling_list); |
| 334 | group_leader->nr_siblings++; | 453 | group_leader->nr_siblings++; |
| 454 | |||
| 455 | perf_event__header_size(group_leader); | ||
| 456 | |||
| 457 | list_for_each_entry(pos, &group_leader->sibling_list, group_entry) | ||
| 458 | perf_event__header_size(pos); | ||
| 335 | } | 459 | } |
| 336 | 460 | ||
| 337 | /* | 461 | /* |
| @@ -390,7 +514,7 @@ static void perf_group_detach(struct perf_event *event) | |||
| 390 | if (event->group_leader != event) { | 514 | if (event->group_leader != event) { |
| 391 | list_del_init(&event->group_entry); | 515 | list_del_init(&event->group_entry); |
| 392 | event->group_leader->nr_siblings--; | 516 | event->group_leader->nr_siblings--; |
| 393 | return; | 517 | goto out; |
| 394 | } | 518 | } |
| 395 | 519 | ||
| 396 | if (!list_empty(&event->group_entry)) | 520 | if (!list_empty(&event->group_entry)) |
| @@ -409,6 +533,12 @@ static void perf_group_detach(struct perf_event *event) | |||
| 409 | /* Inherit group flags from the previous leader */ | 533 | /* Inherit group flags from the previous leader */ |
| 410 | sibling->group_flags = event->group_flags; | 534 | sibling->group_flags = event->group_flags; |
| 411 | } | 535 | } |
| 536 | |||
| 537 | out: | ||
| 538 | perf_event__header_size(event->group_leader); | ||
| 539 | |||
| 540 | list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry) | ||
| 541 | perf_event__header_size(tmp); | ||
| 412 | } | 542 | } |
| 413 | 543 | ||
| 414 | static inline int | 544 | static inline int |
| @@ -422,6 +552,7 @@ event_sched_out(struct perf_event *event, | |||
| 422 | struct perf_cpu_context *cpuctx, | 552 | struct perf_cpu_context *cpuctx, |
| 423 | struct perf_event_context *ctx) | 553 | struct perf_event_context *ctx) |
| 424 | { | 554 | { |
| 555 | u64 tstamp = perf_event_time(event); | ||
| 425 | u64 delta; | 556 | u64 delta; |
| 426 | /* | 557 | /* |
| 427 | * An event which could not be activated because of | 558 | * An event which could not be activated because of |
| @@ -433,7 +564,7 @@ event_sched_out(struct perf_event *event, | |||
| 433 | && !event_filter_match(event)) { | 564 | && !event_filter_match(event)) { |
| 434 | delta = ctx->time - event->tstamp_stopped; | 565 | delta = ctx->time - event->tstamp_stopped; |
| 435 | event->tstamp_running += delta; | 566 | event->tstamp_running += delta; |
| 436 | event->tstamp_stopped = ctx->time; | 567 | event->tstamp_stopped = tstamp; |
| 437 | } | 568 | } |
| 438 | 569 | ||
| 439 | if (event->state != PERF_EVENT_STATE_ACTIVE) | 570 | if (event->state != PERF_EVENT_STATE_ACTIVE) |
| @@ -444,7 +575,7 @@ event_sched_out(struct perf_event *event, | |||
| 444 | event->pending_disable = 0; | 575 | event->pending_disable = 0; |
| 445 | event->state = PERF_EVENT_STATE_OFF; | 576 | event->state = PERF_EVENT_STATE_OFF; |
| 446 | } | 577 | } |
| 447 | event->tstamp_stopped = ctx->time; | 578 | event->tstamp_stopped = tstamp; |
| 448 | event->pmu->del(event, 0); | 579 | event->pmu->del(event, 0); |
| 449 | event->oncpu = -1; | 580 | event->oncpu = -1; |
| 450 | 581 | ||
| @@ -651,16 +782,33 @@ retry: | |||
| 651 | raw_spin_unlock_irq(&ctx->lock); | 782 | raw_spin_unlock_irq(&ctx->lock); |
| 652 | } | 783 | } |
| 653 | 784 | ||
| 785 | #define MAX_INTERRUPTS (~0ULL) | ||
| 786 | |||
| 787 | static void perf_log_throttle(struct perf_event *event, int enable); | ||
| 788 | |||
| 654 | static int | 789 | static int |
| 655 | event_sched_in(struct perf_event *event, | 790 | event_sched_in(struct perf_event *event, |
| 656 | struct perf_cpu_context *cpuctx, | 791 | struct perf_cpu_context *cpuctx, |
| 657 | struct perf_event_context *ctx) | 792 | struct perf_event_context *ctx) |
| 658 | { | 793 | { |
| 794 | u64 tstamp = perf_event_time(event); | ||
| 795 | |||
| 659 | if (event->state <= PERF_EVENT_STATE_OFF) | 796 | if (event->state <= PERF_EVENT_STATE_OFF) |
| 660 | return 0; | 797 | return 0; |
| 661 | 798 | ||
| 662 | event->state = PERF_EVENT_STATE_ACTIVE; | 799 | event->state = PERF_EVENT_STATE_ACTIVE; |
| 663 | event->oncpu = smp_processor_id(); | 800 | event->oncpu = smp_processor_id(); |
| 801 | |||
| 802 | /* | ||
| 803 | * Unthrottle events: since we were just scheduled in, we might have | ||
| 804 | * missed several ticks already, and for a heavily scheduling task | ||
| 805 | * there is little guarantee it'll get a tick in a timely manner. | ||
| 806 | */ | ||
| 807 | if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) { | ||
| 808 | perf_log_throttle(event, 1); | ||
| 809 | event->hw.interrupts = 0; | ||
| 810 | } | ||
| 811 | |||
| 664 | /* | 812 | /* |
| 665 | * The new state must be visible before we turn it on in the hardware: | 813 | * The new state must be visible before we turn it on in the hardware: |
| 666 | */ | 814 | */ |
| @@ -672,7 +820,9 @@ event_sched_in(struct perf_event *event, | |||
| 672 | return -EAGAIN; | 820 | return -EAGAIN; |
| 673 | } | 821 | } |
| 674 | 822 | ||
| 675 | event->tstamp_running += ctx->time - event->tstamp_stopped; | 823 | event->tstamp_running += tstamp - event->tstamp_stopped; |
| 824 | |||
| 825 | event->shadow_ctx_time = tstamp - ctx->timestamp; | ||
| 676 | 826 | ||
| 677 | if (!is_software_event(event)) | 827 | if (!is_software_event(event)) |
| 678 | cpuctx->active_oncpu++; | 828 | cpuctx->active_oncpu++; |
| @@ -784,11 +934,13 @@ static int group_can_go_on(struct perf_event *event, | |||
| 784 | static void add_event_to_ctx(struct perf_event *event, | 934 | static void add_event_to_ctx(struct perf_event *event, |
| 785 | struct perf_event_context *ctx) | 935 | struct perf_event_context *ctx) |
| 786 | { | 936 | { |
| 937 | u64 tstamp = perf_event_time(event); | ||
| 938 | |||
| 787 | list_add_event(event, ctx); | 939 | list_add_event(event, ctx); |
| 788 | perf_group_attach(event); | 940 | perf_group_attach(event); |
| 789 | event->tstamp_enabled = ctx->time; | 941 | event->tstamp_enabled = tstamp; |
| 790 | event->tstamp_running = ctx->time; | 942 | event->tstamp_running = tstamp; |
| 791 | event->tstamp_stopped = ctx->time; | 943 | event->tstamp_stopped = tstamp; |
| 792 | } | 944 | } |
| 793 | 945 | ||
| 794 | /* | 946 | /* |
| @@ -823,7 +975,7 @@ static void __perf_install_in_context(void *info) | |||
| 823 | 975 | ||
| 824 | add_event_to_ctx(event, ctx); | 976 | add_event_to_ctx(event, ctx); |
| 825 | 977 | ||
| 826 | if (event->cpu != -1 && event->cpu != smp_processor_id()) | 978 | if (!event_filter_match(event)) |
| 827 | goto unlock; | 979 | goto unlock; |
| 828 | 980 | ||
| 829 | /* | 981 | /* |
| @@ -928,14 +1080,13 @@ static void __perf_event_mark_enabled(struct perf_event *event, | |||
| 928 | struct perf_event_context *ctx) | 1080 | struct perf_event_context *ctx) |
| 929 | { | 1081 | { |
| 930 | struct perf_event *sub; | 1082 | struct perf_event *sub; |
| 1083 | u64 tstamp = perf_event_time(event); | ||
| 931 | 1084 | ||
| 932 | event->state = PERF_EVENT_STATE_INACTIVE; | 1085 | event->state = PERF_EVENT_STATE_INACTIVE; |
| 933 | event->tstamp_enabled = ctx->time - event->total_time_enabled; | 1086 | event->tstamp_enabled = tstamp - event->total_time_enabled; |
| 934 | list_for_each_entry(sub, &event->sibling_list, group_entry) { | 1087 | list_for_each_entry(sub, &event->sibling_list, group_entry) { |
| 935 | if (sub->state >= PERF_EVENT_STATE_INACTIVE) { | 1088 | if (sub->state >= PERF_EVENT_STATE_INACTIVE) |
| 936 | sub->tstamp_enabled = | 1089 | sub->tstamp_enabled = tstamp - sub->total_time_enabled; |
| 937 | ctx->time - sub->total_time_enabled; | ||
| 938 | } | ||
| 939 | } | 1090 | } |
| 940 | } | 1091 | } |
| 941 | 1092 | ||
| @@ -968,7 +1119,7 @@ static void __perf_event_enable(void *info) | |||
| 968 | goto unlock; | 1119 | goto unlock; |
| 969 | __perf_event_mark_enabled(event, ctx); | 1120 | __perf_event_mark_enabled(event, ctx); |
| 970 | 1121 | ||
| 971 | if (event->cpu != -1 && event->cpu != smp_processor_id()) | 1122 | if (!event_filter_match(event)) |
| 972 | goto unlock; | 1123 | goto unlock; |
| 973 | 1124 | ||
| 974 | /* | 1125 | /* |
| @@ -1070,7 +1221,7 @@ static int perf_event_refresh(struct perf_event *event, int refresh) | |||
| 1070 | /* | 1221 | /* |
| 1071 | * not supported on inherited events | 1222 | * not supported on inherited events |
| 1072 | */ | 1223 | */ |
| 1073 | if (event->attr.inherit) | 1224 | if (event->attr.inherit || !is_sampling_event(event)) |
| 1074 | return -EINVAL; | 1225 | return -EINVAL; |
| 1075 | 1226 | ||
| 1076 | atomic_add(refresh, &event->event_limit); | 1227 | atomic_add(refresh, &event->event_limit); |
| @@ -1079,12 +1230,6 @@ static int perf_event_refresh(struct perf_event *event, int refresh) | |||
| 1079 | return 0; | 1230 | return 0; |
| 1080 | } | 1231 | } |
| 1081 | 1232 | ||
| 1082 | enum event_type_t { | ||
| 1083 | EVENT_FLEXIBLE = 0x1, | ||
| 1084 | EVENT_PINNED = 0x2, | ||
| 1085 | EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, | ||
| 1086 | }; | ||
| 1087 | |||
| 1088 | static void ctx_sched_out(struct perf_event_context *ctx, | 1233 | static void ctx_sched_out(struct perf_event_context *ctx, |
| 1089 | struct perf_cpu_context *cpuctx, | 1234 | struct perf_cpu_context *cpuctx, |
| 1090 | enum event_type_t event_type) | 1235 | enum event_type_t event_type) |
| @@ -1284,8 +1429,6 @@ void __perf_event_task_sched_out(struct task_struct *task, | |||
| 1284 | { | 1429 | { |
| 1285 | int ctxn; | 1430 | int ctxn; |
| 1286 | 1431 | ||
| 1287 | perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0); | ||
| 1288 | |||
| 1289 | for_each_task_context_nr(ctxn) | 1432 | for_each_task_context_nr(ctxn) |
| 1290 | perf_event_context_sched_out(task, ctxn, next); | 1433 | perf_event_context_sched_out(task, ctxn, next); |
| 1291 | } | 1434 | } |
| @@ -1323,7 +1466,7 @@ ctx_pinned_sched_in(struct perf_event_context *ctx, | |||
| 1323 | list_for_each_entry(event, &ctx->pinned_groups, group_entry) { | 1466 | list_for_each_entry(event, &ctx->pinned_groups, group_entry) { |
| 1324 | if (event->state <= PERF_EVENT_STATE_OFF) | 1467 | if (event->state <= PERF_EVENT_STATE_OFF) |
| 1325 | continue; | 1468 | continue; |
| 1326 | if (event->cpu != -1 && event->cpu != smp_processor_id()) | 1469 | if (!event_filter_match(event)) |
| 1327 | continue; | 1470 | continue; |
| 1328 | 1471 | ||
| 1329 | if (group_can_go_on(event, cpuctx, 1)) | 1472 | if (group_can_go_on(event, cpuctx, 1)) |
| @@ -1355,7 +1498,7 @@ ctx_flexible_sched_in(struct perf_event_context *ctx, | |||
| 1355 | * Listen to the 'cpu' scheduling filter constraint | 1498 | * Listen to the 'cpu' scheduling filter constraint |
| 1356 | * of events: | 1499 | * of events: |
| 1357 | */ | 1500 | */ |
| 1358 | if (event->cpu != -1 && event->cpu != smp_processor_id()) | 1501 | if (!event_filter_match(event)) |
| 1359 | continue; | 1502 | continue; |
| 1360 | 1503 | ||
| 1361 | if (group_can_go_on(event, cpuctx, can_add_hw)) { | 1504 | if (group_can_go_on(event, cpuctx, can_add_hw)) { |
| @@ -1468,10 +1611,6 @@ void __perf_event_task_sched_in(struct task_struct *task) | |||
| 1468 | } | 1611 | } |
| 1469 | } | 1612 | } |
| 1470 | 1613 | ||
| 1471 | #define MAX_INTERRUPTS (~0ULL) | ||
| 1472 | |||
| 1473 | static void perf_log_throttle(struct perf_event *event, int enable); | ||
| 1474 | |||
| 1475 | static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) | 1614 | static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) |
| 1476 | { | 1615 | { |
| 1477 | u64 frequency = event->attr.sample_freq; | 1616 | u64 frequency = event->attr.sample_freq; |
| @@ -1582,7 +1721,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period) | |||
| 1582 | if (event->state != PERF_EVENT_STATE_ACTIVE) | 1721 | if (event->state != PERF_EVENT_STATE_ACTIVE) |
| 1583 | continue; | 1722 | continue; |
| 1584 | 1723 | ||
| 1585 | if (event->cpu != -1 && event->cpu != smp_processor_id()) | 1724 | if (!event_filter_match(event)) |
| 1586 | continue; | 1725 | continue; |
| 1587 | 1726 | ||
| 1588 | hwc = &event->hw; | 1727 | hwc = &event->hw; |
| @@ -1619,8 +1758,12 @@ static void rotate_ctx(struct perf_event_context *ctx) | |||
| 1619 | { | 1758 | { |
| 1620 | raw_spin_lock(&ctx->lock); | 1759 | raw_spin_lock(&ctx->lock); |
| 1621 | 1760 | ||
| 1622 | /* Rotate the first entry last of non-pinned groups */ | 1761 | /* |
| 1623 | list_rotate_left(&ctx->flexible_groups); | 1762 | * Rotate the first entry last of non-pinned groups. Rotation might be |
| 1763 | * disabled by the inheritance code. | ||
| 1764 | */ | ||
| 1765 | if (!ctx->rotate_disable) | ||
| 1766 | list_rotate_left(&ctx->flexible_groups); | ||
| 1624 | 1767 | ||
| 1625 | raw_spin_unlock(&ctx->lock); | 1768 | raw_spin_unlock(&ctx->lock); |
| 1626 | } | 1769 | } |
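
Rotating the flexible groups is what gives every non-pinned group a turn on the PMU when more events exist than there are hardware counters; the new rotate_disable flag merely pauses this while the inheritance code walks the list. A minimal model of what list_rotate_left() does, using a plain array:

#include <stdio.h>

/* move element 0 to the end, shifting the rest left by one */
static void rotate_left(int *groups, int n)
{
        int i, first = groups[0];

        for (i = 0; i < n - 1; i++)
                groups[i] = groups[i + 1];
        groups[n - 1] = first;
}

int main(void)
{
        int groups[] = { 1, 2, 3, 4 };  /* stand-ins for flexible groups */
        int tick, i;

        for (tick = 0; tick < 3; tick++) {
                rotate_left(groups, 4);
                for (i = 0; i < 4; i++)
                        printf("%d ", groups[i]);
                printf(" /* after tick %d */\n", tick + 1);
        }
        return 0;
}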
| @@ -1769,11 +1912,12 @@ static void __perf_event_read(void *info) | |||
| 1769 | return; | 1912 | return; |
| 1770 | 1913 | ||
| 1771 | raw_spin_lock(&ctx->lock); | 1914 | raw_spin_lock(&ctx->lock); |
| 1772 | update_context_time(ctx); | 1915 | if (ctx->is_active) |
| 1916 | update_context_time(ctx); | ||
| 1773 | update_event_times(event); | 1917 | update_event_times(event); |
| 1918 | if (event->state == PERF_EVENT_STATE_ACTIVE) | ||
| 1919 | event->pmu->read(event); | ||
| 1774 | raw_spin_unlock(&ctx->lock); | 1920 | raw_spin_unlock(&ctx->lock); |
| 1775 | |||
| 1776 | event->pmu->read(event); | ||
| 1777 | } | 1921 | } |
| 1778 | 1922 | ||
| 1779 | static inline u64 perf_event_count(struct perf_event *event) | 1923 | static inline u64 perf_event_count(struct perf_event *event) |
| @@ -1867,8 +2011,7 @@ static int alloc_callchain_buffers(void) | |||
| 1867 | * accessed from NMI. Use a temporary manual per cpu allocation | 2011 | * accessed from NMI. Use a temporary manual per cpu allocation |
| 1868 | * until that gets sorted out. | 2012 | * until that gets sorted out. |
| 1869 | */ | 2013 | */ |
| 1870 | size = sizeof(*entries) + sizeof(struct perf_callchain_entry *) * | 2014 | size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]); |
| 1871 | num_possible_cpus(); | ||
| 1872 | 2015 | ||
| 1873 | entries = kzalloc(size, GFP_KERNEL); | 2016 | entries = kzalloc(size, GFP_KERNEL); |
| 1874 | if (!entries) | 2017 | if (!entries) |
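
The offsetof() form expresses the allocation as "header plus nr_cpu_ids trailing pointers" in a single expression, and switches the count from num_possible_cpus() to nr_cpu_ids, which presumably avoids under-allocating when possible CPU ids are sparse. A standalone illustration of the sizing idiom with an invented struct (the variable index inside offsetof() relies on the GCC/Clang builtin, as the kernel does):

#include <stdio.h>
#include <stddef.h>
#include <stdlib.h>

struct entries {
        int   refcount;             /* invented header field  */
        void *cpu_entries[];        /* flexible array member  */
};

int main(void)
{
        int nr_cpu_ids = 8;         /* assumed value for the example */

        /* header + nr_cpu_ids pointers, padding accounted for automatically */
        size_t size = offsetof(struct entries, cpu_entries[nr_cpu_ids]);
        struct entries *e = calloc(1, size);

        printf("allocating %zu bytes for %d slots\n", size, nr_cpu_ids);
        free(e);
        return 0;
}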
| @@ -2069,13 +2212,6 @@ find_lively_task_by_vpid(pid_t vpid) | |||
| 2069 | if (!task) | 2212 | if (!task) |
| 2070 | return ERR_PTR(-ESRCH); | 2213 | return ERR_PTR(-ESRCH); |
| 2071 | 2214 | ||
| 2072 | /* | ||
| 2073 | * Can't attach events to a dying task. | ||
| 2074 | */ | ||
| 2075 | err = -ESRCH; | ||
| 2076 | if (task->flags & PF_EXITING) | ||
| 2077 | goto errout; | ||
| 2078 | |||
| 2079 | /* Reuse ptrace permission checks for now. */ | 2215 | /* Reuse ptrace permission checks for now. */ |
| 2080 | err = -EACCES; | 2216 | err = -EACCES; |
| 2081 | if (!ptrace_may_access(task, PTRACE_MODE_READ)) | 2217 | if (!ptrace_may_access(task, PTRACE_MODE_READ)) |
| @@ -2096,14 +2232,11 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) | |||
| 2096 | unsigned long flags; | 2232 | unsigned long flags; |
| 2097 | int ctxn, err; | 2233 | int ctxn, err; |
| 2098 | 2234 | ||
| 2099 | if (!task && cpu != -1) { | 2235 | if (!task) { |
| 2100 | /* Must be root to operate on a CPU event: */ | 2236 | /* Must be root to operate on a CPU event: */ |
| 2101 | if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) | 2237 | if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) |
| 2102 | return ERR_PTR(-EACCES); | 2238 | return ERR_PTR(-EACCES); |
| 2103 | 2239 | ||
| 2104 | if (cpu < 0 || cpu >= nr_cpumask_bits) | ||
| 2105 | return ERR_PTR(-EINVAL); | ||
| 2106 | |||
| 2107 | /* | 2240 | /* |
| 2108 | * We could be clever and allow to attach a event to an | 2241 | * We could be clever and allow to attach a event to an |
| 2109 | * offline CPU and activate it when the CPU comes up, but | 2242 | * offline CPU and activate it when the CPU comes up, but |
| @@ -2139,14 +2272,27 @@ retry: | |||
| 2139 | 2272 | ||
| 2140 | get_ctx(ctx); | 2273 | get_ctx(ctx); |
| 2141 | 2274 | ||
| 2142 | if (cmpxchg(&task->perf_event_ctxp[ctxn], NULL, ctx)) { | 2275 | err = 0; |
| 2143 | /* | 2276 | mutex_lock(&task->perf_event_mutex); |
| 2144 | * We raced with some other task; use | 2277 | /* |
| 2145 | * the context they set. | 2278 | * If it has already passed perf_event_exit_task(). |
| 2146 | */ | 2279 | * we must see PF_EXITING, it takes this mutex too. |
| 2280 | */ | ||
| 2281 | if (task->flags & PF_EXITING) | ||
| 2282 | err = -ESRCH; | ||
| 2283 | else if (task->perf_event_ctxp[ctxn]) | ||
| 2284 | err = -EAGAIN; | ||
| 2285 | else | ||
| 2286 | rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx); | ||
| 2287 | mutex_unlock(&task->perf_event_mutex); | ||
| 2288 | |||
| 2289 | if (unlikely(err)) { | ||
| 2147 | put_task_struct(task); | 2290 | put_task_struct(task); |
| 2148 | kfree(ctx); | 2291 | kfree(ctx); |
| 2149 | goto retry; | 2292 | |
| 2293 | if (err == -EAGAIN) | ||
| 2294 | goto retry; | ||
| 2295 | goto errout; | ||
| 2150 | } | 2296 | } |
| 2151 | } | 2297 | } |
| 2152 | 2298 | ||
| @@ -2232,11 +2378,6 @@ int perf_event_release_kernel(struct perf_event *event) | |||
| 2232 | raw_spin_unlock_irq(&ctx->lock); | 2378 | raw_spin_unlock_irq(&ctx->lock); |
| 2233 | mutex_unlock(&ctx->mutex); | 2379 | mutex_unlock(&ctx->mutex); |
| 2234 | 2380 | ||
| 2235 | mutex_lock(&event->owner->perf_event_mutex); | ||
| 2236 | list_del_init(&event->owner_entry); | ||
| 2237 | mutex_unlock(&event->owner->perf_event_mutex); | ||
| 2238 | put_task_struct(event->owner); | ||
| 2239 | |||
| 2240 | free_event(event); | 2381 | free_event(event); |
| 2241 | 2382 | ||
| 2242 | return 0; | 2383 | return 0; |
| @@ -2249,35 +2390,44 @@ EXPORT_SYMBOL_GPL(perf_event_release_kernel); | |||
| 2249 | static int perf_release(struct inode *inode, struct file *file) | 2390 | static int perf_release(struct inode *inode, struct file *file) |
| 2250 | { | 2391 | { |
| 2251 | struct perf_event *event = file->private_data; | 2392 | struct perf_event *event = file->private_data; |
| 2393 | struct task_struct *owner; | ||
| 2252 | 2394 | ||
| 2253 | file->private_data = NULL; | 2395 | file->private_data = NULL; |
| 2254 | 2396 | ||
| 2255 | return perf_event_release_kernel(event); | 2397 | rcu_read_lock(); |
| 2256 | } | 2398 | owner = ACCESS_ONCE(event->owner); |
| 2257 | 2399 | /* | |
| 2258 | static int perf_event_read_size(struct perf_event *event) | 2400 | * Matches the smp_wmb() in perf_event_exit_task(). If we observe |
| 2259 | { | 2401 | * !owner it means the list deletion is complete and we can indeed |
| 2260 | int entry = sizeof(u64); /* value */ | 2402 | * free this event, otherwise we need to serialize on |
| 2261 | int size = 0; | 2403 | * owner->perf_event_mutex. |
| 2262 | int nr = 1; | 2404 | */ |
| 2263 | 2405 | smp_read_barrier_depends(); | |
| 2264 | if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) | 2406 | if (owner) { |
| 2265 | size += sizeof(u64); | 2407 | /* |
| 2266 | 2408 | * Since delayed_put_task_struct() also drops the last | |
| 2267 | if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) | 2409 | * task reference we can safely take a new reference |
| 2268 | size += sizeof(u64); | 2410 | * while holding the rcu_read_lock(). |
| 2269 | 2411 | */ | |
| 2270 | if (event->attr.read_format & PERF_FORMAT_ID) | 2412 | get_task_struct(owner); |
| 2271 | entry += sizeof(u64); | ||
| 2272 | |||
| 2273 | if (event->attr.read_format & PERF_FORMAT_GROUP) { | ||
| 2274 | nr += event->group_leader->nr_siblings; | ||
| 2275 | size += sizeof(u64); | ||
| 2276 | } | 2413 | } |
| 2414 | rcu_read_unlock(); | ||
| 2277 | 2415 | ||
| 2278 | size += entry * nr; | 2416 | if (owner) { |
| 2417 | mutex_lock(&owner->perf_event_mutex); | ||
| 2418 | /* | ||
| 2419 | * We have to re-check the event->owner field, if it is cleared | ||
| 2420 | * we raced with perf_event_exit_task(), acquiring the mutex | ||
| 2421 | * ensured they're done, and we can proceed with freeing the | ||
| 2422 | * event. | ||
| 2423 | */ | ||
| 2424 | if (event->owner) | ||
| 2425 | list_del_init(&event->owner_entry); | ||
| 2426 | mutex_unlock(&owner->perf_event_mutex); | ||
| 2427 | put_task_struct(owner); | ||
| 2428 | } | ||
| 2279 | 2429 | ||
| 2280 | return size; | 2430 | return perf_event_release_kernel(event); |
| 2281 | } | 2431 | } |
| 2282 | 2432 | ||
| 2283 | u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) | 2433 | u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) |
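
The comments above describe a snapshot-then-revalidate pattern: grab the owner pointer while it is guaranteed live, take a reference, then re-check event->owner under the owner's perf_event_mutex, because the exit path clears it under that same mutex. The sketch below models only the revalidation step in userspace, with a pthread mutex standing in for perf_event_mutex and no attempt to model RCU:

#include <pthread.h>
#include <stdio.h>

struct owner { pthread_mutex_t lock; };
struct event { struct owner *owner; };   /* cleared by the "exit" path */

/* release path: re-check the pointer under the lock before trusting it */
static void release(struct event *ev, struct owner *snap)
{
        pthread_mutex_lock(&snap->lock);
        if (ev->owner)          /* still ours: exit path has not run */
                printf("unlinking from owner list\n");
        else                    /* raced with exit: it already unlinked us */
                printf("exit path won the race\n");
        pthread_mutex_unlock(&snap->lock);
}

int main(void)
{
        struct owner o;
        struct event ev;
        struct owner *snap;

        pthread_mutex_init(&o.lock, NULL);
        ev.owner = &o;
        snap = ev.owner;        /* snapshot (the kernel does this under RCU) */
        release(&ev, snap);
        return 0;
}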
| @@ -2394,7 +2544,7 @@ perf_read_hw(struct perf_event *event, char __user *buf, size_t count) | |||
| 2394 | if (event->state == PERF_EVENT_STATE_ERROR) | 2544 | if (event->state == PERF_EVENT_STATE_ERROR) |
| 2395 | return 0; | 2545 | return 0; |
| 2396 | 2546 | ||
| 2397 | if (count < perf_event_read_size(event)) | 2547 | if (count < event->read_size) |
| 2398 | return -ENOSPC; | 2548 | return -ENOSPC; |
| 2399 | 2549 | ||
| 2400 | WARN_ON_ONCE(event->ctx->parent_ctx); | 2550 | WARN_ON_ONCE(event->ctx->parent_ctx); |
| @@ -2480,7 +2630,7 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg) | |||
| 2480 | int ret = 0; | 2630 | int ret = 0; |
| 2481 | u64 value; | 2631 | u64 value; |
| 2482 | 2632 | ||
| 2483 | if (!event->attr.sample_period) | 2633 | if (!is_sampling_event(event)) |
| 2484 | return -EINVAL; | 2634 | return -EINVAL; |
| 2485 | 2635 | ||
| 2486 | if (copy_from_user(&value, arg, sizeof(value))) | 2636 | if (copy_from_user(&value, arg, sizeof(value))) |
| @@ -3271,6 +3421,73 @@ __always_inline void perf_output_copy(struct perf_output_handle *handle, | |||
| 3271 | } while (len); | 3421 | } while (len); |
| 3272 | } | 3422 | } |
| 3273 | 3423 | ||
| 3424 | static void __perf_event_header__init_id(struct perf_event_header *header, | ||
| 3425 | struct perf_sample_data *data, | ||
| 3426 | struct perf_event *event) | ||
| 3427 | { | ||
| 3428 | u64 sample_type = event->attr.sample_type; | ||
| 3429 | |||
| 3430 | data->type = sample_type; | ||
| 3431 | header->size += event->id_header_size; | ||
| 3432 | |||
| 3433 | if (sample_type & PERF_SAMPLE_TID) { | ||
| 3434 | /* namespace issues */ | ||
| 3435 | data->tid_entry.pid = perf_event_pid(event, current); | ||
| 3436 | data->tid_entry.tid = perf_event_tid(event, current); | ||
| 3437 | } | ||
| 3438 | |||
| 3439 | if (sample_type & PERF_SAMPLE_TIME) | ||
| 3440 | data->time = perf_clock(); | ||
| 3441 | |||
| 3442 | if (sample_type & PERF_SAMPLE_ID) | ||
| 3443 | data->id = primary_event_id(event); | ||
| 3444 | |||
| 3445 | if (sample_type & PERF_SAMPLE_STREAM_ID) | ||
| 3446 | data->stream_id = event->id; | ||
| 3447 | |||
| 3448 | if (sample_type & PERF_SAMPLE_CPU) { | ||
| 3449 | data->cpu_entry.cpu = raw_smp_processor_id(); | ||
| 3450 | data->cpu_entry.reserved = 0; | ||
| 3451 | } | ||
| 3452 | } | ||
| 3453 | |||
| 3454 | static void perf_event_header__init_id(struct perf_event_header *header, | ||
| 3455 | struct perf_sample_data *data, | ||
| 3456 | struct perf_event *event) | ||
| 3457 | { | ||
| 3458 | if (event->attr.sample_id_all) | ||
| 3459 | __perf_event_header__init_id(header, data, event); | ||
| 3460 | } | ||
| 3461 | |||
| 3462 | static void __perf_event__output_id_sample(struct perf_output_handle *handle, | ||
| 3463 | struct perf_sample_data *data) | ||
| 3464 | { | ||
| 3465 | u64 sample_type = data->type; | ||
| 3466 | |||
| 3467 | if (sample_type & PERF_SAMPLE_TID) | ||
| 3468 | perf_output_put(handle, data->tid_entry); | ||
| 3469 | |||
| 3470 | if (sample_type & PERF_SAMPLE_TIME) | ||
| 3471 | perf_output_put(handle, data->time); | ||
| 3472 | |||
| 3473 | if (sample_type & PERF_SAMPLE_ID) | ||
| 3474 | perf_output_put(handle, data->id); | ||
| 3475 | |||
| 3476 | if (sample_type & PERF_SAMPLE_STREAM_ID) | ||
| 3477 | perf_output_put(handle, data->stream_id); | ||
| 3478 | |||
| 3479 | if (sample_type & PERF_SAMPLE_CPU) | ||
| 3480 | perf_output_put(handle, data->cpu_entry); | ||
| 3481 | } | ||
| 3482 | |||
| 3483 | static void perf_event__output_id_sample(struct perf_event *event, | ||
| 3484 | struct perf_output_handle *handle, | ||
| 3485 | struct perf_sample_data *sample) | ||
| 3486 | { | ||
| 3487 | if (event->attr.sample_id_all) | ||
| 3488 | __perf_event__output_id_sample(handle, sample); | ||
| 3489 | } | ||
| 3490 | |||
| 3274 | int perf_output_begin(struct perf_output_handle *handle, | 3491 | int perf_output_begin(struct perf_output_handle *handle, |
| 3275 | struct perf_event *event, unsigned int size, | 3492 | struct perf_event *event, unsigned int size, |
| 3276 | int nmi, int sample) | 3493 | int nmi, int sample) |
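
Together, perf_event_header__init_id() and perf_event__output_id_sample() implement sample_id_all: when the attribute is set, every non-sample record (lost, read, task, ...) is extended by id_header_size bytes, written in the same field order in which the header was sized. A quick standalone tally of that trailer for one assumed sample_type; the S_* bits and field sizes are modelled locally, not taken from the UAPI header:

#include <stdio.h>
#include <stdint.h>

#define S_TID        (1U << 1)
#define S_TIME       (1U << 2)
#define S_ID         (1U << 6)
#define S_CPU        (1U << 7)
#define S_STREAM_ID  (1U << 9)

static unsigned int id_trailer_bytes(unsigned int sample_type)
{
        unsigned int size = 0;

        if (sample_type & S_TID)       size += 2 * sizeof(uint32_t); /* pid,tid */
        if (sample_type & S_TIME)      size += sizeof(uint64_t);
        if (sample_type & S_ID)        size += sizeof(uint64_t);
        if (sample_type & S_STREAM_ID) size += sizeof(uint64_t);
        if (sample_type & S_CPU)       size += 2 * sizeof(uint32_t); /* cpu,res */
        return size;
}

int main(void)
{
        /* TID + TIME + CPU: 8 + 8 + 8 = 24 bytes appended per record */
        printf("%u\n", id_trailer_bytes(S_TID | S_TIME | S_CPU));
        return 0;
}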
| @@ -3278,6 +3495,7 @@ int perf_output_begin(struct perf_output_handle *handle, | |||
| 3278 | struct perf_buffer *buffer; | 3495 | struct perf_buffer *buffer; |
| 3279 | unsigned long tail, offset, head; | 3496 | unsigned long tail, offset, head; |
| 3280 | int have_lost; | 3497 | int have_lost; |
| 3498 | struct perf_sample_data sample_data; | ||
| 3281 | struct { | 3499 | struct { |
| 3282 | struct perf_event_header header; | 3500 | struct perf_event_header header; |
| 3283 | u64 id; | 3501 | u64 id; |
| @@ -3304,8 +3522,12 @@ int perf_output_begin(struct perf_output_handle *handle, | |||
| 3304 | goto out; | 3522 | goto out; |
| 3305 | 3523 | ||
| 3306 | have_lost = local_read(&buffer->lost); | 3524 | have_lost = local_read(&buffer->lost); |
| 3307 | if (have_lost) | 3525 | if (have_lost) { |
| 3308 | size += sizeof(lost_event); | 3526 | lost_event.header.size = sizeof(lost_event); |
| 3527 | perf_event_header__init_id(&lost_event.header, &sample_data, | ||
| 3528 | event); | ||
| 3529 | size += lost_event.header.size; | ||
| 3530 | } | ||
| 3309 | 3531 | ||
| 3310 | perf_output_get_handle(handle); | 3532 | perf_output_get_handle(handle); |
| 3311 | 3533 | ||
| @@ -3336,11 +3558,11 @@ int perf_output_begin(struct perf_output_handle *handle, | |||
| 3336 | if (have_lost) { | 3558 | if (have_lost) { |
| 3337 | lost_event.header.type = PERF_RECORD_LOST; | 3559 | lost_event.header.type = PERF_RECORD_LOST; |
| 3338 | lost_event.header.misc = 0; | 3560 | lost_event.header.misc = 0; |
| 3339 | lost_event.header.size = sizeof(lost_event); | ||
| 3340 | lost_event.id = event->id; | 3561 | lost_event.id = event->id; |
| 3341 | lost_event.lost = local_xchg(&buffer->lost, 0); | 3562 | lost_event.lost = local_xchg(&buffer->lost, 0); |
| 3342 | 3563 | ||
| 3343 | perf_output_put(handle, lost_event); | 3564 | perf_output_put(handle, lost_event); |
| 3565 | perf_event__output_id_sample(event, handle, &sample_data); | ||
| 3344 | } | 3566 | } |
| 3345 | 3567 | ||
| 3346 | return 0; | 3568 | return 0; |
| @@ -3373,30 +3595,9 @@ void perf_output_end(struct perf_output_handle *handle) | |||
| 3373 | rcu_read_unlock(); | 3595 | rcu_read_unlock(); |
| 3374 | } | 3596 | } |
| 3375 | 3597 | ||
| 3376 | static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) | ||
| 3377 | { | ||
| 3378 | /* | ||
| 3379 | * only top level events have the pid namespace they were created in | ||
| 3380 | */ | ||
| 3381 | if (event->parent) | ||
| 3382 | event = event->parent; | ||
| 3383 | |||
| 3384 | return task_tgid_nr_ns(p, event->ns); | ||
| 3385 | } | ||
| 3386 | |||
| 3387 | static u32 perf_event_tid(struct perf_event *event, struct task_struct *p) | ||
| 3388 | { | ||
| 3389 | /* | ||
| 3390 | * only top level events have the pid namespace they were created in | ||
| 3391 | */ | ||
| 3392 | if (event->parent) | ||
| 3393 | event = event->parent; | ||
| 3394 | |||
| 3395 | return task_pid_nr_ns(p, event->ns); | ||
| 3396 | } | ||
| 3397 | |||
| 3398 | static void perf_output_read_one(struct perf_output_handle *handle, | 3598 | static void perf_output_read_one(struct perf_output_handle *handle, |
| 3399 | struct perf_event *event) | 3599 | struct perf_event *event, |
| 3600 | u64 enabled, u64 running) | ||
| 3400 | { | 3601 | { |
| 3401 | u64 read_format = event->attr.read_format; | 3602 | u64 read_format = event->attr.read_format; |
| 3402 | u64 values[4]; | 3603 | u64 values[4]; |
| @@ -3404,11 +3605,11 @@ static void perf_output_read_one(struct perf_output_handle *handle, | |||
| 3404 | 3605 | ||
| 3405 | values[n++] = perf_event_count(event); | 3606 | values[n++] = perf_event_count(event); |
| 3406 | if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { | 3607 | if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { |
| 3407 | values[n++] = event->total_time_enabled + | 3608 | values[n++] = enabled + |
| 3408 | atomic64_read(&event->child_total_time_enabled); | 3609 | atomic64_read(&event->child_total_time_enabled); |
| 3409 | } | 3610 | } |
| 3410 | if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { | 3611 | if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { |
| 3411 | values[n++] = event->total_time_running + | 3612 | values[n++] = running + |
| 3412 | atomic64_read(&event->child_total_time_running); | 3613 | atomic64_read(&event->child_total_time_running); |
| 3413 | } | 3614 | } |
| 3414 | if (read_format & PERF_FORMAT_ID) | 3615 | if (read_format & PERF_FORMAT_ID) |
| @@ -3421,7 +3622,8 @@ static void perf_output_read_one(struct perf_output_handle *handle, | |||
| 3421 | * XXX PERF_FORMAT_GROUP vs inherited events seems difficult. | 3622 | * XXX PERF_FORMAT_GROUP vs inherited events seems difficult. |
| 3422 | */ | 3623 | */ |
| 3423 | static void perf_output_read_group(struct perf_output_handle *handle, | 3624 | static void perf_output_read_group(struct perf_output_handle *handle, |
| 3424 | struct perf_event *event) | 3625 | struct perf_event *event, |
| 3626 | u64 enabled, u64 running) | ||
| 3425 | { | 3627 | { |
| 3426 | struct perf_event *leader = event->group_leader, *sub; | 3628 | struct perf_event *leader = event->group_leader, *sub; |
| 3427 | u64 read_format = event->attr.read_format; | 3629 | u64 read_format = event->attr.read_format; |
| @@ -3431,10 +3633,10 @@ static void perf_output_read_group(struct perf_output_handle *handle, | |||
| 3431 | values[n++] = 1 + leader->nr_siblings; | 3633 | values[n++] = 1 + leader->nr_siblings; |
| 3432 | 3634 | ||
| 3433 | if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) | 3635 | if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) |
| 3434 | values[n++] = leader->total_time_enabled; | 3636 | values[n++] = enabled; |
| 3435 | 3637 | ||
| 3436 | if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) | 3638 | if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) |
| 3437 | values[n++] = leader->total_time_running; | 3639 | values[n++] = running; |
| 3438 | 3640 | ||
| 3439 | if (leader != event) | 3641 | if (leader != event) |
| 3440 | leader->pmu->read(leader); | 3642 | leader->pmu->read(leader); |
| @@ -3459,13 +3661,35 @@ static void perf_output_read_group(struct perf_output_handle *handle, | |||
| 3459 | } | 3661 | } |
| 3460 | } | 3662 | } |
| 3461 | 3663 | ||
| 3664 | #define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\ | ||
| 3665 | PERF_FORMAT_TOTAL_TIME_RUNNING) | ||
| 3666 | |||
| 3462 | static void perf_output_read(struct perf_output_handle *handle, | 3667 | static void perf_output_read(struct perf_output_handle *handle, |
| 3463 | struct perf_event *event) | 3668 | struct perf_event *event) |
| 3464 | { | 3669 | { |
| 3670 | u64 enabled = 0, running = 0, now, ctx_time; | ||
| 3671 | u64 read_format = event->attr.read_format; | ||
| 3672 | |||
| 3673 | /* | ||
| 3674 | * compute total_time_enabled, total_time_running | ||
| 3675 | * based on snapshot values taken when the event | ||
| 3676 | * was last scheduled in. | ||
| 3677 | * | ||
| 3678 | * we cannot simply call update_context_time() | ||
| 3679 | * because of locking issues, as we are called in | ||
| 3680 | * NMI context | ||
| 3681 | */ | ||
| 3682 | if (read_format & PERF_FORMAT_TOTAL_TIMES) { | ||
| 3683 | now = perf_clock(); | ||
| 3684 | ctx_time = event->shadow_ctx_time + now; | ||
| 3685 | enabled = ctx_time - event->tstamp_enabled; | ||
| 3686 | running = ctx_time - event->tstamp_running; | ||
| 3687 | } | ||
| 3688 | |||
| 3465 | if (event->attr.read_format & PERF_FORMAT_GROUP) | 3689 | if (event->attr.read_format & PERF_FORMAT_GROUP) |
| 3466 | perf_output_read_group(handle, event); | 3690 | perf_output_read_group(handle, event, enabled, running); |
| 3467 | else | 3691 | else |
| 3468 | perf_output_read_one(handle, event); | 3692 | perf_output_read_one(handle, event, enabled, running); |
| 3469 | } | 3693 | } |
| 3470 | 3694 | ||
| 3471 | void perf_output_sample(struct perf_output_handle *handle, | 3695 | void perf_output_sample(struct perf_output_handle *handle, |
| @@ -3545,61 +3769,16 @@ void perf_prepare_sample(struct perf_event_header *header, | |||
| 3545 | { | 3769 | { |
| 3546 | u64 sample_type = event->attr.sample_type; | 3770 | u64 sample_type = event->attr.sample_type; |
| 3547 | 3771 | ||
| 3548 | data->type = sample_type; | ||
| 3549 | |||
| 3550 | header->type = PERF_RECORD_SAMPLE; | 3772 | header->type = PERF_RECORD_SAMPLE; |
| 3551 | header->size = sizeof(*header); | 3773 | header->size = sizeof(*header) + event->header_size; |
| 3552 | 3774 | ||
| 3553 | header->misc = 0; | 3775 | header->misc = 0; |
| 3554 | header->misc |= perf_misc_flags(regs); | 3776 | header->misc |= perf_misc_flags(regs); |
| 3555 | 3777 | ||
| 3556 | if (sample_type & PERF_SAMPLE_IP) { | 3778 | __perf_event_header__init_id(header, data, event); |
| 3557 | data->ip = perf_instruction_pointer(regs); | ||
| 3558 | |||
| 3559 | header->size += sizeof(data->ip); | ||
| 3560 | } | ||
| 3561 | |||
| 3562 | if (sample_type & PERF_SAMPLE_TID) { | ||
| 3563 | /* namespace issues */ | ||
| 3564 | data->tid_entry.pid = perf_event_pid(event, current); | ||
| 3565 | data->tid_entry.tid = perf_event_tid(event, current); | ||
| 3566 | |||
| 3567 | header->size += sizeof(data->tid_entry); | ||
| 3568 | } | ||
| 3569 | |||
| 3570 | if (sample_type & PERF_SAMPLE_TIME) { | ||
| 3571 | data->time = perf_clock(); | ||
| 3572 | |||
| 3573 | header->size += sizeof(data->time); | ||
| 3574 | } | ||
| 3575 | |||
| 3576 | if (sample_type & PERF_SAMPLE_ADDR) | ||
| 3577 | header->size += sizeof(data->addr); | ||
| 3578 | |||
| 3579 | if (sample_type & PERF_SAMPLE_ID) { | ||
| 3580 | data->id = primary_event_id(event); | ||
| 3581 | |||
| 3582 | header->size += sizeof(data->id); | ||
| 3583 | } | ||
| 3584 | |||
| 3585 | if (sample_type & PERF_SAMPLE_STREAM_ID) { | ||
| 3586 | data->stream_id = event->id; | ||
| 3587 | |||
| 3588 | header->size += sizeof(data->stream_id); | ||
| 3589 | } | ||
| 3590 | |||
| 3591 | if (sample_type & PERF_SAMPLE_CPU) { | ||
| 3592 | data->cpu_entry.cpu = raw_smp_processor_id(); | ||
| 3593 | data->cpu_entry.reserved = 0; | ||
| 3594 | |||
| 3595 | header->size += sizeof(data->cpu_entry); | ||
| 3596 | } | ||
| 3597 | |||
| 3598 | if (sample_type & PERF_SAMPLE_PERIOD) | ||
| 3599 | header->size += sizeof(data->period); | ||
| 3600 | 3779 | ||
| 3601 | if (sample_type & PERF_SAMPLE_READ) | 3780 | if (sample_type & PERF_SAMPLE_IP) |
| 3602 | header->size += perf_event_read_size(event); | 3781 | data->ip = perf_instruction_pointer(regs); |
| 3603 | 3782 | ||
| 3604 | if (sample_type & PERF_SAMPLE_CALLCHAIN) { | 3783 | if (sample_type & PERF_SAMPLE_CALLCHAIN) { |
| 3605 | int size = 1; | 3784 | int size = 1; |
| @@ -3664,23 +3843,26 @@ perf_event_read_event(struct perf_event *event, | |||
| 3664 | struct task_struct *task) | 3843 | struct task_struct *task) |
| 3665 | { | 3844 | { |
| 3666 | struct perf_output_handle handle; | 3845 | struct perf_output_handle handle; |
| 3846 | struct perf_sample_data sample; | ||
| 3667 | struct perf_read_event read_event = { | 3847 | struct perf_read_event read_event = { |
| 3668 | .header = { | 3848 | .header = { |
| 3669 | .type = PERF_RECORD_READ, | 3849 | .type = PERF_RECORD_READ, |
| 3670 | .misc = 0, | 3850 | .misc = 0, |
| 3671 | .size = sizeof(read_event) + perf_event_read_size(event), | 3851 | .size = sizeof(read_event) + event->read_size, |
| 3672 | }, | 3852 | }, |
| 3673 | .pid = perf_event_pid(event, task), | 3853 | .pid = perf_event_pid(event, task), |
| 3674 | .tid = perf_event_tid(event, task), | 3854 | .tid = perf_event_tid(event, task), |
| 3675 | }; | 3855 | }; |
| 3676 | int ret; | 3856 | int ret; |
| 3677 | 3857 | ||
| 3858 | perf_event_header__init_id(&read_event.header, &sample, event); | ||
| 3678 | ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0); | 3859 | ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0); |
| 3679 | if (ret) | 3860 | if (ret) |
| 3680 | return; | 3861 | return; |
| 3681 | 3862 | ||
| 3682 | perf_output_put(&handle, read_event); | 3863 | perf_output_put(&handle, read_event); |
| 3683 | perf_output_read(&handle, event); | 3864 | perf_output_read(&handle, event); |
| 3865 | perf_event__output_id_sample(event, &handle, &sample); | ||
| 3684 | 3866 | ||
| 3685 | perf_output_end(&handle); | 3867 | perf_output_end(&handle); |
| 3686 | } | 3868 | } |
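The hunk above, and the task/comm/mmap/throttle hunks that follow, converge on one output pattern for side-band records: perf_event_header__init_id() first grows header.size by whatever sample_id fields the event requested, and perf_event__output_id_sample() appends those fields after the fixed record body. A minimal sketch of that shape, written as if it sat next to those helpers in kernel/perf_event.c; only the helper names come from the diff, the record type is a hypothetical stand-in:

/* Hypothetical side-band record used only for illustration. */
struct perf_example_event {
        struct perf_event_header        header;
        u64                             payload;        /* made-up fixed body */
};

static void perf_example_output(struct perf_event *event,
                                struct perf_example_event *rec)
{
        struct perf_output_handle handle;
        struct perf_sample_data sample;
        int ret;

        /* Account for the optional PERF_SAMPLE_* id block in the size. */
        perf_event_header__init_id(&rec->header, &sample, event);

        ret = perf_output_begin(&handle, event, rec->header.size, 0, 0);
        if (ret)
                return;

        perf_output_put(&handle, *rec);                         /* fixed body */
        perf_event__output_id_sample(event, &handle, &sample);  /* id block  */

        perf_output_end(&handle);
}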
| @@ -3710,14 +3892,16 @@ static void perf_event_task_output(struct perf_event *event, | |||
| 3710 | struct perf_task_event *task_event) | 3892 | struct perf_task_event *task_event) |
| 3711 | { | 3893 | { |
| 3712 | struct perf_output_handle handle; | 3894 | struct perf_output_handle handle; |
| 3895 | struct perf_sample_data sample; | ||
| 3713 | struct task_struct *task = task_event->task; | 3896 | struct task_struct *task = task_event->task; |
| 3714 | int size, ret; | 3897 | int ret, size = task_event->event_id.header.size; |
| 3715 | 3898 | ||
| 3716 | size = task_event->event_id.header.size; | 3899 | perf_event_header__init_id(&task_event->event_id.header, &sample, event); |
| 3717 | ret = perf_output_begin(&handle, event, size, 0, 0); | ||
| 3718 | 3900 | ||
| 3901 | ret = perf_output_begin(&handle, event, | ||
| 3902 | task_event->event_id.header.size, 0, 0); | ||
| 3719 | if (ret) | 3903 | if (ret) |
| 3720 | return; | 3904 | goto out; |
| 3721 | 3905 | ||
| 3722 | task_event->event_id.pid = perf_event_pid(event, task); | 3906 | task_event->event_id.pid = perf_event_pid(event, task); |
| 3723 | task_event->event_id.ppid = perf_event_pid(event, current); | 3907 | task_event->event_id.ppid = perf_event_pid(event, current); |
| @@ -3727,7 +3911,11 @@ static void perf_event_task_output(struct perf_event *event, | |||
| 3727 | 3911 | ||
| 3728 | perf_output_put(&handle, task_event->event_id); | 3912 | perf_output_put(&handle, task_event->event_id); |
| 3729 | 3913 | ||
| 3914 | perf_event__output_id_sample(event, &handle, &sample); | ||
| 3915 | |||
| 3730 | perf_output_end(&handle); | 3916 | perf_output_end(&handle); |
| 3917 | out: | ||
| 3918 | task_event->event_id.header.size = size; | ||
| 3731 | } | 3919 | } |
| 3732 | 3920 | ||
| 3733 | static int perf_event_task_match(struct perf_event *event) | 3921 | static int perf_event_task_match(struct perf_event *event) |
| @@ -3735,7 +3923,7 @@ static int perf_event_task_match(struct perf_event *event) | |||
| 3735 | if (event->state < PERF_EVENT_STATE_INACTIVE) | 3923 | if (event->state < PERF_EVENT_STATE_INACTIVE) |
| 3736 | return 0; | 3924 | return 0; |
| 3737 | 3925 | ||
| 3738 | if (event->cpu != -1 && event->cpu != smp_processor_id()) | 3926 | if (!event_filter_match(event)) |
| 3739 | return 0; | 3927 | return 0; |
| 3740 | 3928 | ||
| 3741 | if (event->attr.comm || event->attr.mmap || | 3929 | if (event->attr.comm || event->attr.mmap || |
| @@ -3766,6 +3954,8 @@ static void perf_event_task_event(struct perf_task_event *task_event) | |||
| 3766 | rcu_read_lock(); | 3954 | rcu_read_lock(); |
| 3767 | list_for_each_entry_rcu(pmu, &pmus, entry) { | 3955 | list_for_each_entry_rcu(pmu, &pmus, entry) { |
| 3768 | cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); | 3956 | cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); |
| 3957 | if (cpuctx->active_pmu != pmu) | ||
| 3958 | goto next; | ||
| 3769 | perf_event_task_ctx(&cpuctx->ctx, task_event); | 3959 | perf_event_task_ctx(&cpuctx->ctx, task_event); |
| 3770 | 3960 | ||
| 3771 | ctx = task_event->task_ctx; | 3961 | ctx = task_event->task_ctx; |
| @@ -3840,11 +4030,16 @@ static void perf_event_comm_output(struct perf_event *event, | |||
| 3840 | struct perf_comm_event *comm_event) | 4030 | struct perf_comm_event *comm_event) |
| 3841 | { | 4031 | { |
| 3842 | struct perf_output_handle handle; | 4032 | struct perf_output_handle handle; |
| 4033 | struct perf_sample_data sample; | ||
| 3843 | int size = comm_event->event_id.header.size; | 4034 | int size = comm_event->event_id.header.size; |
| 3844 | int ret = perf_output_begin(&handle, event, size, 0, 0); | 4035 | int ret; |
| 4036 | |||
| 4037 | perf_event_header__init_id(&comm_event->event_id.header, &sample, event); | ||
| 4038 | ret = perf_output_begin(&handle, event, | ||
| 4039 | comm_event->event_id.header.size, 0, 0); | ||
| 3845 | 4040 | ||
| 3846 | if (ret) | 4041 | if (ret) |
| 3847 | return; | 4042 | goto out; |
| 3848 | 4043 | ||
| 3849 | comm_event->event_id.pid = perf_event_pid(event, comm_event->task); | 4044 | comm_event->event_id.pid = perf_event_pid(event, comm_event->task); |
| 3850 | comm_event->event_id.tid = perf_event_tid(event, comm_event->task); | 4045 | comm_event->event_id.tid = perf_event_tid(event, comm_event->task); |
| @@ -3852,7 +4047,12 @@ static void perf_event_comm_output(struct perf_event *event, | |||
| 3852 | perf_output_put(&handle, comm_event->event_id); | 4047 | perf_output_put(&handle, comm_event->event_id); |
| 3853 | perf_output_copy(&handle, comm_event->comm, | 4048 | perf_output_copy(&handle, comm_event->comm, |
| 3854 | comm_event->comm_size); | 4049 | comm_event->comm_size); |
| 4050 | |||
| 4051 | perf_event__output_id_sample(event, &handle, &sample); | ||
| 4052 | |||
| 3855 | perf_output_end(&handle); | 4053 | perf_output_end(&handle); |
| 4054 | out: | ||
| 4055 | comm_event->event_id.header.size = size; | ||
| 3856 | } | 4056 | } |
| 3857 | 4057 | ||
| 3858 | static int perf_event_comm_match(struct perf_event *event) | 4058 | static int perf_event_comm_match(struct perf_event *event) |
| @@ -3860,7 +4060,7 @@ static int perf_event_comm_match(struct perf_event *event) | |||
| 3860 | if (event->state < PERF_EVENT_STATE_INACTIVE) | 4060 | if (event->state < PERF_EVENT_STATE_INACTIVE) |
| 3861 | return 0; | 4061 | return 0; |
| 3862 | 4062 | ||
| 3863 | if (event->cpu != -1 && event->cpu != smp_processor_id()) | 4063 | if (!event_filter_match(event)) |
| 3864 | return 0; | 4064 | return 0; |
| 3865 | 4065 | ||
| 3866 | if (event->attr.comm) | 4066 | if (event->attr.comm) |
| @@ -3897,10 +4097,11 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) | |||
| 3897 | comm_event->comm_size = size; | 4097 | comm_event->comm_size = size; |
| 3898 | 4098 | ||
| 3899 | comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; | 4099 | comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; |
| 3900 | |||
| 3901 | rcu_read_lock(); | 4100 | rcu_read_lock(); |
| 3902 | list_for_each_entry_rcu(pmu, &pmus, entry) { | 4101 | list_for_each_entry_rcu(pmu, &pmus, entry) { |
| 3903 | cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); | 4102 | cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); |
| 4103 | if (cpuctx->active_pmu != pmu) | ||
| 4104 | goto next; | ||
| 3904 | perf_event_comm_ctx(&cpuctx->ctx, comm_event); | 4105 | perf_event_comm_ctx(&cpuctx->ctx, comm_event); |
| 3905 | 4106 | ||
| 3906 | ctxn = pmu->task_ctx_nr; | 4107 | ctxn = pmu->task_ctx_nr; |
| @@ -3976,11 +4177,15 @@ static void perf_event_mmap_output(struct perf_event *event, | |||
| 3976 | struct perf_mmap_event *mmap_event) | 4177 | struct perf_mmap_event *mmap_event) |
| 3977 | { | 4178 | { |
| 3978 | struct perf_output_handle handle; | 4179 | struct perf_output_handle handle; |
| 4180 | struct perf_sample_data sample; | ||
| 3979 | int size = mmap_event->event_id.header.size; | 4181 | int size = mmap_event->event_id.header.size; |
| 3980 | int ret = perf_output_begin(&handle, event, size, 0, 0); | 4182 | int ret; |
| 3981 | 4183 | ||
| 4184 | perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); | ||
| 4185 | ret = perf_output_begin(&handle, event, | ||
| 4186 | mmap_event->event_id.header.size, 0, 0); | ||
| 3982 | if (ret) | 4187 | if (ret) |
| 3983 | return; | 4188 | goto out; |
| 3984 | 4189 | ||
| 3985 | mmap_event->event_id.pid = perf_event_pid(event, current); | 4190 | mmap_event->event_id.pid = perf_event_pid(event, current); |
| 3986 | mmap_event->event_id.tid = perf_event_tid(event, current); | 4191 | mmap_event->event_id.tid = perf_event_tid(event, current); |
| @@ -3988,7 +4193,12 @@ static void perf_event_mmap_output(struct perf_event *event, | |||
| 3988 | perf_output_put(&handle, mmap_event->event_id); | 4193 | perf_output_put(&handle, mmap_event->event_id); |
| 3989 | perf_output_copy(&handle, mmap_event->file_name, | 4194 | perf_output_copy(&handle, mmap_event->file_name, |
| 3990 | mmap_event->file_size); | 4195 | mmap_event->file_size); |
| 4196 | |||
| 4197 | perf_event__output_id_sample(event, &handle, &sample); | ||
| 4198 | |||
| 3991 | perf_output_end(&handle); | 4199 | perf_output_end(&handle); |
| 4200 | out: | ||
| 4201 | mmap_event->event_id.header.size = size; | ||
| 3992 | } | 4202 | } |
| 3993 | 4203 | ||
| 3994 | static int perf_event_mmap_match(struct perf_event *event, | 4204 | static int perf_event_mmap_match(struct perf_event *event, |
| @@ -3998,7 +4208,7 @@ static int perf_event_mmap_match(struct perf_event *event, | |||
| 3998 | if (event->state < PERF_EVENT_STATE_INACTIVE) | 4208 | if (event->state < PERF_EVENT_STATE_INACTIVE) |
| 3999 | return 0; | 4209 | return 0; |
| 4000 | 4210 | ||
| 4001 | if (event->cpu != -1 && event->cpu != smp_processor_id()) | 4211 | if (!event_filter_match(event)) |
| 4002 | return 0; | 4212 | return 0; |
| 4003 | 4213 | ||
| 4004 | if ((!executable && event->attr.mmap_data) || | 4214 | if ((!executable && event->attr.mmap_data) || |
| @@ -4086,6 +4296,8 @@ got_name: | |||
| 4086 | rcu_read_lock(); | 4296 | rcu_read_lock(); |
| 4087 | list_for_each_entry_rcu(pmu, &pmus, entry) { | 4297 | list_for_each_entry_rcu(pmu, &pmus, entry) { |
| 4088 | cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); | 4298 | cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); |
| 4299 | if (cpuctx->active_pmu != pmu) | ||
| 4300 | goto next; | ||
| 4089 | perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, | 4301 | perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, |
| 4090 | vma->vm_flags & VM_EXEC); | 4302 | vma->vm_flags & VM_EXEC); |
| 4091 | 4303 | ||
| @@ -4141,6 +4353,7 @@ void perf_event_mmap(struct vm_area_struct *vma) | |||
| 4141 | static void perf_log_throttle(struct perf_event *event, int enable) | 4353 | static void perf_log_throttle(struct perf_event *event, int enable) |
| 4142 | { | 4354 | { |
| 4143 | struct perf_output_handle handle; | 4355 | struct perf_output_handle handle; |
| 4356 | struct perf_sample_data sample; | ||
| 4144 | int ret; | 4357 | int ret; |
| 4145 | 4358 | ||
| 4146 | struct { | 4359 | struct { |
| @@ -4162,11 +4375,15 @@ static void perf_log_throttle(struct perf_event *event, int enable) | |||
| 4162 | if (enable) | 4375 | if (enable) |
| 4163 | throttle_event.header.type = PERF_RECORD_UNTHROTTLE; | 4376 | throttle_event.header.type = PERF_RECORD_UNTHROTTLE; |
| 4164 | 4377 | ||
| 4165 | ret = perf_output_begin(&handle, event, sizeof(throttle_event), 1, 0); | 4378 | perf_event_header__init_id(&throttle_event.header, &sample, event); |
| 4379 | |||
| 4380 | ret = perf_output_begin(&handle, event, | ||
| 4381 | throttle_event.header.size, 1, 0); | ||
| 4166 | if (ret) | 4382 | if (ret) |
| 4167 | return; | 4383 | return; |
| 4168 | 4384 | ||
| 4169 | perf_output_put(&handle, throttle_event); | 4385 | perf_output_put(&handle, throttle_event); |
| 4386 | perf_event__output_id_sample(event, &handle, &sample); | ||
| 4170 | perf_output_end(&handle); | 4387 | perf_output_end(&handle); |
| 4171 | } | 4388 | } |
| 4172 | 4389 | ||
| @@ -4182,6 +4399,13 @@ static int __perf_event_overflow(struct perf_event *event, int nmi, | |||
| 4182 | struct hw_perf_event *hwc = &event->hw; | 4399 | struct hw_perf_event *hwc = &event->hw; |
| 4183 | int ret = 0; | 4400 | int ret = 0; |
| 4184 | 4401 | ||
| 4402 | /* | ||
| 4403 | * Non-sampling counters might still use the PMI to fold short | ||
| 4404 | * hardware counters, ignore those. | ||
| 4405 | */ | ||
| 4406 | if (unlikely(!is_sampling_event(event))) | ||
| 4407 | return 0; | ||
| 4408 | |||
| 4185 | if (!throttle) { | 4409 | if (!throttle) { |
| 4186 | hwc->interrupts++; | 4410 | hwc->interrupts++; |
| 4187 | } else { | 4411 | } else { |
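The new is_sampling_event() guard in __perf_event_overflow(), like the conversions below in the software-event and hrtimer paths, replaces open-coded hwc->sample_period tests. Its definition is not part of this diff; presumably it is little more than a test of the requested sample period, along these lines:

/*
 * Assumed definition, not shown in this diff: a pure counting event has
 * attr.sample_period == 0, a sampling event has it non-zero.
 */
static inline bool is_sampling_event(struct perf_event *event)
{
        return event->attr.sample_period != 0;
}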
| @@ -4327,7 +4551,7 @@ static void perf_swevent_event(struct perf_event *event, u64 nr, | |||
| 4327 | if (!regs) | 4551 | if (!regs) |
| 4328 | return; | 4552 | return; |
| 4329 | 4553 | ||
| 4330 | if (!hwc->sample_period) | 4554 | if (!is_sampling_event(event)) |
| 4331 | return; | 4555 | return; |
| 4332 | 4556 | ||
| 4333 | if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) | 4557 | if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) |
| @@ -4454,7 +4678,7 @@ int perf_swevent_get_recursion_context(void) | |||
| 4454 | } | 4678 | } |
| 4455 | EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); | 4679 | EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); |
| 4456 | 4680 | ||
| 4457 | void inline perf_swevent_put_recursion_context(int rctx) | 4681 | inline void perf_swevent_put_recursion_context(int rctx) |
| 4458 | { | 4682 | { |
| 4459 | struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); | 4683 | struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); |
| 4460 | 4684 | ||
| @@ -4490,7 +4714,7 @@ static int perf_swevent_add(struct perf_event *event, int flags) | |||
| 4490 | struct hw_perf_event *hwc = &event->hw; | 4714 | struct hw_perf_event *hwc = &event->hw; |
| 4491 | struct hlist_head *head; | 4715 | struct hlist_head *head; |
| 4492 | 4716 | ||
| 4493 | if (hwc->sample_period) { | 4717 | if (is_sampling_event(event)) { |
| 4494 | hwc->last_period = hwc->sample_period; | 4718 | hwc->last_period = hwc->sample_period; |
| 4495 | perf_swevent_set_period(event); | 4719 | perf_swevent_set_period(event); |
| 4496 | } | 4720 | } |
| @@ -4655,7 +4879,7 @@ static int perf_swevent_init(struct perf_event *event) | |||
| 4655 | break; | 4879 | break; |
| 4656 | } | 4880 | } |
| 4657 | 4881 | ||
| 4658 | if (event_id > PERF_COUNT_SW_MAX) | 4882 | if (event_id >= PERF_COUNT_SW_MAX) |
| 4659 | return -ENOENT; | 4883 | return -ENOENT; |
| 4660 | 4884 | ||
| 4661 | if (!event->parent) { | 4885 | if (!event->parent) { |
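The comparison change in perf_swevent_init() fixes an off-by-one: PERF_COUNT_SW_MAX is a sentinel one past the last valid software event ID, so an ID equal to the sentinel must be rejected as well. A tiny illustration of the corrected bound:

#include <linux/perf_event.h>

/*
 * Any table sized PERF_COUNT_SW_MAX has valid indices
 * 0 .. PERF_COUNT_SW_MAX - 1, so the old "id > PERF_COUNT_SW_MAX"
 * wrongly accepted id == PERF_COUNT_SW_MAX.
 */
static bool sw_event_id_valid(u64 event_id)
{
        return event_id < PERF_COUNT_SW_MAX;    /* i.e. reject id >= MAX */
}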
| @@ -4747,15 +4971,6 @@ static int perf_tp_event_init(struct perf_event *event) | |||
| 4747 | if (event->attr.type != PERF_TYPE_TRACEPOINT) | 4971 | if (event->attr.type != PERF_TYPE_TRACEPOINT) |
| 4748 | return -ENOENT; | 4972 | return -ENOENT; |
| 4749 | 4973 | ||
| 4750 | /* | ||
| 4751 | * Raw tracepoint data is a severe data leak, only allow root to | ||
| 4752 | * have these. | ||
| 4753 | */ | ||
| 4754 | if ((event->attr.sample_type & PERF_SAMPLE_RAW) && | ||
| 4755 | perf_paranoid_tracepoint_raw() && | ||
| 4756 | !capable(CAP_SYS_ADMIN)) | ||
| 4757 | return -EPERM; | ||
| 4758 | |||
| 4759 | err = perf_trace_init(event); | 4974 | err = perf_trace_init(event); |
| 4760 | if (err) | 4975 | if (err) |
| 4761 | return err; | 4976 | return err; |
| @@ -4778,7 +4993,7 @@ static struct pmu perf_tracepoint = { | |||
| 4778 | 4993 | ||
| 4779 | static inline void perf_tp_register(void) | 4994 | static inline void perf_tp_register(void) |
| 4780 | { | 4995 | { |
| 4781 | perf_pmu_register(&perf_tracepoint); | 4996 | perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT); |
| 4782 | } | 4997 | } |
| 4783 | 4998 | ||
| 4784 | static int perf_event_set_filter(struct perf_event *event, void __user *arg) | 4999 | static int perf_event_set_filter(struct perf_event *event, void __user *arg) |
| @@ -4868,31 +5083,33 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) | |||
| 4868 | static void perf_swevent_start_hrtimer(struct perf_event *event) | 5083 | static void perf_swevent_start_hrtimer(struct perf_event *event) |
| 4869 | { | 5084 | { |
| 4870 | struct hw_perf_event *hwc = &event->hw; | 5085 | struct hw_perf_event *hwc = &event->hw; |
| 5086 | s64 period; | ||
| 5087 | |||
| 5088 | if (!is_sampling_event(event)) | ||
| 5089 | return; | ||
| 4871 | 5090 | ||
| 4872 | hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | 5091 | hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); |
| 4873 | hwc->hrtimer.function = perf_swevent_hrtimer; | 5092 | hwc->hrtimer.function = perf_swevent_hrtimer; |
| 4874 | if (hwc->sample_period) { | ||
| 4875 | s64 period = local64_read(&hwc->period_left); | ||
| 4876 | 5093 | ||
| 4877 | if (period) { | 5094 | period = local64_read(&hwc->period_left); |
| 4878 | if (period < 0) | 5095 | if (period) { |
| 4879 | period = 10000; | 5096 | if (period < 0) |
| 5097 | period = 10000; | ||
| 4880 | 5098 | ||
| 4881 | local64_set(&hwc->period_left, 0); | 5099 | local64_set(&hwc->period_left, 0); |
| 4882 | } else { | 5100 | } else { |
| 4883 | period = max_t(u64, 10000, hwc->sample_period); | 5101 | period = max_t(u64, 10000, hwc->sample_period); |
| 4884 | } | 5102 | } |
| 4885 | __hrtimer_start_range_ns(&hwc->hrtimer, | 5103 | __hrtimer_start_range_ns(&hwc->hrtimer, |
| 4886 | ns_to_ktime(period), 0, | 5104 | ns_to_ktime(period), 0, |
| 4887 | HRTIMER_MODE_REL_PINNED, 0); | 5105 | HRTIMER_MODE_REL_PINNED, 0); |
| 4888 | } | ||
| 4889 | } | 5106 | } |
| 4890 | 5107 | ||
| 4891 | static void perf_swevent_cancel_hrtimer(struct perf_event *event) | 5108 | static void perf_swevent_cancel_hrtimer(struct perf_event *event) |
| 4892 | { | 5109 | { |
| 4893 | struct hw_perf_event *hwc = &event->hw; | 5110 | struct hw_perf_event *hwc = &event->hw; |
| 4894 | 5111 | ||
| 4895 | if (hwc->sample_period) { | 5112 | if (is_sampling_event(event)) { |
| 4896 | ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); | 5113 | ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); |
| 4897 | local64_set(&hwc->period_left, ktime_to_ns(remaining)); | 5114 | local64_set(&hwc->period_left, ktime_to_ns(remaining)); |
| 4898 | 5115 | ||
| @@ -5087,25 +5304,96 @@ static void *find_pmu_context(int ctxn) | |||
| 5087 | return NULL; | 5304 | return NULL; |
| 5088 | } | 5305 | } |
| 5089 | 5306 | ||
| 5090 | static void free_pmu_context(void * __percpu cpu_context) | 5307 | static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu) |
| 5091 | { | 5308 | { |
| 5092 | struct pmu *pmu; | 5309 | int cpu; |
| 5310 | |||
| 5311 | for_each_possible_cpu(cpu) { | ||
| 5312 | struct perf_cpu_context *cpuctx; | ||
| 5313 | |||
| 5314 | cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); | ||
| 5315 | |||
| 5316 | if (cpuctx->active_pmu == old_pmu) | ||
| 5317 | cpuctx->active_pmu = pmu; | ||
| 5318 | } | ||
| 5319 | } | ||
| 5320 | |||
| 5321 | static void free_pmu_context(struct pmu *pmu) | ||
| 5322 | { | ||
| 5323 | struct pmu *i; | ||
| 5093 | 5324 | ||
| 5094 | mutex_lock(&pmus_lock); | 5325 | mutex_lock(&pmus_lock); |
| 5095 | /* | 5326 | /* |
| 5096 | * Like a real lame refcount. | 5327 | * Like a real lame refcount. |
| 5097 | */ | 5328 | */ |
| 5098 | list_for_each_entry(pmu, &pmus, entry) { | 5329 | list_for_each_entry(i, &pmus, entry) { |
| 5099 | if (pmu->pmu_cpu_context == cpu_context) | 5330 | if (i->pmu_cpu_context == pmu->pmu_cpu_context) { |
| 5331 | update_pmu_context(i, pmu); | ||
| 5100 | goto out; | 5332 | goto out; |
| 5333 | } | ||
| 5101 | } | 5334 | } |
| 5102 | 5335 | ||
| 5103 | free_percpu(cpu_context); | 5336 | free_percpu(pmu->pmu_cpu_context); |
| 5104 | out: | 5337 | out: |
| 5105 | mutex_unlock(&pmus_lock); | 5338 | mutex_unlock(&pmus_lock); |
| 5106 | } | 5339 | } |
| 5340 | static struct idr pmu_idr; | ||
| 5341 | |||
| 5342 | static ssize_t | ||
| 5343 | type_show(struct device *dev, struct device_attribute *attr, char *page) | ||
| 5344 | { | ||
| 5345 | struct pmu *pmu = dev_get_drvdata(dev); | ||
| 5346 | |||
| 5347 | return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type); | ||
| 5348 | } | ||
| 5349 | |||
| 5350 | static struct device_attribute pmu_dev_attrs[] = { | ||
| 5351 | __ATTR_RO(type), | ||
| 5352 | __ATTR_NULL, | ||
| 5353 | }; | ||
| 5354 | |||
| 5355 | static int pmu_bus_running; | ||
| 5356 | static struct bus_type pmu_bus = { | ||
| 5357 | .name = "event_source", | ||
| 5358 | .dev_attrs = pmu_dev_attrs, | ||
| 5359 | }; | ||
| 5360 | |||
| 5361 | static void pmu_dev_release(struct device *dev) | ||
| 5362 | { | ||
| 5363 | kfree(dev); | ||
| 5364 | } | ||
| 5365 | |||
| 5366 | static int pmu_dev_alloc(struct pmu *pmu) | ||
| 5367 | { | ||
| 5368 | int ret = -ENOMEM; | ||
| 5369 | |||
| 5370 | pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL); | ||
| 5371 | if (!pmu->dev) | ||
| 5372 | goto out; | ||
| 5373 | |||
| 5374 | device_initialize(pmu->dev); | ||
| 5375 | ret = dev_set_name(pmu->dev, "%s", pmu->name); | ||
| 5376 | if (ret) | ||
| 5377 | goto free_dev; | ||
| 5378 | |||
| 5379 | dev_set_drvdata(pmu->dev, pmu); | ||
| 5380 | pmu->dev->bus = &pmu_bus; | ||
| 5381 | pmu->dev->release = pmu_dev_release; | ||
| 5382 | ret = device_add(pmu->dev); | ||
| 5383 | if (ret) | ||
| 5384 | goto free_dev; | ||
| 5385 | |||
| 5386 | out: | ||
| 5387 | return ret; | ||
| 5388 | |||
| 5389 | free_dev: | ||
| 5390 | put_device(pmu->dev); | ||
| 5391 | goto out; | ||
| 5392 | } | ||
| 5393 | |||
| 5394 | static struct lock_class_key cpuctx_mutex; | ||
| 5107 | 5395 | ||
| 5108 | int perf_pmu_register(struct pmu *pmu) | 5396 | int perf_pmu_register(struct pmu *pmu, char *name, int type) |
| 5109 | { | 5397 | { |
| 5110 | int cpu, ret; | 5398 | int cpu, ret; |
| 5111 | 5399 | ||
| @@ -5115,23 +5403,50 @@ int perf_pmu_register(struct pmu *pmu) | |||
| 5115 | if (!pmu->pmu_disable_count) | 5403 | if (!pmu->pmu_disable_count) |
| 5116 | goto unlock; | 5404 | goto unlock; |
| 5117 | 5405 | ||
| 5406 | pmu->type = -1; | ||
| 5407 | if (!name) | ||
| 5408 | goto skip_type; | ||
| 5409 | pmu->name = name; | ||
| 5410 | |||
| 5411 | if (type < 0) { | ||
| 5412 | int err = idr_pre_get(&pmu_idr, GFP_KERNEL); | ||
| 5413 | if (!err) | ||
| 5414 | goto free_pdc; | ||
| 5415 | |||
| 5416 | err = idr_get_new_above(&pmu_idr, pmu, PERF_TYPE_MAX, &type); | ||
| 5417 | if (err) { | ||
| 5418 | ret = err; | ||
| 5419 | goto free_pdc; | ||
| 5420 | } | ||
| 5421 | } | ||
| 5422 | pmu->type = type; | ||
| 5423 | |||
| 5424 | if (pmu_bus_running) { | ||
| 5425 | ret = pmu_dev_alloc(pmu); | ||
| 5426 | if (ret) | ||
| 5427 | goto free_idr; | ||
| 5428 | } | ||
| 5429 | |||
| 5430 | skip_type: | ||
| 5118 | pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr); | 5431 | pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr); |
| 5119 | if (pmu->pmu_cpu_context) | 5432 | if (pmu->pmu_cpu_context) |
| 5120 | goto got_cpu_context; | 5433 | goto got_cpu_context; |
| 5121 | 5434 | ||
| 5122 | pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context); | 5435 | pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context); |
| 5123 | if (!pmu->pmu_cpu_context) | 5436 | if (!pmu->pmu_cpu_context) |
| 5124 | goto free_pdc; | 5437 | goto free_dev; |
| 5125 | 5438 | ||
| 5126 | for_each_possible_cpu(cpu) { | 5439 | for_each_possible_cpu(cpu) { |
| 5127 | struct perf_cpu_context *cpuctx; | 5440 | struct perf_cpu_context *cpuctx; |
| 5128 | 5441 | ||
| 5129 | cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); | 5442 | cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); |
| 5130 | __perf_event_init_context(&cpuctx->ctx); | 5443 | __perf_event_init_context(&cpuctx->ctx); |
| 5444 | lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); | ||
| 5131 | cpuctx->ctx.type = cpu_context; | 5445 | cpuctx->ctx.type = cpu_context; |
| 5132 | cpuctx->ctx.pmu = pmu; | 5446 | cpuctx->ctx.pmu = pmu; |
| 5133 | cpuctx->jiffies_interval = 1; | 5447 | cpuctx->jiffies_interval = 1; |
| 5134 | INIT_LIST_HEAD(&cpuctx->rotation_list); | 5448 | INIT_LIST_HEAD(&cpuctx->rotation_list); |
| 5449 | cpuctx->active_pmu = pmu; | ||
| 5135 | } | 5450 | } |
| 5136 | 5451 | ||
| 5137 | got_cpu_context: | 5452 | got_cpu_context: |
| @@ -5164,6 +5479,14 @@ unlock: | |||
| 5164 | 5479 | ||
| 5165 | return ret; | 5480 | return ret; |
| 5166 | 5481 | ||
| 5482 | free_dev: | ||
| 5483 | device_del(pmu->dev); | ||
| 5484 | put_device(pmu->dev); | ||
| 5485 | |||
| 5486 | free_idr: | ||
| 5487 | if (pmu->type >= PERF_TYPE_MAX) | ||
| 5488 | idr_remove(&pmu_idr, pmu->type); | ||
| 5489 | |||
| 5167 | free_pdc: | 5490 | free_pdc: |
| 5168 | free_percpu(pmu->pmu_disable_count); | 5491 | free_percpu(pmu->pmu_disable_count); |
| 5169 | goto unlock; | 5492 | goto unlock; |
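With the extended registration above, a PMU either claims a fixed type (as perf_swevent and perf_tracepoint do elsewhere in this diff) or passes -1 and receives a dynamically allocated type from pmu_idr. A hedged sketch of a caller; the PMU and its callbacks are hypothetical, only the perf_pmu_register() signature comes from the diff:

#include <linux/init.h>
#include <linux/perf_event.h>

/* Hypothetical driver-side PMU; callbacks elided for brevity. */
static struct pmu my_example_pmu = {
        /* .event_init, .add, .del, .start, .stop, .read, ... */
};

static int __init my_example_pmu_init(void)
{
        /*
         * A negative type requests a dynamic type number; built-in PMUs
         * pass one of the fixed PERF_TYPE_* values instead.
         */
        return perf_pmu_register(&my_example_pmu, "my_example", -1);
}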
| @@ -5183,7 +5506,11 @@ void perf_pmu_unregister(struct pmu *pmu) | |||
| 5183 | synchronize_rcu(); | 5506 | synchronize_rcu(); |
| 5184 | 5507 | ||
| 5185 | free_percpu(pmu->pmu_disable_count); | 5508 | free_percpu(pmu->pmu_disable_count); |
| 5186 | free_pmu_context(pmu->pmu_cpu_context); | 5509 | if (pmu->type >= PERF_TYPE_MAX) |
| 5510 | idr_remove(&pmu_idr, pmu->type); | ||
| 5511 | device_del(pmu->dev); | ||
| 5512 | put_device(pmu->dev); | ||
| 5513 | free_pmu_context(pmu); | ||
| 5187 | } | 5514 | } |
| 5188 | 5515 | ||
| 5189 | struct pmu *perf_init_event(struct perf_event *event) | 5516 | struct pmu *perf_init_event(struct perf_event *event) |
| @@ -5192,6 +5519,13 @@ struct pmu *perf_init_event(struct perf_event *event) | |||
| 5192 | int idx; | 5519 | int idx; |
| 5193 | 5520 | ||
| 5194 | idx = srcu_read_lock(&pmus_srcu); | 5521 | idx = srcu_read_lock(&pmus_srcu); |
| 5522 | |||
| 5523 | rcu_read_lock(); | ||
| 5524 | pmu = idr_find(&pmu_idr, event->attr.type); | ||
| 5525 | rcu_read_unlock(); | ||
| 5526 | if (pmu) | ||
| 5527 | goto unlock; | ||
| 5528 | |||
| 5195 | list_for_each_entry_rcu(pmu, &pmus, entry) { | 5529 | list_for_each_entry_rcu(pmu, &pmus, entry) { |
| 5196 | int ret = pmu->event_init(event); | 5530 | int ret = pmu->event_init(event); |
| 5197 | if (!ret) | 5531 | if (!ret) |
| @@ -5224,6 +5558,11 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, | |||
| 5224 | struct hw_perf_event *hwc; | 5558 | struct hw_perf_event *hwc; |
| 5225 | long err; | 5559 | long err; |
| 5226 | 5560 | ||
| 5561 | if ((unsigned)cpu >= nr_cpu_ids) { | ||
| 5562 | if (!task || cpu != -1) | ||
| 5563 | return ERR_PTR(-EINVAL); | ||
| 5564 | } | ||
| 5565 | |||
| 5227 | event = kzalloc(sizeof(*event), GFP_KERNEL); | 5566 | event = kzalloc(sizeof(*event), GFP_KERNEL); |
| 5228 | if (!event) | 5567 | if (!event) |
| 5229 | return ERR_PTR(-ENOMEM); | 5568 | return ERR_PTR(-ENOMEM); |
| @@ -5272,7 +5611,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, | |||
| 5272 | 5611 | ||
| 5273 | if (!overflow_handler && parent_event) | 5612 | if (!overflow_handler && parent_event) |
| 5274 | overflow_handler = parent_event->overflow_handler; | 5613 | overflow_handler = parent_event->overflow_handler; |
| 5275 | 5614 | ||
| 5276 | event->overflow_handler = overflow_handler; | 5615 | event->overflow_handler = overflow_handler; |
| 5277 | 5616 | ||
| 5278 | if (attr->disabled) | 5617 | if (attr->disabled) |
| @@ -5651,12 +5990,18 @@ SYSCALL_DEFINE5(perf_event_open, | |||
| 5651 | mutex_unlock(&ctx->mutex); | 5990 | mutex_unlock(&ctx->mutex); |
| 5652 | 5991 | ||
| 5653 | event->owner = current; | 5992 | event->owner = current; |
| 5654 | get_task_struct(current); | 5993 | |
| 5655 | mutex_lock(¤t->perf_event_mutex); | 5994 | mutex_lock(¤t->perf_event_mutex); |
| 5656 | list_add_tail(&event->owner_entry, ¤t->perf_event_list); | 5995 | list_add_tail(&event->owner_entry, ¤t->perf_event_list); |
| 5657 | mutex_unlock(¤t->perf_event_mutex); | 5996 | mutex_unlock(¤t->perf_event_mutex); |
| 5658 | 5997 | ||
| 5659 | /* | 5998 | /* |
| 5999 | * Precalculate sample_data sizes | ||
| 6000 | */ | ||
| 6001 | perf_event__header_size(event); | ||
| 6002 | perf_event__id_header_size(event); | ||
| 6003 | |||
| 6004 | /* | ||
| 5660 | * Drop the reference on the group_event after placing the | 6005 | * Drop the reference on the group_event after placing the |
| 5661 | * new event on the sibling_list. This ensures destruction | 6006 | * new event on the sibling_list. This ensures destruction |
| 5662 | * of the group leader will find the pointer to itself in | 6007 | * of the group leader will find the pointer to itself in |
| @@ -5719,12 +6064,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, | |||
| 5719 | ++ctx->generation; | 6064 | ++ctx->generation; |
| 5720 | mutex_unlock(&ctx->mutex); | 6065 | mutex_unlock(&ctx->mutex); |
| 5721 | 6066 | ||
| 5722 | event->owner = current; | ||
| 5723 | get_task_struct(current); | ||
| 5724 | mutex_lock(¤t->perf_event_mutex); | ||
| 5725 | list_add_tail(&event->owner_entry, ¤t->perf_event_list); | ||
| 5726 | mutex_unlock(¤t->perf_event_mutex); | ||
| 5727 | |||
| 5728 | return event; | 6067 | return event; |
| 5729 | 6068 | ||
| 5730 | err_free: | 6069 | err_free: |
| @@ -5808,7 +6147,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) | |||
| 5808 | * scheduled, so we are now safe from rescheduling changing | 6147 | * scheduled, so we are now safe from rescheduling changing |
| 5809 | * our context. | 6148 | * our context. |
| 5810 | */ | 6149 | */ |
| 5811 | child_ctx = child->perf_event_ctxp[ctxn]; | 6150 | child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]); |
| 5812 | task_ctx_sched_out(child_ctx, EVENT_ALL); | 6151 | task_ctx_sched_out(child_ctx, EVENT_ALL); |
| 5813 | 6152 | ||
| 5814 | /* | 6153 | /* |
| @@ -5875,8 +6214,24 @@ again: | |||
| 5875 | */ | 6214 | */ |
| 5876 | void perf_event_exit_task(struct task_struct *child) | 6215 | void perf_event_exit_task(struct task_struct *child) |
| 5877 | { | 6216 | { |
| 6217 | struct perf_event *event, *tmp; | ||
| 5878 | int ctxn; | 6218 | int ctxn; |
| 5879 | 6219 | ||
| 6220 | mutex_lock(&child->perf_event_mutex); | ||
| 6221 | list_for_each_entry_safe(event, tmp, &child->perf_event_list, | ||
| 6222 | owner_entry) { | ||
| 6223 | list_del_init(&event->owner_entry); | ||
| 6224 | |||
| 6225 | /* | ||
| 6226 | * Ensure the list deletion is visible before we clear | ||
| 6227 | * the owner, closes a race against perf_release() where | ||
| 6228 | * we need to serialize on the owner->perf_event_mutex. | ||
| 6229 | */ | ||
| 6230 | smp_wmb(); | ||
| 6231 | event->owner = NULL; | ||
| 6232 | } | ||
| 6233 | mutex_unlock(&child->perf_event_mutex); | ||
| 6234 | |||
| 5880 | for_each_task_context_nr(ctxn) | 6235 | for_each_task_context_nr(ctxn) |
| 5881 | perf_event_exit_task_context(child, ctxn); | 6236 | perf_event_exit_task_context(child, ctxn); |
| 5882 | } | 6237 | } |
| @@ -5999,6 +6354,12 @@ inherit_event(struct perf_event *parent_event, | |||
| 5999 | child_event->overflow_handler = parent_event->overflow_handler; | 6354 | child_event->overflow_handler = parent_event->overflow_handler; |
| 6000 | 6355 | ||
| 6001 | /* | 6356 | /* |
| 6357 | * Precalculate sample_data sizes | ||
| 6358 | */ | ||
| 6359 | perf_event__header_size(child_event); | ||
| 6360 | perf_event__id_header_size(child_event); | ||
| 6361 | |||
| 6362 | /* | ||
| 6002 | * Link it up in the child's context: | 6363 | * Link it up in the child's context: |
| 6003 | */ | 6364 | */ |
| 6004 | raw_spin_lock_irqsave(&child_ctx->lock, flags); | 6365 | raw_spin_lock_irqsave(&child_ctx->lock, flags); |
| @@ -6096,13 +6457,9 @@ int perf_event_init_context(struct task_struct *child, int ctxn) | |||
| 6096 | struct perf_event *event; | 6457 | struct perf_event *event; |
| 6097 | struct task_struct *parent = current; | 6458 | struct task_struct *parent = current; |
| 6098 | int inherited_all = 1; | 6459 | int inherited_all = 1; |
| 6460 | unsigned long flags; | ||
| 6099 | int ret = 0; | 6461 | int ret = 0; |
| 6100 | 6462 | ||
| 6101 | child->perf_event_ctxp[ctxn] = NULL; | ||
| 6102 | |||
| 6103 | mutex_init(&child->perf_event_mutex); | ||
| 6104 | INIT_LIST_HEAD(&child->perf_event_list); | ||
| 6105 | |||
| 6106 | if (likely(!parent->perf_event_ctxp[ctxn])) | 6463 | if (likely(!parent->perf_event_ctxp[ctxn])) |
| 6107 | return 0; | 6464 | return 0; |
| 6108 | 6465 | ||
| @@ -6136,6 +6493,15 @@ int perf_event_init_context(struct task_struct *child, int ctxn) | |||
| 6136 | break; | 6493 | break; |
| 6137 | } | 6494 | } |
| 6138 | 6495 | ||
| 6496 | /* | ||
| 6497 | * We can't hold ctx->lock when iterating the ->flexible_group list due | ||
| 6498 | * to allocations, but we need to prevent rotation because | ||
| 6499 | * rotate_ctx() will change the list from interrupt context. | ||
| 6500 | */ | ||
| 6501 | raw_spin_lock_irqsave(&parent_ctx->lock, flags); | ||
| 6502 | parent_ctx->rotate_disable = 1; | ||
| 6503 | raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); | ||
| 6504 | |||
| 6139 | list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { | 6505 | list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { |
| 6140 | ret = inherit_task_group(event, parent, parent_ctx, | 6506 | ret = inherit_task_group(event, parent, parent_ctx, |
| 6141 | child, ctxn, &inherited_all); | 6507 | child, ctxn, &inherited_all); |
| @@ -6143,18 +6509,20 @@ int perf_event_init_context(struct task_struct *child, int ctxn) | |||
| 6143 | break; | 6509 | break; |
| 6144 | } | 6510 | } |
| 6145 | 6511 | ||
| 6512 | raw_spin_lock_irqsave(&parent_ctx->lock, flags); | ||
| 6513 | parent_ctx->rotate_disable = 0; | ||
| 6514 | |||
| 6146 | child_ctx = child->perf_event_ctxp[ctxn]; | 6515 | child_ctx = child->perf_event_ctxp[ctxn]; |
| 6147 | 6516 | ||
| 6148 | if (child_ctx && inherited_all) { | 6517 | if (child_ctx && inherited_all) { |
| 6149 | /* | 6518 | /* |
| 6150 | * Mark the child context as a clone of the parent | 6519 | * Mark the child context as a clone of the parent |
| 6151 | * context, or of whatever the parent is a clone of. | 6520 | * context, or of whatever the parent is a clone of. |
| 6152 | * Note that if the parent is a clone, it could get | 6521 | * |
| 6153 | * uncloned at any point, but that doesn't matter | 6522 | * Note that if the parent is a clone, the holding of |
| 6154 | * because the list of events and the generation | 6523 | * parent_ctx->lock avoids it from being uncloned. |
| 6155 | * count can't have changed since we took the mutex. | ||
| 6156 | */ | 6524 | */ |
| 6157 | cloned_ctx = rcu_dereference(parent_ctx->parent_ctx); | 6525 | cloned_ctx = parent_ctx->parent_ctx; |
| 6158 | if (cloned_ctx) { | 6526 | if (cloned_ctx) { |
| 6159 | child_ctx->parent_ctx = cloned_ctx; | 6527 | child_ctx->parent_ctx = cloned_ctx; |
| 6160 | child_ctx->parent_gen = parent_ctx->parent_gen; | 6528 | child_ctx->parent_gen = parent_ctx->parent_gen; |
| @@ -6165,6 +6533,7 @@ int perf_event_init_context(struct task_struct *child, int ctxn) | |||
| 6165 | get_ctx(child_ctx->parent_ctx); | 6533 | get_ctx(child_ctx->parent_ctx); |
| 6166 | } | 6534 | } |
| 6167 | 6535 | ||
| 6536 | raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); | ||
| 6168 | mutex_unlock(&parent_ctx->mutex); | 6537 | mutex_unlock(&parent_ctx->mutex); |
| 6169 | 6538 | ||
| 6170 | perf_unpin_context(parent_ctx); | 6539 | perf_unpin_context(parent_ctx); |
| @@ -6179,6 +6548,10 @@ int perf_event_init_task(struct task_struct *child) | |||
| 6179 | { | 6548 | { |
| 6180 | int ctxn, ret; | 6549 | int ctxn, ret; |
| 6181 | 6550 | ||
| 6551 | memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp)); | ||
| 6552 | mutex_init(&child->perf_event_mutex); | ||
| 6553 | INIT_LIST_HEAD(&child->perf_event_list); | ||
| 6554 | |||
| 6182 | for_each_task_context_nr(ctxn) { | 6555 | for_each_task_context_nr(ctxn) { |
| 6183 | ret = perf_event_init_context(child, ctxn); | 6556 | ret = perf_event_init_context(child, ctxn); |
| 6184 | if (ret) | 6557 | if (ret) |
| @@ -6215,7 +6588,7 @@ static void __cpuinit perf_event_init_cpu(int cpu) | |||
| 6215 | mutex_unlock(&swhash->hlist_mutex); | 6588 | mutex_unlock(&swhash->hlist_mutex); |
| 6216 | } | 6589 | } |
| 6217 | 6590 | ||
| 6218 | #ifdef CONFIG_HOTPLUG_CPU | 6591 | #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC |
| 6219 | static void perf_pmu_rotate_stop(struct pmu *pmu) | 6592 | static void perf_pmu_rotate_stop(struct pmu *pmu) |
| 6220 | { | 6593 | { |
| 6221 | struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); | 6594 | struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); |
| @@ -6269,6 +6642,26 @@ static void perf_event_exit_cpu(int cpu) | |||
| 6269 | static inline void perf_event_exit_cpu(int cpu) { } | 6642 | static inline void perf_event_exit_cpu(int cpu) { } |
| 6270 | #endif | 6643 | #endif |
| 6271 | 6644 | ||
| 6645 | static int | ||
| 6646 | perf_reboot(struct notifier_block *notifier, unsigned long val, void *v) | ||
| 6647 | { | ||
| 6648 | int cpu; | ||
| 6649 | |||
| 6650 | for_each_online_cpu(cpu) | ||
| 6651 | perf_event_exit_cpu(cpu); | ||
| 6652 | |||
| 6653 | return NOTIFY_OK; | ||
| 6654 | } | ||
| 6655 | |||
| 6656 | /* | ||
| 6657 | * Run the perf reboot notifier at the very last possible moment so that | ||
| 6658 | * the generic watchdog code runs as long as possible. | ||
| 6659 | */ | ||
| 6660 | static struct notifier_block perf_reboot_notifier = { | ||
| 6661 | .notifier_call = perf_reboot, | ||
| 6662 | .priority = INT_MIN, | ||
| 6663 | }; | ||
| 6664 | |||
| 6272 | static int __cpuinit | 6665 | static int __cpuinit |
| 6273 | perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) | 6666 | perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) |
| 6274 | { | 6667 | { |
| @@ -6295,11 +6688,47 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) | |||
| 6295 | 6688 | ||
| 6296 | void __init perf_event_init(void) | 6689 | void __init perf_event_init(void) |
| 6297 | { | 6690 | { |
| 6691 | int ret; | ||
| 6692 | |||
| 6693 | idr_init(&pmu_idr); | ||
| 6694 | |||
| 6298 | perf_event_init_all_cpus(); | 6695 | perf_event_init_all_cpus(); |
| 6299 | init_srcu_struct(&pmus_srcu); | 6696 | init_srcu_struct(&pmus_srcu); |
| 6300 | perf_pmu_register(&perf_swevent); | 6697 | perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE); |
| 6301 | perf_pmu_register(&perf_cpu_clock); | 6698 | perf_pmu_register(&perf_cpu_clock, NULL, -1); |
| 6302 | perf_pmu_register(&perf_task_clock); | 6699 | perf_pmu_register(&perf_task_clock, NULL, -1); |
| 6303 | perf_tp_register(); | 6700 | perf_tp_register(); |
| 6304 | perf_cpu_notifier(perf_cpu_notify); | 6701 | perf_cpu_notifier(perf_cpu_notify); |
| 6702 | register_reboot_notifier(&perf_reboot_notifier); | ||
| 6703 | |||
| 6704 | ret = init_hw_breakpoint(); | ||
| 6705 | WARN(ret, "hw_breakpoint initialization failed with: %d", ret); | ||
| 6706 | } | ||
| 6707 | |||
| 6708 | static int __init perf_event_sysfs_init(void) | ||
| 6709 | { | ||
| 6710 | struct pmu *pmu; | ||
| 6711 | int ret; | ||
| 6712 | |||
| 6713 | mutex_lock(&pmus_lock); | ||
| 6714 | |||
| 6715 | ret = bus_register(&pmu_bus); | ||
| 6716 | if (ret) | ||
| 6717 | goto unlock; | ||
| 6718 | |||
| 6719 | list_for_each_entry(pmu, &pmus, entry) { | ||
| 6720 | if (!pmu->name || pmu->type < 0) | ||
| 6721 | continue; | ||
| 6722 | |||
| 6723 | ret = pmu_dev_alloc(pmu); | ||
| 6724 | WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret); | ||
| 6725 | } | ||
| 6726 | pmu_bus_running = 1; | ||
| 6727 | ret = 0; | ||
| 6728 | |||
| 6729 | unlock: | ||
| 6730 | mutex_unlock(&pmus_lock); | ||
| 6731 | |||
| 6732 | return ret; | ||
| 6305 | } | 6733 | } |
| 6734 | device_initcall(perf_event_sysfs_init); | ||
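Once perf_event_sysfs_init() has put every named PMU on the new "event_source" bus, the "type" attribute registered above makes the (possibly dynamic) type number discoverable from user space. A small stand-alone sketch; the exact /sys/bus/event_source/devices/<name>/type path is an assumption derived from the bus and attribute names in this diff:

/* Compile with: cc -o pmu_type pmu_type.c */
#include <stdio.h>
#include <stdlib.h>

/* Read the PMU's type number; returns -1 on any failure. */
static int read_pmu_type(const char *pmu_name)
{
        char path[256];
        FILE *f;
        int type = -1;

        snprintf(path, sizeof(path),
                 "/sys/bus/event_source/devices/%s/type", pmu_name);
        f = fopen(path, "r");
        if (!f)
                return -1;
        if (fscanf(f, "%d", &type) != 1)
                type = -1;
        fclose(f);
        return type;    /* value to place in perf_event_attr.type */
}

int main(int argc, char **argv)
{
        int type = read_pmu_type(argc > 1 ? argv[1] : "software");

        printf("type = %d\n", type);
        return type < 0 ? EXIT_FAILURE : EXIT_SUCCESS;
}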
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c index c7a8f453919..aeaa7f84682 100644 --- a/kernel/pm_qos_params.c +++ b/kernel/pm_qos_params.c | |||
| @@ -121,10 +121,10 @@ static inline int pm_qos_get_value(struct pm_qos_object *o) | |||
| 121 | 121 | ||
| 122 | switch (o->type) { | 122 | switch (o->type) { |
| 123 | case PM_QOS_MIN: | 123 | case PM_QOS_MIN: |
| 124 | return plist_last(&o->requests)->prio; | 124 | return plist_first(&o->requests)->prio; |
| 125 | 125 | ||
| 126 | case PM_QOS_MAX: | 126 | case PM_QOS_MAX: |
| 127 | return plist_first(&o->requests)->prio; | 127 | return plist_last(&o->requests)->prio; |
| 128 | 128 | ||
| 129 | default: | 129 | default: |
| 130 | /* runtime check for not using enum */ | 130 | /* runtime check for not using enum */ |
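The swap above follows from how a plist is ordered: nodes are kept sorted by ascending prio, so plist_first() yields the smallest value and plist_last() the largest; the minimum aggregate therefore has to come from plist_first() and the maximum from plist_last(). A tiny illustration, assuming a non-empty list:

#include <linux/plist.h>

/*
 * first == smallest prio, last == largest prio. The request values
 * themselves are made up; this only shows the ordering the fix relies on.
 */
static void example_plist_bounds(struct plist_head *requests,
                                 int *min, int *max)
{
        *min = plist_first(requests)->prio;     /* what PM_QOS_MIN wants */
        *max = plist_last(requests)->prio;      /* what PM_QOS_MAX wants */
}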
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 6842eeba587..05bb7173850 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c | |||
| @@ -37,13 +37,13 @@ static int check_clock(const clockid_t which_clock) | |||
| 37 | if (pid == 0) | 37 | if (pid == 0) |
| 38 | return 0; | 38 | return 0; |
| 39 | 39 | ||
| 40 | read_lock(&tasklist_lock); | 40 | rcu_read_lock(); |
| 41 | p = find_task_by_vpid(pid); | 41 | p = find_task_by_vpid(pid); |
| 42 | if (!p || !(CPUCLOCK_PERTHREAD(which_clock) ? | 42 | if (!p || !(CPUCLOCK_PERTHREAD(which_clock) ? |
| 43 | same_thread_group(p, current) : thread_group_leader(p))) { | 43 | same_thread_group(p, current) : has_group_leader_pid(p))) { |
| 44 | error = -EINVAL; | 44 | error = -EINVAL; |
| 45 | } | 45 | } |
| 46 | read_unlock(&tasklist_lock); | 46 | rcu_read_unlock(); |
| 47 | 47 | ||
| 48 | return error; | 48 | return error; |
| 49 | } | 49 | } |
| @@ -390,7 +390,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer) | |||
| 390 | 390 | ||
| 391 | INIT_LIST_HEAD(&new_timer->it.cpu.entry); | 391 | INIT_LIST_HEAD(&new_timer->it.cpu.entry); |
| 392 | 392 | ||
| 393 | read_lock(&tasklist_lock); | 393 | rcu_read_lock(); |
| 394 | if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) { | 394 | if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) { |
| 395 | if (pid == 0) { | 395 | if (pid == 0) { |
| 396 | p = current; | 396 | p = current; |
| @@ -404,7 +404,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer) | |||
| 404 | p = current->group_leader; | 404 | p = current->group_leader; |
| 405 | } else { | 405 | } else { |
| 406 | p = find_task_by_vpid(pid); | 406 | p = find_task_by_vpid(pid); |
| 407 | if (p && !thread_group_leader(p)) | 407 | if (p && !has_group_leader_pid(p)) |
| 408 | p = NULL; | 408 | p = NULL; |
| 409 | } | 409 | } |
| 410 | } | 410 | } |
| @@ -414,7 +414,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer) | |||
| 414 | } else { | 414 | } else { |
| 415 | ret = -EINVAL; | 415 | ret = -EINVAL; |
| 416 | } | 416 | } |
| 417 | read_unlock(&tasklist_lock); | 417 | rcu_read_unlock(); |
| 418 | 418 | ||
| 419 | return ret; | 419 | return ret; |
| 420 | } | 420 | } |
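Both hunks above replace the global tasklist_lock reader with an RCU read-side section: find_task_by_vpid() may be called under rcu_read_lock(), and the task only has to stay valid while it is inspected inside that section. The pattern, reduced to its shape (the per-clock validation is elided):

#include <linux/rcupdate.h>
#include <linux/sched.h>

static int example_lookup_task(pid_t pid)
{
        struct task_struct *p;
        int error = 0;

        rcu_read_lock();
        p = find_task_by_vpid(pid);
        if (!p)
                error = -EINVAL;
        /* else: inspect p, or get_task_struct(p) if it must outlive this */
        rcu_read_unlock();

        return error;
}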
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 9ca4973f736..93bd2eb2bc5 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c | |||
| @@ -145,7 +145,13 @@ static int common_timer_del(struct k_itimer *timer); | |||
| 145 | 145 | ||
| 146 | static enum hrtimer_restart posix_timer_fn(struct hrtimer *data); | 146 | static enum hrtimer_restart posix_timer_fn(struct hrtimer *data); |
| 147 | 147 | ||
| 148 | static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags); | 148 | static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags); |
| 149 | |||
| 150 | #define lock_timer(tid, flags) \ | ||
| 151 | ({ struct k_itimer *__timr; \ | ||
| 152 | __cond_lock(&__timr->it_lock, __timr = __lock_timer(tid, flags)); \ | ||
| 153 | __timr; \ | ||
| 154 | }) | ||
| 149 | 155 | ||
| 150 | static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) | 156 | static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) |
| 151 | { | 157 | { |
| @@ -619,7 +625,7 @@ out: | |||
| 619 | * the find to the timer lock. To avoid a dead lock, the timer id MUST | 625 | * the find to the timer lock. To avoid a dead lock, the timer id MUST |
| 620 | * be release with out holding the timer lock. | 626 | * be release with out holding the timer lock. |
| 621 | */ | 627 | */ |
| 622 | static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags) | 628 | static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) |
| 623 | { | 629 | { |
| 624 | struct k_itimer *timr; | 630 | struct k_itimer *timr; |
| 625 | /* | 631 | /* |
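Wrapping __lock_timer() in the lock_timer() macro lets the __cond_lock() annotation tell sparse that it_lock is acquired exactly when a non-NULL timer is returned, without changing how callers look. The usual caller shape, paired with the unlock_timer() shown above; the surrounding syscall body is illustrative only, as if written in kernel/posix-timers.c:

static int example_timer_op(timer_t timer_id)
{
        struct k_itimer *timr;
        unsigned long flags;

        timr = lock_timer(timer_id, &flags);
        if (!timr)
                return -EINVAL;

        /* ... operate on timr with it_lock held ... */

        unlock_timer(timr, flags);
        return 0;
}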
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 29bff6117ab..265729966ec 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
| @@ -100,13 +100,9 @@ config PM_SLEEP_ADVANCED_DEBUG | |||
| 100 | depends on PM_ADVANCED_DEBUG | 100 | depends on PM_ADVANCED_DEBUG |
| 101 | default n | 101 | default n |
| 102 | 102 | ||
| 103 | config SUSPEND_NVS | ||
| 104 | bool | ||
| 105 | |||
| 106 | config SUSPEND | 103 | config SUSPEND |
| 107 | bool "Suspend to RAM and standby" | 104 | bool "Suspend to RAM and standby" |
| 108 | depends on PM && ARCH_SUSPEND_POSSIBLE | 105 | depends on PM && ARCH_SUSPEND_POSSIBLE |
| 109 | select SUSPEND_NVS if HAS_IOMEM | ||
| 110 | default y | 106 | default y |
| 111 | ---help--- | 107 | ---help--- |
| 112 | Allow the system to enter sleep states in which main memory is | 108 | Allow the system to enter sleep states in which main memory is |
| @@ -140,7 +136,6 @@ config HIBERNATION | |||
| 140 | depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE | 136 | depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE |
| 141 | select LZO_COMPRESS | 137 | select LZO_COMPRESS |
| 142 | select LZO_DECOMPRESS | 138 | select LZO_DECOMPRESS |
| 143 | select SUSPEND_NVS if HAS_IOMEM | ||
| 144 | ---help--- | 139 | ---help--- |
| 145 | Enable the suspend to disk (STD) functionality, which is usually | 140 | Enable the suspend to disk (STD) functionality, which is usually |
| 146 | called "hibernation" in user interfaces. STD checkpoints the | 141 | called "hibernation" in user interfaces. STD checkpoints the |
| @@ -246,9 +241,13 @@ config PM_OPS | |||
| 246 | depends on PM_SLEEP || PM_RUNTIME | 241 | depends on PM_SLEEP || PM_RUNTIME |
| 247 | default y | 242 | default y |
| 248 | 243 | ||
| 244 | config ARCH_HAS_OPP | ||
| 245 | bool | ||
| 246 | |||
| 249 | config PM_OPP | 247 | config PM_OPP |
| 250 | bool "Operating Performance Point (OPP) Layer library" | 248 | bool "Operating Performance Point (OPP) Layer library" |
| 251 | depends on PM | 249 | depends on PM |
| 250 | depends on ARCH_HAS_OPP | ||
| 252 | ---help--- | 251 | ---help--- |
| 253 | SOCs have a standard set of tuples consisting of frequency and | 252 | SOCs have a standard set of tuples consisting of frequency and |
| 254 | voltage pairs that the device will support per voltage domain. This | 253 | voltage pairs that the device will support per voltage domain. This |
diff --git a/kernel/power/Makefile b/kernel/power/Makefile index f9063c6b185..c350e18b53e 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile | |||
| @@ -1,7 +1,4 @@ | |||
| 1 | 1 | ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG | |
| 2 | ifeq ($(CONFIG_PM_DEBUG),y) | ||
| 3 | EXTRA_CFLAGS += -DDEBUG | ||
| 4 | endif | ||
| 5 | 2 | ||
| 6 | obj-$(CONFIG_PM) += main.o | 3 | obj-$(CONFIG_PM) += main.o |
| 7 | obj-$(CONFIG_PM_SLEEP) += console.o | 4 | obj-$(CONFIG_PM_SLEEP) += console.o |
| @@ -10,6 +7,5 @@ obj-$(CONFIG_SUSPEND) += suspend.o | |||
| 10 | obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o | 7 | obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o |
| 11 | obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ | 8 | obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ |
| 12 | block_io.o | 9 | block_io.o |
| 13 | obj-$(CONFIG_SUSPEND_NVS) += nvs.o | ||
| 14 | 10 | ||
| 15 | obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o | 11 | obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o |
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 657272e91d0..1832bd26421 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c | |||
| @@ -51,18 +51,18 @@ enum { | |||
| 51 | 51 | ||
| 52 | static int hibernation_mode = HIBERNATION_SHUTDOWN; | 52 | static int hibernation_mode = HIBERNATION_SHUTDOWN; |
| 53 | 53 | ||
| 54 | static struct platform_hibernation_ops *hibernation_ops; | 54 | static const struct platform_hibernation_ops *hibernation_ops; |
| 55 | 55 | ||
| 56 | /** | 56 | /** |
| 57 | * hibernation_set_ops - set the global hibernate operations | 57 | * hibernation_set_ops - set the global hibernate operations |
| 58 | * @ops: the hibernation operations to use in subsequent hibernation transitions | 58 | * @ops: the hibernation operations to use in subsequent hibernation transitions |
| 59 | */ | 59 | */ |
| 60 | 60 | ||
| 61 | void hibernation_set_ops(struct platform_hibernation_ops *ops) | 61 | void hibernation_set_ops(const struct platform_hibernation_ops *ops) |
| 62 | { | 62 | { |
| 63 | if (ops && !(ops->begin && ops->end && ops->pre_snapshot | 63 | if (ops && !(ops->begin && ops->end && ops->pre_snapshot |
| 64 | && ops->prepare && ops->finish && ops->enter && ops->pre_restore | 64 | && ops->prepare && ops->finish && ops->enter && ops->pre_restore |
| 65 | && ops->restore_cleanup)) { | 65 | && ops->restore_cleanup && ops->leave)) { |
| 66 | WARN_ON(1); | 66 | WARN_ON(1); |
| 67 | return; | 67 | return; |
| 68 | } | 68 | } |
| @@ -278,7 +278,7 @@ static int create_image(int platform_mode) | |||
| 278 | goto Enable_irqs; | 278 | goto Enable_irqs; |
| 279 | } | 279 | } |
| 280 | 280 | ||
| 281 | if (hibernation_test(TEST_CORE) || !pm_check_wakeup_events()) | 281 | if (hibernation_test(TEST_CORE) || pm_wakeup_pending()) |
| 282 | goto Power_up; | 282 | goto Power_up; |
| 283 | 283 | ||
| 284 | in_suspend = 1; | 284 | in_suspend = 1; |
| @@ -327,7 +327,6 @@ static int create_image(int platform_mode) | |||
| 327 | int hibernation_snapshot(int platform_mode) | 327 | int hibernation_snapshot(int platform_mode) |
| 328 | { | 328 | { |
| 329 | int error; | 329 | int error; |
| 330 | gfp_t saved_mask; | ||
| 331 | 330 | ||
| 332 | error = platform_begin(platform_mode); | 331 | error = platform_begin(platform_mode); |
| 333 | if (error) | 332 | if (error) |
| @@ -339,7 +338,7 @@ int hibernation_snapshot(int platform_mode) | |||
| 339 | goto Close; | 338 | goto Close; |
| 340 | 339 | ||
| 341 | suspend_console(); | 340 | suspend_console(); |
| 342 | saved_mask = clear_gfp_allowed_mask(GFP_IOFS); | 341 | pm_restrict_gfp_mask(); |
| 343 | error = dpm_suspend_start(PMSG_FREEZE); | 342 | error = dpm_suspend_start(PMSG_FREEZE); |
| 344 | if (error) | 343 | if (error) |
| 345 | goto Recover_platform; | 344 | goto Recover_platform; |
| @@ -348,7 +347,10 @@ int hibernation_snapshot(int platform_mode) | |||
| 348 | goto Recover_platform; | 347 | goto Recover_platform; |
| 349 | 348 | ||
| 350 | error = create_image(platform_mode); | 349 | error = create_image(platform_mode); |
| 351 | /* Control returns here after successful restore */ | 350 | /* |
| 351 | * Control returns here (1) after the image has been created or the | ||
| 352 | * image creation has failed and (2) after a successful restore. | ||
| 353 | */ | ||
| 352 | 354 | ||
| 353 | Resume_devices: | 355 | Resume_devices: |
| 354 | /* We may need to release the preallocated image pages here. */ | 356 | /* We may need to release the preallocated image pages here. */ |
| @@ -357,7 +359,10 @@ int hibernation_snapshot(int platform_mode) | |||
| 357 | 359 | ||
| 358 | dpm_resume_end(in_suspend ? | 360 | dpm_resume_end(in_suspend ? |
| 359 | (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); | 361 | (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); |
| 360 | set_gfp_allowed_mask(saved_mask); | 362 | |
| 363 | if (error || !in_suspend) | ||
| 364 | pm_restore_gfp_mask(); | ||
| 365 | |||
| 361 | resume_console(); | 366 | resume_console(); |
| 362 | Close: | 367 | Close: |
| 363 | platform_end(platform_mode); | 368 | platform_end(platform_mode); |
| @@ -452,17 +457,16 @@ static int resume_target_kernel(bool platform_mode) | |||
| 452 | int hibernation_restore(int platform_mode) | 457 | int hibernation_restore(int platform_mode) |
| 453 | { | 458 | { |
| 454 | int error; | 459 | int error; |
| 455 | gfp_t saved_mask; | ||
| 456 | 460 | ||
| 457 | pm_prepare_console(); | 461 | pm_prepare_console(); |
| 458 | suspend_console(); | 462 | suspend_console(); |
| 459 | saved_mask = clear_gfp_allowed_mask(GFP_IOFS); | 463 | pm_restrict_gfp_mask(); |
| 460 | error = dpm_suspend_start(PMSG_QUIESCE); | 464 | error = dpm_suspend_start(PMSG_QUIESCE); |
| 461 | if (!error) { | 465 | if (!error) { |
| 462 | error = resume_target_kernel(platform_mode); | 466 | error = resume_target_kernel(platform_mode); |
| 463 | dpm_resume_end(PMSG_RECOVER); | 467 | dpm_resume_end(PMSG_RECOVER); |
| 464 | } | 468 | } |
| 465 | set_gfp_allowed_mask(saved_mask); | 469 | pm_restore_gfp_mask(); |
| 466 | resume_console(); | 470 | resume_console(); |
| 467 | pm_restore_console(); | 471 | pm_restore_console(); |
| 468 | return error; | 472 | return error; |
| @@ -476,7 +480,6 @@ int hibernation_restore(int platform_mode) | |||
| 476 | int hibernation_platform_enter(void) | 480 | int hibernation_platform_enter(void) |
| 477 | { | 481 | { |
| 478 | int error; | 482 | int error; |
| 479 | gfp_t saved_mask; | ||
| 480 | 483 | ||
| 481 | if (!hibernation_ops) | 484 | if (!hibernation_ops) |
| 482 | return -ENOSYS; | 485 | return -ENOSYS; |
| @@ -492,7 +495,6 @@ int hibernation_platform_enter(void) | |||
| 492 | 495 | ||
| 493 | entering_platform_hibernation = true; | 496 | entering_platform_hibernation = true; |
| 494 | suspend_console(); | 497 | suspend_console(); |
| 495 | saved_mask = clear_gfp_allowed_mask(GFP_IOFS); | ||
| 496 | error = dpm_suspend_start(PMSG_HIBERNATE); | 498 | error = dpm_suspend_start(PMSG_HIBERNATE); |
| 497 | if (error) { | 499 | if (error) { |
| 498 | if (hibernation_ops->recover) | 500 | if (hibernation_ops->recover) |
| @@ -514,7 +516,7 @@ int hibernation_platform_enter(void) | |||
| 514 | 516 | ||
| 515 | local_irq_disable(); | 517 | local_irq_disable(); |
| 516 | sysdev_suspend(PMSG_HIBERNATE); | 518 | sysdev_suspend(PMSG_HIBERNATE); |
| 517 | if (!pm_check_wakeup_events()) { | 519 | if (pm_wakeup_pending()) { |
| 518 | error = -EAGAIN; | 520 | error = -EAGAIN; |
| 519 | goto Power_up; | 521 | goto Power_up; |
| 520 | } | 522 | } |
| @@ -536,7 +538,6 @@ int hibernation_platform_enter(void) | |||
| 536 | Resume_devices: | 538 | Resume_devices: |
| 537 | entering_platform_hibernation = false; | 539 | entering_platform_hibernation = false; |
| 538 | dpm_resume_end(PMSG_RESTORE); | 540 | dpm_resume_end(PMSG_RESTORE); |
| 539 | set_gfp_allowed_mask(saved_mask); | ||
| 540 | resume_console(); | 541 | resume_console(); |
| 541 | 542 | ||
| 542 | Close: | 543 | Close: |
| @@ -646,6 +647,8 @@ int hibernate(void) | |||
| 646 | swsusp_free(); | 647 | swsusp_free(); |
| 647 | if (!error) | 648 | if (!error) |
| 648 | power_down(); | 649 | power_down(); |
| 650 | in_suspend = 0; | ||
| 651 | pm_restore_gfp_mask(); | ||
| 649 | } else { | 652 | } else { |
| 650 | pr_debug("PM: Image restored successfully.\n"); | 653 | pr_debug("PM: Image restored successfully.\n"); |
| 651 | } | 654 | } |
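Two changes recur throughout hibernate.c above: the open-coded clear_gfp_allowed_mask()/set_gfp_allowed_mask() pairs become pm_restrict_gfp_mask()/pm_restore_gfp_mask(), which keep the saved mask internally, and the wakeup test is inverted from !pm_check_wakeup_events() to pm_wakeup_pending(). A condensed sketch of the restore path after the conversion, as if in kernel/power/hibernate.c with platform mode hard-coded and console details trimmed:

static int example_restore_path(void)
{
        int error;

        pm_prepare_console();
        suspend_console();
        pm_restrict_gfp_mask();         /* was: saved = clear_gfp_allowed_mask(GFP_IOFS) */

        error = dpm_suspend_start(PMSG_QUIESCE);
        if (!error) {
                error = resume_target_kernel(false);
                dpm_resume_end(PMSG_RECOVER);
        }

        pm_restore_gfp_mask();          /* was: set_gfp_allowed_mask(saved) */
        resume_console();
        pm_restore_console();

        return error;
}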
diff --git a/kernel/power/main.c b/kernel/power/main.c index 7b5db6a8561..701853042c2 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
| @@ -326,7 +326,7 @@ EXPORT_SYMBOL_GPL(pm_wq); | |||
| 326 | 326 | ||
| 327 | static int __init pm_start_workqueue(void) | 327 | static int __init pm_start_workqueue(void) |
| 328 | { | 328 | { |
| 329 | pm_wq = alloc_workqueue("pm", WQ_FREEZEABLE, 0); | 329 | pm_wq = alloc_workqueue("pm", WQ_FREEZABLE, 0); |
| 330 | 330 | ||
| 331 | return pm_wq ? 0 : -ENOMEM; | 331 | return pm_wq ? 0 : -ENOMEM; |
| 332 | } | 332 | } |
diff --git a/kernel/power/nvs.c b/kernel/power/nvs.c deleted file mode 100644 index 1836db60bbb..00000000000 --- a/kernel/power/nvs.c +++ /dev/null | |||
| @@ -1,136 +0,0 @@ | |||
| 1 | /* | ||
| 2 | * linux/kernel/power/hibernate_nvs.c - Routines for handling NVS memory | ||
| 3 | * | ||
| 4 | * Copyright (C) 2008,2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc. | ||
| 5 | * | ||
| 6 | * This file is released under the GPLv2. | ||
| 7 | */ | ||
| 8 | |||
| 9 | #include <linux/io.h> | ||
| 10 | #include <linux/kernel.h> | ||
| 11 | #include <linux/list.h> | ||
| 12 | #include <linux/mm.h> | ||
| 13 | #include <linux/slab.h> | ||
| 14 | #include <linux/suspend.h> | ||
| 15 | |||
| 16 | /* | ||
| 17 | * Platforms, like ACPI, may want us to save some memory used by them during | ||
| 18 | * suspend and to restore the contents of this memory during the subsequent | ||
| 19 | * resume. The code below implements a mechanism allowing us to do that. | ||
| 20 | */ | ||
| 21 | |||
| 22 | struct nvs_page { | ||
| 23 | unsigned long phys_start; | ||
| 24 | unsigned int size; | ||
| 25 | void *kaddr; | ||
| 26 | void *data; | ||
| 27 | struct list_head node; | ||
| 28 | }; | ||
| 29 | |||
| 30 | static LIST_HEAD(nvs_list); | ||
| 31 | |||
| 32 | /** | ||
| 33 | * suspend_nvs_register - register platform NVS memory region to save | ||
| 34 | * @start - physical address of the region | ||
| 35 | * @size - size of the region | ||
| 36 | * | ||
| 37 | * The NVS region need not be page-aligned (both ends) and we arrange | ||
| 38 | * things so that the data from page-aligned addresses in this region will | ||
| 39 | * be copied into separate RAM pages. | ||
| 40 | */ | ||
| 41 | int suspend_nvs_register(unsigned long start, unsigned long size) | ||
| 42 | { | ||
| 43 | struct nvs_page *entry, *next; | ||
| 44 | |||
| 45 | while (size > 0) { | ||
| 46 | unsigned int nr_bytes; | ||
| 47 | |||
| 48 | entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL); | ||
| 49 | if (!entry) | ||
| 50 | goto Error; | ||
| 51 | |||
| 52 | list_add_tail(&entry->node, &nvs_list); | ||
| 53 | entry->phys_start = start; | ||
| 54 | nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK); | ||
| 55 | entry->size = (size < nr_bytes) ? size : nr_bytes; | ||
| 56 | |||
| 57 | start += entry->size; | ||
| 58 | size -= entry->size; | ||
| 59 | } | ||
| 60 | return 0; | ||
| 61 | |||
| 62 | Error: | ||
| 63 | list_for_each_entry_safe(entry, next, &nvs_list, node) { | ||
| 64 | list_del(&entry->node); | ||
| 65 | kfree(entry); | ||
| 66 | } | ||
| 67 | return -ENOMEM; | ||
| 68 | } | ||
| 69 | |||
| 70 | /** | ||
| 71 | * suspend_nvs_free - free data pages allocated for saving NVS regions | ||
| 72 | */ | ||
| 73 | void suspend_nvs_free(void) | ||
| 74 | { | ||
| 75 | struct nvs_page *entry; | ||
| 76 | |||
| 77 | list_for_each_entry(entry, &nvs_list, node) | ||
| 78 | if (entry->data) { | ||
| 79 | free_page((unsigned long)entry->data); | ||
| 80 | entry->data = NULL; | ||
| 81 | if (entry->kaddr) { | ||
| 82 | iounmap(entry->kaddr); | ||
| 83 | entry->kaddr = NULL; | ||
| 84 | } | ||
| 85 | } | ||
| 86 | } | ||
| 87 | |||
| 88 | /** | ||
| 89 | * suspend_nvs_alloc - allocate memory necessary for saving NVS regions | ||
| 90 | */ | ||
| 91 | int suspend_nvs_alloc(void) | ||
| 92 | { | ||
| 93 | struct nvs_page *entry; | ||
| 94 | |||
| 95 | list_for_each_entry(entry, &nvs_list, node) { | ||
| 96 | entry->data = (void *)__get_free_page(GFP_KERNEL); | ||
| 97 | if (!entry->data) { | ||
| 98 | suspend_nvs_free(); | ||
| 99 | return -ENOMEM; | ||
| 100 | } | ||
| 101 | } | ||
| 102 | return 0; | ||
| 103 | } | ||
| 104 | |||
| 105 | /** | ||
| 106 | * suspend_nvs_save - save NVS memory regions | ||
| 107 | */ | ||
| 108 | void suspend_nvs_save(void) | ||
| 109 | { | ||
| 110 | struct nvs_page *entry; | ||
| 111 | |||
| 112 | printk(KERN_INFO "PM: Saving platform NVS memory\n"); | ||
| 113 | |||
| 114 | list_for_each_entry(entry, &nvs_list, node) | ||
| 115 | if (entry->data) { | ||
| 116 | entry->kaddr = ioremap(entry->phys_start, entry->size); | ||
| 117 | memcpy(entry->data, entry->kaddr, entry->size); | ||
| 118 | } | ||
| 119 | } | ||
| 120 | |||
| 121 | /** | ||
| 122 | * suspend_nvs_restore - restore NVS memory regions | ||
| 123 | * | ||
| 124 | * This function is going to be called with interrupts disabled, so it | ||
| 125 | * cannot iounmap the virtual addresses used to access the NVS region. | ||
| 126 | */ | ||
| 127 | void suspend_nvs_restore(void) | ||
| 128 | { | ||
| 129 | struct nvs_page *entry; | ||
| 130 | |||
| 131 | printk(KERN_INFO "PM: Restoring platform NVS memory\n"); | ||
| 132 | |||
| 133 | list_for_each_entry(entry, &nvs_list, node) | ||
| 134 | if (entry->data) | ||
| 135 | memcpy(entry->kaddr, entry->data, entry->size); | ||
| 136 | } | ||
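
kernel/power/nvs.c is deleted here; the suspend_nvs_* helpers move out of kernel/power (ACPI being their user). For reference, the call order the removed code implies is sketched below, with error handling elided:

    /* boot: the platform registers each NVS region it wants preserved */
    suspend_nvs_register(start_phys, size);

    /* suspend path */
    if (suspend_nvs_alloc())        /* one spare RAM page per NVS page */
            return -ENOMEM;
    suspend_nvs_save();             /* ioremap() the region and copy it out */

    /* resume path, still with interrupts disabled */
    suspend_nvs_restore();          /* copy the saved data back via the kept mappings */

    /* later, in a context that may sleep */
    suspend_nvs_free();             /* free the RAM copies and iounmap() */
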
diff --git a/kernel/power/process.c b/kernel/power/process.c index e50b4c1b2a0..0cf3a27a6c9 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c | |||
| @@ -22,7 +22,7 @@ | |||
| 22 | */ | 22 | */ |
| 23 | #define TIMEOUT (20 * HZ) | 23 | #define TIMEOUT (20 * HZ) |
| 24 | 24 | ||
| 25 | static inline int freezeable(struct task_struct * p) | 25 | static inline int freezable(struct task_struct * p) |
| 26 | { | 26 | { |
| 27 | if ((p == current) || | 27 | if ((p == current) || |
| 28 | (p->flags & PF_NOFREEZE) || | 28 | (p->flags & PF_NOFREEZE) || |
| @@ -53,7 +53,7 @@ static int try_to_freeze_tasks(bool sig_only) | |||
| 53 | todo = 0; | 53 | todo = 0; |
| 54 | read_lock(&tasklist_lock); | 54 | read_lock(&tasklist_lock); |
| 55 | do_each_thread(g, p) { | 55 | do_each_thread(g, p) { |
| 56 | if (frozen(p) || !freezeable(p)) | 56 | if (frozen(p) || !freezable(p)) |
| 57 | continue; | 57 | continue; |
| 58 | 58 | ||
| 59 | if (!freeze_task(p, sig_only)) | 59 | if (!freeze_task(p, sig_only)) |
| @@ -64,6 +64,12 @@ static int try_to_freeze_tasks(bool sig_only) | |||
| 64 | * perturb a task in TASK_STOPPED or TASK_TRACED. | 64 | * perturb a task in TASK_STOPPED or TASK_TRACED. |
| 65 | * It is "frozen enough". If the task does wake | 65 | * It is "frozen enough". If the task does wake |
| 66 | * up, it will immediately call try_to_freeze. | 66 | * up, it will immediately call try_to_freeze. |
| 67 | * | ||
| 68 | * Because freeze_task() goes through p's | ||
| 69 | * scheduler lock after setting TIF_FREEZE, it's | ||
| 70 | * guaranteed that either we see TASK_RUNNING or | ||
| 71 | * try_to_stop() after schedule() in ptrace/signal | ||
| 72 | * stop sees TIF_FREEZE. | ||
| 67 | */ | 73 | */ |
| 68 | if (!task_is_stopped_or_traced(p) && | 74 | if (!task_is_stopped_or_traced(p) && |
| 69 | !freezer_should_skip(p)) | 75 | !freezer_should_skip(p)) |
| @@ -79,7 +85,7 @@ static int try_to_freeze_tasks(bool sig_only) | |||
| 79 | if (!todo || time_after(jiffies, end_time)) | 85 | if (!todo || time_after(jiffies, end_time)) |
| 80 | break; | 86 | break; |
| 81 | 87 | ||
| 82 | if (!pm_check_wakeup_events()) { | 88 | if (pm_wakeup_pending()) { |
| 83 | wakeup = true; | 89 | wakeup = true; |
| 84 | break; | 90 | break; |
| 85 | } | 91 | } |
| @@ -161,7 +167,7 @@ static void thaw_tasks(bool nosig_only) | |||
| 161 | 167 | ||
| 162 | read_lock(&tasklist_lock); | 168 | read_lock(&tasklist_lock); |
| 163 | do_each_thread(g, p) { | 169 | do_each_thread(g, p) { |
| 164 | if (!freezeable(p)) | 170 | if (!freezable(p)) |
| 165 | continue; | 171 | continue; |
| 166 | 172 | ||
| 167 | if (nosig_only && should_send_signal(p)) | 173 | if (nosig_only && should_send_signal(p)) |
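
Besides the freezeable() → freezable() rename, try_to_freeze_tasks() now bails out of its retry loop as soon as a wakeup event is pending rather than treating pm_check_wakeup_events() as a success test. The exit conditions, condensed from the hunks above:

    if (!todo || time_after(jiffies, end_time))
            break;                  /* everything froze, or the 20 s timeout expired */

    if (pm_wakeup_pending()) {      /* a wakeup source fired mid-freeze */
            wakeup = true;          /* tell the caller the freeze was aborted */
            break;
    }
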
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 0dac75ea445..64db648ff91 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
| @@ -1519,11 +1519,8 @@ static int | |||
| 1519 | swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, | 1519 | swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, |
| 1520 | unsigned int nr_pages, unsigned int nr_highmem) | 1520 | unsigned int nr_pages, unsigned int nr_highmem) |
| 1521 | { | 1521 | { |
| 1522 | int error = 0; | ||
| 1523 | |||
| 1524 | if (nr_highmem > 0) { | 1522 | if (nr_highmem > 0) { |
| 1525 | error = get_highmem_buffer(PG_ANY); | 1523 | if (get_highmem_buffer(PG_ANY)) |
| 1526 | if (error) | ||
| 1527 | goto err_out; | 1524 | goto err_out; |
| 1528 | if (nr_highmem > alloc_highmem) { | 1525 | if (nr_highmem > alloc_highmem) { |
| 1529 | nr_highmem -= alloc_highmem; | 1526 | nr_highmem -= alloc_highmem; |
| @@ -1546,7 +1543,7 @@ swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, | |||
| 1546 | 1543 | ||
| 1547 | err_out: | 1544 | err_out: |
| 1548 | swsusp_free(); | 1545 | swsusp_free(); |
| 1549 | return error; | 1546 | return -ENOMEM; |
| 1550 | } | 1547 | } |
| 1551 | 1548 | ||
| 1552 | asmlinkage int swsusp_save(void) | 1549 | asmlinkage int swsusp_save(void) |
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 7335952ee47..de6f86bfa30 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c | |||
| @@ -22,6 +22,7 @@ | |||
| 22 | #include <linux/mm.h> | 22 | #include <linux/mm.h> |
| 23 | #include <linux/slab.h> | 23 | #include <linux/slab.h> |
| 24 | #include <linux/suspend.h> | 24 | #include <linux/suspend.h> |
| 25 | #include <trace/events/power.h> | ||
| 25 | 26 | ||
| 26 | #include "power.h" | 27 | #include "power.h" |
| 27 | 28 | ||
| @@ -30,13 +31,13 @@ const char *const pm_states[PM_SUSPEND_MAX] = { | |||
| 30 | [PM_SUSPEND_MEM] = "mem", | 31 | [PM_SUSPEND_MEM] = "mem", |
| 31 | }; | 32 | }; |
| 32 | 33 | ||
| 33 | static struct platform_suspend_ops *suspend_ops; | 34 | static const struct platform_suspend_ops *suspend_ops; |
| 34 | 35 | ||
| 35 | /** | 36 | /** |
| 36 | * suspend_set_ops - Set the global suspend method table. | 37 | * suspend_set_ops - Set the global suspend method table. |
| 37 | * @ops: Pointer to ops structure. | 38 | * @ops: Pointer to ops structure. |
| 38 | */ | 39 | */ |
| 39 | void suspend_set_ops(struct platform_suspend_ops *ops) | 40 | void suspend_set_ops(const struct platform_suspend_ops *ops) |
| 40 | { | 41 | { |
| 41 | mutex_lock(&pm_mutex); | 42 | mutex_lock(&pm_mutex); |
| 42 | suspend_ops = ops; | 43 | suspend_ops = ops; |
| @@ -163,7 +164,7 @@ static int suspend_enter(suspend_state_t state) | |||
| 163 | 164 | ||
| 164 | error = sysdev_suspend(PMSG_SUSPEND); | 165 | error = sysdev_suspend(PMSG_SUSPEND); |
| 165 | if (!error) { | 166 | if (!error) { |
| 166 | if (!suspend_test(TEST_CORE) && pm_check_wakeup_events()) { | 167 | if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) { |
| 167 | error = suspend_ops->enter(state); | 168 | error = suspend_ops->enter(state); |
| 168 | events_check_enabled = false; | 169 | events_check_enabled = false; |
| 169 | } | 170 | } |
| @@ -197,18 +198,18 @@ static int suspend_enter(suspend_state_t state) | |||
| 197 | int suspend_devices_and_enter(suspend_state_t state) | 198 | int suspend_devices_and_enter(suspend_state_t state) |
| 198 | { | 199 | { |
| 199 | int error; | 200 | int error; |
| 200 | gfp_t saved_mask; | ||
| 201 | 201 | ||
| 202 | if (!suspend_ops) | 202 | if (!suspend_ops) |
| 203 | return -ENOSYS; | 203 | return -ENOSYS; |
| 204 | 204 | ||
| 205 | trace_machine_suspend(state); | ||
| 205 | if (suspend_ops->begin) { | 206 | if (suspend_ops->begin) { |
| 206 | error = suspend_ops->begin(state); | 207 | error = suspend_ops->begin(state); |
| 207 | if (error) | 208 | if (error) |
| 208 | goto Close; | 209 | goto Close; |
| 209 | } | 210 | } |
| 210 | suspend_console(); | 211 | suspend_console(); |
| 211 | saved_mask = clear_gfp_allowed_mask(GFP_IOFS); | 212 | pm_restrict_gfp_mask(); |
| 212 | suspend_test_start(); | 213 | suspend_test_start(); |
| 213 | error = dpm_suspend_start(PMSG_SUSPEND); | 214 | error = dpm_suspend_start(PMSG_SUSPEND); |
| 214 | if (error) { | 215 | if (error) { |
| @@ -225,11 +226,12 @@ int suspend_devices_and_enter(suspend_state_t state) | |||
| 225 | suspend_test_start(); | 226 | suspend_test_start(); |
| 226 | dpm_resume_end(PMSG_RESUME); | 227 | dpm_resume_end(PMSG_RESUME); |
| 227 | suspend_test_finish("resume devices"); | 228 | suspend_test_finish("resume devices"); |
| 228 | set_gfp_allowed_mask(saved_mask); | 229 | pm_restore_gfp_mask(); |
| 229 | resume_console(); | 230 | resume_console(); |
| 230 | Close: | 231 | Close: |
| 231 | if (suspend_ops->end) | 232 | if (suspend_ops->end) |
| 232 | suspend_ops->end(); | 233 | suspend_ops->end(); |
| 234 | trace_machine_suspend(PWR_EVENT_EXIT); | ||
| 233 | return error; | 235 | return error; |
| 234 | 236 | ||
| 235 | Recover_platform: | 237 | Recover_platform: |
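
suspend_set_ops() now takes a const pointer, so platform code can mark its ops table const; the GFP juggling moves into pm_restrict_gfp_mask()/pm_restore_gfp_mask(), and entry/exit gain machine_suspend trace points. A hedged registration sketch (the platform name is invented; only hooks touched in this file are shown, and suspend_valid_only_mem is the stock .valid helper):

    #include <linux/suspend.h>

    static int myplat_enter(suspend_state_t state)
    {
            /* program the platform sleep state here; return 0 on success */
            return 0;
    }

    static const struct platform_suspend_ops myplat_suspend_ops = {
            .valid = suspend_valid_only_mem,        /* only "mem" is supported */
            .enter = myplat_enter,
    };

    static int __init myplat_pm_init(void)
    {
            suspend_set_ops(&myplat_suspend_ops);   /* const-qualified as of this change */
            return 0;
    }
    late_initcall(myplat_pm_init);
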
diff --git a/kernel/power/swap.c b/kernel/power/swap.c index a0e4a86ccf9..7c97c3a0eee 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c | |||
| @@ -6,6 +6,7 @@ | |||
| 6 | * | 6 | * |
| 7 | * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz> | 7 | * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz> |
| 8 | * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> | 8 | * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> |
| 9 | * Copyright (C) 2010 Bojan Smojver <bojan@rexursive.com> | ||
| 9 | * | 10 | * |
| 10 | * This file is released under the GPLv2. | 11 | * This file is released under the GPLv2. |
| 11 | * | 12 | * |
| @@ -29,7 +30,7 @@ | |||
| 29 | 30 | ||
| 30 | #include "power.h" | 31 | #include "power.h" |
| 31 | 32 | ||
| 32 | #define HIBERNATE_SIG "LINHIB0001" | 33 | #define HIBERNATE_SIG "S1SUSPEND" |
| 33 | 34 | ||
| 34 | /* | 35 | /* |
| 35 | * The swap map is a data structure used for keeping track of each page | 36 | * The swap map is a data structure used for keeping track of each page |
| @@ -223,7 +224,7 @@ static int swsusp_swap_check(void) | |||
| 223 | return res; | 224 | return res; |
| 224 | 225 | ||
| 225 | root_swap = res; | 226 | root_swap = res; |
| 226 | res = blkdev_get(hib_resume_bdev, FMODE_WRITE); | 227 | res = blkdev_get(hib_resume_bdev, FMODE_WRITE, NULL); |
| 227 | if (res) | 228 | if (res) |
| 228 | return res; | 229 | return res; |
| 229 | 230 | ||
| @@ -753,30 +754,43 @@ static int load_image_lzo(struct swap_map_handle *handle, | |||
| 753 | { | 754 | { |
| 754 | unsigned int m; | 755 | unsigned int m; |
| 755 | int error = 0; | 756 | int error = 0; |
| 757 | struct bio *bio; | ||
| 756 | struct timeval start; | 758 | struct timeval start; |
| 757 | struct timeval stop; | 759 | struct timeval stop; |
| 758 | unsigned nr_pages; | 760 | unsigned nr_pages; |
| 759 | size_t off, unc_len, cmp_len; | 761 | size_t i, off, unc_len, cmp_len; |
| 760 | unsigned char *unc, *cmp, *page; | 762 | unsigned char *unc, *cmp, *page[LZO_CMP_PAGES]; |
| 761 | 763 | ||
| 762 | page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); | 764 | for (i = 0; i < LZO_CMP_PAGES; i++) { |
| 763 | if (!page) { | 765 | page[i] = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); |
| 764 | printk(KERN_ERR "PM: Failed to allocate LZO page\n"); | 766 | if (!page[i]) { |
| 765 | return -ENOMEM; | 767 | printk(KERN_ERR "PM: Failed to allocate LZO page\n"); |
| 768 | |||
| 769 | while (i) | ||
| 770 | free_page((unsigned long)page[--i]); | ||
| 771 | |||
| 772 | return -ENOMEM; | ||
| 773 | } | ||
| 766 | } | 774 | } |
| 767 | 775 | ||
| 768 | unc = vmalloc(LZO_UNC_SIZE); | 776 | unc = vmalloc(LZO_UNC_SIZE); |
| 769 | if (!unc) { | 777 | if (!unc) { |
| 770 | printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n"); | 778 | printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n"); |
| 771 | free_page((unsigned long)page); | 779 | |
| 780 | for (i = 0; i < LZO_CMP_PAGES; i++) | ||
| 781 | free_page((unsigned long)page[i]); | ||
| 782 | |||
| 772 | return -ENOMEM; | 783 | return -ENOMEM; |
| 773 | } | 784 | } |
| 774 | 785 | ||
| 775 | cmp = vmalloc(LZO_CMP_SIZE); | 786 | cmp = vmalloc(LZO_CMP_SIZE); |
| 776 | if (!cmp) { | 787 | if (!cmp) { |
| 777 | printk(KERN_ERR "PM: Failed to allocate LZO compressed\n"); | 788 | printk(KERN_ERR "PM: Failed to allocate LZO compressed\n"); |
| 789 | |||
| 778 | vfree(unc); | 790 | vfree(unc); |
| 779 | free_page((unsigned long)page); | 791 | for (i = 0; i < LZO_CMP_PAGES; i++) |
| 792 | free_page((unsigned long)page[i]); | ||
| 793 | |||
| 780 | return -ENOMEM; | 794 | return -ENOMEM; |
| 781 | } | 795 | } |
| 782 | 796 | ||
| @@ -787,6 +801,7 @@ static int load_image_lzo(struct swap_map_handle *handle, | |||
| 787 | if (!m) | 801 | if (!m) |
| 788 | m = 1; | 802 | m = 1; |
| 789 | nr_pages = 0; | 803 | nr_pages = 0; |
| 804 | bio = NULL; | ||
| 790 | do_gettimeofday(&start); | 805 | do_gettimeofday(&start); |
| 791 | 806 | ||
| 792 | error = snapshot_write_next(snapshot); | 807 | error = snapshot_write_next(snapshot); |
| @@ -794,11 +809,11 @@ static int load_image_lzo(struct swap_map_handle *handle, | |||
| 794 | goto out_finish; | 809 | goto out_finish; |
| 795 | 810 | ||
| 796 | for (;;) { | 811 | for (;;) { |
| 797 | error = swap_read_page(handle, page, NULL); /* sync */ | 812 | error = swap_read_page(handle, page[0], NULL); /* sync */ |
| 798 | if (error) | 813 | if (error) |
| 799 | break; | 814 | break; |
| 800 | 815 | ||
| 801 | cmp_len = *(size_t *)page; | 816 | cmp_len = *(size_t *)page[0]; |
| 802 | if (unlikely(!cmp_len || | 817 | if (unlikely(!cmp_len || |
| 803 | cmp_len > lzo1x_worst_compress(LZO_UNC_SIZE))) { | 818 | cmp_len > lzo1x_worst_compress(LZO_UNC_SIZE))) { |
| 804 | printk(KERN_ERR "PM: Invalid LZO compressed length\n"); | 819 | printk(KERN_ERR "PM: Invalid LZO compressed length\n"); |
| @@ -806,13 +821,20 @@ static int load_image_lzo(struct swap_map_handle *handle, | |||
| 806 | break; | 821 | break; |
| 807 | } | 822 | } |
| 808 | 823 | ||
| 809 | memcpy(cmp, page, PAGE_SIZE); | 824 | for (off = PAGE_SIZE, i = 1; |
| 810 | for (off = PAGE_SIZE; off < LZO_HEADER + cmp_len; off += PAGE_SIZE) { | 825 | off < LZO_HEADER + cmp_len; off += PAGE_SIZE, i++) { |
| 811 | error = swap_read_page(handle, page, NULL); /* sync */ | 826 | error = swap_read_page(handle, page[i], &bio); |
| 812 | if (error) | 827 | if (error) |
| 813 | goto out_finish; | 828 | goto out_finish; |
| 829 | } | ||
| 814 | 830 | ||
| 815 | memcpy(cmp + off, page, PAGE_SIZE); | 831 | error = hib_wait_on_bio_chain(&bio); /* need all data now */ |
| 832 | if (error) | ||
| 833 | goto out_finish; | ||
| 834 | |||
| 835 | for (off = 0, i = 0; | ||
| 836 | off < LZO_HEADER + cmp_len; off += PAGE_SIZE, i++) { | ||
| 837 | memcpy(cmp + off, page[i], PAGE_SIZE); | ||
| 816 | } | 838 | } |
| 817 | 839 | ||
| 818 | unc_len = LZO_UNC_SIZE; | 840 | unc_len = LZO_UNC_SIZE; |
| @@ -857,7 +879,8 @@ out_finish: | |||
| 857 | 879 | ||
| 858 | vfree(cmp); | 880 | vfree(cmp); |
| 859 | vfree(unc); | 881 | vfree(unc); |
| 860 | free_page((unsigned long)page); | 882 | for (i = 0; i < LZO_CMP_PAGES; i++) |
| 883 | free_page((unsigned long)page[i]); | ||
| 861 | 884 | ||
| 862 | return error; | 885 | return error; |
| 863 | } | 886 | } |
| @@ -865,7 +888,7 @@ out_finish: | |||
| 865 | /** | 888 | /** |
| 866 | * swsusp_read - read the hibernation image. | 889 | * swsusp_read - read the hibernation image. |
| 867 | * @flags_p: flags passed by the "frozen" kernel in the image header should | 890 | * @flags_p: flags passed by the "frozen" kernel in the image header should |
| 868 | * be written into this memeory location | 891 | * be written into this memory location |
| 869 | */ | 892 | */ |
| 870 | 893 | ||
| 871 | int swsusp_read(unsigned int *flags_p) | 894 | int swsusp_read(unsigned int *flags_p) |
| @@ -907,7 +930,8 @@ int swsusp_check(void) | |||
| 907 | { | 930 | { |
| 908 | int error; | 931 | int error; |
| 909 | 932 | ||
| 910 | hib_resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); | 933 | hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, |
| 934 | FMODE_READ, NULL); | ||
| 911 | if (!IS_ERR(hib_resume_bdev)) { | 935 | if (!IS_ERR(hib_resume_bdev)) { |
| 912 | set_blocksize(hib_resume_bdev, PAGE_SIZE); | 936 | set_blocksize(hib_resume_bdev, PAGE_SIZE); |
| 913 | clear_page(swsusp_header); | 937 | clear_page(swsusp_header); |
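
The LZO load path now reads a whole compressed block asynchronously into an array of LZO_CMP_PAGES pages and waits on the bio chain once, instead of issuing one synchronous swap_read_page() per page. The core of the new pattern, condensed from the hunks above (error handling elided):

    error = swap_read_page(handle, page[0], NULL);          /* first page: synchronous */
    cmp_len = *(size_t *)page[0];                           /* compressed length lives in the header */

    /* queue the rest of this block; passing &bio makes the reads asynchronous */
    for (off = PAGE_SIZE, i = 1; off < LZO_HEADER + cmp_len; off += PAGE_SIZE, i++)
            error = swap_read_page(handle, page[i], &bio);

    error = hib_wait_on_bio_chain(&bio);                    /* need all the data before decompressing */

    for (off = 0, i = 0; off < LZO_HEADER + cmp_len; off += PAGE_SIZE, i++)
            memcpy(cmp + off, page[i], PAGE_SIZE);          /* gather into the contiguous buffer */
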
diff --git a/kernel/power/user.c b/kernel/power/user.c index e819e17877c..c36c3b9e8a8 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c | |||
| @@ -137,7 +137,7 @@ static int snapshot_release(struct inode *inode, struct file *filp) | |||
| 137 | free_all_swap_pages(data->swap); | 137 | free_all_swap_pages(data->swap); |
| 138 | if (data->frozen) | 138 | if (data->frozen) |
| 139 | thaw_processes(); | 139 | thaw_processes(); |
| 140 | pm_notifier_call_chain(data->mode == O_WRONLY ? | 140 | pm_notifier_call_chain(data->mode == O_RDONLY ? |
| 141 | PM_POST_HIBERNATION : PM_POST_RESTORE); | 141 | PM_POST_HIBERNATION : PM_POST_RESTORE); |
| 142 | atomic_inc(&snapshot_device_available); | 142 | atomic_inc(&snapshot_device_available); |
| 143 | 143 | ||
| @@ -263,6 +263,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, | |||
| 263 | case SNAPSHOT_UNFREEZE: | 263 | case SNAPSHOT_UNFREEZE: |
| 264 | if (!data->frozen || data->ready) | 264 | if (!data->frozen || data->ready) |
| 265 | break; | 265 | break; |
| 266 | pm_restore_gfp_mask(); | ||
| 266 | thaw_processes(); | 267 | thaw_processes(); |
| 267 | usermodehelper_enable(); | 268 | usermodehelper_enable(); |
| 268 | data->frozen = 0; | 269 | data->frozen = 0; |
| @@ -275,6 +276,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, | |||
| 275 | error = -EPERM; | 276 | error = -EPERM; |
| 276 | break; | 277 | break; |
| 277 | } | 278 | } |
| 279 | pm_restore_gfp_mask(); | ||
| 278 | error = hibernation_snapshot(data->platform_support); | 280 | error = hibernation_snapshot(data->platform_support); |
| 279 | if (!error) | 281 | if (!error) |
| 280 | error = put_user(in_suspend, (int __user *)arg); | 282 | error = put_user(in_suspend, (int __user *)arg); |
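
The snapshot_release() fix matters because the open mode of /dev/snapshot encodes the direction of the operation: a read-only open means user space will read a freshly created image (hibernation), a write-only open means it will feed an image back (restore). The release path must fire the matching POST notifier, as annotated in this sketch:

    /* open():   O_RDONLY -> creating/reading an image   -> PM_HIBERNATION_PREPARE
     *           O_WRONLY -> writing an image to restore -> PM_RESTORE_PREPARE
     * release() now mirrors that pairing (it used to test O_WRONLY): */
    pm_notifier_call_chain(data->mode == O_RDONLY ?
                           PM_POST_HIBERNATION : PM_POST_RESTORE);
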
diff --git a/kernel/printk.c b/kernel/printk.c index b2ebaee8c37..36231525e22 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
| @@ -39,16 +39,11 @@ | |||
| 39 | #include <linux/syslog.h> | 39 | #include <linux/syslog.h> |
| 40 | #include <linux/cpu.h> | 40 | #include <linux/cpu.h> |
| 41 | #include <linux/notifier.h> | 41 | #include <linux/notifier.h> |
| 42 | #include <linux/rculist.h> | ||
| 42 | 43 | ||
| 43 | #include <asm/uaccess.h> | 44 | #include <asm/uaccess.h> |
| 44 | 45 | ||
| 45 | /* | 46 | /* |
| 46 | * for_each_console() allows you to iterate on each console | ||
| 47 | */ | ||
| 48 | #define for_each_console(con) \ | ||
| 49 | for (con = console_drivers; con != NULL; con = con->next) | ||
| 50 | |||
| 51 | /* | ||
| 52 | * Architectures can override it: | 47 | * Architectures can override it: |
| 53 | */ | 48 | */ |
| 54 | void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) | 49 | void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) |
| @@ -102,7 +97,7 @@ static int console_locked, console_suspended; | |||
| 102 | /* | 97 | /* |
| 103 | * logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars | 98 | * logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars |
| 104 | * It is also used in interesting ways to provide interlocking in | 99 | * It is also used in interesting ways to provide interlocking in |
| 105 | * release_console_sem(). | 100 | * console_unlock(). |
| 106 | */ | 101 | */ |
| 107 | static DEFINE_SPINLOCK(logbuf_lock); | 102 | static DEFINE_SPINLOCK(logbuf_lock); |
| 108 | 103 | ||
| @@ -261,14 +256,55 @@ static inline void boot_delay_msec(void) | |||
| 261 | } | 256 | } |
| 262 | #endif | 257 | #endif |
| 263 | 258 | ||
| 259 | #ifdef CONFIG_SECURITY_DMESG_RESTRICT | ||
| 260 | int dmesg_restrict = 1; | ||
| 261 | #else | ||
| 262 | int dmesg_restrict; | ||
| 263 | #endif | ||
| 264 | |||
| 265 | static int syslog_action_restricted(int type) | ||
| 266 | { | ||
| 267 | if (dmesg_restrict) | ||
| 268 | return 1; | ||
| 269 | /* Unless restricted, we allow "read all" and "get buffer size" for everybody */ | ||
| 270 | return type != SYSLOG_ACTION_READ_ALL && type != SYSLOG_ACTION_SIZE_BUFFER; | ||
| 271 | } | ||
| 272 | |||
| 273 | static int check_syslog_permissions(int type, bool from_file) | ||
| 274 | { | ||
| 275 | /* | ||
| 276 | * If this is from /proc/kmsg and we've already opened it, then we've | ||
| 277 | * already done the capabilities checks at open time. | ||
| 278 | */ | ||
| 279 | if (from_file && type != SYSLOG_ACTION_OPEN) | ||
| 280 | return 0; | ||
| 281 | |||
| 282 | if (syslog_action_restricted(type)) { | ||
| 283 | if (capable(CAP_SYSLOG)) | ||
| 284 | return 0; | ||
| 285 | /* For historical reasons, accept CAP_SYS_ADMIN too, with a warning */ | ||
| 286 | if (capable(CAP_SYS_ADMIN)) { | ||
| 287 | WARN_ONCE(1, "Attempt to access syslog with CAP_SYS_ADMIN " | ||
| 288 | "but no CAP_SYSLOG (deprecated).\n"); | ||
| 289 | return 0; | ||
| 290 | } | ||
| 291 | return -EPERM; | ||
| 292 | } | ||
| 293 | return 0; | ||
| 294 | } | ||
| 295 | |||
| 264 | int do_syslog(int type, char __user *buf, int len, bool from_file) | 296 | int do_syslog(int type, char __user *buf, int len, bool from_file) |
| 265 | { | 297 | { |
| 266 | unsigned i, j, limit, count; | 298 | unsigned i, j, limit, count; |
| 267 | int do_clear = 0; | 299 | int do_clear = 0; |
| 268 | char c; | 300 | char c; |
| 269 | int error = 0; | 301 | int error; |
| 270 | 302 | ||
| 271 | error = security_syslog(type, from_file); | 303 | error = check_syslog_permissions(type, from_file); |
| 304 | if (error) | ||
| 305 | goto out; | ||
| 306 | |||
| 307 | error = security_syslog(type); | ||
| 272 | if (error) | 308 | if (error) |
| 273 | return error; | 309 | return error; |
| 274 | 310 | ||
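
With dmesg_restrict set (or CONFIG_SECURITY_DMESG_RESTRICT=y), every syslog action now requires CAP_SYSLOG; CAP_SYS_ADMIN is still accepted but triggers a one-time deprecation warning. A small user-space illustration of what an unprivileged reader sees (the SYSLOG_ACTION_* values mirror include/linux/syslog.h):

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/klog.h>                   /* klogctl() */

    #define SYSLOG_ACTION_READ_ALL     3    /* as in include/linux/syslog.h */
    #define SYSLOG_ACTION_SIZE_BUFFER 10

    int main(void)
    {
            int len = klogctl(SYSLOG_ACTION_SIZE_BUFFER, NULL, 0);
            if (len < 0) {                  /* EPERM if dmesg_restrict=1 and we lack CAP_SYSLOG */
                    perror("klogctl(SIZE_BUFFER)");
                    return 1;
            }

            char *buf = malloc(len);
            if (!buf)
                    return 1;

            int n = klogctl(SYSLOG_ACTION_READ_ALL, buf, len);
            if (n < 0)
                    perror("klogctl(READ_ALL)");
            else
                    fwrite(buf, 1, n, stdout);

            free(buf);
            return n < 0;
    }
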
| @@ -481,7 +517,7 @@ static void _call_console_drivers(unsigned start, | |||
| 481 | /* | 517 | /* |
| 482 | * Call the console drivers, asking them to write out | 518 | * Call the console drivers, asking them to write out |
| 483 | * log_buf[start] to log_buf[end - 1]. | 519 | * log_buf[start] to log_buf[end - 1]. |
| 484 | * The console_sem must be held. | 520 | * The console_lock must be held. |
| 485 | */ | 521 | */ |
| 486 | static void call_console_drivers(unsigned start, unsigned end) | 522 | static void call_console_drivers(unsigned start, unsigned end) |
| 487 | { | 523 | { |
| @@ -584,11 +620,11 @@ static int have_callable_console(void) | |||
| 584 | * | 620 | * |
| 585 | * This is printk(). It can be called from any context. We want it to work. | 621 | * This is printk(). It can be called from any context. We want it to work. |
| 586 | * | 622 | * |
| 587 | * We try to grab the console_sem. If we succeed, it's easy - we log the output and | 623 | * We try to grab the console_lock. If we succeed, it's easy - we log the output and |
| 588 | * call the console drivers. If we fail to get the semaphore we place the output | 624 | * call the console drivers. If we fail to get the semaphore we place the output |
| 589 | * into the log buffer and return. The current holder of the console_sem will | 625 | * into the log buffer and return. The current holder of the console_sem will |
| 590 | * notice the new output in release_console_sem() and will send it to the | 626 | * notice the new output in console_unlock() and will send it to the |
| 591 | * consoles before releasing the semaphore. | 627 | * consoles before releasing the lock. |
| 592 | * | 628 | * |
| 593 | * One effect of this deferred printing is that code which calls printk() and | 629 | * One effect of this deferred printing is that code which calls printk() and |
| 594 | * then changes console_loglevel may break. This is because console_loglevel | 630 | * then changes console_loglevel may break. This is because console_loglevel |
| @@ -639,19 +675,19 @@ static inline int can_use_console(unsigned int cpu) | |||
| 639 | /* | 675 | /* |
| 640 | * Try to get console ownership to actually show the kernel | 676 | * Try to get console ownership to actually show the kernel |
| 641 | * messages from a 'printk'. Return true (and with the | 677 | * messages from a 'printk'. Return true (and with the |
| 642 | * console_semaphore held, and 'console_locked' set) if it | 678 | * console_lock held, and 'console_locked' set) if it |
| 643 | * is successful, false otherwise. | 679 | * is successful, false otherwise. |
| 644 | * | 680 | * |
| 645 | * This gets called with the 'logbuf_lock' spinlock held and | 681 | * This gets called with the 'logbuf_lock' spinlock held and |
| 646 | * interrupts disabled. It should return with 'lockbuf_lock' | 682 | * interrupts disabled. It should return with 'lockbuf_lock' |
| 647 | * released but interrupts still disabled. | 683 | * released but interrupts still disabled. |
| 648 | */ | 684 | */ |
| 649 | static int acquire_console_semaphore_for_printk(unsigned int cpu) | 685 | static int console_trylock_for_printk(unsigned int cpu) |
| 650 | __releases(&logbuf_lock) | 686 | __releases(&logbuf_lock) |
| 651 | { | 687 | { |
| 652 | int retval = 0; | 688 | int retval = 0; |
| 653 | 689 | ||
| 654 | if (!try_acquire_console_sem()) { | 690 | if (console_trylock()) { |
| 655 | retval = 1; | 691 | retval = 1; |
| 656 | 692 | ||
| 657 | /* | 693 | /* |
| @@ -807,12 +843,12 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
| 807 | * actual magic (print out buffers, wake up klogd, | 843 | * actual magic (print out buffers, wake up klogd, |
| 808 | * etc). | 844 | * etc). |
| 809 | * | 845 | * |
| 810 | * The acquire_console_semaphore_for_printk() function | 846 | * The console_trylock_for_printk() function |
| 811 | * will release 'logbuf_lock' regardless of whether it | 847 | * will release 'logbuf_lock' regardless of whether it |
| 812 | * actually gets the semaphore or not. | 848 | * actually gets the semaphore or not. |
| 813 | */ | 849 | */ |
| 814 | if (acquire_console_semaphore_for_printk(this_cpu)) | 850 | if (console_trylock_for_printk(this_cpu)) |
| 815 | release_console_sem(); | 851 | console_unlock(); |
| 816 | 852 | ||
| 817 | lockdep_on(); | 853 | lockdep_on(); |
| 818 | out_restore_irqs: | 854 | out_restore_irqs: |
| @@ -973,7 +1009,7 @@ void suspend_console(void) | |||
| 973 | if (!console_suspend_enabled) | 1009 | if (!console_suspend_enabled) |
| 974 | return; | 1010 | return; |
| 975 | printk("Suspending console(s) (use no_console_suspend to debug)\n"); | 1011 | printk("Suspending console(s) (use no_console_suspend to debug)\n"); |
| 976 | acquire_console_sem(); | 1012 | console_lock(); |
| 977 | console_suspended = 1; | 1013 | console_suspended = 1; |
| 978 | up(&console_sem); | 1014 | up(&console_sem); |
| 979 | } | 1015 | } |
| @@ -984,7 +1020,7 @@ void resume_console(void) | |||
| 984 | return; | 1020 | return; |
| 985 | down(&console_sem); | 1021 | down(&console_sem); |
| 986 | console_suspended = 0; | 1022 | console_suspended = 0; |
| 987 | release_console_sem(); | 1023 | console_unlock(); |
| 988 | } | 1024 | } |
| 989 | 1025 | ||
| 990 | /** | 1026 | /** |
| @@ -1007,21 +1043,21 @@ static int __cpuinit console_cpu_notify(struct notifier_block *self, | |||
| 1007 | case CPU_DYING: | 1043 | case CPU_DYING: |
| 1008 | case CPU_DOWN_FAILED: | 1044 | case CPU_DOWN_FAILED: |
| 1009 | case CPU_UP_CANCELED: | 1045 | case CPU_UP_CANCELED: |
| 1010 | acquire_console_sem(); | 1046 | console_lock(); |
| 1011 | release_console_sem(); | 1047 | console_unlock(); |
| 1012 | } | 1048 | } |
| 1013 | return NOTIFY_OK; | 1049 | return NOTIFY_OK; |
| 1014 | } | 1050 | } |
| 1015 | 1051 | ||
| 1016 | /** | 1052 | /** |
| 1017 | * acquire_console_sem - lock the console system for exclusive use. | 1053 | * console_lock - lock the console system for exclusive use. |
| 1018 | * | 1054 | * |
| 1019 | * Acquires a semaphore which guarantees that the caller has | 1055 | * Acquires a lock which guarantees that the caller has |
| 1020 | * exclusive access to the console system and the console_drivers list. | 1056 | * exclusive access to the console system and the console_drivers list. |
| 1021 | * | 1057 | * |
| 1022 | * Can sleep, returns nothing. | 1058 | * Can sleep, returns nothing. |
| 1023 | */ | 1059 | */ |
| 1024 | void acquire_console_sem(void) | 1060 | void console_lock(void) |
| 1025 | { | 1061 | { |
| 1026 | BUG_ON(in_interrupt()); | 1062 | BUG_ON(in_interrupt()); |
| 1027 | down(&console_sem); | 1063 | down(&console_sem); |
| @@ -1030,21 +1066,29 @@ void acquire_console_sem(void) | |||
| 1030 | console_locked = 1; | 1066 | console_locked = 1; |
| 1031 | console_may_schedule = 1; | 1067 | console_may_schedule = 1; |
| 1032 | } | 1068 | } |
| 1033 | EXPORT_SYMBOL(acquire_console_sem); | 1069 | EXPORT_SYMBOL(console_lock); |
| 1034 | 1070 | ||
| 1035 | int try_acquire_console_sem(void) | 1071 | /** |
| 1072 | * console_trylock - try to lock the console system for exclusive use. | ||
| 1073 | * | ||
| 1074 | * Tries to acquire a lock which guarantees that the caller has | ||
| 1075 | * exclusive access to the console system and the console_drivers list. | ||
| 1076 | * | ||
| 1077 | * returns 1 on success, and 0 on failure to acquire the lock. | ||
| 1078 | */ | ||
| 1079 | int console_trylock(void) | ||
| 1036 | { | 1080 | { |
| 1037 | if (down_trylock(&console_sem)) | 1081 | if (down_trylock(&console_sem)) |
| 1038 | return -1; | 1082 | return 0; |
| 1039 | if (console_suspended) { | 1083 | if (console_suspended) { |
| 1040 | up(&console_sem); | 1084 | up(&console_sem); |
| 1041 | return -1; | 1085 | return 0; |
| 1042 | } | 1086 | } |
| 1043 | console_locked = 1; | 1087 | console_locked = 1; |
| 1044 | console_may_schedule = 0; | 1088 | console_may_schedule = 0; |
| 1045 | return 0; | 1089 | return 1; |
| 1046 | } | 1090 | } |
| 1047 | EXPORT_SYMBOL(try_acquire_console_sem); | 1091 | EXPORT_SYMBOL(console_trylock); |
| 1048 | 1092 | ||
| 1049 | int is_console_locked(void) | 1093 | int is_console_locked(void) |
| 1050 | { | 1094 | { |
| @@ -1055,38 +1099,40 @@ static DEFINE_PER_CPU(int, printk_pending); | |||
| 1055 | 1099 | ||
| 1056 | void printk_tick(void) | 1100 | void printk_tick(void) |
| 1057 | { | 1101 | { |
| 1058 | if (__get_cpu_var(printk_pending)) { | 1102 | if (__this_cpu_read(printk_pending)) { |
| 1059 | __get_cpu_var(printk_pending) = 0; | 1103 | __this_cpu_write(printk_pending, 0); |
| 1060 | wake_up_interruptible(&log_wait); | 1104 | wake_up_interruptible(&log_wait); |
| 1061 | } | 1105 | } |
| 1062 | } | 1106 | } |
| 1063 | 1107 | ||
| 1064 | int printk_needs_cpu(int cpu) | 1108 | int printk_needs_cpu(int cpu) |
| 1065 | { | 1109 | { |
| 1066 | return per_cpu(printk_pending, cpu); | 1110 | if (cpu_is_offline(cpu)) |
| 1111 | printk_tick(); | ||
| 1112 | return __this_cpu_read(printk_pending); | ||
| 1067 | } | 1113 | } |
| 1068 | 1114 | ||
| 1069 | void wake_up_klogd(void) | 1115 | void wake_up_klogd(void) |
| 1070 | { | 1116 | { |
| 1071 | if (waitqueue_active(&log_wait)) | 1117 | if (waitqueue_active(&log_wait)) |
| 1072 | __raw_get_cpu_var(printk_pending) = 1; | 1118 | this_cpu_write(printk_pending, 1); |
| 1073 | } | 1119 | } |
| 1074 | 1120 | ||
| 1075 | /** | 1121 | /** |
| 1076 | * release_console_sem - unlock the console system | 1122 | * console_unlock - unlock the console system |
| 1077 | * | 1123 | * |
| 1078 | * Releases the semaphore which the caller holds on the console system | 1124 | * Releases the console_lock which the caller holds on the console system |
| 1079 | * and the console driver list. | 1125 | * and the console driver list. |
| 1080 | * | 1126 | * |
| 1081 | * While the semaphore was held, console output may have been buffered | 1127 | * While the console_lock was held, console output may have been buffered |
| 1082 | * by printk(). If this is the case, release_console_sem() emits | 1128 | * by printk(). If this is the case, console_unlock() emits |
| 1083 | * the output prior to releasing the semaphore. | 1129 | * the output prior to releasing the lock. |
| 1084 | * | 1130 | * |
| 1085 | * If there is output waiting for klogd, we wake it up. | 1131 | * If there is output waiting for klogd, we wake it up. |
| 1086 | * | 1132 | * |
| 1087 | * release_console_sem() may be called from any context. | 1133 | * console_unlock() may be called from any context. |
| 1088 | */ | 1134 | */ |
| 1089 | void release_console_sem(void) | 1135 | void console_unlock(void) |
| 1090 | { | 1136 | { |
| 1091 | unsigned long flags; | 1137 | unsigned long flags; |
| 1092 | unsigned _con_start, _log_end; | 1138 | unsigned _con_start, _log_end; |
| @@ -1119,7 +1165,7 @@ void release_console_sem(void) | |||
| 1119 | if (wake_klogd) | 1165 | if (wake_klogd) |
| 1120 | wake_up_klogd(); | 1166 | wake_up_klogd(); |
| 1121 | } | 1167 | } |
| 1122 | EXPORT_SYMBOL(release_console_sem); | 1168 | EXPORT_SYMBOL(console_unlock); |
| 1123 | 1169 | ||
| 1124 | /** | 1170 | /** |
| 1125 | * console_conditional_schedule - yield the CPU if required | 1171 | * console_conditional_schedule - yield the CPU if required |
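
The semaphore-flavoured names are gone: acquire_console_sem()/release_console_sem() become console_lock()/console_unlock(), and console_trylock() follows the usual trylock convention of returning 1 on success and 0 on failure (the old try_acquire_console_sem() returned 0 and -1). Typical caller patterns after the rename:

    /* blocking form */
    console_lock();
    /* ... walk console_drivers, tweak a console ... */
    console_unlock();               /* also flushes anything printk() buffered meanwhile */

    /* non-blocking form -- note the inverted return value */
    if (console_trylock()) {
            /* we own the console system */
            console_unlock();
    } else {
            /* current owner will emit the buffered output for us */
    }
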
| @@ -1128,7 +1174,7 @@ EXPORT_SYMBOL(release_console_sem); | |||
| 1128 | * if this CPU should yield the CPU to another task, do | 1174 | * if this CPU should yield the CPU to another task, do |
| 1129 | * so here. | 1175 | * so here. |
| 1130 | * | 1176 | * |
| 1131 | * Must be called within acquire_console_sem(). | 1177 | * Must be called within console_lock(). |
| 1132 | */ | 1178 | */ |
| 1133 | void __sched console_conditional_schedule(void) | 1179 | void __sched console_conditional_schedule(void) |
| 1134 | { | 1180 | { |
| @@ -1149,14 +1195,14 @@ void console_unblank(void) | |||
| 1149 | if (down_trylock(&console_sem) != 0) | 1195 | if (down_trylock(&console_sem) != 0) |
| 1150 | return; | 1196 | return; |
| 1151 | } else | 1197 | } else |
| 1152 | acquire_console_sem(); | 1198 | console_lock(); |
| 1153 | 1199 | ||
| 1154 | console_locked = 1; | 1200 | console_locked = 1; |
| 1155 | console_may_schedule = 0; | 1201 | console_may_schedule = 0; |
| 1156 | for_each_console(c) | 1202 | for_each_console(c) |
| 1157 | if ((c->flags & CON_ENABLED) && c->unblank) | 1203 | if ((c->flags & CON_ENABLED) && c->unblank) |
| 1158 | c->unblank(); | 1204 | c->unblank(); |
| 1159 | release_console_sem(); | 1205 | console_unlock(); |
| 1160 | } | 1206 | } |
| 1161 | 1207 | ||
| 1162 | /* | 1208 | /* |
| @@ -1167,7 +1213,7 @@ struct tty_driver *console_device(int *index) | |||
| 1167 | struct console *c; | 1213 | struct console *c; |
| 1168 | struct tty_driver *driver = NULL; | 1214 | struct tty_driver *driver = NULL; |
| 1169 | 1215 | ||
| 1170 | acquire_console_sem(); | 1216 | console_lock(); |
| 1171 | for_each_console(c) { | 1217 | for_each_console(c) { |
| 1172 | if (!c->device) | 1218 | if (!c->device) |
| 1173 | continue; | 1219 | continue; |
| @@ -1175,7 +1221,7 @@ struct tty_driver *console_device(int *index) | |||
| 1175 | if (driver) | 1221 | if (driver) |
| 1176 | break; | 1222 | break; |
| 1177 | } | 1223 | } |
| 1178 | release_console_sem(); | 1224 | console_unlock(); |
| 1179 | return driver; | 1225 | return driver; |
| 1180 | } | 1226 | } |
| 1181 | 1227 | ||
| @@ -1186,17 +1232,17 @@ struct tty_driver *console_device(int *index) | |||
| 1186 | */ | 1232 | */ |
| 1187 | void console_stop(struct console *console) | 1233 | void console_stop(struct console *console) |
| 1188 | { | 1234 | { |
| 1189 | acquire_console_sem(); | 1235 | console_lock(); |
| 1190 | console->flags &= ~CON_ENABLED; | 1236 | console->flags &= ~CON_ENABLED; |
| 1191 | release_console_sem(); | 1237 | console_unlock(); |
| 1192 | } | 1238 | } |
| 1193 | EXPORT_SYMBOL(console_stop); | 1239 | EXPORT_SYMBOL(console_stop); |
| 1194 | 1240 | ||
| 1195 | void console_start(struct console *console) | 1241 | void console_start(struct console *console) |
| 1196 | { | 1242 | { |
| 1197 | acquire_console_sem(); | 1243 | console_lock(); |
| 1198 | console->flags |= CON_ENABLED; | 1244 | console->flags |= CON_ENABLED; |
| 1199 | release_console_sem(); | 1245 | console_unlock(); |
| 1200 | } | 1246 | } |
| 1201 | EXPORT_SYMBOL(console_start); | 1247 | EXPORT_SYMBOL(console_start); |
| 1202 | 1248 | ||
| @@ -1318,7 +1364,7 @@ void register_console(struct console *newcon) | |||
| 1318 | * Put this console in the list - keep the | 1364 | * Put this console in the list - keep the |
| 1319 | * preferred driver at the head of the list. | 1365 | * preferred driver at the head of the list. |
| 1320 | */ | 1366 | */ |
| 1321 | acquire_console_sem(); | 1367 | console_lock(); |
| 1322 | if ((newcon->flags & CON_CONSDEV) || console_drivers == NULL) { | 1368 | if ((newcon->flags & CON_CONSDEV) || console_drivers == NULL) { |
| 1323 | newcon->next = console_drivers; | 1369 | newcon->next = console_drivers; |
| 1324 | console_drivers = newcon; | 1370 | console_drivers = newcon; |
| @@ -1330,14 +1376,15 @@ void register_console(struct console *newcon) | |||
| 1330 | } | 1376 | } |
| 1331 | if (newcon->flags & CON_PRINTBUFFER) { | 1377 | if (newcon->flags & CON_PRINTBUFFER) { |
| 1332 | /* | 1378 | /* |
| 1333 | * release_console_sem() will print out the buffered messages | 1379 | * console_unlock() will print out the buffered messages |
| 1334 | * for us. | 1380 | * for us. |
| 1335 | */ | 1381 | */ |
| 1336 | spin_lock_irqsave(&logbuf_lock, flags); | 1382 | spin_lock_irqsave(&logbuf_lock, flags); |
| 1337 | con_start = log_start; | 1383 | con_start = log_start; |
| 1338 | spin_unlock_irqrestore(&logbuf_lock, flags); | 1384 | spin_unlock_irqrestore(&logbuf_lock, flags); |
| 1339 | } | 1385 | } |
| 1340 | release_console_sem(); | 1386 | console_unlock(); |
| 1387 | console_sysfs_notify(); | ||
| 1341 | 1388 | ||
| 1342 | /* | 1389 | /* |
| 1343 | * By unregistering the bootconsoles after we enable the real console | 1390 | * By unregistering the bootconsoles after we enable the real console |
| @@ -1373,7 +1420,7 @@ int unregister_console(struct console *console) | |||
| 1373 | return braille_unregister_console(console); | 1420 | return braille_unregister_console(console); |
| 1374 | #endif | 1421 | #endif |
| 1375 | 1422 | ||
| 1376 | acquire_console_sem(); | 1423 | console_lock(); |
| 1377 | if (console_drivers == console) { | 1424 | if (console_drivers == console) { |
| 1378 | console_drivers=console->next; | 1425 | console_drivers=console->next; |
| 1379 | res = 0; | 1426 | res = 0; |
| @@ -1395,7 +1442,8 @@ int unregister_console(struct console *console) | |||
| 1395 | if (console_drivers != NULL && console->flags & CON_CONSDEV) | 1442 | if (console_drivers != NULL && console->flags & CON_CONSDEV) |
| 1396 | console_drivers->flags |= CON_CONSDEV; | 1443 | console_drivers->flags |= CON_CONSDEV; |
| 1397 | 1444 | ||
| 1398 | release_console_sem(); | 1445 | console_unlock(); |
| 1446 | console_sysfs_notify(); | ||
| 1399 | return res; | 1447 | return res; |
| 1400 | } | 1448 | } |
| 1401 | EXPORT_SYMBOL(unregister_console); | 1449 | EXPORT_SYMBOL(unregister_console); |
| @@ -1479,7 +1527,7 @@ int kmsg_dump_register(struct kmsg_dumper *dumper) | |||
| 1479 | /* Don't allow registering multiple times */ | 1527 | /* Don't allow registering multiple times */ |
| 1480 | if (!dumper->registered) { | 1528 | if (!dumper->registered) { |
| 1481 | dumper->registered = 1; | 1529 | dumper->registered = 1; |
| 1482 | list_add_tail(&dumper->list, &dump_list); | 1530 | list_add_tail_rcu(&dumper->list, &dump_list); |
| 1483 | err = 0; | 1531 | err = 0; |
| 1484 | } | 1532 | } |
| 1485 | spin_unlock_irqrestore(&dump_list_lock, flags); | 1533 | spin_unlock_irqrestore(&dump_list_lock, flags); |
| @@ -1503,29 +1551,16 @@ int kmsg_dump_unregister(struct kmsg_dumper *dumper) | |||
| 1503 | spin_lock_irqsave(&dump_list_lock, flags); | 1551 | spin_lock_irqsave(&dump_list_lock, flags); |
| 1504 | if (dumper->registered) { | 1552 | if (dumper->registered) { |
| 1505 | dumper->registered = 0; | 1553 | dumper->registered = 0; |
| 1506 | list_del(&dumper->list); | 1554 | list_del_rcu(&dumper->list); |
| 1507 | err = 0; | 1555 | err = 0; |
| 1508 | } | 1556 | } |
| 1509 | spin_unlock_irqrestore(&dump_list_lock, flags); | 1557 | spin_unlock_irqrestore(&dump_list_lock, flags); |
| 1558 | synchronize_rcu(); | ||
| 1510 | 1559 | ||
| 1511 | return err; | 1560 | return err; |
| 1512 | } | 1561 | } |
| 1513 | EXPORT_SYMBOL_GPL(kmsg_dump_unregister); | 1562 | EXPORT_SYMBOL_GPL(kmsg_dump_unregister); |
| 1514 | 1563 | ||
| 1515 | static const char * const kmsg_reasons[] = { | ||
| 1516 | [KMSG_DUMP_OOPS] = "oops", | ||
| 1517 | [KMSG_DUMP_PANIC] = "panic", | ||
| 1518 | [KMSG_DUMP_KEXEC] = "kexec", | ||
| 1519 | }; | ||
| 1520 | |||
| 1521 | static const char *kmsg_to_str(enum kmsg_dump_reason reason) | ||
| 1522 | { | ||
| 1523 | if (reason >= ARRAY_SIZE(kmsg_reasons) || reason < 0) | ||
| 1524 | return "unknown"; | ||
| 1525 | |||
| 1526 | return kmsg_reasons[reason]; | ||
| 1527 | } | ||
| 1528 | |||
| 1529 | /** | 1564 | /** |
| 1530 | * kmsg_dump - dump kernel log to kernel message dumpers. | 1565 | * kmsg_dump - dump kernel log to kernel message dumpers. |
| 1531 | * @reason: the reason (oops, panic etc) for dumping | 1566 | * @reason: the reason (oops, panic etc) for dumping |
| @@ -1564,13 +1599,9 @@ void kmsg_dump(enum kmsg_dump_reason reason) | |||
| 1564 | l2 = chars; | 1599 | l2 = chars; |
| 1565 | } | 1600 | } |
| 1566 | 1601 | ||
| 1567 | if (!spin_trylock_irqsave(&dump_list_lock, flags)) { | 1602 | rcu_read_lock(); |
| 1568 | printk(KERN_ERR "dump_kmsg: dump list lock is held during %s, skipping dump\n", | 1603 | list_for_each_entry_rcu(dumper, &dump_list, list) |
| 1569 | kmsg_to_str(reason)); | ||
| 1570 | return; | ||
| 1571 | } | ||
| 1572 | list_for_each_entry(dumper, &dump_list, list) | ||
| 1573 | dumper->dump(dumper, reason, s1, l1, s2, l2); | 1604 | dumper->dump(dumper, reason, s1, l1, s2, l2); |
| 1574 | spin_unlock_irqrestore(&dump_list_lock, flags); | 1605 | rcu_read_unlock(); |
| 1575 | } | 1606 | } |
| 1576 | #endif | 1607 | #endif |
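
The dumper list is now traversed under rcu_read_lock(), so kmsg_dump() never skips a dump just because the old spinlock happened to be held, and unregistration waits for readers with synchronize_rcu(). A hedged sketch of a dumper module using the callback signature visible in the hunk above (module and function names invented):

    #include <linux/kmsg_dump.h>
    #include <linux/module.h>

    static void mydump_dump(struct kmsg_dumper *dumper,
                            enum kmsg_dump_reason reason,
                            const char *s1, unsigned long l1,
                            const char *s2, unsigned long l2)
    {
            /* s1/l1 and s2/l2 are the two chunks of the circular log buffer;
             * push them to persistent storage, a scratch register, etc.
             * Runs in oops/panic context, so keep it simple and non-blocking. */
    }

    static struct kmsg_dumper mydump = {
            .dump = mydump_dump,
    };

    static int __init mydump_init(void)
    {
            return kmsg_dump_register(&mydump);     /* invoked under RCU by kmsg_dump() */
    }
    module_init(mydump_init);
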
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 99bbaa3e5b0..1708b1e2972 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
| @@ -313,7 +313,7 @@ int ptrace_detach(struct task_struct *child, unsigned int data) | |||
| 313 | child->exit_code = data; | 313 | child->exit_code = data; |
| 314 | dead = __ptrace_detach(current, child); | 314 | dead = __ptrace_detach(current, child); |
| 315 | if (!child->exit_state) | 315 | if (!child->exit_state) |
| 316 | wake_up_process(child); | 316 | wake_up_state(child, TASK_TRACED | TASK_STOPPED); |
| 317 | } | 317 | } |
| 318 | write_unlock_irq(&tasklist_lock); | 318 | write_unlock_irq(&tasklist_lock); |
| 319 | 319 | ||
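
The detach path now wakes the child only if it is actually sitting in job-control or ptrace stop; wake_up_process() would also have kicked a child that was merely sleeping interruptibly and had nothing to do with the detach. The difference, spelled out (wake_up_process() is equivalent to waking the TASK_NORMAL sleep states):

    wake_up_process(child);                           /* old: also wakes INTERRUPTIBLE/UNINTERRUPTIBLE sleeps */
    wake_up_state(child, TASK_TRACED | TASK_STOPPED); /* new: only disturb a stopped or traced child */
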
diff --git a/kernel/range.c b/kernel/range.c index 471b66acabb..37fa9b99ad5 100644 --- a/kernel/range.c +++ b/kernel/range.c | |||
| @@ -119,7 +119,7 @@ static int cmp_range(const void *x1, const void *x2) | |||
| 119 | 119 | ||
| 120 | int clean_sort_range(struct range *range, int az) | 120 | int clean_sort_range(struct range *range, int az) |
| 121 | { | 121 | { |
| 122 | int i, j, k = az - 1, nr_range = 0; | 122 | int i, j, k = az - 1, nr_range = az; |
| 123 | 123 | ||
| 124 | for (i = 0; i < k; i++) { | 124 | for (i = 0; i < k; i++) { |
| 125 | if (range[i].end) | 125 | if (range[i].end) |
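
clean_sort_range() compacts and sorts a range array and returns the number of non-empty entries; with nr_range initialised to 0, an array that had no empty slots never hit the recount path and was reported as containing zero ranges. An illustrative call, assuming the two-field struct range from include/linux/range.h:

    struct range r[3] = {
            { .start = 0x20, .end = 0x30 },
            { .start = 0x00, .end = 0x10 },
            { .start = 0x10, .end = 0x20 },
    };

    int nr = clean_sort_range(r, 3);
    /* with this fix: nr == 3 and r[] is sorted by .start;
     * previously a completely full array like this yielded nr == 0 */
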
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index d806735342a..0c343b9a46d 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c | |||
| @@ -36,31 +36,16 @@ | |||
| 36 | #include <linux/time.h> | 36 | #include <linux/time.h> |
| 37 | #include <linux/cpu.h> | 37 | #include <linux/cpu.h> |
| 38 | 38 | ||
| 39 | /* Global control variables for rcupdate callback mechanism. */ | 39 | /* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */ |
| 40 | struct rcu_ctrlblk { | 40 | static struct task_struct *rcu_kthread_task; |
| 41 | struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */ | 41 | static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq); |
| 42 | struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ | 42 | static unsigned long have_rcu_kthread_work; |
| 43 | struct rcu_head **curtail; /* ->next pointer of last CB. */ | 43 | static void invoke_rcu_kthread(void); |
| 44 | }; | ||
| 45 | |||
| 46 | /* Definition for rcupdate control block. */ | ||
| 47 | static struct rcu_ctrlblk rcu_sched_ctrlblk = { | ||
| 48 | .donetail = &rcu_sched_ctrlblk.rcucblist, | ||
| 49 | .curtail = &rcu_sched_ctrlblk.rcucblist, | ||
| 50 | }; | ||
| 51 | |||
| 52 | static struct rcu_ctrlblk rcu_bh_ctrlblk = { | ||
| 53 | .donetail = &rcu_bh_ctrlblk.rcucblist, | ||
| 54 | .curtail = &rcu_bh_ctrlblk.rcucblist, | ||
| 55 | }; | ||
| 56 | |||
| 57 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
| 58 | int rcu_scheduler_active __read_mostly; | ||
| 59 | EXPORT_SYMBOL_GPL(rcu_scheduler_active); | ||
| 60 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | ||
| 61 | 44 | ||
| 62 | /* Forward declarations for rcutiny_plugin.h. */ | 45 | /* Forward declarations for rcutiny_plugin.h. */ |
| 63 | static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); | 46 | struct rcu_ctrlblk; |
| 47 | static void rcu_process_callbacks(struct rcu_ctrlblk *rcp); | ||
| 48 | static int rcu_kthread(void *arg); | ||
| 64 | static void __call_rcu(struct rcu_head *head, | 49 | static void __call_rcu(struct rcu_head *head, |
| 65 | void (*func)(struct rcu_head *rcu), | 50 | void (*func)(struct rcu_head *rcu), |
| 66 | struct rcu_ctrlblk *rcp); | 51 | struct rcu_ctrlblk *rcp); |
| @@ -123,7 +108,7 @@ void rcu_sched_qs(int cpu) | |||
| 123 | { | 108 | { |
| 124 | if (rcu_qsctr_help(&rcu_sched_ctrlblk) + | 109 | if (rcu_qsctr_help(&rcu_sched_ctrlblk) + |
| 125 | rcu_qsctr_help(&rcu_bh_ctrlblk)) | 110 | rcu_qsctr_help(&rcu_bh_ctrlblk)) |
| 126 | raise_softirq(RCU_SOFTIRQ); | 111 | invoke_rcu_kthread(); |
| 127 | } | 112 | } |
| 128 | 113 | ||
| 129 | /* | 114 | /* |
| @@ -132,7 +117,7 @@ void rcu_sched_qs(int cpu) | |||
| 132 | void rcu_bh_qs(int cpu) | 117 | void rcu_bh_qs(int cpu) |
| 133 | { | 118 | { |
| 134 | if (rcu_qsctr_help(&rcu_bh_ctrlblk)) | 119 | if (rcu_qsctr_help(&rcu_bh_ctrlblk)) |
| 135 | raise_softirq(RCU_SOFTIRQ); | 120 | invoke_rcu_kthread(); |
| 136 | } | 121 | } |
| 137 | 122 | ||
| 138 | /* | 123 | /* |
| @@ -152,13 +137,14 @@ void rcu_check_callbacks(int cpu, int user) | |||
| 152 | } | 137 | } |
| 153 | 138 | ||
| 154 | /* | 139 | /* |
| 155 | * Helper function for rcu_process_callbacks() that operates on the | 140 | * Invoke the RCU callbacks on the specified rcu_ctrlblk structure |
| 156 | * specified rcu_ctrlblk structure. | 141 | * whose grace period has elapsed. |
| 157 | */ | 142 | */ |
| 158 | static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) | 143 | static void rcu_process_callbacks(struct rcu_ctrlblk *rcp) |
| 159 | { | 144 | { |
| 160 | struct rcu_head *next, *list; | 145 | struct rcu_head *next, *list; |
| 161 | unsigned long flags; | 146 | unsigned long flags; |
| 147 | RCU_TRACE(int cb_count = 0); | ||
| 162 | 148 | ||
| 163 | /* If no RCU callbacks ready to invoke, just return. */ | 149 | /* If no RCU callbacks ready to invoke, just return. */ |
| 164 | if (&rcp->rcucblist == rcp->donetail) | 150 | if (&rcp->rcucblist == rcp->donetail) |
| @@ -180,19 +166,59 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) | |||
| 180 | next = list->next; | 166 | next = list->next; |
| 181 | prefetch(next); | 167 | prefetch(next); |
| 182 | debug_rcu_head_unqueue(list); | 168 | debug_rcu_head_unqueue(list); |
| 169 | local_bh_disable(); | ||
| 183 | list->func(list); | 170 | list->func(list); |
| 171 | local_bh_enable(); | ||
| 184 | list = next; | 172 | list = next; |
| 173 | RCU_TRACE(cb_count++); | ||
| 185 | } | 174 | } |
| 175 | RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); | ||
| 186 | } | 176 | } |
| 187 | 177 | ||
| 188 | /* | 178 | /* |
| 189 | * Invoke any callbacks whose grace period has completed. | 179 | * This kthread invokes RCU callbacks whose grace periods have |
| 180 | * elapsed. It is awakened as needed, and takes the place of the | ||
| 181 | * RCU_SOFTIRQ that was used previously for this purpose. | ||
| 182 | * This is a kthread, but it is never stopped, at least not until | ||
| 183 | * the system goes down. | ||
| 190 | */ | 184 | */ |
| 191 | static void rcu_process_callbacks(struct softirq_action *unused) | 185 | static int rcu_kthread(void *arg) |
| 192 | { | 186 | { |
| 193 | __rcu_process_callbacks(&rcu_sched_ctrlblk); | 187 | unsigned long work; |
| 194 | __rcu_process_callbacks(&rcu_bh_ctrlblk); | 188 | unsigned long morework; |
| 195 | rcu_preempt_process_callbacks(); | 189 | unsigned long flags; |
| 190 | |||
| 191 | for (;;) { | ||
| 192 | wait_event_interruptible(rcu_kthread_wq, | ||
| 193 | have_rcu_kthread_work != 0); | ||
| 194 | morework = rcu_boost(); | ||
| 195 | local_irq_save(flags); | ||
| 196 | work = have_rcu_kthread_work; | ||
| 197 | have_rcu_kthread_work = morework; | ||
| 198 | local_irq_restore(flags); | ||
| 199 | if (work) { | ||
| 200 | rcu_process_callbacks(&rcu_sched_ctrlblk); | ||
| 201 | rcu_process_callbacks(&rcu_bh_ctrlblk); | ||
| 202 | rcu_preempt_process_callbacks(); | ||
| 203 | } | ||
| 204 | schedule_timeout_interruptible(1); /* Leave CPU for others. */ | ||
| 205 | } | ||
| 206 | |||
| 207 | return 0; /* Not reached, but needed to shut gcc up. */ | ||
| 208 | } | ||
| 209 | |||
| 210 | /* | ||
| 211 | * Wake up rcu_kthread() to process callbacks now eligible for invocation | ||
| 212 | * or to boost readers. | ||
| 213 | */ | ||
| 214 | static void invoke_rcu_kthread(void) | ||
| 215 | { | ||
| 216 | unsigned long flags; | ||
| 217 | |||
| 218 | local_irq_save(flags); | ||
| 219 | have_rcu_kthread_work = 1; | ||
| 220 | wake_up(&rcu_kthread_wq); | ||
| 221 | local_irq_restore(flags); | ||
| 196 | } | 222 | } |
| 197 | 223 | ||
| 198 | /* | 224 | /* |
| @@ -230,6 +256,7 @@ static void __call_rcu(struct rcu_head *head, | |||
| 230 | local_irq_save(flags); | 256 | local_irq_save(flags); |
| 231 | *rcp->curtail = head; | 257 | *rcp->curtail = head; |
| 232 | rcp->curtail = &head->next; | 258 | rcp->curtail = &head->next; |
| 259 | RCU_TRACE(rcp->qlen++); | ||
| 233 | local_irq_restore(flags); | 260 | local_irq_restore(flags); |
| 234 | } | 261 | } |
| 235 | 262 | ||
| @@ -282,7 +309,16 @@ void rcu_barrier_sched(void) | |||
| 282 | } | 309 | } |
| 283 | EXPORT_SYMBOL_GPL(rcu_barrier_sched); | 310 | EXPORT_SYMBOL_GPL(rcu_barrier_sched); |
| 284 | 311 | ||
| 285 | void __init rcu_init(void) | 312 | /* |
| 313 | * Spawn the kthread that invokes RCU callbacks. | ||
| 314 | */ | ||
| 315 | static int __init rcu_spawn_kthreads(void) | ||
| 286 | { | 316 | { |
| 287 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); | 317 | struct sched_param sp; |
| 318 | |||
| 319 | rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread"); | ||
| 320 | sp.sched_priority = RCU_BOOST_PRIO; | ||
| 321 | sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp); | ||
| 322 | return 0; | ||
| 288 | } | 323 | } |
| 324 | early_initcall(rcu_spawn_kthreads); | ||
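
TINY_RCU callback invocation moves from RCU_SOFTIRQ to a dedicated rcu_kthread running SCHED_FIFO, which is what makes the RCU_BOOST machinery in the plugin below possible. The wake-up plumbing is an ordinary flag-plus-waitqueue pattern, reduced here to its essentials from the hunks above:

    static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq);
    static unsigned long have_rcu_kthread_work;

    static void invoke_rcu_kthread(void)            /* called from the quiescent-state hooks */
    {
            unsigned long flags;

            local_irq_save(flags);                  /* the flag is shared with irq context */
            have_rcu_kthread_work = 1;
            wake_up(&rcu_kthread_wq);
            local_irq_restore(flags);
    }

    static int rcu_kthread(void *arg)               /* spawned via early_initcall(), never stopped */
    {
            for (;;) {
                    wait_event_interruptible(rcu_kthread_wq,
                                             have_rcu_kthread_work != 0);
                    /* consume the flag, then run rcu_process_callbacks() for each flavor */
                    schedule_timeout_interruptible(1);      /* leave the CPU for others */
            }
            return 0;
    }
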
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 6ceca4f745f..015abaea962 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h | |||
| @@ -22,6 +22,40 @@ | |||
| 22 | * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> | 22 | * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> |
| 23 | */ | 23 | */ |
| 24 | 24 | ||
| 25 | #include <linux/kthread.h> | ||
| 26 | #include <linux/debugfs.h> | ||
| 27 | #include <linux/seq_file.h> | ||
| 28 | |||
| 29 | #ifdef CONFIG_RCU_TRACE | ||
| 30 | #define RCU_TRACE(stmt) stmt | ||
| 31 | #else /* #ifdef CONFIG_RCU_TRACE */ | ||
| 32 | #define RCU_TRACE(stmt) | ||
| 33 | #endif /* #else #ifdef CONFIG_RCU_TRACE */ | ||
| 34 | |||
| 35 | /* Global control variables for rcupdate callback mechanism. */ | ||
| 36 | struct rcu_ctrlblk { | ||
| 37 | struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */ | ||
| 38 | struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ | ||
| 39 | struct rcu_head **curtail; /* ->next pointer of last CB. */ | ||
| 40 | RCU_TRACE(long qlen); /* Number of pending CBs. */ | ||
| 41 | }; | ||
| 42 | |||
| 43 | /* Definition for rcupdate control block. */ | ||
| 44 | static struct rcu_ctrlblk rcu_sched_ctrlblk = { | ||
| 45 | .donetail = &rcu_sched_ctrlblk.rcucblist, | ||
| 46 | .curtail = &rcu_sched_ctrlblk.rcucblist, | ||
| 47 | }; | ||
| 48 | |||
| 49 | static struct rcu_ctrlblk rcu_bh_ctrlblk = { | ||
| 50 | .donetail = &rcu_bh_ctrlblk.rcucblist, | ||
| 51 | .curtail = &rcu_bh_ctrlblk.rcucblist, | ||
| 52 | }; | ||
| 53 | |||
| 54 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
| 55 | int rcu_scheduler_active __read_mostly; | ||
| 56 | EXPORT_SYMBOL_GPL(rcu_scheduler_active); | ||
| 57 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | ||
| 58 | |||
| 25 | #ifdef CONFIG_TINY_PREEMPT_RCU | 59 | #ifdef CONFIG_TINY_PREEMPT_RCU |
| 26 | 60 | ||
| 27 | #include <linux/delay.h> | 61 | #include <linux/delay.h> |
| @@ -46,17 +80,45 @@ struct rcu_preempt_ctrlblk { | |||
| 46 | struct list_head *gp_tasks; | 80 | struct list_head *gp_tasks; |
| 47 | /* Pointer to the first task blocking the */ | 81 | /* Pointer to the first task blocking the */ |
| 48 | /* current grace period, or NULL if there */ | 82 | /* current grace period, or NULL if there */ |
| 49 | /* is not such task. */ | 83 | /* is no such task. */ |
| 50 | struct list_head *exp_tasks; | 84 | struct list_head *exp_tasks; |
| 51 | /* Pointer to first task blocking the */ | 85 | /* Pointer to first task blocking the */ |
| 52 | /* current expedited grace period, or NULL */ | 86 | /* current expedited grace period, or NULL */ |
| 53 | /* if there is no such task. If there */ | 87 | /* if there is no such task. If there */ |
| 54 | /* is no current expedited grace period, */ | 88 | /* is no current expedited grace period, */ |
| 55 | /* then there cannot be any such task. */ | 89 | /* then there cannot be any such task. */ |
| 90 | #ifdef CONFIG_RCU_BOOST | ||
| 91 | struct list_head *boost_tasks; | ||
| 92 | /* Pointer to first task that needs to be */ | ||
| 93 | /* priority-boosted, or NULL if no priority */ | ||
| 94 | /* boosting is needed. If there is no */ | ||
| 95 | /* current or expedited grace period, there */ | ||
| 96 | /* can be no such task. */ | ||
| 97 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
| 56 | u8 gpnum; /* Current grace period. */ | 98 | u8 gpnum; /* Current grace period. */ |
| 57 | u8 gpcpu; /* Last grace period blocked by the CPU. */ | 99 | u8 gpcpu; /* Last grace period blocked by the CPU. */ |
| 58 | u8 completed; /* Last grace period completed. */ | 100 | u8 completed; /* Last grace period completed. */ |
| 59 | /* If all three are equal, RCU is idle. */ | 101 | /* If all three are equal, RCU is idle. */ |
| 102 | #ifdef CONFIG_RCU_BOOST | ||
| 103 | s8 boosted_this_gp; /* Has boosting already happened? */ | ||
| 104 | unsigned long boost_time; /* When to start boosting (jiffies) */ | ||
| 105 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
| 106 | #ifdef CONFIG_RCU_TRACE | ||
| 107 | unsigned long n_grace_periods; | ||
| 108 | #ifdef CONFIG_RCU_BOOST | ||
| 109 | unsigned long n_tasks_boosted; | ||
| 110 | unsigned long n_exp_boosts; | ||
| 111 | unsigned long n_normal_boosts; | ||
| 112 | unsigned long n_normal_balk_blkd_tasks; | ||
| 113 | unsigned long n_normal_balk_gp_tasks; | ||
| 114 | unsigned long n_normal_balk_boost_tasks; | ||
| 115 | unsigned long n_normal_balk_boosted; | ||
| 116 | unsigned long n_normal_balk_notyet; | ||
| 117 | unsigned long n_normal_balk_nos; | ||
| 118 | unsigned long n_exp_balk_blkd_tasks; | ||
| 119 | unsigned long n_exp_balk_nos; | ||
| 120 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
| 121 | #endif /* #ifdef CONFIG_RCU_TRACE */ | ||
| 60 | }; | 122 | }; |
| 61 | 123 | ||
| 62 | static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = { | 124 | static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = { |
| @@ -122,6 +184,210 @@ static int rcu_preempt_gp_in_progress(void) | |||
| 122 | } | 184 | } |
| 123 | 185 | ||
| 124 | /* | 186 | /* |
| 187 | * Advance a ->blkd_tasks-list pointer to the next entry, | ||
| 188 | * returning NULL instead if at the end of the list. | ||
| 189 | */ | ||
| 190 | static struct list_head *rcu_next_node_entry(struct task_struct *t) | ||
| 191 | { | ||
| 192 | struct list_head *np; | ||
| 193 | |||
| 194 | np = t->rcu_node_entry.next; | ||
| 195 | if (np == &rcu_preempt_ctrlblk.blkd_tasks) | ||
| 196 | np = NULL; | ||
| 197 | return np; | ||
| 198 | } | ||
| 199 | |||
| 200 | #ifdef CONFIG_RCU_TRACE | ||
| 201 | |||
| 202 | #ifdef CONFIG_RCU_BOOST | ||
| 203 | static void rcu_initiate_boost_trace(void); | ||
| 204 | static void rcu_initiate_exp_boost_trace(void); | ||
| 205 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
| 206 | |||
| 207 | /* | ||
| 208 | * Dump additional statistics for TINY_PREEMPT_RCU. | ||
| 209 | */ | ||
| 210 | static void show_tiny_preempt_stats(struct seq_file *m) | ||
| 211 | { | ||
| 212 | seq_printf(m, "rcu_preempt: qlen=%ld gp=%lu g%u/p%u/c%u tasks=%c%c%c\n", | ||
| 213 | rcu_preempt_ctrlblk.rcb.qlen, | ||
| 214 | rcu_preempt_ctrlblk.n_grace_periods, | ||
| 215 | rcu_preempt_ctrlblk.gpnum, | ||
| 216 | rcu_preempt_ctrlblk.gpcpu, | ||
| 217 | rcu_preempt_ctrlblk.completed, | ||
| 218 | "T."[list_empty(&rcu_preempt_ctrlblk.blkd_tasks)], | ||
| 219 | "N."[!rcu_preempt_ctrlblk.gp_tasks], | ||
| 220 | "E."[!rcu_preempt_ctrlblk.exp_tasks]); | ||
| 221 | #ifdef CONFIG_RCU_BOOST | ||
| 222 | seq_printf(m, " ttb=%c btg=", | ||
| 223 | "B."[!rcu_preempt_ctrlblk.boost_tasks]); | ||
| 224 | switch (rcu_preempt_ctrlblk.boosted_this_gp) { | ||
| 225 | case -1: | ||
| 226 | seq_puts(m, "exp"); | ||
| 227 | break; | ||
| 228 | case 0: | ||
| 229 | seq_puts(m, "no"); | ||
| 230 | break; | ||
| 231 | case 1: | ||
| 232 | seq_puts(m, "begun"); | ||
| 233 | break; | ||
| 234 | case 2: | ||
| 235 | seq_puts(m, "done"); | ||
| 236 | break; | ||
| 237 | default: | ||
| 238 | seq_printf(m, "?%d?", rcu_preempt_ctrlblk.boosted_this_gp); | ||
| 239 | } | ||
| 240 | seq_printf(m, " ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n", | ||
| 241 | rcu_preempt_ctrlblk.n_tasks_boosted, | ||
| 242 | rcu_preempt_ctrlblk.n_exp_boosts, | ||
| 243 | rcu_preempt_ctrlblk.n_normal_boosts, | ||
| 244 | (int)(jiffies & 0xffff), | ||
| 245 | (int)(rcu_preempt_ctrlblk.boost_time & 0xffff)); | ||
| 246 | seq_printf(m, " %s: nt=%lu gt=%lu bt=%lu b=%lu ny=%lu nos=%lu\n", | ||
| 247 | "normal balk", | ||
| 248 | rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks, | ||
| 249 | rcu_preempt_ctrlblk.n_normal_balk_gp_tasks, | ||
| 250 | rcu_preempt_ctrlblk.n_normal_balk_boost_tasks, | ||
| 251 | rcu_preempt_ctrlblk.n_normal_balk_boosted, | ||
| 252 | rcu_preempt_ctrlblk.n_normal_balk_notyet, | ||
| 253 | rcu_preempt_ctrlblk.n_normal_balk_nos); | ||
| 254 | seq_printf(m, " exp balk: bt=%lu nos=%lu\n", | ||
| 255 | rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks, | ||
| 256 | rcu_preempt_ctrlblk.n_exp_balk_nos); | ||
| 257 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
| 258 | } | ||
| 259 | |||
| 260 | #endif /* #ifdef CONFIG_RCU_TRACE */ | ||
| 261 | |||
| 262 | #ifdef CONFIG_RCU_BOOST | ||
| 263 | |||
| 264 | #include "rtmutex_common.h" | ||
| 265 | |||
| 266 | /* | ||
| 267 | * Carry out RCU priority boosting on the task indicated by ->boost_tasks, | ||
| 268 | * and advance ->boost_tasks to the next task in the ->blkd_tasks list. | ||
| 269 | */ | ||
| 270 | static int rcu_boost(void) | ||
| 271 | { | ||
| 272 | unsigned long flags; | ||
| 273 | struct rt_mutex mtx; | ||
| 274 | struct list_head *np; | ||
| 275 | struct task_struct *t; | ||
| 276 | |||
| 277 | if (rcu_preempt_ctrlblk.boost_tasks == NULL) | ||
| 278 | return 0; /* Nothing to boost. */ | ||
| 279 | raw_local_irq_save(flags); | ||
| 280 | rcu_preempt_ctrlblk.boosted_this_gp++; | ||
| 281 | t = container_of(rcu_preempt_ctrlblk.boost_tasks, struct task_struct, | ||
| 282 | rcu_node_entry); | ||
| 283 | np = rcu_next_node_entry(t); | ||
| 284 | rt_mutex_init_proxy_locked(&mtx, t); | ||
| 285 | t->rcu_boost_mutex = &mtx; | ||
| 286 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED; | ||
| 287 | raw_local_irq_restore(flags); | ||
| 288 | rt_mutex_lock(&mtx); | ||
| 289 | RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++); | ||
| 290 | rcu_preempt_ctrlblk.boosted_this_gp++; | ||
| 291 | rt_mutex_unlock(&mtx); | ||
| 292 | return rcu_preempt_ctrlblk.boost_tasks != NULL; | ||
| 293 | } | ||
| 294 | |||
| 295 | /* | ||
| 296 | * Check to see if it is now time to start boosting RCU readers blocking | ||
| 297 | * the current grace period, and, if so, tell the rcu_kthread_task to | ||
| 298 | * start boosting them. If there is an expedited boost in progress, | ||
| 299 | * we wait for it to complete. | ||
| 300 | * | ||
| 301 | * If there are no blocked readers blocking the current grace period, | ||
| 302 | * return 0 to let the caller know, otherwise return 1. Note that this | ||
| 303 | * return value is independent of whether or not boosting was done. | ||
| 304 | */ | ||
| 305 | static int rcu_initiate_boost(void) | ||
| 306 | { | ||
| 307 | if (!rcu_preempt_blocked_readers_cgp()) { | ||
| 308 | RCU_TRACE(rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks++); | ||
| 309 | return 0; | ||
| 310 | } | ||
| 311 | if (rcu_preempt_ctrlblk.gp_tasks != NULL && | ||
| 312 | rcu_preempt_ctrlblk.boost_tasks == NULL && | ||
| 313 | rcu_preempt_ctrlblk.boosted_this_gp == 0 && | ||
| 314 | ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) { | ||
| 315 | rcu_preempt_ctrlblk.boost_tasks = rcu_preempt_ctrlblk.gp_tasks; | ||
| 316 | invoke_rcu_kthread(); | ||
| 317 | RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++); | ||
| 318 | } else | ||
| 319 | RCU_TRACE(rcu_initiate_boost_trace()); | ||
| 320 | return 1; | ||
| 321 | } | ||
| 322 | |||
| 323 | /* | ||
| 324 | * Initiate boosting for an expedited grace period. | ||
| 325 | */ | ||
| 326 | static void rcu_initiate_expedited_boost(void) | ||
| 327 | { | ||
| 328 | unsigned long flags; | ||
| 329 | |||
| 330 | raw_local_irq_save(flags); | ||
| 331 | if (!list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) { | ||
| 332 | rcu_preempt_ctrlblk.boost_tasks = | ||
| 333 | rcu_preempt_ctrlblk.blkd_tasks.next; | ||
| 334 | rcu_preempt_ctrlblk.boosted_this_gp = -1; | ||
| 335 | invoke_rcu_kthread(); | ||
| 336 | RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++); | ||
| 337 | } else | ||
| 338 | RCU_TRACE(rcu_initiate_exp_boost_trace()); | ||
| 339 | raw_local_irq_restore(flags); | ||
| 340 | } | ||
| 341 | |||
| 342 | #define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) | ||
| 343 | |||
| 344 | /* | ||
| 345 | * Do priority-boost accounting for the start of a new grace period. | ||
| 346 | */ | ||
| 347 | static void rcu_preempt_boost_start_gp(void) | ||
| 348 | { | ||
| 349 | rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES; | ||
| 350 | if (rcu_preempt_ctrlblk.boosted_this_gp > 0) | ||
| 351 | rcu_preempt_ctrlblk.boosted_this_gp = 0; | ||
| 352 | } | ||
| 353 | |||
| 354 | #else /* #ifdef CONFIG_RCU_BOOST */ | ||
| 355 | |||
| 356 | /* | ||
| 357 | * If there is no RCU priority boosting, we don't boost. | ||
| 358 | */ | ||
| 359 | static int rcu_boost(void) | ||
| 360 | { | ||
| 361 | return 0; | ||
| 362 | } | ||
| 363 | |||
| 364 | /* | ||
| 365 | * If there is no RCU priority boosting, we don't initiate boosting, | ||
| 366 | * but we do indicate whether there are blocked readers blocking the | ||
| 367 | * current grace period. | ||
| 368 | */ | ||
| 369 | static int rcu_initiate_boost(void) | ||
| 370 | { | ||
| 371 | return rcu_preempt_blocked_readers_cgp(); | ||
| 372 | } | ||
| 373 | |||
| 374 | /* | ||
| 375 | * If there is no RCU priority boosting, we don't initiate expedited boosting. | ||
| 376 | */ | ||
| 377 | static void rcu_initiate_expedited_boost(void) | ||
| 378 | { | ||
| 379 | } | ||
| 380 | |||
| 381 | /* | ||
| 382 | * If there is no RCU priority boosting, nothing to do at grace-period start. | ||
| 383 | */ | ||
| 384 | static void rcu_preempt_boost_start_gp(void) | ||
| 385 | { | ||
| 386 | } | ||
| 387 | |||
| 388 | #endif /* else #ifdef CONFIG_RCU_BOOST */ | ||
| 389 | |||
| 390 | /* | ||
| 125 | * Record a preemptible-RCU quiescent state for the specified CPU. Note | 391 | * Record a preemptible-RCU quiescent state for the specified CPU. Note |
| 126 | * that this just means that the task currently running on the CPU is | 392 | * that this just means that the task currently running on the CPU is |
| 127 | * in a quiescent state. There might be any number of tasks blocked | 393 | * in a quiescent state. There might be any number of tasks blocked |
| @@ -148,11 +414,14 @@ static void rcu_preempt_cpu_qs(void) | |||
| 148 | rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum; | 414 | rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum; |
| 149 | current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; | 415 | current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; |
| 150 | 416 | ||
| 417 | /* If there is no GP then there is nothing more to do. */ | ||
| 418 | if (!rcu_preempt_gp_in_progress()) | ||
| 419 | return; | ||
| 151 | /* | 420 | /* |
| 152 | * If there is no GP, or if blocked readers are still blocking GP, | 421 | * Check up on boosting. If there are no readers blocking the |
| 153 | * then there is nothing more to do. | 422 | * current grace period, leave. |
| 154 | */ | 423 | */ |
| 155 | if (!rcu_preempt_gp_in_progress() || rcu_preempt_blocked_readers_cgp()) | 424 | if (rcu_initiate_boost()) |
| 156 | return; | 425 | return; |
| 157 | 426 | ||
| 158 | /* Advance callbacks. */ | 427 | /* Advance callbacks. */ |
| @@ -164,9 +433,9 @@ static void rcu_preempt_cpu_qs(void) | |||
| 164 | if (!rcu_preempt_blocked_readers_any()) | 433 | if (!rcu_preempt_blocked_readers_any()) |
| 165 | rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail; | 434 | rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail; |
| 166 | 435 | ||
| 167 | /* If there are done callbacks, make RCU_SOFTIRQ process them. */ | 436 | /* If there are done callbacks, cause them to be invoked. */ |
| 168 | if (*rcu_preempt_ctrlblk.rcb.donetail != NULL) | 437 | if (*rcu_preempt_ctrlblk.rcb.donetail != NULL) |
| 169 | raise_softirq(RCU_SOFTIRQ); | 438 | invoke_rcu_kthread(); |
| 170 | } | 439 | } |
| 171 | 440 | ||
| 172 | /* | 441 | /* |
| @@ -178,12 +447,16 @@ static void rcu_preempt_start_gp(void) | |||
| 178 | 447 | ||
| 179 | /* Official start of GP. */ | 448 | /* Official start of GP. */ |
| 180 | rcu_preempt_ctrlblk.gpnum++; | 449 | rcu_preempt_ctrlblk.gpnum++; |
| 450 | RCU_TRACE(rcu_preempt_ctrlblk.n_grace_periods++); | ||
| 181 | 451 | ||
| 182 | /* Any blocked RCU readers block new GP. */ | 452 | /* Any blocked RCU readers block new GP. */ |
| 183 | if (rcu_preempt_blocked_readers_any()) | 453 | if (rcu_preempt_blocked_readers_any()) |
| 184 | rcu_preempt_ctrlblk.gp_tasks = | 454 | rcu_preempt_ctrlblk.gp_tasks = |
| 185 | rcu_preempt_ctrlblk.blkd_tasks.next; | 455 | rcu_preempt_ctrlblk.blkd_tasks.next; |
| 186 | 456 | ||
| 457 | /* Set up for RCU priority boosting. */ | ||
| 458 | rcu_preempt_boost_start_gp(); | ||
| 459 | |||
| 187 | /* If there is no running reader, CPU is done with GP. */ | 460 | /* If there is no running reader, CPU is done with GP. */ |
| 188 | if (!rcu_preempt_running_reader()) | 461 | if (!rcu_preempt_running_reader()) |
| 189 | rcu_preempt_cpu_qs(); | 462 | rcu_preempt_cpu_qs(); |
| @@ -304,14 +577,16 @@ static void rcu_read_unlock_special(struct task_struct *t) | |||
| 304 | */ | 577 | */ |
| 305 | empty = !rcu_preempt_blocked_readers_cgp(); | 578 | empty = !rcu_preempt_blocked_readers_cgp(); |
| 306 | empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL; | 579 | empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL; |
| 307 | np = t->rcu_node_entry.next; | 580 | np = rcu_next_node_entry(t); |
| 308 | if (np == &rcu_preempt_ctrlblk.blkd_tasks) | ||
| 309 | np = NULL; | ||
| 310 | list_del(&t->rcu_node_entry); | 581 | list_del(&t->rcu_node_entry); |
| 311 | if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks) | 582 | if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks) |
| 312 | rcu_preempt_ctrlblk.gp_tasks = np; | 583 | rcu_preempt_ctrlblk.gp_tasks = np; |
| 313 | if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks) | 584 | if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks) |
| 314 | rcu_preempt_ctrlblk.exp_tasks = np; | 585 | rcu_preempt_ctrlblk.exp_tasks = np; |
| 586 | #ifdef CONFIG_RCU_BOOST | ||
| 587 | if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks) | ||
| 588 | rcu_preempt_ctrlblk.boost_tasks = np; | ||
| 589 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
| 315 | INIT_LIST_HEAD(&t->rcu_node_entry); | 590 | INIT_LIST_HEAD(&t->rcu_node_entry); |
| 316 | 591 | ||
| 317 | /* | 592 | /* |
| @@ -331,6 +606,14 @@ static void rcu_read_unlock_special(struct task_struct *t) | |||
| 331 | if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL) | 606 | if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL) |
| 332 | rcu_report_exp_done(); | 607 | rcu_report_exp_done(); |
| 333 | } | 608 | } |
| 609 | #ifdef CONFIG_RCU_BOOST | ||
| 610 | /* Unboost self if was boosted. */ | ||
| 611 | if (special & RCU_READ_UNLOCK_BOOSTED) { | ||
| 612 | t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BOOSTED; | ||
| 613 | rt_mutex_unlock(t->rcu_boost_mutex); | ||
| 614 | t->rcu_boost_mutex = NULL; | ||
| 615 | } | ||
| 616 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
| 334 | local_irq_restore(flags); | 617 | local_irq_restore(flags); |
| 335 | } | 618 | } |
| 336 | 619 | ||
| @@ -374,7 +657,7 @@ static void rcu_preempt_check_callbacks(void) | |||
| 374 | rcu_preempt_cpu_qs(); | 657 | rcu_preempt_cpu_qs(); |
| 375 | if (&rcu_preempt_ctrlblk.rcb.rcucblist != | 658 | if (&rcu_preempt_ctrlblk.rcb.rcucblist != |
| 376 | rcu_preempt_ctrlblk.rcb.donetail) | 659 | rcu_preempt_ctrlblk.rcb.donetail) |
| 377 | raise_softirq(RCU_SOFTIRQ); | 660 | invoke_rcu_kthread(); |
| 378 | if (rcu_preempt_gp_in_progress() && | 661 | if (rcu_preempt_gp_in_progress() && |
| 379 | rcu_cpu_blocking_cur_gp() && | 662 | rcu_cpu_blocking_cur_gp() && |
| 380 | rcu_preempt_running_reader()) | 663 | rcu_preempt_running_reader()) |
| @@ -383,7 +666,7 @@ static void rcu_preempt_check_callbacks(void) | |||
| 383 | 666 | ||
| 384 | /* | 667 | /* |
| 385 | * TINY_PREEMPT_RCU has an extra callback-list tail pointer to | 668 | * TINY_PREEMPT_RCU has an extra callback-list tail pointer to |
| 386 | * update, so this is invoked from __rcu_process_callbacks() to | 669 | * update, so this is invoked from rcu_process_callbacks() to |
| 387 | * handle that case. Of course, it is invoked for all flavors of | 670 | * handle that case. Of course, it is invoked for all flavors of |
| 388 | * RCU, but RCU callbacks can appear only on one of the lists, and | 671 | * RCU, but RCU callbacks can appear only on one of the lists, and |
| 389 | * neither ->nexttail nor ->donetail can possibly be NULL, so there | 672 | * neither ->nexttail nor ->donetail can possibly be NULL, so there |
| @@ -400,7 +683,7 @@ static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp) | |||
| 400 | */ | 683 | */ |
| 401 | static void rcu_preempt_process_callbacks(void) | 684 | static void rcu_preempt_process_callbacks(void) |
| 402 | { | 685 | { |
| 403 | __rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb); | 686 | rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb); |
| 404 | } | 687 | } |
| 405 | 688 | ||
| 406 | /* | 689 | /* |
| @@ -417,6 +700,7 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | |||
| 417 | local_irq_save(flags); | 700 | local_irq_save(flags); |
| 418 | *rcu_preempt_ctrlblk.nexttail = head; | 701 | *rcu_preempt_ctrlblk.nexttail = head; |
| 419 | rcu_preempt_ctrlblk.nexttail = &head->next; | 702 | rcu_preempt_ctrlblk.nexttail = &head->next; |
| 703 | RCU_TRACE(rcu_preempt_ctrlblk.rcb.qlen++); | ||
| 420 | rcu_preempt_start_gp(); /* checks to see if GP needed. */ | 704 | rcu_preempt_start_gp(); /* checks to see if GP needed. */ |
| 421 | local_irq_restore(flags); | 705 | local_irq_restore(flags); |
| 422 | } | 706 | } |
| @@ -532,6 +816,7 @@ void synchronize_rcu_expedited(void) | |||
| 532 | 816 | ||
| 533 | /* Wait for tail of ->blkd_tasks list to drain. */ | 817 | /* Wait for tail of ->blkd_tasks list to drain. */ |
| 534 | if (rcu_preempted_readers_exp()) | 818 | if (rcu_preempted_readers_exp()) |
| 819 | rcu_initiate_expedited_boost(); | ||
| 535 | wait_event(sync_rcu_preempt_exp_wq, | 820 | wait_event(sync_rcu_preempt_exp_wq, |
| 536 | !rcu_preempted_readers_exp()); | 821 | !rcu_preempted_readers_exp()); |
| 537 | 822 | ||
| @@ -572,6 +857,27 @@ void exit_rcu(void) | |||
| 572 | 857 | ||
| 573 | #else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ | 858 | #else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ |
| 574 | 859 | ||
| 860 | #ifdef CONFIG_RCU_TRACE | ||
| 861 | |||
| 862 | /* | ||
| 863 | * Because preemptible RCU does not exist, it is not necessary to | ||
| 864 | * dump out its statistics. | ||
| 865 | */ | ||
| 866 | static void show_tiny_preempt_stats(struct seq_file *m) | ||
| 867 | { | ||
| 868 | } | ||
| 869 | |||
| 870 | #endif /* #ifdef CONFIG_RCU_TRACE */ | ||
| 871 | |||
| 872 | /* | ||
| 873 | * Because preemptible RCU does not exist, it is never necessary to | ||
| 874 | * boost preempted RCU readers. | ||
| 875 | */ | ||
| 876 | static int rcu_boost(void) | ||
| 877 | { | ||
| 878 | return 0; | ||
| 879 | } | ||
| 880 | |||
| 575 | /* | 881 | /* |
| 576 | * Because preemptible RCU does not exist, it never has any callbacks | 882 | * Because preemptible RCU does not exist, it never has any callbacks |
| 577 | * to check. | 883 | * to check. |
| @@ -599,17 +905,116 @@ static void rcu_preempt_process_callbacks(void) | |||
| 599 | #endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */ | 905 | #endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */ |
| 600 | 906 | ||
| 601 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 907 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
| 602 | |||
| 603 | #include <linux/kernel_stat.h> | 908 | #include <linux/kernel_stat.h> |
| 604 | 909 | ||
| 605 | /* | 910 | /* |
| 606 | * During boot, we forgive RCU lockdep issues. After this function is | 911 | * During boot, we forgive RCU lockdep issues. After this function is |
| 607 | * invoked, we start taking RCU lockdep issues seriously. | 912 | * invoked, we start taking RCU lockdep issues seriously. |
| 608 | */ | 913 | */ |
| 609 | void rcu_scheduler_starting(void) | 914 | void __init rcu_scheduler_starting(void) |
| 610 | { | 915 | { |
| 611 | WARN_ON(nr_context_switches() > 0); | 916 | WARN_ON(nr_context_switches() > 0); |
| 612 | rcu_scheduler_active = 1; | 917 | rcu_scheduler_active = 1; |
| 613 | } | 918 | } |
| 614 | 919 | ||
| 615 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | 920 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ |
| 921 | |||
| 922 | #ifdef CONFIG_RCU_BOOST | ||
| 923 | #define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO | ||
| 924 | #else /* #ifdef CONFIG_RCU_BOOST */ | ||
| 925 | #define RCU_BOOST_PRIO 1 | ||
| 926 | #endif /* #else #ifdef CONFIG_RCU_BOOST */ | ||
| 927 | |||
| 928 | #ifdef CONFIG_RCU_TRACE | ||
| 929 | |||
| 930 | #ifdef CONFIG_RCU_BOOST | ||
| 931 | |||
| 932 | static void rcu_initiate_boost_trace(void) | ||
| 933 | { | ||
| 934 | if (rcu_preempt_ctrlblk.gp_tasks == NULL) | ||
| 935 | rcu_preempt_ctrlblk.n_normal_balk_gp_tasks++; | ||
| 936 | else if (rcu_preempt_ctrlblk.boost_tasks != NULL) | ||
| 937 | rcu_preempt_ctrlblk.n_normal_balk_boost_tasks++; | ||
| 938 | else if (rcu_preempt_ctrlblk.boosted_this_gp != 0) | ||
| 939 | rcu_preempt_ctrlblk.n_normal_balk_boosted++; | ||
| 940 | else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) | ||
| 941 | rcu_preempt_ctrlblk.n_normal_balk_notyet++; | ||
| 942 | else | ||
| 943 | rcu_preempt_ctrlblk.n_normal_balk_nos++; | ||
| 944 | } | ||
| 945 | |||
| 946 | static void rcu_initiate_exp_boost_trace(void) | ||
| 947 | { | ||
| 948 | if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) | ||
| 949 | rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks++; | ||
| 950 | else | ||
| 951 | rcu_preempt_ctrlblk.n_exp_balk_nos++; | ||
| 952 | } | ||
| 953 | |||
| 954 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
| 955 | |||
| 956 | static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n) | ||
| 957 | { | ||
| 958 | unsigned long flags; | ||
| 959 | |||
| 960 | raw_local_irq_save(flags); | ||
| 961 | rcp->qlen -= n; | ||
| 962 | raw_local_irq_restore(flags); | ||
| 963 | } | ||
| 964 | |||
| 965 | /* | ||
| 966 | * Dump statistics for TINY_RCU, such as they are. | ||
| 967 | */ | ||
| 968 | static int show_tiny_stats(struct seq_file *m, void *unused) | ||
| 969 | { | ||
| 970 | show_tiny_preempt_stats(m); | ||
| 971 | seq_printf(m, "rcu_sched: qlen: %ld\n", rcu_sched_ctrlblk.qlen); | ||
| 972 | seq_printf(m, "rcu_bh: qlen: %ld\n", rcu_bh_ctrlblk.qlen); | ||
| 973 | return 0; | ||
| 974 | } | ||
| 975 | |||
| 976 | static int show_tiny_stats_open(struct inode *inode, struct file *file) | ||
| 977 | { | ||
| 978 | return single_open(file, show_tiny_stats, NULL); | ||
| 979 | } | ||
| 980 | |||
| 981 | static const struct file_operations show_tiny_stats_fops = { | ||
| 982 | .owner = THIS_MODULE, | ||
| 983 | .open = show_tiny_stats_open, | ||
| 984 | .read = seq_read, | ||
| 985 | .llseek = seq_lseek, | ||
| 986 | .release = single_release, | ||
| 987 | }; | ||
| 988 | |||
| 989 | static struct dentry *rcudir; | ||
| 990 | |||
| 991 | static int __init rcutiny_trace_init(void) | ||
| 992 | { | ||
| 993 | struct dentry *retval; | ||
| 994 | |||
| 995 | rcudir = debugfs_create_dir("rcu", NULL); | ||
| 996 | if (!rcudir) | ||
| 997 | goto free_out; | ||
| 998 | retval = debugfs_create_file("rcudata", 0444, rcudir, | ||
| 999 | NULL, &show_tiny_stats_fops); | ||
| 1000 | if (!retval) | ||
| 1001 | goto free_out; | ||
| 1002 | return 0; | ||
| 1003 | free_out: | ||
| 1004 | debugfs_remove_recursive(rcudir); | ||
| 1005 | return 1; | ||
| 1006 | } | ||
| 1007 | |||
| 1008 | static void __exit rcutiny_trace_cleanup(void) | ||
| 1009 | { | ||
| 1010 | debugfs_remove_recursive(rcudir); | ||
| 1011 | } | ||
| 1012 | |||
| 1013 | module_init(rcutiny_trace_init); | ||
| 1014 | module_exit(rcutiny_trace_cleanup); | ||
| 1015 | |||
| 1016 | MODULE_AUTHOR("Paul E. McKenney"); | ||
| 1017 | MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation"); | ||
| 1018 | MODULE_LICENSE("GPL"); | ||
| 1019 | |||
| 1020 | #endif /* #ifdef CONFIG_RCU_TRACE */ | ||
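A note on the ULONG_CMP_GE()/ULONG_CMP_LT() helpers used in the boost-timing checks above: they are wraparound-safe unsigned comparisons defined in rcupdate.h. Reproduced here from memory as a sketch; verify against the tree you are working with.

	#include <linux/kernel.h>	/* ULONG_MAX */

	/* True if a is at or past b, even if the counter has wrapped. */
	#define ULONG_CMP_GE(a, b)	(ULONG_MAX / 2 >= (a) - (b))
	/* True if a has not yet reached b, even if the counter has wrapped. */
	#define ULONG_CMP_LT(a, b)	(ULONG_MAX / 2 < (a) - (b))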
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 9d8e8fb2515..89613f97ff2 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c | |||
| @@ -47,6 +47,7 @@ | |||
| 47 | #include <linux/srcu.h> | 47 | #include <linux/srcu.h> |
| 48 | #include <linux/slab.h> | 48 | #include <linux/slab.h> |
| 49 | #include <asm/byteorder.h> | 49 | #include <asm/byteorder.h> |
| 50 | #include <linux/sched.h> | ||
| 50 | 51 | ||
| 51 | MODULE_LICENSE("GPL"); | 52 | MODULE_LICENSE("GPL"); |
| 52 | MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " | 53 | MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " |
| @@ -64,6 +65,9 @@ static int irqreader = 1; /* RCU readers from irq (timers). */ | |||
| 64 | static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */ | 65 | static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */ |
| 65 | static int fqs_holdoff = 0; /* Hold time within burst (us). */ | 66 | static int fqs_holdoff = 0; /* Hold time within burst (us). */ |
| 66 | static int fqs_stutter = 3; /* Wait time between bursts (s). */ | 67 | static int fqs_stutter = 3; /* Wait time between bursts (s). */ |
| 68 | static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */ | ||
| 69 | static int test_boost_interval = 7; /* Interval between boost tests, seconds. */ | ||
| 70 | static int test_boost_duration = 4; /* Duration of each boost test, seconds. */ | ||
| 67 | static char *torture_type = "rcu"; /* What RCU implementation to torture. */ | 71 | static char *torture_type = "rcu"; /* What RCU implementation to torture. */ |
| 68 | 72 | ||
| 69 | module_param(nreaders, int, 0444); | 73 | module_param(nreaders, int, 0444); |
| @@ -88,6 +92,12 @@ module_param(fqs_holdoff, int, 0444); | |||
| 88 | MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); | 92 | MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); |
| 89 | module_param(fqs_stutter, int, 0444); | 93 | module_param(fqs_stutter, int, 0444); |
| 90 | MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); | 94 | MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); |
| 95 | module_param(test_boost, int, 0444); | ||
| 96 | MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); | ||
| 97 | module_param(test_boost_interval, int, 0444); | ||
| 98 | MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds."); | ||
| 99 | module_param(test_boost_duration, int, 0444); | ||
| 100 | MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds."); | ||
| 91 | module_param(torture_type, charp, 0444); | 101 | module_param(torture_type, charp, 0444); |
| 92 | MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); | 102 | MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); |
| 93 | 103 | ||
| @@ -109,6 +119,7 @@ static struct task_struct *stats_task; | |||
| 109 | static struct task_struct *shuffler_task; | 119 | static struct task_struct *shuffler_task; |
| 110 | static struct task_struct *stutter_task; | 120 | static struct task_struct *stutter_task; |
| 111 | static struct task_struct *fqs_task; | 121 | static struct task_struct *fqs_task; |
| 122 | static struct task_struct *boost_tasks[NR_CPUS]; | ||
| 112 | 123 | ||
| 113 | #define RCU_TORTURE_PIPE_LEN 10 | 124 | #define RCU_TORTURE_PIPE_LEN 10 |
| 114 | 125 | ||
| @@ -134,6 +145,12 @@ static atomic_t n_rcu_torture_alloc_fail; | |||
| 134 | static atomic_t n_rcu_torture_free; | 145 | static atomic_t n_rcu_torture_free; |
| 135 | static atomic_t n_rcu_torture_mberror; | 146 | static atomic_t n_rcu_torture_mberror; |
| 136 | static atomic_t n_rcu_torture_error; | 147 | static atomic_t n_rcu_torture_error; |
| 148 | static long n_rcu_torture_boost_ktrerror; | ||
| 149 | static long n_rcu_torture_boost_rterror; | ||
| 150 | static long n_rcu_torture_boost_allocerror; | ||
| 151 | static long n_rcu_torture_boost_afferror; | ||
| 152 | static long n_rcu_torture_boost_failure; | ||
| 153 | static long n_rcu_torture_boosts; | ||
| 137 | static long n_rcu_torture_timers; | 154 | static long n_rcu_torture_timers; |
| 138 | static struct list_head rcu_torture_removed; | 155 | static struct list_head rcu_torture_removed; |
| 139 | static cpumask_var_t shuffle_tmp_mask; | 156 | static cpumask_var_t shuffle_tmp_mask; |
| @@ -147,6 +164,16 @@ static int stutter_pause_test; | |||
| 147 | #endif | 164 | #endif |
| 148 | int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; | 165 | int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; |
| 149 | 166 | ||
| 167 | #ifdef CONFIG_RCU_BOOST | ||
| 168 | #define rcu_can_boost() 1 | ||
| 169 | #else /* #ifdef CONFIG_RCU_BOOST */ | ||
| 170 | #define rcu_can_boost() 0 | ||
| 171 | #endif /* #else #ifdef CONFIG_RCU_BOOST */ | ||
| 172 | |||
| 173 | static unsigned long boost_starttime; /* jiffies of next boost test start. */ | ||
| 174 | DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ | ||
| 175 | /* and boost task create/destroy. */ | ||
| 176 | |||
| 150 | /* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ | 177 | /* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ |
| 151 | 178 | ||
| 152 | #define FULLSTOP_DONTSTOP 0 /* Normal operation. */ | 179 | #define FULLSTOP_DONTSTOP 0 /* Normal operation. */ |
| @@ -277,6 +304,7 @@ struct rcu_torture_ops { | |||
| 277 | void (*fqs)(void); | 304 | void (*fqs)(void); |
| 278 | int (*stats)(char *page); | 305 | int (*stats)(char *page); |
| 279 | int irq_capable; | 306 | int irq_capable; |
| 307 | int can_boost; | ||
| 280 | char *name; | 308 | char *name; |
| 281 | }; | 309 | }; |
| 282 | 310 | ||
| @@ -366,6 +394,7 @@ static struct rcu_torture_ops rcu_ops = { | |||
| 366 | .fqs = rcu_force_quiescent_state, | 394 | .fqs = rcu_force_quiescent_state, |
| 367 | .stats = NULL, | 395 | .stats = NULL, |
| 368 | .irq_capable = 1, | 396 | .irq_capable = 1, |
| 397 | .can_boost = rcu_can_boost(), | ||
| 369 | .name = "rcu" | 398 | .name = "rcu" |
| 370 | }; | 399 | }; |
| 371 | 400 | ||
| @@ -408,6 +437,7 @@ static struct rcu_torture_ops rcu_sync_ops = { | |||
| 408 | .fqs = rcu_force_quiescent_state, | 437 | .fqs = rcu_force_quiescent_state, |
| 409 | .stats = NULL, | 438 | .stats = NULL, |
| 410 | .irq_capable = 1, | 439 | .irq_capable = 1, |
| 440 | .can_boost = rcu_can_boost(), | ||
| 411 | .name = "rcu_sync" | 441 | .name = "rcu_sync" |
| 412 | }; | 442 | }; |
| 413 | 443 | ||
| @@ -424,6 +454,7 @@ static struct rcu_torture_ops rcu_expedited_ops = { | |||
| 424 | .fqs = rcu_force_quiescent_state, | 454 | .fqs = rcu_force_quiescent_state, |
| 425 | .stats = NULL, | 455 | .stats = NULL, |
| 426 | .irq_capable = 1, | 456 | .irq_capable = 1, |
| 457 | .can_boost = rcu_can_boost(), | ||
| 427 | .name = "rcu_expedited" | 458 | .name = "rcu_expedited" |
| 428 | }; | 459 | }; |
| 429 | 460 | ||
| @@ -684,6 +715,110 @@ static struct rcu_torture_ops sched_expedited_ops = { | |||
| 684 | }; | 715 | }; |
| 685 | 716 | ||
| 686 | /* | 717 | /* |
| 718 | * RCU torture priority-boost testing. Runs one real-time thread per | ||
| 719 | * CPU for moderate bursts, repeatedly registering RCU callbacks and | ||
| 720 | * spinning waiting for them to be invoked. If a given callback takes | ||
| 721 | * too long to be invoked, we assume that priority inversion has occurred. | ||
| 722 | */ | ||
| 723 | |||
| 724 | struct rcu_boost_inflight { | ||
| 725 | struct rcu_head rcu; | ||
| 726 | int inflight; | ||
| 727 | }; | ||
| 728 | |||
| 729 | static void rcu_torture_boost_cb(struct rcu_head *head) | ||
| 730 | { | ||
| 731 | struct rcu_boost_inflight *rbip = | ||
| 732 | container_of(head, struct rcu_boost_inflight, rcu); | ||
| 733 | |||
| 734 | smp_mb(); /* Ensure RCU-core accesses precede clearing ->inflight */ | ||
| 735 | rbip->inflight = 0; | ||
| 736 | } | ||
| 737 | |||
| 738 | static int rcu_torture_boost(void *arg) | ||
| 739 | { | ||
| 740 | unsigned long call_rcu_time; | ||
| 741 | unsigned long endtime; | ||
| 742 | unsigned long oldstarttime; | ||
| 743 | struct rcu_boost_inflight rbi = { .inflight = 0 }; | ||
| 744 | struct sched_param sp; | ||
| 745 | |||
| 746 | VERBOSE_PRINTK_STRING("rcu_torture_boost started"); | ||
| 747 | |||
| 748 | /* Set real-time priority. */ | ||
| 749 | sp.sched_priority = 1; | ||
| 750 | if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) { | ||
| 751 | VERBOSE_PRINTK_STRING("rcu_torture_boost RT prio failed!"); | ||
| 752 | n_rcu_torture_boost_rterror++; | ||
| 753 | } | ||
| 754 | |||
| 755 | /* Each pass through the following loop does one boost-test cycle. */ | ||
| 756 | do { | ||
| 757 | /* Wait for the next test interval. */ | ||
| 758 | oldstarttime = boost_starttime; | ||
| 759 | while (jiffies - oldstarttime > ULONG_MAX / 2) { | ||
| 760 | schedule_timeout_uninterruptible(1); | ||
| 761 | rcu_stutter_wait("rcu_torture_boost"); | ||
| 762 | if (kthread_should_stop() || | ||
| 763 | fullstop != FULLSTOP_DONTSTOP) | ||
| 764 | goto checkwait; | ||
| 765 | } | ||
| 766 | |||
| 767 | /* Do one boost-test interval. */ | ||
| 768 | endtime = oldstarttime + test_boost_duration * HZ; | ||
| 769 | call_rcu_time = jiffies; | ||
| 770 | while (jiffies - endtime > ULONG_MAX / 2) { | ||
| 771 | /* If we don't have a callback in flight, post one. */ | ||
| 772 | if (!rbi.inflight) { | ||
| 773 | smp_mb(); /* RCU core before ->inflight = 1. */ | ||
| 774 | rbi.inflight = 1; | ||
| 775 | call_rcu(&rbi.rcu, rcu_torture_boost_cb); | ||
| 776 | if (jiffies - call_rcu_time > | ||
| 777 | test_boost_duration * HZ - HZ / 2) { | ||
| 778 | VERBOSE_PRINTK_STRING("rcu_torture_boost boosting failed"); | ||
| 779 | n_rcu_torture_boost_failure++; | ||
| 780 | } | ||
| 781 | call_rcu_time = jiffies; | ||
| 782 | } | ||
| 783 | cond_resched(); | ||
| 784 | rcu_stutter_wait("rcu_torture_boost"); | ||
| 785 | if (kthread_should_stop() || | ||
| 786 | fullstop != FULLSTOP_DONTSTOP) | ||
| 787 | goto checkwait; | ||
| 788 | } | ||
| 789 | |||
| 790 | /* | ||
| 791 | * Set the start time of the next test interval. | ||
| 792 | * Yes, this is vulnerable to long delays, but such | ||
| 793 | * delays simply cause a false negative for the next | ||
| 794 | * interval. Besides, we are running at RT priority, | ||
| 795 | * so delays should be relatively rare. | ||
| 796 | */ | ||
| 797 | while (oldstarttime == boost_starttime) { | ||
| 798 | if (mutex_trylock(&boost_mutex)) { | ||
| 799 | boost_starttime = jiffies + | ||
| 800 | test_boost_interval * HZ; | ||
| 801 | n_rcu_torture_boosts++; | ||
| 802 | mutex_unlock(&boost_mutex); | ||
| 803 | break; | ||
| 804 | } | ||
| 805 | schedule_timeout_uninterruptible(1); | ||
| 806 | } | ||
| 807 | |||
| 808 | /* Go do the stutter. */ | ||
| 809 | checkwait: rcu_stutter_wait("rcu_torture_boost"); | ||
| 810 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); | ||
| 811 | |||
| 812 | /* Clean up and exit. */ | ||
| 813 | VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping"); | ||
| 814 | rcutorture_shutdown_absorb("rcu_torture_boost"); | ||
| 815 | while (!kthread_should_stop() || rbi.inflight) | ||
| 816 | schedule_timeout_uninterruptible(1); | ||
| 817 | smp_mb(); /* order accesses to ->inflight before stack-frame death. */ | ||
| 818 | return 0; | ||
| 819 | } | ||
| 820 | |||
| 821 | /* | ||
| 687 | * RCU torture force-quiescent-state kthread. Repeatedly induces | 822 | * RCU torture force-quiescent-state kthread. Repeatedly induces |
| 688 | * bursts of calls to force_quiescent_state(), increasing the probability | 823 | * bursts of calls to force_quiescent_state(), increasing the probability |
| 689 | * of occurrence of some important types of race conditions. | 824 | * of occurrence of some important types of race conditions. |
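The loop conditions in rcu_torture_boost() above, such as "while (jiffies - oldstarttime > ULONG_MAX / 2)", spell out the same wraparound-safe idiom by hand: for targets less than ULONG_MAX / 2 ticks away, the unsigned subtraction exceeds ULONG_MAX / 2 exactly when jiffies has not yet reached the target. A minimal illustration, with a made-up helper name:

	#include <linux/jiffies.h>
	#include <linux/kernel.h>

	/* Return nonzero while the deadline still lies in the future. */
	static inline int demo_not_yet(unsigned long deadline)
	{
		return jiffies - deadline > ULONG_MAX / 2;
	}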
| @@ -933,7 +1068,8 @@ rcu_torture_printk(char *page) | |||
| 933 | cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); | 1068 | cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); |
| 934 | cnt += sprintf(&page[cnt], | 1069 | cnt += sprintf(&page[cnt], |
| 935 | "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " | 1070 | "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " |
| 936 | "rtmbe: %d nt: %ld", | 1071 | "rtmbe: %d rtbke: %ld rtbre: %ld rtbae: %ld rtbafe: %ld " |
| 1072 | "rtbf: %ld rtb: %ld nt: %ld", | ||
| 937 | rcu_torture_current, | 1073 | rcu_torture_current, |
| 938 | rcu_torture_current_version, | 1074 | rcu_torture_current_version, |
| 939 | list_empty(&rcu_torture_freelist), | 1075 | list_empty(&rcu_torture_freelist), |
| @@ -941,8 +1077,19 @@ rcu_torture_printk(char *page) | |||
| 941 | atomic_read(&n_rcu_torture_alloc_fail), | 1077 | atomic_read(&n_rcu_torture_alloc_fail), |
| 942 | atomic_read(&n_rcu_torture_free), | 1078 | atomic_read(&n_rcu_torture_free), |
| 943 | atomic_read(&n_rcu_torture_mberror), | 1079 | atomic_read(&n_rcu_torture_mberror), |
| 1080 | n_rcu_torture_boost_ktrerror, | ||
| 1081 | n_rcu_torture_boost_rterror, | ||
| 1082 | n_rcu_torture_boost_allocerror, | ||
| 1083 | n_rcu_torture_boost_afferror, | ||
| 1084 | n_rcu_torture_boost_failure, | ||
| 1085 | n_rcu_torture_boosts, | ||
| 944 | n_rcu_torture_timers); | 1086 | n_rcu_torture_timers); |
| 945 | if (atomic_read(&n_rcu_torture_mberror) != 0) | 1087 | if (atomic_read(&n_rcu_torture_mberror) != 0 || |
| 1088 | n_rcu_torture_boost_ktrerror != 0 || | ||
| 1089 | n_rcu_torture_boost_rterror != 0 || | ||
| 1090 | n_rcu_torture_boost_allocerror != 0 || | ||
| 1091 | n_rcu_torture_boost_afferror != 0 || | ||
| 1092 | n_rcu_torture_boost_failure != 0) | ||
| 946 | cnt += sprintf(&page[cnt], " !!!"); | 1093 | cnt += sprintf(&page[cnt], " !!!"); |
| 947 | cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); | 1094 | cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); |
| 948 | if (i > 1) { | 1095 | if (i > 1) { |
| @@ -1094,22 +1241,91 @@ rcu_torture_stutter(void *arg) | |||
| 1094 | } | 1241 | } |
| 1095 | 1242 | ||
| 1096 | static inline void | 1243 | static inline void |
| 1097 | rcu_torture_print_module_parms(char *tag) | 1244 | rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag) |
| 1098 | { | 1245 | { |
| 1099 | printk(KERN_ALERT "%s" TORTURE_FLAG | 1246 | printk(KERN_ALERT "%s" TORTURE_FLAG |
| 1100 | "--- %s: nreaders=%d nfakewriters=%d " | 1247 | "--- %s: nreaders=%d nfakewriters=%d " |
| 1101 | "stat_interval=%d verbose=%d test_no_idle_hz=%d " | 1248 | "stat_interval=%d verbose=%d test_no_idle_hz=%d " |
| 1102 | "shuffle_interval=%d stutter=%d irqreader=%d " | 1249 | "shuffle_interval=%d stutter=%d irqreader=%d " |
| 1103 | "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d\n", | 1250 | "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " |
| 1251 | "test_boost=%d/%d test_boost_interval=%d " | ||
| 1252 | "test_boost_duration=%d\n", | ||
| 1104 | torture_type, tag, nrealreaders, nfakewriters, | 1253 | torture_type, tag, nrealreaders, nfakewriters, |
| 1105 | stat_interval, verbose, test_no_idle_hz, shuffle_interval, | 1254 | stat_interval, verbose, test_no_idle_hz, shuffle_interval, |
| 1106 | stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter); | 1255 | stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, |
| 1256 | test_boost, cur_ops->can_boost, | ||
| 1257 | test_boost_interval, test_boost_duration); | ||
| 1107 | } | 1258 | } |
| 1108 | 1259 | ||
| 1109 | static struct notifier_block rcutorture_nb = { | 1260 | static struct notifier_block rcutorture_shutdown_nb = { |
| 1110 | .notifier_call = rcutorture_shutdown_notify, | 1261 | .notifier_call = rcutorture_shutdown_notify, |
| 1111 | }; | 1262 | }; |
| 1112 | 1263 | ||
| 1264 | static void rcutorture_booster_cleanup(int cpu) | ||
| 1265 | { | ||
| 1266 | struct task_struct *t; | ||
| 1267 | |||
| 1268 | if (boost_tasks[cpu] == NULL) | ||
| 1269 | return; | ||
| 1270 | mutex_lock(&boost_mutex); | ||
| 1271 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_boost task"); | ||
| 1272 | t = boost_tasks[cpu]; | ||
| 1273 | boost_tasks[cpu] = NULL; | ||
| 1274 | mutex_unlock(&boost_mutex); | ||
| 1275 | |||
| 1276 | /* This must be outside of the mutex, otherwise deadlock! */ | ||
| 1277 | kthread_stop(t); | ||
| 1278 | } | ||
| 1279 | |||
| 1280 | static int rcutorture_booster_init(int cpu) | ||
| 1281 | { | ||
| 1282 | int retval; | ||
| 1283 | |||
| 1284 | if (boost_tasks[cpu] != NULL) | ||
| 1285 | return 0; /* Already created, nothing more to do. */ | ||
| 1286 | |||
| 1287 | /* Don't allow time recalculation while creating a new task. */ | ||
| 1288 | mutex_lock(&boost_mutex); | ||
| 1289 | VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task"); | ||
| 1290 | boost_tasks[cpu] = kthread_create(rcu_torture_boost, NULL, | ||
| 1291 | "rcu_torture_boost"); | ||
| 1292 | if (IS_ERR(boost_tasks[cpu])) { | ||
| 1293 | retval = PTR_ERR(boost_tasks[cpu]); | ||
| 1294 | VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed"); | ||
| 1295 | n_rcu_torture_boost_ktrerror++; | ||
| 1296 | boost_tasks[cpu] = NULL; | ||
| 1297 | mutex_unlock(&boost_mutex); | ||
| 1298 | return retval; | ||
| 1299 | } | ||
| 1300 | kthread_bind(boost_tasks[cpu], cpu); | ||
| 1301 | wake_up_process(boost_tasks[cpu]); | ||
| 1302 | mutex_unlock(&boost_mutex); | ||
| 1303 | return 0; | ||
| 1304 | } | ||
| 1305 | |||
| 1306 | static int rcutorture_cpu_notify(struct notifier_block *self, | ||
| 1307 | unsigned long action, void *hcpu) | ||
| 1308 | { | ||
| 1309 | long cpu = (long)hcpu; | ||
| 1310 | |||
| 1311 | switch (action) { | ||
| 1312 | case CPU_ONLINE: | ||
| 1313 | case CPU_DOWN_FAILED: | ||
| 1314 | (void)rcutorture_booster_init(cpu); | ||
| 1315 | break; | ||
| 1316 | case CPU_DOWN_PREPARE: | ||
| 1317 | rcutorture_booster_cleanup(cpu); | ||
| 1318 | break; | ||
| 1319 | default: | ||
| 1320 | break; | ||
| 1321 | } | ||
| 1322 | return NOTIFY_OK; | ||
| 1323 | } | ||
| 1324 | |||
| 1325 | static struct notifier_block rcutorture_cpu_nb = { | ||
| 1326 | .notifier_call = rcutorture_cpu_notify, | ||
| 1327 | }; | ||
| 1328 | |||
| 1113 | static void | 1329 | static void |
| 1114 | rcu_torture_cleanup(void) | 1330 | rcu_torture_cleanup(void) |
| 1115 | { | 1331 | { |
| @@ -1127,7 +1343,7 @@ rcu_torture_cleanup(void) | |||
| 1127 | } | 1343 | } |
| 1128 | fullstop = FULLSTOP_RMMOD; | 1344 | fullstop = FULLSTOP_RMMOD; |
| 1129 | mutex_unlock(&fullstop_mutex); | 1345 | mutex_unlock(&fullstop_mutex); |
| 1130 | unregister_reboot_notifier(&rcutorture_nb); | 1346 | unregister_reboot_notifier(&rcutorture_shutdown_nb); |
| 1131 | if (stutter_task) { | 1347 | if (stutter_task) { |
| 1132 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); | 1348 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); |
| 1133 | kthread_stop(stutter_task); | 1349 | kthread_stop(stutter_task); |
| @@ -1184,6 +1400,12 @@ rcu_torture_cleanup(void) | |||
| 1184 | kthread_stop(fqs_task); | 1400 | kthread_stop(fqs_task); |
| 1185 | } | 1401 | } |
| 1186 | fqs_task = NULL; | 1402 | fqs_task = NULL; |
| 1403 | if ((test_boost == 1 && cur_ops->can_boost) || | ||
| 1404 | test_boost == 2) { | ||
| 1405 | unregister_cpu_notifier(&rcutorture_cpu_nb); | ||
| 1406 | for_each_possible_cpu(i) | ||
| 1407 | rcutorture_booster_cleanup(i); | ||
| 1408 | } | ||
| 1187 | 1409 | ||
| 1188 | /* Wait for all RCU callbacks to fire. */ | 1410 | /* Wait for all RCU callbacks to fire. */ |
| 1189 | 1411 | ||
| @@ -1195,9 +1417,9 @@ rcu_torture_cleanup(void) | |||
| 1195 | if (cur_ops->cleanup) | 1417 | if (cur_ops->cleanup) |
| 1196 | cur_ops->cleanup(); | 1418 | cur_ops->cleanup(); |
| 1197 | if (atomic_read(&n_rcu_torture_error)) | 1419 | if (atomic_read(&n_rcu_torture_error)) |
| 1198 | rcu_torture_print_module_parms("End of test: FAILURE"); | 1420 | rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); |
| 1199 | else | 1421 | else |
| 1200 | rcu_torture_print_module_parms("End of test: SUCCESS"); | 1422 | rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); |
| 1201 | } | 1423 | } |
| 1202 | 1424 | ||
| 1203 | static int __init | 1425 | static int __init |
| @@ -1242,7 +1464,7 @@ rcu_torture_init(void) | |||
| 1242 | nrealreaders = nreaders; | 1464 | nrealreaders = nreaders; |
| 1243 | else | 1465 | else |
| 1244 | nrealreaders = 2 * num_online_cpus(); | 1466 | nrealreaders = 2 * num_online_cpus(); |
| 1245 | rcu_torture_print_module_parms("Start of test"); | 1467 | rcu_torture_print_module_parms(cur_ops, "Start of test"); |
| 1246 | fullstop = FULLSTOP_DONTSTOP; | 1468 | fullstop = FULLSTOP_DONTSTOP; |
| 1247 | 1469 | ||
| 1248 | /* Set up the freelist. */ | 1470 | /* Set up the freelist. */ |
| @@ -1263,6 +1485,12 @@ rcu_torture_init(void) | |||
| 1263 | atomic_set(&n_rcu_torture_free, 0); | 1485 | atomic_set(&n_rcu_torture_free, 0); |
| 1264 | atomic_set(&n_rcu_torture_mberror, 0); | 1486 | atomic_set(&n_rcu_torture_mberror, 0); |
| 1265 | atomic_set(&n_rcu_torture_error, 0); | 1487 | atomic_set(&n_rcu_torture_error, 0); |
| 1488 | n_rcu_torture_boost_ktrerror = 0; | ||
| 1489 | n_rcu_torture_boost_rterror = 0; | ||
| 1490 | n_rcu_torture_boost_allocerror = 0; | ||
| 1491 | n_rcu_torture_boost_afferror = 0; | ||
| 1492 | n_rcu_torture_boost_failure = 0; | ||
| 1493 | n_rcu_torture_boosts = 0; | ||
| 1266 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) | 1494 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) |
| 1267 | atomic_set(&rcu_torture_wcount[i], 0); | 1495 | atomic_set(&rcu_torture_wcount[i], 0); |
| 1268 | for_each_possible_cpu(cpu) { | 1496 | for_each_possible_cpu(cpu) { |
| @@ -1376,7 +1604,27 @@ rcu_torture_init(void) | |||
| 1376 | goto unwind; | 1604 | goto unwind; |
| 1377 | } | 1605 | } |
| 1378 | } | 1606 | } |
| 1379 | register_reboot_notifier(&rcutorture_nb); | 1607 | if (test_boost_interval < 1) |
| 1608 | test_boost_interval = 1; | ||
| 1609 | if (test_boost_duration < 2) | ||
| 1610 | test_boost_duration = 2; | ||
| 1611 | if ((test_boost == 1 && cur_ops->can_boost) || | ||
| 1612 | test_boost == 2) { | ||
| 1613 | int retval; | ||
| 1614 | |||
| 1615 | boost_starttime = jiffies + test_boost_interval * HZ; | ||
| 1616 | register_cpu_notifier(&rcutorture_cpu_nb); | ||
| 1617 | for_each_possible_cpu(i) { | ||
| 1618 | if (cpu_is_offline(i)) | ||
| 1619 | continue; /* Heuristic: CPU can go offline. */ | ||
| 1620 | retval = rcutorture_booster_init(i); | ||
| 1621 | if (retval < 0) { | ||
| 1622 | firsterr = retval; | ||
| 1623 | goto unwind; | ||
| 1624 | } | ||
| 1625 | } | ||
| 1626 | } | ||
| 1627 | register_reboot_notifier(&rcutorture_shutdown_nb); | ||
| 1380 | mutex_unlock(&fullstop_mutex); | 1628 | mutex_unlock(&fullstop_mutex); |
| 1381 | return 0; | 1629 | return 0; |
| 1382 | 1630 | ||
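The new test_boost, test_boost_interval, and test_boost_duration knobs above follow the usual read-only module-parameter pattern; a minimal sketch with a made-up parameter name:

	#include <linux/module.h>
	#include <linux/moduleparam.h>

	/* Sketch of a read-only module parameter. */
	static int demo_level = 1;		/* 0=off, 1=auto, 2=force */
	module_param(demo_level, int, 0444);	/* readable in sysfs, not writable */
	MODULE_PARM_DESC(demo_level, "Demo knob: 0=off, 1=auto, 2=force.");
	MODULE_LICENSE("GPL");

Such a parameter is set at load time (for example, modprobe rcutorture test_boost=2) or, when rcutorture is built in, as rcutorture.test_boost=2 on the kernel command line.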
diff --git a/kernel/rcutree.c b/kernel/rcutree.c index ccdc04c4798..dd4aea806f8 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c | |||
| @@ -67,9 +67,6 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; | |||
| 67 | .gpnum = -300, \ | 67 | .gpnum = -300, \ |
| 68 | .completed = -300, \ | 68 | .completed = -300, \ |
| 69 | .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \ | 69 | .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \ |
| 70 | .orphan_cbs_list = NULL, \ | ||
| 71 | .orphan_cbs_tail = &structname.orphan_cbs_list, \ | ||
| 72 | .orphan_qlen = 0, \ | ||
| 73 | .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \ | 70 | .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \ |
| 74 | .n_force_qs = 0, \ | 71 | .n_force_qs = 0, \ |
| 75 | .n_force_qs_ngp = 0, \ | 72 | .n_force_qs_ngp = 0, \ |
| @@ -367,8 +364,8 @@ void rcu_irq_exit(void) | |||
| 367 | WARN_ON_ONCE(rdtp->dynticks & 0x1); | 364 | WARN_ON_ONCE(rdtp->dynticks & 0x1); |
| 368 | 365 | ||
| 369 | /* If the interrupt queued a callback, get out of dyntick mode. */ | 366 | /* If the interrupt queued a callback, get out of dyntick mode. */ |
| 370 | if (__get_cpu_var(rcu_sched_data).nxtlist || | 367 | if (__this_cpu_read(rcu_sched_data.nxtlist) || |
| 371 | __get_cpu_var(rcu_bh_data).nxtlist) | 368 | __this_cpu_read(rcu_bh_data.nxtlist)) |
| 372 | set_need_resched(); | 369 | set_need_resched(); |
| 373 | } | 370 | } |
| 374 | 371 | ||
| @@ -620,9 +617,17 @@ static void __init check_cpu_stall_init(void) | |||
| 620 | static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) | 617 | static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) |
| 621 | { | 618 | { |
| 622 | if (rdp->gpnum != rnp->gpnum) { | 619 | if (rdp->gpnum != rnp->gpnum) { |
| 623 | rdp->qs_pending = 1; | 620 | /* |
| 624 | rdp->passed_quiesc = 0; | 621 | * If the current grace period is waiting for this CPU, |
| 622 | * set up to detect a quiescent state, otherwise don't | ||
| 623 | * go looking for one. | ||
| 624 | */ | ||
| 625 | rdp->gpnum = rnp->gpnum; | 625 | rdp->gpnum = rnp->gpnum; |
| 626 | if (rnp->qsmask & rdp->grpmask) { | ||
| 627 | rdp->qs_pending = 1; | ||
| 628 | rdp->passed_quiesc = 0; | ||
| 629 | } else | ||
| 630 | rdp->qs_pending = 0; | ||
| 626 | } | 631 | } |
| 627 | } | 632 | } |
| 628 | 633 | ||
| @@ -681,6 +686,24 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat | |||
| 681 | 686 | ||
| 682 | /* Remember that we saw this grace-period completion. */ | 687 | /* Remember that we saw this grace-period completion. */ |
| 683 | rdp->completed = rnp->completed; | 688 | rdp->completed = rnp->completed; |
| 689 | |||
| 690 | /* | ||
| 691 | * If we were in an extended quiescent state, we may have | ||
| 692 | * missed some grace periods that other CPUs handled on | ||
| 693 | * our behalf. Catch up with this state to avoid noting | ||
| 694 | * spurious new grace periods. If another grace period | ||
| 695 | * has started, then rnp->gpnum will have advanced, so | ||
| 696 | * we will detect this later on. | ||
| 697 | */ | ||
| 698 | if (ULONG_CMP_LT(rdp->gpnum, rdp->completed)) | ||
| 699 | rdp->gpnum = rdp->completed; | ||
| 700 | |||
| 701 | /* | ||
| 702 | * If RCU does not need a quiescent state from this CPU, | ||
| 703 | * then make sure that this CPU doesn't go looking for one. | ||
| 704 | */ | ||
| 705 | if ((rnp->qsmask & rdp->grpmask) == 0) | ||
| 706 | rdp->qs_pending = 0; | ||
| 684 | } | 707 | } |
| 685 | } | 708 | } |
| 686 | 709 | ||
| @@ -984,53 +1007,31 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 984 | #ifdef CONFIG_HOTPLUG_CPU | 1007 | #ifdef CONFIG_HOTPLUG_CPU |
| 985 | 1008 | ||
| 986 | /* | 1009 | /* |
| 987 | * Move a dying CPU's RCU callbacks to the ->orphan_cbs_list for the | 1010 | * Move a dying CPU's RCU callbacks to online CPU's callback list. |
| 988 | * specified flavor of RCU. The callbacks will be adopted by the next | 1011 | * Synchronization is not required because this function executes |
| 989 | * _rcu_barrier() invocation or by the CPU_DEAD notifier, whichever | 1012 | * in stop_machine() context. |
| 990 | * comes first. Because this is invoked from the CPU_DYING notifier, | ||
| 991 | * irqs are already disabled. | ||
| 992 | */ | 1013 | */ |
| 993 | static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) | 1014 | static void rcu_send_cbs_to_online(struct rcu_state *rsp) |
| 994 | { | 1015 | { |
| 995 | int i; | 1016 | int i; |
| 1017 | /* current DYING CPU is cleared in the cpu_online_mask */ | ||
| 1018 | int receive_cpu = cpumask_any(cpu_online_mask); | ||
| 996 | struct rcu_data *rdp = this_cpu_ptr(rsp->rda); | 1019 | struct rcu_data *rdp = this_cpu_ptr(rsp->rda); |
| 1020 | struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu); | ||
| 997 | 1021 | ||
| 998 | if (rdp->nxtlist == NULL) | 1022 | if (rdp->nxtlist == NULL) |
| 999 | return; /* irqs disabled, so comparison is stable. */ | 1023 | return; /* irqs disabled, so comparison is stable. */ |
| 1000 | raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ | 1024 | |
| 1001 | *rsp->orphan_cbs_tail = rdp->nxtlist; | 1025 | *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist; |
| 1002 | rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL]; | 1026 | receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; |
| 1027 | receive_rdp->qlen += rdp->qlen; | ||
| 1028 | receive_rdp->n_cbs_adopted += rdp->qlen; | ||
| 1029 | rdp->n_cbs_orphaned += rdp->qlen; | ||
| 1030 | |||
| 1003 | rdp->nxtlist = NULL; | 1031 | rdp->nxtlist = NULL; |
| 1004 | for (i = 0; i < RCU_NEXT_SIZE; i++) | 1032 | for (i = 0; i < RCU_NEXT_SIZE; i++) |
| 1005 | rdp->nxttail[i] = &rdp->nxtlist; | 1033 | rdp->nxttail[i] = &rdp->nxtlist; |
| 1006 | rsp->orphan_qlen += rdp->qlen; | ||
| 1007 | rdp->n_cbs_orphaned += rdp->qlen; | ||
| 1008 | rdp->qlen = 0; | 1034 | rdp->qlen = 0; |
| 1009 | raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ | ||
| 1010 | } | ||
| 1011 | |||
| 1012 | /* | ||
| 1013 | * Adopt previously orphaned RCU callbacks. | ||
| 1014 | */ | ||
| 1015 | static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) | ||
| 1016 | { | ||
| 1017 | unsigned long flags; | ||
| 1018 | struct rcu_data *rdp; | ||
| 1019 | |||
| 1020 | raw_spin_lock_irqsave(&rsp->onofflock, flags); | ||
| 1021 | rdp = this_cpu_ptr(rsp->rda); | ||
| 1022 | if (rsp->orphan_cbs_list == NULL) { | ||
| 1023 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); | ||
| 1024 | return; | ||
| 1025 | } | ||
| 1026 | *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list; | ||
| 1027 | rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail; | ||
| 1028 | rdp->qlen += rsp->orphan_qlen; | ||
| 1029 | rdp->n_cbs_adopted += rsp->orphan_qlen; | ||
| 1030 | rsp->orphan_cbs_list = NULL; | ||
| 1031 | rsp->orphan_cbs_tail = &rsp->orphan_cbs_list; | ||
| 1032 | rsp->orphan_qlen = 0; | ||
| 1033 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); | ||
| 1034 | } | 1035 | } |
| 1035 | 1036 | ||
| 1036 | /* | 1037 | /* |
| @@ -1081,8 +1082,6 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) | |||
| 1081 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 1082 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 1082 | if (need_report & RCU_OFL_TASKS_EXP_GP) | 1083 | if (need_report & RCU_OFL_TASKS_EXP_GP) |
| 1083 | rcu_report_exp_rnp(rsp, rnp); | 1084 | rcu_report_exp_rnp(rsp, rnp); |
| 1084 | |||
| 1085 | rcu_adopt_orphan_cbs(rsp); | ||
| 1086 | } | 1085 | } |
| 1087 | 1086 | ||
| 1088 | /* | 1087 | /* |
| @@ -1100,11 +1099,7 @@ static void rcu_offline_cpu(int cpu) | |||
| 1100 | 1099 | ||
| 1101 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ | 1100 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ |
| 1102 | 1101 | ||
| 1103 | static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) | 1102 | static void rcu_send_cbs_to_online(struct rcu_state *rsp) |
| 1104 | { | ||
| 1105 | } | ||
| 1106 | |||
| 1107 | static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) | ||
| 1108 | { | 1103 | { |
| 1109 | } | 1104 | } |
| 1110 | 1105 | ||
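The rcu_send_cbs_to_online() change above relies on each callback list keeping a tail pointer that is the address of the last ->next field, which lets a whole singly linked list be appended in O(1). A simplified standalone sketch of that splice; the types and names here are invented for illustration only.

	#include <stddef.h>

	struct demo_cb {
		struct demo_cb *next;
	};

	struct demo_cblist {
		struct demo_cb *head;
		struct demo_cb **tail;	/* address of last ->next, or of head */
	};

	/* Append everything on src to dst and leave src empty. */
	static void demo_splice(struct demo_cblist *dst, struct demo_cblist *src)
	{
		if (src->head == NULL)
			return;			/* nothing to move */
		*dst->tail = src->head;		/* hook source onto destination */
		dst->tail = src->tail;		/* destination's new end */
		src->head = NULL;		/* reset source to empty */
		src->tail = &src->head;
	}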
| @@ -1440,22 +1435,11 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | |||
| 1440 | */ | 1435 | */ |
| 1441 | local_irq_save(flags); | 1436 | local_irq_save(flags); |
| 1442 | rdp = this_cpu_ptr(rsp->rda); | 1437 | rdp = this_cpu_ptr(rsp->rda); |
| 1443 | rcu_process_gp_end(rsp, rdp); | ||
| 1444 | check_for_new_grace_period(rsp, rdp); | ||
| 1445 | 1438 | ||
| 1446 | /* Add the callback to our list. */ | 1439 | /* Add the callback to our list. */ |
| 1447 | *rdp->nxttail[RCU_NEXT_TAIL] = head; | 1440 | *rdp->nxttail[RCU_NEXT_TAIL] = head; |
| 1448 | rdp->nxttail[RCU_NEXT_TAIL] = &head->next; | 1441 | rdp->nxttail[RCU_NEXT_TAIL] = &head->next; |
| 1449 | 1442 | ||
| 1450 | /* Start a new grace period if one not already started. */ | ||
| 1451 | if (!rcu_gp_in_progress(rsp)) { | ||
| 1452 | unsigned long nestflag; | ||
| 1453 | struct rcu_node *rnp_root = rcu_get_root(rsp); | ||
| 1454 | |||
| 1455 | raw_spin_lock_irqsave(&rnp_root->lock, nestflag); | ||
| 1456 | rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */ | ||
| 1457 | } | ||
| 1458 | |||
| 1459 | /* | 1443 | /* |
| 1460 | * Force the grace period if too many callbacks or too long waiting. | 1444 | * Force the grace period if too many callbacks or too long waiting. |
| 1461 | * Enforce hysteresis, and don't invoke force_quiescent_state() | 1445 | * Enforce hysteresis, and don't invoke force_quiescent_state() |
| @@ -1464,12 +1448,27 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | |||
| 1464 | * is the only one waiting for a grace period to complete. | 1448 | * is the only one waiting for a grace period to complete. |
| 1465 | */ | 1449 | */ |
| 1466 | if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { | 1450 | if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { |
| 1467 | rdp->blimit = LONG_MAX; | 1451 | |
| 1468 | if (rsp->n_force_qs == rdp->n_force_qs_snap && | 1452 | /* Are we ignoring a completed grace period? */ |
| 1469 | *rdp->nxttail[RCU_DONE_TAIL] != head) | 1453 | rcu_process_gp_end(rsp, rdp); |
| 1470 | force_quiescent_state(rsp, 0); | 1454 | check_for_new_grace_period(rsp, rdp); |
| 1471 | rdp->n_force_qs_snap = rsp->n_force_qs; | 1455 | |
| 1472 | rdp->qlen_last_fqs_check = rdp->qlen; | 1456 | /* Start a new grace period if one not already started. */ |
| 1457 | if (!rcu_gp_in_progress(rsp)) { | ||
| 1458 | unsigned long nestflag; | ||
| 1459 | struct rcu_node *rnp_root = rcu_get_root(rsp); | ||
| 1460 | |||
| 1461 | raw_spin_lock_irqsave(&rnp_root->lock, nestflag); | ||
| 1462 | rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock */ | ||
| 1463 | } else { | ||
| 1464 | /* Give the grace period a kick. */ | ||
| 1465 | rdp->blimit = LONG_MAX; | ||
| 1466 | if (rsp->n_force_qs == rdp->n_force_qs_snap && | ||
| 1467 | *rdp->nxttail[RCU_DONE_TAIL] != head) | ||
| 1468 | force_quiescent_state(rsp, 0); | ||
| 1469 | rdp->n_force_qs_snap = rsp->n_force_qs; | ||
| 1470 | rdp->qlen_last_fqs_check = rdp->qlen; | ||
| 1471 | } | ||
| 1473 | } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) | 1472 | } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) |
| 1474 | force_quiescent_state(rsp, 1); | 1473 | force_quiescent_state(rsp, 1); |
| 1475 | local_irq_restore(flags); | 1474 | local_irq_restore(flags); |
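Editor's note: for context on what __call_rcu() is queueing, callers embed an rcu_head in their own structure and pass a reclaim function; the head is appended at RCU_NEXT_TAIL as shown above and invoked once a grace period has elapsed. A typical caller, sketched (struct foo and its fields are illustrative):

        #include <linux/rcupdate.h>
        #include <linux/slab.h>

        struct foo {
                int data;
                struct rcu_head rcu;            /* embedded callback handle */
        };

        static void foo_reclaim(struct rcu_head *head)
        {
                /* runs after a full grace period; no reader can still see it */
                kfree(container_of(head, struct foo, rcu));
        }

        static void foo_retire(struct foo *old)
        {
                /* 'old' must already be unlinked from all RCU-visible paths */
                call_rcu(&old->rcu, foo_reclaim);
        }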
| @@ -1699,13 +1698,12 @@ static void _rcu_barrier(struct rcu_state *rsp, | |||
| 1699 | * decrement rcu_barrier_cpu_count -- otherwise the first CPU | 1698 | * decrement rcu_barrier_cpu_count -- otherwise the first CPU |
| 1700 | * might complete its grace period before all of the other CPUs | 1699 | * might complete its grace period before all of the other CPUs |
| 1701 | * did their increment, causing this function to return too | 1700 | * did their increment, causing this function to return too |
| 1702 | * early. | 1701 | * early. Note that on_each_cpu() disables irqs, which prevents |
| 1702 | * any CPUs from coming online or going offline until each online | ||
| 1703 | * CPU has queued its RCU-barrier callback. | ||
| 1703 | */ | 1704 | */ |
| 1704 | atomic_set(&rcu_barrier_cpu_count, 1); | 1705 | atomic_set(&rcu_barrier_cpu_count, 1); |
| 1705 | preempt_disable(); /* stop CPU_DYING from filling orphan_cbs_list */ | ||
| 1706 | rcu_adopt_orphan_cbs(rsp); | ||
| 1707 | on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1); | 1706 | on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1); |
| 1708 | preempt_enable(); /* CPU_DYING can again fill orphan_cbs_list */ | ||
| 1709 | if (atomic_dec_and_test(&rcu_barrier_cpu_count)) | 1707 | if (atomic_dec_and_test(&rcu_barrier_cpu_count)) |
| 1710 | complete(&rcu_barrier_completion); | 1708 | complete(&rcu_barrier_completion); |
| 1711 | wait_for_completion(&rcu_barrier_completion); | 1709 | wait_for_completion(&rcu_barrier_completion); |
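Editor's note: the preempt_disable()/rcu_adopt_orphan_cbs() bracket disappears because callbacks can no longer sit in a CPU-less orphanage; on_each_cpu() running with irqs disabled is now enough to guarantee every online CPU queues its barrier callback before any hotplug transition completes. The barrier callback itself is just a counter decrement, roughly:

        /* Roughly what rcu_barrier_func() queues on each online CPU. */
        static void rcu_barrier_callback(struct rcu_head *notused)
        {
                if (atomic_dec_and_test(&rcu_barrier_cpu_count))
                        complete(&rcu_barrier_completion);
        }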
| @@ -1831,18 +1829,13 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, | |||
| 1831 | case CPU_DYING: | 1829 | case CPU_DYING: |
| 1832 | case CPU_DYING_FROZEN: | 1830 | case CPU_DYING_FROZEN: |
| 1833 | /* | 1831 | /* |
| 1834 | * preempt_disable() in _rcu_barrier() prevents stop_machine(), | 1832 | * The whole machine is "stopped" except this CPU, so we can |
| 1835 | * so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);" | 1833 | * touch any data without introducing corruption. We send the |
| 1836 | * returns, all online cpus have queued rcu_barrier_func(). | 1834 | * dying CPU's callbacks to an arbitrarily chosen online CPU. |
| 1837 | * The dying CPU clears its cpu_online_mask bit and | ||
| 1838 | * moves all of its RCU callbacks to ->orphan_cbs_list | ||
| 1839 | * in the context of stop_machine(), so subsequent calls | ||
| 1840 | * to _rcu_barrier() will adopt these callbacks and only | ||
| 1841 | * then queue rcu_barrier_func() on all remaining CPUs. | ||
| 1842 | */ | 1835 | */ |
| 1843 | rcu_send_cbs_to_orphanage(&rcu_bh_state); | 1836 | rcu_send_cbs_to_online(&rcu_bh_state); |
| 1844 | rcu_send_cbs_to_orphanage(&rcu_sched_state); | 1837 | rcu_send_cbs_to_online(&rcu_sched_state); |
| 1845 | rcu_preempt_send_cbs_to_orphanage(); | 1838 | rcu_preempt_send_cbs_to_online(); |
| 1846 | break; | 1839 | break; |
| 1847 | case CPU_DEAD: | 1840 | case CPU_DEAD: |
| 1848 | case CPU_DEAD_FROZEN: | 1841 | case CPU_DEAD_FROZEN: |
| @@ -1880,8 +1873,9 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) | |||
| 1880 | { | 1873 | { |
| 1881 | int i; | 1874 | int i; |
| 1882 | 1875 | ||
| 1883 | for (i = NUM_RCU_LVLS - 1; i >= 0; i--) | 1876 | for (i = NUM_RCU_LVLS - 1; i > 0; i--) |
| 1884 | rsp->levelspread[i] = CONFIG_RCU_FANOUT; | 1877 | rsp->levelspread[i] = CONFIG_RCU_FANOUT; |
| 1878 | rsp->levelspread[0] = RCU_FANOUT_LEAF; | ||
| 1885 | } | 1879 | } |
| 1886 | #else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ | 1880 | #else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ |
| 1887 | static void __init rcu_init_levelspread(struct rcu_state *rsp) | 1881 | static void __init rcu_init_levelspread(struct rcu_state *rsp) |
diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 91d4170c5c1..e8f057e44e3 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h | |||
| @@ -31,46 +31,51 @@ | |||
| 31 | /* | 31 | /* |
| 32 | * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT. | 32 | * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT. |
| 33 | * In theory, it should be possible to add more levels straightforwardly. | 33 | * In theory, it should be possible to add more levels straightforwardly. |
| 34 | * In practice, this has not been tested, so there is probably some | 34 | * In practice, this did work well going from three levels to four. |
| 35 | * bug somewhere. | 35 | * Of course, your mileage may vary. |
| 36 | */ | 36 | */ |
| 37 | #define MAX_RCU_LVLS 4 | 37 | #define MAX_RCU_LVLS 4 |
| 38 | #define RCU_FANOUT (CONFIG_RCU_FANOUT) | 38 | #if CONFIG_RCU_FANOUT > 16 |
| 39 | #define RCU_FANOUT_SQ (RCU_FANOUT * RCU_FANOUT) | 39 | #define RCU_FANOUT_LEAF 16 |
| 40 | #define RCU_FANOUT_CUBE (RCU_FANOUT_SQ * RCU_FANOUT) | 40 | #else /* #if CONFIG_RCU_FANOUT > 16 */ |
| 41 | #define RCU_FANOUT_FOURTH (RCU_FANOUT_CUBE * RCU_FANOUT) | 41 | #define RCU_FANOUT_LEAF (CONFIG_RCU_FANOUT) |
| 42 | 42 | #endif /* #else #if CONFIG_RCU_FANOUT > 16 */ | |
| 43 | #if NR_CPUS <= RCU_FANOUT | 43 | #define RCU_FANOUT_1 (RCU_FANOUT_LEAF) |
| 44 | #define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT) | ||
| 45 | #define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT) | ||
| 46 | #define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT) | ||
| 47 | |||
| 48 | #if NR_CPUS <= RCU_FANOUT_1 | ||
| 44 | # define NUM_RCU_LVLS 1 | 49 | # define NUM_RCU_LVLS 1 |
| 45 | # define NUM_RCU_LVL_0 1 | 50 | # define NUM_RCU_LVL_0 1 |
| 46 | # define NUM_RCU_LVL_1 (NR_CPUS) | 51 | # define NUM_RCU_LVL_1 (NR_CPUS) |
| 47 | # define NUM_RCU_LVL_2 0 | 52 | # define NUM_RCU_LVL_2 0 |
| 48 | # define NUM_RCU_LVL_3 0 | 53 | # define NUM_RCU_LVL_3 0 |
| 49 | # define NUM_RCU_LVL_4 0 | 54 | # define NUM_RCU_LVL_4 0 |
| 50 | #elif NR_CPUS <= RCU_FANOUT_SQ | 55 | #elif NR_CPUS <= RCU_FANOUT_2 |
| 51 | # define NUM_RCU_LVLS 2 | 56 | # define NUM_RCU_LVLS 2 |
| 52 | # define NUM_RCU_LVL_0 1 | 57 | # define NUM_RCU_LVL_0 1 |
| 53 | # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) | 58 | # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) |
| 54 | # define NUM_RCU_LVL_2 (NR_CPUS) | 59 | # define NUM_RCU_LVL_2 (NR_CPUS) |
| 55 | # define NUM_RCU_LVL_3 0 | 60 | # define NUM_RCU_LVL_3 0 |
| 56 | # define NUM_RCU_LVL_4 0 | 61 | # define NUM_RCU_LVL_4 0 |
| 57 | #elif NR_CPUS <= RCU_FANOUT_CUBE | 62 | #elif NR_CPUS <= RCU_FANOUT_3 |
| 58 | # define NUM_RCU_LVLS 3 | 63 | # define NUM_RCU_LVLS 3 |
| 59 | # define NUM_RCU_LVL_0 1 | 64 | # define NUM_RCU_LVL_0 1 |
| 60 | # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ) | 65 | # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) |
| 61 | # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) | 66 | # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) |
| 62 | # define NUM_RCU_LVL_3 NR_CPUS | 67 | # define NUM_RCU_LVL_3 (NR_CPUS) |
| 63 | # define NUM_RCU_LVL_4 0 | 68 | # define NUM_RCU_LVL_4 0 |
| 64 | #elif NR_CPUS <= RCU_FANOUT_FOURTH | 69 | #elif NR_CPUS <= RCU_FANOUT_4 |
| 65 | # define NUM_RCU_LVLS 4 | 70 | # define NUM_RCU_LVLS 4 |
| 66 | # define NUM_RCU_LVL_0 1 | 71 | # define NUM_RCU_LVL_0 1 |
| 67 | # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_CUBE) | 72 | # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3) |
| 68 | # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ) | 73 | # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) |
| 69 | # define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) | 74 | # define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) |
| 70 | # define NUM_RCU_LVL_4 NR_CPUS | 75 | # define NUM_RCU_LVL_4 (NR_CPUS) |
| 71 | #else | 76 | #else |
| 72 | # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" | 77 | # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" |
| 73 | #endif /* #if (NR_CPUS) <= RCU_FANOUT */ | 78 | #endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ |
| 74 | 79 | ||
| 75 | #define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4) | 80 | #define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4) |
| 76 | #define NUM_RCU_NODES (RCU_SUM - NR_CPUS) | 81 | #define NUM_RCU_NODES (RCU_SUM - NR_CPUS) |
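Editor's note: the new RCU_FANOUT_LEAF cap (16) limits contention on leaf rcu_node locks while interior levels keep the full CONFIG_RCU_FANOUT. A worked example, assuming CONFIG_RCU_FANOUT=64 and NR_CPUS=4096 (values picked purely for illustration):

        /* RCU_FANOUT_LEAF = 16 (since 64 > 16)
         * RCU_FANOUT_1 = 16
         * RCU_FANOUT_2 = 16 * 64   = 1024
         * RCU_FANOUT_3 = 1024 * 64 = 65536  -> 4096 CPUs fit in 3 levels
         *
         * NUM_RCU_LVL_0 = 1
         * NUM_RCU_LVL_1 = DIV_ROUND_UP(4096, 1024) = 4
         * NUM_RCU_LVL_2 = DIV_ROUND_UP(4096, 16)   = 256
         * NUM_RCU_LVL_3 = 4096                       (the per-CPU leaves)
         * NUM_RCU_NODES = (1 + 4 + 256 + 4096) - 4096 = 261
         */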
| @@ -203,8 +208,8 @@ struct rcu_data { | |||
| 203 | long qlen_last_fqs_check; | 208 | long qlen_last_fqs_check; |
| 204 | /* qlen at last check for QS forcing */ | 209 | /* qlen at last check for QS forcing */ |
| 205 | unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ | 210 | unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ |
| 206 | unsigned long n_cbs_orphaned; /* RCU cbs sent to orphanage. */ | 211 | unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */ |
| 207 | unsigned long n_cbs_adopted; /* RCU cbs adopted from orphanage. */ | 212 | unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */ |
| 208 | unsigned long n_force_qs_snap; | 213 | unsigned long n_force_qs_snap; |
| 209 | /* did other CPU force QS recently? */ | 214 | /* did other CPU force QS recently? */ |
| 210 | long blimit; /* Upper limit on a processed batch */ | 215 | long blimit; /* Upper limit on a processed batch */ |
| @@ -309,15 +314,7 @@ struct rcu_state { | |||
| 309 | /* End of fields guarded by root rcu_node's lock. */ | 314 | /* End of fields guarded by root rcu_node's lock. */ |
| 310 | 315 | ||
| 311 | raw_spinlock_t onofflock; /* exclude on/offline and */ | 316 | raw_spinlock_t onofflock; /* exclude on/offline and */ |
| 312 | /* starting new GP. Also */ | 317 | /* starting new GP. */ |
| 313 | /* protects the following */ | ||
| 314 | /* orphan_cbs fields. */ | ||
| 315 | struct rcu_head *orphan_cbs_list; /* list of rcu_head structs */ | ||
| 316 | /* orphaned by all CPUs in */ | ||
| 317 | /* a given leaf rcu_node */ | ||
| 318 | /* going offline. */ | ||
| 319 | struct rcu_head **orphan_cbs_tail; /* And tail pointer. */ | ||
| 320 | long orphan_qlen; /* Number of orphaned cbs. */ | ||
| 321 | raw_spinlock_t fqslock; /* Only one task forcing */ | 318 | raw_spinlock_t fqslock; /* Only one task forcing */ |
| 322 | /* quiescent states. */ | 319 | /* quiescent states. */ |
| 323 | unsigned long jiffies_force_qs; /* Time at which to invoke */ | 320 | unsigned long jiffies_force_qs; /* Time at which to invoke */ |
| @@ -390,7 +387,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp); | |||
| 390 | static int rcu_preempt_pending(int cpu); | 387 | static int rcu_preempt_pending(int cpu); |
| 391 | static int rcu_preempt_needs_cpu(int cpu); | 388 | static int rcu_preempt_needs_cpu(int cpu); |
| 392 | static void __cpuinit rcu_preempt_init_percpu_data(int cpu); | 389 | static void __cpuinit rcu_preempt_init_percpu_data(int cpu); |
| 393 | static void rcu_preempt_send_cbs_to_orphanage(void); | 390 | static void rcu_preempt_send_cbs_to_online(void); |
| 394 | static void __init __rcu_init_preempt(void); | 391 | static void __init __rcu_init_preempt(void); |
| 395 | static void rcu_needs_cpu_flush(void); | 392 | static void rcu_needs_cpu_flush(void); |
| 396 | 393 | ||
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 71a4147473f..a3638710dc6 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h | |||
| @@ -25,6 +25,7 @@ | |||
| 25 | */ | 25 | */ |
| 26 | 26 | ||
| 27 | #include <linux/delay.h> | 27 | #include <linux/delay.h> |
| 28 | #include <linux/stop_machine.h> | ||
| 28 | 29 | ||
| 29 | /* | 30 | /* |
| 30 | * Check the RCU kernel configuration parameters and print informative | 31 | * Check the RCU kernel configuration parameters and print informative |
| @@ -773,11 +774,11 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu) | |||
| 773 | } | 774 | } |
| 774 | 775 | ||
| 775 | /* | 776 | /* |
| 776 | * Move preemptable RCU's callbacks to ->orphan_cbs_list. | 777 | * Move preemptable RCU's callbacks from dying CPU to other online CPU. |
| 777 | */ | 778 | */ |
| 778 | static void rcu_preempt_send_cbs_to_orphanage(void) | 779 | static void rcu_preempt_send_cbs_to_online(void) |
| 779 | { | 780 | { |
| 780 | rcu_send_cbs_to_orphanage(&rcu_preempt_state); | 781 | rcu_send_cbs_to_online(&rcu_preempt_state); |
| 781 | } | 782 | } |
| 782 | 783 | ||
| 783 | /* | 784 | /* |
| @@ -1001,7 +1002,7 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu) | |||
| 1001 | /* | 1002 | /* |
| 1002 | * Because there is no preemptable RCU, there are no callbacks to move. | 1003 | * Because there is no preemptable RCU, there are no callbacks to move. |
| 1003 | */ | 1004 | */ |
| 1004 | static void rcu_preempt_send_cbs_to_orphanage(void) | 1005 | static void rcu_preempt_send_cbs_to_online(void) |
| 1005 | { | 1006 | { |
| 1006 | } | 1007 | } |
| 1007 | 1008 | ||
| @@ -1014,6 +1015,132 @@ static void __init __rcu_init_preempt(void) | |||
| 1014 | 1015 | ||
| 1015 | #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ | 1016 | #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ |
| 1016 | 1017 | ||
| 1018 | #ifndef CONFIG_SMP | ||
| 1019 | |||
| 1020 | void synchronize_sched_expedited(void) | ||
| 1021 | { | ||
| 1022 | cond_resched(); | ||
| 1023 | } | ||
| 1024 | EXPORT_SYMBOL_GPL(synchronize_sched_expedited); | ||
| 1025 | |||
| 1026 | #else /* #ifndef CONFIG_SMP */ | ||
| 1027 | |||
| 1028 | static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0); | ||
| 1029 | static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0); | ||
| 1030 | |||
| 1031 | static int synchronize_sched_expedited_cpu_stop(void *data) | ||
| 1032 | { | ||
| 1033 | /* | ||
| 1034 | * There must be a full memory barrier on each affected CPU | ||
| 1035 | * between the time that try_stop_cpus() is called and the | ||
| 1036 | * time that it returns. | ||
| 1037 | * | ||
| 1038 | * In the current initial implementation of cpu_stop, the | ||
| 1039 | * above condition is already met when the control reaches | ||
| 1040 | * this point and the following smp_mb() is not strictly | ||
| 1041 | * necessary. Do smp_mb() anyway for documentation and | ||
| 1042 | * robustness against future implementation changes. | ||
| 1043 | */ | ||
| 1044 | smp_mb(); /* See above comment block. */ | ||
| 1045 | return 0; | ||
| 1046 | } | ||
| 1047 | |||
| 1048 | /* | ||
| 1049 | * Wait for an rcu-sched grace period to elapse, but use "big hammer" | ||
| 1050 | * approach to force grace period to end quickly. This consumes | ||
| 1051 | * significant time on all CPUs, and is thus not recommended for | ||
| 1052 | * any sort of common-case code. | ||
| 1053 | * | ||
| 1054 | * Note that it is illegal to call this function while holding any | ||
| 1055 | * lock that is acquired by a CPU-hotplug notifier. Failing to | ||
| 1056 | * observe this restriction will result in deadlock. | ||
| 1057 | * | ||
| 1058 | * This implementation can be thought of as an application of ticket | ||
| 1059 | * locking to RCU, with sync_sched_expedited_started and | ||
| 1060 | * sync_sched_expedited_done taking on the roles of the halves | ||
| 1061 | * of the ticket-lock word. Each task atomically increments | ||
| 1062 | * sync_sched_expedited_started upon entry, snapshotting the old value, | ||
| 1063 | * then attempts to stop all the CPUs. If this succeeds, then each | ||
| 1064 | * CPU will have executed a context switch, resulting in an RCU-sched | ||
| 1065 | * grace period. We are then done, so we use atomic_cmpxchg() to | ||
| 1066 | * update sync_sched_expedited_done to match our snapshot -- but | ||
| 1067 | * only if someone else has not already advanced past our snapshot. | ||
| 1068 | * | ||
| 1069 | * On the other hand, if try_stop_cpus() fails, we check the value | ||
| 1070 | * of sync_sched_expedited_done. If it has advanced past our | ||
| 1071 | * initial snapshot, then someone else must have forced a grace period | ||
| 1072 | * some time after we took our snapshot. In this case, our work is | ||
| 1073 | * done for us, and we can simply return. Otherwise, we try again, | ||
| 1074 | * but keep our initial snapshot for purposes of checking for someone | ||
| 1075 | * doing our work for us. | ||
| 1076 | * | ||
| 1077 | * If we fail too many times in a row, we fall back to synchronize_sched(). | ||
| 1078 | */ | ||
| 1079 | void synchronize_sched_expedited(void) | ||
| 1080 | { | ||
| 1081 | int firstsnap, s, snap, trycount = 0; | ||
| 1082 | |||
| 1083 | /* Note that atomic_inc_return() implies full memory barrier. */ | ||
| 1084 | firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started); | ||
| 1085 | get_online_cpus(); | ||
| 1086 | |||
| 1087 | /* | ||
| 1088 | * Each pass through the following loop attempts to force a | ||
| 1089 | * context switch on each CPU. | ||
| 1090 | */ | ||
| 1091 | while (try_stop_cpus(cpu_online_mask, | ||
| 1092 | synchronize_sched_expedited_cpu_stop, | ||
| 1093 | NULL) == -EAGAIN) { | ||
| 1094 | put_online_cpus(); | ||
| 1095 | |||
| 1096 | /* No joy, try again later. Or just synchronize_sched(). */ | ||
| 1097 | if (trycount++ < 10) | ||
| 1098 | udelay(trycount * num_online_cpus()); | ||
| 1099 | else { | ||
| 1100 | synchronize_sched(); | ||
| 1101 | return; | ||
| 1102 | } | ||
| 1103 | |||
| 1104 | /* Check to see if someone else did our work for us. */ | ||
| 1105 | s = atomic_read(&sync_sched_expedited_done); | ||
| 1106 | if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) { | ||
| 1107 | smp_mb(); /* ensure test happens before caller kfree */ | ||
| 1108 | return; | ||
| 1109 | } | ||
| 1110 | |||
| 1111 | /* | ||
| 1112 | * Refetching sync_sched_expedited_started allows later | ||
| 1113 | * callers to piggyback on our grace period. We subtract | ||
| 1114 | * 1 to get the same token that the last incrementer got. | ||
| 1115 | * We retry after they started, so our grace period works | ||
| 1116 | * for them, and they started after our first try, so their | ||
| 1117 | * grace period works for us. | ||
| 1118 | */ | ||
| 1119 | get_online_cpus(); | ||
| 1120 | snap = atomic_read(&sync_sched_expedited_started) - 1; | ||
| 1121 | smp_mb(); /* ensure read is before try_stop_cpus(). */ | ||
| 1122 | } | ||
| 1123 | |||
| 1124 | /* | ||
| 1125 | * Everyone up to our most recent fetch is covered by our grace | ||
| 1126 | * period. Update the counter, but only if our work is still | ||
| 1127 | * relevant -- which it won't be if someone who started later | ||
| 1128 | * than we did beat us to the punch. | ||
| 1129 | */ | ||
| 1130 | do { | ||
| 1131 | s = atomic_read(&sync_sched_expedited_done); | ||
| 1132 | if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) { | ||
| 1133 | smp_mb(); /* ensure test happens before caller kfree */ | ||
| 1134 | break; | ||
| 1135 | } | ||
| 1136 | } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s); | ||
| 1137 | |||
| 1138 | put_online_cpus(); | ||
| 1139 | } | ||
| 1140 | EXPORT_SYMBOL_GPL(synchronize_sched_expedited); | ||
| 1141 | |||
| 1142 | #endif /* #else #ifndef CONFIG_SMP */ | ||
| 1143 | |||
| 1017 | #if !defined(CONFIG_RCU_FAST_NO_HZ) | 1144 | #if !defined(CONFIG_RCU_FAST_NO_HZ) |
| 1018 | 1145 | ||
| 1019 | /* | 1146 | /* |
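Editor's note: the long comment above describes synchronize_sched_expedited() as ticket locking, with sync_sched_expedited_started playing the "next ticket" counter and sync_sched_expedited_done the "now serving" counter. For readers unfamiliar with the analogy, a ticket lock in miniature (illustrative only, not kernel code):

        /*
         * Minimal ticket-lock sketch.  synchronize_sched_expedited() reuses
         * the same two-counter idea, except that finishing one grace period
         * can "serve" every ticket taken before that grace period started.
         */
        struct ticket_lock {
                atomic_t next;
                atomic_t owner;
        };

        static void ticket_lock(struct ticket_lock *l)
        {
                int t = atomic_inc_return(&l->next) - 1;        /* take a ticket */

                while (atomic_read(&l->owner) != t)
                        cpu_relax();                            /* wait to be served */
        }

        static void ticket_unlock(struct ticket_lock *l)
        {
                atomic_inc(&l->owner);                          /* serve next ticket */
        }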
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index d15430b9d12..c8e97853b97 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c | |||
| @@ -166,13 +166,13 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) | |||
| 166 | 166 | ||
| 167 | gpnum = rsp->gpnum; | 167 | gpnum = rsp->gpnum; |
| 168 | seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " | 168 | seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " |
| 169 | "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n", | 169 | "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n", |
| 170 | rsp->completed, gpnum, rsp->signaled, | 170 | rsp->completed, gpnum, rsp->signaled, |
| 171 | (long)(rsp->jiffies_force_qs - jiffies), | 171 | (long)(rsp->jiffies_force_qs - jiffies), |
| 172 | (int)(jiffies & 0xffff), | 172 | (int)(jiffies & 0xffff), |
| 173 | rsp->n_force_qs, rsp->n_force_qs_ngp, | 173 | rsp->n_force_qs, rsp->n_force_qs_ngp, |
| 174 | rsp->n_force_qs - rsp->n_force_qs_ngp, | 174 | rsp->n_force_qs - rsp->n_force_qs_ngp, |
| 175 | rsp->n_force_qs_lh, rsp->orphan_qlen); | 175 | rsp->n_force_qs_lh); |
| 176 | for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { | 176 | for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { |
| 177 | if (rnp->level != level) { | 177 | if (rnp->level != level) { |
| 178 | seq_puts(m, "\n"); | 178 | seq_puts(m, "\n"); |
| @@ -300,7 +300,7 @@ static const struct file_operations rcu_pending_fops = { | |||
| 300 | 300 | ||
| 301 | static struct dentry *rcudir; | 301 | static struct dentry *rcudir; |
| 302 | 302 | ||
| 303 | static int __init rcuclassic_trace_init(void) | 303 | static int __init rcutree_trace_init(void) |
| 304 | { | 304 | { |
| 305 | struct dentry *retval; | 305 | struct dentry *retval; |
| 306 | 306 | ||
| @@ -337,14 +337,14 @@ free_out: | |||
| 337 | return 1; | 337 | return 1; |
| 338 | } | 338 | } |
| 339 | 339 | ||
| 340 | static void __exit rcuclassic_trace_cleanup(void) | 340 | static void __exit rcutree_trace_cleanup(void) |
| 341 | { | 341 | { |
| 342 | debugfs_remove_recursive(rcudir); | 342 | debugfs_remove_recursive(rcudir); |
| 343 | } | 343 | } |
| 344 | 344 | ||
| 345 | 345 | ||
| 346 | module_init(rcuclassic_trace_init); | 346 | module_init(rcutree_trace_init); |
| 347 | module_exit(rcuclassic_trace_cleanup); | 347 | module_exit(rcutree_trace_cleanup); |
| 348 | 348 | ||
| 349 | MODULE_AUTHOR("Paul E. McKenney"); | 349 | MODULE_AUTHOR("Paul E. McKenney"); |
| 350 | MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation"); | 350 | MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation"); |
diff --git a/kernel/relay.c b/kernel/relay.c index c7cf397fb92..859ea5a9605 100644 --- a/kernel/relay.c +++ b/kernel/relay.c | |||
| @@ -70,17 +70,10 @@ static const struct vm_operations_struct relay_file_mmap_ops = { | |||
| 70 | */ | 70 | */ |
| 71 | static struct page **relay_alloc_page_array(unsigned int n_pages) | 71 | static struct page **relay_alloc_page_array(unsigned int n_pages) |
| 72 | { | 72 | { |
| 73 | struct page **array; | 73 | const size_t pa_size = n_pages * sizeof(struct page *); |
| 74 | size_t pa_size = n_pages * sizeof(struct page *); | 74 | if (pa_size > PAGE_SIZE) |
| 75 | 75 | return vzalloc(pa_size); | |
| 76 | if (pa_size > PAGE_SIZE) { | 76 | return kzalloc(pa_size, GFP_KERNEL); |
| 77 | array = vmalloc(pa_size); | ||
| 78 | if (array) | ||
| 79 | memset(array, 0, pa_size); | ||
| 80 | } else { | ||
| 81 | array = kzalloc(pa_size, GFP_KERNEL); | ||
| 82 | } | ||
| 83 | return array; | ||
| 84 | } | 77 | } |
| 85 | 78 | ||
| 86 | /* | 79 | /* |
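Editor's note: vzalloc() entered the tree in this same cycle as the zeroing counterpart of vmalloc(), which is what lets the open-coded vmalloc()+memset() above collapse into one call. Behaviourally it matches the sketch below (the real implementation asks the page allocator for zeroed pages rather than memset()ing):

        #include <linux/vmalloc.h>
        #include <linux/string.h>

        /* What the old relay code open-coded and vzalloc() now provides. */
        static void *vzalloc_equiv(unsigned long size)
        {
                void *p = vmalloc(size);

                if (p)
                        memset(p, 0, size);
                return p;
        }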
diff --git a/kernel/resource.c b/kernel/resource.c index 9fad33efd0d..798e2fae2a0 100644 --- a/kernel/resource.c +++ b/kernel/resource.c | |||
| @@ -40,23 +40,6 @@ EXPORT_SYMBOL(iomem_resource); | |||
| 40 | 40 | ||
| 41 | static DEFINE_RWLOCK(resource_lock); | 41 | static DEFINE_RWLOCK(resource_lock); |
| 42 | 42 | ||
| 43 | /* | ||
| 44 | * By default, we allocate free space bottom-up. The architecture can request | ||
| 45 | * top-down by clearing this flag. The user can override the architecture's | ||
| 46 | * choice with the "resource_alloc_from_bottom" kernel boot option, but that | ||
| 47 | * should only be a debugging tool. | ||
| 48 | */ | ||
| 49 | int resource_alloc_from_bottom = 1; | ||
| 50 | |||
| 51 | static __init int setup_alloc_from_bottom(char *s) | ||
| 52 | { | ||
| 53 | printk(KERN_INFO | ||
| 54 | "resource: allocating from bottom-up; please report a bug\n"); | ||
| 55 | resource_alloc_from_bottom = 1; | ||
| 56 | return 0; | ||
| 57 | } | ||
| 58 | early_param("resource_alloc_from_bottom", setup_alloc_from_bottom); | ||
| 59 | |||
| 60 | static void *r_next(struct seq_file *m, void *v, loff_t *pos) | 43 | static void *r_next(struct seq_file *m, void *v, loff_t *pos) |
| 61 | { | 44 | { |
| 62 | struct resource *p = v; | 45 | struct resource *p = v; |
| @@ -374,6 +357,10 @@ int __weak page_is_ram(unsigned long pfn) | |||
| 374 | return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1; | 357 | return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1; |
| 375 | } | 358 | } |
| 376 | 359 | ||
| 360 | void __weak arch_remove_reservations(struct resource *avail) | ||
| 361 | { | ||
| 362 | } | ||
| 363 | |||
| 377 | static resource_size_t simple_align_resource(void *data, | 364 | static resource_size_t simple_align_resource(void *data, |
| 378 | const struct resource *avail, | 365 | const struct resource *avail, |
| 379 | resource_size_t size, | 366 | resource_size_t size, |
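Editor's note: arch_remove_reservations() is declared __weak, i.e. the empty definition above is a default the linker drops whenever an architecture supplies its own strong symbol, and find_resource() (below) calls it to trim arch-reserved ranges out of each candidate window. A hedged sketch of what an override could look like -- the reserved range here is entirely made up:

        /* Hypothetical arch override; the 1MB window is for illustration only. */
        void arch_remove_reservations(struct resource *avail)
        {
                /* keep allocations out of a firmware-reserved low-memory window */
                if (avail->start < 0x100000)
                        avail->start = 0x100000;
        }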
| @@ -397,74 +384,7 @@ static bool resource_contains(struct resource *res1, struct resource *res2) | |||
| 397 | } | 384 | } |
| 398 | 385 | ||
| 399 | /* | 386 | /* |
| 400 | * Find the resource before "child" in the sibling list of "root" children. | ||
| 401 | */ | ||
| 402 | static struct resource *find_sibling_prev(struct resource *root, struct resource *child) | ||
| 403 | { | ||
| 404 | struct resource *this; | ||
| 405 | |||
| 406 | for (this = root->child; this; this = this->sibling) | ||
| 407 | if (this->sibling == child) | ||
| 408 | return this; | ||
| 409 | |||
| 410 | return NULL; | ||
| 411 | } | ||
| 412 | |||
| 413 | /* | ||
| 414 | * Find empty slot in the resource tree given range and alignment. | 387 | * Find empty slot in the resource tree given range and alignment. |
| 415 | * This version allocates from the end of the root resource first. | ||
| 416 | */ | ||
| 417 | static int find_resource_from_top(struct resource *root, struct resource *new, | ||
| 418 | resource_size_t size, resource_size_t min, | ||
| 419 | resource_size_t max, resource_size_t align, | ||
| 420 | resource_size_t (*alignf)(void *, | ||
| 421 | const struct resource *, | ||
| 422 | resource_size_t, | ||
| 423 | resource_size_t), | ||
| 424 | void *alignf_data) | ||
| 425 | { | ||
| 426 | struct resource *this; | ||
| 427 | struct resource tmp, avail, alloc; | ||
| 428 | |||
| 429 | tmp.start = root->end; | ||
| 430 | tmp.end = root->end; | ||
| 431 | |||
| 432 | this = find_sibling_prev(root, NULL); | ||
| 433 | for (;;) { | ||
| 434 | if (this) { | ||
| 435 | if (this->end < root->end) | ||
| 436 | tmp.start = this->end + 1; | ||
| 437 | } else | ||
| 438 | tmp.start = root->start; | ||
| 439 | |||
| 440 | resource_clip(&tmp, min, max); | ||
| 441 | |||
| 442 | /* Check for overflow after ALIGN() */ | ||
| 443 | avail = *new; | ||
| 444 | avail.start = ALIGN(tmp.start, align); | ||
| 445 | avail.end = tmp.end; | ||
| 446 | if (avail.start >= tmp.start) { | ||
| 447 | alloc.start = alignf(alignf_data, &avail, size, align); | ||
| 448 | alloc.end = alloc.start + size - 1; | ||
| 449 | if (resource_contains(&avail, &alloc)) { | ||
| 450 | new->start = alloc.start; | ||
| 451 | new->end = alloc.end; | ||
| 452 | return 0; | ||
| 453 | } | ||
| 454 | } | ||
| 455 | |||
| 456 | if (!this || this->start == root->start) | ||
| 457 | break; | ||
| 458 | |||
| 459 | tmp.end = this->start - 1; | ||
| 460 | this = find_sibling_prev(root, this); | ||
| 461 | } | ||
| 462 | return -EBUSY; | ||
| 463 | } | ||
| 464 | |||
| 465 | /* | ||
| 466 | * Find empty slot in the resource tree given range and alignment. | ||
| 467 | * This version allocates from the beginning of the root resource first. | ||
| 468 | */ | 388 | */ |
| 469 | static int find_resource(struct resource *root, struct resource *new, | 389 | static int find_resource(struct resource *root, struct resource *new, |
| 470 | resource_size_t size, resource_size_t min, | 390 | resource_size_t size, resource_size_t min, |
| @@ -478,23 +398,24 @@ static int find_resource(struct resource *root, struct resource *new, | |||
| 478 | struct resource *this = root->child; | 398 | struct resource *this = root->child; |
| 479 | struct resource tmp = *new, avail, alloc; | 399 | struct resource tmp = *new, avail, alloc; |
| 480 | 400 | ||
| 401 | tmp.flags = new->flags; | ||
| 481 | tmp.start = root->start; | 402 | tmp.start = root->start; |
| 482 | /* | 403 | /* |
| 483 | * Skip past an allocated resource that starts at 0, since the | 404 | * Skip past an allocated resource that starts at 0, since the assignment |
| 484 | * assignment of this->start - 1 to tmp->end below would cause an | 405 | * of this->start - 1 to tmp->end below would cause an underflow. |
| 485 | * underflow. | ||
| 486 | */ | 406 | */ |
| 487 | if (this && this->start == 0) { | 407 | if (this && this->start == 0) { |
| 488 | tmp.start = this->end + 1; | 408 | tmp.start = this->end + 1; |
| 489 | this = this->sibling; | 409 | this = this->sibling; |
| 490 | } | 410 | } |
| 491 | for (;;) { | 411 | for(;;) { |
| 492 | if (this) | 412 | if (this) |
| 493 | tmp.end = this->start - 1; | 413 | tmp.end = this->start - 1; |
| 494 | else | 414 | else |
| 495 | tmp.end = root->end; | 415 | tmp.end = root->end; |
| 496 | 416 | ||
| 497 | resource_clip(&tmp, min, max); | 417 | resource_clip(&tmp, min, max); |
| 418 | arch_remove_reservations(&tmp); | ||
| 498 | 419 | ||
| 499 | /* Check for overflow after ALIGN() */ | 420 | /* Check for overflow after ALIGN() */ |
| 500 | avail = *new; | 421 | avail = *new; |
| @@ -509,10 +430,8 @@ static int find_resource(struct resource *root, struct resource *new, | |||
| 509 | return 0; | 430 | return 0; |
| 510 | } | 431 | } |
| 511 | } | 432 | } |
| 512 | |||
| 513 | if (!this) | 433 | if (!this) |
| 514 | break; | 434 | break; |
| 515 | |||
| 516 | tmp.start = this->end + 1; | 435 | tmp.start = this->end + 1; |
| 517 | this = this->sibling; | 436 | this = this->sibling; |
| 518 | } | 437 | } |
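Editor's note: with find_resource_from_top() gone, find_resource() is again the only allocation path. It walks the root's children in address order, treats every gap between siblings as a candidate window, clips it to the caller's [min, max], removes arch reservations, aligns, and takes the first window that contains the requested size. The clipping step is a plain interval intersection, roughly what resource_clip() does:

        /* Roughly what resource_clip() does with the caller's min/max bounds. */
        static void clip_to_bounds(struct resource *res,
                                   resource_size_t min, resource_size_t max)
        {
                if (res->start < min)
                        res->start = min;
                if (res->end > max)
                        res->end = max;
                /* if start now exceeds end, the window is empty and skipped */
        }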
| @@ -545,10 +464,7 @@ int allocate_resource(struct resource *root, struct resource *new, | |||
| 545 | alignf = simple_align_resource; | 464 | alignf = simple_align_resource; |
| 546 | 465 | ||
| 547 | write_lock(&resource_lock); | 466 | write_lock(&resource_lock); |
| 548 | if (resource_alloc_from_bottom) | 467 | err = find_resource(root, new, size, min, max, align, alignf, alignf_data); |
| 549 | err = find_resource(root, new, size, min, max, align, alignf, alignf_data); | ||
| 550 | else | ||
| 551 | err = find_resource_from_top(root, new, size, min, max, align, alignf, alignf_data); | ||
| 552 | if (err >= 0 && __request_resource(root, new)) | 468 | if (err >= 0 && __request_resource(root, new)) |
| 553 | err = -EBUSY; | 469 | err = -EBUSY; |
| 554 | write_unlock(&resource_lock); | 470 | write_unlock(&resource_lock); |
diff --git a/kernel/sched.c b/kernel/sched.c index aa14a56f9d0..18d38e4ec7b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
| @@ -75,9 +75,11 @@ | |||
| 75 | 75 | ||
| 76 | #include <asm/tlb.h> | 76 | #include <asm/tlb.h> |
| 77 | #include <asm/irq_regs.h> | 77 | #include <asm/irq_regs.h> |
| 78 | #include <asm/mutex.h> | ||
| 78 | 79 | ||
| 79 | #include "sched_cpupri.h" | 80 | #include "sched_cpupri.h" |
| 80 | #include "workqueue_sched.h" | 81 | #include "workqueue_sched.h" |
| 82 | #include "sched_autogroup.h" | ||
| 81 | 83 | ||
| 82 | #define CREATE_TRACE_POINTS | 84 | #define CREATE_TRACE_POINTS |
| 83 | #include <trace/events/sched.h> | 85 | #include <trace/events/sched.h> |
| @@ -253,6 +255,8 @@ struct task_group { | |||
| 253 | /* runqueue "owned" by this group on each cpu */ | 255 | /* runqueue "owned" by this group on each cpu */ |
| 254 | struct cfs_rq **cfs_rq; | 256 | struct cfs_rq **cfs_rq; |
| 255 | unsigned long shares; | 257 | unsigned long shares; |
| 258 | |||
| 259 | atomic_t load_weight; | ||
| 256 | #endif | 260 | #endif |
| 257 | 261 | ||
| 258 | #ifdef CONFIG_RT_GROUP_SCHED | 262 | #ifdef CONFIG_RT_GROUP_SCHED |
| @@ -268,25 +272,18 @@ struct task_group { | |||
| 268 | struct task_group *parent; | 272 | struct task_group *parent; |
| 269 | struct list_head siblings; | 273 | struct list_head siblings; |
| 270 | struct list_head children; | 274 | struct list_head children; |
| 271 | }; | ||
| 272 | 275 | ||
| 273 | #define root_task_group init_task_group | 276 | #ifdef CONFIG_SCHED_AUTOGROUP |
| 277 | struct autogroup *autogroup; | ||
| 278 | #endif | ||
| 279 | }; | ||
| 274 | 280 | ||
| 275 | /* task_group_lock serializes add/remove of task groups and also changes to | 281 | /* task_group_lock serializes the addition/removal of task groups */ |
| 276 | * a task group's cpu shares. | ||
| 277 | */ | ||
| 278 | static DEFINE_SPINLOCK(task_group_lock); | 282 | static DEFINE_SPINLOCK(task_group_lock); |
| 279 | 283 | ||
| 280 | #ifdef CONFIG_FAIR_GROUP_SCHED | 284 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 281 | 285 | ||
| 282 | #ifdef CONFIG_SMP | 286 | # define ROOT_TASK_GROUP_LOAD NICE_0_LOAD |
| 283 | static int root_task_group_empty(void) | ||
| 284 | { | ||
| 285 | return list_empty(&root_task_group.children); | ||
| 286 | } | ||
| 287 | #endif | ||
| 288 | |||
| 289 | # define INIT_TASK_GROUP_LOAD NICE_0_LOAD | ||
| 290 | 287 | ||
| 291 | /* | 288 | /* |
| 292 | * A weight of 0 or 1 can cause arithmetics problems. | 289 | * A weight of 0 or 1 can cause arithmetics problems. |
| @@ -299,13 +296,13 @@ static int root_task_group_empty(void) | |||
| 299 | #define MIN_SHARES 2 | 296 | #define MIN_SHARES 2 |
| 300 | #define MAX_SHARES (1UL << 18) | 297 | #define MAX_SHARES (1UL << 18) |
| 301 | 298 | ||
| 302 | static int init_task_group_load = INIT_TASK_GROUP_LOAD; | 299 | static int root_task_group_load = ROOT_TASK_GROUP_LOAD; |
| 303 | #endif | 300 | #endif |
| 304 | 301 | ||
| 305 | /* Default task group. | 302 | /* Default task group. |
| 306 | * Every task in system belong to this group at bootup. | 303 | * Every task in system belong to this group at bootup. |
| 307 | */ | 304 | */ |
| 308 | struct task_group init_task_group; | 305 | struct task_group root_task_group; |
| 309 | 306 | ||
| 310 | #endif /* CONFIG_CGROUP_SCHED */ | 307 | #endif /* CONFIG_CGROUP_SCHED */ |
| 311 | 308 | ||
| @@ -342,6 +339,7 @@ struct cfs_rq { | |||
| 342 | * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This | 339 | * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This |
| 343 | * list is used during load balance. | 340 | * list is used during load balance. |
| 344 | */ | 341 | */ |
| 342 | int on_list; | ||
| 345 | struct list_head leaf_cfs_rq_list; | 343 | struct list_head leaf_cfs_rq_list; |
| 346 | struct task_group *tg; /* group that "owns" this runqueue */ | 344 | struct task_group *tg; /* group that "owns" this runqueue */ |
| 347 | 345 | ||
| @@ -360,14 +358,17 @@ struct cfs_rq { | |||
| 360 | unsigned long h_load; | 358 | unsigned long h_load; |
| 361 | 359 | ||
| 362 | /* | 360 | /* |
| 363 | * this cpu's part of tg->shares | 361 | * Maintaining per-cpu shares distribution for group scheduling |
| 362 | * | ||
| 363 | * load_stamp is the last time we updated the load average | ||
| 364 | * load_last is the last time we updated the load average and saw load | ||
| 365 | * load_unacc_exec_time is currently unaccounted execution time | ||
| 364 | */ | 366 | */ |
| 365 | unsigned long shares; | 367 | u64 load_avg; |
| 368 | u64 load_period; | ||
| 369 | u64 load_stamp, load_last, load_unacc_exec_time; | ||
| 366 | 370 | ||
| 367 | /* | 371 | unsigned long load_contribution; |
| 368 | * load.weight at the time we set shares | ||
| 369 | */ | ||
| 370 | unsigned long rq_weight; | ||
| 371 | #endif | 372 | #endif |
| 372 | #endif | 373 | #endif |
| 373 | }; | 374 | }; |
| @@ -552,26 +553,13 @@ struct rq { | |||
| 552 | /* try_to_wake_up() stats */ | 553 | /* try_to_wake_up() stats */ |
| 553 | unsigned int ttwu_count; | 554 | unsigned int ttwu_count; |
| 554 | unsigned int ttwu_local; | 555 | unsigned int ttwu_local; |
| 555 | |||
| 556 | /* BKL stats */ | ||
| 557 | unsigned int bkl_count; | ||
| 558 | #endif | 556 | #endif |
| 559 | }; | 557 | }; |
| 560 | 558 | ||
| 561 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); | 559 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); |
| 562 | 560 | ||
| 563 | static inline | ||
| 564 | void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) | ||
| 565 | { | ||
| 566 | rq->curr->sched_class->check_preempt_curr(rq, p, flags); | ||
| 567 | 561 | ||
| 568 | /* | 562 | static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); |
| 569 | * A queue event has occurred, and we're going to schedule. In | ||
| 570 | * this case, we can save a useless back to back clock update. | ||
| 571 | */ | ||
| 572 | if (test_tsk_need_resched(p)) | ||
| 573 | rq->skip_clock_update = 1; | ||
| 574 | } | ||
| 575 | 563 | ||
| 576 | static inline int cpu_of(struct rq *rq) | 564 | static inline int cpu_of(struct rq *rq) |
| 577 | { | 565 | { |
| @@ -615,11 +603,17 @@ static inline int cpu_of(struct rq *rq) | |||
| 615 | */ | 603 | */ |
| 616 | static inline struct task_group *task_group(struct task_struct *p) | 604 | static inline struct task_group *task_group(struct task_struct *p) |
| 617 | { | 605 | { |
| 606 | struct task_group *tg; | ||
| 618 | struct cgroup_subsys_state *css; | 607 | struct cgroup_subsys_state *css; |
| 619 | 608 | ||
| 609 | if (p->flags & PF_EXITING) | ||
| 610 | return &root_task_group; | ||
| 611 | |||
| 620 | css = task_subsys_state_check(p, cpu_cgroup_subsys_id, | 612 | css = task_subsys_state_check(p, cpu_cgroup_subsys_id, |
| 621 | lockdep_is_held(&task_rq(p)->lock)); | 613 | lockdep_is_held(&task_rq(p)->lock)); |
| 622 | return container_of(css, struct task_group, css); | 614 | tg = container_of(css, struct task_group, css); |
| 615 | |||
| 616 | return autogroup_task_group(p, tg); | ||
| 623 | } | 617 | } |
| 624 | 618 | ||
| 625 | /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ | 619 | /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ |
| @@ -646,22 +640,18 @@ static inline struct task_group *task_group(struct task_struct *p) | |||
| 646 | 640 | ||
| 647 | #endif /* CONFIG_CGROUP_SCHED */ | 641 | #endif /* CONFIG_CGROUP_SCHED */ |
| 648 | 642 | ||
| 649 | static u64 irq_time_cpu(int cpu); | 643 | static void update_rq_clock_task(struct rq *rq, s64 delta); |
| 650 | static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time); | ||
| 651 | 644 | ||
| 652 | inline void update_rq_clock(struct rq *rq) | 645 | static void update_rq_clock(struct rq *rq) |
| 653 | { | 646 | { |
| 654 | if (!rq->skip_clock_update) { | 647 | s64 delta; |
| 655 | int cpu = cpu_of(rq); | ||
| 656 | u64 irq_time; | ||
| 657 | 648 | ||
| 658 | rq->clock = sched_clock_cpu(cpu); | 649 | if (rq->skip_clock_update) |
| 659 | irq_time = irq_time_cpu(cpu); | 650 | return; |
| 660 | if (rq->clock - irq_time > rq->clock_task) | ||
| 661 | rq->clock_task = rq->clock - irq_time; | ||
| 662 | 651 | ||
| 663 | sched_irq_time_avg_update(rq, irq_time); | 652 | delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; |
| 664 | } | 653 | rq->clock += delta; |
| 654 | update_rq_clock_task(rq, delta); | ||
| 665 | } | 655 | } |
| 666 | 656 | ||
| 667 | /* | 657 | /* |
| @@ -751,7 +741,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, | |||
| 751 | buf[cnt] = 0; | 741 | buf[cnt] = 0; |
| 752 | cmp = strstrip(buf); | 742 | cmp = strstrip(buf); |
| 753 | 743 | ||
| 754 | if (strncmp(buf, "NO_", 3) == 0) { | 744 | if (strncmp(cmp, "NO_", 3) == 0) { |
| 755 | neg = 1; | 745 | neg = 1; |
| 756 | cmp += 3; | 746 | cmp += 3; |
| 757 | } | 747 | } |
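Editor's note: the one-character sched_feat_write() change is a genuine bug fix. strstrip() trims trailing whitespace in place but returns a pointer past any leading whitespace, so the "NO_" prefix test must inspect the stripped string, not the raw buffer written to the debugfs sched_features file. Illustration (input chosen for the example):

        char buf[] = "  NO_NONIRQ_POWER";
        char *cmp  = strstrip(buf);     /* cmp points at "NO_NONIRQ_POWER" */

        strncmp(buf, "NO_", 3);         /* != 0 -- the old test missed the prefix */
        strncmp(cmp, "NO_", 3);         /* == 0 -- the fixed test sees it */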
| @@ -807,20 +797,6 @@ late_initcall(sched_init_debug); | |||
| 807 | const_debug unsigned int sysctl_sched_nr_migrate = 32; | 797 | const_debug unsigned int sysctl_sched_nr_migrate = 32; |
| 808 | 798 | ||
| 809 | /* | 799 | /* |
| 810 | * ratelimit for updating the group shares. | ||
| 811 | * default: 0.25ms | ||
| 812 | */ | ||
| 813 | unsigned int sysctl_sched_shares_ratelimit = 250000; | ||
| 814 | unsigned int normalized_sysctl_sched_shares_ratelimit = 250000; | ||
| 815 | |||
| 816 | /* | ||
| 817 | * Inject some fuzzyness into changing the per-cpu group shares | ||
| 818 | * this avoids remote rq-locks at the expense of fairness. | ||
| 819 | * default: 4 | ||
| 820 | */ | ||
| 821 | unsigned int sysctl_sched_shares_thresh = 4; | ||
| 822 | |||
| 823 | /* | ||
| 824 | * period over which we average the RT time consumption, measured | 800 | * period over which we average the RT time consumption, measured |
| 825 | * in ms. | 801 | * in ms. |
| 826 | * | 802 | * |
| @@ -1369,6 +1345,12 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec) | |||
| 1369 | lw->inv_weight = 0; | 1345 | lw->inv_weight = 0; |
| 1370 | } | 1346 | } |
| 1371 | 1347 | ||
| 1348 | static inline void update_load_set(struct load_weight *lw, unsigned long w) | ||
| 1349 | { | ||
| 1350 | lw->weight = w; | ||
| 1351 | lw->inv_weight = 0; | ||
| 1352 | } | ||
| 1353 | |||
| 1372 | /* | 1354 | /* |
| 1373 | * To aid in avoiding the subversion of "niceness" due to uneven distribution | 1355 | * To aid in avoiding the subversion of "niceness" due to uneven distribution |
| 1374 | * of tasks with abnormal "nice" values across CPUs the contribution that | 1356 | * of tasks with abnormal "nice" values across CPUs the contribution that |
| @@ -1557,101 +1539,6 @@ static unsigned long cpu_avg_load_per_task(int cpu) | |||
| 1557 | 1539 | ||
| 1558 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1540 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 1559 | 1541 | ||
| 1560 | static __read_mostly unsigned long __percpu *update_shares_data; | ||
| 1561 | |||
| 1562 | static void __set_se_shares(struct sched_entity *se, unsigned long shares); | ||
| 1563 | |||
| 1564 | /* | ||
| 1565 | * Calculate and set the cpu's group shares. | ||
| 1566 | */ | ||
| 1567 | static void update_group_shares_cpu(struct task_group *tg, int cpu, | ||
| 1568 | unsigned long sd_shares, | ||
| 1569 | unsigned long sd_rq_weight, | ||
| 1570 | unsigned long *usd_rq_weight) | ||
| 1571 | { | ||
| 1572 | unsigned long shares, rq_weight; | ||
| 1573 | int boost = 0; | ||
| 1574 | |||
| 1575 | rq_weight = usd_rq_weight[cpu]; | ||
| 1576 | if (!rq_weight) { | ||
| 1577 | boost = 1; | ||
| 1578 | rq_weight = NICE_0_LOAD; | ||
| 1579 | } | ||
| 1580 | |||
| 1581 | /* | ||
| 1582 | * \Sum_j shares_j * rq_weight_i | ||
| 1583 | * shares_i = ----------------------------- | ||
| 1584 | * \Sum_j rq_weight_j | ||
| 1585 | */ | ||
| 1586 | shares = (sd_shares * rq_weight) / sd_rq_weight; | ||
| 1587 | shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); | ||
| 1588 | |||
| 1589 | if (abs(shares - tg->se[cpu]->load.weight) > | ||
| 1590 | sysctl_sched_shares_thresh) { | ||
| 1591 | struct rq *rq = cpu_rq(cpu); | ||
| 1592 | unsigned long flags; | ||
| 1593 | |||
| 1594 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
| 1595 | tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight; | ||
| 1596 | tg->cfs_rq[cpu]->shares = boost ? 0 : shares; | ||
| 1597 | __set_se_shares(tg->se[cpu], shares); | ||
| 1598 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
| 1599 | } | ||
| 1600 | } | ||
| 1601 | |||
| 1602 | /* | ||
| 1603 | * Re-compute the task group their per cpu shares over the given domain. | ||
| 1604 | * This needs to be done in a bottom-up fashion because the rq weight of a | ||
| 1605 | * parent group depends on the shares of its child groups. | ||
| 1606 | */ | ||
| 1607 | static int tg_shares_up(struct task_group *tg, void *data) | ||
| 1608 | { | ||
| 1609 | unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0; | ||
| 1610 | unsigned long *usd_rq_weight; | ||
| 1611 | struct sched_domain *sd = data; | ||
| 1612 | unsigned long flags; | ||
| 1613 | int i; | ||
| 1614 | |||
| 1615 | if (!tg->se[0]) | ||
| 1616 | return 0; | ||
| 1617 | |||
| 1618 | local_irq_save(flags); | ||
| 1619 | usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id()); | ||
| 1620 | |||
| 1621 | for_each_cpu(i, sched_domain_span(sd)) { | ||
| 1622 | weight = tg->cfs_rq[i]->load.weight; | ||
| 1623 | usd_rq_weight[i] = weight; | ||
| 1624 | |||
| 1625 | rq_weight += weight; | ||
| 1626 | /* | ||
| 1627 | * If there are currently no tasks on the cpu pretend there | ||
| 1628 | * is one of average load so that when a new task gets to | ||
| 1629 | * run here it will not get delayed by group starvation. | ||
| 1630 | */ | ||
| 1631 | if (!weight) | ||
| 1632 | weight = NICE_0_LOAD; | ||
| 1633 | |||
| 1634 | sum_weight += weight; | ||
| 1635 | shares += tg->cfs_rq[i]->shares; | ||
| 1636 | } | ||
| 1637 | |||
| 1638 | if (!rq_weight) | ||
| 1639 | rq_weight = sum_weight; | ||
| 1640 | |||
| 1641 | if ((!shares && rq_weight) || shares > tg->shares) | ||
| 1642 | shares = tg->shares; | ||
| 1643 | |||
| 1644 | if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) | ||
| 1645 | shares = tg->shares; | ||
| 1646 | |||
| 1647 | for_each_cpu(i, sched_domain_span(sd)) | ||
| 1648 | update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight); | ||
| 1649 | |||
| 1650 | local_irq_restore(flags); | ||
| 1651 | |||
| 1652 | return 0; | ||
| 1653 | } | ||
| 1654 | |||
| 1655 | /* | 1542 | /* |
| 1656 | * Compute the cpu's hierarchical load factor for each task group. | 1543 | * Compute the cpu's hierarchical load factor for each task group. |
| 1657 | * This needs to be done in a top-down fashion because the load of a child | 1544 | * This needs to be done in a top-down fashion because the load of a child |
| @@ -1666,7 +1553,7 @@ static int tg_load_down(struct task_group *tg, void *data) | |||
| 1666 | load = cpu_rq(cpu)->load.weight; | 1553 | load = cpu_rq(cpu)->load.weight; |
| 1667 | } else { | 1554 | } else { |
| 1668 | load = tg->parent->cfs_rq[cpu]->h_load; | 1555 | load = tg->parent->cfs_rq[cpu]->h_load; |
| 1669 | load *= tg->cfs_rq[cpu]->shares; | 1556 | load *= tg->se[cpu]->load.weight; |
| 1670 | load /= tg->parent->cfs_rq[cpu]->load.weight + 1; | 1557 | load /= tg->parent->cfs_rq[cpu]->load.weight + 1; |
| 1671 | } | 1558 | } |
| 1672 | 1559 | ||
| @@ -1675,34 +1562,11 @@ static int tg_load_down(struct task_group *tg, void *data) | |||
| 1675 | return 0; | 1562 | return 0; |
| 1676 | } | 1563 | } |
| 1677 | 1564 | ||
| 1678 | static void update_shares(struct sched_domain *sd) | ||
| 1679 | { | ||
| 1680 | s64 elapsed; | ||
| 1681 | u64 now; | ||
| 1682 | |||
| 1683 | if (root_task_group_empty()) | ||
| 1684 | return; | ||
| 1685 | |||
| 1686 | now = local_clock(); | ||
| 1687 | elapsed = now - sd->last_update; | ||
| 1688 | |||
| 1689 | if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { | ||
| 1690 | sd->last_update = now; | ||
| 1691 | walk_tg_tree(tg_nop, tg_shares_up, sd); | ||
| 1692 | } | ||
| 1693 | } | ||
| 1694 | |||
| 1695 | static void update_h_load(long cpu) | 1565 | static void update_h_load(long cpu) |
| 1696 | { | 1566 | { |
| 1697 | walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); | 1567 | walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); |
| 1698 | } | 1568 | } |
| 1699 | 1569 | ||
| 1700 | #else | ||
| 1701 | |||
| 1702 | static inline void update_shares(struct sched_domain *sd) | ||
| 1703 | { | ||
| 1704 | } | ||
| 1705 | |||
| 1706 | #endif | 1570 | #endif |
| 1707 | 1571 | ||
| 1708 | #ifdef CONFIG_PREEMPT | 1572 | #ifdef CONFIG_PREEMPT |
| @@ -1824,15 +1688,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2) | |||
| 1824 | 1688 | ||
| 1825 | #endif | 1689 | #endif |
| 1826 | 1690 | ||
| 1827 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 1828 | static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) | ||
| 1829 | { | ||
| 1830 | #ifdef CONFIG_SMP | ||
| 1831 | cfs_rq->shares = shares; | ||
| 1832 | #endif | ||
| 1833 | } | ||
| 1834 | #endif | ||
| 1835 | |||
| 1836 | static void calc_load_account_idle(struct rq *this_rq); | 1691 | static void calc_load_account_idle(struct rq *this_rq); |
| 1837 | static void update_sysctl(void); | 1692 | static void update_sysctl(void); |
| 1838 | static int get_update_sysctl_factor(void); | 1693 | static int get_update_sysctl_factor(void); |
| @@ -1934,10 +1789,9 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) | |||
| 1934 | * They are read and saved off onto struct rq in update_rq_clock(). | 1789 | * They are read and saved off onto struct rq in update_rq_clock(). |
| 1935 | * This may result in other CPU reading this CPU's irq time and can | 1790 | * This may result in other CPU reading this CPU's irq time and can |
| 1936 | * race with irq/account_system_vtime on this CPU. We would either get old | 1791 | * race with irq/account_system_vtime on this CPU. We would either get old |
| 1937 | * or new value (or semi updated value on 32 bit) with a side effect of | 1792 | * or new value with a side effect of accounting a slice of irq time to wrong |
| 1938 | * accounting a slice of irq time to wrong task when irq is in progress | 1793 | * task when irq is in progress while we read rq->clock. That is a worthy |
| 1939 | * while we read rq->clock. That is a worthy compromise in place of having | 1794 | * compromise in place of having locks on each irq in account_system_time. |
| 1940 | * locks on each irq in account_system_time. | ||
| 1941 | */ | 1795 | */ |
| 1942 | static DEFINE_PER_CPU(u64, cpu_hardirq_time); | 1796 | static DEFINE_PER_CPU(u64, cpu_hardirq_time); |
| 1943 | static DEFINE_PER_CPU(u64, cpu_softirq_time); | 1797 | static DEFINE_PER_CPU(u64, cpu_softirq_time); |
| @@ -1955,19 +1809,58 @@ void disable_sched_clock_irqtime(void) | |||
| 1955 | sched_clock_irqtime = 0; | 1809 | sched_clock_irqtime = 0; |
| 1956 | } | 1810 | } |
| 1957 | 1811 | ||
| 1958 | static u64 irq_time_cpu(int cpu) | 1812 | #ifndef CONFIG_64BIT |
| 1813 | static DEFINE_PER_CPU(seqcount_t, irq_time_seq); | ||
| 1814 | |||
| 1815 | static inline void irq_time_write_begin(void) | ||
| 1959 | { | 1816 | { |
| 1960 | if (!sched_clock_irqtime) | 1817 | __this_cpu_inc(irq_time_seq.sequence); |
| 1961 | return 0; | 1818 | smp_wmb(); |
| 1819 | } | ||
| 1820 | |||
| 1821 | static inline void irq_time_write_end(void) | ||
| 1822 | { | ||
| 1823 | smp_wmb(); | ||
| 1824 | __this_cpu_inc(irq_time_seq.sequence); | ||
| 1825 | } | ||
| 1826 | |||
| 1827 | static inline u64 irq_time_read(int cpu) | ||
| 1828 | { | ||
| 1829 | u64 irq_time; | ||
| 1830 | unsigned seq; | ||
| 1831 | |||
| 1832 | do { | ||
| 1833 | seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); | ||
| 1834 | irq_time = per_cpu(cpu_softirq_time, cpu) + | ||
| 1835 | per_cpu(cpu_hardirq_time, cpu); | ||
| 1836 | } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq)); | ||
| 1962 | 1837 | ||
| 1838 | return irq_time; | ||
| 1839 | } | ||
| 1840 | #else /* CONFIG_64BIT */ | ||
| 1841 | static inline void irq_time_write_begin(void) | ||
| 1842 | { | ||
| 1843 | } | ||
| 1844 | |||
| 1845 | static inline void irq_time_write_end(void) | ||
| 1846 | { | ||
| 1847 | } | ||
| 1848 | |||
| 1849 | static inline u64 irq_time_read(int cpu) | ||
| 1850 | { | ||
| 1963 | return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); | 1851 | return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); |
| 1964 | } | 1852 | } |
| 1853 | #endif /* CONFIG_64BIT */ | ||
| 1965 | 1854 | ||
| 1855 | /* | ||
| 1856 | * Called before incrementing preempt_count on {soft,}irq_enter | ||
| 1857 | * and before decrementing preempt_count on {soft,}irq_exit. | ||
| 1858 | */ | ||
| 1966 | void account_system_vtime(struct task_struct *curr) | 1859 | void account_system_vtime(struct task_struct *curr) |
| 1967 | { | 1860 | { |
| 1968 | unsigned long flags; | 1861 | unsigned long flags; |
| 1862 | s64 delta; | ||
| 1969 | int cpu; | 1863 | int cpu; |
| 1970 | u64 now, delta; | ||
| 1971 | 1864 | ||
| 1972 | if (!sched_clock_irqtime) | 1865 | if (!sched_clock_irqtime) |
| 1973 | return; | 1866 | return; |
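Editor's note: on 32-bit a u64 can be observed half-written, so readers of the irq-time counters now retry under a sequence counter while the writer bumps the sequence around each update; 64-bit builds skip all of this because an aligned 64-bit load is atomic there. The generic shape of the pattern, using the standard seqcount API (the names below are illustrative):

        #include <linux/seqlock.h>

        static seqcount_t stat_seq;     /* zero-initialised sequence counter */
        static u64 stat_value;

        static void stat_write(u64 v)   /* single writer, e.g. per-cpu, irqs off */
        {
                write_seqcount_begin(&stat_seq);
                stat_value = v;
                write_seqcount_end(&stat_seq);
        }

        static u64 stat_read(void)
        {
                unsigned seq;
                u64 v;

                do {
                        seq = read_seqcount_begin(&stat_seq);
                        v = stat_value;
                } while (read_seqcount_retry(&stat_seq, seq));
                return v;
        }

The patch open-codes the write side with __this_cpu_inc() plus smp_wmb() because the sequence counter is per-cpu and the writer already runs with irqs disabled.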
| @@ -1975,9 +1868,10 @@ void account_system_vtime(struct task_struct *curr) | |||
| 1975 | local_irq_save(flags); | 1868 | local_irq_save(flags); |
| 1976 | 1869 | ||
| 1977 | cpu = smp_processor_id(); | 1870 | cpu = smp_processor_id(); |
| 1978 | now = sched_clock_cpu(cpu); | 1871 | delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); |
| 1979 | delta = now - per_cpu(irq_start_time, cpu); | 1872 | __this_cpu_add(irq_start_time, delta); |
| 1980 | per_cpu(irq_start_time, cpu) = now; | 1873 | |
| 1874 | irq_time_write_begin(); | ||
| 1981 | /* | 1875 | /* |
| 1982 | * We do not account for softirq time from ksoftirqd here. | 1876 | * We do not account for softirq time from ksoftirqd here. |
| 1983 | * We want to continue accounting softirq time to ksoftirqd thread | 1877 | * We want to continue accounting softirq time to ksoftirqd thread |
| @@ -1985,37 +1879,60 @@ void account_system_vtime(struct task_struct *curr) | |||
| 1985 | * that do not consume any time, but still wants to run. | 1879 | * that do not consume any time, but still wants to run. |
| 1986 | */ | 1880 | */ |
| 1987 | if (hardirq_count()) | 1881 | if (hardirq_count()) |
| 1988 | per_cpu(cpu_hardirq_time, cpu) += delta; | 1882 | __this_cpu_add(cpu_hardirq_time, delta); |
| 1989 | else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD)) | 1883 | else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD)) |
| 1990 | per_cpu(cpu_softirq_time, cpu) += delta; | 1884 | __this_cpu_add(cpu_softirq_time, delta); |
| 1991 | 1885 | ||
| 1886 | irq_time_write_end(); | ||
| 1992 | local_irq_restore(flags); | 1887 | local_irq_restore(flags); |
| 1993 | } | 1888 | } |
| 1994 | EXPORT_SYMBOL_GPL(account_system_vtime); | 1889 | EXPORT_SYMBOL_GPL(account_system_vtime); |
| 1995 | 1890 | ||
| 1996 | static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) | 1891 | static void update_rq_clock_task(struct rq *rq, s64 delta) |
| 1997 | { | 1892 | { |
| 1998 | if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) { | 1893 | s64 irq_delta; |
| 1999 | u64 delta_irq = curr_irq_time - rq->prev_irq_time; | 1894 | |
| 2000 | rq->prev_irq_time = curr_irq_time; | 1895 | irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; |
| 2001 | sched_rt_avg_update(rq, delta_irq); | 1896 | |
| 2002 | } | 1897 | /* |
| 1898 | * Since irq_time is only updated on {soft,}irq_exit, we might run into | ||
| 1899 | * this case when a previous update_rq_clock() happened inside a | ||
| 1900 | * {soft,}irq region. | ||
| 1901 | * | ||
| 1902 | * When this happens, we stop ->clock_task and only update the | ||
| 1903 | * prev_irq_time stamp to account for the part that fit, so that a next | ||
| 1904 | * update will consume the rest. This ensures ->clock_task is | ||
| 1905 | * monotonic. | ||
| 1906 | * | ||
| 1907 | * It does however cause some slight miss-attribution of {soft,}irq | ||
| 1908 | * time, a more accurate solution would be to update the irq_time using | ||
| 1909 | * the current rq->clock timestamp, except that would require using | ||
| 1910 | * atomic ops. | ||
| 1911 | */ | ||
| 1912 | if (irq_delta > delta) | ||
| 1913 | irq_delta = delta; | ||
| 1914 | |||
| 1915 | rq->prev_irq_time += irq_delta; | ||
| 1916 | delta -= irq_delta; | ||
| 1917 | rq->clock_task += delta; | ||
| 1918 | |||
| 1919 | if (irq_delta && sched_feat(NONIRQ_POWER)) | ||
| 1920 | sched_rt_avg_update(rq, irq_delta); | ||
| 2003 | } | 1921 | } |
| 2004 | 1922 | ||
| 2005 | #else | 1923 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ |
| 2006 | 1924 | ||
| 2007 | static u64 irq_time_cpu(int cpu) | 1925 | static void update_rq_clock_task(struct rq *rq, s64 delta) |
| 2008 | { | 1926 | { |
| 2009 | return 0; | 1927 | rq->clock_task += delta; |
| 2010 | } | 1928 | } |
| 2011 | 1929 | ||
| 2012 | static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { } | 1930 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ |
| 2013 | |||
| 2014 | #endif | ||
| 2015 | 1931 | ||
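A small numeric walk-through of the clamp in update_rq_clock_task() may help: because irq_time is only folded in at {soft,}irq exit, irq_delta can momentarily exceed the wall-clock delta, and the clamp defers the excess to the next update so ->clock_task never moves backwards. A minimal user-space sketch of the same arithmetic, on a toy struct with invented numbers:

    #include <stdio.h>

    typedef long long s64;

    struct rq_sim { s64 clock_task, prev_irq_time; };

    /* Same arithmetic as update_rq_clock_task(), outside the kernel. */
    static void update_clock_task(struct rq_sim *rq, s64 delta, s64 irq_time_now)
    {
        s64 irq_delta = irq_time_now - rq->prev_irq_time;

        if (irq_delta > delta)          /* irq_time ran ahead of rq->clock */
            irq_delta = delta;          /* clamp: clock_task never goes backwards */

        rq->prev_irq_time += irq_delta; /* remember what we consumed */
        rq->clock_task    += delta - irq_delta;
    }

    int main(void)
    {
        struct rq_sim rq = { 0, 0 };

        /* 1st update: 1000ns of wall time, 1200ns of irq time already folded. */
        update_clock_task(&rq, 1000, 1200);     /* clock_task += 0, 200ns deferred */
        /* 2nd update: another 1000ns of wall time, no new irq time. */
        update_clock_task(&rq, 1000, 1200);     /* clock_task += 800 */
        printf("clock_task=%lld prev_irq_time=%lld\n",
               rq.clock_task, rq.prev_irq_time);        /* prints 800 and 1200 */
        return 0;
    }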
| 2016 | #include "sched_idletask.c" | 1932 | #include "sched_idletask.c" |
| 2017 | #include "sched_fair.c" | 1933 | #include "sched_fair.c" |
| 2018 | #include "sched_rt.c" | 1934 | #include "sched_rt.c" |
| 1935 | #include "sched_autogroup.c" | ||
| 2019 | #include "sched_stoptask.c" | 1936 | #include "sched_stoptask.c" |
| 2020 | #ifdef CONFIG_SCHED_DEBUG | 1937 | #ifdef CONFIG_SCHED_DEBUG |
| 2021 | # include "sched_debug.c" | 1938 | # include "sched_debug.c" |
| @@ -2118,6 +2035,31 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, | |||
| 2118 | p->sched_class->prio_changed(rq, p, oldprio, running); | 2035 | p->sched_class->prio_changed(rq, p, oldprio, running); |
| 2119 | } | 2036 | } |
| 2120 | 2037 | ||
| 2038 | static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) | ||
| 2039 | { | ||
| 2040 | const struct sched_class *class; | ||
| 2041 | |||
| 2042 | if (p->sched_class == rq->curr->sched_class) { | ||
| 2043 | rq->curr->sched_class->check_preempt_curr(rq, p, flags); | ||
| 2044 | } else { | ||
| 2045 | for_each_class(class) { | ||
| 2046 | if (class == rq->curr->sched_class) | ||
| 2047 | break; | ||
| 2048 | if (class == p->sched_class) { | ||
| 2049 | resched_task(rq->curr); | ||
| 2050 | break; | ||
| 2051 | } | ||
| 2052 | } | ||
| 2053 | } | ||
| 2054 | |||
| 2055 | /* | ||
| 2056 | * A queue event has occurred, and we're going to schedule. In | ||
| 2057 | * this case, we can save a useless back to back clock update. | ||
| 2058 | */ | ||
| 2059 | if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr)) | ||
| 2060 | rq->skip_clock_update = 1; | ||
| 2061 | } | ||
| 2062 | |||
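The for_each_class() walk in the new check_preempt_curr() relies on the scheduling classes being linked in priority order (stop, rt, fair, idle in this kernel), so reaching rq->curr's class first proves the waking task cannot preempt it. A toy user-space model of that decision; the class ordering is an assumption about the rest of sched.c, not something this hunk shows:

    #include <stdio.h>

    /* Toy stand-in for the ->next-linked sched_class list. */
    enum sched_cls { CLS_STOP, CLS_RT, CLS_FAIR, CLS_IDLE, CLS_NR };

    static int wakeup_preempts(enum sched_cls curr, enum sched_cls woken)
    {
        enum sched_cls c;

        if (curr == woken)
            return -1;      /* same class: defer to that class's check_preempt_curr() */

        for (c = CLS_STOP; c < CLS_NR; c++) {
            if (c == curr)
                return 0;   /* current task's class ranks higher: no resched */
            if (c == woken)
                return 1;   /* woken task's class ranks higher: resched curr */
        }
        return 0;
    }

    int main(void)
    {
        printf("%d\n", wakeup_preempts(CLS_FAIR, CLS_RT));     /* 1: RT preempts CFS */
        printf("%d\n", wakeup_preempts(CLS_RT, CLS_FAIR));     /* 0: CFS never preempts RT */
        return 0;
    }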
| 2121 | #ifdef CONFIG_SMP | 2063 | #ifdef CONFIG_SMP |
| 2122 | /* | 2064 | /* |
| 2123 | * Is this task likely cache-hot: | 2065 | * Is this task likely cache-hot: |
| @@ -2183,10 +2125,8 @@ static int migration_cpu_stop(void *data); | |||
| 2183 | * The task's runqueue lock must be held. | 2125 | * The task's runqueue lock must be held. |
| 2184 | * Returns true if you have to wait for migration thread. | 2126 | * Returns true if you have to wait for migration thread. |
| 2185 | */ | 2127 | */ |
| 2186 | static bool migrate_task(struct task_struct *p, int dest_cpu) | 2128 | static bool migrate_task(struct task_struct *p, struct rq *rq) |
| 2187 | { | 2129 | { |
| 2188 | struct rq *rq = task_rq(p); | ||
| 2189 | |||
| 2190 | /* | 2130 | /* |
| 2191 | * If the task is not on a runqueue (and not running), then | 2131 | * If the task is not on a runqueue (and not running), then |
| 2192 | * the next wake-up will properly place the task. | 2132 | * the next wake-up will properly place the task. |
| @@ -2366,18 +2306,15 @@ static int select_fallback_rq(int cpu, struct task_struct *p) | |||
| 2366 | return dest_cpu; | 2306 | return dest_cpu; |
| 2367 | 2307 | ||
| 2368 | /* No more Mr. Nice Guy. */ | 2308 | /* No more Mr. Nice Guy. */ |
| 2369 | if (unlikely(dest_cpu >= nr_cpu_ids)) { | 2309 | dest_cpu = cpuset_cpus_allowed_fallback(p); |
| 2370 | dest_cpu = cpuset_cpus_allowed_fallback(p); | 2310 | /* |
| 2371 | /* | 2311 | * Don't tell them about moving exiting tasks or |
| 2372 | * Don't tell them about moving exiting tasks or | 2312 | * kernel threads (both mm NULL), since they never |
| 2373 | * kernel threads (both mm NULL), since they never | 2313 | * leave kernel. |
| 2374 | * leave kernel. | 2314 | */ |
| 2375 | */ | 2315 | if (p->mm && printk_ratelimit()) { |
| 2376 | if (p->mm && printk_ratelimit()) { | 2316 | printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n", |
| 2377 | printk(KERN_INFO "process %d (%s) no " | 2317 | task_pid_nr(p), p->comm, cpu); |
| 2378 | "longer affine to cpu%d\n", | ||
| 2379 | task_pid_nr(p), p->comm, cpu); | ||
| 2380 | } | ||
| 2381 | } | 2318 | } |
| 2382 | 2319 | ||
| 2383 | return dest_cpu; | 2320 | return dest_cpu; |
| @@ -2568,7 +2505,7 @@ out: | |||
| 2568 | * try_to_wake_up_local - try to wake up a local task with rq lock held | 2505 | * try_to_wake_up_local - try to wake up a local task with rq lock held |
| 2569 | * @p: the thread to be awakened | 2506 | * @p: the thread to be awakened |
| 2570 | * | 2507 | * |
| 2571 | * Put @p on the run-queue if it's not alredy there. The caller must | 2508 | * Put @p on the run-queue if it's not already there. The caller must |
| 2572 | * ensure that this_rq() is locked, @p is bound to this_rq() and not | 2509 | * ensure that this_rq() is locked, @p is bound to this_rq() and not |
| 2573 | * the current task. this_rq() stays locked over invocation. | 2510 | * the current task. this_rq() stays locked over invocation. |
| 2574 | */ | 2511 | */ |
| @@ -2713,7 +2650,9 @@ void sched_fork(struct task_struct *p, int clone_flags) | |||
| 2713 | /* Want to start with kernel preemption disabled. */ | 2650 | /* Want to start with kernel preemption disabled. */ |
| 2714 | task_thread_info(p)->preempt_count = 1; | 2651 | task_thread_info(p)->preempt_count = 1; |
| 2715 | #endif | 2652 | #endif |
| 2653 | #ifdef CONFIG_SMP | ||
| 2716 | plist_node_init(&p->pushable_tasks, MAX_PRIO); | 2654 | plist_node_init(&p->pushable_tasks, MAX_PRIO); |
| 2655 | #endif | ||
| 2717 | 2656 | ||
| 2718 | put_cpu(); | 2657 | put_cpu(); |
| 2719 | } | 2658 | } |
| @@ -3104,6 +3043,15 @@ static long calc_load_fold_active(struct rq *this_rq) | |||
| 3104 | return delta; | 3043 | return delta; |
| 3105 | } | 3044 | } |
| 3106 | 3045 | ||
| 3046 | static unsigned long | ||
| 3047 | calc_load(unsigned long load, unsigned long exp, unsigned long active) | ||
| 3048 | { | ||
| 3049 | load *= exp; | ||
| 3050 | load += active * (FIXED_1 - exp); | ||
| 3051 | load += 1UL << (FSHIFT - 1); | ||
| 3052 | return load >> FSHIFT; | ||
| 3053 | } | ||
| 3054 | |||
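calc_load() above is a fixed-point exponential moving average; the added `load += 1UL << (FSHIFT - 1)` term contributes half a unit so the final shift rounds to nearest instead of truncating. A quick stand-alone check, assuming the usual constants from <linux/sched.h> (FSHIFT = 11, FIXED_1 = 2048, EXP_1 = 1884 for the 1-minute average):

    #include <stdio.h>

    #define FSHIFT  11
    #define FIXED_1 (1UL << FSHIFT)         /* 1.0 in fixed point */
    #define EXP_1   1884                    /* assumed: 1/exp(5s/1min) in fixed point */

    static unsigned long calc_load(unsigned long load, unsigned long exp,
                                   unsigned long active)
    {
        load *= exp;
        load += active * (FIXED_1 - exp);
        load += 1UL << (FSHIFT - 1);        /* round to nearest on the shift below */
        return load >> FSHIFT;
    }

    int main(void)
    {
        /* Start with a 1-minute loadavg of 0.50 and 3 runnable tasks. */
        unsigned long avg = FIXED_1 / 2;            /* 1024 == 0.50 */
        unsigned long active = 3 * FIXED_1;         /* 6144 */

        avg = calc_load(avg, EXP_1, active);
        printf("%lu.%02lu\n", avg >> FSHIFT,
               (avg & (FIXED_1 - 1)) * 100 / FIXED_1);  /* ~0.70 after one 5s interval */
        return 0;
    }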
| 3107 | #ifdef CONFIG_NO_HZ | 3055 | #ifdef CONFIG_NO_HZ |
| 3108 | /* | 3056 | /* |
| 3109 | * For NO_HZ we delay the active fold to the next LOAD_FREQ update. | 3057 | * For NO_HZ we delay the active fold to the next LOAD_FREQ update. |
| @@ -3133,6 +3081,128 @@ static long calc_load_fold_idle(void) | |||
| 3133 | 3081 | ||
| 3134 | return delta; | 3082 | return delta; |
| 3135 | } | 3083 | } |
| 3084 | |||
| 3085 | /** | ||
| 3086 | * fixed_power_int - compute: x^n, in O(log n) time | ||
| 3087 | * | ||
| 3088 | * @x: base of the power | ||
| 3089 | * @frac_bits: fractional bits of @x | ||
| 3090 | * @n: power to raise @x to. | ||
| 3091 | * | ||
| 3092 | * By exploiting the relation between the definition of the natural power | ||
| 3093 | * function: x^n := x*x*...*x (x multiplied by itself for n times), and | ||
| 3094 | * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, | ||
| 3095 | * (where: n_i \elem {0, 1}, the binary vector representing n), | ||
| 3096 | * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is | ||
| 3097 | * of course trivially computable in O(log_2 n), the length of our binary | ||
| 3098 | * vector. | ||
| 3099 | */ | ||
| 3100 | static unsigned long | ||
| 3101 | fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) | ||
| 3102 | { | ||
| 3103 | unsigned long result = 1UL << frac_bits; | ||
| 3104 | |||
| 3105 | if (n) for (;;) { | ||
| 3106 | if (n & 1) { | ||
| 3107 | result *= x; | ||
| 3108 | result += 1UL << (frac_bits - 1); | ||
| 3109 | result >>= frac_bits; | ||
| 3110 | } | ||
| 3111 | n >>= 1; | ||
| 3112 | if (!n) | ||
| 3113 | break; | ||
| 3114 | x *= x; | ||
| 3115 | x += 1UL << (frac_bits - 1); | ||
| 3116 | x >>= frac_bits; | ||
| 3117 | } | ||
| 3118 | |||
| 3119 | return result; | ||
| 3120 | } | ||
| 3121 | |||
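fixed_power_int() is exponentiation by squaring carried out on frac_bits fixed-point values, rounding at every step; calc_load_n() uses it to decay the load averages by n missed LOAD_FREQ intervals in one go. A quick stand-alone comparison against n single-step decays, using the same assumed constants as the previous sketch; the two results differ only by a few counts of accumulated rounding:

    #include <stdio.h>

    #define FSHIFT  11
    #define FIXED_1 (1UL << FSHIFT)
    #define EXP_1   1884                    /* assumed kernel constant */

    /* Same algorithm as the kernel's fixed_power_int(): x^n by squaring. */
    static unsigned long fixed_power_int(unsigned long x, unsigned int frac_bits,
                                         unsigned int n)
    {
        unsigned long result = 1UL << frac_bits;    /* 1.0 */

        while (n) {
            if (n & 1) {
                result *= x;
                result += 1UL << (frac_bits - 1);
                result >>= frac_bits;
            }
            n >>= 1;
            if (!n)
                break;
            x *= x;
            x += 1UL << (frac_bits - 1);
            x >>= frac_bits;
        }
        return result;
    }

    int main(void)
    {
        unsigned long step = FIXED_1, i;

        /* EXP_1^12 via 12 rounded single-step multiplies... */
        for (i = 0; i < 12; i++)
            step = (step * EXP_1 + (1UL << (FSHIFT - 1))) >> FSHIFT;

        /* ...and via the O(log n) routine; both approximate the same decay. */
        printf("%lu %lu\n", step, fixed_power_int(EXP_1, FSHIFT, 12));
        return 0;
    }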
| 3122 | /* | ||
| 3123 | * a1 = a0 * e + a * (1 - e) | ||
| 3124 | * | ||
| 3125 | * a2 = a1 * e + a * (1 - e) | ||
| 3126 | * = (a0 * e + a * (1 - e)) * e + a * (1 - e) | ||
| 3127 | * = a0 * e^2 + a * (1 - e) * (1 + e) | ||
| 3128 | * | ||
| 3129 | * a3 = a2 * e + a * (1 - e) | ||
| 3130 | * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) | ||
| 3131 | * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) | ||
| 3132 | * | ||
| 3133 | * ... | ||
| 3134 | * | ||
| 3135 | * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^(n-1)) [1] | ||
| 3136 | * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) | ||
| 3137 | * = a0 * e^n + a * (1 - e^n) | ||
| 3138 | * | ||
| 3139 | * [1] application of the geometric series: | ||
| 3140 | * | ||
| 3141 | * n 1 - x^(n+1) | ||
| 3142 | * S_n := \Sum x^i = ------------- | ||
| 3143 | * i=0 1 - x | ||
| 3144 | */ | ||
| 3145 | static unsigned long | ||
| 3146 | calc_load_n(unsigned long load, unsigned long exp, | ||
| 3147 | unsigned long active, unsigned int n) | ||
| 3148 | { | ||
| 3149 | |||
| 3150 | return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); | ||
| 3151 | } | ||
| 3152 | |||
| 3153 | /* | ||
| 3154 | * NO_HZ can leave us missing all the per-cpu ticks that would call | ||
| 3155 | * calc_load_account_active(), but since an idle CPU folds its delta into | ||
| 3156 | * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold | ||
| 3157 | * in the pending idle delta if our idle period crossed a load cycle boundary. | ||
| 3158 | * | ||
| 3159 | * Once we've updated the global active value, we need to apply the exponential | ||
| 3160 | * weights adjusted to the number of cycles missed. | ||
| 3161 | */ | ||
| 3162 | static void calc_global_nohz(unsigned long ticks) | ||
| 3163 | { | ||
| 3164 | long delta, active, n; | ||
| 3165 | |||
| 3166 | if (time_before(jiffies, calc_load_update)) | ||
| 3167 | return; | ||
| 3168 | |||
| 3169 | /* | ||
| 3170 | * If we crossed a calc_load_update boundary, make sure to fold | ||
| 3171 | * any pending idle changes, the respective CPUs might have | ||
| 3172 | * missed the tick driven calc_load_account_active() update | ||
| 3173 | * due to NO_HZ. | ||
| 3174 | */ | ||
| 3175 | delta = calc_load_fold_idle(); | ||
| 3176 | if (delta) | ||
| 3177 | atomic_long_add(delta, &calc_load_tasks); | ||
| 3178 | |||
| 3179 | /* | ||
| 3180 | * If we were idle for multiple load cycles, apply them. | ||
| 3181 | */ | ||
| 3182 | if (ticks >= LOAD_FREQ) { | ||
| 3183 | n = ticks / LOAD_FREQ; | ||
| 3184 | |||
| 3185 | active = atomic_long_read(&calc_load_tasks); | ||
| 3186 | active = active > 0 ? active * FIXED_1 : 0; | ||
| 3187 | |||
| 3188 | avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); | ||
| 3189 | avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); | ||
| 3190 | avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); | ||
| 3191 | |||
| 3192 | calc_load_update += n * LOAD_FREQ; | ||
| 3193 | } | ||
| 3194 | |||
| 3195 | /* | ||
| 3196 | * It's possible the remainder of the above division also crosses | ||
| 3197 | * a LOAD_FREQ period; the regular check in calc_global_load(), | ||
| 3198 | * which comes after this, will take care of that. | ||
| 3199 | * | ||
| 3200 | * Consider us being 11 ticks before a cycle completion, and us | ||
| 3201 | * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will | ||
| 3202 | * age us 4 cycles, and the test in calc_global_load() will | ||
| 3203 | * pick up the final one. | ||
| 3204 | */ | ||
| 3205 | } | ||
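To make the closing example concrete: with HZ = 100, the kernel's LOAD_FREQ is 5*HZ + 1 = 501 ticks (an assumed value of the constant in <linux/sched.h>, not shown in this hunk). Sleeping 4*LOAD_FREQ + 22 ticks while 11 ticks short of a boundary gives n = 4 above, and the 22 left-over ticks carry jiffies past one more boundary, which the ordinary calc_global_load() check then handles:

    #include <stdio.h>

    #define HZ          100
    #define LOAD_FREQ   (5 * HZ + 1)        /* assumed value of the kernel constant */

    int main(void)
    {
        unsigned long ticks = 4 * LOAD_FREQ + 22;   /* idle time, in ticks */
        unsigned long short_of_boundary = 11;       /* ticks left when we went idle */

        unsigned long n = ticks / LOAD_FREQ;        /* cycles aged by calc_global_nohz(): 4 */
        unsigned long rem = ticks % LOAD_FREQ;      /* 22 ticks left over */

        /* The remainder overshoots the boundary we were 11 ticks short of,
         * so calc_global_load() picks up the final cycle on its own. */
        printf("n=%lu rem=%lu final_cycle=%s\n", n, rem,
               rem > short_of_boundary ? "yes" : "no");
        return 0;
    }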
| 3136 | #else | 3206 | #else |
| 3137 | static void calc_load_account_idle(struct rq *this_rq) | 3207 | static void calc_load_account_idle(struct rq *this_rq) |
| 3138 | { | 3208 | { |
| @@ -3142,6 +3212,10 @@ static inline long calc_load_fold_idle(void) | |||
| 3142 | { | 3212 | { |
| 3143 | return 0; | 3213 | return 0; |
| 3144 | } | 3214 | } |
| 3215 | |||
| 3216 | static void calc_global_nohz(unsigned long ticks) | ||
| 3217 | { | ||
| 3218 | } | ||
| 3145 | #endif | 3219 | #endif |
| 3146 | 3220 | ||
| 3147 | /** | 3221 | /** |
| @@ -3159,24 +3233,17 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift) | |||
| 3159 | loads[2] = (avenrun[2] + offset) << shift; | 3233 | loads[2] = (avenrun[2] + offset) << shift; |
| 3160 | } | 3234 | } |
| 3161 | 3235 | ||
| 3162 | static unsigned long | ||
| 3163 | calc_load(unsigned long load, unsigned long exp, unsigned long active) | ||
| 3164 | { | ||
| 3165 | load *= exp; | ||
| 3166 | load += active * (FIXED_1 - exp); | ||
| 3167 | return load >> FSHIFT; | ||
| 3168 | } | ||
| 3169 | |||
| 3170 | /* | 3236 | /* |
| 3171 | * calc_load - update the avenrun load estimates 10 ticks after the | 3237 | * calc_load - update the avenrun load estimates 10 ticks after the |
| 3172 | * CPUs have updated calc_load_tasks. | 3238 | * CPUs have updated calc_load_tasks. |
| 3173 | */ | 3239 | */ |
| 3174 | void calc_global_load(void) | 3240 | void calc_global_load(unsigned long ticks) |
| 3175 | { | 3241 | { |
| 3176 | unsigned long upd = calc_load_update + 10; | ||
| 3177 | long active; | 3242 | long active; |
| 3178 | 3243 | ||
| 3179 | if (time_before(jiffies, upd)) | 3244 | calc_global_nohz(ticks); |
| 3245 | |||
| 3246 | if (time_before(jiffies, calc_load_update + 10)) | ||
| 3180 | return; | 3247 | return; |
| 3181 | 3248 | ||
| 3182 | active = atomic_long_read(&calc_load_tasks); | 3249 | active = atomic_long_read(&calc_load_tasks); |
| @@ -3349,7 +3416,7 @@ void sched_exec(void) | |||
| 3349 | * select_task_rq() can race against ->cpus_allowed | 3416 | * select_task_rq() can race against ->cpus_allowed |
| 3350 | */ | 3417 | */ |
| 3351 | if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) && | 3418 | if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) && |
| 3352 | likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) { | 3419 | likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) { |
| 3353 | struct migration_arg arg = { p, dest_cpu }; | 3420 | struct migration_arg arg = { p, dest_cpu }; |
| 3354 | 3421 | ||
| 3355 | task_rq_unlock(rq, &flags); | 3422 | task_rq_unlock(rq, &flags); |
| @@ -3820,7 +3887,7 @@ static inline void schedule_debug(struct task_struct *prev) | |||
| 3820 | schedstat_inc(this_rq(), sched_count); | 3887 | schedstat_inc(this_rq(), sched_count); |
| 3821 | #ifdef CONFIG_SCHEDSTATS | 3888 | #ifdef CONFIG_SCHEDSTATS |
| 3822 | if (unlikely(prev->lock_depth >= 0)) { | 3889 | if (unlikely(prev->lock_depth >= 0)) { |
| 3823 | schedstat_inc(this_rq(), bkl_count); | 3890 | schedstat_inc(this_rq(), rq_sched_info.bkl_count); |
| 3824 | schedstat_inc(prev, sched_info.bkl_count); | 3891 | schedstat_inc(prev, sched_info.bkl_count); |
| 3825 | } | 3892 | } |
| 3826 | #endif | 3893 | #endif |
| @@ -3830,7 +3897,6 @@ static void put_prev_task(struct rq *rq, struct task_struct *prev) | |||
| 3830 | { | 3897 | { |
| 3831 | if (prev->se.on_rq) | 3898 | if (prev->se.on_rq) |
| 3832 | update_rq_clock(rq); | 3899 | update_rq_clock(rq); |
| 3833 | rq->skip_clock_update = 0; | ||
| 3834 | prev->sched_class->put_prev_task(rq, prev); | 3900 | prev->sched_class->put_prev_task(rq, prev); |
| 3835 | } | 3901 | } |
| 3836 | 3902 | ||
| @@ -3888,7 +3954,6 @@ need_resched_nonpreemptible: | |||
| 3888 | hrtick_clear(rq); | 3954 | hrtick_clear(rq); |
| 3889 | 3955 | ||
| 3890 | raw_spin_lock_irq(&rq->lock); | 3956 | raw_spin_lock_irq(&rq->lock); |
| 3891 | clear_tsk_need_resched(prev); | ||
| 3892 | 3957 | ||
| 3893 | switch_count = &prev->nivcsw; | 3958 | switch_count = &prev->nivcsw; |
| 3894 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { | 3959 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { |
| @@ -3920,6 +3985,8 @@ need_resched_nonpreemptible: | |||
| 3920 | 3985 | ||
| 3921 | put_prev_task(rq, prev); | 3986 | put_prev_task(rq, prev); |
| 3922 | next = pick_next_task(rq); | 3987 | next = pick_next_task(rq); |
| 3988 | clear_tsk_need_resched(prev); | ||
| 3989 | rq->skip_clock_update = 0; | ||
| 3923 | 3990 | ||
| 3924 | if (likely(prev != next)) { | 3991 | if (likely(prev != next)) { |
| 3925 | sched_info_switch(prev, next); | 3992 | sched_info_switch(prev, next); |
| @@ -4014,7 +4081,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) | |||
| 4014 | if (task_thread_info(rq->curr) != owner || need_resched()) | 4081 | if (task_thread_info(rq->curr) != owner || need_resched()) |
| 4015 | return 0; | 4082 | return 0; |
| 4016 | 4083 | ||
| 4017 | cpu_relax(); | 4084 | arch_mutex_cpu_relax(); |
| 4018 | } | 4085 | } |
| 4019 | 4086 | ||
| 4020 | return 1; | 4087 | return 1; |
| @@ -4326,7 +4393,7 @@ EXPORT_SYMBOL(wait_for_completion_interruptible); | |||
| 4326 | * This waits for either a completion of a specific task to be signaled or for a | 4393 | * This waits for either a completion of a specific task to be signaled or for a |
| 4327 | * specified timeout to expire. It is interruptible. The timeout is in jiffies. | 4394 | * specified timeout to expire. It is interruptible. The timeout is in jiffies. |
| 4328 | */ | 4395 | */ |
| 4329 | unsigned long __sched | 4396 | long __sched |
| 4330 | wait_for_completion_interruptible_timeout(struct completion *x, | 4397 | wait_for_completion_interruptible_timeout(struct completion *x, |
| 4331 | unsigned long timeout) | 4398 | unsigned long timeout) |
| 4332 | { | 4399 | { |
| @@ -4359,7 +4426,7 @@ EXPORT_SYMBOL(wait_for_completion_killable); | |||
| 4359 | * signaled or for a specified timeout to expire. It can be | 4426 | * signaled or for a specified timeout to expire. It can be |
| 4360 | * interrupted by a kill signal. The timeout is in jiffies. | 4427 | * interrupted by a kill signal. The timeout is in jiffies. |
| 4361 | */ | 4428 | */ |
| 4362 | unsigned long __sched | 4429 | long __sched |
| 4363 | wait_for_completion_killable_timeout(struct completion *x, | 4430 | wait_for_completion_killable_timeout(struct completion *x, |
| 4364 | unsigned long timeout) | 4431 | unsigned long timeout) |
| 4365 | { | 4432 | { |
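The `unsigned long` to `long` change in these two prototypes matters to callers: the helpers return -ERESTARTSYS when interrupted, 0 on timeout, and the remaining jiffies otherwise, so a signedness-blind `if (ret > 0)` test silently mishandles the error case under the old prototype. A hedged caller-side sketch; the device structure and function names are invented, only the return-value convention follows from the hunk:

    #include <linux/completion.h>
    #include <linux/jiffies.h>
    #include <linux/errno.h>

    /* Hypothetical driver state; 'my_dev' is not from this commit. */
    struct my_dev {
        struct completion ready;
    };

    static int my_dev_wait_ready(struct my_dev *dev)
    {
        long ret;

        ret = wait_for_completion_interruptible_timeout(&dev->ready,
                                                        msecs_to_jiffies(500));
        if (ret == 0)
            return -ETIMEDOUT;      /* timed out, nobody called complete() */
        if (ret < 0)
            return ret;             /* -ERESTARTSYS: interrupted by a signal */

        return 0;                   /* completed with 'ret' jiffies to spare */
    }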
| @@ -4701,7 +4768,7 @@ static bool check_same_owner(struct task_struct *p) | |||
| 4701 | } | 4768 | } |
| 4702 | 4769 | ||
| 4703 | static int __sched_setscheduler(struct task_struct *p, int policy, | 4770 | static int __sched_setscheduler(struct task_struct *p, int policy, |
| 4704 | struct sched_param *param, bool user) | 4771 | const struct sched_param *param, bool user) |
| 4705 | { | 4772 | { |
| 4706 | int retval, oldprio, oldpolicy = -1, on_rq, running; | 4773 | int retval, oldprio, oldpolicy = -1, on_rq, running; |
| 4707 | unsigned long flags; | 4774 | unsigned long flags; |
| @@ -4804,7 +4871,8 @@ recheck: | |||
| 4804 | * assigned. | 4871 | * assigned. |
| 4805 | */ | 4872 | */ |
| 4806 | if (rt_bandwidth_enabled() && rt_policy(policy) && | 4873 | if (rt_bandwidth_enabled() && rt_policy(policy) && |
| 4807 | task_group(p)->rt_bandwidth.rt_runtime == 0) { | 4874 | task_group(p)->rt_bandwidth.rt_runtime == 0 && |
| 4875 | !task_group_is_autogroup(task_group(p))) { | ||
| 4808 | __task_rq_unlock(rq); | 4876 | __task_rq_unlock(rq); |
| 4809 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | 4877 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
| 4810 | return -EPERM; | 4878 | return -EPERM; |
| @@ -4856,7 +4924,7 @@ recheck: | |||
| 4856 | * NOTE that the task may be already dead. | 4924 | * NOTE that the task may be already dead. |
| 4857 | */ | 4925 | */ |
| 4858 | int sched_setscheduler(struct task_struct *p, int policy, | 4926 | int sched_setscheduler(struct task_struct *p, int policy, |
| 4859 | struct sched_param *param) | 4927 | const struct sched_param *param) |
| 4860 | { | 4928 | { |
| 4861 | return __sched_setscheduler(p, policy, param, true); | 4929 | return __sched_setscheduler(p, policy, param, true); |
| 4862 | } | 4930 | } |
| @@ -4874,7 +4942,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler); | |||
| 4874 | * but our caller might not have that capability. | 4942 | * but our caller might not have that capability. |
| 4875 | */ | 4943 | */ |
| 4876 | int sched_setscheduler_nocheck(struct task_struct *p, int policy, | 4944 | int sched_setscheduler_nocheck(struct task_struct *p, int policy, |
| 4877 | struct sched_param *param) | 4945 | const struct sched_param *param) |
| 4878 | { | 4946 | { |
| 4879 | return __sched_setscheduler(p, policy, param, false); | 4947 | return __sched_setscheduler(p, policy, param, false); |
| 4880 | } | 4948 | } |
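Const-qualifying the sched_param argument lets callers keep their parameter block in read-only data. A minimal sketch of the kind of in-kernel user this enables; the worker thread and its names are invented, only the new const prototypes are from this hunk:

    #include <linux/sched.h>
    #include <linux/kthread.h>

    /* Hypothetical kthread that makes itself SCHED_FIFO; with the old
     * non-const prototype, passing &fifo_param here would not compile. */
    static const struct sched_param fifo_param = {
        .sched_priority = MAX_RT_PRIO - 1,
    };

    static int my_worker_thread(void *unused)
    {
        /* _nocheck: kernel-internal callers skip the capability check. */
        sched_setscheduler_nocheck(current, SCHED_FIFO, &fifo_param);

        while (!kthread_should_stop()) {
            set_current_state(TASK_INTERRUPTIBLE);
            schedule();
        }
        return 0;
    }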
| @@ -5390,7 +5458,7 @@ void sched_show_task(struct task_struct *p) | |||
| 5390 | unsigned state; | 5458 | unsigned state; |
| 5391 | 5459 | ||
| 5392 | state = p->state ? __ffs(p->state) + 1 : 0; | 5460 | state = p->state ? __ffs(p->state) + 1 : 0; |
| 5393 | printk(KERN_INFO "%-13.13s %c", p->comm, | 5461 | printk(KERN_INFO "%-15.15s %c", p->comm, |
| 5394 | state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); | 5462 | state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); |
| 5395 | #if BITS_PER_LONG == 32 | 5463 | #if BITS_PER_LONG == 32 |
| 5396 | if (state == TASK_RUNNING) | 5464 | if (state == TASK_RUNNING) |
| @@ -5554,7 +5622,6 @@ static void update_sysctl(void) | |||
| 5554 | SET_SYSCTL(sched_min_granularity); | 5622 | SET_SYSCTL(sched_min_granularity); |
| 5555 | SET_SYSCTL(sched_latency); | 5623 | SET_SYSCTL(sched_latency); |
| 5556 | SET_SYSCTL(sched_wakeup_granularity); | 5624 | SET_SYSCTL(sched_wakeup_granularity); |
| 5557 | SET_SYSCTL(sched_shares_ratelimit); | ||
| 5558 | #undef SET_SYSCTL | 5625 | #undef SET_SYSCTL |
| 5559 | } | 5626 | } |
| 5560 | 5627 | ||
| @@ -5630,7 +5697,7 @@ again: | |||
| 5630 | goto out; | 5697 | goto out; |
| 5631 | 5698 | ||
| 5632 | dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); | 5699 | dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); |
| 5633 | if (migrate_task(p, dest_cpu)) { | 5700 | if (migrate_task(p, rq)) { |
| 5634 | struct migration_arg arg = { p, dest_cpu }; | 5701 | struct migration_arg arg = { p, dest_cpu }; |
| 5635 | /* Need help from migration thread: drop lock and wait. */ | 5702 | /* Need help from migration thread: drop lock and wait. */ |
| 5636 | task_rq_unlock(rq, &flags); | 5703 | task_rq_unlock(rq, &flags); |
| @@ -5712,29 +5779,20 @@ static int migration_cpu_stop(void *data) | |||
| 5712 | } | 5779 | } |
| 5713 | 5780 | ||
| 5714 | #ifdef CONFIG_HOTPLUG_CPU | 5781 | #ifdef CONFIG_HOTPLUG_CPU |
| 5782 | |||
| 5715 | /* | 5783 | /* |
| 5716 | * Figure out where task on dead CPU should go, use force if necessary. | 5784 | * Ensures that the idle task is using init_mm right before its cpu goes |
| 5785 | * offline. | ||
| 5717 | */ | 5786 | */ |
| 5718 | void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) | 5787 | void idle_task_exit(void) |
| 5719 | { | 5788 | { |
| 5720 | struct rq *rq = cpu_rq(dead_cpu); | 5789 | struct mm_struct *mm = current->active_mm; |
| 5721 | int needs_cpu, uninitialized_var(dest_cpu); | ||
| 5722 | unsigned long flags; | ||
| 5723 | 5790 | ||
| 5724 | local_irq_save(flags); | 5791 | BUG_ON(cpu_online(smp_processor_id())); |
| 5725 | 5792 | ||
| 5726 | raw_spin_lock(&rq->lock); | 5793 | if (mm != &init_mm) |
| 5727 | needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING); | 5794 | switch_mm(mm, &init_mm, current); |
| 5728 | if (needs_cpu) | 5795 | mmdrop(mm); |
| 5729 | dest_cpu = select_fallback_rq(dead_cpu, p); | ||
| 5730 | raw_spin_unlock(&rq->lock); | ||
| 5731 | /* | ||
| 5732 | * It can only fail if we race with set_cpus_allowed(), | ||
| 5733 | * in the racer should migrate the task anyway. | ||
| 5734 | */ | ||
| 5735 | if (needs_cpu) | ||
| 5736 | __migrate_task(p, dead_cpu, dest_cpu); | ||
| 5737 | local_irq_restore(flags); | ||
| 5738 | } | 5796 | } |
| 5739 | 5797 | ||
| 5740 | /* | 5798 | /* |
| @@ -5747,128 +5805,69 @@ void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) | |||
| 5747 | static void migrate_nr_uninterruptible(struct rq *rq_src) | 5805 | static void migrate_nr_uninterruptible(struct rq *rq_src) |
| 5748 | { | 5806 | { |
| 5749 | struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); | 5807 | struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); |
| 5750 | unsigned long flags; | ||
| 5751 | 5808 | ||
| 5752 | local_irq_save(flags); | ||
| 5753 | double_rq_lock(rq_src, rq_dest); | ||
| 5754 | rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; | 5809 | rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; |
| 5755 | rq_src->nr_uninterruptible = 0; | 5810 | rq_src->nr_uninterruptible = 0; |
| 5756 | double_rq_unlock(rq_src, rq_dest); | ||
| 5757 | local_irq_restore(flags); | ||
| 5758 | } | ||
| 5759 | |||
| 5760 | /* Run through task list and migrate tasks from the dead cpu. */ | ||
| 5761 | static void migrate_live_tasks(int src_cpu) | ||
| 5762 | { | ||
| 5763 | struct task_struct *p, *t; | ||
| 5764 | |||
| 5765 | read_lock(&tasklist_lock); | ||
| 5766 | |||
| 5767 | do_each_thread(t, p) { | ||
| 5768 | if (p == current) | ||
| 5769 | continue; | ||
| 5770 | |||
| 5771 | if (task_cpu(p) == src_cpu) | ||
| 5772 | move_task_off_dead_cpu(src_cpu, p); | ||
| 5773 | } while_each_thread(t, p); | ||
| 5774 | |||
| 5775 | read_unlock(&tasklist_lock); | ||
| 5776 | } | 5811 | } |
| 5777 | 5812 | ||
| 5778 | /* | 5813 | /* |
| 5779 | * Schedules idle task to be the next runnable task on current CPU. | 5814 | * remove the tasks which were accounted by rq from calc_load_tasks. |
| 5780 | * It does so by boosting its priority to highest possible. | ||
| 5781 | * Used by CPU offline code. | ||
| 5782 | */ | 5815 | */ |
| 5783 | void sched_idle_next(void) | 5816 | static void calc_global_load_remove(struct rq *rq) |
| 5784 | { | 5817 | { |
| 5785 | int this_cpu = smp_processor_id(); | 5818 | atomic_long_sub(rq->calc_load_active, &calc_load_tasks); |
| 5786 | struct rq *rq = cpu_rq(this_cpu); | 5819 | rq->calc_load_active = 0; |
| 5787 | struct task_struct *p = rq->idle; | ||
| 5788 | unsigned long flags; | ||
| 5789 | |||
| 5790 | /* cpu has to be offline */ | ||
| 5791 | BUG_ON(cpu_online(this_cpu)); | ||
| 5792 | |||
| 5793 | /* | ||
| 5794 | * Strictly not necessary since rest of the CPUs are stopped by now | ||
| 5795 | * and interrupts disabled on the current cpu. | ||
| 5796 | */ | ||
| 5797 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
| 5798 | |||
| 5799 | __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); | ||
| 5800 | |||
| 5801 | activate_task(rq, p, 0); | ||
| 5802 | |||
| 5803 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
| 5804 | } | 5820 | } |
| 5805 | 5821 | ||
| 5806 | /* | 5822 | /* |
| 5807 | * Ensures that the idle task is using init_mm right before its cpu goes | 5823 | * Migrate all tasks from the rq, sleeping tasks will be migrated by |
| 5808 | * offline. | 5824 | * try_to_wake_up()->select_task_rq(). |
| 5825 | * | ||
| 5826 | * Called with rq->lock held even though we're in stop_machine() and | ||
| 5827 | * there's no concurrency possible; we hold the required locks anyway | ||
| 5828 | * because of lock validation efforts. | ||
| 5809 | */ | 5829 | */ |
| 5810 | void idle_task_exit(void) | 5830 | static void migrate_tasks(unsigned int dead_cpu) |
| 5811 | { | ||
| 5812 | struct mm_struct *mm = current->active_mm; | ||
| 5813 | |||
| 5814 | BUG_ON(cpu_online(smp_processor_id())); | ||
| 5815 | |||
| 5816 | if (mm != &init_mm) | ||
| 5817 | switch_mm(mm, &init_mm, current); | ||
| 5818 | mmdrop(mm); | ||
| 5819 | } | ||
| 5820 | |||
| 5821 | /* called under rq->lock with disabled interrupts */ | ||
| 5822 | static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) | ||
| 5823 | { | 5831 | { |
| 5824 | struct rq *rq = cpu_rq(dead_cpu); | 5832 | struct rq *rq = cpu_rq(dead_cpu); |
| 5825 | 5833 | struct task_struct *next, *stop = rq->stop; | |
| 5826 | /* Must be exiting, otherwise would be on tasklist. */ | 5834 | int dest_cpu; |
| 5827 | BUG_ON(!p->exit_state); | ||
| 5828 | |||
| 5829 | /* Cannot have done final schedule yet: would have vanished. */ | ||
| 5830 | BUG_ON(p->state == TASK_DEAD); | ||
| 5831 | |||
| 5832 | get_task_struct(p); | ||
| 5833 | 5835 | ||
| 5834 | /* | 5836 | /* |
| 5835 | * Drop lock around migration; if someone else moves it, | 5837 | * Fudge the rq selection such that the below task selection loop |
| 5836 | * that's OK. No task can be added to this CPU, so iteration is | 5838 | * doesn't get stuck on the currently eligible stop task. |
| 5837 | * fine. | 5839 | * |
| 5840 | * We're currently inside stop_machine() and the rq is either stuck | ||
| 5841 | * in the stop_machine_cpu_stop() loop, or we're executing this code, | ||
| 5842 | * either way we should never end up calling schedule() until we're | ||
| 5843 | * done here. | ||
| 5838 | */ | 5844 | */ |
| 5839 | raw_spin_unlock_irq(&rq->lock); | 5845 | rq->stop = NULL; |
| 5840 | move_task_off_dead_cpu(dead_cpu, p); | ||
| 5841 | raw_spin_lock_irq(&rq->lock); | ||
| 5842 | |||
| 5843 | put_task_struct(p); | ||
| 5844 | } | ||
| 5845 | |||
| 5846 | /* release_task() removes task from tasklist, so we won't find dead tasks. */ | ||
| 5847 | static void migrate_dead_tasks(unsigned int dead_cpu) | ||
| 5848 | { | ||
| 5849 | struct rq *rq = cpu_rq(dead_cpu); | ||
| 5850 | struct task_struct *next; | ||
| 5851 | 5846 | ||
| 5852 | for ( ; ; ) { | 5847 | for ( ; ; ) { |
| 5853 | if (!rq->nr_running) | 5848 | /* |
| 5849 | * There's this thread running; bail when that's the only | ||
| 5850 | * remaining thread. | ||
| 5851 | */ | ||
| 5852 | if (rq->nr_running == 1) | ||
| 5854 | break; | 5853 | break; |
| 5854 | |||
| 5855 | next = pick_next_task(rq); | 5855 | next = pick_next_task(rq); |
| 5856 | if (!next) | 5856 | BUG_ON(!next); |
| 5857 | break; | ||
| 5858 | next->sched_class->put_prev_task(rq, next); | 5857 | next->sched_class->put_prev_task(rq, next); |
| 5859 | migrate_dead(dead_cpu, next); | ||
| 5860 | 5858 | ||
| 5859 | /* Find suitable destination for @next, with force if needed. */ | ||
| 5860 | dest_cpu = select_fallback_rq(dead_cpu, next); | ||
| 5861 | raw_spin_unlock(&rq->lock); | ||
| 5862 | |||
| 5863 | __migrate_task(next, dead_cpu, dest_cpu); | ||
| 5864 | |||
| 5865 | raw_spin_lock(&rq->lock); | ||
| 5861 | } | 5866 | } |
| 5862 | } | ||
| 5863 | 5867 | ||
| 5864 | /* | 5868 | rq->stop = stop; |
| 5865 | * remove the tasks which were accounted by rq from calc_load_tasks. | ||
| 5866 | */ | ||
| 5867 | static void calc_global_load_remove(struct rq *rq) | ||
| 5868 | { | ||
| 5869 | atomic_long_sub(rq->calc_load_active, &calc_load_tasks); | ||
| 5870 | rq->calc_load_active = 0; | ||
| 5871 | } | 5869 | } |
| 5870 | |||
| 5872 | #endif /* CONFIG_HOTPLUG_CPU */ | 5871 | #endif /* CONFIG_HOTPLUG_CPU */ |
| 5873 | 5872 | ||
| 5874 | #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) | 5873 | #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) |
| @@ -6078,15 +6077,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
| 6078 | unsigned long flags; | 6077 | unsigned long flags; |
| 6079 | struct rq *rq = cpu_rq(cpu); | 6078 | struct rq *rq = cpu_rq(cpu); |
| 6080 | 6079 | ||
| 6081 | switch (action) { | 6080 | switch (action & ~CPU_TASKS_FROZEN) { |
| 6082 | 6081 | ||
| 6083 | case CPU_UP_PREPARE: | 6082 | case CPU_UP_PREPARE: |
| 6084 | case CPU_UP_PREPARE_FROZEN: | ||
| 6085 | rq->calc_load_update = calc_load_update; | 6083 | rq->calc_load_update = calc_load_update; |
| 6086 | break; | 6084 | break; |
| 6087 | 6085 | ||
| 6088 | case CPU_ONLINE: | 6086 | case CPU_ONLINE: |
| 6089 | case CPU_ONLINE_FROZEN: | ||
| 6090 | /* Update our root-domain */ | 6087 | /* Update our root-domain */ |
| 6091 | raw_spin_lock_irqsave(&rq->lock, flags); | 6088 | raw_spin_lock_irqsave(&rq->lock, flags); |
| 6092 | if (rq->rd) { | 6089 | if (rq->rd) { |
| @@ -6098,30 +6095,19 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
| 6098 | break; | 6095 | break; |
| 6099 | 6096 | ||
| 6100 | #ifdef CONFIG_HOTPLUG_CPU | 6097 | #ifdef CONFIG_HOTPLUG_CPU |
| 6101 | case CPU_DEAD: | ||
| 6102 | case CPU_DEAD_FROZEN: | ||
| 6103 | migrate_live_tasks(cpu); | ||
| 6104 | /* Idle task back to normal (off runqueue, low prio) */ | ||
| 6105 | raw_spin_lock_irq(&rq->lock); | ||
| 6106 | deactivate_task(rq, rq->idle, 0); | ||
| 6107 | __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); | ||
| 6108 | rq->idle->sched_class = &idle_sched_class; | ||
| 6109 | migrate_dead_tasks(cpu); | ||
| 6110 | raw_spin_unlock_irq(&rq->lock); | ||
| 6111 | migrate_nr_uninterruptible(rq); | ||
| 6112 | BUG_ON(rq->nr_running != 0); | ||
| 6113 | calc_global_load_remove(rq); | ||
| 6114 | break; | ||
| 6115 | |||
| 6116 | case CPU_DYING: | 6098 | case CPU_DYING: |
| 6117 | case CPU_DYING_FROZEN: | ||
| 6118 | /* Update our root-domain */ | 6099 | /* Update our root-domain */ |
| 6119 | raw_spin_lock_irqsave(&rq->lock, flags); | 6100 | raw_spin_lock_irqsave(&rq->lock, flags); |
| 6120 | if (rq->rd) { | 6101 | if (rq->rd) { |
| 6121 | BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); | 6102 | BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); |
| 6122 | set_rq_offline(rq); | 6103 | set_rq_offline(rq); |
| 6123 | } | 6104 | } |
| 6105 | migrate_tasks(cpu); | ||
| 6106 | BUG_ON(rq->nr_running != 1); /* the migration thread */ | ||
| 6124 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 6107 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
| 6108 | |||
| 6109 | migrate_nr_uninterruptible(rq); | ||
| 6110 | calc_global_load_remove(rq); | ||
| 6125 | break; | 6111 | break; |
| 6126 | #endif | 6112 | #endif |
| 6127 | } | 6113 | } |
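The switch now masks off CPU_TASKS_FROZEN, which is what allows the separate _FROZEN case labels above to be dropped: every CPU_*_FROZEN notifier value is just the base action with that bit set, so one case label handles both the normal and the suspend/resume variant. A tiny stand-alone illustration; the numeric values mirror the kernel's notifier constants and are assumptions here:

    #include <stdio.h>

    /* Assumed to match the CPU_* notifier constants; the property the new
     * switch relies on is _FROZEN == base | CPU_TASKS_FROZEN. */
    #define CPU_ONLINE          0x0002
    #define CPU_TASKS_FROZEN    0x0010
    #define CPU_ONLINE_FROZEN   (CPU_ONLINE | CPU_TASKS_FROZEN)

    int main(void)
    {
        unsigned long action = CPU_ONLINE_FROZEN;

        switch (action & ~CPU_TASKS_FROZEN) {
        case CPU_ONLINE:    /* hit for both CPU_ONLINE and CPU_ONLINE_FROZEN */
            printf("online path\n");
            break;
        default:
            printf("other\n");
        }
        return 0;
    }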
| @@ -6960,6 +6946,8 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) | |||
| 6960 | if (cpu != group_first_cpu(sd->groups)) | 6946 | if (cpu != group_first_cpu(sd->groups)) |
| 6961 | return; | 6947 | return; |
| 6962 | 6948 | ||
| 6949 | sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups)); | ||
| 6950 | |||
| 6963 | child = sd->child; | 6951 | child = sd->child; |
| 6964 | 6952 | ||
| 6965 | sd->groups->cpu_power = 0; | 6953 | sd->groups->cpu_power = 0; |
| @@ -7850,18 +7838,16 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) | |||
| 7850 | 7838 | ||
| 7851 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7839 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 7852 | static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, | 7840 | static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, |
| 7853 | struct sched_entity *se, int cpu, int add, | 7841 | struct sched_entity *se, int cpu, |
| 7854 | struct sched_entity *parent) | 7842 | struct sched_entity *parent) |
| 7855 | { | 7843 | { |
| 7856 | struct rq *rq = cpu_rq(cpu); | 7844 | struct rq *rq = cpu_rq(cpu); |
| 7857 | tg->cfs_rq[cpu] = cfs_rq; | 7845 | tg->cfs_rq[cpu] = cfs_rq; |
| 7858 | init_cfs_rq(cfs_rq, rq); | 7846 | init_cfs_rq(cfs_rq, rq); |
| 7859 | cfs_rq->tg = tg; | 7847 | cfs_rq->tg = tg; |
| 7860 | if (add) | ||
| 7861 | list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); | ||
| 7862 | 7848 | ||
| 7863 | tg->se[cpu] = se; | 7849 | tg->se[cpu] = se; |
| 7864 | /* se could be NULL for init_task_group */ | 7850 | /* se could be NULL for root_task_group */ |
| 7865 | if (!se) | 7851 | if (!se) |
| 7866 | return; | 7852 | return; |
| 7867 | 7853 | ||
| @@ -7871,15 +7857,14 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, | |||
| 7871 | se->cfs_rq = parent->my_q; | 7857 | se->cfs_rq = parent->my_q; |
| 7872 | 7858 | ||
| 7873 | se->my_q = cfs_rq; | 7859 | se->my_q = cfs_rq; |
| 7874 | se->load.weight = tg->shares; | 7860 | update_load_set(&se->load, 0); |
| 7875 | se->load.inv_weight = 0; | ||
| 7876 | se->parent = parent; | 7861 | se->parent = parent; |
| 7877 | } | 7862 | } |
| 7878 | #endif | 7863 | #endif |
| 7879 | 7864 | ||
| 7880 | #ifdef CONFIG_RT_GROUP_SCHED | 7865 | #ifdef CONFIG_RT_GROUP_SCHED |
| 7881 | static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, | 7866 | static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, |
| 7882 | struct sched_rt_entity *rt_se, int cpu, int add, | 7867 | struct sched_rt_entity *rt_se, int cpu, |
| 7883 | struct sched_rt_entity *parent) | 7868 | struct sched_rt_entity *parent) |
| 7884 | { | 7869 | { |
| 7885 | struct rq *rq = cpu_rq(cpu); | 7870 | struct rq *rq = cpu_rq(cpu); |
| @@ -7888,8 +7873,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, | |||
| 7888 | init_rt_rq(rt_rq, rq); | 7873 | init_rt_rq(rt_rq, rq); |
| 7889 | rt_rq->tg = tg; | 7874 | rt_rq->tg = tg; |
| 7890 | rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; | 7875 | rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; |
| 7891 | if (add) | ||
| 7892 | list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); | ||
| 7893 | 7876 | ||
| 7894 | tg->rt_se[cpu] = rt_se; | 7877 | tg->rt_se[cpu] = rt_se; |
| 7895 | if (!rt_se) | 7878 | if (!rt_se) |
| @@ -7924,18 +7907,18 @@ void __init sched_init(void) | |||
| 7924 | ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); | 7907 | ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); |
| 7925 | 7908 | ||
| 7926 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7909 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 7927 | init_task_group.se = (struct sched_entity **)ptr; | 7910 | root_task_group.se = (struct sched_entity **)ptr; |
| 7928 | ptr += nr_cpu_ids * sizeof(void **); | 7911 | ptr += nr_cpu_ids * sizeof(void **); |
| 7929 | 7912 | ||
| 7930 | init_task_group.cfs_rq = (struct cfs_rq **)ptr; | 7913 | root_task_group.cfs_rq = (struct cfs_rq **)ptr; |
| 7931 | ptr += nr_cpu_ids * sizeof(void **); | 7914 | ptr += nr_cpu_ids * sizeof(void **); |
| 7932 | 7915 | ||
| 7933 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 7916 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
| 7934 | #ifdef CONFIG_RT_GROUP_SCHED | 7917 | #ifdef CONFIG_RT_GROUP_SCHED |
| 7935 | init_task_group.rt_se = (struct sched_rt_entity **)ptr; | 7918 | root_task_group.rt_se = (struct sched_rt_entity **)ptr; |
| 7936 | ptr += nr_cpu_ids * sizeof(void **); | 7919 | ptr += nr_cpu_ids * sizeof(void **); |
| 7937 | 7920 | ||
| 7938 | init_task_group.rt_rq = (struct rt_rq **)ptr; | 7921 | root_task_group.rt_rq = (struct rt_rq **)ptr; |
| 7939 | ptr += nr_cpu_ids * sizeof(void **); | 7922 | ptr += nr_cpu_ids * sizeof(void **); |
| 7940 | 7923 | ||
| 7941 | #endif /* CONFIG_RT_GROUP_SCHED */ | 7924 | #endif /* CONFIG_RT_GROUP_SCHED */ |
| @@ -7955,20 +7938,16 @@ void __init sched_init(void) | |||
| 7955 | global_rt_period(), global_rt_runtime()); | 7938 | global_rt_period(), global_rt_runtime()); |
| 7956 | 7939 | ||
| 7957 | #ifdef CONFIG_RT_GROUP_SCHED | 7940 | #ifdef CONFIG_RT_GROUP_SCHED |
| 7958 | init_rt_bandwidth(&init_task_group.rt_bandwidth, | 7941 | init_rt_bandwidth(&root_task_group.rt_bandwidth, |
| 7959 | global_rt_period(), global_rt_runtime()); | 7942 | global_rt_period(), global_rt_runtime()); |
| 7960 | #endif /* CONFIG_RT_GROUP_SCHED */ | 7943 | #endif /* CONFIG_RT_GROUP_SCHED */ |
| 7961 | 7944 | ||
| 7962 | #ifdef CONFIG_CGROUP_SCHED | 7945 | #ifdef CONFIG_CGROUP_SCHED |
| 7963 | list_add(&init_task_group.list, &task_groups); | 7946 | list_add(&root_task_group.list, &task_groups); |
| 7964 | INIT_LIST_HEAD(&init_task_group.children); | 7947 | INIT_LIST_HEAD(&root_task_group.children); |
| 7965 | 7948 | autogroup_init(&init_task); | |
| 7966 | #endif /* CONFIG_CGROUP_SCHED */ | 7949 | #endif /* CONFIG_CGROUP_SCHED */ |
| 7967 | 7950 | ||
| 7968 | #if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP | ||
| 7969 | update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), | ||
| 7970 | __alignof__(unsigned long)); | ||
| 7971 | #endif | ||
| 7972 | for_each_possible_cpu(i) { | 7951 | for_each_possible_cpu(i) { |
| 7973 | struct rq *rq; | 7952 | struct rq *rq; |
| 7974 | 7953 | ||
| @@ -7980,38 +7959,34 @@ void __init sched_init(void) | |||
| 7980 | init_cfs_rq(&rq->cfs, rq); | 7959 | init_cfs_rq(&rq->cfs, rq); |
| 7981 | init_rt_rq(&rq->rt, rq); | 7960 | init_rt_rq(&rq->rt, rq); |
| 7982 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7961 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 7983 | init_task_group.shares = init_task_group_load; | 7962 | root_task_group.shares = root_task_group_load; |
| 7984 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); | 7963 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); |
| 7985 | #ifdef CONFIG_CGROUP_SCHED | ||
| 7986 | /* | 7964 | /* |
| 7987 | * How much cpu bandwidth does init_task_group get? | 7965 | * How much cpu bandwidth does root_task_group get? |
| 7988 | * | 7966 | * |
| 7989 | * In case of task-groups formed thr' the cgroup filesystem, it | 7967 | * In case of task-groups formed thr' the cgroup filesystem, it |
| 7990 | * gets 100% of the cpu resources in the system. This overall | 7968 | * gets 100% of the cpu resources in the system. This overall |
| 7991 | * system cpu resource is divided among the tasks of | 7969 | * system cpu resource is divided among the tasks of |
| 7992 | * init_task_group and its child task-groups in a fair manner, | 7970 | * root_task_group and its child task-groups in a fair manner, |
| 7993 | * based on each entity's (task or task-group's) weight | 7971 | * based on each entity's (task or task-group's) weight |
| 7994 | * (se->load.weight). | 7972 | * (se->load.weight). |
| 7995 | * | 7973 | * |
| 7996 | * In other words, if init_task_group has 10 tasks of weight | 7974 | * In other words, if root_task_group has 10 tasks of weight |
| 7997 | * 1024) and two child groups A0 and A1 (of weight 1024 each), | 7975 | * 1024) and two child groups A0 and A1 (of weight 1024 each), |
| 7998 | * then A0's share of the cpu resource is: | 7976 | * then A0's share of the cpu resource is: |
| 7999 | * | 7977 | * |
| 8000 | * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% | 7978 | * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% |
| 8001 | * | 7979 | * |
| 8002 | * We achieve this by letting init_task_group's tasks sit | 7980 | * We achieve this by letting root_task_group's tasks sit |
| 8003 | * directly in rq->cfs (i.e init_task_group->se[] = NULL). | 7981 | * directly in rq->cfs (i.e root_task_group->se[] = NULL). |
| 8004 | */ | 7982 | */ |
| 8005 | init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); | 7983 | init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); |
| 8006 | #endif | ||
| 8007 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 7984 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
| 8008 | 7985 | ||
| 8009 | rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; | 7986 | rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; |
| 8010 | #ifdef CONFIG_RT_GROUP_SCHED | 7987 | #ifdef CONFIG_RT_GROUP_SCHED |
| 8011 | INIT_LIST_HEAD(&rq->leaf_rt_rq_list); | 7988 | INIT_LIST_HEAD(&rq->leaf_rt_rq_list); |
| 8012 | #ifdef CONFIG_CGROUP_SCHED | 7989 | init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); |
| 8013 | init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); | ||
| 8014 | #endif | ||
| 8015 | #endif | 7990 | #endif |
| 8016 | 7991 | ||
| 8017 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) | 7992 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) |
| @@ -8091,8 +8066,6 @@ void __init sched_init(void) | |||
| 8091 | zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); | 8066 | zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); |
| 8092 | #endif /* SMP */ | 8067 | #endif /* SMP */ |
| 8093 | 8068 | ||
| 8094 | perf_event_init(); | ||
| 8095 | |||
| 8096 | scheduler_running = 1; | 8069 | scheduler_running = 1; |
| 8097 | } | 8070 | } |
| 8098 | 8071 | ||
| @@ -8286,7 +8259,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | |||
| 8286 | if (!se) | 8259 | if (!se) |
| 8287 | goto err_free_rq; | 8260 | goto err_free_rq; |
| 8288 | 8261 | ||
| 8289 | init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); | 8262 | init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); |
| 8290 | } | 8263 | } |
| 8291 | 8264 | ||
| 8292 | return 1; | 8265 | return 1; |
| @@ -8297,15 +8270,21 @@ err: | |||
| 8297 | return 0; | 8270 | return 0; |
| 8298 | } | 8271 | } |
| 8299 | 8272 | ||
| 8300 | static inline void register_fair_sched_group(struct task_group *tg, int cpu) | ||
| 8301 | { | ||
| 8302 | list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list, | ||
| 8303 | &cpu_rq(cpu)->leaf_cfs_rq_list); | ||
| 8304 | } | ||
| 8305 | |||
| 8306 | static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) | 8273 | static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) |
| 8307 | { | 8274 | { |
| 8308 | list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); | 8275 | struct rq *rq = cpu_rq(cpu); |
| 8276 | unsigned long flags; | ||
| 8277 | |||
| 8278 | /* | ||
| 8279 | * Only empty task groups can be destroyed; so we can speculatively | ||
| 8280 | * check on_list without danger of it being re-added. | ||
| 8281 | */ | ||
| 8282 | if (!tg->cfs_rq[cpu]->on_list) | ||
| 8283 | return; | ||
| 8284 | |||
| 8285 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
| 8286 | list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); | ||
| 8287 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
| 8309 | } | 8288 | } |
| 8310 | #else /* !CONFIG_FAIR_GROUP_SCHED */ | 8289 | #else /* !CONFIG_FAIR_GROUP_SCHED */ |
| 8311 | static inline void free_fair_sched_group(struct task_group *tg) | 8290 | static inline void free_fair_sched_group(struct task_group *tg) |
| @@ -8318,10 +8297,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | |||
| 8318 | return 1; | 8297 | return 1; |
| 8319 | } | 8298 | } |
| 8320 | 8299 | ||
| 8321 | static inline void register_fair_sched_group(struct task_group *tg, int cpu) | ||
| 8322 | { | ||
| 8323 | } | ||
| 8324 | |||
| 8325 | static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) | 8300 | static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) |
| 8326 | { | 8301 | { |
| 8327 | } | 8302 | } |
| @@ -8376,7 +8351,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | |||
| 8376 | if (!rt_se) | 8351 | if (!rt_se) |
| 8377 | goto err_free_rq; | 8352 | goto err_free_rq; |
| 8378 | 8353 | ||
| 8379 | init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); | 8354 | init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); |
| 8380 | } | 8355 | } |
| 8381 | 8356 | ||
| 8382 | return 1; | 8357 | return 1; |
| @@ -8386,17 +8361,6 @@ err_free_rq: | |||
| 8386 | err: | 8361 | err: |
| 8387 | return 0; | 8362 | return 0; |
| 8388 | } | 8363 | } |
| 8389 | |||
| 8390 | static inline void register_rt_sched_group(struct task_group *tg, int cpu) | ||
| 8391 | { | ||
| 8392 | list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list, | ||
| 8393 | &cpu_rq(cpu)->leaf_rt_rq_list); | ||
| 8394 | } | ||
| 8395 | |||
| 8396 | static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) | ||
| 8397 | { | ||
| 8398 | list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list); | ||
| 8399 | } | ||
| 8400 | #else /* !CONFIG_RT_GROUP_SCHED */ | 8364 | #else /* !CONFIG_RT_GROUP_SCHED */ |
| 8401 | static inline void free_rt_sched_group(struct task_group *tg) | 8365 | static inline void free_rt_sched_group(struct task_group *tg) |
| 8402 | { | 8366 | { |
| @@ -8407,14 +8371,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | |||
| 8407 | { | 8371 | { |
| 8408 | return 1; | 8372 | return 1; |
| 8409 | } | 8373 | } |
| 8410 | |||
| 8411 | static inline void register_rt_sched_group(struct task_group *tg, int cpu) | ||
| 8412 | { | ||
| 8413 | } | ||
| 8414 | |||
| 8415 | static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) | ||
| 8416 | { | ||
| 8417 | } | ||
| 8418 | #endif /* CONFIG_RT_GROUP_SCHED */ | 8374 | #endif /* CONFIG_RT_GROUP_SCHED */ |
| 8419 | 8375 | ||
| 8420 | #ifdef CONFIG_CGROUP_SCHED | 8376 | #ifdef CONFIG_CGROUP_SCHED |
| @@ -8422,6 +8378,7 @@ static void free_sched_group(struct task_group *tg) | |||
| 8422 | { | 8378 | { |
| 8423 | free_fair_sched_group(tg); | 8379 | free_fair_sched_group(tg); |
| 8424 | free_rt_sched_group(tg); | 8380 | free_rt_sched_group(tg); |
| 8381 | autogroup_free(tg); | ||
| 8425 | kfree(tg); | 8382 | kfree(tg); |
| 8426 | } | 8383 | } |
| 8427 | 8384 | ||
| @@ -8430,7 +8387,6 @@ struct task_group *sched_create_group(struct task_group *parent) | |||
| 8430 | { | 8387 | { |
| 8431 | struct task_group *tg; | 8388 | struct task_group *tg; |
| 8432 | unsigned long flags; | 8389 | unsigned long flags; |
| 8433 | int i; | ||
| 8434 | 8390 | ||
| 8435 | tg = kzalloc(sizeof(*tg), GFP_KERNEL); | 8391 | tg = kzalloc(sizeof(*tg), GFP_KERNEL); |
| 8436 | if (!tg) | 8392 | if (!tg) |
| @@ -8443,10 +8399,6 @@ struct task_group *sched_create_group(struct task_group *parent) | |||
| 8443 | goto err; | 8399 | goto err; |
| 8444 | 8400 | ||
| 8445 | spin_lock_irqsave(&task_group_lock, flags); | 8401 | spin_lock_irqsave(&task_group_lock, flags); |
| 8446 | for_each_possible_cpu(i) { | ||
| 8447 | register_fair_sched_group(tg, i); | ||
| 8448 | register_rt_sched_group(tg, i); | ||
| 8449 | } | ||
| 8450 | list_add_rcu(&tg->list, &task_groups); | 8402 | list_add_rcu(&tg->list, &task_groups); |
| 8451 | 8403 | ||
| 8452 | WARN_ON(!parent); /* root should already exist */ | 8404 | WARN_ON(!parent); /* root should already exist */ |
| @@ -8476,11 +8428,11 @@ void sched_destroy_group(struct task_group *tg) | |||
| 8476 | unsigned long flags; | 8428 | unsigned long flags; |
| 8477 | int i; | 8429 | int i; |
| 8478 | 8430 | ||
| 8479 | spin_lock_irqsave(&task_group_lock, flags); | 8431 | /* end participation in shares distribution */ |
| 8480 | for_each_possible_cpu(i) { | 8432 | for_each_possible_cpu(i) |
| 8481 | unregister_fair_sched_group(tg, i); | 8433 | unregister_fair_sched_group(tg, i); |
| 8482 | unregister_rt_sched_group(tg, i); | 8434 | |
| 8483 | } | 8435 | spin_lock_irqsave(&task_group_lock, flags); |
| 8484 | list_del_rcu(&tg->list); | 8436 | list_del_rcu(&tg->list); |
| 8485 | list_del_rcu(&tg->siblings); | 8437 | list_del_rcu(&tg->siblings); |
| 8486 | spin_unlock_irqrestore(&task_group_lock, flags); | 8438 | spin_unlock_irqrestore(&task_group_lock, flags); |
| @@ -8527,33 +8479,6 @@ void sched_move_task(struct task_struct *tsk) | |||
| 8527 | #endif /* CONFIG_CGROUP_SCHED */ | 8479 | #endif /* CONFIG_CGROUP_SCHED */ |
| 8528 | 8480 | ||
| 8529 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8481 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 8530 | static void __set_se_shares(struct sched_entity *se, unsigned long shares) | ||
| 8531 | { | ||
| 8532 | struct cfs_rq *cfs_rq = se->cfs_rq; | ||
| 8533 | int on_rq; | ||
| 8534 | |||
| 8535 | on_rq = se->on_rq; | ||
| 8536 | if (on_rq) | ||
| 8537 | dequeue_entity(cfs_rq, se, 0); | ||
| 8538 | |||
| 8539 | se->load.weight = shares; | ||
| 8540 | se->load.inv_weight = 0; | ||
| 8541 | |||
| 8542 | if (on_rq) | ||
| 8543 | enqueue_entity(cfs_rq, se, 0); | ||
| 8544 | } | ||
| 8545 | |||
| 8546 | static void set_se_shares(struct sched_entity *se, unsigned long shares) | ||
| 8547 | { | ||
| 8548 | struct cfs_rq *cfs_rq = se->cfs_rq; | ||
| 8549 | struct rq *rq = cfs_rq->rq; | ||
| 8550 | unsigned long flags; | ||
| 8551 | |||
| 8552 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
| 8553 | __set_se_shares(se, shares); | ||
| 8554 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
| 8555 | } | ||
| 8556 | |||
| 8557 | static DEFINE_MUTEX(shares_mutex); | 8482 | static DEFINE_MUTEX(shares_mutex); |
| 8558 | 8483 | ||
| 8559 | int sched_group_set_shares(struct task_group *tg, unsigned long shares) | 8484 | int sched_group_set_shares(struct task_group *tg, unsigned long shares) |
| @@ -8576,37 +8501,19 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) | |||
| 8576 | if (tg->shares == shares) | 8501 | if (tg->shares == shares) |
| 8577 | goto done; | 8502 | goto done; |
| 8578 | 8503 | ||
| 8579 | spin_lock_irqsave(&task_group_lock, flags); | ||
| 8580 | for_each_possible_cpu(i) | ||
| 8581 | unregister_fair_sched_group(tg, i); | ||
| 8582 | list_del_rcu(&tg->siblings); | ||
| 8583 | spin_unlock_irqrestore(&task_group_lock, flags); | ||
| 8584 | |||
| 8585 | /* wait for any ongoing reference to this group to finish */ | ||
| 8586 | synchronize_sched(); | ||
| 8587 | |||
| 8588 | /* | ||
| 8589 | * Now we are free to modify the group's share on each cpu | ||
| 8590 | * w/o tripping rebalance_share or load_balance_fair. | ||
| 8591 | */ | ||
| 8592 | tg->shares = shares; | 8504 | tg->shares = shares; |
| 8593 | for_each_possible_cpu(i) { | 8505 | for_each_possible_cpu(i) { |
| 8594 | /* | 8506 | struct rq *rq = cpu_rq(i); |
| 8595 | * force a rebalance | 8507 | struct sched_entity *se; |
| 8596 | */ | 8508 | |
| 8597 | cfs_rq_set_shares(tg->cfs_rq[i], 0); | 8509 | se = tg->se[i]; |
| 8598 | set_se_shares(tg->se[i], shares); | 8510 | /* Propagate contribution to hierarchy */ |
| 8511 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
| 8512 | for_each_sched_entity(se) | ||
| 8513 | update_cfs_shares(group_cfs_rq(se), 0); | ||
| 8514 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
| 8599 | } | 8515 | } |
| 8600 | 8516 | ||
| 8601 | /* | ||
| 8602 | * Enable load balance activity on this group, by inserting it back on | ||
| 8603 | * each cpu's rq->leaf_cfs_rq_list. | ||
| 8604 | */ | ||
| 8605 | spin_lock_irqsave(&task_group_lock, flags); | ||
| 8606 | for_each_possible_cpu(i) | ||
| 8607 | register_fair_sched_group(tg, i); | ||
| 8608 | list_add_rcu(&tg->siblings, &tg->parent->children); | ||
| 8609 | spin_unlock_irqrestore(&task_group_lock, flags); | ||
| 8610 | done: | 8517 | done: |
| 8611 | mutex_unlock(&shares_mutex); | 8518 | mutex_unlock(&shares_mutex); |
| 8612 | return 0; | 8519 | return 0; |
| @@ -8905,7 +8812,7 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) | |||
| 8905 | 8812 | ||
| 8906 | if (!cgrp->parent) { | 8813 | if (!cgrp->parent) { |
| 8907 | /* This is early initialization for the top cgroup */ | 8814 | /* This is early initialization for the top cgroup */ |
| 8908 | return &init_task_group.css; | 8815 | return &root_task_group.css; |
| 8909 | } | 8816 | } |
| 8910 | 8817 | ||
| 8911 | parent = cgroup_tg(cgrp->parent); | 8818 | parent = cgroup_tg(cgrp->parent); |
| @@ -8976,6 +8883,20 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, | |||
| 8976 | } | 8883 | } |
| 8977 | } | 8884 | } |
| 8978 | 8885 | ||
| 8886 | static void | ||
| 8887 | cpu_cgroup_exit(struct cgroup_subsys *ss, struct task_struct *task) | ||
| 8888 | { | ||
| 8889 | /* | ||
| 8890 | * cgroup_exit() is called in the copy_process() failure path. | ||
| 8891 | * Ignore this case since the task hasn't ran yet, this avoids | ||
| 8892 | * trying to poke a half freed task state from generic code. | ||
| 8893 | */ | ||
| 8894 | if (!(task->flags & PF_EXITING)) | ||
| 8895 | return; | ||
| 8896 | |||
| 8897 | sched_move_task(task); | ||
| 8898 | } | ||
| 8899 | |||
| 8979 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8900 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 8980 | static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, | 8901 | static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, |
| 8981 | u64 shareval) | 8902 | u64 shareval) |
| @@ -9048,6 +8969,7 @@ struct cgroup_subsys cpu_cgroup_subsys = { | |||
| 9048 | .destroy = cpu_cgroup_destroy, | 8969 | .destroy = cpu_cgroup_destroy, |
| 9049 | .can_attach = cpu_cgroup_can_attach, | 8970 | .can_attach = cpu_cgroup_can_attach, |
| 9050 | .attach = cpu_cgroup_attach, | 8971 | .attach = cpu_cgroup_attach, |
| 8972 | .exit = cpu_cgroup_exit, | ||
| 9051 | .populate = cpu_cgroup_populate, | 8973 | .populate = cpu_cgroup_populate, |
| 9052 | .subsys_id = cpu_cgroup_subsys_id, | 8974 | .subsys_id = cpu_cgroup_subsys_id, |
| 9053 | .early_init = 1, | 8975 | .early_init = 1, |
| @@ -9332,72 +9254,3 @@ struct cgroup_subsys cpuacct_subsys = { | |||
| 9332 | }; | 9254 | }; |
| 9333 | #endif /* CONFIG_CGROUP_CPUACCT */ | 9255 | #endif /* CONFIG_CGROUP_CPUACCT */ |
| 9334 | 9256 | ||
| 9335 | #ifndef CONFIG_SMP | ||
| 9336 | |||
| 9337 | void synchronize_sched_expedited(void) | ||
| 9338 | { | ||
| 9339 | barrier(); | ||
| 9340 | } | ||
| 9341 | EXPORT_SYMBOL_GPL(synchronize_sched_expedited); | ||
| 9342 | |||
| 9343 | #else /* #ifndef CONFIG_SMP */ | ||
| 9344 | |||
| 9345 | static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0); | ||
| 9346 | |||
| 9347 | static int synchronize_sched_expedited_cpu_stop(void *data) | ||
| 9348 | { | ||
| 9349 | /* | ||
| 9350 | * There must be a full memory barrier on each affected CPU | ||
| 9351 | * between the time that try_stop_cpus() is called and the | ||
| 9352 | * time that it returns. | ||
| 9353 | * | ||
| 9354 | * In the current initial implementation of cpu_stop, the | ||
| 9355 | * above condition is already met when the control reaches | ||
| 9356 | * this point and the following smp_mb() is not strictly | ||
| 9357 | * necessary. Do smp_mb() anyway for documentation and | ||
| 9358 | * robustness against future implementation changes. | ||
| 9359 | */ | ||
| 9360 | smp_mb(); /* See above comment block. */ | ||
| 9361 | return 0; | ||
| 9362 | } | ||
| 9363 | |||
| 9364 | /* | ||
| 9365 | * Wait for an rcu-sched grace period to elapse, but use "big hammer" | ||
| 9366 | * approach to force grace period to end quickly. This consumes | ||
| 9367 | * significant time on all CPUs, and is thus not recommended for | ||
| 9368 | * any sort of common-case code. | ||
| 9369 | * | ||
| 9370 | * Note that it is illegal to call this function while holding any | ||
| 9371 | * lock that is acquired by a CPU-hotplug notifier. Failing to | ||
| 9372 | * observe this restriction will result in deadlock. | ||
| 9373 | */ | ||
| 9374 | void synchronize_sched_expedited(void) | ||
| 9375 | { | ||
| 9376 | int snap, trycount = 0; | ||
| 9377 | |||
| 9378 | smp_mb(); /* ensure prior mod happens before capturing snap. */ | ||
| 9379 | snap = atomic_read(&synchronize_sched_expedited_count) + 1; | ||
| 9380 | get_online_cpus(); | ||
| 9381 | while (try_stop_cpus(cpu_online_mask, | ||
| 9382 | synchronize_sched_expedited_cpu_stop, | ||
| 9383 | NULL) == -EAGAIN) { | ||
| 9384 | put_online_cpus(); | ||
| 9385 | if (trycount++ < 10) | ||
| 9386 | udelay(trycount * num_online_cpus()); | ||
| 9387 | else { | ||
| 9388 | synchronize_sched(); | ||
| 9389 | return; | ||
| 9390 | } | ||
| 9391 | if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) { | ||
| 9392 | smp_mb(); /* ensure test happens before caller kfree */ | ||
| 9393 | return; | ||
| 9394 | } | ||
| 9395 | get_online_cpus(); | ||
| 9396 | } | ||
| 9397 | atomic_inc(&synchronize_sched_expedited_count); | ||
| 9398 | smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */ | ||
| 9399 | put_online_cpus(); | ||
| 9400 | } | ||
| 9401 | EXPORT_SYMBOL_GPL(synchronize_sched_expedited); | ||
| 9402 | |||
| 9403 | #endif /* #else #ifndef CONFIG_SMP */ | ||
diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c new file mode 100644 index 00000000000..9fb65628315 --- /dev/null +++ b/kernel/sched_autogroup.c | |||
| @@ -0,0 +1,270 @@ | |||
| 1 | #ifdef CONFIG_SCHED_AUTOGROUP | ||
| 2 | |||
| 3 | #include <linux/proc_fs.h> | ||
| 4 | #include <linux/seq_file.h> | ||
| 5 | #include <linux/kallsyms.h> | ||
| 6 | #include <linux/utsname.h> | ||
| 7 | |||
| 8 | unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; | ||
| 9 | static struct autogroup autogroup_default; | ||
| 10 | static atomic_t autogroup_seq_nr; | ||
| 11 | |||
| 12 | static void __init autogroup_init(struct task_struct *init_task) | ||
| 13 | { | ||
| 14 | autogroup_default.tg = &root_task_group; | ||
| 15 | root_task_group.autogroup = &autogroup_default; | ||
| 16 | kref_init(&autogroup_default.kref); | ||
| 17 | init_rwsem(&autogroup_default.lock); | ||
| 18 | init_task->signal->autogroup = &autogroup_default; | ||
| 19 | } | ||
| 20 | |||
| 21 | static inline void autogroup_free(struct task_group *tg) | ||
| 22 | { | ||
| 23 | kfree(tg->autogroup); | ||
| 24 | } | ||
| 25 | |||
| 26 | static inline void autogroup_destroy(struct kref *kref) | ||
| 27 | { | ||
| 28 | struct autogroup *ag = container_of(kref, struct autogroup, kref); | ||
| 29 | |||
| 30 | #ifdef CONFIG_RT_GROUP_SCHED | ||
| 31 | /* We've redirected RT tasks to the root task group... */ | ||
| 32 | ag->tg->rt_se = NULL; | ||
| 33 | ag->tg->rt_rq = NULL; | ||
| 34 | #endif | ||
| 35 | sched_destroy_group(ag->tg); | ||
| 36 | } | ||
| 37 | |||
| 38 | static inline void autogroup_kref_put(struct autogroup *ag) | ||
| 39 | { | ||
| 40 | kref_put(&ag->kref, autogroup_destroy); | ||
| 41 | } | ||
| 42 | |||
| 43 | static inline struct autogroup *autogroup_kref_get(struct autogroup *ag) | ||
| 44 | { | ||
| 45 | kref_get(&ag->kref); | ||
| 46 | return ag; | ||
| 47 | } | ||
| 48 | |||
| 49 | static inline struct autogroup *autogroup_task_get(struct task_struct *p) | ||
| 50 | { | ||
| 51 | struct autogroup *ag; | ||
| 52 | unsigned long flags; | ||
| 53 | |||
| 54 | if (!lock_task_sighand(p, &flags)) | ||
| 55 | return autogroup_kref_get(&autogroup_default); | ||
| 56 | |||
| 57 | ag = autogroup_kref_get(p->signal->autogroup); | ||
| 58 | unlock_task_sighand(p, &flags); | ||
| 59 | |||
| 60 | return ag; | ||
| 61 | } | ||
| 62 | |||
| 63 | #ifdef CONFIG_RT_GROUP_SCHED | ||
| 64 | static void free_rt_sched_group(struct task_group *tg); | ||
| 65 | #endif | ||
| 66 | |||
| 67 | static inline struct autogroup *autogroup_create(void) | ||
| 68 | { | ||
| 69 | struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL); | ||
| 70 | struct task_group *tg; | ||
| 71 | |||
| 72 | if (!ag) | ||
| 73 | goto out_fail; | ||
| 74 | |||
| 75 | tg = sched_create_group(&root_task_group); | ||
| 76 | |||
| 77 | if (IS_ERR(tg)) | ||
| 78 | goto out_free; | ||
| 79 | |||
| 80 | kref_init(&ag->kref); | ||
| 81 | init_rwsem(&ag->lock); | ||
| 82 | ag->id = atomic_inc_return(&autogroup_seq_nr); | ||
| 83 | ag->tg = tg; | ||
| 84 | #ifdef CONFIG_RT_GROUP_SCHED | ||
| 85 | /* | ||
| 86 | * Autogroup RT tasks are redirected to the root task group | ||
| 87 | * so we don't have to move tasks around upon policy change, | ||
| 88 | * or flail around trying to allocate bandwidth on the fly. | ||
| 89 | * A bandwidth exception in __sched_setscheduler() allows | ||
| 90 | * the policy change to proceed. Thereafter, task_group() | ||
| 91 | * returns &root_task_group, so zero bandwidth is required. | ||
| 92 | */ | ||
| 93 | free_rt_sched_group(tg); | ||
| 94 | tg->rt_se = root_task_group.rt_se; | ||
| 95 | tg->rt_rq = root_task_group.rt_rq; | ||
| 96 | #endif | ||
| 97 | tg->autogroup = ag; | ||
| 98 | |||
| 99 | return ag; | ||
| 100 | |||
| 101 | out_free: | ||
| 102 | kfree(ag); | ||
| 103 | out_fail: | ||
| 104 | if (printk_ratelimit()) { | ||
| 105 | printk(KERN_WARNING "autogroup_create: %s failure.\n", | ||
| 106 | ag ? "sched_create_group()" : "kmalloc()"); | ||
| 107 | } | ||
| 108 | |||
| 109 | return autogroup_kref_get(&autogroup_default); | ||
| 110 | } | ||
| 111 | |||
| 112 | static inline bool | ||
| 113 | task_wants_autogroup(struct task_struct *p, struct task_group *tg) | ||
| 114 | { | ||
| 115 | if (tg != &root_task_group) | ||
| 116 | return false; | ||
| 117 | |||
| 118 | if (p->sched_class != &fair_sched_class) | ||
| 119 | return false; | ||
| 120 | |||
| 121 | /* | ||
| 122 | * We can only assume the task group can't go away on us if | ||
| 123 | * autogroup_move_group() can see us on ->thread_group list. | ||
| 124 | */ | ||
| 125 | if (p->flags & PF_EXITING) | ||
| 126 | return false; | ||
| 127 | |||
| 128 | return true; | ||
| 129 | } | ||
| 130 | |||
| 131 | static inline bool task_group_is_autogroup(struct task_group *tg) | ||
| 132 | { | ||
| 133 | return tg != &root_task_group && tg->autogroup; | ||
| 134 | } | ||
| 135 | |||
| 136 | static inline struct task_group * | ||
| 137 | autogroup_task_group(struct task_struct *p, struct task_group *tg) | ||
| 138 | { | ||
| 139 | int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled); | ||
| 140 | |||
| 141 | if (enabled && task_wants_autogroup(p, tg)) | ||
| 142 | return p->signal->autogroup->tg; | ||
| 143 | |||
| 144 | return tg; | ||
| 145 | } | ||
| 146 | |||
| 147 | static void | ||
| 148 | autogroup_move_group(struct task_struct *p, struct autogroup *ag) | ||
| 149 | { | ||
| 150 | struct autogroup *prev; | ||
| 151 | struct task_struct *t; | ||
| 152 | unsigned long flags; | ||
| 153 | |||
| 154 | BUG_ON(!lock_task_sighand(p, &flags)); | ||
| 155 | |||
| 156 | prev = p->signal->autogroup; | ||
| 157 | if (prev == ag) { | ||
| 158 | unlock_task_sighand(p, &flags); | ||
| 159 | return; | ||
| 160 | } | ||
| 161 | |||
| 162 | p->signal->autogroup = autogroup_kref_get(ag); | ||
| 163 | |||
| 164 | t = p; | ||
| 165 | do { | ||
| 166 | sched_move_task(t); | ||
| 167 | } while_each_thread(p, t); | ||
| 168 | |||
| 169 | unlock_task_sighand(p, &flags); | ||
| 170 | autogroup_kref_put(prev); | ||
| 171 | } | ||
| 172 | |||
| 173 | /* Allocates GFP_KERNEL, cannot be called under any spinlock */ | ||
| 174 | void sched_autogroup_create_attach(struct task_struct *p) | ||
| 175 | { | ||
| 176 | struct autogroup *ag = autogroup_create(); | ||
| 177 | |||
| 178 | autogroup_move_group(p, ag); | ||
| 179 | /* drop the extra reference added by autogroup_create() */ | ||
| 180 | autogroup_kref_put(ag); | ||
| 181 | } | ||
| 182 | EXPORT_SYMBOL(sched_autogroup_create_attach); | ||
| 183 | |||
| 184 | /* Cannot be called under siglock. Currently has no users */ | ||
| 185 | void sched_autogroup_detach(struct task_struct *p) | ||
| 186 | { | ||
| 187 | autogroup_move_group(p, &autogroup_default); | ||
| 188 | } | ||
| 189 | EXPORT_SYMBOL(sched_autogroup_detach); | ||
| 190 | |||
| 191 | void sched_autogroup_fork(struct signal_struct *sig) | ||
| 192 | { | ||
| 193 | sig->autogroup = autogroup_task_get(current); | ||
| 194 | } | ||
| 195 | |||
| 196 | void sched_autogroup_exit(struct signal_struct *sig) | ||
| 197 | { | ||
| 198 | autogroup_kref_put(sig->autogroup); | ||
| 199 | } | ||
| 200 | |||
| 201 | static int __init setup_autogroup(char *str) | ||
| 202 | { | ||
| 203 | sysctl_sched_autogroup_enabled = 0; | ||
| 204 | |||
| 205 | return 1; | ||
| 206 | } | ||
| 207 | |||
| 208 | __setup("noautogroup", setup_autogroup); | ||
| 209 | |||
| 210 | #ifdef CONFIG_PROC_FS | ||
| 211 | |||
| 212 | int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice) | ||
| 213 | { | ||
| 214 | static unsigned long next = INITIAL_JIFFIES; | ||
| 215 | struct autogroup *ag; | ||
| 216 | int err; | ||
| 217 | |||
| 218 | if (*nice < -20 || *nice > 19) | ||
| 219 | return -EINVAL; | ||
| 220 | |||
| 221 | err = security_task_setnice(current, *nice); | ||
| 222 | if (err) | ||
| 223 | return err; | ||
| 224 | |||
| 225 | if (*nice < 0 && !can_nice(current, *nice)) | ||
| 226 | return -EPERM; | ||
| 227 | |||
| 228 | /* this is a heavy operation taking global locks.. */ | ||
| 229 | if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next)) | ||
| 230 | return -EAGAIN; | ||
| 231 | |||
| 232 | next = HZ / 10 + jiffies; | ||
| 233 | ag = autogroup_task_get(p); | ||
| 234 | |||
| 235 | down_write(&ag->lock); | ||
| 236 | err = sched_group_set_shares(ag->tg, prio_to_weight[*nice + 20]); | ||
| 237 | if (!err) | ||
| 238 | ag->nice = *nice; | ||
| 239 | up_write(&ag->lock); | ||
| 240 | |||
| 241 | autogroup_kref_put(ag); | ||
| 242 | |||
| 243 | return err; | ||
| 244 | } | ||
| 245 | |||
| 246 | void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m) | ||
| 247 | { | ||
| 248 | struct autogroup *ag = autogroup_task_get(p); | ||
| 249 | |||
| 250 | down_read(&ag->lock); | ||
| 251 | seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice); | ||
| 252 | up_read(&ag->lock); | ||
| 253 | |||
| 254 | autogroup_kref_put(ag); | ||
| 255 | } | ||
| 256 | #endif /* CONFIG_PROC_FS */ | ||
| 257 | |||
| 258 | #ifdef CONFIG_SCHED_DEBUG | ||
| 259 | static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) | ||
| 260 | { | ||
| 261 | int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled); | ||
| 262 | |||
| 263 | if (!enabled || !tg->autogroup) | ||
| 264 | return 0; | ||
| 265 | |||
| 266 | return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); | ||
| 267 | } | ||
| 268 | #endif /* CONFIG_SCHED_DEBUG */ | ||
| 269 | |||
| 270 | #endif /* CONFIG_SCHED_AUTOGROUP */ | ||
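The /proc/<pid>/autogroup nice handler above maps the requested nice level onto group shares through the scheduler's usual nice-to-weight table (prio_to_weight[nice + 20]), so one write renices the whole session by rescaling its group entity rather than touching individual tasks. A minimal user-space sketch of that mapping, assuming the well-known table values for nice -5..+5; the helper and program below are illustrative only, not a kernel interface:

#include <stdio.h>

/* Excerpt of the scheduler's nice-to-weight table for nice -5..+5;
 * nice 0 maps to 1024 and each step changes the weight by ~1.25x. */
static const unsigned int nice_to_weight[] = {
	3121, 2501, 1991, 1586, 1277,	/* nice -5 .. -1 */
	1024,				/* nice  0       */
	 820,  655,  526,  423,  335,	/* nice +1 .. +5 */
};

int main(void)
{
	int nice;

	for (nice = -5; nice <= 5; nice++)
		printf("nice %+d -> autogroup shares %u\n",
		       nice, nice_to_weight[nice + 5]);
	return 0;
}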
diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h new file mode 100644 index 00000000000..7b859ffe5da --- /dev/null +++ b/kernel/sched_autogroup.h | |||
| @@ -0,0 +1,36 @@ | |||
| 1 | #ifdef CONFIG_SCHED_AUTOGROUP | ||
| 2 | |||
| 3 | struct autogroup { | ||
| 4 | struct kref kref; | ||
| 5 | struct task_group *tg; | ||
| 6 | struct rw_semaphore lock; | ||
| 7 | unsigned long id; | ||
| 8 | int nice; | ||
| 9 | }; | ||
| 10 | |||
| 11 | static inline struct task_group * | ||
| 12 | autogroup_task_group(struct task_struct *p, struct task_group *tg); | ||
| 13 | |||
| 14 | #else /* !CONFIG_SCHED_AUTOGROUP */ | ||
| 15 | |||
| 16 | static inline void autogroup_init(struct task_struct *init_task) { } | ||
| 17 | static inline void autogroup_free(struct task_group *tg) { } | ||
| 18 | static inline bool task_group_is_autogroup(struct task_group *tg) | ||
| 19 | { | ||
| 20 | return 0; | ||
| 21 | } | ||
| 22 | |||
| 23 | static inline struct task_group * | ||
| 24 | autogroup_task_group(struct task_struct *p, struct task_group *tg) | ||
| 25 | { | ||
| 26 | return tg; | ||
| 27 | } | ||
| 28 | |||
| 29 | #ifdef CONFIG_SCHED_DEBUG | ||
| 30 | static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) | ||
| 31 | { | ||
| 32 | return 0; | ||
| 33 | } | ||
| 34 | #endif | ||
| 35 | |||
| 36 | #endif /* CONFIG_SCHED_AUTOGROUP */ | ||
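The !CONFIG_SCHED_AUTOGROUP half of the header follows the usual kernel convention of empty inline stubs, so sched.c can call autogroup_init(), autogroup_free() and autogroup_task_group() without any #ifdef at the call sites. A tiny stand-alone illustration of the same pattern (the feature name and helper below are invented for the example):

#include <stdio.h>

/* #define CONFIG_EXAMPLE_FEATURE */	/* uncomment to take the real path */

#ifdef CONFIG_EXAMPLE_FEATURE
static int feature_weight(int w) { return w * 2; }	/* "real" version */
#else
/* stub: compiles to a no-op wrapper, callers stay free of #ifdefs */
static inline int feature_weight(int w) { return w; }
#endif

int main(void)
{
	printf("%d\n", feature_weight(21));	/* 21 without the feature, 42 with it */
	return 0;
}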
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 52f1a149bfb..9d8af0b3fb6 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c | |||
| @@ -79,7 +79,7 @@ unsigned long long __attribute__((weak)) sched_clock(void) | |||
| 79 | } | 79 | } |
| 80 | EXPORT_SYMBOL_GPL(sched_clock); | 80 | EXPORT_SYMBOL_GPL(sched_clock); |
| 81 | 81 | ||
| 82 | static __read_mostly int sched_clock_running; | 82 | __read_mostly int sched_clock_running; |
| 83 | 83 | ||
| 84 | #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK | 84 | #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK |
| 85 | __read_mostly int sched_clock_stable; | 85 | __read_mostly int sched_clock_stable; |
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 2e1b0d17dd9..eb6cb8edd07 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c | |||
| @@ -16,6 +16,8 @@ | |||
| 16 | #include <linux/kallsyms.h> | 16 | #include <linux/kallsyms.h> |
| 17 | #include <linux/utsname.h> | 17 | #include <linux/utsname.h> |
| 18 | 18 | ||
| 19 | static DEFINE_SPINLOCK(sched_debug_lock); | ||
| 20 | |||
| 19 | /* | 21 | /* |
| 20 | * This allows printing both to /proc/sched_debug and | 22 | * This allows printing both to /proc/sched_debug and |
| 21 | * to the console | 23 | * to the console |
| @@ -54,8 +56,7 @@ static unsigned long nsec_low(unsigned long long nsec) | |||
| 54 | #define SPLIT_NS(x) nsec_high(x), nsec_low(x) | 56 | #define SPLIT_NS(x) nsec_high(x), nsec_low(x) |
| 55 | 57 | ||
| 56 | #ifdef CONFIG_FAIR_GROUP_SCHED | 58 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 57 | static void print_cfs_group_stats(struct seq_file *m, int cpu, | 59 | static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg) |
| 58 | struct task_group *tg) | ||
| 59 | { | 60 | { |
| 60 | struct sched_entity *se = tg->se[cpu]; | 61 | struct sched_entity *se = tg->se[cpu]; |
| 61 | if (!se) | 62 | if (!se) |
| @@ -87,6 +88,26 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, | |||
| 87 | } | 88 | } |
| 88 | #endif | 89 | #endif |
| 89 | 90 | ||
| 91 | #ifdef CONFIG_CGROUP_SCHED | ||
| 92 | static char group_path[PATH_MAX]; | ||
| 93 | |||
| 94 | static char *task_group_path(struct task_group *tg) | ||
| 95 | { | ||
| 96 | if (autogroup_path(tg, group_path, PATH_MAX)) | ||
| 97 | return group_path; | ||
| 98 | |||
| 99 | /* | ||
| 100 | * May be NULL if the underlying cgroup isn't fully created yet. | ||
| 101 | */ | ||
| 102 | if (!tg->css.cgroup) { | ||
| 103 | group_path[0] = '\0'; | ||
| 104 | return group_path; | ||
| 105 | } | ||
| 106 | cgroup_path(tg->css.cgroup, group_path, PATH_MAX); | ||
| 107 | return group_path; | ||
| 108 | } | ||
| 109 | #endif | ||
| 110 | |||
| 90 | static void | 111 | static void |
| 91 | print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) | 112 | print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) |
| 92 | { | 113 | { |
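task_group_path() above now formats into one static PATH_MAX buffer instead of a per-caller stack array, which is presumably why print_cpu() later in this patch wraps its output in the new sched_debug_lock: the shared scratch buffer must not be written by two dumpers at once. A hedged user-space sketch of that "static scratch buffer guarded by a lock" pattern (pthreads and the names below are used purely for illustration):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t debug_lock = PTHREAD_MUTEX_INITIALIZER;
static char scratch[4096];		/* shared scratch, like group_path[] */

/* Every caller serializes on debug_lock; without it, two concurrent
 * dumpers could interleave their writes into the same static buffer. */
static void show_group(long id)
{
	pthread_mutex_lock(&debug_lock);
	snprintf(scratch, sizeof(scratch), "/autogroup-%ld", id);
	printf("%s\n", scratch);
	pthread_mutex_unlock(&debug_lock);
}

int main(void)
{
	show_group(42);
	return 0;
}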
| @@ -109,17 +130,10 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) | |||
| 109 | SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", | 130 | SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", |
| 110 | 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); | 131 | 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); |
| 111 | #endif | 132 | #endif |
| 112 | |||
| 113 | #ifdef CONFIG_CGROUP_SCHED | 133 | #ifdef CONFIG_CGROUP_SCHED |
| 114 | { | 134 | SEQ_printf(m, " %s", task_group_path(task_group(p))); |
| 115 | char path[64]; | ||
| 116 | |||
| 117 | rcu_read_lock(); | ||
| 118 | cgroup_path(task_group(p)->css.cgroup, path, sizeof(path)); | ||
| 119 | rcu_read_unlock(); | ||
| 120 | SEQ_printf(m, " %s", path); | ||
| 121 | } | ||
| 122 | #endif | 135 | #endif |
| 136 | |||
| 123 | SEQ_printf(m, "\n"); | 137 | SEQ_printf(m, "\n"); |
| 124 | } | 138 | } |
| 125 | 139 | ||
| @@ -147,19 +161,6 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) | |||
| 147 | read_unlock_irqrestore(&tasklist_lock, flags); | 161 | read_unlock_irqrestore(&tasklist_lock, flags); |
| 148 | } | 162 | } |
| 149 | 163 | ||
| 150 | #if defined(CONFIG_CGROUP_SCHED) && \ | ||
| 151 | (defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED)) | ||
| 152 | static void task_group_path(struct task_group *tg, char *buf, int buflen) | ||
| 153 | { | ||
| 154 | /* may be NULL if the underlying cgroup isn't fully-created yet */ | ||
| 155 | if (!tg->css.cgroup) { | ||
| 156 | buf[0] = '\0'; | ||
| 157 | return; | ||
| 158 | } | ||
| 159 | cgroup_path(tg->css.cgroup, buf, buflen); | ||
| 160 | } | ||
| 161 | #endif | ||
| 162 | |||
| 163 | void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | 164 | void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) |
| 164 | { | 165 | { |
| 165 | s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, | 166 | s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, |
| @@ -168,13 +169,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | |||
| 168 | struct sched_entity *last; | 169 | struct sched_entity *last; |
| 169 | unsigned long flags; | 170 | unsigned long flags; |
| 170 | 171 | ||
| 171 | #if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED) | 172 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 172 | char path[128]; | 173 | SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg)); |
| 173 | struct task_group *tg = cfs_rq->tg; | ||
| 174 | |||
| 175 | task_group_path(tg, path, sizeof(path)); | ||
| 176 | |||
| 177 | SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path); | ||
| 178 | #else | 174 | #else |
| 179 | SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); | 175 | SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); |
| 180 | #endif | 176 | #endif |
| @@ -202,33 +198,34 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | |||
| 202 | spread0 = min_vruntime - rq0_min_vruntime; | 198 | spread0 = min_vruntime - rq0_min_vruntime; |
| 203 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0", | 199 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0", |
| 204 | SPLIT_NS(spread0)); | 200 | SPLIT_NS(spread0)); |
| 205 | SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); | ||
| 206 | SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); | ||
| 207 | |||
| 208 | SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", | 201 | SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", |
| 209 | cfs_rq->nr_spread_over); | 202 | cfs_rq->nr_spread_over); |
| 203 | SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); | ||
| 204 | SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); | ||
| 210 | #ifdef CONFIG_FAIR_GROUP_SCHED | 205 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 211 | #ifdef CONFIG_SMP | 206 | #ifdef CONFIG_SMP |
| 212 | SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares); | 207 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_avg", |
| 208 | SPLIT_NS(cfs_rq->load_avg)); | ||
| 209 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_period", | ||
| 210 | SPLIT_NS(cfs_rq->load_period)); | ||
| 211 | SEQ_printf(m, " .%-30s: %ld\n", "load_contrib", | ||
| 212 | cfs_rq->load_contribution); | ||
| 213 | SEQ_printf(m, " .%-30s: %d\n", "load_tg", | ||
| 214 | atomic_read(&cfs_rq->tg->load_weight)); | ||
| 213 | #endif | 215 | #endif |
| 216 | |||
| 214 | print_cfs_group_stats(m, cpu, cfs_rq->tg); | 217 | print_cfs_group_stats(m, cpu, cfs_rq->tg); |
| 215 | #endif | 218 | #endif |
| 216 | } | 219 | } |
| 217 | 220 | ||
| 218 | void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) | 221 | void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) |
| 219 | { | 222 | { |
| 220 | #if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED) | 223 | #ifdef CONFIG_RT_GROUP_SCHED |
| 221 | char path[128]; | 224 | SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg)); |
| 222 | struct task_group *tg = rt_rq->tg; | ||
| 223 | |||
| 224 | task_group_path(tg, path, sizeof(path)); | ||
| 225 | |||
| 226 | SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path); | ||
| 227 | #else | 225 | #else |
| 228 | SEQ_printf(m, "\nrt_rq[%d]:\n", cpu); | 226 | SEQ_printf(m, "\nrt_rq[%d]:\n", cpu); |
| 229 | #endif | 227 | #endif |
| 230 | 228 | ||
| 231 | |||
| 232 | #define P(x) \ | 229 | #define P(x) \ |
| 233 | SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x)) | 230 | SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x)) |
| 234 | #define PN(x) \ | 231 | #define PN(x) \ |
| @@ -243,9 +240,12 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) | |||
| 243 | #undef P | 240 | #undef P |
| 244 | } | 241 | } |
| 245 | 242 | ||
| 243 | extern __read_mostly int sched_clock_running; | ||
| 244 | |||
| 246 | static void print_cpu(struct seq_file *m, int cpu) | 245 | static void print_cpu(struct seq_file *m, int cpu) |
| 247 | { | 246 | { |
| 248 | struct rq *rq = cpu_rq(cpu); | 247 | struct rq *rq = cpu_rq(cpu); |
| 248 | unsigned long flags; | ||
| 249 | 249 | ||
| 250 | #ifdef CONFIG_X86 | 250 | #ifdef CONFIG_X86 |
| 251 | { | 251 | { |
| @@ -296,14 +296,20 @@ static void print_cpu(struct seq_file *m, int cpu) | |||
| 296 | P(ttwu_count); | 296 | P(ttwu_count); |
| 297 | P(ttwu_local); | 297 | P(ttwu_local); |
| 298 | 298 | ||
| 299 | P(bkl_count); | 299 | SEQ_printf(m, " .%-30s: %d\n", "bkl_count", |
| 300 | rq->rq_sched_info.bkl_count); | ||
| 300 | 301 | ||
| 301 | #undef P | 302 | #undef P |
| 303 | #undef P64 | ||
| 302 | #endif | 304 | #endif |
| 305 | spin_lock_irqsave(&sched_debug_lock, flags); | ||
| 303 | print_cfs_stats(m, cpu); | 306 | print_cfs_stats(m, cpu); |
| 304 | print_rt_stats(m, cpu); | 307 | print_rt_stats(m, cpu); |
| 305 | 308 | ||
| 309 | rcu_read_lock(); | ||
| 306 | print_rq(m, rq, cpu); | 310 | print_rq(m, rq, cpu); |
| 311 | rcu_read_unlock(); | ||
| 312 | spin_unlock_irqrestore(&sched_debug_lock, flags); | ||
| 307 | } | 313 | } |
| 308 | 314 | ||
| 309 | static const char *sched_tunable_scaling_names[] = { | 315 | static const char *sched_tunable_scaling_names[] = { |
| @@ -314,21 +320,42 @@ static const char *sched_tunable_scaling_names[] = { | |||
| 314 | 320 | ||
| 315 | static int sched_debug_show(struct seq_file *m, void *v) | 321 | static int sched_debug_show(struct seq_file *m, void *v) |
| 316 | { | 322 | { |
| 317 | u64 now = ktime_to_ns(ktime_get()); | 323 | u64 ktime, sched_clk, cpu_clk; |
| 324 | unsigned long flags; | ||
| 318 | int cpu; | 325 | int cpu; |
| 319 | 326 | ||
| 320 | SEQ_printf(m, "Sched Debug Version: v0.09, %s %.*s\n", | 327 | local_irq_save(flags); |
| 328 | ktime = ktime_to_ns(ktime_get()); | ||
| 329 | sched_clk = sched_clock(); | ||
| 330 | cpu_clk = local_clock(); | ||
| 331 | local_irq_restore(flags); | ||
| 332 | |||
| 333 | SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n", | ||
| 321 | init_utsname()->release, | 334 | init_utsname()->release, |
| 322 | (int)strcspn(init_utsname()->version, " "), | 335 | (int)strcspn(init_utsname()->version, " "), |
| 323 | init_utsname()->version); | 336 | init_utsname()->version); |
| 324 | 337 | ||
| 325 | SEQ_printf(m, "now at %Lu.%06ld msecs\n", SPLIT_NS(now)); | 338 | #define P(x) \ |
| 339 | SEQ_printf(m, "%-40s: %Ld\n", #x, (long long)(x)) | ||
| 340 | #define PN(x) \ | ||
| 341 | SEQ_printf(m, "%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) | ||
| 342 | PN(ktime); | ||
| 343 | PN(sched_clk); | ||
| 344 | PN(cpu_clk); | ||
| 345 | P(jiffies); | ||
| 346 | #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK | ||
| 347 | P(sched_clock_stable); | ||
| 348 | #endif | ||
| 349 | #undef PN | ||
| 350 | #undef P | ||
| 351 | |||
| 352 | SEQ_printf(m, "\n"); | ||
| 353 | SEQ_printf(m, "sysctl_sched\n"); | ||
| 326 | 354 | ||
| 327 | #define P(x) \ | 355 | #define P(x) \ |
| 328 | SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x)) | 356 | SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x)) |
| 329 | #define PN(x) \ | 357 | #define PN(x) \ |
| 330 | SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) | 358 | SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) |
| 331 | P(jiffies); | ||
| 332 | PN(sysctl_sched_latency); | 359 | PN(sysctl_sched_latency); |
| 333 | PN(sysctl_sched_min_granularity); | 360 | PN(sysctl_sched_min_granularity); |
| 334 | PN(sysctl_sched_wakeup_granularity); | 361 | PN(sysctl_sched_wakeup_granularity); |
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index f4f6a8326dd..0c26e2df450 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c | |||
| @@ -89,6 +89,13 @@ unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; | |||
| 89 | 89 | ||
| 90 | const_debug unsigned int sysctl_sched_migration_cost = 500000UL; | 90 | const_debug unsigned int sysctl_sched_migration_cost = 500000UL; |
| 91 | 91 | ||
| 92 | /* | ||
| 93 | * The exponential sliding window over which load is averaged for shares | ||
| 94 | * distribution. | ||
| 95 | * (default: 10msec) | ||
| 96 | */ | ||
| 97 | unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; | ||
| 98 | |||
| 92 | static const struct sched_class fair_sched_class; | 99 | static const struct sched_class fair_sched_class; |
| 93 | 100 | ||
| 94 | /************************************************************** | 101 | /************************************************************** |
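The new sysctl_sched_shares_window above defines the exponential window that the per-cpu group load average is folded over (update_cfs_load() further down in this patch halves both load_avg and load_period whenever the accumulated period crosses the window). A stand-alone sketch of that aging with simplified types and made-up tick lengths, just to show the geometric decay:

#include <stdio.h>
#include <stdint.h>

#define SHARES_WINDOW_NS 10000000ULL	/* 10 msec, the default added above */

/* Fold "delta" ns during which "load" was runnable into the average,
 * then age the history the way the while() loop in update_cfs_load()
 * does: halve both sums once per window overflow. */
static void fold_load(uint64_t *load_avg, uint64_t *load_period,
		      uint64_t delta, unsigned long load)
{
	*load_period += delta;
	if (load)
		*load_avg += delta * load;

	while (*load_period > SHARES_WINDOW_NS) {
		*load_period /= 2;
		*load_avg /= 2;
	}
}

int main(void)
{
	uint64_t avg = 0, period = 0;
	int i;

	/* four 5 ms busy ticks at weight 1024, then four idle ticks:
	 * the average ramps up to ~1024, then decays geometrically
	 * with a half-life of roughly one window. */
	for (i = 0; i < 8; i++) {
		fold_load(&avg, &period, 5000000ULL, i < 4 ? 1024 : 0);
		printf("tick %d: avg load %llu\n", i,
		       (unsigned long long)(avg / (period + 1)));
	}
	return 0;
}

The avg / (period + 1) read-out mirrors the div64_u64(load_avg, load_period + 1) used by update_cfs_rq_load_contribution() when folding the per-cpu figure into tg->load_weight.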
| @@ -143,6 +150,36 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) | |||
| 143 | return cfs_rq->tg->cfs_rq[this_cpu]; | 150 | return cfs_rq->tg->cfs_rq[this_cpu]; |
| 144 | } | 151 | } |
| 145 | 152 | ||
| 153 | static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) | ||
| 154 | { | ||
| 155 | if (!cfs_rq->on_list) { | ||
| 156 | /* | ||
| 157 | * Ensure we either appear before our parent (if already | ||
| 158 | * enqueued) or force our parent to appear after us when it is | ||
| 159 | * enqueued. The fact that we always enqueue bottom-up | ||
| 160 | * reduces this to two cases. | ||
| 161 | */ | ||
| 162 | if (cfs_rq->tg->parent && | ||
| 163 | cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) { | ||
| 164 | list_add_rcu(&cfs_rq->leaf_cfs_rq_list, | ||
| 165 | &rq_of(cfs_rq)->leaf_cfs_rq_list); | ||
| 166 | } else { | ||
| 167 | list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, | ||
| 168 | &rq_of(cfs_rq)->leaf_cfs_rq_list); | ||
| 169 | } | ||
| 170 | |||
| 171 | cfs_rq->on_list = 1; | ||
| 172 | } | ||
| 173 | } | ||
| 174 | |||
| 175 | static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) | ||
| 176 | { | ||
| 177 | if (cfs_rq->on_list) { | ||
| 178 | list_del_rcu(&cfs_rq->leaf_cfs_rq_list); | ||
| 179 | cfs_rq->on_list = 0; | ||
| 180 | } | ||
| 181 | } | ||
| 182 | |||
| 146 | /* Iterate thr' all leaf cfs_rq's on a runqueue */ | 183 | /* Iterate thr' all leaf cfs_rq's on a runqueue */ |
| 147 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ | 184 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ |
| 148 | list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) | 185 | list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) |
| @@ -246,6 +283,14 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) | |||
| 246 | return &cpu_rq(this_cpu)->cfs; | 283 | return &cpu_rq(this_cpu)->cfs; |
| 247 | } | 284 | } |
| 248 | 285 | ||
| 286 | static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) | ||
| 287 | { | ||
| 288 | } | ||
| 289 | |||
| 290 | static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) | ||
| 291 | { | ||
| 292 | } | ||
| 293 | |||
| 249 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ | 294 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ |
| 250 | for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) | 295 | for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) |
| 251 | 296 | ||
| @@ -417,7 +462,6 @@ int sched_proc_update_handler(struct ctl_table *table, int write, | |||
| 417 | WRT_SYSCTL(sched_min_granularity); | 462 | WRT_SYSCTL(sched_min_granularity); |
| 418 | WRT_SYSCTL(sched_latency); | 463 | WRT_SYSCTL(sched_latency); |
| 419 | WRT_SYSCTL(sched_wakeup_granularity); | 464 | WRT_SYSCTL(sched_wakeup_granularity); |
| 420 | WRT_SYSCTL(sched_shares_ratelimit); | ||
| 421 | #undef WRT_SYSCTL | 465 | #undef WRT_SYSCTL |
| 422 | 466 | ||
| 423 | return 0; | 467 | return 0; |
| @@ -495,6 +539,9 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 495 | return calc_delta_fair(sched_slice(cfs_rq, se), se); | 539 | return calc_delta_fair(sched_slice(cfs_rq, se), se); |
| 496 | } | 540 | } |
| 497 | 541 | ||
| 542 | static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update); | ||
| 543 | static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta); | ||
| 544 | |||
| 498 | /* | 545 | /* |
| 499 | * Update the current task's runtime statistics. Skip current tasks that | 546 | * Update the current task's runtime statistics. Skip current tasks that |
| 500 | * are not in our scheduling class. | 547 | * are not in our scheduling class. |
| @@ -514,6 +561,10 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, | |||
| 514 | 561 | ||
| 515 | curr->vruntime += delta_exec_weighted; | 562 | curr->vruntime += delta_exec_weighted; |
| 516 | update_min_vruntime(cfs_rq); | 563 | update_min_vruntime(cfs_rq); |
| 564 | |||
| 565 | #if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED | ||
| 566 | cfs_rq->load_unacc_exec_time += delta_exec; | ||
| 567 | #endif | ||
| 517 | } | 568 | } |
| 518 | 569 | ||
| 519 | static void update_curr(struct cfs_rq *cfs_rq) | 570 | static void update_curr(struct cfs_rq *cfs_rq) |
| @@ -633,7 +684,6 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 633 | list_add(&se->group_node, &cfs_rq->tasks); | 684 | list_add(&se->group_node, &cfs_rq->tasks); |
| 634 | } | 685 | } |
| 635 | cfs_rq->nr_running++; | 686 | cfs_rq->nr_running++; |
| 636 | se->on_rq = 1; | ||
| 637 | } | 687 | } |
| 638 | 688 | ||
| 639 | static void | 689 | static void |
| @@ -647,9 +697,165 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 647 | list_del_init(&se->group_node); | 697 | list_del_init(&se->group_node); |
| 648 | } | 698 | } |
| 649 | cfs_rq->nr_running--; | 699 | cfs_rq->nr_running--; |
| 650 | se->on_rq = 0; | ||
| 651 | } | 700 | } |
| 652 | 701 | ||
| 702 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 703 | # ifdef CONFIG_SMP | ||
| 704 | static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq, | ||
| 705 | int global_update) | ||
| 706 | { | ||
| 707 | struct task_group *tg = cfs_rq->tg; | ||
| 708 | long load_avg; | ||
| 709 | |||
| 710 | load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1); | ||
| 711 | load_avg -= cfs_rq->load_contribution; | ||
| 712 | |||
| 713 | if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) { | ||
| 714 | atomic_add(load_avg, &tg->load_weight); | ||
| 715 | cfs_rq->load_contribution += load_avg; | ||
| 716 | } | ||
| 717 | } | ||
| 718 | |||
| 719 | static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) | ||
| 720 | { | ||
| 721 | u64 period = sysctl_sched_shares_window; | ||
| 722 | u64 now, delta; | ||
| 723 | unsigned long load = cfs_rq->load.weight; | ||
| 724 | |||
| 725 | if (cfs_rq->tg == &root_task_group) | ||
| 726 | return; | ||
| 727 | |||
| 728 | now = rq_of(cfs_rq)->clock_task; | ||
| 729 | delta = now - cfs_rq->load_stamp; | ||
| 730 | |||
| 731 | /* truncate load history at 4 idle periods */ | ||
| 732 | if (cfs_rq->load_stamp > cfs_rq->load_last && | ||
| 733 | now - cfs_rq->load_last > 4 * period) { | ||
| 734 | cfs_rq->load_period = 0; | ||
| 735 | cfs_rq->load_avg = 0; | ||
| 736 | } | ||
| 737 | |||
| 738 | cfs_rq->load_stamp = now; | ||
| 739 | cfs_rq->load_unacc_exec_time = 0; | ||
| 740 | cfs_rq->load_period += delta; | ||
| 741 | if (load) { | ||
| 742 | cfs_rq->load_last = now; | ||
| 743 | cfs_rq->load_avg += delta * load; | ||
| 744 | } | ||
| 745 | |||
| 746 | /* consider updating load contribution on each fold or truncate */ | ||
| 747 | if (global_update || cfs_rq->load_period > period | ||
| 748 | || !cfs_rq->load_period) | ||
| 749 | update_cfs_rq_load_contribution(cfs_rq, global_update); | ||
| 750 | |||
| 751 | while (cfs_rq->load_period > period) { | ||
| 752 | /* | ||
| 753 | * Inline assembly required to prevent the compiler | ||
| 754 | * optimising this loop into a divmod call. | ||
| 755 | * See __iter_div_u64_rem() for another example of this. | ||
| 756 | */ | ||
| 757 | asm("" : "+rm" (cfs_rq->load_period)); | ||
| 758 | cfs_rq->load_period /= 2; | ||
| 759 | cfs_rq->load_avg /= 2; | ||
| 760 | } | ||
| 761 | |||
| 762 | if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg) | ||
| 763 | list_del_leaf_cfs_rq(cfs_rq); | ||
| 764 | } | ||
| 765 | |||
| 766 | static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg, | ||
| 767 | long weight_delta) | ||
| 768 | { | ||
| 769 | long load_weight, load, shares; | ||
| 770 | |||
| 771 | load = cfs_rq->load.weight + weight_delta; | ||
| 772 | |||
| 773 | load_weight = atomic_read(&tg->load_weight); | ||
| 774 | load_weight -= cfs_rq->load_contribution; | ||
| 775 | load_weight += load; | ||
| 776 | |||
| 777 | shares = (tg->shares * load); | ||
| 778 | if (load_weight) | ||
| 779 | shares /= load_weight; | ||
| 780 | |||
| 781 | if (shares < MIN_SHARES) | ||
| 782 | shares = MIN_SHARES; | ||
| 783 | if (shares > tg->shares) | ||
| 784 | shares = tg->shares; | ||
| 785 | |||
| 786 | return shares; | ||
| 787 | } | ||
| 788 | |||
| 789 | static void update_entity_shares_tick(struct cfs_rq *cfs_rq) | ||
| 790 | { | ||
| 791 | if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) { | ||
| 792 | update_cfs_load(cfs_rq, 0); | ||
| 793 | update_cfs_shares(cfs_rq, 0); | ||
| 794 | } | ||
| 795 | } | ||
| 796 | # else /* CONFIG_SMP */ | ||
| 797 | static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) | ||
| 798 | { | ||
| 799 | } | ||
| 800 | |||
| 801 | static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg, | ||
| 802 | long weight_delta) | ||
| 803 | { | ||
| 804 | return tg->shares; | ||
| 805 | } | ||
| 806 | |||
| 807 | static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq) | ||
| 808 | { | ||
| 809 | } | ||
| 810 | # endif /* CONFIG_SMP */ | ||
| 811 | static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, | ||
| 812 | unsigned long weight) | ||
| 813 | { | ||
| 814 | if (se->on_rq) { | ||
| 815 | /* commit outstanding execution time */ | ||
| 816 | if (cfs_rq->curr == se) | ||
| 817 | update_curr(cfs_rq); | ||
| 818 | account_entity_dequeue(cfs_rq, se); | ||
| 819 | } | ||
| 820 | |||
| 821 | update_load_set(&se->load, weight); | ||
| 822 | |||
| 823 | if (se->on_rq) | ||
| 824 | account_entity_enqueue(cfs_rq, se); | ||
| 825 | } | ||
| 826 | |||
| 827 | static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) | ||
| 828 | { | ||
| 829 | struct task_group *tg; | ||
| 830 | struct sched_entity *se; | ||
| 831 | long shares; | ||
| 832 | |||
| 833 | tg = cfs_rq->tg; | ||
| 834 | se = tg->se[cpu_of(rq_of(cfs_rq))]; | ||
| 835 | if (!se) | ||
| 836 | return; | ||
| 837 | #ifndef CONFIG_SMP | ||
| 838 | if (likely(se->load.weight == tg->shares)) | ||
| 839 | return; | ||
| 840 | #endif | ||
| 841 | shares = calc_cfs_shares(cfs_rq, tg, weight_delta); | ||
| 842 | |||
| 843 | reweight_entity(cfs_rq_of(se), se, shares); | ||
| 844 | } | ||
| 845 | #else /* CONFIG_FAIR_GROUP_SCHED */ | ||
| 846 | static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) | ||
| 847 | { | ||
| 848 | } | ||
| 849 | |||
| 850 | static inline void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) | ||
| 851 | { | ||
| 852 | } | ||
| 853 | |||
| 854 | static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq) | ||
| 855 | { | ||
| 856 | } | ||
| 857 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
| 858 | |||
| 653 | static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | 859 | static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) |
| 654 | { | 860 | { |
| 655 | #ifdef CONFIG_SCHEDSTATS | 861 | #ifdef CONFIG_SCHEDSTATS |
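calc_cfs_shares() in the hunk above hands each cpu's group entity a slice of tg->shares proportional to that cpu's share of the group-wide load, clamped to [MIN_SHARES, tg->shares]. A worked example of just that arithmetic with invented numbers (plain C, not the kernel structs):

#include <stdio.h>

#define MIN_SHARES 2

/* Mirror of the calc_cfs_shares() arithmetic: scale the group's shares
 * by this cpu's portion of the group-wide load, then clamp. */
static long group_entity_shares(long tg_shares, long group_load,
				long cpu_contrib, long cpu_load)
{
	long load_weight = group_load - cpu_contrib + cpu_load;
	long shares = tg_shares * cpu_load;

	if (load_weight)
		shares /= load_weight;

	if (shares < MIN_SHARES)
		shares = MIN_SHARES;
	if (shares > tg_shares)
		shares = tg_shares;
	return shares;
}

int main(void)
{
	/* a group with 1024 shares and 3072 total load, of which this
	 * cpu carries 1024: its group entity ends up weighing ~341. */
	printf("%ld\n", group_entity_shares(1024, 3072, 1024, 1024));
	return 0;
}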
| @@ -771,6 +977,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
| 771 | * Update run-time statistics of the 'current'. | 977 | * Update run-time statistics of the 'current'. |
| 772 | */ | 978 | */ |
| 773 | update_curr(cfs_rq); | 979 | update_curr(cfs_rq); |
| 980 | update_cfs_load(cfs_rq, 0); | ||
| 981 | update_cfs_shares(cfs_rq, se->load.weight); | ||
| 774 | account_entity_enqueue(cfs_rq, se); | 982 | account_entity_enqueue(cfs_rq, se); |
| 775 | 983 | ||
| 776 | if (flags & ENQUEUE_WAKEUP) { | 984 | if (flags & ENQUEUE_WAKEUP) { |
| @@ -782,6 +990,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
| 782 | check_spread(cfs_rq, se); | 990 | check_spread(cfs_rq, se); |
| 783 | if (se != cfs_rq->curr) | 991 | if (se != cfs_rq->curr) |
| 784 | __enqueue_entity(cfs_rq, se); | 992 | __enqueue_entity(cfs_rq, se); |
| 993 | se->on_rq = 1; | ||
| 994 | |||
| 995 | if (cfs_rq->nr_running == 1) | ||
| 996 | list_add_leaf_cfs_rq(cfs_rq); | ||
| 785 | } | 997 | } |
| 786 | 998 | ||
| 787 | static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) | 999 | static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) |
| @@ -825,8 +1037,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
| 825 | 1037 | ||
| 826 | if (se != cfs_rq->curr) | 1038 | if (se != cfs_rq->curr) |
| 827 | __dequeue_entity(cfs_rq, se); | 1039 | __dequeue_entity(cfs_rq, se); |
| 1040 | se->on_rq = 0; | ||
| 1041 | update_cfs_load(cfs_rq, 0); | ||
| 828 | account_entity_dequeue(cfs_rq, se); | 1042 | account_entity_dequeue(cfs_rq, se); |
| 829 | update_min_vruntime(cfs_rq); | 1043 | update_min_vruntime(cfs_rq); |
| 1044 | update_cfs_shares(cfs_rq, 0); | ||
| 830 | 1045 | ||
| 831 | /* | 1046 | /* |
| 832 | * Normalize the entity after updating the min_vruntime because the | 1047 | * Normalize the entity after updating the min_vruntime because the |
| @@ -872,6 +1087,9 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) | |||
| 872 | struct sched_entity *se = __pick_next_entity(cfs_rq); | 1087 | struct sched_entity *se = __pick_next_entity(cfs_rq); |
| 873 | s64 delta = curr->vruntime - se->vruntime; | 1088 | s64 delta = curr->vruntime - se->vruntime; |
| 874 | 1089 | ||
| 1090 | if (delta < 0) | ||
| 1091 | return; | ||
| 1092 | |||
| 875 | if (delta > ideal_runtime) | 1093 | if (delta > ideal_runtime) |
| 876 | resched_task(rq_of(cfs_rq)->curr); | 1094 | resched_task(rq_of(cfs_rq)->curr); |
| 877 | } | 1095 | } |
| @@ -955,6 +1173,11 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) | |||
| 955 | */ | 1173 | */ |
| 956 | update_curr(cfs_rq); | 1174 | update_curr(cfs_rq); |
| 957 | 1175 | ||
| 1176 | /* | ||
| 1177 | * Update share accounting for long-running entities. | ||
| 1178 | */ | ||
| 1179 | update_entity_shares_tick(cfs_rq); | ||
| 1180 | |||
| 958 | #ifdef CONFIG_SCHED_HRTICK | 1181 | #ifdef CONFIG_SCHED_HRTICK |
| 959 | /* | 1182 | /* |
| 960 | * queued ticks are scheduled to match the slice, so don't bother | 1183 | * queued ticks are scheduled to match the slice, so don't bother |
| @@ -1055,6 +1278,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
| 1055 | flags = ENQUEUE_WAKEUP; | 1278 | flags = ENQUEUE_WAKEUP; |
| 1056 | } | 1279 | } |
| 1057 | 1280 | ||
| 1281 | for_each_sched_entity(se) { | ||
| 1282 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | ||
| 1283 | |||
| 1284 | update_cfs_load(cfs_rq, 0); | ||
| 1285 | update_cfs_shares(cfs_rq, 0); | ||
| 1286 | } | ||
| 1287 | |||
| 1058 | hrtick_update(rq); | 1288 | hrtick_update(rq); |
| 1059 | } | 1289 | } |
| 1060 | 1290 | ||
| @@ -1071,12 +1301,20 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
| 1071 | for_each_sched_entity(se) { | 1301 | for_each_sched_entity(se) { |
| 1072 | cfs_rq = cfs_rq_of(se); | 1302 | cfs_rq = cfs_rq_of(se); |
| 1073 | dequeue_entity(cfs_rq, se, flags); | 1303 | dequeue_entity(cfs_rq, se, flags); |
| 1304 | |||
| 1074 | /* Don't dequeue parent if it has other entities besides us */ | 1305 | /* Don't dequeue parent if it has other entities besides us */ |
| 1075 | if (cfs_rq->load.weight) | 1306 | if (cfs_rq->load.weight) |
| 1076 | break; | 1307 | break; |
| 1077 | flags |= DEQUEUE_SLEEP; | 1308 | flags |= DEQUEUE_SLEEP; |
| 1078 | } | 1309 | } |
| 1079 | 1310 | ||
| 1311 | for_each_sched_entity(se) { | ||
| 1312 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | ||
| 1313 | |||
| 1314 | update_cfs_load(cfs_rq, 0); | ||
| 1315 | update_cfs_shares(cfs_rq, 0); | ||
| 1316 | } | ||
| 1317 | |||
| 1080 | hrtick_update(rq); | 1318 | hrtick_update(rq); |
| 1081 | } | 1319 | } |
| 1082 | 1320 | ||
| @@ -1143,67 +1381,36 @@ static void task_waking_fair(struct rq *rq, struct task_struct *p) | |||
| 1143 | * Adding load to a group doesn't make a group heavier, but can cause movement | 1381 | * Adding load to a group doesn't make a group heavier, but can cause movement |
| 1144 | * of group shares between cpus. Assuming the shares were perfectly aligned one | 1382 | * of group shares between cpus. Assuming the shares were perfectly aligned one |
| 1145 | * can calculate the shift in shares. | 1383 | * can calculate the shift in shares. |
| 1146 | * | ||
| 1147 | * The problem is that perfectly aligning the shares is rather expensive, hence | ||
| 1148 | * we try to avoid doing that too often - see update_shares(), which ratelimits | ||
| 1149 | * this change. | ||
| 1150 | * | ||
| 1151 | * We compensate this by not only taking the current delta into account, but | ||
| 1152 | * also considering the delta between when the shares were last adjusted and | ||
| 1153 | * now. | ||
| 1154 | * | ||
| 1155 | * We still saw a performance dip, some tracing learned us that between | ||
| 1156 | * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased | ||
| 1157 | * significantly. Therefore try to bias the error in direction of failing | ||
| 1158 | * the affine wakeup. | ||
| 1159 | * | ||
| 1160 | */ | 1384 | */ |
| 1161 | static long effective_load(struct task_group *tg, int cpu, | 1385 | static long effective_load(struct task_group *tg, int cpu, long wl, long wg) |
| 1162 | long wl, long wg) | ||
| 1163 | { | 1386 | { |
| 1164 | struct sched_entity *se = tg->se[cpu]; | 1387 | struct sched_entity *se = tg->se[cpu]; |
| 1165 | 1388 | ||
| 1166 | if (!tg->parent) | 1389 | if (!tg->parent) |
| 1167 | return wl; | 1390 | return wl; |
| 1168 | 1391 | ||
| 1169 | /* | ||
| 1170 | * By not taking the decrease of shares on the other cpu into | ||
| 1171 | * account our error leans towards reducing the affine wakeups. | ||
| 1172 | */ | ||
| 1173 | if (!wl && sched_feat(ASYM_EFF_LOAD)) | ||
| 1174 | return wl; | ||
| 1175 | |||
| 1176 | for_each_sched_entity(se) { | 1392 | for_each_sched_entity(se) { |
| 1177 | long S, rw, s, a, b; | 1393 | long lw, w; |
| 1178 | long more_w; | ||
| 1179 | |||
| 1180 | /* | ||
| 1181 | * Instead of using this increment, also add the difference | ||
| 1182 | * between when the shares were last updated and now. | ||
| 1183 | */ | ||
| 1184 | more_w = se->my_q->load.weight - se->my_q->rq_weight; | ||
| 1185 | wl += more_w; | ||
| 1186 | wg += more_w; | ||
| 1187 | 1394 | ||
| 1188 | S = se->my_q->tg->shares; | 1395 | tg = se->my_q->tg; |
| 1189 | s = se->my_q->shares; | 1396 | w = se->my_q->load.weight; |
| 1190 | rw = se->my_q->rq_weight; | ||
| 1191 | 1397 | ||
| 1192 | a = S*(rw + wl); | 1398 | /* use this cpu's instantaneous contribution */ |
| 1193 | b = S*rw + s*wg; | 1399 | lw = atomic_read(&tg->load_weight); |
| 1400 | lw -= se->my_q->load_contribution; | ||
| 1401 | lw += w + wg; | ||
| 1194 | 1402 | ||
| 1195 | wl = s*(a-b); | 1403 | wl += w; |
| 1196 | 1404 | ||
| 1197 | if (likely(b)) | 1405 | if (lw > 0 && wl < lw) |
| 1198 | wl /= b; | 1406 | wl = (wl * tg->shares) / lw; |
| 1407 | else | ||
| 1408 | wl = tg->shares; | ||
| 1199 | 1409 | ||
| 1200 | /* | 1410 | /* zero point is MIN_SHARES */ |
| 1201 | * Assume the group is already running and will | 1411 | if (wl < MIN_SHARES) |
| 1202 | * thus already be accounted for in the weight. | 1412 | wl = MIN_SHARES; |
| 1203 | * | 1413 | wl -= se->load.weight; |
| 1204 | * That is, moving shares between CPUs, does not | ||
| 1205 | * alter the group weight. | ||
| 1206 | */ | ||
| 1207 | wg = 0; | 1414 | wg = 0; |
| 1208 | } | 1415 | } |
| 1209 | 1416 | ||
| @@ -1222,7 +1429,7 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu, | |||
| 1222 | 1429 | ||
| 1223 | static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) | 1430 | static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) |
| 1224 | { | 1431 | { |
| 1225 | unsigned long this_load, load; | 1432 | s64 this_load, load; |
| 1226 | int idx, this_cpu, prev_cpu; | 1433 | int idx, this_cpu, prev_cpu; |
| 1227 | unsigned long tl_per_task; | 1434 | unsigned long tl_per_task; |
| 1228 | struct task_group *tg; | 1435 | struct task_group *tg; |
| @@ -1261,8 +1468,8 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) | |||
| 1261 | * Otherwise check if either cpus are near enough in load to allow this | 1468 | * Otherwise check if either cpus are near enough in load to allow this |
| 1262 | * task to be woken on this_cpu. | 1469 | * task to be woken on this_cpu. |
| 1263 | */ | 1470 | */ |
| 1264 | if (this_load) { | 1471 | if (this_load > 0) { |
| 1265 | unsigned long this_eff_load, prev_eff_load; | 1472 | s64 this_eff_load, prev_eff_load; |
| 1266 | 1473 | ||
| 1267 | this_eff_load = 100; | 1474 | this_eff_load = 100; |
| 1268 | this_eff_load *= power_of(prev_cpu); | 1475 | this_eff_load *= power_of(prev_cpu); |
| @@ -1508,23 +1715,6 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_ | |||
| 1508 | sd = tmp; | 1715 | sd = tmp; |
| 1509 | } | 1716 | } |
| 1510 | 1717 | ||
| 1511 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 1512 | if (sched_feat(LB_SHARES_UPDATE)) { | ||
| 1513 | /* | ||
| 1514 | * Pick the largest domain to update shares over | ||
| 1515 | */ | ||
| 1516 | tmp = sd; | ||
| 1517 | if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight)) | ||
| 1518 | tmp = affine_sd; | ||
| 1519 | |||
| 1520 | if (tmp) { | ||
| 1521 | raw_spin_unlock(&rq->lock); | ||
| 1522 | update_shares(tmp); | ||
| 1523 | raw_spin_lock(&rq->lock); | ||
| 1524 | } | ||
| 1525 | } | ||
| 1526 | #endif | ||
| 1527 | |||
| 1528 | if (affine_sd) { | 1718 | if (affine_sd) { |
| 1529 | if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) | 1719 | if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) |
| 1530 | return select_idle_sibling(p, cpu); | 1720 | return select_idle_sibling(p, cpu); |
| @@ -1654,12 +1844,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ | |||
| 1654 | struct cfs_rq *cfs_rq = task_cfs_rq(curr); | 1844 | struct cfs_rq *cfs_rq = task_cfs_rq(curr); |
| 1655 | int scale = cfs_rq->nr_running >= sched_nr_latency; | 1845 | int scale = cfs_rq->nr_running >= sched_nr_latency; |
| 1656 | 1846 | ||
| 1657 | if (unlikely(rt_prio(p->prio))) | ||
| 1658 | goto preempt; | ||
| 1659 | |||
| 1660 | if (unlikely(p->sched_class != &fair_sched_class)) | ||
| 1661 | return; | ||
| 1662 | |||
| 1663 | if (unlikely(se == pse)) | 1847 | if (unlikely(se == pse)) |
| 1664 | return; | 1848 | return; |
| 1665 | 1849 | ||
| @@ -1764,10 +1948,6 @@ static void pull_task(struct rq *src_rq, struct task_struct *p, | |||
| 1764 | set_task_cpu(p, this_cpu); | 1948 | set_task_cpu(p, this_cpu); |
| 1765 | activate_task(this_rq, p, 0); | 1949 | activate_task(this_rq, p, 0); |
| 1766 | check_preempt_curr(this_rq, p, 0); | 1950 | check_preempt_curr(this_rq, p, 0); |
| 1767 | |||
| 1768 | /* re-arm NEWIDLE balancing when moving tasks */ | ||
| 1769 | src_rq->avg_idle = this_rq->avg_idle = 2*sysctl_sched_migration_cost; | ||
| 1770 | this_rq->idle_stamp = 0; | ||
| 1771 | } | 1951 | } |
| 1772 | 1952 | ||
| 1773 | /* | 1953 | /* |
| @@ -1919,6 +2099,48 @@ out: | |||
| 1919 | } | 2099 | } |
| 1920 | 2100 | ||
| 1921 | #ifdef CONFIG_FAIR_GROUP_SCHED | 2101 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 2102 | /* | ||
| 2103 | * update tg->load_weight by folding this cpu's load_avg | ||
| 2104 | */ | ||
| 2105 | static int update_shares_cpu(struct task_group *tg, int cpu) | ||
| 2106 | { | ||
| 2107 | struct cfs_rq *cfs_rq; | ||
| 2108 | unsigned long flags; | ||
| 2109 | struct rq *rq; | ||
| 2110 | |||
| 2111 | if (!tg->se[cpu]) | ||
| 2112 | return 0; | ||
| 2113 | |||
| 2114 | rq = cpu_rq(cpu); | ||
| 2115 | cfs_rq = tg->cfs_rq[cpu]; | ||
| 2116 | |||
| 2117 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
| 2118 | |||
| 2119 | update_rq_clock(rq); | ||
| 2120 | update_cfs_load(cfs_rq, 1); | ||
| 2121 | |||
| 2122 | /* | ||
| 2123 | * We need to update shares after updating tg->load_weight in | ||
| 2124 | * order to adjust the weight of groups with long running tasks. | ||
| 2125 | */ | ||
| 2126 | update_cfs_shares(cfs_rq, 0); | ||
| 2127 | |||
| 2128 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
| 2129 | |||
| 2130 | return 0; | ||
| 2131 | } | ||
| 2132 | |||
| 2133 | static void update_shares(int cpu) | ||
| 2134 | { | ||
| 2135 | struct cfs_rq *cfs_rq; | ||
| 2136 | struct rq *rq = cpu_rq(cpu); | ||
| 2137 | |||
| 2138 | rcu_read_lock(); | ||
| 2139 | for_each_leaf_cfs_rq(rq, cfs_rq) | ||
| 2140 | update_shares_cpu(cfs_rq->tg, cpu); | ||
| 2141 | rcu_read_unlock(); | ||
| 2142 | } | ||
| 2143 | |||
| 1922 | static unsigned long | 2144 | static unsigned long |
| 1923 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | 2145 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, |
| 1924 | unsigned long max_load_move, | 2146 | unsigned long max_load_move, |
| @@ -1966,6 +2188,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
| 1966 | return max_load_move - rem_load_move; | 2188 | return max_load_move - rem_load_move; |
| 1967 | } | 2189 | } |
| 1968 | #else | 2190 | #else |
| 2191 | static inline void update_shares(int cpu) | ||
| 2192 | { | ||
| 2193 | } | ||
| 2194 | |||
| 1969 | static unsigned long | 2195 | static unsigned long |
| 1970 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | 2196 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, |
| 1971 | unsigned long max_load_move, | 2197 | unsigned long max_load_move, |
| @@ -2035,13 +2261,16 @@ struct sd_lb_stats { | |||
| 2035 | unsigned long this_load_per_task; | 2261 | unsigned long this_load_per_task; |
| 2036 | unsigned long this_nr_running; | 2262 | unsigned long this_nr_running; |
| 2037 | unsigned long this_has_capacity; | 2263 | unsigned long this_has_capacity; |
| 2264 | unsigned int this_idle_cpus; | ||
| 2038 | 2265 | ||
| 2039 | /* Statistics of the busiest group */ | 2266 | /* Statistics of the busiest group */ |
| 2267 | unsigned int busiest_idle_cpus; | ||
| 2040 | unsigned long max_load; | 2268 | unsigned long max_load; |
| 2041 | unsigned long busiest_load_per_task; | 2269 | unsigned long busiest_load_per_task; |
| 2042 | unsigned long busiest_nr_running; | 2270 | unsigned long busiest_nr_running; |
| 2043 | unsigned long busiest_group_capacity; | 2271 | unsigned long busiest_group_capacity; |
| 2044 | unsigned long busiest_has_capacity; | 2272 | unsigned long busiest_has_capacity; |
| 2273 | unsigned int busiest_group_weight; | ||
| 2045 | 2274 | ||
| 2046 | int group_imb; /* Is there imbalance in this sd */ | 2275 | int group_imb; /* Is there imbalance in this sd */ |
| 2047 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | 2276 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) |
| @@ -2063,6 +2292,8 @@ struct sg_lb_stats { | |||
| 2063 | unsigned long sum_nr_running; /* Nr tasks running in the group */ | 2292 | unsigned long sum_nr_running; /* Nr tasks running in the group */ |
| 2064 | unsigned long sum_weighted_load; /* Weighted load of group's tasks */ | 2293 | unsigned long sum_weighted_load; /* Weighted load of group's tasks */ |
| 2065 | unsigned long group_capacity; | 2294 | unsigned long group_capacity; |
| 2295 | unsigned long idle_cpus; | ||
| 2296 | unsigned long group_weight; | ||
| 2066 | int group_imb; /* Is there an imbalance in the group ? */ | 2297 | int group_imb; /* Is there an imbalance in the group ? */ |
| 2067 | int group_has_capacity; /* Is there extra capacity in the group? */ | 2298 | int group_has_capacity; /* Is there extra capacity in the group? */ |
| 2068 | }; | 2299 | }; |
| @@ -2431,7 +2662,8 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
| 2431 | sgs->group_load += load; | 2662 | sgs->group_load += load; |
| 2432 | sgs->sum_nr_running += rq->nr_running; | 2663 | sgs->sum_nr_running += rq->nr_running; |
| 2433 | sgs->sum_weighted_load += weighted_cpuload(i); | 2664 | sgs->sum_weighted_load += weighted_cpuload(i); |
| 2434 | 2665 | if (idle_cpu(i)) | |
| 2666 | sgs->idle_cpus++; | ||
| 2435 | } | 2667 | } |
| 2436 | 2668 | ||
| 2437 | /* | 2669 | /* |
| @@ -2469,6 +2701,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
| 2469 | sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); | 2701 | sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); |
| 2470 | if (!sgs->group_capacity) | 2702 | if (!sgs->group_capacity) |
| 2471 | sgs->group_capacity = fix_small_capacity(sd, group); | 2703 | sgs->group_capacity = fix_small_capacity(sd, group); |
| 2704 | sgs->group_weight = group->group_weight; | ||
| 2472 | 2705 | ||
| 2473 | if (sgs->group_capacity > sgs->sum_nr_running) | 2706 | if (sgs->group_capacity > sgs->sum_nr_running) |
| 2474 | sgs->group_has_capacity = 1; | 2707 | sgs->group_has_capacity = 1; |
| @@ -2576,13 +2809,16 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
| 2576 | sds->this_nr_running = sgs.sum_nr_running; | 2809 | sds->this_nr_running = sgs.sum_nr_running; |
| 2577 | sds->this_load_per_task = sgs.sum_weighted_load; | 2810 | sds->this_load_per_task = sgs.sum_weighted_load; |
| 2578 | sds->this_has_capacity = sgs.group_has_capacity; | 2811 | sds->this_has_capacity = sgs.group_has_capacity; |
| 2812 | sds->this_idle_cpus = sgs.idle_cpus; | ||
| 2579 | } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) { | 2813 | } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) { |
| 2580 | sds->max_load = sgs.avg_load; | 2814 | sds->max_load = sgs.avg_load; |
| 2581 | sds->busiest = sg; | 2815 | sds->busiest = sg; |
| 2582 | sds->busiest_nr_running = sgs.sum_nr_running; | 2816 | sds->busiest_nr_running = sgs.sum_nr_running; |
| 2817 | sds->busiest_idle_cpus = sgs.idle_cpus; | ||
| 2583 | sds->busiest_group_capacity = sgs.group_capacity; | 2818 | sds->busiest_group_capacity = sgs.group_capacity; |
| 2584 | sds->busiest_load_per_task = sgs.sum_weighted_load; | 2819 | sds->busiest_load_per_task = sgs.sum_weighted_load; |
| 2585 | sds->busiest_has_capacity = sgs.group_has_capacity; | 2820 | sds->busiest_has_capacity = sgs.group_has_capacity; |
| 2821 | sds->busiest_group_weight = sgs.group_weight; | ||
| 2586 | sds->group_imb = sgs.group_imb; | 2822 | sds->group_imb = sgs.group_imb; |
| 2587 | } | 2823 | } |
| 2588 | 2824 | ||
| @@ -2860,8 +3096,26 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
| 2860 | if (sds.this_load >= sds.avg_load) | 3096 | if (sds.this_load >= sds.avg_load) |
| 2861 | goto out_balanced; | 3097 | goto out_balanced; |
| 2862 | 3098 | ||
| 2863 | if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) | 3099 | /* |
| 2864 | goto out_balanced; | 3100 | * In the CPU_NEWLY_IDLE case, use imbalance_pct to be conservative. |
| 3101 | * To check for busy balancing, use !idle_cpu() instead of | ||
| 3102 | * CPU_NOT_IDLE, because HT siblings report CPU_NOT_IDLE | ||
| 3103 | * even when they are idle. | ||
| 3104 | */ | ||
| 3105 | if (idle == CPU_NEWLY_IDLE || !idle_cpu(this_cpu)) { | ||
| 3106 | if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) | ||
| 3107 | goto out_balanced; | ||
| 3108 | } else { | ||
| 3109 | /* | ||
| 3110 | * This cpu is idle. If the busiest group doesn't have | ||
| 3111 | * more tasks than the number of available cpus and | ||
| 3112 | * there is no imbalance between this and the busiest group | ||
| 3113 | * wrt idle cpus, it is balanced. | ||
| 3114 | */ | ||
| 3115 | if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) && | ||
| 3116 | sds.busiest_nr_running <= sds.busiest_group_weight) | ||
| 3117 | goto out_balanced; | ||
| 3118 | } | ||
| 2865 | 3119 | ||
| 2866 | force_balance: | 3120 | force_balance: |
| 2867 | /* Looks like there is an imbalance. Compute it */ | 3121 | /* Looks like there is an imbalance. Compute it */ |
| @@ -3014,7 +3268,6 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
| 3014 | schedstat_inc(sd, lb_count[idle]); | 3268 | schedstat_inc(sd, lb_count[idle]); |
| 3015 | 3269 | ||
| 3016 | redo: | 3270 | redo: |
| 3017 | update_shares(sd); | ||
| 3018 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, | 3271 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, |
| 3019 | cpus, balance); | 3272 | cpus, balance); |
| 3020 | 3273 | ||
| @@ -3156,8 +3409,6 @@ out_one_pinned: | |||
| 3156 | else | 3409 | else |
| 3157 | ld_moved = 0; | 3410 | ld_moved = 0; |
| 3158 | out: | 3411 | out: |
| 3159 | if (ld_moved) | ||
| 3160 | update_shares(sd); | ||
| 3161 | return ld_moved; | 3412 | return ld_moved; |
| 3162 | } | 3413 | } |
| 3163 | 3414 | ||
| @@ -3181,6 +3432,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq) | |||
| 3181 | */ | 3432 | */ |
| 3182 | raw_spin_unlock(&this_rq->lock); | 3433 | raw_spin_unlock(&this_rq->lock); |
| 3183 | 3434 | ||
| 3435 | update_shares(this_cpu); | ||
| 3184 | for_each_domain(this_cpu, sd) { | 3436 | for_each_domain(this_cpu, sd) { |
| 3185 | unsigned long interval; | 3437 | unsigned long interval; |
| 3186 | int balance = 1; | 3438 | int balance = 1; |
| @@ -3197,8 +3449,10 @@ static void idle_balance(int this_cpu, struct rq *this_rq) | |||
| 3197 | interval = msecs_to_jiffies(sd->balance_interval); | 3449 | interval = msecs_to_jiffies(sd->balance_interval); |
| 3198 | if (time_after(next_balance, sd->last_balance + interval)) | 3450 | if (time_after(next_balance, sd->last_balance + interval)) |
| 3199 | next_balance = sd->last_balance + interval; | 3451 | next_balance = sd->last_balance + interval; |
| 3200 | if (pulled_task) | 3452 | if (pulled_task) { |
| 3453 | this_rq->idle_stamp = 0; | ||
| 3201 | break; | 3454 | break; |
| 3455 | } | ||
| 3202 | } | 3456 | } |
| 3203 | 3457 | ||
| 3204 | raw_spin_lock(&this_rq->lock); | 3458 | raw_spin_lock(&this_rq->lock); |
| @@ -3549,6 +3803,8 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
| 3549 | int update_next_balance = 0; | 3803 | int update_next_balance = 0; |
| 3550 | int need_serialize; | 3804 | int need_serialize; |
| 3551 | 3805 | ||
| 3806 | update_shares(cpu); | ||
| 3807 | |||
| 3552 | for_each_domain(cpu, sd) { | 3808 | for_each_domain(cpu, sd) { |
| 3553 | if (!(sd->flags & SD_LOAD_BALANCE)) | 3809 | if (!(sd->flags & SD_LOAD_BALANCE)) |
| 3554 | continue; | 3810 | continue; |
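The new find_busiest_group() branch above only declares the domain balanced when this otherwise-idle CPU is not noticeably idler than the busiest group and that group is not running more tasks than it has CPUs. As a rough illustration of that rule only, here is a minimal userspace sketch; the struct, function names and sample values are made up for the example and are not the kernel's.

    /* Minimal userspace model of the new balance check; all names here are
     * illustrative, not the kernel's. imbalance_pct is in percent (e.g. 125). */
    #include <stdio.h>

    struct sd_stats {
        unsigned long this_load, max_load;
        unsigned long this_idle_cpus, busiest_idle_cpus;
        unsigned long busiest_nr_running, busiest_group_weight;
    };

    static int looks_balanced(const struct sd_stats *s, unsigned int imbalance_pct,
                              int this_cpu_idle, int newly_idle)
    {
        if (newly_idle || !this_cpu_idle)
            /* conservative check: busiest must beat this_load by imbalance_pct */
            return 100 * s->max_load <= imbalance_pct * s->this_load;

        /* this cpu is idle: balanced if the busiest group is not overloaded
         * and the idle-cpu counts are within one of each other */
        return s->this_idle_cpus <= s->busiest_idle_cpus + 1 &&
               s->busiest_nr_running <= s->busiest_group_weight;
    }

    int main(void)
    {
        struct sd_stats s = { .this_load = 0, .max_load = 800,
                              .this_idle_cpus = 2, .busiest_idle_cpus = 1,
                              .busiest_nr_running = 2, .busiest_group_weight = 2 };

        printf("balanced: %d\n", looks_balanced(&s, 125, 1, 0)); /* prints 1 */
        return 0;
    }

With the sample numbers the idle branch is taken and the groups are declared balanced, even though the conservative imbalance_pct test would have forced a balance attempt.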
diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 185f920ec1a..68e69acc29b 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h | |||
| @@ -52,8 +52,6 @@ SCHED_FEAT(ARCH_POWER, 0) | |||
| 52 | SCHED_FEAT(HRTICK, 0) | 52 | SCHED_FEAT(HRTICK, 0) |
| 53 | SCHED_FEAT(DOUBLE_TICK, 0) | 53 | SCHED_FEAT(DOUBLE_TICK, 0) |
| 54 | SCHED_FEAT(LB_BIAS, 1) | 54 | SCHED_FEAT(LB_BIAS, 1) |
| 55 | SCHED_FEAT(LB_SHARES_UPDATE, 1) | ||
| 56 | SCHED_FEAT(ASYM_EFF_LOAD, 1) | ||
| 57 | 55 | ||
| 58 | /* | 56 | /* |
| 59 | * Spin-wait on mutex acquisition when the mutex owner is running on | 57 | * Spin-wait on mutex acquisition when the mutex owner is running on |
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index bea7d79f7e9..ad6267714c8 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c | |||
| @@ -183,6 +183,17 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq) | |||
| 183 | return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period); | 183 | return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period); |
| 184 | } | 184 | } |
| 185 | 185 | ||
| 186 | static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) | ||
| 187 | { | ||
| 188 | list_add_rcu(&rt_rq->leaf_rt_rq_list, | ||
| 189 | &rq_of_rt_rq(rt_rq)->leaf_rt_rq_list); | ||
| 190 | } | ||
| 191 | |||
| 192 | static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq) | ||
| 193 | { | ||
| 194 | list_del_rcu(&rt_rq->leaf_rt_rq_list); | ||
| 195 | } | ||
| 196 | |||
| 186 | #define for_each_leaf_rt_rq(rt_rq, rq) \ | 197 | #define for_each_leaf_rt_rq(rt_rq, rq) \ |
| 187 | list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) | 198 | list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) |
| 188 | 199 | ||
| @@ -276,6 +287,14 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq) | |||
| 276 | return ktime_to_ns(def_rt_bandwidth.rt_period); | 287 | return ktime_to_ns(def_rt_bandwidth.rt_period); |
| 277 | } | 288 | } |
| 278 | 289 | ||
| 290 | static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) | ||
| 291 | { | ||
| 292 | } | ||
| 293 | |||
| 294 | static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq) | ||
| 295 | { | ||
| 296 | } | ||
| 297 | |||
| 279 | #define for_each_leaf_rt_rq(rt_rq, rq) \ | 298 | #define for_each_leaf_rt_rq(rt_rq, rq) \ |
| 280 | for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL) | 299 | for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL) |
| 281 | 300 | ||
| @@ -606,7 +625,7 @@ static void update_curr_rt(struct rq *rq) | |||
| 606 | struct rt_rq *rt_rq = rt_rq_of_se(rt_se); | 625 | struct rt_rq *rt_rq = rt_rq_of_se(rt_se); |
| 607 | u64 delta_exec; | 626 | u64 delta_exec; |
| 608 | 627 | ||
| 609 | if (!task_has_rt_policy(curr)) | 628 | if (curr->sched_class != &rt_sched_class) |
| 610 | return; | 629 | return; |
| 611 | 630 | ||
| 612 | delta_exec = rq->clock_task - curr->se.exec_start; | 631 | delta_exec = rq->clock_task - curr->se.exec_start; |
| @@ -825,6 +844,9 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) | |||
| 825 | if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) | 844 | if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) |
| 826 | return; | 845 | return; |
| 827 | 846 | ||
| 847 | if (!rt_rq->rt_nr_running) | ||
| 848 | list_add_leaf_rt_rq(rt_rq); | ||
| 849 | |||
| 828 | if (head) | 850 | if (head) |
| 829 | list_add(&rt_se->run_list, queue); | 851 | list_add(&rt_se->run_list, queue); |
| 830 | else | 852 | else |
| @@ -844,6 +866,8 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se) | |||
| 844 | __clear_bit(rt_se_prio(rt_se), array->bitmap); | 866 | __clear_bit(rt_se_prio(rt_se), array->bitmap); |
| 845 | 867 | ||
| 846 | dec_rt_tasks(rt_se, rt_rq); | 868 | dec_rt_tasks(rt_se, rt_rq); |
| 869 | if (!rt_rq->rt_nr_running) | ||
| 870 | list_del_leaf_rt_rq(rt_rq); | ||
| 847 | } | 871 | } |
| 848 | 872 | ||
| 849 | /* | 873 | /* |
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c index 45bddc0c104..2bf6b47058c 100644 --- a/kernel/sched_stoptask.c +++ b/kernel/sched_stoptask.c | |||
| @@ -19,14 +19,14 @@ select_task_rq_stop(struct rq *rq, struct task_struct *p, | |||
| 19 | static void | 19 | static void |
| 20 | check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags) | 20 | check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags) |
| 21 | { | 21 | { |
| 22 | resched_task(rq->curr); /* we preempt everything */ | 22 | /* we're never preempted */ |
| 23 | } | 23 | } |
| 24 | 24 | ||
| 25 | static struct task_struct *pick_next_task_stop(struct rq *rq) | 25 | static struct task_struct *pick_next_task_stop(struct rq *rq) |
| 26 | { | 26 | { |
| 27 | struct task_struct *stop = rq->stop; | 27 | struct task_struct *stop = rq->stop; |
| 28 | 28 | ||
| 29 | if (stop && stop->state == TASK_RUNNING) | 29 | if (stop && stop->se.on_rq) |
| 30 | return stop; | 30 | return stop; |
| 31 | 31 | ||
| 32 | return NULL; | 32 | return NULL; |
diff --git a/kernel/smp.c b/kernel/smp.c index 12ed8b013e2..9910744f085 100644 --- a/kernel/smp.c +++ b/kernel/smp.c | |||
| @@ -13,6 +13,7 @@ | |||
| 13 | #include <linux/smp.h> | 13 | #include <linux/smp.h> |
| 14 | #include <linux/cpu.h> | 14 | #include <linux/cpu.h> |
| 15 | 15 | ||
| 16 | #ifdef CONFIG_USE_GENERIC_SMP_HELPERS | ||
| 16 | static struct { | 17 | static struct { |
| 17 | struct list_head queue; | 18 | struct list_head queue; |
| 18 | raw_spinlock_t lock; | 19 | raw_spinlock_t lock; |
| @@ -193,23 +194,52 @@ void generic_smp_call_function_interrupt(void) | |||
| 193 | */ | 194 | */ |
| 194 | list_for_each_entry_rcu(data, &call_function.queue, csd.list) { | 195 | list_for_each_entry_rcu(data, &call_function.queue, csd.list) { |
| 195 | int refs; | 196 | int refs; |
| 197 | void (*func) (void *info); | ||
| 196 | 198 | ||
| 197 | if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) | 199 | /* |
| 200 | * Since we walk the list without any locks, we might | ||
| 201 | * see an entry that was completed, removed from the | ||
| 202 | * list and is in the process of being reused. | ||
| 203 | * | ||
| 204 | * We must check that the cpu is in the cpumask before | ||
| 205 | * checking the refs, and both must be set before | ||
| 206 | * executing the callback on this cpu. | ||
| 207 | */ | ||
| 208 | |||
| 209 | if (!cpumask_test_cpu(cpu, data->cpumask)) | ||
| 210 | continue; | ||
| 211 | |||
| 212 | smp_rmb(); | ||
| 213 | |||
| 214 | if (atomic_read(&data->refs) == 0) | ||
| 198 | continue; | 215 | continue; |
| 199 | 216 | ||
| 217 | func = data->csd.func; /* for later warn */ | ||
| 200 | data->csd.func(data->csd.info); | 218 | data->csd.func(data->csd.info); |
| 201 | 219 | ||
| 220 | /* | ||
| 221 | * If our cpu bit is no longer set, the callback enabled interrupts, | ||
| 222 | * we took another smp interrupt, and executed the function | ||
| 223 | * twice on this cpu. In theory that copy decremented refs. | ||
| 224 | */ | ||
| 225 | if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) { | ||
| 226 | WARN(1, "%pS enabled interrupts and double executed\n", | ||
| 227 | func); | ||
| 228 | continue; | ||
| 229 | } | ||
| 230 | |||
| 202 | refs = atomic_dec_return(&data->refs); | 231 | refs = atomic_dec_return(&data->refs); |
| 203 | WARN_ON(refs < 0); | 232 | WARN_ON(refs < 0); |
| 204 | if (!refs) { | ||
| 205 | raw_spin_lock(&call_function.lock); | ||
| 206 | list_del_rcu(&data->csd.list); | ||
| 207 | raw_spin_unlock(&call_function.lock); | ||
| 208 | } | ||
| 209 | 233 | ||
| 210 | if (refs) | 234 | if (refs) |
| 211 | continue; | 235 | continue; |
| 212 | 236 | ||
| 237 | WARN_ON(!cpumask_empty(data->cpumask)); | ||
| 238 | |||
| 239 | raw_spin_lock(&call_function.lock); | ||
| 240 | list_del_rcu(&data->csd.list); | ||
| 241 | raw_spin_unlock(&call_function.lock); | ||
| 242 | |||
| 213 | csd_unlock(&data->csd); | 243 | csd_unlock(&data->csd); |
| 214 | } | 244 | } |
| 215 | 245 | ||
| @@ -429,7 +459,7 @@ void smp_call_function_many(const struct cpumask *mask, | |||
| 429 | * can't happen. | 459 | * can't happen. |
| 430 | */ | 460 | */ |
| 431 | WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() | 461 | WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() |
| 432 | && !oops_in_progress); | 462 | && !oops_in_progress && !early_boot_irqs_disabled); |
| 433 | 463 | ||
| 434 | /* So, what's a CPU they want? Ignoring this one. */ | 464 | /* So, what's a CPU they want? Ignoring this one. */ |
| 435 | cpu = cpumask_first_and(mask, cpu_online_mask); | 465 | cpu = cpumask_first_and(mask, cpu_online_mask); |
| @@ -453,11 +483,21 @@ void smp_call_function_many(const struct cpumask *mask, | |||
| 453 | 483 | ||
| 454 | data = &__get_cpu_var(cfd_data); | 484 | data = &__get_cpu_var(cfd_data); |
| 455 | csd_lock(&data->csd); | 485 | csd_lock(&data->csd); |
| 486 | BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask)); | ||
| 456 | 487 | ||
| 457 | data->csd.func = func; | 488 | data->csd.func = func; |
| 458 | data->csd.info = info; | 489 | data->csd.info = info; |
| 459 | cpumask_and(data->cpumask, mask, cpu_online_mask); | 490 | cpumask_and(data->cpumask, mask, cpu_online_mask); |
| 460 | cpumask_clear_cpu(this_cpu, data->cpumask); | 491 | cpumask_clear_cpu(this_cpu, data->cpumask); |
| 492 | |||
| 493 | /* | ||
| 494 | * To ensure the interrupt handler gets a complete view | ||
| 495 | * we order the cpumask and refs writes and order the read | ||
| 496 | * of them in the interrupt handler. In addition we may | ||
| 497 | * only clear our own cpu bit from the mask. | ||
| 498 | */ | ||
| 499 | smp_wmb(); | ||
| 500 | |||
| 461 | atomic_set(&data->refs, cpumask_weight(data->cpumask)); | 501 | atomic_set(&data->refs, cpumask_weight(data->cpumask)); |
| 462 | 502 | ||
| 463 | raw_spin_lock_irqsave(&call_function.lock, flags); | 503 | raw_spin_lock_irqsave(&call_function.lock, flags); |
| @@ -529,3 +569,24 @@ void ipi_call_unlock_irq(void) | |||
| 529 | { | 569 | { |
| 530 | raw_spin_unlock_irq(&call_function.lock); | 570 | raw_spin_unlock_irq(&call_function.lock); |
| 531 | } | 571 | } |
| 572 | #endif /* USE_GENERIC_SMP_HELPERS */ | ||
| 573 | |||
| 574 | /* | ||
| 575 | * Call a function on all processors. May be used during early boot while | ||
| 576 | * early_boot_irqs_disabled is set. Uses local_irq_save/restore() instead | ||
| 577 | * of local_irq_disable/enable(). | ||
| 578 | */ | ||
| 579 | int on_each_cpu(void (*func) (void *info), void *info, int wait) | ||
| 580 | { | ||
| 581 | unsigned long flags; | ||
| 582 | int ret = 0; | ||
| 583 | |||
| 584 | preempt_disable(); | ||
| 585 | ret = smp_call_function(func, info, wait); | ||
| 586 | local_irq_save(flags); | ||
| 587 | func(info); | ||
| 588 | local_irq_restore(flags); | ||
| 589 | preempt_enable(); | ||
| 590 | return ret; | ||
| 591 | } | ||
| 592 | EXPORT_SYMBOL(on_each_cpu); | ||
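The comments added to smp_call_function_many() and generic_smp_call_function_interrupt() describe a write-then-publish ordering: the sender fills the cpumask, issues smp_wmb(), and only then sets refs, while the receiver tests its cpumask bit, issues smp_rmb(), and only then trusts refs. Below is a hedged userspace model of that pairing using C11 fences; the array-of-flags "cpumask" and the function names are stand-ins for the example, not kernel interfaces.

    /* Userspace C11 model of the cpumask/refs ordering described above.
     * The fences stand in for smp_wmb()/smp_rmb(); names are illustrative. */
    #include <stdatomic.h>
    #include <stdio.h>

    #define NR_CPUS 4

    static atomic_int cpumask[NR_CPUS];   /* one slot per cpu bit */
    static atomic_int refs;

    static void sender_publish(void)
    {
        for (int cpu = 1; cpu < NR_CPUS; cpu++)
            atomic_store_explicit(&cpumask[cpu], 1, memory_order_relaxed);

        /* pairs with the acquire fence in the receiver (smp_wmb/smp_rmb) */
        atomic_thread_fence(memory_order_release);

        atomic_store_explicit(&refs, NR_CPUS - 1, memory_order_relaxed);
    }

    static int receiver_sees_entry(int cpu)
    {
        if (!atomic_load_explicit(&cpumask[cpu], memory_order_relaxed))
            return 0;                     /* not for us, or already done */

        atomic_thread_fence(memory_order_acquire);

        /* refs == 0 means the entry is stale or not yet published: skip it */
        return atomic_load_explicit(&refs, memory_order_relaxed) != 0;
    }

    int main(void)
    {
        sender_publish();
        printf("cpu1 may run callback: %d\n", receiver_sees_entry(1));
        return 0;
    }

The point of the pairing is simply that a receiver never acts on a half-published entry: if it observes both its bit and a non-zero refs in that order, the writes made before the release fence are visible to it.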
diff --git a/kernel/softirq.c b/kernel/softirq.c index 18f4be0d5fe..68eb5efec38 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
| @@ -70,7 +70,7 @@ char *softirq_to_name[NR_SOFTIRQS] = { | |||
| 70 | static void wakeup_softirqd(void) | 70 | static void wakeup_softirqd(void) |
| 71 | { | 71 | { |
| 72 | /* Interrupts are disabled: no need to stop preemption */ | 72 | /* Interrupts are disabled: no need to stop preemption */ |
| 73 | struct task_struct *tsk = __get_cpu_var(ksoftirqd); | 73 | struct task_struct *tsk = __this_cpu_read(ksoftirqd); |
| 74 | 74 | ||
| 75 | if (tsk && tsk->state != TASK_RUNNING) | 75 | if (tsk && tsk->state != TASK_RUNNING) |
| 76 | wake_up_process(tsk); | 76 | wake_up_process(tsk); |
| @@ -388,8 +388,8 @@ void __tasklet_schedule(struct tasklet_struct *t) | |||
| 388 | 388 | ||
| 389 | local_irq_save(flags); | 389 | local_irq_save(flags); |
| 390 | t->next = NULL; | 390 | t->next = NULL; |
| 391 | *__get_cpu_var(tasklet_vec).tail = t; | 391 | *__this_cpu_read(tasklet_vec.tail) = t; |
| 392 | __get_cpu_var(tasklet_vec).tail = &(t->next); | 392 | __this_cpu_write(tasklet_vec.tail, &(t->next)); |
| 393 | raise_softirq_irqoff(TASKLET_SOFTIRQ); | 393 | raise_softirq_irqoff(TASKLET_SOFTIRQ); |
| 394 | local_irq_restore(flags); | 394 | local_irq_restore(flags); |
| 395 | } | 395 | } |
| @@ -402,8 +402,8 @@ void __tasklet_hi_schedule(struct tasklet_struct *t) | |||
| 402 | 402 | ||
| 403 | local_irq_save(flags); | 403 | local_irq_save(flags); |
| 404 | t->next = NULL; | 404 | t->next = NULL; |
| 405 | *__get_cpu_var(tasklet_hi_vec).tail = t; | 405 | *__this_cpu_read(tasklet_hi_vec.tail) = t; |
| 406 | __get_cpu_var(tasklet_hi_vec).tail = &(t->next); | 406 | __this_cpu_write(tasklet_hi_vec.tail, &(t->next)); |
| 407 | raise_softirq_irqoff(HI_SOFTIRQ); | 407 | raise_softirq_irqoff(HI_SOFTIRQ); |
| 408 | local_irq_restore(flags); | 408 | local_irq_restore(flags); |
| 409 | } | 409 | } |
| @@ -414,8 +414,8 @@ void __tasklet_hi_schedule_first(struct tasklet_struct *t) | |||
| 414 | { | 414 | { |
| 415 | BUG_ON(!irqs_disabled()); | 415 | BUG_ON(!irqs_disabled()); |
| 416 | 416 | ||
| 417 | t->next = __get_cpu_var(tasklet_hi_vec).head; | 417 | t->next = __this_cpu_read(tasklet_hi_vec.head); |
| 418 | __get_cpu_var(tasklet_hi_vec).head = t; | 418 | __this_cpu_write(tasklet_hi_vec.head, t); |
| 419 | __raise_softirq_irqoff(HI_SOFTIRQ); | 419 | __raise_softirq_irqoff(HI_SOFTIRQ); |
| 420 | } | 420 | } |
| 421 | 421 | ||
| @@ -426,9 +426,9 @@ static void tasklet_action(struct softirq_action *a) | |||
| 426 | struct tasklet_struct *list; | 426 | struct tasklet_struct *list; |
| 427 | 427 | ||
| 428 | local_irq_disable(); | 428 | local_irq_disable(); |
| 429 | list = __get_cpu_var(tasklet_vec).head; | 429 | list = __this_cpu_read(tasklet_vec.head); |
| 430 | __get_cpu_var(tasklet_vec).head = NULL; | 430 | __this_cpu_write(tasklet_vec.head, NULL); |
| 431 | __get_cpu_var(tasklet_vec).tail = &__get_cpu_var(tasklet_vec).head; | 431 | __this_cpu_write(tasklet_vec.tail, &__get_cpu_var(tasklet_vec).head); |
| 432 | local_irq_enable(); | 432 | local_irq_enable(); |
| 433 | 433 | ||
| 434 | while (list) { | 434 | while (list) { |
| @@ -449,8 +449,8 @@ static void tasklet_action(struct softirq_action *a) | |||
| 449 | 449 | ||
| 450 | local_irq_disable(); | 450 | local_irq_disable(); |
| 451 | t->next = NULL; | 451 | t->next = NULL; |
| 452 | *__get_cpu_var(tasklet_vec).tail = t; | 452 | *__this_cpu_read(tasklet_vec.tail) = t; |
| 453 | __get_cpu_var(tasklet_vec).tail = &(t->next); | 453 | __this_cpu_write(tasklet_vec.tail, &(t->next)); |
| 454 | __raise_softirq_irqoff(TASKLET_SOFTIRQ); | 454 | __raise_softirq_irqoff(TASKLET_SOFTIRQ); |
| 455 | local_irq_enable(); | 455 | local_irq_enable(); |
| 456 | } | 456 | } |
| @@ -461,9 +461,9 @@ static void tasklet_hi_action(struct softirq_action *a) | |||
| 461 | struct tasklet_struct *list; | 461 | struct tasklet_struct *list; |
| 462 | 462 | ||
| 463 | local_irq_disable(); | 463 | local_irq_disable(); |
| 464 | list = __get_cpu_var(tasklet_hi_vec).head; | 464 | list = __this_cpu_read(tasklet_hi_vec.head); |
| 465 | __get_cpu_var(tasklet_hi_vec).head = NULL; | 465 | __this_cpu_write(tasklet_hi_vec.head, NULL); |
| 466 | __get_cpu_var(tasklet_hi_vec).tail = &__get_cpu_var(tasklet_hi_vec).head; | 466 | __this_cpu_write(tasklet_hi_vec.tail, &__get_cpu_var(tasklet_hi_vec).head); |
| 467 | local_irq_enable(); | 467 | local_irq_enable(); |
| 468 | 468 | ||
| 469 | while (list) { | 469 | while (list) { |
| @@ -484,8 +484,8 @@ static void tasklet_hi_action(struct softirq_action *a) | |||
| 484 | 484 | ||
| 485 | local_irq_disable(); | 485 | local_irq_disable(); |
| 486 | t->next = NULL; | 486 | t->next = NULL; |
| 487 | *__get_cpu_var(tasklet_hi_vec).tail = t; | 487 | *__this_cpu_read(tasklet_hi_vec.tail) = t; |
| 488 | __get_cpu_var(tasklet_hi_vec).tail = &(t->next); | 488 | __this_cpu_write(tasklet_hi_vec.tail, &(t->next)); |
| 489 | __raise_softirq_irqoff(HI_SOFTIRQ); | 489 | __raise_softirq_irqoff(HI_SOFTIRQ); |
| 490 | local_irq_enable(); | 490 | local_irq_enable(); |
| 491 | } | 491 | } |
| @@ -802,16 +802,16 @@ static void takeover_tasklets(unsigned int cpu) | |||
| 802 | 802 | ||
| 803 | /* Find end, append list for that CPU. */ | 803 | /* Find end, append list for that CPU. */ |
| 804 | if (&per_cpu(tasklet_vec, cpu).head != per_cpu(tasklet_vec, cpu).tail) { | 804 | if (&per_cpu(tasklet_vec, cpu).head != per_cpu(tasklet_vec, cpu).tail) { |
| 805 | *(__get_cpu_var(tasklet_vec).tail) = per_cpu(tasklet_vec, cpu).head; | 805 | *__this_cpu_read(tasklet_vec.tail) = per_cpu(tasklet_vec, cpu).head; |
| 806 | __get_cpu_var(tasklet_vec).tail = per_cpu(tasklet_vec, cpu).tail; | 806 | this_cpu_write(tasklet_vec.tail, per_cpu(tasklet_vec, cpu).tail); |
| 807 | per_cpu(tasklet_vec, cpu).head = NULL; | 807 | per_cpu(tasklet_vec, cpu).head = NULL; |
| 808 | per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head; | 808 | per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head; |
| 809 | } | 809 | } |
| 810 | raise_softirq_irqoff(TASKLET_SOFTIRQ); | 810 | raise_softirq_irqoff(TASKLET_SOFTIRQ); |
| 811 | 811 | ||
| 812 | if (&per_cpu(tasklet_hi_vec, cpu).head != per_cpu(tasklet_hi_vec, cpu).tail) { | 812 | if (&per_cpu(tasklet_hi_vec, cpu).head != per_cpu(tasklet_hi_vec, cpu).tail) { |
| 813 | *__get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).head; | 813 | *__this_cpu_read(tasklet_hi_vec.tail) = per_cpu(tasklet_hi_vec, cpu).head; |
| 814 | __get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).tail; | 814 | __this_cpu_write(tasklet_hi_vec.tail, per_cpu(tasklet_hi_vec, cpu).tail); |
| 815 | per_cpu(tasklet_hi_vec, cpu).head = NULL; | 815 | per_cpu(tasklet_hi_vec, cpu).head = NULL; |
| 816 | per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head; | 816 | per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head; |
| 817 | } | 817 | } |
| @@ -853,7 +853,9 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, | |||
| 853 | cpumask_any(cpu_online_mask)); | 853 | cpumask_any(cpu_online_mask)); |
| 854 | case CPU_DEAD: | 854 | case CPU_DEAD: |
| 855 | case CPU_DEAD_FROZEN: { | 855 | case CPU_DEAD_FROZEN: { |
| 856 | struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; | 856 | static const struct sched_param param = { |
| 857 | .sched_priority = MAX_RT_PRIO-1 | ||
| 858 | }; | ||
| 857 | 859 | ||
| 858 | p = per_cpu(ksoftirqd, hotcpu); | 860 | p = per_cpu(ksoftirqd, hotcpu); |
| 859 | per_cpu(ksoftirqd, hotcpu) = NULL; | 861 | per_cpu(ksoftirqd, hotcpu) = NULL; |
| @@ -883,25 +885,6 @@ static __init int spawn_ksoftirqd(void) | |||
| 883 | } | 885 | } |
| 884 | early_initcall(spawn_ksoftirqd); | 886 | early_initcall(spawn_ksoftirqd); |
| 885 | 887 | ||
| 886 | #ifdef CONFIG_SMP | ||
| 887 | /* | ||
| 888 | * Call a function on all processors | ||
| 889 | */ | ||
| 890 | int on_each_cpu(void (*func) (void *info), void *info, int wait) | ||
| 891 | { | ||
| 892 | int ret = 0; | ||
| 893 | |||
| 894 | preempt_disable(); | ||
| 895 | ret = smp_call_function(func, info, wait); | ||
| 896 | local_irq_disable(); | ||
| 897 | func(info); | ||
| 898 | local_irq_enable(); | ||
| 899 | preempt_enable(); | ||
| 900 | return ret; | ||
| 901 | } | ||
| 902 | EXPORT_SYMBOL(on_each_cpu); | ||
| 903 | #endif | ||
| 904 | |||
| 905 | /* | 888 | /* |
| 906 | * [ These __weak aliases are kept in a separate compilation unit, so that | 889 | * [ These __weak aliases are kept in a separate compilation unit, so that |
| 907 | * GCC does not inline them incorrectly. ] | 890 | * GCC does not inline them incorrectly. ] |
diff --git a/kernel/srcu.c b/kernel/srcu.c index c71e0750053..73ce23feaea 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c | |||
| @@ -31,6 +31,7 @@ | |||
| 31 | #include <linux/rcupdate.h> | 31 | #include <linux/rcupdate.h> |
| 32 | #include <linux/sched.h> | 32 | #include <linux/sched.h> |
| 33 | #include <linux/smp.h> | 33 | #include <linux/smp.h> |
| 34 | #include <linux/delay.h> | ||
| 34 | #include <linux/srcu.h> | 35 | #include <linux/srcu.h> |
| 35 | 36 | ||
| 36 | static int init_srcu_struct_fields(struct srcu_struct *sp) | 37 | static int init_srcu_struct_fields(struct srcu_struct *sp) |
| @@ -155,6 +156,16 @@ void __srcu_read_unlock(struct srcu_struct *sp, int idx) | |||
| 155 | EXPORT_SYMBOL_GPL(__srcu_read_unlock); | 156 | EXPORT_SYMBOL_GPL(__srcu_read_unlock); |
| 156 | 157 | ||
| 157 | /* | 158 | /* |
| 159 | * We use an adaptive strategy for synchronize_srcu() and especially for | ||
| 160 | * synchronize_srcu_expedited(). We spin for a fixed time period | ||
| 161 | * (defined below) to allow SRCU readers to exit their read-side critical | ||
| 162 | * sections. If there are still some readers after 10 microseconds, | ||
| 163 | * we repeatedly block for one jiffy (1/HZ seconds) at a time. This approach | ||
| 164 | * has done well in testing, so there is no need for a config parameter. | ||
| 165 | */ | ||
| 166 | #define SYNCHRONIZE_SRCU_READER_DELAY 10 | ||
| 167 | |||
| 168 | /* | ||
| 158 | * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). | 169 | * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). |
| 159 | */ | 170 | */ |
| 160 | static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) | 171 | static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) |
| @@ -203,9 +214,15 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) | |||
| 203 | * all srcu_read_lock() calls using the old counters have completed. | 214 | * all srcu_read_lock() calls using the old counters have completed. |
| 204 | * Their corresponding critical sections might well be still | 215 | * Their corresponding critical sections might well be still |
| 205 | * executing, but the srcu_read_lock() primitives themselves | 216 | * executing, but the srcu_read_lock() primitives themselves |
| 206 | * will have finished executing. | 217 | * will have finished executing. We initially give readers |
| 218 | * an arbitrarily chosen 10 microseconds to get out of their | ||
| 219 | * SRCU read-side critical sections, then loop waiting 1/HZ | ||
| 220 | * seconds per iteration. The 10-microsecond value has done | ||
| 221 | * very well in testing. | ||
| 207 | */ | 222 | */ |
| 208 | 223 | ||
| 224 | if (srcu_readers_active_idx(sp, idx)) | ||
| 225 | udelay(SYNCHRONIZE_SRCU_READER_DELAY); | ||
| 209 | while (srcu_readers_active_idx(sp, idx)) | 226 | while (srcu_readers_active_idx(sp, idx)) |
| 210 | schedule_timeout_interruptible(1); | 227 | schedule_timeout_interruptible(1); |
| 211 | 228 | ||
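The __synchronize_srcu() change above swaps an immediate one-jiffy sleep for a short busy delay followed by coarse sleeps when readers remain. Purely as an illustration of that shape, here is a userspace sketch; readers_active() is a stand-in predicate and the delays are placeholders, not the kernel's timing primitives.

    /* Userspace sketch of the adaptive wait: spin briefly, then sleep in
     * coarse steps. readers_active() and the constants are stand-ins. */
    #include <stdio.h>
    #include <unistd.h>

    #define READER_DELAY_US 10          /* mirrors SYNCHRONIZE_SRCU_READER_DELAY */

    static int pending = 3;

    static int readers_active(void)     /* stand-in for srcu_readers_active_idx() */
    {
        return pending-- > 0;
    }

    static void wait_for_readers(void)
    {
        if (readers_active())
            usleep(READER_DELAY_US);    /* cheap first wait, like udelay() */

        while (readers_active())
            usleep(1000);               /* coarse sleep, like one jiffy */
    }

    int main(void)
    {
        wait_for_readers();
        puts("readers drained");
        return 0;
    }

The cheap first wait matters mostly for synchronize_srcu_expedited(): readers usually leave their critical sections within the initial delay, so the slower sleeping loop is only the fallback.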
diff --git a/kernel/sys.c b/kernel/sys.c index 7f5a0cd296a..18da702ec81 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
| @@ -43,6 +43,8 @@ | |||
| 43 | #include <linux/kprobes.h> | 43 | #include <linux/kprobes.h> |
| 44 | #include <linux/user_namespace.h> | 44 | #include <linux/user_namespace.h> |
| 45 | 45 | ||
| 46 | #include <linux/kmsg_dump.h> | ||
| 47 | |||
| 46 | #include <asm/uaccess.h> | 48 | #include <asm/uaccess.h> |
| 47 | #include <asm/io.h> | 49 | #include <asm/io.h> |
| 48 | #include <asm/unistd.h> | 50 | #include <asm/unistd.h> |
| @@ -285,6 +287,7 @@ out_unlock: | |||
| 285 | */ | 287 | */ |
| 286 | void emergency_restart(void) | 288 | void emergency_restart(void) |
| 287 | { | 289 | { |
| 290 | kmsg_dump(KMSG_DUMP_EMERG); | ||
| 288 | machine_emergency_restart(); | 291 | machine_emergency_restart(); |
| 289 | } | 292 | } |
| 290 | EXPORT_SYMBOL_GPL(emergency_restart); | 293 | EXPORT_SYMBOL_GPL(emergency_restart); |
| @@ -312,6 +315,7 @@ void kernel_restart(char *cmd) | |||
| 312 | printk(KERN_EMERG "Restarting system.\n"); | 315 | printk(KERN_EMERG "Restarting system.\n"); |
| 313 | else | 316 | else |
| 314 | printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd); | 317 | printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd); |
| 318 | kmsg_dump(KMSG_DUMP_RESTART); | ||
| 315 | machine_restart(cmd); | 319 | machine_restart(cmd); |
| 316 | } | 320 | } |
| 317 | EXPORT_SYMBOL_GPL(kernel_restart); | 321 | EXPORT_SYMBOL_GPL(kernel_restart); |
| @@ -333,6 +337,7 @@ void kernel_halt(void) | |||
| 333 | kernel_shutdown_prepare(SYSTEM_HALT); | 337 | kernel_shutdown_prepare(SYSTEM_HALT); |
| 334 | sysdev_shutdown(); | 338 | sysdev_shutdown(); |
| 335 | printk(KERN_EMERG "System halted.\n"); | 339 | printk(KERN_EMERG "System halted.\n"); |
| 340 | kmsg_dump(KMSG_DUMP_HALT); | ||
| 336 | machine_halt(); | 341 | machine_halt(); |
| 337 | } | 342 | } |
| 338 | 343 | ||
| @@ -351,6 +356,7 @@ void kernel_power_off(void) | |||
| 351 | disable_nonboot_cpus(); | 356 | disable_nonboot_cpus(); |
| 352 | sysdev_shutdown(); | 357 | sysdev_shutdown(); |
| 353 | printk(KERN_EMERG "Power down.\n"); | 358 | printk(KERN_EMERG "Power down.\n"); |
| 359 | kmsg_dump(KMSG_DUMP_POWEROFF); | ||
| 354 | machine_power_off(); | 360 | machine_power_off(); |
| 355 | } | 361 | } |
| 356 | EXPORT_SYMBOL_GPL(kernel_power_off); | 362 | EXPORT_SYMBOL_GPL(kernel_power_off); |
| @@ -1080,8 +1086,10 @@ SYSCALL_DEFINE0(setsid) | |||
| 1080 | err = session; | 1086 | err = session; |
| 1081 | out: | 1087 | out: |
| 1082 | write_unlock_irq(&tasklist_lock); | 1088 | write_unlock_irq(&tasklist_lock); |
| 1083 | if (err > 0) | 1089 | if (err > 0) { |
| 1084 | proc_sid_connector(group_leader); | 1090 | proc_sid_connector(group_leader); |
| 1091 | sched_autogroup_create_attach(group_leader); | ||
| 1092 | } | ||
| 1085 | return err; | 1093 | return err; |
| 1086 | } | 1094 | } |
| 1087 | 1095 | ||
| @@ -1377,7 +1385,8 @@ static int check_prlimit_permission(struct task_struct *task) | |||
| 1377 | const struct cred *cred = current_cred(), *tcred; | 1385 | const struct cred *cred = current_cred(), *tcred; |
| 1378 | 1386 | ||
| 1379 | tcred = __task_cred(task); | 1387 | tcred = __task_cred(task); |
| 1380 | if ((cred->uid != tcred->euid || | 1388 | if (current != task && |
| 1389 | (cred->uid != tcred->euid || | ||
| 1381 | cred->uid != tcred->suid || | 1390 | cred->uid != tcred->suid || |
| 1382 | cred->uid != tcred->uid || | 1391 | cred->uid != tcred->uid || |
| 1383 | cred->gid != tcred->egid || | 1392 | cred->gid != tcred->egid || |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3afce4dc9ba..0f1bd83db98 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
| @@ -24,6 +24,7 @@ | |||
| 24 | #include <linux/slab.h> | 24 | #include <linux/slab.h> |
| 25 | #include <linux/sysctl.h> | 25 | #include <linux/sysctl.h> |
| 26 | #include <linux/signal.h> | 26 | #include <linux/signal.h> |
| 27 | #include <linux/printk.h> | ||
| 27 | #include <linux/proc_fs.h> | 28 | #include <linux/proc_fs.h> |
| 28 | #include <linux/security.h> | 29 | #include <linux/security.h> |
| 29 | #include <linux/ctype.h> | 30 | #include <linux/ctype.h> |
| @@ -246,10 +247,6 @@ static struct ctl_table root_table[] = { | |||
| 246 | .mode = 0555, | 247 | .mode = 0555, |
| 247 | .child = dev_table, | 248 | .child = dev_table, |
| 248 | }, | 249 | }, |
| 249 | /* | ||
| 250 | * NOTE: do not add new entries to this table unless you have read | ||
| 251 | * Documentation/sysctl/ctl_unnumbered.txt | ||
| 252 | */ | ||
| 253 | { } | 250 | { } |
| 254 | }; | 251 | }; |
| 255 | 252 | ||
| @@ -260,8 +257,6 @@ static int min_wakeup_granularity_ns; /* 0 usecs */ | |||
| 260 | static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ | 257 | static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ |
| 261 | static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; | 258 | static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; |
| 262 | static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; | 259 | static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; |
| 263 | static int min_sched_shares_ratelimit = 100000; /* 100 usec */ | ||
| 264 | static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */ | ||
| 265 | #endif | 260 | #endif |
| 266 | 261 | ||
| 267 | #ifdef CONFIG_COMPACTION | 262 | #ifdef CONFIG_COMPACTION |
| @@ -306,15 +301,6 @@ static struct ctl_table kern_table[] = { | |||
| 306 | .extra2 = &max_wakeup_granularity_ns, | 301 | .extra2 = &max_wakeup_granularity_ns, |
| 307 | }, | 302 | }, |
| 308 | { | 303 | { |
| 309 | .procname = "sched_shares_ratelimit", | ||
| 310 | .data = &sysctl_sched_shares_ratelimit, | ||
| 311 | .maxlen = sizeof(unsigned int), | ||
| 312 | .mode = 0644, | ||
| 313 | .proc_handler = sched_proc_update_handler, | ||
| 314 | .extra1 = &min_sched_shares_ratelimit, | ||
| 315 | .extra2 = &max_sched_shares_ratelimit, | ||
| 316 | }, | ||
| 317 | { | ||
| 318 | .procname = "sched_tunable_scaling", | 304 | .procname = "sched_tunable_scaling", |
| 319 | .data = &sysctl_sched_tunable_scaling, | 305 | .data = &sysctl_sched_tunable_scaling, |
| 320 | .maxlen = sizeof(enum sched_tunable_scaling), | 306 | .maxlen = sizeof(enum sched_tunable_scaling), |
| @@ -324,14 +310,6 @@ static struct ctl_table kern_table[] = { | |||
| 324 | .extra2 = &max_sched_tunable_scaling, | 310 | .extra2 = &max_sched_tunable_scaling, |
| 325 | }, | 311 | }, |
| 326 | { | 312 | { |
| 327 | .procname = "sched_shares_thresh", | ||
| 328 | .data = &sysctl_sched_shares_thresh, | ||
| 329 | .maxlen = sizeof(unsigned int), | ||
| 330 | .mode = 0644, | ||
| 331 | .proc_handler = proc_dointvec_minmax, | ||
| 332 | .extra1 = &zero, | ||
| 333 | }, | ||
| 334 | { | ||
| 335 | .procname = "sched_migration_cost", | 313 | .procname = "sched_migration_cost", |
| 336 | .data = &sysctl_sched_migration_cost, | 314 | .data = &sysctl_sched_migration_cost, |
| 337 | .maxlen = sizeof(unsigned int), | 315 | .maxlen = sizeof(unsigned int), |
| @@ -353,6 +331,13 @@ static struct ctl_table kern_table[] = { | |||
| 353 | .proc_handler = proc_dointvec, | 331 | .proc_handler = proc_dointvec, |
| 354 | }, | 332 | }, |
| 355 | { | 333 | { |
| 334 | .procname = "sched_shares_window", | ||
| 335 | .data = &sysctl_sched_shares_window, | ||
| 336 | .maxlen = sizeof(unsigned int), | ||
| 337 | .mode = 0644, | ||
| 338 | .proc_handler = proc_dointvec, | ||
| 339 | }, | ||
| 340 | { | ||
| 356 | .procname = "timer_migration", | 341 | .procname = "timer_migration", |
| 357 | .data = &sysctl_timer_migration, | 342 | .data = &sysctl_timer_migration, |
| 358 | .maxlen = sizeof(unsigned int), | 343 | .maxlen = sizeof(unsigned int), |
| @@ -383,6 +368,17 @@ static struct ctl_table kern_table[] = { | |||
| 383 | .mode = 0644, | 368 | .mode = 0644, |
| 384 | .proc_handler = proc_dointvec, | 369 | .proc_handler = proc_dointvec, |
| 385 | }, | 370 | }, |
| 371 | #ifdef CONFIG_SCHED_AUTOGROUP | ||
| 372 | { | ||
| 373 | .procname = "sched_autogroup_enabled", | ||
| 374 | .data = &sysctl_sched_autogroup_enabled, | ||
| 375 | .maxlen = sizeof(unsigned int), | ||
| 376 | .mode = 0644, | ||
| 377 | .proc_handler = proc_dointvec, | ||
| 378 | .extra1 = &zero, | ||
| 379 | .extra2 = &one, | ||
| 380 | }, | ||
| 381 | #endif | ||
| 386 | #ifdef CONFIG_PROVE_LOCKING | 382 | #ifdef CONFIG_PROVE_LOCKING |
| 387 | { | 383 | { |
| 388 | .procname = "prove_locking", | 384 | .procname = "prove_locking", |
| @@ -703,6 +699,24 @@ static struct ctl_table kern_table[] = { | |||
| 703 | .extra1 = &zero, | 699 | .extra1 = &zero, |
| 704 | .extra2 = &ten_thousand, | 700 | .extra2 = &ten_thousand, |
| 705 | }, | 701 | }, |
| 702 | { | ||
| 703 | .procname = "dmesg_restrict", | ||
| 704 | .data = &dmesg_restrict, | ||
| 705 | .maxlen = sizeof(int), | ||
| 706 | .mode = 0644, | ||
| 707 | .proc_handler = proc_dointvec_minmax, | ||
| 708 | .extra1 = &zero, | ||
| 709 | .extra2 = &one, | ||
| 710 | }, | ||
| 711 | { | ||
| 712 | .procname = "kptr_restrict", | ||
| 713 | .data = &kptr_restrict, | ||
| 714 | .maxlen = sizeof(int), | ||
| 715 | .mode = 0644, | ||
| 716 | .proc_handler = proc_dointvec_minmax, | ||
| 717 | .extra1 = &zero, | ||
| 718 | .extra2 = &two, | ||
| 719 | }, | ||
| 706 | #endif | 720 | #endif |
| 707 | { | 721 | { |
| 708 | .procname = "ngroups_max", | 722 | .procname = "ngroups_max", |
| @@ -737,21 +751,21 @@ static struct ctl_table kern_table[] = { | |||
| 737 | .extra1 = &zero, | 751 | .extra1 = &zero, |
| 738 | .extra2 = &one, | 752 | .extra2 = &one, |
| 739 | }, | 753 | }, |
| 740 | #endif | ||
| 741 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) && !defined(CONFIG_LOCKUP_DETECTOR) | ||
| 742 | { | 754 | { |
| 743 | .procname = "unknown_nmi_panic", | 755 | .procname = "nmi_watchdog", |
| 744 | .data = &unknown_nmi_panic, | 756 | .data = &watchdog_enabled, |
| 745 | .maxlen = sizeof (int), | 757 | .maxlen = sizeof (int), |
| 746 | .mode = 0644, | 758 | .mode = 0644, |
| 747 | .proc_handler = proc_dointvec, | 759 | .proc_handler = proc_dowatchdog_enabled, |
| 748 | }, | 760 | }, |
| 761 | #endif | ||
| 762 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) | ||
| 749 | { | 763 | { |
| 750 | .procname = "nmi_watchdog", | 764 | .procname = "unknown_nmi_panic", |
| 751 | .data = &nmi_watchdog_enabled, | 765 | .data = &unknown_nmi_panic, |
| 752 | .maxlen = sizeof (int), | 766 | .maxlen = sizeof (int), |
| 753 | .mode = 0644, | 767 | .mode = 0644, |
| 754 | .proc_handler = proc_nmi_enabled, | 768 | .proc_handler = proc_dointvec, |
| 755 | }, | 769 | }, |
| 756 | #endif | 770 | #endif |
| 757 | #if defined(CONFIG_X86) | 771 | #if defined(CONFIG_X86) |
| @@ -955,10 +969,6 @@ static struct ctl_table kern_table[] = { | |||
| 955 | .proc_handler = proc_dointvec, | 969 | .proc_handler = proc_dointvec, |
| 956 | }, | 970 | }, |
| 957 | #endif | 971 | #endif |
| 958 | /* | ||
| 959 | * NOTE: do not add new entries to this table unless you have read | ||
| 960 | * Documentation/sysctl/ctl_unnumbered.txt | ||
| 961 | */ | ||
| 962 | { } | 972 | { } |
| 963 | }; | 973 | }; |
| 964 | 974 | ||
| @@ -1319,11 +1329,6 @@ static struct ctl_table vm_table[] = { | |||
| 1319 | .extra2 = &one, | 1329 | .extra2 = &one, |
| 1320 | }, | 1330 | }, |
| 1321 | #endif | 1331 | #endif |
| 1322 | |||
| 1323 | /* | ||
| 1324 | * NOTE: do not add new entries to this table unless you have read | ||
| 1325 | * Documentation/sysctl/ctl_unnumbered.txt | ||
| 1326 | */ | ||
| 1327 | { } | 1332 | { } |
| 1328 | }; | 1333 | }; |
| 1329 | 1334 | ||
| @@ -1479,10 +1484,6 @@ static struct ctl_table fs_table[] = { | |||
| 1479 | .proc_handler = &pipe_proc_fn, | 1484 | .proc_handler = &pipe_proc_fn, |
| 1480 | .extra1 = &pipe_min_size, | 1485 | .extra1 = &pipe_min_size, |
| 1481 | }, | 1486 | }, |
| 1482 | /* | ||
| 1483 | * NOTE: do not add new entries to this table unless you have read | ||
| 1484 | * Documentation/sysctl/ctl_unnumbered.txt | ||
| 1485 | */ | ||
| 1486 | { } | 1487 | { } |
| 1487 | }; | 1488 | }; |
| 1488 | 1489 | ||
| @@ -2892,7 +2893,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, | |||
| 2892 | } | 2893 | } |
| 2893 | } | 2894 | } |
| 2894 | 2895 | ||
| 2895 | #else /* CONFIG_PROC_FS */ | 2896 | #else /* CONFIG_PROC_SYSCTL */ |
| 2896 | 2897 | ||
| 2897 | int proc_dostring(struct ctl_table *table, int write, | 2898 | int proc_dostring(struct ctl_table *table, int write, |
| 2898 | void __user *buffer, size_t *lenp, loff_t *ppos) | 2899 | void __user *buffer, size_t *lenp, loff_t *ppos) |
| @@ -2944,7 +2945,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, | |||
| 2944 | } | 2945 | } |
| 2945 | 2946 | ||
| 2946 | 2947 | ||
| 2947 | #endif /* CONFIG_PROC_FS */ | 2948 | #endif /* CONFIG_PROC_SYSCTL */ |
| 2948 | 2949 | ||
| 2949 | /* | 2950 | /* |
| 2950 | * No sense putting this after each symbol definition, twice, | 2951 | * No sense putting this after each symbol definition, twice, |
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 1357c578606..b875bedf7c9 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c | |||
| @@ -136,7 +136,6 @@ static const struct bin_table bin_kern_table[] = { | |||
| 136 | { CTL_INT, KERN_IA64_UNALIGNED, "ignore-unaligned-usertrap" }, | 136 | { CTL_INT, KERN_IA64_UNALIGNED, "ignore-unaligned-usertrap" }, |
| 137 | { CTL_INT, KERN_COMPAT_LOG, "compat-log" }, | 137 | { CTL_INT, KERN_COMPAT_LOG, "compat-log" }, |
| 138 | { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" }, | 138 | { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" }, |
| 139 | { CTL_INT, KERN_NMI_WATCHDOG, "nmi_watchdog" }, | ||
| 140 | { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" }, | 139 | { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" }, |
| 141 | {} | 140 | {} |
| 142 | }; | 141 | }; |
| @@ -1193,7 +1192,7 @@ static ssize_t bin_dn_node_address(struct file *file, | |||
| 1193 | 1192 | ||
| 1194 | buf[result] = '\0'; | 1193 | buf[result] = '\0'; |
| 1195 | 1194 | ||
| 1196 | /* Convert the decnet addresss to binary */ | 1195 | /* Convert the decnet address to binary */ |
| 1197 | result = -EIO; | 1196 | result = -EIO; |
| 1198 | nodep = strchr(buf, '.') + 1; | 1197 | nodep = strchr(buf, '.') + 1; |
| 1199 | if (!nodep) | 1198 | if (!nodep) |
diff --git a/kernel/taskstats.c b/kernel/taskstats.c index c8231fb1570..3971c6b9d58 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c | |||
| @@ -89,8 +89,7 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, | |||
| 89 | return -ENOMEM; | 89 | return -ENOMEM; |
| 90 | 90 | ||
| 91 | if (!info) { | 91 | if (!info) { |
| 92 | int seq = get_cpu_var(taskstats_seqnum)++; | 92 | int seq = this_cpu_inc_return(taskstats_seqnum) - 1; |
| 93 | put_cpu_var(taskstats_seqnum); | ||
| 94 | 93 | ||
| 95 | reply = genlmsg_put(skb, 0, seq, &family, 0, cmd); | 94 | reply = genlmsg_put(skb, 0, seq, &family, 0, cmd); |
| 96 | } else | 95 | } else |
| @@ -349,25 +348,47 @@ static int parse(struct nlattr *na, struct cpumask *mask) | |||
| 349 | return ret; | 348 | return ret; |
| 350 | } | 349 | } |
| 351 | 350 | ||
| 351 | #if defined(CONFIG_64BIT) && !defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) | ||
| 352 | #define TASKSTATS_NEEDS_PADDING 1 | ||
| 353 | #endif | ||
| 354 | |||
| 352 | static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid) | 355 | static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid) |
| 353 | { | 356 | { |
| 354 | struct nlattr *na, *ret; | 357 | struct nlattr *na, *ret; |
| 355 | int aggr; | 358 | int aggr; |
| 356 | 359 | ||
| 357 | /* If we don't pad, we end up with alignment on a 4 byte boundary. | ||
| 358 | * This causes lots of runtime warnings on systems requiring 8 byte | ||
| 359 | * alignment */ | ||
| 360 | u32 pids[2] = { pid, 0 }; | ||
| 361 | int pid_size = ALIGN(sizeof(pid), sizeof(long)); | ||
| 362 | |||
| 363 | aggr = (type == TASKSTATS_TYPE_PID) | 360 | aggr = (type == TASKSTATS_TYPE_PID) |
| 364 | ? TASKSTATS_TYPE_AGGR_PID | 361 | ? TASKSTATS_TYPE_AGGR_PID |
| 365 | : TASKSTATS_TYPE_AGGR_TGID; | 362 | : TASKSTATS_TYPE_AGGR_TGID; |
| 366 | 363 | ||
| 364 | /* | ||
| 365 | * The taskstats structure is internally aligned on 8 byte | ||
| 366 | * boundaries but the layout of the aggregrate reply, with | ||
| 367 | * two NLA headers and the pid (each 4 bytes), actually | ||
| 368 | * force the entire structure to be unaligned. This causes | ||
| 369 | * the kernel to issue unaligned access warnings on some | ||
| 370 | * architectures like ia64. Unfortunately, some software out there | ||
| 371 | * doesn't properly unroll the NLA packet and assumes that the start | ||
| 372 | * of the taskstats structure will always be 20 bytes from the start | ||
| 373 | * of the netlink payload. Aligning the start of the taskstats | ||
| 374 | * structure breaks this software, which we don't want. So, for now | ||
| 375 | * the alignment only happens on architectures that require it | ||
| 376 | * and those users will have to update to fixed versions of those | ||
| 377 | * packages. Space is reserved in the packet only when needed. | ||
| 378 | * This ifdef should be removed in several years e.g. 2012 once | ||
| 379 | * we can be confident that fixed versions are installed on most | ||
| 380 | * systems. We add the padding before the aggregate since the | ||
| 381 | * aggregate is already a defined type. | ||
| 382 | */ | ||
| 383 | #ifdef TASKSTATS_NEEDS_PADDING | ||
| 384 | if (nla_put(skb, TASKSTATS_TYPE_NULL, 0, NULL) < 0) | ||
| 385 | goto err; | ||
| 386 | #endif | ||
| 367 | na = nla_nest_start(skb, aggr); | 387 | na = nla_nest_start(skb, aggr); |
| 368 | if (!na) | 388 | if (!na) |
| 369 | goto err; | 389 | goto err; |
| 370 | if (nla_put(skb, type, pid_size, pids) < 0) | 390 | |
| 391 | if (nla_put(skb, type, sizeof(pid), &pid) < 0) | ||
| 371 | goto err; | 392 | goto err; |
| 372 | ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats)); | 393 | ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats)); |
| 373 | if (!ret) | 394 | if (!ret) |
| @@ -456,6 +477,18 @@ out: | |||
| 456 | return rc; | 477 | return rc; |
| 457 | } | 478 | } |
| 458 | 479 | ||
| 480 | static size_t taskstats_packet_size(void) | ||
| 481 | { | ||
| 482 | size_t size; | ||
| 483 | |||
| 484 | size = nla_total_size(sizeof(u32)) + | ||
| 485 | nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); | ||
| 486 | #ifdef TASKSTATS_NEEDS_PADDING | ||
| 487 | size += nla_total_size(0); /* Padding for alignment */ | ||
| 488 | #endif | ||
| 489 | return size; | ||
| 490 | } | ||
| 491 | |||
| 459 | static int cmd_attr_pid(struct genl_info *info) | 492 | static int cmd_attr_pid(struct genl_info *info) |
| 460 | { | 493 | { |
| 461 | struct taskstats *stats; | 494 | struct taskstats *stats; |
| @@ -464,8 +497,7 @@ static int cmd_attr_pid(struct genl_info *info) | |||
| 464 | u32 pid; | 497 | u32 pid; |
| 465 | int rc; | 498 | int rc; |
| 466 | 499 | ||
| 467 | size = nla_total_size(sizeof(u32)) + | 500 | size = taskstats_packet_size(); |
| 468 | nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); | ||
| 469 | 501 | ||
| 470 | rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); | 502 | rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); |
| 471 | if (rc < 0) | 503 | if (rc < 0) |
| @@ -494,8 +526,7 @@ static int cmd_attr_tgid(struct genl_info *info) | |||
| 494 | u32 tgid; | 526 | u32 tgid; |
| 495 | int rc; | 527 | int rc; |
| 496 | 528 | ||
| 497 | size = nla_total_size(sizeof(u32)) + | 529 | size = taskstats_packet_size(); |
| 498 | nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); | ||
| 499 | 530 | ||
| 500 | rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); | 531 | rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); |
| 501 | if (rc < 0) | 532 | if (rc < 0) |
| @@ -570,8 +601,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead) | |||
| 570 | /* | 601 | /* |
| 571 | * Size includes space for nested attributes | 602 | * Size includes space for nested attributes |
| 572 | */ | 603 | */ |
| 573 | size = nla_total_size(sizeof(u32)) + | 604 | size = taskstats_packet_size(); |
| 574 | nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); | ||
| 575 | 605 | ||
| 576 | is_thread_group = !!taskstats_tgid_alloc(tsk); | 606 | is_thread_group = !!taskstats_tgid_alloc(tsk); |
| 577 | if (is_thread_group) { | 607 | if (is_thread_group) { |
| @@ -581,7 +611,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead) | |||
| 581 | fill_tgid_exit(tsk); | 611 | fill_tgid_exit(tsk); |
| 582 | } | 612 | } |
| 583 | 613 | ||
| 584 | listeners = &__raw_get_cpu_var(listener_array); | 614 | listeners = __this_cpu_ptr(&listener_array); |
| 585 | if (list_empty(&listeners->list)) | 615 | if (list_empty(&listeners->list)) |
| 586 | return; | 616 | return; |
| 587 | 617 | ||
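The padding comment above is easiest to check by writing the attribute arithmetic out: a 4-byte genetlink header, a 4-byte nest header, an 8-byte pid attribute and a 4-byte stats header put the taskstats payload at offset 20, and one extra empty attribute moves it to 24. The short program below just redoes that arithmetic; the macro values mirror netlink's 4-byte attribute alignment, everything else is illustrative.

    /* Back-of-the-envelope check of the alignment argument above. The header
     * sizes are the generic netlink ones (genlmsghdr and nlattr are 4 bytes,
     * attributes padded to 4); everything else is illustrative. */
    #include <stdio.h>

    #define NLA_ALIGN(x)  (((x) + 3) & ~3)
    #define NLA_HDRLEN    4
    #define GENL_HDRLEN   4

    int main(void)
    {
        /* offset of the taskstats payload without padding:
         * genl header + AGGR nest header + pid attribute + STATS header */
        unsigned off = GENL_HDRLEN + NLA_HDRLEN +
                       NLA_ALIGN(NLA_HDRLEN + sizeof(unsigned int)) + NLA_HDRLEN;
        printf("without padding: offset %u (8-byte aligned: %s)\n",
               off, off % 8 ? "no" : "yes");

        /* the zero-length TASKSTATS_TYPE_NULL attribute adds one bare header */
        off += NLA_HDRLEN;
        printf("with padding:    offset %u (8-byte aligned: %s)\n",
               off, off % 8 ? "no" : "yes");
        return 0;
    }

This also shows why the padding is added only where unaligned access matters: userspace that hard-codes the 20-byte offset keeps working everywhere else.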
diff --git a/kernel/time.c b/kernel/time.c index ba9b338d183..32174359576 100644 --- a/kernel/time.c +++ b/kernel/time.c | |||
| @@ -238,7 +238,7 @@ EXPORT_SYMBOL(current_fs_time); | |||
| 238 | * Avoid unnecessary multiplications/divisions in the | 238 | * Avoid unnecessary multiplications/divisions in the |
| 239 | * two most common HZ cases: | 239 | * two most common HZ cases: |
| 240 | */ | 240 | */ |
| 241 | unsigned int inline jiffies_to_msecs(const unsigned long j) | 241 | inline unsigned int jiffies_to_msecs(const unsigned long j) |
| 242 | { | 242 | { |
| 243 | #if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) | 243 | #if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) |
| 244 | return (MSEC_PER_SEC / HZ) * j; | 244 | return (MSEC_PER_SEC / HZ) * j; |
| @@ -254,7 +254,7 @@ unsigned int inline jiffies_to_msecs(const unsigned long j) | |||
| 254 | } | 254 | } |
| 255 | EXPORT_SYMBOL(jiffies_to_msecs); | 255 | EXPORT_SYMBOL(jiffies_to_msecs); |
| 256 | 256 | ||
| 257 | unsigned int inline jiffies_to_usecs(const unsigned long j) | 257 | inline unsigned int jiffies_to_usecs(const unsigned long j) |
| 258 | { | 258 | { |
| 259 | #if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) | 259 | #if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) |
| 260 | return (USEC_PER_SEC / HZ) * j; | 260 | return (USEC_PER_SEC / HZ) * j; |
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index c18d7efa1b4..6519cf62d9c 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c | |||
| @@ -113,7 +113,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time); | |||
| 113 | * @shift: pointer to shift variable | 113 | * @shift: pointer to shift variable |
| 114 | * @from: frequency to convert from | 114 | * @from: frequency to convert from |
| 115 | * @to: frequency to convert to | 115 | * @to: frequency to convert to |
| 116 | * @minsec: guaranteed runtime conversion range in seconds | 116 | * @maxsec: guaranteed runtime conversion range in seconds |
| 117 | * | 117 | * |
| 118 | * The function evaluates the shift/mult pair for the scaled math | 118 | * The function evaluates the shift/mult pair for the scaled math |
| 119 | * operations of clocksources and clockevents. | 119 | * operations of clocksources and clockevents. |
| @@ -122,7 +122,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time); | |||
| 122 | * NSEC_PER_SEC == 1GHz and @from is the counter frequency. For clock | 122 | * NSEC_PER_SEC == 1GHz and @from is the counter frequency. For clock |
| 123 | * event @to is the counter frequency and @from is NSEC_PER_SEC. | 123 | * event @to is the counter frequency and @from is NSEC_PER_SEC. |
| 124 | * | 124 | * |
| 125 | * The @minsec conversion range argument controls the time frame in | 125 | * The @maxsec conversion range argument controls the time frame in |
| 126 | * seconds which must be covered by the runtime conversion with the | 126 | * seconds which must be covered by the runtime conversion with the |
| 127 | * calculated mult and shift factors. This guarantees that no 64bit | 127 | * calculated mult and shift factors. This guarantees that no 64bit |
| 128 | * overflow happens when the input value of the conversion is | 128 | * overflow happens when the input value of the conversion is |
| @@ -131,7 +131,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time); | |||
| 131 | * factors. | 131 | * factors. |
| 132 | */ | 132 | */ |
| 133 | void | 133 | void |
| 134 | clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec) | 134 | clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec) |
| 135 | { | 135 | { |
| 136 | u64 tmp; | 136 | u64 tmp; |
| 137 | u32 sft, sftacc= 32; | 137 | u32 sft, sftacc= 32; |
| @@ -140,7 +140,7 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec) | |||
| 140 | * Calculate the shift factor which is limiting the conversion | 140 | * Calculate the shift factor which is limiting the conversion |
| 141 | * range: | 141 | * range: |
| 142 | */ | 142 | */ |
| 143 | tmp = ((u64)minsec * from) >> 32; | 143 | tmp = ((u64)maxsec * from) >> 32; |
| 144 | while (tmp) { | 144 | while (tmp) { |
| 145 | tmp >>=1; | 145 | tmp >>=1; |
| 146 | sftacc--; | 146 | sftacc--; |
| @@ -152,6 +152,7 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec) | |||
| 152 | */ | 152 | */ |
| 153 | for (sft = 32; sft > 0; sft--) { | 153 | for (sft = 32; sft > 0; sft--) { |
| 154 | tmp = (u64) to << sft; | 154 | tmp = (u64) to << sft; |
| 155 | tmp += from / 2; | ||
| 155 | do_div(tmp, from); | 156 | do_div(tmp, from); |
| 156 | if ((tmp >> sftacc) == 0) | 157 | if ((tmp >> sftacc) == 0) |
| 157 | break; | 158 | break; |
| @@ -678,7 +679,7 @@ EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); | |||
| 678 | int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) | 679 | int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) |
| 679 | { | 680 | { |
| 680 | 681 | ||
| 681 | /* Intialize mult/shift and max_idle_ns */ | 682 | /* Initialize mult/shift and max_idle_ns */ |
| 682 | __clocksource_updatefreq_scale(cs, scale, freq); | 683 | __clocksource_updatefreq_scale(cs, scale, freq); |
| 683 | 684 | ||
| 684 | /* Add clocksource to the clocksource list */ | 685 | /* Add clocksource to the clocksource list */ |
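The clocks_calc_mult_shift() hunk adds "tmp += from / 2" so that mult is rounded to the nearest value instead of truncated. The standalone sketch below re-implements the calculation in plain C to show the effect; it is a model of the math only, with example frequencies chosen for the demo.

    /* Standalone model of the mult/shift calculation shown above, including
     * the new "+ from / 2" rounding step. Not kernel code; just the math. */
    #include <stdio.h>
    #include <stdint.h>

    static void calc_mult_shift(uint32_t *mult, uint32_t *shift,
                                uint32_t from, uint32_t to, uint32_t maxsec)
    {
        uint64_t tmp;
        uint32_t sft, sftacc = 32;

        /* limit the range so maxsec seconds of input cannot overflow 64 bits */
        tmp = ((uint64_t)maxsec * from) >> 32;
        while (tmp) {
            tmp >>= 1;
            sftacc--;
        }

        /* pick the largest shift whose mult still fits the allowed range */
        for (sft = 32; sft > 0; sft--) {
            tmp = (uint64_t)to << sft;
            tmp += from / 2;            /* round to nearest, as in the patch */
            tmp /= from;
            if ((tmp >> sftacc) == 0)
                break;
        }
        *mult = tmp;
        *shift = sft;
    }

    int main(void)
    {
        uint32_t mult, shift;

        /* e.g. a 24 MHz counter converted to nanoseconds, valid for 600 s */
        calc_mult_shift(&mult, &shift, 24000000, 1000000000, 600);
        printf("mult=%u shift=%u, 24000 cycles -> %llu ns\n", mult, shift,
               (unsigned long long)(((uint64_t)24000 * mult) >> shift));
        return 0;
    }

For this example it computes mult=699050667 and shift=24, so 24000 cycles (1 ms at 24 MHz) convert to 1000000 ns; without the rounding term the conversion would consistently err slightly low.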
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index d2321891538..5c00242fa92 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c | |||
| @@ -14,6 +14,7 @@ | |||
| 14 | #include <linux/timex.h> | 14 | #include <linux/timex.h> |
| 15 | #include <linux/time.h> | 15 | #include <linux/time.h> |
| 16 | #include <linux/mm.h> | 16 | #include <linux/mm.h> |
| 17 | #include <linux/module.h> | ||
| 17 | 18 | ||
| 18 | /* | 19 | /* |
| 19 | * NTP timekeeping variables: | 20 | * NTP timekeeping variables: |
| @@ -74,6 +75,162 @@ static long time_adjust; | |||
| 74 | /* constant (boot-param configurable) NTP tick adjustment (upscaled) */ | 75 | /* constant (boot-param configurable) NTP tick adjustment (upscaled) */ |
| 75 | static s64 ntp_tick_adj; | 76 | static s64 ntp_tick_adj; |
| 76 | 77 | ||
| 78 | #ifdef CONFIG_NTP_PPS | ||
| 79 | |||
| 80 | /* | ||
| 81 | * The following variables are used when a pulse-per-second (PPS) signal | ||
| 82 | * is available. They establish the engineering parameters of the clock | ||
| 83 | * discipline loop when controlled by the PPS signal. | ||
| 84 | */ | ||
| 85 | #define PPS_VALID 10 /* PPS signal watchdog max (s) */ | ||
| 86 | #define PPS_POPCORN 4 /* popcorn spike threshold (shift) */ | ||
| 87 | #define PPS_INTMIN 2 /* min freq interval (s) (shift) */ | ||
| 88 | #define PPS_INTMAX 8 /* max freq interval (s) (shift) */ | ||
| 89 | #define PPS_INTCOUNT 4 /* number of consecutive good intervals to | ||
| 90 | increase pps_shift or consecutive bad | ||
| 91 | intervals to decrease it */ | ||
| 92 | #define PPS_MAXWANDER 100000 /* max PPS freq wander (ns/s) */ | ||
| 93 | |||
| 94 | static int pps_valid; /* signal watchdog counter */ | ||
| 95 | static long pps_tf[3]; /* phase median filter */ | ||
| 96 | static long pps_jitter; /* current jitter (ns) */ | ||
| 97 | static struct timespec pps_fbase; /* beginning of the last freq interval */ | ||
| 98 | static int pps_shift; /* current interval duration (s) (shift) */ | ||
| 99 | static int pps_intcnt; /* interval counter */ | ||
| 100 | static s64 pps_freq; /* frequency offset (scaled ns/s) */ | ||
| 101 | static long pps_stabil; /* current stability (scaled ns/s) */ | ||
| 102 | |||
| 103 | /* | ||
| 104 | * PPS signal quality monitors | ||
| 105 | */ | ||
| 106 | static long pps_calcnt; /* calibration intervals */ | ||
| 107 | static long pps_jitcnt; /* jitter limit exceeded */ | ||
| 108 | static long pps_stbcnt; /* stability limit exceeded */ | ||
| 109 | static long pps_errcnt; /* calibration errors */ | ||
| 110 | |||
| 111 | |||
| 112 | /* PPS kernel consumer compensates the whole phase error immediately. | ||
| 113 | * Otherwise, reduce the offset by a fixed factor times the time constant. | ||
| 114 | */ | ||
| 115 | static inline s64 ntp_offset_chunk(s64 offset) | ||
| 116 | { | ||
| 117 | if (time_status & STA_PPSTIME && time_status & STA_PPSSIGNAL) | ||
| 118 | return offset; | ||
| 119 | else | ||
| 120 | return shift_right(offset, SHIFT_PLL + time_constant); | ||
| 121 | } | ||
| 122 | |||
| 123 | static inline void pps_reset_freq_interval(void) | ||
| 124 | { | ||
| 125 | /* the PPS calibration interval may end | ||
| 126 | surprisingly early */ | ||
| 127 | pps_shift = PPS_INTMIN; | ||
| 128 | pps_intcnt = 0; | ||
| 129 | } | ||
| 130 | |||
| 131 | /** | ||
| 132 | * pps_clear - Clears the PPS state variables | ||
| 133 | * | ||
| 134 | * Must be called while holding a write on the xtime_lock | ||
| 135 | */ | ||
| 136 | static inline void pps_clear(void) | ||
| 137 | { | ||
| 138 | pps_reset_freq_interval(); | ||
| 139 | pps_tf[0] = 0; | ||
| 140 | pps_tf[1] = 0; | ||
| 141 | pps_tf[2] = 0; | ||
| 142 | pps_fbase.tv_sec = pps_fbase.tv_nsec = 0; | ||
| 143 | pps_freq = 0; | ||
| 144 | } | ||
| 145 | |||
| 146 | /* Decrease pps_valid to indicate that another second has passed since | ||
| 147 | * the last PPS signal. When it reaches 0, indicate that the PPS | ||
| 148 | * signal is missing. | ||
| 149 | * | ||
| 150 | * Must be called while holding a write on the xtime_lock | ||
| 151 | */ | ||
| 152 | static inline void pps_dec_valid(void) | ||
| 153 | { | ||
| 154 | if (pps_valid > 0) | ||
| 155 | pps_valid--; | ||
| 156 | else { | ||
| 157 | time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | | ||
| 158 | STA_PPSWANDER | STA_PPSERROR); | ||
| 159 | pps_clear(); | ||
| 160 | } | ||
| 161 | } | ||
| 162 | |||
| 163 | static inline void pps_set_freq(s64 freq) | ||
| 164 | { | ||
| 165 | pps_freq = freq; | ||
| 166 | } | ||
| 167 | |||
| 168 | static inline int is_error_status(int status) | ||
| 169 | { | ||
| 170 | return (time_status & (STA_UNSYNC|STA_CLOCKERR)) | ||
| 171 | /* PPS signal lost when either PPS time or | ||
| 172 | * PPS frequency synchronization requested | ||
| 173 | */ | ||
| 174 | || ((time_status & (STA_PPSFREQ|STA_PPSTIME)) | ||
| 175 | && !(time_status & STA_PPSSIGNAL)) | ||
| 176 | /* PPS jitter exceeded when | ||
| 177 | * PPS time synchronization requested */ | ||
| 178 | || ((time_status & (STA_PPSTIME|STA_PPSJITTER)) | ||
| 179 | == (STA_PPSTIME|STA_PPSJITTER)) | ||
| 180 | /* PPS wander exceeded or calibration error when | ||
| 181 | * PPS frequency synchronization requested | ||
| 182 | */ | ||
| 183 | || ((time_status & STA_PPSFREQ) | ||
| 184 | && (time_status & (STA_PPSWANDER|STA_PPSERROR))); | ||
| 185 | } | ||
| 186 | |||
| 187 | static inline void pps_fill_timex(struct timex *txc) | ||
| 188 | { | ||
| 189 | txc->ppsfreq = shift_right((pps_freq >> PPM_SCALE_INV_SHIFT) * | ||
| 190 | PPM_SCALE_INV, NTP_SCALE_SHIFT); | ||
| 191 | txc->jitter = pps_jitter; | ||
| 192 | if (!(time_status & STA_NANO)) | ||
| 193 | txc->jitter /= NSEC_PER_USEC; | ||
| 194 | txc->shift = pps_shift; | ||
| 195 | txc->stabil = pps_stabil; | ||
| 196 | txc->jitcnt = pps_jitcnt; | ||
| 197 | txc->calcnt = pps_calcnt; | ||
| 198 | txc->errcnt = pps_errcnt; | ||
| 199 | txc->stbcnt = pps_stbcnt; | ||
| 200 | } | ||
| 201 | |||
| 202 | #else /* !CONFIG_NTP_PPS */ | ||
| 203 | |||
| 204 | static inline s64 ntp_offset_chunk(s64 offset) | ||
| 205 | { | ||
| 206 | return shift_right(offset, SHIFT_PLL + time_constant); | ||
| 207 | } | ||
| 208 | |||
| 209 | static inline void pps_reset_freq_interval(void) {} | ||
| 210 | static inline void pps_clear(void) {} | ||
| 211 | static inline void pps_dec_valid(void) {} | ||
| 212 | static inline void pps_set_freq(s64 freq) {} | ||
| 213 | |||
| 214 | static inline int is_error_status(int status) | ||
| 215 | { | ||
| 216 | return status & (STA_UNSYNC|STA_CLOCKERR); | ||
| 217 | } | ||
| 218 | |||
| 219 | static inline void pps_fill_timex(struct timex *txc) | ||
| 220 | { | ||
| 221 | /* PPS is not implemented, so these are zero */ | ||
| 222 | txc->ppsfreq = 0; | ||
| 223 | txc->jitter = 0; | ||
| 224 | txc->shift = 0; | ||
| 225 | txc->stabil = 0; | ||
| 226 | txc->jitcnt = 0; | ||
| 227 | txc->calcnt = 0; | ||
| 228 | txc->errcnt = 0; | ||
| 229 | txc->stbcnt = 0; | ||
| 230 | } | ||
| 231 | |||
| 232 | #endif /* CONFIG_NTP_PPS */ | ||
| 233 | |||
| 77 | /* | 234 | /* |
| 78 | * NTP methods: | 235 | * NTP methods: |
| 79 | */ | 236 | */ |
| @@ -185,6 +342,9 @@ void ntp_clear(void) | |||
| 185 | 342 | ||
| 186 | tick_length = tick_length_base; | 343 | tick_length = tick_length_base; |
| 187 | time_offset = 0; | 344 | time_offset = 0; |
| 345 | |||
| 346 | /* Clear PPS state variables */ | ||
| 347 | pps_clear(); | ||
| 188 | } | 348 | } |
| 189 | 349 | ||
| 190 | /* | 350 | /* |
| @@ -250,16 +410,16 @@ void second_overflow(void) | |||
| 250 | time_status |= STA_UNSYNC; | 410 | time_status |= STA_UNSYNC; |
| 251 | } | 411 | } |
| 252 | 412 | ||
| 253 | /* | 413 | /* Compute the phase adjustment for the next second */ |
| 254 | * Compute the phase adjustment for the next second. The offset is | ||
| 255 | * reduced by a fixed factor times the time constant. | ||
| 256 | */ | ||
| 257 | tick_length = tick_length_base; | 414 | tick_length = tick_length_base; |
| 258 | 415 | ||
| 259 | delta = shift_right(time_offset, SHIFT_PLL + time_constant); | 416 | delta = ntp_offset_chunk(time_offset); |
| 260 | time_offset -= delta; | 417 | time_offset -= delta; |
| 261 | tick_length += delta; | 418 | tick_length += delta; |
| 262 | 419 | ||
| 420 | /* Check PPS signal */ | ||
| 421 | pps_dec_valid(); | ||
| 422 | |||
| 263 | if (!time_adjust) | 423 | if (!time_adjust) |
| 264 | return; | 424 | return; |
| 265 | 425 | ||
| @@ -369,6 +529,8 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts) | |||
| 369 | if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) { | 529 | if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) { |
| 370 | time_state = TIME_OK; | 530 | time_state = TIME_OK; |
| 371 | time_status = STA_UNSYNC; | 531 | time_status = STA_UNSYNC; |
| 532 | /* restart PPS frequency calibration */ | ||
| 533 | pps_reset_freq_interval(); | ||
| 372 | } | 534 | } |
| 373 | 535 | ||
| 374 | /* | 536 | /* |
| @@ -418,6 +580,8 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts | |||
| 418 | time_freq = txc->freq * PPM_SCALE; | 580 | time_freq = txc->freq * PPM_SCALE; |
| 419 | time_freq = min(time_freq, MAXFREQ_SCALED); | 581 | time_freq = min(time_freq, MAXFREQ_SCALED); |
| 420 | time_freq = max(time_freq, -MAXFREQ_SCALED); | 582 | time_freq = max(time_freq, -MAXFREQ_SCALED); |
| 583 | /* update pps_freq */ | ||
| 584 | pps_set_freq(time_freq); | ||
| 421 | } | 585 | } |
| 422 | 586 | ||
| 423 | if (txc->modes & ADJ_MAXERROR) | 587 | if (txc->modes & ADJ_MAXERROR) |
| @@ -508,7 +672,8 @@ int do_adjtimex(struct timex *txc) | |||
| 508 | } | 672 | } |
| 509 | 673 | ||
| 510 | result = time_state; /* mostly `TIME_OK' */ | 674 | result = time_state; /* mostly `TIME_OK' */ |
| 511 | if (time_status & (STA_UNSYNC|STA_CLOCKERR)) | 675 | /* check for errors */ |
| 676 | if (is_error_status(time_status)) | ||
| 512 | result = TIME_ERROR; | 677 | result = TIME_ERROR; |
| 513 | 678 | ||
| 514 | txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) * | 679 | txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) * |
| @@ -522,15 +687,8 @@ int do_adjtimex(struct timex *txc) | |||
| 522 | txc->tick = tick_usec; | 687 | txc->tick = tick_usec; |
| 523 | txc->tai = time_tai; | 688 | txc->tai = time_tai; |
| 524 | 689 | ||
| 525 | /* PPS is not implemented, so these are zero */ | 690 | /* fill PPS status fields */ |
| 526 | txc->ppsfreq = 0; | 691 | pps_fill_timex(txc); |
| 527 | txc->jitter = 0; | ||
| 528 | txc->shift = 0; | ||
| 529 | txc->stabil = 0; | ||
| 530 | txc->jitcnt = 0; | ||
| 531 | txc->calcnt = 0; | ||
| 532 | txc->errcnt = 0; | ||
| 533 | txc->stbcnt = 0; | ||
| 534 | 692 | ||
| 535 | write_sequnlock_irq(&xtime_lock); | 693 | write_sequnlock_irq(&xtime_lock); |
| 536 | 694 | ||
| @@ -544,6 +702,243 @@ int do_adjtimex(struct timex *txc) | |||
| 544 | return result; | 702 | return result; |
| 545 | } | 703 | } |
| 546 | 704 | ||
| 705 | #ifdef CONFIG_NTP_PPS | ||
| 706 | |||
| 707 | /* actually struct pps_normtime is good old struct timespec, but it is | ||
| 708 | * semantically different (and it is the reason why it was invented): | ||
| 709 | * pps_normtime.nsec has a range of ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] | ||
| 710 | * while timespec.tv_nsec has a range of [0, NSEC_PER_SEC) */ | ||
| 711 | struct pps_normtime { | ||
| 712 | __kernel_time_t sec; /* seconds */ | ||
| 713 | long nsec; /* nanoseconds */ | ||
| 714 | }; | ||
| 715 | |||
| 716 | /* normalize the timestamp so that nsec is in the | ||
| 717 | ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] interval */ | ||
| 718 | static inline struct pps_normtime pps_normalize_ts(struct timespec ts) | ||
| 719 | { | ||
| 720 | struct pps_normtime norm = { | ||
| 721 | .sec = ts.tv_sec, | ||
| 722 | .nsec = ts.tv_nsec | ||
| 723 | }; | ||
| 724 | |||
| 725 | if (norm.nsec > (NSEC_PER_SEC >> 1)) { | ||
| 726 | norm.nsec -= NSEC_PER_SEC; | ||
| 727 | norm.sec++; | ||
| 728 | } | ||
| 729 | |||
| 730 | return norm; | ||
| 731 | } | ||
| 732 | |||
| 733 | /* get current phase correction and jitter */ | ||
| 734 | static inline long pps_phase_filter_get(long *jitter) | ||
| 735 | { | ||
| 736 | *jitter = pps_tf[0] - pps_tf[1]; | ||
| 737 | if (*jitter < 0) | ||
| 738 | *jitter = -*jitter; | ||
| 739 | |||
| 740 | /* TODO: test various filters */ | ||
| 741 | return pps_tf[0]; | ||
| 742 | } | ||
| 743 | |||
| 744 | /* add the sample to the phase filter */ | ||
| 745 | static inline void pps_phase_filter_add(long err) | ||
| 746 | { | ||
| 747 | pps_tf[2] = pps_tf[1]; | ||
| 748 | pps_tf[1] = pps_tf[0]; | ||
| 749 | pps_tf[0] = err; | ||
| 750 | } | ||
| 751 | |||
| 752 | /* decrease frequency calibration interval length. | ||
| 753 | * It is halved after four consecutive unstable intervals. | ||
| 754 | */ | ||
| 755 | static inline void pps_dec_freq_interval(void) | ||
| 756 | { | ||
| 757 | if (--pps_intcnt <= -PPS_INTCOUNT) { | ||
| 758 | pps_intcnt = -PPS_INTCOUNT; | ||
| 759 | if (pps_shift > PPS_INTMIN) { | ||
| 760 | pps_shift--; | ||
| 761 | pps_intcnt = 0; | ||
| 762 | } | ||
| 763 | } | ||
| 764 | } | ||
| 765 | |||
| 766 | /* increase frequency calibration interval length. | ||
| 767 | * It is doubled after four consecutive stable intervals. | ||
| 768 | */ | ||
| 769 | static inline void pps_inc_freq_interval(void) | ||
| 770 | { | ||
| 771 | if (++pps_intcnt >= PPS_INTCOUNT) { | ||
| 772 | pps_intcnt = PPS_INTCOUNT; | ||
| 773 | if (pps_shift < PPS_INTMAX) { | ||
| 774 | pps_shift++; | ||
| 775 | pps_intcnt = 0; | ||
| 776 | } | ||
| 777 | } | ||
| 778 | } | ||
| 779 | |||
| 780 | /* update clock frequency based on MONOTONIC_RAW clock PPS signal | ||
| 781 | * timestamps | ||
| 782 | * | ||
| 783 | * At the end of the calibration interval the difference between the | ||
| 784 | * first and last MONOTONIC_RAW clock timestamps divided by the length | ||
| 785 | * of the interval becomes the frequency update. If the interval was | ||
| 786 | * too long, the data are discarded. | ||
| 787 | * Returns the difference between old and new frequency values. | ||
| 788 | */ | ||
| 789 | static long hardpps_update_freq(struct pps_normtime freq_norm) | ||
| 790 | { | ||
| 791 | long delta, delta_mod; | ||
| 792 | s64 ftemp; | ||
| 793 | |||
| 794 | /* check if the frequency interval was too long */ | ||
| 795 | if (freq_norm.sec > (2 << pps_shift)) { | ||
| 796 | time_status |= STA_PPSERROR; | ||
| 797 | pps_errcnt++; | ||
| 798 | pps_dec_freq_interval(); | ||
| 799 | pr_err("hardpps: PPSERROR: interval too long - %ld s\n", | ||
| 800 | freq_norm.sec); | ||
| 801 | return 0; | ||
| 802 | } | ||
| 803 | |||
| 804 | /* here the raw frequency offset and wander (stability) are | ||
| 805 | * calculated. If the wander is less than the wander threshold, | ||
| 806 | * the interval is increased; otherwise it is decreased. | ||
| 807 | */ | ||
| 808 | ftemp = div_s64(((s64)(-freq_norm.nsec)) << NTP_SCALE_SHIFT, | ||
| 809 | freq_norm.sec); | ||
| 810 | delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT); | ||
| 811 | pps_freq = ftemp; | ||
| 812 | if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) { | ||
| 813 | pr_warning("hardpps: PPSWANDER: change=%ld\n", delta); | ||
| 814 | time_status |= STA_PPSWANDER; | ||
| 815 | pps_stbcnt++; | ||
| 816 | pps_dec_freq_interval(); | ||
| 817 | } else { /* good sample */ | ||
| 818 | pps_inc_freq_interval(); | ||
| 819 | } | ||
| 820 | |||
| 821 | /* the stability metric is calculated as the average of recent | ||
| 822 | * frequency changes, but is used only for performance | ||
| 823 | * monitoring | ||
| 824 | */ | ||
| 825 | delta_mod = delta; | ||
| 826 | if (delta_mod < 0) | ||
| 827 | delta_mod = -delta_mod; | ||
| 828 | pps_stabil += (div_s64(((s64)delta_mod) << | ||
| 829 | (NTP_SCALE_SHIFT - SHIFT_USEC), | ||
| 830 | NSEC_PER_USEC) - pps_stabil) >> PPS_INTMIN; | ||
| 831 | |||
| 832 | /* if enabled, the system clock frequency is updated */ | ||
| 833 | if ((time_status & STA_PPSFREQ) != 0 && | ||
| 834 | (time_status & STA_FREQHOLD) == 0) { | ||
| 835 | time_freq = pps_freq; | ||
| 836 | ntp_update_frequency(); | ||
| 837 | } | ||
| 838 | |||
| 839 | return delta; | ||
| 840 | } | ||
| 841 | |||
| 842 | /* correct REALTIME clock phase error against PPS signal */ | ||
| 843 | static void hardpps_update_phase(long error) | ||
| 844 | { | ||
| 845 | long correction = -error; | ||
| 846 | long jitter; | ||
| 847 | |||
| 848 | /* add the sample to the median filter */ | ||
| 849 | pps_phase_filter_add(correction); | ||
| 850 | correction = pps_phase_filter_get(&jitter); | ||
| 851 | |||
| 852 | /* Nominal jitter is due to PPS signal noise. If it exceeds the | ||
| 853 | * threshold, the sample is discarded; otherwise, if so enabled, | ||
| 854 | * the time offset is updated. | ||
| 855 | */ | ||
| 856 | if (jitter > (pps_jitter << PPS_POPCORN)) { | ||
| 857 | pr_warning("hardpps: PPSJITTER: jitter=%ld, limit=%ld\n", | ||
| 858 | jitter, (pps_jitter << PPS_POPCORN)); | ||
| 859 | time_status |= STA_PPSJITTER; | ||
| 860 | pps_jitcnt++; | ||
| 861 | } else if (time_status & STA_PPSTIME) { | ||
| 862 | /* correct the time using the phase offset */ | ||
| 863 | time_offset = div_s64(((s64)correction) << NTP_SCALE_SHIFT, | ||
| 864 | NTP_INTERVAL_FREQ); | ||
| 865 | /* cancel running adjtime() */ | ||
| 866 | time_adjust = 0; | ||
| 867 | } | ||
| 868 | /* update jitter */ | ||
| 869 | pps_jitter += (jitter - pps_jitter) >> PPS_INTMIN; | ||
| 870 | } | ||
| 871 | |||
| 872 | /* | ||
| 873 | * hardpps() - discipline CPU clock oscillator to external PPS signal | ||
| 874 | * | ||
| 875 | * This routine is called at each PPS signal arrival in order to | ||
| 876 | * discipline the CPU clock oscillator to the PPS signal. It takes two | ||
| 877 | * parameters: REALTIME and MONOTONIC_RAW clock timestamps. The former | ||
| 878 | * is used to correct clock phase error and the latter is used to | ||
| 879 | * correct the frequency. | ||
| 880 | * | ||
| 881 | * This code is based on David Mills's reference nanokernel | ||
| 882 | * implementation. It was mostly rewritten but keeps the same idea. | ||
| 883 | */ | ||
| 884 | void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) | ||
| 885 | { | ||
| 886 | struct pps_normtime pts_norm, freq_norm; | ||
| 887 | unsigned long flags; | ||
| 888 | |||
| 889 | pts_norm = pps_normalize_ts(*phase_ts); | ||
| 890 | |||
| 891 | write_seqlock_irqsave(&xtime_lock, flags); | ||
| 892 | |||
| 893 | /* clear the error bits, they will be set again if needed */ | ||
| 894 | time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); | ||
| 895 | |||
| 896 | /* indicate signal presence */ | ||
| 897 | time_status |= STA_PPSSIGNAL; | ||
| 898 | pps_valid = PPS_VALID; | ||
| 899 | |||
| 900 | /* when called for the first time, | ||
| 901 | * just start the frequency interval */ | ||
| 902 | if (unlikely(pps_fbase.tv_sec == 0)) { | ||
| 903 | pps_fbase = *raw_ts; | ||
| 904 | write_sequnlock_irqrestore(&xtime_lock, flags); | ||
| 905 | return; | ||
| 906 | } | ||
| 907 | |||
| 908 | /* ok, now we have a base for frequency calculation */ | ||
| 909 | freq_norm = pps_normalize_ts(timespec_sub(*raw_ts, pps_fbase)); | ||
| 910 | |||
| 911 | /* check that the signal is in the range | ||
| 912 | * [1s - MAXFREQ us, 1s + MAXFREQ us], otherwise reject it */ | ||
| 913 | if ((freq_norm.sec == 0) || | ||
| 914 | (freq_norm.nsec > MAXFREQ * freq_norm.sec) || | ||
| 915 | (freq_norm.nsec < -MAXFREQ * freq_norm.sec)) { | ||
| 916 | time_status |= STA_PPSJITTER; | ||
| 917 | /* restart the frequency calibration interval */ | ||
| 918 | pps_fbase = *raw_ts; | ||
| 919 | write_sequnlock_irqrestore(&xtime_lock, flags); | ||
| 920 | pr_err("hardpps: PPSJITTER: bad pulse\n"); | ||
| 921 | return; | ||
| 922 | } | ||
| 923 | |||
| 924 | /* signal is ok */ | ||
| 925 | |||
| 926 | /* check if the current frequency interval is finished */ | ||
| 927 | if (freq_norm.sec >= (1 << pps_shift)) { | ||
| 928 | pps_calcnt++; | ||
| 929 | /* restart the frequency calibration interval */ | ||
| 930 | pps_fbase = *raw_ts; | ||
| 931 | hardpps_update_freq(freq_norm); | ||
| 932 | } | ||
| 933 | |||
| 934 | hardpps_update_phase(pts_norm.nsec); | ||
| 935 | |||
| 936 | write_sequnlock_irqrestore(&xtime_lock, flags); | ||
| 937 | } | ||
| 938 | EXPORT_SYMBOL(hardpps); | ||
| 939 | |||
| 940 | #endif /* CONFIG_NTP_PPS */ | ||
| 941 | |||
| 547 | static int __init ntp_tick_adj_setup(char *str) | 942 | static int __init ntp_tick_adj_setup(char *str) |
| 548 | { | 943 | { |
| 549 | ntp_tick_adj = simple_strtol(str, NULL, 0); | 944 | ntp_tick_adj = simple_strtol(str, NULL, 0); |
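The kernel/time/ntp.c changes above implement the PPS discipline math inline in hardpps(). As a reading aid, the following is a minimal userspace sketch of the two central steps: normalizing a timespec delta into the (-NSEC_PER_SEC/2, NSEC_PER_SEC/2] range and converting the residual nanoseconds of a calibration interval into a scaled frequency offset. It is an illustration only, not kernel code; the NTP_SCALE_SHIFT value and the sample interval are assumptions chosen for the example.

/* Userspace sketch of the PPS frequency-interval math used by hardpps().
 * Not kernel code: NTP_SCALE_SHIFT and the sample delta below are assumed
 * purely for illustration.
 */
#include <stdio.h>
#include <stdint.h>
#include <time.h>

#define NSEC_PER_SEC    1000000000L
#define NTP_SCALE_SHIFT 32              /* fixed-point shift for scaled ns/s */

struct pps_normtime {
	time_t sec;                     /* seconds */
	long   nsec;                    /* ns in (-NSEC_PER_SEC/2, NSEC_PER_SEC/2] */
};

/* Fold tv_nsec from [0, NSEC_PER_SEC) into the signed half-open range. */
static struct pps_normtime pps_normalize_ts(struct timespec ts)
{
	struct pps_normtime norm = { .sec = ts.tv_sec, .nsec = ts.tv_nsec };

	if (norm.nsec > (NSEC_PER_SEC >> 1)) {
		norm.nsec -= NSEC_PER_SEC;
		norm.sec++;
	}
	return norm;
}

int main(void)
{
	/* Pretend a 4 s calibration interval came back 2000 ns short. */
	struct timespec delta = { .tv_sec = 3, .tv_nsec = NSEC_PER_SEC - 2000 };
	struct pps_normtime freq_norm = pps_normalize_ts(delta);

	/* Same expression as hardpps_update_freq(): residual ns over the
	 * interval length, kept in <<NTP_SCALE_SHIFT fixed point. */
	int64_t ftemp = ((int64_t)(-freq_norm.nsec) << NTP_SCALE_SHIFT)
			/ freq_norm.sec;

	printf("normalized: %lld s %+ld ns\n",
	       (long long)freq_norm.sec, freq_norm.nsec);
	printf("frequency offset: %+.3f ppm\n",
	       (double)ftemp / (1LL << NTP_SCALE_SHIFT) / NSEC_PER_SEC * 1e6);
	return 0;
}

Built with any C compiler, this prints a +0.500 ppm correction for an interval that ran 2000 ns short over 4 s, which is the quantity hardpps_update_freq() folds into pps_freq.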
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index b6b898d2eee..051bc80a0c4 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c | |||
| @@ -49,7 +49,7 @@ struct tick_device *tick_get_device(int cpu) | |||
| 49 | */ | 49 | */ |
| 50 | int tick_is_oneshot_available(void) | 50 | int tick_is_oneshot_available(void) |
| 51 | { | 51 | { |
| 52 | struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; | 52 | struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); |
| 53 | 53 | ||
| 54 | return dev && (dev->features & CLOCK_EVT_FEAT_ONESHOT); | 54 | return dev && (dev->features & CLOCK_EVT_FEAT_ONESHOT); |
| 55 | } | 55 | } |
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index aada0e52680..5cbc101f908 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c | |||
| @@ -95,7 +95,7 @@ int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires, | |||
| 95 | */ | 95 | */ |
| 96 | int tick_program_event(ktime_t expires, int force) | 96 | int tick_program_event(ktime_t expires, int force) |
| 97 | { | 97 | { |
| 98 | struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; | 98 | struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); |
| 99 | 99 | ||
| 100 | return tick_dev_program_event(dev, expires, force); | 100 | return tick_dev_program_event(dev, expires, force); |
| 101 | } | 101 | } |
| @@ -167,7 +167,7 @@ int tick_oneshot_mode_active(void) | |||
| 167 | int ret; | 167 | int ret; |
| 168 | 168 | ||
| 169 | local_irq_save(flags); | 169 | local_irq_save(flags); |
| 170 | ret = __get_cpu_var(tick_cpu_device).mode == TICKDEV_MODE_ONESHOT; | 170 | ret = __this_cpu_read(tick_cpu_device.mode) == TICKDEV_MODE_ONESHOT; |
| 171 | local_irq_restore(flags); | 171 | local_irq_restore(flags); |
| 172 | 172 | ||
| 173 | return ret; | 173 | return ret; |
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 3e216e01bbd..c55ea243347 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
| @@ -642,8 +642,7 @@ static void tick_nohz_switch_to_nohz(void) | |||
| 642 | } | 642 | } |
| 643 | local_irq_enable(); | 643 | local_irq_enable(); |
| 644 | 644 | ||
| 645 | printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", | 645 | printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", smp_processor_id()); |
| 646 | smp_processor_id()); | ||
| 647 | } | 646 | } |
| 648 | 647 | ||
| 649 | /* | 648 | /* |
| @@ -795,8 +794,10 @@ void tick_setup_sched_timer(void) | |||
| 795 | } | 794 | } |
| 796 | 795 | ||
| 797 | #ifdef CONFIG_NO_HZ | 796 | #ifdef CONFIG_NO_HZ |
| 798 | if (tick_nohz_enabled) | 797 | if (tick_nohz_enabled) { |
| 799 | ts->nohz_mode = NOHZ_MODE_HIGHRES; | 798 | ts->nohz_mode = NOHZ_MODE_HIGHRES; |
| 799 | printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", smp_processor_id()); | ||
| 800 | } | ||
| 800 | #endif | 801 | #endif |
| 801 | } | 802 | } |
| 802 | #endif /* HIGH_RES_TIMERS */ | 803 | #endif /* HIGH_RES_TIMERS */ |
diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c index ac38fbb176c..a9ae369925c 100644 --- a/kernel/time/timecompare.c +++ b/kernel/time/timecompare.c | |||
| @@ -21,6 +21,7 @@ | |||
| 21 | #include <linux/module.h> | 21 | #include <linux/module.h> |
| 22 | #include <linux/slab.h> | 22 | #include <linux/slab.h> |
| 23 | #include <linux/math64.h> | 23 | #include <linux/math64.h> |
| 24 | #include <linux/kernel.h> | ||
| 24 | 25 | ||
| 25 | /* | 26 | /* |
| 26 | * fixed point arithmetic scale factor for skew | 27 | * fixed point arithmetic scale factor for skew |
| @@ -57,11 +58,11 @@ int timecompare_offset(struct timecompare *sync, | |||
| 57 | int index; | 58 | int index; |
| 58 | int num_samples = sync->num_samples; | 59 | int num_samples = sync->num_samples; |
| 59 | 60 | ||
| 60 | if (num_samples > sizeof(buffer)/sizeof(buffer[0])) { | 61 | if (num_samples > ARRAY_SIZE(buffer)) { |
| 61 | samples = kmalloc(sizeof(*samples) * num_samples, GFP_ATOMIC); | 62 | samples = kmalloc(sizeof(*samples) * num_samples, GFP_ATOMIC); |
| 62 | if (!samples) { | 63 | if (!samples) { |
| 63 | samples = buffer; | 64 | samples = buffer; |
| 64 | num_samples = sizeof(buffer)/sizeof(buffer[0]); | 65 | num_samples = ARRAY_SIZE(buffer); |
| 65 | } | 66 | } |
| 66 | } else { | 67 | } else { |
| 67 | samples = buffer; | 68 | samples = buffer; |
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 49010d822f7..d27c7562902 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
| @@ -32,6 +32,8 @@ struct timekeeper { | |||
| 32 | cycle_t cycle_interval; | 32 | cycle_t cycle_interval; |
| 33 | /* Number of clock shifted nano seconds in one NTP interval. */ | 33 | /* Number of clock shifted nano seconds in one NTP interval. */ |
| 34 | u64 xtime_interval; | 34 | u64 xtime_interval; |
| 35 | /* shifted nano seconds left over when rounding cycle_interval */ | ||
| 36 | s64 xtime_remainder; | ||
| 35 | /* Raw nano seconds accumulated per NTP interval. */ | 37 | /* Raw nano seconds accumulated per NTP interval. */ |
| 36 | u32 raw_interval; | 38 | u32 raw_interval; |
| 37 | 39 | ||
| @@ -47,7 +49,7 @@ struct timekeeper { | |||
| 47 | u32 mult; | 49 | u32 mult; |
| 48 | }; | 50 | }; |
| 49 | 51 | ||
| 50 | struct timekeeper timekeeper; | 52 | static struct timekeeper timekeeper; |
| 51 | 53 | ||
| 52 | /** | 54 | /** |
| 53 | * timekeeper_setup_internals - Set up internals to use clocksource clock. | 55 | * timekeeper_setup_internals - Set up internals to use clocksource clock. |
| @@ -62,7 +64,7 @@ struct timekeeper timekeeper; | |||
| 62 | static void timekeeper_setup_internals(struct clocksource *clock) | 64 | static void timekeeper_setup_internals(struct clocksource *clock) |
| 63 | { | 65 | { |
| 64 | cycle_t interval; | 66 | cycle_t interval; |
| 65 | u64 tmp; | 67 | u64 tmp, ntpinterval; |
| 66 | 68 | ||
| 67 | timekeeper.clock = clock; | 69 | timekeeper.clock = clock; |
| 68 | clock->cycle_last = clock->read(clock); | 70 | clock->cycle_last = clock->read(clock); |
| @@ -70,6 +72,7 @@ static void timekeeper_setup_internals(struct clocksource *clock) | |||
| 70 | /* Do the ns -> cycle conversion first, using original mult */ | 72 | /* Do the ns -> cycle conversion first, using original mult */ |
| 71 | tmp = NTP_INTERVAL_LENGTH; | 73 | tmp = NTP_INTERVAL_LENGTH; |
| 72 | tmp <<= clock->shift; | 74 | tmp <<= clock->shift; |
| 75 | ntpinterval = tmp; | ||
| 73 | tmp += clock->mult/2; | 76 | tmp += clock->mult/2; |
| 74 | do_div(tmp, clock->mult); | 77 | do_div(tmp, clock->mult); |
| 75 | if (tmp == 0) | 78 | if (tmp == 0) |
| @@ -80,6 +83,7 @@ static void timekeeper_setup_internals(struct clocksource *clock) | |||
| 80 | 83 | ||
| 81 | /* Go back from cycles -> shifted ns */ | 84 | /* Go back from cycles -> shifted ns */ |
| 82 | timekeeper.xtime_interval = (u64) interval * clock->mult; | 85 | timekeeper.xtime_interval = (u64) interval * clock->mult; |
| 86 | timekeeper.xtime_remainder = ntpinterval - timekeeper.xtime_interval; | ||
| 83 | timekeeper.raw_interval = | 87 | timekeeper.raw_interval = |
| 84 | ((u64) interval * clock->mult) >> clock->shift; | 88 | ((u64) interval * clock->mult) >> clock->shift; |
| 85 | 89 | ||
| @@ -160,7 +164,7 @@ static struct timespec total_sleep_time; | |||
| 160 | /* | 164 | /* |
| 161 | * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. | 165 | * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. |
| 162 | */ | 166 | */ |
| 163 | struct timespec raw_time; | 167 | static struct timespec raw_time; |
| 164 | 168 | ||
| 165 | /* flag for if timekeeping is suspended */ | 169 | /* flag for if timekeeping is suspended */ |
| 166 | int __read_mostly timekeeping_suspended; | 170 | int __read_mostly timekeeping_suspended; |
| @@ -284,6 +288,49 @@ void ktime_get_ts(struct timespec *ts) | |||
| 284 | } | 288 | } |
| 285 | EXPORT_SYMBOL_GPL(ktime_get_ts); | 289 | EXPORT_SYMBOL_GPL(ktime_get_ts); |
| 286 | 290 | ||
| 291 | #ifdef CONFIG_NTP_PPS | ||
| 292 | |||
| 293 | /** | ||
| 294 | * getnstime_raw_and_real - get day and raw monotonic time in timespec format | ||
| 295 | * @ts_raw: pointer to the timespec to be set to raw monotonic time | ||
| 296 | * @ts_real: pointer to the timespec to be set to the time of day | ||
| 297 | * | ||
| 298 | * This function reads both the time of day and raw monotonic time at the | ||
| 299 | * same time atomically and stores the resulting timestamps in timespec | ||
| 300 | * format. | ||
| 301 | */ | ||
| 302 | void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) | ||
| 303 | { | ||
| 304 | unsigned long seq; | ||
| 305 | s64 nsecs_raw, nsecs_real; | ||
| 306 | |||
| 307 | WARN_ON_ONCE(timekeeping_suspended); | ||
| 308 | |||
| 309 | do { | ||
| 310 | u32 arch_offset; | ||
| 311 | |||
| 312 | seq = read_seqbegin(&xtime_lock); | ||
| 313 | |||
| 314 | *ts_raw = raw_time; | ||
| 315 | *ts_real = xtime; | ||
| 316 | |||
| 317 | nsecs_raw = timekeeping_get_ns_raw(); | ||
| 318 | nsecs_real = timekeeping_get_ns(); | ||
| 319 | |||
| 320 | /* If arch requires, add in gettimeoffset() */ | ||
| 321 | arch_offset = arch_gettimeoffset(); | ||
| 322 | nsecs_raw += arch_offset; | ||
| 323 | nsecs_real += arch_offset; | ||
| 324 | |||
| 325 | } while (read_seqretry(&xtime_lock, seq)); | ||
| 326 | |||
| 327 | timespec_add_ns(ts_raw, nsecs_raw); | ||
| 328 | timespec_add_ns(ts_real, nsecs_real); | ||
| 329 | } | ||
| 330 | EXPORT_SYMBOL(getnstime_raw_and_real); | ||
| 331 | |||
| 332 | #endif /* CONFIG_NTP_PPS */ | ||
| 333 | |||
| 287 | /** | 334 | /** |
| 288 | * do_gettimeofday - Returns the time of day in a timeval | 335 | * do_gettimeofday - Returns the time of day in a timeval |
| 289 | * @tv: pointer to the timeval to be set | 336 | * @tv: pointer to the timeval to be set |
| @@ -719,7 +766,8 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift) | |||
| 719 | 766 | ||
| 720 | /* Accumulate error between NTP and clock interval */ | 767 | /* Accumulate error between NTP and clock interval */ |
| 721 | timekeeper.ntp_error += tick_length << shift; | 768 | timekeeper.ntp_error += tick_length << shift; |
| 722 | timekeeper.ntp_error -= timekeeper.xtime_interval << | 769 | timekeeper.ntp_error -= |
| 770 | (timekeeper.xtime_interval + timekeeper.xtime_remainder) << | ||
| 723 | (timekeeper.ntp_error_shift + shift); | 771 | (timekeeper.ntp_error_shift + shift); |
| 724 | 772 | ||
| 725 | return offset; | 773 | return offset; |
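The new xtime_remainder field above records the shifted-nanosecond truncation error introduced when timekeeper_setup_internals() rounds the NTP interval to a whole number of clocksource cycles; folding it into ntp_error in logarithmic_accumulation() keeps that rounding from accumulating as clock drift. The sketch below reproduces the arithmetic in plain C; the clocksource mult/shift values and the 10 ms NTP interval are assumptions for illustration, and the kernel's tmp == 0 guard is omitted.

/* Userspace sketch of the remainder math in timekeeper_setup_internals().
 * The clocksource parameters below (roughly a 2.5 GHz counter) and the
 * 10 ms NTP interval are invented for illustration only.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const uint64_t NTP_INTERVAL_LENGTH = 10000000;  /* assumed ns per NTP tick */
	const uint32_t mult  = 1677722;                 /* assumed clocksource mult */
	const uint32_t shift = 22;                      /* assumed clocksource shift */

	/* ns -> cycles, rounded to the nearest whole cycle */
	uint64_t tmp = NTP_INTERVAL_LENGTH << shift;
	uint64_t ntpinterval = tmp;                     /* NTP interval in shifted ns */
	tmp += mult / 2;
	uint64_t interval = tmp / mult;                 /* cycles per NTP interval */

	/* cycles -> shifted ns, plus the part lost to rounding */
	uint64_t xtime_interval  = interval * mult;
	int64_t  xtime_remainder = (int64_t)(ntpinterval - xtime_interval);

	printf("interval        = %llu cycles\n", (unsigned long long)interval);
	printf("xtime_interval  = %llu shifted ns\n", (unsigned long long)xtime_interval);
	printf("xtime_remainder = %lld shifted ns\n", (long long)xtime_remainder);
	return 0;
}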
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index ab8f5e33fa9..3258455549f 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c | |||
| @@ -41,7 +41,7 @@ static void print_name_offset(struct seq_file *m, void *sym) | |||
| 41 | char symname[KSYM_NAME_LEN]; | 41 | char symname[KSYM_NAME_LEN]; |
| 42 | 42 | ||
| 43 | if (lookup_symbol_name((unsigned long)sym, symname) < 0) | 43 | if (lookup_symbol_name((unsigned long)sym, symname) < 0) |
| 44 | SEQ_printf(m, "<%p>", sym); | 44 | SEQ_printf(m, "<%pK>", sym); |
| 45 | else | 45 | else |
| 46 | SEQ_printf(m, "%s", symname); | 46 | SEQ_printf(m, "%s", symname); |
| 47 | } | 47 | } |
| @@ -79,26 +79,26 @@ print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base, | |||
| 79 | { | 79 | { |
| 80 | struct hrtimer *timer, tmp; | 80 | struct hrtimer *timer, tmp; |
| 81 | unsigned long next = 0, i; | 81 | unsigned long next = 0, i; |
| 82 | struct rb_node *curr; | 82 | struct timerqueue_node *curr; |
| 83 | unsigned long flags; | 83 | unsigned long flags; |
| 84 | 84 | ||
| 85 | next_one: | 85 | next_one: |
| 86 | i = 0; | 86 | i = 0; |
| 87 | raw_spin_lock_irqsave(&base->cpu_base->lock, flags); | 87 | raw_spin_lock_irqsave(&base->cpu_base->lock, flags); |
| 88 | 88 | ||
| 89 | curr = base->first; | 89 | curr = timerqueue_getnext(&base->active); |
| 90 | /* | 90 | /* |
| 91 | * Crude but we have to do this O(N*N) thing, because | 91 | * Crude but we have to do this O(N*N) thing, because |
| 92 | * we have to unlock the base when printing: | 92 | * we have to unlock the base when printing: |
| 93 | */ | 93 | */ |
| 94 | while (curr && i < next) { | 94 | while (curr && i < next) { |
| 95 | curr = rb_next(curr); | 95 | curr = timerqueue_iterate_next(curr); |
| 96 | i++; | 96 | i++; |
| 97 | } | 97 | } |
| 98 | 98 | ||
| 99 | if (curr) { | 99 | if (curr) { |
| 100 | 100 | ||
| 101 | timer = rb_entry(curr, struct hrtimer, node); | 101 | timer = container_of(curr, struct hrtimer, node); |
| 102 | tmp = *timer; | 102 | tmp = *timer; |
| 103 | raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags); | 103 | raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags); |
| 104 | 104 | ||
| @@ -112,7 +112,7 @@ next_one: | |||
| 112 | static void | 112 | static void |
| 113 | print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) | 113 | print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) |
| 114 | { | 114 | { |
| 115 | SEQ_printf(m, " .base: %p\n", base); | 115 | SEQ_printf(m, " .base: %pK\n", base); |
| 116 | SEQ_printf(m, " .index: %d\n", | 116 | SEQ_printf(m, " .index: %d\n", |
| 117 | base->index); | 117 | base->index); |
| 118 | SEQ_printf(m, " .resolution: %Lu nsecs\n", | 118 | SEQ_printf(m, " .resolution: %Lu nsecs\n", |
diff --git a/kernel/timer.c b/kernel/timer.c index 68a9ae7679b..d6459923d24 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
| @@ -88,18 +88,6 @@ struct tvec_base boot_tvec_bases; | |||
| 88 | EXPORT_SYMBOL(boot_tvec_bases); | 88 | EXPORT_SYMBOL(boot_tvec_bases); |
| 89 | static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; | 89 | static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; |
| 90 | 90 | ||
| 91 | /* | ||
| 92 | * Note that all tvec_bases are 2 byte aligned and lower bit of | ||
| 93 | * base in timer_list is guaranteed to be zero. Use the LSB to | ||
| 94 | * indicate whether the timer is deferrable. | ||
| 95 | * | ||
| 96 | * A deferrable timer will work normally when the system is busy, but | ||
| 97 | * will not cause a CPU to come out of idle just to service it; instead, | ||
| 98 | * the timer will be serviced when the CPU eventually wakes up with a | ||
| 99 | * subsequent non-deferrable timer. | ||
| 100 | */ | ||
| 101 | #define TBASE_DEFERRABLE_FLAG (0x1) | ||
| 102 | |||
| 103 | /* Functions below help us manage 'deferrable' flag */ | 91 | /* Functions below help us manage 'deferrable' flag */ |
| 104 | static inline unsigned int tbase_get_deferrable(struct tvec_base *base) | 92 | static inline unsigned int tbase_get_deferrable(struct tvec_base *base) |
| 105 | { | 93 | { |
| @@ -113,8 +101,7 @@ static inline struct tvec_base *tbase_get_base(struct tvec_base *base) | |||
| 113 | 101 | ||
| 114 | static inline void timer_set_deferrable(struct timer_list *timer) | 102 | static inline void timer_set_deferrable(struct timer_list *timer) |
| 115 | { | 103 | { |
| 116 | timer->base = ((struct tvec_base *)((unsigned long)(timer->base) | | 104 | timer->base = TBASE_MAKE_DEFERRED(timer->base); |
| 117 | TBASE_DEFERRABLE_FLAG)); | ||
| 118 | } | 105 | } |
| 119 | 106 | ||
| 120 | static inline void | 107 | static inline void |
| @@ -343,15 +330,6 @@ void set_timer_slack(struct timer_list *timer, int slack_hz) | |||
| 343 | } | 330 | } |
| 344 | EXPORT_SYMBOL_GPL(set_timer_slack); | 331 | EXPORT_SYMBOL_GPL(set_timer_slack); |
| 345 | 332 | ||
| 346 | |||
| 347 | static inline void set_running_timer(struct tvec_base *base, | ||
| 348 | struct timer_list *timer) | ||
| 349 | { | ||
| 350 | #ifdef CONFIG_SMP | ||
| 351 | base->running_timer = timer; | ||
| 352 | #endif | ||
| 353 | } | ||
| 354 | |||
| 355 | static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) | 333 | static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) |
| 356 | { | 334 | { |
| 357 | unsigned long expires = timer->expires; | 335 | unsigned long expires = timer->expires; |
| @@ -936,15 +914,12 @@ int del_timer(struct timer_list *timer) | |||
| 936 | } | 914 | } |
| 937 | EXPORT_SYMBOL(del_timer); | 915 | EXPORT_SYMBOL(del_timer); |
| 938 | 916 | ||
| 939 | #ifdef CONFIG_SMP | ||
| 940 | /** | 917 | /** |
| 941 | * try_to_del_timer_sync - Try to deactivate a timer | 918 | * try_to_del_timer_sync - Try to deactivate a timer |
| 942 | * @timer: timer to deactivate | 919 | * @timer: timer to deactivate |
| 943 | * | 920 | * |
| 944 | * This function tries to deactivate a timer. Upon successful (ret >= 0) | 921 | * This function tries to deactivate a timer. Upon successful (ret >= 0) |
| 945 | * exit the timer is not queued and the handler is not running on any CPU. | 922 | * exit the timer is not queued and the handler is not running on any CPU. |
| 946 | * | ||
| 947 | * It must not be called from interrupt contexts. | ||
| 948 | */ | 923 | */ |
| 949 | int try_to_del_timer_sync(struct timer_list *timer) | 924 | int try_to_del_timer_sync(struct timer_list *timer) |
| 950 | { | 925 | { |
| @@ -973,6 +948,7 @@ out: | |||
| 973 | } | 948 | } |
| 974 | EXPORT_SYMBOL(try_to_del_timer_sync); | 949 | EXPORT_SYMBOL(try_to_del_timer_sync); |
| 975 | 950 | ||
| 951 | #ifdef CONFIG_SMP | ||
| 976 | /** | 952 | /** |
| 977 | * del_timer_sync - deactivate a timer and wait for the handler to finish. | 953 | * del_timer_sync - deactivate a timer and wait for the handler to finish. |
| 978 | * @timer: the timer to be deactivated | 954 | * @timer: the timer to be deactivated |
| @@ -1000,7 +976,11 @@ int del_timer_sync(struct timer_list *timer) | |||
| 1000 | lock_map_release(&timer->lockdep_map); | 976 | lock_map_release(&timer->lockdep_map); |
| 1001 | local_irq_restore(flags); | 977 | local_irq_restore(flags); |
| 1002 | #endif | 978 | #endif |
| 1003 | 979 | /* | |
| 980 | * don't use it in hardirq context, because it | ||
| 981 | * could lead to deadlock. | ||
| 982 | */ | ||
| 983 | WARN_ON(in_irq()); | ||
| 1004 | for (;;) { | 984 | for (;;) { |
| 1005 | int ret = try_to_del_timer_sync(timer); | 985 | int ret = try_to_del_timer_sync(timer); |
| 1006 | if (ret >= 0) | 986 | if (ret >= 0) |
| @@ -1111,7 +1091,7 @@ static inline void __run_timers(struct tvec_base *base) | |||
| 1111 | 1091 | ||
| 1112 | timer_stats_account_timer(timer); | 1092 | timer_stats_account_timer(timer); |
| 1113 | 1093 | ||
| 1114 | set_running_timer(base, timer); | 1094 | base->running_timer = timer; |
| 1115 | detach_timer(timer, 1); | 1095 | detach_timer(timer, 1); |
| 1116 | 1096 | ||
| 1117 | spin_unlock_irq(&base->lock); | 1097 | spin_unlock_irq(&base->lock); |
| @@ -1119,7 +1099,7 @@ static inline void __run_timers(struct tvec_base *base) | |||
| 1119 | spin_lock_irq(&base->lock); | 1099 | spin_lock_irq(&base->lock); |
| 1120 | } | 1100 | } |
| 1121 | } | 1101 | } |
| 1122 | set_running_timer(base, NULL); | 1102 | base->running_timer = NULL; |
| 1123 | spin_unlock_irq(&base->lock); | 1103 | spin_unlock_irq(&base->lock); |
| 1124 | } | 1104 | } |
| 1125 | 1105 | ||
| @@ -1249,9 +1229,15 @@ static unsigned long cmp_next_hrtimer_event(unsigned long now, | |||
| 1249 | */ | 1229 | */ |
| 1250 | unsigned long get_next_timer_interrupt(unsigned long now) | 1230 | unsigned long get_next_timer_interrupt(unsigned long now) |
| 1251 | { | 1231 | { |
| 1252 | struct tvec_base *base = __get_cpu_var(tvec_bases); | 1232 | struct tvec_base *base = __this_cpu_read(tvec_bases); |
| 1253 | unsigned long expires; | 1233 | unsigned long expires; |
| 1254 | 1234 | ||
| 1235 | /* | ||
| 1236 | * Pretend that there is no timer pending if the cpu is offline. | ||
| 1237 | * Possible pending timers will be migrated later to an active cpu. | ||
| 1238 | */ | ||
| 1239 | if (cpu_is_offline(smp_processor_id())) | ||
| 1240 | return now + NEXT_TIMER_MAX_DELTA; | ||
| 1255 | spin_lock(&base->lock); | 1241 | spin_lock(&base->lock); |
| 1256 | if (time_before_eq(base->next_timer, base->timer_jiffies)) | 1242 | if (time_before_eq(base->next_timer, base->timer_jiffies)) |
| 1257 | base->next_timer = __next_timer_interrupt(base); | 1243 | base->next_timer = __next_timer_interrupt(base); |
| @@ -1292,7 +1278,7 @@ void update_process_times(int user_tick) | |||
| 1292 | */ | 1278 | */ |
| 1293 | static void run_timer_softirq(struct softirq_action *h) | 1279 | static void run_timer_softirq(struct softirq_action *h) |
| 1294 | { | 1280 | { |
| 1295 | struct tvec_base *base = __get_cpu_var(tvec_bases); | 1281 | struct tvec_base *base = __this_cpu_read(tvec_bases); |
| 1296 | 1282 | ||
| 1297 | hrtimer_run_pending(); | 1283 | hrtimer_run_pending(); |
| 1298 | 1284 | ||
| @@ -1319,7 +1305,7 @@ void do_timer(unsigned long ticks) | |||
| 1319 | { | 1305 | { |
| 1320 | jiffies_64 += ticks; | 1306 | jiffies_64 += ticks; |
| 1321 | update_wall_time(); | 1307 | update_wall_time(); |
| 1322 | calc_global_load(); | 1308 | calc_global_load(ticks); |
| 1323 | } | 1309 | } |
| 1324 | 1310 | ||
| 1325 | #ifdef __ARCH_WANT_SYS_ALARM | 1311 | #ifdef __ARCH_WANT_SYS_ALARM |
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index e04b8bcdef8..14674dce77a 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig | |||
| @@ -69,6 +69,21 @@ config EVENT_TRACING | |||
| 69 | select CONTEXT_SWITCH_TRACER | 69 | select CONTEXT_SWITCH_TRACER |
| 70 | bool | 70 | bool |
| 71 | 71 | ||
| 72 | config EVENT_POWER_TRACING_DEPRECATED | ||
| 73 | depends on EVENT_TRACING | ||
| 74 | bool "Deprecated power event trace API, to be removed" | ||
| 75 | default y | ||
| 76 | help | ||
| 77 | Provides old power event types: | ||
| 78 | C-state/idle accounting events: | ||
| 79 | power:power_start | ||
| 80 | power:power_end | ||
| 81 | and old cpufreq accounting event: | ||
| 82 | power:power_frequency | ||
| 83 | This is for userspace compatibility | ||
| 84 | and will vanish after 5 kernel iterations, | ||
| 85 | namely 2.6.41. | ||
| 86 | |||
| 72 | config CONTEXT_SWITCH_TRACER | 87 | config CONTEXT_SWITCH_TRACER |
| 73 | bool | 88 | bool |
| 74 | 89 | ||
| @@ -126,7 +141,7 @@ if FTRACE | |||
| 126 | config FUNCTION_TRACER | 141 | config FUNCTION_TRACER |
| 127 | bool "Kernel Function Tracer" | 142 | bool "Kernel Function Tracer" |
| 128 | depends on HAVE_FUNCTION_TRACER | 143 | depends on HAVE_FUNCTION_TRACER |
| 129 | select FRAME_POINTER if (!ARM_UNWIND) | 144 | select FRAME_POINTER if !ARM_UNWIND && !S390 |
| 130 | select KALLSYMS | 145 | select KALLSYMS |
| 131 | select GENERIC_TRACER | 146 | select GENERIC_TRACER |
| 132 | select CONTEXT_SWITCH_TRACER | 147 | select CONTEXT_SWITCH_TRACER |
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 53f338190b2..761c510a06c 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile | |||
| @@ -52,7 +52,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o | |||
| 52 | endif | 52 | endif |
| 53 | obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o | 53 | obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o |
| 54 | obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o | 54 | obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o |
| 55 | obj-$(CONFIG_EVENT_TRACING) += power-traces.o | 55 | obj-$(CONFIG_TRACEPOINTS) += power-traces.o |
| 56 | ifeq ($(CONFIG_TRACING),y) | 56 | ifeq ($(CONFIG_TRACING),y) |
| 57 | obj-$(CONFIG_KGDB_KDB) += trace_kdb.o | 57 | obj-$(CONFIG_KGDB_KDB) += trace_kdb.o |
| 58 | endif | 58 | endif |
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index bc251ed6672..d95721f3370 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c | |||
| @@ -138,6 +138,13 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...) | |||
| 138 | !blk_tracer_enabled)) | 138 | !blk_tracer_enabled)) |
| 139 | return; | 139 | return; |
| 140 | 140 | ||
| 141 | /* | ||
| 142 | * If the BLK_TC_NOTIFY action mask isn't set, don't send any note | ||
| 143 | * message to the trace. | ||
| 144 | */ | ||
| 145 | if (!(bt->act_mask & BLK_TC_NOTIFY)) | ||
| 146 | return; | ||
| 147 | |||
| 141 | local_irq_save(flags); | 148 | local_irq_save(flags); |
| 142 | buf = per_cpu_ptr(bt->msg_data, smp_processor_id()); | 149 | buf = per_cpu_ptr(bt->msg_data, smp_processor_id()); |
| 143 | va_start(args, fmt); | 150 | va_start(args, fmt); |
| @@ -168,7 +175,6 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector, | |||
| 168 | static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), | 175 | static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), |
| 169 | BLK_TC_ACT(BLK_TC_WRITE) }; | 176 | BLK_TC_ACT(BLK_TC_WRITE) }; |
| 170 | 177 | ||
| 171 | #define BLK_TC_HARDBARRIER BLK_TC_BARRIER | ||
| 172 | #define BLK_TC_RAHEAD BLK_TC_AHEAD | 178 | #define BLK_TC_RAHEAD BLK_TC_AHEAD |
| 173 | 179 | ||
| 174 | /* The ilog2() calls fall out because they're constant */ | 180 | /* The ilog2() calls fall out because they're constant */ |
| @@ -196,7 +202,6 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, | |||
| 196 | return; | 202 | return; |
| 197 | 203 | ||
| 198 | what |= ddir_act[rw & WRITE]; | 204 | what |= ddir_act[rw & WRITE]; |
| 199 | what |= MASK_TC_BIT(rw, HARDBARRIER); | ||
| 200 | what |= MASK_TC_BIT(rw, SYNC); | 205 | what |= MASK_TC_BIT(rw, SYNC); |
| 201 | what |= MASK_TC_BIT(rw, RAHEAD); | 206 | what |= MASK_TC_BIT(rw, RAHEAD); |
| 202 | what |= MASK_TC_BIT(rw, META); | 207 | what |= MASK_TC_BIT(rw, META); |
| @@ -760,53 +765,58 @@ static void blk_add_trace_rq_complete(void *ignore, | |||
| 760 | * @q: queue the io is for | 765 | * @q: queue the io is for |
| 761 | * @bio: the source bio | 766 | * @bio: the source bio |
| 762 | * @what: the action | 767 | * @what: the action |
| 768 | * @error: error, if any | ||
| 763 | * | 769 | * |
| 764 | * Description: | 770 | * Description: |
| 765 | * Records an action against a bio. Will log the bio offset + size. | 771 | * Records an action against a bio. Will log the bio offset + size. |
| 766 | * | 772 | * |
| 767 | **/ | 773 | **/ |
| 768 | static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, | 774 | static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, |
| 769 | u32 what) | 775 | u32 what, int error) |
| 770 | { | 776 | { |
| 771 | struct blk_trace *bt = q->blk_trace; | 777 | struct blk_trace *bt = q->blk_trace; |
| 772 | 778 | ||
| 773 | if (likely(!bt)) | 779 | if (likely(!bt)) |
| 774 | return; | 780 | return; |
| 775 | 781 | ||
| 782 | if (!error && !bio_flagged(bio, BIO_UPTODATE)) | ||
| 783 | error = EIO; | ||
| 784 | |||
| 776 | __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, | 785 | __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, |
| 777 | !bio_flagged(bio, BIO_UPTODATE), 0, NULL); | 786 | error, 0, NULL); |
| 778 | } | 787 | } |
| 779 | 788 | ||
| 780 | static void blk_add_trace_bio_bounce(void *ignore, | 789 | static void blk_add_trace_bio_bounce(void *ignore, |
| 781 | struct request_queue *q, struct bio *bio) | 790 | struct request_queue *q, struct bio *bio) |
| 782 | { | 791 | { |
| 783 | blk_add_trace_bio(q, bio, BLK_TA_BOUNCE); | 792 | blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0); |
| 784 | } | 793 | } |
| 785 | 794 | ||
| 786 | static void blk_add_trace_bio_complete(void *ignore, | 795 | static void blk_add_trace_bio_complete(void *ignore, |
| 787 | struct request_queue *q, struct bio *bio) | 796 | struct request_queue *q, struct bio *bio, |
| 797 | int error) | ||
| 788 | { | 798 | { |
| 789 | blk_add_trace_bio(q, bio, BLK_TA_COMPLETE); | 799 | blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error); |
| 790 | } | 800 | } |
| 791 | 801 | ||
| 792 | static void blk_add_trace_bio_backmerge(void *ignore, | 802 | static void blk_add_trace_bio_backmerge(void *ignore, |
| 793 | struct request_queue *q, | 803 | struct request_queue *q, |
| 794 | struct bio *bio) | 804 | struct bio *bio) |
| 795 | { | 805 | { |
| 796 | blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE); | 806 | blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0); |
| 797 | } | 807 | } |
| 798 | 808 | ||
| 799 | static void blk_add_trace_bio_frontmerge(void *ignore, | 809 | static void blk_add_trace_bio_frontmerge(void *ignore, |
| 800 | struct request_queue *q, | 810 | struct request_queue *q, |
| 801 | struct bio *bio) | 811 | struct bio *bio) |
| 802 | { | 812 | { |
| 803 | blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE); | 813 | blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0); |
| 804 | } | 814 | } |
| 805 | 815 | ||
| 806 | static void blk_add_trace_bio_queue(void *ignore, | 816 | static void blk_add_trace_bio_queue(void *ignore, |
| 807 | struct request_queue *q, struct bio *bio) | 817 | struct request_queue *q, struct bio *bio) |
| 808 | { | 818 | { |
| 809 | blk_add_trace_bio(q, bio, BLK_TA_QUEUE); | 819 | blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0); |
| 810 | } | 820 | } |
| 811 | 821 | ||
| 812 | static void blk_add_trace_getrq(void *ignore, | 822 | static void blk_add_trace_getrq(void *ignore, |
| @@ -814,7 +824,7 @@ static void blk_add_trace_getrq(void *ignore, | |||
| 814 | struct bio *bio, int rw) | 824 | struct bio *bio, int rw) |
| 815 | { | 825 | { |
| 816 | if (bio) | 826 | if (bio) |
| 817 | blk_add_trace_bio(q, bio, BLK_TA_GETRQ); | 827 | blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0); |
| 818 | else { | 828 | else { |
| 819 | struct blk_trace *bt = q->blk_trace; | 829 | struct blk_trace *bt = q->blk_trace; |
| 820 | 830 | ||
| @@ -829,7 +839,7 @@ static void blk_add_trace_sleeprq(void *ignore, | |||
| 829 | struct bio *bio, int rw) | 839 | struct bio *bio, int rw) |
| 830 | { | 840 | { |
| 831 | if (bio) | 841 | if (bio) |
| 832 | blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ); | 842 | blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0); |
| 833 | else { | 843 | else { |
| 834 | struct blk_trace *bt = q->blk_trace; | 844 | struct blk_trace *bt = q->blk_trace; |
| 835 | 845 | ||
| @@ -889,7 +899,7 @@ static void blk_add_trace_split(void *ignore, | |||
| 889 | } | 899 | } |
| 890 | 900 | ||
| 891 | /** | 901 | /** |
| 892 | * blk_add_trace_remap - Add a trace for a remap operation | 902 | * blk_add_trace_bio_remap - Add a trace for a bio-remap operation |
| 893 | * @ignore: trace callback data parameter (not used) | 903 | * @ignore: trace callback data parameter (not used) |
| 894 | * @q: queue the io is for | 904 | * @q: queue the io is for |
| 895 | * @bio: the source bio | 905 | * @bio: the source bio |
| @@ -901,9 +911,9 @@ static void blk_add_trace_split(void *ignore, | |||
| 901 | * it spans a stripe (or similar). Add a trace for that action. | 911 | * it spans a stripe (or similar). Add a trace for that action. |
| 902 | * | 912 | * |
| 903 | **/ | 913 | **/ |
| 904 | static void blk_add_trace_remap(void *ignore, | 914 | static void blk_add_trace_bio_remap(void *ignore, |
| 905 | struct request_queue *q, struct bio *bio, | 915 | struct request_queue *q, struct bio *bio, |
| 906 | dev_t dev, sector_t from) | 916 | dev_t dev, sector_t from) |
| 907 | { | 917 | { |
| 908 | struct blk_trace *bt = q->blk_trace; | 918 | struct blk_trace *bt = q->blk_trace; |
| 909 | struct blk_io_trace_remap r; | 919 | struct blk_io_trace_remap r; |
| @@ -1018,7 +1028,7 @@ static void blk_register_tracepoints(void) | |||
| 1018 | WARN_ON(ret); | 1028 | WARN_ON(ret); |
| 1019 | ret = register_trace_block_split(blk_add_trace_split, NULL); | 1029 | ret = register_trace_block_split(blk_add_trace_split, NULL); |
| 1020 | WARN_ON(ret); | 1030 | WARN_ON(ret); |
| 1021 | ret = register_trace_block_remap(blk_add_trace_remap, NULL); | 1031 | ret = register_trace_block_bio_remap(blk_add_trace_bio_remap, NULL); |
| 1022 | WARN_ON(ret); | 1032 | WARN_ON(ret); |
| 1023 | ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); | 1033 | ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); |
| 1024 | WARN_ON(ret); | 1034 | WARN_ON(ret); |
| @@ -1027,7 +1037,7 @@ static void blk_register_tracepoints(void) | |||
| 1027 | static void blk_unregister_tracepoints(void) | 1037 | static void blk_unregister_tracepoints(void) |
| 1028 | { | 1038 | { |
| 1029 | unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); | 1039 | unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); |
| 1030 | unregister_trace_block_remap(blk_add_trace_remap, NULL); | 1040 | unregister_trace_block_bio_remap(blk_add_trace_bio_remap, NULL); |
| 1031 | unregister_trace_block_split(blk_add_trace_split, NULL); | 1041 | unregister_trace_block_split(blk_add_trace_split, NULL); |
| 1032 | unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL); | 1042 | unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL); |
| 1033 | unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL); | 1043 | unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL); |
| @@ -1807,8 +1817,6 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes) | |||
| 1807 | 1817 | ||
| 1808 | if (rw & REQ_RAHEAD) | 1818 | if (rw & REQ_RAHEAD) |
| 1809 | rwbs[i++] = 'A'; | 1819 | rwbs[i++] = 'A'; |
| 1810 | if (rw & REQ_HARDBARRIER) | ||
| 1811 | rwbs[i++] = 'B'; | ||
| 1812 | if (rw & REQ_SYNC) | 1820 | if (rw & REQ_SYNC) |
| 1813 | rwbs[i++] = 'S'; | 1821 | rwbs[i++] = 'S'; |
| 1814 | if (rw & REQ_META) | 1822 | if (rw & REQ_META) |
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c index a22582a0616..f55fcf61b22 100644 --- a/kernel/trace/power-traces.c +++ b/kernel/trace/power-traces.c | |||
| @@ -13,5 +13,8 @@ | |||
| 13 | #define CREATE_TRACE_POINTS | 13 | #define CREATE_TRACE_POINTS |
| 14 | #include <trace/events/power.h> | 14 | #include <trace/events/power.h> |
| 15 | 15 | ||
| 16 | EXPORT_TRACEPOINT_SYMBOL_GPL(power_frequency); | 16 | #ifdef EVENT_POWER_TRACING_DEPRECATED |
| 17 | EXPORT_TRACEPOINT_SYMBOL_GPL(power_start); | ||
| 18 | #endif | ||
| 19 | EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); | ||
| 17 | 20 | ||
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 9ed509a015d..bd1c35a4fbc 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
| @@ -3853,6 +3853,13 @@ int ring_buffer_read_page(struct ring_buffer *buffer, | |||
| 3853 | 3853 | ||
| 3854 | /* Need to copy one event at a time */ | 3854 | /* Need to copy one event at a time */ |
| 3855 | do { | 3855 | do { |
| 3856 | /* We need the size of one event, because | ||
| 3857 | * rb_advance_reader only advances by one event, | ||
| 3858 | * whereas rb_event_ts_length may include the size of | ||
| 3859 | * one or two events. | ||
| 3860 | * We have already ensured there's enough space if this | ||
| 3861 | * is a time extend. */ | ||
| 3862 | size = rb_event_length(event); | ||
| 3856 | memcpy(bpage->data + pos, rpage->data + rpos, size); | 3863 | memcpy(bpage->data + pos, rpage->data + rpos, size); |
| 3857 | 3864 | ||
| 3858 | len -= size; | 3865 | len -= size; |
| @@ -3867,7 +3874,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, | |||
| 3867 | event = rb_reader_event(cpu_buffer); | 3874 | event = rb_reader_event(cpu_buffer); |
| 3868 | /* Always keep the time extend and data together */ | 3875 | /* Always keep the time extend and data together */ |
| 3869 | size = rb_event_ts_length(event); | 3876 | size = rb_event_ts_length(event); |
| 3870 | } while (len > size); | 3877 | } while (len >= size); |
| 3871 | 3878 | ||
| 3872 | /* update bpage */ | 3879 | /* update bpage */ |
| 3873 | local_set(&bpage->commit, pos); | 3880 | local_set(&bpage->commit, pos); |
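The ring_buffer_read_page() hunk above does two things: it sizes the memcpy with rb_event_length() (one event) while keeping rb_event_ts_length() (which may span a time extend plus its event) for the loop bound, and it relaxes that bound from len > size to len >= size. The second change matters when the remaining space exactly fits the next event; with the strict comparison that event was silently dropped. Below is a simplified stand-in for the loop shape, with invented record sizes (the real loop is a do/while and also handles time-extend records).

/* Sketch of the ring_buffer_read_page() loop-condition fix: with
 * "len > size" the loop stops even when the remaining space exactly
 * fits the next record, so that record is dropped; "len >= size"
 * copies it. Record sizes and buffer length are invented.
 */
#include <stdio.h>

static size_t copy_records(const size_t *sizes, int n, size_t len,
			   int strict_greater)
{
	size_t copied = 0;
	int i = 0;
	size_t size = sizes[0];

	/* mirror the kernel loop shape: check space, copy one record, advance */
	while (i < n && (strict_greater ? len > size : len >= size)) {
		copied += size;          /* stands in for the memcpy of one event */
		len -= size;
		if (++i < n)
			size = sizes[i]; /* length of the next record */
	}
	return copied;
}

int main(void)
{
	const size_t sizes[] = { 16, 24, 8 };   /* three records, 48 bytes total */
	size_t page = 48;                       /* reader page with exactly enough room */

	printf("len > size : copied %zu of 48 bytes\n",
	       copy_records(sizes, 3, page, 1));
	printf("len >= size: copied %zu of 48 bytes\n",
	       copy_records(sizes, 3, page, 0));
	return 0;
}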
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 82d9b8106cd..dc53ecb8058 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
| @@ -17,7 +17,6 @@ | |||
| 17 | #include <linux/writeback.h> | 17 | #include <linux/writeback.h> |
| 18 | #include <linux/kallsyms.h> | 18 | #include <linux/kallsyms.h> |
| 19 | #include <linux/seq_file.h> | 19 | #include <linux/seq_file.h> |
| 20 | #include <linux/smp_lock.h> | ||
| 21 | #include <linux/notifier.h> | 20 | #include <linux/notifier.h> |
| 22 | #include <linux/irqflags.h> | 21 | #include <linux/irqflags.h> |
| 23 | #include <linux/debugfs.h> | 22 | #include <linux/debugfs.h> |
| @@ -1284,6 +1283,8 @@ void trace_dump_stack(void) | |||
| 1284 | __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count()); | 1283 | __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count()); |
| 1285 | } | 1284 | } |
| 1286 | 1285 | ||
| 1286 | static DEFINE_PER_CPU(int, user_stack_count); | ||
| 1287 | |||
| 1287 | void | 1288 | void |
| 1288 | ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) | 1289 | ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) |
| 1289 | { | 1290 | { |
| @@ -1302,10 +1303,20 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) | |||
| 1302 | if (unlikely(in_nmi())) | 1303 | if (unlikely(in_nmi())) |
| 1303 | return; | 1304 | return; |
| 1304 | 1305 | ||
| 1306 | /* | ||
| 1307 | * prevent recursion, since the user stack tracing may | ||
| 1308 | * trigger other kernel events. | ||
| 1309 | */ | ||
| 1310 | preempt_disable(); | ||
| 1311 | if (__this_cpu_read(user_stack_count)) | ||
| 1312 | goto out; | ||
| 1313 | |||
| 1314 | __this_cpu_inc(user_stack_count); | ||
| 1315 | |||
| 1305 | event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, | 1316 | event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, |
| 1306 | sizeof(*entry), flags, pc); | 1317 | sizeof(*entry), flags, pc); |
| 1307 | if (!event) | 1318 | if (!event) |
| 1308 | return; | 1319 | goto out_drop_count; |
| 1309 | entry = ring_buffer_event_data(event); | 1320 | entry = ring_buffer_event_data(event); |
| 1310 | 1321 | ||
| 1311 | entry->tgid = current->tgid; | 1322 | entry->tgid = current->tgid; |
| @@ -1319,6 +1330,11 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) | |||
| 1319 | save_stack_trace_user(&trace); | 1330 | save_stack_trace_user(&trace); |
| 1320 | if (!filter_check_discard(call, entry, buffer, event)) | 1331 | if (!filter_check_discard(call, entry, buffer, event)) |
| 1321 | ring_buffer_unlock_commit(buffer, event); | 1332 | ring_buffer_unlock_commit(buffer, event); |
| 1333 | |||
| 1334 | out_drop_count: | ||
| 1335 | __this_cpu_dec(user_stack_count); | ||
| 1336 | out: | ||
| 1337 | preempt_enable(); | ||
| 1322 | } | 1338 | } |
| 1323 | 1339 | ||
| 1324 | #ifdef UNUSED | 1340 | #ifdef UNUSED |
| @@ -2320,11 +2336,19 @@ tracing_write_stub(struct file *filp, const char __user *ubuf, | |||
| 2320 | return count; | 2336 | return count; |
| 2321 | } | 2337 | } |
| 2322 | 2338 | ||
| 2339 | static loff_t tracing_seek(struct file *file, loff_t offset, int origin) | ||
| 2340 | { | ||
| 2341 | if (file->f_mode & FMODE_READ) | ||
| 2342 | return seq_lseek(file, offset, origin); | ||
| 2343 | else | ||
| 2344 | return 0; | ||
| 2345 | } | ||
| 2346 | |||
| 2323 | static const struct file_operations tracing_fops = { | 2347 | static const struct file_operations tracing_fops = { |
| 2324 | .open = tracing_open, | 2348 | .open = tracing_open, |
| 2325 | .read = seq_read, | 2349 | .read = seq_read, |
| 2326 | .write = tracing_write_stub, | 2350 | .write = tracing_write_stub, |
| 2327 | .llseek = seq_lseek, | 2351 | .llseek = tracing_seek, |
| 2328 | .release = tracing_release, | 2352 | .release = tracing_release, |
| 2329 | }; | 2353 | }; |
| 2330 | 2354 | ||
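The ftrace_trace_userstack() hunk above adds a per-CPU recursion guard under disabled preemption: if capturing a user stack itself triggers another trace event on the same CPU, the nested call bails out instead of recursing. Below is a minimal userspace analogue of that pattern; a __thread counter stands in for the per-CPU variable plus preempt_disable(), and all names are invented for the example.

/* Userspace analogue of the per-CPU recursion guard added to
 * ftrace_trace_userstack(). A thread-local counter replaces the
 * per-CPU variable; names are invented for this example.
 */
#include <stdio.h>

static __thread int user_stack_count;

static void capture_user_stack(int depth)
{
	/* Nested invocation on this thread: drop it instead of recursing. */
	if (user_stack_count) {
		printf("depth %d: recursion detected, skipping\n", depth);
		return;
	}

	user_stack_count++;

	printf("depth %d: capturing user stack\n", depth);
	if (depth < 2)
		capture_user_stack(depth + 1);  /* simulate a re-entrant trace event */

	user_stack_count--;
}

int main(void)
{
	capture_user_stack(0);
	/* After the guard is released, a fresh capture works again. */
	capture_user_stack(0);
	return 0;
}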
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index e3dfecaf13e..6cf223764be 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h | |||
| @@ -53,7 +53,7 @@ | |||
| 53 | */ | 53 | */ |
| 54 | 54 | ||
| 55 | /* | 55 | /* |
| 56 | * Function trace entry - function address and parent function addres: | 56 | * Function trace entry - function address and parent function address: |
| 57 | */ | 57 | */ |
| 58 | FTRACE_ENTRY(function, ftrace_entry, | 58 | FTRACE_ENTRY(function, ftrace_entry, |
| 59 | 59 | ||
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 39c059ca670..19a359d5e6d 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c | |||
| @@ -21,17 +21,46 @@ typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)]) | |||
| 21 | /* Count the events in use (per event id, not per instance) */ | 21 | /* Count the events in use (per event id, not per instance) */ |
| 22 | static int total_ref_count; | 22 | static int total_ref_count; |
| 23 | 23 | ||
| 24 | static int perf_trace_event_perm(struct ftrace_event_call *tp_event, | ||
| 25 | struct perf_event *p_event) | ||
| 26 | { | ||
| 27 | /* No tracing, just counting, so no obvious leak */ | ||
| 28 | if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW)) | ||
| 29 | return 0; | ||
| 30 | |||
| 31 | /* Some events are ok to be traced by non-root users... */ | ||
| 32 | if (p_event->attach_state == PERF_ATTACH_TASK) { | ||
| 33 | if (tp_event->flags & TRACE_EVENT_FL_CAP_ANY) | ||
| 34 | return 0; | ||
| 35 | } | ||
| 36 | |||
| 37 | /* | ||
| 38 | * ...otherwise raw tracepoint data can be a severe data leak, | ||
| 39 | * only allow root to have these. | ||
| 40 | */ | ||
| 41 | if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) | ||
| 42 | return -EPERM; | ||
| 43 | |||
| 44 | return 0; | ||
| 45 | } | ||
| 46 | |||
| 24 | static int perf_trace_event_init(struct ftrace_event_call *tp_event, | 47 | static int perf_trace_event_init(struct ftrace_event_call *tp_event, |
| 25 | struct perf_event *p_event) | 48 | struct perf_event *p_event) |
| 26 | { | 49 | { |
| 27 | struct hlist_head __percpu *list; | 50 | struct hlist_head __percpu *list; |
| 28 | int ret = -ENOMEM; | 51 | int ret; |
| 29 | int cpu; | 52 | int cpu; |
| 30 | 53 | ||
| 54 | ret = perf_trace_event_perm(tp_event, p_event); | ||
| 55 | if (ret) | ||
| 56 | return ret; | ||
| 57 | |||
| 31 | p_event->tp_event = tp_event; | 58 | p_event->tp_event = tp_event; |
| 32 | if (tp_event->perf_refcount++ > 0) | 59 | if (tp_event->perf_refcount++ > 0) |
| 33 | return 0; | 60 | return 0; |
| 34 | 61 | ||
| 62 | ret = -ENOMEM; | ||
| 63 | |||
| 35 | list = alloc_percpu(struct hlist_head); | 64 | list = alloc_percpu(struct hlist_head); |
| 36 | if (!list) | 65 | if (!list) |
| 37 | goto fail; | 66 | goto fail; |
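The new perf_trace_event_perm() check above reduces to a small decision table: raw tracepoint samples go only to privileged callers unless the event is per-task and explicitly marked safe for any user; pure counting is always allowed. A hedged plain-C model of that decision (field and flag names are stand-ins, not the kernel's definitions):

/* Hedged sketch of the permission gate, not the kernel implementation. */
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct req { bool wants_raw; bool per_task; };
struct evt { bool cap_any; };

static int check_perm(const struct evt *e, const struct req *r, bool privileged)
{
    if (!r->wants_raw)              /* just counting, nothing to leak */
        return 0;
    if (r->per_task && e->cap_any)  /* explicitly allowed for any user */
        return 0;
    if (!privileged)                /* otherwise raw data is root-only */
        return -EPERM;
    return 0;
}

int main(void)
{
    struct evt e = { .cap_any = false };
    struct req r = { .wants_raw = true, .per_task = false };

    printf("unprivileged: %d\n", check_perm(&e, &r, false));  /* -EPERM */
    printf("privileged:   %d\n", check_perm(&e, &r, true));   /* 0 */
    return 0;
}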
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 0725eeab193..5f499e0438a 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
| @@ -27,6 +27,12 @@ | |||
| 27 | 27 | ||
| 28 | DEFINE_MUTEX(event_mutex); | 28 | DEFINE_MUTEX(event_mutex); |
| 29 | 29 | ||
| 30 | DEFINE_MUTEX(event_storage_mutex); | ||
| 31 | EXPORT_SYMBOL_GPL(event_storage_mutex); | ||
| 32 | |||
| 33 | char event_storage[EVENT_STORAGE_SIZE]; | ||
| 34 | EXPORT_SYMBOL_GPL(event_storage); | ||
| 35 | |||
| 30 | LIST_HEAD(ftrace_events); | 36 | LIST_HEAD(ftrace_events); |
| 31 | LIST_HEAD(ftrace_common_fields); | 37 | LIST_HEAD(ftrace_common_fields); |
| 32 | 38 | ||
| @@ -1278,7 +1284,7 @@ trace_create_file_ops(struct module *mod) | |||
| 1278 | static void trace_module_add_events(struct module *mod) | 1284 | static void trace_module_add_events(struct module *mod) |
| 1279 | { | 1285 | { |
| 1280 | struct ftrace_module_file_ops *file_ops = NULL; | 1286 | struct ftrace_module_file_ops *file_ops = NULL; |
| 1281 | struct ftrace_event_call *call, *start, *end; | 1287 | struct ftrace_event_call **call, **start, **end; |
| 1282 | 1288 | ||
| 1283 | start = mod->trace_events; | 1289 | start = mod->trace_events; |
| 1284 | end = mod->trace_events + mod->num_trace_events; | 1290 | end = mod->trace_events + mod->num_trace_events; |
| @@ -1291,7 +1297,7 @@ static void trace_module_add_events(struct module *mod) | |||
| 1291 | return; | 1297 | return; |
| 1292 | 1298 | ||
| 1293 | for_each_event(call, start, end) { | 1299 | for_each_event(call, start, end) { |
| 1294 | __trace_add_event_call(call, mod, | 1300 | __trace_add_event_call(*call, mod, |
| 1295 | &file_ops->id, &file_ops->enable, | 1301 | &file_ops->id, &file_ops->enable, |
| 1296 | &file_ops->filter, &file_ops->format); | 1302 | &file_ops->filter, &file_ops->format); |
| 1297 | } | 1303 | } |
| @@ -1361,8 +1367,8 @@ static struct notifier_block trace_module_nb = { | |||
| 1361 | .priority = 0, | 1367 | .priority = 0, |
| 1362 | }; | 1368 | }; |
| 1363 | 1369 | ||
| 1364 | extern struct ftrace_event_call __start_ftrace_events[]; | 1370 | extern struct ftrace_event_call *__start_ftrace_events[]; |
| 1365 | extern struct ftrace_event_call __stop_ftrace_events[]; | 1371 | extern struct ftrace_event_call *__stop_ftrace_events[]; |
| 1366 | 1372 | ||
| 1367 | static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata; | 1373 | static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata; |
| 1368 | 1374 | ||
| @@ -1378,7 +1384,7 @@ __setup("trace_event=", setup_trace_event); | |||
| 1378 | 1384 | ||
| 1379 | static __init int event_trace_init(void) | 1385 | static __init int event_trace_init(void) |
| 1380 | { | 1386 | { |
| 1381 | struct ftrace_event_call *call; | 1387 | struct ftrace_event_call **call; |
| 1382 | struct dentry *d_tracer; | 1388 | struct dentry *d_tracer; |
| 1383 | struct dentry *entry; | 1389 | struct dentry *entry; |
| 1384 | struct dentry *d_events; | 1390 | struct dentry *d_events; |
| @@ -1424,7 +1430,7 @@ static __init int event_trace_init(void) | |||
| 1424 | pr_warning("tracing: Failed to allocate common fields"); | 1430 | pr_warning("tracing: Failed to allocate common fields"); |
| 1425 | 1431 | ||
| 1426 | for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { | 1432 | for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { |
| 1427 | __trace_add_event_call(call, NULL, &ftrace_event_id_fops, | 1433 | __trace_add_event_call(*call, NULL, &ftrace_event_id_fops, |
| 1428 | &ftrace_enable_fops, | 1434 | &ftrace_enable_fops, |
| 1429 | &ftrace_event_filter_fops, | 1435 | &ftrace_event_filter_fops, |
| 1430 | &ftrace_event_format_fops); | 1436 | &ftrace_event_format_fops); |
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 4ba44deaac2..bbeec31e0ae 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c | |||
| @@ -83,13 +83,19 @@ static void __always_unused ____ftrace_check_##name(void) \ | |||
| 83 | 83 | ||
| 84 | #undef __array | 84 | #undef __array |
| 85 | #define __array(type, item, len) \ | 85 | #define __array(type, item, len) \ |
| 86 | BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ | 86 | do { \ |
| 87 | ret = trace_define_field(event_call, #type "[" #len "]", #item, \ | 87 | BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ |
| 88 | mutex_lock(&event_storage_mutex); \ | ||
| 89 | snprintf(event_storage, sizeof(event_storage), \ | ||
| 90 | "%s[%d]", #type, len); \ | ||
| 91 | ret = trace_define_field(event_call, event_storage, #item, \ | ||
| 88 | offsetof(typeof(field), item), \ | 92 | offsetof(typeof(field), item), \ |
| 89 | sizeof(field.item), \ | 93 | sizeof(field.item), \ |
| 90 | is_signed_type(type), FILTER_OTHER); \ | 94 | is_signed_type(type), FILTER_OTHER); \ |
| 91 | if (ret) \ | 95 | mutex_unlock(&event_storage_mutex); \ |
| 92 | return ret; | 96 | if (ret) \ |
| 97 | return ret; \ | ||
| 98 | } while (0); | ||
| 93 | 99 | ||
| 94 | #undef __array_desc | 100 | #undef __array_desc |
| 95 | #define __array_desc(type, container, item, len) \ | 101 | #define __array_desc(type, container, item, len) \ |
| @@ -155,13 +161,13 @@ struct ftrace_event_class event_class_ftrace_##call = { \ | |||
| 155 | .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ | 161 | .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ |
| 156 | }; \ | 162 | }; \ |
| 157 | \ | 163 | \ |
| 158 | struct ftrace_event_call __used \ | 164 | struct ftrace_event_call __used event_##call = { \ |
| 159 | __attribute__((__aligned__(4))) \ | ||
| 160 | __attribute__((section("_ftrace_events"))) event_##call = { \ | ||
| 161 | .name = #call, \ | 165 | .name = #call, \ |
| 162 | .event.type = etype, \ | 166 | .event.type = etype, \ |
| 163 | .class = &event_class_ftrace_##call, \ | 167 | .class = &event_class_ftrace_##call, \ |
| 164 | .print_fmt = print, \ | 168 | .print_fmt = print, \ |
| 165 | }; \ | 169 | }; \ |
| 170 | struct ftrace_event_call __used \ | ||
| 171 | __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; | ||
| 166 | 172 | ||
| 167 | #include "trace_entries.h" | 173 | #include "trace_entries.h" |
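The __array() change above stops pasting "type[len]" together at preprocessing time and instead formats it at run time into one shared buffer (event_storage), serialized by event_storage_mutex. A small userspace sketch of that shared-buffer pattern, with illustrative names standing in for the kernel symbols:

/* Hedged sketch of the shared-buffer pattern; define_field() is a toy
 * stand-in for trace_define_field(). */
#include <pthread.h>
#include <stdio.h>

#define STORAGE_SIZE 128

static char storage[STORAGE_SIZE];
static pthread_mutex_t storage_mutex = PTHREAD_MUTEX_INITIALIZER;

/* Pretend this registers a field with the given type string. */
static int define_field(const char *type, const char *name)
{
    printf("field %-12s : %s\n", name, type);
    return 0;
}

static int define_array_field(const char *type, const char *name, int len)
{
    int ret;

    pthread_mutex_lock(&storage_mutex);
    snprintf(storage, sizeof(storage), "%s[%d]", type, len);
    ret = define_field(storage, name);  /* consumer uses it before unlock */
    pthread_mutex_unlock(&storage_mutex);
    return ret;
}

int main(void)
{
    return define_array_field("char", "comm", 16);
}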
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 5cf8c602b88..92b6e1e12d9 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c | |||
| @@ -453,14 +453,6 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1) | |||
| 453 | * Stubs: | 453 | * Stubs: |
| 454 | */ | 454 | */ |
| 455 | 455 | ||
| 456 | void early_boot_irqs_off(void) | ||
| 457 | { | ||
| 458 | } | ||
| 459 | |||
| 460 | void early_boot_irqs_on(void) | ||
| 461 | { | ||
| 462 | } | ||
| 463 | |||
| 464 | void trace_softirqs_on(unsigned long ip) | 456 | void trace_softirqs_on(unsigned long ip) |
| 465 | { | 457 | { |
| 466 | } | 458 | } |
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 155a415b320..659732eba07 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c | |||
| @@ -558,7 +558,7 @@ trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr) | |||
| 558 | static int trace_wakeup_test_thread(void *data) | 558 | static int trace_wakeup_test_thread(void *data) |
| 559 | { | 559 | { |
| 560 | /* Make this a RT thread, doesn't need to be too high */ | 560 | /* Make this a RT thread, doesn't need to be too high */ |
| 561 | struct sched_param param = { .sched_priority = 5 }; | 561 | static const struct sched_param param = { .sched_priority = 5 }; |
| 562 | struct completion *x = data; | 562 | struct completion *x = data; |
| 563 | 563 | ||
| 564 | sched_setscheduler(current, SCHED_FIFO, &param); | 564 | sched_setscheduler(current, SCHED_FIFO, &param); |
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index bac752f0cfb..5c9fe08d209 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c | |||
| @@ -23,9 +23,6 @@ static int syscall_exit_register(struct ftrace_event_call *event, | |||
| 23 | static int syscall_enter_define_fields(struct ftrace_event_call *call); | 23 | static int syscall_enter_define_fields(struct ftrace_event_call *call); |
| 24 | static int syscall_exit_define_fields(struct ftrace_event_call *call); | 24 | static int syscall_exit_define_fields(struct ftrace_event_call *call); |
| 25 | 25 | ||
| 26 | /* All syscall exit events have the same fields */ | ||
| 27 | static LIST_HEAD(syscall_exit_fields); | ||
| 28 | |||
| 29 | static struct list_head * | 26 | static struct list_head * |
| 30 | syscall_get_enter_fields(struct ftrace_event_call *call) | 27 | syscall_get_enter_fields(struct ftrace_event_call *call) |
| 31 | { | 28 | { |
| @@ -34,50 +31,45 @@ syscall_get_enter_fields(struct ftrace_event_call *call) | |||
| 34 | return &entry->enter_fields; | 31 | return &entry->enter_fields; |
| 35 | } | 32 | } |
| 36 | 33 | ||
| 37 | static struct list_head * | ||
| 38 | syscall_get_exit_fields(struct ftrace_event_call *call) | ||
| 39 | { | ||
| 40 | return &syscall_exit_fields; | ||
| 41 | } | ||
| 42 | |||
| 43 | struct trace_event_functions enter_syscall_print_funcs = { | 34 | struct trace_event_functions enter_syscall_print_funcs = { |
| 44 | .trace = print_syscall_enter, | 35 | .trace = print_syscall_enter, |
| 45 | }; | 36 | }; |
| 46 | 37 | ||
| 47 | struct trace_event_functions exit_syscall_print_funcs = { | 38 | struct trace_event_functions exit_syscall_print_funcs = { |
| 48 | .trace = print_syscall_exit, | 39 | .trace = print_syscall_exit, |
| 49 | }; | 40 | }; |
| 50 | 41 | ||
| 51 | struct ftrace_event_class event_class_syscall_enter = { | 42 | struct ftrace_event_class event_class_syscall_enter = { |
| 52 | .system = "syscalls", | 43 | .system = "syscalls", |
| 53 | .reg = syscall_enter_register, | 44 | .reg = syscall_enter_register, |
| 54 | .define_fields = syscall_enter_define_fields, | 45 | .define_fields = syscall_enter_define_fields, |
| 55 | .get_fields = syscall_get_enter_fields, | 46 | .get_fields = syscall_get_enter_fields, |
| 56 | .raw_init = init_syscall_trace, | 47 | .raw_init = init_syscall_trace, |
| 57 | }; | 48 | }; |
| 58 | 49 | ||
| 59 | struct ftrace_event_class event_class_syscall_exit = { | 50 | struct ftrace_event_class event_class_syscall_exit = { |
| 60 | .system = "syscalls", | 51 | .system = "syscalls", |
| 61 | .reg = syscall_exit_register, | 52 | .reg = syscall_exit_register, |
| 62 | .define_fields = syscall_exit_define_fields, | 53 | .define_fields = syscall_exit_define_fields, |
| 63 | .get_fields = syscall_get_exit_fields, | 54 | .fields = LIST_HEAD_INIT(event_class_syscall_exit.fields), |
| 64 | .raw_init = init_syscall_trace, | 55 | .raw_init = init_syscall_trace, |
| 65 | }; | 56 | }; |
| 66 | 57 | ||
| 67 | extern unsigned long __start_syscalls_metadata[]; | 58 | extern struct syscall_metadata *__start_syscalls_metadata[]; |
| 68 | extern unsigned long __stop_syscalls_metadata[]; | 59 | extern struct syscall_metadata *__stop_syscalls_metadata[]; |
| 69 | 60 | ||
| 70 | static struct syscall_metadata **syscalls_metadata; | 61 | static struct syscall_metadata **syscalls_metadata; |
| 71 | 62 | ||
| 72 | static struct syscall_metadata *find_syscall_meta(unsigned long syscall) | 63 | static __init struct syscall_metadata * |
| 64 | find_syscall_meta(unsigned long syscall) | ||
| 73 | { | 65 | { |
| 74 | struct syscall_metadata *start; | 66 | struct syscall_metadata **start; |
| 75 | struct syscall_metadata *stop; | 67 | struct syscall_metadata **stop; |
| 76 | char str[KSYM_SYMBOL_LEN]; | 68 | char str[KSYM_SYMBOL_LEN]; |
| 77 | 69 | ||
| 78 | 70 | ||
| 79 | start = (struct syscall_metadata *)__start_syscalls_metadata; | 71 | start = __start_syscalls_metadata; |
| 80 | stop = (struct syscall_metadata *)__stop_syscalls_metadata; | 72 | stop = __stop_syscalls_metadata; |
| 81 | kallsyms_lookup(syscall, NULL, NULL, NULL, str); | 73 | kallsyms_lookup(syscall, NULL, NULL, NULL, str); |
| 82 | 74 | ||
| 83 | for ( ; start < stop; start++) { | 75 | for ( ; start < stop; start++) { |
| @@ -87,8 +79,8 @@ static struct syscall_metadata *find_syscall_meta(unsigned long syscall) | |||
| 87 | * with "SyS" instead of "sys", leading to an unwanted | 79 | * with "SyS" instead of "sys", leading to an unwanted |
| 88 | * mismatch. | 80 | * mismatch. |
| 89 | */ | 81 | */ |
| 90 | if (start->name && !strcmp(start->name + 3, str + 3)) | 82 | if ((*start)->name && !strcmp((*start)->name + 3, str + 3)) |
| 91 | return start; | 83 | return *start; |
| 92 | } | 84 | } |
| 93 | return NULL; | 85 | return NULL; |
| 94 | } | 86 | } |
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index e95ee7f31d4..68187af4889 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c | |||
| @@ -27,8 +27,8 @@ | |||
| 27 | #include <linux/sched.h> | 27 | #include <linux/sched.h> |
| 28 | #include <linux/jump_label.h> | 28 | #include <linux/jump_label.h> |
| 29 | 29 | ||
| 30 | extern struct tracepoint __start___tracepoints[]; | 30 | extern struct tracepoint * const __start___tracepoints_ptrs[]; |
| 31 | extern struct tracepoint __stop___tracepoints[]; | 31 | extern struct tracepoint * const __stop___tracepoints_ptrs[]; |
| 32 | 32 | ||
| 33 | /* Set to 1 to enable tracepoint debug output */ | 33 | /* Set to 1 to enable tracepoint debug output */ |
| 34 | static const int tracepoint_debug; | 34 | static const int tracepoint_debug; |
| @@ -298,10 +298,10 @@ static void disable_tracepoint(struct tracepoint *elem) | |||
| 298 | * | 298 | * |
| 299 | * Updates the probe callback corresponding to a range of tracepoints. | 299 | * Updates the probe callback corresponding to a range of tracepoints. |
| 300 | */ | 300 | */ |
| 301 | void | 301 | void tracepoint_update_probe_range(struct tracepoint * const *begin, |
| 302 | tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end) | 302 | struct tracepoint * const *end) |
| 303 | { | 303 | { |
| 304 | struct tracepoint *iter; | 304 | struct tracepoint * const *iter; |
| 305 | struct tracepoint_entry *mark_entry; | 305 | struct tracepoint_entry *mark_entry; |
| 306 | 306 | ||
| 307 | if (!begin) | 307 | if (!begin) |
| @@ -309,12 +309,12 @@ tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end) | |||
| 309 | 309 | ||
| 310 | mutex_lock(&tracepoints_mutex); | 310 | mutex_lock(&tracepoints_mutex); |
| 311 | for (iter = begin; iter < end; iter++) { | 311 | for (iter = begin; iter < end; iter++) { |
| 312 | mark_entry = get_tracepoint(iter->name); | 312 | mark_entry = get_tracepoint((*iter)->name); |
| 313 | if (mark_entry) { | 313 | if (mark_entry) { |
| 314 | set_tracepoint(&mark_entry, iter, | 314 | set_tracepoint(&mark_entry, *iter, |
| 315 | !!mark_entry->refcount); | 315 | !!mark_entry->refcount); |
| 316 | } else { | 316 | } else { |
| 317 | disable_tracepoint(iter); | 317 | disable_tracepoint(*iter); |
| 318 | } | 318 | } |
| 319 | } | 319 | } |
| 320 | mutex_unlock(&tracepoints_mutex); | 320 | mutex_unlock(&tracepoints_mutex); |
| @@ -326,8 +326,8 @@ tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end) | |||
| 326 | static void tracepoint_update_probes(void) | 326 | static void tracepoint_update_probes(void) |
| 327 | { | 327 | { |
| 328 | /* Core kernel tracepoints */ | 328 | /* Core kernel tracepoints */ |
| 329 | tracepoint_update_probe_range(__start___tracepoints, | 329 | tracepoint_update_probe_range(__start___tracepoints_ptrs, |
| 330 | __stop___tracepoints); | 330 | __stop___tracepoints_ptrs); |
| 331 | /* tracepoints in modules. */ | 331 | /* tracepoints in modules. */ |
| 332 | module_update_tracepoints(); | 332 | module_update_tracepoints(); |
| 333 | } | 333 | } |
| @@ -514,8 +514,8 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_update_all); | |||
| 514 | * Will return the first tracepoint in the range if the input tracepoint is | 514 | * Will return the first tracepoint in the range if the input tracepoint is |
| 515 | * NULL. | 515 | * NULL. |
| 516 | */ | 516 | */ |
| 517 | int tracepoint_get_iter_range(struct tracepoint **tracepoint, | 517 | int tracepoint_get_iter_range(struct tracepoint * const **tracepoint, |
| 518 | struct tracepoint *begin, struct tracepoint *end) | 518 | struct tracepoint * const *begin, struct tracepoint * const *end) |
| 519 | { | 519 | { |
| 520 | if (!*tracepoint && begin != end) { | 520 | if (!*tracepoint && begin != end) { |
| 521 | *tracepoint = begin; | 521 | *tracepoint = begin; |
| @@ -534,7 +534,8 @@ static void tracepoint_get_iter(struct tracepoint_iter *iter) | |||
| 534 | /* Core kernel tracepoints */ | 534 | /* Core kernel tracepoints */ |
| 535 | if (!iter->module) { | 535 | if (!iter->module) { |
| 536 | found = tracepoint_get_iter_range(&iter->tracepoint, | 536 | found = tracepoint_get_iter_range(&iter->tracepoint, |
| 537 | __start___tracepoints, __stop___tracepoints); | 537 | __start___tracepoints_ptrs, |
| 538 | __stop___tracepoints_ptrs); | ||
| 538 | if (found) | 539 | if (found) |
| 539 | goto end; | 540 | goto end; |
| 540 | } | 541 | } |
| @@ -585,8 +586,8 @@ int tracepoint_module_notify(struct notifier_block *self, | |||
| 585 | switch (val) { | 586 | switch (val) { |
| 586 | case MODULE_STATE_COMING: | 587 | case MODULE_STATE_COMING: |
| 587 | case MODULE_STATE_GOING: | 588 | case MODULE_STATE_GOING: |
| 588 | tracepoint_update_probe_range(mod->tracepoints, | 589 | tracepoint_update_probe_range(mod->tracepoints_ptrs, |
| 589 | mod->tracepoints + mod->num_tracepoints); | 590 | mod->tracepoints_ptrs + mod->num_tracepoints); |
| 590 | break; | 591 | break; |
| 591 | } | 592 | } |
| 592 | return 0; | 593 | return 0; |
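The tracepoint and ftrace hunks in this series replace arrays of structures placed in special linker sections with arrays of pointers, iterated between __start_* and __stop_* symbols. A hedged userspace sketch of the same layout, assuming GCC plus GNU ld on ELF, which auto-generates __start_/__stop_ symbols for sections whose names are valid C identifiers; everything here is illustrative, not the kernel's section names:

/* Hedged sketch: collecting one pointer per object in a named section
 * keeps section entries uniformly sized, independent of each object's
 * size or alignment. */
#include <stdio.h>

struct tracepoint_like {
    const char *name;
};

/* Two objects defined in ordinary data... */
static struct tracepoint_like tp_a = { "alpha" };
static struct tracepoint_like tp_b = { "beta" };

/* ...and one pointer per object collected in a named section. */
static struct tracepoint_like *ptr_a
    __attribute__((used, section("tp_ptrs"))) = &tp_a;
static struct tracepoint_like *ptr_b
    __attribute__((used, section("tp_ptrs"))) = &tp_b;

/* Provided automatically by GNU ld for C-identifier-named sections. */
extern struct tracepoint_like *__start_tp_ptrs[];
extern struct tracepoint_like *__stop_tp_ptrs[];

int main(void)
{
    for (struct tracepoint_like **it = __start_tp_ptrs; it < __stop_tp_ptrs; it++)
        printf("tracepoint: %s\n", (*it)->name);
    return 0;
}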
diff --git a/kernel/user.c b/kernel/user.c index 2c7d8d5914b..5c598ca781d 100644 --- a/kernel/user.c +++ b/kernel/user.c | |||
| @@ -158,6 +158,7 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) | |||
| 158 | spin_lock_irq(&uidhash_lock); | 158 | spin_lock_irq(&uidhash_lock); |
| 159 | up = uid_hash_find(uid, hashent); | 159 | up = uid_hash_find(uid, hashent); |
| 160 | if (up) { | 160 | if (up) { |
| 161 | put_user_ns(ns); | ||
| 161 | key_put(new->uid_keyring); | 162 | key_put(new->uid_keyring); |
| 162 | key_put(new->session_keyring); | 163 | key_put(new->session_keyring); |
| 163 | kmem_cache_free(uid_cachep, new); | 164 | kmem_cache_free(uid_cachep, new); |
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 25915832291..9da289c34f2 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c | |||
| @@ -12,6 +12,8 @@ | |||
| 12 | #include <linux/highuid.h> | 12 | #include <linux/highuid.h> |
| 13 | #include <linux/cred.h> | 13 | #include <linux/cred.h> |
| 14 | 14 | ||
| 15 | static struct kmem_cache *user_ns_cachep __read_mostly; | ||
| 16 | |||
| 15 | /* | 17 | /* |
| 16 | * Create a new user namespace, deriving the creator from the user in the | 18 | * Create a new user namespace, deriving the creator from the user in the |
| 17 | * passed credentials, and replacing that user with the new root user for the | 19 | * passed credentials, and replacing that user with the new root user for the |
| @@ -26,7 +28,7 @@ int create_user_ns(struct cred *new) | |||
| 26 | struct user_struct *root_user; | 28 | struct user_struct *root_user; |
| 27 | int n; | 29 | int n; |
| 28 | 30 | ||
| 29 | ns = kmalloc(sizeof(struct user_namespace), GFP_KERNEL); | 31 | ns = kmem_cache_alloc(user_ns_cachep, GFP_KERNEL); |
| 30 | if (!ns) | 32 | if (!ns) |
| 31 | return -ENOMEM; | 33 | return -ENOMEM; |
| 32 | 34 | ||
| @@ -38,7 +40,7 @@ int create_user_ns(struct cred *new) | |||
| 38 | /* Alloc new root user. */ | 40 | /* Alloc new root user. */ |
| 39 | root_user = alloc_uid(ns, 0); | 41 | root_user = alloc_uid(ns, 0); |
| 40 | if (!root_user) { | 42 | if (!root_user) { |
| 41 | kfree(ns); | 43 | kmem_cache_free(user_ns_cachep, ns); |
| 42 | return -ENOMEM; | 44 | return -ENOMEM; |
| 43 | } | 45 | } |
| 44 | 46 | ||
| @@ -71,7 +73,7 @@ static void free_user_ns_work(struct work_struct *work) | |||
| 71 | struct user_namespace *ns = | 73 | struct user_namespace *ns = |
| 72 | container_of(work, struct user_namespace, destroyer); | 74 | container_of(work, struct user_namespace, destroyer); |
| 73 | free_uid(ns->creator); | 75 | free_uid(ns->creator); |
| 74 | kfree(ns); | 76 | kmem_cache_free(user_ns_cachep, ns); |
| 75 | } | 77 | } |
| 76 | 78 | ||
| 77 | void free_user_ns(struct kref *kref) | 79 | void free_user_ns(struct kref *kref) |
| @@ -126,3 +128,10 @@ gid_t user_ns_map_gid(struct user_namespace *to, const struct cred *cred, gid_t | |||
| 126 | /* No useful relationship so no mapping */ | 128 | /* No useful relationship so no mapping */ |
| 127 | return overflowgid; | 129 | return overflowgid; |
| 128 | } | 130 | } |
| 131 | |||
| 132 | static __init int user_namespaces_init(void) | ||
| 133 | { | ||
| 134 | user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC); | ||
| 135 | return 0; | ||
| 136 | } | ||
| 137 | module_init(user_namespaces_init); | ||
diff --git a/kernel/watchdog.c b/kernel/watchdog.c index bafba687a6d..18bb15776c5 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c | |||
| @@ -27,7 +27,7 @@ | |||
| 27 | #include <asm/irq_regs.h> | 27 | #include <asm/irq_regs.h> |
| 28 | #include <linux/perf_event.h> | 28 | #include <linux/perf_event.h> |
| 29 | 29 | ||
| 30 | int watchdog_enabled; | 30 | int watchdog_enabled = 1; |
| 31 | int __read_mostly softlockup_thresh = 60; | 31 | int __read_mostly softlockup_thresh = 60; |
| 32 | 32 | ||
| 33 | static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); | 33 | static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); |
| @@ -43,9 +43,6 @@ static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); | |||
| 43 | static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); | 43 | static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); |
| 44 | #endif | 44 | #endif |
| 45 | 45 | ||
| 46 | static int __initdata no_watchdog; | ||
| 47 | |||
| 48 | |||
| 49 | /* boot commands */ | 46 | /* boot commands */ |
| 50 | /* | 47 | /* |
| 51 | * Should we panic when a soft-lockup or hard-lockup occurs: | 48 | * Should we panic when a soft-lockup or hard-lockup occurs: |
| @@ -57,6 +54,8 @@ static int __init hardlockup_panic_setup(char *str) | |||
| 57 | { | 54 | { |
| 58 | if (!strncmp(str, "panic", 5)) | 55 | if (!strncmp(str, "panic", 5)) |
| 59 | hardlockup_panic = 1; | 56 | hardlockup_panic = 1; |
| 57 | else if (!strncmp(str, "0", 1)) | ||
| 58 | watchdog_enabled = 0; | ||
| 60 | return 1; | 59 | return 1; |
| 61 | } | 60 | } |
| 62 | __setup("nmi_watchdog=", hardlockup_panic_setup); | 61 | __setup("nmi_watchdog=", hardlockup_panic_setup); |
| @@ -75,7 +74,7 @@ __setup("softlockup_panic=", softlockup_panic_setup); | |||
| 75 | 74 | ||
| 76 | static int __init nowatchdog_setup(char *str) | 75 | static int __init nowatchdog_setup(char *str) |
| 77 | { | 76 | { |
| 78 | no_watchdog = 1; | 77 | watchdog_enabled = 0; |
| 79 | return 1; | 78 | return 1; |
| 80 | } | 79 | } |
| 81 | __setup("nowatchdog", nowatchdog_setup); | 80 | __setup("nowatchdog", nowatchdog_setup); |
| @@ -83,7 +82,7 @@ __setup("nowatchdog", nowatchdog_setup); | |||
| 83 | /* deprecated */ | 82 | /* deprecated */ |
| 84 | static int __init nosoftlockup_setup(char *str) | 83 | static int __init nosoftlockup_setup(char *str) |
| 85 | { | 84 | { |
| 86 | no_watchdog = 1; | 85 | watchdog_enabled = 0; |
| 87 | return 1; | 86 | return 1; |
| 88 | } | 87 | } |
| 89 | __setup("nosoftlockup", nosoftlockup_setup); | 88 | __setup("nosoftlockup", nosoftlockup_setup); |
| @@ -116,12 +115,12 @@ static void __touch_watchdog(void) | |||
| 116 | { | 115 | { |
| 117 | int this_cpu = smp_processor_id(); | 116 | int this_cpu = smp_processor_id(); |
| 118 | 117 | ||
| 119 | __get_cpu_var(watchdog_touch_ts) = get_timestamp(this_cpu); | 118 | __this_cpu_write(watchdog_touch_ts, get_timestamp(this_cpu)); |
| 120 | } | 119 | } |
| 121 | 120 | ||
| 122 | void touch_softlockup_watchdog(void) | 121 | void touch_softlockup_watchdog(void) |
| 123 | { | 122 | { |
| 124 | __raw_get_cpu_var(watchdog_touch_ts) = 0; | 123 | __this_cpu_write(watchdog_touch_ts, 0); |
| 125 | } | 124 | } |
| 126 | EXPORT_SYMBOL(touch_softlockup_watchdog); | 125 | EXPORT_SYMBOL(touch_softlockup_watchdog); |
| 127 | 126 | ||
| @@ -165,12 +164,12 @@ void touch_softlockup_watchdog_sync(void) | |||
| 165 | /* watchdog detector functions */ | 164 | /* watchdog detector functions */ |
| 166 | static int is_hardlockup(void) | 165 | static int is_hardlockup(void) |
| 167 | { | 166 | { |
| 168 | unsigned long hrint = __get_cpu_var(hrtimer_interrupts); | 167 | unsigned long hrint = __this_cpu_read(hrtimer_interrupts); |
| 169 | 168 | ||
| 170 | if (__get_cpu_var(hrtimer_interrupts_saved) == hrint) | 169 | if (__this_cpu_read(hrtimer_interrupts_saved) == hrint) |
| 171 | return 1; | 170 | return 1; |
| 172 | 171 | ||
| 173 | __get_cpu_var(hrtimer_interrupts_saved) = hrint; | 172 | __this_cpu_write(hrtimer_interrupts_saved, hrint); |
| 174 | return 0; | 173 | return 0; |
| 175 | } | 174 | } |
| 176 | #endif | 175 | #endif |
| @@ -203,8 +202,8 @@ static void watchdog_overflow_callback(struct perf_event *event, int nmi, | |||
| 203 | /* Ensure the watchdog never gets throttled */ | 202 | /* Ensure the watchdog never gets throttled */ |
| 204 | event->hw.interrupts = 0; | 203 | event->hw.interrupts = 0; |
| 205 | 204 | ||
| 206 | if (__get_cpu_var(watchdog_nmi_touch) == true) { | 205 | if (__this_cpu_read(watchdog_nmi_touch) == true) { |
| 207 | __get_cpu_var(watchdog_nmi_touch) = false; | 206 | __this_cpu_write(watchdog_nmi_touch, false); |
| 208 | return; | 207 | return; |
| 209 | } | 208 | } |
| 210 | 209 | ||
| @@ -218,7 +217,7 @@ static void watchdog_overflow_callback(struct perf_event *event, int nmi, | |||
| 218 | int this_cpu = smp_processor_id(); | 217 | int this_cpu = smp_processor_id(); |
| 219 | 218 | ||
| 220 | /* only print hardlockups once */ | 219 | /* only print hardlockups once */ |
| 221 | if (__get_cpu_var(hard_watchdog_warn) == true) | 220 | if (__this_cpu_read(hard_watchdog_warn) == true) |
| 222 | return; | 221 | return; |
| 223 | 222 | ||
| 224 | if (hardlockup_panic) | 223 | if (hardlockup_panic) |
| @@ -226,16 +225,16 @@ static void watchdog_overflow_callback(struct perf_event *event, int nmi, | |||
| 226 | else | 225 | else |
| 227 | WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu); | 226 | WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu); |
| 228 | 227 | ||
| 229 | __get_cpu_var(hard_watchdog_warn) = true; | 228 | __this_cpu_write(hard_watchdog_warn, true); |
| 230 | return; | 229 | return; |
| 231 | } | 230 | } |
| 232 | 231 | ||
| 233 | __get_cpu_var(hard_watchdog_warn) = false; | 232 | __this_cpu_write(hard_watchdog_warn, false); |
| 234 | return; | 233 | return; |
| 235 | } | 234 | } |
| 236 | static void watchdog_interrupt_count(void) | 235 | static void watchdog_interrupt_count(void) |
| 237 | { | 236 | { |
| 238 | __get_cpu_var(hrtimer_interrupts)++; | 237 | __this_cpu_inc(hrtimer_interrupts); |
| 239 | } | 238 | } |
| 240 | #else | 239 | #else |
| 241 | static inline void watchdog_interrupt_count(void) { return; } | 240 | static inline void watchdog_interrupt_count(void) { return; } |
| @@ -244,7 +243,7 @@ static inline void watchdog_interrupt_count(void) { return; } | |||
| 244 | /* watchdog kicker functions */ | 243 | /* watchdog kicker functions */ |
| 245 | static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) | 244 | static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) |
| 246 | { | 245 | { |
| 247 | unsigned long touch_ts = __get_cpu_var(watchdog_touch_ts); | 246 | unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts); |
| 248 | struct pt_regs *regs = get_irq_regs(); | 247 | struct pt_regs *regs = get_irq_regs(); |
| 249 | int duration; | 248 | int duration; |
| 250 | 249 | ||
| @@ -252,18 +251,18 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) | |||
| 252 | watchdog_interrupt_count(); | 251 | watchdog_interrupt_count(); |
| 253 | 252 | ||
| 254 | /* kick the softlockup detector */ | 253 | /* kick the softlockup detector */ |
| 255 | wake_up_process(__get_cpu_var(softlockup_watchdog)); | 254 | wake_up_process(__this_cpu_read(softlockup_watchdog)); |
| 256 | 255 | ||
| 257 | /* .. and repeat */ | 256 | /* .. and repeat */ |
| 258 | hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period())); | 257 | hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period())); |
| 259 | 258 | ||
| 260 | if (touch_ts == 0) { | 259 | if (touch_ts == 0) { |
| 261 | if (unlikely(__get_cpu_var(softlockup_touch_sync))) { | 260 | if (unlikely(__this_cpu_read(softlockup_touch_sync))) { |
| 262 | /* | 261 | /* |
| 263 | * If the time stamp was touched atomically | 262 | * If the time stamp was touched atomically |
| 264 | * make sure the scheduler tick is up to date. | 263 | * make sure the scheduler tick is up to date. |
| 265 | */ | 264 | */ |
| 266 | __get_cpu_var(softlockup_touch_sync) = false; | 265 | __this_cpu_write(softlockup_touch_sync, false); |
| 267 | sched_clock_tick(); | 266 | sched_clock_tick(); |
| 268 | } | 267 | } |
| 269 | __touch_watchdog(); | 268 | __touch_watchdog(); |
| @@ -279,7 +278,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) | |||
| 279 | duration = is_softlockup(touch_ts); | 278 | duration = is_softlockup(touch_ts); |
| 280 | if (unlikely(duration)) { | 279 | if (unlikely(duration)) { |
| 281 | /* only warn once */ | 280 | /* only warn once */ |
| 282 | if (__get_cpu_var(soft_watchdog_warn) == true) | 281 | if (__this_cpu_read(soft_watchdog_warn) == true) |
| 283 | return HRTIMER_RESTART; | 282 | return HRTIMER_RESTART; |
| 284 | 283 | ||
| 285 | printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", | 284 | printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", |
| @@ -294,9 +293,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) | |||
| 294 | 293 | ||
| 295 | if (softlockup_panic) | 294 | if (softlockup_panic) |
| 296 | panic("softlockup: hung tasks"); | 295 | panic("softlockup: hung tasks"); |
| 297 | __get_cpu_var(soft_watchdog_warn) = true; | 296 | __this_cpu_write(soft_watchdog_warn, true); |
| 298 | } else | 297 | } else |
| 299 | __get_cpu_var(soft_watchdog_warn) = false; | 298 | __this_cpu_write(soft_watchdog_warn, false); |
| 300 | 299 | ||
| 301 | return HRTIMER_RESTART; | 300 | return HRTIMER_RESTART; |
| 302 | } | 301 | } |
| @@ -307,7 +306,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) | |||
| 307 | */ | 306 | */ |
| 308 | static int watchdog(void *unused) | 307 | static int watchdog(void *unused) |
| 309 | { | 308 | { |
| 310 | struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; | 309 | static struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; |
| 311 | struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); | 310 | struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); |
| 312 | 311 | ||
| 313 | sched_setscheduler(current, SCHED_FIFO, &param); | 312 | sched_setscheduler(current, SCHED_FIFO, &param); |
| @@ -364,7 +363,14 @@ static int watchdog_nmi_enable(int cpu) | |||
| 364 | goto out_save; | 363 | goto out_save; |
| 365 | } | 364 | } |
| 366 | 365 | ||
| 367 | printk(KERN_ERR "NMI watchdog failed to create perf event on cpu%i: %p\n", cpu, event); | 366 | |
| 367 | /* vary the KERN level based on the returned errno */ | ||
| 368 | if (PTR_ERR(event) == -EOPNOTSUPP) | ||
| 369 | printk(KERN_INFO "NMI watchdog disabled (cpu%i): not supported (no LAPIC?)\n", cpu); | ||
| 370 | else if (PTR_ERR(event) == -ENOENT) | ||
| 371 | printk(KERN_WARNING "NMI watchdog disabled (cpu%i): hardware events not enabled\n", cpu); | ||
| 372 | else | ||
| 373 | printk(KERN_ERR "NMI watchdog disabled (cpu%i): unable to create perf event: %ld\n", cpu, PTR_ERR(event)); | ||
| 368 | return PTR_ERR(event); | 374 | return PTR_ERR(event); |
| 369 | 375 | ||
| 370 | /* success path */ | 376 | /* success path */ |
| @@ -429,9 +435,6 @@ static int watchdog_enable(int cpu) | |||
| 429 | wake_up_process(p); | 435 | wake_up_process(p); |
| 430 | } | 436 | } |
| 431 | 437 | ||
| 432 | /* if any cpu succeeds, watchdog is considered enabled for the system */ | ||
| 433 | watchdog_enabled = 1; | ||
| 434 | |||
| 435 | return 0; | 438 | return 0; |
| 436 | } | 439 | } |
| 437 | 440 | ||
| @@ -459,12 +462,16 @@ static void watchdog_disable(int cpu) | |||
| 459 | static void watchdog_enable_all_cpus(void) | 462 | static void watchdog_enable_all_cpus(void) |
| 460 | { | 463 | { |
| 461 | int cpu; | 464 | int cpu; |
| 462 | int result = 0; | 465 | |
| 466 | watchdog_enabled = 0; | ||
| 463 | 467 | ||
| 464 | for_each_online_cpu(cpu) | 468 | for_each_online_cpu(cpu) |
| 465 | result += watchdog_enable(cpu); | 469 | if (!watchdog_enable(cpu)) |
| 470 | /* if any cpu succeeds, watchdog is considered | ||
| 471 | enabled for the system */ | ||
| 472 | watchdog_enabled = 1; | ||
| 466 | 473 | ||
| 467 | if (result) | 474 | if (!watchdog_enabled) |
| 468 | printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n"); | 475 | printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n"); |
| 469 | 476 | ||
| 470 | } | 477 | } |
| @@ -473,9 +480,6 @@ static void watchdog_disable_all_cpus(void) | |||
| 473 | { | 480 | { |
| 474 | int cpu; | 481 | int cpu; |
| 475 | 482 | ||
| 476 | if (no_watchdog) | ||
| 477 | return; | ||
| 478 | |||
| 479 | for_each_online_cpu(cpu) | 483 | for_each_online_cpu(cpu) |
| 480 | watchdog_disable(cpu); | 484 | watchdog_disable(cpu); |
| 481 | 485 | ||
| @@ -495,10 +499,12 @@ int proc_dowatchdog_enabled(struct ctl_table *table, int write, | |||
| 495 | { | 499 | { |
| 496 | proc_dointvec(table, write, buffer, length, ppos); | 500 | proc_dointvec(table, write, buffer, length, ppos); |
| 497 | 501 | ||
| 498 | if (watchdog_enabled) | 502 | if (write) { |
| 499 | watchdog_enable_all_cpus(); | 503 | if (watchdog_enabled) |
| 500 | else | 504 | watchdog_enable_all_cpus(); |
| 501 | watchdog_disable_all_cpus(); | 505 | else |
| 506 | watchdog_disable_all_cpus(); | ||
| 507 | } | ||
| 502 | return 0; | 508 | return 0; |
| 503 | } | 509 | } |
| 504 | 510 | ||
| @@ -527,7 +533,8 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
| 527 | break; | 533 | break; |
| 528 | case CPU_ONLINE: | 534 | case CPU_ONLINE: |
| 529 | case CPU_ONLINE_FROZEN: | 535 | case CPU_ONLINE_FROZEN: |
| 530 | err = watchdog_enable(hotcpu); | 536 | if (watchdog_enabled) |
| 537 | err = watchdog_enable(hotcpu); | ||
| 531 | break; | 538 | break; |
| 532 | #ifdef CONFIG_HOTPLUG_CPU | 539 | #ifdef CONFIG_HOTPLUG_CPU |
| 533 | case CPU_UP_CANCELED: | 540 | case CPU_UP_CANCELED: |
| @@ -547,20 +554,16 @@ static struct notifier_block __cpuinitdata cpu_nfb = { | |||
| 547 | .notifier_call = cpu_callback | 554 | .notifier_call = cpu_callback |
| 548 | }; | 555 | }; |
| 549 | 556 | ||
| 550 | static int __init spawn_watchdog_task(void) | 557 | void __init lockup_detector_init(void) |
| 551 | { | 558 | { |
| 552 | void *cpu = (void *)(long)smp_processor_id(); | 559 | void *cpu = (void *)(long)smp_processor_id(); |
| 553 | int err; | 560 | int err; |
| 554 | 561 | ||
| 555 | if (no_watchdog) | ||
| 556 | return 0; | ||
| 557 | |||
| 558 | err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); | 562 | err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); |
| 559 | WARN_ON(notifier_to_errno(err)); | 563 | WARN_ON(notifier_to_errno(err)); |
| 560 | 564 | ||
| 561 | cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); | 565 | cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); |
| 562 | register_cpu_notifier(&cpu_nfb); | 566 | register_cpu_notifier(&cpu_nfb); |
| 563 | 567 | ||
| 564 | return 0; | 568 | return; |
| 565 | } | 569 | } |
| 566 | early_initcall(spawn_watchdog_task); | ||
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 90db1bd1a97..ee6578b578a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
| @@ -79,7 +79,9 @@ enum { | |||
| 79 | MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */ | 79 | MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */ |
| 80 | IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */ | 80 | IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */ |
| 81 | 81 | ||
| 82 | MAYDAY_INITIAL_TIMEOUT = HZ / 100, /* call for help after 10ms */ | 82 | MAYDAY_INITIAL_TIMEOUT = HZ / 100 >= 2 ? HZ / 100 : 2, |
| 83 | /* call for help after 10ms | ||
| 84 | (min two ticks) */ | ||
| 83 | MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */ | 85 | MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */ |
| 84 | CREATE_COOLDOWN = HZ, /* time to breath after fail */ | 86 | CREATE_COOLDOWN = HZ, /* time to breath after fail */ |
| 85 | TRUSTEE_COOLDOWN = HZ / 10, /* for trustee draining */ | 87 | TRUSTEE_COOLDOWN = HZ / 10, /* for trustee draining */ |
| @@ -661,7 +663,7 @@ void wq_worker_waking_up(struct task_struct *task, unsigned int cpu) | |||
| 661 | { | 663 | { |
| 662 | struct worker *worker = kthread_data(task); | 664 | struct worker *worker = kthread_data(task); |
| 663 | 665 | ||
| 664 | if (likely(!(worker->flags & WORKER_NOT_RUNNING))) | 666 | if (!(worker->flags & WORKER_NOT_RUNNING)) |
| 665 | atomic_inc(get_gcwq_nr_running(cpu)); | 667 | atomic_inc(get_gcwq_nr_running(cpu)); |
| 666 | } | 668 | } |
| 667 | 669 | ||
| @@ -687,7 +689,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, | |||
| 687 | struct global_cwq *gcwq = get_gcwq(cpu); | 689 | struct global_cwq *gcwq = get_gcwq(cpu); |
| 688 | atomic_t *nr_running = get_gcwq_nr_running(cpu); | 690 | atomic_t *nr_running = get_gcwq_nr_running(cpu); |
| 689 | 691 | ||
| 690 | if (unlikely(worker->flags & WORKER_NOT_RUNNING)) | 692 | if (worker->flags & WORKER_NOT_RUNNING) |
| 691 | return NULL; | 693 | return NULL; |
| 692 | 694 | ||
| 693 | /* this can only happen on the local cpu */ | 695 | /* this can only happen on the local cpu */ |
| @@ -768,7 +770,11 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) | |||
| 768 | 770 | ||
| 769 | worker->flags &= ~flags; | 771 | worker->flags &= ~flags; |
| 770 | 772 | ||
| 771 | /* if transitioning out of NOT_RUNNING, increment nr_running */ | 773 | /* |
| 774 | * If transitioning out of NOT_RUNNING, increment nr_running. Note | ||
| 775 | * that the nested NOT_RUNNING is not a noop. NOT_RUNNING is a mask | ||
| 776 | * of multiple flags, not a single flag. | ||
| 777 | */ | ||
| 772 | if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) | 778 | if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) |
| 773 | if (!(worker->flags & WORKER_NOT_RUNNING)) | 779 | if (!(worker->flags & WORKER_NOT_RUNNING)) |
| 774 | atomic_inc(get_gcwq_nr_running(gcwq->cpu)); | 780 | atomic_inc(get_gcwq_nr_running(gcwq->cpu)); |
| @@ -932,6 +938,38 @@ static void insert_work(struct cpu_workqueue_struct *cwq, | |||
| 932 | wake_up_worker(gcwq); | 938 | wake_up_worker(gcwq); |
| 933 | } | 939 | } |
| 934 | 940 | ||
| 941 | /* | ||
| 942 | * Test whether @work is being queued from another work executing on the | ||
| 943 | * same workqueue. This is rather expensive and should only be used from | ||
| 944 | * cold paths. | ||
| 945 | */ | ||
| 946 | static bool is_chained_work(struct workqueue_struct *wq) | ||
| 947 | { | ||
| 948 | unsigned long flags; | ||
| 949 | unsigned int cpu; | ||
| 950 | |||
| 951 | for_each_gcwq_cpu(cpu) { | ||
| 952 | struct global_cwq *gcwq = get_gcwq(cpu); | ||
| 953 | struct worker *worker; | ||
| 954 | struct hlist_node *pos; | ||
| 955 | int i; | ||
| 956 | |||
| 957 | spin_lock_irqsave(&gcwq->lock, flags); | ||
| 958 | for_each_busy_worker(worker, i, pos, gcwq) { | ||
| 959 | if (worker->task != current) | ||
| 960 | continue; | ||
| 961 | spin_unlock_irqrestore(&gcwq->lock, flags); | ||
| 962 | /* | ||
| 963 | * I'm @worker, no locking necessary. See if @work | ||
| 964 | * is headed to the same workqueue. | ||
| 965 | */ | ||
| 966 | return worker->current_cwq->wq == wq; | ||
| 967 | } | ||
| 968 | spin_unlock_irqrestore(&gcwq->lock, flags); | ||
| 969 | } | ||
| 970 | return false; | ||
| 971 | } | ||
| 972 | |||
| 935 | static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, | 973 | static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, |
| 936 | struct work_struct *work) | 974 | struct work_struct *work) |
| 937 | { | 975 | { |
| @@ -943,7 +981,9 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, | |||
| 943 | 981 | ||
| 944 | debug_work_activate(work); | 982 | debug_work_activate(work); |
| 945 | 983 | ||
| 946 | if (WARN_ON_ONCE(wq->flags & WQ_DYING)) | 984 | /* if dying, only works from the same workqueue are allowed */ |
| 985 | if (unlikely(wq->flags & WQ_DYING) && | ||
| 986 | WARN_ON_ONCE(!is_chained_work(wq))) | ||
| 947 | return; | 987 | return; |
| 948 | 988 | ||
| 949 | /* determine gcwq to use */ | 989 | /* determine gcwq to use */ |
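With WQ_DYING set, __queue_work() above now accepts new work only when it is chained, i.e. queued by one of the workqueue's own workers. A toy model of that rule, using plain integers for task identity instead of the kernel's worker/task structures (nothing below is the workqueue API):

/* Hedged sketch of "chained queueing only while dying". */
#include <stdbool.h>
#include <stdio.h>

struct toy_wq {
    int  worker_id;   /* identity of the worker servicing this queue */
    bool dying;
};

static bool is_chained_work(const struct toy_wq *wq, int current_id)
{
    /* Work queued by the queue's own worker counts as chained. */
    return current_id == wq->worker_id;
}

static bool queue_work(struct toy_wq *wq, int current_id)
{
    if (wq->dying && !is_chained_work(wq, current_id)) {
        fprintf(stderr, "refusing external work on dying queue\n");
        return false;
    }
    /* ...insert the work item... */
    return true;
}

int main(void)
{
    struct toy_wq wq = { .worker_id = 1, .dying = true };

    queue_work(&wq, 1);   /* accepted: chained, queued by the worker itself */
    queue_work(&wq, 2);   /* rejected: external submitter while dying */
    return 0;
}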
| @@ -1806,7 +1846,7 @@ __acquires(&gcwq->lock) | |||
| 1806 | spin_unlock_irq(&gcwq->lock); | 1846 | spin_unlock_irq(&gcwq->lock); |
| 1807 | 1847 | ||
| 1808 | work_clear_pending(work); | 1848 | work_clear_pending(work); |
| 1809 | lock_map_acquire(&cwq->wq->lockdep_map); | 1849 | lock_map_acquire_read(&cwq->wq->lockdep_map); |
| 1810 | lock_map_acquire(&lockdep_map); | 1850 | lock_map_acquire(&lockdep_map); |
| 1811 | trace_workqueue_execute_start(work); | 1851 | trace_workqueue_execute_start(work); |
| 1812 | f(work); | 1852 | f(work); |
| @@ -2009,6 +2049,15 @@ repeat: | |||
| 2009 | move_linked_works(work, scheduled, &n); | 2049 | move_linked_works(work, scheduled, &n); |
| 2010 | 2050 | ||
| 2011 | process_scheduled_works(rescuer); | 2051 | process_scheduled_works(rescuer); |
| 2052 | |||
| 2053 | /* | ||
| 2054 | * Leave this gcwq. If keep_working() is %true, notify a | ||
| 2055 | * regular worker; otherwise, we end up with 0 concurrency | ||
| 2056 | * and stalling the execution. | ||
| 2057 | */ | ||
| 2058 | if (keep_working(gcwq)) | ||
| 2059 | wake_up_worker(gcwq); | ||
| 2060 | |||
| 2012 | spin_unlock_irq(&gcwq->lock); | 2061 | spin_unlock_irq(&gcwq->lock); |
| 2013 | } | 2062 | } |
| 2014 | 2063 | ||
| @@ -2350,8 +2399,18 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, | |||
| 2350 | insert_wq_barrier(cwq, barr, work, worker); | 2399 | insert_wq_barrier(cwq, barr, work, worker); |
| 2351 | spin_unlock_irq(&gcwq->lock); | 2400 | spin_unlock_irq(&gcwq->lock); |
| 2352 | 2401 | ||
| 2353 | lock_map_acquire(&cwq->wq->lockdep_map); | 2402 | /* |
| 2403 | * If @max_active is 1 or rescuer is in use, flushing another work | ||
| 2404 | * item on the same workqueue may lead to deadlock. Make sure the | ||
| 2405 | * flusher is not running on the same workqueue by verifying write | ||
| 2406 | * access. | ||
| 2407 | */ | ||
| 2408 | if (cwq->wq->saved_max_active == 1 || cwq->wq->flags & WQ_RESCUER) | ||
| 2409 | lock_map_acquire(&cwq->wq->lockdep_map); | ||
| 2410 | else | ||
| 2411 | lock_map_acquire_read(&cwq->wq->lockdep_map); | ||
| 2354 | lock_map_release(&cwq->wq->lockdep_map); | 2412 | lock_map_release(&cwq->wq->lockdep_map); |
| 2413 | |||
| 2355 | return true; | 2414 | return true; |
| 2356 | already_gone: | 2415 | already_gone: |
| 2357 | spin_unlock_irq(&gcwq->lock); | 2416 | spin_unlock_irq(&gcwq->lock); |
| @@ -2908,7 +2967,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name, | |||
| 2908 | */ | 2967 | */ |
| 2909 | spin_lock(&workqueue_lock); | 2968 | spin_lock(&workqueue_lock); |
| 2910 | 2969 | ||
| 2911 | if (workqueue_freezing && wq->flags & WQ_FREEZEABLE) | 2970 | if (workqueue_freezing && wq->flags & WQ_FREEZABLE) |
| 2912 | for_each_cwq_cpu(cpu, wq) | 2971 | for_each_cwq_cpu(cpu, wq) |
| 2913 | get_cwq(cpu, wq)->max_active = 0; | 2972 | get_cwq(cpu, wq)->max_active = 0; |
| 2914 | 2973 | ||
| @@ -2936,11 +2995,35 @@ EXPORT_SYMBOL_GPL(__alloc_workqueue_key); | |||
| 2936 | */ | 2995 | */ |
| 2937 | void destroy_workqueue(struct workqueue_struct *wq) | 2996 | void destroy_workqueue(struct workqueue_struct *wq) |
| 2938 | { | 2997 | { |
| 2998 | unsigned int flush_cnt = 0; | ||
| 2939 | unsigned int cpu; | 2999 | unsigned int cpu; |
| 2940 | 3000 | ||
| 3001 | /* | ||
| 3002 | * Mark @wq dying and drain all pending works. Once WQ_DYING is | ||
| 3003 | * set, only chain queueing is allowed. IOW, only currently | ||
| 3004 | * pending or running work items on @wq can queue further work | ||
| 3005 | * items on it. @wq is flushed repeatedly until it becomes empty. | ||
| 3006 | * The number of flushes is determined by the depth of chaining and | ||
| 3007 | * should be relatively short. Whine if it takes too long. | ||
| 3008 | */ | ||
| 2941 | wq->flags |= WQ_DYING; | 3009 | wq->flags |= WQ_DYING; |
| 3010 | reflush: | ||
| 2942 | flush_workqueue(wq); | 3011 | flush_workqueue(wq); |
| 2943 | 3012 | ||
| 3013 | for_each_cwq_cpu(cpu, wq) { | ||
| 3014 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); | ||
| 3015 | |||
| 3016 | if (!cwq->nr_active && list_empty(&cwq->delayed_works)) | ||
| 3017 | continue; | ||
| 3018 | |||
| 3019 | if (++flush_cnt == 10 || | ||
| 3020 | (flush_cnt % 100 == 0 && flush_cnt <= 1000)) | ||
| 3021 | printk(KERN_WARNING "workqueue %s: flush on " | ||
| 3022 | "destruction isn't complete after %u tries\n", | ||
| 3023 | wq->name, flush_cnt); | ||
| 3024 | goto reflush; | ||
| 3025 | } | ||
| 3026 | |||
| 2944 | /* | 3027 | /* |
| 2945 | * wq list is used to freeze wq, remove from list after | 3028 | * wq list is used to freeze wq, remove from list after |
| 2946 | * flushing is complete in case freeze races us. | 3029 | * flushing is complete in case freeze races us. |
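destroy_workqueue() above drains a dying workqueue by flushing it repeatedly until nothing is left, warning if the chain of work-queued-from-work takes suspiciously many rounds. A simplified sketch of that drain loop against a stand-in queue (the queue and its helpers are invented for illustration, not the kernel API):

/* Hedged sketch of the reflush-until-empty drain added above. */
#include <stdbool.h>
#include <stdio.h>

static int pending = 3;          /* pretend three chained work items exist */

static bool queue_is_empty(void) { return pending == 0; }
static void flush_queue(void)    { if (pending) pending--; /* one round per flush */ }

static void drain_queue(const char *name)
{
    unsigned int flush_cnt = 0;

reflush:
    flush_queue();
    if (!queue_is_empty()) {
        if (++flush_cnt == 10 || (flush_cnt % 100 == 0 && flush_cnt <= 1000))
            fprintf(stderr,
                    "queue %s: flush on destruction isn't complete after %u tries\n",
                    name, flush_cnt);
        goto reflush;
    }
}

int main(void)
{
    drain_queue("demo");
    return 0;
}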
| @@ -2996,7 +3079,7 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) | |||
| 2996 | 3079 | ||
| 2997 | spin_lock_irq(&gcwq->lock); | 3080 | spin_lock_irq(&gcwq->lock); |
| 2998 | 3081 | ||
| 2999 | if (!(wq->flags & WQ_FREEZEABLE) || | 3082 | if (!(wq->flags & WQ_FREEZABLE) || |
| 3000 | !(gcwq->flags & GCWQ_FREEZING)) | 3083 | !(gcwq->flags & GCWQ_FREEZING)) |
| 3001 | get_cwq(gcwq->cpu, wq)->max_active = max_active; | 3084 | get_cwq(gcwq->cpu, wq)->max_active = max_active; |
| 3002 | 3085 | ||
| @@ -3246,7 +3329,7 @@ static int __cpuinit trustee_thread(void *__gcwq) | |||
| 3246 | * want to get it over with ASAP - spam rescuers, wake up as | 3329 | * want to get it over with ASAP - spam rescuers, wake up as |
| 3247 | * many idlers as necessary and create new ones till the | 3330 | * many idlers as necessary and create new ones till the |
| 3248 | * worklist is empty. Note that if the gcwq is frozen, there | 3331 | * worklist is empty. Note that if the gcwq is frozen, there |
| 3249 | * may be frozen works in freezeable cwqs. Don't declare | 3332 | * may be frozen works in freezable cwqs. Don't declare |
| 3250 | * completion while frozen. | 3333 | * completion while frozen. |
| 3251 | */ | 3334 | */ |
| 3252 | while (gcwq->nr_workers != gcwq->nr_idle || | 3335 | while (gcwq->nr_workers != gcwq->nr_idle || |
| @@ -3504,9 +3587,9 @@ EXPORT_SYMBOL_GPL(work_on_cpu); | |||
| 3504 | /** | 3587 | /** |
| 3505 | * freeze_workqueues_begin - begin freezing workqueues | 3588 | * freeze_workqueues_begin - begin freezing workqueues |
| 3506 | * | 3589 | * |
| 3507 | * Start freezing workqueues. After this function returns, all | 3590 | * Start freezing workqueues. After this function returns, all freezable |
| 3508 | * freezeable workqueues will queue new works to their frozen_works | 3591 | * workqueues will queue new works to their frozen_works list instead of |
| 3509 | * list instead of gcwq->worklist. | 3592 | * gcwq->worklist. |
| 3510 | * | 3593 | * |
| 3511 | * CONTEXT: | 3594 | * CONTEXT: |
| 3512 | * Grabs and releases workqueue_lock and gcwq->lock's. | 3595 | * Grabs and releases workqueue_lock and gcwq->lock's. |
| @@ -3532,7 +3615,7 @@ void freeze_workqueues_begin(void) | |||
| 3532 | list_for_each_entry(wq, &workqueues, list) { | 3615 | list_for_each_entry(wq, &workqueues, list) { |
| 3533 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); | 3616 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); |
| 3534 | 3617 | ||
| 3535 | if (cwq && wq->flags & WQ_FREEZEABLE) | 3618 | if (cwq && wq->flags & WQ_FREEZABLE) |
| 3536 | cwq->max_active = 0; | 3619 | cwq->max_active = 0; |
| 3537 | } | 3620 | } |
| 3538 | 3621 | ||
| @@ -3543,7 +3626,7 @@ void freeze_workqueues_begin(void) | |||
| 3543 | } | 3626 | } |
| 3544 | 3627 | ||
| 3545 | /** | 3628 | /** |
| 3546 | * freeze_workqueues_busy - are freezeable workqueues still busy? | 3629 | * freeze_workqueues_busy - are freezable workqueues still busy? |
| 3547 | * | 3630 | * |
| 3548 | * Check whether freezing is complete. This function must be called | 3631 | * Check whether freezing is complete. This function must be called |
| 3549 | * between freeze_workqueues_begin() and thaw_workqueues(). | 3632 | * between freeze_workqueues_begin() and thaw_workqueues(). |
| @@ -3552,8 +3635,8 @@ void freeze_workqueues_begin(void) | |||
| 3552 | * Grabs and releases workqueue_lock. | 3635 | * Grabs and releases workqueue_lock. |
| 3553 | * | 3636 | * |
| 3554 | * RETURNS: | 3637 | * RETURNS: |
| 3555 | * %true if some freezeable workqueues are still busy. %false if | 3638 | * %true if some freezable workqueues are still busy. %false if freezing |
| 3556 | * freezing is complete. | 3639 | * is complete. |
| 3557 | */ | 3640 | */ |
| 3558 | bool freeze_workqueues_busy(void) | 3641 | bool freeze_workqueues_busy(void) |
| 3559 | { | 3642 | { |
| @@ -3573,7 +3656,7 @@ bool freeze_workqueues_busy(void) | |||
| 3573 | list_for_each_entry(wq, &workqueues, list) { | 3656 | list_for_each_entry(wq, &workqueues, list) { |
| 3574 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); | 3657 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); |
| 3575 | 3658 | ||
| 3576 | if (!cwq || !(wq->flags & WQ_FREEZEABLE)) | 3659 | if (!cwq || !(wq->flags & WQ_FREEZABLE)) |
| 3577 | continue; | 3660 | continue; |
| 3578 | 3661 | ||
| 3579 | BUG_ON(cwq->nr_active < 0); | 3662 | BUG_ON(cwq->nr_active < 0); |
| @@ -3618,7 +3701,7 @@ void thaw_workqueues(void) | |||
| 3618 | list_for_each_entry(wq, &workqueues, list) { | 3701 | list_for_each_entry(wq, &workqueues, list) { |
| 3619 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); | 3702 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); |
| 3620 | 3703 | ||
| 3621 | if (!cwq || !(wq->flags & WQ_FREEZEABLE)) | 3704 | if (!cwq || !(wq->flags & WQ_FREEZABLE)) |
| 3622 | continue; | 3705 | continue; |
| 3623 | 3706 | ||
| 3624 | /* restore max_active and repopulate worklist */ | 3707 | /* restore max_active and repopulate worklist */ |
| @@ -3692,7 +3775,8 @@ static int __init init_workqueues(void) | |||
| 3692 | system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0); | 3775 | system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0); |
| 3693 | system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, | 3776 | system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, |
| 3694 | WQ_UNBOUND_MAX_ACTIVE); | 3777 | WQ_UNBOUND_MAX_ACTIVE); |
| 3695 | BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq); | 3778 | BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq || |
| 3779 | !system_unbound_wq); | ||
| 3696 | return 0; | 3780 | return 0; |
| 3697 | } | 3781 | } |
| 3698 | early_initcall(init_workqueues); | 3782 | early_initcall(init_workqueues); |
